robinzixuan committed
Commit c517fcc · verified · 1 Parent(s): 90068bd

Upload modeling_opt.py

Files changed (1)
  1. modeling_opt.py +7 -14
modeling_opt.py CHANGED
@@ -194,8 +194,7 @@ class OPTAttention(nn.Module):
 
         if (self.head_dim * num_heads) != self.embed_dim:
             raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {
-                    self.embed_dim}"
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
         self.scaling = self.head_dim**-0.5
@@ -365,16 +364,14 @@ class OPTAttention(nn.Module):
 
         if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
             raise ValueError(
-                f"Attention weights should be of size {
-                    (bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                 f" {attn_weights.size()}"
             )
 
         if attention_mask is not None:
             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                 raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
-                        attention_mask.size()}"
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                 )
             attn_weights = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len) + attention_mask
@@ -395,8 +392,7 @@ class OPTAttention(nn.Module):
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
-                    f"Head mask for a single layer should be of size {
-                        (self.num_heads,)}, but is"
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                     f" {layer_head_mask.size()}"
                 )
             attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
@@ -430,8 +426,7 @@ class OPTAttention(nn.Module):
 
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(
-                f"`attn_output` should be of size {
-                    (bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                 f" {attn_output.size()}"
             )
 
@@ -1089,8 +1084,7 @@ class OPTDecoder(OPTPreTrainedModel):
                 batch_size, mask_seq_length, device=inputs_embeds.device)
         elif attention_mask.shape[1] != mask_seq_length:
             raise ValueError(
-                f"The provided attention mask has length {
-                    attention_mask.shape[1]}, but its length should be "
+                f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be "
                 f"{mask_seq_length} (sum of the lengths of current and past inputs)"
             )
         causal_attention_mask = _prepare_4d_causal_attention_mask(
@@ -1122,8 +1116,7 @@ class OPTDecoder(OPTPreTrainedModel):
             if attn_mask is not None:
                 if attn_mask.size()[0] != (len(self.layers)):
                     raise ValueError(
-                        f"The `{mask_name}` should be specified for {
-                            len(self.layers)} layers, but it is for"
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                         f" {head_mask.size()[0]}."
                     )
 
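
Every hunk in this commit does the same thing: an f-string error message that was previously broken across lines inside its replacement braces is joined onto a single line. A line break inside the `{...}` field of a single-quoted f-string is only accepted from Python 3.12 (PEP 701) onward, which is presumably why the single-line form is preferred here. Below is a minimal, self-contained sketch of the divisibility check touched by the first hunk; the `embed_dim` and `num_heads` values are hypothetical and chosen only so the message fires.

# Minimal sketch of the check from the first hunk (not the full OPTAttention
# module); the concrete values below are illustrative assumptions.
embed_dim = 768
num_heads = 7            # 768 is not divisible by 7, so the check fails
head_dim = embed_dim // num_heads

if (head_dim * num_heads) != embed_dim:
    raise ValueError(
        # Joined form of the message, valid on any supported Python version.
        f"embed_dim must be divisible by num_heads (got `embed_dim`: {embed_dim}"
        f" and `num_heads`: {num_heads})."
    )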