robinzixuan committed
Commit f794bc1 · verified · 1 Parent(s): d35c439

Upload modeling_opt.py

Files changed (1)
  1. modeling_opt.py +15 -9
modeling_opt.py CHANGED
@@ -190,13 +190,13 @@ class OPTAttention(nn.Module):
         self.num_heads = config.num_attention_heads
         self.dropout = config.attention_dropout
         self.enable_bias = config.enable_bias
-        self.head_dim = embed_dim // num_heads
+        self.head_dim = self.embed_dim // self.num_heads
         self.is_causal = True
 
-
         if (self.head_dim * num_heads) != self.embed_dim:
             raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {
+                    self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
         self.scaling = self.head_dim**-0.5
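For context on the fixed line, a minimal sketch with made-up sizes (not taken from any particular OPT config) of what self.head_dim holds and what the divisibility check guards:

# Illustrative values only; real OPT configs differ.
embed_dim = 768
num_heads = 12

head_dim = embed_dim // num_heads           # 64 -- the per-head width
assert head_dim * num_heads == embed_dim    # otherwise the ValueError above fires
scaling = head_dim ** -0.5                  # 0.125 -- the attention score scale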
@@ -366,14 +366,16 @@ class OPTAttention(nn.Module):
 
         if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
             raise ValueError(
-                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f"Attention weights should be of size {
+                    (bsz * self.num_heads, tgt_len, src_len)}, but is"
                 f" {attn_weights.size()}"
             )
 
         if attention_mask is not None:
             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                 raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
+                        attention_mask.size()}"
                 )
             attn_weights = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len) + attention_mask
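A small sketch (toy tensor sizes, variables invented for illustration) of the shapes the two checks above enforce, and of the broadcast that adds the 4-D mask:

import torch

bsz, num_heads, tgt_len, src_len = 2, 12, 5, 5          # illustrative sizes
attn_weights = torch.zeros(bsz * num_heads, tgt_len, src_len)
attention_mask = torch.zeros(bsz, 1, tgt_len, src_len)  # the 1 broadcasts over heads

# same view-then-add pattern as the hunk above
attn_weights = attn_weights.view(
    bsz, num_heads, tgt_len, src_len) + attention_mask
assert attn_weights.shape == (bsz, num_heads, tgt_len, src_len)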
@@ -394,7 +396,8 @@ class OPTAttention(nn.Module):
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
-                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f"Head mask for a single layer should be of size {
+                        (self.num_heads,)}, but is"
                     f" {layer_head_mask.size()}"
                 )
             attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
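Likewise, a toy illustration of the per-head scaling applied right after this check; layer_head_mask carries one coefficient per attention head:

import torch

bsz, num_heads, tgt_len, src_len = 2, 12, 5, 5   # illustrative sizes
layer_head_mask = torch.ones(num_heads)          # must be exactly (num_heads,)
attn_weights = torch.rand(bsz * num_heads, tgt_len, src_len)

# view(1, -1, 1, 1) broadcasts each head's coefficient over batch and positions
attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
    bsz, num_heads, tgt_len, src_len)
assert attn_weights.shape == (bsz, num_heads, tgt_len, src_len)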
@@ -428,7 +431,8 @@ class OPTAttention(nn.Module):
 
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f"`attn_output` should be of size {
+                    (bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                 f" {attn_output.size()}"
             )
 
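For reference, a sketch (assuming the usual batched matmul of attention probabilities with value states, toy sizes) of why attn_output is expected to arrive as (bsz * num_heads, tgt_len, head_dim):

import torch

bsz, num_heads, tgt_len, src_len, head_dim = 2, 12, 5, 5, 64   # illustrative
attn_probs = torch.rand(bsz * num_heads, tgt_len, src_len)
value_states = torch.rand(bsz * num_heads, src_len, head_dim)

attn_output = torch.bmm(attn_probs, value_states)   # batched matmul per head
assert attn_output.shape == (bsz * num_heads, tgt_len, head_dim)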
@@ -1086,7 +1090,8 @@ class OPTDecoder(OPTPreTrainedModel):
                 batch_size, mask_seq_length, device=inputs_embeds.device)
         elif attention_mask.shape[1] != mask_seq_length:
             raise ValueError(
-                f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be "
+                f"The provided attention mask has length {
+                    attention_mask.shape[1]}, but its length should be "
                 f"{mask_seq_length} (sum of the lengths of current and past inputs)"
             )
         causal_attention_mask = _prepare_4d_causal_attention_mask(
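A minimal worked example (invented lengths) of the mask length this check expects when a KV cache is in use; per the message, it is the past length plus the current input length:

# Hypothetical lengths for illustration.
past_key_values_length = 10             # tokens already cached from earlier steps
seq_length = 1                          # tokens in the current forward pass
mask_seq_length = past_key_values_length + seq_length   # 11

# An attention_mask of shape (batch_size, 11) passes the check above;
# shape (batch_size, 1) raises the ValueError.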
@@ -1118,7 +1123,8 @@ class OPTDecoder(OPTPreTrainedModel):
             if attn_mask is not None:
                 if attn_mask.size()[0] != (len(self.layers)):
                     raise ValueError(
-                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f"The `{mask_name}` should be specified for {
+                            len(self.layers)} layers, but it is for"
                         f" {head_mask.size()[0]}."
                     )
 
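And a toy sketch (illustrative layer and head counts) of the head_mask layout this loop validates: one row per decoder layer, one entry per head:

import torch

num_layers, num_heads = 24, 32                 # illustrative sizes
head_mask = torch.ones(num_layers, num_heads)  # row i masks layer i's heads

# The check above compares head_mask.size()[0] to len(self.layers);
# a mask built for a different layer count raises the ValueError.
assert head_mask.size()[0] == num_layers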
 
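Finally, a standalone sketch (made-up values; multi-line replacement fields in single-quoted f-strings assume Python 3.12+, PEP 701) showing that splitting an expression inside the braces leaves the rendered message unchanged:

embed_dim, num_heads = 768, 10   # made-up values that fail the divisibility check

# Whitespace inside the braces belongs to the expression, not the output.
msg = (
    f"embed_dim must be divisible by num_heads (got `embed_dim`: {
        embed_dim}"
    f" and `num_heads`: {num_heads})."
)
print(msg)
# embed_dim must be divisible by num_heads (got `embed_dim`: 768 and `num_heads`: 10).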