Upload modeling_opt.py
modeling_opt.py CHANGED  +7 -14
@@ -194,8 +194,7 @@ class OPTAttention(nn.Module):
 
         if (self.head_dim * num_heads) != self.embed_dim:
             raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {
-                    self.embed_dim}"
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
         self.scaling = self.head_dim**-0.5
@@ -365,16 +364,14 @@ class OPTAttention(nn.Module):
 
         if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
             raise ValueError(
-                f"Attention weights should be of size {
-                    (bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                 f" {attn_weights.size()}"
             )
 
         if attention_mask is not None:
             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                 raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
-                        attention_mask.size()}"
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                 )
             attn_weights = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len) + attention_mask
@@ -395,8 +392,7 @@ class OPTAttention(nn.Module):
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
-                    f"Head mask for a single layer should be of size {
-                        (self.num_heads,)}, but is"
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                     f" {layer_head_mask.size()}"
                 )
             attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
@@ -430,8 +426,7 @@ class OPTAttention(nn.Module):
 
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(
-                f"`attn_output` should be of size {
-                    (bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                 f" {attn_output.size()}"
             )
 
@@ -1089,8 +1084,7 @@ class OPTDecoder(OPTPreTrainedModel):
                 batch_size, mask_seq_length, device=inputs_embeds.device)
         elif attention_mask.shape[1] != mask_seq_length:
             raise ValueError(
-                f"The provided attention mask has length {
-                    attention_mask.shape[1]}, but its length should be "
+                f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be "
                 f"{mask_seq_length} (sum of the lengths of current and past inputs)"
             )
         causal_attention_mask = _prepare_4d_causal_attention_mask(
@@ -1122,8 +1116,7 @@ class OPTDecoder(OPTPreTrainedModel):
             if attn_mask is not None:
                 if attn_mask.size()[0] != (len(self.layers)):
                     raise ValueError(
-                        f"The `{mask_name}` should be specified for {
-                            len(self.layers)} layers, but it is for"
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                         f" {head_mask.size()[0]}."
                     )
 
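
For reference, a minimal standalone sketch of the error-message pattern the + lines use (not part of modeling_opt.py; the embed_dim and num_heads values are illustrative): Python implicitly concatenates adjacent f-string literals, so each check still raises a single message string while every interpolation stays on one source line.

# Minimal sketch, not from modeling_opt.py; values are illustrative only.
embed_dim, num_heads = 768, 12
head_dim = embed_dim // num_heads
if (head_dim * num_heads) != embed_dim:
    # The two adjacent f-string literals below concatenate into one message.
    raise ValueError(
        f"embed_dim must be divisible by num_heads (got `embed_dim`: {embed_dim}"
        f" and `num_heads`: {num_heads})."
    )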