Update bert_layers.py
bert_layers.py (+30 -23)
@@ -203,7 +203,7 @@ class BertUnpadSelfAttention(nn.Module):
             print(f'REARRANGED ATTENTION: {rearranged_attention.shape}')
         except:
             print(f'REARRANGED ATTENTION: {rearranged_attention[0].shape}')
-        return rearrange(attention, 'nnz h d -> nnz (h d)')
+        return rearrange(attention, 'nnz h d -> nnz (h d)'), attention_probs


 # Copy of transformer's library BertSelfOutput that will not be caught by surgery methods looking for HF BERT modules.
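For context on where the newly returned attention_probs would come from: this hunk only changes the return statement, so the sketch below is an assumption. It shows the standard softmax-attention fallback path (the names q, k, v, bias, p_dropout are illustrative, not this file's identifiers); if a FlashAttention kernel path is taken instead, the full probability matrix is normally not materialized at all.

import math
import torch
import torch.nn.functional as F
from einops import rearrange

def unpadded_self_attention_sketch(q, k, v, bias, p_dropout=0.0):
    # q, k, v: (batch, heads, seqlen, head_dim); bias: additive mask / ALiBi bias
    scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(q.size(-1))
    attention_probs = F.softmax(scores + bias, dim=-1)         # (b, h, s, s)
    attention_probs = F.dropout(attention_probs, p_dropout)
    attention = torch.matmul(attention_probs, v)               # (b, h, s, d)
    attention = rearrange(attention, 'b h s d -> (b s) h d')   # nnz-style layout
    # Mirrors the changed return: flattened heads plus the probability matrix.
    return rearrange(attention, 'nnz h d -> nnz (h d)'), attention_probs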
@@ -254,7 +254,7 @@ class BertUnpadAttention(nn.Module):
             attn_mask: None or (batch, max_seqlen_in_batch)
             bias: None or (batch, heads, max_seqlen_in_batch, max_seqlen_in_batch)
         """
-        self_output = self.self(input_tensor, cu_seqlens, max_s, indices,
+        self_output, attention_probs = self.self(input_tensor, cu_seqlens, max_s, indices,
                                 attn_mask, bias)

         try:
@@ -266,7 +266,7 @@ class BertUnpadAttention(nn.Module):
             return self.output(index_first_axis(self_output, subset_idx),
                                index_first_axis(input_tensor, subset_idx))
         else:
-            return self.output(self_output, input_tensor)
+            return self.output(self_output, input_tensor), attention_probs


 class BertGatedLinearUnitMLP(nn.Module):
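One detail worth noting in this hunk: only the else branch now returns a (output, attention_probs) tuple, while the subset_idx branch at lines 266-267 still returns a bare tensor, and BertLayer below unpacks two values unconditionally. A small self-contained illustration of why that distinction matters (purely illustrative, not the commit's code):

import torch

def unpack_like_bertlayer(result):
    # Two-element unpack, as BertLayer.forward now does with self.attention(...).
    attention_output, attention_probs = result
    return attention_output, attention_probs

# Tuple return (else branch): unpacks as intended.
unpack_like_bertlayer((torch.zeros(4, 8), torch.zeros(1, 4, 4)))

# Bare-tensor return (subset_idx branch): unpacking iterates the tensor along
# dim 0 instead, so it either mis-unpacks (first dim == 2) or raises.
try:
    unpack_like_bertlayer(torch.zeros(4, 8))
except ValueError as err:
    print(err)  # too many values to unpack (expected 2)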
@@ -347,12 +347,12 @@ class BertLayer(nn.Module):
             attn_mask: None or (batch, max_seqlen_in_batch)
             bias: None or (batch, heads, max_seqlen_in_batch, max_seqlen_in_batch)
         """
-        attention_output = self.attention(hidden_states, cu_seqlens, seqlen,
+        attention_output, attention_probs = self.attention(hidden_states, cu_seqlens, seqlen,
                                           subset_idx, indices, attn_mask, bias)
         print(f'BertLayer attention_output shape: {attention_output.shape}')
         layer_output = self.mlp(attention_output)
         print(f'BertLayer layer_output shape: {layer_output.shape}')
-        return layer_output, attention_output  # JAANDOUI: this only returns layer_output in the original work.
+        return layer_output, attention_output, attention_probs  # JAANDOUI: this only returns layer_output in the original work.


 class BertEncoder(nn.Module):
@@ -467,11 +467,12 @@ class BertEncoder(nn.Module):

         all_encoder_layers = []
         all_attention_weights = []  # List to store attention weights
+        all_attention_probs = []

         if subset_mask is None:
             for layer_module in self.layer:
                 # JAANDOUI: Since we get now attention too, we need to unpack 2 elements instead of 1.
-                hidden_states, attention_weights = layer_module(hidden_states,
+                hidden_states, attention_weights, attention_probs = layer_module(hidden_states,
                                                                 cu_seqlens,
                                                                 seqlen,
                                                                 None,
@@ -482,6 +483,8 @@ class BertEncoder(nn.Module):
                 # print(f'Inner Attention: {attention_weights}')
                 print(f'Inner Attention shape: {attention_weights.shape}')
                 all_attention_weights.append(attention_weights)  # Store attention weights
+                all_attention_probs.append(attention_probs)  # Store attention probs
+
                 if output_all_encoded_layers:
                     all_encoder_layers.append(hidden_states)
             # Pad inputs and mask. It will insert back zero-padded tokens.
@@ -494,7 +497,7 @@ class BertEncoder(nn.Module):
             for i in range(len(self.layer) - 1):
                 layer_module = self.layer[i]
                 # JAANDOUI: Since we get now attention too, we need to unpack 2 elements instead of 1.
-                hidden_states, attention_weights = layer_module(hidden_states,
+                hidden_states, attention_weights, attention_probs = layer_module(hidden_states,
                                                                 cu_seqlens,
                                                                 seqlen,
                                                                 None,
@@ -502,12 +505,14 @@ class BertEncoder(nn.Module):
                                                                 attn_mask=attention_mask,
                                                                 bias=alibi_attn_mask)
                 all_attention_weights.append(attention_weights)  # JAANDOUI: Store attention weights
+                all_attention_probs.append(attention_probs)  # Store attention probs
+
                 if output_all_encoded_layers:
                     all_encoder_layers.append(hidden_states)
             subset_idx = torch.nonzero(subset_mask[attention_mask_bool],
                                        as_tuple=False).flatten()
             # JAANDOUI: Since we get now attention too, we need to unpack 2 elements instead of 1.
-            hidden_states, attention_weights = self.layer[-1](hidden_states,
+            hidden_states, attention_weights, attention_probs = self.layer[-1](hidden_states,
                                                               cu_seqlens,
                                                               seqlen,
                                                               subset_idx=subset_idx,
@@ -515,6 +520,8 @@ class BertEncoder(nn.Module):
                                                               attn_mask=attention_mask,
                                                               bias=alibi_attn_mask)
             all_attention_weights.append(attention_weights)  # JAANDOUI: appending the attention of different layers together.
+            all_attention_probs.append(attention_probs)  # Store attention probs
+
             # print(f'here is the matrix of attentions inside encoder: \n {all_attention_weights}')
             print(f'and this is the [0]shape inside encoder: \n {all_attention_weights[0].shape}')
             # print(f'NUMBER6: {all_attention_weights}')
@@ -522,7 +529,7 @@ class BertEncoder(nn.Module):
             all_encoder_layers.append(hidden_states)

         # JAANDOUI: Since we now return both, we need to handle them wherever BertEncoder forward is called.
-        return all_encoder_layers, all_attention_weights  # Return both hidden states and attention weights
+        return all_encoder_layers, all_attention_weights, all_attention_probs  # Return both hidden states and attention weights
         # return all_encoder_layers  # JAANDOUI: original return.


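A hedged usage sketch of the encoder's new three-value return (the encoder instance, embedding_output, and attention_mask are assumed to exist; subset_mask and the ALiBi bias handling are omitted here for brevity):

from typing import List, Tuple
import torch

def run_encoder(encoder: torch.nn.Module,
                embedding_output: torch.Tensor,
                attention_mask: torch.Tensor
                ) -> Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]]:
    # BertEncoder.forward now hands back three values instead of one list.
    all_encoder_layers, all_attention_weights, all_attention_probs = encoder(
        embedding_output,
        attention_mask,
        output_all_encoded_layers=False)
    sequence_output = all_encoder_layers[-1]  # final hidden states
    return sequence_output, all_attention_weights, all_attention_probs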
@@ -649,7 +656,7 @@ class BertModel(BertPreTrainedModel):

         # JAANDOUI: first part where we call self.encoder (which is the instance of BertEncoder defined here)
         # JAANDOUI: need to return the attention weights here too.
-        encoder_outputs, all_attention_weights = self.encoder(
+        encoder_outputs, all_attention_weights, all_attention_probs = self.encoder(
             embedding_output,
             attention_mask,
             output_all_encoded_layers=output_all_encoded_layers,
@@ -681,11 +688,11 @@ class BertModel(BertPreTrainedModel):
         # JAANDOUI: returning all_attention_weights too
         if self.pooler is not None:
             # print(f'NUMBER8: {all_attention_weights}')
-            return encoder_outputs, pooled_output, all_attention_weights
+            return encoder_outputs, pooled_output, all_attention_weights, all_attention_probs

         # JAANDOUI: returning all_attention_weights too
         # print(f'NUMBER9: {all_attention_weights}')
-        return encoder_outputs, None, all_attention_weights
+        return encoder_outputs, None, all_attention_weights, all_attention_probs
         # JAANDOUI: need to handle the returned elements wherever BertModel is instantiated.

 ###################
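The classification head further down indexes these returns positionally (outputs[1], outputs[3]), so the assumed layout is worth spelling out; a small hedged helper, not part of the commit:

from typing import Any, Dict, Tuple

def split_bert_model_outputs(outputs: Tuple[Any, ...]) -> Dict[str, Any]:
    # Positional layout implied by the two return statements above.
    return {
        'encoder_outputs': outputs[0],
        'pooled_output': outputs[1],          # None when the pooler is disabled
        'all_attention_weights': outputs[2],  # per-layer list
        'all_attention_probs': outputs[3],    # per-layer list, new in this commit
    }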
@@ -924,27 +931,27 @@ class BertForSequenceClassification(BertPreTrainedModel):
         pooled_output = outputs[1]

         try:
-            print(f'outputs[2] before reassignment SHAPE: {outputs[
+            print(f'outputs[2] before reassignment SHAPE: {outputs[3][0].shape} ')
         except:
-            print(print(f'outputs[2] before reassignment LENGTH: {len(outputs[
+            print(print(f'outputs[2] before reassignment LENGTH: {len(outputs[3][0])} '))

         # JAANDOUI:
-
+        all_attention_probs = outputs[3]

         try:
-            print(f'outputs[2] AFTER reassignment SHAPE: {outputs[
+            print(f'outputs[2] AFTER reassignment SHAPE: {outputs[3][0].shape} ')
         except:
-            print(print(f'outputs[2] AFTER reassignment LENGTH: {len(outputs[
+            print(print(f'outputs[2] AFTER reassignment LENGTH: {len(outputs[3][0])} '))



         try:
-            print(f'all_attention_weights last: {
+            print(f'all_attention_weights last: {all_attention_probs.shape}')
         except:
             try:
-                print(f'last first except: {
+                print(f'last first except: {all_attention_probs[0].shape}')
             except:
-                print(f'last second except: {len(
+                print(f'last second except: {len(all_attention_probs[0])}')


         pooled_output = self.dropout(pooled_output)
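The nested try/except print blocks above only distinguish "has a .shape" from "has a len()"; an illustrative (not committed) helper that reports the same information without swallowing unrelated exceptions:

from typing import Any

def describe(name: str, value: Any) -> str:
    # Report a tensor's shape, a container's length, or fall back to the type.
    if hasattr(value, 'shape'):
        return f'{name}: tensor with shape {tuple(value.shape)}'
    if hasattr(value, '__len__'):
        return f'{name}: container of length {len(value)}'
    return f'{name}: {type(value).__name__}'

# Example: print(describe('all_attention_probs', outputs[3]))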
@@ -984,9 +991,9 @@ class BertForSequenceClassification(BertPreTrainedModel):

         # print(outputs.attentions)
         try:
-            print(f'not stacked final attention SHAPE: {outputs[
+            print(f'not stacked final attention SHAPE: {outputs[3][0].shape}')
         except:
-            print(f'not stacked final attention LEN: {len(outputs[
+            print(f'not stacked final attention LEN: {len(outputs[3])}')

         # try:
         #     print(f'STACKED final attention SHAPE: {(outputs.attentions).shape}')
@@ -1002,6 +1009,6 @@ class BertForSequenceClassification(BertPreTrainedModel):
             hidden_states=outputs[0],
             #JAANDOUI: returning all_attention_weights here
             # attentions=torch.stack(outputs[2], dim=0),
-            attentions=outputs[
+            attentions=outputs[3],  # JAANDOUI TODO: should I stack here ????
         )

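On the "TODO: should I stack here ????": Hugging Face's SequenceClassifierOutput.attentions is conventionally a tuple with one (batch_size, num_heads, seq_len, seq_len) tensor per layer, so passing the per-layer list through unstacked already matches that convention. Stacking into a single (num_layers, ...) tensor is optional and only well defined when every layer's tensor has the same shape; a hedged helper sketch:

from typing import List, Optional
import torch

def maybe_stack_attentions(per_layer: List[torch.Tensor]) -> Optional[torch.Tensor]:
    # Stack per-layer attention tensors into (num_layers, ...) when shapes agree;
    # otherwise keep them as a list/tuple (e.g. unpadded nnz layouts can differ).
    if not per_layer:
        return None
    first_shape = per_layer[0].shape
    if any(t.shape != first_shape for t in per_layer):
        return None
    return torch.stack(per_layer, dim=0)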