Update bert_layers.py
bert_layers.py
@@ -169,12 +169,12 @@ class BertUnpadSelfAttention(nn.Module):
                 self.attention_head_size)
             attention_scores = attention_scores + bias
             attention_probs = nn.functional.softmax(attention_scores, dim=-1)
-            print(f'BUSA: attention_probs 1 shape: {attention_probs.shape}')
+            # print(f'BUSA: attention_probs 1 shape: {attention_probs.shape}')
             attention_probs = self.dropout(attention_probs)
-            print(f'BUSA: attention_probs 2 shape: {attention_probs.shape}')
+            # print(f'BUSA: attention_probs 2 shape: {attention_probs.shape}')
             attention = torch.matmul(attention_probs, v).permute(0, 2, 1,
                                                                  3)  # b s h d
-            print(f'BUSA: attention shape: {attention.shape}')
+            # print(f'BUSA: attention shape: {attention.shape}')
         else:
             # Triton implementation only supports 0 attention dropout
             convert_dtype = qkv.dtype not in [torch.float16, torch.bfloat16]
@@ -185,24 +185,24 @@ class BertUnpadSelfAttention(nn.Module):
                 bias_dtype = bias.dtype
                 bias = bias.to(torch.float16)
                 attention = flash_attn_qkvpacked_func(qkv, bias)
-                print(f'BUSA Triton: attention 0 shape: {attention_probs.shape}')
+                # print(f'BUSA Triton: attention 0 shape: {attention_probs.shape}')
                 attention = attention.to(orig_dtype)
-                print(f'BUSA Triton: attention 1 shape: {attention_probs.shape}')
+                # print(f'BUSA Triton: attention 1 shape: {attention_probs.shape}')
                 bias = bias.to(bias_dtype)
             else:
                 attention = flash_attn_qkvpacked_func(qkv, bias)
-                print(f'BUSA Triton: attention 2 shape: {attention_probs.shape}')
+                # print(f'BUSA Triton: attention 2 shape: {attention_probs.shape}')
         # attn_mask is 1 for attend and 0 for don't
         attention = unpad_input_only(attention, torch.squeeze(attn_mask) == 1)
-        print(f'BUSA unpadded final attention shape: {attention_probs.shape}')
-        print(f'ATTENTION: {attention.shape}')
+        # print(f'BUSA unpadded final attention shape: {attention_probs.shape}')
+        # print(f'ATTENTION: {attention.shape}')

-        print(f'PROBLEM HERE: UNDERSTAND IT!!')
+        # print(f'PROBLEM HERE: UNDERSTAND IT!!')
         rearranged_attention = rearrange(attention, 'nnz h d -> nnz (h d)')
-        try:
-            print(f'REARRANGED ATTENTION: {rearranged_attention.shape}')
-        except:
-            print(f'REARRANGED ATTENTION: {rearranged_attention[0].shape}')
+        # try:
+        #     print(f'REARRANGED ATTENTION: {rearranged_attention.shape}')
+        # except:
+        #     print(f'REARRANGED ATTENTION: {rearranged_attention[0].shape}')
         return rearrange(attention, 'nnz h d -> nnz (h d)'), attention_probs


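Note (not part of the diff): the debug prints commented out above were inspecting the head-flattening done by rearrange before the return. A minimal, self-contained sketch of that reshape with made-up sizes (nnz = number of unpadded tokens, h = heads, d = head dimension):

import torch
from einops import rearrange

nnz, h, d = 10, 12, 64                       # hypothetical sizes
attention = torch.randn(nnz, h, d)           # per-token, per-head attention output
flat = rearrange(attention, 'nnz h d -> nnz (h d)')
print(flat.shape)                            # torch.Size([10, 768]) == (nnz, h * d)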
@@ -257,10 +257,10 @@ class BertUnpadAttention(nn.Module):
         self_output, attention_probs = self.self(input_tensor, cu_seqlens, max_s, indices,
                                                  attn_mask, bias)

-        try:
-            print(f'IMPORTANT: {self_output.shape}')
-        except:
-            print(f'IMPORTANT2: {self_output[0].shape}')
+        # try:
+        #     print(f'IMPORTANT: {self_output.shape}')
+        # except:
+        #     print(f'IMPORTANT2: {self_output[0].shape}')

         if subset_idx is not None:
             return self.output(index_first_axis(self_output, subset_idx),
@@ -349,9 +349,9 @@ class BertLayer(nn.Module):
         """
         attention_output, attention_probs = self.attention(hidden_states, cu_seqlens, seqlen,
                                                            subset_idx, indices, attn_mask, bias)
-        print(f'BertLayer attention_output shape: {attention_output.shape}')
+        # print(f'BertLayer attention_output shape: {attention_output.shape}')
         layer_output = self.mlp(attention_output)
-        print(f'BertLayer layer_output shape: {layer_output.shape}')
+        # print(f'BertLayer layer_output shape: {layer_output.shape}')
         return layer_output, attention_output, attention_probs  # JAANDOUI: this only returns layer_output in the original work.


@@ -372,7 +372,7 @@ class BertEncoder(nn.Module):
             [copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

         self.num_attention_heads = config.num_attention_heads
-        print(f'nbr of attention heads: {self.num_attention_heads}')
+        # print(f'nbr of attention heads: {self.num_attention_heads}')
         # The alibi mask will be dynamically expanded if it is too small for
         # the input the model receives. But it generally helps to initialize it
         # to a reasonably large size to help pre-allocate CUDA memory.
@@ -481,7 +481,7 @@ class BertEncoder(nn.Module):
                                              bias=alibi_attn_mask)
                 # JAANDOUI
                 # print(f'Inner Attention: {attention_weights}')
-                print(f'Inner Attention shape: {attention_weights.shape}')
+                # print(f'Inner Attention shape: {attention_weights.shape}')
                 all_attention_weights.append(attention_weights)  # Store attention weights
                 all_attention_probs.append(attention_probs)  # Store attention probs

@@ -523,7 +523,7 @@ class BertEncoder(nn.Module):
                 all_attention_probs.append(attention_probs)  # Store attention probs

         # print(f'here is the matrix of attentions inside encoder: \n {all_attention_weights}')
-        print(f'and this is the [0]shape inside encoder: \n {all_attention_weights[0].shape}')
+        # print(f'and this is the [0]shape inside encoder: \n {all_attention_weights[0].shape}')
         # print(f'NUMBER6: {all_attention_weights}')
         if not output_all_encoded_layers:
             all_encoder_layers.append(hidden_states)
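Note (not part of the diff): the encoder hunks above collect one attention tensor per layer in all_attention_weights / all_attention_probs. A minimal sketch of stacking such a list for downstream inspection, assuming every layer yields a tensor of the same shape (sizes below are invented):

import torch

num_layers, b, h, s = 12, 2, 12, 128         # hypothetical sizes
all_attention_probs = [torch.softmax(torch.randn(b, h, s, s), dim=-1)
                       for _ in range(num_layers)]
stacked = torch.stack(all_attention_probs)   # shape: (num_layers, b, h, s, s)
print(stacked.shape)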
@@ -663,7 +663,7 @@ class BertModel(BertPreTrainedModel):
             subset_mask=subset_mask)
         # print(f'NUMBER7: {all_attention_weights}')
         # print(f'here is the matrix of attentions in BERT: \n {all_attention_weights}')
-        print(f'and this is the [0]shape in BERT: \n {all_attention_weights[0].shape}')
+        # print(f'and this is the [0]shape in BERT: \n {all_attention_weights[0].shape}')

         if masked_tokens_mask is None:
             sequence_output = encoder_outputs[-1]
@@ -930,28 +930,28 @@ class BertForSequenceClassification(BertPreTrainedModel):

         pooled_output = outputs[1]

-        try:
-            print(f'outputs[2] before reassignment SHAPE: {outputs[3][0].shape} ')
-        except:
-            print(print(f'outputs[2] before reassignment LENGTH: {len(outputs[3][0])} '))
+        # try:
+        #     print(f'outputs[2] before reassignment SHAPE: {outputs[3][0].shape} ')
+        # except:
+        #     print(print(f'outputs[2] before reassignment LENGTH: {len(outputs[3][0])} '))

         # JAANDOUI:
         all_attention_probs = outputs[3]

-        try:
-            print(f'outputs[2] AFTER reassignment probsss SHAPE: {outputs[3][0].shape} ')
-        except:
-            print(print(f'outputs[2] AFTER reassignment probsss LENGTH: {len(outputs[3][0])} '))
+        # try:
+        #     print(f'outputs[2] AFTER reassignment probsss SHAPE: {outputs[3][0].shape} ')
+        # except:
+        #     print(print(f'outputs[2] AFTER reassignment probsss LENGTH: {len(outputs[3][0])} '))



-        try:
-            print(f'all_attention_weights probsss last: {all_attention_probs.shape}')
-        except:
-            try:
-                print(f'last first except probsss: {all_attention_probs[0].shape}')
-            except:
-                print(f'last second except probsss: {len(all_attention_probs[0])}')
+        # try:
+        #     print(f'all_attention_weights probsss last: {all_attention_probs.shape}')
+        # except:
+        #     try:
+        #         print(f'last first except probsss: {all_attention_probs[0].shape}')
+        #     except:
+        #         print(f'last second except probsss: {len(all_attention_probs[0])}')


         pooled_output = self.dropout(pooled_output)
@@ -990,10 +990,10 @@ class BertForSequenceClassification(BertPreTrainedModel):
             return ((loss,) + output) if loss is not None else output

         # print(outputs.attentions)
-        try:
-            print(f'not stacked final attention probsss SHAPE: {outputs[3][0].shape}')
-        except:
-            print(f'not stacked final attention probsss LEN: {len(outputs[3])}')
+        # try:
+        #     print(f'not stacked final attention probsss SHAPE: {outputs[3][0].shape}')
+        # except:
+        #     print(f'not stacked final attention probsss LEN: {len(outputs[3])}')

         # try:
         #     print(f'STACKED final attention SHAPE: {(outputs.attentions).shape}')
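Usage note (not part of the diff): per the hunk at line 939, the classifier reads the collected attention probabilities from outputs[3]. A hedged sketch of how a caller might inspect them; the tuple index comes from this diff, while the helper, its name, and the stand-in shapes are hypothetical:

import torch

def summarize_attention_probs(outputs):
    # Index 3 is where this diff stores the per-layer attention probabilities.
    all_attention_probs = outputs[3]
    for i, probs in enumerate(all_attention_probs):
        if torch.is_tensor(probs):
            print(f'layer {i}: shape {tuple(probs.shape)}')
        else:
            print(f'layer {i}: non-tensor entry of length {len(probs)}')

# Stand-in for a real forward pass (shapes invented for illustration):
fake_outputs = (None, None, None,
                [torch.rand(2, 12, 128, 128) for _ in range(12)])
summarize_attention_probs(fake_outputs)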