Question Answering
Transformers
Safetensors
English
doge
text-generation
custom_code
JingzeShi committed on
Commit 067141a (verified) · 1 Parent(s): 990fb24

Upload DogeForCausalLM

Files changed (5)
  1. config.json +47 -37
  2. configuration_doge.py +83 -46
  3. generation_config.json +7 -7
  4. model.safetensors +2 -2
  5. modeling_doge.py +382 -256
config.json CHANGED
@@ -1,37 +1,47 @@
-{
-  "_name_or_path": "./results/Doge-60M-Instruct",
-  "architectures": [
-    "DogeForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoConfig": "configuration_doge.DogeConfig",
-    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
-  },
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "expert_retrieval_size": 256,
-  "hidden_act": "silu",
-  "hidden_bias": false,
-  "hidden_dropout": 0.0,
-  "hidden_size": 512,
-  "initializer_range": 0.02,
-  "intermediate_size": 2048,
-  "is_moe": false,
-  "max_position_embeddings": 2048,
-  "model_type": "doge",
-  "num_attention_heads": 4,
-  "num_cdmmoe_experts": 4096,
-  "num_cdmmoe_experts_per_head": 8,
-  "num_cdmmoe_heads": 4,
-  "num_hidden_layers": 8,
-  "pad_token_id": 0,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 10000.0,
-  "tie_word_embeddings": false,
-  "torch_dtype": "float32",
-  "transformers_version": "4.46.1",
-  "use_cache": true,
-  "vocab_size": 32768
-}
+{
+  "_name_or_path": "./results/Doge-60M-Instruct-DPO",
+  "architectures": [
+    "DogeForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_doge.DogeConfig",
+    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
+  },
+  "bos_token_id": 0,
+  "dynamic_mask_ratio": 0.0,
+  "eos_token_id": 1,
+  "expert_retrieval_size": 256,
+  "hidden_act": "silu",
+  "hidden_bias": false,
+  "hidden_dropout": 0.0,
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 1024,
+  "is_moe": false,
+  "max_position_embeddings": 2048,
+  "model_type": "doge",
+  "num_attention_heads": 4,
+  "num_cdmmoe_experts": 2048,
+  "num_cdmmoe_experts_per_head": 8,
+  "num_cdmmoe_heads": 4,
+  "num_cdmoe_experts": 16348,
+  "num_cdmoe_experts_per_head": 8,
+  "num_cdmoe_heads": 4,
+  "num_channels": 3,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 2,
+  "pad_token_id": 2,
+  "patch_size": 16,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "factor": 4.0,
+    "original_max_position_embeddings": 2048,
+    "rope_type": "dynamic"
+  },
+  "rope_theta": 10000.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0.dev0",
+  "use_cache": true,
+  "vocab_size": 32768
+}
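
The updated config keeps the `auto_map` entries pointing at the custom `configuration_doge.DogeConfig` and `modeling_doge.DogeForCausalLM` classes, so loading this checkpoint goes through `trust_remote_code`. A minimal loading sketch; the repo id below is a placeholder, not taken from this commit:

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo_id = "path/or/repo-id-of-this-checkpoint"  # placeholder, not part of the commit

# auto_map in config.json routes these calls to the custom Doge classes
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

print(config.num_attention_heads, config.num_key_value_heads)  # 4, 2 per the new config
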
configuration_doge.py CHANGED
@@ -25,20 +25,23 @@ from transformers.modeling_rope_utils import rope_config_validation
25
  class DogeConfig(PretrainedConfig):
26
  r"""
27
  This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge
28
- model according to the specified arguments, defining the model architecture like [LoserCheems/doge-tiny-test](https://huggingface.co/LoserCheems/doge-tiny-test)
29
 
30
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
31
  documentation from [`PretrainedConfig`] for more information.
32
 
33
  Args:
34
  vocab_size (`int`, *optional*, defaults to 32768):
35
- Vocabulary size of the Doge model. Defines the number of different tokens that can be represented by the
36
- `inputs_ids` passed when calling [`DogeModel`]
 
 
 
37
  hidden_size (`int`, *optional*, defaults to 1024):
38
  Dimension of the hidden representations.
39
- intermediate_size (`int`, *optional*, defaults to 4096):
40
- Dimension of the CDMoE representations.
41
- num_hidden_layers (`int`, *optional*, defaults to 16):
42
  Number of hidden layers in the Transformer decoder.
43
  hidden_bias (`bool`, *optional*, defaults to `False`):
44
  Whether to use bias in the hidden layers.
@@ -51,24 +54,21 @@ class DogeConfig(PretrainedConfig):
51
  rope_theta (`float`, *optional*, defaults to 10000.0):
52
  The base period of the RoPE embeddings.
53
  rope_scaling (`Dict`, *optional*):
54
- Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
55
- and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
56
- accordingly.
57
  Expected contents:
58
  `rope_type` (`str`):
59
- The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
60
- 'llama3'], with 'default' being the original RoPE implementation.
61
  `factor` (`float`, *optional*):
62
- Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
63
- most scaling types, a `factor` of x will enable the model to handle sequences of length x *
64
- original maximum pre-trained length.
65
  `original_max_position_embeddings` (`int`, *optional*):
66
- Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
67
- pretraining.
68
  `attention_factor` (`float`, *optional*):
69
  Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
70
- computation. If unspecified, it defaults to value recommended by the implementation, using the
71
- `factor` field to infer the suggested value.
72
  `beta_fast` (`float`, *optional*):
73
  Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
74
  ramp function. If unspecified, it defaults to 32.
@@ -76,13 +76,11 @@ class DogeConfig(PretrainedConfig):
76
  Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
77
  ramp function. If unspecified, it defaults to 1.
78
  `short_factor` (`List[float]`, *optional*):
79
- Only used with 'longrope'. The scaling factor to be applied to short contexts (<
80
- `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
81
- size divided by the number of attention heads divided by 2
82
  `long_factor` (`List[float]`, *optional*):
83
- Only used with 'longrope'. The scaling factor to be applied to long contexts (<
84
- `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
85
- size divided by the number of attention heads divided by 2
86
  `low_freq_factor` (`float`, *optional*):
87
  Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
88
  `high_freq_factor` (`float`, *optional*):
@@ -100,56 +98,86 @@ class DogeConfig(PretrainedConfig):
100
  Beginning of stream token id.
101
  eos_token_id (`int`, *optional*, defaults to 2):
102
  End of stream token id.
103
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
104
  Whether to tie weight embeddings
105
  num_attention_heads (`int`, *optional*, defaults to 8):
106
  Number of attention heads for each attention layer in the Transformer decoder.
 
 
 
 
 
 
 
107
  attention_dropout (`float`, *optional*, defaults to 0.0):
108
  The dropout ratio for the attention probabilities.
 
 
109
  is_moe (`bool`, *optional*, defaults to `False`):
110
  Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
111
- num_cdmmoe_experts (`int`, *optional*, defaults to 4096):
112
- Number of Private Experts for the Cross Domain Mixture of Experts.
113
- num_cdmmoe_heads (`int`, *optional*, defaults to 4):
114
  Number of heads of Private Experts for the Cross Domain Mixture of Experts.
115
- num_cdmmoe_experts_per_head (`int`, *optional*, defaults to 8):
116
  Number of Private Experts per head for the Cross Domain Mixture of Experts.
117
- expert_retrieval_size (`int`, *optional*, defaults to 256):
118
  Dimension of the Expert retrieval states for the Cross Domain Mixture of Experts.
119
  """
120
 
121
  model_type = "doge"
122
  keys_to_ignore_at_inference = ["past_key_values"]
 
 
 
 
 
 
 
 
 
 
 
123
 
124
  def __init__(
125
  self,
126
  vocab_size=32768,
 
 
127
  hidden_size=1024,
128
- intermediate_size=4096,
129
- num_hidden_layers=16,
130
  hidden_bias=False,
131
  hidden_dropout=0.0,
132
  hidden_act="silu",
133
  max_position_embeddings=2048,
134
  rope_theta=10000.0,
135
- rope_scaling=None,
 
 
 
 
136
  initializer_range=0.02,
137
  rms_norm_eps=1e-06,
138
  use_cache=True,
139
- pad_token_id=0,
140
- bos_token_id=1,
141
- eos_token_id=2,
142
- tie_word_embeddings=False,
143
  num_attention_heads=8,
 
144
  attention_dropout=0.0,
 
145
  is_moe=False,
146
- num_cdmmoe_experts=4096,
147
- num_cdmmoe_heads=4,
148
- num_cdmmoe_experts_per_head=8,
149
- expert_retrieval_size=256,
150
  **kwargs,
151
  ):
152
  self.vocab_size = vocab_size
 
 
153
  self.hidden_size = hidden_size
154
  self.intermediate_size = intermediate_size
155
  self.num_hidden_layers = num_hidden_layers
@@ -162,16 +190,18 @@ class DogeConfig(PretrainedConfig):
162
  self.initializer_range = initializer_range
163
  self.rms_norm_eps = rms_norm_eps
164
  self.use_cache = use_cache
165
- self.pad_token_id = pad_token_id
166
  self.bos_token_id = bos_token_id
167
  self.eos_token_id = eos_token_id
 
168
  self.tie_word_embeddings = tie_word_embeddings
169
  self.num_attention_heads = num_attention_heads
 
170
  self.attention_dropout = attention_dropout
 
171
  self.is_moe = is_moe
172
- self.num_cdmmoe_experts = num_cdmmoe_experts
173
- self.num_cdmmoe_heads = num_cdmmoe_heads
174
- self.num_cdmmoe_experts_per_head = num_cdmmoe_experts_per_head
175
  self.expert_retrieval_size = expert_retrieval_size
176
 
177
  # Validate the correctness of rotary position embeddings parameters
@@ -180,10 +210,17 @@ class DogeConfig(PretrainedConfig):
180
  self.rope_scaling["rope_type"] = self.rope_scaling["type"]
181
  rope_config_validation(self)
182
 
 
 
 
 
183
  super().__init__(
184
- pad_token_id=pad_token_id,
185
  bos_token_id=bos_token_id,
186
  eos_token_id=eos_token_id,
 
187
  tie_word_embeddings=tie_word_embeddings,
188
  **kwargs,
189
  )
 
 
 
 
25
  class DogeConfig(PretrainedConfig):
26
  r"""
27
  This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge
28
+ model according to the specified arguments, defining the model architecture like [JingzeShi/Doge-20M](https://huggingface.co/JingzeShi/Doge-20M).
29
 
30
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
31
  documentation from [`PretrainedConfig`] for more information.
32
 
33
  Args:
34
  vocab_size (`int`, *optional*, defaults to 32768):
35
+ Vocabulary size of the Doge model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`]
36
+ num_channels (`int`, *optional*, defaults to 3):
37
+ Number of channels in the input image.
38
+ patch_size (`int`, *optional*, defaults to 16):
39
+ Patch size of Vision Transformer Embeddings.
40
  hidden_size (`int`, *optional*, defaults to 1024):
41
  Dimension of the hidden representations.
42
+ intermediate_size (`int`, *optional*, defaults to 2048):
43
+ Dimension of the MLP representations.
44
+ num_hidden_layers (`int`, *optional*, defaults to 32):
45
  Number of hidden layers in the Transformer decoder.
46
  hidden_bias (`bool`, *optional*, defaults to `False`):
47
  Whether to use bias in the hidden layers.
 
54
  rope_theta (`float`, *optional*, defaults to 10000.0):
55
  The base period of the RoPE embeddings.
56
  rope_scaling (`Dict`, *optional*):
57
+ Dictionary containing the scaling configuration for the RoPE embeddings.
58
+ NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly.
 
59
  Expected contents:
60
  `rope_type` (`str`):
61
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
 
62
  `factor` (`float`, *optional*):
63
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
64
+ In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length.
 
65
  `original_max_position_embeddings` (`int`, *optional*):
66
+ Used with 'dynamic', 'longrope' and 'llama3'.
67
+ The original max position embeddings used during pretraining.
68
  `attention_factor` (`float`, *optional*):
69
  Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
70
+ computation.
71
+ If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
72
  `beta_fast` (`float`, *optional*):
73
  Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
74
  ramp function. If unspecified, it defaults to 32.
 
76
  Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
77
  ramp function. If unspecified, it defaults to 1.
78
  `short_factor` (`List[float]`, *optional*):
79
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`).
80
+ Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
 
81
  `long_factor` (`List[float]`, *optional*):
82
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<`original_max_position_embeddings`).
83
+ Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
 
84
  `low_freq_factor` (`float`, *optional*):
85
  Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
86
  `high_freq_factor` (`float`, *optional*):
 
98
  Beginning of stream token id.
99
  eos_token_id (`int`, *optional*, defaults to 2):
100
  End of stream token id.
101
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`):
102
  Whether to tie weight embeddings
103
  num_attention_heads (`int`, *optional*, defaults to 8):
104
  Number of attention heads for each attention layer in the Transformer decoder.
105
+ num_key_value_heads (`int`, *optional*, defaults to `None`):
106
+ This is the number of key_value heads that should be used to implement Grouped Query Attention.
107
+ If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
108
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
109
+ When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group.
110
+ For more details checkout [this paper](https://arxiv.org/pdf/2305.13245.pdf).
111
+ If it is not specified, will default to `num_attention_heads`.
112
  attention_dropout (`float`, *optional*, defaults to 0.0):
113
  The dropout ratio for the attention probabilities.
114
+ dynamic_mask_ratio (`float`, *optional*, defaults to 0.0, range [0, 1]):
115
+ The ratio to control the proportion of the dynamic mask filled with the minimum value.
116
  is_moe (`bool`, *optional*, defaults to `False`):
117
  Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
118
+ num_cdmoe_experts (`int`, *optional*, defaults to 16348):
119
+ Number of Private Experts for the Cross Domain Mixture of Experts. calculation formula: :math:`\text{num_cdmoe_experts} = (32 \times \text{num_cdmoe_heads})^2`
120
+ num_cdmoe_heads (`int`, *optional*, defaults to 4):
121
  Number of heads of Private Experts for the Cross Domain Mixture of Experts.
122
+ num_cdmoe_experts_per_head (`int`, *optional*, defaults to 8):
123
  Number of Private Experts per head for the Cross Domain Mixture of Experts.
124
+ expert_retrieval_size (`int`, *optional*, defaults to 64):
125
  Dimension of the Expert retrieval states for the Cross Domain Mixture of Experts.
126
  """
127
 
128
  model_type = "doge"
129
  keys_to_ignore_at_inference = ["past_key_values"]
130
+ # Default tensor parallel plan for base model `DogeModel`
131
+ base_model_tp_plan = {
132
+ "layers.*.self_attn.q_proj": "colwise",
133
+ "layers.*.self_attn.k_proj": "colwise",
134
+ "layers.*.self_attn.v_proj": "colwise",
135
+ "layers.*.self_attn.dt_proj": "colwise",
136
+ "layers.*.self_attn.o_proj": "rowwise",
137
+ "layers.*.mlp.gate_proj": "colwise",
138
+ "layers.*.mlp.up_proj": "colwise",
139
+ "layers.*.mlp.down_proj": "rowwise",
140
+ }
141
 
142
  def __init__(
143
  self,
144
  vocab_size=32768,
145
+ num_channels=3,
146
+ patch_size=16,
147
  hidden_size=1024,
148
+ intermediate_size=2048,
149
+ num_hidden_layers=32,
150
  hidden_bias=False,
151
  hidden_dropout=0.0,
152
  hidden_act="silu",
153
  max_position_embeddings=2048,
154
  rope_theta=10000.0,
155
+ rope_scaling={
156
+ "rope_type": "dynamic",
157
+ "factor": 4.0,
158
+ "original_max_position_embeddings": 2048,
159
+ },
160
  initializer_range=0.02,
161
  rms_norm_eps=1e-06,
162
  use_cache=True,
163
+ bos_token_id=0,
164
+ eos_token_id=1,
165
+ pad_token_id=2,
166
+ tie_word_embeddings=True,
167
  num_attention_heads=8,
168
+ num_key_value_heads=None,
169
  attention_dropout=0.0,
170
+ dynamic_mask_ratio=0.0,
171
  is_moe=False,
172
+ num_cdmoe_experts=16348,
173
+ num_cdmoe_heads=4,
174
+ num_cdmoe_experts_per_head=8,
175
+ expert_retrieval_size=64,
176
  **kwargs,
177
  ):
178
  self.vocab_size = vocab_size
179
+ self.num_channels = num_channels
180
+ self.patch_size = patch_size
181
  self.hidden_size = hidden_size
182
  self.intermediate_size = intermediate_size
183
  self.num_hidden_layers = num_hidden_layers
 
190
  self.initializer_range = initializer_range
191
  self.rms_norm_eps = rms_norm_eps
192
  self.use_cache = use_cache
 
193
  self.bos_token_id = bos_token_id
194
  self.eos_token_id = eos_token_id
195
+ self.pad_token_id = pad_token_id
196
  self.tie_word_embeddings = tie_word_embeddings
197
  self.num_attention_heads = num_attention_heads
198
+ self.num_key_value_heads = num_key_value_heads
199
  self.attention_dropout = attention_dropout
200
+ self.dynamic_mask_ratio = dynamic_mask_ratio
201
  self.is_moe = is_moe
202
+ self.num_cdmoe_experts = num_cdmoe_experts
203
+ self.num_cdmoe_heads = num_cdmoe_heads
204
+ self.num_cdmoe_experts_per_head = num_cdmoe_experts_per_head
205
  self.expert_retrieval_size = expert_retrieval_size
206
 
207
  # Validate the correctness of rotary position embeddings parameters
 
210
  self.rope_scaling["rope_type"] = self.rope_scaling["type"]
211
  rope_config_validation(self)
212
 
213
+ # for backward compatibility
214
+ if num_key_value_heads is None:
215
+ self.num_key_value_heads = num_attention_heads
216
+
217
  super().__init__(
 
218
  bos_token_id=bos_token_id,
219
  eos_token_id=eos_token_id,
220
+ pad_token_id=pad_token_id,
221
  tie_word_embeddings=tie_word_embeddings,
222
  **kwargs,
223
  )
224
+
225
+
226
+ __all__ = ["DogeConfig"]
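
The new `__init__` signature above adds `num_key_value_heads`, `dynamic_mask_ratio`, remapped token-id defaults, and a dict default for `rope_scaling`. A minimal construction sketch, assuming `configuration_doge.py` from this commit is importable locally; the values mirror config.json above:

from configuration_doge import DogeConfig

config = DogeConfig(
    hidden_size=512,
    intermediate_size=1024,
    num_hidden_layers=16,
    num_attention_heads=4,
    num_key_value_heads=2,  # GQA: 4 query heads share 2 key/value heads
    dynamic_mask_ratio=0.0,
    rope_scaling={
        "rope_type": "dynamic",
        "factor": 4.0,
        "original_max_position_embeddings": 2048,
    },
)
print(config.num_key_value_heads)  # 2; left as None it falls back to num_attention_heads
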
generation_config.json CHANGED
@@ -1,7 +1,7 @@
-{
-  "_from_model_config": true,
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "pad_token_id": 0,
-  "transformers_version": "4.46.1"
-}
+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 2,
+  "transformers_version": "4.49.0.dev0"
+}
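
The special-token ids are remapped in this commit (bos 1→0, eos 2→1, pad 0→2), and generation_config.json stays in sync with config.json. A small sketch of the equivalent in-code object:

from transformers import GenerationConfig

gen_config = GenerationConfig(bos_token_id=0, eos_token_id=1, pad_token_id=2)
print(gen_config.eos_token_id)  # 1 — generation now stops on the new eos id
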
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bb865063ca9536a49054948ea12f7d90519ee363ee98c5fff7bc2de6f82e0a86
-size 268580408
+oid sha256:2d30a2a446050f4e9c26bb833e260e5479937577b280c16d1e39f8ce4e66aba1
+size 218325576
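
model.safetensors is stored as a Git LFS pointer, so only the sha256 and size change here. An optional local sanity check, sketched with a placeholder path:

import hashlib
import os

path = "model.safetensors"  # placeholder: local path to the downloaded weight file

print(os.path.getsize(path))  # expected: 218325576 after this commit

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)
print(sha256.hexdigest())  # expected: 2d30a2a446050f4e9c26bb833e260e5479937577b280c16d1e39f8ce4e66aba1
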
modeling_doge.py CHANGED
@@ -19,7 +19,7 @@
19
  """PyTorch Doge model."""
20
 
21
  import math
22
- from typing import List, Optional, Tuple, Union
23
 
24
  import torch
25
  import torch.nn.functional as F
@@ -36,9 +36,12 @@ from transformers.modeling_outputs import (
36
  )
37
  from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
38
  from transformers.modeling_utils import PreTrainedModel
 
39
  from transformers.utils import (
 
40
  add_start_docstrings,
41
  add_start_docstrings_to_model_forward,
 
42
  logging,
43
  replace_return_docstrings,
44
  )
@@ -49,6 +52,9 @@ try:
49
  except ImportError:
50
  einx_add = None
51
 
 
 
 
52
 
53
  logger = logging.get_logger(__name__)
54
 
@@ -79,7 +85,7 @@ class Residual(nn.Module):
79
  def __init__(self, hidden_size):
80
  super().__init__()
81
  self.weight = nn.Parameter(torch.ones(hidden_size))
82
-
83
  def forward(self, residual_states, hidden_states):
84
  return self.weight * residual_states + hidden_states
85
 
@@ -92,10 +98,10 @@ class RotaryEmbedding(nn.Module):
92
  super().__init__()
93
  self.rope_kwargs = {}
94
 
95
- if config.rope_scaling is None:
96
- self.rope_type = "default"
97
  else:
98
- self.rope_type = config.rope_scaling
99
  self.max_seq_len_cached = config.max_position_embeddings
100
  self.original_max_seq_len = config.max_position_embeddings
101
  self.base = config.rope_theta
@@ -133,6 +139,7 @@ class RotaryEmbedding(nn.Module):
133
  # core RoPE block
134
  inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
135
  position_ids_expanded = position_ids[:, None, :].float()
 
136
  device_type = x.device.type
137
  device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
138
  with torch.autocast(device_type=device_type, enabled=False):
@@ -141,6 +148,7 @@ class RotaryEmbedding(nn.Module):
141
  cos = emb.cos()
142
  sin = emb.sin()
143
 
 
144
  cos = cos * self.attention_scaling
145
  sin = sin * self.attention_scaling
146
 
@@ -168,11 +176,10 @@ def apply_QK_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
168
  Deprecated and unused.
169
  unsqueeze_dim (`int`, *optional*, defaults to 1):
170
  The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
171
- sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
172
- that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
173
- k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
174
- cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
175
- the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
176
  Returns:
177
  `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
178
  """
@@ -183,82 +190,83 @@ def apply_QK_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
183
  return q_embed, k_embed
184
 
185
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  class DogeDynamicMaskAttention(nn.Module):
187
  """Dynamic Mask Attention from 'Wonderful Matrices' paper."""
188
 
189
  def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None):
190
  super().__init__()
191
-
192
  self.config = config
193
  self.layer_idx = layer_idx
194
- if layer_idx is None:
195
- logger.warning_once(
196
- f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
197
- "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
198
- "when creating this class."
199
- )
200
-
201
- self.hidden_dim = config.hidden_size
202
- self.num_attention_heads = config.num_attention_heads
203
  self.attention_dropout = config.attention_dropout
204
- self.attention_head_dim = self.hidden_dim // self.num_attention_heads
 
 
 
 
 
 
205
 
206
  # Q K V O projections
207
  self.q_proj = nn.Linear(
208
- self.hidden_dim,
209
- self.num_attention_heads * self.attention_head_dim,
210
- bias=config.hidden_bias,
211
  )
212
  self.k_proj = nn.Linear(
213
- self.hidden_dim,
214
- self.num_attention_heads * self.attention_head_dim,
215
- bias=config.hidden_bias,
 
 
 
 
 
216
  )
217
  # dynamic mask for the QK^T attention score matrix
218
  self.A = nn.Parameter(
219
- torch.ones(self.num_attention_heads)
220
  )
221
  self.dt_proj = nn.Linear(
222
- self.hidden_dim,
223
- self.num_attention_heads,
224
- bias=config.hidden_bias,
225
- )
226
- self.v_proj = nn.Linear(
227
- self.hidden_dim,
228
- self.num_attention_heads * self.attention_head_dim,
229
- bias=config.hidden_bias,
230
  )
231
  self.o_proj = nn.Linear(
232
- self.hidden_dim,
233
- self.hidden_dim,
234
- bias=config.hidden_bias,
235
  )
236
 
237
  def forward(
238
  self,
239
  hidden_states: torch.Tensor,
 
240
  attention_mask: Optional[torch.Tensor] = None,
241
- position_ids: Optional[torch.LongTensor] = None,
242
  past_key_value: Optional[Cache] = None,
243
  cache_position: Optional[torch.LongTensor] = None,
244
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
245
  **kwargs,
246
  ) -> Tuple[torch.Tensor, Optional[Cache]]:
247
- bsz, q_len, _ = hidden_states.shape
248
-
249
- query_states = self.q_proj(hidden_states)
250
- key_states = self.k_proj(hidden_states)
251
- value_states = self.v_proj(hidden_states)
252
 
253
- query_states = query_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(
254
- 1, 2
255
- )
256
- key_states = key_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(
257
- 1, 2
258
- )
259
- value_states = value_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(
260
- 1, 2
261
- )
262
 
263
  cos, sin = position_embeddings
264
  query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
@@ -268,90 +276,153 @@ class DogeDynamicMaskAttention(nn.Module):
268
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
269
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
270
 
271
- # compute attention scores matrix
272
- attn_weights = torch.matmul(query_states, key_states.transpose(-1, -2)) / math.sqrt(self.attention_head_dim)
273
-
274
- # add mask to attention scores
275
- if attention_mask is not None:
276
- dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
277
- dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
278
- dynamic_mask = dynamic_mask < 1.0
279
- causal_mask = attention_mask[:, :, :, : key_states.shape[-2]].masked_fill(dynamic_mask[:, :, None, :], torch.finfo(hidden_states.dtype).min)
280
- attn_weights = attn_weights + causal_mask
281
-
282
- # upcast attention scores to fp32
283
- attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
284
- attn_weights = F.dropout(attn_weights, p=self.attention_dropout, training=self.training)
285
 
286
- # apply attention scores to value states
287
- attn_output = torch.matmul(attn_weights, value_states)
 
 
 
 
 
 
 
 
 
 
 
288
 
289
- attn_output = attn_output.transpose(1, 2).contiguous()
290
- attn_output = attn_output.reshape(bsz, q_len, -1)
291
  attn_output = self.o_proj(attn_output)
 
292
 
293
- return attn_output, past_key_value
294
-
295
-
296
- class DogeSdpaDynamicMaskAttn(DogeDynamicMaskAttention):
297
-
298
- def forward(
299
  self,
300
  hidden_states: torch.Tensor,
 
 
301
  attention_mask: Optional[torch.Tensor] = None,
302
- position_ids: Optional[torch.LongTensor] = None,
303
- past_key_value: Optional[Cache] = None,
304
- cache_position: Optional[torch.LongTensor] = None,
305
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
306
- **kwargs,
307
- ) -> Tuple[torch.Tensor, Optional[Cache]]:
308
- bsz, q_len, _ = hidden_states.shape
309
-
310
- query_states = self.q_proj(hidden_states)
311
- key_states = self.k_proj(hidden_states)
312
- value_states = self.v_proj(hidden_states)
313
-
314
- query_states = query_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(1, 2)
315
- key_states = key_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(1, 2)
316
- value_states = value_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(1, 2)
317
 
318
- cos, sin = position_embeddings
319
- query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
 
321
- if past_key_value is not None:
322
- # sin and cos are specific to RoPE models; cache_position needed for the static cache
323
- cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
324
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
 
 
 
 
325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  if attention_mask is not None:
327
- dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
328
- dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
329
- dynamic_mask = dynamic_mask < 1.0
330
- causal_mask = attention_mask[:, :, :, : key_states.shape[-2]].masked_fill(dynamic_mask[:, :, None, :], torch.finfo(hidden_states.dtype).min)
331
 
332
- query_states = query_states.contiguous()
333
- key_states = key_states.contiguous()
334
- value_states = value_states.contiguous()
 
 
335
 
 
 
336
  attn_output = F.scaled_dot_product_attention(
337
- query_states,
338
- key_states,
339
- value_states,
340
  attn_mask=causal_mask,
341
- dropout_p=self.attention_dropout,
 
 
342
  )
343
-
344
  attn_output = attn_output.transpose(1, 2).contiguous()
345
- attn_output = attn_output.view(bsz, q_len, -1)
346
- attn_output = self.o_proj(attn_output)
347
-
348
- return attn_output, past_key_value
349
-
 
 
 
 
 
 
 
 
 
 
350
 
351
- DOGE_ATTENTION_CLASSES = {
352
- "eager": DogeDynamicMaskAttention,
353
- "sdpa": DogeSdpaDynamicMaskAttn,
354
- }
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
 
357
  class DogeMLP(nn.Module):
@@ -362,21 +433,9 @@ class DogeMLP(nn.Module):
362
  self.intermediate_dim = config.intermediate_size
363
  self.act_fn = ACT2FN[config.hidden_act]
364
 
365
- self.gate_proj = nn.Linear(
366
- self.hidden_dim,
367
- self.intermediate_dim,
368
- bias=config.hidden_bias,
369
- )
370
- self.up_proj = nn.Linear(
371
- self.hidden_dim,
372
- self.intermediate_dim,
373
- bias=config.hidden_bias,
374
- )
375
- self.down_proj = nn.Linear(
376
- self.intermediate_dim,
377
- self.hidden_dim,
378
- bias=config.hidden_bias,
379
- )
380
 
381
  def forward(
382
  self,
@@ -396,36 +455,18 @@ class DogeCDMoE(DogeMLP):
396
  self.act_fn = ACT2FN[config.hidden_act]
397
 
398
  self.expert_retrieval_dim = config.expert_retrieval_size
399
- self.num_cdmmoe_experts = config.num_cdmmoe_experts
400
- self.num_cdmmoe_heads = config.num_cdmmoe_heads
401
- self.num_cdmmoe_experts_per_head = config.num_cdmmoe_experts_per_head
402
- self.num_keys = int(math.sqrt(self.num_cdmmoe_experts))
403
 
404
  # queries and keys for retrieval experts
405
- self.queries = nn.Linear(
406
- self.hidden_dim,
407
- self.num_cdmmoe_heads * self.expert_retrieval_dim,
408
- bias=False,
409
- )
410
- self.keys = nn.Parameter(
411
- torch.zeros(
412
- self.num_cdmmoe_heads,
413
- self.num_keys,
414
- 2,
415
- self.expert_retrieval_dim // 2,
416
- )
417
- )
418
 
419
  # experts
420
- self.down_embed = nn.Embedding(
421
- self.num_cdmmoe_experts,
422
- self.hidden_dim,
423
- )
424
- self.up_embed = nn.Embedding(
425
- self.num_cdmmoe_experts,
426
- self.hidden_dim,
427
- )
428
-
429
 
430
  def forward(
431
  self,
@@ -436,11 +477,11 @@ class DogeCDMoE(DogeMLP):
436
 
437
  # get similarity with queries and keys
438
  queries = self.queries(hidden_states)
439
- queries = queries.view(bsz, seq_len, 2, self.num_cdmmoe_heads, -1).permute(2, 0, 1, 3, 4)
440
  sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, self.keys)
441
 
442
  # get experts with the highest similarity
443
- (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmmoe_experts_per_head, dim=-1)
444
  if einx_add is not None:
445
  all_scores = einx_add("... i, ... j -> ... (i j)", scores_x, scores_y)
446
  all_indices = einx_add("... i, ... j -> ... (i j)", indices_x * self.num_keys, indices_y)
@@ -449,7 +490,7 @@ class DogeCDMoE(DogeMLP):
449
  all_scores = all_scores.view(*scores_x.shape[:-1], -1)
450
  all_indices = (indices_x.unsqueeze(-1) * self.num_keys) + indices_y.unsqueeze(-2)
451
  all_indices = all_indices.view(*indices_x.shape[:-1], -1)
452
- scores, pk_indices = all_scores.topk(self.num_cdmmoe_experts_per_head, dim=-1)
453
  indices = all_indices.gather(-1, pk_indices)
454
  down_embed = self.down_embed(indices)
455
  up_embed = self.up_embed(indices)
@@ -468,13 +509,13 @@ class DogeDecoderLayer(nn.Module):
468
  super().__init__()
469
  self.hidden_dropout = config.hidden_dropout
470
 
471
- self.pre_sequence_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
472
- self.attn = DOGE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
473
- self.post_sequence_residual = Residual(config.hidden_size)
474
 
475
- self.pre_state_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
476
  self.feed_forward = DogeMLP(config) if config.is_moe == False else DogeCDMoE(config)
477
- self.post_state_residual = Residual(config.hidden_size)
478
 
479
  def forward(
480
  self,
@@ -485,36 +526,14 @@ class DogeDecoderLayer(nn.Module):
485
  output_attentions: Optional[bool] = False,
486
  use_cache: Optional[bool] = False,
487
  cache_position: Optional[torch.LongTensor] = None,
488
- position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
489
  **kwargs,
490
  ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
491
- """
492
- Args:
493
- hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
494
- attention_mask (`torch.FloatTensor`, *optional*):
495
- attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
496
- query_sequence_length, key_sequence_length)` if default attention is used.
497
- output_attentions (`bool`, *optional*):
498
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
499
- returned tensors for more detail.
500
- use_cache (`bool`, *optional*):
501
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
502
- (see `past_key_values`).
503
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
504
- cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
505
- Indices depicting the position of the input sequence tokens in the sequence
506
- position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
507
- Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
508
- with `head_dim` being the embedding dimension of each attention head.
509
- kwargs (`dict`, *optional*):
510
- Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
511
- into the model
512
- """
513
 
514
  # sequence transformation
515
  residual = hidden_states
516
- hidden_states = self.pre_sequence_layernorm(hidden_states)
517
- hidden_states, present_key_value = self.attn(
518
  hidden_states=hidden_states,
519
  attention_mask=attention_mask,
520
  position_ids=position_ids,
@@ -525,27 +544,41 @@ class DogeDecoderLayer(nn.Module):
525
  )
526
  self_attn_weights = None
527
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
528
- hidden_states = self.post_sequence_residual(residual, hidden_states)
529
 
530
  # state transformation
531
  residual = hidden_states
532
- hidden_states = self.pre_state_layernorm(hidden_states)
533
  hidden_states = self.feed_forward(hidden_states)
534
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
535
- hidden_states = self.post_state_residual(residual, hidden_states)
536
 
537
  outputs = (hidden_states,)
538
-
539
  if output_attentions:
540
  outputs += (self_attn_weights,)
541
 
542
- if use_cache:
543
- outputs += (present_key_value,)
544
-
545
  return outputs
546
 
547
 
548
- @add_start_docstrings("The bare Doge Model outputting raw hidden-states without any specific head on top.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
549
  class DogePreTrainedModel(PreTrainedModel):
550
  config_class = DogeConfig
551
  base_model_prefix = "model"
@@ -553,6 +586,7 @@ class DogePreTrainedModel(PreTrainedModel):
553
  _no_split_modules = ["DogeDecoderLayer"]
554
  _skip_keys_device_placement = ["past_key_values"]
555
  _supports_sdpa = True
 
556
  _supports_cache_class = True
557
  _supports_quantized_cache = True
558
  _supports_static_cache = True
@@ -644,8 +678,18 @@ DOGE_INPUTS_DOCSTRING = r"""
644
  """
645
 
646
 
647
- @add_start_docstrings("The bare Doge Model outputting raw hidden-states without any specific head on top.")
 
 
 
648
  class DogeModel(DogePreTrainedModel):
 
 
 
 
 
 
 
649
  def __init__(self, config: DogeConfig):
650
  super().__init__(config)
651
  self.config = config
@@ -682,6 +726,7 @@ class DogeModel(DogePreTrainedModel):
682
  output_hidden_states: Optional[bool] = None,
683
  return_dict: Optional[bool] = None,
684
  cache_position: Optional[torch.LongTensor] = None,
 
685
  ) -> Union[Tuple, BaseModelOutputWithPast]:
686
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
687
  output_hidden_states = (
@@ -702,33 +747,22 @@ class DogeModel(DogePreTrainedModel):
702
  if inputs_embeds is None:
703
  inputs_embeds = self.word_embed(input_ids)
704
 
705
- # kept for BC (non `Cache` `past_key_values` inputs)
706
- return_legacy_cache = False
707
- if use_cache and not isinstance(past_key_values, Cache):
708
- return_legacy_cache = True
709
- if past_key_values is None:
710
- past_key_values = DynamicCache()
711
- else:
712
- past_key_values = DynamicCache.from_legacy_cache(past_key_values)
713
- logger.warning_once(
714
- "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
715
- "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
716
- "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
717
- )
718
 
719
  if cache_position is None:
720
  past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
721
  cache_position = torch.arange(
722
- past_seen_tokens,
723
- past_seen_tokens + inputs_embeds.shape[1],
724
- device=inputs_embeds.device,
725
  )
 
726
  if position_ids is None:
727
  position_ids = cache_position.unsqueeze(0)
728
 
729
  causal_mask = self._update_causal_mask(
730
  attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
731
  )
 
732
  hidden_states = inputs_embeds
733
 
734
  # create position embeddings to be shared across the decoder layers
@@ -737,9 +771,8 @@ class DogeModel(DogePreTrainedModel):
737
  # decoder layers
738
  all_hidden_states = () if output_hidden_states else None
739
  all_self_attns = () if output_attentions else None
740
- next_decoder_cache = None
741
 
742
- for decoder_layer in self.layers:
743
  if output_hidden_states:
744
  all_hidden_states += (hidden_states,)
745
 
@@ -765,13 +798,11 @@ class DogeModel(DogePreTrainedModel):
765
  use_cache=use_cache,
766
  cache_position=cache_position,
767
  position_embeddings=position_embeddings,
 
768
  )
769
 
770
  hidden_states = layer_outputs[0]
771
 
772
- if use_cache:
773
- next_decoder_cache = layer_outputs[2 if output_attentions else 1]
774
-
775
  if output_attentions:
776
  all_self_attns += (layer_outputs[1],)
777
 
@@ -781,27 +812,21 @@ class DogeModel(DogePreTrainedModel):
781
  if output_hidden_states:
782
  all_hidden_states += (hidden_states,)
783
 
784
- next_cache = next_decoder_cache if use_cache else None
785
- if return_legacy_cache:
786
- next_cache = next_cache.to_legacy_cache()
787
-
788
- if not return_dict:
789
- return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
790
-
791
- return BaseModelOutputWithPast(
792
  last_hidden_state=hidden_states,
793
- past_key_values=next_cache,
794
  hidden_states=all_hidden_states,
795
  attentions=all_self_attns,
796
  )
 
797
 
798
  def _update_causal_mask(
799
  self,
800
- attention_mask: torch.Tensor = None,
801
- input_tensor: torch.Tensor = None,
802
- cache_position: torch.Tensor = None,
803
- past_key_values: Cache = None,
804
- output_attentions: bool = False,
805
  ):
806
  past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
807
  using_static_cache = isinstance(past_key_values, StaticCache)
@@ -888,8 +913,12 @@ class DogeModel(DogePreTrainedModel):
888
  return causal_mask
889
 
890
 
 
 
 
891
  class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
892
  _tied_weights_keys = ["lm_head.weight"]
 
893
 
894
  def __init__(self, config: DogeConfig):
895
  super().__init__(config)
@@ -912,13 +941,13 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
912
 
913
  def set_output_embeddings(self, new_embeddings):
914
  self.lm_head = new_embeddings
 
 
 
915
 
916
  def set_decoder(self, decoder):
917
  self.model = decoder
918
 
919
- def get_decoder(self):
920
- return self.model
921
-
922
  @add_start_docstrings_to_model_forward(DOGE_INPUTS_DOCSTRING)
923
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
924
  def forward(
@@ -926,7 +955,7 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
926
  input_ids: torch.LongTensor = None,
927
  attention_mask: Optional[torch.Tensor] = None,
928
  position_ids: Optional[torch.LongTensor] = None,
929
- past_key_values: Optional[torch.Tensor] = None,
930
  inputs_embeds: Optional[torch.FloatTensor] = None,
931
  labels: Optional[torch.LongTensor] = None,
932
  use_cache: Optional[bool] = None,
@@ -935,7 +964,7 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
935
  return_dict: Optional[bool] = None,
936
  cache_position: Optional[torch.LongTensor] = None,
937
  num_logits_to_keep: int = 0,
938
- **loss_kwargs,
939
  ) -> Union[Tuple, CausalLMOutputWithPast]:
940
  r"""
941
  Args:
@@ -950,7 +979,23 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
950
  token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
951
 
952
  Returns:
953
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
954
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
955
  output_hidden_states = (
956
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -969,6 +1014,7 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
969
  output_hidden_states=output_hidden_states,
970
  return_dict=return_dict,
971
  cache_position=cache_position,
 
972
  )
973
 
974
  hidden_states = outputs[0]
@@ -978,7 +1024,7 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
978
 
979
  loss = None
980
  if labels is not None:
981
- loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size, **loss_kwargs)
982
 
983
  if not return_dict:
984
  output = (logits,) + outputs[1:]
@@ -993,18 +1039,98 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
993
  )
994
 
995
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
996
  @add_start_docstrings(
997
  """
998
  The Doge Model transformer with a sequence classification head on top (linear layer).
999
 
1000
- [`DogeForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1001
- (e.g. GPT-2) do.
1002
 
1003
- Since it does classification on the last token, it requires to know the position of the last token. If a
1004
- `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1005
- no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1006
- padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1007
- each row of the batch).
1008
  """
1009
  )
1010
  class DogeForSequenceClassification(DogePreTrainedModel):
@@ -1041,9 +1167,9 @@ class DogeForSequenceClassification(DogePreTrainedModel):
1041
  ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1042
  r"""
1043
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1044
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1045
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1046
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1047
  """
1048
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1049
 
 
19
  """PyTorch Doge model."""
20
 
21
  import math
22
+ from typing import Callable, List, Optional, Tuple, Union
23
 
24
  import torch
25
  import torch.nn.functional as F
 
36
  )
37
  from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
38
  from transformers.modeling_utils import PreTrainedModel
39
+ from transformers.processing_utils import Unpack
40
  from transformers.utils import (
41
+ LossKwargs,
42
  add_start_docstrings,
43
  add_start_docstrings_to_model_forward,
44
+ is_torch_greater_or_equal,
45
  logging,
46
  replace_return_docstrings,
47
  )
 
52
  except ImportError:
53
  einx_add = None
54
 
55
+ if is_torch_greater_or_equal("2.5"):
56
+ from torch.nn.attention.flex_attention import flex_attention
57
+
58
 
59
  logger = logging.get_logger(__name__)
60
 
 
85
  def __init__(self, hidden_size):
86
  super().__init__()
87
  self.weight = nn.Parameter(torch.ones(hidden_size))
88
+
89
  def forward(self, residual_states, hidden_states):
90
  return self.weight * residual_states + hidden_states
91
 
 
98
  super().__init__()
99
  self.rope_kwargs = {}
100
 
101
+ if config.rope_scaling is not None:
102
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
103
  else:
104
+ self.rope_type = "default"
105
  self.max_seq_len_cached = config.max_position_embeddings
106
  self.original_max_seq_len = config.max_position_embeddings
107
  self.base = config.rope_theta
 
139
  # core RoPE block
140
  inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
141
  position_ids_expanded = position_ids[:, None, :].float()
142
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
143
  device_type = x.device.type
144
  device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
145
  with torch.autocast(device_type=device_type, enabled=False):
 
148
  cos = emb.cos()
149
  sin = emb.sin()
150
 
151
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
152
  cos = cos * self.attention_scaling
153
  sin = sin * self.attention_scaling
154
 
 
176
  Deprecated and unused.
177
  unsqueeze_dim (`int`, *optional*, defaults to 1):
178
  The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
179
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k.
180
+ For example, note that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim].
181
+ Then, if q and k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k.
182
+ Similarly, if q and k have the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
 
183
  Returns:
184
  `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
185
  """
 
190
  return q_embed, k_embed
191
 
192
 
193
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
194
+ """
195
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
196
+ The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
197
+ """
198
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
199
+ if n_rep == 1:
200
+ return hidden_states
201
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
202
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
203
+
204
+
205
  class DogeDynamicMaskAttention(nn.Module):
206
  """Dynamic Mask Attention from 'Wonderful Matrices' paper."""
207
 
208
  def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None):
209
  super().__init__()
 
210
  self.config = config
211
  self.layer_idx = layer_idx
212
+ self.head_dim = config.hidden_size // config.num_attention_heads
213
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
214
+ self.scaling = self.head_dim ** -0.5
 
 
 
 
 
 
215
  self.attention_dropout = config.attention_dropout
216
+ self.dynamic_mask_ratio = config.dynamic_mask_ratio
217
+
218
+ self.ALL_ATTENTION_FUNCTIONS = {
219
+ "eager": self.eager_attention_forward,
220
+ "sdpa": self.sdpa_attention_forward,
221
+ "flex_attention": self.flex_attention_forward,
222
+ }
223
 
224
  # Q K V O projections
225
  self.q_proj = nn.Linear(
226
+ config.hidden_size,
227
+ config.num_attention_heads * self.head_dim,
228
+ bias=config.hidden_bias
229
  )
230
  self.k_proj = nn.Linear(
231
+ config.hidden_size,
232
+ config.num_key_value_heads * self.head_dim,
233
+ bias=config.hidden_bias
234
+ )
235
+ self.v_proj = nn.Linear(
236
+ config.hidden_size,
237
+ config.num_key_value_heads * self.head_dim,
238
+ bias=config.hidden_bias
239
  )
240
  # dynamic mask for the QK^T attention score matrix
241
  self.A = nn.Parameter(
242
+ torch.ones(config.num_attention_heads)
243
  )
244
  self.dt_proj = nn.Linear(
245
+ config.num_key_value_heads * self.head_dim,
246
+ config.num_attention_heads,
247
+ bias=config.hidden_bias
 
 
 
 
 
248
  )
249
  self.o_proj = nn.Linear(
250
+ config.num_attention_heads * self.head_dim,
251
+ config.hidden_size,
252
+ bias=config.hidden_bias
253
  )
254
 
255
  def forward(
256
  self,
257
  hidden_states: torch.Tensor,
258
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
259
  attention_mask: Optional[torch.Tensor] = None,
 
260
  past_key_value: Optional[Cache] = None,
261
  cache_position: Optional[torch.LongTensor] = None,
 
262
  **kwargs,
263
  ) -> Tuple[torch.Tensor, Optional[Cache]]:
264
+ input_shape = hidden_states.shape[:-1]
265
+ hidden_shape = (*input_shape, -1, self.head_dim)
 
 
 
266
 
267
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
268
+ key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
269
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
 
 
 
 
 
 
270
 
271
  cos, sin = position_embeddings
272
  query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
 
276
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
277
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
278
 
279
+ # calculate dynamic mask from value_states
280
+ dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(value_states.shape[0], value_states.shape[-2], -1))
281
+ dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
282
+ attn_mask = self.prepare_dynamic_mask(
283
+ hidden_states=hidden_states,
284
+ dynamic_mask=dynamic_mask,
285
+ dynamic_mask_ratio=self.dynamic_mask_ratio,
286
+ attention_mask=attention_mask,
287
+ )
 
 
 
 
 
288
 
289
+ attention_interface: Callable = self.eager_attention_forward
290
+ if self.config._attn_implementation != "eager":
291
+ attention_interface = self.ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
292
+
293
+ attn_output = attention_interface(
294
+ query_states,
295
+ key_states,
296
+ value_states,
297
+ attention_mask=attn_mask,
298
+ dropout=0.0 if not self.training else self.attention_dropout,
299
+ scaling=self.scaling,
300
+ **kwargs,
301
+ )
302
 
303
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
 
304
  attn_output = self.o_proj(attn_output)
305
+ return attn_output
306
 
307
+ def prepare_dynamic_mask(
 
 
 
 
 
308
  self,
309
  hidden_states: torch.Tensor,
310
+ dynamic_mask: torch.Tensor,
311
+ dynamic_mask_ratio: float = 0.0,
312
  attention_mask: Optional[torch.Tensor] = None,
313
+ ):
314
+ """
315
+ Combine `dynamic_mask` with `attention_mask` to generate the final `attn_mask`.
 
 
 
 
 
 
 
 
 
 
 
 
316
 
317
+ Args:
318
+ hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
319
+ dynamic_mask (`torch.Tensor`): dynamic mask of shape `(batch_size, num_heads, key_sequence_length)`.
320
+ dynamic_mask_ratio (`float`, *optional*): Ratio from 0.0 to 1.0 used to control the proportion of the dynamic mask filled with the minimum value.
321
+ attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
322
+ """
323
+ min_type = torch.finfo(hidden_states.dtype).min
324
+ attn_mask = dynamic_mask[:, :, None, :]
325
+ if 0.0 < dynamic_mask_ratio < 1.0:
326
+ num_dynamic_mask = int(attn_mask.shape[-1] * dynamic_mask_ratio)
327
+ if num_dynamic_mask > 0:
328
+ rate_value = torch.kthvalue(attn_mask, num_dynamic_mask, dim=-1, keepdim=True).values
329
+ attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
330
+ if attention_mask is not None:
331
+ attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : hidden_states.shape[-2]] == min_type, min_type)
332
+ return attn_mask
333
+
334
+ def eager_attention_forward(
335
+ self,
336
+ query: torch.Tensor,
337
+ key: torch.Tensor,
338
+ value: torch.Tensor,
339
+ attention_mask: Optional[torch.Tensor],
340
+ scaling: float,
341
+ dropout: float = 0.0,
342
+ **kwargs,
343
+ ) -> torch.Tensor:
344
+ key_states = repeat_kv(key, self.num_key_value_groups)
345
+ value_states = repeat_kv(value, self.num_key_value_groups)
346
 
347
+ # compute attention scores matrix
348
+ attn_weights = torch.matmul(query, key_states.transpose(-1, -2)) * scaling
349
+ if attention_mask is not None:
350
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
351
+ attn_weights = attn_weights + causal_mask
352
+
353
+ # upcast attention scores to fp32
354
+ attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
355
+ attn_weights = F.dropout(attn_weights, p=dropout, training=self.training)
356
 
357
+ # apply attention scores to value states
358
+ attn_output = torch.matmul(attn_weights, value_states)
359
+ attn_output = attn_output.transpose(1, 2).contiguous()
360
+ return attn_output
361
+
362
+ def sdpa_attention_forward(
363
+ self,
364
+ query: torch.Tensor,
365
+ key: torch.Tensor,
366
+ value: torch.Tensor,
367
+ attention_mask: Optional[torch.Tensor],
368
+ scaling: float,
369
+ dropout: float = 0.0,
370
+ **kwargs,
371
+ ) -> torch.Tensor:
372
+ causal_mask = attention_mask
373
  if attention_mask is not None:
374
+ causal_mask = causal_mask[:, :, :, : key.shape[-2]]
 
 
 
375
 
376
+ # SDPA with memory-efficient backend is bugged with non-contiguous inputs and custom attn_mask for some torch versions
377
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
378
+ query = query.contiguous()
379
+ key = key.contiguous()
380
+ value = value.contiguous()
381
 
382
+ # NOTE: As of pytorch 2.5.1, cuDNN's SDPA backward pass is still incorrect, so we disable cuDNN SDPA (see https://github.com/pytorch/pytorch/issues/138581)
383
+ torch.backends.cuda.enable_cudnn_sdp(False)
384
  attn_output = F.scaled_dot_product_attention(
385
+ query,
386
+ key,
387
+ value,
388
  attn_mask=causal_mask,
389
+ dropout_p=dropout,
390
+ scale=scaling,
391
+ enable_gqa=True,
392
  )
 
393
  attn_output = attn_output.transpose(1, 2).contiguous()
394
+ return attn_output
395
+
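A small sketch of what `enable_gqa=True` buys here (toy sizes; the flag requires PyTorch 2.5+): key/value tensors with fewer heads than the query are broadcast inside SDPA, so the explicit `repeat_kv` of the eager path is not needed.

```python
import torch
import torch.nn.functional as F

batch, q_heads, kv_heads, seq_len, head_dim = 1, 4, 2, 6, 16
query = torch.randn(batch, q_heads, seq_len, head_dim)
key = torch.randn(batch, kv_heads, seq_len, head_dim)    # only 2 key/value heads
value = torch.randn(batch, kv_heads, seq_len, head_dim)

# each group of 2 query heads shares one key/value head
out = F.scaled_dot_product_attention(query, key, value, scale=head_dim ** -0.5, enable_gqa=True)
print(out.shape)  # torch.Size([1, 4, 6, 16])
```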
396
+ def flex_attention_forward(
397
+ self,
398
+ query: torch.Tensor,
399
+ key: torch.Tensor,
400
+ value: torch.Tensor,
401
+ attention_mask: Optional[torch.Tensor],
402
+ scaling: float,
403
+ dropout: float = 0.0,
404
+ **kwargs,
405
+ ) -> torch.Tensor:
406
+ causal_mask = attention_mask
407
+ if attention_mask is not None:
408
+ causal_mask = causal_mask[:, :, :, : key.shape[-2]]
409
 
410
+ # TODO: flex_attention: Captured buffers that require grad are not yet supported.
411
+ # NOTE: So we only use flex_attention in inference mode.
412
+ def mask_mod(score, batch, head, q_idx, kv_idx):
413
+ score = score + causal_mask[batch][head][q_idx][kv_idx]
414
+ return score
415
+
416
+ attn_output = flex_attention(
417
+ query,
418
+ key,
419
+ value,
420
+ score_mod=mask_mod,
421
+ scale=scaling,
422
+ enable_gqa=True,
423
+ )
424
+ attn_output = attn_output.transpose(1, 2).contiguous()
425
+ return attn_output
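A rough standalone sketch of the `score_mod` mechanism used above (toy shapes and a zero bias as a stand-in for the prepared mask; `flex_attention` requires PyTorch 2.5+ and, as noted, captured buffers that require grad are not yet supported, hence the `no_grad`).

```python
import torch
from torch.nn.attention.flex_attention import flex_attention

batch, heads, seq_len, head_dim = 1, 2, 8, 16
query = torch.randn(batch, heads, seq_len, head_dim)
key = torch.randn(batch, heads, seq_len, head_dim)
value = torch.randn(batch, heads, seq_len, head_dim)
bias = torch.zeros(batch, heads, seq_len, seq_len)  # stand-in for the prepared attn_mask

def score_mod(score, b, h, q_idx, kv_idx):
    # add the captured additive mask/bias to each raw attention score
    return score + bias[b][h][q_idx][kv_idx]

with torch.no_grad():
    out = flex_attention(query, key, value, score_mod=score_mod, scale=head_dim ** -0.5)
print(out.shape)  # torch.Size([1, 2, 8, 16])
```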
426
 
427
 
428
  class DogeMLP(nn.Module):
 
433
  self.intermediate_dim = config.intermediate_size
434
  self.act_fn = ACT2FN[config.hidden_act]
435
 
436
+ self.gate_proj = nn.Linear(self.hidden_dim, self.intermediate_dim, bias=config.hidden_bias)
437
+ self.up_proj = nn.Linear(self.hidden_dim, self.intermediate_dim, bias=config.hidden_bias)
438
+ self.down_proj = nn.Linear(self.intermediate_dim, self.hidden_dim, bias=config.hidden_bias)
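The forward pass is not part of this hunk, but these three projections follow the common gated-MLP (SwiGLU-style) convention; the sketch below is an assumption based on that convention, not a copy of the actual `DogeMLP.forward`.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

hidden_dim, intermediate_dim = 512, 1024   # example sizes only
gate_proj = nn.Linear(hidden_dim, intermediate_dim, bias=False)
up_proj = nn.Linear(hidden_dim, intermediate_dim, bias=False)
down_proj = nn.Linear(intermediate_dim, hidden_dim, bias=False)

x = torch.randn(2, 8, hidden_dim)
# gated MLP: the activated gate branch scales the up branch, then project back down
y = down_proj(F.silu(gate_proj(x)) * up_proj(x))
print(y.shape)  # torch.Size([2, 8, 512])
```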
 
 
 
 
 
 
 
 
 
 
 
 
439
 
440
  def forward(
441
  self,
 
455
  self.act_fn = ACT2FN[config.hidden_act]
456
 
457
  self.expert_retrieval_dim = config.expert_retrieval_size
458
+ self.num_cdmoe_experts = config.num_cdmoe_experts
459
+ self.num_cdmoe_heads = config.num_cdmoe_heads
460
+ self.num_cdmoe_experts_per_head = config.num_cdmoe_experts_per_head
461
+ self.num_keys = int(math.sqrt(self.num_cdmoe_experts))
462
 
463
  # queries and keys for retrieval experts
464
+ self.queries = nn.Linear(self.hidden_dim, self.num_cdmoe_heads * self.expert_retrieval_dim, bias=False)
465
+ self.keys = nn.Parameter(torch.zeros(self.num_cdmoe_heads, self.num_keys, 2, self.expert_retrieval_dim // 2))
466
 
467
  # experts
468
+ self.down_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
469
+ self.up_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
470
 
471
  def forward(
472
  self,
 
477
 
478
  # get similarity with queries and keys
479
  queries = self.queries(hidden_states)
480
+ queries = queries.view(bsz, seq_len, 2, self.num_cdmoe_heads, -1).permute(2, 0, 1, 3, 4)
481
  sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, self.keys)
482
 
483
  # get experts with the highest similarity
484
+ (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmoe_experts_per_head, dim=-1)
485
  if einx_add is not None:
486
  all_scores = einx_add("... i, ... j -> ... (i j)", scores_x, scores_y)
487
  all_indices = einx_add("... i, ... j -> ... (i j)", indices_x * self.num_keys, indices_y)
 
490
  all_scores = all_scores.view(*scores_x.shape[:-1], -1)
491
  all_indices = (indices_x.unsqueeze(-1) * self.num_keys) + indices_y.unsqueeze(-2)
492
  all_indices = all_indices.view(*indices_x.shape[:-1], -1)
493
+ scores, pk_indices = all_scores.topk(self.num_cdmoe_experts_per_head, dim=-1)
494
  indices = all_indices.gather(-1, pk_indices)
495
  down_embed = self.down_embed(indices)
496
  up_embed = self.up_embed(indices)
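A minimal 1-D illustration of the product-key trick used above (toy sizes, single head and position): two small top-k searches over `num_keys` sub-keys are combined into candidate indices over `num_keys**2` experts, followed by a second top-k.

```python
import torch

num_keys, experts_per_head = 4, 2
scores_x, indices_x = torch.randn(num_keys).topk(experts_per_head)  # first query half vs. first sub-keys
scores_y, indices_y = torch.randn(num_keys).topk(experts_per_head)  # second query half vs. second sub-keys

# Cartesian combination: every (i, j) pair addresses expert i * num_keys + j
all_scores = (scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)).view(-1)
all_indices = (indices_x.unsqueeze(-1) * num_keys + indices_y.unsqueeze(-2)).view(-1)

scores, pk_indices = all_scores.topk(experts_per_head)
expert_indices = all_indices.gather(-1, pk_indices)  # final expert ids in [0, num_keys**2)
print(expert_indices, scores)
```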
 
509
  super().__init__()
510
  self.hidden_dropout = config.hidden_dropout
511
 
512
+ self.pre_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
513
+ self.self_attn = DogeDynamicMaskAttention(config=config, layer_idx=layer_idx)
514
+ self.pre_residual = Residual(config.hidden_size)
515
 
516
+ self.post_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
517
  self.feed_forward = DogeMLP(config) if config.is_moe == False else DogeCDMoE(config)
518
+ self.post_residual = Residual(config.hidden_size)
519
 
520
  def forward(
521
  self,
 
526
  output_attentions: Optional[bool] = False,
527
  use_cache: Optional[bool] = False,
528
  cache_position: Optional[torch.LongTensor] = None,
529
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
530
  **kwargs,
531
  ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
532
 
533
  # sequence transformation
534
  residual = hidden_states
535
+ hidden_states = self.pre_layernorm(hidden_states)
536
+ hidden_states = self.self_attn(
537
  hidden_states=hidden_states,
538
  attention_mask=attention_mask,
539
  position_ids=position_ids,
 
544
  )
545
  self_attn_weights = None
546
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
547
+ hidden_states = self.pre_residual(residual, hidden_states)
548
 
549
  # state transformation
550
  residual = hidden_states
551
+ hidden_states = self.post_layernorm(hidden_states)
552
  hidden_states = self.feed_forward(hidden_states)
553
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
554
+ hidden_states = self.post_residual(residual, hidden_states)
555
 
556
  outputs = (hidden_states,)
 
557
  if output_attentions:
558
  outputs += (self_attn_weights,)
559
 
 
 
 
560
  return outputs
561
 
562
 
563
+ DOGE_START_DOCSTRING = r"""
564
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
565
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
566
+ etc.)
567
+
568
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
569
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
570
+ and behavior.
571
+
572
+ Parameters:
573
+ config ([`DogeConfig`]):
574
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
575
+ load the weights associated with the model, only the configuration. Check out the
576
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
577
+ """
578
+ @add_start_docstrings(
579
+ "The bare Doge Model outputting raw hidden-states without any specific head on top.",
580
+ DOGE_START_DOCSTRING,
581
+ )
582
  class DogePreTrainedModel(PreTrainedModel):
583
  config_class = DogeConfig
584
  base_model_prefix = "model"
 
586
  _no_split_modules = ["DogeDecoderLayer"]
587
  _skip_keys_device_placement = ["past_key_values"]
588
  _supports_sdpa = True
589
+ _supports_flex_attn = True
590
  _supports_cache_class = True
591
  _supports_quantized_cache = True
592
  _supports_static_cache = True
 
678
  """
679
 
680
 
681
+ @add_start_docstrings(
682
+ "The bare Doge Model outputting raw hidden-states without any specific head on top.",
683
+ DOGE_START_DOCSTRING,
684
+ )
685
  class DogeModel(DogePreTrainedModel):
686
+ """
687
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DogeDecoderLayer`].
688
+
689
+ Args:
690
+ config: DogeConfig
691
+ """
692
+
693
  def __init__(self, config: DogeConfig):
694
  super().__init__(config)
695
  self.config = config
 
726
  output_hidden_states: Optional[bool] = None,
727
  return_dict: Optional[bool] = None,
728
  cache_position: Optional[torch.LongTensor] = None,
729
+ **kwargs,
730
  ) -> Union[Tuple, BaseModelOutputWithPast]:
731
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
732
  output_hidden_states = (
 
747
  if inputs_embeds is None:
748
  inputs_embeds = self.word_embed(input_ids)
749
 
750
+ if use_cache and past_key_values is None:
751
+ past_key_values = DynamicCache()
752
 
753
  if cache_position is None:
754
  past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
755
  cache_position = torch.arange(
756
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
 
 
757
  )
758
+
759
  if position_ids is None:
760
  position_ids = cache_position.unsqueeze(0)
761
 
762
  causal_mask = self._update_causal_mask(
763
  attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
764
  )
765
+
766
  hidden_states = inputs_embeds
767
 
768
  # create position embeddings to be shared across the decoder layers
 
771
  # decoder layers
772
  all_hidden_states = () if output_hidden_states else None
773
  all_self_attns = () if output_attentions else None
 
774
 
775
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
776
  if output_hidden_states:
777
  all_hidden_states += (hidden_states,)
778
 
 
798
  use_cache=use_cache,
799
  cache_position=cache_position,
800
  position_embeddings=position_embeddings,
801
+ **kwargs,
802
  )
803
 
804
  hidden_states = layer_outputs[0]
805
 
 
 
 
806
  if output_attentions:
807
  all_self_attns += (layer_outputs[1],)
808
 
 
812
  if output_hidden_states:
813
  all_hidden_states += (hidden_states,)
814
 
815
+ output = BaseModelOutputWithPast(
816
  last_hidden_state=hidden_states,
817
+ past_key_values=past_key_values if use_cache else None,
818
  hidden_states=all_hidden_states,
819
  attentions=all_self_attns,
820
  )
821
+ return output if return_dict else output.to_tuple()
822
 
823
  def _update_causal_mask(
824
  self,
825
+ attention_mask: torch.Tensor,
826
+ input_tensor: torch.Tensor,
827
+ cache_position: torch.Tensor,
828
+ past_key_values: Cache,
829
+ output_attentions: bool,
830
  ):
831
  past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
832
  using_static_cache = isinstance(past_key_values, StaticCache)
 
913
  return causal_mask
914
 
915
 
916
+ class KwargsForCausalLM(LossKwargs): ...
917
+
918
+
919
  class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
920
  _tied_weights_keys = ["lm_head.weight"]
921
+ _tp_plan = {"lm_head": "colwise_rep"}
922
 
923
  def __init__(self, config: DogeConfig):
924
  super().__init__(config)
 
941
 
942
  def set_output_embeddings(self, new_embeddings):
943
  self.lm_head = new_embeddings
944
+
945
+ def get_decoder(self):
946
+ return self.model
947
 
948
  def set_decoder(self, decoder):
949
  self.model = decoder
950
 
 
 
 
951
  @add_start_docstrings_to_model_forward(DOGE_INPUTS_DOCSTRING)
952
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
953
  def forward(
 
955
  input_ids: torch.LongTensor = None,
956
  attention_mask: Optional[torch.Tensor] = None,
957
  position_ids: Optional[torch.LongTensor] = None,
958
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
959
  inputs_embeds: Optional[torch.FloatTensor] = None,
960
  labels: Optional[torch.LongTensor] = None,
961
  use_cache: Optional[bool] = None,
 
964
  return_dict: Optional[bool] = None,
965
  cache_position: Optional[torch.LongTensor] = None,
966
  num_logits_to_keep: int = 0,
967
+ **kwargs: Unpack[KwargsForCausalLM],
968
  ) -> Union[Tuple, CausalLMOutputWithPast]:
969
  r"""
970
  Args:
 
979
  token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
980
 
981
  Returns:
982
+
983
+ Example:
984
+
985
+ ```python
986
+ >>> from transformers import AutoTokenizer, AutoModelForCausalLM
987
+
988
+ >>> model = AutoModelForCausalLM.from_pretrained("JingzeShi/Doge-20M-Instruct")
989
+ >>> tokenizer = AutoTokenizer.from_pretrained("JingzeShi/Doge-20M-Instruct")
990
+
991
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
992
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
993
+
994
+ >>> # Generate
995
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
996
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
997
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
998
+ ```"""
999
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1000
  output_hidden_states = (
1001
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
1014
  output_hidden_states=output_hidden_states,
1015
  return_dict=return_dict,
1016
  cache_position=cache_position,
1017
+ **kwargs,
1018
  )
1019
 
1020
  hidden_states = outputs[0]
 
1024
 
1025
  loss = None
1026
  if labels is not None:
1027
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size, **kwargs)
1028
 
1029
  if not return_dict:
1030
  output = (logits,) + outputs[1:]
 
1039
  )
1040
 
1041
 
1042
+ class DogePatchEmbedding(nn.Module):
1043
+ """
1044
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial `hidden_states` of shape `(batch_size, seq_len, hidden_size)` to be consumed by a Transformer.
1045
+ """
1046
+
1047
+ def __init__(self, config: DogeConfig):
1048
+ super().__init__()
1049
+
1050
+ self.num_channels = config.num_channels
1051
+ self.patch_size = config.patch_size
1052
+ self.hidden_dim = config.hidden_size
1053
+
1054
+ self.sequence_proj = nn.Conv2d(self.num_channels, self.hidden_dim, kernel_size=self.patch_size, stride=self.patch_size)
1055
+ self.state_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=config.hidden_bias)
1056
+
1057
+ def forward(
1058
+ self,
1059
+ pixel_values: torch.Tensor,
1060
+ ) -> torch.Tensor:
1061
+ image_embedding = self.sequence_proj(pixel_values).flatten(2).transpose(1, 2)
1062
+ image_embedding = self.state_proj(image_embedding)
1063
+ return image_embedding
1064
+
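A quick shape walk-through of the patchification above (example values only: 3 channels, 16x16 patches, hidden size 512, a 224x224 input): the convolution produces one hidden vector per patch, which is then flattened into a token sequence.

```python
import torch
import torch.nn as nn

num_channels, patch_size, hidden_dim = 3, 16, 512        # example values
sequence_proj = nn.Conv2d(num_channels, hidden_dim, kernel_size=patch_size, stride=patch_size)
state_proj = nn.Linear(hidden_dim, hidden_dim)

pixel_values = torch.randn(1, num_channels, 224, 224)     # (batch, channels, height, width)
patches = sequence_proj(pixel_values)                      # (1, 512, 14, 14): one vector per 16x16 patch
image_embedding = state_proj(patches.flatten(2).transpose(1, 2))
print(image_embedding.shape)                               # torch.Size([1, 196, 512])
```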
1065
+
1066
+ class DogeForCausalVLM(DogeForCausalLM):
1067
+ _tied_weights_keys = ["lm_head.weight"]
1068
+
1069
+ def __init__(self, config: DogeConfig):
1070
+ super().__init__(config)
1071
+ self.config = config
1072
+ self.pixel_embed = DogePatchEmbedding(config)
1073
+
1074
+ # Initialize weights and apply final processing
1075
+ self.post_init()
1076
+
1077
+ def forward(
1078
+ self,
1079
+ input_ids: torch.LongTensor = None,
1080
+ pixel_values: torch.FloatTensor = None,
1081
+ attention_mask: Optional[torch.Tensor] = None,
1082
+ position_ids: Optional[torch.LongTensor] = None,
1083
+ past_key_values: Optional[torch.Tensor] = None,
1084
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1085
+ labels: Optional[torch.LongTensor] = None,
1086
+ use_cache: Optional[bool] = None,
1087
+ output_attentions: Optional[bool] = None,
1088
+ output_hidden_states: Optional[bool] = None,
1089
+ return_dict: Optional[bool] = None,
1090
+ cache_position: Optional[torch.LongTensor] = None,
1091
+ num_logits_to_keep: int = 0,
1092
+ **loss_kwargs,
1093
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1094
+ # TODO: @wubingheng111: refer to Llava for implementing the forward method
1095
+ ...
1096
+
1097
+ def prepare_inputs_for_generation(
1098
+ self,
1099
+ input_ids=None,
1100
+ pixel_values=None,
1101
+ past_key_values=None,
1102
+ input_embeds=None,
1103
+ attention_mask=None,
1104
+ cache_position=None,
1105
+ num_logits_to_keep=None,
1106
+ **kwargs,
1107
+ ):
1108
+ model_inputs = self.model.prepare_inputs_for_generation(
1109
+ input_ids,
1110
+ past_key_values=past_key_values,
1111
+ inputs_embeds=input_embeds,
1112
+ attention_mask=attention_mask,
1113
+ cache_position=cache_position,
1114
+ num_logits_to_keep=num_logits_to_keep,
1115
+ **kwargs,
1116
+ )
1117
+
1118
+ if cache_position[0] == 0:
1119
+ model_inputs["pixel_values"] = pixel_values
1120
+
1121
+ return model_inputs
1122
+
1123
+
1124
  @add_start_docstrings(
1125
  """
1126
  The Doge Model transformer with a sequence classification head on top (linear layer).
1127
 
1128
+ [`DogeForSequenceClassification`] uses the last token in order to do the classification, as other causal models (e.g. GPT-2) do.
 
1129
 
1130
+ Since it does classification on the last token, it needs to know the position of the last token.
1131
+ If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row.
1132
+ If no `pad_token_id` is defined, it simply takes the last value in each row of the batch.
1133
+ Since it cannot guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (takes the last value in each row of the batch).
 
1134
  """
1135
  )
1136
  class DogeForSequenceClassification(DogePreTrainedModel):
 
1167
  ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1168
  r"""
1169
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1170
+ Labels for computing the sequence classification/regression loss.
1171
+ Indices should be in `[0, ..., config.num_labels - 1]`.
1172
+ If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1173
  """
1174
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1175