quintic committed
Commit 072784c · 1 Parent(s): 55258da

initial commit

Files changed (7)
  1. .gitignore +0 -0
  2. README.md +0 -0
  3. __init__.py +0 -0
  4. config.json +22 -0
  5. configuration_aria.py +50 -0
  6. model.safetensors +3 -0
  7. modeling_aria.py +565 -0
.gitignore ADDED
File without changes
README.md ADDED
File without changes
__init__.py ADDED
File without changes
config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "architectures": [
+     "AriaForCausalLM"
+   ],
+   "bos_token_id": 0,
+   "eos_token_id": 1,
+   "hidden_size": 1536,
+   "intermediate_size": 6144,
+   "max_position_embeddings": 8192,
+   "model_type": "aria",
+   "num_attention_heads": 64,
+   "num_hidden_layers": 16,
+   "torch_dtype": "float16",
+   "transformers_version": "4.45.0",
+   "use_cache": true,
+   "vocab_size": 17731,
+   "auto_map": {
+     "AutoConfig": "configuration_aria.AriaConfig",
+     "AutoModel": "modeling_aria.AriaModel",
+     "AutoModelForCausalLM": "modeling_aria.AriaForCausalLM"
+   }
+ }
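
Because config.json declares an `auto_map`, the checkpoint is meant to be loaded through the Auto classes with `trust_remote_code=True`, which pulls `configuration_aria.py` and `modeling_aria.py` from the repo. A minimal loading sketch; the repository id below is a placeholder, not something this commit states:

from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "user/aria-checkpoint"  # hypothetical repo id; replace with the actual one

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)   # -> AriaConfig
model = AutoModelForCausalLM.from_pretrained(                          # -> AriaForCausalLM
    repo_id,
    trust_remote_code=True,
    torch_dtype="auto",  # picks up "torch_dtype": "float16" from config.json
)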
configuration_aria.py ADDED
@@ -0,0 +1,50 @@
+ from transformers import PretrainedConfig
+
+
+ class AriaConfig(PretrainedConfig):
+     model_type = "aria"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size: int = 17731,
+         hidden_size: int = 1536,
+         num_hidden_layers: int = 16,
+         num_attention_heads: int = 64,
+         intermediate_size: int = 6144,
+         max_position_embeddings: int = 8192,
+         use_cache: bool = True,
+         bos_token_id: int = 0,
+         eos_token_id: int = 1,
+         tie_word_embeddings: bool = False,
+         output_attentions: bool = False,
+         output_hidden_states: bool = False,
+         return_dict: bool = False,
+         initializer_range: float = 0.02,
+         **kwargs,
+     ):
+         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.intermediate_size = intermediate_size
+         self.max_position_embeddings = max_position_embeddings
+         self.use_cache = use_cache
+         self.tie_word_embeddings = tie_word_embeddings
+         self.output_attentions = output_attentions
+         self.output_hidden_states = output_hidden_states
+         self.return_dict = return_dict
+         # Used by AriaPreTrainedModel._init_weights in modeling_aria.py.
+         self.initializer_range = initializer_range
+
+         if self.intermediate_size % self.hidden_size != 0:
+             raise ValueError("The intermediate size needs to be divisible by the hidden size.")
+
+         if self.hidden_size % self.num_attention_heads != 0:
+             raise ValueError("The hidden size needs to be divisible by the number of attention heads.")
+
+     @property
+     def ff_mult(self):
+         return self.intermediate_size // self.hidden_size
+
+
+ __all__ = ["AriaConfig"]
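
A quick sanity-check sketch for the config class (run from a checkout of this repo so the import resolves); the numbers follow directly from the defaults above:

from configuration_aria import AriaConfig

cfg = AriaConfig()  # defaults mirror config.json
print(cfg.ff_mult)                                  # 6144 // 1536 == 4
print(cfg.hidden_size // cfg.num_attention_heads)   # per-head dim: 1536 // 64 == 24

# The constructor enforces the divisibility constraints:
try:
    AriaConfig(intermediate_size=6000)              # 6000 is not a multiple of 1536
except ValueError as err:
    print(err)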
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e592f31b380742f5426c0c80c8cac65efc97c6981f3b7b6b3eee193793d0116d
+ size 2634219792
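
The committed model.safetensors is only a Git LFS pointer; the actual weight file (about 2.6 GB, which for this architecture is consistent with roughly 0.66 B parameters stored in float32) lives in LFS. Once the real file has been fetched, for example with `git lfs pull`, a small sketch for peeking at it with the `safetensors` library:

from safetensors import safe_open

# Assumes the real weights have replaced the LFS pointer in the working tree.
with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    for name in list(f.keys())[:5]:
        print(name, f.get_tensor(name).shape)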
modeling_aria.py ADDED
@@ -0,0 +1,565 @@
+ # This is lightly adapted from https://github.com/EleutherAI/aria/blob/main/aria/model.py
+
+ from typing import Optional, Union, Tuple
+
+ import torch
+ import torch.utils.checkpoint
+
+ from torch import nn
+ from torch.nn import functional as F, CrossEntropyLoss
+
+ from transformers import Cache, DynamicCache, StaticCache
+ from transformers.utils import logging
+ from transformers.generation import GenerationMixin
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+
+ from .configuration_aria import AriaConfig
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class AriaPreTrainedModel(PreTrainedModel):
+     config_class = AriaConfig
+     base_model_prefix = "aria"
+     supports_gradient_checkpointing = True
+     _no_split_modules = ["AriaBlock"]
+     _skip_keys_device_placement = "past_key_values"
+     _supports_flash_attn_2 = False
+     _supports_cache_class = True
+     _supports_quantized_cache = True
+     _supports_static_cache = True
+     _supports_sdpa = True
+     _supports_flex_attn = False
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+             if module.padding_idx is not None:
+                 module.weight.data[module.padding_idx].zero_()
+         elif isinstance(module, nn.LayerNorm):
+             module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+
+
+ class AriaBlock(nn.Module):
+     def __init__(self, model_config: AriaConfig, layer_idx: int):
+         super().__init__()
+
+         self.drop_p = 0.0
+         self.n_heads = model_config.num_attention_heads
+         self.d_model = model_config.hidden_size
+         self.d_head = model_config.hidden_size // model_config.num_attention_heads
+         self.max_seq_len = model_config.max_position_embeddings
+         self.layer_idx = layer_idx
+
+         # Attention
+         self.mixed_qkv = nn.Linear(
+             in_features=self.d_model,
+             out_features=3 * self.d_model,
+             bias=False,
+         )
+         self.att_proj_linear = nn.Linear(
+             in_features=self.d_model,
+             out_features=self.d_model,
+             bias=False,
+         )
+
+         # FF layer (SwiGLU)
+         self.ff_gate_proj = nn.Linear(
+             in_features=self.d_model,
+             out_features=self.d_model * model_config.ff_mult,
+             bias=False,
+         )
+         self.ff_up_proj = nn.Linear(
+             in_features=self.d_model,
+             out_features=self.d_model * model_config.ff_mult,
+             bias=False,
+         )
+         self.ff_down_proj = nn.Linear(
+             in_features=self.d_model * model_config.ff_mult,
+             out_features=self.d_model,
+             bias=False,
+         )
+
+         # Pre-layer norms
+         self.norm1 = nn.LayerNorm(self.d_model)
+         self.norm2 = nn.LayerNorm(self.d_model)
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         attention_mask: torch.Tensor,
+         freqs_cis: torch.Tensor,
+         position_ids: Optional[torch.Tensor] = None,
+         past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.FloatTensor]]]] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         cache_position: Optional[torch.Tensor] = None,
+     ):
+         attn_output, attn_weights, present = self._att_block(
+             self.norm1(x),
+             attention_mask,
+             freqs_cis,
+             past_key_values=past_key_values,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             cache_position=cache_position,
+         )
+
+         x = x + attn_output
+         x = x + self._ff_block(self.norm2(x))
+
+         if use_cache:
+             outputs = (x, present, attn_weights)
+         else:
+             outputs = (x, attn_weights)
+
+         return outputs
+
+     def _att_block(
+         self,
+         x: torch.Tensor,
+         attention_mask: torch.Tensor,
+         freqs_cis: torch.Tensor,
+         past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.FloatTensor]]]] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         cache_position: Optional[torch.Tensor] = None,
+     ):
+         batch_size, seq_len, _ = x.shape
+         mixed_qkv = self.mixed_qkv(x)
+         xq, xk, xv = mixed_qkv.chunk(3, -1)
+
+         # Reshape for rotary embeddings.
+         # Need contiguous for q, k since in-place RoPE cannot be applied on a view.
+         xq = xq.reshape(batch_size, seq_len, self.n_heads, self.d_head).contiguous()
+         xk = xk.reshape(batch_size, seq_len, self.n_heads, self.d_head).contiguous()
+         xv = xv.view(batch_size, seq_len, self.n_heads, self.d_head)
+
+         # apply_rotary_emb expects: (b_sz, s_len, n_head, d_head)
+         xq = apply_rotary_emb(xq, freqs_cis)
+         xk = apply_rotary_emb(xk, freqs_cis)
+         xq, xk, xv = map(lambda t: t.transpose(1, 2), (xq, xk, xv))
+
+         if past_key_values is not None:
+             cache_kwargs = {"cache_position": cache_position}
+             xk, xv = past_key_values.update(xk, xv, self.layer_idx, cache_kwargs)
+
+         # scaled_dot_product_attention expects: (b_sz, n_head, s_len, d_head).
+         # `attn_mask` and `is_causal=True` are mutually exclusive in SDPA, so causality is
+         # only requested when no explicit mask was built (and not during single-token
+         # decoding, where the cached keys already cover all earlier positions).
+         att = F.scaled_dot_product_attention(
+             query=xq,
+             key=xk,
+             value=xv,
+             attn_mask=attention_mask,
+             is_causal=attention_mask is None and seq_len > 1,
+         )
+
+         # Reshape for out: (b_sz, s_len, n_head, d_head)
+         out = att.transpose(1, 2).contiguous()
+         out = out.view(batch_size, seq_len, self.n_heads * self.d_head)
+
+         # Note: SDPA does not expose attention probabilities, so `att` here is the
+         # attention output rather than the attention weights.
+         if not output_attentions:
+             att = None
+
+         return self.att_proj_linear(out), att, past_key_values
+
+     def _ff_block(self, x: torch.Tensor):
+         return self.ff_down_proj(F.silu(self.ff_gate_proj(x)) * self.ff_up_proj(x))
+
+
+ class AriaModel(AriaPreTrainedModel):
+     """Transformer decoder with no language model head.
+
+     Args:
+         model_config (AriaConfig): Model config settings.
+     """
+
+     def __init__(self, model_config: AriaConfig):
+         super().__init__(model_config)
+         self.model_config = model_config
+         self.freqs_cis = None
+
+         self.tok_embeddings = nn.Embedding(
+             num_embeddings=model_config.vocab_size,
+             embedding_dim=model_config.hidden_size,
+         )
+
+         self.out_layer_norm = nn.LayerNorm(model_config.hidden_size)
+         self.encode_layers = nn.ModuleList()
+         for i in range(model_config.num_hidden_layers):
+             self.encode_layers.append(AriaBlock(model_config, i))
+
+         self.gradient_checkpointing = False
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.FloatTensor]]]] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         cache_position: Optional[torch.Tensor] = None,
+     ):
+         """Forward pass of the Transformer decoder.
+
+         Args:
+             input_ids (Optional[torch.Tensor]): Token ids of shape (batch_size, seq_len).
+             attention_mask (Optional[torch.Tensor]): Attention mask of shape
+                 (batch_size, seq_len). Defaults to None.
+             past_key_values (Optional[Cache]): Key/value cache; the legacy tuple-of-tuples
+                 format is also accepted and converted.
+
+         Returns:
+             BaseModelOutputWithPast or tuple: Hidden states of shape (batch_size, seq_len,
+                 d_model), plus the cache / hidden states / attentions when requested.
+         """
+         output_attentions = output_attentions if output_attentions is not None else self.model_config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.model_config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.model_config.use_return_dict
+         use_cache = use_cache if use_cache is not None else self.model_config.use_cache
+
+         if (input_ids is None) ^ (inputs_embeds is not None):
+             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+         if self.gradient_checkpointing and self.training:
+             if use_cache:
+                 logger.warning_once(
+                     "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                 )
+                 use_cache = False
+
+         if inputs_embeds is None:
+             inputs_embeds = self.tok_embeddings(input_ids)
+
+         return_legacy_cache = False
+         if use_cache and not isinstance(past_key_values, Cache):
+             return_legacy_cache = True
+             if past_key_values is None:
+                 past_key_values = DynamicCache()
+             else:
+                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                 logger.warning_once(
+                     "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                     "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                     "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                 )
+
+         seq_length = inputs_embeds.shape[1]
+         if cache_position is None:
+             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+             cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)
+
+         if position_ids is None:
+             position_ids = cache_position.unsqueeze(0)
+         hidden_states = inputs_embeds
+
+         causal_mask = self._update_causal_mask(
+             attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+         )
+
+         if self.freqs_cis is None:
+             self.freqs_cis = precompute_freqs_cis(
+                 seq_len=self.model_config.max_position_embeddings,
+                 n_elem=self.model_config.hidden_size // self.model_config.num_attention_heads,
+                 base=500000,
+                 dtype=hidden_states.dtype,
+             ).to(inputs_embeds.device)
+         # Index by absolute position so rotary embeddings stay correct during cached decoding.
+         freqs_cis = self.freqs_cis[cache_position]
+
+         kwargs = {
+             "position_ids": position_ids,
+             "past_key_values": past_key_values,
+             "use_cache": use_cache,
+             "output_attentions": output_attentions,
+             "output_hidden_states": output_hidden_states,
+             "return_dict": return_dict,
+             "cache_position": cache_position,
+         }
+         next_decoder_cache = None
+         all_attentions = () if output_attentions else None
+         all_hidden_states = () if output_hidden_states else None
+         for layer in self.encode_layers:
+             if output_hidden_states:
+                 all_hidden_states = all_hidden_states + (hidden_states,)
+
+             if self.gradient_checkpointing and self.training:
+                 # Reentrant checkpointing only supports positional tensor arguments, so the
+                 # layer is wrapped to take (hidden_states, causal_mask, freqs_cis) and return
+                 # only the hidden states; use_cache is already forced off above.
+                 def create_custom_forward(module):
+                     def custom_forward(*args):
+                         return module(*args)[0]
+
+                     return custom_forward
+
+                 hidden_states = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(layer),
+                     hidden_states,
+                     causal_mask,
+                     freqs_cis,
+                     preserve_rng_state=True,
+                     use_reentrant=True,
+                 )
+             else:
+                 outputs = layer(hidden_states, causal_mask, freqs_cis=freqs_cis, **kwargs)
+                 hidden_states = outputs[0]
+                 if use_cache:
+                     next_decoder_cache = outputs[1]
+                 if output_attentions:
+                     all_attentions = all_attentions + (outputs[2 if use_cache else 1],)
+
+         if output_hidden_states:
+             all_hidden_states = all_hidden_states + (hidden_states,)
+
+         hidden_states = self.out_layer_norm(hidden_states)
+         next_cache = next_decoder_cache if use_cache else None
+
+         if return_legacy_cache:
+             next_cache = next_cache.to_legacy_cache()
+
+         if not return_dict:
+             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attentions] if v is not None)
+
+         return BaseModelOutputWithPast(
+             last_hidden_state=hidden_states,
+             past_key_values=next_cache,
+             hidden_states=all_hidden_states,
+             attentions=all_attentions,
+         )
+
+     def _update_causal_mask(
+         self,
+         attention_mask: torch.Tensor,
+         input_tensor: torch.Tensor,
+         cache_position: torch.Tensor,
+         past_key_values: Cache,
+         output_attentions: bool,
+     ):
+         if self.model_config._attn_implementation == "flash_attention_2":
+             if attention_mask is not None and (attention_mask == 0.0).any():
+                 return attention_mask
+             return None
+
+         # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+         # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+         # to infer the attention mask.
+         past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+         using_static_cache = isinstance(past_key_values, StaticCache)
+
+         # When output_attentions is True, the sdpa implementation's forward method calls the eager implementation's forward
+         if self.model_config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+             if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                 attention_mask,
+                 inputs_embeds=input_tensor,
+                 past_key_values_length=past_seen_tokens,
+                 is_training=self.training,
+             ):
+                 return None
+
+         dtype, device = input_tensor.dtype, input_tensor.device
+         sequence_length = input_tensor.shape[1]
+         if using_static_cache:
+             target_length = past_key_values.get_max_cache_shape()
+         else:
+             target_length = (
+                 attention_mask.shape[-1]
+                 if isinstance(attention_mask, torch.Tensor)
+                 else past_seen_tokens + sequence_length + 1
+             )
+
+         # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
+         causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+             attention_mask,
+             sequence_length=sequence_length,
+             target_length=target_length,
+             dtype=dtype,
+             device=device,
+             cache_position=cache_position,
+             batch_size=input_tensor.shape[0],
+         )
+
+         if (
+             self.model_config._attn_implementation == "sdpa"
+             and attention_mask is not None
+             and attention_mask.device.type == "cuda"
+             and not output_attentions
+         ):
+             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+             # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+             # Details: https://github.com/pytorch/pytorch/issues/110213
+             min_dtype = torch.finfo(dtype).min
+             causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+         return causal_mask
+
+     @staticmethod
+     # Copied from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position
+     def _prepare_4d_causal_attention_mask_with_cache_position(
+         attention_mask: torch.Tensor,
+         sequence_length: int,
+         target_length: int,
+         dtype: torch.dtype,
+         device: torch.device,
+         cache_position: torch.Tensor,
+         batch_size: int,
+         **kwargs,
+     ):
+         if attention_mask is not None and attention_mask.dim() == 4:
+             # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+             causal_mask = attention_mask
+         else:
+             min_dtype = torch.finfo(dtype).min
+             causal_mask = torch.full(
+                 (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+             )
+             if sequence_length != 1:
+                 causal_mask = torch.triu(causal_mask, diagonal=1)
+             causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+             causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+             if attention_mask is not None:
+                 causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                 mask_length = attention_mask.shape[-1]
+                 padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                 padding_mask = padding_mask == 0
+                 causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                     padding_mask, min_dtype
+                 )
+
+         return causal_mask
+
+
+ class AriaForCausalLM(AriaPreTrainedModel, GenerationMixin):
+     """Transformer decoder with a head for language modelling.
+
+     Args:
+         model_config (AriaConfig): Model config settings.
+     """
+
+     def __init__(self, model_config: AriaConfig):
+         super().__init__(model_config)
+         self.model_config = model_config
+         self.max_seq_len = model_config.max_position_embeddings
+         self.model = AriaModel(model_config)
+         self.lm_head = nn.Linear(
+             model_config.hidden_size, model_config.vocab_size, bias=False
+         )
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.Tensor] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+         past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.FloatTensor]]]] = None,
+         labels: Optional[torch.Tensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         cache_position: Optional[torch.Tensor] = None,
+     ):
+         """Forward pass of the Transformer decoder with LM head."""
+         return_dict = return_dict if return_dict is not None else self.model_config.use_return_dict
+         outputs = self.model(
+             input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             inputs_embeds=inputs_embeds,
+             past_key_values=past_key_values,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             cache_position=cache_position,
+         )
+         hidden = outputs[0]
+         lm_logits = self.lm_head(hidden)
+
+         lm_loss = None
+         if labels is not None:
+             # Move labels to the correct device to enable model parallelism
+             labels = labels.to(lm_logits.device)
+             # Next-token prediction: shift prediction scores and input ids by one
+             shift_logits = lm_logits[:, :-1, :].contiguous()
+             labels = labels[:, 1:].contiguous()
+             loss_fct = CrossEntropyLoss()
+             lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1))
+
+         if not return_dict:
+             output = (lm_logits,) + outputs[1:]
+             return ((lm_loss,) + output) if lm_loss is not None else output
+
+         return CausalLMOutputWithPast(
+             loss=lm_loss,
+             logits=lm_logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+
+ def precompute_freqs_cis(
+     seq_len: int,
+     n_elem: int,
+     base: int = 500000,
+     dtype: torch.dtype = torch.bfloat16,
+ ):
+     freqs = 1.0 / (
+         base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem)
+     )
+     t = torch.arange(seq_len, device=freqs.device)
+     freqs = torch.outer(t, freqs)
+     freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
+     cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
+
+     return cache.to(dtype=dtype)
+
+
+ @torch.jit.script
+ def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
+     """
+     In-place RoPE. Credits to Katherine Crowson.
+     x shape (b_sz, s_len, n_head, d_head).
+     freqs_cis shape (s_len, d_head // 2, 2), i.e. stacked (cos, sin).
+     """
+     d = x.shape[-1] // 2
+     cos = freqs_cis[..., 0][None, :, None]
+     sin = freqs_cis[..., 1][None, :, None]
+     x1, x2 = x[..., :d], x[..., d : d * 2]
+     tmp = x1.clone()
+     x1.mul_(cos).addcmul_(x2, sin, value=-1)
+     x2.mul_(cos).addcmul_(tmp, sin, value=1)
+     return x
+
+
+ __all__ = [
+     "AriaForCausalLM",
+     "AriaBlock",
+     "AriaModel",
+     "AriaPreTrainedModel",
+ ]
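
To see the pieces above working together, here is a small smoke-test sketch using a tiny, made-up configuration so it runs on CPU in seconds. The `aria` package path is an assumption about how the repo is checked out (it does ship an `__init__.py`); the released checkpoint itself would normally be loaded through `AutoModelForCausalLM` as shown under config.json. The sketch exercises the forward pass and incremental decoding with a `DynamicCache`:

import torch
from transformers import DynamicCache

# Hypothetical import path: assumes the repo was cloned into a directory named `aria`.
from aria.configuration_aria import AriaConfig
from aria.modeling_aria import AriaForCausalLM

# Tiny config for a quick test; the real checkpoint uses the values in config.json.
cfg = AriaConfig(
    vocab_size=128,
    hidden_size=64,
    num_hidden_layers=2,
    num_attention_heads=4,
    intermediate_size=256,
    max_position_embeddings=64,
    return_dict=True,
)
model = AriaForCausalLM(cfg).eval()

tokens = torch.randint(0, cfg.vocab_size, (1, 8))
past = DynamicCache()
with torch.no_grad():
    out = model(tokens, past_key_values=past, use_cache=True)
print(out.logits.shape)  # torch.Size([1, 8, 128])

# Greedy incremental decoding: feed only the newest token and reuse the KV cache.
for _ in range(4):
    next_id = out.logits[:, -1].argmax(dim=-1, keepdim=True)
    tokens = torch.cat([tokens, next_id], dim=-1)
    with torch.no_grad():
        out = model(next_id, past_key_values=out.past_key_values, use_cache=True)
print(tokens.shape)  # torch.Size([1, 12])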