.gitattributes CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- model.TGT filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
README.md CHANGED
@@ -60,82 +60,7 @@ Please refer to `Appendix D: Model Card` of the [preprint](https://arxiv.org/abs
60
 
61
  ### Usage Instructions
62
 
63
- Please refer to the [github repository](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_interface) for a detail description on how to use HF compatible IndicTrans2 models for inference.
64
-
65
- ```python
66
- import torch
67
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
68
- from IndicTransToolkit import IndicProcessor
69
- # recommended to run this on a gpu with flash_attn installed
70
- # don't set attn_implemetation if you don't have flash_attn
71
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
72
-
73
- src_lang, tgt_lang = "eng_Latn", "hin_Deva"
74
- model_name = "ai4bharat/indictrans2-en-indic-1B"
75
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
76
-
77
- model = AutoModelForSeq2SeqLM.from_pretrained(
78
- model_name,
79
- trust_remote_code=True,
80
- torch_dtype=torch.float16, # performance might slightly vary for bfloat16
81
- attn_implementation="flash_attention_2"
82
- ).to(DEVICE)
83
-
84
- ip = IndicProcessor(inference=True)
85
-
86
- input_sentences = [
87
- "When I was young, I used to go to the park every day.",
88
- "We watched a new movie last week, which was very inspiring.",
89
- "If you had met me at that time, we would have gone out to eat.",
90
- "My friend has invited me to his birthday party, and I will give him a gift.",
91
- ]
92
-
93
- batch = ip.preprocess_batch(
94
- input_sentences,
95
- src_lang=src_lang,
96
- tgt_lang=tgt_lang,
97
- )
98
-
99
- # Tokenize the sentences and generate input encodings
100
- inputs = tokenizer(
101
- batch,
102
- truncation=True,
103
- padding="longest",
104
- return_tensors="pt",
105
- return_attention_mask=True,
106
- ).to(DEVICE)
107
-
108
- # Generate translations using the model
109
- with torch.no_grad():
110
- generated_tokens = model.generate(
111
- **inputs,
112
- use_cache=True,
113
- min_length=0,
114
- max_length=256,
115
- num_beams=5,
116
- num_return_sequences=1,
117
- )
118
-
119
- # Decode the generated tokens into text
120
- with tokenizer.as_target_tokenizer():
121
- generated_tokens = tokenizer.batch_decode(
122
- generated_tokens.detach().cpu().tolist(),
123
- skip_special_tokens=True,
124
- clean_up_tokenization_spaces=True,
125
- )
126
-
127
- # Postprocess the translations, including entity replacement
128
- translations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
129
-
130
- for input_sentence, translation in zip(input_sentences, translations):
131
- print(f"{src_lang}: {input_sentence}")
132
- print(f"{tgt_lang}: {translation}")
133
- ```
134
-
135
- ### 📢 Long Context IT2 Models
136
- - New RoPE based IndicTrans2 models which are capable of handling sequence lengths **upto 2048 tokens** are available [here](https://huggingface.co/collections/prajdabre/indictrans2-rope-6742ddac669a05db0804db35)
137
- - These models can be used by just changing the `model_name` parameter. Please read the model card of the RoPE-IT2 models for more information about the generation.
138
- - It is recommended to run these models with `flash_attention_2` for efficient generation.
139
 
140
 
141
  ### Citation
 
60
 
61
  ### Usage Instructions
62
 
63
+ Please refer to the [GitHub repository](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_inference) for a detailed description of how to use HF-compatible IndicTrans2 models for inference.
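
For quick reference, below is a minimal inference sketch based on the example previously included in this README. It assumes `IndicTransToolkit` is installed and uses a CUDA device when available (falling back to CPU otherwise); see the linked repository for the authoritative, up-to-date version.

```python
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit import IndicProcessor

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
src_lang, tgt_lang = "eng_Latn", "hin_Deva"
model_name = "ai4bharat/indictrans2-en-indic-1B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16,  # results may vary slightly with bfloat16
).to(DEVICE)

ip = IndicProcessor(inference=True)
input_sentences = ["When I was young, I used to go to the park every day."]

# Add language tags and normalize the input before tokenization
batch = ip.preprocess_batch(input_sentences, src_lang=src_lang, tgt_lang=tgt_lang)
inputs = tokenizer(batch, truncation=True, padding="longest", return_tensors="pt").to(DEVICE)

# Beam-search generation, capped at 256 target tokens
with torch.no_grad():
    generated_tokens = model.generate(
        **inputs, use_cache=True, min_length=0, max_length=256, num_beams=5
    )

# Decode with the target-side vocabulary, then undo the preprocessing (entity placeholders, etc.)
with tokenizer.as_target_tokenizer():
    decoded = tokenizer.batch_decode(
        generated_tokens.detach().cpu().tolist(), skip_special_tokens=True
    )
translations = ip.postprocess_batch(decoded, lang=tgt_lang)

for sentence, translation in zip(input_sentences, translations):
    print(f"{src_lang}: {sentence}")
    print(f"{tgt_lang}: {translation}")
```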
 
64
 
65
 
66
  ### Citation
config.json CHANGED
@@ -9,7 +9,6 @@
9
  "AutoConfig": "configuration_indictrans.IndicTransConfig",
10
  "AutoModelForSeq2SeqLM": "modeling_indictrans.IndicTransForConditionalGeneration"
11
  },
12
- "tokenizer_class": "IndicTransTokenizer",
13
  "attention_dropout": 0.0,
14
  "bos_token_id": 0,
15
  "decoder_attention_heads": 16,
@@ -41,6 +40,5 @@
41
  "share_decoder_input_output_embed": false,
42
  "torch_dtype": "float32",
43
  "transformers_version": "4.32.1",
44
- "use_cache": true,
45
- "attn_implementation": "eager"
46
  }
 
9
  "AutoConfig": "configuration_indictrans.IndicTransConfig",
10
  "AutoModelForSeq2SeqLM": "modeling_indictrans.IndicTransForConditionalGeneration"
11
  },
 
12
  "attention_dropout": 0.0,
13
  "bos_token_id": 0,
14
  "decoder_attention_heads": 16,
 
40
  "share_decoder_input_output_embed": false,
41
  "torch_dtype": "float32",
42
  "transformers_version": "4.32.1",
43
+ "use_cache": true
 
44
  }
configuration_indictrans.py CHANGED
@@ -118,7 +118,6 @@ class IndicTransConfig(PretrainedConfig):
118
  pad_token_id=1,
119
  bos_token_id=0,
120
  eos_token_id=2,
121
- attn_implementation="eager",
122
  **kwargs,
123
  ):
124
  self.encoder_vocab_size = encoder_vocab_size
@@ -147,8 +146,7 @@ class IndicTransConfig(PretrainedConfig):
147
  self.num_hidden_layers = encoder_layers
148
  self.scale_embedding = scale_embedding
149
  self.share_decoder_input_output_embed = share_decoder_input_output_embed
150
- self.attn_implementation = attn_implementation
151
-
152
  super().__init__(
153
  pad_token_id=pad_token_id,
154
  bos_token_id=bos_token_id,
 
118
  pad_token_id=1,
119
  bos_token_id=0,
120
  eos_token_id=2,
 
121
  **kwargs,
122
  ):
123
  self.encoder_vocab_size = encoder_vocab_size
 
146
  self.num_hidden_layers = encoder_layers
147
  self.scale_embedding = scale_embedding
148
  self.share_decoder_input_output_embed = share_decoder_input_output_embed
149
+
 
150
  super().__init__(
151
  pad_token_id=pad_token_id,
152
  bos_token_id=bos_token_id,
dict.SRC.json DELETED
The diff for this file is too large to render. See raw diff
 
dict.TGT.json DELETED
The diff for this file is too large to render. See raw diff
 
model.SRC DELETED
Binary file (759 kB)
 
model.TGT DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ac9257c8e76b8b607705b959cc3d075656ea33032f7a974e467b8941df6e98d4
3
- size 3256903
 
model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:35d28fe035cd6ac026536b555558b07762425c8b930670219063e4fc3666c96d
3
- size 4462265272
 
modeling_indictrans.py CHANGED
@@ -23,57 +23,25 @@ import torch.nn as nn
23
  from torch.nn import functional as F
24
 
25
  from transformers.activations import ACT2FN
26
-
27
- from transformers.modeling_attn_mask_utils import (
28
- _prepare_4d_attention_mask,
29
- _prepare_4d_attention_mask_for_sdpa,
30
- _prepare_4d_causal_attention_mask,
31
- _prepare_4d_causal_attention_mask_for_sdpa,
32
- )
33
-
34
  from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
35
  from transformers.modeling_outputs import (
36
  BaseModelOutput,
37
  BaseModelOutputWithPastAndCrossAttentions,
38
  Seq2SeqLMOutput,
39
- Seq2SeqModelOutput
40
  )
41
 
42
- from transformers.utils import (
43
- logging,
44
- is_flash_attn_2_available,
45
- is_flash_attn_greater_or_equal_2_10,
46
- )
47
-
48
  from transformers.modeling_utils import PreTrainedModel
49
- from transformers.generation.utils import GenerationMixin
50
 
51
  from .configuration_indictrans import IndicTransConfig
52
 
53
 
54
  logger = logging.get_logger(__name__)
55
 
56
- INDICTRANS_PRETRAINED_MODEL_ARCHIVE_LIST = [""]
57
 
58
- try:
59
- if is_flash_attn_2_available():
60
- from flash_attn import flash_attn_func, flash_attn_varlen_func
61
- from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
62
- except:
63
- pass
64
-
65
-
66
- # Copied from transformers.models.llama.modeling_llama._get_unpad_data
67
- def _get_unpad_data(attention_mask):
68
- seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
69
- indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
70
- max_seqlen_in_batch = seqlens_in_batch.max().item()
71
- cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
72
- return (
73
- indices,
74
- cu_seqlens,
75
- max_seqlen_in_batch,
76
- )
77
 
78
 
79
  # Copied from transformers.models.bart.modeling_bart.shift_tokens_right
@@ -95,6 +63,54 @@ def shift_tokens_right(
95
  return shifted_input_ids
96
 
97
 
98
  def create_position_ids_from_input_ids(
99
  input_ids, padding_idx, past_key_values_length=0
100
  ):
@@ -231,15 +247,12 @@ class IndicTransAttention(nn.Module):
231
  dropout: float = 0.0,
232
  is_decoder: bool = False,
233
  bias: bool = True,
234
- is_causal: bool = False,
235
- config: Optional[IndicTransConfig] = None,
236
  ):
237
  super().__init__()
238
  self.embed_dim = embed_dim
239
  self.num_heads = num_heads
240
  self.dropout = dropout
241
  self.head_dim = embed_dim // num_heads
242
- self.config = config
243
 
244
  if (self.head_dim * num_heads) != self.embed_dim:
245
  raise ValueError(
@@ -248,7 +261,6 @@ class IndicTransAttention(nn.Module):
248
  )
249
  self.scaling = self.head_dim**-0.5
250
  self.is_decoder = is_decoder
251
- self.is_causal = is_causal
252
 
253
  self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
254
  self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
@@ -390,345 +402,17 @@ class IndicTransAttention(nn.Module):
390
  attn_output = self.out_proj(attn_output)
391
 
392
  return attn_output, attn_weights_reshaped, past_key_value
393
-
394
-
395
- class IndicTransFlashAttention2(IndicTransAttention):
396
- """
397
- IndicTrans flash attention module. This module inherits from `IndicTransAttention` as the weights of the module stays
398
- untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
399
- flash attention and deal with padding tokens in case the input contains any of them.
400
- """
401
-
402
- # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
403
- def __init__(self, *args, **kwargs):
404
- super().__init__(*args, **kwargs)
405
-
406
- # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
407
- # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
408
- # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
409
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
410
-
411
- def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
412
- return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
413
-
414
- def forward(
415
- self,
416
- hidden_states: torch.Tensor,
417
- key_value_states: Optional[torch.Tensor] = None,
418
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
419
- attention_mask: Optional[torch.Tensor] = None,
420
- layer_head_mask: Optional[torch.Tensor] = None,
421
- output_attentions: bool = False,
422
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
423
- # IndicTransFlashAttention2 attention does not support output_attentions
424
- if output_attentions:
425
- raise ValueError("IndicTransFlashAttention2 attention does not support output_attentions")
426
-
427
- # if key_value_states are provided this layer is used as a cross-attention layer
428
- # for the decoder
429
- is_cross_attention = key_value_states is not None
430
-
431
- bsz, q_len, _ = hidden_states.size()
432
-
433
- # get query proj
434
- query_states = self._reshape(self.q_proj(hidden_states), -1, bsz)
435
- # get key, value proj
436
- # `past_key_value[0].shape[2] == key_value_states.shape[1]`
437
- # is checking that the `sequence_length` of the `past_key_value` is the same as
438
- # the provided `key_value_states` to support prefix tuning
439
- if (
440
- is_cross_attention
441
- and past_key_value is not None
442
- and past_key_value[0].shape[2] == key_value_states.shape[1]
443
- ):
444
- # reuse k,v, cross_attentions
445
- key_states = past_key_value[0].transpose(1, 2)
446
- value_states = past_key_value[1].transpose(1, 2)
447
- elif is_cross_attention:
448
- # cross_attentions
449
- key_states = self._reshape(self.k_proj(key_value_states), -1, bsz)
450
- value_states = self._reshape(self.v_proj(key_value_states), -1, bsz)
451
- elif past_key_value is not None:
452
- # reuse k, v, self_attention
453
- key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
454
- value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
455
- key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
456
- value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
457
- else:
458
- # self_attention
459
- key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
460
- value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
461
-
462
- if self.is_decoder:
463
- # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
464
- # Further calls to cross_attention layer can then reuse all cross-attention
465
- # key/value_states (first "if" case)
466
- # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
467
- # all previous decoder key/value_states. Further calls to uni-directional self-attention
468
- # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
469
- # if encoder bi-directional self-attention `past_key_value` is always `None`
470
- past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
471
-
472
- kv_seq_len = key_states.shape[-2]
473
- if past_key_value is not None:
474
- kv_seq_len += past_key_value[0].shape[-2]
475
-
476
- # In PEFT, usually we cast the layer norms in float32 for training stability reasons
477
- # therefore the input hidden states gets silently casted in float32. Hence, we need
478
- # cast them back in the correct dtype just to be sure everything works as expected.
479
- # This might slowdown training & inference so it is recommended to not cast the LayerNorms
480
- # in fp32. (LlamaRMSNorm handles it correctly)
481
-
482
- input_dtype = query_states.dtype
483
- if input_dtype == torch.float32:
484
- if torch.is_autocast_enabled():
485
- target_dtype = torch.get_autocast_gpu_dtype()
486
- # Handle the case where the model is quantized
487
- elif hasattr(self.config, "_pre_quantization_dtype"):
488
- target_dtype = self.config._pre_quantization_dtype
489
- else:
490
- target_dtype = self.q_proj.weight.dtype
491
-
492
- logger.warning_once(
493
- f"The input hidden states seems to be silently casted in float32, this might be related to"
494
- f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
495
- f" {target_dtype}."
496
- )
497
-
498
- query_states = query_states.to(target_dtype)
499
- key_states = key_states.to(target_dtype)
500
- value_states = value_states.to(target_dtype)
501
-
502
- attn_output = self._flash_attention_forward(
503
- query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout
504
- )
505
-
506
- attn_output = attn_output.reshape(bsz, q_len, -1)
507
- attn_output = self.out_proj(attn_output)
508
-
509
- if not output_attentions:
510
- attn_weights = None
511
-
512
- return attn_output, attn_weights, past_key_value
513
-
514
- # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
515
- def _flash_attention_forward(
516
- self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
517
- ):
518
- """
519
- Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
520
- first unpad the input, then computes the attention scores and pad the final attention scores.
521
-
522
- Args:
523
- query_states (`torch.Tensor`):
524
- Input query states to be passed to Flash Attention API
525
- key_states (`torch.Tensor`):
526
- Input key states to be passed to Flash Attention API
527
- value_states (`torch.Tensor`):
528
- Input value states to be passed to Flash Attention API
529
- attention_mask (`torch.Tensor`):
530
- The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
531
- position of padding tokens and 1 for the position of non-padding tokens.
532
- dropout (`float`):
533
- Attention dropout
534
- softmax_scale (`float`, *optional*):
535
- The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
536
- """
537
- if not self._flash_attn_uses_top_left_mask:
538
- causal = self.is_causal
539
- else:
540
- # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
541
- causal = self.is_causal and query_length != 1
542
-
543
- # Contains at least one padding token in the sequence
544
- if attention_mask is not None:
545
- batch_size = query_states.shape[0]
546
- query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
547
- query_states, key_states, value_states, attention_mask, query_length
548
- )
549
-
550
- cu_seqlens_q, cu_seqlens_k = cu_seq_lens
551
- max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
552
-
553
- attn_output_unpad = flash_attn_varlen_func(
554
- query_states,
555
- key_states,
556
- value_states,
557
- cu_seqlens_q=cu_seqlens_q,
558
- cu_seqlens_k=cu_seqlens_k,
559
- max_seqlen_q=max_seqlen_in_batch_q,
560
- max_seqlen_k=max_seqlen_in_batch_k,
561
- dropout_p=dropout,
562
- softmax_scale=softmax_scale,
563
- causal=causal,
564
- )
565
-
566
- attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
567
- else:
568
- attn_output = flash_attn_func(
569
- query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
570
- )
571
-
572
- return attn_output
573
-
574
- # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
575
- def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
576
- indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
577
- batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
578
-
579
- key_layer = index_first_axis(
580
- key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
581
- )
582
- value_layer = index_first_axis(
583
- value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
584
- )
585
- if query_length == kv_seq_len:
586
- query_layer = index_first_axis(
587
- query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
588
- )
589
- cu_seqlens_q = cu_seqlens_k
590
- max_seqlen_in_batch_q = max_seqlen_in_batch_k
591
- indices_q = indices_k
592
- elif query_length == 1:
593
- max_seqlen_in_batch_q = 1
594
- cu_seqlens_q = torch.arange(
595
- batch_size + 1, dtype=torch.int32, device=query_layer.device
596
- ) # There is a memcpy here, that is very bad.
597
- indices_q = cu_seqlens_q[:-1]
598
- query_layer = query_layer.squeeze(1)
599
- else:
600
- # The -q_len: slice assumes left padding.
601
- attention_mask = attention_mask[:, -query_length:]
602
- query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
603
-
604
- return (
605
- query_layer,
606
- key_layer,
607
- value_layer,
608
- indices_q,
609
- (cu_seqlens_q, cu_seqlens_k),
610
- (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
611
- )
612
-
613
-
614
- class IndicTransSdpaAttention(IndicTransAttention):
615
- def forward(
616
- self,
617
- hidden_states: torch.Tensor,
618
- key_value_states: Optional[torch.Tensor] = None,
619
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
620
- attention_mask: Optional[torch.Tensor] = None,
621
- layer_head_mask: Optional[torch.Tensor] = None,
622
- output_attentions: bool = False,
623
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
624
- """Input shape: Batch x Time x Channel"""
625
- if output_attentions or layer_head_mask is not None:
626
- # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
627
- logger.warning_once(
628
- "IndicTransModel is using IndicTransSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
629
- ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
630
- )
631
- return super().forward(
632
- hidden_states,
633
- key_value_states=key_value_states,
634
- past_key_value=past_key_value,
635
- attention_mask=attention_mask,
636
- layer_head_mask=layer_head_mask,
637
- output_attentions=output_attentions,
638
- )
639
-
640
- # if key_value_states are provided this layer is used as a cross-attention layer
641
- # for the decoder
642
- is_cross_attention = key_value_states is not None
643
-
644
- bsz, tgt_len, _ = hidden_states.size()
645
-
646
- # get query proj
647
- query_states = self.q_proj(hidden_states)
648
- # get key, value proj
649
- # `past_key_value[0].shape[2] == key_value_states.shape[1]`
650
- # is checking that the `sequence_length` of the `past_key_value` is the same as
651
- # the provided `key_value_states` to support prefix tuning
652
- if (
653
- is_cross_attention
654
- and past_key_value is not None
655
- and past_key_value[0].shape[2] == key_value_states.shape[1]
656
- ):
657
- # reuse k,v, cross_attentions
658
- key_states = past_key_value[0]
659
- value_states = past_key_value[1]
660
- elif is_cross_attention:
661
- # cross_attentions
662
- key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
663
- value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
664
- elif past_key_value is not None:
665
- # reuse k, v, self_attention
666
- key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
667
- value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
668
- key_states = torch.cat([past_key_value[0], key_states], dim=2)
669
- value_states = torch.cat([past_key_value[1], value_states], dim=2)
670
- else:
671
- # self_attention
672
- key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
673
- value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
674
-
675
- if self.is_decoder:
676
- # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
677
- # Further calls to cross_attention layer can then reuse all cross-attention
678
- # key/value_states (first "if" case)
679
- # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
680
- # all previous decoder key/value_states. Further calls to uni-directional self-attention
681
- # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
682
- # if encoder bi-directional self-attention `past_key_value` is always `None`
683
- past_key_value = (key_states, value_states)
684
-
685
- query_states = self._shape(query_states, tgt_len, bsz)
686
-
687
- # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
688
- # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
689
- attn_output = F.scaled_dot_product_attention(
690
- query_states,
691
- key_states,
692
- value_states,
693
- attn_mask=attention_mask,
694
- dropout_p=self.dropout if self.training else 0.0,
695
- # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
696
- is_causal=self.is_causal and attention_mask is None and tgt_len > 1,
697
- )
698
-
699
- if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
700
- raise ValueError(
701
- f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
702
- f" {attn_output.size()}"
703
- )
704
-
705
- attn_output = attn_output.transpose(1, 2)
706
-
707
- # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
708
- # partitioned across GPUs when using tensor-parallelism.
709
- attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
710
-
711
- attn_output = self.out_proj(attn_output)
712
-
713
- return attn_output, None, past_key_value
714
 
715
 
716
- INDICTRANS_ATTENTION_CLASSES = {
717
- "eager": IndicTransAttention,
718
- "sdpa": IndicTransSdpaAttention,
719
- "flash_attention_2": IndicTransFlashAttention2,
720
- }
721
-
722
  # Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->IndicTrans
723
  class IndicTransEncoderLayer(nn.Module):
724
  def __init__(self, config: IndicTransConfig):
725
  super().__init__()
726
  self.embed_dim = config.encoder_embed_dim
727
- self.self_attn = INDICTRANS_ATTENTION_CLASSES[config._attn_implementation](
728
  embed_dim=self.embed_dim,
729
  num_heads=config.encoder_attention_heads,
730
  dropout=config.attention_dropout,
731
- config=config,
732
  )
733
  self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
734
  self.dropout = config.dropout
@@ -806,25 +490,22 @@ class IndicTransDecoderLayer(nn.Module):
806
  super().__init__()
807
  self.embed_dim = config.decoder_embed_dim
808
 
809
- self.self_attn = INDICTRANS_ATTENTION_CLASSES[config._attn_implementation](
810
  embed_dim=self.embed_dim,
811
  num_heads=config.decoder_attention_heads,
812
  dropout=config.attention_dropout,
813
  is_decoder=True,
814
- is_causal=True,
815
- config=config,
816
  )
817
  self.dropout = config.dropout
818
  self.activation_fn = ACT2FN[config.activation_function]
819
  self.activation_dropout = config.activation_dropout
820
 
821
  self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
822
- self.encoder_attn = INDICTRANS_ATTENTION_CLASSES[config._attn_implementation](
823
  self.embed_dim,
824
  config.decoder_attention_heads,
825
  dropout=config.attention_dropout,
826
  is_decoder=True,
827
- config=config,
828
  )
829
  self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
830
  self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
@@ -1012,9 +693,6 @@ class IndicTransEncoder(IndicTransPreTrainedModel):
1012
  nn.LayerNorm(embed_dim) if config.layernorm_embedding else None
1013
  )
1014
 
1015
- self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1016
- self._use_sdpa = config._attn_implementation == "sdpa"
1017
-
1018
  self.gradient_checkpointing = False
1019
  # Initialize weights and apply final processing
1020
  self.post_init()
@@ -1101,21 +779,13 @@ class IndicTransEncoder(IndicTransPreTrainedModel):
1101
 
1102
  hidden_states = inputs_embeds + embed_pos
1103
  if self.layernorm_embedding is not None:
1104
- hidden_states = self.layernorm_embedding(hidden_states)
1105
  hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
1106
 
 
1107
  if attention_mask is not None:
1108
- if self._use_flash_attention_2:
1109
- attention_mask = attention_mask if 0 in attention_mask else None
1110
- elif self._use_sdpa and head_mask is None and not output_attentions:
1111
- # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
1112
- # the manual implementation that requires a 4D causal mask in all cases.
1113
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1114
- attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
1115
- else:
1116
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1117
- attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
1118
-
1119
 
1120
  encoder_states = () if output_hidden_states else None
1121
  all_attentions = () if output_attentions else None
@@ -1239,9 +909,6 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
1239
  nn.LayerNorm(embed_dim) if config.layernorm_embedding else None
1240
  )
1241
 
1242
- self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1243
- self._use_sdpa = config._attn_implementation == "sdpa"
1244
-
1245
  self.gradient_checkpointing = False
1246
  # Initialize weights and apply final processing
1247
  self.post_init()
@@ -1364,43 +1031,29 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
1364
  if inputs_embeds is None:
1365
  inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
1366
 
1367
-
1368
- if self._use_flash_attention_2:
1369
- # 2d mask is passed through the layers
1370
- attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
1371
- elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
1372
- # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
1373
- # the manual implementation that requires a 4D causal mask in all cases.
1374
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
1375
- attention_mask,
1376
  input_shape,
1377
- inputs_embeds,
1378
- past_key_values_length,
 
1379
  )
1380
- else:
1381
- # 4d mask is passed through the layers
1382
- attention_mask = _prepare_4d_causal_attention_mask(
1383
- attention_mask, input_shape, inputs_embeds, past_key_values_length
 
1384
  )
1385
 
1386
  # expand encoder attention mask
1387
  if encoder_hidden_states is not None and encoder_attention_mask is not None:
1388
- if self._use_flash_attention_2:
1389
- encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
1390
- elif self._use_sdpa and cross_attn_head_mask is None and not output_attentions:
1391
- # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
1392
- # the manual implementation that requires a 4D causal mask in all cases.
1393
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1394
- encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
1395
- encoder_attention_mask,
1396
- inputs_embeds.dtype,
1397
- tgt_len=input_shape[-1],
1398
- )
1399
- else:
1400
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1401
- encoder_attention_mask = _prepare_4d_attention_mask(
1402
- encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
1403
- )
1404
 
1405
  # embed positions
1406
  positions = self.embed_positions(
@@ -1471,7 +1124,7 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
1471
  layer_outputs = torch.utils.checkpoint.checkpoint(
1472
  create_custom_forward(decoder_layer),
1473
  hidden_states,
1474
- attention_mask,
1475
  encoder_hidden_states,
1476
  encoder_attention_mask,
1477
  head_mask[idx] if head_mask is not None else None,
@@ -1483,7 +1136,7 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
1483
  else:
1484
  layer_outputs = decoder_layer(
1485
  hidden_states,
1486
- attention_mask=attention_mask,
1487
  encoder_hidden_states=encoder_hidden_states,
1488
  encoder_attention_mask=encoder_attention_mask,
1489
  layer_head_mask=(
@@ -1642,9 +1295,9 @@ class IndicTransModel(IndicTransPreTrainedModel):
1642
 
1643
 
1644
  # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100ForConditionalGeneration->IndicTrans
1645
- class IndicTransForConditionalGeneration(IndicTransPreTrainedModel, GenerationMixin):
1646
  base_model_prefix = "model"
1647
- _tied_weights_keys = ["decoder.embed_tokens.weight", "lm_head.weight"]
1648
  _label_smoothing = 0.0
1649
 
1650
  def __init__(self, config: IndicTransConfig):
@@ -1654,20 +1307,19 @@ class IndicTransForConditionalGeneration(IndicTransPreTrainedModel, GenerationMi
1654
  config.decoder_embed_dim, config.decoder_vocab_size, bias=False
1655
  )
1656
 
 
 
 
1657
  self.post_init()
1658
 
1659
  def tie_weights(self):
1660
- if self.config.share_decoder_input_output_embed:
1661
- self._tie_or_clone_weights(self.model.decoder.embed_tokens, self.lm_head)
1662
-
1663
  def get_encoder(self):
1664
- return self.model.encoder
1665
 
1666
  def get_decoder(self):
1667
- return self.model.decoder
1668
-
1669
- def get_input_embeddings(self):
1670
- return self.model.encoder.embed_tokens
1671
 
1672
  def get_output_embeddings(self):
1673
  return self.lm_head
@@ -1677,6 +1329,7 @@ class IndicTransForConditionalGeneration(IndicTransPreTrainedModel, GenerationMi
1677
 
1678
  def set_label_smoothing(self, label_smoothing):
1679
  self._label_smoothing = label_smoothing
 
1680
  def forward(
1681
  self,
1682
  input_ids: Optional[torch.LongTensor] = None,
@@ -1740,7 +1393,7 @@ class IndicTransForConditionalGeneration(IndicTransPreTrainedModel, GenerationMi
1740
  masked_lm_loss = F.cross_entropy(
1741
  input=lm_logits.view(-1, self.config.decoder_vocab_size),
1742
  target=labels.view(-1),
1743
- ignore_index=-100,
1744
  label_smoothing=self._label_smoothing,
1745
  )
1746
 
 
23
  from torch.nn import functional as F
24
 
25
  from transformers.activations import ACT2FN
 
26
  from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
27
  from transformers.modeling_outputs import (
28
  BaseModelOutput,
29
  BaseModelOutputWithPastAndCrossAttentions,
30
  Seq2SeqLMOutput,
31
+ Seq2SeqModelOutput,
32
  )
33
 
34
+ from transformers.utils import logging
 
 
 
 
 
35
  from transformers.modeling_utils import PreTrainedModel
 
36
 
37
  from .configuration_indictrans import IndicTransConfig
38
 
39
 
40
  logger = logging.get_logger(__name__)
41
 
42
+ _CONFIG_FOR_DOC = "IndicTransConfig"
43
 
44
+ INDICTRANS_PRETRAINED_MODEL_ARCHIVE_LIST = [""]
 
45
 
46
 
47
  # Copied from transformers.models.bart.modeling_bart.shift_tokens_right
 
63
  return shifted_input_ids
64
 
65
 
66
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
67
+ def _make_causal_mask(
68
+ input_ids_shape: torch.Size,
69
+ dtype: torch.dtype,
70
+ device: torch.device,
71
+ past_key_values_length: int = 0,
72
+ ):
73
+ """
74
+ Make causal mask used for bi-directional self-attention.
75
+ """
76
+ bsz, tgt_len = input_ids_shape
77
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
78
+ mask_cond = torch.arange(mask.size(-1), device=device)
79
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
80
+ mask = mask.to(dtype)
81
+
82
+ if past_key_values_length > 0:
83
+ mask = torch.cat(
84
+ [
85
+ torch.zeros(
86
+ tgt_len, past_key_values_length, dtype=dtype, device=device
87
+ ),
88
+ mask,
89
+ ],
90
+ dim=-1,
91
+ )
92
+ return mask[None, None, :, :].expand(
93
+ bsz, 1, tgt_len, tgt_len + past_key_values_length
94
+ )
95
+
96
+
97
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
98
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
99
+ """
100
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
101
+ """
102
+ bsz, src_len = mask.size()
103
+ tgt_len = tgt_len if tgt_len is not None else src_len
104
+
105
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
106
+
107
+ inverted_mask = 1.0 - expanded_mask
108
+
109
+ return inverted_mask.masked_fill(
110
+ inverted_mask.to(torch.bool), torch.finfo(dtype).min
111
+ )
112
+
113
+
114
  def create_position_ids_from_input_ids(
115
  input_ids, padding_idx, past_key_values_length=0
116
  ):
 
247
  dropout: float = 0.0,
248
  is_decoder: bool = False,
249
  bias: bool = True,
 
 
250
  ):
251
  super().__init__()
252
  self.embed_dim = embed_dim
253
  self.num_heads = num_heads
254
  self.dropout = dropout
255
  self.head_dim = embed_dim // num_heads
 
256
 
257
  if (self.head_dim * num_heads) != self.embed_dim:
258
  raise ValueError(
 
261
  )
262
  self.scaling = self.head_dim**-0.5
263
  self.is_decoder = is_decoder
 
264
 
265
  self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
266
  self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
 
402
  attn_output = self.out_proj(attn_output)
403
 
404
  return attn_output, attn_weights_reshaped, past_key_value
 
405
 
406
 
407
  # Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->IndicTrans
408
  class IndicTransEncoderLayer(nn.Module):
409
  def __init__(self, config: IndicTransConfig):
410
  super().__init__()
411
  self.embed_dim = config.encoder_embed_dim
412
+ self.self_attn = IndicTransAttention(
413
  embed_dim=self.embed_dim,
414
  num_heads=config.encoder_attention_heads,
415
  dropout=config.attention_dropout,
 
416
  )
417
  self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
418
  self.dropout = config.dropout
 
490
  super().__init__()
491
  self.embed_dim = config.decoder_embed_dim
492
 
493
+ self.self_attn = IndicTransAttention(
494
  embed_dim=self.embed_dim,
495
  num_heads=config.decoder_attention_heads,
496
  dropout=config.attention_dropout,
497
  is_decoder=True,
 
 
498
  )
499
  self.dropout = config.dropout
500
  self.activation_fn = ACT2FN[config.activation_function]
501
  self.activation_dropout = config.activation_dropout
502
 
503
  self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
504
+ self.encoder_attn = IndicTransAttention(
505
  self.embed_dim,
506
  config.decoder_attention_heads,
507
  dropout=config.attention_dropout,
508
  is_decoder=True,
 
509
  )
510
  self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
511
  self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
 
693
  nn.LayerNorm(embed_dim) if config.layernorm_embedding else None
694
  )
695
 
 
 
 
696
  self.gradient_checkpointing = False
697
  # Initialize weights and apply final processing
698
  self.post_init()
 
779
 
780
  hidden_states = inputs_embeds + embed_pos
781
  if self.layernorm_embedding is not None:
782
+ x = self.layernorm_embedding(hidden_states)
783
  hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
784
 
785
+ # expand attention_mask
786
  if attention_mask is not None:
787
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
788
+ attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
 
789
 
790
  encoder_states = () if output_hidden_states else None
791
  all_attentions = () if output_attentions else None
 
909
  nn.LayerNorm(embed_dim) if config.layernorm_embedding else None
910
  )
911
 
 
 
 
912
  self.gradient_checkpointing = False
913
  # Initialize weights and apply final processing
914
  self.post_init()
 
1031
  if inputs_embeds is None:
1032
  inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
1033
 
1034
+ # create causal mask
1035
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1036
+ combined_attention_mask = None
1037
+ if input_shape[-1] > 1:
1038
+ combined_attention_mask = _make_causal_mask(
 
 
 
 
1039
  input_shape,
1040
+ inputs_embeds.dtype,
1041
+ device=inputs_embeds.device,
1042
+ past_key_values_length=past_key_values_length,
1043
  )
1044
+
1045
+ if attention_mask is not None and combined_attention_mask is not None:
1046
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1047
+ combined_attention_mask = combined_attention_mask + _expand_mask(
1048
+ attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
1049
  )
1050
 
1051
  # expand encoder attention mask
1052
  if encoder_hidden_states is not None and encoder_attention_mask is not None:
1053
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1054
+ encoder_attention_mask = _expand_mask(
1055
+ encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
1056
+ )
 
1057
 
1058
  # embed positions
1059
  positions = self.embed_positions(
 
1124
  layer_outputs = torch.utils.checkpoint.checkpoint(
1125
  create_custom_forward(decoder_layer),
1126
  hidden_states,
1127
+ combined_attention_mask,
1128
  encoder_hidden_states,
1129
  encoder_attention_mask,
1130
  head_mask[idx] if head_mask is not None else None,
 
1136
  else:
1137
  layer_outputs = decoder_layer(
1138
  hidden_states,
1139
+ attention_mask=combined_attention_mask,
1140
  encoder_hidden_states=encoder_hidden_states,
1141
  encoder_attention_mask=encoder_attention_mask,
1142
  layer_head_mask=(
 
1295
 
1296
 
1297
  # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100ForConditionalGeneration->IndicTrans
1298
+ class IndicTransForConditionalGeneration(IndicTransPreTrainedModel):
1299
  base_model_prefix = "model"
1300
+ _tied_weights_keys = None
1301
  _label_smoothing = 0.0
1302
 
1303
  def __init__(self, config: IndicTransConfig):
 
1307
  config.decoder_embed_dim, config.decoder_vocab_size, bias=False
1308
  )
1309
 
1310
+ if config.share_decoder_input_output_embed:
1311
+ self.lm_head.weight = self.model.decoder.embed_tokens.weight
1312
+
1313
  self.post_init()
1314
 
1315
  def tie_weights(self):
1316
+ pass
1317
+
 
1318
  def get_encoder(self):
1319
+ return self.model.get_encoder()
1320
 
1321
  def get_decoder(self):
1322
+ return self.model.get_decoder()
 
 
 
1323
 
1324
  def get_output_embeddings(self):
1325
  return self.lm_head
 
1329
 
1330
  def set_label_smoothing(self, label_smoothing):
1331
  self._label_smoothing = label_smoothing
1332
+
1333
  def forward(
1334
  self,
1335
  input_ids: Optional[torch.LongTensor] = None,
 
1393
  masked_lm_loss = F.cross_entropy(
1394
  input=lm_logits.view(-1, self.config.decoder_vocab_size),
1395
  target=labels.view(-1),
1396
+ ignore_index=self.config.pad_token_id,
1397
  label_smoothing=self._label_smoothing,
1398
  )
1399
 
special_tokens_map.json DELETED
@@ -1,6 +0,0 @@
1
- {
2
- "bos_token": "<s>",
3
- "eos_token": "</s>",
4
- "pad_token": "<pad>",
5
- "unk_token": "<unk>"
6
- }
 
tokenization_indictrans.py DELETED
@@ -1,261 +0,0 @@
1
- import os
2
- import json
3
-
4
- from typing import Dict, List, Optional, Union, Tuple
5
-
6
- from transformers.utils import logging
7
- from sentencepiece import SentencePieceProcessor
8
- from transformers.tokenization_utils import PreTrainedTokenizer
9
-
10
-
11
- logger = logging.get_logger(__name__)
12
-
13
- SPIECE_UNDERLINE = "▁"
14
-
15
- SPECIAL_TAGS = {
16
- "_bt_",
17
- "_ft_",
18
- "asm_Beng",
19
- "awa_Deva",
20
- "ben_Beng",
21
- "bho_Deva",
22
- "brx_Deva",
23
- "doi_Deva",
24
- "eng_Latn",
25
- "gom_Deva",
26
- "gon_Deva",
27
- "guj_Gujr",
28
- "hin_Deva",
29
- "hne_Deva",
30
- "kan_Knda",
31
- "kas_Arab",
32
- "kas_Deva",
33
- "kha_Latn",
34
- "lus_Latn",
35
- "mag_Deva",
36
- "mai_Deva",
37
- "mal_Mlym",
38
- "mar_Deva",
39
- "mni_Beng",
40
- "mni_Mtei",
41
- "npi_Deva",
42
- "ory_Orya",
43
- "pan_Guru",
44
- "san_Deva",
45
- "sat_Olck",
46
- "snd_Arab",
47
- "snd_Deva",
48
- "tam_Taml",
49
- "tel_Telu",
50
- "urd_Arab",
51
- "unr_Deva",
52
- }
53
-
54
- VOCAB_FILES_NAMES = {
55
- "src_vocab_fp": "dict.SRC.json",
56
- "tgt_vocab_fp": "dict.TGT.json",
57
- "src_spm_fp": "model.SRC",
58
- "tgt_spm_fp": "model.TGT",
59
- }
60
-
61
-
62
- class IndicTransTokenizer(PreTrainedTokenizer):
63
- _added_tokens_encoder = {}
64
- _added_tokens_decoder = {}
65
-
66
- vocab_files_names = VOCAB_FILES_NAMES
67
- model_input_names = ["input_ids", "attention_mask"]
68
-
69
- def __init__(
70
- self,
71
- src_vocab_fp=None,
72
- tgt_vocab_fp=None,
73
- src_spm_fp=None,
74
- tgt_spm_fp=None,
75
- unk_token="<unk>",
76
- bos_token="<s>",
77
- eos_token="</s>",
78
- pad_token="<pad>",
79
- do_lower_case=False,
80
- **kwargs,
81
- ):
82
-
83
- self.src = True
84
-
85
- self.src_vocab_fp = src_vocab_fp
86
- self.tgt_vocab_fp = tgt_vocab_fp
87
- self.src_spm_fp = src_spm_fp
88
- self.tgt_spm_fp = tgt_spm_fp
89
-
90
- self.unk_token = unk_token.content
91
- self.pad_token = pad_token.content
92
- self.eos_token = eos_token.content
93
- self.bos_token = bos_token.content
94
-
95
- self.encoder = self._load_json(self.src_vocab_fp)
96
- if self.unk_token not in self.encoder:
97
- raise KeyError("<unk> token must be in vocab")
98
- assert self.pad_token in self.encoder
99
- self.encoder_rev = {v: k for k, v in self.encoder.items()}
100
-
101
- self.decoder = self._load_json(self.tgt_vocab_fp)
102
- if self.unk_token not in self.encoder:
103
- raise KeyError("<unk> token must be in vocab")
104
- assert self.pad_token in self.encoder
105
- self.decoder_rev = {v: k for k, v in self.decoder.items()}
106
-
107
- # load SentencePiece model for pre-processing
108
- self.src_spm = self._load_spm(self.src_spm_fp)
109
- self.tgt_spm = self._load_spm(self.tgt_spm_fp)
110
-
111
- self.current_spm = self.src_spm
112
- self.current_encoder = self.encoder
113
- self.current_encoder_rev = self.encoder_rev
114
-
115
- self.unk_token_id = self.encoder[self.unk_token]
116
- self.pad_token_id = self.encoder[self.pad_token]
117
- self.eos_token_id = self.encoder[self.eos_token]
118
- self.bos_token_id = self.encoder[self.bos_token]
119
-
120
- super().__init__(
121
- src_vocab_file=self.src_vocab_fp,
122
- tgt_vocab_file=self.src_vocab_fp,
123
- do_lower_case=do_lower_case,
124
- unk_token=unk_token,
125
- bos_token=bos_token,
126
- eos_token=eos_token,
127
- pad_token=pad_token,
128
- **kwargs,
129
- )
130
-
131
- def add_new_special_tags(self, new_tags: List[str]):
132
- SPECIAL_TAGS.update(new_tags)
133
-
134
- def _switch_to_input_mode(self):
135
- self.src = True
136
- self.padding_side = "left"
137
- self.current_spm = self.src_spm
138
- self.current_encoder = self.encoder
139
- self.current_encoder_rev = self.encoder_rev
140
-
141
- def _switch_to_target_mode(self):
142
- self.src = False
143
- self.padding_side = "right"
144
- self.current_spm = self.tgt_spm
145
- self.current_encoder = self.decoder
146
- self.current_encoder_rev = self.decoder_rev
147
-
148
- def _load_spm(self, path: str) -> SentencePieceProcessor:
149
- return SentencePieceProcessor(model_file=path)
150
-
151
- def _save_json(self, data, path: str) -> None:
152
- with open(path, "w", encoding="utf-8") as f:
153
- json.dump(data, f, indent=2)
154
-
155
- def _load_json(self, path: str) -> Union[Dict, List]:
156
- with open(path, "r", encoding="utf-8") as f:
157
- return json.load(f)
158
-
159
- def _split_tags(self, tokens: List[str]) -> Tuple[List[str], List[str]]:
160
- tags = [token for token in tokens if token in SPECIAL_TAGS]
161
- tokens = [token for token in tokens if token not in SPECIAL_TAGS]
162
- return tags, tokens
163
-
164
- def _split_pads(self, tokens: List[str]) -> Tuple[List[str], List[str]]:
165
- pads = [token for token in tokens if token == self.pad_token]
166
- tokens = [token for token in tokens if token != self.pad_token]
167
- return pads, tokens
168
-
169
- @property
170
- def src_vocab_size(self) -> int:
171
- return len(self.encoder)
172
-
173
- @property
174
- def tgt_vocab_size(self) -> int:
175
- return len(self.decoder)
176
-
177
- def get_src_vocab(self) -> Dict[str, int]:
178
- return dict(self.encoder, **self.added_tokens_encoder)
179
-
180
- def get_tgt_vocab(self) -> Dict[str, int]:
181
- return dict(self.decoder, **self.added_tokens_decoder)
182
-
183
- # hack override
184
- def get_vocab(self) -> Dict[str, int]:
185
- return self.get_src_vocab()
186
-
187
- # hack override
188
- @property
189
- def vocab_size(self) -> int:
190
- return self.src_vocab_size
191
-
192
- def _convert_token_to_id(self, token: str) -> int:
193
- """Converts an token (str) into an index (integer) using the source/target vocabulary map."""
194
- return self.current_encoder.get(token, self.current_encoder[self.unk_token])
195
-
196
- def _convert_id_to_token(self, index: int) -> str:
197
- """Converts an index (integer) into a token (str) using the source/target vocabulary map."""
198
- return self.current_encoder_rev.get(index, self.unk_token)
199
-
200
- def convert_tokens_to_string(self, tokens: List[str]) -> str:
201
- """Uses sentencepiece model for detokenization"""
202
- pads, tokens = self._split_pads(tokens)
203
-
204
- if self.src:
205
-
206
- tags, non_tags = self._split_tags(tokens)
207
-
208
- return (
209
- " ".join(pads)
210
- + " "
211
- + " ".join(tags)
212
- + " "
213
- + "".join(non_tags).replace(SPIECE_UNDERLINE, " ").strip()
214
- )
215
-
216
- return (
217
- "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
218
- + " "
219
- + " ".join(pads)
220
- )
221
-
222
- def _tokenize(self, text) -> List[str]:
223
- if self.src:
224
- tokens = text.split(" ")
225
- tags, non_tags = self._split_tags(tokens)
226
- text = " ".join(non_tags)
227
- tokens = self.current_spm.EncodeAsPieces(text)
228
- return tags + tokens
229
- else:
230
- return self.current_spm.EncodeAsPieces(text)
231
-
232
- def build_inputs_with_special_tokens(
233
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
234
- ) -> List[int]:
235
- if token_ids_1 is None:
236
- return token_ids_0 + [self.eos_token_id]
237
- # We don't expect to process pairs, but leave the pair logic for API consistency
238
- return token_ids_0 + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
239
-
240
- def save_vocabulary(
241
- self, save_directory: str, filename_prefix: Optional[str] = None
242
- ) -> Tuple[str]:
243
- if not os.path.isdir(save_directory):
244
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
245
- return
246
-
247
- src_spm_fp = os.path.join(save_directory, "model.SRC")
248
- tgt_spm_fp = os.path.join(save_directory, "model.TGT")
249
- src_vocab_fp = os.path.join(save_directory, "dict.SRC.json")
250
- tgt_vocab_fp = os.path.join(save_directory, "dict.TGT.json")
251
-
252
- self._save_json(self.encoder, src_vocab_fp)
253
- self._save_json(self.decoder, tgt_vocab_fp)
254
-
255
- with open(src_spm_fp, "wb") as f:
256
- f.write(self.src_spm.serialized_model_proto())
257
-
258
- with open(tgt_spm_fp, "wb") as f:
259
- f.write(self.tgt_spm.serialized_model_proto())
260
-
261
- return src_vocab_fp, tgt_vocab_fp, src_spm_fp, tgt_spm_fp
 
tokenizer_config.json DELETED
@@ -1,51 +0,0 @@
1
- {
2
- "added_tokens_decoder": {
3
- "0": {
4
- "content": "<s>",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false,
9
- "special": true
10
- },
11
- "1": {
12
- "content": "<pad>",
13
- "lstrip": false,
14
- "normalized": false,
15
- "rstrip": false,
16
- "single_word": false,
17
- "special": true
18
- },
19
- "2": {
20
- "content": "</s>",
21
- "lstrip": false,
22
- "normalized": false,
23
- "rstrip": false,
24
- "single_word": false,
25
- "special": true
26
- },
27
- "3": {
28
- "content": "<unk>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false,
33
- "special": true
34
- }
35
- },
36
- "bos_token": "<s>",
37
- "clean_up_tokenization_spaces": true,
38
- "do_lower_case": false,
39
- "eos_token": "</s>",
40
- "model_max_length": 256,
41
- "pad_token": "<pad>",
42
- "name_or_path": "ai4bharat/indictrans2-en-indic-1B",
43
- "tokenizer_class": "IndicTransTokenizer",
44
- "auto_map": {
45
- "AutoTokenizer": [
46
- "tokenization_indictrans.IndicTransTokenizer",
47
- null
48
- ]
49
- },
50
- "unk_token": "<unk>"
51
- }