from transformers import PreTrainedTokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
import json
from typing import Optional, Union, Dict
from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
from transformers.utils import PaddingStrategy


class OBITokenizer(PreTrainedTokenizer):
    def __init__(
        self,
        vocab_file=None,
        unk_token="[UNK]",
        bos_token="[CLS]",
        eos_token="[SEP]",
        pad_token="[PAD]",
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        auto_map={"AutoTokenizer": ["tokenizeConfig.OBITokenizer"]},
        tokenizer_class="OBITokenizer",
        **kwargs,
    ):
        # Initialize a byte-level BPE model as the backend tokenizer. This is set up
        # before calling super().__init__() so the parent class can query it if needed.
        self.tokenizer = Tokenizer(models.BPE())
        self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        self.tokenizer.decoder = decoders.ByteLevel()

        # Load a previously trained tokenizer from the vocabulary file, if one is
        # provided (expects the JSON file written by `train`)
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

        # Set the special tokens
        self.cls_token = "[CLS]"
        self.sep_token = "[SEP]"
        self.unk_token = "[UNK]"
        self.mask_token = "[MASK]"
        self.bos_token = "[CLS]"
        self.eos_token = "[SEP]"
        self.pad_token = "[PAD]"

        # Register the special tokens with the backend tokenizer so they are never split
        self.tokenizer.add_special_tokens(
            [self.pad_token, self.cls_token, self.sep_token, self.mask_token, self.unk_token]
        )

    @property
    def vocab_size(self) -> int:
        # Size of the backend tokenizer's vocabulary
        return self.tokenizer.get_vocab_size()

    def get_vocab(self) -> Dict[str, int]:
        # Token-to-id mapping of the backend tokenizer
        return self.tokenizer.get_vocab()

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        # Delegate to the default padding logic inherited from the base class;
        # override here if OBITokenizer ever needs custom padding behaviour
        return super()._pad(
            encoded_inputs,
            max_length=max_length,
            padding_strategy=padding_strategy,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

    def train(self, files, save_path):
        # Fit the BPE model on the given text files
        trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])
        self.tokenizer.train(files=files, trainer=trainer)
        # Save the trained tokenizer to a file
        self.tokenizer.save(save_path)

    def save_config(self, config_file):
        # Serialize the tokenizer's config to a JSON file
        config_dict = {
            "tokenizer_type": "custom",
            "vocab_size": self.tokenizer.get_vocab_size(),
            "tokenizer_class": "OBITokenizer",
            "auto_map": {
                # Slow tokenizer class and (absent) fast tokenizer class
                "AutoTokenizer": ["tokenizeConfig.OBITokenizer", None]
            },
            "bos_token": "[CLS]",
            "eos_token": "[SEP]",
            "unk_token": "[UNK]",
            "pad_token": "[PAD]",
            "mask_token": "[MASK]",
            # Add other custom settings if needed
        }
        with open(config_file, "w") as f:
            json.dump(config_dict, f)

    def encode(self, text):
        # Encode text to token ids using the backend tokenizer
        encoding = self.tokenizer.encode(text)
        return encoding.ids

    def decode(self, ids):
        # Decode token ids back to text using the backend tokenizer
        return self.tokenizer.decode(ids)
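

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): trains the tokenizer, saves it, reloads it,
# and round-trips a sample sentence. The file names "corpus.txt",
# "obi-tokenizer.json" and "tokenizer_config.json" are placeholder paths chosen
# for this example, not files defined elsewhere in the repo.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Build an untrained tokenizer, fit it on a plain-text corpus, and persist
    # both the trained tokenizer and its config
    tokenizer = OBITokenizer()
    tokenizer.train(files=["corpus.txt"], save_path="obi-tokenizer.json")
    tokenizer.save_config("tokenizer_config.json")

    # Reload the trained tokenizer and encode/decode a sample sentence
    tokenizer = OBITokenizer(vocab_file="obi-tokenizer.json")
    ids = tokenizer.encode("Hello, OBI tokenizer!")
    print(ids)
    print(tokenizer.decode(ids))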