from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
import json


class OBITokenizer:
    def __init__(self):
        # Initialize a BPE model for tokenization
        bpe_model = models.BPE()
        # Initialize the tokenizer around the BPE model
        self.tokenizer = Tokenizer(bpe_model)
        # Add byte-level pre-tokenization and decoding steps
        self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        self.tokenizer.decoder = decoders.ByteLevel()

    def train(self, files, save_path):
        # Training: fit the tokenizer on the text data
        trainer = trainers.BpeTrainer(
            special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
        )
        self.tokenizer.train(files, trainer=trainer)
        # Save the trained tokenizer to a file
        self.tokenizer.save(save_path)

    def save_config(self, config_file):
        # Serialize the tokenizer's config to a JSON file
        config_dict = {
            "tokenizer_type": "custom",
            "vocab_size": self.tokenizer.get_vocab_size(),
            "tokenizer_class": "OBITokenizer",
            "auto_map": {
                "AutoTokenizer": ["tokenizeConfig.OBITokenizer"]
            },
            "bos_token": "[CLS]",
            "eos_token": "[SEP]",
            "unk_token": "[UNK]",
            "pad_token": "[PAD]",
            "mask_token": "[MASK]"
            # Add other custom settings if needed
        }
        with open(config_file, "w") as f:
            json.dump(config_dict, f)

    def encode(self, text):
        # Encode text using the custom tokenizer
        encoding = self.tokenizer.encode(text)
        return encoding.ids

    def decode(self, ids):
        # Decode IDs back to text using the custom tokenizer
        return self.tokenizer.decode(ids)
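
# A minimal usage sketch. The file names below ("corpus.txt",
# "obi-tokenizer.json", "tokenizer_config.json") are placeholder paths for
# illustration only and are not part of the class itself.
if __name__ == "__main__":
    tokenizer = OBITokenizer()
    # Train on a plain-text corpus and persist the tokenizer and its config
    tokenizer.train(files=["corpus.txt"], save_path="obi-tokenizer.json")
    tokenizer.save_config("tokenizer_config.json")
    # Round-trip a sample string through encode/decode
    ids = tokenizer.encode("Hello, world!")
    print(ids)
    print(tokenizer.decode(ids))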