# RDPD-mini / tokenizeConfig.py
# (author: aframson, commit 88541c2)
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
import json
class OBITokenizer:
    """Byte-level BPE tokenizer wrapper around the `tokenizers` library.

    Provides training on raw text files, saving of the trained tokenizer,
    serialization of a Hugging Face-style ``tokenizer_config.json``, and
    simple encode/decode helpers.
    """

    def __init__(self):
        # Byte-level pre-tokenizer + decoder pair: input is split into raw
        # bytes before BPE and rejoined on decode, so any UTF-8 text can be
        # represented without an initial vocabulary.
        bpe_model = models.BPE()
        self.tokenizer = Tokenizer(bpe_model)
        self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        self.tokenizer.decoder = decoders.ByteLevel()

    def train(self, files, save_path):
        """Train the BPE model on text files and save the tokenizer.

        Args:
            files: list of paths to plain-text training files.
            save_path: destination path for the trained tokenizer JSON.
        """
        trainer = trainers.BpeTrainer(
            special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
        )
        self.tokenizer.train(trainer=trainer, files=files)
        self.tokenizer.save(save_path)

    def save_config(self, config_file):
        """Serialize a Hugging Face-style tokenizer config to JSON.

        Args:
            config_file: path of the JSON config file to write.
        """
        config_dict = {
            "tokenizer_type": "custom",
            "vocab_size": self.tokenizer.get_vocab_size(),
            "tokenizer_class": "OBITokenizer",
            # auto_map lets transformers' AutoTokenizer locate this class.
            "auto_map": {
                "AutoTokenizer": ["tokenizeConfig.OBITokenizer"]
            },
            "bos_token": "[CLS]",
            "eos_token": "[SEP]",
            "unk_token": "[UNK]",
            "pad_token": "[PAD]",
            "mask_token": "[MASK]"
        }
        # Fix: write with an explicit UTF-8 encoding — the platform-default
        # encoding (e.g. cp1252 on Windows) can corrupt non-ASCII tokens.
        # ensure_ascii=False keeps tokens human-readable; indent=2 matches
        # the conventional HF config format.
        with open(config_file, "w", encoding="utf-8") as f:
            json.dump(config_dict, f, ensure_ascii=False, indent=2)

    def encode(self, text):
        """Return the list of token ids for `text`."""
        encoding = self.tokenizer.encode(text)
        return encoding.ids

    def decode(self, ids):
        """Return the text decoded from a list of token ids."""
        return self.tokenizer.decode(ids)