from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
import json

class OBITokenizer:

    def __init__(self):
        # Wiring [UNK] into the model keeps it consistent with the special
        # tokens declared in train() and save_config() below.
        bpe_model = models.BPE(unk_token="[UNK]")
        self.tokenizer = Tokenizer(bpe_model)
        # Byte-level pre-tokenization covers every possible byte; the
        # matching decoder restores the original text on decode.
        self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        self.tokenizer.decoder = decoders.ByteLevel()

    def train(self, files, save_path):
        # Learn a BPE vocabulary from the given text files, reserving the
        # special tokens, then serialize the trained tokenizer to save_path.
        trainer = trainers.BpeTrainer(
            special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
        )
        self.tokenizer.train(files=files, trainer=trainer)
        self.tokenizer.save(save_path)

    def save_config(self, config_file):
        # Emit a Hugging Face-style tokenizer config; the auto_map entry
        # lets AutoTokenizer resolve this class from tokenizeConfig.py.
        # Note: vocab_size is only meaningful after train() has run.
        config_dict = {
            "tokenizer_type": "custom",
            "vocab_size": self.tokenizer.get_vocab_size(),
            "tokenizer_class": "OBITokenizer",
            "auto_map": {
                "AutoTokenizer": ["tokenizeConfig.OBITokenizer"]
            },
            "bos_token": "[CLS]",
            "eos_token": "[SEP]",
            "unk_token": "[UNK]",
            "pad_token": "[PAD]",
            "mask_token": "[MASK]"
        }
        with open(config_file, "w") as f:
            json.dump(config_dict, f)

    def encode(self, text):
        # Encode raw text and return only the token ids.
        encoding = self.tokenizer.encode(text)
        return encoding.ids

    def decode(self, ids):
        # Map token ids back to text through the byte-level decoder.
        return self.tokenizer.decode(ids)
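

# Minimal usage sketch. Assumptions: "corpus.txt" and the two output paths
# are hypothetical placeholders, and train() must run before encode()/decode()
# so that the BPE vocabulary exists.
if __name__ == "__main__":
    obi = OBITokenizer()
    obi.train(files=["corpus.txt"], save_path="tokenizer.json")
    obi.save_config("tokenizer_config.json")

    ids = obi.encode("Hello, world!")
    print(ids)              # token ids from the trained BPE vocabulary
    print(obi.decode(ids))  # decodes the ids back to text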