ds
Browse files
- tokenizeConfig.py +5 -1
tokenizeConfig.py
CHANGED
@@ -121,6 +121,10 @@ class OBITokenizer(PreTrainedTokenizer):
|
|
121 |
)
|
122 |
|
123 |
# Save the BPE vocab
|
124 |
-
|
|
|
|
|
|
|
|
|
125 |
|
126 |
return (out_vocab_file,)
|
|
|
121 |
)
|
122 |
|
123 |
# Save the BPE vocab
|
124 |
+
# Training: Fit the tokenizer on your text data
|
125 |
+
trainer = trainers.BpeTrainer(special_tokens=["<unk>", "<s>", "</s>","[PAD]"])
|
126 |
+
self.tokenizer.train(trainer=trainer, files=[out_vocab_file])
|
127 |
+
# Save the trained tokenizer to a file
|
128 |
+
self.tokenizer.save(out_vocab_file)
|
129 |
|
130 |
return (out_vocab_file,)
|