aframson committed on
Commit
3eb9418
·
1 Parent(s): d09b77a
Files changed (1) hide show
  1. tokenizeConfig.py +5 -1
tokenizeConfig.py CHANGED
@@ -121,6 +121,10 @@ class OBITokenizer(PreTrainedTokenizer):
121
  )
122
 
123
  # Save the BPE vocab
124
- self.tokenizer.get_vocab().to_file(out_vocab_file)
 
 
 
 
125
 
126
  return (out_vocab_file,)
 
121
  )
122
 
123
  # Save the BPE vocab
124
+ # Training: Fit the tokenizer on your text data
125
+ trainer = trainers.BpeTrainer(special_tokens=["<unk>", "<s>", "</s>","[PAD]"])
126
+ self.tokenizer.train(trainer=trainer, files=[out_vocab_file])
127
+ # Save the trained tokenizer to a file
128
+ self.tokenizer.save(out_vocab_file)
129
 
130
  return (out_vocab_file,)