ad
Browse files
ads
- tokenizeConfig.py +5 -1
tokenizeConfig.py
CHANGED
@@ -3,7 +3,10 @@ from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
|
|
3 |
import json
|
4 |
|
5 |
class OBITokenizer(PreTrainedTokenizer):
|
6 |
-
def __init__(self):
|
|
|
|
|
|
|
7 |
# Initialize a BPE model for tokenization
|
8 |
bpe_model = models.BPE()
|
9 |
# Initialize the tokenizer
|
@@ -11,6 +14,7 @@ class OBITokenizer(PreTrainedTokenizer):
|
|
11 |
# Add pre-tokenization and decoding steps if needed
|
12 |
self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
|
13 |
self.tokenizer.decoder = decoders.ByteLevel()
|
|
|
14 |
|
15 |
def train(self, files,save_path):
|
16 |
# Training: Fit the tokenizer on your text data
|
|
|
3 |
import json
|
4 |
|
5 |
class OBITokenizer(PreTrainedTokenizer):
|
6 |
+
def __init__(self, auto_map={"AutoTokenizer": ["tokenizeConfig.OBITokenizer"]},tokenizer_class= "OBITokenizer",**kwargs):
|
7 |
+
# Initialize your tokenizer with the auto_map parameter if needed
|
8 |
+
self.auto_map=auto_map
|
9 |
+
self.tokenizer_class=tokenizer_class
|
10 |
# Initialize a BPE model for tokenization
|
11 |
bpe_model = models.BPE()
|
12 |
# Initialize the tokenizer
|
|
|
14 |
# Add pre-tokenization and decoding steps if needed
|
15 |
self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
|
16 |
self.tokenizer.decoder = decoders.ByteLevel()
|
17 |
+
super().__init__(**kwargs)
|
18 |
|
19 |
def train(self, files,save_path):
|
20 |
# Training: Fit the tokenizer on your text data
|