adsa
tokenizeConfig.py  +20 -1

tokenizeConfig.py  CHANGED
@@ -1,6 +1,11 @@
 from transformers import PreTrainedTokenizer
 from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
 import json
+from typing import List, Optional, Union, Dict
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+from transformers import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+
 
 class OBITokenizer(PreTrainedTokenizer):
     def __init__(self, auto_map={"AutoTokenizer": ["tokenizeConfig.OBITokenizer"]},tokenizer_class= "OBITokenizer",**kwargs):
@@ -16,6 +21,20 @@ class OBITokenizer(PreTrainedTokenizer):
         self.tokenizer.decoder = decoders.ByteLevel()
         super().__init__(**kwargs)
 
+
+    def _pad(
+            self,
+            encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+            max_length: Optional[int] = None,
+            padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+            pad_to_multiple_of: Optional[int] = None,
+            return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        # Modify the _pad method as needed for OBITokenizer
+        # You can inherit the implementation from ChatGLMTokenizer and customize it further
+        return super()._pad(encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask)
+
+
     def train(self, files,save_path):
         # Training: Fit the tokenizer on your text data
         trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])
@@ -32,7 +51,7 @@ class OBITokenizer(PreTrainedTokenizer):
             "vocab_size": self.tokenizer.get_vocab_size(),
             "tokenizer_class": "OBITokenizer",
             "auto_map": {
-                "AutoTokenizer": ["tokenizeConfig.OBITokenizer"]
+                "AutoTokenizer": ["tokenizeConfig.OBITokenizer","null"]
             },
             "bos_token": "[CLS]",
             "eos_token": "[SEP]",
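
A rough usage sketch (not part of this commit): the "auto_map" entry written into the tokenizer config is what lets AutoTokenizer resolve the custom class once tokenizeConfig.py sits alongside the saved config in a model repo or local directory. The repo id below is a placeholder, not anything defined here.

    # Hypothetical example; "your-org/obi-tokenizer" is a placeholder repo id.
    from transformers import AutoTokenizer

    # trust_remote_code=True tells transformers to import the class named in
    # "auto_map" (tokenizeConfig.OBITokenizer) from the repo's own tokenizeConfig.py.
    tok = AutoTokenizer.from_pretrained("your-org/obi-tokenizer", trust_remote_code=True)

The new _pad override mirrors the keyword signature of the base class's _pad and currently just delegates to the parent implementation, leaving room for OBITokenizer-specific padding behavior later.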