aframson committed
Commit ac72af3 · 1 Parent(s): f712f8a
Files changed (1)
  tokenizeConfig.py  +20 -1
tokenizeConfig.py CHANGED

@@ -1,6 +1,11 @@
 from transformers import PreTrainedTokenizer
 from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
 import json
+from typing import List, Optional, Union, Dict
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+from transformers import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+
 
 class OBITokenizer(PreTrainedTokenizer):
     def __init__(self, auto_map={"AutoTokenizer": ["tokenizeConfig.OBITokenizer"]},tokenizer_class= "OBITokenizer",**kwargs):
@@ -16,6 +21,20 @@ class OBITokenizer(PreTrainedTokenizer):
         self.tokenizer.decoder = decoders.ByteLevel()
         super().__init__(**kwargs)
 
+
+    def _pad(
+        self,
+        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        # Modify the _pad method as needed for OBITokenizer
+        # You can inherit the implementation from ChatGLMTokenizer and customize it further
+        return super()._pad(encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask)
+
+
     def train(self, files,save_path):
         # Training: Fit the tokenizer on your text data
         trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])
@@ -32,7 +51,7 @@ class OBITokenizer(PreTrainedTokenizer):
             "vocab_size": self.tokenizer.get_vocab_size(),
             "tokenizer_class": "OBITokenizer",
             "auto_map": {
-                "AutoTokenizer": ["tokenizeConfig.OBITokenizer"]
+                "AutoTokenizer": ["tokenizeConfig.OBITokenizer","null"]
             },
             "bos_token": "[CLS]",
             "eos_token": "[SEP]",