aframson committed on
Commit 818bd96 · 1 Parent(s): 70fa35a
Files changed (1):
  1. tokenizeConfig.py +184 -59
tokenizeConfig.py CHANGED
@@ -1,90 +1,215 @@
- from transformers import PreTrainedTokenizer
- from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
- import json
- from typing import List, Optional, Union, Dict
- from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
- from transformers.utils import PaddingStrategy

  class OBITokenizer(PreTrainedTokenizer):

      def __init__(
          self,
          unk_token="<unk>",
          bos_token="<s>",
          eos_token="</s>",
-         pad_token=None,
          add_bos_token=True,
          add_eos_token=False,
          clean_up_tokenization_spaces=False,
-         auto_map={"AutoTokenizer": ["tokenizeConfig.OBITokenizer"]},
-         tokenizer_class="OBITokenizer",
          **kwargs,
      ):
          super().__init__(
-             unk_token=unk_token,
              bos_token=bos_token,
              eos_token=eos_token,
              pad_token=pad_token,
-             add_bos_token=add_bos_token,
-             add_eos_token=add_eos_token,
              clean_up_tokenization_spaces=clean_up_tokenization_spaces,
              **kwargs,
          )

-         # Initialize a BPE model for tokenization
-         bpe_model = models.BPE()
-         self.tokenizer = Tokenizer(bpe_model)

-         # Add pre-tokenization and decoding steps if needed
-         self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
-         self.tokenizer.decoder = decoders.ByteLevel()

-         # Set the padding token
-         self.pad_token = "[PAD]"

-         # Set the special tokens
-         self.cls_token = "[CLS]"
-         self.sep_token = "[SEP]"
-         self.unk_token = "[UNK]"
-         self.mask_token = "[MASK]"
-         self.bos_token = "[CLS]"
-         self.eos_token = "[SEP]"
-         self.pad_token = "[PAD]"

-         # Call the add_special_tokens method on the tokenizer object
-         self.tokenizer.add_special_tokens([self.cls_token, self.sep_token, self.unk_token, self.mask_token])

      def _tokenize(self, text):
-         # Implement your custom tokenization logic here
-         # In this example, we split the text into tokens using whitespace
-         return text.split()

      def _convert_token_to_id(self, token):
-         # Implement mapping from token to ID using your tokenizer's vocabulary
-         # Use the tokenizer's method to convert tokens to IDs
-         return self.tokenizer.encode(token).ids

      def _convert_id_to_token(self, index):
-         # Implement mapping from ID to token using your tokenizer's vocabulary
-         # Use the tokenizer's method to convert IDs to tokens
-         return self.tokenizer.decode([index])
-
-     def encode(self, text):
-         # Encode text using the custom tokenizer
-         input_ids = [self._convert_token_to_id(token) for token in self._tokenize(text)]
-         attention_mask = [1] * len(input_ids)
-
-         return {"input_ids": input_ids, "attention_mask": attention_mask}
-
-     def decode(self, ids):
-         # Decode IDs to text using the custom tokenizer
-         tokens = [self._convert_id_to_token(token_id) for token_id in ids]
-         return " ".join(tokens)
-
-     def get_vocab(self):
-         # Return the tokenizer's vocabulary
-         return self.tokenizer.get_vocab()

-     def save_vocabulary(self, vocab_path):
-         # Save the tokenizer's vocabulary to a file
-         with open(vocab_path, "w") as f:
-             json.dump(self.tokenizer.get_vocab(), f)

+ """Tokenization classes for IntermLM."""
+ import os
+ from shutil import copyfile
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import sentencepiece as spm
+
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {}
+

  class OBITokenizer(PreTrainedTokenizer):
+     """
+     Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
+     Args:
+         vocab_file (`str`):
+             Path to the vocabulary file.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     model_input_names = ["input_ids", "attention_mask"]
+     _auto_class = "AutoTokenizer"
+
      def __init__(
          self,
+         vocab_file,
          unk_token="<unk>",
          bos_token="<s>",
          eos_token="</s>",
+         pad_token="</s>",
+         sp_model_kwargs: Optional[Dict[str, Any]] = None,
          add_bos_token=True,
          add_eos_token=False,
+         decode_with_prefix_space=False,
          clean_up_tokenization_spaces=False,
          **kwargs,
      ):
+         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
          super().__init__(
              bos_token=bos_token,
              eos_token=eos_token,
+             unk_token=unk_token,
              pad_token=pad_token,
              clean_up_tokenization_spaces=clean_up_tokenization_spaces,
              **kwargs,
          )
+         self.vocab_file = vocab_file
+         self.add_bos_token = add_bos_token
+         self.add_eos_token = add_eos_token
+         self.decode_with_prefix_space = decode_with_prefix_space
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(vocab_file)
+         self._no_prefix_space_tokens = None

+         """ Initialisation"""

+     @property
+     def no_prefix_space_tokens(self):
+         if self._no_prefix_space_tokens is None:
+             vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+             self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
+         return self._no_prefix_space_tokens

+     @property
+     def vocab_size(self):
+         """Returns vocab size"""
+         return self.sp_model.get_piece_size()

+     @property
+     def bos_token_id(self) -> Optional[int]:
+         return self.sp_model.bos_id()

+     @property
+     def eos_token_id(self) -> Optional[int]:
+         return self.sp_model.eos_id()
+
+     def get_vocab(self):
+         """Returns vocab as a dict"""
+         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab

      def _tokenize(self, text):
+         """Returns a tokenized string."""
+         return self.sp_model.encode(text, out_type=str)

      def _convert_token_to_id(self, token):
+         """Converts a token (str) in an id using the vocab."""
+         return self.sp_model.piece_to_id(token)

      def _convert_id_to_token(self, index):
+         """Converts an index (integer) in a token (str) using the vocab."""
+         token = self.sp_model.IdToPiece(index)
+         return token
+
+     def _maybe_add_prefix_space(self, tokens, decoded):
+         if tokens and tokens[0] not in self.no_prefix_space_tokens:
+             return " " + decoded
+         else:
+             return decoded
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (string) in a single string."""
+         current_sub_tokens = []
+         out_string = ""
+         prev_is_special = False
+         for token in tokens:
+             # make sure that special tokens are not decoded using sentencepiece model
+             if token in self.all_special_tokens:
+                 if not prev_is_special:
+                     out_string += " "
+                 out_string += self.sp_model.decode(current_sub_tokens) + token
+                 prev_is_special = True
+                 current_sub_tokens = []
+             else:
+                 current_sub_tokens.append(token)
+                 prev_is_special = False
+         out_string += self.sp_model.decode(current_sub_tokens)
+         out_string = self.clean_up_tokenization(out_string)
+         out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
+         return out_string[1:]
+
+     def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         """
+         Save the vocabulary and special tokens file to a directory.
+         Args:
+             save_directory (`str`):
+                 The directory in which to save the vocabulary.
+         Returns:
+             `Tuple(str)`: Paths to the files saved.
+         """
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+         out_vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+         elif not os.path.isfile(self.vocab_file):
+             with open(out_vocab_file, "wb") as fi:
+                 content_spiece_model = self.sp_model.serialized_model_proto()
+                 fi.write(content_spiece_model)
+
+         return (out_vocab_file,)
+
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         if self.add_bos_token:
+             bos_token_ids = [self.bos_token_id]
+         else:
+             bos_token_ids = []
+
+         output = bos_token_ids + token_ids_0
+
+         if token_ids_1 is not None:
+             output = output + token_ids_1
+
+         if self.add_eos_token:
+             output = output + [self.eos_token_id]
+
+         return output
+
+     def get_special_tokens_mask(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+     ) -> List[int]:
+         """
+         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer `prepare_for_model` method.
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+             )
+
+         if token_ids_1 is None:
+             return [1] + ([0] * len(token_ids_0)) + [1]
+         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
+         use of token type ids, therefore a list of zeros is returned.
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+         Returns:
+             `List[int]`: List of zeros.
+         """
+         eos = [self.eos_token_id]

+         if token_ids_1 is None:
+             return len(token_ids_0 + eos) * [0]
+         return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
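
For reference, a minimal usage sketch of the new SentencePiece-backed OBITokenizer (illustrative only, not part of the commit). It assumes a trained SentencePiece model is available as ./tokenizer.model next to tokenizeConfig.py, and a transformers release whose slow-tokenizer base class tolerates the SentencePiece model being loaded after super().__init__(); the repository id in the commented-out AutoTokenizer lines is hypothetical.

# Usage sketch (assumes a SentencePiece model at "./tokenizer.model").
from tokenizeConfig import OBITokenizer

tok = OBITokenizer(vocab_file="./tokenizer.model")

# Standard slow-tokenizer path: _tokenize -> _convert_token_to_id ->
# build_inputs_with_special_tokens (prepends <s> because add_bos_token=True;
# </s> is only appended when add_eos_token=True).
enc = tok("Hello world")
print(enc["input_ids"], enc["attention_mask"])

# Decoding goes through convert_tokens_to_string, which stitches SentencePiece
# pieces back together and re-inserts a leading space when needed.
print(tok.decode(enc["input_ids"]))

# save_pretrained writes the tokenizer config and, via save_vocabulary, copies
# (or re-serializes) tokenizer.model into the target directory.
tok.save_pretrained("./obi-tokenizer")

# Because _auto_class = "AutoTokenizer", the class can also be loaded remotely
# once the repo's tokenizer_config.json maps AutoTokenizer to
# tokenizeConfig.OBITokenizer (hypothetical repo id):
# from transformers import AutoTokenizer
# tok = AutoTokenizer.from_pretrained("user/obi-repo", trust_remote_code=True)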