aframson committed
Commit d09b77a · 1 Parent(s): 8730f44
Files changed (1)
  1. tokenizeConfig.py +20 -109
tokenizeConfig.py CHANGED
@@ -1,9 +1,9 @@
-"""Tokenization classes for IntermLM."""
 import os
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
 
-import sentencepiece as spm
+import tokenizers
+from tokenizers import models, pre_tokenizers, decoders, trainers
 
 from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.utils import logging
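Note: this hunk swaps the sentencepiece dependency for the Hugging Face tokenizers library. Of the new imports, trainers is not used anywhere in the lines added by this commit; presumably it is meant for fitting the (otherwise empty) BPE model. A rough, hypothetical sketch of that step — the corpus file name, vocab size, and special tokens are placeholders, not taken from the commit:

    from tokenizers import Tokenizer, models, pre_tokenizers, trainers

    tok = Tokenizer(models.BPE())                     # starts with an empty vocab
    tok.pre_tokenizer = pre_tokenizers.ByteLevel()
    trainer = trainers.BpeTrainer(vocab_size=32000, special_tokens=["<s>", "</s>"])
    tok.train(files=["corpus.txt"], trainer=trainer)  # learns merges and vocab from text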
@@ -36,14 +36,11 @@ class OBITokenizer(PreTrainedTokenizer):
         bos_token="<s>",
         eos_token="</s>",
         pad_token="</s>",
-        sp_model_kwargs: Optional[Dict[str, Any]] = None,
         add_bos_token=True,
         add_eos_token=False,
-        decode_with_prefix_space=False,
         clean_up_tokenization_spaces=False,
         **kwargs,
     ):
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
@@ -55,12 +52,14 @@ class OBITokenizer(PreTrainedTokenizer):
         self.vocab_file = vocab_file
         self.add_bos_token = add_bos_token
         self.add_eos_token = add_eos_token
-        self.decode_with_prefix_space = decode_with_prefix_space
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
-        self._no_prefix_space_tokens = None
+        self.tokenizer = tokenizers.Tokenizer(models.BPE())
+        self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+        self.tokenizer.decoder = decoders.ByteLevel()
+        self.tokenizer.post_processor = tokenizers.processors.ByteLevel()
+        self.tokenizer.enable_truncation(max_length=512)  # Adjust max_length as needed
+        self.tokenizer.enable_padding(max_length=512, pad_token="[PAD]")  # Adjust max_length and pad_token as needed
 
-        """ Initialisation"""
+        self._no_prefix_space_tokens = None
 
     @property
     def no_prefix_space_tokens(self):
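Note: two details in the added constructor code do not line up with the tokenizers API or with the rest of the class. enable_padding() has no max_length parameter (that keyword belongs to enable_truncation()); it takes length, pad_token, pad_id, and so on, so the call as committed raises a TypeError. The "[PAD]" token also differs from the pad_token="</s>" passed to PreTrainedTokenizer.__init__. A minimal sketch of the same setup with the documented keyword names — the pad_id of 0 is an assumption and must match the id "</s>" gets in the trained vocab:

    import tokenizers
    from tokenizers import models, pre_tokenizers, decoders, processors

    tok = tokenizers.Tokenizer(models.BPE())
    tok.pre_tokenizer = pre_tokenizers.ByteLevel()
    tok.decoder = decoders.ByteLevel()
    tok.post_processor = processors.ByteLevel()
    tok.enable_truncation(max_length=512)                       # truncate long inputs
    tok.enable_padding(length=512, pad_token="</s>", pad_id=0)  # pad to a fixed length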
@@ -72,15 +71,15 @@ class OBITokenizer(PreTrainedTokenizer):
     @property
     def vocab_size(self):
         """Returns vocab size"""
-        return self.sp_model.get_piece_size()
+        return len(self.tokenizer.get_vocab())
 
     @property
     def bos_token_id(self) -> Optional[int]:
-        return self.sp_model.bos_id()
+        return self.tokenizer.token_to_id("<s>")
 
     @property
     def eos_token_id(self) -> Optional[int]:
-        return self.sp_model.eos_id()
+        return self.tokenizer.token_to_id("</s>")
 
     def get_vocab(self):
         """Returns vocab as a dict"""
@@ -90,43 +89,20 @@ class OBITokenizer(PreTrainedTokenizer):
 
     def _tokenize(self, text):
         """Returns a tokenized string."""
-        return self.sp_model.encode(text, out_type=str)
+        encoding = self.tokenizer.encode(text)
+        return encoding.tokens
 
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.piece_to_id(token)
+        return self.tokenizer.token_to_id(token)
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        return token
-
-    def _maybe_add_prefix_space(self, tokens, decoded):
-        if tokens and tokens[0] not in self.no_prefix_space_tokens:
-            return " " + decoded
-        else:
-            return decoded
+        return self.tokenizer.id_to_token(index)
 
     def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        current_sub_tokens = []
-        out_string = ""
-        prev_is_special = False
-        for token in tokens:
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self.all_special_tokens:
-                if not prev_is_special:
-                    out_string += " "
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                prev_is_special = True
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-                prev_is_special = False
-        out_string += self.sp_model.decode(current_sub_tokens)
-        out_string = self.clean_up_tokenization(out_string)
-        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
-        return out_string[1:]
+        """Converts a sequence of tokens (string) into a single string."""
+        return self.tokenizer.decode(tokens).strip()
 
     def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
@@ -144,72 +120,7 @@ class OBITokenizer(PreTrainedTokenizer):
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
         )
 
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
+        # Save the BPE vocab
+        self.tokenizer.get_vocab().to_file(out_vocab_file)
 
         return (out_vocab_file,)
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        if self.add_bos_token:
-            bos_token_ids = [self.bos_token_id]
-        else:
-            bos_token_ids = []
-
-        output = bos_token_ids + token_ids_0
-
-        if token_ids_1 is not None:
-            output = output + token_ids_1
-
-        if self.add_eos_token:
-            output = output + [self.eos_token_id]
-
-        return output
-
-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
-        use of token type ids, therefore a list of zeros is returned.
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-        Returns:
-            `List[int]`: List of zeros.
-        """
-        eos = [self.eos_token_id]
-
-        if token_ids_1 is None:
-            return len(token_ids_0 + eos) * [0]
-        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
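Note: Tokenizer.get_vocab() returns a plain dict, which has no to_file() method, so the added save line raises an AttributeError. Two hedged alternatives, either of which keeps the rest of save_vocabulary intact: dump the vocab dict as JSON, or serialize the whole tokenizer (vocab, merges, pre-tokenizer, decoder) with Tokenizer.save():

    import json

    # Option 1: write just the token-to-id mapping
    with open(out_vocab_file, "w", encoding="utf-8") as f:
        json.dump(self.tokenizer.get_vocab(), f, ensure_ascii=False)

    # Option 2: write the full tokenizer definition as JSON
    self.tokenizer.save(out_vocab_file)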