import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
import tokenizers
from tokenizers import models, pre_tokenizers, decoders, trainers
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {}
class OBITokenizer(PreTrainedTokenizer):
"""
    Construct an OBI tokenizer, based on byte-level Byte-Pair-Encoding (BPE).
Args:
vocab_file (`str`):
Path to the vocabulary file.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
_auto_class = "AutoTokenizer"
def __init__(
self,
vocab_file,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token="</s>",
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
**kwargs,
):
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        if vocab_file is not None and os.path.isfile(vocab_file):
            # Reload a tokenizer.json produced by an earlier training/save run
            self.tokenizer = tokenizers.Tokenizer.from_file(vocab_file)
        else:
            # Otherwise start from an empty byte-level BPE model that can be trained later
            self.tokenizer = tokenizers.Tokenizer(models.BPE())
            self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
            self.tokenizer.decoder = decoders.ByteLevel()
            self.tokenizer.post_processor = tokenizers.processors.ByteLevel()
        self.tokenizer.enable_truncation(max_length=512)  # Adjust max_length as needed
        # enable_padding takes `length` (not `max_length`); adjust length and pad_token as needed
        self.tokenizer.enable_padding(length=512, pad_token="[PAD]")
        self._no_prefix_space_tokens = None
        # The backend tokenizer must exist before the base constructor runs, because
        # PreTrainedTokenizer.__init__ may resolve special tokens through the methods below.
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
@property
def no_prefix_space_tokens(self):
if self._no_prefix_space_tokens is None:
vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
            # Byte-level BPE marks a leading space with "Ġ" (SentencePiece would use "▁")
            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("Ġ")}
return self._no_prefix_space_tokens
@property
def vocab_size(self):
"""Returns vocab size"""
return len(self.tokenizer.get_vocab())
@property
    def bos_token_id(self) -> Optional[int]:
        # Look up the configured bos_token rather than a hard-coded literal
        return self.tokenizer.token_to_id(str(self.bos_token))
    @property
    def eos_token_id(self) -> Optional[int]:
        # Look up the configured eos_token rather than a hard-coded literal
        return self.tokenizer.token_to_id(str(self.eos_token))
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
    def _tokenize(self, text):
        """Splits text into a list of token strings (the base class converts them to ids afterwards)."""
        encoding = self.tokenizer.encode(text, add_special_tokens=False)
        return encoding.tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.tokenizer.token_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.tokenizer.id_to_token(index)
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        # Tokenizer.decode expects ids, so run the token strings through the byte-level decoder instead
        return self.tokenizer.decoder.decode(tokens)
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
        # Save the current BPE model. Training is a separate step (see the usage sketch at the
        # end of this module) and must not use the output path as its input corpus.
        if (
            self.vocab_file
            and os.path.isfile(self.vocab_file)
            and os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file)
        ):
            # We were loaded from an existing tokenizer.json: copy it as-is
            copyfile(self.vocab_file, out_vocab_file)
        else:
            # Otherwise serialize the in-memory tokenizer (vocab, merges and configuration)
            self.tokenizer.save(out_vocab_file)
        return (out_vocab_file,)
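# ---------------------------------------------------------------------------
# Minimal usage sketch (added for illustration, not part of the original API):
# train a byte-level BPE model on a plain-text corpus with the `tokenizers`
# trainer used above, save it as tokenizer.json, and wrap it with OBITokenizer
# for an encode/decode round trip. "corpus.txt" is a hypothetical file name;
# adjust paths and special tokens for your own data.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Build and train a standalone byte-level BPE model
    bpe = tokenizers.Tokenizer(models.BPE(unk_token="<unk>"))
    bpe.pre_tokenizer = pre_tokenizers.ByteLevel()
    bpe.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(
        special_tokens=["<unk>", "<s>", "</s>", "[PAD]"],
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )
    bpe.train(files=["corpus.txt"], trainer=trainer)  # hypothetical corpus file
    bpe.save("tokenizer.json")

    # Wrap the trained model and round-trip a sample sentence
    tok = OBITokenizer(vocab_file="tokenizer.json")
    tok.tokenizer.no_padding()  # drop fixed-length padding so the demo output stays short
    ids = tok.encode("Hello, world!")
    print("ids:", ids)
    print("text:", tok.decode(ids))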