# RDPD-mini/tokenizeConfig.py
import os
from typing import Optional, Tuple

import tokenizers
from tokenizers import decoders, models, pre_tokenizers, processors, trainers
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import logging

logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {}
class OBITokenizer(PreTrainedTokenizer):
"""
Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
Args:
vocab_file (`str`):
Path to the vocabulary file.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
_auto_class = "AutoTokenizer"
def __init__(
self,
vocab_file,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
pad_token="</s>",
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
**kwargs,
):
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        # Build the backing `tokenizers` object before calling the parent
        # constructor, which may query the vocabulary during initialization.
        if vocab_file is not None and os.path.isfile(vocab_file):
            # Reload a previously serialized tokenizer (tokenizer.json).
            self.tokenizer = tokenizers.Tokenizer.from_file(vocab_file)
        else:
            # Start from an untrained byte-level BPE model.
            self.tokenizer = tokenizers.Tokenizer(models.BPE())
        self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        self.tokenizer.decoder = decoders.ByteLevel()
        self.tokenizer.post_processor = processors.ByteLevel()
        self.tokenizer.enable_truncation(max_length=512)  # Adjust max_length as needed
        self.tokenizer.enable_padding(length=512, pad_token="[PAD]")  # Adjust length and pad_token as needed
        self._no_prefix_space_tokens = None
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
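    # Ids of tokens that should not be prefixed with a space when decoding.
    # The "▁" check below follows the SentencePiece convention for word-initial
    # tokens; a pure byte-level BPE vocabulary does not use this marker.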
@property
def no_prefix_space_tokens(self):
if self._no_prefix_space_tokens is None:
vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
return self._no_prefix_space_tokens
@property
def vocab_size(self):
"""Returns vocab size"""
return len(self.tokenizer.get_vocab())
    @property
    def bos_token_id(self) -> Optional[int]:
        return self.tokenizer.token_to_id(self.bos_token)
    @property
    def eos_token_id(self) -> Optional[int]:
        return self.tokenizer.token_to_id(self.eos_token)
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
    def _tokenize(self, text):
        """Returns the text as a list of token strings."""
        encoding = self.tokenizer.encode(text)
        # Return token strings, not ids; id conversion is handled separately.
        return encoding.tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.tokenizer.token_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.tokenizer.id_to_token(index)
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        # The byte-level decoder maps token strings back to text;
        # `Tokenizer.decode` expects ids, so it is not used here.
        return self.tokenizer.decoder.decode(tokens)
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
        # Serialize the current tokenizer state (vocab, merges, and configuration)
        # to a single JSON file readable by `tokenizers.Tokenizer.from_file`.
self.tokenizer.save(out_vocab_file)
return (out_vocab_file,)
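

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the shipped API): trains the
# underlying byte-level BPE model on a plain-text corpus, saves it, and runs a
# quick encode/decode round trip. The corpus path "corpus.txt" is hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    tok = OBITokenizer(vocab_file="tokenizer.json")
    # Fit the BPE model; the special tokens mirror the ones configured above.
    trainer = trainers.BpeTrainer(special_tokens=["<unk>", "<s>", "</s>", "[PAD]"])
    tok.tokenizer.train(files=["corpus.txt"], trainer=trainer)
    # Persist the trained state next to the model files.
    tok.save_vocabulary(".")
    encoding = tok.tokenizer.encode("Hello, world!")
    print(encoding.tokens)
    print(tok.tokenizer.decode(encoding.ids))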