from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
import json

class OBITokenizer:

    def __init__(self):
        # Wiring [UNK] into the model keeps it consistent with the special
        # tokens declared in train() and save_config() below.
        bpe_model = models.BPE(unk_token="[UNK]")
        self.tokenizer = Tokenizer(bpe_model)
        # Byte-level pre-tokenization covers every possible byte; the
        # matching decoder restores the original text on decode.
        self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        self.tokenizer.decoder = decoders.ByteLevel()

    def train(self, files, save_path):
        # Learn a BPE vocabulary from the given text files, reserving the
        # special tokens, then serialize the trained tokenizer to save_path.
        trainer = trainers.BpeTrainer(
            special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
        )
        self.tokenizer.train(files=files, trainer=trainer)
        self.tokenizer.save(save_path)

    def save_config(self, config_file):
        # Emit a Hugging Face-style tokenizer config; the auto_map entry
        # lets AutoTokenizer resolve this class from tokenizeConfig.py.
        # Note: vocab_size is only meaningful after train() has run.
        config_dict = {
            "tokenizer_type": "custom",
            "vocab_size": self.tokenizer.get_vocab_size(),
            "tokenizer_class": "OBITokenizer",
            "auto_map": {
                "AutoTokenizer": ["tokenizeConfig.OBITokenizer"]
            },
            "bos_token": "[CLS]",
            "eos_token": "[SEP]",
            "unk_token": "[UNK]",
            "pad_token": "[PAD]",
            "mask_token": "[MASK]"
        }
        with open(config_file, "w") as f:
            json.dump(config_dict, f)

    def encode(self, text):
        # Encode raw text and return only the token ids.
        encoding = self.tokenizer.encode(text)
        return encoding.ids

    def decode(self, ids):
        # Map token ids back to text through the byte-level decoder.
        return self.tokenizer.decode(ids)
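

# Minimal usage sketch. Assumptions: "corpus.txt" and the two output paths
# are hypothetical placeholders, and train() must run before encode()/decode()
# so that the BPE vocabulary exists.
if __name__ == "__main__":
    obi = OBITokenizer()
    obi.train(files=["corpus.txt"], save_path="tokenizer.json")
    obi.save_config("tokenizer_config.json")

    ids = obi.encode("Hello, world!")
    print(ids)              # token ids from the trained BPE vocabulary
    print(obi.decode(ids))  # decodes the ids back to text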