import json
import os
import unittest

from transformers.models.biogpt.tokenization_biogpt import VOCAB_FILES_NAMES, BioGptTokenizer
from transformers.testing_utils import slow

from ...test_tokenization_common import TokenizerTesterMixin


class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = BioGptTokenizer
    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()
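
        # Toy BPE vocabulary: single characters plus a few merged pieces, with
        # "</w>" marking end-of-word symbols (the same toy setup as the subword-nmt example).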
        vocab = [
            "l",
            "o",
            "w",
            "e",
            "r",
            "s",
            "t",
            "i",
            "d",
            "n",
            "w</w>",
            "r</w>",
            "t</w>",
            "lo",
            "low",
            "er</w>",
            "low</w>",
            "lowest</w>",
            "newer</w>",
            "wider</w>",
            "<unk>",
        ]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
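        # BPE merge rules: each line is a merge pair followed by a score field that the
        # tokenizer does not use; the trailing empty string newline-terminates the file.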
        merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
        with open(self.vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))
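
    # get_input_output_texts supplies the example strings used by the shared tests
    # in TokenizerTesterMixin.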
    def get_input_output_texts(self, tokenizer):
        input_text = "lower newer"
        output_text = "lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        """Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt"""
        tokenizer = BioGptTokenizer(self.vocab_file, self.merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
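
    # build_inputs_with_special_tokens is expected to prepend a single separator token
    # (id 2, the "</s>" token in the pretrained microsoft/biogpt vocab) to each sequence,
    # and to join sequence pairs with the same token.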
    @slow
    def test_sequence_builders(self):
        tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")

        text = tokenizer.encode("sequence builders", add_special_tokens=False)
        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        self.assertTrue(encoded_sentence == [2] + text)
        self.assertTrue(encoded_pair == [2] + text + [2] + text_2)