ใ“ใ‚Œใฏใฒใ‚‰ใŒใชใซๅค‰ๆ›ใ—ใŸใƒ‡ใƒผใ‚ฟใ‚ปใƒƒใƒˆใงไบ‹ๅ‰ๅญฆ็ฟ’ใ—ใŸ่จ€่ชžใƒขใƒ‡ใƒซใงใ™ใ€‚ ใฒใ‚‰ใŒใชใ‚’ๆ–‡ๅญ—ๅ˜ไฝใงใƒˆใƒผใ‚ฏใƒณใซๅˆ†ๅ‰ฒใ—ใฆใ„ใ‚‹ใŸใ‚ใ€ๅ›žๆ–‡ใ‚„ๅทๆŸณใฎใ‚ˆใ†ใช้Ÿณใฎๆ•ฐใ‚’้‡่ฆ–ใ™ใ‚‹ใ‚ฟใ‚นใ‚ฏใซ้ฉใ—ใฆใ„ใพใ™ใ€‚ ไธ‹่จ˜ใฎใ‚ณใƒผใƒ‰ใงๅ‹•ไฝœใ•ใ›ใ‚‹ใ“ใจใŒใงใใพใ™ใ€‚

This is a language model pre-trained on a dataset converted into Japaneses-Hiragana. Since it tokenizes Hiragana at the character level, it is suitable for tasks that emphasize the number of sounds, such as palindromes or senryu (a form of Japanese poetry). You can run it using the code below.

import json
import os
import torch
from typing import Dict, List, Optional, Sequence, Union

from transformers import AutoModelForCausalLM
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer


class CharacterTokenizer(PreTrainedTokenizer):
    def __init__(
        self, characters: Sequence[str] = "", model_max_length: int = 1024, **kwargs
    ):
        self.characters = characters
        self.model_max_length = model_max_length
        cls_token = AddedToken("[CLS]", lstrip=False, rstrip=False)
        sep_token = AddedToken("[SEP]", lstrip=False, rstrip=False)
        bos_token = AddedToken("[BOS]", lstrip=False, rstrip=False)
        eos_token = AddedToken("[EOS]", lstrip=False, rstrip=False)
        mask_token = AddedToken("[MASK]", lstrip=True, rstrip=False)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        unk_token = AddedToken("[UNK]", lstrip=False, rstrip=False)

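        # Special tokens occupy IDs 0-6; each character in `characters` is mapped to an ID from 7 upward.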
        self._vocab_str_to_int = {
            "[CLS]": 0,
            "[SEP]": 1,
            "[BOS]": 2,
            "[MASK]": 3,
            "[PAD]": 4,
            "[EOS]": 5,
            "[UNK]": 6,
            **{ch: i + 7 for i, ch in enumerate(characters)},
        }
        self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            unk_token=unk_token,
            add_prefix_space=False,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        return len(self._vocab_str_to_int)

    def get_vocab(self):
        return self._vocab_str_to_int

    def _tokenize(self, text: str) -> List[str]:
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        return self._vocab_str_to_int.get(token, self._vocab_str_to_int["[UNK]"])

    def _convert_id_to_token(self, index: int) -> str:
        return self._vocab_int_to_str[index]

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        result = cls + token_ids_0 + sep
        if token_ids_1 is not None:
            result += token_ids_1 + sep
        return result

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        result = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            result += ([0] * len(token_ids_1)) + [1]
        return result

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        result = len(cls + token_ids_0 + sep) * [0]
        if token_ids_1 is not None:
            result += len(token_ids_1 + sep) * [1]
        return result

    def get_config(self) -> Dict:
        return {
            "char_ords": [ord(ch) for ch in self.characters],
            "model_max_length": self.model_max_length,
        }

    @classmethod
    def from_config(cls, config: Dict) -> "CharacterTokenizer":
        cfg = {}
        cfg["characters"] = [chr(i) for i in config["char_ords"]]
        cfg["model_max_length"] = config["model_max_length"]
        return cls(**cfg)

    def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
        # Only the character inventory and the maximum length need to be persisted.
        os.makedirs(save_directory, exist_ok=True)
        cfg_file = os.path.join(save_directory, "tokenizer_config.json")
        cfg = self.get_config()
        with open(cfg_file, "w") as f:
            json.dump(cfg, f, indent=4)

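    # Loading mirrors save_pretrained above: the character inventory is read back
    # from the tokenizer_config.json file resolved by the Hugging Face hub machinery.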
    @classmethod
    def _from_pretrained(
        cls,
        resolved_vocab_files,
        pretrained_model_name_or_path,
        init_configuration,
        *init_inputs,
        token=None,
        cache_dir=None,
        local_files_only=False,
        _commit_hash=None,
        _is_local=False,
        trust_remote_code=False,
        **kwargs,
    ):
        config_file = resolved_vocab_files["tokenizer_config_file"]
        with open(config_file, "r", encoding="utf-8") as f:
            config = json.load(f)
        return cls.from_config(config)


tokenizer = CharacterTokenizer.from_pretrained("hukuda222/hiragana-gpt2-xsmall")
model = AutoModelForCausalLM.from_pretrained("hukuda222/hiragana-gpt2-xsmall")

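# Encode a hiragana prompt and generate a continuation (greedy decoding by default,
# with repeated 3-grams blocked).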
with torch.no_grad():
    token_ids = tokenizer.encode(
        "ใ“ใ‚“ใซใกใฏ", add_special_tokens=False, return_tensors="pt"
    )
    output_ids = model.generate(
        token_ids.to(model.device),
        max_new_tokens=50,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
    )
output = tokenizer.decode(
    output_ids.tolist()[0][token_ids.size(1) :], skip_special_tokens=True
)
print(output)
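
Because tokenization is character-level, the number of tokens in a hiragana string (with special tokens excluded) equals its character count, which roughly corresponds to the number of sounds counted in senryu or haiku (small ゃ/ゅ/ょ would need extra handling). The following is a minimal sketch that reuses the tokenizer loaded above; the example phrases are only illustrative.

def mora_counts(lines):
    # With character-level tokenization, the token count per line equals the
    # character count, even for characters outside the vocabulary (each maps to [UNK]).
    return [len(tokenizer.encode(line, add_special_tokens=False)) for line in lines]

print(mora_counts(["ใตใ‚‹ใ„ใ‘ใ‚„", "ใ‹ใ‚ใšใจใณใ“ใ‚€", "ใฟใšใฎใŠใจ"]))  # expected: [5, 7, 5]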