asas
Browse files- config.json +24 -0
- pytorch_model.bin +3 -0
- tokenizeConfig.py +49 -0
- tokenizer.json +0 -0
- tokenizer_config.json +11 -0
config.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"OBILanguageModel"
|
4 |
+
],
|
5 |
+
"auto_map": {
|
6 |
+
"AutoConfig": "modelConfig.OBIConfig",
|
7 |
+
"AutoModel": "modelLM.OBILanguageModel",
|
8 |
+
"AutoModelForCausalLM": "modelLM.OBILanguageModel",
|
9 |
+
"AutoModelForQuestionAnswering": "modelLM.OBILanguageModel"
|
10 |
+
},
|
11 |
+
"batch_size": 5,
|
12 |
+
"block_size": 20,
|
13 |
+
"device": "cpu",
|
14 |
+
"eval_interval": 100,
|
15 |
+
"hidden_dropout_prob": 0.1,
|
16 |
+
"hidden_size": 4,
|
17 |
+
"learning_rate": 0.001,
|
18 |
+
"max_iters": 900,
|
19 |
+
"num_attention_heads": 2,
|
20 |
+
"num_hidden_layers": 2,
|
21 |
+
"torch_dtype": "float32",
|
22 |
+
"transformers_version": "4.30.2",
|
23 |
+
"vocab_size": 92656
|
24 |
+
}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b20b5dbce3759e8c4e5bd443cff00d5487468d7a5a519f63aa5c05ab4eea6e22
|
3 |
+
size 3362505
|
tokenizeConfig.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
|
2 |
+
import json
|
3 |
+
|
4 |
+
class OBITokenizer:
    """Byte-level BPE tokenizer built on the HuggingFace `tokenizers` library.

    Wraps a `tokenizers.Tokenizer` with a ByteLevel pre-tokenizer/decoder pair,
    plus helpers to train the vocabulary, persist the tokenizer, and emit a
    transformers-style tokenizer_config.json.
    """

    def __init__(self):
        # Byte-level pre-tokenization + matching byte-level decoding: any input
        # text round-trips without needing every character in the vocabulary.
        bpe_model = models.BPE()
        self.tokenizer = Tokenizer(bpe_model)
        self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        self.tokenizer.decoder = decoders.ByteLevel()

    def train(self, files, save_path):
        """Fit the BPE vocabulary on text files and save the trained tokenizer.

        Args:
            files: list of paths to plain-text training files.
            save_path: destination path for the serialized tokenizer JSON.
        """
        # Special tokens are reserved up front so they get stable, low ids.
        trainer = trainers.BpeTrainer(
            special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
        )
        self.tokenizer.train(trainer=trainer, files=files)
        # Persist immediately so a trained vocabulary is never lost.
        self.tokenizer.save(save_path)

    def save_config(self, config_file):
        """Serialize a transformers-style tokenizer config to `config_file` (JSON)."""
        config_dict = {
            "tokenizer_type": "custom",
            "vocab_size": self.tokenizer.get_vocab_size(),
            "tokenizer_class": "OBITokenizer",
            "auto_map": {
                "AutoTokenizer": ["tokenizeConfig.OBITokenizer"]
            },
            "bos_token": "[CLS]",
            "eos_token": "[SEP]",
            "unk_token": "[UNK]",
            "pad_token": "[PAD]",
            "mask_token": "[MASK]"
            # Add other custom settings if needed
        }
        # Explicit utf-8: the written config must not depend on the platform's
        # default encoding.
        with open(config_file, "w", encoding="utf-8") as f:
            json.dump(config_dict, f)

    def encode(self, text):
        """Return the list of token ids for `text`."""
        encoding = self.tokenizer.encode(text)
        return encoding.ids

    def decode(self, ids):
        """Return the text reconstructed from a sequence of token `ids`."""
        return self.tokenizer.decode(ids)
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"tokenizer_type": "custom",
|
3 |
+
"vocab_size": 10187,
|
4 |
+
"tokenizer_class": "OBITokenizer",
|
5 |
+
"auto_map": { "AutoTokenizer": ["tokenizeConfig.OBITokenizer"] },
|
6 |
+
"bos_token": "[CLS]",
|
7 |
+
"eos_token": "[SEP]",
|
8 |
+
"unk_token": "[UNK]",
|
9 |
+
"pad_token": "[PAD]",
|
10 |
+
"mask_token": "[MASK]"
|
11 |
+
}
|