aframson committed on
Commit 88541c2 · 1 Parent(s): 77aa464
Files changed (5)
  1. config.json +24 -0
  2. pytorch_model.bin +3 -0
  3. tokenizeConfig.py +49 -0
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +11 -0
config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "architectures": [
+     "OBILanguageModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "modelConfig.OBIConfig",
+     "AutoModel": "modelLM.OBILanguageModel",
+     "AutoModelForCausalLM": "modelLM.OBILanguageModel",
+     "AutoModelForQuestionAnswering": "modelLM.OBILanguageModel"
+   },
+   "batch_size": 5,
+   "block_size": 20,
+   "device": "cpu",
+   "eval_interval": 100,
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 4,
+   "learning_rate": 0.001,
+   "max_iters": 900,
+   "num_attention_heads": 2,
+   "num_hidden_layers": 2,
+   "torch_dtype": "float32",
+   "transformers_version": "4.30.2",
+   "vocab_size": 92656
+ }
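The "auto_map" block is what lets the transformers Auto* loaders resolve to the custom classes shipped with this repo (modelConfig.OBIConfig, modelLM.OBILanguageModel). A minimal loading sketch, assuming a hypothetical repo id "aframson/obi" (the actual repo name is not part of this commit):

from transformers import AutoConfig, AutoModelForCausalLM

# trust_remote_code=True is required so transformers imports the custom
# config/model classes named in auto_map instead of a built-in architecture.
config = AutoConfig.from_pretrained("aframson/obi", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("aframson/obi", trust_remote_code=True)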
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b20b5dbce3759e8c4e5bd443cff00d5487468d7a5a519f63aa5c05ab4eea6e22
+ size 3362505
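pytorch_model.bin is committed as a Git LFS pointer: the repo stores only the object id and size, and the ~3.4 MB weight file is fetched from LFS storage on checkout. A quick integrity check of a downloaded copy against the oid above, using only the standard library (the local path is assumed):

import hashlib

# Hash the downloaded file in chunks and compare with the LFS oid above.
h = hashlib.sha256()
with open("pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == "b20b5dbce3759e8c4e5bd443cff00d5487468d7a5a519f63aa5c05ab4eea6e22"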
tokenizeConfig.py ADDED
@@ -0,0 +1,49 @@
+ from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
+ import json
+
+ class OBITokenizer:
+     def __init__(self):
+         # Initialize a BPE model for tokenization
+         bpe_model = models.BPE()
+         # Initialize the tokenizer
+         self.tokenizer = Tokenizer(bpe_model)
+         # Add pre-tokenization and decoding steps if needed
+         self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
+         self.tokenizer.decoder = decoders.ByteLevel()
+
+     def train(self, files, save_path):
+         # Training: Fit the tokenizer on your text data
+         trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])
+         self.tokenizer.train(trainer=trainer, files=files)
+         # Save the trained tokenizer to a file
+         self.tokenizer.save(save_path)
+
+
+
+     def save_config(self, config_file):
+         # Serialize the tokenizer's config to a JSON file
+         config_dict = {
+             "tokenizer_type": "custom",
+             "vocab_size": self.tokenizer.get_vocab_size(),
+             "tokenizer_class": "OBITokenizer",
+             "auto_map": {
+                 "AutoTokenizer": ["tokenizeConfig.OBITokenizer"]
+             },
+             "bos_token": "[CLS]",
+             "eos_token": "[SEP]",
+             "unk_token": "[UNK]",
+             "pad_token": "[PAD]",
+             "mask_token": "[MASK]"
+             # Add other custom settings if needed
+         }
+         with open(config_file, "w") as f:
+             json.dump(config_dict, f)
+
+     def encode(self, text):
+         # Encode text using the custom tokenizer
+         encoding = self.tokenizer.encode(text)
+         return encoding.ids
+
+     def decode(self, ids):
+         # Decode IDs to text using the custom tokenizer
+         return self.tokenizer.decode(ids)
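A usage sketch for the class above, with hypothetical file paths (none of these paths come from the commit itself):

from tokenizeConfig import OBITokenizer

tok = OBITokenizer()
# Train the BPE model on one or more plain-text files and save the result.
tok.train(files=["corpus.txt"], save_path="tokenizer.json")
tok.save_config("tokenizer_config.json")

ids = tok.encode("hello world")   # list of token ids
text = tok.decode(ids)            # round-trip back to a string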
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
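tokenizer.json is presumably the tokenizer serialized by OBITokenizer.train above via self.tokenizer.save(save_path); this commit view does not render its contents. If so, it can be reloaded directly with the tokenizers library, independent of the wrapper class:

from tokenizers import Tokenizer

# Load the serialized BPE tokenizer committed in this file.
tok = Tokenizer.from_file("tokenizer.json")
ids = tok.encode("hello world").ids
print(tok.decode(ids))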
 
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "tokenizer_type": "custom",
+   "vocab_size": 10187,
+   "tokenizer_class": "OBITokenizer",
+   "auto_map": { "AutoTokenizer": ["tokenizeConfig.OBITokenizer"] },
+   "bos_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "unk_token": "[UNK]",
+   "pad_token": "[PAD]",
+   "mask_token": "[MASK]"
+ }