nikhil-kumar committed on
Commit
b9aafd2
·
verified ·
1 Parent(s): 9b1cf41

Create preprocess_dataset.py

Browse files
Files changed (1) hide show
  1. preprocess_dataset.py +24 -0
preprocess_dataset.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer
2
+ import json
3
+
4
+ # Load tokenizer
5
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
6
+
7
+ # Convert text dataset to tokenized dataset
8
+ data = []
9
+ with open("flirty_dataset.txt", "r") as f:
10
+ dialogue = f.read().split("---\n")
11
+ for pair in dialogue:
12
+ if pair.strip():
13
+ lines = pair.strip().split("\n")
14
+ user, bot = lines[0].split(": ", 1), lines[1].split(": ", 1)
15
+ input_text = f"{user[0]}: {user[1]}"
16
+ output_text = f"{bot[0]}: {bot[1]}"
17
+ data.append({"input_ids": tokenizer(input_text, truncation=True)["input_ids"],
18
+ "labels": tokenizer(output_text, truncation=True)["input_ids"]})
19
+
20
+ # Save tokenized data
21
+ with open("flirty_dataset_tokenized.json", "w") as f:
22
+ json.dump(data, f)
23
+
24
+ print("Preprocessing complete! Tokenized dataset saved as flirty_dataset_tokenized.json")