Spaces:
Sleeping
Sleeping
Create preprocess_dataset.py
Browse files- preprocess_dataset.py +24 -0
preprocess_dataset.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer
|
2 |
+
import json
|
3 |
+
|
4 |
+
# Preprocess "flirty_dataset.txt" into a JSON list of tokenized examples.
#
# The input file holds dialogue blocks separated by a line containing "---".
# The first two lines of each block are expected to look like
# "<speaker>: <text>"; the first becomes the model input and the second
# becomes the label sequence. Any further lines in a block are ignored.

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")


def _split_speaker_line(line):
    """Split a ``"<speaker>: <text>"`` line at the first ``": "``.

    Returns a ``(speaker, text)`` tuple, or ``None`` when the line does not
    contain the ``": "`` separator.
    """
    speaker, separator, text = line.partition(": ")
    if not separator:
        return None
    return speaker, text


# Convert text dataset to tokenized dataset
data = []
skipped = 0

# Read with an explicit encoding so non-ASCII dialogue text decodes the same
# way on every platform, and close the file before the tokenization loop.
with open("flirty_dataset.txt", "r", encoding="utf-8") as f:
    dialogue = f.read().split("---\n")

for pair in dialogue:
    pair = pair.strip()
    if not pair:
        continue

    lines = pair.split("\n")
    if len(lines) < 2:
        # A block needs a user line and a bot line; report it instead of
        # crashing with IndexError.
        skipped += 1
        print(f"Skipping malformed block (expected 2 lines): {pair!r}")
        continue

    user = _split_speaker_line(lines[0])
    bot = _split_speaker_line(lines[1])
    if user is None or bot is None:
        skipped += 1
        print(f"Skipping malformed block (missing ': ' separator): {pair!r}")
        continue

    input_text = f"{user[0]}: {user[1]}"
    output_text = f"{bot[0]}: {bot[1]}"

    # NOTE(review): input and label are tokenized independently, so the two
    # id lists can differ in length; confirm the training code expects that.
    data.append(
        {
            "input_ids": tokenizer(input_text, truncation=True)["input_ids"],
            "labels": tokenizer(output_text, truncation=True)["input_ids"],
        }
    )

# Save tokenized data
with open("flirty_dataset_tokenized.json", "w", encoding="utf-8") as f:
    json.dump(data, f)

if skipped:
    print(f"Skipped {skipped} malformed dialogue block(s).")
print("Preprocessing complete! Tokenized dataset saved as flirty_dataset_tokenized.json")
|