# Love_and_Smile / preprocess_dataset.py
from transformers import AutoTokenizer
import json
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Convert text dataset to tokenized dataset
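# Expected layout of flirty_dataset.txt (inferred from the parsing below, not
# shown in this snapshot): each record is two "Speaker: text" lines separated
# by a "---" divider, e.g.
#
#   User: Hey, how was your day?
#   Bot: Better now that you asked.
#   ---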
data = []
with open("flirty_dataset.txt", "r") as f:
    dialogue = f.read().split("---\n")
    for pair in dialogue:
        if pair.strip():
            lines = pair.strip().split("\n")
            user, bot = lines[0].split(": ", 1), lines[1].split(": ", 1)
            input_text = f"{user[0]}: {user[1]}"
            output_text = f"{bot[0]}: {bot[1]}"
            data.append({"input_ids": tokenizer(input_text, truncation=True)["input_ids"],
                         "labels": tokenizer(output_text, truncation=True)["input_ids"]})
# Save tokenized data
with open("flirty_dataset_tokenized.json", "w") as f:
json.dump(data, f)
print("Preprocessing complete! Tokenized dataset saved as flirty_dataset_tokenized.json")
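# Optional sanity check (a minimal sketch, not part of the original script):
# reload the JSON that was just written and report how many tokenized pairs it holds.
with open("flirty_dataset_tokenized.json", "r") as f:
    reloaded = json.load(f)
print(f"Sanity check: {len(reloaded)} tokenized input/label pairs on disk.")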