# Scraped from a Hugging Face Space page (page banner read "Spaces: Sleeping") —
# banner text kept here as a comment so this file is valid Python.
from transformers import AutoTokenizer
import json

# Load the GPT-2 tokenizer used to turn text into token-id sequences.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Convert the plain-text dialogue dataset into a tokenized dataset.
# Expected input format: "---\n"-separated pairs, each pair being two lines
# of the form "Speaker: utterance" (first line = user, second line = bot).
data = []
with open("flirty_dataset.txt", "r", encoding="utf-8") as f:
    dialogue = f.read().split("---\n")

for pair in dialogue:
    if not pair.strip():
        continue  # skip empty chunks, e.g. a trailing "---" separator

    lines = pair.strip().split("\n")
    if len(lines) < 2:
        # Malformed pair (missing user or bot line) — skip rather than crash.
        continue

    user = lines[0].split(": ", 1)
    bot = lines[1].split(": ", 1)
    if len(user) < 2 or len(bot) < 2:
        # Line lacks the "Speaker: utterance" separator — skip it.
        continue

    input_text = f"{user[0]}: {user[1]}"
    output_text = f"{bot[0]}: {bot[1]}"
    # truncation=True caps each sequence at the tokenizer's model_max_length.
    data.append({
        "input_ids": tokenizer(input_text, truncation=True)["input_ids"],
        "labels": tokenizer(output_text, truncation=True)["input_ids"],
    })

# Save tokenized data
with open("flirty_dataset_tokenized.json", "w", encoding="utf-8") as f:
    json.dump(data, f)

print("Preprocessing complete! Tokenized dataset saved as flirty_dataset_tokenized.json")