from transformers import AutoTokenizer
import json

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Convert the text dataset to a tokenized dataset.
# Expected file format: blocks separated by "---", each block containing a
# "User: ..." line followed by a "Bot: ..." line.
data = []
with open("flirty_dataset.txt", "r") as f:
    dialogue = f.read().split("---\n")

for pair in dialogue:
    if pair.strip():
        lines = pair.strip().split("\n")
        user, bot = lines[0].split(": ", 1), lines[1].split(": ", 1)
        input_text = f"{user[0]}: {user[1]}"
        output_text = f"{bot[0]}: {bot[1]}"
        data.append({
            "input_ids": tokenizer(input_text, truncation=True)["input_ids"],
            "labels": tokenizer(output_text, truncation=True)["input_ids"],
        })

# Save tokenized data
with open("flirty_dataset_tokenized.json", "w") as f:
    json.dump(data, f)

print("Preprocessing complete! Tokenized dataset saved as flirty_dataset_tokenized.json")
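
# Optional sanity check (a minimal sketch, assuming the script above has already
# written flirty_dataset_tokenized.json to the working directory): reload the
# file and decode the first pair to confirm the text round-trips through the
# tokenizer as expected.
import json
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

with open("flirty_dataset_tokenized.json", "r") as f:
    tokenized = json.load(f)

first = tokenized[0]
print("Input :", tokenizer.decode(first["input_ids"]))
print("Target:", tokenizer.decode(first["labels"]))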