File size: 894 Bytes
b9aafd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from transformers import AutoTokenizer
import json

# Load the GPT-2 tokenizer used to encode both sides of each dialogue pair.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Convert the raw text dataset into tokenized input/label pairs.
# Expected file format: records separated by "---\n", each record holding
# two lines of the form "Speaker: utterance" (first = user, second = bot).
data = []
skipped = 0  # count of malformed records we could not parse
# Explicit UTF-8: without it, open() falls back to the platform locale
# encoding and can mangle or reject non-ASCII dialogue text.
with open("flirty_dataset.txt", "r", encoding="utf-8") as f:
    dialogue = f.read().split("---\n")
    for pair in dialogue:
        if not pair.strip():
            continue
        lines = pair.strip().split("\n")
        # Guard against malformed records: need at least two lines,
        # each containing the "speaker: text" separator. The original
        # code raised IndexError here and aborted the whole run.
        if len(lines) < 2 or ": " not in lines[0] or ": " not in lines[1]:
            skipped += 1
            continue
        user = lines[0].split(": ", 1)
        bot = lines[1].split(": ", 1)
        input_text = f"{user[0]}: {user[1]}"
        output_text = f"{bot[0]}: {bot[1]}"
        data.append({"input_ids": tokenizer(input_text, truncation=True)["input_ids"],
                     "labels": tokenizer(output_text, truncation=True)["input_ids"]})

# Save tokenized data. ensure_ascii=False keeps non-ASCII characters
# (emoji etc.) readable instead of \uXXXX escapes; json.loads reads both.
with open("flirty_dataset_tokenized.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False)

if skipped:
    print(f"Warning: skipped {skipped} malformed record(s).")
print("Preprocessing complete! Tokenized dataset saved as flirty_dataset_tokenized.json")