from datasets import load_from_disk
from fastapi import FastAPI
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import evaluate
import numpy as np
import torch

# Load the dataset
dataset = load_from_disk('LocalDatasets/beyond_good_and_evil')
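# Quick sanity check (a sketch for inspection only; assumes the saved dataset is a
# DatasetDict with 'train'/'test' splits and 'prompt'/'completion' text columns,
# which is how it is used further down).
print(dataset)
print(dataset['train'][0])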
# Initialize the tokenizer
checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token
# Tokenize the dataset: for causal language-model fine-tuning, each prompt is joined
# with its completion into a single sequence ending with EOS (adjust the separator to
# match how the prompts and completions are formatted). The data collator below builds
# the labels from input_ids, so no separate label column is needed here.
def preprocess_function(examples):
    texts = [p + c + tokenizer.eos_token for p, c in zip(examples['prompt'], examples['completion'])]
    return tokenizer(texts, truncation=True, padding='max_length', max_length=512)

tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['prompt', 'completion'])
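# Optional check: decode one processed example to confirm the prompt and completion
# were joined as expected (inspection only).
print(tokenizer.decode(tokenized_datasets['train'][0]['input_ids'], skip_special_tokens=True)[:200])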
# Initialize the data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
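# With mlm=False the collator copies input_ids into labels and sets padding positions
# to -100 so they are ignored by the loss. A small sketch to verify the batch shapes:
sample_batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
print(sample_batch['input_ids'].shape, sample_batch['labels'].shape)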
# Split the dataset
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['test']

# Load the model
model = AutoModelForCausalLM.from_pretrained(checkpoint)
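# Optional: GPT-2 has no pad token by default; mirroring the tokenizer's choice on the
# model config keeps padding and generation consistent (a small, optional addition).
model.config.pad_token_id = tokenizer.pad_token_id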
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=90,
    weight_decay=0.01,
    save_total_limit=2,  # keep only the two most recent checkpoints
)
# Define the compute metrics function (token-level accuracy for the causal LM)
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # In a causal LM the prediction at position i is for the token at position i+1,
    # so shift before comparing, and drop padding positions (labelled -100).
    predictions, labels = predictions[:, :-1].reshape(-1), labels[:, 1:].reshape(-1)
    mask = labels != -100
    return metric.compute(predictions=predictions[mask], references=labels[mask])
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()
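# Report final evaluation metrics after training; trainer.evaluate() runs the eval loop
# on eval_dataset and should return a dict including eval_loss and eval_accuracy.
eval_results = trainer.evaluate()
print(eval_results)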
# Save the model and tokenizer
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

# Load the model and tokenizer for inference
model = AutoModelForCausalLM.from_pretrained('./saved_model')
tokenizer = AutoTokenizer.from_pretrained('./saved_model')

# Example inference
example_prompt = "What is the main idea of Nietzsche's philosophy?"
inputs = tokenizer(example_prompt, return_tensors='pt', truncation=True, padding=True, max_length=512)
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=100, num_beams=5, early_stopping=True)
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated completion: {completion}")
app = FastAPI()

# Expose the generation function as a GET endpoint (route path assumed to be "/").
@app.get("/")
def greet_json(param_prompt: str = "Friedrich Nietzsche, I have just started reading your work and I must say, it is quite thought-provoking. I am intrigued by your concept of the 'Will to Truth.' Can you explain to me what this means?"):
    f_inputs = tokenizer(param_prompt, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        f_outputs = model.generate(**f_inputs, max_length=200, num_beams=5, early_stopping=True)
    f_completion = tokenizer.decode(f_outputs[0], skip_special_tokens=True)
    return {"answer": f_completion}