from datasets import load_from_disk
from fastapi import FastAPI
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
import evaluate
import numpy as np
import torch

# Load the dataset
dataset = load_from_disk('LocalDatasets/beyond_good_and_evil')

# Initialize the tokenizer
checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# GPT-2 has no pad token, so reuse the EOS token for padding
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset. For a causal LM the labels must align token-for-token
# with the inputs, so concatenate prompt and completion into one sequence and
# let the data collator derive the labels, rather than tokenizing the
# completion separately as labels (which would not line up positionally).
def tokenize_function(example):
    texts = [p + c for p, c in zip(example['prompt'], example['completion'])]
    return tokenizer(texts, truncation=True, padding='max_length', max_length=512)

tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=['prompt', 'completion']
)

# Initialize the data collator; mlm=False yields causal-LM labels (a copy of
# input_ids with padding masked to -100; the model shifts them internally).
# Note that because pad_token == eos_token, EOS positions are masked as well.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Split the dataset
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['test']

# Load the model
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',  # renamed to eval_strategy in newer transformers releases
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=90,
    weight_decay=0.01,
    save_total_limit=2,
)

# Define the metric: token-level accuracy
metric = evaluate.load("accuracy")

# Reduce logits to predicted token ids during evaluation so the Trainer does
# not accumulate the full (batch, seq_len, vocab_size) logit tensor in memory
def preprocess_logits_for_metrics(logits, labels):
    return logits.argmax(dim=-1)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # A causal LM predicts the next token: shift predictions and labels by
    # one position, flatten, and drop ignored (-100) label positions
    predictions = predictions[:, :-1].reshape(-1)
    labels = labels[:, 1:].reshape(-1)
    mask = labels != -100
    return metric.compute(predictions=predictions[mask], references=labels[mask])

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

# Load the fine-tuned model and tokenizer for inference
model = AutoModelForCausalLM.from_pretrained('./saved_model')
tokenizer = AutoTokenizer.from_pretrained('./saved_model')
model.eval()

# Example inference
example_prompt = "What is the main idea of Nietzsche's philosophy?"
inputs = tokenizer(example_prompt, return_tensors='pt', truncation=True, padding=True, max_length=512)
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=100, num_beams=5, early_stopping=True)
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated completion: {completion}")

app = FastAPI()

# Expose generation over HTTP. The prompt is taken as a query parameter rather
# than a path parameter: FastAPI path parameters are always required, so the
# default prompt below would never have applied to a "/{param_prompt}" route.
@app.get("/")
def generate_completion(param_prompt: str = "Friedrich Nietzsche, I have just started reading your work and I must say, it is quite thought-provoking. I am intrigued by your concept of the 'Will to Truth.' Can you explain to me what this means?"):
    f_inputs = tokenizer(param_prompt, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        f_outputs = model.generate(**f_inputs, max_length=200, num_beams=5, early_stopping=True)
    f_completion = tokenizer.decode(f_outputs[0], skip_special_tokens=True)
    return {"answer": f_completion}
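
# A minimal sketch of how the app can be served, assuming the uvicorn package
# is installed alongside fastapi and (for the CLI example) that this file is
# named main.py; neither assumption comes from the script itself:
#   uvicorn main:app --host 0.0.0.0 --port 8000
#   curl "http://localhost:8000/?param_prompt=What%20is%20the%20will%20to%20power%3F"
# Note that importing this module runs the full training loop above before the
# server starts, as in the original script layout.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)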