from datasets import load_from_disk
from fastapi import FastAPI
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    Adafactor,
)
import evaluate
import numpy as np
import torch
import os
from datetime import datetime

# Output locations
outputDir = "./Nietzsche_Model"
outputModelDir = outputDir + "/final"

# Load the dataset
dataset = load_from_disk("LocalDatasets/beyond_good_and_evil")

# Load the model and tokenizer
checkpoint = "gpt2-medium"  # Use a larger model if resources allow
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token, so reuse EOS

# Tokenize the dataset. Passing prompt and completion as a pair simply
# concatenates them for GPT-2, which adds no special pair tokens, so each
# example becomes one causal-LM training sequence.
def tokenize_function(example):
    return tokenizer(
        example["prompt"],
        example["completion"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

# DataCollatorForLanguageModeling (mlm=False) derives labels from input_ids at
# batch time, so no separate label column is needed and the raw text columns
# can be dropped. (The original second tokenization pass, which set labels to
# the tokenized completion alone, was dead code: the collator overwrites labels.)
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["prompt", "completion"]
)

# Initialize the data collator: it copies input_ids to labels and masks
# padding positions with -100 so they are ignored by the loss
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Split the dataset, shuffling with a clock-derived seed so each run sees a
# different order
lc_seed = int((datetime.now().timestamp() * 10**7) / 32487.544) - 2588
train_dataset = tokenized_datasets["train"].shuffle(seed=lc_seed)
eval_dataset = tokenized_datasets["test"].shuffle(seed=lc_seed)
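# Quick sanity check (an illustrative addition, not part of the original
# pipeline): decode one shuffled training example to confirm that prompt and
# completion were concatenated as intended before committing to a full run.
sample = train_dataset[0]
print("Sample training text:", tokenizer.decode(sample["input_ids"], skip_special_tokens=True)[:200])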
# Define training arguments
training_args = TrainingArguments(
    output_dir=outputDir,
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=1e-4,  # Overridden by the explicit Adafactor optimizer below
    per_device_train_batch_size=4,  # Raise toward 32 if memory allows
    per_device_eval_batch_size=4,  # Raise toward 32 if memory allows
    max_steps=256,  # Increase for longer training runs
    weight_decay=0.01,
    hub_model_id="Jack-Anderson/N-TSZ",
    push_to_hub=True,
    hub_token=os.getenv("hf_token"),
    # Other arguments
    overwrite_output_dir=False,  # Keep existing contents of the output directory
    disable_tqdm=False,  # Keep progress bars
    eval_steps=120,  # Number of update steps between two evaluations
    save_steps=120,  # Save a checkpoint every 120 steps
    gradient_accumulation_steps=2,  # Effective batch size = 4 * 2 = 8 per device
    load_best_model_at_end=True,
    save_total_limit=10,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Define the compute metrics function (token-level accuracy).
# Note: Trainer accumulates the full logits tensor before calling this; for
# large eval sets consider its preprocess_logits_for_metrics hook.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Causal LM: logits at position i predict the token at position i + 1
    predictions = predictions[:, :-1].flatten()
    labels = labels[:, 1:].flatten()
    # Ignore positions the collator masked with -100 (padding)
    mask = labels != -100
    return metric.compute(predictions=predictions[mask], references=labels[mask])

# Explicit Adafactor instance. Passing it via `optimizers` overrides both
# `learning_rate` and any `optim` setting in TrainingArguments, so the
# original, redundant optim="adafactor" argument has been dropped above.
optimizer = Adafactor(
    model.parameters(),
    lr=1e-3,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    scale_parameter=False,
    relative_step=False,
    warmup_init=False,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),  # None: Trainer builds its default LR scheduler
)

# Train the model
trainer.train()

# Save the final model and tokenizer locally
model.save_pretrained(outputModelDir)
tokenizer.save_pretrained(outputModelDir)

# Push the run to the Hub with a timestamped commit message
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
trainer.push_to_hub(commit_message=f"Training Session {current_time}")

# Example inference
example_prompt = "What is the main idea of Nietzsche's philosophy?"
inputs = tokenizer(
    example_prompt, return_tensors="pt", truncation=True, padding=True, max_length=512
).to(model.device)  # Move inputs to the model's device (GPU if training used one)
model.eval()
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=100,
        num_beams=5,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id,  # Silence the missing-pad-token warning
    )
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated completion: {completion}")

app = FastAPI()

# Serve the prompt as a query parameter on the root path: FastAPI path
# parameters are always required, so the default prompt in the original
# path-parameter route could never be used.
@app.get("/")
def greet_json(
    param_prompt: str = (
        "Friedrich Nietzsche, I have just started reading your work and I must "
        "say, it is quite thought-provoking. I am intrigued by your concept of "
        "the 'Will to Truth.' Can you explain to me what this means?"
    ),
):
    f_inputs = tokenizer(
        param_prompt, return_tensors="pt", truncation=True, padding=True, max_length=512
    ).to(model.device)
    with torch.no_grad():
        f_outputs = model.generate(
            **f_inputs,
            max_length=200,
            num_beams=5,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    f_completion = tokenizer.decode(f_outputs[0], skip_special_tokens=True)
    return {"Answer": f_completion}
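# Usage sketch (assumption: this file is saved as main.py; the server command
# and example URL below are illustrative, not from the original script):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#   curl "http://localhost:8000/?param_prompt=What%20is%20the%20will%20to%20power%3F"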