import comet_ml
from unsloth import PatchDPOTrainer
from accelerate import Accelerator
from config import SAVED_MODEL

PatchDPOTrainer()  # patch TRL's DPOTrainer with Unsloth's optimisations before it is used

import torch
from transformers import TextStreamer, AutoTokenizer
from datasets import load_dataset
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import DPOConfig, DPOTrainer
from accelerate import init_empty_weights


class MyLlamaModel:
    max_seq_length = 256
    NUM_TRAIN_EPOCHS = 6
    beta = 0.5  # DPO temperature: higher values keep the policy closer to the reference model
    LOAD_IN_4BIT = False
    device_map = "auto"
    save_method = "lora"  # merged_X just means the whole model is saved, not just the transformer
    lora_dropout = 0.
    lora_alpha = 32
    learning_rate = 2e-5
    r = 32
    base_output_dir = (
        f"{SAVED_MODEL}/{max_seq_length}maxSeqLen_{NUM_TRAIN_EPOCHS}Epochs_{device_map}devmap"
        f"_4Bit{LOAD_IN_4BIT}_{save_method}_beta{beta}_loraDropout{lora_dropout}"
        f"_r{r}_lora_alpha{lora_alpha}_lr{learning_rate}/"
    )

    def __init__(self):
        self.model_name = "unsloth/DeepSeek-R1-GGUF"
        self.model_path = f"{self.base_output_dir}/{self.model_name}"

    def get_model_tokenizer(self, model_name: str):
        print(f"Using model {model_name}")
        self.model_name = model_name
        self.model_path = f"{self.base_output_dir}/{model_name}"
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_name,
            # max_seq_length=self.max_seq_length,
            load_in_4bit=self.LOAD_IN_4BIT,  # "You can activate QLoRA by setting load_in_4bit to True" LLMEngineering, p251
            # quantization_config=bnb_config,  # helped with memory but caused non-zero probabilities when demoed
            # device_map=self.device_map,  # try this
            trust_remote_code=True,
        )
        return model, tokenizer

    def train_and_save(self):
        model, tokenizer = self.get_model_tokenizer(self.model_name)
        with init_empty_weights():
            model = FastLanguageModel.get_peft_model(
                model,
                r=self.r,
                lora_alpha=self.lora_alpha,
                lora_dropout=self.lora_dropout,
                target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
            )
        torch.nn.Module.to_empty(model, device=torch.device("cuda"))  # this eliminates 'NotImplementedError: Cannot copy out of meta tensor'

        accelerator = Accelerator(mixed_precision="bf16", cpu=True)  # Enable mixed precision for memory efficiency
        device = accelerator.device
        # model.to(device)
        # optimizer = AdamW(params=model.parameters(), lr=3e-2)

        # Move the model to the appropriate device
        model = accelerator.prepare(model)
        self.do_dpo(model, tokenizer)

    def do_dpo(self, model, tokenizer):
        dataset = self.load_prepared_dataset(tokenizer.eos_token)
        trainer = DPOTrainer(
            model=model,
            ref_model=None,  # with a PEFT model, TRL uses the base weights (adapters disabled) as the reference
            tokenizer=tokenizer,
            beta=self.beta,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            max_length=self.max_seq_length // 2,
            max_prompt_length=self.max_seq_length // 2,
            args=DPOConfig(
                learning_rate=self.learning_rate,
                lr_scheduler_type="linear",
                per_device_train_batch_size=1,
                per_device_eval_batch_size=1,
                gradient_accumulation_steps=8,
                num_train_epochs=self.NUM_TRAIN_EPOCHS,
                fp16=not is_bfloat16_supported(),
                bf16=is_bfloat16_supported(),
                weight_decay=0.01,
                warmup_steps=10,
                output_dir="output",
                eval_strategy="steps",
                eval_steps=0.2,  # a float < 1 is interpreted as a fraction of total training steps
                logging_steps=1,
                report_to="comet_ml",
                seed=0,
            ),
        )
        trainer.train()
        model.save_pretrained_merged(self.model_path, tokenizer=tokenizer, save_method=self.save_method)  # merged_4bit_forced
        generate_text_using(model, tokenizer)

    @staticmethod
    def load_prepared_dataset(eos_token):
        alpaca_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
"""

        def format_samples(example):
            example["prompt"] = alpaca_template.format(example["prompt"])
            example["chosen"] = example["chosen"] + eos_token
            example["rejected"] = example["rejected"] + eos_token
            return {"prompt": example["prompt"], "chosen": example["chosen"], "rejected": example["rejected"]}

        dataset = load_dataset("mlabonne/llmtwin-dpo", split="train")
        dataset = dataset.map(format_samples)
        dataset = dataset.train_test_split(test_size=0.05)
        return dataset


def generate_text_using(model, tokenizer):
    print(f"Model of type {type(model)}, tokenizer of type {type(tokenizer)}")
    # return_tensors can be "pt", "tf", "np", "jax", "mlx"
    inputs = tokenizer(["Who are the creators of the course that is under the 'Decoding ML' umbrella?"], return_tensors="pt").to("cuda")
    text_streamer = TextStreamer(tokenizer)
    FastLanguageModel.for_inference(model)
    _ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=MyLlamaModel.max_seq_length, use_cache=True)


if __name__ == "__main__":
    my_model = MyLlamaModel()
    my_model.train_and_save()
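

# --- Illustrative sketch, not invoked by the script above ---
# One possible way to reload the adapter written by train_and_save() for a quick smoke test.
# Assumptions (not confirmed by the code above): the directory passed in is the same
# `model_path` the trainer saved to, and Unsloth can load that checkpoint directly via
# FastLanguageModel.from_pretrained. The function name is hypothetical; treat this as a
# sketch rather than the project's API.
def load_saved_model_for_inference(saved_path: str):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=saved_path,                  # directory produced by save_pretrained_merged(...)
        load_in_4bit=MyLlamaModel.LOAD_IN_4BIT,
        trust_remote_code=True,
    )
    FastLanguageModel.for_inference(model)      # switch the Unsloth model to inference mode
    return model, tokenizer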