In [1]:
from huggingface_hub import login
from dotenv import load_dotenv
import os
# Read variables from a .env file (if present) into the process environment.
load_dotenv()

# Authenticate with the Hugging Face Hub using the token found in the environment.
login(token=os.environ.get("HUGGINGFACE_TOKEN"))

In [10]:
from datasets import load_dataset
# Hub dataset repository used as the training data.
dataset_name = "ai-bites/databricks-mini"

# Take only rows 0-999 of the train split; downloaded files are cached under .cache/.
dataset = load_dataset(
    path=dataset_name,
    split="train[0:1000]",
    cache_dir=".cache/",
)

# Display the dataset summary (features and row count) as the cell output.
dataset

README.md:   0%|          | 0.00/288 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


dolly-mini-train.jsonl:   0%|          | 0.00/5.24M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10544 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [11]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [30]:
import yaml
# Load the training configuration from hyperparams.yaml.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the platform's default text encoding.
with open("hyperparams.yaml", "r", encoding="utf-8") as file:
    # NOTE(review): FullLoader can build more than plain scalars and collections.
    # If this file only ever holds plain values, yaml.safe_load(file) is the
    # safer choice -- confirm before switching.
    hyperparams = yaml.load(file, Loader=yaml.FullLoader)

# An empty or non-mapping YAML document would otherwise only surface later as a
# confusing subscript error on the first hyperparams[...] lookup.
if not isinstance(hyperparams, dict):
    raise ValueError(
        "hyperparams.yaml must contain a top-level mapping of hyperparameters, "
        f"got {type(hyperparams).__name__}"
    )

In [31]:
# Resolve the configured dtype name (a string) to the torch attribute of the same name.
compute_dtype = getattr(torch, hyperparams['bnb_4bit_compute_dtype'])

# Quantization settings passed to the model loader; every value is read from hyperparams.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,  # dtype resolved above
    bnb_4bit_quant_type=hyperparams['bnb_4bit_quant_type'],  # quantization type name
    bnb_4bit_use_double_quant=hyperparams['use_nested_quant'],  # nested (double) quantization toggle
    load_in_4bit=hyperparams['use_4bit'],  # toggle for 4-bit loading
)

In [32]:
# Decide whether bf16 should be enabled for training.
# The check only runs when the compute dtype is float16 and 4-bit loading is on;
# in every other case hyperparams['bf16'] keeps whatever value it already had.
if compute_dtype == torch.float16 and hyperparams['use_4bit']:
    # First element of the capability tuple is the major version.
    gpu_major = torch.cuda.get_device_capability()[0]
    bf16_supported = gpu_major >= 8
    if bf16_supported:
        print("Setting BF16 to True")
    hyperparams['bf16'] = bf16_supported

Setting BF16 to True


In [33]:
# Load the base model named in hyperparams, applying the quantization settings
# from bnb_config and caching downloaded files under .cache/.
model = AutoModelForCausalLM.from_pretrained(
    hyperparams['model_name'],
    cache_dir=".cache/",
    device_map=hyperparams['device_map'],
    quantization_config=bnb_config,
    token=os.environ.get("HUGGINGFACE_TOKEN"),
)
# Turn off use_cache and pin pretraining_tp to 1 on the model config.
model.config.pretraining_tp = 1
model.config.use_cache = False

# Load the tokenizer that belongs to the same model.
tokenizer = AutoTokenizer.from_pretrained(
    hyperparams['model_name'],
    cache_dir=".cache/",
    token=os.environ.get("HUGGINGFACE_TOKEN"),
    trust_remote_code=True,  # NOTE(review): permits code shipped with the model repo to run locally
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right (original author's note: fixes an overflow issue seen with fp16 training).
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [34]:
# LoRA adapter settings: rank, alpha and dropout are read from hyperparams,
# while the bias mode, task type and targeted modules are fixed here.
peft_config = LoraConfig(
    r=hyperparams['lora_r'],
    lora_alpha=hyperparams['lora_alpha'],
    lora_dropout=hyperparams['lora_dropout'],
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
    ],
    task_type="CAUSAL_LM",
    bias="none",
)

In [39]:
# Build the training arguments from the hyperparams mapping.
# These keys are named identically in hyperparams and in TrainingArguments,
# so they are forwarded unchanged.
passthrough_keys = (
    'output_dir',
    'num_train_epochs',
    'per_device_train_batch_size',
    'gradient_accumulation_steps',
    'save_steps',
    'logging_steps',
    'weight_decay',
    'fp16',
    'bf16',
    'max_grad_norm',
    'max_steps',
    'warmup_ratio',
    'group_by_length',
    'lr_scheduler_type',
)
training_kwargs = {key: hyperparams[key] for key in passthrough_keys}

# The optimizer is stored under a different key name than the keyword it feeds.
training_kwargs['optim'] = hyperparams['optimizer']
# Explicit float conversion -- presumably because the YAML value may load as a string; verify.
training_kwargs['learning_rate'] = float(hyperparams['learning_rate'])
# Metrics are reported to TensorBoard.
training_kwargs['report_to'] = "tensorboard"

training_arguments = TrainingArguments(**training_kwargs)

# Display the fully resolved arguments as the cell output.
training_arguments

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
eval_use

In [40]:
# Assemble the supervised fine-tuning trainer from the model, tokenizer, LoRA
# config, training arguments and dataset prepared above.
# No formatting_func is supplied; the trainer is pointed at the dataset's "text" field.
# NOTE(review): the recorded output of this cell warns that some of these arguments
# are deprecated on SFTTrainer and should be set through SFTConfig instead.
trainer = SFTTrainer(
    # Core objects
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    # Data handling
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=hyperparams['max_seq_length'],
    packing=hyperparams['packing'],
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [None]:
# Run the training loop.
trainer.train()

# Save the trained model under the name configured in hyperparams.
save_path = hyperparams['new_model_name']
trainer.model.save_pretrained(save_path)

  0%|          | 0/1340 [00:00<?, ?it/s]

{'loss': 3.8879, 'grad_norm': 18.030195236206055, 'learning_rate': 0.0002, 'epoch': 0.02}
{'loss': 2.9569, 'grad_norm': 9.667036056518555, 'learning_rate': 0.0002, 'epoch': 0.04}
{'loss': 2.6361, 'grad_norm': 9.089476585388184, 'learning_rate': 0.0002, 'epoch': 0.06}
{'loss': 2.9523, 'grad_norm': 6.053662300109863, 'learning_rate': 0.0002, 'epoch': 0.07}
{'loss': 2.8543, 'grad_norm': 7.764152526855469, 'learning_rate': 0.0002, 'epoch': 0.09}
{'loss': 2.8802, 'grad_norm': 6.539248466491699, 'learning_rate': 0.0002, 'epoch': 0.11}
{'loss': 2.7047, 'grad_norm': 5.485109329223633, 'learning_rate': 0.0002, 'epoch': 0.13}
{'loss': 2.6576, 'grad_norm': 9.22624397277832, 'learning_rate': 0.0002, 'epoch': 0.15}
{'loss': 2.7756, 'grad_norm': 6.477100372314453, 'learning_rate': 0.0002, 'epoch': 0.17}
{'loss': 2.7012, 'grad_norm': 5.891603946685791, 'learning_rate': 0.0002, 'epoch': 0.19}
{'loss': 2.5026, 'grad_norm': 5.75968599319458, 'learning_rate': 0.0002, 'epoch': 0.21}
{'loss': 2.8085, 'grad