In [1]:
from huggingface_hub import login
from dotenv import load_dotenv
import os
# Read variables from a local .env file into the process environment.
load_dotenv()

# Authenticate with the Hugging Face Hub using the token taken from the environment.
login(token=os.environ.get("HUGGINGFACE_TOKEN"))

# Dataset
Modify the dataset to fit the Gemma 2 prompt format

In [2]:
from datasets import load_dataset
# Hub identifier of the dataset used for fine-tuning.
dataset_name = "nbertagnolli/counsel-chat"

# Load only the "train" split; downloads are cached under the local .cache/ directory.
dataset = load_dataset(
    path=dataset_name,
    split="train",
    cache_dir=".cache/",
)

# Show one raw record, followed by the dataset summary.
print(dataset[0])
print(f"\n {dataset}")

Repo card metadata block was not found. Setting CardData to empty.


{'questionID': 0, 'questionTitle': 'Do I have too many issues for counseling?', 'questionText': 'I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.\n   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?', 'questionLink': 'https://counselchat.com/questions/do-i-have-too-many-issues-for-counseling', 'topic': 'depression', 'therapistInfo': 'Jennifer MolinariHypnotherapist & Licensed Counselor', 'therapistURL': 'https://counselchat.com/therapists/jennifer-molinari', 'answerText': 'It is very common for\xa0people to have multiple issues that they want to (and need to) address in counseling.\xa0 I have had clients ask that same question and through more exploration, there is often an underlying fear that they\xa0 "can\'t be h

In [3]:
# Prompt template with two positional slots: the user's question first, the response second.
# NOTE(review): the layout uses "### System / ### User / ### Response" headers rather than
# chat-template turn markers -- confirm this is the intended format for the target model.
gemma_prompt = """ 
### System:
You are a Therapist Assistant, an LLM fine-tuned on Gemma 2 model by Google.
You provide safe and responsible support to users while encouraging them to visit a mental health professional if needed. 
You are committed to promoting wellness, understanding, and support. Your responses should be clear, concise, and evidence-based, while maintaining a friendly and approachable tone.

### User:
{}

### Response:
{}
"""


def format_prompts_func(example):
    """Render a batch of question/answer pairs through ``gemma_prompt``.

    Args:
        example: Batched mapping holding parallel sequences under the
            "questionText" and "answerText" keys.

    Returns:
        A dict with a single "text" key: one filled-in prompt per pair, in
        input order. Pairing stops at the shorter of the two sequences.
    """
    pairs = zip(example["questionText"], example["answerText"])
    return {"text": [gemma_prompt.format(question, answer) for question, answer in pairs]}
# Run the formatter over the dataset in batches; its return value supplies the "text" column.
formatted_dataset = dataset.map(format_prompts_func, batched=True)

# Display the rendered prompt of the first record.
print(formatted_dataset[0]["text"])


 
### System:
You are a Therapist Assistant, an LLM fine-tuned on Gemma 2 model by Google.
You provide safe and responsible support to users while encouraging them to visit a mental health professional if needed. 
You are committed to promoting wellness, understanding, and support. Your responses should be clear, concise, and evidence-based, while maintaining a friendly and approachable tone.

### User:
I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.
   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?

### Response:
It is very common for people to have multiple issues that they want to (and need to) address in counseling.  I have had clients ask that same question and through more exploration, there is often an und

In [4]:
# Hold out 20% of the formatted examples for evaluation; the fixed seed keeps the split repeatable.
# Note: this rebinds `dataset` from the single loaded split to the train/test mapping.
dataset = formatted_dataset.train_test_split(seed=42, test_size=0.2)
print(*(dataset[split_name].shape for split_name in ("train", "test")))

(2220, 11) (555, 11)


# Fine-tuning hyperparameters

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer


In [6]:
import yaml
# Load the fine-tuning configuration into the `hyperparams` dict used by the cells below.
# - safe_load limits parsing to plain YAML types (no Python object construction),
#   which is all a hyperparameter file needs.
# - The explicit UTF-8 encoding avoids depending on the platform's default codec.
with open("hyperparams.yaml", "r", encoding="utf-8") as file:
    hyperparams = yaml.safe_load(file)

In [7]:
# Resolve the dtype name stored in the config to the torch attribute of the same name.
compute_dtype = getattr(torch, hyperparams['bnb_4bit_compute_dtype'])

# Quantization settings passed to the model loader; every value comes from `hyperparams`.
# The parenthesised values are the original author's notes on what the config holds.
bnb_config = BitsAndBytesConfig(
    # Activates 4-bit precision loading.
    load_in_4bit=hyperparams['use_4bit'],
    # Nested (double) quantization toggle (False).
    bnb_4bit_use_double_quant=hyperparams['use_nested_quant'],
    # Quantization type (nf4).
    bnb_4bit_quant_type=hyperparams['bnb_4bit_quant_type'],
    # Compute dtype resolved above (float16).
    bnb_4bit_compute_dtype=compute_dtype,
)

In [8]:
# Decide whether to train in bfloat16.
# The check only runs when 4-bit loading is enabled and the compute dtype is float16;
# in every other case hyperparams['bf16'] keeps the value loaded from the config file.
if compute_dtype == torch.float16 and hyperparams['use_4bit']:
    # Take the major compute capability by index. Unpacking the minor version into `_`
    # at cell top level would shadow IPython's last-output variable for the session.
    major = torch.cuda.get_device_capability()[0]
    # bf16 is switched on for compute capability 8.x and above, off otherwise.
    hyperparams['bf16'] = major >= 8
    if hyperparams['bf16']:
        print("Setting BF16 to True")

Setting BF16 to True


In [9]:
# Load the base model named in the config with the quantization settings built earlier.
model = AutoModelForCausalLM.from_pretrained(
    hyperparams['model_name'],
    cache_dir=".cache/",
    device_map=hyperparams['device_map'],
    quantization_config=bnb_config,
    token=os.getenv("HUGGINGFACE_TOKEN"),
)
# Training-time config tweaks; use_cache is switched back on after training.
model.config.pretraining_tp = 1
model.config.use_cache = False

# Tokenizer for the same checkpoint.
# NOTE(review): trust_remote_code=True allows code shipped with the repo to run --
# confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(
    hyperparams['model_name'],
    cache_dir=".cache/",
    token=os.getenv("HUGGINGFACE_TOKEN"),
    trust_remote_code=True,
)
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=hyperparams['lora_alpha'],
    lora_dropout=hyperparams['lora_dropout'],
    r=hyperparams['lora_r'],
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj"]
)

In [11]:
import wandb
import time
# Authenticate with Weights & Biases using the key from the environment.
wandb.login(key=os.getenv("WANDB_API_KEY"))

# Build the run name before creating the W&B run so the dashboard entry, the output
# directory and the Trainer all share one identifier. A run created here without
# `name` would not pick up the run_name later handed to TrainingArguments.
run_name = f"{hyperparams['model_name']}--health-bot-{int(time.time())}"
run = wandb.init(
    project='TADBot',
    job_type="training",
    anonymous="allow",
    name=run_name,
)

# Set training parameters: every tunable value is read from `hyperparams`;
# output and log directories are derived from the run name.
training_arguments = TrainingArguments(
    output_dir=f"./outputs/{run_name}",
    per_device_train_batch_size=hyperparams["per_device_train_batch_size"],
    per_device_eval_batch_size=hyperparams["per_device_eval_batch_size"],
    gradient_accumulation_steps=hyperparams["gradient_accumulation_steps"],
    optim=hyperparams["optimizer"],
    num_train_epochs=hyperparams["num_train_epochs"],
    eval_steps=hyperparams["eval_steps"],
    eval_strategy=hyperparams["eval_strategy"],
    save_steps=hyperparams["save_steps"],
    logging_steps=hyperparams["logging_steps"],
    logging_strategy=hyperparams["logging_strategy"],
    warmup_steps=hyperparams["warmup_steps"],
    # Cast explicitly: the configured value is converted to float before use.
    learning_rate=float(hyperparams["learning_rate"]),
    weight_decay=hyperparams["weight_decay"],
    fp16=hyperparams["fp16"],
    bf16=hyperparams["bf16"],
    max_grad_norm=hyperparams["max_grad_norm"],
    max_steps=hyperparams["max_steps"],
    group_by_length=hyperparams["group_by_length"],
    lr_scheduler_type=hyperparams["lr_scheduler_type"],
    logging_dir=f"./outputs/{run_name}/logs",
    report_to="wandb",
    run_name=run_name
)
# Last expression of the cell: displays the resolved arguments.
training_arguments

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkausikremella[0m ([33mkausikremella-vit-ap[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Nitin Kausik Remella\_netrc


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=0.2,
eval_strategy=IntervalStrategy.STEPS,
eval_us

In [12]:
# Supervised fine-tuning trainer: combines the loaded model, the LoRA config and the
# train/test splits, reading each example from the pre-rendered "text" column.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    peft_config=peft_config,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset['test'],
    dataset_text_field="text",
    max_seq_length=hyperparams["max_seq_length"],
    packing=hyperparams["packing"],
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


# Fine-tuning the model

In [13]:
# Make sure the use_cache flag is off on the model config before training starts.
model.config.use_cache = False
# Run training; as the last expression of the cell, the returned value is displayed.
trainer.train()

  0%|          | 0/1544 [00:00<?, ?it/s]

{'loss': 2.4221, 'grad_norm': 0.682584822177887, 'learning_rate': 0.0002, 'epoch': 0.03}
{'loss': 1.9163, 'grad_norm': 0.5597965121269226, 'learning_rate': 0.0002, 'epoch': 0.06}
{'loss': 1.9249, 'grad_norm': 0.5598402619361877, 'learning_rate': 0.0002, 'epoch': 0.1}
{'loss': 1.9756, 'grad_norm': 0.6536526679992676, 'learning_rate': 0.0002, 'epoch': 0.13}
{'loss': 1.9548, 'grad_norm': 0.608141303062439, 'learning_rate': 0.0002, 'epoch': 0.16}
{'loss': 1.8867, 'grad_norm': 0.4548989534378052, 'learning_rate': 0.0002, 'epoch': 0.19}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_loss': 1.902209997177124, 'eval_runtime': 305.3236, 'eval_samples_per_second': 5.021, 'eval_steps_per_second': 2.512, 'epoch': 0.2}
{'loss': 1.9035, 'grad_norm': 0.43129104375839233, 'learning_rate': 0.0002, 'epoch': 0.23}
{'loss': 1.8868, 'grad_norm': 0.49856260418891907, 'learning_rate': 0.0002, 'epoch': 0.26}
{'loss': 1.7944, 'grad_norm': 0.4600728750228882, 'learning_rate': 0.0002, 'epoch': 0.29}
{'loss': 1.8076, 'grad_norm': 0.5697025656700134, 'learning_rate': 0.0002, 'epoch': 0.32}
{'loss': 1.8321, 'grad_norm': 0.7373968958854675, 'learning_rate': 0.0002, 'epoch': 0.36}
{'loss': 1.9213, 'grad_norm': 0.5277324318885803, 'learning_rate': 0.0002, 'epoch': 0.39}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_loss': 1.852689266204834, 'eval_runtime': 71.3284, 'eval_samples_per_second': 21.492, 'eval_steps_per_second': 10.753, 'epoch': 0.4}
{'loss': 1.8277, 'grad_norm': 0.5442835688591003, 'learning_rate': 0.0002, 'epoch': 0.42}
{'loss': 1.7947, 'grad_norm': 0.4261704981327057, 'learning_rate': 0.0002, 'epoch': 0.45}
{'loss': 1.8975, 'grad_norm': 0.43769732117652893, 'learning_rate': 0.0002, 'epoch': 0.49}
{'loss': 1.8065, 'grad_norm': 0.6723660230636597, 'learning_rate': 0.0002, 'epoch': 0.52}
{'loss': 1.6969, 'grad_norm': 0.7517312169075012, 'learning_rate': 0.0002, 'epoch': 0.55}
{'loss': 1.7825, 'grad_norm': 0.5381327867507935, 'learning_rate': 0.0002, 'epoch': 0.58}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_loss': 1.81912362575531, 'eval_runtime': 71.971, 'eval_samples_per_second': 21.3, 'eval_steps_per_second': 10.657, 'epoch': 0.6}
{'loss': 1.7915, 'grad_norm': 0.6141555309295654, 'learning_rate': 0.0002, 'epoch': 0.62}
{'loss': 1.7635, 'grad_norm': 0.5057688355445862, 'learning_rate': 0.0002, 'epoch': 0.65}
{'loss': 1.728, 'grad_norm': 0.49006038904190063, 'learning_rate': 0.0002, 'epoch': 0.68}
{'loss': 1.8424, 'grad_norm': 0.4901270866394043, 'learning_rate': 0.0002, 'epoch': 0.71}
{'loss': 1.8308, 'grad_norm': 0.6117296814918518, 'learning_rate': 0.0002, 'epoch': 0.74}
{'loss': 1.8729, 'grad_norm': 0.5475451946258545, 'learning_rate': 0.0002, 'epoch': 0.78}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_loss': 1.786774754524231, 'eval_runtime': 71.1209, 'eval_samples_per_second': 21.555, 'eval_steps_per_second': 10.784, 'epoch': 0.8}
{'loss': 1.6851, 'grad_norm': 0.4951877295970917, 'learning_rate': 0.0002, 'epoch': 0.81}
{'loss': 1.7613, 'grad_norm': 1.3179290294647217, 'learning_rate': 0.0002, 'epoch': 0.84}
{'loss': 1.8753, 'grad_norm': 0.45116502046585083, 'learning_rate': 0.0002, 'epoch': 0.87}
{'loss': 1.7441, 'grad_norm': 0.550654411315918, 'learning_rate': 0.0002, 'epoch': 0.91}
{'loss': 1.8054, 'grad_norm': 0.4832320511341095, 'learning_rate': 0.0002, 'epoch': 0.94}
{'loss': 1.7869, 'grad_norm': 0.5937925577163696, 'learning_rate': 0.0002, 'epoch': 0.97}
{'train_runtime': 1964.5956, 'train_samples_per_second': 3.145, 'train_steps_per_second': 0.786, 'train_loss': 1.846395028069847, 'epoch': 1.0}


TrainOutput(global_step=1544, training_loss=1.846395028069847, metrics={'train_runtime': 1964.5956, 'train_samples_per_second': 3.145, 'train_steps_per_second': 0.786, 'total_flos': 9905705513385984.0, 'train_loss': 1.846395028069847, 'epoch': 0.9996762706377469})

In [14]:
# Close the active Weights & Biases run.
wandb.finish()
# Turn the use_cache flag back on now that training is done.
model.config.use_cache = True
# Save the trained model under the name/path given by hyperparams["new_model_name"].
# NOTE(review): the tokenizer is not saved here -- confirm whether downstream loading needs it.
trainer.model.save_pretrained(hyperparams["new_model_name"])

VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▅▃▁
eval/runtime,█▁▁▁
eval/samples_per_second,▁███
eval/steps_per_second,▁███
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train/grad_norm,▃▂▂▃▂▁▁▂▁▂▃▂▂▁▁▃▄▂▂▂▂▂▂▂▂█▁▂▁▂
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▃▃▄▄▃▃▃▂▂▂▃▂▂▃▂▁▂▂▂▁▂▂▃▁▂▃▂▂▂

0,1
eval/loss,1.78677
eval/runtime,71.1209
eval/samples_per_second,21.555
eval/steps_per_second,10.784
total_flos,9905705513385984.0
train/epoch,0.99968
train/global_step,1544.0
train/grad_norm,0.59379
train/learning_rate,0.0002
train/loss,1.7869


# NOTE(review): the TrainingArguments cell sets logging_dir to "./outputs/<run_name>/logs"
# and report_to="wandb"; this logdir does not match either -- confirm that TensorBoard
# event files actually exist at this path (and that the tensorboard extension is loaded).
%tensorboard  --logdir Gemma2_2B\\results\\runs