data:
  prompt: mistral_formal
  train: ../data/susgen/FINAL/PER_3500/FINAL_PER3500_30k.json
  val: null
  val_split_ratio: 0.005
device: cuda
instruct_mask: true
local_rank: 0
model:
  acceleration: null
  int4_config:
    bnb_4bit_compute_dtype: bfloat16
    bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant: true
    load_in_4bit: true
    load_in_8bit: false
  int8_config:
    load_in_4bit: false
    load_in_8bit: true
  lora:
    bias: none
    inference_mode: false
    lora_alpha: 32
    lora_dropout: 0.1
    r: 16
    target_modules:
      - q_proj
      - k_proj
      - v_proj
      - o_proj
      - gate_proj
      - up_proj
      - down_proj
      - lm_head
    task_type: CAUSAL_LM
  lora_path: false
  model_path: ../ckpts/Mistral-7B-v0.3
  quantization: int4
  seed: 2024
  show_config: false
  use_lora: true
  window: null
name: 30k-Mistral-7B-v0.3-small
output_dir: ../results/
tokenizer:
  add_bos_token: true
  add_eos_token: false
  add_prefix_space: false
  encode:
    max_length: 2048
    return_tensors: pt
    truncation: true
  model_max_length: 2048
  padding_side: left
  pretrained_model_name_or_path: ../ckpts/Mistral-7B-v0.3
  truncation_side: right
  use_fast: true
trainer: NewTrainer
training:
  bf16: true
  deepspeed: ./configs/ds_configs/ds_config_stage_2.json
  gradient_accumulation_steps: 16
  learning_rate: 2.0e-06
  logging_steps: 1
  lr_scheduler_type: cosine
  max_steps: 201
  optim: paged_adamw_32bit
  per_device_train_batch_size: 16
  remove_unused_columns: false
  report_to: wandb
  resume_from_checkpoint: null
  save_steps: 20
  save_strategy: steps
  warmup_steps: 100
  weight_decay: 0.01
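For reference, below is a minimal sketch of how the `model.int4_config`, `model.lora`, and `tokenizer` sections above map onto the Hugging Face `BitsAndBytesConfig`, `peft.LoraConfig`, and tokenizer APIs. The hyperparameter values and paths are taken directly from the config; the loading flow itself is an assumption for illustration, not the project's actual trainer code.

```python
# Illustrative sketch only: materializing the quantization, LoRA, and
# tokenizer sections of the config with transformers/peft.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# 4-bit NF4 quantization, mirroring `model.int4_config`
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# LoRA adapter, mirroring the `model.lora` block
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj", "lm_head"],
)

# Base model from `model.model_path`, quantized, then wrapped with LoRA
model = AutoModelForCausalLM.from_pretrained(
    "../ckpts/Mistral-7B-v0.3", quantization_config=bnb_config
)
model = get_peft_model(model, lora_config)

# Tokenizer settings from the `tokenizer` block
tokenizer = AutoTokenizer.from_pretrained(
    "../ckpts/Mistral-7B-v0.3",
    padding_side="left",
    truncation_side="right",
    model_max_length=2048,
)
```

Note that with `per_device_train_batch_size: 16` and `gradient_accumulation_steps: 16`, each optimizer step sees 256 samples per device, multiplied by the number of DeepSpeed (stage 2) ranks when training runs on multiple GPUs.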