# Supervised fine-tuning run configuration: a single flat mapping of
# training arguments, one key per line (block style) so that any YAML
# parser reads it as a mapping. Keys are grouped by purpose below; the
# grouping is cosmetic only, since mapping order carries no meaning.

### model
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
trust_remote_code: true
flash_attn: fa2

### method
stage: sft
do_train: true
finetuning_type: full
use_badam: true
badam_mode: layer
badam_switch_mode: ascending
badam_switch_interval: 100
# NOTE(review): presumably only consulted when badam_mode is `ratio`;
# confirm against the trainer before relying on it in `layer` mode.
badam_update_ratio: 0.1
deepspeed: cache/ds_z3_config.json

### dataset
dataset: smoltalk_chinese
dataset_dir: data
template: deepseekr1
cutoff_len: 4096
max_samples: 10000000
packing: true
neat_packing: true
preprocessing_num_workers: 16

### output
output_dir: saves/DeepSeek-R1-Distill-Qwen-1.5B/full/train_2025-01-23-00-42-56
logging_steps: 5
save_steps: 5000
plot_loss: true
# Plain `none` is the string "none" in YAML, not a null value.
report_to: none

### train
per_device_train_batch_size: 4
gradient_accumulation_steps: 4
learning_rate: 0.0003
num_train_epochs: 2.0
lr_scheduler_type: cosine
warmup_steps: 100
max_grad_norm: 2.0
optim: adamw_torch
neftune_noise_alpha: 5
bf16: true
ddp_timeout: 180000000

### eval
val_size: 0.001
per_device_eval_batch_size: 4
eval_strategy: steps
eval_steps: 5000