{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 14.25, "learning_rate": 3.1746031746031746e-06, "loss": 2.4132, "step": 10 }, { "epoch": 0.032, "grad_norm": 11.4375, "learning_rate": 6.349206349206349e-06, "loss": 2.462, "step": 20 }, { "epoch": 0.048, "grad_norm": 11.0, "learning_rate": 9.523809523809525e-06, "loss": 2.3388, "step": 30 }, { "epoch": 0.064, "grad_norm": 8.5625, "learning_rate": 1.2698412698412699e-05, "loss": 2.1813, "step": 40 }, { "epoch": 0.08, "grad_norm": 8.0625, "learning_rate": 1.5873015873015872e-05, "loss": 1.9853, "step": 50 }, { "epoch": 0.096, "grad_norm": 9.75, "learning_rate": 1.904761904761905e-05, "loss": 1.9632, "step": 60 }, { "epoch": 0.112, "grad_norm": 8.3125, "learning_rate": 1.9750889679715305e-05, "loss": 1.7842, "step": 70 }, { "epoch": 0.128, "grad_norm": 7.75, "learning_rate": 1.939501779359431e-05, "loss": 1.7123, "step": 80 }, { "epoch": 0.144, "grad_norm": 7.5625, "learning_rate": 1.903914590747331e-05, "loss": 1.6547, "step": 90 }, { "epoch": 0.16, "grad_norm": 6.3125, "learning_rate": 1.8683274021352315e-05, "loss": 1.5779, "step": 100 }, { "epoch": 0.176, "grad_norm": 6.46875, "learning_rate": 1.832740213523132e-05, "loss": 1.5657, "step": 110 }, { "epoch": 0.192, "grad_norm": 5.9375, "learning_rate": 1.7971530249110324e-05, "loss": 1.55, "step": 120 }, { "epoch": 0.208, "grad_norm": 6.28125, "learning_rate": 1.7615658362989325e-05, "loss": 1.504, "step": 130 }, { "epoch": 0.224, "grad_norm": 6.65625, "learning_rate": 1.725978647686833e-05, "loss": 1.5066, "step": 140 }, { "epoch": 0.24, "grad_norm": 6.09375, "learning_rate": 1.690391459074733e-05, "loss": 1.5028, "step": 150 }, { "epoch": 0.256, "grad_norm": 6.59375, "learning_rate": 1.6548042704626336e-05, "loss": 1.4306, "step": 160 }, { "epoch": 0.272, "grad_norm": 7.875, "learning_rate": 1.619217081850534e-05, "loss": 1.4841, "step": 170 }, { "epoch": 0.288, "grad_norm": 6.8125, "learning_rate": 1.583629893238434e-05, "loss": 1.4241, "step": 180 }, { "epoch": 0.304, "grad_norm": 6.4375, "learning_rate": 1.5480427046263346e-05, "loss": 1.4209, "step": 190 }, { "epoch": 0.32, "grad_norm": 7.0625, "learning_rate": 1.5124555160142349e-05, "loss": 1.4404, "step": 200 }, { "epoch": 0.336, "grad_norm": 6.0625, "learning_rate": 1.4768683274021354e-05, "loss": 1.382, "step": 210 }, { "epoch": 0.352, "grad_norm": 6.09375, "learning_rate": 1.4412811387900356e-05, "loss": 1.4079, "step": 220 }, { "epoch": 0.368, "grad_norm": 6.0, "learning_rate": 1.4056939501779361e-05, "loss": 1.3182, "step": 230 }, { "epoch": 0.384, "grad_norm": 6.0, "learning_rate": 1.3701067615658364e-05, "loss": 1.3337, "step": 240 }, { "epoch": 0.4, "grad_norm": 6.0, "learning_rate": 1.3345195729537369e-05, "loss": 1.3165, "step": 250 }, { "epoch": 0.416, "grad_norm": 7.8125, "learning_rate": 1.298932384341637e-05, "loss": 1.2977, "step": 260 }, { "epoch": 0.432, "grad_norm": 6.5625, "learning_rate": 1.2633451957295374e-05, "loss": 1.3135, "step": 270 }, { "epoch": 0.448, "grad_norm": 7.65625, "learning_rate": 1.2277580071174377e-05, "loss": 1.281, "step": 280 }, { "epoch": 0.464, "grad_norm": 6.125, "learning_rate": 1.1921708185053382e-05, "loss": 1.3239, "step": 290 }, { "epoch": 0.48, "grad_norm": 6.71875, "learning_rate": 1.1565836298932385e-05, "loss": 1.287, "step": 300 }, { "epoch": 0.496, "grad_norm": 6.0, "learning_rate": 1.120996441281139e-05, "loss": 1.3248, "step": 310 }, { "epoch": 0.512, "grad_norm": 6.71875, "learning_rate": 1.0854092526690392e-05, "loss": 1.3225, "step": 320 }, { "epoch": 0.528, "grad_norm": 6.375, "learning_rate": 1.0498220640569397e-05, "loss": 1.3121, "step": 330 }, { "epoch": 0.544, "grad_norm": 6.0625, "learning_rate": 1.01423487544484e-05, "loss": 1.2764, "step": 340 }, { "epoch": 0.56, "grad_norm": 6.96875, "learning_rate": 9.786476868327403e-06, "loss": 1.2815, "step": 350 }, { "epoch": 0.576, "grad_norm": 6.28125, "learning_rate": 9.430604982206405e-06, "loss": 1.2981, "step": 360 }, { "epoch": 0.592, "grad_norm": 6.09375, "learning_rate": 9.07473309608541e-06, "loss": 1.2226, "step": 370 }, { "epoch": 0.608, "grad_norm": 7.125, "learning_rate": 8.718861209964413e-06, "loss": 1.2702, "step": 380 }, { "epoch": 0.624, "grad_norm": 6.09375, "learning_rate": 8.362989323843418e-06, "loss": 1.1996, "step": 390 }, { "epoch": 0.64, "grad_norm": 6.21875, "learning_rate": 8.00711743772242e-06, "loss": 1.2685, "step": 400 }, { "epoch": 0.656, "grad_norm": 6.84375, "learning_rate": 7.651245551601423e-06, "loss": 1.3484, "step": 410 }, { "epoch": 0.672, "grad_norm": 6.84375, "learning_rate": 7.295373665480427e-06, "loss": 1.2453, "step": 420 }, { "epoch": 0.688, "grad_norm": 5.875, "learning_rate": 6.939501779359431e-06, "loss": 1.2525, "step": 430 }, { "epoch": 0.704, "grad_norm": 6.65625, "learning_rate": 6.5836298932384346e-06, "loss": 1.1891, "step": 440 }, { "epoch": 0.72, "grad_norm": 6.53125, "learning_rate": 6.227758007117438e-06, "loss": 1.2031, "step": 450 }, { "epoch": 0.736, "grad_norm": 7.03125, "learning_rate": 5.871886120996442e-06, "loss": 1.2297, "step": 460 }, { "epoch": 0.752, "grad_norm": 6.53125, "learning_rate": 5.516014234875445e-06, "loss": 1.1854, "step": 470 }, { "epoch": 0.768, "grad_norm": 6.28125, "learning_rate": 5.160142348754449e-06, "loss": 1.277, "step": 480 }, { "epoch": 0.784, "grad_norm": 5.96875, "learning_rate": 4.8042704626334524e-06, "loss": 1.1824, "step": 490 }, { "epoch": 0.8, "grad_norm": 7.625, "learning_rate": 4.448398576512456e-06, "loss": 1.2103, "step": 500 }, { "epoch": 0.816, "grad_norm": 7.5, "learning_rate": 4.09252669039146e-06, "loss": 1.1943, "step": 510 }, { "epoch": 0.832, "grad_norm": 6.78125, "learning_rate": 3.7366548042704632e-06, "loss": 1.2306, "step": 520 }, { "epoch": 0.848, "grad_norm": 7.0625, "learning_rate": 3.3807829181494666e-06, "loss": 1.2586, "step": 530 }, { "epoch": 0.864, "grad_norm": 6.59375, "learning_rate": 3.0249110320284703e-06, "loss": 1.2192, "step": 540 }, { "epoch": 0.88, "grad_norm": 6.03125, "learning_rate": 2.669039145907473e-06, "loss": 1.207, "step": 550 }, { "epoch": 0.896, "grad_norm": 5.875, "learning_rate": 2.313167259786477e-06, "loss": 1.2041, "step": 560 }, { "epoch": 0.912, "grad_norm": 6.5625, "learning_rate": 1.9572953736654807e-06, "loss": 1.243, "step": 570 }, { "epoch": 0.928, "grad_norm": 5.75, "learning_rate": 1.6014234875444842e-06, "loss": 1.2168, "step": 580 }, { "epoch": 0.944, "grad_norm": 7.65625, "learning_rate": 1.2455516014234877e-06, "loss": 1.2393, "step": 590 }, { "epoch": 0.96, "grad_norm": 6.03125, "learning_rate": 8.896797153024913e-07, "loss": 1.2151, "step": 600 }, { "epoch": 0.976, "grad_norm": 8.125, "learning_rate": 5.338078291814947e-07, "loss": 1.2361, "step": 610 }, { "epoch": 0.992, "grad_norm": 6.96875, "learning_rate": 1.7793594306049826e-07, "loss": 1.2307, "step": 620 }, { "epoch": 1.0, "step": 625, "total_flos": 7089149865623552.0, "train_loss": 1.4169092582702636, "train_runtime": 351.5262, "train_samples_per_second": 28.447, "train_steps_per_second": 1.778 } ], "logging_steps": 10, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7089149865623552.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }