{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.059273894788836747, "eval_steps": 8, "global_step": 30, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001975796492961225, "grad_norm": 7.703702926635742, "learning_rate": 1e-05, "loss": 20.6007, "step": 1 }, { "epoch": 0.001975796492961225, "eval_loss": NaN, "eval_runtime": 161.515, "eval_samples_per_second": 5.281, "eval_steps_per_second": 2.644, "step": 1 }, { "epoch": 0.00395159298592245, "grad_norm": 10.786239624023438, "learning_rate": 2e-05, "loss": 19.4341, "step": 2 }, { "epoch": 0.005927389478883675, "grad_norm": 10.11152172088623, "learning_rate": 3e-05, "loss": 20.713, "step": 3 }, { "epoch": 0.0079031859718449, "grad_norm": 7.585945129394531, "learning_rate": 4e-05, "loss": 18.3887, "step": 4 }, { "epoch": 0.009878982464806126, "grad_norm": 10.86007308959961, "learning_rate": 5e-05, "loss": 19.6414, "step": 5 }, { "epoch": 0.01185477895776735, "grad_norm": 10.720023155212402, "learning_rate": 6e-05, "loss": 22.0141, "step": 6 }, { "epoch": 0.013830575450728575, "grad_norm": 7.739660263061523, "learning_rate": 7e-05, "loss": 22.056, "step": 7 }, { "epoch": 0.0158063719436898, "grad_norm": 5.171518325805664, "learning_rate": 8e-05, "loss": 18.8531, "step": 8 }, { "epoch": 0.0158063719436898, "eval_loss": NaN, "eval_runtime": 146.1491, "eval_samples_per_second": 5.837, "eval_steps_per_second": 2.922, "step": 8 }, { "epoch": 0.017782168436651025, "grad_norm": 5.789409637451172, "learning_rate": 9e-05, "loss": 18.2752, "step": 9 }, { "epoch": 0.01975796492961225, "grad_norm": 7.014954090118408, "learning_rate": 0.0001, "loss": 17.7699, "step": 10 }, { "epoch": 0.021733761422573473, "grad_norm": 8.307316780090332, "learning_rate": 9.938441702975689e-05, "loss": 18.9728, "step": 11 }, { "epoch": 0.0237095579155347, "grad_norm": 5.6757659912109375, "learning_rate": 9.755282581475769e-05, "loss": 16.5401, "step": 12 }, { "epoch": 0.025685354408495925, "grad_norm": 4.361359596252441, "learning_rate": 9.45503262094184e-05, "loss": 18.2953, "step": 13 }, { "epoch": 0.02766115090145715, "grad_norm": 4.344420433044434, "learning_rate": 9.045084971874738e-05, "loss": 18.8302, "step": 14 }, { "epoch": 0.029636947394418373, "grad_norm": 3.632826566696167, "learning_rate": 8.535533905932738e-05, "loss": 15.5949, "step": 15 }, { "epoch": 0.0316127438873796, "grad_norm": 5.199756622314453, "learning_rate": 7.938926261462366e-05, "loss": 16.3128, "step": 16 }, { "epoch": 0.0316127438873796, "eval_loss": NaN, "eval_runtime": 145.9235, "eval_samples_per_second": 5.846, "eval_steps_per_second": 2.926, "step": 16 }, { "epoch": 0.03358854038034083, "grad_norm": 3.910904884338379, "learning_rate": 7.269952498697734e-05, "loss": 15.6483, "step": 17 }, { "epoch": 0.03556433687330205, "grad_norm": 3.871004104614258, "learning_rate": 6.545084971874738e-05, "loss": 17.0178, "step": 18 }, { "epoch": 0.03754013336626327, "grad_norm": 4.427883625030518, "learning_rate": 5.782172325201155e-05, "loss": 18.9645, "step": 19 }, { "epoch": 0.0395159298592245, "grad_norm": 3.5598199367523193, "learning_rate": 5e-05, "loss": 17.7472, "step": 20 }, { "epoch": 0.041491726352185725, "grad_norm": 3.578526020050049, "learning_rate": 4.2178276747988446e-05, "loss": 17.6056, "step": 21 }, { "epoch": 0.04346752284514695, "grad_norm": 2.900790214538574, "learning_rate": 3.4549150281252636e-05, "loss": 17.3251, "step": 22 }, { "epoch": 0.045443319338108176, "grad_norm": 3.5540361404418945, "learning_rate": 2.7300475013022663e-05, "loss": 16.6689, "step": 23 }, { "epoch": 0.0474191158310694, "grad_norm": 2.947385311126709, "learning_rate": 2.061073738537635e-05, "loss": 16.161, "step": 24 }, { "epoch": 0.0474191158310694, "eval_loss": NaN, "eval_runtime": 145.4977, "eval_samples_per_second": 5.863, "eval_steps_per_second": 2.935, "step": 24 }, { "epoch": 0.04939491232403063, "grad_norm": 4.943809509277344, "learning_rate": 1.4644660940672627e-05, "loss": 17.7916, "step": 25 }, { "epoch": 0.05137070881699185, "grad_norm": 5.202208518981934, "learning_rate": 9.549150281252633e-06, "loss": 16.5579, "step": 26 }, { "epoch": 0.05334650530995307, "grad_norm": 2.9827098846435547, "learning_rate": 5.449673790581611e-06, "loss": 16.4835, "step": 27 }, { "epoch": 0.0553223018029143, "grad_norm": 5.730014324188232, "learning_rate": 2.4471741852423237e-06, "loss": 16.5029, "step": 28 }, { "epoch": 0.057298098295875524, "grad_norm": 3.476895570755005, "learning_rate": 6.15582970243117e-07, "loss": 16.4587, "step": 29 }, { "epoch": 0.059273894788836747, "grad_norm": 3.6241424083709717, "learning_rate": 0.0, "loss": 17.1663, "step": 30 } ], "logging_steps": 1, "max_steps": 30, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 8, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.842039240163328e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }