{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05564924114671164, "eval_steps": 9, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005621135469364812, "eval_loss": 0.9150053262710571, "eval_runtime": 163.2125, "eval_samples_per_second": 18.363, "eval_steps_per_second": 2.298, "step": 1 }, { "epoch": 0.0016863406408094434, "grad_norm": 0.2956816852092743, "learning_rate": 3e-05, "loss": 0.9448, "step": 3 }, { "epoch": 0.003372681281618887, "grad_norm": 0.2726631164550781, "learning_rate": 6e-05, "loss": 0.9084, "step": 6 }, { "epoch": 0.00505902192242833, "grad_norm": 0.21325236558914185, "learning_rate": 9e-05, "loss": 0.8568, "step": 9 }, { "epoch": 0.00505902192242833, "eval_loss": 0.8506203293800354, "eval_runtime": 164.5381, "eval_samples_per_second": 18.215, "eval_steps_per_second": 2.279, "step": 9 }, { "epoch": 0.006745362563237774, "grad_norm": 0.2024015486240387, "learning_rate": 9.987820251299122e-05, "loss": 0.841, "step": 12 }, { "epoch": 0.008431703204047217, "grad_norm": 0.21213184297084808, "learning_rate": 9.924038765061042e-05, "loss": 0.8612, "step": 15 }, { "epoch": 0.01011804384485666, "grad_norm": 0.18286366760730743, "learning_rate": 9.806308479691595e-05, "loss": 0.7799, "step": 18 }, { "epoch": 0.01011804384485666, "eval_loss": 0.7897757887840271, "eval_runtime": 164.6058, "eval_samples_per_second": 18.207, "eval_steps_per_second": 2.278, "step": 18 }, { "epoch": 0.011804384485666104, "grad_norm": 0.1957426816225052, "learning_rate": 9.635919272833938e-05, "loss": 0.7759, "step": 21 }, { "epoch": 0.013490725126475547, "grad_norm": 0.1535039246082306, "learning_rate": 9.414737964294636e-05, "loss": 0.7502, "step": 24 }, { "epoch": 0.01517706576728499, "grad_norm": 0.15373417735099792, "learning_rate": 9.145187862775209e-05, "loss": 0.7894, "step": 27 }, { "epoch": 0.01517706576728499, "eval_loss": 0.7679686546325684, "eval_runtime": 164.638, "eval_samples_per_second": 18.204, "eval_steps_per_second": 2.278, "step": 27 }, { "epoch": 0.016863406408094434, "grad_norm": 0.1481175273656845, "learning_rate": 8.83022221559489e-05, "loss": 0.7242, "step": 30 }, { "epoch": 0.01854974704890388, "grad_norm": 0.14273004233837128, "learning_rate": 8.473291852294987e-05, "loss": 0.697, "step": 33 }, { "epoch": 0.02023608768971332, "grad_norm": 0.1542915254831314, "learning_rate": 8.07830737662829e-05, "loss": 0.7494, "step": 36 }, { "epoch": 0.02023608768971332, "eval_loss": 0.7546795606613159, "eval_runtime": 164.7817, "eval_samples_per_second": 18.188, "eval_steps_per_second": 2.276, "step": 36 }, { "epoch": 0.021922428330522766, "grad_norm": 0.1566758006811142, "learning_rate": 7.649596321166024e-05, "loss": 0.7573, "step": 39 }, { "epoch": 0.023608768971332208, "grad_norm": 0.16109135746955872, "learning_rate": 7.191855733945387e-05, "loss": 0.7655, "step": 42 }, { "epoch": 0.025295109612141653, "grad_norm": 0.15883490443229675, "learning_rate": 6.710100716628344e-05, "loss": 0.765, "step": 45 }, { "epoch": 0.025295109612141653, "eval_loss": 0.7461237907409668, "eval_runtime": 164.7581, "eval_samples_per_second": 18.19, "eval_steps_per_second": 2.276, "step": 45 }, { "epoch": 0.026981450252951095, "grad_norm": 0.15612190961837769, "learning_rate": 6.209609477998338e-05, "loss": 0.7528, "step": 48 }, { "epoch": 0.02866779089376054, "grad_norm": 0.15889355540275574, "learning_rate": 5.695865504800327e-05, "loss": 0.7408, "step": 51 }, { "epoch": 0.03035413153456998, "grad_norm": 0.16103751957416534, "learning_rate": 5.174497483512506e-05, "loss": 0.7259, "step": 54 }, { "epoch": 0.03035413153456998, "eval_loss": 0.7405259013175964, "eval_runtime": 164.8345, "eval_samples_per_second": 18.182, "eval_steps_per_second": 2.275, "step": 54 }, { "epoch": 0.03204047217537943, "grad_norm": 0.16598886251449585, "learning_rate": 4.6512176312793736e-05, "loss": 0.7398, "step": 57 }, { "epoch": 0.03372681281618887, "grad_norm": 0.16655276715755463, "learning_rate": 4.131759111665349e-05, "loss": 0.7534, "step": 60 }, { "epoch": 0.03541315345699832, "grad_norm": 0.15536905825138092, "learning_rate": 3.6218132209150045e-05, "loss": 0.7346, "step": 63 }, { "epoch": 0.03541315345699832, "eval_loss": 0.7366407513618469, "eval_runtime": 164.7935, "eval_samples_per_second": 18.186, "eval_steps_per_second": 2.276, "step": 63 }, { "epoch": 0.03709949409780776, "grad_norm": 0.16439783573150635, "learning_rate": 3.12696703292044e-05, "loss": 0.7293, "step": 66 }, { "epoch": 0.0387858347386172, "grad_norm": 0.16857261955738068, "learning_rate": 2.6526421860705473e-05, "loss": 0.7634, "step": 69 }, { "epoch": 0.04047217537942664, "grad_norm": 0.15915103256702423, "learning_rate": 2.2040354826462668e-05, "loss": 0.7338, "step": 72 }, { "epoch": 0.04047217537942664, "eval_loss": 0.7342514991760254, "eval_runtime": 164.7816, "eval_samples_per_second": 18.188, "eval_steps_per_second": 2.276, "step": 72 }, { "epoch": 0.04215851602023609, "grad_norm": 0.1585548371076584, "learning_rate": 1.7860619515673033e-05, "loss": 0.7453, "step": 75 }, { "epoch": 0.04384485666104553, "grad_norm": 0.16192683577537537, "learning_rate": 1.4033009983067452e-05, "loss": 0.7035, "step": 78 }, { "epoch": 0.045531197301854974, "grad_norm": 0.16156958043575287, "learning_rate": 1.0599462319663905e-05, "loss": 0.7299, "step": 81 }, { "epoch": 0.045531197301854974, "eval_loss": 0.7329394817352295, "eval_runtime": 164.6509, "eval_samples_per_second": 18.202, "eval_steps_per_second": 2.278, "step": 81 }, { "epoch": 0.047217537942664416, "grad_norm": 0.16453050076961517, "learning_rate": 7.597595192178702e-06, "loss": 0.7636, "step": 84 }, { "epoch": 0.048903878583473864, "grad_norm": 0.17823824286460876, "learning_rate": 5.060297685041659e-06, "loss": 0.7473, "step": 87 }, { "epoch": 0.050590219224283306, "grad_norm": 0.16380201280117035, "learning_rate": 3.0153689607045845e-06, "loss": 0.7086, "step": 90 }, { "epoch": 0.050590219224283306, "eval_loss": 0.7322941422462463, "eval_runtime": 164.8671, "eval_samples_per_second": 18.178, "eval_steps_per_second": 2.275, "step": 90 }, { "epoch": 0.05227655986509275, "grad_norm": 0.1683541089296341, "learning_rate": 1.4852136862001764e-06, "loss": 0.6752, "step": 93 }, { "epoch": 0.05396290050590219, "grad_norm": 0.16768647730350494, "learning_rate": 4.865965629214819e-07, "loss": 0.7573, "step": 96 }, { "epoch": 0.05564924114671164, "grad_norm": 0.15775783360004425, "learning_rate": 3.04586490452119e-08, "loss": 0.6528, "step": 99 }, { "epoch": 0.05564924114671164, "eval_loss": 0.7322031259536743, "eval_runtime": 164.8241, "eval_samples_per_second": 18.183, "eval_steps_per_second": 2.275, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.172824593819238e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }