{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.009939369843951893, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019878739687903787, "grad_norm": 2.5984435081481934, "learning_rate": 5e-06, "loss": 13.1525, "step": 1 }, { "epoch": 0.00019878739687903787, "eval_loss": 3.2820167541503906, "eval_runtime": 78.0374, "eval_samples_per_second": 27.154, "eval_steps_per_second": 13.583, "step": 1 }, { "epoch": 0.00039757479375807575, "grad_norm": 2.336440324783325, "learning_rate": 1e-05, "loss": 13.7228, "step": 2 }, { "epoch": 0.0005963621906371136, "grad_norm": 2.69608998298645, "learning_rate": 1.5e-05, "loss": 14.2389, "step": 3 }, { "epoch": 0.0007951495875161515, "grad_norm": 2.53593111038208, "learning_rate": 2e-05, "loss": 12.0966, "step": 4 }, { "epoch": 0.0009939369843951894, "grad_norm": 3.020933151245117, "learning_rate": 2.5e-05, "loss": 13.396, "step": 5 }, { "epoch": 0.0011927243812742273, "grad_norm": 2.3294146060943604, "learning_rate": 3e-05, "loss": 13.276, "step": 6 }, { "epoch": 0.0013915117781532651, "grad_norm": 2.662527322769165, "learning_rate": 3.5e-05, "loss": 14.1702, "step": 7 }, { "epoch": 0.001590299175032303, "grad_norm": 2.230471611022949, "learning_rate": 4e-05, "loss": 13.6173, "step": 8 }, { "epoch": 0.0017890865719113408, "grad_norm": 3.186838150024414, "learning_rate": 4.5e-05, "loss": 15.6249, "step": 9 }, { "epoch": 0.001987873968790379, "grad_norm": 2.705491542816162, "learning_rate": 5e-05, "loss": 14.0928, "step": 10 }, { "epoch": 0.0021866613656694165, "grad_norm": 3.163905382156372, "learning_rate": 4.99229333433282e-05, "loss": 13.9178, "step": 11 }, { "epoch": 0.0023854487625484546, "grad_norm": 3.664141893386841, "learning_rate": 4.9692208514878444e-05, "loss": 13.3966, "step": 12 }, { "epoch": 0.002584236159427492, "grad_norm": 3.0179429054260254, "learning_rate": 4.9309248009941914e-05, "loss": 11.8172, "step": 13 }, { "epoch": 0.002584236159427492, "eval_loss": 3.2313973903656006, "eval_runtime": 77.613, "eval_samples_per_second": 27.302, "eval_steps_per_second": 13.658, "step": 13 }, { "epoch": 0.0027830235563065303, "grad_norm": 3.568173885345459, "learning_rate": 4.877641290737884e-05, "loss": 13.0556, "step": 14 }, { "epoch": 0.002981810953185568, "grad_norm": 4.340553283691406, "learning_rate": 4.8096988312782174e-05, "loss": 13.3655, "step": 15 }, { "epoch": 0.003180598350064606, "grad_norm": 3.275731325149536, "learning_rate": 4.72751631047092e-05, "loss": 12.1498, "step": 16 }, { "epoch": 0.0033793857469436436, "grad_norm": 3.076188564300537, "learning_rate": 4.6316004108852305e-05, "loss": 12.0674, "step": 17 }, { "epoch": 0.0035781731438226817, "grad_norm": 5.818179130554199, "learning_rate": 4.522542485937369e-05, "loss": 14.4302, "step": 18 }, { "epoch": 0.0037769605407017197, "grad_norm": 3.2982187271118164, "learning_rate": 4.401014914000078e-05, "loss": 11.2277, "step": 19 }, { "epoch": 0.003975747937580758, "grad_norm": 2.596877098083496, "learning_rate": 4.267766952966369e-05, "loss": 12.7767, "step": 20 }, { "epoch": 0.004174535334459795, "grad_norm": 3.128573417663574, "learning_rate": 4.123620120825459e-05, "loss": 13.4153, "step": 21 }, { "epoch": 0.004373322731338833, "grad_norm": 3.395514965057373, "learning_rate": 3.969463130731183e-05, "loss": 11.9203, "step": 22 }, { "epoch": 0.004572110128217871, "grad_norm": 3.388470411300659, "learning_rate": 3.8062464117898724e-05, "loss": 11.5329, 
"step": 23 }, { "epoch": 0.004770897525096909, "grad_norm": 3.08424711227417, "learning_rate": 3.634976249348867e-05, "loss": 12.0117, "step": 24 }, { "epoch": 0.004969684921975946, "grad_norm": 3.288663148880005, "learning_rate": 3.456708580912725e-05, "loss": 11.7711, "step": 25 }, { "epoch": 0.005168472318854984, "grad_norm": 2.7411789894104004, "learning_rate": 3.272542485937369e-05, "loss": 13.0947, "step": 26 }, { "epoch": 0.005168472318854984, "eval_loss": 3.0858662128448486, "eval_runtime": 76.1367, "eval_samples_per_second": 27.832, "eval_steps_per_second": 13.922, "step": 26 }, { "epoch": 0.0053672597157340225, "grad_norm": 3.8614823818206787, "learning_rate": 3.083613409639764e-05, "loss": 13.1261, "step": 27 }, { "epoch": 0.0055660471126130606, "grad_norm": 2.7608020305633545, "learning_rate": 2.8910861626005776e-05, "loss": 14.0752, "step": 28 }, { "epoch": 0.005764834509492099, "grad_norm": 3.776360511779785, "learning_rate": 2.6961477393196126e-05, "loss": 12.0604, "step": 29 }, { "epoch": 0.005963621906371136, "grad_norm": 3.3174591064453125, "learning_rate": 2.5e-05, "loss": 13.3155, "step": 30 }, { "epoch": 0.006162409303250174, "grad_norm": 3.493502378463745, "learning_rate": 2.303852260680388e-05, "loss": 13.7616, "step": 31 }, { "epoch": 0.006361196700129212, "grad_norm": 3.398390293121338, "learning_rate": 2.1089138373994223e-05, "loss": 11.2017, "step": 32 }, { "epoch": 0.00655998409700825, "grad_norm": 2.4852347373962402, "learning_rate": 1.9163865903602374e-05, "loss": 11.3623, "step": 33 }, { "epoch": 0.006758771493887287, "grad_norm": 3.126319169998169, "learning_rate": 1.7274575140626318e-05, "loss": 11.5718, "step": 34 }, { "epoch": 0.006957558890766325, "grad_norm": 3.5908548831939697, "learning_rate": 1.5432914190872757e-05, "loss": 13.374, "step": 35 }, { "epoch": 0.007156346287645363, "grad_norm": 4.754377365112305, "learning_rate": 1.3650237506511331e-05, "loss": 13.9317, "step": 36 }, { "epoch": 0.007355133684524401, "grad_norm": 3.6869258880615234, "learning_rate": 1.1937535882101281e-05, "loss": 12.4569, "step": 37 }, { "epoch": 0.0075539210814034394, "grad_norm": 3.3655855655670166, "learning_rate": 1.0305368692688174e-05, "loss": 11.7599, "step": 38 }, { "epoch": 0.007752708478282477, "grad_norm": 2.8551807403564453, "learning_rate": 8.763798791745411e-06, "loss": 11.8171, "step": 39 }, { "epoch": 0.007752708478282477, "eval_loss": 3.0310628414154053, "eval_runtime": 75.945, "eval_samples_per_second": 27.902, "eval_steps_per_second": 13.957, "step": 39 }, { "epoch": 0.007951495875161516, "grad_norm": 3.137924909591675, "learning_rate": 7.3223304703363135e-06, "loss": 12.3101, "step": 40 }, { "epoch": 0.008150283272040553, "grad_norm": 2.641319751739502, "learning_rate": 5.989850859999227e-06, "loss": 11.4512, "step": 41 }, { "epoch": 0.00834907066891959, "grad_norm": 3.6729891300201416, "learning_rate": 4.7745751406263165e-06, "loss": 11.6769, "step": 42 }, { "epoch": 0.008547858065798629, "grad_norm": 3.528848648071289, "learning_rate": 3.6839958911476957e-06, "loss": 11.1653, "step": 43 }, { "epoch": 0.008746645462677666, "grad_norm": 3.0228517055511475, "learning_rate": 2.7248368952908053e-06, "loss": 10.7299, "step": 44 }, { "epoch": 0.008945432859556703, "grad_norm": 3.2013304233551025, "learning_rate": 1.9030116872178316e-06, "loss": 12.7148, "step": 45 }, { "epoch": 0.009144220256435742, "grad_norm": 2.7772860527038574, "learning_rate": 1.2235870926211619e-06, "loss": 11.8285, "step": 46 }, { "epoch": 0.00934300765331478, "grad_norm": 
3.5433526039123535, "learning_rate": 6.907519900580861e-07, "loss": 11.6592, "step": 47 }, { "epoch": 0.009541795050193818, "grad_norm": 3.839385509490967, "learning_rate": 3.077914851215585e-07, "loss": 11.4824, "step": 48 }, { "epoch": 0.009740582447072856, "grad_norm": 2.9668209552764893, "learning_rate": 7.706665667180091e-08, "loss": 11.6165, "step": 49 }, { "epoch": 0.009939369843951893, "grad_norm": 3.1155428886413574, "learning_rate": 0.0, "loss": 12.5005, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8184384007962624.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }