{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.6,
  "eval_steps": 50,
  "global_step": 2800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1,
      "grad_norm": 10.53576374053955,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.6618,
      "step": 50
    },
    {
      "epoch": 0.1,
      "eval_loss": 0.7731789350509644,
      "eval_runtime": 2.2494,
      "eval_samples_per_second": 69.353,
      "eval_steps_per_second": 3.557,
      "step": 50
    },
    {
      "epoch": 0.2,
      "grad_norm": 5.800010681152344,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.7629,
      "step": 100
    },
    {
      "epoch": 0.2,
      "eval_loss": 0.6901325583457947,
      "eval_runtime": 2.2539,
      "eval_samples_per_second": 69.213,
      "eval_steps_per_second": 3.549,
      "step": 100
    },
    {
      "epoch": 0.3,
      "grad_norm": 4.960265636444092,
      "learning_rate": 6e-06,
      "loss": 0.7256,
      "step": 150
    },
    {
      "epoch": 0.3,
      "eval_loss": 0.6716309785842896,
      "eval_runtime": 2.2526,
      "eval_samples_per_second": 69.254,
      "eval_steps_per_second": 3.551,
      "step": 150
    },
    {
      "epoch": 0.4,
      "grad_norm": 5.574848651885986,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.7243,
      "step": 200
    },
    {
      "epoch": 0.4,
      "eval_loss": 0.6644517779350281,
      "eval_runtime": 2.2546,
      "eval_samples_per_second": 69.193,
      "eval_steps_per_second": 3.548,
      "step": 200
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.0581891536712646,
      "learning_rate": 1e-05,
      "loss": 0.6918,
      "step": 250
    },
    {
      "epoch": 0.5,
      "eval_loss": 0.6718080043792725,
      "eval_runtime": 2.255,
      "eval_samples_per_second": 69.18,
      "eval_steps_per_second": 3.548,
      "step": 250
    },
    {
      "epoch": 0.6,
      "grad_norm": 3.797400712966919,
      "learning_rate": 1.2e-05,
      "loss": 0.7433,
      "step": 300
    },
    {
      "epoch": 0.6,
      "eval_loss": 0.67710280418396,
      "eval_runtime": 2.2558,
      "eval_samples_per_second": 69.155,
      "eval_steps_per_second": 3.546,
      "step": 300
    },
    {
      "epoch": 0.7,
      "grad_norm": 8.121636390686035,
      "learning_rate": 1.4e-05,
      "loss": 0.7523,
      "step": 350
    },
    {
      "epoch": 0.7,
      "eval_loss": 0.680716335773468,
      "eval_runtime": 2.2562,
      "eval_samples_per_second": 69.144,
      "eval_steps_per_second": 3.546,
      "step": 350
    },
    {
      "epoch": 0.8,
      "grad_norm": 2.615454912185669,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.7322,
      "step": 400
    },
    {
      "epoch": 0.8,
      "eval_loss": 0.6906686425209045,
      "eval_runtime": 2.2633,
      "eval_samples_per_second": 68.926,
      "eval_steps_per_second": 3.535,
      "step": 400
    },
    {
      "epoch": 0.9,
      "grad_norm": 2.9651033878326416,
      "learning_rate": 1.8e-05,
      "loss": 0.7497,
      "step": 450
    },
    {
      "epoch": 0.9,
      "eval_loss": 0.6827173233032227,
      "eval_runtime": 2.5909,
      "eval_samples_per_second": 60.21,
      "eval_steps_per_second": 3.088,
      "step": 450
    },
    {
      "epoch": 1.0,
      "grad_norm": 3.7542426586151123,
      "learning_rate": 2e-05,
      "loss": 0.7622,
      "step": 500
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.6903170347213745,
      "eval_runtime": 2.4721,
      "eval_samples_per_second": 63.105,
      "eval_steps_per_second": 3.236,
      "step": 500
    },
    {
      "epoch": 1.1,
      "grad_norm": 2.384434938430786,
      "learning_rate": 1.999390827019096e-05,
      "loss": 0.484,
      "step": 550
    },
    {
      "epoch": 1.1,
      "eval_loss": 0.7237842679023743,
      "eval_runtime": 2.9344,
      "eval_samples_per_second": 53.162,
      "eval_steps_per_second": 2.726,
      "step": 550
    },
    {
      "epoch": 1.2,
      "grad_norm": 3.1198794841766357,
      "learning_rate": 1.9975640502598243e-05,
      "loss": 0.5145,
      "step": 600
    },
    {
      "epoch": 1.2,
      "eval_loss": 0.7352678179740906,
      "eval_runtime": 3.8017,
      "eval_samples_per_second": 41.034,
      "eval_steps_per_second": 2.104,
      "step": 600
    },
    {
      "epoch": 1.3,
      "grad_norm": 6.234444618225098,
      "learning_rate": 1.9945218953682736e-05,
      "loss": 0.5093,
      "step": 650
    },
    {
      "epoch": 1.3,
      "eval_loss": 0.7311124801635742,
      "eval_runtime": 2.2672,
      "eval_samples_per_second": 68.808,
      "eval_steps_per_second": 3.529,
      "step": 650
    },
    {
      "epoch": 1.4,
      "grad_norm": 2.112931489944458,
      "learning_rate": 1.9902680687415704e-05,
      "loss": 0.5248,
      "step": 700
    },
    {
      "epoch": 1.4,
      "eval_loss": 0.734488844871521,
      "eval_runtime": 2.2746,
      "eval_samples_per_second": 68.582,
      "eval_steps_per_second": 3.517,
      "step": 700
    },
    {
      "epoch": 1.5,
      "grad_norm": 3.4556541442871094,
      "learning_rate": 1.9848077530122083e-05,
      "loss": 0.5107,
      "step": 750
    },
    {
      "epoch": 1.5,
      "eval_loss": 0.723623514175415,
      "eval_runtime": 2.256,
      "eval_samples_per_second": 69.148,
      "eval_steps_per_second": 3.546,
      "step": 750
    },
    {
      "epoch": 1.6,
      "grad_norm": 3.025707960128784,
      "learning_rate": 1.9781476007338058e-05,
      "loss": 0.5171,
      "step": 800
    },
    {
      "epoch": 1.6,
      "eval_loss": 0.7228586077690125,
      "eval_runtime": 2.2603,
      "eval_samples_per_second": 69.016,
      "eval_steps_per_second": 3.539,
      "step": 800
    },
    {
      "epoch": 1.7,
      "grad_norm": 2.2873287200927734,
      "learning_rate": 1.9702957262759964e-05,
      "loss": 0.5391,
      "step": 850
    },
    {
      "epoch": 1.7,
      "eval_loss": 0.7198938727378845,
      "eval_runtime": 2.4311,
      "eval_samples_per_second": 64.168,
      "eval_steps_per_second": 3.291,
      "step": 850
    },
    {
      "epoch": 1.8,
      "grad_norm": 3.1473968029022217,
      "learning_rate": 1.961261695938319e-05,
      "loss": 0.5244,
      "step": 900
    },
    {
      "epoch": 1.8,
      "eval_loss": 0.7222604751586914,
      "eval_runtime": 2.6131,
      "eval_samples_per_second": 59.699,
      "eval_steps_per_second": 3.061,
      "step": 900
    },
    {
      "epoch": 1.9,
      "grad_norm": 2.5658185482025146,
      "learning_rate": 1.9510565162951538e-05,
      "loss": 0.5435,
      "step": 950
    },
    {
      "epoch": 1.9,
      "eval_loss": 0.7172784209251404,
      "eval_runtime": 3.0626,
      "eval_samples_per_second": 50.937,
      "eval_steps_per_second": 2.612,
      "step": 950
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.090545415878296,
      "learning_rate": 1.9396926207859085e-05,
      "loss": 0.5197,
      "step": 1000
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.7204703092575073,
      "eval_runtime": 3.4963,
      "eval_samples_per_second": 44.619,
      "eval_steps_per_second": 2.288,
      "step": 1000
    },
    {
      "epoch": 2.1,
      "grad_norm": 1.921531081199646,
      "learning_rate": 1.9271838545667876e-05,
      "loss": 0.2538,
      "step": 1050
    },
    {
      "epoch": 2.1,
      "eval_loss": 0.791098952293396,
      "eval_runtime": 2.2604,
      "eval_samples_per_second": 69.014,
      "eval_steps_per_second": 3.539,
      "step": 1050
    },
    {
      "epoch": 2.2,
      "grad_norm": 1.807320475578308,
      "learning_rate": 1.913545457642601e-05,
      "loss": 0.2521,
      "step": 1100
    },
    {
      "epoch": 2.2,
      "eval_loss": 0.8204991221427917,
      "eval_runtime": 2.2623,
      "eval_samples_per_second": 68.956,
      "eval_steps_per_second": 3.536,
      "step": 1100
    },
    {
      "epoch": 2.3,
      "grad_norm": 2.746616840362549,
      "learning_rate": 1.8987940462991673e-05,
      "loss": 0.2687,
      "step": 1150
    },
    {
      "epoch": 2.3,
      "eval_loss": 0.8025296330451965,
      "eval_runtime": 2.2565,
      "eval_samples_per_second": 69.132,
      "eval_steps_per_second": 3.545,
      "step": 1150
    },
    {
      "epoch": 2.4,
      "grad_norm": 2.3170738220214844,
      "learning_rate": 1.8829475928589272e-05,
      "loss": 0.2689,
      "step": 1200
    },
    {
      "epoch": 2.4,
      "eval_loss": 0.8150458931922913,
      "eval_runtime": 2.2607,
      "eval_samples_per_second": 69.005,
      "eval_steps_per_second": 3.539,
      "step": 1200
    },
    {
      "epoch": 2.5,
      "grad_norm": 1.9649097919464111,
      "learning_rate": 1.866025403784439e-05,
      "loss": 0.2772,
      "step": 1250
    },
    {
      "epoch": 2.5,
      "eval_loss": 0.7988224625587463,
      "eval_runtime": 2.5979,
      "eval_samples_per_second": 60.048,
      "eval_steps_per_second": 3.079,
      "step": 1250
    },
    {
      "epoch": 2.6,
      "grad_norm": 2.264338970184326,
      "learning_rate": 1.848048096156426e-05,
      "loss": 0.2788,
      "step": 1300
    },
    {
      "epoch": 2.6,
      "eval_loss": 0.8175423741340637,
      "eval_runtime": 3.4025,
      "eval_samples_per_second": 45.849,
      "eval_steps_per_second": 2.351,
      "step": 1300
    },
    {
      "epoch": 2.7,
      "grad_norm": 2.027390241622925,
      "learning_rate": 1.8290375725550417e-05,
      "loss": 0.2742,
      "step": 1350
    },
    {
      "epoch": 2.7,
      "eval_loss": 0.8078347444534302,
      "eval_runtime": 2.7124,
      "eval_samples_per_second": 57.513,
      "eval_steps_per_second": 2.949,
      "step": 1350
    },
    {
      "epoch": 2.8,
      "grad_norm": 1.8391352891921997,
      "learning_rate": 1.8090169943749477e-05,
      "loss": 0.2749,
      "step": 1400
    },
    {
      "epoch": 2.8,
      "eval_loss": 0.804284393787384,
      "eval_runtime": 2.9467,
      "eval_samples_per_second": 52.94,
      "eval_steps_per_second": 2.715,
      "step": 1400
    },
    {
      "epoch": 2.9,
      "grad_norm": 1.9982004165649414,
      "learning_rate": 1.788010753606722e-05,
      "loss": 0.2717,
      "step": 1450
    },
    {
      "epoch": 2.9,
      "eval_loss": 0.7994141578674316,
      "eval_runtime": 2.2711,
      "eval_samples_per_second": 68.688,
      "eval_steps_per_second": 3.522,
      "step": 1450
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.782399296760559,
      "learning_rate": 1.766044443118978e-05,
      "loss": 0.2715,
      "step": 1500
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.804834246635437,
      "eval_runtime": 2.2867,
      "eval_samples_per_second": 68.222,
      "eval_steps_per_second": 3.499,
      "step": 1500
    },
    {
      "epoch": 3.1,
      "grad_norm": 1.8651448488235474,
      "learning_rate": 1.7431448254773943e-05,
      "loss": 0.1627,
      "step": 1550
    },
    {
      "epoch": 3.1,
      "eval_loss": 0.859173595905304,
      "eval_runtime": 2.2588,
      "eval_samples_per_second": 69.062,
      "eval_steps_per_second": 3.542,
      "step": 1550
    },
    {
      "epoch": 3.2,
      "grad_norm": 1.4768388271331787,
      "learning_rate": 1.7193398003386514e-05,
      "loss": 0.1651,
      "step": 1600
    },
    {
      "epoch": 3.2,
      "eval_loss": 0.868316650390625,
      "eval_runtime": 2.259,
      "eval_samples_per_second": 69.058,
      "eval_steps_per_second": 3.541,
      "step": 1600
    },
    {
      "epoch": 3.3,
      "grad_norm": 1.4704113006591797,
      "learning_rate": 1.6946583704589973e-05,
      "loss": 0.1702,
      "step": 1650
    },
    {
      "epoch": 3.3,
      "eval_loss": 0.872775137424469,
      "eval_runtime": 2.8294,
      "eval_samples_per_second": 55.136,
      "eval_steps_per_second": 2.827,
      "step": 1650
    },
    {
      "epoch": 3.4,
      "grad_norm": 1.082715630531311,
      "learning_rate": 1.6691306063588583e-05,
      "loss": 0.1734,
      "step": 1700
    },
    {
      "epoch": 3.4,
      "eval_loss": 0.8728486895561218,
      "eval_runtime": 3.3787,
      "eval_samples_per_second": 46.171,
      "eval_steps_per_second": 2.368,
      "step": 1700
    },
    {
      "epoch": 3.5,
      "grad_norm": 2.210588216781616,
      "learning_rate": 1.6427876096865394e-05,
      "loss": 0.1752,
      "step": 1750
    },
    {
      "epoch": 3.5,
      "eval_loss": 0.8705567717552185,
      "eval_runtime": 3.1278,
      "eval_samples_per_second": 49.875,
      "eval_steps_per_second": 2.558,
      "step": 1750
    },
    {
      "epoch": 3.6,
      "grad_norm": 1.4183433055877686,
      "learning_rate": 1.6156614753256583e-05,
      "loss": 0.1706,
      "step": 1800
    },
    {
      "epoch": 3.6,
      "eval_loss": 0.8853814601898193,
      "eval_runtime": 3.6433,
      "eval_samples_per_second": 42.818,
      "eval_steps_per_second": 2.196,
      "step": 1800
    },
    {
      "epoch": 3.7,
      "grad_norm": 1.4250963926315308,
      "learning_rate": 1.5877852522924733e-05,
      "loss": 0.1784,
      "step": 1850
    },
    {
      "epoch": 3.7,
      "eval_loss": 0.884819507598877,
      "eval_runtime": 2.2666,
      "eval_samples_per_second": 68.827,
      "eval_steps_per_second": 3.53,
      "step": 1850
    },
    {
      "epoch": 3.8,
      "grad_norm": 1.252785563468933,
      "learning_rate": 1.5591929034707468e-05,
      "loss": 0.1729,
      "step": 1900
    },
    {
      "epoch": 3.8,
      "eval_loss": 0.8708668351173401,
      "eval_runtime": 2.2648,
      "eval_samples_per_second": 68.88,
      "eval_steps_per_second": 3.532,
      "step": 1900
    },
    {
      "epoch": 3.9,
      "grad_norm": 1.4024217128753662,
      "learning_rate": 1.529919264233205e-05,
      "loss": 0.174,
      "step": 1950
    },
    {
      "epoch": 3.9,
      "eval_loss": 0.8670658469200134,
      "eval_runtime": 2.2608,
      "eval_samples_per_second": 69.003,
      "eval_steps_per_second": 3.539,
      "step": 1950
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.6221123933792114,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.174,
      "step": 2000
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.8709214925765991,
      "eval_runtime": 2.2598,
      "eval_samples_per_second": 69.033,
      "eval_steps_per_second": 3.54,
      "step": 2000
    },
    {
      "epoch": 4.1,
      "grad_norm": 1.5479576587677002,
      "learning_rate": 1.469471562785891e-05,
      "loss": 0.1167,
      "step": 2050
    },
    {
      "epoch": 4.1,
      "eval_loss": 0.9011654853820801,
      "eval_runtime": 2.738,
      "eval_samples_per_second": 56.976,
      "eval_steps_per_second": 2.922,
      "step": 2050
    },
    {
      "epoch": 4.2,
      "grad_norm": 1.3002970218658447,
      "learning_rate": 1.4383711467890776e-05,
      "loss": 0.1186,
      "step": 2100
    },
    {
      "epoch": 4.2,
      "eval_loss": 0.9147914052009583,
      "eval_runtime": 3.018,
      "eval_samples_per_second": 51.69,
      "eval_steps_per_second": 2.651,
      "step": 2100
    },
    {
      "epoch": 4.3,
      "grad_norm": 1.7996995449066162,
      "learning_rate": 1.4067366430758004e-05,
      "loss": 0.1153,
      "step": 2150
    },
    {
      "epoch": 4.3,
      "eval_loss": 0.9160046577453613,
      "eval_runtime": 3.6692,
      "eval_samples_per_second": 42.516,
      "eval_steps_per_second": 2.18,
      "step": 2150
    },
    {
      "epoch": 4.4,
      "grad_norm": 1.1670547723770142,
      "learning_rate": 1.3746065934159123e-05,
      "loss": 0.1214,
      "step": 2200
    },
    {
      "epoch": 4.4,
      "eval_loss": 0.9355931282043457,
      "eval_runtime": 2.337,
      "eval_samples_per_second": 66.753,
      "eval_steps_per_second": 3.423,
      "step": 2200
    },
    {
      "epoch": 4.5,
      "grad_norm": 1.1401852369308472,
      "learning_rate": 1.342020143325669e-05,
      "loss": 0.1193,
      "step": 2250
    },
    {
      "epoch": 4.5,
      "eval_loss": 0.9175124764442444,
      "eval_runtime": 2.2626,
      "eval_samples_per_second": 68.947,
      "eval_steps_per_second": 3.536,
      "step": 2250
    },
    {
      "epoch": 4.6,
      "grad_norm": 0.8389841914176941,
      "learning_rate": 1.3090169943749475e-05,
      "loss": 0.1186,
      "step": 2300
    },
    {
      "epoch": 4.6,
      "eval_loss": 0.9386661052703857,
      "eval_runtime": 2.2532,
      "eval_samples_per_second": 69.235,
      "eval_steps_per_second": 3.55,
      "step": 2300
    },
    {
      "epoch": 4.7,
      "grad_norm": 1.2419942617416382,
      "learning_rate": 1.2756373558169992e-05,
      "loss": 0.1187,
      "step": 2350
    },
    {
      "epoch": 4.7,
      "eval_loss": 0.9336636662483215,
      "eval_runtime": 2.2535,
      "eval_samples_per_second": 69.225,
      "eval_steps_per_second": 3.55,
      "step": 2350
    },
    {
      "epoch": 4.8,
      "grad_norm": 1.0060522556304932,
      "learning_rate": 1.2419218955996677e-05,
      "loss": 0.1245,
      "step": 2400
    },
    {
      "epoch": 4.8,
      "eval_loss": 0.9188296794891357,
      "eval_runtime": 2.2614,
      "eval_samples_per_second": 68.983,
      "eval_steps_per_second": 3.538,
      "step": 2400
    },
    {
      "epoch": 4.9,
      "grad_norm": 0.7993331551551819,
      "learning_rate": 1.2079116908177592e-05,
      "loss": 0.1222,
      "step": 2450
    },
    {
      "epoch": 4.9,
      "eval_loss": 0.9250988364219666,
      "eval_runtime": 2.4444,
      "eval_samples_per_second": 63.82,
      "eval_steps_per_second": 3.273,
      "step": 2450
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.1892589330673218,
      "learning_rate": 1.1736481776669307e-05,
      "loss": 0.1186,
      "step": 2500
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.9481778144836426,
      "eval_runtime": 3.3935,
      "eval_samples_per_second": 45.97,
      "eval_steps_per_second": 2.357,
      "step": 2500
    },
    {
      "epoch": 5.1,
      "grad_norm": 0.7223986983299255,
      "learning_rate": 1.1391731009600655e-05,
      "loss": 0.0726,
      "step": 2550
    },
    {
      "epoch": 5.1,
      "eval_loss": 0.974181056022644,
      "eval_runtime": 2.9499,
      "eval_samples_per_second": 52.883,
      "eval_steps_per_second": 2.712,
      "step": 2550
    },
    {
      "epoch": 5.2,
      "grad_norm": 0.7545835971832275,
      "learning_rate": 1.1045284632676535e-05,
      "loss": 0.0717,
      "step": 2600
    },
    {
      "epoch": 5.2,
      "eval_loss": 0.9890027046203613,
      "eval_runtime": 2.7635,
      "eval_samples_per_second": 56.449,
      "eval_steps_per_second": 2.895,
      "step": 2600
    },
    {
      "epoch": 5.3,
      "grad_norm": 1.2251814603805542,
      "learning_rate": 1.0697564737441254e-05,
      "loss": 0.072,
      "step": 2650
    },
    {
      "epoch": 5.3,
      "eval_loss": 0.9911813735961914,
      "eval_runtime": 2.2537,
      "eval_samples_per_second": 69.22,
      "eval_steps_per_second": 3.55,
      "step": 2650
    },
    {
      "epoch": 5.4,
      "grad_norm": 0.45753681659698486,
      "learning_rate": 1.0348994967025012e-05,
      "loss": 0.0718,
      "step": 2700
    },
    {
      "epoch": 5.4,
      "eval_loss": 0.9854485988616943,
      "eval_runtime": 2.2539,
      "eval_samples_per_second": 69.212,
      "eval_steps_per_second": 3.549,
      "step": 2700
    },
    {
      "epoch": 5.5,
      "grad_norm": 1.0563805103302002,
      "learning_rate": 1e-05,
      "loss": 0.072,
      "step": 2750
    },
    {
      "epoch": 5.5,
      "eval_loss": 0.9962345957756042,
      "eval_runtime": 2.2507,
      "eval_samples_per_second": 69.313,
      "eval_steps_per_second": 3.555,
      "step": 2750
    },
    {
      "epoch": 5.6,
      "grad_norm": 1.6450284719467163,
      "learning_rate": 9.651005032974994e-06,
      "loss": 0.0699,
      "step": 2800
    },
    {
      "epoch": 5.6,
      "eval_loss": 0.9950909614562988,
      "eval_runtime": 2.2532,
      "eval_samples_per_second": 69.235,
      "eval_steps_per_second": 3.551,
      "step": 2800
    }
  ],
  "logging_steps": 50,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 400,
  "total_flos": 1.3524716052545536e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}