{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.8666183924692252,
  "eval_steps": 50,
  "global_step": 187,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004634322954380883,
      "grad_norm": 1.1059749126434326,
      "learning_rate": 7.499999999999999e-06,
      "loss": 2.7798,
      "step": 1
    },
    {
      "epoch": 0.004634322954380883,
      "eval_loss": 2.9304847717285156,
      "eval_runtime": 146.6617,
      "eval_samples_per_second": 4.957,
      "eval_steps_per_second": 2.482,
      "step": 1
    },
    {
      "epoch": 0.009268645908761766,
      "grad_norm": 1.1082885265350342,
      "learning_rate": 1.4999999999999999e-05,
      "loss": 2.681,
      "step": 2
    },
    {
      "epoch": 0.013902968863142651,
      "grad_norm": 1.2035174369812012,
      "learning_rate": 2.2499999999999998e-05,
      "loss": 2.9068,
      "step": 3
    },
    {
      "epoch": 0.018537291817523532,
      "grad_norm": 1.1457459926605225,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 2.9637,
      "step": 4
    },
    {
      "epoch": 0.023171614771904415,
      "grad_norm": 1.1010493040084839,
      "learning_rate": 3.75e-05,
      "loss": 2.7247,
      "step": 5
    },
    {
      "epoch": 0.027805937726285302,
      "grad_norm": 1.2098731994628906,
      "learning_rate": 4.4999999999999996e-05,
      "loss": 2.8731,
      "step": 6
    },
    {
      "epoch": 0.03244026068066618,
      "grad_norm": 1.0666383504867554,
      "learning_rate": 5.2499999999999995e-05,
      "loss": 2.6556,
      "step": 7
    },
    {
      "epoch": 0.037074583635047065,
      "grad_norm": 0.8229893445968628,
      "learning_rate": 5.9999999999999995e-05,
      "loss": 2.5,
      "step": 8
    },
    {
      "epoch": 0.04170890658942795,
      "grad_norm": 1.1555801630020142,
      "learning_rate": 6.75e-05,
      "loss": 2.663,
      "step": 9
    },
    {
      "epoch": 0.04634322954380883,
      "grad_norm": 1.2436870336532593,
      "learning_rate": 7.5e-05,
      "loss": 2.3639,
      "step": 10
    },
    {
      "epoch": 0.05097755249818972,
      "grad_norm": 1.1677322387695312,
      "learning_rate": 8.25e-05,
      "loss": 2.3472,
      "step": 11
    },
    {
      "epoch": 0.055611875452570604,
      "grad_norm": 1.1217212677001953,
      "learning_rate": 8.999999999999999e-05,
      "loss": 2.2676,
      "step": 12
    },
    {
      "epoch": 0.06024619840695149,
      "grad_norm": 0.7882691621780396,
      "learning_rate": 9.75e-05,
      "loss": 2.1962,
      "step": 13
    },
    {
      "epoch": 0.06488052136133236,
      "grad_norm": 0.8494845032691956,
      "learning_rate": 0.00010499999999999999,
      "loss": 2.1433,
      "step": 14
    },
    {
      "epoch": 0.06951484431571325,
      "grad_norm": 0.980268120765686,
      "learning_rate": 0.0001125,
      "loss": 2.152,
      "step": 15
    },
    {
      "epoch": 0.07414916727009413,
      "grad_norm": 0.9087497591972351,
      "learning_rate": 0.00011999999999999999,
      "loss": 2.0726,
      "step": 16
    },
    {
      "epoch": 0.07878349022447502,
      "grad_norm": 0.6476942300796509,
      "learning_rate": 0.00012749999999999998,
      "loss": 1.9891,
      "step": 17
    },
    {
      "epoch": 0.0834178131788559,
      "grad_norm": 0.7011496424674988,
      "learning_rate": 0.000135,
      "loss": 1.9417,
      "step": 18
    },
    {
      "epoch": 0.08805213613323679,
      "grad_norm": 0.7312366962432861,
      "learning_rate": 0.0001425,
      "loss": 2.0363,
      "step": 19
    },
    {
      "epoch": 0.09268645908761766,
      "grad_norm": 0.6832443475723267,
      "learning_rate": 0.00015,
      "loss": 1.9493,
      "step": 20
    },
    {
      "epoch": 0.09732078204199855,
      "grad_norm": 0.5798928737640381,
      "learning_rate": 0.00014998857713672935,
      "loss": 1.9732,
      "step": 21
    },
    {
      "epoch": 0.10195510499637944,
      "grad_norm": 0.5472275018692017,
      "learning_rate": 0.00014995431202643217,
      "loss": 1.8398,
      "step": 22
    },
    {
      "epoch": 0.10658942795076032,
      "grad_norm": 0.7329992055892944,
      "learning_rate": 0.000149897215106593,
      "loss": 1.8098,
      "step": 23
    },
    {
      "epoch": 0.11122375090514121,
      "grad_norm": 0.6771075129508972,
      "learning_rate": 0.0001498173037694868,
      "loss": 1.7746,
      "step": 24
    },
    {
      "epoch": 0.11585807385952208,
      "grad_norm": 0.8174684643745422,
      "learning_rate": 0.0001497146023568809,
      "loss": 1.7504,
      "step": 25
    },
    {
      "epoch": 0.12049239681390297,
      "grad_norm": 0.6408036351203918,
      "learning_rate": 0.00014958914215262048,
      "loss": 1.7606,
      "step": 26
    },
    {
      "epoch": 0.12512671976828385,
      "grad_norm": 0.7159773111343384,
      "learning_rate": 0.00014944096137309914,
      "loss": 1.7529,
      "step": 27
    },
    {
      "epoch": 0.12976104272266473,
      "grad_norm": 0.8010567426681519,
      "learning_rate": 0.00014927010515561776,
      "loss": 1.9922,
      "step": 28
    },
    {
      "epoch": 0.13439536567704563,
      "grad_norm": 0.6913322806358337,
      "learning_rate": 0.00014907662554463532,
      "loss": 1.6162,
      "step": 29
    },
    {
      "epoch": 0.1390296886314265,
      "grad_norm": 0.6494315266609192,
      "learning_rate": 0.0001488605814759156,
      "loss": 1.6571,
      "step": 30
    },
    {
      "epoch": 0.14366401158580738,
      "grad_norm": 0.6595631241798401,
      "learning_rate": 0.00014862203875857477,
      "loss": 1.5886,
      "step": 31
    },
    {
      "epoch": 0.14829833454018826,
      "grad_norm": 0.7777145504951477,
      "learning_rate": 0.0001483610700550354,
      "loss": 1.5729,
      "step": 32
    },
    {
      "epoch": 0.15293265749456916,
      "grad_norm": 0.7189599871635437,
      "learning_rate": 0.00014807775485889264,
      "loss": 1.6492,
      "step": 33
    },
    {
      "epoch": 0.15756698044895004,
      "grad_norm": 0.7974843978881836,
      "learning_rate": 0.0001477721794706997,
      "loss": 1.5415,
      "step": 34
    },
    {
      "epoch": 0.16220130340333092,
      "grad_norm": 0.8167005181312561,
      "learning_rate": 0.0001474444369716801,
      "loss": 1.4696,
      "step": 35
    },
    {
      "epoch": 0.1668356263577118,
      "grad_norm": 0.7308511137962341,
      "learning_rate": 0.0001470946271953739,
      "loss": 1.6394,
      "step": 36
    },
    {
      "epoch": 0.1714699493120927,
      "grad_norm": 0.808505654335022,
      "learning_rate": 0.00014672285669722765,
      "loss": 1.4614,
      "step": 37
    },
    {
      "epoch": 0.17610427226647357,
      "grad_norm": 0.7852684259414673,
      "learning_rate": 0.00014632923872213652,
      "loss": 1.4592,
      "step": 38
    },
    {
      "epoch": 0.18073859522085445,
      "grad_norm": 0.7765493988990784,
      "learning_rate": 0.00014591389316994876,
      "loss": 1.3337,
      "step": 39
    },
    {
      "epoch": 0.18537291817523532,
      "grad_norm": 0.9292432069778442,
      "learning_rate": 0.0001454769465589431,
      "loss": 1.4594,
      "step": 40
    },
    {
      "epoch": 0.19000724112961623,
      "grad_norm": 0.9222384095191956,
      "learning_rate": 0.00014501853198729012,
      "loss": 1.5136,
      "step": 41
    },
    {
      "epoch": 0.1946415640839971,
      "grad_norm": 0.8896085619926453,
      "learning_rate": 0.00014453878909250904,
      "loss": 1.4321,
      "step": 42
    },
    {
      "epoch": 0.19927588703837798,
      "grad_norm": 0.9805691242218018,
      "learning_rate": 0.00014403786400893302,
      "loss": 1.2422,
      "step": 43
    },
    {
      "epoch": 0.20391020999275888,
      "grad_norm": 0.9406644701957703,
      "learning_rate": 0.00014351590932319504,
      "loss": 1.3904,
      "step": 44
    },
    {
      "epoch": 0.20854453294713976,
      "grad_norm": 1.2533518075942993,
      "learning_rate": 0.00014297308402774875,
      "loss": 1.2121,
      "step": 45
    },
    {
      "epoch": 0.21317885590152064,
      "grad_norm": 1.020983338356018,
      "learning_rate": 0.0001424095534724375,
      "loss": 1.148,
      "step": 46
    },
    {
      "epoch": 0.2178131788559015,
      "grad_norm": 0.895693302154541,
      "learning_rate": 0.00014182548931412757,
      "loss": 1.3044,
      "step": 47
    },
    {
      "epoch": 0.22244750181028242,
      "grad_norm": 1.1574026346206665,
      "learning_rate": 0.0001412210694644195,
      "loss": 1.2857,
      "step": 48
    },
    {
      "epoch": 0.2270818247646633,
      "grad_norm": 1.032139539718628,
      "learning_rate": 0.00014059647803545467,
      "loss": 1.2987,
      "step": 49
    },
    {
      "epoch": 0.23171614771904417,
      "grad_norm": 0.9623427391052246,
      "learning_rate": 0.0001399519052838329,
      "loss": 1.1087,
      "step": 50
    },
    {
      "epoch": 0.23171614771904417,
      "eval_loss": 1.1839566230773926,
      "eval_runtime": 146.108,
      "eval_samples_per_second": 4.976,
      "eval_steps_per_second": 2.491,
      "step": 50
    },
    {
      "epoch": 0.23635047067342504,
      "grad_norm": 1.263940691947937,
      "learning_rate": 0.00013928754755265842,
      "loss": 1.0973,
      "step": 51
    },
    {
      "epoch": 0.24098479362780595,
      "grad_norm": 1.0416457653045654,
      "learning_rate": 0.00013860360721173193,
      "loss": 1.1655,
      "step": 52
    },
    {
      "epoch": 0.24561911658218682,
      "grad_norm": 1.1260313987731934,
      "learning_rate": 0.0001379002925959068,
      "loss": 1.0404,
      "step": 53
    },
    {
      "epoch": 0.2502534395365677,
      "grad_norm": 0.9762699604034424,
      "learning_rate": 0.0001371778179416281,
      "loss": 1.0744,
      "step": 54
    },
    {
      "epoch": 0.2548877624909486,
      "grad_norm": 1.2078801393508911,
      "learning_rate": 0.00013643640332167438,
      "loss": 1.0079,
      "step": 55
    },
    {
      "epoch": 0.25952208544532945,
      "grad_norm": 1.075649380683899,
      "learning_rate": 0.00013567627457812106,
      "loss": 1.0864,
      "step": 56
    },
    {
      "epoch": 0.26415640839971033,
      "grad_norm": 1.1446782350540161,
      "learning_rate": 0.00013489766325354695,
      "loss": 1.0853,
      "step": 57
    },
    {
      "epoch": 0.26879073135409126,
      "grad_norm": 1.1303826570510864,
      "learning_rate": 0.00013410080652050412,
      "loss": 1.0711,
      "step": 58
    },
    {
      "epoch": 0.27342505430847214,
      "grad_norm": 1.216537356376648,
      "learning_rate": 0.0001332859471092728,
      "loss": 0.9193,
      "step": 59
    },
    {
      "epoch": 0.278059377262853,
      "grad_norm": 1.4602874517440796,
      "learning_rate": 0.00013245333323392333,
      "loss": 0.9987,
      "step": 60
    },
    {
      "epoch": 0.2826937002172339,
      "grad_norm": 1.0730090141296387,
      "learning_rate": 0.0001316032185167079,
      "loss": 1.0272,
      "step": 61
    },
    {
      "epoch": 0.28732802317161477,
      "grad_norm": 1.0437711477279663,
      "learning_rate": 0.00013073586191080457,
      "loss": 1.0511,
      "step": 62
    },
    {
      "epoch": 0.29196234612599564,
      "grad_norm": 1.0983786582946777,
      "learning_rate": 0.00012985152762143778,
      "loss": 0.9506,
      "step": 63
    },
    {
      "epoch": 0.2965966690803765,
      "grad_norm": 1.3723191022872925,
      "learning_rate": 0.00012895048502539882,
      "loss": 1.0643,
      "step": 64
    },
    {
      "epoch": 0.30123099203475745,
      "grad_norm": 1.2514058351516724,
      "learning_rate": 0.00012803300858899104,
      "loss": 1.0204,
      "step": 65
    },
    {
      "epoch": 0.3058653149891383,
      "grad_norm": 1.038238525390625,
      "learning_rate": 0.0001270993777844248,
      "loss": 0.7876,
      "step": 66
    },
    {
      "epoch": 0.3104996379435192,
      "grad_norm": 1.313706874847412,
      "learning_rate": 0.0001261498770046874,
      "loss": 1.0337,
      "step": 67
    },
    {
      "epoch": 0.3151339608979001,
      "grad_norm": 1.1323295831680298,
      "learning_rate": 0.00012518479547691435,
      "loss": 0.8007,
      "step": 68
    },
    {
      "epoch": 0.31976828385228095,
      "grad_norm": 1.0297540426254272,
      "learning_rate": 0.00012420442717428804,
      "loss": 0.9792,
      "step": 69
    },
    {
      "epoch": 0.32440260680666183,
      "grad_norm": 1.201749563217163,
      "learning_rate": 0.00012320907072649044,
      "loss": 1.0579,
      "step": 70
    },
    {
      "epoch": 0.3290369297610427,
      "grad_norm": 1.0487987995147705,
      "learning_rate": 0.0001221990293287378,
      "loss": 0.8679,
      "step": 71
    },
    {
      "epoch": 0.3336712527154236,
      "grad_norm": 0.9669104814529419,
      "learning_rate": 0.00012117461064942435,
      "loss": 0.9195,
      "step": 72
    },
    {
      "epoch": 0.3383055756698045,
      "grad_norm": 1.1065351963043213,
      "learning_rate": 0.00012013612673640363,
      "loss": 0.9479,
      "step": 73
    },
    {
      "epoch": 0.3429398986241854,
      "grad_norm": 1.1072312593460083,
      "learning_rate": 0.00011908389392193547,
      "loss": 0.8467,
      "step": 74
    },
    {
      "epoch": 0.34757422157856627,
      "grad_norm": 1.1270943880081177,
      "learning_rate": 0.00011801823272632844,
      "loss": 0.7588,
      "step": 75
    },
    {
      "epoch": 0.35220854453294714,
      "grad_norm": 1.2270523309707642,
      "learning_rate": 0.00011693946776030599,
      "loss": 0.9326,
      "step": 76
    },
    {
      "epoch": 0.356842867487328,
      "grad_norm": 1.2635189294815063,
      "learning_rate": 0.00011584792762612703,
      "loss": 0.9417,
      "step": 77
    },
    {
      "epoch": 0.3614771904417089,
      "grad_norm": 1.3055872917175293,
      "learning_rate": 0.00011474394481749035,
      "loss": 0.9683,
      "step": 78
    },
    {
      "epoch": 0.36611151339608977,
      "grad_norm": 1.0037323236465454,
      "learning_rate": 0.00011362785561825406,
      "loss": 0.6438,
      "step": 79
    },
    {
      "epoch": 0.37074583635047065,
      "grad_norm": 1.0581717491149902,
      "learning_rate": 0.0001125,
      "loss": 0.8729,
      "step": 80
    },
    {
      "epoch": 0.3753801593048516,
      "grad_norm": 1.194458246231079,
      "learning_rate": 0.00011136072151847529,
      "loss": 0.7414,
      "step": 81
    },
    {
      "epoch": 0.38001448225923246,
      "grad_norm": 1.3376874923706055,
      "learning_rate": 0.00011021036720894179,
      "loss": 0.7101,
      "step": 82
    },
    {
      "epoch": 0.38464880521361333,
      "grad_norm": 1.5701568126678467,
      "learning_rate": 0.00010904928748046599,
      "loss": 0.6721,
      "step": 83
    },
    {
      "epoch": 0.3892831281679942,
      "grad_norm": 1.53782057762146,
      "learning_rate": 0.0001078778360091808,
      "loss": 0.7689,
      "step": 84
    },
    {
      "epoch": 0.3939174511223751,
      "grad_norm": 1.1315703392028809,
      "learning_rate": 0.00010669636963055245,
      "loss": 0.6933,
      "step": 85
    },
    {
      "epoch": 0.39855177407675596,
      "grad_norm": 1.1853336095809937,
      "learning_rate": 0.00010550524823068502,
      "loss": 0.7163,
      "step": 86
    },
    {
      "epoch": 0.40318609703113684,
      "grad_norm": 1.1081809997558594,
      "learning_rate": 0.00010430483463669551,
      "loss": 0.8206,
      "step": 87
    },
    {
      "epoch": 0.40782041998551777,
      "grad_norm": 1.1602833271026611,
      "learning_rate": 0.0001030954945061934,
      "loss": 0.632,
      "step": 88
    },
    {
      "epoch": 0.41245474293989864,
      "grad_norm": 1.3947807550430298,
      "learning_rate": 0.0001018775962158975,
      "loss": 0.668,
      "step": 89
    },
    {
      "epoch": 0.4170890658942795,
      "grad_norm": 1.2622536420822144,
      "learning_rate": 0.00010065151074942516,
      "loss": 0.5766,
      "step": 90
    },
    {
      "epoch": 0.4217233888486604,
      "grad_norm": 1.39005708694458,
      "learning_rate": 9.941761158428674e-05,
      "loss": 0.5234,
      "step": 91
    },
    {
      "epoch": 0.4263577118030413,
      "grad_norm": 1.3181982040405273,
      "learning_rate": 9.817627457812105e-05,
      "loss": 0.575,
      "step": 92
    },
    {
      "epoch": 0.43099203475742215,
      "grad_norm": 1.385236144065857,
      "learning_rate": 9.692787785420525e-05,
      "loss": 0.7406,
      "step": 93
    },
    {
      "epoch": 0.435626357711803,
      "grad_norm": 1.2025413513183594,
      "learning_rate": 9.567280168627493e-05,
      "loss": 0.565,
      "step": 94
    },
    {
      "epoch": 0.4402606806661839,
      "grad_norm": 1.1163705587387085,
      "learning_rate": 9.441142838268905e-05,
      "loss": 0.5613,
      "step": 95
    },
    {
      "epoch": 0.44489500362056483,
      "grad_norm": 1.2544317245483398,
      "learning_rate": 9.314414216997507e-05,
      "loss": 0.7618,
      "step": 96
    },
    {
      "epoch": 0.4495293265749457,
      "grad_norm": 1.1881450414657593,
      "learning_rate": 9.187132907578987e-05,
      "loss": 0.7518,
      "step": 97
    },
    {
      "epoch": 0.4541636495293266,
      "grad_norm": 1.1118865013122559,
      "learning_rate": 9.059337681133192e-05,
      "loss": 0.5474,
      "step": 98
    },
    {
      "epoch": 0.45879797248370746,
      "grad_norm": 1.0739349126815796,
      "learning_rate": 8.931067465324085e-05,
      "loss": 0.5227,
      "step": 99
    },
    {
      "epoch": 0.46343229543808834,
      "grad_norm": 1.2055435180664062,
      "learning_rate": 8.802361332501978e-05,
      "loss": 0.6742,
      "step": 100
    },
    {
      "epoch": 0.46343229543808834,
      "eval_loss": 0.6104360818862915,
      "eval_runtime": 135.03,
      "eval_samples_per_second": 5.384,
      "eval_steps_per_second": 2.696,
      "step": 100
    },
    {
      "epoch": 0.4680666183924692,
      "grad_norm": 1.332160472869873,
      "learning_rate": 8.673258487801731e-05,
      "loss": 0.5866,
      "step": 101
    },
    {
      "epoch": 0.4727009413468501,
      "grad_norm": 1.0405492782592773,
      "learning_rate": 8.54379825720049e-05,
      "loss": 0.5656,
      "step": 102
    },
    {
      "epoch": 0.47733526430123097,
      "grad_norm": 1.373766303062439,
      "learning_rate": 8.414020075538605e-05,
      "loss": 0.5363,
      "step": 103
    },
    {
      "epoch": 0.4819695872556119,
      "grad_norm": 1.1279383897781372,
      "learning_rate": 8.2839634745074e-05,
      "loss": 0.474,
      "step": 104
    },
    {
      "epoch": 0.4866039102099928,
      "grad_norm": 1.3224053382873535,
      "learning_rate": 8.153668070607437e-05,
      "loss": 0.7299,
      "step": 105
    },
    {
      "epoch": 0.49123823316437365,
      "grad_norm": 1.1764270067214966,
      "learning_rate": 8.023173553080938e-05,
      "loss": 0.5553,
      "step": 106
    },
    {
      "epoch": 0.4958725561187545,
      "grad_norm": 1.3858931064605713,
      "learning_rate": 7.89251967182208e-05,
      "loss": 0.6029,
      "step": 107
    },
    {
      "epoch": 0.5005068790731354,
      "grad_norm": 1.2218059301376343,
      "learning_rate": 7.761746225268758e-05,
      "loss": 0.5179,
      "step": 108
    },
    {
      "epoch": 0.5051412020275163,
      "grad_norm": 1.186583399772644,
      "learning_rate": 7.630893048279627e-05,
      "loss": 0.592,
      "step": 109
    },
    {
      "epoch": 0.5097755249818972,
      "grad_norm": 1.1901055574417114,
      "learning_rate": 7.5e-05,
      "loss": 0.4786,
      "step": 110
    },
    {
      "epoch": 0.5144098479362781,
      "grad_norm": 1.2109345197677612,
      "learning_rate": 7.369106951720373e-05,
      "loss": 0.4874,
      "step": 111
    },
    {
      "epoch": 0.5190441708906589,
      "grad_norm": 1.2095303535461426,
      "learning_rate": 7.238253774731244e-05,
      "loss": 0.5826,
      "step": 112
    },
    {
      "epoch": 0.5236784938450398,
      "grad_norm": 1.4487143754959106,
      "learning_rate": 7.10748032817792e-05,
      "loss": 0.6947,
      "step": 113
    },
    {
      "epoch": 0.5283128167994207,
      "grad_norm": 1.4283783435821533,
      "learning_rate": 6.976826446919059e-05,
      "loss": 0.6258,
      "step": 114
    },
    {
      "epoch": 0.5329471397538016,
      "grad_norm": 1.387428879737854,
      "learning_rate": 6.846331929392562e-05,
      "loss": 0.5088,
      "step": 115
    },
    {
      "epoch": 0.5375814627081825,
      "grad_norm": 1.3152918815612793,
      "learning_rate": 6.7160365254926e-05,
      "loss": 0.4329,
      "step": 116
    },
    {
      "epoch": 0.5422157856625633,
      "grad_norm": 1.493445873260498,
      "learning_rate": 6.585979924461394e-05,
      "loss": 0.5035,
      "step": 117
    },
    {
      "epoch": 0.5468501086169443,
      "grad_norm": 1.2388755083084106,
      "learning_rate": 6.45620174279951e-05,
      "loss": 0.4064,
      "step": 118
    },
    {
      "epoch": 0.5514844315713251,
      "grad_norm": 1.163062572479248,
      "learning_rate": 6.326741512198266e-05,
      "loss": 0.3774,
      "step": 119
    },
    {
      "epoch": 0.556118754525706,
      "grad_norm": 1.305509328842163,
      "learning_rate": 6.197638667498022e-05,
      "loss": 0.4626,
      "step": 120
    },
    {
      "epoch": 0.5607530774800868,
      "grad_norm": 1.2311328649520874,
      "learning_rate": 6.068932534675913e-05,
      "loss": 0.4069,
      "step": 121
    },
    {
      "epoch": 0.5653874004344678,
      "grad_norm": 1.3704758882522583,
      "learning_rate": 5.9406623188668055e-05,
      "loss": 0.5308,
      "step": 122
    },
    {
      "epoch": 0.5700217233888487,
      "grad_norm": 1.2310731410980225,
      "learning_rate": 5.812867092421013e-05,
      "loss": 0.4324,
      "step": 123
    },
    {
      "epoch": 0.5746560463432295,
      "grad_norm": 1.2407798767089844,
      "learning_rate": 5.685585783002493e-05,
      "loss": 0.4992,
      "step": 124
    },
    {
      "epoch": 0.5792903692976105,
      "grad_norm": 1.2309188842773438,
      "learning_rate": 5.558857161731093e-05,
      "loss": 0.3967,
      "step": 125
    },
    {
      "epoch": 0.5839246922519913,
      "grad_norm": 1.2751799821853638,
      "learning_rate": 5.4327198313725064e-05,
      "loss": 0.4109,
      "step": 126
    },
    {
      "epoch": 0.5885590152063722,
      "grad_norm": 1.3883857727050781,
      "learning_rate": 5.307212214579474e-05,
      "loss": 0.3867,
      "step": 127
    },
    {
      "epoch": 0.593193338160753,
      "grad_norm": 1.410216212272644,
      "learning_rate": 5.182372542187895e-05,
      "loss": 0.3541,
      "step": 128
    },
    {
      "epoch": 0.597827661115134,
      "grad_norm": 1.3435072898864746,
      "learning_rate": 5.058238841571326e-05,
      "loss": 0.4067,
      "step": 129
    },
    {
      "epoch": 0.6024619840695149,
      "grad_norm": 1.3180471658706665,
      "learning_rate": 4.934848925057484e-05,
      "loss": 0.3048,
      "step": 130
    },
    {
      "epoch": 0.6070963070238957,
      "grad_norm": 1.1778309345245361,
      "learning_rate": 4.812240378410248e-05,
      "loss": 0.3317,
      "step": 131
    },
    {
      "epoch": 0.6117306299782767,
      "grad_norm": 1.2751339673995972,
      "learning_rate": 4.690450549380659e-05,
      "loss": 0.3,
      "step": 132
    },
    {
      "epoch": 0.6163649529326575,
      "grad_norm": 1.1635668277740479,
      "learning_rate": 4.569516536330447e-05,
      "loss": 0.2464,
      "step": 133
    },
    {
      "epoch": 0.6209992758870384,
      "grad_norm": 1.4394093751907349,
      "learning_rate": 4.449475176931499e-05,
      "loss": 0.3776,
      "step": 134
    },
    {
      "epoch": 0.6256335988414192,
      "grad_norm": 1.3860478401184082,
      "learning_rate": 4.3303630369447554e-05,
      "loss": 0.3414,
      "step": 135
    },
    {
      "epoch": 0.6302679217958002,
      "grad_norm": 0.9536806344985962,
      "learning_rate": 4.212216399081918e-05,
      "loss": 0.2116,
      "step": 136
    },
    {
      "epoch": 0.634902244750181,
      "grad_norm": 1.092250943183899,
      "learning_rate": 4.095071251953399e-05,
      "loss": 0.2411,
      "step": 137
    },
    {
      "epoch": 0.6395365677045619,
      "grad_norm": 1.3056756258010864,
      "learning_rate": 3.978963279105821e-05,
      "loss": 0.3435,
      "step": 138
    },
    {
      "epoch": 0.6441708906589428,
      "grad_norm": 1.1005077362060547,
      "learning_rate": 3.863927848152472e-05,
      "loss": 0.2612,
      "step": 139
    },
    {
      "epoch": 0.6488052136133237,
      "grad_norm": 1.237870454788208,
      "learning_rate": 3.750000000000001e-05,
      "loss": 0.3284,
      "step": 140
    },
    {
      "epoch": 0.6534395365677046,
      "grad_norm": 1.3307782411575317,
      "learning_rate": 3.637214438174593e-05,
      "loss": 0.2536,
      "step": 141
    },
    {
      "epoch": 0.6580738595220854,
      "grad_norm": 1.4346413612365723,
      "learning_rate": 3.525605518250964e-05,
      "loss": 0.2911,
      "step": 142
    },
    {
      "epoch": 0.6627081824764663,
      "grad_norm": 1.2083615064620972,
      "learning_rate": 3.415207237387297e-05,
      "loss": 0.233,
      "step": 143
    },
    {
      "epoch": 0.6673425054308472,
      "grad_norm": 1.4748581647872925,
      "learning_rate": 3.3060532239693994e-05,
      "loss": 0.319,
      "step": 144
    },
    {
      "epoch": 0.6719768283852281,
      "grad_norm": 1.2144207954406738,
      "learning_rate": 3.198176727367156e-05,
      "loss": 0.1959,
      "step": 145
    },
    {
      "epoch": 0.676611151339609,
      "grad_norm": 1.5745162963867188,
      "learning_rate": 3.091610607806452e-05,
      "loss": 0.3077,
      "step": 146
    },
    {
      "epoch": 0.6812454742939898,
      "grad_norm": 1.1295483112335205,
      "learning_rate": 2.986387326359637e-05,
      "loss": 0.2328,
      "step": 147
    },
    {
      "epoch": 0.6858797972483708,
      "grad_norm": 1.218430757522583,
      "learning_rate": 2.8825389350575624e-05,
      "loss": 0.2504,
      "step": 148
    },
    {
      "epoch": 0.6905141202027516,
      "grad_norm": 1.1782724857330322,
      "learning_rate": 2.78009706712622e-05,
      "loss": 0.2519,
      "step": 149
    },
    {
      "epoch": 0.6951484431571325,
      "grad_norm": 1.3294053077697754,
      "learning_rate": 2.6790929273509545e-05,
      "loss": 0.249,
      "step": 150
    },
    {
      "epoch": 0.6951484431571325,
      "eval_loss": 0.25855064392089844,
      "eval_runtime": 135.0303,
      "eval_samples_per_second": 5.384,
      "eval_steps_per_second": 2.696,
      "step": 150
    },
    {
      "epoch": 0.6997827661115134,
      "grad_norm": 0.8559562563896179,
      "learning_rate": 2.579557282571196e-05,
      "loss": 0.1331,
      "step": 151
    },
    {
      "epoch": 0.7044170890658943,
      "grad_norm": 1.5178470611572266,
      "learning_rate": 2.4815204523085654e-05,
      "loss": 0.3093,
      "step": 152
    },
    {
      "epoch": 0.7090514120202752,
      "grad_norm": 1.3319201469421387,
      "learning_rate": 2.385012299531262e-05,
      "loss": 0.3123,
      "step": 153
    },
    {
      "epoch": 0.713685734974656,
      "grad_norm": 1.3699779510498047,
      "learning_rate": 2.2900622215575197e-05,
      "loss": 0.2744,
      "step": 154
    },
    {
      "epoch": 0.718320057929037,
      "grad_norm": 1.4394081830978394,
      "learning_rate": 2.1966991411008938e-05,
      "loss": 0.2854,
      "step": 155
    },
    {
      "epoch": 0.7229543808834178,
      "grad_norm": 1.1516926288604736,
      "learning_rate": 2.1049514974601175e-05,
      "loss": 0.232,
      "step": 156
    },
    {
      "epoch": 0.7275887038377987,
      "grad_norm": 1.226991891860962,
      "learning_rate": 2.0148472378562215e-05,
      "loss": 0.2174,
      "step": 157
    },
    {
      "epoch": 0.7322230267921795,
      "grad_norm": 1.3236987590789795,
      "learning_rate": 1.926413808919542e-05,
      "loss": 0.2859,
      "step": 158
    },
    {
      "epoch": 0.7368573497465605,
      "grad_norm": 1.3909859657287598,
      "learning_rate": 1.8396781483292098e-05,
      "loss": 0.1741,
      "step": 159
    },
    {
      "epoch": 0.7414916727009413,
      "grad_norm": 1.3062829971313477,
      "learning_rate": 1.7546666766076655e-05,
      "loss": 0.2003,
      "step": 160
    },
    {
      "epoch": 0.7461259956553222,
      "grad_norm": 1.4208601713180542,
      "learning_rate": 1.671405289072718e-05,
      "loss": 0.2306,
      "step": 161
    },
    {
      "epoch": 0.7507603186097032,
      "grad_norm": 1.130900502204895,
      "learning_rate": 1.5899193479495857e-05,
      "loss": 0.1712,
      "step": 162
    },
    {
      "epoch": 0.755394641564084,
      "grad_norm": 1.3274939060211182,
      "learning_rate": 1.5102336746453053e-05,
      "loss": 0.2057,
      "step": 163
    },
    {
      "epoch": 0.7600289645184649,
      "grad_norm": 1.5153155326843262,
      "learning_rate": 1.4323725421878949e-05,
      "loss": 0.2792,
      "step": 164
    },
    {
      "epoch": 0.7646632874728457,
      "grad_norm": 1.2106071710586548,
      "learning_rate": 1.3563596678325606e-05,
      "loss": 0.213,
      "step": 165
    },
    {
      "epoch": 0.7692976104272267,
      "grad_norm": 1.3996082544326782,
      "learning_rate": 1.2822182058371878e-05,
      "loss": 0.2113,
      "step": 166
    },
    {
      "epoch": 0.7739319333816075,
      "grad_norm": 1.1402256488800049,
      "learning_rate": 1.2099707404093203e-05,
      "loss": 0.1599,
      "step": 167
    },
    {
      "epoch": 0.7785662563359884,
      "grad_norm": 1.6671884059906006,
      "learning_rate": 1.1396392788268052e-05,
      "loss": 0.301,
      "step": 168
    },
    {
      "epoch": 0.7832005792903693,
      "grad_norm": 1.1182959079742432,
      "learning_rate": 1.0712452447341582e-05,
      "loss": 0.1367,
      "step": 169
    },
    {
      "epoch": 0.7878349022447502,
      "grad_norm": 1.0370293855667114,
      "learning_rate": 1.0048094716167095e-05,
      "loss": 0.1441,
      "step": 170
    },
    {
      "epoch": 0.7924692251991311,
      "grad_norm": 1.155311942100525,
      "learning_rate": 9.40352196454532e-06,
      "loss": 0.1666,
      "step": 171
    },
    {
      "epoch": 0.7971035481535119,
      "grad_norm": 1.0911647081375122,
      "learning_rate": 8.778930535580474e-06,
      "loss": 0.185,
      "step": 172
    },
    {
      "epoch": 0.8017378711078929,
      "grad_norm": 0.9174829721450806,
      "learning_rate": 8.174510685872415e-06,
      "loss": 0.1147,
      "step": 173
    },
    {
      "epoch": 0.8063721940622737,
      "grad_norm": 1.362585425376892,
      "learning_rate": 7.5904465275624884e-06,
      "loss": 0.2452,
      "step": 174
    },
    {
      "epoch": 0.8110065170166546,
      "grad_norm": 1.197021722793579,
      "learning_rate": 7.026915972251254e-06,
      "loss": 0.2385,
      "step": 175
    },
    {
      "epoch": 0.8156408399710355,
      "grad_norm": 1.0614386796951294,
      "learning_rate": 6.484090676804926e-06,
      "loss": 0.1824,
      "step": 176
    },
    {
      "epoch": 0.8202751629254164,
      "grad_norm": 1.1456388235092163,
      "learning_rate": 5.962135991066971e-06,
      "loss": 0.1921,
      "step": 177
    },
    {
      "epoch": 0.8249094858797973,
      "grad_norm": 1.2095783948898315,
      "learning_rate": 5.461210907490951e-06,
      "loss": 0.1803,
      "step": 178
    },
    {
      "epoch": 0.8295438088341781,
      "grad_norm": 1.2167255878448486,
      "learning_rate": 4.981468012709877e-06,
      "loss": 0.1777,
      "step": 179
    },
    {
      "epoch": 0.834178131788559,
      "grad_norm": 1.2779210805892944,
      "learning_rate": 4.523053441056876e-06,
      "loss": 0.2648,
      "step": 180
    },
    {
      "epoch": 0.8388124547429399,
      "grad_norm": 1.1896816492080688,
      "learning_rate": 4.086106830051236e-06,
      "loss": 0.1531,
      "step": 181
    },
    {
      "epoch": 0.8434467776973208,
      "grad_norm": 1.0845763683319092,
      "learning_rate": 3.670761277863485e-06,
      "loss": 0.1953,
      "step": 182
    },
    {
      "epoch": 0.8480811006517016,
      "grad_norm": 1.338794231414795,
      "learning_rate": 3.277143302772342e-06,
      "loss": 0.1878,
      "step": 183
    },
    {
      "epoch": 0.8527154236060825,
      "grad_norm": 1.036252737045288,
      "learning_rate": 2.9053728046260825e-06,
      "loss": 0.1352,
      "step": 184
    },
    {
      "epoch": 0.8573497465604635,
      "grad_norm": 1.3027642965316772,
      "learning_rate": 2.555563028319885e-06,
      "loss": 0.2213,
      "step": 185
    },
    {
      "epoch": 0.8619840695148443,
      "grad_norm": 1.0671361684799194,
      "learning_rate": 2.227820529300264e-06,
      "loss": 0.2172,
      "step": 186
    },
    {
      "epoch": 0.8666183924692252,
      "grad_norm": 1.0457696914672852,
      "learning_rate": 1.9222451411073645e-06,
      "loss": 0.1704,
      "step": 187
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 17,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1160316364382536e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}