{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.990375360923965,
  "eval_steps": 500,
  "global_step": 5190,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0019249278152069298, "grad_norm": 9.235594749450684, "learning_rate": 3.8535645472061657e-07, "loss": 2.3328, "step": 1},
    {"epoch": 0.009624639076034648, "grad_norm": 9.342337608337402, "learning_rate": 1.9267822736030827e-06, "loss": 2.3107, "step": 5},
    {"epoch": 0.019249278152069296, "grad_norm": 8.154550552368164, "learning_rate": 3.853564547206165e-06, "loss": 2.3049, "step": 10},
    {"epoch": 0.028873917228103944, "grad_norm": 5.9875688552856445, "learning_rate": 5.780346820809249e-06, "loss": 2.1949, "step": 15},
    {"epoch": 0.03849855630413859, "grad_norm": 2.7122750282287598, "learning_rate": 7.70712909441233e-06, "loss": 2.0383, "step": 20},
    {"epoch": 0.04812319538017324, "grad_norm": 1.6343287229537964, "learning_rate": 9.633911368015415e-06, "loss": 1.9244, "step": 25},
    {"epoch": 0.05774783445620789, "grad_norm": 0.805985152721405, "learning_rate": 1.1560693641618498e-05, "loss": 1.8037, "step": 30},
    {"epoch": 0.06737247353224254, "grad_norm": 0.685213029384613, "learning_rate": 1.348747591522158e-05, "loss": 1.7133, "step": 35},
    {"epoch": 0.07699711260827719, "grad_norm": 0.5439901351928711, "learning_rate": 1.541425818882466e-05, "loss": 1.6271, "step": 40},
    {"epoch": 0.08662175168431184, "grad_norm": 0.5319092273712158, "learning_rate": 1.7341040462427746e-05, "loss": 1.5405, "step": 45},
    {"epoch": 0.09624639076034648, "grad_norm": 0.5163573026657104, "learning_rate": 1.926782273603083e-05, "loss": 1.4612, "step": 50},
    {"epoch": 0.10587102983638114, "grad_norm": 0.4213581085205078, "learning_rate": 2.119460500963391e-05, "loss": 1.3647, "step": 55},
    {"epoch": 0.11549566891241578, "grad_norm": 0.37413254380226135, "learning_rate": 2.3121387283236996e-05, "loss": 1.3279, "step": 60},
    {"epoch": 0.12512030798845045, "grad_norm": 0.3393540680408478, "learning_rate": 2.504816955684008e-05, "loss": 1.2962, "step": 65},
    {"epoch": 0.1347449470644851, "grad_norm": 0.3041280210018158, "learning_rate": 2.697495183044316e-05, "loss": 1.2746, "step": 70},
    {"epoch": 0.14436958614051973, "grad_norm": 0.29960623383522034, "learning_rate": 2.8901734104046245e-05, "loss": 1.2432, "step": 75},
    {"epoch": 0.15399422521655437, "grad_norm": 0.28563690185546875, "learning_rate": 3.082851637764932e-05, "loss": 1.224, "step": 80},
    {"epoch": 0.16361886429258904, "grad_norm": 0.3082931339740753, "learning_rate": 3.275529865125241e-05, "loss": 1.2034, "step": 85},
    {"epoch": 0.17324350336862368, "grad_norm": 0.3015296757221222, "learning_rate": 3.468208092485549e-05, "loss": 1.186, "step": 90},
    {"epoch": 0.18286814244465832, "grad_norm": 0.330247163772583, "learning_rate": 3.660886319845858e-05, "loss": 1.1795, "step": 95},
    {"epoch": 0.19249278152069296, "grad_norm": 0.30705899000167847, "learning_rate": 3.853564547206166e-05, "loss": 1.171, "step": 100},
    {"epoch": 0.20211742059672763, "grad_norm": 0.3239520192146301, "learning_rate": 4.046242774566474e-05, "loss": 1.1629, "step": 105},
    {"epoch": 0.21174205967276227, "grad_norm": 0.31190788745880127, "learning_rate": 4.238921001926782e-05, "loss": 1.1507, "step": 110},
    {"epoch": 0.22136669874879691, "grad_norm": 0.3129926025867462, "learning_rate": 4.43159922928709e-05, "loss": 1.1597, "step": 115},
    {"epoch": 0.23099133782483156, "grad_norm": 0.32413914799690247, "learning_rate": 4.624277456647399e-05, "loss": 1.1507, "step": 120},
    {"epoch": 0.24061597690086622, "grad_norm": 0.41083359718322754, "learning_rate": 4.816955684007707e-05, "loss": 1.1259, "step": 125},
    {"epoch": 0.2502406159769009, "grad_norm": 0.3095736801624298, "learning_rate": 5.009633911368016e-05, "loss": 1.124, "step": 130},
    {"epoch": 0.2598652550529355, "grad_norm": 0.3358061611652374, "learning_rate": 5.2023121387283234e-05, "loss": 1.1299, "step": 135},
    {"epoch": 0.2694898941289702, "grad_norm": 0.37028777599334717, "learning_rate": 5.394990366088632e-05, "loss": 1.1085, "step": 140},
    {"epoch": 0.2791145332050048, "grad_norm": 0.3638240396976471, "learning_rate": 5.58766859344894e-05, "loss": 1.1139, "step": 145},
    {"epoch": 0.28873917228103946, "grad_norm": 0.3208532929420471, "learning_rate": 5.780346820809249e-05, "loss": 1.0867, "step": 150},
    {"epoch": 0.2983638113570741, "grad_norm": 0.325976699590683, "learning_rate": 5.973025048169557e-05, "loss": 1.0794, "step": 155},
    {"epoch": 0.30798845043310874, "grad_norm": 0.3301510214805603, "learning_rate": 6.165703275529865e-05, "loss": 1.0811, "step": 160},
    {"epoch": 0.3176130895091434, "grad_norm": 0.35519587993621826, "learning_rate": 6.358381502890174e-05, "loss": 1.076, "step": 165},
    {"epoch": 0.3272377285851781, "grad_norm": 0.38242989778518677, "learning_rate": 6.551059730250482e-05, "loss": 1.0774, "step": 170},
    {"epoch": 0.3368623676612127, "grad_norm": 0.3178574740886688, "learning_rate": 6.74373795761079e-05, "loss": 1.0678, "step": 175},
    {"epoch": 0.34648700673724736, "grad_norm": 0.2955685257911682, "learning_rate": 6.936416184971098e-05, "loss": 1.0741, "step": 180},
    {"epoch": 0.35611164581328203, "grad_norm": 0.3037715554237366, "learning_rate": 7.129094412331408e-05, "loss": 1.0649, "step": 185},
    {"epoch": 0.36573628488931664, "grad_norm": 0.3199213445186615, "learning_rate": 7.321772639691716e-05, "loss": 1.0635, "step": 190},
    {"epoch": 0.3753609239653513, "grad_norm": 0.317488431930542, "learning_rate": 7.514450867052023e-05, "loss": 1.0526, "step": 195},
    {"epoch": 0.3849855630413859, "grad_norm": 0.3228258490562439, "learning_rate": 7.707129094412332e-05, "loss": 1.064, "step": 200},
    {"epoch": 0.3946102021174206, "grad_norm": 0.2934040129184723, "learning_rate": 7.89980732177264e-05, "loss": 1.0544, "step": 205},
    {"epoch": 0.40423484119345526, "grad_norm": 0.32170167565345764, "learning_rate": 8.092485549132948e-05, "loss": 1.0508, "step": 210},
    {"epoch": 0.4138594802694899, "grad_norm": 0.29049986600875854, "learning_rate": 8.285163776493256e-05, "loss": 1.0611, "step": 215},
    {"epoch": 0.42348411934552455, "grad_norm": 0.31131693720817566, "learning_rate": 8.477842003853564e-05, "loss": 1.0581, "step": 220},
    {"epoch": 0.4331087584215592, "grad_norm": 0.2872338891029358, "learning_rate": 8.670520231213874e-05, "loss": 1.0512, "step": 225},
    {"epoch": 0.44273339749759383, "grad_norm": 0.3063661754131317, "learning_rate": 8.86319845857418e-05, "loss": 1.0508, "step": 230},
    {"epoch": 0.4523580365736285, "grad_norm": 0.30761733651161194, "learning_rate": 9.05587668593449e-05, "loss": 1.0549, "step": 235},
    {"epoch": 0.4619826756496631, "grad_norm": 0.2758205533027649, "learning_rate": 9.248554913294798e-05, "loss": 1.0446, "step": 240},
    {"epoch": 0.4716073147256978, "grad_norm": 0.3492432236671448, "learning_rate": 9.441233140655106e-05, "loss": 1.0511, "step": 245},
    {"epoch": 0.48123195380173245, "grad_norm": 0.27041804790496826, "learning_rate": 9.633911368015414e-05, "loss": 1.0275, "step": 250},
    {"epoch": 0.49085659287776706, "grad_norm": 0.2999095916748047, "learning_rate": 9.826589595375723e-05, "loss": 1.0433, "step": 255},
    {"epoch": 0.5004812319538018, "grad_norm": 0.297323614358902, "learning_rate": 0.00010019267822736032, "loss": 1.0416, "step": 260},
    {"epoch": 0.5101058710298364, "grad_norm": 0.3357987403869629, "learning_rate": 0.00010211946050096339, "loss": 1.0374, "step": 265},
    {"epoch": 0.519730510105871, "grad_norm": 0.2953435778617859, "learning_rate": 0.00010404624277456647, "loss": 1.0352, "step": 270},
    {"epoch": 0.5293551491819056, "grad_norm": 0.32853737473487854, "learning_rate": 0.00010597302504816958, "loss": 1.0529, "step": 275},
    {"epoch": 0.5389797882579404, "grad_norm": 0.28152966499328613, "learning_rate": 0.00010789980732177264, "loss": 1.0427, "step": 280},
    {"epoch": 0.548604427333975, "grad_norm": 0.2928714454174042, "learning_rate": 0.00010982658959537572, "loss": 1.0375, "step": 285},
    {"epoch": 0.5582290664100096, "grad_norm": 0.29662230610847473, "learning_rate": 0.0001117533718689788, "loss": 1.0326, "step": 290},
    {"epoch": 0.5678537054860443, "grad_norm": 0.2677823305130005, "learning_rate": 0.00011368015414258189, "loss": 1.0477, "step": 295},
    {"epoch": 0.5774783445620789, "grad_norm": 0.2860727906227112, "learning_rate": 0.00011560693641618498, "loss": 1.0272, "step": 300},
    {"epoch": 0.5871029836381135, "grad_norm": 0.2599497437477112, "learning_rate": 0.00011753371868978806, "loss": 1.0364, "step": 305},
    {"epoch": 0.5967276227141483, "grad_norm": 0.26607978343963623, "learning_rate": 0.00011946050096339114, "loss": 1.0338, "step": 310},
    {"epoch": 0.6063522617901829, "grad_norm": 0.2653907239437103, "learning_rate": 0.00012138728323699422, "loss": 1.0274, "step": 315},
    {"epoch": 0.6159769008662175, "grad_norm": 0.2570829689502716, "learning_rate": 0.0001233140655105973, "loss": 1.0349, "step": 320},
    {"epoch": 0.6256015399422522, "grad_norm": 0.2542014420032501, "learning_rate": 0.00012524084778420039, "loss": 1.0306, "step": 325},
    {"epoch": 0.6352261790182868, "grad_norm": 0.2354612797498703, "learning_rate": 0.00012716763005780348, "loss": 1.0336, "step": 330},
    {"epoch": 0.6448508180943214, "grad_norm": 0.26090219616889954, "learning_rate": 0.00012909441233140655, "loss": 1.0319, "step": 335},
    {"epoch": 0.6544754571703562, "grad_norm": 0.2287357598543167, "learning_rate": 0.00013102119460500964, "loss": 1.0228, "step": 340},
    {"epoch": 0.6641000962463908, "grad_norm": 0.2653840184211731, "learning_rate": 0.0001329479768786127, "loss": 1.019, "step": 345},
    {"epoch": 0.6737247353224254, "grad_norm": 0.25462430715560913, "learning_rate": 0.0001348747591522158, "loss": 1.0289, "step": 350},
    {"epoch": 0.6833493743984601, "grad_norm": 0.24566137790679932, "learning_rate": 0.0001368015414258189, "loss": 1.0286, "step": 355},
    {"epoch": 0.6929740134744947, "grad_norm": 0.24448491632938385, "learning_rate": 0.00013872832369942197, "loss": 1.0195, "step": 360},
    {"epoch": 0.7025986525505293, "grad_norm": 0.2303464114665985, "learning_rate": 0.00014065510597302506, "loss": 1.0328, "step": 365},
    {"epoch": 0.7122232916265641, "grad_norm": 0.2552158832550049, "learning_rate": 0.00014258188824662816, "loss": 1.0366, "step": 370},
    {"epoch": 0.7218479307025987, "grad_norm": 0.22079892456531525, "learning_rate": 0.00014450867052023122, "loss": 1.024, "step": 375},
    {"epoch": 0.7314725697786333, "grad_norm": 0.32242512702941895, "learning_rate": 0.00014643545279383432, "loss": 1.033, "step": 380},
    {"epoch": 0.7410972088546679, "grad_norm": 0.2999092936515808, "learning_rate": 0.00014836223506743738, "loss": 1.0204, "step": 385},
    {"epoch": 0.7507218479307026, "grad_norm": 0.26794490218162537, "learning_rate": 0.00015028901734104045, "loss": 1.0305, "step": 390},
    {"epoch": 0.7603464870067372, "grad_norm": 0.33896663784980774, "learning_rate": 0.00015221579961464357, "loss": 1.0308, "step": 395},
    {"epoch": 0.7699711260827719, "grad_norm": 0.22748759388923645, "learning_rate": 0.00015414258188824664, "loss": 1.0197, "step": 400},
    {"epoch": 0.7795957651588066, "grad_norm": 0.23324738442897797, "learning_rate": 0.0001560693641618497, "loss": 1.0131, "step": 405},
    {"epoch": 0.7892204042348412, "grad_norm": 0.24805064499378204, "learning_rate": 0.0001579961464354528, "loss": 1.0094, "step": 410},
    {"epoch": 0.7988450433108758, "grad_norm": 0.24965739250183105, "learning_rate": 0.00015992292870905587, "loss": 1.0203, "step": 415},
    {"epoch": 0.8084696823869105, "grad_norm": 0.22509600222110748, "learning_rate": 0.00016184971098265897, "loss": 1.0265, "step": 420},
    {"epoch": 0.8180943214629451, "grad_norm": 0.2149883359670639, "learning_rate": 0.00016377649325626206, "loss": 1.0171, "step": 425},
    {"epoch": 0.8277189605389798, "grad_norm": 0.24780240654945374, "learning_rate": 0.00016570327552986513, "loss": 1.0144, "step": 430},
    {"epoch": 0.8373435996150145, "grad_norm": 0.2780991494655609, "learning_rate": 0.00016763005780346822, "loss": 1.0145, "step": 435},
    {"epoch": 0.8469682386910491, "grad_norm": 0.22135606408119202, "learning_rate": 0.0001695568400770713, "loss": 1.0187, "step": 440},
    {"epoch": 0.8565928777670837, "grad_norm": 0.20605282485485077, "learning_rate": 0.00017148362235067438, "loss": 1.0197, "step": 445},
    {"epoch": 0.8662175168431184, "grad_norm": 0.24270793795585632, "learning_rate": 0.00017341040462427748, "loss": 1.0106, "step": 450},
    {"epoch": 0.875842155919153, "grad_norm": 0.24285346269607544, "learning_rate": 0.00017533718689788055, "loss": 1.0242, "step": 455},
    {"epoch": 0.8854667949951877, "grad_norm": 0.21814145147800446, "learning_rate": 0.0001772639691714836, "loss": 1.0176, "step": 460},
    {"epoch": 0.8950914340712224, "grad_norm": 0.22261013090610504, "learning_rate": 0.00017919075144508673, "loss": 1.0099, "step": 465},
    {"epoch": 0.904716073147257, "grad_norm": 0.21424554288387299, "learning_rate": 0.0001811175337186898, "loss": 1.0097, "step": 470},
    {"epoch": 0.9143407122232916, "grad_norm": 0.2335994988679886, "learning_rate": 0.00018304431599229287, "loss": 1.0179, "step": 475},
    {"epoch": 0.9239653512993262, "grad_norm": 0.20568034052848816, "learning_rate": 0.00018497109826589596, "loss": 1.0067, "step": 480},
    {"epoch": 0.933589990375361, "grad_norm": 0.20264984667301178, "learning_rate": 0.00018689788053949903, "loss": 1.0147, "step": 485},
    {"epoch": 0.9432146294513956, "grad_norm": 0.2133115977048874, "learning_rate": 0.00018882466281310213, "loss": 1.0071, "step": 490},
    {"epoch": 0.9528392685274302, "grad_norm": 0.2007424235343933, "learning_rate": 0.00019075144508670522, "loss": 1.0095, "step": 495},
    {"epoch": 0.9624639076034649, "grad_norm": 0.20568867027759552, "learning_rate": 0.0001926782273603083, "loss": 1.0113, "step": 500},
    {"epoch": 0.9720885466794995, "grad_norm": 0.19897951185703278, "learning_rate": 0.00019460500963391138, "loss": 1.0129, "step": 505},
    {"epoch": 0.9817131857555341, "grad_norm": 0.21554742753505707, "learning_rate": 0.00019653179190751445, "loss": 1.0107, "step": 510},
    {"epoch": 0.9913378248315688, "grad_norm": 0.20981793105602264, "learning_rate": 0.00019845857418111754, "loss": 1.0008, "step": 515},
    {"epoch": 0.9990375360923965, "eval_loss": 2.1032063961029053, "eval_runtime": 0.7869, "eval_samples_per_second": 13.979, "eval_steps_per_second": 2.542, "step": 519},
    {"epoch": 1.0009624639076036, "grad_norm": 0.22678163647651672, "learning_rate": 0.0001999999773822188, "loss": 1.0012, "step": 520},
    {"epoch": 1.0105871029836382, "grad_norm": 0.2608613967895508, "learning_rate": 0.00019999918576095053, "loss": 0.9875, "step": 525},
    {"epoch": 1.0202117420596728, "grad_norm": 0.2601936459541321, "learning_rate": 0.0001999972632608527, "loss": 0.9805, "step": 530},
    {"epoch": 1.0298363811357074, "grad_norm": 0.21544857323169708, "learning_rate": 0.00019999420990366674, "loss": 0.9805, "step": 535},
    {"epoch": 1.039461020211742, "grad_norm": 0.20171190798282623, "learning_rate": 0.00019999002572392255, "loss": 0.9798, "step": 540},
    {"epoch": 1.0490856592877766, "grad_norm": 0.2205726057291031, "learning_rate": 0.0001999847107689386, "loss": 0.9805, "step": 545},
    {"epoch": 1.0587102983638113, "grad_norm": 0.20397739112377167, "learning_rate": 0.0001999782650988211, "loss": 0.9952, "step": 550},
    {"epoch": 1.068334937439846, "grad_norm": 0.207752525806427, "learning_rate": 0.00019997068878646333, "loss": 0.9786, "step": 555},
    {"epoch": 1.0779595765158807, "grad_norm": 0.2041793167591095, "learning_rate": 0.0001999619819175449, "loss": 0.9951, "step": 560},
    {"epoch": 1.0875842155919153, "grad_norm": 0.19135500490665436, "learning_rate": 0.00019995214459053075, "loss": 0.9912, "step": 565},
    {"epoch": 1.09720885466795, "grad_norm": 0.2038804590702057, "learning_rate": 0.00019994117691667004, "loss": 0.9821, "step": 570},
    {"epoch": 1.1068334937439845, "grad_norm": 0.21948496997356415, "learning_rate": 0.00019992907901999484, "loss": 0.9933, "step": 575},
    {"epoch": 1.1164581328200192, "grad_norm": 0.21123313903808594, "learning_rate": 0.0001999158510373189, "loss": 0.9723, "step": 580},
    {"epoch": 1.126082771896054, "grad_norm": 0.2110896110534668, "learning_rate": 0.00019990149311823588, "loss": 0.9789, "step": 585},
    {"epoch": 1.1357074109720886, "grad_norm": 0.20370599627494812, "learning_rate": 0.00019988600542511766, "loss": 0.9902, "step": 590},
    {"epoch": 1.1453320500481232, "grad_norm": 0.19531656801700592, "learning_rate": 0.00019986938813311284, "loss": 0.9846, "step": 595},
    {"epoch": 1.1549566891241578, "grad_norm": 0.2497565895318985, "learning_rate": 0.00019985164143014432, "loss": 0.9864, "step": 600},
    {"epoch": 1.1645813282001924, "grad_norm": 0.2870050072669983, "learning_rate": 0.00019983276551690745, "loss": 0.9851, "step": 605},
    {"epoch": 1.174205967276227, "grad_norm": 0.20774626731872559, "learning_rate": 0.0001998127606068677, "loss": 0.9819, "step": 610},
    {"epoch": 1.1838306063522617, "grad_norm": 0.2567305266857147, "learning_rate": 0.00019979162692625817, "loss": 0.9754, "step": 615},
    {"epoch": 1.1934552454282965, "grad_norm": 0.1896723359823227, "learning_rate": 0.00019976936471407717, "loss": 0.9762, "step": 620},
    {"epoch": 1.2030798845043311, "grad_norm": 0.19382244348526, "learning_rate": 0.00019974597422208533, "loss": 0.9783, "step": 625},
    {"epoch": 1.2127045235803657, "grad_norm": 0.19210918247699738, "learning_rate": 0.00019972145571480295, "loss": 0.9778, "step": 630},
    {"epoch": 1.2223291626564003, "grad_norm": 0.2057211995124817, "learning_rate": 0.00019969580946950695, "loss": 0.9632, "step": 635},
    {"epoch": 1.231953801732435, "grad_norm": 0.23469866812229156, "learning_rate": 0.0001996690357762276, "loss": 0.9824, "step": 640},
    {"epoch": 1.2415784408084698, "grad_norm": 0.19450876116752625, "learning_rate": 0.00019964113493774538, "loss": 0.9788, "step": 645},
    {"epoch": 1.2512030798845044, "grad_norm": 0.18963035941123962, "learning_rate": 0.00019961210726958758, "loss": 0.9854, "step": 650},
    {"epoch": 1.260827718960539, "grad_norm": 0.2049696296453476, "learning_rate": 0.00019958195310002457, "loss": 0.9901, "step": 655},
    {"epoch": 1.2704523580365736, "grad_norm": 0.18745918571949005, "learning_rate": 0.00019955067277006633, "loss": 0.9772, "step": 660},
    {"epoch": 1.2800769971126083, "grad_norm": 0.1893537938594818, "learning_rate": 0.00019951826663345827, "loss": 0.9862, "step": 665},
    {"epoch": 1.2897016361886429, "grad_norm": 0.18441106379032135, "learning_rate": 0.00019948473505667757, "loss": 0.9836, "step": 670},
    {"epoch": 1.2993262752646775, "grad_norm": 0.21260684728622437, "learning_rate": 0.00019945007841892884, "loss": 0.9878, "step": 675},
    {"epoch": 1.3089509143407123, "grad_norm": 0.19159361720085144, "learning_rate": 0.00019941429711213982, "loss": 1.0004, "step": 680},
    {"epoch": 1.318575553416747, "grad_norm": 0.19893284142017365, "learning_rate": 0.000199377391540957, "loss": 0.9728, "step": 685},
    {"epoch": 1.3282001924927815, "grad_norm": 0.2625219225883484, "learning_rate": 0.00019933936212274115, "loss": 0.9815, "step": 690},
    {"epoch": 1.3378248315688162, "grad_norm": 0.20059077441692352, "learning_rate": 0.00019930020928756232, "loss": 0.9869, "step": 695},
    {"epoch": 1.3474494706448508, "grad_norm": 0.19443583488464355, "learning_rate": 0.00019925993347819532, "loss": 0.9852, "step": 700},
    {"epoch": 1.3570741097208856, "grad_norm": 0.19254858791828156, "learning_rate": 0.00019921853515011438, "loss": 0.9768, "step": 705},
    {"epoch": 1.36669874879692, "grad_norm": 0.1973366141319275, "learning_rate": 0.0001991760147714883, "loss": 0.9865, "step": 710},
    {"epoch": 1.3763233878729548, "grad_norm": 0.2019069492816925, "learning_rate": 0.00019913237282317495, "loss": 0.9701, "step": 715},
    {"epoch": 1.3859480269489894, "grad_norm": 0.20254430174827576, "learning_rate": 0.0001990876097987159, "loss": 0.9867, "step": 720},
    {"epoch": 1.395572666025024, "grad_norm": 0.2121659815311432, "learning_rate": 0.00019904172620433078, "loss": 0.9688, "step": 725},
    {"epoch": 1.4051973051010587, "grad_norm": 0.2147083729505539, "learning_rate": 0.00019899472255891176, "loss": 0.9802, "step": 730},
    {"epoch": 1.4148219441770933, "grad_norm": 0.21038152277469635, "learning_rate": 0.0001989465993940174, "loss": 0.9759, "step": 735},
    {"epoch": 1.4244465832531281, "grad_norm": 0.21153226494789124, "learning_rate": 0.00019889735725386683, "loss": 0.9735, "step": 740},
    {"epoch": 1.4340712223291627, "grad_norm": 0.2074025273323059, "learning_rate": 0.00019884699669533347, "loss": 0.9913, "step": 745},
    {"epoch": 1.4436958614051973, "grad_norm": 0.21015384793281555, "learning_rate": 0.00019879551828793892, "loss": 0.9737, "step": 750},
    {"epoch": 1.453320500481232, "grad_norm": 0.21345528960227966, "learning_rate": 0.0001987429226138463, "loss": 0.9675, "step": 755},
    {"epoch": 1.4629451395572666, "grad_norm": 0.21284109354019165, "learning_rate": 0.0001986892102678538, "loss": 0.9787, "step": 760},
    {"epoch": 1.4725697786333012, "grad_norm": 0.19105084240436554, "learning_rate": 0.0001986343818573879, "loss": 0.9714, "step": 765},
    {"epoch": 1.4821944177093358, "grad_norm": 0.18031322956085205, "learning_rate": 0.0001985784380024966, "loss": 0.965, "step": 770},
    {"epoch": 1.4918190567853706, "grad_norm": 0.19423770904541016, "learning_rate": 0.00019852137933584215, "loss": 0.9743, "step": 775},
    {"epoch": 1.5014436958614052, "grad_norm": 0.1923457533121109, "learning_rate": 0.0001984632065026943, "loss": 0.9872, "step": 780},
    {"epoch": 1.5110683349374399, "grad_norm": 0.1957743912935257, "learning_rate": 0.0001984039201609226, "loss": 0.9799, "step": 785},
    {"epoch": 1.5206929740134745, "grad_norm": 0.17838570475578308, "learning_rate": 0.0001983435209809892, "loss": 0.9765, "step": 790},
    {"epoch": 1.530317613089509, "grad_norm": 0.1872684508562088, "learning_rate": 0.00019828200964594123, "loss": 0.9768, "step": 795},
    {"epoch": 1.539942252165544, "grad_norm": 0.19497379660606384, "learning_rate": 0.00019821938685140298, "loss": 0.9686, "step": 800},
    {"epoch": 1.5495668912415783, "grad_norm": 0.18703444302082062, "learning_rate": 0.00019815565330556816, "loss": 0.9785, "step": 805},
    {"epoch": 1.5591915303176132, "grad_norm": 0.18727166950702667, "learning_rate": 0.00019809080972919181, "loss": 0.9748, "step": 810},
    {"epoch": 1.5688161693936478, "grad_norm": 0.19498740136623383, "learning_rate": 0.00019802485685558222, "loss": 0.975, "step": 815},
    {"epoch": 1.5784408084696824, "grad_norm": 0.1881551891565323, "learning_rate": 0.00019795779543059248, "loss": 0.9749, "step": 820},
    {"epoch": 1.588065447545717, "grad_norm": 0.17449571192264557, "learning_rate": 0.00019788962621261226, "loss": 0.9676, "step": 825},
    {"epoch": 1.5976900866217516, "grad_norm": 0.1892375349998474, "learning_rate": 0.0001978203499725591, "loss": 0.9722, "step": 830},
    {"epoch": 1.6073147256977864, "grad_norm": 0.19908097386360168, "learning_rate": 0.00019774996749386968, "loss": 0.9674, "step": 835},
    {"epoch": 1.6169393647738208, "grad_norm": 0.17946478724479675, "learning_rate": 0.00019767847957249108, "loss": 0.9741, "step": 840},
    {"epoch": 1.6265640038498557, "grad_norm": 0.17460967600345612, "learning_rate": 0.0001976058870168716, "loss": 0.9726, "step": 845},
    {"epoch": 1.6361886429258903, "grad_norm": 0.17595893144607544, "learning_rate": 0.0001975321906479518, "loss": 0.9783, "step": 850},
    {"epoch": 1.645813282001925, "grad_norm": 0.18718552589416504, "learning_rate": 0.00019745739129915508, "loss": 0.9746, "step": 855},
    {"epoch": 1.6554379210779597, "grad_norm": 0.18480895459651947, "learning_rate": 0.00019738148981637835, "loss": 0.9675, "step": 860},
    {"epoch": 1.6650625601539941, "grad_norm": 0.1780669093132019, "learning_rate": 0.00019730448705798239, "loss": 0.9648, "step": 865},
    {"epoch": 1.674687199230029, "grad_norm": 0.17525720596313477, "learning_rate": 0.00019722638389478217, "loss": 0.9911, "step": 870},
    {"epoch": 1.6843118383060636, "grad_norm": 0.1761050969362259, "learning_rate": 0.00019714718121003705, "loss": 0.9745, "step": 875},
    {"epoch": 1.6939364773820982, "grad_norm": 0.193415105342865, "learning_rate": 0.00019706687989944072, "loss": 0.9669, "step": 880},
    {"epoch": 1.7035611164581328, "grad_norm": 0.18116651475429535, "learning_rate": 0.00019698548087111102, "loss": 0.9573, "step": 885},
    {"epoch": 1.7131857555341674, "grad_norm": 0.17790788412094116, "learning_rate": 0.0001969029850455799, "loss": 0.9738, "step": 890},
    {"epoch": 1.7228103946102022, "grad_norm": 0.18143677711486816, "learning_rate": 0.00019681939335578275, "loss": 0.9641, "step": 895},
    {"epoch": 1.7324350336862366, "grad_norm": 0.1727439910173416, "learning_rate": 0.00019673470674704801, "loss": 0.9612, "step": 900},
    {"epoch": 1.7420596727622715, "grad_norm": 0.17776042222976685, "learning_rate": 0.00019664892617708642, "loss": 0.9704, "step": 905},
    {"epoch": 1.751684311838306, "grad_norm": 0.1788305640220642, "learning_rate": 0.00019656205261598016, "loss": 0.9822, "step": 910},
    {"epoch": 1.7613089509143407, "grad_norm": 0.18292832374572754, "learning_rate": 0.00019647408704617192, "loss": 0.981, "step": 915},
    {"epoch": 1.7709335899903753, "grad_norm": 0.1903613954782486, "learning_rate": 0.00019638503046245383, "loss": 0.9815, "step": 920},
    {"epoch": 1.78055822906641, "grad_norm": 0.18801650404930115, "learning_rate": 0.00019629488387195614, "loss": 0.9723, "step": 925},
    {"epoch": 1.7901828681424448, "grad_norm": 0.19215719401836395, "learning_rate": 0.0001962036482941359, "loss": 0.9785, "step": 930},
    {"epoch": 1.7998075072184792, "grad_norm": 0.1913854032754898, "learning_rate": 0.00019611132476076527, "loss": 0.9661, "step": 935},
    {"epoch": 1.809432146294514, "grad_norm": 0.19718807935714722, "learning_rate": 0.00019601791431592006, "loss": 0.9791, "step": 940},
    {"epoch": 1.8190567853705486, "grad_norm": 0.18217253684997559, "learning_rate": 0.00019592341801596787, "loss": 0.9575, "step": 945},
    {"epoch": 1.8286814244465832, "grad_norm": 0.17967750132083893, "learning_rate": 0.00019582783692955605, "loss": 0.9637, "step": 950},
    {"epoch": 1.838306063522618, "grad_norm": 0.17850783467292786, "learning_rate": 0.00019573117213759957, "loss": 0.9605, "step": 955},
    {"epoch": 1.8479307025986524, "grad_norm": 0.19147521257400513, "learning_rate": 0.00019563342473326913, "loss": 0.9818, "step": 960},
    {"epoch": 1.8575553416746873, "grad_norm": 0.17569051682949066, "learning_rate": 0.00019553459582197835, "loss": 0.9642, "step": 965},
    {"epoch": 1.867179980750722, "grad_norm": 0.18762874603271484, "learning_rate": 0.00019543468652137157, "loss": 0.9744, "step": 970},
    {"epoch": 1.8768046198267565, "grad_norm": 0.17426376044750214, "learning_rate": 0.00019533369796131118, "loss": 0.9725, "step": 975},
    {"epoch": 1.8864292589027911, "grad_norm": 0.21174634993076324, "learning_rate": 0.00019523163128386465, "loss": 0.9793, "step": 980},
    {"epoch": 1.8960538979788257, "grad_norm": 0.19356350600719452, "learning_rate": 0.00019512848764329188, "loss": 0.9632, "step": 985},
    {"epoch": 1.9056785370548606, "grad_norm": 0.1924716979265213, "learning_rate": 0.00019502426820603192, "loss": 0.9791, "step": 990},
    {"epoch": 1.915303176130895, "grad_norm": 0.20623841881752014, "learning_rate": 0.00019491897415068997, "loss": 0.9678, "step": 995},
    {"epoch": 1.9249278152069298, "grad_norm": 0.1916794627904892, "learning_rate": 0.00019481260666802386, "loss": 0.9677, "step": 1000},
    {"epoch": 1.9345524542829644, "grad_norm": 0.17562657594680786, "learning_rate": 0.00019470516696093072, "loss": 0.9818, "step": 1005},
    {"epoch": 1.944177093358999, "grad_norm": 0.17680735886096954, "learning_rate": 0.00019459665624443342, "loss": 0.9789, "step": 1010},
    {"epoch": 1.9538017324350336, "grad_norm": 0.17583592236042023, "learning_rate": 0.00019448707574566657, "loss": 0.9758, "step": 1015},
    {"epoch": 1.9634263715110682, "grad_norm": 0.18164704740047455, "learning_rate": 0.00019437642670386304, "loss": 0.9596, "step": 1020},
    {"epoch": 1.973051010587103, "grad_norm": 0.16976359486579895, "learning_rate": 0.0001942647103703395, "loss": 0.9725, "step": 1025},
    {"epoch": 1.9826756496631375, "grad_norm": 0.16864246129989624, "learning_rate": 0.00019415192800848263, "loss": 0.9788, "step": 1030},
    {"epoch": 1.9923002887391723, "grad_norm": 0.17933247983455658, "learning_rate": 0.00019403808089373472, "loss": 0.9747, "step": 1035},
    {"epoch": 2.0, "eval_loss": 2.1443910598754883, "eval_runtime": 0.7795, "eval_samples_per_second": 14.112, "eval_steps_per_second": 2.566, "step": 1039},
    {"epoch": 2.001924927815207, "grad_norm": 0.17538660764694214, "learning_rate": 0.00019392317031357908, "loss": 0.9576, "step": 1040},
    {"epoch": 2.0115495668912415, "grad_norm": 0.18830431997776031, "learning_rate": 0.00019380719756752584, "loss": 0.9117, "step": 1045},
    {"epoch": 2.0211742059672764, "grad_norm": 0.18357954919338226, "learning_rate": 0.00019369016396709681, "loss": 0.9149, "step": 1050},
    {"epoch": 2.0307988450433108, "grad_norm": 0.19075176119804382, "learning_rate": 0.000193572070835811, "loss": 0.9114, "step": 1055},
    {"epoch": 2.0404234841193456, "grad_norm": 0.19288337230682373, "learning_rate": 0.0001934529195091695, "loss": 0.9061, "step": 1060},
    {"epoch": 2.05004812319538, "grad_norm": 0.1923680603504181, "learning_rate": 0.00019333271133464047, "loss": 0.9165, "step": 1065},
    {"epoch": 2.059672762271415, "grad_norm": 0.19743940234184265, "learning_rate": 0.00019321144767164367, "loss": 0.9115, "step": 1070},
    {"epoch": 2.0692974013474497, "grad_norm": 0.18134470283985138, "learning_rate": 0.00019308912989153548, "loss": 0.9117, "step": 1075},
    {"epoch": 2.078922040423484, "grad_norm": 0.19912441074848175, "learning_rate": 0.00019296575937759292, "loss": 0.9139, "step": 1080},
    {"epoch": 2.088546679499519, "grad_norm": 0.20187345147132874, "learning_rate": 0.00019284133752499848, "loss": 0.9233, "step": 1085},
    {"epoch": 2.0981713185755533, "grad_norm": 0.19697998464107513, "learning_rate": 0.00019271586574082393, "loss": 0.9189, "step": 1090},
    {"epoch": 2.107795957651588, "grad_norm": 0.1886579543352127, "learning_rate": 0.0001925893454440147, "loss": 0.9157, "step": 1095},
    {"epoch": 2.1174205967276225, "grad_norm": 0.1850527673959732, "learning_rate": 0.00019246177806537377, "loss": 0.9173, "step": 1100},
    {"epoch": 2.1270452358036573, "grad_norm": 0.19263537228107452, "learning_rate": 0.00019233316504754523, "loss": 0.9213, "step": 1105},
    {"epoch": 2.136669874879692, "grad_norm": 0.18643324077129364, "learning_rate": 0.00019220350784499837, "loss": 0.9281, "step": 1110},
    {"epoch": 2.1462945139557266, "grad_norm": 0.20145340263843536, "learning_rate": 0.00019207280792401098, "loss": 0.9154, "step": 1115},
    {"epoch": 2.1559191530317614, "grad_norm": 0.20724299550056458, "learning_rate": 0.00019194106676265283, "loss": 0.9216, "step": 1120},
    {"epoch": 2.165543792107796, "grad_norm": 0.20987021923065186, "learning_rate": 0.000191808285850769, "loss": 0.9191, "step": 1125},
    {"epoch": 2.1751684311838306, "grad_norm": 0.19462813436985016, "learning_rate": 0.00019167446668996285, "loss": 0.9206, "step": 1130},
    {"epoch": 2.1847930702598655, "grad_norm": 0.18060922622680664, "learning_rate": 0.00019153961079357935, "loss": 0.9194, "step": 1135},
    {"epoch": 2.1944177093359, "grad_norm": 0.19130302965641022, "learning_rate": 0.00019140371968668767, "loss": 0.9209, "step": 1140},
    {"epoch": 2.2040423484119347, "grad_norm": 0.1925574392080307, "learning_rate": 0.00019126679490606404, "loss": 0.915, "step": 1145},
    {"epoch": 2.213666987487969, "grad_norm": 0.18374784290790558, "learning_rate": 0.00019112883800017448, "loss": 0.9266, "step": 1150},
    {"epoch": 2.223291626564004, "grad_norm": 0.1928727775812149, "learning_rate": 0.0001909898505291571, "loss": 0.9177, "step": 1155},
    {"epoch": 2.2329162656400383, "grad_norm": 0.19703041017055511, "learning_rate": 0.00019084983406480462, "loss": 0.9129, "step": 1160},
    {"epoch": 2.242540904716073, "grad_norm": 0.19135095179080963, "learning_rate": 0.00019070879019054645, "loss": 0.9204, "step": 1165},
    {"epoch": 2.252165543792108, "grad_norm": 0.18242081999778748, "learning_rate": 0.00019056672050143087, "loss": 0.9158, "step": 1170},
    {"epoch": 2.2617901828681424, "grad_norm": 0.19838295876979828, "learning_rate": 0.00019042362660410706, "loss": 0.9282, "step": 1175},
    {"epoch": 2.271414821944177, "grad_norm": 0.1942119151353836, "learning_rate": 0.0001902795101168068, "loss": 0.9224, "step": 1180},
    {"epoch": 2.2810394610202116, "grad_norm": 0.1880965530872345, "learning_rate": 0.00019013437266932615, "loss": 0.919, "step": 1185},
    {"epoch": 2.2906641000962464, "grad_norm": 0.18855926394462585, "learning_rate": 0.00018998821590300713, "loss": 0.9314, "step": 1190},
    {"epoch": 2.300288739172281, "grad_norm": 0.20218202471733093, "learning_rate": 0.00018984104147071917, "loss": 0.9209, "step": 1195},
    {"epoch": 2.3099133782483157, "grad_norm": 0.19384799897670746, "learning_rate": 0.00018969285103684032, "loss": 0.9147, "step": 1200},
    {"epoch": 2.3195380173243505, "grad_norm": 0.1903255134820938, "learning_rate": 0.00018954364627723843, "loss": 0.9178, "step": 1205},
    {"epoch": 2.329162656400385, "grad_norm": 0.180522158741951, "learning_rate": 0.00018939342887925234, "loss": 0.9215, "step": 1210},
    {"epoch": 2.3387872954764197, "grad_norm": 0.1928156316280365, "learning_rate": 0.00018924220054167257, "loss": 0.9274, "step": 1215},
    {"epoch": 2.348411934552454, "grad_norm": 0.19860059022903442, "learning_rate": 0.00018908996297472235, "loss": 0.9281, "step": 1220},
    {"epoch": 2.358036573628489, "grad_norm": 0.19085602462291718, "learning_rate": 0.00018893671790003804, "loss": 0.9288, "step": 1225},
    {"epoch": 2.3676612127045233, "grad_norm": 0.20947015285491943, "learning_rate": 0.00018878246705064994, "loss": 0.9245, "step": 1230},
    {"epoch": 2.377285851780558, "grad_norm": 0.2144593894481659, "learning_rate": 0.00018862721217096243, "loss": 0.9122, "step": 1235},
    {"epoch": 2.386910490856593, "grad_norm": 0.2063259780406952, "learning_rate": 0.00018847095501673438, "loss": 0.915, "step": 1240},
    {"epoch": 2.3965351299326274, "grad_norm": 0.19159218668937683, "learning_rate": 0.0001883136973550592, "loss": 0.9172, "step": 1245},
    {"epoch": 2.4061597690086622, "grad_norm": 0.19970135390758514, "learning_rate": 0.00018815544096434503, "loss": 0.9356, "step": 1250},
    {"epoch": 2.4157844080846966, "grad_norm": 0.19337432086467743, "learning_rate": 0.00018799618763429445, "loss": 0.9284, "step": 1255},
    {"epoch": 2.4254090471607315, "grad_norm": 0.19304610788822174, "learning_rate": 0.00018783593916588432, "loss": 0.9278, "step": 1260},
    {"epoch": 2.4350336862367663, "grad_norm": 0.18972693383693695, "learning_rate": 0.00018767469737134538, "loss": 0.9251, "step": 1265},
    {"epoch": 2.4446583253128007, "grad_norm": 0.19995278120040894, "learning_rate": 0.0001875124640741418, "loss": 0.9231, "step": 1270},
    {"epoch": 2.4542829643888355, "grad_norm": 0.1899886578321457, "learning_rate": 0.00018734924110895055, "loss": 0.9289, "step": 1275},
    {"epoch": 2.46390760346487, "grad_norm": 0.1865253895521164, "learning_rate": 0.0001871850303216406, "loss": 0.9141, "step": 1280},
    {"epoch": 2.4735322425409048, "grad_norm": 0.205548956990242, "learning_rate": 0.00018701983356925214, "loss": 0.92, "step": 1285},
    {"epoch": 2.4831568816169396, "grad_norm": 0.20036041736602783, "learning_rate": 0.00018685365271997544, "loss": 0.9274, "step": 1290},
    {"epoch": 2.492781520692974, "grad_norm": 0.20605804026126862, "learning_rate": 0.00018668648965312982, "loss": 0.9262, "step": 1295},
    {"epoch": 2.502406159769009, "grad_norm": 0.19875019788742065, "learning_rate": 0.00018651834625914247, "loss": 0.9288, "step": 1300},
    {"epoch": 2.512030798845043, "grad_norm": 0.20208601653575897, "learning_rate": 0.00018634922443952693, "loss": 0.9246, "step": 1305},
    {"epoch": 2.521655437921078, "grad_norm": 0.20923365652561188, "learning_rate": 0.00018617912610686155, "loss": 0.9285, "step": 1310},
    {"epoch": 2.5312800769971124, "grad_norm": 0.21708457171916962, "learning_rate": 0.00018600805318476807, "loss": 0.9244, "step": 1315},
    {"epoch": 2.5409047160731473, "grad_norm": 0.19935211539268494, "learning_rate": 0.00018583600760788967, "loss": 0.9261, "step": 1320},
    {"epoch": 2.550529355149182, "grad_norm": 0.19352373480796814, "learning_rate": 0.00018566299132186925, "loss": 0.9203, "step": 1325},
    {"epoch": 2.5601539942252165, "grad_norm": 0.18096321821212769, "learning_rate": 0.00018548900628332726, "loss": 0.924, "step": 1330},
    {"epoch": 2.5697786333012513, "grad_norm": 0.20240572094917297, "learning_rate": 0.0001853140544598397, "loss": 0.9242, "step": 1335},
    {"epoch": 2.5794032723772857, "grad_norm": 0.18877889215946198, "learning_rate": 0.00018513813782991578, "loss": 0.9101, "step": 1340},
    {"epoch": 2.5890279114533206, "grad_norm": 0.1912551075220108, "learning_rate": 0.00018496125838297572, "loss": 0.9201, "step": 1345},
    {"epoch": 2.598652550529355, "grad_norm": 0.19026340544223785, "learning_rate": 0.0001847834181193279, "loss": 0.9356, "step": 1350},
    {"epoch": 2.60827718960539, "grad_norm": 0.19470341503620148, "learning_rate": 0.00018460461905014664, "loss": 0.9213, "step": 1355},
    {"epoch": 2.6179018286814246, "grad_norm": 0.1977526694536209, "learning_rate": 0.00018442486319744926, "loss": 0.9292, "step": 1360},
    {"epoch": 2.627526467757459, "grad_norm": 0.19127926230430603, "learning_rate": 0.00018424415259407317, "loss": 0.9283, "step": 1365},
    {"epoch": 2.637151106833494, "grad_norm": 0.18255840241909027, "learning_rate": 0.00018406248928365295, "loss": 0.9179, "step": 1370},
    {"epoch": 2.6467757459095282, "grad_norm": 0.18344487249851227, "learning_rate": 0.00018387987532059725, "loss": 0.9397, "step": 1375},
    {"epoch": 2.656400384985563, "grad_norm": 0.1913861185312271, "learning_rate": 0.00018369631277006555, "loss": 0.9248, "step": 1380},
    {"epoch": 2.6660250240615975, "grad_norm": 0.1795121282339096, "learning_rate": 0.00018351180370794479, "loss": 0.9223, "step": 1385},
    {"epoch": 2.6756496631376323, "grad_norm": 0.19478866457939148, "learning_rate": 0.00018332635022082582, "loss": 0.9282, "step": 1390},
    {"epoch": 2.685274302213667, "grad_norm": 0.1917424350976944, "learning_rate": 0.00018313995440598002, "loss": 0.9228, "step": 1395},
    {"epoch": 2.6948989412897015, "grad_norm": 0.18964500725269318, "learning_rate": 0.00018295261837133532, "loss": 0.928, "step": 1400},
    {"epoch": 2.7045235803657364, "grad_norm": 0.19044145941734314, "learning_rate": 0.00018276434423545253, "loss": 0.926, "step": 1405},
    {"epoch": 2.714148219441771, "grad_norm": 0.1876942664384842, "learning_rate": 0.0001825751341275013, "loss": 0.9224, "step": 1410},
    {"epoch": 2.7237728585178056, "grad_norm": 0.19307979941368103, "learning_rate": 0.00018238499018723614, "loss": 0.9322, "step": 1415},
    {"epoch": 2.73339749759384, "grad_norm": 0.1879437267780304, "learning_rate": 0.00018219391456497216, "loss": 0.9104, "step": 1420},
    {"epoch": 2.743022136669875, "grad_norm": 0.2002253383398056, "learning_rate": 0.00018200190942156062, "loss": 0.9266, "step": 1425},
    {"epoch": 2.7526467757459097, "grad_norm": 0.1822872757911682, "learning_rate": 0.00018180897692836483, "loss": 0.9245, "step": 1430},
    {"epoch": 2.762271414821944, "grad_norm": 0.1884424090385437, "learning_rate": 0.0001816151192672352, "loss": 0.9273, "step": 1435},
    {"epoch": 2.771896053897979, "grad_norm": 0.1969207227230072, "learning_rate": 0.00018142033863048485, "loss": 0.929, "step": 1440},
    {"epoch": 2.7815206929740137, "grad_norm": 0.1919521689414978, "learning_rate": 0.0001812246372208647, "loss": 0.9213, "step": 1445},
    {"epoch": 2.791145332050048, "grad_norm": 0.18795301020145416, "learning_rate": 0.00018102801725153862, "loss": 0.9281, "step": 1450},
    {"epoch": 2.8007699711260825, "grad_norm": 0.19035767018795013, "learning_rate": 0.00018083048094605825, "loss": 0.9264, "step": 1455},
    {"epoch": 2.8103946102021173, "grad_norm": 0.181080624461174, "learning_rate": 0.0001806320305383381, "loss": 0.926, "step": 1460},
    {"epoch": 2.820019249278152, "grad_norm": 0.18840213119983673, "learning_rate": 0.00018043266827263003, "loss": 0.9327, "step": 1465},
    {"epoch": 2.8296438883541866, "grad_norm": 0.18549908697605133, "learning_rate": 0.0001802323964034981, "loss": 0.9345, "step": 1470},
    {"epoch": 2.8392685274302214, "grad_norm": 0.18507707118988037, "learning_rate": 0.00018003121719579294, "loss": 0.9243, "step": 1475},
    {"epoch": 2.8488931665062562, "grad_norm": 0.19053645431995392, "learning_rate": 0.0001798291329246261, "loss": 0.9136, "step": 1480},
    {"epoch": 2.8585178055822906, "grad_norm": 0.18798498809337616, "learning_rate": 0.00017962614587534444, "loss": 0.9296, "step": 1485},
    {"epoch": 2.8681424446583255, "grad_norm": 0.19244647026062012, "learning_rate": 0.00017942225834350424, "loss": 0.9212, "step": 1490},
    {"epoch": 2.87776708373436, "grad_norm": 0.18958385288715363, "learning_rate": 0.00017921747263484518, "loss": 0.9204, "step": 1495},
    {"epoch": 2.8873917228103947, "grad_norm": 0.1872030794620514, "learning_rate": 0.00017901179106526434, "loss": 0.9167, "step": 1500},
    {"epoch": 2.897016361886429, "grad_norm": 0.1842317432165146, "learning_rate": 0.00017880521596079003, "loss": 0.9295, "step": 1505},
    {"epoch": 2.906641000962464, "grad_norm": 0.1908566802740097, "learning_rate": 0.00017859774965755534, "loss": 0.933, "step": 1510},
    {"epoch": 2.9162656400384988, "grad_norm": 0.17877928912639618, "learning_rate": 0.0001783893945017719, "loss": 0.9209, "step": 1515},
    {"epoch": 2.925890279114533, "grad_norm": 0.19019804894924164, "learning_rate": 0.00017818015284970328, "loss": 0.9298, "step": 1520},
    {"epoch": 2.935514918190568, "grad_norm": 0.17898397147655487, "learning_rate": 0.0001779700270676382, "loss": 0.9149, "step": 1525},
    {"epoch": 2.9451395572666024, "grad_norm": 0.19317851960659027, "learning_rate": 0.0001777590195318641, "loss": 0.9268, "step": 1530},
    {"epoch": 2.954764196342637, "grad_norm": 0.1835252344608307, "learning_rate": 0.00017754713262863985, "loss": 0.9156, "step": 1535},
    {"epoch": 2.9643888354186716, "grad_norm": 0.18219447135925293, "learning_rate": 0.00017733436875416917, "loss": 0.928, "step": 1540},
    {"epoch": 2.9740134744947064, "grad_norm": 0.19455976784229279, "learning_rate": 0.00017712073031457331, "loss": 0.9358, "step": 1545},
    {"epoch": 2.9836381135707413, "grad_norm": 0.19101083278656006, "learning_rate": 0.0001769062197258637, "loss": 0.919, "step": 1550},
    {"epoch": 2.9932627526467757, "grad_norm": 0.1850951611995697, "learning_rate": 0.00017669083941391502, "loss": 0.9289, "step": 1555},
    {"epoch": 2.9990375360923966, "eval_loss": 2.251723289489746, "eval_runtime": 0.7901, "eval_samples_per_second": 13.922, "eval_steps_per_second": 2.531, "step": 1558},
    {"epoch": 3.0028873917228105, "grad_norm": 0.1839417815208435, "learning_rate": 0.00017647459181443739, "loss": 0.9099, "step": 1560},
    {"epoch": 3.012512030798845, "grad_norm": 0.21318542957305908, "learning_rate": 0.0001762574793729491, "loss": 0.8622, "step": 1565},
    {"epoch": 3.0221366698748797, "grad_norm": 0.20732618868350983, "learning_rate": 0.00017603950454474877, "loss": 0.8502, "step": 1570},
    {"epoch": 3.0317613089509146, "grad_norm": 0.20737336575984955, "learning_rate": 0.00017582066979488764, "loss": 0.8479, "step": 1575},
    {"epoch": 3.041385948026949, "grad_norm": 0.2138897329568863, "learning_rate": 0.00017560097759814172, "loss": 0.8517, "step": 1580},
    {"epoch": 3.051010587102984, "grad_norm": 0.20526482164859772, "learning_rate": 0.00017538043043898376, "loss": 0.8548, "step": 1585},
    {"epoch": 3.060635226179018, "grad_norm": 0.21120765805244446, "learning_rate": 0.00017515903081155525, "loss": 0.8531, "step": 1590},
    {"epoch": 3.070259865255053, "grad_norm": 0.20420415699481964, "learning_rate": 0.00017493678121963807, "loss": 0.8607, "step": 1595},
    {"epoch": 3.0798845043310874, "grad_norm": 0.2265135943889618, "learning_rate": 0.00017471368417662627, "loss": 0.8638, "step": 1600},
    {"epoch": 3.0895091434071222, "grad_norm": 0.2099863588809967, "learning_rate": 0.00017448974220549764, "loss": 0.8648, "step": 1605},
    {"epoch": 3.099133782483157, "grad_norm": 0.2183115929365158, "learning_rate": 0.00017426495783878508, "loss": 0.8554, "step": 1610},
    {"epoch": 3.1087584215591915, "grad_norm": 0.2061695158481598, "learning_rate": 0.00017403933361854814, "loss": 0.8561, "step": 1615},
    {"epoch": 3.1183830606352263, "grad_norm": 0.21093107759952545, "learning_rate": 0.0001738128720963442, "loss": 0.8639, "step": 1620},
    {"epoch": 3.1280076997112607, "grad_norm": 0.22155196964740753, "learning_rate": 0.0001735855758331994, "loss": 0.8687, "step": 1625},
    {"epoch": 3.1376323387872955, "grad_norm": 0.21988868713378906, "learning_rate": 0.0001733574473995801, "loss": 0.8571, "step": 1630},
    {"epoch": 3.14725697786333, "grad_norm": 0.20397303998470306, "learning_rate": 0.00017312848937536338, "loss": 0.8556, "step": 1635},
    {"epoch": 3.1568816169393648, "grad_norm": 0.21777671575546265, "learning_rate": 0.00017289870434980824, "loss": 0.8657, "step": 1640},
    {"epoch": 3.1665062560153996, "grad_norm": 0.20753996074199677, "learning_rate": 0.00017266809492152597, "loss": 0.8578, "step": 1645},
    {"epoch": 3.176130895091434, "grad_norm": 0.22726857662200928, "learning_rate": 0.00017243666369845103, "loss": 0.8713, "step": 1650},
    {"epoch": 3.185755534167469, "grad_norm": 0.20830857753753662, "learning_rate": 0.00017220441329781147, "loss": 0.8621, "step": 1655},
    {"epoch": 3.195380173243503, "grad_norm": 0.21678543090820312, "learning_rate": 0.00017197134634609924, "loss": 0.8589, "step": 1660},
    {"epoch": 3.205004812319538, "grad_norm": 0.21865533292293549, "learning_rate": 0.00017173746547904063, "loss": 0.872, "step": 1665},
    {"epoch": 3.214629451395573, "grad_norm": 0.20973502099514008, "learning_rate": 0.0001715027733415664, "loss": 0.8624, "step": 1670},
    {"epoch": 3.2242540904716073, "grad_norm": 0.21278487145900726, "learning_rate": 0.00017126727258778187, "loss": 0.8693, "step": 1675},
    {"epoch": 3.233878729547642, "grad_norm": 0.2145373523235321, "learning_rate": 0.00017103096588093686, "loss": 0.8665, "step": 1680},
    {"epoch": 3.2435033686236765, "grad_norm": 0.21175837516784668, "learning_rate": 0.00017079385589339568, "loss": 0.8592, "step": 1685},
    {"epoch": 3.2531280076997113, "grad_norm": 0.21969176828861237, "learning_rate": 0.00017055594530660678, "loss": 0.8686, "step": 1690},
    {"epoch": 3.2627526467757457, "grad_norm": 0.23275814950466156, "learning_rate": 0.00017031723681107256, "loss": 0.8643, "step": 1695},
    {"epoch": 3.2723772858517806, "grad_norm": 0.22712193429470062, "learning_rate": 0.0001700777331063188, "loss": 0.8774, "step": 1700},
    {"epoch": 3.2820019249278154, "grad_norm": 0.2357400804758072, "learning_rate": 0.0001698374369008643, "loss": 0.8654, "step": 1705},
    {"epoch": 3.29162656400385, "grad_norm": 0.21586911380290985, "learning_rate": 0.00016959635091219011, "loss": 0.8682, "step": 1710},
    {"epoch": 3.3012512030798846, "grad_norm": 0.20854496955871582, "learning_rate": 0.00016935447786670875, "loss": 0.872, "step": 1715},
    {"epoch": 3.310875842155919, "grad_norm": 0.22415196895599365, "learning_rate": 0.00016911182049973364, "loss": 0.8691, "step": 1720},
    {"epoch": 3.320500481231954, "grad_norm": 0.21514172852039337, "learning_rate": 0.00016886838155544785, "loss": 0.8662, "step": 1725},
    {"epoch": 3.3301251203079882, "grad_norm": 0.21508009731769562, "learning_rate": 0.0001686241637868734, "loss": 0.8677, "step": 1730},
    {"epoch": 3.339749759384023, "grad_norm": 0.21434170007705688, "learning_rate": 0.00016837916995583965, "loss": 0.8691, "step": 1735},
    {"epoch": 3.349374398460058, "grad_norm": 0.21920685470104218, "learning_rate": 0.00016813340283295265, "loss": 0.8632, "step": 1740},
    {"epoch": 3.3589990375360923, "grad_norm": 0.20799002051353455, "learning_rate": 0.00016788686519756337, "loss": 0.8711, "step": 1745},
    {"epoch": 3.368623676612127, "grad_norm": 0.22760187089443207, "learning_rate": 0.00016763955983773642, "loss": 0.8716, "step": 1750},
    {"epoch": 3.3782483156881615, "grad_norm": 0.20473913848400116, "learning_rate": 0.00016739148955021853, "loss": 0.8672, "step": 1755},
    {"epoch": 3.3878729547641964, "grad_norm": 0.2237493246793747, "learning_rate": 0.00016714265714040688, "loss": 0.8711, "step": 1760},
    {"epoch": 3.3974975938402308, "grad_norm": 0.21266481280326843, "learning_rate": 0.00016689306542231754, "loss": 0.8581, "step": 1765},
    {"epoch": 3.4071222329162656, "grad_norm": 0.21926787495613098, "learning_rate": 0.00016664271721855323, "loss": 0.8647, "step": 1770},
    {"epoch": 3.4167468719923004, "grad_norm": 0.21556758880615234, "learning_rate": 0.00016639161536027196, "loss": 0.8627, "step": 1775},
    {"epoch": 3.426371511068335, "grad_norm": 0.22477813065052032, "learning_rate": 0.00016613976268715458, "loss": 0.8734, "step": 1780},
    {"epoch": 3.4359961501443697, "grad_norm": 0.22144025564193726, "learning_rate": 0.00016588716204737281, "loss": 0.8633, "step": 1785},
    {"epoch": 3.445620789220404, "grad_norm": 0.21546606719493866, "learning_rate": 0.00016563381629755713, "loss": 0.87, "step": 1790},
    {"epoch": 3.455245428296439, "grad_norm": 0.21200338006019592, "learning_rate": 0.00016537972830276424, "loss": 0.8749, "step": 1795},
    {"epoch": 3.4648700673724737, "grad_norm": 0.21702003479003906, "learning_rate": 0.00016512490093644491, "loss": 0.8736, "step": 1800},
    {"epoch": 3.474494706448508, "grad_norm": 0.20890291035175323, "learning_rate": 0.00016486933708041138, "loss": 0.8658, "step": 1805},
    {"epoch": 3.484119345524543, "grad_norm": 0.21432092785835266, "learning_rate": 0.0001646130396248047,
|
"loss": 0.8671, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 3.4937439846005773, |
|
"grad_norm": 0.21486730873584747, |
|
"learning_rate": 0.0001643560114680621, |
|
"loss": 0.8624, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 3.503368623676612, |
|
"grad_norm": 0.2079630345106125, |
|
"learning_rate": 0.0001640982555168843, |
|
"loss": 0.8623, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.512993262752647, |
|
"grad_norm": 0.21051821112632751, |
|
"learning_rate": 0.00016383977468620252, |
|
"loss": 0.8694, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 3.5226179018286814, |
|
"grad_norm": 0.22331751883029938, |
|
"learning_rate": 0.00016358057189914553, |
|
"loss": 0.8867, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.5322425409047162, |
|
"grad_norm": 0.21272289752960205, |
|
"learning_rate": 0.00016332065008700666, |
|
"loss": 0.8643, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 3.5418671799807506, |
|
"grad_norm": 0.2075881063938141, |
|
"learning_rate": 0.00016306001218921055, |
|
"loss": 0.8758, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 3.5514918190567855, |
|
"grad_norm": 0.21468383073806763, |
|
"learning_rate": 0.00016279866115328012, |
|
"loss": 0.8743, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 3.56111645813282, |
|
"grad_norm": 0.20136167109012604, |
|
"learning_rate": 0.00016253659993480284, |
|
"loss": 0.874, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.5707410972088547, |
|
"grad_norm": 0.2094564139842987, |
|
"learning_rate": 0.00016227383149739776, |
|
"loss": 0.8798, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 3.5803657362848895, |
|
"grad_norm": 0.21963797509670258, |
|
"learning_rate": 0.00016201035881268166, |
|
"loss": 0.8751, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.589990375360924, |
|
"grad_norm": 0.22210368514060974, |
|
"learning_rate": 0.00016174618486023565, |
|
"loss": 0.8709, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 3.5996150144369587, |
|
"grad_norm": 0.22002506256103516, |
|
"learning_rate": 0.00016148131262757134, |
|
"loss": 0.8724, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.609239653512993, |
|
"grad_norm": 0.21862515807151794, |
|
"learning_rate": 0.0001612157451100971, |
|
"loss": 0.8715, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 3.618864292589028, |
|
"grad_norm": 0.21481823921203613, |
|
"learning_rate": 0.0001609494853110843, |
|
"loss": 0.8727, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 3.6284889316650624, |
|
"grad_norm": 0.21671965718269348, |
|
"learning_rate": 0.00016068253624163307, |
|
"loss": 0.8695, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 3.638113570741097, |
|
"grad_norm": 0.22262564301490784, |
|
"learning_rate": 0.00016041490092063852, |
|
"loss": 0.8707, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 3.647738209817132, |
|
"grad_norm": 0.21777838468551636, |
|
"learning_rate": 0.0001601465823747565, |
|
"loss": 0.8719, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 3.6573628488931664, |
|
"grad_norm": 0.2157593071460724, |
|
"learning_rate": 0.00015987758363836932, |
|
"loss": 0.8649, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.6669874879692013, |
|
"grad_norm": 0.21907728910446167, |
|
"learning_rate": 0.00015960790775355159, |
|
"loss": 0.8727, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 3.6766121270452357, |
|
"grad_norm": 0.2181127518415451, |
|
"learning_rate": 0.00015933755777003552, |
|
"loss": 0.8642, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 3.6862367661212705, |
|
"grad_norm": 0.21002036333084106, |
|
"learning_rate": 0.0001590665367451768, |
|
"loss": 0.8853, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 3.695861405197305, |
|
"grad_norm": 0.21628259122371674, |
|
"learning_rate": 0.0001587948477439198, |
|
"loss": 0.8781, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.7054860442733397, |
|
"grad_norm": 0.21025903522968292, |
|
"learning_rate": 0.00015852249383876285, |
|
"loss": 0.8788, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 3.7151106833493746, |
|
"grad_norm": 0.21036125719547272, |
|
"learning_rate": 0.00015824947810972378, |
|
"loss": 0.8769, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 3.724735322425409, |
|
"grad_norm": 0.20949947834014893, |
|
"learning_rate": 0.00015797580364430473, |
|
"loss": 0.8689, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 3.734359961501444, |
|
"grad_norm": 0.22593073546886444, |
|
"learning_rate": 0.00015770147353745754, |
|
"loss": 0.8763, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.7439846005774786, |
|
"grad_norm": 0.22361914813518524, |
|
"learning_rate": 0.00015742649089154858, |
|
"loss": 0.8743, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 3.753609239653513, |
|
"grad_norm": 0.21210341155529022, |
|
"learning_rate": 0.00015715085881632366, |
|
"loss": 0.8754, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.7632338787295474, |
|
"grad_norm": 0.21233123540878296, |
|
"learning_rate": 0.00015687458042887298, |
|
"loss": 0.8823, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 3.7728585178055822, |
|
"grad_norm": 0.20900115370750427, |
|
"learning_rate": 0.00015659765885359572, |
|
"loss": 0.8601, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.782483156881617, |
|
"grad_norm": 0.20850348472595215, |
|
"learning_rate": 0.0001563200972221649, |
|
"loss": 0.8748, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 3.7921077959576515, |
|
"grad_norm": 0.2235669642686844, |
|
"learning_rate": 0.00015604189867349182, |
|
"loss": 0.8767, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.8017324350336863, |
|
"grad_norm": 0.20681613683700562, |
|
"learning_rate": 0.00015576306635369053, |
|
"loss": 0.87, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 3.811357074109721, |
|
"grad_norm": 0.2126859724521637, |
|
"learning_rate": 0.00015548360341604244, |
|
"loss": 0.8767, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.8209817131857555, |
|
"grad_norm": 0.21969568729400635, |
|
"learning_rate": 0.00015520351302096043, |
|
"loss": 0.8619, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 3.83060635226179, |
|
"grad_norm": 0.20034681260585785, |
|
"learning_rate": 0.0001549227983359533, |
|
"loss": 0.879, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.8402309913378248, |
|
"grad_norm": 0.22048155963420868, |
|
"learning_rate": 0.00015464146253558987, |
|
"loss": 0.8704, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 3.8498556304138596, |
|
"grad_norm": 0.2217637300491333, |
|
"learning_rate": 0.00015435950880146297, |
|
"loss": 0.874, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.859480269489894, |
|
"grad_norm": 0.2207387238740921, |
|
"learning_rate": 0.00015407694032215375, |
|
"loss": 0.871, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 3.869104908565929, |
|
"grad_norm": 0.21759381890296936, |
|
"learning_rate": 0.00015379376029319526, |
|
"loss": 0.881, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.8787295476419636, |
|
"grad_norm": 0.21979306638240814, |
|
"learning_rate": 0.00015350997191703662, |
|
"loss": 0.8707, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 3.888354186717998, |
|
"grad_norm": 0.2088766098022461, |
|
"learning_rate": 0.0001532255784030066, |
|
"loss": 0.8715, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.897978825794033, |
|
"grad_norm": 0.23208843171596527, |
|
"learning_rate": 0.00015294058296727746, |
|
"loss": 0.867, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 3.9076034648700673, |
|
"grad_norm": 0.211493119597435, |
|
"learning_rate": 0.00015265498883282848, |
|
"loss": 0.8746, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.917228103946102, |
|
"grad_norm": 0.2072470784187317, |
|
"learning_rate": 0.00015236879922940952, |
|
"loss": 0.8815, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 3.9268527430221365, |
|
"grad_norm": 0.2107774019241333, |
|
"learning_rate": 0.0001520820173935046, |
|
"loss": 0.8762, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.9364773820981713, |
|
"grad_norm": 0.22592873871326447, |
|
"learning_rate": 0.00015179464656829526, |
|
"loss": 0.8781, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 3.946102021174206, |
|
"grad_norm": 0.210884690284729, |
|
"learning_rate": 0.00015150669000362372, |
|
"loss": 0.8759, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.9557266602502406, |
|
"grad_norm": 0.22325028479099274, |
|
"learning_rate": 0.00015121815095595631, |
|
"loss": 0.8759, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 3.9653512993262754, |
|
"grad_norm": 0.20822718739509583, |
|
"learning_rate": 0.0001509290326883466, |
|
"loss": 0.8743, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.97497593840231, |
|
"grad_norm": 0.22340907156467438, |
|
"learning_rate": 0.00015063933847039856, |
|
"loss": 0.8768, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 3.9846005774783446, |
|
"grad_norm": 0.21545882523059845, |
|
"learning_rate": 0.0001503490715782294, |
|
"loss": 0.8737, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.994225216554379, |
|
"grad_norm": 0.21250423789024353, |
|
"learning_rate": 0.00015005823529443268, |
|
"loss": 0.8818, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.463193893432617, |
|
"eval_runtime": 0.7794, |
|
"eval_samples_per_second": 14.113, |
|
"eval_steps_per_second": 2.566, |
|
"step": 2078 |
|
}, |
|
{ |
|
"epoch": 4.003849855630414, |
|
"grad_norm": 0.20480164885520935, |
|
"learning_rate": 0.00014976683290804116, |
|
"loss": 0.8452, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 4.013474494706449, |
|
"grad_norm": 0.24909119307994843, |
|
"learning_rate": 0.00014947486771448956, |
|
"loss": 0.799, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 4.023099133782483, |
|
"grad_norm": 0.2511972188949585, |
|
"learning_rate": 0.00014918234301557732, |
|
"loss": 0.7996, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 4.0327237728585175, |
|
"grad_norm": 0.2290249615907669, |
|
"learning_rate": 0.00014888926211943128, |
|
"loss": 0.7821, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 4.042348411934553, |
|
"grad_norm": 0.2516845762729645, |
|
"learning_rate": 0.0001485956283404682, |
|
"loss": 0.8027, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.051973051010587, |
|
"grad_norm": 0.23257547616958618, |
|
"learning_rate": 0.00014830144499935742, |
|
"loss": 0.8051, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 4.0615976900866215, |
|
"grad_norm": 0.23365622758865356, |
|
"learning_rate": 0.00014800671542298312, |
|
"loss": 0.8046, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 4.071222329162657, |
|
"grad_norm": 0.25398579239845276, |
|
"learning_rate": 0.00014771144294440682, |
|
"loss": 0.7998, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 4.080846968238691, |
|
"grad_norm": 0.25395774841308594, |
|
"learning_rate": 0.00014741563090282965, |
|
"loss": 0.7896, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 4.090471607314726, |
|
"grad_norm": 0.23397642374038696, |
|
"learning_rate": 0.00014711928264355466, |
|
"loss": 0.7982, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 4.10009624639076, |
|
"grad_norm": 0.24863800406455994, |
|
"learning_rate": 0.0001468224015179488, |
|
"loss": 0.8004, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 4.109720885466795, |
|
"grad_norm": 0.24272161722183228, |
|
"learning_rate": 0.00014652499088340523, |
|
"loss": 0.7949, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 4.11934552454283, |
|
"grad_norm": 0.24697747826576233, |
|
"learning_rate": 0.00014622705410330522, |
|
"loss": 0.792, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 4.128970163618864, |
|
"grad_norm": 0.2412373572587967, |
|
"learning_rate": 0.0001459285945469802, |
|
"loss": 0.7999, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 4.138594802694899, |
|
"grad_norm": 0.255993127822876, |
|
"learning_rate": 0.0001456296155896736, |
|
"loss": 0.7965, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 4.148219441770934, |
|
"grad_norm": 0.23746897280216217, |
|
"learning_rate": 0.00014533012061250264, |
|
"loss": 0.8057, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 4.157844080846968, |
|
"grad_norm": 0.24358995258808136, |
|
"learning_rate": 0.00014503011300242023, |
|
"loss": 0.8073, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 4.1674687199230025, |
|
"grad_norm": 0.2651350498199463, |
|
"learning_rate": 0.0001447295961521765, |
|
"loss": 0.7961, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 4.177093358999038, |
|
"grad_norm": 0.25750139355659485, |
|
"learning_rate": 0.0001444285734602806, |
|
"loss": 0.7961, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 4.186717998075072, |
|
"grad_norm": 0.2478739321231842, |
|
"learning_rate": 0.00014412704833096217, |
|
"loss": 0.7955, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 4.196342637151107, |
|
"grad_norm": 0.26254719495773315, |
|
"learning_rate": 0.00014382502417413276, |
|
"loss": 0.7929, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 4.205967276227142, |
|
"grad_norm": 0.25435176491737366, |
|
"learning_rate": 0.00014352250440534747, |
|
"loss": 0.8052, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 4.215591915303176, |
|
"grad_norm": 0.24811629951000214, |
|
"learning_rate": 0.00014321949244576617, |
|
"loss": 0.7989, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 4.225216554379211, |
|
"grad_norm": 0.2621951103210449, |
|
"learning_rate": 0.00014291599172211485, |
|
"loss": 0.8092, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 4.234841193455245, |
|
"grad_norm": 0.2780658006668091, |
|
"learning_rate": 0.0001426120056666469, |
|
"loss": 0.8058, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.24446583253128, |
|
"grad_norm": 0.233393132686615, |
|
"learning_rate": 0.0001423075377171043, |
|
"loss": 0.8049, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 4.254090471607315, |
|
"grad_norm": 0.26003360748291016, |
|
"learning_rate": 0.00014200259131667858, |
|
"loss": 0.8091, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 4.263715110683349, |
|
"grad_norm": 0.25277137756347656, |
|
"learning_rate": 0.00014169716991397214, |
|
"loss": 0.8126, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 4.273339749759384, |
|
"grad_norm": 0.23928789794445038, |
|
"learning_rate": 0.00014139127696295912, |
|
"loss": 0.8044, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 4.282964388835419, |
|
"grad_norm": 0.254984587430954, |
|
"learning_rate": 0.00014108491592294627, |
|
"loss": 0.8036, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 4.292589027911453, |
|
"grad_norm": 0.2602671682834625, |
|
"learning_rate": 0.000140778090258534, |
|
"loss": 0.8147, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 4.3022136669874875, |
|
"grad_norm": 0.24539902806282043, |
|
"learning_rate": 0.000140470803439577, |
|
"loss": 0.8078, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 4.311838306063523, |
|
"grad_norm": 0.24983367323875427, |
|
"learning_rate": 0.00014016305894114516, |
|
"loss": 0.8089, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 4.321462945139557, |
|
"grad_norm": 0.2500509023666382, |
|
"learning_rate": 0.0001398548602434842, |
|
"loss": 0.8053, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 4.331087584215592, |
|
"grad_norm": 0.24786844849586487, |
|
"learning_rate": 0.00013954621083197628, |
|
"loss": 0.8091, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 4.340712223291627, |
|
"grad_norm": 0.2504083216190338, |
|
"learning_rate": 0.00013923711419710076, |
|
"loss": 0.8122, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 4.350336862367661, |
|
"grad_norm": 0.24594616889953613, |
|
"learning_rate": 0.0001389275738343944, |
|
"loss": 0.8142, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 4.359961501443696, |
|
"grad_norm": 0.25497034192085266, |
|
"learning_rate": 0.00013861759324441223, |
|
"loss": 0.8102, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 4.369586140519731, |
|
"grad_norm": 0.26248982548713684, |
|
"learning_rate": 0.00013830717593268764, |
|
"loss": 0.8106, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 4.379210779595765, |
|
"grad_norm": 0.24808135628700256, |
|
"learning_rate": 0.00013799632540969286, |
|
"loss": 0.8069, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 4.3888354186718, |
|
"grad_norm": 0.2534014582633972, |
|
"learning_rate": 0.00013768504519079923, |
|
"loss": 0.8166, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 4.398460057747834, |
|
"grad_norm": 0.24292294681072235, |
|
"learning_rate": 0.0001373733387962376, |
|
"loss": 0.8072, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 4.408084696823869, |
|
"grad_norm": 0.24815544486045837, |
|
"learning_rate": 0.00013706120975105822, |
|
"loss": 0.8189, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 4.417709335899904, |
|
"grad_norm": 0.24199172854423523, |
|
"learning_rate": 0.00013674866158509117, |
|
"loss": 0.8084, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 4.427333974975938, |
|
"grad_norm": 0.26282939314842224, |
|
"learning_rate": 0.00013643569783290622, |
|
"loss": 0.8103, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.4369586140519734, |
|
"grad_norm": 0.2644505202770233, |
|
"learning_rate": 0.00013612232203377307, |
|
"loss": 0.8106, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 4.446583253128008, |
|
"grad_norm": 0.250636488199234, |
|
"learning_rate": 0.0001358085377316211, |
|
"loss": 0.823, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 4.456207892204042, |
|
"grad_norm": 0.2760376036167145, |
|
"learning_rate": 0.00013549434847499945, |
|
"loss": 0.8109, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 4.465832531280077, |
|
"grad_norm": 0.24669407308101654, |
|
"learning_rate": 0.00013517975781703688, |
|
"loss": 0.8135, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 4.475457170356112, |
|
"grad_norm": 0.24369503557682037, |
|
"learning_rate": 0.00013486476931540145, |
|
"loss": 0.8083, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 4.485081809432146, |
|
"grad_norm": 0.2656605839729309, |
|
"learning_rate": 0.00013454938653226047, |
|
"loss": 0.8082, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 4.494706448508181, |
|
"grad_norm": 0.24139179289340973, |
|
"learning_rate": 0.0001342336130342401, |
|
"loss": 0.8046, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 4.504331087584216, |
|
"grad_norm": 0.2464561015367508, |
|
"learning_rate": 0.00013391745239238508, |
|
"loss": 0.8205, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 4.51395572666025, |
|
"grad_norm": 0.25290533900260925, |
|
"learning_rate": 0.0001336009081821183, |
|
"loss": 0.8135, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 4.523580365736285, |
|
"grad_norm": 0.2681277096271515, |
|
"learning_rate": 0.00013328398398320036, |
|
"loss": 0.8111, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 4.53320500481232, |
|
"grad_norm": 0.24826329946517944, |
|
"learning_rate": 0.00013296668337968903, |
|
"loss": 0.8161, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 4.542829643888354, |
|
"grad_norm": 0.26754263043403625, |
|
"learning_rate": 0.000132649009959899, |
|
"loss": 0.8103, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 4.552454282964389, |
|
"grad_norm": 0.2548888027667999, |
|
"learning_rate": 0.00013233096731636088, |
|
"loss": 0.8114, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 4.562078922040423, |
|
"grad_norm": 0.2608910799026489, |
|
"learning_rate": 0.00013201255904578095, |
|
"loss": 0.8104, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 4.5717035611164585, |
|
"grad_norm": 0.2469130903482437, |
|
"learning_rate": 0.00013169378874900017, |
|
"loss": 0.8084, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 4.581328200192493, |
|
"grad_norm": 0.26305124163627625, |
|
"learning_rate": 0.0001313746600309538, |
|
"loss": 0.8198, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 4.590952839268527, |
|
"grad_norm": 0.2730869650840759, |
|
"learning_rate": 0.00013105517650063026, |
|
"loss": 0.823, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 4.600577478344562, |
|
"grad_norm": 0.25265151262283325, |
|
"learning_rate": 0.0001307353417710306, |
|
"loss": 0.8166, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 4.610202117420597, |
|
"grad_norm": 0.2525179386138916, |
|
"learning_rate": 0.00013041515945912753, |
|
"loss": 0.8142, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 4.619826756496631, |
|
"grad_norm": 0.2585461735725403, |
|
"learning_rate": 0.00013009463318582447, |
|
"loss": 0.8112, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.629451395572666, |
|
"grad_norm": 0.25699469447135925, |
|
"learning_rate": 0.00012977376657591474, |
|
"loss": 0.815, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 4.639076034648701, |
|
"grad_norm": 0.2651076316833496, |
|
"learning_rate": 0.00012945256325804048, |
|
"loss": 0.8215, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 4.648700673724735, |
|
"grad_norm": 0.2517280876636505, |
|
"learning_rate": 0.0001291310268646515, |
|
"loss": 0.8126, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 4.65832531280077, |
|
"grad_norm": 0.25369200110435486, |
|
"learning_rate": 0.00012880916103196448, |
|
"loss": 0.811, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 4.667949951876805, |
|
"grad_norm": 0.2509647011756897, |
|
"learning_rate": 0.0001284869693999216, |
|
"loss": 0.8144, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 4.6775745909528395, |
|
"grad_norm": 0.25037845969200134, |
|
"learning_rate": 0.00012816445561214946, |
|
"loss": 0.8145, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 4.687199230028874, |
|
"grad_norm": 0.24885617196559906, |
|
"learning_rate": 0.000127841623315918, |
|
"loss": 0.815, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 4.696823869104908, |
|
"grad_norm": 0.26731571555137634, |
|
"learning_rate": 0.0001275184761620989, |
|
"loss": 0.8151, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 4.7064485081809435, |
|
"grad_norm": 0.24980269372463226, |
|
"learning_rate": 0.00012719501780512476, |
|
"loss": 0.8189, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 4.716073147256978, |
|
"grad_norm": 0.26535722613334656, |
|
"learning_rate": 0.0001268712519029474, |
|
"loss": 0.8205, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.725697786333012, |
|
"grad_norm": 0.24587014317512512, |
|
"learning_rate": 0.00012654718211699674, |
|
"loss": 0.8127, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 4.735322425409047, |
|
"grad_norm": 0.26100653409957886, |
|
"learning_rate": 0.00012622281211213915, |
|
"loss": 0.8146, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 4.744947064485082, |
|
"grad_norm": 0.24849233031272888, |
|
"learning_rate": 0.00012589814555663626, |
|
"loss": 0.8107, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 4.754571703561116, |
|
"grad_norm": 0.2688848078250885, |
|
"learning_rate": 0.0001255731861221033, |
|
"loss": 0.8209, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 4.764196342637151, |
|
"grad_norm": 0.2500625550746918, |
|
"learning_rate": 0.00012524793748346758, |
|
"loss": 0.815, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 4.773820981713186, |
|
"grad_norm": 0.2789405882358551, |
|
"learning_rate": 0.00012492240331892716, |
|
"loss": 0.8196, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 4.78344562078922, |
|
"grad_norm": 0.25875410437583923, |
|
"learning_rate": 0.00012459658730990891, |
|
"loss": 0.8196, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 4.793070259865255, |
|
"grad_norm": 0.25247231125831604, |
|
"learning_rate": 0.00012427049314102707, |
|
"loss": 0.8242, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 4.80269489894129, |
|
"grad_norm": 0.2572121024131775, |
|
"learning_rate": 0.00012394412450004164, |
|
"loss": 0.8215, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 4.8123195380173245, |
|
"grad_norm": 0.25512033700942993, |
|
"learning_rate": 0.0001236174850778165, |
|
"loss": 0.8163, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.821944177093359, |
|
"grad_norm": 0.25790128111839294, |
|
"learning_rate": 0.0001232905785682778, |
|
"loss": 0.8119, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 4.831568816169393, |
|
"grad_norm": 0.26126110553741455, |
|
"learning_rate": 0.00012296340866837222, |
|
"loss": 0.8133, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 4.8411934552454285, |
|
"grad_norm": 0.2542867362499237, |
|
"learning_rate": 0.00012263597907802493, |
|
"loss": 0.818, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 4.850818094321463, |
|
"grad_norm": 0.2690134048461914, |
|
"learning_rate": 0.00012230829350009804, |
|
"loss": 0.8106, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 4.860442733397497, |
|
"grad_norm": 0.25750601291656494, |
|
"learning_rate": 0.00012198035564034856, |
|
"loss": 0.8125, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 4.870067372473533, |
|
"grad_norm": 0.2467714548110962, |
|
"learning_rate": 0.00012165216920738651, |
|
"loss": 0.8172, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 4.879692011549567, |
|
"grad_norm": 0.25768086314201355, |
|
"learning_rate": 0.000121323737912633, |
|
"loss": 0.8186, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 4.889316650625601, |
|
"grad_norm": 0.2579788863658905, |
|
"learning_rate": 0.00012099506547027826, |
|
"loss": 0.8124, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 4.898941289701636, |
|
"grad_norm": 0.250635027885437, |
|
"learning_rate": 0.00012066615559723961, |
|
"loss": 0.8185, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 4.908565928777671, |
|
"grad_norm": 0.24465559422969818, |
|
"learning_rate": 0.00012033701201311945, |
|
"loss": 0.8246, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 4.9181905678537055, |
|
"grad_norm": 0.24917738139629364, |
|
"learning_rate": 0.00012000763844016321, |
|
"loss": 0.8112, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 4.92781520692974, |
|
"grad_norm": 0.24168001115322113, |
|
"learning_rate": 0.00011967803860321726, |
|
"loss": 0.8169, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 4.937439846005775, |
|
"grad_norm": 0.2604310214519501, |
|
"learning_rate": 0.0001193482162296867, |
|
"loss": 0.8092, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 4.9470644850818095, |
|
"grad_norm": 0.2558085024356842, |
|
"learning_rate": 0.00011901817504949331, |
|
"loss": 0.8226, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 4.956689124157844, |
|
"grad_norm": 0.2698078155517578, |
|
"learning_rate": 0.00011868791879503324, |
|
"loss": 0.8147, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 4.966313763233879, |
|
"grad_norm": 0.268557608127594, |
|
"learning_rate": 0.00011835745120113508, |
|
"loss": 0.8039, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 4.975938402309914, |
|
"grad_norm": 0.25237688422203064, |
|
"learning_rate": 0.00011802677600501725, |
|
"loss": 0.8129, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 4.985563041385948, |
|
"grad_norm": 0.24979138374328613, |
|
"learning_rate": 0.00011769589694624601, |
|
"loss": 0.8222, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 4.995187680461982, |
|
"grad_norm": 0.282382071018219, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.8109, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 4.999037536092397, |
|
"eval_loss": 2.708376407623291, |
|
"eval_runtime": 0.7926, |
|
"eval_samples_per_second": 13.879, |
|
"eval_steps_per_second": 2.523, |
|
"step": 2597 |
|
}, |
|
{ |
|
"epoch": 5.004812319538018, |
|
"grad_norm": 0.23464234173297882, |
|
"learning_rate": 0.00011703354221049318, |
|
"loss": 0.78, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.014436958614052, |
|
"grad_norm": 0.29451891779899597, |
|
"learning_rate": 0.0001167020740240021, |
|
"loss": 0.7451, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 5.024061597690086, |
|
"grad_norm": 0.26757895946502686, |
|
"learning_rate": 0.00011637041695575383, |
|
"loss": 0.7358, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 5.033686236766122, |
|
"grad_norm": 0.27909424901008606, |
|
"learning_rate": 0.00011603857475641846, |
|
"loss": 0.7299, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 5.043310875842156, |
|
"grad_norm": 0.27936622500419617, |
|
"learning_rate": 0.0001157065511787598, |
|
"loss": 0.7264, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 5.0529355149181905, |
|
"grad_norm": 0.2764790952205658, |
|
"learning_rate": 0.0001153743499775927, |
|
"loss": 0.7414, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 5.062560153994225, |
|
"grad_norm": 0.28827911615371704, |
|
"learning_rate": 0.00011504197490974085, |
|
"loss": 0.7344, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 5.07218479307026, |
|
"grad_norm": 0.29319116473197937, |
|
"learning_rate": 0.0001147094297339941, |
|
"loss": 0.7419, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 5.0818094321462945, |
|
"grad_norm": 0.27190330624580383, |
|
"learning_rate": 0.0001143767182110661, |
|
"loss": 0.7384, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 5.091434071222329, |
|
"grad_norm": 0.28567731380462646, |
|
"learning_rate": 0.00011404384410355167, |
|
"loss": 0.7428, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 5.101058710298364, |
|
"grad_norm": 0.27502113580703735, |
|
"learning_rate": 0.00011371081117588417, |
|
"loss": 0.751, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 5.110683349374399, |
|
"grad_norm": 0.2895454168319702, |
|
"learning_rate": 0.00011337762319429326, |
|
"loss": 0.7389, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 5.120307988450433, |
|
"grad_norm": 0.28590232133865356, |
|
"learning_rate": 0.00011304428392676194, |
|
"loss": 0.7351, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 5.129932627526467, |
|
"grad_norm": 0.29666268825531006, |
|
"learning_rate": 0.00011271079714298405, |
|
"loss": 0.7437, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 5.139557266602503, |
|
"grad_norm": 0.2858620584011078, |
|
"learning_rate": 0.00011237716661432181, |
|
"loss": 0.7393, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 5.149181905678537, |
|
"grad_norm": 0.29355934262275696, |
|
"learning_rate": 0.00011204339611376291, |
|
"loss": 0.7429, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 5.1588065447545715, |
|
"grad_norm": 0.31132546067237854, |
|
"learning_rate": 0.00011170948941587805, |
|
"loss": 0.7477, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 5.168431183830607, |
|
"grad_norm": 0.2698726952075958, |
|
"learning_rate": 0.00011137545029677809, |
|
"loss": 0.7453, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 5.178055822906641, |
|
"grad_norm": 0.2867010533809662, |
|
"learning_rate": 0.0001110412825340715, |
|
"loss": 0.7375, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 5.1876804619826755, |
|
"grad_norm": 0.2847628593444824, |
|
"learning_rate": 0.00011070698990682156, |
|
"loss": 0.7492, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 5.19730510105871, |
|
"grad_norm": 0.29182639718055725, |
|
"learning_rate": 0.00011037257619550352, |
|
"loss": 0.7399, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.206929740134745, |
|
"grad_norm": 0.30024299025535583, |
|
"learning_rate": 0.0001100380451819621, |
|
"loss": 0.7509, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 5.21655437921078, |
|
"grad_norm": 0.2791791260242462, |
|
"learning_rate": 0.00010970340064936853, |
|
"loss": 0.7515, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 5.226179018286814, |
|
"grad_norm": 0.3051394522190094, |
|
"learning_rate": 0.00010936864638217776, |
|
"loss": 0.7458, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 5.235803657362849, |
|
"grad_norm": 0.2900809049606323, |
|
"learning_rate": 0.00010903378616608573, |
|
"loss": 0.7433, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 5.245428296438884, |
|
"grad_norm": 0.2843543291091919, |
|
"learning_rate": 0.00010869882378798663, |
|
"loss": 0.7454, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 5.255052935514918, |
|
"grad_norm": 0.30490627884864807, |
|
"learning_rate": 0.0001083637630359299, |
|
"loss": 0.7461, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 5.264677574590952, |
|
"grad_norm": 0.28951317071914673, |
|
"learning_rate": 0.00010802860769907748, |
|
"loss": 0.7496, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 5.274302213666988, |
|
"grad_norm": 0.2910211980342865, |
|
"learning_rate": 0.00010769336156766101, |
|
"loss": 0.7465, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 5.283926852743022, |
|
"grad_norm": 0.2923705279827118, |
|
"learning_rate": 0.00010735802843293888, |
|
"loss": 0.7409, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 5.2935514918190565, |
|
"grad_norm": 0.2950255274772644, |
|
"learning_rate": 0.0001070226120871534, |
|
"loss": 0.7471, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 5.303176130895092, |
|
"grad_norm": 0.29950594902038574, |
|
"learning_rate": 0.00010668711632348787, |
|
"loss": 0.7417, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 5.312800769971126, |
|
"grad_norm": 0.28509971499443054, |
|
"learning_rate": 0.0001063515449360238, |
|
"loss": 0.7515, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 5.3224254090471605, |
|
"grad_norm": 0.3036702871322632, |
|
"learning_rate": 0.00010601590171969782, |
|
"loss": 0.7395, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 5.332050048123195, |
|
"grad_norm": 0.2864932119846344, |
|
"learning_rate": 0.00010568019047025893, |
|
"loss": 0.7473, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 5.34167468719923, |
|
"grad_norm": 0.29944750666618347, |
|
"learning_rate": 0.00010534441498422552, |
|
"loss": 0.7454, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 5.351299326275265, |
|
"grad_norm": 0.2880357503890991, |
|
"learning_rate": 0.00010500857905884233, |
|
"loss": 0.7455, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 5.360923965351299, |
|
"grad_norm": 0.2973332107067108, |
|
"learning_rate": 0.00010467268649203774, |
|
"loss": 0.7607, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 5.370548604427334, |
|
"grad_norm": 0.28307193517684937, |
|
"learning_rate": 0.00010433674108238059, |
|
"loss": 0.7522, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 5.380173243503369, |
|
"grad_norm": 0.29455214738845825, |
|
"learning_rate": 0.00010400074662903729, |
|
"loss": 0.748, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 5.389797882579403, |
|
"grad_norm": 0.2844898998737335, |
|
"learning_rate": 0.00010366470693172896, |
|
"loss": 0.7523, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.399422521655438, |
|
"grad_norm": 0.29002171754837036, |
|
"learning_rate": 0.0001033286257906883, |
|
"loss": 0.7493, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 5.409047160731473, |
|
"grad_norm": 0.2987057566642761, |
|
"learning_rate": 0.00010299250700661678, |
|
"loss": 0.7436, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 5.418671799807507, |
|
"grad_norm": 0.2843535244464874, |
|
"learning_rate": 0.00010265635438064145, |
|
"loss": 0.7418, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 5.4282964388835415, |
|
"grad_norm": 0.2849405109882355, |
|
"learning_rate": 0.00010232017171427223, |
|
"loss": 0.7489, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 5.437921077959577, |
|
"grad_norm": 0.2880149781703949, |
|
"learning_rate": 0.00010198396280935866, |
|
"loss": 0.7563, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 5.447545717035611, |
|
"grad_norm": 0.31955039501190186, |
|
"learning_rate": 0.00010164773146804696, |
|
"loss": 0.7544, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 5.457170356111646, |
|
"grad_norm": 0.32376500964164734, |
|
"learning_rate": 0.00010131148149273723, |
|
"loss": 0.7469, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 5.466794995187681, |
|
"grad_norm": 0.2932649254798889, |
|
"learning_rate": 0.00010097521668604015, |
|
"loss": 0.7548, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 5.476419634263715, |
|
"grad_norm": 0.29111912846565247, |
|
"learning_rate": 0.00010063894085073424, |
|
"loss": 0.7517, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 5.48604427333975, |
|
"grad_norm": 0.30052244663238525, |
|
"learning_rate": 0.0001003026577897227, |
|
"loss": 0.7468, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 5.495668912415784, |
|
"grad_norm": 0.27394363284111023, |
|
"learning_rate": 9.99663713059904e-05, |
|
"loss": 0.7545, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 5.505293551491819, |
|
"grad_norm": 0.29920995235443115, |
|
"learning_rate": 9.9630085202561e-05, |
|
"loss": 0.7578, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 5.514918190567854, |
|
"grad_norm": 0.301736056804657, |
|
"learning_rate": 9.929380328245378e-05, |
|
"loss": 0.7474, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 5.524542829643888, |
|
"grad_norm": 0.29692158102989197, |
|
"learning_rate": 9.895752934864078e-05, |
|
"loss": 0.7529, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 5.534167468719923, |
|
"grad_norm": 0.28971490263938904, |
|
"learning_rate": 9.862126720400364e-05, |
|
"loss": 0.7533, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 5.543792107795958, |
|
"grad_norm": 0.2907530665397644, |
|
"learning_rate": 9.828502065129076e-05, |
|
"loss": 0.7488, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 5.553416746871992, |
|
"grad_norm": 0.28844624757766724, |
|
"learning_rate": 9.794879349307419e-05, |
|
"loss": 0.7534, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 5.563041385948027, |
|
"grad_norm": 0.29487764835357666, |
|
"learning_rate": 9.761258953170667e-05, |
|
"loss": 0.7503, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 5.572666025024062, |
|
"grad_norm": 0.29256966710090637, |
|
"learning_rate": 9.72764125692785e-05, |
|
"loss": 0.7516, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 5.582290664100096, |
|
"grad_norm": 0.2992061376571655, |
|
"learning_rate": 9.694026640757481e-05, |
|
"loss": 0.7544, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.591915303176131, |
|
"grad_norm": 0.28604987263679504, |
|
"learning_rate": 9.660415484803226e-05, |
|
"loss": 0.7484, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 5.601539942252166, |
|
"grad_norm": 0.28531113266944885, |
|
"learning_rate": 9.626808169169634e-05, |
|
"loss": 0.7437, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 5.6111645813282, |
|
"grad_norm": 0.2726121246814728, |
|
"learning_rate": 9.593205073917817e-05, |
|
"loss": 0.7589, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 5.620789220404235, |
|
"grad_norm": 0.29796725511550903, |
|
"learning_rate": 9.559606579061154e-05, |
|
"loss": 0.7524, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 5.630413859480269, |
|
"grad_norm": 0.3006713390350342, |
|
"learning_rate": 9.526013064561006e-05, |
|
"loss": 0.765, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 5.640038498556304, |
|
"grad_norm": 0.30542224645614624, |
|
"learning_rate": 9.492424910322413e-05, |
|
"loss": 0.7545, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 5.649663137632339, |
|
"grad_norm": 0.2783224284648895, |
|
"learning_rate": 9.458842496189789e-05, |
|
"loss": 0.7493, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 5.659287776708373, |
|
"grad_norm": 0.3057067394256592, |
|
"learning_rate": 9.425266201942645e-05, |
|
"loss": 0.7668, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 5.668912415784408, |
|
"grad_norm": 0.29461607336997986, |
|
"learning_rate": 9.391696407291269e-05, |
|
"loss": 0.7544, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 5.678537054860443, |
|
"grad_norm": 0.2968499958515167, |
|
"learning_rate": 9.358133491872453e-05, |
|
"loss": 0.7508, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 5.688161693936477, |
|
"grad_norm": 0.3040287494659424, |
|
"learning_rate": 9.324577835245197e-05, |
|
"loss": 0.7618, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 5.6977863330125125, |
|
"grad_norm": 0.29871127009391785, |
|
"learning_rate": 9.291029816886405e-05, |
|
"loss": 0.7537, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 5.707410972088547, |
|
"grad_norm": 0.2989570200443268, |
|
"learning_rate": 9.257489816186606e-05, |
|
"loss": 0.7472, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 5.717035611164581, |
|
"grad_norm": 0.2932529151439667, |
|
"learning_rate": 9.223958212445656e-05, |
|
"loss": 0.7488, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 5.726660250240616, |
|
"grad_norm": 0.29275083541870117, |
|
"learning_rate": 9.190435384868448e-05, |
|
"loss": 0.7532, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 5.736284889316651, |
|
"grad_norm": 0.2811647057533264, |
|
"learning_rate": 9.156921712560626e-05, |
|
"loss": 0.7597, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 5.745909528392685, |
|
"grad_norm": 0.2994243800640106, |
|
"learning_rate": 9.123417574524307e-05, |
|
"loss": 0.7494, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 5.75553416746872, |
|
"grad_norm": 0.29861563444137573, |
|
"learning_rate": 9.089923349653776e-05, |
|
"loss": 0.7513, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 5.765158806544754, |
|
"grad_norm": 0.27614521980285645, |
|
"learning_rate": 9.056439416731223e-05, |
|
"loss": 0.7498, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 5.774783445620789, |
|
"grad_norm": 0.29117491841316223, |
|
"learning_rate": 9.02296615442243e-05, |
|
"loss": 0.7571, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.784408084696824, |
|
"grad_norm": 0.32449835538864136, |
|
"learning_rate": 8.989503941272522e-05, |
|
"loss": 0.7575, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 5.794032723772858, |
|
"grad_norm": 0.2920955419540405, |
|
"learning_rate": 8.956053155701661e-05, |
|
"loss": 0.7445, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 5.803657362848893, |
|
"grad_norm": 0.3088265061378479, |
|
"learning_rate": 8.922614176000783e-05, |
|
"loss": 0.7534, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 5.813282001924928, |
|
"grad_norm": 0.3056049942970276, |
|
"learning_rate": 8.889187380327312e-05, |
|
"loss": 0.7548, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 5.822906641000962, |
|
"grad_norm": 0.28941500186920166, |
|
"learning_rate": 8.855773146700872e-05, |
|
"loss": 0.7485, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 5.8325312800769975, |
|
"grad_norm": 0.2886408269405365, |
|
"learning_rate": 8.82237185299904e-05, |
|
"loss": 0.7422, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 5.842155919153032, |
|
"grad_norm": 0.2928673028945923, |
|
"learning_rate": 8.788983876953051e-05, |
|
"loss": 0.7551, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 5.851780558229066, |
|
"grad_norm": 0.3021661043167114, |
|
"learning_rate": 8.755609596143534e-05, |
|
"loss": 0.7445, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 5.861405197305101, |
|
"grad_norm": 0.2965797781944275, |
|
"learning_rate": 8.722249387996237e-05, |
|
"loss": 0.7502, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 5.871029836381136, |
|
"grad_norm": 0.3059804141521454, |
|
"learning_rate": 8.688903629777762e-05, |
|
"loss": 0.7544, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 5.88065447545717, |
|
"grad_norm": 0.2819983661174774, |
|
"learning_rate": 8.655572698591297e-05, |
|
"loss": 0.7611, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 5.890279114533205, |
|
"grad_norm": 0.297858327627182, |
|
"learning_rate": 8.62225697137236e-05, |
|
"loss": 0.7526, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 5.89990375360924, |
|
"grad_norm": 0.2882884740829468, |
|
"learning_rate": 8.588956824884523e-05, |
|
"loss": 0.762, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 5.909528392685274, |
|
"grad_norm": 0.31062471866607666, |
|
"learning_rate": 8.555672635715162e-05, |
|
"loss": 0.7537, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 5.919153031761309, |
|
"grad_norm": 0.30393049120903015, |
|
"learning_rate": 8.522404780271186e-05, |
|
"loss": 0.75, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 5.928777670837343, |
|
"grad_norm": 0.2902856469154358, |
|
"learning_rate": 8.489153634774796e-05, |
|
"loss": 0.7459, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 5.9384023099133785, |
|
"grad_norm": 0.2876073718070984, |
|
"learning_rate": 8.455919575259217e-05, |
|
"loss": 0.7541, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 5.948026948989413, |
|
"grad_norm": 0.3035559058189392, |
|
"learning_rate": 8.422702977564453e-05, |
|
"loss": 0.7564, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 5.957651588065447, |
|
"grad_norm": 0.2893913984298706, |
|
"learning_rate": 8.389504217333039e-05, |
|
"loss": 0.749, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 5.9672762271414825, |
|
"grad_norm": 0.2977910041809082, |
|
"learning_rate": 8.356323670005772e-05, |
|
"loss": 0.7509, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 5.976900866217517, |
|
"grad_norm": 0.27759596705436707, |
|
"learning_rate": 8.3231617108175e-05, |
|
"loss": 0.7623, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 5.986525505293551, |
|
"grad_norm": 0.30392059683799744, |
|
"learning_rate": 8.290018714792852e-05, |
|
"loss": 0.7565, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 5.996150144369587, |
|
"grad_norm": 0.2790631949901581, |
|
"learning_rate": 8.256895056742006e-05, |
|
"loss": 0.7513, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.9357750415802, |
|
"eval_runtime": 0.7796, |
|
"eval_samples_per_second": 14.111, |
|
"eval_steps_per_second": 2.566, |
|
"step": 3117 |
|
}, |
|
{ |
|
"epoch": 6.005774783445621, |
|
"grad_norm": 0.27328184247016907, |
|
"learning_rate": 8.223791111256447e-05, |
|
"loss": 0.7169, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 6.015399422521655, |
|
"grad_norm": 0.3284066319465637, |
|
"learning_rate": 8.190707252704736e-05, |
|
"loss": 0.6924, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 6.02502406159769, |
|
"grad_norm": 0.30812135338783264, |
|
"learning_rate": 8.157643855228267e-05, |
|
"loss": 0.6785, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 6.034648700673725, |
|
"grad_norm": 0.3338078558444977, |
|
"learning_rate": 8.12460129273705e-05, |
|
"loss": 0.6847, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 6.044273339749759, |
|
"grad_norm": 0.3224867284297943, |
|
"learning_rate": 8.091579938905474e-05, |
|
"loss": 0.6756, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 6.053897978825794, |
|
"grad_norm": 0.317451149225235, |
|
"learning_rate": 8.05858016716808e-05, |
|
"loss": 0.6758, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 6.063522617901829, |
|
"grad_norm": 0.29282692074775696, |
|
"learning_rate": 8.025602350715332e-05, |
|
"loss": 0.687, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 6.0731472569778635, |
|
"grad_norm": 0.3204721510410309, |
|
"learning_rate": 7.992646862489417e-05, |
|
"loss": 0.6808, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 6.082771896053898, |
|
"grad_norm": 0.3063673675060272, |
|
"learning_rate": 7.959714075180008e-05, |
|
"loss": 0.6764, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 6.092396535129932, |
|
"grad_norm": 0.3125745356082916, |
|
"learning_rate": 7.926804361220055e-05, |
|
"loss": 0.6852, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 6.102021174205968, |
|
"grad_norm": 0.31588083505630493, |
|
"learning_rate": 7.893918092781583e-05, |
|
"loss": 0.6805, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 6.111645813282002, |
|
"grad_norm": 0.3146851062774658, |
|
"learning_rate": 7.861055641771459e-05, |
|
"loss": 0.6862, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 6.121270452358036, |
|
"grad_norm": 0.33888891339302063, |
|
"learning_rate": 7.828217379827215e-05, |
|
"loss": 0.6943, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 6.130895091434072, |
|
"grad_norm": 0.33557072281837463, |
|
"learning_rate": 7.79540367831283e-05, |
|
"loss": 0.6936, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 6.140519730510106, |
|
"grad_norm": 0.33382484316825867, |
|
"learning_rate": 7.762614908314521e-05, |
|
"loss": 0.6935, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 6.15014436958614, |
|
"grad_norm": 0.31766244769096375, |
|
"learning_rate": 7.729851440636575e-05, |
|
"loss": 0.6927, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 6.159769008662175, |
|
"grad_norm": 0.3161802291870117, |
|
"learning_rate": 7.69711364579712e-05, |
|
"loss": 0.6902, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.16939364773821, |
|
"grad_norm": 0.31405240297317505, |
|
"learning_rate": 7.664401894023967e-05, |
|
"loss": 0.6824, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 6.1790182868142445, |
|
"grad_norm": 0.31615492701530457, |
|
"learning_rate": 7.6317165552504e-05, |
|
"loss": 0.6893, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 6.188642925890279, |
|
"grad_norm": 0.3123544454574585, |
|
"learning_rate": 7.59905799911101e-05, |
|
"loss": 0.6788, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 6.198267564966314, |
|
"grad_norm": 0.3448927402496338, |
|
"learning_rate": 7.566426594937503e-05, |
|
"loss": 0.6829, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 6.2078922040423485, |
|
"grad_norm": 0.2904527187347412, |
|
"learning_rate": 7.533822711754515e-05, |
|
"loss": 0.6953, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 6.217516843118383, |
|
"grad_norm": 0.31403473019599915, |
|
"learning_rate": 7.501246718275471e-05, |
|
"loss": 0.6819, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 6.227141482194417, |
|
"grad_norm": 0.31581783294677734, |
|
"learning_rate": 7.468698982898382e-05, |
|
"loss": 0.6838, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 6.236766121270453, |
|
"grad_norm": 0.3196973204612732, |
|
"learning_rate": 7.436179873701688e-05, |
|
"loss": 0.687, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 6.246390760346487, |
|
"grad_norm": 0.3196184039115906, |
|
"learning_rate": 7.403689758440115e-05, |
|
"loss": 0.6897, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 6.256015399422521, |
|
"grad_norm": 0.32126832008361816, |
|
"learning_rate": 7.371229004540481e-05, |
|
"loss": 0.6954, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 6.265640038498557, |
|
"grad_norm": 0.3566059470176697, |
|
"learning_rate": 7.338797979097571e-05, |
|
"loss": 0.698, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 6.275264677574591, |
|
"grad_norm": 0.3231862783432007, |
|
"learning_rate": 7.306397048869977e-05, |
|
"loss": 0.6864, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 6.2848893166506254, |
|
"grad_norm": 0.3360905945301056, |
|
"learning_rate": 7.274026580275937e-05, |
|
"loss": 0.6981, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 6.29451395572666, |
|
"grad_norm": 0.30905240774154663, |
|
"learning_rate": 7.241686939389214e-05, |
|
"loss": 0.6839, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 6.304138594802695, |
|
"grad_norm": 0.29758358001708984, |
|
"learning_rate": 7.20937849193493e-05, |
|
"loss": 0.6899, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 6.3137632338787295, |
|
"grad_norm": 0.32738837599754333, |
|
"learning_rate": 7.177101603285458e-05, |
|
"loss": 0.6907, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 6.323387872954764, |
|
"grad_norm": 0.30813169479370117, |
|
"learning_rate": 7.144856638456272e-05, |
|
"loss": 0.6919, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 6.333012512030799, |
|
"grad_norm": 0.340621680021286, |
|
"learning_rate": 7.112643962101817e-05, |
|
"loss": 0.6884, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 6.342637151106834, |
|
"grad_norm": 0.3451749384403229, |
|
"learning_rate": 7.080463938511405e-05, |
|
"loss": 0.6937, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 6.352261790182868, |
|
"grad_norm": 0.32087814807891846, |
|
"learning_rate": 7.048316931605062e-05, |
|
"loss": 0.6929, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 6.361886429258902, |
|
"grad_norm": 0.30795004963874817, |
|
"learning_rate": 7.016203304929451e-05, |
|
"loss": 0.6983, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 6.371511068334938, |
|
"grad_norm": 0.3312138617038727, |
|
"learning_rate": 6.984123421653733e-05, |
|
"loss": 0.6845, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 6.381135707410972, |
|
"grad_norm": 0.3371661901473999, |
|
"learning_rate": 6.952077644565469e-05, |
|
"loss": 0.6898, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 6.390760346487006, |
|
"grad_norm": 0.3481803834438324, |
|
"learning_rate": 6.920066336066524e-05, |
|
"loss": 0.6912, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 6.400384985563042, |
|
"grad_norm": 0.32163578271865845, |
|
"learning_rate": 6.888089858168949e-05, |
|
"loss": 0.6901, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 6.410009624639076, |
|
"grad_norm": 0.3223172128200531, |
|
"learning_rate": 6.85614857249091e-05, |
|
"loss": 0.6944, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 6.4196342637151105, |
|
"grad_norm": 0.30212926864624023, |
|
"learning_rate": 6.824242840252588e-05, |
|
"loss": 0.7016, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 6.429258902791146, |
|
"grad_norm": 0.32831230759620667, |
|
"learning_rate": 6.79237302227209e-05, |
|
"loss": 0.6869, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 6.43888354186718, |
|
"grad_norm": 0.3248232305049896, |
|
"learning_rate": 6.76053947896138e-05, |
|
"loss": 0.6945, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 6.4485081809432145, |
|
"grad_norm": 0.3347261846065521, |
|
"learning_rate": 6.728742570322181e-05, |
|
"loss": 0.6911, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 6.458132820019249, |
|
"grad_norm": 0.3434222936630249, |
|
"learning_rate": 6.69698265594194e-05, |
|
"loss": 0.7001, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 6.467757459095284, |
|
"grad_norm": 0.31891781091690063, |
|
"learning_rate": 6.66526009498972e-05, |
|
"loss": 0.6961, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 6.477382098171319, |
|
"grad_norm": 0.32785654067993164, |
|
"learning_rate": 6.633575246212175e-05, |
|
"loss": 0.6986, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 6.487006737247353, |
|
"grad_norm": 0.3148154020309448, |
|
"learning_rate": 6.601928467929472e-05, |
|
"loss": 0.6857, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 6.496631376323388, |
|
"grad_norm": 0.3220577836036682, |
|
"learning_rate": 6.570320118031232e-05, |
|
"loss": 0.6933, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 6.506256015399423, |
|
"grad_norm": 0.3030003309249878, |
|
"learning_rate": 6.538750553972509e-05, |
|
"loss": 0.6963, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 6.515880654475457, |
|
"grad_norm": 0.32863059639930725, |
|
"learning_rate": 6.507220132769723e-05, |
|
"loss": 0.6929, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 6.5255052935514914, |
|
"grad_norm": 0.35064488649368286, |
|
"learning_rate": 6.475729210996637e-05, |
|
"loss": 0.6864, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 6.535129932627527, |
|
"grad_norm": 0.32089149951934814, |
|
"learning_rate": 6.444278144780325e-05, |
|
"loss": 0.6858, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 6.544754571703561, |
|
"grad_norm": 0.4273422658443451, |
|
"learning_rate": 6.41286728979712e-05, |
|
"loss": 0.6968, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.5543792107795955, |
|
"grad_norm": 0.33466604351997375, |
|
"learning_rate": 6.38149700126863e-05, |
|
"loss": 0.6966, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 6.564003849855631, |
|
"grad_norm": 0.3052511513233185, |
|
"learning_rate": 6.350167633957698e-05, |
|
"loss": 0.6983, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 6.573628488931665, |
|
"grad_norm": 0.3621208071708679, |
|
"learning_rate": 6.318879542164385e-05, |
|
"loss": 0.6986, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 6.5832531280077, |
|
"grad_norm": 0.32712018489837646, |
|
"learning_rate": 6.287633079721986e-05, |
|
"loss": 0.6927, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 6.592877767083735, |
|
"grad_norm": 0.3064589202404022, |
|
"learning_rate": 6.256428599993e-05, |
|
"loss": 0.6995, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 6.602502406159769, |
|
"grad_norm": 0.3126335144042969, |
|
"learning_rate": 6.225266455865157e-05, |
|
"loss": 0.6985, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 6.612127045235804, |
|
"grad_norm": 0.35115116834640503, |
|
"learning_rate": 6.194146999747419e-05, |
|
"loss": 0.6918, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 6.621751684311838, |
|
"grad_norm": 0.32435253262519836, |
|
"learning_rate": 6.163070583565993e-05, |
|
"loss": 0.6988, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 6.631376323387873, |
|
"grad_norm": 0.3202888071537018, |
|
"learning_rate": 6.13203755876035e-05, |
|
"loss": 0.6895, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 6.641000962463908, |
|
"grad_norm": 0.3102019131183624, |
|
"learning_rate": 6.1010482762792585e-05, |
|
"loss": 0.6923, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 6.650625601539942, |
|
"grad_norm": 0.3367016911506653, |
|
"learning_rate": 6.070103086576802e-05, |
|
"loss": 0.6915, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 6.6602502406159765, |
|
"grad_norm": 0.3353261351585388, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 0.687, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 6.669874879692012, |
|
"grad_norm": 0.30828601121902466, |
|
"learning_rate": 6.0083463848269995e-05, |
|
"loss": 0.6934, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 6.679499518768046, |
|
"grad_norm": 0.3269566595554352, |
|
"learning_rate": 5.977535571178809e-05, |
|
"loss": 0.6967, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 6.6891241578440805, |
|
"grad_norm": 0.339278906583786, |
|
"learning_rate": 5.946770247099661e-05, |
|
"loss": 0.691, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 6.698748796920116, |
|
"grad_norm": 0.33345827460289, |
|
"learning_rate": 5.9160507605109275e-05, |
|
"loss": 0.7039, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 6.70837343599615, |
|
"grad_norm": 0.318852037191391, |
|
"learning_rate": 5.885377458815609e-05, |
|
"loss": 0.7019, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 6.717998075072185, |
|
"grad_norm": 0.3394601047039032, |
|
"learning_rate": 5.8547506888944007e-05, |
|
"loss": 0.6881, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 6.72762271414822, |
|
"grad_norm": 0.32474079728126526, |
|
"learning_rate": 5.824170797101787e-05, |
|
"loss": 0.6879, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 6.737247353224254, |
|
"grad_norm": 0.325595885515213, |
|
"learning_rate": 5.7936381292621e-05, |
|
"loss": 0.6951, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 6.746871992300289, |
|
"grad_norm": 0.3558216989040375, |
|
"learning_rate": 5.763153030665629e-05, |
|
"loss": 0.6947, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 6.756496631376323, |
|
"grad_norm": 0.3530566692352295, |
|
"learning_rate": 5.7327158460647065e-05, |
|
"loss": 0.6986, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 6.766121270452358, |
|
"grad_norm": 0.33962172269821167, |
|
"learning_rate": 5.702326919669817e-05, |
|
"loss": 0.6964, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 6.775745909528393, |
|
"grad_norm": 0.3108658790588379, |
|
"learning_rate": 5.671986595145693e-05, |
|
"loss": 0.6923, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 6.785370548604427, |
|
"grad_norm": 0.32073214650154114, |
|
"learning_rate": 5.64169521560743e-05, |
|
"loss": 0.6792, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 6.7949951876804615, |
|
"grad_norm": 0.3249306380748749, |
|
"learning_rate": 5.611453123616618e-05, |
|
"loss": 0.7013, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 6.804619826756497, |
|
"grad_norm": 0.333997905254364, |
|
"learning_rate": 5.581260661177463e-05, |
|
"loss": 0.6923, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 6.814244465832531, |
|
"grad_norm": 0.3433645963668823, |
|
"learning_rate": 5.551118169732901e-05, |
|
"loss": 0.7014, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 6.823869104908566, |
|
"grad_norm": 0.3301408886909485, |
|
"learning_rate": 5.521025990160772e-05, |
|
"loss": 0.6966, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 6.833493743984601, |
|
"grad_norm": 0.341169148683548, |
|
"learning_rate": 5.4909844627699255e-05, |
|
"loss": 0.6963, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 6.843118383060635, |
|
"grad_norm": 0.31754934787750244, |
|
"learning_rate": 5.460993927296407e-05, |
|
"loss": 0.6996, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 6.85274302213667, |
|
"grad_norm": 0.3002949655056, |
|
"learning_rate": 5.4310547228995936e-05, |
|
"loss": 0.6946, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 6.862367661212705, |
|
"grad_norm": 0.3369508981704712, |
|
"learning_rate": 5.4011671881583656e-05, |
|
"loss": 0.6902, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 6.871992300288739, |
|
"grad_norm": 0.3112001419067383, |
|
"learning_rate": 5.371331661067284e-05, |
|
"loss": 0.6935, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 6.881616939364774, |
|
"grad_norm": 0.3145786225795746, |
|
"learning_rate": 5.341548479032745e-05, |
|
"loss": 0.7027, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 6.891241578440808, |
|
"grad_norm": 0.32883113622665405, |
|
"learning_rate": 5.311817978869198e-05, |
|
"loss": 0.6928, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 6.900866217516843, |
|
"grad_norm": 0.3237265646457672, |
|
"learning_rate": 5.2821404967953114e-05, |
|
"loss": 0.6865, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 6.910490856592878, |
|
"grad_norm": 0.32935890555381775, |
|
"learning_rate": 5.2525163684301806e-05, |
|
"loss": 0.687, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 6.920115495668912, |
|
"grad_norm": 0.342359721660614, |
|
"learning_rate": 5.222945928789533e-05, |
|
"loss": 0.691, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 6.929740134744947, |
|
"grad_norm": 0.3421998620033264, |
|
"learning_rate": 5.193429512281926e-05, |
|
"loss": 0.6863, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 6.939364773820982, |
|
"grad_norm": 0.33589935302734375, |
|
"learning_rate": 5.1639674527049855e-05, |
|
"loss": 0.6916, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 6.948989412897016, |
|
"grad_norm": 0.3499864637851715, |
|
"learning_rate": 5.134560083241624e-05, |
|
"loss": 0.6878, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 6.958614051973051, |
|
"grad_norm": 0.3289993405342102, |
|
"learning_rate": 5.105207736456257e-05, |
|
"loss": 0.6976, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 6.968238691049086, |
|
"grad_norm": 0.32949408888816833, |
|
"learning_rate": 5.0759107442910715e-05, |
|
"loss": 0.6949, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 6.97786333012512, |
|
"grad_norm": 0.3234226703643799, |
|
"learning_rate": 5.046669438062238e-05, |
|
"loss": 0.6958, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 6.987487969201155, |
|
"grad_norm": 0.3094496726989746, |
|
"learning_rate": 5.0174841484561953e-05, |
|
"loss": 0.6938, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 6.99711260827719, |
|
"grad_norm": 0.31556159257888794, |
|
"learning_rate": 4.988355205525893e-05, |
|
"loss": 0.7004, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 6.999037536092397, |
|
"eval_loss": 3.276942253112793, |
|
"eval_runtime": 0.7888, |
|
"eval_samples_per_second": 13.945, |
|
"eval_steps_per_second": 2.535, |
|
"step": 3636 |
|
}, |
|
{ |
|
"epoch": 7.006737247353224, |
|
"grad_norm": 0.26794806122779846, |
|
"learning_rate": 4.959282938687061e-05, |
|
"loss": 0.6482, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 7.016361886429259, |
|
"grad_norm": 0.3672392666339874, |
|
"learning_rate": 4.9302676767144926e-05, |
|
"loss": 0.6471, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 7.025986525505293, |
|
"grad_norm": 0.2901393175125122, |
|
"learning_rate": 4.901309747738305e-05, |
|
"loss": 0.654, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 7.035611164581328, |
|
"grad_norm": 0.3516036868095398, |
|
"learning_rate": 4.872409479240259e-05, |
|
"loss": 0.6452, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 7.045235803657363, |
|
"grad_norm": 0.3640913665294647, |
|
"learning_rate": 4.843567198050031e-05, |
|
"loss": 0.6369, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 7.054860442733397, |
|
"grad_norm": 0.2963874936103821, |
|
"learning_rate": 4.814783230341531e-05, |
|
"loss": 0.6353, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 7.0644850818094325, |
|
"grad_norm": 0.3295438587665558, |
|
"learning_rate": 4.786057901629209e-05, |
|
"loss": 0.6398, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 7.074109720885467, |
|
"grad_norm": 0.3382556736469269, |
|
"learning_rate": 4.757391536764366e-05, |
|
"loss": 0.6452, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 7.083734359961501, |
|
"grad_norm": 0.3277692496776581, |
|
"learning_rate": 4.728784459931495e-05, |
|
"loss": 0.637, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 7.0933589990375365, |
|
"grad_norm": 0.3565356433391571, |
|
"learning_rate": 4.700236994644609e-05, |
|
"loss": 0.6379, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 7.102983638113571, |
|
"grad_norm": 0.35193830728530884, |
|
"learning_rate": 4.671749463743572e-05, |
|
"loss": 0.6512, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 7.112608277189605, |
|
"grad_norm": 0.32000118494033813, |
|
"learning_rate": 4.64332218939047e-05, |
|
"loss": 0.6445, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 7.12223291626564, |
|
"grad_norm": 0.33006584644317627, |
|
"learning_rate": 4.61495549306594e-05, |
|
"loss": 0.6381, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 7.131857555341675, |
|
"grad_norm": 0.3775092661380768, |
|
"learning_rate": 4.586649695565563e-05, |
|
"loss": 0.6331, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 7.141482194417709, |
|
"grad_norm": 0.3325980007648468, |
|
"learning_rate": 4.558405116996214e-05, |
|
"loss": 0.6436, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 7.151106833493744, |
|
"grad_norm": 0.3391129970550537, |
|
"learning_rate": 4.530222076772456e-05, |
|
"loss": 0.6415, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 7.160731472569779, |
|
"grad_norm": 0.31919702887535095, |
|
"learning_rate": 4.5021008936129216e-05, |
|
"loss": 0.6441, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 7.170356111645813, |
|
"grad_norm": 0.3420950770378113, |
|
"learning_rate": 4.4740418855367005e-05, |
|
"loss": 0.6524, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 7.179980750721848, |
|
"grad_norm": 0.354056179523468, |
|
"learning_rate": 4.4460453698597623e-05, |
|
"loss": 0.6476, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 7.189605389797882, |
|
"grad_norm": 0.31593650579452515, |
|
"learning_rate": 4.418111663191354e-05, |
|
"loss": 0.6473, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 7.1992300288739175, |
|
"grad_norm": 0.33761167526245117, |
|
"learning_rate": 4.390241081430423e-05, |
|
"loss": 0.6402, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 7.208854667949952, |
|
"grad_norm": 0.35358771681785583, |
|
"learning_rate": 4.362433939762046e-05, |
|
"loss": 0.6471, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 7.218479307025986, |
|
"grad_norm": 0.32182127237319946, |
|
"learning_rate": 4.3346905526538574e-05, |
|
"loss": 0.6408, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 7.228103946102022, |
|
"grad_norm": 0.3282702565193176, |
|
"learning_rate": 4.307011233852505e-05, |
|
"loss": 0.642, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 7.237728585178056, |
|
"grad_norm": 0.33513620495796204, |
|
"learning_rate": 4.279396296380097e-05, |
|
"loss": 0.6391, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 7.24735322425409, |
|
"grad_norm": 0.33494138717651367, |
|
"learning_rate": 4.2518460525306524e-05, |
|
"loss": 0.6401, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 7.256977863330125, |
|
"grad_norm": 0.33716508746147156, |
|
"learning_rate": 4.2243608138665906e-05, |
|
"loss": 0.6499, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 7.26660250240616, |
|
"grad_norm": 0.3404597043991089, |
|
"learning_rate": 4.19694089121518e-05, |
|
"loss": 0.6385, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 7.276227141482194, |
|
"grad_norm": 0.32999253273010254, |
|
"learning_rate": 4.169586594665048e-05, |
|
"loss": 0.6433, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 7.285851780558229, |
|
"grad_norm": 0.3411442041397095, |
|
"learning_rate": 4.142298233562664e-05, |
|
"loss": 0.6422, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 7.295476419634264, |
|
"grad_norm": 0.3550765812397003, |
|
"learning_rate": 4.115076116508837e-05, |
|
"loss": 0.6458, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 7.3051010587102985, |
|
"grad_norm": 0.3416723608970642, |
|
"learning_rate": 4.08792055135524e-05, |
|
"loss": 0.6456, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 7.314725697786333, |
|
"grad_norm": 0.35609087347984314, |
|
"learning_rate": 4.0608318452009e-05, |
|
"loss": 0.6533, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 7.324350336862367, |
|
"grad_norm": 0.332507461309433, |
|
"learning_rate": 4.033810304388759e-05, |
|
"loss": 0.6282, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 7.3339749759384025, |
|
"grad_norm": 0.34344714879989624, |
|
"learning_rate": 4.006856234502191e-05, |
|
"loss": 0.633, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 7.343599615014437, |
|
"grad_norm": 0.3543119430541992, |
|
"learning_rate": 3.9799699403615457e-05, |
|
"loss": 0.6417, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 7.353224254090471, |
|
"grad_norm": 0.3393097221851349, |
|
"learning_rate": 3.953151726020713e-05, |
|
"loss": 0.6337, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 7.362848893166507, |
|
"grad_norm": 0.34601929783821106, |
|
"learning_rate": 3.926401894763663e-05, |
|
"loss": 0.6514, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 7.372473532242541, |
|
"grad_norm": 0.3476494550704956, |
|
"learning_rate": 3.89972074910104e-05, |
|
"loss": 0.6381, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 7.382098171318575, |
|
"grad_norm": 0.3308873474597931, |
|
"learning_rate": 3.8731085907667345e-05, |
|
"loss": 0.6523, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 7.39172281039461, |
|
"grad_norm": 0.33746767044067383, |
|
"learning_rate": 3.846565720714451e-05, |
|
"loss": 0.6386, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 7.401347449470645, |
|
"grad_norm": 0.33146432042121887, |
|
"learning_rate": 3.820092439114339e-05, |
|
"loss": 0.6505, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 7.410972088546679, |
|
"grad_norm": 0.34075871109962463, |
|
"learning_rate": 3.793689045349575e-05, |
|
"loss": 0.6292, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 7.420596727622714, |
|
"grad_norm": 0.3384300172328949, |
|
"learning_rate": 3.7673558380129735e-05, |
|
"loss": 0.649, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 7.430221366698749, |
|
"grad_norm": 0.35409146547317505, |
|
"learning_rate": 3.741093114903631e-05, |
|
"loss": 0.6401, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 7.4398460057747835, |
|
"grad_norm": 0.3388952314853668, |
|
"learning_rate": 3.7149011730235394e-05, |
|
"loss": 0.646, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 7.449470644850818, |
|
"grad_norm": 0.3542778789997101, |
|
"learning_rate": 3.688780308574238e-05, |
|
"loss": 0.6367, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 7.459095283926853, |
|
"grad_norm": 0.33730167150497437, |
|
"learning_rate": 3.66273081695346e-05, |
|
"loss": 0.655, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 7.468719923002888, |
|
"grad_norm": 0.3402201533317566, |
|
"learning_rate": 3.6367529927517855e-05, |
|
"loss": 0.6327, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 7.478344562078922, |
|
"grad_norm": 0.3543342649936676, |
|
"learning_rate": 3.610847129749323e-05, |
|
"loss": 0.6534, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 7.487969201154956, |
|
"grad_norm": 0.3624216914176941, |
|
"learning_rate": 3.585013520912377e-05, |
|
"loss": 0.6393, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 7.497593840230992, |
|
"grad_norm": 0.3448854386806488, |
|
"learning_rate": 3.559252458390142e-05, |
|
"loss": 0.6473, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 7.507218479307026, |
|
"grad_norm": 0.3260321021080017, |
|
"learning_rate": 3.533564233511394e-05, |
|
"loss": 0.635, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 7.51684311838306, |
|
"grad_norm": 0.36959561705589294, |
|
"learning_rate": 3.507949136781189e-05, |
|
"loss": 0.6454, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 7.526467757459095, |
|
"grad_norm": 0.3395916223526001, |
|
"learning_rate": 3.482407457877598e-05, |
|
"loss": 0.6491, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 7.53609239653513, |
|
"grad_norm": 0.3479905426502228, |
|
"learning_rate": 3.456939485648406e-05, |
|
"loss": 0.638, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 7.5457170356111645, |
|
"grad_norm": 0.3783397674560547, |
|
"learning_rate": 3.4315455081078696e-05, |
|
"loss": 0.6446, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 7.555341674687199, |
|
"grad_norm": 0.34621936082839966, |
|
"learning_rate": 3.4062258124334434e-05, |
|
"loss": 0.64, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 7.564966313763234, |
|
"grad_norm": 0.34806111454963684, |
|
"learning_rate": 3.3809806849625314e-05, |
|
"loss": 0.641, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 7.5745909528392685, |
|
"grad_norm": 0.33737459778785706, |
|
"learning_rate": 3.355810411189264e-05, |
|
"loss": 0.6389, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 7.584215591915303, |
|
"grad_norm": 0.36518171429634094, |
|
"learning_rate": 3.330715275761257e-05, |
|
"loss": 0.6448, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 7.593840230991338, |
|
"grad_norm": 0.3364472985267639, |
|
"learning_rate": 3.305695562476393e-05, |
|
"loss": 0.6378, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 7.603464870067373, |
|
"grad_norm": 0.345920592546463, |
|
"learning_rate": 3.280751554279622e-05, |
|
"loss": 0.634, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 7.613089509143407, |
|
"grad_norm": 0.33815324306488037, |
|
"learning_rate": 3.255883533259741e-05, |
|
"loss": 0.6452, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 7.622714148219442, |
|
"grad_norm": 0.34798070788383484, |
|
"learning_rate": 3.2310917806462274e-05, |
|
"loss": 0.6433, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 7.632338787295477, |
|
"grad_norm": 0.34050893783569336, |
|
"learning_rate": 3.2063765768060475e-05, |
|
"loss": 0.6505, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 7.641963426371511, |
|
"grad_norm": 0.3409608006477356, |
|
"learning_rate": 3.1817382012404854e-05, |
|
"loss": 0.6515, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 7.651588065447545, |
|
"grad_norm": 0.3448992371559143, |
|
"learning_rate": 3.157176932581983e-05, |
|
"loss": 0.6355, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 7.661212704523581, |
|
"grad_norm": 0.3314208984375, |
|
"learning_rate": 3.132693048590988e-05, |
|
"loss": 0.647, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 7.670837343599615, |
|
"grad_norm": 0.34806132316589355, |
|
"learning_rate": 3.108286826152818e-05, |
|
"loss": 0.6377, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 7.6804619826756495, |
|
"grad_norm": 0.3525891900062561, |
|
"learning_rate": 3.083958541274518e-05, |
|
"loss": 0.6326, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 7.690086621751684, |
|
"grad_norm": 0.36846107244491577, |
|
"learning_rate": 3.059708469081754e-05, |
|
"loss": 0.6327, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 7.699711260827719, |
|
"grad_norm": 0.33311864733695984, |
|
"learning_rate": 3.035536883815696e-05, |
|
"loss": 0.6379, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 7.709335899903754, |
|
"grad_norm": 0.3615313172340393, |
|
"learning_rate": 3.0114440588299033e-05, |
|
"loss": 0.6522, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 7.718960538979788, |
|
"grad_norm": 0.33901557326316833, |
|
"learning_rate": 2.9874302665872544e-05, |
|
"loss": 0.6495, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 7.728585178055823, |
|
"grad_norm": 0.3336678743362427, |
|
"learning_rate": 2.963495778656853e-05, |
|
"loss": 0.6583, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 7.738209817131858, |
|
"grad_norm": 0.38028064370155334, |
|
"learning_rate": 2.9396408657109608e-05, |
|
"loss": 0.6365, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 7.747834456207892, |
|
"grad_norm": 0.3507869243621826, |
|
"learning_rate": 2.9158657975219385e-05, |
|
"loss": 0.6466, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 7.757459095283927, |
|
"grad_norm": 0.3580639660358429, |
|
"learning_rate": 2.8921708429591797e-05, |
|
"loss": 0.6472, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 7.767083734359962, |
|
"grad_norm": 0.3309887945652008, |
|
"learning_rate": 2.8685562699860957e-05, |
|
"loss": 0.6476, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 7.776708373435996, |
|
"grad_norm": 0.3457421064376831, |
|
"learning_rate": 2.8450223456570668e-05, |
|
"loss": 0.6414, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 7.7863330125120305, |
|
"grad_norm": 0.33013686537742615, |
|
"learning_rate": 2.8215693361144324e-05, |
|
"loss": 0.6535, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 7.795957651588066, |
|
"grad_norm": 0.32177311182022095, |
|
"learning_rate": 2.798197506585464e-05, |
|
"loss": 0.6487, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 7.8055822906641, |
|
"grad_norm": 0.3439447283744812, |
|
"learning_rate": 2.774907121379393e-05, |
|
"loss": 0.6354, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 7.8152069297401345, |
|
"grad_norm": 0.34718647599220276, |
|
"learning_rate": 2.751698443884394e-05, |
|
"loss": 0.6504, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 7.824831568816169, |
|
"grad_norm": 0.34381964802742004, |
|
"learning_rate": 2.7285717365646256e-05, |
|
"loss": 0.6453, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 7.834456207892204, |
|
"grad_norm": 0.34925544261932373, |
|
"learning_rate": 2.7055272609572568e-05, |
|
"loss": 0.6484, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 7.844080846968239, |
|
"grad_norm": 0.34031766653060913, |
|
"learning_rate": 2.6825652776695076e-05, |
|
"loss": 0.6462, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 7.853705486044273, |
|
"grad_norm": 0.3397299349308014, |
|
"learning_rate": 2.6596860463756935e-05, |
|
"loss": 0.6444, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 7.863330125120308, |
|
"grad_norm": 0.348021537065506, |
|
"learning_rate": 2.636889825814307e-05, |
|
"loss": 0.6389, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 7.872954764196343, |
|
"grad_norm": 0.3368039131164551, |
|
"learning_rate": 2.6141768737850814e-05, |
|
"loss": 0.6453, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 7.882579403272377, |
|
"grad_norm": 0.34815698862075806, |
|
"learning_rate": 2.5915474471460732e-05, |
|
"loss": 0.6474, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 7.892204042348412, |
|
"grad_norm": 0.3499961793422699, |
|
"learning_rate": 2.5690018018107642e-05, |
|
"loss": 0.6436, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 7.901828681424447, |
|
"grad_norm": 0.3426460921764374, |
|
"learning_rate": 2.5465401927451537e-05, |
|
"loss": 0.6437, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 7.911453320500481, |
|
"grad_norm": 0.3375738561153412, |
|
"learning_rate": 2.524162873964896e-05, |
|
"loss": 0.6394, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 7.9210779595765155, |
|
"grad_norm": 0.34224507212638855, |
|
"learning_rate": 2.501870098532412e-05, |
|
"loss": 0.6524, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 7.930702598652551, |
|
"grad_norm": 0.3286498785018921, |
|
"learning_rate": 2.4796621185540348e-05, |
|
"loss": 0.6507, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 7.940327237728585, |
|
"grad_norm": 0.36504673957824707, |
|
"learning_rate": 2.4575391851771477e-05, |
|
"loss": 0.6389, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 7.94995187680462, |
|
"grad_norm": 0.3325868546962738, |
|
"learning_rate": 2.4355015485873644e-05, |
|
"loss": 0.6402, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 7.959576515880655, |
|
"grad_norm": 0.35220691561698914, |
|
"learning_rate": 2.4135494580056737e-05, |
|
"loss": 0.6553, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 7.969201154956689, |
|
"grad_norm": 0.3708426058292389, |
|
"learning_rate": 2.3916831616856473e-05, |
|
"loss": 0.6518, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 7.978825794032724, |
|
"grad_norm": 0.34426939487457275, |
|
"learning_rate": 2.3699029069106115e-05, |
|
"loss": 0.6505, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 7.988450433108758, |
|
"grad_norm": 0.3554341793060303, |
|
"learning_rate": 2.348208939990866e-05, |
|
"loss": 0.6497, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 7.998075072184793, |
|
"grad_norm": 0.3434050381183624, |
|
"learning_rate": 2.3266015062608838e-05, |
|
"loss": 0.6466, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 3.694774627685547, |
|
"eval_runtime": 0.7787, |
|
"eval_samples_per_second": 14.127, |
|
"eval_steps_per_second": 2.569, |
|
"step": 4156 |
|
}, |
|
{ |
|
"epoch": 8.007699711260829, |
|
"grad_norm": 0.28248271346092224, |
|
"learning_rate": 2.3050808500765487e-05, |
|
"loss": 0.6121, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 8.017324350336862, |
|
"grad_norm": 0.36666032671928406, |
|
"learning_rate": 2.2836472148123878e-05, |
|
"loss": 0.6176, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 8.026948989412897, |
|
"grad_norm": 0.32897964119911194, |
|
"learning_rate": 2.2623008428588177e-05, |
|
"loss": 0.6079, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 8.03657362848893, |
|
"grad_norm": 0.32618117332458496, |
|
"learning_rate": 2.24104197561941e-05, |
|
"loss": 0.6043, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 8.046198267564966, |
|
"grad_norm": 0.3435162305831909, |
|
"learning_rate": 2.2198708535081446e-05, |
|
"loss": 0.6082, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 8.055822906641001, |
|
"grad_norm": 0.3350038528442383, |
|
"learning_rate": 2.198787715946712e-05, |
|
"loss": 0.6098, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 8.065447545717035, |
|
"grad_norm": 0.3771952986717224, |
|
"learning_rate": 2.1777928013617908e-05, |
|
"loss": 0.6137, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 8.07507218479307, |
|
"grad_norm": 0.3174493610858917, |
|
"learning_rate": 2.1568863471823642e-05, |
|
"loss": 0.6169, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 8.084696823869105, |
|
"grad_norm": 0.33214735984802246, |
|
"learning_rate": 2.1360685898370146e-05, |
|
"loss": 0.6066, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 8.094321462945139, |
|
"grad_norm": 0.3336653411388397, |
|
"learning_rate": 2.1153397647512763e-05, |
|
"loss": 0.6073, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 8.103946102021174, |
|
"grad_norm": 0.32206472754478455, |
|
"learning_rate": 2.0947001063449457e-05, |
|
"loss": 0.6, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 8.11357074109721, |
|
"grad_norm": 0.3184707760810852, |
|
"learning_rate": 2.074149848029453e-05, |
|
"loss": 0.6065, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 8.123195380173243, |
|
"grad_norm": 0.3209008276462555, |
|
"learning_rate": 2.0536892222052128e-05, |
|
"loss": 0.608, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 8.132820019249278, |
|
"grad_norm": 0.34929510951042175, |
|
"learning_rate": 2.0333184602589962e-05, |
|
"loss": 0.6125, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 8.142444658325314, |
|
"grad_norm": 0.34042608737945557, |
|
"learning_rate": 2.01303779256131e-05, |
|
"loss": 0.6094, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 8.152069297401347, |
|
"grad_norm": 0.33042535185813904, |
|
"learning_rate": 1.992847448463798e-05, |
|
"loss": 0.6122, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 8.161693936477382, |
|
"grad_norm": 0.3154657781124115, |
|
"learning_rate": 1.9727476562966508e-05, |
|
"loss": 0.6141, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 8.171318575553416, |
|
"grad_norm": 0.33518335223197937, |
|
"learning_rate": 1.952738643366011e-05, |
|
"loss": 0.6139, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 8.180943214629451, |
|
"grad_norm": 0.3391817510128021, |
|
"learning_rate": 1.9328206359514155e-05, |
|
"loss": 0.6106, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 8.190567853705486, |
|
"grad_norm": 0.33157217502593994, |
|
"learning_rate": 1.9129938593032227e-05, |
|
"loss": 0.6051, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 8.20019249278152, |
|
"grad_norm": 0.3601199686527252, |
|
"learning_rate": 1.8932585376400803e-05, |
|
"loss": 0.6127, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 8.209817131857555, |
|
"grad_norm": 0.3452966511249542, |
|
"learning_rate": 1.8736148941463795e-05, |
|
"loss": 0.6162, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 8.21944177093359, |
|
"grad_norm": 0.3637758791446686, |
|
"learning_rate": 1.854063150969737e-05, |
|
"loss": 0.6232, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 8.229066410009624, |
|
"grad_norm": 0.3771421015262604, |
|
"learning_rate": 1.834603529218475e-05, |
|
"loss": 0.6066, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 8.23869104908566, |
|
"grad_norm": 0.338925302028656, |
|
"learning_rate": 1.81523624895913e-05, |
|
"loss": 0.6155, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 8.248315688161695, |
|
"grad_norm": 0.3534870147705078, |
|
"learning_rate": 1.7959615292139544e-05, |
|
"loss": 0.614, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 8.257940327237728, |
|
"grad_norm": 0.33125004172325134, |
|
"learning_rate": 1.7767795879584504e-05, |
|
"loss": 0.6175, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 8.267564966313763, |
|
"grad_norm": 0.3411141037940979, |
|
"learning_rate": 1.7576906421188967e-05, |
|
"loss": 0.6114, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 8.277189605389799, |
|
"grad_norm": 0.3340323865413666, |
|
"learning_rate": 1.738694907569901e-05, |
|
"loss": 0.6233, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 8.286814244465832, |
|
"grad_norm": 0.3233914375305176, |
|
"learning_rate": 1.7197925991319486e-05, |
|
"loss": 0.6082, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 8.296438883541867, |
|
"grad_norm": 0.3364531099796295, |
|
"learning_rate": 1.7009839305689855e-05, |
|
"loss": 0.6049, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 8.306063522617901, |
|
"grad_norm": 0.34157273173332214, |
|
"learning_rate": 1.682269114585996e-05, |
|
"loss": 0.6141, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 8.315688161693936, |
|
"grad_norm": 0.33447617292404175, |
|
"learning_rate": 1.6636483628265942e-05, |
|
"loss": 0.6093, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 8.325312800769971, |
|
"grad_norm": 0.33221328258514404, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 0.6073, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 8.334937439846005, |
|
"grad_norm": 0.32823801040649414, |
|
"learning_rate": 1.626689893231832e-05, |
|
"loss": 0.6069, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 8.34456207892204, |
|
"grad_norm": 0.3583478629589081, |
|
"learning_rate": 1.60835259335538e-05, |
|
"loss": 0.6171, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 8.354186717998076, |
|
"grad_norm": 0.33178088068962097, |
|
"learning_rate": 1.5901101936156136e-05, |
|
"loss": 0.6066, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 8.363811357074109, |
|
"grad_norm": 0.3466804623603821, |
|
"learning_rate": 1.5719629003136506e-05, |
|
"loss": 0.6023, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 8.373435996150144, |
|
"grad_norm": 0.357316792011261, |
|
"learning_rate": 1.5539109186750544e-05, |
|
"loss": 0.6059, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 8.38306063522618, |
|
"grad_norm": 0.3246915340423584, |
|
"learning_rate": 1.5359544528475323e-05, |
|
"loss": 0.6231, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 8.392685274302213, |
|
"grad_norm": 0.3579736649990082, |
|
"learning_rate": 1.5180937058986033e-05, |
|
"loss": 0.617, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 8.402309913378248, |
|
"grad_norm": 0.33767664432525635, |
|
"learning_rate": 1.5003288798133198e-05, |
|
"loss": 0.6135, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 8.411934552454284, |
|
"grad_norm": 0.34384191036224365, |
|
"learning_rate": 1.4826601754919755e-05, |
|
"loss": 0.6045, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 8.421559191530317, |
|
"grad_norm": 0.34475091099739075, |
|
"learning_rate": 1.4650877927478357e-05, |
|
"loss": 0.611, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 8.431183830606352, |
|
"grad_norm": 0.3544045686721802, |
|
"learning_rate": 1.4476119303048707e-05, |
|
"loss": 0.6048, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 8.440808469682388, |
|
"grad_norm": 0.3278457820415497, |
|
"learning_rate": 1.43023278579552e-05, |
|
"loss": 0.6216, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 8.450433108758421, |
|
"grad_norm": 0.33195823431015015, |
|
"learning_rate": 1.4129505557584511e-05, |
|
"loss": 0.6106, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 8.460057747834457, |
|
"grad_norm": 0.32435399293899536, |
|
"learning_rate": 1.3957654356363349e-05, |
|
"loss": 0.6142, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 8.46968238691049, |
|
"grad_norm": 0.34540995955467224, |
|
"learning_rate": 1.3786776197736417e-05, |
|
"loss": 0.6112, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 8.479307025986525, |
|
"grad_norm": 0.3274092972278595, |
|
"learning_rate": 1.3616873014144327e-05, |
|
"loss": 0.6151, |
|
"step": 4405 |
|
}, |
|
{ |
|
"epoch": 8.48893166506256, |
|
"grad_norm": 0.3616076409816742, |
|
"learning_rate": 1.3447946727001881e-05, |
|
"loss": 0.6167, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 8.498556304138594, |
|
"grad_norm": 0.32997846603393555, |
|
"learning_rate": 1.3279999246676256e-05, |
|
"loss": 0.611, |
|
"step": 4415 |
|
}, |
|
{ |
|
"epoch": 8.50818094321463, |
|
"grad_norm": 0.34430432319641113, |
|
"learning_rate": 1.3113032472465426e-05, |
|
"loss": 0.613, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 8.517805582290665, |
|
"grad_norm": 0.35246655344963074, |
|
"learning_rate": 1.2947048292576636e-05, |
|
"loss": 0.6133, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 8.527430221366698, |
|
"grad_norm": 0.3330981433391571, |
|
"learning_rate": 1.2782048584105166e-05, |
|
"loss": 0.615, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 8.537054860442733, |
|
"grad_norm": 0.33830517530441284, |
|
"learning_rate": 1.2618035213012924e-05, |
|
"loss": 0.6175, |
|
"step": 4435 |
|
}, |
|
{ |
|
"epoch": 8.546679499518769, |
|
"grad_norm": 0.3427278399467468, |
|
"learning_rate": 1.2455010034107527e-05, |
|
"loss": 0.6111, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 8.556304138594802, |
|
"grad_norm": 0.3526034355163574, |
|
"learning_rate": 1.2292974891021236e-05, |
|
"loss": 0.6135, |
|
"step": 4445 |
|
}, |
|
{ |
|
"epoch": 8.565928777670837, |
|
"grad_norm": 0.3584502935409546, |
|
"learning_rate": 1.2131931616190118e-05, |
|
"loss": 0.6143, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 8.575553416746873, |
|
"grad_norm": 0.32676076889038086, |
|
"learning_rate": 1.1971882030833248e-05, |
|
"loss": 0.6092, |
|
"step": 4455 |
|
}, |
|
{ |
|
"epoch": 8.585178055822906, |
|
"grad_norm": 0.3570641279220581, |
|
"learning_rate": 1.181282794493227e-05, |
|
"loss": 0.6101, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 8.594802694898942, |
|
"grad_norm": 0.35699462890625, |
|
"learning_rate": 1.165477115721083e-05, |
|
"loss": 0.6116, |
|
"step": 4465 |
|
}, |
|
{ |
|
"epoch": 8.604427333974975, |
|
"grad_norm": 0.3642681837081909, |
|
"learning_rate": 1.1497713455114212e-05, |
|
"loss": 0.6204, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 8.61405197305101, |
|
"grad_norm": 0.34195858240127563, |
|
"learning_rate": 1.1341656614789208e-05, |
|
"loss": 0.6105, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 8.623676612127046, |
|
"grad_norm": 0.3449951410293579, |
|
"learning_rate": 1.1186602401063917e-05, |
|
"loss": 0.6061, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 8.63330125120308, |
|
"grad_norm": 0.3435938358306885, |
|
"learning_rate": 1.1032552567427912e-05, |
|
"loss": 0.6097, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 8.642925890279114, |
|
"grad_norm": 0.3187827169895172, |
|
"learning_rate": 1.0879508856012366e-05, |
|
"loss": 0.6022, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 8.65255052935515, |
|
"grad_norm": 0.3434700667858124, |
|
"learning_rate": 1.0727472997570243e-05, |
|
"loss": 0.6116, |
|
"step": 4495 |
|
}, |
|
{ |
|
"epoch": 8.662175168431183, |
|
"grad_norm": 0.34856435656547546, |
|
"learning_rate": 1.0576446711456933e-05, |
|
"loss": 0.605, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 8.671799807507218, |
|
"grad_norm": 0.3594229817390442, |
|
"learning_rate": 1.0426431705610606e-05, |
|
"loss": 0.6133, |
|
"step": 4505 |
|
}, |
|
{ |
|
"epoch": 8.681424446583254, |
|
"grad_norm": 0.3380817174911499, |
|
"learning_rate": 1.0277429676533023e-05, |
|
"loss": 0.6073, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 8.691049085659287, |
|
"grad_norm": 0.3276160955429077, |
|
"learning_rate": 1.012944230927031e-05, |
|
"loss": 0.6021, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 8.700673724735323, |
|
"grad_norm": 0.34987348318099976, |
|
"learning_rate": 9.9824712773939e-06, |
|
"loss": 0.617, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 8.710298363811358, |
|
"grad_norm": 0.3415302336215973, |
|
"learning_rate": 9.83651824298164e-06, |
|
"loss": 0.6111, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 8.719923002887391, |
|
"grad_norm": 0.34866005182266235, |
|
"learning_rate": 9.69158485659889e-06, |
|
"loss": 0.603, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 8.729547641963427, |
|
"grad_norm": 0.36085546016693115, |
|
"learning_rate": 9.547672757280001e-06, |
|
"loss": 0.6042, |
|
"step": 4535 |
|
}, |
|
{ |
|
"epoch": 8.739172281039462, |
|
"grad_norm": 0.36267852783203125, |
|
"learning_rate": 9.40478357250969e-06, |
|
"loss": 0.6127, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 8.748796920115495, |
|
"grad_norm": 0.36462917923927307, |
|
"learning_rate": 9.262918918204643e-06, |
|
"loss": 0.6123, |
|
"step": 4545 |
|
}, |
|
{ |
|
"epoch": 8.75842155919153, |
|
"grad_norm": 0.34768378734588623, |
|
"learning_rate": 9.122080398695299e-06, |
|
"loss": 0.6048, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 8.768046198267564, |
|
"grad_norm": 0.330387681722641, |
|
"learning_rate": 8.982269606707593e-06, |
|
"loss": 0.6165, |
|
"step": 4555 |
|
}, |
|
{ |
|
"epoch": 8.7776708373436, |
|
"grad_norm": 0.3596397936344147, |
|
"learning_rate": 8.843488123345044e-06, |
|
"loss": 0.6072, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 8.787295476419635, |
|
"grad_norm": 0.35082703828811646, |
|
"learning_rate": 8.705737518070888e-06, |
|
"loss": 0.6185, |
|
"step": 4565 |
|
}, |
|
{ |
|
"epoch": 8.796920115495668, |
|
"grad_norm": 0.33255165815353394, |
|
"learning_rate": 8.569019348690189e-06, |
|
"loss": 0.6099, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 8.806544754571703, |
|
"grad_norm": 0.3488062620162964, |
|
"learning_rate": 8.433335161332412e-06, |
|
"loss": 0.6056, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 8.816169393647739, |
|
"grad_norm": 0.35131949186325073, |
|
"learning_rate": 8.298686490433771e-06, |
|
"loss": 0.6102, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 8.825794032723772, |
|
"grad_norm": 0.37358999252319336, |
|
"learning_rate": 8.165074858719989e-06, |
|
"loss": 0.6103, |
|
"step": 4585 |
|
}, |
|
{ |
|
"epoch": 8.835418671799808, |
|
"grad_norm": 0.35089996457099915, |
|
"learning_rate": 8.032501777189017e-06, |
|
"loss": 0.6112, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 8.845043310875843, |
|
"grad_norm": 0.35341012477874756, |
|
"learning_rate": 7.900968745093996e-06, |
|
"loss": 0.6089, |
|
"step": 4595 |
|
}, |
|
{ |
|
"epoch": 8.854667949951876, |
|
"grad_norm": 0.3873613178730011, |
|
"learning_rate": 7.770477249926256e-06, |
|
"loss": 0.6111, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 8.864292589027912, |
|
"grad_norm": 0.34750309586524963, |
|
"learning_rate": 7.641028767398472e-06, |
|
"loss": 0.616, |
|
"step": 4605 |
|
}, |
|
{ |
|
"epoch": 8.873917228103947, |
|
"grad_norm": 0.32477355003356934, |
|
"learning_rate": 7.512624761428066e-06, |
|
"loss": 0.6089, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 8.88354186717998, |
|
"grad_norm": 0.35710757970809937, |
|
"learning_rate": 7.385266684120573e-06, |
|
"loss": 0.61, |
|
"step": 4615 |
|
}, |
|
{ |
|
"epoch": 8.893166506256016, |
|
"grad_norm": 0.34388595819473267, |
|
"learning_rate": 7.258955975753279e-06, |
|
"loss": 0.6076, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 8.90279114533205, |
|
"grad_norm": 0.32944580912590027, |
|
"learning_rate": 7.133694064758867e-06, |
|
"loss": 0.606, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 8.912415784408084, |
|
"grad_norm": 0.3470548093318939, |
|
"learning_rate": 7.0094823677092856e-06, |
|
"loss": 0.6015, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 8.92204042348412, |
|
"grad_norm": 0.3423613905906677, |
|
"learning_rate": 6.886322289299763e-06, |
|
"loss": 0.6155, |
|
"step": 4635 |
|
}, |
|
{ |
|
"epoch": 8.931665062560153, |
|
"grad_norm": 0.35634317994117737, |
|
"learning_rate": 6.764215222332914e-06, |
|
"loss": 0.6146, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 8.941289701636189, |
|
"grad_norm": 0.33485671877861023, |
|
"learning_rate": 6.643162547702931e-06, |
|
"loss": 0.6135, |
|
"step": 4645 |
|
}, |
|
{ |
|
"epoch": 8.950914340712224, |
|
"grad_norm": 0.35238829255104065, |
|
"learning_rate": 6.523165634380046e-06, |
|
"loss": 0.6044, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 8.960538979788257, |
|
"grad_norm": 0.3438652753829956, |
|
"learning_rate": 6.404225839394973e-06, |
|
"loss": 0.6107, |
|
"step": 4655 |
|
}, |
|
{ |
|
"epoch": 8.970163618864293, |
|
"grad_norm": 0.352061003446579, |
|
"learning_rate": 6.286344507823638e-06, |
|
"loss": 0.6164, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 8.979788257940328, |
|
"grad_norm": 0.3431857228279114, |
|
"learning_rate": 6.169522972771924e-06, |
|
"loss": 0.6144, |
|
"step": 4665 |
|
}, |
|
{ |
|
"epoch": 8.989412897016361, |
|
"grad_norm": 0.32378876209259033, |
|
"learning_rate": 6.053762555360587e-06, |
|
"loss": 0.6162, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 8.999037536092397, |
|
"grad_norm": 0.36266306042671204, |
|
"learning_rate": 5.939064564710373e-06, |
|
"loss": 0.6132, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 8.999037536092397, |
|
"eval_loss": 3.9708144664764404, |
|
"eval_runtime": 0.7877, |
|
"eval_samples_per_second": 13.964, |
|
"eval_steps_per_second": 2.539, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 9.008662175168432, |
|
"grad_norm": 0.3068545162677765, |
|
"learning_rate": 5.825430297927092e-06, |
|
"loss": 0.5915, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 9.018286814244465, |
|
"grad_norm": 0.3031752407550812, |
|
"learning_rate": 5.712861040087092e-06, |
|
"loss": 0.586, |
|
"step": 4685 |
|
}, |
|
{ |
|
"epoch": 9.0279114533205, |
|
"grad_norm": 0.33787086606025696, |
|
"learning_rate": 5.601358064222639e-06, |
|
"loss": 0.5911, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 9.037536092396536, |
|
"grad_norm": 0.35586461424827576, |
|
"learning_rate": 5.49092263130756e-06, |
|
"loss": 0.5828, |
|
"step": 4695 |
|
}, |
|
{ |
|
"epoch": 9.04716073147257, |
|
"grad_norm": 0.3516261875629425, |
|
"learning_rate": 5.381555990242959e-06, |
|
"loss": 0.5847, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 9.056785370548605, |
|
"grad_norm": 0.34338730573654175, |
|
"learning_rate": 5.273259377843087e-06, |
|
"loss": 0.6036, |
|
"step": 4705 |
|
}, |
|
{ |
|
"epoch": 9.066410009624638, |
|
"grad_norm": 0.3557838499546051, |
|
"learning_rate": 5.166034018821364e-06, |
|
"loss": 0.5939, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 9.076034648700674, |
|
"grad_norm": 0.31932586431503296, |
|
"learning_rate": 5.059881125776589e-06, |
|
"loss": 0.6016, |
|
"step": 4715 |
|
}, |
|
{ |
|
"epoch": 9.085659287776709, |
|
"grad_norm": 0.3272048532962799, |
|
"learning_rate": 4.9548018991790846e-06, |
|
"loss": 0.5909, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 9.095283926852742, |
|
"grad_norm": 0.3446064889431, |
|
"learning_rate": 4.850797527357287e-06, |
|
"loss": 0.5827, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 9.104908565928778, |
|
"grad_norm": 0.32635557651519775, |
|
"learning_rate": 4.747869186484177e-06, |
|
"loss": 0.5921, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 9.114533205004813, |
|
"grad_norm": 0.31974223256111145, |
|
"learning_rate": 4.64601804056406e-06, |
|
"loss": 0.5932, |
|
"step": 4735 |
|
}, |
|
{ |
|
"epoch": 9.124157844080846, |
|
"grad_norm": 0.3654205799102783, |
|
"learning_rate": 4.545245241419349e-06, |
|
"loss": 0.5995, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 9.133782483156882, |
|
"grad_norm": 0.35849812626838684, |
|
"learning_rate": 4.445551928677594e-06, |
|
"loss": 0.5995, |
|
"step": 4745 |
|
}, |
|
{ |
|
"epoch": 9.143407122232917, |
|
"grad_norm": 0.3359050750732422, |
|
"learning_rate": 4.346939229758529e-06, |
|
"loss": 0.5982, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 9.15303176130895, |
|
"grad_norm": 0.33533555269241333, |
|
"learning_rate": 4.2494082598613875e-06, |
|
"loss": 0.6007, |
|
"step": 4755 |
|
}, |
|
{ |
|
"epoch": 9.162656400384986, |
|
"grad_norm": 0.3292589783668518, |
|
"learning_rate": 4.152960121952209e-06, |
|
"loss": 0.5974, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 9.172281039461021, |
|
"grad_norm": 0.34592679142951965, |
|
"learning_rate": 4.057595906751466e-06, |
|
"loss": 0.5922, |
|
"step": 4765 |
|
}, |
|
{ |
|
"epoch": 9.181905678537055, |
|
"grad_norm": 0.34907424449920654, |
|
"learning_rate": 3.963316692721663e-06, |
|
"loss": 0.6007, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 9.19153031761309, |
|
"grad_norm": 0.3478921949863434, |
|
"learning_rate": 3.870123546055149e-06, |
|
"loss": 0.5882, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 9.201154956689123, |
|
"grad_norm": 0.3408016860485077, |
|
"learning_rate": 3.7780175206620915e-06, |
|
"loss": 0.595, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 9.210779595765159, |
|
"grad_norm": 0.33491307497024536, |
|
"learning_rate": 3.686999658158474e-06, |
|
"loss": 0.5951, |
|
"step": 4785 |
|
}, |
|
{ |
|
"epoch": 9.220404234841194, |
|
"grad_norm": 0.3383229672908783, |
|
"learning_rate": 3.597070987854456e-06, |
|
"loss": 0.5966, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 9.230028873917227, |
|
"grad_norm": 0.3315028250217438, |
|
"learning_rate": 3.508232526742583e-06, |
|
"loss": 0.5959, |
|
"step": 4795 |
|
}, |
|
{ |
|
"epoch": 9.239653512993263, |
|
"grad_norm": 0.30691462755203247, |
|
"learning_rate": 3.420485279486385e-06, |
|
"loss": 0.5853, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 9.249278152069298, |
|
"grad_norm": 0.34303727746009827, |
|
"learning_rate": 3.333830238409019e-06, |
|
"loss": 0.5973, |
|
"step": 4805 |
|
}, |
|
{ |
|
"epoch": 9.258902791145331, |
|
"grad_norm": 0.3458213210105896, |
|
"learning_rate": 3.248268383481934e-06, |
|
"loss": 0.5978, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 9.268527430221367, |
|
"grad_norm": 0.3539816737174988, |
|
"learning_rate": 3.163800682313933e-06, |
|
"loss": 0.5958, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 9.278152069297402, |
|
"grad_norm": 0.3442062735557556, |
|
"learning_rate": 3.080428090140142e-06, |
|
"loss": 0.6022, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 9.287776708373435, |
|
"grad_norm": 0.3180767893791199, |
|
"learning_rate": 2.9981515498112456e-06, |
|
"loss": 0.5955, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 9.29740134744947, |
|
"grad_norm": 0.34698548913002014, |
|
"learning_rate": 2.91697199178278e-06, |
|
"loss": 0.5947, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 9.307025986525506, |
|
"grad_norm": 0.3273780047893524, |
|
"learning_rate": 2.8368903341046583e-06, |
|
"loss": 0.5998, |
|
"step": 4835 |
|
}, |
|
{ |
|
"epoch": 9.31665062560154, |
|
"grad_norm": 0.31761637330055237, |
|
"learning_rate": 2.757907482410771e-06, |
|
"loss": 0.5841, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 9.326275264677575, |
|
"grad_norm": 0.3708135783672333, |
|
"learning_rate": 2.680024329908737e-06, |
|
"loss": 0.5953, |
|
"step": 4845 |
|
}, |
|
{ |
|
"epoch": 9.33589990375361, |
|
"grad_norm": 0.309467613697052, |
|
"learning_rate": 2.603241757369812e-06, |
|
"loss": 0.5969, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 9.345524542829644, |
|
"grad_norm": 0.32634660601615906, |
|
"learning_rate": 2.5275606331189416e-06, |
|
"loss": 0.602, |
|
"step": 4855 |
|
}, |
|
{ |
|
"epoch": 9.355149181905679, |
|
"grad_norm": 0.33582308888435364, |
|
"learning_rate": 2.452981813024868e-06, |
|
"loss": 0.5875, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 9.364773820981712, |
|
"grad_norm": 0.3333386182785034, |
|
"learning_rate": 2.379506140490595e-06, |
|
"loss": 0.5986, |
|
"step": 4865 |
|
}, |
|
{ |
|
"epoch": 9.374398460057748, |
|
"grad_norm": 0.35826408863067627, |
|
"learning_rate": 2.3071344464436595e-06, |
|
"loss": 0.6015, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 9.384023099133783, |
|
"grad_norm": 0.334588885307312, |
|
"learning_rate": 2.235867549326931e-06, |
|
"loss": 0.5942, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 9.393647738209816, |
|
"grad_norm": 0.3338033854961395, |
|
"learning_rate": 2.165706255089217e-06, |
|
"loss": 0.5991, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 9.403272377285852, |
|
"grad_norm": 0.3354242742061615, |
|
"learning_rate": 2.0966513571761827e-06, |
|
"loss": 0.5991, |
|
"step": 4885 |
|
}, |
|
{ |
|
"epoch": 9.412897016361887, |
|
"grad_norm": 0.34545251727104187, |
|
"learning_rate": 2.028703636521434e-06, |
|
"loss": 0.6058, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 9.42252165543792, |
|
"grad_norm": 0.33035480976104736, |
|
"learning_rate": 1.961863861537594e-06, |
|
"loss": 0.5981, |
|
"step": 4895 |
|
}, |
|
{ |
|
"epoch": 9.432146294513956, |
|
"grad_norm": 0.33753854036331177, |
|
"learning_rate": 1.8961327881076963e-06, |
|
"loss": 0.5944, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 9.441770933589991, |
|
"grad_norm": 0.34246233105659485, |
|
"learning_rate": 1.8315111595765932e-06, |
|
"loss": 0.5931, |
|
"step": 4905 |
|
}, |
|
{ |
|
"epoch": 9.451395572666025, |
|
"grad_norm": 0.33052095770835876, |
|
"learning_rate": 1.767999706742529e-06, |
|
"loss": 0.5986, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 9.46102021174206, |
|
"grad_norm": 0.35342252254486084, |
|
"learning_rate": 1.7055991478489464e-06, |
|
"loss": 0.5938, |
|
"step": 4915 |
|
}, |
|
{ |
|
"epoch": 9.470644850818095, |
|
"grad_norm": 0.33293551206588745, |
|
"learning_rate": 1.6443101885762812e-06, |
|
"loss": 0.5917, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 9.480269489894129, |
|
"grad_norm": 0.3331868648529053, |
|
"learning_rate": 1.5841335220340593e-06, |
|
"loss": 0.5951, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 9.489894128970164, |
|
"grad_norm": 0.35304731130599976, |
|
"learning_rate": 1.525069828753012e-06, |
|
"loss": 0.602, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 9.499518768046197, |
|
"grad_norm": 0.3421652019023895, |
|
"learning_rate": 1.4671197766773615e-06, |
|
"loss": 0.5966, |
|
"step": 4935 |
|
}, |
|
{ |
|
"epoch": 9.509143407122233, |
|
"grad_norm": 0.3255125880241394, |
|
"learning_rate": 1.4102840211573264e-06, |
|
"loss": 0.5944, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 9.518768046198268, |
|
"grad_norm": 0.34258726239204407, |
|
"learning_rate": 1.3545632049416502e-06, |
|
"loss": 0.5889, |
|
"step": 4945 |
|
}, |
|
{ |
|
"epoch": 9.528392685274301, |
|
"grad_norm": 0.3264661729335785, |
|
"learning_rate": 1.2999579581703947e-06, |
|
"loss": 0.5954, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 9.538017324350337, |
|
"grad_norm": 0.3256395161151886, |
|
"learning_rate": 1.2464688983677697e-06, |
|
"loss": 0.5907, |
|
"step": 4955 |
|
}, |
|
{ |
|
"epoch": 9.547641963426372, |
|
"grad_norm": 0.32232365012168884, |
|
"learning_rate": 1.1940966304351265e-06, |
|
"loss": 0.5949, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 9.557266602502406, |
|
"grad_norm": 0.32586029171943665, |
|
"learning_rate": 1.1428417466442076e-06, |
|
"loss": 0.5885, |
|
"step": 4965 |
|
}, |
|
{ |
|
"epoch": 9.56689124157844, |
|
"grad_norm": 0.3531622886657715, |
|
"learning_rate": 1.0927048266303419e-06, |
|
"loss": 0.6064, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 9.576515880654476, |
|
"grad_norm": 0.34918224811553955, |
|
"learning_rate": 1.0436864373859712e-06, |
|
"loss": 0.6043, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 9.58614051973051, |
|
"grad_norm": 0.3377608358860016, |
|
"learning_rate": 9.95787133254189e-07, |
|
"loss": 0.5869, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 9.595765158806545, |
|
"grad_norm": 0.32988688349723816, |
|
"learning_rate": 9.490074559225015e-07, |
|
"loss": 0.5957, |
|
"step": 4985 |
|
}, |
|
{ |
|
"epoch": 9.60538979788258, |
|
"grad_norm": 0.3335455656051636, |
|
"learning_rate": 9.033479344166873e-07, |
|
"loss": 0.5901, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 9.615014436958614, |
|
"grad_norm": 0.34015801548957825, |
|
"learning_rate": 8.588090850948027e-07, |
|
"loss": 0.5956, |
|
"step": 4995 |
|
}, |
|
{ |
|
"epoch": 9.624639076034649, |
|
"grad_norm": 0.32440024614334106, |
|
"learning_rate": 8.153914116413752e-07, |
|
"loss": 0.6035, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 9.634263715110684, |
|
"grad_norm": 0.33188602328300476, |
|
"learning_rate": 7.730954050616746e-07, |
|
"loss": 0.6025, |
|
"step": 5005 |
|
}, |
|
{ |
|
"epoch": 9.643888354186718, |
|
"grad_norm": 0.33264580368995667, |
|
"learning_rate": 7.319215436761839e-07, |
|
"loss": 0.5973, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 9.653512993262753, |
|
"grad_norm": 0.342488557100296, |
|
"learning_rate": 6.918702931151711e-07, |
|
"loss": 0.5914, |
|
"step": 5015 |
|
}, |
|
{ |
|
"epoch": 9.663137632338787, |
|
"grad_norm": 0.33260515332221985, |
|
"learning_rate": 6.529421063134478e-07, |
|
"loss": 0.5964, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 9.672762271414822, |
|
"grad_norm": 0.358557790517807, |
|
"learning_rate": 6.151374235051966e-07, |
|
"loss": 0.6021, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 9.682386910490857, |
|
"grad_norm": 0.341327965259552, |
|
"learning_rate": 5.784566722190965e-07, |
|
"loss": 0.5911, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 9.69201154956689, |
|
"grad_norm": 0.31675535440444946, |
|
"learning_rate": 5.429002672733274e-07, |
|
"loss": 0.6015, |
|
"step": 5035 |
|
}, |
|
{ |
|
"epoch": 9.701636188642926, |
|
"grad_norm": 0.31824976205825806, |
|
"learning_rate": 5.084686107710513e-07, |
|
"loss": 0.599, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 9.711260827718961, |
|
"grad_norm": 0.3493671715259552, |
|
"learning_rate": 4.751620920957489e-07, |
|
"loss": 0.596, |
|
"step": 5045 |
|
}, |
|
{ |
|
"epoch": 9.720885466794995, |
|
"grad_norm": 0.34269365668296814, |
|
"learning_rate": 4.429810879068463e-07, |
|
"loss": 0.5969, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 9.73051010587103, |
|
"grad_norm": 0.3367815613746643, |
|
"learning_rate": 4.1192596213548427e-07, |
|
"loss": 0.5885, |
|
"step": 5055 |
|
}, |
|
{ |
|
"epoch": 9.740134744947065, |
|
"grad_norm": 0.34025177359580994, |
|
"learning_rate": 3.81997065980344e-07, |
|
"loss": 0.6051, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 9.749759384023099, |
|
"grad_norm": 0.3241323232650757, |
|
"learning_rate": 3.5319473790373924e-07, |
|
"loss": 0.5914, |
|
"step": 5065 |
|
}, |
|
{ |
|
"epoch": 9.759384023099134, |
|
"grad_norm": 0.3496091961860657, |
|
"learning_rate": 3.2551930362776373e-07, |
|
"loss": 0.5962, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 9.769008662175168, |
|
"grad_norm": 0.38736647367477417, |
|
"learning_rate": 2.989710761305942e-07, |
|
"loss": 0.5941, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 9.778633301251203, |
|
"grad_norm": 0.33493003249168396, |
|
"learning_rate": 2.7355035564294865e-07, |
|
"loss": 0.5971, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 9.788257940327238, |
|
"grad_norm": 0.3347594738006592, |
|
"learning_rate": 2.4925742964471144e-07, |
|
"loss": 0.605, |
|
"step": 5085 |
|
}, |
|
{ |
|
"epoch": 9.797882579403272, |
|
"grad_norm": 0.3406401574611664, |
|
"learning_rate": 2.2609257286169138e-07, |
|
"loss": 0.5876, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 9.807507218479307, |
|
"grad_norm": 0.34672555327415466, |
|
"learning_rate": 2.0405604726246864e-07, |
|
"loss": 0.5979, |
|
"step": 5095 |
|
}, |
|
{ |
|
"epoch": 9.817131857555342, |
|
"grad_norm": 0.3294496238231659, |
|
"learning_rate": 1.8314810205547483e-07, |
|
"loss": 0.584, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 9.826756496631376, |
|
"grad_norm": 0.33348360657691956, |
|
"learning_rate": 1.633689736861732e-07, |
|
"loss": 0.5945, |
|
"step": 5105 |
|
}, |
|
{ |
|
"epoch": 9.836381135707411, |
|
"grad_norm": 0.32033050060272217, |
|
"learning_rate": 1.4471888583436067e-07, |
|
"loss": 0.5952, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 9.846005774783446, |
|
"grad_norm": 0.35708528757095337, |
|
"learning_rate": 1.2719804941163648e-07, |
|
"loss": 0.5936, |
|
"step": 5115 |
|
}, |
|
{ |
|
"epoch": 9.85563041385948, |
|
"grad_norm": 0.34551671147346497, |
|
"learning_rate": 1.108066625590487e-07, |
|
"loss": 0.5902, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 9.865255052935515, |
|
"grad_norm": 0.32259657979011536, |
|
"learning_rate": 9.554491064484028e-08, |
|
"loss": 0.5976, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 9.87487969201155, |
|
"grad_norm": 0.3397790491580963, |
|
"learning_rate": 8.141296626231754e-08, |
|
"loss": 0.6072, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 9.884504331087584, |
|
"grad_norm": 0.3509461283683777, |
|
"learning_rate": 6.841098922797384e-08, |
|
"loss": 0.6013, |
|
"step": 5135 |
|
}, |
|
{ |
|
"epoch": 9.894128970163619, |
|
"grad_norm": 0.3350575268268585, |
|
"learning_rate": 5.653912657959115e-08, |
|
"loss": 0.6012, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 9.903753609239654, |
|
"grad_norm": 0.3318527042865753, |
|
"learning_rate": 4.579751257466347e-08, |
|
"loss": 0.6048, |
|
"step": 5145 |
|
}, |
|
{ |
|
"epoch": 9.913378248315688, |
|
"grad_norm": 0.37916940450668335, |
|
"learning_rate": 3.618626868879815e-08, |
|
"loss": 0.6068, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 9.923002887391723, |
|
"grad_norm": 0.3555992841720581, |
|
"learning_rate": 2.7705503614416928e-08, |
|
"loss": 0.588, |
|
"step": 5155 |
|
}, |
|
{ |
|
"epoch": 9.932627526467758, |
|
"grad_norm": 0.32007142901420593, |
|
"learning_rate": 2.0355313259468046e-08, |
|
"loss": 0.5954, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 9.942252165543792, |
|
"grad_norm": 0.3455217182636261, |
|
"learning_rate": 1.4135780746382665e-08, |
|
"loss": 0.5917, |
|
"step": 5165 |
|
}, |
|
{ |
|
"epoch": 9.951876804619827, |
|
"grad_norm": 0.32252103090286255, |
|
"learning_rate": 9.046976411108965e-09, |
|
"loss": 0.5889, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 9.96150144369586, |
|
"grad_norm": 0.3604857921600342, |
|
"learning_rate": 5.0889578023238794e-09, |
|
"loss": 0.5959, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 9.971126082771896, |
|
"grad_norm": 0.33323296904563904, |
|
"learning_rate": 2.261769680789172e-09, |
|
"loss": 0.5918, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 9.980750721847931, |
|
"grad_norm": 0.33578982949256897, |
|
"learning_rate": 5.654440188296306e-10, |
|
"loss": 0.6011, |
|
"step": 5185 |
|
}, |
|
{ |
|
"epoch": 9.990375360923965, |
|
"grad_norm": 0.34376034140586853, |
|
"learning_rate": 0.0, |
|
"loss": 0.5965, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 9.990375360923965, |
|
"eval_loss": 4.085933685302734, |
|
"eval_runtime": 0.8044, |
|
"eval_samples_per_second": 13.675, |
|
"eval_steps_per_second": 2.486, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 9.990375360923965, |
|
"step": 5190, |
|
"total_flos": 7.743588771836199e+18, |
|
"train_loss": 0.8018066772835792, |
|
"train_runtime": 21791.6644, |
|
"train_samples_per_second": 7.627, |
|
"train_steps_per_second": 0.238 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 5190, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.743588771836199e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|