diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,7 +2,7 @@ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997305930800908, - "eval_steps": 500, + "eval_steps": 200, "global_step": 3247, "is_hyper_param_search": false, "is_local_process_zero": true, @@ -10,4570 +10,4690 @@ "log_history": [ { "epoch": 0.0003078936227533387, - "grad_norm": 42.473334342863225, + "grad_norm": 0.567386619418904, "learning_rate": 6.153846153846154e-07, "loss": 1.3715, "step": 1 }, { "epoch": 0.0015394681137666935, - "grad_norm": 39.584233835268385, + "grad_norm": 0.5288856739094128, "learning_rate": 3.0769230769230774e-06, - "loss": 1.3252, + "loss": 1.3257, "step": 5 }, { "epoch": 0.003078936227533387, - "grad_norm": 38.124176994124966, + "grad_norm": 0.514193298068005, "learning_rate": 6.153846153846155e-06, - "loss": 1.3472, + "loss": 1.3509, "step": 10 }, { "epoch": 0.004618404341300081, - "grad_norm": 33.949332930378546, + "grad_norm": 0.5439189819234317, "learning_rate": 9.230769230769232e-06, - "loss": 1.3191, + "loss": 1.3301, "step": 15 }, { "epoch": 0.006157872455066774, - "grad_norm": 13.263339730204837, + "grad_norm": 0.40780463269778033, "learning_rate": 1.230769230769231e-05, - "loss": 1.3273, + "loss": 1.3432, "step": 20 }, { "epoch": 0.007697340568833468, - "grad_norm": 10.03432034420483, + "grad_norm": 0.21241926749939638, "learning_rate": 1.5384615384615387e-05, - "loss": 1.2756, + "loss": 1.2848, "step": 25 }, { "epoch": 0.009236808682600161, - "grad_norm": 10.404219206269893, + "grad_norm": 0.22229732202041577, "learning_rate": 1.8461538461538465e-05, - "loss": 1.2, + "loss": 1.2157, "step": 30 }, { "epoch": 0.010776276796366856, - "grad_norm": 4.904063566508077, + "grad_norm": 0.22180484192931016, "learning_rate": 2.1538461538461542e-05, - "loss": 1.2194, + "loss": 1.2437, "step": 35 }, { "epoch": 0.012315744910133548, - "grad_norm": 2.625318230724936, + "grad_norm": 0.26148958772269565, "learning_rate": 2.461538461538462e-05, - "loss": 1.1508, + "loss": 1.1774, "step": 40 }, { "epoch": 0.013855213023900243, - "grad_norm": 1.554813766822268, + "grad_norm": 0.1381120764969843, "learning_rate": 2.7692307692307694e-05, - "loss": 1.1786, + "loss": 1.202, "step": 45 }, { "epoch": 0.015394681137666935, - "grad_norm": 1.1809148162862095, + "grad_norm": 0.11244566836575712, "learning_rate": 3.0769230769230774e-05, - "loss": 1.1277, + "loss": 1.147, "step": 50 }, { "epoch": 0.01693414925143363, - "grad_norm": 1.203464354806506, + "grad_norm": 0.11021185664583819, "learning_rate": 3.384615384615385e-05, - "loss": 1.1463, + "loss": 1.1643, "step": 55 }, { "epoch": 0.018473617365200323, - "grad_norm": 1.1383524365220588, + "grad_norm": 0.09699197885691936, "learning_rate": 3.692307692307693e-05, - "loss": 1.1188, + "loss": 1.1375, "step": 60 }, { "epoch": 0.020013085478967015, - "grad_norm": 0.9857720112400227, + "grad_norm": 0.08432503700919332, "learning_rate": 4e-05, - "loss": 1.136, + "loss": 1.157, "step": 65 }, { "epoch": 0.02155255359273371, - "grad_norm": 0.8036894302534616, + "grad_norm": 0.08841813904259435, "learning_rate": 4.3076923076923084e-05, - "loss": 1.0966, + "loss": 1.1193, "step": 70 }, { "epoch": 0.023092021706500404, - "grad_norm": 1.3013628925636782, + "grad_norm": 0.09260059592301743, "learning_rate": 4.615384615384616e-05, - "loss": 1.092, + "loss": 1.1143, "step": 75 }, { "epoch": 0.024631489820267097, - "grad_norm": 1.2268436983757836, + "grad_norm": 0.11247544113589666, "learning_rate": 4.923076923076924e-05, - "loss": 1.1211, + "loss": 1.1422, "step": 80 }, { "epoch": 0.026170957934033793, - "grad_norm": 1.1230597630357015, + "grad_norm": 0.10305516893175011, "learning_rate": 5.230769230769231e-05, - "loss": 1.1099, + "loss": 1.1301, "step": 85 }, { "epoch": 0.027710426047800486, - "grad_norm": 1.1223875634038776, + "grad_norm": 0.10094078438604197, "learning_rate": 5.538461538461539e-05, - "loss": 1.051, + "loss": 1.07, "step": 90 }, { "epoch": 0.02924989416156718, - "grad_norm": 1.3366126407920835, + "grad_norm": 0.09878535861697932, "learning_rate": 5.846153846153847e-05, - "loss": 1.0889, + "loss": 1.1062, "step": 95 }, { "epoch": 0.03078936227533387, - "grad_norm": 0.8876804012745447, + "grad_norm": 0.0880914200143609, "learning_rate": 6.153846153846155e-05, - "loss": 1.0635, + "loss": 1.0792, "step": 100 }, { "epoch": 0.03232883038910057, - "grad_norm": 0.985567117532096, + "grad_norm": 0.09739295977826301, "learning_rate": 6.461538461538462e-05, - "loss": 1.0729, + "loss": 1.0886, "step": 105 }, { "epoch": 0.03386829850286726, - "grad_norm": 1.3857701900950476, + "grad_norm": 0.12352134648547326, "learning_rate": 6.76923076923077e-05, - "loss": 1.0687, + "loss": 1.0846, "step": 110 }, { "epoch": 0.03540776661663395, - "grad_norm": 1.0307077006740535, + "grad_norm": 0.09422623480770391, "learning_rate": 7.076923076923078e-05, - "loss": 1.0756, + "loss": 1.0892, "step": 115 }, { "epoch": 0.036947234730400645, - "grad_norm": 0.915309617477138, + "grad_norm": 0.11191294618136459, "learning_rate": 7.384615384615386e-05, - "loss": 1.0578, + "loss": 1.0709, "step": 120 }, { "epoch": 0.03848670284416734, - "grad_norm": 1.1416205921187619, + "grad_norm": 0.10765812088608237, "learning_rate": 7.692307692307693e-05, - "loss": 1.0722, + "loss": 1.084, "step": 125 }, { "epoch": 0.04002617095793403, - "grad_norm": 0.7855309627413886, + "grad_norm": 0.0922143630736113, "learning_rate": 8e-05, - "loss": 1.1091, + "loss": 1.1205, "step": 130 }, { "epoch": 0.04156563907170073, - "grad_norm": 1.027162314274905, + "grad_norm": 0.12839655709773654, "learning_rate": 8.307692307692309e-05, - "loss": 1.0632, + "loss": 1.0738, "step": 135 }, { "epoch": 0.04310510718546742, - "grad_norm": 1.1386693099418572, + "grad_norm": 0.11121867345439279, "learning_rate": 8.615384615384617e-05, - "loss": 1.0736, + "loss": 1.0821, "step": 140 }, { "epoch": 0.044644575299234115, - "grad_norm": 0.7717462620986457, + "grad_norm": 0.0928933843755614, "learning_rate": 8.923076923076924e-05, - "loss": 1.0523, + "loss": 1.0604, "step": 145 }, { "epoch": 0.04618404341300081, - "grad_norm": 0.7657994075849279, + "grad_norm": 0.09586858247024442, "learning_rate": 9.230769230769232e-05, - "loss": 1.0557, + "loss": 1.0644, "step": 150 }, { "epoch": 0.0477235115267675, - "grad_norm": 1.1680871949004308, + "grad_norm": 0.10507553798424621, "learning_rate": 9.53846153846154e-05, - "loss": 1.0548, + "loss": 1.0617, "step": 155 }, { "epoch": 0.049262979640534194, - "grad_norm": 0.8996216400147768, + "grad_norm": 0.11036562639685599, "learning_rate": 9.846153846153848e-05, - "loss": 1.0665, + "loss": 1.0732, "step": 160 }, { "epoch": 0.050802447754300886, - "grad_norm": 0.700328592160364, + "grad_norm": 0.11309124416557308, "learning_rate": 0.00010153846153846153, - "loss": 1.0496, + "loss": 1.0557, "step": 165 }, { "epoch": 0.052341915868067586, - "grad_norm": 1.0558734188354055, + "grad_norm": 0.15677878221994676, "learning_rate": 0.00010461538461538463, - "loss": 1.065, + "loss": 1.0706, "step": 170 }, { "epoch": 0.05388138398183428, - "grad_norm": 0.8578829636967044, + "grad_norm": 0.11661519431178777, "learning_rate": 0.0001076923076923077, - "loss": 1.0923, + "loss": 1.098, "step": 175 }, { "epoch": 0.05542085209560097, - "grad_norm": 0.8831349656869951, + "grad_norm": 0.12109397309650635, "learning_rate": 0.00011076923076923077, - "loss": 1.0259, + "loss": 1.0311, "step": 180 }, { "epoch": 0.056960320209367664, - "grad_norm": 0.9573775567280769, + "grad_norm": 0.11093871974267784, "learning_rate": 0.00011384615384615384, - "loss": 1.0661, + "loss": 1.0707, "step": 185 }, { "epoch": 0.05849978832313436, - "grad_norm": 0.9495008375470986, + "grad_norm": 0.11357908911113367, "learning_rate": 0.00011692307692307694, - "loss": 1.0811, + "loss": 1.0845, "step": 190 }, { "epoch": 0.06003925643690105, - "grad_norm": 1.1061782824597675, + "grad_norm": 0.12394460027791813, "learning_rate": 0.00012, - "loss": 1.0466, + "loss": 1.0495, "step": 195 }, { "epoch": 0.06157872455066774, - "grad_norm": 1.1202640249643756, + "grad_norm": 0.11194042323004598, "learning_rate": 0.0001230769230769231, - "loss": 1.0589, + "loss": 1.0614, + "step": 200 + }, + { + "epoch": 0.06157872455066774, + "eval_loss": 1.0632128715515137, + "eval_runtime": 3818.978, + "eval_samples_per_second": 6.051, + "eval_steps_per_second": 0.378, "step": 200 }, { "epoch": 0.06311819266443444, - "grad_norm": 1.0125076015545247, + "grad_norm": 0.1033181877601895, "learning_rate": 0.00012615384615384615, - "loss": 1.0407, + "loss": 1.0433, "step": 205 }, { "epoch": 0.06465766077820113, - "grad_norm": 1.0060078228523262, + "grad_norm": 0.12063646484450094, "learning_rate": 0.00012923076923076923, - "loss": 1.0688, + "loss": 1.0714, "step": 210 }, { "epoch": 0.06619712889196783, - "grad_norm": 0.7760894033200619, + "grad_norm": 0.11588361936604014, "learning_rate": 0.0001323076923076923, - "loss": 1.0549, + "loss": 1.057, "step": 215 }, { "epoch": 0.06773659700573452, - "grad_norm": 0.8990085149589899, + "grad_norm": 0.11616868785038624, "learning_rate": 0.0001353846153846154, - "loss": 1.0537, + "loss": 1.0553, "step": 220 }, { "epoch": 0.06927606511950121, - "grad_norm": 0.8549275834093875, + "grad_norm": 0.09562740234655313, "learning_rate": 0.00013846153846153847, - "loss": 1.0251, + "loss": 1.0257, "step": 225 }, { "epoch": 0.0708155332332679, - "grad_norm": 0.8075081750437678, + "grad_norm": 0.09393900612118493, "learning_rate": 0.00014153846153846156, - "loss": 1.0443, + "loss": 1.0445, "step": 230 }, { "epoch": 0.0723550013470346, - "grad_norm": 0.9259388698536392, + "grad_norm": 0.10841095433316549, "learning_rate": 0.0001446153846153846, - "loss": 1.0134, + "loss": 1.0103, "step": 235 }, { "epoch": 0.07389446946080129, - "grad_norm": 1.7409427946151614, + "grad_norm": 0.10223643461806235, "learning_rate": 0.00014769230769230772, - "loss": 1.073, + "loss": 1.0717, "step": 240 }, { "epoch": 0.07543393757456798, - "grad_norm": 0.9864593980213632, + "grad_norm": 0.11204179743813408, "learning_rate": 0.00015076923076923077, - "loss": 1.0735, + "loss": 1.0724, "step": 245 }, { "epoch": 0.07697340568833468, - "grad_norm": 0.7479355733124677, + "grad_norm": 0.09200458344532117, "learning_rate": 0.00015384615384615385, - "loss": 1.0642, + "loss": 1.0636, "step": 250 }, { "epoch": 0.07851287380210137, - "grad_norm": 0.9485409237720248, + "grad_norm": 0.11400449644867688, "learning_rate": 0.00015692307692307693, - "loss": 1.0, + "loss": 0.9971, "step": 255 }, { "epoch": 0.08005234191586806, - "grad_norm": 1.0842298527942793, + "grad_norm": 0.11612263416102313, "learning_rate": 0.00016, - "loss": 1.0726, + "loss": 1.0694, "step": 260 }, { "epoch": 0.08159181002963477, - "grad_norm": 1.4490484618864288, + "grad_norm": 0.09082765987050115, "learning_rate": 0.0001630769230769231, - "loss": 1.0506, + "loss": 1.0467, "step": 265 }, { "epoch": 0.08313127814340146, - "grad_norm": 0.7322203679384404, + "grad_norm": 0.091051842252486, "learning_rate": 0.00016615384615384617, - "loss": 1.0934, + "loss": 1.0909, "step": 270 }, { "epoch": 0.08467074625716815, - "grad_norm": 1.0463212758166829, + "grad_norm": 0.09996524510558816, "learning_rate": 0.00016923076923076923, - "loss": 1.0414, + "loss": 1.038, "step": 275 }, { "epoch": 0.08621021437093485, - "grad_norm": 0.7534147115742311, + "grad_norm": 0.08559129966794987, "learning_rate": 0.00017230769230769234, - "loss": 1.0582, + "loss": 1.055, "step": 280 }, { "epoch": 0.08774968248470154, - "grad_norm": 0.8938192794309777, + "grad_norm": 0.09417658234399054, "learning_rate": 0.0001753846153846154, - "loss": 1.0676, + "loss": 1.0628, "step": 285 }, { "epoch": 0.08928915059846823, - "grad_norm": 0.7350477942859435, + "grad_norm": 0.09367818742833312, "learning_rate": 0.00017846153846153847, - "loss": 1.0727, + "loss": 1.0698, "step": 290 }, { "epoch": 0.09082861871223492, - "grad_norm": 0.7547431024386564, + "grad_norm": 0.091480252033389, "learning_rate": 0.00018153846153846155, - "loss": 1.069, + "loss": 1.065, "step": 295 }, { "epoch": 0.09236808682600162, - "grad_norm": 1.0017027549794293, + "grad_norm": 0.0938796736597296, "learning_rate": 0.00018461538461538463, - "loss": 1.0586, + "loss": 1.0532, "step": 300 }, { "epoch": 0.09390755493976831, - "grad_norm": 0.7906132895743292, + "grad_norm": 0.09576286670666531, "learning_rate": 0.0001876923076923077, - "loss": 1.0546, + "loss": 1.0496, "step": 305 }, { "epoch": 0.095447023053535, - "grad_norm": 1.401442713529287, + "grad_norm": 0.09248254454681183, "learning_rate": 0.0001907692307692308, - "loss": 1.0698, + "loss": 1.0636, "step": 310 }, { "epoch": 0.0969864911673017, - "grad_norm": 0.5922407946201297, + "grad_norm": 0.08693274686688061, "learning_rate": 0.00019384615384615385, - "loss": 1.0901, + "loss": 1.0838, "step": 315 }, { "epoch": 0.09852595928106839, - "grad_norm": 0.6300597452052293, + "grad_norm": 0.07975001335214404, "learning_rate": 0.00019692307692307696, - "loss": 1.0429, + "loss": 1.037, "step": 320 }, { "epoch": 0.10006542739483508, - "grad_norm": 0.8806436669088872, + "grad_norm": 0.09900399369206245, "learning_rate": 0.0002, - "loss": 1.0552, + "loss": 1.0476, "step": 325 }, { "epoch": 0.10160489550860177, - "grad_norm": 1.0983869005820532, + "grad_norm": 0.0883635278339313, "learning_rate": 0.00019999855506507185, - "loss": 1.0851, + "loss": 1.0765, "step": 330 }, { "epoch": 0.10314436362236847, - "grad_norm": 0.9815804026010979, + "grad_norm": 0.08877039836306448, "learning_rate": 0.00019999422030204418, - "loss": 1.0673, + "loss": 1.059, "step": 335 }, { "epoch": 0.10468383173613517, - "grad_norm": 0.7301517719864844, + "grad_norm": 0.07996176059015918, "learning_rate": 0.00019998699583618593, - "loss": 1.0516, + "loss": 1.0418, "step": 340 }, { "epoch": 0.10622329984990186, - "grad_norm": 0.7786167269205654, + "grad_norm": 0.0870003860102271, "learning_rate": 0.00019997688187627482, - "loss": 1.064, + "loss": 1.0557, "step": 345 }, { "epoch": 0.10776276796366856, - "grad_norm": 0.7511697290569181, + "grad_norm": 0.09383600372135154, "learning_rate": 0.0001999638787145911, - "loss": 1.0836, + "loss": 1.0735, "step": 350 }, { "epoch": 0.10930223607743525, - "grad_norm": 0.8111968485885569, + "grad_norm": 0.09891188283344632, "learning_rate": 0.0001999479867269092, - "loss": 1.0681, + "loss": 1.0584, "step": 355 }, { "epoch": 0.11084170419120194, - "grad_norm": 1.1816993244319927, + "grad_norm": 0.12102740620306172, "learning_rate": 0.00019992920637248697, - "loss": 1.0571, + "loss": 1.0489, "step": 360 }, { "epoch": 0.11238117230496864, - "grad_norm": 0.7021483137095024, + "grad_norm": 0.08137547822372986, "learning_rate": 0.00019990753819405213, - "loss": 1.0366, + "loss": 1.0277, "step": 365 }, { "epoch": 0.11392064041873533, - "grad_norm": 0.786152904921158, + "grad_norm": 0.08421354630258387, "learning_rate": 0.00019988298281778684, - "loss": 1.0745, + "loss": 1.062, "step": 370 }, { "epoch": 0.11546010853250202, - "grad_norm": 0.8625505891808654, + "grad_norm": 0.08071712029036497, "learning_rate": 0.00019985554095330955, - "loss": 1.0309, + "loss": 1.0215, "step": 375 }, { "epoch": 0.11699957664626871, - "grad_norm": 0.7101475350198745, + "grad_norm": 0.0928078752872806, "learning_rate": 0.0001998252133936544, - "loss": 1.0623, + "loss": 1.053, "step": 380 }, { "epoch": 0.1185390447600354, - "grad_norm": 0.7943463609732394, + "grad_norm": 0.08562676553948985, "learning_rate": 0.00019979200101524845, - "loss": 1.0798, + "loss": 1.0701, "step": 385 }, { "epoch": 0.1200785128738021, - "grad_norm": 1.1298857584764006, + "grad_norm": 0.08403649770517699, "learning_rate": 0.00019975590477788613, - "loss": 1.0619, + "loss": 1.0514, "step": 390 }, { "epoch": 0.12161798098756879, - "grad_norm": 0.6519227994726357, + "grad_norm": 0.08183724425629081, "learning_rate": 0.0001997169257247018, - "loss": 1.0198, + "loss": 1.0095, "step": 395 }, { "epoch": 0.12315744910133548, - "grad_norm": 0.7712100700155368, + "grad_norm": 0.08039658119705519, "learning_rate": 0.00019967506498213931, - "loss": 1.08, + "loss": 1.0689, + "step": 400 + }, + { + "epoch": 0.12315744910133548, + "eval_loss": 1.0475565195083618, + "eval_runtime": 3802.3739, + "eval_samples_per_second": 6.078, + "eval_steps_per_second": 0.38, "step": 400 }, { "epoch": 0.12469691721510218, - "grad_norm": 0.785301798573464, + "grad_norm": 0.08379276105126061, "learning_rate": 0.00019963032375991966, - "loss": 1.0889, + "loss": 1.0782, "step": 405 }, { "epoch": 0.12623638532886888, - "grad_norm": 0.7950130962836747, + "grad_norm": 0.07908083743105887, "learning_rate": 0.00019958270335100595, - "loss": 1.0349, + "loss": 1.0235, "step": 410 }, { "epoch": 0.12777585344263556, - "grad_norm": 0.6411989099588487, + "grad_norm": 0.08493671539476158, "learning_rate": 0.00019953220513156602, - "loss": 1.1014, + "loss": 1.0907, "step": 415 }, { "epoch": 0.12931532155640227, - "grad_norm": 0.6881556293517902, + "grad_norm": 0.08337456151040325, "learning_rate": 0.0001994788305609327, - "loss": 1.0433, + "loss": 1.0326, "step": 420 }, { "epoch": 0.13085478967016895, - "grad_norm": 0.8822396645970323, + "grad_norm": 0.08547228164895211, "learning_rate": 0.00019942258118156163, - "loss": 1.0555, + "loss": 1.0442, "step": 425 }, { "epoch": 0.13239425778393565, - "grad_norm": 0.8566501292249419, + "grad_norm": 0.0750158452168834, "learning_rate": 0.00019936345861898663, - "loss": 1.0791, + "loss": 1.0684, "step": 430 }, { "epoch": 0.13393372589770233, - "grad_norm": 0.6000374004935582, + "grad_norm": 0.07579154319260911, "learning_rate": 0.0001993014645817728, - "loss": 1.0644, + "loss": 1.0547, "step": 435 }, { "epoch": 0.13547319401146904, - "grad_norm": 0.7094811645126697, + "grad_norm": 0.07323307416728322, "learning_rate": 0.00019923660086146723, - "loss": 1.0588, + "loss": 1.0491, "step": 440 }, { "epoch": 0.13701266212523572, - "grad_norm": 0.7299783423452573, + "grad_norm": 0.08937561723880813, "learning_rate": 0.0001991688693325469, - "loss": 1.0606, + "loss": 1.0484, "step": 445 }, { "epoch": 0.13855213023900242, - "grad_norm": 0.7793849224097605, + "grad_norm": 0.08089791643667314, "learning_rate": 0.00019909827195236493, - "loss": 1.0535, + "loss": 1.0422, "step": 450 }, { "epoch": 0.14009159835276913, - "grad_norm": 0.8123995612581718, + "grad_norm": 0.07004371269562998, "learning_rate": 0.00019902481076109372, - "loss": 1.0903, + "loss": 1.0779, "step": 455 }, { "epoch": 0.1416310664665358, - "grad_norm": 0.8269490106548164, + "grad_norm": 0.07219709532645062, "learning_rate": 0.00019894848788166604, - "loss": 1.0682, + "loss": 1.0578, "step": 460 }, { "epoch": 0.14317053458030252, - "grad_norm": 0.6945769324601003, + "grad_norm": 0.08319824295039546, "learning_rate": 0.00019886930551971387, - "loss": 1.0918, + "loss": 1.0776, "step": 465 }, { "epoch": 0.1447100026940692, - "grad_norm": 1.0743052082703004, + "grad_norm": 0.08461977605731112, "learning_rate": 0.0001987872659635043, - "loss": 1.0512, + "loss": 1.0398, "step": 470 }, { "epoch": 0.1462494708078359, - "grad_norm": 0.7274051186012023, + "grad_norm": 0.07913296391073374, "learning_rate": 0.00019870237158387384, - "loss": 1.0609, + "loss": 1.0496, "step": 475 }, { "epoch": 0.14778893892160258, - "grad_norm": 0.8436743181685329, + "grad_norm": 0.07884194841699313, "learning_rate": 0.00019861462483415952, - "loss": 1.0534, + "loss": 1.0401, "step": 480 }, { "epoch": 0.1493284070353693, - "grad_norm": 0.5539587236696554, + "grad_norm": 0.08715608209779739, "learning_rate": 0.0001985240282501282, - "loss": 1.0562, + "loss": 1.0458, "step": 485 }, { "epoch": 0.15086787514913597, - "grad_norm": 0.6673041088797687, + "grad_norm": 0.07255994722623947, "learning_rate": 0.0001984305844499033, - "loss": 1.0343, + "loss": 1.023, "step": 490 }, { "epoch": 0.15240734326290267, - "grad_norm": 0.6860635549461462, + "grad_norm": 0.0769318406138446, "learning_rate": 0.00019833429613388902, - "loss": 1.0602, + "loss": 1.0476, "step": 495 }, { "epoch": 0.15394681137666935, - "grad_norm": 1.0779978346564654, + "grad_norm": 0.07401383185768683, "learning_rate": 0.0001982351660846924, - "loss": 1.0539, + "loss": 1.0415, "step": 500 }, { "epoch": 0.15548627949043606, - "grad_norm": 0.6737414679607289, + "grad_norm": 0.07649017525208568, "learning_rate": 0.00019813319716704278, - "loss": 1.0559, + "loss": 1.0427, "step": 505 }, { "epoch": 0.15702574760420274, - "grad_norm": 0.7252765201034201, + "grad_norm": 0.07708900519770054, "learning_rate": 0.00019802839232770921, - "loss": 1.0986, + "loss": 1.0855, "step": 510 }, { "epoch": 0.15856521571796944, - "grad_norm": 1.3250469121293895, + "grad_norm": 0.07439539455159229, "learning_rate": 0.00019792075459541518, - "loss": 1.0327, + "loss": 1.0224, "step": 515 }, { "epoch": 0.16010468383173612, - "grad_norm": 0.6408679533216792, + "grad_norm": 0.08182572188820655, "learning_rate": 0.00019781028708075102, - "loss": 1.0415, + "loss": 1.0302, "step": 520 }, { "epoch": 0.16164415194550283, - "grad_norm": 0.9808936518837947, + "grad_norm": 0.08161041915734439, "learning_rate": 0.00019769699297608417, - "loss": 1.0386, + "loss": 1.0245, "step": 525 }, { "epoch": 0.16318362005926954, - "grad_norm": 0.7892295118178237, + "grad_norm": 0.08888339173595923, "learning_rate": 0.00019758087555546682, - "loss": 1.0866, + "loss": 1.0747, "step": 530 }, { "epoch": 0.16472308817303621, - "grad_norm": 0.6508163256108649, + "grad_norm": 0.07764918511303621, "learning_rate": 0.0001974619381745413, - "loss": 1.0804, + "loss": 1.0696, "step": 535 }, { "epoch": 0.16626255628680292, - "grad_norm": 0.7619370139690409, + "grad_norm": 0.07999928337629646, "learning_rate": 0.00019734018427044307, - "loss": 1.0471, + "loss": 1.0351, "step": 540 }, { "epoch": 0.1678020244005696, - "grad_norm": 1.3124132163956248, + "grad_norm": 0.07656288262788609, "learning_rate": 0.0001972156173617016, - "loss": 1.0517, + "loss": 1.0399, "step": 545 }, { "epoch": 0.1693414925143363, - "grad_norm": 0.969795077947931, + "grad_norm": 0.07902096762825829, "learning_rate": 0.00019708824104813837, - "loss": 1.0518, + "loss": 1.0397, "step": 550 }, { "epoch": 0.17088096062810298, - "grad_norm": 0.7315912579640327, + "grad_norm": 0.08100109284928467, "learning_rate": 0.00019695805901076308, - "loss": 1.0784, + "loss": 1.0661, "step": 555 }, { "epoch": 0.1724204287418697, - "grad_norm": 0.7357250477683509, + "grad_norm": 0.07617725821540045, "learning_rate": 0.00019682507501166718, - "loss": 1.1008, + "loss": 1.088, "step": 560 }, { "epoch": 0.17395989685563637, - "grad_norm": 0.96782095170125, + "grad_norm": 0.0781772918354948, "learning_rate": 0.00019668929289391523, - "loss": 1.0444, + "loss": 1.0325, "step": 565 }, { "epoch": 0.17549936496940308, - "grad_norm": 0.7600182365968516, + "grad_norm": 0.07602400296386462, "learning_rate": 0.00019655071658143366, - "loss": 1.0625, + "loss": 1.0493, "step": 570 }, { "epoch": 0.17703883308316976, - "grad_norm": 0.6161742360584987, + "grad_norm": 0.07965301834511823, "learning_rate": 0.00019640935007889755, - "loss": 1.091, + "loss": 1.0759, "step": 575 }, { "epoch": 0.17857830119693646, - "grad_norm": 0.5777577463068309, + "grad_norm": 0.07753220082647658, "learning_rate": 0.0001962651974716149, - "loss": 1.072, + "loss": 1.0601, "step": 580 }, { "epoch": 0.18011776931070314, - "grad_norm": 0.644094996294978, + "grad_norm": 0.07648415793765183, "learning_rate": 0.0001961182629254084, - "loss": 1.0275, + "loss": 1.0151, "step": 585 }, { "epoch": 0.18165723742446985, - "grad_norm": 0.8589876413667922, + "grad_norm": 0.08029992977054808, "learning_rate": 0.00019596855068649522, - "loss": 1.0605, + "loss": 1.0499, "step": 590 }, { "epoch": 0.18319670553823653, - "grad_norm": 0.7740247212495535, + "grad_norm": 0.08092320232434004, "learning_rate": 0.00019581606508136426, - "loss": 1.0764, + "loss": 1.0631, "step": 595 }, { "epoch": 0.18473617365200323, - "grad_norm": 0.6540083386801736, + "grad_norm": 0.0748434186581261, "learning_rate": 0.00019566081051665098, - "loss": 1.0174, + "loss": 1.0053, + "step": 600 + }, + { + "epoch": 0.18473617365200323, + "eval_loss": 1.0412589311599731, + "eval_runtime": 3798.478, + "eval_samples_per_second": 6.084, + "eval_steps_per_second": 0.38, "step": 600 }, { "epoch": 0.18627564176576994, - "grad_norm": 0.7815089251045078, + "grad_norm": 0.07474772922675897, "learning_rate": 0.00019550279147901036, - "loss": 1.0952, + "loss": 1.0811, "step": 605 }, { "epoch": 0.18781510987953662, - "grad_norm": 0.6670730533500357, + "grad_norm": 0.0773035541962382, "learning_rate": 0.00019534201253498682, - "loss": 1.0484, + "loss": 1.0359, "step": 610 }, { "epoch": 0.18935457799330332, - "grad_norm": 0.7552112484078745, + "grad_norm": 0.08385957791440671, "learning_rate": 0.0001951784783308827, - "loss": 1.0104, + "loss": 0.9995, "step": 615 }, { "epoch": 0.19089404610707, - "grad_norm": 0.6343514261370218, + "grad_norm": 0.07841850327107855, "learning_rate": 0.0001950121935926236, - "loss": 1.0525, + "loss": 1.0417, "step": 620 }, { "epoch": 0.1924335142208367, - "grad_norm": 1.4473777235442007, + "grad_norm": 0.11766060709484247, "learning_rate": 0.00019484316312562205, - "loss": 1.033, + "loss": 1.0227, "step": 625 }, { "epoch": 0.1939729823346034, - "grad_norm": 0.5653781957358222, + "grad_norm": 0.07110694761741441, "learning_rate": 0.00019467139181463862, - "loss": 1.078, + "loss": 1.0652, "step": 630 }, { "epoch": 0.1955124504483701, - "grad_norm": 0.6315352985318332, + "grad_norm": 0.07470835721419704, "learning_rate": 0.00019449688462364056, - "loss": 1.042, + "loss": 1.0299, "step": 635 }, { "epoch": 0.19705191856213677, - "grad_norm": 0.8308310132176421, + "grad_norm": 0.08102828325973369, "learning_rate": 0.00019431964659565867, - "loss": 1.0604, + "loss": 1.0488, "step": 640 }, { "epoch": 0.19859138667590348, - "grad_norm": 0.5915623011666878, + "grad_norm": 0.07603967351010721, "learning_rate": 0.0001941396828526412, - "loss": 1.058, + "loss": 1.0459, "step": 645 }, { "epoch": 0.20013085478967016, - "grad_norm": 0.9623739180076736, + "grad_norm": 0.08273841396400562, "learning_rate": 0.00019395699859530623, - "loss": 1.0314, + "loss": 1.0194, "step": 650 }, { "epoch": 0.20167032290343687, - "grad_norm": 0.7770406923743924, + "grad_norm": 0.08236332870987446, "learning_rate": 0.00019377159910299093, - "loss": 1.0418, + "loss": 1.0307, "step": 655 }, { "epoch": 0.20320979101720354, - "grad_norm": 0.582853981181253, + "grad_norm": 0.06867994321607887, "learning_rate": 0.00019358348973349943, - "loss": 1.0228, + "loss": 1.0098, "step": 660 }, { "epoch": 0.20474925913097025, - "grad_norm": 0.6672404317903096, + "grad_norm": 0.08144032256455716, "learning_rate": 0.00019339267592294763, - "loss": 1.0602, + "loss": 1.048, "step": 665 }, { "epoch": 0.20628872724473693, - "grad_norm": 0.7929111958910604, + "grad_norm": 0.07918355281230142, "learning_rate": 0.00019319916318560635, - "loss": 1.0359, + "loss": 1.0227, "step": 670 }, { "epoch": 0.20782819535850364, - "grad_norm": 0.6853640952957439, + "grad_norm": 0.07792694280227995, "learning_rate": 0.00019300295711374187, - "loss": 1.0519, + "loss": 1.039, "step": 675 }, { "epoch": 0.20936766347227034, - "grad_norm": 0.6773550735079633, + "grad_norm": 0.07841373321559497, "learning_rate": 0.00019280406337745428, - "loss": 1.0295, + "loss": 1.0185, "step": 680 }, { "epoch": 0.21090713158603702, - "grad_norm": 0.7456356464144906, + "grad_norm": 0.12957233088012476, "learning_rate": 0.00019260248772451377, - "loss": 1.0614, + "loss": 1.0496, "step": 685 }, { "epoch": 0.21244659969980373, - "grad_norm": 0.7051829760237421, + "grad_norm": 0.07758713390528212, "learning_rate": 0.0001923982359801943, - "loss": 1.0556, + "loss": 1.0425, "step": 690 }, { "epoch": 0.2139860678135704, - "grad_norm": 0.6567867383663277, + "grad_norm": 0.08451147321948667, "learning_rate": 0.00019219131404710552, - "loss": 1.0854, + "loss": 1.0749, "step": 695 }, { "epoch": 0.21552553592733711, - "grad_norm": 0.6369210418492607, + "grad_norm": 0.08218923027527074, "learning_rate": 0.00019198172790502196, - "loss": 1.0368, + "loss": 1.0244, "step": 700 }, { "epoch": 0.2170650040411038, - "grad_norm": 0.6342900260792401, + "grad_norm": 0.07475377041516394, "learning_rate": 0.0001917694836107104, - "loss": 1.0484, + "loss": 1.0367, "step": 705 }, { "epoch": 0.2186044721548705, - "grad_norm": 0.5622570224326242, + "grad_norm": 0.06989397102142611, "learning_rate": 0.00019155458729775467, - "loss": 1.0595, + "loss": 1.049, "step": 710 }, { "epoch": 0.22014394026863718, - "grad_norm": 0.809911467681193, + "grad_norm": 0.07403450910939992, "learning_rate": 0.0001913370451763786, - "loss": 1.0278, + "loss": 1.0135, "step": 715 }, { "epoch": 0.22168340838240388, - "grad_norm": 0.6267743698514823, + "grad_norm": 0.06736867483748331, "learning_rate": 0.00019111686353326631, - "loss": 1.0309, + "loss": 1.0213, "step": 720 }, { "epoch": 0.22322287649617056, - "grad_norm": 0.5838744010816593, + "grad_norm": 0.07406189263799307, "learning_rate": 0.00019089404873138082, - "loss": 1.0637, + "loss": 1.0521, "step": 725 }, { "epoch": 0.22476234460993727, - "grad_norm": 0.5837090263106021, + "grad_norm": 0.07355438357203191, "learning_rate": 0.00019066860720977986, - "loss": 1.059, + "loss": 1.0483, "step": 730 }, { "epoch": 0.22630181272370395, - "grad_norm": 0.7696380706231972, + "grad_norm": 0.07568463711454308, "learning_rate": 0.00019044054548343002, - "loss": 1.0403, + "loss": 1.0289, "step": 735 }, { "epoch": 0.22784128083747066, - "grad_norm": 0.5699826758505713, + "grad_norm": 0.07229067305689793, "learning_rate": 0.0001902098701430184, - "loss": 1.0799, + "loss": 1.0694, "step": 740 }, { "epoch": 0.22938074895123733, - "grad_norm": 0.6868169075444956, + "grad_norm": 0.07530804590739208, "learning_rate": 0.00018997658785476214, - "loss": 1.0781, + "loss": 1.0651, "step": 745 }, { "epoch": 0.23092021706500404, - "grad_norm": 0.583006345831051, + "grad_norm": 0.07259570093477205, "learning_rate": 0.00018974070536021572, - "loss": 1.0814, + "loss": 1.0685, "step": 750 }, { "epoch": 0.23245968517877075, - "grad_norm": 0.6006905084344202, + "grad_norm": 0.06991198063848746, "learning_rate": 0.00018950222947607625, - "loss": 1.0624, + "loss": 1.0524, "step": 755 }, { "epoch": 0.23399915329253743, - "grad_norm": 0.5697764314148909, + "grad_norm": 0.07071964916232602, "learning_rate": 0.0001892611670939865, - "loss": 1.008, + "loss": 0.9967, "step": 760 }, { "epoch": 0.23553862140630413, - "grad_norm": 0.7413320445202513, + "grad_norm": 0.08069984398117862, "learning_rate": 0.00018901752518033548, - "loss": 1.0612, + "loss": 1.0503, "step": 765 }, { "epoch": 0.2370780895200708, - "grad_norm": 0.5363151044675402, + "grad_norm": 0.0719126875966159, "learning_rate": 0.0001887713107760575, - "loss": 1.0605, + "loss": 1.0497, "step": 770 }, { "epoch": 0.23861755763383752, - "grad_norm": 0.8682202097559629, + "grad_norm": 0.07933165083127114, "learning_rate": 0.00018852253099642833, - "loss": 1.031, + "loss": 1.0163, "step": 775 }, { "epoch": 0.2401570257476042, - "grad_norm": 0.5831373382248602, + "grad_norm": 0.07529876789807866, "learning_rate": 0.0001882711930308599, - "loss": 1.06, + "loss": 1.0503, "step": 780 }, { "epoch": 0.2416964938613709, - "grad_norm": 0.5590693033565884, + "grad_norm": 0.074705285570636, "learning_rate": 0.00018801730414269225, - "loss": 1.0533, + "loss": 1.0424, "step": 785 }, { "epoch": 0.24323596197513758, - "grad_norm": 0.6205945960602542, + "grad_norm": 0.07414239254048278, "learning_rate": 0.0001877608716689839, - "loss": 1.0757, + "loss": 1.0655, "step": 790 }, { "epoch": 0.2447754300889043, - "grad_norm": 0.7900938502871934, + "grad_norm": 0.07941506265986978, "learning_rate": 0.00018750190302029956, - "loss": 1.0301, + "loss": 1.0193, "step": 795 }, { "epoch": 0.24631489820267097, - "grad_norm": 1.0979738402724484, + "grad_norm": 0.08230667165269098, "learning_rate": 0.00018724040568049612, - "loss": 1.0547, + "loss": 1.0446, + "step": 800 + }, + { + "epoch": 0.24631489820267097, + "eval_loss": 1.0366028547286987, + "eval_runtime": 3798.2715, + "eval_samples_per_second": 6.084, + "eval_steps_per_second": 0.38, "step": 800 }, { "epoch": 0.24785436631643767, - "grad_norm": 0.7255960650743382, + "grad_norm": 0.08052061406166201, "learning_rate": 0.00018697638720650646, - "loss": 1.0454, + "loss": 1.0329, "step": 805 }, { "epoch": 0.24939383443020435, - "grad_norm": 0.6620553781722808, + "grad_norm": 0.07060612206330524, "learning_rate": 0.00018670985522812084, - "loss": 1.0219, + "loss": 1.0123, "step": 810 }, { "epoch": 0.25093330254397106, - "grad_norm": 0.6854793724956825, + "grad_norm": 0.07261155032686553, "learning_rate": 0.0001864408174477665, - "loss": 1.0509, + "loss": 1.0394, "step": 815 }, { "epoch": 0.25247277065773777, - "grad_norm": 0.5492317189334932, + "grad_norm": 0.07296759582556935, "learning_rate": 0.00018616928164028523, - "loss": 1.0159, + "loss": 1.0021, "step": 820 }, { "epoch": 0.2540122387715045, - "grad_norm": 0.8404726657300783, + "grad_norm": 0.06646733390910516, "learning_rate": 0.00018589525565270844, - "loss": 1.0411, + "loss": 1.0286, "step": 825 }, { "epoch": 0.2555517068852711, - "grad_norm": 0.6623737864891829, + "grad_norm": 0.07496596424661404, "learning_rate": 0.0001856187474040306, - "loss": 1.0602, + "loss": 1.0502, "step": 830 }, { "epoch": 0.25709117499903783, - "grad_norm": 0.5418949047693957, + "grad_norm": 0.08500360217319118, "learning_rate": 0.00018533976488498016, - "loss": 1.037, + "loss": 1.0256, "step": 835 }, { "epoch": 0.25863064311280454, - "grad_norm": 0.7761449291018582, + "grad_norm": 0.07817756873072405, "learning_rate": 0.0001850583161577889, - "loss": 1.0738, + "loss": 1.0609, "step": 840 }, { "epoch": 0.26017011122657124, - "grad_norm": 0.5847223837062915, + "grad_norm": 0.07136612848707545, "learning_rate": 0.00018477440935595873, - "loss": 1.0875, + "loss": 1.0775, "step": 845 }, { "epoch": 0.2617095793403379, - "grad_norm": 0.5106913164341648, + "grad_norm": 0.07292608365481835, "learning_rate": 0.00018448805268402672, - "loss": 1.0685, + "loss": 1.058, "step": 850 }, { "epoch": 0.2632490474541046, - "grad_norm": 0.7444912623137734, + "grad_norm": 0.07716711803643432, "learning_rate": 0.00018419925441732804, - "loss": 1.0407, + "loss": 1.0294, "step": 855 }, { "epoch": 0.2647885155678713, - "grad_norm": 0.5183580101131431, + "grad_norm": 0.07526261921660161, "learning_rate": 0.00018390802290175673, - "loss": 1.0572, + "loss": 1.0467, "step": 860 }, { "epoch": 0.266327983681638, - "grad_norm": 0.6844054776521512, + "grad_norm": 0.0735157839737638, "learning_rate": 0.00018361436655352456, - "loss": 1.0383, + "loss": 1.0278, "step": 865 }, { "epoch": 0.26786745179540467, - "grad_norm": 1.0067719874632055, + "grad_norm": 0.07101822956411033, "learning_rate": 0.00018331829385891783, - "loss": 1.031, + "loss": 1.0188, "step": 870 }, { "epoch": 0.26940691990917137, - "grad_norm": 0.5484350200540726, + "grad_norm": 0.07406443039738211, "learning_rate": 0.00018301981337405212, - "loss": 1.0585, + "loss": 1.0476, "step": 875 }, { "epoch": 0.2709463880229381, - "grad_norm": 0.6056460900910511, + "grad_norm": 0.07470379094242477, "learning_rate": 0.00018271893372462497, - "loss": 1.0585, + "loss": 1.0468, "step": 880 }, { "epoch": 0.2724858561367048, - "grad_norm": 0.48416296519569474, + "grad_norm": 0.07458412123750419, "learning_rate": 0.00018241566360566665, - "loss": 1.039, + "loss": 1.0279, "step": 885 }, { "epoch": 0.27402532425047144, - "grad_norm": 0.6050871964307601, + "grad_norm": 0.08164107594170099, "learning_rate": 0.00018211001178128892, - "loss": 1.0571, + "loss": 1.0472, "step": 890 }, { "epoch": 0.27556479236423814, - "grad_norm": 0.5969541614482573, + "grad_norm": 0.07748097167228449, "learning_rate": 0.00018180198708443173, - "loss": 1.0653, + "loss": 1.0534, "step": 895 }, { "epoch": 0.27710426047800485, - "grad_norm": 0.7541885423123031, + "grad_norm": 0.07485972229218758, "learning_rate": 0.00018149159841660795, - "loss": 1.0521, + "loss": 1.0419, "step": 900 }, { "epoch": 0.27864372859177156, - "grad_norm": 0.49528417781944445, + "grad_norm": 0.07553124022662376, "learning_rate": 0.00018117885474764613, - "loss": 1.0943, + "loss": 1.0836, "step": 905 }, { "epoch": 0.28018319670553826, - "grad_norm": 0.6918705895372237, + "grad_norm": 0.07966215645919128, "learning_rate": 0.00018086376511543126, - "loss": 1.0747, + "loss": 1.0642, "step": 910 }, { "epoch": 0.2817226648193049, - "grad_norm": 0.7195296276082791, + "grad_norm": 0.08376456009997757, "learning_rate": 0.00018054633862564368, - "loss": 1.0507, + "loss": 1.0398, "step": 915 }, { "epoch": 0.2832621329330716, - "grad_norm": 0.5709551372136472, + "grad_norm": 0.075508959246266, "learning_rate": 0.0001802265844514958, - "loss": 1.0094, + "loss": 0.9996, "step": 920 }, { "epoch": 0.2848016010468383, - "grad_norm": 0.43690572218401663, + "grad_norm": 0.07358158800850821, "learning_rate": 0.0001799045118334671, - "loss": 1.064, + "loss": 1.0542, "step": 925 }, { "epoch": 0.28634106916060503, - "grad_norm": 0.5329066679717666, + "grad_norm": 0.08094264187967125, "learning_rate": 0.00017958013007903713, - "loss": 1.067, + "loss": 1.0563, "step": 930 }, { "epoch": 0.2878805372743717, - "grad_norm": 0.53800354086698, + "grad_norm": 0.07424176124118159, "learning_rate": 0.0001792534485624164, - "loss": 1.0491, + "loss": 1.0405, "step": 935 }, { "epoch": 0.2894200053881384, - "grad_norm": 0.6415976681925623, + "grad_norm": 0.07418414794842867, "learning_rate": 0.00017892447672427563, - "loss": 1.0496, + "loss": 1.0391, "step": 940 }, { "epoch": 0.2909594735019051, - "grad_norm": 0.5366936452063369, + "grad_norm": 0.0704593007549167, "learning_rate": 0.00017859322407147272, - "loss": 1.0657, + "loss": 1.0543, "step": 945 }, { "epoch": 0.2924989416156718, - "grad_norm": 0.5700114756555418, + "grad_norm": 0.07058098730245323, "learning_rate": 0.00017825970017677832, - "loss": 1.0808, + "loss": 1.0693, "step": 950 }, { "epoch": 0.29403840972943845, - "grad_norm": 0.5602975678360493, + "grad_norm": 0.07100077379863531, "learning_rate": 0.00017792391467859886, - "loss": 1.0255, + "loss": 1.0157, "step": 955 }, { "epoch": 0.29557787784320516, - "grad_norm": 0.5222669096259323, + "grad_norm": 0.07264965656385536, "learning_rate": 0.0001775858772806983, - "loss": 1.0762, + "loss": 1.0669, "step": 960 }, { "epoch": 0.29711734595697187, - "grad_norm": 0.6950722858238849, + "grad_norm": 0.06945646756969821, "learning_rate": 0.00017724559775191744, - "loss": 1.0382, + "loss": 1.0282, "step": 965 }, { "epoch": 0.2986568140707386, - "grad_norm": 1.0357933657324396, + "grad_norm": 0.0800750187488917, "learning_rate": 0.00017690308592589182, - "loss": 1.0541, + "loss": 1.0424, "step": 970 }, { "epoch": 0.3001962821845053, - "grad_norm": 0.5871650345246843, + "grad_norm": 0.07826578070698212, "learning_rate": 0.0001765583517007675, - "loss": 1.0035, + "loss": 0.994, "step": 975 }, { "epoch": 0.30173575029827193, - "grad_norm": 0.5209052634359257, + "grad_norm": 0.07185624380063993, "learning_rate": 0.00017621140503891488, - "loss": 1.0206, + "loss": 1.0117, "step": 980 }, { "epoch": 0.30327521841203864, - "grad_norm": 0.6487211604010458, + "grad_norm": 0.07770724836361542, "learning_rate": 0.00017586225596664102, - "loss": 1.0381, + "loss": 1.0282, "step": 985 }, { "epoch": 0.30481468652580535, - "grad_norm": 0.5629565123905481, + "grad_norm": 0.07425549788358596, "learning_rate": 0.00017551091457389966, - "loss": 1.0434, + "loss": 1.0332, "step": 990 }, { "epoch": 0.30635415463957205, - "grad_norm": 0.8112166900825726, + "grad_norm": 0.07157671192234144, "learning_rate": 0.00017515739101399983, - "loss": 1.0287, + "loss": 1.0202, "step": 995 }, { "epoch": 0.3078936227533387, - "grad_norm": 0.6366943000992968, + "grad_norm": 0.07195148099166214, "learning_rate": 0.00017480169550331231, - "loss": 1.0193, + "loss": 1.0091, + "step": 1000 + }, + { + "epoch": 0.3078936227533387, + "eval_loss": 1.033624291419983, + "eval_runtime": 3799.3073, + "eval_samples_per_second": 6.082, + "eval_steps_per_second": 0.38, "step": 1000 }, { "epoch": 0.3094330908671054, - "grad_norm": 0.5040648629606902, + "grad_norm": 0.0709967222808181, "learning_rate": 0.00017444383832097442, - "loss": 1.0411, + "loss": 1.0306, "step": 1005 }, { "epoch": 0.3109725589808721, - "grad_norm": 0.8283443113345634, + "grad_norm": 0.08017250953363526, "learning_rate": 0.00017408382980859305, - "loss": 1.045, + "loss": 1.0335, "step": 1010 }, { "epoch": 0.3125120270946388, - "grad_norm": 0.6777410855607223, + "grad_norm": 0.0763005407159528, "learning_rate": 0.00017372168036994566, - "loss": 1.0263, + "loss": 1.0155, "step": 1015 }, { "epoch": 0.3140514952084055, - "grad_norm": 0.5530544091806374, + "grad_norm": 0.068090767981409, "learning_rate": 0.00017335740047067972, - "loss": 1.0307, + "loss": 1.0226, "step": 1020 }, { "epoch": 0.3155909633221722, - "grad_norm": 0.45671110148384275, + "grad_norm": 0.07053765308848822, "learning_rate": 0.0001729910006380102, - "loss": 1.055, + "loss": 1.0455, "step": 1025 }, { "epoch": 0.3171304314359389, - "grad_norm": 0.6812995390238387, + "grad_norm": 0.07639366775520491, "learning_rate": 0.00017262249146041546, - "loss": 1.0838, + "loss": 1.0737, "step": 1030 }, { "epoch": 0.3186698995497056, - "grad_norm": 0.568521042510064, + "grad_norm": 0.07414091472835294, "learning_rate": 0.00017225188358733107, - "loss": 1.0265, + "loss": 1.0159, "step": 1035 }, { "epoch": 0.32020936766347224, - "grad_norm": 0.8025629235004524, + "grad_norm": 0.07840200264036183, "learning_rate": 0.00017187918772884232, - "loss": 1.0721, + "loss": 1.0605, "step": 1040 }, { "epoch": 0.32174883577723895, - "grad_norm": 0.6004044742833335, + "grad_norm": 0.06946548404139283, "learning_rate": 0.00017150441465537447, - "loss": 1.0655, + "loss": 1.0549, "step": 1045 }, { "epoch": 0.32328830389100566, - "grad_norm": 0.5635765966408569, + "grad_norm": 0.0726329779508538, "learning_rate": 0.00017112757519738154, - "loss": 1.0396, + "loss": 1.0294, "step": 1050 }, { "epoch": 0.32482777200477236, - "grad_norm": 0.5043399412628455, + "grad_norm": 0.07366641465053547, "learning_rate": 0.0001707486802450335, - "loss": 1.0536, + "loss": 1.0439, "step": 1055 }, { "epoch": 0.32636724011853907, - "grad_norm": 0.5106615244506089, + "grad_norm": 0.07461023494546891, "learning_rate": 0.00017036774074790132, - "loss": 1.0135, + "loss": 1.0036, "step": 1060 }, { "epoch": 0.3279067082323057, - "grad_norm": 0.6069038358467338, + "grad_norm": 0.07745841056330656, "learning_rate": 0.00016998476771464072, - "loss": 1.0488, + "loss": 1.039, "step": 1065 }, { "epoch": 0.32944617634607243, - "grad_norm": 0.5481314050474333, + "grad_norm": 0.07562279638819498, "learning_rate": 0.00016959977221267392, - "loss": 1.0226, + "loss": 1.0136, "step": 1070 }, { "epoch": 0.33098564445983913, - "grad_norm": 0.5692030442740773, + "grad_norm": 0.07269409212200949, "learning_rate": 0.0001692127653678699, - "loss": 1.0545, + "loss": 1.0447, "step": 1075 }, { "epoch": 0.33252511257360584, - "grad_norm": 0.5422422595812953, + "grad_norm": 0.07863977410900856, "learning_rate": 0.00016882375836422284, - "loss": 1.04, + "loss": 1.032, "step": 1080 }, { "epoch": 0.3340645806873725, - "grad_norm": 0.8241400248029749, + "grad_norm": 0.08154682576838618, "learning_rate": 0.00016843276244352885, - "loss": 1.0667, + "loss": 1.0576, "step": 1085 }, { "epoch": 0.3356040488011392, - "grad_norm": 0.5205208851186217, + "grad_norm": 0.07324914224304953, "learning_rate": 0.00016803978890506113, - "loss": 1.0783, + "loss": 1.0677, "step": 1090 }, { "epoch": 0.3371435169149059, - "grad_norm": 0.6117364442740371, + "grad_norm": 0.08330706239189462, "learning_rate": 0.00016764484910524358, - "loss": 1.0339, + "loss": 1.0244, "step": 1095 }, { "epoch": 0.3386829850286726, - "grad_norm": 0.6490741063813862, + "grad_norm": 0.07527643648007623, "learning_rate": 0.00016724795445732243, - "loss": 1.0068, + "loss": 0.9977, "step": 1100 }, { "epoch": 0.34022245314243926, - "grad_norm": 0.6013736527570045, + "grad_norm": 0.07895912028160554, "learning_rate": 0.00016684911643103642, - "loss": 1.0665, + "loss": 1.0575, "step": 1105 }, { "epoch": 0.34176192125620597, - "grad_norm": 0.6118721687785333, + "grad_norm": 0.073939133015858, "learning_rate": 0.0001664483465522855, - "loss": 1.0423, + "loss": 1.0337, "step": 1110 }, { "epoch": 0.3433013893699727, - "grad_norm": 0.5419797734028425, + "grad_norm": 0.07648599682491888, "learning_rate": 0.00016604565640279754, - "loss": 1.0564, + "loss": 1.0462, "step": 1115 }, { "epoch": 0.3448408574837394, - "grad_norm": 0.5495541176613125, + "grad_norm": 0.07375239907970622, "learning_rate": 0.0001656410576197938, - "loss": 1.0636, + "loss": 1.0537, "step": 1120 }, { "epoch": 0.3463803255975061, - "grad_norm": 0.4477184453911253, + "grad_norm": 0.07218952828382294, "learning_rate": 0.0001652345618956526, - "loss": 1.0797, + "loss": 1.0702, "step": 1125 }, { "epoch": 0.34791979371127274, - "grad_norm": 0.5912654457171957, + "grad_norm": 0.07501734343767677, "learning_rate": 0.00016482618097757122, - "loss": 1.0553, + "loss": 1.045, "step": 1130 }, { "epoch": 0.34945926182503945, - "grad_norm": 0.5561325327054591, + "grad_norm": 0.07478505250167114, "learning_rate": 0.00016441592666722684, - "loss": 1.0459, + "loss": 1.0356, "step": 1135 }, { "epoch": 0.35099872993880615, - "grad_norm": 0.6091265051455126, + "grad_norm": 0.07035501241737965, "learning_rate": 0.00016400381082043507, - "loss": 1.0916, + "loss": 1.0819, "step": 1140 }, { "epoch": 0.35253819805257286, - "grad_norm": 0.7050858387246796, + "grad_norm": 0.07713003380587562, "learning_rate": 0.00016358984534680748, - "loss": 1.0605, + "loss": 1.0494, "step": 1145 }, { "epoch": 0.3540776661663395, - "grad_norm": 0.592579238142578, + "grad_norm": 0.07012091124270008, "learning_rate": 0.00016317404220940758, - "loss": 1.0319, + "loss": 1.022, "step": 1150 }, { "epoch": 0.3556171342801062, - "grad_norm": 0.4991456345649348, + "grad_norm": 0.06697708347109951, "learning_rate": 0.00016275641342440483, - "loss": 1.0671, + "loss": 1.0589, "step": 1155 }, { "epoch": 0.3571566023938729, - "grad_norm": 0.5675708757017103, + "grad_norm": 0.07573896521834783, "learning_rate": 0.0001623369710607277, - "loss": 1.0136, + "loss": 1.0044, "step": 1160 }, { "epoch": 0.35869607050763963, - "grad_norm": 0.45324587662080057, + "grad_norm": 0.06946529088193742, "learning_rate": 0.00016191572723971455, - "loss": 1.0752, + "loss": 1.0652, "step": 1165 }, { "epoch": 0.3602355386214063, - "grad_norm": 0.4578223763710829, + "grad_norm": 0.0727340465476027, "learning_rate": 0.00016149269413476353, - "loss": 1.0162, + "loss": 1.0057, "step": 1170 }, { "epoch": 0.361775006735173, - "grad_norm": 0.4787021798925449, + "grad_norm": 0.08017843239666048, "learning_rate": 0.00016106788397098095, - "loss": 1.0031, + "loss": 0.9942, "step": 1175 }, { "epoch": 0.3633144748489397, - "grad_norm": 0.49337377711536057, + "grad_norm": 0.06899110462149576, "learning_rate": 0.0001606413090248276, - "loss": 1.0042, + "loss": 0.9958, "step": 1180 }, { "epoch": 0.3648539429627064, - "grad_norm": 0.549765500927778, + "grad_norm": 0.07737025459856088, "learning_rate": 0.00016021298162376428, - "loss": 1.0301, + "loss": 1.0211, "step": 1185 }, { "epoch": 0.36639341107647305, - "grad_norm": 0.6910911024000683, + "grad_norm": 0.07807131065906221, "learning_rate": 0.00015978291414589542, - "loss": 1.049, + "loss": 1.039, "step": 1190 }, { "epoch": 0.36793287919023976, - "grad_norm": 0.5099057113980399, + "grad_norm": 0.07059155235596021, "learning_rate": 0.0001593511190196115, - "loss": 1.0613, + "loss": 1.0513, "step": 1195 }, { "epoch": 0.36947234730400647, - "grad_norm": 0.5326314131616275, + "grad_norm": 0.07422722970665956, "learning_rate": 0.00015891760872322963, - "loss": 1.0177, + "loss": 1.0093, + "step": 1200 + }, + { + "epoch": 0.36947234730400647, + "eval_loss": 1.0309594869613647, + "eval_runtime": 3796.5579, + "eval_samples_per_second": 6.087, + "eval_steps_per_second": 0.381, "step": 1200 }, { "epoch": 0.37101181541777317, - "grad_norm": 0.44876418883849256, + "grad_norm": 0.06806084338199529, "learning_rate": 0.00015848239578463325, - "loss": 1.0594, + "loss": 1.0504, "step": 1205 }, { "epoch": 0.3725512835315399, - "grad_norm": 0.518140271684293, + "grad_norm": 0.07638211255486586, "learning_rate": 0.00015804549278090982, - "loss": 1.0228, + "loss": 1.0145, "step": 1210 }, { "epoch": 0.37409075164530653, - "grad_norm": 0.5085879960253272, + "grad_norm": 0.07232165039483601, "learning_rate": 0.00015760691233798757, - "loss": 1.0187, + "loss": 1.011, "step": 1215 }, { "epoch": 0.37563021975907324, - "grad_norm": 0.5007291726473487, + "grad_norm": 0.0725477342684882, "learning_rate": 0.00015716666713027055, - "loss": 1.0433, + "loss": 1.0338, "step": 1220 }, { "epoch": 0.37716968787283994, - "grad_norm": 0.739316075498731, + "grad_norm": 0.08448404468374969, "learning_rate": 0.00015672476988027228, - "loss": 1.0478, + "loss": 1.0388, "step": 1225 }, { "epoch": 0.37870915598660665, - "grad_norm": 0.7639785434460002, + "grad_norm": 0.08451055602238913, "learning_rate": 0.0001562812333582482, - "loss": 1.0117, + "loss": 1.0041, "step": 1230 }, { "epoch": 0.3802486241003733, - "grad_norm": 0.5823718950453733, + "grad_norm": 0.07357435195090126, "learning_rate": 0.00015583607038182655, - "loss": 1.0366, + "loss": 1.0286, "step": 1235 }, { "epoch": 0.38178809221414, - "grad_norm": 0.6893848055474053, + "grad_norm": 0.07981414373807207, "learning_rate": 0.000155389293815638, - "loss": 1.0381, + "loss": 1.0293, "step": 1240 }, { "epoch": 0.3833275603279067, - "grad_norm": 0.5063333022122084, + "grad_norm": 0.074241776686085, "learning_rate": 0.00015494091657094385, - "loss": 1.0419, + "loss": 1.033, "step": 1245 }, { "epoch": 0.3848670284416734, - "grad_norm": 0.44024803486149566, + "grad_norm": 0.07517760872068341, "learning_rate": 0.00015449095160526292, - "loss": 1.0645, + "loss": 1.0559, "step": 1250 }, { "epoch": 0.38640649655544007, - "grad_norm": 0.5599556807381372, + "grad_norm": 0.07476423646372729, "learning_rate": 0.00015403941192199718, - "loss": 1.0443, + "loss": 1.0343, "step": 1255 }, { "epoch": 0.3879459646692068, - "grad_norm": 0.4886207710717129, + "grad_norm": 0.07214431115898451, "learning_rate": 0.0001535863105700558, - "loss": 1.0556, + "loss": 1.0467, "step": 1260 }, { "epoch": 0.3894854327829735, - "grad_norm": 0.5246689722873303, + "grad_norm": 0.07644175139453621, "learning_rate": 0.00015313166064347814, - "loss": 1.0256, + "loss": 1.0188, "step": 1265 }, { "epoch": 0.3910249008967402, - "grad_norm": 0.4204231295176996, + "grad_norm": 0.06984113294468933, "learning_rate": 0.00015267547528105538, - "loss": 1.0423, + "loss": 1.0341, "step": 1270 }, { "epoch": 0.3925643690105069, - "grad_norm": 0.43365441716522, + "grad_norm": 0.06690863213166448, "learning_rate": 0.0001522177676659508, - "loss": 1.0706, + "loss": 1.0625, "step": 1275 }, { "epoch": 0.39410383712427355, - "grad_norm": 0.5576831403247644, + "grad_norm": 0.06950059946334636, "learning_rate": 0.00015175855102531887, - "loss": 1.0199, + "loss": 1.0123, "step": 1280 }, { "epoch": 0.39564330523804025, - "grad_norm": 0.4746796767312794, + "grad_norm": 0.06964516306819979, "learning_rate": 0.00015129783862992283, - "loss": 1.029, + "loss": 1.0201, "step": 1285 }, { "epoch": 0.39718277335180696, - "grad_norm": 0.5397433381233686, + "grad_norm": 0.080130428292661, "learning_rate": 0.0001508356437937512, - "loss": 1.0527, + "loss": 1.0448, "step": 1290 }, { "epoch": 0.39872224146557367, - "grad_norm": 0.7141429909374478, + "grad_norm": 0.07373600496196321, "learning_rate": 0.00015037197987363338, - "loss": 1.0348, + "loss": 1.0272, "step": 1295 }, { "epoch": 0.4002617095793403, - "grad_norm": 0.4435301474074775, + "grad_norm": 0.07444079143838864, "learning_rate": 0.0001499068602688532, - "loss": 1.0702, + "loss": 1.0625, "step": 1300 }, { "epoch": 0.401801177693107, - "grad_norm": 0.5168064407110803, + "grad_norm": 0.07495218470810172, "learning_rate": 0.00014944029842076185, - "loss": 1.0367, + "loss": 1.0277, "step": 1305 }, { "epoch": 0.40334064580687373, - "grad_norm": 0.5215356272036379, + "grad_norm": 0.07148138370454796, "learning_rate": 0.0001489723078123896, - "loss": 1.0458, + "loss": 1.0393, "step": 1310 }, { "epoch": 0.40488011392064044, - "grad_norm": 0.47530585507691386, + "grad_norm": 0.07184439296288066, "learning_rate": 0.00014850290196805594, - "loss": 1.0496, + "loss": 1.0413, "step": 1315 }, { "epoch": 0.4064195820344071, - "grad_norm": 0.4940892436164985, + "grad_norm": 0.07052670997188848, "learning_rate": 0.00014803209445297887, - "loss": 1.0142, + "loss": 1.0056, "step": 1320 }, { "epoch": 0.4079590501481738, - "grad_norm": 0.5452384723470196, + "grad_norm": 0.07344695167875763, "learning_rate": 0.00014755989887288285, - "loss": 1.0494, + "loss": 1.0411, "step": 1325 }, { "epoch": 0.4094985182619405, - "grad_norm": 0.6302906153858793, + "grad_norm": 0.0769706853052285, "learning_rate": 0.00014708632887360564, - "loss": 1.0474, + "loss": 1.0387, "step": 1330 }, { "epoch": 0.4110379863757072, - "grad_norm": 0.5314488438447993, + "grad_norm": 0.0739404718972198, "learning_rate": 0.0001466113981407039, - "loss": 1.0539, + "loss": 1.0452, "step": 1335 }, { "epoch": 0.41257745448947386, - "grad_norm": 0.5583239486533321, + "grad_norm": 0.08213823024505344, "learning_rate": 0.00014613512039905765, - "loss": 1.0425, + "loss": 1.0339, "step": 1340 }, { "epoch": 0.41411692260324057, - "grad_norm": 0.5284692971206317, + "grad_norm": 0.07328616522330499, "learning_rate": 0.00014565750941247386, - "loss": 1.02, + "loss": 1.0133, "step": 1345 }, { "epoch": 0.4156563907170073, - "grad_norm": 0.5504329155738187, + "grad_norm": 0.07420350570178859, "learning_rate": 0.0001451785789832884, - "loss": 1.0266, + "loss": 1.0186, "step": 1350 }, { "epoch": 0.417195858830774, - "grad_norm": 0.7377959516300412, + "grad_norm": 0.07745166588825841, "learning_rate": 0.00014469834295196743, - "loss": 1.0567, + "loss": 1.0498, "step": 1355 }, { "epoch": 0.4187353269445407, - "grad_norm": 0.5133810725829329, + "grad_norm": 0.07355335272924506, "learning_rate": 0.00014421681519670722, - "loss": 1.0516, + "loss": 1.0435, "step": 1360 }, { "epoch": 0.42027479505830734, - "grad_norm": 0.5092592418186027, + "grad_norm": 0.07280563497532423, "learning_rate": 0.0001437340096330332, - "loss": 1.058, + "loss": 1.0503, "step": 1365 }, { "epoch": 0.42181426317207404, - "grad_norm": 0.6808151766449716, + "grad_norm": 0.08984865578017236, "learning_rate": 0.0001432499402133979, - "loss": 1.046, + "loss": 1.0373, "step": 1370 }, { "epoch": 0.42335373128584075, - "grad_norm": 0.49715289560889164, + "grad_norm": 0.07403634652711334, "learning_rate": 0.0001427646209267775, - "loss": 1.0384, + "loss": 1.0296, "step": 1375 }, { "epoch": 0.42489319939960746, - "grad_norm": 0.4073699217300379, + "grad_norm": 0.06941364422871579, "learning_rate": 0.00014227806579826774, - "loss": 1.0181, + "loss": 1.0097, "step": 1380 }, { "epoch": 0.4264326675133741, - "grad_norm": 0.5386638584236078, + "grad_norm": 0.07802992325990128, "learning_rate": 0.00014179028888867867, - "loss": 1.0816, + "loss": 1.0745, "step": 1385 }, { "epoch": 0.4279721356271408, - "grad_norm": 0.7007555607650736, + "grad_norm": 0.08282848213775869, "learning_rate": 0.00014130130429412815, - "loss": 1.0364, + "loss": 1.0273, "step": 1390 }, { "epoch": 0.4295116037409075, - "grad_norm": 0.6291522129011135, + "grad_norm": 0.08153105885218227, "learning_rate": 0.0001408111261456346, - "loss": 1.018, + "loss": 1.0099, "step": 1395 }, { "epoch": 0.43105107185467423, - "grad_norm": 0.4183183608173097, + "grad_norm": 0.0676196363646441, "learning_rate": 0.00014031976860870855, - "loss": 1.0159, + "loss": 1.0086, + "step": 1400 + }, + { + "epoch": 0.43105107185467423, + "eval_loss": 1.0290647745132446, + "eval_runtime": 3812.7057, + "eval_samples_per_second": 6.061, + "eval_steps_per_second": 0.379, "step": 1400 }, { "epoch": 0.4325905399684409, - "grad_norm": 0.42462992648226483, + "grad_norm": 0.07464451118448104, "learning_rate": 0.00013982724588294335, - "loss": 1.026, + "loss": 1.0198, "step": 1405 }, { "epoch": 0.4341300080822076, - "grad_norm": 0.5342525321314037, + "grad_norm": 0.07512676303409935, "learning_rate": 0.00013933357220160476, - "loss": 1.0668, + "loss": 1.0591, "step": 1410 }, { "epoch": 0.4356694761959743, - "grad_norm": 0.5175732551515752, + "grad_norm": 0.06938767156943278, "learning_rate": 0.00013883876183121973, - "loss": 1.0613, + "loss": 1.0523, "step": 1415 }, { "epoch": 0.437208944309741, - "grad_norm": 0.5236448642390229, + "grad_norm": 0.0777955845768148, "learning_rate": 0.000138342829071164, - "loss": 1.0505, + "loss": 1.0436, "step": 1420 }, { "epoch": 0.4387484124235077, - "grad_norm": 0.47885357545447377, + "grad_norm": 0.076768877519892, "learning_rate": 0.00013784578825324885, - "loss": 1.0181, + "loss": 1.0098, "step": 1425 }, { "epoch": 0.44028788053727436, - "grad_norm": 0.567561554601715, + "grad_norm": 0.07138945295655025, "learning_rate": 0.00013734765374130717, - "loss": 1.0337, + "loss": 1.0262, "step": 1430 }, { "epoch": 0.44182734865104106, - "grad_norm": 0.4640525016441619, + "grad_norm": 0.07543031342117723, "learning_rate": 0.00013684843993077788, - "loss": 1.0195, + "loss": 1.0124, "step": 1435 }, { "epoch": 0.44336681676480777, - "grad_norm": 0.6957136092069967, + "grad_norm": 0.07202890380872994, "learning_rate": 0.00013634816124829063, - "loss": 1.0252, + "loss": 1.0183, "step": 1440 }, { "epoch": 0.4449062848785745, - "grad_norm": 0.414100639900688, + "grad_norm": 0.06782846978809214, "learning_rate": 0.0001358468321512481, - "loss": 1.0625, + "loss": 1.0552, "step": 1445 }, { "epoch": 0.4464457529923411, - "grad_norm": 0.4358981186515927, + "grad_norm": 0.07176237407539487, "learning_rate": 0.00013534446712740877, - "loss": 1.0336, + "loss": 1.025, "step": 1450 }, { "epoch": 0.44798522110610783, - "grad_norm": 0.7000263701281964, + "grad_norm": 0.07601715009456655, "learning_rate": 0.0001348410806944681, - "loss": 1.0247, + "loss": 1.0153, "step": 1455 }, { "epoch": 0.44952468921987454, - "grad_norm": 0.48487218580357255, + "grad_norm": 0.0761359028970367, "learning_rate": 0.00013433668739963882, - "loss": 1.0306, + "loss": 1.0244, "step": 1460 }, { "epoch": 0.45106415733364125, - "grad_norm": 0.48692168534098523, + "grad_norm": 0.07165181931814346, "learning_rate": 0.00013383130181923071, - "loss": 1.0311, + "loss": 1.0237, "step": 1465 }, { "epoch": 0.4526036254474079, - "grad_norm": 0.6087744044400206, + "grad_norm": 0.07219427827224394, "learning_rate": 0.00013332493855822936, - "loss": 1.0138, + "loss": 1.0064, "step": 1470 }, { "epoch": 0.4541430935611746, - "grad_norm": 0.5570901501341506, + "grad_norm": 0.07315240256522645, "learning_rate": 0.00013281761224987398, - "loss": 1.0121, + "loss": 1.0049, "step": 1475 }, { "epoch": 0.4556825616749413, - "grad_norm": 0.49929406825113404, + "grad_norm": 0.07283831171004836, "learning_rate": 0.00013230933755523466, - "loss": 1.0345, + "loss": 1.028, "step": 1480 }, { "epoch": 0.457222029788708, - "grad_norm": 0.7306093657667501, + "grad_norm": 0.08277958377037488, "learning_rate": 0.00013180012916278854, - "loss": 1.0472, + "loss": 1.0402, "step": 1485 }, { "epoch": 0.45876149790247467, - "grad_norm": 0.46118596282419816, + "grad_norm": 0.0732834274712129, "learning_rate": 0.00013129000178799548, - "loss": 1.0441, + "loss": 1.0366, "step": 1490 }, { "epoch": 0.4603009660162414, - "grad_norm": 0.4542565239229393, + "grad_norm": 0.07270925231246442, "learning_rate": 0.00013077897017287272, - "loss": 1.0092, + "loss": 1.0006, "step": 1495 }, { "epoch": 0.4618404341300081, - "grad_norm": 0.4921710657691845, + "grad_norm": 0.07601545515982518, "learning_rate": 0.00013026704908556888, - "loss": 1.0637, + "loss": 1.0555, "step": 1500 }, { "epoch": 0.4633799022437748, - "grad_norm": 0.5001776822700446, + "grad_norm": 0.0775749000511019, "learning_rate": 0.0001297542533199371, - "loss": 1.0483, + "loss": 1.0409, "step": 1505 }, { "epoch": 0.4649193703575415, - "grad_norm": 0.463427010769576, + "grad_norm": 0.07378648062711159, "learning_rate": 0.00012924059769510768, - "loss": 1.0388, + "loss": 1.0314, "step": 1510 }, { "epoch": 0.46645883847130815, - "grad_norm": 0.5055392500122334, + "grad_norm": 0.07321573368492998, "learning_rate": 0.00012872609705505964, - "loss": 1.0576, + "loss": 1.0502, "step": 1515 }, { "epoch": 0.46799830658507485, - "grad_norm": 0.4980518480504977, + "grad_norm": 0.07930733821420928, "learning_rate": 0.00012821076626819196, - "loss": 1.0485, + "loss": 1.0414, "step": 1520 }, { "epoch": 0.46953777469884156, - "grad_norm": 0.4390801115545415, + "grad_norm": 0.07511260278964532, "learning_rate": 0.00012769462022689363, - "loss": 1.029, + "loss": 1.0205, "step": 1525 }, { "epoch": 0.47107724281260827, - "grad_norm": 0.48684411734981153, + "grad_norm": 0.06779778370699593, "learning_rate": 0.0001271776738471136, - "loss": 1.0353, + "loss": 1.0274, "step": 1530 }, { "epoch": 0.4726167109263749, - "grad_norm": 0.6254296920632078, + "grad_norm": 0.07717323700425802, "learning_rate": 0.00012665994206792938, - "loss": 1.0662, + "loss": 1.0589, "step": 1535 }, { "epoch": 0.4741561790401416, - "grad_norm": 0.4828153968193146, + "grad_norm": 0.06842508429114769, "learning_rate": 0.00012614143985111565, - "loss": 1.1062, + "loss": 1.0987, "step": 1540 }, { "epoch": 0.47569564715390833, - "grad_norm": 0.6105347585674029, + "grad_norm": 0.07643849238067586, "learning_rate": 0.00012562218218071164, - "loss": 1.0285, + "loss": 1.0218, "step": 1545 }, { "epoch": 0.47723511526767504, - "grad_norm": 0.4580317847852175, + "grad_norm": 0.07406016562514833, "learning_rate": 0.0001251021840625883, - "loss": 1.0257, + "loss": 1.0182, "step": 1550 }, { "epoch": 0.4787745833814417, - "grad_norm": 0.49671748534126925, + "grad_norm": 0.07954321360596633, "learning_rate": 0.00012458146052401442, - "loss": 1.0347, + "loss": 1.0283, "step": 1555 }, { "epoch": 0.4803140514952084, - "grad_norm": 0.5504358336120402, + "grad_norm": 0.07374926807557698, "learning_rate": 0.00012406002661322264, - "loss": 1.0246, + "loss": 1.0165, "step": 1560 }, { "epoch": 0.4818535196089751, - "grad_norm": 0.4998876160614867, + "grad_norm": 0.07376676091481264, "learning_rate": 0.00012353789739897437, - "loss": 1.0574, + "loss": 1.0503, "step": 1565 }, { "epoch": 0.4833929877227418, - "grad_norm": 0.5729717160065313, + "grad_norm": 0.07439474348790363, "learning_rate": 0.00012301508797012432, - "loss": 1.0365, + "loss": 1.0292, "step": 1570 }, { "epoch": 0.4849324558365085, - "grad_norm": 0.47052381273276284, + "grad_norm": 0.07661999249880341, "learning_rate": 0.00012249161343518466, - "loss": 1.0173, + "loss": 1.0111, "step": 1575 }, { "epoch": 0.48647192395027516, - "grad_norm": 0.46095882989505704, + "grad_norm": 0.07208564187421422, "learning_rate": 0.00012196748892188816, - "loss": 1.0497, + "loss": 1.0441, "step": 1580 }, { "epoch": 0.48801139206404187, - "grad_norm": 0.654430908611628, + "grad_norm": 0.07849547483606649, "learning_rate": 0.00012144272957675108, - "loss": 1.0312, + "loss": 1.0235, "step": 1585 }, { "epoch": 0.4895508601778086, - "grad_norm": 0.45228923273407706, + "grad_norm": 0.07505211623162304, "learning_rate": 0.00012091735056463562, - "loss": 1.0093, + "loss": 1.0032, "step": 1590 }, { "epoch": 0.4910903282915753, - "grad_norm": 0.6127718568028115, + "grad_norm": 0.08312558481401704, "learning_rate": 0.00012039136706831145, - "loss": 1.0671, + "loss": 1.059, "step": 1595 }, { "epoch": 0.49262979640534194, - "grad_norm": 0.6260979638098746, + "grad_norm": 0.07454335234650318, "learning_rate": 0.00011986479428801709, - "loss": 1.0436, + "loss": 1.0362, + "step": 1600 + }, + { + "epoch": 0.49262979640534194, + "eval_loss": 1.0269535779953003, + "eval_runtime": 3800.036, + "eval_samples_per_second": 6.081, + "eval_steps_per_second": 0.38, "step": 1600 }, { "epoch": 0.49416926451910864, - "grad_norm": 0.4570968434668229, + "grad_norm": 0.07456362557265384, "learning_rate": 0.00011933764744102058, - "loss": 1.0229, + "loss": 1.016, "step": 1605 }, { "epoch": 0.49570873263287535, - "grad_norm": 0.5515514688898091, + "grad_norm": 0.08008662438079932, "learning_rate": 0.00011880994176117976, - "loss": 1.0449, + "loss": 1.0392, "step": 1610 }, { "epoch": 0.49724820074664206, - "grad_norm": 0.42355848666393986, + "grad_norm": 0.07176749751196013, "learning_rate": 0.00011828169249850201, - "loss": 1.0453, + "loss": 1.0392, "step": 1615 }, { "epoch": 0.4987876688604087, - "grad_norm": 0.5463051906159438, + "grad_norm": 0.07655608798136061, "learning_rate": 0.00011775291491870351, - "loss": 1.0271, + "loss": 1.0212, "step": 1620 }, { "epoch": 0.5003271369741754, - "grad_norm": 0.4043160515062089, + "grad_norm": 0.07786227626103659, "learning_rate": 0.00011722362430276816, - "loss": 1.0377, + "loss": 1.03, "step": 1625 }, { "epoch": 0.5018666050879421, - "grad_norm": 0.8243302494042759, + "grad_norm": 0.07799113973393568, "learning_rate": 0.00011669383594650593, - "loss": 1.0663, + "loss": 1.0589, "step": 1630 }, { "epoch": 0.5034060732017088, - "grad_norm": 0.4547900229781827, + "grad_norm": 0.06547994125184468, "learning_rate": 0.00011616356516011083, - "loss": 1.0159, + "loss": 1.0084, "step": 1635 }, { "epoch": 0.5049455413154755, - "grad_norm": 0.5227895926154218, + "grad_norm": 0.07784862670275924, "learning_rate": 0.00011563282726771847, - "loss": 1.0515, + "loss": 1.0449, "step": 1640 }, { "epoch": 0.5064850094292422, - "grad_norm": 0.5635603323984416, + "grad_norm": 0.0771399540024009, "learning_rate": 0.0001151016376069632, - "loss": 1.0701, + "loss": 1.0634, "step": 1645 }, { "epoch": 0.508024477543009, - "grad_norm": 0.47093920101316833, + "grad_norm": 0.07334720494239291, "learning_rate": 0.00011457001152853493, - "loss": 1.0199, + "loss": 1.0142, "step": 1650 }, { "epoch": 0.5095639456567755, - "grad_norm": 0.4344516158624125, + "grad_norm": 0.07439128068501075, "learning_rate": 0.00011403796439573544, - "loss": 1.038, + "loss": 1.0309, "step": 1655 }, { "epoch": 0.5111034137705422, - "grad_norm": 0.49922332593006863, + "grad_norm": 0.0708288260968639, "learning_rate": 0.00011350551158403442, - "loss": 1.0593, + "loss": 1.0531, "step": 1660 }, { "epoch": 0.512642881884309, - "grad_norm": 0.42072128316053065, + "grad_norm": 0.06763171945470464, "learning_rate": 0.0001129726684806252, - "loss": 1.0147, + "loss": 1.0086, "step": 1665 }, { "epoch": 0.5141823499980757, - "grad_norm": 0.4531404971038023, + "grad_norm": 0.07768921401375369, "learning_rate": 0.00011243945048398003, - "loss": 1.0215, + "loss": 1.0148, "step": 1670 }, { "epoch": 0.5157218181118424, - "grad_norm": 0.4470522441927525, + "grad_norm": 0.06896327791840266, "learning_rate": 0.000111905873003405, - "loss": 1.0323, + "loss": 1.0261, "step": 1675 }, { "epoch": 0.5172612862256091, - "grad_norm": 0.5867123309400825, + "grad_norm": 0.07842199537412599, "learning_rate": 0.00011137195145859494, - "loss": 1.0046, + "loss": 0.999, "step": 1680 }, { "epoch": 0.5188007543393758, - "grad_norm": 0.3962300365929646, + "grad_norm": 0.06865343546929636, "learning_rate": 0.00011083770127918762, - "loss": 1.0043, + "loss": 0.9982, "step": 1685 }, { "epoch": 0.5203402224531425, - "grad_norm": 0.7925921747531262, + "grad_norm": 0.08103281697737574, "learning_rate": 0.00011030313790431788, - "loss": 1.049, + "loss": 1.042, "step": 1690 }, { "epoch": 0.5218796905669091, - "grad_norm": 0.5078998609582865, + "grad_norm": 0.07974961051333619, "learning_rate": 0.00010976827678217161, - "loss": 1.0109, + "loss": 1.0039, "step": 1695 }, { "epoch": 0.5234191586806758, - "grad_norm": 0.4109138516526712, + "grad_norm": 0.06625737159764002, "learning_rate": 0.00010923313336953913, - "loss": 1.0172, + "loss": 1.0115, "step": 1700 }, { "epoch": 0.5249586267944425, - "grad_norm": 0.5367265822698736, + "grad_norm": 0.07000163587644152, "learning_rate": 0.00010869772313136861, - "loss": 1.0285, + "loss": 1.0223, "step": 1705 }, { "epoch": 0.5264980949082092, - "grad_norm": 0.45736224715965773, + "grad_norm": 0.06839426065828255, "learning_rate": 0.00010816206154031916, - "loss": 1.0146, + "loss": 1.0088, "step": 1710 }, { "epoch": 0.5280375630219759, - "grad_norm": 0.46461493826686345, + "grad_norm": 0.07949491269796151, "learning_rate": 0.00010762616407631356, - "loss": 1.0783, + "loss": 1.071, "step": 1715 }, { "epoch": 0.5295770311357426, - "grad_norm": 0.6140868832303202, + "grad_norm": 0.07557511886462906, "learning_rate": 0.00010709004622609116, - "loss": 1.0748, + "loss": 1.0676, "step": 1720 }, { "epoch": 0.5311164992495093, - "grad_norm": 0.6800071721536182, + "grad_norm": 0.08195884945191133, "learning_rate": 0.00010655372348276006, - "loss": 1.0272, + "loss": 1.0198, "step": 1725 }, { "epoch": 0.532655967363276, - "grad_norm": 0.5202497163301416, + "grad_norm": 0.0781242359342465, "learning_rate": 0.00010601721134534959, - "loss": 1.0379, + "loss": 1.0314, "step": 1730 }, { "epoch": 0.5341954354770427, - "grad_norm": 0.4826223398674556, + "grad_norm": 0.07628144377233338, "learning_rate": 0.00010548052531836223, - "loss": 1.0357, + "loss": 1.0299, "step": 1735 }, { "epoch": 0.5357349035908093, - "grad_norm": 1.0784079166978915, + "grad_norm": 0.07983920659956817, "learning_rate": 0.00010494368091132576, - "loss": 1.0378, + "loss": 1.0317, "step": 1740 }, { "epoch": 0.537274371704576, - "grad_norm": 0.4567747436804465, + "grad_norm": 0.07418551402340584, "learning_rate": 0.00010440669363834483, - "loss": 1.0188, + "loss": 1.0129, "step": 1745 }, { "epoch": 0.5388138398183427, - "grad_norm": 0.48032495281062815, + "grad_norm": 0.07002417164492032, "learning_rate": 0.00010386957901765277, - "loss": 1.034, + "loss": 1.0278, "step": 1750 }, { "epoch": 0.5403533079321095, - "grad_norm": 0.5026624164429033, + "grad_norm": 0.0707377171946109, "learning_rate": 0.00010333235257116313, - "loss": 0.9786, + "loss": 0.9727, "step": 1755 }, { "epoch": 0.5418927760458762, - "grad_norm": 0.48003816426131585, + "grad_norm": 0.0737915692626489, "learning_rate": 0.00010279502982402103, - "loss": 1.0486, + "loss": 1.0433, "step": 1760 }, { "epoch": 0.5434322441596429, - "grad_norm": 0.5389477916705987, + "grad_norm": 0.07512990163556856, "learning_rate": 0.00010225762630415457, - "loss": 1.017, + "loss": 1.0111, "step": 1765 }, { "epoch": 0.5449717122734096, - "grad_norm": 0.537004045922141, + "grad_norm": 0.0753662165245646, "learning_rate": 0.00010172015754182607, - "loss": 1.0444, + "loss": 1.037, "step": 1770 }, { "epoch": 0.5465111803871763, - "grad_norm": 1.929543100464396, + "grad_norm": 0.1349186814580228, "learning_rate": 0.00010118263906918331, - "loss": 1.0451, + "loss": 1.0381, "step": 1775 }, { "epoch": 0.5480506485009429, - "grad_norm": 0.4940854946649758, + "grad_norm": 0.07557478098317172, "learning_rate": 0.00010064508641981054, - "loss": 1.0013, + "loss": 0.9955, "step": 1780 }, { "epoch": 0.5495901166147096, - "grad_norm": 0.4816239235355272, + "grad_norm": 0.07668998832423247, "learning_rate": 0.0001001075151282798, - "loss": 1.0568, + "loss": 1.051, "step": 1785 }, { "epoch": 0.5511295847284763, - "grad_norm": 0.4644540038274215, + "grad_norm": 0.07585620860956059, "learning_rate": 9.956994072970179e-05, - "loss": 1.0332, + "loss": 1.0272, "step": 1790 }, { "epoch": 0.552669052842243, - "grad_norm": 0.39735664952249866, + "grad_norm": 0.07008056728318604, "learning_rate": 9.903237875927698e-05, - "loss": 1.0716, + "loss": 1.0653, "step": 1795 }, { "epoch": 0.5542085209560097, - "grad_norm": 0.42204715257283004, + "grad_norm": 0.07151388140558161, "learning_rate": 9.849484475184672e-05, - "loss": 1.0204, + "loss": 1.0155, + "step": 1800 + }, + { + "epoch": 0.5542085209560097, + "eval_loss": 1.0255825519561768, + "eval_runtime": 3798.3842, + "eval_samples_per_second": 6.084, + "eval_steps_per_second": 0.38, "step": 1800 }, { "epoch": 0.5557479890697764, - "grad_norm": 0.5226725559414837, + "grad_norm": 0.08081651371496165, "learning_rate": 9.795735424144428e-05, - "loss": 1.0156, + "loss": 1.0102, "step": 1805 }, { "epoch": 0.5572874571835431, - "grad_norm": 0.5721667039150645, + "grad_norm": 0.11914760104172627, "learning_rate": 9.74199227608459e-05, - "loss": 1.0381, + "loss": 1.0316, "step": 1810 }, { "epoch": 0.5588269252973098, - "grad_norm": 0.6792294345813789, + "grad_norm": 0.07607697309485299, "learning_rate": 9.688256584112192e-05, - "loss": 1.0216, + "loss": 1.0158, "step": 1815 }, { "epoch": 0.5603663934110765, - "grad_norm": 0.5038747704164618, + "grad_norm": 0.07962971403841683, "learning_rate": 9.634529901118799e-05, - "loss": 1.0309, + "loss": 1.0243, "step": 1820 }, { "epoch": 0.5619058615248431, - "grad_norm": 1.0905768705043635, + "grad_norm": 0.0715309251551143, "learning_rate": 9.580813779735624e-05, - "loss": 1.0417, + "loss": 1.0354, "step": 1825 }, { "epoch": 0.5634453296386098, - "grad_norm": 0.7259613249848627, + "grad_norm": 0.0752110141233035, "learning_rate": 9.52710977228867e-05, - "loss": 1.0348, + "loss": 1.0291, "step": 1830 }, { "epoch": 0.5649847977523765, - "grad_norm": 0.48026375265768084, + "grad_norm": 0.07768363081585787, "learning_rate": 9.473419430753864e-05, - "loss": 0.979, + "loss": 0.9735, "step": 1835 }, { "epoch": 0.5665242658661432, - "grad_norm": 0.5445245479851468, + "grad_norm": 0.07642437297948991, "learning_rate": 9.419744306712197e-05, - "loss": 1.0099, + "loss": 1.0035, "step": 1840 }, { "epoch": 0.56806373397991, - "grad_norm": 0.4377954309966568, + "grad_norm": 0.0725535618760288, "learning_rate": 9.3660859513049e-05, - "loss": 1.0681, + "loss": 1.0624, "step": 1845 }, { "epoch": 0.5696032020936767, - "grad_norm": 0.7151357125545018, + "grad_norm": 0.075838714661654, "learning_rate": 9.312445915188609e-05, - "loss": 1.033, + "loss": 1.0273, "step": 1850 }, { "epoch": 0.5711426702074434, - "grad_norm": 0.5258015505597993, + "grad_norm": 0.07494133610203221, "learning_rate": 9.258825748490558e-05, - "loss": 1.0492, + "loss": 1.043, "step": 1855 }, { "epoch": 0.5726821383212101, - "grad_norm": 0.5535249863969953, + "grad_norm": 0.0774699889487116, "learning_rate": 9.205227000763788e-05, - "loss": 1.0444, + "loss": 1.0386, "step": 1860 }, { "epoch": 0.5742216064349767, - "grad_norm": 0.5852809671349769, + "grad_norm": 0.0767041878914581, "learning_rate": 9.151651220942349e-05, - "loss": 1.0532, + "loss": 1.0475, "step": 1865 }, { "epoch": 0.5757610745487434, - "grad_norm": 0.5927067468229194, + "grad_norm": 0.07367320031783889, "learning_rate": 9.098099957296552e-05, - "loss": 1.0419, + "loss": 1.0356, "step": 1870 }, { "epoch": 0.5773005426625101, - "grad_norm": 0.5038197549382673, + "grad_norm": 0.07237560796588688, "learning_rate": 9.044574757388224e-05, - "loss": 1.0342, + "loss": 1.0291, "step": 1875 }, { "epoch": 0.5788400107762768, - "grad_norm": 0.45976069318212914, + "grad_norm": 0.0733597412062657, "learning_rate": 8.991077168025976e-05, - "loss": 1.0346, + "loss": 1.0289, "step": 1880 }, { "epoch": 0.5803794788900435, - "grad_norm": 0.6004992124353027, + "grad_norm": 0.0781082222836356, "learning_rate": 8.937608735220527e-05, - "loss": 1.0457, + "loss": 1.0411, "step": 1885 }, { "epoch": 0.5819189470038102, - "grad_norm": 0.6603686161055189, + "grad_norm": 0.0787222190348577, "learning_rate": 8.884171004139996e-05, - "loss": 1.0233, + "loss": 1.0176, "step": 1890 }, { "epoch": 0.5834584151175769, - "grad_norm": 0.502662627180952, + "grad_norm": 0.07114468470409495, "learning_rate": 8.830765519065262e-05, - "loss": 0.9875, + "loss": 0.9838, "step": 1895 }, { "epoch": 0.5849978832313436, - "grad_norm": 0.7865449254332316, + "grad_norm": 0.08086984212119945, "learning_rate": 8.777393823345343e-05, - "loss": 1.0493, + "loss": 1.0438, "step": 1900 }, { "epoch": 0.5865373513451103, - "grad_norm": 0.4226972216471069, + "grad_norm": 0.0703832911854875, "learning_rate": 8.724057459352784e-05, - "loss": 0.9935, + "loss": 0.9889, "step": 1905 }, { "epoch": 0.5880768194588769, - "grad_norm": 0.5958711278623102, + "grad_norm": 0.06956832590218136, "learning_rate": 8.670757968439086e-05, - "loss": 1.0631, + "loss": 1.0573, "step": 1910 }, { "epoch": 0.5896162875726436, - "grad_norm": 0.5199697414979809, + "grad_norm": 0.07709995404199901, "learning_rate": 8.617496890890179e-05, - "loss": 1.0343, + "loss": 1.0277, "step": 1915 }, { "epoch": 0.5911557556864103, - "grad_norm": 0.6008327984030076, + "grad_norm": 0.07724046895867277, "learning_rate": 8.564275765881887e-05, - "loss": 1.0423, + "loss": 1.0349, "step": 1920 }, { "epoch": 0.592695223800177, - "grad_norm": 0.44604940325807296, + "grad_norm": 0.0743764979962109, "learning_rate": 8.511096131435454e-05, - "loss": 1.0175, + "loss": 1.0117, "step": 1925 }, { "epoch": 0.5942346919139437, - "grad_norm": 0.42675758280608317, + "grad_norm": 0.07389820884013973, "learning_rate": 8.457959524373109e-05, - "loss": 1.0307, + "loss": 1.025, "step": 1930 }, { "epoch": 0.5957741600277104, - "grad_norm": 0.5597345800622264, + "grad_norm": 0.07448602446253362, "learning_rate": 8.404867480273636e-05, - "loss": 1.058, + "loss": 1.0524, "step": 1935 }, { "epoch": 0.5973136281414771, - "grad_norm": 0.4451211247529823, + "grad_norm": 0.07016834564183058, "learning_rate": 8.351821533428023e-05, - "loss": 1.0313, + "loss": 1.0253, "step": 1940 }, { "epoch": 0.5988530962552439, - "grad_norm": 0.407092060921353, + "grad_norm": 0.07324470240300807, "learning_rate": 8.298823216795093e-05, - "loss": 1.0506, + "loss": 1.0454, "step": 1945 }, { "epoch": 0.6003925643690106, - "grad_norm": 0.43348547367002177, + "grad_norm": 0.07179143590884257, "learning_rate": 8.245874061957224e-05, - "loss": 1.0402, + "loss": 1.0349, "step": 1950 }, { "epoch": 0.6019320324827772, - "grad_norm": 0.4472151967667229, + "grad_norm": 0.07332238543813964, "learning_rate": 8.192975599076078e-05, - "loss": 1.0168, + "loss": 1.0112, "step": 1955 }, { "epoch": 0.6034715005965439, - "grad_norm": 0.3853019563837129, + "grad_norm": 0.06832929405392676, "learning_rate": 8.140129356848387e-05, - "loss": 1.0206, + "loss": 1.0159, "step": 1960 }, { "epoch": 0.6050109687103106, - "grad_norm": 0.4721720358418525, + "grad_norm": 0.07163605823180465, "learning_rate": 8.087336862461783e-05, - "loss": 1.0112, + "loss": 1.0064, "step": 1965 }, { "epoch": 0.6065504368240773, - "grad_norm": 0.46617796491326624, + "grad_norm": 0.07827638868021947, "learning_rate": 8.034599641550642e-05, - "loss": 1.0484, + "loss": 1.0431, "step": 1970 }, { "epoch": 0.608089904937844, - "grad_norm": 0.5930150370566495, + "grad_norm": 0.07793183715870357, "learning_rate": 7.981919218152016e-05, - "loss": 1.0019, + "loss": 0.9968, "step": 1975 }, { "epoch": 0.6096293730516107, - "grad_norm": 0.515110909779576, + "grad_norm": 0.07306876363104758, "learning_rate": 7.929297114661581e-05, - "loss": 1.0182, + "loss": 1.0114, "step": 1980 }, { "epoch": 0.6111688411653774, - "grad_norm": 0.45067927700399973, + "grad_norm": 0.07438965485093788, "learning_rate": 7.876734851789643e-05, - "loss": 1.048, + "loss": 1.042, "step": 1985 }, { "epoch": 0.6127083092791441, - "grad_norm": 0.48025135310003386, + "grad_norm": 0.07813162760393345, "learning_rate": 7.824233948517185e-05, - "loss": 1.0499, + "loss": 1.0437, "step": 1990 }, { "epoch": 0.6142477773929107, - "grad_norm": 0.3688608602701041, + "grad_norm": 0.07126718370690863, "learning_rate": 7.771795922051999e-05, - "loss": 1.0493, + "loss": 1.0444, "step": 1995 }, { "epoch": 0.6157872455066774, - "grad_norm": 0.47692666795291627, + "grad_norm": 0.0756263745043903, "learning_rate": 7.719422287784798e-05, - "loss": 1.018, + "loss": 1.0138, + "step": 2000 + }, + { + "epoch": 0.6157872455066774, + "eval_loss": 1.02396821975708, + "eval_runtime": 3800.6654, + "eval_samples_per_second": 6.08, + "eval_steps_per_second": 0.38, "step": 2000 }, { "epoch": 0.6173267136204441, - "grad_norm": 0.43950677355297135, + "grad_norm": 0.07660137188176062, "learning_rate": 7.667114559245451e-05, - "loss": 1.0086, + "loss": 1.0043, "step": 2005 }, { "epoch": 0.6188661817342108, - "grad_norm": 0.5992589739511112, + "grad_norm": 0.07670943872660792, "learning_rate": 7.614874248059238e-05, - "loss": 1.0286, + "loss": 1.0215, "step": 2010 }, { "epoch": 0.6204056498479775, - "grad_norm": 0.5113558280370261, + "grad_norm": 0.0716429595375755, "learning_rate": 7.56270286390316e-05, - "loss": 1.0196, + "loss": 1.015, "step": 2015 }, { "epoch": 0.6219451179617442, - "grad_norm": 0.48971205943506146, + "grad_norm": 0.0734311834910782, "learning_rate": 7.510601914462331e-05, - "loss": 1.0021, + "loss": 0.9974, "step": 2020 }, { "epoch": 0.6234845860755109, - "grad_norm": 2.031201004984545, + "grad_norm": 0.07591975296726955, "learning_rate": 7.458572905386381e-05, - "loss": 1.0068, + "loss": 1.002, "step": 2025 }, { "epoch": 0.6250240541892776, - "grad_norm": 0.43332008516546355, + "grad_norm": 0.06924982996168204, "learning_rate": 7.406617340245957e-05, - "loss": 1.0573, + "loss": 1.0523, "step": 2030 }, { "epoch": 0.6265635223030444, - "grad_norm": 0.40359150277813904, + "grad_norm": 0.06968894725304753, "learning_rate": 7.354736720489273e-05, - "loss": 1.0114, + "loss": 1.0068, "step": 2035 }, { "epoch": 0.628102990416811, - "grad_norm": 0.5109830787118577, + "grad_norm": 0.07479745851772386, "learning_rate": 7.302932545398721e-05, - "loss": 1.0325, + "loss": 1.0272, "step": 2040 }, { "epoch": 0.6296424585305777, - "grad_norm": 0.33695770975399714, + "grad_norm": 0.06904164328810472, "learning_rate": 7.251206312047547e-05, - "loss": 0.9971, + "loss": 0.9913, "step": 2045 }, { "epoch": 0.6311819266443444, - "grad_norm": 0.4478834681612594, + "grad_norm": 0.07571635716063355, "learning_rate": 7.199559515256573e-05, - "loss": 1.0551, + "loss": 1.0488, "step": 2050 }, { "epoch": 0.6327213947581111, - "grad_norm": 0.49179933979909946, + "grad_norm": 0.09238980588900121, "learning_rate": 7.14799364755101e-05, - "loss": 1.0181, + "loss": 1.0102, "step": 2055 }, { "epoch": 0.6342608628718778, - "grad_norm": 0.476264555556864, + "grad_norm": 0.07788054862847527, "learning_rate": 7.096510199117327e-05, - "loss": 1.0346, + "loss": 1.0285, "step": 2060 }, { "epoch": 0.6358003309856445, - "grad_norm": 0.45265771641362673, + "grad_norm": 0.07820963661100531, "learning_rate": 7.045110657760179e-05, - "loss": 1.0442, + "loss": 1.0396, "step": 2065 }, { "epoch": 0.6373397990994112, - "grad_norm": 0.4988192869036288, + "grad_norm": 0.07855120885780965, "learning_rate": 6.993796508859418e-05, - "loss": 1.029, + "loss": 1.0241, "step": 2070 }, { "epoch": 0.6388792672131779, - "grad_norm": 0.4886571640531873, + "grad_norm": 0.08092337017171718, "learning_rate": 6.942569235327167e-05, - "loss": 1.0275, + "loss": 1.0218, "step": 2075 }, { "epoch": 0.6404187353269445, - "grad_norm": 0.43831855733672936, + "grad_norm": 0.07558113185363395, "learning_rate": 6.891430317564964e-05, - "loss": 1.0307, + "loss": 1.0259, "step": 2080 }, { "epoch": 0.6419582034407112, - "grad_norm": 0.5860619060156503, + "grad_norm": 0.07968672599717305, "learning_rate": 6.840381233420973e-05, - "loss": 1.0265, + "loss": 1.0215, "step": 2085 }, { "epoch": 0.6434976715544779, - "grad_norm": 0.44236844600518777, + "grad_norm": 0.06842358559775973, "learning_rate": 6.789423458147292e-05, - "loss": 1.0283, + "loss": 1.0234, "step": 2090 }, { "epoch": 0.6450371396682446, - "grad_norm": 0.4876716811387609, + "grad_norm": 0.08112775510978207, "learning_rate": 6.738558464357305e-05, - "loss": 1.0504, + "loss": 1.0455, "step": 2095 }, { "epoch": 0.6465766077820113, - "grad_norm": 0.41217817376284266, + "grad_norm": 0.07612512678774283, "learning_rate": 6.687787721983136e-05, - "loss": 1.0725, + "loss": 1.0675, "step": 2100 }, { "epoch": 0.648116075895778, - "grad_norm": 0.5066902107865301, + "grad_norm": 0.07415116511450154, "learning_rate": 6.63711269823317e-05, - "loss": 1.0168, + "loss": 1.0117, "step": 2105 }, { "epoch": 0.6496555440095447, - "grad_norm": 0.8959396144411056, + "grad_norm": 0.08613279872143775, "learning_rate": 6.586534857549638e-05, - "loss": 1.0367, + "loss": 1.031, "step": 2110 }, { "epoch": 0.6511950121233114, - "grad_norm": 0.41740196707008226, + "grad_norm": 0.07567544168434838, "learning_rate": 6.536055661566312e-05, - "loss": 1.0603, + "loss": 1.0551, "step": 2115 }, { "epoch": 0.6527344802370781, - "grad_norm": 0.4285108933692639, + "grad_norm": 0.07021325816473385, "learning_rate": 6.485676569066258e-05, - "loss": 1.0701, + "loss": 1.0636, "step": 2120 }, { "epoch": 0.6542739483508447, - "grad_norm": 0.490878565864062, + "grad_norm": 0.08020286657409759, "learning_rate": 6.43539903593969e-05, - "loss": 1.0274, + "loss": 1.0225, "step": 2125 }, { "epoch": 0.6558134164646114, - "grad_norm": 0.4955255588450408, + "grad_norm": 0.07583472247487967, "learning_rate": 6.385224515141879e-05, - "loss": 1.0213, + "loss": 1.0171, "step": 2130 }, { "epoch": 0.6573528845783781, - "grad_norm": 0.4943547579484523, + "grad_norm": 0.07970278172901481, "learning_rate": 6.335154456651178e-05, - "loss": 1.0424, + "loss": 1.0372, "step": 2135 }, { "epoch": 0.6588923526921449, - "grad_norm": 0.7266332089943804, + "grad_norm": 0.0835781643104659, "learning_rate": 6.285190307427114e-05, - "loss": 1.0646, + "loss": 1.0593, "step": 2140 }, { "epoch": 0.6604318208059116, - "grad_norm": 0.4121244632336502, + "grad_norm": 0.07413585777508627, "learning_rate": 6.235333511368573e-05, - "loss": 1.0555, + "loss": 1.0506, "step": 2145 }, { "epoch": 0.6619712889196783, - "grad_norm": 0.5185060524362, + "grad_norm": 0.07626172947059409, "learning_rate": 6.185585509272078e-05, - "loss": 1.0469, + "loss": 1.0421, "step": 2150 }, { "epoch": 0.663510757033445, - "grad_norm": 0.4631669349204426, + "grad_norm": 0.07941112609232703, "learning_rate": 6.135947738790145e-05, - "loss": 1.0567, + "loss": 1.0522, "step": 2155 }, { "epoch": 0.6650502251472117, - "grad_norm": 0.5057075839366066, + "grad_norm": 0.07504983917584221, "learning_rate": 6.0864216343897365e-05, - "loss": 1.0298, + "loss": 1.0246, "step": 2160 }, { "epoch": 0.6665896932609783, - "grad_norm": 0.5634732740375421, + "grad_norm": 0.0772279082252725, "learning_rate": 6.0370086273108205e-05, - "loss": 1.0629, + "loss": 1.0586, "step": 2165 }, { "epoch": 0.668129161374745, - "grad_norm": 0.5200825511831939, + "grad_norm": 0.07292874110921534, "learning_rate": 5.987710145524992e-05, - "loss": 1.0519, + "loss": 1.0468, "step": 2170 }, { "epoch": 0.6696686294885117, - "grad_norm": 0.4345129610373104, + "grad_norm": 0.07582055371910455, "learning_rate": 5.938527613694214e-05, - "loss": 1.0321, + "loss": 1.0272, "step": 2175 }, { "epoch": 0.6712080976022784, - "grad_norm": 0.7454797877228255, + "grad_norm": 0.08105247790122148, "learning_rate": 5.8894624531296486e-05, - "loss": 0.9871, + "loss": 0.9815, "step": 2180 }, { "epoch": 0.6727475657160451, - "grad_norm": 0.38956096747028734, + "grad_norm": 0.07733491076395216, "learning_rate": 5.840516081750583e-05, - "loss": 1.0224, + "loss": 1.018, "step": 2185 }, { "epoch": 0.6742870338298118, - "grad_norm": 0.4637705509612328, + "grad_norm": 0.07312779555518635, "learning_rate": 5.791689914043447e-05, - "loss": 1.066, + "loss": 1.061, "step": 2190 }, { "epoch": 0.6758265019435785, - "grad_norm": 0.4296286070627721, + "grad_norm": 0.07156793336536187, "learning_rate": 5.742985361020945e-05, - "loss": 1.0286, + "loss": 1.0235, "step": 2195 }, { "epoch": 0.6773659700573452, - "grad_norm": 0.4514049815518666, + "grad_norm": 0.07041645870274761, "learning_rate": 5.69440383018127e-05, - "loss": 1.0443, + "loss": 1.0392, + "step": 2200 + }, + { + "epoch": 0.6773659700573452, + "eval_loss": 1.0226157903671265, + "eval_runtime": 3799.6821, + "eval_samples_per_second": 6.082, + "eval_steps_per_second": 0.38, "step": 2200 }, { "epoch": 0.6789054381711119, - "grad_norm": 0.4614392154051119, + "grad_norm": 0.07858439926874054, "learning_rate": 5.6459467254674435e-05, - "loss": 0.9877, + "loss": 0.9838, "step": 2205 }, { "epoch": 0.6804449062848785, - "grad_norm": 0.5704518491798019, + "grad_norm": 0.08205079170307557, "learning_rate": 5.597615447226724e-05, - "loss": 1.0337, + "loss": 1.0291, "step": 2210 }, { "epoch": 0.6819843743986452, - "grad_norm": 0.39999150350123325, + "grad_norm": 0.07055010438142527, "learning_rate": 5.549411392170154e-05, - "loss": 1.0088, + "loss": 1.0045, "step": 2215 }, { "epoch": 0.6835238425124119, - "grad_norm": 0.42287100632286095, + "grad_norm": 0.07374254669107684, "learning_rate": 5.501335953332187e-05, - "loss": 1.0358, + "loss": 1.031, "step": 2220 }, { "epoch": 0.6850633106261786, - "grad_norm": 0.4444385205318582, + "grad_norm": 0.07512900746480158, "learning_rate": 5.453390520030439e-05, - "loss": 1.0274, + "loss": 1.0227, "step": 2225 }, { "epoch": 0.6866027787399454, - "grad_norm": 0.4681868371661994, + "grad_norm": 0.08032690435143434, "learning_rate": 5.405576477825538e-05, - "loss": 1.0135, + "loss": 1.0088, "step": 2230 }, { "epoch": 0.6881422468537121, - "grad_norm": 0.3647614554162627, + "grad_norm": 0.06998260694935131, "learning_rate": 5.3578952084810765e-05, - "loss": 1.0201, + "loss": 1.0154, "step": 2235 }, { "epoch": 0.6896817149674788, - "grad_norm": 0.4675279093656959, + "grad_norm": 0.07320831784074706, "learning_rate": 5.310348089923681e-05, - "loss": 1.0641, + "loss": 1.0598, "step": 2240 }, { "epoch": 0.6912211830812455, - "grad_norm": 0.5708390560359091, + "grad_norm": 0.07622548619223765, "learning_rate": 5.2629364962032004e-05, - "loss": 1.0365, + "loss": 1.0326, "step": 2245 }, { "epoch": 0.6927606511950122, - "grad_norm": 5.18192671698193, + "grad_norm": 0.08145404880583967, "learning_rate": 5.2156617974529886e-05, - "loss": 1.0407, + "loss": 1.0355, "step": 2250 }, { "epoch": 0.6943001193087788, - "grad_norm": 0.4187190501217835, + "grad_norm": 0.07606592124852478, "learning_rate": 5.1685253598503116e-05, - "loss": 1.0471, + "loss": 1.0423, "step": 2255 }, { "epoch": 0.6958395874225455, - "grad_norm": 0.6110758500884396, + "grad_norm": 0.07644130398674438, "learning_rate": 5.1215285455768794e-05, - "loss": 1.0067, + "loss": 1.0018, "step": 2260 }, { "epoch": 0.6973790555363122, - "grad_norm": 1.2232875510104708, + "grad_norm": 0.07454280179132973, "learning_rate": 5.074672712779456e-05, - "loss": 1.0186, + "loss": 1.0132, "step": 2265 }, { "epoch": 0.6989185236500789, - "grad_norm": 0.4924706604640269, + "grad_norm": 0.07254758102553986, "learning_rate": 5.0279592155306286e-05, - "loss": 1.013, + "loss": 1.0092, "step": 2270 }, { "epoch": 0.7004579917638456, - "grad_norm": 0.5351327628863634, + "grad_norm": 0.08518273395270524, "learning_rate": 4.9813894037896747e-05, - "loss": 1.0147, + "loss": 1.0097, "step": 2275 }, { "epoch": 0.7019974598776123, - "grad_norm": 0.5691385515488996, + "grad_norm": 0.08196510903602944, "learning_rate": 4.93496462336354e-05, - "loss": 1.0543, + "loss": 1.05, "step": 2280 }, { "epoch": 0.703536927991379, - "grad_norm": 0.45370659861849166, + "grad_norm": 0.07420511758023607, "learning_rate": 4.8886862158679714e-05, - "loss": 1.0502, + "loss": 1.0461, "step": 2285 }, { "epoch": 0.7050763961051457, - "grad_norm": 0.39906039100573537, + "grad_norm": 0.0732238702046481, "learning_rate": 4.8425555186887096e-05, - "loss": 1.027, + "loss": 1.0239, "step": 2290 }, { "epoch": 0.7066158642189123, - "grad_norm": 0.48115067130308187, + "grad_norm": 0.0764731943928115, "learning_rate": 4.796573864942868e-05, - "loss": 1.0515, + "loss": 1.0464, "step": 2295 }, { "epoch": 0.708155332332679, - "grad_norm": 0.43162848352113553, + "grad_norm": 0.07868943327841957, "learning_rate": 4.750742583440397e-05, - "loss": 1.0054, + "loss": 1.0001, "step": 2300 }, { "epoch": 0.7096948004464457, - "grad_norm": 0.40152697201685444, + "grad_norm": 0.07416370169638956, "learning_rate": 4.7050629986456873e-05, - "loss": 1.0279, + "loss": 1.0238, "step": 2305 }, { "epoch": 0.7112342685602124, - "grad_norm": 0.43021780150976113, + "grad_norm": 0.07521401966958795, "learning_rate": 4.65953643063929e-05, - "loss": 1.0391, + "loss": 1.0345, "step": 2310 }, { "epoch": 0.7127737366739791, - "grad_norm": 0.42088619664607274, + "grad_norm": 0.07871682861041675, "learning_rate": 4.6141641950797645e-05, - "loss": 0.988, + "loss": 0.9837, "step": 2315 }, { "epoch": 0.7143132047877458, - "grad_norm": 0.4851450074591337, + "grad_norm": 0.07766370061500591, "learning_rate": 4.5689476031656784e-05, - "loss": 1.0581, + "loss": 1.0545, "step": 2320 }, { "epoch": 0.7158526729015126, - "grad_norm": 0.5755620737371696, + "grad_norm": 0.06977159099411415, "learning_rate": 4.523887961597688e-05, - "loss": 1.0462, + "loss": 1.0402, "step": 2325 }, { "epoch": 0.7173921410152793, - "grad_norm": 0.41663849058693325, + "grad_norm": 0.07137357806433947, "learning_rate": 4.4789865725407934e-05, - "loss": 1.0329, + "loss": 1.0283, "step": 2330 }, { "epoch": 0.718931609129046, - "grad_norm": 0.47615081033464657, + "grad_norm": 0.07780529408405196, "learning_rate": 4.434244733586699e-05, - "loss": 1.0076, + "loss": 1.0037, "step": 2335 }, { "epoch": 0.7204710772428126, - "grad_norm": 0.534330404158238, + "grad_norm": 0.07460318437672668, "learning_rate": 4.389663737716324e-05, - "loss": 1.0224, + "loss": 1.0178, "step": 2340 }, { "epoch": 0.7220105453565793, - "grad_norm": 0.439745239024214, + "grad_norm": 0.07353332278008176, "learning_rate": 4.3452448732624264e-05, - "loss": 1.0034, + "loss": 0.9984, "step": 2345 }, { "epoch": 0.723550013470346, - "grad_norm": 0.46055944759568124, + "grad_norm": 0.0781522773492217, "learning_rate": 4.3009894238723856e-05, - "loss": 1.0089, + "loss": 1.0049, "step": 2350 }, { "epoch": 0.7250894815841127, - "grad_norm": 0.5413958198748791, + "grad_norm": 0.0778432230166929, "learning_rate": 4.256898668471092e-05, - "loss": 1.0401, + "loss": 1.0362, "step": 2355 }, { "epoch": 0.7266289496978794, - "grad_norm": 0.460713247825251, + "grad_norm": 0.0757029636089452, "learning_rate": 4.212973881223994e-05, - "loss": 1.0205, + "loss": 1.0166, "step": 2360 }, { "epoch": 0.7281684178116461, - "grad_norm": 0.4721070044498834, + "grad_norm": 0.07814503421219454, "learning_rate": 4.1692163315002784e-05, - "loss": 1.0292, + "loss": 1.0249, "step": 2365 }, { "epoch": 0.7297078859254128, - "grad_norm": 0.448843218109221, + "grad_norm": 0.07346479757922982, "learning_rate": 4.125627283836184e-05, - "loss": 1.0274, + "loss": 1.0234, "step": 2370 }, { "epoch": 0.7312473540391795, - "grad_norm": 0.4152647845453433, + "grad_norm": 0.07382770315764864, "learning_rate": 4.082207997898457e-05, - "loss": 0.9903, + "loss": 0.9863, "step": 2375 }, { "epoch": 0.7327868221529461, - "grad_norm": 0.386277666412621, + "grad_norm": 0.07371050763814493, "learning_rate": 4.0389597284479595e-05, - "loss": 1.0103, + "loss": 1.0056, "step": 2380 }, { "epoch": 0.7343262902667128, - "grad_norm": 0.4141845583933384, + "grad_norm": 0.07557969851025513, "learning_rate": 3.995883725303392e-05, - "loss": 1.0356, + "loss": 1.0312, "step": 2385 }, { "epoch": 0.7358657583804795, - "grad_norm": 0.4038330356544352, + "grad_norm": 0.07397761113533481, "learning_rate": 3.952981233305183e-05, - "loss": 1.0368, + "loss": 1.032, "step": 2390 }, { "epoch": 0.7374052264942462, - "grad_norm": 0.4159685288158201, + "grad_norm": 0.07309663584443002, "learning_rate": 3.9102534922795166e-05, - "loss": 1.0223, + "loss": 1.019, "step": 2395 }, { "epoch": 0.7389446946080129, - "grad_norm": 0.44365052943948013, + "grad_norm": 0.07619095281128642, "learning_rate": 3.867701737002502e-05, - "loss": 1.0125, + "loss": 1.0079, + "step": 2400 + }, + { + "epoch": 0.7389446946080129, + "eval_loss": 1.0216362476348877, + "eval_runtime": 3799.3642, + "eval_samples_per_second": 6.082, + "eval_steps_per_second": 0.38, "step": 2400 }, { "epoch": 0.7404841627217796, - "grad_norm": 0.40830415825064736, + "grad_norm": 0.0741636747479547, "learning_rate": 3.825327197164483e-05, - "loss": 1.0027, + "loss": 0.9983, "step": 2405 }, { "epoch": 0.7420236308355463, - "grad_norm": 0.6118151945433313, + "grad_norm": 0.07461006764690276, "learning_rate": 3.7831310973345216e-05, - "loss": 1.014, + "loss": 1.0103, "step": 2410 }, { "epoch": 0.743563098949313, - "grad_norm": 0.43923835412327034, + "grad_norm": 0.07475775794096068, "learning_rate": 3.741114656924983e-05, - "loss": 1.0389, + "loss": 1.0344, "step": 2415 }, { "epoch": 0.7451025670630798, - "grad_norm": 0.4961592786809209, + "grad_norm": 0.08156466585079548, "learning_rate": 3.699279090156315e-05, - "loss": 1.0488, + "loss": 1.0445, "step": 2420 }, { "epoch": 0.7466420351768464, - "grad_norm": 0.5382708745735688, + "grad_norm": 0.07150143529792177, "learning_rate": 3.6576256060219486e-05, - "loss": 1.0083, + "loss": 1.0055, "step": 2425 }, { "epoch": 0.7481815032906131, - "grad_norm": 0.44692774623072656, + "grad_norm": 0.07054926707082107, "learning_rate": 3.616155408253367e-05, - "loss": 1.0106, + "loss": 1.0048, "step": 2430 }, { "epoch": 0.7497209714043798, - "grad_norm": 0.42729189430033954, + "grad_norm": 0.07098240851417871, "learning_rate": 3.574869695285315e-05, - "loss": 1.0088, + "loss": 1.0051, "step": 2435 }, { "epoch": 0.7512604395181465, - "grad_norm": 0.4272983775590473, + "grad_norm": 0.07263690062984071, "learning_rate": 3.5337696602211614e-05, - "loss": 0.9977, + "loss": 0.994, "step": 2440 }, { "epoch": 0.7527999076319132, - "grad_norm": 0.41113864383958393, + "grad_norm": 0.0772048194935717, "learning_rate": 3.492856490798439e-05, - "loss": 1.0322, + "loss": 1.0272, "step": 2445 }, { "epoch": 0.7543393757456799, - "grad_norm": 0.44309109594960205, + "grad_norm": 0.07563646377875859, "learning_rate": 3.4521313693544966e-05, - "loss": 1.0556, + "loss": 1.0513, "step": 2450 }, { "epoch": 0.7558788438594466, - "grad_norm": 0.4196493142920476, + "grad_norm": 0.07280524199918206, "learning_rate": 3.4115954727923395e-05, - "loss": 1.0295, + "loss": 1.0253, "step": 2455 }, { "epoch": 0.7574183119732133, - "grad_norm": 0.512441618937211, + "grad_norm": 0.07220374505442433, "learning_rate": 3.371249972546624e-05, - "loss": 1.0165, + "loss": 1.0133, "step": 2460 }, { "epoch": 0.7589577800869799, - "grad_norm": 1.9609700442905864, + "grad_norm": 0.07634280064446995, "learning_rate": 3.3310960345497974e-05, - "loss": 1.0584, + "loss": 1.0543, "step": 2465 }, { "epoch": 0.7604972482007466, - "grad_norm": 0.4558093754389953, + "grad_norm": 0.073850612505331, "learning_rate": 3.291134819198417e-05, - "loss": 1.0328, + "loss": 1.029, "step": 2470 }, { "epoch": 0.7620367163145133, - "grad_norm": 0.39208217735355977, + "grad_norm": 0.07458494515935035, "learning_rate": 3.251367481319596e-05, - "loss": 1.039, + "loss": 1.0351, "step": 2475 }, { "epoch": 0.76357618442828, - "grad_norm": 0.4433705457387255, + "grad_norm": 0.07280108080142625, "learning_rate": 3.2117951701376436e-05, - "loss": 1.0065, + "loss": 1.0012, "step": 2480 }, { "epoch": 0.7651156525420467, - "grad_norm": 0.3825862067647675, + "grad_norm": 0.07354708568019297, "learning_rate": 3.172419029240853e-05, - "loss": 1.0403, + "loss": 1.0368, "step": 2485 }, { "epoch": 0.7666551206558134, - "grad_norm": 0.4572258364515658, + "grad_norm": 0.07746592093895813, "learning_rate": 3.133240196548447e-05, - "loss": 1.0303, + "loss": 1.0261, "step": 2490 }, { "epoch": 0.7681945887695801, - "grad_norm": 0.84283447972151, + "grad_norm": 0.08343235923851117, "learning_rate": 3.0942598042777073e-05, - "loss": 1.0669, + "loss": 1.0629, "step": 2495 }, { "epoch": 0.7697340568833468, - "grad_norm": 0.4610127698483277, + "grad_norm": 0.07537289148935783, "learning_rate": 3.0554789789112385e-05, - "loss": 0.9836, + "loss": 0.9802, "step": 2500 }, { "epoch": 0.7712735249971135, - "grad_norm": 0.4325409044446896, + "grad_norm": 0.07748639562105411, "learning_rate": 3.0168988411644205e-05, - "loss": 1.0065, + "loss": 1.0031, "step": 2505 }, { "epoch": 0.7728129931108801, - "grad_norm": 0.45733195048796693, + "grad_norm": 0.07297876226978224, "learning_rate": 2.9785205059530263e-05, - "loss": 1.0286, + "loss": 1.0248, "step": 2510 }, { "epoch": 0.7743524612246468, - "grad_norm": 0.4692226922974537, + "grad_norm": 0.07346198471298253, "learning_rate": 2.940345082360997e-05, - "loss": 1.0233, + "loss": 1.0188, "step": 2515 }, { "epoch": 0.7758919293384136, - "grad_norm": 0.40703032879120454, + "grad_norm": 0.07159376899978091, "learning_rate": 2.9023736736083872e-05, - "loss": 1.0503, + "loss": 1.0458, "step": 2520 }, { "epoch": 0.7774313974521803, - "grad_norm": 0.603733310574897, + "grad_norm": 0.08165639916608944, "learning_rate": 2.864607377019498e-05, - "loss": 1.0381, + "loss": 1.0323, "step": 2525 }, { "epoch": 0.778970865565947, - "grad_norm": 0.44557037962518165, + "grad_norm": 0.08094527244169142, "learning_rate": 2.82704728399115e-05, - "loss": 1.0297, + "loss": 1.0256, "step": 2530 }, { "epoch": 0.7805103336797137, - "grad_norm": 0.4138521931758361, + "grad_norm": 0.07406724108857674, "learning_rate": 2.789694479961147e-05, - "loss": 1.0195, + "loss": 1.0153, "step": 2535 }, { "epoch": 0.7820498017934804, - "grad_norm": 0.494990799259698, + "grad_norm": 0.078249669850484, "learning_rate": 2.7525500443769136e-05, - "loss": 1.0257, + "loss": 1.0218, "step": 2540 }, { "epoch": 0.7835892699072471, - "grad_norm": 0.4425172941594563, + "grad_norm": 0.07536635431294623, "learning_rate": 2.715615050664294e-05, - "loss": 1.0242, + "loss": 1.0211, "step": 2545 }, { "epoch": 0.7851287380210138, - "grad_norm": 0.3752870060929472, + "grad_norm": 0.07112966871791493, "learning_rate": 2.6788905661965458e-05, - "loss": 1.0265, + "loss": 1.0219, "step": 2550 }, { "epoch": 0.7866682061347804, - "grad_norm": 0.4116609510103765, + "grad_norm": 0.07460080764451535, "learning_rate": 2.64237765226347e-05, - "loss": 1.0158, + "loss": 1.0111, "step": 2555 }, { "epoch": 0.7882076742485471, - "grad_norm": 0.46202655193696285, + "grad_norm": 0.07534919444514368, "learning_rate": 2.606077364040762e-05, - "loss": 1.0167, + "loss": 1.0123, "step": 2560 }, { "epoch": 0.7897471423623138, - "grad_norm": 0.6202191711943372, + "grad_norm": 0.08153585976314279, "learning_rate": 2.5699907505595068e-05, - "loss": 0.9672, + "loss": 0.9628, "step": 2565 }, { "epoch": 0.7912866104760805, - "grad_norm": 0.480489604669568, + "grad_norm": 0.07963403500854437, "learning_rate": 2.5341188546758688e-05, - "loss": 0.9945, + "loss": 0.9901, "step": 2570 }, { "epoch": 0.7928260785898472, - "grad_norm": 0.4305543882648283, + "grad_norm": 0.07655689565067497, "learning_rate": 2.4984627130409577e-05, - "loss": 1.0108, + "loss": 1.0067, "step": 2575 }, { "epoch": 0.7943655467036139, - "grad_norm": 0.37712645766524694, + "grad_norm": 0.07644623297035556, "learning_rate": 2.4630233560708615e-05, - "loss": 0.9991, + "loss": 0.9944, "step": 2580 }, { "epoch": 0.7959050148173806, - "grad_norm": 0.4457882679562566, + "grad_norm": 0.0725658916090075, "learning_rate": 2.427801807916874e-05, - "loss": 1.0353, + "loss": 1.03, "step": 2585 }, { "epoch": 0.7974444829311473, - "grad_norm": 0.43372798791794065, + "grad_norm": 0.07721243400666894, "learning_rate": 2.3927990864358984e-05, - "loss": 1.016, + "loss": 1.0131, "step": 2590 }, { "epoch": 0.7989839510449139, - "grad_norm": 0.5674499912471999, + "grad_norm": 0.08559886442077007, "learning_rate": 2.358016203161031e-05, - "loss": 1.035, + "loss": 1.0316, "step": 2595 }, { "epoch": 0.8005234191586806, - "grad_norm": 0.5654406491640428, + "grad_norm": 0.07791540542431132, "learning_rate": 2.3234541632723272e-05, - "loss": 1.018, + "loss": 1.0139, + "step": 2600 + }, + { + "epoch": 0.8005234191586806, + "eval_loss": 1.0208169221878052, + "eval_runtime": 3801.5676, + "eval_samples_per_second": 6.079, + "eval_steps_per_second": 0.38, "step": 2600 }, { "epoch": 0.8020628872724473, - "grad_norm": 0.3806698162491372, + "grad_norm": 0.07193242435554525, "learning_rate": 2.2891139655677673e-05, - "loss": 1.0353, + "loss": 1.0317, "step": 2605 }, { "epoch": 0.803602355386214, - "grad_norm": 0.41395000872395227, + "grad_norm": 0.07441923180311688, "learning_rate": 2.2549966024343682e-05, - "loss": 1.0198, + "loss": 1.016, "step": 2610 }, { "epoch": 0.8051418234999808, - "grad_norm": 1.0022245903452656, + "grad_norm": 0.09964577756183264, "learning_rate": 2.2211030598195247e-05, - "loss": 1.0366, + "loss": 1.0332, "step": 2615 }, { "epoch": 0.8066812916137475, - "grad_norm": 0.37470132686684005, + "grad_norm": 0.0707934024787889, "learning_rate": 2.187434317202508e-05, - "loss": 1.0154, + "loss": 1.0119, "step": 2620 }, { "epoch": 0.8082207597275142, - "grad_norm": 0.5250486396448188, + "grad_norm": 0.07855203903741052, "learning_rate": 2.1539913475661576e-05, - "loss": 1.0053, + "loss": 1.0016, "step": 2625 }, { "epoch": 0.8097602278412809, - "grad_norm": 0.4181640130676048, + "grad_norm": 0.07398198874818958, "learning_rate": 2.1207751173687785e-05, - "loss": 1.0723, + "loss": 1.0691, "step": 2630 }, { "epoch": 0.8112996959550476, - "grad_norm": 0.455626609677034, + "grad_norm": 0.07844086321305879, "learning_rate": 2.0877865865161915e-05, - "loss": 1.0264, + "loss": 1.022, "step": 2635 }, { "epoch": 0.8128391640688142, - "grad_norm": 0.6557137988990834, + "grad_norm": 0.08648910762156158, "learning_rate": 2.0550267083340068e-05, - "loss": 1.0192, + "loss": 1.0153, "step": 2640 }, { "epoch": 0.8143786321825809, - "grad_norm": 0.3947380563744468, + "grad_norm": 0.07775325088240974, "learning_rate": 2.0224964295400682e-05, - "loss": 1.008, + "loss": 1.004, "step": 2645 }, { "epoch": 0.8159181002963476, - "grad_norm": 0.4190129497380854, + "grad_norm": 0.07614270885527169, "learning_rate": 1.9901966902170944e-05, - "loss": 1.0024, + "loss": 0.9986, "step": 2650 }, { "epoch": 0.8174575684101143, - "grad_norm": 0.4733124256368494, + "grad_norm": 0.07547266474260356, "learning_rate": 1.95812842378552e-05, - "loss": 1.0224, + "loss": 1.0189, "step": 2655 }, { "epoch": 0.818997036523881, - "grad_norm": 0.4565926045173295, + "grad_norm": 0.07552926506121348, "learning_rate": 1.9262925569765087e-05, - "loss": 1.0503, + "loss": 1.0466, "step": 2660 }, { "epoch": 0.8205365046376477, - "grad_norm": 0.4253658988474481, + "grad_norm": 0.07322212035587285, "learning_rate": 1.8946900098051778e-05, - "loss": 1.0738, + "loss": 1.0702, "step": 2665 }, { "epoch": 0.8220759727514144, - "grad_norm": 0.43002381136104756, + "grad_norm": 0.07400455619765747, "learning_rate": 1.8633216955440137e-05, - "loss": 1.0504, + "loss": 1.047, "step": 2670 }, { "epoch": 0.8236154408651811, - "grad_norm": 0.4015805803027289, + "grad_norm": 0.07863808957234804, "learning_rate": 1.832188520696472e-05, - "loss": 1.0397, + "loss": 1.0364, "step": 2675 }, { "epoch": 0.8251549089789477, - "grad_norm": 0.46899175056681264, + "grad_norm": 0.07870122732753966, "learning_rate": 1.8012913849707868e-05, - "loss": 1.0154, + "loss": 1.0116, "step": 2680 }, { "epoch": 0.8266943770927144, - "grad_norm": 0.4232662691268248, + "grad_norm": 0.07504986075094099, "learning_rate": 1.7706311812539757e-05, - "loss": 1.0188, + "loss": 1.0152, "step": 2685 }, { "epoch": 0.8282338452064811, - "grad_norm": 0.3588739754281993, + "grad_norm": 0.07232592624307334, "learning_rate": 1.7402087955860193e-05, - "loss": 1.0213, + "loss": 1.0184, "step": 2690 }, { "epoch": 0.8297733133202478, - "grad_norm": 0.4423853417849867, + "grad_norm": 0.0762286062588258, "learning_rate": 1.710025107134272e-05, - "loss": 1.0223, + "loss": 1.0179, "step": 2695 }, { "epoch": 0.8313127814340145, - "grad_norm": 0.41804031171029893, + "grad_norm": 0.07509864577486833, "learning_rate": 1.680080988168049e-05, - "loss": 1.0171, + "loss": 1.0134, "step": 2700 }, { "epoch": 0.8328522495477813, - "grad_norm": 0.42006267775596445, + "grad_norm": 0.06789415355693122, "learning_rate": 1.6503773040334126e-05, - "loss": 0.9947, + "loss": 0.9912, "step": 2705 }, { "epoch": 0.834391717661548, - "grad_norm": 0.43670340739758357, + "grad_norm": 0.07709323477784796, "learning_rate": 1.620914913128184e-05, - "loss": 1.0432, + "loss": 1.0394, "step": 2710 }, { "epoch": 0.8359311857753147, - "grad_norm": 0.4821381027661066, + "grad_norm": 0.07434954519462933, "learning_rate": 1.591694666877114e-05, - "loss": 1.004, + "loss": 1.0, "step": 2715 }, { "epoch": 0.8374706538890814, - "grad_norm": 0.3935605580571529, + "grad_norm": 0.0752630834836604, "learning_rate": 1.5627174097072904e-05, - "loss": 0.9947, + "loss": 0.9918, "step": 2720 }, { "epoch": 0.839010122002848, - "grad_norm": 0.5080429213031332, + "grad_norm": 0.08031609314625815, "learning_rate": 1.533983979023733e-05, - "loss": 1.0572, + "loss": 1.0528, "step": 2725 }, { "epoch": 0.8405495901166147, - "grad_norm": 0.4159778200626771, + "grad_norm": 0.07438893778418636, "learning_rate": 1.5054952051851934e-05, - "loss": 1.0207, + "loss": 1.0169, "step": 2730 }, { "epoch": 0.8420890582303814, - "grad_norm": 0.4033999184219861, + "grad_norm": 0.07120616920440061, "learning_rate": 1.477251911480162e-05, - "loss": 1.0353, + "loss": 1.0315, "step": 2735 }, { "epoch": 0.8436285263441481, - "grad_norm": 0.4355088544418655, + "grad_norm": 0.0778120016333404, "learning_rate": 1.4492549141030687e-05, - "loss": 1.0475, + "loss": 1.0435, "step": 2740 }, { "epoch": 0.8451679944579148, - "grad_norm": 0.48537370762792753, + "grad_norm": 0.07657562739865552, "learning_rate": 1.4215050221307002e-05, - "loss": 1.0206, + "loss": 1.0175, "step": 2745 }, { "epoch": 0.8467074625716815, - "grad_norm": 0.37095462049211186, + "grad_norm": 0.07067792403307184, "learning_rate": 1.394003037498821e-05, - "loss": 1.025, + "loss": 1.0217, "step": 2750 }, { "epoch": 0.8482469306854482, - "grad_norm": 0.4437528225326038, + "grad_norm": 0.07328319421748747, "learning_rate": 1.3667497549789932e-05, - "loss": 1.0128, + "loss": 1.0088, "step": 2755 }, { "epoch": 0.8497863987992149, - "grad_norm": 0.4192896798172941, + "grad_norm": 0.0744233681700822, "learning_rate": 1.339745962155613e-05, - "loss": 1.0333, + "loss": 1.0295, "step": 2760 }, { "epoch": 0.8513258669129815, - "grad_norm": 0.4323329881502678, + "grad_norm": 0.07877462446250265, "learning_rate": 1.3129924394031535e-05, - "loss": 0.999, + "loss": 0.9949, "step": 2765 }, { "epoch": 0.8528653350267482, - "grad_norm": 0.5312159218939685, + "grad_norm": 0.08094851855766108, "learning_rate": 1.2864899598636004e-05, - "loss": 1.019, + "loss": 1.0156, "step": 2770 }, { "epoch": 0.8544048031405149, - "grad_norm": 0.37079351875448874, + "grad_norm": 0.07115252442458989, "learning_rate": 1.2602392894241222e-05, - "loss": 1.0349, + "loss": 1.0314, "step": 2775 }, { "epoch": 0.8559442712542816, - "grad_norm": 0.4245080748700514, + "grad_norm": 0.07814933274528041, "learning_rate": 1.234241186694931e-05, - "loss": 1.0131, + "loss": 1.0099, "step": 2780 }, { "epoch": 0.8574837393680483, - "grad_norm": 0.4340531910995576, + "grad_norm": 0.07082686846202138, "learning_rate": 1.208496402987358e-05, - "loss": 1.0245, + "loss": 1.0199, "step": 2785 }, { "epoch": 0.859023207481815, - "grad_norm": 0.37626279842019295, + "grad_norm": 0.07345580113408011, "learning_rate": 1.1830056822921521e-05, - "loss": 1.0043, + "loss": 0.9999, "step": 2790 }, { "epoch": 0.8605626755955817, - "grad_norm": 0.5175737600246689, + "grad_norm": 0.07683001250959652, "learning_rate": 1.1577697612579641e-05, - "loss": 1.0232, + "loss": 1.0191, "step": 2795 }, { "epoch": 0.8621021437093485, - "grad_norm": 0.44974494562352096, + "grad_norm": 0.07156736929667568, "learning_rate": 1.1327893691700698e-05, - "loss": 0.9889, + "loss": 0.9857, + "step": 2800 + }, + { + "epoch": 0.8621021437093485, + "eval_loss": 1.0203505754470825, + "eval_runtime": 3797.4745, + "eval_samples_per_second": 6.085, + "eval_steps_per_second": 0.381, "step": 2800 }, { "epoch": 0.8636416118231152, - "grad_norm": 0.430436147584834, + "grad_norm": 0.07709402707342136, "learning_rate": 1.1080652279292891e-05, - "loss": 1.0236, + "loss": 1.0201, "step": 2805 }, { "epoch": 0.8651810799368818, - "grad_norm": 0.4728221888964366, + "grad_norm": 0.07780221289761478, "learning_rate": 1.0835980520311251e-05, - "loss": 1.0155, + "loss": 1.0124, "step": 2810 }, { "epoch": 0.8667205480506485, - "grad_norm": 0.397220073379638, + "grad_norm": 0.07052567096778889, "learning_rate": 1.0593885485451237e-05, - "loss": 1.0099, + "loss": 1.0067, "step": 2815 }, { "epoch": 0.8682600161644152, - "grad_norm": 0.5348645223247931, + "grad_norm": 0.08205387586259268, "learning_rate": 1.0354374170944258e-05, - "loss": 1.0416, + "loss": 1.0375, "step": 2820 }, { "epoch": 0.8697994842781819, - "grad_norm": 0.4043554078765482, + "grad_norm": 0.07521122233616247, "learning_rate": 1.011745349835559e-05, - "loss": 1.0121, + "loss": 1.0079, "step": 2825 }, { "epoch": 0.8713389523919486, - "grad_norm": 0.468593340879125, + "grad_norm": 0.07327233657833195, "learning_rate": 9.883130314384348e-06, - "loss": 1.0231, + "loss": 1.0201, "step": 2830 }, { "epoch": 0.8728784205057153, - "grad_norm": 0.46465265214459056, + "grad_norm": 0.07795899542631453, "learning_rate": 9.651411390665577e-06, - "loss": 0.9918, + "loss": 0.988, "step": 2835 }, { "epoch": 0.874417888619482, - "grad_norm": 0.5605586645440735, + "grad_norm": 0.07286016245406665, "learning_rate": 9.422303423574596e-06, - "loss": 1.0176, + "loss": 1.0135, "step": 2840 }, { "epoch": 0.8759573567332487, - "grad_norm": 1.775692528658347, + "grad_norm": 0.11013655641290553, "learning_rate": 9.195813034033508e-06, - "loss": 1.0134, + "loss": 1.0086, "step": 2845 }, { "epoch": 0.8774968248470154, - "grad_norm": 0.44922271095321187, + "grad_norm": 0.07680851564214486, "learning_rate": 8.971946767319805e-06, - "loss": 0.9981, + "loss": 0.9952, "step": 2850 }, { "epoch": 0.879036292960782, - "grad_norm": 0.5098548100487027, + "grad_norm": 0.07643572225139718, "learning_rate": 8.75071109287724e-06, - "loss": 1.0052, + "loss": 1.0018, "step": 2855 }, { "epoch": 0.8805757610745487, - "grad_norm": 0.41578211906898727, + "grad_norm": 0.08275815627527361, "learning_rate": 8.532112404128877e-06, - "loss": 1.0128, + "loss": 1.0093, "step": 2860 }, { "epoch": 0.8821152291883154, - "grad_norm": 0.3496924486384235, + "grad_norm": 0.07128434825055383, "learning_rate": 8.316157018292326e-06, - "loss": 1.0479, + "loss": 1.0436, "step": 2865 }, { "epoch": 0.8836546973020821, - "grad_norm": 0.3816021309698796, + "grad_norm": 0.06876446508828925, "learning_rate": 8.102851176197201e-06, - "loss": 1.0103, + "loss": 1.0061, "step": 2870 }, { "epoch": 0.8851941654158488, - "grad_norm": 0.3838865693994203, + "grad_norm": 0.07588431474537394, "learning_rate": 7.892201042104718e-06, - "loss": 1.0326, + "loss": 1.028, "step": 2875 }, { "epoch": 0.8867336335296155, - "grad_norm": 0.4294597636569908, + "grad_norm": 0.07641289046212377, "learning_rate": 7.684212703529624e-06, - "loss": 1.0323, + "loss": 1.0272, "step": 2880 }, { "epoch": 0.8882731016433822, - "grad_norm": 0.43018290128127384, + "grad_norm": 0.06945001002290217, "learning_rate": 7.4788921710642e-06, - "loss": 1.0188, + "loss": 1.0157, "step": 2885 }, { "epoch": 0.889812569757149, - "grad_norm": 0.6183016933222353, + "grad_norm": 0.07738630679829854, "learning_rate": 7.276245378204616e-06, - "loss": 1.0259, + "loss": 1.0221, "step": 2890 }, { "epoch": 0.8913520378709155, - "grad_norm": 0.46596709036144063, + "grad_norm": 0.07783820198610965, "learning_rate": 7.076278181179485e-06, - "loss": 1.0316, + "loss": 1.0281, "step": 2895 }, { "epoch": 0.8928915059846823, - "grad_norm": 0.45497909906909056, + "grad_norm": 0.07897682726117675, "learning_rate": 6.878996358780532e-06, - "loss": 1.0326, + "loss": 1.0298, "step": 2900 }, { "epoch": 0.894430974098449, - "grad_norm": 0.4605589314545624, + "grad_norm": 0.07400035245112081, "learning_rate": 6.684405612195688e-06, - "loss": 1.0184, + "loss": 1.0146, "step": 2905 }, { "epoch": 0.8959704422122157, - "grad_norm": 0.35212400219047424, + "grad_norm": 0.07041748377630946, "learning_rate": 6.492511564844273e-06, - "loss": 0.9976, + "loss": 0.9936, "step": 2910 }, { "epoch": 0.8975099103259824, - "grad_norm": 0.431075612216501, + "grad_norm": 0.07411778726810726, "learning_rate": 6.303319762214499e-06, - "loss": 1.0282, + "loss": 1.0251, "step": 2915 }, { "epoch": 0.8990493784397491, - "grad_norm": 0.49886462139074744, + "grad_norm": 0.07596908068224247, "learning_rate": 6.11683567170328e-06, - "loss": 1.0453, + "loss": 1.0412, "step": 2920 }, { "epoch": 0.9005888465535158, - "grad_norm": 0.5001791356037993, + "grad_norm": 0.07970599257161792, "learning_rate": 5.933064682458122e-06, - "loss": 0.9974, + "loss": 0.993, "step": 2925 }, { "epoch": 0.9021283146672825, - "grad_norm": 0.4334035137279178, + "grad_norm": 0.08530851506766432, "learning_rate": 5.7520121052214275e-06, - "loss": 1.002, + "loss": 0.9978, "step": 2930 }, { "epoch": 0.9036677827810492, - "grad_norm": 0.45835892877048534, + "grad_norm": 0.0787622471295689, "learning_rate": 5.57368317217708e-06, - "loss": 1.0524, + "loss": 1.0486, "step": 2935 }, { "epoch": 0.9052072508948158, - "grad_norm": 0.3752767008874508, + "grad_norm": 0.07548753866605311, "learning_rate": 5.398083036799129e-06, - "loss": 0.9691, + "loss": 0.9662, "step": 2940 }, { "epoch": 0.9067467190085825, - "grad_norm": 0.4935328531459081, + "grad_norm": 0.07385752054024032, "learning_rate": 5.225216773702968e-06, - "loss": 1.0252, + "loss": 1.0215, "step": 2945 }, { "epoch": 0.9082861871223492, - "grad_norm": 0.5210304589005964, + "grad_norm": 0.07367920850587745, "learning_rate": 5.055089378498634e-06, - "loss": 1.0199, + "loss": 1.0168, "step": 2950 }, { "epoch": 0.9098256552361159, - "grad_norm": 0.4254020055130796, + "grad_norm": 0.06933992819668296, "learning_rate": 4.887705767646434e-06, - "loss": 0.9989, + "loss": 0.9945, "step": 2955 }, { "epoch": 0.9113651233498826, - "grad_norm": 0.4643941731342843, + "grad_norm": 0.07737912900819312, "learning_rate": 4.7230707783148864e-06, - "loss": 1.0074, + "loss": 1.0045, "step": 2960 }, { "epoch": 0.9129045914636493, - "grad_norm": 0.3715160286597917, + "grad_norm": 0.0739897513293578, "learning_rate": 4.561189168240909e-06, - "loss": 1.0682, + "loss": 1.0643, "step": 2965 }, { "epoch": 0.914444059577416, - "grad_norm": 0.36108361076071477, + "grad_norm": 0.07178231727442419, "learning_rate": 4.402065615592344e-06, - "loss": 0.9939, + "loss": 0.9897, "step": 2970 }, { "epoch": 0.9159835276911827, - "grad_norm": 0.460057449616597, + "grad_norm": 0.07578780419326635, "learning_rate": 4.245704718832811e-06, - "loss": 1.029, + "loss": 1.0257, "step": 2975 }, { "epoch": 0.9175229958049493, - "grad_norm": 0.42151771172138136, + "grad_norm": 0.06776718755272677, "learning_rate": 4.092110996588705e-06, - "loss": 1.0416, + "loss": 1.0388, "step": 2980 }, { "epoch": 0.919062463918716, - "grad_norm": 0.4208982471816418, + "grad_norm": 0.06961862809963763, "learning_rate": 3.941288887518713e-06, - "loss": 1.0083, + "loss": 1.0047, "step": 2985 }, { "epoch": 0.9206019320324828, - "grad_norm": 0.3754941722458392, + "grad_norm": 0.07467398620527968, "learning_rate": 3.7932427501854996e-06, - "loss": 1.0317, + "loss": 1.0282, "step": 2990 }, { "epoch": 0.9221414001462495, - "grad_norm": 0.5370873955203277, + "grad_norm": 0.074801319310819, "learning_rate": 3.647976862929747e-06, - "loss": 1.054, + "loss": 1.0505, "step": 2995 }, { "epoch": 0.9236808682600162, - "grad_norm": 0.4426665180986752, + "grad_norm": 0.07282075126715817, "learning_rate": 3.505495423746574e-06, - "loss": 1.0287, + "loss": 1.0258, + "step": 3000 + }, + { + "epoch": 0.9236808682600162, + "eval_loss": 1.0200704336166382, + "eval_runtime": 3795.8049, + "eval_samples_per_second": 6.088, + "eval_steps_per_second": 0.381, "step": 3000 }, { "epoch": 0.9252203363737829, - "grad_norm": 0.39355763646149944, + "grad_norm": 0.07205358816276557, "learning_rate": 3.3658025501641323e-06, - "loss": 1.0538, + "loss": 1.0489, "step": 3005 }, { "epoch": 0.9267598044875496, - "grad_norm": 0.3926350659174442, + "grad_norm": 0.07610591218908633, "learning_rate": 3.228902279124657e-06, - "loss": 1.0269, + "loss": 1.0239, "step": 3010 }, { "epoch": 0.9282992726013163, - "grad_norm": 0.5307688367772885, + "grad_norm": 0.07727675334018068, "learning_rate": 3.094798566867818e-06, - "loss": 1.0098, + "loss": 1.0059, "step": 3015 }, { "epoch": 0.929838740715083, - "grad_norm": 0.5463010628237303, + "grad_norm": 0.07615981685372906, "learning_rate": 2.963495288816376e-06, - "loss": 1.0698, + "loss": 1.0657, "step": 3020 }, { "epoch": 0.9313782088288496, - "grad_norm": 0.4767217195493804, + "grad_norm": 0.08088184419445542, "learning_rate": 2.8349962394641605e-06, - "loss": 1.0652, + "loss": 1.061, "step": 3025 }, { "epoch": 0.9329176769426163, - "grad_norm": 0.46383675807120084, + "grad_norm": 0.07621008582551618, "learning_rate": 2.709305132266493e-06, - "loss": 1.0566, + "loss": 1.0538, "step": 3030 }, { "epoch": 0.934457145056383, - "grad_norm": 0.406764613161873, + "grad_norm": 0.07426199836493842, "learning_rate": 2.5864255995327936e-06, - "loss": 1.02, + "loss": 1.0171, "step": 3035 }, { "epoch": 0.9359966131701497, - "grad_norm": 0.5667072139952383, + "grad_norm": 0.07752514392332224, "learning_rate": 2.46636119232162e-06, - "loss": 1.0293, + "loss": 1.0246, "step": 3040 }, { "epoch": 0.9375360812839164, - "grad_norm": 0.4123820408065487, + "grad_norm": 0.07245868859250076, "learning_rate": 2.349115380338096e-06, - "loss": 1.0528, + "loss": 1.0498, "step": 3045 }, { "epoch": 0.9390755493976831, - "grad_norm": 0.42212082973648146, + "grad_norm": 0.080806654929095, "learning_rate": 2.2346915518335786e-06, - "loss": 0.9751, + "loss": 0.9719, "step": 3050 }, { "epoch": 0.9406150175114498, - "grad_norm": 0.44633003035441593, + "grad_norm": 0.07240743409190123, "learning_rate": 2.1230930135078373e-06, - "loss": 1.0071, + "loss": 1.0034, "step": 3055 }, { "epoch": 0.9421544856252165, - "grad_norm": 0.6009896163488582, + "grad_norm": 0.07918466925937036, "learning_rate": 2.014322990413353e-06, - "loss": 1.0275, + "loss": 1.0245, "step": 3060 }, { "epoch": 0.9436939537389831, - "grad_norm": 0.3529405684211488, + "grad_norm": 0.07146801437581773, "learning_rate": 1.9083846258622586e-06, - "loss": 1.0176, + "loss": 1.0141, "step": 3065 }, { "epoch": 0.9452334218527498, - "grad_norm": 0.7896994457794821, + "grad_norm": 0.07367001294912622, "learning_rate": 1.8052809813354111e-06, - "loss": 1.0063, + "loss": 1.0032, "step": 3070 }, { "epoch": 0.9467728899665165, - "grad_norm": 0.48377625576699135, + "grad_norm": 0.07453268715520749, "learning_rate": 1.705015036393931e-06, - "loss": 1.057, + "loss": 1.0538, "step": 3075 }, { "epoch": 0.9483123580802832, - "grad_norm": 0.43904259295920206, + "grad_norm": 0.07656072709742638, "learning_rate": 1.6075896885931807e-06, - "loss": 1.0227, + "loss": 1.0197, "step": 3080 }, { "epoch": 0.94985182619405, - "grad_norm": 0.4480774273472164, + "grad_norm": 0.07453267042052425, "learning_rate": 1.5130077533988873e-06, - "loss": 1.0369, + "loss": 1.0336, "step": 3085 }, { "epoch": 0.9513912943078167, - "grad_norm": 0.4679465739342283, + "grad_norm": 0.076241710023964, "learning_rate": 1.421271964105908e-06, - "loss": 1.0623, + "loss": 1.0588, "step": 3090 }, { "epoch": 0.9529307624215834, - "grad_norm": 0.44630980531648395, + "grad_norm": 0.07786023911471296, "learning_rate": 1.3323849717591376e-06, - "loss": 1.0166, + "loss": 1.0137, "step": 3095 }, { "epoch": 0.9544702305353501, - "grad_norm": 0.5287384335577812, + "grad_norm": 0.07802788248776593, "learning_rate": 1.2463493450769915e-06, - "loss": 1.0425, + "loss": 1.0391, "step": 3100 }, { "epoch": 0.9560096986491168, - "grad_norm": 0.40230009571762976, + "grad_norm": 0.07886805213833666, "learning_rate": 1.1631675703771105e-06, - "loss": 1.0385, + "loss": 1.0347, "step": 3105 }, { "epoch": 0.9575491667628834, - "grad_norm": 0.49211013441821094, + "grad_norm": 0.07635272592447735, "learning_rate": 1.0828420515045178e-06, - "loss": 1.0234, + "loss": 1.0207, "step": 3110 }, { "epoch": 0.9590886348766501, - "grad_norm": 0.48292762219822727, + "grad_norm": 0.07520931499528458, "learning_rate": 1.0053751097621856e-06, - "loss": 0.9869, + "loss": 0.984, "step": 3115 }, { "epoch": 0.9606281029904168, - "grad_norm": 0.4431002985495294, + "grad_norm": 0.07209942369823688, "learning_rate": 9.307689838439104e-07, - "loss": 1.0363, + "loss": 1.0323, "step": 3120 }, { "epoch": 0.9621675711041835, - "grad_norm": 0.3771405426388914, + "grad_norm": 0.07177603650069232, "learning_rate": 8.590258297696108e-07, - "loss": 0.978, + "loss": 0.975, "step": 3125 }, { "epoch": 0.9637070392179502, - "grad_norm": 0.4242241866712476, + "grad_norm": 0.07609713576567406, "learning_rate": 7.901477208230979e-07, - "loss": 1.0173, + "loss": 1.0146, "step": 3130 }, { "epoch": 0.9652465073317169, - "grad_norm": 0.42765029098432733, + "grad_norm": 0.07406778875260055, "learning_rate": 7.241366474920797e-07, - "loss": 1.0021, + "loss": 0.9989, "step": 3135 }, { "epoch": 0.9667859754454836, - "grad_norm": 0.4026446330368183, + "grad_norm": 0.07105715049913416, "learning_rate": 6.609945174106402e-07, - "loss": 1.0352, + "loss": 1.0318, "step": 3140 }, { "epoch": 0.9683254435592503, - "grad_norm": 0.4586738339921203, + "grad_norm": 0.07487401577195316, "learning_rate": 6.007231553041837e-07, - "loss": 1.0087, + "loss": 1.0046, "step": 3145 }, { "epoch": 0.969864911673017, - "grad_norm": 0.41482262786645374, + "grad_norm": 0.07414359551482441, "learning_rate": 5.43324302936643e-07, - "loss": 1.0914, + "loss": 1.0871, "step": 3150 }, { "epoch": 0.9714043797867836, - "grad_norm": 0.42202537226167186, + "grad_norm": 0.07506507959086049, "learning_rate": 4.887996190601318e-07, - "loss": 1.0419, + "loss": 1.0388, "step": 3155 }, { "epoch": 0.9729438479005503, - "grad_norm": 0.37960996848907147, + "grad_norm": 0.0752885384753745, "learning_rate": 4.3715067936705987e-07, - "loss": 1.0171, + "loss": 1.0137, "step": 3160 }, { "epoch": 0.974483316014317, - "grad_norm": 0.7530238458576849, + "grad_norm": 0.08095676290791981, "learning_rate": 3.8837897644457e-07, - "loss": 1.0446, + "loss": 1.0404, "step": 3165 }, { "epoch": 0.9760227841280837, - "grad_norm": 0.3574506914494524, + "grad_norm": 0.06987482190123641, "learning_rate": 3.4248591973140566e-07, - "loss": 1.0402, + "loss": 1.0352, "step": 3170 }, { "epoch": 0.9775622522418504, - "grad_norm": 0.4403626626769491, + "grad_norm": 0.0787167065963581, "learning_rate": 2.994728354771659e-07, - "loss": 1.0618, + "loss": 1.0583, "step": 3175 }, { "epoch": 0.9791017203556172, - "grad_norm": 0.43643836321504664, + "grad_norm": 0.07226890570969514, "learning_rate": 2.593409667040247e-07, - "loss": 0.9829, + "loss": 0.9798, "step": 3180 }, { "epoch": 0.9806411884693839, - "grad_norm": 0.44879821672356546, + "grad_norm": 0.07375768888851764, "learning_rate": 2.2209147317074908e-07, - "loss": 0.9989, + "loss": 0.9954, "step": 3185 }, { "epoch": 0.9821806565831506, - "grad_norm": 0.464478069097993, + "grad_norm": 0.07649902759613646, "learning_rate": 1.8772543133922515e-07, - "loss": 1.0269, + "loss": 1.0238, "step": 3190 }, { "epoch": 0.9837201246969172, - "grad_norm": 0.3840629750152777, + "grad_norm": 0.06996190701610033, "learning_rate": 1.5624383434333923e-07, - "loss": 1.0007, + "loss": 0.9976, "step": 3195 }, { "epoch": 0.9852595928106839, - "grad_norm": 0.4819476738232411, + "grad_norm": 0.07416785904918395, "learning_rate": 1.276475919602671e-07, - "loss": 1.0183, + "loss": 1.0147, + "step": 3200 + }, + { + "epoch": 0.9852595928106839, + "eval_loss": 1.0199946165084839, + "eval_runtime": 3798.6902, + "eval_samples_per_second": 6.083, + "eval_steps_per_second": 0.38, "step": 3200 }, { "epoch": 0.9867990609244506, - "grad_norm": 0.4679445982996664, + "grad_norm": 0.08074611350034196, "learning_rate": 1.019375305842063e-07, - "loss": 1.0093, + "loss": 1.006, "step": 3205 }, { "epoch": 0.9883385290382173, - "grad_norm": 0.608255287884667, + "grad_norm": 0.06887822697223855, "learning_rate": 7.911439320247294e-08, - "loss": 1.0133, + "loss": 1.0094, "step": 3210 }, { "epoch": 0.989877997151984, - "grad_norm": 0.49744148279623623, + "grad_norm": 0.07945600917008594, "learning_rate": 5.9178839374018914e-08, - "loss": 1.0323, + "loss": 1.0289, "step": 3215 }, { "epoch": 0.9914174652657507, - "grad_norm": 0.4874179505714964, + "grad_norm": 0.08069867178613127, "learning_rate": 4.213144521042489e-08, - "loss": 1.0355, + "loss": 1.0321, "step": 3220 }, { "epoch": 0.9929569333795174, - "grad_norm": 0.42907270925432495, + "grad_norm": 0.07866409777388325, "learning_rate": 2.797270335916924e-08, - "loss": 1.0004, + "loss": 0.9975, "step": 3225 }, { "epoch": 0.9944964014932841, - "grad_norm": 0.36896334930948343, + "grad_norm": 0.07139593983809099, "learning_rate": 1.6703022989494887e-08, - "loss": 1.0378, + "loss": 1.0347, "step": 3230 }, { "epoch": 0.9960358696070508, - "grad_norm": 0.369776577000758, + "grad_norm": 0.0702207314389356, "learning_rate": 8.322729780474436e-09, - "loss": 1.0181, + "loss": 1.0144, "step": 3235 }, { "epoch": 0.9975753377208174, - "grad_norm": 0.381178979843484, + "grad_norm": 0.07128599759030121, "learning_rate": 2.8320659116953806e-09, - "loss": 1.016, + "loss": 1.0127, "step": 3240 }, { "epoch": 0.9991148058345841, - "grad_norm": 1.063968939156568, + "grad_norm": 0.07746757212398606, "learning_rate": 2.311900561768887e-10, - "loss": 1.008, + "loss": 1.0052, "step": 3245 }, - { - "epoch": 0.9997305930800908, - "eval_loss": 1.0235061645507812, - "eval_runtime": 3775.83, - "eval_samples_per_second": 6.12, - "eval_steps_per_second": 0.383, - "step": 3247 - }, { "epoch": 0.9997305930800908, "step": 3247, - "total_flos": 4.15615026158633e+16, - "train_loss": 1.0426253444200522, - "train_runtime": 139956.5138, - "train_samples_per_second": 1.485, - "train_steps_per_second": 0.023 + "total_flos": 3.805052032096666e+16, + "train_loss": 0.6459755006114042, + "train_runtime": 124690.9481, + "train_samples_per_second": 1.667, + "train_steps_per_second": 0.026 } ], "logging_steps": 5, @@ -4581,7 +4701,7 @@ "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, - "total_flos": 4.15615026158633e+16, + "total_flos": 3.805052032096666e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null