{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.001801801801802, "eval_steps": 500, "global_step": 833, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010810810810810811, "grad_norm": 44.04393707727108, "learning_rate": 2.9999066991504905e-05, "loss": 2.3711, "step": 3 }, { "epoch": 0.021621621621621623, "grad_norm": 31.04171552882201, "learning_rate": 2.9996268082086924e-05, "loss": 4.159, "step": 6 }, { "epoch": 0.032432432432432434, "grad_norm": 11.54576214967521, "learning_rate": 2.9991603619933566e-05, "loss": 1.9733, "step": 9 }, { "epoch": 0.043243243243243246, "grad_norm": 7.08841552325599, "learning_rate": 2.9985074185309204e-05, "loss": 1.7978, "step": 12 }, { "epoch": 0.05405405405405406, "grad_norm": 5.949508626432288, "learning_rate": 2.99766805904829e-05, "loss": 1.7347, "step": 15 }, { "epoch": 0.06486486486486487, "grad_norm": 6.168616775238258, "learning_rate": 2.9966423879627356e-05, "loss": 1.6033, "step": 18 }, { "epoch": 0.07567567567567568, "grad_norm": 5.728079542384497, "learning_rate": 2.9954305328689024e-05, "loss": 1.7134, "step": 21 }, { "epoch": 0.08648648648648649, "grad_norm": 6.284326561040228, "learning_rate": 2.9940326445229367e-05, "loss": 1.6933, "step": 24 }, { "epoch": 0.0972972972972973, "grad_norm": 6.92658975011714, "learning_rate": 2.9924488968237316e-05, "loss": 1.5923, "step": 27 }, { "epoch": 0.10810810810810811, "grad_norm": 6.538508694879061, "learning_rate": 2.9906794867912953e-05, "loss": 1.6931, "step": 30 }, { "epoch": 0.11891891891891893, "grad_norm": 4.685530306007965, "learning_rate": 2.98872463454224e-05, "loss": 1.6559, "step": 33 }, { "epoch": 0.12972972972972974, "grad_norm": 5.65503266442286, "learning_rate": 2.9865845832623993e-05, "loss": 1.6982, "step": 36 }, { "epoch": 0.14054054054054055, "grad_norm": 4.886380857119004, "learning_rate": 2.9842595991765766e-05, "loss": 1.6503, "step": 39 }, { "epoch": 0.15135135135135136, "grad_norm": 5.026086310034092, "learning_rate": 2.981749971515426e-05, "loss": 1.632, "step": 42 }, { "epoch": 0.16216216216216217, "grad_norm": 4.795570770299284, "learning_rate": 2.9790560124794702e-05, "loss": 1.6824, "step": 45 }, { "epoch": 0.17297297297297298, "grad_norm": 4.756143563325781, "learning_rate": 2.976178057200266e-05, "loss": 1.6694, "step": 48 }, { "epoch": 0.1837837837837838, "grad_norm": 5.364943432581566, "learning_rate": 2.9731164636987088e-05, "loss": 1.6659, "step": 51 }, { "epoch": 0.1945945945945946, "grad_norm": 5.181051766279552, "learning_rate": 2.9698716128404985e-05, "loss": 1.6443, "step": 54 }, { "epoch": 0.20540540540540542, "grad_norm": 4.828479392346181, "learning_rate": 2.9664439082887568e-05, "loss": 1.6519, "step": 57 }, { "epoch": 0.21621621621621623, "grad_norm": 5.824361936201152, "learning_rate": 2.9628337764538135e-05, "loss": 1.6532, "step": 60 }, { "epoch": 0.22702702702702704, "grad_norm": 6.393887712988006, "learning_rate": 2.9590416664401566e-05, "loss": 1.6409, "step": 63 }, { "epoch": 0.23783783783783785, "grad_norm": 5.8650966692501765, "learning_rate": 2.955068049990568e-05, "loss": 1.6105, "step": 66 }, { "epoch": 0.24864864864864866, "grad_norm": 4.755849083042293, "learning_rate": 2.9509134214274343e-05, "loss": 1.6618, "step": 69 }, { "epoch": 0.2594594594594595, "grad_norm": 3.9789106042521962, "learning_rate": 2.9465782975912553e-05, "loss": 1.6645, "step": 72 }, { "epoch": 0.2702702702702703, "grad_norm": 4.592578050100414, "learning_rate": 2.942063217776346e-05, "loss": 1.605, "step": 75 }, { "epoch": 0.2810810810810811, "grad_norm": 4.299733571350802, "learning_rate": 2.9373687436637492e-05, "loss": 1.6233, "step": 78 }, { "epoch": 0.2918918918918919, "grad_norm": 4.585181401202116, "learning_rate": 2.9324954592513626e-05, "loss": 1.6587, "step": 81 }, { "epoch": 0.3027027027027027, "grad_norm": 4.530135718622418, "learning_rate": 2.927443970781287e-05, "loss": 1.6333, "step": 84 }, { "epoch": 0.31351351351351353, "grad_norm": 5.349009947479682, "learning_rate": 2.9222149066644088e-05, "loss": 1.6431, "step": 87 }, { "epoch": 0.32432432432432434, "grad_norm": 4.266181060344957, "learning_rate": 2.916808917402228e-05, "loss": 1.598, "step": 90 }, { "epoch": 0.33513513513513515, "grad_norm": 4.628674419343668, "learning_rate": 2.911226675505932e-05, "loss": 1.6375, "step": 93 }, { "epoch": 0.34594594594594597, "grad_norm": 5.058644550344611, "learning_rate": 2.905468875412735e-05, "loss": 1.6427, "step": 96 }, { "epoch": 0.3567567567567568, "grad_norm": 4.417115318512762, "learning_rate": 2.8995362333994906e-05, "loss": 1.6333, "step": 99 }, { "epoch": 0.3675675675675676, "grad_norm": 4.0811273644284345, "learning_rate": 2.8934294874935848e-05, "loss": 1.5855, "step": 102 }, { "epoch": 0.3783783783783784, "grad_norm": 3.962543949681069, "learning_rate": 2.887149397381126e-05, "loss": 1.6171, "step": 105 }, { "epoch": 0.3891891891891892, "grad_norm": 3.934001274285887, "learning_rate": 2.8806967443124372e-05, "loss": 1.5538, "step": 108 }, { "epoch": 0.4, "grad_norm": 4.891305018487301, "learning_rate": 2.8740723310048682e-05, "loss": 1.6476, "step": 111 }, { "epoch": 0.41081081081081083, "grad_norm": 4.553656766898869, "learning_rate": 2.8672769815429385e-05, "loss": 1.5889, "step": 114 }, { "epoch": 0.42162162162162165, "grad_norm": 4.450034079395778, "learning_rate": 2.860311541275818e-05, "loss": 1.5896, "step": 117 }, { "epoch": 0.43243243243243246, "grad_norm": 3.9485209950285274, "learning_rate": 2.8531768767121656e-05, "loss": 1.6198, "step": 120 }, { "epoch": 0.44324324324324327, "grad_norm": 4.35512448284715, "learning_rate": 2.845873875412335e-05, "loss": 1.6443, "step": 123 }, { "epoch": 0.4540540540540541, "grad_norm": 4.651266887881564, "learning_rate": 2.838403445877958e-05, "loss": 1.6542, "step": 126 }, { "epoch": 0.4648648648648649, "grad_norm": 4.5294288157069, "learning_rate": 2.8307665174389323e-05, "loss": 1.655, "step": 129 }, { "epoch": 0.4756756756756757, "grad_norm": 4.104985239339571, "learning_rate": 2.822964040137805e-05, "loss": 1.6827, "step": 132 }, { "epoch": 0.4864864864864865, "grad_norm": 3.7948515475231286, "learning_rate": 2.8149969846115894e-05, "loss": 1.6333, "step": 135 }, { "epoch": 0.4972972972972973, "grad_norm": 4.449225329061536, "learning_rate": 2.8068663419710182e-05, "loss": 1.6185, "step": 138 }, { "epoch": 0.5081081081081081, "grad_norm": 4.391987231579949, "learning_rate": 2.7985731236772448e-05, "loss": 1.6078, "step": 141 }, { "epoch": 0.518918918918919, "grad_norm": 4.982415169833182, "learning_rate": 2.7901183614160185e-05, "loss": 1.6529, "step": 144 }, { "epoch": 0.5297297297297298, "grad_norm": 4.176595151214056, "learning_rate": 2.7815031069693412e-05, "loss": 1.6073, "step": 147 }, { "epoch": 0.5405405405405406, "grad_norm": 4.3206148554703105, "learning_rate": 2.7727284320846246e-05, "loss": 1.5561, "step": 150 }, { "epoch": 0.5513513513513514, "grad_norm": 4.424758608775709, "learning_rate": 2.7637954283413632e-05, "loss": 1.6253, "step": 153 }, { "epoch": 0.5621621621621622, "grad_norm": 5.349711813640235, "learning_rate": 2.75470520701534e-05, "loss": 1.7059, "step": 156 }, { "epoch": 0.572972972972973, "grad_norm": 4.578654891344146, "learning_rate": 2.7454588989403858e-05, "loss": 1.6107, "step": 159 }, { "epoch": 0.5837837837837838, "grad_norm": 40.22880683574773, "learning_rate": 2.7360576543676972e-05, "loss": 1.6278, "step": 162 }, { "epoch": 0.5945945945945946, "grad_norm": 4.155039894851776, "learning_rate": 2.7265026428227476e-05, "loss": 1.6301, "step": 165 }, { "epoch": 0.6054054054054054, "grad_norm": 4.159866031946415, "learning_rate": 2.7167950529597963e-05, "loss": 1.5342, "step": 168 }, { "epoch": 0.6162162162162163, "grad_norm": 4.087141493968059, "learning_rate": 2.706936092414018e-05, "loss": 1.6033, "step": 171 }, { "epoch": 0.6270270270270271, "grad_norm": 3.825646270675215, "learning_rate": 2.696926987651271e-05, "loss": 1.5288, "step": 174 }, { "epoch": 0.6378378378378379, "grad_norm": 3.917523159059879, "learning_rate": 2.686768983815526e-05, "loss": 1.6363, "step": 177 }, { "epoch": 0.6486486486486487, "grad_norm": 4.202629239471907, "learning_rate": 2.676463344573965e-05, "loss": 1.6052, "step": 180 }, { "epoch": 0.6594594594594595, "grad_norm": 3.6368092847747304, "learning_rate": 2.666011351959783e-05, "loss": 1.6309, "step": 183 }, { "epoch": 0.6702702702702703, "grad_norm": 4.253168243144118, "learning_rate": 2.6554143062126995e-05, "loss": 1.5592, "step": 186 }, { "epoch": 0.6810810810810811, "grad_norm": 4.995354779998164, "learning_rate": 2.6446735256172092e-05, "loss": 1.6303, "step": 189 }, { "epoch": 0.6918918918918919, "grad_norm": 4.421549411402136, "learning_rate": 2.6337903463385836e-05, "loss": 1.5769, "step": 192 }, { "epoch": 0.7027027027027027, "grad_norm": 4.32615026522547, "learning_rate": 2.6227661222566516e-05, "loss": 1.613, "step": 195 }, { "epoch": 0.7135135135135136, "grad_norm": 3.8575639988103836, "learning_rate": 2.6116022247973773e-05, "loss": 1.5844, "step": 198 }, { "epoch": 0.7243243243243244, "grad_norm": 3.7279832633133028, "learning_rate": 2.6003000427622484e-05, "loss": 1.5301, "step": 201 }, { "epoch": 0.7351351351351352, "grad_norm": 4.190711163922663, "learning_rate": 2.5888609821555127e-05, "loss": 1.592, "step": 204 }, { "epoch": 0.745945945945946, "grad_norm": 4.733000892445367, "learning_rate": 2.577286466009266e-05, "loss": 1.6574, "step": 207 }, { "epoch": 0.7567567567567568, "grad_norm": 4.577219211132897, "learning_rate": 2.5655779342064276e-05, "loss": 1.6289, "step": 210 }, { "epoch": 0.7675675675675676, "grad_norm": 4.048131970531039, "learning_rate": 2.553736843301615e-05, "loss": 1.6169, "step": 213 }, { "epoch": 0.7783783783783784, "grad_norm": 4.018546715630257, "learning_rate": 2.5417646663399502e-05, "loss": 1.5489, "step": 216 }, { "epoch": 0.7891891891891892, "grad_norm": 3.7010313992210992, "learning_rate": 2.529662892673806e-05, "loss": 1.5596, "step": 219 }, { "epoch": 0.8, "grad_norm": 4.557965597883243, "learning_rate": 2.5174330277775354e-05, "loss": 1.6145, "step": 222 }, { "epoch": 0.8108108108108109, "grad_norm": 4.181549208740728, "learning_rate": 2.5050765930601836e-05, "loss": 1.5339, "step": 225 }, { "epoch": 0.8216216216216217, "grad_norm": 3.7892758830012823, "learning_rate": 2.4925951256762254e-05, "loss": 1.5862, "step": 228 }, { "epoch": 0.8324324324324325, "grad_norm": 3.6130747678919666, "learning_rate": 2.4799901783343407e-05, "loss": 1.4857, "step": 231 }, { "epoch": 0.8432432432432433, "grad_norm": 3.639537345617851, "learning_rate": 2.467263319104256e-05, "loss": 1.5902, "step": 234 }, { "epoch": 0.8540540540540541, "grad_norm": 4.0474919753332035, "learning_rate": 2.4544161312216752e-05, "loss": 1.5395, "step": 237 }, { "epoch": 0.8648648648648649, "grad_norm": 3.800979434984059, "learning_rate": 2.441450212891323e-05, "loss": 1.5284, "step": 240 }, { "epoch": 0.8756756756756757, "grad_norm": 3.3611120493742983, "learning_rate": 2.4283671770881256e-05, "loss": 1.515, "step": 243 }, { "epoch": 0.8864864864864865, "grad_norm": 3.459228078638404, "learning_rate": 2.415168651356556e-05, "loss": 1.5745, "step": 246 }, { "epoch": 0.8972972972972973, "grad_norm": 3.6185129562881513, "learning_rate": 2.4018562776081643e-05, "loss": 1.5989, "step": 249 }, { "epoch": 0.9081081081081082, "grad_norm": 4.499909371969758, "learning_rate": 2.388431711917324e-05, "loss": 1.5609, "step": 252 }, { "epoch": 0.918918918918919, "grad_norm": 3.6576864938242832, "learning_rate": 2.3748966243152127e-05, "loss": 1.5623, "step": 255 }, { "epoch": 0.9297297297297298, "grad_norm": 4.261199238023545, "learning_rate": 2.3612526985820586e-05, "loss": 1.5523, "step": 258 }, { "epoch": 0.9405405405405406, "grad_norm": 4.730374719738293, "learning_rate": 2.347501632037678e-05, "loss": 1.5813, "step": 261 }, { "epoch": 0.9513513513513514, "grad_norm": 3.7110704143642503, "learning_rate": 2.333645135330324e-05, "loss": 1.4888, "step": 264 }, { "epoch": 0.9621621621621622, "grad_norm": 3.481005791064881, "learning_rate": 2.3196849322238816e-05, "loss": 1.6186, "step": 267 }, { "epoch": 0.972972972972973, "grad_norm": 3.9410070667987913, "learning_rate": 2.3056227593834306e-05, "loss": 1.5343, "step": 270 }, { "epoch": 0.9837837837837838, "grad_norm": 3.73687483401855, "learning_rate": 2.291460366159199e-05, "loss": 1.527, "step": 273 }, { "epoch": 0.9945945945945946, "grad_norm": 3.636935348418019, "learning_rate": 2.277199514368947e-05, "loss": 1.5228, "step": 276 }, { "epoch": 1.0054054054054054, "grad_norm": 3.5028224113856457, "learning_rate": 2.2628419780787887e-05, "loss": 1.3043, "step": 279 }, { "epoch": 1.0162162162162163, "grad_norm": 3.2714761796276455, "learning_rate": 2.2483895433825023e-05, "loss": 1.0507, "step": 282 }, { "epoch": 1.027027027027027, "grad_norm": 3.180825722720309, "learning_rate": 2.2338440081793332e-05, "loss": 1.0155, "step": 285 }, { "epoch": 1.037837837837838, "grad_norm": 2.9167211293609894, "learning_rate": 2.2192071819503365e-05, "loss": 1.0087, "step": 288 }, { "epoch": 1.0486486486486486, "grad_norm": 3.1930797413555077, "learning_rate": 2.2044808855332743e-05, "loss": 0.9847, "step": 291 }, { "epoch": 1.0594594594594595, "grad_norm": 3.0743072086936474, "learning_rate": 2.1896669508961002e-05, "loss": 1.0024, "step": 294 }, { "epoch": 1.0702702702702702, "grad_norm": 3.3931402915538613, "learning_rate": 2.1747672209090627e-05, "loss": 1.0063, "step": 297 }, { "epoch": 1.0810810810810811, "grad_norm": 3.427840497426894, "learning_rate": 2.1597835491154495e-05, "loss": 0.9924, "step": 300 }, { "epoch": 1.0918918918918918, "grad_norm": 3.209752499479298, "learning_rate": 2.1447177995010024e-05, "loss": 1.0114, "step": 303 }, { "epoch": 1.1027027027027028, "grad_norm": 2.9188122615255487, "learning_rate": 2.1295718462620383e-05, "loss": 0.9348, "step": 306 }, { "epoch": 1.1135135135135135, "grad_norm": 3.2169410708018464, "learning_rate": 2.1143475735722965e-05, "loss": 0.9456, "step": 309 }, { "epoch": 1.1243243243243244, "grad_norm": 3.2550857985332815, "learning_rate": 2.099046875348543e-05, "loss": 0.9704, "step": 312 }, { "epoch": 1.135135135135135, "grad_norm": 3.200798957813093, "learning_rate": 2.0836716550149685e-05, "loss": 1.0187, "step": 315 }, { "epoch": 1.145945945945946, "grad_norm": 3.026699827485341, "learning_rate": 2.068223825266397e-05, "loss": 0.9959, "step": 318 }, { "epoch": 1.1567567567567567, "grad_norm": 2.966340597816754, "learning_rate": 2.0527053078303463e-05, "loss": 0.9672, "step": 321 }, { "epoch": 1.1675675675675676, "grad_norm": 3.4796215218810578, "learning_rate": 2.0371180332279642e-05, "loss": 0.9631, "step": 324 }, { "epoch": 1.1783783783783783, "grad_norm": 2.9446475013457203, "learning_rate": 2.0214639405338653e-05, "loss": 0.9922, "step": 327 }, { "epoch": 1.1891891891891893, "grad_norm": 3.0107017661224447, "learning_rate": 2.0057449771349123e-05, "loss": 0.9846, "step": 330 }, { "epoch": 1.2, "grad_norm": 3.1589173902147203, "learning_rate": 1.989963098487957e-05, "loss": 0.9945, "step": 333 }, { "epoch": 1.2108108108108109, "grad_norm": 3.291095419768011, "learning_rate": 1.9741202678765785e-05, "loss": 1.0006, "step": 336 }, { "epoch": 1.2216216216216216, "grad_norm": 3.0439357766975768, "learning_rate": 1.9582184561668496e-05, "loss": 1.0247, "step": 339 }, { "epoch": 1.2324324324324325, "grad_norm": 2.7398517472244133, "learning_rate": 1.942259641562159e-05, "loss": 1.0129, "step": 342 }, { "epoch": 1.2432432432432432, "grad_norm": 3.0466059717106098, "learning_rate": 1.9262458093571193e-05, "loss": 1.0257, "step": 345 }, { "epoch": 1.2540540540540541, "grad_norm": 2.8458132575753714, "learning_rate": 1.9101789516905953e-05, "loss": 0.9715, "step": 348 }, { "epoch": 1.2648648648648648, "grad_norm": 2.8328426905654656, "learning_rate": 1.8940610672978803e-05, "loss": 0.961, "step": 351 }, { "epoch": 1.2756756756756757, "grad_norm": 3.030835646521939, "learning_rate": 1.8778941612620482e-05, "loss": 0.9884, "step": 354 }, { "epoch": 1.2864864864864864, "grad_norm": 2.8633899892085024, "learning_rate": 1.8616802447645223e-05, "loss": 0.9937, "step": 357 }, { "epoch": 1.2972972972972974, "grad_norm": 3.338996158976475, "learning_rate": 1.8454213348348797e-05, "loss": 0.9809, "step": 360 }, { "epoch": 1.308108108108108, "grad_norm": 2.924814513226331, "learning_rate": 1.8291194540999322e-05, "loss": 0.9526, "step": 363 }, { "epoch": 1.318918918918919, "grad_norm": 3.090470952947, "learning_rate": 1.8127766305321072e-05, "loss": 0.9912, "step": 366 }, { "epoch": 1.3297297297297297, "grad_norm": 2.9540976533352867, "learning_rate": 1.7963948971971686e-05, "loss": 0.9725, "step": 369 }, { "epoch": 1.3405405405405406, "grad_norm": 2.9280101457384986, "learning_rate": 1.7799762920012982e-05, "loss": 0.9508, "step": 372 }, { "epoch": 1.3513513513513513, "grad_norm": 3.129222083901634, "learning_rate": 1.763522857437579e-05, "loss": 0.9952, "step": 375 }, { "epoch": 1.3621621621621622, "grad_norm": 3.3207813445482315, "learning_rate": 1.747036640331908e-05, "loss": 0.9778, "step": 378 }, { "epoch": 1.372972972972973, "grad_norm": 2.941815984953935, "learning_rate": 1.7305196915883662e-05, "loss": 0.9922, "step": 381 }, { "epoch": 1.3837837837837839, "grad_norm": 3.1943275224301475, "learning_rate": 1.713974065934086e-05, "loss": 0.9738, "step": 384 }, { "epoch": 1.3945945945945946, "grad_norm": 2.9545782873135478, "learning_rate": 1.6974018216636394e-05, "loss": 0.9712, "step": 387 }, { "epoch": 1.4054054054054055, "grad_norm": 2.832796057451163, "learning_rate": 1.6808050203829845e-05, "loss": 1.0121, "step": 390 }, { "epoch": 1.4162162162162162, "grad_norm": 3.2139763823586196, "learning_rate": 1.6641857267530003e-05, "loss": 0.9702, "step": 393 }, { "epoch": 1.427027027027027, "grad_norm": 3.3210065946822827, "learning_rate": 1.6475460082326377e-05, "loss": 1.0018, "step": 396 }, { "epoch": 1.4378378378378378, "grad_norm": 3.166940843865695, "learning_rate": 1.6308879348217293e-05, "loss": 0.9959, "step": 399 }, { "epoch": 1.4486486486486487, "grad_norm": 3.1486302485385878, "learning_rate": 1.6142135788034743e-05, "loss": 0.9477, "step": 402 }, { "epoch": 1.4594594594594594, "grad_norm": 3.1328749208815547, "learning_rate": 1.5975250144866492e-05, "loss": 0.9854, "step": 405 }, { "epoch": 1.4702702702702704, "grad_norm": 2.9503463010514035, "learning_rate": 1.5808243179475568e-05, "loss": 1.0001, "step": 408 }, { "epoch": 1.481081081081081, "grad_norm": 2.8925905903725355, "learning_rate": 1.564113566771764e-05, "loss": 0.9475, "step": 411 }, { "epoch": 1.491891891891892, "grad_norm": 3.2184062381528196, "learning_rate": 1.547394839795645e-05, "loss": 0.9862, "step": 414 }, { "epoch": 1.5027027027027027, "grad_norm": 3.0205819182026077, "learning_rate": 1.530670216847772e-05, "loss": 0.9689, "step": 417 }, { "epoch": 1.5135135135135136, "grad_norm": 2.886699137488658, "learning_rate": 1.5139417784901836e-05, "loss": 0.9578, "step": 420 }, { "epoch": 1.5243243243243243, "grad_norm": 3.0019029659558494, "learning_rate": 1.4972116057595592e-05, "loss": 0.9526, "step": 423 }, { "epoch": 1.535135135135135, "grad_norm": 3.141168035086649, "learning_rate": 1.480481779908337e-05, "loss": 0.9621, "step": 426 }, { "epoch": 1.545945945945946, "grad_norm": 2.842053920465437, "learning_rate": 1.463754382145802e-05, "loss": 0.9821, "step": 429 }, { "epoch": 1.5567567567567568, "grad_norm": 3.220671922556498, "learning_rate": 1.4470314933791828e-05, "loss": 0.9547, "step": 432 }, { "epoch": 1.5675675675675675, "grad_norm": 2.9122625506605586, "learning_rate": 1.430315193954783e-05, "loss": 0.9678, "step": 435 }, { "epoch": 1.5783783783783782, "grad_norm": 2.6568836209674274, "learning_rate": 1.4136075633991864e-05, "loss": 0.9566, "step": 438 }, { "epoch": 1.5891891891891892, "grad_norm": 2.7282858715379077, "learning_rate": 1.3969106801605577e-05, "loss": 0.9195, "step": 441 }, { "epoch": 1.6, "grad_norm": 2.602059773028646, "learning_rate": 1.3802266213500843e-05, "loss": 0.955, "step": 444 }, { "epoch": 1.6108108108108108, "grad_norm": 3.3786673839231423, "learning_rate": 1.3635574624835798e-05, "loss": 0.9645, "step": 447 }, { "epoch": 1.6216216216216215, "grad_norm": 2.6986089589909, "learning_rate": 1.3469052772232874e-05, "loss": 0.98, "step": 450 }, { "epoch": 1.6324324324324324, "grad_norm": 2.89235283174837, "learning_rate": 1.3302721371199165e-05, "loss": 0.9588, "step": 453 }, { "epoch": 1.6432432432432433, "grad_norm": 2.935829812088402, "learning_rate": 1.3136601113549349e-05, "loss": 0.9354, "step": 456 }, { "epoch": 1.654054054054054, "grad_norm": 2.737725384464134, "learning_rate": 1.2970712664831644e-05, "loss": 0.9574, "step": 459 }, { "epoch": 1.6648648648648647, "grad_norm": 2.867513411111901, "learning_rate": 1.2805076661756965e-05, "loss": 0.9446, "step": 462 }, { "epoch": 1.6756756756756757, "grad_norm": 2.787422977164954, "learning_rate": 1.2639713709631709e-05, "loss": 0.9558, "step": 465 }, { "epoch": 1.6864864864864866, "grad_norm": 2.9076119130942026, "learning_rate": 1.2474644379794421e-05, "loss": 0.9286, "step": 468 }, { "epoch": 1.6972972972972973, "grad_norm": 2.826910897786021, "learning_rate": 1.2309889207056708e-05, "loss": 0.9556, "step": 471 }, { "epoch": 1.708108108108108, "grad_norm": 3.027330313518151, "learning_rate": 1.2145468687148672e-05, "loss": 0.9157, "step": 474 }, { "epoch": 1.718918918918919, "grad_norm": 2.870766642781988, "learning_rate": 1.1981403274169219e-05, "loss": 0.9708, "step": 477 }, { "epoch": 1.7297297297297298, "grad_norm": 2.745031713488, "learning_rate": 1.1817713378041568e-05, "loss": 0.9404, "step": 480 }, { "epoch": 1.7405405405405405, "grad_norm": 2.8171111310049506, "learning_rate": 1.1654419361974195e-05, "loss": 0.9423, "step": 483 }, { "epoch": 1.7513513513513512, "grad_norm": 2.8549039042787503, "learning_rate": 1.1491541539927668e-05, "loss": 0.951, "step": 486 }, { "epoch": 1.7621621621621621, "grad_norm": 2.664120980356897, "learning_rate": 1.1329100174087534e-05, "loss": 0.9287, "step": 489 }, { "epoch": 1.772972972972973, "grad_norm": 2.7039305514008096, "learning_rate": 1.1167115472343693e-05, "loss": 0.9584, "step": 492 }, { "epoch": 1.7837837837837838, "grad_norm": 2.6778342659825025, "learning_rate": 1.1005607585776527e-05, "loss": 0.9151, "step": 495 }, { "epoch": 1.7945945945945945, "grad_norm": 2.6005910068753857, "learning_rate": 1.0844596606150055e-05, "loss": 0.9501, "step": 498 }, { "epoch": 1.8054054054054054, "grad_norm": 2.6765741105098364, "learning_rate": 1.0684102563412519e-05, "loss": 0.931, "step": 501 }, { "epoch": 1.8162162162162163, "grad_norm": 2.811327607536862, "learning_rate": 1.0524145423204623e-05, "loss": 0.9793, "step": 504 }, { "epoch": 1.827027027027027, "grad_norm": 2.92527323842401, "learning_rate": 1.036474508437579e-05, "loss": 0.9776, "step": 507 }, { "epoch": 1.8378378378378377, "grad_norm": 2.789416429817517, "learning_rate": 1.020592137650872e-05, "loss": 0.947, "step": 510 }, { "epoch": 1.8486486486486486, "grad_norm": 2.754589393028259, "learning_rate": 1.004769405745257e-05, "loss": 0.9685, "step": 513 }, { "epoch": 1.8594594594594596, "grad_norm": 2.593923465827381, "learning_rate": 9.890082810865046e-06, "loss": 0.9317, "step": 516 }, { "epoch": 1.8702702702702703, "grad_norm": 3.005765748087634, "learning_rate": 9.733107243763754e-06, "loss": 0.9612, "step": 519 }, { "epoch": 1.881081081081081, "grad_norm": 2.6391444135921462, "learning_rate": 9.576786884087037e-06, "loss": 0.9431, "step": 522 }, { "epoch": 1.8918918918918919, "grad_norm": 2.7958859686864823, "learning_rate": 9.421141178264702e-06, "loss": 0.9473, "step": 525 }, { "epoch": 1.9027027027027028, "grad_norm": 2.8853568858381746, "learning_rate": 9.266189488798854e-06, "loss": 0.9404, "step": 528 }, { "epoch": 1.9135135135135135, "grad_norm": 3.011863176958825, "learning_rate": 9.111951091855164e-06, "loss": 0.9424, "step": 531 }, { "epoch": 1.9243243243243242, "grad_norm": 2.624513223364359, "learning_rate": 8.95844517486492e-06, "loss": 0.9404, "step": 534 }, { "epoch": 1.9351351351351351, "grad_norm": 2.629936136635792, "learning_rate": 8.805690834138076e-06, "loss": 0.9588, "step": 537 }, { "epoch": 1.945945945945946, "grad_norm": 2.8522026479916023, "learning_rate": 8.65370707248763e-06, "loss": 0.9339, "step": 540 }, { "epoch": 1.9567567567567568, "grad_norm": 3.097766550094928, "learning_rate": 8.502512796865686e-06, "loss": 0.9394, "step": 543 }, { "epoch": 1.9675675675675675, "grad_norm": 2.749849929848188, "learning_rate": 8.352126816011382e-06, "loss": 0.9402, "step": 546 }, { "epoch": 1.9783783783783784, "grad_norm": 2.792496713680321, "learning_rate": 8.202567838111078e-06, "loss": 0.9403, "step": 549 }, { "epoch": 1.9891891891891893, "grad_norm": 2.742908628148625, "learning_rate": 8.053854468471025e-06, "loss": 0.9475, "step": 552 }, { "epoch": 2.0, "grad_norm": 2.80461985695162, "learning_rate": 7.906005207202852e-06, "loss": 0.9251, "step": 555 }, { "epoch": 2.0108108108108107, "grad_norm": 2.712256627105201, "learning_rate": 7.75903844692212e-06, "loss": 0.4979, "step": 558 }, { "epoch": 2.0216216216216214, "grad_norm": 2.2575975329190823, "learning_rate": 7.61297247046029e-06, "loss": 0.4357, "step": 561 }, { "epoch": 2.0324324324324325, "grad_norm": 2.8247101377056865, "learning_rate": 7.4678254485902675e-06, "loss": 0.4334, "step": 564 }, { "epoch": 2.0432432432432432, "grad_norm": 2.588953106795816, "learning_rate": 7.3236154377659825e-06, "loss": 0.4327, "step": 567 }, { "epoch": 2.054054054054054, "grad_norm": 2.2112157456197807, "learning_rate": 7.180360377876125e-06, "loss": 0.4301, "step": 570 }, { "epoch": 2.064864864864865, "grad_norm": 2.1916524154888903, "learning_rate": 7.038078090012406e-06, "loss": 0.4254, "step": 573 }, { "epoch": 2.075675675675676, "grad_norm": 2.092153236540328, "learning_rate": 6.896786274252595e-06, "loss": 0.4066, "step": 576 }, { "epoch": 2.0864864864864865, "grad_norm": 2.1223817949774033, "learning_rate": 6.7565025074586145e-06, "loss": 0.4018, "step": 579 }, { "epoch": 2.097297297297297, "grad_norm": 1.975599624746057, "learning_rate": 6.617244241089947e-06, "loss": 0.3899, "step": 582 }, { "epoch": 2.108108108108108, "grad_norm": 1.8893909454422486, "learning_rate": 6.479028799032664e-06, "loss": 0.397, "step": 585 }, { "epoch": 2.118918918918919, "grad_norm": 1.875678473328706, "learning_rate": 6.3418733754443136e-06, "loss": 0.407, "step": 588 }, { "epoch": 2.1297297297297297, "grad_norm": 2.210895295229888, "learning_rate": 6.205795032614943e-06, "loss": 0.4039, "step": 591 }, { "epoch": 2.1405405405405404, "grad_norm": 2.2866290593300573, "learning_rate": 6.07081069884453e-06, "loss": 0.3975, "step": 594 }, { "epoch": 2.1513513513513516, "grad_norm": 2.169724698947998, "learning_rate": 5.936937166337093e-06, "loss": 0.404, "step": 597 }, { "epoch": 2.1621621621621623, "grad_norm": 2.5885052465204503, "learning_rate": 5.804191089111711e-06, "loss": 0.4137, "step": 600 }, { "epoch": 2.172972972972973, "grad_norm": 2.1184895283704273, "learning_rate": 5.6725889809307486e-06, "loss": 0.4069, "step": 603 }, { "epoch": 2.1837837837837837, "grad_norm": 2.055767847916725, "learning_rate": 5.5421472132455285e-06, "loss": 0.4309, "step": 606 }, { "epoch": 2.1945945945945944, "grad_norm": 1.9387007802838037, "learning_rate": 5.412882013159697e-06, "loss": 0.3989, "step": 609 }, { "epoch": 2.2054054054054055, "grad_norm": 1.9041479568200537, "learning_rate": 5.284809461410556e-06, "loss": 0.4013, "step": 612 }, { "epoch": 2.2162162162162162, "grad_norm": 2.0548881191902018, "learning_rate": 5.157945490368621e-06, "loss": 0.4205, "step": 615 }, { "epoch": 2.227027027027027, "grad_norm": 2.0831599061407204, "learning_rate": 5.03230588205558e-06, "loss": 0.4122, "step": 618 }, { "epoch": 2.237837837837838, "grad_norm": 1.971310383757786, "learning_rate": 4.907906266181014e-06, "loss": 0.3837, "step": 621 }, { "epoch": 2.2486486486486488, "grad_norm": 1.977557211024187, "learning_rate": 4.784762118198041e-06, "loss": 0.3981, "step": 624 }, { "epoch": 2.2594594594594595, "grad_norm": 1.9559375507208316, "learning_rate": 4.66288875737816e-06, "loss": 0.4094, "step": 627 }, { "epoch": 2.27027027027027, "grad_norm": 1.9123345255397275, "learning_rate": 4.542301344905496e-06, "loss": 0.3863, "step": 630 }, { "epoch": 2.281081081081081, "grad_norm": 1.8524912274987262, "learning_rate": 4.423014881990751e-06, "loss": 0.3908, "step": 633 }, { "epoch": 2.291891891891892, "grad_norm": 2.0911936019239246, "learning_rate": 4.305044208005023e-06, "loss": 0.4167, "step": 636 }, { "epoch": 2.3027027027027027, "grad_norm": 1.9050565892596198, "learning_rate": 4.188403998633775e-06, "loss": 0.3955, "step": 639 }, { "epoch": 2.3135135135135134, "grad_norm": 1.8967742593703636, "learning_rate": 4.0731087640511735e-06, "loss": 0.4163, "step": 642 }, { "epoch": 2.3243243243243246, "grad_norm": 2.051977219640454, "learning_rate": 3.959172847114991e-06, "loss": 0.4024, "step": 645 }, { "epoch": 2.3351351351351353, "grad_norm": 2.0012355554132792, "learning_rate": 3.846610421582349e-06, "loss": 0.4157, "step": 648 }, { "epoch": 2.345945945945946, "grad_norm": 2.175698094340077, "learning_rate": 3.7354354903464793e-06, "loss": 0.4024, "step": 651 }, { "epoch": 2.3567567567567567, "grad_norm": 2.0526759215687362, "learning_rate": 3.625661883694753e-06, "loss": 0.3939, "step": 654 }, { "epoch": 2.3675675675675674, "grad_norm": 1.960306969771245, "learning_rate": 3.5173032575881768e-06, "loss": 0.4074, "step": 657 }, { "epoch": 2.3783783783783785, "grad_norm": 2.1570885212061826, "learning_rate": 3.4103730919625753e-06, "loss": 0.3976, "step": 660 }, { "epoch": 2.389189189189189, "grad_norm": 1.8949986677811612, "learning_rate": 3.3048846890516658e-06, "loss": 0.4, "step": 663 }, { "epoch": 2.4, "grad_norm": 1.982249761308161, "learning_rate": 3.2008511717322593e-06, "loss": 0.4133, "step": 666 }, { "epoch": 2.410810810810811, "grad_norm": 1.980047898141974, "learning_rate": 3.098285481891745e-06, "loss": 0.3939, "step": 669 }, { "epoch": 2.4216216216216218, "grad_norm": 2.0049995734478965, "learning_rate": 2.9972003788181146e-06, "loss": 0.3926, "step": 672 }, { "epoch": 2.4324324324324325, "grad_norm": 1.8400258173639734, "learning_rate": 2.8976084376126848e-06, "loss": 0.3936, "step": 675 }, { "epoch": 2.443243243243243, "grad_norm": 1.9448462664129043, "learning_rate": 2.7995220476257482e-06, "loss": 0.388, "step": 678 }, { "epoch": 2.454054054054054, "grad_norm": 1.9031160601187072, "learning_rate": 2.7029534109153186e-06, "loss": 0.3909, "step": 681 }, { "epoch": 2.464864864864865, "grad_norm": 2.279997846004982, "learning_rate": 2.6079145407291877e-06, "loss": 0.3895, "step": 684 }, { "epoch": 2.4756756756756757, "grad_norm": 1.8403404089990134, "learning_rate": 2.514417260010455e-06, "loss": 0.3976, "step": 687 }, { "epoch": 2.4864864864864864, "grad_norm": 1.7969451736164892, "learning_rate": 2.4224731999267425e-06, "loss": 0.3999, "step": 690 }, { "epoch": 2.4972972972972975, "grad_norm": 1.9253974055183771, "learning_rate": 2.3320937984232664e-06, "loss": 0.3939, "step": 693 }, { "epoch": 2.5081081081081082, "grad_norm": 1.913704985114193, "learning_rate": 2.243290298799945e-06, "loss": 0.3984, "step": 696 }, { "epoch": 2.518918918918919, "grad_norm": 1.9790173152408796, "learning_rate": 2.156073748312721e-06, "loss": 0.3819, "step": 699 }, { "epoch": 2.5297297297297296, "grad_norm": 2.276492968512024, "learning_rate": 2.070454996799261e-06, "loss": 0.4039, "step": 702 }, { "epoch": 2.5405405405405403, "grad_norm": 1.7739681476430693, "learning_rate": 1.9864446953292313e-06, "loss": 0.3791, "step": 705 }, { "epoch": 2.5513513513513515, "grad_norm": 2.0336913560870196, "learning_rate": 1.9040532948792934e-06, "loss": 0.3847, "step": 708 }, { "epoch": 2.562162162162162, "grad_norm": 1.8946193097351467, "learning_rate": 1.8232910450329832e-06, "loss": 0.385, "step": 711 }, { "epoch": 2.572972972972973, "grad_norm": 1.9817636629737283, "learning_rate": 1.744167992705664e-06, "loss": 0.3914, "step": 714 }, { "epoch": 2.583783783783784, "grad_norm": 1.8202147731376643, "learning_rate": 1.6666939808946619e-06, "loss": 0.377, "step": 717 }, { "epoch": 2.5945945945945947, "grad_norm": 1.804624257938459, "learning_rate": 1.5908786474548004e-06, "loss": 0.3834, "step": 720 }, { "epoch": 2.6054054054054054, "grad_norm": 1.9478831371089558, "learning_rate": 1.5167314238994367e-06, "loss": 0.3802, "step": 723 }, { "epoch": 2.616216216216216, "grad_norm": 1.8418242757562502, "learning_rate": 1.4442615342271625e-06, "loss": 0.3742, "step": 726 }, { "epoch": 2.627027027027027, "grad_norm": 1.8235265917309187, "learning_rate": 1.3734779937743403e-06, "loss": 0.3763, "step": 729 }, { "epoch": 2.637837837837838, "grad_norm": 1.8148562882498185, "learning_rate": 1.3043896080935785e-06, "loss": 0.3764, "step": 732 }, { "epoch": 2.6486486486486487, "grad_norm": 1.9162200026873921, "learning_rate": 1.237004971858307e-06, "loss": 0.4009, "step": 735 }, { "epoch": 2.6594594594594594, "grad_norm": 1.971484443435529, "learning_rate": 1.1713324677936015e-06, "loss": 0.3894, "step": 738 }, { "epoch": 2.6702702702702705, "grad_norm": 2.6288039366865883, "learning_rate": 1.1073802656333548e-06, "loss": 0.3736, "step": 741 }, { "epoch": 2.6810810810810812, "grad_norm": 1.8111148825496188, "learning_rate": 1.0451563211039494e-06, "loss": 0.3996, "step": 744 }, { "epoch": 2.691891891891892, "grad_norm": 1.7806298978071708, "learning_rate": 9.846683749345648e-07, "loss": 0.383, "step": 747 }, { "epoch": 2.7027027027027026, "grad_norm": 4.0497002081385185, "learning_rate": 9.25923951894222e-07, "loss": 0.3965, "step": 750 }, { "epoch": 2.7135135135135133, "grad_norm": 1.8334211425058837, "learning_rate": 8.68930359855683e-07, "loss": 0.3989, "step": 753 }, { "epoch": 2.7243243243243245, "grad_norm": 1.7500539556657924, "learning_rate": 8.136946888863528e-07, "loss": 0.395, "step": 756 }, { "epoch": 2.735135135135135, "grad_norm": 1.9130969263501059, "learning_rate": 7.602238103662646e-07, "loss": 0.3853, "step": 759 }, { "epoch": 2.745945945945946, "grad_norm": 1.8404236110308207, "learning_rate": 7.085243761332738e-07, "loss": 0.393, "step": 762 }, { "epoch": 2.756756756756757, "grad_norm": 1.7456858490902225, "learning_rate": 6.586028176555536e-07, "loss": 0.3944, "step": 765 }, { "epoch": 2.7675675675675677, "grad_norm": 1.837065449202062, "learning_rate": 6.104653452315279e-07, "loss": 0.3798, "step": 768 }, { "epoch": 2.7783783783783784, "grad_norm": 2.3308657348413058, "learning_rate": 5.641179472172875e-07, "loss": 0.3798, "step": 771 }, { "epoch": 2.789189189189189, "grad_norm": 1.7969746620946272, "learning_rate": 5.195663892816432e-07, "loss": 0.3817, "step": 774 }, { "epoch": 2.8, "grad_norm": 1.8403934823419463, "learning_rate": 4.768162136888643e-07, "loss": 0.3791, "step": 777 }, { "epoch": 2.810810810810811, "grad_norm": 1.8084423900988431, "learning_rate": 4.3587273860921985e-07, "loss": 0.3613, "step": 780 }, { "epoch": 2.8216216216216217, "grad_norm": 1.8704402298319724, "learning_rate": 3.9674105745738155e-07, "loss": 0.3771, "step": 783 }, { "epoch": 2.8324324324324324, "grad_norm": 1.818023687371634, "learning_rate": 3.594260382588105e-07, "loss": 0.3888, "step": 786 }, { "epoch": 2.8432432432432435, "grad_norm": 1.8608896733650853, "learning_rate": 3.239323230441615e-07, "loss": 0.3888, "step": 789 }, { "epoch": 2.854054054054054, "grad_norm": 1.938515453919976, "learning_rate": 2.902643272718086e-07, "loss": 0.4002, "step": 792 }, { "epoch": 2.864864864864865, "grad_norm": 1.94145103701424, "learning_rate": 2.5842623927856244e-07, "loss": 0.3858, "step": 795 }, { "epoch": 2.8756756756756756, "grad_norm": 1.7260066637899822, "learning_rate": 2.28422019758629e-07, "loss": 0.3905, "step": 798 }, { "epoch": 2.8864864864864863, "grad_norm": 1.8230164360318986, "learning_rate": 2.0025540127090513e-07, "loss": 0.3977, "step": 801 }, { "epoch": 2.8972972972972975, "grad_norm": 1.7780456307114303, "learning_rate": 1.7392988777463202e-07, "loss": 0.3881, "step": 804 }, { "epoch": 2.908108108108108, "grad_norm": 1.9497490854182644, "learning_rate": 1.4944875419350855e-07, "loss": 0.3797, "step": 807 }, { "epoch": 2.918918918918919, "grad_norm": 1.6524262781562993, "learning_rate": 1.268150460082823e-07, "loss": 0.3645, "step": 810 }, { "epoch": 2.92972972972973, "grad_norm": 1.8325737906994488, "learning_rate": 1.0603157887788428e-07, "loss": 0.3574, "step": 813 }, { "epoch": 2.9405405405405407, "grad_norm": 1.8188448217016162, "learning_rate": 8.710093828917076e-08, "loss": 0.3829, "step": 816 }, { "epoch": 2.9513513513513514, "grad_norm": 1.774843520134497, "learning_rate": 7.002547923527058e-08, "loss": 0.3945, "step": 819 }, { "epoch": 2.962162162162162, "grad_norm": 1.7417401019158905, "learning_rate": 5.4807325922632825e-08, "loss": 0.37, "step": 822 }, { "epoch": 2.972972972972973, "grad_norm": 1.6997699000548114, "learning_rate": 4.14483715067665e-08, "loss": 0.3702, "step": 825 }, { "epoch": 2.983783783783784, "grad_norm": 1.7185773019727228, "learning_rate": 2.995027785673066e-08, "loss": 0.3829, "step": 828 }, { "epoch": 2.9945945945945946, "grad_norm": 1.7433824271169698, "learning_rate": 2.0314475348401362e-08, "loss": 0.3777, "step": 831 } ], "logging_steps": 3, "max_steps": 845, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 833, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 232851391348736.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }