{ "best_metric": 0.6910951733589172, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.018067977289942392, "eval_steps": 50, "global_step": 650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.779688813837291e-05, "grad_norm": 0.04725373163819313, "learning_rate": 0.0002, "loss": 1.23, "step": 1 }, { "epoch": 2.779688813837291e-05, "eval_loss": 1.0243470668792725, "eval_runtime": 50.0777, "eval_samples_per_second": 11.522, "eval_steps_per_second": 5.771, "step": 1 }, { "epoch": 5.559377627674582e-05, "grad_norm": 0.04354099556803703, "learning_rate": 0.0004, "loss": 1.0633, "step": 2 }, { "epoch": 8.339066441511873e-05, "grad_norm": 0.04554685205221176, "learning_rate": 0.0006, "loss": 1.1887, "step": 3 }, { "epoch": 0.00011118755255349164, "grad_norm": 0.04557386413216591, "learning_rate": 0.0008, "loss": 1.121, "step": 4 }, { "epoch": 0.00013898444069186455, "grad_norm": 0.05429883301258087, "learning_rate": 0.001, "loss": 1.2029, "step": 5 }, { "epoch": 0.00016678132883023747, "grad_norm": 0.058254778385162354, "learning_rate": 0.0012, "loss": 0.8706, "step": 6 }, { "epoch": 0.00019457821696861036, "grad_norm": 0.09290671348571777, "learning_rate": 0.0014, "loss": 1.0657, "step": 7 }, { "epoch": 0.00022237510510698328, "grad_norm": 0.05990983918309212, "learning_rate": 0.0016, "loss": 0.8619, "step": 8 }, { "epoch": 0.0002501719932453562, "grad_norm": 0.060997381806373596, "learning_rate": 0.0018000000000000002, "loss": 1.1914, "step": 9 }, { "epoch": 0.0002779688813837291, "grad_norm": 0.05487683787941933, "learning_rate": 0.002, "loss": 0.8724, "step": 10 }, { "epoch": 0.000305765769522102, "grad_norm": 0.051148172467947006, "learning_rate": 0.001999979446958366, "loss": 0.8865, "step": 11 }, { "epoch": 0.00033356265766047494, "grad_norm": 0.07391902059316635, "learning_rate": 0.001999917788678319, "loss": 0.994, "step": 12 }, { "epoch": 0.0003613595457988478, "grad_norm": 0.061822760850191116, "learning_rate": 0.00199981502769439, "loss": 0.9617, "step": 13 }, { "epoch": 0.0003891564339372207, "grad_norm": 0.06005921587347984, "learning_rate": 0.00199967116823068, "loss": 0.736, "step": 14 }, { "epoch": 0.00041695332207559364, "grad_norm": 0.06111058592796326, "learning_rate": 0.001999486216200688, "loss": 0.8405, "step": 15 }, { "epoch": 0.00044475021021396656, "grad_norm": 0.06315408647060394, "learning_rate": 0.0019992601792070677, "loss": 0.7523, "step": 16 }, { "epoch": 0.00047254709835233943, "grad_norm": 0.06354085355997086, "learning_rate": 0.0019989930665413147, "loss": 0.8865, "step": 17 }, { "epoch": 0.0005003439864907124, "grad_norm": 0.06111137568950653, "learning_rate": 0.0019986848891833846, "loss": 0.8308, "step": 18 }, { "epoch": 0.0005281408746290853, "grad_norm": 0.0758710503578186, "learning_rate": 0.001998335659801241, "loss": 1.0734, "step": 19 }, { "epoch": 0.0005559377627674582, "grad_norm": 0.056908175349235535, "learning_rate": 0.0019979453927503363, "loss": 0.8955, "step": 20 }, { "epoch": 0.0005837346509058311, "grad_norm": 0.06023595482110977, "learning_rate": 0.0019975141040730207, "loss": 0.8492, "step": 21 }, { "epoch": 0.000611531539044204, "grad_norm": 0.05432360619306564, "learning_rate": 0.001997041811497882, "loss": 0.8083, "step": 22 }, { "epoch": 0.000639328427182577, "grad_norm": 0.05626050382852554, "learning_rate": 0.0019965285344390182, "loss": 0.9985, "step": 23 }, { "epoch": 0.0006671253153209499, "grad_norm": 0.06779271364212036, "learning_rate": 0.0019959742939952394, "loss": 0.9792, "step": 24 }, { "epoch": 0.0006949222034593227, "grad_norm": 0.05818340182304382, "learning_rate": 0.0019953791129491984, "loss": 0.8439, "step": 25 }, { "epoch": 0.0007227190915976956, "grad_norm": 0.08297158777713776, "learning_rate": 0.0019947430157664574, "loss": 0.6787, "step": 26 }, { "epoch": 0.0007505159797360685, "grad_norm": 0.06646459549665451, "learning_rate": 0.00199406602859448, "loss": 0.7956, "step": 27 }, { "epoch": 0.0007783128678744414, "grad_norm": 0.06349553912878036, "learning_rate": 0.001993348179261558, "loss": 0.8621, "step": 28 }, { "epoch": 0.0008061097560128144, "grad_norm": 0.06375010311603546, "learning_rate": 0.001992589497275665, "loss": 0.7232, "step": 29 }, { "epoch": 0.0008339066441511873, "grad_norm": 0.07310320436954498, "learning_rate": 0.001991790013823246, "loss": 0.7295, "step": 30 }, { "epoch": 0.0008617035322895602, "grad_norm": 0.07351308315992355, "learning_rate": 0.001990949761767935, "loss": 0.6556, "step": 31 }, { "epoch": 0.0008895004204279331, "grad_norm": 0.0795268714427948, "learning_rate": 0.0019900687756492018, "loss": 0.8927, "step": 32 }, { "epoch": 0.000917297308566306, "grad_norm": 0.06989472359418869, "learning_rate": 0.0019891470916809364, "loss": 0.6571, "step": 33 }, { "epoch": 0.0009450941967046789, "grad_norm": 0.07375137507915497, "learning_rate": 0.0019881847477499557, "loss": 0.6175, "step": 34 }, { "epoch": 0.0009728910848430518, "grad_norm": 0.07746679335832596, "learning_rate": 0.0019871817834144503, "loss": 0.6992, "step": 35 }, { "epoch": 0.0010006879729814247, "grad_norm": 0.07618487626314163, "learning_rate": 0.001986138239902355, "loss": 0.625, "step": 36 }, { "epoch": 0.0010284848611197977, "grad_norm": 0.0905221700668335, "learning_rate": 0.001985054160109657, "loss": 0.6513, "step": 37 }, { "epoch": 0.0010562817492581705, "grad_norm": 0.09803163260221481, "learning_rate": 0.0019839295885986296, "loss": 0.7122, "step": 38 }, { "epoch": 0.0010840786373965434, "grad_norm": 0.08399751782417297, "learning_rate": 0.0019827645715960037, "loss": 0.5943, "step": 39 }, { "epoch": 0.0011118755255349164, "grad_norm": 0.09010937064886093, "learning_rate": 0.0019815591569910655, "loss": 0.7233, "step": 40 }, { "epoch": 0.0011396724136732892, "grad_norm": 0.09423992037773132, "learning_rate": 0.0019803133943336873, "loss": 0.7266, "step": 41 }, { "epoch": 0.0011674693018116622, "grad_norm": 0.09580416232347488, "learning_rate": 0.001979027334832293, "loss": 0.7426, "step": 42 }, { "epoch": 0.001195266189950035, "grad_norm": 0.08860920369625092, "learning_rate": 0.0019777010313517516, "loss": 0.6335, "step": 43 }, { "epoch": 0.001223063078088408, "grad_norm": 0.0893678069114685, "learning_rate": 0.0019763345384112042, "loss": 0.6863, "step": 44 }, { "epoch": 0.0012508599662267809, "grad_norm": 0.08261829614639282, "learning_rate": 0.0019749279121818236, "loss": 0.5307, "step": 45 }, { "epoch": 0.001278656854365154, "grad_norm": 0.09626446664333344, "learning_rate": 0.001973481210484505, "loss": 0.6697, "step": 46 }, { "epoch": 0.0013064537425035267, "grad_norm": 0.127803772687912, "learning_rate": 0.001971994492787488, "loss": 0.6944, "step": 47 }, { "epoch": 0.0013342506306418997, "grad_norm": 0.1326465904712677, "learning_rate": 0.001970467820203915, "loss": 0.6792, "step": 48 }, { "epoch": 0.0013620475187802726, "grad_norm": 0.14524292945861816, "learning_rate": 0.0019689012554893154, "loss": 0.6784, "step": 49 }, { "epoch": 0.0013898444069186454, "grad_norm": 0.22971156239509583, "learning_rate": 0.0019672948630390296, "loss": 0.8101, "step": 50 }, { "epoch": 0.0013898444069186454, "eval_loss": 0.9230208396911621, "eval_runtime": 49.9615, "eval_samples_per_second": 11.549, "eval_steps_per_second": 5.784, "step": 50 }, { "epoch": 0.0014176412950570184, "grad_norm": 0.1536542773246765, "learning_rate": 0.001965648708885559, "loss": 1.0933, "step": 51 }, { "epoch": 0.0014454381831953912, "grad_norm": 0.1058596670627594, "learning_rate": 0.001963962860695853, "loss": 1.0704, "step": 52 }, { "epoch": 0.0014732350713337642, "grad_norm": 0.08224259316921234, "learning_rate": 0.001962237387768529, "loss": 1.1229, "step": 53 }, { "epoch": 0.001501031959472137, "grad_norm": 0.08697132021188736, "learning_rate": 0.0019604723610310193, "loss": 0.9118, "step": 54 }, { "epoch": 0.00152882884761051, "grad_norm": 0.08103405684232712, "learning_rate": 0.0019586678530366607, "loss": 0.946, "step": 55 }, { "epoch": 0.0015566257357488829, "grad_norm": 0.07975370436906815, "learning_rate": 0.0019568239379617086, "loss": 0.9038, "step": 56 }, { "epoch": 0.001584422623887256, "grad_norm": 0.07595320045948029, "learning_rate": 0.0019549406916022907, "loss": 0.8122, "step": 57 }, { "epoch": 0.0016122195120256287, "grad_norm": 0.08768190443515778, "learning_rate": 0.0019530181913712872, "loss": 0.8606, "step": 58 }, { "epoch": 0.0016400164001640015, "grad_norm": 0.09886912256479263, "learning_rate": 0.0019510565162951536, "loss": 0.8902, "step": 59 }, { "epoch": 0.0016678132883023746, "grad_norm": 0.08801861107349396, "learning_rate": 0.0019490557470106687, "loss": 0.9151, "step": 60 }, { "epoch": 0.0016956101764407474, "grad_norm": 0.08995132893323898, "learning_rate": 0.0019470159657616214, "loss": 0.7793, "step": 61 }, { "epoch": 0.0017234070645791204, "grad_norm": 0.08425740152597427, "learning_rate": 0.0019449372563954293, "loss": 0.7646, "step": 62 }, { "epoch": 0.0017512039527174932, "grad_norm": 0.11669428646564484, "learning_rate": 0.001942819704359693, "loss": 0.9056, "step": 63 }, { "epoch": 0.0017790008408558663, "grad_norm": 0.08303668349981308, "learning_rate": 0.0019406633966986826, "loss": 0.7583, "step": 64 }, { "epoch": 0.001806797728994239, "grad_norm": 0.08263064175844193, "learning_rate": 0.0019384684220497604, "loss": 0.9233, "step": 65 }, { "epoch": 0.001834594617132612, "grad_norm": 0.08262008428573608, "learning_rate": 0.0019362348706397372, "loss": 0.8359, "step": 66 }, { "epoch": 0.001862391505270985, "grad_norm": 0.10420376062393188, "learning_rate": 0.0019339628342811633, "loss": 0.9978, "step": 67 }, { "epoch": 0.0018901883934093577, "grad_norm": 0.0830477699637413, "learning_rate": 0.001931652406368554, "loss": 0.8834, "step": 68 }, { "epoch": 0.0019179852815477307, "grad_norm": 0.08504804968833923, "learning_rate": 0.0019293036818745519, "loss": 0.9164, "step": 69 }, { "epoch": 0.0019457821696861036, "grad_norm": 0.08910652250051498, "learning_rate": 0.0019269167573460217, "loss": 0.9095, "step": 70 }, { "epoch": 0.0019735790578244766, "grad_norm": 0.09257230162620544, "learning_rate": 0.0019244917309000815, "loss": 0.7138, "step": 71 }, { "epoch": 0.0020013759459628494, "grad_norm": 0.09553885459899902, "learning_rate": 0.0019220287022200706, "loss": 0.9544, "step": 72 }, { "epoch": 0.002029172834101222, "grad_norm": 0.08890817314386368, "learning_rate": 0.0019195277725514508, "loss": 0.7013, "step": 73 }, { "epoch": 0.0020569697222395955, "grad_norm": 0.10616549849510193, "learning_rate": 0.0019169890446976451, "loss": 0.7119, "step": 74 }, { "epoch": 0.0020847666103779683, "grad_norm": 0.09758912026882172, "learning_rate": 0.0019144126230158124, "loss": 0.811, "step": 75 }, { "epoch": 0.002112563498516341, "grad_norm": 0.09248580783605576, "learning_rate": 0.001911798613412557, "loss": 0.8025, "step": 76 }, { "epoch": 0.002140360386654714, "grad_norm": 0.09431200474500656, "learning_rate": 0.001909147123339575, "loss": 0.7038, "step": 77 }, { "epoch": 0.0021681572747930867, "grad_norm": 0.09258091449737549, "learning_rate": 0.001906458261789238, "loss": 0.7752, "step": 78 }, { "epoch": 0.00219595416293146, "grad_norm": 0.08860747516155243, "learning_rate": 0.0019037321392901135, "loss": 0.6832, "step": 79 }, { "epoch": 0.0022237510510698328, "grad_norm": 0.10791260004043579, "learning_rate": 0.001900968867902419, "loss": 0.7183, "step": 80 }, { "epoch": 0.0022515479392082056, "grad_norm": 0.0878261998295784, "learning_rate": 0.001898168561213419, "loss": 0.6677, "step": 81 }, { "epoch": 0.0022793448273465784, "grad_norm": 0.10915020108222961, "learning_rate": 0.0018953313343327532, "loss": 0.8602, "step": 82 }, { "epoch": 0.0023071417154849516, "grad_norm": 0.10625939816236496, "learning_rate": 0.001892457303887706, "loss": 0.8385, "step": 83 }, { "epoch": 0.0023349386036233244, "grad_norm": 0.10215223580598831, "learning_rate": 0.001889546588018412, "loss": 0.7723, "step": 84 }, { "epoch": 0.0023627354917616973, "grad_norm": 0.08778225630521774, "learning_rate": 0.0018865993063730002, "loss": 0.6503, "step": 85 }, { "epoch": 0.00239053237990007, "grad_norm": 0.10662350058555603, "learning_rate": 0.0018836155801026753, "loss": 0.6592, "step": 86 }, { "epoch": 0.002418329268038443, "grad_norm": 0.10347293317317963, "learning_rate": 0.001880595531856738, "loss": 0.602, "step": 87 }, { "epoch": 0.002446126156176816, "grad_norm": 0.11098446696996689, "learning_rate": 0.001877539285777543, "loss": 0.7291, "step": 88 }, { "epoch": 0.002473923044315189, "grad_norm": 0.10774262994527817, "learning_rate": 0.0018744469674953957, "loss": 0.6501, "step": 89 }, { "epoch": 0.0025017199324535618, "grad_norm": 0.10596223175525665, "learning_rate": 0.0018713187041233894, "loss": 0.7274, "step": 90 }, { "epoch": 0.0025295168205919346, "grad_norm": 0.11689383536577225, "learning_rate": 0.0018681546242521785, "loss": 0.6693, "step": 91 }, { "epoch": 0.002557313708730308, "grad_norm": 0.11212435364723206, "learning_rate": 0.0018649548579446936, "loss": 0.6218, "step": 92 }, { "epoch": 0.0025851105968686806, "grad_norm": 0.13619789481163025, "learning_rate": 0.0018617195367307952, "loss": 0.5839, "step": 93 }, { "epoch": 0.0026129074850070534, "grad_norm": 0.18084552884101868, "learning_rate": 0.001858448793601866, "loss": 0.6083, "step": 94 }, { "epoch": 0.0026407043731454262, "grad_norm": 0.14780890941619873, "learning_rate": 0.0018551427630053464, "loss": 0.6095, "step": 95 }, { "epoch": 0.0026685012612837995, "grad_norm": 0.12189039587974548, "learning_rate": 0.0018518015808392043, "loss": 0.6313, "step": 96 }, { "epoch": 0.0026962981494221723, "grad_norm": 0.2006332129240036, "learning_rate": 0.0018484253844463525, "loss": 0.6528, "step": 97 }, { "epoch": 0.002724095037560545, "grad_norm": 0.19439570605754852, "learning_rate": 0.0018450143126090013, "loss": 0.655, "step": 98 }, { "epoch": 0.002751891925698918, "grad_norm": 0.23627929389476776, "learning_rate": 0.0018415685055429532, "loss": 0.6663, "step": 99 }, { "epoch": 0.0027796888138372907, "grad_norm": 0.22060082852840424, "learning_rate": 0.0018380881048918405, "loss": 0.6549, "step": 100 }, { "epoch": 0.0027796888138372907, "eval_loss": 1.0665974617004395, "eval_runtime": 49.9362, "eval_samples_per_second": 11.555, "eval_steps_per_second": 5.787, "step": 100 }, { "epoch": 0.002807485701975664, "grad_norm": 1.510432481765747, "learning_rate": 0.0018345732537213026, "loss": 1.4424, "step": 101 }, { "epoch": 0.002835282590114037, "grad_norm": 0.1634325534105301, "learning_rate": 0.001831024096513104, "loss": 0.9927, "step": 102 }, { "epoch": 0.0028630794782524096, "grad_norm": 0.11758747696876526, "learning_rate": 0.0018274407791591964, "loss": 1.0068, "step": 103 }, { "epoch": 0.0028908763663907824, "grad_norm": 0.13593055307865143, "learning_rate": 0.0018238234489557216, "loss": 1.0023, "step": 104 }, { "epoch": 0.0029186732545291557, "grad_norm": 0.1236187294125557, "learning_rate": 0.0018201722545969558, "loss": 0.8913, "step": 105 }, { "epoch": 0.0029464701426675285, "grad_norm": 0.1045624166727066, "learning_rate": 0.0018164873461691987, "loss": 0.952, "step": 106 }, { "epoch": 0.0029742670308059013, "grad_norm": 0.1336205154657364, "learning_rate": 0.0018127688751446028, "loss": 0.9185, "step": 107 }, { "epoch": 0.003002063918944274, "grad_norm": 0.20375819504261017, "learning_rate": 0.0018090169943749475, "loss": 0.7956, "step": 108 }, { "epoch": 0.003029860807082647, "grad_norm": 0.11180537939071655, "learning_rate": 0.0018052318580853563, "loss": 0.6942, "step": 109 }, { "epoch": 0.00305765769522102, "grad_norm": 0.3445965349674225, "learning_rate": 0.0018014136218679566, "loss": 0.8457, "step": 110 }, { "epoch": 0.003085454583359393, "grad_norm": 0.19809404015541077, "learning_rate": 0.0017975624426754845, "loss": 0.689, "step": 111 }, { "epoch": 0.0031132514714977658, "grad_norm": 0.4088907837867737, "learning_rate": 0.0017936784788148326, "loss": 0.887, "step": 112 }, { "epoch": 0.0031410483596361386, "grad_norm": 0.757510781288147, "learning_rate": 0.0017897618899405424, "loss": 0.8129, "step": 113 }, { "epoch": 0.003168845247774512, "grad_norm": 0.1284535676240921, "learning_rate": 0.0017858128370482425, "loss": 0.9147, "step": 114 }, { "epoch": 0.0031966421359128847, "grad_norm": 1.1778861284255981, "learning_rate": 0.00178183148246803, "loss": 0.8639, "step": 115 }, { "epoch": 0.0032244390240512575, "grad_norm": 0.24793654680252075, "learning_rate": 0.0017778179898577974, "loss": 0.9027, "step": 116 }, { "epoch": 0.0032522359121896303, "grad_norm": 0.17602455615997314, "learning_rate": 0.0017737725241965068, "loss": 1.0078, "step": 117 }, { "epoch": 0.003280032800328003, "grad_norm": 0.18490758538246155, "learning_rate": 0.0017696952517774062, "loss": 0.7916, "step": 118 }, { "epoch": 0.0033078296884663763, "grad_norm": 0.17348746955394745, "learning_rate": 0.0017655863402011947, "loss": 0.9932, "step": 119 }, { "epoch": 0.003335626576604749, "grad_norm": 0.22050045430660248, "learning_rate": 0.0017614459583691344, "loss": 0.8613, "step": 120 }, { "epoch": 0.003363423464743122, "grad_norm": 0.12003365904092789, "learning_rate": 0.0017572742764761053, "loss": 0.9392, "step": 121 }, { "epoch": 0.0033912203528814948, "grad_norm": 0.10987789183855057, "learning_rate": 0.001753071466003611, "loss": 0.7309, "step": 122 }, { "epoch": 0.003419017241019868, "grad_norm": 0.11258568614721298, "learning_rate": 0.0017488376997127281, "loss": 0.7541, "step": 123 }, { "epoch": 0.003446814129158241, "grad_norm": 0.14257432520389557, "learning_rate": 0.0017445731516370071, "loss": 0.7806, "step": 124 }, { "epoch": 0.0034746110172966136, "grad_norm": 0.1824330985546112, "learning_rate": 0.0017402779970753155, "loss": 0.8882, "step": 125 }, { "epoch": 0.0035024079054349865, "grad_norm": 0.1172509491443634, "learning_rate": 0.001735952412584635, "loss": 0.8695, "step": 126 }, { "epoch": 0.0035302047935733593, "grad_norm": 0.1342688351869583, "learning_rate": 0.0017315965759728013, "loss": 0.6943, "step": 127 }, { "epoch": 0.0035580016817117325, "grad_norm": 0.11804018169641495, "learning_rate": 0.0017272106662911972, "loss": 0.6733, "step": 128 }, { "epoch": 0.0035857985698501053, "grad_norm": 0.1424594670534134, "learning_rate": 0.0017227948638273915, "loss": 0.8569, "step": 129 }, { "epoch": 0.003613595457988478, "grad_norm": 0.10879236459732056, "learning_rate": 0.0017183493500977276, "loss": 0.62, "step": 130 }, { "epoch": 0.003641392346126851, "grad_norm": 0.14222201704978943, "learning_rate": 0.0017138743078398632, "loss": 0.7501, "step": 131 }, { "epoch": 0.003669189234265224, "grad_norm": 0.11895573884248734, "learning_rate": 0.0017093699210052578, "loss": 0.7722, "step": 132 }, { "epoch": 0.003696986122403597, "grad_norm": 0.18408416211605072, "learning_rate": 0.0017048363747516118, "loss": 0.8163, "step": 133 }, { "epoch": 0.00372478301054197, "grad_norm": 0.13170836865901947, "learning_rate": 0.001700273855435255, "loss": 0.7529, "step": 134 }, { "epoch": 0.0037525798986803426, "grad_norm": 0.2225480079650879, "learning_rate": 0.0016956825506034864, "loss": 0.7412, "step": 135 }, { "epoch": 0.0037803767868187154, "grad_norm": 0.14438864588737488, "learning_rate": 0.0016910626489868648, "loss": 0.7257, "step": 136 }, { "epoch": 0.0038081736749570887, "grad_norm": 0.14967116713523865, "learning_rate": 0.0016864143404914506, "loss": 0.7663, "step": 137 }, { "epoch": 0.0038359705630954615, "grad_norm": 0.12598782777786255, "learning_rate": 0.0016817378161909995, "loss": 0.7768, "step": 138 }, { "epoch": 0.0038637674512338343, "grad_norm": 0.1321249157190323, "learning_rate": 0.0016770332683191096, "loss": 0.5741, "step": 139 }, { "epoch": 0.003891564339372207, "grad_norm": 0.1356271356344223, "learning_rate": 0.0016723008902613168, "loss": 0.628, "step": 140 }, { "epoch": 0.00391936122751058, "grad_norm": 0.1559610813856125, "learning_rate": 0.0016675408765471481, "loss": 0.5979, "step": 141 }, { "epoch": 0.003947158115648953, "grad_norm": 0.14019130170345306, "learning_rate": 0.001662753422842123, "loss": 0.6001, "step": 142 }, { "epoch": 0.003974955003787326, "grad_norm": 0.3218002915382385, "learning_rate": 0.0016579387259397127, "loss": 0.6655, "step": 143 }, { "epoch": 0.004002751891925699, "grad_norm": 0.14642581343650818, "learning_rate": 0.0016530969837532485, "loss": 0.6712, "step": 144 }, { "epoch": 0.004030548780064072, "grad_norm": 0.30584490299224854, "learning_rate": 0.0016482283953077885, "loss": 0.7566, "step": 145 }, { "epoch": 0.004058345668202444, "grad_norm": 0.17027173936367035, "learning_rate": 0.0016433331607319341, "loss": 0.7187, "step": 146 }, { "epoch": 0.004086142556340817, "grad_norm": 0.22963419556617737, "learning_rate": 0.0016384114812496057, "loss": 0.732, "step": 147 }, { "epoch": 0.004113939444479191, "grad_norm": 0.6273247599601746, "learning_rate": 0.0016334635591717702, "loss": 0.6352, "step": 148 }, { "epoch": 0.004141736332617564, "grad_norm": 0.14807386696338654, "learning_rate": 0.0016284895978881237, "loss": 0.6705, "step": 149 }, { "epoch": 0.0041695332207559365, "grad_norm": 0.19606240093708038, "learning_rate": 0.0016234898018587336, "loss": 0.7171, "step": 150 }, { "epoch": 0.0041695332207559365, "eval_loss": 0.8552482724189758, "eval_runtime": 50.0085, "eval_samples_per_second": 11.538, "eval_steps_per_second": 5.779, "step": 150 }, { "epoch": 0.004197330108894309, "grad_norm": 0.4086618423461914, "learning_rate": 0.0016184643766056315, "loss": 1.0707, "step": 151 }, { "epoch": 0.004225126997032682, "grad_norm": 0.2321600317955017, "learning_rate": 0.0016134135287043667, "loss": 1.0719, "step": 152 }, { "epoch": 0.004252923885171055, "grad_norm": 0.09448766708374023, "learning_rate": 0.0016083374657755133, "loss": 1.035, "step": 153 }, { "epoch": 0.004280720773309428, "grad_norm": 0.0990653857588768, "learning_rate": 0.0016032363964761363, "loss": 0.9306, "step": 154 }, { "epoch": 0.004308517661447801, "grad_norm": 0.09489451348781586, "learning_rate": 0.001598110530491216, "loss": 0.8499, "step": 155 }, { "epoch": 0.004336314549586173, "grad_norm": 0.09735696017742157, "learning_rate": 0.0015929600785250257, "loss": 0.8704, "step": 156 }, { "epoch": 0.004364111437724547, "grad_norm": 0.1042320653796196, "learning_rate": 0.0015877852522924731, "loss": 0.8512, "step": 157 }, { "epoch": 0.00439190832586292, "grad_norm": 0.12351106107234955, "learning_rate": 0.0015825862645103962, "loss": 0.9413, "step": 158 }, { "epoch": 0.004419705214001293, "grad_norm": 0.09481119364500046, "learning_rate": 0.0015773633288888196, "loss": 0.8296, "step": 159 }, { "epoch": 0.0044475021021396655, "grad_norm": 0.105812206864357, "learning_rate": 0.0015721166601221697, "loss": 0.7714, "step": 160 }, { "epoch": 0.004475298990278038, "grad_norm": 0.11305416375398636, "learning_rate": 0.00156684647388045, "loss": 0.9412, "step": 161 }, { "epoch": 0.004503095878416411, "grad_norm": 0.10926243662834167, "learning_rate": 0.0015615529868003748, "loss": 0.8682, "step": 162 }, { "epoch": 0.004530892766554784, "grad_norm": 0.10087363421916962, "learning_rate": 0.0015562364164764648, "loss": 0.8325, "step": 163 }, { "epoch": 0.004558689654693157, "grad_norm": 0.09206137806177139, "learning_rate": 0.0015508969814521025, "loss": 0.7958, "step": 164 }, { "epoch": 0.00458648654283153, "grad_norm": 0.11067818850278854, "learning_rate": 0.0015455349012105486, "loss": 1.0522, "step": 165 }, { "epoch": 0.004614283430969903, "grad_norm": 0.11335323750972748, "learning_rate": 0.0015401503961659203, "loss": 0.9374, "step": 166 }, { "epoch": 0.004642080319108276, "grad_norm": 0.1060660108923912, "learning_rate": 0.0015347436876541297, "loss": 0.9861, "step": 167 }, { "epoch": 0.004669877207246649, "grad_norm": 0.12663975358009338, "learning_rate": 0.0015293149979237874, "loss": 0.7834, "step": 168 }, { "epoch": 0.004697674095385022, "grad_norm": 0.1315016895532608, "learning_rate": 0.0015238645501270654, "loss": 0.9251, "step": 169 }, { "epoch": 0.0047254709835233945, "grad_norm": 0.16219031810760498, "learning_rate": 0.0015183925683105253, "loss": 0.8951, "step": 170 }, { "epoch": 0.004753267871661767, "grad_norm": 0.1081567108631134, "learning_rate": 0.0015128992774059062, "loss": 0.9457, "step": 171 }, { "epoch": 0.00478106475980014, "grad_norm": 0.1203177273273468, "learning_rate": 0.0015073849032208823, "loss": 0.9307, "step": 172 }, { "epoch": 0.004808861647938513, "grad_norm": 0.11848781257867813, "learning_rate": 0.0015018496724297776, "loss": 0.8301, "step": 173 }, { "epoch": 0.004836658536076886, "grad_norm": 0.13573849201202393, "learning_rate": 0.0014962938125642501, "loss": 0.723, "step": 174 }, { "epoch": 0.0048644554242152594, "grad_norm": 0.10743360966444016, "learning_rate": 0.001490717552003938, "loss": 0.7861, "step": 175 }, { "epoch": 0.004892252312353632, "grad_norm": 0.10989518463611603, "learning_rate": 0.001485121119967072, "loss": 0.7905, "step": 176 }, { "epoch": 0.004920049200492005, "grad_norm": 0.10301119089126587, "learning_rate": 0.001479504746501054, "loss": 0.6952, "step": 177 }, { "epoch": 0.004947846088630378, "grad_norm": 0.12021514773368835, "learning_rate": 0.0014738686624729987, "loss": 0.782, "step": 178 }, { "epoch": 0.004975642976768751, "grad_norm": 0.11153309047222137, "learning_rate": 0.0014682130995602458, "loss": 0.8065, "step": 179 }, { "epoch": 0.0050034398649071235, "grad_norm": 0.11229830235242844, "learning_rate": 0.0014625382902408355, "loss": 0.7113, "step": 180 }, { "epoch": 0.005031236753045496, "grad_norm": 0.11783714592456818, "learning_rate": 0.0014568444677839517, "loss": 0.6218, "step": 181 }, { "epoch": 0.005059033641183869, "grad_norm": 0.12635229527950287, "learning_rate": 0.0014511318662403345, "loss": 0.6719, "step": 182 }, { "epoch": 0.005086830529322243, "grad_norm": 0.12254615128040314, "learning_rate": 0.0014454007204326592, "loss": 0.7263, "step": 183 }, { "epoch": 0.005114627417460616, "grad_norm": 0.11896725744009018, "learning_rate": 0.0014396512659458822, "loss": 0.8887, "step": 184 }, { "epoch": 0.005142424305598988, "grad_norm": 0.10813646763563156, "learning_rate": 0.0014338837391175581, "loss": 0.6242, "step": 185 }, { "epoch": 0.005170221193737361, "grad_norm": 0.10913447290658951, "learning_rate": 0.0014280983770281258, "loss": 0.5932, "step": 186 }, { "epoch": 0.005198018081875734, "grad_norm": 0.11929253488779068, "learning_rate": 0.00142229541749116, "loss": 0.7183, "step": 187 }, { "epoch": 0.005225814970014107, "grad_norm": 0.1152404397726059, "learning_rate": 0.001416475099043599, "loss": 0.7041, "step": 188 }, { "epoch": 0.00525361185815248, "grad_norm": 0.11904522776603699, "learning_rate": 0.001410637660935938, "loss": 0.7405, "step": 189 }, { "epoch": 0.0052814087462908525, "grad_norm": 0.10983660072088242, "learning_rate": 0.0014047833431223937, "loss": 0.7169, "step": 190 }, { "epoch": 0.005309205634429225, "grad_norm": 0.1285189539194107, "learning_rate": 0.0013989123862510418, "loss": 0.7584, "step": 191 }, { "epoch": 0.005337002522567599, "grad_norm": 0.13100877404212952, "learning_rate": 0.0013930250316539236, "loss": 0.7606, "step": 192 }, { "epoch": 0.005364799410705972, "grad_norm": 0.14966635406017303, "learning_rate": 0.0013871215213371283, "loss": 0.5972, "step": 193 }, { "epoch": 0.005392596298844345, "grad_norm": 0.13933199644088745, "learning_rate": 0.0013812020979708417, "loss": 0.5656, "step": 194 }, { "epoch": 0.005420393186982717, "grad_norm": 0.13888134062290192, "learning_rate": 0.0013752670048793744, "loss": 0.7759, "step": 195 }, { "epoch": 0.00544819007512109, "grad_norm": 0.14111551642417908, "learning_rate": 0.0013693164860311565, "loss": 0.7513, "step": 196 }, { "epoch": 0.005475986963259463, "grad_norm": 0.16336259245872498, "learning_rate": 0.0013633507860287115, "loss": 0.7926, "step": 197 }, { "epoch": 0.005503783851397836, "grad_norm": 0.1474616378545761, "learning_rate": 0.0013573701500986012, "loss": 0.5559, "step": 198 }, { "epoch": 0.005531580739536209, "grad_norm": 0.18535536527633667, "learning_rate": 0.001351374824081343, "loss": 0.7313, "step": 199 }, { "epoch": 0.0055593776276745815, "grad_norm": 0.1758536696434021, "learning_rate": 0.0013453650544213076, "loss": 0.606, "step": 200 }, { "epoch": 0.0055593776276745815, "eval_loss": 0.7939153909683228, "eval_runtime": 50.0064, "eval_samples_per_second": 11.539, "eval_steps_per_second": 5.779, "step": 200 }, { "epoch": 0.005587174515812955, "grad_norm": 0.13161815702915192, "learning_rate": 0.0013393410881565877, "loss": 0.8241, "step": 201 }, { "epoch": 0.005614971403951328, "grad_norm": 0.1270628422498703, "learning_rate": 0.0013333031729088419, "loss": 0.9452, "step": 202 }, { "epoch": 0.005642768292089701, "grad_norm": 0.10348006337881088, "learning_rate": 0.0013272515568731168, "loss": 0.9022, "step": 203 }, { "epoch": 0.005670565180228074, "grad_norm": 0.1050761267542839, "learning_rate": 0.0013211864888076456, "loss": 0.947, "step": 204 }, { "epoch": 0.005698362068366446, "grad_norm": 0.09829563647508621, "learning_rate": 0.0013151082180236208, "loss": 0.8633, "step": 205 }, { "epoch": 0.005726158956504819, "grad_norm": 0.10269923508167267, "learning_rate": 0.0013090169943749475, "loss": 0.9481, "step": 206 }, { "epoch": 0.005753955844643192, "grad_norm": 0.09650863707065582, "learning_rate": 0.001302913068247972, "loss": 0.8289, "step": 207 }, { "epoch": 0.005781752732781565, "grad_norm": 0.10350701212882996, "learning_rate": 0.0012967966905511905, "loss": 0.8332, "step": 208 }, { "epoch": 0.005809549620919938, "grad_norm": 0.09841641783714294, "learning_rate": 0.0012906681127049337, "loss": 0.783, "step": 209 }, { "epoch": 0.005837346509058311, "grad_norm": 0.10020092874765396, "learning_rate": 0.0012845275866310323, "loss": 0.6906, "step": 210 }, { "epoch": 0.005865143397196684, "grad_norm": 0.10702440142631531, "learning_rate": 0.0012783753647424634, "loss": 0.6887, "step": 211 }, { "epoch": 0.005892940285335057, "grad_norm": 0.10005506873130798, "learning_rate": 0.001272211699932971, "loss": 0.9122, "step": 212 }, { "epoch": 0.00592073717347343, "grad_norm": 0.09415773302316666, "learning_rate": 0.0012660368455666752, "loss": 0.7632, "step": 213 }, { "epoch": 0.005948534061611803, "grad_norm": 0.08957406878471375, "learning_rate": 0.001259851055467653, "loss": 0.6393, "step": 214 }, { "epoch": 0.005976330949750175, "grad_norm": 0.12815257906913757, "learning_rate": 0.0012536545839095072, "loss": 0.9806, "step": 215 }, { "epoch": 0.006004127837888548, "grad_norm": 0.10439406335353851, "learning_rate": 0.0012474476856049145, "loss": 0.8639, "step": 216 }, { "epoch": 0.006031924726026921, "grad_norm": 0.10832219570875168, "learning_rate": 0.0012412306156951525, "loss": 0.9136, "step": 217 }, { "epoch": 0.006059721614165294, "grad_norm": 0.09555172920227051, "learning_rate": 0.0012350036297396152, "loss": 0.6873, "step": 218 }, { "epoch": 0.0060875185023036675, "grad_norm": 0.09809956699609756, "learning_rate": 0.0012287669837053054, "loss": 0.9795, "step": 219 }, { "epoch": 0.00611531539044204, "grad_norm": 0.10025002062320709, "learning_rate": 0.0012225209339563144, "loss": 0.7104, "step": 220 }, { "epoch": 0.006143112278580413, "grad_norm": 0.10654330253601074, "learning_rate": 0.0012162657372432836, "loss": 0.7548, "step": 221 }, { "epoch": 0.006170909166718786, "grad_norm": 0.11099881678819656, "learning_rate": 0.0012100016506928493, "loss": 0.8631, "step": 222 }, { "epoch": 0.006198706054857159, "grad_norm": 0.10728763788938522, "learning_rate": 0.0012037289317970757, "loss": 0.7795, "step": 223 }, { "epoch": 0.0062265029429955316, "grad_norm": 0.1109003946185112, "learning_rate": 0.001197447838402867, "loss": 0.8686, "step": 224 }, { "epoch": 0.006254299831133904, "grad_norm": 0.11378157138824463, "learning_rate": 0.0011911586287013725, "loss": 0.7896, "step": 225 }, { "epoch": 0.006282096719272277, "grad_norm": 0.1115458682179451, "learning_rate": 0.0011848615612173687, "loss": 0.6893, "step": 226 }, { "epoch": 0.00630989360741065, "grad_norm": 0.10416682809591293, "learning_rate": 0.0011785568947986368, "loss": 0.7173, "step": 227 }, { "epoch": 0.006337690495549024, "grad_norm": 0.10408192873001099, "learning_rate": 0.001172244888605319, "loss": 0.6849, "step": 228 }, { "epoch": 0.0063654873836873965, "grad_norm": 0.11150062829256058, "learning_rate": 0.001165925802099268, "loss": 0.6506, "step": 229 }, { "epoch": 0.006393284271825769, "grad_norm": 0.11302520334720612, "learning_rate": 0.0011595998950333793, "loss": 0.7856, "step": 230 }, { "epoch": 0.006421081159964142, "grad_norm": 0.1063392385840416, "learning_rate": 0.001153267427440916, "loss": 0.6815, "step": 231 }, { "epoch": 0.006448878048102515, "grad_norm": 0.10356339067220688, "learning_rate": 0.001146928659624818, "loss": 0.7153, "step": 232 }, { "epoch": 0.006476674936240888, "grad_norm": 0.11310825496912003, "learning_rate": 0.0011405838521470028, "loss": 0.7835, "step": 233 }, { "epoch": 0.0065044718243792606, "grad_norm": 0.11676845699548721, "learning_rate": 0.0011342332658176555, "loss": 0.6789, "step": 234 }, { "epoch": 0.006532268712517633, "grad_norm": 0.10645750910043716, "learning_rate": 0.001127877161684506, "loss": 0.6765, "step": 235 }, { "epoch": 0.006560065600656006, "grad_norm": 0.10652610659599304, "learning_rate": 0.0011215158010221004, "loss": 0.7016, "step": 236 }, { "epoch": 0.00658786248879438, "grad_norm": 0.12389989197254181, "learning_rate": 0.0011151494453210595, "loss": 0.6711, "step": 237 }, { "epoch": 0.006615659376932753, "grad_norm": 0.12299291789531708, "learning_rate": 0.0011087783562773311, "loss": 0.6813, "step": 238 }, { "epoch": 0.0066434562650711255, "grad_norm": 0.1117280051112175, "learning_rate": 0.0011024027957814314, "loss": 0.6857, "step": 239 }, { "epoch": 0.006671253153209498, "grad_norm": 0.1141078993678093, "learning_rate": 0.0010960230259076818, "loss": 0.5767, "step": 240 }, { "epoch": 0.006699050041347871, "grad_norm": 0.11716222018003464, "learning_rate": 0.0010896393089034335, "loss": 0.617, "step": 241 }, { "epoch": 0.006726846929486244, "grad_norm": 0.11783386021852493, "learning_rate": 0.0010832519071782894, "loss": 0.623, "step": 242 }, { "epoch": 0.006754643817624617, "grad_norm": 0.1618836522102356, "learning_rate": 0.0010768610832933168, "loss": 0.8134, "step": 243 }, { "epoch": 0.0067824407057629895, "grad_norm": 0.12553377449512482, "learning_rate": 0.0010704670999502539, "loss": 0.6001, "step": 244 }, { "epoch": 0.006810237593901362, "grad_norm": 0.11109066009521484, "learning_rate": 0.001064070219980713, "loss": 0.5716, "step": 245 }, { "epoch": 0.006838034482039736, "grad_norm": 0.11495482176542282, "learning_rate": 0.0010576707063353744, "loss": 0.5186, "step": 246 }, { "epoch": 0.006865831370178109, "grad_norm": 0.12931731343269348, "learning_rate": 0.0010512688220731792, "loss": 0.6666, "step": 247 }, { "epoch": 0.006893628258316482, "grad_norm": 0.12755966186523438, "learning_rate": 0.001044864830350515, "loss": 0.5354, "step": 248 }, { "epoch": 0.0069214251464548545, "grad_norm": 0.16778767108917236, "learning_rate": 0.0010384589944103983, "loss": 0.7248, "step": 249 }, { "epoch": 0.006949222034593227, "grad_norm": 0.19699546694755554, "learning_rate": 0.0010320515775716554, "loss": 0.5582, "step": 250 }, { "epoch": 0.006949222034593227, "eval_loss": 0.7649185657501221, "eval_runtime": 50.0992, "eval_samples_per_second": 11.517, "eval_steps_per_second": 5.769, "step": 250 }, { "epoch": 0.0069770189227316, "grad_norm": 0.14372354745864868, "learning_rate": 0.0010256428432180956, "loss": 1.0122, "step": 251 }, { "epoch": 0.007004815810869973, "grad_norm": 0.11382071673870087, "learning_rate": 0.0010192330547876872, "loss": 0.8098, "step": 252 }, { "epoch": 0.007032612699008346, "grad_norm": 0.10247013717889786, "learning_rate": 0.0010128224757617274, "loss": 0.9305, "step": 253 }, { "epoch": 0.0070604095871467185, "grad_norm": 0.09587737172842026, "learning_rate": 0.0010064113696540112, "loss": 0.9049, "step": 254 }, { "epoch": 0.007088206475285092, "grad_norm": 0.0981241911649704, "learning_rate": 0.001, "loss": 0.7943, "step": 255 }, { "epoch": 0.007116003363423465, "grad_norm": 0.10372012108564377, "learning_rate": 0.0009935886303459888, "loss": 0.8079, "step": 256 }, { "epoch": 0.007143800251561838, "grad_norm": 0.0910145714879036, "learning_rate": 0.0009871775242382727, "loss": 0.7809, "step": 257 }, { "epoch": 0.007171597139700211, "grad_norm": 0.09489095211029053, "learning_rate": 0.0009807669452123128, "loss": 0.7955, "step": 258 }, { "epoch": 0.0071993940278385835, "grad_norm": 0.09354311227798462, "learning_rate": 0.0009743571567819046, "loss": 0.8035, "step": 259 }, { "epoch": 0.007227190915976956, "grad_norm": 0.09070082753896713, "learning_rate": 0.0009679484224283449, "loss": 0.7908, "step": 260 }, { "epoch": 0.007254987804115329, "grad_norm": 0.1037682518362999, "learning_rate": 0.0009615410055896016, "loss": 0.7739, "step": 261 }, { "epoch": 0.007282784692253702, "grad_norm": 0.09658119082450867, "learning_rate": 0.0009551351696494854, "loss": 0.8614, "step": 262 }, { "epoch": 0.007310581580392075, "grad_norm": 0.10361933708190918, "learning_rate": 0.0009487311779268209, "loss": 0.9021, "step": 263 }, { "epoch": 0.007338378468530448, "grad_norm": 0.08979956805706024, "learning_rate": 0.0009423292936646257, "loss": 0.7559, "step": 264 }, { "epoch": 0.007366175356668821, "grad_norm": 0.09771803766489029, "learning_rate": 0.0009359297800192872, "loss": 0.7717, "step": 265 }, { "epoch": 0.007393972244807194, "grad_norm": 0.10628974437713623, "learning_rate": 0.0009295329000497459, "loss": 0.8083, "step": 266 }, { "epoch": 0.007421769132945567, "grad_norm": 0.10804276913404465, "learning_rate": 0.0009231389167066836, "loss": 0.8077, "step": 267 }, { "epoch": 0.00744956602108394, "grad_norm": 0.13234536349773407, "learning_rate": 0.0009167480928217108, "loss": 0.8623, "step": 268 }, { "epoch": 0.0074773629092223124, "grad_norm": 0.10681891441345215, "learning_rate": 0.0009103606910965666, "loss": 0.7624, "step": 269 }, { "epoch": 0.007505159797360685, "grad_norm": 0.11654768884181976, "learning_rate": 0.0009039769740923182, "loss": 0.8082, "step": 270 }, { "epoch": 0.007532956685499058, "grad_norm": 0.1005023717880249, "learning_rate": 0.0008975972042185687, "loss": 0.7448, "step": 271 }, { "epoch": 0.007560753573637431, "grad_norm": 0.10492710024118423, "learning_rate": 0.0008912216437226692, "loss": 0.8734, "step": 272 }, { "epoch": 0.0075885504617758046, "grad_norm": 0.10636113584041595, "learning_rate": 0.0008848505546789408, "loss": 0.7397, "step": 273 }, { "epoch": 0.007616347349914177, "grad_norm": 0.0918545350432396, "learning_rate": 0.0008784841989778997, "loss": 0.6861, "step": 274 }, { "epoch": 0.00764414423805255, "grad_norm": 0.10185957700014114, "learning_rate": 0.0008721228383154939, "loss": 0.7906, "step": 275 }, { "epoch": 0.007671941126190923, "grad_norm": 0.09865249693393707, "learning_rate": 0.0008657667341823448, "loss": 0.6877, "step": 276 }, { "epoch": 0.007699738014329296, "grad_norm": 0.10265160351991653, "learning_rate": 0.0008594161478529974, "loss": 0.7361, "step": 277 }, { "epoch": 0.007727534902467669, "grad_norm": 0.10760695487260818, "learning_rate": 0.0008530713403751821, "loss": 0.6973, "step": 278 }, { "epoch": 0.007755331790606041, "grad_norm": 0.1106325089931488, "learning_rate": 0.000846732572559084, "loss": 0.7736, "step": 279 }, { "epoch": 0.007783128678744414, "grad_norm": 0.11260738223791122, "learning_rate": 0.000840400104966621, "loss": 0.7567, "step": 280 }, { "epoch": 0.007810925566882787, "grad_norm": 0.10231205821037292, "learning_rate": 0.0008340741979007324, "loss": 0.7018, "step": 281 }, { "epoch": 0.00783872245502116, "grad_norm": 0.10939161479473114, "learning_rate": 0.0008277551113946811, "loss": 0.5913, "step": 282 }, { "epoch": 0.007866519343159533, "grad_norm": 0.10841213166713715, "learning_rate": 0.0008214431052013634, "loss": 0.7421, "step": 283 }, { "epoch": 0.007894316231297906, "grad_norm": 0.11439742892980576, "learning_rate": 0.0008151384387826313, "loss": 0.7312, "step": 284 }, { "epoch": 0.007922113119436278, "grad_norm": 0.11929941177368164, "learning_rate": 0.0008088413712986279, "loss": 0.7529, "step": 285 }, { "epoch": 0.007949910007574652, "grad_norm": 0.11869197338819504, "learning_rate": 0.0008025521615971329, "loss": 0.6848, "step": 286 }, { "epoch": 0.007977706895713026, "grad_norm": 0.10041101276874542, "learning_rate": 0.0007962710682029245, "loss": 0.5243, "step": 287 }, { "epoch": 0.008005503783851398, "grad_norm": 0.11425133794546127, "learning_rate": 0.0007899983493071507, "loss": 0.6441, "step": 288 }, { "epoch": 0.008033300671989771, "grad_norm": 0.10552135854959488, "learning_rate": 0.0007837342627567166, "loss": 0.5675, "step": 289 }, { "epoch": 0.008061097560128143, "grad_norm": 0.12019749730825424, "learning_rate": 0.0007774790660436857, "loss": 0.7536, "step": 290 }, { "epoch": 0.008088894448266517, "grad_norm": 0.11838424205780029, "learning_rate": 0.0007712330162946947, "loss": 0.6558, "step": 291 }, { "epoch": 0.008116691336404889, "grad_norm": 0.11879291385412216, "learning_rate": 0.0007649963702603848, "loss": 0.6071, "step": 292 }, { "epoch": 0.008144488224543263, "grad_norm": 0.1303434818983078, "learning_rate": 0.0007587693843048475, "loss": 0.5558, "step": 293 }, { "epoch": 0.008172285112681634, "grad_norm": 0.1721266806125641, "learning_rate": 0.0007525523143950859, "loss": 0.6311, "step": 294 }, { "epoch": 0.008200082000820008, "grad_norm": 0.13676951825618744, "learning_rate": 0.0007463454160904928, "loss": 0.5886, "step": 295 }, { "epoch": 0.008227878888958382, "grad_norm": 0.16192977130413055, "learning_rate": 0.0007401489445323472, "loss": 0.746, "step": 296 }, { "epoch": 0.008255675777096754, "grad_norm": 0.12178271263837814, "learning_rate": 0.000733963154433325, "loss": 0.544, "step": 297 }, { "epoch": 0.008283472665235127, "grad_norm": 0.1465931534767151, "learning_rate": 0.0007277883000670289, "loss": 0.4949, "step": 298 }, { "epoch": 0.0083112695533735, "grad_norm": 0.15438374876976013, "learning_rate": 0.0007216246352575369, "loss": 0.6408, "step": 299 }, { "epoch": 0.008339066441511873, "grad_norm": 0.2604023814201355, "learning_rate": 0.0007154724133689676, "loss": 0.7431, "step": 300 }, { "epoch": 0.008339066441511873, "eval_loss": 0.7490311861038208, "eval_runtime": 50.0501, "eval_samples_per_second": 11.528, "eval_steps_per_second": 5.774, "step": 300 }, { "epoch": 0.008366863329650245, "grad_norm": 0.12751354277133942, "learning_rate": 0.0007093318872950665, "loss": 0.9951, "step": 301 }, { "epoch": 0.008394660217788619, "grad_norm": 0.10959775000810623, "learning_rate": 0.0007032033094488094, "loss": 0.8628, "step": 302 }, { "epoch": 0.00842245710592699, "grad_norm": 0.11009855568408966, "learning_rate": 0.0006970869317520279, "loss": 0.9437, "step": 303 }, { "epoch": 0.008450253994065364, "grad_norm": 0.09348214417695999, "learning_rate": 0.0006909830056250527, "loss": 0.7982, "step": 304 }, { "epoch": 0.008478050882203738, "grad_norm": 0.09079182893037796, "learning_rate": 0.0006848917819763793, "loss": 0.759, "step": 305 }, { "epoch": 0.00850584777034211, "grad_norm": 0.09549093246459961, "learning_rate": 0.0006788135111923545, "loss": 0.93, "step": 306 }, { "epoch": 0.008533644658480484, "grad_norm": 0.09193756431341171, "learning_rate": 0.0006727484431268831, "loss": 0.8092, "step": 307 }, { "epoch": 0.008561441546618856, "grad_norm": 0.0906609520316124, "learning_rate": 0.0006666968270911584, "loss": 0.8559, "step": 308 }, { "epoch": 0.00858923843475723, "grad_norm": 0.09579095989465714, "learning_rate": 0.0006606589118434126, "loss": 0.9007, "step": 309 }, { "epoch": 0.008617035322895601, "grad_norm": 0.09106986969709396, "learning_rate": 0.0006546349455786926, "loss": 0.7693, "step": 310 }, { "epoch": 0.008644832211033975, "grad_norm": 0.08439186215400696, "learning_rate": 0.0006486251759186573, "loss": 0.7757, "step": 311 }, { "epoch": 0.008672629099172347, "grad_norm": 0.09723315387964249, "learning_rate": 0.0006426298499013994, "loss": 0.7181, "step": 312 }, { "epoch": 0.00870042598731072, "grad_norm": 0.09807878732681274, "learning_rate": 0.0006366492139712886, "loss": 0.9234, "step": 313 }, { "epoch": 0.008728222875449094, "grad_norm": 0.0981912836432457, "learning_rate": 0.0006306835139688439, "loss": 0.8984, "step": 314 }, { "epoch": 0.008756019763587466, "grad_norm": 0.0935656800866127, "learning_rate": 0.000624732995120626, "loss": 0.7538, "step": 315 }, { "epoch": 0.00878381665172584, "grad_norm": 0.09538404643535614, "learning_rate": 0.0006187979020291583, "loss": 0.86, "step": 316 }, { "epoch": 0.008811613539864212, "grad_norm": 0.10517210513353348, "learning_rate": 0.000612878478662872, "loss": 0.9827, "step": 317 }, { "epoch": 0.008839410428002585, "grad_norm": 0.11277417838573456, "learning_rate": 0.0006069749683460764, "loss": 0.7863, "step": 318 }, { "epoch": 0.008867207316140957, "grad_norm": 0.10222118347883224, "learning_rate": 0.0006010876137489584, "loss": 0.7519, "step": 319 }, { "epoch": 0.008895004204279331, "grad_norm": 0.08672767877578735, "learning_rate": 0.0005952166568776062, "loss": 0.8205, "step": 320 }, { "epoch": 0.008922801092417703, "grad_norm": 0.10612691193819046, "learning_rate": 0.0005893623390640622, "loss": 0.8871, "step": 321 }, { "epoch": 0.008950597980556077, "grad_norm": 0.09716677665710449, "learning_rate": 0.0005835249009564013, "loss": 0.7042, "step": 322 }, { "epoch": 0.00897839486869445, "grad_norm": 0.0943322479724884, "learning_rate": 0.0005777045825088404, "loss": 0.6464, "step": 323 }, { "epoch": 0.009006191756832822, "grad_norm": 0.10483499616384506, "learning_rate": 0.0005719016229718748, "loss": 0.6854, "step": 324 }, { "epoch": 0.009033988644971196, "grad_norm": 0.10260860621929169, "learning_rate": 0.0005661162608824419, "loss": 0.7816, "step": 325 }, { "epoch": 0.009061785533109568, "grad_norm": 0.1006467193365097, "learning_rate": 0.0005603487340541181, "loss": 0.8237, "step": 326 }, { "epoch": 0.009089582421247942, "grad_norm": 0.09507747739553452, "learning_rate": 0.0005545992795673408, "loss": 0.6931, "step": 327 }, { "epoch": 0.009117379309386314, "grad_norm": 0.08719619363546371, "learning_rate": 0.0005488681337596652, "loss": 0.5646, "step": 328 }, { "epoch": 0.009145176197524687, "grad_norm": 0.0974939689040184, "learning_rate": 0.0005431555322160483, "loss": 0.655, "step": 329 }, { "epoch": 0.00917297308566306, "grad_norm": 0.11043986678123474, "learning_rate": 0.000537461709759165, "loss": 0.8183, "step": 330 }, { "epoch": 0.009200769973801433, "grad_norm": 0.10393897444009781, "learning_rate": 0.0005317869004397545, "loss": 0.7377, "step": 331 }, { "epoch": 0.009228566861939807, "grad_norm": 0.10675422102212906, "learning_rate": 0.0005261313375270014, "loss": 0.7157, "step": 332 }, { "epoch": 0.009256363750078178, "grad_norm": 0.11000238358974457, "learning_rate": 0.0005204952534989462, "loss": 0.78, "step": 333 }, { "epoch": 0.009284160638216552, "grad_norm": 0.11165881901979446, "learning_rate": 0.0005148788800329278, "loss": 0.6822, "step": 334 }, { "epoch": 0.009311957526354924, "grad_norm": 0.10411150753498077, "learning_rate": 0.0005092824479960625, "loss": 0.5622, "step": 335 }, { "epoch": 0.009339754414493298, "grad_norm": 0.11088427156209946, "learning_rate": 0.0005037061874357502, "loss": 0.5462, "step": 336 }, { "epoch": 0.00936755130263167, "grad_norm": 0.11364025622606277, "learning_rate": 0.0004981503275702227, "loss": 0.5887, "step": 337 }, { "epoch": 0.009395348190770043, "grad_norm": 0.1214662715792656, "learning_rate": 0.000492615096779118, "loss": 0.7526, "step": 338 }, { "epoch": 0.009423145078908415, "grad_norm": 0.11156253516674042, "learning_rate": 0.000487100722594094, "loss": 0.5498, "step": 339 }, { "epoch": 0.009450941967046789, "grad_norm": 0.10013176500797272, "learning_rate": 0.00048160743168947496, "loss": 0.4897, "step": 340 }, { "epoch": 0.009478738855185163, "grad_norm": 0.12529128789901733, "learning_rate": 0.00047613544987293446, "loss": 0.6721, "step": 341 }, { "epoch": 0.009506535743323535, "grad_norm": 0.1168283075094223, "learning_rate": 0.00047068500207621256, "loss": 0.5659, "step": 342 }, { "epoch": 0.009534332631461908, "grad_norm": 0.16703759133815765, "learning_rate": 0.0004652563123458703, "loss": 0.7979, "step": 343 }, { "epoch": 0.00956212951960028, "grad_norm": 0.13577383756637573, "learning_rate": 0.00045984960383408004, "loss": 0.7327, "step": 344 }, { "epoch": 0.009589926407738654, "grad_norm": 0.14807015657424927, "learning_rate": 0.0004544650987894514, "loss": 0.6478, "step": 345 }, { "epoch": 0.009617723295877026, "grad_norm": 0.12839584052562714, "learning_rate": 0.0004491030185478976, "loss": 0.5733, "step": 346 }, { "epoch": 0.0096455201840154, "grad_norm": 0.13728651404380798, "learning_rate": 0.0004437635835235353, "loss": 0.6203, "step": 347 }, { "epoch": 0.009673317072153772, "grad_norm": 0.12902460992336273, "learning_rate": 0.0004384470131996252, "loss": 0.5744, "step": 348 }, { "epoch": 0.009701113960292145, "grad_norm": 0.1356661170721054, "learning_rate": 0.0004331535261195504, "loss": 0.4755, "step": 349 }, { "epoch": 0.009728910848430519, "grad_norm": 0.18092875182628632, "learning_rate": 0.0004278833398778306, "loss": 0.7017, "step": 350 }, { "epoch": 0.009728910848430519, "eval_loss": 0.7205991148948669, "eval_runtime": 50.1118, "eval_samples_per_second": 11.514, "eval_steps_per_second": 5.767, "step": 350 }, { "epoch": 0.00975670773656889, "grad_norm": 0.1047213226556778, "learning_rate": 0.0004226366711111808, "loss": 0.942, "step": 351 }, { "epoch": 0.009784504624707265, "grad_norm": 0.10103016346693039, "learning_rate": 0.00041741373548960395, "loss": 0.7983, "step": 352 }, { "epoch": 0.009812301512845636, "grad_norm": 0.10744207352399826, "learning_rate": 0.00041221474770752696, "loss": 1.0167, "step": 353 }, { "epoch": 0.00984009840098401, "grad_norm": 0.10127032548189163, "learning_rate": 0.0004070399214749743, "loss": 0.9075, "step": 354 }, { "epoch": 0.009867895289122382, "grad_norm": 0.10049542039632797, "learning_rate": 0.000401889469508784, "loss": 0.9626, "step": 355 }, { "epoch": 0.009895692177260756, "grad_norm": 0.08796097338199615, "learning_rate": 0.00039676360352386354, "loss": 0.7321, "step": 356 }, { "epoch": 0.009923489065399128, "grad_norm": 0.07982050627470016, "learning_rate": 0.00039166253422448684, "loss": 0.6331, "step": 357 }, { "epoch": 0.009951285953537501, "grad_norm": 0.08986510336399078, "learning_rate": 0.0003865864712956336, "loss": 0.646, "step": 358 }, { "epoch": 0.009979082841675875, "grad_norm": 0.0990855023264885, "learning_rate": 0.00038153562339436853, "loss": 0.7897, "step": 359 }, { "epoch": 0.010006879729814247, "grad_norm": 0.09031044691801071, "learning_rate": 0.0003765101981412665, "loss": 0.7884, "step": 360 }, { "epoch": 0.01003467661795262, "grad_norm": 0.08221515268087387, "learning_rate": 0.0003715104021118764, "loss": 0.787, "step": 361 }, { "epoch": 0.010062473506090993, "grad_norm": 0.09221215546131134, "learning_rate": 0.00036653644082823046, "loss": 0.7991, "step": 362 }, { "epoch": 0.010090270394229366, "grad_norm": 0.08920681476593018, "learning_rate": 0.00036158851875039456, "loss": 0.9703, "step": 363 }, { "epoch": 0.010118067282367738, "grad_norm": 0.09488007426261902, "learning_rate": 0.0003566668392680662, "loss": 0.759, "step": 364 }, { "epoch": 0.010145864170506112, "grad_norm": 0.0824998989701271, "learning_rate": 0.0003517716046922118, "loss": 0.6603, "step": 365 }, { "epoch": 0.010173661058644486, "grad_norm": 0.0937090590596199, "learning_rate": 0.00034690301624675125, "loss": 0.7799, "step": 366 }, { "epoch": 0.010201457946782858, "grad_norm": 0.08511339873075485, "learning_rate": 0.00034206127406028743, "loss": 0.6858, "step": 367 }, { "epoch": 0.010229254834921231, "grad_norm": 0.09240502119064331, "learning_rate": 0.0003372465771578771, "loss": 0.8179, "step": 368 }, { "epoch": 0.010257051723059603, "grad_norm": 0.08915253728628159, "learning_rate": 0.000332459123452852, "loss": 0.7399, "step": 369 }, { "epoch": 0.010284848611197977, "grad_norm": 0.09466725587844849, "learning_rate": 0.00032769910973868313, "loss": 0.7574, "step": 370 }, { "epoch": 0.010312645499336349, "grad_norm": 0.09638349711894989, "learning_rate": 0.00032296673168089073, "loss": 0.7101, "step": 371 }, { "epoch": 0.010340442387474722, "grad_norm": 0.09288477897644043, "learning_rate": 0.0003182621838090006, "loss": 0.7876, "step": 372 }, { "epoch": 0.010368239275613094, "grad_norm": 0.08711199462413788, "learning_rate": 0.0003135856595085498, "loss": 0.548, "step": 373 }, { "epoch": 0.010396036163751468, "grad_norm": 0.09743466973304749, "learning_rate": 0.00030893735101313535, "loss": 0.7647, "step": 374 }, { "epoch": 0.010423833051889842, "grad_norm": 0.10079675167798996, "learning_rate": 0.0003043174493965136, "loss": 0.6033, "step": 375 }, { "epoch": 0.010451629940028214, "grad_norm": 0.11834803968667984, "learning_rate": 0.0002997261445647453, "loss": 0.7904, "step": 376 }, { "epoch": 0.010479426828166587, "grad_norm": 0.10599831491708755, "learning_rate": 0.00029516362524838847, "loss": 0.6515, "step": 377 }, { "epoch": 0.01050722371630496, "grad_norm": 0.09899824112653732, "learning_rate": 0.0002906300789947421, "loss": 0.5683, "step": 378 }, { "epoch": 0.010535020604443333, "grad_norm": 0.09238414466381073, "learning_rate": 0.00028612569216013674, "loss": 0.5779, "step": 379 }, { "epoch": 0.010562817492581705, "grad_norm": 0.0977087989449501, "learning_rate": 0.0002816506499022725, "loss": 0.562, "step": 380 }, { "epoch": 0.010590614380720079, "grad_norm": 0.09863676875829697, "learning_rate": 0.00027720513617260855, "loss": 0.6458, "step": 381 }, { "epoch": 0.01061841126885845, "grad_norm": 0.11672049015760422, "learning_rate": 0.0002727893337088027, "loss": 0.7436, "step": 382 }, { "epoch": 0.010646208156996824, "grad_norm": 0.10594779998064041, "learning_rate": 0.0002684034240271986, "loss": 0.6286, "step": 383 }, { "epoch": 0.010674005045135198, "grad_norm": 0.11194431781768799, "learning_rate": 0.00026404758741536505, "loss": 0.6866, "step": 384 }, { "epoch": 0.01070180193327357, "grad_norm": 0.10711811482906342, "learning_rate": 0.00025972200292468463, "loss": 0.5329, "step": 385 }, { "epoch": 0.010729598821411944, "grad_norm": 0.112278513610363, "learning_rate": 0.00025542684836299314, "loss": 0.6123, "step": 386 }, { "epoch": 0.010757395709550316, "grad_norm": 0.12985379993915558, "learning_rate": 0.0002511623002872718, "loss": 0.686, "step": 387 }, { "epoch": 0.01078519259768869, "grad_norm": 0.12807150185108185, "learning_rate": 0.00024692853399638914, "loss": 0.7307, "step": 388 }, { "epoch": 0.010812989485827061, "grad_norm": 0.11589968949556351, "learning_rate": 0.00024272572352389488, "loss": 0.7484, "step": 389 }, { "epoch": 0.010840786373965435, "grad_norm": 0.11197637766599655, "learning_rate": 0.0002385540416308656, "loss": 0.645, "step": 390 }, { "epoch": 0.010868583262103807, "grad_norm": 0.11974731087684631, "learning_rate": 0.00023441365979880524, "loss": 0.6447, "step": 391 }, { "epoch": 0.01089638015024218, "grad_norm": 0.11931908875703812, "learning_rate": 0.00023030474822259396, "loss": 0.5268, "step": 392 }, { "epoch": 0.010924177038380554, "grad_norm": 0.1177850067615509, "learning_rate": 0.0002262274758034931, "loss": 0.5544, "step": 393 }, { "epoch": 0.010951973926518926, "grad_norm": 0.12234242260456085, "learning_rate": 0.00022218201014220264, "loss": 0.6351, "step": 394 }, { "epoch": 0.0109797708146573, "grad_norm": 0.12924128770828247, "learning_rate": 0.0002181685175319702, "loss": 0.6468, "step": 395 }, { "epoch": 0.011007567702795672, "grad_norm": 0.1454528272151947, "learning_rate": 0.00021418716295175765, "loss": 0.6534, "step": 396 }, { "epoch": 0.011035364590934045, "grad_norm": 0.13180844485759735, "learning_rate": 0.0002102381100594577, "loss": 0.5698, "step": 397 }, { "epoch": 0.011063161479072417, "grad_norm": 0.1514425277709961, "learning_rate": 0.00020632152118516778, "loss": 0.6062, "step": 398 }, { "epoch": 0.011090958367210791, "grad_norm": 0.1729186773300171, "learning_rate": 0.00020243755732451564, "loss": 0.6178, "step": 399 }, { "epoch": 0.011118755255349163, "grad_norm": 0.2056732028722763, "learning_rate": 0.0001985863781320435, "loss": 0.7743, "step": 400 }, { "epoch": 0.011118755255349163, "eval_loss": 0.7006093859672546, "eval_runtime": 49.996, "eval_samples_per_second": 11.541, "eval_steps_per_second": 5.78, "step": 400 }, { "epoch": 0.011146552143487537, "grad_norm": 0.09276453405618668, "learning_rate": 0.00019476814191464386, "loss": 0.8983, "step": 401 }, { "epoch": 0.01117434903162591, "grad_norm": 0.08963775634765625, "learning_rate": 0.00019098300562505265, "loss": 0.8343, "step": 402 }, { "epoch": 0.011202145919764282, "grad_norm": 0.09050919860601425, "learning_rate": 0.0001872311248553974, "loss": 0.7674, "step": 403 }, { "epoch": 0.011229942807902656, "grad_norm": 0.1092870682477951, "learning_rate": 0.00018351265383080128, "loss": 1.0618, "step": 404 }, { "epoch": 0.011257739696041028, "grad_norm": 0.08485256880521774, "learning_rate": 0.00017982774540304403, "loss": 0.7261, "step": 405 }, { "epoch": 0.011285536584179402, "grad_norm": 0.08782031387090683, "learning_rate": 0.00017617655104427832, "loss": 0.7258, "step": 406 }, { "epoch": 0.011313333472317774, "grad_norm": 0.0893518328666687, "learning_rate": 0.00017255922084080368, "loss": 0.8466, "step": 407 }, { "epoch": 0.011341130360456147, "grad_norm": 0.07837007939815521, "learning_rate": 0.00016897590348689606, "loss": 0.6156, "step": 408 }, { "epoch": 0.011368927248594519, "grad_norm": 0.09375711530447006, "learning_rate": 0.00016542674627869735, "loss": 0.7362, "step": 409 }, { "epoch": 0.011396724136732893, "grad_norm": 0.08803148567676544, "learning_rate": 0.0001619118951081594, "loss": 0.8826, "step": 410 }, { "epoch": 0.011424521024871266, "grad_norm": 0.09359045326709747, "learning_rate": 0.00015843149445704684, "loss": 0.7686, "step": 411 }, { "epoch": 0.011452317913009638, "grad_norm": 0.09178245067596436, "learning_rate": 0.00015498568739099906, "loss": 0.7662, "step": 412 }, { "epoch": 0.011480114801148012, "grad_norm": 0.0961398333311081, "learning_rate": 0.0001515746155536477, "loss": 0.8347, "step": 413 }, { "epoch": 0.011507911689286384, "grad_norm": 0.1026514321565628, "learning_rate": 0.0001481984191607959, "loss": 0.8207, "step": 414 }, { "epoch": 0.011535708577424758, "grad_norm": 0.08573547005653381, "learning_rate": 0.0001448572369946539, "loss": 0.6231, "step": 415 }, { "epoch": 0.01156350546556313, "grad_norm": 0.09467485547065735, "learning_rate": 0.0001415512063981339, "loss": 0.9214, "step": 416 }, { "epoch": 0.011591302353701503, "grad_norm": 0.0945618599653244, "learning_rate": 0.00013828046326920496, "loss": 0.749, "step": 417 }, { "epoch": 0.011619099241839875, "grad_norm": 0.10449232906103134, "learning_rate": 0.0001350451420553065, "loss": 1.0501, "step": 418 }, { "epoch": 0.011646896129978249, "grad_norm": 0.09804502129554749, "learning_rate": 0.0001318453757478215, "loss": 0.6405, "step": 419 }, { "epoch": 0.011674693018116623, "grad_norm": 0.08781873434782028, "learning_rate": 0.0001286812958766106, "loss": 0.7123, "step": 420 }, { "epoch": 0.011702489906254995, "grad_norm": 0.09648067504167557, "learning_rate": 0.00012555303250460438, "loss": 0.8559, "step": 421 }, { "epoch": 0.011730286794393368, "grad_norm": 0.09019096195697784, "learning_rate": 0.00012246071422245718, "loss": 0.761, "step": 422 }, { "epoch": 0.01175808368253174, "grad_norm": 0.09508346021175385, "learning_rate": 0.000119404468143262, "loss": 0.7989, "step": 423 }, { "epoch": 0.011785880570670114, "grad_norm": 0.08918111771345139, "learning_rate": 0.00011638441989732473, "loss": 0.6767, "step": 424 }, { "epoch": 0.011813677458808486, "grad_norm": 0.09687741100788116, "learning_rate": 0.00011340069362699989, "loss": 0.7161, "step": 425 }, { "epoch": 0.01184147434694686, "grad_norm": 0.10025037080049515, "learning_rate": 0.00011045341198158831, "loss": 0.6706, "step": 426 }, { "epoch": 0.011869271235085231, "grad_norm": 0.09664606302976608, "learning_rate": 0.00010754269611229428, "loss": 0.6177, "step": 427 }, { "epoch": 0.011897068123223605, "grad_norm": 0.09703200310468674, "learning_rate": 0.00010466866566724697, "loss": 0.6235, "step": 428 }, { "epoch": 0.011924865011361979, "grad_norm": 0.09958402812480927, "learning_rate": 0.00010183143878658097, "loss": 0.6999, "step": 429 }, { "epoch": 0.01195266189950035, "grad_norm": 0.0950227677822113, "learning_rate": 9.903113209758097e-05, "loss": 0.6983, "step": 430 }, { "epoch": 0.011980458787638724, "grad_norm": 0.09548084437847137, "learning_rate": 9.626786070988657e-05, "loss": 0.609, "step": 431 }, { "epoch": 0.012008255675777096, "grad_norm": 0.0920906737446785, "learning_rate": 9.354173821076184e-05, "loss": 0.6281, "step": 432 }, { "epoch": 0.01203605256391547, "grad_norm": 0.09439770877361298, "learning_rate": 9.085287666042507e-05, "loss": 0.6777, "step": 433 }, { "epoch": 0.012063849452053842, "grad_norm": 0.08835854381322861, "learning_rate": 8.820138658744304e-05, "loss": 0.4624, "step": 434 }, { "epoch": 0.012091646340192216, "grad_norm": 0.09508516639471054, "learning_rate": 8.558737698418762e-05, "loss": 0.5111, "step": 435 }, { "epoch": 0.012119443228330588, "grad_norm": 0.10829413682222366, "learning_rate": 8.301095530235491e-05, "loss": 0.9261, "step": 436 }, { "epoch": 0.012147240116468961, "grad_norm": 0.11397778987884521, "learning_rate": 8.047222744854943e-05, "loss": 0.6988, "step": 437 }, { "epoch": 0.012175037004607335, "grad_norm": 0.12012533843517303, "learning_rate": 7.79712977799295e-05, "loss": 0.6691, "step": 438 }, { "epoch": 0.012202833892745707, "grad_norm": 0.12243448197841644, "learning_rate": 7.550826909991859e-05, "loss": 0.649, "step": 439 }, { "epoch": 0.01223063078088408, "grad_norm": 0.10324777662754059, "learning_rate": 7.308324265397836e-05, "loss": 0.5844, "step": 440 }, { "epoch": 0.012258427669022453, "grad_norm": 0.10553352534770966, "learning_rate": 7.069631812544808e-05, "loss": 0.4693, "step": 441 }, { "epoch": 0.012286224557160826, "grad_norm": 0.11548332124948502, "learning_rate": 6.834759363144594e-05, "loss": 0.6917, "step": 442 }, { "epoch": 0.012314021445299198, "grad_norm": 0.12441360205411911, "learning_rate": 6.603716571883689e-05, "loss": 0.7703, "step": 443 }, { "epoch": 0.012341818333437572, "grad_norm": 0.12251102924346924, "learning_rate": 6.37651293602628e-05, "loss": 0.6202, "step": 444 }, { "epoch": 0.012369615221575944, "grad_norm": 0.13246478140354156, "learning_rate": 6.153157795023956e-05, "loss": 0.6897, "step": 445 }, { "epoch": 0.012397412109714318, "grad_norm": 0.12585890293121338, "learning_rate": 5.9336603301317516e-05, "loss": 0.5828, "step": 446 }, { "epoch": 0.012425208997852691, "grad_norm": 0.1446872502565384, "learning_rate": 5.718029564030702e-05, "loss": 0.5442, "step": 447 }, { "epoch": 0.012453005885991063, "grad_norm": 0.14832744002342224, "learning_rate": 5.5062743604570865e-05, "loss": 0.5683, "step": 448 }, { "epoch": 0.012480802774129437, "grad_norm": 0.1458773910999298, "learning_rate": 5.298403423837883e-05, "loss": 0.4833, "step": 449 }, { "epoch": 0.012508599662267809, "grad_norm": 0.19131259620189667, "learning_rate": 5.094425298933136e-05, "loss": 0.507, "step": 450 }, { "epoch": 0.012508599662267809, "eval_loss": 0.6939424276351929, "eval_runtime": 49.9667, "eval_samples_per_second": 11.548, "eval_steps_per_second": 5.784, "step": 450 }, { "epoch": 0.012536396550406182, "grad_norm": 0.08057525753974915, "learning_rate": 4.894348370484647e-05, "loss": 0.8512, "step": 451 }, { "epoch": 0.012564193438544554, "grad_norm": 0.08833472430706024, "learning_rate": 4.698180862871282e-05, "loss": 0.995, "step": 452 }, { "epoch": 0.012591990326682928, "grad_norm": 0.08962654322385788, "learning_rate": 4.505930839770966e-05, "loss": 0.9639, "step": 453 }, { "epoch": 0.0126197872148213, "grad_norm": 0.08324563503265381, "learning_rate": 4.3176062038291274e-05, "loss": 0.7867, "step": 454 }, { "epoch": 0.012647584102959674, "grad_norm": 0.0913391187787056, "learning_rate": 4.1332146963339423e-05, "loss": 0.7687, "step": 455 }, { "epoch": 0.012675380991098047, "grad_norm": 0.08055524528026581, "learning_rate": 3.952763896898071e-05, "loss": 0.7876, "step": 456 }, { "epoch": 0.01270317787923642, "grad_norm": 0.08405828475952148, "learning_rate": 3.776261223147126e-05, "loss": 0.7312, "step": 457 }, { "epoch": 0.012730974767374793, "grad_norm": 0.08189340680837631, "learning_rate": 3.603713930414676e-05, "loss": 0.7638, "step": 458 }, { "epoch": 0.012758771655513165, "grad_norm": 0.08689261227846146, "learning_rate": 3.435129111444113e-05, "loss": 0.7503, "step": 459 }, { "epoch": 0.012786568543651539, "grad_norm": 0.08499288558959961, "learning_rate": 3.270513696097055e-05, "loss": 0.7546, "step": 460 }, { "epoch": 0.01281436543178991, "grad_norm": 0.08908785879611969, "learning_rate": 3.109874451068473e-05, "loss": 0.7841, "step": 461 }, { "epoch": 0.012842162319928284, "grad_norm": 0.08109744638204575, "learning_rate": 2.9532179796085356e-05, "loss": 0.6244, "step": 462 }, { "epoch": 0.012869959208066656, "grad_norm": 0.08815158903598785, "learning_rate": 2.800550721251216e-05, "loss": 0.697, "step": 463 }, { "epoch": 0.01289775609620503, "grad_norm": 0.09392455220222473, "learning_rate": 2.6518789515495355e-05, "loss": 0.6835, "step": 464 }, { "epoch": 0.012925552984343404, "grad_norm": 0.1170358955860138, "learning_rate": 2.5072087818176382e-05, "loss": 0.938, "step": 465 }, { "epoch": 0.012953349872481775, "grad_norm": 0.09133084863424301, "learning_rate": 2.36654615887959e-05, "loss": 0.7392, "step": 466 }, { "epoch": 0.01298114676062015, "grad_norm": 0.09215465933084488, "learning_rate": 2.2298968648248653e-05, "loss": 0.8161, "step": 467 }, { "epoch": 0.013008943648758521, "grad_norm": 0.08623038232326508, "learning_rate": 2.0972665167707127e-05, "loss": 0.7643, "step": 468 }, { "epoch": 0.013036740536896895, "grad_norm": 0.08695585280656815, "learning_rate": 1.968660566631275e-05, "loss": 0.6102, "step": 469 }, { "epoch": 0.013064537425035267, "grad_norm": 0.08945546299219131, "learning_rate": 1.844084300893456e-05, "loss": 0.7888, "step": 470 }, { "epoch": 0.01309233431317364, "grad_norm": 0.09600325673818588, "learning_rate": 1.7235428403996167e-05, "loss": 0.8451, "step": 471 }, { "epoch": 0.013120131201312012, "grad_norm": 0.08915964514017105, "learning_rate": 1.6070411401370334e-05, "loss": 0.6192, "step": 472 }, { "epoch": 0.013147928089450386, "grad_norm": 0.1044343113899231, "learning_rate": 1.494583989034326e-05, "loss": 0.6746, "step": 473 }, { "epoch": 0.01317572497758876, "grad_norm": 0.0989852100610733, "learning_rate": 1.386176009764506e-05, "loss": 0.7388, "step": 474 }, { "epoch": 0.013203521865727132, "grad_norm": 0.11182911694049835, "learning_rate": 1.2818216585549825e-05, "loss": 0.9093, "step": 475 }, { "epoch": 0.013231318753865505, "grad_norm": 0.10432388633489609, "learning_rate": 1.1815252250044316e-05, "loss": 0.6842, "step": 476 }, { "epoch": 0.013259115642003877, "grad_norm": 0.11141140758991241, "learning_rate": 1.0852908319063826e-05, "loss": 0.6488, "step": 477 }, { "epoch": 0.013286912530142251, "grad_norm": 0.10428661853075027, "learning_rate": 9.931224350798185e-06, "loss": 0.7708, "step": 478 }, { "epoch": 0.013314709418280623, "grad_norm": 0.10303416848182678, "learning_rate": 9.0502382320653e-06, "loss": 0.6728, "step": 479 }, { "epoch": 0.013342506306418997, "grad_norm": 0.10627992451190948, "learning_rate": 8.209986176753947e-06, "loss": 0.6125, "step": 480 }, { "epoch": 0.013370303194557369, "grad_norm": 0.09629444032907486, "learning_rate": 7.4105027243349665e-06, "loss": 0.6386, "step": 481 }, { "epoch": 0.013398100082695742, "grad_norm": 0.09442020207643509, "learning_rate": 6.65182073844195e-06, "loss": 0.5644, "step": 482 }, { "epoch": 0.013425896970834116, "grad_norm": 0.0994250699877739, "learning_rate": 5.933971405519656e-06, "loss": 0.6243, "step": 483 }, { "epoch": 0.013453693858972488, "grad_norm": 0.1111208125948906, "learning_rate": 5.256984233542595e-06, "loss": 0.7222, "step": 484 }, { "epoch": 0.013481490747110862, "grad_norm": 0.11001411825418472, "learning_rate": 4.6208870508017695e-06, "loss": 0.693, "step": 485 }, { "epoch": 0.013509287635249233, "grad_norm": 0.0949028953909874, "learning_rate": 4.025706004760932e-06, "loss": 0.6378, "step": 486 }, { "epoch": 0.013537084523387607, "grad_norm": 0.10430373251438141, "learning_rate": 3.471465560981768e-06, "loss": 0.5924, "step": 487 }, { "epoch": 0.013564881411525979, "grad_norm": 0.09569145739078522, "learning_rate": 2.958188502118153e-06, "loss": 0.5534, "step": 488 }, { "epoch": 0.013592678299664353, "grad_norm": 0.09768345206975937, "learning_rate": 2.4858959269794535e-06, "loss": 0.4815, "step": 489 }, { "epoch": 0.013620475187802725, "grad_norm": 0.09561553597450256, "learning_rate": 2.054607249663665e-06, "loss": 0.5637, "step": 490 }, { "epoch": 0.013648272075941098, "grad_norm": 0.11304374039173126, "learning_rate": 1.6643401987591622e-06, "loss": 0.5943, "step": 491 }, { "epoch": 0.013676068964079472, "grad_norm": 0.11450091749429703, "learning_rate": 1.3151108166156167e-06, "loss": 0.583, "step": 492 }, { "epoch": 0.013703865852217844, "grad_norm": 0.11334867030382156, "learning_rate": 1.0069334586854107e-06, "loss": 0.6076, "step": 493 }, { "epoch": 0.013731662740356218, "grad_norm": 0.1081780344247818, "learning_rate": 7.398207929323331e-07, "loss": 0.5119, "step": 494 }, { "epoch": 0.01375945962849459, "grad_norm": 0.11384209990501404, "learning_rate": 5.137837993121064e-07, "loss": 0.48, "step": 495 }, { "epoch": 0.013787256516632963, "grad_norm": 0.11100795120000839, "learning_rate": 3.2883176932019256e-07, "loss": 0.53, "step": 496 }, { "epoch": 0.013815053404771335, "grad_norm": 0.14853325486183167, "learning_rate": 1.8497230560998722e-07, "loss": 0.5982, "step": 497 }, { "epoch": 0.013842850292909709, "grad_norm": 0.1417522132396698, "learning_rate": 8.221132168073631e-08, "loss": 0.5259, "step": 498 }, { "epoch": 0.013870647181048081, "grad_norm": 0.15358008444309235, "learning_rate": 2.0553041633952775e-08, "loss": 0.5328, "step": 499 }, { "epoch": 0.013898444069186455, "grad_norm": 0.21747428178787231, "learning_rate": 0.0, "loss": 0.5217, "step": 500 }, { "epoch": 0.013898444069186455, "eval_loss": 0.6910951733589172, "eval_runtime": 50.0791, "eval_samples_per_second": 11.522, "eval_steps_per_second": 5.771, "step": 500 }, { "epoch": 0.013926240957324828, "grad_norm": 0.08321020752191544, "learning_rate": 0.0010126929627961897, "loss": 0.7746, "step": 501 }, { "epoch": 0.0139540378454632, "grad_norm": 0.08852660655975342, "learning_rate": 0.0010095198339395769, "loss": 0.827, "step": 502 }, { "epoch": 0.013981834733601574, "grad_norm": 0.0893716886639595, "learning_rate": 0.001006346609218342, "loss": 0.9232, "step": 503 }, { "epoch": 0.014009631621739946, "grad_norm": 0.07978686690330505, "learning_rate": 0.0010031733205868223, "loss": 0.7457, "step": 504 }, { "epoch": 0.01403742850987832, "grad_norm": 0.08640889078378677, "learning_rate": 0.001, "loss": 0.7378, "step": 505 }, { "epoch": 0.014065225398016691, "grad_norm": 0.7002074718475342, "learning_rate": 0.0009968266794131778, "loss": 1.0428, "step": 506 }, { "epoch": 0.014093022286155065, "grad_norm": 0.0949423760175705, "learning_rate": 0.0009936533907816583, "loss": 0.7472, "step": 507 }, { "epoch": 0.014120819174293437, "grad_norm": 0.0923413634300232, "learning_rate": 0.0009904801660604234, "loss": 0.7292, "step": 508 }, { "epoch": 0.01414861606243181, "grad_norm": 0.10457539558410645, "learning_rate": 0.0009873070372038105, "loss": 0.8122, "step": 509 }, { "epoch": 0.014176412950570184, "grad_norm": 0.1096276342868805, "learning_rate": 0.000984134036165192, "loss": 0.7629, "step": 510 }, { "epoch": 0.014204209838708556, "grad_norm": 0.10207544267177582, "learning_rate": 0.0009809611948966533, "loss": 0.6959, "step": 511 }, { "epoch": 0.01423200672684693, "grad_norm": 0.09525441378355026, "learning_rate": 0.0009777885453486706, "loss": 0.6516, "step": 512 }, { "epoch": 0.014259803614985302, "grad_norm": 0.09446575492620468, "learning_rate": 0.0009746161194697894, "loss": 0.7135, "step": 513 }, { "epoch": 0.014287600503123676, "grad_norm": 0.09206244349479675, "learning_rate": 0.0009714439492063038, "loss": 0.6548, "step": 514 }, { "epoch": 0.014315397391262048, "grad_norm": 0.10186533629894257, "learning_rate": 0.0009682720665019326, "loss": 0.8844, "step": 515 }, { "epoch": 0.014343194279400421, "grad_norm": 0.10673288255929947, "learning_rate": 0.0009651005032974994, "loss": 0.6967, "step": 516 }, { "epoch": 0.014370991167538793, "grad_norm": 0.09592098742723465, "learning_rate": 0.0009619292915306101, "loss": 0.7879, "step": 517 }, { "epoch": 0.014398788055677167, "grad_norm": 0.09870214015245438, "learning_rate": 0.0009587584631353329, "loss": 0.7412, "step": 518 }, { "epoch": 0.01442658494381554, "grad_norm": 0.1019091010093689, "learning_rate": 0.0009555880500418739, "loss": 0.7904, "step": 519 }, { "epoch": 0.014454381831953913, "grad_norm": 0.1054217666387558, "learning_rate": 0.0009524180841762578, "loss": 0.851, "step": 520 }, { "epoch": 0.014482178720092286, "grad_norm": 0.09960552304983139, "learning_rate": 0.0009492485974600059, "loss": 0.8736, "step": 521 }, { "epoch": 0.014509975608230658, "grad_norm": 0.09485925734043121, "learning_rate": 0.0009460796218098142, "loss": 0.7327, "step": 522 }, { "epoch": 0.014537772496369032, "grad_norm": 0.10374409705400467, "learning_rate": 0.000942911189137232, "loss": 0.6675, "step": 523 }, { "epoch": 0.014565569384507404, "grad_norm": 0.11568469554185867, "learning_rate": 0.0009397433313483417, "loss": 0.7675, "step": 524 }, { "epoch": 0.014593366272645777, "grad_norm": 0.11395532637834549, "learning_rate": 0.0009365760803434355, "loss": 0.7988, "step": 525 }, { "epoch": 0.01462116316078415, "grad_norm": 0.10159295052289963, "learning_rate": 0.0009334094680166961, "loss": 0.6681, "step": 526 }, { "epoch": 0.014648960048922523, "grad_norm": 0.0982503890991211, "learning_rate": 0.0009302435262558747, "loss": 0.5873, "step": 527 }, { "epoch": 0.014676756937060897, "grad_norm": 0.10180187225341797, "learning_rate": 0.0009270782869419693, "loss": 0.5806, "step": 528 }, { "epoch": 0.014704553825199269, "grad_norm": 0.1152288019657135, "learning_rate": 0.0009239137819489048, "loss": 0.6191, "step": 529 }, { "epoch": 0.014732350713337642, "grad_norm": 0.11095945537090302, "learning_rate": 0.0009207500431432115, "loss": 0.6781, "step": 530 }, { "epoch": 0.014760147601476014, "grad_norm": 0.11314280331134796, "learning_rate": 0.0009175871023837041, "loss": 0.5022, "step": 531 }, { "epoch": 0.014787944489614388, "grad_norm": 0.11200802028179169, "learning_rate": 0.0009144249915211606, "loss": 0.6996, "step": 532 }, { "epoch": 0.01481574137775276, "grad_norm": 0.11137256771326065, "learning_rate": 0.0009112637423980021, "loss": 0.7555, "step": 533 }, { "epoch": 0.014843538265891134, "grad_norm": 0.10772709548473358, "learning_rate": 0.0009081033868479727, "loss": 0.8015, "step": 534 }, { "epoch": 0.014871335154029506, "grad_norm": 0.1146487444639206, "learning_rate": 0.0009049439566958176, "loss": 0.7214, "step": 535 }, { "epoch": 0.01489913204216788, "grad_norm": 0.11040794104337692, "learning_rate": 0.0009017854837569629, "loss": 0.6903, "step": 536 }, { "epoch": 0.014926928930306253, "grad_norm": 0.10526284575462341, "learning_rate": 0.0008986279998371967, "loss": 0.5842, "step": 537 }, { "epoch": 0.014954725818444625, "grad_norm": 0.10294964164495468, "learning_rate": 0.0008954715367323467, "loss": 0.6202, "step": 538 }, { "epoch": 0.014982522706582999, "grad_norm": 0.11229365319013596, "learning_rate": 0.0008923161262279611, "loss": 0.7023, "step": 539 }, { "epoch": 0.01501031959472137, "grad_norm": 0.11053290218114853, "learning_rate": 0.0008891618000989891, "loss": 0.6683, "step": 540 }, { "epoch": 0.015038116482859744, "grad_norm": 0.11226194351911545, "learning_rate": 0.0008860085901094594, "loss": 0.5432, "step": 541 }, { "epoch": 0.015065913370998116, "grad_norm": 0.12068658322095871, "learning_rate": 0.0008828565280121618, "loss": 0.5759, "step": 542 }, { "epoch": 0.01509371025913649, "grad_norm": 0.11296896636486053, "learning_rate": 0.0008797056455483266, "loss": 0.6444, "step": 543 }, { "epoch": 0.015121507147274862, "grad_norm": 0.14195378124713898, "learning_rate": 0.0008765559744473053, "loss": 0.5927, "step": 544 }, { "epoch": 0.015149304035413235, "grad_norm": 0.13548904657363892, "learning_rate": 0.0008734075464262507, "loss": 0.551, "step": 545 }, { "epoch": 0.015177100923551609, "grad_norm": 0.14056935906410217, "learning_rate": 0.0008702603931897982, "loss": 0.6382, "step": 546 }, { "epoch": 0.015204897811689981, "grad_norm": 0.16045929491519928, "learning_rate": 0.000867114546429746, "loss": 0.7804, "step": 547 }, { "epoch": 0.015232694699828355, "grad_norm": 0.15424484014511108, "learning_rate": 0.000863970037824736, "loss": 0.5277, "step": 548 }, { "epoch": 0.015260491587966727, "grad_norm": 0.14988261461257935, "learning_rate": 0.0008608268990399348, "loss": 0.6501, "step": 549 }, { "epoch": 0.0152882884761051, "grad_norm": 0.1926291584968567, "learning_rate": 0.000857685161726715, "loss": 0.7122, "step": 550 }, { "epoch": 0.0152882884761051, "eval_loss": 0.7408689260482788, "eval_runtime": 50.3685, "eval_samples_per_second": 11.456, "eval_steps_per_second": 5.738, "step": 550 }, { "epoch": 0.015316085364243472, "grad_norm": 0.11252893507480621, "learning_rate": 0.0008545448575223368, "loss": 0.9003, "step": 551 }, { "epoch": 0.015343882252381846, "grad_norm": 0.11723627150058746, "learning_rate": 0.0008514060180496285, "loss": 0.9084, "step": 552 }, { "epoch": 0.015371679140520218, "grad_norm": 0.10663684457540512, "learning_rate": 0.0008482686749166685, "loss": 0.8746, "step": 553 }, { "epoch": 0.015399476028658592, "grad_norm": 0.09707697480916977, "learning_rate": 0.0008451328597164678, "loss": 0.8142, "step": 554 }, { "epoch": 0.015427272916796965, "grad_norm": 0.09315719455480576, "learning_rate": 0.00084199860402665, "loss": 0.8085, "step": 555 }, { "epoch": 0.015455069804935337, "grad_norm": 0.09040182083845139, "learning_rate": 0.000838865939409136, "loss": 0.8812, "step": 556 }, { "epoch": 0.015482866693073711, "grad_norm": 0.09035031497478485, "learning_rate": 0.0008357348974098231, "loss": 0.7768, "step": 557 }, { "epoch": 0.015510663581212083, "grad_norm": 0.09659522771835327, "learning_rate": 0.0008326055095582694, "loss": 0.8016, "step": 558 }, { "epoch": 0.015538460469350457, "grad_norm": 0.09107272326946259, "learning_rate": 0.0008294778073673762, "loss": 0.6744, "step": 559 }, { "epoch": 0.015566257357488828, "grad_norm": 0.09538799524307251, "learning_rate": 0.0008263518223330697, "loss": 0.6448, "step": 560 }, { "epoch": 0.015594054245627202, "grad_norm": 0.10772833228111267, "learning_rate": 0.0008232275859339842, "loss": 0.805, "step": 561 }, { "epoch": 0.015621851133765574, "grad_norm": 0.09896623343229294, "learning_rate": 0.0008201051296311461, "loss": 0.7166, "step": 562 }, { "epoch": 0.01564964802190395, "grad_norm": 0.0901743695139885, "learning_rate": 0.0008169844848676553, "loss": 0.6907, "step": 563 }, { "epoch": 0.01567744491004232, "grad_norm": 0.10416441410779953, "learning_rate": 0.00081386568306837, "loss": 0.8327, "step": 564 }, { "epoch": 0.015705241798180693, "grad_norm": 0.09228041023015976, "learning_rate": 0.0008107487556395902, "loss": 0.7169, "step": 565 }, { "epoch": 0.015733038686319065, "grad_norm": 0.10955098271369934, "learning_rate": 0.0008076337339687394, "loss": 0.8663, "step": 566 }, { "epoch": 0.01576083557445744, "grad_norm": 0.07800483703613281, "learning_rate": 0.000804520649424052, "loss": 0.5992, "step": 567 }, { "epoch": 0.015788632462595813, "grad_norm": 0.11922062933444977, "learning_rate": 0.0008014095333542549, "loss": 0.8901, "step": 568 }, { "epoch": 0.015816429350734185, "grad_norm": 0.10293685644865036, "learning_rate": 0.0007983004170882518, "loss": 0.6794, "step": 569 }, { "epoch": 0.015844226238872557, "grad_norm": 0.09084783494472504, "learning_rate": 0.0007951933319348094, "loss": 0.688, "step": 570 }, { "epoch": 0.015872023127010932, "grad_norm": 0.09098474681377411, "learning_rate": 0.0007920883091822408, "loss": 0.6813, "step": 571 }, { "epoch": 0.015899820015149304, "grad_norm": 0.10126250982284546, "learning_rate": 0.0007889853800980904, "loss": 0.6236, "step": 572 }, { "epoch": 0.015927616903287676, "grad_norm": 0.1058661937713623, "learning_rate": 0.0007858845759288198, "loss": 0.6656, "step": 573 }, { "epoch": 0.01595541379142605, "grad_norm": 0.09901162981987, "learning_rate": 0.0007827859278994924, "loss": 0.8116, "step": 574 }, { "epoch": 0.015983210679564423, "grad_norm": 0.10155469924211502, "learning_rate": 0.0007796894672134593, "loss": 0.5172, "step": 575 }, { "epoch": 0.016011007567702795, "grad_norm": 0.09873487055301666, "learning_rate": 0.0007765952250520458, "loss": 0.5784, "step": 576 }, { "epoch": 0.016038804455841167, "grad_norm": 0.11075747758150101, "learning_rate": 0.0007735032325742355, "loss": 0.8474, "step": 577 }, { "epoch": 0.016066601343979543, "grad_norm": 0.11609680950641632, "learning_rate": 0.0007704135209163588, "loss": 0.7064, "step": 578 }, { "epoch": 0.016094398232117915, "grad_norm": 0.12410083413124084, "learning_rate": 0.0007673261211917776, "loss": 0.7152, "step": 579 }, { "epoch": 0.016122195120256286, "grad_norm": 0.11199633032083511, "learning_rate": 0.0007642410644905726, "loss": 0.6268, "step": 580 }, { "epoch": 0.016149992008394662, "grad_norm": 0.10388398915529251, "learning_rate": 0.000761158381879231, "loss": 0.5756, "step": 581 }, { "epoch": 0.016177788896533034, "grad_norm": 0.10880818963050842, "learning_rate": 0.0007580781044003324, "loss": 0.6587, "step": 582 }, { "epoch": 0.016205585784671406, "grad_norm": 0.12266401201486588, "learning_rate": 0.0007550002630722365, "loss": 0.7465, "step": 583 }, { "epoch": 0.016233382672809778, "grad_norm": 0.09996026754379272, "learning_rate": 0.0007519248888887716, "loss": 0.6004, "step": 584 }, { "epoch": 0.016261179560948153, "grad_norm": 0.11434061080217361, "learning_rate": 0.0007488520128189209, "loss": 0.6901, "step": 585 }, { "epoch": 0.016288976449086525, "grad_norm": 0.11347367614507675, "learning_rate": 0.0007457816658065133, "loss": 0.642, "step": 586 }, { "epoch": 0.016316773337224897, "grad_norm": 0.11877556145191193, "learning_rate": 0.0007427138787699086, "loss": 0.6851, "step": 587 }, { "epoch": 0.01634457022536327, "grad_norm": 0.11316651105880737, "learning_rate": 0.0007396486826016879, "loss": 0.5304, "step": 588 }, { "epoch": 0.016372367113501644, "grad_norm": 0.10818410664796829, "learning_rate": 0.0007365861081683433, "loss": 0.5677, "step": 589 }, { "epoch": 0.016400164001640016, "grad_norm": 0.11229270696640015, "learning_rate": 0.0007335261863099651, "loss": 0.5638, "step": 590 }, { "epoch": 0.016427960889778388, "grad_norm": 0.10760554671287537, "learning_rate": 0.0007304689478399323, "loss": 0.5585, "step": 591 }, { "epoch": 0.016455757777916764, "grad_norm": 0.11895162612199783, "learning_rate": 0.0007274144235446023, "loss": 0.6377, "step": 592 }, { "epoch": 0.016483554666055136, "grad_norm": 0.12675324082374573, "learning_rate": 0.0007243626441830009, "loss": 0.7035, "step": 593 }, { "epoch": 0.016511351554193508, "grad_norm": 0.1268201619386673, "learning_rate": 0.0007213136404865124, "loss": 0.5686, "step": 594 }, { "epoch": 0.01653914844233188, "grad_norm": 0.11473376303911209, "learning_rate": 0.0007182674431585703, "loss": 0.4511, "step": 595 }, { "epoch": 0.016566945330470255, "grad_norm": 0.133954718708992, "learning_rate": 0.0007152240828743477, "loss": 0.5455, "step": 596 }, { "epoch": 0.016594742218608627, "grad_norm": 0.12625186145305634, "learning_rate": 0.000712183590280449, "loss": 0.5237, "step": 597 }, { "epoch": 0.016622539106747, "grad_norm": 0.16250957548618317, "learning_rate": 0.0007091459959946009, "loss": 0.5333, "step": 598 }, { "epoch": 0.016650335994885374, "grad_norm": 0.18788903951644897, "learning_rate": 0.0007061113306053443, "loss": 0.8662, "step": 599 }, { "epoch": 0.016678132883023746, "grad_norm": 0.18305297195911407, "learning_rate": 0.0007030796246717255, "loss": 0.6526, "step": 600 }, { "epoch": 0.016678132883023746, "eval_loss": 0.7191578149795532, "eval_runtime": 50.3974, "eval_samples_per_second": 11.449, "eval_steps_per_second": 5.734, "step": 600 }, { "epoch": 0.016705929771162118, "grad_norm": 0.11464305222034454, "learning_rate": 0.0007000509087229895, "loss": 0.9103, "step": 601 }, { "epoch": 0.01673372665930049, "grad_norm": 0.09989949315786362, "learning_rate": 0.0006970252132582728, "loss": 0.8176, "step": 602 }, { "epoch": 0.016761523547438865, "grad_norm": 0.09268354624509811, "learning_rate": 0.0006940025687462952, "loss": 0.7983, "step": 603 }, { "epoch": 0.016789320435577237, "grad_norm": 0.09152427315711975, "learning_rate": 0.0006909830056250527, "loss": 0.7975, "step": 604 }, { "epoch": 0.01681711732371561, "grad_norm": 0.09386585652828217, "learning_rate": 0.000687966554301513, "loss": 0.8584, "step": 605 }, { "epoch": 0.01684491421185398, "grad_norm": 0.08477571606636047, "learning_rate": 0.0006849532451513074, "loss": 0.7387, "step": 606 }, { "epoch": 0.016872711099992357, "grad_norm": 0.08988666534423828, "learning_rate": 0.0006819431085184251, "loss": 0.8264, "step": 607 }, { "epoch": 0.01690050798813073, "grad_norm": 0.09714596718549728, "learning_rate": 0.0006789361747149092, "loss": 0.9452, "step": 608 }, { "epoch": 0.0169283048762691, "grad_norm": 0.10461269319057465, "learning_rate": 0.0006759324740205494, "loss": 0.7174, "step": 609 }, { "epoch": 0.016956101764407476, "grad_norm": 0.09161835163831711, "learning_rate": 0.0006729320366825784, "loss": 0.796, "step": 610 }, { "epoch": 0.016983898652545848, "grad_norm": 0.0949753150343895, "learning_rate": 0.0006699348929153668, "loss": 0.975, "step": 611 }, { "epoch": 0.01701169554068422, "grad_norm": 0.09028909355401993, "learning_rate": 0.0006669410729001193, "loss": 0.7738, "step": 612 }, { "epoch": 0.017039492428822592, "grad_norm": 0.08454867452383041, "learning_rate": 0.0006639506067845697, "loss": 0.7062, "step": 613 }, { "epoch": 0.017067289316960967, "grad_norm": 0.10592840611934662, "learning_rate": 0.0006609635246826793, "loss": 0.7745, "step": 614 }, { "epoch": 0.01709508620509934, "grad_norm": 0.09267466515302658, "learning_rate": 0.0006579798566743314, "loss": 0.8491, "step": 615 }, { "epoch": 0.01712288309323771, "grad_norm": 0.10221099853515625, "learning_rate": 0.0006549996328050296, "loss": 0.9564, "step": 616 }, { "epoch": 0.017150679981376087, "grad_norm": 0.09640829265117645, "learning_rate": 0.000652022883085595, "loss": 0.6694, "step": 617 }, { "epoch": 0.01717847686951446, "grad_norm": 0.09555254131555557, "learning_rate": 0.0006490496374918646, "loss": 0.7825, "step": 618 }, { "epoch": 0.01720627375765283, "grad_norm": 0.1080060750246048, "learning_rate": 0.0006460799259643883, "loss": 0.8122, "step": 619 }, { "epoch": 0.017234070645791202, "grad_norm": 0.09308885037899017, "learning_rate": 0.0006431137784081283, "loss": 0.7393, "step": 620 }, { "epoch": 0.017261867533929578, "grad_norm": 0.10485529899597168, "learning_rate": 0.0006401512246921576, "loss": 0.7577, "step": 621 }, { "epoch": 0.01728966442206795, "grad_norm": 0.10300412029027939, "learning_rate": 0.0006371922946493591, "loss": 0.7016, "step": 622 }, { "epoch": 0.01731746131020632, "grad_norm": 0.09915035963058472, "learning_rate": 0.0006342370180761255, "loss": 0.7562, "step": 623 }, { "epoch": 0.017345258198344694, "grad_norm": 0.11094118654727936, "learning_rate": 0.0006312854247320594, "loss": 0.7113, "step": 624 }, { "epoch": 0.01737305508648307, "grad_norm": 0.09752795100212097, "learning_rate": 0.0006283375443396726, "loss": 0.7649, "step": 625 }, { "epoch": 0.01740085197462144, "grad_norm": 0.10030993074178696, "learning_rate": 0.0006253934065840879, "loss": 0.7446, "step": 626 }, { "epoch": 0.017428648862759813, "grad_norm": 0.1134578287601471, "learning_rate": 0.0006224530411127403, "loss": 0.8147, "step": 627 }, { "epoch": 0.01745644575089819, "grad_norm": 0.09963490813970566, "learning_rate": 0.000619516477535077, "loss": 0.6904, "step": 628 }, { "epoch": 0.01748424263903656, "grad_norm": 0.10086818039417267, "learning_rate": 0.0006165837454222607, "loss": 0.5791, "step": 629 }, { "epoch": 0.017512039527174932, "grad_norm": 0.11571143567562103, "learning_rate": 0.0006136548743068713, "loss": 0.7572, "step": 630 }, { "epoch": 0.017539836415313304, "grad_norm": 0.10508367419242859, "learning_rate": 0.0006107298936826086, "loss": 0.5869, "step": 631 }, { "epoch": 0.01756763330345168, "grad_norm": 0.1044749990105629, "learning_rate": 0.0006078088330039945, "loss": 0.595, "step": 632 }, { "epoch": 0.01759543019159005, "grad_norm": 0.1138482466340065, "learning_rate": 0.0006048917216860781, "loss": 0.668, "step": 633 }, { "epoch": 0.017623227079728424, "grad_norm": 0.10499613732099533, "learning_rate": 0.0006019785891041381, "loss": 0.6028, "step": 634 }, { "epoch": 0.0176510239678668, "grad_norm": 0.10078407824039459, "learning_rate": 0.0005990694645933865, "loss": 0.5796, "step": 635 }, { "epoch": 0.01767882085600517, "grad_norm": 0.09239528328180313, "learning_rate": 0.0005961643774486753, "loss": 0.5735, "step": 636 }, { "epoch": 0.017706617744143543, "grad_norm": 0.09768297523260117, "learning_rate": 0.0005932633569242, "loss": 0.5082, "step": 637 }, { "epoch": 0.017734414632281915, "grad_norm": 0.10613156110048294, "learning_rate": 0.0005903664322332048, "loss": 0.5554, "step": 638 }, { "epoch": 0.01776221152042029, "grad_norm": 0.10876414179801941, "learning_rate": 0.000587473632547689, "loss": 0.6091, "step": 639 }, { "epoch": 0.017790008408558662, "grad_norm": 0.10759898275136948, "learning_rate": 0.0005845849869981136, "loss": 0.5748, "step": 640 }, { "epoch": 0.017817805296697034, "grad_norm": 0.12154053151607513, "learning_rate": 0.0005817005246731073, "loss": 0.6063, "step": 641 }, { "epoch": 0.017845602184835406, "grad_norm": 0.11394521594047546, "learning_rate": 0.0005788202746191734, "loss": 0.6124, "step": 642 }, { "epoch": 0.01787339907297378, "grad_norm": 0.09602084010839462, "learning_rate": 0.0005759442658403985, "loss": 0.4391, "step": 643 }, { "epoch": 0.017901195961112153, "grad_norm": 0.12600000202655792, "learning_rate": 0.0005730725272981583, "loss": 0.6201, "step": 644 }, { "epoch": 0.017928992849250525, "grad_norm": 0.1129770576953888, "learning_rate": 0.0005702050879108284, "loss": 0.4814, "step": 645 }, { "epoch": 0.0179567897373889, "grad_norm": 0.121727854013443, "learning_rate": 0.0005673419765534915, "loss": 0.5071, "step": 646 }, { "epoch": 0.017984586625527273, "grad_norm": 0.11814267188310623, "learning_rate": 0.0005644832220576479, "loss": 0.5387, "step": 647 }, { "epoch": 0.018012383513665645, "grad_norm": 0.14177252352237701, "learning_rate": 0.0005616288532109225, "loss": 0.6006, "step": 648 }, { "epoch": 0.018040180401804017, "grad_norm": 0.17021676898002625, "learning_rate": 0.0005587788987567784, "loss": 0.5445, "step": 649 }, { "epoch": 0.018067977289942392, "grad_norm": 0.17510192096233368, "learning_rate": 0.0005559333873942258, "loss": 0.5694, "step": 650 }, { "epoch": 0.018067977289942392, "eval_loss": 0.7097320556640625, "eval_runtime": 50.365, "eval_samples_per_second": 11.456, "eval_steps_per_second": 5.738, "step": 650 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.150175529644851e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }