{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 140, "global_step": 560, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017857142857142857, "grad_norm": 0.14711466431617737, "learning_rate": 2e-05, "loss": 1.6301, "step": 1 }, { "epoch": 0.0035714285714285713, "grad_norm": 0.13304685056209564, "learning_rate": 4e-05, "loss": 1.6271, "step": 2 }, { "epoch": 0.005357142857142857, "grad_norm": 0.1233660876750946, "learning_rate": 6e-05, "loss": 1.6548, "step": 3 }, { "epoch": 0.007142857142857143, "grad_norm": 0.1381577104330063, "learning_rate": 8e-05, "loss": 1.7853, "step": 4 }, { "epoch": 0.008928571428571428, "grad_norm": 0.14668770134449005, "learning_rate": 0.0001, "loss": 1.7507, "step": 5 }, { "epoch": 0.010714285714285714, "grad_norm": 0.168854758143425, "learning_rate": 0.00012, "loss": 1.9486, "step": 6 }, { "epoch": 0.0125, "grad_norm": 0.2166246473789215, "learning_rate": 0.00014, "loss": 1.9167, "step": 7 }, { "epoch": 0.014285714285714285, "grad_norm": 0.23894698917865753, "learning_rate": 0.00016, "loss": 1.8384, "step": 8 }, { "epoch": 0.01607142857142857, "grad_norm": 0.2704271674156189, "learning_rate": 0.00018, "loss": 1.8232, "step": 9 }, { "epoch": 0.017857142857142856, "grad_norm": 0.24295401573181152, "learning_rate": 0.0002, "loss": 2.1559, "step": 10 }, { "epoch": 0.019642857142857142, "grad_norm": 0.23109205067157745, "learning_rate": 0.00019999836866486503, "loss": 1.7082, "step": 11 }, { "epoch": 0.02142857142857143, "grad_norm": 0.205198273062706, "learning_rate": 0.00019999347471268516, "loss": 1.8417, "step": 12 }, { "epoch": 0.023214285714285715, "grad_norm": 0.2678169012069702, "learning_rate": 0.00019998531830313395, "loss": 1.9222, "step": 13 }, { "epoch": 0.025, "grad_norm": 0.2563900053501129, "learning_rate": 0.0001999738997023281, "loss": 1.8931, "step": 14 }, { "epoch": 0.026785714285714284, "grad_norm": 0.31391921639442444, "learning_rate": 0.00019995921928281894, "loss": 1.9403, "step": 15 }, { "epoch": 0.02857142857142857, "grad_norm": 0.23522017896175385, "learning_rate": 0.00019994127752358013, "loss": 1.8728, "step": 16 }, { "epoch": 0.030357142857142857, "grad_norm": 0.2517368495464325, "learning_rate": 0.00019992007500999214, "loss": 1.9224, "step": 17 }, { "epoch": 0.03214285714285714, "grad_norm": 0.27111852169036865, "learning_rate": 0.00019989561243382312, "loss": 1.9046, "step": 18 }, { "epoch": 0.033928571428571426, "grad_norm": 0.2616860568523407, "learning_rate": 0.00019986789059320615, "loss": 2.1173, "step": 19 }, { "epoch": 0.03571428571428571, "grad_norm": 0.23268847167491913, "learning_rate": 0.00019983691039261357, "loss": 1.8406, "step": 20 }, { "epoch": 0.0375, "grad_norm": 0.30090174078941345, "learning_rate": 0.00019980267284282717, "loss": 1.7514, "step": 21 }, { "epoch": 0.039285714285714285, "grad_norm": 0.23314997553825378, "learning_rate": 0.00019976517906090529, "loss": 1.7518, "step": 22 }, { "epoch": 0.04107142857142857, "grad_norm": 0.2630898952484131, "learning_rate": 0.0001997244302701464, "loss": 1.7533, "step": 23 }, { "epoch": 0.04285714285714286, "grad_norm": 0.3203246593475342, "learning_rate": 0.00019968042780004917, "loss": 1.534, "step": 24 }, { "epoch": 0.044642857142857144, "grad_norm": 0.48179084062576294, "learning_rate": 0.00019963317308626914, "loss": 1.4211, "step": 25 }, { "epoch": 0.04642857142857143, "grad_norm": 0.36420196294784546, "learning_rate": 0.0001995826676705718, "loss": 1.4275, "step": 26 }, { "epoch": 0.048214285714285716, "grad_norm": 0.35330235958099365, "learning_rate": 0.00019952891320078236, "loss": 1.5612, "step": 27 }, { "epoch": 0.05, "grad_norm": 0.27140146493911743, "learning_rate": 0.00019947191143073186, "loss": 1.6001, "step": 28 }, { "epoch": 0.05178571428571429, "grad_norm": 0.2987808883190155, "learning_rate": 0.00019941166422020014, "loss": 1.3943, "step": 29 }, { "epoch": 0.05357142857142857, "grad_norm": 0.3032172918319702, "learning_rate": 0.00019934817353485501, "loss": 1.4888, "step": 30 }, { "epoch": 0.055357142857142855, "grad_norm": 0.6096982359886169, "learning_rate": 0.00019928144144618824, "loss": 1.7057, "step": 31 }, { "epoch": 0.05714285714285714, "grad_norm": 0.30788663029670715, "learning_rate": 0.0001992114701314478, "loss": 1.3929, "step": 32 }, { "epoch": 0.05892857142857143, "grad_norm": 0.33283260464668274, "learning_rate": 0.00019913826187356696, "loss": 1.5611, "step": 33 }, { "epoch": 0.060714285714285714, "grad_norm": 0.3044683337211609, "learning_rate": 0.00019906181906108984, "loss": 1.2961, "step": 34 }, { "epoch": 0.0625, "grad_norm": 0.3300105929374695, "learning_rate": 0.0001989821441880933, "loss": 1.3975, "step": 35 }, { "epoch": 0.06428571428571428, "grad_norm": 0.3126329481601715, "learning_rate": 0.00019889923985410576, "loss": 1.2759, "step": 36 }, { "epoch": 0.06607142857142857, "grad_norm": 0.33685699105262756, "learning_rate": 0.00019881310876402223, "loss": 1.162, "step": 37 }, { "epoch": 0.06785714285714285, "grad_norm": 0.3971424996852875, "learning_rate": 0.0001987237537280163, "loss": 1.21, "step": 38 }, { "epoch": 0.06964285714285715, "grad_norm": 0.43255800008773804, "learning_rate": 0.00019863117766144806, "loss": 1.0186, "step": 39 }, { "epoch": 0.07142857142857142, "grad_norm": 0.4930873215198517, "learning_rate": 0.00019853538358476932, "loss": 1.0045, "step": 40 }, { "epoch": 0.07321428571428572, "grad_norm": 0.4585554599761963, "learning_rate": 0.00019843637462342497, "loss": 1.1077, "step": 41 }, { "epoch": 0.075, "grad_norm": 0.4778697192668915, "learning_rate": 0.00019833415400775093, "loss": 1.0817, "step": 42 }, { "epoch": 0.07678571428571429, "grad_norm": 0.4595739245414734, "learning_rate": 0.0001982287250728689, "loss": 1.2805, "step": 43 }, { "epoch": 0.07857142857142857, "grad_norm": 0.5665370225906372, "learning_rate": 0.00019812009125857728, "loss": 0.9855, "step": 44 }, { "epoch": 0.08035714285714286, "grad_norm": 0.40399685502052307, "learning_rate": 0.00019800825610923934, "loss": 1.3264, "step": 45 }, { "epoch": 0.08214285714285714, "grad_norm": 0.43560901284217834, "learning_rate": 0.00019789322327366723, "loss": 1.2724, "step": 46 }, { "epoch": 0.08392857142857142, "grad_norm": 0.375598669052124, "learning_rate": 0.000197774996505003, "loss": 1.1722, "step": 47 }, { "epoch": 0.08571428571428572, "grad_norm": 0.4257534146308899, "learning_rate": 0.00019765357966059638, "loss": 1.3607, "step": 48 }, { "epoch": 0.0875, "grad_norm": 0.5437749028205872, "learning_rate": 0.0001975289767018786, "loss": 1.9812, "step": 49 }, { "epoch": 0.08928571428571429, "grad_norm": 0.6295837163925171, "learning_rate": 0.00019740119169423337, "loss": 2.1601, "step": 50 }, { "epoch": 0.09107142857142857, "grad_norm": 0.22341597080230713, "learning_rate": 0.00019727022880686412, "loss": 1.7096, "step": 51 }, { "epoch": 0.09285714285714286, "grad_norm": 0.26040709018707275, "learning_rate": 0.00019713609231265805, "loss": 1.733, "step": 52 }, { "epoch": 0.09464285714285714, "grad_norm": 0.2146587371826172, "learning_rate": 0.00019699878658804672, "loss": 1.7999, "step": 53 }, { "epoch": 0.09642857142857143, "grad_norm": 0.2316572070121765, "learning_rate": 0.0001968583161128631, "loss": 1.6017, "step": 54 }, { "epoch": 0.09821428571428571, "grad_norm": 0.20329022407531738, "learning_rate": 0.00019671468547019573, "loss": 1.7464, "step": 55 }, { "epoch": 0.1, "grad_norm": 0.22198128700256348, "learning_rate": 0.00019656789934623881, "loss": 1.7584, "step": 56 }, { "epoch": 0.10178571428571428, "grad_norm": 0.2108459621667862, "learning_rate": 0.00019641796253013958, "loss": 1.7389, "step": 57 }, { "epoch": 0.10357142857142858, "grad_norm": 0.2677428126335144, "learning_rate": 0.00019626487991384196, "loss": 1.9166, "step": 58 }, { "epoch": 0.10535714285714286, "grad_norm": 0.22205914556980133, "learning_rate": 0.00019610865649192697, "loss": 1.738, "step": 59 }, { "epoch": 0.10714285714285714, "grad_norm": 0.22724869847297668, "learning_rate": 0.00019594929736144976, "loss": 1.7451, "step": 60 }, { "epoch": 0.10892857142857143, "grad_norm": 0.24029673635959625, "learning_rate": 0.00019578680772177327, "loss": 1.9556, "step": 61 }, { "epoch": 0.11071428571428571, "grad_norm": 0.2240082174539566, "learning_rate": 0.00019562119287439873, "loss": 1.6435, "step": 62 }, { "epoch": 0.1125, "grad_norm": 0.23900961875915527, "learning_rate": 0.00019545245822279243, "loss": 1.7015, "step": 63 }, { "epoch": 0.11428571428571428, "grad_norm": 0.2319100946187973, "learning_rate": 0.0001952806092722098, "loss": 1.7287, "step": 64 }, { "epoch": 0.11607142857142858, "grad_norm": 0.22805678844451904, "learning_rate": 0.00019510565162951537, "loss": 1.7574, "step": 65 }, { "epoch": 0.11785714285714285, "grad_norm": 0.2389436662197113, "learning_rate": 0.00019492759100300019, "loss": 1.7738, "step": 66 }, { "epoch": 0.11964285714285715, "grad_norm": 0.23373426496982574, "learning_rate": 0.00019474643320219532, "loss": 1.5328, "step": 67 }, { "epoch": 0.12142857142857143, "grad_norm": 0.2664267420768738, "learning_rate": 0.0001945621841376825, "loss": 2.0273, "step": 68 }, { "epoch": 0.12321428571428572, "grad_norm": 0.2796079218387604, "learning_rate": 0.0001943748498209012, "loss": 1.547, "step": 69 }, { "epoch": 0.125, "grad_norm": 0.2695769667625427, "learning_rate": 0.00019418443636395248, "loss": 1.6374, "step": 70 }, { "epoch": 0.12678571428571428, "grad_norm": 0.3933340311050415, "learning_rate": 0.00019399094997939957, "loss": 1.2037, "step": 71 }, { "epoch": 0.12857142857142856, "grad_norm": 0.2825562357902527, "learning_rate": 0.0001937943969800652, "loss": 1.5649, "step": 72 }, { "epoch": 0.13035714285714287, "grad_norm": 0.3704264760017395, "learning_rate": 0.00019359478377882567, "loss": 1.292, "step": 73 }, { "epoch": 0.13214285714285715, "grad_norm": 0.3214375972747803, "learning_rate": 0.00019339211688840157, "loss": 1.2539, "step": 74 }, { "epoch": 0.13392857142857142, "grad_norm": 0.337907612323761, "learning_rate": 0.00019318640292114524, "loss": 1.3202, "step": 75 }, { "epoch": 0.1357142857142857, "grad_norm": 0.30557796359062195, "learning_rate": 0.00019297764858882514, "loss": 1.074, "step": 76 }, { "epoch": 0.1375, "grad_norm": 0.3154335916042328, "learning_rate": 0.00019276586070240682, "loss": 1.1813, "step": 77 }, { "epoch": 0.1392857142857143, "grad_norm": 0.3587944209575653, "learning_rate": 0.0001925510461718307, "loss": 1.3602, "step": 78 }, { "epoch": 0.14107142857142857, "grad_norm": 0.35669559240341187, "learning_rate": 0.0001923332120057866, "loss": 1.3626, "step": 79 }, { "epoch": 0.14285714285714285, "grad_norm": 0.48954322934150696, "learning_rate": 0.000192112365311485, "loss": 1.4762, "step": 80 }, { "epoch": 0.14464285714285716, "grad_norm": 0.4903950095176697, "learning_rate": 0.00019188851329442547, "loss": 1.6974, "step": 81 }, { "epoch": 0.14642857142857144, "grad_norm": 0.35062992572784424, "learning_rate": 0.00019166166325816118, "loss": 1.2373, "step": 82 }, { "epoch": 0.14821428571428572, "grad_norm": 0.5671891570091248, "learning_rate": 0.0001914318226040608, "loss": 1.5644, "step": 83 }, { "epoch": 0.15, "grad_norm": 0.338214248418808, "learning_rate": 0.000191198998831067, "loss": 1.3717, "step": 84 }, { "epoch": 0.15178571428571427, "grad_norm": 0.4187781512737274, "learning_rate": 0.00019096319953545185, "loss": 1.1211, "step": 85 }, { "epoch": 0.15357142857142858, "grad_norm": 0.3229452967643738, "learning_rate": 0.00019072443241056883, "loss": 1.0473, "step": 86 }, { "epoch": 0.15535714285714286, "grad_norm": 0.376589298248291, "learning_rate": 0.00019048270524660196, "loss": 0.9684, "step": 87 }, { "epoch": 0.15714285714285714, "grad_norm": 0.4261431396007538, "learning_rate": 0.00019023802593031154, "loss": 1.0243, "step": 88 }, { "epoch": 0.15892857142857142, "grad_norm": 0.4411628544330597, "learning_rate": 0.0001899904024447769, "loss": 1.2739, "step": 89 }, { "epoch": 0.16071428571428573, "grad_norm": 0.3415679335594177, "learning_rate": 0.00018973984286913584, "loss": 1.2494, "step": 90 }, { "epoch": 0.1625, "grad_norm": 0.48547816276550293, "learning_rate": 0.0001894863553783212, "loss": 1.0384, "step": 91 }, { "epoch": 0.16428571428571428, "grad_norm": 0.398359090089798, "learning_rate": 0.00018922994824279395, "loss": 1.2409, "step": 92 }, { "epoch": 0.16607142857142856, "grad_norm": 0.4025682806968689, "learning_rate": 0.00018897062982827344, "loss": 1.0646, "step": 93 }, { "epoch": 0.16785714285714284, "grad_norm": 0.46782195568084717, "learning_rate": 0.00018870840859546456, "loss": 1.2167, "step": 94 }, { "epoch": 0.16964285714285715, "grad_norm": 0.45423248410224915, "learning_rate": 0.00018844329309978145, "loss": 1.2359, "step": 95 }, { "epoch": 0.17142857142857143, "grad_norm": 0.380495548248291, "learning_rate": 0.0001881752919910686, "loss": 1.0583, "step": 96 }, { "epoch": 0.1732142857142857, "grad_norm": 0.41103506088256836, "learning_rate": 0.00018790441401331847, "loss": 1.2755, "step": 97 }, { "epoch": 0.175, "grad_norm": 0.4458900988101959, "learning_rate": 0.00018763066800438636, "loss": 1.3651, "step": 98 }, { "epoch": 0.1767857142857143, "grad_norm": 0.5115835666656494, "learning_rate": 0.00018735406289570192, "loss": 1.5389, "step": 99 }, { "epoch": 0.17857142857142858, "grad_norm": 0.6476715803146362, "learning_rate": 0.00018707460771197774, "loss": 2.1173, "step": 100 }, { "epoch": 0.18035714285714285, "grad_norm": 0.18222245573997498, "learning_rate": 0.00018679231157091506, "loss": 1.3799, "step": 101 }, { "epoch": 0.18214285714285713, "grad_norm": 0.2210777848958969, "learning_rate": 0.0001865071836829061, "loss": 1.6362, "step": 102 }, { "epoch": 0.18392857142857144, "grad_norm": 0.21902061998844147, "learning_rate": 0.00018621923335073376, "loss": 1.6673, "step": 103 }, { "epoch": 0.18571428571428572, "grad_norm": 0.21758711338043213, "learning_rate": 0.00018592846996926793, "loss": 1.669, "step": 104 }, { "epoch": 0.1875, "grad_norm": 0.21447710692882538, "learning_rate": 0.0001856349030251589, "loss": 1.6173, "step": 105 }, { "epoch": 0.18928571428571428, "grad_norm": 0.20819346606731415, "learning_rate": 0.00018533854209652818, "loss": 1.6841, "step": 106 }, { "epoch": 0.19107142857142856, "grad_norm": 0.19584102928638458, "learning_rate": 0.00018503939685265568, "loss": 1.5907, "step": 107 }, { "epoch": 0.19285714285714287, "grad_norm": 0.2108387053012848, "learning_rate": 0.00018473747705366426, "loss": 1.7855, "step": 108 }, { "epoch": 0.19464285714285715, "grad_norm": 0.20142777264118195, "learning_rate": 0.00018443279255020152, "loss": 1.79, "step": 109 }, { "epoch": 0.19642857142857142, "grad_norm": 0.20290708541870117, "learning_rate": 0.00018412535328311814, "loss": 1.8891, "step": 110 }, { "epoch": 0.1982142857142857, "grad_norm": 0.20743344724178314, "learning_rate": 0.00018381516928314367, "loss": 1.6483, "step": 111 }, { "epoch": 0.2, "grad_norm": 0.24931779503822327, "learning_rate": 0.00018350225067055925, "loss": 1.7883, "step": 112 }, { "epoch": 0.2017857142857143, "grad_norm": 0.20271241664886475, "learning_rate": 0.00018318660765486748, "loss": 1.8015, "step": 113 }, { "epoch": 0.20357142857142857, "grad_norm": 0.21297615766525269, "learning_rate": 0.00018286825053445918, "loss": 1.7595, "step": 114 }, { "epoch": 0.20535714285714285, "grad_norm": 0.21898160874843597, "learning_rate": 0.0001825471896962774, "loss": 1.8246, "step": 115 }, { "epoch": 0.20714285714285716, "grad_norm": 0.2433815449476242, "learning_rate": 0.00018222343561547874, "loss": 1.9214, "step": 116 }, { "epoch": 0.20892857142857144, "grad_norm": 0.2514473497867584, "learning_rate": 0.00018189699885509127, "loss": 1.674, "step": 117 }, { "epoch": 0.21071428571428572, "grad_norm": 0.23736989498138428, "learning_rate": 0.0001815678900656702, "loss": 1.7948, "step": 118 }, { "epoch": 0.2125, "grad_norm": 0.22837956249713898, "learning_rate": 0.00018123611998495007, "loss": 1.9429, "step": 119 }, { "epoch": 0.21428571428571427, "grad_norm": 0.2722070813179016, "learning_rate": 0.00018090169943749476, "loss": 1.2768, "step": 120 }, { "epoch": 0.21607142857142858, "grad_norm": 0.23845739662647247, "learning_rate": 0.00018056463933434398, "loss": 1.7407, "step": 121 }, { "epoch": 0.21785714285714286, "grad_norm": 0.24140624701976776, "learning_rate": 0.00018022495067265753, "loss": 1.378, "step": 122 }, { "epoch": 0.21964285714285714, "grad_norm": 0.2878568768501282, "learning_rate": 0.0001798826445353564, "loss": 1.6684, "step": 123 }, { "epoch": 0.22142857142857142, "grad_norm": 0.35590338706970215, "learning_rate": 0.0001795377320907611, "loss": 0.7946, "step": 124 }, { "epoch": 0.22321428571428573, "grad_norm": 0.35154449939727783, "learning_rate": 0.00017919022459222752, "loss": 0.9185, "step": 125 }, { "epoch": 0.225, "grad_norm": 0.31363779306411743, "learning_rate": 0.00017884013337777943, "loss": 1.3471, "step": 126 }, { "epoch": 0.22678571428571428, "grad_norm": 0.36924973130226135, "learning_rate": 0.00017848746986973883, "loss": 1.3852, "step": 127 }, { "epoch": 0.22857142857142856, "grad_norm": 0.44929584860801697, "learning_rate": 0.00017813224557435312, "loss": 0.82, "step": 128 }, { "epoch": 0.23035714285714284, "grad_norm": 0.3592383563518524, "learning_rate": 0.0001777744720814198, "loss": 1.0219, "step": 129 }, { "epoch": 0.23214285714285715, "grad_norm": 0.37820902466773987, "learning_rate": 0.00017741416106390826, "loss": 1.3303, "step": 130 }, { "epoch": 0.23392857142857143, "grad_norm": 0.33376428484916687, "learning_rate": 0.00017705132427757895, "loss": 1.384, "step": 131 }, { "epoch": 0.2357142857142857, "grad_norm": 0.3454584777355194, "learning_rate": 0.00017668597356059978, "loss": 1.2236, "step": 132 }, { "epoch": 0.2375, "grad_norm": 0.345708429813385, "learning_rate": 0.00017631812083316003, "loss": 1.0293, "step": 133 }, { "epoch": 0.2392857142857143, "grad_norm": 0.3823675811290741, "learning_rate": 0.00017594777809708126, "loss": 1.3131, "step": 134 }, { "epoch": 0.24107142857142858, "grad_norm": 0.336627334356308, "learning_rate": 0.00017557495743542585, "loss": 1.2084, "step": 135 }, { "epoch": 0.24285714285714285, "grad_norm": 0.3525237739086151, "learning_rate": 0.0001751996710121026, "loss": 1.1378, "step": 136 }, { "epoch": 0.24464285714285713, "grad_norm": 0.38291576504707336, "learning_rate": 0.00017482193107147014, "loss": 0.8353, "step": 137 }, { "epoch": 0.24642857142857144, "grad_norm": 0.4650520086288452, "learning_rate": 0.0001744417499379372, "loss": 1.2024, "step": 138 }, { "epoch": 0.24821428571428572, "grad_norm": 0.32640358805656433, "learning_rate": 0.0001740591400155606, "loss": 1.2343, "step": 139 }, { "epoch": 0.25, "grad_norm": 0.39074593782424927, "learning_rate": 0.0001736741137876405, "loss": 0.8896, "step": 140 }, { "epoch": 0.25, "eval_loss": 1.4710056781768799, "eval_runtime": 13.3678, "eval_samples_per_second": 17.654, "eval_steps_per_second": 8.827, "step": 140 }, { "epoch": 0.2517857142857143, "grad_norm": 0.36249810457229614, "learning_rate": 0.00017328668381631318, "loss": 1.1208, "step": 141 }, { "epoch": 0.25357142857142856, "grad_norm": 0.3750612735748291, "learning_rate": 0.00017289686274214118, "loss": 1.2502, "step": 142 }, { "epoch": 0.25535714285714284, "grad_norm": 0.4201869070529938, "learning_rate": 0.0001725046632837007, "loss": 1.1947, "step": 143 }, { "epoch": 0.2571428571428571, "grad_norm": 0.4865645468235016, "learning_rate": 0.00017211009823716694, "loss": 0.8749, "step": 144 }, { "epoch": 0.25892857142857145, "grad_norm": 0.38693225383758545, "learning_rate": 0.00017171318047589637, "loss": 1.2495, "step": 145 }, { "epoch": 0.26071428571428573, "grad_norm": 0.40707525610923767, "learning_rate": 0.00017131392295000674, "loss": 1.2321, "step": 146 }, { "epoch": 0.2625, "grad_norm": 0.39570894837379456, "learning_rate": 0.00017091233868595467, "loss": 1.301, "step": 147 }, { "epoch": 0.2642857142857143, "grad_norm": 0.4085226058959961, "learning_rate": 0.00017050844078611056, "loss": 1.5369, "step": 148 }, { "epoch": 0.26607142857142857, "grad_norm": 0.47094810009002686, "learning_rate": 0.0001701022424283311, "loss": 1.9374, "step": 149 }, { "epoch": 0.26785714285714285, "grad_norm": 0.8517308831214905, "learning_rate": 0.00016969375686552937, "loss": 1.808, "step": 150 }, { "epoch": 0.26964285714285713, "grad_norm": 0.1922745406627655, "learning_rate": 0.00016928299742524234, "loss": 1.6608, "step": 151 }, { "epoch": 0.2714285714285714, "grad_norm": 0.2090916484594345, "learning_rate": 0.00016886997750919619, "loss": 1.8009, "step": 152 }, { "epoch": 0.2732142857142857, "grad_norm": 0.21698515117168427, "learning_rate": 0.00016845471059286887, "loss": 1.7821, "step": 153 }, { "epoch": 0.275, "grad_norm": 0.21791532635688782, "learning_rate": 0.00016803721022505067, "loss": 1.5901, "step": 154 }, { "epoch": 0.2767857142857143, "grad_norm": 0.22199980914592743, "learning_rate": 0.00016761749002740193, "loss": 1.7047, "step": 155 }, { "epoch": 0.2785714285714286, "grad_norm": 0.2096625566482544, "learning_rate": 0.0001671955636940088, "loss": 1.6898, "step": 156 }, { "epoch": 0.28035714285714286, "grad_norm": 0.22975414991378784, "learning_rate": 0.00016677144499093626, "loss": 1.7631, "step": 157 }, { "epoch": 0.28214285714285714, "grad_norm": 0.2187148928642273, "learning_rate": 0.0001663451477557792, "loss": 1.7872, "step": 158 }, { "epoch": 0.2839285714285714, "grad_norm": 0.2257414609193802, "learning_rate": 0.0001659166858972107, "loss": 1.7732, "step": 159 }, { "epoch": 0.2857142857142857, "grad_norm": 0.22986693680286407, "learning_rate": 0.00016548607339452853, "loss": 1.7031, "step": 160 }, { "epoch": 0.2875, "grad_norm": 0.21585014462471008, "learning_rate": 0.0001650533242971987, "loss": 1.8421, "step": 161 }, { "epoch": 0.2892857142857143, "grad_norm": 0.22519604861736298, "learning_rate": 0.00016461845272439741, "loss": 1.6529, "step": 162 }, { "epoch": 0.2910714285714286, "grad_norm": 0.22279705107212067, "learning_rate": 0.0001641814728645502, "loss": 1.9288, "step": 163 }, { "epoch": 0.29285714285714287, "grad_norm": 0.22392615675926208, "learning_rate": 0.000163742398974869, "loss": 1.693, "step": 164 }, { "epoch": 0.29464285714285715, "grad_norm": 0.22729454934597015, "learning_rate": 0.00016330124538088705, "loss": 1.7027, "step": 165 }, { "epoch": 0.29642857142857143, "grad_norm": 0.2229882776737213, "learning_rate": 0.00016285802647599156, "loss": 1.8262, "step": 166 }, { "epoch": 0.2982142857142857, "grad_norm": 0.25520074367523193, "learning_rate": 0.00016241275672095395, "loss": 1.6009, "step": 167 }, { "epoch": 0.3, "grad_norm": 0.24272315204143524, "learning_rate": 0.00016196545064345812, "loss": 1.9227, "step": 168 }, { "epoch": 0.30178571428571427, "grad_norm": 0.24380216002464294, "learning_rate": 0.00016151612283762652, "loss": 1.5198, "step": 169 }, { "epoch": 0.30357142857142855, "grad_norm": 0.3242342472076416, "learning_rate": 0.00016106478796354382, "loss": 1.6981, "step": 170 }, { "epoch": 0.3053571428571429, "grad_norm": 0.277855783700943, "learning_rate": 0.00016061146074677885, "loss": 1.7011, "step": 171 }, { "epoch": 0.30714285714285716, "grad_norm": 0.2710039019584656, "learning_rate": 0.00016015615597790388, "loss": 1.7522, "step": 172 }, { "epoch": 0.30892857142857144, "grad_norm": 0.26541268825531006, "learning_rate": 0.00015969888851201226, "loss": 1.3804, "step": 173 }, { "epoch": 0.3107142857142857, "grad_norm": 0.28985923528671265, "learning_rate": 0.00015923967326823368, "loss": 1.6453, "step": 174 }, { "epoch": 0.3125, "grad_norm": 0.33939245343208313, "learning_rate": 0.00015877852522924732, "loss": 1.1725, "step": 175 }, { "epoch": 0.3142857142857143, "grad_norm": 0.29770731925964355, "learning_rate": 0.0001583154594407932, "loss": 1.6746, "step": 176 }, { "epoch": 0.31607142857142856, "grad_norm": 0.3280562460422516, "learning_rate": 0.0001578504910111811, "loss": 1.1357, "step": 177 }, { "epoch": 0.31785714285714284, "grad_norm": 0.2856597304344177, "learning_rate": 0.00015738363511079776, "loss": 1.1127, "step": 178 }, { "epoch": 0.3196428571428571, "grad_norm": 0.316491961479187, "learning_rate": 0.00015691490697161182, "loss": 1.4281, "step": 179 }, { "epoch": 0.32142857142857145, "grad_norm": 0.3632654845714569, "learning_rate": 0.00015644432188667695, "loss": 1.3413, "step": 180 }, { "epoch": 0.32321428571428573, "grad_norm": 0.34329405426979065, "learning_rate": 0.00015597189520963277, "loss": 1.0579, "step": 181 }, { "epoch": 0.325, "grad_norm": 0.32447105646133423, "learning_rate": 0.00015549764235420405, "loss": 1.243, "step": 182 }, { "epoch": 0.3267857142857143, "grad_norm": 0.3558500409126282, "learning_rate": 0.0001550215787936977, "loss": 1.1376, "step": 183 }, { "epoch": 0.32857142857142857, "grad_norm": 0.3373570740222931, "learning_rate": 0.00015454372006049803, "loss": 1.1251, "step": 184 }, { "epoch": 0.33035714285714285, "grad_norm": 0.36412546038627625, "learning_rate": 0.00015406408174555976, "loss": 1.3238, "step": 185 }, { "epoch": 0.33214285714285713, "grad_norm": 0.364442378282547, "learning_rate": 0.00015358267949789966, "loss": 0.9448, "step": 186 }, { "epoch": 0.3339285714285714, "grad_norm": 0.3172107934951782, "learning_rate": 0.00015309952902408576, "loss": 1.2744, "step": 187 }, { "epoch": 0.3357142857142857, "grad_norm": 0.34173399209976196, "learning_rate": 0.00015261464608772488, "loss": 1.0923, "step": 188 }, { "epoch": 0.3375, "grad_norm": 0.33419185876846313, "learning_rate": 0.0001521280465089484, "loss": 1.2762, "step": 189 }, { "epoch": 0.3392857142857143, "grad_norm": 0.3866868317127228, "learning_rate": 0.0001516397461638962, "loss": 0.9595, "step": 190 }, { "epoch": 0.3410714285714286, "grad_norm": 0.3978990614414215, "learning_rate": 0.00015114976098419842, "loss": 0.9993, "step": 191 }, { "epoch": 0.34285714285714286, "grad_norm": 0.3546142876148224, "learning_rate": 0.00015065810695645584, "loss": 1.3421, "step": 192 }, { "epoch": 0.34464285714285714, "grad_norm": 0.39728498458862305, "learning_rate": 0.00015016480012171828, "loss": 1.1209, "step": 193 }, { "epoch": 0.3464285714285714, "grad_norm": 0.4170741140842438, "learning_rate": 0.00014966985657496114, "loss": 1.0024, "step": 194 }, { "epoch": 0.3482142857142857, "grad_norm": 0.4226652681827545, "learning_rate": 0.0001491732924645604, "loss": 1.3139, "step": 195 }, { "epoch": 0.35, "grad_norm": 0.3712114691734314, "learning_rate": 0.00014867512399176563, "loss": 1.1574, "step": 196 }, { "epoch": 0.3517857142857143, "grad_norm": 0.3655322790145874, "learning_rate": 0.00014817536741017152, "loss": 1.6149, "step": 197 }, { "epoch": 0.3535714285714286, "grad_norm": 0.4362059533596039, "learning_rate": 0.0001476740390251875, "loss": 1.7657, "step": 198 }, { "epoch": 0.35535714285714287, "grad_norm": 0.43134769797325134, "learning_rate": 0.00014717115519350567, "loss": 1.7167, "step": 199 }, { "epoch": 0.35714285714285715, "grad_norm": 0.7784890532493591, "learning_rate": 0.00014666673232256738, "loss": 2.036, "step": 200 }, { "epoch": 0.35892857142857143, "grad_norm": 0.17376984655857086, "learning_rate": 0.0001461607868700276, "loss": 1.4856, "step": 201 }, { "epoch": 0.3607142857142857, "grad_norm": 0.2141953408718109, "learning_rate": 0.00014565333534321826, "loss": 1.7491, "step": 202 }, { "epoch": 0.3625, "grad_norm": 0.22548137605190277, "learning_rate": 0.00014514439429860943, "loss": 1.8457, "step": 203 }, { "epoch": 0.36428571428571427, "grad_norm": 0.20618294179439545, "learning_rate": 0.0001446339803412692, "loss": 1.4987, "step": 204 }, { "epoch": 0.36607142857142855, "grad_norm": 0.21025151014328003, "learning_rate": 0.00014412211012432212, "loss": 1.5568, "step": 205 }, { "epoch": 0.3678571428571429, "grad_norm": 0.21678180992603302, "learning_rate": 0.00014360880034840554, "loss": 1.7841, "step": 206 }, { "epoch": 0.36964285714285716, "grad_norm": 0.20914790034294128, "learning_rate": 0.0001430940677611249, "loss": 1.6693, "step": 207 }, { "epoch": 0.37142857142857144, "grad_norm": 0.21597585082054138, "learning_rate": 0.00014257792915650728, "loss": 1.648, "step": 208 }, { "epoch": 0.3732142857142857, "grad_norm": 0.23697789013385773, "learning_rate": 0.00014206040137445348, "loss": 1.7616, "step": 209 }, { "epoch": 0.375, "grad_norm": 0.2535800635814667, "learning_rate": 0.00014154150130018866, "loss": 2.0279, "step": 210 }, { "epoch": 0.3767857142857143, "grad_norm": 0.21204812824726105, "learning_rate": 0.0001410212458637112, "loss": 1.8472, "step": 211 }, { "epoch": 0.37857142857142856, "grad_norm": 0.36059629917144775, "learning_rate": 0.00014049965203924054, "loss": 1.8042, "step": 212 }, { "epoch": 0.38035714285714284, "grad_norm": 0.21400661766529083, "learning_rate": 0.0001399767368446634, "loss": 1.698, "step": 213 }, { "epoch": 0.3821428571428571, "grad_norm": 0.24055758118629456, "learning_rate": 0.00013945251734097828, "loss": 1.8758, "step": 214 }, { "epoch": 0.38392857142857145, "grad_norm": 0.23605166375637054, "learning_rate": 0.00013892701063173918, "loss": 1.7425, "step": 215 }, { "epoch": 0.38571428571428573, "grad_norm": 0.23343758285045624, "learning_rate": 0.00013840023386249713, "loss": 1.8683, "step": 216 }, { "epoch": 0.3875, "grad_norm": 0.2475200593471527, "learning_rate": 0.00013787220422024134, "loss": 1.9091, "step": 217 }, { "epoch": 0.3892857142857143, "grad_norm": 0.2618944048881531, "learning_rate": 0.00013734293893283783, "loss": 1.5086, "step": 218 }, { "epoch": 0.39107142857142857, "grad_norm": 0.2627498209476471, "learning_rate": 0.00013681245526846783, "loss": 1.3878, "step": 219 }, { "epoch": 0.39285714285714285, "grad_norm": 0.24390314519405365, "learning_rate": 0.0001362807705350641, "loss": 1.7332, "step": 220 }, { "epoch": 0.39464285714285713, "grad_norm": 0.2768295705318451, "learning_rate": 0.00013574790207974646, "loss": 1.3123, "step": 221 }, { "epoch": 0.3964285714285714, "grad_norm": 0.2606358230113983, "learning_rate": 0.0001352138672882555, "loss": 1.4506, "step": 222 }, { "epoch": 0.3982142857142857, "grad_norm": 0.24806426465511322, "learning_rate": 0.00013467868358438563, "loss": 1.7087, "step": 223 }, { "epoch": 0.4, "grad_norm": 0.2664608061313629, "learning_rate": 0.00013414236842941644, "loss": 1.3124, "step": 224 }, { "epoch": 0.4017857142857143, "grad_norm": 0.2661263346672058, "learning_rate": 0.00013360493932154302, "loss": 1.2174, "step": 225 }, { "epoch": 0.4035714285714286, "grad_norm": 0.3460038900375366, "learning_rate": 0.00013306641379530514, "loss": 0.6889, "step": 226 }, { "epoch": 0.40535714285714286, "grad_norm": 0.2929069995880127, "learning_rate": 0.000132526809421015, "loss": 0.9457, "step": 227 }, { "epoch": 0.40714285714285714, "grad_norm": 0.3459819257259369, "learning_rate": 0.00013198614380418412, "loss": 1.2547, "step": 228 }, { "epoch": 0.4089285714285714, "grad_norm": 0.30105313658714294, "learning_rate": 0.00013144443458494882, "loss": 0.957, "step": 229 }, { "epoch": 0.4107142857142857, "grad_norm": 0.3461960256099701, "learning_rate": 0.00013090169943749476, "loss": 1.3146, "step": 230 }, { "epoch": 0.4125, "grad_norm": 0.34542855620384216, "learning_rate": 0.00013035795606948023, "loss": 1.1128, "step": 231 }, { "epoch": 0.4142857142857143, "grad_norm": 0.37605586647987366, "learning_rate": 0.00012981322222145846, "loss": 1.5095, "step": 232 }, { "epoch": 0.4160714285714286, "grad_norm": 0.37267056107521057, "learning_rate": 0.00012926751566629875, "loss": 1.071, "step": 233 }, { "epoch": 0.41785714285714287, "grad_norm": 0.3052172064781189, "learning_rate": 0.00012872085420860665, "loss": 1.3136, "step": 234 }, { "epoch": 0.41964285714285715, "grad_norm": 0.36694592237472534, "learning_rate": 0.00012817325568414297, "loss": 1.2439, "step": 235 }, { "epoch": 0.42142857142857143, "grad_norm": 0.36055245995521545, "learning_rate": 0.00012762473795924204, "loss": 1.1165, "step": 236 }, { "epoch": 0.4232142857142857, "grad_norm": 0.3014545738697052, "learning_rate": 0.00012707531893022854, "loss": 1.5423, "step": 237 }, { "epoch": 0.425, "grad_norm": 0.3208891749382019, "learning_rate": 0.00012652501652283377, "loss": 1.1813, "step": 238 }, { "epoch": 0.42678571428571427, "grad_norm": 0.38703230023384094, "learning_rate": 0.00012597384869161084, "loss": 0.7706, "step": 239 }, { "epoch": 0.42857142857142855, "grad_norm": 0.38256821036338806, "learning_rate": 0.00012542183341934872, "loss": 1.0565, "step": 240 }, { "epoch": 0.4303571428571429, "grad_norm": 0.3555380702018738, "learning_rate": 0.0001248689887164855, "loss": 0.849, "step": 241 }, { "epoch": 0.43214285714285716, "grad_norm": 0.3472703993320465, "learning_rate": 0.00012431533262052098, "loss": 1.3984, "step": 242 }, { "epoch": 0.43392857142857144, "grad_norm": 0.3631349503993988, "learning_rate": 0.000123760883195428, "loss": 0.8955, "step": 243 }, { "epoch": 0.4357142857142857, "grad_norm": 0.349295973777771, "learning_rate": 0.00012320565853106316, "loss": 0.8866, "step": 244 }, { "epoch": 0.4375, "grad_norm": 0.33635953068733215, "learning_rate": 0.00012264967674257646, "loss": 1.2419, "step": 245 }, { "epoch": 0.4392857142857143, "grad_norm": 0.3833181858062744, "learning_rate": 0.00012209295596982042, "loss": 1.5507, "step": 246 }, { "epoch": 0.44107142857142856, "grad_norm": 0.3737214505672455, "learning_rate": 0.00012153551437675821, "loss": 1.4881, "step": 247 }, { "epoch": 0.44285714285714284, "grad_norm": 0.4705282747745514, "learning_rate": 0.00012097737015087094, "loss": 1.4864, "step": 248 }, { "epoch": 0.4446428571428571, "grad_norm": 0.39539188146591187, "learning_rate": 0.00012041854150256433, "loss": 1.7855, "step": 249 }, { "epoch": 0.44642857142857145, "grad_norm": 0.7369075417518616, "learning_rate": 0.00011985904666457455, "loss": 2.01, "step": 250 }, { "epoch": 0.44821428571428573, "grad_norm": 0.18146094679832458, "learning_rate": 0.00011929890389137337, "loss": 1.5898, "step": 251 }, { "epoch": 0.45, "grad_norm": 0.21558880805969238, "learning_rate": 0.00011873813145857249, "loss": 1.5816, "step": 252 }, { "epoch": 0.4517857142857143, "grad_norm": 0.19599275290966034, "learning_rate": 0.00011817674766232734, "loss": 1.6433, "step": 253 }, { "epoch": 0.45357142857142857, "grad_norm": 0.22075910866260529, "learning_rate": 0.00011761477081874015, "loss": 1.6005, "step": 254 }, { "epoch": 0.45535714285714285, "grad_norm": 0.19471955299377441, "learning_rate": 0.0001170522192632624, "loss": 1.7133, "step": 255 }, { "epoch": 0.45714285714285713, "grad_norm": 0.19876879453659058, "learning_rate": 0.00011648911135009634, "loss": 1.5085, "step": 256 }, { "epoch": 0.4589285714285714, "grad_norm": 0.20565317571163177, "learning_rate": 0.00011592546545159645, "loss": 1.7386, "step": 257 }, { "epoch": 0.4607142857142857, "grad_norm": 0.24483506381511688, "learning_rate": 0.00011536129995766996, "loss": 1.7162, "step": 258 }, { "epoch": 0.4625, "grad_norm": 0.21543823182582855, "learning_rate": 0.00011479663327517667, "loss": 1.6966, "step": 259 }, { "epoch": 0.4642857142857143, "grad_norm": 0.2661048471927643, "learning_rate": 0.00011423148382732853, "loss": 1.8821, "step": 260 }, { "epoch": 0.4660714285714286, "grad_norm": 0.24292460083961487, "learning_rate": 0.00011366587005308858, "loss": 1.7085, "step": 261 }, { "epoch": 0.46785714285714286, "grad_norm": 0.216167613863945, "learning_rate": 0.0001130998104065693, "loss": 1.7298, "step": 262 }, { "epoch": 0.46964285714285714, "grad_norm": 0.2111697793006897, "learning_rate": 0.00011253332335643043, "loss": 1.8098, "step": 263 }, { "epoch": 0.4714285714285714, "grad_norm": 0.23981061577796936, "learning_rate": 0.00011196642738527659, "loss": 1.7026, "step": 264 }, { "epoch": 0.4732142857142857, "grad_norm": 0.2623251676559448, "learning_rate": 0.00011139914098905406, "loss": 1.7894, "step": 265 }, { "epoch": 0.475, "grad_norm": 0.2482486367225647, "learning_rate": 0.00011083148267644747, "loss": 1.9019, "step": 266 }, { "epoch": 0.4767857142857143, "grad_norm": 0.238911435008049, "learning_rate": 0.00011026347096827578, "loss": 1.6809, "step": 267 }, { "epoch": 0.4785714285714286, "grad_norm": 0.24704696238040924, "learning_rate": 0.00010969512439688816, "loss": 1.6607, "step": 268 }, { "epoch": 0.48035714285714287, "grad_norm": 0.25105100870132446, "learning_rate": 0.00010912646150555919, "loss": 1.5895, "step": 269 }, { "epoch": 0.48214285714285715, "grad_norm": 0.2849842607975006, "learning_rate": 0.00010855750084788398, "loss": 2.0812, "step": 270 }, { "epoch": 0.48392857142857143, "grad_norm": 0.2599342167377472, "learning_rate": 0.00010798826098717276, "loss": 1.1569, "step": 271 }, { "epoch": 0.4857142857142857, "grad_norm": 0.28037258982658386, "learning_rate": 0.00010741876049584523, "loss": 1.1928, "step": 272 }, { "epoch": 0.4875, "grad_norm": 0.29920563101768494, "learning_rate": 0.00010684901795482456, "loss": 1.2244, "step": 273 }, { "epoch": 0.48928571428571427, "grad_norm": 0.2799164354801178, "learning_rate": 0.00010627905195293135, "loss": 1.4455, "step": 274 }, { "epoch": 0.49107142857142855, "grad_norm": 0.23873603343963623, "learning_rate": 0.00010570888108627681, "loss": 0.852, "step": 275 }, { "epoch": 0.4928571428571429, "grad_norm": 0.2817741632461548, "learning_rate": 0.00010513852395765631, "loss": 1.3203, "step": 276 }, { "epoch": 0.49464285714285716, "grad_norm": 0.27295514941215515, "learning_rate": 0.00010456799917594233, "loss": 0.749, "step": 277 }, { "epoch": 0.49642857142857144, "grad_norm": 0.30728739500045776, "learning_rate": 0.00010399732535547734, "loss": 1.0083, "step": 278 }, { "epoch": 0.4982142857142857, "grad_norm": 0.32001444697380066, "learning_rate": 0.00010342652111546635, "loss": 1.573, "step": 279 }, { "epoch": 0.5, "grad_norm": 0.3400101065635681, "learning_rate": 0.00010285560507936961, "loss": 1.0757, "step": 280 }, { "epoch": 0.5, "eval_loss": 1.4439584016799927, "eval_runtime": 13.9741, "eval_samples_per_second": 16.888, "eval_steps_per_second": 8.444, "step": 280 }, { "epoch": 0.5017857142857143, "grad_norm": 0.3074837923049927, "learning_rate": 0.00010228459587429497, "loss": 1.1389, "step": 281 }, { "epoch": 0.5035714285714286, "grad_norm": 0.3143484592437744, "learning_rate": 0.00010171351213038993, "loss": 0.9542, "step": 282 }, { "epoch": 0.5053571428571428, "grad_norm": 0.3524804413318634, "learning_rate": 0.00010114237248023404, "loss": 0.8578, "step": 283 }, { "epoch": 0.5071428571428571, "grad_norm": 0.335183322429657, "learning_rate": 0.00010057119555823085, "loss": 0.9228, "step": 284 }, { "epoch": 0.5089285714285714, "grad_norm": 0.35537081956863403, "learning_rate": 0.0001, "loss": 0.9541, "step": 285 }, { "epoch": 0.5107142857142857, "grad_norm": 0.3294563591480255, "learning_rate": 9.942880444176918e-05, "loss": 1.4223, "step": 286 }, { "epoch": 0.5125, "grad_norm": 0.32628077268600464, "learning_rate": 9.8857627519766e-05, "loss": 1.2942, "step": 287 }, { "epoch": 0.5142857142857142, "grad_norm": 0.32195404171943665, "learning_rate": 9.828648786961008e-05, "loss": 1.4239, "step": 288 }, { "epoch": 0.5160714285714286, "grad_norm": 0.3283804655075073, "learning_rate": 9.771540412570504e-05, "loss": 1.0303, "step": 289 }, { "epoch": 0.5178571428571429, "grad_norm": 0.356052428483963, "learning_rate": 9.71443949206304e-05, "loss": 1.0199, "step": 290 }, { "epoch": 0.5196428571428572, "grad_norm": 0.39479124546051025, "learning_rate": 9.657347888453367e-05, "loss": 1.1343, "step": 291 }, { "epoch": 0.5214285714285715, "grad_norm": 0.34791451692581177, "learning_rate": 9.60026746445227e-05, "loss": 1.521, "step": 292 }, { "epoch": 0.5232142857142857, "grad_norm": 0.3614530861377716, "learning_rate": 9.543200082405768e-05, "loss": 1.2346, "step": 293 }, { "epoch": 0.525, "grad_norm": 0.36958596110343933, "learning_rate": 9.486147604234371e-05, "loss": 1.0457, "step": 294 }, { "epoch": 0.5267857142857143, "grad_norm": 0.4293418824672699, "learning_rate": 9.42911189137232e-05, "loss": 1.1071, "step": 295 }, { "epoch": 0.5285714285714286, "grad_norm": 0.40408602356910706, "learning_rate": 9.372094804706867e-05, "loss": 1.3805, "step": 296 }, { "epoch": 0.5303571428571429, "grad_norm": 0.3892784118652344, "learning_rate": 9.315098204517543e-05, "loss": 0.9136, "step": 297 }, { "epoch": 0.5321428571428571, "grad_norm": 0.4003988206386566, "learning_rate": 9.258123950415479e-05, "loss": 1.3684, "step": 298 }, { "epoch": 0.5339285714285714, "grad_norm": 0.37116503715515137, "learning_rate": 9.201173901282724e-05, "loss": 1.7824, "step": 299 }, { "epoch": 0.5357142857142857, "grad_norm": 0.5286569595336914, "learning_rate": 9.144249915211605e-05, "loss": 1.9392, "step": 300 }, { "epoch": 0.5375, "grad_norm": 0.18338526785373688, "learning_rate": 9.087353849444085e-05, "loss": 1.4422, "step": 301 }, { "epoch": 0.5392857142857143, "grad_norm": 0.20191389322280884, "learning_rate": 9.030487560311186e-05, "loss": 1.5443, "step": 302 }, { "epoch": 0.5410714285714285, "grad_norm": 0.19955602288246155, "learning_rate": 8.973652903172423e-05, "loss": 1.6521, "step": 303 }, { "epoch": 0.5428571428571428, "grad_norm": 0.2253991812467575, "learning_rate": 8.916851732355255e-05, "loss": 1.6596, "step": 304 }, { "epoch": 0.5446428571428571, "grad_norm": 0.18380412459373474, "learning_rate": 8.860085901094595e-05, "loss": 1.5462, "step": 305 }, { "epoch": 0.5464285714285714, "grad_norm": 0.21318556368350983, "learning_rate": 8.803357261472343e-05, "loss": 1.6713, "step": 306 }, { "epoch": 0.5482142857142858, "grad_norm": 0.20480670034885406, "learning_rate": 8.746667664356956e-05, "loss": 1.7537, "step": 307 }, { "epoch": 0.55, "grad_norm": 0.21598470211029053, "learning_rate": 8.690018959343072e-05, "loss": 1.6955, "step": 308 }, { "epoch": 0.5517857142857143, "grad_norm": 0.21172207593917847, "learning_rate": 8.633412994691144e-05, "loss": 1.7187, "step": 309 }, { "epoch": 0.5535714285714286, "grad_norm": 0.22086191177368164, "learning_rate": 8.57685161726715e-05, "loss": 1.7579, "step": 310 }, { "epoch": 0.5553571428571429, "grad_norm": 0.2266392558813095, "learning_rate": 8.520336672482338e-05, "loss": 1.7486, "step": 311 }, { "epoch": 0.5571428571428572, "grad_norm": 0.2660948634147644, "learning_rate": 8.463870004233008e-05, "loss": 1.7903, "step": 312 }, { "epoch": 0.5589285714285714, "grad_norm": 0.2297395020723343, "learning_rate": 8.407453454840357e-05, "loss": 1.8017, "step": 313 }, { "epoch": 0.5607142857142857, "grad_norm": 0.21526682376861572, "learning_rate": 8.351088864990368e-05, "loss": 1.855, "step": 314 }, { "epoch": 0.5625, "grad_norm": 0.2516147196292877, "learning_rate": 8.294778073673762e-05, "loss": 1.7103, "step": 315 }, { "epoch": 0.5642857142857143, "grad_norm": 0.25641700625419617, "learning_rate": 8.238522918125983e-05, "loss": 1.9301, "step": 316 }, { "epoch": 0.5660714285714286, "grad_norm": 0.26828062534332275, "learning_rate": 8.182325233767267e-05, "loss": 1.8575, "step": 317 }, { "epoch": 0.5678571428571428, "grad_norm": 0.24787884950637817, "learning_rate": 8.126186854142752e-05, "loss": 2.0228, "step": 318 }, { "epoch": 0.5696428571428571, "grad_norm": 0.23658955097198486, "learning_rate": 8.070109610862668e-05, "loss": 1.7813, "step": 319 }, { "epoch": 0.5714285714285714, "grad_norm": 0.2818485498428345, "learning_rate": 8.014095333542548e-05, "loss": 1.7571, "step": 320 }, { "epoch": 0.5732142857142857, "grad_norm": 0.24982373416423798, "learning_rate": 7.958145849743569e-05, "loss": 1.602, "step": 321 }, { "epoch": 0.575, "grad_norm": 0.2815864682197571, "learning_rate": 7.902262984912909e-05, "loss": 1.7216, "step": 322 }, { "epoch": 0.5767857142857142, "grad_norm": 0.2675464451313019, "learning_rate": 7.846448562324183e-05, "loss": 1.0704, "step": 323 }, { "epoch": 0.5785714285714286, "grad_norm": 0.23938840627670288, "learning_rate": 7.79070440301796e-05, "loss": 1.282, "step": 324 }, { "epoch": 0.5803571428571429, "grad_norm": 0.2834213972091675, "learning_rate": 7.735032325742355e-05, "loss": 1.7934, "step": 325 }, { "epoch": 0.5821428571428572, "grad_norm": 0.3555513918399811, "learning_rate": 7.679434146893685e-05, "loss": 1.2089, "step": 326 }, { "epoch": 0.5839285714285715, "grad_norm": 0.3254348933696747, "learning_rate": 7.623911680457198e-05, "loss": 1.0845, "step": 327 }, { "epoch": 0.5857142857142857, "grad_norm": 0.3558744192123413, "learning_rate": 7.568466737947905e-05, "loss": 1.2339, "step": 328 }, { "epoch": 0.5875, "grad_norm": 0.32738080620765686, "learning_rate": 7.513101128351454e-05, "loss": 0.8492, "step": 329 }, { "epoch": 0.5892857142857143, "grad_norm": 0.36939212679862976, "learning_rate": 7.457816658065134e-05, "loss": 0.8973, "step": 330 }, { "epoch": 0.5910714285714286, "grad_norm": 0.3393215835094452, "learning_rate": 7.402615130838917e-05, "loss": 1.0078, "step": 331 }, { "epoch": 0.5928571428571429, "grad_norm": 0.402725487947464, "learning_rate": 7.347498347716624e-05, "loss": 1.2381, "step": 332 }, { "epoch": 0.5946428571428571, "grad_norm": 0.33164989948272705, "learning_rate": 7.292468106977148e-05, "loss": 1.197, "step": 333 }, { "epoch": 0.5964285714285714, "grad_norm": 0.3454689681529999, "learning_rate": 7.237526204075797e-05, "loss": 1.0244, "step": 334 }, { "epoch": 0.5982142857142857, "grad_norm": 0.3584868907928467, "learning_rate": 7.182674431585704e-05, "loss": 1.062, "step": 335 }, { "epoch": 0.6, "grad_norm": 0.3318020701408386, "learning_rate": 7.127914579139338e-05, "loss": 1.3696, "step": 336 }, { "epoch": 0.6017857142857143, "grad_norm": 0.38772639632225037, "learning_rate": 7.073248433370124e-05, "loss": 1.1725, "step": 337 }, { "epoch": 0.6035714285714285, "grad_norm": 0.33527833223342896, "learning_rate": 7.018677777854157e-05, "loss": 1.0849, "step": 338 }, { "epoch": 0.6053571428571428, "grad_norm": 0.3958960473537445, "learning_rate": 6.964204393051981e-05, "loss": 0.8494, "step": 339 }, { "epoch": 0.6071428571428571, "grad_norm": 0.3545495569705963, "learning_rate": 6.909830056250527e-05, "loss": 0.843, "step": 340 }, { "epoch": 0.6089285714285714, "grad_norm": 0.3475857675075531, "learning_rate": 6.855556541505122e-05, "loss": 1.1228, "step": 341 }, { "epoch": 0.6107142857142858, "grad_norm": 0.4085654616355896, "learning_rate": 6.801385619581592e-05, "loss": 0.8092, "step": 342 }, { "epoch": 0.6125, "grad_norm": 0.4605577886104584, "learning_rate": 6.747319057898503e-05, "loss": 1.0999, "step": 343 }, { "epoch": 0.6142857142857143, "grad_norm": 0.3840469717979431, "learning_rate": 6.693358620469487e-05, "loss": 1.2712, "step": 344 }, { "epoch": 0.6160714285714286, "grad_norm": 0.3712100684642792, "learning_rate": 6.639506067845697e-05, "loss": 1.1401, "step": 345 }, { "epoch": 0.6178571428571429, "grad_norm": 0.41670674085617065, "learning_rate": 6.585763157058358e-05, "loss": 1.151, "step": 346 }, { "epoch": 0.6196428571428572, "grad_norm": 0.5912812352180481, "learning_rate": 6.53213164156144e-05, "loss": 1.447, "step": 347 }, { "epoch": 0.6214285714285714, "grad_norm": 0.3995843231678009, "learning_rate": 6.478613271174453e-05, "loss": 1.6645, "step": 348 }, { "epoch": 0.6232142857142857, "grad_norm": 0.5337942242622375, "learning_rate": 6.425209792025358e-05, "loss": 1.8703, "step": 349 }, { "epoch": 0.625, "grad_norm": 0.5726771354675293, "learning_rate": 6.371922946493591e-05, "loss": 1.9795, "step": 350 }, { "epoch": 0.6267857142857143, "grad_norm": 0.17994743585586548, "learning_rate": 6.318754473153221e-05, "loss": 1.5463, "step": 351 }, { "epoch": 0.6285714285714286, "grad_norm": 0.16961722075939178, "learning_rate": 6.26570610671622e-05, "loss": 1.5388, "step": 352 }, { "epoch": 0.6303571428571428, "grad_norm": 0.19410116970539093, "learning_rate": 6.21277957797587e-05, "loss": 1.7282, "step": 353 }, { "epoch": 0.6321428571428571, "grad_norm": 0.1906638890504837, "learning_rate": 6.159976613750286e-05, "loss": 1.5167, "step": 354 }, { "epoch": 0.6339285714285714, "grad_norm": 0.21918904781341553, "learning_rate": 6.107298936826086e-05, "loss": 1.7446, "step": 355 }, { "epoch": 0.6357142857142857, "grad_norm": 0.19429104030132294, "learning_rate": 6.0547482659021706e-05, "loss": 1.7166, "step": 356 }, { "epoch": 0.6375, "grad_norm": 0.21244099736213684, "learning_rate": 6.002326315533665e-05, "loss": 1.7319, "step": 357 }, { "epoch": 0.6392857142857142, "grad_norm": 0.22793884575366974, "learning_rate": 5.950034796075947e-05, "loss": 1.6573, "step": 358 }, { "epoch": 0.6410714285714286, "grad_norm": 0.23558048903942108, "learning_rate": 5.897875413628884e-05, "loss": 1.7359, "step": 359 }, { "epoch": 0.6428571428571429, "grad_norm": 0.21951396763324738, "learning_rate": 5.845849869981137e-05, "loss": 1.6536, "step": 360 }, { "epoch": 0.6446428571428572, "grad_norm": 0.22028475999832153, "learning_rate": 5.793959862554652e-05, "loss": 1.7257, "step": 361 }, { "epoch": 0.6464285714285715, "grad_norm": 0.23070622980594635, "learning_rate": 5.7422070843492734e-05, "loss": 1.6149, "step": 362 }, { "epoch": 0.6482142857142857, "grad_norm": 0.22408127784729004, "learning_rate": 5.6905932238875123e-05, "loss": 1.5936, "step": 363 }, { "epoch": 0.65, "grad_norm": 0.23098969459533691, "learning_rate": 5.639119965159446e-05, "loss": 1.7313, "step": 364 }, { "epoch": 0.6517857142857143, "grad_norm": 0.2533377707004547, "learning_rate": 5.5877889875677845e-05, "loss": 1.9422, "step": 365 }, { "epoch": 0.6535714285714286, "grad_norm": 0.2515897750854492, "learning_rate": 5.5366019658730825e-05, "loss": 1.8331, "step": 366 }, { "epoch": 0.6553571428571429, "grad_norm": 0.2757863700389862, "learning_rate": 5.485560570139061e-05, "loss": 1.7759, "step": 367 }, { "epoch": 0.6571428571428571, "grad_norm": 0.29774826765060425, "learning_rate": 5.434666465678175e-05, "loss": 1.334, "step": 368 }, { "epoch": 0.6589285714285714, "grad_norm": 0.28664523363113403, "learning_rate": 5.383921312997242e-05, "loss": 1.6234, "step": 369 }, { "epoch": 0.6607142857142857, "grad_norm": 0.30774933099746704, "learning_rate": 5.333326767743263e-05, "loss": 1.4743, "step": 370 }, { "epoch": 0.6625, "grad_norm": 0.26094669103622437, "learning_rate": 5.282884480649435e-05, "loss": 1.1046, "step": 371 }, { "epoch": 0.6642857142857143, "grad_norm": 0.2818247079849243, "learning_rate": 5.232596097481251e-05, "loss": 0.8417, "step": 372 }, { "epoch": 0.6660714285714285, "grad_norm": 0.3089035749435425, "learning_rate": 5.182463258982846e-05, "loss": 1.3922, "step": 373 }, { "epoch": 0.6678571428571428, "grad_norm": 0.2375502586364746, "learning_rate": 5.132487600823438e-05, "loss": 1.0855, "step": 374 }, { "epoch": 0.6696428571428571, "grad_norm": 0.3417636454105377, "learning_rate": 5.082670753543961e-05, "loss": 1.0819, "step": 375 }, { "epoch": 0.6714285714285714, "grad_norm": 0.2587840259075165, "learning_rate": 5.033014342503889e-05, "loss": 1.1154, "step": 376 }, { "epoch": 0.6732142857142858, "grad_norm": 0.29829278588294983, "learning_rate": 4.9835199878281765e-05, "loss": 0.9634, "step": 377 }, { "epoch": 0.675, "grad_norm": 0.307190477848053, "learning_rate": 4.9341893043544185e-05, "loss": 1.1533, "step": 378 }, { "epoch": 0.6767857142857143, "grad_norm": 0.3548847734928131, "learning_rate": 4.8850239015801625e-05, "loss": 1.2046, "step": 379 }, { "epoch": 0.6785714285714286, "grad_norm": 0.3130282759666443, "learning_rate": 4.836025383610382e-05, "loss": 1.1391, "step": 380 }, { "epoch": 0.6803571428571429, "grad_norm": 0.3400501012802124, "learning_rate": 4.787195349105159e-05, "loss": 1.0226, "step": 381 }, { "epoch": 0.6821428571428572, "grad_norm": 0.3462565541267395, "learning_rate": 4.7385353912275165e-05, "loss": 1.0968, "step": 382 }, { "epoch": 0.6839285714285714, "grad_norm": 0.331123948097229, "learning_rate": 4.690047097591427e-05, "loss": 1.1918, "step": 383 }, { "epoch": 0.6857142857142857, "grad_norm": 0.354432612657547, "learning_rate": 4.6417320502100316e-05, "loss": 1.2261, "step": 384 }, { "epoch": 0.6875, "grad_norm": 0.34844398498535156, "learning_rate": 4.593591825444028e-05, "loss": 0.7991, "step": 385 }, { "epoch": 0.6892857142857143, "grad_norm": 0.3462367653846741, "learning_rate": 4.545627993950201e-05, "loss": 1.1343, "step": 386 }, { "epoch": 0.6910714285714286, "grad_norm": 0.3352709114551544, "learning_rate": 4.497842120630229e-05, "loss": 1.1023, "step": 387 }, { "epoch": 0.6928571428571428, "grad_norm": 0.3581717610359192, "learning_rate": 4.4502357645795976e-05, "loss": 0.9181, "step": 388 }, { "epoch": 0.6946428571428571, "grad_norm": 0.35995498299598694, "learning_rate": 4.402810479036725e-05, "loss": 1.3445, "step": 389 }, { "epoch": 0.6964285714285714, "grad_norm": 0.372935950756073, "learning_rate": 4.355567811332311e-05, "loss": 1.0725, "step": 390 }, { "epoch": 0.6982142857142857, "grad_norm": 0.3759123980998993, "learning_rate": 4.30850930283882e-05, "loss": 1.1824, "step": 391 }, { "epoch": 0.7, "grad_norm": 0.391770601272583, "learning_rate": 4.2616364889202254e-05, "loss": 1.3516, "step": 392 }, { "epoch": 0.7017857142857142, "grad_norm": 0.3785625696182251, "learning_rate": 4.214950898881892e-05, "loss": 1.1624, "step": 393 }, { "epoch": 0.7035714285714286, "grad_norm": 0.4284125864505768, "learning_rate": 4.168454055920681e-05, "loss": 1.1318, "step": 394 }, { "epoch": 0.7053571428571429, "grad_norm": 0.391161173582077, "learning_rate": 4.12214747707527e-05, "loss": 1.4543, "step": 395 }, { "epoch": 0.7071428571428572, "grad_norm": 0.3723802864551544, "learning_rate": 4.0760326731766374e-05, "loss": 1.4265, "step": 396 }, { "epoch": 0.7089285714285715, "grad_norm": 0.4321235418319702, "learning_rate": 4.030111148798775e-05, "loss": 1.4523, "step": 397 }, { "epoch": 0.7107142857142857, "grad_norm": 0.43963977694511414, "learning_rate": 3.9843844022096135e-05, "loss": 1.7426, "step": 398 }, { "epoch": 0.7125, "grad_norm": 0.5444914698600769, "learning_rate": 3.938853925322118e-05, "loss": 1.8907, "step": 399 }, { "epoch": 0.7142857142857143, "grad_norm": 0.9059175252914429, "learning_rate": 3.893521203645618e-05, "loss": 2.2147, "step": 400 }, { "epoch": 0.7160714285714286, "grad_norm": 0.20250235497951508, "learning_rate": 3.848387716237353e-05, "loss": 1.7341, "step": 401 }, { "epoch": 0.7178571428571429, "grad_norm": 0.1863853931427002, "learning_rate": 3.8034549356541894e-05, "loss": 1.6956, "step": 402 }, { "epoch": 0.7196428571428571, "grad_norm": 0.19317291676998138, "learning_rate": 3.7587243279046056e-05, "loss": 1.7165, "step": 403 }, { "epoch": 0.7214285714285714, "grad_norm": 0.21101966500282288, "learning_rate": 3.714197352400849e-05, "loss": 1.8306, "step": 404 }, { "epoch": 0.7232142857142857, "grad_norm": 0.22385361790657043, "learning_rate": 3.669875461911297e-05, "loss": 1.7104, "step": 405 }, { "epoch": 0.725, "grad_norm": 0.22555914521217346, "learning_rate": 3.6257601025131026e-05, "loss": 1.5668, "step": 406 }, { "epoch": 0.7267857142857143, "grad_norm": 0.21916812658309937, "learning_rate": 3.581852713544983e-05, "loss": 1.7827, "step": 407 }, { "epoch": 0.7285714285714285, "grad_norm": 0.23447498679161072, "learning_rate": 3.538154727560259e-05, "loss": 1.8308, "step": 408 }, { "epoch": 0.7303571428571428, "grad_norm": 0.21024593710899353, "learning_rate": 3.494667570280132e-05, "loss": 1.613, "step": 409 }, { "epoch": 0.7321428571428571, "grad_norm": 0.23882578313350677, "learning_rate": 3.45139266054715e-05, "loss": 1.6853, "step": 410 }, { "epoch": 0.7339285714285714, "grad_norm": 0.23162604868412018, "learning_rate": 3.408331410278929e-05, "loss": 1.7371, "step": 411 }, { "epoch": 0.7357142857142858, "grad_norm": 0.23150567710399628, "learning_rate": 3.3654852244220826e-05, "loss": 1.7505, "step": 412 }, { "epoch": 0.7375, "grad_norm": 0.23027552664279938, "learning_rate": 3.322855500906373e-05, "loss": 1.7128, "step": 413 }, { "epoch": 0.7392857142857143, "grad_norm": 0.22426114976406097, "learning_rate": 3.2804436305991214e-05, "loss": 1.7721, "step": 414 }, { "epoch": 0.7410714285714286, "grad_norm": 0.22792723774909973, "learning_rate": 3.238250997259808e-05, "loss": 1.7089, "step": 415 }, { "epoch": 0.7428571428571429, "grad_norm": 0.2450588494539261, "learning_rate": 3.196278977494934e-05, "loss": 1.744, "step": 416 }, { "epoch": 0.7446428571428572, "grad_norm": 0.2348526120185852, "learning_rate": 3.154528940713113e-05, "loss": 1.8496, "step": 417 }, { "epoch": 0.7464285714285714, "grad_norm": 0.2519124746322632, "learning_rate": 3.113002249080386e-05, "loss": 1.76, "step": 418 }, { "epoch": 0.7482142857142857, "grad_norm": 0.27859431505203247, "learning_rate": 3.071700257475768e-05, "loss": 1.6493, "step": 419 }, { "epoch": 0.75, "grad_norm": 0.2539427876472473, "learning_rate": 3.030624313447067e-05, "loss": 1.7619, "step": 420 }, { "epoch": 0.75, "eval_loss": 1.4375773668289185, "eval_runtime": 13.3809, "eval_samples_per_second": 17.637, "eval_steps_per_second": 8.819, "step": 420 }, { "epoch": 0.7517857142857143, "grad_norm": 0.2867870330810547, "learning_rate": 2.9897757571668905e-05, "loss": 1.5308, "step": 421 }, { "epoch": 0.7535714285714286, "grad_norm": 0.26840099692344666, "learning_rate": 2.949155921388943e-05, "loss": 1.8933, "step": 422 }, { "epoch": 0.7553571428571428, "grad_norm": 0.32162272930145264, "learning_rate": 2.9087661314045366e-05, "loss": 1.6497, "step": 423 }, { "epoch": 0.7571428571428571, "grad_norm": 0.2842392921447754, "learning_rate": 2.8686077049993287e-05, "loss": 1.521, "step": 424 }, { "epoch": 0.7589285714285714, "grad_norm": 0.28831931948661804, "learning_rate": 2.828681952410366e-05, "loss": 1.2619, "step": 425 }, { "epoch": 0.7607142857142857, "grad_norm": 0.2879536747932434, "learning_rate": 2.7889901762833083e-05, "loss": 1.1705, "step": 426 }, { "epoch": 0.7625, "grad_norm": 0.32446354627609253, "learning_rate": 2.7495336716299313e-05, "loss": 1.1636, "step": 427 }, { "epoch": 0.7642857142857142, "grad_norm": 0.2487679123878479, "learning_rate": 2.7103137257858868e-05, "loss": 0.7092, "step": 428 }, { "epoch": 0.7660714285714286, "grad_norm": 0.3402203321456909, "learning_rate": 2.671331618368682e-05, "loss": 0.8279, "step": 429 }, { "epoch": 0.7678571428571429, "grad_norm": 0.3117457330226898, "learning_rate": 2.6325886212359498e-05, "loss": 1.2101, "step": 430 }, { "epoch": 0.7696428571428572, "grad_norm": 0.28996577858924866, "learning_rate": 2.5940859984439424e-05, "loss": 1.1556, "step": 431 }, { "epoch": 0.7714285714285715, "grad_norm": 0.31623905897140503, "learning_rate": 2.5558250062062828e-05, "loss": 1.1324, "step": 432 }, { "epoch": 0.7732142857142857, "grad_norm": 0.29438769817352295, "learning_rate": 2.5178068928529864e-05, "loss": 1.4183, "step": 433 }, { "epoch": 0.775, "grad_norm": 0.31944945454597473, "learning_rate": 2.4800328987897427e-05, "loss": 1.1763, "step": 434 }, { "epoch": 0.7767857142857143, "grad_norm": 0.32895660400390625, "learning_rate": 2.4425042564574184e-05, "loss": 0.8288, "step": 435 }, { "epoch": 0.7785714285714286, "grad_norm": 0.46789219975471497, "learning_rate": 2.4052221902918725e-05, "loss": 0.889, "step": 436 }, { "epoch": 0.7803571428571429, "grad_norm": 0.35038378834724426, "learning_rate": 2.368187916683997e-05, "loss": 1.1607, "step": 437 }, { "epoch": 0.7821428571428571, "grad_norm": 0.3326191008090973, "learning_rate": 2.3314026439400217e-05, "loss": 1.3321, "step": 438 }, { "epoch": 0.7839285714285714, "grad_norm": 0.3683798015117645, "learning_rate": 2.2948675722421086e-05, "loss": 0.831, "step": 439 }, { "epoch": 0.7857142857142857, "grad_norm": 0.3587857186794281, "learning_rate": 2.2585838936091754e-05, "loss": 1.0373, "step": 440 }, { "epoch": 0.7875, "grad_norm": 0.341905802488327, "learning_rate": 2.2225527918580204e-05, "loss": 1.2294, "step": 441 }, { "epoch": 0.7892857142857143, "grad_norm": 0.36167776584625244, "learning_rate": 2.1867754425646926e-05, "loss": 1.06, "step": 442 }, { "epoch": 0.7910714285714285, "grad_norm": 0.35555702447891235, "learning_rate": 2.151253013026121e-05, "loss": 1.5072, "step": 443 }, { "epoch": 0.7928571428571428, "grad_norm": 0.34384891390800476, "learning_rate": 2.115986662222058e-05, "loss": 1.3456, "step": 444 }, { "epoch": 0.7946428571428571, "grad_norm": 0.3673519492149353, "learning_rate": 2.0809775407772503e-05, "loss": 1.3223, "step": 445 }, { "epoch": 0.7964285714285714, "grad_norm": 0.400193452835083, "learning_rate": 2.0462267909238896e-05, "loss": 1.562, "step": 446 }, { "epoch": 0.7982142857142858, "grad_norm": 0.4361307621002197, "learning_rate": 2.011735546464365e-05, "loss": 1.1107, "step": 447 }, { "epoch": 0.8, "grad_norm": 0.38674113154411316, "learning_rate": 1.9775049327342486e-05, "loss": 1.2813, "step": 448 }, { "epoch": 0.8017857142857143, "grad_norm": 0.40437138080596924, "learning_rate": 1.943536066565603e-05, "loss": 1.5164, "step": 449 }, { "epoch": 0.8035714285714286, "grad_norm": 0.5968692898750305, "learning_rate": 1.9098300562505266e-05, "loss": 2.0903, "step": 450 }, { "epoch": 0.8053571428571429, "grad_norm": 0.16759343445301056, "learning_rate": 1.876388001504995e-05, "loss": 1.4236, "step": 451 }, { "epoch": 0.8071428571428572, "grad_norm": 0.1815633326768875, "learning_rate": 1.8432109934329834e-05, "loss": 1.6486, "step": 452 }, { "epoch": 0.8089285714285714, "grad_norm": 0.18043053150177002, "learning_rate": 1.810300114490875e-05, "loss": 1.4118, "step": 453 }, { "epoch": 0.8107142857142857, "grad_norm": 0.19016335904598236, "learning_rate": 1.777656438452129e-05, "loss": 1.4844, "step": 454 }, { "epoch": 0.8125, "grad_norm": 0.2066669464111328, "learning_rate": 1.74528103037226e-05, "loss": 1.583, "step": 455 }, { "epoch": 0.8142857142857143, "grad_norm": 0.21698947250843048, "learning_rate": 1.713174946554086e-05, "loss": 1.6314, "step": 456 }, { "epoch": 0.8160714285714286, "grad_norm": 0.21937011182308197, "learning_rate": 1.6813392345132518e-05, "loss": 1.6342, "step": 457 }, { "epoch": 0.8178571428571428, "grad_norm": 0.24016696214675903, "learning_rate": 1.649774932944075e-05, "loss": 1.7726, "step": 458 }, { "epoch": 0.8196428571428571, "grad_norm": 0.23045389354228973, "learning_rate": 1.6184830716856347e-05, "loss": 1.8679, "step": 459 }, { "epoch": 0.8214285714285714, "grad_norm": 0.2114153504371643, "learning_rate": 1.587464671688187e-05, "loss": 1.6223, "step": 460 }, { "epoch": 0.8232142857142857, "grad_norm": 0.22911155223846436, "learning_rate": 1.5567207449798515e-05, "loss": 1.6179, "step": 461 }, { "epoch": 0.825, "grad_norm": 0.22748011350631714, "learning_rate": 1.5262522946335755e-05, "loss": 1.7988, "step": 462 }, { "epoch": 0.8267857142857142, "grad_norm": 0.22931312024593353, "learning_rate": 1.4960603147344343e-05, "loss": 1.6515, "step": 463 }, { "epoch": 0.8285714285714286, "grad_norm": 0.24372749030590057, "learning_rate": 1.466145790347183e-05, "loss": 1.855, "step": 464 }, { "epoch": 0.8303571428571429, "grad_norm": 0.24426500499248505, "learning_rate": 1.4365096974841108e-05, "loss": 1.7367, "step": 465 }, { "epoch": 0.8321428571428572, "grad_norm": 0.22962844371795654, "learning_rate": 1.4071530030732095e-05, "loss": 1.871, "step": 466 }, { "epoch": 0.8339285714285715, "grad_norm": 0.25352221727371216, "learning_rate": 1.3780766649266242e-05, "loss": 1.816, "step": 467 }, { "epoch": 0.8357142857142857, "grad_norm": 0.2572453022003174, "learning_rate": 1.3492816317093893e-05, "loss": 1.857, "step": 468 }, { "epoch": 0.8375, "grad_norm": 0.26578372716903687, "learning_rate": 1.3207688429084974e-05, "loss": 1.8242, "step": 469 }, { "epoch": 0.8392857142857143, "grad_norm": 0.2806295156478882, "learning_rate": 1.2925392288022298e-05, "loss": 1.9345, "step": 470 }, { "epoch": 0.8410714285714286, "grad_norm": 0.2791413962841034, "learning_rate": 1.2645937104298111e-05, "loss": 1.4595, "step": 471 }, { "epoch": 0.8428571428571429, "grad_norm": 0.25607559084892273, "learning_rate": 1.2369331995613665e-05, "loss": 1.493, "step": 472 }, { "epoch": 0.8446428571428571, "grad_norm": 0.2600158452987671, "learning_rate": 1.2095585986681535e-05, "loss": 1.4394, "step": 473 }, { "epoch": 0.8464285714285714, "grad_norm": 0.27362141013145447, "learning_rate": 1.1824708008931418e-05, "loss": 1.0637, "step": 474 }, { "epoch": 0.8482142857142857, "grad_norm": 0.27769190073013306, "learning_rate": 1.1556706900218572e-05, "loss": 1.2003, "step": 475 }, { "epoch": 0.85, "grad_norm": 0.2426546961069107, "learning_rate": 1.1291591404535462e-05, "loss": 0.8119, "step": 476 }, { "epoch": 0.8517857142857143, "grad_norm": 0.2766360938549042, "learning_rate": 1.1029370171726571e-05, "loss": 1.0697, "step": 477 }, { "epoch": 0.8535714285714285, "grad_norm": 0.31444060802459717, "learning_rate": 1.0770051757206079e-05, "loss": 1.3142, "step": 478 }, { "epoch": 0.8553571428571428, "grad_norm": 0.33098921179771423, "learning_rate": 1.051364462167881e-05, "loss": 1.0656, "step": 479 }, { "epoch": 0.8571428571428571, "grad_norm": 0.327269583940506, "learning_rate": 1.026015713086418e-05, "loss": 1.1335, "step": 480 }, { "epoch": 0.8589285714285714, "grad_norm": 0.30396682024002075, "learning_rate": 1.0009597555223128e-05, "loss": 1.2085, "step": 481 }, { "epoch": 0.8607142857142858, "grad_norm": 0.30377310514450073, "learning_rate": 9.761974069688461e-06, "loss": 1.3687, "step": 482 }, { "epoch": 0.8625, "grad_norm": 0.3333049714565277, "learning_rate": 9.517294753398064e-06, "loss": 1.0564, "step": 483 }, { "epoch": 0.8642857142857143, "grad_norm": 0.3406635820865631, "learning_rate": 9.275567589431178e-06, "loss": 1.1551, "step": 484 }, { "epoch": 0.8660714285714286, "grad_norm": 0.35342979431152344, "learning_rate": 9.036800464548157e-06, "loss": 0.8579, "step": 485 }, { "epoch": 0.8678571428571429, "grad_norm": 0.338238000869751, "learning_rate": 8.80100116893301e-06, "loss": 1.2404, "step": 486 }, { "epoch": 0.8696428571428572, "grad_norm": 0.32925620675086975, "learning_rate": 8.568177395939215e-06, "loss": 0.9557, "step": 487 }, { "epoch": 0.8714285714285714, "grad_norm": 0.36271733045578003, "learning_rate": 8.338336741838838e-06, "loss": 1.1323, "step": 488 }, { "epoch": 0.8732142857142857, "grad_norm": 0.373471736907959, "learning_rate": 8.111486705574534e-06, "loss": 0.9512, "step": 489 }, { "epoch": 0.875, "grad_norm": 0.3971388041973114, "learning_rate": 7.887634688515e-06, "loss": 1.0311, "step": 490 }, { "epoch": 0.8767857142857143, "grad_norm": 0.35106372833251953, "learning_rate": 7.666787994213453e-06, "loss": 1.0705, "step": 491 }, { "epoch": 0.8785714285714286, "grad_norm": 0.3654085099697113, "learning_rate": 7.448953828169314e-06, "loss": 1.2338, "step": 492 }, { "epoch": 0.8803571428571428, "grad_norm": 0.3616616725921631, "learning_rate": 7.2341392975931785e-06, "loss": 1.0171, "step": 493 }, { "epoch": 0.8821428571428571, "grad_norm": 0.39053040742874146, "learning_rate": 7.022351411174866e-06, "loss": 1.2481, "step": 494 }, { "epoch": 0.8839285714285714, "grad_norm": 0.36361098289489746, "learning_rate": 6.813597078854772e-06, "loss": 1.3712, "step": 495 }, { "epoch": 0.8857142857142857, "grad_norm": 0.3919769823551178, "learning_rate": 6.607883111598445e-06, "loss": 1.2261, "step": 496 }, { "epoch": 0.8875, "grad_norm": 0.3980182707309723, "learning_rate": 6.405216221174326e-06, "loss": 1.2769, "step": 497 }, { "epoch": 0.8892857142857142, "grad_norm": 0.4138076901435852, "learning_rate": 6.205603019934791e-06, "loss": 1.4143, "step": 498 }, { "epoch": 0.8910714285714286, "grad_norm": 0.47321876883506775, "learning_rate": 6.009050020600459e-06, "loss": 1.687, "step": 499 }, { "epoch": 0.8928571428571429, "grad_norm": 0.532382607460022, "learning_rate": 5.8155636360475385e-06, "loss": 2.0251, "step": 500 }, { "epoch": 0.8946428571428572, "grad_norm": 0.16291013360023499, "learning_rate": 5.625150179098804e-06, "loss": 1.3453, "step": 501 }, { "epoch": 0.8964285714285715, "grad_norm": 0.19601905345916748, "learning_rate": 5.437815862317519e-06, "loss": 1.6226, "step": 502 }, { "epoch": 0.8982142857142857, "grad_norm": 0.20232488214969635, "learning_rate": 5.25356679780471e-06, "loss": 1.7009, "step": 503 }, { "epoch": 0.9, "grad_norm": 0.21668295562267303, "learning_rate": 5.072408996999844e-06, "loss": 1.674, "step": 504 }, { "epoch": 0.9017857142857143, "grad_norm": 0.208509162068367, "learning_rate": 4.8943483704846475e-06, "loss": 1.7305, "step": 505 }, { "epoch": 0.9035714285714286, "grad_norm": 0.22677063941955566, "learning_rate": 4.719390727790218e-06, "loss": 1.9097, "step": 506 }, { "epoch": 0.9053571428571429, "grad_norm": 0.2320660501718521, "learning_rate": 4.547541777207565e-06, "loss": 1.6309, "step": 507 }, { "epoch": 0.9071428571428571, "grad_norm": 0.21626609563827515, "learning_rate": 4.378807125601303e-06, "loss": 1.8275, "step": 508 }, { "epoch": 0.9089285714285714, "grad_norm": 0.2348957508802414, "learning_rate": 4.2131922782267405e-06, "loss": 1.6609, "step": 509 }, { "epoch": 0.9107142857142857, "grad_norm": 0.21976341307163239, "learning_rate": 4.050702638550275e-06, "loss": 1.6476, "step": 510 }, { "epoch": 0.9125, "grad_norm": 0.25860780477523804, "learning_rate": 3.891343508073053e-06, "loss": 1.7382, "step": 511 }, { "epoch": 0.9142857142857143, "grad_norm": 0.24138076603412628, "learning_rate": 3.7351200861580617e-06, "loss": 1.7668, "step": 512 }, { "epoch": 0.9160714285714285, "grad_norm": 0.23734422028064728, "learning_rate": 3.5820374698604555e-06, "loss": 1.794, "step": 513 }, { "epoch": 0.9178571428571428, "grad_norm": 0.23496972024440765, "learning_rate": 3.4321006537612165e-06, "loss": 1.7674, "step": 514 }, { "epoch": 0.9196428571428571, "grad_norm": 0.24626807868480682, "learning_rate": 3.2853145298042953e-06, "loss": 2.0287, "step": 515 }, { "epoch": 0.9214285714285714, "grad_norm": 0.24339932203292847, "learning_rate": 3.1416838871368924e-06, "loss": 1.8533, "step": 516 }, { "epoch": 0.9232142857142858, "grad_norm": 0.253841370344162, "learning_rate": 3.0012134119532964e-06, "loss": 1.6682, "step": 517 }, { "epoch": 0.925, "grad_norm": 0.26810407638549805, "learning_rate": 2.863907687341949e-06, "loss": 1.4686, "step": 518 }, { "epoch": 0.9267857142857143, "grad_norm": 0.24126453697681427, "learning_rate": 2.7297711931358993e-06, "loss": 1.0048, "step": 519 }, { "epoch": 0.9285714285714286, "grad_norm": 0.3192795515060425, "learning_rate": 2.5988083057666533e-06, "loss": 1.4609, "step": 520 }, { "epoch": 0.9303571428571429, "grad_norm": 0.2680839002132416, "learning_rate": 2.471023298121422e-06, "loss": 1.4287, "step": 521 }, { "epoch": 0.9321428571428572, "grad_norm": 0.34683963656425476, "learning_rate": 2.3464203394036322e-06, "loss": 1.1019, "step": 522 }, { "epoch": 0.9339285714285714, "grad_norm": 0.31293728947639465, "learning_rate": 2.2250034949969913e-06, "loss": 1.5812, "step": 523 }, { "epoch": 0.9357142857142857, "grad_norm": 0.27005112171173096, "learning_rate": 2.1067767263327933e-06, "loss": 0.8701, "step": 524 }, { "epoch": 0.9375, "grad_norm": 0.2739523649215698, "learning_rate": 1.9917438907606556e-06, "loss": 1.1165, "step": 525 }, { "epoch": 0.9392857142857143, "grad_norm": 0.2776015102863312, "learning_rate": 1.87990874142272e-06, "loss": 1.1448, "step": 526 }, { "epoch": 0.9410714285714286, "grad_norm": 0.30518367886543274, "learning_rate": 1.771274927131139e-06, "loss": 1.0695, "step": 527 }, { "epoch": 0.9428571428571428, "grad_norm": 0.31975796818733215, "learning_rate": 1.665845992249071e-06, "loss": 1.1755, "step": 528 }, { "epoch": 0.9446428571428571, "grad_norm": 0.3304663598537445, "learning_rate": 1.5636253765750508e-06, "loss": 1.0217, "step": 529 }, { "epoch": 0.9464285714285714, "grad_norm": 0.3353167474269867, "learning_rate": 1.4646164152307018e-06, "loss": 1.2216, "step": 530 }, { "epoch": 0.9482142857142857, "grad_norm": 0.3266999423503876, "learning_rate": 1.3688223385519672e-06, "loss": 0.8658, "step": 531 }, { "epoch": 0.95, "grad_norm": 0.34619686007499695, "learning_rate": 1.2762462719837275e-06, "loss": 1.2839, "step": 532 }, { "epoch": 0.9517857142857142, "grad_norm": 0.32094186544418335, "learning_rate": 1.1868912359777607e-06, "loss": 1.1571, "step": 533 }, { "epoch": 0.9535714285714286, "grad_norm": 0.33895429968833923, "learning_rate": 1.1007601458942752e-06, "loss": 1.0222, "step": 534 }, { "epoch": 0.9553571428571429, "grad_norm": 0.34143325686454773, "learning_rate": 1.0178558119067315e-06, "loss": 0.9386, "step": 535 }, { "epoch": 0.9571428571428572, "grad_norm": 0.3505565822124481, "learning_rate": 9.381809389101825e-07, "loss": 1.1175, "step": 536 }, { "epoch": 0.9589285714285715, "grad_norm": 0.36876025795936584, "learning_rate": 8.617381264330426e-07, "loss": 1.0065, "step": 537 }, { "epoch": 0.9607142857142857, "grad_norm": 0.3824593722820282, "learning_rate": 7.885298685522235e-07, "loss": 1.1445, "step": 538 }, { "epoch": 0.9625, "grad_norm": 0.3944648504257202, "learning_rate": 7.185585538117657e-07, "loss": 0.9773, "step": 539 }, { "epoch": 0.9642857142857143, "grad_norm": 0.32271888852119446, "learning_rate": 6.518264651449779e-07, "loss": 1.073, "step": 540 }, { "epoch": 0.9660714285714286, "grad_norm": 0.36668431758880615, "learning_rate": 5.883357797998757e-07, "loss": 1.3001, "step": 541 }, { "epoch": 0.9678571428571429, "grad_norm": 0.36653202772140503, "learning_rate": 5.280885692681592e-07, "loss": 1.0388, "step": 542 }, { "epoch": 0.9696428571428571, "grad_norm": 0.3570370674133301, "learning_rate": 4.710867992176682e-07, "loss": 1.0728, "step": 543 }, { "epoch": 0.9714285714285714, "grad_norm": 0.3680841624736786, "learning_rate": 4.173323294281994e-07, "loss": 1.3424, "step": 544 }, { "epoch": 0.9732142857142857, "grad_norm": 0.37311863899230957, "learning_rate": 3.6682691373086665e-07, "loss": 1.3634, "step": 545 }, { "epoch": 0.975, "grad_norm": 0.38995441794395447, "learning_rate": 3.195721999508461e-07, "loss": 1.4773, "step": 546 }, { "epoch": 0.9767857142857143, "grad_norm": 0.3970450162887573, "learning_rate": 2.7556972985363085e-07, "loss": 1.3839, "step": 547 }, { "epoch": 0.9785714285714285, "grad_norm": 0.5267580151557922, "learning_rate": 2.3482093909473756e-07, "loss": 1.9768, "step": 548 }, { "epoch": 0.9803571428571428, "grad_norm": 0.4860612154006958, "learning_rate": 1.973271571728441e-07, "loss": 1.7893, "step": 549 }, { "epoch": 0.9821428571428571, "grad_norm": 0.9272521734237671, "learning_rate": 1.630896073864352e-07, "loss": 2.1017, "step": 550 }, { "epoch": 0.9839285714285714, "grad_norm": 0.19919352233409882, "learning_rate": 1.3210940679385664e-07, "loss": 1.5237, "step": 551 }, { "epoch": 0.9857142857142858, "grad_norm": 0.20294204354286194, "learning_rate": 1.0438756617691115e-07, "loss": 1.597, "step": 552 }, { "epoch": 0.9875, "grad_norm": 0.2281683087348938, "learning_rate": 7.992499000785136e-08, "loss": 1.7265, "step": 553 }, { "epoch": 0.9892857142857143, "grad_norm": 0.2398226112127304, "learning_rate": 5.872247641987016e-08, "loss": 1.7301, "step": 554 }, { "epoch": 0.9910714285714286, "grad_norm": 0.2581394612789154, "learning_rate": 4.078071718107701e-08, "loss": 1.6294, "step": 555 }, { "epoch": 0.9928571428571429, "grad_norm": 0.2676229774951935, "learning_rate": 2.610029767191602e-08, "loss": 0.9741, "step": 556 }, { "epoch": 0.9946428571428572, "grad_norm": 0.30877920985221863, "learning_rate": 1.4681696866081229e-08, "loss": 1.1485, "step": 557 }, { "epoch": 0.9964285714285714, "grad_norm": 0.3330385386943817, "learning_rate": 6.525287314851358e-09, "loss": 0.906, "step": 558 }, { "epoch": 0.9982142857142857, "grad_norm": 0.364859402179718, "learning_rate": 1.6313351349883655e-09, "loss": 1.0913, "step": 559 }, { "epoch": 1.0, "grad_norm": 0.5055291652679443, "learning_rate": 0.0, "loss": 1.6353, "step": 560 }, { "epoch": 1.0, "eval_loss": 1.4418776035308838, "eval_runtime": 13.3713, "eval_samples_per_second": 17.65, "eval_steps_per_second": 8.825, "step": 560 } ], "logging_steps": 1, "max_steps": 560, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 140, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.826642274948219e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }