{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5, |
|
"eval_steps": 140, |
|
"global_step": 280, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0017857142857142857, |
|
"grad_norm": 0.14711466431617737, |
|
"learning_rate": 2e-05, |
|
"loss": 1.6301, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0035714285714285713, |
|
"grad_norm": 0.13304685056209564, |
|
"learning_rate": 4e-05, |
|
"loss": 1.6271, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.005357142857142857, |
|
"grad_norm": 0.1233660876750946, |
|
"learning_rate": 6e-05, |
|
"loss": 1.6548, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.007142857142857143, |
|
"grad_norm": 0.1381577104330063, |
|
"learning_rate": 8e-05, |
|
"loss": 1.7853, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.008928571428571428, |
|
"grad_norm": 0.14668770134449005, |
|
"learning_rate": 0.0001, |
|
"loss": 1.7507, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.010714285714285714, |
|
"grad_norm": 0.168854758143425, |
|
"learning_rate": 0.00012, |
|
"loss": 1.9486, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0125, |
|
"grad_norm": 0.2166246473789215, |
|
"learning_rate": 0.00014, |
|
"loss": 1.9167, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.014285714285714285, |
|
"grad_norm": 0.23894698917865753, |
|
"learning_rate": 0.00016, |
|
"loss": 1.8384, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01607142857142857, |
|
"grad_norm": 0.2704271674156189, |
|
"learning_rate": 0.00018, |
|
"loss": 1.8232, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.017857142857142856, |
|
"grad_norm": 0.24295401573181152, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1559, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.019642857142857142, |
|
"grad_norm": 0.23109205067157745, |
|
"learning_rate": 0.00019999836866486503, |
|
"loss": 1.7082, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.02142857142857143, |
|
"grad_norm": 0.205198273062706, |
|
"learning_rate": 0.00019999347471268516, |
|
"loss": 1.8417, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.023214285714285715, |
|
"grad_norm": 0.2678169012069702, |
|
"learning_rate": 0.00019998531830313395, |
|
"loss": 1.9222, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 0.2563900053501129, |
|
"learning_rate": 0.0001999738997023281, |
|
"loss": 1.8931, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.026785714285714284, |
|
"grad_norm": 0.31391921639442444, |
|
"learning_rate": 0.00019995921928281894, |
|
"loss": 1.9403, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02857142857142857, |
|
"grad_norm": 0.23522017896175385, |
|
"learning_rate": 0.00019994127752358013, |
|
"loss": 1.8728, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.030357142857142857, |
|
"grad_norm": 0.2517368495464325, |
|
"learning_rate": 0.00019992007500999214, |
|
"loss": 1.9224, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.03214285714285714, |
|
"grad_norm": 0.27111852169036865, |
|
"learning_rate": 0.00019989561243382312, |
|
"loss": 1.9046, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.033928571428571426, |
|
"grad_norm": 0.2616860568523407, |
|
"learning_rate": 0.00019986789059320615, |
|
"loss": 2.1173, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.03571428571428571, |
|
"grad_norm": 0.23268847167491913, |
|
"learning_rate": 0.00019983691039261357, |
|
"loss": 1.8406, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0375, |
|
"grad_norm": 0.30090174078941345, |
|
"learning_rate": 0.00019980267284282717, |
|
"loss": 1.7514, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.039285714285714285, |
|
"grad_norm": 0.23314997553825378, |
|
"learning_rate": 0.00019976517906090529, |
|
"loss": 1.7518, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.04107142857142857, |
|
"grad_norm": 0.2630898952484131, |
|
"learning_rate": 0.0001997244302701464, |
|
"loss": 1.7533, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.04285714285714286, |
|
"grad_norm": 0.3203246593475342, |
|
"learning_rate": 0.00019968042780004917, |
|
"loss": 1.534, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.044642857142857144, |
|
"grad_norm": 0.48179084062576294, |
|
"learning_rate": 0.00019963317308626914, |
|
"loss": 1.4211, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04642857142857143, |
|
"grad_norm": 0.36420196294784546, |
|
"learning_rate": 0.0001995826676705718, |
|
"loss": 1.4275, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.048214285714285716, |
|
"grad_norm": 0.35330235958099365, |
|
"learning_rate": 0.00019952891320078236, |
|
"loss": 1.5612, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.27140146493911743, |
|
"learning_rate": 0.00019947191143073186, |
|
"loss": 1.6001, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.05178571428571429, |
|
"grad_norm": 0.2987808883190155, |
|
"learning_rate": 0.00019941166422020014, |
|
"loss": 1.3943, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.05357142857142857, |
|
"grad_norm": 0.3032172918319702, |
|
"learning_rate": 0.00019934817353485501, |
|
"loss": 1.4888, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.055357142857142855, |
|
"grad_norm": 0.6096982359886169, |
|
"learning_rate": 0.00019928144144618824, |
|
"loss": 1.7057, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.05714285714285714, |
|
"grad_norm": 0.30788663029670715, |
|
"learning_rate": 0.0001992114701314478, |
|
"loss": 1.3929, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.05892857142857143, |
|
"grad_norm": 0.33283260464668274, |
|
"learning_rate": 0.00019913826187356696, |
|
"loss": 1.5611, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.060714285714285714, |
|
"grad_norm": 0.3044683337211609, |
|
"learning_rate": 0.00019906181906108984, |
|
"loss": 1.2961, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 0.3300105929374695, |
|
"learning_rate": 0.0001989821441880933, |
|
"loss": 1.3975, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06428571428571428, |
|
"grad_norm": 0.3126329481601715, |
|
"learning_rate": 0.00019889923985410576, |
|
"loss": 1.2759, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.06607142857142857, |
|
"grad_norm": 0.33685699105262756, |
|
"learning_rate": 0.00019881310876402223, |
|
"loss": 1.162, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.06785714285714285, |
|
"grad_norm": 0.3971424996852875, |
|
"learning_rate": 0.0001987237537280163, |
|
"loss": 1.21, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.06964285714285715, |
|
"grad_norm": 0.43255800008773804, |
|
"learning_rate": 0.00019863117766144806, |
|
"loss": 1.0186, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.07142857142857142, |
|
"grad_norm": 0.4930873215198517, |
|
"learning_rate": 0.00019853538358476932, |
|
"loss": 1.0045, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07321428571428572, |
|
"grad_norm": 0.4585554599761963, |
|
"learning_rate": 0.00019843637462342497, |
|
"loss": 1.1077, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 0.4778697192668915, |
|
"learning_rate": 0.00019833415400775093, |
|
"loss": 1.0817, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.07678571428571429, |
|
"grad_norm": 0.4595739245414734, |
|
"learning_rate": 0.0001982287250728689, |
|
"loss": 1.2805, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.07857142857142857, |
|
"grad_norm": 0.5665370225906372, |
|
"learning_rate": 0.00019812009125857728, |
|
"loss": 0.9855, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08035714285714286, |
|
"grad_norm": 0.40399685502052307, |
|
"learning_rate": 0.00019800825610923934, |
|
"loss": 1.3264, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08214285714285714, |
|
"grad_norm": 0.43560901284217834, |
|
"learning_rate": 0.00019789322327366723, |
|
"loss": 1.2724, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.08392857142857142, |
|
"grad_norm": 0.375598669052124, |
|
"learning_rate": 0.000197774996505003, |
|
"loss": 1.1722, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.08571428571428572, |
|
"grad_norm": 0.4257534146308899, |
|
"learning_rate": 0.00019765357966059638, |
|
"loss": 1.3607, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0875, |
|
"grad_norm": 0.5437749028205872, |
|
"learning_rate": 0.0001975289767018786, |
|
"loss": 1.9812, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.08928571428571429, |
|
"grad_norm": 0.6295837163925171, |
|
"learning_rate": 0.00019740119169423337, |
|
"loss": 2.1601, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09107142857142857, |
|
"grad_norm": 0.22341597080230713, |
|
"learning_rate": 0.00019727022880686412, |
|
"loss": 1.7096, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.09285714285714286, |
|
"grad_norm": 0.26040709018707275, |
|
"learning_rate": 0.00019713609231265805, |
|
"loss": 1.733, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.09464285714285714, |
|
"grad_norm": 0.2146587371826172, |
|
"learning_rate": 0.00019699878658804672, |
|
"loss": 1.7999, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.09642857142857143, |
|
"grad_norm": 0.2316572070121765, |
|
"learning_rate": 0.0001968583161128631, |
|
"loss": 1.6017, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.09821428571428571, |
|
"grad_norm": 0.20329022407531738, |
|
"learning_rate": 0.00019671468547019573, |
|
"loss": 1.7464, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.22198128700256348, |
|
"learning_rate": 0.00019656789934623881, |
|
"loss": 1.7584, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.10178571428571428, |
|
"grad_norm": 0.2108459621667862, |
|
"learning_rate": 0.00019641796253013958, |
|
"loss": 1.7389, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.10357142857142858, |
|
"grad_norm": 0.2677428126335144, |
|
"learning_rate": 0.00019626487991384196, |
|
"loss": 1.9166, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.10535714285714286, |
|
"grad_norm": 0.22205914556980133, |
|
"learning_rate": 0.00019610865649192697, |
|
"loss": 1.738, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.10714285714285714, |
|
"grad_norm": 0.22724869847297668, |
|
"learning_rate": 0.00019594929736144976, |
|
"loss": 1.7451, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.10892857142857143, |
|
"grad_norm": 0.24029673635959625, |
|
"learning_rate": 0.00019578680772177327, |
|
"loss": 1.9556, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.11071428571428571, |
|
"grad_norm": 0.2240082174539566, |
|
"learning_rate": 0.00019562119287439873, |
|
"loss": 1.6435, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1125, |
|
"grad_norm": 0.23900961875915527, |
|
"learning_rate": 0.00019545245822279243, |
|
"loss": 1.7015, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 0.2319100946187973, |
|
"learning_rate": 0.0001952806092722098, |
|
"loss": 1.7287, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.11607142857142858, |
|
"grad_norm": 0.22805678844451904, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 1.7574, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.11785714285714285, |
|
"grad_norm": 0.2389436662197113, |
|
"learning_rate": 0.00019492759100300019, |
|
"loss": 1.7738, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.11964285714285715, |
|
"grad_norm": 0.23373426496982574, |
|
"learning_rate": 0.00019474643320219532, |
|
"loss": 1.5328, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.12142857142857143, |
|
"grad_norm": 0.2664267420768738, |
|
"learning_rate": 0.0001945621841376825, |
|
"loss": 2.0273, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.12321428571428572, |
|
"grad_norm": 0.2796079218387604, |
|
"learning_rate": 0.0001943748498209012, |
|
"loss": 1.547, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.2695769667625427, |
|
"learning_rate": 0.00019418443636395248, |
|
"loss": 1.6374, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12678571428571428, |
|
"grad_norm": 0.3933340311050415, |
|
"learning_rate": 0.00019399094997939957, |
|
"loss": 1.2037, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.12857142857142856, |
|
"grad_norm": 0.2825562357902527, |
|
"learning_rate": 0.0001937943969800652, |
|
"loss": 1.5649, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.13035714285714287, |
|
"grad_norm": 0.3704264760017395, |
|
"learning_rate": 0.00019359478377882567, |
|
"loss": 1.292, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.13214285714285715, |
|
"grad_norm": 0.3214375972747803, |
|
"learning_rate": 0.00019339211688840157, |
|
"loss": 1.2539, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.13392857142857142, |
|
"grad_norm": 0.337907612323761, |
|
"learning_rate": 0.00019318640292114524, |
|
"loss": 1.3202, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1357142857142857, |
|
"grad_norm": 0.30557796359062195, |
|
"learning_rate": 0.00019297764858882514, |
|
"loss": 1.074, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1375, |
|
"grad_norm": 0.3154335916042328, |
|
"learning_rate": 0.00019276586070240682, |
|
"loss": 1.1813, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.1392857142857143, |
|
"grad_norm": 0.3587944209575653, |
|
"learning_rate": 0.0001925510461718307, |
|
"loss": 1.3602, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.14107142857142857, |
|
"grad_norm": 0.35669559240341187, |
|
"learning_rate": 0.0001923332120057866, |
|
"loss": 1.3626, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.48954322934150696, |
|
"learning_rate": 0.000192112365311485, |
|
"loss": 1.4762, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.14464285714285716, |
|
"grad_norm": 0.4903950095176697, |
|
"learning_rate": 0.00019188851329442547, |
|
"loss": 1.6974, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.14642857142857144, |
|
"grad_norm": 0.35062992572784424, |
|
"learning_rate": 0.00019166166325816118, |
|
"loss": 1.2373, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.14821428571428572, |
|
"grad_norm": 0.5671891570091248, |
|
"learning_rate": 0.0001914318226040608, |
|
"loss": 1.5644, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.338214248418808, |
|
"learning_rate": 0.000191198998831067, |
|
"loss": 1.3717, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.15178571428571427, |
|
"grad_norm": 0.4187781512737274, |
|
"learning_rate": 0.00019096319953545185, |
|
"loss": 1.1211, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.15357142857142858, |
|
"grad_norm": 0.3229452967643738, |
|
"learning_rate": 0.00019072443241056883, |
|
"loss": 1.0473, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.15535714285714286, |
|
"grad_norm": 0.376589298248291, |
|
"learning_rate": 0.00019048270524660196, |
|
"loss": 0.9684, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.15714285714285714, |
|
"grad_norm": 0.4261431396007538, |
|
"learning_rate": 0.00019023802593031154, |
|
"loss": 1.0243, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.15892857142857142, |
|
"grad_norm": 0.4411628544330597, |
|
"learning_rate": 0.0001899904024447769, |
|
"loss": 1.2739, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.16071428571428573, |
|
"grad_norm": 0.3415679335594177, |
|
"learning_rate": 0.00018973984286913584, |
|
"loss": 1.2494, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1625, |
|
"grad_norm": 0.48547816276550293, |
|
"learning_rate": 0.0001894863553783212, |
|
"loss": 1.0384, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.16428571428571428, |
|
"grad_norm": 0.398359090089798, |
|
"learning_rate": 0.00018922994824279395, |
|
"loss": 1.2409, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.16607142857142856, |
|
"grad_norm": 0.4025682806968689, |
|
"learning_rate": 0.00018897062982827344, |
|
"loss": 1.0646, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.16785714285714284, |
|
"grad_norm": 0.46782195568084717, |
|
"learning_rate": 0.00018870840859546456, |
|
"loss": 1.2167, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.16964285714285715, |
|
"grad_norm": 0.45423248410224915, |
|
"learning_rate": 0.00018844329309978145, |
|
"loss": 1.2359, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.17142857142857143, |
|
"grad_norm": 0.380495548248291, |
|
"learning_rate": 0.0001881752919910686, |
|
"loss": 1.0583, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.1732142857142857, |
|
"grad_norm": 0.41103506088256836, |
|
"learning_rate": 0.00018790441401331847, |
|
"loss": 1.2755, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 0.4458900988101959, |
|
"learning_rate": 0.00018763066800438636, |
|
"loss": 1.3651, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.1767857142857143, |
|
"grad_norm": 0.5115835666656494, |
|
"learning_rate": 0.00018735406289570192, |
|
"loss": 1.5389, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.17857142857142858, |
|
"grad_norm": 0.6476715803146362, |
|
"learning_rate": 0.00018707460771197774, |
|
"loss": 2.1173, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18035714285714285, |
|
"grad_norm": 0.18222245573997498, |
|
"learning_rate": 0.00018679231157091506, |
|
"loss": 1.3799, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.18214285714285713, |
|
"grad_norm": 0.2210777848958969, |
|
"learning_rate": 0.0001865071836829061, |
|
"loss": 1.6362, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.18392857142857144, |
|
"grad_norm": 0.21902061998844147, |
|
"learning_rate": 0.00018621923335073376, |
|
"loss": 1.6673, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.18571428571428572, |
|
"grad_norm": 0.21758711338043213, |
|
"learning_rate": 0.00018592846996926793, |
|
"loss": 1.669, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 0.21447710692882538, |
|
"learning_rate": 0.0001856349030251589, |
|
"loss": 1.6173, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.18928571428571428, |
|
"grad_norm": 0.20819346606731415, |
|
"learning_rate": 0.00018533854209652818, |
|
"loss": 1.6841, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.19107142857142856, |
|
"grad_norm": 0.19584102928638458, |
|
"learning_rate": 0.00018503939685265568, |
|
"loss": 1.5907, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.19285714285714287, |
|
"grad_norm": 0.2108387053012848, |
|
"learning_rate": 0.00018473747705366426, |
|
"loss": 1.7855, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.19464285714285715, |
|
"grad_norm": 0.20142777264118195, |
|
"learning_rate": 0.00018443279255020152, |
|
"loss": 1.79, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.19642857142857142, |
|
"grad_norm": 0.20290708541870117, |
|
"learning_rate": 0.00018412535328311814, |
|
"loss": 1.8891, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1982142857142857, |
|
"grad_norm": 0.20743344724178314, |
|
"learning_rate": 0.00018381516928314367, |
|
"loss": 1.6483, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.24931779503822327, |
|
"learning_rate": 0.00018350225067055925, |
|
"loss": 1.7883, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2017857142857143, |
|
"grad_norm": 0.20271241664886475, |
|
"learning_rate": 0.00018318660765486748, |
|
"loss": 1.8015, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.20357142857142857, |
|
"grad_norm": 0.21297615766525269, |
|
"learning_rate": 0.00018286825053445918, |
|
"loss": 1.7595, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.20535714285714285, |
|
"grad_norm": 0.21898160874843597, |
|
"learning_rate": 0.0001825471896962774, |
|
"loss": 1.8246, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.20714285714285716, |
|
"grad_norm": 0.2433815449476242, |
|
"learning_rate": 0.00018222343561547874, |
|
"loss": 1.9214, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.20892857142857144, |
|
"grad_norm": 0.2514473497867584, |
|
"learning_rate": 0.00018189699885509127, |
|
"loss": 1.674, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.21071428571428572, |
|
"grad_norm": 0.23736989498138428, |
|
"learning_rate": 0.0001815678900656702, |
|
"loss": 1.7948, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.2125, |
|
"grad_norm": 0.22837956249713898, |
|
"learning_rate": 0.00018123611998495007, |
|
"loss": 1.9429, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.21428571428571427, |
|
"grad_norm": 0.2722070813179016, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 1.2768, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.21607142857142858, |
|
"grad_norm": 0.23845739662647247, |
|
"learning_rate": 0.00018056463933434398, |
|
"loss": 1.7407, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.21785714285714286, |
|
"grad_norm": 0.24140624701976776, |
|
"learning_rate": 0.00018022495067265753, |
|
"loss": 1.378, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.21964285714285714, |
|
"grad_norm": 0.2878568768501282, |
|
"learning_rate": 0.0001798826445353564, |
|
"loss": 1.6684, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.22142857142857142, |
|
"grad_norm": 0.35590338706970215, |
|
"learning_rate": 0.0001795377320907611, |
|
"loss": 0.7946, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.22321428571428573, |
|
"grad_norm": 0.35154449939727783, |
|
"learning_rate": 0.00017919022459222752, |
|
"loss": 0.9185, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.225, |
|
"grad_norm": 0.31363779306411743, |
|
"learning_rate": 0.00017884013337777943, |
|
"loss": 1.3471, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.22678571428571428, |
|
"grad_norm": 0.36924973130226135, |
|
"learning_rate": 0.00017848746986973883, |
|
"loss": 1.3852, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.44929584860801697, |
|
"learning_rate": 0.00017813224557435312, |
|
"loss": 0.82, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.23035714285714284, |
|
"grad_norm": 0.3592383563518524, |
|
"learning_rate": 0.0001777744720814198, |
|
"loss": 1.0219, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.23214285714285715, |
|
"grad_norm": 0.37820902466773987, |
|
"learning_rate": 0.00017741416106390826, |
|
"loss": 1.3303, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.23392857142857143, |
|
"grad_norm": 0.33376428484916687, |
|
"learning_rate": 0.00017705132427757895, |
|
"loss": 1.384, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.2357142857142857, |
|
"grad_norm": 0.3454584777355194, |
|
"learning_rate": 0.00017668597356059978, |
|
"loss": 1.2236, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.2375, |
|
"grad_norm": 0.345708429813385, |
|
"learning_rate": 0.00017631812083316003, |
|
"loss": 1.0293, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.2392857142857143, |
|
"grad_norm": 0.3823675811290741, |
|
"learning_rate": 0.00017594777809708126, |
|
"loss": 1.3131, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.24107142857142858, |
|
"grad_norm": 0.336627334356308, |
|
"learning_rate": 0.00017557495743542585, |
|
"loss": 1.2084, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.24285714285714285, |
|
"grad_norm": 0.3525237739086151, |
|
"learning_rate": 0.0001751996710121026, |
|
"loss": 1.1378, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.24464285714285713, |
|
"grad_norm": 0.38291576504707336, |
|
"learning_rate": 0.00017482193107147014, |
|
"loss": 0.8353, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.24642857142857144, |
|
"grad_norm": 0.4650520086288452, |
|
"learning_rate": 0.0001744417499379372, |
|
"loss": 1.2024, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.24821428571428572, |
|
"grad_norm": 0.32640358805656433, |
|
"learning_rate": 0.0001740591400155606, |
|
"loss": 1.2343, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.39074593782424927, |
|
"learning_rate": 0.0001736741137876405, |
|
"loss": 0.8896, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.4710056781768799, |
|
"eval_runtime": 13.3678, |
|
"eval_samples_per_second": 17.654, |
|
"eval_steps_per_second": 8.827, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2517857142857143, |
|
"grad_norm": 0.36249810457229614, |
|
"learning_rate": 0.00017328668381631318, |
|
"loss": 1.1208, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.25357142857142856, |
|
"grad_norm": 0.3750612735748291, |
|
"learning_rate": 0.00017289686274214118, |
|
"loss": 1.2502, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.25535714285714284, |
|
"grad_norm": 0.4201869070529938, |
|
"learning_rate": 0.0001725046632837007, |
|
"loss": 1.1947, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.2571428571428571, |
|
"grad_norm": 0.4865645468235016, |
|
"learning_rate": 0.00017211009823716694, |
|
"loss": 0.8749, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.25892857142857145, |
|
"grad_norm": 0.38693225383758545, |
|
"learning_rate": 0.00017171318047589637, |
|
"loss": 1.2495, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.26071428571428573, |
|
"grad_norm": 0.40707525610923767, |
|
"learning_rate": 0.00017131392295000674, |
|
"loss": 1.2321, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.2625, |
|
"grad_norm": 0.39570894837379456, |
|
"learning_rate": 0.00017091233868595467, |
|
"loss": 1.301, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.2642857142857143, |
|
"grad_norm": 0.4085226058959961, |
|
"learning_rate": 0.00017050844078611056, |
|
"loss": 1.5369, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.26607142857142857, |
|
"grad_norm": 0.47094810009002686, |
|
"learning_rate": 0.0001701022424283311, |
|
"loss": 1.9374, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.26785714285714285, |
|
"grad_norm": 0.8517308831214905, |
|
"learning_rate": 0.00016969375686552937, |
|
"loss": 1.808, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.26964285714285713, |
|
"grad_norm": 0.1922745406627655, |
|
"learning_rate": 0.00016928299742524234, |
|
"loss": 1.6608, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.2714285714285714, |
|
"grad_norm": 0.2090916484594345, |
|
"learning_rate": 0.00016886997750919619, |
|
"loss": 1.8009, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.2732142857142857, |
|
"grad_norm": 0.21698515117168427, |
|
"learning_rate": 0.00016845471059286887, |
|
"loss": 1.7821, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.275, |
|
"grad_norm": 0.21791532635688782, |
|
"learning_rate": 0.00016803721022505067, |
|
"loss": 1.5901, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.2767857142857143, |
|
"grad_norm": 0.22199980914592743, |
|
"learning_rate": 0.00016761749002740193, |
|
"loss": 1.7047, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.2785714285714286, |
|
"grad_norm": 0.2096625566482544, |
|
"learning_rate": 0.0001671955636940088, |
|
"loss": 1.6898, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.28035714285714286, |
|
"grad_norm": 0.22975414991378784, |
|
"learning_rate": 0.00016677144499093626, |
|
"loss": 1.7631, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.28214285714285714, |
|
"grad_norm": 0.2187148928642273, |
|
"learning_rate": 0.0001663451477557792, |
|
"loss": 1.7872, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.2839285714285714, |
|
"grad_norm": 0.2257414609193802, |
|
"learning_rate": 0.0001659166858972107, |
|
"loss": 1.7732, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.22986693680286407, |
|
"learning_rate": 0.00016548607339452853, |
|
"loss": 1.7031, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2875, |
|
"grad_norm": 0.21585014462471008, |
|
"learning_rate": 0.0001650533242971987, |
|
"loss": 1.8421, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.2892857142857143, |
|
"grad_norm": 0.22519604861736298, |
|
"learning_rate": 0.00016461845272439741, |
|
"loss": 1.6529, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.2910714285714286, |
|
"grad_norm": 0.22279705107212067, |
|
"learning_rate": 0.0001641814728645502, |
|
"loss": 1.9288, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.29285714285714287, |
|
"grad_norm": 0.22392615675926208, |
|
"learning_rate": 0.000163742398974869, |
|
"loss": 1.693, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.29464285714285715, |
|
"grad_norm": 0.22729454934597015, |
|
"learning_rate": 0.00016330124538088705, |
|
"loss": 1.7027, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.29642857142857143, |
|
"grad_norm": 0.2229882776737213, |
|
"learning_rate": 0.00016285802647599156, |
|
"loss": 1.8262, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.2982142857142857, |
|
"grad_norm": 0.25520074367523193, |
|
"learning_rate": 0.00016241275672095395, |
|
"loss": 1.6009, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.24272315204143524, |
|
"learning_rate": 0.00016196545064345812, |
|
"loss": 1.9227, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.30178571428571427, |
|
"grad_norm": 0.24380216002464294, |
|
"learning_rate": 0.00016151612283762652, |
|
"loss": 1.5198, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.30357142857142855, |
|
"grad_norm": 0.3242342472076416, |
|
"learning_rate": 0.00016106478796354382, |
|
"loss": 1.6981, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3053571428571429, |
|
"grad_norm": 0.277855783700943, |
|
"learning_rate": 0.00016061146074677885, |
|
"loss": 1.7011, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.30714285714285716, |
|
"grad_norm": 0.2710039019584656, |
|
"learning_rate": 0.00016015615597790388, |
|
"loss": 1.7522, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.30892857142857144, |
|
"grad_norm": 0.26541268825531006, |
|
"learning_rate": 0.00015969888851201226, |
|
"loss": 1.3804, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.3107142857142857, |
|
"grad_norm": 0.28985923528671265, |
|
"learning_rate": 0.00015923967326823368, |
|
"loss": 1.6453, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.33939245343208313, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 1.1725, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.3142857142857143, |
|
"grad_norm": 0.29770731925964355, |
|
"learning_rate": 0.0001583154594407932, |
|
"loss": 1.6746, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.31607142857142856, |
|
"grad_norm": 0.3280562460422516, |
|
"learning_rate": 0.0001578504910111811, |
|
"loss": 1.1357, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.31785714285714284, |
|
"grad_norm": 0.2856597304344177, |
|
"learning_rate": 0.00015738363511079776, |
|
"loss": 1.1127, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.3196428571428571, |
|
"grad_norm": 0.316491961479187, |
|
"learning_rate": 0.00015691490697161182, |
|
"loss": 1.4281, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.32142857142857145, |
|
"grad_norm": 0.3632654845714569, |
|
"learning_rate": 0.00015644432188667695, |
|
"loss": 1.3413, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.32321428571428573, |
|
"grad_norm": 0.34329405426979065, |
|
"learning_rate": 0.00015597189520963277, |
|
"loss": 1.0579, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.325, |
|
"grad_norm": 0.32447105646133423, |
|
"learning_rate": 0.00015549764235420405, |
|
"loss": 1.243, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.3267857142857143, |
|
"grad_norm": 0.3558500409126282, |
|
"learning_rate": 0.0001550215787936977, |
|
"loss": 1.1376, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.32857142857142857, |
|
"grad_norm": 0.3373570740222931, |
|
"learning_rate": 0.00015454372006049803, |
|
"loss": 1.1251, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.33035714285714285, |
|
"grad_norm": 0.36412546038627625, |
|
"learning_rate": 0.00015406408174555976, |
|
"loss": 1.3238, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.33214285714285713, |
|
"grad_norm": 0.364442378282547, |
|
"learning_rate": 0.00015358267949789966, |
|
"loss": 0.9448, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.3339285714285714, |
|
"grad_norm": 0.3172107934951782, |
|
"learning_rate": 0.00015309952902408576, |
|
"loss": 1.2744, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.3357142857142857, |
|
"grad_norm": 0.34173399209976196, |
|
"learning_rate": 0.00015261464608772488, |
|
"loss": 1.0923, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.3375, |
|
"grad_norm": 0.33419185876846313, |
|
"learning_rate": 0.0001521280465089484, |
|
"loss": 1.2762, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.3392857142857143, |
|
"grad_norm": 0.3866868317127228, |
|
"learning_rate": 0.0001516397461638962, |
|
"loss": 0.9595, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3410714285714286, |
|
"grad_norm": 0.3978990614414215, |
|
"learning_rate": 0.00015114976098419842, |
|
"loss": 0.9993, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.34285714285714286, |
|
"grad_norm": 0.3546142876148224, |
|
"learning_rate": 0.00015065810695645584, |
|
"loss": 1.3421, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.34464285714285714, |
|
"grad_norm": 0.39728498458862305, |
|
"learning_rate": 0.00015016480012171828, |
|
"loss": 1.1209, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.3464285714285714, |
|
"grad_norm": 0.4170741140842438, |
|
"learning_rate": 0.00014966985657496114, |
|
"loss": 1.0024, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.3482142857142857, |
|
"grad_norm": 0.4226652681827545, |
|
"learning_rate": 0.0001491732924645604, |
|
"loss": 1.3139, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.3712114691734314, |
|
"learning_rate": 0.00014867512399176563, |
|
"loss": 1.1574, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.3517857142857143, |
|
"grad_norm": 0.3655322790145874, |
|
"learning_rate": 0.00014817536741017152, |
|
"loss": 1.6149, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.3535714285714286, |
|
"grad_norm": 0.4362059533596039, |
|
"learning_rate": 0.0001476740390251875, |
|
"loss": 1.7657, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.35535714285714287, |
|
"grad_norm": 0.43134769797325134, |
|
"learning_rate": 0.00014717115519350567, |
|
"loss": 1.7167, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 0.7784890532493591, |
|
"learning_rate": 0.00014666673232256738, |
|
"loss": 2.036, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.35892857142857143, |
|
"grad_norm": 0.17376984655857086, |
|
"learning_rate": 0.0001461607868700276, |
|
"loss": 1.4856, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.3607142857142857, |
|
"grad_norm": 0.2141953408718109, |
|
"learning_rate": 0.00014565333534321826, |
|
"loss": 1.7491, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.3625, |
|
"grad_norm": 0.22548137605190277, |
|
"learning_rate": 0.00014514439429860943, |
|
"loss": 1.8457, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.36428571428571427, |
|
"grad_norm": 0.20618294179439545, |
|
"learning_rate": 0.0001446339803412692, |
|
"loss": 1.4987, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.36607142857142855, |
|
"grad_norm": 0.21025151014328003, |
|
"learning_rate": 0.00014412211012432212, |
|
"loss": 1.5568, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.3678571428571429, |
|
"grad_norm": 0.21678180992603302, |
|
"learning_rate": 0.00014360880034840554, |
|
"loss": 1.7841, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.36964285714285716, |
|
"grad_norm": 0.20914790034294128, |
|
"learning_rate": 0.0001430940677611249, |
|
"loss": 1.6693, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.37142857142857144, |
|
"grad_norm": 0.21597585082054138, |
|
"learning_rate": 0.00014257792915650728, |
|
"loss": 1.648, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.3732142857142857, |
|
"grad_norm": 0.23697789013385773, |
|
"learning_rate": 0.00014206040137445348, |
|
"loss": 1.7616, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.2535800635814667, |
|
"learning_rate": 0.00014154150130018866, |
|
"loss": 2.0279, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3767857142857143, |
|
"grad_norm": 0.21204812824726105, |
|
"learning_rate": 0.0001410212458637112, |
|
"loss": 1.8472, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.37857142857142856, |
|
"grad_norm": 0.36059629917144775, |
|
"learning_rate": 0.00014049965203924054, |
|
"loss": 1.8042, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.38035714285714284, |
|
"grad_norm": 0.21400661766529083, |
|
"learning_rate": 0.0001399767368446634, |
|
"loss": 1.698, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.3821428571428571, |
|
"grad_norm": 0.24055758118629456, |
|
"learning_rate": 0.00013945251734097828, |
|
"loss": 1.8758, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.38392857142857145, |
|
"grad_norm": 0.23605166375637054, |
|
"learning_rate": 0.00013892701063173918, |
|
"loss": 1.7425, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.38571428571428573, |
|
"grad_norm": 0.23343758285045624, |
|
"learning_rate": 0.00013840023386249713, |
|
"loss": 1.8683, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.3875, |
|
"grad_norm": 0.2475200593471527, |
|
"learning_rate": 0.00013787220422024134, |
|
"loss": 1.9091, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.3892857142857143, |
|
"grad_norm": 0.2618944048881531, |
|
"learning_rate": 0.00013734293893283783, |
|
"loss": 1.5086, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.39107142857142857, |
|
"grad_norm": 0.2627498209476471, |
|
"learning_rate": 0.00013681245526846783, |
|
"loss": 1.3878, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.39285714285714285, |
|
"grad_norm": 0.24390314519405365, |
|
"learning_rate": 0.0001362807705350641, |
|
"loss": 1.7332, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.39464285714285713, |
|
"grad_norm": 0.2768295705318451, |
|
"learning_rate": 0.00013574790207974646, |
|
"loss": 1.3123, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.3964285714285714, |
|
"grad_norm": 0.2606358230113983, |
|
"learning_rate": 0.0001352138672882555, |
|
"loss": 1.4506, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.3982142857142857, |
|
"grad_norm": 0.24806426465511322, |
|
"learning_rate": 0.00013467868358438563, |
|
"loss": 1.7087, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.2664608061313629, |
|
"learning_rate": 0.00013414236842941644, |
|
"loss": 1.3124, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.4017857142857143, |
|
"grad_norm": 0.2661263346672058, |
|
"learning_rate": 0.00013360493932154302, |
|
"loss": 1.2174, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.4035714285714286, |
|
"grad_norm": 0.3460038900375366, |
|
"learning_rate": 0.00013306641379530514, |
|
"loss": 0.6889, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.40535714285714286, |
|
"grad_norm": 0.2929069995880127, |
|
"learning_rate": 0.000132526809421015, |
|
"loss": 0.9457, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.40714285714285714, |
|
"grad_norm": 0.3459819257259369, |
|
"learning_rate": 0.00013198614380418412, |
|
"loss": 1.2547, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.4089285714285714, |
|
"grad_norm": 0.30105313658714294, |
|
"learning_rate": 0.00013144443458494882, |
|
"loss": 0.957, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.4107142857142857, |
|
"grad_norm": 0.3461960256099701, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 1.3146, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4125, |
|
"grad_norm": 0.34542855620384216, |
|
"learning_rate": 0.00013035795606948023, |
|
"loss": 1.1128, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.4142857142857143, |
|
"grad_norm": 0.37605586647987366, |
|
"learning_rate": 0.00012981322222145846, |
|
"loss": 1.5095, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.4160714285714286, |
|
"grad_norm": 0.37267056107521057, |
|
"learning_rate": 0.00012926751566629875, |
|
"loss": 1.071, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.41785714285714287, |
|
"grad_norm": 0.3052172064781189, |
|
"learning_rate": 0.00012872085420860665, |
|
"loss": 1.3136, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.41964285714285715, |
|
"grad_norm": 0.36694592237472534, |
|
"learning_rate": 0.00012817325568414297, |
|
"loss": 1.2439, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.42142857142857143, |
|
"grad_norm": 0.36055245995521545, |
|
"learning_rate": 0.00012762473795924204, |
|
"loss": 1.1165, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.4232142857142857, |
|
"grad_norm": 0.3014545738697052, |
|
"learning_rate": 0.00012707531893022854, |
|
"loss": 1.5423, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.425, |
|
"grad_norm": 0.3208891749382019, |
|
"learning_rate": 0.00012652501652283377, |
|
"loss": 1.1813, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.42678571428571427, |
|
"grad_norm": 0.38703230023384094, |
|
"learning_rate": 0.00012597384869161084, |
|
"loss": 0.7706, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 0.38256821036338806, |
|
"learning_rate": 0.00012542183341934872, |
|
"loss": 1.0565, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4303571428571429, |
|
"grad_norm": 0.3555380702018738, |
|
"learning_rate": 0.0001248689887164855, |
|
"loss": 0.849, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.43214285714285716, |
|
"grad_norm": 0.3472703993320465, |
|
"learning_rate": 0.00012431533262052098, |
|
"loss": 1.3984, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.43392857142857144, |
|
"grad_norm": 0.3631349503993988, |
|
"learning_rate": 0.000123760883195428, |
|
"loss": 0.8955, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.4357142857142857, |
|
"grad_norm": 0.349295973777771, |
|
"learning_rate": 0.00012320565853106316, |
|
"loss": 0.8866, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 0.33635953068733215, |
|
"learning_rate": 0.00012264967674257646, |
|
"loss": 1.2419, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4392857142857143, |
|
"grad_norm": 0.3833181858062744, |
|
"learning_rate": 0.00012209295596982042, |
|
"loss": 1.5507, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.44107142857142856, |
|
"grad_norm": 0.3737214505672455, |
|
"learning_rate": 0.00012153551437675821, |
|
"loss": 1.4881, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.44285714285714284, |
|
"grad_norm": 0.4705282747745514, |
|
"learning_rate": 0.00012097737015087094, |
|
"loss": 1.4864, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.4446428571428571, |
|
"grad_norm": 0.39539188146591187, |
|
"learning_rate": 0.00012041854150256433, |
|
"loss": 1.7855, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.44642857142857145, |
|
"grad_norm": 0.7369075417518616, |
|
"learning_rate": 0.00011985904666457455, |
|
"loss": 2.01, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.44821428571428573, |
|
"grad_norm": 0.18146094679832458, |
|
"learning_rate": 0.00011929890389137337, |
|
"loss": 1.5898, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.21558880805969238, |
|
"learning_rate": 0.00011873813145857249, |
|
"loss": 1.5816, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.4517857142857143, |
|
"grad_norm": 0.19599275290966034, |
|
"learning_rate": 0.00011817674766232734, |
|
"loss": 1.6433, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.45357142857142857, |
|
"grad_norm": 0.22075910866260529, |
|
"learning_rate": 0.00011761477081874015, |
|
"loss": 1.6005, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.45535714285714285, |
|
"grad_norm": 0.19471955299377441, |
|
"learning_rate": 0.0001170522192632624, |
|
"loss": 1.7133, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.45714285714285713, |
|
"grad_norm": 0.19876879453659058, |
|
"learning_rate": 0.00011648911135009634, |
|
"loss": 1.5085, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.4589285714285714, |
|
"grad_norm": 0.20565317571163177, |
|
"learning_rate": 0.00011592546545159645, |
|
"loss": 1.7386, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.4607142857142857, |
|
"grad_norm": 0.24483506381511688, |
|
"learning_rate": 0.00011536129995766996, |
|
"loss": 1.7162, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.4625, |
|
"grad_norm": 0.21543823182582855, |
|
"learning_rate": 0.00011479663327517667, |
|
"loss": 1.6966, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.4642857142857143, |
|
"grad_norm": 0.2661048471927643, |
|
"learning_rate": 0.00011423148382732853, |
|
"loss": 1.8821, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4660714285714286, |
|
"grad_norm": 0.24292460083961487, |
|
"learning_rate": 0.00011366587005308858, |
|
"loss": 1.7085, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.46785714285714286, |
|
"grad_norm": 0.216167613863945, |
|
"learning_rate": 0.0001130998104065693, |
|
"loss": 1.7298, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.46964285714285714, |
|
"grad_norm": 0.2111697793006897, |
|
"learning_rate": 0.00011253332335643043, |
|
"loss": 1.8098, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.4714285714285714, |
|
"grad_norm": 0.23981061577796936, |
|
"learning_rate": 0.00011196642738527659, |
|
"loss": 1.7026, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.4732142857142857, |
|
"grad_norm": 0.2623251676559448, |
|
"learning_rate": 0.00011139914098905406, |
|
"loss": 1.7894, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.475, |
|
"grad_norm": 0.2482486367225647, |
|
"learning_rate": 0.00011083148267644747, |
|
"loss": 1.9019, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.4767857142857143, |
|
"grad_norm": 0.238911435008049, |
|
"learning_rate": 0.00011026347096827578, |
|
"loss": 1.6809, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.4785714285714286, |
|
"grad_norm": 0.24704696238040924, |
|
"learning_rate": 0.00010969512439688816, |
|
"loss": 1.6607, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.48035714285714287, |
|
"grad_norm": 0.25105100870132446, |
|
"learning_rate": 0.00010912646150555919, |
|
"loss": 1.5895, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.48214285714285715, |
|
"grad_norm": 0.2849842607975006, |
|
"learning_rate": 0.00010855750084788398, |
|
"loss": 2.0812, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.48392857142857143, |
|
"grad_norm": 0.2599342167377472, |
|
"learning_rate": 0.00010798826098717276, |
|
"loss": 1.1569, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.4857142857142857, |
|
"grad_norm": 0.28037258982658386, |
|
"learning_rate": 0.00010741876049584523, |
|
"loss": 1.1928, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.4875, |
|
"grad_norm": 0.29920563101768494, |
|
"learning_rate": 0.00010684901795482456, |
|
"loss": 1.2244, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.48928571428571427, |
|
"grad_norm": 0.2799164354801178, |
|
"learning_rate": 0.00010627905195293135, |
|
"loss": 1.4455, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.49107142857142855, |
|
"grad_norm": 0.23873603343963623, |
|
"learning_rate": 0.00010570888108627681, |
|
"loss": 0.852, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.4928571428571429, |
|
"grad_norm": 0.2817741632461548, |
|
"learning_rate": 0.00010513852395765631, |
|
"loss": 1.3203, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.49464285714285716, |
|
"grad_norm": 0.27295514941215515, |
|
"learning_rate": 0.00010456799917594233, |
|
"loss": 0.749, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.49642857142857144, |
|
"grad_norm": 0.30728739500045776, |
|
"learning_rate": 0.00010399732535547734, |
|
"loss": 1.0083, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.4982142857142857, |
|
"grad_norm": 0.32001444697380066, |
|
"learning_rate": 0.00010342652111546635, |
|
"loss": 1.573, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.3400101065635681, |
|
"learning_rate": 0.00010285560507936961, |
|
"loss": 1.0757, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.4439584016799927, |
|
"eval_runtime": 13.9741, |
|
"eval_samples_per_second": 16.888, |
|
"eval_steps_per_second": 8.444, |
|
"step": 280 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 560, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 140, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.129139501635994e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |