|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 140, |
|
"global_step": 560, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0017857142857142857, |
|
"grad_norm": 0.14711466431617737, |
|
"learning_rate": 2e-05, |
|
"loss": 1.6301, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0035714285714285713, |
|
"grad_norm": 0.13304685056209564, |
|
"learning_rate": 4e-05, |
|
"loss": 1.6271, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.005357142857142857, |
|
"grad_norm": 0.1233660876750946, |
|
"learning_rate": 6e-05, |
|
"loss": 1.6548, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.007142857142857143, |
|
"grad_norm": 0.1381577104330063, |
|
"learning_rate": 8e-05, |
|
"loss": 1.7853, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.008928571428571428, |
|
"grad_norm": 0.14668770134449005, |
|
"learning_rate": 0.0001, |
|
"loss": 1.7507, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.010714285714285714, |
|
"grad_norm": 0.168854758143425, |
|
"learning_rate": 0.00012, |
|
"loss": 1.9486, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0125, |
|
"grad_norm": 0.2166246473789215, |
|
"learning_rate": 0.00014, |
|
"loss": 1.9167, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.014285714285714285, |
|
"grad_norm": 0.23894698917865753, |
|
"learning_rate": 0.00016, |
|
"loss": 1.8384, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.01607142857142857, |
|
"grad_norm": 0.2704271674156189, |
|
"learning_rate": 0.00018, |
|
"loss": 1.8232, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.017857142857142856, |
|
"grad_norm": 0.24295401573181152, |
|
"learning_rate": 0.0002, |
|
"loss": 2.1559, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.019642857142857142, |
|
"grad_norm": 0.23109205067157745, |
|
"learning_rate": 0.00019999836866486503, |
|
"loss": 1.7082, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.02142857142857143, |
|
"grad_norm": 0.205198273062706, |
|
"learning_rate": 0.00019999347471268516, |
|
"loss": 1.8417, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.023214285714285715, |
|
"grad_norm": 0.2678169012069702, |
|
"learning_rate": 0.00019998531830313395, |
|
"loss": 1.9222, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 0.2563900053501129, |
|
"learning_rate": 0.0001999738997023281, |
|
"loss": 1.8931, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.026785714285714284, |
|
"grad_norm": 0.31391921639442444, |
|
"learning_rate": 0.00019995921928281894, |
|
"loss": 1.9403, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02857142857142857, |
|
"grad_norm": 0.23522017896175385, |
|
"learning_rate": 0.00019994127752358013, |
|
"loss": 1.8728, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.030357142857142857, |
|
"grad_norm": 0.2517368495464325, |
|
"learning_rate": 0.00019992007500999214, |
|
"loss": 1.9224, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.03214285714285714, |
|
"grad_norm": 0.27111852169036865, |
|
"learning_rate": 0.00019989561243382312, |
|
"loss": 1.9046, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.033928571428571426, |
|
"grad_norm": 0.2616860568523407, |
|
"learning_rate": 0.00019986789059320615, |
|
"loss": 2.1173, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.03571428571428571, |
|
"grad_norm": 0.23268847167491913, |
|
"learning_rate": 0.00019983691039261357, |
|
"loss": 1.8406, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0375, |
|
"grad_norm": 0.30090174078941345, |
|
"learning_rate": 0.00019980267284282717, |
|
"loss": 1.7514, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.039285714285714285, |
|
"grad_norm": 0.23314997553825378, |
|
"learning_rate": 0.00019976517906090529, |
|
"loss": 1.7518, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.04107142857142857, |
|
"grad_norm": 0.2630898952484131, |
|
"learning_rate": 0.0001997244302701464, |
|
"loss": 1.7533, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.04285714285714286, |
|
"grad_norm": 0.3203246593475342, |
|
"learning_rate": 0.00019968042780004917, |
|
"loss": 1.534, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.044642857142857144, |
|
"grad_norm": 0.48179084062576294, |
|
"learning_rate": 0.00019963317308626914, |
|
"loss": 1.4211, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04642857142857143, |
|
"grad_norm": 0.36420196294784546, |
|
"learning_rate": 0.0001995826676705718, |
|
"loss": 1.4275, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.048214285714285716, |
|
"grad_norm": 0.35330235958099365, |
|
"learning_rate": 0.00019952891320078236, |
|
"loss": 1.5612, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.27140146493911743, |
|
"learning_rate": 0.00019947191143073186, |
|
"loss": 1.6001, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.05178571428571429, |
|
"grad_norm": 0.2987808883190155, |
|
"learning_rate": 0.00019941166422020014, |
|
"loss": 1.3943, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.05357142857142857, |
|
"grad_norm": 0.3032172918319702, |
|
"learning_rate": 0.00019934817353485501, |
|
"loss": 1.4888, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.055357142857142855, |
|
"grad_norm": 0.6096982359886169, |
|
"learning_rate": 0.00019928144144618824, |
|
"loss": 1.7057, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.05714285714285714, |
|
"grad_norm": 0.30788663029670715, |
|
"learning_rate": 0.0001992114701314478, |
|
"loss": 1.3929, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.05892857142857143, |
|
"grad_norm": 0.33283260464668274, |
|
"learning_rate": 0.00019913826187356696, |
|
"loss": 1.5611, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.060714285714285714, |
|
"grad_norm": 0.3044683337211609, |
|
"learning_rate": 0.00019906181906108984, |
|
"loss": 1.2961, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 0.3300105929374695, |
|
"learning_rate": 0.0001989821441880933, |
|
"loss": 1.3975, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06428571428571428, |
|
"grad_norm": 0.3126329481601715, |
|
"learning_rate": 0.00019889923985410576, |
|
"loss": 1.2759, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.06607142857142857, |
|
"grad_norm": 0.33685699105262756, |
|
"learning_rate": 0.00019881310876402223, |
|
"loss": 1.162, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.06785714285714285, |
|
"grad_norm": 0.3971424996852875, |
|
"learning_rate": 0.0001987237537280163, |
|
"loss": 1.21, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.06964285714285715, |
|
"grad_norm": 0.43255800008773804, |
|
"learning_rate": 0.00019863117766144806, |
|
"loss": 1.0186, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.07142857142857142, |
|
"grad_norm": 0.4930873215198517, |
|
"learning_rate": 0.00019853538358476932, |
|
"loss": 1.0045, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07321428571428572, |
|
"grad_norm": 0.4585554599761963, |
|
"learning_rate": 0.00019843637462342497, |
|
"loss": 1.1077, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 0.4778697192668915, |
|
"learning_rate": 0.00019833415400775093, |
|
"loss": 1.0817, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.07678571428571429, |
|
"grad_norm": 0.4595739245414734, |
|
"learning_rate": 0.0001982287250728689, |
|
"loss": 1.2805, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.07857142857142857, |
|
"grad_norm": 0.5665370225906372, |
|
"learning_rate": 0.00019812009125857728, |
|
"loss": 0.9855, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08035714285714286, |
|
"grad_norm": 0.40399685502052307, |
|
"learning_rate": 0.00019800825610923934, |
|
"loss": 1.3264, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08214285714285714, |
|
"grad_norm": 0.43560901284217834, |
|
"learning_rate": 0.00019789322327366723, |
|
"loss": 1.2724, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.08392857142857142, |
|
"grad_norm": 0.375598669052124, |
|
"learning_rate": 0.000197774996505003, |
|
"loss": 1.1722, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.08571428571428572, |
|
"grad_norm": 0.4257534146308899, |
|
"learning_rate": 0.00019765357966059638, |
|
"loss": 1.3607, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0875, |
|
"grad_norm": 0.5437749028205872, |
|
"learning_rate": 0.0001975289767018786, |
|
"loss": 1.9812, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.08928571428571429, |
|
"grad_norm": 0.6295837163925171, |
|
"learning_rate": 0.00019740119169423337, |
|
"loss": 2.1601, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09107142857142857, |
|
"grad_norm": 0.22341597080230713, |
|
"learning_rate": 0.00019727022880686412, |
|
"loss": 1.7096, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.09285714285714286, |
|
"grad_norm": 0.26040709018707275, |
|
"learning_rate": 0.00019713609231265805, |
|
"loss": 1.733, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.09464285714285714, |
|
"grad_norm": 0.2146587371826172, |
|
"learning_rate": 0.00019699878658804672, |
|
"loss": 1.7999, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.09642857142857143, |
|
"grad_norm": 0.2316572070121765, |
|
"learning_rate": 0.0001968583161128631, |
|
"loss": 1.6017, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.09821428571428571, |
|
"grad_norm": 0.20329022407531738, |
|
"learning_rate": 0.00019671468547019573, |
|
"loss": 1.7464, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.22198128700256348, |
|
"learning_rate": 0.00019656789934623881, |
|
"loss": 1.7584, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.10178571428571428, |
|
"grad_norm": 0.2108459621667862, |
|
"learning_rate": 0.00019641796253013958, |
|
"loss": 1.7389, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.10357142857142858, |
|
"grad_norm": 0.2677428126335144, |
|
"learning_rate": 0.00019626487991384196, |
|
"loss": 1.9166, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.10535714285714286, |
|
"grad_norm": 0.22205914556980133, |
|
"learning_rate": 0.00019610865649192697, |
|
"loss": 1.738, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.10714285714285714, |
|
"grad_norm": 0.22724869847297668, |
|
"learning_rate": 0.00019594929736144976, |
|
"loss": 1.7451, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.10892857142857143, |
|
"grad_norm": 0.24029673635959625, |
|
"learning_rate": 0.00019578680772177327, |
|
"loss": 1.9556, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.11071428571428571, |
|
"grad_norm": 0.2240082174539566, |
|
"learning_rate": 0.00019562119287439873, |
|
"loss": 1.6435, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1125, |
|
"grad_norm": 0.23900961875915527, |
|
"learning_rate": 0.00019545245822279243, |
|
"loss": 1.7015, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 0.2319100946187973, |
|
"learning_rate": 0.0001952806092722098, |
|
"loss": 1.7287, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.11607142857142858, |
|
"grad_norm": 0.22805678844451904, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 1.7574, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.11785714285714285, |
|
"grad_norm": 0.2389436662197113, |
|
"learning_rate": 0.00019492759100300019, |
|
"loss": 1.7738, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.11964285714285715, |
|
"grad_norm": 0.23373426496982574, |
|
"learning_rate": 0.00019474643320219532, |
|
"loss": 1.5328, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.12142857142857143, |
|
"grad_norm": 0.2664267420768738, |
|
"learning_rate": 0.0001945621841376825, |
|
"loss": 2.0273, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.12321428571428572, |
|
"grad_norm": 0.2796079218387604, |
|
"learning_rate": 0.0001943748498209012, |
|
"loss": 1.547, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.2695769667625427, |
|
"learning_rate": 0.00019418443636395248, |
|
"loss": 1.6374, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12678571428571428, |
|
"grad_norm": 0.3933340311050415, |
|
"learning_rate": 0.00019399094997939957, |
|
"loss": 1.2037, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.12857142857142856, |
|
"grad_norm": 0.2825562357902527, |
|
"learning_rate": 0.0001937943969800652, |
|
"loss": 1.5649, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.13035714285714287, |
|
"grad_norm": 0.3704264760017395, |
|
"learning_rate": 0.00019359478377882567, |
|
"loss": 1.292, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.13214285714285715, |
|
"grad_norm": 0.3214375972747803, |
|
"learning_rate": 0.00019339211688840157, |
|
"loss": 1.2539, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.13392857142857142, |
|
"grad_norm": 0.337907612323761, |
|
"learning_rate": 0.00019318640292114524, |
|
"loss": 1.3202, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1357142857142857, |
|
"grad_norm": 0.30557796359062195, |
|
"learning_rate": 0.00019297764858882514, |
|
"loss": 1.074, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1375, |
|
"grad_norm": 0.3154335916042328, |
|
"learning_rate": 0.00019276586070240682, |
|
"loss": 1.1813, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.1392857142857143, |
|
"grad_norm": 0.3587944209575653, |
|
"learning_rate": 0.0001925510461718307, |
|
"loss": 1.3602, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.14107142857142857, |
|
"grad_norm": 0.35669559240341187, |
|
"learning_rate": 0.0001923332120057866, |
|
"loss": 1.3626, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.48954322934150696, |
|
"learning_rate": 0.000192112365311485, |
|
"loss": 1.4762, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.14464285714285716, |
|
"grad_norm": 0.4903950095176697, |
|
"learning_rate": 0.00019188851329442547, |
|
"loss": 1.6974, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.14642857142857144, |
|
"grad_norm": 0.35062992572784424, |
|
"learning_rate": 0.00019166166325816118, |
|
"loss": 1.2373, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.14821428571428572, |
|
"grad_norm": 0.5671891570091248, |
|
"learning_rate": 0.0001914318226040608, |
|
"loss": 1.5644, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.338214248418808, |
|
"learning_rate": 0.000191198998831067, |
|
"loss": 1.3717, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.15178571428571427, |
|
"grad_norm": 0.4187781512737274, |
|
"learning_rate": 0.00019096319953545185, |
|
"loss": 1.1211, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.15357142857142858, |
|
"grad_norm": 0.3229452967643738, |
|
"learning_rate": 0.00019072443241056883, |
|
"loss": 1.0473, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.15535714285714286, |
|
"grad_norm": 0.376589298248291, |
|
"learning_rate": 0.00019048270524660196, |
|
"loss": 0.9684, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.15714285714285714, |
|
"grad_norm": 0.4261431396007538, |
|
"learning_rate": 0.00019023802593031154, |
|
"loss": 1.0243, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.15892857142857142, |
|
"grad_norm": 0.4411628544330597, |
|
"learning_rate": 0.0001899904024447769, |
|
"loss": 1.2739, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.16071428571428573, |
|
"grad_norm": 0.3415679335594177, |
|
"learning_rate": 0.00018973984286913584, |
|
"loss": 1.2494, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1625, |
|
"grad_norm": 0.48547816276550293, |
|
"learning_rate": 0.0001894863553783212, |
|
"loss": 1.0384, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.16428571428571428, |
|
"grad_norm": 0.398359090089798, |
|
"learning_rate": 0.00018922994824279395, |
|
"loss": 1.2409, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.16607142857142856, |
|
"grad_norm": 0.4025682806968689, |
|
"learning_rate": 0.00018897062982827344, |
|
"loss": 1.0646, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.16785714285714284, |
|
"grad_norm": 0.46782195568084717, |
|
"learning_rate": 0.00018870840859546456, |
|
"loss": 1.2167, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.16964285714285715, |
|
"grad_norm": 0.45423248410224915, |
|
"learning_rate": 0.00018844329309978145, |
|
"loss": 1.2359, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.17142857142857143, |
|
"grad_norm": 0.380495548248291, |
|
"learning_rate": 0.0001881752919910686, |
|
"loss": 1.0583, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.1732142857142857, |
|
"grad_norm": 0.41103506088256836, |
|
"learning_rate": 0.00018790441401331847, |
|
"loss": 1.2755, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 0.4458900988101959, |
|
"learning_rate": 0.00018763066800438636, |
|
"loss": 1.3651, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.1767857142857143, |
|
"grad_norm": 0.5115835666656494, |
|
"learning_rate": 0.00018735406289570192, |
|
"loss": 1.5389, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.17857142857142858, |
|
"grad_norm": 0.6476715803146362, |
|
"learning_rate": 0.00018707460771197774, |
|
"loss": 2.1173, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18035714285714285, |
|
"grad_norm": 0.18222245573997498, |
|
"learning_rate": 0.00018679231157091506, |
|
"loss": 1.3799, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.18214285714285713, |
|
"grad_norm": 0.2210777848958969, |
|
"learning_rate": 0.0001865071836829061, |
|
"loss": 1.6362, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.18392857142857144, |
|
"grad_norm": 0.21902061998844147, |
|
"learning_rate": 0.00018621923335073376, |
|
"loss": 1.6673, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.18571428571428572, |
|
"grad_norm": 0.21758711338043213, |
|
"learning_rate": 0.00018592846996926793, |
|
"loss": 1.669, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 0.21447710692882538, |
|
"learning_rate": 0.0001856349030251589, |
|
"loss": 1.6173, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.18928571428571428, |
|
"grad_norm": 0.20819346606731415, |
|
"learning_rate": 0.00018533854209652818, |
|
"loss": 1.6841, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.19107142857142856, |
|
"grad_norm": 0.19584102928638458, |
|
"learning_rate": 0.00018503939685265568, |
|
"loss": 1.5907, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.19285714285714287, |
|
"grad_norm": 0.2108387053012848, |
|
"learning_rate": 0.00018473747705366426, |
|
"loss": 1.7855, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.19464285714285715, |
|
"grad_norm": 0.20142777264118195, |
|
"learning_rate": 0.00018443279255020152, |
|
"loss": 1.79, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.19642857142857142, |
|
"grad_norm": 0.20290708541870117, |
|
"learning_rate": 0.00018412535328311814, |
|
"loss": 1.8891, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1982142857142857, |
|
"grad_norm": 0.20743344724178314, |
|
"learning_rate": 0.00018381516928314367, |
|
"loss": 1.6483, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.24931779503822327, |
|
"learning_rate": 0.00018350225067055925, |
|
"loss": 1.7883, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2017857142857143, |
|
"grad_norm": 0.20271241664886475, |
|
"learning_rate": 0.00018318660765486748, |
|
"loss": 1.8015, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.20357142857142857, |
|
"grad_norm": 0.21297615766525269, |
|
"learning_rate": 0.00018286825053445918, |
|
"loss": 1.7595, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.20535714285714285, |
|
"grad_norm": 0.21898160874843597, |
|
"learning_rate": 0.0001825471896962774, |
|
"loss": 1.8246, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.20714285714285716, |
|
"grad_norm": 0.2433815449476242, |
|
"learning_rate": 0.00018222343561547874, |
|
"loss": 1.9214, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.20892857142857144, |
|
"grad_norm": 0.2514473497867584, |
|
"learning_rate": 0.00018189699885509127, |
|
"loss": 1.674, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.21071428571428572, |
|
"grad_norm": 0.23736989498138428, |
|
"learning_rate": 0.0001815678900656702, |
|
"loss": 1.7948, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.2125, |
|
"grad_norm": 0.22837956249713898, |
|
"learning_rate": 0.00018123611998495007, |
|
"loss": 1.9429, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.21428571428571427, |
|
"grad_norm": 0.2722070813179016, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 1.2768, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.21607142857142858, |
|
"grad_norm": 0.23845739662647247, |
|
"learning_rate": 0.00018056463933434398, |
|
"loss": 1.7407, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.21785714285714286, |
|
"grad_norm": 0.24140624701976776, |
|
"learning_rate": 0.00018022495067265753, |
|
"loss": 1.378, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.21964285714285714, |
|
"grad_norm": 0.2878568768501282, |
|
"learning_rate": 0.0001798826445353564, |
|
"loss": 1.6684, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.22142857142857142, |
|
"grad_norm": 0.35590338706970215, |
|
"learning_rate": 0.0001795377320907611, |
|
"loss": 0.7946, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.22321428571428573, |
|
"grad_norm": 0.35154449939727783, |
|
"learning_rate": 0.00017919022459222752, |
|
"loss": 0.9185, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.225, |
|
"grad_norm": 0.31363779306411743, |
|
"learning_rate": 0.00017884013337777943, |
|
"loss": 1.3471, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.22678571428571428, |
|
"grad_norm": 0.36924973130226135, |
|
"learning_rate": 0.00017848746986973883, |
|
"loss": 1.3852, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.44929584860801697, |
|
"learning_rate": 0.00017813224557435312, |
|
"loss": 0.82, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.23035714285714284, |
|
"grad_norm": 0.3592383563518524, |
|
"learning_rate": 0.0001777744720814198, |
|
"loss": 1.0219, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.23214285714285715, |
|
"grad_norm": 0.37820902466773987, |
|
"learning_rate": 0.00017741416106390826, |
|
"loss": 1.3303, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.23392857142857143, |
|
"grad_norm": 0.33376428484916687, |
|
"learning_rate": 0.00017705132427757895, |
|
"loss": 1.384, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.2357142857142857, |
|
"grad_norm": 0.3454584777355194, |
|
"learning_rate": 0.00017668597356059978, |
|
"loss": 1.2236, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.2375, |
|
"grad_norm": 0.345708429813385, |
|
"learning_rate": 0.00017631812083316003, |
|
"loss": 1.0293, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.2392857142857143, |
|
"grad_norm": 0.3823675811290741, |
|
"learning_rate": 0.00017594777809708126, |
|
"loss": 1.3131, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.24107142857142858, |
|
"grad_norm": 0.336627334356308, |
|
"learning_rate": 0.00017557495743542585, |
|
"loss": 1.2084, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.24285714285714285, |
|
"grad_norm": 0.3525237739086151, |
|
"learning_rate": 0.0001751996710121026, |
|
"loss": 1.1378, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.24464285714285713, |
|
"grad_norm": 0.38291576504707336, |
|
"learning_rate": 0.00017482193107147014, |
|
"loss": 0.8353, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.24642857142857144, |
|
"grad_norm": 0.4650520086288452, |
|
"learning_rate": 0.0001744417499379372, |
|
"loss": 1.2024, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.24821428571428572, |
|
"grad_norm": 0.32640358805656433, |
|
"learning_rate": 0.0001740591400155606, |
|
"loss": 1.2343, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.39074593782424927, |
|
"learning_rate": 0.0001736741137876405, |
|
"loss": 0.8896, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.4710056781768799, |
|
"eval_runtime": 13.3678, |
|
"eval_samples_per_second": 17.654, |
|
"eval_steps_per_second": 8.827, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2517857142857143, |
|
"grad_norm": 0.36249810457229614, |
|
"learning_rate": 0.00017328668381631318, |
|
"loss": 1.1208, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.25357142857142856, |
|
"grad_norm": 0.3750612735748291, |
|
"learning_rate": 0.00017289686274214118, |
|
"loss": 1.2502, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.25535714285714284, |
|
"grad_norm": 0.4201869070529938, |
|
"learning_rate": 0.0001725046632837007, |
|
"loss": 1.1947, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.2571428571428571, |
|
"grad_norm": 0.4865645468235016, |
|
"learning_rate": 0.00017211009823716694, |
|
"loss": 0.8749, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.25892857142857145, |
|
"grad_norm": 0.38693225383758545, |
|
"learning_rate": 0.00017171318047589637, |
|
"loss": 1.2495, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.26071428571428573, |
|
"grad_norm": 0.40707525610923767, |
|
"learning_rate": 0.00017131392295000674, |
|
"loss": 1.2321, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.2625, |
|
"grad_norm": 0.39570894837379456, |
|
"learning_rate": 0.00017091233868595467, |
|
"loss": 1.301, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.2642857142857143, |
|
"grad_norm": 0.4085226058959961, |
|
"learning_rate": 0.00017050844078611056, |
|
"loss": 1.5369, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.26607142857142857, |
|
"grad_norm": 0.47094810009002686, |
|
"learning_rate": 0.0001701022424283311, |
|
"loss": 1.9374, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.26785714285714285, |
|
"grad_norm": 0.8517308831214905, |
|
"learning_rate": 0.00016969375686552937, |
|
"loss": 1.808, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.26964285714285713, |
|
"grad_norm": 0.1922745406627655, |
|
"learning_rate": 0.00016928299742524234, |
|
"loss": 1.6608, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.2714285714285714, |
|
"grad_norm": 0.2090916484594345, |
|
"learning_rate": 0.00016886997750919619, |
|
"loss": 1.8009, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.2732142857142857, |
|
"grad_norm": 0.21698515117168427, |
|
"learning_rate": 0.00016845471059286887, |
|
"loss": 1.7821, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.275, |
|
"grad_norm": 0.21791532635688782, |
|
"learning_rate": 0.00016803721022505067, |
|
"loss": 1.5901, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.2767857142857143, |
|
"grad_norm": 0.22199980914592743, |
|
"learning_rate": 0.00016761749002740193, |
|
"loss": 1.7047, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.2785714285714286, |
|
"grad_norm": 0.2096625566482544, |
|
"learning_rate": 0.0001671955636940088, |
|
"loss": 1.6898, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.28035714285714286, |
|
"grad_norm": 0.22975414991378784, |
|
"learning_rate": 0.00016677144499093626, |
|
"loss": 1.7631, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.28214285714285714, |
|
"grad_norm": 0.2187148928642273, |
|
"learning_rate": 0.0001663451477557792, |
|
"loss": 1.7872, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.2839285714285714, |
|
"grad_norm": 0.2257414609193802, |
|
"learning_rate": 0.0001659166858972107, |
|
"loss": 1.7732, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.22986693680286407, |
|
"learning_rate": 0.00016548607339452853, |
|
"loss": 1.7031, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2875, |
|
"grad_norm": 0.21585014462471008, |
|
"learning_rate": 0.0001650533242971987, |
|
"loss": 1.8421, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.2892857142857143, |
|
"grad_norm": 0.22519604861736298, |
|
"learning_rate": 0.00016461845272439741, |
|
"loss": 1.6529, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.2910714285714286, |
|
"grad_norm": 0.22279705107212067, |
|
"learning_rate": 0.0001641814728645502, |
|
"loss": 1.9288, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.29285714285714287, |
|
"grad_norm": 0.22392615675926208, |
|
"learning_rate": 0.000163742398974869, |
|
"loss": 1.693, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.29464285714285715, |
|
"grad_norm": 0.22729454934597015, |
|
"learning_rate": 0.00016330124538088705, |
|
"loss": 1.7027, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.29642857142857143, |
|
"grad_norm": 0.2229882776737213, |
|
"learning_rate": 0.00016285802647599156, |
|
"loss": 1.8262, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.2982142857142857, |
|
"grad_norm": 0.25520074367523193, |
|
"learning_rate": 0.00016241275672095395, |
|
"loss": 1.6009, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.24272315204143524, |
|
"learning_rate": 0.00016196545064345812, |
|
"loss": 1.9227, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.30178571428571427, |
|
"grad_norm": 0.24380216002464294, |
|
"learning_rate": 0.00016151612283762652, |
|
"loss": 1.5198, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.30357142857142855, |
|
"grad_norm": 0.3242342472076416, |
|
"learning_rate": 0.00016106478796354382, |
|
"loss": 1.6981, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3053571428571429, |
|
"grad_norm": 0.277855783700943, |
|
"learning_rate": 0.00016061146074677885, |
|
"loss": 1.7011, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.30714285714285716, |
|
"grad_norm": 0.2710039019584656, |
|
"learning_rate": 0.00016015615597790388, |
|
"loss": 1.7522, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.30892857142857144, |
|
"grad_norm": 0.26541268825531006, |
|
"learning_rate": 0.00015969888851201226, |
|
"loss": 1.3804, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.3107142857142857, |
|
"grad_norm": 0.28985923528671265, |
|
"learning_rate": 0.00015923967326823368, |
|
"loss": 1.6453, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.33939245343208313, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 1.1725, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.3142857142857143, |
|
"grad_norm": 0.29770731925964355, |
|
"learning_rate": 0.0001583154594407932, |
|
"loss": 1.6746, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.31607142857142856, |
|
"grad_norm": 0.3280562460422516, |
|
"learning_rate": 0.0001578504910111811, |
|
"loss": 1.1357, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.31785714285714284, |
|
"grad_norm": 0.2856597304344177, |
|
"learning_rate": 0.00015738363511079776, |
|
"loss": 1.1127, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.3196428571428571, |
|
"grad_norm": 0.316491961479187, |
|
"learning_rate": 0.00015691490697161182, |
|
"loss": 1.4281, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.32142857142857145, |
|
"grad_norm": 0.3632654845714569, |
|
"learning_rate": 0.00015644432188667695, |
|
"loss": 1.3413, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.32321428571428573, |
|
"grad_norm": 0.34329405426979065, |
|
"learning_rate": 0.00015597189520963277, |
|
"loss": 1.0579, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.325, |
|
"grad_norm": 0.32447105646133423, |
|
"learning_rate": 0.00015549764235420405, |
|
"loss": 1.243, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.3267857142857143, |
|
"grad_norm": 0.3558500409126282, |
|
"learning_rate": 0.0001550215787936977, |
|
"loss": 1.1376, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.32857142857142857, |
|
"grad_norm": 0.3373570740222931, |
|
"learning_rate": 0.00015454372006049803, |
|
"loss": 1.1251, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.33035714285714285, |
|
"grad_norm": 0.36412546038627625, |
|
"learning_rate": 0.00015406408174555976, |
|
"loss": 1.3238, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.33214285714285713, |
|
"grad_norm": 0.364442378282547, |
|
"learning_rate": 0.00015358267949789966, |
|
"loss": 0.9448, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.3339285714285714, |
|
"grad_norm": 0.3172107934951782, |
|
"learning_rate": 0.00015309952902408576, |
|
"loss": 1.2744, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.3357142857142857, |
|
"grad_norm": 0.34173399209976196, |
|
"learning_rate": 0.00015261464608772488, |
|
"loss": 1.0923, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.3375, |
|
"grad_norm": 0.33419185876846313, |
|
"learning_rate": 0.0001521280465089484, |
|
"loss": 1.2762, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.3392857142857143, |
|
"grad_norm": 0.3866868317127228, |
|
"learning_rate": 0.0001516397461638962, |
|
"loss": 0.9595, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3410714285714286, |
|
"grad_norm": 0.3978990614414215, |
|
"learning_rate": 0.00015114976098419842, |
|
"loss": 0.9993, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.34285714285714286, |
|
"grad_norm": 0.3546142876148224, |
|
"learning_rate": 0.00015065810695645584, |
|
"loss": 1.3421, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.34464285714285714, |
|
"grad_norm": 0.39728498458862305, |
|
"learning_rate": 0.00015016480012171828, |
|
"loss": 1.1209, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.3464285714285714, |
|
"grad_norm": 0.4170741140842438, |
|
"learning_rate": 0.00014966985657496114, |
|
"loss": 1.0024, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.3482142857142857, |
|
"grad_norm": 0.4226652681827545, |
|
"learning_rate": 0.0001491732924645604, |
|
"loss": 1.3139, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.3712114691734314, |
|
"learning_rate": 0.00014867512399176563, |
|
"loss": 1.1574, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.3517857142857143, |
|
"grad_norm": 0.3655322790145874, |
|
"learning_rate": 0.00014817536741017152, |
|
"loss": 1.6149, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.3535714285714286, |
|
"grad_norm": 0.4362059533596039, |
|
"learning_rate": 0.0001476740390251875, |
|
"loss": 1.7657, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.35535714285714287, |
|
"grad_norm": 0.43134769797325134, |
|
"learning_rate": 0.00014717115519350567, |
|
"loss": 1.7167, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 0.7784890532493591, |
|
"learning_rate": 0.00014666673232256738, |
|
"loss": 2.036, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.35892857142857143, |
|
"grad_norm": 0.17376984655857086, |
|
"learning_rate": 0.0001461607868700276, |
|
"loss": 1.4856, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.3607142857142857, |
|
"grad_norm": 0.2141953408718109, |
|
"learning_rate": 0.00014565333534321826, |
|
"loss": 1.7491, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.3625, |
|
"grad_norm": 0.22548137605190277, |
|
"learning_rate": 0.00014514439429860943, |
|
"loss": 1.8457, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.36428571428571427, |
|
"grad_norm": 0.20618294179439545, |
|
"learning_rate": 0.0001446339803412692, |
|
"loss": 1.4987, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.36607142857142855, |
|
"grad_norm": 0.21025151014328003, |
|
"learning_rate": 0.00014412211012432212, |
|
"loss": 1.5568, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.3678571428571429, |
|
"grad_norm": 0.21678180992603302, |
|
"learning_rate": 0.00014360880034840554, |
|
"loss": 1.7841, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.36964285714285716, |
|
"grad_norm": 0.20914790034294128, |
|
"learning_rate": 0.0001430940677611249, |
|
"loss": 1.6693, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.37142857142857144, |
|
"grad_norm": 0.21597585082054138, |
|
"learning_rate": 0.00014257792915650728, |
|
"loss": 1.648, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.3732142857142857, |
|
"grad_norm": 0.23697789013385773, |
|
"learning_rate": 0.00014206040137445348, |
|
"loss": 1.7616, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.2535800635814667, |
|
"learning_rate": 0.00014154150130018866, |
|
"loss": 2.0279, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3767857142857143, |
|
"grad_norm": 0.21204812824726105, |
|
"learning_rate": 0.0001410212458637112, |
|
"loss": 1.8472, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.37857142857142856, |
|
"grad_norm": 0.36059629917144775, |
|
"learning_rate": 0.00014049965203924054, |
|
"loss": 1.8042, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.38035714285714284, |
|
"grad_norm": 0.21400661766529083, |
|
"learning_rate": 0.0001399767368446634, |
|
"loss": 1.698, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.3821428571428571, |
|
"grad_norm": 0.24055758118629456, |
|
"learning_rate": 0.00013945251734097828, |
|
"loss": 1.8758, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.38392857142857145, |
|
"grad_norm": 0.23605166375637054, |
|
"learning_rate": 0.00013892701063173918, |
|
"loss": 1.7425, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.38571428571428573, |
|
"grad_norm": 0.23343758285045624, |
|
"learning_rate": 0.00013840023386249713, |
|
"loss": 1.8683, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.3875, |
|
"grad_norm": 0.2475200593471527, |
|
"learning_rate": 0.00013787220422024134, |
|
"loss": 1.9091, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.3892857142857143, |
|
"grad_norm": 0.2618944048881531, |
|
"learning_rate": 0.00013734293893283783, |
|
"loss": 1.5086, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.39107142857142857, |
|
"grad_norm": 0.2627498209476471, |
|
"learning_rate": 0.00013681245526846783, |
|
"loss": 1.3878, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.39285714285714285, |
|
"grad_norm": 0.24390314519405365, |
|
"learning_rate": 0.0001362807705350641, |
|
"loss": 1.7332, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.39464285714285713, |
|
"grad_norm": 0.2768295705318451, |
|
"learning_rate": 0.00013574790207974646, |
|
"loss": 1.3123, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.3964285714285714, |
|
"grad_norm": 0.2606358230113983, |
|
"learning_rate": 0.0001352138672882555, |
|
"loss": 1.4506, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.3982142857142857, |
|
"grad_norm": 0.24806426465511322, |
|
"learning_rate": 0.00013467868358438563, |
|
"loss": 1.7087, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.2664608061313629, |
|
"learning_rate": 0.00013414236842941644, |
|
"loss": 1.3124, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.4017857142857143, |
|
"grad_norm": 0.2661263346672058, |
|
"learning_rate": 0.00013360493932154302, |
|
"loss": 1.2174, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.4035714285714286, |
|
"grad_norm": 0.3460038900375366, |
|
"learning_rate": 0.00013306641379530514, |
|
"loss": 0.6889, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.40535714285714286, |
|
"grad_norm": 0.2929069995880127, |
|
"learning_rate": 0.000132526809421015, |
|
"loss": 0.9457, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.40714285714285714, |
|
"grad_norm": 0.3459819257259369, |
|
"learning_rate": 0.00013198614380418412, |
|
"loss": 1.2547, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.4089285714285714, |
|
"grad_norm": 0.30105313658714294, |
|
"learning_rate": 0.00013144443458494882, |
|
"loss": 0.957, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.4107142857142857, |
|
"grad_norm": 0.3461960256099701, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 1.3146, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4125, |
|
"grad_norm": 0.34542855620384216, |
|
"learning_rate": 0.00013035795606948023, |
|
"loss": 1.1128, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.4142857142857143, |
|
"grad_norm": 0.37605586647987366, |
|
"learning_rate": 0.00012981322222145846, |
|
"loss": 1.5095, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.4160714285714286, |
|
"grad_norm": 0.37267056107521057, |
|
"learning_rate": 0.00012926751566629875, |
|
"loss": 1.071, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.41785714285714287, |
|
"grad_norm": 0.3052172064781189, |
|
"learning_rate": 0.00012872085420860665, |
|
"loss": 1.3136, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.41964285714285715, |
|
"grad_norm": 0.36694592237472534, |
|
"learning_rate": 0.00012817325568414297, |
|
"loss": 1.2439, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.42142857142857143, |
|
"grad_norm": 0.36055245995521545, |
|
"learning_rate": 0.00012762473795924204, |
|
"loss": 1.1165, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.4232142857142857, |
|
"grad_norm": 0.3014545738697052, |
|
"learning_rate": 0.00012707531893022854, |
|
"loss": 1.5423, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.425, |
|
"grad_norm": 0.3208891749382019, |
|
"learning_rate": 0.00012652501652283377, |
|
"loss": 1.1813, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.42678571428571427, |
|
"grad_norm": 0.38703230023384094, |
|
"learning_rate": 0.00012597384869161084, |
|
"loss": 0.7706, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 0.38256821036338806, |
|
"learning_rate": 0.00012542183341934872, |
|
"loss": 1.0565, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4303571428571429, |
|
"grad_norm": 0.3555380702018738, |
|
"learning_rate": 0.0001248689887164855, |
|
"loss": 0.849, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.43214285714285716, |
|
"grad_norm": 0.3472703993320465, |
|
"learning_rate": 0.00012431533262052098, |
|
"loss": 1.3984, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.43392857142857144, |
|
"grad_norm": 0.3631349503993988, |
|
"learning_rate": 0.000123760883195428, |
|
"loss": 0.8955, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.4357142857142857, |
|
"grad_norm": 0.349295973777771, |
|
"learning_rate": 0.00012320565853106316, |
|
"loss": 0.8866, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 0.33635953068733215, |
|
"learning_rate": 0.00012264967674257646, |
|
"loss": 1.2419, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4392857142857143, |
|
"grad_norm": 0.3833181858062744, |
|
"learning_rate": 0.00012209295596982042, |
|
"loss": 1.5507, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.44107142857142856, |
|
"grad_norm": 0.3737214505672455, |
|
"learning_rate": 0.00012153551437675821, |
|
"loss": 1.4881, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.44285714285714284, |
|
"grad_norm": 0.4705282747745514, |
|
"learning_rate": 0.00012097737015087094, |
|
"loss": 1.4864, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.4446428571428571, |
|
"grad_norm": 0.39539188146591187, |
|
"learning_rate": 0.00012041854150256433, |
|
"loss": 1.7855, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.44642857142857145, |
|
"grad_norm": 0.7369075417518616, |
|
"learning_rate": 0.00011985904666457455, |
|
"loss": 2.01, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.44821428571428573, |
|
"grad_norm": 0.18146094679832458, |
|
"learning_rate": 0.00011929890389137337, |
|
"loss": 1.5898, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.21558880805969238, |
|
"learning_rate": 0.00011873813145857249, |
|
"loss": 1.5816, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.4517857142857143, |
|
"grad_norm": 0.19599275290966034, |
|
"learning_rate": 0.00011817674766232734, |
|
"loss": 1.6433, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.45357142857142857, |
|
"grad_norm": 0.22075910866260529, |
|
"learning_rate": 0.00011761477081874015, |
|
"loss": 1.6005, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.45535714285714285, |
|
"grad_norm": 0.19471955299377441, |
|
"learning_rate": 0.0001170522192632624, |
|
"loss": 1.7133, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.45714285714285713, |
|
"grad_norm": 0.19876879453659058, |
|
"learning_rate": 0.00011648911135009634, |
|
"loss": 1.5085, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.4589285714285714, |
|
"grad_norm": 0.20565317571163177, |
|
"learning_rate": 0.00011592546545159645, |
|
"loss": 1.7386, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.4607142857142857, |
|
"grad_norm": 0.24483506381511688, |
|
"learning_rate": 0.00011536129995766996, |
|
"loss": 1.7162, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.4625, |
|
"grad_norm": 0.21543823182582855, |
|
"learning_rate": 0.00011479663327517667, |
|
"loss": 1.6966, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.4642857142857143, |
|
"grad_norm": 0.2661048471927643, |
|
"learning_rate": 0.00011423148382732853, |
|
"loss": 1.8821, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4660714285714286, |
|
"grad_norm": 0.24292460083961487, |
|
"learning_rate": 0.00011366587005308858, |
|
"loss": 1.7085, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.46785714285714286, |
|
"grad_norm": 0.216167613863945, |
|
"learning_rate": 0.0001130998104065693, |
|
"loss": 1.7298, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.46964285714285714, |
|
"grad_norm": 0.2111697793006897, |
|
"learning_rate": 0.00011253332335643043, |
|
"loss": 1.8098, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.4714285714285714, |
|
"grad_norm": 0.23981061577796936, |
|
"learning_rate": 0.00011196642738527659, |
|
"loss": 1.7026, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.4732142857142857, |
|
"grad_norm": 0.2623251676559448, |
|
"learning_rate": 0.00011139914098905406, |
|
"loss": 1.7894, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.475, |
|
"grad_norm": 0.2482486367225647, |
|
"learning_rate": 0.00011083148267644747, |
|
"loss": 1.9019, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.4767857142857143, |
|
"grad_norm": 0.238911435008049, |
|
"learning_rate": 0.00011026347096827578, |
|
"loss": 1.6809, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.4785714285714286, |
|
"grad_norm": 0.24704696238040924, |
|
"learning_rate": 0.00010969512439688816, |
|
"loss": 1.6607, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.48035714285714287, |
|
"grad_norm": 0.25105100870132446, |
|
"learning_rate": 0.00010912646150555919, |
|
"loss": 1.5895, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.48214285714285715, |
|
"grad_norm": 0.2849842607975006, |
|
"learning_rate": 0.00010855750084788398, |
|
"loss": 2.0812, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.48392857142857143, |
|
"grad_norm": 0.2599342167377472, |
|
"learning_rate": 0.00010798826098717276, |
|
"loss": 1.1569, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.4857142857142857, |
|
"grad_norm": 0.28037258982658386, |
|
"learning_rate": 0.00010741876049584523, |
|
"loss": 1.1928, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.4875, |
|
"grad_norm": 0.29920563101768494, |
|
"learning_rate": 0.00010684901795482456, |
|
"loss": 1.2244, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.48928571428571427, |
|
"grad_norm": 0.2799164354801178, |
|
"learning_rate": 0.00010627905195293135, |
|
"loss": 1.4455, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.49107142857142855, |
|
"grad_norm": 0.23873603343963623, |
|
"learning_rate": 0.00010570888108627681, |
|
"loss": 0.852, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.4928571428571429, |
|
"grad_norm": 0.2817741632461548, |
|
"learning_rate": 0.00010513852395765631, |
|
"loss": 1.3203, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.49464285714285716, |
|
"grad_norm": 0.27295514941215515, |
|
"learning_rate": 0.00010456799917594233, |
|
"loss": 0.749, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.49642857142857144, |
|
"grad_norm": 0.30728739500045776, |
|
"learning_rate": 0.00010399732535547734, |
|
"loss": 1.0083, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.4982142857142857, |
|
"grad_norm": 0.32001444697380066, |
|
"learning_rate": 0.00010342652111546635, |
|
"loss": 1.573, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.3400101065635681, |
|
"learning_rate": 0.00010285560507936961, |
|
"loss": 1.0757, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.4439584016799927, |
|
"eval_runtime": 13.9741, |
|
"eval_samples_per_second": 16.888, |
|
"eval_steps_per_second": 8.444, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5017857142857143, |
|
"grad_norm": 0.3074837923049927, |
|
"learning_rate": 0.00010228459587429497, |
|
"loss": 1.1389, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.5035714285714286, |
|
"grad_norm": 0.3143484592437744, |
|
"learning_rate": 0.00010171351213038993, |
|
"loss": 0.9542, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.5053571428571428, |
|
"grad_norm": 0.3524804413318634, |
|
"learning_rate": 0.00010114237248023404, |
|
"loss": 0.8578, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.5071428571428571, |
|
"grad_norm": 0.335183322429657, |
|
"learning_rate": 0.00010057119555823085, |
|
"loss": 0.9228, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5089285714285714, |
|
"grad_norm": 0.35537081956863403, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9541, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.5107142857142857, |
|
"grad_norm": 0.3294563591480255, |
|
"learning_rate": 9.942880444176918e-05, |
|
"loss": 1.4223, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.5125, |
|
"grad_norm": 0.32628077268600464, |
|
"learning_rate": 9.8857627519766e-05, |
|
"loss": 1.2942, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.5142857142857142, |
|
"grad_norm": 0.32195404171943665, |
|
"learning_rate": 9.828648786961008e-05, |
|
"loss": 1.4239, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.5160714285714286, |
|
"grad_norm": 0.3283804655075073, |
|
"learning_rate": 9.771540412570504e-05, |
|
"loss": 1.0303, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.5178571428571429, |
|
"grad_norm": 0.356052428483963, |
|
"learning_rate": 9.71443949206304e-05, |
|
"loss": 1.0199, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5196428571428572, |
|
"grad_norm": 0.39479124546051025, |
|
"learning_rate": 9.657347888453367e-05, |
|
"loss": 1.1343, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.5214285714285715, |
|
"grad_norm": 0.34791451692581177, |
|
"learning_rate": 9.60026746445227e-05, |
|
"loss": 1.521, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.5232142857142857, |
|
"grad_norm": 0.3614530861377716, |
|
"learning_rate": 9.543200082405768e-05, |
|
"loss": 1.2346, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.525, |
|
"grad_norm": 0.36958596110343933, |
|
"learning_rate": 9.486147604234371e-05, |
|
"loss": 1.0457, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.5267857142857143, |
|
"grad_norm": 0.4293418824672699, |
|
"learning_rate": 9.42911189137232e-05, |
|
"loss": 1.1071, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.5285714285714286, |
|
"grad_norm": 0.40408602356910706, |
|
"learning_rate": 9.372094804706867e-05, |
|
"loss": 1.3805, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.5303571428571429, |
|
"grad_norm": 0.3892784118652344, |
|
"learning_rate": 9.315098204517543e-05, |
|
"loss": 0.9136, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.5321428571428571, |
|
"grad_norm": 0.4003988206386566, |
|
"learning_rate": 9.258123950415479e-05, |
|
"loss": 1.3684, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.5339285714285714, |
|
"grad_norm": 0.37116503715515137, |
|
"learning_rate": 9.201173901282724e-05, |
|
"loss": 1.7824, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.5357142857142857, |
|
"grad_norm": 0.5286569595336914, |
|
"learning_rate": 9.144249915211605e-05, |
|
"loss": 1.9392, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5375, |
|
"grad_norm": 0.18338526785373688, |
|
"learning_rate": 9.087353849444085e-05, |
|
"loss": 1.4422, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.5392857142857143, |
|
"grad_norm": 0.20191389322280884, |
|
"learning_rate": 9.030487560311186e-05, |
|
"loss": 1.5443, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.5410714285714285, |
|
"grad_norm": 0.19955602288246155, |
|
"learning_rate": 8.973652903172423e-05, |
|
"loss": 1.6521, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.5428571428571428, |
|
"grad_norm": 0.2253991812467575, |
|
"learning_rate": 8.916851732355255e-05, |
|
"loss": 1.6596, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.5446428571428571, |
|
"grad_norm": 0.18380412459373474, |
|
"learning_rate": 8.860085901094595e-05, |
|
"loss": 1.5462, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.5464285714285714, |
|
"grad_norm": 0.21318556368350983, |
|
"learning_rate": 8.803357261472343e-05, |
|
"loss": 1.6713, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.5482142857142858, |
|
"grad_norm": 0.20480670034885406, |
|
"learning_rate": 8.746667664356956e-05, |
|
"loss": 1.7537, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.21598470211029053, |
|
"learning_rate": 8.690018959343072e-05, |
|
"loss": 1.6955, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.5517857142857143, |
|
"grad_norm": 0.21172207593917847, |
|
"learning_rate": 8.633412994691144e-05, |
|
"loss": 1.7187, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.5535714285714286, |
|
"grad_norm": 0.22086191177368164, |
|
"learning_rate": 8.57685161726715e-05, |
|
"loss": 1.7579, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5553571428571429, |
|
"grad_norm": 0.2266392558813095, |
|
"learning_rate": 8.520336672482338e-05, |
|
"loss": 1.7486, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.5571428571428572, |
|
"grad_norm": 0.2660948634147644, |
|
"learning_rate": 8.463870004233008e-05, |
|
"loss": 1.7903, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.5589285714285714, |
|
"grad_norm": 0.2297395020723343, |
|
"learning_rate": 8.407453454840357e-05, |
|
"loss": 1.8017, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.5607142857142857, |
|
"grad_norm": 0.21526682376861572, |
|
"learning_rate": 8.351088864990368e-05, |
|
"loss": 1.855, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 0.2516147196292877, |
|
"learning_rate": 8.294778073673762e-05, |
|
"loss": 1.7103, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.5642857142857143, |
|
"grad_norm": 0.25641700625419617, |
|
"learning_rate": 8.238522918125983e-05, |
|
"loss": 1.9301, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.5660714285714286, |
|
"grad_norm": 0.26828062534332275, |
|
"learning_rate": 8.182325233767267e-05, |
|
"loss": 1.8575, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.5678571428571428, |
|
"grad_norm": 0.24787884950637817, |
|
"learning_rate": 8.126186854142752e-05, |
|
"loss": 2.0228, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.5696428571428571, |
|
"grad_norm": 0.23658955097198486, |
|
"learning_rate": 8.070109610862668e-05, |
|
"loss": 1.7813, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.2818485498428345, |
|
"learning_rate": 8.014095333542548e-05, |
|
"loss": 1.7571, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5732142857142857, |
|
"grad_norm": 0.24982373416423798, |
|
"learning_rate": 7.958145849743569e-05, |
|
"loss": 1.602, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.575, |
|
"grad_norm": 0.2815864682197571, |
|
"learning_rate": 7.902262984912909e-05, |
|
"loss": 1.7216, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.5767857142857142, |
|
"grad_norm": 0.2675464451313019, |
|
"learning_rate": 7.846448562324183e-05, |
|
"loss": 1.0704, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.5785714285714286, |
|
"grad_norm": 0.23938840627670288, |
|
"learning_rate": 7.79070440301796e-05, |
|
"loss": 1.282, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.5803571428571429, |
|
"grad_norm": 0.2834213972091675, |
|
"learning_rate": 7.735032325742355e-05, |
|
"loss": 1.7934, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.5821428571428572, |
|
"grad_norm": 0.3555513918399811, |
|
"learning_rate": 7.679434146893685e-05, |
|
"loss": 1.2089, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.5839285714285715, |
|
"grad_norm": 0.3254348933696747, |
|
"learning_rate": 7.623911680457198e-05, |
|
"loss": 1.0845, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.5857142857142857, |
|
"grad_norm": 0.3558744192123413, |
|
"learning_rate": 7.568466737947905e-05, |
|
"loss": 1.2339, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.5875, |
|
"grad_norm": 0.32738080620765686, |
|
"learning_rate": 7.513101128351454e-05, |
|
"loss": 0.8492, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.5892857142857143, |
|
"grad_norm": 0.36939212679862976, |
|
"learning_rate": 7.457816658065134e-05, |
|
"loss": 0.8973, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5910714285714286, |
|
"grad_norm": 0.3393215835094452, |
|
"learning_rate": 7.402615130838917e-05, |
|
"loss": 1.0078, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.5928571428571429, |
|
"grad_norm": 0.402725487947464, |
|
"learning_rate": 7.347498347716624e-05, |
|
"loss": 1.2381, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.5946428571428571, |
|
"grad_norm": 0.33164989948272705, |
|
"learning_rate": 7.292468106977148e-05, |
|
"loss": 1.197, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.5964285714285714, |
|
"grad_norm": 0.3454689681529999, |
|
"learning_rate": 7.237526204075797e-05, |
|
"loss": 1.0244, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.5982142857142857, |
|
"grad_norm": 0.3584868907928467, |
|
"learning_rate": 7.182674431585704e-05, |
|
"loss": 1.062, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.3318020701408386, |
|
"learning_rate": 7.127914579139338e-05, |
|
"loss": 1.3696, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.6017857142857143, |
|
"grad_norm": 0.38772639632225037, |
|
"learning_rate": 7.073248433370124e-05, |
|
"loss": 1.1725, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.6035714285714285, |
|
"grad_norm": 0.33527833223342896, |
|
"learning_rate": 7.018677777854157e-05, |
|
"loss": 1.0849, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.6053571428571428, |
|
"grad_norm": 0.3958960473537445, |
|
"learning_rate": 6.964204393051981e-05, |
|
"loss": 0.8494, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.6071428571428571, |
|
"grad_norm": 0.3545495569705963, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 0.843, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6089285714285714, |
|
"grad_norm": 0.3475857675075531, |
|
"learning_rate": 6.855556541505122e-05, |
|
"loss": 1.1228, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.6107142857142858, |
|
"grad_norm": 0.4085654616355896, |
|
"learning_rate": 6.801385619581592e-05, |
|
"loss": 0.8092, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.6125, |
|
"grad_norm": 0.4605577886104584, |
|
"learning_rate": 6.747319057898503e-05, |
|
"loss": 1.0999, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.6142857142857143, |
|
"grad_norm": 0.3840469717979431, |
|
"learning_rate": 6.693358620469487e-05, |
|
"loss": 1.2712, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.6160714285714286, |
|
"grad_norm": 0.3712100684642792, |
|
"learning_rate": 6.639506067845697e-05, |
|
"loss": 1.1401, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.6178571428571429, |
|
"grad_norm": 0.41670674085617065, |
|
"learning_rate": 6.585763157058358e-05, |
|
"loss": 1.151, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.6196428571428572, |
|
"grad_norm": 0.5912812352180481, |
|
"learning_rate": 6.53213164156144e-05, |
|
"loss": 1.447, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.6214285714285714, |
|
"grad_norm": 0.3995843231678009, |
|
"learning_rate": 6.478613271174453e-05, |
|
"loss": 1.6645, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.6232142857142857, |
|
"grad_norm": 0.5337942242622375, |
|
"learning_rate": 6.425209792025358e-05, |
|
"loss": 1.8703, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.5726771354675293, |
|
"learning_rate": 6.371922946493591e-05, |
|
"loss": 1.9795, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6267857142857143, |
|
"grad_norm": 0.17994743585586548, |
|
"learning_rate": 6.318754473153221e-05, |
|
"loss": 1.5463, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.6285714285714286, |
|
"grad_norm": 0.16961722075939178, |
|
"learning_rate": 6.26570610671622e-05, |
|
"loss": 1.5388, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.6303571428571428, |
|
"grad_norm": 0.19410116970539093, |
|
"learning_rate": 6.21277957797587e-05, |
|
"loss": 1.7282, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.6321428571428571, |
|
"grad_norm": 0.1906638890504837, |
|
"learning_rate": 6.159976613750286e-05, |
|
"loss": 1.5167, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.6339285714285714, |
|
"grad_norm": 0.21918904781341553, |
|
"learning_rate": 6.107298936826086e-05, |
|
"loss": 1.7446, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.6357142857142857, |
|
"grad_norm": 0.19429104030132294, |
|
"learning_rate": 6.0547482659021706e-05, |
|
"loss": 1.7166, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.6375, |
|
"grad_norm": 0.21244099736213684, |
|
"learning_rate": 6.002326315533665e-05, |
|
"loss": 1.7319, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.6392857142857142, |
|
"grad_norm": 0.22793884575366974, |
|
"learning_rate": 5.950034796075947e-05, |
|
"loss": 1.6573, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.6410714285714286, |
|
"grad_norm": 0.23558048903942108, |
|
"learning_rate": 5.897875413628884e-05, |
|
"loss": 1.7359, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.6428571428571429, |
|
"grad_norm": 0.21951396763324738, |
|
"learning_rate": 5.845849869981137e-05, |
|
"loss": 1.6536, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6446428571428572, |
|
"grad_norm": 0.22028475999832153, |
|
"learning_rate": 5.793959862554652e-05, |
|
"loss": 1.7257, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.6464285714285715, |
|
"grad_norm": 0.23070622980594635, |
|
"learning_rate": 5.7422070843492734e-05, |
|
"loss": 1.6149, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.6482142857142857, |
|
"grad_norm": 0.22408127784729004, |
|
"learning_rate": 5.6905932238875123e-05, |
|
"loss": 1.5936, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.23098969459533691, |
|
"learning_rate": 5.639119965159446e-05, |
|
"loss": 1.7313, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.6517857142857143, |
|
"grad_norm": 0.2533377707004547, |
|
"learning_rate": 5.5877889875677845e-05, |
|
"loss": 1.9422, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.6535714285714286, |
|
"grad_norm": 0.2515897750854492, |
|
"learning_rate": 5.5366019658730825e-05, |
|
"loss": 1.8331, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.6553571428571429, |
|
"grad_norm": 0.2757863700389862, |
|
"learning_rate": 5.485560570139061e-05, |
|
"loss": 1.7759, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.6571428571428571, |
|
"grad_norm": 0.29774826765060425, |
|
"learning_rate": 5.434666465678175e-05, |
|
"loss": 1.334, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.6589285714285714, |
|
"grad_norm": 0.28664523363113403, |
|
"learning_rate": 5.383921312997242e-05, |
|
"loss": 1.6234, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.6607142857142857, |
|
"grad_norm": 0.30774933099746704, |
|
"learning_rate": 5.333326767743263e-05, |
|
"loss": 1.4743, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6625, |
|
"grad_norm": 0.26094669103622437, |
|
"learning_rate": 5.282884480649435e-05, |
|
"loss": 1.1046, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.6642857142857143, |
|
"grad_norm": 0.2818247079849243, |
|
"learning_rate": 5.232596097481251e-05, |
|
"loss": 0.8417, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.6660714285714285, |
|
"grad_norm": 0.3089035749435425, |
|
"learning_rate": 5.182463258982846e-05, |
|
"loss": 1.3922, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.6678571428571428, |
|
"grad_norm": 0.2375502586364746, |
|
"learning_rate": 5.132487600823438e-05, |
|
"loss": 1.0855, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.6696428571428571, |
|
"grad_norm": 0.3417636454105377, |
|
"learning_rate": 5.082670753543961e-05, |
|
"loss": 1.0819, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.6714285714285714, |
|
"grad_norm": 0.2587840259075165, |
|
"learning_rate": 5.033014342503889e-05, |
|
"loss": 1.1154, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.6732142857142858, |
|
"grad_norm": 0.29829278588294983, |
|
"learning_rate": 4.9835199878281765e-05, |
|
"loss": 0.9634, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.675, |
|
"grad_norm": 0.307190477848053, |
|
"learning_rate": 4.9341893043544185e-05, |
|
"loss": 1.1533, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.6767857142857143, |
|
"grad_norm": 0.3548847734928131, |
|
"learning_rate": 4.8850239015801625e-05, |
|
"loss": 1.2046, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.6785714285714286, |
|
"grad_norm": 0.3130282759666443, |
|
"learning_rate": 4.836025383610382e-05, |
|
"loss": 1.1391, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6803571428571429, |
|
"grad_norm": 0.3400501012802124, |
|
"learning_rate": 4.787195349105159e-05, |
|
"loss": 1.0226, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.6821428571428572, |
|
"grad_norm": 0.3462565541267395, |
|
"learning_rate": 4.7385353912275165e-05, |
|
"loss": 1.0968, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.6839285714285714, |
|
"grad_norm": 0.331123948097229, |
|
"learning_rate": 4.690047097591427e-05, |
|
"loss": 1.1918, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.6857142857142857, |
|
"grad_norm": 0.354432612657547, |
|
"learning_rate": 4.6417320502100316e-05, |
|
"loss": 1.2261, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 0.34844398498535156, |
|
"learning_rate": 4.593591825444028e-05, |
|
"loss": 0.7991, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.6892857142857143, |
|
"grad_norm": 0.3462367653846741, |
|
"learning_rate": 4.545627993950201e-05, |
|
"loss": 1.1343, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.6910714285714286, |
|
"grad_norm": 0.3352709114551544, |
|
"learning_rate": 4.497842120630229e-05, |
|
"loss": 1.1023, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.6928571428571428, |
|
"grad_norm": 0.3581717610359192, |
|
"learning_rate": 4.4502357645795976e-05, |
|
"loss": 0.9181, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.6946428571428571, |
|
"grad_norm": 0.35995498299598694, |
|
"learning_rate": 4.402810479036725e-05, |
|
"loss": 1.3445, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.6964285714285714, |
|
"grad_norm": 0.372935950756073, |
|
"learning_rate": 4.355567811332311e-05, |
|
"loss": 1.0725, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6982142857142857, |
|
"grad_norm": 0.3759123980998993, |
|
"learning_rate": 4.30850930283882e-05, |
|
"loss": 1.1824, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.391770601272583, |
|
"learning_rate": 4.2616364889202254e-05, |
|
"loss": 1.3516, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.7017857142857142, |
|
"grad_norm": 0.3785625696182251, |
|
"learning_rate": 4.214950898881892e-05, |
|
"loss": 1.1624, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.7035714285714286, |
|
"grad_norm": 0.4284125864505768, |
|
"learning_rate": 4.168454055920681e-05, |
|
"loss": 1.1318, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.7053571428571429, |
|
"grad_norm": 0.391161173582077, |
|
"learning_rate": 4.12214747707527e-05, |
|
"loss": 1.4543, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.7071428571428572, |
|
"grad_norm": 0.3723802864551544, |
|
"learning_rate": 4.0760326731766374e-05, |
|
"loss": 1.4265, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.7089285714285715, |
|
"grad_norm": 0.4321235418319702, |
|
"learning_rate": 4.030111148798775e-05, |
|
"loss": 1.4523, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.7107142857142857, |
|
"grad_norm": 0.43963977694511414, |
|
"learning_rate": 3.9843844022096135e-05, |
|
"loss": 1.7426, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.7125, |
|
"grad_norm": 0.5444914698600769, |
|
"learning_rate": 3.938853925322118e-05, |
|
"loss": 1.8907, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 0.9059175252914429, |
|
"learning_rate": 3.893521203645618e-05, |
|
"loss": 2.2147, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7160714285714286, |
|
"grad_norm": 0.20250235497951508, |
|
"learning_rate": 3.848387716237353e-05, |
|
"loss": 1.7341, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.7178571428571429, |
|
"grad_norm": 0.1863853931427002, |
|
"learning_rate": 3.8034549356541894e-05, |
|
"loss": 1.6956, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.7196428571428571, |
|
"grad_norm": 0.19317291676998138, |
|
"learning_rate": 3.7587243279046056e-05, |
|
"loss": 1.7165, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.7214285714285714, |
|
"grad_norm": 0.21101966500282288, |
|
"learning_rate": 3.714197352400849e-05, |
|
"loss": 1.8306, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.7232142857142857, |
|
"grad_norm": 0.22385361790657043, |
|
"learning_rate": 3.669875461911297e-05, |
|
"loss": 1.7104, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.725, |
|
"grad_norm": 0.22555914521217346, |
|
"learning_rate": 3.6257601025131026e-05, |
|
"loss": 1.5668, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.7267857142857143, |
|
"grad_norm": 0.21916812658309937, |
|
"learning_rate": 3.581852713544983e-05, |
|
"loss": 1.7827, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.7285714285714285, |
|
"grad_norm": 0.23447498679161072, |
|
"learning_rate": 3.538154727560259e-05, |
|
"loss": 1.8308, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.7303571428571428, |
|
"grad_norm": 0.21024593710899353, |
|
"learning_rate": 3.494667570280132e-05, |
|
"loss": 1.613, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.7321428571428571, |
|
"grad_norm": 0.23882578313350677, |
|
"learning_rate": 3.45139266054715e-05, |
|
"loss": 1.6853, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7339285714285714, |
|
"grad_norm": 0.23162604868412018, |
|
"learning_rate": 3.408331410278929e-05, |
|
"loss": 1.7371, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.7357142857142858, |
|
"grad_norm": 0.23150567710399628, |
|
"learning_rate": 3.3654852244220826e-05, |
|
"loss": 1.7505, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.7375, |
|
"grad_norm": 0.23027552664279938, |
|
"learning_rate": 3.322855500906373e-05, |
|
"loss": 1.7128, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.7392857142857143, |
|
"grad_norm": 0.22426114976406097, |
|
"learning_rate": 3.2804436305991214e-05, |
|
"loss": 1.7721, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.7410714285714286, |
|
"grad_norm": 0.22792723774909973, |
|
"learning_rate": 3.238250997259808e-05, |
|
"loss": 1.7089, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.7428571428571429, |
|
"grad_norm": 0.2450588494539261, |
|
"learning_rate": 3.196278977494934e-05, |
|
"loss": 1.744, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.7446428571428572, |
|
"grad_norm": 0.2348526120185852, |
|
"learning_rate": 3.154528940713113e-05, |
|
"loss": 1.8496, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.7464285714285714, |
|
"grad_norm": 0.2519124746322632, |
|
"learning_rate": 3.113002249080386e-05, |
|
"loss": 1.76, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.7482142857142857, |
|
"grad_norm": 0.27859431505203247, |
|
"learning_rate": 3.071700257475768e-05, |
|
"loss": 1.6493, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.2539427876472473, |
|
"learning_rate": 3.030624313447067e-05, |
|
"loss": 1.7619, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 1.4375773668289185, |
|
"eval_runtime": 13.3809, |
|
"eval_samples_per_second": 17.637, |
|
"eval_steps_per_second": 8.819, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7517857142857143, |
|
"grad_norm": 0.2867870330810547, |
|
"learning_rate": 2.9897757571668905e-05, |
|
"loss": 1.5308, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.7535714285714286, |
|
"grad_norm": 0.26840099692344666, |
|
"learning_rate": 2.949155921388943e-05, |
|
"loss": 1.8933, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.7553571428571428, |
|
"grad_norm": 0.32162272930145264, |
|
"learning_rate": 2.9087661314045366e-05, |
|
"loss": 1.6497, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.7571428571428571, |
|
"grad_norm": 0.2842392921447754, |
|
"learning_rate": 2.8686077049993287e-05, |
|
"loss": 1.521, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.7589285714285714, |
|
"grad_norm": 0.28831931948661804, |
|
"learning_rate": 2.828681952410366e-05, |
|
"loss": 1.2619, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.7607142857142857, |
|
"grad_norm": 0.2879536747932434, |
|
"learning_rate": 2.7889901762833083e-05, |
|
"loss": 1.1705, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.7625, |
|
"grad_norm": 0.32446354627609253, |
|
"learning_rate": 2.7495336716299313e-05, |
|
"loss": 1.1636, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.7642857142857142, |
|
"grad_norm": 0.2487679123878479, |
|
"learning_rate": 2.7103137257858868e-05, |
|
"loss": 0.7092, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.7660714285714286, |
|
"grad_norm": 0.3402203321456909, |
|
"learning_rate": 2.671331618368682e-05, |
|
"loss": 0.8279, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.7678571428571429, |
|
"grad_norm": 0.3117457330226898, |
|
"learning_rate": 2.6325886212359498e-05, |
|
"loss": 1.2101, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.7696428571428572, |
|
"grad_norm": 0.28996577858924866, |
|
"learning_rate": 2.5940859984439424e-05, |
|
"loss": 1.1556, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.7714285714285715, |
|
"grad_norm": 0.31623905897140503, |
|
"learning_rate": 2.5558250062062828e-05, |
|
"loss": 1.1324, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.7732142857142857, |
|
"grad_norm": 0.29438769817352295, |
|
"learning_rate": 2.5178068928529864e-05, |
|
"loss": 1.4183, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.775, |
|
"grad_norm": 0.31944945454597473, |
|
"learning_rate": 2.4800328987897427e-05, |
|
"loss": 1.1763, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.7767857142857143, |
|
"grad_norm": 0.32895660400390625, |
|
"learning_rate": 2.4425042564574184e-05, |
|
"loss": 0.8288, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.7785714285714286, |
|
"grad_norm": 0.46789219975471497, |
|
"learning_rate": 2.4052221902918725e-05, |
|
"loss": 0.889, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.7803571428571429, |
|
"grad_norm": 0.35038378834724426, |
|
"learning_rate": 2.368187916683997e-05, |
|
"loss": 1.1607, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.7821428571428571, |
|
"grad_norm": 0.3326191008090973, |
|
"learning_rate": 2.3314026439400217e-05, |
|
"loss": 1.3321, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.7839285714285714, |
|
"grad_norm": 0.3683798015117645, |
|
"learning_rate": 2.2948675722421086e-05, |
|
"loss": 0.831, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.7857142857142857, |
|
"grad_norm": 0.3587857186794281, |
|
"learning_rate": 2.2585838936091754e-05, |
|
"loss": 1.0373, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.7875, |
|
"grad_norm": 0.341905802488327, |
|
"learning_rate": 2.2225527918580204e-05, |
|
"loss": 1.2294, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.7892857142857143, |
|
"grad_norm": 0.36167776584625244, |
|
"learning_rate": 2.1867754425646926e-05, |
|
"loss": 1.06, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.7910714285714285, |
|
"grad_norm": 0.35555702447891235, |
|
"learning_rate": 2.151253013026121e-05, |
|
"loss": 1.5072, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.7928571428571428, |
|
"grad_norm": 0.34384891390800476, |
|
"learning_rate": 2.115986662222058e-05, |
|
"loss": 1.3456, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.7946428571428571, |
|
"grad_norm": 0.3673519492149353, |
|
"learning_rate": 2.0809775407772503e-05, |
|
"loss": 1.3223, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.7964285714285714, |
|
"grad_norm": 0.400193452835083, |
|
"learning_rate": 2.0462267909238896e-05, |
|
"loss": 1.562, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.7982142857142858, |
|
"grad_norm": 0.4361307621002197, |
|
"learning_rate": 2.011735546464365e-05, |
|
"loss": 1.1107, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.38674113154411316, |
|
"learning_rate": 1.9775049327342486e-05, |
|
"loss": 1.2813, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.8017857142857143, |
|
"grad_norm": 0.40437138080596924, |
|
"learning_rate": 1.943536066565603e-05, |
|
"loss": 1.5164, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.8035714285714286, |
|
"grad_norm": 0.5968692898750305, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 2.0903, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8053571428571429, |
|
"grad_norm": 0.16759343445301056, |
|
"learning_rate": 1.876388001504995e-05, |
|
"loss": 1.4236, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.8071428571428572, |
|
"grad_norm": 0.1815633326768875, |
|
"learning_rate": 1.8432109934329834e-05, |
|
"loss": 1.6486, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.8089285714285714, |
|
"grad_norm": 0.18043053150177002, |
|
"learning_rate": 1.810300114490875e-05, |
|
"loss": 1.4118, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.8107142857142857, |
|
"grad_norm": 0.19016335904598236, |
|
"learning_rate": 1.777656438452129e-05, |
|
"loss": 1.4844, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 0.2066669464111328, |
|
"learning_rate": 1.74528103037226e-05, |
|
"loss": 1.583, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.8142857142857143, |
|
"grad_norm": 0.21698947250843048, |
|
"learning_rate": 1.713174946554086e-05, |
|
"loss": 1.6314, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.8160714285714286, |
|
"grad_norm": 0.21937011182308197, |
|
"learning_rate": 1.6813392345132518e-05, |
|
"loss": 1.6342, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.8178571428571428, |
|
"grad_norm": 0.24016696214675903, |
|
"learning_rate": 1.649774932944075e-05, |
|
"loss": 1.7726, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.8196428571428571, |
|
"grad_norm": 0.23045389354228973, |
|
"learning_rate": 1.6184830716856347e-05, |
|
"loss": 1.8679, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.8214285714285714, |
|
"grad_norm": 0.2114153504371643, |
|
"learning_rate": 1.587464671688187e-05, |
|
"loss": 1.6223, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8232142857142857, |
|
"grad_norm": 0.22911155223846436, |
|
"learning_rate": 1.5567207449798515e-05, |
|
"loss": 1.6179, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.825, |
|
"grad_norm": 0.22748011350631714, |
|
"learning_rate": 1.5262522946335755e-05, |
|
"loss": 1.7988, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.8267857142857142, |
|
"grad_norm": 0.22931312024593353, |
|
"learning_rate": 1.4960603147344343e-05, |
|
"loss": 1.6515, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.8285714285714286, |
|
"grad_norm": 0.24372749030590057, |
|
"learning_rate": 1.466145790347183e-05, |
|
"loss": 1.855, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.8303571428571429, |
|
"grad_norm": 0.24426500499248505, |
|
"learning_rate": 1.4365096974841108e-05, |
|
"loss": 1.7367, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.8321428571428572, |
|
"grad_norm": 0.22962844371795654, |
|
"learning_rate": 1.4071530030732095e-05, |
|
"loss": 1.871, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.8339285714285715, |
|
"grad_norm": 0.25352221727371216, |
|
"learning_rate": 1.3780766649266242e-05, |
|
"loss": 1.816, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.8357142857142857, |
|
"grad_norm": 0.2572453022003174, |
|
"learning_rate": 1.3492816317093893e-05, |
|
"loss": 1.857, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.8375, |
|
"grad_norm": 0.26578372716903687, |
|
"learning_rate": 1.3207688429084974e-05, |
|
"loss": 1.8242, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.8392857142857143, |
|
"grad_norm": 0.2806295156478882, |
|
"learning_rate": 1.2925392288022298e-05, |
|
"loss": 1.9345, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8410714285714286, |
|
"grad_norm": 0.2791413962841034, |
|
"learning_rate": 1.2645937104298111e-05, |
|
"loss": 1.4595, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.8428571428571429, |
|
"grad_norm": 0.25607559084892273, |
|
"learning_rate": 1.2369331995613665e-05, |
|
"loss": 1.493, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.8446428571428571, |
|
"grad_norm": 0.2600158452987671, |
|
"learning_rate": 1.2095585986681535e-05, |
|
"loss": 1.4394, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.8464285714285714, |
|
"grad_norm": 0.27362141013145447, |
|
"learning_rate": 1.1824708008931418e-05, |
|
"loss": 1.0637, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.8482142857142857, |
|
"grad_norm": 0.27769190073013306, |
|
"learning_rate": 1.1556706900218572e-05, |
|
"loss": 1.2003, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.2426546961069107, |
|
"learning_rate": 1.1291591404535462e-05, |
|
"loss": 0.8119, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.8517857142857143, |
|
"grad_norm": 0.2766360938549042, |
|
"learning_rate": 1.1029370171726571e-05, |
|
"loss": 1.0697, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.8535714285714285, |
|
"grad_norm": 0.31444060802459717, |
|
"learning_rate": 1.0770051757206079e-05, |
|
"loss": 1.3142, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.8553571428571428, |
|
"grad_norm": 0.33098921179771423, |
|
"learning_rate": 1.051364462167881e-05, |
|
"loss": 1.0656, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 0.327269583940506, |
|
"learning_rate": 1.026015713086418e-05, |
|
"loss": 1.1335, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.8589285714285714, |
|
"grad_norm": 0.30396682024002075, |
|
"learning_rate": 1.0009597555223128e-05, |
|
"loss": 1.2085, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.8607142857142858, |
|
"grad_norm": 0.30377310514450073, |
|
"learning_rate": 9.761974069688461e-06, |
|
"loss": 1.3687, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.8625, |
|
"grad_norm": 0.3333049714565277, |
|
"learning_rate": 9.517294753398064e-06, |
|
"loss": 1.0564, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.8642857142857143, |
|
"grad_norm": 0.3406635820865631, |
|
"learning_rate": 9.275567589431178e-06, |
|
"loss": 1.1551, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.8660714285714286, |
|
"grad_norm": 0.35342979431152344, |
|
"learning_rate": 9.036800464548157e-06, |
|
"loss": 0.8579, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.8678571428571429, |
|
"grad_norm": 0.338238000869751, |
|
"learning_rate": 8.80100116893301e-06, |
|
"loss": 1.2404, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.8696428571428572, |
|
"grad_norm": 0.32925620675086975, |
|
"learning_rate": 8.568177395939215e-06, |
|
"loss": 0.9557, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.8714285714285714, |
|
"grad_norm": 0.36271733045578003, |
|
"learning_rate": 8.338336741838838e-06, |
|
"loss": 1.1323, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.8732142857142857, |
|
"grad_norm": 0.373471736907959, |
|
"learning_rate": 8.111486705574534e-06, |
|
"loss": 0.9512, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.3971388041973114, |
|
"learning_rate": 7.887634688515e-06, |
|
"loss": 1.0311, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8767857142857143, |
|
"grad_norm": 0.35106372833251953, |
|
"learning_rate": 7.666787994213453e-06, |
|
"loss": 1.0705, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.8785714285714286, |
|
"grad_norm": 0.3654085099697113, |
|
"learning_rate": 7.448953828169314e-06, |
|
"loss": 1.2338, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.8803571428571428, |
|
"grad_norm": 0.3616616725921631, |
|
"learning_rate": 7.2341392975931785e-06, |
|
"loss": 1.0171, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.8821428571428571, |
|
"grad_norm": 0.39053040742874146, |
|
"learning_rate": 7.022351411174866e-06, |
|
"loss": 1.2481, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.8839285714285714, |
|
"grad_norm": 0.36361098289489746, |
|
"learning_rate": 6.813597078854772e-06, |
|
"loss": 1.3712, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.8857142857142857, |
|
"grad_norm": 0.3919769823551178, |
|
"learning_rate": 6.607883111598445e-06, |
|
"loss": 1.2261, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.8875, |
|
"grad_norm": 0.3980182707309723, |
|
"learning_rate": 6.405216221174326e-06, |
|
"loss": 1.2769, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.8892857142857142, |
|
"grad_norm": 0.4138076901435852, |
|
"learning_rate": 6.205603019934791e-06, |
|
"loss": 1.4143, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.8910714285714286, |
|
"grad_norm": 0.47321876883506775, |
|
"learning_rate": 6.009050020600459e-06, |
|
"loss": 1.687, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"grad_norm": 0.532382607460022, |
|
"learning_rate": 5.8155636360475385e-06, |
|
"loss": 2.0251, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8946428571428572, |
|
"grad_norm": 0.16291013360023499, |
|
"learning_rate": 5.625150179098804e-06, |
|
"loss": 1.3453, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.8964285714285715, |
|
"grad_norm": 0.19601905345916748, |
|
"learning_rate": 5.437815862317519e-06, |
|
"loss": 1.6226, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.8982142857142857, |
|
"grad_norm": 0.20232488214969635, |
|
"learning_rate": 5.25356679780471e-06, |
|
"loss": 1.7009, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.21668295562267303, |
|
"learning_rate": 5.072408996999844e-06, |
|
"loss": 1.674, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.9017857142857143, |
|
"grad_norm": 0.208509162068367, |
|
"learning_rate": 4.8943483704846475e-06, |
|
"loss": 1.7305, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.9035714285714286, |
|
"grad_norm": 0.22677063941955566, |
|
"learning_rate": 4.719390727790218e-06, |
|
"loss": 1.9097, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.9053571428571429, |
|
"grad_norm": 0.2320660501718521, |
|
"learning_rate": 4.547541777207565e-06, |
|
"loss": 1.6309, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.9071428571428571, |
|
"grad_norm": 0.21626609563827515, |
|
"learning_rate": 4.378807125601303e-06, |
|
"loss": 1.8275, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.9089285714285714, |
|
"grad_norm": 0.2348957508802414, |
|
"learning_rate": 4.2131922782267405e-06, |
|
"loss": 1.6609, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.9107142857142857, |
|
"grad_norm": 0.21976341307163239, |
|
"learning_rate": 4.050702638550275e-06, |
|
"loss": 1.6476, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9125, |
|
"grad_norm": 0.25860780477523804, |
|
"learning_rate": 3.891343508073053e-06, |
|
"loss": 1.7382, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.9142857142857143, |
|
"grad_norm": 0.24138076603412628, |
|
"learning_rate": 3.7351200861580617e-06, |
|
"loss": 1.7668, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.9160714285714285, |
|
"grad_norm": 0.23734422028064728, |
|
"learning_rate": 3.5820374698604555e-06, |
|
"loss": 1.794, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.9178571428571428, |
|
"grad_norm": 0.23496972024440765, |
|
"learning_rate": 3.4321006537612165e-06, |
|
"loss": 1.7674, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.9196428571428571, |
|
"grad_norm": 0.24626807868480682, |
|
"learning_rate": 3.2853145298042953e-06, |
|
"loss": 2.0287, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.9214285714285714, |
|
"grad_norm": 0.24339932203292847, |
|
"learning_rate": 3.1416838871368924e-06, |
|
"loss": 1.8533, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.9232142857142858, |
|
"grad_norm": 0.253841370344162, |
|
"learning_rate": 3.0012134119532964e-06, |
|
"loss": 1.6682, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.925, |
|
"grad_norm": 0.26810407638549805, |
|
"learning_rate": 2.863907687341949e-06, |
|
"loss": 1.4686, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.9267857142857143, |
|
"grad_norm": 0.24126453697681427, |
|
"learning_rate": 2.7297711931358993e-06, |
|
"loss": 1.0048, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.9285714285714286, |
|
"grad_norm": 0.3192795515060425, |
|
"learning_rate": 2.5988083057666533e-06, |
|
"loss": 1.4609, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9303571428571429, |
|
"grad_norm": 0.2680839002132416, |
|
"learning_rate": 2.471023298121422e-06, |
|
"loss": 1.4287, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.9321428571428572, |
|
"grad_norm": 0.34683963656425476, |
|
"learning_rate": 2.3464203394036322e-06, |
|
"loss": 1.1019, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.9339285714285714, |
|
"grad_norm": 0.31293728947639465, |
|
"learning_rate": 2.2250034949969913e-06, |
|
"loss": 1.5812, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.9357142857142857, |
|
"grad_norm": 0.27005112171173096, |
|
"learning_rate": 2.1067767263327933e-06, |
|
"loss": 0.8701, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.2739523649215698, |
|
"learning_rate": 1.9917438907606556e-06, |
|
"loss": 1.1165, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.9392857142857143, |
|
"grad_norm": 0.2776015102863312, |
|
"learning_rate": 1.87990874142272e-06, |
|
"loss": 1.1448, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.9410714285714286, |
|
"grad_norm": 0.30518367886543274, |
|
"learning_rate": 1.771274927131139e-06, |
|
"loss": 1.0695, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.9428571428571428, |
|
"grad_norm": 0.31975796818733215, |
|
"learning_rate": 1.665845992249071e-06, |
|
"loss": 1.1755, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.9446428571428571, |
|
"grad_norm": 0.3304663598537445, |
|
"learning_rate": 1.5636253765750508e-06, |
|
"loss": 1.0217, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.9464285714285714, |
|
"grad_norm": 0.3353167474269867, |
|
"learning_rate": 1.4646164152307018e-06, |
|
"loss": 1.2216, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9482142857142857, |
|
"grad_norm": 0.3266999423503876, |
|
"learning_rate": 1.3688223385519672e-06, |
|
"loss": 0.8658, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.34619686007499695, |
|
"learning_rate": 1.2762462719837275e-06, |
|
"loss": 1.2839, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.9517857142857142, |
|
"grad_norm": 0.32094186544418335, |
|
"learning_rate": 1.1868912359777607e-06, |
|
"loss": 1.1571, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.9535714285714286, |
|
"grad_norm": 0.33895429968833923, |
|
"learning_rate": 1.1007601458942752e-06, |
|
"loss": 1.0222, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.9553571428571429, |
|
"grad_norm": 0.34143325686454773, |
|
"learning_rate": 1.0178558119067315e-06, |
|
"loss": 0.9386, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.9571428571428572, |
|
"grad_norm": 0.3505565822124481, |
|
"learning_rate": 9.381809389101825e-07, |
|
"loss": 1.1175, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.9589285714285715, |
|
"grad_norm": 0.36876025795936584, |
|
"learning_rate": 8.617381264330426e-07, |
|
"loss": 1.0065, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.9607142857142857, |
|
"grad_norm": 0.3824593722820282, |
|
"learning_rate": 7.885298685522235e-07, |
|
"loss": 1.1445, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.9625, |
|
"grad_norm": 0.3944648504257202, |
|
"learning_rate": 7.185585538117657e-07, |
|
"loss": 0.9773, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.9642857142857143, |
|
"grad_norm": 0.32271888852119446, |
|
"learning_rate": 6.518264651449779e-07, |
|
"loss": 1.073, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.9660714285714286, |
|
"grad_norm": 0.36668431758880615, |
|
"learning_rate": 5.883357797998757e-07, |
|
"loss": 1.3001, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.9678571428571429, |
|
"grad_norm": 0.36653202772140503, |
|
"learning_rate": 5.280885692681592e-07, |
|
"loss": 1.0388, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.9696428571428571, |
|
"grad_norm": 0.3570370674133301, |
|
"learning_rate": 4.710867992176682e-07, |
|
"loss": 1.0728, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.9714285714285714, |
|
"grad_norm": 0.3680841624736786, |
|
"learning_rate": 4.173323294281994e-07, |
|
"loss": 1.3424, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.9732142857142857, |
|
"grad_norm": 0.37311863899230957, |
|
"learning_rate": 3.6682691373086665e-07, |
|
"loss": 1.3634, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.975, |
|
"grad_norm": 0.38995441794395447, |
|
"learning_rate": 3.195721999508461e-07, |
|
"loss": 1.4773, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.9767857142857143, |
|
"grad_norm": 0.3970450162887573, |
|
"learning_rate": 2.7556972985363085e-07, |
|
"loss": 1.3839, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.9785714285714285, |
|
"grad_norm": 0.5267580151557922, |
|
"learning_rate": 2.3482093909473756e-07, |
|
"loss": 1.9768, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.9803571428571428, |
|
"grad_norm": 0.4860612154006958, |
|
"learning_rate": 1.973271571728441e-07, |
|
"loss": 1.7893, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.9821428571428571, |
|
"grad_norm": 0.9272521734237671, |
|
"learning_rate": 1.630896073864352e-07, |
|
"loss": 2.1017, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.9839285714285714, |
|
"grad_norm": 0.19919352233409882, |
|
"learning_rate": 1.3210940679385664e-07, |
|
"loss": 1.5237, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.9857142857142858, |
|
"grad_norm": 0.20294204354286194, |
|
"learning_rate": 1.0438756617691115e-07, |
|
"loss": 1.597, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.9875, |
|
"grad_norm": 0.2281683087348938, |
|
"learning_rate": 7.992499000785136e-08, |
|
"loss": 1.7265, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.9892857142857143, |
|
"grad_norm": 0.2398226112127304, |
|
"learning_rate": 5.872247641987016e-08, |
|
"loss": 1.7301, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.9910714285714286, |
|
"grad_norm": 0.2581394612789154, |
|
"learning_rate": 4.078071718107701e-08, |
|
"loss": 1.6294, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.9928571428571429, |
|
"grad_norm": 0.2676229774951935, |
|
"learning_rate": 2.610029767191602e-08, |
|
"loss": 0.9741, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.9946428571428572, |
|
"grad_norm": 0.30877920985221863, |
|
"learning_rate": 1.4681696866081229e-08, |
|
"loss": 1.1485, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.9964285714285714, |
|
"grad_norm": 0.3330385386943817, |
|
"learning_rate": 6.525287314851358e-09, |
|
"loss": 0.906, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.9982142857142857, |
|
"grad_norm": 0.364859402179718, |
|
"learning_rate": 1.6313351349883655e-09, |
|
"loss": 1.0913, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5055291652679443, |
|
"learning_rate": 0.0, |
|
"loss": 1.6353, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.4418776035308838, |
|
"eval_runtime": 13.3713, |
|
"eval_samples_per_second": 17.65, |
|
"eval_steps_per_second": 8.825, |
|
"step": 560 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 560, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 140, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.826642274948219e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|