bborisv's picture
Upload folder using huggingface_hub
454f6ab verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 200,
"global_step": 13939,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 7.174115790228854e-05,
"grad_norm": 0.201171875,
"learning_rate": 9.999282588420978e-06,
"loss": 0.8366,
"step": 1
},
{
"epoch": 0.0017935289475572136,
"grad_norm": 0.7421875,
"learning_rate": 9.982064710524428e-06,
"loss": 0.9723,
"step": 25
},
{
"epoch": 0.0035870578951144273,
"grad_norm": 0.166015625,
"learning_rate": 9.964129421048858e-06,
"loss": 0.9534,
"step": 50
},
{
"epoch": 0.0053805868426716405,
"grad_norm": 0.265625,
"learning_rate": 9.946194131573285e-06,
"loss": 0.9673,
"step": 75
},
{
"epoch": 0.007174115790228855,
"grad_norm": 0.7109375,
"learning_rate": 9.928258842097713e-06,
"loss": 0.9947,
"step": 100
},
{
"epoch": 0.008967644737786069,
"grad_norm": 0.671875,
"learning_rate": 9.91032355262214e-06,
"loss": 0.87,
"step": 125
},
{
"epoch": 0.010761173685343281,
"grad_norm": 0.1923828125,
"learning_rate": 9.892388263146568e-06,
"loss": 0.9713,
"step": 150
},
{
"epoch": 0.012554702632900495,
"grad_norm": 1.4921875,
"learning_rate": 9.874452973670995e-06,
"loss": 0.9759,
"step": 175
},
{
"epoch": 0.01434823158045771,
"grad_norm": 0.251953125,
"learning_rate": 9.856517684195423e-06,
"loss": 1.0042,
"step": 200
},
{
"epoch": 0.01614176052801492,
"grad_norm": 0.1416015625,
"learning_rate": 9.83858239471985e-06,
"loss": 1.03,
"step": 225
},
{
"epoch": 0.017935289475572137,
"grad_norm": 0.3359375,
"learning_rate": 9.82064710524428e-06,
"loss": 0.9579,
"step": 250
},
{
"epoch": 0.01972881842312935,
"grad_norm": 0.328125,
"learning_rate": 9.802711815768707e-06,
"loss": 0.894,
"step": 275
},
{
"epoch": 0.021522347370686562,
"grad_norm": 0.46875,
"learning_rate": 9.784776526293135e-06,
"loss": 1.0126,
"step": 300
},
{
"epoch": 0.023315876318243778,
"grad_norm": 0.1669921875,
"learning_rate": 9.766841236817564e-06,
"loss": 0.9455,
"step": 325
},
{
"epoch": 0.02510940526580099,
"grad_norm": 0.3671875,
"learning_rate": 9.748905947341992e-06,
"loss": 1.0228,
"step": 350
},
{
"epoch": 0.026902934213358203,
"grad_norm": 0.890625,
"learning_rate": 9.73097065786642e-06,
"loss": 1.0725,
"step": 375
},
{
"epoch": 0.02869646316091542,
"grad_norm": 0.2578125,
"learning_rate": 9.713035368390847e-06,
"loss": 0.8015,
"step": 400
},
{
"epoch": 0.03048999210847263,
"grad_norm": 0.1865234375,
"learning_rate": 9.695100078915274e-06,
"loss": 1.0028,
"step": 425
},
{
"epoch": 0.03228352105602984,
"grad_norm": 0.3984375,
"learning_rate": 9.677164789439702e-06,
"loss": 0.9588,
"step": 450
},
{
"epoch": 0.03407705000358706,
"grad_norm": 0.1748046875,
"learning_rate": 9.65922949996413e-06,
"loss": 1.076,
"step": 475
},
{
"epoch": 0.035870578951144275,
"grad_norm": 0.177734375,
"learning_rate": 9.641294210488557e-06,
"loss": 1.0304,
"step": 500
},
{
"epoch": 0.037664107898701484,
"grad_norm": 0.26171875,
"learning_rate": 9.623358921012986e-06,
"loss": 0.9989,
"step": 525
},
{
"epoch": 0.0394576368462587,
"grad_norm": 0.1640625,
"learning_rate": 9.605423631537414e-06,
"loss": 0.9504,
"step": 550
},
{
"epoch": 0.041251165793815915,
"grad_norm": 0.1943359375,
"learning_rate": 9.587488342061841e-06,
"loss": 0.9648,
"step": 575
},
{
"epoch": 0.043044694741373124,
"grad_norm": 0.251953125,
"learning_rate": 9.569553052586269e-06,
"loss": 1.0331,
"step": 600
},
{
"epoch": 0.04483822368893034,
"grad_norm": 0.1650390625,
"learning_rate": 9.551617763110698e-06,
"loss": 0.9401,
"step": 625
},
{
"epoch": 0.046631752636487556,
"grad_norm": 0.39453125,
"learning_rate": 9.533682473635126e-06,
"loss": 1.0657,
"step": 650
},
{
"epoch": 0.048425281584044765,
"grad_norm": 0.3515625,
"learning_rate": 9.515747184159553e-06,
"loss": 0.9503,
"step": 675
},
{
"epoch": 0.05021881053160198,
"grad_norm": 0.2734375,
"learning_rate": 9.497811894683981e-06,
"loss": 1.0092,
"step": 700
},
{
"epoch": 0.052012339479159196,
"grad_norm": 0.2890625,
"learning_rate": 9.479876605208408e-06,
"loss": 1.113,
"step": 725
},
{
"epoch": 0.053805868426716405,
"grad_norm": 0.279296875,
"learning_rate": 9.461941315732836e-06,
"loss": 0.9051,
"step": 750
},
{
"epoch": 0.05559939737427362,
"grad_norm": 0.671875,
"learning_rate": 9.444006026257264e-06,
"loss": 1.0507,
"step": 775
},
{
"epoch": 0.05739292632183084,
"grad_norm": 0.1728515625,
"learning_rate": 9.426070736781693e-06,
"loss": 0.9657,
"step": 800
},
{
"epoch": 0.059186455269388046,
"grad_norm": 0.2353515625,
"learning_rate": 9.40813544730612e-06,
"loss": 0.9106,
"step": 825
},
{
"epoch": 0.06097998421694526,
"grad_norm": 0.220703125,
"learning_rate": 9.390200157830548e-06,
"loss": 0.9593,
"step": 850
},
{
"epoch": 0.06277351316450247,
"grad_norm": 0.2236328125,
"learning_rate": 9.372264868354975e-06,
"loss": 1.1362,
"step": 875
},
{
"epoch": 0.06456704211205969,
"grad_norm": 0.375,
"learning_rate": 9.354329578879405e-06,
"loss": 0.9396,
"step": 900
},
{
"epoch": 0.0663605710596169,
"grad_norm": 0.6484375,
"learning_rate": 9.336394289403832e-06,
"loss": 1.0783,
"step": 925
},
{
"epoch": 0.06815410000717412,
"grad_norm": 0.1982421875,
"learning_rate": 9.31845899992826e-06,
"loss": 0.9663,
"step": 950
},
{
"epoch": 0.06994762895473133,
"grad_norm": 1.65625,
"learning_rate": 9.300523710452687e-06,
"loss": 0.914,
"step": 975
},
{
"epoch": 0.07174115790228855,
"grad_norm": 0.251953125,
"learning_rate": 9.282588420977115e-06,
"loss": 1.0177,
"step": 1000
},
{
"epoch": 0.07353468684984575,
"grad_norm": 0.2138671875,
"learning_rate": 9.264653131501543e-06,
"loss": 1.0057,
"step": 1025
},
{
"epoch": 0.07532821579740297,
"grad_norm": 0.2392578125,
"learning_rate": 9.24671784202597e-06,
"loss": 1.054,
"step": 1050
},
{
"epoch": 0.07712174474496018,
"grad_norm": 0.16015625,
"learning_rate": 9.2287825525504e-06,
"loss": 0.9429,
"step": 1075
},
{
"epoch": 0.0789152736925174,
"grad_norm": 0.609375,
"learning_rate": 9.210847263074827e-06,
"loss": 1.093,
"step": 1100
},
{
"epoch": 0.08070880264007461,
"grad_norm": 0.1669921875,
"learning_rate": 9.192911973599254e-06,
"loss": 0.9702,
"step": 1125
},
{
"epoch": 0.08250233158763183,
"grad_norm": 0.1650390625,
"learning_rate": 9.174976684123682e-06,
"loss": 0.9367,
"step": 1150
},
{
"epoch": 0.08429586053518903,
"grad_norm": 0.224609375,
"learning_rate": 9.157041394648111e-06,
"loss": 0.987,
"step": 1175
},
{
"epoch": 0.08608938948274625,
"grad_norm": 0.189453125,
"learning_rate": 9.139106105172539e-06,
"loss": 1.0192,
"step": 1200
},
{
"epoch": 0.08788291843030346,
"grad_norm": 0.75,
"learning_rate": 9.121170815696966e-06,
"loss": 0.9659,
"step": 1225
},
{
"epoch": 0.08967644737786068,
"grad_norm": 0.142578125,
"learning_rate": 9.103235526221394e-06,
"loss": 1.0774,
"step": 1250
},
{
"epoch": 0.0914699763254179,
"grad_norm": 0.41015625,
"learning_rate": 9.085300236745821e-06,
"loss": 1.0287,
"step": 1275
},
{
"epoch": 0.09326350527297511,
"grad_norm": 0.146484375,
"learning_rate": 9.067364947270249e-06,
"loss": 1.0198,
"step": 1300
},
{
"epoch": 0.09505703422053231,
"grad_norm": 0.33984375,
"learning_rate": 9.049429657794677e-06,
"loss": 0.9995,
"step": 1325
},
{
"epoch": 0.09685056316808953,
"grad_norm": 0.474609375,
"learning_rate": 9.031494368319106e-06,
"loss": 1.0788,
"step": 1350
},
{
"epoch": 0.09864409211564674,
"grad_norm": 0.248046875,
"learning_rate": 9.013559078843533e-06,
"loss": 1.022,
"step": 1375
},
{
"epoch": 0.10043762106320396,
"grad_norm": 0.330078125,
"learning_rate": 8.995623789367961e-06,
"loss": 1.1129,
"step": 1400
},
{
"epoch": 0.10223115001076118,
"grad_norm": 0.142578125,
"learning_rate": 8.977688499892389e-06,
"loss": 1.1097,
"step": 1425
},
{
"epoch": 0.10402467895831839,
"grad_norm": 0.271484375,
"learning_rate": 8.959753210416818e-06,
"loss": 0.9678,
"step": 1450
},
{
"epoch": 0.1058182079058756,
"grad_norm": 0.181640625,
"learning_rate": 8.941817920941245e-06,
"loss": 0.9554,
"step": 1475
},
{
"epoch": 0.10761173685343281,
"grad_norm": 0.2197265625,
"learning_rate": 8.923882631465673e-06,
"loss": 0.9818,
"step": 1500
},
{
"epoch": 0.10940526580099003,
"grad_norm": 0.54296875,
"learning_rate": 8.9059473419901e-06,
"loss": 1.0429,
"step": 1525
},
{
"epoch": 0.11119879474854724,
"grad_norm": 0.275390625,
"learning_rate": 8.888012052514528e-06,
"loss": 1.0533,
"step": 1550
},
{
"epoch": 0.11299232369610446,
"grad_norm": 0.345703125,
"learning_rate": 8.870076763038956e-06,
"loss": 0.9546,
"step": 1575
},
{
"epoch": 0.11478585264366167,
"grad_norm": 0.1796875,
"learning_rate": 8.852141473563383e-06,
"loss": 0.9579,
"step": 1600
},
{
"epoch": 0.11657938159121889,
"grad_norm": 0.1689453125,
"learning_rate": 8.834206184087812e-06,
"loss": 1.1338,
"step": 1625
},
{
"epoch": 0.11837291053877609,
"grad_norm": 0.2734375,
"learning_rate": 8.81627089461224e-06,
"loss": 1.0436,
"step": 1650
},
{
"epoch": 0.1201664394863333,
"grad_norm": 0.458984375,
"learning_rate": 8.798335605136667e-06,
"loss": 1.0058,
"step": 1675
},
{
"epoch": 0.12195996843389052,
"grad_norm": 0.28515625,
"learning_rate": 8.780400315661095e-06,
"loss": 1.0284,
"step": 1700
},
{
"epoch": 0.12375349738144774,
"grad_norm": 0.341796875,
"learning_rate": 8.762465026185524e-06,
"loss": 0.9829,
"step": 1725
},
{
"epoch": 0.12554702632900494,
"grad_norm": 0.328125,
"learning_rate": 8.744529736709952e-06,
"loss": 0.8737,
"step": 1750
},
{
"epoch": 0.12734055527656216,
"grad_norm": 0.251953125,
"learning_rate": 8.72659444723438e-06,
"loss": 1.0051,
"step": 1775
},
{
"epoch": 0.12913408422411937,
"grad_norm": 0.29296875,
"learning_rate": 8.708659157758807e-06,
"loss": 0.9725,
"step": 1800
},
{
"epoch": 0.1309276131716766,
"grad_norm": 0.1650390625,
"learning_rate": 8.690723868283235e-06,
"loss": 1.0045,
"step": 1825
},
{
"epoch": 0.1327211421192338,
"grad_norm": 0.2080078125,
"learning_rate": 8.672788578807662e-06,
"loss": 1.1961,
"step": 1850
},
{
"epoch": 0.13451467106679102,
"grad_norm": 0.13671875,
"learning_rate": 8.65485328933209e-06,
"loss": 0.8977,
"step": 1875
},
{
"epoch": 0.13630820001434824,
"grad_norm": 0.126953125,
"learning_rate": 8.636917999856519e-06,
"loss": 0.83,
"step": 1900
},
{
"epoch": 0.13810172896190545,
"grad_norm": 0.201171875,
"learning_rate": 8.618982710380946e-06,
"loss": 0.9642,
"step": 1925
},
{
"epoch": 0.13989525790946267,
"grad_norm": 0.330078125,
"learning_rate": 8.601047420905374e-06,
"loss": 0.8958,
"step": 1950
},
{
"epoch": 0.14168878685701988,
"grad_norm": 0.2109375,
"learning_rate": 8.583112131429802e-06,
"loss": 1.002,
"step": 1975
},
{
"epoch": 0.1434823158045771,
"grad_norm": 0.2021484375,
"learning_rate": 8.56517684195423e-06,
"loss": 1.1167,
"step": 2000
},
{
"epoch": 0.1452758447521343,
"grad_norm": 0.185546875,
"learning_rate": 8.547241552478658e-06,
"loss": 0.9423,
"step": 2025
},
{
"epoch": 0.1470693736996915,
"grad_norm": 0.5703125,
"learning_rate": 8.529306263003086e-06,
"loss": 0.9908,
"step": 2050
},
{
"epoch": 0.14886290264724872,
"grad_norm": 0.2578125,
"learning_rate": 8.511370973527513e-06,
"loss": 0.9776,
"step": 2075
},
{
"epoch": 0.15065643159480593,
"grad_norm": 0.236328125,
"learning_rate": 8.493435684051941e-06,
"loss": 0.864,
"step": 2100
},
{
"epoch": 0.15244996054236315,
"grad_norm": 3.96875,
"learning_rate": 8.475500394576369e-06,
"loss": 1.0042,
"step": 2125
},
{
"epoch": 0.15424348948992037,
"grad_norm": 0.2890625,
"learning_rate": 8.457565105100796e-06,
"loss": 0.9348,
"step": 2150
},
{
"epoch": 0.15603701843747758,
"grad_norm": 0.1865234375,
"learning_rate": 8.439629815625225e-06,
"loss": 0.8448,
"step": 2175
},
{
"epoch": 0.1578305473850348,
"grad_norm": 0.26953125,
"learning_rate": 8.421694526149653e-06,
"loss": 1.0096,
"step": 2200
},
{
"epoch": 0.159624076332592,
"grad_norm": 0.5234375,
"learning_rate": 8.40375923667408e-06,
"loss": 0.9438,
"step": 2225
},
{
"epoch": 0.16141760528014923,
"grad_norm": 0.498046875,
"learning_rate": 8.385823947198508e-06,
"loss": 1.0227,
"step": 2250
},
{
"epoch": 0.16321113422770644,
"grad_norm": 0.2353515625,
"learning_rate": 8.367888657722937e-06,
"loss": 1.02,
"step": 2275
},
{
"epoch": 0.16500466317526366,
"grad_norm": 0.1630859375,
"learning_rate": 8.349953368247365e-06,
"loss": 0.947,
"step": 2300
},
{
"epoch": 0.16679819212282085,
"grad_norm": 0.490234375,
"learning_rate": 8.332018078771792e-06,
"loss": 0.9727,
"step": 2325
},
{
"epoch": 0.16859172107037806,
"grad_norm": 0.2490234375,
"learning_rate": 8.31408278929622e-06,
"loss": 0.9206,
"step": 2350
},
{
"epoch": 0.17038525001793528,
"grad_norm": 0.16796875,
"learning_rate": 8.296147499820648e-06,
"loss": 1.0461,
"step": 2375
},
{
"epoch": 0.1721787789654925,
"grad_norm": 0.134765625,
"learning_rate": 8.278212210345075e-06,
"loss": 1.0327,
"step": 2400
},
{
"epoch": 0.1739723079130497,
"grad_norm": 0.1923828125,
"learning_rate": 8.260276920869503e-06,
"loss": 1.0652,
"step": 2425
},
{
"epoch": 0.17576583686060693,
"grad_norm": 0.3046875,
"learning_rate": 8.24234163139393e-06,
"loss": 0.9325,
"step": 2450
},
{
"epoch": 0.17755936580816414,
"grad_norm": 0.1689453125,
"learning_rate": 8.22440634191836e-06,
"loss": 1.0164,
"step": 2475
},
{
"epoch": 0.17935289475572136,
"grad_norm": 0.1884765625,
"learning_rate": 8.206471052442787e-06,
"loss": 0.9482,
"step": 2500
},
{
"epoch": 0.18114642370327858,
"grad_norm": 0.7578125,
"learning_rate": 8.188535762967215e-06,
"loss": 0.8935,
"step": 2525
},
{
"epoch": 0.1829399526508358,
"grad_norm": 0.443359375,
"learning_rate": 8.170600473491644e-06,
"loss": 0.9094,
"step": 2550
},
{
"epoch": 0.184733481598393,
"grad_norm": 0.5546875,
"learning_rate": 8.152665184016071e-06,
"loss": 0.9651,
"step": 2575
},
{
"epoch": 0.18652701054595022,
"grad_norm": 0.220703125,
"learning_rate": 8.134729894540499e-06,
"loss": 0.9974,
"step": 2600
},
{
"epoch": 0.18832053949350744,
"grad_norm": 0.55859375,
"learning_rate": 8.116794605064927e-06,
"loss": 1.0355,
"step": 2625
},
{
"epoch": 0.19011406844106463,
"grad_norm": 0.55859375,
"learning_rate": 8.098859315589354e-06,
"loss": 0.9738,
"step": 2650
},
{
"epoch": 0.19190759738862184,
"grad_norm": 0.408203125,
"learning_rate": 8.080924026113782e-06,
"loss": 0.9252,
"step": 2675
},
{
"epoch": 0.19370112633617906,
"grad_norm": 0.234375,
"learning_rate": 8.06298873663821e-06,
"loss": 0.9504,
"step": 2700
},
{
"epoch": 0.19549465528373627,
"grad_norm": 0.208984375,
"learning_rate": 8.045053447162637e-06,
"loss": 0.9233,
"step": 2725
},
{
"epoch": 0.1972881842312935,
"grad_norm": 0.62890625,
"learning_rate": 8.027118157687066e-06,
"loss": 1.0145,
"step": 2750
},
{
"epoch": 0.1990817131788507,
"grad_norm": 0.287109375,
"learning_rate": 8.009182868211494e-06,
"loss": 0.9941,
"step": 2775
},
{
"epoch": 0.20087524212640792,
"grad_norm": 0.216796875,
"learning_rate": 7.991247578735921e-06,
"loss": 0.9517,
"step": 2800
},
{
"epoch": 0.20266877107396514,
"grad_norm": 0.2265625,
"learning_rate": 7.97331228926035e-06,
"loss": 0.8408,
"step": 2825
},
{
"epoch": 0.20446230002152235,
"grad_norm": 0.142578125,
"learning_rate": 7.955376999784778e-06,
"loss": 0.9418,
"step": 2850
},
{
"epoch": 0.20625582896907957,
"grad_norm": 0.10107421875,
"learning_rate": 7.937441710309205e-06,
"loss": 0.9479,
"step": 2875
},
{
"epoch": 0.20804935791663678,
"grad_norm": 0.25390625,
"learning_rate": 7.919506420833633e-06,
"loss": 1.1943,
"step": 2900
},
{
"epoch": 0.209842886864194,
"grad_norm": 0.392578125,
"learning_rate": 7.90157113135806e-06,
"loss": 1.0299,
"step": 2925
},
{
"epoch": 0.2116364158117512,
"grad_norm": 0.9453125,
"learning_rate": 7.883635841882488e-06,
"loss": 1.0215,
"step": 2950
},
{
"epoch": 0.2134299447593084,
"grad_norm": 0.46484375,
"learning_rate": 7.865700552406916e-06,
"loss": 1.0463,
"step": 2975
},
{
"epoch": 0.21522347370686562,
"grad_norm": 0.1318359375,
"learning_rate": 7.847765262931343e-06,
"loss": 0.9215,
"step": 3000
},
{
"epoch": 0.21701700265442284,
"grad_norm": 0.1435546875,
"learning_rate": 7.829829973455773e-06,
"loss": 0.964,
"step": 3025
},
{
"epoch": 0.21881053160198005,
"grad_norm": 0.15625,
"learning_rate": 7.8118946839802e-06,
"loss": 0.9525,
"step": 3050
},
{
"epoch": 0.22060406054953727,
"grad_norm": 0.2294921875,
"learning_rate": 7.793959394504628e-06,
"loss": 1.1366,
"step": 3075
},
{
"epoch": 0.22239758949709448,
"grad_norm": 0.2734375,
"learning_rate": 7.776024105029057e-06,
"loss": 1.0143,
"step": 3100
},
{
"epoch": 0.2241911184446517,
"grad_norm": 0.494140625,
"learning_rate": 7.758088815553484e-06,
"loss": 1.0759,
"step": 3125
},
{
"epoch": 0.22598464739220891,
"grad_norm": 0.1650390625,
"learning_rate": 7.740153526077912e-06,
"loss": 0.8999,
"step": 3150
},
{
"epoch": 0.22777817633976613,
"grad_norm": 0.125,
"learning_rate": 7.72221823660234e-06,
"loss": 1.0623,
"step": 3175
},
{
"epoch": 0.22957170528732335,
"grad_norm": 0.25,
"learning_rate": 7.704282947126767e-06,
"loss": 0.9954,
"step": 3200
},
{
"epoch": 0.23136523423488056,
"grad_norm": 0.369140625,
"learning_rate": 7.686347657651195e-06,
"loss": 1.1111,
"step": 3225
},
{
"epoch": 0.23315876318243778,
"grad_norm": 0.296875,
"learning_rate": 7.668412368175622e-06,
"loss": 0.8936,
"step": 3250
},
{
"epoch": 0.23495229212999497,
"grad_norm": 0.279296875,
"learning_rate": 7.65047707870005e-06,
"loss": 0.9579,
"step": 3275
},
{
"epoch": 0.23674582107755218,
"grad_norm": 0.337890625,
"learning_rate": 7.632541789224479e-06,
"loss": 1.1169,
"step": 3300
},
{
"epoch": 0.2385393500251094,
"grad_norm": 0.29296875,
"learning_rate": 7.614606499748907e-06,
"loss": 1.0179,
"step": 3325
},
{
"epoch": 0.2403328789726666,
"grad_norm": 1.0078125,
"learning_rate": 7.596671210273334e-06,
"loss": 0.9545,
"step": 3350
},
{
"epoch": 0.24212640792022383,
"grad_norm": 1.0625,
"learning_rate": 7.578735920797762e-06,
"loss": 1.0669,
"step": 3375
},
{
"epoch": 0.24391993686778105,
"grad_norm": 0.197265625,
"learning_rate": 7.56080063132219e-06,
"loss": 0.978,
"step": 3400
},
{
"epoch": 0.24571346581533826,
"grad_norm": 0.34375,
"learning_rate": 7.542865341846618e-06,
"loss": 0.8955,
"step": 3425
},
{
"epoch": 0.24750699476289548,
"grad_norm": 0.20703125,
"learning_rate": 7.524930052371045e-06,
"loss": 1.1202,
"step": 3450
},
{
"epoch": 0.2493005237104527,
"grad_norm": 0.1826171875,
"learning_rate": 7.5069947628954745e-06,
"loss": 0.8992,
"step": 3475
},
{
"epoch": 0.2510940526580099,
"grad_norm": 0.271484375,
"learning_rate": 7.489059473419902e-06,
"loss": 0.9247,
"step": 3500
},
{
"epoch": 0.2528875816055671,
"grad_norm": 0.1884765625,
"learning_rate": 7.47112418394433e-06,
"loss": 0.9982,
"step": 3525
},
{
"epoch": 0.2546811105531243,
"grad_norm": 0.37890625,
"learning_rate": 7.453188894468757e-06,
"loss": 0.9403,
"step": 3550
},
{
"epoch": 0.25647463950068156,
"grad_norm": 0.2001953125,
"learning_rate": 7.4352536049931856e-06,
"loss": 0.9692,
"step": 3575
},
{
"epoch": 0.25826816844823874,
"grad_norm": 0.255859375,
"learning_rate": 7.417318315517613e-06,
"loss": 0.9188,
"step": 3600
},
{
"epoch": 0.260061697395796,
"grad_norm": 0.26171875,
"learning_rate": 7.399383026042041e-06,
"loss": 0.9708,
"step": 3625
},
{
"epoch": 0.2618552263433532,
"grad_norm": 2.9375,
"learning_rate": 7.381447736566468e-06,
"loss": 0.9779,
"step": 3650
},
{
"epoch": 0.2636487552909104,
"grad_norm": 0.224609375,
"learning_rate": 7.363512447090897e-06,
"loss": 1.0341,
"step": 3675
},
{
"epoch": 0.2654422842384676,
"grad_norm": 0.1337890625,
"learning_rate": 7.345577157615324e-06,
"loss": 0.9934,
"step": 3700
},
{
"epoch": 0.2672358131860248,
"grad_norm": 0.1806640625,
"learning_rate": 7.327641868139752e-06,
"loss": 0.9893,
"step": 3725
},
{
"epoch": 0.26902934213358204,
"grad_norm": 0.1796875,
"learning_rate": 7.309706578664181e-06,
"loss": 1.0313,
"step": 3750
},
{
"epoch": 0.2708228710811392,
"grad_norm": 0.6484375,
"learning_rate": 7.2917712891886086e-06,
"loss": 1.0338,
"step": 3775
},
{
"epoch": 0.27261640002869647,
"grad_norm": 0.212890625,
"learning_rate": 7.273835999713036e-06,
"loss": 0.9932,
"step": 3800
},
{
"epoch": 0.27440992897625366,
"grad_norm": 0.1865234375,
"learning_rate": 7.255900710237464e-06,
"loss": 0.9619,
"step": 3825
},
{
"epoch": 0.2762034579238109,
"grad_norm": 0.171875,
"learning_rate": 7.237965420761892e-06,
"loss": 0.9433,
"step": 3850
},
{
"epoch": 0.2779969868713681,
"grad_norm": 0.345703125,
"learning_rate": 7.22003013128632e-06,
"loss": 1.0252,
"step": 3875
},
{
"epoch": 0.27979051581892533,
"grad_norm": 0.1279296875,
"learning_rate": 7.202094841810747e-06,
"loss": 0.963,
"step": 3900
},
{
"epoch": 0.2815840447664825,
"grad_norm": 0.220703125,
"learning_rate": 7.184159552335175e-06,
"loss": 1.0337,
"step": 3925
},
{
"epoch": 0.28337757371403977,
"grad_norm": 0.3828125,
"learning_rate": 7.166224262859603e-06,
"loss": 1.0714,
"step": 3950
},
{
"epoch": 0.28517110266159695,
"grad_norm": 0.189453125,
"learning_rate": 7.148288973384031e-06,
"loss": 0.9653,
"step": 3975
},
{
"epoch": 0.2869646316091542,
"grad_norm": 0.1884765625,
"learning_rate": 7.130353683908458e-06,
"loss": 1.0517,
"step": 4000
},
{
"epoch": 0.2887581605567114,
"grad_norm": 0.2255859375,
"learning_rate": 7.112418394432886e-06,
"loss": 1.1255,
"step": 4025
},
{
"epoch": 0.2905516895042686,
"grad_norm": 0.1845703125,
"learning_rate": 7.094483104957315e-06,
"loss": 1.0293,
"step": 4050
},
{
"epoch": 0.2923452184518258,
"grad_norm": 0.296875,
"learning_rate": 7.076547815481743e-06,
"loss": 1.1049,
"step": 4075
},
{
"epoch": 0.294138747399383,
"grad_norm": 1.75,
"learning_rate": 7.05861252600617e-06,
"loss": 1.1538,
"step": 4100
},
{
"epoch": 0.29593227634694025,
"grad_norm": 0.185546875,
"learning_rate": 7.040677236530599e-06,
"loss": 1.1805,
"step": 4125
},
{
"epoch": 0.29772580529449744,
"grad_norm": 0.3828125,
"learning_rate": 7.022741947055026e-06,
"loss": 0.9831,
"step": 4150
},
{
"epoch": 0.2995193342420547,
"grad_norm": 0.462890625,
"learning_rate": 7.004806657579454e-06,
"loss": 0.8992,
"step": 4175
},
{
"epoch": 0.30131286318961187,
"grad_norm": 0.83984375,
"learning_rate": 6.986871368103881e-06,
"loss": 0.9759,
"step": 4200
},
{
"epoch": 0.3031063921371691,
"grad_norm": 0.310546875,
"learning_rate": 6.96893607862831e-06,
"loss": 0.907,
"step": 4225
},
{
"epoch": 0.3048999210847263,
"grad_norm": 0.142578125,
"learning_rate": 6.951000789152737e-06,
"loss": 1.1788,
"step": 4250
},
{
"epoch": 0.30669345003228354,
"grad_norm": 0.1884765625,
"learning_rate": 6.933065499677165e-06,
"loss": 0.9443,
"step": 4275
},
{
"epoch": 0.30848697897984073,
"grad_norm": 0.1689453125,
"learning_rate": 6.915130210201592e-06,
"loss": 1.0499,
"step": 4300
},
{
"epoch": 0.310280507927398,
"grad_norm": 0.314453125,
"learning_rate": 6.897194920726022e-06,
"loss": 1.0442,
"step": 4325
},
{
"epoch": 0.31207403687495516,
"grad_norm": 0.34765625,
"learning_rate": 6.879259631250449e-06,
"loss": 0.9441,
"step": 4350
},
{
"epoch": 0.31386756582251235,
"grad_norm": 0.240234375,
"learning_rate": 6.861324341774877e-06,
"loss": 0.9437,
"step": 4375
},
{
"epoch": 0.3156610947700696,
"grad_norm": 0.2314453125,
"learning_rate": 6.843389052299305e-06,
"loss": 1.0314,
"step": 4400
},
{
"epoch": 0.3174546237176268,
"grad_norm": 0.21484375,
"learning_rate": 6.825453762823733e-06,
"loss": 1.0433,
"step": 4425
},
{
"epoch": 0.319248152665184,
"grad_norm": 0.625,
"learning_rate": 6.80751847334816e-06,
"loss": 0.876,
"step": 4450
},
{
"epoch": 0.3210416816127412,
"grad_norm": 0.380859375,
"learning_rate": 6.789583183872588e-06,
"loss": 1.0804,
"step": 4475
},
{
"epoch": 0.32283521056029846,
"grad_norm": 0.1416015625,
"learning_rate": 6.771647894397016e-06,
"loss": 1.0427,
"step": 4500
},
{
"epoch": 0.32462873950785565,
"grad_norm": 0.203125,
"learning_rate": 6.753712604921444e-06,
"loss": 1.0126,
"step": 4525
},
{
"epoch": 0.3264222684554129,
"grad_norm": 0.1689453125,
"learning_rate": 6.735777315445871e-06,
"loss": 1.0004,
"step": 4550
},
{
"epoch": 0.3282157974029701,
"grad_norm": 0.41015625,
"learning_rate": 6.717842025970299e-06,
"loss": 1.0539,
"step": 4575
},
{
"epoch": 0.3300093263505273,
"grad_norm": 0.181640625,
"learning_rate": 6.699906736494728e-06,
"loss": 0.8393,
"step": 4600
},
{
"epoch": 0.3318028552980845,
"grad_norm": 0.1689453125,
"learning_rate": 6.681971447019156e-06,
"loss": 0.9865,
"step": 4625
},
{
"epoch": 0.3335963842456417,
"grad_norm": 0.125,
"learning_rate": 6.664036157543583e-06,
"loss": 1.0389,
"step": 4650
},
{
"epoch": 0.33538991319319894,
"grad_norm": 0.2373046875,
"learning_rate": 6.646100868068012e-06,
"loss": 1.1386,
"step": 4675
},
{
"epoch": 0.33718344214075613,
"grad_norm": 0.20703125,
"learning_rate": 6.628165578592439e-06,
"loss": 0.9599,
"step": 4700
},
{
"epoch": 0.3389769710883134,
"grad_norm": 0.1904296875,
"learning_rate": 6.610230289116867e-06,
"loss": 1.023,
"step": 4725
},
{
"epoch": 0.34077050003587056,
"grad_norm": 0.26953125,
"learning_rate": 6.592294999641294e-06,
"loss": 1.0246,
"step": 4750
},
{
"epoch": 0.3425640289834278,
"grad_norm": 0.20703125,
"learning_rate": 6.574359710165723e-06,
"loss": 0.8527,
"step": 4775
},
{
"epoch": 0.344357557930985,
"grad_norm": 0.13671875,
"learning_rate": 6.55642442069015e-06,
"loss": 0.9386,
"step": 4800
},
{
"epoch": 0.34615108687854224,
"grad_norm": 0.53515625,
"learning_rate": 6.538489131214578e-06,
"loss": 1.0464,
"step": 4825
},
{
"epoch": 0.3479446158260994,
"grad_norm": 0.23046875,
"learning_rate": 6.520553841739005e-06,
"loss": 0.9225,
"step": 4850
},
{
"epoch": 0.34973814477365667,
"grad_norm": 0.1806640625,
"learning_rate": 6.502618552263435e-06,
"loss": 0.9708,
"step": 4875
},
{
"epoch": 0.35153167372121386,
"grad_norm": 0.310546875,
"learning_rate": 6.484683262787862e-06,
"loss": 0.9824,
"step": 4900
},
{
"epoch": 0.3533252026687711,
"grad_norm": 0.26171875,
"learning_rate": 6.46674797331229e-06,
"loss": 1.0268,
"step": 4925
},
{
"epoch": 0.3551187316163283,
"grad_norm": 0.15625,
"learning_rate": 6.448812683836717e-06,
"loss": 0.9887,
"step": 4950
},
{
"epoch": 0.3569122605638855,
"grad_norm": 0.52734375,
"learning_rate": 6.430877394361146e-06,
"loss": 1.0141,
"step": 4975
},
{
"epoch": 0.3587057895114427,
"grad_norm": 0.12060546875,
"learning_rate": 6.412942104885573e-06,
"loss": 0.8455,
"step": 5000
},
{
"epoch": 0.3604993184589999,
"grad_norm": 0.12890625,
"learning_rate": 6.395006815410001e-06,
"loss": 0.9621,
"step": 5025
},
{
"epoch": 0.36229284740655715,
"grad_norm": 0.1396484375,
"learning_rate": 6.377071525934429e-06,
"loss": 0.903,
"step": 5050
},
{
"epoch": 0.36408637635411434,
"grad_norm": 0.216796875,
"learning_rate": 6.359136236458857e-06,
"loss": 1.058,
"step": 5075
},
{
"epoch": 0.3658799053016716,
"grad_norm": 0.16015625,
"learning_rate": 6.341200946983284e-06,
"loss": 0.9965,
"step": 5100
},
{
"epoch": 0.36767343424922877,
"grad_norm": 0.76171875,
"learning_rate": 6.323265657507712e-06,
"loss": 0.9547,
"step": 5125
},
{
"epoch": 0.369466963196786,
"grad_norm": 0.88671875,
"learning_rate": 6.305330368032141e-06,
"loss": 1.066,
"step": 5150
},
{
"epoch": 0.3712604921443432,
"grad_norm": 0.17578125,
"learning_rate": 6.287395078556569e-06,
"loss": 0.9571,
"step": 5175
},
{
"epoch": 0.37305402109190045,
"grad_norm": 0.279296875,
"learning_rate": 6.269459789080996e-06,
"loss": 1.1076,
"step": 5200
},
{
"epoch": 0.37484755003945763,
"grad_norm": 0.234375,
"learning_rate": 6.251524499605424e-06,
"loss": 1.0116,
"step": 5225
},
{
"epoch": 0.3766410789870149,
"grad_norm": 0.3203125,
"learning_rate": 6.233589210129852e-06,
"loss": 0.9815,
"step": 5250
},
{
"epoch": 0.37843460793457206,
"grad_norm": 0.259765625,
"learning_rate": 6.21565392065428e-06,
"loss": 1.0173,
"step": 5275
},
{
"epoch": 0.38022813688212925,
"grad_norm": 0.2021484375,
"learning_rate": 6.197718631178707e-06,
"loss": 0.9725,
"step": 5300
},
{
"epoch": 0.3820216658296865,
"grad_norm": 0.14453125,
"learning_rate": 6.179783341703136e-06,
"loss": 0.9518,
"step": 5325
},
{
"epoch": 0.3838151947772437,
"grad_norm": 0.56640625,
"learning_rate": 6.161848052227563e-06,
"loss": 1.0591,
"step": 5350
},
{
"epoch": 0.38560872372480093,
"grad_norm": 0.443359375,
"learning_rate": 6.143912762751991e-06,
"loss": 0.9761,
"step": 5375
},
{
"epoch": 0.3874022526723581,
"grad_norm": 0.458984375,
"learning_rate": 6.125977473276418e-06,
"loss": 1.0181,
"step": 5400
},
{
"epoch": 0.38919578161991536,
"grad_norm": 0.53125,
"learning_rate": 6.108042183800848e-06,
"loss": 0.9424,
"step": 5425
},
{
"epoch": 0.39098931056747255,
"grad_norm": 0.2216796875,
"learning_rate": 6.090106894325275e-06,
"loss": 1.126,
"step": 5450
},
{
"epoch": 0.3927828395150298,
"grad_norm": 0.36328125,
"learning_rate": 6.072171604849703e-06,
"loss": 0.9558,
"step": 5475
},
{
"epoch": 0.394576368462587,
"grad_norm": 0.2119140625,
"learning_rate": 6.05423631537413e-06,
"loss": 0.9462,
"step": 5500
},
{
"epoch": 0.3963698974101442,
"grad_norm": 0.21484375,
"learning_rate": 6.036301025898559e-06,
"loss": 0.9375,
"step": 5525
},
{
"epoch": 0.3981634263577014,
"grad_norm": 1.2890625,
"learning_rate": 6.018365736422986e-06,
"loss": 1.0341,
"step": 5550
},
{
"epoch": 0.39995695530525865,
"grad_norm": 0.28515625,
"learning_rate": 6.000430446947414e-06,
"loss": 0.994,
"step": 5575
},
{
"epoch": 0.40175048425281584,
"grad_norm": 0.1591796875,
"learning_rate": 5.982495157471842e-06,
"loss": 1.1157,
"step": 5600
},
{
"epoch": 0.40354401320037303,
"grad_norm": 0.400390625,
"learning_rate": 5.96455986799627e-06,
"loss": 1.0531,
"step": 5625
},
{
"epoch": 0.4053375421479303,
"grad_norm": 0.4375,
"learning_rate": 5.946624578520697e-06,
"loss": 0.9557,
"step": 5650
},
{
"epoch": 0.40713107109548746,
"grad_norm": 0.146484375,
"learning_rate": 5.928689289045125e-06,
"loss": 1.055,
"step": 5675
},
{
"epoch": 0.4089246000430447,
"grad_norm": 0.1416015625,
"learning_rate": 5.910753999569554e-06,
"loss": 0.9165,
"step": 5700
},
{
"epoch": 0.4107181289906019,
"grad_norm": 0.2490234375,
"learning_rate": 5.892818710093982e-06,
"loss": 0.9811,
"step": 5725
},
{
"epoch": 0.41251165793815914,
"grad_norm": 0.3359375,
"learning_rate": 5.874883420618409e-06,
"loss": 1.0195,
"step": 5750
},
{
"epoch": 0.4143051868857163,
"grad_norm": 0.25390625,
"learning_rate": 5.856948131142837e-06,
"loss": 1.0109,
"step": 5775
},
{
"epoch": 0.41609871583327357,
"grad_norm": 0.26953125,
"learning_rate": 5.839012841667265e-06,
"loss": 0.9009,
"step": 5800
},
{
"epoch": 0.41789224478083076,
"grad_norm": 0.423828125,
"learning_rate": 5.821077552191693e-06,
"loss": 0.8922,
"step": 5825
},
{
"epoch": 0.419685773728388,
"grad_norm": 0.1875,
"learning_rate": 5.80314226271612e-06,
"loss": 0.9849,
"step": 5850
},
{
"epoch": 0.4214793026759452,
"grad_norm": 0.193359375,
"learning_rate": 5.785206973240548e-06,
"loss": 0.935,
"step": 5875
},
{
"epoch": 0.4232728316235024,
"grad_norm": 0.115234375,
"learning_rate": 5.767271683764976e-06,
"loss": 1.0707,
"step": 5900
},
{
"epoch": 0.4250663605710596,
"grad_norm": 0.38671875,
"learning_rate": 5.749336394289404e-06,
"loss": 0.9005,
"step": 5925
},
{
"epoch": 0.4268598895186168,
"grad_norm": 0.1572265625,
"learning_rate": 5.7314011048138314e-06,
"loss": 1.0365,
"step": 5950
},
{
"epoch": 0.42865341846617405,
"grad_norm": 1.8671875,
"learning_rate": 5.713465815338261e-06,
"loss": 0.9861,
"step": 5975
},
{
"epoch": 0.43044694741373124,
"grad_norm": 0.13671875,
"learning_rate": 5.695530525862688e-06,
"loss": 1.0156,
"step": 6000
},
{
"epoch": 0.4322404763612885,
"grad_norm": 0.255859375,
"learning_rate": 5.677595236387116e-06,
"loss": 1.0903,
"step": 6025
},
{
"epoch": 0.43403400530884567,
"grad_norm": 0.375,
"learning_rate": 5.659659946911543e-06,
"loss": 1.0378,
"step": 6050
},
{
"epoch": 0.4358275342564029,
"grad_norm": 0.30078125,
"learning_rate": 5.641724657435972e-06,
"loss": 0.9674,
"step": 6075
},
{
"epoch": 0.4376210632039601,
"grad_norm": 0.48046875,
"learning_rate": 5.623789367960399e-06,
"loss": 0.9478,
"step": 6100
},
{
"epoch": 0.43941459215151735,
"grad_norm": 0.2021484375,
"learning_rate": 5.605854078484827e-06,
"loss": 0.9359,
"step": 6125
},
{
"epoch": 0.44120812109907454,
"grad_norm": 0.2109375,
"learning_rate": 5.5879187890092544e-06,
"loss": 0.9368,
"step": 6150
},
{
"epoch": 0.4430016500466318,
"grad_norm": 0.2373046875,
"learning_rate": 5.569983499533683e-06,
"loss": 1.009,
"step": 6175
},
{
"epoch": 0.44479517899418897,
"grad_norm": 0.353515625,
"learning_rate": 5.55204821005811e-06,
"loss": 0.8657,
"step": 6200
},
{
"epoch": 0.44658870794174615,
"grad_norm": 0.255859375,
"learning_rate": 5.534112920582538e-06,
"loss": 1.0022,
"step": 6225
},
{
"epoch": 0.4483822368893034,
"grad_norm": 0.1865234375,
"learning_rate": 5.516177631106967e-06,
"loss": 0.9488,
"step": 6250
},
{
"epoch": 0.4501757658368606,
"grad_norm": 0.384765625,
"learning_rate": 5.498242341631395e-06,
"loss": 0.8726,
"step": 6275
},
{
"epoch": 0.45196929478441783,
"grad_norm": 0.15625,
"learning_rate": 5.480307052155822e-06,
"loss": 0.9772,
"step": 6300
},
{
"epoch": 0.453762823731975,
"grad_norm": 0.14453125,
"learning_rate": 5.46237176268025e-06,
"loss": 1.0123,
"step": 6325
},
{
"epoch": 0.45555635267953226,
"grad_norm": 0.181640625,
"learning_rate": 5.444436473204678e-06,
"loss": 1.2663,
"step": 6350
},
{
"epoch": 0.45734988162708945,
"grad_norm": 0.12353515625,
"learning_rate": 5.426501183729106e-06,
"loss": 0.9921,
"step": 6375
},
{
"epoch": 0.4591434105746467,
"grad_norm": 0.353515625,
"learning_rate": 5.408565894253533e-06,
"loss": 1.019,
"step": 6400
},
{
"epoch": 0.4609369395222039,
"grad_norm": 0.640625,
"learning_rate": 5.390630604777961e-06,
"loss": 1.016,
"step": 6425
},
{
"epoch": 0.4627304684697611,
"grad_norm": 0.51171875,
"learning_rate": 5.372695315302389e-06,
"loss": 1.0478,
"step": 6450
},
{
"epoch": 0.4645239974173183,
"grad_norm": 0.578125,
"learning_rate": 5.354760025826817e-06,
"loss": 0.9038,
"step": 6475
},
{
"epoch": 0.46631752636487556,
"grad_norm": 0.2216796875,
"learning_rate": 5.3368247363512445e-06,
"loss": 0.9667,
"step": 6500
},
{
"epoch": 0.46811105531243274,
"grad_norm": 0.2099609375,
"learning_rate": 5.318889446875674e-06,
"loss": 0.945,
"step": 6525
},
{
"epoch": 0.46990458425998993,
"grad_norm": 0.2080078125,
"learning_rate": 5.300954157400101e-06,
"loss": 0.9285,
"step": 6550
},
{
"epoch": 0.4716981132075472,
"grad_norm": 0.205078125,
"learning_rate": 5.283018867924529e-06,
"loss": 0.9441,
"step": 6575
},
{
"epoch": 0.47349164215510436,
"grad_norm": 0.275390625,
"learning_rate": 5.265083578448956e-06,
"loss": 1.0327,
"step": 6600
},
{
"epoch": 0.4752851711026616,
"grad_norm": 0.177734375,
"learning_rate": 5.247148288973385e-06,
"loss": 0.9288,
"step": 6625
},
{
"epoch": 0.4770787000502188,
"grad_norm": 0.6875,
"learning_rate": 5.229212999497812e-06,
"loss": 0.9748,
"step": 6650
},
{
"epoch": 0.47887222899777604,
"grad_norm": 0.28125,
"learning_rate": 5.21127771002224e-06,
"loss": 1.2034,
"step": 6675
},
{
"epoch": 0.4806657579453332,
"grad_norm": 0.494140625,
"learning_rate": 5.1933424205466675e-06,
"loss": 0.9929,
"step": 6700
},
{
"epoch": 0.48245928689289047,
"grad_norm": 0.126953125,
"learning_rate": 5.175407131071096e-06,
"loss": 1.0314,
"step": 6725
},
{
"epoch": 0.48425281584044766,
"grad_norm": 0.412109375,
"learning_rate": 5.1574718415955234e-06,
"loss": 1.1025,
"step": 6750
},
{
"epoch": 0.4860463447880049,
"grad_norm": 0.201171875,
"learning_rate": 5.139536552119951e-06,
"loss": 0.9664,
"step": 6775
},
{
"epoch": 0.4878398737355621,
"grad_norm": 0.38671875,
"learning_rate": 5.1216012626443786e-06,
"loss": 1.0528,
"step": 6800
},
{
"epoch": 0.4896334026831193,
"grad_norm": 0.294921875,
"learning_rate": 5.103665973168808e-06,
"loss": 1.012,
"step": 6825
},
{
"epoch": 0.4914269316306765,
"grad_norm": 5.53125,
"learning_rate": 5.085730683693235e-06,
"loss": 1.0442,
"step": 6850
},
{
"epoch": 0.4932204605782337,
"grad_norm": 0.1513671875,
"learning_rate": 5.067795394217663e-06,
"loss": 0.9695,
"step": 6875
},
{
"epoch": 0.49501398952579095,
"grad_norm": 0.365234375,
"learning_rate": 5.049860104742091e-06,
"loss": 0.9442,
"step": 6900
},
{
"epoch": 0.49680751847334814,
"grad_norm": 0.263671875,
"learning_rate": 5.031924815266519e-06,
"loss": 0.9851,
"step": 6925
},
{
"epoch": 0.4986010474209054,
"grad_norm": 0.58984375,
"learning_rate": 5.0139895257909464e-06,
"loss": 1.0582,
"step": 6950
},
{
"epoch": 0.5003945763684626,
"grad_norm": 0.345703125,
"learning_rate": 4.996054236315375e-06,
"loss": 1.0165,
"step": 6975
},
{
"epoch": 0.5021881053160198,
"grad_norm": 0.189453125,
"learning_rate": 4.978118946839802e-06,
"loss": 1.0012,
"step": 7000
},
{
"epoch": 0.503981634263577,
"grad_norm": 0.1396484375,
"learning_rate": 4.96018365736423e-06,
"loss": 0.9372,
"step": 7025
},
{
"epoch": 0.5057751632111342,
"grad_norm": 0.169921875,
"learning_rate": 4.9422483678886575e-06,
"loss": 0.9966,
"step": 7050
},
{
"epoch": 0.5075686921586915,
"grad_norm": 0.2353515625,
"learning_rate": 4.924313078413086e-06,
"loss": 0.9735,
"step": 7075
},
{
"epoch": 0.5093622211062486,
"grad_norm": 0.1494140625,
"learning_rate": 4.9063777889375135e-06,
"loss": 0.9408,
"step": 7100
},
{
"epoch": 0.5111557500538059,
"grad_norm": 0.47265625,
"learning_rate": 4.888442499461942e-06,
"loss": 0.9853,
"step": 7125
},
{
"epoch": 0.5129492790013631,
"grad_norm": 0.171875,
"learning_rate": 4.8705072099863694e-06,
"loss": 0.8846,
"step": 7150
},
{
"epoch": 0.5147428079489202,
"grad_norm": 0.1455078125,
"learning_rate": 4.852571920510797e-06,
"loss": 1.0347,
"step": 7175
},
{
"epoch": 0.5165363368964775,
"grad_norm": 0.33984375,
"learning_rate": 4.834636631035225e-06,
"loss": 0.9861,
"step": 7200
},
{
"epoch": 0.5183298658440347,
"grad_norm": 0.296875,
"learning_rate": 4.816701341559653e-06,
"loss": 1.0312,
"step": 7225
},
{
"epoch": 0.520123394791592,
"grad_norm": 0.12353515625,
"learning_rate": 4.798766052084081e-06,
"loss": 0.9744,
"step": 7250
},
{
"epoch": 0.5219169237391491,
"grad_norm": 3.671875,
"learning_rate": 4.780830762608509e-06,
"loss": 1.0849,
"step": 7275
},
{
"epoch": 0.5237104526867064,
"grad_norm": 0.36328125,
"learning_rate": 4.7628954731329365e-06,
"loss": 0.9942,
"step": 7300
},
{
"epoch": 0.5255039816342636,
"grad_norm": 1.1875,
"learning_rate": 4.744960183657364e-06,
"loss": 0.9007,
"step": 7325
},
{
"epoch": 0.5272975105818208,
"grad_norm": 0.119140625,
"learning_rate": 4.7270248941817924e-06,
"loss": 0.9755,
"step": 7350
},
{
"epoch": 0.529091039529378,
"grad_norm": 0.1806640625,
"learning_rate": 4.70908960470622e-06,
"loss": 0.981,
"step": 7375
},
{
"epoch": 0.5308845684769352,
"grad_norm": 0.5859375,
"learning_rate": 4.691154315230648e-06,
"loss": 1.1553,
"step": 7400
},
{
"epoch": 0.5326780974244925,
"grad_norm": 0.2470703125,
"learning_rate": 4.673219025755076e-06,
"loss": 1.2496,
"step": 7425
},
{
"epoch": 0.5344716263720496,
"grad_norm": 0.302734375,
"learning_rate": 4.6552837362795035e-06,
"loss": 0.9646,
"step": 7450
},
{
"epoch": 0.5362651553196068,
"grad_norm": 0.1298828125,
"learning_rate": 4.637348446803932e-06,
"loss": 0.9441,
"step": 7475
},
{
"epoch": 0.5380586842671641,
"grad_norm": 0.33203125,
"learning_rate": 4.6194131573283595e-06,
"loss": 0.9471,
"step": 7500
},
{
"epoch": 0.5398522132147213,
"grad_norm": 1.4765625,
"learning_rate": 4.601477867852788e-06,
"loss": 0.8943,
"step": 7525
},
{
"epoch": 0.5416457421622785,
"grad_norm": 0.48828125,
"learning_rate": 4.5835425783772154e-06,
"loss": 1.056,
"step": 7550
},
{
"epoch": 0.5434392711098357,
"grad_norm": 0.162109375,
"learning_rate": 4.565607288901643e-06,
"loss": 0.9676,
"step": 7575
},
{
"epoch": 0.5452328000573929,
"grad_norm": 0.1650390625,
"learning_rate": 4.5476719994260706e-06,
"loss": 1.0258,
"step": 7600
},
{
"epoch": 0.5470263290049502,
"grad_norm": 2.640625,
"learning_rate": 4.529736709950499e-06,
"loss": 1.0032,
"step": 7625
},
{
"epoch": 0.5488198579525073,
"grad_norm": 0.3984375,
"learning_rate": 4.5118014204749265e-06,
"loss": 1.0199,
"step": 7650
},
{
"epoch": 0.5506133869000646,
"grad_norm": 0.2431640625,
"learning_rate": 4.493866130999355e-06,
"loss": 1.0425,
"step": 7675
},
{
"epoch": 0.5524069158476218,
"grad_norm": 0.16015625,
"learning_rate": 4.4759308415237825e-06,
"loss": 1.0024,
"step": 7700
},
{
"epoch": 0.554200444795179,
"grad_norm": 0.67578125,
"learning_rate": 4.45799555204821e-06,
"loss": 0.9574,
"step": 7725
},
{
"epoch": 0.5559939737427362,
"grad_norm": 0.470703125,
"learning_rate": 4.4400602625726384e-06,
"loss": 0.9229,
"step": 7750
},
{
"epoch": 0.5577875026902934,
"grad_norm": 0.31640625,
"learning_rate": 4.422124973097066e-06,
"loss": 1.0553,
"step": 7775
},
{
"epoch": 0.5595810316378507,
"grad_norm": 0.203125,
"learning_rate": 4.404189683621494e-06,
"loss": 0.9049,
"step": 7800
},
{
"epoch": 0.5613745605854078,
"grad_norm": 0.34375,
"learning_rate": 4.386254394145922e-06,
"loss": 0.9489,
"step": 7825
},
{
"epoch": 0.563168089532965,
"grad_norm": 0.181640625,
"learning_rate": 4.3683191046703495e-06,
"loss": 1.0916,
"step": 7850
},
{
"epoch": 0.5649616184805223,
"grad_norm": 0.19921875,
"learning_rate": 4.350383815194777e-06,
"loss": 1.0332,
"step": 7875
},
{
"epoch": 0.5667551474280795,
"grad_norm": 0.208984375,
"learning_rate": 4.3324485257192055e-06,
"loss": 0.9569,
"step": 7900
},
{
"epoch": 0.5685486763756367,
"grad_norm": 0.177734375,
"learning_rate": 4.314513236243633e-06,
"loss": 0.9475,
"step": 7925
},
{
"epoch": 0.5703422053231939,
"grad_norm": 0.39453125,
"learning_rate": 4.2965779467680614e-06,
"loss": 0.9851,
"step": 7950
},
{
"epoch": 0.5721357342707512,
"grad_norm": 0.275390625,
"learning_rate": 4.278642657292489e-06,
"loss": 1.003,
"step": 7975
},
{
"epoch": 0.5739292632183084,
"grad_norm": 0.171875,
"learning_rate": 4.2607073678169166e-06,
"loss": 1.1598,
"step": 8000
},
{
"epoch": 0.5757227921658655,
"grad_norm": 0.474609375,
"learning_rate": 4.242772078341344e-06,
"loss": 1.072,
"step": 8025
},
{
"epoch": 0.5775163211134228,
"grad_norm": 0.11279296875,
"learning_rate": 4.2248367888657725e-06,
"loss": 0.8758,
"step": 8050
},
{
"epoch": 0.57930985006098,
"grad_norm": 0.259765625,
"learning_rate": 4.206901499390201e-06,
"loss": 1.1417,
"step": 8075
},
{
"epoch": 0.5811033790085371,
"grad_norm": 0.158203125,
"learning_rate": 4.1889662099146285e-06,
"loss": 0.9566,
"step": 8100
},
{
"epoch": 0.5828969079560944,
"grad_norm": 0.18359375,
"learning_rate": 4.171030920439056e-06,
"loss": 1.0096,
"step": 8125
},
{
"epoch": 0.5846904369036516,
"grad_norm": 0.1298828125,
"learning_rate": 4.153095630963484e-06,
"loss": 1.0485,
"step": 8150
},
{
"epoch": 0.5864839658512089,
"grad_norm": 0.12451171875,
"learning_rate": 4.135160341487912e-06,
"loss": 0.9747,
"step": 8175
},
{
"epoch": 0.588277494798766,
"grad_norm": 0.306640625,
"learning_rate": 4.1172250520123395e-06,
"loss": 0.9826,
"step": 8200
},
{
"epoch": 0.5900710237463233,
"grad_norm": 0.2080078125,
"learning_rate": 4.099289762536768e-06,
"loss": 1.0612,
"step": 8225
},
{
"epoch": 0.5918645526938805,
"grad_norm": 2.6875,
"learning_rate": 4.0813544730611955e-06,
"loss": 1.1022,
"step": 8250
},
{
"epoch": 0.5936580816414377,
"grad_norm": 0.1923828125,
"learning_rate": 4.063419183585623e-06,
"loss": 0.9072,
"step": 8275
},
{
"epoch": 0.5954516105889949,
"grad_norm": 0.283203125,
"learning_rate": 4.045483894110051e-06,
"loss": 1.0234,
"step": 8300
},
{
"epoch": 0.5972451395365521,
"grad_norm": 0.296875,
"learning_rate": 4.027548604634479e-06,
"loss": 1.0396,
"step": 8325
},
{
"epoch": 0.5990386684841094,
"grad_norm": 0.2890625,
"learning_rate": 4.009613315158907e-06,
"loss": 0.877,
"step": 8350
},
{
"epoch": 0.6008321974316665,
"grad_norm": 0.271484375,
"learning_rate": 3.991678025683335e-06,
"loss": 0.8808,
"step": 8375
},
{
"epoch": 0.6026257263792237,
"grad_norm": 0.1875,
"learning_rate": 3.9737427362077625e-06,
"loss": 1.0299,
"step": 8400
},
{
"epoch": 0.604419255326781,
"grad_norm": 1.09375,
"learning_rate": 3.95580744673219e-06,
"loss": 1.0278,
"step": 8425
},
{
"epoch": 0.6062127842743382,
"grad_norm": 0.19140625,
"learning_rate": 3.9378721572566185e-06,
"loss": 0.9991,
"step": 8450
},
{
"epoch": 0.6080063132218954,
"grad_norm": 0.2109375,
"learning_rate": 3.919936867781046e-06,
"loss": 0.9768,
"step": 8475
},
{
"epoch": 0.6097998421694526,
"grad_norm": 0.25390625,
"learning_rate": 3.9020015783054745e-06,
"loss": 1.0068,
"step": 8500
},
{
"epoch": 0.6115933711170098,
"grad_norm": 0.296875,
"learning_rate": 3.884066288829902e-06,
"loss": 1.0219,
"step": 8525
},
{
"epoch": 0.6133869000645671,
"grad_norm": 0.51953125,
"learning_rate": 3.86613099935433e-06,
"loss": 1.0836,
"step": 8550
},
{
"epoch": 0.6151804290121242,
"grad_norm": 0.1689453125,
"learning_rate": 3.848195709878758e-06,
"loss": 0.9227,
"step": 8575
},
{
"epoch": 0.6169739579596815,
"grad_norm": 0.5625,
"learning_rate": 3.8302604204031855e-06,
"loss": 1.0083,
"step": 8600
},
{
"epoch": 0.6187674869072387,
"grad_norm": 0.1943359375,
"learning_rate": 3.812325130927613e-06,
"loss": 1.0881,
"step": 8625
},
{
"epoch": 0.620561015854796,
"grad_norm": 0.875,
"learning_rate": 3.7943898414520415e-06,
"loss": 0.9992,
"step": 8650
},
{
"epoch": 0.6223545448023531,
"grad_norm": 0.515625,
"learning_rate": 3.7764545519764695e-06,
"loss": 1.0048,
"step": 8675
},
{
"epoch": 0.6241480737499103,
"grad_norm": 5.84375,
"learning_rate": 3.758519262500897e-06,
"loss": 1.023,
"step": 8700
},
{
"epoch": 0.6259416026974676,
"grad_norm": 0.228515625,
"learning_rate": 3.740583973025325e-06,
"loss": 0.892,
"step": 8725
},
{
"epoch": 0.6277351316450247,
"grad_norm": 3.59375,
"learning_rate": 3.7226486835497526e-06,
"loss": 0.9862,
"step": 8750
},
{
"epoch": 0.629528660592582,
"grad_norm": 0.193359375,
"learning_rate": 3.704713394074181e-06,
"loss": 1.0362,
"step": 8775
},
{
"epoch": 0.6313221895401392,
"grad_norm": 0.7890625,
"learning_rate": 3.6867781045986085e-06,
"loss": 1.0916,
"step": 8800
},
{
"epoch": 0.6331157184876964,
"grad_norm": 0.298828125,
"learning_rate": 3.6688428151230365e-06,
"loss": 0.9269,
"step": 8825
},
{
"epoch": 0.6349092474352536,
"grad_norm": 0.72265625,
"learning_rate": 3.650907525647464e-06,
"loss": 1.0274,
"step": 8850
},
{
"epoch": 0.6367027763828108,
"grad_norm": 0.2451171875,
"learning_rate": 3.632972236171892e-06,
"loss": 0.9751,
"step": 8875
},
{
"epoch": 0.638496305330368,
"grad_norm": 1.265625,
"learning_rate": 3.6150369466963196e-06,
"loss": 1.0403,
"step": 8900
},
{
"epoch": 0.6402898342779253,
"grad_norm": 0.2578125,
"learning_rate": 3.597101657220748e-06,
"loss": 0.9077,
"step": 8925
},
{
"epoch": 0.6420833632254824,
"grad_norm": 0.1630859375,
"learning_rate": 3.5791663677451756e-06,
"loss": 0.9537,
"step": 8950
},
{
"epoch": 0.6438768921730397,
"grad_norm": 0.1806640625,
"learning_rate": 3.5612310782696036e-06,
"loss": 1.0603,
"step": 8975
},
{
"epoch": 0.6456704211205969,
"grad_norm": 0.40625,
"learning_rate": 3.5432957887940315e-06,
"loss": 1.0238,
"step": 9000
},
{
"epoch": 0.647463950068154,
"grad_norm": 0.208984375,
"learning_rate": 3.525360499318459e-06,
"loss": 1.0003,
"step": 9025
},
{
"epoch": 0.6492574790157113,
"grad_norm": 0.3046875,
"learning_rate": 3.5074252098428875e-06,
"loss": 0.9724,
"step": 9050
},
{
"epoch": 0.6510510079632685,
"grad_norm": 0.232421875,
"learning_rate": 3.489489920367315e-06,
"loss": 1.0083,
"step": 9075
},
{
"epoch": 0.6528445369108258,
"grad_norm": 0.345703125,
"learning_rate": 3.471554630891743e-06,
"loss": 1.1098,
"step": 9100
},
{
"epoch": 0.6546380658583829,
"grad_norm": 0.62109375,
"learning_rate": 3.4536193414161706e-06,
"loss": 1.0617,
"step": 9125
},
{
"epoch": 0.6564315948059402,
"grad_norm": 0.283203125,
"learning_rate": 3.4356840519405986e-06,
"loss": 0.9711,
"step": 9150
},
{
"epoch": 0.6582251237534974,
"grad_norm": 0.2255859375,
"learning_rate": 3.417748762465026e-06,
"loss": 0.97,
"step": 9175
},
{
"epoch": 0.6600186527010546,
"grad_norm": 0.1904296875,
"learning_rate": 3.3998134729894545e-06,
"loss": 0.9635,
"step": 9200
},
{
"epoch": 0.6618121816486118,
"grad_norm": 2.84375,
"learning_rate": 3.381878183513882e-06,
"loss": 1.0514,
"step": 9225
},
{
"epoch": 0.663605710596169,
"grad_norm": 0.189453125,
"learning_rate": 3.36394289403831e-06,
"loss": 0.9953,
"step": 9250
},
{
"epoch": 0.6653992395437263,
"grad_norm": 0.2099609375,
"learning_rate": 3.3460076045627376e-06,
"loss": 1.0291,
"step": 9275
},
{
"epoch": 0.6671927684912834,
"grad_norm": 0.54296875,
"learning_rate": 3.3280723150871656e-06,
"loss": 1.0054,
"step": 9300
},
{
"epoch": 0.6689862974388406,
"grad_norm": 0.1982421875,
"learning_rate": 3.310137025611594e-06,
"loss": 0.9536,
"step": 9325
},
{
"epoch": 0.6707798263863979,
"grad_norm": 0.76953125,
"learning_rate": 3.2922017361360216e-06,
"loss": 1.0935,
"step": 9350
},
{
"epoch": 0.6725733553339551,
"grad_norm": 0.130859375,
"learning_rate": 3.2742664466604496e-06,
"loss": 1.0158,
"step": 9375
},
{
"epoch": 0.6743668842815123,
"grad_norm": 0.404296875,
"learning_rate": 3.256331157184877e-06,
"loss": 1.0243,
"step": 9400
},
{
"epoch": 0.6761604132290695,
"grad_norm": 0.236328125,
"learning_rate": 3.238395867709305e-06,
"loss": 0.9841,
"step": 9425
},
{
"epoch": 0.6779539421766267,
"grad_norm": 0.197265625,
"learning_rate": 3.2204605782337327e-06,
"loss": 0.9621,
"step": 9450
},
{
"epoch": 0.679747471124184,
"grad_norm": 0.4296875,
"learning_rate": 3.202525288758161e-06,
"loss": 1.2427,
"step": 9475
},
{
"epoch": 0.6815410000717411,
"grad_norm": 0.2470703125,
"learning_rate": 3.1845899992825886e-06,
"loss": 0.9871,
"step": 9500
},
{
"epoch": 0.6833345290192984,
"grad_norm": 0.37890625,
"learning_rate": 3.1666547098070166e-06,
"loss": 0.9559,
"step": 9525
},
{
"epoch": 0.6851280579668556,
"grad_norm": 4.25,
"learning_rate": 3.148719420331444e-06,
"loss": 1.1722,
"step": 9550
},
{
"epoch": 0.6869215869144129,
"grad_norm": 0.1875,
"learning_rate": 3.130784130855872e-06,
"loss": 1.0006,
"step": 9575
},
{
"epoch": 0.68871511586197,
"grad_norm": 0.1865234375,
"learning_rate": 3.1128488413802997e-06,
"loss": 0.9999,
"step": 9600
},
{
"epoch": 0.6905086448095272,
"grad_norm": 0.97265625,
"learning_rate": 3.094913551904728e-06,
"loss": 0.9543,
"step": 9625
},
{
"epoch": 0.6923021737570845,
"grad_norm": 0.4140625,
"learning_rate": 3.076978262429156e-06,
"loss": 0.9658,
"step": 9650
},
{
"epoch": 0.6940957027046416,
"grad_norm": 0.1748046875,
"learning_rate": 3.0590429729535836e-06,
"loss": 1.1451,
"step": 9675
},
{
"epoch": 0.6958892316521988,
"grad_norm": 0.166015625,
"learning_rate": 3.0411076834780116e-06,
"loss": 0.9441,
"step": 9700
},
{
"epoch": 0.6976827605997561,
"grad_norm": 0.5625,
"learning_rate": 3.023172394002439e-06,
"loss": 1.0021,
"step": 9725
},
{
"epoch": 0.6994762895473133,
"grad_norm": 0.2177734375,
"learning_rate": 3.0052371045268676e-06,
"loss": 1.0179,
"step": 9750
},
{
"epoch": 0.7012698184948705,
"grad_norm": 0.130859375,
"learning_rate": 2.987301815051295e-06,
"loss": 1.0105,
"step": 9775
},
{
"epoch": 0.7030633474424277,
"grad_norm": 0.2392578125,
"learning_rate": 2.969366525575723e-06,
"loss": 0.8946,
"step": 9800
},
{
"epoch": 0.704856876389985,
"grad_norm": 0.79296875,
"learning_rate": 2.9514312361001507e-06,
"loss": 0.9994,
"step": 9825
},
{
"epoch": 0.7066504053375422,
"grad_norm": 0.1611328125,
"learning_rate": 2.9334959466245787e-06,
"loss": 0.9527,
"step": 9850
},
{
"epoch": 0.7084439342850993,
"grad_norm": 0.171875,
"learning_rate": 2.9155606571490062e-06,
"loss": 1.0214,
"step": 9875
},
{
"epoch": 0.7102374632326566,
"grad_norm": 0.63671875,
"learning_rate": 2.8976253676734346e-06,
"loss": 1.1297,
"step": 9900
},
{
"epoch": 0.7120309921802138,
"grad_norm": 0.1552734375,
"learning_rate": 2.8796900781978626e-06,
"loss": 0.9318,
"step": 9925
},
{
"epoch": 0.713824521127771,
"grad_norm": 0.435546875,
"learning_rate": 2.86175478872229e-06,
"loss": 0.9355,
"step": 9950
},
{
"epoch": 0.7156180500753282,
"grad_norm": 0.447265625,
"learning_rate": 2.843819499246718e-06,
"loss": 0.9861,
"step": 9975
},
{
"epoch": 0.7174115790228854,
"grad_norm": 0.1728515625,
"learning_rate": 2.8258842097711457e-06,
"loss": 1.0028,
"step": 10000
},
{
"epoch": 0.7192051079704427,
"grad_norm": 1.1875,
"learning_rate": 2.807948920295574e-06,
"loss": 0.9811,
"step": 10025
},
{
"epoch": 0.7209986369179998,
"grad_norm": 0.3359375,
"learning_rate": 2.7900136308200017e-06,
"loss": 1.025,
"step": 10050
},
{
"epoch": 0.7227921658655571,
"grad_norm": 0.1875,
"learning_rate": 2.7720783413444296e-06,
"loss": 0.9284,
"step": 10075
},
{
"epoch": 0.7245856948131143,
"grad_norm": 0.380859375,
"learning_rate": 2.754143051868857e-06,
"loss": 1.038,
"step": 10100
},
{
"epoch": 0.7263792237606715,
"grad_norm": 0.58984375,
"learning_rate": 2.736207762393285e-06,
"loss": 1.0082,
"step": 10125
},
{
"epoch": 0.7281727527082287,
"grad_norm": 0.1875,
"learning_rate": 2.7182724729177127e-06,
"loss": 1.0237,
"step": 10150
},
{
"epoch": 0.7299662816557859,
"grad_norm": 0.42578125,
"learning_rate": 2.700337183442141e-06,
"loss": 0.8815,
"step": 10175
},
{
"epoch": 0.7317598106033432,
"grad_norm": 0.49609375,
"learning_rate": 2.6824018939665687e-06,
"loss": 0.9161,
"step": 10200
},
{
"epoch": 0.7335533395509003,
"grad_norm": 0.1494140625,
"learning_rate": 2.6644666044909967e-06,
"loss": 1.0186,
"step": 10225
},
{
"epoch": 0.7353468684984575,
"grad_norm": 0.19140625,
"learning_rate": 2.6465313150154247e-06,
"loss": 0.8941,
"step": 10250
},
{
"epoch": 0.7371403974460148,
"grad_norm": 0.1845703125,
"learning_rate": 2.6285960255398522e-06,
"loss": 0.9991,
"step": 10275
},
{
"epoch": 0.738933926393572,
"grad_norm": 0.1669921875,
"learning_rate": 2.6106607360642806e-06,
"loss": 1.1567,
"step": 10300
},
{
"epoch": 0.7407274553411292,
"grad_norm": 0.6796875,
"learning_rate": 2.592725446588708e-06,
"loss": 0.9901,
"step": 10325
},
{
"epoch": 0.7425209842886864,
"grad_norm": 0.1640625,
"learning_rate": 2.574790157113136e-06,
"loss": 1.0483,
"step": 10350
},
{
"epoch": 0.7443145132362436,
"grad_norm": 0.34375,
"learning_rate": 2.5568548676375637e-06,
"loss": 0.9463,
"step": 10375
},
{
"epoch": 0.7461080421838009,
"grad_norm": 0.146484375,
"learning_rate": 2.5389195781619917e-06,
"loss": 0.904,
"step": 10400
},
{
"epoch": 0.747901571131358,
"grad_norm": 0.44921875,
"learning_rate": 2.5209842886864193e-06,
"loss": 0.9677,
"step": 10425
},
{
"epoch": 0.7496951000789153,
"grad_norm": 0.357421875,
"learning_rate": 2.5030489992108477e-06,
"loss": 1.1016,
"step": 10450
},
{
"epoch": 0.7514886290264725,
"grad_norm": 0.15234375,
"learning_rate": 2.4851137097352752e-06,
"loss": 0.9498,
"step": 10475
},
{
"epoch": 0.7532821579740298,
"grad_norm": 0.26953125,
"learning_rate": 2.467178420259703e-06,
"loss": 0.9623,
"step": 10500
},
{
"epoch": 0.7550756869215869,
"grad_norm": 0.251953125,
"learning_rate": 2.449243130784131e-06,
"loss": 0.9152,
"step": 10525
},
{
"epoch": 0.7568692158691441,
"grad_norm": 0.1435546875,
"learning_rate": 2.4313078413085587e-06,
"loss": 0.9425,
"step": 10550
},
{
"epoch": 0.7586627448167014,
"grad_norm": 0.458984375,
"learning_rate": 2.4133725518329867e-06,
"loss": 1.0248,
"step": 10575
},
{
"epoch": 0.7604562737642585,
"grad_norm": 0.283203125,
"learning_rate": 2.3954372623574147e-06,
"loss": 0.8791,
"step": 10600
},
{
"epoch": 0.7622498027118157,
"grad_norm": 0.177734375,
"learning_rate": 2.3775019728818423e-06,
"loss": 0.9201,
"step": 10625
},
{
"epoch": 0.764043331659373,
"grad_norm": 0.64453125,
"learning_rate": 2.3595666834062702e-06,
"loss": 1.0206,
"step": 10650
},
{
"epoch": 0.7658368606069302,
"grad_norm": 0.46484375,
"learning_rate": 2.341631393930698e-06,
"loss": 1.0521,
"step": 10675
},
{
"epoch": 0.7676303895544874,
"grad_norm": 0.1591796875,
"learning_rate": 2.323696104455126e-06,
"loss": 1.0034,
"step": 10700
},
{
"epoch": 0.7694239185020446,
"grad_norm": 0.44921875,
"learning_rate": 2.305760814979554e-06,
"loss": 0.957,
"step": 10725
},
{
"epoch": 0.7712174474496019,
"grad_norm": 0.2490234375,
"learning_rate": 2.2878255255039817e-06,
"loss": 1.1092,
"step": 10750
},
{
"epoch": 0.7730109763971591,
"grad_norm": 0.3125,
"learning_rate": 2.2698902360284097e-06,
"loss": 0.9977,
"step": 10775
},
{
"epoch": 0.7748045053447162,
"grad_norm": 0.232421875,
"learning_rate": 2.2519549465528377e-06,
"loss": 1.0989,
"step": 10800
},
{
"epoch": 0.7765980342922735,
"grad_norm": 0.11767578125,
"learning_rate": 2.2340196570772652e-06,
"loss": 1.0842,
"step": 10825
},
{
"epoch": 0.7783915632398307,
"grad_norm": 0.4921875,
"learning_rate": 2.2160843676016932e-06,
"loss": 1.0147,
"step": 10850
},
{
"epoch": 0.7801850921873879,
"grad_norm": 0.328125,
"learning_rate": 2.198149078126121e-06,
"loss": 1.0062,
"step": 10875
},
{
"epoch": 0.7819786211349451,
"grad_norm": 0.177734375,
"learning_rate": 2.1802137886505488e-06,
"loss": 1.0242,
"step": 10900
},
{
"epoch": 0.7837721500825023,
"grad_norm": 0.275390625,
"learning_rate": 2.1622784991749767e-06,
"loss": 0.9424,
"step": 10925
},
{
"epoch": 0.7855656790300596,
"grad_norm": 0.2275390625,
"learning_rate": 2.1443432096994047e-06,
"loss": 0.931,
"step": 10950
},
{
"epoch": 0.7873592079776167,
"grad_norm": 0.353515625,
"learning_rate": 2.1264079202238323e-06,
"loss": 0.9701,
"step": 10975
},
{
"epoch": 0.789152736925174,
"grad_norm": 0.13671875,
"learning_rate": 2.1084726307482607e-06,
"loss": 1.0076,
"step": 11000
},
{
"epoch": 0.7909462658727312,
"grad_norm": 0.51953125,
"learning_rate": 2.0905373412726882e-06,
"loss": 0.9799,
"step": 11025
},
{
"epoch": 0.7927397948202884,
"grad_norm": 0.6875,
"learning_rate": 2.0726020517971162e-06,
"loss": 1.0588,
"step": 11050
},
{
"epoch": 0.7945333237678456,
"grad_norm": 0.51953125,
"learning_rate": 2.054666762321544e-06,
"loss": 0.932,
"step": 11075
},
{
"epoch": 0.7963268527154028,
"grad_norm": 0.1513671875,
"learning_rate": 2.0367314728459718e-06,
"loss": 1.053,
"step": 11100
},
{
"epoch": 0.7981203816629601,
"grad_norm": 0.353515625,
"learning_rate": 2.0187961833703997e-06,
"loss": 0.9572,
"step": 11125
},
{
"epoch": 0.7999139106105173,
"grad_norm": 0.296875,
"learning_rate": 2.0008608938948277e-06,
"loss": 1.0769,
"step": 11150
},
{
"epoch": 0.8017074395580744,
"grad_norm": 0.302734375,
"learning_rate": 1.9829256044192553e-06,
"loss": 0.9988,
"step": 11175
},
{
"epoch": 0.8035009685056317,
"grad_norm": 0.3984375,
"learning_rate": 1.9649903149436833e-06,
"loss": 1.0431,
"step": 11200
},
{
"epoch": 0.8052944974531889,
"grad_norm": 0.13671875,
"learning_rate": 1.9470550254681112e-06,
"loss": 1.0488,
"step": 11225
},
{
"epoch": 0.8070880264007461,
"grad_norm": 0.287109375,
"learning_rate": 1.929119735992539e-06,
"loss": 0.9088,
"step": 11250
},
{
"epoch": 0.8088815553483033,
"grad_norm": 0.384765625,
"learning_rate": 1.9111844465169668e-06,
"loss": 1.0106,
"step": 11275
},
{
"epoch": 0.8106750842958605,
"grad_norm": 0.203125,
"learning_rate": 1.8932491570413948e-06,
"loss": 0.8832,
"step": 11300
},
{
"epoch": 0.8124686132434178,
"grad_norm": 0.421875,
"learning_rate": 1.8753138675658227e-06,
"loss": 0.9656,
"step": 11325
},
{
"epoch": 0.8142621421909749,
"grad_norm": 0.25390625,
"learning_rate": 1.8573785780902507e-06,
"loss": 0.9692,
"step": 11350
},
{
"epoch": 0.8160556711385322,
"grad_norm": 0.1669921875,
"learning_rate": 1.8394432886146785e-06,
"loss": 0.9788,
"step": 11375
},
{
"epoch": 0.8178492000860894,
"grad_norm": 0.1259765625,
"learning_rate": 1.8215079991391063e-06,
"loss": 1.0019,
"step": 11400
},
{
"epoch": 0.8196427290336467,
"grad_norm": 0.80078125,
"learning_rate": 1.8035727096635342e-06,
"loss": 0.9674,
"step": 11425
},
{
"epoch": 0.8214362579812038,
"grad_norm": 0.1572265625,
"learning_rate": 1.785637420187962e-06,
"loss": 0.9064,
"step": 11450
},
{
"epoch": 0.823229786928761,
"grad_norm": 3.34375,
"learning_rate": 1.7677021307123898e-06,
"loss": 1.1879,
"step": 11475
},
{
"epoch": 0.8250233158763183,
"grad_norm": 0.515625,
"learning_rate": 1.7497668412368178e-06,
"loss": 1.148,
"step": 11500
},
{
"epoch": 0.8268168448238754,
"grad_norm": 1.125,
"learning_rate": 1.7318315517612455e-06,
"loss": 0.9733,
"step": 11525
},
{
"epoch": 0.8286103737714327,
"grad_norm": 0.265625,
"learning_rate": 1.7138962622856733e-06,
"loss": 1.0721,
"step": 11550
},
{
"epoch": 0.8304039027189899,
"grad_norm": 0.1474609375,
"learning_rate": 1.6959609728101013e-06,
"loss": 1.0644,
"step": 11575
},
{
"epoch": 0.8321974316665471,
"grad_norm": 0.1728515625,
"learning_rate": 1.678025683334529e-06,
"loss": 0.9961,
"step": 11600
},
{
"epoch": 0.8339909606141043,
"grad_norm": 0.2060546875,
"learning_rate": 1.6600903938589572e-06,
"loss": 0.9952,
"step": 11625
},
{
"epoch": 0.8357844895616615,
"grad_norm": 0.146484375,
"learning_rate": 1.642155104383385e-06,
"loss": 0.9518,
"step": 11650
},
{
"epoch": 0.8375780185092188,
"grad_norm": 0.337890625,
"learning_rate": 1.6242198149078128e-06,
"loss": 0.9067,
"step": 11675
},
{
"epoch": 0.839371547456776,
"grad_norm": 0.34375,
"learning_rate": 1.6062845254322408e-06,
"loss": 0.9853,
"step": 11700
},
{
"epoch": 0.8411650764043331,
"grad_norm": 0.1279296875,
"learning_rate": 1.5883492359566685e-06,
"loss": 0.9909,
"step": 11725
},
{
"epoch": 0.8429586053518904,
"grad_norm": 0.26953125,
"learning_rate": 1.5704139464810963e-06,
"loss": 1.0348,
"step": 11750
},
{
"epoch": 0.8447521342994476,
"grad_norm": 0.298828125,
"learning_rate": 1.5524786570055243e-06,
"loss": 0.9372,
"step": 11775
},
{
"epoch": 0.8465456632470048,
"grad_norm": 0.375,
"learning_rate": 1.534543367529952e-06,
"loss": 1.0784,
"step": 11800
},
{
"epoch": 0.848339192194562,
"grad_norm": 0.1279296875,
"learning_rate": 1.5166080780543798e-06,
"loss": 0.9116,
"step": 11825
},
{
"epoch": 0.8501327211421192,
"grad_norm": 0.1171875,
"learning_rate": 1.4986727885788078e-06,
"loss": 1.0438,
"step": 11850
},
{
"epoch": 0.8519262500896765,
"grad_norm": 0.1279296875,
"learning_rate": 1.4807374991032356e-06,
"loss": 0.9669,
"step": 11875
},
{
"epoch": 0.8537197790372336,
"grad_norm": 0.201171875,
"learning_rate": 1.4628022096276633e-06,
"loss": 0.9231,
"step": 11900
},
{
"epoch": 0.8555133079847909,
"grad_norm": 0.3984375,
"learning_rate": 1.4448669201520913e-06,
"loss": 1.0184,
"step": 11925
},
{
"epoch": 0.8573068369323481,
"grad_norm": 0.6953125,
"learning_rate": 1.4269316306765193e-06,
"loss": 1.1583,
"step": 11950
},
{
"epoch": 0.8591003658799053,
"grad_norm": 0.30078125,
"learning_rate": 1.4089963412009473e-06,
"loss": 1.0219,
"step": 11975
},
{
"epoch": 0.8608938948274625,
"grad_norm": 0.16015625,
"learning_rate": 1.391061051725375e-06,
"loss": 0.9889,
"step": 12000
},
{
"epoch": 0.8626874237750197,
"grad_norm": 0.70703125,
"learning_rate": 1.3731257622498028e-06,
"loss": 0.9859,
"step": 12025
},
{
"epoch": 0.864480952722577,
"grad_norm": 0.37890625,
"learning_rate": 1.3551904727742308e-06,
"loss": 1.044,
"step": 12050
},
{
"epoch": 0.8662744816701342,
"grad_norm": 0.1513671875,
"learning_rate": 1.3372551832986586e-06,
"loss": 1.1042,
"step": 12075
},
{
"epoch": 0.8680680106176913,
"grad_norm": 0.11328125,
"learning_rate": 1.3193198938230863e-06,
"loss": 0.8639,
"step": 12100
},
{
"epoch": 0.8698615395652486,
"grad_norm": 0.2109375,
"learning_rate": 1.3013846043475143e-06,
"loss": 1.0858,
"step": 12125
},
{
"epoch": 0.8716550685128058,
"grad_norm": 0.12109375,
"learning_rate": 1.283449314871942e-06,
"loss": 0.86,
"step": 12150
},
{
"epoch": 0.873448597460363,
"grad_norm": 0.365234375,
"learning_rate": 1.2655140253963699e-06,
"loss": 1.0895,
"step": 12175
},
{
"epoch": 0.8752421264079202,
"grad_norm": 0.318359375,
"learning_rate": 1.2475787359207978e-06,
"loss": 1.2286,
"step": 12200
},
{
"epoch": 0.8770356553554775,
"grad_norm": 0.515625,
"learning_rate": 1.2296434464452258e-06,
"loss": 0.9593,
"step": 12225
},
{
"epoch": 0.8788291843030347,
"grad_norm": 0.1904296875,
"learning_rate": 1.2117081569696536e-06,
"loss": 0.8951,
"step": 12250
},
{
"epoch": 0.8806227132505918,
"grad_norm": 0.150390625,
"learning_rate": 1.1937728674940814e-06,
"loss": 1.1512,
"step": 12275
},
{
"epoch": 0.8824162421981491,
"grad_norm": 5.53125,
"learning_rate": 1.1758375780185093e-06,
"loss": 0.9898,
"step": 12300
},
{
"epoch": 0.8842097711457063,
"grad_norm": 0.212890625,
"learning_rate": 1.1579022885429373e-06,
"loss": 1.0355,
"step": 12325
},
{
"epoch": 0.8860033000932636,
"grad_norm": 0.287109375,
"learning_rate": 1.139966999067365e-06,
"loss": 0.9758,
"step": 12350
},
{
"epoch": 0.8877968290408207,
"grad_norm": 0.357421875,
"learning_rate": 1.1220317095917929e-06,
"loss": 0.9874,
"step": 12375
},
{
"epoch": 0.8895903579883779,
"grad_norm": 0.6640625,
"learning_rate": 1.1040964201162208e-06,
"loss": 1.0189,
"step": 12400
},
{
"epoch": 0.8913838869359352,
"grad_norm": 0.369140625,
"learning_rate": 1.0861611306406486e-06,
"loss": 1.0027,
"step": 12425
},
{
"epoch": 0.8931774158834923,
"grad_norm": 0.306640625,
"learning_rate": 1.0682258411650764e-06,
"loss": 0.9254,
"step": 12450
},
{
"epoch": 0.8949709448310496,
"grad_norm": 0.1767578125,
"learning_rate": 1.0502905516895044e-06,
"loss": 0.9798,
"step": 12475
},
{
"epoch": 0.8967644737786068,
"grad_norm": 0.2216796875,
"learning_rate": 1.0323552622139323e-06,
"loss": 1.0012,
"step": 12500
},
{
"epoch": 0.898558002726164,
"grad_norm": 0.65234375,
"learning_rate": 1.0144199727383601e-06,
"loss": 1.1634,
"step": 12525
},
{
"epoch": 0.9003515316737212,
"grad_norm": 0.236328125,
"learning_rate": 9.964846832627879e-07,
"loss": 1.03,
"step": 12550
},
{
"epoch": 0.9021450606212784,
"grad_norm": 0.306640625,
"learning_rate": 9.785493937872159e-07,
"loss": 0.9248,
"step": 12575
},
{
"epoch": 0.9039385895688357,
"grad_norm": 0.1669921875,
"learning_rate": 9.606141043116436e-07,
"loss": 0.8934,
"step": 12600
},
{
"epoch": 0.9057321185163929,
"grad_norm": 0.1865234375,
"learning_rate": 9.426788148360716e-07,
"loss": 1.1064,
"step": 12625
},
{
"epoch": 0.90752564746395,
"grad_norm": 0.1591796875,
"learning_rate": 9.247435253604995e-07,
"loss": 0.9369,
"step": 12650
},
{
"epoch": 0.9093191764115073,
"grad_norm": 0.2412109375,
"learning_rate": 9.068082358849273e-07,
"loss": 1.0222,
"step": 12675
},
{
"epoch": 0.9111127053590645,
"grad_norm": 0.34765625,
"learning_rate": 8.888729464093551e-07,
"loss": 0.9838,
"step": 12700
},
{
"epoch": 0.9129062343066217,
"grad_norm": 0.19921875,
"learning_rate": 8.70937656933783e-07,
"loss": 0.8956,
"step": 12725
},
{
"epoch": 0.9146997632541789,
"grad_norm": 0.17578125,
"learning_rate": 8.530023674582108e-07,
"loss": 0.9418,
"step": 12750
},
{
"epoch": 0.9164932922017361,
"grad_norm": 0.5234375,
"learning_rate": 8.350670779826386e-07,
"loss": 1.0145,
"step": 12775
},
{
"epoch": 0.9182868211492934,
"grad_norm": 0.2138671875,
"learning_rate": 8.171317885070666e-07,
"loss": 0.9934,
"step": 12800
},
{
"epoch": 0.9200803500968505,
"grad_norm": 0.375,
"learning_rate": 7.991964990314945e-07,
"loss": 1.1503,
"step": 12825
},
{
"epoch": 0.9218738790444078,
"grad_norm": 0.302734375,
"learning_rate": 7.812612095559223e-07,
"loss": 1.2094,
"step": 12850
},
{
"epoch": 0.923667407991965,
"grad_norm": 0.1552734375,
"learning_rate": 7.633259200803501e-07,
"loss": 1.0571,
"step": 12875
},
{
"epoch": 0.9254609369395222,
"grad_norm": 0.2734375,
"learning_rate": 7.45390630604778e-07,
"loss": 1.0455,
"step": 12900
},
{
"epoch": 0.9272544658870794,
"grad_norm": 0.431640625,
"learning_rate": 7.274553411292058e-07,
"loss": 1.0746,
"step": 12925
},
{
"epoch": 0.9290479948346366,
"grad_norm": 0.12890625,
"learning_rate": 7.095200516536338e-07,
"loss": 1.0629,
"step": 12950
},
{
"epoch": 0.9308415237821939,
"grad_norm": 0.177734375,
"learning_rate": 6.915847621780616e-07,
"loss": 0.9921,
"step": 12975
},
{
"epoch": 0.9326350527297511,
"grad_norm": 0.302734375,
"learning_rate": 6.736494727024895e-07,
"loss": 0.9427,
"step": 13000
},
{
"epoch": 0.9344285816773082,
"grad_norm": 0.1650390625,
"learning_rate": 6.557141832269173e-07,
"loss": 0.9967,
"step": 13025
},
{
"epoch": 0.9362221106248655,
"grad_norm": 0.19140625,
"learning_rate": 6.377788937513452e-07,
"loss": 1.1217,
"step": 13050
},
{
"epoch": 0.9380156395724227,
"grad_norm": 0.1318359375,
"learning_rate": 6.19843604275773e-07,
"loss": 1.0301,
"step": 13075
},
{
"epoch": 0.9398091685199799,
"grad_norm": 1.6171875,
"learning_rate": 6.019083148002009e-07,
"loss": 0.9226,
"step": 13100
},
{
"epoch": 0.9416026974675371,
"grad_norm": 0.953125,
"learning_rate": 5.839730253246288e-07,
"loss": 1.1147,
"step": 13125
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.2294921875,
"learning_rate": 5.660377358490567e-07,
"loss": 1.1343,
"step": 13150
},
{
"epoch": 0.9451897553626516,
"grad_norm": 0.1923828125,
"learning_rate": 5.481024463734845e-07,
"loss": 1.1753,
"step": 13175
},
{
"epoch": 0.9469832843102087,
"grad_norm": 0.1767578125,
"learning_rate": 5.301671568979123e-07,
"loss": 1.0303,
"step": 13200
},
{
"epoch": 0.948776813257766,
"grad_norm": 1.6484375,
"learning_rate": 5.122318674223403e-07,
"loss": 1.0228,
"step": 13225
},
{
"epoch": 0.9505703422053232,
"grad_norm": 0.169921875,
"learning_rate": 4.942965779467681e-07,
"loss": 0.9644,
"step": 13250
},
{
"epoch": 0.9523638711528805,
"grad_norm": 0.177734375,
"learning_rate": 4.7636128847119593e-07,
"loss": 0.9838,
"step": 13275
},
{
"epoch": 0.9541574001004376,
"grad_norm": 0.162109375,
"learning_rate": 4.5842599899562386e-07,
"loss": 0.8988,
"step": 13300
},
{
"epoch": 0.9559509290479948,
"grad_norm": 0.1357421875,
"learning_rate": 4.404907095200517e-07,
"loss": 1.2,
"step": 13325
},
{
"epoch": 0.9577444579955521,
"grad_norm": 0.2470703125,
"learning_rate": 4.225554200444795e-07,
"loss": 0.9431,
"step": 13350
},
{
"epoch": 0.9595379869431092,
"grad_norm": 0.359375,
"learning_rate": 4.0462013056890743e-07,
"loss": 0.9729,
"step": 13375
},
{
"epoch": 0.9613315158906665,
"grad_norm": 0.212890625,
"learning_rate": 3.8668484109333525e-07,
"loss": 1.1659,
"step": 13400
},
{
"epoch": 0.9631250448382237,
"grad_norm": 0.546875,
"learning_rate": 3.6874955161776313e-07,
"loss": 0.9316,
"step": 13425
},
{
"epoch": 0.9649185737857809,
"grad_norm": 0.1728515625,
"learning_rate": 3.50814262142191e-07,
"loss": 0.9412,
"step": 13450
},
{
"epoch": 0.9667121027333381,
"grad_norm": 5.65625,
"learning_rate": 3.328789726666189e-07,
"loss": 0.9748,
"step": 13475
},
{
"epoch": 0.9685056316808953,
"grad_norm": 0.1796875,
"learning_rate": 3.149436831910467e-07,
"loss": 0.9574,
"step": 13500
},
{
"epoch": 0.9702991606284526,
"grad_norm": 0.2490234375,
"learning_rate": 2.970083937154746e-07,
"loss": 0.9165,
"step": 13525
},
{
"epoch": 0.9720926895760098,
"grad_norm": 0.4375,
"learning_rate": 2.7907310423990245e-07,
"loss": 0.9801,
"step": 13550
},
{
"epoch": 0.9738862185235669,
"grad_norm": 0.24609375,
"learning_rate": 2.611378147643303e-07,
"loss": 0.9946,
"step": 13575
},
{
"epoch": 0.9756797474711242,
"grad_norm": 1.03125,
"learning_rate": 2.4320252528875815e-07,
"loss": 0.9575,
"step": 13600
},
{
"epoch": 0.9774732764186814,
"grad_norm": 0.267578125,
"learning_rate": 2.2526723581318605e-07,
"loss": 0.9646,
"step": 13625
},
{
"epoch": 0.9792668053662386,
"grad_norm": 0.1708984375,
"learning_rate": 2.0733194633761392e-07,
"loss": 1.0246,
"step": 13650
},
{
"epoch": 0.9810603343137958,
"grad_norm": 0.453125,
"learning_rate": 1.8939665686204177e-07,
"loss": 0.9829,
"step": 13675
},
{
"epoch": 0.982853863261353,
"grad_norm": 0.310546875,
"learning_rate": 1.7146136738646965e-07,
"loss": 1.0098,
"step": 13700
},
{
"epoch": 0.9846473922089103,
"grad_norm": 0.3203125,
"learning_rate": 1.535260779108975e-07,
"loss": 0.9828,
"step": 13725
},
{
"epoch": 0.9864409211564674,
"grad_norm": 0.1611328125,
"learning_rate": 1.3559078843532534e-07,
"loss": 0.9689,
"step": 13750
},
{
"epoch": 0.9882344501040247,
"grad_norm": 0.1904296875,
"learning_rate": 1.1765549895975323e-07,
"loss": 1.0347,
"step": 13775
},
{
"epoch": 0.9900279790515819,
"grad_norm": 0.1669921875,
"learning_rate": 9.972020948418109e-08,
"loss": 1.0367,
"step": 13800
},
{
"epoch": 0.9918215079991392,
"grad_norm": 0.28515625,
"learning_rate": 8.178492000860894e-08,
"loss": 0.904,
"step": 13825
},
{
"epoch": 0.9936150369466963,
"grad_norm": 0.16796875,
"learning_rate": 6.384963053303682e-08,
"loss": 0.9934,
"step": 13850
},
{
"epoch": 0.9954085658942535,
"grad_norm": 0.318359375,
"learning_rate": 4.591434105746467e-08,
"loss": 1.009,
"step": 13875
},
{
"epoch": 0.9972020948418108,
"grad_norm": 0.25390625,
"learning_rate": 2.7979051581892535e-08,
"loss": 0.9746,
"step": 13900
},
{
"epoch": 0.998995623789368,
"grad_norm": 0.310546875,
"learning_rate": 1.0043762106320397e-08,
"loss": 1.0373,
"step": 13925
}
],
"logging_steps": 25,
"max_steps": 13939,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.128724037953004e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}