{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.000763553066938,
"eval_steps": 500,
"global_step": 983,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010180707559175363,
"grad_norm": 13.50528621673584,
"learning_rate": 2e-05,
"loss": 5.1969,
"step": 1
},
{
"epoch": 0.0020361415118350726,
"grad_norm": 19.771379470825195,
"learning_rate": 4e-05,
"loss": 7.7524,
"step": 2
},
{
"epoch": 0.003054212267752609,
"grad_norm": 19.734294891357422,
"learning_rate": 6e-05,
"loss": 8.0409,
"step": 3
},
{
"epoch": 0.004072283023670145,
"grad_norm": 24.712677001953125,
"learning_rate": 8e-05,
"loss": 7.7622,
"step": 4
},
{
"epoch": 0.0050903537795876815,
"grad_norm": 26.32750701904297,
"learning_rate": 0.0001,
"loss": 9.8505,
"step": 5
},
{
"epoch": 0.006108424535505218,
"grad_norm": 29.443405151367188,
"learning_rate": 9.999974203447433e-05,
"loss": 9.1155,
"step": 6
},
{
"epoch": 0.007126495291422754,
"grad_norm": 22.42616844177246,
"learning_rate": 9.999896814055916e-05,
"loss": 6.3676,
"step": 7
},
{
"epoch": 0.00814456604734029,
"grad_norm": 27.119375228881836,
"learning_rate": 9.999767832624001e-05,
"loss": 7.4125,
"step": 8
},
{
"epoch": 0.009162636803257827,
"grad_norm": 15.422072410583496,
"learning_rate": 9.999587260482597e-05,
"loss": 3.5393,
"step": 9
},
{
"epoch": 0.010180707559175363,
"grad_norm": 13.826335906982422,
"learning_rate": 9.999355099494962e-05,
"loss": 2.7136,
"step": 10
},
{
"epoch": 0.0111987783150929,
"grad_norm": 14.258417129516602,
"learning_rate": 9.999071352056675e-05,
"loss": 2.6158,
"step": 11
},
{
"epoch": 0.012216849071010435,
"grad_norm": 14.128898620605469,
"learning_rate": 9.99873602109562e-05,
"loss": 3.0587,
"step": 12
},
{
"epoch": 0.013234919826927972,
"grad_norm": 12.319880485534668,
"learning_rate": 9.998349110071949e-05,
"loss": 2.6488,
"step": 13
},
{
"epoch": 0.014252990582845508,
"grad_norm": 15.33985424041748,
"learning_rate": 9.99791062297805e-05,
"loss": 3.1476,
"step": 14
},
{
"epoch": 0.015271061338763044,
"grad_norm": 13.602853775024414,
"learning_rate": 9.99742056433851e-05,
"loss": 2.796,
"step": 15
},
{
"epoch": 0.01628913209468058,
"grad_norm": 9.083526611328125,
"learning_rate": 9.996878939210049e-05,
"loss": 2.0671,
"step": 16
},
{
"epoch": 0.017307202850598117,
"grad_norm": 10.980738639831543,
"learning_rate": 9.9962857531815e-05,
"loss": 2.629,
"step": 17
},
{
"epoch": 0.018325273606515653,
"grad_norm": 11.885966300964355,
"learning_rate": 9.99564101237372e-05,
"loss": 3.1609,
"step": 18
},
{
"epoch": 0.01934334436243319,
"grad_norm": 11.95373249053955,
"learning_rate": 9.994944723439546e-05,
"loss": 3.2291,
"step": 19
},
{
"epoch": 0.020361415118350726,
"grad_norm": 10.973458290100098,
"learning_rate": 9.994196893563721e-05,
"loss": 2.7778,
"step": 20
},
{
"epoch": 0.021379485874268262,
"grad_norm": 12.775362014770508,
"learning_rate": 9.993397530462818e-05,
"loss": 3.136,
"step": 21
},
{
"epoch": 0.0223975566301858,
"grad_norm": 10.916693687438965,
"learning_rate": 9.992546642385158e-05,
"loss": 2.3531,
"step": 22
},
{
"epoch": 0.023415627386103335,
"grad_norm": 12.338353157043457,
"learning_rate": 9.99164423811074e-05,
"loss": 3.6968,
"step": 23
},
{
"epoch": 0.02443369814202087,
"grad_norm": 13.168731689453125,
"learning_rate": 9.990690326951126e-05,
"loss": 3.0163,
"step": 24
},
{
"epoch": 0.025451768897938407,
"grad_norm": 11.88056755065918,
"learning_rate": 9.989684918749365e-05,
"loss": 3.0201,
"step": 25
},
{
"epoch": 0.026469839653855944,
"grad_norm": 9.301902770996094,
"learning_rate": 9.988628023879883e-05,
"loss": 2.6577,
"step": 26
},
{
"epoch": 0.02748791040977348,
"grad_norm": 10.02252197265625,
"learning_rate": 9.987519653248378e-05,
"loss": 2.4519,
"step": 27
},
{
"epoch": 0.028505981165691016,
"grad_norm": 11.7409029006958,
"learning_rate": 9.986359818291706e-05,
"loss": 3.1898,
"step": 28
},
{
"epoch": 0.029524051921608552,
"grad_norm": 10.492666244506836,
"learning_rate": 9.985148530977767e-05,
"loss": 2.7226,
"step": 29
},
{
"epoch": 0.03054212267752609,
"grad_norm": 12.711854934692383,
"learning_rate": 9.983885803805372e-05,
"loss": 2.7205,
"step": 30
},
{
"epoch": 0.03156019343344362,
"grad_norm": 12.937368392944336,
"learning_rate": 9.982571649804126e-05,
"loss": 3.204,
"step": 31
},
{
"epoch": 0.03257826418936116,
"grad_norm": 13.292618751525879,
"learning_rate": 9.981206082534286e-05,
"loss": 3.4911,
"step": 32
},
{
"epoch": 0.033596334945278694,
"grad_norm": 9.724308967590332,
"learning_rate": 9.979789116086625e-05,
"loss": 2.8603,
"step": 33
},
{
"epoch": 0.034614405701196234,
"grad_norm": 8.932650566101074,
"learning_rate": 9.978320765082278e-05,
"loss": 2.465,
"step": 34
},
{
"epoch": 0.03563247645711377,
"grad_norm": 10.732564926147461,
"learning_rate": 9.976801044672608e-05,
"loss": 2.9517,
"step": 35
},
{
"epoch": 0.036650547213031306,
"grad_norm": 11.971003532409668,
"learning_rate": 9.97522997053903e-05,
"loss": 3.2085,
"step": 36
},
{
"epoch": 0.03766861796894884,
"grad_norm": 10.34869384765625,
"learning_rate": 9.973607558892864e-05,
"loss": 2.9294,
"step": 37
},
{
"epoch": 0.03868668872486638,
"grad_norm": 12.5684814453125,
"learning_rate": 9.97193382647516e-05,
"loss": 3.6198,
"step": 38
},
{
"epoch": 0.03970475948078391,
"grad_norm": 13.42013931274414,
"learning_rate": 9.970208790556532e-05,
"loss": 2.8409,
"step": 39
},
{
"epoch": 0.04072283023670145,
"grad_norm": 10.357503890991211,
"learning_rate": 9.968432468936967e-05,
"loss": 2.6345,
"step": 40
},
{
"epoch": 0.041740900992618984,
"grad_norm": 12.668771743774414,
"learning_rate": 9.966604879945659e-05,
"loss": 3.2825,
"step": 41
},
{
"epoch": 0.042758971748536524,
"grad_norm": 9.444086074829102,
"learning_rate": 9.964726042440802e-05,
"loss": 2.7562,
"step": 42
},
{
"epoch": 0.04377704250445406,
"grad_norm": 10.448949813842773,
"learning_rate": 9.962795975809411e-05,
"loss": 2.8796,
"step": 43
},
{
"epoch": 0.0447951132603716,
"grad_norm": 10.916976928710938,
"learning_rate": 9.960814699967112e-05,
"loss": 2.7582,
"step": 44
},
{
"epoch": 0.04581318401628913,
"grad_norm": 11.323358535766602,
"learning_rate": 9.958782235357938e-05,
"loss": 2.6436,
"step": 45
},
{
"epoch": 0.04683125477220667,
"grad_norm": 10.28471851348877,
"learning_rate": 9.956698602954124e-05,
"loss": 2.8325,
"step": 46
},
{
"epoch": 0.0478493255281242,
"grad_norm": 13.5079984664917,
"learning_rate": 9.954563824255878e-05,
"loss": 3.0585,
"step": 47
},
{
"epoch": 0.04886739628404174,
"grad_norm": 11.250194549560547,
"learning_rate": 9.952377921291178e-05,
"loss": 2.7686,
"step": 48
},
{
"epoch": 0.049885467039959275,
"grad_norm": 12.554705619812012,
"learning_rate": 9.950140916615526e-05,
"loss": 3.0617,
"step": 49
},
{
"epoch": 0.050903537795876815,
"grad_norm": 12.552079200744629,
"learning_rate": 9.947852833311724e-05,
"loss": 2.08,
"step": 50
},
{
"epoch": 0.05192160855179435,
"grad_norm": 11.248369216918945,
"learning_rate": 9.945513694989639e-05,
"loss": 5.133,
"step": 51
},
{
"epoch": 0.05293967930771189,
"grad_norm": 12.866747856140137,
"learning_rate": 9.943123525785952e-05,
"loss": 5.7232,
"step": 52
},
{
"epoch": 0.05395775006362942,
"grad_norm": 12.395757675170898,
"learning_rate": 9.940682350363912e-05,
"loss": 4.6422,
"step": 53
},
{
"epoch": 0.05497582081954696,
"grad_norm": 12.23355770111084,
"learning_rate": 9.938190193913083e-05,
"loss": 4.8131,
"step": 54
},
{
"epoch": 0.05599389157546449,
"grad_norm": 14.62759017944336,
"learning_rate": 9.935647082149086e-05,
"loss": 6.0114,
"step": 55
},
{
"epoch": 0.05701196233138203,
"grad_norm": 13.613059997558594,
"learning_rate": 9.933053041313325e-05,
"loss": 4.794,
"step": 56
},
{
"epoch": 0.058030033087299565,
"grad_norm": 13.422719955444336,
"learning_rate": 9.930408098172725e-05,
"loss": 4.5392,
"step": 57
},
{
"epoch": 0.059048103843217105,
"grad_norm": 17.745412826538086,
"learning_rate": 9.92771228001945e-05,
"loss": 7.1147,
"step": 58
},
{
"epoch": 0.06006617459913464,
"grad_norm": 13.955183982849121,
"learning_rate": 9.924965614670629e-05,
"loss": 3.619,
"step": 59
},
{
"epoch": 0.06108424535505218,
"grad_norm": 11.067267417907715,
"learning_rate": 9.922168130468059e-05,
"loss": 2.6905,
"step": 60
},
{
"epoch": 0.06210231611096971,
"grad_norm": 11.641958236694336,
"learning_rate": 9.91931985627792e-05,
"loss": 2.398,
"step": 61
},
{
"epoch": 0.06312038686688724,
"grad_norm": 8.590779304504395,
"learning_rate": 9.916420821490472e-05,
"loss": 1.9248,
"step": 62
},
{
"epoch": 0.06413845762280479,
"grad_norm": 8.852486610412598,
"learning_rate": 9.91347105601976e-05,
"loss": 2.3876,
"step": 63
},
{
"epoch": 0.06515652837872232,
"grad_norm": 9.158111572265625,
"learning_rate": 9.910470590303293e-05,
"loss": 1.9339,
"step": 64
},
{
"epoch": 0.06617459913463986,
"grad_norm": 8.361588478088379,
"learning_rate": 9.907419455301741e-05,
"loss": 2.3266,
"step": 65
},
{
"epoch": 0.06719266989055739,
"grad_norm": 7.891152858734131,
"learning_rate": 9.904317682498608e-05,
"loss": 1.9775,
"step": 66
},
{
"epoch": 0.06821074064647493,
"grad_norm": 8.722708702087402,
"learning_rate": 9.901165303899916e-05,
"loss": 2.2988,
"step": 67
},
{
"epoch": 0.06922881140239247,
"grad_norm": 10.848478317260742,
"learning_rate": 9.897962352033861e-05,
"loss": 2.2087,
"step": 68
},
{
"epoch": 0.07024688215831,
"grad_norm": 7.828042984008789,
"learning_rate": 9.89470885995049e-05,
"loss": 2.1694,
"step": 69
},
{
"epoch": 0.07126495291422753,
"grad_norm": 7.928416728973389,
"learning_rate": 9.891404861221356e-05,
"loss": 1.7946,
"step": 70
},
{
"epoch": 0.07228302367014508,
"grad_norm": 8.273153305053711,
"learning_rate": 9.888050389939172e-05,
"loss": 2.2472,
"step": 71
},
{
"epoch": 0.07330109442606261,
"grad_norm": 7.866210460662842,
"learning_rate": 9.884645480717451e-05,
"loss": 1.9656,
"step": 72
},
{
"epoch": 0.07431916518198015,
"grad_norm": 9.140717506408691,
"learning_rate": 9.881190168690164e-05,
"loss": 2.5084,
"step": 73
},
{
"epoch": 0.07533723593789768,
"grad_norm": 10.078163146972656,
"learning_rate": 9.877684489511366e-05,
"loss": 2.8882,
"step": 74
},
{
"epoch": 0.07635530669381523,
"grad_norm": 8.583365440368652,
"learning_rate": 9.874128479354832e-05,
"loss": 2.2404,
"step": 75
},
{
"epoch": 0.07737337744973276,
"grad_norm": 10.980644226074219,
"learning_rate": 9.870522174913682e-05,
"loss": 2.9591,
"step": 76
},
{
"epoch": 0.07839144820565029,
"grad_norm": 9.829695701599121,
"learning_rate": 9.866865613400008e-05,
"loss": 2.5868,
"step": 77
},
{
"epoch": 0.07940951896156782,
"grad_norm": 9.993083000183105,
"learning_rate": 9.863158832544477e-05,
"loss": 2.7386,
"step": 78
},
{
"epoch": 0.08042758971748537,
"grad_norm": 9.227055549621582,
"learning_rate": 9.859401870595959e-05,
"loss": 2.3334,
"step": 79
},
{
"epoch": 0.0814456604734029,
"grad_norm": 9.135334968566895,
"learning_rate": 9.855594766321122e-05,
"loss": 2.6064,
"step": 80
},
{
"epoch": 0.08246373122932044,
"grad_norm": 9.216446876525879,
"learning_rate": 9.85173755900403e-05,
"loss": 2.9289,
"step": 81
},
{
"epoch": 0.08348180198523797,
"grad_norm": 12.71446418762207,
"learning_rate": 9.847830288445745e-05,
"loss": 3.5027,
"step": 82
},
{
"epoch": 0.08449987274115552,
"grad_norm": 9.071185111999512,
"learning_rate": 9.843872994963911e-05,
"loss": 3.1217,
"step": 83
},
{
"epoch": 0.08551794349707305,
"grad_norm": 7.825349807739258,
"learning_rate": 9.839865719392339e-05,
"loss": 2.4812,
"step": 84
},
{
"epoch": 0.08653601425299058,
"grad_norm": 11.979453086853027,
"learning_rate": 9.835808503080585e-05,
"loss": 3.6076,
"step": 85
},
{
"epoch": 0.08755408500890811,
"grad_norm": 10.889570236206055,
"learning_rate": 9.831701387893533e-05,
"loss": 3.9539,
"step": 86
},
{
"epoch": 0.08857215576482566,
"grad_norm": 6.638063430786133,
"learning_rate": 9.827544416210941e-05,
"loss": 2.1225,
"step": 87
},
{
"epoch": 0.0895902265207432,
"grad_norm": 11.630864143371582,
"learning_rate": 9.823337630927026e-05,
"loss": 2.8508,
"step": 88
},
{
"epoch": 0.09060829727666073,
"grad_norm": 11.906623840332031,
"learning_rate": 9.819081075450014e-05,
"loss": 3.0837,
"step": 89
},
{
"epoch": 0.09162636803257826,
"grad_norm": 12.019804000854492,
"learning_rate": 9.814774793701687e-05,
"loss": 3.6106,
"step": 90
},
{
"epoch": 0.0926444387884958,
"grad_norm": 7.91819953918457,
"learning_rate": 9.810418830116932e-05,
"loss": 2.3236,
"step": 91
},
{
"epoch": 0.09366250954441334,
"grad_norm": 9.185378074645996,
"learning_rate": 9.806013229643289e-05,
"loss": 2.6397,
"step": 92
},
{
"epoch": 0.09468058030033087,
"grad_norm": 12.451518058776855,
"learning_rate": 9.801558037740478e-05,
"loss": 3.3661,
"step": 93
},
{
"epoch": 0.0956986510562484,
"grad_norm": 9.665090560913086,
"learning_rate": 9.797053300379937e-05,
"loss": 2.7933,
"step": 94
},
{
"epoch": 0.09671672181216595,
"grad_norm": 9.512073516845703,
"learning_rate": 9.792499064044342e-05,
"loss": 3.1669,
"step": 95
},
{
"epoch": 0.09773479256808348,
"grad_norm": 11.063192367553711,
"learning_rate": 9.787895375727136e-05,
"loss": 2.4502,
"step": 96
},
{
"epoch": 0.09875286332400102,
"grad_norm": 11.608457565307617,
"learning_rate": 9.783242282932028e-05,
"loss": 2.5691,
"step": 97
},
{
"epoch": 0.09977093407991855,
"grad_norm": 10.834481239318848,
"learning_rate": 9.778539833672524e-05,
"loss": 2.8208,
"step": 98
},
{
"epoch": 0.1007890048358361,
"grad_norm": 9.476598739624023,
"learning_rate": 9.773788076471414e-05,
"loss": 2.4245,
"step": 99
},
{
"epoch": 0.10180707559175363,
"grad_norm": 10.453302383422852,
"learning_rate": 9.768987060360279e-05,
"loss": 2.1369,
"step": 100
},
{
"epoch": 0.10282514634767116,
"grad_norm": 8.380644798278809,
"learning_rate": 9.764136834878986e-05,
"loss": 4.4008,
"step": 101
},
{
"epoch": 0.1038432171035887,
"grad_norm": 10.45700740814209,
"learning_rate": 9.759237450075174e-05,
"loss": 3.8277,
"step": 102
},
{
"epoch": 0.10486128785950624,
"grad_norm": 11.106316566467285,
"learning_rate": 9.754288956503736e-05,
"loss": 4.3912,
"step": 103
},
{
"epoch": 0.10587935861542377,
"grad_norm": 12.727373123168945,
"learning_rate": 9.749291405226305e-05,
"loss": 5.0723,
"step": 104
},
{
"epoch": 0.10689742937134131,
"grad_norm": 11.3184175491333,
"learning_rate": 9.744244847810716e-05,
"loss": 4.6612,
"step": 105
},
{
"epoch": 0.10791550012725884,
"grad_norm": 11.49225902557373,
"learning_rate": 9.739149336330482e-05,
"loss": 5.2688,
"step": 106
},
{
"epoch": 0.10893357088317639,
"grad_norm": 9.92116928100586,
"learning_rate": 9.734004923364257e-05,
"loss": 3.1285,
"step": 107
},
{
"epoch": 0.10995164163909392,
"grad_norm": 16.322154998779297,
"learning_rate": 9.728811661995288e-05,
"loss": 4.3573,
"step": 108
},
{
"epoch": 0.11096971239501145,
"grad_norm": 11.590410232543945,
"learning_rate": 9.723569605810871e-05,
"loss": 3.3457,
"step": 109
},
{
"epoch": 0.11198778315092899,
"grad_norm": 6.267991065979004,
"learning_rate": 9.718278808901797e-05,
"loss": 1.8973,
"step": 110
},
{
"epoch": 0.11300585390684653,
"grad_norm": 7.807132720947266,
"learning_rate": 9.712939325861794e-05,
"loss": 2.2999,
"step": 111
},
{
"epoch": 0.11402392466276406,
"grad_norm": 5.800601005554199,
"learning_rate": 9.707551211786965e-05,
"loss": 1.0863,
"step": 112
},
{
"epoch": 0.1150419954186816,
"grad_norm": 7.150589466094971,
"learning_rate": 9.702114522275216e-05,
"loss": 1.9172,
"step": 113
},
{
"epoch": 0.11606006617459913,
"grad_norm": 8.134252548217773,
"learning_rate": 9.696629313425686e-05,
"loss": 2.2173,
"step": 114
},
{
"epoch": 0.11707813693051668,
"grad_norm": 7.6389689445495605,
"learning_rate": 9.691095641838169e-05,
"loss": 1.8046,
"step": 115
},
{
"epoch": 0.11809620768643421,
"grad_norm": 6.845970153808594,
"learning_rate": 9.685513564612521e-05,
"loss": 1.9059,
"step": 116
},
{
"epoch": 0.11911427844235174,
"grad_norm": 10.888468742370605,
"learning_rate": 9.679883139348082e-05,
"loss": 2.9148,
"step": 117
},
{
"epoch": 0.12013234919826928,
"grad_norm": 6.594396114349365,
"learning_rate": 9.674204424143078e-05,
"loss": 1.8292,
"step": 118
},
{
"epoch": 0.12115041995418682,
"grad_norm": 7.157876491546631,
"learning_rate": 9.66847747759402e-05,
"loss": 1.6858,
"step": 119
},
{
"epoch": 0.12216849071010435,
"grad_norm": 7.298995494842529,
"learning_rate": 9.662702358795098e-05,
"loss": 1.7957,
"step": 120
},
{
"epoch": 0.12318656146602189,
"grad_norm": 9.0108003616333,
"learning_rate": 9.656879127337571e-05,
"loss": 2.2843,
"step": 121
},
{
"epoch": 0.12420463222193942,
"grad_norm": 8.476913452148438,
"learning_rate": 9.651007843309163e-05,
"loss": 2.1026,
"step": 122
},
{
"epoch": 0.12522270297785695,
"grad_norm": 9.930148124694824,
"learning_rate": 9.645088567293426e-05,
"loss": 2.6976,
"step": 123
},
{
"epoch": 0.1262407737337745,
"grad_norm": 8.574073791503906,
"learning_rate": 9.639121360369126e-05,
"loss": 1.7768,
"step": 124
},
{
"epoch": 0.12725884448969205,
"grad_norm": 13.36725902557373,
"learning_rate": 9.63310628410961e-05,
"loss": 2.7559,
"step": 125
},
{
"epoch": 0.12827691524560958,
"grad_norm": 8.55522346496582,
"learning_rate": 9.627043400582172e-05,
"loss": 2.3419,
"step": 126
},
{
"epoch": 0.1292949860015271,
"grad_norm": 9.948506355285645,
"learning_rate": 9.620932772347408e-05,
"loss": 3.0092,
"step": 127
},
{
"epoch": 0.13031305675744465,
"grad_norm": 10.05156135559082,
"learning_rate": 9.614774462458573e-05,
"loss": 2.1554,
"step": 128
},
{
"epoch": 0.13133112751336218,
"grad_norm": 10.230545043945312,
"learning_rate": 9.608568534460936e-05,
"loss": 2.572,
"step": 129
},
{
"epoch": 0.1323491982692797,
"grad_norm": 7.820633411407471,
"learning_rate": 9.602315052391115e-05,
"loss": 2.2316,
"step": 130
},
{
"epoch": 0.13336726902519724,
"grad_norm": 7.196948528289795,
"learning_rate": 9.596014080776423e-05,
"loss": 2.276,
"step": 131
},
{
"epoch": 0.13438533978111478,
"grad_norm": 10.125378608703613,
"learning_rate": 9.589665684634196e-05,
"loss": 3.6436,
"step": 132
},
{
"epoch": 0.13540341053703234,
"grad_norm": 8.542695045471191,
"learning_rate": 9.583269929471128e-05,
"loss": 2.8726,
"step": 133
},
{
"epoch": 0.13642148129294987,
"grad_norm": 8.097149848937988,
"learning_rate": 9.576826881282594e-05,
"loss": 2.3483,
"step": 134
},
{
"epoch": 0.1374395520488674,
"grad_norm": 8.922883987426758,
"learning_rate": 9.570336606551967e-05,
"loss": 2.5365,
"step": 135
},
{
"epoch": 0.13845762280478494,
"grad_norm": 9.18602180480957,
"learning_rate": 9.56379917224993e-05,
"loss": 2.7464,
"step": 136
},
{
"epoch": 0.13947569356070247,
"grad_norm": 8.929719924926758,
"learning_rate": 9.557214645833792e-05,
"loss": 2.8074,
"step": 137
},
{
"epoch": 0.14049376431662,
"grad_norm": 10.157453536987305,
"learning_rate": 9.550583095246786e-05,
"loss": 2.6313,
"step": 138
},
{
"epoch": 0.14151183507253753,
"grad_norm": 8.677960395812988,
"learning_rate": 9.543904588917367e-05,
"loss": 2.7515,
"step": 139
},
{
"epoch": 0.14252990582845507,
"grad_norm": 8.684197425842285,
"learning_rate": 9.537179195758512e-05,
"loss": 2.5564,
"step": 140
},
{
"epoch": 0.14354797658437263,
"grad_norm": 8.283134460449219,
"learning_rate": 9.530406985167004e-05,
"loss": 2.3474,
"step": 141
},
{
"epoch": 0.14456604734029016,
"grad_norm": 7.090147018432617,
"learning_rate": 9.523588027022721e-05,
"loss": 2.0495,
"step": 142
},
{
"epoch": 0.1455841180962077,
"grad_norm": 9.59614086151123,
"learning_rate": 9.516722391687902e-05,
"loss": 2.4563,
"step": 143
},
{
"epoch": 0.14660218885212523,
"grad_norm": 7.75164270401001,
"learning_rate": 9.50981015000644e-05,
"loss": 2.0795,
"step": 144
},
{
"epoch": 0.14762025960804276,
"grad_norm": 9.117147445678711,
"learning_rate": 9.502851373303136e-05,
"loss": 2.519,
"step": 145
},
{
"epoch": 0.1486383303639603,
"grad_norm": 9.871448516845703,
"learning_rate": 9.495846133382973e-05,
"loss": 2.6371,
"step": 146
},
{
"epoch": 0.14965640111987782,
"grad_norm": 8.246638298034668,
"learning_rate": 9.488794502530362e-05,
"loss": 2.3142,
"step": 147
},
{
"epoch": 0.15067447187579536,
"grad_norm": 11.579840660095215,
"learning_rate": 9.48169655350841e-05,
"loss": 2.8947,
"step": 148
},
{
"epoch": 0.15169254263171292,
"grad_norm": 13.307292938232422,
"learning_rate": 9.474552359558166e-05,
"loss": 2.9942,
"step": 149
},
{
"epoch": 0.15271061338763045,
"grad_norm": 10.210186958312988,
"learning_rate": 9.467361994397859e-05,
"loss": 2.0216,
"step": 150
},
{
"epoch": 0.15372868414354798,
"grad_norm": 7.870486259460449,
"learning_rate": 9.460125532222141e-05,
"loss": 2.6203,
"step": 151
},
{
"epoch": 0.15474675489946552,
"grad_norm": 13.753894805908203,
"learning_rate": 9.452843047701323e-05,
"loss": 4.1998,
"step": 152
},
{
"epoch": 0.15576482565538305,
"grad_norm": 10.677061080932617,
"learning_rate": 9.445514615980604e-05,
"loss": 3.9647,
"step": 153
},
{
"epoch": 0.15678289641130058,
"grad_norm": 11.903203010559082,
"learning_rate": 9.438140312679291e-05,
"loss": 4.2215,
"step": 154
},
{
"epoch": 0.15780096716721811,
"grad_norm": 12.882353782653809,
"learning_rate": 9.43072021389003e-05,
"loss": 5.0153,
"step": 155
},
{
"epoch": 0.15881903792313565,
"grad_norm": 13.99023151397705,
"learning_rate": 9.423254396178003e-05,
"loss": 5.5362,
"step": 156
},
{
"epoch": 0.1598371086790532,
"grad_norm": 16.683727264404297,
"learning_rate": 9.415742936580157e-05,
"loss": 5.1149,
"step": 157
},
{
"epoch": 0.16085517943497074,
"grad_norm": 17.32396125793457,
"learning_rate": 9.408185912604394e-05,
"loss": 4.8563,
"step": 158
},
{
"epoch": 0.16187325019088827,
"grad_norm": 14.138668060302734,
"learning_rate": 9.400583402228784e-05,
"loss": 3.4698,
"step": 159
},
{
"epoch": 0.1628913209468058,
"grad_norm": 6.4397430419921875,
"learning_rate": 9.392935483900749e-05,
"loss": 1.8856,
"step": 160
},
{
"epoch": 0.16390939170272334,
"grad_norm": 4.72169303894043,
"learning_rate": 9.38524223653626e-05,
"loss": 1.3027,
"step": 161
},
{
"epoch": 0.16492746245864087,
"grad_norm": 7.877247333526611,
"learning_rate": 9.377503739519019e-05,
"loss": 1.9129,
"step": 162
},
{
"epoch": 0.1659455332145584,
"grad_norm": 8.524123191833496,
"learning_rate": 9.369720072699647e-05,
"loss": 1.5605,
"step": 163
},
{
"epoch": 0.16696360397047594,
"grad_norm": 9.966007232666016,
"learning_rate": 9.361891316394851e-05,
"loss": 2.5458,
"step": 164
},
{
"epoch": 0.16798167472639347,
"grad_norm": 9.061026573181152,
"learning_rate": 9.354017551386599e-05,
"loss": 1.8415,
"step": 165
},
{
"epoch": 0.16899974548231103,
"grad_norm": 7.912156581878662,
"learning_rate": 9.346098858921291e-05,
"loss": 1.9514,
"step": 166
},
{
"epoch": 0.17001781623822856,
"grad_norm": 6.926218509674072,
"learning_rate": 9.338135320708911e-05,
"loss": 2.1861,
"step": 167
},
{
"epoch": 0.1710358869941461,
"grad_norm": 7.546460151672363,
"learning_rate": 9.330127018922194e-05,
"loss": 1.7336,
"step": 168
},
{
"epoch": 0.17205395775006363,
"grad_norm": 6.780023097991943,
"learning_rate": 9.322074036195769e-05,
"loss": 1.766,
"step": 169
},
{
"epoch": 0.17307202850598116,
"grad_norm": 8.207006454467773,
"learning_rate": 9.313976455625315e-05,
"loss": 1.937,
"step": 170
},
{
"epoch": 0.1740900992618987,
"grad_norm": 10.892253875732422,
"learning_rate": 9.305834360766695e-05,
"loss": 2.6682,
"step": 171
},
{
"epoch": 0.17510817001781623,
"grad_norm": 8.318902015686035,
"learning_rate": 9.297647835635102e-05,
"loss": 2.0102,
"step": 172
},
{
"epoch": 0.17612624077373376,
"grad_norm": 7.727786540985107,
"learning_rate": 9.289416964704185e-05,
"loss": 1.9714,
"step": 173
},
{
"epoch": 0.17714431152965132,
"grad_norm": 9.250336647033691,
"learning_rate": 9.281141832905185e-05,
"loss": 2.3855,
"step": 174
},
{
"epoch": 0.17816238228556885,
"grad_norm": 7.347965717315674,
"learning_rate": 9.272822525626046e-05,
"loss": 1.8475,
"step": 175
},
{
"epoch": 0.1791804530414864,
"grad_norm": 7.1732354164123535,
"learning_rate": 9.26445912871055e-05,
"loss": 1.9938,
"step": 176
},
{
"epoch": 0.18019852379740392,
"grad_norm": 11.556361198425293,
"learning_rate": 9.25605172845742e-05,
"loss": 3.3699,
"step": 177
},
{
"epoch": 0.18121659455332145,
"grad_norm": 9.626664161682129,
"learning_rate": 9.247600411619434e-05,
"loss": 2.7054,
"step": 178
},
{
"epoch": 0.18223466530923899,
"grad_norm": 7.422823429107666,
"learning_rate": 9.239105265402525e-05,
"loss": 2.3665,
"step": 179
},
{
"epoch": 0.18325273606515652,
"grad_norm": 8.812822341918945,
"learning_rate": 9.23056637746489e-05,
"loss": 2.4336,
"step": 180
},
{
"epoch": 0.18427080682107405,
"grad_norm": 12.493931770324707,
"learning_rate": 9.221983835916074e-05,
"loss": 2.4446,
"step": 181
},
{
"epoch": 0.1852888775769916,
"grad_norm": 9.533077239990234,
"learning_rate": 9.213357729316076e-05,
"loss": 2.5195,
"step": 182
},
{
"epoch": 0.18630694833290914,
"grad_norm": 7.195649147033691,
"learning_rate": 9.204688146674418e-05,
"loss": 1.5695,
"step": 183
},
{
"epoch": 0.18732501908882668,
"grad_norm": 10.850951194763184,
"learning_rate": 9.195975177449238e-05,
"loss": 3.3308,
"step": 184
},
{
"epoch": 0.1883430898447442,
"grad_norm": 9.36767578125,
"learning_rate": 9.187218911546362e-05,
"loss": 2.8146,
"step": 185
},
{
"epoch": 0.18936116060066174,
"grad_norm": 14.791803359985352,
"learning_rate": 9.178419439318382e-05,
"loss": 3.5093,
"step": 186
},
{
"epoch": 0.19037923135657928,
"grad_norm": 10.107565879821777,
"learning_rate": 9.169576851563715e-05,
"loss": 2.4756,
"step": 187
},
{
"epoch": 0.1913973021124968,
"grad_norm": 8.8936128616333,
"learning_rate": 9.160691239525674e-05,
"loss": 2.4272,
"step": 188
},
{
"epoch": 0.19241537286841434,
"grad_norm": 8.861714363098145,
"learning_rate": 9.151762694891521e-05,
"loss": 2.1092,
"step": 189
},
{
"epoch": 0.1934334436243319,
"grad_norm": 9.74419116973877,
"learning_rate": 9.142791309791528e-05,
"loss": 3.1339,
"step": 190
},
{
"epoch": 0.19445151438024944,
"grad_norm": 10.207488059997559,
"learning_rate": 9.133777176798013e-05,
"loss": 2.5119,
"step": 191
},
{
"epoch": 0.19546958513616697,
"grad_norm": 9.463604927062988,
"learning_rate": 9.124720388924403e-05,
"loss": 2.669,
"step": 192
},
{
"epoch": 0.1964876558920845,
"grad_norm": 11.191435813903809,
"learning_rate": 9.115621039624256e-05,
"loss": 3.134,
"step": 193
},
{
"epoch": 0.19750572664800203,
"grad_norm": 8.744293212890625,
"learning_rate": 9.10647922279031e-05,
"loss": 2.8205,
"step": 194
},
{
"epoch": 0.19852379740391957,
"grad_norm": 9.338461875915527,
"learning_rate": 9.09729503275351e-05,
"loss": 2.2502,
"step": 195
},
{
"epoch": 0.1995418681598371,
"grad_norm": 8.457433700561523,
"learning_rate": 9.088068564282031e-05,
"loss": 2.1407,
"step": 196
},
{
"epoch": 0.20055993891575463,
"grad_norm": 11.790545463562012,
"learning_rate": 9.078799912580304e-05,
"loss": 3.0246,
"step": 197
},
{
"epoch": 0.2015780096716722,
"grad_norm": 10.485797882080078,
"learning_rate": 9.069489173288038e-05,
"loss": 2.7989,
"step": 198
},
{
"epoch": 0.20259608042758973,
"grad_norm": 10.064512252807617,
"learning_rate": 9.060136442479215e-05,
"loss": 2.3104,
"step": 199
},
{
"epoch": 0.20361415118350726,
"grad_norm": 11.273386001586914,
"learning_rate": 9.050741816661128e-05,
"loss": 2.1308,
"step": 200
},
{
"epoch": 0.2046322219394248,
"grad_norm": 7.872629642486572,
"learning_rate": 9.041305392773354e-05,
"loss": 3.2454,
"step": 201
},
{
"epoch": 0.20565029269534232,
"grad_norm": 10.097418785095215,
"learning_rate": 9.031827268186779e-05,
"loss": 3.8778,
"step": 202
},
{
"epoch": 0.20666836345125986,
"grad_norm": 9.544397354125977,
"learning_rate": 9.022307540702576e-05,
"loss": 3.5354,
"step": 203
},
{
"epoch": 0.2076864342071774,
"grad_norm": 13.447309494018555,
"learning_rate": 9.012746308551208e-05,
"loss": 5.3594,
"step": 204
},
{
"epoch": 0.20870450496309492,
"grad_norm": 12.501740455627441,
"learning_rate": 9.003143670391403e-05,
"loss": 3.5315,
"step": 205
},
{
"epoch": 0.20972257571901248,
"grad_norm": 13.571687698364258,
"learning_rate": 8.993499725309148e-05,
"loss": 4.0421,
"step": 206
},
{
"epoch": 0.21074064647493002,
"grad_norm": 14.879913330078125,
"learning_rate": 8.983814572816656e-05,
"loss": 4.1594,
"step": 207
},
{
"epoch": 0.21175871723084755,
"grad_norm": 17.623329162597656,
"learning_rate": 8.974088312851345e-05,
"loss": 4.9946,
"step": 208
},
{
"epoch": 0.21277678798676508,
"grad_norm": 6.669205665588379,
"learning_rate": 8.964321045774807e-05,
"loss": 1.5305,
"step": 209
},
{
"epoch": 0.21379485874268261,
"grad_norm": 9.656936645507812,
"learning_rate": 8.954512872371769e-05,
"loss": 2.7299,
"step": 210
},
{
"epoch": 0.21481292949860015,
"grad_norm": 7.008784770965576,
"learning_rate": 8.944663893849052e-05,
"loss": 1.4462,
"step": 211
},
{
"epoch": 0.21583100025451768,
"grad_norm": 6.301548004150391,
"learning_rate": 8.934774211834538e-05,
"loss": 1.4093,
"step": 212
},
{
"epoch": 0.2168490710104352,
"grad_norm": 7.544199466705322,
"learning_rate": 8.924843928376104e-05,
"loss": 1.6221,
"step": 213
},
{
"epoch": 0.21786714176635277,
"grad_norm": 9.308175086975098,
"learning_rate": 8.914873145940584e-05,
"loss": 2.1724,
"step": 214
},
{
"epoch": 0.2188852125222703,
"grad_norm": 8.202116012573242,
"learning_rate": 8.904861967412703e-05,
"loss": 1.7294,
"step": 215
},
{
"epoch": 0.21990328327818784,
"grad_norm": 9.309891700744629,
"learning_rate": 8.894810496094016e-05,
"loss": 2.1319,
"step": 216
},
{
"epoch": 0.22092135403410537,
"grad_norm": 7.8817925453186035,
"learning_rate": 8.884718835701848e-05,
"loss": 2.0479,
"step": 217
},
{
"epoch": 0.2219394247900229,
"grad_norm": 7.9436116218566895,
"learning_rate": 8.874587090368221e-05,
"loss": 1.9141,
"step": 218
},
{
"epoch": 0.22295749554594044,
"grad_norm": 9.188081741333008,
"learning_rate": 8.86441536463877e-05,
"loss": 2.5944,
"step": 219
},
{
"epoch": 0.22397556630185797,
"grad_norm": 9.442697525024414,
"learning_rate": 8.85420376347168e-05,
"loss": 2.616,
"step": 220
},
{
"epoch": 0.2249936370577755,
"grad_norm": 7.059047222137451,
"learning_rate": 8.843952392236594e-05,
"loss": 1.8199,
"step": 221
},
{
"epoch": 0.22601170781369306,
"grad_norm": 9.448399543762207,
"learning_rate": 8.833661356713528e-05,
"loss": 2.2707,
"step": 222
},
{
"epoch": 0.2270297785696106,
"grad_norm": 7.232347011566162,
"learning_rate": 8.823330763091775e-05,
"loss": 2.2834,
"step": 223
},
{
"epoch": 0.22804784932552813,
"grad_norm": 7.126833438873291,
"learning_rate": 8.812960717968818e-05,
"loss": 2.2613,
"step": 224
},
{
"epoch": 0.22906592008144566,
"grad_norm": 7.250087261199951,
"learning_rate": 8.802551328349222e-05,
"loss": 2.0233,
"step": 225
},
{
"epoch": 0.2300839908373632,
"grad_norm": 9.801566123962402,
"learning_rate": 8.792102701643531e-05,
"loss": 2.6283,
"step": 226
},
{
"epoch": 0.23110206159328073,
"grad_norm": 8.86218547821045,
"learning_rate": 8.781614945667169e-05,
"loss": 2.7821,
"step": 227
},
{
"epoch": 0.23212013234919826,
"grad_norm": 7.009481430053711,
"learning_rate": 8.771088168639312e-05,
"loss": 2.187,
"step": 228
},
{
"epoch": 0.2331382031051158,
"grad_norm": 7.643123149871826,
"learning_rate": 8.760522479181784e-05,
"loss": 2.0065,
"step": 229
},
{
"epoch": 0.23415627386103335,
"grad_norm": 6.573335647583008,
"learning_rate": 8.749917986317928e-05,
"loss": 1.939,
"step": 230
},
{
"epoch": 0.2351743446169509,
"grad_norm": 9.001991271972656,
"learning_rate": 8.73927479947149e-05,
"loss": 2.8534,
"step": 231
},
{
"epoch": 0.23619241537286842,
"grad_norm": 9.186355590820312,
"learning_rate": 8.72859302846548e-05,
"loss": 3.112,
"step": 232
},
{
"epoch": 0.23721048612878595,
"grad_norm": 9.961040496826172,
"learning_rate": 8.717872783521047e-05,
"loss": 3.2593,
"step": 233
},
{
"epoch": 0.23822855688470349,
"grad_norm": 8.34619426727295,
"learning_rate": 8.707114175256335e-05,
"loss": 2.2664,
"step": 234
},
{
"epoch": 0.23924662764062102,
"grad_norm": 7.473055839538574,
"learning_rate": 8.696317314685341e-05,
"loss": 2.8765,
"step": 235
},
{
"epoch": 0.24026469839653855,
"grad_norm": 6.791398048400879,
"learning_rate": 8.685482313216783e-05,
"loss": 2.098,
"step": 236
},
{
"epoch": 0.24128276915245608,
"grad_norm": 9.765985488891602,
"learning_rate": 8.674609282652934e-05,
"loss": 3.2374,
"step": 237
},
{
"epoch": 0.24230083990837364,
"grad_norm": 7.459610462188721,
"learning_rate": 8.663698335188477e-05,
"loss": 2.456,
"step": 238
},
{
"epoch": 0.24331891066429118,
"grad_norm": 8.42564868927002,
"learning_rate": 8.65274958340934e-05,
"loss": 2.3464,
"step": 239
},
{
"epoch": 0.2443369814202087,
"grad_norm": 7.114076137542725,
"learning_rate": 8.641763140291545e-05,
"loss": 2.1128,
"step": 240
},
{
"epoch": 0.24535505217612624,
"grad_norm": 9.573076248168945,
"learning_rate": 8.630739119200035e-05,
"loss": 2.4448,
"step": 241
},
{
"epoch": 0.24637312293204378,
"grad_norm": 7.850905895233154,
"learning_rate": 8.619677633887509e-05,
"loss": 2.446,
"step": 242
},
{
"epoch": 0.2473911936879613,
"grad_norm": 9.630354881286621,
"learning_rate": 8.608578798493236e-05,
"loss": 2.3875,
"step": 243
},
{
"epoch": 0.24840926444387884,
"grad_norm": 7.196229457855225,
"learning_rate": 8.597442727541897e-05,
"loss": 1.6186,
"step": 244
},
{
"epoch": 0.24942733519979637,
"grad_norm": 11.08008098602295,
"learning_rate": 8.586269535942385e-05,
"loss": 2.839,
"step": 245
},
{
"epoch": 0.2504454059557139,
"grad_norm": 8.258538246154785,
"learning_rate": 8.575059338986633e-05,
"loss": 2.2807,
"step": 246
},
{
"epoch": 0.25146347671163144,
"grad_norm": 9.670249938964844,
"learning_rate": 8.563812252348411e-05,
"loss": 2.2475,
"step": 247
},
{
"epoch": 0.252481547467549,
"grad_norm": 9.350224494934082,
"learning_rate": 8.552528392082147e-05,
"loss": 2.5073,
"step": 248
},
{
"epoch": 0.2534996182234665,
"grad_norm": 9.628990173339844,
"learning_rate": 8.541207874621718e-05,
"loss": 2.0659,
"step": 249
},
{
"epoch": 0.2545176889793841,
"grad_norm": 9.84993839263916,
"learning_rate": 8.529850816779251e-05,
"loss": 2.2365,
"step": 250
},
{
"epoch": 0.2555357597353016,
"grad_norm": 8.845046997070312,
"learning_rate": 8.518457335743926e-05,
"loss": 3.1796,
"step": 251
},
{
"epoch": 0.25655383049121916,
"grad_norm": 10.79970932006836,
"learning_rate": 8.507027549080753e-05,
"loss": 3.8036,
"step": 252
},
{
"epoch": 0.2575719012471367,
"grad_norm": 8.892024993896484,
"learning_rate": 8.495561574729369e-05,
"loss": 2.9368,
"step": 253
},
{
"epoch": 0.2585899720030542,
"grad_norm": 11.658991813659668,
"learning_rate": 8.484059531002821e-05,
"loss": 3.7456,
"step": 254
},
{
"epoch": 0.25960804275897176,
"grad_norm": 11.338571548461914,
"learning_rate": 8.472521536586335e-05,
"loss": 3.9418,
"step": 255
},
{
"epoch": 0.2606261135148893,
"grad_norm": 14.362560272216797,
"learning_rate": 8.460947710536107e-05,
"loss": 4.6011,
"step": 256
},
{
"epoch": 0.2616441842708068,
"grad_norm": 13.662555694580078,
"learning_rate": 8.449338172278059e-05,
"loss": 5.049,
"step": 257
},
{
"epoch": 0.26266225502672436,
"grad_norm": 15.036532402038574,
"learning_rate": 8.437693041606618e-05,
"loss": 4.0385,
"step": 258
},
{
"epoch": 0.2636803257826419,
"grad_norm": 12.57422161102295,
"learning_rate": 8.426012438683473e-05,
"loss": 3.1101,
"step": 259
},
{
"epoch": 0.2646983965385594,
"grad_norm": 8.874217987060547,
"learning_rate": 8.414296484036339e-05,
"loss": 2.3986,
"step": 260
},
{
"epoch": 0.26571646729447695,
"grad_norm": 5.300018787384033,
"learning_rate": 8.402545298557712e-05,
"loss": 0.9408,
"step": 261
},
{
"epoch": 0.2667345380503945,
"grad_norm": 6.752171039581299,
"learning_rate": 8.390759003503623e-05,
"loss": 1.8722,
"step": 262
},
{
"epoch": 0.267752608806312,
"grad_norm": 7.508800029754639,
"learning_rate": 8.378937720492384e-05,
"loss": 1.7145,
"step": 263
},
{
"epoch": 0.26877067956222955,
"grad_norm": 6.305329322814941,
"learning_rate": 8.367081571503332e-05,
"loss": 1.6567,
"step": 264
},
{
"epoch": 0.2697887503181471,
"grad_norm": 9.371475219726562,
"learning_rate": 8.355190678875578e-05,
"loss": 2.4242,
"step": 265
},
{
"epoch": 0.2708068210740647,
"grad_norm": 5.403337001800537,
"learning_rate": 8.343265165306735e-05,
"loss": 1.3716,
"step": 266
},
{
"epoch": 0.2718248918299822,
"grad_norm": 8.240038871765137,
"learning_rate": 8.331305153851658e-05,
"loss": 2.2134,
"step": 267
},
{
"epoch": 0.27284296258589974,
"grad_norm": 5.857060432434082,
"learning_rate": 8.319310767921174e-05,
"loss": 1.2823,
"step": 268
},
{
"epoch": 0.2738610333418173,
"grad_norm": 6.11975622177124,
"learning_rate": 8.307282131280804e-05,
"loss": 1.7163,
"step": 269
},
{
"epoch": 0.2748791040977348,
"grad_norm": 10.722909927368164,
"learning_rate": 8.295219368049494e-05,
"loss": 2.2343,
"step": 270
},
{
"epoch": 0.27589717485365234,
"grad_norm": 6.3688507080078125,
"learning_rate": 8.283122602698323e-05,
"loss": 1.253,
"step": 271
},
{
"epoch": 0.27691524560956987,
"grad_norm": 8.231119155883789,
"learning_rate": 8.27099196004923e-05,
"loss": 2.4005,
"step": 272
},
{
"epoch": 0.2779333163654874,
"grad_norm": 6.623697757720947,
"learning_rate": 8.258827565273718e-05,
"loss": 1.5276,
"step": 273
},
{
"epoch": 0.27895138712140494,
"grad_norm": 8.357768058776855,
"learning_rate": 8.246629543891569e-05,
"loss": 2.5312,
"step": 274
},
{
"epoch": 0.27996945787732247,
"grad_norm": 7.037582874298096,
"learning_rate": 8.23439802176954e-05,
"loss": 2.6555,
"step": 275
},
{
"epoch": 0.28098752863324,
"grad_norm": 8.760072708129883,
"learning_rate": 8.222133125120076e-05,
"loss": 2.051,
"step": 276
},
{
"epoch": 0.28200559938915754,
"grad_norm": 10.407620429992676,
"learning_rate": 8.209834980499995e-05,
"loss": 2.7866,
"step": 277
},
{
"epoch": 0.28302367014507507,
"grad_norm": 10.065641403198242,
"learning_rate": 8.197503714809191e-05,
"loss": 2.7393,
"step": 278
},
{
"epoch": 0.2840417409009926,
"grad_norm": 8.072295188903809,
"learning_rate": 8.185139455289322e-05,
"loss": 2.1416,
"step": 279
},
{
"epoch": 0.28505981165691013,
"grad_norm": 10.837465286254883,
"learning_rate": 8.172742329522493e-05,
"loss": 3.0516,
"step": 280
},
{
"epoch": 0.28607788241282767,
"grad_norm": 8.73945426940918,
"learning_rate": 8.160312465429952e-05,
"loss": 2.6516,
"step": 281
},
{
"epoch": 0.28709595316874525,
"grad_norm": 9.408519744873047,
"learning_rate": 8.147849991270752e-05,
"loss": 2.4367,
"step": 282
},
{
"epoch": 0.2881140239246628,
"grad_norm": 8.221115112304688,
"learning_rate": 8.135355035640444e-05,
"loss": 2.4484,
"step": 283
},
{
"epoch": 0.2891320946805803,
"grad_norm": 11.46916389465332,
"learning_rate": 8.122827727469737e-05,
"loss": 3.5208,
"step": 284
},
{
"epoch": 0.29015016543649785,
"grad_norm": 8.721375465393066,
"learning_rate": 8.110268196023179e-05,
"loss": 2.3896,
"step": 285
},
{
"epoch": 0.2911682361924154,
"grad_norm": 9.857149124145508,
"learning_rate": 8.097676570897814e-05,
"loss": 2.8248,
"step": 286
},
{
"epoch": 0.2921863069483329,
"grad_norm": 7.732857704162598,
"learning_rate": 8.085052982021847e-05,
"loss": 2.0455,
"step": 287
},
{
"epoch": 0.29320437770425045,
"grad_norm": 9.654264450073242,
"learning_rate": 8.072397559653313e-05,
"loss": 2.3959,
"step": 288
},
{
"epoch": 0.294222448460168,
"grad_norm": 10.697869300842285,
"learning_rate": 8.059710434378715e-05,
"loss": 3.314,
"step": 289
},
{
"epoch": 0.2952405192160855,
"grad_norm": 8.84398078918457,
"learning_rate": 8.046991737111696e-05,
"loss": 2.0514,
"step": 290
},
{
"epoch": 0.29625858997200305,
"grad_norm": 7.673434257507324,
"learning_rate": 8.034241599091665e-05,
"loss": 2.165,
"step": 291
},
{
"epoch": 0.2972766607279206,
"grad_norm": 10.299299240112305,
"learning_rate": 8.021460151882471e-05,
"loss": 3.0283,
"step": 292
},
{
"epoch": 0.2982947314838381,
"grad_norm": 6.935961723327637,
"learning_rate": 8.008647527371023e-05,
"loss": 1.9187,
"step": 293
},
{
"epoch": 0.29931280223975565,
"grad_norm": 9.410109519958496,
"learning_rate": 7.995803857765933e-05,
"loss": 2.4798,
"step": 294
},
{
"epoch": 0.3003308729956732,
"grad_norm": 9.035164833068848,
"learning_rate": 7.982929275596166e-05,
"loss": 2.8312,
"step": 295
},
{
"epoch": 0.3013489437515907,
"grad_norm": 8.214160919189453,
"learning_rate": 7.970023913709652e-05,
"loss": 2.4572,
"step": 296
},
{
"epoch": 0.30236701450750825,
"grad_norm": 9.095396041870117,
"learning_rate": 7.957087905271934e-05,
"loss": 2.4834,
"step": 297
},
{
"epoch": 0.30338508526342584,
"grad_norm": 9.806940078735352,
"learning_rate": 7.944121383764776e-05,
"loss": 2.6364,
"step": 298
},
{
"epoch": 0.30440315601934337,
"grad_norm": 10.004327774047852,
"learning_rate": 7.931124482984802e-05,
"loss": 2.4236,
"step": 299
},
{
"epoch": 0.3054212267752609,
"grad_norm": 12.964902877807617,
"learning_rate": 7.918097337042105e-05,
"loss": 2.542,
"step": 300
},
{
"epoch": 0.30643929753117843,
"grad_norm": 7.418375015258789,
"learning_rate": 7.905040080358868e-05,
"loss": 2.7417,
"step": 301
},
{
"epoch": 0.30745736828709597,
"grad_norm": 8.499213218688965,
"learning_rate": 7.891952847667973e-05,
"loss": 3.6269,
"step": 302
},
{
"epoch": 0.3084754390430135,
"grad_norm": 8.255233764648438,
"learning_rate": 7.878835774011615e-05,
"loss": 2.7048,
"step": 303
},
{
"epoch": 0.30949350979893103,
"grad_norm": 9.611371040344238,
"learning_rate": 7.865688994739907e-05,
"loss": 3.9859,
"step": 304
},
{
"epoch": 0.31051158055484857,
"grad_norm": 9.825111389160156,
"learning_rate": 7.85251264550948e-05,
"loss": 3.2304,
"step": 305
},
{
"epoch": 0.3115296513107661,
"grad_norm": 13.28979778289795,
"learning_rate": 7.839306862282089e-05,
"loss": 3.8321,
"step": 306
},
{
"epoch": 0.31254772206668363,
"grad_norm": 16.412532806396484,
"learning_rate": 7.826071781323207e-05,
"loss": 5.4082,
"step": 307
},
{
"epoch": 0.31356579282260116,
"grad_norm": 9.727624893188477,
"learning_rate": 7.812807539200622e-05,
"loss": 2.5533,
"step": 308
},
{
"epoch": 0.3145838635785187,
"grad_norm": 8.238031387329102,
"learning_rate": 7.799514272783014e-05,
"loss": 2.2857,
"step": 309
},
{
"epoch": 0.31560193433443623,
"grad_norm": 6.882323741912842,
"learning_rate": 7.786192119238567e-05,
"loss": 2.1371,
"step": 310
},
{
"epoch": 0.31662000509035376,
"grad_norm": 5.3293280601501465,
"learning_rate": 7.772841216033533e-05,
"loss": 1.1834,
"step": 311
},
{
"epoch": 0.3176380758462713,
"grad_norm": 5.593384265899658,
"learning_rate": 7.759461700930823e-05,
"loss": 1.4746,
"step": 312
},
{
"epoch": 0.31865614660218883,
"grad_norm": 5.281317234039307,
"learning_rate": 7.746053711988583e-05,
"loss": 1.1387,
"step": 313
},
{
"epoch": 0.3196742173581064,
"grad_norm": 6.735507965087891,
"learning_rate": 7.73261738755877e-05,
"loss": 1.9801,
"step": 314
},
{
"epoch": 0.32069228811402395,
"grad_norm": 6.708221912384033,
"learning_rate": 7.719152866285721e-05,
"loss": 1.863,
"step": 315
},
{
"epoch": 0.3217103588699415,
"grad_norm": 8.238001823425293,
"learning_rate": 7.70566028710473e-05,
"loss": 2.16,
"step": 316
},
{
"epoch": 0.322728429625859,
"grad_norm": 6.396310329437256,
"learning_rate": 7.692139789240611e-05,
"loss": 2.1722,
"step": 317
},
{
"epoch": 0.32374650038177655,
"grad_norm": 8.0552396774292,
"learning_rate": 7.678591512206255e-05,
"loss": 2.2866,
"step": 318
},
{
"epoch": 0.3247645711376941,
"grad_norm": 5.051511287689209,
"learning_rate": 7.665015595801197e-05,
"loss": 1.0846,
"step": 319
},
{
"epoch": 0.3257826418936116,
"grad_norm": 7.378931045532227,
"learning_rate": 7.651412180110176e-05,
"loss": 1.7923,
"step": 320
},
{
"epoch": 0.32680071264952915,
"grad_norm": 7.6839118003845215,
"learning_rate": 7.637781405501681e-05,
"loss": 1.3504,
"step": 321
},
{
"epoch": 0.3278187834054467,
"grad_norm": 7.594010829925537,
"learning_rate": 7.624123412626512e-05,
"loss": 2.5312,
"step": 322
},
{
"epoch": 0.3288368541613642,
"grad_norm": 7.310485363006592,
"learning_rate": 7.610438342416319e-05,
"loss": 1.8773,
"step": 323
},
{
"epoch": 0.32985492491728174,
"grad_norm": 10.614038467407227,
"learning_rate": 7.596726336082158e-05,
"loss": 2.8128,
"step": 324
},
{
"epoch": 0.3308729956731993,
"grad_norm": 7.768847465515137,
"learning_rate": 7.582987535113023e-05,
"loss": 2.2407,
"step": 325
},
{
"epoch": 0.3318910664291168,
"grad_norm": 9.793490409851074,
"learning_rate": 7.569222081274395e-05,
"loss": 2.3751,
"step": 326
},
{
"epoch": 0.33290913718503434,
"grad_norm": 9.077165603637695,
"learning_rate": 7.555430116606778e-05,
"loss": 2.3227,
"step": 327
},
{
"epoch": 0.3339272079409519,
"grad_norm": 8.083662986755371,
"learning_rate": 7.541611783424225e-05,
"loss": 2.4783,
"step": 328
},
{
"epoch": 0.3349452786968694,
"grad_norm": 8.170014381408691,
"learning_rate": 7.527767224312883e-05,
"loss": 2.5388,
"step": 329
},
{
"epoch": 0.33596334945278694,
"grad_norm": 7.519252777099609,
"learning_rate": 7.513896582129508e-05,
"loss": 2.384,
"step": 330
},
{
"epoch": 0.33698142020870453,
"grad_norm": 9.94649887084961,
"learning_rate": 7.500000000000001e-05,
"loss": 2.7797,
"step": 331
},
{
"epoch": 0.33799949096462206,
"grad_norm": 8.595416069030762,
"learning_rate": 7.486077621317926e-05,
"loss": 2.7035,
"step": 332
},
{
"epoch": 0.3390175617205396,
"grad_norm": 7.753424644470215,
"learning_rate": 7.472129589743033e-05,
"loss": 2.0287,
"step": 333
},
{
"epoch": 0.34003563247645713,
"grad_norm": 9.271249771118164,
"learning_rate": 7.458156049199775e-05,
"loss": 2.1601,
"step": 334
},
{
"epoch": 0.34105370323237466,
"grad_norm": 8.564419746398926,
"learning_rate": 7.44415714387582e-05,
"loss": 2.9013,
"step": 335
},
{
"epoch": 0.3420717739882922,
"grad_norm": 8.431052207946777,
"learning_rate": 7.430133018220567e-05,
"loss": 2.4643,
"step": 336
},
{
"epoch": 0.3430898447442097,
"grad_norm": 6.7154436111450195,
"learning_rate": 7.416083816943653e-05,
"loss": 2.271,
"step": 337
},
{
"epoch": 0.34410791550012726,
"grad_norm": 9.483381271362305,
"learning_rate": 7.402009685013463e-05,
"loss": 2.489,
"step": 338
},
{
"epoch": 0.3451259862560448,
"grad_norm": 7.885382175445557,
"learning_rate": 7.38791076765563e-05,
"loss": 3.1875,
"step": 339
},
{
"epoch": 0.3461440570119623,
"grad_norm": 7.622438430786133,
"learning_rate": 7.373787210351541e-05,
"loss": 2.0865,
"step": 340
},
{
"epoch": 0.34716212776787986,
"grad_norm": 7.785037517547607,
"learning_rate": 7.359639158836828e-05,
"loss": 2.0806,
"step": 341
},
{
"epoch": 0.3481801985237974,
"grad_norm": 7.861757755279541,
"learning_rate": 7.345466759099875e-05,
"loss": 2.4029,
"step": 342
},
{
"epoch": 0.3491982692797149,
"grad_norm": 8.165399551391602,
"learning_rate": 7.331270157380303e-05,
"loss": 2.1438,
"step": 343
},
{
"epoch": 0.35021634003563246,
"grad_norm": 8.010607719421387,
"learning_rate": 7.317049500167465e-05,
"loss": 1.8633,
"step": 344
},
{
"epoch": 0.35123441079155,
"grad_norm": 8.535947799682617,
"learning_rate": 7.302804934198936e-05,
"loss": 2.2122,
"step": 345
},
{
"epoch": 0.3522524815474675,
"grad_norm": 8.71279239654541,
"learning_rate": 7.28853660645899e-05,
"loss": 2.4223,
"step": 346
},
{
"epoch": 0.3532705523033851,
"grad_norm": 9.44794750213623,
"learning_rate": 7.274244664177097e-05,
"loss": 2.5881,
"step": 347
},
{
"epoch": 0.35428862305930264,
"grad_norm": 10.474303245544434,
"learning_rate": 7.259929254826392e-05,
"loss": 1.9121,
"step": 348
},
{
"epoch": 0.3553066938152202,
"grad_norm": 9.976726531982422,
"learning_rate": 7.245590526122159e-05,
"loss": 2.3289,
"step": 349
},
{
"epoch": 0.3563247645711377,
"grad_norm": 10.359407424926758,
"learning_rate": 7.231228626020304e-05,
"loss": 2.0781,
"step": 350
},
{
"epoch": 0.35734283532705524,
"grad_norm": 10.392163276672363,
"learning_rate": 7.216843702715831e-05,
"loss": 4.3803,
"step": 351
},
{
"epoch": 0.3583609060829728,
"grad_norm": 8.603669166564941,
"learning_rate": 7.202435904641315e-05,
"loss": 3.3045,
"step": 352
},
{
"epoch": 0.3593789768388903,
"grad_norm": 11.44635009765625,
"learning_rate": 7.188005380465364e-05,
"loss": 5.3026,
"step": 353
},
{
"epoch": 0.36039704759480784,
"grad_norm": 11.560089111328125,
"learning_rate": 7.173552279091087e-05,
"loss": 4.0946,
"step": 354
},
{
"epoch": 0.3614151183507254,
"grad_norm": 12.211684226989746,
"learning_rate": 7.159076749654559e-05,
"loss": 3.7029,
"step": 355
},
{
"epoch": 0.3624331891066429,
"grad_norm": 13.772133827209473,
"learning_rate": 7.144578941523284e-05,
"loss": 4.3069,
"step": 356
},
{
"epoch": 0.36345125986256044,
"grad_norm": 11.917257308959961,
"learning_rate": 7.130059004294647e-05,
"loss": 3.3222,
"step": 357
},
{
"epoch": 0.36446933061847797,
"grad_norm": 15.86817741394043,
"learning_rate": 7.115517087794381e-05,
"loss": 3.5182,
"step": 358
},
{
"epoch": 0.3654874013743955,
"grad_norm": 6.566526412963867,
"learning_rate": 7.10095334207501e-05,
"loss": 1.7594,
"step": 359
},
{
"epoch": 0.36650547213031304,
"grad_norm": 6.9844560623168945,
"learning_rate": 7.086367917414306e-05,
"loss": 1.6552,
"step": 360
},
{
"epoch": 0.36752354288623057,
"grad_norm": 9.259458541870117,
"learning_rate": 7.07176096431374e-05,
"loss": 2.2072,
"step": 361
},
{
"epoch": 0.3685416136421481,
"grad_norm": 8.759178161621094,
"learning_rate": 7.057132633496923e-05,
"loss": 1.7327,
"step": 362
},
{
"epoch": 0.3695596843980657,
"grad_norm": 7.46987247467041,
"learning_rate": 7.042483075908062e-05,
"loss": 1.6727,
"step": 363
},
{
"epoch": 0.3705777551539832,
"grad_norm": 7.011016368865967,
"learning_rate": 7.027812442710385e-05,
"loss": 1.643,
"step": 364
},
{
"epoch": 0.37159582590990076,
"grad_norm": 8.475665092468262,
"learning_rate": 7.013120885284598e-05,
"loss": 2.4295,
"step": 365
},
{
"epoch": 0.3726138966658183,
"grad_norm": 5.769803047180176,
"learning_rate": 6.998408555227314e-05,
"loss": 1.3708,
"step": 366
},
{
"epoch": 0.3736319674217358,
"grad_norm": 6.653828144073486,
"learning_rate": 6.983675604349493e-05,
"loss": 2.111,
"step": 367
},
{
"epoch": 0.37465003817765336,
"grad_norm": 8.172953605651855,
"learning_rate": 6.968922184674867e-05,
"loss": 2.3177,
"step": 368
},
{
"epoch": 0.3756681089335709,
"grad_norm": 6.391868591308594,
"learning_rate": 6.954148448438389e-05,
"loss": 1.2711,
"step": 369
},
{
"epoch": 0.3766861796894884,
"grad_norm": 8.03226375579834,
"learning_rate": 6.93935454808464e-05,
"loss": 1.7306,
"step": 370
},
{
"epoch": 0.37770425044540595,
"grad_norm": 5.273244857788086,
"learning_rate": 6.924540636266272e-05,
"loss": 1.3542,
"step": 371
},
{
"epoch": 0.3787223212013235,
"grad_norm": 9.628256797790527,
"learning_rate": 6.909706865842429e-05,
"loss": 2.7357,
"step": 372
},
{
"epoch": 0.379740391957241,
"grad_norm": 11.36279582977295,
"learning_rate": 6.894853389877163e-05,
"loss": 2.2367,
"step": 373
},
{
"epoch": 0.38075846271315855,
"grad_norm": 8.72659969329834,
"learning_rate": 6.879980361637866e-05,
"loss": 2.2005,
"step": 374
},
{
"epoch": 0.3817765334690761,
"grad_norm": 7.1913042068481445,
"learning_rate": 6.86508793459368e-05,
"loss": 2.1034,
"step": 375
},
{
"epoch": 0.3827946042249936,
"grad_norm": 8.96323299407959,
"learning_rate": 6.850176262413912e-05,
"loss": 2.8465,
"step": 376
},
{
"epoch": 0.38381267498091115,
"grad_norm": 6.918330192565918,
"learning_rate": 6.835245498966461e-05,
"loss": 1.8181,
"step": 377
},
{
"epoch": 0.3848307457368287,
"grad_norm": 9.063780784606934,
"learning_rate": 6.820295798316214e-05,
"loss": 2.4932,
"step": 378
},
{
"epoch": 0.38584881649274627,
"grad_norm": 13.343623161315918,
"learning_rate": 6.805327314723468e-05,
"loss": 3.0215,
"step": 379
},
{
"epoch": 0.3868668872486638,
"grad_norm": 7.741687774658203,
"learning_rate": 6.790340202642332e-05,
"loss": 2.1827,
"step": 380
},
{
"epoch": 0.38788495800458134,
"grad_norm": 5.362514495849609,
"learning_rate": 6.775334616719136e-05,
"loss": 1.7059,
"step": 381
},
{
"epoch": 0.38890302876049887,
"grad_norm": 10.59506893157959,
"learning_rate": 6.760310711790832e-05,
"loss": 2.6664,
"step": 382
},
{
"epoch": 0.3899210995164164,
"grad_norm": 9.036832809448242,
"learning_rate": 6.745268642883404e-05,
"loss": 2.5482,
"step": 383
},
{
"epoch": 0.39093917027233394,
"grad_norm": 7.859615802764893,
"learning_rate": 6.73020856521026e-05,
"loss": 2.1062,
"step": 384
},
{
"epoch": 0.39195724102825147,
"grad_norm": 9.639580726623535,
"learning_rate": 6.715130634170635e-05,
"loss": 3.0521,
"step": 385
},
{
"epoch": 0.392975311784169,
"grad_norm": 8.098716735839844,
"learning_rate": 6.700035005347983e-05,
"loss": 2.6295,
"step": 386
},
{
"epoch": 0.39399338254008653,
"grad_norm": 11.05691146850586,
"learning_rate": 6.684921834508379e-05,
"loss": 2.6122,
"step": 387
},
{
"epoch": 0.39501145329600407,
"grad_norm": 9.071178436279297,
"learning_rate": 6.669791277598904e-05,
"loss": 2.3797,
"step": 388
},
{
"epoch": 0.3960295240519216,
"grad_norm": 9.826159477233887,
"learning_rate": 6.654643490746042e-05,
"loss": 2.4635,
"step": 389
},
{
"epoch": 0.39704759480783913,
"grad_norm": 7.310181140899658,
"learning_rate": 6.639478630254064e-05,
"loss": 2.06,
"step": 390
},
{
"epoch": 0.39806566556375667,
"grad_norm": 9.507575035095215,
"learning_rate": 6.624296852603419e-05,
"loss": 2.9877,
"step": 391
},
{
"epoch": 0.3990837363196742,
"grad_norm": 7.882664680480957,
"learning_rate": 6.609098314449116e-05,
"loss": 2.1182,
"step": 392
},
{
"epoch": 0.40010180707559173,
"grad_norm": 9.889808654785156,
"learning_rate": 6.593883172619111e-05,
"loss": 3.1559,
"step": 393
},
{
"epoch": 0.40111987783150926,
"grad_norm": 9.180678367614746,
"learning_rate": 6.578651584112686e-05,
"loss": 2.6704,
"step": 394
},
{
"epoch": 0.40213794858742685,
"grad_norm": 8.127158164978027,
"learning_rate": 6.563403706098833e-05,
"loss": 2.1218,
"step": 395
},
{
"epoch": 0.4031560193433444,
"grad_norm": 9.298659324645996,
"learning_rate": 6.548139695914622e-05,
"loss": 3.3196,
"step": 396
},
{
"epoch": 0.4041740900992619,
"grad_norm": 7.540867328643799,
"learning_rate": 6.532859711063594e-05,
"loss": 1.9205,
"step": 397
},
{
"epoch": 0.40519216085517945,
"grad_norm": 8.0338716506958,
"learning_rate": 6.51756390921412e-05,
"loss": 2.2858,
"step": 398
},
{
"epoch": 0.406210231611097,
"grad_norm": 8.012909889221191,
"learning_rate": 6.502252448197782e-05,
"loss": 2.3761,
"step": 399
},
{
"epoch": 0.4072283023670145,
"grad_norm": 7.803510665893555,
"learning_rate": 6.486925486007742e-05,
"loss": 2.0418,
"step": 400
},
{
"epoch": 0.40824637312293205,
"grad_norm": 6.121700286865234,
"learning_rate": 6.471583180797121e-05,
"loss": 2.1481,
"step": 401
},
{
"epoch": 0.4092644438788496,
"grad_norm": 8.690316200256348,
"learning_rate": 6.456225690877344e-05,
"loss": 3.0496,
"step": 402
},
{
"epoch": 0.4102825146347671,
"grad_norm": 8.961786270141602,
"learning_rate": 6.440853174716534e-05,
"loss": 3.4188,
"step": 403
},
{
"epoch": 0.41130058539068465,
"grad_norm": 9.848220825195312,
"learning_rate": 6.425465790937861e-05,
"loss": 3.948,
"step": 404
},
{
"epoch": 0.4123186561466022,
"grad_norm": 11.758772850036621,
"learning_rate": 6.410063698317901e-05,
"loss": 4.4288,
"step": 405
},
{
"epoch": 0.4133367269025197,
"grad_norm": 10.132964134216309,
"learning_rate": 6.394647055785017e-05,
"loss": 3.4126,
"step": 406
},
{
"epoch": 0.41435479765843725,
"grad_norm": 9.949785232543945,
"learning_rate": 6.379216022417696e-05,
"loss": 2.584,
"step": 407
},
{
"epoch": 0.4153728684143548,
"grad_norm": 13.347235679626465,
"learning_rate": 6.363770757442927e-05,
"loss": 2.8766,
"step": 408
},
{
"epoch": 0.4163909391702723,
"grad_norm": 7.714400768280029,
"learning_rate": 6.348311420234542e-05,
"loss": 2.4595,
"step": 409
},
{
"epoch": 0.41740900992618984,
"grad_norm": 5.146259307861328,
"learning_rate": 6.332838170311585e-05,
"loss": 1.4562,
"step": 410
},
{
"epoch": 0.41842708068210743,
"grad_norm": 5.76894474029541,
"learning_rate": 6.31735116733666e-05,
"loss": 1.3719,
"step": 411
},
{
"epoch": 0.41944515143802497,
"grad_norm": 5.680314540863037,
"learning_rate": 6.301850571114281e-05,
"loss": 1.173,
"step": 412
},
{
"epoch": 0.4204632221939425,
"grad_norm": 8.60592269897461,
"learning_rate": 6.286336541589224e-05,
"loss": 2.2157,
"step": 413
},
{
"epoch": 0.42148129294986003,
"grad_norm": 6.760054588317871,
"learning_rate": 6.27080923884488e-05,
"loss": 1.6651,
"step": 414
},
{
"epoch": 0.42249936370577756,
"grad_norm": 6.813849925994873,
"learning_rate": 6.255268823101605e-05,
"loss": 1.6109,
"step": 415
},
{
"epoch": 0.4235174344616951,
"grad_norm": 7.729984760284424,
"learning_rate": 6.239715454715054e-05,
"loss": 2.0043,
"step": 416
},
{
"epoch": 0.42453550521761263,
"grad_norm": 6.723374366760254,
"learning_rate": 6.224149294174548e-05,
"loss": 1.7516,
"step": 417
},
{
"epoch": 0.42555357597353016,
"grad_norm": 7.92853307723999,
"learning_rate": 6.208570502101393e-05,
"loss": 2.0667,
"step": 418
},
{
"epoch": 0.4265716467294477,
"grad_norm": 8.561442375183105,
"learning_rate": 6.192979239247243e-05,
"loss": 2.3514,
"step": 419
},
{
"epoch": 0.42758971748536523,
"grad_norm": 6.065650463104248,
"learning_rate": 6.177375666492431e-05,
"loss": 1.6079,
"step": 420
},
{
"epoch": 0.42860778824128276,
"grad_norm": 7.547060489654541,
"learning_rate": 6.161759944844308e-05,
"loss": 1.6165,
"step": 421
},
{
"epoch": 0.4296258589972003,
"grad_norm": 7.750328540802002,
"learning_rate": 6.146132235435591e-05,
"loss": 2.6017,
"step": 422
},
{
"epoch": 0.4306439297531178,
"grad_norm": 8.826605796813965,
"learning_rate": 6.13049269952269e-05,
"loss": 2.8712,
"step": 423
},
{
"epoch": 0.43166200050903536,
"grad_norm": 9.169108390808105,
"learning_rate": 6.114841498484048e-05,
"loss": 3.1703,
"step": 424
},
{
"epoch": 0.4326800712649529,
"grad_norm": 10.452327728271484,
"learning_rate": 6.0991787938184784e-05,
"loss": 2.8737,
"step": 425
},
{
"epoch": 0.4336981420208704,
"grad_norm": 6.921334743499756,
"learning_rate": 6.0835047471434955e-05,
"loss": 2.1521,
"step": 426
},
{
"epoch": 0.434716212776788,
"grad_norm": 7.088666915893555,
"learning_rate": 6.067819520193645e-05,
"loss": 1.833,
"step": 427
},
{
"epoch": 0.43573428353270555,
"grad_norm": 7.354036331176758,
"learning_rate": 6.052123274818842e-05,
"loss": 2.0779,
"step": 428
},
{
"epoch": 0.4367523542886231,
"grad_norm": 6.732209205627441,
"learning_rate": 6.0364161729826905e-05,
"loss": 1.8935,
"step": 429
},
{
"epoch": 0.4377704250445406,
"grad_norm": 7.724286079406738,
"learning_rate": 6.020698376760824e-05,
"loss": 2.334,
"step": 430
},
{
"epoch": 0.43878849580045814,
"grad_norm": 7.147849082946777,
"learning_rate": 6.004970048339226e-05,
"loss": 1.9002,
"step": 431
},
{
"epoch": 0.4398065665563757,
"grad_norm": 10.327398300170898,
"learning_rate": 5.989231350012554e-05,
"loss": 3.3501,
"step": 432
},
{
"epoch": 0.4408246373122932,
"grad_norm": 8.666500091552734,
"learning_rate": 5.973482444182475e-05,
"loss": 2.4769,
"step": 433
},
{
"epoch": 0.44184270806821074,
"grad_norm": 10.457250595092773,
"learning_rate": 5.9577234933559764e-05,
"loss": 3.03,
"step": 434
},
{
"epoch": 0.4428607788241283,
"grad_norm": 11.054841995239258,
"learning_rate": 5.941954660143703e-05,
"loss": 2.6938,
"step": 435
},
{
"epoch": 0.4438788495800458,
"grad_norm": 8.820106506347656,
"learning_rate": 5.9261761072582655e-05,
"loss": 2.6282,
"step": 436
},
{
"epoch": 0.44489692033596334,
"grad_norm": 9.201048851013184,
"learning_rate": 5.910387997512573e-05,
"loss": 3.0258,
"step": 437
},
{
"epoch": 0.4459149910918809,
"grad_norm": 8.373756408691406,
"learning_rate": 5.8945904938181484e-05,
"loss": 2.0512,
"step": 438
},
{
"epoch": 0.4469330618477984,
"grad_norm": 7.493674278259277,
"learning_rate": 5.878783759183442e-05,
"loss": 2.3767,
"step": 439
},
{
"epoch": 0.44795113260371594,
"grad_norm": 8.552105903625488,
"learning_rate": 5.86296795671216e-05,
"loss": 2.4706,
"step": 440
},
{
"epoch": 0.4489692033596335,
"grad_norm": 7.708653450012207,
"learning_rate": 5.847143249601574e-05,
"loss": 2.2514,
"step": 441
},
{
"epoch": 0.449987274115551,
"grad_norm": 9.070602416992188,
"learning_rate": 5.8313098011408406e-05,
"loss": 2.322,
"step": 442
},
{
"epoch": 0.4510053448714686,
"grad_norm": 7.668588638305664,
"learning_rate": 5.8154677747093134e-05,
"loss": 1.8555,
"step": 443
},
{
"epoch": 0.4520234156273861,
"grad_norm": 9.315807342529297,
"learning_rate": 5.7996173337748606e-05,
"loss": 2.1384,
"step": 444
},
{
"epoch": 0.45304148638330366,
"grad_norm": 10.211068153381348,
"learning_rate": 5.783758641892172e-05,
"loss": 2.8774,
"step": 445
},
{
"epoch": 0.4540595571392212,
"grad_norm": 9.461262702941895,
"learning_rate": 5.767891862701082e-05,
"loss": 2.5156,
"step": 446
},
{
"epoch": 0.4550776278951387,
"grad_norm": 9.720065116882324,
"learning_rate": 5.7520171599248704e-05,
"loss": 2.6157,
"step": 447
},
{
"epoch": 0.45609569865105626,
"grad_norm": 8.700965881347656,
"learning_rate": 5.7361346973685794e-05,
"loss": 2.2904,
"step": 448
},
{
"epoch": 0.4571137694069738,
"grad_norm": 7.846927642822266,
"learning_rate": 5.7202446389173223e-05,
"loss": 1.968,
"step": 449
},
{
"epoch": 0.4581318401628913,
"grad_norm": 11.256888389587402,
"learning_rate": 5.704347148534589e-05,
"loss": 2.0931,
"step": 450
},
{
"epoch": 0.45914991091880886,
"grad_norm": 8.149317741394043,
"learning_rate": 5.688442390260559e-05,
"loss": 3.8319,
"step": 451
},
{
"epoch": 0.4601679816747264,
"grad_norm": 10.932345390319824,
"learning_rate": 5.672530528210405e-05,
"loss": 3.0198,
"step": 452
},
{
"epoch": 0.4611860524306439,
"grad_norm": 9.099678993225098,
"learning_rate": 5.6566117265726006e-05,
"loss": 3.3294,
"step": 453
},
{
"epoch": 0.46220412318656146,
"grad_norm": 11.72851848602295,
"learning_rate": 5.640686149607228e-05,
"loss": 4.2732,
"step": 454
},
{
"epoch": 0.463222193942479,
"grad_norm": 11.902469635009766,
"learning_rate": 5.624753961644281e-05,
"loss": 3.6108,
"step": 455
},
{
"epoch": 0.4642402646983965,
"grad_norm": 10.879993438720703,
"learning_rate": 5.608815327081969e-05,
"loss": 2.6874,
"step": 456
},
{
"epoch": 0.46525833545431405,
"grad_norm": 14.943718910217285,
"learning_rate": 5.5928704103850206e-05,
"loss": 4.6303,
"step": 457
},
{
"epoch": 0.4662764062102316,
"grad_norm": 10.216432571411133,
"learning_rate": 5.57691937608299e-05,
"loss": 2.4846,
"step": 458
},
{
"epoch": 0.4672944769661492,
"grad_norm": 8.506292343139648,
"learning_rate": 5.5609623887685535e-05,
"loss": 2.476,
"step": 459
},
{
"epoch": 0.4683125477220667,
"grad_norm": 8.514628410339355,
"learning_rate": 5.544999613095818e-05,
"loss": 2.377,
"step": 460
},
{
"epoch": 0.46933061847798424,
"grad_norm": 5.901019096374512,
"learning_rate": 5.5290312137786146e-05,
"loss": 1.5461,
"step": 461
},
{
"epoch": 0.4703486892339018,
"grad_norm": 5.75610876083374,
"learning_rate": 5.513057355588804e-05,
"loss": 1.4872,
"step": 462
},
{
"epoch": 0.4713667599898193,
"grad_norm": 5.7381272315979,
"learning_rate": 5.4970782033545774e-05,
"loss": 1.4357,
"step": 463
},
{
"epoch": 0.47238483074573684,
"grad_norm": 7.901957035064697,
"learning_rate": 5.4810939219587485e-05,
"loss": 1.8938,
"step": 464
},
{
"epoch": 0.47340290150165437,
"grad_norm": 5.663755893707275,
"learning_rate": 5.465104676337062e-05,
"loss": 1.2289,
"step": 465
},
{
"epoch": 0.4744209722575719,
"grad_norm": 7.933018684387207,
"learning_rate": 5.44911063147648e-05,
"loss": 2.2704,
"step": 466
},
{
"epoch": 0.47543904301348944,
"grad_norm": 7.943240165710449,
"learning_rate": 5.433111952413495e-05,
"loss": 2.3279,
"step": 467
},
{
"epoch": 0.47645711376940697,
"grad_norm": 6.650790691375732,
"learning_rate": 5.417108804232409e-05,
"loss": 2.2172,
"step": 468
},
{
"epoch": 0.4774751845253245,
"grad_norm": 7.444812774658203,
"learning_rate": 5.401101352063647e-05,
"loss": 2.2142,
"step": 469
},
{
"epoch": 0.47849325528124204,
"grad_norm": 7.4753642082214355,
"learning_rate": 5.3850897610820396e-05,
"loss": 1.757,
"step": 470
},
{
"epoch": 0.47951132603715957,
"grad_norm": 5.919788360595703,
"learning_rate": 5.369074196505125e-05,
"loss": 1.6481,
"step": 471
},
{
"epoch": 0.4805293967930771,
"grad_norm": 8.422210693359375,
"learning_rate": 5.3530548235914454e-05,
"loss": 2.157,
"step": 472
},
{
"epoch": 0.48154746754899463,
"grad_norm": 6.410617351531982,
"learning_rate": 5.33703180763884e-05,
"loss": 1.8347,
"step": 473
},
{
"epoch": 0.48256553830491217,
"grad_norm": 6.645679473876953,
"learning_rate": 5.321005313982738e-05,
"loss": 1.9199,
"step": 474
},
{
"epoch": 0.4835836090608297,
"grad_norm": 8.815511703491211,
"learning_rate": 5.3049755079944527e-05,
"loss": 1.9625,
"step": 475
},
{
"epoch": 0.4846016798167473,
"grad_norm": 6.998859882354736,
"learning_rate": 5.288942555079479e-05,
"loss": 1.9826,
"step": 476
},
{
"epoch": 0.4856197505726648,
"grad_norm": 9.812437057495117,
"learning_rate": 5.272906620675779e-05,
"loss": 2.8627,
"step": 477
},
{
"epoch": 0.48663782132858235,
"grad_norm": 6.514684677124023,
"learning_rate": 5.256867870252087e-05,
"loss": 1.8711,
"step": 478
},
{
"epoch": 0.4876558920844999,
"grad_norm": 8.101947784423828,
"learning_rate": 5.240826469306187e-05,
"loss": 2.2444,
"step": 479
},
{
"epoch": 0.4886739628404174,
"grad_norm": 7.826203346252441,
"learning_rate": 5.224782583363215e-05,
"loss": 2.3479,
"step": 480
},
{
"epoch": 0.48969203359633495,
"grad_norm": 7.306244850158691,
"learning_rate": 5.208736377973954e-05,
"loss": 2.2877,
"step": 481
},
{
"epoch": 0.4907101043522525,
"grad_norm": 10.923294067382812,
"learning_rate": 5.192688018713113e-05,
"loss": 3.4528,
"step": 482
},
{
"epoch": 0.49172817510817,
"grad_norm": 7.382264614105225,
"learning_rate": 5.176637671177631e-05,
"loss": 2.1969,
"step": 483
},
{
"epoch": 0.49274624586408755,
"grad_norm": 10.03930377960205,
"learning_rate": 5.1605855009849614e-05,
"loss": 3.5883,
"step": 484
},
{
"epoch": 0.4937643166200051,
"grad_norm": 7.927920818328857,
"learning_rate": 5.144531673771363e-05,
"loss": 2.3655,
"step": 485
},
{
"epoch": 0.4947823873759226,
"grad_norm": 8.499062538146973,
"learning_rate": 5.1284763551901995e-05,
"loss": 2.5173,
"step": 486
},
{
"epoch": 0.49580045813184015,
"grad_norm": 6.909058094024658,
"learning_rate": 5.112419710910213e-05,
"loss": 2.0323,
"step": 487
},
{
"epoch": 0.4968185288877577,
"grad_norm": 8.88456916809082,
"learning_rate": 5.096361906613836e-05,
"loss": 2.6987,
"step": 488
},
{
"epoch": 0.4978365996436752,
"grad_norm": 7.5175580978393555,
"learning_rate": 5.080303107995461e-05,
"loss": 2.1691,
"step": 489
},
{
"epoch": 0.49885467039959275,
"grad_norm": 7.615448951721191,
"learning_rate": 5.064243480759748e-05,
"loss": 2.0365,
"step": 490
},
{
"epoch": 0.4998727411555103,
"grad_norm": 9.251805305480957,
"learning_rate": 5.048183190619904e-05,
"loss": 2.2146,
"step": 491
},
{
"epoch": 0.5008908119114278,
"grad_norm": 7.876194477081299,
"learning_rate": 5.032122403295977e-05,
"loss": 2.3439,
"step": 492
},
{
"epoch": 0.5019088826673453,
"grad_norm": 8.969354629516602,
"learning_rate": 5.0160612845131414e-05,
"loss": 2.5224,
"step": 493
},
{
"epoch": 0.5029269534232629,
"grad_norm": 7.22929573059082,
"learning_rate": 5e-05,
"loss": 1.6864,
"step": 494
},
{
"epoch": 0.5039450241791804,
"grad_norm": 7.851796627044678,
"learning_rate": 4.9839387154868584e-05,
"loss": 2.0388,
"step": 495
},
{
"epoch": 0.504963094935098,
"grad_norm": 9.29887580871582,
"learning_rate": 4.967877596704025e-05,
"loss": 2.6501,
"step": 496
},
{
"epoch": 0.5059811656910155,
"grad_norm": 8.79398250579834,
"learning_rate": 4.951816809380097e-05,
"loss": 2.5759,
"step": 497
},
{
"epoch": 0.506999236446933,
"grad_norm": 10.286650657653809,
"learning_rate": 4.9357565192402525e-05,
"loss": 3.1672,
"step": 498
},
{
"epoch": 0.5080173072028505,
"grad_norm": 8.799942016601562,
"learning_rate": 4.919696892004539e-05,
"loss": 1.9062,
"step": 499
},
{
"epoch": 0.5090353779587682,
"grad_norm": 8.252535820007324,
"learning_rate": 4.903638093386167e-05,
"loss": 1.6945,
"step": 500
},
{
"epoch": 0.5100534487146857,
"grad_norm": 5.307441234588623,
"learning_rate": 4.887580289089787e-05,
"loss": 1.7627,
"step": 501
},
{
"epoch": 0.5110715194706033,
"grad_norm": 7.124861717224121,
"learning_rate": 4.8715236448098016e-05,
"loss": 2.5286,
"step": 502
},
{
"epoch": 0.5120895902265208,
"grad_norm": 10.368796348571777,
"learning_rate": 4.855468326228638e-05,
"loss": 3.6972,
"step": 503
},
{
"epoch": 0.5131076609824383,
"grad_norm": 10.741311073303223,
"learning_rate": 4.8394144990150404e-05,
"loss": 4.1789,
"step": 504
},
{
"epoch": 0.5141257317383559,
"grad_norm": 10.585883140563965,
"learning_rate": 4.8233623288223704e-05,
"loss": 3.5806,
"step": 505
},
{
"epoch": 0.5151438024942734,
"grad_norm": 13.6077880859375,
"learning_rate": 4.807311981286888e-05,
"loss": 3.5391,
"step": 506
},
{
"epoch": 0.5161618732501909,
"grad_norm": 16.34162712097168,
"learning_rate": 4.7912636220260473e-05,
"loss": 3.6142,
"step": 507
},
{
"epoch": 0.5171799440061084,
"grad_norm": 11.734464645385742,
"learning_rate": 4.775217416636786e-05,
"loss": 2.6898,
"step": 508
},
{
"epoch": 0.518198014762026,
"grad_norm": 7.3565287590026855,
"learning_rate": 4.759173530693814e-05,
"loss": 2.0978,
"step": 509
},
{
"epoch": 0.5192160855179435,
"grad_norm": 5.852792739868164,
"learning_rate": 4.7431321297479135e-05,
"loss": 1.4297,
"step": 510
},
{
"epoch": 0.520234156273861,
"grad_norm": 6.781408786773682,
"learning_rate": 4.727093379324222e-05,
"loss": 1.5329,
"step": 511
},
{
"epoch": 0.5212522270297786,
"grad_norm": 11.24429702758789,
"learning_rate": 4.711057444920522e-05,
"loss": 1.6744,
"step": 512
},
{
"epoch": 0.5222702977856961,
"grad_norm": 7.679388046264648,
"learning_rate": 4.695024492005548e-05,
"loss": 2.2356,
"step": 513
},
{
"epoch": 0.5232883685416136,
"grad_norm": 5.93134069442749,
"learning_rate": 4.6789946860172634e-05,
"loss": 1.2665,
"step": 514
},
{
"epoch": 0.5243064392975312,
"grad_norm": 6.789477348327637,
"learning_rate": 4.6629681923611603e-05,
"loss": 1.6466,
"step": 515
},
{
"epoch": 0.5253245100534487,
"grad_norm": 8.315037727355957,
"learning_rate": 4.646945176408555e-05,
"loss": 2.065,
"step": 516
},
{
"epoch": 0.5263425808093662,
"grad_norm": 5.443754196166992,
"learning_rate": 4.630925803494877e-05,
"loss": 1.0138,
"step": 517
},
{
"epoch": 0.5273606515652838,
"grad_norm": 6.596680641174316,
"learning_rate": 4.6149102389179635e-05,
"loss": 1.8229,
"step": 518
},
{
"epoch": 0.5283787223212013,
"grad_norm": 5.735509872436523,
"learning_rate": 4.598898647936354e-05,
"loss": 1.5016,
"step": 519
},
{
"epoch": 0.5293967930771188,
"grad_norm": 7.154899597167969,
"learning_rate": 4.58289119576759e-05,
"loss": 1.7249,
"step": 520
},
{
"epoch": 0.5304148638330364,
"grad_norm": 5.887238502502441,
"learning_rate": 4.566888047586507e-05,
"loss": 1.3531,
"step": 521
},
{
"epoch": 0.5314329345889539,
"grad_norm": 7.944952964782715,
"learning_rate": 4.55088936852352e-05,
"loss": 2.0604,
"step": 522
},
{
"epoch": 0.5324510053448714,
"grad_norm": 8.235894203186035,
"learning_rate": 4.5348953236629395e-05,
"loss": 1.782,
"step": 523
},
{
"epoch": 0.533469076100789,
"grad_norm": 9.824324607849121,
"learning_rate": 4.518906078041252e-05,
"loss": 3.1078,
"step": 524
},
{
"epoch": 0.5344871468567065,
"grad_norm": 8.053499221801758,
"learning_rate": 4.502921796645424e-05,
"loss": 2.5225,
"step": 525
},
{
"epoch": 0.535505217612624,
"grad_norm": 9.53549861907959,
"learning_rate": 4.486942644411197e-05,
"loss": 3.0847,
"step": 526
},
{
"epoch": 0.5365232883685416,
"grad_norm": 8.427640914916992,
"learning_rate": 4.4709687862213866e-05,
"loss": 2.1704,
"step": 527
},
{
"epoch": 0.5375413591244591,
"grad_norm": 7.989354610443115,
"learning_rate": 4.4550003869041845e-05,
"loss": 2.3719,
"step": 528
},
{
"epoch": 0.5385594298803766,
"grad_norm": 7.53865909576416,
"learning_rate": 4.439037611231448e-05,
"loss": 2.4358,
"step": 529
},
{
"epoch": 0.5395775006362942,
"grad_norm": 9.102818489074707,
"learning_rate": 4.423080623917012e-05,
"loss": 3.0774,
"step": 530
},
{
"epoch": 0.5405955713922117,
"grad_norm": 10.17009162902832,
"learning_rate": 4.407129589614979e-05,
"loss": 2.719,
"step": 531
},
{
"epoch": 0.5416136421481293,
"grad_norm": 8.132767677307129,
"learning_rate": 4.3911846729180335e-05,
"loss": 2.6276,
"step": 532
},
{
"epoch": 0.5426317129040469,
"grad_norm": 8.669943809509277,
"learning_rate": 4.3752460383557195e-05,
"loss": 2.2211,
"step": 533
},
{
"epoch": 0.5436497836599644,
"grad_norm": 8.190427780151367,
"learning_rate": 4.359313850392772e-05,
"loss": 2.2451,
"step": 534
},
{
"epoch": 0.544667854415882,
"grad_norm": 7.185608386993408,
"learning_rate": 4.3433882734274e-05,
"loss": 1.938,
"step": 535
},
{
"epoch": 0.5456859251717995,
"grad_norm": 9.735365867614746,
"learning_rate": 4.327469471789597e-05,
"loss": 3.3738,
"step": 536
},
{
"epoch": 0.546703995927717,
"grad_norm": 9.06591796875,
"learning_rate": 4.311557609739442e-05,
"loss": 3.4894,
"step": 537
},
{
"epoch": 0.5477220666836345,
"grad_norm": 8.038829803466797,
"learning_rate": 4.295652851465412e-05,
"loss": 2.6487,
"step": 538
},
{
"epoch": 0.5487401374395521,
"grad_norm": 7.375051498413086,
"learning_rate": 4.27975536108268e-05,
"loss": 2.4853,
"step": 539
},
{
"epoch": 0.5497582081954696,
"grad_norm": 9.910839080810547,
"learning_rate": 4.2638653026314224e-05,
"loss": 3.1606,
"step": 540
},
{
"epoch": 0.5507762789513871,
"grad_norm": 7.77678918838501,
"learning_rate": 4.24798284007513e-05,
"loss": 2.33,
"step": 541
},
{
"epoch": 0.5517943497073047,
"grad_norm": 7.377612113952637,
"learning_rate": 4.232108137298919e-05,
"loss": 2.299,
"step": 542
},
{
"epoch": 0.5528124204632222,
"grad_norm": 9.510624885559082,
"learning_rate": 4.216241358107831e-05,
"loss": 2.8467,
"step": 543
},
{
"epoch": 0.5538304912191397,
"grad_norm": 6.834048748016357,
"learning_rate": 4.200382666225141e-05,
"loss": 2.0166,
"step": 544
},
{
"epoch": 0.5548485619750573,
"grad_norm": 8.245951652526855,
"learning_rate": 4.1845322252906864e-05,
"loss": 2.672,
"step": 545
},
{
"epoch": 0.5558666327309748,
"grad_norm": 7.539649963378906,
"learning_rate": 4.16869019885916e-05,
"loss": 2.3618,
"step": 546
},
{
"epoch": 0.5568847034868923,
"grad_norm": 7.983175754547119,
"learning_rate": 4.152856750398426e-05,
"loss": 2.2049,
"step": 547
},
{
"epoch": 0.5579027742428099,
"grad_norm": 8.641951560974121,
"learning_rate": 4.1370320432878404e-05,
"loss": 2.2235,
"step": 548
},
{
"epoch": 0.5589208449987274,
"grad_norm": 7.9181437492370605,
"learning_rate": 4.1212162408165595e-05,
"loss": 1.9295,
"step": 549
},
{
"epoch": 0.5599389157546449,
"grad_norm": 10.45153522491455,
"learning_rate": 4.105409506181854e-05,
"loss": 2.1553,
"step": 550
},
{
"epoch": 0.5609569865105625,
"grad_norm": 6.951171398162842,
"learning_rate": 4.0896120024874286e-05,
"loss": 2.5913,
"step": 551
},
{
"epoch": 0.56197505726648,
"grad_norm": 9.614657402038574,
"learning_rate": 4.073823892741735e-05,
"loss": 4.2435,
"step": 552
},
{
"epoch": 0.5629931280223975,
"grad_norm": 9.35623550415039,
"learning_rate": 4.0580453398563e-05,
"loss": 3.7123,
"step": 553
},
{
"epoch": 0.5640111987783151,
"grad_norm": 10.756424903869629,
"learning_rate": 4.042276506644024e-05,
"loss": 3.6713,
"step": 554
},
{
"epoch": 0.5650292695342326,
"grad_norm": 9.823023796081543,
"learning_rate": 4.0265175558175265e-05,
"loss": 3.7602,
"step": 555
},
{
"epoch": 0.5660473402901501,
"grad_norm": 13.360715866088867,
"learning_rate": 4.0107686499874465e-05,
"loss": 3.269,
"step": 556
},
{
"epoch": 0.5670654110460677,
"grad_norm": 14.194052696228027,
"learning_rate": 3.9950299516607766e-05,
"loss": 4.3906,
"step": 557
},
{
"epoch": 0.5680834818019852,
"grad_norm": 16.591251373291016,
"learning_rate": 3.979301623239177e-05,
"loss": 4.4802,
"step": 558
},
{
"epoch": 0.5691015525579027,
"grad_norm": 6.6096720695495605,
"learning_rate": 3.9635838270173107e-05,
"loss": 1.6842,
"step": 559
},
{
"epoch": 0.5701196233138203,
"grad_norm": 6.252510070800781,
"learning_rate": 3.94787672518116e-05,
"loss": 1.6248,
"step": 560
},
{
"epoch": 0.5711376940697378,
"grad_norm": 7.445550441741943,
"learning_rate": 3.9321804798063565e-05,
"loss": 1.7234,
"step": 561
},
{
"epoch": 0.5721557648256553,
"grad_norm": 5.321173191070557,
"learning_rate": 3.9164952528565057e-05,
"loss": 1.2454,
"step": 562
},
{
"epoch": 0.5731738355815729,
"grad_norm": 4.566540241241455,
"learning_rate": 3.900821206181521e-05,
"loss": 1.0588,
"step": 563
},
{
"epoch": 0.5741919063374905,
"grad_norm": 8.349088668823242,
"learning_rate": 3.8851585015159536e-05,
"loss": 1.5751,
"step": 564
},
{
"epoch": 0.575209977093408,
"grad_norm": 6.328129291534424,
"learning_rate": 3.8695073004773106e-05,
"loss": 2.025,
"step": 565
},
{
"epoch": 0.5762280478493256,
"grad_norm": 8.211170196533203,
"learning_rate": 3.8538677645644096e-05,
"loss": 1.1548,
"step": 566
},
{
"epoch": 0.5772461186052431,
"grad_norm": 5.518578052520752,
"learning_rate": 3.838240055155692e-05,
"loss": 1.2809,
"step": 567
},
{
"epoch": 0.5782641893611606,
"grad_norm": 6.383520603179932,
"learning_rate": 3.822624333507571e-05,
"loss": 1.8485,
"step": 568
},
{
"epoch": 0.5792822601170782,
"grad_norm": 5.425829887390137,
"learning_rate": 3.8070207607527584e-05,
"loss": 1.4567,
"step": 569
},
{
"epoch": 0.5803003308729957,
"grad_norm": 8.478185653686523,
"learning_rate": 3.791429497898608e-05,
"loss": 2.0052,
"step": 570
},
{
"epoch": 0.5813184016289132,
"grad_norm": 8.863068580627441,
"learning_rate": 3.775850705825454e-05,
"loss": 2.2554,
"step": 571
},
{
"epoch": 0.5823364723848308,
"grad_norm": 5.8295183181762695,
"learning_rate": 3.7602845452849463e-05,
"loss": 1.2544,
"step": 572
},
{
"epoch": 0.5833545431407483,
"grad_norm": 8.446788787841797,
"learning_rate": 3.7447311768983964e-05,
"loss": 2.4702,
"step": 573
},
{
"epoch": 0.5843726138966658,
"grad_norm": 7.7443766593933105,
"learning_rate": 3.7291907611551195e-05,
"loss": 2.0707,
"step": 574
},
{
"epoch": 0.5853906846525834,
"grad_norm": 8.347147941589355,
"learning_rate": 3.713663458410779e-05,
"loss": 1.659,
"step": 575
},
{
"epoch": 0.5864087554085009,
"grad_norm": 7.487883567810059,
"learning_rate": 3.69814942888572e-05,
"loss": 2.0328,
"step": 576
},
{
"epoch": 0.5874268261644184,
"grad_norm": 7.8575286865234375,
"learning_rate": 3.682648832663339e-05,
"loss": 1.8928,
"step": 577
},
{
"epoch": 0.588444896920336,
"grad_norm": 8.947505950927734,
"learning_rate": 3.6671618296884146e-05,
"loss": 1.6774,
"step": 578
},
{
"epoch": 0.5894629676762535,
"grad_norm": 5.097304821014404,
"learning_rate": 3.6516885797654594e-05,
"loss": 1.3306,
"step": 579
},
{
"epoch": 0.590481038432171,
"grad_norm": 6.418907642364502,
"learning_rate": 3.636229242557075e-05,
"loss": 1.9186,
"step": 580
},
{
"epoch": 0.5914991091880886,
"grad_norm": 7.3138346672058105,
"learning_rate": 3.620783977582305e-05,
"loss": 2.4993,
"step": 581
},
{
"epoch": 0.5925171799440061,
"grad_norm": 7.914095878601074,
"learning_rate": 3.605352944214986e-05,
"loss": 2.078,
"step": 582
},
{
"epoch": 0.5935352506999236,
"grad_norm": 10.451981544494629,
"learning_rate": 3.5899363016821e-05,
"loss": 2.5348,
"step": 583
},
{
"epoch": 0.5945533214558412,
"grad_norm": 6.191624164581299,
"learning_rate": 3.5745342090621405e-05,
"loss": 1.6607,
"step": 584
},
{
"epoch": 0.5955713922117587,
"grad_norm": 7.947683811187744,
"learning_rate": 3.559146825283465e-05,
"loss": 2.4664,
"step": 585
},
{
"epoch": 0.5965894629676762,
"grad_norm": 7.410199165344238,
"learning_rate": 3.5437743091226565e-05,
"loss": 2.0212,
"step": 586
},
{
"epoch": 0.5976075337235938,
"grad_norm": 8.705409049987793,
"learning_rate": 3.528416819202881e-05,
"loss": 2.2274,
"step": 587
},
{
"epoch": 0.5986256044795113,
"grad_norm": 7.487548351287842,
"learning_rate": 3.5130745139922574e-05,
"loss": 2.104,
"step": 588
},
{
"epoch": 0.5996436752354288,
"grad_norm": 8.788456916809082,
"learning_rate": 3.497747551802221e-05,
"loss": 2.5106,
"step": 589
},
{
"epoch": 0.6006617459913464,
"grad_norm": 7.41387939453125,
"learning_rate": 3.482436090785882e-05,
"loss": 2.1219,
"step": 590
},
{
"epoch": 0.6016798167472639,
"grad_norm": 6.481340408325195,
"learning_rate": 3.467140288936407e-05,
"loss": 1.9451,
"step": 591
},
{
"epoch": 0.6026978875031814,
"grad_norm": 7.278069496154785,
"learning_rate": 3.451860304085378e-05,
"loss": 1.8661,
"step": 592
},
{
"epoch": 0.603715958259099,
"grad_norm": 8.016121864318848,
"learning_rate": 3.43659629390117e-05,
"loss": 1.9884,
"step": 593
},
{
"epoch": 0.6047340290150165,
"grad_norm": 8.917866706848145,
"learning_rate": 3.421348415887315e-05,
"loss": 2.6266,
"step": 594
},
{
"epoch": 0.605752099770934,
"grad_norm": 9.271273612976074,
"learning_rate": 3.406116827380889e-05,
"loss": 2.6668,
"step": 595
},
{
"epoch": 0.6067701705268517,
"grad_norm": 7.660860061645508,
"learning_rate": 3.390901685550887e-05,
"loss": 2.373,
"step": 596
},
{
"epoch": 0.6077882412827692,
"grad_norm": 7.496829032897949,
"learning_rate": 3.375703147396583e-05,
"loss": 2.137,
"step": 597
},
{
"epoch": 0.6088063120386867,
"grad_norm": 10.63588809967041,
"learning_rate": 3.360521369745937e-05,
"loss": 2.0113,
"step": 598
},
{
"epoch": 0.6098243827946043,
"grad_norm": 8.661003112792969,
"learning_rate": 3.345356509253959e-05,
"loss": 2.202,
"step": 599
},
{
"epoch": 0.6108424535505218,
"grad_norm": 6.928518295288086,
"learning_rate": 3.330208722401097e-05,
"loss": 1.6603,
"step": 600
},
{
"epoch": 0.6118605243064393,
"grad_norm": 5.956086158752441,
"learning_rate": 3.315078165491622e-05,
"loss": 2.2319,
"step": 601
},
{
"epoch": 0.6128785950623569,
"grad_norm": 9.131757736206055,
"learning_rate": 3.2999649946520174e-05,
"loss": 3.3601,
"step": 602
},
{
"epoch": 0.6138966658182744,
"grad_norm": 8.110289573669434,
"learning_rate": 3.2848693658293675e-05,
"loss": 2.8758,
"step": 603
},
{
"epoch": 0.6149147365741919,
"grad_norm": 11.287444114685059,
"learning_rate": 3.2697914347897406e-05,
"loss": 4.129,
"step": 604
},
{
"epoch": 0.6159328073301095,
"grad_norm": 10.69924259185791,
"learning_rate": 3.254731357116597e-05,
"loss": 4.2776,
"step": 605
},
{
"epoch": 0.616950878086027,
"grad_norm": 9.89280891418457,
"learning_rate": 3.239689288209168e-05,
"loss": 3.1346,
"step": 606
},
{
"epoch": 0.6179689488419445,
"grad_norm": 11.832335472106934,
"learning_rate": 3.224665383280867e-05,
"loss": 3.4148,
"step": 607
},
{
"epoch": 0.6189870195978621,
"grad_norm": 13.277129173278809,
"learning_rate": 3.2096597973576694e-05,
"loss": 3.4906,
"step": 608
},
{
"epoch": 0.6200050903537796,
"grad_norm": 6.8787994384765625,
"learning_rate": 3.194672685276532e-05,
"loss": 1.4383,
"step": 609
},
{
"epoch": 0.6210231611096971,
"grad_norm": 5.783747673034668,
"learning_rate": 3.179704201683786e-05,
"loss": 1.3518,
"step": 610
},
{
"epoch": 0.6220412318656147,
"grad_norm": 5.462782859802246,
"learning_rate": 3.16475450103354e-05,
"loss": 1.249,
"step": 611
},
{
"epoch": 0.6230593026215322,
"grad_norm": 5.050539016723633,
"learning_rate": 3.1498237375860886e-05,
"loss": 1.1348,
"step": 612
},
{
"epoch": 0.6240773733774497,
"grad_norm": 8.341720581054688,
"learning_rate": 3.1349120654063225e-05,
"loss": 1.7345,
"step": 613
},
{
"epoch": 0.6250954441333673,
"grad_norm": 4.832444190979004,
"learning_rate": 3.120019638362136e-05,
"loss": 1.0501,
"step": 614
},
{
"epoch": 0.6261135148892848,
"grad_norm": 7.373495578765869,
"learning_rate": 3.1051466101228385e-05,
"loss": 1.7428,
"step": 615
},
{
"epoch": 0.6271315856452023,
"grad_norm": 5.6345319747924805,
"learning_rate": 3.090293134157572e-05,
"loss": 1.2435,
"step": 616
},
{
"epoch": 0.6281496564011199,
"grad_norm": 6.5224609375,
"learning_rate": 3.0754593637337276e-05,
"loss": 1.4176,
"step": 617
},
{
"epoch": 0.6291677271570374,
"grad_norm": 8.80791187286377,
"learning_rate": 3.06064545191536e-05,
"loss": 2.4285,
"step": 618
},
{
"epoch": 0.6301857979129549,
"grad_norm": 9.331201553344727,
"learning_rate": 3.0458515515616115e-05,
"loss": 2.7192,
"step": 619
},
{
"epoch": 0.6312038686688725,
"grad_norm": 9.033586502075195,
"learning_rate": 3.0310778153251324e-05,
"loss": 1.8652,
"step": 620
},
{
"epoch": 0.63222193942479,
"grad_norm": 6.689144134521484,
"learning_rate": 3.0163243956505095e-05,
"loss": 1.5773,
"step": 621
},
{
"epoch": 0.6332400101807075,
"grad_norm": 8.037043571472168,
"learning_rate": 3.0015914447726867e-05,
"loss": 2.3296,
"step": 622
},
{
"epoch": 0.6342580809366251,
"grad_norm": 7.927774906158447,
"learning_rate": 2.986879114715403e-05,
"loss": 2.2707,
"step": 623
},
{
"epoch": 0.6352761516925426,
"grad_norm": 5.514461994171143,
"learning_rate": 2.9721875572896157e-05,
"loss": 1.7974,
"step": 624
},
{
"epoch": 0.6362942224484601,
"grad_norm": 7.439801216125488,
"learning_rate": 2.95751692409194e-05,
"loss": 2.1823,
"step": 625
},
{
"epoch": 0.6373122932043777,
"grad_norm": 7.419183731079102,
"learning_rate": 2.942867366503077e-05,
"loss": 2.1965,
"step": 626
},
{
"epoch": 0.6383303639602952,
"grad_norm": 5.545042037963867,
"learning_rate": 2.9282390356862606e-05,
"loss": 1.4957,
"step": 627
},
{
"epoch": 0.6393484347162128,
"grad_norm": 11.62447738647461,
"learning_rate": 2.9136320825856967e-05,
"loss": 3.3109,
"step": 628
},
{
"epoch": 0.6403665054721304,
"grad_norm": 8.367134094238281,
"learning_rate": 2.899046657924992e-05,
"loss": 2.2194,
"step": 629
},
{
"epoch": 0.6413845762280479,
"grad_norm": 10.391725540161133,
"learning_rate": 2.884482912205621e-05,
"loss": 2.0195,
"step": 630
},
{
"epoch": 0.6424026469839654,
"grad_norm": 8.217406272888184,
"learning_rate": 2.8699409957053535e-05,
"loss": 2.4132,
"step": 631
},
{
"epoch": 0.643420717739883,
"grad_norm": 8.29297161102295,
"learning_rate": 2.855421058476719e-05,
"loss": 2.4454,
"step": 632
},
{
"epoch": 0.6444387884958005,
"grad_norm": 8.815670013427734,
"learning_rate": 2.840923250345442e-05,
"loss": 2.5413,
"step": 633
},
{
"epoch": 0.645456859251718,
"grad_norm": 8.5559720993042,
"learning_rate": 2.8264477209089145e-05,
"loss": 2.7664,
"step": 634
},
{
"epoch": 0.6464749300076356,
"grad_norm": 8.682782173156738,
"learning_rate": 2.8119946195346375e-05,
"loss": 2.5312,
"step": 635
},
{
"epoch": 0.6474930007635531,
"grad_norm": 11.519887924194336,
"learning_rate": 2.7975640953586846e-05,
"loss": 2.9688,
"step": 636
},
{
"epoch": 0.6485110715194706,
"grad_norm": 8.966607093811035,
"learning_rate": 2.7831562972841696e-05,
"loss": 2.7022,
"step": 637
},
{
"epoch": 0.6495291422753882,
"grad_norm": 8.183965682983398,
"learning_rate": 2.768771373979697e-05,
"loss": 2.3317,
"step": 638
},
{
"epoch": 0.6505472130313057,
"grad_norm": 8.993667602539062,
"learning_rate": 2.7544094738778436e-05,
"loss": 2.7296,
"step": 639
},
{
"epoch": 0.6515652837872232,
"grad_norm": 7.731354713439941,
"learning_rate": 2.74007074517361e-05,
"loss": 1.9501,
"step": 640
},
{
"epoch": 0.6525833545431408,
"grad_norm": 6.967146396636963,
"learning_rate": 2.7257553358229034e-05,
"loss": 1.8838,
"step": 641
},
{
"epoch": 0.6536014252990583,
"grad_norm": 6.557554244995117,
"learning_rate": 2.7114633935410085e-05,
"loss": 1.7431,
"step": 642
},
{
"epoch": 0.6546194960549758,
"grad_norm": 10.207218170166016,
"learning_rate": 2.6971950658010666e-05,
"loss": 2.4966,
"step": 643
},
{
"epoch": 0.6556375668108934,
"grad_norm": 7.477417469024658,
"learning_rate": 2.682950499832535e-05,
"loss": 2.1944,
"step": 644
},
{
"epoch": 0.6566556375668109,
"grad_norm": 10.127610206604004,
"learning_rate": 2.6687298426196973e-05,
"loss": 2.6473,
"step": 645
},
{
"epoch": 0.6576737083227284,
"grad_norm": 6.374731540679932,
"learning_rate": 2.6545332409001265e-05,
"loss": 1.8528,
"step": 646
},
{
"epoch": 0.658691779078646,
"grad_norm": 6.7048444747924805,
"learning_rate": 2.6403608411631742e-05,
"loss": 1.7493,
"step": 647
},
{
"epoch": 0.6597098498345635,
"grad_norm": 7.112037181854248,
"learning_rate": 2.6262127896484602e-05,
"loss": 2.0421,
"step": 648
},
{
"epoch": 0.660727920590481,
"grad_norm": 8.483193397521973,
"learning_rate": 2.612089232344371e-05,
"loss": 1.91,
"step": 649
},
{
"epoch": 0.6617459913463986,
"grad_norm": 10.052485466003418,
"learning_rate": 2.5979903149865387e-05,
"loss": 2.0998,
"step": 650
},
{
"epoch": 0.6627640621023161,
"grad_norm": 8.01032543182373,
"learning_rate": 2.5839161830563474e-05,
"loss": 2.5145,
"step": 651
},
{
"epoch": 0.6637821328582336,
"grad_norm": 9.746928215026855,
"learning_rate": 2.569866981779433e-05,
"loss": 3.3683,
"step": 652
},
{
"epoch": 0.6648002036141512,
"grad_norm": 8.607123374938965,
"learning_rate": 2.555842856124182e-05,
"loss": 2.9144,
"step": 653
},
{
"epoch": 0.6658182743700687,
"grad_norm": 10.463346481323242,
"learning_rate": 2.5418439508002258e-05,
"loss": 3.9062,
"step": 654
},
{
"epoch": 0.6668363451259862,
"grad_norm": 9.336942672729492,
"learning_rate": 2.5278704102569662e-05,
"loss": 3.3966,
"step": 655
},
{
"epoch": 0.6678544158819038,
"grad_norm": 10.415209770202637,
"learning_rate": 2.5139223786820747e-05,
"loss": 3.7271,
"step": 656
},
{
"epoch": 0.6688724866378213,
"grad_norm": 14.631210327148438,
"learning_rate": 2.500000000000001e-05,
"loss": 3.7071,
"step": 657
},
{
"epoch": 0.6698905573937388,
"grad_norm": 13.001562118530273,
"learning_rate": 2.486103417870493e-05,
"loss": 3.214,
"step": 658
},
{
"epoch": 0.6709086281496563,
"grad_norm": 11.307893753051758,
"learning_rate": 2.472232775687119e-05,
"loss": 2.8893,
"step": 659
},
{
"epoch": 0.6719266989055739,
"grad_norm": 7.8647379875183105,
"learning_rate": 2.4583882165757766e-05,
"loss": 2.0442,
"step": 660
},
{
"epoch": 0.6729447696614915,
"grad_norm": 5.790807247161865,
"learning_rate": 2.4445698833932234e-05,
"loss": 1.3228,
"step": 661
},
{
"epoch": 0.6739628404174091,
"grad_norm": 5.694929599761963,
"learning_rate": 2.4307779187256064e-05,
"loss": 1.3618,
"step": 662
},
{
"epoch": 0.6749809111733266,
"grad_norm": 5.114007949829102,
"learning_rate": 2.417012464886978e-05,
"loss": 1.2137,
"step": 663
},
{
"epoch": 0.6759989819292441,
"grad_norm": 7.429940223693848,
"learning_rate": 2.4032736639178444e-05,
"loss": 1.8593,
"step": 664
},
{
"epoch": 0.6770170526851617,
"grad_norm": 5.101173400878906,
"learning_rate": 2.389561657583681e-05,
"loss": 0.9669,
"step": 665
},
{
"epoch": 0.6780351234410792,
"grad_norm": 7.89351224899292,
"learning_rate": 2.3758765873734896e-05,
"loss": 1.8615,
"step": 666
},
{
"epoch": 0.6790531941969967,
"grad_norm": 7.043496608734131,
"learning_rate": 2.3622185944983188e-05,
"loss": 1.7828,
"step": 667
},
{
"epoch": 0.6800712649529143,
"grad_norm": 7.9154510498046875,
"learning_rate": 2.3485878198898252e-05,
"loss": 2.2469,
"step": 668
},
{
"epoch": 0.6810893357088318,
"grad_norm": 6.627047061920166,
"learning_rate": 2.3349844041988045e-05,
"loss": 1.5789,
"step": 669
},
{
"epoch": 0.6821074064647493,
"grad_norm": 5.884915828704834,
"learning_rate": 2.3214084877937464e-05,
"loss": 1.5281,
"step": 670
},
{
"epoch": 0.6831254772206669,
"grad_norm": 6.640014171600342,
"learning_rate": 2.30786021075939e-05,
"loss": 1.4942,
"step": 671
},
{
"epoch": 0.6841435479765844,
"grad_norm": 6.866456985473633,
"learning_rate": 2.294339712895271e-05,
"loss": 1.674,
"step": 672
},
{
"epoch": 0.6851616187325019,
"grad_norm": 6.7534990310668945,
"learning_rate": 2.28084713371428e-05,
"loss": 1.3313,
"step": 673
},
{
"epoch": 0.6861796894884195,
"grad_norm": 6.38292121887207,
"learning_rate": 2.2673826124412312e-05,
"loss": 1.6016,
"step": 674
},
{
"epoch": 0.687197760244337,
"grad_norm": 7.129096031188965,
"learning_rate": 2.2539462880114194e-05,
"loss": 1.8662,
"step": 675
},
{
"epoch": 0.6882158310002545,
"grad_norm": 6.555764675140381,
"learning_rate": 2.240538299069178e-05,
"loss": 1.9315,
"step": 676
},
{
"epoch": 0.689233901756172,
"grad_norm": 5.772182941436768,
"learning_rate": 2.2271587839664672e-05,
"loss": 1.3156,
"step": 677
},
{
"epoch": 0.6902519725120896,
"grad_norm": 7.608791351318359,
"learning_rate": 2.213807880761434e-05,
"loss": 1.9463,
"step": 678
},
{
"epoch": 0.6912700432680071,
"grad_norm": 7.279063701629639,
"learning_rate": 2.2004857272169876e-05,
"loss": 1.9304,
"step": 679
},
{
"epoch": 0.6922881140239247,
"grad_norm": 9.676162719726562,
"learning_rate": 2.1871924607993797e-05,
"loss": 2.3767,
"step": 680
},
{
"epoch": 0.6933061847798422,
"grad_norm": 7.1779093742370605,
"learning_rate": 2.1739282186767923e-05,
"loss": 1.6381,
"step": 681
},
{
"epoch": 0.6943242555357597,
"grad_norm": 6.892930030822754,
"learning_rate": 2.160693137717912e-05,
"loss": 2.134,
"step": 682
},
{
"epoch": 0.6953423262916772,
"grad_norm": 9.403331756591797,
"learning_rate": 2.1474873544905205e-05,
"loss": 2.2294,
"step": 683
},
{
"epoch": 0.6963603970475948,
"grad_norm": 7.7654595375061035,
"learning_rate": 2.134311005260093e-05,
"loss": 2.0953,
"step": 684
},
{
"epoch": 0.6973784678035123,
"grad_norm": 10.087757110595703,
"learning_rate": 2.1211642259883867e-05,
"loss": 2.9221,
"step": 685
},
{
"epoch": 0.6983965385594298,
"grad_norm": 8.816588401794434,
"learning_rate": 2.108047152332028e-05,
"loss": 2.6949,
"step": 686
},
{
"epoch": 0.6994146093153474,
"grad_norm": 8.12427043914795,
"learning_rate": 2.0949599196411325e-05,
"loss": 1.7944,
"step": 687
},
{
"epoch": 0.7004326800712649,
"grad_norm": 7.3718461990356445,
"learning_rate": 2.0819026629578952e-05,
"loss": 2.1142,
"step": 688
},
{
"epoch": 0.7014507508271824,
"grad_norm": 7.3536577224731445,
"learning_rate": 2.0688755170151996e-05,
"loss": 2.0029,
"step": 689
},
{
"epoch": 0.7024688215831,
"grad_norm": 8.220134735107422,
"learning_rate": 2.0558786162352244e-05,
"loss": 2.2986,
"step": 690
},
{
"epoch": 0.7034868923390175,
"grad_norm": 9.169322967529297,
"learning_rate": 2.0429120947280678e-05,
"loss": 2.3455,
"step": 691
},
{
"epoch": 0.704504963094935,
"grad_norm": 8.935730934143066,
"learning_rate": 2.029976086290347e-05,
"loss": 2.1588,
"step": 692
},
{
"epoch": 0.7055230338508527,
"grad_norm": 7.555604934692383,
"learning_rate": 2.017070724403835e-05,
"loss": 2.2783,
"step": 693
},
{
"epoch": 0.7065411046067702,
"grad_norm": 7.896771430969238,
"learning_rate": 2.0041961422340676e-05,
"loss": 1.8964,
"step": 694
},
{
"epoch": 0.7075591753626878,
"grad_norm": 8.242528915405273,
"learning_rate": 1.9913524726289784e-05,
"loss": 1.9936,
"step": 695
},
{
"epoch": 0.7085772461186053,
"grad_norm": 7.946272373199463,
"learning_rate": 1.9785398481175294e-05,
"loss": 2.1526,
"step": 696
},
{
"epoch": 0.7095953168745228,
"grad_norm": 8.382307052612305,
"learning_rate": 1.965758400908334e-05,
"loss": 2.4691,
"step": 697
},
{
"epoch": 0.7106133876304404,
"grad_norm": 6.839285373687744,
"learning_rate": 1.9530082628883056e-05,
"loss": 1.7924,
"step": 698
},
{
"epoch": 0.7116314583863579,
"grad_norm": 12.65297794342041,
"learning_rate": 1.9402895656212833e-05,
"loss": 2.0093,
"step": 699
},
{
"epoch": 0.7126495291422754,
"grad_norm": 11.35102653503418,
"learning_rate": 1.927602440346687e-05,
"loss": 1.7963,
"step": 700
},
{
"epoch": 0.713667599898193,
"grad_norm": 7.479799747467041,
"learning_rate": 1.914947017978153e-05,
"loss": 3.4169,
"step": 701
},
{
"epoch": 0.7146856706541105,
"grad_norm": 9.703947067260742,
"learning_rate": 1.9023234291021873e-05,
"loss": 2.8178,
"step": 702
},
{
"epoch": 0.715703741410028,
"grad_norm": 10.218291282653809,
"learning_rate": 1.889731803976822e-05,
"loss": 2.841,
"step": 703
},
{
"epoch": 0.7167218121659455,
"grad_norm": 12.210125923156738,
"learning_rate": 1.8771722725302643e-05,
"loss": 3.9947,
"step": 704
},
{
"epoch": 0.7177398829218631,
"grad_norm": 9.851053237915039,
"learning_rate": 1.8646449643595565e-05,
"loss": 2.8836,
"step": 705
},
{
"epoch": 0.7187579536777806,
"grad_norm": 11.182621955871582,
"learning_rate": 1.8521500087292467e-05,
"loss": 3.2881,
"step": 706
},
{
"epoch": 0.7197760244336981,
"grad_norm": 16.472837448120117,
"learning_rate": 1.8396875345700497e-05,
"loss": 3.6782,
"step": 707
},
{
"epoch": 0.7207940951896157,
"grad_norm": 13.632477760314941,
"learning_rate": 1.8272576704775074e-05,
"loss": 3.5599,
"step": 708
},
{
"epoch": 0.7218121659455332,
"grad_norm": 8.531991958618164,
"learning_rate": 1.8148605447106797e-05,
"loss": 1.815,
"step": 709
},
{
"epoch": 0.7228302367014507,
"grad_norm": 6.116468906402588,
"learning_rate": 1.8024962851908107e-05,
"loss": 1.3279,
"step": 710
},
{
"epoch": 0.7238483074573683,
"grad_norm": 6.058359622955322,
"learning_rate": 1.7901650195000068e-05,
"loss": 1.1209,
"step": 711
},
{
"epoch": 0.7248663782132858,
"grad_norm": 7.301308631896973,
"learning_rate": 1.7778668748799242e-05,
"loss": 1.6941,
"step": 712
},
{
"epoch": 0.7258844489692033,
"grad_norm": 6.059625148773193,
"learning_rate": 1.76560197823046e-05,
"loss": 1.4134,
"step": 713
},
{
"epoch": 0.7269025197251209,
"grad_norm": 5.40415620803833,
"learning_rate": 1.753370456108433e-05,
"loss": 1.5117,
"step": 714
},
{
"epoch": 0.7279205904810384,
"grad_norm": 6.5403008460998535,
"learning_rate": 1.7411724347262824e-05,
"loss": 1.397,
"step": 715
},
{
"epoch": 0.7289386612369559,
"grad_norm": 8.339217185974121,
"learning_rate": 1.729008039950772e-05,
"loss": 1.5315,
"step": 716
},
{
"epoch": 0.7299567319928735,
"grad_norm": 5.882655620574951,
"learning_rate": 1.7168773973016776e-05,
"loss": 1.1574,
"step": 717
},
{
"epoch": 0.730974802748791,
"grad_norm": 6.183307647705078,
"learning_rate": 1.7047806319505076e-05,
"loss": 1.3367,
"step": 718
},
{
"epoch": 0.7319928735047085,
"grad_norm": 6.28183126449585,
"learning_rate": 1.692717868719195e-05,
"loss": 1.5637,
"step": 719
},
{
"epoch": 0.7330109442606261,
"grad_norm": 4.728903293609619,
"learning_rate": 1.680689232078827e-05,
"loss": 1.4179,
"step": 720
},
{
"epoch": 0.7340290150165436,
"grad_norm": 6.95587158203125,
"learning_rate": 1.668694846148343e-05,
"loss": 1.7837,
"step": 721
},
{
"epoch": 0.7350470857724611,
"grad_norm": 5.531774997711182,
"learning_rate": 1.6567348346932658e-05,
"loss": 1.2069,
"step": 722
},
{
"epoch": 0.7360651565283787,
"grad_norm": 5.498968601226807,
"learning_rate": 1.644809321124423e-05,
"loss": 1.1316,
"step": 723
},
{
"epoch": 0.7370832272842962,
"grad_norm": 7.1133809089660645,
"learning_rate": 1.6329184284966677e-05,
"loss": 2.0335,
"step": 724
},
{
"epoch": 0.7381012980402138,
"grad_norm": 6.765145301818848,
"learning_rate": 1.621062279507617e-05,
"loss": 2.0067,
"step": 725
},
{
"epoch": 0.7391193687961314,
"grad_norm": 7.21923828125,
"learning_rate": 1.609240996496378e-05,
"loss": 2.2922,
"step": 726
},
{
"epoch": 0.7401374395520489,
"grad_norm": 5.8889360427856445,
"learning_rate": 1.597454701442288e-05,
"loss": 1.6363,
"step": 727
},
{
"epoch": 0.7411555103079664,
"grad_norm": 8.041604042053223,
"learning_rate": 1.5857035159636623e-05,
"loss": 1.6933,
"step": 728
},
{
"epoch": 0.742173581063884,
"grad_norm": 7.711045742034912,
"learning_rate": 1.5739875613165283e-05,
"loss": 1.9258,
"step": 729
},
{
"epoch": 0.7431916518198015,
"grad_norm": 7.747977256774902,
"learning_rate": 1.5623069583933836e-05,
"loss": 2.5273,
"step": 730
},
{
"epoch": 0.744209722575719,
"grad_norm": 8.055684089660645,
"learning_rate": 1.550661827721941e-05,
"loss": 2.0398,
"step": 731
},
{
"epoch": 0.7452277933316366,
"grad_norm": 8.75759220123291,
"learning_rate": 1.5390522894638938e-05,
"loss": 2.5372,
"step": 732
},
{
"epoch": 0.7462458640875541,
"grad_norm": 6.629666805267334,
"learning_rate": 1.527478463413666e-05,
"loss": 1.8586,
"step": 733
},
{
"epoch": 0.7472639348434716,
"grad_norm": 7.634647369384766,
"learning_rate": 1.5159404689971795e-05,
"loss": 1.7609,
"step": 734
},
{
"epoch": 0.7482820055993892,
"grad_norm": 8.821757316589355,
"learning_rate": 1.5044384252706312e-05,
"loss": 2.5073,
"step": 735
},
{
"epoch": 0.7493000763553067,
"grad_norm": 7.940456867218018,
"learning_rate": 1.4929724509192488e-05,
"loss": 2.6403,
"step": 736
},
{
"epoch": 0.7503181471112242,
"grad_norm": 7.819153308868408,
"learning_rate": 1.4815426642560754e-05,
"loss": 2.3173,
"step": 737
},
{
"epoch": 0.7513362178671418,
"grad_norm": 7.586490154266357,
"learning_rate": 1.470149183220748e-05,
"loss": 2.0191,
"step": 738
},
{
"epoch": 0.7523542886230593,
"grad_norm": 6.6719584465026855,
"learning_rate": 1.4587921253782849e-05,
"loss": 1.6597,
"step": 739
},
{
"epoch": 0.7533723593789768,
"grad_norm": 8.974640846252441,
"learning_rate": 1.447471607917854e-05,
"loss": 1.9953,
"step": 740
},
{
"epoch": 0.7543904301348944,
"grad_norm": 7.379059314727783,
"learning_rate": 1.4361877476515889e-05,
"loss": 1.8422,
"step": 741
},
{
"epoch": 0.7554085008908119,
"grad_norm": 7.163296699523926,
"learning_rate": 1.4249406610133686e-05,
"loss": 1.8372,
"step": 742
},
{
"epoch": 0.7564265716467294,
"grad_norm": 10.26382064819336,
"learning_rate": 1.413730464057616e-05,
"loss": 2.2328,
"step": 743
},
{
"epoch": 0.757444642402647,
"grad_norm": 7.997495651245117,
"learning_rate": 1.4025572724581038e-05,
"loss": 2.083,
"step": 744
},
{
"epoch": 0.7584627131585645,
"grad_norm": 8.966462135314941,
"learning_rate": 1.3914212015067651e-05,
"loss": 2.0716,
"step": 745
},
{
"epoch": 0.759480783914482,
"grad_norm": 9.182121276855469,
"learning_rate": 1.3803223661124936e-05,
"loss": 2.406,
"step": 746
},
{
"epoch": 0.7604988546703996,
"grad_norm": 8.59941577911377,
"learning_rate": 1.3692608807999652e-05,
"loss": 2.1763,
"step": 747
},
{
"epoch": 0.7615169254263171,
"grad_norm": 11.990951538085938,
"learning_rate": 1.3582368597084566e-05,
"loss": 2.7291,
"step": 748
},
{
"epoch": 0.7625349961822346,
"grad_norm": 7.6986260414123535,
"learning_rate": 1.3472504165906613e-05,
"loss": 1.9128,
"step": 749
},
{
"epoch": 0.7635530669381522,
"grad_norm": 8.994124412536621,
"learning_rate": 1.3363016648115245e-05,
"loss": 1.7094,
"step": 750
},
{
"epoch": 0.7645711376940697,
"grad_norm": 8.2550630569458,
"learning_rate": 1.3253907173470648e-05,
"loss": 3.7822,
"step": 751
},
{
"epoch": 0.7655892084499872,
"grad_norm": 8.363167762756348,
"learning_rate": 1.3145176867832165e-05,
"loss": 3.1741,
"step": 752
},
{
"epoch": 0.7666072792059048,
"grad_norm": 9.87235164642334,
"learning_rate": 1.30368268531466e-05,
"loss": 3.5932,
"step": 753
},
{
"epoch": 0.7676253499618223,
"grad_norm": 7.78696346282959,
"learning_rate": 1.292885824743667e-05,
"loss": 2.4161,
"step": 754
},
{
"epoch": 0.7686434207177398,
"grad_norm": 10.35235595703125,
"learning_rate": 1.2821272164789544e-05,
"loss": 3.0389,
"step": 755
},
{
"epoch": 0.7696614914736574,
"grad_norm": 13.586828231811523,
"learning_rate": 1.2714069715345195e-05,
"loss": 3.6863,
"step": 756
},
{
"epoch": 0.770679562229575,
"grad_norm": 11.710920333862305,
"learning_rate": 1.2607252005285109e-05,
"loss": 4.0324,
"step": 757
},
{
"epoch": 0.7716976329854925,
"grad_norm": 12.497662544250488,
"learning_rate": 1.2500820136820734e-05,
"loss": 2.9699,
"step": 758
},
{
"epoch": 0.7727157037414101,
"grad_norm": 15.810281753540039,
"learning_rate": 1.2394775208182174e-05,
"loss": 3.1219,
"step": 759
},
{
"epoch": 0.7737337744973276,
"grad_norm": 10.045539855957031,
"learning_rate": 1.2289118313606896e-05,
"loss": 1.7459,
"step": 760
},
{
"epoch": 0.7747518452532451,
"grad_norm": 8.060089111328125,
"learning_rate": 1.2183850543328312e-05,
"loss": 1.4901,
"step": 761
},
{
"epoch": 0.7757699160091627,
"grad_norm": 5.897403240203857,
"learning_rate": 1.2078972983564684e-05,
"loss": 1.1615,
"step": 762
},
{
"epoch": 0.7767879867650802,
"grad_norm": 9.653463363647461,
"learning_rate": 1.1974486716507783e-05,
"loss": 2.1851,
"step": 763
},
{
"epoch": 0.7778060575209977,
"grad_norm": 8.81517505645752,
"learning_rate": 1.1870392820311821e-05,
"loss": 2.0521,
"step": 764
},
{
"epoch": 0.7788241282769153,
"grad_norm": 8.291658401489258,
"learning_rate": 1.1766692369082255e-05,
"loss": 2.0249,
"step": 765
},
{
"epoch": 0.7798421990328328,
"grad_norm": 8.102690696716309,
"learning_rate": 1.1663386432864727e-05,
"loss": 2.065,
"step": 766
},
{
"epoch": 0.7808602697887503,
"grad_norm": 4.781749248504639,
"learning_rate": 1.156047607763407e-05,
"loss": 1.2103,
"step": 767
},
{
"epoch": 0.7818783405446679,
"grad_norm": 6.986288547515869,
"learning_rate": 1.145796236528322e-05,
"loss": 1.5723,
"step": 768
},
{
"epoch": 0.7828964113005854,
"grad_norm": 6.896148681640625,
"learning_rate": 1.135584635361232e-05,
"loss": 1.7923,
"step": 769
},
{
"epoch": 0.7839144820565029,
"grad_norm": 6.917054176330566,
"learning_rate": 1.1254129096317806e-05,
"loss": 1.8785,
"step": 770
},
{
"epoch": 0.7849325528124205,
"grad_norm": 8.382417678833008,
"learning_rate": 1.115281164298153e-05,
"loss": 2.0591,
"step": 771
},
{
"epoch": 0.785950623568338,
"grad_norm": 6.412557601928711,
"learning_rate": 1.105189503905985e-05,
"loss": 1.3064,
"step": 772
},
{
"epoch": 0.7869686943242555,
"grad_norm": 8.5576753616333,
"learning_rate": 1.0951380325872979e-05,
"loss": 2.1621,
"step": 773
},
{
"epoch": 0.7879867650801731,
"grad_norm": 6.601977348327637,
"learning_rate": 1.0851268540594167e-05,
"loss": 1.9123,
"step": 774
},
{
"epoch": 0.7890048358360906,
"grad_norm": 9.037349700927734,
"learning_rate": 1.0751560716238967e-05,
"loss": 2.7441,
"step": 775
},
{
"epoch": 0.7900229065920081,
"grad_norm": 8.547028541564941,
"learning_rate": 1.0652257881654627e-05,
"loss": 2.4516,
"step": 776
},
{
"epoch": 0.7910409773479257,
"grad_norm": 6.397313117980957,
"learning_rate": 1.055336106150948e-05,
"loss": 1.7555,
"step": 777
},
{
"epoch": 0.7920590481038432,
"grad_norm": 8.946637153625488,
"learning_rate": 1.0454871276282335e-05,
"loss": 2.4554,
"step": 778
},
{
"epoch": 0.7930771188597607,
"grad_norm": 10.802556037902832,
"learning_rate": 1.0356789542251938e-05,
"loss": 3.2162,
"step": 779
},
{
"epoch": 0.7940951896156783,
"grad_norm": 6.947890281677246,
"learning_rate": 1.0259116871486557e-05,
"loss": 1.7408,
"step": 780
},
{
"epoch": 0.7951132603715958,
"grad_norm": 6.768093585968018,
"learning_rate": 1.0161854271833443e-05,
"loss": 1.8601,
"step": 781
},
{
"epoch": 0.7961313311275133,
"grad_norm": 8.398831367492676,
"learning_rate": 1.006500274690853e-05,
"loss": 2.2156,
"step": 782
},
{
"epoch": 0.7971494018834309,
"grad_norm": 7.810449123382568,
"learning_rate": 9.96856329608597e-06,
"loss": 2.1837,
"step": 783
},
{
"epoch": 0.7981674726393484,
"grad_norm": 8.77087116241455,
"learning_rate": 9.87253691448794e-06,
"loss": 2.6596,
"step": 784
},
{
"epoch": 0.7991855433952659,
"grad_norm": 11.717060089111328,
"learning_rate": 9.776924592974256e-06,
"loss": 3.5775,
"step": 785
},
{
"epoch": 0.8002036141511835,
"grad_norm": 7.535914897918701,
"learning_rate": 9.681727318132227e-06,
"loss": 2.2536,
"step": 786
},
{
"epoch": 0.801221684907101,
"grad_norm": 9.274803161621094,
"learning_rate": 9.586946072266478e-06,
"loss": 2.4405,
"step": 787
},
{
"epoch": 0.8022397556630185,
"grad_norm": 7.637203216552734,
"learning_rate": 9.492581833388736e-06,
"loss": 1.9418,
"step": 788
},
{
"epoch": 0.8032578264189361,
"grad_norm": 11.312345504760742,
"learning_rate": 9.398635575207854e-06,
"loss": 3.0226,
"step": 789
},
{
"epoch": 0.8042758971748537,
"grad_norm": 8.732460021972656,
"learning_rate": 9.305108267119645e-06,
"loss": 2.0292,
"step": 790
},
{
"epoch": 0.8052939679307712,
"grad_norm": 6.998504638671875,
"learning_rate": 9.212000874196953e-06,
"loss": 1.7993,
"step": 791
},
{
"epoch": 0.8063120386866888,
"grad_norm": 9.693340301513672,
"learning_rate": 9.119314357179687e-06,
"loss": 2.4902,
"step": 792
},
{
"epoch": 0.8073301094426063,
"grad_norm": 8.360791206359863,
"learning_rate": 9.027049672464916e-06,
"loss": 2.0688,
"step": 793
},
{
"epoch": 0.8083481801985238,
"grad_norm": 7.457218647003174,
"learning_rate": 8.935207772096904e-06,
"loss": 2.1543,
"step": 794
},
{
"epoch": 0.8093662509544414,
"grad_norm": 8.154823303222656,
"learning_rate": 8.843789603757446e-06,
"loss": 2.5219,
"step": 795
},
{
"epoch": 0.8103843217103589,
"grad_norm": 8.503774642944336,
"learning_rate": 8.752796110755984e-06,
"loss": 1.7771,
"step": 796
},
{
"epoch": 0.8114023924662764,
"grad_norm": 9.03532600402832,
"learning_rate": 8.662228232019876e-06,
"loss": 2.1927,
"step": 797
},
{
"epoch": 0.812420463222194,
"grad_norm": 7.620565891265869,
"learning_rate": 8.572086902084731e-06,
"loss": 1.5837,
"step": 798
},
{
"epoch": 0.8134385339781115,
"grad_norm": 8.183737754821777,
"learning_rate": 8.48237305108479e-06,
"loss": 2.0261,
"step": 799
},
{
"epoch": 0.814456604734029,
"grad_norm": 7.848052978515625,
"learning_rate": 8.393087604743283e-06,
"loss": 1.7399,
"step": 800
},
{
"epoch": 0.8154746754899466,
"grad_norm": 6.207403182983398,
"learning_rate": 8.304231484362868e-06,
"loss": 2.4818,
"step": 801
},
{
"epoch": 0.8164927462458641,
"grad_norm": 7.363234043121338,
"learning_rate": 8.215805606816191e-06,
"loss": 2.7651,
"step": 802
},
{
"epoch": 0.8175108170017816,
"grad_norm": 8.982840538024902,
"learning_rate": 8.127810884536403e-06,
"loss": 3.0555,
"step": 803
},
{
"epoch": 0.8185288877576992,
"grad_norm": 8.631933212280273,
"learning_rate": 8.040248225507641e-06,
"loss": 2.9924,
"step": 804
},
{
"epoch": 0.8195469585136167,
"grad_norm": 11.01415729522705,
"learning_rate": 7.95311853325582e-06,
"loss": 3.5641,
"step": 805
},
{
"epoch": 0.8205650292695342,
"grad_norm": 11.288020133972168,
"learning_rate": 7.866422706839238e-06,
"loss": 3.6431,
"step": 806
},
{
"epoch": 0.8215831000254518,
"grad_norm": 18.66059112548828,
"learning_rate": 7.780161640839257e-06,
"loss": 3.8684,
"step": 807
},
{
"epoch": 0.8226011707813693,
"grad_norm": 10.357224464416504,
"learning_rate": 7.694336225351107e-06,
"loss": 2.31,
"step": 808
},
{
"epoch": 0.8236192415372868,
"grad_norm": 10.739027976989746,
"learning_rate": 7.60894734597476e-06,
"loss": 2.3002,
"step": 809
},
{
"epoch": 0.8246373122932044,
"grad_norm": 4.031766414642334,
"learning_rate": 7.523995883805679e-06,
"loss": 0.8728,
"step": 810
},
{
"epoch": 0.8256553830491219,
"grad_norm": 5.490253448486328,
"learning_rate": 7.439482715425805e-06,
"loss": 1.0823,
"step": 811
},
{
"epoch": 0.8266734538050394,
"grad_norm": 6.157886981964111,
"learning_rate": 7.355408712894507e-06,
"loss": 1.4668,
"step": 812
},
{
"epoch": 0.827691524560957,
"grad_norm": 4.612349510192871,
"learning_rate": 7.271774743739545e-06,
"loss": 0.9666,
"step": 813
},
{
"epoch": 0.8287095953168745,
"grad_norm": 4.856873989105225,
"learning_rate": 7.188581670948169e-06,
"loss": 1.2653,
"step": 814
},
{
"epoch": 0.829727666072792,
"grad_norm": 7.248535633087158,
"learning_rate": 7.105830352958142e-06,
"loss": 1.8515,
"step": 815
},
{
"epoch": 0.8307457368287096,
"grad_norm": 7.241623878479004,
"learning_rate": 7.0235216436489835e-06,
"loss": 1.4111,
"step": 816
},
{
"epoch": 0.8317638075846271,
"grad_norm": 5.958316802978516,
"learning_rate": 6.941656392333046e-06,
"loss": 1.4647,
"step": 817
},
{
"epoch": 0.8327818783405446,
"grad_norm": 7.3134684562683105,
"learning_rate": 6.860235443746859e-06,
"loss": 1.4169,
"step": 818
},
{
"epoch": 0.8337999490964622,
"grad_norm": 6.82220458984375,
"learning_rate": 6.779259638042318e-06,
"loss": 1.6871,
"step": 819
},
{
"epoch": 0.8348180198523797,
"grad_norm": 8.294244766235352,
"learning_rate": 6.698729810778065e-06,
"loss": 2.0008,
"step": 820
},
{
"epoch": 0.8358360906082972,
"grad_norm": 7.062936305999756,
"learning_rate": 6.618646792910893e-06,
"loss": 1.5006,
"step": 821
},
{
"epoch": 0.8368541613642149,
"grad_norm": 8.914663314819336,
"learning_rate": 6.539011410787105e-06,
"loss": 2.8718,
"step": 822
},
{
"epoch": 0.8378722321201324,
"grad_norm": 6.904050827026367,
"learning_rate": 6.459824486134014e-06,
"loss": 1.6913,
"step": 823
},
{
"epoch": 0.8388903028760499,
"grad_norm": 9.867286682128906,
"learning_rate": 6.381086836051498e-06,
"loss": 2.3808,
"step": 824
},
{
"epoch": 0.8399083736319675,
"grad_norm": 7.717339992523193,
"learning_rate": 6.302799273003546e-06,
"loss": 2.3309,
"step": 825
},
{
"epoch": 0.840926444387885,
"grad_norm": 7.964992046356201,
"learning_rate": 6.224962604809819e-06,
"loss": 1.8556,
"step": 826
},
{
"epoch": 0.8419445151438025,
"grad_norm": 8.438617706298828,
"learning_rate": 6.147577634637414e-06,
"loss": 2.8175,
"step": 827
},
{
"epoch": 0.8429625858997201,
"grad_norm": 6.18842077255249,
"learning_rate": 6.070645160992522e-06,
"loss": 1.8594,
"step": 828
},
{
"epoch": 0.8439806566556376,
"grad_norm": 9.132290840148926,
"learning_rate": 5.994165977712174e-06,
"loss": 2.4016,
"step": 829
},
{
"epoch": 0.8449987274115551,
"grad_norm": 8.243701934814453,
"learning_rate": 5.918140873956063e-06,
"loss": 2.8481,
"step": 830
},
{
"epoch": 0.8460167981674727,
"grad_norm": 7.9838457107543945,
"learning_rate": 5.842570634198452e-06,
"loss": 2.2464,
"step": 831
},
{
"epoch": 0.8470348689233902,
"grad_norm": 9.669371604919434,
"learning_rate": 5.767456038219987e-06,
"loss": 3.0041,
"step": 832
},
{
"epoch": 0.8480529396793077,
"grad_norm": 6.894510746002197,
"learning_rate": 5.692797861099719e-06,
"loss": 1.9256,
"step": 833
},
{
"epoch": 0.8490710104352253,
"grad_norm": 9.749018669128418,
"learning_rate": 5.6185968732070825e-06,
"loss": 2.0606,
"step": 834
},
{
"epoch": 0.8500890811911428,
"grad_norm": 10.20246410369873,
"learning_rate": 5.544853840193981e-06,
"loss": 2.5006,
"step": 835
},
{
"epoch": 0.8511071519470603,
"grad_norm": 7.829645156860352,
"learning_rate": 5.471569522986774e-06,
"loss": 2.2776,
"step": 836
},
{
"epoch": 0.8521252227029779,
"grad_norm": 8.704474449157715,
"learning_rate": 5.398744677778594e-06,
"loss": 2.155,
"step": 837
},
{
"epoch": 0.8531432934588954,
"grad_norm": 9.067277908325195,
"learning_rate": 5.326380056021418e-06,
"loss": 1.9537,
"step": 838
},
{
"epoch": 0.8541613642148129,
"grad_norm": 7.175379276275635,
"learning_rate": 5.25447640441834e-06,
"loss": 1.8142,
"step": 839
},
{
"epoch": 0.8551794349707305,
"grad_norm": 9.313763618469238,
"learning_rate": 5.183034464915898e-06,
"loss": 2.6184,
"step": 840
},
{
"epoch": 0.856197505726648,
"grad_norm": 8.87330150604248,
"learning_rate": 5.112054974696395e-06,
"loss": 2.2619,
"step": 841
},
{
"epoch": 0.8572155764825655,
"grad_norm": 7.452190399169922,
"learning_rate": 5.041538666170281e-06,
"loss": 2.0532,
"step": 842
},
{
"epoch": 0.858233647238483,
"grad_norm": 8.609152793884277,
"learning_rate": 4.9714862669686335e-06,
"loss": 2.3708,
"step": 843
},
{
"epoch": 0.8592517179944006,
"grad_norm": 9.013458251953125,
"learning_rate": 4.901898499935609e-06,
"loss": 2.2493,
"step": 844
},
{
"epoch": 0.8602697887503181,
"grad_norm": 6.508463382720947,
"learning_rate": 4.832776083120982e-06,
"loss": 1.6531,
"step": 845
},
{
"epoch": 0.8612878595062357,
"grad_norm": 8.564003944396973,
"learning_rate": 4.764119729772809e-06,
"loss": 2.2661,
"step": 846
},
{
"epoch": 0.8623059302621532,
"grad_norm": 10.198071479797363,
"learning_rate": 4.695930148329958e-06,
"loss": 2.5887,
"step": 847
},
{
"epoch": 0.8633240010180707,
"grad_norm": 7.703155994415283,
"learning_rate": 4.628208042414889e-06,
"loss": 2.1529,
"step": 848
},
{
"epoch": 0.8643420717739883,
"grad_norm": 9.272164344787598,
"learning_rate": 4.560954110826337e-06,
"loss": 2.1878,
"step": 849
},
{
"epoch": 0.8653601425299058,
"grad_norm": 9.344987869262695,
"learning_rate": 4.494169047532154e-06,
"loss": 1.8306,
"step": 850
},
{
"epoch": 0.8663782132858233,
"grad_norm": 7.905543327331543,
"learning_rate": 4.427853541662091e-06,
"loss": 3.7738,
"step": 851
},
{
"epoch": 0.8673962840417409,
"grad_norm": 11.58995532989502,
"learning_rate": 4.362008277500701e-06,
"loss": 3.8687,
"step": 852
},
{
"epoch": 0.8684143547976584,
"grad_norm": 10.805427551269531,
"learning_rate": 4.296633934480337e-06,
"loss": 4.1339,
"step": 853
},
{
"epoch": 0.869432425553576,
"grad_norm": 9.845876693725586,
"learning_rate": 4.231731187174065e-06,
"loss": 3.6647,
"step": 854
},
{
"epoch": 0.8704504963094936,
"grad_norm": 11.320969581604004,
"learning_rate": 4.167300705288718e-06,
"loss": 3.8381,
"step": 855
},
{
"epoch": 0.8714685670654111,
"grad_norm": 12.798758506774902,
"learning_rate": 4.10334315365804e-06,
"loss": 3.4318,
"step": 856
},
{
"epoch": 0.8724866378213286,
"grad_norm": 11.61473274230957,
"learning_rate": 4.039859192235779e-06,
"loss": 2.8296,
"step": 857
},
{
"epoch": 0.8735047085772462,
"grad_norm": 14.96419620513916,
"learning_rate": 3.976849476088845e-06,
"loss": 3.5524,
"step": 858
},
{
"epoch": 0.8745227793331637,
"grad_norm": 7.406204700469971,
"learning_rate": 3.914314655390633e-06,
"loss": 1.9455,
"step": 859
},
{
"epoch": 0.8755408500890812,
"grad_norm": 6.518743991851807,
"learning_rate": 3.852255375414271e-06,
"loss": 1.2701,
"step": 860
},
{
"epoch": 0.8765589208449988,
"grad_norm": 6.446305751800537,
"learning_rate": 3.790672276525936e-06,
"loss": 1.4215,
"step": 861
},
{
"epoch": 0.8775769916009163,
"grad_norm": 8.741048812866211,
"learning_rate": 3.7295659941782855e-06,
"loss": 1.8906,
"step": 862
},
{
"epoch": 0.8785950623568338,
"grad_norm": 8.466268539428711,
"learning_rate": 3.668937158903901e-06,
"loss": 1.8339,
"step": 863
},
{
"epoch": 0.8796131331127514,
"grad_norm": 9.552635192871094,
"learning_rate": 3.6087863963087497e-06,
"loss": 2.2068,
"step": 864
},
{
"epoch": 0.8806312038686689,
"grad_norm": 6.880453586578369,
"learning_rate": 3.5491143270657446e-06,
"loss": 1.4963,
"step": 865
},
{
"epoch": 0.8816492746245864,
"grad_norm": 6.255483627319336,
"learning_rate": 3.4899215669083716e-06,
"loss": 1.3132,
"step": 866
},
{
"epoch": 0.882667345380504,
"grad_norm": 6.0426025390625,
"learning_rate": 3.4312087266242963e-06,
"loss": 1.5957,
"step": 867
},
{
"epoch": 0.8836854161364215,
"grad_norm": 6.473054885864258,
"learning_rate": 3.3729764120490446e-06,
"loss": 1.7223,
"step": 868
},
{
"epoch": 0.884703486892339,
"grad_norm": 6.081118106842041,
"learning_rate": 3.315225224059809e-06,
"loss": 1.4486,
"step": 869
},
{
"epoch": 0.8857215576482566,
"grad_norm": 8.436514854431152,
"learning_rate": 3.25795575856922e-06,
"loss": 2.5112,
"step": 870
},
{
"epoch": 0.8867396284041741,
"grad_norm": 7.625750541687012,
"learning_rate": 3.2011686065191895e-06,
"loss": 1.7229,
"step": 871
},
{
"epoch": 0.8877576991600916,
"grad_norm": 8.813794136047363,
"learning_rate": 3.1448643538748045e-06,
"loss": 2.4128,
"step": 872
},
{
"epoch": 0.8887757699160092,
"grad_norm": 5.511772155761719,
"learning_rate": 3.0890435816183226e-06,
"loss": 1.3973,
"step": 873
},
{
"epoch": 0.8897938406719267,
"grad_norm": 8.631103515625,
"learning_rate": 3.03370686574313e-06,
"loss": 2.015,
"step": 874
},
{
"epoch": 0.8908119114278442,
"grad_norm": 7.364684104919434,
"learning_rate": 2.9788547772478416e-06,
"loss": 2.2994,
"step": 875
},
{
"epoch": 0.8918299821837617,
"grad_norm": 6.241342067718506,
"learning_rate": 2.924487882130356e-06,
"loss": 1.8339,
"step": 876
},
{
"epoch": 0.8928480529396793,
"grad_norm": 7.564022064208984,
"learning_rate": 2.870606741382059e-06,
"loss": 1.7091,
"step": 877
},
{
"epoch": 0.8938661236955968,
"grad_norm": 7.5156378746032715,
"learning_rate": 2.817211910982037e-06,
"loss": 2.1743,
"step": 878
},
{
"epoch": 0.8948841944515143,
"grad_norm": 7.265412330627441,
"learning_rate": 2.7643039418913e-06,
"loss": 2.0643,
"step": 879
},
{
"epoch": 0.8959022652074319,
"grad_norm": 7.189180850982666,
"learning_rate": 2.711883380047131e-06,
"loss": 2.1941,
"step": 880
},
{
"epoch": 0.8969203359633494,
"grad_norm": 7.311581134796143,
"learning_rate": 2.6599507663574384e-06,
"loss": 2.1736,
"step": 881
},
{
"epoch": 0.897938406719267,
"grad_norm": 10.869359016418457,
"learning_rate": 2.6085066366951905e-06,
"loss": 3.2378,
"step": 882
},
{
"epoch": 0.8989564774751845,
"grad_norm": 7.649423122406006,
"learning_rate": 2.5575515218928592e-06,
"loss": 2.1515,
"step": 883
},
{
"epoch": 0.899974548231102,
"grad_norm": 7.814465045928955,
"learning_rate": 2.5070859477369645e-06,
"loss": 2.4541,
"step": 884
},
{
"epoch": 0.9009926189870195,
"grad_norm": 7.679671764373779,
"learning_rate": 2.457110434962645e-06,
"loss": 2.6637,
"step": 885
},
{
"epoch": 0.9020106897429372,
"grad_norm": 8.16401481628418,
"learning_rate": 2.407625499248273e-06,
"loss": 2.547,
"step": 886
},
{
"epoch": 0.9030287604988547,
"grad_norm": 9.379379272460938,
"learning_rate": 2.3586316512101416e-06,
"loss": 2.5642,
"step": 887
},
{
"epoch": 0.9040468312547723,
"grad_norm": 7.037562370300293,
"learning_rate": 2.3101293963972094e-06,
"loss": 1.9438,
"step": 888
},
{
"epoch": 0.9050649020106898,
"grad_norm": 7.9886674880981445,
"learning_rate": 2.26211923528587e-06,
"loss": 2.2604,
"step": 889
},
{
"epoch": 0.9060829727666073,
"grad_norm": 7.768427848815918,
"learning_rate": 2.2146016632747624e-06,
"loss": 1.8138,
"step": 890
},
{
"epoch": 0.9071010435225249,
"grad_norm": 7.540080547332764,
"learning_rate": 2.1675771706797132e-06,
"loss": 1.8673,
"step": 891
},
{
"epoch": 0.9081191142784424,
"grad_norm": 8.09107780456543,
"learning_rate": 2.1210462427286524e-06,
"loss": 1.971,
"step": 892
},
{
"epoch": 0.9091371850343599,
"grad_norm": 6.946146011352539,
"learning_rate": 2.0750093595565733e-06,
"loss": 1.7914,
"step": 893
},
{
"epoch": 0.9101552557902775,
"grad_norm": 10.480133056640625,
"learning_rate": 2.0294669962006354e-06,
"loss": 2.4089,
"step": 894
},
{
"epoch": 0.911173326546195,
"grad_norm": 7.2262797355651855,
"learning_rate": 1.984419622595224e-06,
"loss": 1.8403,
"step": 895
},
{
"epoch": 0.9121913973021125,
"grad_norm": 8.057347297668457,
"learning_rate": 1.939867703567122e-06,
"loss": 1.7492,
"step": 896
},
{
"epoch": 0.91320946805803,
"grad_norm": 8.859241485595703,
"learning_rate": 1.895811698830685e-06,
"loss": 2.3681,
"step": 897
},
{
"epoch": 0.9142275388139476,
"grad_norm": 7.5691328048706055,
"learning_rate": 1.8522520629831397e-06,
"loss": 2.1921,
"step": 898
},
{
"epoch": 0.9152456095698651,
"grad_norm": 9.58875846862793,
"learning_rate": 1.8091892454998594e-06,
"loss": 2.4974,
"step": 899
},
{
"epoch": 0.9162636803257826,
"grad_norm": 7.032078266143799,
"learning_rate": 1.7666236907297406e-06,
"loss": 1.3266,
"step": 900
},
{
"epoch": 0.9172817510817002,
"grad_norm": 7.034844875335693,
"learning_rate": 1.7245558378906013e-06,
"loss": 2.617,
"step": 901
},
{
"epoch": 0.9182998218376177,
"grad_norm": 9.052960395812988,
"learning_rate": 1.6829861210646891e-06,
"loss": 3.9865,
"step": 902
},
{
"epoch": 0.9193178925935352,
"grad_norm": 11.31360912322998,
"learning_rate": 1.641914969194147e-06,
"loss": 3.9044,
"step": 903
},
{
"epoch": 0.9203359633494528,
"grad_norm": 11.041589736938477,
"learning_rate": 1.6013428060766168e-06,
"loss": 4.3406,
"step": 904
},
{
"epoch": 0.9213540341053703,
"grad_norm": 10.366722106933594,
"learning_rate": 1.5612700503608968e-06,
"loss": 2.948,
"step": 905
},
{
"epoch": 0.9223721048612878,
"grad_norm": 13.091897964477539,
"learning_rate": 1.5216971155425475e-06,
"loss": 4.3409,
"step": 906
},
{
"epoch": 0.9233901756172054,
"grad_norm": 10.320602416992188,
"learning_rate": 1.4826244099596986e-06,
"loss": 2.7963,
"step": 907
},
{
"epoch": 0.9244082463731229,
"grad_norm": 9.795669555664062,
"learning_rate": 1.4440523367887871e-06,
"loss": 2.7568,
"step": 908
},
{
"epoch": 0.9254263171290404,
"grad_norm": 5.081234931945801,
"learning_rate": 1.4059812940404093e-06,
"loss": 1.1005,
"step": 909
},
{
"epoch": 0.926444387884958,
"grad_norm": 5.965490818023682,
"learning_rate": 1.3684116745552423e-06,
"loss": 1.6534,
"step": 910
},
{
"epoch": 0.9274624586408755,
"grad_norm": 8.321152687072754,
"learning_rate": 1.33134386599994e-06,
"loss": 1.6702,
"step": 911
},
{
"epoch": 0.928480529396793,
"grad_norm": 6.844864845275879,
"learning_rate": 1.2947782508631822e-06,
"loss": 1.5389,
"step": 912
},
{
"epoch": 0.9294986001527106,
"grad_norm": 7.344895362854004,
"learning_rate": 1.2587152064516827e-06,
"loss": 1.6019,
"step": 913
},
{
"epoch": 0.9305166709086281,
"grad_norm": 6.270571708679199,
"learning_rate": 1.223155104886342e-06,
"loss": 1.322,
"step": 914
},
{
"epoch": 0.9315347416645456,
"grad_norm": 7.117447853088379,
"learning_rate": 1.1880983130983626e-06,
"loss": 1.6418,
"step": 915
},
{
"epoch": 0.9325528124204632,
"grad_norm": 8.100106239318848,
"learning_rate": 1.1535451928254947e-06,
"loss": 2.1211,
"step": 916
},
{
"epoch": 0.9335708831763807,
"grad_norm": 9.206981658935547,
"learning_rate": 1.1194961006082972e-06,
"loss": 2.2021,
"step": 917
},
{
"epoch": 0.9345889539322983,
"grad_norm": 5.570037364959717,
"learning_rate": 1.085951387786438e-06,
"loss": 1.5467,
"step": 918
},
{
"epoch": 0.9356070246882159,
"grad_norm": 8.013311386108398,
"learning_rate": 1.0529114004951047e-06,
"loss": 1.9642,
"step": 919
},
{
"epoch": 0.9366250954441334,
"grad_norm": 6.999322891235352,
"learning_rate": 1.0203764796614058e-06,
"loss": 1.772,
"step": 920
},
{
"epoch": 0.937643166200051,
"grad_norm": 7.812559604644775,
"learning_rate": 9.883469610008577e-07,
"loss": 1.9127,
"step": 921
},
{
"epoch": 0.9386612369559685,
"grad_norm": 9.275822639465332,
"learning_rate": 9.568231750139212e-07,
"loss": 2.3705,
"step": 922
},
{
"epoch": 0.939679307711886,
"grad_norm": 7.6475372314453125,
"learning_rate": 9.258054469825972e-07,
"loss": 2.306,
"step": 923
},
{
"epoch": 0.9406973784678035,
"grad_norm": 7.288696765899658,
"learning_rate": 8.952940969670809e-07,
"loss": 1.6429,
"step": 924
},
{
"epoch": 0.9417154492237211,
"grad_norm": 7.795031547546387,
"learning_rate": 8.652894398024136e-07,
"loss": 1.9986,
"step": 925
},
{
"epoch": 0.9427335199796386,
"grad_norm": 7.860483169555664,
"learning_rate": 8.357917850952802e-07,
"loss": 2.1139,
"step": 926
},
{
"epoch": 0.9437515907355561,
"grad_norm": 7.814316749572754,
"learning_rate": 8.06801437220811e-07,
"loss": 2.1013,
"step": 927
},
{
"epoch": 0.9447696614914737,
"grad_norm": 8.413445472717285,
"learning_rate": 7.783186953194189e-07,
"loss": 2.7227,
"step": 928
},
{
"epoch": 0.9457877322473912,
"grad_norm": 7.9406328201293945,
"learning_rate": 7.503438532937168e-07,
"loss": 2.1969,
"step": 929
},
{
"epoch": 0.9468058030033087,
"grad_norm": 8.008191108703613,
"learning_rate": 7.228771998054995e-07,
"loss": 2.2026,
"step": 930
},
{
"epoch": 0.9478238737592263,
"grad_norm": 9.503705978393555,
"learning_rate": 6.959190182727615e-07,
"loss": 2.6732,
"step": 931
},
{
"epoch": 0.9488419445151438,
"grad_norm": 9.472963333129883,
"learning_rate": 6.694695868667556e-07,
"loss": 3.0515,
"step": 932
},
{
"epoch": 0.9498600152710613,
"grad_norm": 7.070324420928955,
"learning_rate": 6.43529178509139e-07,
"loss": 1.9813,
"step": 933
},
{
"epoch": 0.9508780860269789,
"grad_norm": 9.686485290527344,
"learning_rate": 6.180980608691655e-07,
"loss": 2.8315,
"step": 934
},
{
"epoch": 0.9518961567828964,
"grad_norm": 8.254791259765625,
"learning_rate": 5.931764963608866e-07,
"loss": 2.2097,
"step": 935
},
{
"epoch": 0.9529142275388139,
"grad_norm": 8.103293418884277,
"learning_rate": 5.687647421404874e-07,
"loss": 2.4556,
"step": 936
},
{
"epoch": 0.9539322982947315,
"grad_norm": 7.946092128753662,
"learning_rate": 5.448630501036112e-07,
"loss": 1.7899,
"step": 937
},
{
"epoch": 0.954950369050649,
"grad_norm": 7.618176460266113,
"learning_rate": 5.214716668827557e-07,
"loss": 2.3681,
"step": 938
},
{
"epoch": 0.9559684398065665,
"grad_norm": 5.380380153656006,
"learning_rate": 4.985908338447476e-07,
"loss": 1.7725,
"step": 939
},
{
"epoch": 0.9569865105624841,
"grad_norm": 7.778873920440674,
"learning_rate": 4.762207870882218e-07,
"loss": 1.8244,
"step": 940
},
{
"epoch": 0.9580045813184016,
"grad_norm": 9.022889137268066,
"learning_rate": 4.543617574412184e-07,
"loss": 2.5223,
"step": 941
},
{
"epoch": 0.9590226520743191,
"grad_norm": 6.915138244628906,
"learning_rate": 4.3301397045877876e-07,
"loss": 1.5739,
"step": 942
},
{
"epoch": 0.9600407228302367,
"grad_norm": 7.640623092651367,
"learning_rate": 4.121776464206251e-07,
"loss": 2.0568,
"step": 943
},
{
"epoch": 0.9610587935861542,
"grad_norm": 7.426476955413818,
"learning_rate": 3.9185300032889006e-07,
"loss": 1.8825,
"step": 944
},
{
"epoch": 0.9620768643420717,
"grad_norm": 8.864518165588379,
"learning_rate": 3.720402419058966e-07,
"loss": 2.373,
"step": 945
},
{
"epoch": 0.9630949350979893,
"grad_norm": 7.7088541984558105,
"learning_rate": 3.5273957559199266e-07,
"loss": 1.6487,
"step": 946
},
{
"epoch": 0.9641130058539068,
"grad_norm": 7.193671703338623,
"learning_rate": 3.339512005434309e-07,
"loss": 1.8298,
"step": 947
},
{
"epoch": 0.9651310766098243,
"grad_norm": 7.4829230308532715,
"learning_rate": 3.1567531063033673e-07,
"loss": 1.1332,
"step": 948
},
{
"epoch": 0.9661491473657419,
"grad_norm": 27.006391525268555,
"learning_rate": 2.979120944346936e-07,
"loss": 2.0494,
"step": 949
},
{
"epoch": 0.9671672181216594,
"grad_norm": 11.99468994140625,
"learning_rate": 2.806617352483998e-07,
"loss": 2.4674,
"step": 950
},
{
"epoch": 0.968185288877577,
"grad_norm": 7.580793380737305,
"learning_rate": 2.639244110713701e-07,
"loss": 3.3584,
"step": 951
},
{
"epoch": 0.9692033596334946,
"grad_norm": 9.658464431762695,
"learning_rate": 2.4770029460970954e-07,
"loss": 3.5755,
"step": 952
},
{
"epoch": 0.9702214303894121,
"grad_norm": 9.27920150756836,
"learning_rate": 2.319895532739369e-07,
"loss": 2.8816,
"step": 953
},
{
"epoch": 0.9712395011453296,
"grad_norm": 11.56139850616455,
"learning_rate": 2.1679234917721946e-07,
"loss": 3.1379,
"step": 954
},
{
"epoch": 0.9722575719012472,
"grad_norm": 13.082681655883789,
"learning_rate": 2.0210883913376334e-07,
"loss": 3.5519,
"step": 955
},
{
"epoch": 0.9732756426571647,
"grad_norm": 10.118450164794922,
"learning_rate": 1.8793917465713684e-07,
"loss": 2.1806,
"step": 956
},
{
"epoch": 0.9742937134130822,
"grad_norm": 7.3693928718566895,
"learning_rate": 1.742835019587441e-07,
"loss": 1.6516,
"step": 957
},
{
"epoch": 0.9753117841689998,
"grad_norm": 7.52052640914917,
"learning_rate": 1.6114196194628172e-07,
"loss": 1.8945,
"step": 958
},
{
"epoch": 0.9763298549249173,
"grad_norm": 6.575441837310791,
"learning_rate": 1.4851469022234e-07,
"loss": 1.5346,
"step": 959
},
{
"epoch": 0.9773479256808348,
"grad_norm": 5.139125823974609,
"learning_rate": 1.3640181708293731e-07,
"loss": 1.2349,
"step": 960
},
{
"epoch": 0.9783659964367524,
"grad_norm": 5.116672992706299,
"learning_rate": 1.2480346751622686e-07,
"loss": 1.3371,
"step": 961
},
{
"epoch": 0.9793840671926699,
"grad_norm": 6.816611289978027,
"learning_rate": 1.1371976120118088e-07,
"loss": 1.8166,
"step": 962
},
{
"epoch": 0.9804021379485874,
"grad_norm": 5.658526420593262,
"learning_rate": 1.0315081250636405e-07,
"loss": 1.435,
"step": 963
},
{
"epoch": 0.981420208704505,
"grad_norm": 5.805619239807129,
"learning_rate": 9.309673048875089e-08,
"loss": 1.473,
"step": 964
},
{
"epoch": 0.9824382794604225,
"grad_norm": 8.438018798828125,
"learning_rate": 8.355761889260461e-08,
"loss": 1.9403,
"step": 965
},
{
"epoch": 0.98345635021634,
"grad_norm": 6.625545501708984,
"learning_rate": 7.453357614841117e-08,
"loss": 1.7268,
"step": 966
},
{
"epoch": 0.9844744209722576,
"grad_norm": 7.497494220733643,
"learning_rate": 6.602469537183021e-08,
"loss": 2.0333,
"step": 967
},
{
"epoch": 0.9854924917281751,
"grad_norm": 7.167290210723877,
"learning_rate": 5.8031064362795705e-08,
"loss": 1.9103,
"step": 968
},
{
"epoch": 0.9865105624840926,
"grad_norm": 8.998686790466309,
"learning_rate": 5.0552765604544584e-08,
"loss": 2.8485,
"step": 969
},
{
"epoch": 0.9875286332400102,
"grad_norm": 7.887483596801758,
"learning_rate": 4.358987626281175e-08,
"loss": 2.5035,
"step": 970
},
{
"epoch": 0.9885467039959277,
"grad_norm": 8.251432418823242,
"learning_rate": 3.7142468185014104e-08,
"loss": 2.1646,
"step": 971
},
{
"epoch": 0.9895647747518452,
"grad_norm": 8.031188011169434,
"learning_rate": 3.121060789951225e-08,
"loss": 2.3445,
"step": 972
},
{
"epoch": 0.9905828455077628,
"grad_norm": 10.717124938964844,
"learning_rate": 2.5794356614922134e-08,
"loss": 2.4195,
"step": 973
},
{
"epoch": 0.9916009162636803,
"grad_norm": 7.7094950675964355,
"learning_rate": 2.0893770219493346e-08,
"loss": 2.2724,
"step": 974
},
{
"epoch": 0.9926189870195978,
"grad_norm": 9.21927547454834,
"learning_rate": 1.6508899280515134e-08,
"loss": 2.9697,
"step": 975
},
{
"epoch": 0.9936370577755154,
"grad_norm": 7.8263092041015625,
"learning_rate": 1.2639789043805694e-08,
"loss": 1.9429,
"step": 976
},
{
"epoch": 0.9946551285314329,
"grad_norm": 8.359663009643555,
"learning_rate": 9.286479433257e-09,
"loss": 2.078,
"step": 977
},
{
"epoch": 0.9956731992873504,
"grad_norm": 9.0786714553833,
"learning_rate": 6.449005050390699e-09,
"loss": 2.2142,
"step": 978
},
{
"epoch": 0.996691270043268,
"grad_norm": 8.358176231384277,
"learning_rate": 4.127395174036153e-09,
"loss": 1.7529,
"step": 979
},
{
"epoch": 0.9977093407991855,
"grad_norm": 8.492805480957031,
"learning_rate": 2.321673760002918e-09,
"loss": 2.3433,
"step": 980
},
{
"epoch": 0.998727411555103,
"grad_norm": 10.61440372467041,
"learning_rate": 1.0318594408476045e-09,
"loss": 2.466,
"step": 981
},
{
"epoch": 0.9997454823110206,
"grad_norm": 9.659395217895508,
"learning_rate": 2.57965525674031e-10,
"loss": 1.3249,
"step": 982
},
{
"epoch": 0.9997454823110206,
"eval_loss": 0.5453814268112183,
"eval_runtime": 50.2489,
"eval_samples_per_second": 16.478,
"eval_steps_per_second": 4.119,
"step": 982
},
{
"epoch": 1.000763553066938,
"grad_norm": 6.880834102630615,
"learning_rate": 0.0,
"loss": 1.8668,
"step": 983
}
],
"logging_steps": 1,
"max_steps": 983,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 246,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.031180351948718e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}