{
"best_metric": 1.3224910497665405,
"best_model_checkpoint": "miner_id_24/checkpoint-1050",
"epoch": 0.9657392504023914,
"eval_steps": 150,
"global_step": 1050,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009197516670498966,
"grad_norm": 6.181552886962891,
"learning_rate": 5e-06,
"loss": 6.0754,
"step": 1
},
{
"epoch": 0.0009197516670498966,
"eval_loss": 4.431160926818848,
"eval_runtime": 49.8075,
"eval_samples_per_second": 165.437,
"eval_steps_per_second": 20.68,
"step": 1
},
{
"epoch": 0.0018395033340997931,
"grad_norm": 6.504417896270752,
"learning_rate": 1e-05,
"loss": 5.7192,
"step": 2
},
{
"epoch": 0.0027592550011496897,
"grad_norm": 7.00899076461792,
"learning_rate": 1.5e-05,
"loss": 5.533,
"step": 3
},
{
"epoch": 0.0036790066681995862,
"grad_norm": 6.67568826675415,
"learning_rate": 2e-05,
"loss": 5.0143,
"step": 4
},
{
"epoch": 0.004598758335249482,
"grad_norm": 6.313548564910889,
"learning_rate": 2.5e-05,
"loss": 4.6726,
"step": 5
},
{
"epoch": 0.005518510002299379,
"grad_norm": 5.2927422523498535,
"learning_rate": 3e-05,
"loss": 4.6566,
"step": 6
},
{
"epoch": 0.0064382616693492755,
"grad_norm": 4.771329879760742,
"learning_rate": 3.5e-05,
"loss": 4.3112,
"step": 7
},
{
"epoch": 0.0073580133363991725,
"grad_norm": 3.6339666843414307,
"learning_rate": 4e-05,
"loss": 4.1199,
"step": 8
},
{
"epoch": 0.008277765003449069,
"grad_norm": 2.8113648891448975,
"learning_rate": 4.5e-05,
"loss": 3.9369,
"step": 9
},
{
"epoch": 0.009197516670498965,
"grad_norm": 2.2301437854766846,
"learning_rate": 5e-05,
"loss": 3.7798,
"step": 10
},
{
"epoch": 0.010117268337548863,
"grad_norm": 2.4432830810546875,
"learning_rate": 5.500000000000001e-05,
"loss": 3.8405,
"step": 11
},
{
"epoch": 0.011037020004598759,
"grad_norm": 1.870229721069336,
"learning_rate": 6e-05,
"loss": 3.6539,
"step": 12
},
{
"epoch": 0.011956771671648655,
"grad_norm": 1.9459682703018188,
"learning_rate": 6.500000000000001e-05,
"loss": 3.5456,
"step": 13
},
{
"epoch": 0.012876523338698551,
"grad_norm": 1.4028608798980713,
"learning_rate": 7e-05,
"loss": 3.4191,
"step": 14
},
{
"epoch": 0.013796275005748447,
"grad_norm": 1.9811638593673706,
"learning_rate": 7.500000000000001e-05,
"loss": 3.6083,
"step": 15
},
{
"epoch": 0.014716026672798345,
"grad_norm": 1.952579379081726,
"learning_rate": 8e-05,
"loss": 3.4243,
"step": 16
},
{
"epoch": 0.01563577833984824,
"grad_norm": 1.5935711860656738,
"learning_rate": 8.5e-05,
"loss": 3.3783,
"step": 17
},
{
"epoch": 0.016555530006898137,
"grad_norm": 1.475130558013916,
"learning_rate": 9e-05,
"loss": 3.3419,
"step": 18
},
{
"epoch": 0.017475281673948035,
"grad_norm": 1.465334177017212,
"learning_rate": 9.5e-05,
"loss": 3.2841,
"step": 19
},
{
"epoch": 0.01839503334099793,
"grad_norm": 1.5258549451828003,
"learning_rate": 0.0001,
"loss": 3.1315,
"step": 20
},
{
"epoch": 0.019314785008047827,
"grad_norm": 1.2697194814682007,
"learning_rate": 9.999978367986987e-05,
"loss": 3.1049,
"step": 21
},
{
"epoch": 0.020234536675097725,
"grad_norm": 1.0417594909667969,
"learning_rate": 9.999913472135125e-05,
"loss": 3.0702,
"step": 22
},
{
"epoch": 0.02115428834214762,
"grad_norm": 0.8249285221099854,
"learning_rate": 9.999805313005946e-05,
"loss": 3.0126,
"step": 23
},
{
"epoch": 0.022074040009197517,
"grad_norm": 0.8856204152107239,
"learning_rate": 9.99965389153533e-05,
"loss": 2.936,
"step": 24
},
{
"epoch": 0.022993791676247412,
"grad_norm": 1.0896774530410767,
"learning_rate": 9.999459209033495e-05,
"loss": 3.0088,
"step": 25
},
{
"epoch": 0.02391354334329731,
"grad_norm": 0.878311276435852,
"learning_rate": 9.999221267184993e-05,
"loss": 2.8434,
"step": 26
},
{
"epoch": 0.024833295010347207,
"grad_norm": 0.6598113179206848,
"learning_rate": 9.998940068048688e-05,
"loss": 2.7397,
"step": 27
},
{
"epoch": 0.025753046677397102,
"grad_norm": 0.8144488334655762,
"learning_rate": 9.998615614057742e-05,
"loss": 2.7315,
"step": 28
},
{
"epoch": 0.026672798344447,
"grad_norm": 0.8650857210159302,
"learning_rate": 9.998247908019593e-05,
"loss": 2.8126,
"step": 29
},
{
"epoch": 0.027592550011496894,
"grad_norm": 0.6536254286766052,
"learning_rate": 9.997836953115926e-05,
"loss": 2.7479,
"step": 30
},
{
"epoch": 0.028512301678546792,
"grad_norm": 1.0240334272384644,
"learning_rate": 9.997382752902657e-05,
"loss": 2.7575,
"step": 31
},
{
"epoch": 0.02943205334559669,
"grad_norm": 0.6431864500045776,
"learning_rate": 9.996885311309891e-05,
"loss": 2.6497,
"step": 32
},
{
"epoch": 0.030351805012646584,
"grad_norm": 0.6775922179222107,
"learning_rate": 9.996344632641894e-05,
"loss": 2.6301,
"step": 33
},
{
"epoch": 0.03127155667969648,
"grad_norm": 0.9252532124519348,
"learning_rate": 9.995760721577052e-05,
"loss": 2.6123,
"step": 34
},
{
"epoch": 0.032191308346746376,
"grad_norm": 0.6126474738121033,
"learning_rate": 9.995133583167832e-05,
"loss": 2.5311,
"step": 35
},
{
"epoch": 0.033111060013796274,
"grad_norm": 0.7896717190742493,
"learning_rate": 9.994463222840746e-05,
"loss": 2.5242,
"step": 36
},
{
"epoch": 0.03403081168084617,
"grad_norm": 0.6513295769691467,
"learning_rate": 9.993749646396286e-05,
"loss": 2.4802,
"step": 37
},
{
"epoch": 0.03495056334789607,
"grad_norm": 0.6320262551307678,
"learning_rate": 9.992992860008892e-05,
"loss": 2.5144,
"step": 38
},
{
"epoch": 0.03587031501494597,
"grad_norm": 0.9524784684181213,
"learning_rate": 9.992192870226889e-05,
"loss": 2.5425,
"step": 39
},
{
"epoch": 0.03679006668199586,
"grad_norm": 0.5857168436050415,
"learning_rate": 9.991349683972434e-05,
"loss": 2.447,
"step": 40
},
{
"epoch": 0.03770981834904576,
"grad_norm": 0.7533925175666809,
"learning_rate": 9.990463308541451e-05,
"loss": 2.3431,
"step": 41
},
{
"epoch": 0.038629570016095655,
"grad_norm": 0.5700300931930542,
"learning_rate": 9.989533751603577e-05,
"loss": 2.3499,
"step": 42
},
{
"epoch": 0.03954932168314555,
"grad_norm": 0.8808962106704712,
"learning_rate": 9.988561021202083e-05,
"loss": 2.3962,
"step": 43
},
{
"epoch": 0.04046907335019545,
"grad_norm": 0.7045830488204956,
"learning_rate": 9.987545125753819e-05,
"loss": 2.2948,
"step": 44
},
{
"epoch": 0.04138882501724534,
"grad_norm": 0.8472001552581787,
"learning_rate": 9.986486074049131e-05,
"loss": 2.3045,
"step": 45
},
{
"epoch": 0.04230857668429524,
"grad_norm": 0.7173092365264893,
"learning_rate": 9.985383875251783e-05,
"loss": 2.2929,
"step": 46
},
{
"epoch": 0.04322832835134514,
"grad_norm": 0.7962790727615356,
"learning_rate": 9.984238538898891e-05,
"loss": 2.327,
"step": 47
},
{
"epoch": 0.044148080018395035,
"grad_norm": 0.8624216318130493,
"learning_rate": 9.983050074900824e-05,
"loss": 2.187,
"step": 48
},
{
"epoch": 0.04506783168544493,
"grad_norm": 0.917926549911499,
"learning_rate": 9.98181849354113e-05,
"loss": 2.1946,
"step": 49
},
{
"epoch": 0.045987583352494824,
"grad_norm": 0.8663224577903748,
"learning_rate": 9.980543805476446e-05,
"loss": 2.0988,
"step": 50
},
{
"epoch": 0.04690733501954472,
"grad_norm": 11.713833808898926,
"learning_rate": 9.979226021736396e-05,
"loss": 4.3627,
"step": 51
},
{
"epoch": 0.04782708668659462,
"grad_norm": 5.917041301727295,
"learning_rate": 9.977865153723507e-05,
"loss": 3.7012,
"step": 52
},
{
"epoch": 0.04874683835364452,
"grad_norm": 3.241976499557495,
"learning_rate": 9.976461213213104e-05,
"loss": 3.1752,
"step": 53
},
{
"epoch": 0.049666590020694415,
"grad_norm": 3.6020805835723877,
"learning_rate": 9.975014212353213e-05,
"loss": 3.019,
"step": 54
},
{
"epoch": 0.050586341687744306,
"grad_norm": 3.3399133682250977,
"learning_rate": 9.973524163664447e-05,
"loss": 2.7453,
"step": 55
},
{
"epoch": 0.051506093354794204,
"grad_norm": 2.34346604347229,
"learning_rate": 9.97199108003991e-05,
"loss": 2.5133,
"step": 56
},
{
"epoch": 0.0524258450218441,
"grad_norm": 1.2596639394760132,
"learning_rate": 9.970414974745076e-05,
"loss": 2.5255,
"step": 57
},
{
"epoch": 0.053345596688894,
"grad_norm": 2.061197519302368,
"learning_rate": 9.968795861417676e-05,
"loss": 2.4012,
"step": 58
},
{
"epoch": 0.0542653483559439,
"grad_norm": 2.119333028793335,
"learning_rate": 9.967133754067582e-05,
"loss": 2.3668,
"step": 59
},
{
"epoch": 0.05518510002299379,
"grad_norm": 1.2170815467834473,
"learning_rate": 9.965428667076686e-05,
"loss": 2.4343,
"step": 60
},
{
"epoch": 0.056104851690043686,
"grad_norm": 1.0711098909378052,
"learning_rate": 9.963680615198773e-05,
"loss": 2.3052,
"step": 61
},
{
"epoch": 0.057024603357093584,
"grad_norm": 1.64667809009552,
"learning_rate": 9.961889613559395e-05,
"loss": 2.3781,
"step": 62
},
{
"epoch": 0.05794435502414348,
"grad_norm": 1.2105283737182617,
"learning_rate": 9.960055677655742e-05,
"loss": 2.357,
"step": 63
},
{
"epoch": 0.05886410669119338,
"grad_norm": 1.0943785905838013,
"learning_rate": 9.958178823356503e-05,
"loss": 2.2903,
"step": 64
},
{
"epoch": 0.05978385835824328,
"grad_norm": 1.5415120124816895,
"learning_rate": 9.956259066901733e-05,
"loss": 2.3312,
"step": 65
},
{
"epoch": 0.06070361002529317,
"grad_norm": 0.8917611837387085,
"learning_rate": 9.954296424902708e-05,
"loss": 2.32,
"step": 66
},
{
"epoch": 0.061623361692343066,
"grad_norm": 0.7154043316841125,
"learning_rate": 9.952290914341791e-05,
"loss": 2.24,
"step": 67
},
{
"epoch": 0.06254311335939296,
"grad_norm": 1.1616435050964355,
"learning_rate": 9.950242552572271e-05,
"loss": 2.2741,
"step": 68
},
{
"epoch": 0.06346286502644286,
"grad_norm": 0.7844848036766052,
"learning_rate": 9.948151357318228e-05,
"loss": 2.2333,
"step": 69
},
{
"epoch": 0.06438261669349275,
"grad_norm": 0.7282043695449829,
"learning_rate": 9.946017346674361e-05,
"loss": 2.1664,
"step": 70
},
{
"epoch": 0.06530236836054265,
"grad_norm": 0.7888442873954773,
"learning_rate": 9.943840539105854e-05,
"loss": 2.2735,
"step": 71
},
{
"epoch": 0.06622212002759255,
"grad_norm": 0.5766344666481018,
"learning_rate": 9.941620953448194e-05,
"loss": 2.1517,
"step": 72
},
{
"epoch": 0.06714187169464245,
"grad_norm": 0.7196112871170044,
"learning_rate": 9.939358608907026e-05,
"loss": 2.1162,
"step": 73
},
{
"epoch": 0.06806162336169234,
"grad_norm": 0.7088760137557983,
"learning_rate": 9.937053525057977e-05,
"loss": 2.1777,
"step": 74
},
{
"epoch": 0.06898137502874224,
"grad_norm": 0.5653288960456848,
"learning_rate": 9.934705721846487e-05,
"loss": 2.1762,
"step": 75
},
{
"epoch": 0.06990112669579214,
"grad_norm": 0.8287534117698669,
"learning_rate": 9.93231521958764e-05,
"loss": 2.1711,
"step": 76
},
{
"epoch": 0.07082087836284204,
"grad_norm": 0.4657200872898102,
"learning_rate": 9.929882038965989e-05,
"loss": 2.1953,
"step": 77
},
{
"epoch": 0.07174063002989194,
"grad_norm": 0.47897377610206604,
"learning_rate": 9.927406201035368e-05,
"loss": 2.1214,
"step": 78
},
{
"epoch": 0.07266038169694182,
"grad_norm": 0.678236722946167,
"learning_rate": 9.924887727218724e-05,
"loss": 2.0763,
"step": 79
},
{
"epoch": 0.07358013336399172,
"grad_norm": 0.4475807249546051,
"learning_rate": 9.922326639307917e-05,
"loss": 2.16,
"step": 80
},
{
"epoch": 0.07449988503104162,
"grad_norm": 0.49449393153190613,
"learning_rate": 9.919722959463544e-05,
"loss": 2.1382,
"step": 81
},
{
"epoch": 0.07541963669809151,
"grad_norm": 0.5139850974082947,
"learning_rate": 9.917076710214739e-05,
"loss": 2.1543,
"step": 82
},
{
"epoch": 0.07633938836514141,
"grad_norm": 0.5776795148849487,
"learning_rate": 9.914387914458982e-05,
"loss": 2.157,
"step": 83
},
{
"epoch": 0.07725914003219131,
"grad_norm": 0.573421061038971,
"learning_rate": 9.911656595461898e-05,
"loss": 2.0451,
"step": 84
},
{
"epoch": 0.07817889169924121,
"grad_norm": 0.6673893332481384,
"learning_rate": 9.908882776857056e-05,
"loss": 2.11,
"step": 85
},
{
"epoch": 0.0790986433662911,
"grad_norm": 0.5322157740592957,
"learning_rate": 9.906066482645772e-05,
"loss": 2.0667,
"step": 86
},
{
"epoch": 0.080018395033341,
"grad_norm": 0.7134078741073608,
"learning_rate": 9.903207737196891e-05,
"loss": 2.0217,
"step": 87
},
{
"epoch": 0.0809381467003909,
"grad_norm": 0.5911161303520203,
"learning_rate": 9.900306565246578e-05,
"loss": 2.0574,
"step": 88
},
{
"epoch": 0.08185789836744078,
"grad_norm": 0.6985921263694763,
"learning_rate": 9.897362991898109e-05,
"loss": 2.0796,
"step": 89
},
{
"epoch": 0.08277765003449068,
"grad_norm": 0.5797624588012695,
"learning_rate": 9.894377042621655e-05,
"loss": 2.0293,
"step": 90
},
{
"epoch": 0.08369740170154058,
"grad_norm": 0.6441212296485901,
"learning_rate": 9.891348743254046e-05,
"loss": 2.0006,
"step": 91
},
{
"epoch": 0.08461715336859048,
"grad_norm": 0.5719017386436462,
"learning_rate": 9.888278119998573e-05,
"loss": 1.9847,
"step": 92
},
{
"epoch": 0.08553690503564038,
"grad_norm": 0.618574857711792,
"learning_rate": 9.885165199424738e-05,
"loss": 1.9194,
"step": 93
},
{
"epoch": 0.08645665670269027,
"grad_norm": 0.8214313983917236,
"learning_rate": 9.882010008468036e-05,
"loss": 1.8845,
"step": 94
},
{
"epoch": 0.08737640836974017,
"grad_norm": 0.6314259767532349,
"learning_rate": 9.878812574429721e-05,
"loss": 1.8474,
"step": 95
},
{
"epoch": 0.08829616003679007,
"grad_norm": 0.6584024429321289,
"learning_rate": 9.875572924976568e-05,
"loss": 1.8843,
"step": 96
},
{
"epoch": 0.08921591170383997,
"grad_norm": 0.7131389379501343,
"learning_rate": 9.87229108814063e-05,
"loss": 1.9198,
"step": 97
},
{
"epoch": 0.09013566337088987,
"grad_norm": 0.824288010597229,
"learning_rate": 9.868967092319003e-05,
"loss": 1.8658,
"step": 98
},
{
"epoch": 0.09105541503793976,
"grad_norm": 0.7455874681472778,
"learning_rate": 9.865600966273575e-05,
"loss": 1.7975,
"step": 99
},
{
"epoch": 0.09197516670498965,
"grad_norm": 1.2152295112609863,
"learning_rate": 9.86219273913078e-05,
"loss": 1.7226,
"step": 100
},
{
"epoch": 0.09289491837203954,
"grad_norm": 5.640716552734375,
"learning_rate": 9.858742440381343e-05,
"loss": 3.5625,
"step": 101
},
{
"epoch": 0.09381467003908944,
"grad_norm": 3.7876381874084473,
"learning_rate": 9.855250099880025e-05,
"loss": 3.0309,
"step": 102
},
{
"epoch": 0.09473442170613934,
"grad_norm": 2.426966428756714,
"learning_rate": 9.851715747845373e-05,
"loss": 2.6085,
"step": 103
},
{
"epoch": 0.09565417337318924,
"grad_norm": 2.368666172027588,
"learning_rate": 9.848139414859441e-05,
"loss": 2.457,
"step": 104
},
{
"epoch": 0.09657392504023914,
"grad_norm": 1.607815146446228,
"learning_rate": 9.844521131867546e-05,
"loss": 2.2837,
"step": 105
},
{
"epoch": 0.09749367670728903,
"grad_norm": 1.2020126581192017,
"learning_rate": 9.840860930177983e-05,
"loss": 2.1918,
"step": 106
},
{
"epoch": 0.09841342837433893,
"grad_norm": 1.469667673110962,
"learning_rate": 9.837158841461766e-05,
"loss": 2.1856,
"step": 107
},
{
"epoch": 0.09933318004138883,
"grad_norm": 1.2101978063583374,
"learning_rate": 9.833414897752347e-05,
"loss": 2.1572,
"step": 108
},
{
"epoch": 0.10025293170843873,
"grad_norm": 1.0145184993743896,
"learning_rate": 9.829629131445342e-05,
"loss": 2.0651,
"step": 109
},
{
"epoch": 0.10117268337548861,
"grad_norm": 1.0942986011505127,
"learning_rate": 9.825801575298248e-05,
"loss": 2.1006,
"step": 110
},
{
"epoch": 0.10209243504253851,
"grad_norm": 0.812549889087677,
"learning_rate": 9.821932262430165e-05,
"loss": 2.0787,
"step": 111
},
{
"epoch": 0.10301218670958841,
"grad_norm": 0.9913772344589233,
"learning_rate": 9.8180212263215e-05,
"loss": 2.0555,
"step": 112
},
{
"epoch": 0.1039319383766383,
"grad_norm": 0.7573890686035156,
"learning_rate": 9.814068500813692e-05,
"loss": 2.022,
"step": 113
},
{
"epoch": 0.1048516900436882,
"grad_norm": 0.876980185508728,
"learning_rate": 9.8100741201089e-05,
"loss": 2.0677,
"step": 114
},
{
"epoch": 0.1057714417107381,
"grad_norm": 0.8768622875213623,
"learning_rate": 9.806038118769723e-05,
"loss": 2.0766,
"step": 115
},
{
"epoch": 0.106691193377788,
"grad_norm": 0.6824678182601929,
"learning_rate": 9.801960531718896e-05,
"loss": 2.1323,
"step": 116
},
{
"epoch": 0.1076109450448379,
"grad_norm": 0.9467669129371643,
"learning_rate": 9.797841394238986e-05,
"loss": 1.9414,
"step": 117
},
{
"epoch": 0.1085306967118878,
"grad_norm": 0.5850769281387329,
"learning_rate": 9.793680741972084e-05,
"loss": 1.9249,
"step": 118
},
{
"epoch": 0.10945044837893769,
"grad_norm": 0.8185686469078064,
"learning_rate": 9.789478610919507e-05,
"loss": 1.9541,
"step": 119
},
{
"epoch": 0.11037020004598758,
"grad_norm": 0.9609946608543396,
"learning_rate": 9.785235037441474e-05,
"loss": 1.943,
"step": 120
},
{
"epoch": 0.11128995171303747,
"grad_norm": 0.6438754796981812,
"learning_rate": 9.780950058256802e-05,
"loss": 1.9613,
"step": 121
},
{
"epoch": 0.11220970338008737,
"grad_norm": 1.0584321022033691,
"learning_rate": 9.776623710442579e-05,
"loss": 1.9652,
"step": 122
},
{
"epoch": 0.11312945504713727,
"grad_norm": 0.5727084279060364,
"learning_rate": 9.772256031433849e-05,
"loss": 1.9769,
"step": 123
},
{
"epoch": 0.11404920671418717,
"grad_norm": 0.8819255828857422,
"learning_rate": 9.767847059023291e-05,
"loss": 2.0024,
"step": 124
},
{
"epoch": 0.11496895838123707,
"grad_norm": 0.8120801448822021,
"learning_rate": 9.763396831360884e-05,
"loss": 1.9066,
"step": 125
},
{
"epoch": 0.11588871004828696,
"grad_norm": 0.5545021891593933,
"learning_rate": 9.758905386953579e-05,
"loss": 1.9619,
"step": 126
},
{
"epoch": 0.11680846171533686,
"grad_norm": 1.0289326906204224,
"learning_rate": 9.754372764664969e-05,
"loss": 1.9098,
"step": 127
},
{
"epoch": 0.11772821338238676,
"grad_norm": 0.609516441822052,
"learning_rate": 9.749799003714954e-05,
"loss": 1.9147,
"step": 128
},
{
"epoch": 0.11864796504943666,
"grad_norm": 0.7941620945930481,
"learning_rate": 9.745184143679397e-05,
"loss": 1.8968,
"step": 129
},
{
"epoch": 0.11956771671648656,
"grad_norm": 0.787964940071106,
"learning_rate": 9.74052822448978e-05,
"loss": 1.9712,
"step": 130
},
{
"epoch": 0.12048746838353644,
"grad_norm": 0.730323314666748,
"learning_rate": 9.735831286432868e-05,
"loss": 1.8993,
"step": 131
},
{
"epoch": 0.12140722005058634,
"grad_norm": 0.8297889232635498,
"learning_rate": 9.731093370150349e-05,
"loss": 1.9682,
"step": 132
},
{
"epoch": 0.12232697171763623,
"grad_norm": 0.768775463104248,
"learning_rate": 9.72631451663849e-05,
"loss": 1.8542,
"step": 133
},
{
"epoch": 0.12324672338468613,
"grad_norm": 0.7137448787689209,
"learning_rate": 9.721494767247779e-05,
"loss": 1.8801,
"step": 134
},
{
"epoch": 0.12416647505173603,
"grad_norm": 0.6385506987571716,
"learning_rate": 9.716634163682569e-05,
"loss": 1.8384,
"step": 135
},
{
"epoch": 0.12508622671878591,
"grad_norm": 0.7410357594490051,
"learning_rate": 9.71173274800072e-05,
"loss": 1.8761,
"step": 136
},
{
"epoch": 0.12600597838583583,
"grad_norm": 0.7702000737190247,
"learning_rate": 9.706790562613219e-05,
"loss": 1.8183,
"step": 137
},
{
"epoch": 0.1269257300528857,
"grad_norm": 0.6795453429222107,
"learning_rate": 9.701807650283839e-05,
"loss": 1.8434,
"step": 138
},
{
"epoch": 0.12784548171993562,
"grad_norm": 0.8809398412704468,
"learning_rate": 9.696784054128749e-05,
"loss": 1.8462,
"step": 139
},
{
"epoch": 0.1287652333869855,
"grad_norm": 0.9881577491760254,
"learning_rate": 9.691719817616147e-05,
"loss": 1.7828,
"step": 140
},
{
"epoch": 0.12968498505403542,
"grad_norm": 0.9603993892669678,
"learning_rate": 9.686614984565887e-05,
"loss": 1.8768,
"step": 141
},
{
"epoch": 0.1306047367210853,
"grad_norm": 1.0421313047409058,
"learning_rate": 9.681469599149092e-05,
"loss": 1.8302,
"step": 142
},
{
"epoch": 0.1315244883881352,
"grad_norm": 0.8529607653617859,
"learning_rate": 9.676283705887783e-05,
"loss": 1.7531,
"step": 143
},
{
"epoch": 0.1324442400551851,
"grad_norm": 0.8817620277404785,
"learning_rate": 9.67105734965448e-05,
"loss": 1.7358,
"step": 144
},
{
"epoch": 0.133363991722235,
"grad_norm": 0.9506654739379883,
"learning_rate": 9.665790575671829e-05,
"loss": 1.7789,
"step": 145
},
{
"epoch": 0.1342837433892849,
"grad_norm": 1.1102913618087769,
"learning_rate": 9.660483429512199e-05,
"loss": 1.7401,
"step": 146
},
{
"epoch": 0.13520349505633478,
"grad_norm": 0.7556246519088745,
"learning_rate": 9.65513595709729e-05,
"loss": 1.728,
"step": 147
},
{
"epoch": 0.1361232467233847,
"grad_norm": 1.1163665056228638,
"learning_rate": 9.64974820469774e-05,
"loss": 1.6618,
"step": 148
},
{
"epoch": 0.13704299839043457,
"grad_norm": 0.9814196228981018,
"learning_rate": 9.644320218932722e-05,
"loss": 1.616,
"step": 149
},
{
"epoch": 0.13796275005748448,
"grad_norm": 1.2995212078094482,
"learning_rate": 9.638852046769539e-05,
"loss": 1.6275,
"step": 150
},
{
"epoch": 0.13796275005748448,
"eval_loss": 1.9198498725891113,
"eval_runtime": 50.0535,
"eval_samples_per_second": 164.624,
"eval_steps_per_second": 20.578,
"step": 150
},
{
"epoch": 0.13888250172453437,
"grad_norm": 3.668370485305786,
"learning_rate": 9.633343735523219e-05,
"loss": 2.841,
"step": 151
},
{
"epoch": 0.13980225339158428,
"grad_norm": 2.5073230266571045,
"learning_rate": 9.627795332856107e-05,
"loss": 2.3706,
"step": 152
},
{
"epoch": 0.14072200505863416,
"grad_norm": 1.542073130607605,
"learning_rate": 9.622206886777448e-05,
"loss": 2.1699,
"step": 153
},
{
"epoch": 0.14164175672568408,
"grad_norm": 1.3604127168655396,
"learning_rate": 9.616578445642981e-05,
"loss": 1.9859,
"step": 154
},
{
"epoch": 0.14256150839273396,
"grad_norm": 1.1186628341674805,
"learning_rate": 9.61091005815451e-05,
"loss": 1.9205,
"step": 155
},
{
"epoch": 0.14348126005978387,
"grad_norm": 1.1308863162994385,
"learning_rate": 9.605201773359485e-05,
"loss": 1.9819,
"step": 156
},
{
"epoch": 0.14440101172683376,
"grad_norm": 1.0661953687667847,
"learning_rate": 9.599453640650585e-05,
"loss": 1.9109,
"step": 157
},
{
"epoch": 0.14532076339388364,
"grad_norm": 0.7912338376045227,
"learning_rate": 9.59366570976528e-05,
"loss": 1.9331,
"step": 158
},
{
"epoch": 0.14624051506093355,
"grad_norm": 0.9056004881858826,
"learning_rate": 9.587838030785413e-05,
"loss": 1.9323,
"step": 159
},
{
"epoch": 0.14716026672798344,
"grad_norm": 1.0585856437683105,
"learning_rate": 9.581970654136751e-05,
"loss": 1.9443,
"step": 160
},
{
"epoch": 0.14808001839503335,
"grad_norm": 1.0043240785598755,
"learning_rate": 9.576063630588563e-05,
"loss": 1.8468,
"step": 161
},
{
"epoch": 0.14899977006208323,
"grad_norm": 0.9187436699867249,
"learning_rate": 9.570117011253174e-05,
"loss": 1.9558,
"step": 162
},
{
"epoch": 0.14991952172913314,
"grad_norm": 0.862158477306366,
"learning_rate": 9.56413084758552e-05,
"loss": 1.851,
"step": 163
},
{
"epoch": 0.15083927339618303,
"grad_norm": 1.04788076877594,
"learning_rate": 9.55810519138271e-05,
"loss": 1.884,
"step": 164
},
{
"epoch": 0.15175902506323294,
"grad_norm": 0.807015597820282,
"learning_rate": 9.552040094783574e-05,
"loss": 1.8688,
"step": 165
},
{
"epoch": 0.15267877673028282,
"grad_norm": 0.8749469518661499,
"learning_rate": 9.545935610268211e-05,
"loss": 1.8487,
"step": 166
},
{
"epoch": 0.1535985283973327,
"grad_norm": 0.7388503551483154,
"learning_rate": 9.539791790657538e-05,
"loss": 1.8447,
"step": 167
},
{
"epoch": 0.15451828006438262,
"grad_norm": 0.8812807202339172,
"learning_rate": 9.533608689112827e-05,
"loss": 1.8848,
"step": 168
},
{
"epoch": 0.1554380317314325,
"grad_norm": 0.6926305890083313,
"learning_rate": 9.527386359135253e-05,
"loss": 1.824,
"step": 169
},
{
"epoch": 0.15635778339848241,
"grad_norm": 0.7211126089096069,
"learning_rate": 9.521124854565425e-05,
"loss": 1.8291,
"step": 170
},
{
"epoch": 0.1572775350655323,
"grad_norm": 0.717591404914856,
"learning_rate": 9.514824229582921e-05,
"loss": 1.8463,
"step": 171
},
{
"epoch": 0.1581972867325822,
"grad_norm": 0.5658002495765686,
"learning_rate": 9.508484538705824e-05,
"loss": 1.8864,
"step": 172
},
{
"epoch": 0.1591170383996321,
"grad_norm": 0.8670650720596313,
"learning_rate": 9.50210583679024e-05,
"loss": 1.8437,
"step": 173
},
{
"epoch": 0.160036790066682,
"grad_norm": 0.6736385822296143,
"learning_rate": 9.495688179029838e-05,
"loss": 1.8376,
"step": 174
},
{
"epoch": 0.1609565417337319,
"grad_norm": 0.7114839553833008,
"learning_rate": 9.489231620955359e-05,
"loss": 1.8259,
"step": 175
},
{
"epoch": 0.1618762934007818,
"grad_norm": 0.8745600581169128,
"learning_rate": 9.482736218434143e-05,
"loss": 1.8571,
"step": 176
},
{
"epoch": 0.16279604506783169,
"grad_norm": 0.594724714756012,
"learning_rate": 9.476202027669643e-05,
"loss": 1.8385,
"step": 177
},
{
"epoch": 0.16371579673488157,
"grad_norm": 0.8559861183166504,
"learning_rate": 9.469629105200937e-05,
"loss": 1.805,
"step": 178
},
{
"epoch": 0.16463554840193148,
"grad_norm": 0.6145199537277222,
"learning_rate": 9.463017507902244e-05,
"loss": 1.8331,
"step": 179
},
{
"epoch": 0.16555530006898136,
"grad_norm": 1.0015912055969238,
"learning_rate": 9.456367292982429e-05,
"loss": 1.7974,
"step": 180
},
{
"epoch": 0.16647505173603128,
"grad_norm": 0.5909841060638428,
"learning_rate": 9.449678517984502e-05,
"loss": 1.787,
"step": 181
},
{
"epoch": 0.16739480340308116,
"grad_norm": 0.766480565071106,
"learning_rate": 9.442951240785135e-05,
"loss": 1.7213,
"step": 182
},
{
"epoch": 0.16831455507013107,
"grad_norm": 0.6516543626785278,
"learning_rate": 9.436185519594145e-05,
"loss": 1.7548,
"step": 183
},
{
"epoch": 0.16923430673718096,
"grad_norm": 0.7793421745300293,
"learning_rate": 9.429381412953999e-05,
"loss": 1.7481,
"step": 184
},
{
"epoch": 0.17015405840423087,
"grad_norm": 0.8920656442642212,
"learning_rate": 9.422538979739307e-05,
"loss": 1.805,
"step": 185
},
{
"epoch": 0.17107381007128075,
"grad_norm": 0.8302977085113525,
"learning_rate": 9.415658279156311e-05,
"loss": 1.7267,
"step": 186
},
{
"epoch": 0.17199356173833066,
"grad_norm": 0.8947249054908752,
"learning_rate": 9.408739370742373e-05,
"loss": 1.6794,
"step": 187
},
{
"epoch": 0.17291331340538055,
"grad_norm": 0.6332067251205444,
"learning_rate": 9.401782314365457e-05,
"loss": 1.7127,
"step": 188
},
{
"epoch": 0.17383306507243043,
"grad_norm": 0.830932080745697,
"learning_rate": 9.39478717022362e-05,
"loss": 1.6696,
"step": 189
},
{
"epoch": 0.17475281673948034,
"grad_norm": 0.6934016942977905,
"learning_rate": 9.387753998844482e-05,
"loss": 1.6327,
"step": 190
},
{
"epoch": 0.17567256840653023,
"grad_norm": 0.733917236328125,
"learning_rate": 9.380682861084701e-05,
"loss": 1.6992,
"step": 191
},
{
"epoch": 0.17659232007358014,
"grad_norm": 0.7675406336784363,
"learning_rate": 9.373573818129458e-05,
"loss": 1.6759,
"step": 192
},
{
"epoch": 0.17751207174063002,
"grad_norm": 0.8431460857391357,
"learning_rate": 9.366426931491916e-05,
"loss": 1.6044,
"step": 193
},
{
"epoch": 0.17843182340767993,
"grad_norm": 0.7542397975921631,
"learning_rate": 9.359242263012693e-05,
"loss": 1.6274,
"step": 194
},
{
"epoch": 0.17935157507472982,
"grad_norm": 0.8931959867477417,
"learning_rate": 9.352019874859325e-05,
"loss": 1.6006,
"step": 195
},
{
"epoch": 0.18027132674177973,
"grad_norm": 0.8215823769569397,
"learning_rate": 9.344759829525733e-05,
"loss": 1.5865,
"step": 196
},
{
"epoch": 0.18119107840882961,
"grad_norm": 0.7112393379211426,
"learning_rate": 9.337462189831669e-05,
"loss": 1.5478,
"step": 197
},
{
"epoch": 0.18211083007587953,
"grad_norm": 1.0283434391021729,
"learning_rate": 9.330127018922194e-05,
"loss": 1.5316,
"step": 198
},
{
"epoch": 0.1830305817429294,
"grad_norm": 0.9886683225631714,
"learning_rate": 9.322754380267109e-05,
"loss": 1.4653,
"step": 199
},
{
"epoch": 0.1839503334099793,
"grad_norm": 1.064937949180603,
"learning_rate": 9.315344337660421e-05,
"loss": 1.4673,
"step": 200
},
{
"epoch": 0.1848700850770292,
"grad_norm": 3.375886917114258,
"learning_rate": 9.307896955219786e-05,
"loss": 2.5919,
"step": 201
},
{
"epoch": 0.1857898367440791,
"grad_norm": 2.260359764099121,
"learning_rate": 9.300412297385954e-05,
"loss": 2.1729,
"step": 202
},
{
"epoch": 0.186709588411129,
"grad_norm": 1.4669098854064941,
"learning_rate": 9.292890428922209e-05,
"loss": 1.9383,
"step": 203
},
{
"epoch": 0.18762934007817889,
"grad_norm": 1.037178635597229,
"learning_rate": 9.285331414913815e-05,
"loss": 1.9071,
"step": 204
},
{
"epoch": 0.1885490917452288,
"grad_norm": 1.154489517211914,
"learning_rate": 9.277735320767449e-05,
"loss": 1.8216,
"step": 205
},
{
"epoch": 0.18946884341227868,
"grad_norm": 1.0613019466400146,
"learning_rate": 9.270102212210632e-05,
"loss": 1.7831,
"step": 206
},
{
"epoch": 0.1903885950793286,
"grad_norm": 1.1248329877853394,
"learning_rate": 9.262432155291167e-05,
"loss": 1.8591,
"step": 207
},
{
"epoch": 0.19130834674637848,
"grad_norm": 0.8293649554252625,
"learning_rate": 9.254725216376561e-05,
"loss": 1.8205,
"step": 208
},
{
"epoch": 0.19222809841342836,
"grad_norm": 0.9506818652153015,
"learning_rate": 9.246981462153456e-05,
"loss": 1.8283,
"step": 209
},
{
"epoch": 0.19314785008047827,
"grad_norm": 0.8719251155853271,
"learning_rate": 9.239200959627048e-05,
"loss": 1.7719,
"step": 210
},
{
"epoch": 0.19406760174752816,
"grad_norm": 0.808614194393158,
"learning_rate": 9.231383776120512e-05,
"loss": 1.8825,
"step": 211
},
{
"epoch": 0.19498735341457807,
"grad_norm": 0.897612988948822,
"learning_rate": 9.22352997927441e-05,
"loss": 1.8061,
"step": 212
},
{
"epoch": 0.19590710508162795,
"grad_norm": 0.7289676070213318,
"learning_rate": 9.215639637046121e-05,
"loss": 1.8348,
"step": 213
},
{
"epoch": 0.19682685674867786,
"grad_norm": 0.8267980813980103,
"learning_rate": 9.207712817709236e-05,
"loss": 1.7645,
"step": 214
},
{
"epoch": 0.19774660841572775,
"grad_norm": 0.7317152619361877,
"learning_rate": 9.19974958985298e-05,
"loss": 1.7478,
"step": 215
},
{
"epoch": 0.19866636008277766,
"grad_norm": 0.6896607875823975,
"learning_rate": 9.191750022381614e-05,
"loss": 1.7699,
"step": 216
},
{
"epoch": 0.19958611174982754,
"grad_norm": 0.7086347937583923,
"learning_rate": 9.183714184513832e-05,
"loss": 1.7938,
"step": 217
},
{
"epoch": 0.20050586341687746,
"grad_norm": 0.6830713152885437,
"learning_rate": 9.175642145782179e-05,
"loss": 1.7568,
"step": 218
},
{
"epoch": 0.20142561508392734,
"grad_norm": 0.5826436281204224,
"learning_rate": 9.167533976032429e-05,
"loss": 1.7548,
"step": 219
},
{
"epoch": 0.20234536675097722,
"grad_norm": 0.669696569442749,
"learning_rate": 9.159389745423002e-05,
"loss": 1.8096,
"step": 220
},
{
"epoch": 0.20326511841802714,
"grad_norm": 0.6378855109214783,
"learning_rate": 9.151209524424333e-05,
"loss": 1.7248,
"step": 221
},
{
"epoch": 0.20418487008507702,
"grad_norm": 0.7418368458747864,
"learning_rate": 9.142993383818283e-05,
"loss": 1.6951,
"step": 222
},
{
"epoch": 0.20510462175212693,
"grad_norm": 0.6502818465232849,
"learning_rate": 9.134741394697517e-05,
"loss": 1.6809,
"step": 223
},
{
"epoch": 0.20602437341917682,
"grad_norm": 0.6646417379379272,
"learning_rate": 9.126453628464888e-05,
"loss": 1.7178,
"step": 224
},
{
"epoch": 0.20694412508622673,
"grad_norm": 0.7070106267929077,
"learning_rate": 9.118130156832823e-05,
"loss": 1.7629,
"step": 225
},
{
"epoch": 0.2078638767532766,
"grad_norm": 0.6244888305664062,
"learning_rate": 9.109771051822702e-05,
"loss": 1.763,
"step": 226
},
{
"epoch": 0.20878362842032652,
"grad_norm": 0.6641138195991516,
"learning_rate": 9.10137638576423e-05,
"loss": 1.7016,
"step": 227
},
{
"epoch": 0.2097033800873764,
"grad_norm": 0.7198558449745178,
"learning_rate": 9.092946231294819e-05,
"loss": 1.7247,
"step": 228
},
{
"epoch": 0.21062313175442632,
"grad_norm": 0.5700192451477051,
"learning_rate": 9.084480661358953e-05,
"loss": 1.6782,
"step": 229
},
{
"epoch": 0.2115428834214762,
"grad_norm": 0.8081958293914795,
"learning_rate": 9.075979749207561e-05,
"loss": 1.7437,
"step": 230
},
{
"epoch": 0.2124626350885261,
"grad_norm": 0.7449802756309509,
"learning_rate": 9.067443568397378e-05,
"loss": 1.6924,
"step": 231
},
{
"epoch": 0.213382386755576,
"grad_norm": 0.8385685086250305,
"learning_rate": 9.058872192790313e-05,
"loss": 1.6572,
"step": 232
},
{
"epoch": 0.21430213842262588,
"grad_norm": 0.7077139616012573,
"learning_rate": 9.050265696552812e-05,
"loss": 1.6949,
"step": 233
},
{
"epoch": 0.2152218900896758,
"grad_norm": 0.7295122742652893,
"learning_rate": 9.041624154155208e-05,
"loss": 1.6745,
"step": 234
},
{
"epoch": 0.21614164175672568,
"grad_norm": 0.6347808241844177,
"learning_rate": 9.032947640371086e-05,
"loss": 1.6441,
"step": 235
},
{
"epoch": 0.2170613934237756,
"grad_norm": 0.8323748707771301,
"learning_rate": 9.024236230276629e-05,
"loss": 1.6198,
"step": 236
},
{
"epoch": 0.21798114509082547,
"grad_norm": 0.7440972328186035,
"learning_rate": 9.01548999924997e-05,
"loss": 1.6405,
"step": 237
},
{
"epoch": 0.21890089675787539,
"grad_norm": 0.7849915623664856,
"learning_rate": 9.006709022970547e-05,
"loss": 1.6361,
"step": 238
},
{
"epoch": 0.21982064842492527,
"grad_norm": 0.7478511929512024,
"learning_rate": 8.997893377418432e-05,
"loss": 1.543,
"step": 239
},
{
"epoch": 0.22074040009197515,
"grad_norm": 0.6225507259368896,
"learning_rate": 8.98904313887369e-05,
"loss": 1.6248,
"step": 240
},
{
"epoch": 0.22166015175902506,
"grad_norm": 0.6926827430725098,
"learning_rate": 8.980158383915713e-05,
"loss": 1.6449,
"step": 241
},
{
"epoch": 0.22257990342607495,
"grad_norm": 0.6942108869552612,
"learning_rate": 8.971239189422555e-05,
"loss": 1.5912,
"step": 242
},
{
"epoch": 0.22349965509312486,
"grad_norm": 0.623525857925415,
"learning_rate": 8.962285632570267e-05,
"loss": 1.5436,
"step": 243
},
{
"epoch": 0.22441940676017474,
"grad_norm": 0.5779447555541992,
"learning_rate": 8.953297790832231e-05,
"loss": 1.5747,
"step": 244
},
{
"epoch": 0.22533915842722466,
"grad_norm": 0.7703275680541992,
"learning_rate": 8.944275741978493e-05,
"loss": 1.5648,
"step": 245
},
{
"epoch": 0.22625891009427454,
"grad_norm": 0.7855743765830994,
"learning_rate": 8.935219564075085e-05,
"loss": 1.5246,
"step": 246
},
{
"epoch": 0.22717866176132445,
"grad_norm": 0.851977527141571,
"learning_rate": 8.926129335483349e-05,
"loss": 1.4777,
"step": 247
},
{
"epoch": 0.22809841342837434,
"grad_norm": 0.8636126518249512,
"learning_rate": 8.917005134859263e-05,
"loss": 1.5235,
"step": 248
},
{
"epoch": 0.22901816509542425,
"grad_norm": 1.055405616760254,
"learning_rate": 8.907847041152756e-05,
"loss": 1.5131,
"step": 249
},
{
"epoch": 0.22993791676247413,
"grad_norm": 1.2434190511703491,
"learning_rate": 8.89865513360703e-05,
"loss": 1.3169,
"step": 250
},
{
"epoch": 0.23085766842952402,
"grad_norm": 2.794989585876465,
"learning_rate": 8.889429491757871e-05,
"loss": 2.3149,
"step": 251
},
{
"epoch": 0.23177742009657393,
"grad_norm": 2.0627057552337646,
"learning_rate": 8.88017019543296e-05,
"loss": 2.0616,
"step": 252
},
{
"epoch": 0.2326971717636238,
"grad_norm": 1.3948839902877808,
"learning_rate": 8.870877324751184e-05,
"loss": 1.9026,
"step": 253
},
{
"epoch": 0.23361692343067372,
"grad_norm": 0.9678890109062195,
"learning_rate": 8.861550960121945e-05,
"loss": 1.8307,
"step": 254
},
{
"epoch": 0.2345366750977236,
"grad_norm": 1.0957893133163452,
"learning_rate": 8.852191182244456e-05,
"loss": 1.7364,
"step": 255
},
{
"epoch": 0.23545642676477352,
"grad_norm": 0.9677236676216125,
"learning_rate": 8.842798072107054e-05,
"loss": 1.762,
"step": 256
},
{
"epoch": 0.2363761784318234,
"grad_norm": 1.012479305267334,
"learning_rate": 8.833371710986493e-05,
"loss": 1.6711,
"step": 257
},
{
"epoch": 0.23729593009887331,
"grad_norm": 0.8846522569656372,
"learning_rate": 8.823912180447236e-05,
"loss": 1.8402,
"step": 258
},
{
"epoch": 0.2382156817659232,
"grad_norm": 1.0523695945739746,
"learning_rate": 8.81441956234076e-05,
"loss": 1.703,
"step": 259
},
{
"epoch": 0.2391354334329731,
"grad_norm": 1.0177359580993652,
"learning_rate": 8.80489393880484e-05,
"loss": 1.7218,
"step": 260
},
{
"epoch": 0.240055185100023,
"grad_norm": 0.8454842567443848,
"learning_rate": 8.79533539226284e-05,
"loss": 1.6839,
"step": 261
},
{
"epoch": 0.24097493676707288,
"grad_norm": 0.9161872863769531,
"learning_rate": 8.785744005423002e-05,
"loss": 1.7333,
"step": 262
},
{
"epoch": 0.2418946884341228,
"grad_norm": 0.7548457384109497,
"learning_rate": 8.77611986127773e-05,
"loss": 1.696,
"step": 263
},
{
"epoch": 0.24281444010117267,
"grad_norm": 0.9760596752166748,
"learning_rate": 8.766463043102864e-05,
"loss": 1.7102,
"step": 264
},
{
"epoch": 0.24373419176822259,
"grad_norm": 0.7247944474220276,
"learning_rate": 8.756773634456975e-05,
"loss": 1.7439,
"step": 265
},
{
"epoch": 0.24465394343527247,
"grad_norm": 0.7252097129821777,
"learning_rate": 8.747051719180626e-05,
"loss": 1.7811,
"step": 266
},
{
"epoch": 0.24557369510232238,
"grad_norm": 0.6071887016296387,
"learning_rate": 8.737297381395657e-05,
"loss": 1.6398,
"step": 267
},
{
"epoch": 0.24649344676937227,
"grad_norm": 0.7072895765304565,
"learning_rate": 8.727510705504454e-05,
"loss": 1.68,
"step": 268
},
{
"epoch": 0.24741319843642218,
"grad_norm": 0.7006264925003052,
"learning_rate": 8.717691776189214e-05,
"loss": 1.6814,
"step": 269
},
{
"epoch": 0.24833295010347206,
"grad_norm": 0.6832376718521118,
"learning_rate": 8.707840678411224e-05,
"loss": 1.6259,
"step": 270
},
{
"epoch": 0.24925270177052197,
"grad_norm": 0.5689120292663574,
"learning_rate": 8.697957497410108e-05,
"loss": 1.6786,
"step": 271
},
{
"epoch": 0.25017245343757183,
"grad_norm": 0.8517261743545532,
"learning_rate": 8.688042318703111e-05,
"loss": 1.6644,
"step": 272
},
{
"epoch": 0.25109220510462177,
"grad_norm": 0.5697482824325562,
"learning_rate": 8.678095228084343e-05,
"loss": 1.6705,
"step": 273
},
{
"epoch": 0.25201195677167165,
"grad_norm": 0.6067523956298828,
"learning_rate": 8.66811631162404e-05,
"loss": 1.7022,
"step": 274
},
{
"epoch": 0.25293170843872154,
"grad_norm": 0.6944383382797241,
"learning_rate": 8.65810565566782e-05,
"loss": 1.6235,
"step": 275
},
{
"epoch": 0.2538514601057714,
"grad_norm": 0.5674624443054199,
"learning_rate": 8.648063346835942e-05,
"loss": 1.6757,
"step": 276
},
{
"epoch": 0.25477121177282136,
"grad_norm": 0.6712316274642944,
"learning_rate": 8.637989472022549e-05,
"loss": 1.627,
"step": 277
},
{
"epoch": 0.25569096343987124,
"grad_norm": 0.5806477069854736,
"learning_rate": 8.627884118394913e-05,
"loss": 1.6709,
"step": 278
},
{
"epoch": 0.25661071510692113,
"grad_norm": 0.5989074110984802,
"learning_rate": 8.617747373392696e-05,
"loss": 1.6802,
"step": 279
},
{
"epoch": 0.257530466773971,
"grad_norm": 0.6222725510597229,
"learning_rate": 8.607579324727175e-05,
"loss": 1.5823,
"step": 280
},
{
"epoch": 0.25845021844102095,
"grad_norm": 0.6905350685119629,
"learning_rate": 8.597380060380493e-05,
"loss": 1.5795,
"step": 281
},
{
"epoch": 0.25936997010807084,
"grad_norm": 0.9093815684318542,
"learning_rate": 8.5871496686049e-05,
"loss": 1.6131,
"step": 282
},
{
"epoch": 0.2602897217751207,
"grad_norm": 0.8468539714813232,
"learning_rate": 8.576888237921983e-05,
"loss": 1.5836,
"step": 283
},
{
"epoch": 0.2612094734421706,
"grad_norm": 0.8949149250984192,
"learning_rate": 8.566595857121902e-05,
"loss": 1.5574,
"step": 284
},
{
"epoch": 0.2621292251092205,
"grad_norm": 0.7991402745246887,
"learning_rate": 8.556272615262622e-05,
"loss": 1.5941,
"step": 285
},
{
"epoch": 0.2630489767762704,
"grad_norm": 1.0631219148635864,
"learning_rate": 8.545918601669147e-05,
"loss": 1.6469,
"step": 286
},
{
"epoch": 0.2639687284433203,
"grad_norm": 0.6237906217575073,
"learning_rate": 8.535533905932738e-05,
"loss": 1.5148,
"step": 287
},
{
"epoch": 0.2648884801103702,
"grad_norm": 0.9192318320274353,
"learning_rate": 8.525118617910143e-05,
"loss": 1.4909,
"step": 288
},
{
"epoch": 0.2658082317774201,
"grad_norm": 0.8480085134506226,
"learning_rate": 8.514672827722824e-05,
"loss": 1.4746,
"step": 289
},
{
"epoch": 0.26672798344447,
"grad_norm": 0.9110789895057678,
"learning_rate": 8.504196625756166e-05,
"loss": 1.5245,
"step": 290
},
{
"epoch": 0.2676477351115199,
"grad_norm": 0.7915551066398621,
"learning_rate": 8.493690102658703e-05,
"loss": 1.4658,
"step": 291
},
{
"epoch": 0.2685674867785698,
"grad_norm": 0.8689735531806946,
"learning_rate": 8.483153349341335e-05,
"loss": 1.5159,
"step": 292
},
{
"epoch": 0.26948723844561967,
"grad_norm": 0.966712474822998,
"learning_rate": 8.472586456976535e-05,
"loss": 1.4782,
"step": 293
},
{
"epoch": 0.27040699011266955,
"grad_norm": 0.8555867075920105,
"learning_rate": 8.461989516997565e-05,
"loss": 1.5046,
"step": 294
},
{
"epoch": 0.2713267417797195,
"grad_norm": 0.8497052192687988,
"learning_rate": 8.45136262109768e-05,
"loss": 1.3816,
"step": 295
},
{
"epoch": 0.2722464934467694,
"grad_norm": 0.776263952255249,
"learning_rate": 8.440705861229344e-05,
"loss": 1.5065,
"step": 296
},
{
"epoch": 0.27316624511381926,
"grad_norm": 1.1991870403289795,
"learning_rate": 8.430019329603422e-05,
"loss": 1.4482,
"step": 297
},
{
"epoch": 0.27408599678086915,
"grad_norm": 0.9438532590866089,
"learning_rate": 8.41930311868839e-05,
"loss": 1.4023,
"step": 298
},
{
"epoch": 0.2750057484479191,
"grad_norm": 1.3889118432998657,
"learning_rate": 8.408557321209534e-05,
"loss": 1.3493,
"step": 299
},
{
"epoch": 0.27592550011496897,
"grad_norm": 1.7762432098388672,
"learning_rate": 8.397782030148147e-05,
"loss": 1.257,
"step": 300
},
{
"epoch": 0.27592550011496897,
"eval_loss": 1.6551681756973267,
"eval_runtime": 50.0018,
"eval_samples_per_second": 164.794,
"eval_steps_per_second": 20.599,
"step": 300
},
{
"epoch": 0.27684525178201885,
"grad_norm": 2.846353530883789,
"learning_rate": 8.386977338740724e-05,
"loss": 2.0714,
"step": 301
},
{
"epoch": 0.27776500344906874,
"grad_norm": 2.5227103233337402,
"learning_rate": 8.376143340478153e-05,
"loss": 1.8748,
"step": 302
},
{
"epoch": 0.2786847551161186,
"grad_norm": 2.0501370429992676,
"learning_rate": 8.365280129104912e-05,
"loss": 1.7948,
"step": 303
},
{
"epoch": 0.27960450678316856,
"grad_norm": 1.0905100107192993,
"learning_rate": 8.354387798618253e-05,
"loss": 1.7508,
"step": 304
},
{
"epoch": 0.28052425845021844,
"grad_norm": 1.1486353874206543,
"learning_rate": 8.343466443267391e-05,
"loss": 1.7368,
"step": 305
},
{
"epoch": 0.28144401011726833,
"grad_norm": 1.1892223358154297,
"learning_rate": 8.332516157552684e-05,
"loss": 1.6652,
"step": 306
},
{
"epoch": 0.2823637617843182,
"grad_norm": 1.027815341949463,
"learning_rate": 8.321537036224822e-05,
"loss": 1.6847,
"step": 307
},
{
"epoch": 0.28328351345136815,
"grad_norm": 1.1536738872528076,
"learning_rate": 8.310529174284004e-05,
"loss": 1.7384,
"step": 308
},
{
"epoch": 0.28420326511841804,
"grad_norm": 0.8124598264694214,
"learning_rate": 8.299492666979113e-05,
"loss": 1.6906,
"step": 309
},
{
"epoch": 0.2851230167854679,
"grad_norm": 1.1598918437957764,
"learning_rate": 8.2884276098069e-05,
"loss": 1.7223,
"step": 310
},
{
"epoch": 0.2860427684525178,
"grad_norm": 1.1664563417434692,
"learning_rate": 8.277334098511147e-05,
"loss": 1.6548,
"step": 311
},
{
"epoch": 0.28696252011956774,
"grad_norm": 0.6637358069419861,
"learning_rate": 8.266212229081847e-05,
"loss": 1.6638,
"step": 312
},
{
"epoch": 0.2878822717866176,
"grad_norm": 0.987754225730896,
"learning_rate": 8.255062097754372e-05,
"loss": 1.7133,
"step": 313
},
{
"epoch": 0.2888020234536675,
"grad_norm": 0.7713818550109863,
"learning_rate": 8.243883801008632e-05,
"loss": 1.6705,
"step": 314
},
{
"epoch": 0.2897217751207174,
"grad_norm": 1.0500911474227905,
"learning_rate": 8.232677435568252e-05,
"loss": 1.5651,
"step": 315
},
{
"epoch": 0.2906415267877673,
"grad_norm": 0.7900861501693726,
"learning_rate": 8.221443098399732e-05,
"loss": 1.6276,
"step": 316
},
{
"epoch": 0.2915612784548172,
"grad_norm": 0.7363952994346619,
"learning_rate": 8.210180886711602e-05,
"loss": 1.5795,
"step": 317
},
{
"epoch": 0.2924810301218671,
"grad_norm": 0.895269513130188,
"learning_rate": 8.198890897953586e-05,
"loss": 1.6644,
"step": 318
},
{
"epoch": 0.293400781788917,
"grad_norm": 0.9014370441436768,
"learning_rate": 8.187573229815758e-05,
"loss": 1.619,
"step": 319
},
{
"epoch": 0.29432053345596687,
"grad_norm": 1.06600821018219,
"learning_rate": 8.176227980227694e-05,
"loss": 1.6779,
"step": 320
},
{
"epoch": 0.2952402851230168,
"grad_norm": 1.0690526962280273,
"learning_rate": 8.164855247357627e-05,
"loss": 1.553,
"step": 321
},
{
"epoch": 0.2961600367900667,
"grad_norm": 0.8835525512695312,
"learning_rate": 8.153455129611605e-05,
"loss": 1.614,
"step": 322
},
{
"epoch": 0.2970797884571166,
"grad_norm": 1.1458913087844849,
"learning_rate": 8.142027725632623e-05,
"loss": 1.6015,
"step": 323
},
{
"epoch": 0.29799954012416646,
"grad_norm": 0.6511287093162537,
"learning_rate": 8.130573134299782e-05,
"loss": 1.6129,
"step": 324
},
{
"epoch": 0.29891929179121635,
"grad_norm": 1.1985218524932861,
"learning_rate": 8.119091454727428e-05,
"loss": 1.564,
"step": 325
},
{
"epoch": 0.2998390434582663,
"grad_norm": 1.0999850034713745,
"learning_rate": 8.107582786264299e-05,
"loss": 1.6318,
"step": 326
},
{
"epoch": 0.30075879512531617,
"grad_norm": 0.664042055606842,
"learning_rate": 8.09604722849266e-05,
"loss": 1.6049,
"step": 327
},
{
"epoch": 0.30167854679236605,
"grad_norm": 0.9706513285636902,
"learning_rate": 8.084484881227448e-05,
"loss": 1.6157,
"step": 328
},
{
"epoch": 0.30259829845941594,
"grad_norm": 0.7374880909919739,
"learning_rate": 8.072895844515398e-05,
"loss": 1.573,
"step": 329
},
{
"epoch": 0.3035180501264659,
"grad_norm": 0.9631950855255127,
"learning_rate": 8.061280218634192e-05,
"loss": 1.5568,
"step": 330
},
{
"epoch": 0.30443780179351576,
"grad_norm": 0.9304092526435852,
"learning_rate": 8.049638104091575e-05,
"loss": 1.6135,
"step": 331
},
{
"epoch": 0.30535755346056564,
"grad_norm": 0.7095350027084351,
"learning_rate": 8.037969601624495e-05,
"loss": 1.5427,
"step": 332
},
{
"epoch": 0.30627730512761553,
"grad_norm": 1.130644679069519,
"learning_rate": 8.026274812198234e-05,
"loss": 1.5704,
"step": 333
},
{
"epoch": 0.3071970567946654,
"grad_norm": 0.6161345839500427,
"learning_rate": 8.014553837005527e-05,
"loss": 1.5705,
"step": 334
},
{
"epoch": 0.30811680846171535,
"grad_norm": 0.7174437046051025,
"learning_rate": 8.002806777465685e-05,
"loss": 1.599,
"step": 335
},
{
"epoch": 0.30903656012876524,
"grad_norm": 1.0651494264602661,
"learning_rate": 7.991033735223729e-05,
"loss": 1.538,
"step": 336
},
{
"epoch": 0.3099563117958151,
"grad_norm": 0.7327350974082947,
"learning_rate": 7.979234812149501e-05,
"loss": 1.4112,
"step": 337
},
{
"epoch": 0.310876063462865,
"grad_norm": 0.8603296279907227,
"learning_rate": 7.967410110336782e-05,
"loss": 1.4141,
"step": 338
},
{
"epoch": 0.31179581512991494,
"grad_norm": 0.7242352962493896,
"learning_rate": 7.955559732102414e-05,
"loss": 1.4316,
"step": 339
},
{
"epoch": 0.31271556679696483,
"grad_norm": 0.7651688456535339,
"learning_rate": 7.943683779985413e-05,
"loss": 1.5116,
"step": 340
},
{
"epoch": 0.3136353184640147,
"grad_norm": 0.6736311316490173,
"learning_rate": 7.931782356746076e-05,
"loss": 1.4454,
"step": 341
},
{
"epoch": 0.3145550701310646,
"grad_norm": 0.6474123597145081,
"learning_rate": 7.919855565365102e-05,
"loss": 1.4616,
"step": 342
},
{
"epoch": 0.31547482179811454,
"grad_norm": 0.6624403595924377,
"learning_rate": 7.907903509042696e-05,
"loss": 1.4973,
"step": 343
},
{
"epoch": 0.3163945734651644,
"grad_norm": 0.6722452640533447,
"learning_rate": 7.895926291197667e-05,
"loss": 1.4452,
"step": 344
},
{
"epoch": 0.3173143251322143,
"grad_norm": 0.8001620769500732,
"learning_rate": 7.883924015466553e-05,
"loss": 1.4532,
"step": 345
},
{
"epoch": 0.3182340767992642,
"grad_norm": 0.8588351011276245,
"learning_rate": 7.871896785702707e-05,
"loss": 1.4036,
"step": 346
},
{
"epoch": 0.31915382846631407,
"grad_norm": 0.8040063977241516,
"learning_rate": 7.859844705975404e-05,
"loss": 1.3815,
"step": 347
},
{
"epoch": 0.320073580133364,
"grad_norm": 1.0031120777130127,
"learning_rate": 7.847767880568945e-05,
"loss": 1.3611,
"step": 348
},
{
"epoch": 0.3209933318004139,
"grad_norm": 0.8174616098403931,
"learning_rate": 7.835666413981743e-05,
"loss": 1.2897,
"step": 349
},
{
"epoch": 0.3219130834674638,
"grad_norm": 1.1649737358093262,
"learning_rate": 7.823540410925435e-05,
"loss": 1.22,
"step": 350
},
{
"epoch": 0.32283283513451366,
"grad_norm": 2.4392778873443604,
"learning_rate": 7.811389976323961e-05,
"loss": 1.9789,
"step": 351
},
{
"epoch": 0.3237525868015636,
"grad_norm": 1.9123626947402954,
"learning_rate": 7.799215215312667e-05,
"loss": 1.817,
"step": 352
},
{
"epoch": 0.3246723384686135,
"grad_norm": 1.556714653968811,
"learning_rate": 7.787016233237387e-05,
"loss": 1.6248,
"step": 353
},
{
"epoch": 0.32559209013566337,
"grad_norm": 1.0949770212173462,
"learning_rate": 7.774793135653538e-05,
"loss": 1.6925,
"step": 354
},
{
"epoch": 0.32651184180271325,
"grad_norm": 1.0330501794815063,
"learning_rate": 7.7625460283252e-05,
"loss": 1.6667,
"step": 355
},
{
"epoch": 0.32743159346976314,
"grad_norm": 1.113447666168213,
"learning_rate": 7.750275017224207e-05,
"loss": 1.6345,
"step": 356
},
{
"epoch": 0.3283513451368131,
"grad_norm": 1.0157980918884277,
"learning_rate": 7.737980208529231e-05,
"loss": 1.6047,
"step": 357
},
{
"epoch": 0.32927109680386296,
"grad_norm": 0.8798123598098755,
"learning_rate": 7.725661708624853e-05,
"loss": 1.5993,
"step": 358
},
{
"epoch": 0.33019084847091285,
"grad_norm": 0.9784142374992371,
"learning_rate": 7.713319624100657e-05,
"loss": 1.578,
"step": 359
},
{
"epoch": 0.33111060013796273,
"grad_norm": 0.9105007648468018,
"learning_rate": 7.700954061750293e-05,
"loss": 1.6108,
"step": 360
},
{
"epoch": 0.33203035180501267,
"grad_norm": 0.9545553922653198,
"learning_rate": 7.688565128570564e-05,
"loss": 1.6134,
"step": 361
},
{
"epoch": 0.33295010347206255,
"grad_norm": 0.8679737448692322,
"learning_rate": 7.676152931760496e-05,
"loss": 1.5928,
"step": 362
},
{
"epoch": 0.33386985513911244,
"grad_norm": 0.6711000204086304,
"learning_rate": 7.663717578720411e-05,
"loss": 1.6628,
"step": 363
},
{
"epoch": 0.3347896068061623,
"grad_norm": 0.7280721068382263,
"learning_rate": 7.651259177050996e-05,
"loss": 1.6265,
"step": 364
},
{
"epoch": 0.33570935847321226,
"grad_norm": 1.0024129152297974,
"learning_rate": 7.63877783455237e-05,
"loss": 1.6356,
"step": 365
},
{
"epoch": 0.33662911014026214,
"grad_norm": 0.7483541369438171,
"learning_rate": 7.626273659223165e-05,
"loss": 1.5906,
"step": 366
},
{
"epoch": 0.33754886180731203,
"grad_norm": 0.811964750289917,
"learning_rate": 7.61374675925957e-05,
"loss": 1.5831,
"step": 367
},
{
"epoch": 0.3384686134743619,
"grad_norm": 0.9911743998527527,
"learning_rate": 7.60119724305441e-05,
"loss": 1.5819,
"step": 368
},
{
"epoch": 0.3393883651414118,
"grad_norm": 0.6445810794830322,
"learning_rate": 7.588625219196208e-05,
"loss": 1.5991,
"step": 369
},
{
"epoch": 0.34030811680846174,
"grad_norm": 0.8051655888557434,
"learning_rate": 7.576030796468233e-05,
"loss": 1.5491,
"step": 370
},
{
"epoch": 0.3412278684755116,
"grad_norm": 0.9976129531860352,
"learning_rate": 7.563414083847573e-05,
"loss": 1.5645,
"step": 371
},
{
"epoch": 0.3421476201425615,
"grad_norm": 0.7071700096130371,
"learning_rate": 7.550775190504189e-05,
"loss": 1.528,
"step": 372
},
{
"epoch": 0.3430673718096114,
"grad_norm": 0.7412607669830322,
"learning_rate": 7.538114225799954e-05,
"loss": 1.5505,
"step": 373
},
{
"epoch": 0.3439871234766613,
"grad_norm": 0.7667213082313538,
"learning_rate": 7.525431299287738e-05,
"loss": 1.525,
"step": 374
},
{
"epoch": 0.3449068751437112,
"grad_norm": 0.5956572890281677,
"learning_rate": 7.51272652071043e-05,
"loss": 1.5149,
"step": 375
},
{
"epoch": 0.3458266268107611,
"grad_norm": 0.797289252281189,
"learning_rate": 7.500000000000001e-05,
"loss": 1.5407,
"step": 376
},
{
"epoch": 0.346746378477811,
"grad_norm": 0.7374883890151978,
"learning_rate": 7.48725184727656e-05,
"loss": 1.5777,
"step": 377
},
{
"epoch": 0.34766613014486086,
"grad_norm": 0.7943119406700134,
"learning_rate": 7.47448217284739e-05,
"loss": 1.5795,
"step": 378
},
{
"epoch": 0.3485858818119108,
"grad_norm": 0.6397266387939453,
"learning_rate": 7.461691087205993e-05,
"loss": 1.5687,
"step": 379
},
{
"epoch": 0.3495056334789607,
"grad_norm": 0.7197580337524414,
"learning_rate": 7.448878701031142e-05,
"loss": 1.4994,
"step": 380
},
{
"epoch": 0.35042538514601057,
"grad_norm": 0.614570677280426,
"learning_rate": 7.436045125185922e-05,
"loss": 1.5185,
"step": 381
},
{
"epoch": 0.35134513681306045,
"grad_norm": 0.766139566898346,
"learning_rate": 7.423190470716761e-05,
"loss": 1.5445,
"step": 382
},
{
"epoch": 0.3522648884801104,
"grad_norm": 0.6843118667602539,
"learning_rate": 7.410314848852483e-05,
"loss": 1.4972,
"step": 383
},
{
"epoch": 0.3531846401471603,
"grad_norm": 0.6766433119773865,
"learning_rate": 7.397418371003333e-05,
"loss": 1.4285,
"step": 384
},
{
"epoch": 0.35410439181421016,
"grad_norm": 0.8003432154655457,
"learning_rate": 7.384501148760024e-05,
"loss": 1.5283,
"step": 385
},
{
"epoch": 0.35502414348126005,
"grad_norm": 0.8524566888809204,
"learning_rate": 7.371563293892761e-05,
"loss": 1.4922,
"step": 386
},
{
"epoch": 0.35594389514830993,
"grad_norm": 0.9243666529655457,
"learning_rate": 7.358604918350288e-05,
"loss": 1.4883,
"step": 387
},
{
"epoch": 0.35686364681535987,
"grad_norm": 0.7275565266609192,
"learning_rate": 7.345626134258898e-05,
"loss": 1.4268,
"step": 388
},
{
"epoch": 0.35778339848240975,
"grad_norm": 0.6936664581298828,
"learning_rate": 7.332627053921482e-05,
"loss": 1.3605,
"step": 389
},
{
"epoch": 0.35870315014945964,
"grad_norm": 0.7576991319656372,
"learning_rate": 7.319607789816555e-05,
"loss": 1.4222,
"step": 390
},
{
"epoch": 0.3596229018165095,
"grad_norm": 0.7377772331237793,
"learning_rate": 7.306568454597269e-05,
"loss": 1.4681,
"step": 391
},
{
"epoch": 0.36054265348355946,
"grad_norm": 0.8987662196159363,
"learning_rate": 7.293509161090452e-05,
"loss": 1.4066,
"step": 392
},
{
"epoch": 0.36146240515060934,
"grad_norm": 0.7513107061386108,
"learning_rate": 7.280430022295631e-05,
"loss": 1.4134,
"step": 393
},
{
"epoch": 0.36238215681765923,
"grad_norm": 0.6676529049873352,
"learning_rate": 7.267331151384039e-05,
"loss": 1.4374,
"step": 394
},
{
"epoch": 0.3633019084847091,
"grad_norm": 0.8300096988677979,
"learning_rate": 7.254212661697659e-05,
"loss": 1.3849,
"step": 395
},
{
"epoch": 0.36422166015175905,
"grad_norm": 0.8758336901664734,
"learning_rate": 7.241074666748227e-05,
"loss": 1.3774,
"step": 396
},
{
"epoch": 0.36514141181880894,
"grad_norm": 0.8264380693435669,
"learning_rate": 7.227917280216254e-05,
"loss": 1.3575,
"step": 397
},
{
"epoch": 0.3660611634858588,
"grad_norm": 1.014760136604309,
"learning_rate": 7.214740615950041e-05,
"loss": 1.3026,
"step": 398
},
{
"epoch": 0.3669809151529087,
"grad_norm": 0.8453448414802551,
"learning_rate": 7.201544787964698e-05,
"loss": 1.3114,
"step": 399
},
{
"epoch": 0.3679006668199586,
"grad_norm": 1.1275343894958496,
"learning_rate": 7.188329910441154e-05,
"loss": 1.1734,
"step": 400
},
{
"epoch": 0.36882041848700853,
"grad_norm": 2.2339935302734375,
"learning_rate": 7.17509609772517e-05,
"loss": 1.8776,
"step": 401
},
{
"epoch": 0.3697401701540584,
"grad_norm": 1.5469164848327637,
"learning_rate": 7.161843464326348e-05,
"loss": 1.6876,
"step": 402
},
{
"epoch": 0.3706599218211083,
"grad_norm": 1.2731298208236694,
"learning_rate": 7.148572124917148e-05,
"loss": 1.581,
"step": 403
},
{
"epoch": 0.3715796734881582,
"grad_norm": 0.9135886430740356,
"learning_rate": 7.13528219433188e-05,
"loss": 1.5912,
"step": 404
},
{
"epoch": 0.3724994251552081,
"grad_norm": 0.8309260606765747,
"learning_rate": 7.121973787565726e-05,
"loss": 1.5825,
"step": 405
},
{
"epoch": 0.373419176822258,
"grad_norm": 0.8344767093658447,
"learning_rate": 7.10864701977374e-05,
"loss": 1.5724,
"step": 406
},
{
"epoch": 0.3743389284893079,
"grad_norm": 0.8113982081413269,
"learning_rate": 7.095302006269842e-05,
"loss": 1.5899,
"step": 407
},
{
"epoch": 0.37525868015635777,
"grad_norm": 0.8019097447395325,
"learning_rate": 7.081938862525839e-05,
"loss": 1.6347,
"step": 408
},
{
"epoch": 0.37617843182340766,
"grad_norm": 0.7903069257736206,
"learning_rate": 7.06855770417041e-05,
"loss": 1.5924,
"step": 409
},
{
"epoch": 0.3770981834904576,
"grad_norm": 0.7817911505699158,
"learning_rate": 7.055158646988109e-05,
"loss": 1.5705,
"step": 410
},
{
"epoch": 0.3780179351575075,
"grad_norm": 0.7876037359237671,
"learning_rate": 7.041741806918371e-05,
"loss": 1.553,
"step": 411
},
{
"epoch": 0.37893768682455736,
"grad_norm": 0.8235687017440796,
"learning_rate": 7.028307300054499e-05,
"loss": 1.5954,
"step": 412
},
{
"epoch": 0.37985743849160725,
"grad_norm": 0.6427410244941711,
"learning_rate": 7.014855242642662e-05,
"loss": 1.5935,
"step": 413
},
{
"epoch": 0.3807771901586572,
"grad_norm": 0.6327434182167053,
"learning_rate": 7.001385751080894e-05,
"loss": 1.5992,
"step": 414
},
{
"epoch": 0.38169694182570707,
"grad_norm": 0.705020010471344,
"learning_rate": 6.987898941918082e-05,
"loss": 1.5326,
"step": 415
},
{
"epoch": 0.38261669349275695,
"grad_norm": 0.6907270550727844,
"learning_rate": 6.974394931852956e-05,
"loss": 1.543,
"step": 416
},
{
"epoch": 0.38353644515980684,
"grad_norm": 0.6643316745758057,
"learning_rate": 6.960873837733088e-05,
"loss": 1.501,
"step": 417
},
{
"epoch": 0.3844561968268567,
"grad_norm": 0.6536545753479004,
"learning_rate": 6.94733577655387e-05,
"loss": 1.5498,
"step": 418
},
{
"epoch": 0.38537594849390666,
"grad_norm": 0.7011268138885498,
"learning_rate": 6.933780865457508e-05,
"loss": 1.6318,
"step": 419
},
{
"epoch": 0.38629570016095655,
"grad_norm": 0.6373593211174011,
"learning_rate": 6.920209221732006e-05,
"loss": 1.5523,
"step": 420
},
{
"epoch": 0.38721545182800643,
"grad_norm": 0.5898979902267456,
"learning_rate": 6.90662096281016e-05,
"loss": 1.5695,
"step": 421
},
{
"epoch": 0.3881352034950563,
"grad_norm": 0.6590458750724792,
"learning_rate": 6.893016206268518e-05,
"loss": 1.4721,
"step": 422
},
{
"epoch": 0.38905495516210625,
"grad_norm": 0.6448785662651062,
"learning_rate": 6.879395069826393e-05,
"loss": 1.5485,
"step": 423
},
{
"epoch": 0.38997470682915614,
"grad_norm": 0.648471474647522,
"learning_rate": 6.865757671344827e-05,
"loss": 1.5469,
"step": 424
},
{
"epoch": 0.390894458496206,
"grad_norm": 0.8980266451835632,
"learning_rate": 6.85210412882557e-05,
"loss": 1.5831,
"step": 425
},
{
"epoch": 0.3918142101632559,
"grad_norm": 0.6711221933364868,
"learning_rate": 6.838434560410064e-05,
"loss": 1.4341,
"step": 426
},
{
"epoch": 0.39273396183030584,
"grad_norm": 0.8187699317932129,
"learning_rate": 6.824749084378428e-05,
"loss": 1.4696,
"step": 427
},
{
"epoch": 0.39365371349735573,
"grad_norm": 0.8267800807952881,
"learning_rate": 6.811047819148413e-05,
"loss": 1.5041,
"step": 428
},
{
"epoch": 0.3945734651644056,
"grad_norm": 0.764512300491333,
"learning_rate": 6.797330883274403e-05,
"loss": 1.4774,
"step": 429
},
{
"epoch": 0.3954932168314555,
"grad_norm": 0.8012046813964844,
"learning_rate": 6.783598395446371e-05,
"loss": 1.4947,
"step": 430
},
{
"epoch": 0.3964129684985054,
"grad_norm": 0.5986045598983765,
"learning_rate": 6.769850474488859e-05,
"loss": 1.5161,
"step": 431
},
{
"epoch": 0.3973327201655553,
"grad_norm": 0.8222801685333252,
"learning_rate": 6.756087239359947e-05,
"loss": 1.4726,
"step": 432
},
{
"epoch": 0.3982524718326052,
"grad_norm": 0.6513310670852661,
"learning_rate": 6.742308809150232e-05,
"loss": 1.4894,
"step": 433
},
{
"epoch": 0.3991722234996551,
"grad_norm": 0.6340191960334778,
"learning_rate": 6.728515303081781e-05,
"loss": 1.4616,
"step": 434
},
{
"epoch": 0.40009197516670497,
"grad_norm": 0.8488625288009644,
"learning_rate": 6.714706840507121e-05,
"loss": 1.4096,
"step": 435
},
{
"epoch": 0.4010117268337549,
"grad_norm": 0.6022557020187378,
"learning_rate": 6.700883540908184e-05,
"loss": 1.4149,
"step": 436
},
{
"epoch": 0.4019314785008048,
"grad_norm": 0.7043591141700745,
"learning_rate": 6.687045523895293e-05,
"loss": 1.492,
"step": 437
},
{
"epoch": 0.4028512301678547,
"grad_norm": 0.8003234267234802,
"learning_rate": 6.673192909206108e-05,
"loss": 1.3878,
"step": 438
},
{
"epoch": 0.40377098183490456,
"grad_norm": 0.6873340010643005,
"learning_rate": 6.659325816704611e-05,
"loss": 1.4326,
"step": 439
},
{
"epoch": 0.40469073350195445,
"grad_norm": 0.673957884311676,
"learning_rate": 6.64544436638005e-05,
"loss": 1.4086,
"step": 440
},
{
"epoch": 0.4056104851690044,
"grad_norm": 0.7485764026641846,
"learning_rate": 6.63154867834591e-05,
"loss": 1.3967,
"step": 441
},
{
"epoch": 0.40653023683605427,
"grad_norm": 0.6807146072387695,
"learning_rate": 6.617638872838874e-05,
"loss": 1.3429,
"step": 442
},
{
"epoch": 0.40744998850310415,
"grad_norm": 0.6480006575584412,
"learning_rate": 6.603715070217778e-05,
"loss": 1.3968,
"step": 443
},
{
"epoch": 0.40836974017015404,
"grad_norm": 0.7995392084121704,
"learning_rate": 6.589777390962575e-05,
"loss": 1.4309,
"step": 444
},
{
"epoch": 0.409289491837204,
"grad_norm": 0.7234594821929932,
"learning_rate": 6.57582595567329e-05,
"loss": 1.2972,
"step": 445
},
{
"epoch": 0.41020924350425386,
"grad_norm": 0.9040266871452332,
"learning_rate": 6.561860885068972e-05,
"loss": 1.3339,
"step": 446
},
{
"epoch": 0.41112899517130375,
"grad_norm": 0.8719410300254822,
"learning_rate": 6.547882299986658e-05,
"loss": 1.2914,
"step": 447
},
{
"epoch": 0.41204874683835363,
"grad_norm": 0.964036226272583,
"learning_rate": 6.533890321380319e-05,
"loss": 1.2348,
"step": 448
},
{
"epoch": 0.4129684985054035,
"grad_norm": 1.0289238691329956,
"learning_rate": 6.519885070319827e-05,
"loss": 1.1747,
"step": 449
},
{
"epoch": 0.41388825017245345,
"grad_norm": 1.0722767114639282,
"learning_rate": 6.505866667989884e-05,
"loss": 1.1749,
"step": 450
},
{
"epoch": 0.41388825017245345,
"eval_loss": 1.5185648202896118,
"eval_runtime": 49.961,
"eval_samples_per_second": 164.929,
"eval_steps_per_second": 20.616,
"step": 450
},
{
"epoch": 0.41480800183950334,
"grad_norm": 2.0002212524414062,
"learning_rate": 6.491835235689e-05,
"loss": 1.8527,
"step": 451
},
{
"epoch": 0.4157277535065532,
"grad_norm": 1.7632036209106445,
"learning_rate": 6.477790894828421e-05,
"loss": 1.6736,
"step": 452
},
{
"epoch": 0.4166475051736031,
"grad_norm": 1.2842786312103271,
"learning_rate": 6.463733766931095e-05,
"loss": 1.6531,
"step": 453
},
{
"epoch": 0.41756725684065304,
"grad_norm": 0.9530149698257446,
"learning_rate": 6.449663973630613e-05,
"loss": 1.5728,
"step": 454
},
{
"epoch": 0.41848700850770293,
"grad_norm": 0.9490489363670349,
"learning_rate": 6.435581636670154e-05,
"loss": 1.458,
"step": 455
},
{
"epoch": 0.4194067601747528,
"grad_norm": 0.9226535558700562,
"learning_rate": 6.421486877901437e-05,
"loss": 1.477,
"step": 456
},
{
"epoch": 0.4203265118418027,
"grad_norm": 0.7617946267127991,
"learning_rate": 6.407379819283661e-05,
"loss": 1.4929,
"step": 457
},
{
"epoch": 0.42124626350885264,
"grad_norm": 0.7731391787528992,
"learning_rate": 6.39326058288246e-05,
"loss": 1.5828,
"step": 458
},
{
"epoch": 0.4221660151759025,
"grad_norm": 0.8461527824401855,
"learning_rate": 6.379129290868837e-05,
"loss": 1.558,
"step": 459
},
{
"epoch": 0.4230857668429524,
"grad_norm": 0.8030949234962463,
"learning_rate": 6.364986065518106e-05,
"loss": 1.5026,
"step": 460
},
{
"epoch": 0.4240055185100023,
"grad_norm": 0.9712105989456177,
"learning_rate": 6.350831029208844e-05,
"loss": 1.5603,
"step": 461
},
{
"epoch": 0.4249252701770522,
"grad_norm": 0.936730146408081,
"learning_rate": 6.336664304421818e-05,
"loss": 1.5037,
"step": 462
},
{
"epoch": 0.4258450218441021,
"grad_norm": 0.6644638776779175,
"learning_rate": 6.322486013738942e-05,
"loss": 1.5632,
"step": 463
},
{
"epoch": 0.426764773511152,
"grad_norm": 0.8889780044555664,
"learning_rate": 6.308296279842205e-05,
"loss": 1.5392,
"step": 464
},
{
"epoch": 0.4276845251782019,
"grad_norm": 0.771960973739624,
"learning_rate": 6.294095225512603e-05,
"loss": 1.5013,
"step": 465
},
{
"epoch": 0.42860427684525176,
"grad_norm": 0.7682729363441467,
"learning_rate": 6.2798829736291e-05,
"loss": 1.4829,
"step": 466
},
{
"epoch": 0.4295240285123017,
"grad_norm": 0.9224911332130432,
"learning_rate": 6.265659647167543e-05,
"loss": 1.5283,
"step": 467
},
{
"epoch": 0.4304437801793516,
"grad_norm": 0.7462615370750427,
"learning_rate": 6.251425369199599e-05,
"loss": 1.4762,
"step": 468
},
{
"epoch": 0.43136353184640147,
"grad_norm": 0.7566426396369934,
"learning_rate": 6.237180262891708e-05,
"loss": 1.5537,
"step": 469
},
{
"epoch": 0.43228328351345136,
"grad_norm": 0.7278396487236023,
"learning_rate": 6.222924451504001e-05,
"loss": 1.4805,
"step": 470
},
{
"epoch": 0.43320303518050124,
"grad_norm": 0.6063376069068909,
"learning_rate": 6.208658058389231e-05,
"loss": 1.5403,
"step": 471
},
{
"epoch": 0.4341227868475512,
"grad_norm": 0.7265048623085022,
"learning_rate": 6.194381206991722e-05,
"loss": 1.5131,
"step": 472
},
{
"epoch": 0.43504253851460106,
"grad_norm": 0.6536186933517456,
"learning_rate": 6.180094020846291e-05,
"loss": 1.4777,
"step": 473
},
{
"epoch": 0.43596229018165095,
"grad_norm": 0.6153502464294434,
"learning_rate": 6.165796623577171e-05,
"loss": 1.4592,
"step": 474
},
{
"epoch": 0.43688204184870083,
"grad_norm": 0.7638461589813232,
"learning_rate": 6.15148913889696e-05,
"loss": 1.5779,
"step": 475
},
{
"epoch": 0.43780179351575077,
"grad_norm": 0.755756139755249,
"learning_rate": 6.137171690605533e-05,
"loss": 1.5246,
"step": 476
},
{
"epoch": 0.43872154518280065,
"grad_norm": 0.5608311295509338,
"learning_rate": 6.122844402588982e-05,
"loss": 1.4824,
"step": 477
},
{
"epoch": 0.43964129684985054,
"grad_norm": 0.7992551922798157,
"learning_rate": 6.10850739881854e-05,
"loss": 1.4434,
"step": 478
},
{
"epoch": 0.4405610485169004,
"grad_norm": 0.6986256241798401,
"learning_rate": 6.094160803349508e-05,
"loss": 1.4313,
"step": 479
},
{
"epoch": 0.4414808001839503,
"grad_norm": 0.6461309790611267,
"learning_rate": 6.079804740320181e-05,
"loss": 1.4743,
"step": 480
},
{
"epoch": 0.44240055185100025,
"grad_norm": 0.7250984311103821,
"learning_rate": 6.0654393339507753e-05,
"loss": 1.4551,
"step": 481
},
{
"epoch": 0.44332030351805013,
"grad_norm": 0.6796169281005859,
"learning_rate": 6.051064708542357e-05,
"loss": 1.485,
"step": 482
},
{
"epoch": 0.4442400551851,
"grad_norm": 0.7773648500442505,
"learning_rate": 6.0366809884757556e-05,
"loss": 1.4153,
"step": 483
},
{
"epoch": 0.4451598068521499,
"grad_norm": 0.9285596609115601,
"learning_rate": 6.022288298210501e-05,
"loss": 1.4624,
"step": 484
},
{
"epoch": 0.44607955851919984,
"grad_norm": 0.7707833051681519,
"learning_rate": 6.0078867622837395e-05,
"loss": 1.431,
"step": 485
},
{
"epoch": 0.4469993101862497,
"grad_norm": 0.9251638650894165,
"learning_rate": 5.993476505309155e-05,
"loss": 1.406,
"step": 486
},
{
"epoch": 0.4479190618532996,
"grad_norm": 0.7242058515548706,
"learning_rate": 5.979057651975892e-05,
"loss": 1.3418,
"step": 487
},
{
"epoch": 0.4488388135203495,
"grad_norm": 0.6925553679466248,
"learning_rate": 5.9646303270474845e-05,
"loss": 1.3463,
"step": 488
},
{
"epoch": 0.44975856518739943,
"grad_norm": 0.779308021068573,
"learning_rate": 5.9501946553607615e-05,
"loss": 1.3228,
"step": 489
},
{
"epoch": 0.4506783168544493,
"grad_norm": 0.750455379486084,
"learning_rate": 5.9357507618247764e-05,
"loss": 1.3406,
"step": 490
},
{
"epoch": 0.4515980685214992,
"grad_norm": 0.7992476224899292,
"learning_rate": 5.921298771419731e-05,
"loss": 1.375,
"step": 491
},
{
"epoch": 0.4525178201885491,
"grad_norm": 0.7606462240219116,
"learning_rate": 5.9068388091958795e-05,
"loss": 1.3066,
"step": 492
},
{
"epoch": 0.45343757185559896,
"grad_norm": 0.651400625705719,
"learning_rate": 5.8923710002724594e-05,
"loss": 1.3312,
"step": 493
},
{
"epoch": 0.4543573235226489,
"grad_norm": 0.7911424040794373,
"learning_rate": 5.877895469836604e-05,
"loss": 1.3228,
"step": 494
},
{
"epoch": 0.4552770751896988,
"grad_norm": 0.8071415424346924,
"learning_rate": 5.863412343142258e-05,
"loss": 1.3149,
"step": 495
},
{
"epoch": 0.45619682685674867,
"grad_norm": 1.001132845878601,
"learning_rate": 5.848921745509094e-05,
"loss": 1.2951,
"step": 496
},
{
"epoch": 0.45711657852379856,
"grad_norm": 0.9951808452606201,
"learning_rate": 5.834423802321431e-05,
"loss": 1.2331,
"step": 497
},
{
"epoch": 0.4580363301908485,
"grad_norm": 0.9824991822242737,
"learning_rate": 5.8199186390271486e-05,
"loss": 1.2146,
"step": 498
},
{
"epoch": 0.4589560818578984,
"grad_norm": 1.3014886379241943,
"learning_rate": 5.805406381136598e-05,
"loss": 1.2247,
"step": 499
},
{
"epoch": 0.45987583352494826,
"grad_norm": 1.4302425384521484,
"learning_rate": 5.79088715422152e-05,
"loss": 1.047,
"step": 500
},
{
"epoch": 0.46079558519199815,
"grad_norm": 1.9563382863998413,
"learning_rate": 5.7763610839139594e-05,
"loss": 1.6971,
"step": 501
},
{
"epoch": 0.46171533685904803,
"grad_norm": 1.5344587564468384,
"learning_rate": 5.761828295905169e-05,
"loss": 1.6824,
"step": 502
},
{
"epoch": 0.46263508852609797,
"grad_norm": 1.1466830968856812,
"learning_rate": 5.747288915944533e-05,
"loss": 1.5384,
"step": 503
},
{
"epoch": 0.46355484019314785,
"grad_norm": 1.1582822799682617,
"learning_rate": 5.7327430698384775e-05,
"loss": 1.6326,
"step": 504
},
{
"epoch": 0.46447459186019774,
"grad_norm": 1.1693201065063477,
"learning_rate": 5.7181908834493726e-05,
"loss": 1.5041,
"step": 505
},
{
"epoch": 0.4653943435272476,
"grad_norm": 0.9729719758033752,
"learning_rate": 5.703632482694453e-05,
"loss": 1.5669,
"step": 506
},
{
"epoch": 0.46631409519429756,
"grad_norm": 0.9684829115867615,
"learning_rate": 5.689067993544725e-05,
"loss": 1.5907,
"step": 507
},
{
"epoch": 0.46723384686134745,
"grad_norm": 0.8785848021507263,
"learning_rate": 5.6744975420238745e-05,
"loss": 1.4962,
"step": 508
},
{
"epoch": 0.46815359852839733,
"grad_norm": 0.7249252796173096,
"learning_rate": 5.6599212542071824e-05,
"loss": 1.5372,
"step": 509
},
{
"epoch": 0.4690733501954472,
"grad_norm": 0.9696371555328369,
"learning_rate": 5.645339256220426e-05,
"loss": 1.4834,
"step": 510
},
{
"epoch": 0.46999310186249715,
"grad_norm": 0.9309729933738708,
"learning_rate": 5.6307516742387955e-05,
"loss": 1.6006,
"step": 511
},
{
"epoch": 0.47091285352954704,
"grad_norm": 0.8194191455841064,
"learning_rate": 5.616158634485793e-05,
"loss": 1.5423,
"step": 512
},
{
"epoch": 0.4718326051965969,
"grad_norm": 0.8985216617584229,
"learning_rate": 5.601560263232153e-05,
"loss": 1.4869,
"step": 513
},
{
"epoch": 0.4727523568636468,
"grad_norm": 0.8546054363250732,
"learning_rate": 5.586956686794734e-05,
"loss": 1.5534,
"step": 514
},
{
"epoch": 0.4736721085306967,
"grad_norm": 0.7134532332420349,
"learning_rate": 5.572348031535441e-05,
"loss": 1.465,
"step": 515
},
{
"epoch": 0.47459186019774663,
"grad_norm": 0.6382752656936646,
"learning_rate": 5.557734423860123e-05,
"loss": 1.4897,
"step": 516
},
{
"epoch": 0.4755116118647965,
"grad_norm": 0.8380042314529419,
"learning_rate": 5.543115990217478e-05,
"loss": 1.4646,
"step": 517
},
{
"epoch": 0.4764313635318464,
"grad_norm": 0.8848815560340881,
"learning_rate": 5.528492857097966e-05,
"loss": 1.4903,
"step": 518
},
{
"epoch": 0.4773511151988963,
"grad_norm": 0.6244109272956848,
"learning_rate": 5.5138651510327085e-05,
"loss": 1.5031,
"step": 519
},
{
"epoch": 0.4782708668659462,
"grad_norm": 0.8367244601249695,
"learning_rate": 5.499232998592399e-05,
"loss": 1.4978,
"step": 520
},
{
"epoch": 0.4791906185329961,
"grad_norm": 0.7362543344497681,
"learning_rate": 5.484596526386198e-05,
"loss": 1.529,
"step": 521
},
{
"epoch": 0.480110370200046,
"grad_norm": 0.579655647277832,
"learning_rate": 5.469955861060653e-05,
"loss": 1.4446,
"step": 522
},
{
"epoch": 0.4810301218670959,
"grad_norm": 0.7875382304191589,
"learning_rate": 5.455311129298586e-05,
"loss": 1.505,
"step": 523
},
{
"epoch": 0.48194987353414576,
"grad_norm": 0.7048112154006958,
"learning_rate": 5.4406624578180096e-05,
"loss": 1.4612,
"step": 524
},
{
"epoch": 0.4828696252011957,
"grad_norm": 0.6148046255111694,
"learning_rate": 5.4260099733710255e-05,
"loss": 1.4871,
"step": 525
},
{
"epoch": 0.4837893768682456,
"grad_norm": 0.7813459038734436,
"learning_rate": 5.4113538027427245e-05,
"loss": 1.431,
"step": 526
},
{
"epoch": 0.48470912853529546,
"grad_norm": 0.6388234496116638,
"learning_rate": 5.396694072750099e-05,
"loss": 1.4811,
"step": 527
},
{
"epoch": 0.48562888020234535,
"grad_norm": 0.5977755784988403,
"learning_rate": 5.382030910240936e-05,
"loss": 1.4302,
"step": 528
},
{
"epoch": 0.4865486318693953,
"grad_norm": 0.6440762281417847,
"learning_rate": 5.367364442092724e-05,
"loss": 1.4468,
"step": 529
},
{
"epoch": 0.48746838353644517,
"grad_norm": 0.68966144323349,
"learning_rate": 5.352694795211555e-05,
"loss": 1.4563,
"step": 530
},
{
"epoch": 0.48838813520349506,
"grad_norm": 0.682101845741272,
"learning_rate": 5.338022096531028e-05,
"loss": 1.4953,
"step": 531
},
{
"epoch": 0.48930788687054494,
"grad_norm": 0.5871472954750061,
"learning_rate": 5.3233464730111426e-05,
"loss": 1.4285,
"step": 532
},
{
"epoch": 0.4902276385375948,
"grad_norm": 0.60948246717453,
"learning_rate": 5.308668051637212e-05,
"loss": 1.4083,
"step": 533
},
{
"epoch": 0.49114739020464476,
"grad_norm": 0.7118504047393799,
"learning_rate": 5.2939869594187595e-05,
"loss": 1.4257,
"step": 534
},
{
"epoch": 0.49206714187169465,
"grad_norm": 0.6763386726379395,
"learning_rate": 5.2793033233884124e-05,
"loss": 1.3886,
"step": 535
},
{
"epoch": 0.49298689353874453,
"grad_norm": 0.6314605474472046,
"learning_rate": 5.2646172706008156e-05,
"loss": 1.3105,
"step": 536
},
{
"epoch": 0.4939066452057944,
"grad_norm": 0.7385772466659546,
"learning_rate": 5.249928928131523e-05,
"loss": 1.3189,
"step": 537
},
{
"epoch": 0.49482639687284435,
"grad_norm": 0.6615415811538696,
"learning_rate": 5.235238423075899e-05,
"loss": 1.3235,
"step": 538
},
{
"epoch": 0.49574614853989424,
"grad_norm": 0.6805823445320129,
"learning_rate": 5.220545882548023e-05,
"loss": 1.3938,
"step": 539
},
{
"epoch": 0.4966659002069441,
"grad_norm": 0.8164578676223755,
"learning_rate": 5.205851433679589e-05,
"loss": 1.329,
"step": 540
},
{
"epoch": 0.497585651873994,
"grad_norm": 0.7139110565185547,
"learning_rate": 5.191155203618796e-05,
"loss": 1.2914,
"step": 541
},
{
"epoch": 0.49850540354104395,
"grad_norm": 0.6411809921264648,
"learning_rate": 5.176457319529263e-05,
"loss": 1.3289,
"step": 542
},
{
"epoch": 0.49942515520809383,
"grad_norm": 0.639995813369751,
"learning_rate": 5.161757908588917e-05,
"loss": 1.2874,
"step": 543
},
{
"epoch": 0.5003449068751437,
"grad_norm": 0.6557344794273376,
"learning_rate": 5.1470570979888973e-05,
"loss": 1.3043,
"step": 544
},
{
"epoch": 0.5012646585421936,
"grad_norm": 0.7925935387611389,
"learning_rate": 5.132355014932455e-05,
"loss": 1.2978,
"step": 545
},
{
"epoch": 0.5021844102092435,
"grad_norm": 0.7339189052581787,
"learning_rate": 5.117651786633849e-05,
"loss": 1.2996,
"step": 546
},
{
"epoch": 0.5031041618762934,
"grad_norm": 0.805228054523468,
"learning_rate": 5.102947540317253e-05,
"loss": 1.2458,
"step": 547
},
{
"epoch": 0.5040239135433433,
"grad_norm": 0.7840575575828552,
"learning_rate": 5.088242403215644e-05,
"loss": 1.253,
"step": 548
},
{
"epoch": 0.5049436652103932,
"grad_norm": 1.0337337255477905,
"learning_rate": 5.073536502569708e-05,
"loss": 1.1262,
"step": 549
},
{
"epoch": 0.5058634168774431,
"grad_norm": 1.2608665227890015,
"learning_rate": 5.0588299656267414e-05,
"loss": 1.022,
"step": 550
},
{
"epoch": 0.506783168544493,
"grad_norm": 1.6019068956375122,
"learning_rate": 5.044122919639541e-05,
"loss": 1.6294,
"step": 551
},
{
"epoch": 0.5077029202115428,
"grad_norm": 1.4624245166778564,
"learning_rate": 5.029415491865311e-05,
"loss": 1.6211,
"step": 552
},
{
"epoch": 0.5086226718785928,
"grad_norm": 1.249880075454712,
"learning_rate": 5.014707809564562e-05,
"loss": 1.5335,
"step": 553
},
{
"epoch": 0.5095424235456427,
"grad_norm": 1.1160420179367065,
"learning_rate": 5e-05,
"loss": 1.5818,
"step": 554
},
{
"epoch": 0.5104621752126925,
"grad_norm": 0.9601331353187561,
"learning_rate": 4.98529219043544e-05,
"loss": 1.5011,
"step": 555
},
{
"epoch": 0.5113819268797425,
"grad_norm": 0.9078472852706909,
"learning_rate": 4.9705845081346894e-05,
"loss": 1.4804,
"step": 556
},
{
"epoch": 0.5123016785467923,
"grad_norm": 1.0430097579956055,
"learning_rate": 4.9558770803604614e-05,
"loss": 1.5421,
"step": 557
},
{
"epoch": 0.5132214302138423,
"grad_norm": 0.9206668138504028,
"learning_rate": 4.94117003437326e-05,
"loss": 1.5167,
"step": 558
},
{
"epoch": 0.5141411818808922,
"grad_norm": 0.7888804078102112,
"learning_rate": 4.926463497430293e-05,
"loss": 1.4761,
"step": 559
},
{
"epoch": 0.515060933547942,
"grad_norm": 0.7101994752883911,
"learning_rate": 4.911757596784357e-05,
"loss": 1.4642,
"step": 560
},
{
"epoch": 0.515980685214992,
"grad_norm": 0.8613134026527405,
"learning_rate": 4.8970524596827486e-05,
"loss": 1.5374,
"step": 561
},
{
"epoch": 0.5169004368820419,
"grad_norm": 0.7729939222335815,
"learning_rate": 4.8823482133661516e-05,
"loss": 1.4959,
"step": 562
},
{
"epoch": 0.5178201885490917,
"grad_norm": 0.9063132405281067,
"learning_rate": 4.8676449850675475e-05,
"loss": 1.5057,
"step": 563
},
{
"epoch": 0.5187399402161417,
"grad_norm": 0.9306026697158813,
"learning_rate": 4.852942902011103e-05,
"loss": 1.5544,
"step": 564
},
{
"epoch": 0.5196596918831915,
"grad_norm": 0.763334333896637,
"learning_rate": 4.838242091411084e-05,
"loss": 1.4385,
"step": 565
},
{
"epoch": 0.5205794435502414,
"grad_norm": 0.7051974534988403,
"learning_rate": 4.823542680470738e-05,
"loss": 1.4612,
"step": 566
},
{
"epoch": 0.5214991952172914,
"grad_norm": 0.7262412905693054,
"learning_rate": 4.808844796381205e-05,
"loss": 1.4366,
"step": 567
},
{
"epoch": 0.5224189468843412,
"grad_norm": 0.7530311346054077,
"learning_rate": 4.7941485663204125e-05,
"loss": 1.4883,
"step": 568
},
{
"epoch": 0.5233386985513911,
"grad_norm": 0.653555691242218,
"learning_rate": 4.779454117451977e-05,
"loss": 1.3767,
"step": 569
},
{
"epoch": 0.524258450218441,
"grad_norm": 0.7212573289871216,
"learning_rate": 4.7647615769241e-05,
"loss": 1.3811,
"step": 570
},
{
"epoch": 0.5251782018854909,
"grad_norm": 0.7534743547439575,
"learning_rate": 4.750071071868478e-05,
"loss": 1.4899,
"step": 571
},
{
"epoch": 0.5260979535525409,
"grad_norm": 0.6205776333808899,
"learning_rate": 4.735382729399184e-05,
"loss": 1.4294,
"step": 572
},
{
"epoch": 0.5270177052195907,
"grad_norm": 0.6632286906242371,
"learning_rate": 4.720696676611589e-05,
"loss": 1.4939,
"step": 573
},
{
"epoch": 0.5279374568866406,
"grad_norm": 0.7253984808921814,
"learning_rate": 4.706013040581242e-05,
"loss": 1.4342,
"step": 574
},
{
"epoch": 0.5288572085536904,
"grad_norm": 0.7158737778663635,
"learning_rate": 4.691331948362789e-05,
"loss": 1.4718,
"step": 575
},
{
"epoch": 0.5297769602207404,
"grad_norm": 0.6117165088653564,
"learning_rate": 4.676653526988858e-05,
"loss": 1.4828,
"step": 576
},
{
"epoch": 0.5306967118877903,
"grad_norm": 0.6031986474990845,
"learning_rate": 4.661977903468974e-05,
"loss": 1.4493,
"step": 577
},
{
"epoch": 0.5316164635548402,
"grad_norm": 0.6613805890083313,
"learning_rate": 4.647305204788445e-05,
"loss": 1.4419,
"step": 578
},
{
"epoch": 0.5325362152218901,
"grad_norm": 0.6349487900733948,
"learning_rate": 4.632635557907277e-05,
"loss": 1.4213,
"step": 579
},
{
"epoch": 0.53345596688894,
"grad_norm": 0.5844326019287109,
"learning_rate": 4.617969089759066e-05,
"loss": 1.4505,
"step": 580
},
{
"epoch": 0.5343757185559899,
"grad_norm": 0.7105299234390259,
"learning_rate": 4.603305927249902e-05,
"loss": 1.3974,
"step": 581
},
{
"epoch": 0.5352954702230398,
"grad_norm": 0.7277695536613464,
"learning_rate": 4.588646197257277e-05,
"loss": 1.371,
"step": 582
},
{
"epoch": 0.5362152218900896,
"grad_norm": 0.6246547698974609,
"learning_rate": 4.5739900266289756e-05,
"loss": 1.3747,
"step": 583
},
{
"epoch": 0.5371349735571396,
"grad_norm": 0.918038547039032,
"learning_rate": 4.559337542181993e-05,
"loss": 1.3068,
"step": 584
},
{
"epoch": 0.5380547252241895,
"grad_norm": 0.7304350733757019,
"learning_rate": 4.544688870701415e-05,
"loss": 1.3496,
"step": 585
},
{
"epoch": 0.5389744768912393,
"grad_norm": 0.6852339506149292,
"learning_rate": 4.53004413893935e-05,
"loss": 1.3327,
"step": 586
},
{
"epoch": 0.5398942285582893,
"grad_norm": 0.7337968349456787,
"learning_rate": 4.515403473613803e-05,
"loss": 1.3756,
"step": 587
},
{
"epoch": 0.5408139802253391,
"grad_norm": 0.7710087895393372,
"learning_rate": 4.5007670014076045e-05,
"loss": 1.3611,
"step": 588
},
{
"epoch": 0.541733731892389,
"grad_norm": 0.6107405424118042,
"learning_rate": 4.486134848967292e-05,
"loss": 1.312,
"step": 589
},
{
"epoch": 0.542653483559439,
"grad_norm": 0.7013472318649292,
"learning_rate": 4.471507142902036e-05,
"loss": 1.3194,
"step": 590
},
{
"epoch": 0.5435732352264888,
"grad_norm": 0.8323330283164978,
"learning_rate": 4.4568840097825226e-05,
"loss": 1.2888,
"step": 591
},
{
"epoch": 0.5444929868935388,
"grad_norm": 0.6520772576332092,
"learning_rate": 4.442265576139878e-05,
"loss": 1.2347,
"step": 592
},
{
"epoch": 0.5454127385605887,
"grad_norm": 0.7573135495185852,
"learning_rate": 4.4276519684645585e-05,
"loss": 1.316,
"step": 593
},
{
"epoch": 0.5463324902276385,
"grad_norm": 0.7183561325073242,
"learning_rate": 4.4130433132052664e-05,
"loss": 1.2999,
"step": 594
},
{
"epoch": 0.5472522418946885,
"grad_norm": 0.8150544762611389,
"learning_rate": 4.398439736767847e-05,
"loss": 1.2111,
"step": 595
},
{
"epoch": 0.5481719935617383,
"grad_norm": 0.8062061071395874,
"learning_rate": 4.383841365514208e-05,
"loss": 1.2231,
"step": 596
},
{
"epoch": 0.5490917452287882,
"grad_norm": 0.8277079463005066,
"learning_rate": 4.369248325761205e-05,
"loss": 1.2266,
"step": 597
},
{
"epoch": 0.5500114968958382,
"grad_norm": 1.1290823221206665,
"learning_rate": 4.354660743779574e-05,
"loss": 1.1825,
"step": 598
},
{
"epoch": 0.550931248562888,
"grad_norm": 1.0019193887710571,
"learning_rate": 4.340078745792818e-05,
"loss": 1.103,
"step": 599
},
{
"epoch": 0.5518510002299379,
"grad_norm": 1.0963555574417114,
"learning_rate": 4.325502457976126e-05,
"loss": 1.031,
"step": 600
},
{
"epoch": 0.5518510002299379,
"eval_loss": 1.4310619831085205,
"eval_runtime": 49.9435,
"eval_samples_per_second": 164.986,
"eval_steps_per_second": 20.623,
"step": 600
},
{
"epoch": 0.5527707518969878,
"grad_norm": 1.6411505937576294,
"learning_rate": 4.310932006455276e-05,
"loss": 1.6187,
"step": 601
},
{
"epoch": 0.5536905035640377,
"grad_norm": 1.455959677696228,
"learning_rate": 4.296367517305549e-05,
"loss": 1.5665,
"step": 602
},
{
"epoch": 0.5546102552310876,
"grad_norm": 1.3301597833633423,
"learning_rate": 4.281809116550629e-05,
"loss": 1.5417,
"step": 603
},
{
"epoch": 0.5555300068981375,
"grad_norm": 1.0796560049057007,
"learning_rate": 4.267256930161523e-05,
"loss": 1.5482,
"step": 604
},
{
"epoch": 0.5564497585651874,
"grad_norm": 0.842844545841217,
"learning_rate": 4.252711084055467e-05,
"loss": 1.4583,
"step": 605
},
{
"epoch": 0.5573695102322372,
"grad_norm": 0.7908689379692078,
"learning_rate": 4.2381717040948325e-05,
"loss": 1.4621,
"step": 606
},
{
"epoch": 0.5582892618992872,
"grad_norm": 0.9240807890892029,
"learning_rate": 4.223638916086043e-05,
"loss": 1.4843,
"step": 607
},
{
"epoch": 0.5592090135663371,
"grad_norm": 0.9389266967773438,
"learning_rate": 4.209112845778481e-05,
"loss": 1.4186,
"step": 608
},
{
"epoch": 0.560128765233387,
"grad_norm": 0.7683906555175781,
"learning_rate": 4.194593618863404e-05,
"loss": 1.4541,
"step": 609
},
{
"epoch": 0.5610485169004369,
"grad_norm": 0.6913854479789734,
"learning_rate": 4.1800813609728526e-05,
"loss": 1.4815,
"step": 610
},
{
"epoch": 0.5619682685674868,
"grad_norm": 0.7714055776596069,
"learning_rate": 4.1655761976785705e-05,
"loss": 1.4577,
"step": 611
},
{
"epoch": 0.5628880202345367,
"grad_norm": 0.7735984921455383,
"learning_rate": 4.1510782544909075e-05,
"loss": 1.5057,
"step": 612
},
{
"epoch": 0.5638077719015866,
"grad_norm": 0.8532646298408508,
"learning_rate": 4.136587656857744e-05,
"loss": 1.4917,
"step": 613
},
{
"epoch": 0.5647275235686364,
"grad_norm": 0.7896936535835266,
"learning_rate": 4.122104530163397e-05,
"loss": 1.5009,
"step": 614
},
{
"epoch": 0.5656472752356864,
"grad_norm": 0.6928205490112305,
"learning_rate": 4.107628999727542e-05,
"loss": 1.4733,
"step": 615
},
{
"epoch": 0.5665670269027363,
"grad_norm": 0.728251576423645,
"learning_rate": 4.09316119080412e-05,
"loss": 1.4508,
"step": 616
},
{
"epoch": 0.5674867785697861,
"grad_norm": 0.6070961356163025,
"learning_rate": 4.078701228580269e-05,
"loss": 1.5002,
"step": 617
},
{
"epoch": 0.5684065302368361,
"grad_norm": 0.7009554505348206,
"learning_rate": 4.064249238175223e-05,
"loss": 1.5289,
"step": 618
},
{
"epoch": 0.5693262819038859,
"grad_norm": 0.6865770220756531,
"learning_rate": 4.0498053446392403e-05,
"loss": 1.4876,
"step": 619
},
{
"epoch": 0.5702460335709358,
"grad_norm": 0.6156379580497742,
"learning_rate": 4.035369672952516e-05,
"loss": 1.4032,
"step": 620
},
{
"epoch": 0.5711657852379858,
"grad_norm": 0.5818307995796204,
"learning_rate": 4.020942348024108e-05,
"loss": 1.4421,
"step": 621
},
{
"epoch": 0.5720855369050356,
"grad_norm": 0.5913554430007935,
"learning_rate": 4.0065234946908456e-05,
"loss": 1.4527,
"step": 622
},
{
"epoch": 0.5730052885720855,
"grad_norm": 0.5924707651138306,
"learning_rate": 3.992113237716261e-05,
"loss": 1.4692,
"step": 623
},
{
"epoch": 0.5739250402391355,
"grad_norm": 0.6369109749794006,
"learning_rate": 3.977711701789499e-05,
"loss": 1.4541,
"step": 624
},
{
"epoch": 0.5748447919061853,
"grad_norm": 0.5432732701301575,
"learning_rate": 3.9633190115242456e-05,
"loss": 1.3981,
"step": 625
},
{
"epoch": 0.5757645435732353,
"grad_norm": 0.6044031977653503,
"learning_rate": 3.948935291457644e-05,
"loss": 1.4086,
"step": 626
},
{
"epoch": 0.5766842952402851,
"grad_norm": 0.5974178314208984,
"learning_rate": 3.934560666049226e-05,
"loss": 1.448,
"step": 627
},
{
"epoch": 0.577604046907335,
"grad_norm": 0.6302614212036133,
"learning_rate": 3.920195259679822e-05,
"loss": 1.4095,
"step": 628
},
{
"epoch": 0.578523798574385,
"grad_norm": 0.6615459322929382,
"learning_rate": 3.905839196650493e-05,
"loss": 1.5048,
"step": 629
},
{
"epoch": 0.5794435502414348,
"grad_norm": 0.5650434494018555,
"learning_rate": 3.8914926011814626e-05,
"loss": 1.4093,
"step": 630
},
{
"epoch": 0.5803633019084847,
"grad_norm": 0.5881006121635437,
"learning_rate": 3.8771555974110194e-05,
"loss": 1.3783,
"step": 631
},
{
"epoch": 0.5812830535755346,
"grad_norm": 0.6607415676116943,
"learning_rate": 3.8628283093944686e-05,
"loss": 1.4406,
"step": 632
},
{
"epoch": 0.5822028052425845,
"grad_norm": 0.6574285626411438,
"learning_rate": 3.8485108611030415e-05,
"loss": 1.3927,
"step": 633
},
{
"epoch": 0.5831225569096344,
"grad_norm": 0.7541502714157104,
"learning_rate": 3.834203376422831e-05,
"loss": 1.374,
"step": 634
},
{
"epoch": 0.5840423085766843,
"grad_norm": 0.6834109425544739,
"learning_rate": 3.81990597915371e-05,
"loss": 1.3459,
"step": 635
},
{
"epoch": 0.5849620602437342,
"grad_norm": 0.649935781955719,
"learning_rate": 3.805618793008279e-05,
"loss": 1.3314,
"step": 636
},
{
"epoch": 0.585881811910784,
"grad_norm": 0.6892503499984741,
"learning_rate": 3.7913419416107694e-05,
"loss": 1.3958,
"step": 637
},
{
"epoch": 0.586801563577834,
"grad_norm": 0.6689726710319519,
"learning_rate": 3.7770755484960004e-05,
"loss": 1.3384,
"step": 638
},
{
"epoch": 0.5877213152448839,
"grad_norm": 0.5913270711898804,
"learning_rate": 3.762819737108291e-05,
"loss": 1.3169,
"step": 639
},
{
"epoch": 0.5886410669119337,
"grad_norm": 0.6090061068534851,
"learning_rate": 3.748574630800401e-05,
"loss": 1.2413,
"step": 640
},
{
"epoch": 0.5895608185789837,
"grad_norm": 0.7058801651000977,
"learning_rate": 3.734340352832457e-05,
"loss": 1.289,
"step": 641
},
{
"epoch": 0.5904805702460336,
"grad_norm": 0.7695034146308899,
"learning_rate": 3.7201170263709e-05,
"loss": 1.3332,
"step": 642
},
{
"epoch": 0.5914003219130834,
"grad_norm": 0.6559154987335205,
"learning_rate": 3.705904774487396e-05,
"loss": 1.2992,
"step": 643
},
{
"epoch": 0.5923200735801334,
"grad_norm": 0.7140766382217407,
"learning_rate": 3.691703720157798e-05,
"loss": 1.2247,
"step": 644
},
{
"epoch": 0.5932398252471832,
"grad_norm": 0.7867764830589294,
"learning_rate": 3.6775139862610574e-05,
"loss": 1.2409,
"step": 645
},
{
"epoch": 0.5941595769142332,
"grad_norm": 0.9307761788368225,
"learning_rate": 3.663335695578183e-05,
"loss": 1.1696,
"step": 646
},
{
"epoch": 0.5950793285812831,
"grad_norm": 0.8968107104301453,
"learning_rate": 3.649168970791157e-05,
"loss": 1.1511,
"step": 647
},
{
"epoch": 0.5959990802483329,
"grad_norm": 0.9723992943763733,
"learning_rate": 3.635013934481895e-05,
"loss": 1.1133,
"step": 648
},
{
"epoch": 0.5969188319153829,
"grad_norm": 1.1764365434646606,
"learning_rate": 3.6208707091311626e-05,
"loss": 1.1247,
"step": 649
},
{
"epoch": 0.5978385835824327,
"grad_norm": 1.0631630420684814,
"learning_rate": 3.6067394171175394e-05,
"loss": 1.0094,
"step": 650
},
{
"epoch": 0.5987583352494826,
"grad_norm": 1.4610891342163086,
"learning_rate": 3.592620180716338e-05,
"loss": 1.635,
"step": 651
},
{
"epoch": 0.5996780869165326,
"grad_norm": 1.4560317993164062,
"learning_rate": 3.578513122098566e-05,
"loss": 1.5683,
"step": 652
},
{
"epoch": 0.6005978385835824,
"grad_norm": 1.250054955482483,
"learning_rate": 3.564418363329848e-05,
"loss": 1.4994,
"step": 653
},
{
"epoch": 0.6015175902506323,
"grad_norm": 1.0758668184280396,
"learning_rate": 3.5503360263693886e-05,
"loss": 1.4581,
"step": 654
},
{
"epoch": 0.6024373419176823,
"grad_norm": 0.9774999022483826,
"learning_rate": 3.5362662330689064e-05,
"loss": 1.4609,
"step": 655
},
{
"epoch": 0.6033570935847321,
"grad_norm": 0.8008742332458496,
"learning_rate": 3.52220910517158e-05,
"loss": 1.4672,
"step": 656
},
{
"epoch": 0.604276845251782,
"grad_norm": 0.7127364873886108,
"learning_rate": 3.5081647643110024e-05,
"loss": 1.4948,
"step": 657
},
{
"epoch": 0.6051965969188319,
"grad_norm": 0.76557457447052,
"learning_rate": 3.494133332010117e-05,
"loss": 1.4609,
"step": 658
},
{
"epoch": 0.6061163485858818,
"grad_norm": 0.8269351124763489,
"learning_rate": 3.480114929680176e-05,
"loss": 1.5268,
"step": 659
},
{
"epoch": 0.6070361002529318,
"grad_norm": 0.810955286026001,
"learning_rate": 3.466109678619681e-05,
"loss": 1.523,
"step": 660
},
{
"epoch": 0.6079558519199816,
"grad_norm": 0.6712583303451538,
"learning_rate": 3.452117700013345e-05,
"loss": 1.4676,
"step": 661
},
{
"epoch": 0.6088756035870315,
"grad_norm": 0.828484058380127,
"learning_rate": 3.43813911493103e-05,
"loss": 1.5116,
"step": 662
},
{
"epoch": 0.6097953552540814,
"grad_norm": 0.7789233922958374,
"learning_rate": 3.424174044326711e-05,
"loss": 1.445,
"step": 663
},
{
"epoch": 0.6107151069211313,
"grad_norm": 0.7635114789009094,
"learning_rate": 3.4102226090374246e-05,
"loss": 1.5681,
"step": 664
},
{
"epoch": 0.6116348585881812,
"grad_norm": 0.6956825256347656,
"learning_rate": 3.3962849297822226e-05,
"loss": 1.4877,
"step": 665
},
{
"epoch": 0.6125546102552311,
"grad_norm": 0.6926284432411194,
"learning_rate": 3.382361127161127e-05,
"loss": 1.4282,
"step": 666
},
{
"epoch": 0.613474361922281,
"grad_norm": 0.8702225089073181,
"learning_rate": 3.368451321654091e-05,
"loss": 1.4773,
"step": 667
},
{
"epoch": 0.6143941135893308,
"grad_norm": 0.7277842164039612,
"learning_rate": 3.35455563361995e-05,
"loss": 1.3959,
"step": 668
},
{
"epoch": 0.6153138652563808,
"grad_norm": 0.6363296508789062,
"learning_rate": 3.340674183295389e-05,
"loss": 1.4747,
"step": 669
},
{
"epoch": 0.6162336169234307,
"grad_norm": 0.6425765156745911,
"learning_rate": 3.326807090793891e-05,
"loss": 1.4423,
"step": 670
},
{
"epoch": 0.6171533685904805,
"grad_norm": 0.6721304059028625,
"learning_rate": 3.312954476104709e-05,
"loss": 1.4241,
"step": 671
},
{
"epoch": 0.6180731202575305,
"grad_norm": 0.6218870878219604,
"learning_rate": 3.299116459091816e-05,
"loss": 1.4644,
"step": 672
},
{
"epoch": 0.6189928719245804,
"grad_norm": 0.6951906681060791,
"learning_rate": 3.2852931594928807e-05,
"loss": 1.452,
"step": 673
},
{
"epoch": 0.6199126235916302,
"grad_norm": 0.6208174824714661,
"learning_rate": 3.271484696918218e-05,
"loss": 1.415,
"step": 674
},
{
"epoch": 0.6208323752586802,
"grad_norm": 0.5596356391906738,
"learning_rate": 3.257691190849769e-05,
"loss": 1.4708,
"step": 675
},
{
"epoch": 0.62175212692573,
"grad_norm": 0.6394990682601929,
"learning_rate": 3.243912760640054e-05,
"loss": 1.4522,
"step": 676
},
{
"epoch": 0.62267187859278,
"grad_norm": 0.6112094521522522,
"learning_rate": 3.2301495255111425e-05,
"loss": 1.3607,
"step": 677
},
{
"epoch": 0.6235916302598299,
"grad_norm": 0.645779013633728,
"learning_rate": 3.2164016045536304e-05,
"loss": 1.4282,
"step": 678
},
{
"epoch": 0.6245113819268797,
"grad_norm": 0.6169288754463196,
"learning_rate": 3.202669116725598e-05,
"loss": 1.4052,
"step": 679
},
{
"epoch": 0.6254311335939297,
"grad_norm": 0.6002304553985596,
"learning_rate": 3.188952180851589e-05,
"loss": 1.419,
"step": 680
},
{
"epoch": 0.6263508852609795,
"grad_norm": 0.6018975377082825,
"learning_rate": 3.1752509156215734e-05,
"loss": 1.3685,
"step": 681
},
{
"epoch": 0.6272706369280294,
"grad_norm": 0.6559040546417236,
"learning_rate": 3.1615654395899375e-05,
"loss": 1.3657,
"step": 682
},
{
"epoch": 0.6281903885950794,
"grad_norm": 0.6393570899963379,
"learning_rate": 3.147895871174432e-05,
"loss": 1.405,
"step": 683
},
{
"epoch": 0.6291101402621292,
"grad_norm": 0.6094779968261719,
"learning_rate": 3.134242328655175e-05,
"loss": 1.3179,
"step": 684
},
{
"epoch": 0.6300298919291791,
"grad_norm": 0.6581336855888367,
"learning_rate": 3.120604930173608e-05,
"loss": 1.3276,
"step": 685
},
{
"epoch": 0.6309496435962291,
"grad_norm": 0.6599423289299011,
"learning_rate": 3.106983793731484e-05,
"loss": 1.2805,
"step": 686
},
{
"epoch": 0.6318693952632789,
"grad_norm": 0.683204710483551,
"learning_rate": 3.093379037189842e-05,
"loss": 1.3557,
"step": 687
},
{
"epoch": 0.6327891469303288,
"grad_norm": 0.6180110573768616,
"learning_rate": 3.079790778267994e-05,
"loss": 1.2668,
"step": 688
},
{
"epoch": 0.6337088985973787,
"grad_norm": 0.7273058891296387,
"learning_rate": 3.066219134542492e-05,
"loss": 1.2852,
"step": 689
},
{
"epoch": 0.6346286502644286,
"grad_norm": 0.6892321705818176,
"learning_rate": 3.052664223446131e-05,
"loss": 1.2997,
"step": 690
},
{
"epoch": 0.6355484019314785,
"grad_norm": 0.694174587726593,
"learning_rate": 3.039126162266912e-05,
"loss": 1.2398,
"step": 691
},
{
"epoch": 0.6364681535985284,
"grad_norm": 0.7471473217010498,
"learning_rate": 3.0256050681470444e-05,
"loss": 1.1879,
"step": 692
},
{
"epoch": 0.6373879052655783,
"grad_norm": 0.7812895178794861,
"learning_rate": 3.012101058081919e-05,
"loss": 1.2826,
"step": 693
},
{
"epoch": 0.6383076569326281,
"grad_norm": 0.7405266761779785,
"learning_rate": 2.998614248919107e-05,
"loss": 1.1937,
"step": 694
},
{
"epoch": 0.6392274085996781,
"grad_norm": 0.7346695065498352,
"learning_rate": 2.9851447573573384e-05,
"loss": 1.2364,
"step": 695
},
{
"epoch": 0.640147160266728,
"grad_norm": 0.7376750707626343,
"learning_rate": 2.971692699945502e-05,
"loss": 1.222,
"step": 696
},
{
"epoch": 0.6410669119337778,
"grad_norm": 0.7857553362846375,
"learning_rate": 2.9582581930816288e-05,
"loss": 1.1532,
"step": 697
},
{
"epoch": 0.6419866636008278,
"grad_norm": 1.1139256954193115,
"learning_rate": 2.9448413530118914e-05,
"loss": 1.0823,
"step": 698
},
{
"epoch": 0.6429064152678776,
"grad_norm": 0.9734514355659485,
"learning_rate": 2.9314422958295907e-05,
"loss": 1.0059,
"step": 699
},
{
"epoch": 0.6438261669349276,
"grad_norm": 1.195755124092102,
"learning_rate": 2.9180611374741623e-05,
"loss": 1.0146,
"step": 700
},
{
"epoch": 0.6447459186019775,
"grad_norm": 1.1521427631378174,
"learning_rate": 2.9046979937301588e-05,
"loss": 1.5188,
"step": 701
},
{
"epoch": 0.6456656702690273,
"grad_norm": 1.0498712062835693,
"learning_rate": 2.8913529802262617e-05,
"loss": 1.5642,
"step": 702
},
{
"epoch": 0.6465854219360773,
"grad_norm": 1.004340410232544,
"learning_rate": 2.8780262124342755e-05,
"loss": 1.4869,
"step": 703
},
{
"epoch": 0.6475051736031272,
"grad_norm": 0.9507954716682434,
"learning_rate": 2.8647178056681194e-05,
"loss": 1.5128,
"step": 704
},
{
"epoch": 0.648424925270177,
"grad_norm": 0.8366132974624634,
"learning_rate": 2.8514278750828536e-05,
"loss": 1.4907,
"step": 705
},
{
"epoch": 0.649344676937227,
"grad_norm": 0.8227055072784424,
"learning_rate": 2.838156535673652e-05,
"loss": 1.5356,
"step": 706
},
{
"epoch": 0.6502644286042768,
"grad_norm": 0.7174684405326843,
"learning_rate": 2.8249039022748313e-05,
"loss": 1.4349,
"step": 707
},
{
"epoch": 0.6511841802713267,
"grad_norm": 0.6819536089897156,
"learning_rate": 2.8116700895588472e-05,
"loss": 1.4133,
"step": 708
},
{
"epoch": 0.6521039319383767,
"grad_norm": 0.7197076082229614,
"learning_rate": 2.7984552120353046e-05,
"loss": 1.4284,
"step": 709
},
{
"epoch": 0.6530236836054265,
"grad_norm": 0.7833074331283569,
"learning_rate": 2.785259384049959e-05,
"loss": 1.5066,
"step": 710
},
{
"epoch": 0.6539434352724764,
"grad_norm": 0.7236879467964172,
"learning_rate": 2.7720827197837472e-05,
"loss": 1.3815,
"step": 711
},
{
"epoch": 0.6548631869395263,
"grad_norm": 0.6463202238082886,
"learning_rate": 2.7589253332517734e-05,
"loss": 1.4513,
"step": 712
},
{
"epoch": 0.6557829386065762,
"grad_norm": 0.7177314758300781,
"learning_rate": 2.745787338302341e-05,
"loss": 1.4443,
"step": 713
},
{
"epoch": 0.6567026902736262,
"grad_norm": 0.7721028327941895,
"learning_rate": 2.7326688486159613e-05,
"loss": 1.4899,
"step": 714
},
{
"epoch": 0.657622441940676,
"grad_norm": 0.6830793023109436,
"learning_rate": 2.719569977704372e-05,
"loss": 1.5052,
"step": 715
},
{
"epoch": 0.6585421936077259,
"grad_norm": 0.6752369403839111,
"learning_rate": 2.7064908389095468e-05,
"loss": 1.5062,
"step": 716
},
{
"epoch": 0.6594619452747759,
"grad_norm": 0.6267321109771729,
"learning_rate": 2.693431545402732e-05,
"loss": 1.5125,
"step": 717
},
{
"epoch": 0.6603816969418257,
"grad_norm": 0.6160003542900085,
"learning_rate": 2.6803922101834454e-05,
"loss": 1.4609,
"step": 718
},
{
"epoch": 0.6613014486088756,
"grad_norm": 0.5926380157470703,
"learning_rate": 2.6673729460785176e-05,
"loss": 1.415,
"step": 719
},
{
"epoch": 0.6622212002759255,
"grad_norm": 0.6655170321464539,
"learning_rate": 2.6543738657411034e-05,
"loss": 1.372,
"step": 720
},
{
"epoch": 0.6631409519429754,
"grad_norm": 0.6094529628753662,
"learning_rate": 2.6413950816497147e-05,
"loss": 1.4037,
"step": 721
},
{
"epoch": 0.6640607036100253,
"grad_norm": 0.6568109393119812,
"learning_rate": 2.6284367061072378e-05,
"loss": 1.458,
"step": 722
},
{
"epoch": 0.6649804552770752,
"grad_norm": 0.5817413330078125,
"learning_rate": 2.615498851239978e-05,
"loss": 1.4009,
"step": 723
},
{
"epoch": 0.6659002069441251,
"grad_norm": 0.6216491460800171,
"learning_rate": 2.6025816289966704e-05,
"loss": 1.4178,
"step": 724
},
{
"epoch": 0.6668199586111749,
"grad_norm": 0.6176545023918152,
"learning_rate": 2.5896851511475186e-05,
"loss": 1.4191,
"step": 725
},
{
"epoch": 0.6677397102782249,
"grad_norm": 0.5803206562995911,
"learning_rate": 2.576809529283241e-05,
"loss": 1.415,
"step": 726
},
{
"epoch": 0.6686594619452748,
"grad_norm": 0.5935968160629272,
"learning_rate": 2.5639548748140802e-05,
"loss": 1.3797,
"step": 727
},
{
"epoch": 0.6695792136123246,
"grad_norm": 0.6356935501098633,
"learning_rate": 2.5511212989688586e-05,
"loss": 1.4948,
"step": 728
},
{
"epoch": 0.6704989652793746,
"grad_norm": 0.5835620760917664,
"learning_rate": 2.5383089127940086e-05,
"loss": 1.4203,
"step": 729
},
{
"epoch": 0.6714187169464245,
"grad_norm": 0.687403678894043,
"learning_rate": 2.5255178271526137e-05,
"loss": 1.3661,
"step": 730
},
{
"epoch": 0.6723384686134743,
"grad_norm": 0.6388825178146362,
"learning_rate": 2.51274815272344e-05,
"loss": 1.4157,
"step": 731
},
{
"epoch": 0.6732582202805243,
"grad_norm": 0.6280670762062073,
"learning_rate": 2.500000000000001e-05,
"loss": 1.3854,
"step": 732
},
{
"epoch": 0.6741779719475741,
"grad_norm": 0.6690565943717957,
"learning_rate": 2.4872734792895734e-05,
"loss": 1.3974,
"step": 733
},
{
"epoch": 0.6750977236146241,
"grad_norm": 0.6328375339508057,
"learning_rate": 2.4745687007122636e-05,
"loss": 1.3462,
"step": 734
},
{
"epoch": 0.676017475281674,
"grad_norm": 0.6421682834625244,
"learning_rate": 2.4618857742000463e-05,
"loss": 1.2237,
"step": 735
},
{
"epoch": 0.6769372269487238,
"grad_norm": 0.6286811828613281,
"learning_rate": 2.4492248094958147e-05,
"loss": 1.3481,
"step": 736
},
{
"epoch": 0.6778569786157738,
"grad_norm": 0.61008220911026,
"learning_rate": 2.4365859161524258e-05,
"loss": 1.2088,
"step": 737
},
{
"epoch": 0.6787767302828236,
"grad_norm": 0.6456345915794373,
"learning_rate": 2.4239692035317678e-05,
"loss": 1.1997,
"step": 738
},
{
"epoch": 0.6796964819498735,
"grad_norm": 0.8082221746444702,
"learning_rate": 2.411374780803793e-05,
"loss": 1.2172,
"step": 739
},
{
"epoch": 0.6806162336169235,
"grad_norm": 0.6706709861755371,
"learning_rate": 2.3988027569455895e-05,
"loss": 1.211,
"step": 740
},
{
"epoch": 0.6815359852839733,
"grad_norm": 0.6545360088348389,
"learning_rate": 2.3862532407404303e-05,
"loss": 1.3001,
"step": 741
},
{
"epoch": 0.6824557369510232,
"grad_norm": 0.8686853051185608,
"learning_rate": 2.373726340776837e-05,
"loss": 1.2328,
"step": 742
},
{
"epoch": 0.6833754886180731,
"grad_norm": 0.668156087398529,
"learning_rate": 2.361222165447628e-05,
"loss": 1.2011,
"step": 743
},
{
"epoch": 0.684295240285123,
"grad_norm": 0.685393750667572,
"learning_rate": 2.348740822949006e-05,
"loss": 1.2309,
"step": 744
},
{
"epoch": 0.685214991952173,
"grad_norm": 0.6708635687828064,
"learning_rate": 2.3362824212795898e-05,
"loss": 1.1972,
"step": 745
},
{
"epoch": 0.6861347436192228,
"grad_norm": 0.8381814360618591,
"learning_rate": 2.3238470682395037e-05,
"loss": 1.2545,
"step": 746
},
{
"epoch": 0.6870544952862727,
"grad_norm": 0.7803678512573242,
"learning_rate": 2.3114348714294354e-05,
"loss": 1.1471,
"step": 747
},
{
"epoch": 0.6879742469533227,
"grad_norm": 0.8974632024765015,
"learning_rate": 2.2990459382497088e-05,
"loss": 1.1145,
"step": 748
},
{
"epoch": 0.6888939986203725,
"grad_norm": 1.0532459020614624,
"learning_rate": 2.2866803758993445e-05,
"loss": 1.0573,
"step": 749
},
{
"epoch": 0.6898137502874224,
"grad_norm": 1.208759069442749,
"learning_rate": 2.274338291375147e-05,
"loss": 0.9195,
"step": 750
},
{
"epoch": 0.6898137502874224,
"eval_loss": 1.3665193319320679,
"eval_runtime": 50.0048,
"eval_samples_per_second": 164.784,
"eval_steps_per_second": 20.598,
"step": 750
},
{
"epoch": 0.6907335019544723,
"grad_norm": 1.253531575202942,
"learning_rate": 2.2620197914707718e-05,
"loss": 1.602,
"step": 751
},
{
"epoch": 0.6916532536215222,
"grad_norm": 1.2635823488235474,
"learning_rate": 2.2497249827757933e-05,
"loss": 1.5615,
"step": 752
},
{
"epoch": 0.6925730052885721,
"grad_norm": 1.0416873693466187,
"learning_rate": 2.2374539716748032e-05,
"loss": 1.4779,
"step": 753
},
{
"epoch": 0.693492756955622,
"grad_norm": 0.9805805087089539,
"learning_rate": 2.225206864346465e-05,
"loss": 1.4272,
"step": 754
},
{
"epoch": 0.6944125086226719,
"grad_norm": 0.9023362398147583,
"learning_rate": 2.2129837667626145e-05,
"loss": 1.4208,
"step": 755
},
{
"epoch": 0.6953322602897217,
"grad_norm": 1.0136377811431885,
"learning_rate": 2.200784784687334e-05,
"loss": 1.4692,
"step": 756
},
{
"epoch": 0.6962520119567717,
"grad_norm": 0.9673015475273132,
"learning_rate": 2.188610023676041e-05,
"loss": 1.4966,
"step": 757
},
{
"epoch": 0.6971717636238216,
"grad_norm": 0.8694583177566528,
"learning_rate": 2.176459589074566e-05,
"loss": 1.4035,
"step": 758
},
{
"epoch": 0.6980915152908714,
"grad_norm": 0.7423250675201416,
"learning_rate": 2.164333586018259e-05,
"loss": 1.4623,
"step": 759
},
{
"epoch": 0.6990112669579214,
"grad_norm": 0.7796162366867065,
"learning_rate": 2.1522321194310574e-05,
"loss": 1.466,
"step": 760
},
{
"epoch": 0.6999310186249713,
"grad_norm": 0.9312780499458313,
"learning_rate": 2.1401552940245962e-05,
"loss": 1.3982,
"step": 761
},
{
"epoch": 0.7008507702920211,
"grad_norm": 0.7841870784759521,
"learning_rate": 2.1281032142972933e-05,
"loss": 1.505,
"step": 762
},
{
"epoch": 0.7017705219590711,
"grad_norm": 0.6561142206192017,
"learning_rate": 2.1160759845334484e-05,
"loss": 1.4446,
"step": 763
},
{
"epoch": 0.7026902736261209,
"grad_norm": 0.6478760242462158,
"learning_rate": 2.1040737088023323e-05,
"loss": 1.4218,
"step": 764
},
{
"epoch": 0.7036100252931708,
"grad_norm": 0.8280866146087646,
"learning_rate": 2.0920964909573066e-05,
"loss": 1.4915,
"step": 765
},
{
"epoch": 0.7045297769602208,
"grad_norm": 0.8623349666595459,
"learning_rate": 2.080144434634898e-05,
"loss": 1.3761,
"step": 766
},
{
"epoch": 0.7054495286272706,
"grad_norm": 0.7455824613571167,
"learning_rate": 2.0682176432539246e-05,
"loss": 1.39,
"step": 767
},
{
"epoch": 0.7063692802943206,
"grad_norm": 0.6684551239013672,
"learning_rate": 2.056316220014588e-05,
"loss": 1.4599,
"step": 768
},
{
"epoch": 0.7072890319613704,
"grad_norm": 0.6949120759963989,
"learning_rate": 2.0444402678975877e-05,
"loss": 1.4068,
"step": 769
},
{
"epoch": 0.7082087836284203,
"grad_norm": 0.698066771030426,
"learning_rate": 2.0325898896632177e-05,
"loss": 1.4451,
"step": 770
},
{
"epoch": 0.7091285352954703,
"grad_norm": 0.6923701167106628,
"learning_rate": 2.0207651878505e-05,
"loss": 1.4183,
"step": 771
},
{
"epoch": 0.7100482869625201,
"grad_norm": 0.6396070718765259,
"learning_rate": 2.0089662647762715e-05,
"loss": 1.4079,
"step": 772
},
{
"epoch": 0.71096803862957,
"grad_norm": 0.5608759522438049,
"learning_rate": 1.997193222534316e-05,
"loss": 1.3507,
"step": 773
},
{
"epoch": 0.7118877902966199,
"grad_norm": 0.6374341249465942,
"learning_rate": 1.9854461629944763e-05,
"loss": 1.395,
"step": 774
},
{
"epoch": 0.7128075419636698,
"grad_norm": 0.5628088116645813,
"learning_rate": 1.9737251878017678e-05,
"loss": 1.3779,
"step": 775
},
{
"epoch": 0.7137272936307197,
"grad_norm": 0.6205474138259888,
"learning_rate": 1.962030398375506e-05,
"loss": 1.3974,
"step": 776
},
{
"epoch": 0.7146470452977696,
"grad_norm": 0.5789771676063538,
"learning_rate": 1.950361895908427e-05,
"loss": 1.331,
"step": 777
},
{
"epoch": 0.7155667969648195,
"grad_norm": 0.636550784111023,
"learning_rate": 1.9387197813658092e-05,
"loss": 1.3799,
"step": 778
},
{
"epoch": 0.7164865486318694,
"grad_norm": 0.6165384650230408,
"learning_rate": 1.927104155484602e-05,
"loss": 1.3579,
"step": 779
},
{
"epoch": 0.7174063002989193,
"grad_norm": 0.6170758008956909,
"learning_rate": 1.9155151187725552e-05,
"loss": 1.349,
"step": 780
},
{
"epoch": 0.7183260519659692,
"grad_norm": 0.5404320359230042,
"learning_rate": 1.9039527715073424e-05,
"loss": 1.364,
"step": 781
},
{
"epoch": 0.719245803633019,
"grad_norm": 0.5796113014221191,
"learning_rate": 1.892417213735704e-05,
"loss": 1.2893,
"step": 782
},
{
"epoch": 0.720165555300069,
"grad_norm": 0.6280906796455383,
"learning_rate": 1.8809085452725746e-05,
"loss": 1.3598,
"step": 783
},
{
"epoch": 0.7210853069671189,
"grad_norm": 0.6569982171058655,
"learning_rate": 1.8694268657002194e-05,
"loss": 1.3006,
"step": 784
},
{
"epoch": 0.7220050586341688,
"grad_norm": 0.6892338991165161,
"learning_rate": 1.8579722743673773e-05,
"loss": 1.3557,
"step": 785
},
{
"epoch": 0.7229248103012187,
"grad_norm": 0.6984684467315674,
"learning_rate": 1.8465448703883958e-05,
"loss": 1.3506,
"step": 786
},
{
"epoch": 0.7238445619682685,
"grad_norm": 0.65283203125,
"learning_rate": 1.8351447526423727e-05,
"loss": 1.3009,
"step": 787
},
{
"epoch": 0.7247643136353185,
"grad_norm": 0.7025482654571533,
"learning_rate": 1.8237720197723075e-05,
"loss": 1.1886,
"step": 788
},
{
"epoch": 0.7256840653023684,
"grad_norm": 0.6791706085205078,
"learning_rate": 1.812426770184243e-05,
"loss": 1.2081,
"step": 789
},
{
"epoch": 0.7266038169694182,
"grad_norm": 0.6996423602104187,
"learning_rate": 1.801109102046414e-05,
"loss": 1.2468,
"step": 790
},
{
"epoch": 0.7275235686364682,
"grad_norm": 0.722210705280304,
"learning_rate": 1.7898191132883968e-05,
"loss": 1.196,
"step": 791
},
{
"epoch": 0.7284433203035181,
"grad_norm": 0.6527461409568787,
"learning_rate": 1.7785569016002685e-05,
"loss": 1.2516,
"step": 792
},
{
"epoch": 0.7293630719705679,
"grad_norm": 0.6403821110725403,
"learning_rate": 1.7673225644317486e-05,
"loss": 1.1883,
"step": 793
},
{
"epoch": 0.7302828236376179,
"grad_norm": 0.7447903156280518,
"learning_rate": 1.7561161989913698e-05,
"loss": 1.2232,
"step": 794
},
{
"epoch": 0.7312025753046677,
"grad_norm": 0.8253830671310425,
"learning_rate": 1.7449379022456295e-05,
"loss": 1.2144,
"step": 795
},
{
"epoch": 0.7321223269717176,
"grad_norm": 0.8268104791641235,
"learning_rate": 1.7337877709181526e-05,
"loss": 1.1443,
"step": 796
},
{
"epoch": 0.7330420786387676,
"grad_norm": 0.8768870830535889,
"learning_rate": 1.7226659014888546e-05,
"loss": 1.0736,
"step": 797
},
{
"epoch": 0.7339618303058174,
"grad_norm": 0.8852882981300354,
"learning_rate": 1.711572390193102e-05,
"loss": 1.1051,
"step": 798
},
{
"epoch": 0.7348815819728673,
"grad_norm": 1.0162791013717651,
"learning_rate": 1.7005073330208883e-05,
"loss": 1.0043,
"step": 799
},
{
"epoch": 0.7358013336399172,
"grad_norm": 1.2660006284713745,
"learning_rate": 1.689470825715998e-05,
"loss": 1.0243,
"step": 800
},
{
"epoch": 0.7367210853069671,
"grad_norm": 1.007739543914795,
"learning_rate": 1.6784629637751815e-05,
"loss": 1.5297,
"step": 801
},
{
"epoch": 0.7376408369740171,
"grad_norm": 0.9282512664794922,
"learning_rate": 1.6674838424473173e-05,
"loss": 1.5234,
"step": 802
},
{
"epoch": 0.7385605886410669,
"grad_norm": 0.8745155334472656,
"learning_rate": 1.656533556732611e-05,
"loss": 1.4494,
"step": 803
},
{
"epoch": 0.7394803403081168,
"grad_norm": 0.941735565662384,
"learning_rate": 1.6456122013817476e-05,
"loss": 1.5395,
"step": 804
},
{
"epoch": 0.7404000919751667,
"grad_norm": 0.9213740825653076,
"learning_rate": 1.6347198708950882e-05,
"loss": 1.4104,
"step": 805
},
{
"epoch": 0.7413198436422166,
"grad_norm": 0.8986393809318542,
"learning_rate": 1.6238566595218473e-05,
"loss": 1.4004,
"step": 806
},
{
"epoch": 0.7422395953092665,
"grad_norm": 1.212737798690796,
"learning_rate": 1.6130226612592786e-05,
"loss": 1.4478,
"step": 807
},
{
"epoch": 0.7431593469763164,
"grad_norm": 0.8150504231452942,
"learning_rate": 1.6022179698518523e-05,
"loss": 1.4197,
"step": 808
},
{
"epoch": 0.7440790986433663,
"grad_norm": 0.7515584826469421,
"learning_rate": 1.591442678790467e-05,
"loss": 1.454,
"step": 809
},
{
"epoch": 0.7449988503104162,
"grad_norm": 0.6738887429237366,
"learning_rate": 1.5806968813116107e-05,
"loss": 1.46,
"step": 810
},
{
"epoch": 0.7459186019774661,
"grad_norm": 0.8340874314308167,
"learning_rate": 1.5699806703965787e-05,
"loss": 1.4261,
"step": 811
},
{
"epoch": 0.746838353644516,
"grad_norm": 0.7794579863548279,
"learning_rate": 1.559294138770656e-05,
"loss": 1.4964,
"step": 812
},
{
"epoch": 0.7477581053115658,
"grad_norm": 0.7533066868782043,
"learning_rate": 1.5486373789023205e-05,
"loss": 1.4325,
"step": 813
},
{
"epoch": 0.7486778569786158,
"grad_norm": 0.643245279788971,
"learning_rate": 1.538010483002435e-05,
"loss": 1.4201,
"step": 814
},
{
"epoch": 0.7495976086456657,
"grad_norm": 0.6805441379547119,
"learning_rate": 1.5274135430234654e-05,
"loss": 1.4768,
"step": 815
},
{
"epoch": 0.7505173603127155,
"grad_norm": 0.7012439966201782,
"learning_rate": 1.5168466506586654e-05,
"loss": 1.3795,
"step": 816
},
{
"epoch": 0.7514371119797655,
"grad_norm": 0.6986867189407349,
"learning_rate": 1.506309897341297e-05,
"loss": 1.3924,
"step": 817
},
{
"epoch": 0.7523568636468153,
"grad_norm": 0.7575457692146301,
"learning_rate": 1.495803374243835e-05,
"loss": 1.4462,
"step": 818
},
{
"epoch": 0.7532766153138652,
"grad_norm": 0.6013389229774475,
"learning_rate": 1.4853271722771772e-05,
"loss": 1.3786,
"step": 819
},
{
"epoch": 0.7541963669809152,
"grad_norm": 0.596037745475769,
"learning_rate": 1.4748813820898554e-05,
"loss": 1.3483,
"step": 820
},
{
"epoch": 0.755116118647965,
"grad_norm": 0.6031373739242554,
"learning_rate": 1.4644660940672627e-05,
"loss": 1.364,
"step": 821
},
{
"epoch": 0.756035870315015,
"grad_norm": 0.6841591000556946,
"learning_rate": 1.4540813983308548e-05,
"loss": 1.4468,
"step": 822
},
{
"epoch": 0.7569556219820649,
"grad_norm": 0.7204717993736267,
"learning_rate": 1.4437273847373777e-05,
"loss": 1.3843,
"step": 823
},
{
"epoch": 0.7578753736491147,
"grad_norm": 0.6169053912162781,
"learning_rate": 1.4334041428781003e-05,
"loss": 1.3776,
"step": 824
},
{
"epoch": 0.7587951253161647,
"grad_norm": 0.5684770941734314,
"learning_rate": 1.4231117620780188e-05,
"loss": 1.4011,
"step": 825
},
{
"epoch": 0.7597148769832145,
"grad_norm": 0.5605279207229614,
"learning_rate": 1.4128503313951009e-05,
"loss": 1.4227,
"step": 826
},
{
"epoch": 0.7606346286502644,
"grad_norm": 0.6137314438819885,
"learning_rate": 1.4026199396195077e-05,
"loss": 1.4014,
"step": 827
},
{
"epoch": 0.7615543803173144,
"grad_norm": 0.6102471351623535,
"learning_rate": 1.3924206752728281e-05,
"loss": 1.2759,
"step": 828
},
{
"epoch": 0.7624741319843642,
"grad_norm": 0.6177085638046265,
"learning_rate": 1.3822526266073043e-05,
"loss": 1.3204,
"step": 829
},
{
"epoch": 0.7633938836514141,
"grad_norm": 0.5692439675331116,
"learning_rate": 1.3721158816050873e-05,
"loss": 1.3467,
"step": 830
},
{
"epoch": 0.764313635318464,
"grad_norm": 0.6170715689659119,
"learning_rate": 1.362010527977453e-05,
"loss": 1.2864,
"step": 831
},
{
"epoch": 0.7652333869855139,
"grad_norm": 0.6100102066993713,
"learning_rate": 1.3519366531640587e-05,
"loss": 1.331,
"step": 832
},
{
"epoch": 0.7661531386525638,
"grad_norm": 0.6240009069442749,
"learning_rate": 1.3418943443321807e-05,
"loss": 1.2976,
"step": 833
},
{
"epoch": 0.7670728903196137,
"grad_norm": 0.5838286876678467,
"learning_rate": 1.3318836883759634e-05,
"loss": 1.2843,
"step": 834
},
{
"epoch": 0.7679926419866636,
"grad_norm": 0.6636451482772827,
"learning_rate": 1.3219047719156575e-05,
"loss": 1.2261,
"step": 835
},
{
"epoch": 0.7689123936537134,
"grad_norm": 0.6104261875152588,
"learning_rate": 1.3119576812968892e-05,
"loss": 1.2723,
"step": 836
},
{
"epoch": 0.7698321453207634,
"grad_norm": 0.7110616564750671,
"learning_rate": 1.3020425025898925e-05,
"loss": 1.295,
"step": 837
},
{
"epoch": 0.7707518969878133,
"grad_norm": 0.6308919191360474,
"learning_rate": 1.292159321588778e-05,
"loss": 1.225,
"step": 838
},
{
"epoch": 0.7716716486548632,
"grad_norm": 0.6422338485717773,
"learning_rate": 1.2823082238107858e-05,
"loss": 1.2812,
"step": 839
},
{
"epoch": 0.7725914003219131,
"grad_norm": 0.7281700372695923,
"learning_rate": 1.272489294495548e-05,
"loss": 1.2313,
"step": 840
},
{
"epoch": 0.773511151988963,
"grad_norm": 0.6761153340339661,
"learning_rate": 1.2627026186043422e-05,
"loss": 1.2118,
"step": 841
},
{
"epoch": 0.7744309036560129,
"grad_norm": 0.6714473366737366,
"learning_rate": 1.2529482808193749e-05,
"loss": 1.2265,
"step": 842
},
{
"epoch": 0.7753506553230628,
"grad_norm": 0.6813847422599792,
"learning_rate": 1.243226365543026e-05,
"loss": 1.2408,
"step": 843
},
{
"epoch": 0.7762704069901126,
"grad_norm": 0.6646814346313477,
"learning_rate": 1.233536956897136e-05,
"loss": 1.1755,
"step": 844
},
{
"epoch": 0.7771901586571626,
"grad_norm": 0.6985054612159729,
"learning_rate": 1.2238801387222714e-05,
"loss": 1.155,
"step": 845
},
{
"epoch": 0.7781099103242125,
"grad_norm": 0.6989067196846008,
"learning_rate": 1.2142559945769993e-05,
"loss": 1.1747,
"step": 846
},
{
"epoch": 0.7790296619912623,
"grad_norm": 0.8439406156539917,
"learning_rate": 1.2046646077371615e-05,
"loss": 1.1648,
"step": 847
},
{
"epoch": 0.7799494136583123,
"grad_norm": 0.8463898301124573,
"learning_rate": 1.1951060611951615e-05,
"loss": 1.1043,
"step": 848
},
{
"epoch": 0.7808691653253621,
"grad_norm": 0.9298079013824463,
"learning_rate": 1.185580437659241e-05,
"loss": 1.0148,
"step": 849
},
{
"epoch": 0.781788916992412,
"grad_norm": 1.260094404220581,
"learning_rate": 1.1760878195527642e-05,
"loss": 0.9653,
"step": 850
},
{
"epoch": 0.782708668659462,
"grad_norm": 1.080349326133728,
"learning_rate": 1.1666282890135082e-05,
"loss": 1.4973,
"step": 851
},
{
"epoch": 0.7836284203265118,
"grad_norm": 1.0160036087036133,
"learning_rate": 1.1572019278929458e-05,
"loss": 1.4835,
"step": 852
},
{
"epoch": 0.7845481719935617,
"grad_norm": 1.0411534309387207,
"learning_rate": 1.1478088177555441e-05,
"loss": 1.4388,
"step": 853
},
{
"epoch": 0.7854679236606117,
"grad_norm": 0.8667961359024048,
"learning_rate": 1.1384490398780562e-05,
"loss": 1.4592,
"step": 854
},
{
"epoch": 0.7863876753276615,
"grad_norm": 0.7747707366943359,
"learning_rate": 1.129122675248816e-05,
"loss": 1.4124,
"step": 855
},
{
"epoch": 0.7873074269947115,
"grad_norm": 0.9287156462669373,
"learning_rate": 1.1198298045670402e-05,
"loss": 1.4827,
"step": 856
},
{
"epoch": 0.7882271786617613,
"grad_norm": 1.0620696544647217,
"learning_rate": 1.1105705082421303e-05,
"loss": 1.4392,
"step": 857
},
{
"epoch": 0.7891469303288112,
"grad_norm": 1.099214792251587,
"learning_rate": 1.1013448663929705e-05,
"loss": 1.4812,
"step": 858
},
{
"epoch": 0.7900666819958612,
"grad_norm": 0.9307000637054443,
"learning_rate": 1.0921529588472445e-05,
"loss": 1.4939,
"step": 859
},
{
"epoch": 0.790986433662911,
"grad_norm": 0.7514574527740479,
"learning_rate": 1.0829948651407374e-05,
"loss": 1.4117,
"step": 860
},
{
"epoch": 0.7919061853299609,
"grad_norm": 0.6653128862380981,
"learning_rate": 1.0738706645166508e-05,
"loss": 1.4885,
"step": 861
},
{
"epoch": 0.7928259369970108,
"grad_norm": 0.7091299295425415,
"learning_rate": 1.0647804359249142e-05,
"loss": 1.4785,
"step": 862
},
{
"epoch": 0.7937456886640607,
"grad_norm": 0.7756891250610352,
"learning_rate": 1.0557242580215066e-05,
"loss": 1.499,
"step": 863
},
{
"epoch": 0.7946654403311106,
"grad_norm": 0.7706134915351868,
"learning_rate": 1.0467022091677691e-05,
"loss": 1.3828,
"step": 864
},
{
"epoch": 0.7955851919981605,
"grad_norm": 0.6963340044021606,
"learning_rate": 1.037714367429734e-05,
"loss": 1.415,
"step": 865
},
{
"epoch": 0.7965049436652104,
"grad_norm": 0.683591365814209,
"learning_rate": 1.0287608105774454e-05,
"loss": 1.4614,
"step": 866
},
{
"epoch": 0.7974246953322602,
"grad_norm": 0.6579643487930298,
"learning_rate": 1.019841616084286e-05,
"loss": 1.4229,
"step": 867
},
{
"epoch": 0.7983444469993102,
"grad_norm": 0.655005156993866,
"learning_rate": 1.0109568611263093e-05,
"loss": 1.3674,
"step": 868
},
{
"epoch": 0.7992641986663601,
"grad_norm": 0.6061270236968994,
"learning_rate": 1.0021066225815689e-05,
"loss": 1.4522,
"step": 869
},
{
"epoch": 0.8001839503334099,
"grad_norm": 0.6729152798652649,
"learning_rate": 9.932909770294541e-06,
"loss": 1.3665,
"step": 870
},
{
"epoch": 0.8011037020004599,
"grad_norm": 0.6866083145141602,
"learning_rate": 9.84510000750029e-06,
"loss": 1.341,
"step": 871
},
{
"epoch": 0.8020234536675098,
"grad_norm": 0.6673592329025269,
"learning_rate": 9.757637697233723e-06,
"loss": 1.4353,
"step": 872
},
{
"epoch": 0.8029432053345597,
"grad_norm": 0.6237421035766602,
"learning_rate": 9.670523596289138e-06,
"loss": 1.4077,
"step": 873
},
{
"epoch": 0.8038629570016096,
"grad_norm": 0.6855435967445374,
"learning_rate": 9.583758458447927e-06,
"loss": 1.4204,
"step": 874
},
{
"epoch": 0.8047827086686594,
"grad_norm": 0.6294743418693542,
"learning_rate": 9.497343034471895e-06,
"loss": 1.4306,
"step": 875
},
{
"epoch": 0.8057024603357094,
"grad_norm": 0.5920624136924744,
"learning_rate": 9.41127807209688e-06,
"loss": 1.4342,
"step": 876
},
{
"epoch": 0.8066222120027593,
"grad_norm": 0.5831781625747681,
"learning_rate": 9.325564316026237e-06,
"loss": 1.3581,
"step": 877
},
{
"epoch": 0.8075419636698091,
"grad_norm": 0.6441843509674072,
"learning_rate": 9.240202507924412e-06,
"loss": 1.3834,
"step": 878
},
{
"epoch": 0.8084617153368591,
"grad_norm": 0.8426811099052429,
"learning_rate": 9.155193386410465e-06,
"loss": 1.4059,
"step": 879
},
{
"epoch": 0.8093814670039089,
"grad_norm": 0.7335101366043091,
"learning_rate": 9.070537687051817e-06,
"loss": 1.3253,
"step": 880
},
{
"epoch": 0.8103012186709588,
"grad_norm": 0.6380130052566528,
"learning_rate": 8.986236142357708e-06,
"loss": 1.368,
"step": 881
},
{
"epoch": 0.8112209703380088,
"grad_norm": 0.6573965549468994,
"learning_rate": 8.902289481772997e-06,
"loss": 1.2883,
"step": 882
},
{
"epoch": 0.8121407220050586,
"grad_norm": 0.658258855342865,
"learning_rate": 8.818698431671773e-06,
"loss": 1.3068,
"step": 883
},
{
"epoch": 0.8130604736721085,
"grad_norm": 0.5781223773956299,
"learning_rate": 8.735463715351139e-06,
"loss": 1.2877,
"step": 884
},
{
"epoch": 0.8139802253391585,
"grad_norm": 0.7181767225265503,
"learning_rate": 8.652586053024836e-06,
"loss": 1.2878,
"step": 885
},
{
"epoch": 0.8148999770062083,
"grad_norm": 0.6754813194274902,
"learning_rate": 8.570066161817176e-06,
"loss": 1.2296,
"step": 886
},
{
"epoch": 0.8158197286732582,
"grad_norm": 0.655967652797699,
"learning_rate": 8.487904755756677e-06,
"loss": 1.2901,
"step": 887
},
{
"epoch": 0.8167394803403081,
"grad_norm": 0.6471141576766968,
"learning_rate": 8.406102545769989e-06,
"loss": 1.1674,
"step": 888
},
{
"epoch": 0.817659232007358,
"grad_norm": 0.615079939365387,
"learning_rate": 8.324660239675696e-06,
"loss": 1.2264,
"step": 889
},
{
"epoch": 0.818578983674408,
"grad_norm": 0.671017587184906,
"learning_rate": 8.243578542178226e-06,
"loss": 1.2746,
"step": 890
},
{
"epoch": 0.8194987353414578,
"grad_norm": 0.6405725479125977,
"learning_rate": 8.16285815486168e-06,
"loss": 1.26,
"step": 891
},
{
"epoch": 0.8204184870085077,
"grad_norm": 0.7116778492927551,
"learning_rate": 8.082499776183883e-06,
"loss": 1.2526,
"step": 892
},
{
"epoch": 0.8213382386755576,
"grad_norm": 0.6701216697692871,
"learning_rate": 8.002504101470204e-06,
"loss": 1.1883,
"step": 893
},
{
"epoch": 0.8222579903426075,
"grad_norm": 0.7331655025482178,
"learning_rate": 7.92287182290764e-06,
"loss": 1.2322,
"step": 894
},
{
"epoch": 0.8231777420096574,
"grad_norm": 0.7266958951950073,
"learning_rate": 7.843603629538804e-06,
"loss": 1.1902,
"step": 895
},
{
"epoch": 0.8240974936767073,
"grad_norm": 0.7101981043815613,
"learning_rate": 7.764700207255903e-06,
"loss": 1.0998,
"step": 896
},
{
"epoch": 0.8250172453437572,
"grad_norm": 0.7413234114646912,
"learning_rate": 7.686162238794897e-06,
"loss": 1.1047,
"step": 897
},
{
"epoch": 0.825936997010807,
"grad_norm": 0.8715062141418457,
"learning_rate": 7.607990403729526e-06,
"loss": 1.1146,
"step": 898
},
{
"epoch": 0.826856748677857,
"grad_norm": 0.9183730483055115,
"learning_rate": 7.5301853784654595e-06,
"loss": 1.0057,
"step": 899
},
{
"epoch": 0.8277765003449069,
"grad_norm": 1.0864571332931519,
"learning_rate": 7.452747836234392e-06,
"loss": 0.978,
"step": 900
},
{
"epoch": 0.8277765003449069,
"eval_loss": 1.3344465494155884,
"eval_runtime": 49.9437,
"eval_samples_per_second": 164.986,
"eval_steps_per_second": 20.623,
"step": 900
},
{
"epoch": 0.8286962520119567,
"grad_norm": 0.8766337037086487,
"learning_rate": 7.375678447088347e-06,
"loss": 1.5154,
"step": 901
},
{
"epoch": 0.8296160036790067,
"grad_norm": 0.8737375140190125,
"learning_rate": 7.298977877893687e-06,
"loss": 1.4447,
"step": 902
},
{
"epoch": 0.8305357553460566,
"grad_norm": 0.9431170225143433,
"learning_rate": 7.222646792325516e-06,
"loss": 1.4588,
"step": 903
},
{
"epoch": 0.8314555070131064,
"grad_norm": 0.9367691874504089,
"learning_rate": 7.146685850861851e-06,
"loss": 1.4205,
"step": 904
},
{
"epoch": 0.8323752586801564,
"grad_norm": 0.812258780002594,
"learning_rate": 7.071095710777925e-06,
"loss": 1.4177,
"step": 905
},
{
"epoch": 0.8332950103472062,
"grad_norm": 0.7034198045730591,
"learning_rate": 6.995877026140468e-06,
"loss": 1.4146,
"step": 906
},
{
"epoch": 0.8342147620142562,
"grad_norm": 0.7884905934333801,
"learning_rate": 6.921030447802146e-06,
"loss": 1.4616,
"step": 907
},
{
"epoch": 0.8351345136813061,
"grad_norm": 0.8112537860870361,
"learning_rate": 6.8465566233957945e-06,
"loss": 1.3435,
"step": 908
},
{
"epoch": 0.8360542653483559,
"grad_norm": 0.7667593955993652,
"learning_rate": 6.772456197328919e-06,
"loss": 1.464,
"step": 909
},
{
"epoch": 0.8369740170154059,
"grad_norm": 0.762269914150238,
"learning_rate": 6.698729810778065e-06,
"loss": 1.4473,
"step": 910
},
{
"epoch": 0.8378937686824557,
"grad_norm": 0.852673351764679,
"learning_rate": 6.625378101683316e-06,
"loss": 1.4215,
"step": 911
},
{
"epoch": 0.8388135203495056,
"grad_norm": 0.7429057359695435,
"learning_rate": 6.552401704742678e-06,
"loss": 1.4426,
"step": 912
},
{
"epoch": 0.8397332720165556,
"grad_norm": 0.6884950995445251,
"learning_rate": 6.4798012514067475e-06,
"loss": 1.4016,
"step": 913
},
{
"epoch": 0.8406530236836054,
"grad_norm": 0.6550636291503906,
"learning_rate": 6.407577369873069e-06,
"loss": 1.4468,
"step": 914
},
{
"epoch": 0.8415727753506553,
"grad_norm": 0.5837852358818054,
"learning_rate": 6.335730685080837e-06,
"loss": 1.4036,
"step": 915
},
{
"epoch": 0.8424925270177053,
"grad_norm": 0.5570608377456665,
"learning_rate": 6.264261818705419e-06,
"loss": 1.3483,
"step": 916
},
{
"epoch": 0.8434122786847551,
"grad_norm": 0.7056939005851746,
"learning_rate": 6.193171389152997e-06,
"loss": 1.3397,
"step": 917
},
{
"epoch": 0.844332030351805,
"grad_norm": 0.623600423336029,
"learning_rate": 6.122460011555187e-06,
"loss": 1.4304,
"step": 918
},
{
"epoch": 0.8452517820188549,
"grad_norm": 0.6012278199195862,
"learning_rate": 6.052128297763804e-06,
"loss": 1.3684,
"step": 919
},
{
"epoch": 0.8461715336859048,
"grad_norm": 0.582744836807251,
"learning_rate": 5.982176856345445e-06,
"loss": 1.4205,
"step": 920
},
{
"epoch": 0.8470912853529547,
"grad_norm": 0.5616964101791382,
"learning_rate": 5.912606292576283e-06,
"loss": 1.3209,
"step": 921
},
{
"epoch": 0.8480110370200046,
"grad_norm": 0.5474282503128052,
"learning_rate": 5.843417208436908e-06,
"loss": 1.4125,
"step": 922
},
{
"epoch": 0.8489307886870545,
"grad_norm": 0.533388614654541,
"learning_rate": 5.774610202606939e-06,
"loss": 1.4116,
"step": 923
},
{
"epoch": 0.8498505403541043,
"grad_norm": 0.5694478154182434,
"learning_rate": 5.706185870460018e-06,
"loss": 1.509,
"step": 924
},
{
"epoch": 0.8507702920211543,
"grad_norm": 0.5748287439346313,
"learning_rate": 5.638144804058559e-06,
"loss": 1.3528,
"step": 925
},
{
"epoch": 0.8516900436882042,
"grad_norm": 0.6192615032196045,
"learning_rate": 5.5704875921486655e-06,
"loss": 1.3098,
"step": 926
},
{
"epoch": 0.852609795355254,
"grad_norm": 0.6460704207420349,
"learning_rate": 5.503214820154978e-06,
"loss": 1.3839,
"step": 927
},
{
"epoch": 0.853529547022304,
"grad_norm": 0.620794951915741,
"learning_rate": 5.436327070175728e-06,
"loss": 1.4197,
"step": 928
},
{
"epoch": 0.8544492986893538,
"grad_norm": 0.6275455355644226,
"learning_rate": 5.369824920977568e-06,
"loss": 1.2891,
"step": 929
},
{
"epoch": 0.8553690503564038,
"grad_norm": 0.5857694149017334,
"learning_rate": 5.303708947990637e-06,
"loss": 1.3334,
"step": 930
},
{
"epoch": 0.8562888020234537,
"grad_norm": 0.6003711819648743,
"learning_rate": 5.2379797233035824e-06,
"loss": 1.395,
"step": 931
},
{
"epoch": 0.8572085536905035,
"grad_norm": 0.6273806095123291,
"learning_rate": 5.1726378156585816e-06,
"loss": 1.2778,
"step": 932
},
{
"epoch": 0.8581283053575535,
"grad_norm": 0.6366182565689087,
"learning_rate": 5.10768379044641e-06,
"loss": 1.3508,
"step": 933
},
{
"epoch": 0.8590480570246034,
"grad_norm": 0.6845077872276306,
"learning_rate": 5.043118209701631e-06,
"loss": 1.2843,
"step": 934
},
{
"epoch": 0.8599678086916532,
"grad_norm": 0.6707909107208252,
"learning_rate": 4.978941632097611e-06,
"loss": 1.3239,
"step": 935
},
{
"epoch": 0.8608875603587032,
"grad_norm": 0.7041406631469727,
"learning_rate": 4.9151546129417804e-06,
"loss": 1.2556,
"step": 936
},
{
"epoch": 0.861807312025753,
"grad_norm": 0.6683023571968079,
"learning_rate": 4.8517577041707955e-06,
"loss": 1.289,
"step": 937
},
{
"epoch": 0.8627270636928029,
"grad_norm": 0.6463608741760254,
"learning_rate": 4.788751454345763e-06,
"loss": 1.225,
"step": 938
},
{
"epoch": 0.8636468153598529,
"grad_norm": 0.6901978254318237,
"learning_rate": 4.726136408647464e-06,
"loss": 1.2177,
"step": 939
},
{
"epoch": 0.8645665670269027,
"grad_norm": 0.6679742336273193,
"learning_rate": 4.663913108871726e-06,
"loss": 1.2586,
"step": 940
},
{
"epoch": 0.8654863186939526,
"grad_norm": 0.6778735518455505,
"learning_rate": 4.60208209342462e-06,
"loss": 1.183,
"step": 941
},
{
"epoch": 0.8664060703610025,
"grad_norm": 0.6251430511474609,
"learning_rate": 4.540643897317887e-06,
"loss": 1.2523,
"step": 942
},
{
"epoch": 0.8673258220280524,
"grad_norm": 0.6894196271896362,
"learning_rate": 4.479599052164268e-06,
"loss": 1.183,
"step": 943
},
{
"epoch": 0.8682455736951024,
"grad_norm": 0.6839209198951721,
"learning_rate": 4.418948086172914e-06,
"loss": 1.1992,
"step": 944
},
{
"epoch": 0.8691653253621522,
"grad_norm": 0.7572594285011292,
"learning_rate": 4.35869152414482e-06,
"loss": 1.1731,
"step": 945
},
{
"epoch": 0.8700850770292021,
"grad_norm": 0.7147699594497681,
"learning_rate": 4.298829887468275e-06,
"loss": 1.1665,
"step": 946
},
{
"epoch": 0.8710048286962521,
"grad_norm": 0.7666782736778259,
"learning_rate": 4.2393636941143675e-06,
"loss": 1.149,
"step": 947
},
{
"epoch": 0.8719245803633019,
"grad_norm": 0.7843433022499084,
"learning_rate": 4.180293458632489e-06,
"loss": 1.0903,
"step": 948
},
{
"epoch": 0.8728443320303518,
"grad_norm": 0.958113431930542,
"learning_rate": 4.121619692145878e-06,
"loss": 1.118,
"step": 949
},
{
"epoch": 0.8737640836974017,
"grad_norm": 1.1284202337265015,
"learning_rate": 4.0633429023472e-06,
"loss": 0.9711,
"step": 950
},
{
"epoch": 0.8746838353644516,
"grad_norm": 0.8368450403213501,
"learning_rate": 4.005463593494163e-06,
"loss": 1.4433,
"step": 951
},
{
"epoch": 0.8756035870315015,
"grad_norm": 0.6638758182525635,
"learning_rate": 3.947982266405159e-06,
"loss": 1.4285,
"step": 952
},
{
"epoch": 0.8765233386985514,
"grad_norm": 0.8789987564086914,
"learning_rate": 3.890899418454913e-06,
"loss": 1.4212,
"step": 953
},
{
"epoch": 0.8774430903656013,
"grad_norm": 0.847080409526825,
"learning_rate": 3.834215543570191e-06,
"loss": 1.4124,
"step": 954
},
{
"epoch": 0.8783628420326511,
"grad_norm": 0.9596214890480042,
"learning_rate": 3.777931132225526e-06,
"loss": 1.3723,
"step": 955
},
{
"epoch": 0.8792825936997011,
"grad_norm": 0.9075647592544556,
"learning_rate": 3.72204667143895e-06,
"loss": 1.493,
"step": 956
},
{
"epoch": 0.880202345366751,
"grad_norm": 0.780536413192749,
"learning_rate": 3.6665626447678237e-06,
"loss": 1.4126,
"step": 957
},
{
"epoch": 0.8811220970338008,
"grad_norm": 0.6997688412666321,
"learning_rate": 3.611479532304618e-06,
"loss": 1.389,
"step": 958
},
{
"epoch": 0.8820418487008508,
"grad_norm": 0.620875358581543,
"learning_rate": 3.556797810672785e-06,
"loss": 1.3514,
"step": 959
},
{
"epoch": 0.8829616003679006,
"grad_norm": 0.6854445338249207,
"learning_rate": 3.5025179530225994e-06,
"loss": 1.4661,
"step": 960
},
{
"epoch": 0.8838813520349506,
"grad_norm": 0.7020566463470459,
"learning_rate": 3.4486404290271113e-06,
"loss": 1.4115,
"step": 961
},
{
"epoch": 0.8848011037020005,
"grad_norm": 0.6943616271018982,
"learning_rate": 3.3951657048780227e-06,
"loss": 1.4774,
"step": 962
},
{
"epoch": 0.8857208553690503,
"grad_norm": 0.7479608654975891,
"learning_rate": 3.3420942432817127e-06,
"loss": 1.4625,
"step": 963
},
{
"epoch": 0.8866406070361003,
"grad_norm": 0.7025173902511597,
"learning_rate": 3.289426503455201e-06,
"loss": 1.4019,
"step": 964
},
{
"epoch": 0.8875603587031502,
"grad_norm": 0.673040509223938,
"learning_rate": 3.2371629411221848e-06,
"loss": 1.4343,
"step": 965
},
{
"epoch": 0.8884801103702,
"grad_norm": 0.728541910648346,
"learning_rate": 3.185304008509077e-06,
"loss": 1.5093,
"step": 966
},
{
"epoch": 0.88939986203725,
"grad_norm": 0.6773453950881958,
"learning_rate": 3.133850154341139e-06,
"loss": 1.4002,
"step": 967
},
{
"epoch": 0.8903196137042998,
"grad_norm": 0.6363242864608765,
"learning_rate": 3.082801823838527e-06,
"loss": 1.4272,
"step": 968
},
{
"epoch": 0.8912393653713497,
"grad_norm": 0.5722589492797852,
"learning_rate": 3.032159458712508e-06,
"loss": 1.3557,
"step": 969
},
{
"epoch": 0.8921591170383997,
"grad_norm": 0.5886601209640503,
"learning_rate": 2.981923497161615e-06,
"loss": 1.3874,
"step": 970
},
{
"epoch": 0.8930788687054495,
"grad_norm": 0.6230661273002625,
"learning_rate": 2.9320943738678107e-06,
"loss": 1.3784,
"step": 971
},
{
"epoch": 0.8939986203724994,
"grad_norm": 0.5844275951385498,
"learning_rate": 2.882672519992824e-06,
"loss": 1.4153,
"step": 972
},
{
"epoch": 0.8949183720395493,
"grad_norm": 0.6414538621902466,
"learning_rate": 2.833658363174302e-06,
"loss": 1.3611,
"step": 973
},
{
"epoch": 0.8958381237065992,
"grad_norm": 0.6074815392494202,
"learning_rate": 2.785052327522214e-06,
"loss": 1.3607,
"step": 974
},
{
"epoch": 0.8967578753736491,
"grad_norm": 0.5938957333564758,
"learning_rate": 2.73685483361511e-06,
"loss": 1.3765,
"step": 975
},
{
"epoch": 0.897677627040699,
"grad_norm": 0.5869003534317017,
"learning_rate": 2.6890662984965232e-06,
"loss": 1.392,
"step": 976
},
{
"epoch": 0.8985973787077489,
"grad_norm": 0.5588386654853821,
"learning_rate": 2.6416871356713224e-06,
"loss": 1.3047,
"step": 977
},
{
"epoch": 0.8995171303747989,
"grad_norm": 0.5922186970710754,
"learning_rate": 2.594717755102205e-06,
"loss": 1.3928,
"step": 978
},
{
"epoch": 0.9004368820418487,
"grad_norm": 0.5693724155426025,
"learning_rate": 2.548158563206038e-06,
"loss": 1.347,
"step": 979
},
{
"epoch": 0.9013566337088986,
"grad_norm": 0.6117263436317444,
"learning_rate": 2.50200996285046e-06,
"loss": 1.3568,
"step": 980
},
{
"epoch": 0.9022763853759485,
"grad_norm": 0.5885259509086609,
"learning_rate": 2.4562723533503083e-06,
"loss": 1.4184,
"step": 981
},
{
"epoch": 0.9031961370429984,
"grad_norm": 0.6112256646156311,
"learning_rate": 2.4109461304642256e-06,
"loss": 1.3344,
"step": 982
},
{
"epoch": 0.9041158887100483,
"grad_norm": 0.6500238180160522,
"learning_rate": 2.366031686391168e-06,
"loss": 1.3372,
"step": 983
},
{
"epoch": 0.9050356403770982,
"grad_norm": 0.6185190677642822,
"learning_rate": 2.3215294097670925e-06,
"loss": 1.2273,
"step": 984
},
{
"epoch": 0.9059553920441481,
"grad_norm": 0.6523995995521545,
"learning_rate": 2.277439685661509e-06,
"loss": 1.2538,
"step": 985
},
{
"epoch": 0.9068751437111979,
"grad_norm": 0.7136437296867371,
"learning_rate": 2.2337628955742264e-06,
"loss": 1.3739,
"step": 986
},
{
"epoch": 0.9077948953782479,
"grad_norm": 0.6043840050697327,
"learning_rate": 2.1904994174319905e-06,
"loss": 1.2184,
"step": 987
},
{
"epoch": 0.9087146470452978,
"grad_norm": 0.6362565159797668,
"learning_rate": 2.1476496255852683e-06,
"loss": 1.1398,
"step": 988
},
{
"epoch": 0.9096343987123476,
"grad_norm": 0.6597528457641602,
"learning_rate": 2.1052138908049303e-06,
"loss": 1.1972,
"step": 989
},
{
"epoch": 0.9105541503793976,
"grad_norm": 0.679057240486145,
"learning_rate": 2.0631925802791606e-06,
"loss": 1.2572,
"step": 990
},
{
"epoch": 0.9114739020464474,
"grad_norm": 0.6650072336196899,
"learning_rate": 2.021586057610153e-06,
"loss": 1.1868,
"step": 991
},
{
"epoch": 0.9123936537134973,
"grad_norm": 0.6258329749107361,
"learning_rate": 1.9803946828110375e-06,
"loss": 1.209,
"step": 992
},
{
"epoch": 0.9133134053805473,
"grad_norm": 0.6818736791610718,
"learning_rate": 1.9396188123027737e-06,
"loss": 1.2432,
"step": 993
},
{
"epoch": 0.9142331570475971,
"grad_norm": 0.7300404906272888,
"learning_rate": 1.8992587989110134e-06,
"loss": 1.2549,
"step": 994
},
{
"epoch": 0.915152908714647,
"grad_norm": 0.7216602563858032,
"learning_rate": 1.8593149918630925e-06,
"loss": 1.1911,
"step": 995
},
{
"epoch": 0.916072660381697,
"grad_norm": 0.7485631704330444,
"learning_rate": 1.8197877367849947e-06,
"loss": 1.1326,
"step": 996
},
{
"epoch": 0.9169924120487468,
"grad_norm": 0.8240882158279419,
"learning_rate": 1.7806773756983642e-06,
"loss": 1.1299,
"step": 997
},
{
"epoch": 0.9179121637157968,
"grad_norm": 0.9147471189498901,
"learning_rate": 1.7419842470175195e-06,
"loss": 1.1179,
"step": 998
},
{
"epoch": 0.9188319153828466,
"grad_norm": 0.9360700249671936,
"learning_rate": 1.70370868554659e-06,
"loss": 1.0562,
"step": 999
},
{
"epoch": 0.9197516670498965,
"grad_norm": 1.174989104270935,
"learning_rate": 1.6658510224765333e-06,
"loss": 0.9121,
"step": 1000
},
{
"epoch": 0.9206714187169465,
"grad_norm": 0.8917292952537537,
"learning_rate": 1.6284115853823445e-06,
"loss": 1.4961,
"step": 1001
},
{
"epoch": 0.9215911703839963,
"grad_norm": 0.6432257890701294,
"learning_rate": 1.5913906982201742e-06,
"loss": 1.488,
"step": 1002
},
{
"epoch": 0.9225109220510462,
"grad_norm": 0.7689481973648071,
"learning_rate": 1.5547886813245539e-06,
"loss": 1.4265,
"step": 1003
},
{
"epoch": 0.9234306737180961,
"grad_norm": 0.7164052128791809,
"learning_rate": 1.5186058514055912e-06,
"loss": 1.4054,
"step": 1004
},
{
"epoch": 0.924350425385146,
"grad_norm": 0.8932134509086609,
"learning_rate": 1.4828425215462848e-06,
"loss": 1.403,
"step": 1005
},
{
"epoch": 0.9252701770521959,
"grad_norm": 0.8750680685043335,
"learning_rate": 1.447499001199748e-06,
"loss": 1.3956,
"step": 1006
},
{
"epoch": 0.9261899287192458,
"grad_norm": 0.7176107168197632,
"learning_rate": 1.4125755961865827e-06,
"loss": 1.4235,
"step": 1007
},
{
"epoch": 0.9271096803862957,
"grad_norm": 0.7204969525337219,
"learning_rate": 1.3780726086922103e-06,
"loss": 1.3773,
"step": 1008
},
{
"epoch": 0.9280294320533456,
"grad_norm": 0.6472546458244324,
"learning_rate": 1.3439903372642615e-06,
"loss": 1.4734,
"step": 1009
},
{
"epoch": 0.9289491837203955,
"grad_norm": 0.679750919342041,
"learning_rate": 1.3103290768099797e-06,
"loss": 1.5028,
"step": 1010
},
{
"epoch": 0.9298689353874454,
"grad_norm": 0.6491613984107971,
"learning_rate": 1.2770891185937105e-06,
"loss": 1.403,
"step": 1011
},
{
"epoch": 0.9307886870544952,
"grad_norm": 0.6442059278488159,
"learning_rate": 1.2442707502343332e-06,
"loss": 1.4124,
"step": 1012
},
{
"epoch": 0.9317084387215452,
"grad_norm": 0.5981637835502625,
"learning_rate": 1.2118742557027884e-06,
"loss": 1.459,
"step": 1013
},
{
"epoch": 0.9326281903885951,
"grad_norm": 0.5459677577018738,
"learning_rate": 1.1798999153196433e-06,
"loss": 1.4171,
"step": 1014
},
{
"epoch": 0.933547942055645,
"grad_norm": 0.5810702443122864,
"learning_rate": 1.1483480057526363e-06,
"loss": 1.3995,
"step": 1015
},
{
"epoch": 0.9344676937226949,
"grad_norm": 0.5334146022796631,
"learning_rate": 1.1172188000142802e-06,
"loss": 1.4004,
"step": 1016
},
{
"epoch": 0.9353874453897447,
"grad_norm": 0.5717347860336304,
"learning_rate": 1.0865125674595466e-06,
"loss": 1.3843,
"step": 1017
},
{
"epoch": 0.9363071970567947,
"grad_norm": 0.5235407948493958,
"learning_rate": 1.0562295737834737e-06,
"loss": 1.3558,
"step": 1018
},
{
"epoch": 0.9372269487238446,
"grad_norm": 0.5573782324790955,
"learning_rate": 1.026370081018907e-06,
"loss": 1.4016,
"step": 1019
},
{
"epoch": 0.9381467003908944,
"grad_norm": 0.5528433322906494,
"learning_rate": 9.969343475342285e-07,
"loss": 1.3298,
"step": 1020
},
{
"epoch": 0.9390664520579444,
"grad_norm": 0.573993980884552,
"learning_rate": 9.679226280310982e-07,
"loss": 1.3674,
"step": 1021
},
{
"epoch": 0.9399862037249943,
"grad_norm": 0.5446662902832031,
"learning_rate": 9.393351735422773e-07,
"loss": 1.3571,
"step": 1022
},
{
"epoch": 0.9409059553920441,
"grad_norm": 0.5892913937568665,
"learning_rate": 9.111722314294358e-07,
"loss": 1.3471,
"step": 1023
},
{
"epoch": 0.9418257070590941,
"grad_norm": 0.6275593638420105,
"learning_rate": 8.834340453810375e-07,
"loss": 1.3269,
"step": 1024
},
{
"epoch": 0.9427454587261439,
"grad_norm": 0.6341751217842102,
"learning_rate": 8.561208554101863e-07,
"loss": 1.3899,
"step": 1025
},
{
"epoch": 0.9436652103931938,
"grad_norm": 0.6272470951080322,
"learning_rate": 8.292328978526109e-07,
"loss": 1.3545,
"step": 1026
},
{
"epoch": 0.9445849620602438,
"grad_norm": 0.6651190519332886,
"learning_rate": 8.027704053645613e-07,
"loss": 1.3397,
"step": 1027
},
{
"epoch": 0.9455047137272936,
"grad_norm": 0.6504070162773132,
"learning_rate": 7.76733606920832e-07,
"loss": 1.3889,
"step": 1028
},
{
"epoch": 0.9464244653943436,
"grad_norm": 0.639077365398407,
"learning_rate": 7.511227278127697e-07,
"loss": 1.3159,
"step": 1029
},
{
"epoch": 0.9473442170613934,
"grad_norm": 0.685070812702179,
"learning_rate": 7.259379896463247e-07,
"loss": 1.312,
"step": 1030
},
{
"epoch": 0.9482639687284433,
"grad_norm": 0.705894947052002,
"learning_rate": 7.011796103401191e-07,
"loss": 1.325,
"step": 1031
},
{
"epoch": 0.9491837203954933,
"grad_norm": 0.6670310497283936,
"learning_rate": 6.768478041236037e-07,
"loss": 1.3582,
"step": 1032
},
{
"epoch": 0.9501034720625431,
"grad_norm": 0.7927426695823669,
"learning_rate": 6.529427815351374e-07,
"loss": 1.3767,
"step": 1033
},
{
"epoch": 0.951023223729593,
"grad_norm": 0.6605473160743713,
"learning_rate": 6.294647494202444e-07,
"loss": 1.2937,
"step": 1034
},
{
"epoch": 0.9519429753966429,
"grad_norm": 0.599684476852417,
"learning_rate": 6.064139109297485e-07,
"loss": 1.2802,
"step": 1035
},
{
"epoch": 0.9528627270636928,
"grad_norm": 0.6753445267677307,
"learning_rate": 5.837904655180748e-07,
"loss": 1.297,
"step": 1036
},
{
"epoch": 0.9537824787307427,
"grad_norm": 0.6682940125465393,
"learning_rate": 5.615946089414736e-07,
"loss": 1.3073,
"step": 1037
},
{
"epoch": 0.9547022303977926,
"grad_norm": 0.6744109392166138,
"learning_rate": 5.398265332563934e-07,
"loss": 1.1858,
"step": 1038
},
{
"epoch": 0.9556219820648425,
"grad_norm": 0.6154145002365112,
"learning_rate": 5.184864268177325e-07,
"loss": 1.1648,
"step": 1039
},
{
"epoch": 0.9565417337318924,
"grad_norm": 0.6836906671524048,
"learning_rate": 4.975744742772848e-07,
"loss": 1.2518,
"step": 1040
},
{
"epoch": 0.9574614853989423,
"grad_norm": 0.6386029720306396,
"learning_rate": 4.770908565820964e-07,
"loss": 1.2142,
"step": 1041
},
{
"epoch": 0.9583812370659922,
"grad_norm": 0.6528066992759705,
"learning_rate": 4.5703575097292286e-07,
"loss": 1.1931,
"step": 1042
},
{
"epoch": 0.959300988733042,
"grad_norm": 0.665433406829834,
"learning_rate": 4.37409330982691e-07,
"loss": 1.202,
"step": 1043
},
{
"epoch": 0.960220740400092,
"grad_norm": 0.7009211182594299,
"learning_rate": 4.182117664349783e-07,
"loss": 1.2317,
"step": 1044
},
{
"epoch": 0.9611404920671419,
"grad_norm": 0.7533866167068481,
"learning_rate": 3.99443223442586e-07,
"loss": 1.2128,
"step": 1045
},
{
"epoch": 0.9620602437341917,
"grad_norm": 0.7658700942993164,
"learning_rate": 3.8110386440605164e-07,
"loss": 1.1474,
"step": 1046
},
{
"epoch": 0.9629799954012417,
"grad_norm": 0.7905300259590149,
"learning_rate": 3.6319384801227763e-07,
"loss": 1.1075,
"step": 1047
},
{
"epoch": 0.9638997470682915,
"grad_norm": 0.9083186388015747,
"learning_rate": 3.4571332923314936e-07,
"loss": 1.1094,
"step": 1048
},
{
"epoch": 0.9648194987353415,
"grad_norm": 0.9923297762870789,
"learning_rate": 3.2866245932418604e-07,
"loss": 1.0341,
"step": 1049
},
{
"epoch": 0.9657392504023914,
"grad_norm": 1.4956581592559814,
"learning_rate": 3.120413858232474e-07,
"loss": 0.9236,
"step": 1050
},
{
"epoch": 0.9657392504023914,
"eval_loss": 1.3224910497665405,
"eval_runtime": 49.9198,
"eval_samples_per_second": 165.065,
"eval_steps_per_second": 20.633,
"step": 1050
}
],
"logging_steps": 1,
"max_steps": 1088,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 150,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.9092013631668224e+17,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}