{ "best_metric": 1.3224910497665405, "best_model_checkpoint": "miner_id_24/checkpoint-1050", "epoch": 0.9657392504023914, "eval_steps": 150, "global_step": 1050, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009197516670498966, "grad_norm": 6.181552886962891, "learning_rate": 5e-06, "loss": 6.0754, "step": 1 }, { "epoch": 0.0009197516670498966, "eval_loss": 4.431160926818848, "eval_runtime": 49.8075, "eval_samples_per_second": 165.437, "eval_steps_per_second": 20.68, "step": 1 }, { "epoch": 0.0018395033340997931, "grad_norm": 6.504417896270752, "learning_rate": 1e-05, "loss": 5.7192, "step": 2 }, { "epoch": 0.0027592550011496897, "grad_norm": 7.00899076461792, "learning_rate": 1.5e-05, "loss": 5.533, "step": 3 }, { "epoch": 0.0036790066681995862, "grad_norm": 6.67568826675415, "learning_rate": 2e-05, "loss": 5.0143, "step": 4 }, { "epoch": 0.004598758335249482, "grad_norm": 6.313548564910889, "learning_rate": 2.5e-05, "loss": 4.6726, "step": 5 }, { "epoch": 0.005518510002299379, "grad_norm": 5.2927422523498535, "learning_rate": 3e-05, "loss": 4.6566, "step": 6 }, { "epoch": 0.0064382616693492755, "grad_norm": 4.771329879760742, "learning_rate": 3.5e-05, "loss": 4.3112, "step": 7 }, { "epoch": 0.0073580133363991725, "grad_norm": 3.6339666843414307, "learning_rate": 4e-05, "loss": 4.1199, "step": 8 }, { "epoch": 0.008277765003449069, "grad_norm": 2.8113648891448975, "learning_rate": 4.5e-05, "loss": 3.9369, "step": 9 }, { "epoch": 0.009197516670498965, "grad_norm": 2.2301437854766846, "learning_rate": 5e-05, "loss": 3.7798, "step": 10 }, { "epoch": 0.010117268337548863, "grad_norm": 2.4432830810546875, "learning_rate": 5.500000000000001e-05, "loss": 3.8405, "step": 11 }, { "epoch": 0.011037020004598759, "grad_norm": 1.870229721069336, "learning_rate": 6e-05, "loss": 3.6539, "step": 12 }, { "epoch": 0.011956771671648655, "grad_norm": 1.9459682703018188, "learning_rate": 6.500000000000001e-05, "loss": 3.5456, "step": 13 }, { "epoch": 0.012876523338698551, "grad_norm": 1.4028608798980713, "learning_rate": 7e-05, "loss": 3.4191, "step": 14 }, { "epoch": 0.013796275005748447, "grad_norm": 1.9811638593673706, "learning_rate": 7.500000000000001e-05, "loss": 3.6083, "step": 15 }, { "epoch": 0.014716026672798345, "grad_norm": 1.952579379081726, "learning_rate": 8e-05, "loss": 3.4243, "step": 16 }, { "epoch": 0.01563577833984824, "grad_norm": 1.5935711860656738, "learning_rate": 8.5e-05, "loss": 3.3783, "step": 17 }, { "epoch": 0.016555530006898137, "grad_norm": 1.475130558013916, "learning_rate": 9e-05, "loss": 3.3419, "step": 18 }, { "epoch": 0.017475281673948035, "grad_norm": 1.465334177017212, "learning_rate": 9.5e-05, "loss": 3.2841, "step": 19 }, { "epoch": 0.01839503334099793, "grad_norm": 1.5258549451828003, "learning_rate": 0.0001, "loss": 3.1315, "step": 20 }, { "epoch": 0.019314785008047827, "grad_norm": 1.2697194814682007, "learning_rate": 9.999978367986987e-05, "loss": 3.1049, "step": 21 }, { "epoch": 0.020234536675097725, "grad_norm": 1.0417594909667969, "learning_rate": 9.999913472135125e-05, "loss": 3.0702, "step": 22 }, { "epoch": 0.02115428834214762, "grad_norm": 0.8249285221099854, "learning_rate": 9.999805313005946e-05, "loss": 3.0126, "step": 23 }, { "epoch": 0.022074040009197517, "grad_norm": 0.8856204152107239, "learning_rate": 9.99965389153533e-05, "loss": 2.936, "step": 24 }, { "epoch": 0.022993791676247412, "grad_norm": 1.0896774530410767, "learning_rate": 9.999459209033495e-05, "loss": 3.0088, "step": 25 }, { "epoch": 0.02391354334329731, "grad_norm": 0.878311276435852, "learning_rate": 9.999221267184993e-05, "loss": 2.8434, "step": 26 }, { "epoch": 0.024833295010347207, "grad_norm": 0.6598113179206848, "learning_rate": 9.998940068048688e-05, "loss": 2.7397, "step": 27 }, { "epoch": 0.025753046677397102, "grad_norm": 0.8144488334655762, "learning_rate": 9.998615614057742e-05, "loss": 2.7315, "step": 28 }, { "epoch": 0.026672798344447, "grad_norm": 0.8650857210159302, "learning_rate": 9.998247908019593e-05, "loss": 2.8126, "step": 29 }, { "epoch": 0.027592550011496894, "grad_norm": 0.6536254286766052, "learning_rate": 9.997836953115926e-05, "loss": 2.7479, "step": 30 }, { "epoch": 0.028512301678546792, "grad_norm": 1.0240334272384644, "learning_rate": 9.997382752902657e-05, "loss": 2.7575, "step": 31 }, { "epoch": 0.02943205334559669, "grad_norm": 0.6431864500045776, "learning_rate": 9.996885311309891e-05, "loss": 2.6497, "step": 32 }, { "epoch": 0.030351805012646584, "grad_norm": 0.6775922179222107, "learning_rate": 9.996344632641894e-05, "loss": 2.6301, "step": 33 }, { "epoch": 0.03127155667969648, "grad_norm": 0.9252532124519348, "learning_rate": 9.995760721577052e-05, "loss": 2.6123, "step": 34 }, { "epoch": 0.032191308346746376, "grad_norm": 0.6126474738121033, "learning_rate": 9.995133583167832e-05, "loss": 2.5311, "step": 35 }, { "epoch": 0.033111060013796274, "grad_norm": 0.7896717190742493, "learning_rate": 9.994463222840746e-05, "loss": 2.5242, "step": 36 }, { "epoch": 0.03403081168084617, "grad_norm": 0.6513295769691467, "learning_rate": 9.993749646396286e-05, "loss": 2.4802, "step": 37 }, { "epoch": 0.03495056334789607, "grad_norm": 0.6320262551307678, "learning_rate": 9.992992860008892e-05, "loss": 2.5144, "step": 38 }, { "epoch": 0.03587031501494597, "grad_norm": 0.9524784684181213, "learning_rate": 9.992192870226889e-05, "loss": 2.5425, "step": 39 }, { "epoch": 0.03679006668199586, "grad_norm": 0.5857168436050415, "learning_rate": 9.991349683972434e-05, "loss": 2.447, "step": 40 }, { "epoch": 0.03770981834904576, "grad_norm": 0.7533925175666809, "learning_rate": 9.990463308541451e-05, "loss": 2.3431, "step": 41 }, { "epoch": 0.038629570016095655, "grad_norm": 0.5700300931930542, "learning_rate": 9.989533751603577e-05, "loss": 2.3499, "step": 42 }, { "epoch": 0.03954932168314555, "grad_norm": 0.8808962106704712, "learning_rate": 9.988561021202083e-05, "loss": 2.3962, "step": 43 }, { "epoch": 0.04046907335019545, "grad_norm": 0.7045830488204956, "learning_rate": 9.987545125753819e-05, "loss": 2.2948, "step": 44 }, { "epoch": 0.04138882501724534, "grad_norm": 0.8472001552581787, "learning_rate": 9.986486074049131e-05, "loss": 2.3045, "step": 45 }, { "epoch": 0.04230857668429524, "grad_norm": 0.7173092365264893, "learning_rate": 9.985383875251783e-05, "loss": 2.2929, "step": 46 }, { "epoch": 0.04322832835134514, "grad_norm": 0.7962790727615356, "learning_rate": 9.984238538898891e-05, "loss": 2.327, "step": 47 }, { "epoch": 0.044148080018395035, "grad_norm": 0.8624216318130493, "learning_rate": 9.983050074900824e-05, "loss": 2.187, "step": 48 }, { "epoch": 0.04506783168544493, "grad_norm": 0.917926549911499, "learning_rate": 9.98181849354113e-05, "loss": 2.1946, "step": 49 }, { "epoch": 0.045987583352494824, "grad_norm": 0.8663224577903748, "learning_rate": 9.980543805476446e-05, "loss": 2.0988, "step": 50 }, { "epoch": 0.04690733501954472, "grad_norm": 11.713833808898926, "learning_rate": 9.979226021736396e-05, "loss": 4.3627, "step": 51 }, { "epoch": 0.04782708668659462, "grad_norm": 5.917041301727295, "learning_rate": 9.977865153723507e-05, "loss": 3.7012, "step": 52 }, { "epoch": 0.04874683835364452, "grad_norm": 3.241976499557495, "learning_rate": 9.976461213213104e-05, "loss": 3.1752, "step": 53 }, { "epoch": 0.049666590020694415, "grad_norm": 3.6020805835723877, "learning_rate": 9.975014212353213e-05, "loss": 3.019, "step": 54 }, { "epoch": 0.050586341687744306, "grad_norm": 3.3399133682250977, "learning_rate": 9.973524163664447e-05, "loss": 2.7453, "step": 55 }, { "epoch": 0.051506093354794204, "grad_norm": 2.34346604347229, "learning_rate": 9.97199108003991e-05, "loss": 2.5133, "step": 56 }, { "epoch": 0.0524258450218441, "grad_norm": 1.2596639394760132, "learning_rate": 9.970414974745076e-05, "loss": 2.5255, "step": 57 }, { "epoch": 0.053345596688894, "grad_norm": 2.061197519302368, "learning_rate": 9.968795861417676e-05, "loss": 2.4012, "step": 58 }, { "epoch": 0.0542653483559439, "grad_norm": 2.119333028793335, "learning_rate": 9.967133754067582e-05, "loss": 2.3668, "step": 59 }, { "epoch": 0.05518510002299379, "grad_norm": 1.2170815467834473, "learning_rate": 9.965428667076686e-05, "loss": 2.4343, "step": 60 }, { "epoch": 0.056104851690043686, "grad_norm": 1.0711098909378052, "learning_rate": 9.963680615198773e-05, "loss": 2.3052, "step": 61 }, { "epoch": 0.057024603357093584, "grad_norm": 1.64667809009552, "learning_rate": 9.961889613559395e-05, "loss": 2.3781, "step": 62 }, { "epoch": 0.05794435502414348, "grad_norm": 1.2105283737182617, "learning_rate": 9.960055677655742e-05, "loss": 2.357, "step": 63 }, { "epoch": 0.05886410669119338, "grad_norm": 1.0943785905838013, "learning_rate": 9.958178823356503e-05, "loss": 2.2903, "step": 64 }, { "epoch": 0.05978385835824328, "grad_norm": 1.5415120124816895, "learning_rate": 9.956259066901733e-05, "loss": 2.3312, "step": 65 }, { "epoch": 0.06070361002529317, "grad_norm": 0.8917611837387085, "learning_rate": 9.954296424902708e-05, "loss": 2.32, "step": 66 }, { "epoch": 0.061623361692343066, "grad_norm": 0.7154043316841125, "learning_rate": 9.952290914341791e-05, "loss": 2.24, "step": 67 }, { "epoch": 0.06254311335939296, "grad_norm": 1.1616435050964355, "learning_rate": 9.950242552572271e-05, "loss": 2.2741, "step": 68 }, { "epoch": 0.06346286502644286, "grad_norm": 0.7844848036766052, "learning_rate": 9.948151357318228e-05, "loss": 2.2333, "step": 69 }, { "epoch": 0.06438261669349275, "grad_norm": 0.7282043695449829, "learning_rate": 9.946017346674361e-05, "loss": 2.1664, "step": 70 }, { "epoch": 0.06530236836054265, "grad_norm": 0.7888442873954773, "learning_rate": 9.943840539105854e-05, "loss": 2.2735, "step": 71 }, { "epoch": 0.06622212002759255, "grad_norm": 0.5766344666481018, "learning_rate": 9.941620953448194e-05, "loss": 2.1517, "step": 72 }, { "epoch": 0.06714187169464245, "grad_norm": 0.7196112871170044, "learning_rate": 9.939358608907026e-05, "loss": 2.1162, "step": 73 }, { "epoch": 0.06806162336169234, "grad_norm": 0.7088760137557983, "learning_rate": 9.937053525057977e-05, "loss": 2.1777, "step": 74 }, { "epoch": 0.06898137502874224, "grad_norm": 0.5653288960456848, "learning_rate": 9.934705721846487e-05, "loss": 2.1762, "step": 75 }, { "epoch": 0.06990112669579214, "grad_norm": 0.8287534117698669, "learning_rate": 9.93231521958764e-05, "loss": 2.1711, "step": 76 }, { "epoch": 0.07082087836284204, "grad_norm": 0.4657200872898102, "learning_rate": 9.929882038965989e-05, "loss": 2.1953, "step": 77 }, { "epoch": 0.07174063002989194, "grad_norm": 0.47897377610206604, "learning_rate": 9.927406201035368e-05, "loss": 2.1214, "step": 78 }, { "epoch": 0.07266038169694182, "grad_norm": 0.678236722946167, "learning_rate": 9.924887727218724e-05, "loss": 2.0763, "step": 79 }, { "epoch": 0.07358013336399172, "grad_norm": 0.4475807249546051, "learning_rate": 9.922326639307917e-05, "loss": 2.16, "step": 80 }, { "epoch": 0.07449988503104162, "grad_norm": 0.49449393153190613, "learning_rate": 9.919722959463544e-05, "loss": 2.1382, "step": 81 }, { "epoch": 0.07541963669809151, "grad_norm": 0.5139850974082947, "learning_rate": 9.917076710214739e-05, "loss": 2.1543, "step": 82 }, { "epoch": 0.07633938836514141, "grad_norm": 0.5776795148849487, "learning_rate": 9.914387914458982e-05, "loss": 2.157, "step": 83 }, { "epoch": 0.07725914003219131, "grad_norm": 0.573421061038971, "learning_rate": 9.911656595461898e-05, "loss": 2.0451, "step": 84 }, { "epoch": 0.07817889169924121, "grad_norm": 0.6673893332481384, "learning_rate": 9.908882776857056e-05, "loss": 2.11, "step": 85 }, { "epoch": 0.0790986433662911, "grad_norm": 0.5322157740592957, "learning_rate": 9.906066482645772e-05, "loss": 2.0667, "step": 86 }, { "epoch": 0.080018395033341, "grad_norm": 0.7134078741073608, "learning_rate": 9.903207737196891e-05, "loss": 2.0217, "step": 87 }, { "epoch": 0.0809381467003909, "grad_norm": 0.5911161303520203, "learning_rate": 9.900306565246578e-05, "loss": 2.0574, "step": 88 }, { "epoch": 0.08185789836744078, "grad_norm": 0.6985921263694763, "learning_rate": 9.897362991898109e-05, "loss": 2.0796, "step": 89 }, { "epoch": 0.08277765003449068, "grad_norm": 0.5797624588012695, "learning_rate": 9.894377042621655e-05, "loss": 2.0293, "step": 90 }, { "epoch": 0.08369740170154058, "grad_norm": 0.6441212296485901, "learning_rate": 9.891348743254046e-05, "loss": 2.0006, "step": 91 }, { "epoch": 0.08461715336859048, "grad_norm": 0.5719017386436462, "learning_rate": 9.888278119998573e-05, "loss": 1.9847, "step": 92 }, { "epoch": 0.08553690503564038, "grad_norm": 0.618574857711792, "learning_rate": 9.885165199424738e-05, "loss": 1.9194, "step": 93 }, { "epoch": 0.08645665670269027, "grad_norm": 0.8214313983917236, "learning_rate": 9.882010008468036e-05, "loss": 1.8845, "step": 94 }, { "epoch": 0.08737640836974017, "grad_norm": 0.6314259767532349, "learning_rate": 9.878812574429721e-05, "loss": 1.8474, "step": 95 }, { "epoch": 0.08829616003679007, "grad_norm": 0.6584024429321289, "learning_rate": 9.875572924976568e-05, "loss": 1.8843, "step": 96 }, { "epoch": 0.08921591170383997, "grad_norm": 0.7131389379501343, "learning_rate": 9.87229108814063e-05, "loss": 1.9198, "step": 97 }, { "epoch": 0.09013566337088987, "grad_norm": 0.824288010597229, "learning_rate": 9.868967092319003e-05, "loss": 1.8658, "step": 98 }, { "epoch": 0.09105541503793976, "grad_norm": 0.7455874681472778, "learning_rate": 9.865600966273575e-05, "loss": 1.7975, "step": 99 }, { "epoch": 0.09197516670498965, "grad_norm": 1.2152295112609863, "learning_rate": 9.86219273913078e-05, "loss": 1.7226, "step": 100 }, { "epoch": 0.09289491837203954, "grad_norm": 5.640716552734375, "learning_rate": 9.858742440381343e-05, "loss": 3.5625, "step": 101 }, { "epoch": 0.09381467003908944, "grad_norm": 3.7876381874084473, "learning_rate": 9.855250099880025e-05, "loss": 3.0309, "step": 102 }, { "epoch": 0.09473442170613934, "grad_norm": 2.426966428756714, "learning_rate": 9.851715747845373e-05, "loss": 2.6085, "step": 103 }, { "epoch": 0.09565417337318924, "grad_norm": 2.368666172027588, "learning_rate": 9.848139414859441e-05, "loss": 2.457, "step": 104 }, { "epoch": 0.09657392504023914, "grad_norm": 1.607815146446228, "learning_rate": 9.844521131867546e-05, "loss": 2.2837, "step": 105 }, { "epoch": 0.09749367670728903, "grad_norm": 1.2020126581192017, "learning_rate": 9.840860930177983e-05, "loss": 2.1918, "step": 106 }, { "epoch": 0.09841342837433893, "grad_norm": 1.469667673110962, "learning_rate": 9.837158841461766e-05, "loss": 2.1856, "step": 107 }, { "epoch": 0.09933318004138883, "grad_norm": 1.2101978063583374, "learning_rate": 9.833414897752347e-05, "loss": 2.1572, "step": 108 }, { "epoch": 0.10025293170843873, "grad_norm": 1.0145184993743896, "learning_rate": 9.829629131445342e-05, "loss": 2.0651, "step": 109 }, { "epoch": 0.10117268337548861, "grad_norm": 1.0942986011505127, "learning_rate": 9.825801575298248e-05, "loss": 2.1006, "step": 110 }, { "epoch": 0.10209243504253851, "grad_norm": 0.812549889087677, "learning_rate": 9.821932262430165e-05, "loss": 2.0787, "step": 111 }, { "epoch": 0.10301218670958841, "grad_norm": 0.9913772344589233, "learning_rate": 9.8180212263215e-05, "loss": 2.0555, "step": 112 }, { "epoch": 0.1039319383766383, "grad_norm": 0.7573890686035156, "learning_rate": 9.814068500813692e-05, "loss": 2.022, "step": 113 }, { "epoch": 0.1048516900436882, "grad_norm": 0.876980185508728, "learning_rate": 9.8100741201089e-05, "loss": 2.0677, "step": 114 }, { "epoch": 0.1057714417107381, "grad_norm": 0.8768622875213623, "learning_rate": 9.806038118769723e-05, "loss": 2.0766, "step": 115 }, { "epoch": 0.106691193377788, "grad_norm": 0.6824678182601929, "learning_rate": 9.801960531718896e-05, "loss": 2.1323, "step": 116 }, { "epoch": 0.1076109450448379, "grad_norm": 0.9467669129371643, "learning_rate": 9.797841394238986e-05, "loss": 1.9414, "step": 117 }, { "epoch": 0.1085306967118878, "grad_norm": 0.5850769281387329, "learning_rate": 9.793680741972084e-05, "loss": 1.9249, "step": 118 }, { "epoch": 0.10945044837893769, "grad_norm": 0.8185686469078064, "learning_rate": 9.789478610919507e-05, "loss": 1.9541, "step": 119 }, { "epoch": 0.11037020004598758, "grad_norm": 0.9609946608543396, "learning_rate": 9.785235037441474e-05, "loss": 1.943, "step": 120 }, { "epoch": 0.11128995171303747, "grad_norm": 0.6438754796981812, "learning_rate": 9.780950058256802e-05, "loss": 1.9613, "step": 121 }, { "epoch": 0.11220970338008737, "grad_norm": 1.0584321022033691, "learning_rate": 9.776623710442579e-05, "loss": 1.9652, "step": 122 }, { "epoch": 0.11312945504713727, "grad_norm": 0.5727084279060364, "learning_rate": 9.772256031433849e-05, "loss": 1.9769, "step": 123 }, { "epoch": 0.11404920671418717, "grad_norm": 0.8819255828857422, "learning_rate": 9.767847059023291e-05, "loss": 2.0024, "step": 124 }, { "epoch": 0.11496895838123707, "grad_norm": 0.8120801448822021, "learning_rate": 9.763396831360884e-05, "loss": 1.9066, "step": 125 }, { "epoch": 0.11588871004828696, "grad_norm": 0.5545021891593933, "learning_rate": 9.758905386953579e-05, "loss": 1.9619, "step": 126 }, { "epoch": 0.11680846171533686, "grad_norm": 1.0289326906204224, "learning_rate": 9.754372764664969e-05, "loss": 1.9098, "step": 127 }, { "epoch": 0.11772821338238676, "grad_norm": 0.609516441822052, "learning_rate": 9.749799003714954e-05, "loss": 1.9147, "step": 128 }, { "epoch": 0.11864796504943666, "grad_norm": 0.7941620945930481, "learning_rate": 9.745184143679397e-05, "loss": 1.8968, "step": 129 }, { "epoch": 0.11956771671648656, "grad_norm": 0.787964940071106, "learning_rate": 9.74052822448978e-05, "loss": 1.9712, "step": 130 }, { "epoch": 0.12048746838353644, "grad_norm": 0.730323314666748, "learning_rate": 9.735831286432868e-05, "loss": 1.8993, "step": 131 }, { "epoch": 0.12140722005058634, "grad_norm": 0.8297889232635498, "learning_rate": 9.731093370150349e-05, "loss": 1.9682, "step": 132 }, { "epoch": 0.12232697171763623, "grad_norm": 0.768775463104248, "learning_rate": 9.72631451663849e-05, "loss": 1.8542, "step": 133 }, { "epoch": 0.12324672338468613, "grad_norm": 0.7137448787689209, "learning_rate": 9.721494767247779e-05, "loss": 1.8801, "step": 134 }, { "epoch": 0.12416647505173603, "grad_norm": 0.6385506987571716, "learning_rate": 9.716634163682569e-05, "loss": 1.8384, "step": 135 }, { "epoch": 0.12508622671878591, "grad_norm": 0.7410357594490051, "learning_rate": 9.71173274800072e-05, "loss": 1.8761, "step": 136 }, { "epoch": 0.12600597838583583, "grad_norm": 0.7702000737190247, "learning_rate": 9.706790562613219e-05, "loss": 1.8183, "step": 137 }, { "epoch": 0.1269257300528857, "grad_norm": 0.6795453429222107, "learning_rate": 9.701807650283839e-05, "loss": 1.8434, "step": 138 }, { "epoch": 0.12784548171993562, "grad_norm": 0.8809398412704468, "learning_rate": 9.696784054128749e-05, "loss": 1.8462, "step": 139 }, { "epoch": 0.1287652333869855, "grad_norm": 0.9881577491760254, "learning_rate": 9.691719817616147e-05, "loss": 1.7828, "step": 140 }, { "epoch": 0.12968498505403542, "grad_norm": 0.9603993892669678, "learning_rate": 9.686614984565887e-05, "loss": 1.8768, "step": 141 }, { "epoch": 0.1306047367210853, "grad_norm": 1.0421313047409058, "learning_rate": 9.681469599149092e-05, "loss": 1.8302, "step": 142 }, { "epoch": 0.1315244883881352, "grad_norm": 0.8529607653617859, "learning_rate": 9.676283705887783e-05, "loss": 1.7531, "step": 143 }, { "epoch": 0.1324442400551851, "grad_norm": 0.8817620277404785, "learning_rate": 9.67105734965448e-05, "loss": 1.7358, "step": 144 }, { "epoch": 0.133363991722235, "grad_norm": 0.9506654739379883, "learning_rate": 9.665790575671829e-05, "loss": 1.7789, "step": 145 }, { "epoch": 0.1342837433892849, "grad_norm": 1.1102913618087769, "learning_rate": 9.660483429512199e-05, "loss": 1.7401, "step": 146 }, { "epoch": 0.13520349505633478, "grad_norm": 0.7556246519088745, "learning_rate": 9.65513595709729e-05, "loss": 1.728, "step": 147 }, { "epoch": 0.1361232467233847, "grad_norm": 1.1163665056228638, "learning_rate": 9.64974820469774e-05, "loss": 1.6618, "step": 148 }, { "epoch": 0.13704299839043457, "grad_norm": 0.9814196228981018, "learning_rate": 9.644320218932722e-05, "loss": 1.616, "step": 149 }, { "epoch": 0.13796275005748448, "grad_norm": 1.2995212078094482, "learning_rate": 9.638852046769539e-05, "loss": 1.6275, "step": 150 }, { "epoch": 0.13796275005748448, "eval_loss": 1.9198498725891113, "eval_runtime": 50.0535, "eval_samples_per_second": 164.624, "eval_steps_per_second": 20.578, "step": 150 }, { "epoch": 0.13888250172453437, "grad_norm": 3.668370485305786, "learning_rate": 9.633343735523219e-05, "loss": 2.841, "step": 151 }, { "epoch": 0.13980225339158428, "grad_norm": 2.5073230266571045, "learning_rate": 9.627795332856107e-05, "loss": 2.3706, "step": 152 }, { "epoch": 0.14072200505863416, "grad_norm": 1.542073130607605, "learning_rate": 9.622206886777448e-05, "loss": 2.1699, "step": 153 }, { "epoch": 0.14164175672568408, "grad_norm": 1.3604127168655396, "learning_rate": 9.616578445642981e-05, "loss": 1.9859, "step": 154 }, { "epoch": 0.14256150839273396, "grad_norm": 1.1186628341674805, "learning_rate": 9.61091005815451e-05, "loss": 1.9205, "step": 155 }, { "epoch": 0.14348126005978387, "grad_norm": 1.1308863162994385, "learning_rate": 9.605201773359485e-05, "loss": 1.9819, "step": 156 }, { "epoch": 0.14440101172683376, "grad_norm": 1.0661953687667847, "learning_rate": 9.599453640650585e-05, "loss": 1.9109, "step": 157 }, { "epoch": 0.14532076339388364, "grad_norm": 0.7912338376045227, "learning_rate": 9.59366570976528e-05, "loss": 1.9331, "step": 158 }, { "epoch": 0.14624051506093355, "grad_norm": 0.9056004881858826, "learning_rate": 9.587838030785413e-05, "loss": 1.9323, "step": 159 }, { "epoch": 0.14716026672798344, "grad_norm": 1.0585856437683105, "learning_rate": 9.581970654136751e-05, "loss": 1.9443, "step": 160 }, { "epoch": 0.14808001839503335, "grad_norm": 1.0043240785598755, "learning_rate": 9.576063630588563e-05, "loss": 1.8468, "step": 161 }, { "epoch": 0.14899977006208323, "grad_norm": 0.9187436699867249, "learning_rate": 9.570117011253174e-05, "loss": 1.9558, "step": 162 }, { "epoch": 0.14991952172913314, "grad_norm": 0.862158477306366, "learning_rate": 9.56413084758552e-05, "loss": 1.851, "step": 163 }, { "epoch": 0.15083927339618303, "grad_norm": 1.04788076877594, "learning_rate": 9.55810519138271e-05, "loss": 1.884, "step": 164 }, { "epoch": 0.15175902506323294, "grad_norm": 0.807015597820282, "learning_rate": 9.552040094783574e-05, "loss": 1.8688, "step": 165 }, { "epoch": 0.15267877673028282, "grad_norm": 0.8749469518661499, "learning_rate": 9.545935610268211e-05, "loss": 1.8487, "step": 166 }, { "epoch": 0.1535985283973327, "grad_norm": 0.7388503551483154, "learning_rate": 9.539791790657538e-05, "loss": 1.8447, "step": 167 }, { "epoch": 0.15451828006438262, "grad_norm": 0.8812807202339172, "learning_rate": 9.533608689112827e-05, "loss": 1.8848, "step": 168 }, { "epoch": 0.1554380317314325, "grad_norm": 0.6926305890083313, "learning_rate": 9.527386359135253e-05, "loss": 1.824, "step": 169 }, { "epoch": 0.15635778339848241, "grad_norm": 0.7211126089096069, "learning_rate": 9.521124854565425e-05, "loss": 1.8291, "step": 170 }, { "epoch": 0.1572775350655323, "grad_norm": 0.717591404914856, "learning_rate": 9.514824229582921e-05, "loss": 1.8463, "step": 171 }, { "epoch": 0.1581972867325822, "grad_norm": 0.5658002495765686, "learning_rate": 9.508484538705824e-05, "loss": 1.8864, "step": 172 }, { "epoch": 0.1591170383996321, "grad_norm": 0.8670650720596313, "learning_rate": 9.50210583679024e-05, "loss": 1.8437, "step": 173 }, { "epoch": 0.160036790066682, "grad_norm": 0.6736385822296143, "learning_rate": 9.495688179029838e-05, "loss": 1.8376, "step": 174 }, { "epoch": 0.1609565417337319, "grad_norm": 0.7114839553833008, "learning_rate": 9.489231620955359e-05, "loss": 1.8259, "step": 175 }, { "epoch": 0.1618762934007818, "grad_norm": 0.8745600581169128, "learning_rate": 9.482736218434143e-05, "loss": 1.8571, "step": 176 }, { "epoch": 0.16279604506783169, "grad_norm": 0.594724714756012, "learning_rate": 9.476202027669643e-05, "loss": 1.8385, "step": 177 }, { "epoch": 0.16371579673488157, "grad_norm": 0.8559861183166504, "learning_rate": 9.469629105200937e-05, "loss": 1.805, "step": 178 }, { "epoch": 0.16463554840193148, "grad_norm": 0.6145199537277222, "learning_rate": 9.463017507902244e-05, "loss": 1.8331, "step": 179 }, { "epoch": 0.16555530006898136, "grad_norm": 1.0015912055969238, "learning_rate": 9.456367292982429e-05, "loss": 1.7974, "step": 180 }, { "epoch": 0.16647505173603128, "grad_norm": 0.5909841060638428, "learning_rate": 9.449678517984502e-05, "loss": 1.787, "step": 181 }, { "epoch": 0.16739480340308116, "grad_norm": 0.766480565071106, "learning_rate": 9.442951240785135e-05, "loss": 1.7213, "step": 182 }, { "epoch": 0.16831455507013107, "grad_norm": 0.6516543626785278, "learning_rate": 9.436185519594145e-05, "loss": 1.7548, "step": 183 }, { "epoch": 0.16923430673718096, "grad_norm": 0.7793421745300293, "learning_rate": 9.429381412953999e-05, "loss": 1.7481, "step": 184 }, { "epoch": 0.17015405840423087, "grad_norm": 0.8920656442642212, "learning_rate": 9.422538979739307e-05, "loss": 1.805, "step": 185 }, { "epoch": 0.17107381007128075, "grad_norm": 0.8302977085113525, "learning_rate": 9.415658279156311e-05, "loss": 1.7267, "step": 186 }, { "epoch": 0.17199356173833066, "grad_norm": 0.8947249054908752, "learning_rate": 9.408739370742373e-05, "loss": 1.6794, "step": 187 }, { "epoch": 0.17291331340538055, "grad_norm": 0.6332067251205444, "learning_rate": 9.401782314365457e-05, "loss": 1.7127, "step": 188 }, { "epoch": 0.17383306507243043, "grad_norm": 0.830932080745697, "learning_rate": 9.39478717022362e-05, "loss": 1.6696, "step": 189 }, { "epoch": 0.17475281673948034, "grad_norm": 0.6934016942977905, "learning_rate": 9.387753998844482e-05, "loss": 1.6327, "step": 190 }, { "epoch": 0.17567256840653023, "grad_norm": 0.733917236328125, "learning_rate": 9.380682861084701e-05, "loss": 1.6992, "step": 191 }, { "epoch": 0.17659232007358014, "grad_norm": 0.7675406336784363, "learning_rate": 9.373573818129458e-05, "loss": 1.6759, "step": 192 }, { "epoch": 0.17751207174063002, "grad_norm": 0.8431460857391357, "learning_rate": 9.366426931491916e-05, "loss": 1.6044, "step": 193 }, { "epoch": 0.17843182340767993, "grad_norm": 0.7542397975921631, "learning_rate": 9.359242263012693e-05, "loss": 1.6274, "step": 194 }, { "epoch": 0.17935157507472982, "grad_norm": 0.8931959867477417, "learning_rate": 9.352019874859325e-05, "loss": 1.6006, "step": 195 }, { "epoch": 0.18027132674177973, "grad_norm": 0.8215823769569397, "learning_rate": 9.344759829525733e-05, "loss": 1.5865, "step": 196 }, { "epoch": 0.18119107840882961, "grad_norm": 0.7112393379211426, "learning_rate": 9.337462189831669e-05, "loss": 1.5478, "step": 197 }, { "epoch": 0.18211083007587953, "grad_norm": 1.0283434391021729, "learning_rate": 9.330127018922194e-05, "loss": 1.5316, "step": 198 }, { "epoch": 0.1830305817429294, "grad_norm": 0.9886683225631714, "learning_rate": 9.322754380267109e-05, "loss": 1.4653, "step": 199 }, { "epoch": 0.1839503334099793, "grad_norm": 1.064937949180603, "learning_rate": 9.315344337660421e-05, "loss": 1.4673, "step": 200 }, { "epoch": 0.1848700850770292, "grad_norm": 3.375886917114258, "learning_rate": 9.307896955219786e-05, "loss": 2.5919, "step": 201 }, { "epoch": 0.1857898367440791, "grad_norm": 2.260359764099121, "learning_rate": 9.300412297385954e-05, "loss": 2.1729, "step": 202 }, { "epoch": 0.186709588411129, "grad_norm": 1.4669098854064941, "learning_rate": 9.292890428922209e-05, "loss": 1.9383, "step": 203 }, { "epoch": 0.18762934007817889, "grad_norm": 1.037178635597229, "learning_rate": 9.285331414913815e-05, "loss": 1.9071, "step": 204 }, { "epoch": 0.1885490917452288, "grad_norm": 1.154489517211914, "learning_rate": 9.277735320767449e-05, "loss": 1.8216, "step": 205 }, { "epoch": 0.18946884341227868, "grad_norm": 1.0613019466400146, "learning_rate": 9.270102212210632e-05, "loss": 1.7831, "step": 206 }, { "epoch": 0.1903885950793286, "grad_norm": 1.1248329877853394, "learning_rate": 9.262432155291167e-05, "loss": 1.8591, "step": 207 }, { "epoch": 0.19130834674637848, "grad_norm": 0.8293649554252625, "learning_rate": 9.254725216376561e-05, "loss": 1.8205, "step": 208 }, { "epoch": 0.19222809841342836, "grad_norm": 0.9506818652153015, "learning_rate": 9.246981462153456e-05, "loss": 1.8283, "step": 209 }, { "epoch": 0.19314785008047827, "grad_norm": 0.8719251155853271, "learning_rate": 9.239200959627048e-05, "loss": 1.7719, "step": 210 }, { "epoch": 0.19406760174752816, "grad_norm": 0.808614194393158, "learning_rate": 9.231383776120512e-05, "loss": 1.8825, "step": 211 }, { "epoch": 0.19498735341457807, "grad_norm": 0.897612988948822, "learning_rate": 9.22352997927441e-05, "loss": 1.8061, "step": 212 }, { "epoch": 0.19590710508162795, "grad_norm": 0.7289676070213318, "learning_rate": 9.215639637046121e-05, "loss": 1.8348, "step": 213 }, { "epoch": 0.19682685674867786, "grad_norm": 0.8267980813980103, "learning_rate": 9.207712817709236e-05, "loss": 1.7645, "step": 214 }, { "epoch": 0.19774660841572775, "grad_norm": 0.7317152619361877, "learning_rate": 9.19974958985298e-05, "loss": 1.7478, "step": 215 }, { "epoch": 0.19866636008277766, "grad_norm": 0.6896607875823975, "learning_rate": 9.191750022381614e-05, "loss": 1.7699, "step": 216 }, { "epoch": 0.19958611174982754, "grad_norm": 0.7086347937583923, "learning_rate": 9.183714184513832e-05, "loss": 1.7938, "step": 217 }, { "epoch": 0.20050586341687746, "grad_norm": 0.6830713152885437, "learning_rate": 9.175642145782179e-05, "loss": 1.7568, "step": 218 }, { "epoch": 0.20142561508392734, "grad_norm": 0.5826436281204224, "learning_rate": 9.167533976032429e-05, "loss": 1.7548, "step": 219 }, { "epoch": 0.20234536675097722, "grad_norm": 0.669696569442749, "learning_rate": 9.159389745423002e-05, "loss": 1.8096, "step": 220 }, { "epoch": 0.20326511841802714, "grad_norm": 0.6378855109214783, "learning_rate": 9.151209524424333e-05, "loss": 1.7248, "step": 221 }, { "epoch": 0.20418487008507702, "grad_norm": 0.7418368458747864, "learning_rate": 9.142993383818283e-05, "loss": 1.6951, "step": 222 }, { "epoch": 0.20510462175212693, "grad_norm": 0.6502818465232849, "learning_rate": 9.134741394697517e-05, "loss": 1.6809, "step": 223 }, { "epoch": 0.20602437341917682, "grad_norm": 0.6646417379379272, "learning_rate": 9.126453628464888e-05, "loss": 1.7178, "step": 224 }, { "epoch": 0.20694412508622673, "grad_norm": 0.7070106267929077, "learning_rate": 9.118130156832823e-05, "loss": 1.7629, "step": 225 }, { "epoch": 0.2078638767532766, "grad_norm": 0.6244888305664062, "learning_rate": 9.109771051822702e-05, "loss": 1.763, "step": 226 }, { "epoch": 0.20878362842032652, "grad_norm": 0.6641138195991516, "learning_rate": 9.10137638576423e-05, "loss": 1.7016, "step": 227 }, { "epoch": 0.2097033800873764, "grad_norm": 0.7198558449745178, "learning_rate": 9.092946231294819e-05, "loss": 1.7247, "step": 228 }, { "epoch": 0.21062313175442632, "grad_norm": 0.5700192451477051, "learning_rate": 9.084480661358953e-05, "loss": 1.6782, "step": 229 }, { "epoch": 0.2115428834214762, "grad_norm": 0.8081958293914795, "learning_rate": 9.075979749207561e-05, "loss": 1.7437, "step": 230 }, { "epoch": 0.2124626350885261, "grad_norm": 0.7449802756309509, "learning_rate": 9.067443568397378e-05, "loss": 1.6924, "step": 231 }, { "epoch": 0.213382386755576, "grad_norm": 0.8385685086250305, "learning_rate": 9.058872192790313e-05, "loss": 1.6572, "step": 232 }, { "epoch": 0.21430213842262588, "grad_norm": 0.7077139616012573, "learning_rate": 9.050265696552812e-05, "loss": 1.6949, "step": 233 }, { "epoch": 0.2152218900896758, "grad_norm": 0.7295122742652893, "learning_rate": 9.041624154155208e-05, "loss": 1.6745, "step": 234 }, { "epoch": 0.21614164175672568, "grad_norm": 0.6347808241844177, "learning_rate": 9.032947640371086e-05, "loss": 1.6441, "step": 235 }, { "epoch": 0.2170613934237756, "grad_norm": 0.8323748707771301, "learning_rate": 9.024236230276629e-05, "loss": 1.6198, "step": 236 }, { "epoch": 0.21798114509082547, "grad_norm": 0.7440972328186035, "learning_rate": 9.01548999924997e-05, "loss": 1.6405, "step": 237 }, { "epoch": 0.21890089675787539, "grad_norm": 0.7849915623664856, "learning_rate": 9.006709022970547e-05, "loss": 1.6361, "step": 238 }, { "epoch": 0.21982064842492527, "grad_norm": 0.7478511929512024, "learning_rate": 8.997893377418432e-05, "loss": 1.543, "step": 239 }, { "epoch": 0.22074040009197515, "grad_norm": 0.6225507259368896, "learning_rate": 8.98904313887369e-05, "loss": 1.6248, "step": 240 }, { "epoch": 0.22166015175902506, "grad_norm": 0.6926827430725098, "learning_rate": 8.980158383915713e-05, "loss": 1.6449, "step": 241 }, { "epoch": 0.22257990342607495, "grad_norm": 0.6942108869552612, "learning_rate": 8.971239189422555e-05, "loss": 1.5912, "step": 242 }, { "epoch": 0.22349965509312486, "grad_norm": 0.623525857925415, "learning_rate": 8.962285632570267e-05, "loss": 1.5436, "step": 243 }, { "epoch": 0.22441940676017474, "grad_norm": 0.5779447555541992, "learning_rate": 8.953297790832231e-05, "loss": 1.5747, "step": 244 }, { "epoch": 0.22533915842722466, "grad_norm": 0.7703275680541992, "learning_rate": 8.944275741978493e-05, "loss": 1.5648, "step": 245 }, { "epoch": 0.22625891009427454, "grad_norm": 0.7855743765830994, "learning_rate": 8.935219564075085e-05, "loss": 1.5246, "step": 246 }, { "epoch": 0.22717866176132445, "grad_norm": 0.851977527141571, "learning_rate": 8.926129335483349e-05, "loss": 1.4777, "step": 247 }, { "epoch": 0.22809841342837434, "grad_norm": 0.8636126518249512, "learning_rate": 8.917005134859263e-05, "loss": 1.5235, "step": 248 }, { "epoch": 0.22901816509542425, "grad_norm": 1.055405616760254, "learning_rate": 8.907847041152756e-05, "loss": 1.5131, "step": 249 }, { "epoch": 0.22993791676247413, "grad_norm": 1.2434190511703491, "learning_rate": 8.89865513360703e-05, "loss": 1.3169, "step": 250 }, { "epoch": 0.23085766842952402, "grad_norm": 2.794989585876465, "learning_rate": 8.889429491757871e-05, "loss": 2.3149, "step": 251 }, { "epoch": 0.23177742009657393, "grad_norm": 2.0627057552337646, "learning_rate": 8.88017019543296e-05, "loss": 2.0616, "step": 252 }, { "epoch": 0.2326971717636238, "grad_norm": 1.3948839902877808, "learning_rate": 8.870877324751184e-05, "loss": 1.9026, "step": 253 }, { "epoch": 0.23361692343067372, "grad_norm": 0.9678890109062195, "learning_rate": 8.861550960121945e-05, "loss": 1.8307, "step": 254 }, { "epoch": 0.2345366750977236, "grad_norm": 1.0957893133163452, "learning_rate": 8.852191182244456e-05, "loss": 1.7364, "step": 255 }, { "epoch": 0.23545642676477352, "grad_norm": 0.9677236676216125, "learning_rate": 8.842798072107054e-05, "loss": 1.762, "step": 256 }, { "epoch": 0.2363761784318234, "grad_norm": 1.012479305267334, "learning_rate": 8.833371710986493e-05, "loss": 1.6711, "step": 257 }, { "epoch": 0.23729593009887331, "grad_norm": 0.8846522569656372, "learning_rate": 8.823912180447236e-05, "loss": 1.8402, "step": 258 }, { "epoch": 0.2382156817659232, "grad_norm": 1.0523695945739746, "learning_rate": 8.81441956234076e-05, "loss": 1.703, "step": 259 }, { "epoch": 0.2391354334329731, "grad_norm": 1.0177359580993652, "learning_rate": 8.80489393880484e-05, "loss": 1.7218, "step": 260 }, { "epoch": 0.240055185100023, "grad_norm": 0.8454842567443848, "learning_rate": 8.79533539226284e-05, "loss": 1.6839, "step": 261 }, { "epoch": 0.24097493676707288, "grad_norm": 0.9161872863769531, "learning_rate": 8.785744005423002e-05, "loss": 1.7333, "step": 262 }, { "epoch": 0.2418946884341228, "grad_norm": 0.7548457384109497, "learning_rate": 8.77611986127773e-05, "loss": 1.696, "step": 263 }, { "epoch": 0.24281444010117267, "grad_norm": 0.9760596752166748, "learning_rate": 8.766463043102864e-05, "loss": 1.7102, "step": 264 }, { "epoch": 0.24373419176822259, "grad_norm": 0.7247944474220276, "learning_rate": 8.756773634456975e-05, "loss": 1.7439, "step": 265 }, { "epoch": 0.24465394343527247, "grad_norm": 0.7252097129821777, "learning_rate": 8.747051719180626e-05, "loss": 1.7811, "step": 266 }, { "epoch": 0.24557369510232238, "grad_norm": 0.6071887016296387, "learning_rate": 8.737297381395657e-05, "loss": 1.6398, "step": 267 }, { "epoch": 0.24649344676937227, "grad_norm": 0.7072895765304565, "learning_rate": 8.727510705504454e-05, "loss": 1.68, "step": 268 }, { "epoch": 0.24741319843642218, "grad_norm": 0.7006264925003052, "learning_rate": 8.717691776189214e-05, "loss": 1.6814, "step": 269 }, { "epoch": 0.24833295010347206, "grad_norm": 0.6832376718521118, "learning_rate": 8.707840678411224e-05, "loss": 1.6259, "step": 270 }, { "epoch": 0.24925270177052197, "grad_norm": 0.5689120292663574, "learning_rate": 8.697957497410108e-05, "loss": 1.6786, "step": 271 }, { "epoch": 0.25017245343757183, "grad_norm": 0.8517261743545532, "learning_rate": 8.688042318703111e-05, "loss": 1.6644, "step": 272 }, { "epoch": 0.25109220510462177, "grad_norm": 0.5697482824325562, "learning_rate": 8.678095228084343e-05, "loss": 1.6705, "step": 273 }, { "epoch": 0.25201195677167165, "grad_norm": 0.6067523956298828, "learning_rate": 8.66811631162404e-05, "loss": 1.7022, "step": 274 }, { "epoch": 0.25293170843872154, "grad_norm": 0.6944383382797241, "learning_rate": 8.65810565566782e-05, "loss": 1.6235, "step": 275 }, { "epoch": 0.2538514601057714, "grad_norm": 0.5674624443054199, "learning_rate": 8.648063346835942e-05, "loss": 1.6757, "step": 276 }, { "epoch": 0.25477121177282136, "grad_norm": 0.6712316274642944, "learning_rate": 8.637989472022549e-05, "loss": 1.627, "step": 277 }, { "epoch": 0.25569096343987124, "grad_norm": 0.5806477069854736, "learning_rate": 8.627884118394913e-05, "loss": 1.6709, "step": 278 }, { "epoch": 0.25661071510692113, "grad_norm": 0.5989074110984802, "learning_rate": 8.617747373392696e-05, "loss": 1.6802, "step": 279 }, { "epoch": 0.257530466773971, "grad_norm": 0.6222725510597229, "learning_rate": 8.607579324727175e-05, "loss": 1.5823, "step": 280 }, { "epoch": 0.25845021844102095, "grad_norm": 0.6905350685119629, "learning_rate": 8.597380060380493e-05, "loss": 1.5795, "step": 281 }, { "epoch": 0.25936997010807084, "grad_norm": 0.9093815684318542, "learning_rate": 8.5871496686049e-05, "loss": 1.6131, "step": 282 }, { "epoch": 0.2602897217751207, "grad_norm": 0.8468539714813232, "learning_rate": 8.576888237921983e-05, "loss": 1.5836, "step": 283 }, { "epoch": 0.2612094734421706, "grad_norm": 0.8949149250984192, "learning_rate": 8.566595857121902e-05, "loss": 1.5574, "step": 284 }, { "epoch": 0.2621292251092205, "grad_norm": 0.7991402745246887, "learning_rate": 8.556272615262622e-05, "loss": 1.5941, "step": 285 }, { "epoch": 0.2630489767762704, "grad_norm": 1.0631219148635864, "learning_rate": 8.545918601669147e-05, "loss": 1.6469, "step": 286 }, { "epoch": 0.2639687284433203, "grad_norm": 0.6237906217575073, "learning_rate": 8.535533905932738e-05, "loss": 1.5148, "step": 287 }, { "epoch": 0.2648884801103702, "grad_norm": 0.9192318320274353, "learning_rate": 8.525118617910143e-05, "loss": 1.4909, "step": 288 }, { "epoch": 0.2658082317774201, "grad_norm": 0.8480085134506226, "learning_rate": 8.514672827722824e-05, "loss": 1.4746, "step": 289 }, { "epoch": 0.26672798344447, "grad_norm": 0.9110789895057678, "learning_rate": 8.504196625756166e-05, "loss": 1.5245, "step": 290 }, { "epoch": 0.2676477351115199, "grad_norm": 0.7915551066398621, "learning_rate": 8.493690102658703e-05, "loss": 1.4658, "step": 291 }, { "epoch": 0.2685674867785698, "grad_norm": 0.8689735531806946, "learning_rate": 8.483153349341335e-05, "loss": 1.5159, "step": 292 }, { "epoch": 0.26948723844561967, "grad_norm": 0.966712474822998, "learning_rate": 8.472586456976535e-05, "loss": 1.4782, "step": 293 }, { "epoch": 0.27040699011266955, "grad_norm": 0.8555867075920105, "learning_rate": 8.461989516997565e-05, "loss": 1.5046, "step": 294 }, { "epoch": 0.2713267417797195, "grad_norm": 0.8497052192687988, "learning_rate": 8.45136262109768e-05, "loss": 1.3816, "step": 295 }, { "epoch": 0.2722464934467694, "grad_norm": 0.776263952255249, "learning_rate": 8.440705861229344e-05, "loss": 1.5065, "step": 296 }, { "epoch": 0.27316624511381926, "grad_norm": 1.1991870403289795, "learning_rate": 8.430019329603422e-05, "loss": 1.4482, "step": 297 }, { "epoch": 0.27408599678086915, "grad_norm": 0.9438532590866089, "learning_rate": 8.41930311868839e-05, "loss": 1.4023, "step": 298 }, { "epoch": 0.2750057484479191, "grad_norm": 1.3889118432998657, "learning_rate": 8.408557321209534e-05, "loss": 1.3493, "step": 299 }, { "epoch": 0.27592550011496897, "grad_norm": 1.7762432098388672, "learning_rate": 8.397782030148147e-05, "loss": 1.257, "step": 300 }, { "epoch": 0.27592550011496897, "eval_loss": 1.6551681756973267, "eval_runtime": 50.0018, "eval_samples_per_second": 164.794, "eval_steps_per_second": 20.599, "step": 300 }, { "epoch": 0.27684525178201885, "grad_norm": 2.846353530883789, "learning_rate": 8.386977338740724e-05, "loss": 2.0714, "step": 301 }, { "epoch": 0.27776500344906874, "grad_norm": 2.5227103233337402, "learning_rate": 8.376143340478153e-05, "loss": 1.8748, "step": 302 }, { "epoch": 0.2786847551161186, "grad_norm": 2.0501370429992676, "learning_rate": 8.365280129104912e-05, "loss": 1.7948, "step": 303 }, { "epoch": 0.27960450678316856, "grad_norm": 1.0905100107192993, "learning_rate": 8.354387798618253e-05, "loss": 1.7508, "step": 304 }, { "epoch": 0.28052425845021844, "grad_norm": 1.1486353874206543, "learning_rate": 8.343466443267391e-05, "loss": 1.7368, "step": 305 }, { "epoch": 0.28144401011726833, "grad_norm": 1.1892223358154297, "learning_rate": 8.332516157552684e-05, "loss": 1.6652, "step": 306 }, { "epoch": 0.2823637617843182, "grad_norm": 1.027815341949463, "learning_rate": 8.321537036224822e-05, "loss": 1.6847, "step": 307 }, { "epoch": 0.28328351345136815, "grad_norm": 1.1536738872528076, "learning_rate": 8.310529174284004e-05, "loss": 1.7384, "step": 308 }, { "epoch": 0.28420326511841804, "grad_norm": 0.8124598264694214, "learning_rate": 8.299492666979113e-05, "loss": 1.6906, "step": 309 }, { "epoch": 0.2851230167854679, "grad_norm": 1.1598918437957764, "learning_rate": 8.2884276098069e-05, "loss": 1.7223, "step": 310 }, { "epoch": 0.2860427684525178, "grad_norm": 1.1664563417434692, "learning_rate": 8.277334098511147e-05, "loss": 1.6548, "step": 311 }, { "epoch": 0.28696252011956774, "grad_norm": 0.6637358069419861, "learning_rate": 8.266212229081847e-05, "loss": 1.6638, "step": 312 }, { "epoch": 0.2878822717866176, "grad_norm": 0.987754225730896, "learning_rate": 8.255062097754372e-05, "loss": 1.7133, "step": 313 }, { "epoch": 0.2888020234536675, "grad_norm": 0.7713818550109863, "learning_rate": 8.243883801008632e-05, "loss": 1.6705, "step": 314 }, { "epoch": 0.2897217751207174, "grad_norm": 1.0500911474227905, "learning_rate": 8.232677435568252e-05, "loss": 1.5651, "step": 315 }, { "epoch": 0.2906415267877673, "grad_norm": 0.7900861501693726, "learning_rate": 8.221443098399732e-05, "loss": 1.6276, "step": 316 }, { "epoch": 0.2915612784548172, "grad_norm": 0.7363952994346619, "learning_rate": 8.210180886711602e-05, "loss": 1.5795, "step": 317 }, { "epoch": 0.2924810301218671, "grad_norm": 0.895269513130188, "learning_rate": 8.198890897953586e-05, "loss": 1.6644, "step": 318 }, { "epoch": 0.293400781788917, "grad_norm": 0.9014370441436768, "learning_rate": 8.187573229815758e-05, "loss": 1.619, "step": 319 }, { "epoch": 0.29432053345596687, "grad_norm": 1.06600821018219, "learning_rate": 8.176227980227694e-05, "loss": 1.6779, "step": 320 }, { "epoch": 0.2952402851230168, "grad_norm": 1.0690526962280273, "learning_rate": 8.164855247357627e-05, "loss": 1.553, "step": 321 }, { "epoch": 0.2961600367900667, "grad_norm": 0.8835525512695312, "learning_rate": 8.153455129611605e-05, "loss": 1.614, "step": 322 }, { "epoch": 0.2970797884571166, "grad_norm": 1.1458913087844849, "learning_rate": 8.142027725632623e-05, "loss": 1.6015, "step": 323 }, { "epoch": 0.29799954012416646, "grad_norm": 0.6511287093162537, "learning_rate": 8.130573134299782e-05, "loss": 1.6129, "step": 324 }, { "epoch": 0.29891929179121635, "grad_norm": 1.1985218524932861, "learning_rate": 8.119091454727428e-05, "loss": 1.564, "step": 325 }, { "epoch": 0.2998390434582663, "grad_norm": 1.0999850034713745, "learning_rate": 8.107582786264299e-05, "loss": 1.6318, "step": 326 }, { "epoch": 0.30075879512531617, "grad_norm": 0.664042055606842, "learning_rate": 8.09604722849266e-05, "loss": 1.6049, "step": 327 }, { "epoch": 0.30167854679236605, "grad_norm": 0.9706513285636902, "learning_rate": 8.084484881227448e-05, "loss": 1.6157, "step": 328 }, { "epoch": 0.30259829845941594, "grad_norm": 0.7374880909919739, "learning_rate": 8.072895844515398e-05, "loss": 1.573, "step": 329 }, { "epoch": 0.3035180501264659, "grad_norm": 0.9631950855255127, "learning_rate": 8.061280218634192e-05, "loss": 1.5568, "step": 330 }, { "epoch": 0.30443780179351576, "grad_norm": 0.9304092526435852, "learning_rate": 8.049638104091575e-05, "loss": 1.6135, "step": 331 }, { "epoch": 0.30535755346056564, "grad_norm": 0.7095350027084351, "learning_rate": 8.037969601624495e-05, "loss": 1.5427, "step": 332 }, { "epoch": 0.30627730512761553, "grad_norm": 1.130644679069519, "learning_rate": 8.026274812198234e-05, "loss": 1.5704, "step": 333 }, { "epoch": 0.3071970567946654, "grad_norm": 0.6161345839500427, "learning_rate": 8.014553837005527e-05, "loss": 1.5705, "step": 334 }, { "epoch": 0.30811680846171535, "grad_norm": 0.7174437046051025, "learning_rate": 8.002806777465685e-05, "loss": 1.599, "step": 335 }, { "epoch": 0.30903656012876524, "grad_norm": 1.0651494264602661, "learning_rate": 7.991033735223729e-05, "loss": 1.538, "step": 336 }, { "epoch": 0.3099563117958151, "grad_norm": 0.7327350974082947, "learning_rate": 7.979234812149501e-05, "loss": 1.4112, "step": 337 }, { "epoch": 0.310876063462865, "grad_norm": 0.8603296279907227, "learning_rate": 7.967410110336782e-05, "loss": 1.4141, "step": 338 }, { "epoch": 0.31179581512991494, "grad_norm": 0.7242352962493896, "learning_rate": 7.955559732102414e-05, "loss": 1.4316, "step": 339 }, { "epoch": 0.31271556679696483, "grad_norm": 0.7651688456535339, "learning_rate": 7.943683779985413e-05, "loss": 1.5116, "step": 340 }, { "epoch": 0.3136353184640147, "grad_norm": 0.6736311316490173, "learning_rate": 7.931782356746076e-05, "loss": 1.4454, "step": 341 }, { "epoch": 0.3145550701310646, "grad_norm": 0.6474123597145081, "learning_rate": 7.919855565365102e-05, "loss": 1.4616, "step": 342 }, { "epoch": 0.31547482179811454, "grad_norm": 0.6624403595924377, "learning_rate": 7.907903509042696e-05, "loss": 1.4973, "step": 343 }, { "epoch": 0.3163945734651644, "grad_norm": 0.6722452640533447, "learning_rate": 7.895926291197667e-05, "loss": 1.4452, "step": 344 }, { "epoch": 0.3173143251322143, "grad_norm": 0.8001620769500732, "learning_rate": 7.883924015466553e-05, "loss": 1.4532, "step": 345 }, { "epoch": 0.3182340767992642, "grad_norm": 0.8588351011276245, "learning_rate": 7.871896785702707e-05, "loss": 1.4036, "step": 346 }, { "epoch": 0.31915382846631407, "grad_norm": 0.8040063977241516, "learning_rate": 7.859844705975404e-05, "loss": 1.3815, "step": 347 }, { "epoch": 0.320073580133364, "grad_norm": 1.0031120777130127, "learning_rate": 7.847767880568945e-05, "loss": 1.3611, "step": 348 }, { "epoch": 0.3209933318004139, "grad_norm": 0.8174616098403931, "learning_rate": 7.835666413981743e-05, "loss": 1.2897, "step": 349 }, { "epoch": 0.3219130834674638, "grad_norm": 1.1649737358093262, "learning_rate": 7.823540410925435e-05, "loss": 1.22, "step": 350 }, { "epoch": 0.32283283513451366, "grad_norm": 2.4392778873443604, "learning_rate": 7.811389976323961e-05, "loss": 1.9789, "step": 351 }, { "epoch": 0.3237525868015636, "grad_norm": 1.9123626947402954, "learning_rate": 7.799215215312667e-05, "loss": 1.817, "step": 352 }, { "epoch": 0.3246723384686135, "grad_norm": 1.556714653968811, "learning_rate": 7.787016233237387e-05, "loss": 1.6248, "step": 353 }, { "epoch": 0.32559209013566337, "grad_norm": 1.0949770212173462, "learning_rate": 7.774793135653538e-05, "loss": 1.6925, "step": 354 }, { "epoch": 0.32651184180271325, "grad_norm": 1.0330501794815063, "learning_rate": 7.7625460283252e-05, "loss": 1.6667, "step": 355 }, { "epoch": 0.32743159346976314, "grad_norm": 1.113447666168213, "learning_rate": 7.750275017224207e-05, "loss": 1.6345, "step": 356 }, { "epoch": 0.3283513451368131, "grad_norm": 1.0157980918884277, "learning_rate": 7.737980208529231e-05, "loss": 1.6047, "step": 357 }, { "epoch": 0.32927109680386296, "grad_norm": 0.8798123598098755, "learning_rate": 7.725661708624853e-05, "loss": 1.5993, "step": 358 }, { "epoch": 0.33019084847091285, "grad_norm": 0.9784142374992371, "learning_rate": 7.713319624100657e-05, "loss": 1.578, "step": 359 }, { "epoch": 0.33111060013796273, "grad_norm": 0.9105007648468018, "learning_rate": 7.700954061750293e-05, "loss": 1.6108, "step": 360 }, { "epoch": 0.33203035180501267, "grad_norm": 0.9545553922653198, "learning_rate": 7.688565128570564e-05, "loss": 1.6134, "step": 361 }, { "epoch": 0.33295010347206255, "grad_norm": 0.8679737448692322, "learning_rate": 7.676152931760496e-05, "loss": 1.5928, "step": 362 }, { "epoch": 0.33386985513911244, "grad_norm": 0.6711000204086304, "learning_rate": 7.663717578720411e-05, "loss": 1.6628, "step": 363 }, { "epoch": 0.3347896068061623, "grad_norm": 0.7280721068382263, "learning_rate": 7.651259177050996e-05, "loss": 1.6265, "step": 364 }, { "epoch": 0.33570935847321226, "grad_norm": 1.0024129152297974, "learning_rate": 7.63877783455237e-05, "loss": 1.6356, "step": 365 }, { "epoch": 0.33662911014026214, "grad_norm": 0.7483541369438171, "learning_rate": 7.626273659223165e-05, "loss": 1.5906, "step": 366 }, { "epoch": 0.33754886180731203, "grad_norm": 0.811964750289917, "learning_rate": 7.61374675925957e-05, "loss": 1.5831, "step": 367 }, { "epoch": 0.3384686134743619, "grad_norm": 0.9911743998527527, "learning_rate": 7.60119724305441e-05, "loss": 1.5819, "step": 368 }, { "epoch": 0.3393883651414118, "grad_norm": 0.6445810794830322, "learning_rate": 7.588625219196208e-05, "loss": 1.5991, "step": 369 }, { "epoch": 0.34030811680846174, "grad_norm": 0.8051655888557434, "learning_rate": 7.576030796468233e-05, "loss": 1.5491, "step": 370 }, { "epoch": 0.3412278684755116, "grad_norm": 0.9976129531860352, "learning_rate": 7.563414083847573e-05, "loss": 1.5645, "step": 371 }, { "epoch": 0.3421476201425615, "grad_norm": 0.7071700096130371, "learning_rate": 7.550775190504189e-05, "loss": 1.528, "step": 372 }, { "epoch": 0.3430673718096114, "grad_norm": 0.7412607669830322, "learning_rate": 7.538114225799954e-05, "loss": 1.5505, "step": 373 }, { "epoch": 0.3439871234766613, "grad_norm": 0.7667213082313538, "learning_rate": 7.525431299287738e-05, "loss": 1.525, "step": 374 }, { "epoch": 0.3449068751437112, "grad_norm": 0.5956572890281677, "learning_rate": 7.51272652071043e-05, "loss": 1.5149, "step": 375 }, { "epoch": 0.3458266268107611, "grad_norm": 0.797289252281189, "learning_rate": 7.500000000000001e-05, "loss": 1.5407, "step": 376 }, { "epoch": 0.346746378477811, "grad_norm": 0.7374883890151978, "learning_rate": 7.48725184727656e-05, "loss": 1.5777, "step": 377 }, { "epoch": 0.34766613014486086, "grad_norm": 0.7943119406700134, "learning_rate": 7.47448217284739e-05, "loss": 1.5795, "step": 378 }, { "epoch": 0.3485858818119108, "grad_norm": 0.6397266387939453, "learning_rate": 7.461691087205993e-05, "loss": 1.5687, "step": 379 }, { "epoch": 0.3495056334789607, "grad_norm": 0.7197580337524414, "learning_rate": 7.448878701031142e-05, "loss": 1.4994, "step": 380 }, { "epoch": 0.35042538514601057, "grad_norm": 0.614570677280426, "learning_rate": 7.436045125185922e-05, "loss": 1.5185, "step": 381 }, { "epoch": 0.35134513681306045, "grad_norm": 0.766139566898346, "learning_rate": 7.423190470716761e-05, "loss": 1.5445, "step": 382 }, { "epoch": 0.3522648884801104, "grad_norm": 0.6843118667602539, "learning_rate": 7.410314848852483e-05, "loss": 1.4972, "step": 383 }, { "epoch": 0.3531846401471603, "grad_norm": 0.6766433119773865, "learning_rate": 7.397418371003333e-05, "loss": 1.4285, "step": 384 }, { "epoch": 0.35410439181421016, "grad_norm": 0.8003432154655457, "learning_rate": 7.384501148760024e-05, "loss": 1.5283, "step": 385 }, { "epoch": 0.35502414348126005, "grad_norm": 0.8524566888809204, "learning_rate": 7.371563293892761e-05, "loss": 1.4922, "step": 386 }, { "epoch": 0.35594389514830993, "grad_norm": 0.9243666529655457, "learning_rate": 7.358604918350288e-05, "loss": 1.4883, "step": 387 }, { "epoch": 0.35686364681535987, "grad_norm": 0.7275565266609192, "learning_rate": 7.345626134258898e-05, "loss": 1.4268, "step": 388 }, { "epoch": 0.35778339848240975, "grad_norm": 0.6936664581298828, "learning_rate": 7.332627053921482e-05, "loss": 1.3605, "step": 389 }, { "epoch": 0.35870315014945964, "grad_norm": 0.7576991319656372, "learning_rate": 7.319607789816555e-05, "loss": 1.4222, "step": 390 }, { "epoch": 0.3596229018165095, "grad_norm": 0.7377772331237793, "learning_rate": 7.306568454597269e-05, "loss": 1.4681, "step": 391 }, { "epoch": 0.36054265348355946, "grad_norm": 0.8987662196159363, "learning_rate": 7.293509161090452e-05, "loss": 1.4066, "step": 392 }, { "epoch": 0.36146240515060934, "grad_norm": 0.7513107061386108, "learning_rate": 7.280430022295631e-05, "loss": 1.4134, "step": 393 }, { "epoch": 0.36238215681765923, "grad_norm": 0.6676529049873352, "learning_rate": 7.267331151384039e-05, "loss": 1.4374, "step": 394 }, { "epoch": 0.3633019084847091, "grad_norm": 0.8300096988677979, "learning_rate": 7.254212661697659e-05, "loss": 1.3849, "step": 395 }, { "epoch": 0.36422166015175905, "grad_norm": 0.8758336901664734, "learning_rate": 7.241074666748227e-05, "loss": 1.3774, "step": 396 }, { "epoch": 0.36514141181880894, "grad_norm": 0.8264380693435669, "learning_rate": 7.227917280216254e-05, "loss": 1.3575, "step": 397 }, { "epoch": 0.3660611634858588, "grad_norm": 1.014760136604309, "learning_rate": 7.214740615950041e-05, "loss": 1.3026, "step": 398 }, { "epoch": 0.3669809151529087, "grad_norm": 0.8453448414802551, "learning_rate": 7.201544787964698e-05, "loss": 1.3114, "step": 399 }, { "epoch": 0.3679006668199586, "grad_norm": 1.1275343894958496, "learning_rate": 7.188329910441154e-05, "loss": 1.1734, "step": 400 }, { "epoch": 0.36882041848700853, "grad_norm": 2.2339935302734375, "learning_rate": 7.17509609772517e-05, "loss": 1.8776, "step": 401 }, { "epoch": 0.3697401701540584, "grad_norm": 1.5469164848327637, "learning_rate": 7.161843464326348e-05, "loss": 1.6876, "step": 402 }, { "epoch": 0.3706599218211083, "grad_norm": 1.2731298208236694, "learning_rate": 7.148572124917148e-05, "loss": 1.581, "step": 403 }, { "epoch": 0.3715796734881582, "grad_norm": 0.9135886430740356, "learning_rate": 7.13528219433188e-05, "loss": 1.5912, "step": 404 }, { "epoch": 0.3724994251552081, "grad_norm": 0.8309260606765747, "learning_rate": 7.121973787565726e-05, "loss": 1.5825, "step": 405 }, { "epoch": 0.373419176822258, "grad_norm": 0.8344767093658447, "learning_rate": 7.10864701977374e-05, "loss": 1.5724, "step": 406 }, { "epoch": 0.3743389284893079, "grad_norm": 0.8113982081413269, "learning_rate": 7.095302006269842e-05, "loss": 1.5899, "step": 407 }, { "epoch": 0.37525868015635777, "grad_norm": 0.8019097447395325, "learning_rate": 7.081938862525839e-05, "loss": 1.6347, "step": 408 }, { "epoch": 0.37617843182340766, "grad_norm": 0.7903069257736206, "learning_rate": 7.06855770417041e-05, "loss": 1.5924, "step": 409 }, { "epoch": 0.3770981834904576, "grad_norm": 0.7817911505699158, "learning_rate": 7.055158646988109e-05, "loss": 1.5705, "step": 410 }, { "epoch": 0.3780179351575075, "grad_norm": 0.7876037359237671, "learning_rate": 7.041741806918371e-05, "loss": 1.553, "step": 411 }, { "epoch": 0.37893768682455736, "grad_norm": 0.8235687017440796, "learning_rate": 7.028307300054499e-05, "loss": 1.5954, "step": 412 }, { "epoch": 0.37985743849160725, "grad_norm": 0.6427410244941711, "learning_rate": 7.014855242642662e-05, "loss": 1.5935, "step": 413 }, { "epoch": 0.3807771901586572, "grad_norm": 0.6327434182167053, "learning_rate": 7.001385751080894e-05, "loss": 1.5992, "step": 414 }, { "epoch": 0.38169694182570707, "grad_norm": 0.705020010471344, "learning_rate": 6.987898941918082e-05, "loss": 1.5326, "step": 415 }, { "epoch": 0.38261669349275695, "grad_norm": 0.6907270550727844, "learning_rate": 6.974394931852956e-05, "loss": 1.543, "step": 416 }, { "epoch": 0.38353644515980684, "grad_norm": 0.6643316745758057, "learning_rate": 6.960873837733088e-05, "loss": 1.501, "step": 417 }, { "epoch": 0.3844561968268567, "grad_norm": 0.6536545753479004, "learning_rate": 6.94733577655387e-05, "loss": 1.5498, "step": 418 }, { "epoch": 0.38537594849390666, "grad_norm": 0.7011268138885498, "learning_rate": 6.933780865457508e-05, "loss": 1.6318, "step": 419 }, { "epoch": 0.38629570016095655, "grad_norm": 0.6373593211174011, "learning_rate": 6.920209221732006e-05, "loss": 1.5523, "step": 420 }, { "epoch": 0.38721545182800643, "grad_norm": 0.5898979902267456, "learning_rate": 6.90662096281016e-05, "loss": 1.5695, "step": 421 }, { "epoch": 0.3881352034950563, "grad_norm": 0.6590458750724792, "learning_rate": 6.893016206268518e-05, "loss": 1.4721, "step": 422 }, { "epoch": 0.38905495516210625, "grad_norm": 0.6448785662651062, "learning_rate": 6.879395069826393e-05, "loss": 1.5485, "step": 423 }, { "epoch": 0.38997470682915614, "grad_norm": 0.648471474647522, "learning_rate": 6.865757671344827e-05, "loss": 1.5469, "step": 424 }, { "epoch": 0.390894458496206, "grad_norm": 0.8980266451835632, "learning_rate": 6.85210412882557e-05, "loss": 1.5831, "step": 425 }, { "epoch": 0.3918142101632559, "grad_norm": 0.6711221933364868, "learning_rate": 6.838434560410064e-05, "loss": 1.4341, "step": 426 }, { "epoch": 0.39273396183030584, "grad_norm": 0.8187699317932129, "learning_rate": 6.824749084378428e-05, "loss": 1.4696, "step": 427 }, { "epoch": 0.39365371349735573, "grad_norm": 0.8267800807952881, "learning_rate": 6.811047819148413e-05, "loss": 1.5041, "step": 428 }, { "epoch": 0.3945734651644056, "grad_norm": 0.764512300491333, "learning_rate": 6.797330883274403e-05, "loss": 1.4774, "step": 429 }, { "epoch": 0.3954932168314555, "grad_norm": 0.8012046813964844, "learning_rate": 6.783598395446371e-05, "loss": 1.4947, "step": 430 }, { "epoch": 0.3964129684985054, "grad_norm": 0.5986045598983765, "learning_rate": 6.769850474488859e-05, "loss": 1.5161, "step": 431 }, { "epoch": 0.3973327201655553, "grad_norm": 0.8222801685333252, "learning_rate": 6.756087239359947e-05, "loss": 1.4726, "step": 432 }, { "epoch": 0.3982524718326052, "grad_norm": 0.6513310670852661, "learning_rate": 6.742308809150232e-05, "loss": 1.4894, "step": 433 }, { "epoch": 0.3991722234996551, "grad_norm": 0.6340191960334778, "learning_rate": 6.728515303081781e-05, "loss": 1.4616, "step": 434 }, { "epoch": 0.40009197516670497, "grad_norm": 0.8488625288009644, "learning_rate": 6.714706840507121e-05, "loss": 1.4096, "step": 435 }, { "epoch": 0.4010117268337549, "grad_norm": 0.6022557020187378, "learning_rate": 6.700883540908184e-05, "loss": 1.4149, "step": 436 }, { "epoch": 0.4019314785008048, "grad_norm": 0.7043591141700745, "learning_rate": 6.687045523895293e-05, "loss": 1.492, "step": 437 }, { "epoch": 0.4028512301678547, "grad_norm": 0.8003234267234802, "learning_rate": 6.673192909206108e-05, "loss": 1.3878, "step": 438 }, { "epoch": 0.40377098183490456, "grad_norm": 0.6873340010643005, "learning_rate": 6.659325816704611e-05, "loss": 1.4326, "step": 439 }, { "epoch": 0.40469073350195445, "grad_norm": 0.673957884311676, "learning_rate": 6.64544436638005e-05, "loss": 1.4086, "step": 440 }, { "epoch": 0.4056104851690044, "grad_norm": 0.7485764026641846, "learning_rate": 6.63154867834591e-05, "loss": 1.3967, "step": 441 }, { "epoch": 0.40653023683605427, "grad_norm": 0.6807146072387695, "learning_rate": 6.617638872838874e-05, "loss": 1.3429, "step": 442 }, { "epoch": 0.40744998850310415, "grad_norm": 0.6480006575584412, "learning_rate": 6.603715070217778e-05, "loss": 1.3968, "step": 443 }, { "epoch": 0.40836974017015404, "grad_norm": 0.7995392084121704, "learning_rate": 6.589777390962575e-05, "loss": 1.4309, "step": 444 }, { "epoch": 0.409289491837204, "grad_norm": 0.7234594821929932, "learning_rate": 6.57582595567329e-05, "loss": 1.2972, "step": 445 }, { "epoch": 0.41020924350425386, "grad_norm": 0.9040266871452332, "learning_rate": 6.561860885068972e-05, "loss": 1.3339, "step": 446 }, { "epoch": 0.41112899517130375, "grad_norm": 0.8719410300254822, "learning_rate": 6.547882299986658e-05, "loss": 1.2914, "step": 447 }, { "epoch": 0.41204874683835363, "grad_norm": 0.964036226272583, "learning_rate": 6.533890321380319e-05, "loss": 1.2348, "step": 448 }, { "epoch": 0.4129684985054035, "grad_norm": 1.0289238691329956, "learning_rate": 6.519885070319827e-05, "loss": 1.1747, "step": 449 }, { "epoch": 0.41388825017245345, "grad_norm": 1.0722767114639282, "learning_rate": 6.505866667989884e-05, "loss": 1.1749, "step": 450 }, { "epoch": 0.41388825017245345, "eval_loss": 1.5185648202896118, "eval_runtime": 49.961, "eval_samples_per_second": 164.929, "eval_steps_per_second": 20.616, "step": 450 }, { "epoch": 0.41480800183950334, "grad_norm": 2.0002212524414062, "learning_rate": 6.491835235689e-05, "loss": 1.8527, "step": 451 }, { "epoch": 0.4157277535065532, "grad_norm": 1.7632036209106445, "learning_rate": 6.477790894828421e-05, "loss": 1.6736, "step": 452 }, { "epoch": 0.4166475051736031, "grad_norm": 1.2842786312103271, "learning_rate": 6.463733766931095e-05, "loss": 1.6531, "step": 453 }, { "epoch": 0.41756725684065304, "grad_norm": 0.9530149698257446, "learning_rate": 6.449663973630613e-05, "loss": 1.5728, "step": 454 }, { "epoch": 0.41848700850770293, "grad_norm": 0.9490489363670349, "learning_rate": 6.435581636670154e-05, "loss": 1.458, "step": 455 }, { "epoch": 0.4194067601747528, "grad_norm": 0.9226535558700562, "learning_rate": 6.421486877901437e-05, "loss": 1.477, "step": 456 }, { "epoch": 0.4203265118418027, "grad_norm": 0.7617946267127991, "learning_rate": 6.407379819283661e-05, "loss": 1.4929, "step": 457 }, { "epoch": 0.42124626350885264, "grad_norm": 0.7731391787528992, "learning_rate": 6.39326058288246e-05, "loss": 1.5828, "step": 458 }, { "epoch": 0.4221660151759025, "grad_norm": 0.8461527824401855, "learning_rate": 6.379129290868837e-05, "loss": 1.558, "step": 459 }, { "epoch": 0.4230857668429524, "grad_norm": 0.8030949234962463, "learning_rate": 6.364986065518106e-05, "loss": 1.5026, "step": 460 }, { "epoch": 0.4240055185100023, "grad_norm": 0.9712105989456177, "learning_rate": 6.350831029208844e-05, "loss": 1.5603, "step": 461 }, { "epoch": 0.4249252701770522, "grad_norm": 0.936730146408081, "learning_rate": 6.336664304421818e-05, "loss": 1.5037, "step": 462 }, { "epoch": 0.4258450218441021, "grad_norm": 0.6644638776779175, "learning_rate": 6.322486013738942e-05, "loss": 1.5632, "step": 463 }, { "epoch": 0.426764773511152, "grad_norm": 0.8889780044555664, "learning_rate": 6.308296279842205e-05, "loss": 1.5392, "step": 464 }, { "epoch": 0.4276845251782019, "grad_norm": 0.771960973739624, "learning_rate": 6.294095225512603e-05, "loss": 1.5013, "step": 465 }, { "epoch": 0.42860427684525176, "grad_norm": 0.7682729363441467, "learning_rate": 6.2798829736291e-05, "loss": 1.4829, "step": 466 }, { "epoch": 0.4295240285123017, "grad_norm": 0.9224911332130432, "learning_rate": 6.265659647167543e-05, "loss": 1.5283, "step": 467 }, { "epoch": 0.4304437801793516, "grad_norm": 0.7462615370750427, "learning_rate": 6.251425369199599e-05, "loss": 1.4762, "step": 468 }, { "epoch": 0.43136353184640147, "grad_norm": 0.7566426396369934, "learning_rate": 6.237180262891708e-05, "loss": 1.5537, "step": 469 }, { "epoch": 0.43228328351345136, "grad_norm": 0.7278396487236023, "learning_rate": 6.222924451504001e-05, "loss": 1.4805, "step": 470 }, { "epoch": 0.43320303518050124, "grad_norm": 0.6063376069068909, "learning_rate": 6.208658058389231e-05, "loss": 1.5403, "step": 471 }, { "epoch": 0.4341227868475512, "grad_norm": 0.7265048623085022, "learning_rate": 6.194381206991722e-05, "loss": 1.5131, "step": 472 }, { "epoch": 0.43504253851460106, "grad_norm": 0.6536186933517456, "learning_rate": 6.180094020846291e-05, "loss": 1.4777, "step": 473 }, { "epoch": 0.43596229018165095, "grad_norm": 0.6153502464294434, "learning_rate": 6.165796623577171e-05, "loss": 1.4592, "step": 474 }, { "epoch": 0.43688204184870083, "grad_norm": 0.7638461589813232, "learning_rate": 6.15148913889696e-05, "loss": 1.5779, "step": 475 }, { "epoch": 0.43780179351575077, "grad_norm": 0.755756139755249, "learning_rate": 6.137171690605533e-05, "loss": 1.5246, "step": 476 }, { "epoch": 0.43872154518280065, "grad_norm": 0.5608311295509338, "learning_rate": 6.122844402588982e-05, "loss": 1.4824, "step": 477 }, { "epoch": 0.43964129684985054, "grad_norm": 0.7992551922798157, "learning_rate": 6.10850739881854e-05, "loss": 1.4434, "step": 478 }, { "epoch": 0.4405610485169004, "grad_norm": 0.6986256241798401, "learning_rate": 6.094160803349508e-05, "loss": 1.4313, "step": 479 }, { "epoch": 0.4414808001839503, "grad_norm": 0.6461309790611267, "learning_rate": 6.079804740320181e-05, "loss": 1.4743, "step": 480 }, { "epoch": 0.44240055185100025, "grad_norm": 0.7250984311103821, "learning_rate": 6.0654393339507753e-05, "loss": 1.4551, "step": 481 }, { "epoch": 0.44332030351805013, "grad_norm": 0.6796169281005859, "learning_rate": 6.051064708542357e-05, "loss": 1.485, "step": 482 }, { "epoch": 0.4442400551851, "grad_norm": 0.7773648500442505, "learning_rate": 6.0366809884757556e-05, "loss": 1.4153, "step": 483 }, { "epoch": 0.4451598068521499, "grad_norm": 0.9285596609115601, "learning_rate": 6.022288298210501e-05, "loss": 1.4624, "step": 484 }, { "epoch": 0.44607955851919984, "grad_norm": 0.7707833051681519, "learning_rate": 6.0078867622837395e-05, "loss": 1.431, "step": 485 }, { "epoch": 0.4469993101862497, "grad_norm": 0.9251638650894165, "learning_rate": 5.993476505309155e-05, "loss": 1.406, "step": 486 }, { "epoch": 0.4479190618532996, "grad_norm": 0.7242058515548706, "learning_rate": 5.979057651975892e-05, "loss": 1.3418, "step": 487 }, { "epoch": 0.4488388135203495, "grad_norm": 0.6925553679466248, "learning_rate": 5.9646303270474845e-05, "loss": 1.3463, "step": 488 }, { "epoch": 0.44975856518739943, "grad_norm": 0.779308021068573, "learning_rate": 5.9501946553607615e-05, "loss": 1.3228, "step": 489 }, { "epoch": 0.4506783168544493, "grad_norm": 0.750455379486084, "learning_rate": 5.9357507618247764e-05, "loss": 1.3406, "step": 490 }, { "epoch": 0.4515980685214992, "grad_norm": 0.7992476224899292, "learning_rate": 5.921298771419731e-05, "loss": 1.375, "step": 491 }, { "epoch": 0.4525178201885491, "grad_norm": 0.7606462240219116, "learning_rate": 5.9068388091958795e-05, "loss": 1.3066, "step": 492 }, { "epoch": 0.45343757185559896, "grad_norm": 0.651400625705719, "learning_rate": 5.8923710002724594e-05, "loss": 1.3312, "step": 493 }, { "epoch": 0.4543573235226489, "grad_norm": 0.7911424040794373, "learning_rate": 5.877895469836604e-05, "loss": 1.3228, "step": 494 }, { "epoch": 0.4552770751896988, "grad_norm": 0.8071415424346924, "learning_rate": 5.863412343142258e-05, "loss": 1.3149, "step": 495 }, { "epoch": 0.45619682685674867, "grad_norm": 1.001132845878601, "learning_rate": 5.848921745509094e-05, "loss": 1.2951, "step": 496 }, { "epoch": 0.45711657852379856, "grad_norm": 0.9951808452606201, "learning_rate": 5.834423802321431e-05, "loss": 1.2331, "step": 497 }, { "epoch": 0.4580363301908485, "grad_norm": 0.9824991822242737, "learning_rate": 5.8199186390271486e-05, "loss": 1.2146, "step": 498 }, { "epoch": 0.4589560818578984, "grad_norm": 1.3014886379241943, "learning_rate": 5.805406381136598e-05, "loss": 1.2247, "step": 499 }, { "epoch": 0.45987583352494826, "grad_norm": 1.4302425384521484, "learning_rate": 5.79088715422152e-05, "loss": 1.047, "step": 500 }, { "epoch": 0.46079558519199815, "grad_norm": 1.9563382863998413, "learning_rate": 5.7763610839139594e-05, "loss": 1.6971, "step": 501 }, { "epoch": 0.46171533685904803, "grad_norm": 1.5344587564468384, "learning_rate": 5.761828295905169e-05, "loss": 1.6824, "step": 502 }, { "epoch": 0.46263508852609797, "grad_norm": 1.1466830968856812, "learning_rate": 5.747288915944533e-05, "loss": 1.5384, "step": 503 }, { "epoch": 0.46355484019314785, "grad_norm": 1.1582822799682617, "learning_rate": 5.7327430698384775e-05, "loss": 1.6326, "step": 504 }, { "epoch": 0.46447459186019774, "grad_norm": 1.1693201065063477, "learning_rate": 5.7181908834493726e-05, "loss": 1.5041, "step": 505 }, { "epoch": 0.4653943435272476, "grad_norm": 0.9729719758033752, "learning_rate": 5.703632482694453e-05, "loss": 1.5669, "step": 506 }, { "epoch": 0.46631409519429756, "grad_norm": 0.9684829115867615, "learning_rate": 5.689067993544725e-05, "loss": 1.5907, "step": 507 }, { "epoch": 0.46723384686134745, "grad_norm": 0.8785848021507263, "learning_rate": 5.6744975420238745e-05, "loss": 1.4962, "step": 508 }, { "epoch": 0.46815359852839733, "grad_norm": 0.7249252796173096, "learning_rate": 5.6599212542071824e-05, "loss": 1.5372, "step": 509 }, { "epoch": 0.4690733501954472, "grad_norm": 0.9696371555328369, "learning_rate": 5.645339256220426e-05, "loss": 1.4834, "step": 510 }, { "epoch": 0.46999310186249715, "grad_norm": 0.9309729933738708, "learning_rate": 5.6307516742387955e-05, "loss": 1.6006, "step": 511 }, { "epoch": 0.47091285352954704, "grad_norm": 0.8194191455841064, "learning_rate": 5.616158634485793e-05, "loss": 1.5423, "step": 512 }, { "epoch": 0.4718326051965969, "grad_norm": 0.8985216617584229, "learning_rate": 5.601560263232153e-05, "loss": 1.4869, "step": 513 }, { "epoch": 0.4727523568636468, "grad_norm": 0.8546054363250732, "learning_rate": 5.586956686794734e-05, "loss": 1.5534, "step": 514 }, { "epoch": 0.4736721085306967, "grad_norm": 0.7134532332420349, "learning_rate": 5.572348031535441e-05, "loss": 1.465, "step": 515 }, { "epoch": 0.47459186019774663, "grad_norm": 0.6382752656936646, "learning_rate": 5.557734423860123e-05, "loss": 1.4897, "step": 516 }, { "epoch": 0.4755116118647965, "grad_norm": 0.8380042314529419, "learning_rate": 5.543115990217478e-05, "loss": 1.4646, "step": 517 }, { "epoch": 0.4764313635318464, "grad_norm": 0.8848815560340881, "learning_rate": 5.528492857097966e-05, "loss": 1.4903, "step": 518 }, { "epoch": 0.4773511151988963, "grad_norm": 0.6244109272956848, "learning_rate": 5.5138651510327085e-05, "loss": 1.5031, "step": 519 }, { "epoch": 0.4782708668659462, "grad_norm": 0.8367244601249695, "learning_rate": 5.499232998592399e-05, "loss": 1.4978, "step": 520 }, { "epoch": 0.4791906185329961, "grad_norm": 0.7362543344497681, "learning_rate": 5.484596526386198e-05, "loss": 1.529, "step": 521 }, { "epoch": 0.480110370200046, "grad_norm": 0.579655647277832, "learning_rate": 5.469955861060653e-05, "loss": 1.4446, "step": 522 }, { "epoch": 0.4810301218670959, "grad_norm": 0.7875382304191589, "learning_rate": 5.455311129298586e-05, "loss": 1.505, "step": 523 }, { "epoch": 0.48194987353414576, "grad_norm": 0.7048112154006958, "learning_rate": 5.4406624578180096e-05, "loss": 1.4612, "step": 524 }, { "epoch": 0.4828696252011957, "grad_norm": 0.6148046255111694, "learning_rate": 5.4260099733710255e-05, "loss": 1.4871, "step": 525 }, { "epoch": 0.4837893768682456, "grad_norm": 0.7813459038734436, "learning_rate": 5.4113538027427245e-05, "loss": 1.431, "step": 526 }, { "epoch": 0.48470912853529546, "grad_norm": 0.6388234496116638, "learning_rate": 5.396694072750099e-05, "loss": 1.4811, "step": 527 }, { "epoch": 0.48562888020234535, "grad_norm": 0.5977755784988403, "learning_rate": 5.382030910240936e-05, "loss": 1.4302, "step": 528 }, { "epoch": 0.4865486318693953, "grad_norm": 0.6440762281417847, "learning_rate": 5.367364442092724e-05, "loss": 1.4468, "step": 529 }, { "epoch": 0.48746838353644517, "grad_norm": 0.68966144323349, "learning_rate": 5.352694795211555e-05, "loss": 1.4563, "step": 530 }, { "epoch": 0.48838813520349506, "grad_norm": 0.682101845741272, "learning_rate": 5.338022096531028e-05, "loss": 1.4953, "step": 531 }, { "epoch": 0.48930788687054494, "grad_norm": 0.5871472954750061, "learning_rate": 5.3233464730111426e-05, "loss": 1.4285, "step": 532 }, { "epoch": 0.4902276385375948, "grad_norm": 0.60948246717453, "learning_rate": 5.308668051637212e-05, "loss": 1.4083, "step": 533 }, { "epoch": 0.49114739020464476, "grad_norm": 0.7118504047393799, "learning_rate": 5.2939869594187595e-05, "loss": 1.4257, "step": 534 }, { "epoch": 0.49206714187169465, "grad_norm": 0.6763386726379395, "learning_rate": 5.2793033233884124e-05, "loss": 1.3886, "step": 535 }, { "epoch": 0.49298689353874453, "grad_norm": 0.6314605474472046, "learning_rate": 5.2646172706008156e-05, "loss": 1.3105, "step": 536 }, { "epoch": 0.4939066452057944, "grad_norm": 0.7385772466659546, "learning_rate": 5.249928928131523e-05, "loss": 1.3189, "step": 537 }, { "epoch": 0.49482639687284435, "grad_norm": 0.6615415811538696, "learning_rate": 5.235238423075899e-05, "loss": 1.3235, "step": 538 }, { "epoch": 0.49574614853989424, "grad_norm": 0.6805823445320129, "learning_rate": 5.220545882548023e-05, "loss": 1.3938, "step": 539 }, { "epoch": 0.4966659002069441, "grad_norm": 0.8164578676223755, "learning_rate": 5.205851433679589e-05, "loss": 1.329, "step": 540 }, { "epoch": 0.497585651873994, "grad_norm": 0.7139110565185547, "learning_rate": 5.191155203618796e-05, "loss": 1.2914, "step": 541 }, { "epoch": 0.49850540354104395, "grad_norm": 0.6411809921264648, "learning_rate": 5.176457319529263e-05, "loss": 1.3289, "step": 542 }, { "epoch": 0.49942515520809383, "grad_norm": 0.639995813369751, "learning_rate": 5.161757908588917e-05, "loss": 1.2874, "step": 543 }, { "epoch": 0.5003449068751437, "grad_norm": 0.6557344794273376, "learning_rate": 5.1470570979888973e-05, "loss": 1.3043, "step": 544 }, { "epoch": 0.5012646585421936, "grad_norm": 0.7925935387611389, "learning_rate": 5.132355014932455e-05, "loss": 1.2978, "step": 545 }, { "epoch": 0.5021844102092435, "grad_norm": 0.7339189052581787, "learning_rate": 5.117651786633849e-05, "loss": 1.2996, "step": 546 }, { "epoch": 0.5031041618762934, "grad_norm": 0.805228054523468, "learning_rate": 5.102947540317253e-05, "loss": 1.2458, "step": 547 }, { "epoch": 0.5040239135433433, "grad_norm": 0.7840575575828552, "learning_rate": 5.088242403215644e-05, "loss": 1.253, "step": 548 }, { "epoch": 0.5049436652103932, "grad_norm": 1.0337337255477905, "learning_rate": 5.073536502569708e-05, "loss": 1.1262, "step": 549 }, { "epoch": 0.5058634168774431, "grad_norm": 1.2608665227890015, "learning_rate": 5.0588299656267414e-05, "loss": 1.022, "step": 550 }, { "epoch": 0.506783168544493, "grad_norm": 1.6019068956375122, "learning_rate": 5.044122919639541e-05, "loss": 1.6294, "step": 551 }, { "epoch": 0.5077029202115428, "grad_norm": 1.4624245166778564, "learning_rate": 5.029415491865311e-05, "loss": 1.6211, "step": 552 }, { "epoch": 0.5086226718785928, "grad_norm": 1.249880075454712, "learning_rate": 5.014707809564562e-05, "loss": 1.5335, "step": 553 }, { "epoch": 0.5095424235456427, "grad_norm": 1.1160420179367065, "learning_rate": 5e-05, "loss": 1.5818, "step": 554 }, { "epoch": 0.5104621752126925, "grad_norm": 0.9601331353187561, "learning_rate": 4.98529219043544e-05, "loss": 1.5011, "step": 555 }, { "epoch": 0.5113819268797425, "grad_norm": 0.9078472852706909, "learning_rate": 4.9705845081346894e-05, "loss": 1.4804, "step": 556 }, { "epoch": 0.5123016785467923, "grad_norm": 1.0430097579956055, "learning_rate": 4.9558770803604614e-05, "loss": 1.5421, "step": 557 }, { "epoch": 0.5132214302138423, "grad_norm": 0.9206668138504028, "learning_rate": 4.94117003437326e-05, "loss": 1.5167, "step": 558 }, { "epoch": 0.5141411818808922, "grad_norm": 0.7888804078102112, "learning_rate": 4.926463497430293e-05, "loss": 1.4761, "step": 559 }, { "epoch": 0.515060933547942, "grad_norm": 0.7101994752883911, "learning_rate": 4.911757596784357e-05, "loss": 1.4642, "step": 560 }, { "epoch": 0.515980685214992, "grad_norm": 0.8613134026527405, "learning_rate": 4.8970524596827486e-05, "loss": 1.5374, "step": 561 }, { "epoch": 0.5169004368820419, "grad_norm": 0.7729939222335815, "learning_rate": 4.8823482133661516e-05, "loss": 1.4959, "step": 562 }, { "epoch": 0.5178201885490917, "grad_norm": 0.9063132405281067, "learning_rate": 4.8676449850675475e-05, "loss": 1.5057, "step": 563 }, { "epoch": 0.5187399402161417, "grad_norm": 0.9306026697158813, "learning_rate": 4.852942902011103e-05, "loss": 1.5544, "step": 564 }, { "epoch": 0.5196596918831915, "grad_norm": 0.763334333896637, "learning_rate": 4.838242091411084e-05, "loss": 1.4385, "step": 565 }, { "epoch": 0.5205794435502414, "grad_norm": 0.7051974534988403, "learning_rate": 4.823542680470738e-05, "loss": 1.4612, "step": 566 }, { "epoch": 0.5214991952172914, "grad_norm": 0.7262412905693054, "learning_rate": 4.808844796381205e-05, "loss": 1.4366, "step": 567 }, { "epoch": 0.5224189468843412, "grad_norm": 0.7530311346054077, "learning_rate": 4.7941485663204125e-05, "loss": 1.4883, "step": 568 }, { "epoch": 0.5233386985513911, "grad_norm": 0.653555691242218, "learning_rate": 4.779454117451977e-05, "loss": 1.3767, "step": 569 }, { "epoch": 0.524258450218441, "grad_norm": 0.7212573289871216, "learning_rate": 4.7647615769241e-05, "loss": 1.3811, "step": 570 }, { "epoch": 0.5251782018854909, "grad_norm": 0.7534743547439575, "learning_rate": 4.750071071868478e-05, "loss": 1.4899, "step": 571 }, { "epoch": 0.5260979535525409, "grad_norm": 0.6205776333808899, "learning_rate": 4.735382729399184e-05, "loss": 1.4294, "step": 572 }, { "epoch": 0.5270177052195907, "grad_norm": 0.6632286906242371, "learning_rate": 4.720696676611589e-05, "loss": 1.4939, "step": 573 }, { "epoch": 0.5279374568866406, "grad_norm": 0.7253984808921814, "learning_rate": 4.706013040581242e-05, "loss": 1.4342, "step": 574 }, { "epoch": 0.5288572085536904, "grad_norm": 0.7158737778663635, "learning_rate": 4.691331948362789e-05, "loss": 1.4718, "step": 575 }, { "epoch": 0.5297769602207404, "grad_norm": 0.6117165088653564, "learning_rate": 4.676653526988858e-05, "loss": 1.4828, "step": 576 }, { "epoch": 0.5306967118877903, "grad_norm": 0.6031986474990845, "learning_rate": 4.661977903468974e-05, "loss": 1.4493, "step": 577 }, { "epoch": 0.5316164635548402, "grad_norm": 0.6613805890083313, "learning_rate": 4.647305204788445e-05, "loss": 1.4419, "step": 578 }, { "epoch": 0.5325362152218901, "grad_norm": 0.6349487900733948, "learning_rate": 4.632635557907277e-05, "loss": 1.4213, "step": 579 }, { "epoch": 0.53345596688894, "grad_norm": 0.5844326019287109, "learning_rate": 4.617969089759066e-05, "loss": 1.4505, "step": 580 }, { "epoch": 0.5343757185559899, "grad_norm": 0.7105299234390259, "learning_rate": 4.603305927249902e-05, "loss": 1.3974, "step": 581 }, { "epoch": 0.5352954702230398, "grad_norm": 0.7277695536613464, "learning_rate": 4.588646197257277e-05, "loss": 1.371, "step": 582 }, { "epoch": 0.5362152218900896, "grad_norm": 0.6246547698974609, "learning_rate": 4.5739900266289756e-05, "loss": 1.3747, "step": 583 }, { "epoch": 0.5371349735571396, "grad_norm": 0.918038547039032, "learning_rate": 4.559337542181993e-05, "loss": 1.3068, "step": 584 }, { "epoch": 0.5380547252241895, "grad_norm": 0.7304350733757019, "learning_rate": 4.544688870701415e-05, "loss": 1.3496, "step": 585 }, { "epoch": 0.5389744768912393, "grad_norm": 0.6852339506149292, "learning_rate": 4.53004413893935e-05, "loss": 1.3327, "step": 586 }, { "epoch": 0.5398942285582893, "grad_norm": 0.7337968349456787, "learning_rate": 4.515403473613803e-05, "loss": 1.3756, "step": 587 }, { "epoch": 0.5408139802253391, "grad_norm": 0.7710087895393372, "learning_rate": 4.5007670014076045e-05, "loss": 1.3611, "step": 588 }, { "epoch": 0.541733731892389, "grad_norm": 0.6107405424118042, "learning_rate": 4.486134848967292e-05, "loss": 1.312, "step": 589 }, { "epoch": 0.542653483559439, "grad_norm": 0.7013472318649292, "learning_rate": 4.471507142902036e-05, "loss": 1.3194, "step": 590 }, { "epoch": 0.5435732352264888, "grad_norm": 0.8323330283164978, "learning_rate": 4.4568840097825226e-05, "loss": 1.2888, "step": 591 }, { "epoch": 0.5444929868935388, "grad_norm": 0.6520772576332092, "learning_rate": 4.442265576139878e-05, "loss": 1.2347, "step": 592 }, { "epoch": 0.5454127385605887, "grad_norm": 0.7573135495185852, "learning_rate": 4.4276519684645585e-05, "loss": 1.316, "step": 593 }, { "epoch": 0.5463324902276385, "grad_norm": 0.7183561325073242, "learning_rate": 4.4130433132052664e-05, "loss": 1.2999, "step": 594 }, { "epoch": 0.5472522418946885, "grad_norm": 0.8150544762611389, "learning_rate": 4.398439736767847e-05, "loss": 1.2111, "step": 595 }, { "epoch": 0.5481719935617383, "grad_norm": 0.8062061071395874, "learning_rate": 4.383841365514208e-05, "loss": 1.2231, "step": 596 }, { "epoch": 0.5490917452287882, "grad_norm": 0.8277079463005066, "learning_rate": 4.369248325761205e-05, "loss": 1.2266, "step": 597 }, { "epoch": 0.5500114968958382, "grad_norm": 1.1290823221206665, "learning_rate": 4.354660743779574e-05, "loss": 1.1825, "step": 598 }, { "epoch": 0.550931248562888, "grad_norm": 1.0019193887710571, "learning_rate": 4.340078745792818e-05, "loss": 1.103, "step": 599 }, { "epoch": 0.5518510002299379, "grad_norm": 1.0963555574417114, "learning_rate": 4.325502457976126e-05, "loss": 1.031, "step": 600 }, { "epoch": 0.5518510002299379, "eval_loss": 1.4310619831085205, "eval_runtime": 49.9435, "eval_samples_per_second": 164.986, "eval_steps_per_second": 20.623, "step": 600 }, { "epoch": 0.5527707518969878, "grad_norm": 1.6411505937576294, "learning_rate": 4.310932006455276e-05, "loss": 1.6187, "step": 601 }, { "epoch": 0.5536905035640377, "grad_norm": 1.455959677696228, "learning_rate": 4.296367517305549e-05, "loss": 1.5665, "step": 602 }, { "epoch": 0.5546102552310876, "grad_norm": 1.3301597833633423, "learning_rate": 4.281809116550629e-05, "loss": 1.5417, "step": 603 }, { "epoch": 0.5555300068981375, "grad_norm": 1.0796560049057007, "learning_rate": 4.267256930161523e-05, "loss": 1.5482, "step": 604 }, { "epoch": 0.5564497585651874, "grad_norm": 0.842844545841217, "learning_rate": 4.252711084055467e-05, "loss": 1.4583, "step": 605 }, { "epoch": 0.5573695102322372, "grad_norm": 0.7908689379692078, "learning_rate": 4.2381717040948325e-05, "loss": 1.4621, "step": 606 }, { "epoch": 0.5582892618992872, "grad_norm": 0.9240807890892029, "learning_rate": 4.223638916086043e-05, "loss": 1.4843, "step": 607 }, { "epoch": 0.5592090135663371, "grad_norm": 0.9389266967773438, "learning_rate": 4.209112845778481e-05, "loss": 1.4186, "step": 608 }, { "epoch": 0.560128765233387, "grad_norm": 0.7683906555175781, "learning_rate": 4.194593618863404e-05, "loss": 1.4541, "step": 609 }, { "epoch": 0.5610485169004369, "grad_norm": 0.6913854479789734, "learning_rate": 4.1800813609728526e-05, "loss": 1.4815, "step": 610 }, { "epoch": 0.5619682685674868, "grad_norm": 0.7714055776596069, "learning_rate": 4.1655761976785705e-05, "loss": 1.4577, "step": 611 }, { "epoch": 0.5628880202345367, "grad_norm": 0.7735984921455383, "learning_rate": 4.1510782544909075e-05, "loss": 1.5057, "step": 612 }, { "epoch": 0.5638077719015866, "grad_norm": 0.8532646298408508, "learning_rate": 4.136587656857744e-05, "loss": 1.4917, "step": 613 }, { "epoch": 0.5647275235686364, "grad_norm": 0.7896936535835266, "learning_rate": 4.122104530163397e-05, "loss": 1.5009, "step": 614 }, { "epoch": 0.5656472752356864, "grad_norm": 0.6928205490112305, "learning_rate": 4.107628999727542e-05, "loss": 1.4733, "step": 615 }, { "epoch": 0.5665670269027363, "grad_norm": 0.728251576423645, "learning_rate": 4.09316119080412e-05, "loss": 1.4508, "step": 616 }, { "epoch": 0.5674867785697861, "grad_norm": 0.6070961356163025, "learning_rate": 4.078701228580269e-05, "loss": 1.5002, "step": 617 }, { "epoch": 0.5684065302368361, "grad_norm": 0.7009554505348206, "learning_rate": 4.064249238175223e-05, "loss": 1.5289, "step": 618 }, { "epoch": 0.5693262819038859, "grad_norm": 0.6865770220756531, "learning_rate": 4.0498053446392403e-05, "loss": 1.4876, "step": 619 }, { "epoch": 0.5702460335709358, "grad_norm": 0.6156379580497742, "learning_rate": 4.035369672952516e-05, "loss": 1.4032, "step": 620 }, { "epoch": 0.5711657852379858, "grad_norm": 0.5818307995796204, "learning_rate": 4.020942348024108e-05, "loss": 1.4421, "step": 621 }, { "epoch": 0.5720855369050356, "grad_norm": 0.5913554430007935, "learning_rate": 4.0065234946908456e-05, "loss": 1.4527, "step": 622 }, { "epoch": 0.5730052885720855, "grad_norm": 0.5924707651138306, "learning_rate": 3.992113237716261e-05, "loss": 1.4692, "step": 623 }, { "epoch": 0.5739250402391355, "grad_norm": 0.6369109749794006, "learning_rate": 3.977711701789499e-05, "loss": 1.4541, "step": 624 }, { "epoch": 0.5748447919061853, "grad_norm": 0.5432732701301575, "learning_rate": 3.9633190115242456e-05, "loss": 1.3981, "step": 625 }, { "epoch": 0.5757645435732353, "grad_norm": 0.6044031977653503, "learning_rate": 3.948935291457644e-05, "loss": 1.4086, "step": 626 }, { "epoch": 0.5766842952402851, "grad_norm": 0.5974178314208984, "learning_rate": 3.934560666049226e-05, "loss": 1.448, "step": 627 }, { "epoch": 0.577604046907335, "grad_norm": 0.6302614212036133, "learning_rate": 3.920195259679822e-05, "loss": 1.4095, "step": 628 }, { "epoch": 0.578523798574385, "grad_norm": 0.6615459322929382, "learning_rate": 3.905839196650493e-05, "loss": 1.5048, "step": 629 }, { "epoch": 0.5794435502414348, "grad_norm": 0.5650434494018555, "learning_rate": 3.8914926011814626e-05, "loss": 1.4093, "step": 630 }, { "epoch": 0.5803633019084847, "grad_norm": 0.5881006121635437, "learning_rate": 3.8771555974110194e-05, "loss": 1.3783, "step": 631 }, { "epoch": 0.5812830535755346, "grad_norm": 0.6607415676116943, "learning_rate": 3.8628283093944686e-05, "loss": 1.4406, "step": 632 }, { "epoch": 0.5822028052425845, "grad_norm": 0.6574285626411438, "learning_rate": 3.8485108611030415e-05, "loss": 1.3927, "step": 633 }, { "epoch": 0.5831225569096344, "grad_norm": 0.7541502714157104, "learning_rate": 3.834203376422831e-05, "loss": 1.374, "step": 634 }, { "epoch": 0.5840423085766843, "grad_norm": 0.6834109425544739, "learning_rate": 3.81990597915371e-05, "loss": 1.3459, "step": 635 }, { "epoch": 0.5849620602437342, "grad_norm": 0.649935781955719, "learning_rate": 3.805618793008279e-05, "loss": 1.3314, "step": 636 }, { "epoch": 0.585881811910784, "grad_norm": 0.6892503499984741, "learning_rate": 3.7913419416107694e-05, "loss": 1.3958, "step": 637 }, { "epoch": 0.586801563577834, "grad_norm": 0.6689726710319519, "learning_rate": 3.7770755484960004e-05, "loss": 1.3384, "step": 638 }, { "epoch": 0.5877213152448839, "grad_norm": 0.5913270711898804, "learning_rate": 3.762819737108291e-05, "loss": 1.3169, "step": 639 }, { "epoch": 0.5886410669119337, "grad_norm": 0.6090061068534851, "learning_rate": 3.748574630800401e-05, "loss": 1.2413, "step": 640 }, { "epoch": 0.5895608185789837, "grad_norm": 0.7058801651000977, "learning_rate": 3.734340352832457e-05, "loss": 1.289, "step": 641 }, { "epoch": 0.5904805702460336, "grad_norm": 0.7695034146308899, "learning_rate": 3.7201170263709e-05, "loss": 1.3332, "step": 642 }, { "epoch": 0.5914003219130834, "grad_norm": 0.6559154987335205, "learning_rate": 3.705904774487396e-05, "loss": 1.2992, "step": 643 }, { "epoch": 0.5923200735801334, "grad_norm": 0.7140766382217407, "learning_rate": 3.691703720157798e-05, "loss": 1.2247, "step": 644 }, { "epoch": 0.5932398252471832, "grad_norm": 0.7867764830589294, "learning_rate": 3.6775139862610574e-05, "loss": 1.2409, "step": 645 }, { "epoch": 0.5941595769142332, "grad_norm": 0.9307761788368225, "learning_rate": 3.663335695578183e-05, "loss": 1.1696, "step": 646 }, { "epoch": 0.5950793285812831, "grad_norm": 0.8968107104301453, "learning_rate": 3.649168970791157e-05, "loss": 1.1511, "step": 647 }, { "epoch": 0.5959990802483329, "grad_norm": 0.9723992943763733, "learning_rate": 3.635013934481895e-05, "loss": 1.1133, "step": 648 }, { "epoch": 0.5969188319153829, "grad_norm": 1.1764365434646606, "learning_rate": 3.6208707091311626e-05, "loss": 1.1247, "step": 649 }, { "epoch": 0.5978385835824327, "grad_norm": 1.0631630420684814, "learning_rate": 3.6067394171175394e-05, "loss": 1.0094, "step": 650 }, { "epoch": 0.5987583352494826, "grad_norm": 1.4610891342163086, "learning_rate": 3.592620180716338e-05, "loss": 1.635, "step": 651 }, { "epoch": 0.5996780869165326, "grad_norm": 1.4560317993164062, "learning_rate": 3.578513122098566e-05, "loss": 1.5683, "step": 652 }, { "epoch": 0.6005978385835824, "grad_norm": 1.250054955482483, "learning_rate": 3.564418363329848e-05, "loss": 1.4994, "step": 653 }, { "epoch": 0.6015175902506323, "grad_norm": 1.0758668184280396, "learning_rate": 3.5503360263693886e-05, "loss": 1.4581, "step": 654 }, { "epoch": 0.6024373419176823, "grad_norm": 0.9774999022483826, "learning_rate": 3.5362662330689064e-05, "loss": 1.4609, "step": 655 }, { "epoch": 0.6033570935847321, "grad_norm": 0.8008742332458496, "learning_rate": 3.52220910517158e-05, "loss": 1.4672, "step": 656 }, { "epoch": 0.604276845251782, "grad_norm": 0.7127364873886108, "learning_rate": 3.5081647643110024e-05, "loss": 1.4948, "step": 657 }, { "epoch": 0.6051965969188319, "grad_norm": 0.76557457447052, "learning_rate": 3.494133332010117e-05, "loss": 1.4609, "step": 658 }, { "epoch": 0.6061163485858818, "grad_norm": 0.8269351124763489, "learning_rate": 3.480114929680176e-05, "loss": 1.5268, "step": 659 }, { "epoch": 0.6070361002529318, "grad_norm": 0.810955286026001, "learning_rate": 3.466109678619681e-05, "loss": 1.523, "step": 660 }, { "epoch": 0.6079558519199816, "grad_norm": 0.6712583303451538, "learning_rate": 3.452117700013345e-05, "loss": 1.4676, "step": 661 }, { "epoch": 0.6088756035870315, "grad_norm": 0.828484058380127, "learning_rate": 3.43813911493103e-05, "loss": 1.5116, "step": 662 }, { "epoch": 0.6097953552540814, "grad_norm": 0.7789233922958374, "learning_rate": 3.424174044326711e-05, "loss": 1.445, "step": 663 }, { "epoch": 0.6107151069211313, "grad_norm": 0.7635114789009094, "learning_rate": 3.4102226090374246e-05, "loss": 1.5681, "step": 664 }, { "epoch": 0.6116348585881812, "grad_norm": 0.6956825256347656, "learning_rate": 3.3962849297822226e-05, "loss": 1.4877, "step": 665 }, { "epoch": 0.6125546102552311, "grad_norm": 0.6926284432411194, "learning_rate": 3.382361127161127e-05, "loss": 1.4282, "step": 666 }, { "epoch": 0.613474361922281, "grad_norm": 0.8702225089073181, "learning_rate": 3.368451321654091e-05, "loss": 1.4773, "step": 667 }, { "epoch": 0.6143941135893308, "grad_norm": 0.7277842164039612, "learning_rate": 3.35455563361995e-05, "loss": 1.3959, "step": 668 }, { "epoch": 0.6153138652563808, "grad_norm": 0.6363296508789062, "learning_rate": 3.340674183295389e-05, "loss": 1.4747, "step": 669 }, { "epoch": 0.6162336169234307, "grad_norm": 0.6425765156745911, "learning_rate": 3.326807090793891e-05, "loss": 1.4423, "step": 670 }, { "epoch": 0.6171533685904805, "grad_norm": 0.6721304059028625, "learning_rate": 3.312954476104709e-05, "loss": 1.4241, "step": 671 }, { "epoch": 0.6180731202575305, "grad_norm": 0.6218870878219604, "learning_rate": 3.299116459091816e-05, "loss": 1.4644, "step": 672 }, { "epoch": 0.6189928719245804, "grad_norm": 0.6951906681060791, "learning_rate": 3.2852931594928807e-05, "loss": 1.452, "step": 673 }, { "epoch": 0.6199126235916302, "grad_norm": 0.6208174824714661, "learning_rate": 3.271484696918218e-05, "loss": 1.415, "step": 674 }, { "epoch": 0.6208323752586802, "grad_norm": 0.5596356391906738, "learning_rate": 3.257691190849769e-05, "loss": 1.4708, "step": 675 }, { "epoch": 0.62175212692573, "grad_norm": 0.6394990682601929, "learning_rate": 3.243912760640054e-05, "loss": 1.4522, "step": 676 }, { "epoch": 0.62267187859278, "grad_norm": 0.6112094521522522, "learning_rate": 3.2301495255111425e-05, "loss": 1.3607, "step": 677 }, { "epoch": 0.6235916302598299, "grad_norm": 0.645779013633728, "learning_rate": 3.2164016045536304e-05, "loss": 1.4282, "step": 678 }, { "epoch": 0.6245113819268797, "grad_norm": 0.6169288754463196, "learning_rate": 3.202669116725598e-05, "loss": 1.4052, "step": 679 }, { "epoch": 0.6254311335939297, "grad_norm": 0.6002304553985596, "learning_rate": 3.188952180851589e-05, "loss": 1.419, "step": 680 }, { "epoch": 0.6263508852609795, "grad_norm": 0.6018975377082825, "learning_rate": 3.1752509156215734e-05, "loss": 1.3685, "step": 681 }, { "epoch": 0.6272706369280294, "grad_norm": 0.6559040546417236, "learning_rate": 3.1615654395899375e-05, "loss": 1.3657, "step": 682 }, { "epoch": 0.6281903885950794, "grad_norm": 0.6393570899963379, "learning_rate": 3.147895871174432e-05, "loss": 1.405, "step": 683 }, { "epoch": 0.6291101402621292, "grad_norm": 0.6094779968261719, "learning_rate": 3.134242328655175e-05, "loss": 1.3179, "step": 684 }, { "epoch": 0.6300298919291791, "grad_norm": 0.6581336855888367, "learning_rate": 3.120604930173608e-05, "loss": 1.3276, "step": 685 }, { "epoch": 0.6309496435962291, "grad_norm": 0.6599423289299011, "learning_rate": 3.106983793731484e-05, "loss": 1.2805, "step": 686 }, { "epoch": 0.6318693952632789, "grad_norm": 0.683204710483551, "learning_rate": 3.093379037189842e-05, "loss": 1.3557, "step": 687 }, { "epoch": 0.6327891469303288, "grad_norm": 0.6180110573768616, "learning_rate": 3.079790778267994e-05, "loss": 1.2668, "step": 688 }, { "epoch": 0.6337088985973787, "grad_norm": 0.7273058891296387, "learning_rate": 3.066219134542492e-05, "loss": 1.2852, "step": 689 }, { "epoch": 0.6346286502644286, "grad_norm": 0.6892321705818176, "learning_rate": 3.052664223446131e-05, "loss": 1.2997, "step": 690 }, { "epoch": 0.6355484019314785, "grad_norm": 0.694174587726593, "learning_rate": 3.039126162266912e-05, "loss": 1.2398, "step": 691 }, { "epoch": 0.6364681535985284, "grad_norm": 0.7471473217010498, "learning_rate": 3.0256050681470444e-05, "loss": 1.1879, "step": 692 }, { "epoch": 0.6373879052655783, "grad_norm": 0.7812895178794861, "learning_rate": 3.012101058081919e-05, "loss": 1.2826, "step": 693 }, { "epoch": 0.6383076569326281, "grad_norm": 0.7405266761779785, "learning_rate": 2.998614248919107e-05, "loss": 1.1937, "step": 694 }, { "epoch": 0.6392274085996781, "grad_norm": 0.7346695065498352, "learning_rate": 2.9851447573573384e-05, "loss": 1.2364, "step": 695 }, { "epoch": 0.640147160266728, "grad_norm": 0.7376750707626343, "learning_rate": 2.971692699945502e-05, "loss": 1.222, "step": 696 }, { "epoch": 0.6410669119337778, "grad_norm": 0.7857553362846375, "learning_rate": 2.9582581930816288e-05, "loss": 1.1532, "step": 697 }, { "epoch": 0.6419866636008278, "grad_norm": 1.1139256954193115, "learning_rate": 2.9448413530118914e-05, "loss": 1.0823, "step": 698 }, { "epoch": 0.6429064152678776, "grad_norm": 0.9734514355659485, "learning_rate": 2.9314422958295907e-05, "loss": 1.0059, "step": 699 }, { "epoch": 0.6438261669349276, "grad_norm": 1.195755124092102, "learning_rate": 2.9180611374741623e-05, "loss": 1.0146, "step": 700 }, { "epoch": 0.6447459186019775, "grad_norm": 1.1521427631378174, "learning_rate": 2.9046979937301588e-05, "loss": 1.5188, "step": 701 }, { "epoch": 0.6456656702690273, "grad_norm": 1.0498712062835693, "learning_rate": 2.8913529802262617e-05, "loss": 1.5642, "step": 702 }, { "epoch": 0.6465854219360773, "grad_norm": 1.004340410232544, "learning_rate": 2.8780262124342755e-05, "loss": 1.4869, "step": 703 }, { "epoch": 0.6475051736031272, "grad_norm": 0.9507954716682434, "learning_rate": 2.8647178056681194e-05, "loss": 1.5128, "step": 704 }, { "epoch": 0.648424925270177, "grad_norm": 0.8366132974624634, "learning_rate": 2.8514278750828536e-05, "loss": 1.4907, "step": 705 }, { "epoch": 0.649344676937227, "grad_norm": 0.8227055072784424, "learning_rate": 2.838156535673652e-05, "loss": 1.5356, "step": 706 }, { "epoch": 0.6502644286042768, "grad_norm": 0.7174684405326843, "learning_rate": 2.8249039022748313e-05, "loss": 1.4349, "step": 707 }, { "epoch": 0.6511841802713267, "grad_norm": 0.6819536089897156, "learning_rate": 2.8116700895588472e-05, "loss": 1.4133, "step": 708 }, { "epoch": 0.6521039319383767, "grad_norm": 0.7197076082229614, "learning_rate": 2.7984552120353046e-05, "loss": 1.4284, "step": 709 }, { "epoch": 0.6530236836054265, "grad_norm": 0.7833074331283569, "learning_rate": 2.785259384049959e-05, "loss": 1.5066, "step": 710 }, { "epoch": 0.6539434352724764, "grad_norm": 0.7236879467964172, "learning_rate": 2.7720827197837472e-05, "loss": 1.3815, "step": 711 }, { "epoch": 0.6548631869395263, "grad_norm": 0.6463202238082886, "learning_rate": 2.7589253332517734e-05, "loss": 1.4513, "step": 712 }, { "epoch": 0.6557829386065762, "grad_norm": 0.7177314758300781, "learning_rate": 2.745787338302341e-05, "loss": 1.4443, "step": 713 }, { "epoch": 0.6567026902736262, "grad_norm": 0.7721028327941895, "learning_rate": 2.7326688486159613e-05, "loss": 1.4899, "step": 714 }, { "epoch": 0.657622441940676, "grad_norm": 0.6830793023109436, "learning_rate": 2.719569977704372e-05, "loss": 1.5052, "step": 715 }, { "epoch": 0.6585421936077259, "grad_norm": 0.6752369403839111, "learning_rate": 2.7064908389095468e-05, "loss": 1.5062, "step": 716 }, { "epoch": 0.6594619452747759, "grad_norm": 0.6267321109771729, "learning_rate": 2.693431545402732e-05, "loss": 1.5125, "step": 717 }, { "epoch": 0.6603816969418257, "grad_norm": 0.6160003542900085, "learning_rate": 2.6803922101834454e-05, "loss": 1.4609, "step": 718 }, { "epoch": 0.6613014486088756, "grad_norm": 0.5926380157470703, "learning_rate": 2.6673729460785176e-05, "loss": 1.415, "step": 719 }, { "epoch": 0.6622212002759255, "grad_norm": 0.6655170321464539, "learning_rate": 2.6543738657411034e-05, "loss": 1.372, "step": 720 }, { "epoch": 0.6631409519429754, "grad_norm": 0.6094529628753662, "learning_rate": 2.6413950816497147e-05, "loss": 1.4037, "step": 721 }, { "epoch": 0.6640607036100253, "grad_norm": 0.6568109393119812, "learning_rate": 2.6284367061072378e-05, "loss": 1.458, "step": 722 }, { "epoch": 0.6649804552770752, "grad_norm": 0.5817413330078125, "learning_rate": 2.615498851239978e-05, "loss": 1.4009, "step": 723 }, { "epoch": 0.6659002069441251, "grad_norm": 0.6216491460800171, "learning_rate": 2.6025816289966704e-05, "loss": 1.4178, "step": 724 }, { "epoch": 0.6668199586111749, "grad_norm": 0.6176545023918152, "learning_rate": 2.5896851511475186e-05, "loss": 1.4191, "step": 725 }, { "epoch": 0.6677397102782249, "grad_norm": 0.5803206562995911, "learning_rate": 2.576809529283241e-05, "loss": 1.415, "step": 726 }, { "epoch": 0.6686594619452748, "grad_norm": 0.5935968160629272, "learning_rate": 2.5639548748140802e-05, "loss": 1.3797, "step": 727 }, { "epoch": 0.6695792136123246, "grad_norm": 0.6356935501098633, "learning_rate": 2.5511212989688586e-05, "loss": 1.4948, "step": 728 }, { "epoch": 0.6704989652793746, "grad_norm": 0.5835620760917664, "learning_rate": 2.5383089127940086e-05, "loss": 1.4203, "step": 729 }, { "epoch": 0.6714187169464245, "grad_norm": 0.687403678894043, "learning_rate": 2.5255178271526137e-05, "loss": 1.3661, "step": 730 }, { "epoch": 0.6723384686134743, "grad_norm": 0.6388825178146362, "learning_rate": 2.51274815272344e-05, "loss": 1.4157, "step": 731 }, { "epoch": 0.6732582202805243, "grad_norm": 0.6280670762062073, "learning_rate": 2.500000000000001e-05, "loss": 1.3854, "step": 732 }, { "epoch": 0.6741779719475741, "grad_norm": 0.6690565943717957, "learning_rate": 2.4872734792895734e-05, "loss": 1.3974, "step": 733 }, { "epoch": 0.6750977236146241, "grad_norm": 0.6328375339508057, "learning_rate": 2.4745687007122636e-05, "loss": 1.3462, "step": 734 }, { "epoch": 0.676017475281674, "grad_norm": 0.6421682834625244, "learning_rate": 2.4618857742000463e-05, "loss": 1.2237, "step": 735 }, { "epoch": 0.6769372269487238, "grad_norm": 0.6286811828613281, "learning_rate": 2.4492248094958147e-05, "loss": 1.3481, "step": 736 }, { "epoch": 0.6778569786157738, "grad_norm": 0.61008220911026, "learning_rate": 2.4365859161524258e-05, "loss": 1.2088, "step": 737 }, { "epoch": 0.6787767302828236, "grad_norm": 0.6456345915794373, "learning_rate": 2.4239692035317678e-05, "loss": 1.1997, "step": 738 }, { "epoch": 0.6796964819498735, "grad_norm": 0.8082221746444702, "learning_rate": 2.411374780803793e-05, "loss": 1.2172, "step": 739 }, { "epoch": 0.6806162336169235, "grad_norm": 0.6706709861755371, "learning_rate": 2.3988027569455895e-05, "loss": 1.211, "step": 740 }, { "epoch": 0.6815359852839733, "grad_norm": 0.6545360088348389, "learning_rate": 2.3862532407404303e-05, "loss": 1.3001, "step": 741 }, { "epoch": 0.6824557369510232, "grad_norm": 0.8686853051185608, "learning_rate": 2.373726340776837e-05, "loss": 1.2328, "step": 742 }, { "epoch": 0.6833754886180731, "grad_norm": 0.668156087398529, "learning_rate": 2.361222165447628e-05, "loss": 1.2011, "step": 743 }, { "epoch": 0.684295240285123, "grad_norm": 0.685393750667572, "learning_rate": 2.348740822949006e-05, "loss": 1.2309, "step": 744 }, { "epoch": 0.685214991952173, "grad_norm": 0.6708635687828064, "learning_rate": 2.3362824212795898e-05, "loss": 1.1972, "step": 745 }, { "epoch": 0.6861347436192228, "grad_norm": 0.8381814360618591, "learning_rate": 2.3238470682395037e-05, "loss": 1.2545, "step": 746 }, { "epoch": 0.6870544952862727, "grad_norm": 0.7803678512573242, "learning_rate": 2.3114348714294354e-05, "loss": 1.1471, "step": 747 }, { "epoch": 0.6879742469533227, "grad_norm": 0.8974632024765015, "learning_rate": 2.2990459382497088e-05, "loss": 1.1145, "step": 748 }, { "epoch": 0.6888939986203725, "grad_norm": 1.0532459020614624, "learning_rate": 2.2866803758993445e-05, "loss": 1.0573, "step": 749 }, { "epoch": 0.6898137502874224, "grad_norm": 1.208759069442749, "learning_rate": 2.274338291375147e-05, "loss": 0.9195, "step": 750 }, { "epoch": 0.6898137502874224, "eval_loss": 1.3665193319320679, "eval_runtime": 50.0048, "eval_samples_per_second": 164.784, "eval_steps_per_second": 20.598, "step": 750 }, { "epoch": 0.6907335019544723, "grad_norm": 1.253531575202942, "learning_rate": 2.2620197914707718e-05, "loss": 1.602, "step": 751 }, { "epoch": 0.6916532536215222, "grad_norm": 1.2635823488235474, "learning_rate": 2.2497249827757933e-05, "loss": 1.5615, "step": 752 }, { "epoch": 0.6925730052885721, "grad_norm": 1.0416873693466187, "learning_rate": 2.2374539716748032e-05, "loss": 1.4779, "step": 753 }, { "epoch": 0.693492756955622, "grad_norm": 0.9805805087089539, "learning_rate": 2.225206864346465e-05, "loss": 1.4272, "step": 754 }, { "epoch": 0.6944125086226719, "grad_norm": 0.9023362398147583, "learning_rate": 2.2129837667626145e-05, "loss": 1.4208, "step": 755 }, { "epoch": 0.6953322602897217, "grad_norm": 1.0136377811431885, "learning_rate": 2.200784784687334e-05, "loss": 1.4692, "step": 756 }, { "epoch": 0.6962520119567717, "grad_norm": 0.9673015475273132, "learning_rate": 2.188610023676041e-05, "loss": 1.4966, "step": 757 }, { "epoch": 0.6971717636238216, "grad_norm": 0.8694583177566528, "learning_rate": 2.176459589074566e-05, "loss": 1.4035, "step": 758 }, { "epoch": 0.6980915152908714, "grad_norm": 0.7423250675201416, "learning_rate": 2.164333586018259e-05, "loss": 1.4623, "step": 759 }, { "epoch": 0.6990112669579214, "grad_norm": 0.7796162366867065, "learning_rate": 2.1522321194310574e-05, "loss": 1.466, "step": 760 }, { "epoch": 0.6999310186249713, "grad_norm": 0.9312780499458313, "learning_rate": 2.1401552940245962e-05, "loss": 1.3982, "step": 761 }, { "epoch": 0.7008507702920211, "grad_norm": 0.7841870784759521, "learning_rate": 2.1281032142972933e-05, "loss": 1.505, "step": 762 }, { "epoch": 0.7017705219590711, "grad_norm": 0.6561142206192017, "learning_rate": 2.1160759845334484e-05, "loss": 1.4446, "step": 763 }, { "epoch": 0.7026902736261209, "grad_norm": 0.6478760242462158, "learning_rate": 2.1040737088023323e-05, "loss": 1.4218, "step": 764 }, { "epoch": 0.7036100252931708, "grad_norm": 0.8280866146087646, "learning_rate": 2.0920964909573066e-05, "loss": 1.4915, "step": 765 }, { "epoch": 0.7045297769602208, "grad_norm": 0.8623349666595459, "learning_rate": 2.080144434634898e-05, "loss": 1.3761, "step": 766 }, { "epoch": 0.7054495286272706, "grad_norm": 0.7455824613571167, "learning_rate": 2.0682176432539246e-05, "loss": 1.39, "step": 767 }, { "epoch": 0.7063692802943206, "grad_norm": 0.6684551239013672, "learning_rate": 2.056316220014588e-05, "loss": 1.4599, "step": 768 }, { "epoch": 0.7072890319613704, "grad_norm": 0.6949120759963989, "learning_rate": 2.0444402678975877e-05, "loss": 1.4068, "step": 769 }, { "epoch": 0.7082087836284203, "grad_norm": 0.698066771030426, "learning_rate": 2.0325898896632177e-05, "loss": 1.4451, "step": 770 }, { "epoch": 0.7091285352954703, "grad_norm": 0.6923701167106628, "learning_rate": 2.0207651878505e-05, "loss": 1.4183, "step": 771 }, { "epoch": 0.7100482869625201, "grad_norm": 0.6396070718765259, "learning_rate": 2.0089662647762715e-05, "loss": 1.4079, "step": 772 }, { "epoch": 0.71096803862957, "grad_norm": 0.5608759522438049, "learning_rate": 1.997193222534316e-05, "loss": 1.3507, "step": 773 }, { "epoch": 0.7118877902966199, "grad_norm": 0.6374341249465942, "learning_rate": 1.9854461629944763e-05, "loss": 1.395, "step": 774 }, { "epoch": 0.7128075419636698, "grad_norm": 0.5628088116645813, "learning_rate": 1.9737251878017678e-05, "loss": 1.3779, "step": 775 }, { "epoch": 0.7137272936307197, "grad_norm": 0.6205474138259888, "learning_rate": 1.962030398375506e-05, "loss": 1.3974, "step": 776 }, { "epoch": 0.7146470452977696, "grad_norm": 0.5789771676063538, "learning_rate": 1.950361895908427e-05, "loss": 1.331, "step": 777 }, { "epoch": 0.7155667969648195, "grad_norm": 0.636550784111023, "learning_rate": 1.9387197813658092e-05, "loss": 1.3799, "step": 778 }, { "epoch": 0.7164865486318694, "grad_norm": 0.6165384650230408, "learning_rate": 1.927104155484602e-05, "loss": 1.3579, "step": 779 }, { "epoch": 0.7174063002989193, "grad_norm": 0.6170758008956909, "learning_rate": 1.9155151187725552e-05, "loss": 1.349, "step": 780 }, { "epoch": 0.7183260519659692, "grad_norm": 0.5404320359230042, "learning_rate": 1.9039527715073424e-05, "loss": 1.364, "step": 781 }, { "epoch": 0.719245803633019, "grad_norm": 0.5796113014221191, "learning_rate": 1.892417213735704e-05, "loss": 1.2893, "step": 782 }, { "epoch": 0.720165555300069, "grad_norm": 0.6280906796455383, "learning_rate": 1.8809085452725746e-05, "loss": 1.3598, "step": 783 }, { "epoch": 0.7210853069671189, "grad_norm": 0.6569982171058655, "learning_rate": 1.8694268657002194e-05, "loss": 1.3006, "step": 784 }, { "epoch": 0.7220050586341688, "grad_norm": 0.6892338991165161, "learning_rate": 1.8579722743673773e-05, "loss": 1.3557, "step": 785 }, { "epoch": 0.7229248103012187, "grad_norm": 0.6984684467315674, "learning_rate": 1.8465448703883958e-05, "loss": 1.3506, "step": 786 }, { "epoch": 0.7238445619682685, "grad_norm": 0.65283203125, "learning_rate": 1.8351447526423727e-05, "loss": 1.3009, "step": 787 }, { "epoch": 0.7247643136353185, "grad_norm": 0.7025482654571533, "learning_rate": 1.8237720197723075e-05, "loss": 1.1886, "step": 788 }, { "epoch": 0.7256840653023684, "grad_norm": 0.6791706085205078, "learning_rate": 1.812426770184243e-05, "loss": 1.2081, "step": 789 }, { "epoch": 0.7266038169694182, "grad_norm": 0.6996423602104187, "learning_rate": 1.801109102046414e-05, "loss": 1.2468, "step": 790 }, { "epoch": 0.7275235686364682, "grad_norm": 0.722210705280304, "learning_rate": 1.7898191132883968e-05, "loss": 1.196, "step": 791 }, { "epoch": 0.7284433203035181, "grad_norm": 0.6527461409568787, "learning_rate": 1.7785569016002685e-05, "loss": 1.2516, "step": 792 }, { "epoch": 0.7293630719705679, "grad_norm": 0.6403821110725403, "learning_rate": 1.7673225644317486e-05, "loss": 1.1883, "step": 793 }, { "epoch": 0.7302828236376179, "grad_norm": 0.7447903156280518, "learning_rate": 1.7561161989913698e-05, "loss": 1.2232, "step": 794 }, { "epoch": 0.7312025753046677, "grad_norm": 0.8253830671310425, "learning_rate": 1.7449379022456295e-05, "loss": 1.2144, "step": 795 }, { "epoch": 0.7321223269717176, "grad_norm": 0.8268104791641235, "learning_rate": 1.7337877709181526e-05, "loss": 1.1443, "step": 796 }, { "epoch": 0.7330420786387676, "grad_norm": 0.8768870830535889, "learning_rate": 1.7226659014888546e-05, "loss": 1.0736, "step": 797 }, { "epoch": 0.7339618303058174, "grad_norm": 0.8852882981300354, "learning_rate": 1.711572390193102e-05, "loss": 1.1051, "step": 798 }, { "epoch": 0.7348815819728673, "grad_norm": 1.0162791013717651, "learning_rate": 1.7005073330208883e-05, "loss": 1.0043, "step": 799 }, { "epoch": 0.7358013336399172, "grad_norm": 1.2660006284713745, "learning_rate": 1.689470825715998e-05, "loss": 1.0243, "step": 800 }, { "epoch": 0.7367210853069671, "grad_norm": 1.007739543914795, "learning_rate": 1.6784629637751815e-05, "loss": 1.5297, "step": 801 }, { "epoch": 0.7376408369740171, "grad_norm": 0.9282512664794922, "learning_rate": 1.6674838424473173e-05, "loss": 1.5234, "step": 802 }, { "epoch": 0.7385605886410669, "grad_norm": 0.8745155334472656, "learning_rate": 1.656533556732611e-05, "loss": 1.4494, "step": 803 }, { "epoch": 0.7394803403081168, "grad_norm": 0.941735565662384, "learning_rate": 1.6456122013817476e-05, "loss": 1.5395, "step": 804 }, { "epoch": 0.7404000919751667, "grad_norm": 0.9213740825653076, "learning_rate": 1.6347198708950882e-05, "loss": 1.4104, "step": 805 }, { "epoch": 0.7413198436422166, "grad_norm": 0.8986393809318542, "learning_rate": 1.6238566595218473e-05, "loss": 1.4004, "step": 806 }, { "epoch": 0.7422395953092665, "grad_norm": 1.212737798690796, "learning_rate": 1.6130226612592786e-05, "loss": 1.4478, "step": 807 }, { "epoch": 0.7431593469763164, "grad_norm": 0.8150504231452942, "learning_rate": 1.6022179698518523e-05, "loss": 1.4197, "step": 808 }, { "epoch": 0.7440790986433663, "grad_norm": 0.7515584826469421, "learning_rate": 1.591442678790467e-05, "loss": 1.454, "step": 809 }, { "epoch": 0.7449988503104162, "grad_norm": 0.6738887429237366, "learning_rate": 1.5806968813116107e-05, "loss": 1.46, "step": 810 }, { "epoch": 0.7459186019774661, "grad_norm": 0.8340874314308167, "learning_rate": 1.5699806703965787e-05, "loss": 1.4261, "step": 811 }, { "epoch": 0.746838353644516, "grad_norm": 0.7794579863548279, "learning_rate": 1.559294138770656e-05, "loss": 1.4964, "step": 812 }, { "epoch": 0.7477581053115658, "grad_norm": 0.7533066868782043, "learning_rate": 1.5486373789023205e-05, "loss": 1.4325, "step": 813 }, { "epoch": 0.7486778569786158, "grad_norm": 0.643245279788971, "learning_rate": 1.538010483002435e-05, "loss": 1.4201, "step": 814 }, { "epoch": 0.7495976086456657, "grad_norm": 0.6805441379547119, "learning_rate": 1.5274135430234654e-05, "loss": 1.4768, "step": 815 }, { "epoch": 0.7505173603127155, "grad_norm": 0.7012439966201782, "learning_rate": 1.5168466506586654e-05, "loss": 1.3795, "step": 816 }, { "epoch": 0.7514371119797655, "grad_norm": 0.6986867189407349, "learning_rate": 1.506309897341297e-05, "loss": 1.3924, "step": 817 }, { "epoch": 0.7523568636468153, "grad_norm": 0.7575457692146301, "learning_rate": 1.495803374243835e-05, "loss": 1.4462, "step": 818 }, { "epoch": 0.7532766153138652, "grad_norm": 0.6013389229774475, "learning_rate": 1.4853271722771772e-05, "loss": 1.3786, "step": 819 }, { "epoch": 0.7541963669809152, "grad_norm": 0.596037745475769, "learning_rate": 1.4748813820898554e-05, "loss": 1.3483, "step": 820 }, { "epoch": 0.755116118647965, "grad_norm": 0.6031373739242554, "learning_rate": 1.4644660940672627e-05, "loss": 1.364, "step": 821 }, { "epoch": 0.756035870315015, "grad_norm": 0.6841591000556946, "learning_rate": 1.4540813983308548e-05, "loss": 1.4468, "step": 822 }, { "epoch": 0.7569556219820649, "grad_norm": 0.7204717993736267, "learning_rate": 1.4437273847373777e-05, "loss": 1.3843, "step": 823 }, { "epoch": 0.7578753736491147, "grad_norm": 0.6169053912162781, "learning_rate": 1.4334041428781003e-05, "loss": 1.3776, "step": 824 }, { "epoch": 0.7587951253161647, "grad_norm": 0.5684770941734314, "learning_rate": 1.4231117620780188e-05, "loss": 1.4011, "step": 825 }, { "epoch": 0.7597148769832145, "grad_norm": 0.5605279207229614, "learning_rate": 1.4128503313951009e-05, "loss": 1.4227, "step": 826 }, { "epoch": 0.7606346286502644, "grad_norm": 0.6137314438819885, "learning_rate": 1.4026199396195077e-05, "loss": 1.4014, "step": 827 }, { "epoch": 0.7615543803173144, "grad_norm": 0.6102471351623535, "learning_rate": 1.3924206752728281e-05, "loss": 1.2759, "step": 828 }, { "epoch": 0.7624741319843642, "grad_norm": 0.6177085638046265, "learning_rate": 1.3822526266073043e-05, "loss": 1.3204, "step": 829 }, { "epoch": 0.7633938836514141, "grad_norm": 0.5692439675331116, "learning_rate": 1.3721158816050873e-05, "loss": 1.3467, "step": 830 }, { "epoch": 0.764313635318464, "grad_norm": 0.6170715689659119, "learning_rate": 1.362010527977453e-05, "loss": 1.2864, "step": 831 }, { "epoch": 0.7652333869855139, "grad_norm": 0.6100102066993713, "learning_rate": 1.3519366531640587e-05, "loss": 1.331, "step": 832 }, { "epoch": 0.7661531386525638, "grad_norm": 0.6240009069442749, "learning_rate": 1.3418943443321807e-05, "loss": 1.2976, "step": 833 }, { "epoch": 0.7670728903196137, "grad_norm": 0.5838286876678467, "learning_rate": 1.3318836883759634e-05, "loss": 1.2843, "step": 834 }, { "epoch": 0.7679926419866636, "grad_norm": 0.6636451482772827, "learning_rate": 1.3219047719156575e-05, "loss": 1.2261, "step": 835 }, { "epoch": 0.7689123936537134, "grad_norm": 0.6104261875152588, "learning_rate": 1.3119576812968892e-05, "loss": 1.2723, "step": 836 }, { "epoch": 0.7698321453207634, "grad_norm": 0.7110616564750671, "learning_rate": 1.3020425025898925e-05, "loss": 1.295, "step": 837 }, { "epoch": 0.7707518969878133, "grad_norm": 0.6308919191360474, "learning_rate": 1.292159321588778e-05, "loss": 1.225, "step": 838 }, { "epoch": 0.7716716486548632, "grad_norm": 0.6422338485717773, "learning_rate": 1.2823082238107858e-05, "loss": 1.2812, "step": 839 }, { "epoch": 0.7725914003219131, "grad_norm": 0.7281700372695923, "learning_rate": 1.272489294495548e-05, "loss": 1.2313, "step": 840 }, { "epoch": 0.773511151988963, "grad_norm": 0.6761153340339661, "learning_rate": 1.2627026186043422e-05, "loss": 1.2118, "step": 841 }, { "epoch": 0.7744309036560129, "grad_norm": 0.6714473366737366, "learning_rate": 1.2529482808193749e-05, "loss": 1.2265, "step": 842 }, { "epoch": 0.7753506553230628, "grad_norm": 0.6813847422599792, "learning_rate": 1.243226365543026e-05, "loss": 1.2408, "step": 843 }, { "epoch": 0.7762704069901126, "grad_norm": 0.6646814346313477, "learning_rate": 1.233536956897136e-05, "loss": 1.1755, "step": 844 }, { "epoch": 0.7771901586571626, "grad_norm": 0.6985054612159729, "learning_rate": 1.2238801387222714e-05, "loss": 1.155, "step": 845 }, { "epoch": 0.7781099103242125, "grad_norm": 0.6989067196846008, "learning_rate": 1.2142559945769993e-05, "loss": 1.1747, "step": 846 }, { "epoch": 0.7790296619912623, "grad_norm": 0.8439406156539917, "learning_rate": 1.2046646077371615e-05, "loss": 1.1648, "step": 847 }, { "epoch": 0.7799494136583123, "grad_norm": 0.8463898301124573, "learning_rate": 1.1951060611951615e-05, "loss": 1.1043, "step": 848 }, { "epoch": 0.7808691653253621, "grad_norm": 0.9298079013824463, "learning_rate": 1.185580437659241e-05, "loss": 1.0148, "step": 849 }, { "epoch": 0.781788916992412, "grad_norm": 1.260094404220581, "learning_rate": 1.1760878195527642e-05, "loss": 0.9653, "step": 850 }, { "epoch": 0.782708668659462, "grad_norm": 1.080349326133728, "learning_rate": 1.1666282890135082e-05, "loss": 1.4973, "step": 851 }, { "epoch": 0.7836284203265118, "grad_norm": 1.0160036087036133, "learning_rate": 1.1572019278929458e-05, "loss": 1.4835, "step": 852 }, { "epoch": 0.7845481719935617, "grad_norm": 1.0411534309387207, "learning_rate": 1.1478088177555441e-05, "loss": 1.4388, "step": 853 }, { "epoch": 0.7854679236606117, "grad_norm": 0.8667961359024048, "learning_rate": 1.1384490398780562e-05, "loss": 1.4592, "step": 854 }, { "epoch": 0.7863876753276615, "grad_norm": 0.7747707366943359, "learning_rate": 1.129122675248816e-05, "loss": 1.4124, "step": 855 }, { "epoch": 0.7873074269947115, "grad_norm": 0.9287156462669373, "learning_rate": 1.1198298045670402e-05, "loss": 1.4827, "step": 856 }, { "epoch": 0.7882271786617613, "grad_norm": 1.0620696544647217, "learning_rate": 1.1105705082421303e-05, "loss": 1.4392, "step": 857 }, { "epoch": 0.7891469303288112, "grad_norm": 1.099214792251587, "learning_rate": 1.1013448663929705e-05, "loss": 1.4812, "step": 858 }, { "epoch": 0.7900666819958612, "grad_norm": 0.9307000637054443, "learning_rate": 1.0921529588472445e-05, "loss": 1.4939, "step": 859 }, { "epoch": 0.790986433662911, "grad_norm": 0.7514574527740479, "learning_rate": 1.0829948651407374e-05, "loss": 1.4117, "step": 860 }, { "epoch": 0.7919061853299609, "grad_norm": 0.6653128862380981, "learning_rate": 1.0738706645166508e-05, "loss": 1.4885, "step": 861 }, { "epoch": 0.7928259369970108, "grad_norm": 0.7091299295425415, "learning_rate": 1.0647804359249142e-05, "loss": 1.4785, "step": 862 }, { "epoch": 0.7937456886640607, "grad_norm": 0.7756891250610352, "learning_rate": 1.0557242580215066e-05, "loss": 1.499, "step": 863 }, { "epoch": 0.7946654403311106, "grad_norm": 0.7706134915351868, "learning_rate": 1.0467022091677691e-05, "loss": 1.3828, "step": 864 }, { "epoch": 0.7955851919981605, "grad_norm": 0.6963340044021606, "learning_rate": 1.037714367429734e-05, "loss": 1.415, "step": 865 }, { "epoch": 0.7965049436652104, "grad_norm": 0.683591365814209, "learning_rate": 1.0287608105774454e-05, "loss": 1.4614, "step": 866 }, { "epoch": 0.7974246953322602, "grad_norm": 0.6579643487930298, "learning_rate": 1.019841616084286e-05, "loss": 1.4229, "step": 867 }, { "epoch": 0.7983444469993102, "grad_norm": 0.655005156993866, "learning_rate": 1.0109568611263093e-05, "loss": 1.3674, "step": 868 }, { "epoch": 0.7992641986663601, "grad_norm": 0.6061270236968994, "learning_rate": 1.0021066225815689e-05, "loss": 1.4522, "step": 869 }, { "epoch": 0.8001839503334099, "grad_norm": 0.6729152798652649, "learning_rate": 9.932909770294541e-06, "loss": 1.3665, "step": 870 }, { "epoch": 0.8011037020004599, "grad_norm": 0.6866083145141602, "learning_rate": 9.84510000750029e-06, "loss": 1.341, "step": 871 }, { "epoch": 0.8020234536675098, "grad_norm": 0.6673592329025269, "learning_rate": 9.757637697233723e-06, "loss": 1.4353, "step": 872 }, { "epoch": 0.8029432053345597, "grad_norm": 0.6237421035766602, "learning_rate": 9.670523596289138e-06, "loss": 1.4077, "step": 873 }, { "epoch": 0.8038629570016096, "grad_norm": 0.6855435967445374, "learning_rate": 9.583758458447927e-06, "loss": 1.4204, "step": 874 }, { "epoch": 0.8047827086686594, "grad_norm": 0.6294743418693542, "learning_rate": 9.497343034471895e-06, "loss": 1.4306, "step": 875 }, { "epoch": 0.8057024603357094, "grad_norm": 0.5920624136924744, "learning_rate": 9.41127807209688e-06, "loss": 1.4342, "step": 876 }, { "epoch": 0.8066222120027593, "grad_norm": 0.5831781625747681, "learning_rate": 9.325564316026237e-06, "loss": 1.3581, "step": 877 }, { "epoch": 0.8075419636698091, "grad_norm": 0.6441843509674072, "learning_rate": 9.240202507924412e-06, "loss": 1.3834, "step": 878 }, { "epoch": 0.8084617153368591, "grad_norm": 0.8426811099052429, "learning_rate": 9.155193386410465e-06, "loss": 1.4059, "step": 879 }, { "epoch": 0.8093814670039089, "grad_norm": 0.7335101366043091, "learning_rate": 9.070537687051817e-06, "loss": 1.3253, "step": 880 }, { "epoch": 0.8103012186709588, "grad_norm": 0.6380130052566528, "learning_rate": 8.986236142357708e-06, "loss": 1.368, "step": 881 }, { "epoch": 0.8112209703380088, "grad_norm": 0.6573965549468994, "learning_rate": 8.902289481772997e-06, "loss": 1.2883, "step": 882 }, { "epoch": 0.8121407220050586, "grad_norm": 0.658258855342865, "learning_rate": 8.818698431671773e-06, "loss": 1.3068, "step": 883 }, { "epoch": 0.8130604736721085, "grad_norm": 0.5781223773956299, "learning_rate": 8.735463715351139e-06, "loss": 1.2877, "step": 884 }, { "epoch": 0.8139802253391585, "grad_norm": 0.7181767225265503, "learning_rate": 8.652586053024836e-06, "loss": 1.2878, "step": 885 }, { "epoch": 0.8148999770062083, "grad_norm": 0.6754813194274902, "learning_rate": 8.570066161817176e-06, "loss": 1.2296, "step": 886 }, { "epoch": 0.8158197286732582, "grad_norm": 0.655967652797699, "learning_rate": 8.487904755756677e-06, "loss": 1.2901, "step": 887 }, { "epoch": 0.8167394803403081, "grad_norm": 0.6471141576766968, "learning_rate": 8.406102545769989e-06, "loss": 1.1674, "step": 888 }, { "epoch": 0.817659232007358, "grad_norm": 0.615079939365387, "learning_rate": 8.324660239675696e-06, "loss": 1.2264, "step": 889 }, { "epoch": 0.818578983674408, "grad_norm": 0.671017587184906, "learning_rate": 8.243578542178226e-06, "loss": 1.2746, "step": 890 }, { "epoch": 0.8194987353414578, "grad_norm": 0.6405725479125977, "learning_rate": 8.16285815486168e-06, "loss": 1.26, "step": 891 }, { "epoch": 0.8204184870085077, "grad_norm": 0.7116778492927551, "learning_rate": 8.082499776183883e-06, "loss": 1.2526, "step": 892 }, { "epoch": 0.8213382386755576, "grad_norm": 0.6701216697692871, "learning_rate": 8.002504101470204e-06, "loss": 1.1883, "step": 893 }, { "epoch": 0.8222579903426075, "grad_norm": 0.7331655025482178, "learning_rate": 7.92287182290764e-06, "loss": 1.2322, "step": 894 }, { "epoch": 0.8231777420096574, "grad_norm": 0.7266958951950073, "learning_rate": 7.843603629538804e-06, "loss": 1.1902, "step": 895 }, { "epoch": 0.8240974936767073, "grad_norm": 0.7101981043815613, "learning_rate": 7.764700207255903e-06, "loss": 1.0998, "step": 896 }, { "epoch": 0.8250172453437572, "grad_norm": 0.7413234114646912, "learning_rate": 7.686162238794897e-06, "loss": 1.1047, "step": 897 }, { "epoch": 0.825936997010807, "grad_norm": 0.8715062141418457, "learning_rate": 7.607990403729526e-06, "loss": 1.1146, "step": 898 }, { "epoch": 0.826856748677857, "grad_norm": 0.9183730483055115, "learning_rate": 7.5301853784654595e-06, "loss": 1.0057, "step": 899 }, { "epoch": 0.8277765003449069, "grad_norm": 1.0864571332931519, "learning_rate": 7.452747836234392e-06, "loss": 0.978, "step": 900 }, { "epoch": 0.8277765003449069, "eval_loss": 1.3344465494155884, "eval_runtime": 49.9437, "eval_samples_per_second": 164.986, "eval_steps_per_second": 20.623, "step": 900 }, { "epoch": 0.8286962520119567, "grad_norm": 0.8766337037086487, "learning_rate": 7.375678447088347e-06, "loss": 1.5154, "step": 901 }, { "epoch": 0.8296160036790067, "grad_norm": 0.8737375140190125, "learning_rate": 7.298977877893687e-06, "loss": 1.4447, "step": 902 }, { "epoch": 0.8305357553460566, "grad_norm": 0.9431170225143433, "learning_rate": 7.222646792325516e-06, "loss": 1.4588, "step": 903 }, { "epoch": 0.8314555070131064, "grad_norm": 0.9367691874504089, "learning_rate": 7.146685850861851e-06, "loss": 1.4205, "step": 904 }, { "epoch": 0.8323752586801564, "grad_norm": 0.812258780002594, "learning_rate": 7.071095710777925e-06, "loss": 1.4177, "step": 905 }, { "epoch": 0.8332950103472062, "grad_norm": 0.7034198045730591, "learning_rate": 6.995877026140468e-06, "loss": 1.4146, "step": 906 }, { "epoch": 0.8342147620142562, "grad_norm": 0.7884905934333801, "learning_rate": 6.921030447802146e-06, "loss": 1.4616, "step": 907 }, { "epoch": 0.8351345136813061, "grad_norm": 0.8112537860870361, "learning_rate": 6.8465566233957945e-06, "loss": 1.3435, "step": 908 }, { "epoch": 0.8360542653483559, "grad_norm": 0.7667593955993652, "learning_rate": 6.772456197328919e-06, "loss": 1.464, "step": 909 }, { "epoch": 0.8369740170154059, "grad_norm": 0.762269914150238, "learning_rate": 6.698729810778065e-06, "loss": 1.4473, "step": 910 }, { "epoch": 0.8378937686824557, "grad_norm": 0.852673351764679, "learning_rate": 6.625378101683316e-06, "loss": 1.4215, "step": 911 }, { "epoch": 0.8388135203495056, "grad_norm": 0.7429057359695435, "learning_rate": 6.552401704742678e-06, "loss": 1.4426, "step": 912 }, { "epoch": 0.8397332720165556, "grad_norm": 0.6884950995445251, "learning_rate": 6.4798012514067475e-06, "loss": 1.4016, "step": 913 }, { "epoch": 0.8406530236836054, "grad_norm": 0.6550636291503906, "learning_rate": 6.407577369873069e-06, "loss": 1.4468, "step": 914 }, { "epoch": 0.8415727753506553, "grad_norm": 0.5837852358818054, "learning_rate": 6.335730685080837e-06, "loss": 1.4036, "step": 915 }, { "epoch": 0.8424925270177053, "grad_norm": 0.5570608377456665, "learning_rate": 6.264261818705419e-06, "loss": 1.3483, "step": 916 }, { "epoch": 0.8434122786847551, "grad_norm": 0.7056939005851746, "learning_rate": 6.193171389152997e-06, "loss": 1.3397, "step": 917 }, { "epoch": 0.844332030351805, "grad_norm": 0.623600423336029, "learning_rate": 6.122460011555187e-06, "loss": 1.4304, "step": 918 }, { "epoch": 0.8452517820188549, "grad_norm": 0.6012278199195862, "learning_rate": 6.052128297763804e-06, "loss": 1.3684, "step": 919 }, { "epoch": 0.8461715336859048, "grad_norm": 0.582744836807251, "learning_rate": 5.982176856345445e-06, "loss": 1.4205, "step": 920 }, { "epoch": 0.8470912853529547, "grad_norm": 0.5616964101791382, "learning_rate": 5.912606292576283e-06, "loss": 1.3209, "step": 921 }, { "epoch": 0.8480110370200046, "grad_norm": 0.5474282503128052, "learning_rate": 5.843417208436908e-06, "loss": 1.4125, "step": 922 }, { "epoch": 0.8489307886870545, "grad_norm": 0.533388614654541, "learning_rate": 5.774610202606939e-06, "loss": 1.4116, "step": 923 }, { "epoch": 0.8498505403541043, "grad_norm": 0.5694478154182434, "learning_rate": 5.706185870460018e-06, "loss": 1.509, "step": 924 }, { "epoch": 0.8507702920211543, "grad_norm": 0.5748287439346313, "learning_rate": 5.638144804058559e-06, "loss": 1.3528, "step": 925 }, { "epoch": 0.8516900436882042, "grad_norm": 0.6192615032196045, "learning_rate": 5.5704875921486655e-06, "loss": 1.3098, "step": 926 }, { "epoch": 0.852609795355254, "grad_norm": 0.6460704207420349, "learning_rate": 5.503214820154978e-06, "loss": 1.3839, "step": 927 }, { "epoch": 0.853529547022304, "grad_norm": 0.620794951915741, "learning_rate": 5.436327070175728e-06, "loss": 1.4197, "step": 928 }, { "epoch": 0.8544492986893538, "grad_norm": 0.6275455355644226, "learning_rate": 5.369824920977568e-06, "loss": 1.2891, "step": 929 }, { "epoch": 0.8553690503564038, "grad_norm": 0.5857694149017334, "learning_rate": 5.303708947990637e-06, "loss": 1.3334, "step": 930 }, { "epoch": 0.8562888020234537, "grad_norm": 0.6003711819648743, "learning_rate": 5.2379797233035824e-06, "loss": 1.395, "step": 931 }, { "epoch": 0.8572085536905035, "grad_norm": 0.6273806095123291, "learning_rate": 5.1726378156585816e-06, "loss": 1.2778, "step": 932 }, { "epoch": 0.8581283053575535, "grad_norm": 0.6366182565689087, "learning_rate": 5.10768379044641e-06, "loss": 1.3508, "step": 933 }, { "epoch": 0.8590480570246034, "grad_norm": 0.6845077872276306, "learning_rate": 5.043118209701631e-06, "loss": 1.2843, "step": 934 }, { "epoch": 0.8599678086916532, "grad_norm": 0.6707909107208252, "learning_rate": 4.978941632097611e-06, "loss": 1.3239, "step": 935 }, { "epoch": 0.8608875603587032, "grad_norm": 0.7041406631469727, "learning_rate": 4.9151546129417804e-06, "loss": 1.2556, "step": 936 }, { "epoch": 0.861807312025753, "grad_norm": 0.6683023571968079, "learning_rate": 4.8517577041707955e-06, "loss": 1.289, "step": 937 }, { "epoch": 0.8627270636928029, "grad_norm": 0.6463608741760254, "learning_rate": 4.788751454345763e-06, "loss": 1.225, "step": 938 }, { "epoch": 0.8636468153598529, "grad_norm": 0.6901978254318237, "learning_rate": 4.726136408647464e-06, "loss": 1.2177, "step": 939 }, { "epoch": 0.8645665670269027, "grad_norm": 0.6679742336273193, "learning_rate": 4.663913108871726e-06, "loss": 1.2586, "step": 940 }, { "epoch": 0.8654863186939526, "grad_norm": 0.6778735518455505, "learning_rate": 4.60208209342462e-06, "loss": 1.183, "step": 941 }, { "epoch": 0.8664060703610025, "grad_norm": 0.6251430511474609, "learning_rate": 4.540643897317887e-06, "loss": 1.2523, "step": 942 }, { "epoch": 0.8673258220280524, "grad_norm": 0.6894196271896362, "learning_rate": 4.479599052164268e-06, "loss": 1.183, "step": 943 }, { "epoch": 0.8682455736951024, "grad_norm": 0.6839209198951721, "learning_rate": 4.418948086172914e-06, "loss": 1.1992, "step": 944 }, { "epoch": 0.8691653253621522, "grad_norm": 0.7572594285011292, "learning_rate": 4.35869152414482e-06, "loss": 1.1731, "step": 945 }, { "epoch": 0.8700850770292021, "grad_norm": 0.7147699594497681, "learning_rate": 4.298829887468275e-06, "loss": 1.1665, "step": 946 }, { "epoch": 0.8710048286962521, "grad_norm": 0.7666782736778259, "learning_rate": 4.2393636941143675e-06, "loss": 1.149, "step": 947 }, { "epoch": 0.8719245803633019, "grad_norm": 0.7843433022499084, "learning_rate": 4.180293458632489e-06, "loss": 1.0903, "step": 948 }, { "epoch": 0.8728443320303518, "grad_norm": 0.958113431930542, "learning_rate": 4.121619692145878e-06, "loss": 1.118, "step": 949 }, { "epoch": 0.8737640836974017, "grad_norm": 1.1284202337265015, "learning_rate": 4.0633429023472e-06, "loss": 0.9711, "step": 950 }, { "epoch": 0.8746838353644516, "grad_norm": 0.8368450403213501, "learning_rate": 4.005463593494163e-06, "loss": 1.4433, "step": 951 }, { "epoch": 0.8756035870315015, "grad_norm": 0.6638758182525635, "learning_rate": 3.947982266405159e-06, "loss": 1.4285, "step": 952 }, { "epoch": 0.8765233386985514, "grad_norm": 0.8789987564086914, "learning_rate": 3.890899418454913e-06, "loss": 1.4212, "step": 953 }, { "epoch": 0.8774430903656013, "grad_norm": 0.847080409526825, "learning_rate": 3.834215543570191e-06, "loss": 1.4124, "step": 954 }, { "epoch": 0.8783628420326511, "grad_norm": 0.9596214890480042, "learning_rate": 3.777931132225526e-06, "loss": 1.3723, "step": 955 }, { "epoch": 0.8792825936997011, "grad_norm": 0.9075647592544556, "learning_rate": 3.72204667143895e-06, "loss": 1.493, "step": 956 }, { "epoch": 0.880202345366751, "grad_norm": 0.780536413192749, "learning_rate": 3.6665626447678237e-06, "loss": 1.4126, "step": 957 }, { "epoch": 0.8811220970338008, "grad_norm": 0.6997688412666321, "learning_rate": 3.611479532304618e-06, "loss": 1.389, "step": 958 }, { "epoch": 0.8820418487008508, "grad_norm": 0.620875358581543, "learning_rate": 3.556797810672785e-06, "loss": 1.3514, "step": 959 }, { "epoch": 0.8829616003679006, "grad_norm": 0.6854445338249207, "learning_rate": 3.5025179530225994e-06, "loss": 1.4661, "step": 960 }, { "epoch": 0.8838813520349506, "grad_norm": 0.7020566463470459, "learning_rate": 3.4486404290271113e-06, "loss": 1.4115, "step": 961 }, { "epoch": 0.8848011037020005, "grad_norm": 0.6943616271018982, "learning_rate": 3.3951657048780227e-06, "loss": 1.4774, "step": 962 }, { "epoch": 0.8857208553690503, "grad_norm": 0.7479608654975891, "learning_rate": 3.3420942432817127e-06, "loss": 1.4625, "step": 963 }, { "epoch": 0.8866406070361003, "grad_norm": 0.7025173902511597, "learning_rate": 3.289426503455201e-06, "loss": 1.4019, "step": 964 }, { "epoch": 0.8875603587031502, "grad_norm": 0.673040509223938, "learning_rate": 3.2371629411221848e-06, "loss": 1.4343, "step": 965 }, { "epoch": 0.8884801103702, "grad_norm": 0.728541910648346, "learning_rate": 3.185304008509077e-06, "loss": 1.5093, "step": 966 }, { "epoch": 0.88939986203725, "grad_norm": 0.6773453950881958, "learning_rate": 3.133850154341139e-06, "loss": 1.4002, "step": 967 }, { "epoch": 0.8903196137042998, "grad_norm": 0.6363242864608765, "learning_rate": 3.082801823838527e-06, "loss": 1.4272, "step": 968 }, { "epoch": 0.8912393653713497, "grad_norm": 0.5722589492797852, "learning_rate": 3.032159458712508e-06, "loss": 1.3557, "step": 969 }, { "epoch": 0.8921591170383997, "grad_norm": 0.5886601209640503, "learning_rate": 2.981923497161615e-06, "loss": 1.3874, "step": 970 }, { "epoch": 0.8930788687054495, "grad_norm": 0.6230661273002625, "learning_rate": 2.9320943738678107e-06, "loss": 1.3784, "step": 971 }, { "epoch": 0.8939986203724994, "grad_norm": 0.5844275951385498, "learning_rate": 2.882672519992824e-06, "loss": 1.4153, "step": 972 }, { "epoch": 0.8949183720395493, "grad_norm": 0.6414538621902466, "learning_rate": 2.833658363174302e-06, "loss": 1.3611, "step": 973 }, { "epoch": 0.8958381237065992, "grad_norm": 0.6074815392494202, "learning_rate": 2.785052327522214e-06, "loss": 1.3607, "step": 974 }, { "epoch": 0.8967578753736491, "grad_norm": 0.5938957333564758, "learning_rate": 2.73685483361511e-06, "loss": 1.3765, "step": 975 }, { "epoch": 0.897677627040699, "grad_norm": 0.5869003534317017, "learning_rate": 2.6890662984965232e-06, "loss": 1.392, "step": 976 }, { "epoch": 0.8985973787077489, "grad_norm": 0.5588386654853821, "learning_rate": 2.6416871356713224e-06, "loss": 1.3047, "step": 977 }, { "epoch": 0.8995171303747989, "grad_norm": 0.5922186970710754, "learning_rate": 2.594717755102205e-06, "loss": 1.3928, "step": 978 }, { "epoch": 0.9004368820418487, "grad_norm": 0.5693724155426025, "learning_rate": 2.548158563206038e-06, "loss": 1.347, "step": 979 }, { "epoch": 0.9013566337088986, "grad_norm": 0.6117263436317444, "learning_rate": 2.50200996285046e-06, "loss": 1.3568, "step": 980 }, { "epoch": 0.9022763853759485, "grad_norm": 0.5885259509086609, "learning_rate": 2.4562723533503083e-06, "loss": 1.4184, "step": 981 }, { "epoch": 0.9031961370429984, "grad_norm": 0.6112256646156311, "learning_rate": 2.4109461304642256e-06, "loss": 1.3344, "step": 982 }, { "epoch": 0.9041158887100483, "grad_norm": 0.6500238180160522, "learning_rate": 2.366031686391168e-06, "loss": 1.3372, "step": 983 }, { "epoch": 0.9050356403770982, "grad_norm": 0.6185190677642822, "learning_rate": 2.3215294097670925e-06, "loss": 1.2273, "step": 984 }, { "epoch": 0.9059553920441481, "grad_norm": 0.6523995995521545, "learning_rate": 2.277439685661509e-06, "loss": 1.2538, "step": 985 }, { "epoch": 0.9068751437111979, "grad_norm": 0.7136437296867371, "learning_rate": 2.2337628955742264e-06, "loss": 1.3739, "step": 986 }, { "epoch": 0.9077948953782479, "grad_norm": 0.6043840050697327, "learning_rate": 2.1904994174319905e-06, "loss": 1.2184, "step": 987 }, { "epoch": 0.9087146470452978, "grad_norm": 0.6362565159797668, "learning_rate": 2.1476496255852683e-06, "loss": 1.1398, "step": 988 }, { "epoch": 0.9096343987123476, "grad_norm": 0.6597528457641602, "learning_rate": 2.1052138908049303e-06, "loss": 1.1972, "step": 989 }, { "epoch": 0.9105541503793976, "grad_norm": 0.679057240486145, "learning_rate": 2.0631925802791606e-06, "loss": 1.2572, "step": 990 }, { "epoch": 0.9114739020464474, "grad_norm": 0.6650072336196899, "learning_rate": 2.021586057610153e-06, "loss": 1.1868, "step": 991 }, { "epoch": 0.9123936537134973, "grad_norm": 0.6258329749107361, "learning_rate": 1.9803946828110375e-06, "loss": 1.209, "step": 992 }, { "epoch": 0.9133134053805473, "grad_norm": 0.6818736791610718, "learning_rate": 1.9396188123027737e-06, "loss": 1.2432, "step": 993 }, { "epoch": 0.9142331570475971, "grad_norm": 0.7300404906272888, "learning_rate": 1.8992587989110134e-06, "loss": 1.2549, "step": 994 }, { "epoch": 0.915152908714647, "grad_norm": 0.7216602563858032, "learning_rate": 1.8593149918630925e-06, "loss": 1.1911, "step": 995 }, { "epoch": 0.916072660381697, "grad_norm": 0.7485631704330444, "learning_rate": 1.8197877367849947e-06, "loss": 1.1326, "step": 996 }, { "epoch": 0.9169924120487468, "grad_norm": 0.8240882158279419, "learning_rate": 1.7806773756983642e-06, "loss": 1.1299, "step": 997 }, { "epoch": 0.9179121637157968, "grad_norm": 0.9147471189498901, "learning_rate": 1.7419842470175195e-06, "loss": 1.1179, "step": 998 }, { "epoch": 0.9188319153828466, "grad_norm": 0.9360700249671936, "learning_rate": 1.70370868554659e-06, "loss": 1.0562, "step": 999 }, { "epoch": 0.9197516670498965, "grad_norm": 1.174989104270935, "learning_rate": 1.6658510224765333e-06, "loss": 0.9121, "step": 1000 }, { "epoch": 0.9206714187169465, "grad_norm": 0.8917292952537537, "learning_rate": 1.6284115853823445e-06, "loss": 1.4961, "step": 1001 }, { "epoch": 0.9215911703839963, "grad_norm": 0.6432257890701294, "learning_rate": 1.5913906982201742e-06, "loss": 1.488, "step": 1002 }, { "epoch": 0.9225109220510462, "grad_norm": 0.7689481973648071, "learning_rate": 1.5547886813245539e-06, "loss": 1.4265, "step": 1003 }, { "epoch": 0.9234306737180961, "grad_norm": 0.7164052128791809, "learning_rate": 1.5186058514055912e-06, "loss": 1.4054, "step": 1004 }, { "epoch": 0.924350425385146, "grad_norm": 0.8932134509086609, "learning_rate": 1.4828425215462848e-06, "loss": 1.403, "step": 1005 }, { "epoch": 0.9252701770521959, "grad_norm": 0.8750680685043335, "learning_rate": 1.447499001199748e-06, "loss": 1.3956, "step": 1006 }, { "epoch": 0.9261899287192458, "grad_norm": 0.7176107168197632, "learning_rate": 1.4125755961865827e-06, "loss": 1.4235, "step": 1007 }, { "epoch": 0.9271096803862957, "grad_norm": 0.7204969525337219, "learning_rate": 1.3780726086922103e-06, "loss": 1.3773, "step": 1008 }, { "epoch": 0.9280294320533456, "grad_norm": 0.6472546458244324, "learning_rate": 1.3439903372642615e-06, "loss": 1.4734, "step": 1009 }, { "epoch": 0.9289491837203955, "grad_norm": 0.679750919342041, "learning_rate": 1.3103290768099797e-06, "loss": 1.5028, "step": 1010 }, { "epoch": 0.9298689353874454, "grad_norm": 0.6491613984107971, "learning_rate": 1.2770891185937105e-06, "loss": 1.403, "step": 1011 }, { "epoch": 0.9307886870544952, "grad_norm": 0.6442059278488159, "learning_rate": 1.2442707502343332e-06, "loss": 1.4124, "step": 1012 }, { "epoch": 0.9317084387215452, "grad_norm": 0.5981637835502625, "learning_rate": 1.2118742557027884e-06, "loss": 1.459, "step": 1013 }, { "epoch": 0.9326281903885951, "grad_norm": 0.5459677577018738, "learning_rate": 1.1798999153196433e-06, "loss": 1.4171, "step": 1014 }, { "epoch": 0.933547942055645, "grad_norm": 0.5810702443122864, "learning_rate": 1.1483480057526363e-06, "loss": 1.3995, "step": 1015 }, { "epoch": 0.9344676937226949, "grad_norm": 0.5334146022796631, "learning_rate": 1.1172188000142802e-06, "loss": 1.4004, "step": 1016 }, { "epoch": 0.9353874453897447, "grad_norm": 0.5717347860336304, "learning_rate": 1.0865125674595466e-06, "loss": 1.3843, "step": 1017 }, { "epoch": 0.9363071970567947, "grad_norm": 0.5235407948493958, "learning_rate": 1.0562295737834737e-06, "loss": 1.3558, "step": 1018 }, { "epoch": 0.9372269487238446, "grad_norm": 0.5573782324790955, "learning_rate": 1.026370081018907e-06, "loss": 1.4016, "step": 1019 }, { "epoch": 0.9381467003908944, "grad_norm": 0.5528433322906494, "learning_rate": 9.969343475342285e-07, "loss": 1.3298, "step": 1020 }, { "epoch": 0.9390664520579444, "grad_norm": 0.573993980884552, "learning_rate": 9.679226280310982e-07, "loss": 1.3674, "step": 1021 }, { "epoch": 0.9399862037249943, "grad_norm": 0.5446662902832031, "learning_rate": 9.393351735422773e-07, "loss": 1.3571, "step": 1022 }, { "epoch": 0.9409059553920441, "grad_norm": 0.5892913937568665, "learning_rate": 9.111722314294358e-07, "loss": 1.3471, "step": 1023 }, { "epoch": 0.9418257070590941, "grad_norm": 0.6275593638420105, "learning_rate": 8.834340453810375e-07, "loss": 1.3269, "step": 1024 }, { "epoch": 0.9427454587261439, "grad_norm": 0.6341751217842102, "learning_rate": 8.561208554101863e-07, "loss": 1.3899, "step": 1025 }, { "epoch": 0.9436652103931938, "grad_norm": 0.6272470951080322, "learning_rate": 8.292328978526109e-07, "loss": 1.3545, "step": 1026 }, { "epoch": 0.9445849620602438, "grad_norm": 0.6651190519332886, "learning_rate": 8.027704053645613e-07, "loss": 1.3397, "step": 1027 }, { "epoch": 0.9455047137272936, "grad_norm": 0.6504070162773132, "learning_rate": 7.76733606920832e-07, "loss": 1.3889, "step": 1028 }, { "epoch": 0.9464244653943436, "grad_norm": 0.639077365398407, "learning_rate": 7.511227278127697e-07, "loss": 1.3159, "step": 1029 }, { "epoch": 0.9473442170613934, "grad_norm": 0.685070812702179, "learning_rate": 7.259379896463247e-07, "loss": 1.312, "step": 1030 }, { "epoch": 0.9482639687284433, "grad_norm": 0.705894947052002, "learning_rate": 7.011796103401191e-07, "loss": 1.325, "step": 1031 }, { "epoch": 0.9491837203954933, "grad_norm": 0.6670310497283936, "learning_rate": 6.768478041236037e-07, "loss": 1.3582, "step": 1032 }, { "epoch": 0.9501034720625431, "grad_norm": 0.7927426695823669, "learning_rate": 6.529427815351374e-07, "loss": 1.3767, "step": 1033 }, { "epoch": 0.951023223729593, "grad_norm": 0.6605473160743713, "learning_rate": 6.294647494202444e-07, "loss": 1.2937, "step": 1034 }, { "epoch": 0.9519429753966429, "grad_norm": 0.599684476852417, "learning_rate": 6.064139109297485e-07, "loss": 1.2802, "step": 1035 }, { "epoch": 0.9528627270636928, "grad_norm": 0.6753445267677307, "learning_rate": 5.837904655180748e-07, "loss": 1.297, "step": 1036 }, { "epoch": 0.9537824787307427, "grad_norm": 0.6682940125465393, "learning_rate": 5.615946089414736e-07, "loss": 1.3073, "step": 1037 }, { "epoch": 0.9547022303977926, "grad_norm": 0.6744109392166138, "learning_rate": 5.398265332563934e-07, "loss": 1.1858, "step": 1038 }, { "epoch": 0.9556219820648425, "grad_norm": 0.6154145002365112, "learning_rate": 5.184864268177325e-07, "loss": 1.1648, "step": 1039 }, { "epoch": 0.9565417337318924, "grad_norm": 0.6836906671524048, "learning_rate": 4.975744742772848e-07, "loss": 1.2518, "step": 1040 }, { "epoch": 0.9574614853989423, "grad_norm": 0.6386029720306396, "learning_rate": 4.770908565820964e-07, "loss": 1.2142, "step": 1041 }, { "epoch": 0.9583812370659922, "grad_norm": 0.6528066992759705, "learning_rate": 4.5703575097292286e-07, "loss": 1.1931, "step": 1042 }, { "epoch": 0.959300988733042, "grad_norm": 0.665433406829834, "learning_rate": 4.37409330982691e-07, "loss": 1.202, "step": 1043 }, { "epoch": 0.960220740400092, "grad_norm": 0.7009211182594299, "learning_rate": 4.182117664349783e-07, "loss": 1.2317, "step": 1044 }, { "epoch": 0.9611404920671419, "grad_norm": 0.7533866167068481, "learning_rate": 3.99443223442586e-07, "loss": 1.2128, "step": 1045 }, { "epoch": 0.9620602437341917, "grad_norm": 0.7658700942993164, "learning_rate": 3.8110386440605164e-07, "loss": 1.1474, "step": 1046 }, { "epoch": 0.9629799954012417, "grad_norm": 0.7905300259590149, "learning_rate": 3.6319384801227763e-07, "loss": 1.1075, "step": 1047 }, { "epoch": 0.9638997470682915, "grad_norm": 0.9083186388015747, "learning_rate": 3.4571332923314936e-07, "loss": 1.1094, "step": 1048 }, { "epoch": 0.9648194987353415, "grad_norm": 0.9923297762870789, "learning_rate": 3.2866245932418604e-07, "loss": 1.0341, "step": 1049 }, { "epoch": 0.9657392504023914, "grad_norm": 1.4956581592559814, "learning_rate": 3.120413858232474e-07, "loss": 0.9236, "step": 1050 }, { "epoch": 0.9657392504023914, "eval_loss": 1.3224910497665405, "eval_runtime": 49.9198, "eval_samples_per_second": 165.065, "eval_steps_per_second": 20.633, "step": 1050 } ], "logging_steps": 1, "max_steps": 1088, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9092013631668224e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }