{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.000763553066938, "eval_steps": 500, "global_step": 983, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010180707559175363, "grad_norm": 13.50528621673584, "learning_rate": 2e-05, "loss": 5.1969, "step": 1 }, { "epoch": 0.0020361415118350726, "grad_norm": 19.771379470825195, "learning_rate": 4e-05, "loss": 7.7524, "step": 2 }, { "epoch": 0.003054212267752609, "grad_norm": 19.734294891357422, "learning_rate": 6e-05, "loss": 8.0409, "step": 3 }, { "epoch": 0.004072283023670145, "grad_norm": 24.712677001953125, "learning_rate": 8e-05, "loss": 7.7622, "step": 4 }, { "epoch": 0.0050903537795876815, "grad_norm": 26.32750701904297, "learning_rate": 0.0001, "loss": 9.8505, "step": 5 }, { "epoch": 0.006108424535505218, "grad_norm": 29.443405151367188, "learning_rate": 9.999974203447433e-05, "loss": 9.1155, "step": 6 }, { "epoch": 0.007126495291422754, "grad_norm": 22.42616844177246, "learning_rate": 9.999896814055916e-05, "loss": 6.3676, "step": 7 }, { "epoch": 0.00814456604734029, "grad_norm": 27.119375228881836, "learning_rate": 9.999767832624001e-05, "loss": 7.4125, "step": 8 }, { "epoch": 0.009162636803257827, "grad_norm": 15.422072410583496, "learning_rate": 9.999587260482597e-05, "loss": 3.5393, "step": 9 }, { "epoch": 0.010180707559175363, "grad_norm": 13.826335906982422, "learning_rate": 9.999355099494962e-05, "loss": 2.7136, "step": 10 }, { "epoch": 0.0111987783150929, "grad_norm": 14.258417129516602, "learning_rate": 9.999071352056675e-05, "loss": 2.6158, "step": 11 }, { "epoch": 0.012216849071010435, "grad_norm": 14.128898620605469, "learning_rate": 9.99873602109562e-05, "loss": 3.0587, "step": 12 }, { "epoch": 0.013234919826927972, "grad_norm": 12.319880485534668, "learning_rate": 9.998349110071949e-05, "loss": 2.6488, "step": 13 }, { "epoch": 0.014252990582845508, "grad_norm": 15.33985424041748, "learning_rate": 9.99791062297805e-05, "loss": 3.1476, "step": 14 }, { "epoch": 0.015271061338763044, "grad_norm": 13.602853775024414, "learning_rate": 9.99742056433851e-05, "loss": 2.796, "step": 15 }, { "epoch": 0.01628913209468058, "grad_norm": 9.083526611328125, "learning_rate": 9.996878939210049e-05, "loss": 2.0671, "step": 16 }, { "epoch": 0.017307202850598117, "grad_norm": 10.980738639831543, "learning_rate": 9.9962857531815e-05, "loss": 2.629, "step": 17 }, { "epoch": 0.018325273606515653, "grad_norm": 11.885966300964355, "learning_rate": 9.99564101237372e-05, "loss": 3.1609, "step": 18 }, { "epoch": 0.01934334436243319, "grad_norm": 11.95373249053955, "learning_rate": 9.994944723439546e-05, "loss": 3.2291, "step": 19 }, { "epoch": 0.020361415118350726, "grad_norm": 10.973458290100098, "learning_rate": 9.994196893563721e-05, "loss": 2.7778, "step": 20 }, { "epoch": 0.021379485874268262, "grad_norm": 12.775362014770508, "learning_rate": 9.993397530462818e-05, "loss": 3.136, "step": 21 }, { "epoch": 0.0223975566301858, "grad_norm": 10.916693687438965, "learning_rate": 9.992546642385158e-05, "loss": 2.3531, "step": 22 }, { "epoch": 0.023415627386103335, "grad_norm": 12.338353157043457, "learning_rate": 9.99164423811074e-05, "loss": 3.6968, "step": 23 }, { "epoch": 0.02443369814202087, "grad_norm": 13.168731689453125, "learning_rate": 9.990690326951126e-05, "loss": 3.0163, "step": 24 }, { "epoch": 0.025451768897938407, "grad_norm": 11.88056755065918, "learning_rate": 9.989684918749365e-05, "loss": 3.0201, "step": 25 }, { "epoch": 0.026469839653855944, "grad_norm": 9.301902770996094, "learning_rate": 9.988628023879883e-05, "loss": 2.6577, "step": 26 }, { "epoch": 0.02748791040977348, "grad_norm": 10.02252197265625, "learning_rate": 9.987519653248378e-05, "loss": 2.4519, "step": 27 }, { "epoch": 0.028505981165691016, "grad_norm": 11.7409029006958, "learning_rate": 9.986359818291706e-05, "loss": 3.1898, "step": 28 }, { "epoch": 0.029524051921608552, "grad_norm": 10.492666244506836, "learning_rate": 9.985148530977767e-05, "loss": 2.7226, "step": 29 }, { "epoch": 0.03054212267752609, "grad_norm": 12.711854934692383, "learning_rate": 9.983885803805372e-05, "loss": 2.7205, "step": 30 }, { "epoch": 0.03156019343344362, "grad_norm": 12.937368392944336, "learning_rate": 9.982571649804126e-05, "loss": 3.204, "step": 31 }, { "epoch": 0.03257826418936116, "grad_norm": 13.292618751525879, "learning_rate": 9.981206082534286e-05, "loss": 3.4911, "step": 32 }, { "epoch": 0.033596334945278694, "grad_norm": 9.724308967590332, "learning_rate": 9.979789116086625e-05, "loss": 2.8603, "step": 33 }, { "epoch": 0.034614405701196234, "grad_norm": 8.932650566101074, "learning_rate": 9.978320765082278e-05, "loss": 2.465, "step": 34 }, { "epoch": 0.03563247645711377, "grad_norm": 10.732564926147461, "learning_rate": 9.976801044672608e-05, "loss": 2.9517, "step": 35 }, { "epoch": 0.036650547213031306, "grad_norm": 11.971003532409668, "learning_rate": 9.97522997053903e-05, "loss": 3.2085, "step": 36 }, { "epoch": 0.03766861796894884, "grad_norm": 10.34869384765625, "learning_rate": 9.973607558892864e-05, "loss": 2.9294, "step": 37 }, { "epoch": 0.03868668872486638, "grad_norm": 12.5684814453125, "learning_rate": 9.97193382647516e-05, "loss": 3.6198, "step": 38 }, { "epoch": 0.03970475948078391, "grad_norm": 13.42013931274414, "learning_rate": 9.970208790556532e-05, "loss": 2.8409, "step": 39 }, { "epoch": 0.04072283023670145, "grad_norm": 10.357503890991211, "learning_rate": 9.968432468936967e-05, "loss": 2.6345, "step": 40 }, { "epoch": 0.041740900992618984, "grad_norm": 12.668771743774414, "learning_rate": 9.966604879945659e-05, "loss": 3.2825, "step": 41 }, { "epoch": 0.042758971748536524, "grad_norm": 9.444086074829102, "learning_rate": 9.964726042440802e-05, "loss": 2.7562, "step": 42 }, { "epoch": 0.04377704250445406, "grad_norm": 10.448949813842773, "learning_rate": 9.962795975809411e-05, "loss": 2.8796, "step": 43 }, { "epoch": 0.0447951132603716, "grad_norm": 10.916976928710938, "learning_rate": 9.960814699967112e-05, "loss": 2.7582, "step": 44 }, { "epoch": 0.04581318401628913, "grad_norm": 11.323358535766602, "learning_rate": 9.958782235357938e-05, "loss": 2.6436, "step": 45 }, { "epoch": 0.04683125477220667, "grad_norm": 10.28471851348877, "learning_rate": 9.956698602954124e-05, "loss": 2.8325, "step": 46 }, { "epoch": 0.0478493255281242, "grad_norm": 13.5079984664917, "learning_rate": 9.954563824255878e-05, "loss": 3.0585, "step": 47 }, { "epoch": 0.04886739628404174, "grad_norm": 11.250194549560547, "learning_rate": 9.952377921291178e-05, "loss": 2.7686, "step": 48 }, { "epoch": 0.049885467039959275, "grad_norm": 12.554705619812012, "learning_rate": 9.950140916615526e-05, "loss": 3.0617, "step": 49 }, { "epoch": 0.050903537795876815, "grad_norm": 12.552079200744629, "learning_rate": 9.947852833311724e-05, "loss": 2.08, "step": 50 }, { "epoch": 0.05192160855179435, "grad_norm": 11.248369216918945, "learning_rate": 9.945513694989639e-05, "loss": 5.133, "step": 51 }, { "epoch": 0.05293967930771189, "grad_norm": 12.866747856140137, "learning_rate": 9.943123525785952e-05, "loss": 5.7232, "step": 52 }, { "epoch": 0.05395775006362942, "grad_norm": 12.395757675170898, "learning_rate": 9.940682350363912e-05, "loss": 4.6422, "step": 53 }, { "epoch": 0.05497582081954696, "grad_norm": 12.23355770111084, "learning_rate": 9.938190193913083e-05, "loss": 4.8131, "step": 54 }, { "epoch": 0.05599389157546449, "grad_norm": 14.62759017944336, "learning_rate": 9.935647082149086e-05, "loss": 6.0114, "step": 55 }, { "epoch": 0.05701196233138203, "grad_norm": 13.613059997558594, "learning_rate": 9.933053041313325e-05, "loss": 4.794, "step": 56 }, { "epoch": 0.058030033087299565, "grad_norm": 13.422719955444336, "learning_rate": 9.930408098172725e-05, "loss": 4.5392, "step": 57 }, { "epoch": 0.059048103843217105, "grad_norm": 17.745412826538086, "learning_rate": 9.92771228001945e-05, "loss": 7.1147, "step": 58 }, { "epoch": 0.06006617459913464, "grad_norm": 13.955183982849121, "learning_rate": 9.924965614670629e-05, "loss": 3.619, "step": 59 }, { "epoch": 0.06108424535505218, "grad_norm": 11.067267417907715, "learning_rate": 9.922168130468059e-05, "loss": 2.6905, "step": 60 }, { "epoch": 0.06210231611096971, "grad_norm": 11.641958236694336, "learning_rate": 9.91931985627792e-05, "loss": 2.398, "step": 61 }, { "epoch": 0.06312038686688724, "grad_norm": 8.590779304504395, "learning_rate": 9.916420821490472e-05, "loss": 1.9248, "step": 62 }, { "epoch": 0.06413845762280479, "grad_norm": 8.852486610412598, "learning_rate": 9.91347105601976e-05, "loss": 2.3876, "step": 63 }, { "epoch": 0.06515652837872232, "grad_norm": 9.158111572265625, "learning_rate": 9.910470590303293e-05, "loss": 1.9339, "step": 64 }, { "epoch": 0.06617459913463986, "grad_norm": 8.361588478088379, "learning_rate": 9.907419455301741e-05, "loss": 2.3266, "step": 65 }, { "epoch": 0.06719266989055739, "grad_norm": 7.891152858734131, "learning_rate": 9.904317682498608e-05, "loss": 1.9775, "step": 66 }, { "epoch": 0.06821074064647493, "grad_norm": 8.722708702087402, "learning_rate": 9.901165303899916e-05, "loss": 2.2988, "step": 67 }, { "epoch": 0.06922881140239247, "grad_norm": 10.848478317260742, "learning_rate": 9.897962352033861e-05, "loss": 2.2087, "step": 68 }, { "epoch": 0.07024688215831, "grad_norm": 7.828042984008789, "learning_rate": 9.89470885995049e-05, "loss": 2.1694, "step": 69 }, { "epoch": 0.07126495291422753, "grad_norm": 7.928416728973389, "learning_rate": 9.891404861221356e-05, "loss": 1.7946, "step": 70 }, { "epoch": 0.07228302367014508, "grad_norm": 8.273153305053711, "learning_rate": 9.888050389939172e-05, "loss": 2.2472, "step": 71 }, { "epoch": 0.07330109442606261, "grad_norm": 7.866210460662842, "learning_rate": 9.884645480717451e-05, "loss": 1.9656, "step": 72 }, { "epoch": 0.07431916518198015, "grad_norm": 9.140717506408691, "learning_rate": 9.881190168690164e-05, "loss": 2.5084, "step": 73 }, { "epoch": 0.07533723593789768, "grad_norm": 10.078163146972656, "learning_rate": 9.877684489511366e-05, "loss": 2.8882, "step": 74 }, { "epoch": 0.07635530669381523, "grad_norm": 8.583365440368652, "learning_rate": 9.874128479354832e-05, "loss": 2.2404, "step": 75 }, { "epoch": 0.07737337744973276, "grad_norm": 10.980644226074219, "learning_rate": 9.870522174913682e-05, "loss": 2.9591, "step": 76 }, { "epoch": 0.07839144820565029, "grad_norm": 9.829695701599121, "learning_rate": 9.866865613400008e-05, "loss": 2.5868, "step": 77 }, { "epoch": 0.07940951896156782, "grad_norm": 9.993083000183105, "learning_rate": 9.863158832544477e-05, "loss": 2.7386, "step": 78 }, { "epoch": 0.08042758971748537, "grad_norm": 9.227055549621582, "learning_rate": 9.859401870595959e-05, "loss": 2.3334, "step": 79 }, { "epoch": 0.0814456604734029, "grad_norm": 9.135334968566895, "learning_rate": 9.855594766321122e-05, "loss": 2.6064, "step": 80 }, { "epoch": 0.08246373122932044, "grad_norm": 9.216446876525879, "learning_rate": 9.85173755900403e-05, "loss": 2.9289, "step": 81 }, { "epoch": 0.08348180198523797, "grad_norm": 12.71446418762207, "learning_rate": 9.847830288445745e-05, "loss": 3.5027, "step": 82 }, { "epoch": 0.08449987274115552, "grad_norm": 9.071185111999512, "learning_rate": 9.843872994963911e-05, "loss": 3.1217, "step": 83 }, { "epoch": 0.08551794349707305, "grad_norm": 7.825349807739258, "learning_rate": 9.839865719392339e-05, "loss": 2.4812, "step": 84 }, { "epoch": 0.08653601425299058, "grad_norm": 11.979453086853027, "learning_rate": 9.835808503080585e-05, "loss": 3.6076, "step": 85 }, { "epoch": 0.08755408500890811, "grad_norm": 10.889570236206055, "learning_rate": 9.831701387893533e-05, "loss": 3.9539, "step": 86 }, { "epoch": 0.08857215576482566, "grad_norm": 6.638063430786133, "learning_rate": 9.827544416210941e-05, "loss": 2.1225, "step": 87 }, { "epoch": 0.0895902265207432, "grad_norm": 11.630864143371582, "learning_rate": 9.823337630927026e-05, "loss": 2.8508, "step": 88 }, { "epoch": 0.09060829727666073, "grad_norm": 11.906623840332031, "learning_rate": 9.819081075450014e-05, "loss": 3.0837, "step": 89 }, { "epoch": 0.09162636803257826, "grad_norm": 12.019804000854492, "learning_rate": 9.814774793701687e-05, "loss": 3.6106, "step": 90 }, { "epoch": 0.0926444387884958, "grad_norm": 7.91819953918457, "learning_rate": 9.810418830116932e-05, "loss": 2.3236, "step": 91 }, { "epoch": 0.09366250954441334, "grad_norm": 9.185378074645996, "learning_rate": 9.806013229643289e-05, "loss": 2.6397, "step": 92 }, { "epoch": 0.09468058030033087, "grad_norm": 12.451518058776855, "learning_rate": 9.801558037740478e-05, "loss": 3.3661, "step": 93 }, { "epoch": 0.0956986510562484, "grad_norm": 9.665090560913086, "learning_rate": 9.797053300379937e-05, "loss": 2.7933, "step": 94 }, { "epoch": 0.09671672181216595, "grad_norm": 9.512073516845703, "learning_rate": 9.792499064044342e-05, "loss": 3.1669, "step": 95 }, { "epoch": 0.09773479256808348, "grad_norm": 11.063192367553711, "learning_rate": 9.787895375727136e-05, "loss": 2.4502, "step": 96 }, { "epoch": 0.09875286332400102, "grad_norm": 11.608457565307617, "learning_rate": 9.783242282932028e-05, "loss": 2.5691, "step": 97 }, { "epoch": 0.09977093407991855, "grad_norm": 10.834481239318848, "learning_rate": 9.778539833672524e-05, "loss": 2.8208, "step": 98 }, { "epoch": 0.1007890048358361, "grad_norm": 9.476598739624023, "learning_rate": 9.773788076471414e-05, "loss": 2.4245, "step": 99 }, { "epoch": 0.10180707559175363, "grad_norm": 10.453302383422852, "learning_rate": 9.768987060360279e-05, "loss": 2.1369, "step": 100 }, { "epoch": 0.10282514634767116, "grad_norm": 8.380644798278809, "learning_rate": 9.764136834878986e-05, "loss": 4.4008, "step": 101 }, { "epoch": 0.1038432171035887, "grad_norm": 10.45700740814209, "learning_rate": 9.759237450075174e-05, "loss": 3.8277, "step": 102 }, { "epoch": 0.10486128785950624, "grad_norm": 11.106316566467285, "learning_rate": 9.754288956503736e-05, "loss": 4.3912, "step": 103 }, { "epoch": 0.10587935861542377, "grad_norm": 12.727373123168945, "learning_rate": 9.749291405226305e-05, "loss": 5.0723, "step": 104 }, { "epoch": 0.10689742937134131, "grad_norm": 11.3184175491333, "learning_rate": 9.744244847810716e-05, "loss": 4.6612, "step": 105 }, { "epoch": 0.10791550012725884, "grad_norm": 11.49225902557373, "learning_rate": 9.739149336330482e-05, "loss": 5.2688, "step": 106 }, { "epoch": 0.10893357088317639, "grad_norm": 9.92116928100586, "learning_rate": 9.734004923364257e-05, "loss": 3.1285, "step": 107 }, { "epoch": 0.10995164163909392, "grad_norm": 16.322154998779297, "learning_rate": 9.728811661995288e-05, "loss": 4.3573, "step": 108 }, { "epoch": 0.11096971239501145, "grad_norm": 11.590410232543945, "learning_rate": 9.723569605810871e-05, "loss": 3.3457, "step": 109 }, { "epoch": 0.11198778315092899, "grad_norm": 6.267991065979004, "learning_rate": 9.718278808901797e-05, "loss": 1.8973, "step": 110 }, { "epoch": 0.11300585390684653, "grad_norm": 7.807132720947266, "learning_rate": 9.712939325861794e-05, "loss": 2.2999, "step": 111 }, { "epoch": 0.11402392466276406, "grad_norm": 5.800601005554199, "learning_rate": 9.707551211786965e-05, "loss": 1.0863, "step": 112 }, { "epoch": 0.1150419954186816, "grad_norm": 7.150589466094971, "learning_rate": 9.702114522275216e-05, "loss": 1.9172, "step": 113 }, { "epoch": 0.11606006617459913, "grad_norm": 8.134252548217773, "learning_rate": 9.696629313425686e-05, "loss": 2.2173, "step": 114 }, { "epoch": 0.11707813693051668, "grad_norm": 7.6389689445495605, "learning_rate": 9.691095641838169e-05, "loss": 1.8046, "step": 115 }, { "epoch": 0.11809620768643421, "grad_norm": 6.845970153808594, "learning_rate": 9.685513564612521e-05, "loss": 1.9059, "step": 116 }, { "epoch": 0.11911427844235174, "grad_norm": 10.888468742370605, "learning_rate": 9.679883139348082e-05, "loss": 2.9148, "step": 117 }, { "epoch": 0.12013234919826928, "grad_norm": 6.594396114349365, "learning_rate": 9.674204424143078e-05, "loss": 1.8292, "step": 118 }, { "epoch": 0.12115041995418682, "grad_norm": 7.157876491546631, "learning_rate": 9.66847747759402e-05, "loss": 1.6858, "step": 119 }, { "epoch": 0.12216849071010435, "grad_norm": 7.298995494842529, "learning_rate": 9.662702358795098e-05, "loss": 1.7957, "step": 120 }, { "epoch": 0.12318656146602189, "grad_norm": 9.0108003616333, "learning_rate": 9.656879127337571e-05, "loss": 2.2843, "step": 121 }, { "epoch": 0.12420463222193942, "grad_norm": 8.476913452148438, "learning_rate": 9.651007843309163e-05, "loss": 2.1026, "step": 122 }, { "epoch": 0.12522270297785695, "grad_norm": 9.930148124694824, "learning_rate": 9.645088567293426e-05, "loss": 2.6976, "step": 123 }, { "epoch": 0.1262407737337745, "grad_norm": 8.574073791503906, "learning_rate": 9.639121360369126e-05, "loss": 1.7768, "step": 124 }, { "epoch": 0.12725884448969205, "grad_norm": 13.36725902557373, "learning_rate": 9.63310628410961e-05, "loss": 2.7559, "step": 125 }, { "epoch": 0.12827691524560958, "grad_norm": 8.55522346496582, "learning_rate": 9.627043400582172e-05, "loss": 2.3419, "step": 126 }, { "epoch": 0.1292949860015271, "grad_norm": 9.948506355285645, "learning_rate": 9.620932772347408e-05, "loss": 3.0092, "step": 127 }, { "epoch": 0.13031305675744465, "grad_norm": 10.05156135559082, "learning_rate": 9.614774462458573e-05, "loss": 2.1554, "step": 128 }, { "epoch": 0.13133112751336218, "grad_norm": 10.230545043945312, "learning_rate": 9.608568534460936e-05, "loss": 2.572, "step": 129 }, { "epoch": 0.1323491982692797, "grad_norm": 7.820633411407471, "learning_rate": 9.602315052391115e-05, "loss": 2.2316, "step": 130 }, { "epoch": 0.13336726902519724, "grad_norm": 7.196948528289795, "learning_rate": 9.596014080776423e-05, "loss": 2.276, "step": 131 }, { "epoch": 0.13438533978111478, "grad_norm": 10.125378608703613, "learning_rate": 9.589665684634196e-05, "loss": 3.6436, "step": 132 }, { "epoch": 0.13540341053703234, "grad_norm": 8.542695045471191, "learning_rate": 9.583269929471128e-05, "loss": 2.8726, "step": 133 }, { "epoch": 0.13642148129294987, "grad_norm": 8.097149848937988, "learning_rate": 9.576826881282594e-05, "loss": 2.3483, "step": 134 }, { "epoch": 0.1374395520488674, "grad_norm": 8.922883987426758, "learning_rate": 9.570336606551967e-05, "loss": 2.5365, "step": 135 }, { "epoch": 0.13845762280478494, "grad_norm": 9.18602180480957, "learning_rate": 9.56379917224993e-05, "loss": 2.7464, "step": 136 }, { "epoch": 0.13947569356070247, "grad_norm": 8.929719924926758, "learning_rate": 9.557214645833792e-05, "loss": 2.8074, "step": 137 }, { "epoch": 0.14049376431662, "grad_norm": 10.157453536987305, "learning_rate": 9.550583095246786e-05, "loss": 2.6313, "step": 138 }, { "epoch": 0.14151183507253753, "grad_norm": 8.677960395812988, "learning_rate": 9.543904588917367e-05, "loss": 2.7515, "step": 139 }, { "epoch": 0.14252990582845507, "grad_norm": 8.684197425842285, "learning_rate": 9.537179195758512e-05, "loss": 2.5564, "step": 140 }, { "epoch": 0.14354797658437263, "grad_norm": 8.283134460449219, "learning_rate": 9.530406985167004e-05, "loss": 2.3474, "step": 141 }, { "epoch": 0.14456604734029016, "grad_norm": 7.090147018432617, "learning_rate": 9.523588027022721e-05, "loss": 2.0495, "step": 142 }, { "epoch": 0.1455841180962077, "grad_norm": 9.59614086151123, "learning_rate": 9.516722391687902e-05, "loss": 2.4563, "step": 143 }, { "epoch": 0.14660218885212523, "grad_norm": 7.75164270401001, "learning_rate": 9.50981015000644e-05, "loss": 2.0795, "step": 144 }, { "epoch": 0.14762025960804276, "grad_norm": 9.117147445678711, "learning_rate": 9.502851373303136e-05, "loss": 2.519, "step": 145 }, { "epoch": 0.1486383303639603, "grad_norm": 9.871448516845703, "learning_rate": 9.495846133382973e-05, "loss": 2.6371, "step": 146 }, { "epoch": 0.14965640111987782, "grad_norm": 8.246638298034668, "learning_rate": 9.488794502530362e-05, "loss": 2.3142, "step": 147 }, { "epoch": 0.15067447187579536, "grad_norm": 11.579840660095215, "learning_rate": 9.48169655350841e-05, "loss": 2.8947, "step": 148 }, { "epoch": 0.15169254263171292, "grad_norm": 13.307292938232422, "learning_rate": 9.474552359558166e-05, "loss": 2.9942, "step": 149 }, { "epoch": 0.15271061338763045, "grad_norm": 10.210186958312988, "learning_rate": 9.467361994397859e-05, "loss": 2.0216, "step": 150 }, { "epoch": 0.15372868414354798, "grad_norm": 7.870486259460449, "learning_rate": 9.460125532222141e-05, "loss": 2.6203, "step": 151 }, { "epoch": 0.15474675489946552, "grad_norm": 13.753894805908203, "learning_rate": 9.452843047701323e-05, "loss": 4.1998, "step": 152 }, { "epoch": 0.15576482565538305, "grad_norm": 10.677061080932617, "learning_rate": 9.445514615980604e-05, "loss": 3.9647, "step": 153 }, { "epoch": 0.15678289641130058, "grad_norm": 11.903203010559082, "learning_rate": 9.438140312679291e-05, "loss": 4.2215, "step": 154 }, { "epoch": 0.15780096716721811, "grad_norm": 12.882353782653809, "learning_rate": 9.43072021389003e-05, "loss": 5.0153, "step": 155 }, { "epoch": 0.15881903792313565, "grad_norm": 13.99023151397705, "learning_rate": 9.423254396178003e-05, "loss": 5.5362, "step": 156 }, { "epoch": 0.1598371086790532, "grad_norm": 16.683727264404297, "learning_rate": 9.415742936580157e-05, "loss": 5.1149, "step": 157 }, { "epoch": 0.16085517943497074, "grad_norm": 17.32396125793457, "learning_rate": 9.408185912604394e-05, "loss": 4.8563, "step": 158 }, { "epoch": 0.16187325019088827, "grad_norm": 14.138668060302734, "learning_rate": 9.400583402228784e-05, "loss": 3.4698, "step": 159 }, { "epoch": 0.1628913209468058, "grad_norm": 6.4397430419921875, "learning_rate": 9.392935483900749e-05, "loss": 1.8856, "step": 160 }, { "epoch": 0.16390939170272334, "grad_norm": 4.72169303894043, "learning_rate": 9.38524223653626e-05, "loss": 1.3027, "step": 161 }, { "epoch": 0.16492746245864087, "grad_norm": 7.877247333526611, "learning_rate": 9.377503739519019e-05, "loss": 1.9129, "step": 162 }, { "epoch": 0.1659455332145584, "grad_norm": 8.524123191833496, "learning_rate": 9.369720072699647e-05, "loss": 1.5605, "step": 163 }, { "epoch": 0.16696360397047594, "grad_norm": 9.966007232666016, "learning_rate": 9.361891316394851e-05, "loss": 2.5458, "step": 164 }, { "epoch": 0.16798167472639347, "grad_norm": 9.061026573181152, "learning_rate": 9.354017551386599e-05, "loss": 1.8415, "step": 165 }, { "epoch": 0.16899974548231103, "grad_norm": 7.912156581878662, "learning_rate": 9.346098858921291e-05, "loss": 1.9514, "step": 166 }, { "epoch": 0.17001781623822856, "grad_norm": 6.926218509674072, "learning_rate": 9.338135320708911e-05, "loss": 2.1861, "step": 167 }, { "epoch": 0.1710358869941461, "grad_norm": 7.546460151672363, "learning_rate": 9.330127018922194e-05, "loss": 1.7336, "step": 168 }, { "epoch": 0.17205395775006363, "grad_norm": 6.780023097991943, "learning_rate": 9.322074036195769e-05, "loss": 1.766, "step": 169 }, { "epoch": 0.17307202850598116, "grad_norm": 8.207006454467773, "learning_rate": 9.313976455625315e-05, "loss": 1.937, "step": 170 }, { "epoch": 0.1740900992618987, "grad_norm": 10.892253875732422, "learning_rate": 9.305834360766695e-05, "loss": 2.6682, "step": 171 }, { "epoch": 0.17510817001781623, "grad_norm": 8.318902015686035, "learning_rate": 9.297647835635102e-05, "loss": 2.0102, "step": 172 }, { "epoch": 0.17612624077373376, "grad_norm": 7.727786540985107, "learning_rate": 9.289416964704185e-05, "loss": 1.9714, "step": 173 }, { "epoch": 0.17714431152965132, "grad_norm": 9.250336647033691, "learning_rate": 9.281141832905185e-05, "loss": 2.3855, "step": 174 }, { "epoch": 0.17816238228556885, "grad_norm": 7.347965717315674, "learning_rate": 9.272822525626046e-05, "loss": 1.8475, "step": 175 }, { "epoch": 0.1791804530414864, "grad_norm": 7.1732354164123535, "learning_rate": 9.26445912871055e-05, "loss": 1.9938, "step": 176 }, { "epoch": 0.18019852379740392, "grad_norm": 11.556361198425293, "learning_rate": 9.25605172845742e-05, "loss": 3.3699, "step": 177 }, { "epoch": 0.18121659455332145, "grad_norm": 9.626664161682129, "learning_rate": 9.247600411619434e-05, "loss": 2.7054, "step": 178 }, { "epoch": 0.18223466530923899, "grad_norm": 7.422823429107666, "learning_rate": 9.239105265402525e-05, "loss": 2.3665, "step": 179 }, { "epoch": 0.18325273606515652, "grad_norm": 8.812822341918945, "learning_rate": 9.23056637746489e-05, "loss": 2.4336, "step": 180 }, { "epoch": 0.18427080682107405, "grad_norm": 12.493931770324707, "learning_rate": 9.221983835916074e-05, "loss": 2.4446, "step": 181 }, { "epoch": 0.1852888775769916, "grad_norm": 9.533077239990234, "learning_rate": 9.213357729316076e-05, "loss": 2.5195, "step": 182 }, { "epoch": 0.18630694833290914, "grad_norm": 7.195649147033691, "learning_rate": 9.204688146674418e-05, "loss": 1.5695, "step": 183 }, { "epoch": 0.18732501908882668, "grad_norm": 10.850951194763184, "learning_rate": 9.195975177449238e-05, "loss": 3.3308, "step": 184 }, { "epoch": 0.1883430898447442, "grad_norm": 9.36767578125, "learning_rate": 9.187218911546362e-05, "loss": 2.8146, "step": 185 }, { "epoch": 0.18936116060066174, "grad_norm": 14.791803359985352, "learning_rate": 9.178419439318382e-05, "loss": 3.5093, "step": 186 }, { "epoch": 0.19037923135657928, "grad_norm": 10.107565879821777, "learning_rate": 9.169576851563715e-05, "loss": 2.4756, "step": 187 }, { "epoch": 0.1913973021124968, "grad_norm": 8.8936128616333, "learning_rate": 9.160691239525674e-05, "loss": 2.4272, "step": 188 }, { "epoch": 0.19241537286841434, "grad_norm": 8.861714363098145, "learning_rate": 9.151762694891521e-05, "loss": 2.1092, "step": 189 }, { "epoch": 0.1934334436243319, "grad_norm": 9.74419116973877, "learning_rate": 9.142791309791528e-05, "loss": 3.1339, "step": 190 }, { "epoch": 0.19445151438024944, "grad_norm": 10.207488059997559, "learning_rate": 9.133777176798013e-05, "loss": 2.5119, "step": 191 }, { "epoch": 0.19546958513616697, "grad_norm": 9.463604927062988, "learning_rate": 9.124720388924403e-05, "loss": 2.669, "step": 192 }, { "epoch": 0.1964876558920845, "grad_norm": 11.191435813903809, "learning_rate": 9.115621039624256e-05, "loss": 3.134, "step": 193 }, { "epoch": 0.19750572664800203, "grad_norm": 8.744293212890625, "learning_rate": 9.10647922279031e-05, "loss": 2.8205, "step": 194 }, { "epoch": 0.19852379740391957, "grad_norm": 9.338461875915527, "learning_rate": 9.09729503275351e-05, "loss": 2.2502, "step": 195 }, { "epoch": 0.1995418681598371, "grad_norm": 8.457433700561523, "learning_rate": 9.088068564282031e-05, "loss": 2.1407, "step": 196 }, { "epoch": 0.20055993891575463, "grad_norm": 11.790545463562012, "learning_rate": 9.078799912580304e-05, "loss": 3.0246, "step": 197 }, { "epoch": 0.2015780096716722, "grad_norm": 10.485797882080078, "learning_rate": 9.069489173288038e-05, "loss": 2.7989, "step": 198 }, { "epoch": 0.20259608042758973, "grad_norm": 10.064512252807617, "learning_rate": 9.060136442479215e-05, "loss": 2.3104, "step": 199 }, { "epoch": 0.20361415118350726, "grad_norm": 11.273386001586914, "learning_rate": 9.050741816661128e-05, "loss": 2.1308, "step": 200 }, { "epoch": 0.2046322219394248, "grad_norm": 7.872629642486572, "learning_rate": 9.041305392773354e-05, "loss": 3.2454, "step": 201 }, { "epoch": 0.20565029269534232, "grad_norm": 10.097418785095215, "learning_rate": 9.031827268186779e-05, "loss": 3.8778, "step": 202 }, { "epoch": 0.20666836345125986, "grad_norm": 9.544397354125977, "learning_rate": 9.022307540702576e-05, "loss": 3.5354, "step": 203 }, { "epoch": 0.2076864342071774, "grad_norm": 13.447309494018555, "learning_rate": 9.012746308551208e-05, "loss": 5.3594, "step": 204 }, { "epoch": 0.20870450496309492, "grad_norm": 12.501740455627441, "learning_rate": 9.003143670391403e-05, "loss": 3.5315, "step": 205 }, { "epoch": 0.20972257571901248, "grad_norm": 13.571687698364258, "learning_rate": 8.993499725309148e-05, "loss": 4.0421, "step": 206 }, { "epoch": 0.21074064647493002, "grad_norm": 14.879913330078125, "learning_rate": 8.983814572816656e-05, "loss": 4.1594, "step": 207 }, { "epoch": 0.21175871723084755, "grad_norm": 17.623329162597656, "learning_rate": 8.974088312851345e-05, "loss": 4.9946, "step": 208 }, { "epoch": 0.21277678798676508, "grad_norm": 6.669205665588379, "learning_rate": 8.964321045774807e-05, "loss": 1.5305, "step": 209 }, { "epoch": 0.21379485874268261, "grad_norm": 9.656936645507812, "learning_rate": 8.954512872371769e-05, "loss": 2.7299, "step": 210 }, { "epoch": 0.21481292949860015, "grad_norm": 7.008784770965576, "learning_rate": 8.944663893849052e-05, "loss": 1.4462, "step": 211 }, { "epoch": 0.21583100025451768, "grad_norm": 6.301548004150391, "learning_rate": 8.934774211834538e-05, "loss": 1.4093, "step": 212 }, { "epoch": 0.2168490710104352, "grad_norm": 7.544199466705322, "learning_rate": 8.924843928376104e-05, "loss": 1.6221, "step": 213 }, { "epoch": 0.21786714176635277, "grad_norm": 9.308175086975098, "learning_rate": 8.914873145940584e-05, "loss": 2.1724, "step": 214 }, { "epoch": 0.2188852125222703, "grad_norm": 8.202116012573242, "learning_rate": 8.904861967412703e-05, "loss": 1.7294, "step": 215 }, { "epoch": 0.21990328327818784, "grad_norm": 9.309891700744629, "learning_rate": 8.894810496094016e-05, "loss": 2.1319, "step": 216 }, { "epoch": 0.22092135403410537, "grad_norm": 7.8817925453186035, "learning_rate": 8.884718835701848e-05, "loss": 2.0479, "step": 217 }, { "epoch": 0.2219394247900229, "grad_norm": 7.9436116218566895, "learning_rate": 8.874587090368221e-05, "loss": 1.9141, "step": 218 }, { "epoch": 0.22295749554594044, "grad_norm": 9.188081741333008, "learning_rate": 8.86441536463877e-05, "loss": 2.5944, "step": 219 }, { "epoch": 0.22397556630185797, "grad_norm": 9.442697525024414, "learning_rate": 8.85420376347168e-05, "loss": 2.616, "step": 220 }, { "epoch": 0.2249936370577755, "grad_norm": 7.059047222137451, "learning_rate": 8.843952392236594e-05, "loss": 1.8199, "step": 221 }, { "epoch": 0.22601170781369306, "grad_norm": 9.448399543762207, "learning_rate": 8.833661356713528e-05, "loss": 2.2707, "step": 222 }, { "epoch": 0.2270297785696106, "grad_norm": 7.232347011566162, "learning_rate": 8.823330763091775e-05, "loss": 2.2834, "step": 223 }, { "epoch": 0.22804784932552813, "grad_norm": 7.126833438873291, "learning_rate": 8.812960717968818e-05, "loss": 2.2613, "step": 224 }, { "epoch": 0.22906592008144566, "grad_norm": 7.250087261199951, "learning_rate": 8.802551328349222e-05, "loss": 2.0233, "step": 225 }, { "epoch": 0.2300839908373632, "grad_norm": 9.801566123962402, "learning_rate": 8.792102701643531e-05, "loss": 2.6283, "step": 226 }, { "epoch": 0.23110206159328073, "grad_norm": 8.86218547821045, "learning_rate": 8.781614945667169e-05, "loss": 2.7821, "step": 227 }, { "epoch": 0.23212013234919826, "grad_norm": 7.009481430053711, "learning_rate": 8.771088168639312e-05, "loss": 2.187, "step": 228 }, { "epoch": 0.2331382031051158, "grad_norm": 7.643123149871826, "learning_rate": 8.760522479181784e-05, "loss": 2.0065, "step": 229 }, { "epoch": 0.23415627386103335, "grad_norm": 6.573335647583008, "learning_rate": 8.749917986317928e-05, "loss": 1.939, "step": 230 }, { "epoch": 0.2351743446169509, "grad_norm": 9.001991271972656, "learning_rate": 8.73927479947149e-05, "loss": 2.8534, "step": 231 }, { "epoch": 0.23619241537286842, "grad_norm": 9.186355590820312, "learning_rate": 8.72859302846548e-05, "loss": 3.112, "step": 232 }, { "epoch": 0.23721048612878595, "grad_norm": 9.961040496826172, "learning_rate": 8.717872783521047e-05, "loss": 3.2593, "step": 233 }, { "epoch": 0.23822855688470349, "grad_norm": 8.34619426727295, "learning_rate": 8.707114175256335e-05, "loss": 2.2664, "step": 234 }, { "epoch": 0.23924662764062102, "grad_norm": 7.473055839538574, "learning_rate": 8.696317314685341e-05, "loss": 2.8765, "step": 235 }, { "epoch": 0.24026469839653855, "grad_norm": 6.791398048400879, "learning_rate": 8.685482313216783e-05, "loss": 2.098, "step": 236 }, { "epoch": 0.24128276915245608, "grad_norm": 9.765985488891602, "learning_rate": 8.674609282652934e-05, "loss": 3.2374, "step": 237 }, { "epoch": 0.24230083990837364, "grad_norm": 7.459610462188721, "learning_rate": 8.663698335188477e-05, "loss": 2.456, "step": 238 }, { "epoch": 0.24331891066429118, "grad_norm": 8.42564868927002, "learning_rate": 8.65274958340934e-05, "loss": 2.3464, "step": 239 }, { "epoch": 0.2443369814202087, "grad_norm": 7.114076137542725, "learning_rate": 8.641763140291545e-05, "loss": 2.1128, "step": 240 }, { "epoch": 0.24535505217612624, "grad_norm": 9.573076248168945, "learning_rate": 8.630739119200035e-05, "loss": 2.4448, "step": 241 }, { "epoch": 0.24637312293204378, "grad_norm": 7.850905895233154, "learning_rate": 8.619677633887509e-05, "loss": 2.446, "step": 242 }, { "epoch": 0.2473911936879613, "grad_norm": 9.630354881286621, "learning_rate": 8.608578798493236e-05, "loss": 2.3875, "step": 243 }, { "epoch": 0.24840926444387884, "grad_norm": 7.196229457855225, "learning_rate": 8.597442727541897e-05, "loss": 1.6186, "step": 244 }, { "epoch": 0.24942733519979637, "grad_norm": 11.08008098602295, "learning_rate": 8.586269535942385e-05, "loss": 2.839, "step": 245 }, { "epoch": 0.2504454059557139, "grad_norm": 8.258538246154785, "learning_rate": 8.575059338986633e-05, "loss": 2.2807, "step": 246 }, { "epoch": 0.25146347671163144, "grad_norm": 9.670249938964844, "learning_rate": 8.563812252348411e-05, "loss": 2.2475, "step": 247 }, { "epoch": 0.252481547467549, "grad_norm": 9.350224494934082, "learning_rate": 8.552528392082147e-05, "loss": 2.5073, "step": 248 }, { "epoch": 0.2534996182234665, "grad_norm": 9.628990173339844, "learning_rate": 8.541207874621718e-05, "loss": 2.0659, "step": 249 }, { "epoch": 0.2545176889793841, "grad_norm": 9.84993839263916, "learning_rate": 8.529850816779251e-05, "loss": 2.2365, "step": 250 }, { "epoch": 0.2555357597353016, "grad_norm": 8.845046997070312, "learning_rate": 8.518457335743926e-05, "loss": 3.1796, "step": 251 }, { "epoch": 0.25655383049121916, "grad_norm": 10.79970932006836, "learning_rate": 8.507027549080753e-05, "loss": 3.8036, "step": 252 }, { "epoch": 0.2575719012471367, "grad_norm": 8.892024993896484, "learning_rate": 8.495561574729369e-05, "loss": 2.9368, "step": 253 }, { "epoch": 0.2585899720030542, "grad_norm": 11.658991813659668, "learning_rate": 8.484059531002821e-05, "loss": 3.7456, "step": 254 }, { "epoch": 0.25960804275897176, "grad_norm": 11.338571548461914, "learning_rate": 8.472521536586335e-05, "loss": 3.9418, "step": 255 }, { "epoch": 0.2606261135148893, "grad_norm": 14.362560272216797, "learning_rate": 8.460947710536107e-05, "loss": 4.6011, "step": 256 }, { "epoch": 0.2616441842708068, "grad_norm": 13.662555694580078, "learning_rate": 8.449338172278059e-05, "loss": 5.049, "step": 257 }, { "epoch": 0.26266225502672436, "grad_norm": 15.036532402038574, "learning_rate": 8.437693041606618e-05, "loss": 4.0385, "step": 258 }, { "epoch": 0.2636803257826419, "grad_norm": 12.57422161102295, "learning_rate": 8.426012438683473e-05, "loss": 3.1101, "step": 259 }, { "epoch": 0.2646983965385594, "grad_norm": 8.874217987060547, "learning_rate": 8.414296484036339e-05, "loss": 2.3986, "step": 260 }, { "epoch": 0.26571646729447695, "grad_norm": 5.300018787384033, "learning_rate": 8.402545298557712e-05, "loss": 0.9408, "step": 261 }, { "epoch": 0.2667345380503945, "grad_norm": 6.752171039581299, "learning_rate": 8.390759003503623e-05, "loss": 1.8722, "step": 262 }, { "epoch": 0.267752608806312, "grad_norm": 7.508800029754639, "learning_rate": 8.378937720492384e-05, "loss": 1.7145, "step": 263 }, { "epoch": 0.26877067956222955, "grad_norm": 6.305329322814941, "learning_rate": 8.367081571503332e-05, "loss": 1.6567, "step": 264 }, { "epoch": 0.2697887503181471, "grad_norm": 9.371475219726562, "learning_rate": 8.355190678875578e-05, "loss": 2.4242, "step": 265 }, { "epoch": 0.2708068210740647, "grad_norm": 5.403337001800537, "learning_rate": 8.343265165306735e-05, "loss": 1.3716, "step": 266 }, { "epoch": 0.2718248918299822, "grad_norm": 8.240038871765137, "learning_rate": 8.331305153851658e-05, "loss": 2.2134, "step": 267 }, { "epoch": 0.27284296258589974, "grad_norm": 5.857060432434082, "learning_rate": 8.319310767921174e-05, "loss": 1.2823, "step": 268 }, { "epoch": 0.2738610333418173, "grad_norm": 6.11975622177124, "learning_rate": 8.307282131280804e-05, "loss": 1.7163, "step": 269 }, { "epoch": 0.2748791040977348, "grad_norm": 10.722909927368164, "learning_rate": 8.295219368049494e-05, "loss": 2.2343, "step": 270 }, { "epoch": 0.27589717485365234, "grad_norm": 6.3688507080078125, "learning_rate": 8.283122602698323e-05, "loss": 1.253, "step": 271 }, { "epoch": 0.27691524560956987, "grad_norm": 8.231119155883789, "learning_rate": 8.27099196004923e-05, "loss": 2.4005, "step": 272 }, { "epoch": 0.2779333163654874, "grad_norm": 6.623697757720947, "learning_rate": 8.258827565273718e-05, "loss": 1.5276, "step": 273 }, { "epoch": 0.27895138712140494, "grad_norm": 8.357768058776855, "learning_rate": 8.246629543891569e-05, "loss": 2.5312, "step": 274 }, { "epoch": 0.27996945787732247, "grad_norm": 7.037582874298096, "learning_rate": 8.23439802176954e-05, "loss": 2.6555, "step": 275 }, { "epoch": 0.28098752863324, "grad_norm": 8.760072708129883, "learning_rate": 8.222133125120076e-05, "loss": 2.051, "step": 276 }, { "epoch": 0.28200559938915754, "grad_norm": 10.407620429992676, "learning_rate": 8.209834980499995e-05, "loss": 2.7866, "step": 277 }, { "epoch": 0.28302367014507507, "grad_norm": 10.065641403198242, "learning_rate": 8.197503714809191e-05, "loss": 2.7393, "step": 278 }, { "epoch": 0.2840417409009926, "grad_norm": 8.072295188903809, "learning_rate": 8.185139455289322e-05, "loss": 2.1416, "step": 279 }, { "epoch": 0.28505981165691013, "grad_norm": 10.837465286254883, "learning_rate": 8.172742329522493e-05, "loss": 3.0516, "step": 280 }, { "epoch": 0.28607788241282767, "grad_norm": 8.73945426940918, "learning_rate": 8.160312465429952e-05, "loss": 2.6516, "step": 281 }, { "epoch": 0.28709595316874525, "grad_norm": 9.408519744873047, "learning_rate": 8.147849991270752e-05, "loss": 2.4367, "step": 282 }, { "epoch": 0.2881140239246628, "grad_norm": 8.221115112304688, "learning_rate": 8.135355035640444e-05, "loss": 2.4484, "step": 283 }, { "epoch": 0.2891320946805803, "grad_norm": 11.46916389465332, "learning_rate": 8.122827727469737e-05, "loss": 3.5208, "step": 284 }, { "epoch": 0.29015016543649785, "grad_norm": 8.721375465393066, "learning_rate": 8.110268196023179e-05, "loss": 2.3896, "step": 285 }, { "epoch": 0.2911682361924154, "grad_norm": 9.857149124145508, "learning_rate": 8.097676570897814e-05, "loss": 2.8248, "step": 286 }, { "epoch": 0.2921863069483329, "grad_norm": 7.732857704162598, "learning_rate": 8.085052982021847e-05, "loss": 2.0455, "step": 287 }, { "epoch": 0.29320437770425045, "grad_norm": 9.654264450073242, "learning_rate": 8.072397559653313e-05, "loss": 2.3959, "step": 288 }, { "epoch": 0.294222448460168, "grad_norm": 10.697869300842285, "learning_rate": 8.059710434378715e-05, "loss": 3.314, "step": 289 }, { "epoch": 0.2952405192160855, "grad_norm": 8.84398078918457, "learning_rate": 8.046991737111696e-05, "loss": 2.0514, "step": 290 }, { "epoch": 0.29625858997200305, "grad_norm": 7.673434257507324, "learning_rate": 8.034241599091665e-05, "loss": 2.165, "step": 291 }, { "epoch": 0.2972766607279206, "grad_norm": 10.299299240112305, "learning_rate": 8.021460151882471e-05, "loss": 3.0283, "step": 292 }, { "epoch": 0.2982947314838381, "grad_norm": 6.935961723327637, "learning_rate": 8.008647527371023e-05, "loss": 1.9187, "step": 293 }, { "epoch": 0.29931280223975565, "grad_norm": 9.410109519958496, "learning_rate": 7.995803857765933e-05, "loss": 2.4798, "step": 294 }, { "epoch": 0.3003308729956732, "grad_norm": 9.035164833068848, "learning_rate": 7.982929275596166e-05, "loss": 2.8312, "step": 295 }, { "epoch": 0.3013489437515907, "grad_norm": 8.214160919189453, "learning_rate": 7.970023913709652e-05, "loss": 2.4572, "step": 296 }, { "epoch": 0.30236701450750825, "grad_norm": 9.095396041870117, "learning_rate": 7.957087905271934e-05, "loss": 2.4834, "step": 297 }, { "epoch": 0.30338508526342584, "grad_norm": 9.806940078735352, "learning_rate": 7.944121383764776e-05, "loss": 2.6364, "step": 298 }, { "epoch": 0.30440315601934337, "grad_norm": 10.004327774047852, "learning_rate": 7.931124482984802e-05, "loss": 2.4236, "step": 299 }, { "epoch": 0.3054212267752609, "grad_norm": 12.964902877807617, "learning_rate": 7.918097337042105e-05, "loss": 2.542, "step": 300 }, { "epoch": 0.30643929753117843, "grad_norm": 7.418375015258789, "learning_rate": 7.905040080358868e-05, "loss": 2.7417, "step": 301 }, { "epoch": 0.30745736828709597, "grad_norm": 8.499213218688965, "learning_rate": 7.891952847667973e-05, "loss": 3.6269, "step": 302 }, { "epoch": 0.3084754390430135, "grad_norm": 8.255233764648438, "learning_rate": 7.878835774011615e-05, "loss": 2.7048, "step": 303 }, { "epoch": 0.30949350979893103, "grad_norm": 9.611371040344238, "learning_rate": 7.865688994739907e-05, "loss": 3.9859, "step": 304 }, { "epoch": 0.31051158055484857, "grad_norm": 9.825111389160156, "learning_rate": 7.85251264550948e-05, "loss": 3.2304, "step": 305 }, { "epoch": 0.3115296513107661, "grad_norm": 13.28979778289795, "learning_rate": 7.839306862282089e-05, "loss": 3.8321, "step": 306 }, { "epoch": 0.31254772206668363, "grad_norm": 16.412532806396484, "learning_rate": 7.826071781323207e-05, "loss": 5.4082, "step": 307 }, { "epoch": 0.31356579282260116, "grad_norm": 9.727624893188477, "learning_rate": 7.812807539200622e-05, "loss": 2.5533, "step": 308 }, { "epoch": 0.3145838635785187, "grad_norm": 8.238031387329102, "learning_rate": 7.799514272783014e-05, "loss": 2.2857, "step": 309 }, { "epoch": 0.31560193433443623, "grad_norm": 6.882323741912842, "learning_rate": 7.786192119238567e-05, "loss": 2.1371, "step": 310 }, { "epoch": 0.31662000509035376, "grad_norm": 5.3293280601501465, "learning_rate": 7.772841216033533e-05, "loss": 1.1834, "step": 311 }, { "epoch": 0.3176380758462713, "grad_norm": 5.593384265899658, "learning_rate": 7.759461700930823e-05, "loss": 1.4746, "step": 312 }, { "epoch": 0.31865614660218883, "grad_norm": 5.281317234039307, "learning_rate": 7.746053711988583e-05, "loss": 1.1387, "step": 313 }, { "epoch": 0.3196742173581064, "grad_norm": 6.735507965087891, "learning_rate": 7.73261738755877e-05, "loss": 1.9801, "step": 314 }, { "epoch": 0.32069228811402395, "grad_norm": 6.708221912384033, "learning_rate": 7.719152866285721e-05, "loss": 1.863, "step": 315 }, { "epoch": 0.3217103588699415, "grad_norm": 8.238001823425293, "learning_rate": 7.70566028710473e-05, "loss": 2.16, "step": 316 }, { "epoch": 0.322728429625859, "grad_norm": 6.396310329437256, "learning_rate": 7.692139789240611e-05, "loss": 2.1722, "step": 317 }, { "epoch": 0.32374650038177655, "grad_norm": 8.0552396774292, "learning_rate": 7.678591512206255e-05, "loss": 2.2866, "step": 318 }, { "epoch": 0.3247645711376941, "grad_norm": 5.051511287689209, "learning_rate": 7.665015595801197e-05, "loss": 1.0846, "step": 319 }, { "epoch": 0.3257826418936116, "grad_norm": 7.378931045532227, "learning_rate": 7.651412180110176e-05, "loss": 1.7923, "step": 320 }, { "epoch": 0.32680071264952915, "grad_norm": 7.6839118003845215, "learning_rate": 7.637781405501681e-05, "loss": 1.3504, "step": 321 }, { "epoch": 0.3278187834054467, "grad_norm": 7.594010829925537, "learning_rate": 7.624123412626512e-05, "loss": 2.5312, "step": 322 }, { "epoch": 0.3288368541613642, "grad_norm": 7.310485363006592, "learning_rate": 7.610438342416319e-05, "loss": 1.8773, "step": 323 }, { "epoch": 0.32985492491728174, "grad_norm": 10.614038467407227, "learning_rate": 7.596726336082158e-05, "loss": 2.8128, "step": 324 }, { "epoch": 0.3308729956731993, "grad_norm": 7.768847465515137, "learning_rate": 7.582987535113023e-05, "loss": 2.2407, "step": 325 }, { "epoch": 0.3318910664291168, "grad_norm": 9.793490409851074, "learning_rate": 7.569222081274395e-05, "loss": 2.3751, "step": 326 }, { "epoch": 0.33290913718503434, "grad_norm": 9.077165603637695, "learning_rate": 7.555430116606778e-05, "loss": 2.3227, "step": 327 }, { "epoch": 0.3339272079409519, "grad_norm": 8.083662986755371, "learning_rate": 7.541611783424225e-05, "loss": 2.4783, "step": 328 }, { "epoch": 0.3349452786968694, "grad_norm": 8.170014381408691, "learning_rate": 7.527767224312883e-05, "loss": 2.5388, "step": 329 }, { "epoch": 0.33596334945278694, "grad_norm": 7.519252777099609, "learning_rate": 7.513896582129508e-05, "loss": 2.384, "step": 330 }, { "epoch": 0.33698142020870453, "grad_norm": 9.94649887084961, "learning_rate": 7.500000000000001e-05, "loss": 2.7797, "step": 331 }, { "epoch": 0.33799949096462206, "grad_norm": 8.595416069030762, "learning_rate": 7.486077621317926e-05, "loss": 2.7035, "step": 332 }, { "epoch": 0.3390175617205396, "grad_norm": 7.753424644470215, "learning_rate": 7.472129589743033e-05, "loss": 2.0287, "step": 333 }, { "epoch": 0.34003563247645713, "grad_norm": 9.271249771118164, "learning_rate": 7.458156049199775e-05, "loss": 2.1601, "step": 334 }, { "epoch": 0.34105370323237466, "grad_norm": 8.564419746398926, "learning_rate": 7.44415714387582e-05, "loss": 2.9013, "step": 335 }, { "epoch": 0.3420717739882922, "grad_norm": 8.431052207946777, "learning_rate": 7.430133018220567e-05, "loss": 2.4643, "step": 336 }, { "epoch": 0.3430898447442097, "grad_norm": 6.7154436111450195, "learning_rate": 7.416083816943653e-05, "loss": 2.271, "step": 337 }, { "epoch": 0.34410791550012726, "grad_norm": 9.483381271362305, "learning_rate": 7.402009685013463e-05, "loss": 2.489, "step": 338 }, { "epoch": 0.3451259862560448, "grad_norm": 7.885382175445557, "learning_rate": 7.38791076765563e-05, "loss": 3.1875, "step": 339 }, { "epoch": 0.3461440570119623, "grad_norm": 7.622438430786133, "learning_rate": 7.373787210351541e-05, "loss": 2.0865, "step": 340 }, { "epoch": 0.34716212776787986, "grad_norm": 7.785037517547607, "learning_rate": 7.359639158836828e-05, "loss": 2.0806, "step": 341 }, { "epoch": 0.3481801985237974, "grad_norm": 7.861757755279541, "learning_rate": 7.345466759099875e-05, "loss": 2.4029, "step": 342 }, { "epoch": 0.3491982692797149, "grad_norm": 8.165399551391602, "learning_rate": 7.331270157380303e-05, "loss": 2.1438, "step": 343 }, { "epoch": 0.35021634003563246, "grad_norm": 8.010607719421387, "learning_rate": 7.317049500167465e-05, "loss": 1.8633, "step": 344 }, { "epoch": 0.35123441079155, "grad_norm": 8.535947799682617, "learning_rate": 7.302804934198936e-05, "loss": 2.2122, "step": 345 }, { "epoch": 0.3522524815474675, "grad_norm": 8.71279239654541, "learning_rate": 7.28853660645899e-05, "loss": 2.4223, "step": 346 }, { "epoch": 0.3532705523033851, "grad_norm": 9.44794750213623, "learning_rate": 7.274244664177097e-05, "loss": 2.5881, "step": 347 }, { "epoch": 0.35428862305930264, "grad_norm": 10.474303245544434, "learning_rate": 7.259929254826392e-05, "loss": 1.9121, "step": 348 }, { "epoch": 0.3553066938152202, "grad_norm": 9.976726531982422, "learning_rate": 7.245590526122159e-05, "loss": 2.3289, "step": 349 }, { "epoch": 0.3563247645711377, "grad_norm": 10.359407424926758, "learning_rate": 7.231228626020304e-05, "loss": 2.0781, "step": 350 }, { "epoch": 0.35734283532705524, "grad_norm": 10.392163276672363, "learning_rate": 7.216843702715831e-05, "loss": 4.3803, "step": 351 }, { "epoch": 0.3583609060829728, "grad_norm": 8.603669166564941, "learning_rate": 7.202435904641315e-05, "loss": 3.3045, "step": 352 }, { "epoch": 0.3593789768388903, "grad_norm": 11.44635009765625, "learning_rate": 7.188005380465364e-05, "loss": 5.3026, "step": 353 }, { "epoch": 0.36039704759480784, "grad_norm": 11.560089111328125, "learning_rate": 7.173552279091087e-05, "loss": 4.0946, "step": 354 }, { "epoch": 0.3614151183507254, "grad_norm": 12.211684226989746, "learning_rate": 7.159076749654559e-05, "loss": 3.7029, "step": 355 }, { "epoch": 0.3624331891066429, "grad_norm": 13.772133827209473, "learning_rate": 7.144578941523284e-05, "loss": 4.3069, "step": 356 }, { "epoch": 0.36345125986256044, "grad_norm": 11.917257308959961, "learning_rate": 7.130059004294647e-05, "loss": 3.3222, "step": 357 }, { "epoch": 0.36446933061847797, "grad_norm": 15.86817741394043, "learning_rate": 7.115517087794381e-05, "loss": 3.5182, "step": 358 }, { "epoch": 0.3654874013743955, "grad_norm": 6.566526412963867, "learning_rate": 7.10095334207501e-05, "loss": 1.7594, "step": 359 }, { "epoch": 0.36650547213031304, "grad_norm": 6.9844560623168945, "learning_rate": 7.086367917414306e-05, "loss": 1.6552, "step": 360 }, { "epoch": 0.36752354288623057, "grad_norm": 9.259458541870117, "learning_rate": 7.07176096431374e-05, "loss": 2.2072, "step": 361 }, { "epoch": 0.3685416136421481, "grad_norm": 8.759178161621094, "learning_rate": 7.057132633496923e-05, "loss": 1.7327, "step": 362 }, { "epoch": 0.3695596843980657, "grad_norm": 7.46987247467041, "learning_rate": 7.042483075908062e-05, "loss": 1.6727, "step": 363 }, { "epoch": 0.3705777551539832, "grad_norm": 7.011016368865967, "learning_rate": 7.027812442710385e-05, "loss": 1.643, "step": 364 }, { "epoch": 0.37159582590990076, "grad_norm": 8.475665092468262, "learning_rate": 7.013120885284598e-05, "loss": 2.4295, "step": 365 }, { "epoch": 0.3726138966658183, "grad_norm": 5.769803047180176, "learning_rate": 6.998408555227314e-05, "loss": 1.3708, "step": 366 }, { "epoch": 0.3736319674217358, "grad_norm": 6.653828144073486, "learning_rate": 6.983675604349493e-05, "loss": 2.111, "step": 367 }, { "epoch": 0.37465003817765336, "grad_norm": 8.172953605651855, "learning_rate": 6.968922184674867e-05, "loss": 2.3177, "step": 368 }, { "epoch": 0.3756681089335709, "grad_norm": 6.391868591308594, "learning_rate": 6.954148448438389e-05, "loss": 1.2711, "step": 369 }, { "epoch": 0.3766861796894884, "grad_norm": 8.03226375579834, "learning_rate": 6.93935454808464e-05, "loss": 1.7306, "step": 370 }, { "epoch": 0.37770425044540595, "grad_norm": 5.273244857788086, "learning_rate": 6.924540636266272e-05, "loss": 1.3542, "step": 371 }, { "epoch": 0.3787223212013235, "grad_norm": 9.628256797790527, "learning_rate": 6.909706865842429e-05, "loss": 2.7357, "step": 372 }, { "epoch": 0.379740391957241, "grad_norm": 11.36279582977295, "learning_rate": 6.894853389877163e-05, "loss": 2.2367, "step": 373 }, { "epoch": 0.38075846271315855, "grad_norm": 8.72659969329834, "learning_rate": 6.879980361637866e-05, "loss": 2.2005, "step": 374 }, { "epoch": 0.3817765334690761, "grad_norm": 7.1913042068481445, "learning_rate": 6.86508793459368e-05, "loss": 2.1034, "step": 375 }, { "epoch": 0.3827946042249936, "grad_norm": 8.96323299407959, "learning_rate": 6.850176262413912e-05, "loss": 2.8465, "step": 376 }, { "epoch": 0.38381267498091115, "grad_norm": 6.918330192565918, "learning_rate": 6.835245498966461e-05, "loss": 1.8181, "step": 377 }, { "epoch": 0.3848307457368287, "grad_norm": 9.063780784606934, "learning_rate": 6.820295798316214e-05, "loss": 2.4932, "step": 378 }, { "epoch": 0.38584881649274627, "grad_norm": 13.343623161315918, "learning_rate": 6.805327314723468e-05, "loss": 3.0215, "step": 379 }, { "epoch": 0.3868668872486638, "grad_norm": 7.741687774658203, "learning_rate": 6.790340202642332e-05, "loss": 2.1827, "step": 380 }, { "epoch": 0.38788495800458134, "grad_norm": 5.362514495849609, "learning_rate": 6.775334616719136e-05, "loss": 1.7059, "step": 381 }, { "epoch": 0.38890302876049887, "grad_norm": 10.59506893157959, "learning_rate": 6.760310711790832e-05, "loss": 2.6664, "step": 382 }, { "epoch": 0.3899210995164164, "grad_norm": 9.036832809448242, "learning_rate": 6.745268642883404e-05, "loss": 2.5482, "step": 383 }, { "epoch": 0.39093917027233394, "grad_norm": 7.859615802764893, "learning_rate": 6.73020856521026e-05, "loss": 2.1062, "step": 384 }, { "epoch": 0.39195724102825147, "grad_norm": 9.639580726623535, "learning_rate": 6.715130634170635e-05, "loss": 3.0521, "step": 385 }, { "epoch": 0.392975311784169, "grad_norm": 8.098716735839844, "learning_rate": 6.700035005347983e-05, "loss": 2.6295, "step": 386 }, { "epoch": 0.39399338254008653, "grad_norm": 11.05691146850586, "learning_rate": 6.684921834508379e-05, "loss": 2.6122, "step": 387 }, { "epoch": 0.39501145329600407, "grad_norm": 9.071178436279297, "learning_rate": 6.669791277598904e-05, "loss": 2.3797, "step": 388 }, { "epoch": 0.3960295240519216, "grad_norm": 9.826159477233887, "learning_rate": 6.654643490746042e-05, "loss": 2.4635, "step": 389 }, { "epoch": 0.39704759480783913, "grad_norm": 7.310181140899658, "learning_rate": 6.639478630254064e-05, "loss": 2.06, "step": 390 }, { "epoch": 0.39806566556375667, "grad_norm": 9.507575035095215, "learning_rate": 6.624296852603419e-05, "loss": 2.9877, "step": 391 }, { "epoch": 0.3990837363196742, "grad_norm": 7.882664680480957, "learning_rate": 6.609098314449116e-05, "loss": 2.1182, "step": 392 }, { "epoch": 0.40010180707559173, "grad_norm": 9.889808654785156, "learning_rate": 6.593883172619111e-05, "loss": 3.1559, "step": 393 }, { "epoch": 0.40111987783150926, "grad_norm": 9.180678367614746, "learning_rate": 6.578651584112686e-05, "loss": 2.6704, "step": 394 }, { "epoch": 0.40213794858742685, "grad_norm": 8.127158164978027, "learning_rate": 6.563403706098833e-05, "loss": 2.1218, "step": 395 }, { "epoch": 0.4031560193433444, "grad_norm": 9.298659324645996, "learning_rate": 6.548139695914622e-05, "loss": 3.3196, "step": 396 }, { "epoch": 0.4041740900992619, "grad_norm": 7.540867328643799, "learning_rate": 6.532859711063594e-05, "loss": 1.9205, "step": 397 }, { "epoch": 0.40519216085517945, "grad_norm": 8.0338716506958, "learning_rate": 6.51756390921412e-05, "loss": 2.2858, "step": 398 }, { "epoch": 0.406210231611097, "grad_norm": 8.012909889221191, "learning_rate": 6.502252448197782e-05, "loss": 2.3761, "step": 399 }, { "epoch": 0.4072283023670145, "grad_norm": 7.803510665893555, "learning_rate": 6.486925486007742e-05, "loss": 2.0418, "step": 400 }, { "epoch": 0.40824637312293205, "grad_norm": 6.121700286865234, "learning_rate": 6.471583180797121e-05, "loss": 2.1481, "step": 401 }, { "epoch": 0.4092644438788496, "grad_norm": 8.690316200256348, "learning_rate": 6.456225690877344e-05, "loss": 3.0496, "step": 402 }, { "epoch": 0.4102825146347671, "grad_norm": 8.961786270141602, "learning_rate": 6.440853174716534e-05, "loss": 3.4188, "step": 403 }, { "epoch": 0.41130058539068465, "grad_norm": 9.848220825195312, "learning_rate": 6.425465790937861e-05, "loss": 3.948, "step": 404 }, { "epoch": 0.4123186561466022, "grad_norm": 11.758772850036621, "learning_rate": 6.410063698317901e-05, "loss": 4.4288, "step": 405 }, { "epoch": 0.4133367269025197, "grad_norm": 10.132964134216309, "learning_rate": 6.394647055785017e-05, "loss": 3.4126, "step": 406 }, { "epoch": 0.41435479765843725, "grad_norm": 9.949785232543945, "learning_rate": 6.379216022417696e-05, "loss": 2.584, "step": 407 }, { "epoch": 0.4153728684143548, "grad_norm": 13.347235679626465, "learning_rate": 6.363770757442927e-05, "loss": 2.8766, "step": 408 }, { "epoch": 0.4163909391702723, "grad_norm": 7.714400768280029, "learning_rate": 6.348311420234542e-05, "loss": 2.4595, "step": 409 }, { "epoch": 0.41740900992618984, "grad_norm": 5.146259307861328, "learning_rate": 6.332838170311585e-05, "loss": 1.4562, "step": 410 }, { "epoch": 0.41842708068210743, "grad_norm": 5.76894474029541, "learning_rate": 6.31735116733666e-05, "loss": 1.3719, "step": 411 }, { "epoch": 0.41944515143802497, "grad_norm": 5.680314540863037, "learning_rate": 6.301850571114281e-05, "loss": 1.173, "step": 412 }, { "epoch": 0.4204632221939425, "grad_norm": 8.60592269897461, "learning_rate": 6.286336541589224e-05, "loss": 2.2157, "step": 413 }, { "epoch": 0.42148129294986003, "grad_norm": 6.760054588317871, "learning_rate": 6.27080923884488e-05, "loss": 1.6651, "step": 414 }, { "epoch": 0.42249936370577756, "grad_norm": 6.813849925994873, "learning_rate": 6.255268823101605e-05, "loss": 1.6109, "step": 415 }, { "epoch": 0.4235174344616951, "grad_norm": 7.729984760284424, "learning_rate": 6.239715454715054e-05, "loss": 2.0043, "step": 416 }, { "epoch": 0.42453550521761263, "grad_norm": 6.723374366760254, "learning_rate": 6.224149294174548e-05, "loss": 1.7516, "step": 417 }, { "epoch": 0.42555357597353016, "grad_norm": 7.92853307723999, "learning_rate": 6.208570502101393e-05, "loss": 2.0667, "step": 418 }, { "epoch": 0.4265716467294477, "grad_norm": 8.561442375183105, "learning_rate": 6.192979239247243e-05, "loss": 2.3514, "step": 419 }, { "epoch": 0.42758971748536523, "grad_norm": 6.065650463104248, "learning_rate": 6.177375666492431e-05, "loss": 1.6079, "step": 420 }, { "epoch": 0.42860778824128276, "grad_norm": 7.547060489654541, "learning_rate": 6.161759944844308e-05, "loss": 1.6165, "step": 421 }, { "epoch": 0.4296258589972003, "grad_norm": 7.750328540802002, "learning_rate": 6.146132235435591e-05, "loss": 2.6017, "step": 422 }, { "epoch": 0.4306439297531178, "grad_norm": 8.826605796813965, "learning_rate": 6.13049269952269e-05, "loss": 2.8712, "step": 423 }, { "epoch": 0.43166200050903536, "grad_norm": 9.169108390808105, "learning_rate": 6.114841498484048e-05, "loss": 3.1703, "step": 424 }, { "epoch": 0.4326800712649529, "grad_norm": 10.452327728271484, "learning_rate": 6.0991787938184784e-05, "loss": 2.8737, "step": 425 }, { "epoch": 0.4336981420208704, "grad_norm": 6.921334743499756, "learning_rate": 6.0835047471434955e-05, "loss": 2.1521, "step": 426 }, { "epoch": 0.434716212776788, "grad_norm": 7.088666915893555, "learning_rate": 6.067819520193645e-05, "loss": 1.833, "step": 427 }, { "epoch": 0.43573428353270555, "grad_norm": 7.354036331176758, "learning_rate": 6.052123274818842e-05, "loss": 2.0779, "step": 428 }, { "epoch": 0.4367523542886231, "grad_norm": 6.732209205627441, "learning_rate": 6.0364161729826905e-05, "loss": 1.8935, "step": 429 }, { "epoch": 0.4377704250445406, "grad_norm": 7.724286079406738, "learning_rate": 6.020698376760824e-05, "loss": 2.334, "step": 430 }, { "epoch": 0.43878849580045814, "grad_norm": 7.147849082946777, "learning_rate": 6.004970048339226e-05, "loss": 1.9002, "step": 431 }, { "epoch": 0.4398065665563757, "grad_norm": 10.327398300170898, "learning_rate": 5.989231350012554e-05, "loss": 3.3501, "step": 432 }, { "epoch": 0.4408246373122932, "grad_norm": 8.666500091552734, "learning_rate": 5.973482444182475e-05, "loss": 2.4769, "step": 433 }, { "epoch": 0.44184270806821074, "grad_norm": 10.457250595092773, "learning_rate": 5.9577234933559764e-05, "loss": 3.03, "step": 434 }, { "epoch": 0.4428607788241283, "grad_norm": 11.054841995239258, "learning_rate": 5.941954660143703e-05, "loss": 2.6938, "step": 435 }, { "epoch": 0.4438788495800458, "grad_norm": 8.820106506347656, "learning_rate": 5.9261761072582655e-05, "loss": 2.6282, "step": 436 }, { "epoch": 0.44489692033596334, "grad_norm": 9.201048851013184, "learning_rate": 5.910387997512573e-05, "loss": 3.0258, "step": 437 }, { "epoch": 0.4459149910918809, "grad_norm": 8.373756408691406, "learning_rate": 5.8945904938181484e-05, "loss": 2.0512, "step": 438 }, { "epoch": 0.4469330618477984, "grad_norm": 7.493674278259277, "learning_rate": 5.878783759183442e-05, "loss": 2.3767, "step": 439 }, { "epoch": 0.44795113260371594, "grad_norm": 8.552105903625488, "learning_rate": 5.86296795671216e-05, "loss": 2.4706, "step": 440 }, { "epoch": 0.4489692033596335, "grad_norm": 7.708653450012207, "learning_rate": 5.847143249601574e-05, "loss": 2.2514, "step": 441 }, { "epoch": 0.449987274115551, "grad_norm": 9.070602416992188, "learning_rate": 5.8313098011408406e-05, "loss": 2.322, "step": 442 }, { "epoch": 0.4510053448714686, "grad_norm": 7.668588638305664, "learning_rate": 5.8154677747093134e-05, "loss": 1.8555, "step": 443 }, { "epoch": 0.4520234156273861, "grad_norm": 9.315807342529297, "learning_rate": 5.7996173337748606e-05, "loss": 2.1384, "step": 444 }, { "epoch": 0.45304148638330366, "grad_norm": 10.211068153381348, "learning_rate": 5.783758641892172e-05, "loss": 2.8774, "step": 445 }, { "epoch": 0.4540595571392212, "grad_norm": 9.461262702941895, "learning_rate": 5.767891862701082e-05, "loss": 2.5156, "step": 446 }, { "epoch": 0.4550776278951387, "grad_norm": 9.720065116882324, "learning_rate": 5.7520171599248704e-05, "loss": 2.6157, "step": 447 }, { "epoch": 0.45609569865105626, "grad_norm": 8.700965881347656, "learning_rate": 5.7361346973685794e-05, "loss": 2.2904, "step": 448 }, { "epoch": 0.4571137694069738, "grad_norm": 7.846927642822266, "learning_rate": 5.7202446389173223e-05, "loss": 1.968, "step": 449 }, { "epoch": 0.4581318401628913, "grad_norm": 11.256888389587402, "learning_rate": 5.704347148534589e-05, "loss": 2.0931, "step": 450 }, { "epoch": 0.45914991091880886, "grad_norm": 8.149317741394043, "learning_rate": 5.688442390260559e-05, "loss": 3.8319, "step": 451 }, { "epoch": 0.4601679816747264, "grad_norm": 10.932345390319824, "learning_rate": 5.672530528210405e-05, "loss": 3.0198, "step": 452 }, { "epoch": 0.4611860524306439, "grad_norm": 9.099678993225098, "learning_rate": 5.6566117265726006e-05, "loss": 3.3294, "step": 453 }, { "epoch": 0.46220412318656146, "grad_norm": 11.72851848602295, "learning_rate": 5.640686149607228e-05, "loss": 4.2732, "step": 454 }, { "epoch": 0.463222193942479, "grad_norm": 11.902469635009766, "learning_rate": 5.624753961644281e-05, "loss": 3.6108, "step": 455 }, { "epoch": 0.4642402646983965, "grad_norm": 10.879993438720703, "learning_rate": 5.608815327081969e-05, "loss": 2.6874, "step": 456 }, { "epoch": 0.46525833545431405, "grad_norm": 14.943718910217285, "learning_rate": 5.5928704103850206e-05, "loss": 4.6303, "step": 457 }, { "epoch": 0.4662764062102316, "grad_norm": 10.216432571411133, "learning_rate": 5.57691937608299e-05, "loss": 2.4846, "step": 458 }, { "epoch": 0.4672944769661492, "grad_norm": 8.506292343139648, "learning_rate": 5.5609623887685535e-05, "loss": 2.476, "step": 459 }, { "epoch": 0.4683125477220667, "grad_norm": 8.514628410339355, "learning_rate": 5.544999613095818e-05, "loss": 2.377, "step": 460 }, { "epoch": 0.46933061847798424, "grad_norm": 5.901019096374512, "learning_rate": 5.5290312137786146e-05, "loss": 1.5461, "step": 461 }, { "epoch": 0.4703486892339018, "grad_norm": 5.75610876083374, "learning_rate": 5.513057355588804e-05, "loss": 1.4872, "step": 462 }, { "epoch": 0.4713667599898193, "grad_norm": 5.7381272315979, "learning_rate": 5.4970782033545774e-05, "loss": 1.4357, "step": 463 }, { "epoch": 0.47238483074573684, "grad_norm": 7.901957035064697, "learning_rate": 5.4810939219587485e-05, "loss": 1.8938, "step": 464 }, { "epoch": 0.47340290150165437, "grad_norm": 5.663755893707275, "learning_rate": 5.465104676337062e-05, "loss": 1.2289, "step": 465 }, { "epoch": 0.4744209722575719, "grad_norm": 7.933018684387207, "learning_rate": 5.44911063147648e-05, "loss": 2.2704, "step": 466 }, { "epoch": 0.47543904301348944, "grad_norm": 7.943240165710449, "learning_rate": 5.433111952413495e-05, "loss": 2.3279, "step": 467 }, { "epoch": 0.47645711376940697, "grad_norm": 6.650790691375732, "learning_rate": 5.417108804232409e-05, "loss": 2.2172, "step": 468 }, { "epoch": 0.4774751845253245, "grad_norm": 7.444812774658203, "learning_rate": 5.401101352063647e-05, "loss": 2.2142, "step": 469 }, { "epoch": 0.47849325528124204, "grad_norm": 7.4753642082214355, "learning_rate": 5.3850897610820396e-05, "loss": 1.757, "step": 470 }, { "epoch": 0.47951132603715957, "grad_norm": 5.919788360595703, "learning_rate": 5.369074196505125e-05, "loss": 1.6481, "step": 471 }, { "epoch": 0.4805293967930771, "grad_norm": 8.422210693359375, "learning_rate": 5.3530548235914454e-05, "loss": 2.157, "step": 472 }, { "epoch": 0.48154746754899463, "grad_norm": 6.410617351531982, "learning_rate": 5.33703180763884e-05, "loss": 1.8347, "step": 473 }, { "epoch": 0.48256553830491217, "grad_norm": 6.645679473876953, "learning_rate": 5.321005313982738e-05, "loss": 1.9199, "step": 474 }, { "epoch": 0.4835836090608297, "grad_norm": 8.815511703491211, "learning_rate": 5.3049755079944527e-05, "loss": 1.9625, "step": 475 }, { "epoch": 0.4846016798167473, "grad_norm": 6.998859882354736, "learning_rate": 5.288942555079479e-05, "loss": 1.9826, "step": 476 }, { "epoch": 0.4856197505726648, "grad_norm": 9.812437057495117, "learning_rate": 5.272906620675779e-05, "loss": 2.8627, "step": 477 }, { "epoch": 0.48663782132858235, "grad_norm": 6.514684677124023, "learning_rate": 5.256867870252087e-05, "loss": 1.8711, "step": 478 }, { "epoch": 0.4876558920844999, "grad_norm": 8.101947784423828, "learning_rate": 5.240826469306187e-05, "loss": 2.2444, "step": 479 }, { "epoch": 0.4886739628404174, "grad_norm": 7.826203346252441, "learning_rate": 5.224782583363215e-05, "loss": 2.3479, "step": 480 }, { "epoch": 0.48969203359633495, "grad_norm": 7.306244850158691, "learning_rate": 5.208736377973954e-05, "loss": 2.2877, "step": 481 }, { "epoch": 0.4907101043522525, "grad_norm": 10.923294067382812, "learning_rate": 5.192688018713113e-05, "loss": 3.4528, "step": 482 }, { "epoch": 0.49172817510817, "grad_norm": 7.382264614105225, "learning_rate": 5.176637671177631e-05, "loss": 2.1969, "step": 483 }, { "epoch": 0.49274624586408755, "grad_norm": 10.03930377960205, "learning_rate": 5.1605855009849614e-05, "loss": 3.5883, "step": 484 }, { "epoch": 0.4937643166200051, "grad_norm": 7.927920818328857, "learning_rate": 5.144531673771363e-05, "loss": 2.3655, "step": 485 }, { "epoch": 0.4947823873759226, "grad_norm": 8.499062538146973, "learning_rate": 5.1284763551901995e-05, "loss": 2.5173, "step": 486 }, { "epoch": 0.49580045813184015, "grad_norm": 6.909058094024658, "learning_rate": 5.112419710910213e-05, "loss": 2.0323, "step": 487 }, { "epoch": 0.4968185288877577, "grad_norm": 8.88456916809082, "learning_rate": 5.096361906613836e-05, "loss": 2.6987, "step": 488 }, { "epoch": 0.4978365996436752, "grad_norm": 7.5175580978393555, "learning_rate": 5.080303107995461e-05, "loss": 2.1691, "step": 489 }, { "epoch": 0.49885467039959275, "grad_norm": 7.615448951721191, "learning_rate": 5.064243480759748e-05, "loss": 2.0365, "step": 490 }, { "epoch": 0.4998727411555103, "grad_norm": 9.251805305480957, "learning_rate": 5.048183190619904e-05, "loss": 2.2146, "step": 491 }, { "epoch": 0.5008908119114278, "grad_norm": 7.876194477081299, "learning_rate": 5.032122403295977e-05, "loss": 2.3439, "step": 492 }, { "epoch": 0.5019088826673453, "grad_norm": 8.969354629516602, "learning_rate": 5.0160612845131414e-05, "loss": 2.5224, "step": 493 }, { "epoch": 0.5029269534232629, "grad_norm": 7.22929573059082, "learning_rate": 5e-05, "loss": 1.6864, "step": 494 }, { "epoch": 0.5039450241791804, "grad_norm": 7.851796627044678, "learning_rate": 4.9839387154868584e-05, "loss": 2.0388, "step": 495 }, { "epoch": 0.504963094935098, "grad_norm": 9.29887580871582, "learning_rate": 4.967877596704025e-05, "loss": 2.6501, "step": 496 }, { "epoch": 0.5059811656910155, "grad_norm": 8.79398250579834, "learning_rate": 4.951816809380097e-05, "loss": 2.5759, "step": 497 }, { "epoch": 0.506999236446933, "grad_norm": 10.286650657653809, "learning_rate": 4.9357565192402525e-05, "loss": 3.1672, "step": 498 }, { "epoch": 0.5080173072028505, "grad_norm": 8.799942016601562, "learning_rate": 4.919696892004539e-05, "loss": 1.9062, "step": 499 }, { "epoch": 0.5090353779587682, "grad_norm": 8.252535820007324, "learning_rate": 4.903638093386167e-05, "loss": 1.6945, "step": 500 }, { "epoch": 0.5100534487146857, "grad_norm": 5.307441234588623, "learning_rate": 4.887580289089787e-05, "loss": 1.7627, "step": 501 }, { "epoch": 0.5110715194706033, "grad_norm": 7.124861717224121, "learning_rate": 4.8715236448098016e-05, "loss": 2.5286, "step": 502 }, { "epoch": 0.5120895902265208, "grad_norm": 10.368796348571777, "learning_rate": 4.855468326228638e-05, "loss": 3.6972, "step": 503 }, { "epoch": 0.5131076609824383, "grad_norm": 10.741311073303223, "learning_rate": 4.8394144990150404e-05, "loss": 4.1789, "step": 504 }, { "epoch": 0.5141257317383559, "grad_norm": 10.585883140563965, "learning_rate": 4.8233623288223704e-05, "loss": 3.5806, "step": 505 }, { "epoch": 0.5151438024942734, "grad_norm": 13.6077880859375, "learning_rate": 4.807311981286888e-05, "loss": 3.5391, "step": 506 }, { "epoch": 0.5161618732501909, "grad_norm": 16.34162712097168, "learning_rate": 4.7912636220260473e-05, "loss": 3.6142, "step": 507 }, { "epoch": 0.5171799440061084, "grad_norm": 11.734464645385742, "learning_rate": 4.775217416636786e-05, "loss": 2.6898, "step": 508 }, { "epoch": 0.518198014762026, "grad_norm": 7.3565287590026855, "learning_rate": 4.759173530693814e-05, "loss": 2.0978, "step": 509 }, { "epoch": 0.5192160855179435, "grad_norm": 5.852792739868164, "learning_rate": 4.7431321297479135e-05, "loss": 1.4297, "step": 510 }, { "epoch": 0.520234156273861, "grad_norm": 6.781408786773682, "learning_rate": 4.727093379324222e-05, "loss": 1.5329, "step": 511 }, { "epoch": 0.5212522270297786, "grad_norm": 11.24429702758789, "learning_rate": 4.711057444920522e-05, "loss": 1.6744, "step": 512 }, { "epoch": 0.5222702977856961, "grad_norm": 7.679388046264648, "learning_rate": 4.695024492005548e-05, "loss": 2.2356, "step": 513 }, { "epoch": 0.5232883685416136, "grad_norm": 5.93134069442749, "learning_rate": 4.6789946860172634e-05, "loss": 1.2665, "step": 514 }, { "epoch": 0.5243064392975312, "grad_norm": 6.789477348327637, "learning_rate": 4.6629681923611603e-05, "loss": 1.6466, "step": 515 }, { "epoch": 0.5253245100534487, "grad_norm": 8.315037727355957, "learning_rate": 4.646945176408555e-05, "loss": 2.065, "step": 516 }, { "epoch": 0.5263425808093662, "grad_norm": 5.443754196166992, "learning_rate": 4.630925803494877e-05, "loss": 1.0138, "step": 517 }, { "epoch": 0.5273606515652838, "grad_norm": 6.596680641174316, "learning_rate": 4.6149102389179635e-05, "loss": 1.8229, "step": 518 }, { "epoch": 0.5283787223212013, "grad_norm": 5.735509872436523, "learning_rate": 4.598898647936354e-05, "loss": 1.5016, "step": 519 }, { "epoch": 0.5293967930771188, "grad_norm": 7.154899597167969, "learning_rate": 4.58289119576759e-05, "loss": 1.7249, "step": 520 }, { "epoch": 0.5304148638330364, "grad_norm": 5.887238502502441, "learning_rate": 4.566888047586507e-05, "loss": 1.3531, "step": 521 }, { "epoch": 0.5314329345889539, "grad_norm": 7.944952964782715, "learning_rate": 4.55088936852352e-05, "loss": 2.0604, "step": 522 }, { "epoch": 0.5324510053448714, "grad_norm": 8.235894203186035, "learning_rate": 4.5348953236629395e-05, "loss": 1.782, "step": 523 }, { "epoch": 0.533469076100789, "grad_norm": 9.824324607849121, "learning_rate": 4.518906078041252e-05, "loss": 3.1078, "step": 524 }, { "epoch": 0.5344871468567065, "grad_norm": 8.053499221801758, "learning_rate": 4.502921796645424e-05, "loss": 2.5225, "step": 525 }, { "epoch": 0.535505217612624, "grad_norm": 9.53549861907959, "learning_rate": 4.486942644411197e-05, "loss": 3.0847, "step": 526 }, { "epoch": 0.5365232883685416, "grad_norm": 8.427640914916992, "learning_rate": 4.4709687862213866e-05, "loss": 2.1704, "step": 527 }, { "epoch": 0.5375413591244591, "grad_norm": 7.989354610443115, "learning_rate": 4.4550003869041845e-05, "loss": 2.3719, "step": 528 }, { "epoch": 0.5385594298803766, "grad_norm": 7.53865909576416, "learning_rate": 4.439037611231448e-05, "loss": 2.4358, "step": 529 }, { "epoch": 0.5395775006362942, "grad_norm": 9.102818489074707, "learning_rate": 4.423080623917012e-05, "loss": 3.0774, "step": 530 }, { "epoch": 0.5405955713922117, "grad_norm": 10.17009162902832, "learning_rate": 4.407129589614979e-05, "loss": 2.719, "step": 531 }, { "epoch": 0.5416136421481293, "grad_norm": 8.132767677307129, "learning_rate": 4.3911846729180335e-05, "loss": 2.6276, "step": 532 }, { "epoch": 0.5426317129040469, "grad_norm": 8.669943809509277, "learning_rate": 4.3752460383557195e-05, "loss": 2.2211, "step": 533 }, { "epoch": 0.5436497836599644, "grad_norm": 8.190427780151367, "learning_rate": 4.359313850392772e-05, "loss": 2.2451, "step": 534 }, { "epoch": 0.544667854415882, "grad_norm": 7.185608386993408, "learning_rate": 4.3433882734274e-05, "loss": 1.938, "step": 535 }, { "epoch": 0.5456859251717995, "grad_norm": 9.735365867614746, "learning_rate": 4.327469471789597e-05, "loss": 3.3738, "step": 536 }, { "epoch": 0.546703995927717, "grad_norm": 9.06591796875, "learning_rate": 4.311557609739442e-05, "loss": 3.4894, "step": 537 }, { "epoch": 0.5477220666836345, "grad_norm": 8.038829803466797, "learning_rate": 4.295652851465412e-05, "loss": 2.6487, "step": 538 }, { "epoch": 0.5487401374395521, "grad_norm": 7.375051498413086, "learning_rate": 4.27975536108268e-05, "loss": 2.4853, "step": 539 }, { "epoch": 0.5497582081954696, "grad_norm": 9.910839080810547, "learning_rate": 4.2638653026314224e-05, "loss": 3.1606, "step": 540 }, { "epoch": 0.5507762789513871, "grad_norm": 7.77678918838501, "learning_rate": 4.24798284007513e-05, "loss": 2.33, "step": 541 }, { "epoch": 0.5517943497073047, "grad_norm": 7.377612113952637, "learning_rate": 4.232108137298919e-05, "loss": 2.299, "step": 542 }, { "epoch": 0.5528124204632222, "grad_norm": 9.510624885559082, "learning_rate": 4.216241358107831e-05, "loss": 2.8467, "step": 543 }, { "epoch": 0.5538304912191397, "grad_norm": 6.834048748016357, "learning_rate": 4.200382666225141e-05, "loss": 2.0166, "step": 544 }, { "epoch": 0.5548485619750573, "grad_norm": 8.245951652526855, "learning_rate": 4.1845322252906864e-05, "loss": 2.672, "step": 545 }, { "epoch": 0.5558666327309748, "grad_norm": 7.539649963378906, "learning_rate": 4.16869019885916e-05, "loss": 2.3618, "step": 546 }, { "epoch": 0.5568847034868923, "grad_norm": 7.983175754547119, "learning_rate": 4.152856750398426e-05, "loss": 2.2049, "step": 547 }, { "epoch": 0.5579027742428099, "grad_norm": 8.641951560974121, "learning_rate": 4.1370320432878404e-05, "loss": 2.2235, "step": 548 }, { "epoch": 0.5589208449987274, "grad_norm": 7.9181437492370605, "learning_rate": 4.1212162408165595e-05, "loss": 1.9295, "step": 549 }, { "epoch": 0.5599389157546449, "grad_norm": 10.45153522491455, "learning_rate": 4.105409506181854e-05, "loss": 2.1553, "step": 550 }, { "epoch": 0.5609569865105625, "grad_norm": 6.951171398162842, "learning_rate": 4.0896120024874286e-05, "loss": 2.5913, "step": 551 }, { "epoch": 0.56197505726648, "grad_norm": 9.614657402038574, "learning_rate": 4.073823892741735e-05, "loss": 4.2435, "step": 552 }, { "epoch": 0.5629931280223975, "grad_norm": 9.35623550415039, "learning_rate": 4.0580453398563e-05, "loss": 3.7123, "step": 553 }, { "epoch": 0.5640111987783151, "grad_norm": 10.756424903869629, "learning_rate": 4.042276506644024e-05, "loss": 3.6713, "step": 554 }, { "epoch": 0.5650292695342326, "grad_norm": 9.823023796081543, "learning_rate": 4.0265175558175265e-05, "loss": 3.7602, "step": 555 }, { "epoch": 0.5660473402901501, "grad_norm": 13.360715866088867, "learning_rate": 4.0107686499874465e-05, "loss": 3.269, "step": 556 }, { "epoch": 0.5670654110460677, "grad_norm": 14.194052696228027, "learning_rate": 3.9950299516607766e-05, "loss": 4.3906, "step": 557 }, { "epoch": 0.5680834818019852, "grad_norm": 16.591251373291016, "learning_rate": 3.979301623239177e-05, "loss": 4.4802, "step": 558 }, { "epoch": 0.5691015525579027, "grad_norm": 6.6096720695495605, "learning_rate": 3.9635838270173107e-05, "loss": 1.6842, "step": 559 }, { "epoch": 0.5701196233138203, "grad_norm": 6.252510070800781, "learning_rate": 3.94787672518116e-05, "loss": 1.6248, "step": 560 }, { "epoch": 0.5711376940697378, "grad_norm": 7.445550441741943, "learning_rate": 3.9321804798063565e-05, "loss": 1.7234, "step": 561 }, { "epoch": 0.5721557648256553, "grad_norm": 5.321173191070557, "learning_rate": 3.9164952528565057e-05, "loss": 1.2454, "step": 562 }, { "epoch": 0.5731738355815729, "grad_norm": 4.566540241241455, "learning_rate": 3.900821206181521e-05, "loss": 1.0588, "step": 563 }, { "epoch": 0.5741919063374905, "grad_norm": 8.349088668823242, "learning_rate": 3.8851585015159536e-05, "loss": 1.5751, "step": 564 }, { "epoch": 0.575209977093408, "grad_norm": 6.328129291534424, "learning_rate": 3.8695073004773106e-05, "loss": 2.025, "step": 565 }, { "epoch": 0.5762280478493256, "grad_norm": 8.211170196533203, "learning_rate": 3.8538677645644096e-05, "loss": 1.1548, "step": 566 }, { "epoch": 0.5772461186052431, "grad_norm": 5.518578052520752, "learning_rate": 3.838240055155692e-05, "loss": 1.2809, "step": 567 }, { "epoch": 0.5782641893611606, "grad_norm": 6.383520603179932, "learning_rate": 3.822624333507571e-05, "loss": 1.8485, "step": 568 }, { "epoch": 0.5792822601170782, "grad_norm": 5.425829887390137, "learning_rate": 3.8070207607527584e-05, "loss": 1.4567, "step": 569 }, { "epoch": 0.5803003308729957, "grad_norm": 8.478185653686523, "learning_rate": 3.791429497898608e-05, "loss": 2.0052, "step": 570 }, { "epoch": 0.5813184016289132, "grad_norm": 8.863068580627441, "learning_rate": 3.775850705825454e-05, "loss": 2.2554, "step": 571 }, { "epoch": 0.5823364723848308, "grad_norm": 5.8295183181762695, "learning_rate": 3.7602845452849463e-05, "loss": 1.2544, "step": 572 }, { "epoch": 0.5833545431407483, "grad_norm": 8.446788787841797, "learning_rate": 3.7447311768983964e-05, "loss": 2.4702, "step": 573 }, { "epoch": 0.5843726138966658, "grad_norm": 7.7443766593933105, "learning_rate": 3.7291907611551195e-05, "loss": 2.0707, "step": 574 }, { "epoch": 0.5853906846525834, "grad_norm": 8.347147941589355, "learning_rate": 3.713663458410779e-05, "loss": 1.659, "step": 575 }, { "epoch": 0.5864087554085009, "grad_norm": 7.487883567810059, "learning_rate": 3.69814942888572e-05, "loss": 2.0328, "step": 576 }, { "epoch": 0.5874268261644184, "grad_norm": 7.8575286865234375, "learning_rate": 3.682648832663339e-05, "loss": 1.8928, "step": 577 }, { "epoch": 0.588444896920336, "grad_norm": 8.947505950927734, "learning_rate": 3.6671618296884146e-05, "loss": 1.6774, "step": 578 }, { "epoch": 0.5894629676762535, "grad_norm": 5.097304821014404, "learning_rate": 3.6516885797654594e-05, "loss": 1.3306, "step": 579 }, { "epoch": 0.590481038432171, "grad_norm": 6.418907642364502, "learning_rate": 3.636229242557075e-05, "loss": 1.9186, "step": 580 }, { "epoch": 0.5914991091880886, "grad_norm": 7.3138346672058105, "learning_rate": 3.620783977582305e-05, "loss": 2.4993, "step": 581 }, { "epoch": 0.5925171799440061, "grad_norm": 7.914095878601074, "learning_rate": 3.605352944214986e-05, "loss": 2.078, "step": 582 }, { "epoch": 0.5935352506999236, "grad_norm": 10.451981544494629, "learning_rate": 3.5899363016821e-05, "loss": 2.5348, "step": 583 }, { "epoch": 0.5945533214558412, "grad_norm": 6.191624164581299, "learning_rate": 3.5745342090621405e-05, "loss": 1.6607, "step": 584 }, { "epoch": 0.5955713922117587, "grad_norm": 7.947683811187744, "learning_rate": 3.559146825283465e-05, "loss": 2.4664, "step": 585 }, { "epoch": 0.5965894629676762, "grad_norm": 7.410199165344238, "learning_rate": 3.5437743091226565e-05, "loss": 2.0212, "step": 586 }, { "epoch": 0.5976075337235938, "grad_norm": 8.705409049987793, "learning_rate": 3.528416819202881e-05, "loss": 2.2274, "step": 587 }, { "epoch": 0.5986256044795113, "grad_norm": 7.487548351287842, "learning_rate": 3.5130745139922574e-05, "loss": 2.104, "step": 588 }, { "epoch": 0.5996436752354288, "grad_norm": 8.788456916809082, "learning_rate": 3.497747551802221e-05, "loss": 2.5106, "step": 589 }, { "epoch": 0.6006617459913464, "grad_norm": 7.41387939453125, "learning_rate": 3.482436090785882e-05, "loss": 2.1219, "step": 590 }, { "epoch": 0.6016798167472639, "grad_norm": 6.481340408325195, "learning_rate": 3.467140288936407e-05, "loss": 1.9451, "step": 591 }, { "epoch": 0.6026978875031814, "grad_norm": 7.278069496154785, "learning_rate": 3.451860304085378e-05, "loss": 1.8661, "step": 592 }, { "epoch": 0.603715958259099, "grad_norm": 8.016121864318848, "learning_rate": 3.43659629390117e-05, "loss": 1.9884, "step": 593 }, { "epoch": 0.6047340290150165, "grad_norm": 8.917866706848145, "learning_rate": 3.421348415887315e-05, "loss": 2.6266, "step": 594 }, { "epoch": 0.605752099770934, "grad_norm": 9.271273612976074, "learning_rate": 3.406116827380889e-05, "loss": 2.6668, "step": 595 }, { "epoch": 0.6067701705268517, "grad_norm": 7.660860061645508, "learning_rate": 3.390901685550887e-05, "loss": 2.373, "step": 596 }, { "epoch": 0.6077882412827692, "grad_norm": 7.496829032897949, "learning_rate": 3.375703147396583e-05, "loss": 2.137, "step": 597 }, { "epoch": 0.6088063120386867, "grad_norm": 10.63588809967041, "learning_rate": 3.360521369745937e-05, "loss": 2.0113, "step": 598 }, { "epoch": 0.6098243827946043, "grad_norm": 8.661003112792969, "learning_rate": 3.345356509253959e-05, "loss": 2.202, "step": 599 }, { "epoch": 0.6108424535505218, "grad_norm": 6.928518295288086, "learning_rate": 3.330208722401097e-05, "loss": 1.6603, "step": 600 }, { "epoch": 0.6118605243064393, "grad_norm": 5.956086158752441, "learning_rate": 3.315078165491622e-05, "loss": 2.2319, "step": 601 }, { "epoch": 0.6128785950623569, "grad_norm": 9.131757736206055, "learning_rate": 3.2999649946520174e-05, "loss": 3.3601, "step": 602 }, { "epoch": 0.6138966658182744, "grad_norm": 8.110289573669434, "learning_rate": 3.2848693658293675e-05, "loss": 2.8758, "step": 603 }, { "epoch": 0.6149147365741919, "grad_norm": 11.287444114685059, "learning_rate": 3.2697914347897406e-05, "loss": 4.129, "step": 604 }, { "epoch": 0.6159328073301095, "grad_norm": 10.69924259185791, "learning_rate": 3.254731357116597e-05, "loss": 4.2776, "step": 605 }, { "epoch": 0.616950878086027, "grad_norm": 9.89280891418457, "learning_rate": 3.239689288209168e-05, "loss": 3.1346, "step": 606 }, { "epoch": 0.6179689488419445, "grad_norm": 11.832335472106934, "learning_rate": 3.224665383280867e-05, "loss": 3.4148, "step": 607 }, { "epoch": 0.6189870195978621, "grad_norm": 13.277129173278809, "learning_rate": 3.2096597973576694e-05, "loss": 3.4906, "step": 608 }, { "epoch": 0.6200050903537796, "grad_norm": 6.8787994384765625, "learning_rate": 3.194672685276532e-05, "loss": 1.4383, "step": 609 }, { "epoch": 0.6210231611096971, "grad_norm": 5.783747673034668, "learning_rate": 3.179704201683786e-05, "loss": 1.3518, "step": 610 }, { "epoch": 0.6220412318656147, "grad_norm": 5.462782859802246, "learning_rate": 3.16475450103354e-05, "loss": 1.249, "step": 611 }, { "epoch": 0.6230593026215322, "grad_norm": 5.050539016723633, "learning_rate": 3.1498237375860886e-05, "loss": 1.1348, "step": 612 }, { "epoch": 0.6240773733774497, "grad_norm": 8.341720581054688, "learning_rate": 3.1349120654063225e-05, "loss": 1.7345, "step": 613 }, { "epoch": 0.6250954441333673, "grad_norm": 4.832444190979004, "learning_rate": 3.120019638362136e-05, "loss": 1.0501, "step": 614 }, { "epoch": 0.6261135148892848, "grad_norm": 7.373495578765869, "learning_rate": 3.1051466101228385e-05, "loss": 1.7428, "step": 615 }, { "epoch": 0.6271315856452023, "grad_norm": 5.6345319747924805, "learning_rate": 3.090293134157572e-05, "loss": 1.2435, "step": 616 }, { "epoch": 0.6281496564011199, "grad_norm": 6.5224609375, "learning_rate": 3.0754593637337276e-05, "loss": 1.4176, "step": 617 }, { "epoch": 0.6291677271570374, "grad_norm": 8.80791187286377, "learning_rate": 3.06064545191536e-05, "loss": 2.4285, "step": 618 }, { "epoch": 0.6301857979129549, "grad_norm": 9.331201553344727, "learning_rate": 3.0458515515616115e-05, "loss": 2.7192, "step": 619 }, { "epoch": 0.6312038686688725, "grad_norm": 9.033586502075195, "learning_rate": 3.0310778153251324e-05, "loss": 1.8652, "step": 620 }, { "epoch": 0.63222193942479, "grad_norm": 6.689144134521484, "learning_rate": 3.0163243956505095e-05, "loss": 1.5773, "step": 621 }, { "epoch": 0.6332400101807075, "grad_norm": 8.037043571472168, "learning_rate": 3.0015914447726867e-05, "loss": 2.3296, "step": 622 }, { "epoch": 0.6342580809366251, "grad_norm": 7.927774906158447, "learning_rate": 2.986879114715403e-05, "loss": 2.2707, "step": 623 }, { "epoch": 0.6352761516925426, "grad_norm": 5.514461994171143, "learning_rate": 2.9721875572896157e-05, "loss": 1.7974, "step": 624 }, { "epoch": 0.6362942224484601, "grad_norm": 7.439801216125488, "learning_rate": 2.95751692409194e-05, "loss": 2.1823, "step": 625 }, { "epoch": 0.6373122932043777, "grad_norm": 7.419183731079102, "learning_rate": 2.942867366503077e-05, "loss": 2.1965, "step": 626 }, { "epoch": 0.6383303639602952, "grad_norm": 5.545042037963867, "learning_rate": 2.9282390356862606e-05, "loss": 1.4957, "step": 627 }, { "epoch": 0.6393484347162128, "grad_norm": 11.62447738647461, "learning_rate": 2.9136320825856967e-05, "loss": 3.3109, "step": 628 }, { "epoch": 0.6403665054721304, "grad_norm": 8.367134094238281, "learning_rate": 2.899046657924992e-05, "loss": 2.2194, "step": 629 }, { "epoch": 0.6413845762280479, "grad_norm": 10.391725540161133, "learning_rate": 2.884482912205621e-05, "loss": 2.0195, "step": 630 }, { "epoch": 0.6424026469839654, "grad_norm": 8.217406272888184, "learning_rate": 2.8699409957053535e-05, "loss": 2.4132, "step": 631 }, { "epoch": 0.643420717739883, "grad_norm": 8.29297161102295, "learning_rate": 2.855421058476719e-05, "loss": 2.4454, "step": 632 }, { "epoch": 0.6444387884958005, "grad_norm": 8.815670013427734, "learning_rate": 2.840923250345442e-05, "loss": 2.5413, "step": 633 }, { "epoch": 0.645456859251718, "grad_norm": 8.5559720993042, "learning_rate": 2.8264477209089145e-05, "loss": 2.7664, "step": 634 }, { "epoch": 0.6464749300076356, "grad_norm": 8.682782173156738, "learning_rate": 2.8119946195346375e-05, "loss": 2.5312, "step": 635 }, { "epoch": 0.6474930007635531, "grad_norm": 11.519887924194336, "learning_rate": 2.7975640953586846e-05, "loss": 2.9688, "step": 636 }, { "epoch": 0.6485110715194706, "grad_norm": 8.966607093811035, "learning_rate": 2.7831562972841696e-05, "loss": 2.7022, "step": 637 }, { "epoch": 0.6495291422753882, "grad_norm": 8.183965682983398, "learning_rate": 2.768771373979697e-05, "loss": 2.3317, "step": 638 }, { "epoch": 0.6505472130313057, "grad_norm": 8.993667602539062, "learning_rate": 2.7544094738778436e-05, "loss": 2.7296, "step": 639 }, { "epoch": 0.6515652837872232, "grad_norm": 7.731354713439941, "learning_rate": 2.74007074517361e-05, "loss": 1.9501, "step": 640 }, { "epoch": 0.6525833545431408, "grad_norm": 6.967146396636963, "learning_rate": 2.7257553358229034e-05, "loss": 1.8838, "step": 641 }, { "epoch": 0.6536014252990583, "grad_norm": 6.557554244995117, "learning_rate": 2.7114633935410085e-05, "loss": 1.7431, "step": 642 }, { "epoch": 0.6546194960549758, "grad_norm": 10.207218170166016, "learning_rate": 2.6971950658010666e-05, "loss": 2.4966, "step": 643 }, { "epoch": 0.6556375668108934, "grad_norm": 7.477417469024658, "learning_rate": 2.682950499832535e-05, "loss": 2.1944, "step": 644 }, { "epoch": 0.6566556375668109, "grad_norm": 10.127610206604004, "learning_rate": 2.6687298426196973e-05, "loss": 2.6473, "step": 645 }, { "epoch": 0.6576737083227284, "grad_norm": 6.374731540679932, "learning_rate": 2.6545332409001265e-05, "loss": 1.8528, "step": 646 }, { "epoch": 0.658691779078646, "grad_norm": 6.7048444747924805, "learning_rate": 2.6403608411631742e-05, "loss": 1.7493, "step": 647 }, { "epoch": 0.6597098498345635, "grad_norm": 7.112037181854248, "learning_rate": 2.6262127896484602e-05, "loss": 2.0421, "step": 648 }, { "epoch": 0.660727920590481, "grad_norm": 8.483193397521973, "learning_rate": 2.612089232344371e-05, "loss": 1.91, "step": 649 }, { "epoch": 0.6617459913463986, "grad_norm": 10.052485466003418, "learning_rate": 2.5979903149865387e-05, "loss": 2.0998, "step": 650 }, { "epoch": 0.6627640621023161, "grad_norm": 8.01032543182373, "learning_rate": 2.5839161830563474e-05, "loss": 2.5145, "step": 651 }, { "epoch": 0.6637821328582336, "grad_norm": 9.746928215026855, "learning_rate": 2.569866981779433e-05, "loss": 3.3683, "step": 652 }, { "epoch": 0.6648002036141512, "grad_norm": 8.607123374938965, "learning_rate": 2.555842856124182e-05, "loss": 2.9144, "step": 653 }, { "epoch": 0.6658182743700687, "grad_norm": 10.463346481323242, "learning_rate": 2.5418439508002258e-05, "loss": 3.9062, "step": 654 }, { "epoch": 0.6668363451259862, "grad_norm": 9.336942672729492, "learning_rate": 2.5278704102569662e-05, "loss": 3.3966, "step": 655 }, { "epoch": 0.6678544158819038, "grad_norm": 10.415209770202637, "learning_rate": 2.5139223786820747e-05, "loss": 3.7271, "step": 656 }, { "epoch": 0.6688724866378213, "grad_norm": 14.631210327148438, "learning_rate": 2.500000000000001e-05, "loss": 3.7071, "step": 657 }, { "epoch": 0.6698905573937388, "grad_norm": 13.001562118530273, "learning_rate": 2.486103417870493e-05, "loss": 3.214, "step": 658 }, { "epoch": 0.6709086281496563, "grad_norm": 11.307893753051758, "learning_rate": 2.472232775687119e-05, "loss": 2.8893, "step": 659 }, { "epoch": 0.6719266989055739, "grad_norm": 7.8647379875183105, "learning_rate": 2.4583882165757766e-05, "loss": 2.0442, "step": 660 }, { "epoch": 0.6729447696614915, "grad_norm": 5.790807247161865, "learning_rate": 2.4445698833932234e-05, "loss": 1.3228, "step": 661 }, { "epoch": 0.6739628404174091, "grad_norm": 5.694929599761963, "learning_rate": 2.4307779187256064e-05, "loss": 1.3618, "step": 662 }, { "epoch": 0.6749809111733266, "grad_norm": 5.114007949829102, "learning_rate": 2.417012464886978e-05, "loss": 1.2137, "step": 663 }, { "epoch": 0.6759989819292441, "grad_norm": 7.429940223693848, "learning_rate": 2.4032736639178444e-05, "loss": 1.8593, "step": 664 }, { "epoch": 0.6770170526851617, "grad_norm": 5.101173400878906, "learning_rate": 2.389561657583681e-05, "loss": 0.9669, "step": 665 }, { "epoch": 0.6780351234410792, "grad_norm": 7.89351224899292, "learning_rate": 2.3758765873734896e-05, "loss": 1.8615, "step": 666 }, { "epoch": 0.6790531941969967, "grad_norm": 7.043496608734131, "learning_rate": 2.3622185944983188e-05, "loss": 1.7828, "step": 667 }, { "epoch": 0.6800712649529143, "grad_norm": 7.9154510498046875, "learning_rate": 2.3485878198898252e-05, "loss": 2.2469, "step": 668 }, { "epoch": 0.6810893357088318, "grad_norm": 6.627047061920166, "learning_rate": 2.3349844041988045e-05, "loss": 1.5789, "step": 669 }, { "epoch": 0.6821074064647493, "grad_norm": 5.884915828704834, "learning_rate": 2.3214084877937464e-05, "loss": 1.5281, "step": 670 }, { "epoch": 0.6831254772206669, "grad_norm": 6.640014171600342, "learning_rate": 2.30786021075939e-05, "loss": 1.4942, "step": 671 }, { "epoch": 0.6841435479765844, "grad_norm": 6.866456985473633, "learning_rate": 2.294339712895271e-05, "loss": 1.674, "step": 672 }, { "epoch": 0.6851616187325019, "grad_norm": 6.7534990310668945, "learning_rate": 2.28084713371428e-05, "loss": 1.3313, "step": 673 }, { "epoch": 0.6861796894884195, "grad_norm": 6.38292121887207, "learning_rate": 2.2673826124412312e-05, "loss": 1.6016, "step": 674 }, { "epoch": 0.687197760244337, "grad_norm": 7.129096031188965, "learning_rate": 2.2539462880114194e-05, "loss": 1.8662, "step": 675 }, { "epoch": 0.6882158310002545, "grad_norm": 6.555764675140381, "learning_rate": 2.240538299069178e-05, "loss": 1.9315, "step": 676 }, { "epoch": 0.689233901756172, "grad_norm": 5.772182941436768, "learning_rate": 2.2271587839664672e-05, "loss": 1.3156, "step": 677 }, { "epoch": 0.6902519725120896, "grad_norm": 7.608791351318359, "learning_rate": 2.213807880761434e-05, "loss": 1.9463, "step": 678 }, { "epoch": 0.6912700432680071, "grad_norm": 7.279063701629639, "learning_rate": 2.2004857272169876e-05, "loss": 1.9304, "step": 679 }, { "epoch": 0.6922881140239247, "grad_norm": 9.676162719726562, "learning_rate": 2.1871924607993797e-05, "loss": 2.3767, "step": 680 }, { "epoch": 0.6933061847798422, "grad_norm": 7.1779093742370605, "learning_rate": 2.1739282186767923e-05, "loss": 1.6381, "step": 681 }, { "epoch": 0.6943242555357597, "grad_norm": 6.892930030822754, "learning_rate": 2.160693137717912e-05, "loss": 2.134, "step": 682 }, { "epoch": 0.6953423262916772, "grad_norm": 9.403331756591797, "learning_rate": 2.1474873544905205e-05, "loss": 2.2294, "step": 683 }, { "epoch": 0.6963603970475948, "grad_norm": 7.7654595375061035, "learning_rate": 2.134311005260093e-05, "loss": 2.0953, "step": 684 }, { "epoch": 0.6973784678035123, "grad_norm": 10.087757110595703, "learning_rate": 2.1211642259883867e-05, "loss": 2.9221, "step": 685 }, { "epoch": 0.6983965385594298, "grad_norm": 8.816588401794434, "learning_rate": 2.108047152332028e-05, "loss": 2.6949, "step": 686 }, { "epoch": 0.6994146093153474, "grad_norm": 8.12427043914795, "learning_rate": 2.0949599196411325e-05, "loss": 1.7944, "step": 687 }, { "epoch": 0.7004326800712649, "grad_norm": 7.3718461990356445, "learning_rate": 2.0819026629578952e-05, "loss": 2.1142, "step": 688 }, { "epoch": 0.7014507508271824, "grad_norm": 7.3536577224731445, "learning_rate": 2.0688755170151996e-05, "loss": 2.0029, "step": 689 }, { "epoch": 0.7024688215831, "grad_norm": 8.220134735107422, "learning_rate": 2.0558786162352244e-05, "loss": 2.2986, "step": 690 }, { "epoch": 0.7034868923390175, "grad_norm": 9.169322967529297, "learning_rate": 2.0429120947280678e-05, "loss": 2.3455, "step": 691 }, { "epoch": 0.704504963094935, "grad_norm": 8.935730934143066, "learning_rate": 2.029976086290347e-05, "loss": 2.1588, "step": 692 }, { "epoch": 0.7055230338508527, "grad_norm": 7.555604934692383, "learning_rate": 2.017070724403835e-05, "loss": 2.2783, "step": 693 }, { "epoch": 0.7065411046067702, "grad_norm": 7.896771430969238, "learning_rate": 2.0041961422340676e-05, "loss": 1.8964, "step": 694 }, { "epoch": 0.7075591753626878, "grad_norm": 8.242528915405273, "learning_rate": 1.9913524726289784e-05, "loss": 1.9936, "step": 695 }, { "epoch": 0.7085772461186053, "grad_norm": 7.946272373199463, "learning_rate": 1.9785398481175294e-05, "loss": 2.1526, "step": 696 }, { "epoch": 0.7095953168745228, "grad_norm": 8.382307052612305, "learning_rate": 1.965758400908334e-05, "loss": 2.4691, "step": 697 }, { "epoch": 0.7106133876304404, "grad_norm": 6.839285373687744, "learning_rate": 1.9530082628883056e-05, "loss": 1.7924, "step": 698 }, { "epoch": 0.7116314583863579, "grad_norm": 12.65297794342041, "learning_rate": 1.9402895656212833e-05, "loss": 2.0093, "step": 699 }, { "epoch": 0.7126495291422754, "grad_norm": 11.35102653503418, "learning_rate": 1.927602440346687e-05, "loss": 1.7963, "step": 700 }, { "epoch": 0.713667599898193, "grad_norm": 7.479799747467041, "learning_rate": 1.914947017978153e-05, "loss": 3.4169, "step": 701 }, { "epoch": 0.7146856706541105, "grad_norm": 9.703947067260742, "learning_rate": 1.9023234291021873e-05, "loss": 2.8178, "step": 702 }, { "epoch": 0.715703741410028, "grad_norm": 10.218291282653809, "learning_rate": 1.889731803976822e-05, "loss": 2.841, "step": 703 }, { "epoch": 0.7167218121659455, "grad_norm": 12.210125923156738, "learning_rate": 1.8771722725302643e-05, "loss": 3.9947, "step": 704 }, { "epoch": 0.7177398829218631, "grad_norm": 9.851053237915039, "learning_rate": 1.8646449643595565e-05, "loss": 2.8836, "step": 705 }, { "epoch": 0.7187579536777806, "grad_norm": 11.182621955871582, "learning_rate": 1.8521500087292467e-05, "loss": 3.2881, "step": 706 }, { "epoch": 0.7197760244336981, "grad_norm": 16.472837448120117, "learning_rate": 1.8396875345700497e-05, "loss": 3.6782, "step": 707 }, { "epoch": 0.7207940951896157, "grad_norm": 13.632477760314941, "learning_rate": 1.8272576704775074e-05, "loss": 3.5599, "step": 708 }, { "epoch": 0.7218121659455332, "grad_norm": 8.531991958618164, "learning_rate": 1.8148605447106797e-05, "loss": 1.815, "step": 709 }, { "epoch": 0.7228302367014507, "grad_norm": 6.116468906402588, "learning_rate": 1.8024962851908107e-05, "loss": 1.3279, "step": 710 }, { "epoch": 0.7238483074573683, "grad_norm": 6.058359622955322, "learning_rate": 1.7901650195000068e-05, "loss": 1.1209, "step": 711 }, { "epoch": 0.7248663782132858, "grad_norm": 7.301308631896973, "learning_rate": 1.7778668748799242e-05, "loss": 1.6941, "step": 712 }, { "epoch": 0.7258844489692033, "grad_norm": 6.059625148773193, "learning_rate": 1.76560197823046e-05, "loss": 1.4134, "step": 713 }, { "epoch": 0.7269025197251209, "grad_norm": 5.40415620803833, "learning_rate": 1.753370456108433e-05, "loss": 1.5117, "step": 714 }, { "epoch": 0.7279205904810384, "grad_norm": 6.5403008460998535, "learning_rate": 1.7411724347262824e-05, "loss": 1.397, "step": 715 }, { "epoch": 0.7289386612369559, "grad_norm": 8.339217185974121, "learning_rate": 1.729008039950772e-05, "loss": 1.5315, "step": 716 }, { "epoch": 0.7299567319928735, "grad_norm": 5.882655620574951, "learning_rate": 1.7168773973016776e-05, "loss": 1.1574, "step": 717 }, { "epoch": 0.730974802748791, "grad_norm": 6.183307647705078, "learning_rate": 1.7047806319505076e-05, "loss": 1.3367, "step": 718 }, { "epoch": 0.7319928735047085, "grad_norm": 6.28183126449585, "learning_rate": 1.692717868719195e-05, "loss": 1.5637, "step": 719 }, { "epoch": 0.7330109442606261, "grad_norm": 4.728903293609619, "learning_rate": 1.680689232078827e-05, "loss": 1.4179, "step": 720 }, { "epoch": 0.7340290150165436, "grad_norm": 6.95587158203125, "learning_rate": 1.668694846148343e-05, "loss": 1.7837, "step": 721 }, { "epoch": 0.7350470857724611, "grad_norm": 5.531774997711182, "learning_rate": 1.6567348346932658e-05, "loss": 1.2069, "step": 722 }, { "epoch": 0.7360651565283787, "grad_norm": 5.498968601226807, "learning_rate": 1.644809321124423e-05, "loss": 1.1316, "step": 723 }, { "epoch": 0.7370832272842962, "grad_norm": 7.1133809089660645, "learning_rate": 1.6329184284966677e-05, "loss": 2.0335, "step": 724 }, { "epoch": 0.7381012980402138, "grad_norm": 6.765145301818848, "learning_rate": 1.621062279507617e-05, "loss": 2.0067, "step": 725 }, { "epoch": 0.7391193687961314, "grad_norm": 7.21923828125, "learning_rate": 1.609240996496378e-05, "loss": 2.2922, "step": 726 }, { "epoch": 0.7401374395520489, "grad_norm": 5.8889360427856445, "learning_rate": 1.597454701442288e-05, "loss": 1.6363, "step": 727 }, { "epoch": 0.7411555103079664, "grad_norm": 8.041604042053223, "learning_rate": 1.5857035159636623e-05, "loss": 1.6933, "step": 728 }, { "epoch": 0.742173581063884, "grad_norm": 7.711045742034912, "learning_rate": 1.5739875613165283e-05, "loss": 1.9258, "step": 729 }, { "epoch": 0.7431916518198015, "grad_norm": 7.747977256774902, "learning_rate": 1.5623069583933836e-05, "loss": 2.5273, "step": 730 }, { "epoch": 0.744209722575719, "grad_norm": 8.055684089660645, "learning_rate": 1.550661827721941e-05, "loss": 2.0398, "step": 731 }, { "epoch": 0.7452277933316366, "grad_norm": 8.75759220123291, "learning_rate": 1.5390522894638938e-05, "loss": 2.5372, "step": 732 }, { "epoch": 0.7462458640875541, "grad_norm": 6.629666805267334, "learning_rate": 1.527478463413666e-05, "loss": 1.8586, "step": 733 }, { "epoch": 0.7472639348434716, "grad_norm": 7.634647369384766, "learning_rate": 1.5159404689971795e-05, "loss": 1.7609, "step": 734 }, { "epoch": 0.7482820055993892, "grad_norm": 8.821757316589355, "learning_rate": 1.5044384252706312e-05, "loss": 2.5073, "step": 735 }, { "epoch": 0.7493000763553067, "grad_norm": 7.940456867218018, "learning_rate": 1.4929724509192488e-05, "loss": 2.6403, "step": 736 }, { "epoch": 0.7503181471112242, "grad_norm": 7.819153308868408, "learning_rate": 1.4815426642560754e-05, "loss": 2.3173, "step": 737 }, { "epoch": 0.7513362178671418, "grad_norm": 7.586490154266357, "learning_rate": 1.470149183220748e-05, "loss": 2.0191, "step": 738 }, { "epoch": 0.7523542886230593, "grad_norm": 6.6719584465026855, "learning_rate": 1.4587921253782849e-05, "loss": 1.6597, "step": 739 }, { "epoch": 0.7533723593789768, "grad_norm": 8.974640846252441, "learning_rate": 1.447471607917854e-05, "loss": 1.9953, "step": 740 }, { "epoch": 0.7543904301348944, "grad_norm": 7.379059314727783, "learning_rate": 1.4361877476515889e-05, "loss": 1.8422, "step": 741 }, { "epoch": 0.7554085008908119, "grad_norm": 7.163296699523926, "learning_rate": 1.4249406610133686e-05, "loss": 1.8372, "step": 742 }, { "epoch": 0.7564265716467294, "grad_norm": 10.26382064819336, "learning_rate": 1.413730464057616e-05, "loss": 2.2328, "step": 743 }, { "epoch": 0.757444642402647, "grad_norm": 7.997495651245117, "learning_rate": 1.4025572724581038e-05, "loss": 2.083, "step": 744 }, { "epoch": 0.7584627131585645, "grad_norm": 8.966462135314941, "learning_rate": 1.3914212015067651e-05, "loss": 2.0716, "step": 745 }, { "epoch": 0.759480783914482, "grad_norm": 9.182121276855469, "learning_rate": 1.3803223661124936e-05, "loss": 2.406, "step": 746 }, { "epoch": 0.7604988546703996, "grad_norm": 8.59941577911377, "learning_rate": 1.3692608807999652e-05, "loss": 2.1763, "step": 747 }, { "epoch": 0.7615169254263171, "grad_norm": 11.990951538085938, "learning_rate": 1.3582368597084566e-05, "loss": 2.7291, "step": 748 }, { "epoch": 0.7625349961822346, "grad_norm": 7.6986260414123535, "learning_rate": 1.3472504165906613e-05, "loss": 1.9128, "step": 749 }, { "epoch": 0.7635530669381522, "grad_norm": 8.994124412536621, "learning_rate": 1.3363016648115245e-05, "loss": 1.7094, "step": 750 }, { "epoch": 0.7645711376940697, "grad_norm": 8.2550630569458, "learning_rate": 1.3253907173470648e-05, "loss": 3.7822, "step": 751 }, { "epoch": 0.7655892084499872, "grad_norm": 8.363167762756348, "learning_rate": 1.3145176867832165e-05, "loss": 3.1741, "step": 752 }, { "epoch": 0.7666072792059048, "grad_norm": 9.87235164642334, "learning_rate": 1.30368268531466e-05, "loss": 3.5932, "step": 753 }, { "epoch": 0.7676253499618223, "grad_norm": 7.78696346282959, "learning_rate": 1.292885824743667e-05, "loss": 2.4161, "step": 754 }, { "epoch": 0.7686434207177398, "grad_norm": 10.35235595703125, "learning_rate": 1.2821272164789544e-05, "loss": 3.0389, "step": 755 }, { "epoch": 0.7696614914736574, "grad_norm": 13.586828231811523, "learning_rate": 1.2714069715345195e-05, "loss": 3.6863, "step": 756 }, { "epoch": 0.770679562229575, "grad_norm": 11.710920333862305, "learning_rate": 1.2607252005285109e-05, "loss": 4.0324, "step": 757 }, { "epoch": 0.7716976329854925, "grad_norm": 12.497662544250488, "learning_rate": 1.2500820136820734e-05, "loss": 2.9699, "step": 758 }, { "epoch": 0.7727157037414101, "grad_norm": 15.810281753540039, "learning_rate": 1.2394775208182174e-05, "loss": 3.1219, "step": 759 }, { "epoch": 0.7737337744973276, "grad_norm": 10.045539855957031, "learning_rate": 1.2289118313606896e-05, "loss": 1.7459, "step": 760 }, { "epoch": 0.7747518452532451, "grad_norm": 8.060089111328125, "learning_rate": 1.2183850543328312e-05, "loss": 1.4901, "step": 761 }, { "epoch": 0.7757699160091627, "grad_norm": 5.897403240203857, "learning_rate": 1.2078972983564684e-05, "loss": 1.1615, "step": 762 }, { "epoch": 0.7767879867650802, "grad_norm": 9.653463363647461, "learning_rate": 1.1974486716507783e-05, "loss": 2.1851, "step": 763 }, { "epoch": 0.7778060575209977, "grad_norm": 8.81517505645752, "learning_rate": 1.1870392820311821e-05, "loss": 2.0521, "step": 764 }, { "epoch": 0.7788241282769153, "grad_norm": 8.291658401489258, "learning_rate": 1.1766692369082255e-05, "loss": 2.0249, "step": 765 }, { "epoch": 0.7798421990328328, "grad_norm": 8.102690696716309, "learning_rate": 1.1663386432864727e-05, "loss": 2.065, "step": 766 }, { "epoch": 0.7808602697887503, "grad_norm": 4.781749248504639, "learning_rate": 1.156047607763407e-05, "loss": 1.2103, "step": 767 }, { "epoch": 0.7818783405446679, "grad_norm": 6.986288547515869, "learning_rate": 1.145796236528322e-05, "loss": 1.5723, "step": 768 }, { "epoch": 0.7828964113005854, "grad_norm": 6.896148681640625, "learning_rate": 1.135584635361232e-05, "loss": 1.7923, "step": 769 }, { "epoch": 0.7839144820565029, "grad_norm": 6.917054176330566, "learning_rate": 1.1254129096317806e-05, "loss": 1.8785, "step": 770 }, { "epoch": 0.7849325528124205, "grad_norm": 8.382417678833008, "learning_rate": 1.115281164298153e-05, "loss": 2.0591, "step": 771 }, { "epoch": 0.785950623568338, "grad_norm": 6.412557601928711, "learning_rate": 1.105189503905985e-05, "loss": 1.3064, "step": 772 }, { "epoch": 0.7869686943242555, "grad_norm": 8.5576753616333, "learning_rate": 1.0951380325872979e-05, "loss": 2.1621, "step": 773 }, { "epoch": 0.7879867650801731, "grad_norm": 6.601977348327637, "learning_rate": 1.0851268540594167e-05, "loss": 1.9123, "step": 774 }, { "epoch": 0.7890048358360906, "grad_norm": 9.037349700927734, "learning_rate": 1.0751560716238967e-05, "loss": 2.7441, "step": 775 }, { "epoch": 0.7900229065920081, "grad_norm": 8.547028541564941, "learning_rate": 1.0652257881654627e-05, "loss": 2.4516, "step": 776 }, { "epoch": 0.7910409773479257, "grad_norm": 6.397313117980957, "learning_rate": 1.055336106150948e-05, "loss": 1.7555, "step": 777 }, { "epoch": 0.7920590481038432, "grad_norm": 8.946637153625488, "learning_rate": 1.0454871276282335e-05, "loss": 2.4554, "step": 778 }, { "epoch": 0.7930771188597607, "grad_norm": 10.802556037902832, "learning_rate": 1.0356789542251938e-05, "loss": 3.2162, "step": 779 }, { "epoch": 0.7940951896156783, "grad_norm": 6.947890281677246, "learning_rate": 1.0259116871486557e-05, "loss": 1.7408, "step": 780 }, { "epoch": 0.7951132603715958, "grad_norm": 6.768093585968018, "learning_rate": 1.0161854271833443e-05, "loss": 1.8601, "step": 781 }, { "epoch": 0.7961313311275133, "grad_norm": 8.398831367492676, "learning_rate": 1.006500274690853e-05, "loss": 2.2156, "step": 782 }, { "epoch": 0.7971494018834309, "grad_norm": 7.810449123382568, "learning_rate": 9.96856329608597e-06, "loss": 2.1837, "step": 783 }, { "epoch": 0.7981674726393484, "grad_norm": 8.77087116241455, "learning_rate": 9.87253691448794e-06, "loss": 2.6596, "step": 784 }, { "epoch": 0.7991855433952659, "grad_norm": 11.717060089111328, "learning_rate": 9.776924592974256e-06, "loss": 3.5775, "step": 785 }, { "epoch": 0.8002036141511835, "grad_norm": 7.535914897918701, "learning_rate": 9.681727318132227e-06, "loss": 2.2536, "step": 786 }, { "epoch": 0.801221684907101, "grad_norm": 9.274803161621094, "learning_rate": 9.586946072266478e-06, "loss": 2.4405, "step": 787 }, { "epoch": 0.8022397556630185, "grad_norm": 7.637203216552734, "learning_rate": 9.492581833388736e-06, "loss": 1.9418, "step": 788 }, { "epoch": 0.8032578264189361, "grad_norm": 11.312345504760742, "learning_rate": 9.398635575207854e-06, "loss": 3.0226, "step": 789 }, { "epoch": 0.8042758971748537, "grad_norm": 8.732460021972656, "learning_rate": 9.305108267119645e-06, "loss": 2.0292, "step": 790 }, { "epoch": 0.8052939679307712, "grad_norm": 6.998504638671875, "learning_rate": 9.212000874196953e-06, "loss": 1.7993, "step": 791 }, { "epoch": 0.8063120386866888, "grad_norm": 9.693340301513672, "learning_rate": 9.119314357179687e-06, "loss": 2.4902, "step": 792 }, { "epoch": 0.8073301094426063, "grad_norm": 8.360791206359863, "learning_rate": 9.027049672464916e-06, "loss": 2.0688, "step": 793 }, { "epoch": 0.8083481801985238, "grad_norm": 7.457218647003174, "learning_rate": 8.935207772096904e-06, "loss": 2.1543, "step": 794 }, { "epoch": 0.8093662509544414, "grad_norm": 8.154823303222656, "learning_rate": 8.843789603757446e-06, "loss": 2.5219, "step": 795 }, { "epoch": 0.8103843217103589, "grad_norm": 8.503774642944336, "learning_rate": 8.752796110755984e-06, "loss": 1.7771, "step": 796 }, { "epoch": 0.8114023924662764, "grad_norm": 9.03532600402832, "learning_rate": 8.662228232019876e-06, "loss": 2.1927, "step": 797 }, { "epoch": 0.812420463222194, "grad_norm": 7.620565891265869, "learning_rate": 8.572086902084731e-06, "loss": 1.5837, "step": 798 }, { "epoch": 0.8134385339781115, "grad_norm": 8.183737754821777, "learning_rate": 8.48237305108479e-06, "loss": 2.0261, "step": 799 }, { "epoch": 0.814456604734029, "grad_norm": 7.848052978515625, "learning_rate": 8.393087604743283e-06, "loss": 1.7399, "step": 800 }, { "epoch": 0.8154746754899466, "grad_norm": 6.207403182983398, "learning_rate": 8.304231484362868e-06, "loss": 2.4818, "step": 801 }, { "epoch": 0.8164927462458641, "grad_norm": 7.363234043121338, "learning_rate": 8.215805606816191e-06, "loss": 2.7651, "step": 802 }, { "epoch": 0.8175108170017816, "grad_norm": 8.982840538024902, "learning_rate": 8.127810884536403e-06, "loss": 3.0555, "step": 803 }, { "epoch": 0.8185288877576992, "grad_norm": 8.631933212280273, "learning_rate": 8.040248225507641e-06, "loss": 2.9924, "step": 804 }, { "epoch": 0.8195469585136167, "grad_norm": 11.01415729522705, "learning_rate": 7.95311853325582e-06, "loss": 3.5641, "step": 805 }, { "epoch": 0.8205650292695342, "grad_norm": 11.288020133972168, "learning_rate": 7.866422706839238e-06, "loss": 3.6431, "step": 806 }, { "epoch": 0.8215831000254518, "grad_norm": 18.66059112548828, "learning_rate": 7.780161640839257e-06, "loss": 3.8684, "step": 807 }, { "epoch": 0.8226011707813693, "grad_norm": 10.357224464416504, "learning_rate": 7.694336225351107e-06, "loss": 2.31, "step": 808 }, { "epoch": 0.8236192415372868, "grad_norm": 10.739027976989746, "learning_rate": 7.60894734597476e-06, "loss": 2.3002, "step": 809 }, { "epoch": 0.8246373122932044, "grad_norm": 4.031766414642334, "learning_rate": 7.523995883805679e-06, "loss": 0.8728, "step": 810 }, { "epoch": 0.8256553830491219, "grad_norm": 5.490253448486328, "learning_rate": 7.439482715425805e-06, "loss": 1.0823, "step": 811 }, { "epoch": 0.8266734538050394, "grad_norm": 6.157886981964111, "learning_rate": 7.355408712894507e-06, "loss": 1.4668, "step": 812 }, { "epoch": 0.827691524560957, "grad_norm": 4.612349510192871, "learning_rate": 7.271774743739545e-06, "loss": 0.9666, "step": 813 }, { "epoch": 0.8287095953168745, "grad_norm": 4.856873989105225, "learning_rate": 7.188581670948169e-06, "loss": 1.2653, "step": 814 }, { "epoch": 0.829727666072792, "grad_norm": 7.248535633087158, "learning_rate": 7.105830352958142e-06, "loss": 1.8515, "step": 815 }, { "epoch": 0.8307457368287096, "grad_norm": 7.241623878479004, "learning_rate": 7.0235216436489835e-06, "loss": 1.4111, "step": 816 }, { "epoch": 0.8317638075846271, "grad_norm": 5.958316802978516, "learning_rate": 6.941656392333046e-06, "loss": 1.4647, "step": 817 }, { "epoch": 0.8327818783405446, "grad_norm": 7.3134684562683105, "learning_rate": 6.860235443746859e-06, "loss": 1.4169, "step": 818 }, { "epoch": 0.8337999490964622, "grad_norm": 6.82220458984375, "learning_rate": 6.779259638042318e-06, "loss": 1.6871, "step": 819 }, { "epoch": 0.8348180198523797, "grad_norm": 8.294244766235352, "learning_rate": 6.698729810778065e-06, "loss": 2.0008, "step": 820 }, { "epoch": 0.8358360906082972, "grad_norm": 7.062936305999756, "learning_rate": 6.618646792910893e-06, "loss": 1.5006, "step": 821 }, { "epoch": 0.8368541613642149, "grad_norm": 8.914663314819336, "learning_rate": 6.539011410787105e-06, "loss": 2.8718, "step": 822 }, { "epoch": 0.8378722321201324, "grad_norm": 6.904050827026367, "learning_rate": 6.459824486134014e-06, "loss": 1.6913, "step": 823 }, { "epoch": 0.8388903028760499, "grad_norm": 9.867286682128906, "learning_rate": 6.381086836051498e-06, "loss": 2.3808, "step": 824 }, { "epoch": 0.8399083736319675, "grad_norm": 7.717339992523193, "learning_rate": 6.302799273003546e-06, "loss": 2.3309, "step": 825 }, { "epoch": 0.840926444387885, "grad_norm": 7.964992046356201, "learning_rate": 6.224962604809819e-06, "loss": 1.8556, "step": 826 }, { "epoch": 0.8419445151438025, "grad_norm": 8.438617706298828, "learning_rate": 6.147577634637414e-06, "loss": 2.8175, "step": 827 }, { "epoch": 0.8429625858997201, "grad_norm": 6.18842077255249, "learning_rate": 6.070645160992522e-06, "loss": 1.8594, "step": 828 }, { "epoch": 0.8439806566556376, "grad_norm": 9.132290840148926, "learning_rate": 5.994165977712174e-06, "loss": 2.4016, "step": 829 }, { "epoch": 0.8449987274115551, "grad_norm": 8.243701934814453, "learning_rate": 5.918140873956063e-06, "loss": 2.8481, "step": 830 }, { "epoch": 0.8460167981674727, "grad_norm": 7.9838457107543945, "learning_rate": 5.842570634198452e-06, "loss": 2.2464, "step": 831 }, { "epoch": 0.8470348689233902, "grad_norm": 9.669371604919434, "learning_rate": 5.767456038219987e-06, "loss": 3.0041, "step": 832 }, { "epoch": 0.8480529396793077, "grad_norm": 6.894510746002197, "learning_rate": 5.692797861099719e-06, "loss": 1.9256, "step": 833 }, { "epoch": 0.8490710104352253, "grad_norm": 9.749018669128418, "learning_rate": 5.6185968732070825e-06, "loss": 2.0606, "step": 834 }, { "epoch": 0.8500890811911428, "grad_norm": 10.20246410369873, "learning_rate": 5.544853840193981e-06, "loss": 2.5006, "step": 835 }, { "epoch": 0.8511071519470603, "grad_norm": 7.829645156860352, "learning_rate": 5.471569522986774e-06, "loss": 2.2776, "step": 836 }, { "epoch": 0.8521252227029779, "grad_norm": 8.704474449157715, "learning_rate": 5.398744677778594e-06, "loss": 2.155, "step": 837 }, { "epoch": 0.8531432934588954, "grad_norm": 9.067277908325195, "learning_rate": 5.326380056021418e-06, "loss": 1.9537, "step": 838 }, { "epoch": 0.8541613642148129, "grad_norm": 7.175379276275635, "learning_rate": 5.25447640441834e-06, "loss": 1.8142, "step": 839 }, { "epoch": 0.8551794349707305, "grad_norm": 9.313763618469238, "learning_rate": 5.183034464915898e-06, "loss": 2.6184, "step": 840 }, { "epoch": 0.856197505726648, "grad_norm": 8.87330150604248, "learning_rate": 5.112054974696395e-06, "loss": 2.2619, "step": 841 }, { "epoch": 0.8572155764825655, "grad_norm": 7.452190399169922, "learning_rate": 5.041538666170281e-06, "loss": 2.0532, "step": 842 }, { "epoch": 0.858233647238483, "grad_norm": 8.609152793884277, "learning_rate": 4.9714862669686335e-06, "loss": 2.3708, "step": 843 }, { "epoch": 0.8592517179944006, "grad_norm": 9.013458251953125, "learning_rate": 4.901898499935609e-06, "loss": 2.2493, "step": 844 }, { "epoch": 0.8602697887503181, "grad_norm": 6.508463382720947, "learning_rate": 4.832776083120982e-06, "loss": 1.6531, "step": 845 }, { "epoch": 0.8612878595062357, "grad_norm": 8.564003944396973, "learning_rate": 4.764119729772809e-06, "loss": 2.2661, "step": 846 }, { "epoch": 0.8623059302621532, "grad_norm": 10.198071479797363, "learning_rate": 4.695930148329958e-06, "loss": 2.5887, "step": 847 }, { "epoch": 0.8633240010180707, "grad_norm": 7.703155994415283, "learning_rate": 4.628208042414889e-06, "loss": 2.1529, "step": 848 }, { "epoch": 0.8643420717739883, "grad_norm": 9.272164344787598, "learning_rate": 4.560954110826337e-06, "loss": 2.1878, "step": 849 }, { "epoch": 0.8653601425299058, "grad_norm": 9.344987869262695, "learning_rate": 4.494169047532154e-06, "loss": 1.8306, "step": 850 }, { "epoch": 0.8663782132858233, "grad_norm": 7.905543327331543, "learning_rate": 4.427853541662091e-06, "loss": 3.7738, "step": 851 }, { "epoch": 0.8673962840417409, "grad_norm": 11.58995532989502, "learning_rate": 4.362008277500701e-06, "loss": 3.8687, "step": 852 }, { "epoch": 0.8684143547976584, "grad_norm": 10.805427551269531, "learning_rate": 4.296633934480337e-06, "loss": 4.1339, "step": 853 }, { "epoch": 0.869432425553576, "grad_norm": 9.845876693725586, "learning_rate": 4.231731187174065e-06, "loss": 3.6647, "step": 854 }, { "epoch": 0.8704504963094936, "grad_norm": 11.320969581604004, "learning_rate": 4.167300705288718e-06, "loss": 3.8381, "step": 855 }, { "epoch": 0.8714685670654111, "grad_norm": 12.798758506774902, "learning_rate": 4.10334315365804e-06, "loss": 3.4318, "step": 856 }, { "epoch": 0.8724866378213286, "grad_norm": 11.61473274230957, "learning_rate": 4.039859192235779e-06, "loss": 2.8296, "step": 857 }, { "epoch": 0.8735047085772462, "grad_norm": 14.96419620513916, "learning_rate": 3.976849476088845e-06, "loss": 3.5524, "step": 858 }, { "epoch": 0.8745227793331637, "grad_norm": 7.406204700469971, "learning_rate": 3.914314655390633e-06, "loss": 1.9455, "step": 859 }, { "epoch": 0.8755408500890812, "grad_norm": 6.518743991851807, "learning_rate": 3.852255375414271e-06, "loss": 1.2701, "step": 860 }, { "epoch": 0.8765589208449988, "grad_norm": 6.446305751800537, "learning_rate": 3.790672276525936e-06, "loss": 1.4215, "step": 861 }, { "epoch": 0.8775769916009163, "grad_norm": 8.741048812866211, "learning_rate": 3.7295659941782855e-06, "loss": 1.8906, "step": 862 }, { "epoch": 0.8785950623568338, "grad_norm": 8.466268539428711, "learning_rate": 3.668937158903901e-06, "loss": 1.8339, "step": 863 }, { "epoch": 0.8796131331127514, "grad_norm": 9.552635192871094, "learning_rate": 3.6087863963087497e-06, "loss": 2.2068, "step": 864 }, { "epoch": 0.8806312038686689, "grad_norm": 6.880453586578369, "learning_rate": 3.5491143270657446e-06, "loss": 1.4963, "step": 865 }, { "epoch": 0.8816492746245864, "grad_norm": 6.255483627319336, "learning_rate": 3.4899215669083716e-06, "loss": 1.3132, "step": 866 }, { "epoch": 0.882667345380504, "grad_norm": 6.0426025390625, "learning_rate": 3.4312087266242963e-06, "loss": 1.5957, "step": 867 }, { "epoch": 0.8836854161364215, "grad_norm": 6.473054885864258, "learning_rate": 3.3729764120490446e-06, "loss": 1.7223, "step": 868 }, { "epoch": 0.884703486892339, "grad_norm": 6.081118106842041, "learning_rate": 3.315225224059809e-06, "loss": 1.4486, "step": 869 }, { "epoch": 0.8857215576482566, "grad_norm": 8.436514854431152, "learning_rate": 3.25795575856922e-06, "loss": 2.5112, "step": 870 }, { "epoch": 0.8867396284041741, "grad_norm": 7.625750541687012, "learning_rate": 3.2011686065191895e-06, "loss": 1.7229, "step": 871 }, { "epoch": 0.8877576991600916, "grad_norm": 8.813794136047363, "learning_rate": 3.1448643538748045e-06, "loss": 2.4128, "step": 872 }, { "epoch": 0.8887757699160092, "grad_norm": 5.511772155761719, "learning_rate": 3.0890435816183226e-06, "loss": 1.3973, "step": 873 }, { "epoch": 0.8897938406719267, "grad_norm": 8.631103515625, "learning_rate": 3.03370686574313e-06, "loss": 2.015, "step": 874 }, { "epoch": 0.8908119114278442, "grad_norm": 7.364684104919434, "learning_rate": 2.9788547772478416e-06, "loss": 2.2994, "step": 875 }, { "epoch": 0.8918299821837617, "grad_norm": 6.241342067718506, "learning_rate": 2.924487882130356e-06, "loss": 1.8339, "step": 876 }, { "epoch": 0.8928480529396793, "grad_norm": 7.564022064208984, "learning_rate": 2.870606741382059e-06, "loss": 1.7091, "step": 877 }, { "epoch": 0.8938661236955968, "grad_norm": 7.5156378746032715, "learning_rate": 2.817211910982037e-06, "loss": 2.1743, "step": 878 }, { "epoch": 0.8948841944515143, "grad_norm": 7.265412330627441, "learning_rate": 2.7643039418913e-06, "loss": 2.0643, "step": 879 }, { "epoch": 0.8959022652074319, "grad_norm": 7.189180850982666, "learning_rate": 2.711883380047131e-06, "loss": 2.1941, "step": 880 }, { "epoch": 0.8969203359633494, "grad_norm": 7.311581134796143, "learning_rate": 2.6599507663574384e-06, "loss": 2.1736, "step": 881 }, { "epoch": 0.897938406719267, "grad_norm": 10.869359016418457, "learning_rate": 2.6085066366951905e-06, "loss": 3.2378, "step": 882 }, { "epoch": 0.8989564774751845, "grad_norm": 7.649423122406006, "learning_rate": 2.5575515218928592e-06, "loss": 2.1515, "step": 883 }, { "epoch": 0.899974548231102, "grad_norm": 7.814465045928955, "learning_rate": 2.5070859477369645e-06, "loss": 2.4541, "step": 884 }, { "epoch": 0.9009926189870195, "grad_norm": 7.679671764373779, "learning_rate": 2.457110434962645e-06, "loss": 2.6637, "step": 885 }, { "epoch": 0.9020106897429372, "grad_norm": 8.16401481628418, "learning_rate": 2.407625499248273e-06, "loss": 2.547, "step": 886 }, { "epoch": 0.9030287604988547, "grad_norm": 9.379379272460938, "learning_rate": 2.3586316512101416e-06, "loss": 2.5642, "step": 887 }, { "epoch": 0.9040468312547723, "grad_norm": 7.037562370300293, "learning_rate": 2.3101293963972094e-06, "loss": 1.9438, "step": 888 }, { "epoch": 0.9050649020106898, "grad_norm": 7.9886674880981445, "learning_rate": 2.26211923528587e-06, "loss": 2.2604, "step": 889 }, { "epoch": 0.9060829727666073, "grad_norm": 7.768427848815918, "learning_rate": 2.2146016632747624e-06, "loss": 1.8138, "step": 890 }, { "epoch": 0.9071010435225249, "grad_norm": 7.540080547332764, "learning_rate": 2.1675771706797132e-06, "loss": 1.8673, "step": 891 }, { "epoch": 0.9081191142784424, "grad_norm": 8.09107780456543, "learning_rate": 2.1210462427286524e-06, "loss": 1.971, "step": 892 }, { "epoch": 0.9091371850343599, "grad_norm": 6.946146011352539, "learning_rate": 2.0750093595565733e-06, "loss": 1.7914, "step": 893 }, { "epoch": 0.9101552557902775, "grad_norm": 10.480133056640625, "learning_rate": 2.0294669962006354e-06, "loss": 2.4089, "step": 894 }, { "epoch": 0.911173326546195, "grad_norm": 7.2262797355651855, "learning_rate": 1.984419622595224e-06, "loss": 1.8403, "step": 895 }, { "epoch": 0.9121913973021125, "grad_norm": 8.057347297668457, "learning_rate": 1.939867703567122e-06, "loss": 1.7492, "step": 896 }, { "epoch": 0.91320946805803, "grad_norm": 8.859241485595703, "learning_rate": 1.895811698830685e-06, "loss": 2.3681, "step": 897 }, { "epoch": 0.9142275388139476, "grad_norm": 7.5691328048706055, "learning_rate": 1.8522520629831397e-06, "loss": 2.1921, "step": 898 }, { "epoch": 0.9152456095698651, "grad_norm": 9.58875846862793, "learning_rate": 1.8091892454998594e-06, "loss": 2.4974, "step": 899 }, { "epoch": 0.9162636803257826, "grad_norm": 7.032078266143799, "learning_rate": 1.7666236907297406e-06, "loss": 1.3266, "step": 900 }, { "epoch": 0.9172817510817002, "grad_norm": 7.034844875335693, "learning_rate": 1.7245558378906013e-06, "loss": 2.617, "step": 901 }, { "epoch": 0.9182998218376177, "grad_norm": 9.052960395812988, "learning_rate": 1.6829861210646891e-06, "loss": 3.9865, "step": 902 }, { "epoch": 0.9193178925935352, "grad_norm": 11.31360912322998, "learning_rate": 1.641914969194147e-06, "loss": 3.9044, "step": 903 }, { "epoch": 0.9203359633494528, "grad_norm": 11.041589736938477, "learning_rate": 1.6013428060766168e-06, "loss": 4.3406, "step": 904 }, { "epoch": 0.9213540341053703, "grad_norm": 10.366722106933594, "learning_rate": 1.5612700503608968e-06, "loss": 2.948, "step": 905 }, { "epoch": 0.9223721048612878, "grad_norm": 13.091897964477539, "learning_rate": 1.5216971155425475e-06, "loss": 4.3409, "step": 906 }, { "epoch": 0.9233901756172054, "grad_norm": 10.320602416992188, "learning_rate": 1.4826244099596986e-06, "loss": 2.7963, "step": 907 }, { "epoch": 0.9244082463731229, "grad_norm": 9.795669555664062, "learning_rate": 1.4440523367887871e-06, "loss": 2.7568, "step": 908 }, { "epoch": 0.9254263171290404, "grad_norm": 5.081234931945801, "learning_rate": 1.4059812940404093e-06, "loss": 1.1005, "step": 909 }, { "epoch": 0.926444387884958, "grad_norm": 5.965490818023682, "learning_rate": 1.3684116745552423e-06, "loss": 1.6534, "step": 910 }, { "epoch": 0.9274624586408755, "grad_norm": 8.321152687072754, "learning_rate": 1.33134386599994e-06, "loss": 1.6702, "step": 911 }, { "epoch": 0.928480529396793, "grad_norm": 6.844864845275879, "learning_rate": 1.2947782508631822e-06, "loss": 1.5389, "step": 912 }, { "epoch": 0.9294986001527106, "grad_norm": 7.344895362854004, "learning_rate": 1.2587152064516827e-06, "loss": 1.6019, "step": 913 }, { "epoch": 0.9305166709086281, "grad_norm": 6.270571708679199, "learning_rate": 1.223155104886342e-06, "loss": 1.322, "step": 914 }, { "epoch": 0.9315347416645456, "grad_norm": 7.117447853088379, "learning_rate": 1.1880983130983626e-06, "loss": 1.6418, "step": 915 }, { "epoch": 0.9325528124204632, "grad_norm": 8.100106239318848, "learning_rate": 1.1535451928254947e-06, "loss": 2.1211, "step": 916 }, { "epoch": 0.9335708831763807, "grad_norm": 9.206981658935547, "learning_rate": 1.1194961006082972e-06, "loss": 2.2021, "step": 917 }, { "epoch": 0.9345889539322983, "grad_norm": 5.570037364959717, "learning_rate": 1.085951387786438e-06, "loss": 1.5467, "step": 918 }, { "epoch": 0.9356070246882159, "grad_norm": 8.013311386108398, "learning_rate": 1.0529114004951047e-06, "loss": 1.9642, "step": 919 }, { "epoch": 0.9366250954441334, "grad_norm": 6.999322891235352, "learning_rate": 1.0203764796614058e-06, "loss": 1.772, "step": 920 }, { "epoch": 0.937643166200051, "grad_norm": 7.812559604644775, "learning_rate": 9.883469610008577e-07, "loss": 1.9127, "step": 921 }, { "epoch": 0.9386612369559685, "grad_norm": 9.275822639465332, "learning_rate": 9.568231750139212e-07, "loss": 2.3705, "step": 922 }, { "epoch": 0.939679307711886, "grad_norm": 7.6475372314453125, "learning_rate": 9.258054469825972e-07, "loss": 2.306, "step": 923 }, { "epoch": 0.9406973784678035, "grad_norm": 7.288696765899658, "learning_rate": 8.952940969670809e-07, "loss": 1.6429, "step": 924 }, { "epoch": 0.9417154492237211, "grad_norm": 7.795031547546387, "learning_rate": 8.652894398024136e-07, "loss": 1.9986, "step": 925 }, { "epoch": 0.9427335199796386, "grad_norm": 7.860483169555664, "learning_rate": 8.357917850952802e-07, "loss": 2.1139, "step": 926 }, { "epoch": 0.9437515907355561, "grad_norm": 7.814316749572754, "learning_rate": 8.06801437220811e-07, "loss": 2.1013, "step": 927 }, { "epoch": 0.9447696614914737, "grad_norm": 8.413445472717285, "learning_rate": 7.783186953194189e-07, "loss": 2.7227, "step": 928 }, { "epoch": 0.9457877322473912, "grad_norm": 7.9406328201293945, "learning_rate": 7.503438532937168e-07, "loss": 2.1969, "step": 929 }, { "epoch": 0.9468058030033087, "grad_norm": 8.008191108703613, "learning_rate": 7.228771998054995e-07, "loss": 2.2026, "step": 930 }, { "epoch": 0.9478238737592263, "grad_norm": 9.503705978393555, "learning_rate": 6.959190182727615e-07, "loss": 2.6732, "step": 931 }, { "epoch": 0.9488419445151438, "grad_norm": 9.472963333129883, "learning_rate": 6.694695868667556e-07, "loss": 3.0515, "step": 932 }, { "epoch": 0.9498600152710613, "grad_norm": 7.070324420928955, "learning_rate": 6.43529178509139e-07, "loss": 1.9813, "step": 933 }, { "epoch": 0.9508780860269789, "grad_norm": 9.686485290527344, "learning_rate": 6.180980608691655e-07, "loss": 2.8315, "step": 934 }, { "epoch": 0.9518961567828964, "grad_norm": 8.254791259765625, "learning_rate": 5.931764963608866e-07, "loss": 2.2097, "step": 935 }, { "epoch": 0.9529142275388139, "grad_norm": 8.103293418884277, "learning_rate": 5.687647421404874e-07, "loss": 2.4556, "step": 936 }, { "epoch": 0.9539322982947315, "grad_norm": 7.946092128753662, "learning_rate": 5.448630501036112e-07, "loss": 1.7899, "step": 937 }, { "epoch": 0.954950369050649, "grad_norm": 7.618176460266113, "learning_rate": 5.214716668827557e-07, "loss": 2.3681, "step": 938 }, { "epoch": 0.9559684398065665, "grad_norm": 5.380380153656006, "learning_rate": 4.985908338447476e-07, "loss": 1.7725, "step": 939 }, { "epoch": 0.9569865105624841, "grad_norm": 7.778873920440674, "learning_rate": 4.762207870882218e-07, "loss": 1.8244, "step": 940 }, { "epoch": 0.9580045813184016, "grad_norm": 9.022889137268066, "learning_rate": 4.543617574412184e-07, "loss": 2.5223, "step": 941 }, { "epoch": 0.9590226520743191, "grad_norm": 6.915138244628906, "learning_rate": 4.3301397045877876e-07, "loss": 1.5739, "step": 942 }, { "epoch": 0.9600407228302367, "grad_norm": 7.640623092651367, "learning_rate": 4.121776464206251e-07, "loss": 2.0568, "step": 943 }, { "epoch": 0.9610587935861542, "grad_norm": 7.426476955413818, "learning_rate": 3.9185300032889006e-07, "loss": 1.8825, "step": 944 }, { "epoch": 0.9620768643420717, "grad_norm": 8.864518165588379, "learning_rate": 3.720402419058966e-07, "loss": 2.373, "step": 945 }, { "epoch": 0.9630949350979893, "grad_norm": 7.7088541984558105, "learning_rate": 3.5273957559199266e-07, "loss": 1.6487, "step": 946 }, { "epoch": 0.9641130058539068, "grad_norm": 7.193671703338623, "learning_rate": 3.339512005434309e-07, "loss": 1.8298, "step": 947 }, { "epoch": 0.9651310766098243, "grad_norm": 7.4829230308532715, "learning_rate": 3.1567531063033673e-07, "loss": 1.1332, "step": 948 }, { "epoch": 0.9661491473657419, "grad_norm": 27.006391525268555, "learning_rate": 2.979120944346936e-07, "loss": 2.0494, "step": 949 }, { "epoch": 0.9671672181216594, "grad_norm": 11.99468994140625, "learning_rate": 2.806617352483998e-07, "loss": 2.4674, "step": 950 }, { "epoch": 0.968185288877577, "grad_norm": 7.580793380737305, "learning_rate": 2.639244110713701e-07, "loss": 3.3584, "step": 951 }, { "epoch": 0.9692033596334946, "grad_norm": 9.658464431762695, "learning_rate": 2.4770029460970954e-07, "loss": 3.5755, "step": 952 }, { "epoch": 0.9702214303894121, "grad_norm": 9.27920150756836, "learning_rate": 2.319895532739369e-07, "loss": 2.8816, "step": 953 }, { "epoch": 0.9712395011453296, "grad_norm": 11.56139850616455, "learning_rate": 2.1679234917721946e-07, "loss": 3.1379, "step": 954 }, { "epoch": 0.9722575719012472, "grad_norm": 13.082681655883789, "learning_rate": 2.0210883913376334e-07, "loss": 3.5519, "step": 955 }, { "epoch": 0.9732756426571647, "grad_norm": 10.118450164794922, "learning_rate": 1.8793917465713684e-07, "loss": 2.1806, "step": 956 }, { "epoch": 0.9742937134130822, "grad_norm": 7.3693928718566895, "learning_rate": 1.742835019587441e-07, "loss": 1.6516, "step": 957 }, { "epoch": 0.9753117841689998, "grad_norm": 7.52052640914917, "learning_rate": 1.6114196194628172e-07, "loss": 1.8945, "step": 958 }, { "epoch": 0.9763298549249173, "grad_norm": 6.575441837310791, "learning_rate": 1.4851469022234e-07, "loss": 1.5346, "step": 959 }, { "epoch": 0.9773479256808348, "grad_norm": 5.139125823974609, "learning_rate": 1.3640181708293731e-07, "loss": 1.2349, "step": 960 }, { "epoch": 0.9783659964367524, "grad_norm": 5.116672992706299, "learning_rate": 1.2480346751622686e-07, "loss": 1.3371, "step": 961 }, { "epoch": 0.9793840671926699, "grad_norm": 6.816611289978027, "learning_rate": 1.1371976120118088e-07, "loss": 1.8166, "step": 962 }, { "epoch": 0.9804021379485874, "grad_norm": 5.658526420593262, "learning_rate": 1.0315081250636405e-07, "loss": 1.435, "step": 963 }, { "epoch": 0.981420208704505, "grad_norm": 5.805619239807129, "learning_rate": 9.309673048875089e-08, "loss": 1.473, "step": 964 }, { "epoch": 0.9824382794604225, "grad_norm": 8.438018798828125, "learning_rate": 8.355761889260461e-08, "loss": 1.9403, "step": 965 }, { "epoch": 0.98345635021634, "grad_norm": 6.625545501708984, "learning_rate": 7.453357614841117e-08, "loss": 1.7268, "step": 966 }, { "epoch": 0.9844744209722576, "grad_norm": 7.497494220733643, "learning_rate": 6.602469537183021e-08, "loss": 2.0333, "step": 967 }, { "epoch": 0.9854924917281751, "grad_norm": 7.167290210723877, "learning_rate": 5.8031064362795705e-08, "loss": 1.9103, "step": 968 }, { "epoch": 0.9865105624840926, "grad_norm": 8.998686790466309, "learning_rate": 5.0552765604544584e-08, "loss": 2.8485, "step": 969 }, { "epoch": 0.9875286332400102, "grad_norm": 7.887483596801758, "learning_rate": 4.358987626281175e-08, "loss": 2.5035, "step": 970 }, { "epoch": 0.9885467039959277, "grad_norm": 8.251432418823242, "learning_rate": 3.7142468185014104e-08, "loss": 2.1646, "step": 971 }, { "epoch": 0.9895647747518452, "grad_norm": 8.031188011169434, "learning_rate": 3.121060789951225e-08, "loss": 2.3445, "step": 972 }, { "epoch": 0.9905828455077628, "grad_norm": 10.717124938964844, "learning_rate": 2.5794356614922134e-08, "loss": 2.4195, "step": 973 }, { "epoch": 0.9916009162636803, "grad_norm": 7.7094950675964355, "learning_rate": 2.0893770219493346e-08, "loss": 2.2724, "step": 974 }, { "epoch": 0.9926189870195978, "grad_norm": 9.21927547454834, "learning_rate": 1.6508899280515134e-08, "loss": 2.9697, "step": 975 }, { "epoch": 0.9936370577755154, "grad_norm": 7.8263092041015625, "learning_rate": 1.2639789043805694e-08, "loss": 1.9429, "step": 976 }, { "epoch": 0.9946551285314329, "grad_norm": 8.359663009643555, "learning_rate": 9.286479433257e-09, "loss": 2.078, "step": 977 }, { "epoch": 0.9956731992873504, "grad_norm": 9.0786714553833, "learning_rate": 6.449005050390699e-09, "loss": 2.2142, "step": 978 }, { "epoch": 0.996691270043268, "grad_norm": 8.358176231384277, "learning_rate": 4.127395174036153e-09, "loss": 1.7529, "step": 979 }, { "epoch": 0.9977093407991855, "grad_norm": 8.492805480957031, "learning_rate": 2.321673760002918e-09, "loss": 2.3433, "step": 980 }, { "epoch": 0.998727411555103, "grad_norm": 10.61440372467041, "learning_rate": 1.0318594408476045e-09, "loss": 2.466, "step": 981 }, { "epoch": 0.9997454823110206, "grad_norm": 9.659395217895508, "learning_rate": 2.57965525674031e-10, "loss": 1.3249, "step": 982 }, { "epoch": 0.9997454823110206, "eval_loss": 0.5453814268112183, "eval_runtime": 50.2489, "eval_samples_per_second": 16.478, "eval_steps_per_second": 4.119, "step": 982 }, { "epoch": 1.000763553066938, "grad_norm": 6.880834102630615, "learning_rate": 0.0, "loss": 1.8668, "step": 983 } ], "logging_steps": 1, "max_steps": 983, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 246, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.031180351948718e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }