diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,62846 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999990712551615, + "eval_steps": 500, + "global_step": 44863, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001114493806200672, + "grad_norm": 0.933451771736145, + "learning_rate": 1.9999999387038595e-05, + "loss": 0.7486, + "step": 5 + }, + { + "epoch": 0.0002228987612401344, + "grad_norm": 0.8256990313529968, + "learning_rate": 1.999999754815445e-05, + "loss": 0.6341, + "step": 10 + }, + { + "epoch": 0.00033434814186020164, + "grad_norm": 0.9633138179779053, + "learning_rate": 1.99999944833478e-05, + "loss": 0.5651, + "step": 15 + }, + { + "epoch": 0.0004457975224802688, + "grad_norm": 0.5811278820037842, + "learning_rate": 1.999999019261901e-05, + "loss": 0.6119, + "step": 20 + }, + { + "epoch": 0.000557246903100336, + "grad_norm": 0.7817385196685791, + "learning_rate": 1.999998467596861e-05, + "loss": 0.5496, + "step": 25 + }, + { + "epoch": 0.0006686962837204033, + "grad_norm": 0.9045018553733826, + "learning_rate": 1.999997793339728e-05, + "loss": 0.6103, + "step": 30 + }, + { + "epoch": 0.0007801456643404704, + "grad_norm": 0.8104248046875, + "learning_rate": 1.999996996490584e-05, + "loss": 0.5454, + "step": 35 + }, + { + "epoch": 0.0008915950449605376, + "grad_norm": 0.6950963139533997, + "learning_rate": 1.9999960770495273e-05, + "loss": 0.4731, + "step": 40 + }, + { + "epoch": 0.0010030444255806049, + "grad_norm": 1.1192351579666138, + "learning_rate": 1.9999950350166698e-05, + "loss": 0.4454, + "step": 45 + }, + { + "epoch": 0.001114493806200672, + "grad_norm": 0.8689408302307129, + "learning_rate": 1.9999938703921403e-05, + "loss": 0.4379, + "step": 50 + }, + { + "epoch": 0.0012259431868207392, + "grad_norm": 0.7909908294677734, + "learning_rate": 1.9999925831760807e-05, + "loss": 0.4556, + "step": 55 + }, + { + "epoch": 0.0013373925674408065, + "grad_norm": 0.8446680903434753, + "learning_rate": 1.9999911733686496e-05, + "loss": 0.4575, + "step": 60 + }, + { + "epoch": 0.0014488419480608737, + "grad_norm": 1.1718509197235107, + "learning_rate": 1.999989640970019e-05, + "loss": 0.6243, + "step": 65 + }, + { + "epoch": 0.0015602913286809409, + "grad_norm": 0.540774405002594, + "learning_rate": 1.9999879859803775e-05, + "loss": 0.4596, + "step": 70 + }, + { + "epoch": 0.001671740709301008, + "grad_norm": 0.8521000742912292, + "learning_rate": 1.9999862083999276e-05, + "loss": 0.6575, + "step": 75 + }, + { + "epoch": 0.0017831900899210752, + "grad_norm": 0.9445694088935852, + "learning_rate": 1.999984308228887e-05, + "loss": 0.5966, + "step": 80 + }, + { + "epoch": 0.0018946394705411426, + "grad_norm": 0.7841966152191162, + "learning_rate": 1.999982285467489e-05, + "loss": 0.4704, + "step": 85 + }, + { + "epoch": 0.0020060888511612097, + "grad_norm": 0.7752586603164673, + "learning_rate": 1.9999801401159815e-05, + "loss": 0.609, + "step": 90 + }, + { + "epoch": 0.002117538231781277, + "grad_norm": 0.6920070648193359, + "learning_rate": 1.9999778721746276e-05, + "loss": 0.4078, + "step": 95 + }, + { + "epoch": 0.002228987612401344, + "grad_norm": 0.8577648401260376, + "learning_rate": 1.9999754816437052e-05, + "loss": 0.548, + "step": 100 + }, + { + "epoch": 0.002340436993021411, + "grad_norm": 0.7144039869308472, + "learning_rate": 1.999972968523507e-05, + "loss": 0.5046, + "step": 105 + }, + { + "epoch": 0.0024518863736414783, + "grad_norm": 0.8289719223976135, + "learning_rate": 1.999970332814342e-05, + "loss": 0.5335, + "step": 110 + }, + { + "epoch": 0.0025633357542615455, + "grad_norm": 0.7193917632102966, + "learning_rate": 1.9999675745165322e-05, + "loss": 0.493, + "step": 115 + }, + { + "epoch": 0.002674785134881613, + "grad_norm": 0.7550503015518188, + "learning_rate": 1.999964693630417e-05, + "loss": 0.5015, + "step": 120 + }, + { + "epoch": 0.0027862345155016803, + "grad_norm": 0.7383673191070557, + "learning_rate": 1.9999616901563486e-05, + "loss": 0.4139, + "step": 125 + }, + { + "epoch": 0.0028976838961217474, + "grad_norm": 0.5490477085113525, + "learning_rate": 1.999958564094695e-05, + "loss": 0.5196, + "step": 130 + }, + { + "epoch": 0.0030091332767418146, + "grad_norm": 0.6082173585891724, + "learning_rate": 1.9999553154458405e-05, + "loss": 0.5463, + "step": 135 + }, + { + "epoch": 0.0031205826573618817, + "grad_norm": 0.7065110206604004, + "learning_rate": 1.999951944210183e-05, + "loss": 0.6414, + "step": 140 + }, + { + "epoch": 0.003232032037981949, + "grad_norm": 0.9052994847297668, + "learning_rate": 1.9999484503881354e-05, + "loss": 0.5787, + "step": 145 + }, + { + "epoch": 0.003343481418602016, + "grad_norm": 0.8806249499320984, + "learning_rate": 1.9999448339801258e-05, + "loss": 0.5622, + "step": 150 + }, + { + "epoch": 0.003454930799222083, + "grad_norm": 0.8756687045097351, + "learning_rate": 1.9999410949865984e-05, + "loss": 0.5655, + "step": 155 + }, + { + "epoch": 0.0035663801798421504, + "grad_norm": 0.5419703722000122, + "learning_rate": 1.9999372334080108e-05, + "loss": 0.4233, + "step": 160 + }, + { + "epoch": 0.0036778295604622175, + "grad_norm": 0.6752704977989197, + "learning_rate": 1.9999332492448368e-05, + "loss": 0.5489, + "step": 165 + }, + { + "epoch": 0.003789278941082285, + "grad_norm": 0.8393696546554565, + "learning_rate": 1.999929142497565e-05, + "loss": 0.4443, + "step": 170 + }, + { + "epoch": 0.0039007283217023523, + "grad_norm": 0.6518959403038025, + "learning_rate": 1.9999249131666983e-05, + "loss": 0.5434, + "step": 175 + }, + { + "epoch": 0.004012177702322419, + "grad_norm": 0.7919310331344604, + "learning_rate": 1.9999205612527556e-05, + "loss": 0.4413, + "step": 180 + }, + { + "epoch": 0.004123627082942487, + "grad_norm": 0.7572973966598511, + "learning_rate": 1.9999160867562702e-05, + "loss": 0.4197, + "step": 185 + }, + { + "epoch": 0.004235076463562554, + "grad_norm": 0.8709051012992859, + "learning_rate": 1.999911489677791e-05, + "loss": 0.53, + "step": 190 + }, + { + "epoch": 0.004346525844182621, + "grad_norm": 0.7445703744888306, + "learning_rate": 1.999906770017881e-05, + "loss": 0.4986, + "step": 195 + }, + { + "epoch": 0.004457975224802688, + "grad_norm": 0.7933171391487122, + "learning_rate": 1.9999019277771192e-05, + "loss": 0.5453, + "step": 200 + }, + { + "epoch": 0.004569424605422755, + "grad_norm": 0.5811159610748291, + "learning_rate": 1.999896962956099e-05, + "loss": 0.5268, + "step": 205 + }, + { + "epoch": 0.004680873986042822, + "grad_norm": 0.7632311582565308, + "learning_rate": 1.9998918755554295e-05, + "loss": 0.548, + "step": 210 + }, + { + "epoch": 0.0047923233666628895, + "grad_norm": 0.6502264142036438, + "learning_rate": 1.999886665575734e-05, + "loss": 0.6303, + "step": 215 + }, + { + "epoch": 0.004903772747282957, + "grad_norm": 0.7639764547348022, + "learning_rate": 1.9998813330176507e-05, + "loss": 0.4808, + "step": 220 + }, + { + "epoch": 0.005015222127903024, + "grad_norm": 0.6532260179519653, + "learning_rate": 1.9998758778818342e-05, + "loss": 0.5684, + "step": 225 + }, + { + "epoch": 0.005126671508523091, + "grad_norm": 0.7039293646812439, + "learning_rate": 1.999870300168953e-05, + "loss": 0.4445, + "step": 230 + }, + { + "epoch": 0.005238120889143158, + "grad_norm": 0.5982739329338074, + "learning_rate": 1.999864599879691e-05, + "loss": 0.449, + "step": 235 + }, + { + "epoch": 0.005349570269763226, + "grad_norm": 0.7111895084381104, + "learning_rate": 1.9998587770147465e-05, + "loss": 0.5256, + "step": 240 + }, + { + "epoch": 0.005461019650383293, + "grad_norm": 0.5618811249732971, + "learning_rate": 1.9998528315748338e-05, + "loss": 0.4843, + "step": 245 + }, + { + "epoch": 0.0055724690310033605, + "grad_norm": 0.7400057315826416, + "learning_rate": 1.9998467635606813e-05, + "loss": 0.5675, + "step": 250 + }, + { + "epoch": 0.005683918411623428, + "grad_norm": 0.9986342191696167, + "learning_rate": 1.9998405729730338e-05, + "loss": 0.6181, + "step": 255 + }, + { + "epoch": 0.005795367792243495, + "grad_norm": 0.7148557305335999, + "learning_rate": 1.999834259812649e-05, + "loss": 0.451, + "step": 260 + }, + { + "epoch": 0.005906817172863562, + "grad_norm": 0.6029154062271118, + "learning_rate": 1.9998278240803018e-05, + "loss": 0.4362, + "step": 265 + }, + { + "epoch": 0.006018266553483629, + "grad_norm": 0.7607182264328003, + "learning_rate": 1.999821265776781e-05, + "loss": 0.5407, + "step": 270 + }, + { + "epoch": 0.006129715934103696, + "grad_norm": 0.7768608927726746, + "learning_rate": 1.9998145849028906e-05, + "loss": 0.7043, + "step": 275 + }, + { + "epoch": 0.0062411653147237635, + "grad_norm": 0.6231095790863037, + "learning_rate": 1.999807781459449e-05, + "loss": 0.4728, + "step": 280 + }, + { + "epoch": 0.006352614695343831, + "grad_norm": 0.5471903085708618, + "learning_rate": 1.999800855447291e-05, + "loss": 0.4453, + "step": 285 + }, + { + "epoch": 0.006464064075963898, + "grad_norm": 0.8246671557426453, + "learning_rate": 1.9997938068672652e-05, + "loss": 0.5344, + "step": 290 + }, + { + "epoch": 0.006575513456583965, + "grad_norm": 0.5547112822532654, + "learning_rate": 1.999786635720236e-05, + "loss": 0.4327, + "step": 295 + }, + { + "epoch": 0.006686962837204032, + "grad_norm": 0.7208492755889893, + "learning_rate": 1.9997793420070826e-05, + "loss": 0.5299, + "step": 300 + }, + { + "epoch": 0.006798412217824099, + "grad_norm": 0.7412981390953064, + "learning_rate": 1.9997719257286988e-05, + "loss": 0.5218, + "step": 305 + }, + { + "epoch": 0.006909861598444166, + "grad_norm": 0.7224171161651611, + "learning_rate": 1.999764386885994e-05, + "loss": 0.5424, + "step": 310 + }, + { + "epoch": 0.0070213109790642336, + "grad_norm": 0.6104803681373596, + "learning_rate": 1.9997567254798925e-05, + "loss": 0.5609, + "step": 315 + }, + { + "epoch": 0.007132760359684301, + "grad_norm": 0.6950530409812927, + "learning_rate": 1.999748941511333e-05, + "loss": 0.4333, + "step": 320 + }, + { + "epoch": 0.007244209740304368, + "grad_norm": 1.157394289970398, + "learning_rate": 1.9997410349812705e-05, + "loss": 0.4037, + "step": 325 + }, + { + "epoch": 0.007355659120924435, + "grad_norm": 0.6917374134063721, + "learning_rate": 1.9997330058906738e-05, + "loss": 0.3879, + "step": 330 + }, + { + "epoch": 0.007467108501544503, + "grad_norm": 0.7056661248207092, + "learning_rate": 1.9997248542405273e-05, + "loss": 0.4963, + "step": 335 + }, + { + "epoch": 0.00757855788216457, + "grad_norm": 0.8213403820991516, + "learning_rate": 1.9997165800318304e-05, + "loss": 0.4699, + "step": 340 + }, + { + "epoch": 0.007690007262784637, + "grad_norm": 0.7679407000541687, + "learning_rate": 1.9997081832655976e-05, + "loss": 0.4822, + "step": 345 + }, + { + "epoch": 0.0078014566434047045, + "grad_norm": 0.9080938100814819, + "learning_rate": 1.9996996639428578e-05, + "loss": 0.4325, + "step": 350 + }, + { + "epoch": 0.00791290602402477, + "grad_norm": 0.5678904056549072, + "learning_rate": 1.999691022064656e-05, + "loss": 0.4739, + "step": 355 + }, + { + "epoch": 0.008024355404644839, + "grad_norm": 0.567152202129364, + "learning_rate": 1.999682257632051e-05, + "loss": 0.4516, + "step": 360 + }, + { + "epoch": 0.008135804785264905, + "grad_norm": 0.7391321659088135, + "learning_rate": 1.999673370646118e-05, + "loss": 0.4987, + "step": 365 + }, + { + "epoch": 0.008247254165884973, + "grad_norm": 0.7149109840393066, + "learning_rate": 1.9996643611079457e-05, + "loss": 0.6195, + "step": 370 + }, + { + "epoch": 0.00835870354650504, + "grad_norm": 0.7918938994407654, + "learning_rate": 1.999655229018639e-05, + "loss": 0.4269, + "step": 375 + }, + { + "epoch": 0.008470152927125107, + "grad_norm": 0.6578711271286011, + "learning_rate": 1.9996459743793175e-05, + "loss": 0.4386, + "step": 380 + }, + { + "epoch": 0.008581602307745176, + "grad_norm": 0.518247663974762, + "learning_rate": 1.9996365971911155e-05, + "loss": 0.4827, + "step": 385 + }, + { + "epoch": 0.008693051688365242, + "grad_norm": 0.8139423131942749, + "learning_rate": 1.9996270974551824e-05, + "loss": 0.4481, + "step": 390 + }, + { + "epoch": 0.00880450106898531, + "grad_norm": 0.728428304195404, + "learning_rate": 1.999617475172684e-05, + "loss": 0.4442, + "step": 395 + }, + { + "epoch": 0.008915950449605376, + "grad_norm": 0.6173394918441772, + "learning_rate": 1.999607730344798e-05, + "loss": 0.6014, + "step": 400 + }, + { + "epoch": 0.009027399830225444, + "grad_norm": 0.5675866603851318, + "learning_rate": 1.9995978629727207e-05, + "loss": 0.5472, + "step": 405 + }, + { + "epoch": 0.00913884921084551, + "grad_norm": 0.8469188809394836, + "learning_rate": 1.999587873057661e-05, + "loss": 0.4522, + "step": 410 + }, + { + "epoch": 0.009250298591465578, + "grad_norm": 0.5388155579566956, + "learning_rate": 1.9995777606008434e-05, + "loss": 0.3067, + "step": 415 + }, + { + "epoch": 0.009361747972085645, + "grad_norm": 0.6457868218421936, + "learning_rate": 1.9995675256035082e-05, + "loss": 0.4966, + "step": 420 + }, + { + "epoch": 0.009473197352705713, + "grad_norm": 0.586306095123291, + "learning_rate": 1.9995571680669096e-05, + "loss": 0.5018, + "step": 425 + }, + { + "epoch": 0.009584646733325779, + "grad_norm": 0.797963559627533, + "learning_rate": 1.999546687992318e-05, + "loss": 0.4784, + "step": 430 + }, + { + "epoch": 0.009696096113945847, + "grad_norm": 0.6302655339241028, + "learning_rate": 1.9995360853810172e-05, + "loss": 0.3656, + "step": 435 + }, + { + "epoch": 0.009807545494565913, + "grad_norm": 0.6582913398742676, + "learning_rate": 1.9995253602343082e-05, + "loss": 0.6217, + "step": 440 + }, + { + "epoch": 0.009918994875185981, + "grad_norm": 0.663432776927948, + "learning_rate": 1.9995145125535047e-05, + "loss": 0.368, + "step": 445 + }, + { + "epoch": 0.010030444255806048, + "grad_norm": 0.712199330329895, + "learning_rate": 1.9995035423399373e-05, + "loss": 0.6665, + "step": 450 + }, + { + "epoch": 0.010141893636426116, + "grad_norm": 0.6557328104972839, + "learning_rate": 1.9994924495949503e-05, + "loss": 0.5076, + "step": 455 + }, + { + "epoch": 0.010253343017046182, + "grad_norm": 0.7125101685523987, + "learning_rate": 1.999481234319904e-05, + "loss": 0.4728, + "step": 460 + }, + { + "epoch": 0.01036479239766625, + "grad_norm": 0.6319306492805481, + "learning_rate": 1.9994698965161736e-05, + "loss": 0.5616, + "step": 465 + }, + { + "epoch": 0.010476241778286316, + "grad_norm": 0.6128279566764832, + "learning_rate": 1.999458436185148e-05, + "loss": 0.4844, + "step": 470 + }, + { + "epoch": 0.010587691158906384, + "grad_norm": 0.6070976853370667, + "learning_rate": 1.999446853328233e-05, + "loss": 0.551, + "step": 475 + }, + { + "epoch": 0.010699140539526452, + "grad_norm": 0.9927272796630859, + "learning_rate": 1.9994351479468484e-05, + "loss": 0.477, + "step": 480 + }, + { + "epoch": 0.010810589920146519, + "grad_norm": 0.6825286746025085, + "learning_rate": 1.9994233200424288e-05, + "loss": 0.5127, + "step": 485 + }, + { + "epoch": 0.010922039300766587, + "grad_norm": 0.6259206533432007, + "learning_rate": 1.999411369616425e-05, + "loss": 0.5682, + "step": 490 + }, + { + "epoch": 0.011033488681386653, + "grad_norm": 0.6004756689071655, + "learning_rate": 1.9993992966703012e-05, + "loss": 0.557, + "step": 495 + }, + { + "epoch": 0.011144938062006721, + "grad_norm": 0.4804564416408539, + "learning_rate": 1.999387101205538e-05, + "loss": 0.4895, + "step": 500 + }, + { + "epoch": 0.011256387442626787, + "grad_norm": 0.8419182896614075, + "learning_rate": 1.99937478322363e-05, + "loss": 0.5776, + "step": 505 + }, + { + "epoch": 0.011367836823246855, + "grad_norm": 0.6336909532546997, + "learning_rate": 1.999362342726088e-05, + "loss": 0.4448, + "step": 510 + }, + { + "epoch": 0.011479286203866922, + "grad_norm": 0.7228277921676636, + "learning_rate": 1.9993497797144363e-05, + "loss": 0.6556, + "step": 515 + }, + { + "epoch": 0.01159073558448699, + "grad_norm": 0.7878555655479431, + "learning_rate": 1.9993370941902154e-05, + "loss": 0.4965, + "step": 520 + }, + { + "epoch": 0.011702184965107056, + "grad_norm": 0.7645102739334106, + "learning_rate": 1.999324286154981e-05, + "loss": 0.3559, + "step": 525 + }, + { + "epoch": 0.011813634345727124, + "grad_norm": 0.8857200145721436, + "learning_rate": 1.9993113556103024e-05, + "loss": 0.3633, + "step": 530 + }, + { + "epoch": 0.01192508372634719, + "grad_norm": 0.6778533458709717, + "learning_rate": 1.9992983025577647e-05, + "loss": 0.4346, + "step": 535 + }, + { + "epoch": 0.012036533106967258, + "grad_norm": 0.5672030448913574, + "learning_rate": 1.9992851269989687e-05, + "loss": 0.488, + "step": 540 + }, + { + "epoch": 0.012147982487587325, + "grad_norm": 0.5006288886070251, + "learning_rate": 1.9992718289355296e-05, + "loss": 0.4945, + "step": 545 + }, + { + "epoch": 0.012259431868207393, + "grad_norm": 0.6481898427009583, + "learning_rate": 1.9992584083690777e-05, + "loss": 0.477, + "step": 550 + }, + { + "epoch": 0.012370881248827459, + "grad_norm": 0.5722131729125977, + "learning_rate": 1.9992448653012576e-05, + "loss": 0.3468, + "step": 555 + }, + { + "epoch": 0.012482330629447527, + "grad_norm": 0.6108018755912781, + "learning_rate": 1.9992311997337302e-05, + "loss": 0.4508, + "step": 560 + }, + { + "epoch": 0.012593780010067593, + "grad_norm": 0.5560242533683777, + "learning_rate": 1.9992174116681706e-05, + "loss": 0.4088, + "step": 565 + }, + { + "epoch": 0.012705229390687661, + "grad_norm": 0.7010998725891113, + "learning_rate": 1.999203501106269e-05, + "loss": 0.4018, + "step": 570 + }, + { + "epoch": 0.01281667877130773, + "grad_norm": 0.7132105827331543, + "learning_rate": 1.999189468049731e-05, + "loss": 0.4272, + "step": 575 + }, + { + "epoch": 0.012928128151927796, + "grad_norm": 0.628948986530304, + "learning_rate": 1.9991753125002766e-05, + "loss": 0.594, + "step": 580 + }, + { + "epoch": 0.013039577532547864, + "grad_norm": 0.6215829849243164, + "learning_rate": 1.9991610344596416e-05, + "loss": 0.5651, + "step": 585 + }, + { + "epoch": 0.01315102691316793, + "grad_norm": 0.47377511858940125, + "learning_rate": 1.9991466339295758e-05, + "loss": 0.514, + "step": 590 + }, + { + "epoch": 0.013262476293787998, + "grad_norm": 0.6468937397003174, + "learning_rate": 1.999132110911845e-05, + "loss": 0.4809, + "step": 595 + }, + { + "epoch": 0.013373925674408064, + "grad_norm": 0.5864740014076233, + "learning_rate": 1.9991174654082296e-05, + "loss": 0.4404, + "step": 600 + }, + { + "epoch": 0.013485375055028132, + "grad_norm": 0.6367695331573486, + "learning_rate": 1.9991026974205247e-05, + "loss": 0.3821, + "step": 605 + }, + { + "epoch": 0.013596824435648198, + "grad_norm": 0.6551821231842041, + "learning_rate": 1.999087806950541e-05, + "loss": 0.4741, + "step": 610 + }, + { + "epoch": 0.013708273816268267, + "grad_norm": 0.6759845614433289, + "learning_rate": 1.9990727940001043e-05, + "loss": 0.447, + "step": 615 + }, + { + "epoch": 0.013819723196888333, + "grad_norm": 0.5439283847808838, + "learning_rate": 1.9990576585710543e-05, + "loss": 0.3345, + "step": 620 + }, + { + "epoch": 0.0139311725775084, + "grad_norm": 0.7672117948532104, + "learning_rate": 1.9990424006652475e-05, + "loss": 0.4169, + "step": 625 + }, + { + "epoch": 0.014042621958128467, + "grad_norm": 0.5180301070213318, + "learning_rate": 1.999027020284553e-05, + "loss": 0.5224, + "step": 630 + }, + { + "epoch": 0.014154071338748535, + "grad_norm": 0.5923333168029785, + "learning_rate": 1.9990115174308577e-05, + "loss": 0.3539, + "step": 635 + }, + { + "epoch": 0.014265520719368601, + "grad_norm": 0.5843740105628967, + "learning_rate": 1.9989958921060613e-05, + "loss": 0.4949, + "step": 640 + }, + { + "epoch": 0.01437697009998867, + "grad_norm": 0.6855534911155701, + "learning_rate": 1.9989801443120796e-05, + "loss": 0.4121, + "step": 645 + }, + { + "epoch": 0.014488419480608736, + "grad_norm": 0.4852747321128845, + "learning_rate": 1.9989642740508435e-05, + "loss": 0.5217, + "step": 650 + }, + { + "epoch": 0.014599868861228804, + "grad_norm": 0.541433572769165, + "learning_rate": 1.998948281324298e-05, + "loss": 0.4084, + "step": 655 + }, + { + "epoch": 0.01471131824184887, + "grad_norm": 0.7746926546096802, + "learning_rate": 1.9989321661344036e-05, + "loss": 0.394, + "step": 660 + }, + { + "epoch": 0.014822767622468938, + "grad_norm": 0.6820917129516602, + "learning_rate": 1.9989159284831365e-05, + "loss": 0.5741, + "step": 665 + }, + { + "epoch": 0.014934217003089006, + "grad_norm": 0.7556787133216858, + "learning_rate": 1.998899568372487e-05, + "loss": 0.4957, + "step": 670 + }, + { + "epoch": 0.015045666383709072, + "grad_norm": 0.49043309688568115, + "learning_rate": 1.998883085804461e-05, + "loss": 0.3866, + "step": 675 + }, + { + "epoch": 0.01515711576432914, + "grad_norm": 0.5872183442115784, + "learning_rate": 1.9988664807810784e-05, + "loss": 0.5766, + "step": 680 + }, + { + "epoch": 0.015268565144949207, + "grad_norm": 0.6925109028816223, + "learning_rate": 1.9988497533043755e-05, + "loss": 0.4167, + "step": 685 + }, + { + "epoch": 0.015380014525569275, + "grad_norm": 0.7696684002876282, + "learning_rate": 1.998832903376403e-05, + "loss": 0.5314, + "step": 690 + }, + { + "epoch": 0.015491463906189341, + "grad_norm": 0.5626276135444641, + "learning_rate": 1.998815930999226e-05, + "loss": 0.5398, + "step": 695 + }, + { + "epoch": 0.015602913286809409, + "grad_norm": 0.7252472639083862, + "learning_rate": 1.998798836174926e-05, + "loss": 0.3591, + "step": 700 + }, + { + "epoch": 0.015714362667429475, + "grad_norm": 0.6303166747093201, + "learning_rate": 1.9987816189055977e-05, + "loss": 0.555, + "step": 705 + }, + { + "epoch": 0.01582581204804954, + "grad_norm": 0.5740355849266052, + "learning_rate": 1.9987642791933525e-05, + "loss": 0.4974, + "step": 710 + }, + { + "epoch": 0.01593726142866961, + "grad_norm": 0.49339422583580017, + "learning_rate": 1.998746817040316e-05, + "loss": 0.4822, + "step": 715 + }, + { + "epoch": 0.016048710809289678, + "grad_norm": 0.6468168497085571, + "learning_rate": 1.998729232448629e-05, + "loss": 0.4763, + "step": 720 + }, + { + "epoch": 0.016160160189909744, + "grad_norm": 0.7275257706642151, + "learning_rate": 1.998711525420447e-05, + "loss": 0.3596, + "step": 725 + }, + { + "epoch": 0.01627160957052981, + "grad_norm": 0.673309862613678, + "learning_rate": 1.9986936959579406e-05, + "loss": 0.5256, + "step": 730 + }, + { + "epoch": 0.01638305895114988, + "grad_norm": 0.6239281296730042, + "learning_rate": 1.998675744063296e-05, + "loss": 0.4836, + "step": 735 + }, + { + "epoch": 0.016494508331769946, + "grad_norm": 0.47714993357658386, + "learning_rate": 1.9986576697387142e-05, + "loss": 0.4293, + "step": 740 + }, + { + "epoch": 0.016605957712390013, + "grad_norm": 0.6969321370124817, + "learning_rate": 1.9986394729864102e-05, + "loss": 0.4619, + "step": 745 + }, + { + "epoch": 0.01671740709301008, + "grad_norm": 0.6592143177986145, + "learning_rate": 1.998621153808615e-05, + "loss": 0.4271, + "step": 750 + }, + { + "epoch": 0.01682885647363015, + "grad_norm": 0.3665013611316681, + "learning_rate": 1.9986027122075746e-05, + "loss": 0.4901, + "step": 755 + }, + { + "epoch": 0.016940305854250215, + "grad_norm": 0.5377305150032043, + "learning_rate": 1.9985841481855495e-05, + "loss": 0.4967, + "step": 760 + }, + { + "epoch": 0.01705175523487028, + "grad_norm": 0.5931578278541565, + "learning_rate": 1.998565461744816e-05, + "loss": 0.3903, + "step": 765 + }, + { + "epoch": 0.01716320461549035, + "grad_norm": 0.7513961791992188, + "learning_rate": 1.9985466528876646e-05, + "loss": 0.5286, + "step": 770 + }, + { + "epoch": 0.017274653996110417, + "grad_norm": 0.708732008934021, + "learning_rate": 1.998527721616401e-05, + "loss": 0.5049, + "step": 775 + }, + { + "epoch": 0.017386103376730484, + "grad_norm": 0.5988253951072693, + "learning_rate": 1.9985086679333462e-05, + "loss": 0.4331, + "step": 780 + }, + { + "epoch": 0.01749755275735055, + "grad_norm": 0.6259390711784363, + "learning_rate": 1.998489491840836e-05, + "loss": 0.4295, + "step": 785 + }, + { + "epoch": 0.01760900213797062, + "grad_norm": 0.9363119602203369, + "learning_rate": 1.9984701933412217e-05, + "loss": 0.4609, + "step": 790 + }, + { + "epoch": 0.017720451518590686, + "grad_norm": 0.6658714413642883, + "learning_rate": 1.9984507724368682e-05, + "loss": 0.5327, + "step": 795 + }, + { + "epoch": 0.017831900899210752, + "grad_norm": 0.6230865716934204, + "learning_rate": 1.998431229130157e-05, + "loss": 0.561, + "step": 800 + }, + { + "epoch": 0.01794335027983082, + "grad_norm": 0.44577354192733765, + "learning_rate": 1.998411563423484e-05, + "loss": 0.4457, + "step": 805 + }, + { + "epoch": 0.01805479966045089, + "grad_norm": 0.6300340890884399, + "learning_rate": 1.9983917753192596e-05, + "loss": 0.5296, + "step": 810 + }, + { + "epoch": 0.018166249041070955, + "grad_norm": 0.6478850245475769, + "learning_rate": 1.99837186481991e-05, + "loss": 0.4569, + "step": 815 + }, + { + "epoch": 0.01827769842169102, + "grad_norm": 0.5392821431159973, + "learning_rate": 1.998351831927876e-05, + "loss": 0.4988, + "step": 820 + }, + { + "epoch": 0.018389147802311087, + "grad_norm": 0.6933488845825195, + "learning_rate": 1.998331676645614e-05, + "loss": 0.5505, + "step": 825 + }, + { + "epoch": 0.018500597182931157, + "grad_norm": 0.6098037362098694, + "learning_rate": 1.9983113989755936e-05, + "loss": 0.4964, + "step": 830 + }, + { + "epoch": 0.018612046563551223, + "grad_norm": 0.7463369369506836, + "learning_rate": 1.9982909989203022e-05, + "loss": 0.5317, + "step": 835 + }, + { + "epoch": 0.01872349594417129, + "grad_norm": 0.5679046511650085, + "learning_rate": 1.9982704764822394e-05, + "loss": 0.4199, + "step": 840 + }, + { + "epoch": 0.018834945324791356, + "grad_norm": 0.5825939178466797, + "learning_rate": 1.9982498316639217e-05, + "loss": 0.4962, + "step": 845 + }, + { + "epoch": 0.018946394705411426, + "grad_norm": 0.771053671836853, + "learning_rate": 1.9982290644678805e-05, + "loss": 0.4801, + "step": 850 + }, + { + "epoch": 0.019057844086031492, + "grad_norm": 0.9269528388977051, + "learning_rate": 1.9982081748966604e-05, + "loss": 0.4249, + "step": 855 + }, + { + "epoch": 0.019169293466651558, + "grad_norm": 0.5291380882263184, + "learning_rate": 1.9981871629528236e-05, + "loss": 0.4816, + "step": 860 + }, + { + "epoch": 0.019280742847271628, + "grad_norm": 0.8677108287811279, + "learning_rate": 1.9981660286389452e-05, + "loss": 0.5593, + "step": 865 + }, + { + "epoch": 0.019392192227891694, + "grad_norm": 0.6123939156532288, + "learning_rate": 1.9981447719576163e-05, + "loss": 0.5049, + "step": 870 + }, + { + "epoch": 0.01950364160851176, + "grad_norm": 0.7051258683204651, + "learning_rate": 1.998123392911443e-05, + "loss": 0.4572, + "step": 875 + }, + { + "epoch": 0.019615090989131827, + "grad_norm": 0.7395469546318054, + "learning_rate": 1.998101891503046e-05, + "loss": 0.4729, + "step": 880 + }, + { + "epoch": 0.019726540369751897, + "grad_norm": 0.5871098041534424, + "learning_rate": 1.998080267735061e-05, + "loss": 0.3922, + "step": 885 + }, + { + "epoch": 0.019837989750371963, + "grad_norm": 0.6082477569580078, + "learning_rate": 1.9980585216101397e-05, + "loss": 0.4535, + "step": 890 + }, + { + "epoch": 0.01994943913099203, + "grad_norm": 0.5871613025665283, + "learning_rate": 1.998036653130947e-05, + "loss": 0.4683, + "step": 895 + }, + { + "epoch": 0.020060888511612095, + "grad_norm": 0.48536795377731323, + "learning_rate": 1.9980146623001645e-05, + "loss": 0.639, + "step": 900 + }, + { + "epoch": 0.020172337892232165, + "grad_norm": 0.6489545106887817, + "learning_rate": 1.997992549120488e-05, + "loss": 0.451, + "step": 905 + }, + { + "epoch": 0.02028378727285223, + "grad_norm": 0.4703137278556824, + "learning_rate": 1.997970313594628e-05, + "loss": 0.4035, + "step": 910 + }, + { + "epoch": 0.020395236653472298, + "grad_norm": 0.4589080512523651, + "learning_rate": 1.997947955725311e-05, + "loss": 0.5589, + "step": 915 + }, + { + "epoch": 0.020506686034092364, + "grad_norm": 0.5990431904792786, + "learning_rate": 1.9979254755152774e-05, + "loss": 0.4947, + "step": 920 + }, + { + "epoch": 0.020618135414712434, + "grad_norm": 0.5349087715148926, + "learning_rate": 1.9979028729672832e-05, + "loss": 0.51, + "step": 925 + }, + { + "epoch": 0.0207295847953325, + "grad_norm": 0.6614366769790649, + "learning_rate": 1.9978801480840996e-05, + "loss": 0.5226, + "step": 930 + }, + { + "epoch": 0.020841034175952566, + "grad_norm": 0.5134681463241577, + "learning_rate": 1.9978573008685122e-05, + "loss": 0.4575, + "step": 935 + }, + { + "epoch": 0.020952483556572633, + "grad_norm": 0.8158881068229675, + "learning_rate": 1.9978343313233223e-05, + "loss": 0.4214, + "step": 940 + }, + { + "epoch": 0.021063932937192702, + "grad_norm": 0.47573769092559814, + "learning_rate": 1.9978112394513453e-05, + "loss": 0.5866, + "step": 945 + }, + { + "epoch": 0.02117538231781277, + "grad_norm": 0.7340862154960632, + "learning_rate": 1.997788025255412e-05, + "loss": 0.5066, + "step": 950 + }, + { + "epoch": 0.021286831698432835, + "grad_norm": 0.8693744540214539, + "learning_rate": 1.997764688738369e-05, + "loss": 0.5558, + "step": 955 + }, + { + "epoch": 0.021398281079052905, + "grad_norm": 0.42990386486053467, + "learning_rate": 1.9977412299030764e-05, + "loss": 0.5234, + "step": 960 + }, + { + "epoch": 0.02150973045967297, + "grad_norm": 0.7284458875656128, + "learning_rate": 1.9977176487524104e-05, + "loss": 0.4541, + "step": 965 + }, + { + "epoch": 0.021621179840293037, + "grad_norm": 0.7727136611938477, + "learning_rate": 1.997693945289262e-05, + "loss": 0.4682, + "step": 970 + }, + { + "epoch": 0.021732629220913104, + "grad_norm": 0.48860886693000793, + "learning_rate": 1.9976701195165367e-05, + "loss": 0.4349, + "step": 975 + }, + { + "epoch": 0.021844078601533173, + "grad_norm": 0.41914358735084534, + "learning_rate": 1.997646171437156e-05, + "loss": 0.4263, + "step": 980 + }, + { + "epoch": 0.02195552798215324, + "grad_norm": 0.6689707040786743, + "learning_rate": 1.997622101054055e-05, + "loss": 0.5567, + "step": 985 + }, + { + "epoch": 0.022066977362773306, + "grad_norm": 0.7929695844650269, + "learning_rate": 1.997597908370185e-05, + "loss": 0.4281, + "step": 990 + }, + { + "epoch": 0.022178426743393372, + "grad_norm": 0.48860880732536316, + "learning_rate": 1.9975735933885115e-05, + "loss": 0.4514, + "step": 995 + }, + { + "epoch": 0.022289876124013442, + "grad_norm": 0.5418014526367188, + "learning_rate": 1.9975491561120158e-05, + "loss": 0.4661, + "step": 1000 + }, + { + "epoch": 0.02240132550463351, + "grad_norm": 0.7025567293167114, + "learning_rate": 1.997524596543693e-05, + "loss": 0.5681, + "step": 1005 + }, + { + "epoch": 0.022512774885253575, + "grad_norm": 0.5372076034545898, + "learning_rate": 1.997499914686555e-05, + "loss": 0.4425, + "step": 1010 + }, + { + "epoch": 0.02262422426587364, + "grad_norm": 0.5473577976226807, + "learning_rate": 1.9974751105436266e-05, + "loss": 0.4982, + "step": 1015 + }, + { + "epoch": 0.02273567364649371, + "grad_norm": 0.47598281502723694, + "learning_rate": 1.997450184117949e-05, + "loss": 0.6274, + "step": 1020 + }, + { + "epoch": 0.022847123027113777, + "grad_norm": 0.585905909538269, + "learning_rate": 1.997425135412578e-05, + "loss": 0.5617, + "step": 1025 + }, + { + "epoch": 0.022958572407733843, + "grad_norm": 0.5420157313346863, + "learning_rate": 1.9973999644305846e-05, + "loss": 0.4755, + "step": 1030 + }, + { + "epoch": 0.02307002178835391, + "grad_norm": 0.6367788314819336, + "learning_rate": 1.9973746711750538e-05, + "loss": 0.5466, + "step": 1035 + }, + { + "epoch": 0.02318147116897398, + "grad_norm": 0.5344269275665283, + "learning_rate": 1.997349255649087e-05, + "loss": 0.4518, + "step": 1040 + }, + { + "epoch": 0.023292920549594046, + "grad_norm": 0.466888964176178, + "learning_rate": 1.9973237178558003e-05, + "loss": 0.3619, + "step": 1045 + }, + { + "epoch": 0.023404369930214112, + "grad_norm": 0.6306031346321106, + "learning_rate": 1.9972980577983233e-05, + "loss": 0.4003, + "step": 1050 + }, + { + "epoch": 0.02351581931083418, + "grad_norm": 0.617495596408844, + "learning_rate": 1.9972722754798028e-05, + "loss": 0.5175, + "step": 1055 + }, + { + "epoch": 0.023627268691454248, + "grad_norm": 0.5849382877349854, + "learning_rate": 1.9972463709033988e-05, + "loss": 0.4024, + "step": 1060 + }, + { + "epoch": 0.023738718072074314, + "grad_norm": 0.556148886680603, + "learning_rate": 1.9972203440722877e-05, + "loss": 0.3669, + "step": 1065 + }, + { + "epoch": 0.02385016745269438, + "grad_norm": 0.9622534513473511, + "learning_rate": 1.9971941949896598e-05, + "loss": 0.2972, + "step": 1070 + }, + { + "epoch": 0.02396161683331445, + "grad_norm": 0.4846252501010895, + "learning_rate": 1.99716792365872e-05, + "loss": 0.43, + "step": 1075 + }, + { + "epoch": 0.024073066213934517, + "grad_norm": 0.6357453465461731, + "learning_rate": 1.9971415300826903e-05, + "loss": 0.5174, + "step": 1080 + }, + { + "epoch": 0.024184515594554583, + "grad_norm": 0.47302985191345215, + "learning_rate": 1.9971150142648056e-05, + "loss": 0.3586, + "step": 1085 + }, + { + "epoch": 0.02429596497517465, + "grad_norm": 0.6432552337646484, + "learning_rate": 1.997088376208317e-05, + "loss": 0.4996, + "step": 1090 + }, + { + "epoch": 0.02440741435579472, + "grad_norm": 0.6288877129554749, + "learning_rate": 1.9970616159164896e-05, + "loss": 0.5208, + "step": 1095 + }, + { + "epoch": 0.024518863736414785, + "grad_norm": 0.6514771580696106, + "learning_rate": 1.997034733392604e-05, + "loss": 0.4348, + "step": 1100 + }, + { + "epoch": 0.02463031311703485, + "grad_norm": 0.6467877626419067, + "learning_rate": 1.997007728639956e-05, + "loss": 0.5967, + "step": 1105 + }, + { + "epoch": 0.024741762497654918, + "grad_norm": 0.547593355178833, + "learning_rate": 1.9969806016618566e-05, + "loss": 0.4113, + "step": 1110 + }, + { + "epoch": 0.024853211878274988, + "grad_norm": 0.6538119316101074, + "learning_rate": 1.9969533524616306e-05, + "loss": 0.4445, + "step": 1115 + }, + { + "epoch": 0.024964661258895054, + "grad_norm": 0.7080554962158203, + "learning_rate": 1.9969259810426192e-05, + "loss": 0.3812, + "step": 1120 + }, + { + "epoch": 0.02507611063951512, + "grad_norm": 0.45334309339523315, + "learning_rate": 1.9968984874081773e-05, + "loss": 0.4389, + "step": 1125 + }, + { + "epoch": 0.025187560020135186, + "grad_norm": 0.4973078966140747, + "learning_rate": 1.9968708715616757e-05, + "loss": 0.5143, + "step": 1130 + }, + { + "epoch": 0.025299009400755256, + "grad_norm": 0.807214081287384, + "learning_rate": 1.9968431335065006e-05, + "loss": 0.4306, + "step": 1135 + }, + { + "epoch": 0.025410458781375322, + "grad_norm": 0.6170802712440491, + "learning_rate": 1.996815273246051e-05, + "loss": 0.4428, + "step": 1140 + }, + { + "epoch": 0.02552190816199539, + "grad_norm": 0.5940778255462646, + "learning_rate": 1.9967872907837433e-05, + "loss": 0.5215, + "step": 1145 + }, + { + "epoch": 0.02563335754261546, + "grad_norm": 0.48376041650772095, + "learning_rate": 1.9967591861230077e-05, + "loss": 0.4973, + "step": 1150 + }, + { + "epoch": 0.025744806923235525, + "grad_norm": 0.4939127564430237, + "learning_rate": 1.99673095926729e-05, + "loss": 0.3881, + "step": 1155 + }, + { + "epoch": 0.02585625630385559, + "grad_norm": 0.6013267636299133, + "learning_rate": 1.9967026102200503e-05, + "loss": 0.3574, + "step": 1160 + }, + { + "epoch": 0.025967705684475657, + "grad_norm": 0.5341581106185913, + "learning_rate": 1.9966741389847634e-05, + "loss": 0.5351, + "step": 1165 + }, + { + "epoch": 0.026079155065095727, + "grad_norm": 0.4685159921646118, + "learning_rate": 1.9966455455649207e-05, + "loss": 0.3158, + "step": 1170 + }, + { + "epoch": 0.026190604445715793, + "grad_norm": 0.7413740158081055, + "learning_rate": 1.9966168299640268e-05, + "loss": 0.4989, + "step": 1175 + }, + { + "epoch": 0.02630205382633586, + "grad_norm": 0.46080586314201355, + "learning_rate": 1.9965879921856023e-05, + "loss": 0.5247, + "step": 1180 + }, + { + "epoch": 0.026413503206955926, + "grad_norm": 0.6611289381980896, + "learning_rate": 1.9965590322331825e-05, + "loss": 0.467, + "step": 1185 + }, + { + "epoch": 0.026524952587575996, + "grad_norm": 0.6633203029632568, + "learning_rate": 1.9965299501103178e-05, + "loss": 0.4813, + "step": 1190 + }, + { + "epoch": 0.026636401968196062, + "grad_norm": 0.5674930810928345, + "learning_rate": 1.9965007458205727e-05, + "loss": 0.4263, + "step": 1195 + }, + { + "epoch": 0.02674785134881613, + "grad_norm": 0.513952910900116, + "learning_rate": 1.9964714193675287e-05, + "loss": 0.416, + "step": 1200 + }, + { + "epoch": 0.026859300729436195, + "grad_norm": 0.6331111192703247, + "learning_rate": 1.9964419707547797e-05, + "loss": 0.383, + "step": 1205 + }, + { + "epoch": 0.026970750110056264, + "grad_norm": 0.5278674364089966, + "learning_rate": 1.9964123999859365e-05, + "loss": 0.515, + "step": 1210 + }, + { + "epoch": 0.02708219949067633, + "grad_norm": 0.963834822177887, + "learning_rate": 1.9963827070646245e-05, + "loss": 0.5118, + "step": 1215 + }, + { + "epoch": 0.027193648871296397, + "grad_norm": 0.7036752700805664, + "learning_rate": 1.996352891994483e-05, + "loss": 0.53, + "step": 1220 + }, + { + "epoch": 0.027305098251916463, + "grad_norm": 0.4653802216053009, + "learning_rate": 1.996322954779168e-05, + "loss": 0.4224, + "step": 1225 + }, + { + "epoch": 0.027416547632536533, + "grad_norm": 0.6808333992958069, + "learning_rate": 1.996292895422349e-05, + "loss": 0.5377, + "step": 1230 + }, + { + "epoch": 0.0275279970131566, + "grad_norm": 0.5279821157455444, + "learning_rate": 1.9962627139277114e-05, + "loss": 0.4485, + "step": 1235 + }, + { + "epoch": 0.027639446393776666, + "grad_norm": 0.40944090485572815, + "learning_rate": 1.996232410298955e-05, + "loss": 0.5459, + "step": 1240 + }, + { + "epoch": 0.027750895774396735, + "grad_norm": 0.6649956107139587, + "learning_rate": 1.9962019845397945e-05, + "loss": 0.539, + "step": 1245 + }, + { + "epoch": 0.0278623451550168, + "grad_norm": 0.5300599336624146, + "learning_rate": 1.9961714366539607e-05, + "loss": 0.5001, + "step": 1250 + }, + { + "epoch": 0.027973794535636868, + "grad_norm": 0.6410908102989197, + "learning_rate": 1.9961407666451978e-05, + "loss": 0.3778, + "step": 1255 + }, + { + "epoch": 0.028085243916256934, + "grad_norm": 0.5215335488319397, + "learning_rate": 1.996109974517266e-05, + "loss": 0.5084, + "step": 1260 + }, + { + "epoch": 0.028196693296877004, + "grad_norm": 0.7652701735496521, + "learning_rate": 1.99607906027394e-05, + "loss": 0.4056, + "step": 1265 + }, + { + "epoch": 0.02830814267749707, + "grad_norm": 0.5366575717926025, + "learning_rate": 1.9960480239190095e-05, + "loss": 0.4365, + "step": 1270 + }, + { + "epoch": 0.028419592058117137, + "grad_norm": 0.6422768235206604, + "learning_rate": 1.99601686545628e-05, + "loss": 0.4169, + "step": 1275 + }, + { + "epoch": 0.028531041438737203, + "grad_norm": 0.686603844165802, + "learning_rate": 1.9959855848895707e-05, + "loss": 0.3715, + "step": 1280 + }, + { + "epoch": 0.028642490819357273, + "grad_norm": 0.4541156589984894, + "learning_rate": 1.9959541822227164e-05, + "loss": 0.5185, + "step": 1285 + }, + { + "epoch": 0.02875394019997734, + "grad_norm": 0.44781601428985596, + "learning_rate": 1.995922657459567e-05, + "loss": 0.4079, + "step": 1290 + }, + { + "epoch": 0.028865389580597405, + "grad_norm": 0.487728089094162, + "learning_rate": 1.9958910106039875e-05, + "loss": 0.287, + "step": 1295 + }, + { + "epoch": 0.02897683896121747, + "grad_norm": 0.45344364643096924, + "learning_rate": 1.995859241659857e-05, + "loss": 0.4283, + "step": 1300 + }, + { + "epoch": 0.02908828834183754, + "grad_norm": 0.7202114462852478, + "learning_rate": 1.9958273506310703e-05, + "loss": 0.4479, + "step": 1305 + }, + { + "epoch": 0.029199737722457608, + "grad_norm": 0.5753505229949951, + "learning_rate": 1.9957953375215368e-05, + "loss": 0.4883, + "step": 1310 + }, + { + "epoch": 0.029311187103077674, + "grad_norm": 0.6523656249046326, + "learning_rate": 1.9957632023351815e-05, + "loss": 0.3157, + "step": 1315 + }, + { + "epoch": 0.02942263648369774, + "grad_norm": 0.5862640142440796, + "learning_rate": 1.995730945075944e-05, + "loss": 0.3809, + "step": 1320 + }, + { + "epoch": 0.02953408586431781, + "grad_norm": 0.6741471290588379, + "learning_rate": 1.995698565747778e-05, + "loss": 0.4532, + "step": 1325 + }, + { + "epoch": 0.029645535244937876, + "grad_norm": 0.6238347887992859, + "learning_rate": 1.9956660643546538e-05, + "loss": 0.4625, + "step": 1330 + }, + { + "epoch": 0.029756984625557942, + "grad_norm": 0.44492000341415405, + "learning_rate": 1.9956334409005553e-05, + "loss": 0.4245, + "step": 1335 + }, + { + "epoch": 0.029868434006178012, + "grad_norm": 0.5926109552383423, + "learning_rate": 1.9956006953894817e-05, + "loss": 0.3555, + "step": 1340 + }, + { + "epoch": 0.02997988338679808, + "grad_norm": 0.5521535277366638, + "learning_rate": 1.9955678278254483e-05, + "loss": 0.4892, + "step": 1345 + }, + { + "epoch": 0.030091332767418145, + "grad_norm": 0.5029247999191284, + "learning_rate": 1.9955348382124836e-05, + "loss": 0.4331, + "step": 1350 + }, + { + "epoch": 0.03020278214803821, + "grad_norm": 0.4701353907585144, + "learning_rate": 1.995501726554632e-05, + "loss": 0.4166, + "step": 1355 + }, + { + "epoch": 0.03031423152865828, + "grad_norm": 0.5947720408439636, + "learning_rate": 1.995468492855953e-05, + "loss": 0.4421, + "step": 1360 + }, + { + "epoch": 0.030425680909278347, + "grad_norm": 0.5445137023925781, + "learning_rate": 1.9954351371205202e-05, + "loss": 0.3011, + "step": 1365 + }, + { + "epoch": 0.030537130289898413, + "grad_norm": 0.5879339575767517, + "learning_rate": 1.9954016593524236e-05, + "loss": 0.3486, + "step": 1370 + }, + { + "epoch": 0.03064857967051848, + "grad_norm": 0.4083978831768036, + "learning_rate": 1.9953680595557668e-05, + "loss": 0.4505, + "step": 1375 + }, + { + "epoch": 0.03076002905113855, + "grad_norm": 0.7546375393867493, + "learning_rate": 1.9953343377346688e-05, + "loss": 0.5002, + "step": 1380 + }, + { + "epoch": 0.030871478431758616, + "grad_norm": 0.5613773465156555, + "learning_rate": 1.995300493893264e-05, + "loss": 0.5096, + "step": 1385 + }, + { + "epoch": 0.030982927812378682, + "grad_norm": 0.6815944910049438, + "learning_rate": 1.995266528035701e-05, + "loss": 0.527, + "step": 1390 + }, + { + "epoch": 0.03109437719299875, + "grad_norm": 0.48527321219444275, + "learning_rate": 1.995232440166144e-05, + "loss": 0.4285, + "step": 1395 + }, + { + "epoch": 0.031205826573618818, + "grad_norm": 0.5341393351554871, + "learning_rate": 1.995198230288772e-05, + "loss": 0.5031, + "step": 1400 + }, + { + "epoch": 0.03131727595423888, + "grad_norm": 0.5466798543930054, + "learning_rate": 1.9951638984077784e-05, + "loss": 0.4208, + "step": 1405 + }, + { + "epoch": 0.03142872533485895, + "grad_norm": 0.5577125549316406, + "learning_rate": 1.9951294445273725e-05, + "loss": 0.4641, + "step": 1410 + }, + { + "epoch": 0.03154017471547902, + "grad_norm": 0.47489219903945923, + "learning_rate": 1.995094868651778e-05, + "loss": 0.4353, + "step": 1415 + }, + { + "epoch": 0.03165162409609908, + "grad_norm": 0.4973861277103424, + "learning_rate": 1.9950601707852337e-05, + "loss": 0.3991, + "step": 1420 + }, + { + "epoch": 0.03176307347671915, + "grad_norm": 0.5270541310310364, + "learning_rate": 1.9950253509319928e-05, + "loss": 0.4269, + "step": 1425 + }, + { + "epoch": 0.03187452285733922, + "grad_norm": 0.618984043598175, + "learning_rate": 1.9949904090963245e-05, + "loss": 0.3404, + "step": 1430 + }, + { + "epoch": 0.031985972237959286, + "grad_norm": 0.5443921089172363, + "learning_rate": 1.9949553452825122e-05, + "loss": 0.4597, + "step": 1435 + }, + { + "epoch": 0.032097421618579355, + "grad_norm": 0.7535853981971741, + "learning_rate": 1.9949201594948544e-05, + "loss": 0.4893, + "step": 1440 + }, + { + "epoch": 0.032208870999199425, + "grad_norm": 0.5268767476081848, + "learning_rate": 1.9948848517376644e-05, + "loss": 0.4357, + "step": 1445 + }, + { + "epoch": 0.03232032037981949, + "grad_norm": 0.7284941077232361, + "learning_rate": 1.9948494220152714e-05, + "loss": 0.4246, + "step": 1450 + }, + { + "epoch": 0.03243176976043956, + "grad_norm": 0.555818498134613, + "learning_rate": 1.9948138703320178e-05, + "loss": 0.4174, + "step": 1455 + }, + { + "epoch": 0.03254321914105962, + "grad_norm": 0.5622695088386536, + "learning_rate": 1.9947781966922628e-05, + "loss": 0.3668, + "step": 1460 + }, + { + "epoch": 0.03265466852167969, + "grad_norm": 0.7035006880760193, + "learning_rate": 1.9947424011003795e-05, + "loss": 0.4184, + "step": 1465 + }, + { + "epoch": 0.03276611790229976, + "grad_norm": 0.6183652877807617, + "learning_rate": 1.994706483560756e-05, + "loss": 0.4978, + "step": 1470 + }, + { + "epoch": 0.03287756728291982, + "grad_norm": 0.5243300795555115, + "learning_rate": 1.9946704440777952e-05, + "loss": 0.4918, + "step": 1475 + }, + { + "epoch": 0.03298901666353989, + "grad_norm": 0.6121699213981628, + "learning_rate": 1.9946342826559162e-05, + "loss": 0.4743, + "step": 1480 + }, + { + "epoch": 0.03310046604415996, + "grad_norm": 0.6591364741325378, + "learning_rate": 1.994597999299551e-05, + "loss": 0.4328, + "step": 1485 + }, + { + "epoch": 0.033211915424780025, + "grad_norm": 0.42131346464157104, + "learning_rate": 1.9945615940131486e-05, + "loss": 0.357, + "step": 1490 + }, + { + "epoch": 0.033323364805400095, + "grad_norm": 0.49311649799346924, + "learning_rate": 1.9945250668011714e-05, + "loss": 0.3668, + "step": 1495 + }, + { + "epoch": 0.03343481418602016, + "grad_norm": 0.6135536432266235, + "learning_rate": 1.9944884176680975e-05, + "loss": 0.4503, + "step": 1500 + }, + { + "epoch": 0.03354626356664023, + "grad_norm": 0.32271111011505127, + "learning_rate": 1.9944516466184202e-05, + "loss": 0.3132, + "step": 1505 + }, + { + "epoch": 0.0336577129472603, + "grad_norm": 0.5943491458892822, + "learning_rate": 1.994414753656647e-05, + "loss": 0.5199, + "step": 1510 + }, + { + "epoch": 0.03376916232788036, + "grad_norm": 0.5975769758224487, + "learning_rate": 1.9943777387873006e-05, + "loss": 0.367, + "step": 1515 + }, + { + "epoch": 0.03388061170850043, + "grad_norm": 0.73946613073349, + "learning_rate": 1.9943406020149185e-05, + "loss": 0.4344, + "step": 1520 + }, + { + "epoch": 0.0339920610891205, + "grad_norm": 0.6030755043029785, + "learning_rate": 1.994303343344054e-05, + "loss": 0.5145, + "step": 1525 + }, + { + "epoch": 0.03410351046974056, + "grad_norm": 0.5314607620239258, + "learning_rate": 1.9942659627792746e-05, + "loss": 0.4169, + "step": 1530 + }, + { + "epoch": 0.03421495985036063, + "grad_norm": 0.47629496455192566, + "learning_rate": 1.9942284603251624e-05, + "loss": 0.3843, + "step": 1535 + }, + { + "epoch": 0.0343264092309807, + "grad_norm": 0.5412185788154602, + "learning_rate": 1.9941908359863152e-05, + "loss": 0.3675, + "step": 1540 + }, + { + "epoch": 0.034437858611600765, + "grad_norm": 0.6103938221931458, + "learning_rate": 1.994153089767346e-05, + "loss": 0.4595, + "step": 1545 + }, + { + "epoch": 0.034549307992220835, + "grad_norm": 0.6166775226593018, + "learning_rate": 1.9941152216728813e-05, + "loss": 0.4521, + "step": 1550 + }, + { + "epoch": 0.0346607573728409, + "grad_norm": 0.4617781341075897, + "learning_rate": 1.9940772317075638e-05, + "loss": 0.4703, + "step": 1555 + }, + { + "epoch": 0.03477220675346097, + "grad_norm": 0.5079278349876404, + "learning_rate": 1.9940391198760508e-05, + "loss": 0.5156, + "step": 1560 + }, + { + "epoch": 0.03488365613408104, + "grad_norm": 0.5550146102905273, + "learning_rate": 1.9940008861830146e-05, + "loss": 0.5081, + "step": 1565 + }, + { + "epoch": 0.0349951055147011, + "grad_norm": 0.6835542321205139, + "learning_rate": 1.9939625306331423e-05, + "loss": 0.4996, + "step": 1570 + }, + { + "epoch": 0.03510655489532117, + "grad_norm": 0.6561864018440247, + "learning_rate": 1.993924053231136e-05, + "loss": 0.4664, + "step": 1575 + }, + { + "epoch": 0.03521800427594124, + "grad_norm": 0.5839830636978149, + "learning_rate": 1.9938854539817124e-05, + "loss": 0.4001, + "step": 1580 + }, + { + "epoch": 0.0353294536565613, + "grad_norm": 0.38994866609573364, + "learning_rate": 1.9938467328896038e-05, + "loss": 0.5137, + "step": 1585 + }, + { + "epoch": 0.03544090303718137, + "grad_norm": 0.7057561874389648, + "learning_rate": 1.993807889959557e-05, + "loss": 0.5832, + "step": 1590 + }, + { + "epoch": 0.035552352417801435, + "grad_norm": 0.4783352315425873, + "learning_rate": 1.9937689251963347e-05, + "loss": 0.3321, + "step": 1595 + }, + { + "epoch": 0.035663801798421504, + "grad_norm": 0.44710680842399597, + "learning_rate": 1.993729838604712e-05, + "loss": 0.3661, + "step": 1600 + }, + { + "epoch": 0.035775251179041574, + "grad_norm": 0.6399690508842468, + "learning_rate": 1.993690630189482e-05, + "loss": 0.4862, + "step": 1605 + }, + { + "epoch": 0.03588670055966164, + "grad_norm": 0.8524630069732666, + "learning_rate": 1.9936512999554505e-05, + "loss": 0.4287, + "step": 1610 + }, + { + "epoch": 0.03599814994028171, + "grad_norm": 0.6386929750442505, + "learning_rate": 1.9936118479074394e-05, + "loss": 0.358, + "step": 1615 + }, + { + "epoch": 0.03610959932090178, + "grad_norm": 0.3798363208770752, + "learning_rate": 1.9935722740502857e-05, + "loss": 0.3936, + "step": 1620 + }, + { + "epoch": 0.03622104870152184, + "grad_norm": 0.5178596377372742, + "learning_rate": 1.9935325783888403e-05, + "loss": 0.3659, + "step": 1625 + }, + { + "epoch": 0.03633249808214191, + "grad_norm": 0.5204070210456848, + "learning_rate": 1.9934927609279695e-05, + "loss": 0.4623, + "step": 1630 + }, + { + "epoch": 0.03644394746276198, + "grad_norm": 0.6260115504264832, + "learning_rate": 1.9934528216725547e-05, + "loss": 0.3808, + "step": 1635 + }, + { + "epoch": 0.03655539684338204, + "grad_norm": 0.3944597542285919, + "learning_rate": 1.9934127606274925e-05, + "loss": 0.557, + "step": 1640 + }, + { + "epoch": 0.03666684622400211, + "grad_norm": 0.567103385925293, + "learning_rate": 1.9933725777976937e-05, + "loss": 0.4482, + "step": 1645 + }, + { + "epoch": 0.036778295604622174, + "grad_norm": 0.478750616312027, + "learning_rate": 1.9933322731880845e-05, + "loss": 0.4589, + "step": 1650 + }, + { + "epoch": 0.036889744985242244, + "grad_norm": 0.6372083425521851, + "learning_rate": 1.9932918468036057e-05, + "loss": 0.4494, + "step": 1655 + }, + { + "epoch": 0.037001194365862314, + "grad_norm": 0.6181703805923462, + "learning_rate": 1.993251298649214e-05, + "loss": 0.42, + "step": 1660 + }, + { + "epoch": 0.03711264374648238, + "grad_norm": 0.7616068720817566, + "learning_rate": 1.9932106287298795e-05, + "loss": 0.467, + "step": 1665 + }, + { + "epoch": 0.037224093127102446, + "grad_norm": 0.6794231534004211, + "learning_rate": 1.9931698370505884e-05, + "loss": 0.6037, + "step": 1670 + }, + { + "epoch": 0.037335542507722516, + "grad_norm": 0.562717854976654, + "learning_rate": 1.9931289236163414e-05, + "loss": 0.4844, + "step": 1675 + }, + { + "epoch": 0.03744699188834258, + "grad_norm": 0.5788027048110962, + "learning_rate": 1.9930878884321542e-05, + "loss": 0.4101, + "step": 1680 + }, + { + "epoch": 0.03755844126896265, + "grad_norm": 0.6554732918739319, + "learning_rate": 1.9930467315030572e-05, + "loss": 0.5106, + "step": 1685 + }, + { + "epoch": 0.03766989064958271, + "grad_norm": 0.6344287395477295, + "learning_rate": 1.9930054528340963e-05, + "loss": 0.4831, + "step": 1690 + }, + { + "epoch": 0.03778134003020278, + "grad_norm": 0.6484274864196777, + "learning_rate": 1.9929640524303314e-05, + "loss": 0.399, + "step": 1695 + }, + { + "epoch": 0.03789278941082285, + "grad_norm": 0.49392303824424744, + "learning_rate": 1.992922530296838e-05, + "loss": 0.4336, + "step": 1700 + }, + { + "epoch": 0.038004238791442914, + "grad_norm": 0.5903943777084351, + "learning_rate": 1.9928808864387073e-05, + "loss": 0.4583, + "step": 1705 + }, + { + "epoch": 0.038115688172062984, + "grad_norm": 0.7169772982597351, + "learning_rate": 1.992839120861043e-05, + "loss": 0.4634, + "step": 1710 + }, + { + "epoch": 0.03822713755268305, + "grad_norm": 0.6074687838554382, + "learning_rate": 1.9927972335689667e-05, + "loss": 0.5118, + "step": 1715 + }, + { + "epoch": 0.038338586933303116, + "grad_norm": 0.4693703055381775, + "learning_rate": 1.9927552245676124e-05, + "loss": 0.4277, + "step": 1720 + }, + { + "epoch": 0.038450036313923186, + "grad_norm": 0.444767564535141, + "learning_rate": 1.9927130938621305e-05, + "loss": 0.3628, + "step": 1725 + }, + { + "epoch": 0.038561485694543256, + "grad_norm": 0.4447241723537445, + "learning_rate": 1.992670841457686e-05, + "loss": 0.3805, + "step": 1730 + }, + { + "epoch": 0.03867293507516332, + "grad_norm": 0.5440791249275208, + "learning_rate": 1.9926284673594588e-05, + "loss": 0.3269, + "step": 1735 + }, + { + "epoch": 0.03878438445578339, + "grad_norm": 0.5838759541511536, + "learning_rate": 1.992585971572643e-05, + "loss": 0.4433, + "step": 1740 + }, + { + "epoch": 0.03889583383640345, + "grad_norm": 0.5916067361831665, + "learning_rate": 1.9925433541024488e-05, + "loss": 0.4893, + "step": 1745 + }, + { + "epoch": 0.03900728321702352, + "grad_norm": 0.5731961131095886, + "learning_rate": 1.9925006149541003e-05, + "loss": 0.4612, + "step": 1750 + }, + { + "epoch": 0.03911873259764359, + "grad_norm": 0.5600702166557312, + "learning_rate": 1.9924577541328378e-05, + "loss": 0.4457, + "step": 1755 + }, + { + "epoch": 0.039230181978263653, + "grad_norm": 0.7729988098144531, + "learning_rate": 1.9924147716439152e-05, + "loss": 0.5437, + "step": 1760 + }, + { + "epoch": 0.03934163135888372, + "grad_norm": 0.6786412596702576, + "learning_rate": 1.9923716674926018e-05, + "loss": 0.531, + "step": 1765 + }, + { + "epoch": 0.03945308073950379, + "grad_norm": 0.505881130695343, + "learning_rate": 1.992328441684182e-05, + "loss": 0.4025, + "step": 1770 + }, + { + "epoch": 0.039564530120123856, + "grad_norm": 0.5706116557121277, + "learning_rate": 1.9922850942239544e-05, + "loss": 0.4751, + "step": 1775 + }, + { + "epoch": 0.039675979500743926, + "grad_norm": 0.4059732258319855, + "learning_rate": 1.992241625117234e-05, + "loss": 0.4051, + "step": 1780 + }, + { + "epoch": 0.03978742888136399, + "grad_norm": 0.5178811550140381, + "learning_rate": 1.992198034369349e-05, + "loss": 0.5105, + "step": 1785 + }, + { + "epoch": 0.03989887826198406, + "grad_norm": 0.6659989953041077, + "learning_rate": 1.9921543219856437e-05, + "loss": 0.4047, + "step": 1790 + }, + { + "epoch": 0.04001032764260413, + "grad_norm": 0.4662383198738098, + "learning_rate": 1.992110487971477e-05, + "loss": 0.4747, + "step": 1795 + }, + { + "epoch": 0.04012177702322419, + "grad_norm": 0.7052854299545288, + "learning_rate": 1.992066532332222e-05, + "loss": 0.3694, + "step": 1800 + }, + { + "epoch": 0.04023322640384426, + "grad_norm": 0.5693308711051941, + "learning_rate": 1.9920224550732682e-05, + "loss": 0.4154, + "step": 1805 + }, + { + "epoch": 0.04034467578446433, + "grad_norm": 0.5711913704872131, + "learning_rate": 1.991978256200018e-05, + "loss": 0.3909, + "step": 1810 + }, + { + "epoch": 0.04045612516508439, + "grad_norm": 0.42563021183013916, + "learning_rate": 1.9919339357178907e-05, + "loss": 0.5499, + "step": 1815 + }, + { + "epoch": 0.04056757454570446, + "grad_norm": 0.521532416343689, + "learning_rate": 1.9918894936323197e-05, + "loss": 0.3627, + "step": 1820 + }, + { + "epoch": 0.04067902392632453, + "grad_norm": 0.506631076335907, + "learning_rate": 1.991844929948753e-05, + "loss": 0.4728, + "step": 1825 + }, + { + "epoch": 0.040790473306944595, + "grad_norm": 0.6046636700630188, + "learning_rate": 1.9918002446726535e-05, + "loss": 0.3791, + "step": 1830 + }, + { + "epoch": 0.040901922687564665, + "grad_norm": 0.620905339717865, + "learning_rate": 1.9917554378094997e-05, + "loss": 0.4688, + "step": 1835 + }, + { + "epoch": 0.04101337206818473, + "grad_norm": 0.5445390939712524, + "learning_rate": 1.991710509364784e-05, + "loss": 0.4985, + "step": 1840 + }, + { + "epoch": 0.0411248214488048, + "grad_norm": 0.6072726249694824, + "learning_rate": 1.9916654593440152e-05, + "loss": 0.4017, + "step": 1845 + }, + { + "epoch": 0.04123627082942487, + "grad_norm": 0.5444666147232056, + "learning_rate": 1.9916202877527153e-05, + "loss": 0.4537, + "step": 1850 + }, + { + "epoch": 0.04134772021004493, + "grad_norm": 0.6237067580223083, + "learning_rate": 1.9915749945964228e-05, + "loss": 0.3398, + "step": 1855 + }, + { + "epoch": 0.041459169590665, + "grad_norm": 0.49777600169181824, + "learning_rate": 1.9915295798806888e-05, + "loss": 0.4411, + "step": 1860 + }, + { + "epoch": 0.04157061897128507, + "grad_norm": 0.5367849469184875, + "learning_rate": 1.9914840436110824e-05, + "loss": 0.4353, + "step": 1865 + }, + { + "epoch": 0.04168206835190513, + "grad_norm": 0.562211811542511, + "learning_rate": 1.9914383857931853e-05, + "loss": 0.4252, + "step": 1870 + }, + { + "epoch": 0.0417935177325252, + "grad_norm": 0.6259203553199768, + "learning_rate": 1.9913926064325946e-05, + "loss": 0.42, + "step": 1875 + }, + { + "epoch": 0.041904967113145265, + "grad_norm": 0.48725616931915283, + "learning_rate": 1.9913467055349227e-05, + "loss": 0.4408, + "step": 1880 + }, + { + "epoch": 0.042016416493765335, + "grad_norm": 0.538391649723053, + "learning_rate": 1.9913006831057967e-05, + "loss": 0.4785, + "step": 1885 + }, + { + "epoch": 0.042127865874385405, + "grad_norm": 0.4485551416873932, + "learning_rate": 1.991254539150859e-05, + "loss": 0.34, + "step": 1890 + }, + { + "epoch": 0.04223931525500547, + "grad_norm": 0.4810604453086853, + "learning_rate": 1.991208273675766e-05, + "loss": 0.3773, + "step": 1895 + }, + { + "epoch": 0.04235076463562554, + "grad_norm": 0.8326770067214966, + "learning_rate": 1.9911618866861894e-05, + "loss": 0.3926, + "step": 1900 + }, + { + "epoch": 0.04246221401624561, + "grad_norm": 0.6135021448135376, + "learning_rate": 1.991115378187816e-05, + "loss": 0.5185, + "step": 1905 + }, + { + "epoch": 0.04257366339686567, + "grad_norm": 0.5827608108520508, + "learning_rate": 1.9910687481863478e-05, + "loss": 0.5723, + "step": 1910 + }, + { + "epoch": 0.04268511277748574, + "grad_norm": 0.5833299160003662, + "learning_rate": 1.9910219966875007e-05, + "loss": 0.4621, + "step": 1915 + }, + { + "epoch": 0.04279656215810581, + "grad_norm": 0.7341663837432861, + "learning_rate": 1.9909751236970064e-05, + "loss": 0.4476, + "step": 1920 + }, + { + "epoch": 0.04290801153872587, + "grad_norm": 0.6236368417739868, + "learning_rate": 1.990928129220611e-05, + "loss": 0.4315, + "step": 1925 + }, + { + "epoch": 0.04301946091934594, + "grad_norm": 0.6765170097351074, + "learning_rate": 1.990881013264076e-05, + "loss": 0.5065, + "step": 1930 + }, + { + "epoch": 0.043130910299966005, + "grad_norm": 0.498177170753479, + "learning_rate": 1.990833775833177e-05, + "loss": 0.4905, + "step": 1935 + }, + { + "epoch": 0.043242359680586075, + "grad_norm": 0.7168980836868286, + "learning_rate": 1.9907864169337053e-05, + "loss": 0.4488, + "step": 1940 + }, + { + "epoch": 0.043353809061206144, + "grad_norm": 0.5037057995796204, + "learning_rate": 1.9907389365714662e-05, + "loss": 0.3357, + "step": 1945 + }, + { + "epoch": 0.04346525844182621, + "grad_norm": 0.608124852180481, + "learning_rate": 1.9906913347522812e-05, + "loss": 0.587, + "step": 1950 + }, + { + "epoch": 0.04357670782244628, + "grad_norm": 0.400680273771286, + "learning_rate": 1.9906436114819858e-05, + "loss": 0.3273, + "step": 1955 + }, + { + "epoch": 0.04368815720306635, + "grad_norm": 0.6874983310699463, + "learning_rate": 1.9905957667664296e-05, + "loss": 0.4827, + "step": 1960 + }, + { + "epoch": 0.04379960658368641, + "grad_norm": 0.5223805904388428, + "learning_rate": 1.990547800611479e-05, + "loss": 0.5685, + "step": 1965 + }, + { + "epoch": 0.04391105596430648, + "grad_norm": 0.43834665417671204, + "learning_rate": 1.9904997130230135e-05, + "loss": 0.3613, + "step": 1970 + }, + { + "epoch": 0.04402250534492654, + "grad_norm": 0.7025169730186462, + "learning_rate": 1.990451504006929e-05, + "loss": 0.4489, + "step": 1975 + }, + { + "epoch": 0.04413395472554661, + "grad_norm": 0.665053129196167, + "learning_rate": 1.990403173569135e-05, + "loss": 0.4566, + "step": 1980 + }, + { + "epoch": 0.04424540410616668, + "grad_norm": 0.5735465288162231, + "learning_rate": 1.9903547217155567e-05, + "loss": 0.2985, + "step": 1985 + }, + { + "epoch": 0.044356853486786745, + "grad_norm": 0.7619782090187073, + "learning_rate": 1.990306148452134e-05, + "loss": 0.6006, + "step": 1990 + }, + { + "epoch": 0.044468302867406814, + "grad_norm": 0.491385281085968, + "learning_rate": 1.9902574537848216e-05, + "loss": 0.5219, + "step": 1995 + }, + { + "epoch": 0.044579752248026884, + "grad_norm": 0.6306560039520264, + "learning_rate": 1.990208637719589e-05, + "loss": 0.4113, + "step": 2000 + }, + { + "epoch": 0.04469120162864695, + "grad_norm": 0.4792928993701935, + "learning_rate": 1.9901597002624204e-05, + "loss": 0.565, + "step": 2005 + }, + { + "epoch": 0.04480265100926702, + "grad_norm": 0.7919230461120605, + "learning_rate": 1.9901106414193153e-05, + "loss": 0.4039, + "step": 2010 + }, + { + "epoch": 0.044914100389887086, + "grad_norm": 0.808326780796051, + "learning_rate": 1.9900614611962882e-05, + "loss": 0.481, + "step": 2015 + }, + { + "epoch": 0.04502554977050715, + "grad_norm": 0.4971373677253723, + "learning_rate": 1.9900121595993683e-05, + "loss": 0.5343, + "step": 2020 + }, + { + "epoch": 0.04513699915112722, + "grad_norm": 0.6289275884628296, + "learning_rate": 1.989962736634599e-05, + "loss": 0.4459, + "step": 2025 + }, + { + "epoch": 0.04524844853174728, + "grad_norm": 0.5500627160072327, + "learning_rate": 1.98991319230804e-05, + "loss": 0.3991, + "step": 2030 + }, + { + "epoch": 0.04535989791236735, + "grad_norm": 0.47303643822669983, + "learning_rate": 1.9898635266257643e-05, + "loss": 0.5046, + "step": 2035 + }, + { + "epoch": 0.04547134729298742, + "grad_norm": 5.436945915222168, + "learning_rate": 1.9898137395938605e-05, + "loss": 0.4981, + "step": 2040 + }, + { + "epoch": 0.045582796673607484, + "grad_norm": 0.40001043677330017, + "learning_rate": 1.9897638312184328e-05, + "loss": 0.4359, + "step": 2045 + }, + { + "epoch": 0.045694246054227554, + "grad_norm": 0.5678738355636597, + "learning_rate": 1.989713801505599e-05, + "loss": 0.5342, + "step": 2050 + }, + { + "epoch": 0.045805695434847624, + "grad_norm": 0.539551854133606, + "learning_rate": 1.989663650461493e-05, + "loss": 0.3701, + "step": 2055 + }, + { + "epoch": 0.045917144815467686, + "grad_norm": 0.5696111917495728, + "learning_rate": 1.989613378092262e-05, + "loss": 0.5771, + "step": 2060 + }, + { + "epoch": 0.046028594196087756, + "grad_norm": 0.5799300670623779, + "learning_rate": 1.9895629844040697e-05, + "loss": 0.4812, + "step": 2065 + }, + { + "epoch": 0.04614004357670782, + "grad_norm": 0.5338211059570312, + "learning_rate": 1.9895124694030934e-05, + "loss": 0.5903, + "step": 2070 + }, + { + "epoch": 0.04625149295732789, + "grad_norm": 0.5052824020385742, + "learning_rate": 1.9894618330955268e-05, + "loss": 0.3809, + "step": 2075 + }, + { + "epoch": 0.04636294233794796, + "grad_norm": 0.47797662019729614, + "learning_rate": 1.9894110754875763e-05, + "loss": 0.371, + "step": 2080 + }, + { + "epoch": 0.04647439171856802, + "grad_norm": 0.6397122144699097, + "learning_rate": 1.9893601965854653e-05, + "loss": 0.4875, + "step": 2085 + }, + { + "epoch": 0.04658584109918809, + "grad_norm": 0.5474061369895935, + "learning_rate": 1.9893091963954312e-05, + "loss": 0.4379, + "step": 2090 + }, + { + "epoch": 0.04669729047980816, + "grad_norm": 0.5695924162864685, + "learning_rate": 1.9892580749237256e-05, + "loss": 0.4498, + "step": 2095 + }, + { + "epoch": 0.046808739860428224, + "grad_norm": 0.4939964711666107, + "learning_rate": 1.989206832176616e-05, + "loss": 0.4, + "step": 2100 + }, + { + "epoch": 0.046920189241048293, + "grad_norm": 0.5758333206176758, + "learning_rate": 1.9891554681603844e-05, + "loss": 0.4681, + "step": 2105 + }, + { + "epoch": 0.04703163862166836, + "grad_norm": 0.6062471270561218, + "learning_rate": 1.9891039828813272e-05, + "loss": 0.4122, + "step": 2110 + }, + { + "epoch": 0.047143088002288426, + "grad_norm": 0.45578670501708984, + "learning_rate": 1.9890523763457567e-05, + "loss": 0.5645, + "step": 2115 + }, + { + "epoch": 0.047254537382908496, + "grad_norm": 0.5675963759422302, + "learning_rate": 1.989000648559999e-05, + "loss": 0.4202, + "step": 2120 + }, + { + "epoch": 0.04736598676352856, + "grad_norm": 0.6070412993431091, + "learning_rate": 1.9889487995303958e-05, + "loss": 0.2939, + "step": 2125 + }, + { + "epoch": 0.04747743614414863, + "grad_norm": 0.7796115279197693, + "learning_rate": 1.9888968292633032e-05, + "loss": 0.529, + "step": 2130 + }, + { + "epoch": 0.0475888855247687, + "grad_norm": 0.5633937120437622, + "learning_rate": 1.9888447377650926e-05, + "loss": 0.5273, + "step": 2135 + }, + { + "epoch": 0.04770033490538876, + "grad_norm": 0.45713990926742554, + "learning_rate": 1.9887925250421494e-05, + "loss": 0.3204, + "step": 2140 + }, + { + "epoch": 0.04781178428600883, + "grad_norm": 0.457570880651474, + "learning_rate": 1.9887401911008758e-05, + "loss": 0.5638, + "step": 2145 + }, + { + "epoch": 0.0479232336666289, + "grad_norm": 0.5453818440437317, + "learning_rate": 1.988687735947686e-05, + "loss": 0.3383, + "step": 2150 + }, + { + "epoch": 0.04803468304724896, + "grad_norm": 0.550879955291748, + "learning_rate": 1.9886351595890114e-05, + "loss": 0.446, + "step": 2155 + }, + { + "epoch": 0.04814613242786903, + "grad_norm": 0.5164986848831177, + "learning_rate": 1.9885824620312975e-05, + "loss": 0.4184, + "step": 2160 + }, + { + "epoch": 0.048257581808489096, + "grad_norm": 0.5737098455429077, + "learning_rate": 1.988529643281004e-05, + "loss": 0.4013, + "step": 2165 + }, + { + "epoch": 0.048369031189109166, + "grad_norm": 0.6029147505760193, + "learning_rate": 1.988476703344607e-05, + "loss": 0.4013, + "step": 2170 + }, + { + "epoch": 0.048480480569729235, + "grad_norm": 0.4032836854457855, + "learning_rate": 1.988423642228596e-05, + "loss": 0.543, + "step": 2175 + }, + { + "epoch": 0.0485919299503493, + "grad_norm": 0.58678138256073, + "learning_rate": 1.988370459939476e-05, + "loss": 0.4044, + "step": 2180 + }, + { + "epoch": 0.04870337933096937, + "grad_norm": 0.5044936537742615, + "learning_rate": 1.988317156483766e-05, + "loss": 0.2857, + "step": 2185 + }, + { + "epoch": 0.04881482871158944, + "grad_norm": 0.6275551915168762, + "learning_rate": 1.988263731868002e-05, + "loss": 0.5061, + "step": 2190 + }, + { + "epoch": 0.0489262780922095, + "grad_norm": 0.7103505730628967, + "learning_rate": 1.988210186098732e-05, + "loss": 0.4178, + "step": 2195 + }, + { + "epoch": 0.04903772747282957, + "grad_norm": 0.5122005343437195, + "learning_rate": 1.9881565191825218e-05, + "loss": 0.3263, + "step": 2200 + }, + { + "epoch": 0.04914917685344964, + "grad_norm": 0.7140627503395081, + "learning_rate": 1.9881027311259487e-05, + "loss": 0.4232, + "step": 2205 + }, + { + "epoch": 0.0492606262340697, + "grad_norm": 0.6395582556724548, + "learning_rate": 1.9880488219356086e-05, + "loss": 0.5721, + "step": 2210 + }, + { + "epoch": 0.04937207561468977, + "grad_norm": 0.5498226284980774, + "learning_rate": 1.9879947916181096e-05, + "loss": 0.3603, + "step": 2215 + }, + { + "epoch": 0.049483524995309836, + "grad_norm": 0.5085523724555969, + "learning_rate": 1.9879406401800746e-05, + "loss": 0.4792, + "step": 2220 + }, + { + "epoch": 0.049594974375929905, + "grad_norm": 0.8438782691955566, + "learning_rate": 1.987886367628143e-05, + "loss": 0.4226, + "step": 2225 + }, + { + "epoch": 0.049706423756549975, + "grad_norm": 0.6150681972503662, + "learning_rate": 1.9878319739689686e-05, + "loss": 0.3928, + "step": 2230 + }, + { + "epoch": 0.04981787313717004, + "grad_norm": 0.4985312223434448, + "learning_rate": 1.987777459209219e-05, + "loss": 0.4044, + "step": 2235 + }, + { + "epoch": 0.04992932251779011, + "grad_norm": 0.5724480748176575, + "learning_rate": 1.987722823355577e-05, + "loss": 0.5321, + "step": 2240 + }, + { + "epoch": 0.05004077189841018, + "grad_norm": 0.6445596218109131, + "learning_rate": 1.9876680664147408e-05, + "loss": 0.3881, + "step": 2245 + }, + { + "epoch": 0.05015222127903024, + "grad_norm": 0.48637256026268005, + "learning_rate": 1.9876131883934235e-05, + "loss": 0.46, + "step": 2250 + }, + { + "epoch": 0.05026367065965031, + "grad_norm": 0.4218536615371704, + "learning_rate": 1.9875581892983527e-05, + "loss": 0.5875, + "step": 2255 + }, + { + "epoch": 0.05037512004027037, + "grad_norm": 0.48131638765335083, + "learning_rate": 1.9875030691362705e-05, + "loss": 0.4047, + "step": 2260 + }, + { + "epoch": 0.05048656942089044, + "grad_norm": 0.6676334142684937, + "learning_rate": 1.9874478279139343e-05, + "loss": 0.3902, + "step": 2265 + }, + { + "epoch": 0.05059801880151051, + "grad_norm": 0.4519447386264801, + "learning_rate": 1.9873924656381164e-05, + "loss": 0.4285, + "step": 2270 + }, + { + "epoch": 0.050709468182130575, + "grad_norm": 0.3770105838775635, + "learning_rate": 1.9873369823156038e-05, + "loss": 0.4169, + "step": 2275 + }, + { + "epoch": 0.050820917562750645, + "grad_norm": 0.46062755584716797, + "learning_rate": 1.987281377953198e-05, + "loss": 0.4212, + "step": 2280 + }, + { + "epoch": 0.050932366943370715, + "grad_norm": 0.4402807950973511, + "learning_rate": 1.987225652557716e-05, + "loss": 0.4225, + "step": 2285 + }, + { + "epoch": 0.05104381632399078, + "grad_norm": 0.5167077779769897, + "learning_rate": 1.9871698061359893e-05, + "loss": 0.4731, + "step": 2290 + }, + { + "epoch": 0.05115526570461085, + "grad_norm": 0.6223405599594116, + "learning_rate": 1.987113838694864e-05, + "loss": 0.435, + "step": 2295 + }, + { + "epoch": 0.05126671508523092, + "grad_norm": 0.6904946565628052, + "learning_rate": 1.9870577502412015e-05, + "loss": 0.4087, + "step": 2300 + }, + { + "epoch": 0.05137816446585098, + "grad_norm": 0.6623885035514832, + "learning_rate": 1.987001540781878e-05, + "loss": 0.4721, + "step": 2305 + }, + { + "epoch": 0.05148961384647105, + "grad_norm": 0.44664719700813293, + "learning_rate": 1.9869452103237838e-05, + "loss": 0.4547, + "step": 2310 + }, + { + "epoch": 0.05160106322709111, + "grad_norm": 0.5109519362449646, + "learning_rate": 1.986888758873825e-05, + "loss": 0.3584, + "step": 2315 + }, + { + "epoch": 0.05171251260771118, + "grad_norm": 0.5872670412063599, + "learning_rate": 1.9868321864389216e-05, + "loss": 0.4298, + "step": 2320 + }, + { + "epoch": 0.05182396198833125, + "grad_norm": 0.6067822575569153, + "learning_rate": 1.9867754930260098e-05, + "loss": 0.3886, + "step": 2325 + }, + { + "epoch": 0.051935411368951315, + "grad_norm": 0.40194398164749146, + "learning_rate": 1.9867186786420388e-05, + "loss": 0.4291, + "step": 2330 + }, + { + "epoch": 0.052046860749571384, + "grad_norm": 0.39779698848724365, + "learning_rate": 1.9866617432939746e-05, + "loss": 0.4259, + "step": 2335 + }, + { + "epoch": 0.052158310130191454, + "grad_norm": 0.5225132703781128, + "learning_rate": 1.986604686988796e-05, + "loss": 0.4568, + "step": 2340 + }, + { + "epoch": 0.05226975951081152, + "grad_norm": 0.5338283777236938, + "learning_rate": 1.986547509733499e-05, + "loss": 0.5001, + "step": 2345 + }, + { + "epoch": 0.05238120889143159, + "grad_norm": 0.37600216269493103, + "learning_rate": 1.9864902115350918e-05, + "loss": 0.3789, + "step": 2350 + }, + { + "epoch": 0.05249265827205165, + "grad_norm": 0.4644761383533478, + "learning_rate": 1.986432792400599e-05, + "loss": 0.5704, + "step": 2355 + }, + { + "epoch": 0.05260410765267172, + "grad_norm": 0.5336574912071228, + "learning_rate": 1.9863752523370602e-05, + "loss": 0.4756, + "step": 2360 + }, + { + "epoch": 0.05271555703329179, + "grad_norm": 0.4611615538597107, + "learning_rate": 1.9863175913515287e-05, + "loss": 0.4586, + "step": 2365 + }, + { + "epoch": 0.05282700641391185, + "grad_norm": 0.5109054446220398, + "learning_rate": 1.9862598094510743e-05, + "loss": 0.4997, + "step": 2370 + }, + { + "epoch": 0.05293845579453192, + "grad_norm": 0.5393320322036743, + "learning_rate": 1.98620190664278e-05, + "loss": 0.4339, + "step": 2375 + }, + { + "epoch": 0.05304990517515199, + "grad_norm": 0.5946705937385559, + "learning_rate": 1.986143882933744e-05, + "loss": 0.5266, + "step": 2380 + }, + { + "epoch": 0.053161354555772054, + "grad_norm": 0.39486464858055115, + "learning_rate": 1.9860857383310795e-05, + "loss": 0.5178, + "step": 2385 + }, + { + "epoch": 0.053272803936392124, + "grad_norm": 0.6416144967079163, + "learning_rate": 1.9860274728419155e-05, + "loss": 0.5279, + "step": 2390 + }, + { + "epoch": 0.053384253317012194, + "grad_norm": 0.868364691734314, + "learning_rate": 1.9859690864733942e-05, + "loss": 0.2764, + "step": 2395 + }, + { + "epoch": 0.05349570269763226, + "grad_norm": 0.5830197334289551, + "learning_rate": 1.9859105792326733e-05, + "loss": 0.4276, + "step": 2400 + }, + { + "epoch": 0.053607152078252326, + "grad_norm": 0.6217747330665588, + "learning_rate": 1.985851951126925e-05, + "loss": 0.4741, + "step": 2405 + }, + { + "epoch": 0.05371860145887239, + "grad_norm": 0.47893911600112915, + "learning_rate": 1.9857932021633377e-05, + "loss": 0.4266, + "step": 2410 + }, + { + "epoch": 0.05383005083949246, + "grad_norm": 0.5307624340057373, + "learning_rate": 1.9857343323491127e-05, + "loss": 0.3482, + "step": 2415 + }, + { + "epoch": 0.05394150022011253, + "grad_norm": 0.5524867177009583, + "learning_rate": 1.9856753416914673e-05, + "loss": 0.4334, + "step": 2420 + }, + { + "epoch": 0.05405294960073259, + "grad_norm": 0.41400259733200073, + "learning_rate": 1.9856162301976333e-05, + "loss": 0.3792, + "step": 2425 + }, + { + "epoch": 0.05416439898135266, + "grad_norm": 0.9114057421684265, + "learning_rate": 1.9855569978748575e-05, + "loss": 0.5029, + "step": 2430 + }, + { + "epoch": 0.05427584836197273, + "grad_norm": 0.5691152811050415, + "learning_rate": 1.9854976447304005e-05, + "loss": 0.5097, + "step": 2435 + }, + { + "epoch": 0.054387297742592794, + "grad_norm": 0.5087013840675354, + "learning_rate": 1.9854381707715396e-05, + "loss": 0.5009, + "step": 2440 + }, + { + "epoch": 0.054498747123212864, + "grad_norm": 0.596858561038971, + "learning_rate": 1.9853785760055652e-05, + "loss": 0.4261, + "step": 2445 + }, + { + "epoch": 0.054610196503832927, + "grad_norm": 0.547972559928894, + "learning_rate": 1.9853188604397834e-05, + "loss": 0.3686, + "step": 2450 + }, + { + "epoch": 0.054721645884452996, + "grad_norm": 0.63202965259552, + "learning_rate": 1.985259024081515e-05, + "loss": 0.4538, + "step": 2455 + }, + { + "epoch": 0.054833095265073066, + "grad_norm": 0.5696936249732971, + "learning_rate": 1.9851990669380948e-05, + "loss": 0.4632, + "step": 2460 + }, + { + "epoch": 0.05494454464569313, + "grad_norm": 0.5310059189796448, + "learning_rate": 1.9851389890168738e-05, + "loss": 0.4243, + "step": 2465 + }, + { + "epoch": 0.0550559940263132, + "grad_norm": 0.5194794535636902, + "learning_rate": 1.985078790325217e-05, + "loss": 0.4771, + "step": 2470 + }, + { + "epoch": 0.05516744340693327, + "grad_norm": 0.4808349013328552, + "learning_rate": 1.985018470870504e-05, + "loss": 0.4842, + "step": 2475 + }, + { + "epoch": 0.05527889278755333, + "grad_norm": 0.49285992980003357, + "learning_rate": 1.9849580306601298e-05, + "loss": 0.3485, + "step": 2480 + }, + { + "epoch": 0.0553903421681734, + "grad_norm": 0.47631382942199707, + "learning_rate": 1.9848974697015038e-05, + "loss": 0.3182, + "step": 2485 + }, + { + "epoch": 0.05550179154879347, + "grad_norm": 0.6178653836250305, + "learning_rate": 1.98483678800205e-05, + "loss": 0.3605, + "step": 2490 + }, + { + "epoch": 0.055613240929413534, + "grad_norm": 0.6378611922264099, + "learning_rate": 1.9847759855692078e-05, + "loss": 0.4466, + "step": 2495 + }, + { + "epoch": 0.0557246903100336, + "grad_norm": 0.6486213207244873, + "learning_rate": 1.9847150624104313e-05, + "loss": 0.4764, + "step": 2500 + }, + { + "epoch": 0.055836139690653666, + "grad_norm": 0.5121660232543945, + "learning_rate": 1.9846540185331886e-05, + "loss": 0.4384, + "step": 2505 + }, + { + "epoch": 0.055947589071273736, + "grad_norm": 0.5070223212242126, + "learning_rate": 1.984592853944964e-05, + "loss": 0.3902, + "step": 2510 + }, + { + "epoch": 0.056059038451893806, + "grad_norm": 0.6691417694091797, + "learning_rate": 1.984531568653255e-05, + "loss": 0.4287, + "step": 2515 + }, + { + "epoch": 0.05617048783251387, + "grad_norm": 0.6232566237449646, + "learning_rate": 1.9844701626655753e-05, + "loss": 0.3921, + "step": 2520 + }, + { + "epoch": 0.05628193721313394, + "grad_norm": 0.596046507358551, + "learning_rate": 1.9844086359894525e-05, + "loss": 0.2237, + "step": 2525 + }, + { + "epoch": 0.05639338659375401, + "grad_norm": 0.5020400285720825, + "learning_rate": 1.9843469886324294e-05, + "loss": 0.445, + "step": 2530 + }, + { + "epoch": 0.05650483597437407, + "grad_norm": 0.6604124903678894, + "learning_rate": 1.9842852206020637e-05, + "loss": 0.3864, + "step": 2535 + }, + { + "epoch": 0.05661628535499414, + "grad_norm": 0.4494327902793884, + "learning_rate": 1.9842233319059274e-05, + "loss": 0.4462, + "step": 2540 + }, + { + "epoch": 0.0567277347356142, + "grad_norm": 0.6409807801246643, + "learning_rate": 1.9841613225516077e-05, + "loss": 0.4498, + "step": 2545 + }, + { + "epoch": 0.05683918411623427, + "grad_norm": 0.6100894808769226, + "learning_rate": 1.9840991925467064e-05, + "loss": 0.3684, + "step": 2550 + }, + { + "epoch": 0.05695063349685434, + "grad_norm": 0.4814474880695343, + "learning_rate": 1.9840369418988397e-05, + "loss": 0.242, + "step": 2555 + }, + { + "epoch": 0.057062082877474406, + "grad_norm": 0.49156680703163147, + "learning_rate": 1.98397457061564e-05, + "loss": 0.3756, + "step": 2560 + }, + { + "epoch": 0.057173532258094475, + "grad_norm": 0.4226768910884857, + "learning_rate": 1.983912078704753e-05, + "loss": 0.3711, + "step": 2565 + }, + { + "epoch": 0.057284981638714545, + "grad_norm": 0.7404083609580994, + "learning_rate": 1.98384946617384e-05, + "loss": 0.4515, + "step": 2570 + }, + { + "epoch": 0.05739643101933461, + "grad_norm": 0.6239614486694336, + "learning_rate": 1.983786733030576e-05, + "loss": 0.4969, + "step": 2575 + }, + { + "epoch": 0.05750788039995468, + "grad_norm": 0.6774435639381409, + "learning_rate": 1.9837238792826526e-05, + "loss": 0.4344, + "step": 2580 + }, + { + "epoch": 0.05761932978057475, + "grad_norm": 0.5329133868217468, + "learning_rate": 1.983660904937775e-05, + "loss": 0.3811, + "step": 2585 + }, + { + "epoch": 0.05773077916119481, + "grad_norm": 0.6280587315559387, + "learning_rate": 1.9835978100036625e-05, + "loss": 0.4216, + "step": 2590 + }, + { + "epoch": 0.05784222854181488, + "grad_norm": 0.717054009437561, + "learning_rate": 1.9835345944880512e-05, + "loss": 0.5021, + "step": 2595 + }, + { + "epoch": 0.05795367792243494, + "grad_norm": 0.5728350281715393, + "learning_rate": 1.9834712583986904e-05, + "loss": 0.3976, + "step": 2600 + }, + { + "epoch": 0.05806512730305501, + "grad_norm": 0.45669928193092346, + "learning_rate": 1.9834078017433446e-05, + "loss": 0.5191, + "step": 2605 + }, + { + "epoch": 0.05817657668367508, + "grad_norm": 0.40955591201782227, + "learning_rate": 1.9833442245297923e-05, + "loss": 0.4159, + "step": 2610 + }, + { + "epoch": 0.058288026064295145, + "grad_norm": 0.6732655167579651, + "learning_rate": 1.983280526765829e-05, + "loss": 0.5921, + "step": 2615 + }, + { + "epoch": 0.058399475444915215, + "grad_norm": 0.6169721484184265, + "learning_rate": 1.9832167084592628e-05, + "loss": 0.4072, + "step": 2620 + }, + { + "epoch": 0.058510924825535285, + "grad_norm": 0.5245190262794495, + "learning_rate": 1.9831527696179173e-05, + "loss": 0.4807, + "step": 2625 + }, + { + "epoch": 0.05862237420615535, + "grad_norm": 0.6576266884803772, + "learning_rate": 1.983088710249631e-05, + "loss": 0.4389, + "step": 2630 + }, + { + "epoch": 0.05873382358677542, + "grad_norm": 0.5251596570014954, + "learning_rate": 1.9830245303622573e-05, + "loss": 0.5181, + "step": 2635 + }, + { + "epoch": 0.05884527296739548, + "grad_norm": 0.46695682406425476, + "learning_rate": 1.9829602299636637e-05, + "loss": 0.3301, + "step": 2640 + }, + { + "epoch": 0.05895672234801555, + "grad_norm": 0.7640698552131653, + "learning_rate": 1.9828958090617334e-05, + "loss": 0.5782, + "step": 2645 + }, + { + "epoch": 0.05906817172863562, + "grad_norm": 0.5179789662361145, + "learning_rate": 1.9828312676643638e-05, + "loss": 0.5458, + "step": 2650 + }, + { + "epoch": 0.05917962110925568, + "grad_norm": 0.7178815007209778, + "learning_rate": 1.9827666057794668e-05, + "loss": 0.373, + "step": 2655 + }, + { + "epoch": 0.05929107048987575, + "grad_norm": 0.5658371448516846, + "learning_rate": 1.9827018234149696e-05, + "loss": 0.3873, + "step": 2660 + }, + { + "epoch": 0.05940251987049582, + "grad_norm": 0.6456442475318909, + "learning_rate": 1.9826369205788144e-05, + "loss": 0.3859, + "step": 2665 + }, + { + "epoch": 0.059513969251115885, + "grad_norm": 0.5403016209602356, + "learning_rate": 1.9825718972789576e-05, + "loss": 0.4351, + "step": 2670 + }, + { + "epoch": 0.059625418631735955, + "grad_norm": 0.39102426171302795, + "learning_rate": 1.9825067535233703e-05, + "loss": 0.2348, + "step": 2675 + }, + { + "epoch": 0.059736868012356024, + "grad_norm": 0.539016842842102, + "learning_rate": 1.982441489320039e-05, + "loss": 0.4694, + "step": 2680 + }, + { + "epoch": 0.05984831739297609, + "grad_norm": 0.6451724171638489, + "learning_rate": 1.9823761046769644e-05, + "loss": 0.521, + "step": 2685 + }, + { + "epoch": 0.05995976677359616, + "grad_norm": 0.5299963355064392, + "learning_rate": 1.9823105996021618e-05, + "loss": 0.4558, + "step": 2690 + }, + { + "epoch": 0.06007121615421622, + "grad_norm": 0.5168675184249878, + "learning_rate": 1.9822449741036626e-05, + "loss": 0.5146, + "step": 2695 + }, + { + "epoch": 0.06018266553483629, + "grad_norm": 0.5516669750213623, + "learning_rate": 1.9821792281895108e-05, + "loss": 0.4997, + "step": 2700 + }, + { + "epoch": 0.06029411491545636, + "grad_norm": 0.3844130337238312, + "learning_rate": 1.9821133618677672e-05, + "loss": 0.5141, + "step": 2705 + }, + { + "epoch": 0.06040556429607642, + "grad_norm": 0.8562740087509155, + "learning_rate": 1.9820473751465056e-05, + "loss": 0.5316, + "step": 2710 + }, + { + "epoch": 0.06051701367669649, + "grad_norm": 0.413327693939209, + "learning_rate": 1.9819812680338167e-05, + "loss": 0.456, + "step": 2715 + }, + { + "epoch": 0.06062846305731656, + "grad_norm": 0.5889877080917358, + "learning_rate": 1.9819150405378037e-05, + "loss": 0.4245, + "step": 2720 + }, + { + "epoch": 0.060739912437936625, + "grad_norm": 0.6145250201225281, + "learning_rate": 1.981848692666586e-05, + "loss": 0.554, + "step": 2725 + }, + { + "epoch": 0.060851361818556694, + "grad_norm": 0.5503904819488525, + "learning_rate": 1.9817822244282973e-05, + "loss": 0.3949, + "step": 2730 + }, + { + "epoch": 0.06096281119917676, + "grad_norm": 0.5144293308258057, + "learning_rate": 1.981715635831086e-05, + "loss": 0.337, + "step": 2735 + }, + { + "epoch": 0.06107426057979683, + "grad_norm": 0.4838838577270508, + "learning_rate": 1.981648926883116e-05, + "loss": 0.4807, + "step": 2740 + }, + { + "epoch": 0.0611857099604169, + "grad_norm": 0.7387391924858093, + "learning_rate": 1.981582097592564e-05, + "loss": 0.3634, + "step": 2745 + }, + { + "epoch": 0.06129715934103696, + "grad_norm": 0.46244555711746216, + "learning_rate": 1.981515147967624e-05, + "loss": 0.5305, + "step": 2750 + }, + { + "epoch": 0.06140860872165703, + "grad_norm": 0.435006707906723, + "learning_rate": 1.9814480780165026e-05, + "loss": 0.4762, + "step": 2755 + }, + { + "epoch": 0.0615200581022771, + "grad_norm": 0.6218363642692566, + "learning_rate": 1.981380887747423e-05, + "loss": 0.5001, + "step": 2760 + }, + { + "epoch": 0.06163150748289716, + "grad_norm": 0.5311275124549866, + "learning_rate": 1.9813135771686213e-05, + "loss": 0.3546, + "step": 2765 + }, + { + "epoch": 0.06174295686351723, + "grad_norm": 0.4174991846084595, + "learning_rate": 1.9812461462883496e-05, + "loss": 0.3359, + "step": 2770 + }, + { + "epoch": 0.0618544062441373, + "grad_norm": 0.5177382826805115, + "learning_rate": 1.9811785951148744e-05, + "loss": 0.3957, + "step": 2775 + }, + { + "epoch": 0.061965855624757364, + "grad_norm": 0.6851249933242798, + "learning_rate": 1.981110923656477e-05, + "loss": 0.4732, + "step": 2780 + }, + { + "epoch": 0.062077305005377434, + "grad_norm": 0.6137966513633728, + "learning_rate": 1.981043131921453e-05, + "loss": 0.4677, + "step": 2785 + }, + { + "epoch": 0.0621887543859975, + "grad_norm": 0.6141671538352966, + "learning_rate": 1.980975219918114e-05, + "loss": 0.4041, + "step": 2790 + }, + { + "epoch": 0.062300203766617566, + "grad_norm": 0.5481983423233032, + "learning_rate": 1.9809071876547848e-05, + "loss": 0.4604, + "step": 2795 + }, + { + "epoch": 0.062411653147237636, + "grad_norm": 0.40346822142601013, + "learning_rate": 1.9808390351398063e-05, + "loss": 0.443, + "step": 2800 + }, + { + "epoch": 0.0625231025278577, + "grad_norm": 0.6395840644836426, + "learning_rate": 1.9807707623815323e-05, + "loss": 0.38, + "step": 2805 + }, + { + "epoch": 0.06263455190847776, + "grad_norm": 0.6557417511940002, + "learning_rate": 1.9807023693883337e-05, + "loss": 0.378, + "step": 2810 + }, + { + "epoch": 0.06274600128909784, + "grad_norm": 0.6313578486442566, + "learning_rate": 1.9806338561685946e-05, + "loss": 0.5004, + "step": 2815 + }, + { + "epoch": 0.0628574506697179, + "grad_norm": 0.4933686852455139, + "learning_rate": 1.9805652227307137e-05, + "loss": 0.3973, + "step": 2820 + }, + { + "epoch": 0.06296890005033796, + "grad_norm": 0.5198767781257629, + "learning_rate": 1.9804964690831055e-05, + "loss": 0.4025, + "step": 2825 + }, + { + "epoch": 0.06308034943095804, + "grad_norm": 0.45765766501426697, + "learning_rate": 1.9804275952341983e-05, + "loss": 0.4031, + "step": 2830 + }, + { + "epoch": 0.0631917988115781, + "grad_norm": 0.5890164971351624, + "learning_rate": 1.980358601192436e-05, + "loss": 0.5368, + "step": 2835 + }, + { + "epoch": 0.06330324819219817, + "grad_norm": 0.44444313645362854, + "learning_rate": 1.9802894869662757e-05, + "loss": 0.3634, + "step": 2840 + }, + { + "epoch": 0.06341469757281824, + "grad_norm": 0.4453275501728058, + "learning_rate": 1.9802202525641915e-05, + "loss": 0.2877, + "step": 2845 + }, + { + "epoch": 0.0635261469534383, + "grad_norm": 0.5745603442192078, + "learning_rate": 1.9801508979946703e-05, + "loss": 0.3674, + "step": 2850 + }, + { + "epoch": 0.06363759633405837, + "grad_norm": 0.5141566395759583, + "learning_rate": 1.980081423266215e-05, + "loss": 0.3664, + "step": 2855 + }, + { + "epoch": 0.06374904571467845, + "grad_norm": 0.5238634347915649, + "learning_rate": 1.980011828387342e-05, + "loss": 0.4089, + "step": 2860 + }, + { + "epoch": 0.06386049509529851, + "grad_norm": 0.47008633613586426, + "learning_rate": 1.979942113366583e-05, + "loss": 0.4745, + "step": 2865 + }, + { + "epoch": 0.06397194447591857, + "grad_norm": 0.5868954658508301, + "learning_rate": 1.9798722782124854e-05, + "loss": 0.3342, + "step": 2870 + }, + { + "epoch": 0.06408339385653865, + "grad_norm": 0.5734167098999023, + "learning_rate": 1.9798023229336097e-05, + "loss": 0.5794, + "step": 2875 + }, + { + "epoch": 0.06419484323715871, + "grad_norm": 0.6237195730209351, + "learning_rate": 1.9797322475385323e-05, + "loss": 0.5441, + "step": 2880 + }, + { + "epoch": 0.06430629261777877, + "grad_norm": 0.7441937923431396, + "learning_rate": 1.9796620520358436e-05, + "loss": 0.3542, + "step": 2885 + }, + { + "epoch": 0.06441774199839885, + "grad_norm": 0.6220014691352844, + "learning_rate": 1.979591736434149e-05, + "loss": 0.4135, + "step": 2890 + }, + { + "epoch": 0.06452919137901891, + "grad_norm": 0.3563708961009979, + "learning_rate": 1.9795213007420692e-05, + "loss": 0.3922, + "step": 2895 + }, + { + "epoch": 0.06464064075963898, + "grad_norm": 0.652351975440979, + "learning_rate": 1.9794507449682383e-05, + "loss": 0.5544, + "step": 2900 + }, + { + "epoch": 0.06475209014025904, + "grad_norm": 0.6000990271568298, + "learning_rate": 1.9793800691213065e-05, + "loss": 0.3983, + "step": 2905 + }, + { + "epoch": 0.06486353952087912, + "grad_norm": 0.6168727874755859, + "learning_rate": 1.979309273209938e-05, + "loss": 0.4477, + "step": 2910 + }, + { + "epoch": 0.06497498890149918, + "grad_norm": 0.6351516246795654, + "learning_rate": 1.9792383572428112e-05, + "loss": 0.5369, + "step": 2915 + }, + { + "epoch": 0.06508643828211924, + "grad_norm": 0.8037847280502319, + "learning_rate": 1.9791673212286208e-05, + "loss": 0.5326, + "step": 2920 + }, + { + "epoch": 0.06519788766273932, + "grad_norm": 0.519878089427948, + "learning_rate": 1.9790961651760744e-05, + "loss": 0.3582, + "step": 2925 + }, + { + "epoch": 0.06530933704335938, + "grad_norm": 0.6907233595848083, + "learning_rate": 1.9790248890938958e-05, + "loss": 0.4825, + "step": 2930 + }, + { + "epoch": 0.06542078642397944, + "grad_norm": 0.5148009657859802, + "learning_rate": 1.978953492990823e-05, + "loss": 0.5087, + "step": 2935 + }, + { + "epoch": 0.06553223580459952, + "grad_norm": 0.618364155292511, + "learning_rate": 1.978881976875608e-05, + "loss": 0.4869, + "step": 2940 + }, + { + "epoch": 0.06564368518521958, + "grad_norm": 0.7060839533805847, + "learning_rate": 1.9788103407570187e-05, + "loss": 0.3459, + "step": 2945 + }, + { + "epoch": 0.06575513456583965, + "grad_norm": 0.5656926035881042, + "learning_rate": 1.978738584643837e-05, + "loss": 0.4501, + "step": 2950 + }, + { + "epoch": 0.06586658394645972, + "grad_norm": 0.626718282699585, + "learning_rate": 1.978666708544859e-05, + "loss": 0.341, + "step": 2955 + }, + { + "epoch": 0.06597803332707979, + "grad_norm": 0.37863433361053467, + "learning_rate": 1.978594712468897e-05, + "loss": 0.4533, + "step": 2960 + }, + { + "epoch": 0.06608948270769985, + "grad_norm": 0.6492586731910706, + "learning_rate": 1.978522596424777e-05, + "loss": 0.5158, + "step": 2965 + }, + { + "epoch": 0.06620093208831992, + "grad_norm": 0.6117286086082458, + "learning_rate": 1.978450360421339e-05, + "loss": 0.4592, + "step": 2970 + }, + { + "epoch": 0.06631238146893999, + "grad_norm": 0.5867244005203247, + "learning_rate": 1.9783780044674402e-05, + "loss": 0.2563, + "step": 2975 + }, + { + "epoch": 0.06642383084956005, + "grad_norm": 0.4272123873233795, + "learning_rate": 1.9783055285719498e-05, + "loss": 0.3653, + "step": 2980 + }, + { + "epoch": 0.06653528023018013, + "grad_norm": 0.5771167278289795, + "learning_rate": 1.9782329327437524e-05, + "loss": 0.3544, + "step": 2985 + }, + { + "epoch": 0.06664672961080019, + "grad_norm": 0.776767373085022, + "learning_rate": 1.9781602169917485e-05, + "loss": 0.5062, + "step": 2990 + }, + { + "epoch": 0.06675817899142025, + "grad_norm": 0.537855327129364, + "learning_rate": 1.9780873813248525e-05, + "loss": 0.4165, + "step": 2995 + }, + { + "epoch": 0.06686962837204032, + "grad_norm": 0.46249496936798096, + "learning_rate": 1.9780144257519928e-05, + "loss": 0.4141, + "step": 3000 + }, + { + "epoch": 0.06698107775266039, + "grad_norm": 0.6784607768058777, + "learning_rate": 1.977941350282114e-05, + "loss": 0.3485, + "step": 3005 + }, + { + "epoch": 0.06709252713328046, + "grad_norm": 0.3660302758216858, + "learning_rate": 1.977868154924174e-05, + "loss": 0.4052, + "step": 3010 + }, + { + "epoch": 0.06720397651390052, + "grad_norm": 0.5289759039878845, + "learning_rate": 1.9777948396871464e-05, + "loss": 0.3597, + "step": 3015 + }, + { + "epoch": 0.0673154258945206, + "grad_norm": 0.7300575971603394, + "learning_rate": 1.977721404580019e-05, + "loss": 0.5077, + "step": 3020 + }, + { + "epoch": 0.06742687527514066, + "grad_norm": 0.6065353155136108, + "learning_rate": 1.9776478496117937e-05, + "loss": 0.3955, + "step": 3025 + }, + { + "epoch": 0.06753832465576072, + "grad_norm": 0.3663788139820099, + "learning_rate": 1.9775741747914886e-05, + "loss": 0.3011, + "step": 3030 + }, + { + "epoch": 0.0676497740363808, + "grad_norm": 0.6221305131912231, + "learning_rate": 1.9775003801281355e-05, + "loss": 0.403, + "step": 3035 + }, + { + "epoch": 0.06776122341700086, + "grad_norm": 0.6479603052139282, + "learning_rate": 1.9774264656307805e-05, + "loss": 0.4332, + "step": 3040 + }, + { + "epoch": 0.06787267279762092, + "grad_norm": 0.5434275269508362, + "learning_rate": 1.9773524313084857e-05, + "loss": 0.4223, + "step": 3045 + }, + { + "epoch": 0.067984122178241, + "grad_norm": 0.5634822249412537, + "learning_rate": 1.977278277170327e-05, + "loss": 0.36, + "step": 3050 + }, + { + "epoch": 0.06809557155886106, + "grad_norm": 0.4549255073070526, + "learning_rate": 1.9772040032253947e-05, + "loss": 0.3791, + "step": 3055 + }, + { + "epoch": 0.06820702093948112, + "grad_norm": 1.17770516872406, + "learning_rate": 1.9771296094827948e-05, + "loss": 0.4246, + "step": 3060 + }, + { + "epoch": 0.0683184703201012, + "grad_norm": 0.6360183954238892, + "learning_rate": 1.9770550959516466e-05, + "loss": 0.5389, + "step": 3065 + }, + { + "epoch": 0.06842991970072126, + "grad_norm": 0.4954073131084442, + "learning_rate": 1.9769804626410856e-05, + "loss": 0.35, + "step": 3070 + }, + { + "epoch": 0.06854136908134133, + "grad_norm": 0.496670126914978, + "learning_rate": 1.976905709560261e-05, + "loss": 0.3511, + "step": 3075 + }, + { + "epoch": 0.0686528184619614, + "grad_norm": 0.574720561504364, + "learning_rate": 1.976830836718337e-05, + "loss": 0.4003, + "step": 3080 + }, + { + "epoch": 0.06876426784258147, + "grad_norm": 0.695163905620575, + "learning_rate": 1.976755844124492e-05, + "loss": 0.3023, + "step": 3085 + }, + { + "epoch": 0.06887571722320153, + "grad_norm": 0.7089998722076416, + "learning_rate": 1.9766807317879204e-05, + "loss": 0.3988, + "step": 3090 + }, + { + "epoch": 0.06898716660382159, + "grad_norm": 0.5017579197883606, + "learning_rate": 1.9766054997178297e-05, + "loss": 0.4265, + "step": 3095 + }, + { + "epoch": 0.06909861598444167, + "grad_norm": 0.43221917748451233, + "learning_rate": 1.9765301479234428e-05, + "loss": 0.452, + "step": 3100 + }, + { + "epoch": 0.06921006536506173, + "grad_norm": 0.385077565908432, + "learning_rate": 1.9764546764139978e-05, + "loss": 0.3379, + "step": 3105 + }, + { + "epoch": 0.0693215147456818, + "grad_norm": 0.5521353483200073, + "learning_rate": 1.9763790851987465e-05, + "loss": 0.4577, + "step": 3110 + }, + { + "epoch": 0.06943296412630187, + "grad_norm": 0.6436454653739929, + "learning_rate": 1.9763033742869556e-05, + "loss": 0.4443, + "step": 3115 + }, + { + "epoch": 0.06954441350692193, + "grad_norm": 0.48646727204322815, + "learning_rate": 1.976227543687907e-05, + "loss": 0.3216, + "step": 3120 + }, + { + "epoch": 0.069655862887542, + "grad_norm": 0.3888736665248871, + "learning_rate": 1.976151593410897e-05, + "loss": 0.4476, + "step": 3125 + }, + { + "epoch": 0.06976731226816207, + "grad_norm": 0.5393504500389099, + "learning_rate": 1.976075523465236e-05, + "loss": 0.4556, + "step": 3130 + }, + { + "epoch": 0.06987876164878214, + "grad_norm": 0.6784513592720032, + "learning_rate": 1.9759993338602506e-05, + "loss": 0.4144, + "step": 3135 + }, + { + "epoch": 0.0699902110294022, + "grad_norm": 0.665978729724884, + "learning_rate": 1.97592302460528e-05, + "loss": 0.4303, + "step": 3140 + }, + { + "epoch": 0.07010166041002228, + "grad_norm": 0.5168861746788025, + "learning_rate": 1.9758465957096796e-05, + "loss": 0.4082, + "step": 3145 + }, + { + "epoch": 0.07021310979064234, + "grad_norm": 0.5755696296691895, + "learning_rate": 1.975770047182819e-05, + "loss": 0.3753, + "step": 3150 + }, + { + "epoch": 0.0703245591712624, + "grad_norm": 0.6826760172843933, + "learning_rate": 1.9756933790340823e-05, + "loss": 0.4313, + "step": 3155 + }, + { + "epoch": 0.07043600855188248, + "grad_norm": 0.4687601923942566, + "learning_rate": 1.9756165912728687e-05, + "loss": 0.4579, + "step": 3160 + }, + { + "epoch": 0.07054745793250254, + "grad_norm": 0.5872018933296204, + "learning_rate": 1.975539683908591e-05, + "loss": 0.4517, + "step": 3165 + }, + { + "epoch": 0.0706589073131226, + "grad_norm": 0.6635053157806396, + "learning_rate": 1.9754626569506786e-05, + "loss": 0.3935, + "step": 3170 + }, + { + "epoch": 0.07077035669374268, + "grad_norm": 0.8101974129676819, + "learning_rate": 1.975385510408574e-05, + "loss": 0.4217, + "step": 3175 + }, + { + "epoch": 0.07088180607436274, + "grad_norm": 0.488703191280365, + "learning_rate": 1.9753082442917346e-05, + "loss": 0.3935, + "step": 3180 + }, + { + "epoch": 0.0709932554549828, + "grad_norm": 0.4950294494628906, + "learning_rate": 1.9752308586096326e-05, + "loss": 0.4707, + "step": 3185 + }, + { + "epoch": 0.07110470483560287, + "grad_norm": 0.5157277584075928, + "learning_rate": 1.975153353371755e-05, + "loss": 0.4748, + "step": 3190 + }, + { + "epoch": 0.07121615421622295, + "grad_norm": 0.5851638913154602, + "learning_rate": 1.9750757285876032e-05, + "loss": 0.4026, + "step": 3195 + }, + { + "epoch": 0.07132760359684301, + "grad_norm": 0.5148910880088806, + "learning_rate": 1.9749979842666934e-05, + "loss": 0.4397, + "step": 3200 + }, + { + "epoch": 0.07143905297746307, + "grad_norm": 0.504661500453949, + "learning_rate": 1.974920120418557e-05, + "loss": 0.4302, + "step": 3205 + }, + { + "epoch": 0.07155050235808315, + "grad_norm": 0.6598942279815674, + "learning_rate": 1.9748421370527383e-05, + "loss": 0.4024, + "step": 3210 + }, + { + "epoch": 0.07166195173870321, + "grad_norm": 0.5633904933929443, + "learning_rate": 1.974764034178799e-05, + "loss": 0.3654, + "step": 3215 + }, + { + "epoch": 0.07177340111932327, + "grad_norm": 0.5808898210525513, + "learning_rate": 1.974685811806313e-05, + "loss": 0.4784, + "step": 3220 + }, + { + "epoch": 0.07188485049994335, + "grad_norm": 0.6391786932945251, + "learning_rate": 1.9746074699448697e-05, + "loss": 0.4359, + "step": 3225 + }, + { + "epoch": 0.07199629988056341, + "grad_norm": 0.4155711233615875, + "learning_rate": 1.974529008604073e-05, + "loss": 0.4878, + "step": 3230 + }, + { + "epoch": 0.07210774926118348, + "grad_norm": 0.5104745626449585, + "learning_rate": 1.9744504277935425e-05, + "loss": 0.4309, + "step": 3235 + }, + { + "epoch": 0.07221919864180355, + "grad_norm": 0.7546597719192505, + "learning_rate": 1.9743717275229114e-05, + "loss": 0.3166, + "step": 3240 + }, + { + "epoch": 0.07233064802242362, + "grad_norm": 0.5601435303688049, + "learning_rate": 1.974292907801827e-05, + "loss": 0.4617, + "step": 3245 + }, + { + "epoch": 0.07244209740304368, + "grad_norm": 0.3191545009613037, + "learning_rate": 1.9742139686399527e-05, + "loss": 0.5098, + "step": 3250 + }, + { + "epoch": 0.07255354678366376, + "grad_norm": 0.4238179922103882, + "learning_rate": 1.974134910046966e-05, + "loss": 0.2844, + "step": 3255 + }, + { + "epoch": 0.07266499616428382, + "grad_norm": 0.5163947343826294, + "learning_rate": 1.9740557320325578e-05, + "loss": 0.348, + "step": 3260 + }, + { + "epoch": 0.07277644554490388, + "grad_norm": 0.5601313710212708, + "learning_rate": 1.973976434606436e-05, + "loss": 0.5619, + "step": 3265 + }, + { + "epoch": 0.07288789492552396, + "grad_norm": 0.6028194427490234, + "learning_rate": 1.9738970177783206e-05, + "loss": 0.4684, + "step": 3270 + }, + { + "epoch": 0.07299934430614402, + "grad_norm": 0.663697361946106, + "learning_rate": 1.9738174815579486e-05, + "loss": 0.5199, + "step": 3275 + }, + { + "epoch": 0.07311079368676408, + "grad_norm": 0.39535772800445557, + "learning_rate": 1.97373782595507e-05, + "loss": 0.2407, + "step": 3280 + }, + { + "epoch": 0.07322224306738415, + "grad_norm": 0.6364858746528625, + "learning_rate": 1.9736580509794503e-05, + "loss": 0.3724, + "step": 3285 + }, + { + "epoch": 0.07333369244800422, + "grad_norm": 0.5598808526992798, + "learning_rate": 1.973578156640869e-05, + "loss": 0.4292, + "step": 3290 + }, + { + "epoch": 0.07344514182862429, + "grad_norm": 0.5156586170196533, + "learning_rate": 1.97349814294912e-05, + "loss": 0.4021, + "step": 3295 + }, + { + "epoch": 0.07355659120924435, + "grad_norm": 0.6328551173210144, + "learning_rate": 1.9734180099140135e-05, + "loss": 0.4461, + "step": 3300 + }, + { + "epoch": 0.07366804058986443, + "grad_norm": 0.5238905549049377, + "learning_rate": 1.9733377575453724e-05, + "loss": 0.2508, + "step": 3305 + }, + { + "epoch": 0.07377948997048449, + "grad_norm": 0.5785579085350037, + "learning_rate": 1.9732573858530353e-05, + "loss": 0.4404, + "step": 3310 + }, + { + "epoch": 0.07389093935110455, + "grad_norm": 0.7258238196372986, + "learning_rate": 1.973176894846855e-05, + "loss": 0.5267, + "step": 3315 + }, + { + "epoch": 0.07400238873172463, + "grad_norm": 0.6168790459632874, + "learning_rate": 1.9730962845366993e-05, + "loss": 0.3609, + "step": 3320 + }, + { + "epoch": 0.07411383811234469, + "grad_norm": 0.6070130467414856, + "learning_rate": 1.9730155549324502e-05, + "loss": 0.3515, + "step": 3325 + }, + { + "epoch": 0.07422528749296475, + "grad_norm": 0.7390276789665222, + "learning_rate": 1.9729347060440046e-05, + "loss": 0.413, + "step": 3330 + }, + { + "epoch": 0.07433673687358483, + "grad_norm": 0.7556608319282532, + "learning_rate": 1.9728537378812738e-05, + "loss": 0.4181, + "step": 3335 + }, + { + "epoch": 0.07444818625420489, + "grad_norm": 0.38995063304901123, + "learning_rate": 1.9727726504541838e-05, + "loss": 0.4586, + "step": 3340 + }, + { + "epoch": 0.07455963563482496, + "grad_norm": 0.4750341773033142, + "learning_rate": 1.9726914437726763e-05, + "loss": 0.4457, + "step": 3345 + }, + { + "epoch": 0.07467108501544503, + "grad_norm": 0.6445577144622803, + "learning_rate": 1.972610117846705e-05, + "loss": 0.349, + "step": 3350 + }, + { + "epoch": 0.0747825343960651, + "grad_norm": 0.6933650970458984, + "learning_rate": 1.9725286726862412e-05, + "loss": 0.4251, + "step": 3355 + }, + { + "epoch": 0.07489398377668516, + "grad_norm": 0.4839894771575928, + "learning_rate": 1.972447108301269e-05, + "loss": 0.4288, + "step": 3360 + }, + { + "epoch": 0.07500543315730523, + "grad_norm": 0.4249872863292694, + "learning_rate": 1.9723654247017867e-05, + "loss": 0.3755, + "step": 3365 + }, + { + "epoch": 0.0751168825379253, + "grad_norm": 0.3455201983451843, + "learning_rate": 1.9722836218978094e-05, + "loss": 0.4046, + "step": 3370 + }, + { + "epoch": 0.07522833191854536, + "grad_norm": 0.5376846194267273, + "learning_rate": 1.972201699899365e-05, + "loss": 0.3663, + "step": 3375 + }, + { + "epoch": 0.07533978129916542, + "grad_norm": 0.48555299639701843, + "learning_rate": 1.9721196587164963e-05, + "loss": 0.2518, + "step": 3380 + }, + { + "epoch": 0.0754512306797855, + "grad_norm": 0.5334995985031128, + "learning_rate": 1.972037498359261e-05, + "loss": 0.4687, + "step": 3385 + }, + { + "epoch": 0.07556268006040556, + "grad_norm": 0.604278028011322, + "learning_rate": 1.9719552188377314e-05, + "loss": 0.3836, + "step": 3390 + }, + { + "epoch": 0.07567412944102563, + "grad_norm": 0.5833326578140259, + "learning_rate": 1.971872820161994e-05, + "loss": 0.4573, + "step": 3395 + }, + { + "epoch": 0.0757855788216457, + "grad_norm": 0.5681135654449463, + "learning_rate": 1.971790302342151e-05, + "loss": 0.4283, + "step": 3400 + }, + { + "epoch": 0.07589702820226576, + "grad_norm": 0.71799236536026, + "learning_rate": 1.971707665388318e-05, + "loss": 0.2283, + "step": 3405 + }, + { + "epoch": 0.07600847758288583, + "grad_norm": 0.3602961599826813, + "learning_rate": 1.9716249093106255e-05, + "loss": 0.4057, + "step": 3410 + }, + { + "epoch": 0.0761199269635059, + "grad_norm": 0.37014782428741455, + "learning_rate": 1.9715420341192192e-05, + "loss": 0.4281, + "step": 3415 + }, + { + "epoch": 0.07623137634412597, + "grad_norm": 0.6142109036445618, + "learning_rate": 1.971459039824258e-05, + "loss": 0.4084, + "step": 3420 + }, + { + "epoch": 0.07634282572474603, + "grad_norm": 0.5882206559181213, + "learning_rate": 1.9713759264359175e-05, + "loss": 0.3734, + "step": 3425 + }, + { + "epoch": 0.0764542751053661, + "grad_norm": 0.7564554810523987, + "learning_rate": 1.9712926939643864e-05, + "loss": 0.3457, + "step": 3430 + }, + { + "epoch": 0.07656572448598617, + "grad_norm": 0.5783476829528809, + "learning_rate": 1.9712093424198682e-05, + "loss": 0.4603, + "step": 3435 + }, + { + "epoch": 0.07667717386660623, + "grad_norm": 0.5960418581962585, + "learning_rate": 1.971125871812581e-05, + "loss": 0.4154, + "step": 3440 + }, + { + "epoch": 0.07678862324722631, + "grad_norm": 0.5827411413192749, + "learning_rate": 1.971042282152758e-05, + "loss": 0.4036, + "step": 3445 + }, + { + "epoch": 0.07690007262784637, + "grad_norm": 0.5955131649971008, + "learning_rate": 1.970958573450646e-05, + "loss": 0.5522, + "step": 3450 + }, + { + "epoch": 0.07701152200846643, + "grad_norm": 0.5336333513259888, + "learning_rate": 1.9708747457165083e-05, + "loss": 0.3985, + "step": 3455 + }, + { + "epoch": 0.07712297138908651, + "grad_norm": 0.5097236633300781, + "learning_rate": 1.9707907989606204e-05, + "loss": 0.407, + "step": 3460 + }, + { + "epoch": 0.07723442076970657, + "grad_norm": 0.6677082180976868, + "learning_rate": 1.970706733193274e-05, + "loss": 0.4819, + "step": 3465 + }, + { + "epoch": 0.07734587015032664, + "grad_norm": 0.4805845320224762, + "learning_rate": 1.9706225484247746e-05, + "loss": 0.4058, + "step": 3470 + }, + { + "epoch": 0.0774573195309467, + "grad_norm": 0.5422244668006897, + "learning_rate": 1.9705382446654432e-05, + "loss": 0.5239, + "step": 3475 + }, + { + "epoch": 0.07756876891156678, + "grad_norm": 0.6769611239433289, + "learning_rate": 1.9704538219256143e-05, + "loss": 0.386, + "step": 3480 + }, + { + "epoch": 0.07768021829218684, + "grad_norm": 0.447390079498291, + "learning_rate": 1.9703692802156373e-05, + "loss": 0.3574, + "step": 3485 + }, + { + "epoch": 0.0777916676728069, + "grad_norm": 0.6275390982627869, + "learning_rate": 1.9702846195458768e-05, + "loss": 0.4229, + "step": 3490 + }, + { + "epoch": 0.07790311705342698, + "grad_norm": 0.6409914493560791, + "learning_rate": 1.9701998399267116e-05, + "loss": 0.4774, + "step": 3495 + }, + { + "epoch": 0.07801456643404704, + "grad_norm": 0.7508968710899353, + "learning_rate": 1.9701149413685346e-05, + "loss": 0.5063, + "step": 3500 + }, + { + "epoch": 0.0781260158146671, + "grad_norm": 0.4104915261268616, + "learning_rate": 1.970029923881754e-05, + "loss": 0.386, + "step": 3505 + }, + { + "epoch": 0.07823746519528718, + "grad_norm": 0.4921630024909973, + "learning_rate": 1.969944787476792e-05, + "loss": 0.4103, + "step": 3510 + }, + { + "epoch": 0.07834891457590724, + "grad_norm": 0.6276881694793701, + "learning_rate": 1.9698595321640864e-05, + "loss": 0.3788, + "step": 3515 + }, + { + "epoch": 0.07846036395652731, + "grad_norm": 0.3884750008583069, + "learning_rate": 1.969774157954088e-05, + "loss": 0.4529, + "step": 3520 + }, + { + "epoch": 0.07857181333714738, + "grad_norm": 0.7288042902946472, + "learning_rate": 1.9696886648572632e-05, + "loss": 0.4934, + "step": 3525 + }, + { + "epoch": 0.07868326271776745, + "grad_norm": 0.7034710645675659, + "learning_rate": 1.9696030528840932e-05, + "loss": 0.42, + "step": 3530 + }, + { + "epoch": 0.07879471209838751, + "grad_norm": 0.5408551692962646, + "learning_rate": 1.9695173220450733e-05, + "loss": 0.5281, + "step": 3535 + }, + { + "epoch": 0.07890616147900759, + "grad_norm": 0.6238879561424255, + "learning_rate": 1.9694314723507128e-05, + "loss": 0.4802, + "step": 3540 + }, + { + "epoch": 0.07901761085962765, + "grad_norm": 0.5341750979423523, + "learning_rate": 1.969345503811537e-05, + "loss": 0.5339, + "step": 3545 + }, + { + "epoch": 0.07912906024024771, + "grad_norm": 0.6059457063674927, + "learning_rate": 1.969259416438084e-05, + "loss": 0.3619, + "step": 3550 + }, + { + "epoch": 0.07924050962086779, + "grad_norm": 0.5542227029800415, + "learning_rate": 1.9691732102409086e-05, + "loss": 0.4189, + "step": 3555 + }, + { + "epoch": 0.07935195900148785, + "grad_norm": 0.46328842639923096, + "learning_rate": 1.9690868852305782e-05, + "loss": 0.3014, + "step": 3560 + }, + { + "epoch": 0.07946340838210791, + "grad_norm": 0.5214948058128357, + "learning_rate": 1.9690004414176764e-05, + "loss": 0.4184, + "step": 3565 + }, + { + "epoch": 0.07957485776272798, + "grad_norm": 0.7112562656402588, + "learning_rate": 1.9689138788127994e-05, + "loss": 0.4326, + "step": 3570 + }, + { + "epoch": 0.07968630714334805, + "grad_norm": 0.8344321250915527, + "learning_rate": 1.9688271974265603e-05, + "loss": 0.4086, + "step": 3575 + }, + { + "epoch": 0.07979775652396812, + "grad_norm": 0.587770938873291, + "learning_rate": 1.9687403972695844e-05, + "loss": 0.3255, + "step": 3580 + }, + { + "epoch": 0.07990920590458818, + "grad_norm": 0.6727814674377441, + "learning_rate": 1.9686534783525136e-05, + "loss": 0.4838, + "step": 3585 + }, + { + "epoch": 0.08002065528520826, + "grad_norm": 0.7931737899780273, + "learning_rate": 1.9685664406860033e-05, + "loss": 0.5761, + "step": 3590 + }, + { + "epoch": 0.08013210466582832, + "grad_norm": 0.49970170855522156, + "learning_rate": 1.9684792842807235e-05, + "loss": 0.3854, + "step": 3595 + }, + { + "epoch": 0.08024355404644838, + "grad_norm": 0.5609394907951355, + "learning_rate": 1.968392009147359e-05, + "loss": 0.3726, + "step": 3600 + }, + { + "epoch": 0.08035500342706846, + "grad_norm": 0.6844983696937561, + "learning_rate": 1.968304615296609e-05, + "loss": 0.4116, + "step": 3605 + }, + { + "epoch": 0.08046645280768852, + "grad_norm": 0.6218194365501404, + "learning_rate": 1.9682171027391873e-05, + "loss": 0.4588, + "step": 3610 + }, + { + "epoch": 0.08057790218830858, + "grad_norm": 0.49494484066963196, + "learning_rate": 1.9681294714858224e-05, + "loss": 0.4596, + "step": 3615 + }, + { + "epoch": 0.08068935156892866, + "grad_norm": 0.6498164534568787, + "learning_rate": 1.9680417215472566e-05, + "loss": 0.3559, + "step": 3620 + }, + { + "epoch": 0.08080080094954872, + "grad_norm": 0.5791107416152954, + "learning_rate": 1.9679538529342487e-05, + "loss": 0.554, + "step": 3625 + }, + { + "epoch": 0.08091225033016879, + "grad_norm": 0.5231840014457703, + "learning_rate": 1.9678658656575692e-05, + "loss": 0.4091, + "step": 3630 + }, + { + "epoch": 0.08102369971078886, + "grad_norm": 0.6502806544303894, + "learning_rate": 1.9677777597280055e-05, + "loss": 0.3455, + "step": 3635 + }, + { + "epoch": 0.08113514909140893, + "grad_norm": 0.6524990200996399, + "learning_rate": 1.967689535156359e-05, + "loss": 0.3828, + "step": 3640 + }, + { + "epoch": 0.08124659847202899, + "grad_norm": 0.5977895855903625, + "learning_rate": 1.9676011919534447e-05, + "loss": 0.4543, + "step": 3645 + }, + { + "epoch": 0.08135804785264907, + "grad_norm": 0.6692437529563904, + "learning_rate": 1.9675127301300927e-05, + "loss": 0.3642, + "step": 3650 + }, + { + "epoch": 0.08146949723326913, + "grad_norm": 0.6440483927726746, + "learning_rate": 1.967424149697148e-05, + "loss": 0.47, + "step": 3655 + }, + { + "epoch": 0.08158094661388919, + "grad_norm": 0.5908140540122986, + "learning_rate": 1.9673354506654703e-05, + "loss": 0.424, + "step": 3660 + }, + { + "epoch": 0.08169239599450925, + "grad_norm": 0.5895307064056396, + "learning_rate": 1.967246633045933e-05, + "loss": 0.3909, + "step": 3665 + }, + { + "epoch": 0.08180384537512933, + "grad_norm": 0.5629228353500366, + "learning_rate": 1.967157696849424e-05, + "loss": 0.5223, + "step": 3670 + }, + { + "epoch": 0.0819152947557494, + "grad_norm": 0.645937979221344, + "learning_rate": 1.9670686420868472e-05, + "loss": 0.3936, + "step": 3675 + }, + { + "epoch": 0.08202674413636946, + "grad_norm": 0.5599381923675537, + "learning_rate": 1.9669794687691192e-05, + "loss": 0.4481, + "step": 3680 + }, + { + "epoch": 0.08213819351698953, + "grad_norm": 0.5673943758010864, + "learning_rate": 1.9668901769071723e-05, + "loss": 0.4666, + "step": 3685 + }, + { + "epoch": 0.0822496428976096, + "grad_norm": 0.5841967463493347, + "learning_rate": 1.966800766511953e-05, + "loss": 0.4553, + "step": 3690 + }, + { + "epoch": 0.08236109227822966, + "grad_norm": 0.6012502312660217, + "learning_rate": 1.9667112375944226e-05, + "loss": 0.4002, + "step": 3695 + }, + { + "epoch": 0.08247254165884974, + "grad_norm": 0.874318540096283, + "learning_rate": 1.966621590165556e-05, + "loss": 0.3171, + "step": 3700 + }, + { + "epoch": 0.0825839910394698, + "grad_norm": 0.5734549760818481, + "learning_rate": 1.9665318242363437e-05, + "loss": 0.4828, + "step": 3705 + }, + { + "epoch": 0.08269544042008986, + "grad_norm": 0.4378792345523834, + "learning_rate": 1.96644193981779e-05, + "loss": 0.458, + "step": 3710 + }, + { + "epoch": 0.08280688980070994, + "grad_norm": 0.5063863396644592, + "learning_rate": 1.9663519369209147e-05, + "loss": 0.4166, + "step": 3715 + }, + { + "epoch": 0.08291833918133, + "grad_norm": 0.7026726603507996, + "learning_rate": 1.9662618155567507e-05, + "loss": 0.4646, + "step": 3720 + }, + { + "epoch": 0.08302978856195006, + "grad_norm": 0.5438389778137207, + "learning_rate": 1.9661715757363467e-05, + "loss": 0.3754, + "step": 3725 + }, + { + "epoch": 0.08314123794257014, + "grad_norm": 0.687262773513794, + "learning_rate": 1.966081217470765e-05, + "loss": 0.2916, + "step": 3730 + }, + { + "epoch": 0.0832526873231902, + "grad_norm": 0.5479100942611694, + "learning_rate": 1.9659907407710836e-05, + "loss": 0.4759, + "step": 3735 + }, + { + "epoch": 0.08336413670381027, + "grad_norm": 0.39713597297668457, + "learning_rate": 1.965900145648393e-05, + "loss": 0.5065, + "step": 3740 + }, + { + "epoch": 0.08347558608443034, + "grad_norm": 0.48706549406051636, + "learning_rate": 1.9658094321138e-05, + "loss": 0.4385, + "step": 3745 + }, + { + "epoch": 0.0835870354650504, + "grad_norm": 0.45029741525650024, + "learning_rate": 1.9657186001784262e-05, + "loss": 0.4206, + "step": 3750 + }, + { + "epoch": 0.08369848484567047, + "grad_norm": 0.5025975108146667, + "learning_rate": 1.965627649853406e-05, + "loss": 0.3257, + "step": 3755 + }, + { + "epoch": 0.08380993422629053, + "grad_norm": 0.3801042437553406, + "learning_rate": 1.9655365811498894e-05, + "loss": 0.3938, + "step": 3760 + }, + { + "epoch": 0.08392138360691061, + "grad_norm": 0.5360175967216492, + "learning_rate": 1.9654453940790405e-05, + "loss": 0.5218, + "step": 3765 + }, + { + "epoch": 0.08403283298753067, + "grad_norm": 0.5435302257537842, + "learning_rate": 1.9653540886520387e-05, + "loss": 0.3871, + "step": 3770 + }, + { + "epoch": 0.08414428236815073, + "grad_norm": 0.545606255531311, + "learning_rate": 1.965262664880077e-05, + "loss": 0.3986, + "step": 3775 + }, + { + "epoch": 0.08425573174877081, + "grad_norm": 0.571175754070282, + "learning_rate": 1.9651711227743633e-05, + "loss": 0.4242, + "step": 3780 + }, + { + "epoch": 0.08436718112939087, + "grad_norm": 0.6667168140411377, + "learning_rate": 1.9650794623461198e-05, + "loss": 0.5204, + "step": 3785 + }, + { + "epoch": 0.08447863051001094, + "grad_norm": 0.5052681565284729, + "learning_rate": 1.9649876836065836e-05, + "loss": 0.395, + "step": 3790 + }, + { + "epoch": 0.08459007989063101, + "grad_norm": 0.6449118256568909, + "learning_rate": 1.9648957865670057e-05, + "loss": 0.451, + "step": 3795 + }, + { + "epoch": 0.08470152927125107, + "grad_norm": 0.4299212098121643, + "learning_rate": 1.9648037712386527e-05, + "loss": 0.4676, + "step": 3800 + }, + { + "epoch": 0.08481297865187114, + "grad_norm": 0.5785726308822632, + "learning_rate": 1.964711637632804e-05, + "loss": 0.4101, + "step": 3805 + }, + { + "epoch": 0.08492442803249121, + "grad_norm": 0.38612961769104004, + "learning_rate": 1.964619385760755e-05, + "loss": 0.4847, + "step": 3810 + }, + { + "epoch": 0.08503587741311128, + "grad_norm": 0.5214675664901733, + "learning_rate": 1.9645270156338153e-05, + "loss": 0.4197, + "step": 3815 + }, + { + "epoch": 0.08514732679373134, + "grad_norm": 0.5284664034843445, + "learning_rate": 1.9644345272633083e-05, + "loss": 0.3452, + "step": 3820 + }, + { + "epoch": 0.08525877617435142, + "grad_norm": 0.4067220985889435, + "learning_rate": 1.9643419206605726e-05, + "loss": 0.384, + "step": 3825 + }, + { + "epoch": 0.08537022555497148, + "grad_norm": 0.44208478927612305, + "learning_rate": 1.964249195836961e-05, + "loss": 0.4052, + "step": 3830 + }, + { + "epoch": 0.08548167493559154, + "grad_norm": 0.5821109414100647, + "learning_rate": 1.964156352803841e-05, + "loss": 0.5098, + "step": 3835 + }, + { + "epoch": 0.08559312431621162, + "grad_norm": 0.5415028929710388, + "learning_rate": 1.964063391572594e-05, + "loss": 0.3839, + "step": 3840 + }, + { + "epoch": 0.08570457369683168, + "grad_norm": 0.6177157163619995, + "learning_rate": 1.9639703121546168e-05, + "loss": 0.3516, + "step": 3845 + }, + { + "epoch": 0.08581602307745174, + "grad_norm": 0.5327863097190857, + "learning_rate": 1.9638771145613197e-05, + "loss": 0.2842, + "step": 3850 + }, + { + "epoch": 0.08592747245807181, + "grad_norm": 0.5437836050987244, + "learning_rate": 1.9637837988041288e-05, + "loss": 0.4526, + "step": 3855 + }, + { + "epoch": 0.08603892183869188, + "grad_norm": 0.49541836977005005, + "learning_rate": 1.9636903648944833e-05, + "loss": 0.584, + "step": 3860 + }, + { + "epoch": 0.08615037121931195, + "grad_norm": 0.4200514853000641, + "learning_rate": 1.9635968128438376e-05, + "loss": 0.4595, + "step": 3865 + }, + { + "epoch": 0.08626182059993201, + "grad_norm": 0.41504156589508057, + "learning_rate": 1.9635031426636603e-05, + "loss": 0.4522, + "step": 3870 + }, + { + "epoch": 0.08637326998055209, + "grad_norm": 0.5315532684326172, + "learning_rate": 1.9634093543654355e-05, + "loss": 0.4135, + "step": 3875 + }, + { + "epoch": 0.08648471936117215, + "grad_norm": 0.6478207111358643, + "learning_rate": 1.9633154479606597e-05, + "loss": 0.4747, + "step": 3880 + }, + { + "epoch": 0.08659616874179221, + "grad_norm": 0.593681275844574, + "learning_rate": 1.9632214234608455e-05, + "loss": 0.3559, + "step": 3885 + }, + { + "epoch": 0.08670761812241229, + "grad_norm": 0.6857460737228394, + "learning_rate": 1.9631272808775196e-05, + "loss": 0.3462, + "step": 3890 + }, + { + "epoch": 0.08681906750303235, + "grad_norm": 0.5769230723381042, + "learning_rate": 1.9630330202222238e-05, + "loss": 0.362, + "step": 3895 + }, + { + "epoch": 0.08693051688365241, + "grad_norm": 0.6051479578018188, + "learning_rate": 1.962938641506513e-05, + "loss": 0.4291, + "step": 3900 + }, + { + "epoch": 0.08704196626427249, + "grad_norm": 0.5377400517463684, + "learning_rate": 1.9628441447419573e-05, + "loss": 0.4074, + "step": 3905 + }, + { + "epoch": 0.08715341564489255, + "grad_norm": 0.4809214174747467, + "learning_rate": 1.9627495299401415e-05, + "loss": 0.3452, + "step": 3910 + }, + { + "epoch": 0.08726486502551262, + "grad_norm": 0.6425767540931702, + "learning_rate": 1.9626547971126646e-05, + "loss": 0.4252, + "step": 3915 + }, + { + "epoch": 0.0873763144061327, + "grad_norm": 0.47710588574409485, + "learning_rate": 1.9625599462711403e-05, + "loss": 0.3737, + "step": 3920 + }, + { + "epoch": 0.08748776378675276, + "grad_norm": 0.39346760511398315, + "learning_rate": 1.9624649774271962e-05, + "loss": 0.3413, + "step": 3925 + }, + { + "epoch": 0.08759921316737282, + "grad_norm": 0.6022504568099976, + "learning_rate": 1.9623698905924754e-05, + "loss": 0.3669, + "step": 3930 + }, + { + "epoch": 0.0877106625479929, + "grad_norm": 0.6236966252326965, + "learning_rate": 1.962274685778634e-05, + "loss": 0.4755, + "step": 3935 + }, + { + "epoch": 0.08782211192861296, + "grad_norm": 0.4284822642803192, + "learning_rate": 1.962179362997344e-05, + "loss": 0.5267, + "step": 3940 + }, + { + "epoch": 0.08793356130923302, + "grad_norm": 0.5431884527206421, + "learning_rate": 1.96208392226029e-05, + "loss": 0.3777, + "step": 3945 + }, + { + "epoch": 0.08804501068985308, + "grad_norm": 0.4668923318386078, + "learning_rate": 1.9619883635791745e-05, + "loss": 0.378, + "step": 3950 + }, + { + "epoch": 0.08815646007047316, + "grad_norm": 0.5473476648330688, + "learning_rate": 1.9618926869657103e-05, + "loss": 0.3816, + "step": 3955 + }, + { + "epoch": 0.08826790945109322, + "grad_norm": 0.5855584740638733, + "learning_rate": 1.961796892431628e-05, + "loss": 0.4891, + "step": 3960 + }, + { + "epoch": 0.08837935883171329, + "grad_norm": 0.587864100933075, + "learning_rate": 1.96170097998867e-05, + "loss": 0.5786, + "step": 3965 + }, + { + "epoch": 0.08849080821233336, + "grad_norm": 0.4843202233314514, + "learning_rate": 1.9616049496485954e-05, + "loss": 0.4364, + "step": 3970 + }, + { + "epoch": 0.08860225759295343, + "grad_norm": 0.5611356496810913, + "learning_rate": 1.9615088014231765e-05, + "loss": 0.4311, + "step": 3975 + }, + { + "epoch": 0.08871370697357349, + "grad_norm": 0.6092740893363953, + "learning_rate": 1.9614125353242e-05, + "loss": 0.3972, + "step": 3980 + }, + { + "epoch": 0.08882515635419357, + "grad_norm": 0.7620090842247009, + "learning_rate": 1.9613161513634678e-05, + "loss": 0.4782, + "step": 3985 + }, + { + "epoch": 0.08893660573481363, + "grad_norm": 0.47108519077301025, + "learning_rate": 1.9612196495527956e-05, + "loss": 0.4183, + "step": 3990 + }, + { + "epoch": 0.08904805511543369, + "grad_norm": 0.9083523154258728, + "learning_rate": 1.961123029904014e-05, + "loss": 0.539, + "step": 3995 + }, + { + "epoch": 0.08915950449605377, + "grad_norm": 0.485842227935791, + "learning_rate": 1.9610262924289674e-05, + "loss": 0.3816, + "step": 4000 + }, + { + "epoch": 0.08927095387667383, + "grad_norm": 0.4859134554862976, + "learning_rate": 1.9609294371395154e-05, + "loss": 0.4024, + "step": 4005 + }, + { + "epoch": 0.0893824032572939, + "grad_norm": 0.5020127892494202, + "learning_rate": 1.9608324640475315e-05, + "loss": 0.3913, + "step": 4010 + }, + { + "epoch": 0.08949385263791397, + "grad_norm": 0.5572119355201721, + "learning_rate": 1.9607353731649045e-05, + "loss": 0.346, + "step": 4015 + }, + { + "epoch": 0.08960530201853403, + "grad_norm": 0.6680156588554382, + "learning_rate": 1.960638164503536e-05, + "loss": 0.4647, + "step": 4020 + }, + { + "epoch": 0.0897167513991541, + "grad_norm": 0.48068952560424805, + "learning_rate": 1.9605408380753438e-05, + "loss": 0.4862, + "step": 4025 + }, + { + "epoch": 0.08982820077977417, + "grad_norm": 0.6120688915252686, + "learning_rate": 1.960443393892259e-05, + "loss": 0.4473, + "step": 4030 + }, + { + "epoch": 0.08993965016039424, + "grad_norm": 0.6175330877304077, + "learning_rate": 1.9603458319662274e-05, + "loss": 0.3948, + "step": 4035 + }, + { + "epoch": 0.0900510995410143, + "grad_norm": 0.49549204111099243, + "learning_rate": 1.9602481523092097e-05, + "loss": 0.307, + "step": 4040 + }, + { + "epoch": 0.09016254892163436, + "grad_norm": 0.5401844382286072, + "learning_rate": 1.9601503549331803e-05, + "loss": 0.4125, + "step": 4045 + }, + { + "epoch": 0.09027399830225444, + "grad_norm": 0.6865761876106262, + "learning_rate": 1.960052439850129e-05, + "loss": 0.3436, + "step": 4050 + }, + { + "epoch": 0.0903854476828745, + "grad_norm": 0.6297030448913574, + "learning_rate": 1.9599544070720588e-05, + "loss": 0.3901, + "step": 4055 + }, + { + "epoch": 0.09049689706349456, + "grad_norm": 0.4896922707557678, + "learning_rate": 1.959856256610988e-05, + "loss": 0.4191, + "step": 4060 + }, + { + "epoch": 0.09060834644411464, + "grad_norm": 0.38811594247817993, + "learning_rate": 1.959757988478949e-05, + "loss": 0.4305, + "step": 4065 + }, + { + "epoch": 0.0907197958247347, + "grad_norm": 0.8301783204078674, + "learning_rate": 1.9596596026879893e-05, + "loss": 0.4498, + "step": 4070 + }, + { + "epoch": 0.09083124520535477, + "grad_norm": 0.5923734307289124, + "learning_rate": 1.9595610992501694e-05, + "loss": 0.3545, + "step": 4075 + }, + { + "epoch": 0.09094269458597484, + "grad_norm": 0.4734951853752136, + "learning_rate": 1.9594624781775655e-05, + "loss": 0.3817, + "step": 4080 + }, + { + "epoch": 0.0910541439665949, + "grad_norm": 0.47061291337013245, + "learning_rate": 1.9593637394822673e-05, + "loss": 0.4529, + "step": 4085 + }, + { + "epoch": 0.09116559334721497, + "grad_norm": 0.5556663274765015, + "learning_rate": 1.9592648831763804e-05, + "loss": 0.4694, + "step": 4090 + }, + { + "epoch": 0.09127704272783504, + "grad_norm": 0.49430355429649353, + "learning_rate": 1.9591659092720226e-05, + "loss": 0.4425, + "step": 4095 + }, + { + "epoch": 0.09138849210845511, + "grad_norm": 0.6225961446762085, + "learning_rate": 1.9590668177813284e-05, + "loss": 0.4133, + "step": 4100 + }, + { + "epoch": 0.09149994148907517, + "grad_norm": 0.8296196460723877, + "learning_rate": 1.958967608716445e-05, + "loss": 0.599, + "step": 4105 + }, + { + "epoch": 0.09161139086969525, + "grad_norm": 0.4928753972053528, + "learning_rate": 1.9588682820895352e-05, + "loss": 0.4644, + "step": 4110 + }, + { + "epoch": 0.09172284025031531, + "grad_norm": 0.6599757671356201, + "learning_rate": 1.958768837912775e-05, + "loss": 0.3729, + "step": 4115 + }, + { + "epoch": 0.09183428963093537, + "grad_norm": 0.436012864112854, + "learning_rate": 1.958669276198356e-05, + "loss": 0.4988, + "step": 4120 + }, + { + "epoch": 0.09194573901155545, + "grad_norm": 0.41031414270401, + "learning_rate": 1.958569596958483e-05, + "loss": 0.2872, + "step": 4125 + }, + { + "epoch": 0.09205718839217551, + "grad_norm": 0.5444058775901794, + "learning_rate": 1.958469800205377e-05, + "loss": 0.3723, + "step": 4130 + }, + { + "epoch": 0.09216863777279558, + "grad_norm": 0.5348300337791443, + "learning_rate": 1.9583698859512715e-05, + "loss": 0.3589, + "step": 4135 + }, + { + "epoch": 0.09228008715341564, + "grad_norm": 0.6096326112747192, + "learning_rate": 1.958269854208416e-05, + "loss": 0.3855, + "step": 4140 + }, + { + "epoch": 0.09239153653403571, + "grad_norm": 0.4446529746055603, + "learning_rate": 1.9581697049890723e-05, + "loss": 0.4177, + "step": 4145 + }, + { + "epoch": 0.09250298591465578, + "grad_norm": 0.6660559177398682, + "learning_rate": 1.958069438305519e-05, + "loss": 0.4488, + "step": 4150 + }, + { + "epoch": 0.09261443529527584, + "grad_norm": 0.5148131847381592, + "learning_rate": 1.957969054170048e-05, + "loss": 0.4624, + "step": 4155 + }, + { + "epoch": 0.09272588467589592, + "grad_norm": 0.43481868505477905, + "learning_rate": 1.957868552594965e-05, + "loss": 0.3381, + "step": 4160 + }, + { + "epoch": 0.09283733405651598, + "grad_norm": 0.47481676936149597, + "learning_rate": 1.957767933592591e-05, + "loss": 0.4867, + "step": 4165 + }, + { + "epoch": 0.09294878343713604, + "grad_norm": 0.5280261039733887, + "learning_rate": 1.9576671971752615e-05, + "loss": 0.4098, + "step": 4170 + }, + { + "epoch": 0.09306023281775612, + "grad_norm": 0.5896614789962769, + "learning_rate": 1.9575663433553257e-05, + "loss": 0.4239, + "step": 4175 + }, + { + "epoch": 0.09317168219837618, + "grad_norm": 0.6356634497642517, + "learning_rate": 1.9574653721451472e-05, + "loss": 0.4121, + "step": 4180 + }, + { + "epoch": 0.09328313157899625, + "grad_norm": 0.6598286032676697, + "learning_rate": 1.9573642835571046e-05, + "loss": 0.3416, + "step": 4185 + }, + { + "epoch": 0.09339458095961632, + "grad_norm": 0.506007194519043, + "learning_rate": 1.9572630776035904e-05, + "loss": 0.2665, + "step": 4190 + }, + { + "epoch": 0.09350603034023638, + "grad_norm": 0.4136298894882202, + "learning_rate": 1.9571617542970122e-05, + "loss": 0.4143, + "step": 4195 + }, + { + "epoch": 0.09361747972085645, + "grad_norm": 0.7333647608757019, + "learning_rate": 1.957060313649791e-05, + "loss": 0.506, + "step": 4200 + }, + { + "epoch": 0.09372892910147652, + "grad_norm": 0.505630612373352, + "learning_rate": 1.9569587556743627e-05, + "loss": 0.4094, + "step": 4205 + }, + { + "epoch": 0.09384037848209659, + "grad_norm": 0.5177976489067078, + "learning_rate": 1.9568570803831776e-05, + "loss": 0.2533, + "step": 4210 + }, + { + "epoch": 0.09395182786271665, + "grad_norm": 0.43218210339546204, + "learning_rate": 1.9567552877887e-05, + "loss": 0.3037, + "step": 4215 + }, + { + "epoch": 0.09406327724333673, + "grad_norm": 0.6493254899978638, + "learning_rate": 1.9566533779034094e-05, + "loss": 0.5223, + "step": 4220 + }, + { + "epoch": 0.09417472662395679, + "grad_norm": 0.5709115862846375, + "learning_rate": 1.9565513507397987e-05, + "loss": 0.3979, + "step": 4225 + }, + { + "epoch": 0.09428617600457685, + "grad_norm": 0.5558260083198547, + "learning_rate": 1.9564492063103762e-05, + "loss": 0.5181, + "step": 4230 + }, + { + "epoch": 0.09439762538519691, + "grad_norm": 0.5373645424842834, + "learning_rate": 1.9563469446276634e-05, + "loss": 0.441, + "step": 4235 + }, + { + "epoch": 0.09450907476581699, + "grad_norm": 0.5070239901542664, + "learning_rate": 1.9562445657041967e-05, + "loss": 0.4075, + "step": 4240 + }, + { + "epoch": 0.09462052414643705, + "grad_norm": 0.510416567325592, + "learning_rate": 1.956142069552528e-05, + "loss": 0.4303, + "step": 4245 + }, + { + "epoch": 0.09473197352705712, + "grad_norm": 0.6932322978973389, + "learning_rate": 1.9560394561852214e-05, + "loss": 0.4119, + "step": 4250 + }, + { + "epoch": 0.0948434229076772, + "grad_norm": 0.5977771878242493, + "learning_rate": 1.955936725614857e-05, + "loss": 0.2711, + "step": 4255 + }, + { + "epoch": 0.09495487228829726, + "grad_norm": 0.42069804668426514, + "learning_rate": 1.955833877854029e-05, + "loss": 0.4121, + "step": 4260 + }, + { + "epoch": 0.09506632166891732, + "grad_norm": 0.6174266934394836, + "learning_rate": 1.9557309129153454e-05, + "loss": 0.3022, + "step": 4265 + }, + { + "epoch": 0.0951777710495374, + "grad_norm": 0.8936811685562134, + "learning_rate": 1.9556278308114287e-05, + "loss": 0.4353, + "step": 4270 + }, + { + "epoch": 0.09528922043015746, + "grad_norm": 0.6502985954284668, + "learning_rate": 1.9555246315549166e-05, + "loss": 0.3721, + "step": 4275 + }, + { + "epoch": 0.09540066981077752, + "grad_norm": 0.4536944329738617, + "learning_rate": 1.95542131515846e-05, + "loss": 0.4085, + "step": 4280 + }, + { + "epoch": 0.0955121191913976, + "grad_norm": 0.5095523595809937, + "learning_rate": 1.955317881634725e-05, + "loss": 0.3132, + "step": 4285 + }, + { + "epoch": 0.09562356857201766, + "grad_norm": 0.4880094826221466, + "learning_rate": 1.9552143309963917e-05, + "loss": 0.4069, + "step": 4290 + }, + { + "epoch": 0.09573501795263772, + "grad_norm": 1.030686855316162, + "learning_rate": 1.955110663256154e-05, + "loss": 0.422, + "step": 4295 + }, + { + "epoch": 0.0958464673332578, + "grad_norm": 0.6700113415718079, + "learning_rate": 1.9550068784267217e-05, + "loss": 0.3764, + "step": 4300 + }, + { + "epoch": 0.09595791671387786, + "grad_norm": 0.5926774740219116, + "learning_rate": 1.9549029765208177e-05, + "loss": 0.5163, + "step": 4305 + }, + { + "epoch": 0.09606936609449793, + "grad_norm": 0.7712245583534241, + "learning_rate": 1.9547989575511796e-05, + "loss": 0.3977, + "step": 4310 + }, + { + "epoch": 0.096180815475118, + "grad_norm": 0.4940590560436249, + "learning_rate": 1.954694821530559e-05, + "loss": 0.466, + "step": 4315 + }, + { + "epoch": 0.09629226485573807, + "grad_norm": 0.36600184440612793, + "learning_rate": 1.9545905684717222e-05, + "loss": 0.3899, + "step": 4320 + }, + { + "epoch": 0.09640371423635813, + "grad_norm": 0.7541760802268982, + "learning_rate": 1.9544861983874504e-05, + "loss": 0.3956, + "step": 4325 + }, + { + "epoch": 0.09651516361697819, + "grad_norm": 0.4955383539199829, + "learning_rate": 1.9543817112905383e-05, + "loss": 0.4192, + "step": 4330 + }, + { + "epoch": 0.09662661299759827, + "grad_norm": 0.5017595291137695, + "learning_rate": 1.954277107193795e-05, + "loss": 0.3995, + "step": 4335 + }, + { + "epoch": 0.09673806237821833, + "grad_norm": 0.5852905511856079, + "learning_rate": 1.954172386110044e-05, + "loss": 0.4308, + "step": 4340 + }, + { + "epoch": 0.0968495117588384, + "grad_norm": 0.5507896542549133, + "learning_rate": 1.9540675480521234e-05, + "loss": 0.4659, + "step": 4345 + }, + { + "epoch": 0.09696096113945847, + "grad_norm": 0.5486690402030945, + "learning_rate": 1.953962593032886e-05, + "loss": 0.4153, + "step": 4350 + }, + { + "epoch": 0.09707241052007853, + "grad_norm": 0.5381020307540894, + "learning_rate": 1.953857521065198e-05, + "loss": 0.3121, + "step": 4355 + }, + { + "epoch": 0.0971838599006986, + "grad_norm": 0.5972719192504883, + "learning_rate": 1.9537523321619406e-05, + "loss": 0.4231, + "step": 4360 + }, + { + "epoch": 0.09729530928131867, + "grad_norm": 0.6945480108261108, + "learning_rate": 1.9536470263360093e-05, + "loss": 0.4381, + "step": 4365 + }, + { + "epoch": 0.09740675866193874, + "grad_norm": 0.4568721354007721, + "learning_rate": 1.9535416036003132e-05, + "loss": 0.3924, + "step": 4370 + }, + { + "epoch": 0.0975182080425588, + "grad_norm": 0.4959554374217987, + "learning_rate": 1.953436063967777e-05, + "loss": 0.3647, + "step": 4375 + }, + { + "epoch": 0.09762965742317888, + "grad_norm": 0.4809156060218811, + "learning_rate": 1.953330407451339e-05, + "loss": 0.4009, + "step": 4380 + }, + { + "epoch": 0.09774110680379894, + "grad_norm": 0.5318039655685425, + "learning_rate": 1.953224634063951e-05, + "loss": 0.4031, + "step": 4385 + }, + { + "epoch": 0.097852556184419, + "grad_norm": 0.6752211451530457, + "learning_rate": 1.9531187438185812e-05, + "loss": 0.342, + "step": 4390 + }, + { + "epoch": 0.09796400556503908, + "grad_norm": 0.4800848364830017, + "learning_rate": 1.95301273672821e-05, + "loss": 0.3931, + "step": 4395 + }, + { + "epoch": 0.09807545494565914, + "grad_norm": 0.5006334185600281, + "learning_rate": 1.9529066128058333e-05, + "loss": 0.3935, + "step": 4400 + }, + { + "epoch": 0.0981869043262792, + "grad_norm": 0.7476497292518616, + "learning_rate": 1.9528003720644615e-05, + "loss": 0.4759, + "step": 4405 + }, + { + "epoch": 0.09829835370689928, + "grad_norm": 0.5515742897987366, + "learning_rate": 1.9526940145171185e-05, + "loss": 0.3507, + "step": 4410 + }, + { + "epoch": 0.09840980308751934, + "grad_norm": 0.44880327582359314, + "learning_rate": 1.952587540176843e-05, + "loss": 0.4261, + "step": 4415 + }, + { + "epoch": 0.0985212524681394, + "grad_norm": 0.4958060681819916, + "learning_rate": 1.9524809490566878e-05, + "loss": 0.3289, + "step": 4420 + }, + { + "epoch": 0.09863270184875947, + "grad_norm": 0.4644846022129059, + "learning_rate": 1.9523742411697205e-05, + "loss": 0.5159, + "step": 4425 + }, + { + "epoch": 0.09874415122937955, + "grad_norm": 0.5666079521179199, + "learning_rate": 1.952267416529022e-05, + "loss": 0.3887, + "step": 4430 + }, + { + "epoch": 0.09885560060999961, + "grad_norm": 0.8068232536315918, + "learning_rate": 1.952160475147689e-05, + "loss": 0.4436, + "step": 4435 + }, + { + "epoch": 0.09896704999061967, + "grad_norm": 0.6327325701713562, + "learning_rate": 1.9520534170388314e-05, + "loss": 0.386, + "step": 4440 + }, + { + "epoch": 0.09907849937123975, + "grad_norm": 0.583257794380188, + "learning_rate": 1.9519462422155733e-05, + "loss": 0.4371, + "step": 4445 + }, + { + "epoch": 0.09918994875185981, + "grad_norm": 0.42879775166511536, + "learning_rate": 1.951838950691054e-05, + "loss": 0.38, + "step": 4450 + }, + { + "epoch": 0.09930139813247987, + "grad_norm": 0.5124005079269409, + "learning_rate": 1.9517315424784263e-05, + "loss": 0.4692, + "step": 4455 + }, + { + "epoch": 0.09941284751309995, + "grad_norm": 0.4689030945301056, + "learning_rate": 1.9516240175908578e-05, + "loss": 0.4573, + "step": 4460 + }, + { + "epoch": 0.09952429689372001, + "grad_norm": 0.5528591871261597, + "learning_rate": 1.95151637604153e-05, + "loss": 0.4225, + "step": 4465 + }, + { + "epoch": 0.09963574627434008, + "grad_norm": 0.5473395586013794, + "learning_rate": 1.9514086178436393e-05, + "loss": 0.4231, + "step": 4470 + }, + { + "epoch": 0.09974719565496015, + "grad_norm": 0.5362890958786011, + "learning_rate": 1.9513007430103954e-05, + "loss": 0.5017, + "step": 4475 + }, + { + "epoch": 0.09985864503558022, + "grad_norm": 0.5761915445327759, + "learning_rate": 1.9511927515550235e-05, + "loss": 0.3971, + "step": 4480 + }, + { + "epoch": 0.09997009441620028, + "grad_norm": 0.7011691331863403, + "learning_rate": 1.9510846434907626e-05, + "loss": 0.4316, + "step": 4485 + }, + { + "epoch": 0.10008154379682035, + "grad_norm": 0.40270212292671204, + "learning_rate": 1.9509764188308654e-05, + "loss": 0.3871, + "step": 4490 + }, + { + "epoch": 0.10019299317744042, + "grad_norm": 0.7045685052871704, + "learning_rate": 1.9508680775886e-05, + "loss": 0.3752, + "step": 4495 + }, + { + "epoch": 0.10030444255806048, + "grad_norm": 0.4979664087295532, + "learning_rate": 1.9507596197772474e-05, + "loss": 0.4033, + "step": 4500 + }, + { + "epoch": 0.10041589193868056, + "grad_norm": 0.6763628125190735, + "learning_rate": 1.9506510454101045e-05, + "loss": 0.3748, + "step": 4505 + }, + { + "epoch": 0.10052734131930062, + "grad_norm": 0.7423232793807983, + "learning_rate": 1.950542354500481e-05, + "loss": 0.408, + "step": 4510 + }, + { + "epoch": 0.10063879069992068, + "grad_norm": 0.48339027166366577, + "learning_rate": 1.9504335470617023e-05, + "loss": 0.4648, + "step": 4515 + }, + { + "epoch": 0.10075024008054075, + "grad_norm": 0.6129444241523743, + "learning_rate": 1.9503246231071068e-05, + "loss": 0.4098, + "step": 4520 + }, + { + "epoch": 0.10086168946116082, + "grad_norm": 0.5484494566917419, + "learning_rate": 1.9502155826500477e-05, + "loss": 0.3202, + "step": 4525 + }, + { + "epoch": 0.10097313884178089, + "grad_norm": 0.40450939536094666, + "learning_rate": 1.9501064257038928e-05, + "loss": 0.3256, + "step": 4530 + }, + { + "epoch": 0.10108458822240095, + "grad_norm": 0.5657923221588135, + "learning_rate": 1.9499971522820238e-05, + "loss": 0.4207, + "step": 4535 + }, + { + "epoch": 0.10119603760302102, + "grad_norm": 0.4858623445034027, + "learning_rate": 1.949887762397837e-05, + "loss": 0.4609, + "step": 4540 + }, + { + "epoch": 0.10130748698364109, + "grad_norm": 0.5333701372146606, + "learning_rate": 1.9497782560647424e-05, + "loss": 0.3616, + "step": 4545 + }, + { + "epoch": 0.10141893636426115, + "grad_norm": 0.5602688789367676, + "learning_rate": 1.9496686332961646e-05, + "loss": 0.3678, + "step": 4550 + }, + { + "epoch": 0.10153038574488123, + "grad_norm": 0.3975955545902252, + "learning_rate": 1.9495588941055428e-05, + "loss": 0.3447, + "step": 4555 + }, + { + "epoch": 0.10164183512550129, + "grad_norm": 0.48473915457725525, + "learning_rate": 1.9494490385063303e-05, + "loss": 0.2937, + "step": 4560 + }, + { + "epoch": 0.10175328450612135, + "grad_norm": 0.5309877991676331, + "learning_rate": 1.949339066511994e-05, + "loss": 0.4019, + "step": 4565 + }, + { + "epoch": 0.10186473388674143, + "grad_norm": 0.5357742309570312, + "learning_rate": 1.9492289781360158e-05, + "loss": 0.3802, + "step": 4570 + }, + { + "epoch": 0.10197618326736149, + "grad_norm": 0.6399753093719482, + "learning_rate": 1.949118773391892e-05, + "loss": 0.3296, + "step": 4575 + }, + { + "epoch": 0.10208763264798155, + "grad_norm": 0.48185470700263977, + "learning_rate": 1.9490084522931326e-05, + "loss": 0.3292, + "step": 4580 + }, + { + "epoch": 0.10219908202860163, + "grad_norm": 0.4180678725242615, + "learning_rate": 1.9488980148532622e-05, + "loss": 0.3201, + "step": 4585 + }, + { + "epoch": 0.1023105314092217, + "grad_norm": 0.5125147104263306, + "learning_rate": 1.94878746108582e-05, + "loss": 0.3645, + "step": 4590 + }, + { + "epoch": 0.10242198078984176, + "grad_norm": 0.999405562877655, + "learning_rate": 1.9486767910043577e-05, + "loss": 0.4721, + "step": 4595 + }, + { + "epoch": 0.10253343017046183, + "grad_norm": 0.5479783415794373, + "learning_rate": 1.948566004622444e-05, + "loss": 0.4055, + "step": 4600 + }, + { + "epoch": 0.1026448795510819, + "grad_norm": 0.5214530825614929, + "learning_rate": 1.94845510195366e-05, + "loss": 0.4083, + "step": 4605 + }, + { + "epoch": 0.10275632893170196, + "grad_norm": 0.5216266512870789, + "learning_rate": 1.9483440830116015e-05, + "loss": 0.3945, + "step": 4610 + }, + { + "epoch": 0.10286777831232202, + "grad_norm": 0.6572335362434387, + "learning_rate": 1.948232947809878e-05, + "loss": 0.4189, + "step": 4615 + }, + { + "epoch": 0.1029792276929421, + "grad_norm": 0.4009169042110443, + "learning_rate": 1.9481216963621147e-05, + "loss": 0.323, + "step": 4620 + }, + { + "epoch": 0.10309067707356216, + "grad_norm": 0.5584750175476074, + "learning_rate": 1.94801032868195e-05, + "loss": 0.3341, + "step": 4625 + }, + { + "epoch": 0.10320212645418222, + "grad_norm": 0.7415927648544312, + "learning_rate": 1.947898844783036e-05, + "loss": 0.3254, + "step": 4630 + }, + { + "epoch": 0.1033135758348023, + "grad_norm": 0.521470308303833, + "learning_rate": 1.9477872446790407e-05, + "loss": 0.3841, + "step": 4635 + }, + { + "epoch": 0.10342502521542236, + "grad_norm": 0.49424269795417786, + "learning_rate": 1.9476755283836448e-05, + "loss": 0.3904, + "step": 4640 + }, + { + "epoch": 0.10353647459604243, + "grad_norm": 0.44216257333755493, + "learning_rate": 1.947563695910544e-05, + "loss": 0.4693, + "step": 4645 + }, + { + "epoch": 0.1036479239766625, + "grad_norm": 0.36706945300102234, + "learning_rate": 1.9474517472734483e-05, + "loss": 0.3081, + "step": 4650 + }, + { + "epoch": 0.10375937335728257, + "grad_norm": 0.7147274017333984, + "learning_rate": 1.9473396824860818e-05, + "loss": 0.4634, + "step": 4655 + }, + { + "epoch": 0.10387082273790263, + "grad_norm": 0.4979982078075409, + "learning_rate": 1.9472275015621823e-05, + "loss": 0.398, + "step": 4660 + }, + { + "epoch": 0.1039822721185227, + "grad_norm": 0.5600672364234924, + "learning_rate": 1.9471152045155028e-05, + "loss": 0.4202, + "step": 4665 + }, + { + "epoch": 0.10409372149914277, + "grad_norm": 0.6161574721336365, + "learning_rate": 1.9470027913598094e-05, + "loss": 0.3974, + "step": 4670 + }, + { + "epoch": 0.10420517087976283, + "grad_norm": 1.0517438650131226, + "learning_rate": 1.9468902621088838e-05, + "loss": 0.4727, + "step": 4675 + }, + { + "epoch": 0.10431662026038291, + "grad_norm": 0.7160767912864685, + "learning_rate": 1.946777616776521e-05, + "loss": 0.3719, + "step": 4680 + }, + { + "epoch": 0.10442806964100297, + "grad_norm": 0.561278223991394, + "learning_rate": 1.94666485537653e-05, + "loss": 0.4472, + "step": 4685 + }, + { + "epoch": 0.10453951902162303, + "grad_norm": 0.5287666320800781, + "learning_rate": 1.9465519779227354e-05, + "loss": 0.5288, + "step": 4690 + }, + { + "epoch": 0.10465096840224311, + "grad_norm": 0.6254017949104309, + "learning_rate": 1.9464389844289742e-05, + "loss": 0.4221, + "step": 4695 + }, + { + "epoch": 0.10476241778286317, + "grad_norm": 0.4677393436431885, + "learning_rate": 1.946325874909099e-05, + "loss": 0.3627, + "step": 4700 + }, + { + "epoch": 0.10487386716348324, + "grad_norm": 0.37834852933883667, + "learning_rate": 1.946212649376976e-05, + "loss": 0.4134, + "step": 4705 + }, + { + "epoch": 0.1049853165441033, + "grad_norm": 0.4782722592353821, + "learning_rate": 1.946099307846486e-05, + "loss": 0.3583, + "step": 4710 + }, + { + "epoch": 0.10509676592472338, + "grad_norm": 0.6644002795219421, + "learning_rate": 1.9459858503315236e-05, + "loss": 0.5085, + "step": 4715 + }, + { + "epoch": 0.10520821530534344, + "grad_norm": 0.5693483948707581, + "learning_rate": 1.9458722768459976e-05, + "loss": 0.5201, + "step": 4720 + }, + { + "epoch": 0.1053196646859635, + "grad_norm": 0.5851562023162842, + "learning_rate": 1.9457585874038316e-05, + "loss": 0.4057, + "step": 4725 + }, + { + "epoch": 0.10543111406658358, + "grad_norm": 0.4632630944252014, + "learning_rate": 1.9456447820189634e-05, + "loss": 0.4006, + "step": 4730 + }, + { + "epoch": 0.10554256344720364, + "grad_norm": 0.49858537316322327, + "learning_rate": 1.9455308607053435e-05, + "loss": 0.4134, + "step": 4735 + }, + { + "epoch": 0.1056540128278237, + "grad_norm": 0.6325331330299377, + "learning_rate": 1.9454168234769388e-05, + "loss": 0.3621, + "step": 4740 + }, + { + "epoch": 0.10576546220844378, + "grad_norm": 0.528812825679779, + "learning_rate": 1.9453026703477288e-05, + "loss": 0.285, + "step": 4745 + }, + { + "epoch": 0.10587691158906384, + "grad_norm": 0.5538693070411682, + "learning_rate": 1.9451884013317078e-05, + "loss": 0.398, + "step": 4750 + }, + { + "epoch": 0.1059883609696839, + "grad_norm": 0.7025234699249268, + "learning_rate": 1.945074016442885e-05, + "loss": 0.4313, + "step": 4755 + }, + { + "epoch": 0.10609981035030398, + "grad_norm": 0.5137503147125244, + "learning_rate": 1.9449595156952827e-05, + "loss": 0.3942, + "step": 4760 + }, + { + "epoch": 0.10621125973092405, + "grad_norm": 0.5194128155708313, + "learning_rate": 1.9448448991029375e-05, + "loss": 0.5149, + "step": 4765 + }, + { + "epoch": 0.10632270911154411, + "grad_norm": 0.5486317276954651, + "learning_rate": 1.944730166679901e-05, + "loss": 0.3688, + "step": 4770 + }, + { + "epoch": 0.10643415849216419, + "grad_norm": 0.4897007942199707, + "learning_rate": 1.944615318440238e-05, + "loss": 0.3045, + "step": 4775 + }, + { + "epoch": 0.10654560787278425, + "grad_norm": 0.5833176970481873, + "learning_rate": 1.9445003543980282e-05, + "loss": 0.6282, + "step": 4780 + }, + { + "epoch": 0.10665705725340431, + "grad_norm": 0.5699679851531982, + "learning_rate": 1.944385274567366e-05, + "loss": 0.4778, + "step": 4785 + }, + { + "epoch": 0.10676850663402439, + "grad_norm": 0.405729740858078, + "learning_rate": 1.9442700789623578e-05, + "loss": 0.4771, + "step": 4790 + }, + { + "epoch": 0.10687995601464445, + "grad_norm": 0.5367416739463806, + "learning_rate": 1.944154767597127e-05, + "loss": 0.4213, + "step": 4795 + }, + { + "epoch": 0.10699140539526451, + "grad_norm": 0.9690253734588623, + "learning_rate": 1.9440393404858095e-05, + "loss": 0.4605, + "step": 4800 + }, + { + "epoch": 0.10710285477588458, + "grad_norm": 0.31591367721557617, + "learning_rate": 1.9439237976425554e-05, + "loss": 0.3637, + "step": 4805 + }, + { + "epoch": 0.10721430415650465, + "grad_norm": 0.6415589451789856, + "learning_rate": 1.94380813908153e-05, + "loss": 0.4701, + "step": 4810 + }, + { + "epoch": 0.10732575353712472, + "grad_norm": 0.5109201669692993, + "learning_rate": 1.943692364816912e-05, + "loss": 0.5069, + "step": 4815 + }, + { + "epoch": 0.10743720291774478, + "grad_norm": 0.6550776958465576, + "learning_rate": 1.9435764748628938e-05, + "loss": 0.3794, + "step": 4820 + }, + { + "epoch": 0.10754865229836486, + "grad_norm": 0.5925948023796082, + "learning_rate": 1.9434604692336833e-05, + "loss": 0.4779, + "step": 4825 + }, + { + "epoch": 0.10766010167898492, + "grad_norm": 0.6550899744033813, + "learning_rate": 1.9433443479435017e-05, + "loss": 0.4021, + "step": 4830 + }, + { + "epoch": 0.10777155105960498, + "grad_norm": 0.5794579386711121, + "learning_rate": 1.9432281110065845e-05, + "loss": 0.5415, + "step": 4835 + }, + { + "epoch": 0.10788300044022506, + "grad_norm": 0.5639534592628479, + "learning_rate": 1.9431117584371816e-05, + "loss": 0.39, + "step": 4840 + }, + { + "epoch": 0.10799444982084512, + "grad_norm": 0.43835127353668213, + "learning_rate": 1.9429952902495564e-05, + "loss": 0.3971, + "step": 4845 + }, + { + "epoch": 0.10810589920146518, + "grad_norm": 0.672596275806427, + "learning_rate": 1.942878706457988e-05, + "loss": 0.4264, + "step": 4850 + }, + { + "epoch": 0.10821734858208526, + "grad_norm": 0.6578832268714905, + "learning_rate": 1.942762007076768e-05, + "loss": 0.449, + "step": 4855 + }, + { + "epoch": 0.10832879796270532, + "grad_norm": 0.463405042886734, + "learning_rate": 1.9426451921202026e-05, + "loss": 0.4623, + "step": 4860 + }, + { + "epoch": 0.10844024734332539, + "grad_norm": 0.48977985978126526, + "learning_rate": 1.942528261602613e-05, + "loss": 0.406, + "step": 4865 + }, + { + "epoch": 0.10855169672394546, + "grad_norm": 0.47064387798309326, + "learning_rate": 1.9424112155383334e-05, + "loss": 0.3864, + "step": 4870 + }, + { + "epoch": 0.10866314610456553, + "grad_norm": 0.4514538049697876, + "learning_rate": 1.9422940539417133e-05, + "loss": 0.4726, + "step": 4875 + }, + { + "epoch": 0.10877459548518559, + "grad_norm": 0.3914186656475067, + "learning_rate": 1.9421767768271156e-05, + "loss": 0.4734, + "step": 4880 + }, + { + "epoch": 0.10888604486580566, + "grad_norm": 0.6711552739143372, + "learning_rate": 1.9420593842089178e-05, + "loss": 0.4208, + "step": 4885 + }, + { + "epoch": 0.10899749424642573, + "grad_norm": 0.598562479019165, + "learning_rate": 1.9419418761015107e-05, + "loss": 0.632, + "step": 4890 + }, + { + "epoch": 0.10910894362704579, + "grad_norm": 0.5351366996765137, + "learning_rate": 1.9418242525193002e-05, + "loss": 0.3546, + "step": 4895 + }, + { + "epoch": 0.10922039300766585, + "grad_norm": 0.546393871307373, + "learning_rate": 1.9417065134767067e-05, + "loss": 0.3914, + "step": 4900 + }, + { + "epoch": 0.10933184238828593, + "grad_norm": 0.5991262197494507, + "learning_rate": 1.941588658988163e-05, + "loss": 0.4898, + "step": 4905 + }, + { + "epoch": 0.10944329176890599, + "grad_norm": 0.6237379908561707, + "learning_rate": 1.9414706890681177e-05, + "loss": 0.3676, + "step": 4910 + }, + { + "epoch": 0.10955474114952606, + "grad_norm": 0.5884041786193848, + "learning_rate": 1.9413526037310332e-05, + "loss": 0.4046, + "step": 4915 + }, + { + "epoch": 0.10966619053014613, + "grad_norm": 0.5601520538330078, + "learning_rate": 1.9412344029913854e-05, + "loss": 0.3836, + "step": 4920 + }, + { + "epoch": 0.1097776399107662, + "grad_norm": 0.5114838480949402, + "learning_rate": 1.941116086863665e-05, + "loss": 0.4927, + "step": 4925 + }, + { + "epoch": 0.10988908929138626, + "grad_norm": 0.5547128915786743, + "learning_rate": 1.9409976553623767e-05, + "loss": 0.4657, + "step": 4930 + }, + { + "epoch": 0.11000053867200633, + "grad_norm": 0.6494245529174805, + "learning_rate": 1.940879108502039e-05, + "loss": 0.3642, + "step": 4935 + }, + { + "epoch": 0.1101119880526264, + "grad_norm": 0.34890300035476685, + "learning_rate": 1.9407604462971856e-05, + "loss": 0.2909, + "step": 4940 + }, + { + "epoch": 0.11022343743324646, + "grad_norm": 0.6003003716468811, + "learning_rate": 1.9406416687623625e-05, + "loss": 0.5144, + "step": 4945 + }, + { + "epoch": 0.11033488681386654, + "grad_norm": 0.48884761333465576, + "learning_rate": 1.9405227759121318e-05, + "loss": 0.4649, + "step": 4950 + }, + { + "epoch": 0.1104463361944866, + "grad_norm": 0.6424091458320618, + "learning_rate": 1.9404037677610685e-05, + "loss": 0.4965, + "step": 4955 + }, + { + "epoch": 0.11055778557510666, + "grad_norm": 0.6459415555000305, + "learning_rate": 1.940284644323762e-05, + "loss": 0.4513, + "step": 4960 + }, + { + "epoch": 0.11066923495572674, + "grad_norm": 0.4841134548187256, + "learning_rate": 1.940165405614816e-05, + "loss": 0.3238, + "step": 4965 + }, + { + "epoch": 0.1107806843363468, + "grad_norm": 0.7165957093238831, + "learning_rate": 1.940046051648848e-05, + "loss": 0.5479, + "step": 4970 + }, + { + "epoch": 0.11089213371696686, + "grad_norm": 0.5024096369743347, + "learning_rate": 1.9399265824404903e-05, + "loss": 0.4196, + "step": 4975 + }, + { + "epoch": 0.11100358309758694, + "grad_norm": 0.5883904695510864, + "learning_rate": 1.9398069980043885e-05, + "loss": 0.391, + "step": 4980 + }, + { + "epoch": 0.111115032478207, + "grad_norm": 0.6263939142227173, + "learning_rate": 1.9396872983552033e-05, + "loss": 0.4796, + "step": 4985 + }, + { + "epoch": 0.11122648185882707, + "grad_norm": 0.5873487591743469, + "learning_rate": 1.9395674835076085e-05, + "loss": 0.2901, + "step": 4990 + }, + { + "epoch": 0.11133793123944713, + "grad_norm": 0.5534051060676575, + "learning_rate": 1.9394475534762925e-05, + "loss": 0.4489, + "step": 4995 + }, + { + "epoch": 0.1114493806200672, + "grad_norm": 0.5187072157859802, + "learning_rate": 1.939327508275958e-05, + "loss": 0.345, + "step": 5000 + }, + { + "epoch": 0.11156083000068727, + "grad_norm": 0.60208660364151, + "learning_rate": 1.9392073479213213e-05, + "loss": 0.3369, + "step": 5005 + }, + { + "epoch": 0.11167227938130733, + "grad_norm": 0.585619330406189, + "learning_rate": 1.9390870724271133e-05, + "loss": 0.4317, + "step": 5010 + }, + { + "epoch": 0.11178372876192741, + "grad_norm": 0.5309601426124573, + "learning_rate": 1.9389666818080787e-05, + "loss": 0.4831, + "step": 5015 + }, + { + "epoch": 0.11189517814254747, + "grad_norm": 1.0372563600540161, + "learning_rate": 1.9388461760789773e-05, + "loss": 0.4236, + "step": 5020 + }, + { + "epoch": 0.11200662752316753, + "grad_norm": 0.49995213747024536, + "learning_rate": 1.938725555254581e-05, + "loss": 0.3901, + "step": 5025 + }, + { + "epoch": 0.11211807690378761, + "grad_norm": 0.4487152695655823, + "learning_rate": 1.9386048193496774e-05, + "loss": 0.413, + "step": 5030 + }, + { + "epoch": 0.11222952628440767, + "grad_norm": 0.3875434696674347, + "learning_rate": 1.938483968379068e-05, + "loss": 0.3672, + "step": 5035 + }, + { + "epoch": 0.11234097566502774, + "grad_norm": 0.691291868686676, + "learning_rate": 1.938363002357568e-05, + "loss": 0.5052, + "step": 5040 + }, + { + "epoch": 0.11245242504564781, + "grad_norm": 0.406793475151062, + "learning_rate": 1.938241921300007e-05, + "loss": 0.4285, + "step": 5045 + }, + { + "epoch": 0.11256387442626788, + "grad_norm": 0.7369039058685303, + "learning_rate": 1.9381207252212286e-05, + "loss": 0.3757, + "step": 5050 + }, + { + "epoch": 0.11267532380688794, + "grad_norm": 0.5105433464050293, + "learning_rate": 1.9379994141360906e-05, + "loss": 0.3474, + "step": 5055 + }, + { + "epoch": 0.11278677318750802, + "grad_norm": 0.6076253652572632, + "learning_rate": 1.9378779880594643e-05, + "loss": 0.495, + "step": 5060 + }, + { + "epoch": 0.11289822256812808, + "grad_norm": 0.4707455635070801, + "learning_rate": 1.937756447006236e-05, + "loss": 0.3853, + "step": 5065 + }, + { + "epoch": 0.11300967194874814, + "grad_norm": 0.629932701587677, + "learning_rate": 1.937634790991306e-05, + "loss": 0.4166, + "step": 5070 + }, + { + "epoch": 0.11312112132936822, + "grad_norm": 0.5261717438697815, + "learning_rate": 1.937513020029588e-05, + "loss": 0.3943, + "step": 5075 + }, + { + "epoch": 0.11323257070998828, + "grad_norm": 0.8253975510597229, + "learning_rate": 1.93739113413601e-05, + "loss": 0.4549, + "step": 5080 + }, + { + "epoch": 0.11334402009060834, + "grad_norm": 0.49206769466400146, + "learning_rate": 1.937269133325515e-05, + "loss": 0.337, + "step": 5085 + }, + { + "epoch": 0.1134554694712284, + "grad_norm": 0.5231266021728516, + "learning_rate": 1.937147017613058e-05, + "loss": 0.3369, + "step": 5090 + }, + { + "epoch": 0.11356691885184848, + "grad_norm": 0.5962830781936646, + "learning_rate": 1.937024787013611e-05, + "loss": 0.3433, + "step": 5095 + }, + { + "epoch": 0.11367836823246855, + "grad_norm": 0.6776148080825806, + "learning_rate": 1.9369024415421576e-05, + "loss": 0.2904, + "step": 5100 + }, + { + "epoch": 0.11378981761308861, + "grad_norm": 0.46378564834594727, + "learning_rate": 1.9367799812136967e-05, + "loss": 0.2613, + "step": 5105 + }, + { + "epoch": 0.11390126699370869, + "grad_norm": 0.6109187602996826, + "learning_rate": 1.936657406043241e-05, + "loss": 0.3001, + "step": 5110 + }, + { + "epoch": 0.11401271637432875, + "grad_norm": 0.4546675384044647, + "learning_rate": 1.9365347160458172e-05, + "loss": 0.4848, + "step": 5115 + }, + { + "epoch": 0.11412416575494881, + "grad_norm": 0.632952868938446, + "learning_rate": 1.9364119112364662e-05, + "loss": 0.2835, + "step": 5120 + }, + { + "epoch": 0.11423561513556889, + "grad_norm": 0.7153362035751343, + "learning_rate": 1.9362889916302428e-05, + "loss": 0.4732, + "step": 5125 + }, + { + "epoch": 0.11434706451618895, + "grad_norm": 0.5372235774993896, + "learning_rate": 1.9361659572422158e-05, + "loss": 0.4101, + "step": 5130 + }, + { + "epoch": 0.11445851389680901, + "grad_norm": 0.604742169380188, + "learning_rate": 1.9360428080874687e-05, + "loss": 0.4882, + "step": 5135 + }, + { + "epoch": 0.11456996327742909, + "grad_norm": 0.5875693559646606, + "learning_rate": 1.9359195441810988e-05, + "loss": 0.4322, + "step": 5140 + }, + { + "epoch": 0.11468141265804915, + "grad_norm": 0.6571182012557983, + "learning_rate": 1.9357961655382165e-05, + "loss": 0.4533, + "step": 5145 + }, + { + "epoch": 0.11479286203866922, + "grad_norm": 0.4525604248046875, + "learning_rate": 1.9356726721739476e-05, + "loss": 0.3682, + "step": 5150 + }, + { + "epoch": 0.11490431141928929, + "grad_norm": 0.542430579662323, + "learning_rate": 1.9355490641034315e-05, + "loss": 0.3799, + "step": 5155 + }, + { + "epoch": 0.11501576079990936, + "grad_norm": 1.2807742357254028, + "learning_rate": 1.9354253413418215e-05, + "loss": 0.4531, + "step": 5160 + }, + { + "epoch": 0.11512721018052942, + "grad_norm": 0.5787119269371033, + "learning_rate": 1.9353015039042852e-05, + "loss": 0.4159, + "step": 5165 + }, + { + "epoch": 0.1152386595611495, + "grad_norm": 0.3594432771205902, + "learning_rate": 1.9351775518060036e-05, + "loss": 0.4104, + "step": 5170 + }, + { + "epoch": 0.11535010894176956, + "grad_norm": 0.7893831729888916, + "learning_rate": 1.9350534850621728e-05, + "loss": 0.4584, + "step": 5175 + }, + { + "epoch": 0.11546155832238962, + "grad_norm": 0.46665945649147034, + "learning_rate": 1.9349293036880023e-05, + "loss": 0.4721, + "step": 5180 + }, + { + "epoch": 0.11557300770300968, + "grad_norm": 0.5082299113273621, + "learning_rate": 1.9348050076987155e-05, + "loss": 0.3949, + "step": 5185 + }, + { + "epoch": 0.11568445708362976, + "grad_norm": 0.6792308688163757, + "learning_rate": 1.9346805971095504e-05, + "loss": 0.4241, + "step": 5190 + }, + { + "epoch": 0.11579590646424982, + "grad_norm": 0.5918766856193542, + "learning_rate": 1.934556071935759e-05, + "loss": 0.3717, + "step": 5195 + }, + { + "epoch": 0.11590735584486989, + "grad_norm": 0.36979660391807556, + "learning_rate": 1.9344314321926065e-05, + "loss": 0.3797, + "step": 5200 + }, + { + "epoch": 0.11601880522548996, + "grad_norm": 0.5370672345161438, + "learning_rate": 1.9343066778953733e-05, + "loss": 0.4361, + "step": 5205 + }, + { + "epoch": 0.11613025460611003, + "grad_norm": 0.546700656414032, + "learning_rate": 1.9341818090593532e-05, + "loss": 0.4005, + "step": 5210 + }, + { + "epoch": 0.11624170398673009, + "grad_norm": 0.45658764243125916, + "learning_rate": 1.934056825699854e-05, + "loss": 0.3926, + "step": 5215 + }, + { + "epoch": 0.11635315336735017, + "grad_norm": 0.4908159375190735, + "learning_rate": 1.9339317278321975e-05, + "loss": 0.4419, + "step": 5220 + }, + { + "epoch": 0.11646460274797023, + "grad_norm": 0.5328987240791321, + "learning_rate": 1.9338065154717203e-05, + "loss": 0.3964, + "step": 5225 + }, + { + "epoch": 0.11657605212859029, + "grad_norm": 0.5930885076522827, + "learning_rate": 1.9336811886337723e-05, + "loss": 0.409, + "step": 5230 + }, + { + "epoch": 0.11668750150921037, + "grad_norm": 0.3622898459434509, + "learning_rate": 1.9335557473337174e-05, + "loss": 0.3064, + "step": 5235 + }, + { + "epoch": 0.11679895088983043, + "grad_norm": 0.5278002023696899, + "learning_rate": 1.9334301915869334e-05, + "loss": 0.4698, + "step": 5240 + }, + { + "epoch": 0.11691040027045049, + "grad_norm": 0.46072134375572205, + "learning_rate": 1.9333045214088132e-05, + "loss": 0.4194, + "step": 5245 + }, + { + "epoch": 0.11702184965107057, + "grad_norm": 0.8218250870704651, + "learning_rate": 1.933178736814763e-05, + "loss": 0.5006, + "step": 5250 + }, + { + "epoch": 0.11713329903169063, + "grad_norm": 0.5179040431976318, + "learning_rate": 1.9330528378202024e-05, + "loss": 0.5059, + "step": 5255 + }, + { + "epoch": 0.1172447484123107, + "grad_norm": 0.5304727554321289, + "learning_rate": 1.932926824440566e-05, + "loss": 0.5422, + "step": 5260 + }, + { + "epoch": 0.11735619779293077, + "grad_norm": 0.5799547433853149, + "learning_rate": 1.932800696691302e-05, + "loss": 0.4012, + "step": 5265 + }, + { + "epoch": 0.11746764717355083, + "grad_norm": 0.6032615900039673, + "learning_rate": 1.9326744545878727e-05, + "loss": 0.4134, + "step": 5270 + }, + { + "epoch": 0.1175790965541709, + "grad_norm": 0.6024173498153687, + "learning_rate": 1.9325480981457542e-05, + "loss": 0.4608, + "step": 5275 + }, + { + "epoch": 0.11769054593479096, + "grad_norm": 0.6025891900062561, + "learning_rate": 1.9324216273804373e-05, + "loss": 0.4729, + "step": 5280 + }, + { + "epoch": 0.11780199531541104, + "grad_norm": 0.6101847887039185, + "learning_rate": 1.932295042307426e-05, + "loss": 0.4191, + "step": 5285 + }, + { + "epoch": 0.1179134446960311, + "grad_norm": 0.5292050838470459, + "learning_rate": 1.9321683429422386e-05, + "loss": 0.3268, + "step": 5290 + }, + { + "epoch": 0.11802489407665116, + "grad_norm": 0.6705468893051147, + "learning_rate": 1.932041529300408e-05, + "loss": 0.3897, + "step": 5295 + }, + { + "epoch": 0.11813634345727124, + "grad_norm": 1.0549614429473877, + "learning_rate": 1.9319146013974795e-05, + "loss": 0.3921, + "step": 5300 + }, + { + "epoch": 0.1182477928378913, + "grad_norm": 0.4721240997314453, + "learning_rate": 1.9317875592490146e-05, + "loss": 0.3079, + "step": 5305 + }, + { + "epoch": 0.11835924221851137, + "grad_norm": 0.656427800655365, + "learning_rate": 1.9316604028705874e-05, + "loss": 0.3989, + "step": 5310 + }, + { + "epoch": 0.11847069159913144, + "grad_norm": 0.6683394908905029, + "learning_rate": 1.931533132277786e-05, + "loss": 0.432, + "step": 5315 + }, + { + "epoch": 0.1185821409797515, + "grad_norm": 0.6674720048904419, + "learning_rate": 1.9314057474862125e-05, + "loss": 0.3063, + "step": 5320 + }, + { + "epoch": 0.11869359036037157, + "grad_norm": 0.46994155645370483, + "learning_rate": 1.931278248511484e-05, + "loss": 0.3768, + "step": 5325 + }, + { + "epoch": 0.11880503974099164, + "grad_norm": 0.603171169757843, + "learning_rate": 1.9311506353692305e-05, + "loss": 0.3091, + "step": 5330 + }, + { + "epoch": 0.11891648912161171, + "grad_norm": 0.520969808101654, + "learning_rate": 1.9310229080750967e-05, + "loss": 0.3635, + "step": 5335 + }, + { + "epoch": 0.11902793850223177, + "grad_norm": 0.5163702368736267, + "learning_rate": 1.9308950666447404e-05, + "loss": 0.5842, + "step": 5340 + }, + { + "epoch": 0.11913938788285185, + "grad_norm": 0.4493872821331024, + "learning_rate": 1.930767111093835e-05, + "loss": 0.3607, + "step": 5345 + }, + { + "epoch": 0.11925083726347191, + "grad_norm": 0.48946818709373474, + "learning_rate": 1.9306390414380655e-05, + "loss": 0.4147, + "step": 5350 + }, + { + "epoch": 0.11936228664409197, + "grad_norm": 0.48165857791900635, + "learning_rate": 1.9305108576931336e-05, + "loss": 0.4171, + "step": 5355 + }, + { + "epoch": 0.11947373602471205, + "grad_norm": 0.5254682302474976, + "learning_rate": 1.9303825598747524e-05, + "loss": 0.4702, + "step": 5360 + }, + { + "epoch": 0.11958518540533211, + "grad_norm": 0.5365452170372009, + "learning_rate": 1.930254147998651e-05, + "loss": 0.3702, + "step": 5365 + }, + { + "epoch": 0.11969663478595217, + "grad_norm": 0.36603885889053345, + "learning_rate": 1.9301256220805715e-05, + "loss": 0.4523, + "step": 5370 + }, + { + "epoch": 0.11980808416657224, + "grad_norm": 0.45412683486938477, + "learning_rate": 1.9299969821362702e-05, + "loss": 0.4379, + "step": 5375 + }, + { + "epoch": 0.11991953354719231, + "grad_norm": 0.4885343611240387, + "learning_rate": 1.9298682281815176e-05, + "loss": 0.3207, + "step": 5380 + }, + { + "epoch": 0.12003098292781238, + "grad_norm": 0.6273423433303833, + "learning_rate": 1.929739360232097e-05, + "loss": 0.3908, + "step": 5385 + }, + { + "epoch": 0.12014243230843244, + "grad_norm": 0.6662055253982544, + "learning_rate": 1.929610378303808e-05, + "loss": 0.4066, + "step": 5390 + }, + { + "epoch": 0.12025388168905252, + "grad_norm": 0.5559515357017517, + "learning_rate": 1.9294812824124623e-05, + "loss": 0.3538, + "step": 5395 + }, + { + "epoch": 0.12036533106967258, + "grad_norm": 0.4622573256492615, + "learning_rate": 1.9293520725738855e-05, + "loss": 0.4457, + "step": 5400 + }, + { + "epoch": 0.12047678045029264, + "grad_norm": 0.6049358248710632, + "learning_rate": 1.929222748803918e-05, + "loss": 0.3599, + "step": 5405 + }, + { + "epoch": 0.12058822983091272, + "grad_norm": 0.5943440794944763, + "learning_rate": 1.9290933111184142e-05, + "loss": 0.3622, + "step": 5410 + }, + { + "epoch": 0.12069967921153278, + "grad_norm": 0.48504123091697693, + "learning_rate": 1.9289637595332418e-05, + "loss": 0.4892, + "step": 5415 + }, + { + "epoch": 0.12081112859215284, + "grad_norm": 0.48124271631240845, + "learning_rate": 1.928834094064283e-05, + "loss": 0.3757, + "step": 5420 + }, + { + "epoch": 0.12092257797277292, + "grad_norm": 0.545174241065979, + "learning_rate": 1.9287043147274338e-05, + "loss": 0.3642, + "step": 5425 + }, + { + "epoch": 0.12103402735339298, + "grad_norm": 0.5016522407531738, + "learning_rate": 1.9285744215386043e-05, + "loss": 0.3268, + "step": 5430 + }, + { + "epoch": 0.12114547673401305, + "grad_norm": 0.5860884189605713, + "learning_rate": 1.928444414513718e-05, + "loss": 0.3471, + "step": 5435 + }, + { + "epoch": 0.12125692611463312, + "grad_norm": 0.5948898792266846, + "learning_rate": 1.928314293668713e-05, + "loss": 0.283, + "step": 5440 + }, + { + "epoch": 0.12136837549525319, + "grad_norm": 0.5052125453948975, + "learning_rate": 1.9281840590195412e-05, + "loss": 0.4477, + "step": 5445 + }, + { + "epoch": 0.12147982487587325, + "grad_norm": 0.6505773067474365, + "learning_rate": 1.928053710582168e-05, + "loss": 0.3336, + "step": 5450 + }, + { + "epoch": 0.12159127425649333, + "grad_norm": 0.6417970657348633, + "learning_rate": 1.9279232483725735e-05, + "loss": 0.3932, + "step": 5455 + }, + { + "epoch": 0.12170272363711339, + "grad_norm": 0.44236230850219727, + "learning_rate": 1.9277926724067512e-05, + "loss": 0.4703, + "step": 5460 + }, + { + "epoch": 0.12181417301773345, + "grad_norm": 0.49172699451446533, + "learning_rate": 1.927661982700709e-05, + "loss": 0.3714, + "step": 5465 + }, + { + "epoch": 0.12192562239835351, + "grad_norm": 0.48851385712623596, + "learning_rate": 1.9275311792704676e-05, + "loss": 0.3447, + "step": 5470 + }, + { + "epoch": 0.12203707177897359, + "grad_norm": 0.6325857639312744, + "learning_rate": 1.9274002621320633e-05, + "loss": 0.3943, + "step": 5475 + }, + { + "epoch": 0.12214852115959365, + "grad_norm": 0.49979910254478455, + "learning_rate": 1.9272692313015456e-05, + "loss": 0.3352, + "step": 5480 + }, + { + "epoch": 0.12225997054021372, + "grad_norm": 0.505219042301178, + "learning_rate": 1.9271380867949774e-05, + "loss": 0.3244, + "step": 5485 + }, + { + "epoch": 0.1223714199208338, + "grad_norm": 0.5624701380729675, + "learning_rate": 1.927006828628436e-05, + "loss": 0.4565, + "step": 5490 + }, + { + "epoch": 0.12248286930145386, + "grad_norm": 0.6194440722465515, + "learning_rate": 1.9268754568180128e-05, + "loss": 0.3445, + "step": 5495 + }, + { + "epoch": 0.12259431868207392, + "grad_norm": 0.5660005807876587, + "learning_rate": 1.9267439713798132e-05, + "loss": 0.3507, + "step": 5500 + }, + { + "epoch": 0.122705768062694, + "grad_norm": 0.43132275342941284, + "learning_rate": 1.9266123723299558e-05, + "loss": 0.4045, + "step": 5505 + }, + { + "epoch": 0.12281721744331406, + "grad_norm": 0.7432733774185181, + "learning_rate": 1.9264806596845742e-05, + "loss": 0.4289, + "step": 5510 + }, + { + "epoch": 0.12292866682393412, + "grad_norm": 0.6572379469871521, + "learning_rate": 1.926348833459815e-05, + "loss": 0.3649, + "step": 5515 + }, + { + "epoch": 0.1230401162045542, + "grad_norm": 0.39170458912849426, + "learning_rate": 1.926216893671839e-05, + "loss": 0.4512, + "step": 5520 + }, + { + "epoch": 0.12315156558517426, + "grad_norm": 0.6171269416809082, + "learning_rate": 1.926084840336821e-05, + "loss": 0.4068, + "step": 5525 + }, + { + "epoch": 0.12326301496579432, + "grad_norm": 0.6351962089538574, + "learning_rate": 1.9259526734709503e-05, + "loss": 0.471, + "step": 5530 + }, + { + "epoch": 0.1233744643464144, + "grad_norm": 0.4887767434120178, + "learning_rate": 1.9258203930904286e-05, + "loss": 0.4869, + "step": 5535 + }, + { + "epoch": 0.12348591372703446, + "grad_norm": 0.7329266667366028, + "learning_rate": 1.9256879992114734e-05, + "loss": 0.47, + "step": 5540 + }, + { + "epoch": 0.12359736310765453, + "grad_norm": 0.48019132018089294, + "learning_rate": 1.9255554918503144e-05, + "loss": 0.3257, + "step": 5545 + }, + { + "epoch": 0.1237088124882746, + "grad_norm": 0.5862034559249878, + "learning_rate": 1.9254228710231962e-05, + "loss": 0.3933, + "step": 5550 + }, + { + "epoch": 0.12382026186889467, + "grad_norm": 0.5435554385185242, + "learning_rate": 1.9252901367463773e-05, + "loss": 0.4413, + "step": 5555 + }, + { + "epoch": 0.12393171124951473, + "grad_norm": 0.5873213410377502, + "learning_rate": 1.92515728903613e-05, + "loss": 0.4661, + "step": 5560 + }, + { + "epoch": 0.12404316063013479, + "grad_norm": 0.5576989650726318, + "learning_rate": 1.92502432790874e-05, + "loss": 0.4491, + "step": 5565 + }, + { + "epoch": 0.12415461001075487, + "grad_norm": 0.4655047059059143, + "learning_rate": 1.9248912533805077e-05, + "loss": 0.455, + "step": 5570 + }, + { + "epoch": 0.12426605939137493, + "grad_norm": 0.44688910245895386, + "learning_rate": 1.924758065467746e-05, + "loss": 0.3578, + "step": 5575 + }, + { + "epoch": 0.124377508771995, + "grad_norm": 0.612274706363678, + "learning_rate": 1.9246247641867843e-05, + "loss": 0.5784, + "step": 5580 + }, + { + "epoch": 0.12448895815261507, + "grad_norm": 0.46077173948287964, + "learning_rate": 1.9244913495539636e-05, + "loss": 0.4116, + "step": 5585 + }, + { + "epoch": 0.12460040753323513, + "grad_norm": 0.5178104639053345, + "learning_rate": 1.924357821585639e-05, + "loss": 0.402, + "step": 5590 + }, + { + "epoch": 0.1247118569138552, + "grad_norm": 0.6142487525939941, + "learning_rate": 1.924224180298181e-05, + "loss": 0.4987, + "step": 5595 + }, + { + "epoch": 0.12482330629447527, + "grad_norm": 0.5576684474945068, + "learning_rate": 1.9240904257079716e-05, + "loss": 0.4058, + "step": 5600 + }, + { + "epoch": 0.12493475567509534, + "grad_norm": 0.6418728828430176, + "learning_rate": 1.923956557831409e-05, + "loss": 0.3919, + "step": 5605 + }, + { + "epoch": 0.1250462050557154, + "grad_norm": 0.6118012070655823, + "learning_rate": 1.9238225766849048e-05, + "loss": 0.4424, + "step": 5610 + }, + { + "epoch": 0.12515765443633547, + "grad_norm": 0.6719847917556763, + "learning_rate": 1.923688482284883e-05, + "loss": 0.3275, + "step": 5615 + }, + { + "epoch": 0.12526910381695552, + "grad_norm": 0.6712337136268616, + "learning_rate": 1.9235542746477835e-05, + "loss": 0.3622, + "step": 5620 + }, + { + "epoch": 0.1253805531975756, + "grad_norm": 0.5340763926506042, + "learning_rate": 1.9234199537900586e-05, + "loss": 0.3324, + "step": 5625 + }, + { + "epoch": 0.12549200257819568, + "grad_norm": 0.5426451563835144, + "learning_rate": 1.923285519728175e-05, + "loss": 0.2466, + "step": 5630 + }, + { + "epoch": 0.12560345195881573, + "grad_norm": 0.5238773822784424, + "learning_rate": 1.9231509724786132e-05, + "loss": 0.4723, + "step": 5635 + }, + { + "epoch": 0.1257149013394358, + "grad_norm": 0.5981005430221558, + "learning_rate": 1.9230163120578677e-05, + "loss": 0.341, + "step": 5640 + }, + { + "epoch": 0.12582635072005588, + "grad_norm": 0.5886806845664978, + "learning_rate": 1.9228815384824472e-05, + "loss": 0.3968, + "step": 5645 + }, + { + "epoch": 0.12593780010067593, + "grad_norm": 0.6732558012008667, + "learning_rate": 1.9227466517688738e-05, + "loss": 0.569, + "step": 5650 + }, + { + "epoch": 0.126049249481296, + "grad_norm": 0.6471225023269653, + "learning_rate": 1.922611651933683e-05, + "loss": 0.3136, + "step": 5655 + }, + { + "epoch": 0.12616069886191608, + "grad_norm": 0.5462857484817505, + "learning_rate": 1.9224765389934253e-05, + "loss": 0.4516, + "step": 5660 + }, + { + "epoch": 0.12627214824253613, + "grad_norm": 0.5605525970458984, + "learning_rate": 1.9223413129646645e-05, + "loss": 0.4971, + "step": 5665 + }, + { + "epoch": 0.1263835976231562, + "grad_norm": 0.4602338373661041, + "learning_rate": 1.922205973863978e-05, + "loss": 0.3303, + "step": 5670 + }, + { + "epoch": 0.12649504700377628, + "grad_norm": 0.5662568211555481, + "learning_rate": 1.9220705217079573e-05, + "loss": 0.4036, + "step": 5675 + }, + { + "epoch": 0.12660649638439633, + "grad_norm": 0.6493194699287415, + "learning_rate": 1.9219349565132083e-05, + "loss": 0.3845, + "step": 5680 + }, + { + "epoch": 0.1267179457650164, + "grad_norm": 0.5647873282432556, + "learning_rate": 1.9217992782963493e-05, + "loss": 0.3857, + "step": 5685 + }, + { + "epoch": 0.1268293951456365, + "grad_norm": 0.5588977336883545, + "learning_rate": 1.9216634870740144e-05, + "loss": 0.3897, + "step": 5690 + }, + { + "epoch": 0.12694084452625654, + "grad_norm": 0.537597119808197, + "learning_rate": 1.92152758286285e-05, + "loss": 0.4161, + "step": 5695 + }, + { + "epoch": 0.1270522939068766, + "grad_norm": 0.7306565642356873, + "learning_rate": 1.921391565679517e-05, + "loss": 0.5014, + "step": 5700 + }, + { + "epoch": 0.1271637432874967, + "grad_norm": 0.5383148789405823, + "learning_rate": 1.9212554355406904e-05, + "loss": 0.4513, + "step": 5705 + }, + { + "epoch": 0.12727519266811674, + "grad_norm": 0.6623514890670776, + "learning_rate": 1.921119192463058e-05, + "loss": 0.4172, + "step": 5710 + }, + { + "epoch": 0.12738664204873681, + "grad_norm": 0.5583232641220093, + "learning_rate": 1.9209828364633227e-05, + "loss": 0.4372, + "step": 5715 + }, + { + "epoch": 0.1274980914293569, + "grad_norm": 0.5252357721328735, + "learning_rate": 1.9208463675582006e-05, + "loss": 0.3622, + "step": 5720 + }, + { + "epoch": 0.12760954080997694, + "grad_norm": 0.6623414754867554, + "learning_rate": 1.9207097857644216e-05, + "loss": 0.4213, + "step": 5725 + }, + { + "epoch": 0.12772099019059702, + "grad_norm": 0.5552505850791931, + "learning_rate": 1.92057309109873e-05, + "loss": 0.3906, + "step": 5730 + }, + { + "epoch": 0.1278324395712171, + "grad_norm": 0.4317275881767273, + "learning_rate": 1.920436283577883e-05, + "loss": 0.3954, + "step": 5735 + }, + { + "epoch": 0.12794388895183714, + "grad_norm": 0.6078845262527466, + "learning_rate": 1.920299363218652e-05, + "loss": 0.4249, + "step": 5740 + }, + { + "epoch": 0.12805533833245722, + "grad_norm": 0.7265923619270325, + "learning_rate": 1.9201623300378228e-05, + "loss": 0.414, + "step": 5745 + }, + { + "epoch": 0.1281667877130773, + "grad_norm": 0.6118583679199219, + "learning_rate": 1.9200251840521946e-05, + "loss": 0.3573, + "step": 5750 + }, + { + "epoch": 0.12827823709369734, + "grad_norm": 0.5438648462295532, + "learning_rate": 1.91988792527858e-05, + "loss": 0.4139, + "step": 5755 + }, + { + "epoch": 0.12838968647431742, + "grad_norm": 0.639155387878418, + "learning_rate": 1.919750553733807e-05, + "loss": 0.4046, + "step": 5760 + }, + { + "epoch": 0.1285011358549375, + "grad_norm": 0.5353571772575378, + "learning_rate": 1.919613069434715e-05, + "loss": 0.5334, + "step": 5765 + }, + { + "epoch": 0.12861258523555755, + "grad_norm": 0.6160265803337097, + "learning_rate": 1.919475472398159e-05, + "loss": 0.4462, + "step": 5770 + }, + { + "epoch": 0.12872403461617762, + "grad_norm": 0.5473472476005554, + "learning_rate": 1.9193377626410076e-05, + "loss": 0.4203, + "step": 5775 + }, + { + "epoch": 0.1288354839967977, + "grad_norm": 0.4638007581233978, + "learning_rate": 1.919199940180142e-05, + "loss": 0.4141, + "step": 5780 + }, + { + "epoch": 0.12894693337741775, + "grad_norm": 0.6351835131645203, + "learning_rate": 1.9190620050324596e-05, + "loss": 0.4435, + "step": 5785 + }, + { + "epoch": 0.12905838275803783, + "grad_norm": 0.642963171005249, + "learning_rate": 1.918923957214869e-05, + "loss": 0.3589, + "step": 5790 + }, + { + "epoch": 0.1291698321386579, + "grad_norm": 0.5821643471717834, + "learning_rate": 1.9187857967442945e-05, + "loss": 0.2967, + "step": 5795 + }, + { + "epoch": 0.12928128151927795, + "grad_norm": 0.4989367425441742, + "learning_rate": 1.9186475236376733e-05, + "loss": 0.3938, + "step": 5800 + }, + { + "epoch": 0.12939273089989803, + "grad_norm": 0.49929431080818176, + "learning_rate": 1.9185091379119566e-05, + "loss": 0.3356, + "step": 5805 + }, + { + "epoch": 0.12950418028051808, + "grad_norm": 0.4640662372112274, + "learning_rate": 1.9183706395841092e-05, + "loss": 0.2594, + "step": 5810 + }, + { + "epoch": 0.12961562966113815, + "grad_norm": 0.5312515497207642, + "learning_rate": 1.9182320286711104e-05, + "loss": 0.3543, + "step": 5815 + }, + { + "epoch": 0.12972707904175823, + "grad_norm": 0.5994097590446472, + "learning_rate": 1.9180933051899523e-05, + "loss": 0.2664, + "step": 5820 + }, + { + "epoch": 0.12983852842237828, + "grad_norm": 0.6394380927085876, + "learning_rate": 1.9179544691576416e-05, + "loss": 0.4029, + "step": 5825 + }, + { + "epoch": 0.12994997780299836, + "grad_norm": 0.5699619054794312, + "learning_rate": 1.9178155205911984e-05, + "loss": 0.2986, + "step": 5830 + }, + { + "epoch": 0.13006142718361843, + "grad_norm": 0.6062706112861633, + "learning_rate": 1.917676459507657e-05, + "loss": 0.4173, + "step": 5835 + }, + { + "epoch": 0.13017287656423848, + "grad_norm": 0.5544490218162537, + "learning_rate": 1.917537285924065e-05, + "loss": 0.4053, + "step": 5840 + }, + { + "epoch": 0.13028432594485856, + "grad_norm": 0.5762254595756531, + "learning_rate": 1.9173979998574838e-05, + "loss": 0.4464, + "step": 5845 + }, + { + "epoch": 0.13039577532547864, + "grad_norm": 0.4747742712497711, + "learning_rate": 1.9172586013249894e-05, + "loss": 0.4227, + "step": 5850 + }, + { + "epoch": 0.13050722470609868, + "grad_norm": 0.6070486903190613, + "learning_rate": 1.9171190903436705e-05, + "loss": 0.3773, + "step": 5855 + }, + { + "epoch": 0.13061867408671876, + "grad_norm": 0.4321900010108948, + "learning_rate": 1.9169794669306303e-05, + "loss": 0.4154, + "step": 5860 + }, + { + "epoch": 0.13073012346733884, + "grad_norm": 0.49118393659591675, + "learning_rate": 1.9168397311029854e-05, + "loss": 0.2462, + "step": 5865 + }, + { + "epoch": 0.1308415728479589, + "grad_norm": 0.8460988998413086, + "learning_rate": 1.9166998828778665e-05, + "loss": 0.4493, + "step": 5870 + }, + { + "epoch": 0.13095302222857896, + "grad_norm": 0.6519992351531982, + "learning_rate": 1.9165599222724175e-05, + "loss": 0.4105, + "step": 5875 + }, + { + "epoch": 0.13106447160919904, + "grad_norm": 0.6495894193649292, + "learning_rate": 1.916419849303797e-05, + "loss": 0.2441, + "step": 5880 + }, + { + "epoch": 0.1311759209898191, + "grad_norm": 0.5206788182258606, + "learning_rate": 1.9162796639891766e-05, + "loss": 0.4636, + "step": 5885 + }, + { + "epoch": 0.13128737037043917, + "grad_norm": 0.5899707078933716, + "learning_rate": 1.9161393663457418e-05, + "loss": 0.3734, + "step": 5890 + }, + { + "epoch": 0.13139881975105924, + "grad_norm": 0.760115385055542, + "learning_rate": 1.9159989563906922e-05, + "loss": 0.3605, + "step": 5895 + }, + { + "epoch": 0.1315102691316793, + "grad_norm": 0.5802580714225769, + "learning_rate": 1.9158584341412414e-05, + "loss": 0.3356, + "step": 5900 + }, + { + "epoch": 0.13162171851229937, + "grad_norm": 0.6110792756080627, + "learning_rate": 1.9157177996146156e-05, + "loss": 0.459, + "step": 5905 + }, + { + "epoch": 0.13173316789291944, + "grad_norm": 0.5127723217010498, + "learning_rate": 1.9155770528280556e-05, + "loss": 0.2911, + "step": 5910 + }, + { + "epoch": 0.1318446172735395, + "grad_norm": 0.6098167300224304, + "learning_rate": 1.9154361937988162e-05, + "loss": 0.5592, + "step": 5915 + }, + { + "epoch": 0.13195606665415957, + "grad_norm": 1.627808928489685, + "learning_rate": 1.9152952225441657e-05, + "loss": 0.3981, + "step": 5920 + }, + { + "epoch": 0.13206751603477965, + "grad_norm": 0.5559998750686646, + "learning_rate": 1.9151541390813863e-05, + "loss": 0.3277, + "step": 5925 + }, + { + "epoch": 0.1321789654153997, + "grad_norm": 0.527961790561676, + "learning_rate": 1.915012943427773e-05, + "loss": 0.3684, + "step": 5930 + }, + { + "epoch": 0.13229041479601977, + "grad_norm": 0.4821566641330719, + "learning_rate": 1.9148716356006358e-05, + "loss": 0.3656, + "step": 5935 + }, + { + "epoch": 0.13240186417663985, + "grad_norm": 0.3770541846752167, + "learning_rate": 1.914730215617297e-05, + "loss": 0.3135, + "step": 5940 + }, + { + "epoch": 0.1325133135572599, + "grad_norm": 0.46859049797058105, + "learning_rate": 1.914588683495095e-05, + "loss": 0.4604, + "step": 5945 + }, + { + "epoch": 0.13262476293787998, + "grad_norm": 0.6820949912071228, + "learning_rate": 1.91444703925138e-05, + "loss": 0.4089, + "step": 5950 + }, + { + "epoch": 0.13273621231850005, + "grad_norm": 0.6557464599609375, + "learning_rate": 1.9143052829035166e-05, + "loss": 0.374, + "step": 5955 + }, + { + "epoch": 0.1328476616991201, + "grad_norm": 0.8806049823760986, + "learning_rate": 1.914163414468883e-05, + "loss": 0.4139, + "step": 5960 + }, + { + "epoch": 0.13295911107974018, + "grad_norm": 0.5537307262420654, + "learning_rate": 1.9140214339648704e-05, + "loss": 0.3534, + "step": 5965 + }, + { + "epoch": 0.13307056046036025, + "grad_norm": 0.6633182168006897, + "learning_rate": 1.9138793414088856e-05, + "loss": 0.4184, + "step": 5970 + }, + { + "epoch": 0.1331820098409803, + "grad_norm": 0.5307210087776184, + "learning_rate": 1.9137371368183472e-05, + "loss": 0.372, + "step": 5975 + }, + { + "epoch": 0.13329345922160038, + "grad_norm": 0.48524239659309387, + "learning_rate": 1.913594820210689e-05, + "loss": 0.3055, + "step": 5980 + }, + { + "epoch": 0.13340490860222046, + "grad_norm": 0.6917334198951721, + "learning_rate": 1.9134523916033577e-05, + "loss": 0.3744, + "step": 5985 + }, + { + "epoch": 0.1335163579828405, + "grad_norm": 0.4943813383579254, + "learning_rate": 1.913309851013814e-05, + "loss": 0.4207, + "step": 5990 + }, + { + "epoch": 0.13362780736346058, + "grad_norm": 0.586175799369812, + "learning_rate": 1.9131671984595325e-05, + "loss": 0.4356, + "step": 5995 + }, + { + "epoch": 0.13373925674408063, + "grad_norm": 0.7961429953575134, + "learning_rate": 1.9130244339580007e-05, + "loss": 0.2789, + "step": 6000 + }, + { + "epoch": 0.1338507061247007, + "grad_norm": 0.5163123607635498, + "learning_rate": 1.912881557526721e-05, + "loss": 0.3108, + "step": 6005 + }, + { + "epoch": 0.13396215550532078, + "grad_norm": 0.5538989901542664, + "learning_rate": 1.9127385691832084e-05, + "loss": 0.5069, + "step": 6010 + }, + { + "epoch": 0.13407360488594083, + "grad_norm": 0.4295978546142578, + "learning_rate": 1.912595468944993e-05, + "loss": 0.4329, + "step": 6015 + }, + { + "epoch": 0.1341850542665609, + "grad_norm": 0.5083966255187988, + "learning_rate": 1.9124522568296166e-05, + "loss": 0.3951, + "step": 6020 + }, + { + "epoch": 0.134296503647181, + "grad_norm": 0.5359913110733032, + "learning_rate": 1.912308932854637e-05, + "loss": 0.3751, + "step": 6025 + }, + { + "epoch": 0.13440795302780104, + "grad_norm": 0.5720531344413757, + "learning_rate": 1.912165497037624e-05, + "loss": 0.278, + "step": 6030 + }, + { + "epoch": 0.1345194024084211, + "grad_norm": 0.5642151832580566, + "learning_rate": 1.912021949396162e-05, + "loss": 0.4278, + "step": 6035 + }, + { + "epoch": 0.1346308517890412, + "grad_norm": 0.7164551615715027, + "learning_rate": 1.9118782899478488e-05, + "loss": 0.4543, + "step": 6040 + }, + { + "epoch": 0.13474230116966124, + "grad_norm": 0.7330194711685181, + "learning_rate": 1.9117345187102958e-05, + "loss": 0.321, + "step": 6045 + }, + { + "epoch": 0.13485375055028131, + "grad_norm": 0.7292217016220093, + "learning_rate": 1.9115906357011283e-05, + "loss": 0.3806, + "step": 6050 + }, + { + "epoch": 0.1349651999309014, + "grad_norm": 0.4237048625946045, + "learning_rate": 1.911446640937985e-05, + "loss": 0.3959, + "step": 6055 + }, + { + "epoch": 0.13507664931152144, + "grad_norm": 0.5631638169288635, + "learning_rate": 1.911302534438519e-05, + "loss": 0.5221, + "step": 6060 + }, + { + "epoch": 0.13518809869214152, + "grad_norm": 0.5988253355026245, + "learning_rate": 1.9111583162203965e-05, + "loss": 0.42, + "step": 6065 + }, + { + "epoch": 0.1352995480727616, + "grad_norm": 0.44528236985206604, + "learning_rate": 1.9110139863012978e-05, + "loss": 0.3945, + "step": 6070 + }, + { + "epoch": 0.13541099745338164, + "grad_norm": 0.6617679595947266, + "learning_rate": 1.9108695446989158e-05, + "loss": 0.6861, + "step": 6075 + }, + { + "epoch": 0.13552244683400172, + "grad_norm": 0.6147041916847229, + "learning_rate": 1.9107249914309586e-05, + "loss": 0.3679, + "step": 6080 + }, + { + "epoch": 0.1356338962146218, + "grad_norm": 0.4293046295642853, + "learning_rate": 1.9105803265151474e-05, + "loss": 0.3888, + "step": 6085 + }, + { + "epoch": 0.13574534559524185, + "grad_norm": 0.5585926175117493, + "learning_rate": 1.9104355499692166e-05, + "loss": 0.3788, + "step": 6090 + }, + { + "epoch": 0.13585679497586192, + "grad_norm": 0.6501102447509766, + "learning_rate": 1.910290661810915e-05, + "loss": 0.4541, + "step": 6095 + }, + { + "epoch": 0.135968244356482, + "grad_norm": 0.50469571352005, + "learning_rate": 1.9101456620580044e-05, + "loss": 0.4489, + "step": 6100 + }, + { + "epoch": 0.13607969373710205, + "grad_norm": 0.5148460268974304, + "learning_rate": 1.910000550728261e-05, + "loss": 0.5077, + "step": 6105 + }, + { + "epoch": 0.13619114311772212, + "grad_norm": 0.383065789937973, + "learning_rate": 1.9098553278394744e-05, + "loss": 0.3584, + "step": 6110 + }, + { + "epoch": 0.1363025924983422, + "grad_norm": 0.5884717106819153, + "learning_rate": 1.909709993409447e-05, + "loss": 0.3969, + "step": 6115 + }, + { + "epoch": 0.13641404187896225, + "grad_norm": 0.48174750804901123, + "learning_rate": 1.9095645474559967e-05, + "loss": 0.4417, + "step": 6120 + }, + { + "epoch": 0.13652549125958233, + "grad_norm": 0.5044527649879456, + "learning_rate": 1.909418989996954e-05, + "loss": 0.4233, + "step": 6125 + }, + { + "epoch": 0.1366369406402024, + "grad_norm": 0.45409488677978516, + "learning_rate": 1.909273321050162e-05, + "loss": 0.4688, + "step": 6130 + }, + { + "epoch": 0.13674839002082245, + "grad_norm": 0.5548310875892639, + "learning_rate": 1.90912754063348e-05, + "loss": 0.3716, + "step": 6135 + }, + { + "epoch": 0.13685983940144253, + "grad_norm": 0.7075448036193848, + "learning_rate": 1.9089816487647786e-05, + "loss": 0.364, + "step": 6140 + }, + { + "epoch": 0.1369712887820626, + "grad_norm": 0.5327110290527344, + "learning_rate": 1.9088356454619433e-05, + "loss": 0.4024, + "step": 6145 + }, + { + "epoch": 0.13708273816268265, + "grad_norm": 0.4292064905166626, + "learning_rate": 1.9086895307428733e-05, + "loss": 0.3842, + "step": 6150 + }, + { + "epoch": 0.13719418754330273, + "grad_norm": 0.5394851565361023, + "learning_rate": 1.9085433046254805e-05, + "loss": 0.3289, + "step": 6155 + }, + { + "epoch": 0.1373056369239228, + "grad_norm": 0.5409968495368958, + "learning_rate": 1.9083969671276915e-05, + "loss": 0.4648, + "step": 6160 + }, + { + "epoch": 0.13741708630454286, + "grad_norm": 0.6410302519798279, + "learning_rate": 1.9082505182674462e-05, + "loss": 0.3038, + "step": 6165 + }, + { + "epoch": 0.13752853568516293, + "grad_norm": 0.8804435133934021, + "learning_rate": 1.908103958062698e-05, + "loss": 0.4235, + "step": 6170 + }, + { + "epoch": 0.137639985065783, + "grad_norm": 0.3780747056007385, + "learning_rate": 1.907957286531414e-05, + "loss": 0.3038, + "step": 6175 + }, + { + "epoch": 0.13775143444640306, + "grad_norm": 0.44706881046295166, + "learning_rate": 1.907810503691575e-05, + "loss": 0.3553, + "step": 6180 + }, + { + "epoch": 0.13786288382702314, + "grad_norm": 0.4074099063873291, + "learning_rate": 1.9076636095611752e-05, + "loss": 0.4158, + "step": 6185 + }, + { + "epoch": 0.13797433320764318, + "grad_norm": 0.545442521572113, + "learning_rate": 1.907516604158223e-05, + "loss": 0.3531, + "step": 6190 + }, + { + "epoch": 0.13808578258826326, + "grad_norm": 0.6031177043914795, + "learning_rate": 1.9073694875007403e-05, + "loss": 0.4075, + "step": 6195 + }, + { + "epoch": 0.13819723196888334, + "grad_norm": 0.4987606108188629, + "learning_rate": 1.9072222596067626e-05, + "loss": 0.392, + "step": 6200 + }, + { + "epoch": 0.1383086813495034, + "grad_norm": 0.6515766382217407, + "learning_rate": 1.907074920494338e-05, + "loss": 0.4539, + "step": 6205 + }, + { + "epoch": 0.13842013073012346, + "grad_norm": 0.5816649198532104, + "learning_rate": 1.90692747018153e-05, + "loss": 0.4451, + "step": 6210 + }, + { + "epoch": 0.13853158011074354, + "grad_norm": 0.5591769218444824, + "learning_rate": 1.906779908686414e-05, + "loss": 0.4067, + "step": 6215 + }, + { + "epoch": 0.1386430294913636, + "grad_norm": 0.6279967427253723, + "learning_rate": 1.906632236027081e-05, + "loss": 0.3641, + "step": 6220 + }, + { + "epoch": 0.13875447887198367, + "grad_norm": 0.7580907344818115, + "learning_rate": 1.906484452221634e-05, + "loss": 0.3638, + "step": 6225 + }, + { + "epoch": 0.13886592825260374, + "grad_norm": 0.7279596924781799, + "learning_rate": 1.90633655728819e-05, + "loss": 0.3655, + "step": 6230 + }, + { + "epoch": 0.1389773776332238, + "grad_norm": 0.38095614314079285, + "learning_rate": 1.9061885512448797e-05, + "loss": 0.3642, + "step": 6235 + }, + { + "epoch": 0.13908882701384387, + "grad_norm": 0.4549018144607544, + "learning_rate": 1.9060404341098483e-05, + "loss": 0.3173, + "step": 6240 + }, + { + "epoch": 0.13920027639446395, + "grad_norm": 0.6030601859092712, + "learning_rate": 1.905892205901253e-05, + "loss": 0.4833, + "step": 6245 + }, + { + "epoch": 0.139311725775084, + "grad_norm": 0.47454383969306946, + "learning_rate": 1.9057438666372653e-05, + "loss": 0.3674, + "step": 6250 + }, + { + "epoch": 0.13942317515570407, + "grad_norm": 0.6735197305679321, + "learning_rate": 1.905595416336071e-05, + "loss": 0.3834, + "step": 6255 + }, + { + "epoch": 0.13953462453632415, + "grad_norm": 0.6685248613357544, + "learning_rate": 1.9054468550158688e-05, + "loss": 0.3735, + "step": 6260 + }, + { + "epoch": 0.1396460739169442, + "grad_norm": 0.5561477541923523, + "learning_rate": 1.9052981826948715e-05, + "loss": 0.3955, + "step": 6265 + }, + { + "epoch": 0.13975752329756427, + "grad_norm": 0.40665513277053833, + "learning_rate": 1.9051493993913044e-05, + "loss": 0.4795, + "step": 6270 + }, + { + "epoch": 0.13986897267818435, + "grad_norm": 0.4884146451950073, + "learning_rate": 1.9050005051234078e-05, + "loss": 0.3805, + "step": 6275 + }, + { + "epoch": 0.1399804220588044, + "grad_norm": 0.5093684196472168, + "learning_rate": 1.904851499909435e-05, + "loss": 0.4379, + "step": 6280 + }, + { + "epoch": 0.14009187143942448, + "grad_norm": 0.8419882655143738, + "learning_rate": 1.9047023837676525e-05, + "loss": 0.4529, + "step": 6285 + }, + { + "epoch": 0.14020332082004455, + "grad_norm": 0.7996892333030701, + "learning_rate": 1.904553156716341e-05, + "loss": 0.4288, + "step": 6290 + }, + { + "epoch": 0.1403147702006646, + "grad_norm": 0.4805566668510437, + "learning_rate": 1.9044038187737944e-05, + "loss": 0.4511, + "step": 6295 + }, + { + "epoch": 0.14042621958128468, + "grad_norm": 0.5690134763717651, + "learning_rate": 1.9042543699583204e-05, + "loss": 0.408, + "step": 6300 + }, + { + "epoch": 0.14053766896190475, + "grad_norm": 0.7435726523399353, + "learning_rate": 1.9041048102882408e-05, + "loss": 0.3276, + "step": 6305 + }, + { + "epoch": 0.1406491183425248, + "grad_norm": 0.7217281460762024, + "learning_rate": 1.90395513978189e-05, + "loss": 0.4166, + "step": 6310 + }, + { + "epoch": 0.14076056772314488, + "grad_norm": 0.562565267086029, + "learning_rate": 1.9038053584576165e-05, + "loss": 0.3688, + "step": 6315 + }, + { + "epoch": 0.14087201710376496, + "grad_norm": 0.5799996852874756, + "learning_rate": 1.9036554663337824e-05, + "loss": 0.3326, + "step": 6320 + }, + { + "epoch": 0.140983466484385, + "grad_norm": 0.4111635982990265, + "learning_rate": 1.903505463428763e-05, + "loss": 0.3406, + "step": 6325 + }, + { + "epoch": 0.14109491586500508, + "grad_norm": 0.5106934309005737, + "learning_rate": 1.903355349760948e-05, + "loss": 0.414, + "step": 6330 + }, + { + "epoch": 0.14120636524562516, + "grad_norm": 0.40918323397636414, + "learning_rate": 1.90320512534874e-05, + "loss": 0.3693, + "step": 6335 + }, + { + "epoch": 0.1413178146262452, + "grad_norm": 0.48011714220046997, + "learning_rate": 1.9030547902105554e-05, + "loss": 0.3422, + "step": 6340 + }, + { + "epoch": 0.14142926400686529, + "grad_norm": 0.5929650664329529, + "learning_rate": 1.902904344364824e-05, + "loss": 0.5012, + "step": 6345 + }, + { + "epoch": 0.14154071338748536, + "grad_norm": 0.471110463142395, + "learning_rate": 1.902753787829989e-05, + "loss": 0.3024, + "step": 6350 + }, + { + "epoch": 0.1416521627681054, + "grad_norm": 0.6223836541175842, + "learning_rate": 1.9026031206245077e-05, + "loss": 0.3263, + "step": 6355 + }, + { + "epoch": 0.1417636121487255, + "grad_norm": 0.37485095858573914, + "learning_rate": 1.9024523427668514e-05, + "loss": 0.4027, + "step": 6360 + }, + { + "epoch": 0.14187506152934556, + "grad_norm": 0.49914073944091797, + "learning_rate": 1.9023014542755035e-05, + "loss": 0.316, + "step": 6365 + }, + { + "epoch": 0.1419865109099656, + "grad_norm": 0.5025352835655212, + "learning_rate": 1.902150455168962e-05, + "loss": 0.6833, + "step": 6370 + }, + { + "epoch": 0.1420979602905857, + "grad_norm": 0.7094488739967346, + "learning_rate": 1.901999345465738e-05, + "loss": 0.3986, + "step": 6375 + }, + { + "epoch": 0.14220940967120574, + "grad_norm": 0.634559690952301, + "learning_rate": 1.901848125184357e-05, + "loss": 0.4283, + "step": 6380 + }, + { + "epoch": 0.14232085905182582, + "grad_norm": 0.6422039270401001, + "learning_rate": 1.9016967943433568e-05, + "loss": 0.3884, + "step": 6385 + }, + { + "epoch": 0.1424323084324459, + "grad_norm": 0.4295634329319, + "learning_rate": 1.90154535296129e-05, + "loss": 0.443, + "step": 6390 + }, + { + "epoch": 0.14254375781306594, + "grad_norm": 0.7843777537345886, + "learning_rate": 1.9013938010567215e-05, + "loss": 0.2879, + "step": 6395 + }, + { + "epoch": 0.14265520719368602, + "grad_norm": 0.7074270844459534, + "learning_rate": 1.9012421386482308e-05, + "loss": 0.4682, + "step": 6400 + }, + { + "epoch": 0.1427666565743061, + "grad_norm": 0.6161109805107117, + "learning_rate": 1.90109036575441e-05, + "loss": 0.3507, + "step": 6405 + }, + { + "epoch": 0.14287810595492614, + "grad_norm": 0.5402871370315552, + "learning_rate": 1.9009384823938663e-05, + "loss": 0.4624, + "step": 6410 + }, + { + "epoch": 0.14298955533554622, + "grad_norm": 0.7705127000808716, + "learning_rate": 1.9007864885852182e-05, + "loss": 0.461, + "step": 6415 + }, + { + "epoch": 0.1431010047161663, + "grad_norm": 0.662533700466156, + "learning_rate": 1.9006343843471003e-05, + "loss": 0.3798, + "step": 6420 + }, + { + "epoch": 0.14321245409678635, + "grad_norm": 0.5392851233482361, + "learning_rate": 1.9004821696981585e-05, + "loss": 0.3014, + "step": 6425 + }, + { + "epoch": 0.14332390347740642, + "grad_norm": 0.9114376902580261, + "learning_rate": 1.900329844657053e-05, + "loss": 0.3915, + "step": 6430 + }, + { + "epoch": 0.1434353528580265, + "grad_norm": 0.7330493927001953, + "learning_rate": 1.9001774092424583e-05, + "loss": 0.4391, + "step": 6435 + }, + { + "epoch": 0.14354680223864655, + "grad_norm": 0.5785829424858093, + "learning_rate": 1.9000248634730613e-05, + "loss": 0.3804, + "step": 6440 + }, + { + "epoch": 0.14365825161926662, + "grad_norm": 0.5360656380653381, + "learning_rate": 1.8998722073675636e-05, + "loss": 0.4382, + "step": 6445 + }, + { + "epoch": 0.1437697009998867, + "grad_norm": 0.4669645130634308, + "learning_rate": 1.8997194409446787e-05, + "loss": 0.3544, + "step": 6450 + }, + { + "epoch": 0.14388115038050675, + "grad_norm": 0.5305682420730591, + "learning_rate": 1.8995665642231354e-05, + "loss": 0.4464, + "step": 6455 + }, + { + "epoch": 0.14399259976112683, + "grad_norm": 0.5815770626068115, + "learning_rate": 1.899413577221675e-05, + "loss": 0.3564, + "step": 6460 + }, + { + "epoch": 0.1441040491417469, + "grad_norm": 0.5688044428825378, + "learning_rate": 1.8992604799590526e-05, + "loss": 0.3542, + "step": 6465 + }, + { + "epoch": 0.14421549852236695, + "grad_norm": 0.4591280519962311, + "learning_rate": 1.8991072724540364e-05, + "loss": 0.3537, + "step": 6470 + }, + { + "epoch": 0.14432694790298703, + "grad_norm": 0.5264057517051697, + "learning_rate": 1.8989539547254084e-05, + "loss": 0.4731, + "step": 6475 + }, + { + "epoch": 0.1444383972836071, + "grad_norm": 0.4802411198616028, + "learning_rate": 1.8988005267919644e-05, + "loss": 0.271, + "step": 6480 + }, + { + "epoch": 0.14454984666422716, + "grad_norm": 0.6552708745002747, + "learning_rate": 1.8986469886725135e-05, + "loss": 0.3038, + "step": 6485 + }, + { + "epoch": 0.14466129604484723, + "grad_norm": 0.41324883699417114, + "learning_rate": 1.8984933403858786e-05, + "loss": 0.296, + "step": 6490 + }, + { + "epoch": 0.1447727454254673, + "grad_norm": 0.4937549829483032, + "learning_rate": 1.8983395819508955e-05, + "loss": 0.4804, + "step": 6495 + }, + { + "epoch": 0.14488419480608736, + "grad_norm": 0.7439261674880981, + "learning_rate": 1.8981857133864136e-05, + "loss": 0.3872, + "step": 6500 + }, + { + "epoch": 0.14499564418670743, + "grad_norm": 0.5570749044418335, + "learning_rate": 1.898031734711296e-05, + "loss": 0.2656, + "step": 6505 + }, + { + "epoch": 0.1451070935673275, + "grad_norm": 0.5962104201316833, + "learning_rate": 1.8978776459444196e-05, + "loss": 0.3265, + "step": 6510 + }, + { + "epoch": 0.14521854294794756, + "grad_norm": 0.5931865572929382, + "learning_rate": 1.8977234471046743e-05, + "loss": 0.3804, + "step": 6515 + }, + { + "epoch": 0.14532999232856764, + "grad_norm": 0.5354698896408081, + "learning_rate": 1.897569138210964e-05, + "loss": 0.4132, + "step": 6520 + }, + { + "epoch": 0.1454414417091877, + "grad_norm": 0.4454991817474365, + "learning_rate": 1.8974147192822053e-05, + "loss": 0.4958, + "step": 6525 + }, + { + "epoch": 0.14555289108980776, + "grad_norm": 0.5197862982749939, + "learning_rate": 1.897260190337329e-05, + "loss": 0.3653, + "step": 6530 + }, + { + "epoch": 0.14566434047042784, + "grad_norm": 0.6639044284820557, + "learning_rate": 1.897105551395279e-05, + "loss": 0.4826, + "step": 6535 + }, + { + "epoch": 0.14577578985104792, + "grad_norm": 0.6526901721954346, + "learning_rate": 1.8969508024750137e-05, + "loss": 0.3015, + "step": 6540 + }, + { + "epoch": 0.14588723923166796, + "grad_norm": 0.4506591260433197, + "learning_rate": 1.8967959435955027e-05, + "loss": 0.5079, + "step": 6545 + }, + { + "epoch": 0.14599868861228804, + "grad_norm": 0.46098169684410095, + "learning_rate": 1.8966409747757314e-05, + "loss": 0.3314, + "step": 6550 + }, + { + "epoch": 0.14611013799290812, + "grad_norm": 0.5385696291923523, + "learning_rate": 1.8964858960346976e-05, + "loss": 0.5161, + "step": 6555 + }, + { + "epoch": 0.14622158737352817, + "grad_norm": 0.6555469036102295, + "learning_rate": 1.896330707391413e-05, + "loss": 0.4846, + "step": 6560 + }, + { + "epoch": 0.14633303675414824, + "grad_norm": 0.5580034255981445, + "learning_rate": 1.8961754088649018e-05, + "loss": 0.4174, + "step": 6565 + }, + { + "epoch": 0.1464444861347683, + "grad_norm": 0.5875076651573181, + "learning_rate": 1.896020000474203e-05, + "loss": 0.5164, + "step": 6570 + }, + { + "epoch": 0.14655593551538837, + "grad_norm": 0.6634327173233032, + "learning_rate": 1.8958644822383688e-05, + "loss": 0.4792, + "step": 6575 + }, + { + "epoch": 0.14666738489600845, + "grad_norm": 0.5859067440032959, + "learning_rate": 1.8957088541764637e-05, + "loss": 0.4333, + "step": 6580 + }, + { + "epoch": 0.1467788342766285, + "grad_norm": 0.513789176940918, + "learning_rate": 1.895553116307567e-05, + "loss": 0.5009, + "step": 6585 + }, + { + "epoch": 0.14689028365724857, + "grad_norm": 0.5502701997756958, + "learning_rate": 1.8953972686507707e-05, + "loss": 0.441, + "step": 6590 + }, + { + "epoch": 0.14700173303786865, + "grad_norm": 0.4884522557258606, + "learning_rate": 1.895241311225181e-05, + "loss": 0.3899, + "step": 6595 + }, + { + "epoch": 0.1471131824184887, + "grad_norm": 0.6013541221618652, + "learning_rate": 1.8950852440499163e-05, + "loss": 0.44, + "step": 6600 + }, + { + "epoch": 0.14722463179910877, + "grad_norm": 0.4754367172718048, + "learning_rate": 1.8949290671441097e-05, + "loss": 0.3826, + "step": 6605 + }, + { + "epoch": 0.14733608117972885, + "grad_norm": 0.563018798828125, + "learning_rate": 1.894772780526908e-05, + "loss": 0.4355, + "step": 6610 + }, + { + "epoch": 0.1474475305603489, + "grad_norm": 0.6593184471130371, + "learning_rate": 1.8946163842174692e-05, + "loss": 0.4039, + "step": 6615 + }, + { + "epoch": 0.14755897994096898, + "grad_norm": 0.644548773765564, + "learning_rate": 1.8944598782349675e-05, + "loss": 0.333, + "step": 6620 + }, + { + "epoch": 0.14767042932158905, + "grad_norm": 0.5700538158416748, + "learning_rate": 1.8943032625985885e-05, + "loss": 0.4342, + "step": 6625 + }, + { + "epoch": 0.1477818787022091, + "grad_norm": 0.4651893675327301, + "learning_rate": 1.894146537327533e-05, + "loss": 0.268, + "step": 6630 + }, + { + "epoch": 0.14789332808282918, + "grad_norm": 0.6951509118080139, + "learning_rate": 1.8939897024410134e-05, + "loss": 0.4129, + "step": 6635 + }, + { + "epoch": 0.14800477746344926, + "grad_norm": 0.5308429598808289, + "learning_rate": 1.893832757958257e-05, + "loss": 0.2941, + "step": 6640 + }, + { + "epoch": 0.1481162268440693, + "grad_norm": 0.5369426608085632, + "learning_rate": 1.8936757038985037e-05, + "loss": 0.2803, + "step": 6645 + }, + { + "epoch": 0.14822767622468938, + "grad_norm": 0.5123085975646973, + "learning_rate": 1.893518540281007e-05, + "loss": 0.3713, + "step": 6650 + }, + { + "epoch": 0.14833912560530946, + "grad_norm": 0.6187421083450317, + "learning_rate": 1.8933612671250345e-05, + "loss": 0.3995, + "step": 6655 + }, + { + "epoch": 0.1484505749859295, + "grad_norm": 0.5980579257011414, + "learning_rate": 1.893203884449866e-05, + "loss": 0.3866, + "step": 6660 + }, + { + "epoch": 0.14856202436654958, + "grad_norm": 0.7539716958999634, + "learning_rate": 1.8930463922747965e-05, + "loss": 0.502, + "step": 6665 + }, + { + "epoch": 0.14867347374716966, + "grad_norm": 0.571130633354187, + "learning_rate": 1.892888790619132e-05, + "loss": 0.4009, + "step": 6670 + }, + { + "epoch": 0.1487849231277897, + "grad_norm": 0.4514002799987793, + "learning_rate": 1.8927310795021938e-05, + "loss": 0.4243, + "step": 6675 + }, + { + "epoch": 0.14889637250840979, + "grad_norm": 0.7586348652839661, + "learning_rate": 1.892573258943316e-05, + "loss": 0.4853, + "step": 6680 + }, + { + "epoch": 0.14900782188902986, + "grad_norm": 0.36333855986595154, + "learning_rate": 1.8924153289618466e-05, + "loss": 0.3791, + "step": 6685 + }, + { + "epoch": 0.1491192712696499, + "grad_norm": 0.43393850326538086, + "learning_rate": 1.8922572895771458e-05, + "loss": 0.3568, + "step": 6690 + }, + { + "epoch": 0.14923072065027, + "grad_norm": 0.5546851754188538, + "learning_rate": 1.892099140808589e-05, + "loss": 0.381, + "step": 6695 + }, + { + "epoch": 0.14934217003089006, + "grad_norm": 0.4489659368991852, + "learning_rate": 1.8919408826755628e-05, + "loss": 0.3822, + "step": 6700 + }, + { + "epoch": 0.1494536194115101, + "grad_norm": 0.5850082635879517, + "learning_rate": 1.8917825151974698e-05, + "loss": 0.4964, + "step": 6705 + }, + { + "epoch": 0.1495650687921302, + "grad_norm": 0.6391961574554443, + "learning_rate": 1.8916240383937236e-05, + "loss": 0.4219, + "step": 6710 + }, + { + "epoch": 0.14967651817275027, + "grad_norm": 0.5481122732162476, + "learning_rate": 1.8914654522837525e-05, + "loss": 0.5315, + "step": 6715 + }, + { + "epoch": 0.14978796755337032, + "grad_norm": 0.5840669274330139, + "learning_rate": 1.8913067568869984e-05, + "loss": 0.3674, + "step": 6720 + }, + { + "epoch": 0.1498994169339904, + "grad_norm": 0.5937899947166443, + "learning_rate": 1.8911479522229154e-05, + "loss": 0.3358, + "step": 6725 + }, + { + "epoch": 0.15001086631461047, + "grad_norm": 0.6355303525924683, + "learning_rate": 1.890989038310972e-05, + "loss": 0.3997, + "step": 6730 + }, + { + "epoch": 0.15012231569523052, + "grad_norm": 0.5027339458465576, + "learning_rate": 1.89083001517065e-05, + "loss": 0.4098, + "step": 6735 + }, + { + "epoch": 0.1502337650758506, + "grad_norm": 0.8207029700279236, + "learning_rate": 1.8906708828214445e-05, + "loss": 0.5281, + "step": 6740 + }, + { + "epoch": 0.15034521445647067, + "grad_norm": 0.47056856751441956, + "learning_rate": 1.8905116412828636e-05, + "loss": 0.3632, + "step": 6745 + }, + { + "epoch": 0.15045666383709072, + "grad_norm": 0.8110126852989197, + "learning_rate": 1.890352290574429e-05, + "loss": 0.227, + "step": 6750 + }, + { + "epoch": 0.1505681132177108, + "grad_norm": 0.7214770913124084, + "learning_rate": 1.8901928307156762e-05, + "loss": 0.4157, + "step": 6755 + }, + { + "epoch": 0.15067956259833085, + "grad_norm": 0.5110815763473511, + "learning_rate": 1.8900332617261535e-05, + "loss": 0.2864, + "step": 6760 + }, + { + "epoch": 0.15079101197895092, + "grad_norm": 0.5272954106330872, + "learning_rate": 1.889873583625423e-05, + "loss": 0.3965, + "step": 6765 + }, + { + "epoch": 0.150902461359571, + "grad_norm": 0.6545156836509705, + "learning_rate": 1.8897137964330595e-05, + "loss": 0.4683, + "step": 6770 + }, + { + "epoch": 0.15101391074019105, + "grad_norm": 0.5515015721321106, + "learning_rate": 1.8895539001686526e-05, + "loss": 0.5359, + "step": 6775 + }, + { + "epoch": 0.15112536012081113, + "grad_norm": 0.5681248903274536, + "learning_rate": 1.8893938948518038e-05, + "loss": 0.4592, + "step": 6780 + }, + { + "epoch": 0.1512368095014312, + "grad_norm": 0.7550321221351624, + "learning_rate": 1.8892337805021282e-05, + "loss": 0.3513, + "step": 6785 + }, + { + "epoch": 0.15134825888205125, + "grad_norm": 0.5479450821876526, + "learning_rate": 1.8890735571392557e-05, + "loss": 0.3816, + "step": 6790 + }, + { + "epoch": 0.15145970826267133, + "grad_norm": 0.5612640380859375, + "learning_rate": 1.8889132247828267e-05, + "loss": 0.3922, + "step": 6795 + }, + { + "epoch": 0.1515711576432914, + "grad_norm": 0.4440547823905945, + "learning_rate": 1.8887527834524983e-05, + "loss": 0.3642, + "step": 6800 + }, + { + "epoch": 0.15168260702391145, + "grad_norm": 0.4283455014228821, + "learning_rate": 1.8885922331679388e-05, + "loss": 0.389, + "step": 6805 + }, + { + "epoch": 0.15179405640453153, + "grad_norm": 0.5806257128715515, + "learning_rate": 1.88843157394883e-05, + "loss": 0.3685, + "step": 6810 + }, + { + "epoch": 0.1519055057851516, + "grad_norm": 0.5102450847625732, + "learning_rate": 1.8882708058148683e-05, + "loss": 0.3832, + "step": 6815 + }, + { + "epoch": 0.15201695516577166, + "grad_norm": 0.6629839539527893, + "learning_rate": 1.888109928785762e-05, + "loss": 0.5464, + "step": 6820 + }, + { + "epoch": 0.15212840454639173, + "grad_norm": 0.5725881457328796, + "learning_rate": 1.8879489428812335e-05, + "loss": 0.2462, + "step": 6825 + }, + { + "epoch": 0.1522398539270118, + "grad_norm": 2.398331642150879, + "learning_rate": 1.887787848121019e-05, + "loss": 0.4956, + "step": 6830 + }, + { + "epoch": 0.15235130330763186, + "grad_norm": 0.6613421440124512, + "learning_rate": 1.8876266445248664e-05, + "loss": 0.4017, + "step": 6835 + }, + { + "epoch": 0.15246275268825193, + "grad_norm": 0.5280817151069641, + "learning_rate": 1.8874653321125388e-05, + "loss": 0.37, + "step": 6840 + }, + { + "epoch": 0.152574202068872, + "grad_norm": 0.5924332141876221, + "learning_rate": 1.8873039109038115e-05, + "loss": 0.4621, + "step": 6845 + }, + { + "epoch": 0.15268565144949206, + "grad_norm": 0.5465047955513, + "learning_rate": 1.887142380918474e-05, + "loss": 0.4276, + "step": 6850 + }, + { + "epoch": 0.15279710083011214, + "grad_norm": 0.6491557955741882, + "learning_rate": 1.886980742176328e-05, + "loss": 0.3074, + "step": 6855 + }, + { + "epoch": 0.1529085502107322, + "grad_norm": 0.6065066456794739, + "learning_rate": 1.8868189946971895e-05, + "loss": 0.4331, + "step": 6860 + }, + { + "epoch": 0.15301999959135226, + "grad_norm": 0.6120703816413879, + "learning_rate": 1.886657138500888e-05, + "loss": 0.272, + "step": 6865 + }, + { + "epoch": 0.15313144897197234, + "grad_norm": 0.4606035649776459, + "learning_rate": 1.8864951736072643e-05, + "loss": 0.3616, + "step": 6870 + }, + { + "epoch": 0.15324289835259242, + "grad_norm": 0.5181193351745605, + "learning_rate": 1.8863331000361755e-05, + "loss": 0.4053, + "step": 6875 + }, + { + "epoch": 0.15335434773321246, + "grad_norm": 0.3439059853553772, + "learning_rate": 1.88617091780749e-05, + "loss": 0.3357, + "step": 6880 + }, + { + "epoch": 0.15346579711383254, + "grad_norm": 0.5444257855415344, + "learning_rate": 1.8860086269410905e-05, + "loss": 0.4992, + "step": 6885 + }, + { + "epoch": 0.15357724649445262, + "grad_norm": 0.729205846786499, + "learning_rate": 1.885846227456872e-05, + "loss": 0.3517, + "step": 6890 + }, + { + "epoch": 0.15368869587507267, + "grad_norm": 0.5775438547134399, + "learning_rate": 1.8856837193747436e-05, + "loss": 0.4049, + "step": 6895 + }, + { + "epoch": 0.15380014525569274, + "grad_norm": 0.49706968665122986, + "learning_rate": 1.885521102714628e-05, + "loss": 0.3226, + "step": 6900 + }, + { + "epoch": 0.15391159463631282, + "grad_norm": 0.5086760520935059, + "learning_rate": 1.8853583774964598e-05, + "loss": 0.4231, + "step": 6905 + }, + { + "epoch": 0.15402304401693287, + "grad_norm": 0.5242869257926941, + "learning_rate": 1.885195543740189e-05, + "loss": 0.3973, + "step": 6910 + }, + { + "epoch": 0.15413449339755295, + "grad_norm": 0.6669424176216125, + "learning_rate": 1.8850326014657765e-05, + "loss": 0.43, + "step": 6915 + }, + { + "epoch": 0.15424594277817302, + "grad_norm": 0.6157550811767578, + "learning_rate": 1.8848695506931995e-05, + "loss": 0.4676, + "step": 6920 + }, + { + "epoch": 0.15435739215879307, + "grad_norm": 0.3835831880569458, + "learning_rate": 1.8847063914424447e-05, + "loss": 0.3631, + "step": 6925 + }, + { + "epoch": 0.15446884153941315, + "grad_norm": 0.564392626285553, + "learning_rate": 1.8845431237335158e-05, + "loss": 0.4559, + "step": 6930 + }, + { + "epoch": 0.15458029092003323, + "grad_norm": 0.5257560610771179, + "learning_rate": 1.8843797475864274e-05, + "loss": 0.4221, + "step": 6935 + }, + { + "epoch": 0.15469174030065327, + "grad_norm": 0.5841484665870667, + "learning_rate": 1.8842162630212083e-05, + "loss": 0.3595, + "step": 6940 + }, + { + "epoch": 0.15480318968127335, + "grad_norm": 0.5178191065788269, + "learning_rate": 1.8840526700579004e-05, + "loss": 0.3742, + "step": 6945 + }, + { + "epoch": 0.1549146390618934, + "grad_norm": 0.4463173747062683, + "learning_rate": 1.8838889687165592e-05, + "loss": 0.3189, + "step": 6950 + }, + { + "epoch": 0.15502608844251348, + "grad_norm": 0.5193114280700684, + "learning_rate": 1.883725159017253e-05, + "loss": 0.2913, + "step": 6955 + }, + { + "epoch": 0.15513753782313355, + "grad_norm": 0.4916064143180847, + "learning_rate": 1.8835612409800634e-05, + "loss": 0.2726, + "step": 6960 + }, + { + "epoch": 0.1552489872037536, + "grad_norm": 0.6708848476409912, + "learning_rate": 1.883397214625086e-05, + "loss": 0.4509, + "step": 6965 + }, + { + "epoch": 0.15536043658437368, + "grad_norm": 0.5058689117431641, + "learning_rate": 1.8832330799724285e-05, + "loss": 0.3408, + "step": 6970 + }, + { + "epoch": 0.15547188596499376, + "grad_norm": 0.5161834359169006, + "learning_rate": 1.8830688370422127e-05, + "loss": 0.2951, + "step": 6975 + }, + { + "epoch": 0.1555833353456138, + "grad_norm": 1.0592902898788452, + "learning_rate": 1.8829044858545744e-05, + "loss": 0.4411, + "step": 6980 + }, + { + "epoch": 0.15569478472623388, + "grad_norm": 0.4401077628135681, + "learning_rate": 1.8827400264296606e-05, + "loss": 0.3229, + "step": 6985 + }, + { + "epoch": 0.15580623410685396, + "grad_norm": 0.48951423168182373, + "learning_rate": 1.8825754587876335e-05, + "loss": 0.4544, + "step": 6990 + }, + { + "epoch": 0.155917683487474, + "grad_norm": 0.5329950451850891, + "learning_rate": 1.8824107829486674e-05, + "loss": 0.4696, + "step": 6995 + }, + { + "epoch": 0.15602913286809408, + "grad_norm": 0.4187164604663849, + "learning_rate": 1.8822459989329508e-05, + "loss": 0.4175, + "step": 7000 + }, + { + "epoch": 0.15614058224871416, + "grad_norm": 0.5629317164421082, + "learning_rate": 1.8820811067606844e-05, + "loss": 0.4599, + "step": 7005 + }, + { + "epoch": 0.1562520316293342, + "grad_norm": 0.5468907952308655, + "learning_rate": 1.881916106452083e-05, + "loss": 0.3809, + "step": 7010 + }, + { + "epoch": 0.15636348100995429, + "grad_norm": 0.4851939380168915, + "learning_rate": 1.8817509980273742e-05, + "loss": 0.384, + "step": 7015 + }, + { + "epoch": 0.15647493039057436, + "grad_norm": 0.6714935898780823, + "learning_rate": 1.8815857815067994e-05, + "loss": 0.3595, + "step": 7020 + }, + { + "epoch": 0.1565863797711944, + "grad_norm": 0.6519018411636353, + "learning_rate": 1.8814204569106124e-05, + "loss": 0.4568, + "step": 7025 + }, + { + "epoch": 0.1566978291518145, + "grad_norm": 0.6960158348083496, + "learning_rate": 1.8812550242590805e-05, + "loss": 0.4064, + "step": 7030 + }, + { + "epoch": 0.15680927853243457, + "grad_norm": 0.4918147623538971, + "learning_rate": 1.8810894835724854e-05, + "loss": 0.4429, + "step": 7035 + }, + { + "epoch": 0.15692072791305461, + "grad_norm": 0.45890048146247864, + "learning_rate": 1.8809238348711206e-05, + "loss": 0.346, + "step": 7040 + }, + { + "epoch": 0.1570321772936747, + "grad_norm": 0.6213566660881042, + "learning_rate": 1.8807580781752932e-05, + "loss": 0.4872, + "step": 7045 + }, + { + "epoch": 0.15714362667429477, + "grad_norm": 0.4078867435455322, + "learning_rate": 1.880592213505324e-05, + "loss": 0.2985, + "step": 7050 + }, + { + "epoch": 0.15725507605491482, + "grad_norm": 0.46203523874282837, + "learning_rate": 1.880426240881546e-05, + "loss": 0.4425, + "step": 7055 + }, + { + "epoch": 0.1573665254355349, + "grad_norm": 0.6020154356956482, + "learning_rate": 1.880260160324307e-05, + "loss": 0.4224, + "step": 7060 + }, + { + "epoch": 0.15747797481615497, + "grad_norm": 0.692721426486969, + "learning_rate": 1.880093971853967e-05, + "loss": 0.5081, + "step": 7065 + }, + { + "epoch": 0.15758942419677502, + "grad_norm": 0.49963024258613586, + "learning_rate": 1.8799276754908992e-05, + "loss": 0.4919, + "step": 7070 + }, + { + "epoch": 0.1577008735773951, + "grad_norm": 0.5270716547966003, + "learning_rate": 1.8797612712554904e-05, + "loss": 0.3817, + "step": 7075 + }, + { + "epoch": 0.15781232295801517, + "grad_norm": 0.6648659706115723, + "learning_rate": 1.879594759168141e-05, + "loss": 0.3285, + "step": 7080 + }, + { + "epoch": 0.15792377233863522, + "grad_norm": 0.39034026861190796, + "learning_rate": 1.8794281392492627e-05, + "loss": 0.2896, + "step": 7085 + }, + { + "epoch": 0.1580352217192553, + "grad_norm": 1.2855838537216187, + "learning_rate": 1.8792614115192834e-05, + "loss": 0.4575, + "step": 7090 + }, + { + "epoch": 0.15814667109987537, + "grad_norm": 0.5977685451507568, + "learning_rate": 1.8790945759986414e-05, + "loss": 0.4771, + "step": 7095 + }, + { + "epoch": 0.15825812048049542, + "grad_norm": 0.8264610767364502, + "learning_rate": 1.87892763270779e-05, + "loss": 0.4239, + "step": 7100 + }, + { + "epoch": 0.1583695698611155, + "grad_norm": 0.5660960078239441, + "learning_rate": 1.8787605816671956e-05, + "loss": 0.3571, + "step": 7105 + }, + { + "epoch": 0.15848101924173558, + "grad_norm": 0.6069998741149902, + "learning_rate": 1.8785934228973364e-05, + "loss": 0.3975, + "step": 7110 + }, + { + "epoch": 0.15859246862235563, + "grad_norm": 0.6635084748268127, + "learning_rate": 1.8784261564187053e-05, + "loss": 0.4047, + "step": 7115 + }, + { + "epoch": 0.1587039180029757, + "grad_norm": 0.5507485866546631, + "learning_rate": 1.878258782251808e-05, + "loss": 0.4622, + "step": 7120 + }, + { + "epoch": 0.15881536738359578, + "grad_norm": 0.5426681637763977, + "learning_rate": 1.8780913004171628e-05, + "loss": 0.4192, + "step": 7125 + }, + { + "epoch": 0.15892681676421583, + "grad_norm": 0.7298119068145752, + "learning_rate": 1.8779237109353023e-05, + "loss": 0.4339, + "step": 7130 + }, + { + "epoch": 0.1590382661448359, + "grad_norm": 0.6015418767929077, + "learning_rate": 1.8777560138267712e-05, + "loss": 0.4617, + "step": 7135 + }, + { + "epoch": 0.15914971552545595, + "grad_norm": 0.4312678873538971, + "learning_rate": 1.8775882091121282e-05, + "loss": 0.283, + "step": 7140 + }, + { + "epoch": 0.15926116490607603, + "grad_norm": 0.6461197137832642, + "learning_rate": 1.8774202968119447e-05, + "loss": 0.3925, + "step": 7145 + }, + { + "epoch": 0.1593726142866961, + "grad_norm": 0.5625909566879272, + "learning_rate": 1.8772522769468054e-05, + "loss": 0.4194, + "step": 7150 + }, + { + "epoch": 0.15948406366731616, + "grad_norm": 0.6251127123832703, + "learning_rate": 1.8770841495373083e-05, + "loss": 0.4162, + "step": 7155 + }, + { + "epoch": 0.15959551304793623, + "grad_norm": 0.49051955342292786, + "learning_rate": 1.8769159146040644e-05, + "loss": 0.4761, + "step": 7160 + }, + { + "epoch": 0.1597069624285563, + "grad_norm": 0.590154767036438, + "learning_rate": 1.876747572167698e-05, + "loss": 0.3741, + "step": 7165 + }, + { + "epoch": 0.15981841180917636, + "grad_norm": 0.5282180905342102, + "learning_rate": 1.8765791222488472e-05, + "loss": 0.3672, + "step": 7170 + }, + { + "epoch": 0.15992986118979644, + "grad_norm": 0.3630257844924927, + "learning_rate": 1.8764105648681615e-05, + "loss": 0.3332, + "step": 7175 + }, + { + "epoch": 0.1600413105704165, + "grad_norm": 0.48569926619529724, + "learning_rate": 1.876241900046306e-05, + "loss": 0.3211, + "step": 7180 + }, + { + "epoch": 0.16015275995103656, + "grad_norm": 0.6470745205879211, + "learning_rate": 1.876073127803957e-05, + "loss": 0.4411, + "step": 7185 + }, + { + "epoch": 0.16026420933165664, + "grad_norm": 0.49332156777381897, + "learning_rate": 1.8759042481618047e-05, + "loss": 0.4793, + "step": 7190 + }, + { + "epoch": 0.16037565871227671, + "grad_norm": 0.5013951659202576, + "learning_rate": 1.8757352611405525e-05, + "loss": 0.337, + "step": 7195 + }, + { + "epoch": 0.16048710809289676, + "grad_norm": 0.47468462586402893, + "learning_rate": 1.8755661667609167e-05, + "loss": 0.4142, + "step": 7200 + }, + { + "epoch": 0.16059855747351684, + "grad_norm": 0.5139350891113281, + "learning_rate": 1.8753969650436274e-05, + "loss": 0.4156, + "step": 7205 + }, + { + "epoch": 0.16071000685413692, + "grad_norm": 0.6101190447807312, + "learning_rate": 1.8752276560094273e-05, + "loss": 0.3468, + "step": 7210 + }, + { + "epoch": 0.16082145623475697, + "grad_norm": 0.5023723840713501, + "learning_rate": 1.875058239679072e-05, + "loss": 0.4566, + "step": 7215 + }, + { + "epoch": 0.16093290561537704, + "grad_norm": 0.48146092891693115, + "learning_rate": 1.8748887160733315e-05, + "loss": 0.3785, + "step": 7220 + }, + { + "epoch": 0.16104435499599712, + "grad_norm": 0.6446987986564636, + "learning_rate": 1.8747190852129868e-05, + "loss": 0.318, + "step": 7225 + }, + { + "epoch": 0.16115580437661717, + "grad_norm": 0.5768639445304871, + "learning_rate": 1.8745493471188348e-05, + "loss": 0.348, + "step": 7230 + }, + { + "epoch": 0.16126725375723724, + "grad_norm": 0.5417771935462952, + "learning_rate": 1.8743795018116827e-05, + "loss": 0.3935, + "step": 7235 + }, + { + "epoch": 0.16137870313785732, + "grad_norm": 0.3816154897212982, + "learning_rate": 1.874209549312353e-05, + "loss": 0.4261, + "step": 7240 + }, + { + "epoch": 0.16149015251847737, + "grad_norm": 0.6149434447288513, + "learning_rate": 1.8740394896416806e-05, + "loss": 0.483, + "step": 7245 + }, + { + "epoch": 0.16160160189909745, + "grad_norm": 0.7580816745758057, + "learning_rate": 1.873869322820513e-05, + "loss": 0.4668, + "step": 7250 + }, + { + "epoch": 0.16171305127971752, + "grad_norm": 0.5908399820327759, + "learning_rate": 1.873699048869712e-05, + "loss": 0.4884, + "step": 7255 + }, + { + "epoch": 0.16182450066033757, + "grad_norm": 0.7933946251869202, + "learning_rate": 1.8735286678101515e-05, + "loss": 0.5547, + "step": 7260 + }, + { + "epoch": 0.16193595004095765, + "grad_norm": 0.39468976855278015, + "learning_rate": 1.8733581796627187e-05, + "loss": 0.375, + "step": 7265 + }, + { + "epoch": 0.16204739942157773, + "grad_norm": 0.6789149641990662, + "learning_rate": 1.8731875844483145e-05, + "loss": 0.2763, + "step": 7270 + }, + { + "epoch": 0.16215884880219777, + "grad_norm": 0.684878408908844, + "learning_rate": 1.8730168821878527e-05, + "loss": 0.3293, + "step": 7275 + }, + { + "epoch": 0.16227029818281785, + "grad_norm": 0.5506591796875, + "learning_rate": 1.8728460729022592e-05, + "loss": 0.3465, + "step": 7280 + }, + { + "epoch": 0.16238174756343793, + "grad_norm": 0.509867787361145, + "learning_rate": 1.8726751566124747e-05, + "loss": 0.4675, + "step": 7285 + }, + { + "epoch": 0.16249319694405798, + "grad_norm": 0.7684085369110107, + "learning_rate": 1.8725041333394523e-05, + "loss": 0.4364, + "step": 7290 + }, + { + "epoch": 0.16260464632467805, + "grad_norm": 0.5433658361434937, + "learning_rate": 1.872333003104158e-05, + "loss": 0.3737, + "step": 7295 + }, + { + "epoch": 0.16271609570529813, + "grad_norm": 0.6474670171737671, + "learning_rate": 1.87216176592757e-05, + "loss": 0.3013, + "step": 7300 + }, + { + "epoch": 0.16282754508591818, + "grad_norm": 0.5131295919418335, + "learning_rate": 1.8719904218306822e-05, + "loss": 0.388, + "step": 7305 + }, + { + "epoch": 0.16293899446653826, + "grad_norm": 0.5305821895599365, + "learning_rate": 1.8718189708344997e-05, + "loss": 0.3985, + "step": 7310 + }, + { + "epoch": 0.16305044384715833, + "grad_norm": 0.5121385455131531, + "learning_rate": 1.8716474129600403e-05, + "loss": 0.4053, + "step": 7315 + }, + { + "epoch": 0.16316189322777838, + "grad_norm": 0.6189393401145935, + "learning_rate": 1.871475748228336e-05, + "loss": 0.3405, + "step": 7320 + }, + { + "epoch": 0.16327334260839846, + "grad_norm": 0.34280717372894287, + "learning_rate": 1.8713039766604325e-05, + "loss": 0.4189, + "step": 7325 + }, + { + "epoch": 0.1633847919890185, + "grad_norm": 0.4888545274734497, + "learning_rate": 1.8711320982773863e-05, + "loss": 0.3416, + "step": 7330 + }, + { + "epoch": 0.16349624136963858, + "grad_norm": 0.6265230178833008, + "learning_rate": 1.870960113100269e-05, + "loss": 0.4796, + "step": 7335 + }, + { + "epoch": 0.16360769075025866, + "grad_norm": 0.5085820555686951, + "learning_rate": 1.8707880211501646e-05, + "loss": 0.4041, + "step": 7340 + }, + { + "epoch": 0.1637191401308787, + "grad_norm": 0.5407921075820923, + "learning_rate": 1.8706158224481704e-05, + "loss": 0.2705, + "step": 7345 + }, + { + "epoch": 0.1638305895114988, + "grad_norm": 0.38955292105674744, + "learning_rate": 1.8704435170153963e-05, + "loss": 0.3524, + "step": 7350 + }, + { + "epoch": 0.16394203889211886, + "grad_norm": 0.46001070737838745, + "learning_rate": 1.870271104872966e-05, + "loss": 0.4483, + "step": 7355 + }, + { + "epoch": 0.1640534882727389, + "grad_norm": 0.6388995051383972, + "learning_rate": 1.8700985860420156e-05, + "loss": 0.4532, + "step": 7360 + }, + { + "epoch": 0.164164937653359, + "grad_norm": 0.6003766655921936, + "learning_rate": 1.869925960543695e-05, + "loss": 0.4178, + "step": 7365 + }, + { + "epoch": 0.16427638703397907, + "grad_norm": 0.45454123616218567, + "learning_rate": 1.869753228399166e-05, + "loss": 0.2413, + "step": 7370 + }, + { + "epoch": 0.16438783641459911, + "grad_norm": 0.604836106300354, + "learning_rate": 1.8695803896296048e-05, + "loss": 0.3807, + "step": 7375 + }, + { + "epoch": 0.1644992857952192, + "grad_norm": 0.42217013239860535, + "learning_rate": 1.8694074442562e-05, + "loss": 0.2366, + "step": 7380 + }, + { + "epoch": 0.16461073517583927, + "grad_norm": 0.4794704020023346, + "learning_rate": 1.869234392300153e-05, + "loss": 0.3015, + "step": 7385 + }, + { + "epoch": 0.16472218455645932, + "grad_norm": 0.4159165620803833, + "learning_rate": 1.8690612337826795e-05, + "loss": 0.3359, + "step": 7390 + }, + { + "epoch": 0.1648336339370794, + "grad_norm": 0.5214026570320129, + "learning_rate": 1.8688879687250067e-05, + "loss": 0.3039, + "step": 7395 + }, + { + "epoch": 0.16494508331769947, + "grad_norm": 0.45507606863975525, + "learning_rate": 1.8687145971483757e-05, + "loss": 0.338, + "step": 7400 + }, + { + "epoch": 0.16505653269831952, + "grad_norm": 0.5133429765701294, + "learning_rate": 1.8685411190740404e-05, + "loss": 0.3267, + "step": 7405 + }, + { + "epoch": 0.1651679820789396, + "grad_norm": 0.5593928098678589, + "learning_rate": 1.8683675345232683e-05, + "loss": 0.4158, + "step": 7410 + }, + { + "epoch": 0.16527943145955967, + "grad_norm": 0.5510695576667786, + "learning_rate": 1.868193843517339e-05, + "loss": 0.3397, + "step": 7415 + }, + { + "epoch": 0.16539088084017972, + "grad_norm": 0.4579789340496063, + "learning_rate": 1.868020046077546e-05, + "loss": 0.4361, + "step": 7420 + }, + { + "epoch": 0.1655023302207998, + "grad_norm": 0.5290041565895081, + "learning_rate": 1.8678461422251956e-05, + "loss": 0.3467, + "step": 7425 + }, + { + "epoch": 0.16561377960141987, + "grad_norm": 0.5120608806610107, + "learning_rate": 1.8676721319816064e-05, + "loss": 0.3066, + "step": 7430 + }, + { + "epoch": 0.16572522898203992, + "grad_norm": 0.6638226509094238, + "learning_rate": 1.8674980153681116e-05, + "loss": 0.4378, + "step": 7435 + }, + { + "epoch": 0.16583667836266, + "grad_norm": 0.5974465012550354, + "learning_rate": 1.867323792406056e-05, + "loss": 0.3986, + "step": 7440 + }, + { + "epoch": 0.16594812774328008, + "grad_norm": 0.7560718655586243, + "learning_rate": 1.8671494631167982e-05, + "loss": 0.3244, + "step": 7445 + }, + { + "epoch": 0.16605957712390013, + "grad_norm": 0.674818217754364, + "learning_rate": 1.8669750275217097e-05, + "loss": 0.4048, + "step": 7450 + }, + { + "epoch": 0.1661710265045202, + "grad_norm": 0.7948163151741028, + "learning_rate": 1.8668004856421748e-05, + "loss": 0.2647, + "step": 7455 + }, + { + "epoch": 0.16628247588514028, + "grad_norm": 0.5481162071228027, + "learning_rate": 1.8666258374995912e-05, + "loss": 0.2952, + "step": 7460 + }, + { + "epoch": 0.16639392526576033, + "grad_norm": 0.5210584402084351, + "learning_rate": 1.866451083115369e-05, + "loss": 0.3767, + "step": 7465 + }, + { + "epoch": 0.1665053746463804, + "grad_norm": 0.9675779938697815, + "learning_rate": 1.866276222510932e-05, + "loss": 0.4264, + "step": 7470 + }, + { + "epoch": 0.16661682402700048, + "grad_norm": 0.6672073602676392, + "learning_rate": 1.8661012557077167e-05, + "loss": 0.3939, + "step": 7475 + }, + { + "epoch": 0.16672827340762053, + "grad_norm": 0.50336754322052, + "learning_rate": 1.865926182727173e-05, + "loss": 0.3563, + "step": 7480 + }, + { + "epoch": 0.1668397227882406, + "grad_norm": 0.538811445236206, + "learning_rate": 1.865751003590763e-05, + "loss": 0.4235, + "step": 7485 + }, + { + "epoch": 0.16695117216886068, + "grad_norm": 0.5374696850776672, + "learning_rate": 1.8655757183199624e-05, + "loss": 0.2565, + "step": 7490 + }, + { + "epoch": 0.16706262154948073, + "grad_norm": 0.5080285668373108, + "learning_rate": 1.8654003269362602e-05, + "loss": 0.3744, + "step": 7495 + }, + { + "epoch": 0.1671740709301008, + "grad_norm": 0.9205856919288635, + "learning_rate": 1.8652248294611576e-05, + "loss": 0.3381, + "step": 7500 + }, + { + "epoch": 0.1672855203107209, + "grad_norm": 0.5005443096160889, + "learning_rate": 1.8650492259161696e-05, + "loss": 0.4025, + "step": 7505 + }, + { + "epoch": 0.16739696969134094, + "grad_norm": 0.5921610593795776, + "learning_rate": 1.8648735163228235e-05, + "loss": 0.2706, + "step": 7510 + }, + { + "epoch": 0.167508419071961, + "grad_norm": 0.551630973815918, + "learning_rate": 1.8646977007026602e-05, + "loss": 0.3583, + "step": 7515 + }, + { + "epoch": 0.16761986845258106, + "grad_norm": 0.6462824940681458, + "learning_rate": 1.8645217790772333e-05, + "loss": 0.3964, + "step": 7520 + }, + { + "epoch": 0.16773131783320114, + "grad_norm": 0.4856247305870056, + "learning_rate": 1.8643457514681093e-05, + "loss": 0.4055, + "step": 7525 + }, + { + "epoch": 0.16784276721382121, + "grad_norm": 0.6250535845756531, + "learning_rate": 1.864169617896868e-05, + "loss": 0.4841, + "step": 7530 + }, + { + "epoch": 0.16795421659444126, + "grad_norm": 0.40336018800735474, + "learning_rate": 1.863993378385102e-05, + "loss": 0.3998, + "step": 7535 + }, + { + "epoch": 0.16806566597506134, + "grad_norm": 0.5048686265945435, + "learning_rate": 1.8638170329544164e-05, + "loss": 0.4064, + "step": 7540 + }, + { + "epoch": 0.16817711535568142, + "grad_norm": 0.5588499307632446, + "learning_rate": 1.8636405816264303e-05, + "loss": 0.3946, + "step": 7545 + }, + { + "epoch": 0.16828856473630147, + "grad_norm": 0.6174390316009521, + "learning_rate": 1.8634640244227756e-05, + "loss": 0.4433, + "step": 7550 + }, + { + "epoch": 0.16840001411692154, + "grad_norm": 0.5336619019508362, + "learning_rate": 1.863287361365096e-05, + "loss": 0.3856, + "step": 7555 + }, + { + "epoch": 0.16851146349754162, + "grad_norm": 1.3543431758880615, + "learning_rate": 1.8631105924750496e-05, + "loss": 0.3679, + "step": 7560 + }, + { + "epoch": 0.16862291287816167, + "grad_norm": 0.7976319193840027, + "learning_rate": 1.8629337177743067e-05, + "loss": 0.2943, + "step": 7565 + }, + { + "epoch": 0.16873436225878174, + "grad_norm": 0.5987356305122375, + "learning_rate": 1.862756737284551e-05, + "loss": 0.3568, + "step": 7570 + }, + { + "epoch": 0.16884581163940182, + "grad_norm": 0.4379819631576538, + "learning_rate": 1.8625796510274785e-05, + "loss": 0.352, + "step": 7575 + }, + { + "epoch": 0.16895726102002187, + "grad_norm": 1.3696235418319702, + "learning_rate": 1.862402459024799e-05, + "loss": 0.4194, + "step": 7580 + }, + { + "epoch": 0.16906871040064195, + "grad_norm": 0.6878867745399475, + "learning_rate": 1.8622251612982347e-05, + "loss": 0.3598, + "step": 7585 + }, + { + "epoch": 0.16918015978126202, + "grad_norm": 0.5670821070671082, + "learning_rate": 1.862047757869521e-05, + "loss": 0.4934, + "step": 7590 + }, + { + "epoch": 0.16929160916188207, + "grad_norm": 0.5964428186416626, + "learning_rate": 1.8618702487604064e-05, + "loss": 0.464, + "step": 7595 + }, + { + "epoch": 0.16940305854250215, + "grad_norm": 0.8312402963638306, + "learning_rate": 1.8616926339926515e-05, + "loss": 0.5313, + "step": 7600 + }, + { + "epoch": 0.16951450792312223, + "grad_norm": 0.4638909697532654, + "learning_rate": 1.8615149135880312e-05, + "loss": 0.351, + "step": 7605 + }, + { + "epoch": 0.16962595730374228, + "grad_norm": 0.7792471647262573, + "learning_rate": 1.8613370875683327e-05, + "loss": 0.4167, + "step": 7610 + }, + { + "epoch": 0.16973740668436235, + "grad_norm": 0.5532524585723877, + "learning_rate": 1.861159155955355e-05, + "loss": 0.402, + "step": 7615 + }, + { + "epoch": 0.16984885606498243, + "grad_norm": 0.567893922328949, + "learning_rate": 1.8609811187709124e-05, + "loss": 0.4829, + "step": 7620 + }, + { + "epoch": 0.16996030544560248, + "grad_norm": 0.6022348999977112, + "learning_rate": 1.8608029760368302e-05, + "loss": 0.3435, + "step": 7625 + }, + { + "epoch": 0.17007175482622255, + "grad_norm": 0.803606390953064, + "learning_rate": 1.8606247277749476e-05, + "loss": 0.3426, + "step": 7630 + }, + { + "epoch": 0.17018320420684263, + "grad_norm": 0.49966391921043396, + "learning_rate": 1.8604463740071165e-05, + "loss": 0.4837, + "step": 7635 + }, + { + "epoch": 0.17029465358746268, + "grad_norm": 0.4341173470020294, + "learning_rate": 1.8602679147552014e-05, + "loss": 0.4424, + "step": 7640 + }, + { + "epoch": 0.17040610296808276, + "grad_norm": 0.5094113945960999, + "learning_rate": 1.8600893500410803e-05, + "loss": 0.3534, + "step": 7645 + }, + { + "epoch": 0.17051755234870283, + "grad_norm": 0.6519873738288879, + "learning_rate": 1.8599106798866438e-05, + "loss": 0.3653, + "step": 7650 + }, + { + "epoch": 0.17062900172932288, + "grad_norm": 0.5553015470504761, + "learning_rate": 1.8597319043137952e-05, + "loss": 0.2955, + "step": 7655 + }, + { + "epoch": 0.17074045110994296, + "grad_norm": 0.5292229652404785, + "learning_rate": 1.8595530233444514e-05, + "loss": 0.3143, + "step": 7660 + }, + { + "epoch": 0.17085190049056304, + "grad_norm": 0.5739569664001465, + "learning_rate": 1.8593740370005415e-05, + "loss": 0.3195, + "step": 7665 + }, + { + "epoch": 0.17096334987118308, + "grad_norm": 0.502048671245575, + "learning_rate": 1.8591949453040083e-05, + "loss": 0.3717, + "step": 7670 + }, + { + "epoch": 0.17107479925180316, + "grad_norm": 0.5392212867736816, + "learning_rate": 1.8590157482768064e-05, + "loss": 0.3698, + "step": 7675 + }, + { + "epoch": 0.17118624863242324, + "grad_norm": 0.4482336938381195, + "learning_rate": 1.858836445940905e-05, + "loss": 0.4866, + "step": 7680 + }, + { + "epoch": 0.1712976980130433, + "grad_norm": 0.5693212151527405, + "learning_rate": 1.858657038318284e-05, + "loss": 0.3281, + "step": 7685 + }, + { + "epoch": 0.17140914739366336, + "grad_norm": 0.49769172072410583, + "learning_rate": 1.8584775254309378e-05, + "loss": 0.3215, + "step": 7690 + }, + { + "epoch": 0.17152059677428344, + "grad_norm": 0.5934991240501404, + "learning_rate": 1.858297907300874e-05, + "loss": 0.3258, + "step": 7695 + }, + { + "epoch": 0.1716320461549035, + "grad_norm": 0.5538322329521179, + "learning_rate": 1.858118183950111e-05, + "loss": 0.4904, + "step": 7700 + }, + { + "epoch": 0.17174349553552357, + "grad_norm": 0.6971874833106995, + "learning_rate": 1.8579383554006833e-05, + "loss": 0.5043, + "step": 7705 + }, + { + "epoch": 0.17185494491614361, + "grad_norm": 0.6677750945091248, + "learning_rate": 1.8577584216746345e-05, + "loss": 0.4446, + "step": 7710 + }, + { + "epoch": 0.1719663942967637, + "grad_norm": 0.5418416857719421, + "learning_rate": 1.8575783827940245e-05, + "loss": 0.4064, + "step": 7715 + }, + { + "epoch": 0.17207784367738377, + "grad_norm": 0.7056881189346313, + "learning_rate": 1.8573982387809244e-05, + "loss": 0.4147, + "step": 7720 + }, + { + "epoch": 0.17218929305800382, + "grad_norm": 0.6976264715194702, + "learning_rate": 1.8572179896574184e-05, + "loss": 0.4984, + "step": 7725 + }, + { + "epoch": 0.1723007424386239, + "grad_norm": 0.5107178688049316, + "learning_rate": 1.8570376354456033e-05, + "loss": 0.3481, + "step": 7730 + }, + { + "epoch": 0.17241219181924397, + "grad_norm": 0.9620299935340881, + "learning_rate": 1.8568571761675893e-05, + "loss": 0.3581, + "step": 7735 + }, + { + "epoch": 0.17252364119986402, + "grad_norm": 0.4947699010372162, + "learning_rate": 1.8566766118454996e-05, + "loss": 0.4885, + "step": 7740 + }, + { + "epoch": 0.1726350905804841, + "grad_norm": 0.4762234389781952, + "learning_rate": 1.85649594250147e-05, + "loss": 0.4067, + "step": 7745 + }, + { + "epoch": 0.17274653996110417, + "grad_norm": 0.9069308042526245, + "learning_rate": 1.8563151681576487e-05, + "loss": 0.2346, + "step": 7750 + }, + { + "epoch": 0.17285798934172422, + "grad_norm": 0.5567780137062073, + "learning_rate": 1.8561342888361978e-05, + "loss": 0.3078, + "step": 7755 + }, + { + "epoch": 0.1729694387223443, + "grad_norm": 0.641209602355957, + "learning_rate": 1.855953304559291e-05, + "loss": 0.4647, + "step": 7760 + }, + { + "epoch": 0.17308088810296438, + "grad_norm": 0.5287478566169739, + "learning_rate": 1.8557722153491166e-05, + "loss": 0.3828, + "step": 7765 + }, + { + "epoch": 0.17319233748358442, + "grad_norm": 0.5798740386962891, + "learning_rate": 1.855591021227874e-05, + "loss": 0.3984, + "step": 7770 + }, + { + "epoch": 0.1733037868642045, + "grad_norm": 0.6225942373275757, + "learning_rate": 1.855409722217776e-05, + "loss": 0.3252, + "step": 7775 + }, + { + "epoch": 0.17341523624482458, + "grad_norm": 0.6901559829711914, + "learning_rate": 1.855228318341049e-05, + "loss": 0.3742, + "step": 7780 + }, + { + "epoch": 0.17352668562544463, + "grad_norm": 0.5212215185165405, + "learning_rate": 1.8550468096199314e-05, + "loss": 0.2777, + "step": 7785 + }, + { + "epoch": 0.1736381350060647, + "grad_norm": 0.8276028037071228, + "learning_rate": 1.854865196076675e-05, + "loss": 0.4952, + "step": 7790 + }, + { + "epoch": 0.17374958438668478, + "grad_norm": 0.48634928464889526, + "learning_rate": 1.854683477733544e-05, + "loss": 0.3508, + "step": 7795 + }, + { + "epoch": 0.17386103376730483, + "grad_norm": 0.4943958520889282, + "learning_rate": 1.8545016546128162e-05, + "loss": 0.4288, + "step": 7800 + }, + { + "epoch": 0.1739724831479249, + "grad_norm": 0.5155841708183289, + "learning_rate": 1.8543197267367807e-05, + "loss": 0.459, + "step": 7805 + }, + { + "epoch": 0.17408393252854498, + "grad_norm": 0.3615794777870178, + "learning_rate": 1.8541376941277414e-05, + "loss": 0.4031, + "step": 7810 + }, + { + "epoch": 0.17419538190916503, + "grad_norm": 0.6325530409812927, + "learning_rate": 1.8539555568080134e-05, + "loss": 0.4214, + "step": 7815 + }, + { + "epoch": 0.1743068312897851, + "grad_norm": 0.513111412525177, + "learning_rate": 1.8537733147999262e-05, + "loss": 0.3848, + "step": 7820 + }, + { + "epoch": 0.17441828067040518, + "grad_norm": 0.5642367005348206, + "learning_rate": 1.8535909681258202e-05, + "loss": 0.3574, + "step": 7825 + }, + { + "epoch": 0.17452973005102523, + "grad_norm": 0.7676248550415039, + "learning_rate": 1.8534085168080503e-05, + "loss": 0.3844, + "step": 7830 + }, + { + "epoch": 0.1746411794316453, + "grad_norm": 0.5915855765342712, + "learning_rate": 1.853225960868984e-05, + "loss": 0.3365, + "step": 7835 + }, + { + "epoch": 0.1747526288122654, + "grad_norm": 0.5565099120140076, + "learning_rate": 1.8530433003310003e-05, + "loss": 0.4917, + "step": 7840 + }, + { + "epoch": 0.17486407819288544, + "grad_norm": 0.6923288702964783, + "learning_rate": 1.8528605352164926e-05, + "loss": 0.3355, + "step": 7845 + }, + { + "epoch": 0.1749755275735055, + "grad_norm": 0.4434451460838318, + "learning_rate": 1.8526776655478663e-05, + "loss": 0.4119, + "step": 7850 + }, + { + "epoch": 0.1750869769541256, + "grad_norm": 0.587926983833313, + "learning_rate": 1.85249469134754e-05, + "loss": 0.3956, + "step": 7855 + }, + { + "epoch": 0.17519842633474564, + "grad_norm": 0.5707252025604248, + "learning_rate": 1.852311612637945e-05, + "loss": 0.45, + "step": 7860 + }, + { + "epoch": 0.17530987571536572, + "grad_norm": 0.6604421138763428, + "learning_rate": 1.8521284294415247e-05, + "loss": 0.3159, + "step": 7865 + }, + { + "epoch": 0.1754213250959858, + "grad_norm": 0.6285608410835266, + "learning_rate": 1.8519451417807364e-05, + "loss": 0.4533, + "step": 7870 + }, + { + "epoch": 0.17553277447660584, + "grad_norm": 0.45332780480384827, + "learning_rate": 1.8517617496780497e-05, + "loss": 0.403, + "step": 7875 + }, + { + "epoch": 0.17564422385722592, + "grad_norm": 0.5412122011184692, + "learning_rate": 1.8515782531559474e-05, + "loss": 0.3701, + "step": 7880 + }, + { + "epoch": 0.175755673237846, + "grad_norm": 0.7073056101799011, + "learning_rate": 1.8513946522369242e-05, + "loss": 0.3685, + "step": 7885 + }, + { + "epoch": 0.17586712261846604, + "grad_norm": 0.5671860575675964, + "learning_rate": 1.851210946943489e-05, + "loss": 0.4571, + "step": 7890 + }, + { + "epoch": 0.17597857199908612, + "grad_norm": 0.6075953245162964, + "learning_rate": 1.8510271372981612e-05, + "loss": 0.4485, + "step": 7895 + }, + { + "epoch": 0.17609002137970617, + "grad_norm": 0.6864980459213257, + "learning_rate": 1.8508432233234755e-05, + "loss": 0.3992, + "step": 7900 + }, + { + "epoch": 0.17620147076032625, + "grad_norm": 0.5237451195716858, + "learning_rate": 1.8506592050419783e-05, + "loss": 0.4678, + "step": 7905 + }, + { + "epoch": 0.17631292014094632, + "grad_norm": 0.6632329821586609, + "learning_rate": 1.8504750824762285e-05, + "loss": 0.32, + "step": 7910 + }, + { + "epoch": 0.17642436952156637, + "grad_norm": 0.6396775245666504, + "learning_rate": 1.8502908556487985e-05, + "loss": 0.3531, + "step": 7915 + }, + { + "epoch": 0.17653581890218645, + "grad_norm": 0.5833070874214172, + "learning_rate": 1.8501065245822726e-05, + "loss": 0.2939, + "step": 7920 + }, + { + "epoch": 0.17664726828280652, + "grad_norm": 0.6542580723762512, + "learning_rate": 1.8499220892992483e-05, + "loss": 0.4936, + "step": 7925 + }, + { + "epoch": 0.17675871766342657, + "grad_norm": 0.4529288411140442, + "learning_rate": 1.849737549822337e-05, + "loss": 0.3176, + "step": 7930 + }, + { + "epoch": 0.17687016704404665, + "grad_norm": 0.6152017712593079, + "learning_rate": 1.8495529061741602e-05, + "loss": 0.4047, + "step": 7935 + }, + { + "epoch": 0.17698161642466673, + "grad_norm": 0.4641212522983551, + "learning_rate": 1.8493681583773556e-05, + "loss": 0.2954, + "step": 7940 + }, + { + "epoch": 0.17709306580528678, + "grad_norm": 0.4853191375732422, + "learning_rate": 1.8491833064545705e-05, + "loss": 0.4545, + "step": 7945 + }, + { + "epoch": 0.17720451518590685, + "grad_norm": 0.544423520565033, + "learning_rate": 1.8489983504284664e-05, + "loss": 0.4811, + "step": 7950 + }, + { + "epoch": 0.17731596456652693, + "grad_norm": 0.32580846548080444, + "learning_rate": 1.848813290321718e-05, + "loss": 0.4182, + "step": 7955 + }, + { + "epoch": 0.17742741394714698, + "grad_norm": 0.5929962396621704, + "learning_rate": 1.848628126157012e-05, + "loss": 0.517, + "step": 7960 + }, + { + "epoch": 0.17753886332776705, + "grad_norm": 0.6212632656097412, + "learning_rate": 1.8484428579570482e-05, + "loss": 0.2642, + "step": 7965 + }, + { + "epoch": 0.17765031270838713, + "grad_norm": 0.373495489358902, + "learning_rate": 1.848257485744539e-05, + "loss": 0.4516, + "step": 7970 + }, + { + "epoch": 0.17776176208900718, + "grad_norm": 0.693138599395752, + "learning_rate": 1.8480720095422096e-05, + "loss": 0.3265, + "step": 7975 + }, + { + "epoch": 0.17787321146962726, + "grad_norm": 0.4824640154838562, + "learning_rate": 1.847886429372798e-05, + "loss": 0.2924, + "step": 7980 + }, + { + "epoch": 0.17798466085024733, + "grad_norm": 0.562910795211792, + "learning_rate": 1.8477007452590546e-05, + "loss": 0.4182, + "step": 7985 + }, + { + "epoch": 0.17809611023086738, + "grad_norm": 0.5726629495620728, + "learning_rate": 1.8475149572237434e-05, + "loss": 0.3874, + "step": 7990 + }, + { + "epoch": 0.17820755961148746, + "grad_norm": 0.5748285055160522, + "learning_rate": 1.8473290652896398e-05, + "loss": 0.4022, + "step": 7995 + }, + { + "epoch": 0.17831900899210754, + "grad_norm": 0.5068720579147339, + "learning_rate": 1.8471430694795336e-05, + "loss": 0.3074, + "step": 8000 + }, + { + "epoch": 0.17843045837272759, + "grad_norm": 0.5286265015602112, + "learning_rate": 1.846956969816226e-05, + "loss": 0.3784, + "step": 8005 + }, + { + "epoch": 0.17854190775334766, + "grad_norm": 0.6678787469863892, + "learning_rate": 1.8467707663225312e-05, + "loss": 0.3808, + "step": 8010 + }, + { + "epoch": 0.17865335713396774, + "grad_norm": 0.49153807759284973, + "learning_rate": 1.8465844590212767e-05, + "loss": 0.4211, + "step": 8015 + }, + { + "epoch": 0.1787648065145878, + "grad_norm": 0.4546610713005066, + "learning_rate": 1.8463980479353018e-05, + "loss": 0.3546, + "step": 8020 + }, + { + "epoch": 0.17887625589520786, + "grad_norm": 0.8939555883407593, + "learning_rate": 1.8462115330874598e-05, + "loss": 0.3224, + "step": 8025 + }, + { + "epoch": 0.17898770527582794, + "grad_norm": 0.5169472098350525, + "learning_rate": 1.8460249145006156e-05, + "loss": 0.3166, + "step": 8030 + }, + { + "epoch": 0.179099154656448, + "grad_norm": 0.5126698613166809, + "learning_rate": 1.8458381921976468e-05, + "loss": 0.3324, + "step": 8035 + }, + { + "epoch": 0.17921060403706807, + "grad_norm": 0.5317251086235046, + "learning_rate": 1.845651366201445e-05, + "loss": 0.3892, + "step": 8040 + }, + { + "epoch": 0.17932205341768814, + "grad_norm": 0.6299559473991394, + "learning_rate": 1.8454644365349127e-05, + "loss": 0.364, + "step": 8045 + }, + { + "epoch": 0.1794335027983082, + "grad_norm": 0.6288081407546997, + "learning_rate": 1.8452774032209667e-05, + "loss": 0.4554, + "step": 8050 + }, + { + "epoch": 0.17954495217892827, + "grad_norm": 0.7996246814727783, + "learning_rate": 1.8450902662825357e-05, + "loss": 0.2982, + "step": 8055 + }, + { + "epoch": 0.17965640155954835, + "grad_norm": 0.7523775696754456, + "learning_rate": 1.844903025742561e-05, + "loss": 0.3866, + "step": 8060 + }, + { + "epoch": 0.1797678509401684, + "grad_norm": 0.5479750633239746, + "learning_rate": 1.8447156816239967e-05, + "loss": 0.3827, + "step": 8065 + }, + { + "epoch": 0.17987930032078847, + "grad_norm": 0.6241227388381958, + "learning_rate": 1.8445282339498105e-05, + "loss": 0.3712, + "step": 8070 + }, + { + "epoch": 0.17999074970140855, + "grad_norm": 0.4241069555282593, + "learning_rate": 1.8443406827429816e-05, + "loss": 0.4409, + "step": 8075 + }, + { + "epoch": 0.1801021990820286, + "grad_norm": 0.527955174446106, + "learning_rate": 1.844153028026502e-05, + "loss": 0.4218, + "step": 8080 + }, + { + "epoch": 0.18021364846264867, + "grad_norm": 0.4392964541912079, + "learning_rate": 1.8439652698233773e-05, + "loss": 0.2986, + "step": 8085 + }, + { + "epoch": 0.18032509784326872, + "grad_norm": 0.6911497712135315, + "learning_rate": 1.8437774081566248e-05, + "loss": 0.3723, + "step": 8090 + }, + { + "epoch": 0.1804365472238888, + "grad_norm": 0.3347485363483429, + "learning_rate": 1.843589443049275e-05, + "loss": 0.3233, + "step": 8095 + }, + { + "epoch": 0.18054799660450888, + "grad_norm": 0.6009610891342163, + "learning_rate": 1.843401374524371e-05, + "loss": 0.4063, + "step": 8100 + }, + { + "epoch": 0.18065944598512892, + "grad_norm": 0.5331124067306519, + "learning_rate": 1.8432132026049685e-05, + "loss": 0.5214, + "step": 8105 + }, + { + "epoch": 0.180770895365749, + "grad_norm": 0.9455961585044861, + "learning_rate": 1.843024927314136e-05, + "loss": 0.3784, + "step": 8110 + }, + { + "epoch": 0.18088234474636908, + "grad_norm": 0.5959413051605225, + "learning_rate": 1.8428365486749545e-05, + "loss": 0.3704, + "step": 8115 + }, + { + "epoch": 0.18099379412698913, + "grad_norm": 0.7602490186691284, + "learning_rate": 1.8426480667105178e-05, + "loss": 0.4113, + "step": 8120 + }, + { + "epoch": 0.1811052435076092, + "grad_norm": 0.76889568567276, + "learning_rate": 1.8424594814439326e-05, + "loss": 0.5736, + "step": 8125 + }, + { + "epoch": 0.18121669288822928, + "grad_norm": 0.873080849647522, + "learning_rate": 1.8422707928983173e-05, + "loss": 0.3585, + "step": 8130 + }, + { + "epoch": 0.18132814226884933, + "grad_norm": 0.3947627544403076, + "learning_rate": 1.8420820010968046e-05, + "loss": 0.4465, + "step": 8135 + }, + { + "epoch": 0.1814395916494694, + "grad_norm": 0.5145520567893982, + "learning_rate": 1.8418931060625386e-05, + "loss": 0.3982, + "step": 8140 + }, + { + "epoch": 0.18155104103008948, + "grad_norm": 0.5094866156578064, + "learning_rate": 1.8417041078186757e-05, + "loss": 0.2775, + "step": 8145 + }, + { + "epoch": 0.18166249041070953, + "grad_norm": 0.5887768864631653, + "learning_rate": 1.841515006388386e-05, + "loss": 0.4656, + "step": 8150 + }, + { + "epoch": 0.1817739397913296, + "grad_norm": 0.6336820721626282, + "learning_rate": 1.8413258017948527e-05, + "loss": 0.2485, + "step": 8155 + }, + { + "epoch": 0.18188538917194969, + "grad_norm": 0.8006008267402649, + "learning_rate": 1.84113649406127e-05, + "loss": 0.3981, + "step": 8160 + }, + { + "epoch": 0.18199683855256973, + "grad_norm": 0.5995430946350098, + "learning_rate": 1.8409470832108452e-05, + "loss": 0.3706, + "step": 8165 + }, + { + "epoch": 0.1821082879331898, + "grad_norm": 0.5357914566993713, + "learning_rate": 1.8407575692667997e-05, + "loss": 0.4349, + "step": 8170 + }, + { + "epoch": 0.1822197373138099, + "grad_norm": 0.4280491769313812, + "learning_rate": 1.8405679522523656e-05, + "loss": 0.5198, + "step": 8175 + }, + { + "epoch": 0.18233118669442994, + "grad_norm": 0.5123806595802307, + "learning_rate": 1.8403782321907888e-05, + "loss": 0.2347, + "step": 8180 + }, + { + "epoch": 0.18244263607505, + "grad_norm": 0.6792116761207581, + "learning_rate": 1.840188409105327e-05, + "loss": 0.3305, + "step": 8185 + }, + { + "epoch": 0.1825540854556701, + "grad_norm": 0.5935506224632263, + "learning_rate": 1.8399984830192522e-05, + "loss": 0.4718, + "step": 8190 + }, + { + "epoch": 0.18266553483629014, + "grad_norm": 0.4705546200275421, + "learning_rate": 1.839808453955847e-05, + "loss": 0.4294, + "step": 8195 + }, + { + "epoch": 0.18277698421691022, + "grad_norm": 0.44560903310775757, + "learning_rate": 1.8396183219384073e-05, + "loss": 0.3661, + "step": 8200 + }, + { + "epoch": 0.1828884335975303, + "grad_norm": 0.596440315246582, + "learning_rate": 1.8394280869902423e-05, + "loss": 0.4454, + "step": 8205 + }, + { + "epoch": 0.18299988297815034, + "grad_norm": 0.6796529293060303, + "learning_rate": 1.8392377491346734e-05, + "loss": 0.3922, + "step": 8210 + }, + { + "epoch": 0.18311133235877042, + "grad_norm": 0.5746434926986694, + "learning_rate": 1.8390473083950346e-05, + "loss": 0.3445, + "step": 8215 + }, + { + "epoch": 0.1832227817393905, + "grad_norm": 0.689755916595459, + "learning_rate": 1.8388567647946718e-05, + "loss": 0.4528, + "step": 8220 + }, + { + "epoch": 0.18333423112001054, + "grad_norm": 0.5509458780288696, + "learning_rate": 1.8386661183569446e-05, + "loss": 0.4711, + "step": 8225 + }, + { + "epoch": 0.18344568050063062, + "grad_norm": 0.5880481600761414, + "learning_rate": 1.8384753691052252e-05, + "loss": 0.3628, + "step": 8230 + }, + { + "epoch": 0.1835571298812507, + "grad_norm": 0.4933379888534546, + "learning_rate": 1.8382845170628973e-05, + "loss": 0.3164, + "step": 8235 + }, + { + "epoch": 0.18366857926187075, + "grad_norm": 0.5734802484512329, + "learning_rate": 1.838093562253358e-05, + "loss": 0.4686, + "step": 8240 + }, + { + "epoch": 0.18378002864249082, + "grad_norm": 0.6775012612342834, + "learning_rate": 1.8379025047000177e-05, + "loss": 0.3105, + "step": 8245 + }, + { + "epoch": 0.1838914780231109, + "grad_norm": 0.7611976861953735, + "learning_rate": 1.8377113444262973e-05, + "loss": 0.3797, + "step": 8250 + }, + { + "epoch": 0.18400292740373095, + "grad_norm": 0.5081771016120911, + "learning_rate": 1.8375200814556325e-05, + "loss": 0.2762, + "step": 8255 + }, + { + "epoch": 0.18411437678435102, + "grad_norm": 0.41866931319236755, + "learning_rate": 1.8373287158114702e-05, + "loss": 0.2894, + "step": 8260 + }, + { + "epoch": 0.1842258261649711, + "grad_norm": 0.42981696128845215, + "learning_rate": 1.8371372475172705e-05, + "loss": 0.387, + "step": 8265 + }, + { + "epoch": 0.18433727554559115, + "grad_norm": 0.4748207628726959, + "learning_rate": 1.8369456765965064e-05, + "loss": 0.2789, + "step": 8270 + }, + { + "epoch": 0.18444872492621123, + "grad_norm": 0.7153117060661316, + "learning_rate": 1.8367540030726624e-05, + "loss": 0.4423, + "step": 8275 + }, + { + "epoch": 0.18456017430683128, + "grad_norm": 0.5909894108772278, + "learning_rate": 1.8365622269692362e-05, + "loss": 0.3757, + "step": 8280 + }, + { + "epoch": 0.18467162368745135, + "grad_norm": 0.4618348479270935, + "learning_rate": 1.836370348309738e-05, + "loss": 0.4254, + "step": 8285 + }, + { + "epoch": 0.18478307306807143, + "grad_norm": 0.4978313148021698, + "learning_rate": 1.8361783671176915e-05, + "loss": 0.3347, + "step": 8290 + }, + { + "epoch": 0.18489452244869148, + "grad_norm": 0.5355519652366638, + "learning_rate": 1.835986283416631e-05, + "loss": 0.2233, + "step": 8295 + }, + { + "epoch": 0.18500597182931156, + "grad_norm": 0.5492738485336304, + "learning_rate": 1.8357940972301055e-05, + "loss": 0.3731, + "step": 8300 + }, + { + "epoch": 0.18511742120993163, + "grad_norm": 0.5359588265419006, + "learning_rate": 1.8356018085816744e-05, + "loss": 0.318, + "step": 8305 + }, + { + "epoch": 0.18522887059055168, + "grad_norm": 0.6668272614479065, + "learning_rate": 1.8354094174949117e-05, + "loss": 0.3873, + "step": 8310 + }, + { + "epoch": 0.18534031997117176, + "grad_norm": 0.592851459980011, + "learning_rate": 1.8352169239934028e-05, + "loss": 0.3334, + "step": 8315 + }, + { + "epoch": 0.18545176935179183, + "grad_norm": 0.627230167388916, + "learning_rate": 1.835024328100746e-05, + "loss": 0.4218, + "step": 8320 + }, + { + "epoch": 0.18556321873241188, + "grad_norm": 0.40039190649986267, + "learning_rate": 1.834831629840552e-05, + "loss": 0.3812, + "step": 8325 + }, + { + "epoch": 0.18567466811303196, + "grad_norm": 0.34311795234680176, + "learning_rate": 1.8346388292364438e-05, + "loss": 0.3433, + "step": 8330 + }, + { + "epoch": 0.18578611749365204, + "grad_norm": 0.6989755630493164, + "learning_rate": 1.8344459263120575e-05, + "loss": 0.3556, + "step": 8335 + }, + { + "epoch": 0.18589756687427209, + "grad_norm": 0.6018476486206055, + "learning_rate": 1.834252921091042e-05, + "loss": 0.4649, + "step": 8340 + }, + { + "epoch": 0.18600901625489216, + "grad_norm": 0.5775113105773926, + "learning_rate": 1.8340598135970577e-05, + "loss": 0.3353, + "step": 8345 + }, + { + "epoch": 0.18612046563551224, + "grad_norm": 0.48347654938697815, + "learning_rate": 1.833866603853778e-05, + "loss": 0.4051, + "step": 8350 + }, + { + "epoch": 0.1862319150161323, + "grad_norm": 1.136330008506775, + "learning_rate": 1.8336732918848894e-05, + "loss": 0.391, + "step": 8355 + }, + { + "epoch": 0.18634336439675236, + "grad_norm": 0.4948228597640991, + "learning_rate": 1.8334798777140902e-05, + "loss": 0.2786, + "step": 8360 + }, + { + "epoch": 0.18645481377737244, + "grad_norm": 0.5982722640037537, + "learning_rate": 1.8332863613650912e-05, + "loss": 0.416, + "step": 8365 + }, + { + "epoch": 0.1865662631579925, + "grad_norm": 0.5050781965255737, + "learning_rate": 1.8330927428616162e-05, + "loss": 0.2546, + "step": 8370 + }, + { + "epoch": 0.18667771253861257, + "grad_norm": 0.4478650689125061, + "learning_rate": 1.8328990222274018e-05, + "loss": 0.2382, + "step": 8375 + }, + { + "epoch": 0.18678916191923264, + "grad_norm": 0.6107395887374878, + "learning_rate": 1.832705199486196e-05, + "loss": 0.4187, + "step": 8380 + }, + { + "epoch": 0.1869006112998527, + "grad_norm": 0.5083065032958984, + "learning_rate": 1.8325112746617603e-05, + "loss": 0.3965, + "step": 8385 + }, + { + "epoch": 0.18701206068047277, + "grad_norm": 0.5986996293067932, + "learning_rate": 1.8323172477778683e-05, + "loss": 0.3662, + "step": 8390 + }, + { + "epoch": 0.18712351006109285, + "grad_norm": 0.5546152591705322, + "learning_rate": 1.832123118858306e-05, + "loss": 0.4417, + "step": 8395 + }, + { + "epoch": 0.1872349594417129, + "grad_norm": 0.5741242170333862, + "learning_rate": 1.8319288879268727e-05, + "loss": 0.4439, + "step": 8400 + }, + { + "epoch": 0.18734640882233297, + "grad_norm": 0.5699114799499512, + "learning_rate": 1.8317345550073792e-05, + "loss": 0.4508, + "step": 8405 + }, + { + "epoch": 0.18745785820295305, + "grad_norm": 0.6300404071807861, + "learning_rate": 1.8315401201236492e-05, + "loss": 0.347, + "step": 8410 + }, + { + "epoch": 0.1875693075835731, + "grad_norm": 0.5789498090744019, + "learning_rate": 1.831345583299519e-05, + "loss": 0.408, + "step": 8415 + }, + { + "epoch": 0.18768075696419317, + "grad_norm": 0.6178359985351562, + "learning_rate": 1.831150944558837e-05, + "loss": 0.4726, + "step": 8420 + }, + { + "epoch": 0.18779220634481325, + "grad_norm": 0.4740115702152252, + "learning_rate": 1.8309562039254652e-05, + "loss": 0.3297, + "step": 8425 + }, + { + "epoch": 0.1879036557254333, + "grad_norm": 0.39913448691368103, + "learning_rate": 1.8307613614232765e-05, + "loss": 0.3211, + "step": 8430 + }, + { + "epoch": 0.18801510510605338, + "grad_norm": 0.4314505159854889, + "learning_rate": 1.8305664170761576e-05, + "loss": 0.3395, + "step": 8435 + }, + { + "epoch": 0.18812655448667345, + "grad_norm": 0.5156731605529785, + "learning_rate": 1.8303713709080067e-05, + "loss": 0.344, + "step": 8440 + }, + { + "epoch": 0.1882380038672935, + "grad_norm": 0.3862886130809784, + "learning_rate": 1.8301762229427352e-05, + "loss": 0.4049, + "step": 8445 + }, + { + "epoch": 0.18834945324791358, + "grad_norm": 0.5977081060409546, + "learning_rate": 1.829980973204267e-05, + "loss": 0.3067, + "step": 8450 + }, + { + "epoch": 0.18846090262853366, + "grad_norm": 0.5924422740936279, + "learning_rate": 1.8297856217165376e-05, + "loss": 0.4148, + "step": 8455 + }, + { + "epoch": 0.1885723520091537, + "grad_norm": 0.6521643400192261, + "learning_rate": 1.8295901685034964e-05, + "loss": 0.4086, + "step": 8460 + }, + { + "epoch": 0.18868380138977378, + "grad_norm": 0.6108610033988953, + "learning_rate": 1.8293946135891038e-05, + "loss": 0.4278, + "step": 8465 + }, + { + "epoch": 0.18879525077039383, + "grad_norm": 0.4995235502719879, + "learning_rate": 1.8291989569973332e-05, + "loss": 0.3273, + "step": 8470 + }, + { + "epoch": 0.1889067001510139, + "grad_norm": 0.6318859457969666, + "learning_rate": 1.8290031987521714e-05, + "loss": 0.4842, + "step": 8475 + }, + { + "epoch": 0.18901814953163398, + "grad_norm": 0.5363388657569885, + "learning_rate": 1.828807338877616e-05, + "loss": 0.4514, + "step": 8480 + }, + { + "epoch": 0.18912959891225403, + "grad_norm": 0.5494492650032043, + "learning_rate": 1.8286113773976782e-05, + "loss": 0.3342, + "step": 8485 + }, + { + "epoch": 0.1892410482928741, + "grad_norm": 0.5755802989006042, + "learning_rate": 1.8284153143363818e-05, + "loss": 0.3305, + "step": 8490 + }, + { + "epoch": 0.18935249767349419, + "grad_norm": 0.632031261920929, + "learning_rate": 1.828219149717762e-05, + "loss": 0.3841, + "step": 8495 + }, + { + "epoch": 0.18946394705411423, + "grad_norm": 0.6508500576019287, + "learning_rate": 1.8280228835658675e-05, + "loss": 0.3259, + "step": 8500 + }, + { + "epoch": 0.1895753964347343, + "grad_norm": 0.5723993182182312, + "learning_rate": 1.8278265159047585e-05, + "loss": 0.3421, + "step": 8505 + }, + { + "epoch": 0.1896868458153544, + "grad_norm": 0.6716227531433105, + "learning_rate": 1.8276300467585087e-05, + "loss": 0.3498, + "step": 8510 + }, + { + "epoch": 0.18979829519597444, + "grad_norm": 0.5189501047134399, + "learning_rate": 1.8274334761512037e-05, + "loss": 0.3458, + "step": 8515 + }, + { + "epoch": 0.1899097445765945, + "grad_norm": 0.6275649070739746, + "learning_rate": 1.827236804106941e-05, + "loss": 0.3402, + "step": 8520 + }, + { + "epoch": 0.1900211939572146, + "grad_norm": 0.6184311509132385, + "learning_rate": 1.8270400306498313e-05, + "loss": 0.3706, + "step": 8525 + }, + { + "epoch": 0.19013264333783464, + "grad_norm": 0.5152689814567566, + "learning_rate": 1.826843155803998e-05, + "loss": 0.3156, + "step": 8530 + }, + { + "epoch": 0.19024409271845472, + "grad_norm": 0.3839017450809479, + "learning_rate": 1.8266461795935758e-05, + "loss": 0.2933, + "step": 8535 + }, + { + "epoch": 0.1903555420990748, + "grad_norm": 0.48906683921813965, + "learning_rate": 1.8264491020427128e-05, + "loss": 0.4327, + "step": 8540 + }, + { + "epoch": 0.19046699147969484, + "grad_norm": 0.6690248847007751, + "learning_rate": 1.8262519231755694e-05, + "loss": 0.4718, + "step": 8545 + }, + { + "epoch": 0.19057844086031492, + "grad_norm": 0.49581602215766907, + "learning_rate": 1.8260546430163173e-05, + "loss": 0.3557, + "step": 8550 + }, + { + "epoch": 0.190689890240935, + "grad_norm": 0.8193932771682739, + "learning_rate": 1.8258572615891427e-05, + "loss": 0.3851, + "step": 8555 + }, + { + "epoch": 0.19080133962155504, + "grad_norm": 0.7166968584060669, + "learning_rate": 1.825659778918242e-05, + "loss": 0.5081, + "step": 8560 + }, + { + "epoch": 0.19091278900217512, + "grad_norm": 0.7662790417671204, + "learning_rate": 1.8254621950278258e-05, + "loss": 0.4056, + "step": 8565 + }, + { + "epoch": 0.1910242383827952, + "grad_norm": 0.6887685656547546, + "learning_rate": 1.825264509942116e-05, + "loss": 0.3094, + "step": 8570 + }, + { + "epoch": 0.19113568776341525, + "grad_norm": 0.6675335168838501, + "learning_rate": 1.8250667236853473e-05, + "loss": 0.4079, + "step": 8575 + }, + { + "epoch": 0.19124713714403532, + "grad_norm": 0.6034323573112488, + "learning_rate": 1.824868836281767e-05, + "loss": 0.487, + "step": 8580 + }, + { + "epoch": 0.1913585865246554, + "grad_norm": 0.4529895782470703, + "learning_rate": 1.824670847755634e-05, + "loss": 0.3206, + "step": 8585 + }, + { + "epoch": 0.19147003590527545, + "grad_norm": 0.43103736639022827, + "learning_rate": 1.824472758131221e-05, + "loss": 0.3437, + "step": 8590 + }, + { + "epoch": 0.19158148528589553, + "grad_norm": 0.7910636067390442, + "learning_rate": 1.8242745674328114e-05, + "loss": 0.3933, + "step": 8595 + }, + { + "epoch": 0.1916929346665156, + "grad_norm": 0.552842915058136, + "learning_rate": 1.8240762756847024e-05, + "loss": 0.3755, + "step": 8600 + }, + { + "epoch": 0.19180438404713565, + "grad_norm": 0.5862413644790649, + "learning_rate": 1.823877882911203e-05, + "loss": 0.3324, + "step": 8605 + }, + { + "epoch": 0.19191583342775573, + "grad_norm": 0.42190515995025635, + "learning_rate": 1.8236793891366346e-05, + "loss": 0.3371, + "step": 8610 + }, + { + "epoch": 0.1920272828083758, + "grad_norm": 0.8216304779052734, + "learning_rate": 1.823480794385331e-05, + "loss": 0.396, + "step": 8615 + }, + { + "epoch": 0.19213873218899585, + "grad_norm": 0.4263015389442444, + "learning_rate": 1.8232820986816376e-05, + "loss": 0.3182, + "step": 8620 + }, + { + "epoch": 0.19225018156961593, + "grad_norm": 0.608733594417572, + "learning_rate": 1.8230833020499145e-05, + "loss": 0.4568, + "step": 8625 + }, + { + "epoch": 0.192361630950236, + "grad_norm": 0.5924399495124817, + "learning_rate": 1.8228844045145312e-05, + "loss": 0.3843, + "step": 8630 + }, + { + "epoch": 0.19247308033085606, + "grad_norm": 0.609131395816803, + "learning_rate": 1.822685406099872e-05, + "loss": 0.3626, + "step": 8635 + }, + { + "epoch": 0.19258452971147613, + "grad_norm": 0.40351277589797974, + "learning_rate": 1.8224863068303322e-05, + "loss": 0.2846, + "step": 8640 + }, + { + "epoch": 0.1926959790920962, + "grad_norm": 0.6606740951538086, + "learning_rate": 1.822287106730319e-05, + "loss": 0.3876, + "step": 8645 + }, + { + "epoch": 0.19280742847271626, + "grad_norm": 0.3179605007171631, + "learning_rate": 1.8220878058242545e-05, + "loss": 0.4238, + "step": 8650 + }, + { + "epoch": 0.19291887785333633, + "grad_norm": 0.6481133699417114, + "learning_rate": 1.82188840413657e-05, + "loss": 0.3526, + "step": 8655 + }, + { + "epoch": 0.19303032723395638, + "grad_norm": 0.5002349019050598, + "learning_rate": 1.8216889016917116e-05, + "loss": 0.4173, + "step": 8660 + }, + { + "epoch": 0.19314177661457646, + "grad_norm": 0.5035709738731384, + "learning_rate": 1.8214892985141363e-05, + "loss": 0.3237, + "step": 8665 + }, + { + "epoch": 0.19325322599519654, + "grad_norm": 0.7374876737594604, + "learning_rate": 1.8212895946283134e-05, + "loss": 0.3935, + "step": 8670 + }, + { + "epoch": 0.19336467537581659, + "grad_norm": 0.5722633004188538, + "learning_rate": 1.8210897900587263e-05, + "loss": 0.4176, + "step": 8675 + }, + { + "epoch": 0.19347612475643666, + "grad_norm": 0.56236332654953, + "learning_rate": 1.8208898848298684e-05, + "loss": 0.2532, + "step": 8680 + }, + { + "epoch": 0.19358757413705674, + "grad_norm": 0.505115270614624, + "learning_rate": 1.820689878966247e-05, + "loss": 0.362, + "step": 8685 + }, + { + "epoch": 0.1936990235176768, + "grad_norm": 0.7024111747741699, + "learning_rate": 1.820489772492381e-05, + "loss": 0.4478, + "step": 8690 + }, + { + "epoch": 0.19381047289829686, + "grad_norm": 0.4140602648258209, + "learning_rate": 1.8202895654328023e-05, + "loss": 0.3881, + "step": 8695 + }, + { + "epoch": 0.19392192227891694, + "grad_norm": 0.5594643950462341, + "learning_rate": 1.8200892578120544e-05, + "loss": 0.4082, + "step": 8700 + }, + { + "epoch": 0.194033371659537, + "grad_norm": 0.5588990449905396, + "learning_rate": 1.8198888496546936e-05, + "loss": 0.5088, + "step": 8705 + }, + { + "epoch": 0.19414482104015707, + "grad_norm": 0.3396087884902954, + "learning_rate": 1.8196883409852886e-05, + "loss": 0.3792, + "step": 8710 + }, + { + "epoch": 0.19425627042077714, + "grad_norm": 0.40732541680336, + "learning_rate": 1.81948773182842e-05, + "loss": 0.3382, + "step": 8715 + }, + { + "epoch": 0.1943677198013972, + "grad_norm": 0.4649874269962311, + "learning_rate": 1.8192870222086805e-05, + "loss": 0.3198, + "step": 8720 + }, + { + "epoch": 0.19447916918201727, + "grad_norm": 0.5772071480751038, + "learning_rate": 1.8190862121506766e-05, + "loss": 0.4351, + "step": 8725 + }, + { + "epoch": 0.19459061856263735, + "grad_norm": 0.7937235832214355, + "learning_rate": 1.8188853016790252e-05, + "loss": 0.4203, + "step": 8730 + }, + { + "epoch": 0.1947020679432574, + "grad_norm": 0.5506812930107117, + "learning_rate": 1.8186842908183568e-05, + "loss": 0.2649, + "step": 8735 + }, + { + "epoch": 0.19481351732387747, + "grad_norm": 0.43412524461746216, + "learning_rate": 1.8184831795933134e-05, + "loss": 0.3928, + "step": 8740 + }, + { + "epoch": 0.19492496670449755, + "grad_norm": 0.5557735562324524, + "learning_rate": 1.81828196802855e-05, + "loss": 0.3544, + "step": 8745 + }, + { + "epoch": 0.1950364160851176, + "grad_norm": 0.6762316823005676, + "learning_rate": 1.8180806561487333e-05, + "loss": 0.3823, + "step": 8750 + }, + { + "epoch": 0.19514786546573767, + "grad_norm": 0.5124315619468689, + "learning_rate": 1.817879243978543e-05, + "loss": 0.3836, + "step": 8755 + }, + { + "epoch": 0.19525931484635775, + "grad_norm": 0.6040787100791931, + "learning_rate": 1.8176777315426703e-05, + "loss": 0.3722, + "step": 8760 + }, + { + "epoch": 0.1953707642269778, + "grad_norm": 0.8811430335044861, + "learning_rate": 1.8174761188658196e-05, + "loss": 0.3777, + "step": 8765 + }, + { + "epoch": 0.19548221360759788, + "grad_norm": 0.40771323442459106, + "learning_rate": 1.817274405972706e-05, + "loss": 0.3874, + "step": 8770 + }, + { + "epoch": 0.19559366298821795, + "grad_norm": 0.46913596987724304, + "learning_rate": 1.8170725928880593e-05, + "loss": 0.3714, + "step": 8775 + }, + { + "epoch": 0.195705112368838, + "grad_norm": 0.7020705342292786, + "learning_rate": 1.8168706796366192e-05, + "loss": 0.3432, + "step": 8780 + }, + { + "epoch": 0.19581656174945808, + "grad_norm": 0.5612475872039795, + "learning_rate": 1.8166686662431388e-05, + "loss": 0.2656, + "step": 8785 + }, + { + "epoch": 0.19592801113007816, + "grad_norm": 0.4503854513168335, + "learning_rate": 1.816466552732384e-05, + "loss": 0.449, + "step": 8790 + }, + { + "epoch": 0.1960394605106982, + "grad_norm": 0.6024184226989746, + "learning_rate": 1.816264339129132e-05, + "loss": 0.3781, + "step": 8795 + }, + { + "epoch": 0.19615090989131828, + "grad_norm": 0.4631085991859436, + "learning_rate": 1.8160620254581727e-05, + "loss": 0.226, + "step": 8800 + }, + { + "epoch": 0.19626235927193836, + "grad_norm": 0.6948122978210449, + "learning_rate": 1.8158596117443078e-05, + "loss": 0.3695, + "step": 8805 + }, + { + "epoch": 0.1963738086525584, + "grad_norm": 0.8757532238960266, + "learning_rate": 1.815657098012352e-05, + "loss": 0.6121, + "step": 8810 + }, + { + "epoch": 0.19648525803317848, + "grad_norm": 0.5438439249992371, + "learning_rate": 1.8154544842871323e-05, + "loss": 0.3557, + "step": 8815 + }, + { + "epoch": 0.19659670741379856, + "grad_norm": 0.5261885523796082, + "learning_rate": 1.815251770593487e-05, + "loss": 0.4272, + "step": 8820 + }, + { + "epoch": 0.1967081567944186, + "grad_norm": 0.6569040417671204, + "learning_rate": 1.815048956956267e-05, + "loss": 0.4891, + "step": 8825 + }, + { + "epoch": 0.19681960617503869, + "grad_norm": 0.5094635486602783, + "learning_rate": 1.8148460434003363e-05, + "loss": 0.365, + "step": 8830 + }, + { + "epoch": 0.19693105555565876, + "grad_norm": 0.5861701965332031, + "learning_rate": 1.8146430299505705e-05, + "loss": 0.3636, + "step": 8835 + }, + { + "epoch": 0.1970425049362788, + "grad_norm": 0.5916622281074524, + "learning_rate": 1.814439916631857e-05, + "loss": 0.2662, + "step": 8840 + }, + { + "epoch": 0.1971539543168989, + "grad_norm": 0.6713497638702393, + "learning_rate": 1.8142367034690967e-05, + "loss": 0.4473, + "step": 8845 + }, + { + "epoch": 0.19726540369751894, + "grad_norm": 0.38671842217445374, + "learning_rate": 1.8140333904872013e-05, + "loss": 0.3107, + "step": 8850 + }, + { + "epoch": 0.19737685307813901, + "grad_norm": 0.35884755849838257, + "learning_rate": 1.8138299777110953e-05, + "loss": 0.3558, + "step": 8855 + }, + { + "epoch": 0.1974883024587591, + "grad_norm": 0.6742395758628845, + "learning_rate": 1.813626465165716e-05, + "loss": 0.4218, + "step": 8860 + }, + { + "epoch": 0.19759975183937914, + "grad_norm": 0.6200085878372192, + "learning_rate": 1.8134228528760124e-05, + "loss": 0.2595, + "step": 8865 + }, + { + "epoch": 0.19771120121999922, + "grad_norm": 0.30860522389411926, + "learning_rate": 1.8132191408669458e-05, + "loss": 0.3566, + "step": 8870 + }, + { + "epoch": 0.1978226506006193, + "grad_norm": 0.5478228330612183, + "learning_rate": 1.8130153291634893e-05, + "loss": 0.2773, + "step": 8875 + }, + { + "epoch": 0.19793409998123934, + "grad_norm": 0.6432132720947266, + "learning_rate": 1.8128114177906292e-05, + "loss": 0.3881, + "step": 8880 + }, + { + "epoch": 0.19804554936185942, + "grad_norm": 0.5374806523323059, + "learning_rate": 1.812607406773363e-05, + "loss": 0.2712, + "step": 8885 + }, + { + "epoch": 0.1981569987424795, + "grad_norm": 0.5458959937095642, + "learning_rate": 1.8124032961367012e-05, + "loss": 0.4764, + "step": 8890 + }, + { + "epoch": 0.19826844812309954, + "grad_norm": 0.671455979347229, + "learning_rate": 1.8121990859056664e-05, + "loss": 0.3211, + "step": 8895 + }, + { + "epoch": 0.19837989750371962, + "grad_norm": 0.6042885780334473, + "learning_rate": 1.8119947761052924e-05, + "loss": 0.3583, + "step": 8900 + }, + { + "epoch": 0.1984913468843397, + "grad_norm": 0.7543115019798279, + "learning_rate": 1.811790366760627e-05, + "loss": 0.41, + "step": 8905 + }, + { + "epoch": 0.19860279626495975, + "grad_norm": 0.40042057633399963, + "learning_rate": 1.8115858578967283e-05, + "loss": 0.4059, + "step": 8910 + }, + { + "epoch": 0.19871424564557982, + "grad_norm": 0.5100334286689758, + "learning_rate": 1.811381249538668e-05, + "loss": 0.3285, + "step": 8915 + }, + { + "epoch": 0.1988256950261999, + "grad_norm": 0.7095329165458679, + "learning_rate": 1.8111765417115292e-05, + "loss": 0.3764, + "step": 8920 + }, + { + "epoch": 0.19893714440681995, + "grad_norm": 0.4244353771209717, + "learning_rate": 1.810971734440408e-05, + "loss": 0.2824, + "step": 8925 + }, + { + "epoch": 0.19904859378744003, + "grad_norm": 0.6726874709129333, + "learning_rate": 1.810766827750412e-05, + "loss": 0.3899, + "step": 8930 + }, + { + "epoch": 0.1991600431680601, + "grad_norm": 0.635212779045105, + "learning_rate": 1.810561821666661e-05, + "loss": 0.3438, + "step": 8935 + }, + { + "epoch": 0.19927149254868015, + "grad_norm": 0.5831514596939087, + "learning_rate": 1.810356716214287e-05, + "loss": 0.4083, + "step": 8940 + }, + { + "epoch": 0.19938294192930023, + "grad_norm": 0.4919126331806183, + "learning_rate": 1.8101515114184348e-05, + "loss": 0.3371, + "step": 8945 + }, + { + "epoch": 0.1994943913099203, + "grad_norm": 0.5806923508644104, + "learning_rate": 1.8099462073042607e-05, + "loss": 0.3543, + "step": 8950 + }, + { + "epoch": 0.19960584069054035, + "grad_norm": 0.5970994234085083, + "learning_rate": 1.8097408038969332e-05, + "loss": 0.3174, + "step": 8955 + }, + { + "epoch": 0.19971729007116043, + "grad_norm": 0.8126015067100525, + "learning_rate": 1.8095353012216334e-05, + "loss": 0.3524, + "step": 8960 + }, + { + "epoch": 0.1998287394517805, + "grad_norm": 0.7372391819953918, + "learning_rate": 1.8093296993035546e-05, + "loss": 0.427, + "step": 8965 + }, + { + "epoch": 0.19994018883240056, + "grad_norm": 0.523418664932251, + "learning_rate": 1.8091239981679016e-05, + "loss": 0.4781, + "step": 8970 + }, + { + "epoch": 0.20005163821302063, + "grad_norm": 0.553030252456665, + "learning_rate": 1.808918197839892e-05, + "loss": 0.3167, + "step": 8975 + }, + { + "epoch": 0.2001630875936407, + "grad_norm": 0.7697877287864685, + "learning_rate": 1.8087122983447548e-05, + "loss": 0.4279, + "step": 8980 + }, + { + "epoch": 0.20027453697426076, + "grad_norm": 0.5038485527038574, + "learning_rate": 1.8085062997077326e-05, + "loss": 0.3461, + "step": 8985 + }, + { + "epoch": 0.20038598635488084, + "grad_norm": 0.44742345809936523, + "learning_rate": 1.8083002019540784e-05, + "loss": 0.3014, + "step": 8990 + }, + { + "epoch": 0.2004974357355009, + "grad_norm": 0.5444981455802917, + "learning_rate": 1.808094005109059e-05, + "loss": 0.4331, + "step": 8995 + }, + { + "epoch": 0.20060888511612096, + "grad_norm": 0.6124820709228516, + "learning_rate": 1.807887709197952e-05, + "loss": 0.3692, + "step": 9000 + }, + { + "epoch": 0.20072033449674104, + "grad_norm": 0.533619225025177, + "learning_rate": 1.8076813142460478e-05, + "loss": 0.2453, + "step": 9005 + }, + { + "epoch": 0.20083178387736111, + "grad_norm": 0.5671890377998352, + "learning_rate": 1.8074748202786484e-05, + "loss": 0.2173, + "step": 9010 + }, + { + "epoch": 0.20094323325798116, + "grad_norm": 0.6472200155258179, + "learning_rate": 1.8072682273210692e-05, + "loss": 0.4271, + "step": 9015 + }, + { + "epoch": 0.20105468263860124, + "grad_norm": 0.8244146704673767, + "learning_rate": 1.8070615353986362e-05, + "loss": 0.3664, + "step": 9020 + }, + { + "epoch": 0.20116613201922132, + "grad_norm": 0.4256848990917206, + "learning_rate": 1.8068547445366885e-05, + "loss": 0.2756, + "step": 9025 + }, + { + "epoch": 0.20127758139984137, + "grad_norm": 0.8401066064834595, + "learning_rate": 1.8066478547605774e-05, + "loss": 0.3697, + "step": 9030 + }, + { + "epoch": 0.20138903078046144, + "grad_norm": 0.49587559700012207, + "learning_rate": 1.8064408660956652e-05, + "loss": 0.3255, + "step": 9035 + }, + { + "epoch": 0.2015004801610815, + "grad_norm": 0.4269142150878906, + "learning_rate": 1.8062337785673284e-05, + "loss": 0.3515, + "step": 9040 + }, + { + "epoch": 0.20161192954170157, + "grad_norm": 0.38900962471961975, + "learning_rate": 1.806026592200953e-05, + "loss": 0.3871, + "step": 9045 + }, + { + "epoch": 0.20172337892232164, + "grad_norm": 0.6002835631370544, + "learning_rate": 1.8058193070219387e-05, + "loss": 0.3051, + "step": 9050 + }, + { + "epoch": 0.2018348283029417, + "grad_norm": 0.5478485226631165, + "learning_rate": 1.8056119230556978e-05, + "loss": 0.3902, + "step": 9055 + }, + { + "epoch": 0.20194627768356177, + "grad_norm": 0.5159756541252136, + "learning_rate": 1.8054044403276534e-05, + "loss": 0.4322, + "step": 9060 + }, + { + "epoch": 0.20205772706418185, + "grad_norm": 0.731988251209259, + "learning_rate": 1.8051968588632413e-05, + "loss": 0.4703, + "step": 9065 + }, + { + "epoch": 0.2021691764448019, + "grad_norm": 0.6769568920135498, + "learning_rate": 1.8049891786879093e-05, + "loss": 0.2977, + "step": 9070 + }, + { + "epoch": 0.20228062582542197, + "grad_norm": 0.5943720936775208, + "learning_rate": 1.804781399827118e-05, + "loss": 0.4456, + "step": 9075 + }, + { + "epoch": 0.20239207520604205, + "grad_norm": 0.4914182424545288, + "learning_rate": 1.8045735223063384e-05, + "loss": 0.3044, + "step": 9080 + }, + { + "epoch": 0.2025035245866621, + "grad_norm": 0.5965585112571716, + "learning_rate": 1.8043655461510558e-05, + "loss": 0.2634, + "step": 9085 + }, + { + "epoch": 0.20261497396728217, + "grad_norm": 0.39462730288505554, + "learning_rate": 1.8041574713867658e-05, + "loss": 0.3493, + "step": 9090 + }, + { + "epoch": 0.20272642334790225, + "grad_norm": 0.6479722857475281, + "learning_rate": 1.803949298038977e-05, + "loss": 0.2632, + "step": 9095 + }, + { + "epoch": 0.2028378727285223, + "grad_norm": 0.5808325409889221, + "learning_rate": 1.80374102613321e-05, + "loss": 0.3428, + "step": 9100 + }, + { + "epoch": 0.20294932210914238, + "grad_norm": 2.2530341148376465, + "learning_rate": 1.8035326556949968e-05, + "loss": 0.3293, + "step": 9105 + }, + { + "epoch": 0.20306077148976245, + "grad_norm": 0.5054280161857605, + "learning_rate": 1.8033241867498826e-05, + "loss": 0.3918, + "step": 9110 + }, + { + "epoch": 0.2031722208703825, + "grad_norm": 0.5822464227676392, + "learning_rate": 1.8031156193234237e-05, + "loss": 0.3969, + "step": 9115 + }, + { + "epoch": 0.20328367025100258, + "grad_norm": 0.7139219045639038, + "learning_rate": 1.802906953441189e-05, + "loss": 0.4328, + "step": 9120 + }, + { + "epoch": 0.20339511963162266, + "grad_norm": 0.7321597933769226, + "learning_rate": 1.8026981891287593e-05, + "loss": 0.3183, + "step": 9125 + }, + { + "epoch": 0.2035065690122427, + "grad_norm": 0.6543447375297546, + "learning_rate": 1.8024893264117275e-05, + "loss": 0.4174, + "step": 9130 + }, + { + "epoch": 0.20361801839286278, + "grad_norm": 0.6497056484222412, + "learning_rate": 1.8022803653156983e-05, + "loss": 0.4571, + "step": 9135 + }, + { + "epoch": 0.20372946777348286, + "grad_norm": 0.5019301176071167, + "learning_rate": 1.8020713058662894e-05, + "loss": 0.4811, + "step": 9140 + }, + { + "epoch": 0.2038409171541029, + "grad_norm": 0.6254482269287109, + "learning_rate": 1.8018621480891292e-05, + "loss": 0.3229, + "step": 9145 + }, + { + "epoch": 0.20395236653472298, + "grad_norm": 0.542697548866272, + "learning_rate": 1.801652892009859e-05, + "loss": 0.3139, + "step": 9150 + }, + { + "epoch": 0.20406381591534306, + "grad_norm": 0.5235081315040588, + "learning_rate": 1.8014435376541325e-05, + "loss": 0.3208, + "step": 9155 + }, + { + "epoch": 0.2041752652959631, + "grad_norm": 0.7484645843505859, + "learning_rate": 1.801234085047614e-05, + "loss": 0.3552, + "step": 9160 + }, + { + "epoch": 0.2042867146765832, + "grad_norm": 0.9511454105377197, + "learning_rate": 1.8010245342159812e-05, + "loss": 0.2527, + "step": 9165 + }, + { + "epoch": 0.20439816405720326, + "grad_norm": 0.4497017562389374, + "learning_rate": 1.8008148851849237e-05, + "loss": 0.3487, + "step": 9170 + }, + { + "epoch": 0.2045096134378233, + "grad_norm": 0.5408441424369812, + "learning_rate": 1.8006051379801425e-05, + "loss": 0.3263, + "step": 9175 + }, + { + "epoch": 0.2046210628184434, + "grad_norm": 0.6289383172988892, + "learning_rate": 1.800395292627351e-05, + "loss": 0.4548, + "step": 9180 + }, + { + "epoch": 0.20473251219906347, + "grad_norm": 0.5261548757553101, + "learning_rate": 1.8001853491522753e-05, + "loss": 0.3654, + "step": 9185 + }, + { + "epoch": 0.20484396157968351, + "grad_norm": 0.5952202081680298, + "learning_rate": 1.7999753075806516e-05, + "loss": 0.4186, + "step": 9190 + }, + { + "epoch": 0.2049554109603036, + "grad_norm": 0.9061971306800842, + "learning_rate": 1.7997651679382303e-05, + "loss": 0.2387, + "step": 9195 + }, + { + "epoch": 0.20506686034092367, + "grad_norm": 0.661060094833374, + "learning_rate": 1.7995549302507725e-05, + "loss": 0.3395, + "step": 9200 + }, + { + "epoch": 0.20517830972154372, + "grad_norm": 0.520475447177887, + "learning_rate": 1.7993445945440523e-05, + "loss": 0.3408, + "step": 9205 + }, + { + "epoch": 0.2052897591021638, + "grad_norm": 0.5380303859710693, + "learning_rate": 1.7991341608438546e-05, + "loss": 0.3459, + "step": 9210 + }, + { + "epoch": 0.20540120848278387, + "grad_norm": 0.674584686756134, + "learning_rate": 1.798923629175977e-05, + "loss": 0.3531, + "step": 9215 + }, + { + "epoch": 0.20551265786340392, + "grad_norm": 0.5248068571090698, + "learning_rate": 1.798712999566229e-05, + "loss": 0.343, + "step": 9220 + }, + { + "epoch": 0.205624107244024, + "grad_norm": 0.46953219175338745, + "learning_rate": 1.7985022720404332e-05, + "loss": 0.4012, + "step": 9225 + }, + { + "epoch": 0.20573555662464404, + "grad_norm": 0.5328274369239807, + "learning_rate": 1.7982914466244216e-05, + "loss": 0.4516, + "step": 9230 + }, + { + "epoch": 0.20584700600526412, + "grad_norm": 0.5312590003013611, + "learning_rate": 1.7980805233440406e-05, + "loss": 0.3619, + "step": 9235 + }, + { + "epoch": 0.2059584553858842, + "grad_norm": 0.4466384947299957, + "learning_rate": 1.797869502225148e-05, + "loss": 0.3922, + "step": 9240 + }, + { + "epoch": 0.20606990476650425, + "grad_norm": 0.433711975812912, + "learning_rate": 1.7976583832936124e-05, + "loss": 0.4335, + "step": 9245 + }, + { + "epoch": 0.20618135414712432, + "grad_norm": 0.612484335899353, + "learning_rate": 1.7974471665753164e-05, + "loss": 0.3176, + "step": 9250 + }, + { + "epoch": 0.2062928035277444, + "grad_norm": 0.5125216841697693, + "learning_rate": 1.7972358520961535e-05, + "loss": 0.389, + "step": 9255 + }, + { + "epoch": 0.20640425290836445, + "grad_norm": 0.7068095803260803, + "learning_rate": 1.7970244398820283e-05, + "loss": 0.3994, + "step": 9260 + }, + { + "epoch": 0.20651570228898453, + "grad_norm": 0.6456936597824097, + "learning_rate": 1.7968129299588592e-05, + "loss": 0.3172, + "step": 9265 + }, + { + "epoch": 0.2066271516696046, + "grad_norm": 0.5113691091537476, + "learning_rate": 1.796601322352575e-05, + "loss": 0.3565, + "step": 9270 + }, + { + "epoch": 0.20673860105022465, + "grad_norm": 0.6549059152603149, + "learning_rate": 1.796389617089118e-05, + "loss": 0.3518, + "step": 9275 + }, + { + "epoch": 0.20685005043084473, + "grad_norm": 0.5204808115959167, + "learning_rate": 1.7961778141944407e-05, + "loss": 0.2559, + "step": 9280 + }, + { + "epoch": 0.2069614998114648, + "grad_norm": 0.6168663501739502, + "learning_rate": 1.795965913694509e-05, + "loss": 0.4356, + "step": 9285 + }, + { + "epoch": 0.20707294919208485, + "grad_norm": 0.47157636284828186, + "learning_rate": 1.7957539156153003e-05, + "loss": 0.3213, + "step": 9290 + }, + { + "epoch": 0.20718439857270493, + "grad_norm": 0.4153405427932739, + "learning_rate": 1.7955418199828035e-05, + "loss": 0.4165, + "step": 9295 + }, + { + "epoch": 0.207295847953325, + "grad_norm": 0.6035289764404297, + "learning_rate": 1.7953296268230203e-05, + "loss": 0.4489, + "step": 9300 + }, + { + "epoch": 0.20740729733394506, + "grad_norm": 0.9584718942642212, + "learning_rate": 1.795117336161964e-05, + "loss": 0.3088, + "step": 9305 + }, + { + "epoch": 0.20751874671456513, + "grad_norm": 0.533146858215332, + "learning_rate": 1.7949049480256596e-05, + "loss": 0.3791, + "step": 9310 + }, + { + "epoch": 0.2076301960951852, + "grad_norm": 0.5689879655838013, + "learning_rate": 1.7946924624401445e-05, + "loss": 0.4243, + "step": 9315 + }, + { + "epoch": 0.20774164547580526, + "grad_norm": 0.7022379636764526, + "learning_rate": 1.7944798794314676e-05, + "loss": 0.3408, + "step": 9320 + }, + { + "epoch": 0.20785309485642534, + "grad_norm": 0.6284753084182739, + "learning_rate": 1.7942671990256895e-05, + "loss": 0.288, + "step": 9325 + }, + { + "epoch": 0.2079645442370454, + "grad_norm": 0.5168965458869934, + "learning_rate": 1.794054421248884e-05, + "loss": 0.4631, + "step": 9330 + }, + { + "epoch": 0.20807599361766546, + "grad_norm": 0.797507107257843, + "learning_rate": 1.7938415461271356e-05, + "loss": 0.392, + "step": 9335 + }, + { + "epoch": 0.20818744299828554, + "grad_norm": 0.45130276679992676, + "learning_rate": 1.793628573686541e-05, + "loss": 0.3337, + "step": 9340 + }, + { + "epoch": 0.20829889237890561, + "grad_norm": 0.5223218202590942, + "learning_rate": 1.793415503953209e-05, + "loss": 0.3618, + "step": 9345 + }, + { + "epoch": 0.20841034175952566, + "grad_norm": 0.5980708003044128, + "learning_rate": 1.793202336953261e-05, + "loss": 0.5088, + "step": 9350 + }, + { + "epoch": 0.20852179114014574, + "grad_norm": 0.507690966129303, + "learning_rate": 1.7929890727128287e-05, + "loss": 0.309, + "step": 9355 + }, + { + "epoch": 0.20863324052076582, + "grad_norm": 0.565941572189331, + "learning_rate": 1.792775711258057e-05, + "loss": 0.4123, + "step": 9360 + }, + { + "epoch": 0.20874468990138587, + "grad_norm": 0.4737381637096405, + "learning_rate": 1.7925622526151022e-05, + "loss": 0.3996, + "step": 9365 + }, + { + "epoch": 0.20885613928200594, + "grad_norm": 0.610497772693634, + "learning_rate": 1.7923486968101332e-05, + "loss": 0.4446, + "step": 9370 + }, + { + "epoch": 0.20896758866262602, + "grad_norm": 0.6589722037315369, + "learning_rate": 1.79213504386933e-05, + "loss": 0.3663, + "step": 9375 + }, + { + "epoch": 0.20907903804324607, + "grad_norm": 0.518922746181488, + "learning_rate": 1.7919212938188843e-05, + "loss": 0.296, + "step": 9380 + }, + { + "epoch": 0.20919048742386614, + "grad_norm": 0.6134815216064453, + "learning_rate": 1.7917074466850012e-05, + "loss": 0.5138, + "step": 9385 + }, + { + "epoch": 0.20930193680448622, + "grad_norm": 0.42902672290802, + "learning_rate": 1.791493502493896e-05, + "loss": 0.3863, + "step": 9390 + }, + { + "epoch": 0.20941338618510627, + "grad_norm": 0.4924355149269104, + "learning_rate": 1.7912794612717968e-05, + "loss": 0.3656, + "step": 9395 + }, + { + "epoch": 0.20952483556572635, + "grad_norm": 0.4996645748615265, + "learning_rate": 1.7910653230449434e-05, + "loss": 0.5112, + "step": 9400 + }, + { + "epoch": 0.20963628494634642, + "grad_norm": 0.6504985690116882, + "learning_rate": 1.7908510878395874e-05, + "loss": 0.3295, + "step": 9405 + }, + { + "epoch": 0.20974773432696647, + "grad_norm": 0.5653944611549377, + "learning_rate": 1.7906367556819925e-05, + "loss": 0.3313, + "step": 9410 + }, + { + "epoch": 0.20985918370758655, + "grad_norm": 0.3870103359222412, + "learning_rate": 1.790422326598434e-05, + "loss": 0.3556, + "step": 9415 + }, + { + "epoch": 0.2099706330882066, + "grad_norm": 0.5160222053527832, + "learning_rate": 1.7902078006151996e-05, + "loss": 0.3648, + "step": 9420 + }, + { + "epoch": 0.21008208246882668, + "grad_norm": 0.4862663149833679, + "learning_rate": 1.789993177758588e-05, + "loss": 0.3417, + "step": 9425 + }, + { + "epoch": 0.21019353184944675, + "grad_norm": 0.5651835203170776, + "learning_rate": 1.789778458054911e-05, + "loss": 0.4301, + "step": 9430 + }, + { + "epoch": 0.2103049812300668, + "grad_norm": 0.6121236085891724, + "learning_rate": 1.789563641530491e-05, + "loss": 0.4275, + "step": 9435 + }, + { + "epoch": 0.21041643061068688, + "grad_norm": 0.5317579507827759, + "learning_rate": 1.789348728211663e-05, + "loss": 0.3764, + "step": 9440 + }, + { + "epoch": 0.21052787999130695, + "grad_norm": 0.8041971325874329, + "learning_rate": 1.7891337181247738e-05, + "loss": 0.3525, + "step": 9445 + }, + { + "epoch": 0.210639329371927, + "grad_norm": 0.5068250894546509, + "learning_rate": 1.788918611296182e-05, + "loss": 0.3994, + "step": 9450 + }, + { + "epoch": 0.21075077875254708, + "grad_norm": 0.6096692681312561, + "learning_rate": 1.788703407752258e-05, + "loss": 0.388, + "step": 9455 + }, + { + "epoch": 0.21086222813316716, + "grad_norm": 0.4330832064151764, + "learning_rate": 1.788488107519384e-05, + "loss": 0.3733, + "step": 9460 + }, + { + "epoch": 0.2109736775137872, + "grad_norm": 0.3921918272972107, + "learning_rate": 1.7882727106239542e-05, + "loss": 0.3411, + "step": 9465 + }, + { + "epoch": 0.21108512689440728, + "grad_norm": 0.6018163561820984, + "learning_rate": 1.7880572170923747e-05, + "loss": 0.3416, + "step": 9470 + }, + { + "epoch": 0.21119657627502736, + "grad_norm": 0.6241925954818726, + "learning_rate": 1.787841626951063e-05, + "loss": 0.3981, + "step": 9475 + }, + { + "epoch": 0.2113080256556474, + "grad_norm": 0.6851200461387634, + "learning_rate": 1.7876259402264496e-05, + "loss": 0.2918, + "step": 9480 + }, + { + "epoch": 0.21141947503626748, + "grad_norm": 0.6253845691680908, + "learning_rate": 1.787410156944975e-05, + "loss": 0.4206, + "step": 9485 + }, + { + "epoch": 0.21153092441688756, + "grad_norm": 0.5972841382026672, + "learning_rate": 1.7871942771330932e-05, + "loss": 0.3712, + "step": 9490 + }, + { + "epoch": 0.2116423737975076, + "grad_norm": 0.6291491389274597, + "learning_rate": 1.786978300817269e-05, + "loss": 0.3548, + "step": 9495 + }, + { + "epoch": 0.2117538231781277, + "grad_norm": 0.5687154531478882, + "learning_rate": 1.78676222802398e-05, + "loss": 0.3425, + "step": 9500 + }, + { + "epoch": 0.21186527255874776, + "grad_norm": 0.7182362079620361, + "learning_rate": 1.7865460587797142e-05, + "loss": 0.3582, + "step": 9505 + }, + { + "epoch": 0.2119767219393678, + "grad_norm": 0.6228922605514526, + "learning_rate": 1.7863297931109733e-05, + "loss": 0.3037, + "step": 9510 + }, + { + "epoch": 0.2120881713199879, + "grad_norm": 0.3843955397605896, + "learning_rate": 1.7861134310442692e-05, + "loss": 0.4066, + "step": 9515 + }, + { + "epoch": 0.21219962070060797, + "grad_norm": 0.47894778847694397, + "learning_rate": 1.7858969726061262e-05, + "loss": 0.3569, + "step": 9520 + }, + { + "epoch": 0.21231107008122801, + "grad_norm": 0.6425552368164062, + "learning_rate": 1.7856804178230805e-05, + "loss": 0.4807, + "step": 9525 + }, + { + "epoch": 0.2124225194618481, + "grad_norm": 0.5509926676750183, + "learning_rate": 1.7854637667216802e-05, + "loss": 0.4651, + "step": 9530 + }, + { + "epoch": 0.21253396884246817, + "grad_norm": 0.6003994941711426, + "learning_rate": 1.7852470193284848e-05, + "loss": 0.2946, + "step": 9535 + }, + { + "epoch": 0.21264541822308822, + "grad_norm": 0.4288851022720337, + "learning_rate": 1.785030175670066e-05, + "loss": 0.2704, + "step": 9540 + }, + { + "epoch": 0.2127568676037083, + "grad_norm": 0.5786031484603882, + "learning_rate": 1.7848132357730078e-05, + "loss": 0.2123, + "step": 9545 + }, + { + "epoch": 0.21286831698432837, + "grad_norm": 0.4634719789028168, + "learning_rate": 1.784596199663904e-05, + "loss": 0.3564, + "step": 9550 + }, + { + "epoch": 0.21297976636494842, + "grad_norm": 0.518600583076477, + "learning_rate": 1.7843790673693627e-05, + "loss": 0.3986, + "step": 9555 + }, + { + "epoch": 0.2130912157455685, + "grad_norm": 0.46757927536964417, + "learning_rate": 1.784161838916002e-05, + "loss": 0.3545, + "step": 9560 + }, + { + "epoch": 0.21320266512618857, + "grad_norm": 0.660696804523468, + "learning_rate": 1.7839445143304524e-05, + "loss": 0.3854, + "step": 9565 + }, + { + "epoch": 0.21331411450680862, + "grad_norm": 0.41932207345962524, + "learning_rate": 1.783727093639357e-05, + "loss": 0.3081, + "step": 9570 + }, + { + "epoch": 0.2134255638874287, + "grad_norm": 0.5107130408287048, + "learning_rate": 1.783509576869369e-05, + "loss": 0.3752, + "step": 9575 + }, + { + "epoch": 0.21353701326804878, + "grad_norm": 0.5685948133468628, + "learning_rate": 1.7832919640471548e-05, + "loss": 0.3624, + "step": 9580 + }, + { + "epoch": 0.21364846264866882, + "grad_norm": 0.4625929296016693, + "learning_rate": 1.7830742551993916e-05, + "loss": 0.4257, + "step": 9585 + }, + { + "epoch": 0.2137599120292889, + "grad_norm": 0.6444830894470215, + "learning_rate": 1.7828564503527696e-05, + "loss": 0.2999, + "step": 9590 + }, + { + "epoch": 0.21387136140990898, + "grad_norm": 0.5308654308319092, + "learning_rate": 1.7826385495339892e-05, + "loss": 0.5006, + "step": 9595 + }, + { + "epoch": 0.21398281079052903, + "grad_norm": 0.549062192440033, + "learning_rate": 1.7824205527697635e-05, + "loss": 0.3594, + "step": 9600 + }, + { + "epoch": 0.2140942601711491, + "grad_norm": 0.4222000241279602, + "learning_rate": 1.782202460086818e-05, + "loss": 0.3319, + "step": 9605 + }, + { + "epoch": 0.21420570955176915, + "grad_norm": 0.32505983114242554, + "learning_rate": 1.781984271511888e-05, + "loss": 0.3907, + "step": 9610 + }, + { + "epoch": 0.21431715893238923, + "grad_norm": 0.5011066198348999, + "learning_rate": 1.7817659870717227e-05, + "loss": 0.3543, + "step": 9615 + }, + { + "epoch": 0.2144286083130093, + "grad_norm": 0.5036129355430603, + "learning_rate": 1.7815476067930816e-05, + "loss": 0.4492, + "step": 9620 + }, + { + "epoch": 0.21454005769362935, + "grad_norm": 0.40307238698005676, + "learning_rate": 1.7813291307027363e-05, + "loss": 0.2483, + "step": 9625 + }, + { + "epoch": 0.21465150707424943, + "grad_norm": 0.8827428221702576, + "learning_rate": 1.7811105588274705e-05, + "loss": 0.337, + "step": 9630 + }, + { + "epoch": 0.2147629564548695, + "grad_norm": 0.497211217880249, + "learning_rate": 1.7808918911940795e-05, + "loss": 0.4192, + "step": 9635 + }, + { + "epoch": 0.21487440583548956, + "grad_norm": 0.5260987281799316, + "learning_rate": 1.7806731278293706e-05, + "loss": 0.4036, + "step": 9640 + }, + { + "epoch": 0.21498585521610963, + "grad_norm": 0.6299923658370972, + "learning_rate": 1.7804542687601614e-05, + "loss": 0.3552, + "step": 9645 + }, + { + "epoch": 0.2150973045967297, + "grad_norm": 0.5604374408721924, + "learning_rate": 1.7802353140132835e-05, + "loss": 0.2949, + "step": 9650 + }, + { + "epoch": 0.21520875397734976, + "grad_norm": 0.7191600203514099, + "learning_rate": 1.7800162636155786e-05, + "loss": 0.4364, + "step": 9655 + }, + { + "epoch": 0.21532020335796984, + "grad_norm": 0.6973847150802612, + "learning_rate": 1.7797971175939004e-05, + "loss": 0.3598, + "step": 9660 + }, + { + "epoch": 0.2154316527385899, + "grad_norm": 0.6464542150497437, + "learning_rate": 1.779577875975115e-05, + "loss": 0.3229, + "step": 9665 + }, + { + "epoch": 0.21554310211920996, + "grad_norm": 0.5332272052764893, + "learning_rate": 1.779358538786099e-05, + "loss": 0.3024, + "step": 9670 + }, + { + "epoch": 0.21565455149983004, + "grad_norm": 0.5820956826210022, + "learning_rate": 1.779139106053742e-05, + "loss": 0.2843, + "step": 9675 + }, + { + "epoch": 0.21576600088045012, + "grad_norm": 0.746467113494873, + "learning_rate": 1.7789195778049448e-05, + "loss": 0.3948, + "step": 9680 + }, + { + "epoch": 0.21587745026107016, + "grad_norm": 0.593795895576477, + "learning_rate": 1.7786999540666197e-05, + "loss": 0.4381, + "step": 9685 + }, + { + "epoch": 0.21598889964169024, + "grad_norm": 0.568673849105835, + "learning_rate": 1.7784802348656906e-05, + "loss": 0.2747, + "step": 9690 + }, + { + "epoch": 0.21610034902231032, + "grad_norm": 0.5895990133285522, + "learning_rate": 1.778260420229094e-05, + "loss": 0.266, + "step": 9695 + }, + { + "epoch": 0.21621179840293037, + "grad_norm": 0.45582717657089233, + "learning_rate": 1.778040510183777e-05, + "loss": 0.3459, + "step": 9700 + }, + { + "epoch": 0.21632324778355044, + "grad_norm": 0.5722758769989014, + "learning_rate": 1.7778205047566987e-05, + "loss": 0.3353, + "step": 9705 + }, + { + "epoch": 0.21643469716417052, + "grad_norm": 0.492655873298645, + "learning_rate": 1.7776004039748307e-05, + "loss": 0.3778, + "step": 9710 + }, + { + "epoch": 0.21654614654479057, + "grad_norm": 0.7253314256668091, + "learning_rate": 1.777380207865155e-05, + "loss": 0.4439, + "step": 9715 + }, + { + "epoch": 0.21665759592541065, + "grad_norm": 0.6914671659469604, + "learning_rate": 1.777159916454667e-05, + "loss": 0.3464, + "step": 9720 + }, + { + "epoch": 0.21676904530603072, + "grad_norm": 0.4587441682815552, + "learning_rate": 1.7769395297703715e-05, + "loss": 0.3654, + "step": 9725 + }, + { + "epoch": 0.21688049468665077, + "grad_norm": 0.6213597059249878, + "learning_rate": 1.7767190478392863e-05, + "loss": 0.4613, + "step": 9730 + }, + { + "epoch": 0.21699194406727085, + "grad_norm": 0.4911822974681854, + "learning_rate": 1.7764984706884417e-05, + "loss": 0.2427, + "step": 9735 + }, + { + "epoch": 0.21710339344789092, + "grad_norm": 0.4868038296699524, + "learning_rate": 1.776277798344878e-05, + "loss": 0.438, + "step": 9740 + }, + { + "epoch": 0.21721484282851097, + "grad_norm": 0.8504787683486938, + "learning_rate": 1.7760570308356485e-05, + "loss": 0.3359, + "step": 9745 + }, + { + "epoch": 0.21732629220913105, + "grad_norm": 0.5720949172973633, + "learning_rate": 1.775836168187817e-05, + "loss": 0.5129, + "step": 9750 + }, + { + "epoch": 0.21743774158975113, + "grad_norm": 0.43098098039627075, + "learning_rate": 1.7756152104284602e-05, + "loss": 0.3171, + "step": 9755 + }, + { + "epoch": 0.21754919097037118, + "grad_norm": 0.5028066635131836, + "learning_rate": 1.775394157584665e-05, + "loss": 0.3363, + "step": 9760 + }, + { + "epoch": 0.21766064035099125, + "grad_norm": 0.47206565737724304, + "learning_rate": 1.7751730096835314e-05, + "loss": 0.3361, + "step": 9765 + }, + { + "epoch": 0.21777208973161133, + "grad_norm": 0.5294880270957947, + "learning_rate": 1.7749517667521702e-05, + "loss": 0.494, + "step": 9770 + }, + { + "epoch": 0.21788353911223138, + "grad_norm": 0.723375678062439, + "learning_rate": 1.7747304288177037e-05, + "loss": 0.3211, + "step": 9775 + }, + { + "epoch": 0.21799498849285145, + "grad_norm": 0.5910930633544922, + "learning_rate": 1.7745089959072672e-05, + "loss": 0.3621, + "step": 9780 + }, + { + "epoch": 0.21810643787347153, + "grad_norm": 0.790056586265564, + "learning_rate": 1.7742874680480057e-05, + "loss": 0.4886, + "step": 9785 + }, + { + "epoch": 0.21821788725409158, + "grad_norm": 0.5438470840454102, + "learning_rate": 1.7740658452670775e-05, + "loss": 0.4396, + "step": 9790 + }, + { + "epoch": 0.21832933663471166, + "grad_norm": 0.9391214847564697, + "learning_rate": 1.7738441275916515e-05, + "loss": 0.3882, + "step": 9795 + }, + { + "epoch": 0.2184407860153317, + "grad_norm": 0.6383192539215088, + "learning_rate": 1.7736223150489085e-05, + "loss": 0.4133, + "step": 9800 + }, + { + "epoch": 0.21855223539595178, + "grad_norm": 0.5294129848480225, + "learning_rate": 1.7734004076660413e-05, + "loss": 0.3046, + "step": 9805 + }, + { + "epoch": 0.21866368477657186, + "grad_norm": 0.3723093569278717, + "learning_rate": 1.7731784054702538e-05, + "loss": 0.4108, + "step": 9810 + }, + { + "epoch": 0.2187751341571919, + "grad_norm": 0.4764557480812073, + "learning_rate": 1.7729563084887615e-05, + "loss": 0.373, + "step": 9815 + }, + { + "epoch": 0.21888658353781199, + "grad_norm": 0.8178969025611877, + "learning_rate": 1.7727341167487925e-05, + "loss": 0.3919, + "step": 9820 + }, + { + "epoch": 0.21899803291843206, + "grad_norm": 0.5052682757377625, + "learning_rate": 1.772511830277585e-05, + "loss": 0.3084, + "step": 9825 + }, + { + "epoch": 0.2191094822990521, + "grad_norm": 0.617719829082489, + "learning_rate": 1.77228944910239e-05, + "loss": 0.3815, + "step": 9830 + }, + { + "epoch": 0.2192209316796722, + "grad_norm": 0.43401801586151123, + "learning_rate": 1.77206697325047e-05, + "loss": 0.3949, + "step": 9835 + }, + { + "epoch": 0.21933238106029226, + "grad_norm": 0.4182285964488983, + "learning_rate": 1.7718444027490983e-05, + "loss": 0.3994, + "step": 9840 + }, + { + "epoch": 0.2194438304409123, + "grad_norm": 0.431234210729599, + "learning_rate": 1.7716217376255608e-05, + "loss": 0.4415, + "step": 9845 + }, + { + "epoch": 0.2195552798215324, + "grad_norm": 0.6229223608970642, + "learning_rate": 1.771398977907154e-05, + "loss": 0.4124, + "step": 9850 + }, + { + "epoch": 0.21966672920215247, + "grad_norm": 0.5385680198669434, + "learning_rate": 1.771176123621187e-05, + "loss": 0.5053, + "step": 9855 + }, + { + "epoch": 0.21977817858277252, + "grad_norm": 0.5043733716011047, + "learning_rate": 1.7709531747949796e-05, + "loss": 0.4626, + "step": 9860 + }, + { + "epoch": 0.2198896279633926, + "grad_norm": 0.4025327265262604, + "learning_rate": 1.770730131455864e-05, + "loss": 0.2598, + "step": 9865 + }, + { + "epoch": 0.22000107734401267, + "grad_norm": 0.6064236760139465, + "learning_rate": 1.7705069936311836e-05, + "loss": 0.4956, + "step": 9870 + }, + { + "epoch": 0.22011252672463272, + "grad_norm": 0.28435173630714417, + "learning_rate": 1.7702837613482925e-05, + "loss": 0.2965, + "step": 9875 + }, + { + "epoch": 0.2202239761052528, + "grad_norm": 0.5732418894767761, + "learning_rate": 1.7700604346345588e-05, + "loss": 0.4326, + "step": 9880 + }, + { + "epoch": 0.22033542548587287, + "grad_norm": 0.6132063269615173, + "learning_rate": 1.769837013517359e-05, + "loss": 0.2965, + "step": 9885 + }, + { + "epoch": 0.22044687486649292, + "grad_norm": 0.8895071148872375, + "learning_rate": 1.769613498024084e-05, + "loss": 0.3603, + "step": 9890 + }, + { + "epoch": 0.220558324247113, + "grad_norm": 0.6477868556976318, + "learning_rate": 1.7693898881821344e-05, + "loss": 0.3832, + "step": 9895 + }, + { + "epoch": 0.22066977362773307, + "grad_norm": 0.7001681923866272, + "learning_rate": 1.7691661840189235e-05, + "loss": 0.2867, + "step": 9900 + }, + { + "epoch": 0.22078122300835312, + "grad_norm": 0.5996546745300293, + "learning_rate": 1.7689423855618754e-05, + "loss": 0.3618, + "step": 9905 + }, + { + "epoch": 0.2208926723889732, + "grad_norm": 0.4318576753139496, + "learning_rate": 1.7687184928384263e-05, + "loss": 0.4265, + "step": 9910 + }, + { + "epoch": 0.22100412176959328, + "grad_norm": 0.519368052482605, + "learning_rate": 1.7684945058760235e-05, + "loss": 0.3522, + "step": 9915 + }, + { + "epoch": 0.22111557115021332, + "grad_norm": 0.28598570823669434, + "learning_rate": 1.7682704247021262e-05, + "loss": 0.3078, + "step": 9920 + }, + { + "epoch": 0.2212270205308334, + "grad_norm": 0.5853939056396484, + "learning_rate": 1.768046249344205e-05, + "loss": 0.4695, + "step": 9925 + }, + { + "epoch": 0.22133846991145348, + "grad_norm": 0.7244415283203125, + "learning_rate": 1.7678219798297417e-05, + "loss": 0.4155, + "step": 9930 + }, + { + "epoch": 0.22144991929207353, + "grad_norm": 0.712857723236084, + "learning_rate": 1.7675976161862303e-05, + "loss": 0.3553, + "step": 9935 + }, + { + "epoch": 0.2215613686726936, + "grad_norm": 0.46348175406455994, + "learning_rate": 1.7673731584411766e-05, + "loss": 0.3931, + "step": 9940 + }, + { + "epoch": 0.22167281805331368, + "grad_norm": 0.6503481864929199, + "learning_rate": 1.7671486066220965e-05, + "loss": 0.4813, + "step": 9945 + }, + { + "epoch": 0.22178426743393373, + "grad_norm": 0.47859442234039307, + "learning_rate": 1.7669239607565193e-05, + "loss": 0.3663, + "step": 9950 + }, + { + "epoch": 0.2218957168145538, + "grad_norm": 0.514894425868988, + "learning_rate": 1.7666992208719835e-05, + "loss": 0.4583, + "step": 9955 + }, + { + "epoch": 0.22200716619517388, + "grad_norm": 0.5483409762382507, + "learning_rate": 1.7664743869960416e-05, + "loss": 0.3314, + "step": 9960 + }, + { + "epoch": 0.22211861557579393, + "grad_norm": 0.6043745279312134, + "learning_rate": 1.766249459156256e-05, + "loss": 0.3185, + "step": 9965 + }, + { + "epoch": 0.222230064956414, + "grad_norm": 0.3309432864189148, + "learning_rate": 1.7660244373802014e-05, + "loss": 0.3373, + "step": 9970 + }, + { + "epoch": 0.22234151433703409, + "grad_norm": 0.6270701885223389, + "learning_rate": 1.7657993216954635e-05, + "loss": 0.5158, + "step": 9975 + }, + { + "epoch": 0.22245296371765413, + "grad_norm": 0.6763097643852234, + "learning_rate": 1.76557411212964e-05, + "loss": 0.4119, + "step": 9980 + }, + { + "epoch": 0.2225644130982742, + "grad_norm": 0.7081453800201416, + "learning_rate": 1.7653488087103393e-05, + "loss": 0.3731, + "step": 9985 + }, + { + "epoch": 0.22267586247889426, + "grad_norm": 0.4833124577999115, + "learning_rate": 1.7651234114651826e-05, + "loss": 0.3278, + "step": 9990 + }, + { + "epoch": 0.22278731185951434, + "grad_norm": 0.47149044275283813, + "learning_rate": 1.764897920421801e-05, + "loss": 0.3962, + "step": 9995 + }, + { + "epoch": 0.2228987612401344, + "grad_norm": 0.5421935319900513, + "learning_rate": 1.764672335607839e-05, + "loss": 0.3723, + "step": 10000 + }, + { + "epoch": 0.22301021062075446, + "grad_norm": 0.5773723125457764, + "learning_rate": 1.7644466570509508e-05, + "loss": 0.4249, + "step": 10005 + }, + { + "epoch": 0.22312166000137454, + "grad_norm": 0.425523579120636, + "learning_rate": 1.764220884778803e-05, + "loss": 0.4544, + "step": 10010 + }, + { + "epoch": 0.22323310938199462, + "grad_norm": 0.5259734392166138, + "learning_rate": 1.7639950188190735e-05, + "loss": 0.3608, + "step": 10015 + }, + { + "epoch": 0.22334455876261466, + "grad_norm": 0.566461980342865, + "learning_rate": 1.763769059199452e-05, + "loss": 0.3499, + "step": 10020 + }, + { + "epoch": 0.22345600814323474, + "grad_norm": 0.5393753051757812, + "learning_rate": 1.7635430059476396e-05, + "loss": 0.2665, + "step": 10025 + }, + { + "epoch": 0.22356745752385482, + "grad_norm": 0.5863169431686401, + "learning_rate": 1.763316859091348e-05, + "loss": 0.4295, + "step": 10030 + }, + { + "epoch": 0.22367890690447487, + "grad_norm": 0.5091935396194458, + "learning_rate": 1.7630906186583012e-05, + "loss": 0.4251, + "step": 10035 + }, + { + "epoch": 0.22379035628509494, + "grad_norm": 0.5334644913673401, + "learning_rate": 1.7628642846762348e-05, + "loss": 0.4067, + "step": 10040 + }, + { + "epoch": 0.22390180566571502, + "grad_norm": 0.5561589598655701, + "learning_rate": 1.7626378571728958e-05, + "loss": 0.3785, + "step": 10045 + }, + { + "epoch": 0.22401325504633507, + "grad_norm": 0.4826967120170593, + "learning_rate": 1.7624113361760418e-05, + "loss": 0.3507, + "step": 10050 + }, + { + "epoch": 0.22412470442695515, + "grad_norm": 0.5006000399589539, + "learning_rate": 1.762184721713443e-05, + "loss": 0.3652, + "step": 10055 + }, + { + "epoch": 0.22423615380757522, + "grad_norm": 0.7852641344070435, + "learning_rate": 1.7619580138128805e-05, + "loss": 0.2905, + "step": 10060 + }, + { + "epoch": 0.22434760318819527, + "grad_norm": 0.7374182939529419, + "learning_rate": 1.7617312125021468e-05, + "loss": 0.3224, + "step": 10065 + }, + { + "epoch": 0.22445905256881535, + "grad_norm": 0.5154327750205994, + "learning_rate": 1.7615043178090464e-05, + "loss": 0.2448, + "step": 10070 + }, + { + "epoch": 0.22457050194943542, + "grad_norm": 0.6623854637145996, + "learning_rate": 1.7612773297613945e-05, + "loss": 0.2127, + "step": 10075 + }, + { + "epoch": 0.22468195133005547, + "grad_norm": 0.4166138172149658, + "learning_rate": 1.761050248387018e-05, + "loss": 0.4585, + "step": 10080 + }, + { + "epoch": 0.22479340071067555, + "grad_norm": 0.5983569622039795, + "learning_rate": 1.7608230737137555e-05, + "loss": 0.2964, + "step": 10085 + }, + { + "epoch": 0.22490485009129563, + "grad_norm": 0.6410354971885681, + "learning_rate": 1.7605958057694564e-05, + "loss": 0.2499, + "step": 10090 + }, + { + "epoch": 0.22501629947191568, + "grad_norm": 0.5239635109901428, + "learning_rate": 1.7603684445819832e-05, + "loss": 0.3103, + "step": 10095 + }, + { + "epoch": 0.22512774885253575, + "grad_norm": 0.702729344367981, + "learning_rate": 1.7601409901792074e-05, + "loss": 0.4165, + "step": 10100 + }, + { + "epoch": 0.22523919823315583, + "grad_norm": 0.5636135935783386, + "learning_rate": 1.7599134425890136e-05, + "loss": 0.4647, + "step": 10105 + }, + { + "epoch": 0.22535064761377588, + "grad_norm": 0.5451076030731201, + "learning_rate": 1.7596858018392974e-05, + "loss": 0.3212, + "step": 10110 + }, + { + "epoch": 0.22546209699439596, + "grad_norm": 0.849031925201416, + "learning_rate": 1.7594580679579654e-05, + "loss": 0.3979, + "step": 10115 + }, + { + "epoch": 0.22557354637501603, + "grad_norm": 0.7917623519897461, + "learning_rate": 1.759230240972937e-05, + "loss": 0.3505, + "step": 10120 + }, + { + "epoch": 0.22568499575563608, + "grad_norm": 0.5842856168746948, + "learning_rate": 1.7590023209121412e-05, + "loss": 0.3653, + "step": 10125 + }, + { + "epoch": 0.22579644513625616, + "grad_norm": 0.5458952188491821, + "learning_rate": 1.7587743078035196e-05, + "loss": 0.4657, + "step": 10130 + }, + { + "epoch": 0.22590789451687623, + "grad_norm": 0.3667095899581909, + "learning_rate": 1.7585462016750245e-05, + "loss": 0.313, + "step": 10135 + }, + { + "epoch": 0.22601934389749628, + "grad_norm": 0.7304571270942688, + "learning_rate": 1.7583180025546202e-05, + "loss": 0.3666, + "step": 10140 + }, + { + "epoch": 0.22613079327811636, + "grad_norm": 0.5201693177223206, + "learning_rate": 1.7580897104702818e-05, + "loss": 0.4025, + "step": 10145 + }, + { + "epoch": 0.22624224265873644, + "grad_norm": 0.5686962604522705, + "learning_rate": 1.757861325449997e-05, + "loss": 0.3395, + "step": 10150 + }, + { + "epoch": 0.22635369203935649, + "grad_norm": 0.5492640733718872, + "learning_rate": 1.757632847521763e-05, + "loss": 0.2222, + "step": 10155 + }, + { + "epoch": 0.22646514141997656, + "grad_norm": 0.5925264358520508, + "learning_rate": 1.75740427671359e-05, + "loss": 0.334, + "step": 10160 + }, + { + "epoch": 0.22657659080059664, + "grad_norm": 0.6872429251670837, + "learning_rate": 1.7571756130534994e-05, + "loss": 0.3493, + "step": 10165 + }, + { + "epoch": 0.2266880401812167, + "grad_norm": 0.7906669974327087, + "learning_rate": 1.7569468565695227e-05, + "loss": 0.4535, + "step": 10170 + }, + { + "epoch": 0.22679948956183676, + "grad_norm": 0.5554289221763611, + "learning_rate": 1.7567180072897043e-05, + "loss": 0.305, + "step": 10175 + }, + { + "epoch": 0.2269109389424568, + "grad_norm": 0.3369906544685364, + "learning_rate": 1.7564890652420993e-05, + "loss": 0.37, + "step": 10180 + }, + { + "epoch": 0.2270223883230769, + "grad_norm": 0.5694258809089661, + "learning_rate": 1.7562600304547735e-05, + "loss": 0.3359, + "step": 10185 + }, + { + "epoch": 0.22713383770369697, + "grad_norm": 0.38094398379325867, + "learning_rate": 1.756030902955806e-05, + "loss": 0.454, + "step": 10190 + }, + { + "epoch": 0.22724528708431702, + "grad_norm": 0.5839552879333496, + "learning_rate": 1.755801682773285e-05, + "loss": 0.3097, + "step": 10195 + }, + { + "epoch": 0.2273567364649371, + "grad_norm": 0.5562794804573059, + "learning_rate": 1.7555723699353124e-05, + "loss": 0.3604, + "step": 10200 + }, + { + "epoch": 0.22746818584555717, + "grad_norm": 0.6823106408119202, + "learning_rate": 1.7553429644699988e-05, + "loss": 0.3474, + "step": 10205 + }, + { + "epoch": 0.22757963522617722, + "grad_norm": 0.618422269821167, + "learning_rate": 1.755113466405468e-05, + "loss": 0.5486, + "step": 10210 + }, + { + "epoch": 0.2276910846067973, + "grad_norm": 0.5379924178123474, + "learning_rate": 1.754883875769855e-05, + "loss": 0.5348, + "step": 10215 + }, + { + "epoch": 0.22780253398741737, + "grad_norm": 0.617397129535675, + "learning_rate": 1.7546541925913054e-05, + "loss": 0.3263, + "step": 10220 + }, + { + "epoch": 0.22791398336803742, + "grad_norm": 0.5682933926582336, + "learning_rate": 1.754424416897977e-05, + "loss": 0.4884, + "step": 10225 + }, + { + "epoch": 0.2280254327486575, + "grad_norm": 0.5430388450622559, + "learning_rate": 1.7541945487180383e-05, + "loss": 0.3395, + "step": 10230 + }, + { + "epoch": 0.22813688212927757, + "grad_norm": 0.7244122624397278, + "learning_rate": 1.7539645880796694e-05, + "loss": 0.4879, + "step": 10235 + }, + { + "epoch": 0.22824833150989762, + "grad_norm": 0.37884101271629333, + "learning_rate": 1.753734535011062e-05, + "loss": 0.3292, + "step": 10240 + }, + { + "epoch": 0.2283597808905177, + "grad_norm": 0.5905236601829529, + "learning_rate": 1.753504389540418e-05, + "loss": 0.3041, + "step": 10245 + }, + { + "epoch": 0.22847123027113778, + "grad_norm": 0.4799133837223053, + "learning_rate": 1.7532741516959527e-05, + "loss": 0.3097, + "step": 10250 + }, + { + "epoch": 0.22858267965175783, + "grad_norm": 0.6521375775337219, + "learning_rate": 1.7530438215058902e-05, + "loss": 0.4394, + "step": 10255 + }, + { + "epoch": 0.2286941290323779, + "grad_norm": 0.6131927967071533, + "learning_rate": 1.752813398998468e-05, + "loss": 0.365, + "step": 10260 + }, + { + "epoch": 0.22880557841299798, + "grad_norm": 0.6607373356819153, + "learning_rate": 1.752582884201934e-05, + "loss": 0.4046, + "step": 10265 + }, + { + "epoch": 0.22891702779361803, + "grad_norm": 0.5813794136047363, + "learning_rate": 1.7523522771445475e-05, + "loss": 0.3887, + "step": 10270 + }, + { + "epoch": 0.2290284771742381, + "grad_norm": 0.6334280967712402, + "learning_rate": 1.752121577854579e-05, + "loss": 0.4146, + "step": 10275 + }, + { + "epoch": 0.22913992655485818, + "grad_norm": 0.5421414375305176, + "learning_rate": 1.7518907863603102e-05, + "loss": 0.3437, + "step": 10280 + }, + { + "epoch": 0.22925137593547823, + "grad_norm": 0.5887249112129211, + "learning_rate": 1.7516599026900352e-05, + "loss": 0.328, + "step": 10285 + }, + { + "epoch": 0.2293628253160983, + "grad_norm": 0.6338925957679749, + "learning_rate": 1.751428926872058e-05, + "loss": 0.3079, + "step": 10290 + }, + { + "epoch": 0.22947427469671838, + "grad_norm": 0.43777555227279663, + "learning_rate": 1.751197858934694e-05, + "loss": 0.2509, + "step": 10295 + }, + { + "epoch": 0.22958572407733843, + "grad_norm": 0.4322524666786194, + "learning_rate": 1.7509666989062713e-05, + "loss": 0.3394, + "step": 10300 + }, + { + "epoch": 0.2296971734579585, + "grad_norm": 0.39146319031715393, + "learning_rate": 1.750735446815128e-05, + "loss": 0.3528, + "step": 10305 + }, + { + "epoch": 0.22980862283857859, + "grad_norm": 0.9868649244308472, + "learning_rate": 1.7505041026896133e-05, + "loss": 0.3424, + "step": 10310 + }, + { + "epoch": 0.22992007221919863, + "grad_norm": 0.6405900716781616, + "learning_rate": 1.7502726665580887e-05, + "loss": 0.4043, + "step": 10315 + }, + { + "epoch": 0.2300315215998187, + "grad_norm": 0.4962483048439026, + "learning_rate": 1.7500411384489267e-05, + "loss": 0.3992, + "step": 10320 + }, + { + "epoch": 0.2301429709804388, + "grad_norm": 0.5950126051902771, + "learning_rate": 1.74980951839051e-05, + "loss": 0.2763, + "step": 10325 + }, + { + "epoch": 0.23025442036105884, + "grad_norm": 0.6838284134864807, + "learning_rate": 1.7495778064112347e-05, + "loss": 0.409, + "step": 10330 + }, + { + "epoch": 0.2303658697416789, + "grad_norm": 0.6459113359451294, + "learning_rate": 1.7493460025395056e-05, + "loss": 0.3499, + "step": 10335 + }, + { + "epoch": 0.230477319122299, + "grad_norm": 0.5087669491767883, + "learning_rate": 1.749114106803741e-05, + "loss": 0.2831, + "step": 10340 + }, + { + "epoch": 0.23058876850291904, + "grad_norm": 0.5966417789459229, + "learning_rate": 1.748882119232369e-05, + "loss": 0.3628, + "step": 10345 + }, + { + "epoch": 0.23070021788353912, + "grad_norm": 0.6546626687049866, + "learning_rate": 1.7486500398538298e-05, + "loss": 0.4678, + "step": 10350 + }, + { + "epoch": 0.2308116672641592, + "grad_norm": 0.7793821692466736, + "learning_rate": 1.7484178686965744e-05, + "loss": 0.4621, + "step": 10355 + }, + { + "epoch": 0.23092311664477924, + "grad_norm": 0.4167221486568451, + "learning_rate": 1.7481856057890652e-05, + "loss": 0.3032, + "step": 10360 + }, + { + "epoch": 0.23103456602539932, + "grad_norm": 0.5104589462280273, + "learning_rate": 1.7479532511597758e-05, + "loss": 0.3208, + "step": 10365 + }, + { + "epoch": 0.23114601540601937, + "grad_norm": 0.5882985591888428, + "learning_rate": 1.7477208048371908e-05, + "loss": 0.3434, + "step": 10370 + }, + { + "epoch": 0.23125746478663944, + "grad_norm": 0.4183305501937866, + "learning_rate": 1.7474882668498072e-05, + "loss": 0.291, + "step": 10375 + }, + { + "epoch": 0.23136891416725952, + "grad_norm": 0.40787050127983093, + "learning_rate": 1.7472556372261316e-05, + "loss": 0.2915, + "step": 10380 + }, + { + "epoch": 0.23148036354787957, + "grad_norm": 0.621015727519989, + "learning_rate": 1.7470229159946827e-05, + "loss": 0.3706, + "step": 10385 + }, + { + "epoch": 0.23159181292849965, + "grad_norm": 0.5765045881271362, + "learning_rate": 1.7467901031839906e-05, + "loss": 0.3946, + "step": 10390 + }, + { + "epoch": 0.23170326230911972, + "grad_norm": 0.5098361968994141, + "learning_rate": 1.746557198822596e-05, + "loss": 0.3383, + "step": 10395 + }, + { + "epoch": 0.23181471168973977, + "grad_norm": 0.5407850742340088, + "learning_rate": 1.7463242029390516e-05, + "loss": 0.3399, + "step": 10400 + }, + { + "epoch": 0.23192616107035985, + "grad_norm": 0.5743176937103271, + "learning_rate": 1.7460911155619205e-05, + "loss": 0.3745, + "step": 10405 + }, + { + "epoch": 0.23203761045097993, + "grad_norm": 0.6098573207855225, + "learning_rate": 1.7458579367197773e-05, + "loss": 0.3642, + "step": 10410 + }, + { + "epoch": 0.23214905983159997, + "grad_norm": 0.7372178435325623, + "learning_rate": 1.7456246664412085e-05, + "loss": 0.3546, + "step": 10415 + }, + { + "epoch": 0.23226050921222005, + "grad_norm": 0.6569801568984985, + "learning_rate": 1.745391304754811e-05, + "loss": 0.3821, + "step": 10420 + }, + { + "epoch": 0.23237195859284013, + "grad_norm": 0.6863839030265808, + "learning_rate": 1.745157851689193e-05, + "loss": 0.3249, + "step": 10425 + }, + { + "epoch": 0.23248340797346018, + "grad_norm": 0.6198316216468811, + "learning_rate": 1.744924307272974e-05, + "loss": 0.3759, + "step": 10430 + }, + { + "epoch": 0.23259485735408025, + "grad_norm": 0.4969221353530884, + "learning_rate": 1.7446906715347852e-05, + "loss": 0.2658, + "step": 10435 + }, + { + "epoch": 0.23270630673470033, + "grad_norm": 0.7487539649009705, + "learning_rate": 1.7444569445032677e-05, + "loss": 0.3694, + "step": 10440 + }, + { + "epoch": 0.23281775611532038, + "grad_norm": 0.7681021690368652, + "learning_rate": 1.7442231262070756e-05, + "loss": 0.3903, + "step": 10445 + }, + { + "epoch": 0.23292920549594046, + "grad_norm": 0.6074617505073547, + "learning_rate": 1.7439892166748726e-05, + "loss": 0.2939, + "step": 10450 + }, + { + "epoch": 0.23304065487656053, + "grad_norm": 0.46127086877822876, + "learning_rate": 1.7437552159353338e-05, + "loss": 0.3318, + "step": 10455 + }, + { + "epoch": 0.23315210425718058, + "grad_norm": 0.6152943968772888, + "learning_rate": 1.743521124017147e-05, + "loss": 0.3491, + "step": 10460 + }, + { + "epoch": 0.23326355363780066, + "grad_norm": 0.6032249927520752, + "learning_rate": 1.7432869409490095e-05, + "loss": 0.3311, + "step": 10465 + }, + { + "epoch": 0.23337500301842073, + "grad_norm": 0.7437279224395752, + "learning_rate": 1.7430526667596305e-05, + "loss": 0.414, + "step": 10470 + }, + { + "epoch": 0.23348645239904078, + "grad_norm": 0.7488385438919067, + "learning_rate": 1.7428183014777293e-05, + "loss": 0.3895, + "step": 10475 + }, + { + "epoch": 0.23359790177966086, + "grad_norm": 0.6262264251708984, + "learning_rate": 1.7425838451320387e-05, + "loss": 0.3258, + "step": 10480 + }, + { + "epoch": 0.23370935116028094, + "grad_norm": 0.7204452157020569, + "learning_rate": 1.7423492977513004e-05, + "loss": 0.4216, + "step": 10485 + }, + { + "epoch": 0.23382080054090099, + "grad_norm": 0.462632954120636, + "learning_rate": 1.7421146593642683e-05, + "loss": 0.3316, + "step": 10490 + }, + { + "epoch": 0.23393224992152106, + "grad_norm": 0.758755624294281, + "learning_rate": 1.741879929999707e-05, + "loss": 0.4178, + "step": 10495 + }, + { + "epoch": 0.23404369930214114, + "grad_norm": 0.46575212478637695, + "learning_rate": 1.7416451096863928e-05, + "loss": 0.452, + "step": 10500 + }, + { + "epoch": 0.2341551486827612, + "grad_norm": 0.7237605452537537, + "learning_rate": 1.7414101984531127e-05, + "loss": 0.3371, + "step": 10505 + }, + { + "epoch": 0.23426659806338127, + "grad_norm": 0.49028345942497253, + "learning_rate": 1.741175196328665e-05, + "loss": 0.3716, + "step": 10510 + }, + { + "epoch": 0.23437804744400134, + "grad_norm": 0.5918816328048706, + "learning_rate": 1.7409401033418596e-05, + "loss": 0.4963, + "step": 10515 + }, + { + "epoch": 0.2344894968246214, + "grad_norm": 0.48566198348999023, + "learning_rate": 1.7407049195215167e-05, + "loss": 0.3863, + "step": 10520 + }, + { + "epoch": 0.23460094620524147, + "grad_norm": 0.6173545718193054, + "learning_rate": 1.7404696448964676e-05, + "loss": 0.3703, + "step": 10525 + }, + { + "epoch": 0.23471239558586154, + "grad_norm": 0.6073485612869263, + "learning_rate": 1.740234279495556e-05, + "loss": 0.4293, + "step": 10530 + }, + { + "epoch": 0.2348238449664816, + "grad_norm": 0.5335012078285217, + "learning_rate": 1.739998823347635e-05, + "loss": 0.3025, + "step": 10535 + }, + { + "epoch": 0.23493529434710167, + "grad_norm": 0.5193667411804199, + "learning_rate": 1.7397632764815706e-05, + "loss": 0.4183, + "step": 10540 + }, + { + "epoch": 0.23504674372772175, + "grad_norm": 0.580973207950592, + "learning_rate": 1.7395276389262387e-05, + "loss": 0.402, + "step": 10545 + }, + { + "epoch": 0.2351581931083418, + "grad_norm": 0.6126024723052979, + "learning_rate": 1.7392919107105262e-05, + "loss": 0.3532, + "step": 10550 + }, + { + "epoch": 0.23526964248896187, + "grad_norm": 0.5873859524726868, + "learning_rate": 1.739056091863332e-05, + "loss": 0.4641, + "step": 10555 + }, + { + "epoch": 0.23538109186958192, + "grad_norm": 0.721373975276947, + "learning_rate": 1.7388201824135658e-05, + "loss": 0.3835, + "step": 10560 + }, + { + "epoch": 0.235492541250202, + "grad_norm": 0.5195067524909973, + "learning_rate": 1.7385841823901478e-05, + "loss": 0.4342, + "step": 10565 + }, + { + "epoch": 0.23560399063082207, + "grad_norm": 0.5306962132453918, + "learning_rate": 1.73834809182201e-05, + "loss": 0.2926, + "step": 10570 + }, + { + "epoch": 0.23571544001144212, + "grad_norm": 0.6120619177818298, + "learning_rate": 1.7381119107380956e-05, + "loss": 0.3548, + "step": 10575 + }, + { + "epoch": 0.2358268893920622, + "grad_norm": 0.669775128364563, + "learning_rate": 1.7378756391673578e-05, + "loss": 0.4739, + "step": 10580 + }, + { + "epoch": 0.23593833877268228, + "grad_norm": 0.4187053442001343, + "learning_rate": 1.7376392771387623e-05, + "loss": 0.2801, + "step": 10585 + }, + { + "epoch": 0.23604978815330233, + "grad_norm": 0.4181480407714844, + "learning_rate": 1.7374028246812855e-05, + "loss": 0.4613, + "step": 10590 + }, + { + "epoch": 0.2361612375339224, + "grad_norm": 0.48722395300865173, + "learning_rate": 1.737166281823914e-05, + "loss": 0.3843, + "step": 10595 + }, + { + "epoch": 0.23627268691454248, + "grad_norm": 0.4884631931781769, + "learning_rate": 1.7369296485956465e-05, + "loss": 0.3571, + "step": 10600 + }, + { + "epoch": 0.23638413629516253, + "grad_norm": 0.568600594997406, + "learning_rate": 1.7366929250254926e-05, + "loss": 0.3935, + "step": 10605 + }, + { + "epoch": 0.2364955856757826, + "grad_norm": 0.5506514310836792, + "learning_rate": 1.7364561111424717e-05, + "loss": 0.4381, + "step": 10610 + }, + { + "epoch": 0.23660703505640268, + "grad_norm": 0.5655339956283569, + "learning_rate": 1.7362192069756168e-05, + "loss": 0.3502, + "step": 10615 + }, + { + "epoch": 0.23671848443702273, + "grad_norm": 0.5412592887878418, + "learning_rate": 1.7359822125539695e-05, + "loss": 0.342, + "step": 10620 + }, + { + "epoch": 0.2368299338176428, + "grad_norm": 0.6305738687515259, + "learning_rate": 1.735745127906584e-05, + "loss": 0.2947, + "step": 10625 + }, + { + "epoch": 0.23694138319826288, + "grad_norm": 0.8514442443847656, + "learning_rate": 1.735507953062525e-05, + "loss": 0.2562, + "step": 10630 + }, + { + "epoch": 0.23705283257888293, + "grad_norm": 0.6014148592948914, + "learning_rate": 1.735270688050868e-05, + "loss": 0.3063, + "step": 10635 + }, + { + "epoch": 0.237164281959503, + "grad_norm": 0.5961599349975586, + "learning_rate": 1.7350333329007e-05, + "loss": 0.2601, + "step": 10640 + }, + { + "epoch": 0.23727573134012309, + "grad_norm": 0.5256707668304443, + "learning_rate": 1.7347958876411196e-05, + "loss": 0.3067, + "step": 10645 + }, + { + "epoch": 0.23738718072074314, + "grad_norm": 0.45404037833213806, + "learning_rate": 1.7345583523012344e-05, + "loss": 0.3712, + "step": 10650 + }, + { + "epoch": 0.2374986301013632, + "grad_norm": 0.5770034790039062, + "learning_rate": 1.7343207269101655e-05, + "loss": 0.391, + "step": 10655 + }, + { + "epoch": 0.2376100794819833, + "grad_norm": 0.5898873805999756, + "learning_rate": 1.7340830114970435e-05, + "loss": 0.4271, + "step": 10660 + }, + { + "epoch": 0.23772152886260334, + "grad_norm": 0.5621415972709656, + "learning_rate": 1.7338452060910104e-05, + "loss": 0.292, + "step": 10665 + }, + { + "epoch": 0.23783297824322341, + "grad_norm": 0.42645710706710815, + "learning_rate": 1.7336073107212197e-05, + "loss": 0.4255, + "step": 10670 + }, + { + "epoch": 0.2379444276238435, + "grad_norm": 0.678278386592865, + "learning_rate": 1.733369325416835e-05, + "loss": 0.4151, + "step": 10675 + }, + { + "epoch": 0.23805587700446354, + "grad_norm": 0.6612429618835449, + "learning_rate": 1.733131250207032e-05, + "loss": 0.3753, + "step": 10680 + }, + { + "epoch": 0.23816732638508362, + "grad_norm": 0.6525142192840576, + "learning_rate": 1.7328930851209963e-05, + "loss": 0.312, + "step": 10685 + }, + { + "epoch": 0.2382787757657037, + "grad_norm": 0.5441312193870544, + "learning_rate": 1.7326548301879258e-05, + "loss": 0.3952, + "step": 10690 + }, + { + "epoch": 0.23839022514632374, + "grad_norm": 0.600470781326294, + "learning_rate": 1.7324164854370283e-05, + "loss": 0.3412, + "step": 10695 + }, + { + "epoch": 0.23850167452694382, + "grad_norm": 0.5109327435493469, + "learning_rate": 1.7321780508975226e-05, + "loss": 0.3853, + "step": 10700 + }, + { + "epoch": 0.2386131239075639, + "grad_norm": 0.6196373701095581, + "learning_rate": 1.73193952659864e-05, + "loss": 0.4093, + "step": 10705 + }, + { + "epoch": 0.23872457328818394, + "grad_norm": 0.5077962279319763, + "learning_rate": 1.7317009125696208e-05, + "loss": 0.2362, + "step": 10710 + }, + { + "epoch": 0.23883602266880402, + "grad_norm": 0.6553756594657898, + "learning_rate": 1.7314622088397177e-05, + "loss": 0.3759, + "step": 10715 + }, + { + "epoch": 0.2389474720494241, + "grad_norm": 0.5542757511138916, + "learning_rate": 1.731223415438194e-05, + "loss": 0.3064, + "step": 10720 + }, + { + "epoch": 0.23905892143004415, + "grad_norm": 0.6027953028678894, + "learning_rate": 1.7309845323943236e-05, + "loss": 0.3629, + "step": 10725 + }, + { + "epoch": 0.23917037081066422, + "grad_norm": 0.7495735287666321, + "learning_rate": 1.7307455597373916e-05, + "loss": 0.3856, + "step": 10730 + }, + { + "epoch": 0.2392818201912843, + "grad_norm": 0.6348993182182312, + "learning_rate": 1.7305064974966946e-05, + "loss": 0.4807, + "step": 10735 + }, + { + "epoch": 0.23939326957190435, + "grad_norm": 0.6500808596611023, + "learning_rate": 1.73026734570154e-05, + "loss": 0.3925, + "step": 10740 + }, + { + "epoch": 0.23950471895252443, + "grad_norm": 0.47195371985435486, + "learning_rate": 1.7300281043812453e-05, + "loss": 0.2649, + "step": 10745 + }, + { + "epoch": 0.23961616833314447, + "grad_norm": 0.59906005859375, + "learning_rate": 1.72978877356514e-05, + "loss": 0.3296, + "step": 10750 + }, + { + "epoch": 0.23972761771376455, + "grad_norm": 0.479704350233078, + "learning_rate": 1.7295493532825643e-05, + "loss": 0.3996, + "step": 10755 + }, + { + "epoch": 0.23983906709438463, + "grad_norm": 0.5302325487136841, + "learning_rate": 1.729309843562869e-05, + "loss": 0.2719, + "step": 10760 + }, + { + "epoch": 0.23995051647500468, + "grad_norm": 0.4929358959197998, + "learning_rate": 1.729070244435416e-05, + "loss": 0.3027, + "step": 10765 + }, + { + "epoch": 0.24006196585562475, + "grad_norm": 0.5257954597473145, + "learning_rate": 1.7288305559295793e-05, + "loss": 0.344, + "step": 10770 + }, + { + "epoch": 0.24017341523624483, + "grad_norm": 0.5391680598258972, + "learning_rate": 1.7285907780747417e-05, + "loss": 0.3992, + "step": 10775 + }, + { + "epoch": 0.24028486461686488, + "grad_norm": 0.49891191720962524, + "learning_rate": 1.728350910900299e-05, + "loss": 0.4104, + "step": 10780 + }, + { + "epoch": 0.24039631399748496, + "grad_norm": 0.6244639158248901, + "learning_rate": 1.7281109544356556e-05, + "loss": 0.4117, + "step": 10785 + }, + { + "epoch": 0.24050776337810503, + "grad_norm": 0.5571478009223938, + "learning_rate": 1.72787090871023e-05, + "loss": 0.4336, + "step": 10790 + }, + { + "epoch": 0.24061921275872508, + "grad_norm": 0.5005432963371277, + "learning_rate": 1.7276307737534496e-05, + "loss": 0.461, + "step": 10795 + }, + { + "epoch": 0.24073066213934516, + "grad_norm": 0.8142117261886597, + "learning_rate": 1.7273905495947522e-05, + "loss": 0.3993, + "step": 10800 + }, + { + "epoch": 0.24084211151996524, + "grad_norm": 0.6402087807655334, + "learning_rate": 1.7271502362635883e-05, + "loss": 0.3807, + "step": 10805 + }, + { + "epoch": 0.24095356090058528, + "grad_norm": 0.37803009152412415, + "learning_rate": 1.726909833789418e-05, + "loss": 0.3521, + "step": 10810 + }, + { + "epoch": 0.24106501028120536, + "grad_norm": 0.3475203216075897, + "learning_rate": 1.7266693422017133e-05, + "loss": 0.5104, + "step": 10815 + }, + { + "epoch": 0.24117645966182544, + "grad_norm": 0.7365702986717224, + "learning_rate": 1.726428761529956e-05, + "loss": 0.3797, + "step": 10820 + }, + { + "epoch": 0.2412879090424455, + "grad_norm": 0.7421954870223999, + "learning_rate": 1.72618809180364e-05, + "loss": 0.3804, + "step": 10825 + }, + { + "epoch": 0.24139935842306556, + "grad_norm": 0.40259018540382385, + "learning_rate": 1.725947333052269e-05, + "loss": 0.4523, + "step": 10830 + }, + { + "epoch": 0.24151080780368564, + "grad_norm": 0.548994243144989, + "learning_rate": 1.7257064853053586e-05, + "loss": 0.3184, + "step": 10835 + }, + { + "epoch": 0.2416222571843057, + "grad_norm": 0.5792094469070435, + "learning_rate": 1.7254655485924346e-05, + "loss": 0.5211, + "step": 10840 + }, + { + "epoch": 0.24173370656492577, + "grad_norm": 0.648899495601654, + "learning_rate": 1.7252245229430336e-05, + "loss": 0.3558, + "step": 10845 + }, + { + "epoch": 0.24184515594554584, + "grad_norm": 0.47642436623573303, + "learning_rate": 1.7249834083867048e-05, + "loss": 0.4062, + "step": 10850 + }, + { + "epoch": 0.2419566053261659, + "grad_norm": 0.6215870976448059, + "learning_rate": 1.7247422049530056e-05, + "loss": 0.3595, + "step": 10855 + }, + { + "epoch": 0.24206805470678597, + "grad_norm": 0.8642063736915588, + "learning_rate": 1.7245009126715065e-05, + "loss": 0.496, + "step": 10860 + }, + { + "epoch": 0.24217950408740604, + "grad_norm": 0.682572066783905, + "learning_rate": 1.7242595315717876e-05, + "loss": 0.3212, + "step": 10865 + }, + { + "epoch": 0.2422909534680261, + "grad_norm": 0.6249151229858398, + "learning_rate": 1.7240180616834407e-05, + "loss": 0.479, + "step": 10870 + }, + { + "epoch": 0.24240240284864617, + "grad_norm": 0.7562622427940369, + "learning_rate": 1.723776503036068e-05, + "loss": 0.3625, + "step": 10875 + }, + { + "epoch": 0.24251385222926625, + "grad_norm": 0.53129643201828, + "learning_rate": 1.7235348556592826e-05, + "loss": 0.2043, + "step": 10880 + }, + { + "epoch": 0.2426253016098863, + "grad_norm": 0.5382141470909119, + "learning_rate": 1.7232931195827086e-05, + "loss": 0.3235, + "step": 10885 + }, + { + "epoch": 0.24273675099050637, + "grad_norm": 0.8118866682052612, + "learning_rate": 1.7230512948359816e-05, + "loss": 0.2424, + "step": 10890 + }, + { + "epoch": 0.24284820037112645, + "grad_norm": 0.7187613248825073, + "learning_rate": 1.7228093814487464e-05, + "loss": 0.4476, + "step": 10895 + }, + { + "epoch": 0.2429596497517465, + "grad_norm": 0.5982451438903809, + "learning_rate": 1.7225673794506604e-05, + "loss": 0.2502, + "step": 10900 + }, + { + "epoch": 0.24307109913236657, + "grad_norm": 0.6305253505706787, + "learning_rate": 1.7223252888713914e-05, + "loss": 0.2767, + "step": 10905 + }, + { + "epoch": 0.24318254851298665, + "grad_norm": 0.4656547009944916, + "learning_rate": 1.722083109740617e-05, + "loss": 0.3105, + "step": 10910 + }, + { + "epoch": 0.2432939978936067, + "grad_norm": 0.411578506231308, + "learning_rate": 1.7218408420880273e-05, + "loss": 0.3901, + "step": 10915 + }, + { + "epoch": 0.24340544727422678, + "grad_norm": 0.43576860427856445, + "learning_rate": 1.721598485943322e-05, + "loss": 0.2305, + "step": 10920 + }, + { + "epoch": 0.24351689665484685, + "grad_norm": 0.5072635412216187, + "learning_rate": 1.7213560413362122e-05, + "loss": 0.3357, + "step": 10925 + }, + { + "epoch": 0.2436283460354669, + "grad_norm": 0.7047842741012573, + "learning_rate": 1.7211135082964198e-05, + "loss": 0.3911, + "step": 10930 + }, + { + "epoch": 0.24373979541608698, + "grad_norm": 0.36933034658432007, + "learning_rate": 1.7208708868536775e-05, + "loss": 0.3613, + "step": 10935 + }, + { + "epoch": 0.24385124479670703, + "grad_norm": 0.6863102912902832, + "learning_rate": 1.7206281770377285e-05, + "loss": 0.3389, + "step": 10940 + }, + { + "epoch": 0.2439626941773271, + "grad_norm": 0.5595670938491821, + "learning_rate": 1.7203853788783275e-05, + "loss": 0.2501, + "step": 10945 + }, + { + "epoch": 0.24407414355794718, + "grad_norm": 0.8564665913581848, + "learning_rate": 1.7201424924052396e-05, + "loss": 0.3184, + "step": 10950 + }, + { + "epoch": 0.24418559293856723, + "grad_norm": 0.6190245151519775, + "learning_rate": 1.7198995176482407e-05, + "loss": 0.3406, + "step": 10955 + }, + { + "epoch": 0.2442970423191873, + "grad_norm": 0.5113375782966614, + "learning_rate": 1.7196564546371178e-05, + "loss": 0.3252, + "step": 10960 + }, + { + "epoch": 0.24440849169980738, + "grad_norm": 0.5863358974456787, + "learning_rate": 1.7194133034016686e-05, + "loss": 0.288, + "step": 10965 + }, + { + "epoch": 0.24451994108042743, + "grad_norm": 0.5502198934555054, + "learning_rate": 1.7191700639717012e-05, + "loss": 0.431, + "step": 10970 + }, + { + "epoch": 0.2446313904610475, + "grad_norm": 0.5362652540206909, + "learning_rate": 1.7189267363770352e-05, + "loss": 0.4535, + "step": 10975 + }, + { + "epoch": 0.2447428398416676, + "grad_norm": 0.9066315293312073, + "learning_rate": 1.7186833206475004e-05, + "loss": 0.325, + "step": 10980 + }, + { + "epoch": 0.24485428922228764, + "grad_norm": 0.7099729776382446, + "learning_rate": 1.7184398168129378e-05, + "loss": 0.3492, + "step": 10985 + }, + { + "epoch": 0.2449657386029077, + "grad_norm": 0.4674534201622009, + "learning_rate": 1.7181962249031995e-05, + "loss": 0.3179, + "step": 10990 + }, + { + "epoch": 0.2450771879835278, + "grad_norm": 0.48811131715774536, + "learning_rate": 1.7179525449481475e-05, + "loss": 0.4254, + "step": 10995 + }, + { + "epoch": 0.24518863736414784, + "grad_norm": 0.6559020280838013, + "learning_rate": 1.7177087769776556e-05, + "loss": 0.3824, + "step": 11000 + }, + { + "epoch": 0.24530008674476791, + "grad_norm": 0.683711051940918, + "learning_rate": 1.717464921021607e-05, + "loss": 0.3422, + "step": 11005 + }, + { + "epoch": 0.245411536125388, + "grad_norm": 0.5838513374328613, + "learning_rate": 1.7172209771098974e-05, + "loss": 0.301, + "step": 11010 + }, + { + "epoch": 0.24552298550600804, + "grad_norm": 0.6140018105506897, + "learning_rate": 1.716976945272432e-05, + "loss": 0.3821, + "step": 11015 + }, + { + "epoch": 0.24563443488662812, + "grad_norm": 0.49847617745399475, + "learning_rate": 1.716732825539127e-05, + "loss": 0.3379, + "step": 11020 + }, + { + "epoch": 0.2457458842672482, + "grad_norm": 0.6614333391189575, + "learning_rate": 1.7164886179399102e-05, + "loss": 0.496, + "step": 11025 + }, + { + "epoch": 0.24585733364786824, + "grad_norm": 0.3504364490509033, + "learning_rate": 1.7162443225047192e-05, + "loss": 0.223, + "step": 11030 + }, + { + "epoch": 0.24596878302848832, + "grad_norm": 0.7113224864006042, + "learning_rate": 1.715999939263503e-05, + "loss": 0.4054, + "step": 11035 + }, + { + "epoch": 0.2460802324091084, + "grad_norm": 0.7698032259941101, + "learning_rate": 1.715755468246221e-05, + "loss": 0.3, + "step": 11040 + }, + { + "epoch": 0.24619168178972844, + "grad_norm": 0.7142345309257507, + "learning_rate": 1.715510909482843e-05, + "loss": 0.2923, + "step": 11045 + }, + { + "epoch": 0.24630313117034852, + "grad_norm": 0.5872173309326172, + "learning_rate": 1.7152662630033506e-05, + "loss": 0.3531, + "step": 11050 + }, + { + "epoch": 0.2464145805509686, + "grad_norm": 0.7580005526542664, + "learning_rate": 1.7150215288377352e-05, + "loss": 0.3401, + "step": 11055 + }, + { + "epoch": 0.24652602993158865, + "grad_norm": 0.5612766742706299, + "learning_rate": 1.7147767070159995e-05, + "loss": 0.4963, + "step": 11060 + }, + { + "epoch": 0.24663747931220872, + "grad_norm": 0.5882358551025391, + "learning_rate": 1.714531797568157e-05, + "loss": 0.2482, + "step": 11065 + }, + { + "epoch": 0.2467489286928288, + "grad_norm": 0.4978226125240326, + "learning_rate": 1.714286800524231e-05, + "loss": 0.4981, + "step": 11070 + }, + { + "epoch": 0.24686037807344885, + "grad_norm": 0.4974943995475769, + "learning_rate": 1.7140417159142572e-05, + "loss": 0.2582, + "step": 11075 + }, + { + "epoch": 0.24697182745406893, + "grad_norm": 0.5015186071395874, + "learning_rate": 1.7137965437682803e-05, + "loss": 0.3072, + "step": 11080 + }, + { + "epoch": 0.247083276834689, + "grad_norm": 0.7087413668632507, + "learning_rate": 1.713551284116357e-05, + "loss": 0.4249, + "step": 11085 + }, + { + "epoch": 0.24719472621530905, + "grad_norm": 0.5860006213188171, + "learning_rate": 1.713305936988554e-05, + "loss": 0.3173, + "step": 11090 + }, + { + "epoch": 0.24730617559592913, + "grad_norm": 0.5131428241729736, + "learning_rate": 1.713060502414949e-05, + "loss": 0.3511, + "step": 11095 + }, + { + "epoch": 0.2474176249765492, + "grad_norm": 0.5306171178817749, + "learning_rate": 1.7128149804256303e-05, + "loss": 0.3353, + "step": 11100 + }, + { + "epoch": 0.24752907435716925, + "grad_norm": 0.5503345727920532, + "learning_rate": 1.7125693710506973e-05, + "loss": 0.3021, + "step": 11105 + }, + { + "epoch": 0.24764052373778933, + "grad_norm": 0.5068588852882385, + "learning_rate": 1.7123236743202592e-05, + "loss": 0.377, + "step": 11110 + }, + { + "epoch": 0.2477519731184094, + "grad_norm": 0.5373303294181824, + "learning_rate": 1.7120778902644374e-05, + "loss": 0.3971, + "step": 11115 + }, + { + "epoch": 0.24786342249902946, + "grad_norm": 0.594527006149292, + "learning_rate": 1.711832018913362e-05, + "loss": 0.2949, + "step": 11120 + }, + { + "epoch": 0.24797487187964953, + "grad_norm": 0.5444244146347046, + "learning_rate": 1.7115860602971758e-05, + "loss": 0.4187, + "step": 11125 + }, + { + "epoch": 0.24808632126026958, + "grad_norm": 0.5815946459770203, + "learning_rate": 1.7113400144460315e-05, + "loss": 0.3258, + "step": 11130 + }, + { + "epoch": 0.24819777064088966, + "grad_norm": 0.5695800185203552, + "learning_rate": 1.711093881390092e-05, + "loss": 0.3724, + "step": 11135 + }, + { + "epoch": 0.24830922002150974, + "grad_norm": 0.7198867201805115, + "learning_rate": 1.7108476611595317e-05, + "loss": 0.3852, + "step": 11140 + }, + { + "epoch": 0.24842066940212978, + "grad_norm": 0.9782135486602783, + "learning_rate": 1.7106013537845346e-05, + "loss": 0.4175, + "step": 11145 + }, + { + "epoch": 0.24853211878274986, + "grad_norm": 0.7631874084472656, + "learning_rate": 1.7103549592952967e-05, + "loss": 0.4098, + "step": 11150 + }, + { + "epoch": 0.24864356816336994, + "grad_norm": 0.6426448822021484, + "learning_rate": 1.7101084777220242e-05, + "loss": 0.3423, + "step": 11155 + }, + { + "epoch": 0.24875501754399, + "grad_norm": 0.45032215118408203, + "learning_rate": 1.709861909094933e-05, + "loss": 0.2994, + "step": 11160 + }, + { + "epoch": 0.24886646692461006, + "grad_norm": 0.5834007263183594, + "learning_rate": 1.7096152534442515e-05, + "loss": 0.3597, + "step": 11165 + }, + { + "epoch": 0.24897791630523014, + "grad_norm": 0.49693334102630615, + "learning_rate": 1.709368510800217e-05, + "loss": 0.5088, + "step": 11170 + }, + { + "epoch": 0.2490893656858502, + "grad_norm": 0.5814831852912903, + "learning_rate": 1.7091216811930788e-05, + "loss": 0.2275, + "step": 11175 + }, + { + "epoch": 0.24920081506647027, + "grad_norm": 0.6035648584365845, + "learning_rate": 1.7088747646530958e-05, + "loss": 0.4597, + "step": 11180 + }, + { + "epoch": 0.24931226444709034, + "grad_norm": 0.8806386590003967, + "learning_rate": 1.7086277612105384e-05, + "loss": 0.4899, + "step": 11185 + }, + { + "epoch": 0.2494237138277104, + "grad_norm": 0.7509183287620544, + "learning_rate": 1.708380670895687e-05, + "loss": 0.351, + "step": 11190 + }, + { + "epoch": 0.24953516320833047, + "grad_norm": 0.7867349982261658, + "learning_rate": 1.7081334937388335e-05, + "loss": 0.4327, + "step": 11195 + }, + { + "epoch": 0.24964661258895054, + "grad_norm": 0.4728323221206665, + "learning_rate": 1.7078862297702797e-05, + "loss": 0.2547, + "step": 11200 + }, + { + "epoch": 0.2497580619695706, + "grad_norm": 0.6497337222099304, + "learning_rate": 1.707638879020338e-05, + "loss": 0.4015, + "step": 11205 + }, + { + "epoch": 0.24986951135019067, + "grad_norm": 0.6320534944534302, + "learning_rate": 1.7073914415193322e-05, + "loss": 0.3609, + "step": 11210 + }, + { + "epoch": 0.24998096073081075, + "grad_norm": 0.5705777406692505, + "learning_rate": 1.7071439172975956e-05, + "loss": 0.2253, + "step": 11215 + }, + { + "epoch": 0.2500924101114308, + "grad_norm": 0.40593892335891724, + "learning_rate": 1.706896306385473e-05, + "loss": 0.3606, + "step": 11220 + }, + { + "epoch": 0.2502038594920509, + "grad_norm": 0.5855808854103088, + "learning_rate": 1.7066486088133197e-05, + "loss": 0.3391, + "step": 11225 + }, + { + "epoch": 0.25031530887267095, + "grad_norm": 0.5672227740287781, + "learning_rate": 1.7064008246115014e-05, + "loss": 0.4191, + "step": 11230 + }, + { + "epoch": 0.250426758253291, + "grad_norm": 0.5538031458854675, + "learning_rate": 1.706152953810395e-05, + "loss": 0.3872, + "step": 11235 + }, + { + "epoch": 0.25053820763391105, + "grad_norm": 0.8302563428878784, + "learning_rate": 1.7059049964403868e-05, + "loss": 0.3195, + "step": 11240 + }, + { + "epoch": 0.2506496570145311, + "grad_norm": 0.506974458694458, + "learning_rate": 1.705656952531875e-05, + "loss": 0.3242, + "step": 11245 + }, + { + "epoch": 0.2507611063951512, + "grad_norm": 0.5065387487411499, + "learning_rate": 1.7054088221152673e-05, + "loss": 0.4196, + "step": 11250 + }, + { + "epoch": 0.2508725557757713, + "grad_norm": 0.5343696475028992, + "learning_rate": 1.705160605220983e-05, + "loss": 0.3731, + "step": 11255 + }, + { + "epoch": 0.25098400515639135, + "grad_norm": 0.7063585519790649, + "learning_rate": 1.7049123018794515e-05, + "loss": 0.3776, + "step": 11260 + }, + { + "epoch": 0.25109545453701143, + "grad_norm": 0.49221959710121155, + "learning_rate": 1.7046639121211127e-05, + "loss": 0.3237, + "step": 11265 + }, + { + "epoch": 0.25120690391763145, + "grad_norm": 0.7863365411758423, + "learning_rate": 1.704415435976418e-05, + "loss": 0.4793, + "step": 11270 + }, + { + "epoch": 0.25131835329825153, + "grad_norm": 0.44239675998687744, + "learning_rate": 1.7041668734758275e-05, + "loss": 0.3187, + "step": 11275 + }, + { + "epoch": 0.2514298026788716, + "grad_norm": 0.41563260555267334, + "learning_rate": 1.7039182246498143e-05, + "loss": 0.3485, + "step": 11280 + }, + { + "epoch": 0.2515412520594917, + "grad_norm": 0.7104516625404358, + "learning_rate": 1.7036694895288596e-05, + "loss": 0.3551, + "step": 11285 + }, + { + "epoch": 0.25165270144011176, + "grad_norm": 0.43757396936416626, + "learning_rate": 1.7034206681434574e-05, + "loss": 0.3258, + "step": 11290 + }, + { + "epoch": 0.25176415082073184, + "grad_norm": 0.6314382553100586, + "learning_rate": 1.7031717605241106e-05, + "loss": 0.3952, + "step": 11295 + }, + { + "epoch": 0.25187560020135186, + "grad_norm": 0.6406825184822083, + "learning_rate": 1.7029227667013337e-05, + "loss": 0.4376, + "step": 11300 + }, + { + "epoch": 0.25198704958197193, + "grad_norm": 0.6163500547409058, + "learning_rate": 1.702673686705651e-05, + "loss": 0.4085, + "step": 11305 + }, + { + "epoch": 0.252098498962592, + "grad_norm": 0.6222765445709229, + "learning_rate": 1.7024245205675986e-05, + "loss": 0.3946, + "step": 11310 + }, + { + "epoch": 0.2522099483432121, + "grad_norm": 0.7866464257240295, + "learning_rate": 1.702175268317722e-05, + "loss": 0.4104, + "step": 11315 + }, + { + "epoch": 0.25232139772383216, + "grad_norm": 0.6964669823646545, + "learning_rate": 1.701925929986577e-05, + "loss": 0.2906, + "step": 11320 + }, + { + "epoch": 0.25243284710445224, + "grad_norm": 0.6061902642250061, + "learning_rate": 1.701676505604731e-05, + "loss": 0.3071, + "step": 11325 + }, + { + "epoch": 0.25254429648507226, + "grad_norm": 0.4792914092540741, + "learning_rate": 1.7014269952027617e-05, + "loss": 0.4317, + "step": 11330 + }, + { + "epoch": 0.25265574586569234, + "grad_norm": 0.5628675222396851, + "learning_rate": 1.701177398811257e-05, + "loss": 0.4445, + "step": 11335 + }, + { + "epoch": 0.2527671952463124, + "grad_norm": 0.5403448939323425, + "learning_rate": 1.7009277164608155e-05, + "loss": 0.2302, + "step": 11340 + }, + { + "epoch": 0.2528786446269325, + "grad_norm": 0.6103495359420776, + "learning_rate": 1.7006779481820462e-05, + "loss": 0.4181, + "step": 11345 + }, + { + "epoch": 0.25299009400755257, + "grad_norm": 0.3714889883995056, + "learning_rate": 1.700428094005569e-05, + "loss": 0.369, + "step": 11350 + }, + { + "epoch": 0.25310154338817265, + "grad_norm": 0.7734758257865906, + "learning_rate": 1.7001781539620135e-05, + "loss": 0.5424, + "step": 11355 + }, + { + "epoch": 0.25321299276879267, + "grad_norm": 0.6347566246986389, + "learning_rate": 1.6999281280820214e-05, + "loss": 0.4152, + "step": 11360 + }, + { + "epoch": 0.25332444214941274, + "grad_norm": 0.7742220759391785, + "learning_rate": 1.6996780163962432e-05, + "loss": 0.3414, + "step": 11365 + }, + { + "epoch": 0.2534358915300328, + "grad_norm": 0.6309844255447388, + "learning_rate": 1.699427818935341e-05, + "loss": 0.4047, + "step": 11370 + }, + { + "epoch": 0.2535473409106529, + "grad_norm": 0.7506263256072998, + "learning_rate": 1.6991775357299867e-05, + "loss": 0.3317, + "step": 11375 + }, + { + "epoch": 0.253658790291273, + "grad_norm": 0.6002774834632874, + "learning_rate": 1.6989271668108632e-05, + "loss": 0.3938, + "step": 11380 + }, + { + "epoch": 0.25377023967189305, + "grad_norm": 0.6120704412460327, + "learning_rate": 1.6986767122086644e-05, + "loss": 0.2883, + "step": 11385 + }, + { + "epoch": 0.25388168905251307, + "grad_norm": 0.5829112529754639, + "learning_rate": 1.6984261719540935e-05, + "loss": 0.4873, + "step": 11390 + }, + { + "epoch": 0.25399313843313315, + "grad_norm": 0.5210264921188354, + "learning_rate": 1.698175546077865e-05, + "loss": 0.3445, + "step": 11395 + }, + { + "epoch": 0.2541045878137532, + "grad_norm": 0.5267536640167236, + "learning_rate": 1.6979248346107037e-05, + "loss": 0.2697, + "step": 11400 + }, + { + "epoch": 0.2542160371943733, + "grad_norm": 0.6789841651916504, + "learning_rate": 1.6976740375833444e-05, + "loss": 0.2485, + "step": 11405 + }, + { + "epoch": 0.2543274865749934, + "grad_norm": 0.5412545800209045, + "learning_rate": 1.6974231550265338e-05, + "loss": 0.3462, + "step": 11410 + }, + { + "epoch": 0.2544389359556134, + "grad_norm": 0.40079477429389954, + "learning_rate": 1.6971721869710275e-05, + "loss": 0.4159, + "step": 11415 + }, + { + "epoch": 0.2545503853362335, + "grad_norm": 0.5569183826446533, + "learning_rate": 1.6969211334475923e-05, + "loss": 0.3448, + "step": 11420 + }, + { + "epoch": 0.25466183471685355, + "grad_norm": 0.6156569719314575, + "learning_rate": 1.6966699944870052e-05, + "loss": 0.2425, + "step": 11425 + }, + { + "epoch": 0.25477328409747363, + "grad_norm": 0.681559145450592, + "learning_rate": 1.696418770120055e-05, + "loss": 0.4076, + "step": 11430 + }, + { + "epoch": 0.2548847334780937, + "grad_norm": 0.6507387161254883, + "learning_rate": 1.6961674603775388e-05, + "loss": 0.4648, + "step": 11435 + }, + { + "epoch": 0.2549961828587138, + "grad_norm": 0.5616104602813721, + "learning_rate": 1.6959160652902654e-05, + "loss": 0.3365, + "step": 11440 + }, + { + "epoch": 0.2551076322393338, + "grad_norm": 0.689851701259613, + "learning_rate": 1.695664584889054e-05, + "loss": 0.328, + "step": 11445 + }, + { + "epoch": 0.2552190816199539, + "grad_norm": 0.6259694695472717, + "learning_rate": 1.6954130192047346e-05, + "loss": 0.5502, + "step": 11450 + }, + { + "epoch": 0.25533053100057396, + "grad_norm": 0.6690799593925476, + "learning_rate": 1.6951613682681465e-05, + "loss": 0.3411, + "step": 11455 + }, + { + "epoch": 0.25544198038119403, + "grad_norm": 0.6366260647773743, + "learning_rate": 1.6949096321101404e-05, + "loss": 0.3378, + "step": 11460 + }, + { + "epoch": 0.2555534297618141, + "grad_norm": 0.578250527381897, + "learning_rate": 1.6946578107615774e-05, + "loss": 0.4496, + "step": 11465 + }, + { + "epoch": 0.2556648791424342, + "grad_norm": 0.5326183438301086, + "learning_rate": 1.694405904253329e-05, + "loss": 0.2232, + "step": 11470 + }, + { + "epoch": 0.2557763285230542, + "grad_norm": 0.6691266298294067, + "learning_rate": 1.6941539126162766e-05, + "loss": 0.4133, + "step": 11475 + }, + { + "epoch": 0.2558877779036743, + "grad_norm": 0.5321334004402161, + "learning_rate": 1.6939018358813122e-05, + "loss": 0.2617, + "step": 11480 + }, + { + "epoch": 0.25599922728429436, + "grad_norm": 0.6388453245162964, + "learning_rate": 1.6936496740793395e-05, + "loss": 0.3585, + "step": 11485 + }, + { + "epoch": 0.25611067666491444, + "grad_norm": 0.4702873229980469, + "learning_rate": 1.69339742724127e-05, + "loss": 0.3783, + "step": 11490 + }, + { + "epoch": 0.2562221260455345, + "grad_norm": 0.6191978454589844, + "learning_rate": 1.6931450953980285e-05, + "loss": 0.4226, + "step": 11495 + }, + { + "epoch": 0.2563335754261546, + "grad_norm": 0.46512070298194885, + "learning_rate": 1.692892678580549e-05, + "loss": 0.3928, + "step": 11500 + }, + { + "epoch": 0.2564450248067746, + "grad_norm": 0.6981931924819946, + "learning_rate": 1.692640176819775e-05, + "loss": 0.2401, + "step": 11505 + }, + { + "epoch": 0.2565564741873947, + "grad_norm": 0.6449040174484253, + "learning_rate": 1.692387590146662e-05, + "loss": 0.3707, + "step": 11510 + }, + { + "epoch": 0.25666792356801477, + "grad_norm": 0.6683300733566284, + "learning_rate": 1.6921349185921744e-05, + "loss": 0.3094, + "step": 11515 + }, + { + "epoch": 0.25677937294863484, + "grad_norm": 0.4521041810512543, + "learning_rate": 1.6918821621872886e-05, + "loss": 0.436, + "step": 11520 + }, + { + "epoch": 0.2568908223292549, + "grad_norm": 0.6098304390907288, + "learning_rate": 1.69162932096299e-05, + "loss": 0.3315, + "step": 11525 + }, + { + "epoch": 0.257002271709875, + "grad_norm": 0.4085136950016022, + "learning_rate": 1.6913763949502754e-05, + "loss": 0.3717, + "step": 11530 + }, + { + "epoch": 0.257113721090495, + "grad_norm": 0.6487431526184082, + "learning_rate": 1.691123384180151e-05, + "loss": 0.3725, + "step": 11535 + }, + { + "epoch": 0.2572251704711151, + "grad_norm": 0.6593152284622192, + "learning_rate": 1.690870288683635e-05, + "loss": 0.4701, + "step": 11540 + }, + { + "epoch": 0.25733661985173517, + "grad_norm": 0.5475544929504395, + "learning_rate": 1.690617108491754e-05, + "loss": 0.3776, + "step": 11545 + }, + { + "epoch": 0.25744806923235525, + "grad_norm": 0.6540517807006836, + "learning_rate": 1.690363843635546e-05, + "loss": 0.3261, + "step": 11550 + }, + { + "epoch": 0.2575595186129753, + "grad_norm": 0.628803014755249, + "learning_rate": 1.69011049414606e-05, + "loss": 0.4249, + "step": 11555 + }, + { + "epoch": 0.2576709679935954, + "grad_norm": 0.7063155174255371, + "learning_rate": 1.689857060054354e-05, + "loss": 0.3277, + "step": 11560 + }, + { + "epoch": 0.2577824173742154, + "grad_norm": 0.7075435519218445, + "learning_rate": 1.689603541391497e-05, + "loss": 0.3694, + "step": 11565 + }, + { + "epoch": 0.2578938667548355, + "grad_norm": 0.5980352759361267, + "learning_rate": 1.6893499381885693e-05, + "loss": 0.2849, + "step": 11570 + }, + { + "epoch": 0.2580053161354556, + "grad_norm": 0.7767646908760071, + "learning_rate": 1.68909625047666e-05, + "loss": 0.3956, + "step": 11575 + }, + { + "epoch": 0.25811676551607565, + "grad_norm": 1.3503165245056152, + "learning_rate": 1.6888424782868692e-05, + "loss": 0.3896, + "step": 11580 + }, + { + "epoch": 0.25822821489669573, + "grad_norm": 0.6707198619842529, + "learning_rate": 1.6885886216503077e-05, + "loss": 0.3116, + "step": 11585 + }, + { + "epoch": 0.2583396642773158, + "grad_norm": 0.781755268573761, + "learning_rate": 1.6883346805980964e-05, + "loss": 0.3923, + "step": 11590 + }, + { + "epoch": 0.2584511136579358, + "grad_norm": 0.6143796443939209, + "learning_rate": 1.6880806551613662e-05, + "loss": 0.443, + "step": 11595 + }, + { + "epoch": 0.2585625630385559, + "grad_norm": 0.6381718516349792, + "learning_rate": 1.6878265453712587e-05, + "loss": 0.3774, + "step": 11600 + }, + { + "epoch": 0.258674012419176, + "grad_norm": 0.7067936062812805, + "learning_rate": 1.6875723512589264e-05, + "loss": 0.4032, + "step": 11605 + }, + { + "epoch": 0.25878546179979606, + "grad_norm": 0.5545554757118225, + "learning_rate": 1.687318072855531e-05, + "loss": 0.3526, + "step": 11610 + }, + { + "epoch": 0.25889691118041613, + "grad_norm": 0.8011837005615234, + "learning_rate": 1.687063710192245e-05, + "loss": 0.5166, + "step": 11615 + }, + { + "epoch": 0.25900836056103616, + "grad_norm": 0.5507569909095764, + "learning_rate": 1.6868092633002514e-05, + "loss": 0.4307, + "step": 11620 + }, + { + "epoch": 0.25911980994165623, + "grad_norm": 0.2780143916606903, + "learning_rate": 1.6865547322107434e-05, + "loss": 0.2921, + "step": 11625 + }, + { + "epoch": 0.2592312593222763, + "grad_norm": 0.44053661823272705, + "learning_rate": 1.686300116954925e-05, + "loss": 0.3779, + "step": 11630 + }, + { + "epoch": 0.2593427087028964, + "grad_norm": 0.5424075722694397, + "learning_rate": 1.6860454175640094e-05, + "loss": 0.4212, + "step": 11635 + }, + { + "epoch": 0.25945415808351646, + "grad_norm": 0.4896464943885803, + "learning_rate": 1.685790634069221e-05, + "loss": 0.3013, + "step": 11640 + }, + { + "epoch": 0.25956560746413654, + "grad_norm": 0.7107864022254944, + "learning_rate": 1.6855357665017944e-05, + "loss": 0.263, + "step": 11645 + }, + { + "epoch": 0.25967705684475656, + "grad_norm": 0.6286140084266663, + "learning_rate": 1.6852808148929745e-05, + "loss": 0.4639, + "step": 11650 + }, + { + "epoch": 0.25978850622537664, + "grad_norm": 0.6039460301399231, + "learning_rate": 1.685025779274016e-05, + "loss": 0.3377, + "step": 11655 + }, + { + "epoch": 0.2598999556059967, + "grad_norm": 0.7437496185302734, + "learning_rate": 1.6847706596761848e-05, + "loss": 0.3962, + "step": 11660 + }, + { + "epoch": 0.2600114049866168, + "grad_norm": 0.6204871535301208, + "learning_rate": 1.6845154561307562e-05, + "loss": 0.1816, + "step": 11665 + }, + { + "epoch": 0.26012285436723687, + "grad_norm": 0.5137778520584106, + "learning_rate": 1.6842601686690162e-05, + "loss": 0.3548, + "step": 11670 + }, + { + "epoch": 0.26023430374785694, + "grad_norm": 0.7160062789916992, + "learning_rate": 1.6840047973222615e-05, + "loss": 0.3825, + "step": 11675 + }, + { + "epoch": 0.26034575312847696, + "grad_norm": 0.42000359296798706, + "learning_rate": 1.6837493421217982e-05, + "loss": 0.439, + "step": 11680 + }, + { + "epoch": 0.26045720250909704, + "grad_norm": 0.6821982264518738, + "learning_rate": 1.6834938030989433e-05, + "loss": 0.3363, + "step": 11685 + }, + { + "epoch": 0.2605686518897171, + "grad_norm": 0.585969090461731, + "learning_rate": 1.683238180285024e-05, + "loss": 0.2724, + "step": 11690 + }, + { + "epoch": 0.2606801012703372, + "grad_norm": 0.588124692440033, + "learning_rate": 1.6829824737113775e-05, + "loss": 0.4278, + "step": 11695 + }, + { + "epoch": 0.26079155065095727, + "grad_norm": 0.5431883931159973, + "learning_rate": 1.6827266834093517e-05, + "loss": 0.4278, + "step": 11700 + }, + { + "epoch": 0.26090300003157735, + "grad_norm": 0.38738054037094116, + "learning_rate": 1.6824708094103043e-05, + "loss": 0.3955, + "step": 11705 + }, + { + "epoch": 0.26101444941219737, + "grad_norm": 0.3914400041103363, + "learning_rate": 1.6822148517456033e-05, + "loss": 0.231, + "step": 11710 + }, + { + "epoch": 0.26112589879281745, + "grad_norm": 0.7311981916427612, + "learning_rate": 1.6819588104466275e-05, + "loss": 0.3843, + "step": 11715 + }, + { + "epoch": 0.2612373481734375, + "grad_norm": 0.4924669563770294, + "learning_rate": 1.6817026855447657e-05, + "loss": 0.3251, + "step": 11720 + }, + { + "epoch": 0.2613487975540576, + "grad_norm": 0.5667435526847839, + "learning_rate": 1.6814464770714162e-05, + "loss": 0.2467, + "step": 11725 + }, + { + "epoch": 0.2614602469346777, + "grad_norm": 0.6493402123451233, + "learning_rate": 1.6811901850579884e-05, + "loss": 0.3649, + "step": 11730 + }, + { + "epoch": 0.26157169631529775, + "grad_norm": 0.5301352739334106, + "learning_rate": 1.6809338095359026e-05, + "loss": 0.5046, + "step": 11735 + }, + { + "epoch": 0.2616831456959178, + "grad_norm": 0.4617167115211487, + "learning_rate": 1.6806773505365873e-05, + "loss": 0.3497, + "step": 11740 + }, + { + "epoch": 0.26179459507653785, + "grad_norm": 0.5654996633529663, + "learning_rate": 1.6804208080914824e-05, + "loss": 0.3925, + "step": 11745 + }, + { + "epoch": 0.2619060444571579, + "grad_norm": 0.4751913845539093, + "learning_rate": 1.6801641822320392e-05, + "loss": 0.3844, + "step": 11750 + }, + { + "epoch": 0.262017493837778, + "grad_norm": 0.703079342842102, + "learning_rate": 1.6799074729897173e-05, + "loss": 0.3697, + "step": 11755 + }, + { + "epoch": 0.2621289432183981, + "grad_norm": 0.5409825444221497, + "learning_rate": 1.679650680395987e-05, + "loss": 0.5028, + "step": 11760 + }, + { + "epoch": 0.26224039259901816, + "grad_norm": 0.5090709328651428, + "learning_rate": 1.6793938044823295e-05, + "loss": 0.4512, + "step": 11765 + }, + { + "epoch": 0.2623518419796382, + "grad_norm": 0.35236915946006775, + "learning_rate": 1.6791368452802356e-05, + "loss": 0.3593, + "step": 11770 + }, + { + "epoch": 0.26246329136025826, + "grad_norm": 0.46630239486694336, + "learning_rate": 1.6788798028212068e-05, + "loss": 0.3396, + "step": 11775 + }, + { + "epoch": 0.26257474074087833, + "grad_norm": 0.5448976159095764, + "learning_rate": 1.678622677136754e-05, + "loss": 0.3422, + "step": 11780 + }, + { + "epoch": 0.2626861901214984, + "grad_norm": 0.5812391042709351, + "learning_rate": 1.6783654682583995e-05, + "loss": 0.3253, + "step": 11785 + }, + { + "epoch": 0.2627976395021185, + "grad_norm": 0.3605920374393463, + "learning_rate": 1.6781081762176745e-05, + "loss": 0.3841, + "step": 11790 + }, + { + "epoch": 0.26290908888273856, + "grad_norm": 0.6160728931427002, + "learning_rate": 1.6778508010461213e-05, + "loss": 0.3686, + "step": 11795 + }, + { + "epoch": 0.2630205382633586, + "grad_norm": 0.5062189102172852, + "learning_rate": 1.6775933427752922e-05, + "loss": 0.3245, + "step": 11800 + }, + { + "epoch": 0.26313198764397866, + "grad_norm": 0.6029192209243774, + "learning_rate": 1.6773358014367494e-05, + "loss": 0.2799, + "step": 11805 + }, + { + "epoch": 0.26324343702459874, + "grad_norm": 0.5310506820678711, + "learning_rate": 1.677078177062066e-05, + "loss": 0.3318, + "step": 11810 + }, + { + "epoch": 0.2633548864052188, + "grad_norm": 0.4977557957172394, + "learning_rate": 1.6768204696828236e-05, + "loss": 0.2938, + "step": 11815 + }, + { + "epoch": 0.2634663357858389, + "grad_norm": 0.543250322341919, + "learning_rate": 1.6765626793306164e-05, + "loss": 0.3897, + "step": 11820 + }, + { + "epoch": 0.2635777851664589, + "grad_norm": 0.572293221950531, + "learning_rate": 1.676304806037047e-05, + "loss": 0.3455, + "step": 11825 + }, + { + "epoch": 0.263689234547079, + "grad_norm": 0.859261155128479, + "learning_rate": 1.6760468498337285e-05, + "loss": 0.3914, + "step": 11830 + }, + { + "epoch": 0.26380068392769906, + "grad_norm": 0.5799628496170044, + "learning_rate": 1.6757888107522845e-05, + "loss": 0.4242, + "step": 11835 + }, + { + "epoch": 0.26391213330831914, + "grad_norm": 0.4707748293876648, + "learning_rate": 1.6755306888243487e-05, + "loss": 0.4087, + "step": 11840 + }, + { + "epoch": 0.2640235826889392, + "grad_norm": 0.6998196244239807, + "learning_rate": 1.6752724840815643e-05, + "loss": 0.4391, + "step": 11845 + }, + { + "epoch": 0.2641350320695593, + "grad_norm": 0.3416438698768616, + "learning_rate": 1.6750141965555858e-05, + "loss": 0.3157, + "step": 11850 + }, + { + "epoch": 0.2642464814501793, + "grad_norm": 0.582663893699646, + "learning_rate": 1.6747558262780775e-05, + "loss": 0.2197, + "step": 11855 + }, + { + "epoch": 0.2643579308307994, + "grad_norm": 0.6217628717422485, + "learning_rate": 1.6744973732807128e-05, + "loss": 0.3317, + "step": 11860 + }, + { + "epoch": 0.26446938021141947, + "grad_norm": 0.4774651527404785, + "learning_rate": 1.6742388375951767e-05, + "loss": 0.341, + "step": 11865 + }, + { + "epoch": 0.26458082959203955, + "grad_norm": 0.6677722334861755, + "learning_rate": 1.6739802192531633e-05, + "loss": 0.3857, + "step": 11870 + }, + { + "epoch": 0.2646922789726596, + "grad_norm": 0.5869291424751282, + "learning_rate": 1.6737215182863775e-05, + "loss": 0.3785, + "step": 11875 + }, + { + "epoch": 0.2648037283532797, + "grad_norm": 0.7363972067832947, + "learning_rate": 1.673462734726534e-05, + "loss": 0.3204, + "step": 11880 + }, + { + "epoch": 0.2649151777338997, + "grad_norm": 0.676572859287262, + "learning_rate": 1.6732038686053572e-05, + "loss": 0.3858, + "step": 11885 + }, + { + "epoch": 0.2650266271145198, + "grad_norm": 0.4292004108428955, + "learning_rate": 1.6729449199545828e-05, + "loss": 0.2801, + "step": 11890 + }, + { + "epoch": 0.2651380764951399, + "grad_norm": 0.6198212504386902, + "learning_rate": 1.6726858888059553e-05, + "loss": 0.3531, + "step": 11895 + }, + { + "epoch": 0.26524952587575995, + "grad_norm": 0.5823439955711365, + "learning_rate": 1.67242677519123e-05, + "loss": 0.3468, + "step": 11900 + }, + { + "epoch": 0.26536097525638, + "grad_norm": 0.5388737916946411, + "learning_rate": 1.672167579142173e-05, + "loss": 0.3273, + "step": 11905 + }, + { + "epoch": 0.2654724246370001, + "grad_norm": 0.5490552186965942, + "learning_rate": 1.6719083006905586e-05, + "loss": 0.2722, + "step": 11910 + }, + { + "epoch": 0.2655838740176201, + "grad_norm": 0.5554761290550232, + "learning_rate": 1.671648939868173e-05, + "loss": 0.319, + "step": 11915 + }, + { + "epoch": 0.2656953233982402, + "grad_norm": 0.6012527346611023, + "learning_rate": 1.6713894967068117e-05, + "loss": 0.4606, + "step": 11920 + }, + { + "epoch": 0.2658067727788603, + "grad_norm": 0.7309210896492004, + "learning_rate": 1.6711299712382807e-05, + "loss": 0.3519, + "step": 11925 + }, + { + "epoch": 0.26591822215948036, + "grad_norm": 0.5101271271705627, + "learning_rate": 1.6708703634943954e-05, + "loss": 0.2698, + "step": 11930 + }, + { + "epoch": 0.26602967154010043, + "grad_norm": 0.7300100326538086, + "learning_rate": 1.6706106735069817e-05, + "loss": 0.4509, + "step": 11935 + }, + { + "epoch": 0.2661411209207205, + "grad_norm": 0.5938910245895386, + "learning_rate": 1.6703509013078756e-05, + "loss": 0.357, + "step": 11940 + }, + { + "epoch": 0.26625257030134053, + "grad_norm": 0.6169475317001343, + "learning_rate": 1.6700910469289238e-05, + "loss": 0.3244, + "step": 11945 + }, + { + "epoch": 0.2663640196819606, + "grad_norm": 0.5854972004890442, + "learning_rate": 1.669831110401982e-05, + "loss": 0.3573, + "step": 11950 + }, + { + "epoch": 0.2664754690625807, + "grad_norm": 0.4386449158191681, + "learning_rate": 1.6695710917589156e-05, + "loss": 0.359, + "step": 11955 + }, + { + "epoch": 0.26658691844320076, + "grad_norm": 0.6892266273498535, + "learning_rate": 1.669310991031602e-05, + "loss": 0.4041, + "step": 11960 + }, + { + "epoch": 0.26669836782382084, + "grad_norm": 0.5055707693099976, + "learning_rate": 1.6690508082519275e-05, + "loss": 0.3592, + "step": 11965 + }, + { + "epoch": 0.2668098172044409, + "grad_norm": 0.38702186942100525, + "learning_rate": 1.668790543451788e-05, + "loss": 0.4146, + "step": 11970 + }, + { + "epoch": 0.26692126658506093, + "grad_norm": 0.5644632577896118, + "learning_rate": 1.6685301966630903e-05, + "loss": 0.4233, + "step": 11975 + }, + { + "epoch": 0.267032715965681, + "grad_norm": 0.5067944526672363, + "learning_rate": 1.6682697679177506e-05, + "loss": 0.2752, + "step": 11980 + }, + { + "epoch": 0.2671441653463011, + "grad_norm": 0.54136723279953, + "learning_rate": 1.6680092572476956e-05, + "loss": 0.2532, + "step": 11985 + }, + { + "epoch": 0.26725561472692116, + "grad_norm": 0.4757809042930603, + "learning_rate": 1.667748664684862e-05, + "loss": 0.3329, + "step": 11990 + }, + { + "epoch": 0.26736706410754124, + "grad_norm": 0.4774401783943176, + "learning_rate": 1.6674879902611964e-05, + "loss": 0.438, + "step": 11995 + }, + { + "epoch": 0.26747851348816126, + "grad_norm": 0.6735679507255554, + "learning_rate": 1.667227234008655e-05, + "loss": 0.4542, + "step": 12000 + }, + { + "epoch": 0.26758996286878134, + "grad_norm": 0.6603081822395325, + "learning_rate": 1.666966395959205e-05, + "loss": 0.3418, + "step": 12005 + }, + { + "epoch": 0.2677014122494014, + "grad_norm": 0.5100643634796143, + "learning_rate": 1.6667054761448233e-05, + "loss": 0.342, + "step": 12010 + }, + { + "epoch": 0.2678128616300215, + "grad_norm": 0.5056988000869751, + "learning_rate": 1.6664444745974964e-05, + "loss": 0.4094, + "step": 12015 + }, + { + "epoch": 0.26792431101064157, + "grad_norm": 0.504252016544342, + "learning_rate": 1.666183391349221e-05, + "loss": 0.3087, + "step": 12020 + }, + { + "epoch": 0.26803576039126165, + "grad_norm": 0.658033013343811, + "learning_rate": 1.6659222264320038e-05, + "loss": 0.3112, + "step": 12025 + }, + { + "epoch": 0.26814720977188167, + "grad_norm": 0.43579134345054626, + "learning_rate": 1.665660979877862e-05, + "loss": 0.1689, + "step": 12030 + }, + { + "epoch": 0.26825865915250174, + "grad_norm": 0.7889179587364197, + "learning_rate": 1.6653996517188224e-05, + "loss": 0.2845, + "step": 12035 + }, + { + "epoch": 0.2683701085331218, + "grad_norm": 0.4142740070819855, + "learning_rate": 1.665138241986921e-05, + "loss": 0.2759, + "step": 12040 + }, + { + "epoch": 0.2684815579137419, + "grad_norm": 0.5060544610023499, + "learning_rate": 1.6648767507142056e-05, + "loss": 0.3471, + "step": 12045 + }, + { + "epoch": 0.268593007294362, + "grad_norm": 0.6566082835197449, + "learning_rate": 1.6646151779327326e-05, + "loss": 0.4258, + "step": 12050 + }, + { + "epoch": 0.26870445667498205, + "grad_norm": 0.9004737734794617, + "learning_rate": 1.664353523674569e-05, + "loss": 0.4168, + "step": 12055 + }, + { + "epoch": 0.26881590605560207, + "grad_norm": 0.6746425032615662, + "learning_rate": 1.6640917879717907e-05, + "loss": 0.2759, + "step": 12060 + }, + { + "epoch": 0.26892735543622215, + "grad_norm": 0.6046661138534546, + "learning_rate": 1.6638299708564856e-05, + "loss": 0.4257, + "step": 12065 + }, + { + "epoch": 0.2690388048168422, + "grad_norm": 0.7445327639579773, + "learning_rate": 1.6635680723607507e-05, + "loss": 0.3879, + "step": 12070 + }, + { + "epoch": 0.2691502541974623, + "grad_norm": 0.46639499068260193, + "learning_rate": 1.6633060925166914e-05, + "loss": 0.4192, + "step": 12075 + }, + { + "epoch": 0.2692617035780824, + "grad_norm": 0.5227963328361511, + "learning_rate": 1.663044031356425e-05, + "loss": 0.3057, + "step": 12080 + }, + { + "epoch": 0.26937315295870246, + "grad_norm": 0.6597884893417358, + "learning_rate": 1.6627818889120787e-05, + "loss": 0.4571, + "step": 12085 + }, + { + "epoch": 0.2694846023393225, + "grad_norm": 0.6032409071922302, + "learning_rate": 1.6625196652157883e-05, + "loss": 0.3079, + "step": 12090 + }, + { + "epoch": 0.26959605171994255, + "grad_norm": 0.5562949776649475, + "learning_rate": 1.662257360299701e-05, + "loss": 0.3328, + "step": 12095 + }, + { + "epoch": 0.26970750110056263, + "grad_norm": 0.5631253123283386, + "learning_rate": 1.6619949741959734e-05, + "loss": 0.3595, + "step": 12100 + }, + { + "epoch": 0.2698189504811827, + "grad_norm": 0.7083554863929749, + "learning_rate": 1.6617325069367715e-05, + "loss": 0.3311, + "step": 12105 + }, + { + "epoch": 0.2699303998618028, + "grad_norm": 0.5315176844596863, + "learning_rate": 1.6614699585542722e-05, + "loss": 0.2935, + "step": 12110 + }, + { + "epoch": 0.27004184924242286, + "grad_norm": 0.8315207958221436, + "learning_rate": 1.6612073290806617e-05, + "loss": 0.4054, + "step": 12115 + }, + { + "epoch": 0.2701532986230429, + "grad_norm": 0.6667119860649109, + "learning_rate": 1.660944618548136e-05, + "loss": 0.4269, + "step": 12120 + }, + { + "epoch": 0.27026474800366296, + "grad_norm": 0.6344430446624756, + "learning_rate": 1.660681826988902e-05, + "loss": 0.448, + "step": 12125 + }, + { + "epoch": 0.27037619738428303, + "grad_norm": 0.5542057156562805, + "learning_rate": 1.660418954435176e-05, + "loss": 0.4683, + "step": 12130 + }, + { + "epoch": 0.2704876467649031, + "grad_norm": 0.634754478931427, + "learning_rate": 1.6601560009191837e-05, + "loss": 0.3437, + "step": 12135 + }, + { + "epoch": 0.2705990961455232, + "grad_norm": 0.892122745513916, + "learning_rate": 1.6598929664731613e-05, + "loss": 0.5243, + "step": 12140 + }, + { + "epoch": 0.27071054552614326, + "grad_norm": 0.7992680668830872, + "learning_rate": 1.659629851129355e-05, + "loss": 0.3108, + "step": 12145 + }, + { + "epoch": 0.2708219949067633, + "grad_norm": 0.6725826263427734, + "learning_rate": 1.6593666549200202e-05, + "loss": 0.3965, + "step": 12150 + }, + { + "epoch": 0.27093344428738336, + "grad_norm": 0.6233682036399841, + "learning_rate": 1.659103377877423e-05, + "loss": 0.4429, + "step": 12155 + }, + { + "epoch": 0.27104489366800344, + "grad_norm": 0.5028186440467834, + "learning_rate": 1.658840020033839e-05, + "loss": 0.3574, + "step": 12160 + }, + { + "epoch": 0.2711563430486235, + "grad_norm": 0.46699824929237366, + "learning_rate": 1.6585765814215547e-05, + "loss": 0.364, + "step": 12165 + }, + { + "epoch": 0.2712677924292436, + "grad_norm": 0.5705031752586365, + "learning_rate": 1.6583130620728643e-05, + "loss": 0.3392, + "step": 12170 + }, + { + "epoch": 0.27137924180986367, + "grad_norm": 0.7525768876075745, + "learning_rate": 1.658049462020074e-05, + "loss": 0.3572, + "step": 12175 + }, + { + "epoch": 0.2714906911904837, + "grad_norm": 0.49981358647346497, + "learning_rate": 1.6577857812954994e-05, + "loss": 0.3044, + "step": 12180 + }, + { + "epoch": 0.27160214057110377, + "grad_norm": 0.6712756156921387, + "learning_rate": 1.657522019931465e-05, + "loss": 0.3959, + "step": 12185 + }, + { + "epoch": 0.27171358995172384, + "grad_norm": 0.5159211158752441, + "learning_rate": 1.6572581779603062e-05, + "loss": 0.3127, + "step": 12190 + }, + { + "epoch": 0.2718250393323439, + "grad_norm": 0.8316812515258789, + "learning_rate": 1.6569942554143686e-05, + "loss": 0.3857, + "step": 12195 + }, + { + "epoch": 0.271936488712964, + "grad_norm": 0.5915578603744507, + "learning_rate": 1.6567302523260057e-05, + "loss": 0.3696, + "step": 12200 + }, + { + "epoch": 0.272047938093584, + "grad_norm": 0.5893950462341309, + "learning_rate": 1.6564661687275836e-05, + "loss": 0.5104, + "step": 12205 + }, + { + "epoch": 0.2721593874742041, + "grad_norm": 0.4903256595134735, + "learning_rate": 1.6562020046514763e-05, + "loss": 0.4969, + "step": 12210 + }, + { + "epoch": 0.27227083685482417, + "grad_norm": 0.7330714464187622, + "learning_rate": 1.6559377601300683e-05, + "loss": 0.3735, + "step": 12215 + }, + { + "epoch": 0.27238228623544425, + "grad_norm": 0.6421142220497131, + "learning_rate": 1.6556734351957534e-05, + "loss": 0.2828, + "step": 12220 + }, + { + "epoch": 0.2724937356160643, + "grad_norm": 0.43036210536956787, + "learning_rate": 1.6554090298809372e-05, + "loss": 0.416, + "step": 12225 + }, + { + "epoch": 0.2726051849966844, + "grad_norm": 0.48010724782943726, + "learning_rate": 1.6551445442180325e-05, + "loss": 0.3933, + "step": 12230 + }, + { + "epoch": 0.2727166343773044, + "grad_norm": 0.4992339015007019, + "learning_rate": 1.6548799782394637e-05, + "loss": 0.3862, + "step": 12235 + }, + { + "epoch": 0.2728280837579245, + "grad_norm": 0.5170784592628479, + "learning_rate": 1.6546153319776644e-05, + "loss": 0.3221, + "step": 12240 + }, + { + "epoch": 0.2729395331385446, + "grad_norm": 0.753477156162262, + "learning_rate": 1.654350605465078e-05, + "loss": 0.3116, + "step": 12245 + }, + { + "epoch": 0.27305098251916465, + "grad_norm": 0.6622695922851562, + "learning_rate": 1.654085798734159e-05, + "loss": 0.3437, + "step": 12250 + }, + { + "epoch": 0.27316243189978473, + "grad_norm": 0.588044285774231, + "learning_rate": 1.6538209118173688e-05, + "loss": 0.3554, + "step": 12255 + }, + { + "epoch": 0.2732738812804048, + "grad_norm": 0.8749666810035706, + "learning_rate": 1.653555944747182e-05, + "loss": 0.3499, + "step": 12260 + }, + { + "epoch": 0.27338533066102483, + "grad_norm": 0.6158493757247925, + "learning_rate": 1.6532908975560813e-05, + "loss": 0.2875, + "step": 12265 + }, + { + "epoch": 0.2734967800416449, + "grad_norm": 0.46477964520454407, + "learning_rate": 1.653025770276559e-05, + "loss": 0.3704, + "step": 12270 + }, + { + "epoch": 0.273608229422265, + "grad_norm": 0.6315426826477051, + "learning_rate": 1.652760562941118e-05, + "loss": 0.4398, + "step": 12275 + }, + { + "epoch": 0.27371967880288506, + "grad_norm": 0.6809016466140747, + "learning_rate": 1.6524952755822702e-05, + "loss": 0.4204, + "step": 12280 + }, + { + "epoch": 0.27383112818350513, + "grad_norm": 0.45368507504463196, + "learning_rate": 1.6522299082325385e-05, + "loss": 0.3416, + "step": 12285 + }, + { + "epoch": 0.2739425775641252, + "grad_norm": 0.6435974836349487, + "learning_rate": 1.651964460924454e-05, + "loss": 0.2746, + "step": 12290 + }, + { + "epoch": 0.27405402694474523, + "grad_norm": 0.585220992565155, + "learning_rate": 1.6516989336905594e-05, + "loss": 0.3996, + "step": 12295 + }, + { + "epoch": 0.2741654763253653, + "grad_norm": 0.632941484451294, + "learning_rate": 1.6514333265634058e-05, + "loss": 0.3072, + "step": 12300 + }, + { + "epoch": 0.2742769257059854, + "grad_norm": 0.5073537230491638, + "learning_rate": 1.6511676395755546e-05, + "loss": 0.3857, + "step": 12305 + }, + { + "epoch": 0.27438837508660546, + "grad_norm": 0.5666691660881042, + "learning_rate": 1.650901872759577e-05, + "loss": 0.4126, + "step": 12310 + }, + { + "epoch": 0.27449982446722554, + "grad_norm": 0.44631102681159973, + "learning_rate": 1.6506360261480537e-05, + "loss": 0.3289, + "step": 12315 + }, + { + "epoch": 0.2746112738478456, + "grad_norm": 0.560738205909729, + "learning_rate": 1.6503700997735764e-05, + "loss": 0.4752, + "step": 12320 + }, + { + "epoch": 0.27472272322846564, + "grad_norm": 0.5568654537200928, + "learning_rate": 1.6501040936687444e-05, + "loss": 0.3097, + "step": 12325 + }, + { + "epoch": 0.2748341726090857, + "grad_norm": 0.430672287940979, + "learning_rate": 1.6498380078661686e-05, + "loss": 0.2771, + "step": 12330 + }, + { + "epoch": 0.2749456219897058, + "grad_norm": 0.7009373307228088, + "learning_rate": 1.649571842398469e-05, + "loss": 0.315, + "step": 12335 + }, + { + "epoch": 0.27505707137032587, + "grad_norm": 0.5214234590530396, + "learning_rate": 1.649305597298276e-05, + "loss": 0.2682, + "step": 12340 + }, + { + "epoch": 0.27516852075094594, + "grad_norm": 0.6840612292289734, + "learning_rate": 1.649039272598228e-05, + "loss": 0.299, + "step": 12345 + }, + { + "epoch": 0.275279970131566, + "grad_norm": 0.6062028408050537, + "learning_rate": 1.6487728683309754e-05, + "loss": 0.4087, + "step": 12350 + }, + { + "epoch": 0.27539141951218604, + "grad_norm": 0.48450252413749695, + "learning_rate": 1.6485063845291767e-05, + "loss": 0.3104, + "step": 12355 + }, + { + "epoch": 0.2755028688928061, + "grad_norm": 0.6246992945671082, + "learning_rate": 1.648239821225501e-05, + "loss": 0.4994, + "step": 12360 + }, + { + "epoch": 0.2756143182734262, + "grad_norm": 0.6224850416183472, + "learning_rate": 1.647973178452627e-05, + "loss": 0.2987, + "step": 12365 + }, + { + "epoch": 0.27572576765404627, + "grad_norm": 0.6870371103286743, + "learning_rate": 1.6477064562432428e-05, + "loss": 0.4026, + "step": 12370 + }, + { + "epoch": 0.27583721703466635, + "grad_norm": 0.6845866441726685, + "learning_rate": 1.6474396546300464e-05, + "loss": 0.5957, + "step": 12375 + }, + { + "epoch": 0.27594866641528637, + "grad_norm": 0.5691341161727905, + "learning_rate": 1.647172773645746e-05, + "loss": 0.2552, + "step": 12380 + }, + { + "epoch": 0.27606011579590645, + "grad_norm": 0.5626338720321655, + "learning_rate": 1.6469058133230588e-05, + "loss": 0.3779, + "step": 12385 + }, + { + "epoch": 0.2761715651765265, + "grad_norm": 0.7906394004821777, + "learning_rate": 1.646638773694712e-05, + "loss": 0.3081, + "step": 12390 + }, + { + "epoch": 0.2762830145571466, + "grad_norm": 0.519107460975647, + "learning_rate": 1.6463716547934433e-05, + "loss": 0.3364, + "step": 12395 + }, + { + "epoch": 0.2763944639377667, + "grad_norm": 0.45310643315315247, + "learning_rate": 1.6461044566519993e-05, + "loss": 0.3508, + "step": 12400 + }, + { + "epoch": 0.27650591331838675, + "grad_norm": 0.5506815314292908, + "learning_rate": 1.6458371793031353e-05, + "loss": 0.4634, + "step": 12405 + }, + { + "epoch": 0.2766173626990068, + "grad_norm": 0.41375917196273804, + "learning_rate": 1.645569822779619e-05, + "loss": 0.2951, + "step": 12410 + }, + { + "epoch": 0.27672881207962685, + "grad_norm": 0.7495025992393494, + "learning_rate": 1.6453023871142245e-05, + "loss": 0.4052, + "step": 12415 + }, + { + "epoch": 0.27684026146024693, + "grad_norm": 0.827620804309845, + "learning_rate": 1.6450348723397388e-05, + "loss": 0.4226, + "step": 12420 + }, + { + "epoch": 0.276951710840867, + "grad_norm": 0.5709133148193359, + "learning_rate": 1.6447672784889567e-05, + "loss": 0.3413, + "step": 12425 + }, + { + "epoch": 0.2770631602214871, + "grad_norm": 0.5625734329223633, + "learning_rate": 1.6444996055946833e-05, + "loss": 0.3169, + "step": 12430 + }, + { + "epoch": 0.27717460960210716, + "grad_norm": 0.568760335445404, + "learning_rate": 1.6442318536897325e-05, + "loss": 0.2628, + "step": 12435 + }, + { + "epoch": 0.2772860589827272, + "grad_norm": 0.5465920567512512, + "learning_rate": 1.6439640228069297e-05, + "loss": 0.3317, + "step": 12440 + }, + { + "epoch": 0.27739750836334726, + "grad_norm": 0.5634509325027466, + "learning_rate": 1.6436961129791077e-05, + "loss": 0.325, + "step": 12445 + }, + { + "epoch": 0.27750895774396733, + "grad_norm": 0.6779350638389587, + "learning_rate": 1.6434281242391113e-05, + "loss": 0.4495, + "step": 12450 + }, + { + "epoch": 0.2776204071245874, + "grad_norm": 0.7128263115882874, + "learning_rate": 1.6431600566197934e-05, + "loss": 0.2684, + "step": 12455 + }, + { + "epoch": 0.2777318565052075, + "grad_norm": 0.5098510980606079, + "learning_rate": 1.6428919101540168e-05, + "loss": 0.3573, + "step": 12460 + }, + { + "epoch": 0.27784330588582756, + "grad_norm": 0.5363258123397827, + "learning_rate": 1.6426236848746543e-05, + "loss": 0.2721, + "step": 12465 + }, + { + "epoch": 0.2779547552664476, + "grad_norm": 0.5816171169281006, + "learning_rate": 1.6423553808145886e-05, + "loss": 0.4065, + "step": 12470 + }, + { + "epoch": 0.27806620464706766, + "grad_norm": 0.9319338798522949, + "learning_rate": 1.642086998006711e-05, + "loss": 0.336, + "step": 12475 + }, + { + "epoch": 0.27817765402768774, + "grad_norm": 0.6476007699966431, + "learning_rate": 1.6418185364839242e-05, + "loss": 0.2388, + "step": 12480 + }, + { + "epoch": 0.2782891034083078, + "grad_norm": 0.6227739453315735, + "learning_rate": 1.6415499962791383e-05, + "loss": 0.3853, + "step": 12485 + }, + { + "epoch": 0.2784005527889279, + "grad_norm": 0.4589061141014099, + "learning_rate": 1.6412813774252755e-05, + "loss": 0.2953, + "step": 12490 + }, + { + "epoch": 0.27851200216954797, + "grad_norm": 0.4495225250720978, + "learning_rate": 1.6410126799552653e-05, + "loss": 0.2907, + "step": 12495 + }, + { + "epoch": 0.278623451550168, + "grad_norm": 0.654646635055542, + "learning_rate": 1.6407439039020485e-05, + "loss": 0.2302, + "step": 12500 + }, + { + "epoch": 0.27873490093078807, + "grad_norm": 0.7446420192718506, + "learning_rate": 1.6404750492985748e-05, + "loss": 0.4201, + "step": 12505 + }, + { + "epoch": 0.27884635031140814, + "grad_norm": 0.7043203711509705, + "learning_rate": 1.640206116177804e-05, + "loss": 0.4006, + "step": 12510 + }, + { + "epoch": 0.2789577996920282, + "grad_norm": 0.530685544013977, + "learning_rate": 1.6399371045727045e-05, + "loss": 0.3329, + "step": 12515 + }, + { + "epoch": 0.2790692490726483, + "grad_norm": 0.6326469779014587, + "learning_rate": 1.639668014516256e-05, + "loss": 0.3888, + "step": 12520 + }, + { + "epoch": 0.27918069845326837, + "grad_norm": 0.5652064681053162, + "learning_rate": 1.6393988460414462e-05, + "loss": 0.3634, + "step": 12525 + }, + { + "epoch": 0.2792921478338884, + "grad_norm": 0.5635384321212769, + "learning_rate": 1.6391295991812735e-05, + "loss": 0.2965, + "step": 12530 + }, + { + "epoch": 0.27940359721450847, + "grad_norm": 0.708854079246521, + "learning_rate": 1.638860273968745e-05, + "loss": 0.4356, + "step": 12535 + }, + { + "epoch": 0.27951504659512855, + "grad_norm": 0.5619444847106934, + "learning_rate": 1.6385908704368784e-05, + "loss": 0.1672, + "step": 12540 + }, + { + "epoch": 0.2796264959757486, + "grad_norm": 0.48927435278892517, + "learning_rate": 1.6383213886187e-05, + "loss": 0.3496, + "step": 12545 + }, + { + "epoch": 0.2797379453563687, + "grad_norm": 0.47531449794769287, + "learning_rate": 1.6380518285472468e-05, + "loss": 0.3092, + "step": 12550 + }, + { + "epoch": 0.2798493947369888, + "grad_norm": 0.7148705720901489, + "learning_rate": 1.637782190255564e-05, + "loss": 0.3725, + "step": 12555 + }, + { + "epoch": 0.2799608441176088, + "grad_norm": 0.5635217428207397, + "learning_rate": 1.6375124737767077e-05, + "loss": 0.3899, + "step": 12560 + }, + { + "epoch": 0.2800722934982289, + "grad_norm": 0.576865553855896, + "learning_rate": 1.637242679143743e-05, + "loss": 0.466, + "step": 12565 + }, + { + "epoch": 0.28018374287884895, + "grad_norm": 0.6125386953353882, + "learning_rate": 1.6369728063897445e-05, + "loss": 0.4707, + "step": 12570 + }, + { + "epoch": 0.28029519225946903, + "grad_norm": 0.6960713863372803, + "learning_rate": 1.6367028555477967e-05, + "loss": 0.4153, + "step": 12575 + }, + { + "epoch": 0.2804066416400891, + "grad_norm": 0.7176638841629028, + "learning_rate": 1.6364328266509937e-05, + "loss": 0.3106, + "step": 12580 + }, + { + "epoch": 0.2805180910207091, + "grad_norm": 0.4757578670978546, + "learning_rate": 1.6361627197324382e-05, + "loss": 0.3273, + "step": 12585 + }, + { + "epoch": 0.2806295404013292, + "grad_norm": 0.7011920809745789, + "learning_rate": 1.6358925348252438e-05, + "loss": 0.2884, + "step": 12590 + }, + { + "epoch": 0.2807409897819493, + "grad_norm": 0.35939499735832214, + "learning_rate": 1.6356222719625332e-05, + "loss": 0.2344, + "step": 12595 + }, + { + "epoch": 0.28085243916256936, + "grad_norm": 0.79154372215271, + "learning_rate": 1.6353519311774383e-05, + "loss": 0.4193, + "step": 12600 + }, + { + "epoch": 0.28096388854318943, + "grad_norm": 0.6778676509857178, + "learning_rate": 1.6350815125031005e-05, + "loss": 0.3332, + "step": 12605 + }, + { + "epoch": 0.2810753379238095, + "grad_norm": 0.4393715262413025, + "learning_rate": 1.6348110159726715e-05, + "loss": 0.3775, + "step": 12610 + }, + { + "epoch": 0.28118678730442953, + "grad_norm": 0.4823377728462219, + "learning_rate": 1.6345404416193117e-05, + "loss": 0.2871, + "step": 12615 + }, + { + "epoch": 0.2812982366850496, + "grad_norm": 0.5828414559364319, + "learning_rate": 1.6342697894761923e-05, + "loss": 0.416, + "step": 12620 + }, + { + "epoch": 0.2814096860656697, + "grad_norm": 0.566920280456543, + "learning_rate": 1.633999059576492e-05, + "loss": 0.4064, + "step": 12625 + }, + { + "epoch": 0.28152113544628976, + "grad_norm": 0.593452513217926, + "learning_rate": 1.6337282519534005e-05, + "loss": 0.3254, + "step": 12630 + }, + { + "epoch": 0.28163258482690984, + "grad_norm": 0.7168657779693604, + "learning_rate": 1.6334573666401173e-05, + "loss": 0.3073, + "step": 12635 + }, + { + "epoch": 0.2817440342075299, + "grad_norm": 0.6673802733421326, + "learning_rate": 1.633186403669851e-05, + "loss": 0.4074, + "step": 12640 + }, + { + "epoch": 0.28185548358814994, + "grad_norm": 0.7347943186759949, + "learning_rate": 1.6329153630758183e-05, + "loss": 0.3531, + "step": 12645 + }, + { + "epoch": 0.28196693296877, + "grad_norm": 0.42925938963890076, + "learning_rate": 1.6326442448912475e-05, + "loss": 0.2911, + "step": 12650 + }, + { + "epoch": 0.2820783823493901, + "grad_norm": 0.5090445280075073, + "learning_rate": 1.6323730491493757e-05, + "loss": 0.3901, + "step": 12655 + }, + { + "epoch": 0.28218983173001017, + "grad_norm": 0.4907485246658325, + "learning_rate": 1.632101775883449e-05, + "loss": 0.4705, + "step": 12660 + }, + { + "epoch": 0.28230128111063024, + "grad_norm": 0.547439694404602, + "learning_rate": 1.6318304251267242e-05, + "loss": 0.3237, + "step": 12665 + }, + { + "epoch": 0.2824127304912503, + "grad_norm": 0.4126209616661072, + "learning_rate": 1.6315589969124663e-05, + "loss": 0.3427, + "step": 12670 + }, + { + "epoch": 0.28252417987187034, + "grad_norm": 0.47196847200393677, + "learning_rate": 1.6312874912739495e-05, + "loss": 0.3371, + "step": 12675 + }, + { + "epoch": 0.2826356292524904, + "grad_norm": 0.5562902092933655, + "learning_rate": 1.6310159082444593e-05, + "loss": 0.3015, + "step": 12680 + }, + { + "epoch": 0.2827470786331105, + "grad_norm": 0.5090768933296204, + "learning_rate": 1.6307442478572898e-05, + "loss": 0.3171, + "step": 12685 + }, + { + "epoch": 0.28285852801373057, + "grad_norm": 0.685769259929657, + "learning_rate": 1.6304725101457442e-05, + "loss": 0.2784, + "step": 12690 + }, + { + "epoch": 0.28296997739435065, + "grad_norm": 0.4524206221103668, + "learning_rate": 1.6302006951431348e-05, + "loss": 0.3828, + "step": 12695 + }, + { + "epoch": 0.2830814267749707, + "grad_norm": 0.5083780288696289, + "learning_rate": 1.629928802882785e-05, + "loss": 0.323, + "step": 12700 + }, + { + "epoch": 0.28319287615559074, + "grad_norm": 0.6181834936141968, + "learning_rate": 1.629656833398026e-05, + "loss": 0.2939, + "step": 12705 + }, + { + "epoch": 0.2833043255362108, + "grad_norm": 0.6792620420455933, + "learning_rate": 1.6293847867222e-05, + "loss": 0.2741, + "step": 12710 + }, + { + "epoch": 0.2834157749168309, + "grad_norm": 0.5394588708877563, + "learning_rate": 1.6291126628886566e-05, + "loss": 0.2861, + "step": 12715 + }, + { + "epoch": 0.283527224297451, + "grad_norm": 0.7299884557723999, + "learning_rate": 1.628840461930757e-05, + "loss": 0.407, + "step": 12720 + }, + { + "epoch": 0.28363867367807105, + "grad_norm": 0.6590754985809326, + "learning_rate": 1.6285681838818707e-05, + "loss": 0.3828, + "step": 12725 + }, + { + "epoch": 0.28375012305869113, + "grad_norm": 0.388741135597229, + "learning_rate": 1.6282958287753767e-05, + "loss": 0.4551, + "step": 12730 + }, + { + "epoch": 0.28386157243931115, + "grad_norm": 0.7172895073890686, + "learning_rate": 1.628023396644664e-05, + "loss": 0.3955, + "step": 12735 + }, + { + "epoch": 0.2839730218199312, + "grad_norm": 0.6117028594017029, + "learning_rate": 1.6277508875231304e-05, + "loss": 0.3407, + "step": 12740 + }, + { + "epoch": 0.2840844712005513, + "grad_norm": 0.5097143054008484, + "learning_rate": 1.6274783014441833e-05, + "loss": 0.3437, + "step": 12745 + }, + { + "epoch": 0.2841959205811714, + "grad_norm": 0.4291749596595764, + "learning_rate": 1.62720563844124e-05, + "loss": 0.3326, + "step": 12750 + }, + { + "epoch": 0.28430736996179146, + "grad_norm": 0.4608302414417267, + "learning_rate": 1.6269328985477267e-05, + "loss": 0.3271, + "step": 12755 + }, + { + "epoch": 0.2844188193424115, + "grad_norm": 0.671491265296936, + "learning_rate": 1.6266600817970794e-05, + "loss": 0.4097, + "step": 12760 + }, + { + "epoch": 0.28453026872303155, + "grad_norm": 0.6661823391914368, + "learning_rate": 1.6263871882227426e-05, + "loss": 0.381, + "step": 12765 + }, + { + "epoch": 0.28464171810365163, + "grad_norm": 0.46518033742904663, + "learning_rate": 1.626114217858172e-05, + "loss": 0.4233, + "step": 12770 + }, + { + "epoch": 0.2847531674842717, + "grad_norm": 0.39857664704322815, + "learning_rate": 1.625841170736831e-05, + "loss": 0.3127, + "step": 12775 + }, + { + "epoch": 0.2848646168648918, + "grad_norm": 0.6866069436073303, + "learning_rate": 1.6255680468921932e-05, + "loss": 0.3427, + "step": 12780 + }, + { + "epoch": 0.28497606624551186, + "grad_norm": 0.46831992268562317, + "learning_rate": 1.625294846357741e-05, + "loss": 0.3559, + "step": 12785 + }, + { + "epoch": 0.2850875156261319, + "grad_norm": 0.5224331617355347, + "learning_rate": 1.6250215691669683e-05, + "loss": 0.3373, + "step": 12790 + }, + { + "epoch": 0.28519896500675196, + "grad_norm": 0.7276885509490967, + "learning_rate": 1.624748215353375e-05, + "loss": 0.429, + "step": 12795 + }, + { + "epoch": 0.28531041438737204, + "grad_norm": 0.49990314245224, + "learning_rate": 1.6244747849504724e-05, + "loss": 0.3134, + "step": 12800 + }, + { + "epoch": 0.2854218637679921, + "grad_norm": 0.5923623442649841, + "learning_rate": 1.6242012779917818e-05, + "loss": 0.3458, + "step": 12805 + }, + { + "epoch": 0.2855333131486122, + "grad_norm": 0.7208021879196167, + "learning_rate": 1.6239276945108327e-05, + "loss": 0.395, + "step": 12810 + }, + { + "epoch": 0.28564476252923227, + "grad_norm": 0.6818788647651672, + "learning_rate": 1.6236540345411646e-05, + "loss": 0.3351, + "step": 12815 + }, + { + "epoch": 0.2857562119098523, + "grad_norm": 0.5643772482872009, + "learning_rate": 1.623380298116325e-05, + "loss": 0.3587, + "step": 12820 + }, + { + "epoch": 0.28586766129047236, + "grad_norm": 0.4340428113937378, + "learning_rate": 1.623106485269873e-05, + "loss": 0.3104, + "step": 12825 + }, + { + "epoch": 0.28597911067109244, + "grad_norm": 0.5968391299247742, + "learning_rate": 1.6228325960353752e-05, + "loss": 0.4524, + "step": 12830 + }, + { + "epoch": 0.2860905600517125, + "grad_norm": 0.5892665386199951, + "learning_rate": 1.6225586304464093e-05, + "loss": 0.3047, + "step": 12835 + }, + { + "epoch": 0.2862020094323326, + "grad_norm": 0.531557559967041, + "learning_rate": 1.6222845885365603e-05, + "loss": 0.4443, + "step": 12840 + }, + { + "epoch": 0.28631345881295267, + "grad_norm": 0.49256327748298645, + "learning_rate": 1.6220104703394237e-05, + "loss": 0.405, + "step": 12845 + }, + { + "epoch": 0.2864249081935727, + "grad_norm": 0.5449532866477966, + "learning_rate": 1.621736275888605e-05, + "loss": 0.3217, + "step": 12850 + }, + { + "epoch": 0.28653635757419277, + "grad_norm": 0.7042402625083923, + "learning_rate": 1.621462005217718e-05, + "loss": 0.4748, + "step": 12855 + }, + { + "epoch": 0.28664780695481284, + "grad_norm": 0.5613042116165161, + "learning_rate": 1.621187658360386e-05, + "loss": 0.4733, + "step": 12860 + }, + { + "epoch": 0.2867592563354329, + "grad_norm": 0.7253202795982361, + "learning_rate": 1.620913235350242e-05, + "loss": 0.255, + "step": 12865 + }, + { + "epoch": 0.286870705716053, + "grad_norm": 1.0904386043548584, + "learning_rate": 1.620638736220928e-05, + "loss": 0.4042, + "step": 12870 + }, + { + "epoch": 0.2869821550966731, + "grad_norm": 0.5551652312278748, + "learning_rate": 1.6203641610060956e-05, + "loss": 0.3905, + "step": 12875 + }, + { + "epoch": 0.2870936044772931, + "grad_norm": 0.4741367995738983, + "learning_rate": 1.6200895097394056e-05, + "loss": 0.3241, + "step": 12880 + }, + { + "epoch": 0.2872050538579132, + "grad_norm": 0.5723397135734558, + "learning_rate": 1.6198147824545278e-05, + "loss": 0.3918, + "step": 12885 + }, + { + "epoch": 0.28731650323853325, + "grad_norm": 0.4763239920139313, + "learning_rate": 1.6195399791851422e-05, + "loss": 0.3494, + "step": 12890 + }, + { + "epoch": 0.2874279526191533, + "grad_norm": 0.713117241859436, + "learning_rate": 1.619265099964937e-05, + "loss": 0.3322, + "step": 12895 + }, + { + "epoch": 0.2875394019997734, + "grad_norm": 0.7806963920593262, + "learning_rate": 1.6189901448276106e-05, + "loss": 0.3423, + "step": 12900 + }, + { + "epoch": 0.2876508513803935, + "grad_norm": 0.6539384126663208, + "learning_rate": 1.6187151138068707e-05, + "loss": 0.3596, + "step": 12905 + }, + { + "epoch": 0.2877623007610135, + "grad_norm": 0.6627002954483032, + "learning_rate": 1.618440006936433e-05, + "loss": 0.2949, + "step": 12910 + }, + { + "epoch": 0.2878737501416336, + "grad_norm": 0.5374137163162231, + "learning_rate": 1.6181648242500246e-05, + "loss": 0.4181, + "step": 12915 + }, + { + "epoch": 0.28798519952225365, + "grad_norm": 0.5556002855300903, + "learning_rate": 1.61788956578138e-05, + "loss": 0.26, + "step": 12920 + }, + { + "epoch": 0.28809664890287373, + "grad_norm": 0.8424591422080994, + "learning_rate": 1.6176142315642438e-05, + "loss": 0.3883, + "step": 12925 + }, + { + "epoch": 0.2882080982834938, + "grad_norm": 0.5209367871284485, + "learning_rate": 1.6173388216323704e-05, + "loss": 0.2877, + "step": 12930 + }, + { + "epoch": 0.2883195476641139, + "grad_norm": 0.5887765884399414, + "learning_rate": 1.6170633360195226e-05, + "loss": 0.4278, + "step": 12935 + }, + { + "epoch": 0.2884309970447339, + "grad_norm": 0.7472568154335022, + "learning_rate": 1.6167877747594724e-05, + "loss": 0.2795, + "step": 12940 + }, + { + "epoch": 0.288542446425354, + "grad_norm": 0.5088253617286682, + "learning_rate": 1.616512137886002e-05, + "loss": 0.335, + "step": 12945 + }, + { + "epoch": 0.28865389580597406, + "grad_norm": 0.5196222066879272, + "learning_rate": 1.6162364254329026e-05, + "loss": 0.3659, + "step": 12950 + }, + { + "epoch": 0.28876534518659414, + "grad_norm": 0.5183055996894836, + "learning_rate": 1.6159606374339736e-05, + "loss": 0.3415, + "step": 12955 + }, + { + "epoch": 0.2888767945672142, + "grad_norm": 0.4333648979663849, + "learning_rate": 1.6156847739230254e-05, + "loss": 0.3549, + "step": 12960 + }, + { + "epoch": 0.28898824394783423, + "grad_norm": 0.6129320859909058, + "learning_rate": 1.6154088349338758e-05, + "loss": 0.5242, + "step": 12965 + }, + { + "epoch": 0.2890996933284543, + "grad_norm": 0.6342664361000061, + "learning_rate": 1.6151328205003538e-05, + "loss": 0.355, + "step": 12970 + }, + { + "epoch": 0.2892111427090744, + "grad_norm": 0.6179186701774597, + "learning_rate": 1.6148567306562958e-05, + "loss": 0.4005, + "step": 12975 + }, + { + "epoch": 0.28932259208969446, + "grad_norm": 0.8301606774330139, + "learning_rate": 1.6145805654355484e-05, + "loss": 0.2717, + "step": 12980 + }, + { + "epoch": 0.28943404147031454, + "grad_norm": 0.5873782634735107, + "learning_rate": 1.614304324871968e-05, + "loss": 0.3209, + "step": 12985 + }, + { + "epoch": 0.2895454908509346, + "grad_norm": 0.5506610870361328, + "learning_rate": 1.614028008999419e-05, + "loss": 0.35, + "step": 12990 + }, + { + "epoch": 0.28965694023155464, + "grad_norm": 0.5306448936462402, + "learning_rate": 1.613751617851775e-05, + "loss": 0.339, + "step": 12995 + }, + { + "epoch": 0.2897683896121747, + "grad_norm": 0.49982750415802, + "learning_rate": 1.613475151462921e-05, + "loss": 0.4548, + "step": 13000 + }, + { + "epoch": 0.2898798389927948, + "grad_norm": 0.42073002457618713, + "learning_rate": 1.613198609866748e-05, + "loss": 0.4259, + "step": 13005 + }, + { + "epoch": 0.28999128837341487, + "grad_norm": 0.7801643013954163, + "learning_rate": 1.6129219930971588e-05, + "loss": 0.412, + "step": 13010 + }, + { + "epoch": 0.29010273775403495, + "grad_norm": 0.6633349061012268, + "learning_rate": 1.6126453011880644e-05, + "loss": 0.2692, + "step": 13015 + }, + { + "epoch": 0.290214187134655, + "grad_norm": 0.7223657965660095, + "learning_rate": 1.612368534173385e-05, + "loss": 0.2686, + "step": 13020 + }, + { + "epoch": 0.29032563651527504, + "grad_norm": 0.509113073348999, + "learning_rate": 1.61209169208705e-05, + "loss": 0.3325, + "step": 13025 + }, + { + "epoch": 0.2904370858958951, + "grad_norm": 0.8219191431999207, + "learning_rate": 1.6118147749629982e-05, + "loss": 0.4306, + "step": 13030 + }, + { + "epoch": 0.2905485352765152, + "grad_norm": 0.5687716007232666, + "learning_rate": 1.6115377828351773e-05, + "loss": 0.239, + "step": 13035 + }, + { + "epoch": 0.2906599846571353, + "grad_norm": 0.5741727352142334, + "learning_rate": 1.6112607157375447e-05, + "loss": 0.3715, + "step": 13040 + }, + { + "epoch": 0.29077143403775535, + "grad_norm": 0.7006103992462158, + "learning_rate": 1.6109835737040666e-05, + "loss": 0.4331, + "step": 13045 + }, + { + "epoch": 0.2908828834183754, + "grad_norm": 0.6321763396263123, + "learning_rate": 1.6107063567687183e-05, + "loss": 0.3468, + "step": 13050 + }, + { + "epoch": 0.29099433279899545, + "grad_norm": 0.5299533009529114, + "learning_rate": 1.6104290649654847e-05, + "loss": 0.3427, + "step": 13055 + }, + { + "epoch": 0.2911057821796155, + "grad_norm": 0.6118397116661072, + "learning_rate": 1.61015169832836e-05, + "loss": 0.3445, + "step": 13060 + }, + { + "epoch": 0.2912172315602356, + "grad_norm": 0.5570360422134399, + "learning_rate": 1.609874256891346e-05, + "loss": 0.3531, + "step": 13065 + }, + { + "epoch": 0.2913286809408557, + "grad_norm": 0.5952074527740479, + "learning_rate": 1.6095967406884558e-05, + "loss": 0.403, + "step": 13070 + }, + { + "epoch": 0.29144013032147575, + "grad_norm": 0.5052128434181213, + "learning_rate": 1.6093191497537106e-05, + "loss": 0.2412, + "step": 13075 + }, + { + "epoch": 0.29155157970209583, + "grad_norm": 0.5345081686973572, + "learning_rate": 1.609041484121141e-05, + "loss": 0.2459, + "step": 13080 + }, + { + "epoch": 0.29166302908271585, + "grad_norm": 0.4836462438106537, + "learning_rate": 1.6087637438247863e-05, + "loss": 0.2964, + "step": 13085 + }, + { + "epoch": 0.29177447846333593, + "grad_norm": 0.7367750406265259, + "learning_rate": 1.6084859288986957e-05, + "loss": 0.3672, + "step": 13090 + }, + { + "epoch": 0.291885927843956, + "grad_norm": 0.7815617322921753, + "learning_rate": 1.608208039376927e-05, + "loss": 0.4404, + "step": 13095 + }, + { + "epoch": 0.2919973772245761, + "grad_norm": 0.7470172047615051, + "learning_rate": 1.607930075293547e-05, + "loss": 0.3288, + "step": 13100 + }, + { + "epoch": 0.29210882660519616, + "grad_norm": 0.5423603653907776, + "learning_rate": 1.6076520366826326e-05, + "loss": 0.3442, + "step": 13105 + }, + { + "epoch": 0.29222027598581624, + "grad_norm": 0.7404739856719971, + "learning_rate": 1.6073739235782688e-05, + "loss": 0.3763, + "step": 13110 + }, + { + "epoch": 0.29233172536643626, + "grad_norm": 0.5424173474311829, + "learning_rate": 1.6070957360145502e-05, + "loss": 0.2754, + "step": 13115 + }, + { + "epoch": 0.29244317474705633, + "grad_norm": 0.9737645387649536, + "learning_rate": 1.6068174740255803e-05, + "loss": 0.389, + "step": 13120 + }, + { + "epoch": 0.2925546241276764, + "grad_norm": 0.7803316712379456, + "learning_rate": 1.6065391376454722e-05, + "loss": 0.3917, + "step": 13125 + }, + { + "epoch": 0.2926660735082965, + "grad_norm": 0.8863366842269897, + "learning_rate": 1.6062607269083475e-05, + "loss": 0.5252, + "step": 13130 + }, + { + "epoch": 0.29277752288891656, + "grad_norm": 0.4519311189651489, + "learning_rate": 1.6059822418483375e-05, + "loss": 0.3182, + "step": 13135 + }, + { + "epoch": 0.2928889722695366, + "grad_norm": 0.5549803972244263, + "learning_rate": 1.6057036824995814e-05, + "loss": 0.4129, + "step": 13140 + }, + { + "epoch": 0.29300042165015666, + "grad_norm": 0.5345078110694885, + "learning_rate": 1.60542504889623e-05, + "loss": 0.2777, + "step": 13145 + }, + { + "epoch": 0.29311187103077674, + "grad_norm": 0.6359322667121887, + "learning_rate": 1.6051463410724405e-05, + "loss": 0.3406, + "step": 13150 + }, + { + "epoch": 0.2932233204113968, + "grad_norm": 0.7798189520835876, + "learning_rate": 1.60486755906238e-05, + "loss": 0.3654, + "step": 13155 + }, + { + "epoch": 0.2933347697920169, + "grad_norm": 0.662034273147583, + "learning_rate": 1.6045887029002265e-05, + "loss": 0.258, + "step": 13160 + }, + { + "epoch": 0.29344621917263697, + "grad_norm": 0.5370074510574341, + "learning_rate": 1.6043097726201645e-05, + "loss": 0.3456, + "step": 13165 + }, + { + "epoch": 0.293557668553257, + "grad_norm": 0.9667613506317139, + "learning_rate": 1.6040307682563888e-05, + "loss": 0.3283, + "step": 13170 + }, + { + "epoch": 0.29366911793387707, + "grad_norm": 0.41724613308906555, + "learning_rate": 1.6037516898431032e-05, + "loss": 0.3669, + "step": 13175 + }, + { + "epoch": 0.29378056731449714, + "grad_norm": 0.5329295992851257, + "learning_rate": 1.6034725374145206e-05, + "loss": 0.3201, + "step": 13180 + }, + { + "epoch": 0.2938920166951172, + "grad_norm": 0.49996498227119446, + "learning_rate": 1.6031933110048633e-05, + "loss": 0.3333, + "step": 13185 + }, + { + "epoch": 0.2940034660757373, + "grad_norm": 0.6060029864311218, + "learning_rate": 1.6029140106483617e-05, + "loss": 0.3471, + "step": 13190 + }, + { + "epoch": 0.2941149154563574, + "grad_norm": 0.4627971947193146, + "learning_rate": 1.6026346363792565e-05, + "loss": 0.2648, + "step": 13195 + }, + { + "epoch": 0.2942263648369774, + "grad_norm": 0.7191663384437561, + "learning_rate": 1.6023551882317964e-05, + "loss": 0.474, + "step": 13200 + }, + { + "epoch": 0.29433781421759747, + "grad_norm": 0.45818057656288147, + "learning_rate": 1.6020756662402398e-05, + "loss": 0.4591, + "step": 13205 + }, + { + "epoch": 0.29444926359821755, + "grad_norm": 0.6689664125442505, + "learning_rate": 1.6017960704388535e-05, + "loss": 0.3238, + "step": 13210 + }, + { + "epoch": 0.2945607129788376, + "grad_norm": 0.7070868611335754, + "learning_rate": 1.6015164008619143e-05, + "loss": 0.2863, + "step": 13215 + }, + { + "epoch": 0.2946721623594577, + "grad_norm": 0.5390009880065918, + "learning_rate": 1.6012366575437074e-05, + "loss": 0.3581, + "step": 13220 + }, + { + "epoch": 0.2947836117400778, + "grad_norm": 0.46126458048820496, + "learning_rate": 1.600956840518527e-05, + "loss": 0.3509, + "step": 13225 + }, + { + "epoch": 0.2948950611206978, + "grad_norm": 0.4514496922492981, + "learning_rate": 1.6006769498206767e-05, + "loss": 0.4155, + "step": 13230 + }, + { + "epoch": 0.2950065105013179, + "grad_norm": 0.7030669450759888, + "learning_rate": 1.600396985484469e-05, + "loss": 0.3635, + "step": 13235 + }, + { + "epoch": 0.29511795988193795, + "grad_norm": 0.619882345199585, + "learning_rate": 1.600116947544225e-05, + "loss": 0.3775, + "step": 13240 + }, + { + "epoch": 0.29522940926255803, + "grad_norm": 0.6607558727264404, + "learning_rate": 1.5998368360342756e-05, + "loss": 0.3034, + "step": 13245 + }, + { + "epoch": 0.2953408586431781, + "grad_norm": 0.5937981009483337, + "learning_rate": 1.59955665098896e-05, + "loss": 0.389, + "step": 13250 + }, + { + "epoch": 0.2954523080237982, + "grad_norm": 0.7993833422660828, + "learning_rate": 1.5992763924426272e-05, + "loss": 0.3678, + "step": 13255 + }, + { + "epoch": 0.2955637574044182, + "grad_norm": 0.5195314288139343, + "learning_rate": 1.598996060429634e-05, + "loss": 0.3452, + "step": 13260 + }, + { + "epoch": 0.2956752067850383, + "grad_norm": 0.6692262291908264, + "learning_rate": 1.5987156549843474e-05, + "loss": 0.2995, + "step": 13265 + }, + { + "epoch": 0.29578665616565836, + "grad_norm": 0.6648061871528625, + "learning_rate": 1.598435176141143e-05, + "loss": 0.3632, + "step": 13270 + }, + { + "epoch": 0.29589810554627843, + "grad_norm": 0.5224160552024841, + "learning_rate": 1.598154623934405e-05, + "loss": 0.3567, + "step": 13275 + }, + { + "epoch": 0.2960095549268985, + "grad_norm": 0.8477574586868286, + "learning_rate": 1.5978739983985273e-05, + "loss": 0.3707, + "step": 13280 + }, + { + "epoch": 0.2961210043075186, + "grad_norm": 0.5074322819709778, + "learning_rate": 1.5975932995679123e-05, + "loss": 0.4027, + "step": 13285 + }, + { + "epoch": 0.2962324536881386, + "grad_norm": 0.6792076230049133, + "learning_rate": 1.5973125274769715e-05, + "loss": 0.4334, + "step": 13290 + }, + { + "epoch": 0.2963439030687587, + "grad_norm": 0.3721717298030853, + "learning_rate": 1.5970316821601253e-05, + "loss": 0.3296, + "step": 13295 + }, + { + "epoch": 0.29645535244937876, + "grad_norm": 0.5924360752105713, + "learning_rate": 1.596750763651803e-05, + "loss": 0.3489, + "step": 13300 + }, + { + "epoch": 0.29656680182999884, + "grad_norm": 0.6146001815795898, + "learning_rate": 1.5964697719864437e-05, + "loss": 0.3179, + "step": 13305 + }, + { + "epoch": 0.2966782512106189, + "grad_norm": 0.45257607102394104, + "learning_rate": 1.5961887071984944e-05, + "loss": 0.3409, + "step": 13310 + }, + { + "epoch": 0.296789700591239, + "grad_norm": 0.8483275771141052, + "learning_rate": 1.595907569322411e-05, + "loss": 0.2899, + "step": 13315 + }, + { + "epoch": 0.296901149971859, + "grad_norm": 0.7063986659049988, + "learning_rate": 1.5956263583926598e-05, + "loss": 0.2711, + "step": 13320 + }, + { + "epoch": 0.2970125993524791, + "grad_norm": 0.6052030324935913, + "learning_rate": 1.5953450744437144e-05, + "loss": 0.4041, + "step": 13325 + }, + { + "epoch": 0.29712404873309917, + "grad_norm": 0.6279826164245605, + "learning_rate": 1.5950637175100583e-05, + "loss": 0.315, + "step": 13330 + }, + { + "epoch": 0.29723549811371924, + "grad_norm": 0.6276046633720398, + "learning_rate": 1.5947822876261835e-05, + "loss": 0.3207, + "step": 13335 + }, + { + "epoch": 0.2973469474943393, + "grad_norm": 0.5449913144111633, + "learning_rate": 1.5945007848265912e-05, + "loss": 0.2947, + "step": 13340 + }, + { + "epoch": 0.29745839687495934, + "grad_norm": 0.633040726184845, + "learning_rate": 1.5942192091457918e-05, + "loss": 0.298, + "step": 13345 + }, + { + "epoch": 0.2975698462555794, + "grad_norm": 0.5707663297653198, + "learning_rate": 1.5939375606183035e-05, + "loss": 0.2653, + "step": 13350 + }, + { + "epoch": 0.2976812956361995, + "grad_norm": 0.6369941234588623, + "learning_rate": 1.5936558392786553e-05, + "loss": 0.324, + "step": 13355 + }, + { + "epoch": 0.29779274501681957, + "grad_norm": 0.7818615436553955, + "learning_rate": 1.5933740451613836e-05, + "loss": 0.2791, + "step": 13360 + }, + { + "epoch": 0.29790419439743965, + "grad_norm": 0.6412309408187866, + "learning_rate": 1.5930921783010336e-05, + "loss": 0.2938, + "step": 13365 + }, + { + "epoch": 0.2980156437780597, + "grad_norm": 0.6033828258514404, + "learning_rate": 1.592810238732161e-05, + "loss": 0.2937, + "step": 13370 + }, + { + "epoch": 0.29812709315867975, + "grad_norm": 0.8528613448143005, + "learning_rate": 1.5925282264893283e-05, + "loss": 0.3686, + "step": 13375 + }, + { + "epoch": 0.2982385425392998, + "grad_norm": 0.6302016973495483, + "learning_rate": 1.592246141607109e-05, + "loss": 0.3047, + "step": 13380 + }, + { + "epoch": 0.2983499919199199, + "grad_norm": 0.7522174715995789, + "learning_rate": 1.5919639841200843e-05, + "loss": 0.2914, + "step": 13385 + }, + { + "epoch": 0.29846144130054, + "grad_norm": 0.8643938899040222, + "learning_rate": 1.591681754062844e-05, + "loss": 0.4014, + "step": 13390 + }, + { + "epoch": 0.29857289068116005, + "grad_norm": 0.5454531311988831, + "learning_rate": 1.5913994514699883e-05, + "loss": 0.3496, + "step": 13395 + }, + { + "epoch": 0.29868434006178013, + "grad_norm": 0.5838807225227356, + "learning_rate": 1.591117076376125e-05, + "loss": 0.3131, + "step": 13400 + }, + { + "epoch": 0.29879578944240015, + "grad_norm": 0.4497433602809906, + "learning_rate": 1.59083462881587e-05, + "loss": 0.4624, + "step": 13405 + }, + { + "epoch": 0.2989072388230202, + "grad_norm": 0.5543870329856873, + "learning_rate": 1.59055210882385e-05, + "loss": 0.3888, + "step": 13410 + }, + { + "epoch": 0.2990186882036403, + "grad_norm": 0.6825234293937683, + "learning_rate": 1.5902695164347007e-05, + "loss": 0.3123, + "step": 13415 + }, + { + "epoch": 0.2991301375842604, + "grad_norm": 0.42260441184043884, + "learning_rate": 1.5899868516830643e-05, + "loss": 0.2898, + "step": 13420 + }, + { + "epoch": 0.29924158696488046, + "grad_norm": 0.6578460335731506, + "learning_rate": 1.589704114603594e-05, + "loss": 0.3062, + "step": 13425 + }, + { + "epoch": 0.29935303634550053, + "grad_norm": 0.6747757196426392, + "learning_rate": 1.589421305230951e-05, + "loss": 0.276, + "step": 13430 + }, + { + "epoch": 0.29946448572612056, + "grad_norm": 0.6671870350837708, + "learning_rate": 1.5891384235998058e-05, + "loss": 0.332, + "step": 13435 + }, + { + "epoch": 0.29957593510674063, + "grad_norm": 0.5130128860473633, + "learning_rate": 1.5888554697448372e-05, + "loss": 0.2472, + "step": 13440 + }, + { + "epoch": 0.2996873844873607, + "grad_norm": 0.525191605091095, + "learning_rate": 1.5885724437007332e-05, + "loss": 0.3166, + "step": 13445 + }, + { + "epoch": 0.2997988338679808, + "grad_norm": 0.5181118845939636, + "learning_rate": 1.5882893455021906e-05, + "loss": 0.2823, + "step": 13450 + }, + { + "epoch": 0.29991028324860086, + "grad_norm": 0.8248529434204102, + "learning_rate": 1.5880061751839153e-05, + "loss": 0.4481, + "step": 13455 + }, + { + "epoch": 0.30002173262922094, + "grad_norm": 0.509216845035553, + "learning_rate": 1.5877229327806217e-05, + "loss": 0.282, + "step": 13460 + }, + { + "epoch": 0.30013318200984096, + "grad_norm": 0.791086733341217, + "learning_rate": 1.587439618327033e-05, + "loss": 0.4165, + "step": 13465 + }, + { + "epoch": 0.30024463139046104, + "grad_norm": 0.39396199584007263, + "learning_rate": 1.5871562318578814e-05, + "loss": 0.3594, + "step": 13470 + }, + { + "epoch": 0.3003560807710811, + "grad_norm": 0.6425427198410034, + "learning_rate": 1.5868727734079078e-05, + "loss": 0.3538, + "step": 13475 + }, + { + "epoch": 0.3004675301517012, + "grad_norm": 0.5383874177932739, + "learning_rate": 1.5865892430118623e-05, + "loss": 0.3805, + "step": 13480 + }, + { + "epoch": 0.30057897953232127, + "grad_norm": 0.529425323009491, + "learning_rate": 1.5863056407045034e-05, + "loss": 0.4096, + "step": 13485 + }, + { + "epoch": 0.30069042891294134, + "grad_norm": 0.5081347227096558, + "learning_rate": 1.5860219665205985e-05, + "loss": 0.2414, + "step": 13490 + }, + { + "epoch": 0.30080187829356136, + "grad_norm": 0.5437273979187012, + "learning_rate": 1.585738220494924e-05, + "loss": 0.3649, + "step": 13495 + }, + { + "epoch": 0.30091332767418144, + "grad_norm": 0.6691897511482239, + "learning_rate": 1.5854544026622648e-05, + "loss": 0.4306, + "step": 13500 + }, + { + "epoch": 0.3010247770548015, + "grad_norm": 0.5725626945495605, + "learning_rate": 1.5851705130574147e-05, + "loss": 0.3724, + "step": 13505 + }, + { + "epoch": 0.3011362264354216, + "grad_norm": 1.0486339330673218, + "learning_rate": 1.5848865517151762e-05, + "loss": 0.3816, + "step": 13510 + }, + { + "epoch": 0.30124767581604167, + "grad_norm": 0.5338178873062134, + "learning_rate": 1.584602518670362e-05, + "loss": 0.4815, + "step": 13515 + }, + { + "epoch": 0.3013591251966617, + "grad_norm": 0.4364151656627655, + "learning_rate": 1.5843184139577908e-05, + "loss": 0.2972, + "step": 13520 + }, + { + "epoch": 0.30147057457728177, + "grad_norm": 0.43141815066337585, + "learning_rate": 1.5840342376122927e-05, + "loss": 0.2521, + "step": 13525 + }, + { + "epoch": 0.30158202395790185, + "grad_norm": 0.5989120006561279, + "learning_rate": 1.5837499896687048e-05, + "loss": 0.3116, + "step": 13530 + }, + { + "epoch": 0.3016934733385219, + "grad_norm": 0.7535663843154907, + "learning_rate": 1.583465670161874e-05, + "loss": 0.3741, + "step": 13535 + }, + { + "epoch": 0.301804922719142, + "grad_norm": 0.5637946724891663, + "learning_rate": 1.5831812791266557e-05, + "loss": 0.2601, + "step": 13540 + }, + { + "epoch": 0.3019163720997621, + "grad_norm": 0.6944532990455627, + "learning_rate": 1.582896816597914e-05, + "loss": 0.3492, + "step": 13545 + }, + { + "epoch": 0.3020278214803821, + "grad_norm": 0.40424805879592896, + "learning_rate": 1.5826122826105224e-05, + "loss": 0.2823, + "step": 13550 + }, + { + "epoch": 0.3021392708610022, + "grad_norm": 0.41228148341178894, + "learning_rate": 1.5823276771993617e-05, + "loss": 0.3248, + "step": 13555 + }, + { + "epoch": 0.30225072024162225, + "grad_norm": 0.5896701812744141, + "learning_rate": 1.5820430003993226e-05, + "loss": 0.3773, + "step": 13560 + }, + { + "epoch": 0.3023621696222423, + "grad_norm": 0.6878036856651306, + "learning_rate": 1.5817582522453042e-05, + "loss": 0.3773, + "step": 13565 + }, + { + "epoch": 0.3024736190028624, + "grad_norm": 0.5029690265655518, + "learning_rate": 1.581473432772215e-05, + "loss": 0.4281, + "step": 13570 + }, + { + "epoch": 0.3025850683834825, + "grad_norm": 0.4946044385433197, + "learning_rate": 1.581188542014971e-05, + "loss": 0.3541, + "step": 13575 + }, + { + "epoch": 0.3026965177641025, + "grad_norm": 0.4801938235759735, + "learning_rate": 1.5809035800084974e-05, + "loss": 0.4324, + "step": 13580 + }, + { + "epoch": 0.3028079671447226, + "grad_norm": 0.5312267541885376, + "learning_rate": 1.5806185467877293e-05, + "loss": 0.32, + "step": 13585 + }, + { + "epoch": 0.30291941652534266, + "grad_norm": 0.8016061782836914, + "learning_rate": 1.5803334423876088e-05, + "loss": 0.4019, + "step": 13590 + }, + { + "epoch": 0.30303086590596273, + "grad_norm": 0.7053002715110779, + "learning_rate": 1.580048266843088e-05, + "loss": 0.3772, + "step": 13595 + }, + { + "epoch": 0.3031423152865828, + "grad_norm": 0.5228629112243652, + "learning_rate": 1.5797630201891267e-05, + "loss": 0.5045, + "step": 13600 + }, + { + "epoch": 0.3032537646672029, + "grad_norm": 0.5898446440696716, + "learning_rate": 1.579477702460694e-05, + "loss": 0.4242, + "step": 13605 + }, + { + "epoch": 0.3033652140478229, + "grad_norm": 0.6774618625640869, + "learning_rate": 1.5791923136927684e-05, + "loss": 0.3472, + "step": 13610 + }, + { + "epoch": 0.303476663428443, + "grad_norm": 0.36860859394073486, + "learning_rate": 1.5789068539203356e-05, + "loss": 0.3184, + "step": 13615 + }, + { + "epoch": 0.30358811280906306, + "grad_norm": 0.4877468943595886, + "learning_rate": 1.578621323178391e-05, + "loss": 0.34, + "step": 13620 + }, + { + "epoch": 0.30369956218968314, + "grad_norm": 0.5136805176734924, + "learning_rate": 1.5783357215019383e-05, + "loss": 0.3786, + "step": 13625 + }, + { + "epoch": 0.3038110115703032, + "grad_norm": 0.8331414461135864, + "learning_rate": 1.5780500489259907e-05, + "loss": 0.3808, + "step": 13630 + }, + { + "epoch": 0.3039224609509233, + "grad_norm": 0.7533968687057495, + "learning_rate": 1.5777643054855684e-05, + "loss": 0.298, + "step": 13635 + }, + { + "epoch": 0.3040339103315433, + "grad_norm": 0.6575278639793396, + "learning_rate": 1.577478491215702e-05, + "loss": 0.355, + "step": 13640 + }, + { + "epoch": 0.3041453597121634, + "grad_norm": 0.5389758348464966, + "learning_rate": 1.5771926061514302e-05, + "loss": 0.3802, + "step": 13645 + }, + { + "epoch": 0.30425680909278346, + "grad_norm": 0.5021669864654541, + "learning_rate": 1.5769066503278e-05, + "loss": 0.2789, + "step": 13650 + }, + { + "epoch": 0.30436825847340354, + "grad_norm": 0.7188048362731934, + "learning_rate": 1.5766206237798677e-05, + "loss": 0.3787, + "step": 13655 + }, + { + "epoch": 0.3044797078540236, + "grad_norm": 0.5826491713523865, + "learning_rate": 1.5763345265426978e-05, + "loss": 0.3537, + "step": 13660 + }, + { + "epoch": 0.3045911572346437, + "grad_norm": 0.5045832395553589, + "learning_rate": 1.5760483586513632e-05, + "loss": 0.3441, + "step": 13665 + }, + { + "epoch": 0.3047026066152637, + "grad_norm": 0.7873217463493347, + "learning_rate": 1.5757621201409463e-05, + "loss": 0.4069, + "step": 13670 + }, + { + "epoch": 0.3048140559958838, + "grad_norm": 0.5248357653617859, + "learning_rate": 1.575475811046538e-05, + "loss": 0.3679, + "step": 13675 + }, + { + "epoch": 0.30492550537650387, + "grad_norm": 0.6609611511230469, + "learning_rate": 1.575189431403237e-05, + "loss": 0.3589, + "step": 13680 + }, + { + "epoch": 0.30503695475712395, + "grad_norm": 0.5051944851875305, + "learning_rate": 1.5749029812461515e-05, + "loss": 0.269, + "step": 13685 + }, + { + "epoch": 0.305148404137744, + "grad_norm": 0.6227126121520996, + "learning_rate": 1.5746164606103983e-05, + "loss": 0.3709, + "step": 13690 + }, + { + "epoch": 0.3052598535183641, + "grad_norm": 0.5263267159461975, + "learning_rate": 1.574329869531102e-05, + "loss": 0.2782, + "step": 13695 + }, + { + "epoch": 0.3053713028989841, + "grad_norm": 0.5302555561065674, + "learning_rate": 1.5740432080433974e-05, + "loss": 0.2429, + "step": 13700 + }, + { + "epoch": 0.3054827522796042, + "grad_norm": 0.5816635489463806, + "learning_rate": 1.5737564761824257e-05, + "loss": 0.2038, + "step": 13705 + }, + { + "epoch": 0.3055942016602243, + "grad_norm": 0.5670195817947388, + "learning_rate": 1.5734696739833392e-05, + "loss": 0.4132, + "step": 13710 + }, + { + "epoch": 0.30570565104084435, + "grad_norm": 0.5609695315361023, + "learning_rate": 1.573182801481297e-05, + "loss": 0.3791, + "step": 13715 + }, + { + "epoch": 0.3058171004214644, + "grad_norm": 0.7394962310791016, + "learning_rate": 1.5728958587114677e-05, + "loss": 0.3235, + "step": 13720 + }, + { + "epoch": 0.30592854980208445, + "grad_norm": 0.6420606970787048, + "learning_rate": 1.5726088457090284e-05, + "loss": 0.3705, + "step": 13725 + }, + { + "epoch": 0.3060399991827045, + "grad_norm": 0.5575365424156189, + "learning_rate": 1.5723217625091645e-05, + "loss": 0.3184, + "step": 13730 + }, + { + "epoch": 0.3061514485633246, + "grad_norm": 0.5209715366363525, + "learning_rate": 1.5720346091470697e-05, + "loss": 0.3651, + "step": 13735 + }, + { + "epoch": 0.3062628979439447, + "grad_norm": 0.6402333974838257, + "learning_rate": 1.5717473856579475e-05, + "loss": 0.3487, + "step": 13740 + }, + { + "epoch": 0.30637434732456476, + "grad_norm": 0.5436682105064392, + "learning_rate": 1.571460092077009e-05, + "loss": 0.3645, + "step": 13745 + }, + { + "epoch": 0.30648579670518483, + "grad_norm": 0.5848494172096252, + "learning_rate": 1.5711727284394745e-05, + "loss": 0.2991, + "step": 13750 + }, + { + "epoch": 0.30659724608580485, + "grad_norm": 0.6025956273078918, + "learning_rate": 1.5708852947805717e-05, + "loss": 0.3323, + "step": 13755 + }, + { + "epoch": 0.30670869546642493, + "grad_norm": 0.44370484352111816, + "learning_rate": 1.5705977911355388e-05, + "loss": 0.3734, + "step": 13760 + }, + { + "epoch": 0.306820144847045, + "grad_norm": 0.7109201550483704, + "learning_rate": 1.5703102175396208e-05, + "loss": 0.316, + "step": 13765 + }, + { + "epoch": 0.3069315942276651, + "grad_norm": 0.6673081517219543, + "learning_rate": 1.5700225740280725e-05, + "loss": 0.2519, + "step": 13770 + }, + { + "epoch": 0.30704304360828516, + "grad_norm": 0.41183361411094666, + "learning_rate": 1.5697348606361564e-05, + "loss": 0.4071, + "step": 13775 + }, + { + "epoch": 0.30715449298890524, + "grad_norm": 0.6063647270202637, + "learning_rate": 1.5694470773991438e-05, + "loss": 0.384, + "step": 13780 + }, + { + "epoch": 0.30726594236952526, + "grad_norm": 0.5366505980491638, + "learning_rate": 1.5691592243523154e-05, + "loss": 0.4145, + "step": 13785 + }, + { + "epoch": 0.30737739175014533, + "grad_norm": 0.5829271078109741, + "learning_rate": 1.5688713015309592e-05, + "loss": 0.4262, + "step": 13790 + }, + { + "epoch": 0.3074888411307654, + "grad_norm": 0.7547322511672974, + "learning_rate": 1.5685833089703718e-05, + "loss": 0.2761, + "step": 13795 + }, + { + "epoch": 0.3076002905113855, + "grad_norm": 0.5357990860939026, + "learning_rate": 1.5682952467058603e-05, + "loss": 0.3992, + "step": 13800 + }, + { + "epoch": 0.30771173989200556, + "grad_norm": 0.44748038053512573, + "learning_rate": 1.568007114772738e-05, + "loss": 0.3155, + "step": 13805 + }, + { + "epoch": 0.30782318927262564, + "grad_norm": 0.6524558067321777, + "learning_rate": 1.5677189132063278e-05, + "loss": 0.3054, + "step": 13810 + }, + { + "epoch": 0.30793463865324566, + "grad_norm": 0.42701128125190735, + "learning_rate": 1.5674306420419606e-05, + "loss": 0.3253, + "step": 13815 + }, + { + "epoch": 0.30804608803386574, + "grad_norm": 0.417032390832901, + "learning_rate": 1.567142301314977e-05, + "loss": 0.4835, + "step": 13820 + }, + { + "epoch": 0.3081575374144858, + "grad_norm": 0.6458917260169983, + "learning_rate": 1.5668538910607244e-05, + "loss": 0.3511, + "step": 13825 + }, + { + "epoch": 0.3082689867951059, + "grad_norm": 0.42670947313308716, + "learning_rate": 1.5665654113145606e-05, + "loss": 0.352, + "step": 13830 + }, + { + "epoch": 0.30838043617572597, + "grad_norm": 0.37725549936294556, + "learning_rate": 1.5662768621118507e-05, + "loss": 0.4173, + "step": 13835 + }, + { + "epoch": 0.30849188555634605, + "grad_norm": 0.5069001317024231, + "learning_rate": 1.565988243487968e-05, + "loss": 0.2881, + "step": 13840 + }, + { + "epoch": 0.30860333493696607, + "grad_norm": 0.684617817401886, + "learning_rate": 1.5656995554782957e-05, + "loss": 0.408, + "step": 13845 + }, + { + "epoch": 0.30871478431758614, + "grad_norm": 0.729494035243988, + "learning_rate": 1.5654107981182247e-05, + "loss": 0.3553, + "step": 13850 + }, + { + "epoch": 0.3088262336982062, + "grad_norm": 0.5434486269950867, + "learning_rate": 1.565121971443154e-05, + "loss": 0.303, + "step": 13855 + }, + { + "epoch": 0.3089376830788263, + "grad_norm": 0.6943069100379944, + "learning_rate": 1.5648330754884917e-05, + "loss": 0.3397, + "step": 13860 + }, + { + "epoch": 0.3090491324594464, + "grad_norm": 0.49677422642707825, + "learning_rate": 1.5645441102896536e-05, + "loss": 0.3348, + "step": 13865 + }, + { + "epoch": 0.30916058184006645, + "grad_norm": 0.5370576977729797, + "learning_rate": 1.5642550758820657e-05, + "loss": 0.4424, + "step": 13870 + }, + { + "epoch": 0.30927203122068647, + "grad_norm": 0.7557622194290161, + "learning_rate": 1.563965972301161e-05, + "loss": 0.3526, + "step": 13875 + }, + { + "epoch": 0.30938348060130655, + "grad_norm": 0.39046844840049744, + "learning_rate": 1.563676799582381e-05, + "loss": 0.3893, + "step": 13880 + }, + { + "epoch": 0.3094949299819266, + "grad_norm": 0.5151693820953369, + "learning_rate": 1.5633875577611765e-05, + "loss": 0.4531, + "step": 13885 + }, + { + "epoch": 0.3096063793625467, + "grad_norm": 0.5232051014900208, + "learning_rate": 1.563098246873006e-05, + "loss": 0.4816, + "step": 13890 + }, + { + "epoch": 0.3097178287431668, + "grad_norm": 0.6085663437843323, + "learning_rate": 1.562808866953337e-05, + "loss": 0.4044, + "step": 13895 + }, + { + "epoch": 0.3098292781237868, + "grad_norm": 0.7641314268112183, + "learning_rate": 1.5625194180376446e-05, + "loss": 0.3029, + "step": 13900 + }, + { + "epoch": 0.3099407275044069, + "grad_norm": 0.4355248212814331, + "learning_rate": 1.5622299001614138e-05, + "loss": 0.368, + "step": 13905 + }, + { + "epoch": 0.31005217688502695, + "grad_norm": 0.5732108354568481, + "learning_rate": 1.561940313360137e-05, + "loss": 0.4079, + "step": 13910 + }, + { + "epoch": 0.31016362626564703, + "grad_norm": 0.6047371625900269, + "learning_rate": 1.5616506576693155e-05, + "loss": 0.3951, + "step": 13915 + }, + { + "epoch": 0.3102750756462671, + "grad_norm": 0.6488727331161499, + "learning_rate": 1.5613609331244584e-05, + "loss": 0.3739, + "step": 13920 + }, + { + "epoch": 0.3103865250268872, + "grad_norm": 0.9942343235015869, + "learning_rate": 1.561071139761084e-05, + "loss": 0.2652, + "step": 13925 + }, + { + "epoch": 0.3104979744075072, + "grad_norm": 0.6689128279685974, + "learning_rate": 1.5607812776147192e-05, + "loss": 0.3878, + "step": 13930 + }, + { + "epoch": 0.3106094237881273, + "grad_norm": 0.6173014044761658, + "learning_rate": 1.5604913467208977e-05, + "loss": 0.1834, + "step": 13935 + }, + { + "epoch": 0.31072087316874736, + "grad_norm": 0.48626089096069336, + "learning_rate": 1.5602013471151634e-05, + "loss": 0.2309, + "step": 13940 + }, + { + "epoch": 0.31083232254936743, + "grad_norm": 0.431443452835083, + "learning_rate": 1.5599112788330685e-05, + "loss": 0.2777, + "step": 13945 + }, + { + "epoch": 0.3109437719299875, + "grad_norm": 0.7525061368942261, + "learning_rate": 1.5596211419101723e-05, + "loss": 0.4271, + "step": 13950 + }, + { + "epoch": 0.3110552213106076, + "grad_norm": 0.5650204420089722, + "learning_rate": 1.5593309363820437e-05, + "loss": 0.3672, + "step": 13955 + }, + { + "epoch": 0.3111666706912276, + "grad_norm": 0.471719890832901, + "learning_rate": 1.55904066228426e-05, + "loss": 0.4063, + "step": 13960 + }, + { + "epoch": 0.3112781200718477, + "grad_norm": 0.4454389810562134, + "learning_rate": 1.558750319652406e-05, + "loss": 0.4041, + "step": 13965 + }, + { + "epoch": 0.31138956945246776, + "grad_norm": 0.6640139222145081, + "learning_rate": 1.5584599085220754e-05, + "loss": 0.2817, + "step": 13970 + }, + { + "epoch": 0.31150101883308784, + "grad_norm": 0.8577756285667419, + "learning_rate": 1.558169428928871e-05, + "loss": 0.3628, + "step": 13975 + }, + { + "epoch": 0.3116124682137079, + "grad_norm": 0.6317833065986633, + "learning_rate": 1.5578788809084033e-05, + "loss": 0.2419, + "step": 13980 + }, + { + "epoch": 0.311723917594328, + "grad_norm": 0.6040417551994324, + "learning_rate": 1.5575882644962902e-05, + "loss": 0.3732, + "step": 13985 + }, + { + "epoch": 0.311835366974948, + "grad_norm": 0.5538058280944824, + "learning_rate": 1.5572975797281603e-05, + "loss": 0.3231, + "step": 13990 + }, + { + "epoch": 0.3119468163555681, + "grad_norm": 0.566226065158844, + "learning_rate": 1.557006826639649e-05, + "loss": 0.3924, + "step": 13995 + }, + { + "epoch": 0.31205826573618817, + "grad_norm": 0.46765801310539246, + "learning_rate": 1.5567160052664002e-05, + "loss": 0.3625, + "step": 14000 + }, + { + "epoch": 0.31216971511680824, + "grad_norm": 0.5231557488441467, + "learning_rate": 1.556425115644066e-05, + "loss": 0.3253, + "step": 14005 + }, + { + "epoch": 0.3122811644974283, + "grad_norm": 0.495217889547348, + "learning_rate": 1.5561341578083075e-05, + "loss": 0.275, + "step": 14010 + }, + { + "epoch": 0.3123926138780484, + "grad_norm": 0.5250295996665955, + "learning_rate": 1.5558431317947943e-05, + "loss": 0.487, + "step": 14015 + }, + { + "epoch": 0.3125040632586684, + "grad_norm": 0.7416672110557556, + "learning_rate": 1.555552037639203e-05, + "loss": 0.3936, + "step": 14020 + }, + { + "epoch": 0.3126155126392885, + "grad_norm": 0.5672133564949036, + "learning_rate": 1.5552608753772207e-05, + "loss": 0.4152, + "step": 14025 + }, + { + "epoch": 0.31272696201990857, + "grad_norm": 0.5304513573646545, + "learning_rate": 1.554969645044541e-05, + "loss": 0.3631, + "step": 14030 + }, + { + "epoch": 0.31283841140052865, + "grad_norm": 0.45727986097335815, + "learning_rate": 1.554678346676866e-05, + "loss": 0.3183, + "step": 14035 + }, + { + "epoch": 0.3129498607811487, + "grad_norm": 0.5454281568527222, + "learning_rate": 1.5543869803099077e-05, + "loss": 0.3309, + "step": 14040 + }, + { + "epoch": 0.3130613101617688, + "grad_norm": 0.5051997900009155, + "learning_rate": 1.5540955459793847e-05, + "loss": 0.1792, + "step": 14045 + }, + { + "epoch": 0.3131727595423888, + "grad_norm": 0.6837823987007141, + "learning_rate": 1.5538040437210247e-05, + "loss": 0.4055, + "step": 14050 + }, + { + "epoch": 0.3132842089230089, + "grad_norm": 0.6566972136497498, + "learning_rate": 1.5535124735705634e-05, + "loss": 0.264, + "step": 14055 + }, + { + "epoch": 0.313395658303629, + "grad_norm": 0.5439302325248718, + "learning_rate": 1.5532208355637454e-05, + "loss": 0.3285, + "step": 14060 + }, + { + "epoch": 0.31350710768424905, + "grad_norm": 0.5949956774711609, + "learning_rate": 1.5529291297363235e-05, + "loss": 0.3884, + "step": 14065 + }, + { + "epoch": 0.31361855706486913, + "grad_norm": 0.6063193082809448, + "learning_rate": 1.552637356124058e-05, + "loss": 0.3661, + "step": 14070 + }, + { + "epoch": 0.3137300064454892, + "grad_norm": 0.4883115291595459, + "learning_rate": 1.5523455147627182e-05, + "loss": 0.3501, + "step": 14075 + }, + { + "epoch": 0.31384145582610923, + "grad_norm": 0.6186197996139526, + "learning_rate": 1.552053605688082e-05, + "loss": 0.3345, + "step": 14080 + }, + { + "epoch": 0.3139529052067293, + "grad_norm": 0.678356945514679, + "learning_rate": 1.551761628935935e-05, + "loss": 0.3084, + "step": 14085 + }, + { + "epoch": 0.3140643545873494, + "grad_norm": 0.7062575221061707, + "learning_rate": 1.551469584542071e-05, + "loss": 0.3178, + "step": 14090 + }, + { + "epoch": 0.31417580396796946, + "grad_norm": 0.4921148419380188, + "learning_rate": 1.5511774725422924e-05, + "loss": 0.3214, + "step": 14095 + }, + { + "epoch": 0.31428725334858953, + "grad_norm": 0.6326109170913696, + "learning_rate": 1.5508852929724107e-05, + "loss": 0.3674, + "step": 14100 + }, + { + "epoch": 0.31439870272920956, + "grad_norm": 0.412112832069397, + "learning_rate": 1.550593045868244e-05, + "loss": 0.3656, + "step": 14105 + }, + { + "epoch": 0.31451015210982963, + "grad_norm": 0.46066349744796753, + "learning_rate": 1.5503007312656198e-05, + "loss": 0.294, + "step": 14110 + }, + { + "epoch": 0.3146216014904497, + "grad_norm": 0.5353672504425049, + "learning_rate": 1.5500083492003737e-05, + "loss": 0.271, + "step": 14115 + }, + { + "epoch": 0.3147330508710698, + "grad_norm": 0.5529379844665527, + "learning_rate": 1.5497158997083492e-05, + "loss": 0.2766, + "step": 14120 + }, + { + "epoch": 0.31484450025168986, + "grad_norm": 0.5734739899635315, + "learning_rate": 1.5494233828253985e-05, + "loss": 0.3251, + "step": 14125 + }, + { + "epoch": 0.31495594963230994, + "grad_norm": 0.5877709984779358, + "learning_rate": 1.5491307985873822e-05, + "loss": 0.2302, + "step": 14130 + }, + { + "epoch": 0.31506739901292996, + "grad_norm": 0.4589202404022217, + "learning_rate": 1.5488381470301685e-05, + "loss": 0.3152, + "step": 14135 + }, + { + "epoch": 0.31517884839355004, + "grad_norm": 0.5067848563194275, + "learning_rate": 1.5485454281896346e-05, + "loss": 0.3822, + "step": 14140 + }, + { + "epoch": 0.3152902977741701, + "grad_norm": 0.6330682635307312, + "learning_rate": 1.548252642101665e-05, + "loss": 0.3133, + "step": 14145 + }, + { + "epoch": 0.3154017471547902, + "grad_norm": 0.6754536628723145, + "learning_rate": 1.5479597888021537e-05, + "loss": 0.2672, + "step": 14150 + }, + { + "epoch": 0.31551319653541027, + "grad_norm": 0.7370808124542236, + "learning_rate": 1.5476668683270016e-05, + "loss": 0.3993, + "step": 14155 + }, + { + "epoch": 0.31562464591603034, + "grad_norm": 2.0108695030212402, + "learning_rate": 1.547373880712119e-05, + "loss": 0.3878, + "step": 14160 + }, + { + "epoch": 0.31573609529665037, + "grad_norm": 0.3049313724040985, + "learning_rate": 1.5470808259934235e-05, + "loss": 0.3542, + "step": 14165 + }, + { + "epoch": 0.31584754467727044, + "grad_norm": 0.5608581900596619, + "learning_rate": 1.5467877042068415e-05, + "loss": 0.4755, + "step": 14170 + }, + { + "epoch": 0.3159589940578905, + "grad_norm": 0.6225017309188843, + "learning_rate": 1.5464945153883076e-05, + "loss": 0.4222, + "step": 14175 + }, + { + "epoch": 0.3160704434385106, + "grad_norm": 0.7026190161705017, + "learning_rate": 1.5462012595737643e-05, + "loss": 0.4397, + "step": 14180 + }, + { + "epoch": 0.31618189281913067, + "grad_norm": 0.6331912279129028, + "learning_rate": 1.5459079367991626e-05, + "loss": 0.4302, + "step": 14185 + }, + { + "epoch": 0.31629334219975075, + "grad_norm": 0.4649040102958679, + "learning_rate": 1.545614547100462e-05, + "loss": 0.3459, + "step": 14190 + }, + { + "epoch": 0.31640479158037077, + "grad_norm": 0.6613979339599609, + "learning_rate": 1.545321090513629e-05, + "loss": 0.2516, + "step": 14195 + }, + { + "epoch": 0.31651624096099085, + "grad_norm": 0.6959325671195984, + "learning_rate": 1.54502756707464e-05, + "loss": 0.3448, + "step": 14200 + }, + { + "epoch": 0.3166276903416109, + "grad_norm": 0.41933050751686096, + "learning_rate": 1.5447339768194777e-05, + "loss": 0.2984, + "step": 14205 + }, + { + "epoch": 0.316739139722231, + "grad_norm": 0.7498952150344849, + "learning_rate": 1.5444403197841345e-05, + "loss": 0.3256, + "step": 14210 + }, + { + "epoch": 0.3168505891028511, + "grad_norm": 0.470032662153244, + "learning_rate": 1.544146596004611e-05, + "loss": 0.4236, + "step": 14215 + }, + { + "epoch": 0.31696203848347115, + "grad_norm": 0.527711808681488, + "learning_rate": 1.5438528055169148e-05, + "loss": 0.3678, + "step": 14220 + }, + { + "epoch": 0.3170734878640912, + "grad_norm": 0.7611703872680664, + "learning_rate": 1.5435589483570627e-05, + "loss": 0.3765, + "step": 14225 + }, + { + "epoch": 0.31718493724471125, + "grad_norm": 0.7579947710037231, + "learning_rate": 1.5432650245610788e-05, + "loss": 0.3067, + "step": 14230 + }, + { + "epoch": 0.31729638662533133, + "grad_norm": 0.559687077999115, + "learning_rate": 1.5429710341649963e-05, + "loss": 0.3298, + "step": 14235 + }, + { + "epoch": 0.3174078360059514, + "grad_norm": 0.6190071702003479, + "learning_rate": 1.5426769772048567e-05, + "loss": 0.429, + "step": 14240 + }, + { + "epoch": 0.3175192853865715, + "grad_norm": 0.5103845000267029, + "learning_rate": 1.542382853716708e-05, + "loss": 0.3116, + "step": 14245 + }, + { + "epoch": 0.31763073476719156, + "grad_norm": 0.5446650981903076, + "learning_rate": 1.542088663736608e-05, + "loss": 0.3435, + "step": 14250 + }, + { + "epoch": 0.3177421841478116, + "grad_norm": 0.49055203795433044, + "learning_rate": 1.541794407300622e-05, + "loss": 0.29, + "step": 14255 + }, + { + "epoch": 0.31785363352843166, + "grad_norm": 0.5283357501029968, + "learning_rate": 1.5415000844448244e-05, + "loss": 0.3255, + "step": 14260 + }, + { + "epoch": 0.31796508290905173, + "grad_norm": 0.7132277488708496, + "learning_rate": 1.5412056952052955e-05, + "loss": 0.4841, + "step": 14265 + }, + { + "epoch": 0.3180765322896718, + "grad_norm": 0.7739210724830627, + "learning_rate": 1.5409112396181257e-05, + "loss": 0.2972, + "step": 14270 + }, + { + "epoch": 0.3181879816702919, + "grad_norm": 0.6999651193618774, + "learning_rate": 1.5406167177194134e-05, + "loss": 0.2844, + "step": 14275 + }, + { + "epoch": 0.3182994310509119, + "grad_norm": 0.6541236042976379, + "learning_rate": 1.5403221295452647e-05, + "loss": 0.377, + "step": 14280 + }, + { + "epoch": 0.318410880431532, + "grad_norm": 0.5203682780265808, + "learning_rate": 1.540027475131793e-05, + "loss": 0.4102, + "step": 14285 + }, + { + "epoch": 0.31852232981215206, + "grad_norm": 0.4930424988269806, + "learning_rate": 1.5397327545151214e-05, + "loss": 0.2815, + "step": 14290 + }, + { + "epoch": 0.31863377919277214, + "grad_norm": 0.8869831562042236, + "learning_rate": 1.5394379677313805e-05, + "loss": 0.528, + "step": 14295 + }, + { + "epoch": 0.3187452285733922, + "grad_norm": 0.5916035175323486, + "learning_rate": 1.5391431148167084e-05, + "loss": 0.4043, + "step": 14300 + }, + { + "epoch": 0.3188566779540123, + "grad_norm": 0.538135290145874, + "learning_rate": 1.5388481958072517e-05, + "loss": 0.3168, + "step": 14305 + }, + { + "epoch": 0.3189681273346323, + "grad_norm": 0.3676489591598511, + "learning_rate": 1.538553210739166e-05, + "loss": 0.2399, + "step": 14310 + }, + { + "epoch": 0.3190795767152524, + "grad_norm": 0.5923545360565186, + "learning_rate": 1.5382581596486133e-05, + "loss": 0.3974, + "step": 14315 + }, + { + "epoch": 0.31919102609587247, + "grad_norm": 0.47751757502555847, + "learning_rate": 1.537963042571765e-05, + "loss": 0.2995, + "step": 14320 + }, + { + "epoch": 0.31930247547649254, + "grad_norm": 0.8567208051681519, + "learning_rate": 1.5376678595448002e-05, + "loss": 0.268, + "step": 14325 + }, + { + "epoch": 0.3194139248571126, + "grad_norm": 0.3531752824783325, + "learning_rate": 1.537372610603906e-05, + "loss": 0.277, + "step": 14330 + }, + { + "epoch": 0.3195253742377327, + "grad_norm": 0.5776049494743347, + "learning_rate": 1.5370772957852774e-05, + "loss": 0.2297, + "step": 14335 + }, + { + "epoch": 0.3196368236183527, + "grad_norm": 0.6118939518928528, + "learning_rate": 1.536781915125118e-05, + "loss": 0.2609, + "step": 14340 + }, + { + "epoch": 0.3197482729989728, + "grad_norm": 0.6777652502059937, + "learning_rate": 1.5364864686596393e-05, + "loss": 0.3072, + "step": 14345 + }, + { + "epoch": 0.31985972237959287, + "grad_norm": 0.547516942024231, + "learning_rate": 1.5361909564250606e-05, + "loss": 0.3243, + "step": 14350 + }, + { + "epoch": 0.31997117176021295, + "grad_norm": 0.6201531887054443, + "learning_rate": 1.5358953784576093e-05, + "loss": 0.3129, + "step": 14355 + }, + { + "epoch": 0.320082621140833, + "grad_norm": 0.3952672481536865, + "learning_rate": 1.5355997347935207e-05, + "loss": 0.3897, + "step": 14360 + }, + { + "epoch": 0.3201940705214531, + "grad_norm": 0.6198890805244446, + "learning_rate": 1.5353040254690396e-05, + "loss": 0.3446, + "step": 14365 + }, + { + "epoch": 0.3203055199020731, + "grad_norm": 0.5419462323188782, + "learning_rate": 1.5350082505204163e-05, + "loss": 0.3506, + "step": 14370 + }, + { + "epoch": 0.3204169692826932, + "grad_norm": 0.564018964767456, + "learning_rate": 1.5347124099839113e-05, + "loss": 0.3479, + "step": 14375 + }, + { + "epoch": 0.3205284186633133, + "grad_norm": 0.8541130423545837, + "learning_rate": 1.534416503895792e-05, + "loss": 0.3654, + "step": 14380 + }, + { + "epoch": 0.32063986804393335, + "grad_norm": 0.5771231055259705, + "learning_rate": 1.5341205322923344e-05, + "loss": 0.3991, + "step": 14385 + }, + { + "epoch": 0.32075131742455343, + "grad_norm": 0.8024408221244812, + "learning_rate": 1.5338244952098228e-05, + "loss": 0.3302, + "step": 14390 + }, + { + "epoch": 0.3208627668051735, + "grad_norm": 0.649458110332489, + "learning_rate": 1.533528392684548e-05, + "loss": 0.419, + "step": 14395 + }, + { + "epoch": 0.3209742161857935, + "grad_norm": 0.39435210824012756, + "learning_rate": 1.5332322247528105e-05, + "loss": 0.4264, + "step": 14400 + }, + { + "epoch": 0.3210856655664136, + "grad_norm": 0.5552257895469666, + "learning_rate": 1.5329359914509188e-05, + "loss": 0.3141, + "step": 14405 + }, + { + "epoch": 0.3211971149470337, + "grad_norm": 0.6719105243682861, + "learning_rate": 1.5326396928151875e-05, + "loss": 0.2793, + "step": 14410 + }, + { + "epoch": 0.32130856432765376, + "grad_norm": 0.52122563123703, + "learning_rate": 1.5323433288819414e-05, + "loss": 0.2675, + "step": 14415 + }, + { + "epoch": 0.32142001370827383, + "grad_norm": 0.5365925431251526, + "learning_rate": 1.5320468996875125e-05, + "loss": 0.4213, + "step": 14420 + }, + { + "epoch": 0.3215314630888939, + "grad_norm": 0.5100508332252502, + "learning_rate": 1.53175040526824e-05, + "loss": 0.4192, + "step": 14425 + }, + { + "epoch": 0.32164291246951393, + "grad_norm": 0.600318193435669, + "learning_rate": 1.531453845660473e-05, + "loss": 0.3496, + "step": 14430 + }, + { + "epoch": 0.321754361850134, + "grad_norm": 0.5246594548225403, + "learning_rate": 1.5311572209005662e-05, + "loss": 0.3804, + "step": 14435 + }, + { + "epoch": 0.3218658112307541, + "grad_norm": 0.5853553414344788, + "learning_rate": 1.5308605310248837e-05, + "loss": 0.446, + "step": 14440 + }, + { + "epoch": 0.32197726061137416, + "grad_norm": 0.48245969414711, + "learning_rate": 1.530563776069798e-05, + "loss": 0.2857, + "step": 14445 + }, + { + "epoch": 0.32208870999199424, + "grad_norm": 0.4787314832210541, + "learning_rate": 1.5302669560716892e-05, + "loss": 0.409, + "step": 14450 + }, + { + "epoch": 0.3222001593726143, + "grad_norm": 0.40312430262565613, + "learning_rate": 1.529970071066944e-05, + "loss": 0.3718, + "step": 14455 + }, + { + "epoch": 0.32231160875323434, + "grad_norm": 0.5702295899391174, + "learning_rate": 1.529673121091959e-05, + "loss": 0.3722, + "step": 14460 + }, + { + "epoch": 0.3224230581338544, + "grad_norm": 0.5956227779388428, + "learning_rate": 1.529376106183138e-05, + "loss": 0.2894, + "step": 14465 + }, + { + "epoch": 0.3225345075144745, + "grad_norm": 0.5669090747833252, + "learning_rate": 1.529079026376892e-05, + "loss": 0.2773, + "step": 14470 + }, + { + "epoch": 0.32264595689509457, + "grad_norm": 1.0934644937515259, + "learning_rate": 1.528781881709642e-05, + "loss": 0.4747, + "step": 14475 + }, + { + "epoch": 0.32275740627571464, + "grad_norm": 0.4202762544155121, + "learning_rate": 1.5284846722178145e-05, + "loss": 0.4186, + "step": 14480 + }, + { + "epoch": 0.32286885565633466, + "grad_norm": 0.7583221793174744, + "learning_rate": 1.5281873979378453e-05, + "loss": 0.3649, + "step": 14485 + }, + { + "epoch": 0.32298030503695474, + "grad_norm": 0.7628973126411438, + "learning_rate": 1.5278900589061786e-05, + "loss": 0.4465, + "step": 14490 + }, + { + "epoch": 0.3230917544175748, + "grad_norm": 0.4392721354961395, + "learning_rate": 1.527592655159265e-05, + "loss": 0.3713, + "step": 14495 + }, + { + "epoch": 0.3232032037981949, + "grad_norm": 0.5354841947555542, + "learning_rate": 1.527295186733564e-05, + "loss": 0.4206, + "step": 14500 + }, + { + "epoch": 0.32331465317881497, + "grad_norm": 0.6088135838508606, + "learning_rate": 1.5269976536655432e-05, + "loss": 0.3972, + "step": 14505 + }, + { + "epoch": 0.32342610255943505, + "grad_norm": 0.7526804208755493, + "learning_rate": 1.5267000559916786e-05, + "loss": 0.3269, + "step": 14510 + }, + { + "epoch": 0.32353755194005507, + "grad_norm": 0.4515046775341034, + "learning_rate": 1.526402393748452e-05, + "loss": 0.3799, + "step": 14515 + }, + { + "epoch": 0.32364900132067514, + "grad_norm": 0.5819978713989258, + "learning_rate": 1.526104666972355e-05, + "loss": 0.2439, + "step": 14520 + }, + { + "epoch": 0.3237604507012952, + "grad_norm": 0.5523484945297241, + "learning_rate": 1.5258068756998874e-05, + "loss": 0.4368, + "step": 14525 + }, + { + "epoch": 0.3238719000819153, + "grad_norm": 0.5664416551589966, + "learning_rate": 1.5255090199675549e-05, + "loss": 0.2417, + "step": 14530 + }, + { + "epoch": 0.3239833494625354, + "grad_norm": 0.5326518416404724, + "learning_rate": 1.525211099811873e-05, + "loss": 0.2991, + "step": 14535 + }, + { + "epoch": 0.32409479884315545, + "grad_norm": 0.5060669779777527, + "learning_rate": 1.5249131152693644e-05, + "loss": 0.4009, + "step": 14540 + }, + { + "epoch": 0.3242062482237755, + "grad_norm": 0.5437809228897095, + "learning_rate": 1.5246150663765597e-05, + "loss": 0.3516, + "step": 14545 + }, + { + "epoch": 0.32431769760439555, + "grad_norm": 0.47817185521125793, + "learning_rate": 1.5243169531699966e-05, + "loss": 0.312, + "step": 14550 + }, + { + "epoch": 0.3244291469850156, + "grad_norm": 0.7032877206802368, + "learning_rate": 1.5240187756862227e-05, + "loss": 0.3905, + "step": 14555 + }, + { + "epoch": 0.3245405963656357, + "grad_norm": 0.49623337388038635, + "learning_rate": 1.5237205339617917e-05, + "loss": 0.3021, + "step": 14560 + }, + { + "epoch": 0.3246520457462558, + "grad_norm": 0.42450109124183655, + "learning_rate": 1.5234222280332658e-05, + "loss": 0.3865, + "step": 14565 + }, + { + "epoch": 0.32476349512687586, + "grad_norm": 0.583397388458252, + "learning_rate": 1.5231238579372145e-05, + "loss": 0.4173, + "step": 14570 + }, + { + "epoch": 0.3248749445074959, + "grad_norm": 0.40309199690818787, + "learning_rate": 1.5228254237102164e-05, + "loss": 0.3815, + "step": 14575 + }, + { + "epoch": 0.32498639388811595, + "grad_norm": 0.6588791608810425, + "learning_rate": 1.522526925388857e-05, + "loss": 0.3177, + "step": 14580 + }, + { + "epoch": 0.32509784326873603, + "grad_norm": 0.5301122665405273, + "learning_rate": 1.5222283630097298e-05, + "loss": 0.3032, + "step": 14585 + }, + { + "epoch": 0.3252092926493561, + "grad_norm": 0.7074852585792542, + "learning_rate": 1.5219297366094364e-05, + "loss": 0.3527, + "step": 14590 + }, + { + "epoch": 0.3253207420299762, + "grad_norm": 0.44125667214393616, + "learning_rate": 1.5216310462245858e-05, + "loss": 0.3452, + "step": 14595 + }, + { + "epoch": 0.32543219141059626, + "grad_norm": 0.4817579686641693, + "learning_rate": 1.5213322918917954e-05, + "loss": 0.3088, + "step": 14600 + }, + { + "epoch": 0.3255436407912163, + "grad_norm": 0.4762619733810425, + "learning_rate": 1.5210334736476898e-05, + "loss": 0.3784, + "step": 14605 + }, + { + "epoch": 0.32565509017183636, + "grad_norm": 0.5899630784988403, + "learning_rate": 1.5207345915289023e-05, + "loss": 0.3804, + "step": 14610 + }, + { + "epoch": 0.32576653955245644, + "grad_norm": 0.38415637612342834, + "learning_rate": 1.5204356455720735e-05, + "loss": 0.4236, + "step": 14615 + }, + { + "epoch": 0.3258779889330765, + "grad_norm": 0.5962705016136169, + "learning_rate": 1.5201366358138514e-05, + "loss": 0.4957, + "step": 14620 + }, + { + "epoch": 0.3259894383136966, + "grad_norm": 0.6160600781440735, + "learning_rate": 1.5198375622908926e-05, + "loss": 0.362, + "step": 14625 + }, + { + "epoch": 0.32610088769431667, + "grad_norm": 0.5748515725135803, + "learning_rate": 1.5195384250398614e-05, + "loss": 0.3849, + "step": 14630 + }, + { + "epoch": 0.3262123370749367, + "grad_norm": 0.6279200315475464, + "learning_rate": 1.5192392240974296e-05, + "loss": 0.4086, + "step": 14635 + }, + { + "epoch": 0.32632378645555676, + "grad_norm": 0.671965479850769, + "learning_rate": 1.5189399595002767e-05, + "loss": 0.2291, + "step": 14640 + }, + { + "epoch": 0.32643523583617684, + "grad_norm": 0.49965712428092957, + "learning_rate": 1.5186406312850901e-05, + "loss": 0.4113, + "step": 14645 + }, + { + "epoch": 0.3265466852167969, + "grad_norm": 0.6102264523506165, + "learning_rate": 1.518341239488566e-05, + "loss": 0.3486, + "step": 14650 + }, + { + "epoch": 0.326658134597417, + "grad_norm": 0.7976998686790466, + "learning_rate": 1.5180417841474063e-05, + "loss": 0.3541, + "step": 14655 + }, + { + "epoch": 0.326769583978037, + "grad_norm": 0.7241818308830261, + "learning_rate": 1.5177422652983227e-05, + "loss": 0.3369, + "step": 14660 + }, + { + "epoch": 0.3268810333586571, + "grad_norm": 0.4444121718406677, + "learning_rate": 1.5174426829780337e-05, + "loss": 0.2688, + "step": 14665 + }, + { + "epoch": 0.32699248273927717, + "grad_norm": 0.5494623184204102, + "learning_rate": 1.5171430372232661e-05, + "loss": 0.4802, + "step": 14670 + }, + { + "epoch": 0.32710393211989724, + "grad_norm": 0.8373895883560181, + "learning_rate": 1.5168433280707536e-05, + "loss": 0.382, + "step": 14675 + }, + { + "epoch": 0.3272153815005173, + "grad_norm": 0.784369707107544, + "learning_rate": 1.5165435555572386e-05, + "loss": 0.3099, + "step": 14680 + }, + { + "epoch": 0.3273268308811374, + "grad_norm": 0.6406736969947815, + "learning_rate": 1.5162437197194707e-05, + "loss": 0.4039, + "step": 14685 + }, + { + "epoch": 0.3274382802617574, + "grad_norm": 0.4019657075405121, + "learning_rate": 1.5159438205942078e-05, + "loss": 0.2698, + "step": 14690 + }, + { + "epoch": 0.3275497296423775, + "grad_norm": 0.5189796090126038, + "learning_rate": 1.5156438582182147e-05, + "loss": 0.3633, + "step": 14695 + }, + { + "epoch": 0.3276611790229976, + "grad_norm": 0.6062685251235962, + "learning_rate": 1.515343832628265e-05, + "loss": 0.4351, + "step": 14700 + }, + { + "epoch": 0.32777262840361765, + "grad_norm": 0.5360396504402161, + "learning_rate": 1.515043743861139e-05, + "loss": 0.3736, + "step": 14705 + }, + { + "epoch": 0.3278840777842377, + "grad_norm": 0.5746144652366638, + "learning_rate": 1.5147435919536254e-05, + "loss": 0.2686, + "step": 14710 + }, + { + "epoch": 0.3279955271648578, + "grad_norm": 0.515260636806488, + "learning_rate": 1.5144433769425212e-05, + "loss": 0.2981, + "step": 14715 + }, + { + "epoch": 0.3281069765454778, + "grad_norm": 0.6790237426757812, + "learning_rate": 1.5141430988646294e-05, + "loss": 0.3976, + "step": 14720 + }, + { + "epoch": 0.3282184259260979, + "grad_norm": 0.5881702899932861, + "learning_rate": 1.5138427577567627e-05, + "loss": 0.3953, + "step": 14725 + }, + { + "epoch": 0.328329875306718, + "grad_norm": 0.4543622136116028, + "learning_rate": 1.5135423536557397e-05, + "loss": 0.4129, + "step": 14730 + }, + { + "epoch": 0.32844132468733805, + "grad_norm": 0.7353951930999756, + "learning_rate": 1.513241886598388e-05, + "loss": 0.2967, + "step": 14735 + }, + { + "epoch": 0.32855277406795813, + "grad_norm": 0.6022626757621765, + "learning_rate": 1.5129413566215432e-05, + "loss": 0.2963, + "step": 14740 + }, + { + "epoch": 0.3286642234485782, + "grad_norm": 0.5606439709663391, + "learning_rate": 1.5126407637620469e-05, + "loss": 0.3192, + "step": 14745 + }, + { + "epoch": 0.32877567282919823, + "grad_norm": 0.6193286180496216, + "learning_rate": 1.51234010805675e-05, + "loss": 0.3371, + "step": 14750 + }, + { + "epoch": 0.3288871222098183, + "grad_norm": 0.5279430150985718, + "learning_rate": 1.5120393895425108e-05, + "loss": 0.2623, + "step": 14755 + }, + { + "epoch": 0.3289985715904384, + "grad_norm": 1.206921100616455, + "learning_rate": 1.5117386082561947e-05, + "loss": 0.2733, + "step": 14760 + }, + { + "epoch": 0.32911002097105846, + "grad_norm": 0.579546332359314, + "learning_rate": 1.5114377642346747e-05, + "loss": 0.19, + "step": 14765 + }, + { + "epoch": 0.32922147035167854, + "grad_norm": 0.5887970924377441, + "learning_rate": 1.511136857514833e-05, + "loss": 0.3501, + "step": 14770 + }, + { + "epoch": 0.3293329197322986, + "grad_norm": 0.9834648370742798, + "learning_rate": 1.510835888133558e-05, + "loss": 0.4039, + "step": 14775 + }, + { + "epoch": 0.32944436911291863, + "grad_norm": 0.5637308955192566, + "learning_rate": 1.510534856127746e-05, + "loss": 0.3722, + "step": 14780 + }, + { + "epoch": 0.3295558184935387, + "grad_norm": 0.7502883672714233, + "learning_rate": 1.5102337615343014e-05, + "loss": 0.3558, + "step": 14785 + }, + { + "epoch": 0.3296672678741588, + "grad_norm": 0.6586925983428955, + "learning_rate": 1.5099326043901361e-05, + "loss": 0.2808, + "step": 14790 + }, + { + "epoch": 0.32977871725477886, + "grad_norm": 0.48584112524986267, + "learning_rate": 1.5096313847321696e-05, + "loss": 0.4449, + "step": 14795 + }, + { + "epoch": 0.32989016663539894, + "grad_norm": 0.6589898467063904, + "learning_rate": 1.5093301025973289e-05, + "loss": 0.2944, + "step": 14800 + }, + { + "epoch": 0.330001616016019, + "grad_norm": 0.6377086639404297, + "learning_rate": 1.5090287580225492e-05, + "loss": 0.3999, + "step": 14805 + }, + { + "epoch": 0.33011306539663904, + "grad_norm": 0.5869620442390442, + "learning_rate": 1.5087273510447726e-05, + "loss": 0.3394, + "step": 14810 + }, + { + "epoch": 0.3302245147772591, + "grad_norm": 0.5796343684196472, + "learning_rate": 1.5084258817009496e-05, + "loss": 0.2943, + "step": 14815 + }, + { + "epoch": 0.3303359641578792, + "grad_norm": 0.5942478775978088, + "learning_rate": 1.5081243500280383e-05, + "loss": 0.2228, + "step": 14820 + }, + { + "epoch": 0.33044741353849927, + "grad_norm": 0.589439868927002, + "learning_rate": 1.5078227560630035e-05, + "loss": 0.3606, + "step": 14825 + }, + { + "epoch": 0.33055886291911935, + "grad_norm": 0.563065767288208, + "learning_rate": 1.5075210998428187e-05, + "loss": 0.3575, + "step": 14830 + }, + { + "epoch": 0.3306703122997394, + "grad_norm": 0.607187032699585, + "learning_rate": 1.5072193814044645e-05, + "loss": 0.2671, + "step": 14835 + }, + { + "epoch": 0.33078176168035944, + "grad_norm": 0.6348040103912354, + "learning_rate": 1.506917600784929e-05, + "loss": 0.3359, + "step": 14840 + }, + { + "epoch": 0.3308932110609795, + "grad_norm": 0.6133410930633545, + "learning_rate": 1.506615758021209e-05, + "loss": 0.3673, + "step": 14845 + }, + { + "epoch": 0.3310046604415996, + "grad_norm": 0.5856065154075623, + "learning_rate": 1.5063138531503075e-05, + "loss": 0.3409, + "step": 14850 + }, + { + "epoch": 0.3311161098222197, + "grad_norm": 0.5683413147926331, + "learning_rate": 1.5060118862092354e-05, + "loss": 0.4316, + "step": 14855 + }, + { + "epoch": 0.33122755920283975, + "grad_norm": 0.5754731893539429, + "learning_rate": 1.505709857235012e-05, + "loss": 0.3709, + "step": 14860 + }, + { + "epoch": 0.33133900858345977, + "grad_norm": 0.4092066287994385, + "learning_rate": 1.5054077662646633e-05, + "loss": 0.3438, + "step": 14865 + }, + { + "epoch": 0.33145045796407985, + "grad_norm": 0.6222695112228394, + "learning_rate": 1.505105613335224e-05, + "loss": 0.363, + "step": 14870 + }, + { + "epoch": 0.3315619073446999, + "grad_norm": 0.6781238317489624, + "learning_rate": 1.5048033984837352e-05, + "loss": 0.2977, + "step": 14875 + }, + { + "epoch": 0.33167335672532, + "grad_norm": 0.4596276879310608, + "learning_rate": 1.5045011217472463e-05, + "loss": 0.3093, + "step": 14880 + }, + { + "epoch": 0.3317848061059401, + "grad_norm": 0.3854585289955139, + "learning_rate": 1.5041987831628137e-05, + "loss": 0.4025, + "step": 14885 + }, + { + "epoch": 0.33189625548656015, + "grad_norm": 0.5611217021942139, + "learning_rate": 1.5038963827675024e-05, + "loss": 0.3191, + "step": 14890 + }, + { + "epoch": 0.3320077048671802, + "grad_norm": 0.48890677094459534, + "learning_rate": 1.503593920598384e-05, + "loss": 0.3349, + "step": 14895 + }, + { + "epoch": 0.33211915424780025, + "grad_norm": 0.6745815277099609, + "learning_rate": 1.5032913966925382e-05, + "loss": 0.2311, + "step": 14900 + }, + { + "epoch": 0.33223060362842033, + "grad_norm": 0.5865761041641235, + "learning_rate": 1.5029888110870516e-05, + "loss": 0.3149, + "step": 14905 + }, + { + "epoch": 0.3323420530090404, + "grad_norm": 0.6866218447685242, + "learning_rate": 1.5026861638190196e-05, + "loss": 0.3871, + "step": 14910 + }, + { + "epoch": 0.3324535023896605, + "grad_norm": 0.6699501872062683, + "learning_rate": 1.5023834549255441e-05, + "loss": 0.4077, + "step": 14915 + }, + { + "epoch": 0.33256495177028056, + "grad_norm": 0.6308091878890991, + "learning_rate": 1.5020806844437345e-05, + "loss": 0.3688, + "step": 14920 + }, + { + "epoch": 0.3326764011509006, + "grad_norm": 0.5703872442245483, + "learning_rate": 1.5017778524107088e-05, + "loss": 0.3708, + "step": 14925 + }, + { + "epoch": 0.33278785053152066, + "grad_norm": 0.7351306676864624, + "learning_rate": 1.5014749588635914e-05, + "loss": 0.358, + "step": 14930 + }, + { + "epoch": 0.33289929991214073, + "grad_norm": 0.9598005414009094, + "learning_rate": 1.5011720038395145e-05, + "loss": 0.2703, + "step": 14935 + }, + { + "epoch": 0.3330107492927608, + "grad_norm": 0.46301475167274475, + "learning_rate": 1.5008689873756189e-05, + "loss": 0.3201, + "step": 14940 + }, + { + "epoch": 0.3331221986733809, + "grad_norm": 0.6332523822784424, + "learning_rate": 1.5005659095090513e-05, + "loss": 0.3842, + "step": 14945 + }, + { + "epoch": 0.33323364805400096, + "grad_norm": 0.5854825377464294, + "learning_rate": 1.500262770276967e-05, + "loss": 0.3669, + "step": 14950 + }, + { + "epoch": 0.333345097434621, + "grad_norm": 0.5210273861885071, + "learning_rate": 1.4999595697165286e-05, + "loss": 0.2296, + "step": 14955 + }, + { + "epoch": 0.33345654681524106, + "grad_norm": 0.6245068311691284, + "learning_rate": 1.499656307864906e-05, + "loss": 0.2669, + "step": 14960 + }, + { + "epoch": 0.33356799619586114, + "grad_norm": 0.5508987903594971, + "learning_rate": 1.4993529847592766e-05, + "loss": 0.326, + "step": 14965 + }, + { + "epoch": 0.3336794455764812, + "grad_norm": 0.37212294340133667, + "learning_rate": 1.499049600436826e-05, + "loss": 0.3557, + "step": 14970 + }, + { + "epoch": 0.3337908949571013, + "grad_norm": 0.4018285274505615, + "learning_rate": 1.4987461549347462e-05, + "loss": 0.2277, + "step": 14975 + }, + { + "epoch": 0.33390234433772137, + "grad_norm": 0.5052530765533447, + "learning_rate": 1.4984426482902377e-05, + "loss": 0.2757, + "step": 14980 + }, + { + "epoch": 0.3340137937183414, + "grad_norm": 0.6615517139434814, + "learning_rate": 1.4981390805405079e-05, + "loss": 0.3558, + "step": 14985 + }, + { + "epoch": 0.33412524309896147, + "grad_norm": 0.6946066617965698, + "learning_rate": 1.497835451722772e-05, + "loss": 0.4595, + "step": 14990 + }, + { + "epoch": 0.33423669247958154, + "grad_norm": 0.4988650977611542, + "learning_rate": 1.4975317618742518e-05, + "loss": 0.2057, + "step": 14995 + }, + { + "epoch": 0.3343481418602016, + "grad_norm": 0.43746042251586914, + "learning_rate": 1.4972280110321787e-05, + "loss": 0.2913, + "step": 15000 + }, + { + "epoch": 0.3344595912408217, + "grad_norm": 0.352634459733963, + "learning_rate": 1.4969241992337891e-05, + "loss": 0.2988, + "step": 15005 + }, + { + "epoch": 0.3345710406214418, + "grad_norm": 0.8164475560188293, + "learning_rate": 1.4966203265163284e-05, + "loss": 0.428, + "step": 15010 + }, + { + "epoch": 0.3346824900020618, + "grad_norm": 0.6539212465286255, + "learning_rate": 1.496316392917049e-05, + "loss": 0.3029, + "step": 15015 + }, + { + "epoch": 0.33479393938268187, + "grad_norm": 0.6618625521659851, + "learning_rate": 1.4960123984732109e-05, + "loss": 0.2792, + "step": 15020 + }, + { + "epoch": 0.33490538876330195, + "grad_norm": 0.6794106960296631, + "learning_rate": 1.4957083432220811e-05, + "loss": 0.2843, + "step": 15025 + }, + { + "epoch": 0.335016838143922, + "grad_norm": 0.46583518385887146, + "learning_rate": 1.4954042272009348e-05, + "loss": 0.235, + "step": 15030 + }, + { + "epoch": 0.3351282875245421, + "grad_norm": 0.6059388518333435, + "learning_rate": 1.4951000504470543e-05, + "loss": 0.3461, + "step": 15035 + }, + { + "epoch": 0.3352397369051621, + "grad_norm": 0.4841199517250061, + "learning_rate": 1.4947958129977292e-05, + "loss": 0.4458, + "step": 15040 + }, + { + "epoch": 0.3353511862857822, + "grad_norm": 0.56889408826828, + "learning_rate": 1.4944915148902564e-05, + "loss": 0.2941, + "step": 15045 + }, + { + "epoch": 0.3354626356664023, + "grad_norm": 0.741605281829834, + "learning_rate": 1.494187156161941e-05, + "loss": 0.3782, + "step": 15050 + }, + { + "epoch": 0.33557408504702235, + "grad_norm": 0.6679457426071167, + "learning_rate": 1.493882736850095e-05, + "loss": 0.3299, + "step": 15055 + }, + { + "epoch": 0.33568553442764243, + "grad_norm": 0.5985062122344971, + "learning_rate": 1.4935782569920373e-05, + "loss": 0.2703, + "step": 15060 + }, + { + "epoch": 0.3357969838082625, + "grad_norm": 0.44864389300346375, + "learning_rate": 1.4932737166250953e-05, + "loss": 0.4695, + "step": 15065 + }, + { + "epoch": 0.3359084331888825, + "grad_norm": 0.6617417931556702, + "learning_rate": 1.492969115786603e-05, + "loss": 0.3258, + "step": 15070 + }, + { + "epoch": 0.3360198825695026, + "grad_norm": 0.41781333088874817, + "learning_rate": 1.4926644545139025e-05, + "loss": 0.1761, + "step": 15075 + }, + { + "epoch": 0.3361313319501227, + "grad_norm": 0.5821076035499573, + "learning_rate": 1.4923597328443423e-05, + "loss": 0.423, + "step": 15080 + }, + { + "epoch": 0.33624278133074276, + "grad_norm": 0.6323464512825012, + "learning_rate": 1.4920549508152795e-05, + "loss": 0.4038, + "step": 15085 + }, + { + "epoch": 0.33635423071136283, + "grad_norm": 0.5762990117073059, + "learning_rate": 1.4917501084640777e-05, + "loss": 0.3248, + "step": 15090 + }, + { + "epoch": 0.3364656800919829, + "grad_norm": 0.8302823305130005, + "learning_rate": 1.4914452058281086e-05, + "loss": 0.3412, + "step": 15095 + }, + { + "epoch": 0.33657712947260293, + "grad_norm": 0.5417741537094116, + "learning_rate": 1.4911402429447503e-05, + "loss": 0.3437, + "step": 15100 + }, + { + "epoch": 0.336688578853223, + "grad_norm": 0.6161333322525024, + "learning_rate": 1.4908352198513894e-05, + "loss": 0.3852, + "step": 15105 + }, + { + "epoch": 0.3368000282338431, + "grad_norm": 0.49575358629226685, + "learning_rate": 1.4905301365854193e-05, + "loss": 0.2876, + "step": 15110 + }, + { + "epoch": 0.33691147761446316, + "grad_norm": 0.5573312640190125, + "learning_rate": 1.4902249931842407e-05, + "loss": 0.3939, + "step": 15115 + }, + { + "epoch": 0.33702292699508324, + "grad_norm": 0.5083134770393372, + "learning_rate": 1.4899197896852618e-05, + "loss": 0.2197, + "step": 15120 + }, + { + "epoch": 0.3371343763757033, + "grad_norm": 0.5789259672164917, + "learning_rate": 1.4896145261258982e-05, + "loss": 0.3829, + "step": 15125 + }, + { + "epoch": 0.33724582575632334, + "grad_norm": 0.4595588445663452, + "learning_rate": 1.4893092025435733e-05, + "loss": 0.2697, + "step": 15130 + }, + { + "epoch": 0.3373572751369434, + "grad_norm": 0.5526938438415527, + "learning_rate": 1.4890038189757166e-05, + "loss": 0.3557, + "step": 15135 + }, + { + "epoch": 0.3374687245175635, + "grad_norm": 0.5678578019142151, + "learning_rate": 1.4886983754597667e-05, + "loss": 0.3549, + "step": 15140 + }, + { + "epoch": 0.33758017389818357, + "grad_norm": 0.5942708849906921, + "learning_rate": 1.4883928720331677e-05, + "loss": 0.2335, + "step": 15145 + }, + { + "epoch": 0.33769162327880364, + "grad_norm": 0.5128694772720337, + "learning_rate": 1.4880873087333726e-05, + "loss": 0.3975, + "step": 15150 + }, + { + "epoch": 0.3378030726594237, + "grad_norm": 0.6574785709381104, + "learning_rate": 1.4877816855978409e-05, + "loss": 0.2745, + "step": 15155 + }, + { + "epoch": 0.33791452204004374, + "grad_norm": 0.7634928822517395, + "learning_rate": 1.4874760026640398e-05, + "loss": 0.3363, + "step": 15160 + }, + { + "epoch": 0.3380259714206638, + "grad_norm": 0.5223698616027832, + "learning_rate": 1.4871702599694433e-05, + "loss": 0.3408, + "step": 15165 + }, + { + "epoch": 0.3381374208012839, + "grad_norm": 0.46809354424476624, + "learning_rate": 1.4868644575515334e-05, + "loss": 0.4533, + "step": 15170 + }, + { + "epoch": 0.33824887018190397, + "grad_norm": 0.6846073865890503, + "learning_rate": 1.486558595447799e-05, + "loss": 0.3388, + "step": 15175 + }, + { + "epoch": 0.33836031956252405, + "grad_norm": 0.6507226824760437, + "learning_rate": 1.4862526736957363e-05, + "loss": 0.3976, + "step": 15180 + }, + { + "epoch": 0.3384717689431441, + "grad_norm": 0.7572437524795532, + "learning_rate": 1.4859466923328494e-05, + "loss": 0.3125, + "step": 15185 + }, + { + "epoch": 0.33858321832376415, + "grad_norm": 0.49744054675102234, + "learning_rate": 1.4856406513966487e-05, + "loss": 0.3406, + "step": 15190 + }, + { + "epoch": 0.3386946677043842, + "grad_norm": 0.5798356533050537, + "learning_rate": 1.4853345509246528e-05, + "loss": 0.4069, + "step": 15195 + }, + { + "epoch": 0.3388061170850043, + "grad_norm": 0.5558682680130005, + "learning_rate": 1.4850283909543873e-05, + "loss": 0.4139, + "step": 15200 + }, + { + "epoch": 0.3389175664656244, + "grad_norm": 0.6416339874267578, + "learning_rate": 1.4847221715233846e-05, + "loss": 0.3324, + "step": 15205 + }, + { + "epoch": 0.33902901584624445, + "grad_norm": 0.6704056262969971, + "learning_rate": 1.4844158926691857e-05, + "loss": 0.4212, + "step": 15210 + }, + { + "epoch": 0.33914046522686453, + "grad_norm": 0.5458512306213379, + "learning_rate": 1.4841095544293369e-05, + "loss": 0.2544, + "step": 15215 + }, + { + "epoch": 0.33925191460748455, + "grad_norm": 0.22037500143051147, + "learning_rate": 1.4838031568413937e-05, + "loss": 0.2862, + "step": 15220 + }, + { + "epoch": 0.3393633639881046, + "grad_norm": 0.5767005681991577, + "learning_rate": 1.4834966999429179e-05, + "loss": 0.3191, + "step": 15225 + }, + { + "epoch": 0.3394748133687247, + "grad_norm": 0.5186335444450378, + "learning_rate": 1.4831901837714786e-05, + "loss": 0.3202, + "step": 15230 + }, + { + "epoch": 0.3395862627493448, + "grad_norm": 0.5099804997444153, + "learning_rate": 1.4828836083646526e-05, + "loss": 0.2643, + "step": 15235 + }, + { + "epoch": 0.33969771212996486, + "grad_norm": 0.7531489133834839, + "learning_rate": 1.4825769737600232e-05, + "loss": 0.3556, + "step": 15240 + }, + { + "epoch": 0.3398091615105849, + "grad_norm": 1.1307042837142944, + "learning_rate": 1.482270279995182e-05, + "loss": 0.1962, + "step": 15245 + }, + { + "epoch": 0.33992061089120496, + "grad_norm": 0.8670387268066406, + "learning_rate": 1.4819635271077269e-05, + "loss": 0.3373, + "step": 15250 + }, + { + "epoch": 0.34003206027182503, + "grad_norm": 0.7208353281021118, + "learning_rate": 1.4816567151352637e-05, + "loss": 0.3233, + "step": 15255 + }, + { + "epoch": 0.3401435096524451, + "grad_norm": 0.5600586533546448, + "learning_rate": 1.4813498441154048e-05, + "loss": 0.4405, + "step": 15260 + }, + { + "epoch": 0.3402549590330652, + "grad_norm": 0.7927045822143555, + "learning_rate": 1.4810429140857707e-05, + "loss": 0.427, + "step": 15265 + }, + { + "epoch": 0.34036640841368526, + "grad_norm": 0.6728307604789734, + "learning_rate": 1.4807359250839886e-05, + "loss": 0.4482, + "step": 15270 + }, + { + "epoch": 0.3404778577943053, + "grad_norm": 0.5486059784889221, + "learning_rate": 1.4804288771476923e-05, + "loss": 0.3259, + "step": 15275 + }, + { + "epoch": 0.34058930717492536, + "grad_norm": 0.6788325905799866, + "learning_rate": 1.4801217703145243e-05, + "loss": 0.4149, + "step": 15280 + }, + { + "epoch": 0.34070075655554544, + "grad_norm": 0.6441235542297363, + "learning_rate": 1.4798146046221332e-05, + "loss": 0.3206, + "step": 15285 + }, + { + "epoch": 0.3408122059361655, + "grad_norm": 0.6822008490562439, + "learning_rate": 1.4795073801081752e-05, + "loss": 0.3903, + "step": 15290 + }, + { + "epoch": 0.3409236553167856, + "grad_norm": 0.6025763750076294, + "learning_rate": 1.4792000968103132e-05, + "loss": 0.3727, + "step": 15295 + }, + { + "epoch": 0.34103510469740567, + "grad_norm": 0.6513890027999878, + "learning_rate": 1.4788927547662188e-05, + "loss": 0.367, + "step": 15300 + }, + { + "epoch": 0.3411465540780257, + "grad_norm": 0.5683618783950806, + "learning_rate": 1.4785853540135689e-05, + "loss": 0.2873, + "step": 15305 + }, + { + "epoch": 0.34125800345864576, + "grad_norm": 0.6003490090370178, + "learning_rate": 1.4782778945900486e-05, + "loss": 0.5232, + "step": 15310 + }, + { + "epoch": 0.34136945283926584, + "grad_norm": 0.47557592391967773, + "learning_rate": 1.4779703765333504e-05, + "loss": 0.309, + "step": 15315 + }, + { + "epoch": 0.3414809022198859, + "grad_norm": 0.4998873174190521, + "learning_rate": 1.4776627998811733e-05, + "loss": 0.3484, + "step": 15320 + }, + { + "epoch": 0.341592351600506, + "grad_norm": 0.5946129560470581, + "learning_rate": 1.4773551646712242e-05, + "loss": 0.2308, + "step": 15325 + }, + { + "epoch": 0.34170380098112607, + "grad_norm": 0.5998285412788391, + "learning_rate": 1.477047470941216e-05, + "loss": 0.3165, + "step": 15330 + }, + { + "epoch": 0.3418152503617461, + "grad_norm": 0.5759714841842651, + "learning_rate": 1.47673971872887e-05, + "loss": 0.3408, + "step": 15335 + }, + { + "epoch": 0.34192669974236617, + "grad_norm": 0.8474906086921692, + "learning_rate": 1.4764319080719149e-05, + "loss": 0.2878, + "step": 15340 + }, + { + "epoch": 0.34203814912298625, + "grad_norm": 0.262577086687088, + "learning_rate": 1.476124039008085e-05, + "loss": 0.335, + "step": 15345 + }, + { + "epoch": 0.3421495985036063, + "grad_norm": 0.788374662399292, + "learning_rate": 1.4758161115751234e-05, + "loss": 0.2989, + "step": 15350 + }, + { + "epoch": 0.3422610478842264, + "grad_norm": 0.6692611575126648, + "learning_rate": 1.475508125810779e-05, + "loss": 0.3803, + "step": 15355 + }, + { + "epoch": 0.3423724972648465, + "grad_norm": 0.7293594479560852, + "learning_rate": 1.475200081752809e-05, + "loss": 0.4263, + "step": 15360 + }, + { + "epoch": 0.3424839466454665, + "grad_norm": 0.689411997795105, + "learning_rate": 1.4748919794389767e-05, + "loss": 0.4555, + "step": 15365 + }, + { + "epoch": 0.3425953960260866, + "grad_norm": 0.5694615244865417, + "learning_rate": 1.4745838189070531e-05, + "loss": 0.4428, + "step": 15370 + }, + { + "epoch": 0.34270684540670665, + "grad_norm": 0.8198974132537842, + "learning_rate": 1.4742756001948171e-05, + "loss": 0.2321, + "step": 15375 + }, + { + "epoch": 0.3428182947873267, + "grad_norm": 0.3840519189834595, + "learning_rate": 1.4739673233400528e-05, + "loss": 0.3065, + "step": 15380 + }, + { + "epoch": 0.3429297441679468, + "grad_norm": 0.5746693015098572, + "learning_rate": 1.4736589883805535e-05, + "loss": 0.2873, + "step": 15385 + }, + { + "epoch": 0.3430411935485669, + "grad_norm": 0.7387751936912537, + "learning_rate": 1.473350595354118e-05, + "loss": 0.4415, + "step": 15390 + }, + { + "epoch": 0.3431526429291869, + "grad_norm": 0.6387646198272705, + "learning_rate": 1.4730421442985534e-05, + "loss": 0.413, + "step": 15395 + }, + { + "epoch": 0.343264092309807, + "grad_norm": 0.7444597482681274, + "learning_rate": 1.4727336352516733e-05, + "loss": 0.4287, + "step": 15400 + }, + { + "epoch": 0.34337554169042706, + "grad_norm": 0.6035224795341492, + "learning_rate": 1.4724250682512985e-05, + "loss": 0.2612, + "step": 15405 + }, + { + "epoch": 0.34348699107104713, + "grad_norm": 0.6849896907806396, + "learning_rate": 1.4721164433352568e-05, + "loss": 0.2332, + "step": 15410 + }, + { + "epoch": 0.3435984404516672, + "grad_norm": 0.5554786920547485, + "learning_rate": 1.4718077605413832e-05, + "loss": 0.4821, + "step": 15415 + }, + { + "epoch": 0.34370988983228723, + "grad_norm": 0.5494076013565063, + "learning_rate": 1.47149901990752e-05, + "loss": 0.2667, + "step": 15420 + }, + { + "epoch": 0.3438213392129073, + "grad_norm": 0.6117123961448669, + "learning_rate": 1.4711902214715165e-05, + "loss": 0.4004, + "step": 15425 + }, + { + "epoch": 0.3439327885935274, + "grad_norm": 0.6088406443595886, + "learning_rate": 1.4708813652712287e-05, + "loss": 0.3147, + "step": 15430 + }, + { + "epoch": 0.34404423797414746, + "grad_norm": 0.605383038520813, + "learning_rate": 1.4705724513445204e-05, + "loss": 0.2673, + "step": 15435 + }, + { + "epoch": 0.34415568735476754, + "grad_norm": 0.9843233823776245, + "learning_rate": 1.4702634797292618e-05, + "loss": 0.459, + "step": 15440 + }, + { + "epoch": 0.3442671367353876, + "grad_norm": 0.6190301179885864, + "learning_rate": 1.4699544504633303e-05, + "loss": 0.3502, + "step": 15445 + }, + { + "epoch": 0.34437858611600763, + "grad_norm": 0.4855058193206787, + "learning_rate": 1.4696453635846109e-05, + "loss": 0.3083, + "step": 15450 + }, + { + "epoch": 0.3444900354966277, + "grad_norm": 0.8662997484207153, + "learning_rate": 1.4693362191309948e-05, + "loss": 0.3554, + "step": 15455 + }, + { + "epoch": 0.3446014848772478, + "grad_norm": 0.710433840751648, + "learning_rate": 1.4690270171403809e-05, + "loss": 0.3623, + "step": 15460 + }, + { + "epoch": 0.34471293425786786, + "grad_norm": 0.4728683531284332, + "learning_rate": 1.4687177576506752e-05, + "loss": 0.3923, + "step": 15465 + }, + { + "epoch": 0.34482438363848794, + "grad_norm": 0.3298479914665222, + "learning_rate": 1.4684084406997903e-05, + "loss": 0.3616, + "step": 15470 + }, + { + "epoch": 0.344935833019108, + "grad_norm": 0.8117642402648926, + "learning_rate": 1.468099066325646e-05, + "loss": 0.3516, + "step": 15475 + }, + { + "epoch": 0.34504728239972804, + "grad_norm": 0.5667406916618347, + "learning_rate": 1.4677896345661697e-05, + "loss": 0.3093, + "step": 15480 + }, + { + "epoch": 0.3451587317803481, + "grad_norm": 0.47480669617652893, + "learning_rate": 1.4674801454592949e-05, + "loss": 0.3733, + "step": 15485 + }, + { + "epoch": 0.3452701811609682, + "grad_norm": 0.670015811920166, + "learning_rate": 1.467170599042962e-05, + "loss": 0.3697, + "step": 15490 + }, + { + "epoch": 0.34538163054158827, + "grad_norm": 0.5766051411628723, + "learning_rate": 1.4668609953551205e-05, + "loss": 0.3838, + "step": 15495 + }, + { + "epoch": 0.34549307992220835, + "grad_norm": 1.1956382989883423, + "learning_rate": 1.4665513344337243e-05, + "loss": 0.3496, + "step": 15500 + }, + { + "epoch": 0.3456045293028284, + "grad_norm": 0.8127216696739197, + "learning_rate": 1.4662416163167354e-05, + "loss": 0.2764, + "step": 15505 + }, + { + "epoch": 0.34571597868344844, + "grad_norm": 0.693279504776001, + "learning_rate": 1.4659318410421237e-05, + "loss": 0.5523, + "step": 15510 + }, + { + "epoch": 0.3458274280640685, + "grad_norm": 0.696718692779541, + "learning_rate": 1.4656220086478645e-05, + "loss": 0.2758, + "step": 15515 + }, + { + "epoch": 0.3459388774446886, + "grad_norm": 0.41814491152763367, + "learning_rate": 1.465312119171941e-05, + "loss": 0.2746, + "step": 15520 + }, + { + "epoch": 0.3460503268253087, + "grad_norm": 0.6163225769996643, + "learning_rate": 1.4650021726523433e-05, + "loss": 0.3639, + "step": 15525 + }, + { + "epoch": 0.34616177620592875, + "grad_norm": 0.5217748880386353, + "learning_rate": 1.4646921691270688e-05, + "loss": 0.3989, + "step": 15530 + }, + { + "epoch": 0.3462732255865488, + "grad_norm": 0.5386656522750854, + "learning_rate": 1.464382108634121e-05, + "loss": 0.4572, + "step": 15535 + }, + { + "epoch": 0.34638467496716885, + "grad_norm": 0.7713764905929565, + "learning_rate": 1.4640719912115114e-05, + "loss": 0.3086, + "step": 15540 + }, + { + "epoch": 0.3464961243477889, + "grad_norm": 0.3597823977470398, + "learning_rate": 1.4637618168972574e-05, + "loss": 0.2486, + "step": 15545 + }, + { + "epoch": 0.346607573728409, + "grad_norm": 0.7092816233634949, + "learning_rate": 1.4634515857293845e-05, + "loss": 0.3379, + "step": 15550 + }, + { + "epoch": 0.3467190231090291, + "grad_norm": 0.7376322746276855, + "learning_rate": 1.4631412977459248e-05, + "loss": 0.3819, + "step": 15555 + }, + { + "epoch": 0.34683047248964916, + "grad_norm": 0.623254656791687, + "learning_rate": 1.4628309529849164e-05, + "loss": 0.3446, + "step": 15560 + }, + { + "epoch": 0.34694192187026923, + "grad_norm": 0.6260783076286316, + "learning_rate": 1.4625205514844061e-05, + "loss": 0.385, + "step": 15565 + }, + { + "epoch": 0.34705337125088925, + "grad_norm": 0.5267546772956848, + "learning_rate": 1.462210093282446e-05, + "loss": 0.2952, + "step": 15570 + }, + { + "epoch": 0.34716482063150933, + "grad_norm": 0.6364974975585938, + "learning_rate": 1.4618995784170961e-05, + "loss": 0.3519, + "step": 15575 + }, + { + "epoch": 0.3472762700121294, + "grad_norm": 0.6822920441627502, + "learning_rate": 1.4615890069264237e-05, + "loss": 0.3488, + "step": 15580 + }, + { + "epoch": 0.3473877193927495, + "grad_norm": 0.5061941742897034, + "learning_rate": 1.4612783788485014e-05, + "loss": 0.25, + "step": 15585 + }, + { + "epoch": 0.34749916877336956, + "grad_norm": 0.5946047902107239, + "learning_rate": 1.4609676942214108e-05, + "loss": 0.4002, + "step": 15590 + }, + { + "epoch": 0.34761061815398964, + "grad_norm": 0.5874724388122559, + "learning_rate": 1.4606569530832385e-05, + "loss": 0.1921, + "step": 15595 + }, + { + "epoch": 0.34772206753460966, + "grad_norm": 0.789135754108429, + "learning_rate": 1.46034615547208e-05, + "loss": 0.3302, + "step": 15600 + }, + { + "epoch": 0.34783351691522973, + "grad_norm": 0.846127986907959, + "learning_rate": 1.460035301426036e-05, + "loss": 0.3297, + "step": 15605 + }, + { + "epoch": 0.3479449662958498, + "grad_norm": 0.6058344841003418, + "learning_rate": 1.4597243909832149e-05, + "loss": 0.1896, + "step": 15610 + }, + { + "epoch": 0.3480564156764699, + "grad_norm": 0.6141452789306641, + "learning_rate": 1.459413424181732e-05, + "loss": 0.249, + "step": 15615 + }, + { + "epoch": 0.34816786505708996, + "grad_norm": 0.614362895488739, + "learning_rate": 1.4591024010597094e-05, + "loss": 0.2335, + "step": 15620 + }, + { + "epoch": 0.34827931443771, + "grad_norm": 0.5771622061729431, + "learning_rate": 1.4587913216552765e-05, + "loss": 0.2674, + "step": 15625 + }, + { + "epoch": 0.34839076381833006, + "grad_norm": 0.7412368059158325, + "learning_rate": 1.4584801860065683e-05, + "loss": 0.4087, + "step": 15630 + }, + { + "epoch": 0.34850221319895014, + "grad_norm": 0.6473712921142578, + "learning_rate": 1.4581689941517285e-05, + "loss": 0.3875, + "step": 15635 + }, + { + "epoch": 0.3486136625795702, + "grad_norm": 0.575176477432251, + "learning_rate": 1.4578577461289068e-05, + "loss": 0.3011, + "step": 15640 + }, + { + "epoch": 0.3487251119601903, + "grad_norm": 0.9265764951705933, + "learning_rate": 1.4575464419762592e-05, + "loss": 0.4058, + "step": 15645 + }, + { + "epoch": 0.34883656134081037, + "grad_norm": 0.5604076981544495, + "learning_rate": 1.4572350817319497e-05, + "loss": 0.3007, + "step": 15650 + }, + { + "epoch": 0.3489480107214304, + "grad_norm": 0.7239184975624084, + "learning_rate": 1.4569236654341486e-05, + "loss": 0.4001, + "step": 15655 + }, + { + "epoch": 0.34905946010205047, + "grad_norm": 0.7642486095428467, + "learning_rate": 1.4566121931210326e-05, + "loss": 0.2504, + "step": 15660 + }, + { + "epoch": 0.34917090948267054, + "grad_norm": 0.6194584965705872, + "learning_rate": 1.4563006648307865e-05, + "loss": 0.2743, + "step": 15665 + }, + { + "epoch": 0.3492823588632906, + "grad_norm": 0.4331457316875458, + "learning_rate": 1.455989080601601e-05, + "loss": 0.3977, + "step": 15670 + }, + { + "epoch": 0.3493938082439107, + "grad_norm": 0.4841763973236084, + "learning_rate": 1.4556774404716738e-05, + "loss": 0.3041, + "step": 15675 + }, + { + "epoch": 0.3495052576245308, + "grad_norm": 0.7557531595230103, + "learning_rate": 1.45536574447921e-05, + "loss": 0.3077, + "step": 15680 + }, + { + "epoch": 0.3496167070051508, + "grad_norm": 0.5951617360115051, + "learning_rate": 1.4550539926624206e-05, + "loss": 0.4016, + "step": 15685 + }, + { + "epoch": 0.34972815638577087, + "grad_norm": 0.5638185739517212, + "learning_rate": 1.454742185059524e-05, + "loss": 0.3353, + "step": 15690 + }, + { + "epoch": 0.34983960576639095, + "grad_norm": 0.5489707589149475, + "learning_rate": 1.4544303217087459e-05, + "loss": 0.4071, + "step": 15695 + }, + { + "epoch": 0.349951055147011, + "grad_norm": 0.5517125129699707, + "learning_rate": 1.4541184026483176e-05, + "loss": 0.3495, + "step": 15700 + }, + { + "epoch": 0.3500625045276311, + "grad_norm": 0.5938624143600464, + "learning_rate": 1.4538064279164787e-05, + "loss": 0.3706, + "step": 15705 + }, + { + "epoch": 0.3501739539082512, + "grad_norm": 0.6009719371795654, + "learning_rate": 1.4534943975514746e-05, + "loss": 0.4807, + "step": 15710 + }, + { + "epoch": 0.3502854032888712, + "grad_norm": 0.7321330904960632, + "learning_rate": 1.453182311591558e-05, + "loss": 0.4177, + "step": 15715 + }, + { + "epoch": 0.3503968526694913, + "grad_norm": 0.826553463935852, + "learning_rate": 1.4528701700749876e-05, + "loss": 0.33, + "step": 15720 + }, + { + "epoch": 0.35050830205011135, + "grad_norm": 0.4826429486274719, + "learning_rate": 1.4525579730400301e-05, + "loss": 0.3708, + "step": 15725 + }, + { + "epoch": 0.35061975143073143, + "grad_norm": 0.5381823778152466, + "learning_rate": 1.4522457205249584e-05, + "loss": 0.3267, + "step": 15730 + }, + { + "epoch": 0.3507312008113515, + "grad_norm": 1.0298216342926025, + "learning_rate": 1.4519334125680522e-05, + "loss": 0.3464, + "step": 15735 + }, + { + "epoch": 0.3508426501919716, + "grad_norm": 0.6540161967277527, + "learning_rate": 1.451621049207598e-05, + "loss": 0.3396, + "step": 15740 + }, + { + "epoch": 0.3509540995725916, + "grad_norm": 0.5737782716751099, + "learning_rate": 1.451308630481889e-05, + "loss": 0.3299, + "step": 15745 + }, + { + "epoch": 0.3510655489532117, + "grad_norm": 0.7006227970123291, + "learning_rate": 1.4509961564292257e-05, + "loss": 0.382, + "step": 15750 + }, + { + "epoch": 0.35117699833383176, + "grad_norm": 0.5406222343444824, + "learning_rate": 1.4506836270879143e-05, + "loss": 0.3064, + "step": 15755 + }, + { + "epoch": 0.35128844771445183, + "grad_norm": 0.8543760776519775, + "learning_rate": 1.4503710424962693e-05, + "loss": 0.3355, + "step": 15760 + }, + { + "epoch": 0.3513998970950719, + "grad_norm": 0.7554575204849243, + "learning_rate": 1.450058402692611e-05, + "loss": 0.372, + "step": 15765 + }, + { + "epoch": 0.351511346475692, + "grad_norm": 0.8103945851325989, + "learning_rate": 1.449745707715266e-05, + "loss": 0.3812, + "step": 15770 + }, + { + "epoch": 0.351622795856312, + "grad_norm": 0.5388813614845276, + "learning_rate": 1.4494329576025691e-05, + "loss": 0.3267, + "step": 15775 + }, + { + "epoch": 0.3517342452369321, + "grad_norm": 0.6109858155250549, + "learning_rate": 1.4491201523928608e-05, + "loss": 0.4573, + "step": 15780 + }, + { + "epoch": 0.35184569461755216, + "grad_norm": 0.6300851702690125, + "learning_rate": 1.4488072921244883e-05, + "loss": 0.2466, + "step": 15785 + }, + { + "epoch": 0.35195714399817224, + "grad_norm": 0.6552715301513672, + "learning_rate": 1.4484943768358059e-05, + "loss": 0.3671, + "step": 15790 + }, + { + "epoch": 0.3520685933787923, + "grad_norm": 0.9018075466156006, + "learning_rate": 1.448181406565175e-05, + "loss": 0.3814, + "step": 15795 + }, + { + "epoch": 0.35218004275941234, + "grad_norm": 0.5913105607032776, + "learning_rate": 1.4478683813509629e-05, + "loss": 0.2968, + "step": 15800 + }, + { + "epoch": 0.3522914921400324, + "grad_norm": 0.6036539077758789, + "learning_rate": 1.4475553012315441e-05, + "loss": 0.25, + "step": 15805 + }, + { + "epoch": 0.3524029415206525, + "grad_norm": 0.5692189931869507, + "learning_rate": 1.4472421662453004e-05, + "loss": 0.2111, + "step": 15810 + }, + { + "epoch": 0.35251439090127257, + "grad_norm": 0.676845133304596, + "learning_rate": 1.446928976430619e-05, + "loss": 0.338, + "step": 15815 + }, + { + "epoch": 0.35262584028189264, + "grad_norm": 0.8736307621002197, + "learning_rate": 1.4466157318258952e-05, + "loss": 0.288, + "step": 15820 + }, + { + "epoch": 0.3527372896625127, + "grad_norm": 0.6261550784111023, + "learning_rate": 1.4463024324695295e-05, + "loss": 0.3869, + "step": 15825 + }, + { + "epoch": 0.35284873904313274, + "grad_norm": 0.5824673771858215, + "learning_rate": 1.4459890783999308e-05, + "loss": 0.3307, + "step": 15830 + }, + { + "epoch": 0.3529601884237528, + "grad_norm": 0.6068733334541321, + "learning_rate": 1.4456756696555137e-05, + "loss": 0.3239, + "step": 15835 + }, + { + "epoch": 0.3530716378043729, + "grad_norm": 0.4593876302242279, + "learning_rate": 1.4453622062746995e-05, + "loss": 0.3784, + "step": 15840 + }, + { + "epoch": 0.35318308718499297, + "grad_norm": 0.7499799132347107, + "learning_rate": 1.4450486882959162e-05, + "loss": 0.3009, + "step": 15845 + }, + { + "epoch": 0.35329453656561305, + "grad_norm": 0.7290626168251038, + "learning_rate": 1.4447351157575993e-05, + "loss": 0.3016, + "step": 15850 + }, + { + "epoch": 0.3534059859462331, + "grad_norm": 0.625694990158081, + "learning_rate": 1.44442148869819e-05, + "loss": 0.3197, + "step": 15855 + }, + { + "epoch": 0.35351743532685315, + "grad_norm": 0.6635383367538452, + "learning_rate": 1.4441078071561363e-05, + "loss": 0.2795, + "step": 15860 + }, + { + "epoch": 0.3536288847074732, + "grad_norm": 0.6525633335113525, + "learning_rate": 1.4437940711698936e-05, + "loss": 0.3183, + "step": 15865 + }, + { + "epoch": 0.3537403340880933, + "grad_norm": 0.5589553713798523, + "learning_rate": 1.4434802807779238e-05, + "loss": 0.4735, + "step": 15870 + }, + { + "epoch": 0.3538517834687134, + "grad_norm": 0.6205970048904419, + "learning_rate": 1.4431664360186942e-05, + "loss": 0.2494, + "step": 15875 + }, + { + "epoch": 0.35396323284933345, + "grad_norm": 0.5392980575561523, + "learning_rate": 1.4428525369306803e-05, + "loss": 0.2474, + "step": 15880 + }, + { + "epoch": 0.35407468222995353, + "grad_norm": 0.6593893766403198, + "learning_rate": 1.4425385835523638e-05, + "loss": 0.373, + "step": 15885 + }, + { + "epoch": 0.35418613161057355, + "grad_norm": 1.365324854850769, + "learning_rate": 1.4422245759222326e-05, + "loss": 0.2801, + "step": 15890 + }, + { + "epoch": 0.35429758099119363, + "grad_norm": 0.8453717231750488, + "learning_rate": 1.4419105140787819e-05, + "loss": 0.2617, + "step": 15895 + }, + { + "epoch": 0.3544090303718137, + "grad_norm": 0.4204340875148773, + "learning_rate": 1.4415963980605136e-05, + "loss": 0.2475, + "step": 15900 + }, + { + "epoch": 0.3545204797524338, + "grad_norm": 0.48751235008239746, + "learning_rate": 1.441282227905935e-05, + "loss": 0.3031, + "step": 15905 + }, + { + "epoch": 0.35463192913305386, + "grad_norm": 0.5883186459541321, + "learning_rate": 1.4409680036535618e-05, + "loss": 0.2444, + "step": 15910 + }, + { + "epoch": 0.35474337851367393, + "grad_norm": 0.37586283683776855, + "learning_rate": 1.4406537253419149e-05, + "loss": 0.3133, + "step": 15915 + }, + { + "epoch": 0.35485482789429396, + "grad_norm": 0.6607975959777832, + "learning_rate": 1.4403393930095227e-05, + "loss": 0.4004, + "step": 15920 + }, + { + "epoch": 0.35496627727491403, + "grad_norm": 0.814619243144989, + "learning_rate": 1.4400250066949198e-05, + "loss": 0.4083, + "step": 15925 + }, + { + "epoch": 0.3550777266555341, + "grad_norm": 0.5426995754241943, + "learning_rate": 1.4397105664366474e-05, + "loss": 0.2856, + "step": 15930 + }, + { + "epoch": 0.3551891760361542, + "grad_norm": 0.6581703424453735, + "learning_rate": 1.4393960722732538e-05, + "loss": 0.3279, + "step": 15935 + }, + { + "epoch": 0.35530062541677426, + "grad_norm": 0.58955979347229, + "learning_rate": 1.4390815242432932e-05, + "loss": 0.3484, + "step": 15940 + }, + { + "epoch": 0.35541207479739434, + "grad_norm": 0.49699389934539795, + "learning_rate": 1.438766922385327e-05, + "loss": 0.3763, + "step": 15945 + }, + { + "epoch": 0.35552352417801436, + "grad_norm": 0.4076257646083832, + "learning_rate": 1.4384522667379229e-05, + "loss": 0.2196, + "step": 15950 + }, + { + "epoch": 0.35563497355863444, + "grad_norm": 0.6451223492622375, + "learning_rate": 1.4381375573396552e-05, + "loss": 0.387, + "step": 15955 + }, + { + "epoch": 0.3557464229392545, + "grad_norm": 0.727311372756958, + "learning_rate": 1.437822794229105e-05, + "loss": 0.2124, + "step": 15960 + }, + { + "epoch": 0.3558578723198746, + "grad_norm": 0.48559942841529846, + "learning_rate": 1.4375079774448595e-05, + "loss": 0.4022, + "step": 15965 + }, + { + "epoch": 0.35596932170049467, + "grad_norm": 0.5659855008125305, + "learning_rate": 1.437193107025513e-05, + "loss": 0.2158, + "step": 15970 + }, + { + "epoch": 0.35608077108111474, + "grad_norm": 0.8604714274406433, + "learning_rate": 1.4368781830096662e-05, + "loss": 0.3405, + "step": 15975 + }, + { + "epoch": 0.35619222046173477, + "grad_norm": 0.5525174736976624, + "learning_rate": 1.4365632054359267e-05, + "loss": 0.3046, + "step": 15980 + }, + { + "epoch": 0.35630366984235484, + "grad_norm": 0.6220390200614929, + "learning_rate": 1.4362481743429073e-05, + "loss": 0.2765, + "step": 15985 + }, + { + "epoch": 0.3564151192229749, + "grad_norm": 0.6702668070793152, + "learning_rate": 1.4359330897692296e-05, + "loss": 0.3382, + "step": 15990 + }, + { + "epoch": 0.356526568603595, + "grad_norm": 0.5980839133262634, + "learning_rate": 1.4356179517535199e-05, + "loss": 0.3194, + "step": 15995 + }, + { + "epoch": 0.35663801798421507, + "grad_norm": 0.3999008536338806, + "learning_rate": 1.4353027603344115e-05, + "loss": 0.3779, + "step": 16000 + }, + { + "epoch": 0.3567494673648351, + "grad_norm": 0.531395673751831, + "learning_rate": 1.4349875155505448e-05, + "loss": 0.4263, + "step": 16005 + }, + { + "epoch": 0.35686091674545517, + "grad_norm": 0.8333300352096558, + "learning_rate": 1.4346722174405666e-05, + "loss": 0.2635, + "step": 16010 + }, + { + "epoch": 0.35697236612607525, + "grad_norm": 0.33808475732803345, + "learning_rate": 1.4343568660431293e-05, + "loss": 0.319, + "step": 16015 + }, + { + "epoch": 0.3570838155066953, + "grad_norm": 0.7113955020904541, + "learning_rate": 1.4340414613968929e-05, + "loss": 0.4791, + "step": 16020 + }, + { + "epoch": 0.3571952648873154, + "grad_norm": 0.592242419719696, + "learning_rate": 1.433726003540524e-05, + "loss": 0.2668, + "step": 16025 + }, + { + "epoch": 0.3573067142679355, + "grad_norm": 0.6215150952339172, + "learning_rate": 1.4334104925126945e-05, + "loss": 0.3935, + "step": 16030 + }, + { + "epoch": 0.3574181636485555, + "grad_norm": 0.4412376582622528, + "learning_rate": 1.4330949283520843e-05, + "loss": 0.2753, + "step": 16035 + }, + { + "epoch": 0.3575296130291756, + "grad_norm": 0.46210625767707825, + "learning_rate": 1.432779311097379e-05, + "loss": 0.4574, + "step": 16040 + }, + { + "epoch": 0.35764106240979565, + "grad_norm": 0.5504388213157654, + "learning_rate": 1.4324636407872705e-05, + "loss": 0.3069, + "step": 16045 + }, + { + "epoch": 0.35775251179041573, + "grad_norm": 0.3885088562965393, + "learning_rate": 1.432147917460458e-05, + "loss": 0.3624, + "step": 16050 + }, + { + "epoch": 0.3578639611710358, + "grad_norm": 0.44459834694862366, + "learning_rate": 1.4318321411556462e-05, + "loss": 0.2526, + "step": 16055 + }, + { + "epoch": 0.3579754105516559, + "grad_norm": 0.36986035108566284, + "learning_rate": 1.4315163119115472e-05, + "loss": 0.369, + "step": 16060 + }, + { + "epoch": 0.3580868599322759, + "grad_norm": 0.6682476997375488, + "learning_rate": 1.4312004297668791e-05, + "loss": 0.3287, + "step": 16065 + }, + { + "epoch": 0.358198309312896, + "grad_norm": 0.5184884667396545, + "learning_rate": 1.4308844947603666e-05, + "loss": 0.2998, + "step": 16070 + }, + { + "epoch": 0.35830975869351606, + "grad_norm": 0.7321197986602783, + "learning_rate": 1.4305685069307412e-05, + "loss": 0.4231, + "step": 16075 + }, + { + "epoch": 0.35842120807413613, + "grad_norm": 0.8538438677787781, + "learning_rate": 1.4302524663167403e-05, + "loss": 0.339, + "step": 16080 + }, + { + "epoch": 0.3585326574547562, + "grad_norm": 0.5270879864692688, + "learning_rate": 1.4299363729571081e-05, + "loss": 0.3379, + "step": 16085 + }, + { + "epoch": 0.3586441068353763, + "grad_norm": 0.8474531769752502, + "learning_rate": 1.4296202268905948e-05, + "loss": 0.3172, + "step": 16090 + }, + { + "epoch": 0.3587555562159963, + "grad_norm": 0.6586465239524841, + "learning_rate": 1.429304028155958e-05, + "loss": 0.2714, + "step": 16095 + }, + { + "epoch": 0.3588670055966164, + "grad_norm": 0.5502996444702148, + "learning_rate": 1.4289877767919613e-05, + "loss": 0.2873, + "step": 16100 + }, + { + "epoch": 0.35897845497723646, + "grad_norm": 0.8836638927459717, + "learning_rate": 1.4286714728373743e-05, + "loss": 0.3991, + "step": 16105 + }, + { + "epoch": 0.35908990435785654, + "grad_norm": 0.4775274991989136, + "learning_rate": 1.4283551163309735e-05, + "loss": 0.309, + "step": 16110 + }, + { + "epoch": 0.3592013537384766, + "grad_norm": 0.579677164554596, + "learning_rate": 1.428038707311542e-05, + "loss": 0.3486, + "step": 16115 + }, + { + "epoch": 0.3593128031190967, + "grad_norm": 0.5680740475654602, + "learning_rate": 1.4277222458178688e-05, + "loss": 0.2838, + "step": 16120 + }, + { + "epoch": 0.3594242524997167, + "grad_norm": 0.5583674907684326, + "learning_rate": 1.4274057318887495e-05, + "loss": 0.403, + "step": 16125 + }, + { + "epoch": 0.3595357018803368, + "grad_norm": 0.6949165463447571, + "learning_rate": 1.427089165562987e-05, + "loss": 0.2891, + "step": 16130 + }, + { + "epoch": 0.35964715126095687, + "grad_norm": 0.6850028038024902, + "learning_rate": 1.4267725468793895e-05, + "loss": 0.3734, + "step": 16135 + }, + { + "epoch": 0.35975860064157694, + "grad_norm": 0.6475020051002502, + "learning_rate": 1.4264558758767715e-05, + "loss": 0.3214, + "step": 16140 + }, + { + "epoch": 0.359870050022197, + "grad_norm": 0.5016544461250305, + "learning_rate": 1.426139152593955e-05, + "loss": 0.308, + "step": 16145 + }, + { + "epoch": 0.3599814994028171, + "grad_norm": 0.5636915564537048, + "learning_rate": 1.4258223770697679e-05, + "loss": 0.3707, + "step": 16150 + }, + { + "epoch": 0.3600929487834371, + "grad_norm": 0.7824536561965942, + "learning_rate": 1.4255055493430441e-05, + "loss": 0.2622, + "step": 16155 + }, + { + "epoch": 0.3602043981640572, + "grad_norm": 0.7711067795753479, + "learning_rate": 1.425188669452624e-05, + "loss": 0.2371, + "step": 16160 + }, + { + "epoch": 0.36031584754467727, + "grad_norm": 0.6058388352394104, + "learning_rate": 1.4248717374373554e-05, + "loss": 0.2209, + "step": 16165 + }, + { + "epoch": 0.36042729692529735, + "grad_norm": 0.5630457401275635, + "learning_rate": 1.4245547533360912e-05, + "loss": 0.3454, + "step": 16170 + }, + { + "epoch": 0.3605387463059174, + "grad_norm": 0.43767404556274414, + "learning_rate": 1.4242377171876913e-05, + "loss": 0.3509, + "step": 16175 + }, + { + "epoch": 0.36065019568653744, + "grad_norm": 0.6815534234046936, + "learning_rate": 1.423920629031022e-05, + "loss": 0.4217, + "step": 16180 + }, + { + "epoch": 0.3607616450671575, + "grad_norm": 0.5987027883529663, + "learning_rate": 1.4236034889049554e-05, + "loss": 0.2651, + "step": 16185 + }, + { + "epoch": 0.3608730944477776, + "grad_norm": 0.651369035243988, + "learning_rate": 1.4232862968483711e-05, + "loss": 0.3665, + "step": 16190 + }, + { + "epoch": 0.3609845438283977, + "grad_norm": 0.5900524854660034, + "learning_rate": 1.4229690529001538e-05, + "loss": 0.3257, + "step": 16195 + }, + { + "epoch": 0.36109599320901775, + "grad_norm": 0.5773907899856567, + "learning_rate": 1.4226517570991955e-05, + "loss": 0.2899, + "step": 16200 + }, + { + "epoch": 0.36120744258963783, + "grad_norm": 0.5796383023262024, + "learning_rate": 1.4223344094843945e-05, + "loss": 0.3728, + "step": 16205 + }, + { + "epoch": 0.36131889197025785, + "grad_norm": 0.6338873505592346, + "learning_rate": 1.4220170100946547e-05, + "loss": 0.3826, + "step": 16210 + }, + { + "epoch": 0.3614303413508779, + "grad_norm": 0.7792302370071411, + "learning_rate": 1.4216995589688865e-05, + "loss": 0.3319, + "step": 16215 + }, + { + "epoch": 0.361541790731498, + "grad_norm": 0.39774107933044434, + "learning_rate": 1.421382056146008e-05, + "loss": 0.3335, + "step": 16220 + }, + { + "epoch": 0.3616532401121181, + "grad_norm": 0.6285306215286255, + "learning_rate": 1.4210645016649416e-05, + "loss": 0.3625, + "step": 16225 + }, + { + "epoch": 0.36176468949273816, + "grad_norm": 0.85149747133255, + "learning_rate": 1.4207468955646171e-05, + "loss": 0.3553, + "step": 16230 + }, + { + "epoch": 0.36187613887335823, + "grad_norm": 0.6813942790031433, + "learning_rate": 1.4204292378839714e-05, + "loss": 0.3652, + "step": 16235 + }, + { + "epoch": 0.36198758825397825, + "grad_norm": 0.7618511915206909, + "learning_rate": 1.4201115286619464e-05, + "loss": 0.2329, + "step": 16240 + }, + { + "epoch": 0.36209903763459833, + "grad_norm": 0.6078253388404846, + "learning_rate": 1.4197937679374904e-05, + "loss": 0.3757, + "step": 16245 + }, + { + "epoch": 0.3622104870152184, + "grad_norm": 0.5501529574394226, + "learning_rate": 1.419475955749559e-05, + "loss": 0.2823, + "step": 16250 + }, + { + "epoch": 0.3623219363958385, + "grad_norm": 0.5846516489982605, + "learning_rate": 1.4191580921371134e-05, + "loss": 0.3182, + "step": 16255 + }, + { + "epoch": 0.36243338577645856, + "grad_norm": 0.7952144145965576, + "learning_rate": 1.4188401771391207e-05, + "loss": 0.4785, + "step": 16260 + }, + { + "epoch": 0.36254483515707864, + "grad_norm": 0.5714855194091797, + "learning_rate": 1.4185222107945553e-05, + "loss": 0.4005, + "step": 16265 + }, + { + "epoch": 0.36265628453769866, + "grad_norm": 0.6147738695144653, + "learning_rate": 1.4182041931423976e-05, + "loss": 0.546, + "step": 16270 + }, + { + "epoch": 0.36276773391831874, + "grad_norm": 0.6777298450469971, + "learning_rate": 1.4178861242216338e-05, + "loss": 0.4361, + "step": 16275 + }, + { + "epoch": 0.3628791832989388, + "grad_norm": 0.6393991708755493, + "learning_rate": 1.4175680040712567e-05, + "loss": 0.4514, + "step": 16280 + }, + { + "epoch": 0.3629906326795589, + "grad_norm": 0.6732271313667297, + "learning_rate": 1.4172498327302653e-05, + "loss": 0.3916, + "step": 16285 + }, + { + "epoch": 0.36310208206017897, + "grad_norm": 0.6620044708251953, + "learning_rate": 1.4169316102376653e-05, + "loss": 0.2325, + "step": 16290 + }, + { + "epoch": 0.36321353144079904, + "grad_norm": 0.35400888323783875, + "learning_rate": 1.4166133366324682e-05, + "loss": 0.2397, + "step": 16295 + }, + { + "epoch": 0.36332498082141906, + "grad_norm": 0.5362957715988159, + "learning_rate": 1.4162950119536913e-05, + "loss": 0.3582, + "step": 16300 + }, + { + "epoch": 0.36343643020203914, + "grad_norm": 0.5642344951629639, + "learning_rate": 1.4159766362403597e-05, + "loss": 0.3506, + "step": 16305 + }, + { + "epoch": 0.3635478795826592, + "grad_norm": 0.59524005651474, + "learning_rate": 1.4156582095315032e-05, + "loss": 0.4598, + "step": 16310 + }, + { + "epoch": 0.3636593289632793, + "grad_norm": 0.6816734671592712, + "learning_rate": 1.4153397318661588e-05, + "loss": 0.4182, + "step": 16315 + }, + { + "epoch": 0.36377077834389937, + "grad_norm": 0.36529749631881714, + "learning_rate": 1.4150212032833687e-05, + "loss": 0.2804, + "step": 16320 + }, + { + "epoch": 0.36388222772451945, + "grad_norm": 0.6353341937065125, + "learning_rate": 1.4147026238221831e-05, + "loss": 0.3277, + "step": 16325 + }, + { + "epoch": 0.36399367710513947, + "grad_norm": 0.4412643015384674, + "learning_rate": 1.4143839935216565e-05, + "loss": 0.1712, + "step": 16330 + }, + { + "epoch": 0.36410512648575954, + "grad_norm": 0.36294540762901306, + "learning_rate": 1.414065312420851e-05, + "loss": 0.3183, + "step": 16335 + }, + { + "epoch": 0.3642165758663796, + "grad_norm": 0.5242140293121338, + "learning_rate": 1.4137465805588342e-05, + "loss": 0.4187, + "step": 16340 + }, + { + "epoch": 0.3643280252469997, + "grad_norm": 0.5565961599349976, + "learning_rate": 1.4134277979746803e-05, + "loss": 0.3671, + "step": 16345 + }, + { + "epoch": 0.3644394746276198, + "grad_norm": 0.5486041307449341, + "learning_rate": 1.4131089647074697e-05, + "loss": 0.448, + "step": 16350 + }, + { + "epoch": 0.36455092400823985, + "grad_norm": 0.6204886436462402, + "learning_rate": 1.4127900807962882e-05, + "loss": 0.1796, + "step": 16355 + }, + { + "epoch": 0.3646623733888599, + "grad_norm": 0.5635268092155457, + "learning_rate": 1.4124711462802296e-05, + "loss": 0.2629, + "step": 16360 + }, + { + "epoch": 0.36477382276947995, + "grad_norm": 0.4247026741504669, + "learning_rate": 1.4121521611983921e-05, + "loss": 0.3895, + "step": 16365 + }, + { + "epoch": 0.3648852721501, + "grad_norm": 0.8973407745361328, + "learning_rate": 1.4118331255898807e-05, + "loss": 0.4073, + "step": 16370 + }, + { + "epoch": 0.3649967215307201, + "grad_norm": 0.6370887756347656, + "learning_rate": 1.4115140394938074e-05, + "loss": 0.3349, + "step": 16375 + }, + { + "epoch": 0.3651081709113402, + "grad_norm": 0.6443496346473694, + "learning_rate": 1.4111949029492891e-05, + "loss": 0.5014, + "step": 16380 + }, + { + "epoch": 0.3652196202919602, + "grad_norm": 0.5404614210128784, + "learning_rate": 1.4108757159954496e-05, + "loss": 0.312, + "step": 16385 + }, + { + "epoch": 0.3653310696725803, + "grad_norm": 0.6759606003761292, + "learning_rate": 1.4105564786714185e-05, + "loss": 0.2964, + "step": 16390 + }, + { + "epoch": 0.36544251905320035, + "grad_norm": 0.763971745967865, + "learning_rate": 1.4102371910163326e-05, + "loss": 0.3385, + "step": 16395 + }, + { + "epoch": 0.36555396843382043, + "grad_norm": 0.6019205451011658, + "learning_rate": 1.4099178530693333e-05, + "loss": 0.1829, + "step": 16400 + }, + { + "epoch": 0.3656654178144405, + "grad_norm": 0.599514365196228, + "learning_rate": 1.4095984648695695e-05, + "loss": 0.2784, + "step": 16405 + }, + { + "epoch": 0.3657768671950606, + "grad_norm": 0.49529388546943665, + "learning_rate": 1.4092790264561955e-05, + "loss": 0.3325, + "step": 16410 + }, + { + "epoch": 0.3658883165756806, + "grad_norm": 0.8132462501525879, + "learning_rate": 1.4089595378683719e-05, + "loss": 0.5318, + "step": 16415 + }, + { + "epoch": 0.3659997659563007, + "grad_norm": 0.8020228147506714, + "learning_rate": 1.4086399991452658e-05, + "loss": 0.5165, + "step": 16420 + }, + { + "epoch": 0.36611121533692076, + "grad_norm": 0.5684449672698975, + "learning_rate": 1.4083204103260498e-05, + "loss": 0.2872, + "step": 16425 + }, + { + "epoch": 0.36622266471754084, + "grad_norm": 0.9270157814025879, + "learning_rate": 1.4080007714499033e-05, + "loss": 0.257, + "step": 16430 + }, + { + "epoch": 0.3663341140981609, + "grad_norm": 0.6331964731216431, + "learning_rate": 1.4076810825560116e-05, + "loss": 0.4288, + "step": 16435 + }, + { + "epoch": 0.366445563478781, + "grad_norm": 0.6294323205947876, + "learning_rate": 1.4073613436835661e-05, + "loss": 0.3198, + "step": 16440 + }, + { + "epoch": 0.366557012859401, + "grad_norm": 0.627004086971283, + "learning_rate": 1.407041554871764e-05, + "loss": 0.3088, + "step": 16445 + }, + { + "epoch": 0.3666684622400211, + "grad_norm": 0.652871310710907, + "learning_rate": 1.406721716159809e-05, + "loss": 0.3399, + "step": 16450 + }, + { + "epoch": 0.36677991162064116, + "grad_norm": 0.9713679552078247, + "learning_rate": 1.4064018275869116e-05, + "loss": 0.3622, + "step": 16455 + }, + { + "epoch": 0.36689136100126124, + "grad_norm": 0.5056372880935669, + "learning_rate": 1.4060818891922865e-05, + "loss": 0.4097, + "step": 16460 + }, + { + "epoch": 0.3670028103818813, + "grad_norm": 0.46640658378601074, + "learning_rate": 1.4057619010151564e-05, + "loss": 0.3042, + "step": 16465 + }, + { + "epoch": 0.3671142597625014, + "grad_norm": 0.7078487873077393, + "learning_rate": 1.4054418630947495e-05, + "loss": 0.309, + "step": 16470 + }, + { + "epoch": 0.3672257091431214, + "grad_norm": 0.7127984762191772, + "learning_rate": 1.4051217754702995e-05, + "loss": 0.3671, + "step": 16475 + }, + { + "epoch": 0.3673371585237415, + "grad_norm": 0.41462767124176025, + "learning_rate": 1.4048016381810467e-05, + "loss": 0.3339, + "step": 16480 + }, + { + "epoch": 0.36744860790436157, + "grad_norm": 0.5943542718887329, + "learning_rate": 1.404481451266238e-05, + "loss": 0.4385, + "step": 16485 + }, + { + "epoch": 0.36756005728498164, + "grad_norm": 0.769970715045929, + "learning_rate": 1.4041612147651252e-05, + "loss": 0.3697, + "step": 16490 + }, + { + "epoch": 0.3676715066656017, + "grad_norm": 0.5637467503547668, + "learning_rate": 1.4038409287169672e-05, + "loss": 0.3749, + "step": 16495 + }, + { + "epoch": 0.3677829560462218, + "grad_norm": 0.6345033645629883, + "learning_rate": 1.4035205931610288e-05, + "loss": 0.3776, + "step": 16500 + }, + { + "epoch": 0.3678944054268418, + "grad_norm": 0.7196531295776367, + "learning_rate": 1.4032002081365801e-05, + "loss": 0.3372, + "step": 16505 + }, + { + "epoch": 0.3680058548074619, + "grad_norm": 1.9599355459213257, + "learning_rate": 1.402879773682898e-05, + "loss": 0.4251, + "step": 16510 + }, + { + "epoch": 0.368117304188082, + "grad_norm": 0.6104928851127625, + "learning_rate": 1.4025592898392658e-05, + "loss": 0.2613, + "step": 16515 + }, + { + "epoch": 0.36822875356870205, + "grad_norm": 0.6330108046531677, + "learning_rate": 1.4022387566449715e-05, + "loss": 0.284, + "step": 16520 + }, + { + "epoch": 0.3683402029493221, + "grad_norm": 0.49859699606895447, + "learning_rate": 1.4019181741393106e-05, + "loss": 0.1871, + "step": 16525 + }, + { + "epoch": 0.3684516523299422, + "grad_norm": 0.604479968547821, + "learning_rate": 1.401597542361584e-05, + "loss": 0.2761, + "step": 16530 + }, + { + "epoch": 0.3685631017105622, + "grad_norm": 0.46823564171791077, + "learning_rate": 1.4012768613510985e-05, + "loss": 0.2745, + "step": 16535 + }, + { + "epoch": 0.3686745510911823, + "grad_norm": 0.5509923100471497, + "learning_rate": 1.400956131147167e-05, + "loss": 0.2638, + "step": 16540 + }, + { + "epoch": 0.3687860004718024, + "grad_norm": 0.6106875538825989, + "learning_rate": 1.400635351789109e-05, + "loss": 0.3422, + "step": 16545 + }, + { + "epoch": 0.36889744985242245, + "grad_norm": 0.7943158745765686, + "learning_rate": 1.4003145233162495e-05, + "loss": 0.3621, + "step": 16550 + }, + { + "epoch": 0.36900889923304253, + "grad_norm": 0.46912720799446106, + "learning_rate": 1.3999936457679189e-05, + "loss": 0.4506, + "step": 16555 + }, + { + "epoch": 0.36912034861366255, + "grad_norm": 0.421644926071167, + "learning_rate": 1.399672719183455e-05, + "loss": 0.3087, + "step": 16560 + }, + { + "epoch": 0.36923179799428263, + "grad_norm": 0.4038814604282379, + "learning_rate": 1.3993517436022006e-05, + "loss": 0.3402, + "step": 16565 + }, + { + "epoch": 0.3693432473749027, + "grad_norm": 0.3322283923625946, + "learning_rate": 1.399030719063505e-05, + "loss": 0.2699, + "step": 16570 + }, + { + "epoch": 0.3694546967555228, + "grad_norm": 0.4782276749610901, + "learning_rate": 1.3987096456067236e-05, + "loss": 0.2706, + "step": 16575 + }, + { + "epoch": 0.36956614613614286, + "grad_norm": 0.7985139489173889, + "learning_rate": 1.398388523271217e-05, + "loss": 0.2949, + "step": 16580 + }, + { + "epoch": 0.36967759551676294, + "grad_norm": 0.6678852438926697, + "learning_rate": 1.3980673520963524e-05, + "loss": 0.2423, + "step": 16585 + }, + { + "epoch": 0.36978904489738296, + "grad_norm": 0.4688909351825714, + "learning_rate": 1.3977461321215034e-05, + "loss": 0.337, + "step": 16590 + }, + { + "epoch": 0.36990049427800303, + "grad_norm": 0.6942318081855774, + "learning_rate": 1.3974248633860486e-05, + "loss": 0.4337, + "step": 16595 + }, + { + "epoch": 0.3700119436586231, + "grad_norm": 0.5391799807548523, + "learning_rate": 1.3971035459293729e-05, + "loss": 0.2623, + "step": 16600 + }, + { + "epoch": 0.3701233930392432, + "grad_norm": 0.7162173390388489, + "learning_rate": 1.3967821797908678e-05, + "loss": 0.391, + "step": 16605 + }, + { + "epoch": 0.37023484241986326, + "grad_norm": 0.656777560710907, + "learning_rate": 1.3964607650099302e-05, + "loss": 0.3275, + "step": 16610 + }, + { + "epoch": 0.37034629180048334, + "grad_norm": 0.7742978930473328, + "learning_rate": 1.396139301625963e-05, + "loss": 0.3674, + "step": 16615 + }, + { + "epoch": 0.37045774118110336, + "grad_norm": 0.8095855712890625, + "learning_rate": 1.3958177896783751e-05, + "loss": 0.2723, + "step": 16620 + }, + { + "epoch": 0.37056919056172344, + "grad_norm": 0.3930739164352417, + "learning_rate": 1.3954962292065814e-05, + "loss": 0.2473, + "step": 16625 + }, + { + "epoch": 0.3706806399423435, + "grad_norm": 0.6105589270591736, + "learning_rate": 1.395174620250003e-05, + "loss": 0.3796, + "step": 16630 + }, + { + "epoch": 0.3707920893229636, + "grad_norm": 0.6673600077629089, + "learning_rate": 1.394852962848066e-05, + "loss": 0.3638, + "step": 16635 + }, + { + "epoch": 0.37090353870358367, + "grad_norm": 0.58013916015625, + "learning_rate": 1.394531257040204e-05, + "loss": 0.3143, + "step": 16640 + }, + { + "epoch": 0.37101498808420375, + "grad_norm": 0.776722252368927, + "learning_rate": 1.3942095028658553e-05, + "loss": 0.2441, + "step": 16645 + }, + { + "epoch": 0.37112643746482377, + "grad_norm": 0.57895427942276, + "learning_rate": 1.3938877003644639e-05, + "loss": 0.2568, + "step": 16650 + }, + { + "epoch": 0.37123788684544384, + "grad_norm": 0.609281599521637, + "learning_rate": 1.3935658495754809e-05, + "loss": 0.2602, + "step": 16655 + }, + { + "epoch": 0.3713493362260639, + "grad_norm": 0.44535091519355774, + "learning_rate": 1.3932439505383628e-05, + "loss": 0.4303, + "step": 16660 + }, + { + "epoch": 0.371460785606684, + "grad_norm": 0.9552980065345764, + "learning_rate": 1.3929220032925716e-05, + "loss": 0.2524, + "step": 16665 + }, + { + "epoch": 0.3715722349873041, + "grad_norm": 0.8034424781799316, + "learning_rate": 1.3926000078775757e-05, + "loss": 0.3437, + "step": 16670 + }, + { + "epoch": 0.37168368436792415, + "grad_norm": 1.8572295904159546, + "learning_rate": 1.3922779643328492e-05, + "loss": 0.3697, + "step": 16675 + }, + { + "epoch": 0.37179513374854417, + "grad_norm": 0.6209281086921692, + "learning_rate": 1.3919558726978724e-05, + "loss": 0.2803, + "step": 16680 + }, + { + "epoch": 0.37190658312916425, + "grad_norm": 0.628084659576416, + "learning_rate": 1.3916337330121308e-05, + "loss": 0.3693, + "step": 16685 + }, + { + "epoch": 0.3720180325097843, + "grad_norm": 0.64268559217453, + "learning_rate": 1.3913115453151166e-05, + "loss": 0.274, + "step": 16690 + }, + { + "epoch": 0.3721294818904044, + "grad_norm": 0.389698326587677, + "learning_rate": 1.3909893096463274e-05, + "loss": 0.2581, + "step": 16695 + }, + { + "epoch": 0.3722409312710245, + "grad_norm": 0.6302729845046997, + "learning_rate": 1.3906670260452668e-05, + "loss": 0.2739, + "step": 16700 + }, + { + "epoch": 0.37235238065164455, + "grad_norm": 0.6693209409713745, + "learning_rate": 1.3903446945514445e-05, + "loss": 0.2398, + "step": 16705 + }, + { + "epoch": 0.3724638300322646, + "grad_norm": 0.8056045770645142, + "learning_rate": 1.390022315204375e-05, + "loss": 0.4227, + "step": 16710 + }, + { + "epoch": 0.37257527941288465, + "grad_norm": 0.8639228940010071, + "learning_rate": 1.3896998880435807e-05, + "loss": 0.3866, + "step": 16715 + }, + { + "epoch": 0.37268672879350473, + "grad_norm": 0.6434595584869385, + "learning_rate": 1.3893774131085885e-05, + "loss": 0.3445, + "step": 16720 + }, + { + "epoch": 0.3727981781741248, + "grad_norm": 0.5078811645507812, + "learning_rate": 1.38905489043893e-05, + "loss": 0.2029, + "step": 16725 + }, + { + "epoch": 0.3729096275547449, + "grad_norm": 0.6022213101387024, + "learning_rate": 1.3887323200741457e-05, + "loss": 0.3117, + "step": 16730 + }, + { + "epoch": 0.37302107693536496, + "grad_norm": 0.6306881308555603, + "learning_rate": 1.3884097020537794e-05, + "loss": 0.3548, + "step": 16735 + }, + { + "epoch": 0.373132526315985, + "grad_norm": 0.5963431000709534, + "learning_rate": 1.3880870364173815e-05, + "loss": 0.341, + "step": 16740 + }, + { + "epoch": 0.37324397569660506, + "grad_norm": 0.7082194685935974, + "learning_rate": 1.3877643232045086e-05, + "loss": 0.2662, + "step": 16745 + }, + { + "epoch": 0.37335542507722513, + "grad_norm": 0.4736379086971283, + "learning_rate": 1.3874415624547228e-05, + "loss": 0.2598, + "step": 16750 + }, + { + "epoch": 0.3734668744578452, + "grad_norm": 0.6545222401618958, + "learning_rate": 1.387118754207592e-05, + "loss": 0.239, + "step": 16755 + }, + { + "epoch": 0.3735783238384653, + "grad_norm": 0.5867891907691956, + "learning_rate": 1.38679589850269e-05, + "loss": 0.2697, + "step": 16760 + }, + { + "epoch": 0.3736897732190853, + "grad_norm": 0.6038467288017273, + "learning_rate": 1.3864729953795965e-05, + "loss": 0.3925, + "step": 16765 + }, + { + "epoch": 0.3738012225997054, + "grad_norm": 0.6953158378601074, + "learning_rate": 1.3861500448778968e-05, + "loss": 0.4082, + "step": 16770 + }, + { + "epoch": 0.37391267198032546, + "grad_norm": 0.5023922324180603, + "learning_rate": 1.3858270470371826e-05, + "loss": 0.3093, + "step": 16775 + }, + { + "epoch": 0.37402412136094554, + "grad_norm": 0.6265695095062256, + "learning_rate": 1.38550400189705e-05, + "loss": 0.3559, + "step": 16780 + }, + { + "epoch": 0.3741355707415656, + "grad_norm": 0.532604455947876, + "learning_rate": 1.3851809094971028e-05, + "loss": 0.2924, + "step": 16785 + }, + { + "epoch": 0.3742470201221857, + "grad_norm": 0.5418358445167542, + "learning_rate": 1.3848577698769491e-05, + "loss": 0.246, + "step": 16790 + }, + { + "epoch": 0.3743584695028057, + "grad_norm": 0.6811532378196716, + "learning_rate": 1.3845345830762033e-05, + "loss": 0.3921, + "step": 16795 + }, + { + "epoch": 0.3744699188834258, + "grad_norm": 0.5974913239479065, + "learning_rate": 1.384211349134486e-05, + "loss": 0.305, + "step": 16800 + }, + { + "epoch": 0.37458136826404587, + "grad_norm": 0.6165561676025391, + "learning_rate": 1.3838880680914229e-05, + "loss": 0.2876, + "step": 16805 + }, + { + "epoch": 0.37469281764466594, + "grad_norm": 0.4414345920085907, + "learning_rate": 1.3835647399866459e-05, + "loss": 0.3331, + "step": 16810 + }, + { + "epoch": 0.374804267025286, + "grad_norm": 0.5125650763511658, + "learning_rate": 1.3832413648597923e-05, + "loss": 0.3586, + "step": 16815 + }, + { + "epoch": 0.3749157164059061, + "grad_norm": 0.5107496976852417, + "learning_rate": 1.3829179427505052e-05, + "loss": 0.2738, + "step": 16820 + }, + { + "epoch": 0.3750271657865261, + "grad_norm": 0.48095574975013733, + "learning_rate": 1.3825944736984348e-05, + "loss": 0.3208, + "step": 16825 + }, + { + "epoch": 0.3751386151671462, + "grad_norm": 0.6544551253318787, + "learning_rate": 1.3822709577432345e-05, + "loss": 0.2271, + "step": 16830 + }, + { + "epoch": 0.37525006454776627, + "grad_norm": 0.6666854023933411, + "learning_rate": 1.3819473949245655e-05, + "loss": 0.4917, + "step": 16835 + }, + { + "epoch": 0.37536151392838635, + "grad_norm": 0.556831955909729, + "learning_rate": 1.3816237852820945e-05, + "loss": 0.266, + "step": 16840 + }, + { + "epoch": 0.3754729633090064, + "grad_norm": 0.6979274749755859, + "learning_rate": 1.3813001288554925e-05, + "loss": 0.3474, + "step": 16845 + }, + { + "epoch": 0.3755844126896265, + "grad_norm": 0.7797757387161255, + "learning_rate": 1.380976425684438e-05, + "loss": 0.3212, + "step": 16850 + }, + { + "epoch": 0.3756958620702465, + "grad_norm": 0.4977809488773346, + "learning_rate": 1.3806526758086148e-05, + "loss": 0.3759, + "step": 16855 + }, + { + "epoch": 0.3758073114508666, + "grad_norm": 0.634545087814331, + "learning_rate": 1.3803288792677116e-05, + "loss": 0.3098, + "step": 16860 + }, + { + "epoch": 0.3759187608314867, + "grad_norm": 0.8404287695884705, + "learning_rate": 1.380005036101423e-05, + "loss": 0.2756, + "step": 16865 + }, + { + "epoch": 0.37603021021210675, + "grad_norm": 0.596894383430481, + "learning_rate": 1.3796811463494508e-05, + "loss": 0.327, + "step": 16870 + }, + { + "epoch": 0.37614165959272683, + "grad_norm": 0.5277706384658813, + "learning_rate": 1.3793572100515004e-05, + "loss": 0.3358, + "step": 16875 + }, + { + "epoch": 0.3762531089733469, + "grad_norm": 0.5064794421195984, + "learning_rate": 1.3790332272472844e-05, + "loss": 0.3757, + "step": 16880 + }, + { + "epoch": 0.3763645583539669, + "grad_norm": 0.576138973236084, + "learning_rate": 1.3787091979765203e-05, + "loss": 0.3069, + "step": 16885 + }, + { + "epoch": 0.376476007734587, + "grad_norm": 0.6503730416297913, + "learning_rate": 1.378385122278932e-05, + "loss": 0.4241, + "step": 16890 + }, + { + "epoch": 0.3765874571152071, + "grad_norm": 0.5472214818000793, + "learning_rate": 1.3780610001942481e-05, + "loss": 0.2996, + "step": 16895 + }, + { + "epoch": 0.37669890649582716, + "grad_norm": 0.34399139881134033, + "learning_rate": 1.3777368317622038e-05, + "loss": 0.3402, + "step": 16900 + }, + { + "epoch": 0.37681035587644723, + "grad_norm": 0.4152921140193939, + "learning_rate": 1.3774126170225398e-05, + "loss": 0.2246, + "step": 16905 + }, + { + "epoch": 0.3769218052570673, + "grad_norm": 0.6326379179954529, + "learning_rate": 1.3770883560150017e-05, + "loss": 0.4047, + "step": 16910 + }, + { + "epoch": 0.37703325463768733, + "grad_norm": 0.5722355246543884, + "learning_rate": 1.3767640487793423e-05, + "loss": 0.4291, + "step": 16915 + }, + { + "epoch": 0.3771447040183074, + "grad_norm": 0.5854982137680054, + "learning_rate": 1.3764396953553182e-05, + "loss": 0.4298, + "step": 16920 + }, + { + "epoch": 0.3772561533989275, + "grad_norm": 0.47443121671676636, + "learning_rate": 1.3761152957826935e-05, + "loss": 0.4424, + "step": 16925 + }, + { + "epoch": 0.37736760277954756, + "grad_norm": 0.4873077869415283, + "learning_rate": 1.3757908501012368e-05, + "loss": 0.3093, + "step": 16930 + }, + { + "epoch": 0.37747905216016764, + "grad_norm": 0.614548921585083, + "learning_rate": 1.3754663583507222e-05, + "loss": 0.3204, + "step": 16935 + }, + { + "epoch": 0.37759050154078766, + "grad_norm": 0.7937530875205994, + "learning_rate": 1.3751418205709301e-05, + "loss": 0.3612, + "step": 16940 + }, + { + "epoch": 0.37770195092140774, + "grad_norm": 0.5327603816986084, + "learning_rate": 1.3748172368016471e-05, + "loss": 0.385, + "step": 16945 + }, + { + "epoch": 0.3778134003020278, + "grad_norm": 0.634777307510376, + "learning_rate": 1.3744926070826636e-05, + "loss": 0.3743, + "step": 16950 + }, + { + "epoch": 0.3779248496826479, + "grad_norm": 0.5578102469444275, + "learning_rate": 1.3741679314537772e-05, + "loss": 0.3718, + "step": 16955 + }, + { + "epoch": 0.37803629906326797, + "grad_norm": 0.41441190242767334, + "learning_rate": 1.3738432099547903e-05, + "loss": 0.2758, + "step": 16960 + }, + { + "epoch": 0.37814774844388804, + "grad_norm": 0.7478371858596802, + "learning_rate": 1.3735184426255117e-05, + "loss": 0.2887, + "step": 16965 + }, + { + "epoch": 0.37825919782450806, + "grad_norm": 0.7181201577186584, + "learning_rate": 1.3731936295057552e-05, + "loss": 0.3466, + "step": 16970 + }, + { + "epoch": 0.37837064720512814, + "grad_norm": 0.49926698207855225, + "learning_rate": 1.3728687706353401e-05, + "loss": 0.3399, + "step": 16975 + }, + { + "epoch": 0.3784820965857482, + "grad_norm": 0.6402499079704285, + "learning_rate": 1.3725438660540922e-05, + "loss": 0.2987, + "step": 16980 + }, + { + "epoch": 0.3785935459663683, + "grad_norm": 0.4854130744934082, + "learning_rate": 1.3722189158018415e-05, + "loss": 0.3407, + "step": 16985 + }, + { + "epoch": 0.37870499534698837, + "grad_norm": 0.5425702333450317, + "learning_rate": 1.371893919918425e-05, + "loss": 0.4215, + "step": 16990 + }, + { + "epoch": 0.37881644472760845, + "grad_norm": 0.5966536402702332, + "learning_rate": 1.3715688784436847e-05, + "loss": 0.2236, + "step": 16995 + }, + { + "epoch": 0.37892789410822847, + "grad_norm": 0.7263554930686951, + "learning_rate": 1.3712437914174676e-05, + "loss": 0.4078, + "step": 17000 + }, + { + "epoch": 0.37903934348884855, + "grad_norm": 0.6094164252281189, + "learning_rate": 1.3709186588796275e-05, + "loss": 0.3422, + "step": 17005 + }, + { + "epoch": 0.3791507928694686, + "grad_norm": 0.3622407019138336, + "learning_rate": 1.3705934808700227e-05, + "loss": 0.287, + "step": 17010 + }, + { + "epoch": 0.3792622422500887, + "grad_norm": 0.546227753162384, + "learning_rate": 1.370268257428518e-05, + "loss": 0.2921, + "step": 17015 + }, + { + "epoch": 0.3793736916307088, + "grad_norm": 0.4682996869087219, + "learning_rate": 1.3699429885949826e-05, + "loss": 0.2827, + "step": 17020 + }, + { + "epoch": 0.37948514101132885, + "grad_norm": 0.642169177532196, + "learning_rate": 1.3696176744092924e-05, + "loss": 0.262, + "step": 17025 + }, + { + "epoch": 0.3795965903919489, + "grad_norm": 0.6648514270782471, + "learning_rate": 1.3692923149113286e-05, + "loss": 0.2944, + "step": 17030 + }, + { + "epoch": 0.37970803977256895, + "grad_norm": 1.3251914978027344, + "learning_rate": 1.3689669101409773e-05, + "loss": 0.4648, + "step": 17035 + }, + { + "epoch": 0.379819489153189, + "grad_norm": 0.42268428206443787, + "learning_rate": 1.3686414601381309e-05, + "loss": 0.2584, + "step": 17040 + }, + { + "epoch": 0.3799309385338091, + "grad_norm": 0.6230071783065796, + "learning_rate": 1.3683159649426867e-05, + "loss": 0.3298, + "step": 17045 + }, + { + "epoch": 0.3800423879144292, + "grad_norm": 0.5669429302215576, + "learning_rate": 1.3679904245945481e-05, + "loss": 0.3647, + "step": 17050 + }, + { + "epoch": 0.38015383729504926, + "grad_norm": 0.6155208349227905, + "learning_rate": 1.3676648391336245e-05, + "loss": 0.4041, + "step": 17055 + }, + { + "epoch": 0.3802652866756693, + "grad_norm": 0.7474114894866943, + "learning_rate": 1.367339208599829e-05, + "loss": 0.2239, + "step": 17060 + }, + { + "epoch": 0.38037673605628936, + "grad_norm": 0.8320900797843933, + "learning_rate": 1.3670135330330819e-05, + "loss": 0.2889, + "step": 17065 + }, + { + "epoch": 0.38048818543690943, + "grad_norm": 0.5396373271942139, + "learning_rate": 1.366687812473309e-05, + "loss": 0.2983, + "step": 17070 + }, + { + "epoch": 0.3805996348175295, + "grad_norm": 0.7233791351318359, + "learning_rate": 1.3663620469604406e-05, + "loss": 0.4327, + "step": 17075 + }, + { + "epoch": 0.3807110841981496, + "grad_norm": 0.37970295548439026, + "learning_rate": 1.3660362365344126e-05, + "loss": 0.3472, + "step": 17080 + }, + { + "epoch": 0.38082253357876966, + "grad_norm": 0.797645092010498, + "learning_rate": 1.365710381235168e-05, + "loss": 0.2166, + "step": 17085 + }, + { + "epoch": 0.3809339829593897, + "grad_norm": 0.7819150686264038, + "learning_rate": 1.3653844811026534e-05, + "loss": 0.2911, + "step": 17090 + }, + { + "epoch": 0.38104543234000976, + "grad_norm": 0.7489306330680847, + "learning_rate": 1.3650585361768212e-05, + "loss": 0.3647, + "step": 17095 + }, + { + "epoch": 0.38115688172062984, + "grad_norm": 0.4779724180698395, + "learning_rate": 1.364732546497631e-05, + "loss": 0.2732, + "step": 17100 + }, + { + "epoch": 0.3812683311012499, + "grad_norm": 0.5058176517486572, + "learning_rate": 1.3644065121050456e-05, + "loss": 0.2747, + "step": 17105 + }, + { + "epoch": 0.38137978048187, + "grad_norm": 0.4260009229183197, + "learning_rate": 1.3640804330390345e-05, + "loss": 0.3045, + "step": 17110 + }, + { + "epoch": 0.38149122986249007, + "grad_norm": 0.6472933292388916, + "learning_rate": 1.3637543093395727e-05, + "loss": 0.396, + "step": 17115 + }, + { + "epoch": 0.3816026792431101, + "grad_norm": 0.5300203561782837, + "learning_rate": 1.3634281410466404e-05, + "loss": 0.2754, + "step": 17120 + }, + { + "epoch": 0.38171412862373016, + "grad_norm": 0.618672251701355, + "learning_rate": 1.3631019282002231e-05, + "loss": 0.3196, + "step": 17125 + }, + { + "epoch": 0.38182557800435024, + "grad_norm": 0.5094271302223206, + "learning_rate": 1.3627756708403122e-05, + "loss": 0.3136, + "step": 17130 + }, + { + "epoch": 0.3819370273849703, + "grad_norm": 0.5453209280967712, + "learning_rate": 1.3624493690069042e-05, + "loss": 0.3624, + "step": 17135 + }, + { + "epoch": 0.3820484767655904, + "grad_norm": 0.610127866268158, + "learning_rate": 1.3621230227400013e-05, + "loss": 0.3844, + "step": 17140 + }, + { + "epoch": 0.3821599261462104, + "grad_norm": 0.6166819930076599, + "learning_rate": 1.361796632079611e-05, + "loss": 0.3333, + "step": 17145 + }, + { + "epoch": 0.3822713755268305, + "grad_norm": 0.7649003267288208, + "learning_rate": 1.3614701970657463e-05, + "loss": 0.2924, + "step": 17150 + }, + { + "epoch": 0.38238282490745057, + "grad_norm": 0.4538317024707794, + "learning_rate": 1.3611437177384252e-05, + "loss": 0.2623, + "step": 17155 + }, + { + "epoch": 0.38249427428807065, + "grad_norm": 0.5183296799659729, + "learning_rate": 1.3608171941376722e-05, + "loss": 0.5353, + "step": 17160 + }, + { + "epoch": 0.3826057236686907, + "grad_norm": 0.9330891966819763, + "learning_rate": 1.3604906263035162e-05, + "loss": 0.3002, + "step": 17165 + }, + { + "epoch": 0.3827171730493108, + "grad_norm": 0.5934903621673584, + "learning_rate": 1.3601640142759922e-05, + "loss": 0.35, + "step": 17170 + }, + { + "epoch": 0.3828286224299308, + "grad_norm": 0.4814862012863159, + "learning_rate": 1.3598373580951399e-05, + "loss": 0.3452, + "step": 17175 + }, + { + "epoch": 0.3829400718105509, + "grad_norm": 0.5025895237922668, + "learning_rate": 1.3595106578010051e-05, + "loss": 0.2542, + "step": 17180 + }, + { + "epoch": 0.383051521191171, + "grad_norm": 0.758365273475647, + "learning_rate": 1.3591839134336384e-05, + "loss": 0.3329, + "step": 17185 + }, + { + "epoch": 0.38316297057179105, + "grad_norm": 1.2039666175842285, + "learning_rate": 1.3588571250330965e-05, + "loss": 0.3772, + "step": 17190 + }, + { + "epoch": 0.3832744199524111, + "grad_norm": 0.5428191423416138, + "learning_rate": 1.3585302926394412e-05, + "loss": 0.3924, + "step": 17195 + }, + { + "epoch": 0.3833858693330312, + "grad_norm": 0.38816478848457336, + "learning_rate": 1.3582034162927393e-05, + "loss": 0.2923, + "step": 17200 + }, + { + "epoch": 0.3834973187136512, + "grad_norm": 0.47817766666412354, + "learning_rate": 1.3578764960330632e-05, + "loss": 0.3491, + "step": 17205 + }, + { + "epoch": 0.3836087680942713, + "grad_norm": 0.5975366830825806, + "learning_rate": 1.3575495319004917e-05, + "loss": 0.3218, + "step": 17210 + }, + { + "epoch": 0.3837202174748914, + "grad_norm": 0.5096039175987244, + "learning_rate": 1.357222523935107e-05, + "loss": 0.2884, + "step": 17215 + }, + { + "epoch": 0.38383166685551146, + "grad_norm": 0.46383658051490784, + "learning_rate": 1.3568954721769982e-05, + "loss": 0.3117, + "step": 17220 + }, + { + "epoch": 0.38394311623613153, + "grad_norm": 0.47448253631591797, + "learning_rate": 1.3565683766662597e-05, + "loss": 0.2814, + "step": 17225 + }, + { + "epoch": 0.3840545656167516, + "grad_norm": 0.6228605508804321, + "learning_rate": 1.3562412374429903e-05, + "loss": 0.3834, + "step": 17230 + }, + { + "epoch": 0.38416601499737163, + "grad_norm": 0.5391026139259338, + "learning_rate": 1.3559140545472948e-05, + "loss": 0.3386, + "step": 17235 + }, + { + "epoch": 0.3842774643779917, + "grad_norm": 0.44915348291397095, + "learning_rate": 1.3555868280192835e-05, + "loss": 0.2299, + "step": 17240 + }, + { + "epoch": 0.3843889137586118, + "grad_norm": 0.6540490388870239, + "learning_rate": 1.3552595578990719e-05, + "loss": 0.3504, + "step": 17245 + }, + { + "epoch": 0.38450036313923186, + "grad_norm": 0.404447078704834, + "learning_rate": 1.3549322442267805e-05, + "loss": 0.3026, + "step": 17250 + }, + { + "epoch": 0.38461181251985194, + "grad_norm": 0.9255741834640503, + "learning_rate": 1.3546048870425356e-05, + "loss": 0.351, + "step": 17255 + }, + { + "epoch": 0.384723261900472, + "grad_norm": 0.4792323708534241, + "learning_rate": 1.3542774863864692e-05, + "loss": 0.303, + "step": 17260 + }, + { + "epoch": 0.38483471128109203, + "grad_norm": 0.5489178895950317, + "learning_rate": 1.353950042298717e-05, + "loss": 0.3484, + "step": 17265 + }, + { + "epoch": 0.3849461606617121, + "grad_norm": 0.5653095841407776, + "learning_rate": 1.3536225548194219e-05, + "loss": 0.2875, + "step": 17270 + }, + { + "epoch": 0.3850576100423322, + "grad_norm": 0.5252937078475952, + "learning_rate": 1.353295023988731e-05, + "loss": 0.2503, + "step": 17275 + }, + { + "epoch": 0.38516905942295226, + "grad_norm": 0.6666718125343323, + "learning_rate": 1.3529674498467974e-05, + "loss": 0.2796, + "step": 17280 + }, + { + "epoch": 0.38528050880357234, + "grad_norm": 0.64290452003479, + "learning_rate": 1.3526398324337788e-05, + "loss": 0.4103, + "step": 17285 + }, + { + "epoch": 0.3853919581841924, + "grad_norm": 0.9748024344444275, + "learning_rate": 1.3523121717898387e-05, + "loss": 0.3458, + "step": 17290 + }, + { + "epoch": 0.38550340756481244, + "grad_norm": 0.8246389627456665, + "learning_rate": 1.3519844679551456e-05, + "loss": 0.3598, + "step": 17295 + }, + { + "epoch": 0.3856148569454325, + "grad_norm": 0.6088479161262512, + "learning_rate": 1.3516567209698739e-05, + "loss": 0.3518, + "step": 17300 + }, + { + "epoch": 0.3857263063260526, + "grad_norm": 0.6825111508369446, + "learning_rate": 1.3513289308742028e-05, + "loss": 0.377, + "step": 17305 + }, + { + "epoch": 0.38583775570667267, + "grad_norm": 0.4026068449020386, + "learning_rate": 1.3510010977083159e-05, + "loss": 0.2406, + "step": 17310 + }, + { + "epoch": 0.38594920508729275, + "grad_norm": 0.5761940479278564, + "learning_rate": 1.3506732215124044e-05, + "loss": 0.3949, + "step": 17315 + }, + { + "epoch": 0.38606065446791277, + "grad_norm": 0.5035435557365417, + "learning_rate": 1.3503453023266626e-05, + "loss": 0.2624, + "step": 17320 + }, + { + "epoch": 0.38617210384853284, + "grad_norm": 0.6900637149810791, + "learning_rate": 1.3500173401912908e-05, + "loss": 0.336, + "step": 17325 + }, + { + "epoch": 0.3862835532291529, + "grad_norm": 0.574922502040863, + "learning_rate": 1.3496893351464948e-05, + "loss": 0.3015, + "step": 17330 + }, + { + "epoch": 0.386395002609773, + "grad_norm": 0.4953921437263489, + "learning_rate": 1.3493612872324857e-05, + "loss": 0.3169, + "step": 17335 + }, + { + "epoch": 0.3865064519903931, + "grad_norm": 0.6877520084381104, + "learning_rate": 1.3490331964894792e-05, + "loss": 0.3994, + "step": 17340 + }, + { + "epoch": 0.38661790137101315, + "grad_norm": 0.6951386332511902, + "learning_rate": 1.348705062957697e-05, + "loss": 0.3787, + "step": 17345 + }, + { + "epoch": 0.38672935075163317, + "grad_norm": 0.46507614850997925, + "learning_rate": 1.3483768866773659e-05, + "loss": 0.2933, + "step": 17350 + }, + { + "epoch": 0.38684080013225325, + "grad_norm": 0.5548629760742188, + "learning_rate": 1.3480486676887172e-05, + "loss": 0.3405, + "step": 17355 + }, + { + "epoch": 0.3869522495128733, + "grad_norm": 0.581392228603363, + "learning_rate": 1.3477204060319884e-05, + "loss": 0.2563, + "step": 17360 + }, + { + "epoch": 0.3870636988934934, + "grad_norm": 0.5281866192817688, + "learning_rate": 1.3473921017474221e-05, + "loss": 0.378, + "step": 17365 + }, + { + "epoch": 0.3871751482741135, + "grad_norm": 0.5699111223220825, + "learning_rate": 1.3470637548752657e-05, + "loss": 0.3965, + "step": 17370 + }, + { + "epoch": 0.38728659765473356, + "grad_norm": 0.45156022906303406, + "learning_rate": 1.3467353654557713e-05, + "loss": 0.225, + "step": 17375 + }, + { + "epoch": 0.3873980470353536, + "grad_norm": 0.6937002539634705, + "learning_rate": 1.3464069335291979e-05, + "loss": 0.4284, + "step": 17380 + }, + { + "epoch": 0.38750949641597365, + "grad_norm": 0.5318115949630737, + "learning_rate": 1.3460784591358083e-05, + "loss": 0.3061, + "step": 17385 + }, + { + "epoch": 0.38762094579659373, + "grad_norm": 0.46002933382987976, + "learning_rate": 1.3457499423158706e-05, + "loss": 0.2477, + "step": 17390 + }, + { + "epoch": 0.3877323951772138, + "grad_norm": 0.565171480178833, + "learning_rate": 1.345421383109659e-05, + "loss": 0.2647, + "step": 17395 + }, + { + "epoch": 0.3878438445578339, + "grad_norm": 0.5259685516357422, + "learning_rate": 1.3450927815574522e-05, + "loss": 0.2421, + "step": 17400 + }, + { + "epoch": 0.38795529393845396, + "grad_norm": 0.5806754231452942, + "learning_rate": 1.3447641376995337e-05, + "loss": 0.3067, + "step": 17405 + }, + { + "epoch": 0.388066743319074, + "grad_norm": 0.48702603578567505, + "learning_rate": 1.3444354515761935e-05, + "loss": 0.4088, + "step": 17410 + }, + { + "epoch": 0.38817819269969406, + "grad_norm": 0.5694561004638672, + "learning_rate": 1.3441067232277255e-05, + "loss": 0.2687, + "step": 17415 + }, + { + "epoch": 0.38828964208031413, + "grad_norm": 0.5904139876365662, + "learning_rate": 1.343777952694429e-05, + "loss": 0.3926, + "step": 17420 + }, + { + "epoch": 0.3884010914609342, + "grad_norm": 0.5209612846374512, + "learning_rate": 1.3434491400166095e-05, + "loss": 0.3796, + "step": 17425 + }, + { + "epoch": 0.3885125408415543, + "grad_norm": 1.1780948638916016, + "learning_rate": 1.343120285234576e-05, + "loss": 0.3414, + "step": 17430 + }, + { + "epoch": 0.38862399022217436, + "grad_norm": 0.5466594099998474, + "learning_rate": 1.342791388388644e-05, + "loss": 0.3755, + "step": 17435 + }, + { + "epoch": 0.3887354396027944, + "grad_norm": 0.3503721058368683, + "learning_rate": 1.3424624495191344e-05, + "loss": 0.25, + "step": 17440 + }, + { + "epoch": 0.38884688898341446, + "grad_norm": 0.571108877658844, + "learning_rate": 1.3421334686663714e-05, + "loss": 0.3724, + "step": 17445 + }, + { + "epoch": 0.38895833836403454, + "grad_norm": 0.6269806623458862, + "learning_rate": 1.3418044458706859e-05, + "loss": 0.2822, + "step": 17450 + }, + { + "epoch": 0.3890697877446546, + "grad_norm": 0.42661252617836, + "learning_rate": 1.3414753811724139e-05, + "loss": 0.2892, + "step": 17455 + }, + { + "epoch": 0.3891812371252747, + "grad_norm": 0.6101808547973633, + "learning_rate": 1.341146274611896e-05, + "loss": 0.2023, + "step": 17460 + }, + { + "epoch": 0.38929268650589477, + "grad_norm": 0.5674259066581726, + "learning_rate": 1.3408171262294778e-05, + "loss": 0.4122, + "step": 17465 + }, + { + "epoch": 0.3894041358865148, + "grad_norm": 0.5061613917350769, + "learning_rate": 1.3404879360655108e-05, + "loss": 0.3471, + "step": 17470 + }, + { + "epoch": 0.38951558526713487, + "grad_norm": 0.5070977210998535, + "learning_rate": 1.3401587041603512e-05, + "loss": 0.3126, + "step": 17475 + }, + { + "epoch": 0.38962703464775494, + "grad_norm": 0.6381949186325073, + "learning_rate": 1.3398294305543597e-05, + "loss": 0.3114, + "step": 17480 + }, + { + "epoch": 0.389738484028375, + "grad_norm": 0.9275464415550232, + "learning_rate": 1.3395001152879033e-05, + "loss": 0.3702, + "step": 17485 + }, + { + "epoch": 0.3898499334089951, + "grad_norm": 0.7014368176460266, + "learning_rate": 1.3391707584013533e-05, + "loss": 0.3544, + "step": 17490 + }, + { + "epoch": 0.3899613827896152, + "grad_norm": 0.38658320903778076, + "learning_rate": 1.3388413599350862e-05, + "loss": 0.3118, + "step": 17495 + }, + { + "epoch": 0.3900728321702352, + "grad_norm": 0.6096714735031128, + "learning_rate": 1.338511919929484e-05, + "loss": 0.4444, + "step": 17500 + }, + { + "epoch": 0.39018428155085527, + "grad_norm": 0.45221275091171265, + "learning_rate": 1.338182438424933e-05, + "loss": 0.3064, + "step": 17505 + }, + { + "epoch": 0.39029573093147535, + "grad_norm": 0.6235794425010681, + "learning_rate": 1.3378529154618258e-05, + "loss": 0.3525, + "step": 17510 + }, + { + "epoch": 0.3904071803120954, + "grad_norm": 0.49644261598587036, + "learning_rate": 1.337523351080559e-05, + "loss": 0.3963, + "step": 17515 + }, + { + "epoch": 0.3905186296927155, + "grad_norm": 0.39976486563682556, + "learning_rate": 1.3371937453215344e-05, + "loss": 0.372, + "step": 17520 + }, + { + "epoch": 0.3906300790733355, + "grad_norm": 0.582975447177887, + "learning_rate": 1.3368640982251595e-05, + "loss": 0.4404, + "step": 17525 + }, + { + "epoch": 0.3907415284539556, + "grad_norm": 0.7980674505233765, + "learning_rate": 1.3365344098318463e-05, + "loss": 0.4305, + "step": 17530 + }, + { + "epoch": 0.3908529778345757, + "grad_norm": 0.5849584937095642, + "learning_rate": 1.3362046801820122e-05, + "loss": 0.2463, + "step": 17535 + }, + { + "epoch": 0.39096442721519575, + "grad_norm": 0.6233468651771545, + "learning_rate": 1.3358749093160792e-05, + "loss": 0.3788, + "step": 17540 + }, + { + "epoch": 0.39107587659581583, + "grad_norm": 0.7556803226470947, + "learning_rate": 1.335545097274475e-05, + "loss": 0.3482, + "step": 17545 + }, + { + "epoch": 0.3911873259764359, + "grad_norm": 0.521452009677887, + "learning_rate": 1.335215244097632e-05, + "loss": 0.2991, + "step": 17550 + }, + { + "epoch": 0.39129877535705593, + "grad_norm": 0.6280778050422668, + "learning_rate": 1.3348853498259872e-05, + "loss": 0.425, + "step": 17555 + }, + { + "epoch": 0.391410224737676, + "grad_norm": 0.5022149682044983, + "learning_rate": 1.3345554144999837e-05, + "loss": 0.2878, + "step": 17560 + }, + { + "epoch": 0.3915216741182961, + "grad_norm": 0.6211374998092651, + "learning_rate": 1.3342254381600688e-05, + "loss": 0.2642, + "step": 17565 + }, + { + "epoch": 0.39163312349891616, + "grad_norm": 0.634395182132721, + "learning_rate": 1.3338954208466948e-05, + "loss": 0.247, + "step": 17570 + }, + { + "epoch": 0.39174457287953623, + "grad_norm": 0.8194277882575989, + "learning_rate": 1.3335653626003196e-05, + "loss": 0.2874, + "step": 17575 + }, + { + "epoch": 0.3918560222601563, + "grad_norm": 0.37737780809402466, + "learning_rate": 1.3332352634614057e-05, + "loss": 0.2985, + "step": 17580 + }, + { + "epoch": 0.39196747164077633, + "grad_norm": 0.5371142029762268, + "learning_rate": 1.3329051234704209e-05, + "loss": 0.3103, + "step": 17585 + }, + { + "epoch": 0.3920789210213964, + "grad_norm": 0.5832655429840088, + "learning_rate": 1.3325749426678369e-05, + "loss": 0.2908, + "step": 17590 + }, + { + "epoch": 0.3921903704020165, + "grad_norm": 0.5224264860153198, + "learning_rate": 1.3322447210941328e-05, + "loss": 0.2937, + "step": 17595 + }, + { + "epoch": 0.39230181978263656, + "grad_norm": 0.7488956451416016, + "learning_rate": 1.33191445878979e-05, + "loss": 0.3981, + "step": 17600 + }, + { + "epoch": 0.39241326916325664, + "grad_norm": 0.5862334966659546, + "learning_rate": 1.3315841557952967e-05, + "loss": 0.3055, + "step": 17605 + }, + { + "epoch": 0.3925247185438767, + "grad_norm": 0.7196325063705444, + "learning_rate": 1.331253812151145e-05, + "loss": 0.3212, + "step": 17610 + }, + { + "epoch": 0.39263616792449674, + "grad_norm": 0.3566960096359253, + "learning_rate": 1.3309234278978332e-05, + "loss": 0.2657, + "step": 17615 + }, + { + "epoch": 0.3927476173051168, + "grad_norm": 0.5377562642097473, + "learning_rate": 1.3305930030758632e-05, + "loss": 0.2786, + "step": 17620 + }, + { + "epoch": 0.3928590666857369, + "grad_norm": 0.9269275069236755, + "learning_rate": 1.330262537725743e-05, + "loss": 0.4352, + "step": 17625 + }, + { + "epoch": 0.39297051606635697, + "grad_norm": 0.6867557168006897, + "learning_rate": 1.3299320318879849e-05, + "loss": 0.4177, + "step": 17630 + }, + { + "epoch": 0.39308196544697704, + "grad_norm": 1.2014501094818115, + "learning_rate": 1.3296014856031062e-05, + "loss": 0.4319, + "step": 17635 + }, + { + "epoch": 0.3931934148275971, + "grad_norm": 0.56941819190979, + "learning_rate": 1.32927089891163e-05, + "loss": 0.2071, + "step": 17640 + }, + { + "epoch": 0.39330486420821714, + "grad_norm": 0.6028541922569275, + "learning_rate": 1.3289402718540826e-05, + "loss": 0.414, + "step": 17645 + }, + { + "epoch": 0.3934163135888372, + "grad_norm": 0.5946452021598816, + "learning_rate": 1.328609604470997e-05, + "loss": 0.3379, + "step": 17650 + }, + { + "epoch": 0.3935277629694573, + "grad_norm": 0.5714167356491089, + "learning_rate": 1.3282788968029108e-05, + "loss": 0.3109, + "step": 17655 + }, + { + "epoch": 0.39363921235007737, + "grad_norm": 0.8042417764663696, + "learning_rate": 1.3279481488903651e-05, + "loss": 0.3508, + "step": 17660 + }, + { + "epoch": 0.39375066173069745, + "grad_norm": 0.46548306941986084, + "learning_rate": 1.3276173607739082e-05, + "loss": 0.2213, + "step": 17665 + }, + { + "epoch": 0.3938621111113175, + "grad_norm": 0.7371535897254944, + "learning_rate": 1.3272865324940916e-05, + "loss": 0.3666, + "step": 17670 + }, + { + "epoch": 0.39397356049193755, + "grad_norm": 0.7542550563812256, + "learning_rate": 1.3269556640914726e-05, + "loss": 0.3432, + "step": 17675 + }, + { + "epoch": 0.3940850098725576, + "grad_norm": 0.6199653148651123, + "learning_rate": 1.3266247556066122e-05, + "loss": 0.3421, + "step": 17680 + }, + { + "epoch": 0.3941964592531777, + "grad_norm": 0.5280221104621887, + "learning_rate": 1.3262938070800786e-05, + "loss": 0.2772, + "step": 17685 + }, + { + "epoch": 0.3943079086337978, + "grad_norm": 0.6193488240242004, + "learning_rate": 1.3259628185524426e-05, + "loss": 0.3734, + "step": 17690 + }, + { + "epoch": 0.39441935801441785, + "grad_norm": 0.6555532217025757, + "learning_rate": 1.325631790064281e-05, + "loss": 0.3828, + "step": 17695 + }, + { + "epoch": 0.3945308073950379, + "grad_norm": 0.44673195481300354, + "learning_rate": 1.3253007216561755e-05, + "loss": 0.2106, + "step": 17700 + }, + { + "epoch": 0.39464225677565795, + "grad_norm": 0.8975861668586731, + "learning_rate": 1.3249696133687127e-05, + "loss": 0.3603, + "step": 17705 + }, + { + "epoch": 0.39475370615627803, + "grad_norm": 0.5637463331222534, + "learning_rate": 1.3246384652424832e-05, + "loss": 0.3267, + "step": 17710 + }, + { + "epoch": 0.3948651555368981, + "grad_norm": 0.5970215797424316, + "learning_rate": 1.3243072773180841e-05, + "loss": 0.3672, + "step": 17715 + }, + { + "epoch": 0.3949766049175182, + "grad_norm": 0.47597572207450867, + "learning_rate": 1.323976049636116e-05, + "loss": 0.246, + "step": 17720 + }, + { + "epoch": 0.39508805429813826, + "grad_norm": 0.5651100873947144, + "learning_rate": 1.323644782237185e-05, + "loss": 0.4041, + "step": 17725 + }, + { + "epoch": 0.3951995036787583, + "grad_norm": 0.4764869809150696, + "learning_rate": 1.3233134751619018e-05, + "loss": 0.3636, + "step": 17730 + }, + { + "epoch": 0.39531095305937836, + "grad_norm": 0.44785913825035095, + "learning_rate": 1.3229821284508817e-05, + "loss": 0.2818, + "step": 17735 + }, + { + "epoch": 0.39542240243999843, + "grad_norm": 0.5839234590530396, + "learning_rate": 1.322650742144746e-05, + "loss": 0.2325, + "step": 17740 + }, + { + "epoch": 0.3955338518206185, + "grad_norm": 0.7260363698005676, + "learning_rate": 1.32231931628412e-05, + "loss": 0.2708, + "step": 17745 + }, + { + "epoch": 0.3956453012012386, + "grad_norm": 0.4467025399208069, + "learning_rate": 1.3219878509096336e-05, + "loss": 0.304, + "step": 17750 + }, + { + "epoch": 0.39575675058185866, + "grad_norm": 0.783078134059906, + "learning_rate": 1.321656346061922e-05, + "loss": 0.3295, + "step": 17755 + }, + { + "epoch": 0.3958681999624787, + "grad_norm": 1.3599778413772583, + "learning_rate": 1.3213248017816251e-05, + "loss": 0.3469, + "step": 17760 + }, + { + "epoch": 0.39597964934309876, + "grad_norm": 0.6114479899406433, + "learning_rate": 1.320993218109388e-05, + "loss": 0.3666, + "step": 17765 + }, + { + "epoch": 0.39609109872371884, + "grad_norm": 0.5643268823623657, + "learning_rate": 1.32066159508586e-05, + "loss": 0.317, + "step": 17770 + }, + { + "epoch": 0.3962025481043389, + "grad_norm": 0.5749050974845886, + "learning_rate": 1.3203299327516951e-05, + "loss": 0.3648, + "step": 17775 + }, + { + "epoch": 0.396313997484959, + "grad_norm": 0.6513046622276306, + "learning_rate": 1.3199982311475536e-05, + "loss": 0.3595, + "step": 17780 + }, + { + "epoch": 0.39642544686557907, + "grad_norm": 0.49400731921195984, + "learning_rate": 1.3196664903140986e-05, + "loss": 0.2408, + "step": 17785 + }, + { + "epoch": 0.3965368962461991, + "grad_norm": 0.5998428463935852, + "learning_rate": 1.3193347102919995e-05, + "loss": 0.2253, + "step": 17790 + }, + { + "epoch": 0.39664834562681917, + "grad_norm": 0.5260230898857117, + "learning_rate": 1.3190028911219299e-05, + "loss": 0.3894, + "step": 17795 + }, + { + "epoch": 0.39675979500743924, + "grad_norm": 0.8185827136039734, + "learning_rate": 1.3186710328445681e-05, + "loss": 0.402, + "step": 17800 + }, + { + "epoch": 0.3968712443880593, + "grad_norm": 0.5452010631561279, + "learning_rate": 1.3183391355005972e-05, + "loss": 0.3864, + "step": 17805 + }, + { + "epoch": 0.3969826937686794, + "grad_norm": 0.5198045969009399, + "learning_rate": 1.3180071991307058e-05, + "loss": 0.3525, + "step": 17810 + }, + { + "epoch": 0.39709414314929947, + "grad_norm": 0.49183177947998047, + "learning_rate": 1.3176752237755866e-05, + "loss": 0.1952, + "step": 17815 + }, + { + "epoch": 0.3972055925299195, + "grad_norm": 0.9827027320861816, + "learning_rate": 1.3173432094759364e-05, + "loss": 0.3477, + "step": 17820 + }, + { + "epoch": 0.39731704191053957, + "grad_norm": 0.6063224673271179, + "learning_rate": 1.3170111562724586e-05, + "loss": 0.2805, + "step": 17825 + }, + { + "epoch": 0.39742849129115965, + "grad_norm": 0.5842794179916382, + "learning_rate": 1.31667906420586e-05, + "loss": 0.389, + "step": 17830 + }, + { + "epoch": 0.3975399406717797, + "grad_norm": 0.5279350280761719, + "learning_rate": 1.3163469333168526e-05, + "loss": 0.309, + "step": 17835 + }, + { + "epoch": 0.3976513900523998, + "grad_norm": 0.578009843826294, + "learning_rate": 1.3160147636461527e-05, + "loss": 0.3562, + "step": 17840 + }, + { + "epoch": 0.3977628394330199, + "grad_norm": 0.5486703515052795, + "learning_rate": 1.3156825552344822e-05, + "loss": 0.4434, + "step": 17845 + }, + { + "epoch": 0.3978742888136399, + "grad_norm": 0.536373496055603, + "learning_rate": 1.315350308122567e-05, + "loss": 0.3216, + "step": 17850 + }, + { + "epoch": 0.39798573819426, + "grad_norm": 0.4132682979106903, + "learning_rate": 1.3150180223511383e-05, + "loss": 0.2851, + "step": 17855 + }, + { + "epoch": 0.39809718757488005, + "grad_norm": 0.4532261788845062, + "learning_rate": 1.314685697960932e-05, + "loss": 0.3908, + "step": 17860 + }, + { + "epoch": 0.39820863695550013, + "grad_norm": 0.5398720502853394, + "learning_rate": 1.3143533349926875e-05, + "loss": 0.3451, + "step": 17865 + }, + { + "epoch": 0.3983200863361202, + "grad_norm": 0.5097636580467224, + "learning_rate": 1.314020933487151e-05, + "loss": 0.3461, + "step": 17870 + }, + { + "epoch": 0.3984315357167403, + "grad_norm": 0.6569482684135437, + "learning_rate": 1.3136884934850719e-05, + "loss": 0.4025, + "step": 17875 + }, + { + "epoch": 0.3985429850973603, + "grad_norm": 0.5743000507354736, + "learning_rate": 1.3133560150272047e-05, + "loss": 0.4101, + "step": 17880 + }, + { + "epoch": 0.3986544344779804, + "grad_norm": 0.49067550897598267, + "learning_rate": 1.3130234981543087e-05, + "loss": 0.2034, + "step": 17885 + }, + { + "epoch": 0.39876588385860046, + "grad_norm": 0.6841097474098206, + "learning_rate": 1.312690942907148e-05, + "loss": 0.3888, + "step": 17890 + }, + { + "epoch": 0.39887733323922053, + "grad_norm": 0.5429337024688721, + "learning_rate": 1.3123583493264913e-05, + "loss": 0.4961, + "step": 17895 + }, + { + "epoch": 0.3989887826198406, + "grad_norm": 0.56031334400177, + "learning_rate": 1.312025717453112e-05, + "loss": 0.3289, + "step": 17900 + }, + { + "epoch": 0.39910023200046063, + "grad_norm": 0.6107448935508728, + "learning_rate": 1.3116930473277884e-05, + "loss": 0.3042, + "step": 17905 + }, + { + "epoch": 0.3992116813810807, + "grad_norm": 0.30036771297454834, + "learning_rate": 1.3113603389913027e-05, + "loss": 0.2888, + "step": 17910 + }, + { + "epoch": 0.3993231307617008, + "grad_norm": 0.5498743653297424, + "learning_rate": 1.3110275924844432e-05, + "loss": 0.3323, + "step": 17915 + }, + { + "epoch": 0.39943458014232086, + "grad_norm": 0.31906214356422424, + "learning_rate": 1.3106948078480014e-05, + "loss": 0.2188, + "step": 17920 + }, + { + "epoch": 0.39954602952294094, + "grad_norm": 0.8935547471046448, + "learning_rate": 1.3103619851227744e-05, + "loss": 0.3219, + "step": 17925 + }, + { + "epoch": 0.399657478903561, + "grad_norm": 0.5760512351989746, + "learning_rate": 1.3100291243495637e-05, + "loss": 0.2671, + "step": 17930 + }, + { + "epoch": 0.39976892828418104, + "grad_norm": 0.7435246706008911, + "learning_rate": 1.3096962255691755e-05, + "loss": 0.4353, + "step": 17935 + }, + { + "epoch": 0.3998803776648011, + "grad_norm": 0.6022874712944031, + "learning_rate": 1.3093632888224206e-05, + "loss": 0.3313, + "step": 17940 + }, + { + "epoch": 0.3999918270454212, + "grad_norm": 0.897080659866333, + "learning_rate": 1.3090303141501139e-05, + "loss": 0.2699, + "step": 17945 + }, + { + "epoch": 0.40010327642604127, + "grad_norm": 0.7807203531265259, + "learning_rate": 1.3086973015930763e-05, + "loss": 0.3183, + "step": 17950 + }, + { + "epoch": 0.40021472580666134, + "grad_norm": 0.9843493103981018, + "learning_rate": 1.3083642511921325e-05, + "loss": 0.3, + "step": 17955 + }, + { + "epoch": 0.4003261751872814, + "grad_norm": 0.47445183992385864, + "learning_rate": 1.3080311629881112e-05, + "loss": 0.4984, + "step": 17960 + }, + { + "epoch": 0.40043762456790144, + "grad_norm": 0.8390313982963562, + "learning_rate": 1.3076980370218478e-05, + "loss": 0.3191, + "step": 17965 + }, + { + "epoch": 0.4005490739485215, + "grad_norm": 0.61275714635849, + "learning_rate": 1.3073648733341796e-05, + "loss": 0.3963, + "step": 17970 + }, + { + "epoch": 0.4006605233291416, + "grad_norm": 0.8645874261856079, + "learning_rate": 1.3070316719659503e-05, + "loss": 0.2564, + "step": 17975 + }, + { + "epoch": 0.40077197270976167, + "grad_norm": 0.5309740900993347, + "learning_rate": 1.3066984329580081e-05, + "loss": 0.2978, + "step": 17980 + }, + { + "epoch": 0.40088342209038175, + "grad_norm": 0.5536980628967285, + "learning_rate": 1.3063651563512054e-05, + "loss": 0.3188, + "step": 17985 + }, + { + "epoch": 0.4009948714710018, + "grad_norm": 0.788591742515564, + "learning_rate": 1.3060318421863994e-05, + "loss": 0.3653, + "step": 17990 + }, + { + "epoch": 0.40110632085162184, + "grad_norm": 0.5309394598007202, + "learning_rate": 1.3056984905044516e-05, + "loss": 0.2452, + "step": 17995 + }, + { + "epoch": 0.4012177702322419, + "grad_norm": 0.5781253576278687, + "learning_rate": 1.3053651013462285e-05, + "loss": 0.2701, + "step": 18000 + }, + { + "epoch": 0.401329219612862, + "grad_norm": 0.552573561668396, + "learning_rate": 1.305031674752601e-05, + "loss": 0.2345, + "step": 18005 + }, + { + "epoch": 0.4014406689934821, + "grad_norm": 0.5335693359375, + "learning_rate": 1.3046982107644448e-05, + "loss": 0.268, + "step": 18010 + }, + { + "epoch": 0.40155211837410215, + "grad_norm": 0.6344022750854492, + "learning_rate": 1.3043647094226397e-05, + "loss": 0.3166, + "step": 18015 + }, + { + "epoch": 0.40166356775472223, + "grad_norm": 0.6636702418327332, + "learning_rate": 1.3040311707680706e-05, + "loss": 0.4501, + "step": 18020 + }, + { + "epoch": 0.40177501713534225, + "grad_norm": 0.33217310905456543, + "learning_rate": 1.3036975948416268e-05, + "loss": 0.2042, + "step": 18025 + }, + { + "epoch": 0.4018864665159623, + "grad_norm": 0.7773119807243347, + "learning_rate": 1.3033639816842023e-05, + "loss": 0.3389, + "step": 18030 + }, + { + "epoch": 0.4019979158965824, + "grad_norm": 0.4810592234134674, + "learning_rate": 1.3030303313366948e-05, + "loss": 0.2472, + "step": 18035 + }, + { + "epoch": 0.4021093652772025, + "grad_norm": 0.5222720503807068, + "learning_rate": 1.3026966438400079e-05, + "loss": 0.3122, + "step": 18040 + }, + { + "epoch": 0.40222081465782256, + "grad_norm": 0.6859990954399109, + "learning_rate": 1.302362919235049e-05, + "loss": 0.3732, + "step": 18045 + }, + { + "epoch": 0.40233226403844263, + "grad_norm": 0.5260218381881714, + "learning_rate": 1.30202915756273e-05, + "loss": 0.3085, + "step": 18050 + }, + { + "epoch": 0.40244371341906265, + "grad_norm": 0.6136817336082458, + "learning_rate": 1.3016953588639676e-05, + "loss": 0.3202, + "step": 18055 + }, + { + "epoch": 0.40255516279968273, + "grad_norm": 0.7072620987892151, + "learning_rate": 1.3013615231796829e-05, + "loss": 0.2627, + "step": 18060 + }, + { + "epoch": 0.4026666121803028, + "grad_norm": 0.5630493760108948, + "learning_rate": 1.3010276505508017e-05, + "loss": 0.2619, + "step": 18065 + }, + { + "epoch": 0.4027780615609229, + "grad_norm": 0.5899954438209534, + "learning_rate": 1.3006937410182539e-05, + "loss": 0.3468, + "step": 18070 + }, + { + "epoch": 0.40288951094154296, + "grad_norm": 0.6174592971801758, + "learning_rate": 1.3003597946229747e-05, + "loss": 0.3401, + "step": 18075 + }, + { + "epoch": 0.403000960322163, + "grad_norm": 0.9012115001678467, + "learning_rate": 1.300025811405903e-05, + "loss": 0.2373, + "step": 18080 + }, + { + "epoch": 0.40311240970278306, + "grad_norm": 0.49508702754974365, + "learning_rate": 1.2996917914079826e-05, + "loss": 0.3124, + "step": 18085 + }, + { + "epoch": 0.40322385908340314, + "grad_norm": 0.8170999884605408, + "learning_rate": 1.299357734670162e-05, + "loss": 0.2728, + "step": 18090 + }, + { + "epoch": 0.4033353084640232, + "grad_norm": 0.4755793809890747, + "learning_rate": 1.299023641233394e-05, + "loss": 0.4259, + "step": 18095 + }, + { + "epoch": 0.4034467578446433, + "grad_norm": 0.520043671131134, + "learning_rate": 1.2986895111386351e-05, + "loss": 0.3457, + "step": 18100 + }, + { + "epoch": 0.40355820722526337, + "grad_norm": 0.5027348399162292, + "learning_rate": 1.298355344426848e-05, + "loss": 0.3179, + "step": 18105 + }, + { + "epoch": 0.4036696566058834, + "grad_norm": 0.5023277401924133, + "learning_rate": 1.2980211411389987e-05, + "loss": 0.1961, + "step": 18110 + }, + { + "epoch": 0.40378110598650346, + "grad_norm": 0.5529801249504089, + "learning_rate": 1.2976869013160577e-05, + "loss": 0.2631, + "step": 18115 + }, + { + "epoch": 0.40389255536712354, + "grad_norm": 0.652068555355072, + "learning_rate": 1.2973526249990006e-05, + "loss": 0.3448, + "step": 18120 + }, + { + "epoch": 0.4040040047477436, + "grad_norm": 0.5933851599693298, + "learning_rate": 1.2970183122288068e-05, + "loss": 0.421, + "step": 18125 + }, + { + "epoch": 0.4041154541283637, + "grad_norm": 0.4105674922466278, + "learning_rate": 1.2966839630464603e-05, + "loss": 0.2904, + "step": 18130 + }, + { + "epoch": 0.40422690350898377, + "grad_norm": 0.7906683683395386, + "learning_rate": 1.2963495774929505e-05, + "loss": 0.2608, + "step": 18135 + }, + { + "epoch": 0.4043383528896038, + "grad_norm": 0.8182550072669983, + "learning_rate": 1.2960151556092694e-05, + "loss": 0.3075, + "step": 18140 + }, + { + "epoch": 0.40444980227022387, + "grad_norm": 0.7050228118896484, + "learning_rate": 1.2956806974364153e-05, + "loss": 0.2282, + "step": 18145 + }, + { + "epoch": 0.40456125165084394, + "grad_norm": 0.5624240040779114, + "learning_rate": 1.2953462030153902e-05, + "loss": 0.2506, + "step": 18150 + }, + { + "epoch": 0.404672701031464, + "grad_norm": 0.6622910499572754, + "learning_rate": 1.2950116723872002e-05, + "loss": 0.2953, + "step": 18155 + }, + { + "epoch": 0.4047841504120841, + "grad_norm": 0.5642595291137695, + "learning_rate": 1.2946771055928562e-05, + "loss": 0.3647, + "step": 18160 + }, + { + "epoch": 0.4048955997927042, + "grad_norm": 0.29868683218955994, + "learning_rate": 1.2943425026733738e-05, + "loss": 0.2502, + "step": 18165 + }, + { + "epoch": 0.4050070491733242, + "grad_norm": 0.7383465766906738, + "learning_rate": 1.2940078636697726e-05, + "loss": 0.3526, + "step": 18170 + }, + { + "epoch": 0.4051184985539443, + "grad_norm": 0.6359235048294067, + "learning_rate": 1.2936731886230761e-05, + "loss": 0.3998, + "step": 18175 + }, + { + "epoch": 0.40522994793456435, + "grad_norm": 0.4913898706436157, + "learning_rate": 1.2933384775743143e-05, + "loss": 0.252, + "step": 18180 + }, + { + "epoch": 0.4053413973151844, + "grad_norm": 0.7118620872497559, + "learning_rate": 1.2930037305645191e-05, + "loss": 0.3759, + "step": 18185 + }, + { + "epoch": 0.4054528466958045, + "grad_norm": 0.6543040871620178, + "learning_rate": 1.292668947634728e-05, + "loss": 0.1789, + "step": 18190 + }, + { + "epoch": 0.4055642960764246, + "grad_norm": 0.7008522152900696, + "learning_rate": 1.292334128825983e-05, + "loss": 0.311, + "step": 18195 + }, + { + "epoch": 0.4056757454570446, + "grad_norm": 0.5294999480247498, + "learning_rate": 1.2919992741793306e-05, + "loss": 0.2591, + "step": 18200 + }, + { + "epoch": 0.4057871948376647, + "grad_norm": 0.44648101925849915, + "learning_rate": 1.2916643837358205e-05, + "loss": 0.2552, + "step": 18205 + }, + { + "epoch": 0.40589864421828475, + "grad_norm": 0.6527552604675293, + "learning_rate": 1.2913294575365086e-05, + "loss": 0.357, + "step": 18210 + }, + { + "epoch": 0.40601009359890483, + "grad_norm": 0.5339484214782715, + "learning_rate": 1.290994495622454e-05, + "loss": 0.28, + "step": 18215 + }, + { + "epoch": 0.4061215429795249, + "grad_norm": 0.5636955499649048, + "learning_rate": 1.2906594980347202e-05, + "loss": 0.368, + "step": 18220 + }, + { + "epoch": 0.406232992360145, + "grad_norm": 0.5656212568283081, + "learning_rate": 1.2903244648143756e-05, + "loss": 0.3734, + "step": 18225 + }, + { + "epoch": 0.406344441740765, + "grad_norm": 0.9620150923728943, + "learning_rate": 1.2899893960024926e-05, + "loss": 0.3122, + "step": 18230 + }, + { + "epoch": 0.4064558911213851, + "grad_norm": 0.7658068537712097, + "learning_rate": 1.2896542916401477e-05, + "loss": 0.3063, + "step": 18235 + }, + { + "epoch": 0.40656734050200516, + "grad_norm": 0.49050748348236084, + "learning_rate": 1.289319151768423e-05, + "loss": 0.2702, + "step": 18240 + }, + { + "epoch": 0.40667878988262524, + "grad_norm": 0.6088207364082336, + "learning_rate": 1.2889839764284032e-05, + "loss": 0.4151, + "step": 18245 + }, + { + "epoch": 0.4067902392632453, + "grad_norm": 0.5077471733093262, + "learning_rate": 1.2886487656611786e-05, + "loss": 0.3534, + "step": 18250 + }, + { + "epoch": 0.4069016886438654, + "grad_norm": 1.2243595123291016, + "learning_rate": 1.2883135195078431e-05, + "loss": 0.3367, + "step": 18255 + }, + { + "epoch": 0.4070131380244854, + "grad_norm": 0.4729726314544678, + "learning_rate": 1.2879782380094958e-05, + "loss": 0.295, + "step": 18260 + }, + { + "epoch": 0.4071245874051055, + "grad_norm": 0.428756982088089, + "learning_rate": 1.2876429212072391e-05, + "loss": 0.267, + "step": 18265 + }, + { + "epoch": 0.40723603678572556, + "grad_norm": 0.5256748795509338, + "learning_rate": 1.2873075691421808e-05, + "loss": 0.3028, + "step": 18270 + }, + { + "epoch": 0.40734748616634564, + "grad_norm": 0.512870728969574, + "learning_rate": 1.2869721818554321e-05, + "loss": 0.2903, + "step": 18275 + }, + { + "epoch": 0.4074589355469657, + "grad_norm": 0.811591386795044, + "learning_rate": 1.286636759388109e-05, + "loss": 0.3306, + "step": 18280 + }, + { + "epoch": 0.40757038492758574, + "grad_norm": 0.556670069694519, + "learning_rate": 1.2863013017813316e-05, + "loss": 0.3757, + "step": 18285 + }, + { + "epoch": 0.4076818343082058, + "grad_norm": 0.57549649477005, + "learning_rate": 1.2859658090762248e-05, + "loss": 0.3665, + "step": 18290 + }, + { + "epoch": 0.4077932836888259, + "grad_norm": 0.6680387854576111, + "learning_rate": 1.285630281313917e-05, + "loss": 0.2735, + "step": 18295 + }, + { + "epoch": 0.40790473306944597, + "grad_norm": 0.6254687309265137, + "learning_rate": 1.2852947185355409e-05, + "loss": 0.3771, + "step": 18300 + }, + { + "epoch": 0.40801618245006605, + "grad_norm": 0.9815579056739807, + "learning_rate": 1.2849591207822351e-05, + "loss": 0.4015, + "step": 18305 + }, + { + "epoch": 0.4081276318306861, + "grad_norm": 0.4263515770435333, + "learning_rate": 1.2846234880951406e-05, + "loss": 0.3862, + "step": 18310 + }, + { + "epoch": 0.40823908121130614, + "grad_norm": 0.582800030708313, + "learning_rate": 1.284287820515403e-05, + "loss": 0.2104, + "step": 18315 + }, + { + "epoch": 0.4083505305919262, + "grad_norm": 0.6399257779121399, + "learning_rate": 1.2839521180841735e-05, + "loss": 0.3136, + "step": 18320 + }, + { + "epoch": 0.4084619799725463, + "grad_norm": 0.599901556968689, + "learning_rate": 1.283616380842606e-05, + "loss": 0.3759, + "step": 18325 + }, + { + "epoch": 0.4085734293531664, + "grad_norm": 0.5146269798278809, + "learning_rate": 1.2832806088318587e-05, + "loss": 0.3023, + "step": 18330 + }, + { + "epoch": 0.40868487873378645, + "grad_norm": 0.5369811058044434, + "learning_rate": 1.2829448020930959e-05, + "loss": 0.2933, + "step": 18335 + }, + { + "epoch": 0.4087963281144065, + "grad_norm": 0.4636971056461334, + "learning_rate": 1.2826089606674845e-05, + "loss": 0.3844, + "step": 18340 + }, + { + "epoch": 0.40890777749502655, + "grad_norm": 0.6950342655181885, + "learning_rate": 1.2822730845961958e-05, + "loss": 0.3928, + "step": 18345 + }, + { + "epoch": 0.4090192268756466, + "grad_norm": 0.4408503770828247, + "learning_rate": 1.2819371739204054e-05, + "loss": 0.2776, + "step": 18350 + }, + { + "epoch": 0.4091306762562667, + "grad_norm": 0.44306161999702454, + "learning_rate": 1.2816012286812942e-05, + "loss": 0.3134, + "step": 18355 + }, + { + "epoch": 0.4092421256368868, + "grad_norm": 0.4013633131980896, + "learning_rate": 1.2812652489200457e-05, + "loss": 0.3159, + "step": 18360 + }, + { + "epoch": 0.40935357501750685, + "grad_norm": 0.7099652290344238, + "learning_rate": 1.2809292346778488e-05, + "loss": 0.3841, + "step": 18365 + }, + { + "epoch": 0.40946502439812693, + "grad_norm": 0.6522414088249207, + "learning_rate": 1.2805931859958962e-05, + "loss": 0.4007, + "step": 18370 + }, + { + "epoch": 0.40957647377874695, + "grad_norm": 0.3149759769439697, + "learning_rate": 1.2802571029153848e-05, + "loss": 0.3963, + "step": 18375 + }, + { + "epoch": 0.40968792315936703, + "grad_norm": 0.7824275493621826, + "learning_rate": 1.279920985477516e-05, + "loss": 0.2676, + "step": 18380 + }, + { + "epoch": 0.4097993725399871, + "grad_norm": 0.501352846622467, + "learning_rate": 1.2795848337234946e-05, + "loss": 0.2539, + "step": 18385 + }, + { + "epoch": 0.4099108219206072, + "grad_norm": 0.6711282134056091, + "learning_rate": 1.279248647694531e-05, + "loss": 0.3712, + "step": 18390 + }, + { + "epoch": 0.41002227130122726, + "grad_norm": 0.5977444648742676, + "learning_rate": 1.2789124274318386e-05, + "loss": 0.4149, + "step": 18395 + }, + { + "epoch": 0.41013372068184734, + "grad_norm": 0.3847135305404663, + "learning_rate": 1.2785761729766356e-05, + "loss": 0.278, + "step": 18400 + }, + { + "epoch": 0.41024517006246736, + "grad_norm": 0.5048947334289551, + "learning_rate": 1.278239884370144e-05, + "loss": 0.3682, + "step": 18405 + }, + { + "epoch": 0.41035661944308743, + "grad_norm": 0.495900958776474, + "learning_rate": 1.27790356165359e-05, + "loss": 0.3405, + "step": 18410 + }, + { + "epoch": 0.4104680688237075, + "grad_norm": 0.40374094247817993, + "learning_rate": 1.2775672048682047e-05, + "loss": 0.2201, + "step": 18415 + }, + { + "epoch": 0.4105795182043276, + "grad_norm": 0.5748105049133301, + "learning_rate": 1.2772308140552224e-05, + "loss": 0.2755, + "step": 18420 + }, + { + "epoch": 0.41069096758494766, + "grad_norm": 0.5982359647750854, + "learning_rate": 1.2768943892558823e-05, + "loss": 0.3694, + "step": 18425 + }, + { + "epoch": 0.41080241696556774, + "grad_norm": 0.48614931106567383, + "learning_rate": 1.2765579305114276e-05, + "loss": 0.3353, + "step": 18430 + }, + { + "epoch": 0.41091386634618776, + "grad_norm": 0.6232022047042847, + "learning_rate": 1.2762214378631049e-05, + "loss": 0.2576, + "step": 18435 + }, + { + "epoch": 0.41102531572680784, + "grad_norm": 0.7191541790962219, + "learning_rate": 1.275884911352166e-05, + "loss": 0.3179, + "step": 18440 + }, + { + "epoch": 0.4111367651074279, + "grad_norm": 0.6166452169418335, + "learning_rate": 1.2755483510198668e-05, + "loss": 0.2123, + "step": 18445 + }, + { + "epoch": 0.411248214488048, + "grad_norm": 0.6613633632659912, + "learning_rate": 1.2752117569074667e-05, + "loss": 0.4291, + "step": 18450 + }, + { + "epoch": 0.41135966386866807, + "grad_norm": 0.6093412637710571, + "learning_rate": 1.2748751290562291e-05, + "loss": 0.3531, + "step": 18455 + }, + { + "epoch": 0.4114711132492881, + "grad_norm": 0.9398419857025146, + "learning_rate": 1.2745384675074228e-05, + "loss": 0.2857, + "step": 18460 + }, + { + "epoch": 0.41158256262990817, + "grad_norm": 0.6028172969818115, + "learning_rate": 1.2742017723023194e-05, + "loss": 0.3597, + "step": 18465 + }, + { + "epoch": 0.41169401201052824, + "grad_norm": 0.5513226985931396, + "learning_rate": 1.2738650434821952e-05, + "loss": 0.3979, + "step": 18470 + }, + { + "epoch": 0.4118054613911483, + "grad_norm": 0.5270368456840515, + "learning_rate": 1.2735282810883303e-05, + "loss": 0.4054, + "step": 18475 + }, + { + "epoch": 0.4119169107717684, + "grad_norm": 0.48779648542404175, + "learning_rate": 1.2731914851620099e-05, + "loss": 0.3752, + "step": 18480 + }, + { + "epoch": 0.4120283601523885, + "grad_norm": 0.5824145674705505, + "learning_rate": 1.2728546557445218e-05, + "loss": 0.3695, + "step": 18485 + }, + { + "epoch": 0.4121398095330085, + "grad_norm": 0.5668191909790039, + "learning_rate": 1.2725177928771591e-05, + "loss": 0.2761, + "step": 18490 + }, + { + "epoch": 0.41225125891362857, + "grad_norm": 0.4143696129322052, + "learning_rate": 1.2721808966012188e-05, + "loss": 0.3978, + "step": 18495 + }, + { + "epoch": 0.41236270829424865, + "grad_norm": 0.4864271283149719, + "learning_rate": 1.2718439669580009e-05, + "loss": 0.2801, + "step": 18500 + }, + { + "epoch": 0.4124741576748687, + "grad_norm": 0.6782664656639099, + "learning_rate": 1.2715070039888116e-05, + "loss": 0.3272, + "step": 18505 + }, + { + "epoch": 0.4125856070554888, + "grad_norm": 0.7863107323646545, + "learning_rate": 1.2711700077349589e-05, + "loss": 0.254, + "step": 18510 + }, + { + "epoch": 0.4126970564361089, + "grad_norm": 0.7221528887748718, + "learning_rate": 1.2708329782377565e-05, + "loss": 0.3955, + "step": 18515 + }, + { + "epoch": 0.4128085058167289, + "grad_norm": 0.6611953973770142, + "learning_rate": 1.2704959155385217e-05, + "loss": 0.2752, + "step": 18520 + }, + { + "epoch": 0.412919955197349, + "grad_norm": 0.48354998230934143, + "learning_rate": 1.2701588196785755e-05, + "loss": 0.2948, + "step": 18525 + }, + { + "epoch": 0.41303140457796905, + "grad_norm": 0.4558368921279907, + "learning_rate": 1.2698216906992426e-05, + "loss": 0.4377, + "step": 18530 + }, + { + "epoch": 0.41314285395858913, + "grad_norm": 0.6224683523178101, + "learning_rate": 1.269484528641854e-05, + "loss": 0.2484, + "step": 18535 + }, + { + "epoch": 0.4132543033392092, + "grad_norm": 0.5457311272621155, + "learning_rate": 1.269147333547742e-05, + "loss": 0.3035, + "step": 18540 + }, + { + "epoch": 0.4133657527198293, + "grad_norm": 0.6461859345436096, + "learning_rate": 1.2688101054582443e-05, + "loss": 0.3112, + "step": 18545 + }, + { + "epoch": 0.4134772021004493, + "grad_norm": 0.46919846534729004, + "learning_rate": 1.2684728444147027e-05, + "loss": 0.3344, + "step": 18550 + }, + { + "epoch": 0.4135886514810694, + "grad_norm": 0.4080178737640381, + "learning_rate": 1.2681355504584627e-05, + "loss": 0.3146, + "step": 18555 + }, + { + "epoch": 0.41370010086168946, + "grad_norm": 1.5496551990509033, + "learning_rate": 1.2677982236308737e-05, + "loss": 0.2363, + "step": 18560 + }, + { + "epoch": 0.41381155024230953, + "grad_norm": 0.5933051109313965, + "learning_rate": 1.2674608639732896e-05, + "loss": 0.361, + "step": 18565 + }, + { + "epoch": 0.4139229996229296, + "grad_norm": 0.5045031905174255, + "learning_rate": 1.2671234715270684e-05, + "loss": 0.3109, + "step": 18570 + }, + { + "epoch": 0.4140344490035497, + "grad_norm": 0.7789613604545593, + "learning_rate": 1.2667860463335711e-05, + "loss": 0.4204, + "step": 18575 + }, + { + "epoch": 0.4141458983841697, + "grad_norm": 0.4235498309135437, + "learning_rate": 1.2664485884341638e-05, + "loss": 0.3569, + "step": 18580 + }, + { + "epoch": 0.4142573477647898, + "grad_norm": 0.4231083393096924, + "learning_rate": 1.2661110978702164e-05, + "loss": 0.3355, + "step": 18585 + }, + { + "epoch": 0.41436879714540986, + "grad_norm": 0.49784886837005615, + "learning_rate": 1.2657735746831023e-05, + "loss": 0.1982, + "step": 18590 + }, + { + "epoch": 0.41448024652602994, + "grad_norm": 0.6310487985610962, + "learning_rate": 1.2654360189141997e-05, + "loss": 0.3131, + "step": 18595 + }, + { + "epoch": 0.41459169590665, + "grad_norm": 0.48694488406181335, + "learning_rate": 1.2650984306048897e-05, + "loss": 0.2527, + "step": 18600 + }, + { + "epoch": 0.4147031452872701, + "grad_norm": 0.6880688667297363, + "learning_rate": 1.2647608097965585e-05, + "loss": 0.3681, + "step": 18605 + }, + { + "epoch": 0.4148145946678901, + "grad_norm": 0.7431567907333374, + "learning_rate": 1.2644231565305955e-05, + "loss": 0.3137, + "step": 18610 + }, + { + "epoch": 0.4149260440485102, + "grad_norm": 0.5349269509315491, + "learning_rate": 1.2640854708483948e-05, + "loss": 0.3026, + "step": 18615 + }, + { + "epoch": 0.41503749342913027, + "grad_norm": 0.86985182762146, + "learning_rate": 1.2637477527913538e-05, + "loss": 0.445, + "step": 18620 + }, + { + "epoch": 0.41514894280975034, + "grad_norm": 0.6898271441459656, + "learning_rate": 1.2634100024008737e-05, + "loss": 0.329, + "step": 18625 + }, + { + "epoch": 0.4152603921903704, + "grad_norm": 0.612746000289917, + "learning_rate": 1.2630722197183611e-05, + "loss": 0.271, + "step": 18630 + }, + { + "epoch": 0.4153718415709905, + "grad_norm": 0.5866104364395142, + "learning_rate": 1.2627344047852246e-05, + "loss": 0.4016, + "step": 18635 + }, + { + "epoch": 0.4154832909516105, + "grad_norm": 0.7675313949584961, + "learning_rate": 1.2623965576428781e-05, + "loss": 0.3276, + "step": 18640 + }, + { + "epoch": 0.4155947403322306, + "grad_norm": 0.6587178707122803, + "learning_rate": 1.2620586783327392e-05, + "loss": 0.1957, + "step": 18645 + }, + { + "epoch": 0.41570618971285067, + "grad_norm": 0.7516689896583557, + "learning_rate": 1.2617207668962287e-05, + "loss": 0.2169, + "step": 18650 + }, + { + "epoch": 0.41581763909347075, + "grad_norm": 0.6809471249580383, + "learning_rate": 1.2613828233747726e-05, + "loss": 0.3006, + "step": 18655 + }, + { + "epoch": 0.4159290884740908, + "grad_norm": 0.3378174304962158, + "learning_rate": 1.2610448478098003e-05, + "loss": 0.3007, + "step": 18660 + }, + { + "epoch": 0.41604053785471085, + "grad_norm": 0.40882351994514465, + "learning_rate": 1.2607068402427444e-05, + "loss": 0.2965, + "step": 18665 + }, + { + "epoch": 0.4161519872353309, + "grad_norm": 0.6255966424942017, + "learning_rate": 1.2603688007150418e-05, + "loss": 0.404, + "step": 18670 + }, + { + "epoch": 0.416263436615951, + "grad_norm": 0.6284846067428589, + "learning_rate": 1.2600307292681345e-05, + "loss": 0.4282, + "step": 18675 + }, + { + "epoch": 0.4163748859965711, + "grad_norm": 0.536528468132019, + "learning_rate": 1.2596926259434668e-05, + "loss": 0.3725, + "step": 18680 + }, + { + "epoch": 0.41648633537719115, + "grad_norm": 0.39987242221832275, + "learning_rate": 1.2593544907824872e-05, + "loss": 0.2594, + "step": 18685 + }, + { + "epoch": 0.41659778475781123, + "grad_norm": 0.5609185695648193, + "learning_rate": 1.2590163238266494e-05, + "loss": 0.2777, + "step": 18690 + }, + { + "epoch": 0.41670923413843125, + "grad_norm": 0.6560683846473694, + "learning_rate": 1.2586781251174098e-05, + "loss": 0.3646, + "step": 18695 + }, + { + "epoch": 0.4168206835190513, + "grad_norm": 0.781556248664856, + "learning_rate": 1.2583398946962284e-05, + "loss": 0.2337, + "step": 18700 + }, + { + "epoch": 0.4169321328996714, + "grad_norm": 0.5400981903076172, + "learning_rate": 1.25800163260457e-05, + "loss": 0.4057, + "step": 18705 + }, + { + "epoch": 0.4170435822802915, + "grad_norm": 0.6012236475944519, + "learning_rate": 1.2576633388839032e-05, + "loss": 0.2896, + "step": 18710 + }, + { + "epoch": 0.41715503166091156, + "grad_norm": 0.4528880715370178, + "learning_rate": 1.2573250135756996e-05, + "loss": 0.4724, + "step": 18715 + }, + { + "epoch": 0.41726648104153163, + "grad_norm": 0.5520021319389343, + "learning_rate": 1.2569866567214354e-05, + "loss": 0.3262, + "step": 18720 + }, + { + "epoch": 0.41737793042215166, + "grad_norm": 0.5487710237503052, + "learning_rate": 1.2566482683625914e-05, + "loss": 0.3624, + "step": 18725 + }, + { + "epoch": 0.41748937980277173, + "grad_norm": 0.7504733204841614, + "learning_rate": 1.2563098485406502e-05, + "loss": 0.259, + "step": 18730 + }, + { + "epoch": 0.4176008291833918, + "grad_norm": 0.5932433605194092, + "learning_rate": 1.2559713972971002e-05, + "loss": 0.3458, + "step": 18735 + }, + { + "epoch": 0.4177122785640119, + "grad_norm": 0.6621396541595459, + "learning_rate": 1.2556329146734326e-05, + "loss": 0.313, + "step": 18740 + }, + { + "epoch": 0.41782372794463196, + "grad_norm": 0.6199692487716675, + "learning_rate": 1.2552944007111424e-05, + "loss": 0.2457, + "step": 18745 + }, + { + "epoch": 0.41793517732525204, + "grad_norm": 0.5559074282646179, + "learning_rate": 1.2549558554517298e-05, + "loss": 0.3902, + "step": 18750 + }, + { + "epoch": 0.41804662670587206, + "grad_norm": 0.3973815143108368, + "learning_rate": 1.2546172789366973e-05, + "loss": 0.443, + "step": 18755 + }, + { + "epoch": 0.41815807608649214, + "grad_norm": 0.3972376585006714, + "learning_rate": 1.2542786712075516e-05, + "loss": 0.3741, + "step": 18760 + }, + { + "epoch": 0.4182695254671122, + "grad_norm": 0.6444116234779358, + "learning_rate": 1.2539400323058035e-05, + "loss": 0.3151, + "step": 18765 + }, + { + "epoch": 0.4183809748477323, + "grad_norm": 0.6359716057777405, + "learning_rate": 1.2536013622729677e-05, + "loss": 0.4012, + "step": 18770 + }, + { + "epoch": 0.41849242422835237, + "grad_norm": 0.7078494429588318, + "learning_rate": 1.2532626611505623e-05, + "loss": 0.3266, + "step": 18775 + }, + { + "epoch": 0.41860387360897244, + "grad_norm": 0.5959888696670532, + "learning_rate": 1.2529239289801093e-05, + "loss": 0.3894, + "step": 18780 + }, + { + "epoch": 0.41871532298959246, + "grad_norm": 0.6465641260147095, + "learning_rate": 1.252585165803135e-05, + "loss": 0.338, + "step": 18785 + }, + { + "epoch": 0.41882677237021254, + "grad_norm": 0.40250465273857117, + "learning_rate": 1.2522463716611693e-05, + "loss": 0.304, + "step": 18790 + }, + { + "epoch": 0.4189382217508326, + "grad_norm": 0.5716389417648315, + "learning_rate": 1.2519075465957451e-05, + "loss": 0.2614, + "step": 18795 + }, + { + "epoch": 0.4190496711314527, + "grad_norm": 0.7511163353919983, + "learning_rate": 1.2515686906484006e-05, + "loss": 0.3442, + "step": 18800 + }, + { + "epoch": 0.41916112051207277, + "grad_norm": 0.5761300921440125, + "learning_rate": 1.2512298038606759e-05, + "loss": 0.3672, + "step": 18805 + }, + { + "epoch": 0.41927256989269285, + "grad_norm": 0.585167407989502, + "learning_rate": 1.2508908862741166e-05, + "loss": 0.3066, + "step": 18810 + }, + { + "epoch": 0.41938401927331287, + "grad_norm": 0.5810245871543884, + "learning_rate": 1.2505519379302715e-05, + "loss": 0.3823, + "step": 18815 + }, + { + "epoch": 0.41949546865393295, + "grad_norm": 0.627994179725647, + "learning_rate": 1.2502129588706926e-05, + "loss": 0.3069, + "step": 18820 + }, + { + "epoch": 0.419606918034553, + "grad_norm": 0.45014336705207825, + "learning_rate": 1.2498739491369362e-05, + "loss": 0.2936, + "step": 18825 + }, + { + "epoch": 0.4197183674151731, + "grad_norm": 0.56611567735672, + "learning_rate": 1.2495349087705625e-05, + "loss": 0.427, + "step": 18830 + }, + { + "epoch": 0.4198298167957932, + "grad_norm": 0.6289352774620056, + "learning_rate": 1.249195837813135e-05, + "loss": 0.2673, + "step": 18835 + }, + { + "epoch": 0.4199412661764132, + "grad_norm": 0.5408057570457458, + "learning_rate": 1.2488567363062213e-05, + "loss": 0.3397, + "step": 18840 + }, + { + "epoch": 0.4200527155570333, + "grad_norm": 0.6514496207237244, + "learning_rate": 1.2485176042913926e-05, + "loss": 0.2976, + "step": 18845 + }, + { + "epoch": 0.42016416493765335, + "grad_norm": 0.903830885887146, + "learning_rate": 1.248178441810224e-05, + "loss": 0.3012, + "step": 18850 + }, + { + "epoch": 0.4202756143182734, + "grad_norm": 0.6694687008857727, + "learning_rate": 1.247839248904294e-05, + "loss": 0.2519, + "step": 18855 + }, + { + "epoch": 0.4203870636988935, + "grad_norm": 0.7829026579856873, + "learning_rate": 1.2475000256151852e-05, + "loss": 0.335, + "step": 18860 + }, + { + "epoch": 0.4204985130795136, + "grad_norm": 0.7483092546463013, + "learning_rate": 1.2471607719844833e-05, + "loss": 0.3928, + "step": 18865 + }, + { + "epoch": 0.4206099624601336, + "grad_norm": 0.5407531261444092, + "learning_rate": 1.2468214880537788e-05, + "loss": 0.333, + "step": 18870 + }, + { + "epoch": 0.4207214118407537, + "grad_norm": 0.45422080159187317, + "learning_rate": 1.2464821738646652e-05, + "loss": 0.356, + "step": 18875 + }, + { + "epoch": 0.42083286122137376, + "grad_norm": 0.6194318532943726, + "learning_rate": 1.2461428294587394e-05, + "loss": 0.2601, + "step": 18880 + }, + { + "epoch": 0.42094431060199383, + "grad_norm": 0.4982631802558899, + "learning_rate": 1.2458034548776026e-05, + "loss": 0.3319, + "step": 18885 + }, + { + "epoch": 0.4210557599826139, + "grad_norm": 0.5249817967414856, + "learning_rate": 1.2454640501628599e-05, + "loss": 0.3689, + "step": 18890 + }, + { + "epoch": 0.421167209363234, + "grad_norm": 0.7947955131530762, + "learning_rate": 1.2451246153561191e-05, + "loss": 0.3169, + "step": 18895 + }, + { + "epoch": 0.421278658743854, + "grad_norm": 0.43422529101371765, + "learning_rate": 1.2447851504989922e-05, + "loss": 0.3823, + "step": 18900 + }, + { + "epoch": 0.4213901081244741, + "grad_norm": 0.5541136264801025, + "learning_rate": 1.2444456556330957e-05, + "loss": 0.2701, + "step": 18905 + }, + { + "epoch": 0.42150155750509416, + "grad_norm": 0.6233346462249756, + "learning_rate": 1.2441061308000489e-05, + "loss": 0.4002, + "step": 18910 + }, + { + "epoch": 0.42161300688571424, + "grad_norm": 0.6174611449241638, + "learning_rate": 1.2437665760414742e-05, + "loss": 0.3123, + "step": 18915 + }, + { + "epoch": 0.4217244562663343, + "grad_norm": 0.7869670391082764, + "learning_rate": 1.243426991398999e-05, + "loss": 0.2451, + "step": 18920 + }, + { + "epoch": 0.4218359056469544, + "grad_norm": 0.6006391644477844, + "learning_rate": 1.2430873769142538e-05, + "loss": 0.3733, + "step": 18925 + }, + { + "epoch": 0.4219473550275744, + "grad_norm": 0.38537976145744324, + "learning_rate": 1.2427477326288722e-05, + "loss": 0.4187, + "step": 18930 + }, + { + "epoch": 0.4220588044081945, + "grad_norm": 0.40693843364715576, + "learning_rate": 1.2424080585844924e-05, + "loss": 0.345, + "step": 18935 + }, + { + "epoch": 0.42217025378881456, + "grad_norm": 0.6330365538597107, + "learning_rate": 1.2420683548227558e-05, + "loss": 0.3277, + "step": 18940 + }, + { + "epoch": 0.42228170316943464, + "grad_norm": 0.9879538416862488, + "learning_rate": 1.2417286213853075e-05, + "loss": 0.3467, + "step": 18945 + }, + { + "epoch": 0.4223931525500547, + "grad_norm": 0.7751918435096741, + "learning_rate": 1.241388858313796e-05, + "loss": 0.2522, + "step": 18950 + }, + { + "epoch": 0.4225046019306748, + "grad_norm": 0.6003888249397278, + "learning_rate": 1.2410490656498736e-05, + "loss": 0.4377, + "step": 18955 + }, + { + "epoch": 0.4226160513112948, + "grad_norm": 0.671985924243927, + "learning_rate": 1.2407092434351965e-05, + "loss": 0.3896, + "step": 18960 + }, + { + "epoch": 0.4227275006919149, + "grad_norm": 0.5274528861045837, + "learning_rate": 1.240369391711424e-05, + "loss": 0.3564, + "step": 18965 + }, + { + "epoch": 0.42283895007253497, + "grad_norm": 0.5010458827018738, + "learning_rate": 1.2400295105202194e-05, + "loss": 0.2859, + "step": 18970 + }, + { + "epoch": 0.42295039945315505, + "grad_norm": 0.5250893235206604, + "learning_rate": 1.23968959990325e-05, + "loss": 0.2515, + "step": 18975 + }, + { + "epoch": 0.4230618488337751, + "grad_norm": 0.8268576860427856, + "learning_rate": 1.2393496599021853e-05, + "loss": 0.3041, + "step": 18980 + }, + { + "epoch": 0.4231732982143952, + "grad_norm": 0.7334866523742676, + "learning_rate": 1.2390096905586997e-05, + "loss": 0.4155, + "step": 18985 + }, + { + "epoch": 0.4232847475950152, + "grad_norm": 0.6723551154136658, + "learning_rate": 1.2386696919144715e-05, + "loss": 0.2755, + "step": 18990 + }, + { + "epoch": 0.4233961969756353, + "grad_norm": 0.8720344305038452, + "learning_rate": 1.2383296640111805e-05, + "loss": 0.2926, + "step": 18995 + }, + { + "epoch": 0.4235076463562554, + "grad_norm": 0.4712897837162018, + "learning_rate": 1.2379896068905131e-05, + "loss": 0.312, + "step": 19000 + }, + { + "epoch": 0.42361909573687545, + "grad_norm": 0.6474051475524902, + "learning_rate": 1.2376495205941566e-05, + "loss": 0.2337, + "step": 19005 + }, + { + "epoch": 0.4237305451174955, + "grad_norm": 0.6197468042373657, + "learning_rate": 1.237309405163803e-05, + "loss": 0.4293, + "step": 19010 + }, + { + "epoch": 0.4238419944981156, + "grad_norm": 0.446269690990448, + "learning_rate": 1.2369692606411486e-05, + "loss": 0.2745, + "step": 19015 + }, + { + "epoch": 0.4239534438787356, + "grad_norm": 0.5081597566604614, + "learning_rate": 1.2366290870678914e-05, + "loss": 0.2961, + "step": 19020 + }, + { + "epoch": 0.4240648932593557, + "grad_norm": 0.5012861490249634, + "learning_rate": 1.2362888844857349e-05, + "loss": 0.2736, + "step": 19025 + }, + { + "epoch": 0.4241763426399758, + "grad_norm": 0.6631560921669006, + "learning_rate": 1.2359486529363851e-05, + "loss": 0.3143, + "step": 19030 + }, + { + "epoch": 0.42428779202059586, + "grad_norm": 1.1381235122680664, + "learning_rate": 1.2356083924615516e-05, + "loss": 0.5141, + "step": 19035 + }, + { + "epoch": 0.42439924140121593, + "grad_norm": 0.6382589340209961, + "learning_rate": 1.2352681031029476e-05, + "loss": 0.2542, + "step": 19040 + }, + { + "epoch": 0.42451069078183595, + "grad_norm": 0.7608498930931091, + "learning_rate": 1.2349277849022905e-05, + "loss": 0.33, + "step": 19045 + }, + { + "epoch": 0.42462214016245603, + "grad_norm": 0.4310937523841858, + "learning_rate": 1.2345874379013004e-05, + "loss": 0.2368, + "step": 19050 + }, + { + "epoch": 0.4247335895430761, + "grad_norm": 0.7651890516281128, + "learning_rate": 1.2342470621417006e-05, + "loss": 0.2911, + "step": 19055 + }, + { + "epoch": 0.4248450389236962, + "grad_norm": 0.7039048075675964, + "learning_rate": 1.2339066576652194e-05, + "loss": 0.2134, + "step": 19060 + }, + { + "epoch": 0.42495648830431626, + "grad_norm": 0.6038171052932739, + "learning_rate": 1.2335662245135877e-05, + "loss": 0.1983, + "step": 19065 + }, + { + "epoch": 0.42506793768493634, + "grad_norm": 0.8028332591056824, + "learning_rate": 1.2332257627285395e-05, + "loss": 0.3403, + "step": 19070 + }, + { + "epoch": 0.42517938706555636, + "grad_norm": 0.752396821975708, + "learning_rate": 1.2328852723518127e-05, + "loss": 0.3764, + "step": 19075 + }, + { + "epoch": 0.42529083644617643, + "grad_norm": 0.598617434501648, + "learning_rate": 1.2325447534251497e-05, + "loss": 0.3535, + "step": 19080 + }, + { + "epoch": 0.4254022858267965, + "grad_norm": 0.9264512658119202, + "learning_rate": 1.2322042059902946e-05, + "loss": 0.3442, + "step": 19085 + }, + { + "epoch": 0.4255137352074166, + "grad_norm": 0.6805040240287781, + "learning_rate": 1.2318636300889963e-05, + "loss": 0.3218, + "step": 19090 + }, + { + "epoch": 0.42562518458803666, + "grad_norm": 0.5993042588233948, + "learning_rate": 1.2315230257630066e-05, + "loss": 0.378, + "step": 19095 + }, + { + "epoch": 0.42573663396865674, + "grad_norm": 0.49872323870658875, + "learning_rate": 1.231182393054081e-05, + "loss": 0.3688, + "step": 19100 + }, + { + "epoch": 0.42584808334927676, + "grad_norm": 0.6361995935440063, + "learning_rate": 1.2308417320039785e-05, + "loss": 0.3241, + "step": 19105 + }, + { + "epoch": 0.42595953272989684, + "grad_norm": 0.6568997502326965, + "learning_rate": 1.2305010426544615e-05, + "loss": 0.3265, + "step": 19110 + }, + { + "epoch": 0.4260709821105169, + "grad_norm": 0.5412352085113525, + "learning_rate": 1.2301603250472958e-05, + "loss": 0.344, + "step": 19115 + }, + { + "epoch": 0.426182431491137, + "grad_norm": 0.47443583607673645, + "learning_rate": 1.229819579224251e-05, + "loss": 0.2534, + "step": 19120 + }, + { + "epoch": 0.42629388087175707, + "grad_norm": 0.6884459853172302, + "learning_rate": 1.2294788052270996e-05, + "loss": 0.2869, + "step": 19125 + }, + { + "epoch": 0.42640533025237715, + "grad_norm": 0.5194846391677856, + "learning_rate": 1.2291380030976177e-05, + "loss": 0.3046, + "step": 19130 + }, + { + "epoch": 0.42651677963299717, + "grad_norm": 0.8681108951568604, + "learning_rate": 1.2287971728775856e-05, + "loss": 0.4133, + "step": 19135 + }, + { + "epoch": 0.42662822901361724, + "grad_norm": 0.49403271079063416, + "learning_rate": 1.2284563146087862e-05, + "loss": 0.3009, + "step": 19140 + }, + { + "epoch": 0.4267396783942373, + "grad_norm": 0.6868266463279724, + "learning_rate": 1.2281154283330059e-05, + "loss": 0.2997, + "step": 19145 + }, + { + "epoch": 0.4268511277748574, + "grad_norm": 0.7888140082359314, + "learning_rate": 1.2277745140920347e-05, + "loss": 0.3482, + "step": 19150 + }, + { + "epoch": 0.4269625771554775, + "grad_norm": 0.8175737857818604, + "learning_rate": 1.2274335719276666e-05, + "loss": 0.376, + "step": 19155 + }, + { + "epoch": 0.42707402653609755, + "grad_norm": 0.3791024088859558, + "learning_rate": 1.2270926018816978e-05, + "loss": 0.3969, + "step": 19160 + }, + { + "epoch": 0.42718547591671757, + "grad_norm": 0.528655469417572, + "learning_rate": 1.2267516039959289e-05, + "loss": 0.4267, + "step": 19165 + }, + { + "epoch": 0.42729692529733765, + "grad_norm": 0.54219651222229, + "learning_rate": 1.2264105783121639e-05, + "loss": 0.2982, + "step": 19170 + }, + { + "epoch": 0.4274083746779577, + "grad_norm": 0.6535061001777649, + "learning_rate": 1.2260695248722096e-05, + "loss": 0.238, + "step": 19175 + }, + { + "epoch": 0.4275198240585778, + "grad_norm": 0.4789026081562042, + "learning_rate": 1.2257284437178761e-05, + "loss": 0.3183, + "step": 19180 + }, + { + "epoch": 0.4276312734391979, + "grad_norm": 0.5575686097145081, + "learning_rate": 1.2253873348909782e-05, + "loss": 0.4224, + "step": 19185 + }, + { + "epoch": 0.42774272281981796, + "grad_norm": 0.6034550666809082, + "learning_rate": 1.2250461984333326e-05, + "loss": 0.4497, + "step": 19190 + }, + { + "epoch": 0.427854172200438, + "grad_norm": 0.7348048686981201, + "learning_rate": 1.22470503438676e-05, + "loss": 0.2831, + "step": 19195 + }, + { + "epoch": 0.42796562158105805, + "grad_norm": 0.5611327886581421, + "learning_rate": 1.2243638427930848e-05, + "loss": 0.3723, + "step": 19200 + }, + { + "epoch": 0.42807707096167813, + "grad_norm": 0.6347730159759521, + "learning_rate": 1.2240226236941344e-05, + "loss": 0.2247, + "step": 19205 + }, + { + "epoch": 0.4281885203422982, + "grad_norm": 0.7379160523414612, + "learning_rate": 1.2236813771317392e-05, + "loss": 0.4229, + "step": 19210 + }, + { + "epoch": 0.4282999697229183, + "grad_norm": 0.4423249661922455, + "learning_rate": 1.2233401031477342e-05, + "loss": 0.299, + "step": 19215 + }, + { + "epoch": 0.4284114191035383, + "grad_norm": 0.7548907995223999, + "learning_rate": 1.2229988017839563e-05, + "loss": 0.3763, + "step": 19220 + }, + { + "epoch": 0.4285228684841584, + "grad_norm": 0.5771735310554504, + "learning_rate": 1.2226574730822463e-05, + "loss": 0.4532, + "step": 19225 + }, + { + "epoch": 0.42863431786477846, + "grad_norm": 0.6492488384246826, + "learning_rate": 1.222316117084449e-05, + "loss": 0.2664, + "step": 19230 + }, + { + "epoch": 0.42874576724539853, + "grad_norm": 0.7255642414093018, + "learning_rate": 1.2219747338324117e-05, + "loss": 0.2886, + "step": 19235 + }, + { + "epoch": 0.4288572166260186, + "grad_norm": 0.548874020576477, + "learning_rate": 1.2216333233679853e-05, + "loss": 0.2665, + "step": 19240 + }, + { + "epoch": 0.4289686660066387, + "grad_norm": 0.7727565765380859, + "learning_rate": 1.2212918857330244e-05, + "loss": 0.3573, + "step": 19245 + }, + { + "epoch": 0.4290801153872587, + "grad_norm": 0.5158136487007141, + "learning_rate": 1.2209504209693863e-05, + "loss": 0.2059, + "step": 19250 + }, + { + "epoch": 0.4291915647678788, + "grad_norm": 0.6173778176307678, + "learning_rate": 1.2206089291189322e-05, + "loss": 0.3144, + "step": 19255 + }, + { + "epoch": 0.42930301414849886, + "grad_norm": 0.5652750134468079, + "learning_rate": 1.2202674102235264e-05, + "loss": 0.3406, + "step": 19260 + }, + { + "epoch": 0.42941446352911894, + "grad_norm": 0.4517764151096344, + "learning_rate": 1.219925864325036e-05, + "loss": 0.3573, + "step": 19265 + }, + { + "epoch": 0.429525912909739, + "grad_norm": 0.47026604413986206, + "learning_rate": 1.2195842914653321e-05, + "loss": 0.2306, + "step": 19270 + }, + { + "epoch": 0.4296373622903591, + "grad_norm": 0.6984983682632446, + "learning_rate": 1.2192426916862892e-05, + "loss": 0.3728, + "step": 19275 + }, + { + "epoch": 0.4297488116709791, + "grad_norm": 0.5496819615364075, + "learning_rate": 1.2189010650297848e-05, + "loss": 0.361, + "step": 19280 + }, + { + "epoch": 0.4298602610515992, + "grad_norm": 0.8521873950958252, + "learning_rate": 1.2185594115376991e-05, + "loss": 0.3353, + "step": 19285 + }, + { + "epoch": 0.42997171043221927, + "grad_norm": 0.6980066895484924, + "learning_rate": 1.2182177312519166e-05, + "loss": 0.283, + "step": 19290 + }, + { + "epoch": 0.43008315981283934, + "grad_norm": 0.6652234792709351, + "learning_rate": 1.2178760242143248e-05, + "loss": 0.315, + "step": 19295 + }, + { + "epoch": 0.4301946091934594, + "grad_norm": 0.6429911255836487, + "learning_rate": 1.2175342904668139e-05, + "loss": 0.4032, + "step": 19300 + }, + { + "epoch": 0.4303060585740795, + "grad_norm": 0.35845664143562317, + "learning_rate": 1.2171925300512783e-05, + "loss": 0.24, + "step": 19305 + }, + { + "epoch": 0.4304175079546995, + "grad_norm": 0.6370695233345032, + "learning_rate": 1.2168507430096152e-05, + "loss": 0.3106, + "step": 19310 + }, + { + "epoch": 0.4305289573353196, + "grad_norm": 0.44975778460502625, + "learning_rate": 1.2165089293837245e-05, + "loss": 0.3312, + "step": 19315 + }, + { + "epoch": 0.43064040671593967, + "grad_norm": 0.7515400052070618, + "learning_rate": 1.2161670892155106e-05, + "loss": 0.2437, + "step": 19320 + }, + { + "epoch": 0.43075185609655975, + "grad_norm": 0.34712356328964233, + "learning_rate": 1.2158252225468799e-05, + "loss": 0.3372, + "step": 19325 + }, + { + "epoch": 0.4308633054771798, + "grad_norm": 0.6354972720146179, + "learning_rate": 1.2154833294197427e-05, + "loss": 0.3335, + "step": 19330 + }, + { + "epoch": 0.4309747548577999, + "grad_norm": 0.6496874094009399, + "learning_rate": 1.2151414098760129e-05, + "loss": 0.3384, + "step": 19335 + }, + { + "epoch": 0.4310862042384199, + "grad_norm": 0.6100443601608276, + "learning_rate": 1.2147994639576064e-05, + "loss": 0.364, + "step": 19340 + }, + { + "epoch": 0.43119765361904, + "grad_norm": 1.147022008895874, + "learning_rate": 1.214457491706444e-05, + "loss": 0.3455, + "step": 19345 + }, + { + "epoch": 0.4313091029996601, + "grad_norm": 0.4176161587238312, + "learning_rate": 1.2141154931644484e-05, + "loss": 0.2617, + "step": 19350 + }, + { + "epoch": 0.43142055238028015, + "grad_norm": 1.2032032012939453, + "learning_rate": 1.2137734683735463e-05, + "loss": 0.4088, + "step": 19355 + }, + { + "epoch": 0.43153200176090023, + "grad_norm": 0.6207007765769958, + "learning_rate": 1.2134314173756664e-05, + "loss": 0.3417, + "step": 19360 + }, + { + "epoch": 0.4316434511415203, + "grad_norm": 0.6498280167579651, + "learning_rate": 1.2130893402127427e-05, + "loss": 0.2873, + "step": 19365 + }, + { + "epoch": 0.43175490052214033, + "grad_norm": 0.5154594779014587, + "learning_rate": 1.2127472369267105e-05, + "loss": 0.3284, + "step": 19370 + }, + { + "epoch": 0.4318663499027604, + "grad_norm": 0.5628852844238281, + "learning_rate": 1.2124051075595094e-05, + "loss": 0.3134, + "step": 19375 + }, + { + "epoch": 0.4319777992833805, + "grad_norm": 0.6029967069625854, + "learning_rate": 1.2120629521530813e-05, + "loss": 0.4147, + "step": 19380 + }, + { + "epoch": 0.43208924866400056, + "grad_norm": 0.36349472403526306, + "learning_rate": 1.2117207707493726e-05, + "loss": 0.2858, + "step": 19385 + }, + { + "epoch": 0.43220069804462063, + "grad_norm": 0.5155287384986877, + "learning_rate": 1.2113785633903315e-05, + "loss": 0.3876, + "step": 19390 + }, + { + "epoch": 0.4323121474252407, + "grad_norm": 0.5897233486175537, + "learning_rate": 1.2110363301179096e-05, + "loss": 0.2948, + "step": 19395 + }, + { + "epoch": 0.43242359680586073, + "grad_norm": 0.6986548900604248, + "learning_rate": 1.2106940709740631e-05, + "loss": 0.4823, + "step": 19400 + }, + { + "epoch": 0.4325350461864808, + "grad_norm": 0.6543066501617432, + "learning_rate": 1.2103517860007498e-05, + "loss": 0.3273, + "step": 19405 + }, + { + "epoch": 0.4326464955671009, + "grad_norm": 0.5895144939422607, + "learning_rate": 1.2100094752399307e-05, + "loss": 0.3373, + "step": 19410 + }, + { + "epoch": 0.43275794494772096, + "grad_norm": 0.8023034334182739, + "learning_rate": 1.2096671387335717e-05, + "loss": 0.4212, + "step": 19415 + }, + { + "epoch": 0.43286939432834104, + "grad_norm": 0.4318520426750183, + "learning_rate": 1.2093247765236395e-05, + "loss": 0.3291, + "step": 19420 + }, + { + "epoch": 0.43298084370896106, + "grad_norm": 0.7532975673675537, + "learning_rate": 1.2089823886521054e-05, + "loss": 0.3317, + "step": 19425 + }, + { + "epoch": 0.43309229308958114, + "grad_norm": 0.7808141708374023, + "learning_rate": 1.2086399751609435e-05, + "loss": 0.4682, + "step": 19430 + }, + { + "epoch": 0.4332037424702012, + "grad_norm": 0.43736621737480164, + "learning_rate": 1.2082975360921314e-05, + "loss": 0.2712, + "step": 19435 + }, + { + "epoch": 0.4333151918508213, + "grad_norm": 0.3133240342140198, + "learning_rate": 1.2079550714876489e-05, + "loss": 0.2126, + "step": 19440 + }, + { + "epoch": 0.43342664123144137, + "grad_norm": 0.6244284510612488, + "learning_rate": 1.2076125813894797e-05, + "loss": 0.4448, + "step": 19445 + }, + { + "epoch": 0.43353809061206144, + "grad_norm": 0.7542661428451538, + "learning_rate": 1.207270065839611e-05, + "loss": 0.2544, + "step": 19450 + }, + { + "epoch": 0.43364953999268147, + "grad_norm": 0.5900478959083557, + "learning_rate": 1.2069275248800315e-05, + "loss": 0.2778, + "step": 19455 + }, + { + "epoch": 0.43376098937330154, + "grad_norm": 0.7354373335838318, + "learning_rate": 1.2065849585527354e-05, + "loss": 0.2657, + "step": 19460 + }, + { + "epoch": 0.4338724387539216, + "grad_norm": 0.7068347334861755, + "learning_rate": 1.2062423668997174e-05, + "loss": 0.4206, + "step": 19465 + }, + { + "epoch": 0.4339838881345417, + "grad_norm": 0.6501403450965881, + "learning_rate": 1.2058997499629775e-05, + "loss": 0.2959, + "step": 19470 + }, + { + "epoch": 0.43409533751516177, + "grad_norm": 0.6633725166320801, + "learning_rate": 1.2055571077845175e-05, + "loss": 0.1783, + "step": 19475 + }, + { + "epoch": 0.43420678689578185, + "grad_norm": 0.4296213388442993, + "learning_rate": 1.2052144404063423e-05, + "loss": 0.2178, + "step": 19480 + }, + { + "epoch": 0.43431823627640187, + "grad_norm": 0.6821916103363037, + "learning_rate": 1.2048717478704614e-05, + "loss": 0.3981, + "step": 19485 + }, + { + "epoch": 0.43442968565702195, + "grad_norm": 0.5226448774337769, + "learning_rate": 1.2045290302188852e-05, + "loss": 0.2762, + "step": 19490 + }, + { + "epoch": 0.434541135037642, + "grad_norm": 0.6372525095939636, + "learning_rate": 1.2041862874936288e-05, + "loss": 0.3254, + "step": 19495 + }, + { + "epoch": 0.4346525844182621, + "grad_norm": 0.6730618476867676, + "learning_rate": 1.2038435197367093e-05, + "loss": 0.3431, + "step": 19500 + }, + { + "epoch": 0.4347640337988822, + "grad_norm": 0.6753670573234558, + "learning_rate": 1.203500726990148e-05, + "loss": 0.3882, + "step": 19505 + }, + { + "epoch": 0.43487548317950225, + "grad_norm": 0.5922030806541443, + "learning_rate": 1.2031579092959685e-05, + "loss": 0.4237, + "step": 19510 + }, + { + "epoch": 0.4349869325601223, + "grad_norm": 0.7718773484230042, + "learning_rate": 1.2028150666961968e-05, + "loss": 0.2151, + "step": 19515 + }, + { + "epoch": 0.43509838194074235, + "grad_norm": 0.8648079633712769, + "learning_rate": 1.2024721992328638e-05, + "loss": 0.3575, + "step": 19520 + }, + { + "epoch": 0.43520983132136243, + "grad_norm": 0.6470637321472168, + "learning_rate": 1.2021293069480023e-05, + "loss": 0.3105, + "step": 19525 + }, + { + "epoch": 0.4353212807019825, + "grad_norm": 0.4397228956222534, + "learning_rate": 1.2017863898836479e-05, + "loss": 0.397, + "step": 19530 + }, + { + "epoch": 0.4354327300826026, + "grad_norm": 0.49957188963890076, + "learning_rate": 1.201443448081839e-05, + "loss": 0.2596, + "step": 19535 + }, + { + "epoch": 0.43554417946322266, + "grad_norm": 0.5893974900245667, + "learning_rate": 1.201100481584619e-05, + "loss": 0.3685, + "step": 19540 + }, + { + "epoch": 0.4356556288438427, + "grad_norm": 0.7479491233825684, + "learning_rate": 1.2007574904340322e-05, + "loss": 0.4322, + "step": 19545 + }, + { + "epoch": 0.43576707822446276, + "grad_norm": 0.8564417362213135, + "learning_rate": 1.200414474672126e-05, + "loss": 0.3602, + "step": 19550 + }, + { + "epoch": 0.43587852760508283, + "grad_norm": 0.6858411431312561, + "learning_rate": 1.200071434340953e-05, + "loss": 0.2759, + "step": 19555 + }, + { + "epoch": 0.4359899769857029, + "grad_norm": 0.9257897734642029, + "learning_rate": 1.1997283694825661e-05, + "loss": 0.3571, + "step": 19560 + }, + { + "epoch": 0.436101426366323, + "grad_norm": 0.7651895880699158, + "learning_rate": 1.1993852801390227e-05, + "loss": 0.3078, + "step": 19565 + }, + { + "epoch": 0.43621287574694306, + "grad_norm": 0.819974422454834, + "learning_rate": 1.1990421663523829e-05, + "loss": 0.4032, + "step": 19570 + }, + { + "epoch": 0.4363243251275631, + "grad_norm": 0.508603572845459, + "learning_rate": 1.1986990281647101e-05, + "loss": 0.3394, + "step": 19575 + }, + { + "epoch": 0.43643577450818316, + "grad_norm": 0.43608325719833374, + "learning_rate": 1.19835586561807e-05, + "loss": 0.3911, + "step": 19580 + }, + { + "epoch": 0.43654722388880324, + "grad_norm": 0.8884719610214233, + "learning_rate": 1.198012678754532e-05, + "loss": 0.4198, + "step": 19585 + }, + { + "epoch": 0.4366586732694233, + "grad_norm": 0.6997649073600769, + "learning_rate": 1.197669467616168e-05, + "loss": 0.299, + "step": 19590 + }, + { + "epoch": 0.4367701226500434, + "grad_norm": 0.5541196465492249, + "learning_rate": 1.1973262322450527e-05, + "loss": 0.3458, + "step": 19595 + }, + { + "epoch": 0.4368815720306634, + "grad_norm": 0.42259481549263, + "learning_rate": 1.196982972683265e-05, + "loss": 0.2937, + "step": 19600 + }, + { + "epoch": 0.4369930214112835, + "grad_norm": 0.5829482078552246, + "learning_rate": 1.1966396889728848e-05, + "loss": 0.2873, + "step": 19605 + }, + { + "epoch": 0.43710447079190357, + "grad_norm": 0.5691021680831909, + "learning_rate": 1.1962963811559969e-05, + "loss": 0.2757, + "step": 19610 + }, + { + "epoch": 0.43721592017252364, + "grad_norm": 0.412564754486084, + "learning_rate": 1.1959530492746879e-05, + "loss": 0.3327, + "step": 19615 + }, + { + "epoch": 0.4373273695531437, + "grad_norm": 0.46707937121391296, + "learning_rate": 1.1956096933710476e-05, + "loss": 0.2798, + "step": 19620 + }, + { + "epoch": 0.4374388189337638, + "grad_norm": 0.8313919305801392, + "learning_rate": 1.1952663134871681e-05, + "loss": 0.4392, + "step": 19625 + }, + { + "epoch": 0.4375502683143838, + "grad_norm": 0.7636642456054688, + "learning_rate": 1.1949229096651465e-05, + "loss": 0.3606, + "step": 19630 + }, + { + "epoch": 0.4376617176950039, + "grad_norm": 0.510886013507843, + "learning_rate": 1.1945794819470805e-05, + "loss": 0.2675, + "step": 19635 + }, + { + "epoch": 0.43777316707562397, + "grad_norm": 0.6083784103393555, + "learning_rate": 1.194236030375072e-05, + "loss": 0.3092, + "step": 19640 + }, + { + "epoch": 0.43788461645624405, + "grad_norm": 1.0390868186950684, + "learning_rate": 1.1938925549912255e-05, + "loss": 0.4237, + "step": 19645 + }, + { + "epoch": 0.4379960658368641, + "grad_norm": 0.44526880979537964, + "learning_rate": 1.1935490558376484e-05, + "loss": 0.3021, + "step": 19650 + }, + { + "epoch": 0.4381075152174842, + "grad_norm": 0.5297619700431824, + "learning_rate": 1.1932055329564507e-05, + "loss": 0.3156, + "step": 19655 + }, + { + "epoch": 0.4382189645981042, + "grad_norm": 0.7630996704101562, + "learning_rate": 1.1928619863897461e-05, + "loss": 0.2666, + "step": 19660 + }, + { + "epoch": 0.4383304139787243, + "grad_norm": 0.8829711079597473, + "learning_rate": 1.192518416179651e-05, + "loss": 0.3796, + "step": 19665 + }, + { + "epoch": 0.4384418633593444, + "grad_norm": 0.6145585775375366, + "learning_rate": 1.1921748223682837e-05, + "loss": 0.3302, + "step": 19670 + }, + { + "epoch": 0.43855331273996445, + "grad_norm": 0.5251651406288147, + "learning_rate": 1.1918312049977665e-05, + "loss": 0.3594, + "step": 19675 + }, + { + "epoch": 0.43866476212058453, + "grad_norm": 0.5702129602432251, + "learning_rate": 1.1914875641102246e-05, + "loss": 0.3224, + "step": 19680 + }, + { + "epoch": 0.4387762115012046, + "grad_norm": 0.5534759759902954, + "learning_rate": 1.1911438997477854e-05, + "loss": 0.3794, + "step": 19685 + }, + { + "epoch": 0.4388876608818246, + "grad_norm": 0.32037588953971863, + "learning_rate": 1.1908002119525792e-05, + "loss": 0.2029, + "step": 19690 + }, + { + "epoch": 0.4389991102624447, + "grad_norm": 0.6656951904296875, + "learning_rate": 1.1904565007667399e-05, + "loss": 0.3419, + "step": 19695 + }, + { + "epoch": 0.4391105596430648, + "grad_norm": 0.6678401827812195, + "learning_rate": 1.190112766232404e-05, + "loss": 0.387, + "step": 19700 + }, + { + "epoch": 0.43922200902368486, + "grad_norm": 0.4325374364852905, + "learning_rate": 1.1897690083917098e-05, + "loss": 0.325, + "step": 19705 + }, + { + "epoch": 0.43933345840430493, + "grad_norm": 0.49688783288002014, + "learning_rate": 1.1894252272868e-05, + "loss": 0.2698, + "step": 19710 + }, + { + "epoch": 0.439444907784925, + "grad_norm": 0.5682390928268433, + "learning_rate": 1.18908142295982e-05, + "loss": 0.2784, + "step": 19715 + }, + { + "epoch": 0.43955635716554503, + "grad_norm": 0.5705307722091675, + "learning_rate": 1.1887375954529167e-05, + "loss": 0.3702, + "step": 19720 + }, + { + "epoch": 0.4396678065461651, + "grad_norm": 0.6163852214813232, + "learning_rate": 1.1883937448082414e-05, + "loss": 0.2932, + "step": 19725 + }, + { + "epoch": 0.4397792559267852, + "grad_norm": 0.5091946721076965, + "learning_rate": 1.1880498710679466e-05, + "loss": 0.3564, + "step": 19730 + }, + { + "epoch": 0.43989070530740526, + "grad_norm": 0.7378758788108826, + "learning_rate": 1.1877059742741895e-05, + "loss": 0.2929, + "step": 19735 + }, + { + "epoch": 0.44000215468802534, + "grad_norm": 0.3604847192764282, + "learning_rate": 1.1873620544691288e-05, + "loss": 0.3412, + "step": 19740 + }, + { + "epoch": 0.4401136040686454, + "grad_norm": 0.5514371991157532, + "learning_rate": 1.1870181116949261e-05, + "loss": 0.3556, + "step": 19745 + }, + { + "epoch": 0.44022505344926544, + "grad_norm": 0.5001578330993652, + "learning_rate": 1.1866741459937467e-05, + "loss": 0.35, + "step": 19750 + }, + { + "epoch": 0.4403365028298855, + "grad_norm": 0.8715097308158875, + "learning_rate": 1.1863301574077581e-05, + "loss": 0.2616, + "step": 19755 + }, + { + "epoch": 0.4404479522105056, + "grad_norm": 0.7427131533622742, + "learning_rate": 1.1859861459791305e-05, + "loss": 0.4198, + "step": 19760 + }, + { + "epoch": 0.44055940159112567, + "grad_norm": 0.8536766767501831, + "learning_rate": 1.1856421117500364e-05, + "loss": 0.231, + "step": 19765 + }, + { + "epoch": 0.44067085097174574, + "grad_norm": 0.3912745416164398, + "learning_rate": 1.185298054762653e-05, + "loss": 0.1782, + "step": 19770 + }, + { + "epoch": 0.4407823003523658, + "grad_norm": 0.5229966640472412, + "learning_rate": 1.184953975059158e-05, + "loss": 0.2969, + "step": 19775 + }, + { + "epoch": 0.44089374973298584, + "grad_norm": 0.8363884687423706, + "learning_rate": 1.1846098726817332e-05, + "loss": 0.3296, + "step": 19780 + }, + { + "epoch": 0.4410051991136059, + "grad_norm": 0.5975040793418884, + "learning_rate": 1.1842657476725632e-05, + "loss": 0.2876, + "step": 19785 + }, + { + "epoch": 0.441116648494226, + "grad_norm": 0.42045867443084717, + "learning_rate": 1.1839216000738349e-05, + "loss": 0.3232, + "step": 19790 + }, + { + "epoch": 0.44122809787484607, + "grad_norm": 0.6494802832603455, + "learning_rate": 1.1835774299277379e-05, + "loss": 0.2857, + "step": 19795 + }, + { + "epoch": 0.44133954725546615, + "grad_norm": 0.631358802318573, + "learning_rate": 1.1832332372764649e-05, + "loss": 0.3664, + "step": 19800 + }, + { + "epoch": 0.44145099663608617, + "grad_norm": 0.5753572583198547, + "learning_rate": 1.1828890221622117e-05, + "loss": 0.4741, + "step": 19805 + }, + { + "epoch": 0.44156244601670624, + "grad_norm": 0.4872659146785736, + "learning_rate": 1.1825447846271758e-05, + "loss": 0.3375, + "step": 19810 + }, + { + "epoch": 0.4416738953973263, + "grad_norm": 0.5573753714561462, + "learning_rate": 1.1822005247135584e-05, + "loss": 0.4541, + "step": 19815 + }, + { + "epoch": 0.4417853447779464, + "grad_norm": 0.5151143074035645, + "learning_rate": 1.1818562424635631e-05, + "loss": 0.2904, + "step": 19820 + }, + { + "epoch": 0.4418967941585665, + "grad_norm": 0.6065669655799866, + "learning_rate": 1.1815119379193962e-05, + "loss": 0.3648, + "step": 19825 + }, + { + "epoch": 0.44200824353918655, + "grad_norm": 0.4966534972190857, + "learning_rate": 1.1811676111232668e-05, + "loss": 0.2602, + "step": 19830 + }, + { + "epoch": 0.4421196929198066, + "grad_norm": 0.37646156549453735, + "learning_rate": 1.1808232621173866e-05, + "loss": 0.327, + "step": 19835 + }, + { + "epoch": 0.44223114230042665, + "grad_norm": 0.6118890047073364, + "learning_rate": 1.1804788909439702e-05, + "loss": 0.3577, + "step": 19840 + }, + { + "epoch": 0.4423425916810467, + "grad_norm": 0.4959801137447357, + "learning_rate": 1.1801344976452348e-05, + "loss": 0.347, + "step": 19845 + }, + { + "epoch": 0.4424540410616668, + "grad_norm": 0.5995858311653137, + "learning_rate": 1.1797900822634007e-05, + "loss": 0.2956, + "step": 19850 + }, + { + "epoch": 0.4425654904422869, + "grad_norm": 0.3750613331794739, + "learning_rate": 1.17944564484069e-05, + "loss": 0.3752, + "step": 19855 + }, + { + "epoch": 0.44267693982290696, + "grad_norm": 0.6806448698043823, + "learning_rate": 1.1791011854193282e-05, + "loss": 0.33, + "step": 19860 + }, + { + "epoch": 0.442788389203527, + "grad_norm": 0.6760001182556152, + "learning_rate": 1.1787567040415437e-05, + "loss": 0.3706, + "step": 19865 + }, + { + "epoch": 0.44289983858414705, + "grad_norm": 0.39426669478416443, + "learning_rate": 1.178412200749567e-05, + "loss": 0.3819, + "step": 19870 + }, + { + "epoch": 0.44301128796476713, + "grad_norm": 0.44993576407432556, + "learning_rate": 1.1780676755856317e-05, + "loss": 0.3555, + "step": 19875 + }, + { + "epoch": 0.4431227373453872, + "grad_norm": 0.8157110810279846, + "learning_rate": 1.1777231285919742e-05, + "loss": 0.3558, + "step": 19880 + }, + { + "epoch": 0.4432341867260073, + "grad_norm": 0.5687642693519592, + "learning_rate": 1.1773785598108326e-05, + "loss": 0.241, + "step": 19885 + }, + { + "epoch": 0.44334563610662736, + "grad_norm": 0.7683583498001099, + "learning_rate": 1.1770339692844484e-05, + "loss": 0.2754, + "step": 19890 + }, + { + "epoch": 0.4434570854872474, + "grad_norm": 0.72896409034729, + "learning_rate": 1.1766893570550666e-05, + "loss": 0.3862, + "step": 19895 + }, + { + "epoch": 0.44356853486786746, + "grad_norm": 0.5326892733573914, + "learning_rate": 1.1763447231649332e-05, + "loss": 0.3575, + "step": 19900 + }, + { + "epoch": 0.44367998424848754, + "grad_norm": 0.5236831903457642, + "learning_rate": 1.1760000676562977e-05, + "loss": 0.199, + "step": 19905 + }, + { + "epoch": 0.4437914336291076, + "grad_norm": 0.6456315517425537, + "learning_rate": 1.175655390571413e-05, + "loss": 0.2572, + "step": 19910 + }, + { + "epoch": 0.4439028830097277, + "grad_norm": 0.6855353713035583, + "learning_rate": 1.175310691952533e-05, + "loss": 0.4176, + "step": 19915 + }, + { + "epoch": 0.44401433239034777, + "grad_norm": 0.9236936569213867, + "learning_rate": 1.1749659718419151e-05, + "loss": 0.4013, + "step": 19920 + }, + { + "epoch": 0.4441257817709678, + "grad_norm": 0.5553421378135681, + "learning_rate": 1.1746212302818196e-05, + "loss": 0.2543, + "step": 19925 + }, + { + "epoch": 0.44423723115158786, + "grad_norm": 0.7569783926010132, + "learning_rate": 1.1742764673145095e-05, + "loss": 0.396, + "step": 19930 + }, + { + "epoch": 0.44434868053220794, + "grad_norm": 0.5262734293937683, + "learning_rate": 1.1739316829822496e-05, + "loss": 0.2578, + "step": 19935 + }, + { + "epoch": 0.444460129912828, + "grad_norm": 0.37517091631889343, + "learning_rate": 1.1735868773273076e-05, + "loss": 0.3594, + "step": 19940 + }, + { + "epoch": 0.4445715792934481, + "grad_norm": 0.5958526730537415, + "learning_rate": 1.1732420503919547e-05, + "loss": 0.2276, + "step": 19945 + }, + { + "epoch": 0.44468302867406817, + "grad_norm": 0.8476828336715698, + "learning_rate": 1.1728972022184636e-05, + "loss": 0.3202, + "step": 19950 + }, + { + "epoch": 0.4447944780546882, + "grad_norm": 0.4996603727340698, + "learning_rate": 1.17255233284911e-05, + "loss": 0.2563, + "step": 19955 + }, + { + "epoch": 0.44490592743530827, + "grad_norm": 0.6085326671600342, + "learning_rate": 1.1722074423261723e-05, + "loss": 0.3926, + "step": 19960 + }, + { + "epoch": 0.44501737681592834, + "grad_norm": 1.2089165449142456, + "learning_rate": 1.1718625306919312e-05, + "loss": 0.3213, + "step": 19965 + }, + { + "epoch": 0.4451288261965484, + "grad_norm": 0.47199055552482605, + "learning_rate": 1.1715175979886708e-05, + "loss": 0.3736, + "step": 19970 + }, + { + "epoch": 0.4452402755771685, + "grad_norm": 0.9002397656440735, + "learning_rate": 1.1711726442586764e-05, + "loss": 0.2109, + "step": 19975 + }, + { + "epoch": 0.4453517249577885, + "grad_norm": 0.8258025646209717, + "learning_rate": 1.1708276695442371e-05, + "loss": 0.301, + "step": 19980 + }, + { + "epoch": 0.4454631743384086, + "grad_norm": 0.4305465817451477, + "learning_rate": 1.1704826738876445e-05, + "loss": 0.347, + "step": 19985 + }, + { + "epoch": 0.4455746237190287, + "grad_norm": 0.46003684401512146, + "learning_rate": 1.1701376573311918e-05, + "loss": 0.3968, + "step": 19990 + }, + { + "epoch": 0.44568607309964875, + "grad_norm": 0.6948609948158264, + "learning_rate": 1.1697926199171754e-05, + "loss": 0.3136, + "step": 19995 + }, + { + "epoch": 0.4457975224802688, + "grad_norm": 0.42964625358581543, + "learning_rate": 1.1694475616878947e-05, + "loss": 0.2609, + "step": 20000 + }, + { + "epoch": 0.4459089718608889, + "grad_norm": 0.6982073783874512, + "learning_rate": 1.1691024826856507e-05, + "loss": 0.361, + "step": 20005 + }, + { + "epoch": 0.4460204212415089, + "grad_norm": 0.5581613183021545, + "learning_rate": 1.1687573829527474e-05, + "loss": 0.3218, + "step": 20010 + }, + { + "epoch": 0.446131870622129, + "grad_norm": 0.6846082806587219, + "learning_rate": 1.1684122625314918e-05, + "loss": 0.3306, + "step": 20015 + }, + { + "epoch": 0.4462433200027491, + "grad_norm": 0.6660178303718567, + "learning_rate": 1.1680671214641927e-05, + "loss": 0.3782, + "step": 20020 + }, + { + "epoch": 0.44635476938336915, + "grad_norm": 0.47997379302978516, + "learning_rate": 1.1677219597931617e-05, + "loss": 0.2564, + "step": 20025 + }, + { + "epoch": 0.44646621876398923, + "grad_norm": 0.4481871724128723, + "learning_rate": 1.1673767775607133e-05, + "loss": 0.2694, + "step": 20030 + }, + { + "epoch": 0.4465776681446093, + "grad_norm": 0.6611993908882141, + "learning_rate": 1.1670315748091639e-05, + "loss": 0.3976, + "step": 20035 + }, + { + "epoch": 0.44668911752522933, + "grad_norm": 1.1613658666610718, + "learning_rate": 1.1666863515808323e-05, + "loss": 0.2544, + "step": 20040 + }, + { + "epoch": 0.4468005669058494, + "grad_norm": 0.5730067491531372, + "learning_rate": 1.1663411079180409e-05, + "loss": 0.3335, + "step": 20045 + }, + { + "epoch": 0.4469120162864695, + "grad_norm": 0.7078298330307007, + "learning_rate": 1.1659958438631138e-05, + "loss": 0.3033, + "step": 20050 + }, + { + "epoch": 0.44702346566708956, + "grad_norm": 0.7347967624664307, + "learning_rate": 1.1656505594583774e-05, + "loss": 0.3882, + "step": 20055 + }, + { + "epoch": 0.44713491504770964, + "grad_norm": 0.7464125752449036, + "learning_rate": 1.165305254746161e-05, + "loss": 0.3353, + "step": 20060 + }, + { + "epoch": 0.4472463644283297, + "grad_norm": 0.5959119200706482, + "learning_rate": 1.164959929768796e-05, + "loss": 0.3464, + "step": 20065 + }, + { + "epoch": 0.44735781380894973, + "grad_norm": 0.6540177464485168, + "learning_rate": 1.1646145845686175e-05, + "loss": 0.3472, + "step": 20070 + }, + { + "epoch": 0.4474692631895698, + "grad_norm": 0.5438899397850037, + "learning_rate": 1.1642692191879613e-05, + "loss": 0.3067, + "step": 20075 + }, + { + "epoch": 0.4475807125701899, + "grad_norm": 0.5280618071556091, + "learning_rate": 1.1639238336691666e-05, + "loss": 0.3269, + "step": 20080 + }, + { + "epoch": 0.44769216195080996, + "grad_norm": 0.548997700214386, + "learning_rate": 1.1635784280545755e-05, + "loss": 0.3355, + "step": 20085 + }, + { + "epoch": 0.44780361133143004, + "grad_norm": 0.5856815576553345, + "learning_rate": 1.1632330023865315e-05, + "loss": 0.2652, + "step": 20090 + }, + { + "epoch": 0.4479150607120501, + "grad_norm": 0.4746699035167694, + "learning_rate": 1.1628875567073816e-05, + "loss": 0.2406, + "step": 20095 + }, + { + "epoch": 0.44802651009267014, + "grad_norm": 0.5286392569541931, + "learning_rate": 1.1625420910594745e-05, + "loss": 0.3352, + "step": 20100 + }, + { + "epoch": 0.4481379594732902, + "grad_norm": 0.7887224555015564, + "learning_rate": 1.1621966054851614e-05, + "loss": 0.4244, + "step": 20105 + }, + { + "epoch": 0.4482494088539103, + "grad_norm": 0.6540002822875977, + "learning_rate": 1.1618511000267966e-05, + "loss": 0.3291, + "step": 20110 + }, + { + "epoch": 0.44836085823453037, + "grad_norm": 0.7543660998344421, + "learning_rate": 1.1615055747267366e-05, + "loss": 0.2833, + "step": 20115 + }, + { + "epoch": 0.44847230761515045, + "grad_norm": 0.5979798436164856, + "learning_rate": 1.1611600296273391e-05, + "loss": 0.3342, + "step": 20120 + }, + { + "epoch": 0.4485837569957705, + "grad_norm": 0.5423582196235657, + "learning_rate": 1.1608144647709664e-05, + "loss": 0.3795, + "step": 20125 + }, + { + "epoch": 0.44869520637639054, + "grad_norm": 0.8526514172554016, + "learning_rate": 1.1604688801999817e-05, + "loss": 0.4085, + "step": 20130 + }, + { + "epoch": 0.4488066557570106, + "grad_norm": 0.6163521409034729, + "learning_rate": 1.1601232759567504e-05, + "loss": 0.3199, + "step": 20135 + }, + { + "epoch": 0.4489181051376307, + "grad_norm": 0.7942532896995544, + "learning_rate": 1.159777652083642e-05, + "loss": 0.3802, + "step": 20140 + }, + { + "epoch": 0.4490295545182508, + "grad_norm": 0.46986180543899536, + "learning_rate": 1.1594320086230265e-05, + "loss": 0.286, + "step": 20145 + }, + { + "epoch": 0.44914100389887085, + "grad_norm": 0.5512806177139282, + "learning_rate": 1.1590863456172772e-05, + "loss": 0.3589, + "step": 20150 + }, + { + "epoch": 0.4492524532794909, + "grad_norm": 0.8379069566726685, + "learning_rate": 1.1587406631087701e-05, + "loss": 0.325, + "step": 20155 + }, + { + "epoch": 0.44936390266011095, + "grad_norm": 0.7071454524993896, + "learning_rate": 1.158394961139883e-05, + "loss": 0.3405, + "step": 20160 + }, + { + "epoch": 0.449475352040731, + "grad_norm": 0.49624374508857727, + "learning_rate": 1.158049239752996e-05, + "loss": 0.2863, + "step": 20165 + }, + { + "epoch": 0.4495868014213511, + "grad_norm": 0.5125814080238342, + "learning_rate": 1.1577034989904923e-05, + "loss": 0.3024, + "step": 20170 + }, + { + "epoch": 0.4496982508019712, + "grad_norm": 0.6997113227844238, + "learning_rate": 1.1573577388947573e-05, + "loss": 0.3309, + "step": 20175 + }, + { + "epoch": 0.44980970018259125, + "grad_norm": 0.58835768699646, + "learning_rate": 1.1570119595081777e-05, + "loss": 0.3265, + "step": 20180 + }, + { + "epoch": 0.4499211495632113, + "grad_norm": 0.7760397791862488, + "learning_rate": 1.156666160873144e-05, + "loss": 0.3216, + "step": 20185 + }, + { + "epoch": 0.45003259894383135, + "grad_norm": 0.63795405626297, + "learning_rate": 1.1563203430320482e-05, + "loss": 0.359, + "step": 20190 + }, + { + "epoch": 0.45014404832445143, + "grad_norm": 0.4398214817047119, + "learning_rate": 1.1559745060272849e-05, + "loss": 0.2296, + "step": 20195 + }, + { + "epoch": 0.4502554977050715, + "grad_norm": 0.5632160902023315, + "learning_rate": 1.1556286499012512e-05, + "loss": 0.4828, + "step": 20200 + }, + { + "epoch": 0.4503669470856916, + "grad_norm": 0.663616955280304, + "learning_rate": 1.155282774696346e-05, + "loss": 0.3925, + "step": 20205 + }, + { + "epoch": 0.45047839646631166, + "grad_norm": 0.7086315155029297, + "learning_rate": 1.1549368804549716e-05, + "loss": 0.2877, + "step": 20210 + }, + { + "epoch": 0.4505898458469317, + "grad_norm": 0.5059530138969421, + "learning_rate": 1.1545909672195315e-05, + "loss": 0.4004, + "step": 20215 + }, + { + "epoch": 0.45070129522755176, + "grad_norm": 0.724844753742218, + "learning_rate": 1.1542450350324321e-05, + "loss": 0.3196, + "step": 20220 + }, + { + "epoch": 0.45081274460817183, + "grad_norm": 0.34039250016212463, + "learning_rate": 1.1538990839360818e-05, + "loss": 0.232, + "step": 20225 + }, + { + "epoch": 0.4509241939887919, + "grad_norm": 0.7491649389266968, + "learning_rate": 1.1535531139728918e-05, + "loss": 0.3064, + "step": 20230 + }, + { + "epoch": 0.451035643369412, + "grad_norm": 0.7688721418380737, + "learning_rate": 1.1532071251852753e-05, + "loss": 0.2946, + "step": 20235 + }, + { + "epoch": 0.45114709275003206, + "grad_norm": 0.566525399684906, + "learning_rate": 1.1528611176156477e-05, + "loss": 0.2713, + "step": 20240 + }, + { + "epoch": 0.4512585421306521, + "grad_norm": 0.6546362042427063, + "learning_rate": 1.152515091306427e-05, + "loss": 0.2791, + "step": 20245 + }, + { + "epoch": 0.45136999151127216, + "grad_norm": 0.7522642016410828, + "learning_rate": 1.1521690463000336e-05, + "loss": 0.3468, + "step": 20250 + }, + { + "epoch": 0.45148144089189224, + "grad_norm": 0.5377618670463562, + "learning_rate": 1.1518229826388898e-05, + "loss": 0.3282, + "step": 20255 + }, + { + "epoch": 0.4515928902725123, + "grad_norm": 0.7234406471252441, + "learning_rate": 1.1514769003654195e-05, + "loss": 0.4184, + "step": 20260 + }, + { + "epoch": 0.4517043396531324, + "grad_norm": 0.5777047872543335, + "learning_rate": 1.1511307995220511e-05, + "loss": 0.2963, + "step": 20265 + }, + { + "epoch": 0.45181578903375247, + "grad_norm": 0.523595929145813, + "learning_rate": 1.1507846801512132e-05, + "loss": 0.2322, + "step": 20270 + }, + { + "epoch": 0.4519272384143725, + "grad_norm": 0.6133381724357605, + "learning_rate": 1.150438542295337e-05, + "loss": 0.2527, + "step": 20275 + }, + { + "epoch": 0.45203868779499257, + "grad_norm": 0.8271573781967163, + "learning_rate": 1.1500923859968572e-05, + "loss": 0.2821, + "step": 20280 + }, + { + "epoch": 0.45215013717561264, + "grad_norm": 0.524104118347168, + "learning_rate": 1.1497462112982092e-05, + "loss": 0.3312, + "step": 20285 + }, + { + "epoch": 0.4522615865562327, + "grad_norm": 0.6851401329040527, + "learning_rate": 1.1494000182418315e-05, + "loss": 0.336, + "step": 20290 + }, + { + "epoch": 0.4523730359368528, + "grad_norm": 0.6665769219398499, + "learning_rate": 1.1490538068701646e-05, + "loss": 0.2889, + "step": 20295 + }, + { + "epoch": 0.4524844853174729, + "grad_norm": 0.544818639755249, + "learning_rate": 1.1487075772256517e-05, + "loss": 0.2402, + "step": 20300 + }, + { + "epoch": 0.4525959346980929, + "grad_norm": 0.43808647990226746, + "learning_rate": 1.1483613293507376e-05, + "loss": 0.268, + "step": 20305 + }, + { + "epoch": 0.45270738407871297, + "grad_norm": 0.5447205305099487, + "learning_rate": 1.1480150632878697e-05, + "loss": 0.2629, + "step": 20310 + }, + { + "epoch": 0.45281883345933305, + "grad_norm": 0.6161884665489197, + "learning_rate": 1.1476687790794978e-05, + "loss": 0.25, + "step": 20315 + }, + { + "epoch": 0.4529302828399531, + "grad_norm": 0.5583766102790833, + "learning_rate": 1.147322476768073e-05, + "loss": 0.2625, + "step": 20320 + }, + { + "epoch": 0.4530417322205732, + "grad_norm": 0.5555024147033691, + "learning_rate": 1.1469761563960503e-05, + "loss": 0.3512, + "step": 20325 + }, + { + "epoch": 0.4531531816011933, + "grad_norm": 0.7176376581192017, + "learning_rate": 1.1466298180058847e-05, + "loss": 0.3411, + "step": 20330 + }, + { + "epoch": 0.4532646309818133, + "grad_norm": 0.5666157603263855, + "learning_rate": 1.1462834616400353e-05, + "loss": 0.2999, + "step": 20335 + }, + { + "epoch": 0.4533760803624334, + "grad_norm": 0.679897665977478, + "learning_rate": 1.145937087340963e-05, + "loss": 0.3178, + "step": 20340 + }, + { + "epoch": 0.45348752974305345, + "grad_norm": 0.4235386848449707, + "learning_rate": 1.14559069515113e-05, + "loss": 0.2108, + "step": 20345 + }, + { + "epoch": 0.45359897912367353, + "grad_norm": 0.6555066704750061, + "learning_rate": 1.1452442851130017e-05, + "loss": 0.3308, + "step": 20350 + }, + { + "epoch": 0.4537104285042936, + "grad_norm": 0.6095037460327148, + "learning_rate": 1.1448978572690448e-05, + "loss": 0.2004, + "step": 20355 + }, + { + "epoch": 0.4538218778849136, + "grad_norm": 0.5765591859817505, + "learning_rate": 1.1445514116617296e-05, + "loss": 0.3616, + "step": 20360 + }, + { + "epoch": 0.4539333272655337, + "grad_norm": 0.5720357894897461, + "learning_rate": 1.1442049483335267e-05, + "loss": 0.3972, + "step": 20365 + }, + { + "epoch": 0.4540447766461538, + "grad_norm": 0.6138008832931519, + "learning_rate": 1.1438584673269102e-05, + "loss": 0.3684, + "step": 20370 + }, + { + "epoch": 0.45415622602677386, + "grad_norm": 0.5443325638771057, + "learning_rate": 1.143511968684356e-05, + "loss": 0.4133, + "step": 20375 + }, + { + "epoch": 0.45426767540739393, + "grad_norm": 0.742470920085907, + "learning_rate": 1.1431654524483425e-05, + "loss": 0.2467, + "step": 20380 + }, + { + "epoch": 0.454379124788014, + "grad_norm": 0.5554812550544739, + "learning_rate": 1.1428189186613491e-05, + "loss": 0.4247, + "step": 20385 + }, + { + "epoch": 0.45449057416863403, + "grad_norm": 0.4698553681373596, + "learning_rate": 1.1424723673658593e-05, + "loss": 0.3415, + "step": 20390 + }, + { + "epoch": 0.4546020235492541, + "grad_norm": 0.5283235907554626, + "learning_rate": 1.1421257986043566e-05, + "loss": 0.2199, + "step": 20395 + }, + { + "epoch": 0.4547134729298742, + "grad_norm": 0.5475237369537354, + "learning_rate": 1.141779212419328e-05, + "loss": 0.3577, + "step": 20400 + }, + { + "epoch": 0.45482492231049426, + "grad_norm": 0.6595231294631958, + "learning_rate": 1.1414326088532625e-05, + "loss": 0.4771, + "step": 20405 + }, + { + "epoch": 0.45493637169111434, + "grad_norm": 0.6333425641059875, + "learning_rate": 1.1410859879486509e-05, + "loss": 0.2621, + "step": 20410 + }, + { + "epoch": 0.4550478210717344, + "grad_norm": 0.4809962809085846, + "learning_rate": 1.140739349747986e-05, + "loss": 0.3684, + "step": 20415 + }, + { + "epoch": 0.45515927045235444, + "grad_norm": 0.360771507024765, + "learning_rate": 1.1403926942937631e-05, + "loss": 0.4388, + "step": 20420 + }, + { + "epoch": 0.4552707198329745, + "grad_norm": 0.5278193950653076, + "learning_rate": 1.1400460216284799e-05, + "loss": 0.2437, + "step": 20425 + }, + { + "epoch": 0.4553821692135946, + "grad_norm": 0.7080212235450745, + "learning_rate": 1.139699331794635e-05, + "loss": 0.335, + "step": 20430 + }, + { + "epoch": 0.45549361859421467, + "grad_norm": 0.9361355900764465, + "learning_rate": 1.1393526248347304e-05, + "loss": 0.372, + "step": 20435 + }, + { + "epoch": 0.45560506797483474, + "grad_norm": 0.7043543457984924, + "learning_rate": 1.13900590079127e-05, + "loss": 0.314, + "step": 20440 + }, + { + "epoch": 0.4557165173554548, + "grad_norm": 0.5692029595375061, + "learning_rate": 1.1386591597067586e-05, + "loss": 0.2636, + "step": 20445 + }, + { + "epoch": 0.45582796673607484, + "grad_norm": 0.46591368317604065, + "learning_rate": 1.138312401623705e-05, + "loss": 0.3487, + "step": 20450 + }, + { + "epoch": 0.4559394161166949, + "grad_norm": 0.5977901816368103, + "learning_rate": 1.1379656265846185e-05, + "loss": 0.3386, + "step": 20455 + }, + { + "epoch": 0.456050865497315, + "grad_norm": 0.7537121176719666, + "learning_rate": 1.1376188346320107e-05, + "loss": 0.4314, + "step": 20460 + }, + { + "epoch": 0.45616231487793507, + "grad_norm": 0.6459237933158875, + "learning_rate": 1.1372720258083965e-05, + "loss": 0.3015, + "step": 20465 + }, + { + "epoch": 0.45627376425855515, + "grad_norm": 0.8141577243804932, + "learning_rate": 1.1369252001562911e-05, + "loss": 0.4319, + "step": 20470 + }, + { + "epoch": 0.4563852136391752, + "grad_norm": 0.5266554355621338, + "learning_rate": 1.1365783577182132e-05, + "loss": 0.3016, + "step": 20475 + }, + { + "epoch": 0.45649666301979525, + "grad_norm": 0.9065921306610107, + "learning_rate": 1.136231498536683e-05, + "loss": 0.3809, + "step": 20480 + }, + { + "epoch": 0.4566081124004153, + "grad_norm": 0.5425348281860352, + "learning_rate": 1.135884622654223e-05, + "loss": 0.3662, + "step": 20485 + }, + { + "epoch": 0.4567195617810354, + "grad_norm": 0.45135268568992615, + "learning_rate": 1.1355377301133564e-05, + "loss": 0.2974, + "step": 20490 + }, + { + "epoch": 0.4568310111616555, + "grad_norm": 0.4818071722984314, + "learning_rate": 1.1351908209566104e-05, + "loss": 0.232, + "step": 20495 + }, + { + "epoch": 0.45694246054227555, + "grad_norm": 0.6810379028320312, + "learning_rate": 1.1348438952265138e-05, + "loss": 0.3733, + "step": 20500 + }, + { + "epoch": 0.45705390992289563, + "grad_norm": 0.5900397896766663, + "learning_rate": 1.134496952965596e-05, + "loss": 0.4092, + "step": 20505 + }, + { + "epoch": 0.45716535930351565, + "grad_norm": 0.5401953458786011, + "learning_rate": 1.13414999421639e-05, + "loss": 0.4204, + "step": 20510 + }, + { + "epoch": 0.4572768086841357, + "grad_norm": 0.381527841091156, + "learning_rate": 1.1338030190214304e-05, + "loss": 0.2127, + "step": 20515 + }, + { + "epoch": 0.4573882580647558, + "grad_norm": 0.7745471000671387, + "learning_rate": 1.1334560274232531e-05, + "loss": 0.3037, + "step": 20520 + }, + { + "epoch": 0.4574997074453759, + "grad_norm": 0.7092231512069702, + "learning_rate": 1.1331090194643972e-05, + "loss": 0.2768, + "step": 20525 + }, + { + "epoch": 0.45761115682599596, + "grad_norm": 0.9606593251228333, + "learning_rate": 1.1327619951874029e-05, + "loss": 0.3453, + "step": 20530 + }, + { + "epoch": 0.45772260620661603, + "grad_norm": 0.9122305512428284, + "learning_rate": 1.132414954634813e-05, + "loss": 0.2957, + "step": 20535 + }, + { + "epoch": 0.45783405558723606, + "grad_norm": 0.4970526397228241, + "learning_rate": 1.1320678978491713e-05, + "loss": 0.299, + "step": 20540 + }, + { + "epoch": 0.45794550496785613, + "grad_norm": 0.5332255959510803, + "learning_rate": 1.1317208248730252e-05, + "loss": 0.3238, + "step": 20545 + }, + { + "epoch": 0.4580569543484762, + "grad_norm": 0.5015629529953003, + "learning_rate": 1.1313737357489223e-05, + "loss": 0.3711, + "step": 20550 + }, + { + "epoch": 0.4581684037290963, + "grad_norm": 0.6893032193183899, + "learning_rate": 1.1310266305194136e-05, + "loss": 0.2776, + "step": 20555 + }, + { + "epoch": 0.45827985310971636, + "grad_norm": 0.6035255789756775, + "learning_rate": 1.1306795092270512e-05, + "loss": 0.3867, + "step": 20560 + }, + { + "epoch": 0.4583913024903364, + "grad_norm": 0.4319852888584137, + "learning_rate": 1.1303323719143902e-05, + "loss": 0.2329, + "step": 20565 + }, + { + "epoch": 0.45850275187095646, + "grad_norm": 0.7130627036094666, + "learning_rate": 1.1299852186239859e-05, + "loss": 0.4149, + "step": 20570 + }, + { + "epoch": 0.45861420125157654, + "grad_norm": 0.5139709115028381, + "learning_rate": 1.1296380493983971e-05, + "loss": 0.3604, + "step": 20575 + }, + { + "epoch": 0.4587256506321966, + "grad_norm": 0.6675410270690918, + "learning_rate": 1.1292908642801845e-05, + "loss": 0.267, + "step": 20580 + }, + { + "epoch": 0.4588371000128167, + "grad_norm": 0.5854967832565308, + "learning_rate": 1.1289436633119095e-05, + "loss": 0.4298, + "step": 20585 + }, + { + "epoch": 0.45894854939343677, + "grad_norm": 0.7468124628067017, + "learning_rate": 1.128596446536137e-05, + "loss": 0.439, + "step": 20590 + }, + { + "epoch": 0.4590599987740568, + "grad_norm": 0.7832880020141602, + "learning_rate": 1.1282492139954326e-05, + "loss": 0.3148, + "step": 20595 + }, + { + "epoch": 0.45917144815467686, + "grad_norm": 0.5017052292823792, + "learning_rate": 1.1279019657323644e-05, + "loss": 0.3527, + "step": 20600 + }, + { + "epoch": 0.45928289753529694, + "grad_norm": 0.7330566048622131, + "learning_rate": 1.1275547017895027e-05, + "loss": 0.332, + "step": 20605 + }, + { + "epoch": 0.459394346915917, + "grad_norm": 0.667830228805542, + "learning_rate": 1.1272074222094189e-05, + "loss": 0.3643, + "step": 20610 + }, + { + "epoch": 0.4595057962965371, + "grad_norm": 0.5111591219902039, + "learning_rate": 1.126860127034687e-05, + "loss": 0.2477, + "step": 20615 + }, + { + "epoch": 0.45961724567715717, + "grad_norm": 0.5095810890197754, + "learning_rate": 1.1265128163078831e-05, + "loss": 0.3729, + "step": 20620 + }, + { + "epoch": 0.4597286950577772, + "grad_norm": 0.6718348264694214, + "learning_rate": 1.1261654900715844e-05, + "loss": 0.3688, + "step": 20625 + }, + { + "epoch": 0.45984014443839727, + "grad_norm": 0.6186143755912781, + "learning_rate": 1.1258181483683699e-05, + "loss": 0.2758, + "step": 20630 + }, + { + "epoch": 0.45995159381901735, + "grad_norm": 0.6600292325019836, + "learning_rate": 1.125470791240822e-05, + "loss": 0.3646, + "step": 20635 + }, + { + "epoch": 0.4600630431996374, + "grad_norm": 0.6270229816436768, + "learning_rate": 1.1251234187315237e-05, + "loss": 0.3948, + "step": 20640 + }, + { + "epoch": 0.4601744925802575, + "grad_norm": 0.6568154692649841, + "learning_rate": 1.12477603088306e-05, + "loss": 0.4049, + "step": 20645 + }, + { + "epoch": 0.4602859419608776, + "grad_norm": 0.6456282138824463, + "learning_rate": 1.1244286277380176e-05, + "loss": 0.3546, + "step": 20650 + }, + { + "epoch": 0.4603973913414976, + "grad_norm": 0.5540902018547058, + "learning_rate": 1.1240812093389865e-05, + "loss": 0.2044, + "step": 20655 + }, + { + "epoch": 0.4605088407221177, + "grad_norm": 0.7089858055114746, + "learning_rate": 1.1237337757285564e-05, + "loss": 0.4284, + "step": 20660 + }, + { + "epoch": 0.46062029010273775, + "grad_norm": 0.6283511519432068, + "learning_rate": 1.1233863269493208e-05, + "loss": 0.327, + "step": 20665 + }, + { + "epoch": 0.4607317394833578, + "grad_norm": 0.5950222611427307, + "learning_rate": 1.1230388630438737e-05, + "loss": 0.3742, + "step": 20670 + }, + { + "epoch": 0.4608431888639779, + "grad_norm": 0.4111473560333252, + "learning_rate": 1.1226913840548119e-05, + "loss": 0.3084, + "step": 20675 + }, + { + "epoch": 0.460954638244598, + "grad_norm": 0.5442538261413574, + "learning_rate": 1.1223438900247334e-05, + "loss": 0.2162, + "step": 20680 + }, + { + "epoch": 0.461066087625218, + "grad_norm": 0.5340818762779236, + "learning_rate": 1.1219963809962382e-05, + "loss": 0.3153, + "step": 20685 + }, + { + "epoch": 0.4611775370058381, + "grad_norm": 0.5781068801879883, + "learning_rate": 1.1216488570119283e-05, + "loss": 0.3198, + "step": 20690 + }, + { + "epoch": 0.46128898638645816, + "grad_norm": 0.6196689009666443, + "learning_rate": 1.1213013181144079e-05, + "loss": 0.3136, + "step": 20695 + }, + { + "epoch": 0.46140043576707823, + "grad_norm": 0.7419324517250061, + "learning_rate": 1.120953764346282e-05, + "loss": 0.245, + "step": 20700 + }, + { + "epoch": 0.4615118851476983, + "grad_norm": 0.5698023438453674, + "learning_rate": 1.120606195750158e-05, + "loss": 0.3621, + "step": 20705 + }, + { + "epoch": 0.4616233345283184, + "grad_norm": 0.7995551228523254, + "learning_rate": 1.1202586123686457e-05, + "loss": 0.2984, + "step": 20710 + }, + { + "epoch": 0.4617347839089384, + "grad_norm": 0.8355708718299866, + "learning_rate": 1.1199110142443557e-05, + "loss": 0.3809, + "step": 20715 + }, + { + "epoch": 0.4618462332895585, + "grad_norm": 0.35497555136680603, + "learning_rate": 1.1195634014199006e-05, + "loss": 0.3197, + "step": 20720 + }, + { + "epoch": 0.46195768267017856, + "grad_norm": 0.6658803224563599, + "learning_rate": 1.1192157739378958e-05, + "loss": 0.3327, + "step": 20725 + }, + { + "epoch": 0.46206913205079864, + "grad_norm": 0.6382262110710144, + "learning_rate": 1.1188681318409571e-05, + "loss": 0.4237, + "step": 20730 + }, + { + "epoch": 0.4621805814314187, + "grad_norm": 0.6009849905967712, + "learning_rate": 1.118520475171703e-05, + "loss": 0.3045, + "step": 20735 + }, + { + "epoch": 0.46229203081203873, + "grad_norm": 0.6650320291519165, + "learning_rate": 1.1181728039727532e-05, + "loss": 0.4768, + "step": 20740 + }, + { + "epoch": 0.4624034801926588, + "grad_norm": 0.4834892749786377, + "learning_rate": 1.1178251182867302e-05, + "loss": 0.3506, + "step": 20745 + }, + { + "epoch": 0.4625149295732789, + "grad_norm": 0.6801639795303345, + "learning_rate": 1.1174774181562568e-05, + "loss": 0.3021, + "step": 20750 + }, + { + "epoch": 0.46262637895389896, + "grad_norm": 0.5477128028869629, + "learning_rate": 1.117129703623959e-05, + "loss": 0.3267, + "step": 20755 + }, + { + "epoch": 0.46273782833451904, + "grad_norm": 0.7352733612060547, + "learning_rate": 1.1167819747324635e-05, + "loss": 0.2959, + "step": 20760 + }, + { + "epoch": 0.4628492777151391, + "grad_norm": 0.6437637209892273, + "learning_rate": 1.1164342315243997e-05, + "loss": 0.2679, + "step": 20765 + }, + { + "epoch": 0.46296072709575914, + "grad_norm": 0.586779773235321, + "learning_rate": 1.1160864740423971e-05, + "loss": 0.4493, + "step": 20770 + }, + { + "epoch": 0.4630721764763792, + "grad_norm": 0.6587520837783813, + "learning_rate": 1.1157387023290896e-05, + "loss": 0.325, + "step": 20775 + }, + { + "epoch": 0.4631836258569993, + "grad_norm": 0.8112159371376038, + "learning_rate": 1.1153909164271101e-05, + "loss": 0.2332, + "step": 20780 + }, + { + "epoch": 0.46329507523761937, + "grad_norm": 0.8643737435340881, + "learning_rate": 1.1150431163790951e-05, + "loss": 0.2957, + "step": 20785 + }, + { + "epoch": 0.46340652461823945, + "grad_norm": 0.6423876285552979, + "learning_rate": 1.1146953022276819e-05, + "loss": 0.28, + "step": 20790 + }, + { + "epoch": 0.4635179739988595, + "grad_norm": 0.4466862082481384, + "learning_rate": 1.11434747401551e-05, + "loss": 0.3576, + "step": 20795 + }, + { + "epoch": 0.46362942337947954, + "grad_norm": 0.4856477975845337, + "learning_rate": 1.1139996317852204e-05, + "loss": 0.2646, + "step": 20800 + }, + { + "epoch": 0.4637408727600996, + "grad_norm": 0.6780421733856201, + "learning_rate": 1.1136517755794559e-05, + "loss": 0.3032, + "step": 20805 + }, + { + "epoch": 0.4638523221407197, + "grad_norm": 0.9612930417060852, + "learning_rate": 1.1133039054408612e-05, + "loss": 0.2347, + "step": 20810 + }, + { + "epoch": 0.4639637715213398, + "grad_norm": 0.5880045294761658, + "learning_rate": 1.112956021412082e-05, + "loss": 0.4147, + "step": 20815 + }, + { + "epoch": 0.46407522090195985, + "grad_norm": 0.5519404411315918, + "learning_rate": 1.1126081235357667e-05, + "loss": 0.2711, + "step": 20820 + }, + { + "epoch": 0.4641866702825799, + "grad_norm": 0.7037073969841003, + "learning_rate": 1.1122602118545642e-05, + "loss": 0.2904, + "step": 20825 + }, + { + "epoch": 0.46429811966319995, + "grad_norm": 0.4632646441459656, + "learning_rate": 1.1119122864111264e-05, + "loss": 0.2557, + "step": 20830 + }, + { + "epoch": 0.46440956904382, + "grad_norm": 0.5045544505119324, + "learning_rate": 1.1115643472481067e-05, + "loss": 0.1933, + "step": 20835 + }, + { + "epoch": 0.4645210184244401, + "grad_norm": 0.5202092528343201, + "learning_rate": 1.1112163944081585e-05, + "loss": 0.3419, + "step": 20840 + }, + { + "epoch": 0.4646324678050602, + "grad_norm": 0.4976825714111328, + "learning_rate": 1.1108684279339388e-05, + "loss": 0.2634, + "step": 20845 + }, + { + "epoch": 0.46474391718568026, + "grad_norm": 0.5703532695770264, + "learning_rate": 1.110520447868106e-05, + "loss": 0.3883, + "step": 20850 + }, + { + "epoch": 0.46485536656630033, + "grad_norm": 0.7237419486045837, + "learning_rate": 1.1101724542533195e-05, + "loss": 0.3212, + "step": 20855 + }, + { + "epoch": 0.46496681594692035, + "grad_norm": 0.4750984013080597, + "learning_rate": 1.1098244471322397e-05, + "loss": 0.1943, + "step": 20860 + }, + { + "epoch": 0.46507826532754043, + "grad_norm": 0.32952219247817993, + "learning_rate": 1.109476426547531e-05, + "loss": 0.1731, + "step": 20865 + }, + { + "epoch": 0.4651897147081605, + "grad_norm": 0.6741600632667542, + "learning_rate": 1.1091283925418577e-05, + "loss": 0.3286, + "step": 20870 + }, + { + "epoch": 0.4653011640887806, + "grad_norm": 0.7948526740074158, + "learning_rate": 1.1087803451578854e-05, + "loss": 0.4216, + "step": 20875 + }, + { + "epoch": 0.46541261346940066, + "grad_norm": 0.6836048364639282, + "learning_rate": 1.1084322844382822e-05, + "loss": 0.3021, + "step": 20880 + }, + { + "epoch": 0.46552406285002074, + "grad_norm": 0.5596360564231873, + "learning_rate": 1.1080842104257183e-05, + "loss": 0.2108, + "step": 20885 + }, + { + "epoch": 0.46563551223064076, + "grad_norm": 0.668707549571991, + "learning_rate": 1.1077361231628646e-05, + "loss": 0.4101, + "step": 20890 + }, + { + "epoch": 0.46574696161126083, + "grad_norm": 0.49210840463638306, + "learning_rate": 1.1073880226923933e-05, + "loss": 0.291, + "step": 20895 + }, + { + "epoch": 0.4658584109918809, + "grad_norm": 0.6215358376502991, + "learning_rate": 1.1070399090569796e-05, + "loss": 0.3779, + "step": 20900 + }, + { + "epoch": 0.465969860372501, + "grad_norm": 0.5351458787918091, + "learning_rate": 1.1066917822992992e-05, + "loss": 0.2616, + "step": 20905 + }, + { + "epoch": 0.46608130975312106, + "grad_norm": 0.24766775965690613, + "learning_rate": 1.1063436424620302e-05, + "loss": 0.3989, + "step": 20910 + }, + { + "epoch": 0.46619275913374114, + "grad_norm": 0.5665358304977417, + "learning_rate": 1.1059954895878512e-05, + "loss": 0.2567, + "step": 20915 + }, + { + "epoch": 0.46630420851436116, + "grad_norm": 0.5747869610786438, + "learning_rate": 1.1056473237194434e-05, + "loss": 0.299, + "step": 20920 + }, + { + "epoch": 0.46641565789498124, + "grad_norm": 0.4050520360469818, + "learning_rate": 1.105299144899489e-05, + "loss": 0.3226, + "step": 20925 + }, + { + "epoch": 0.4665271072756013, + "grad_norm": 0.8657063245773315, + "learning_rate": 1.1049509531706721e-05, + "loss": 0.3553, + "step": 20930 + }, + { + "epoch": 0.4666385566562214, + "grad_norm": 0.7300972938537598, + "learning_rate": 1.104602748575679e-05, + "loss": 0.3177, + "step": 20935 + }, + { + "epoch": 0.46675000603684147, + "grad_norm": 0.8992680907249451, + "learning_rate": 1.1042545311571957e-05, + "loss": 0.2669, + "step": 20940 + }, + { + "epoch": 0.4668614554174615, + "grad_norm": 0.7697744369506836, + "learning_rate": 1.103906300957912e-05, + "loss": 0.3603, + "step": 20945 + }, + { + "epoch": 0.46697290479808157, + "grad_norm": 0.7392646074295044, + "learning_rate": 1.1035580580205179e-05, + "loss": 0.3591, + "step": 20950 + }, + { + "epoch": 0.46708435417870164, + "grad_norm": 0.5674382448196411, + "learning_rate": 1.103209802387705e-05, + "loss": 0.4146, + "step": 20955 + }, + { + "epoch": 0.4671958035593217, + "grad_norm": 0.6826933026313782, + "learning_rate": 1.1028615341021669e-05, + "loss": 0.2796, + "step": 20960 + }, + { + "epoch": 0.4673072529399418, + "grad_norm": 0.6870999336242676, + "learning_rate": 1.102513253206599e-05, + "loss": 0.2892, + "step": 20965 + }, + { + "epoch": 0.4674187023205619, + "grad_norm": 0.6140218377113342, + "learning_rate": 1.1021649597436971e-05, + "loss": 0.3268, + "step": 20970 + }, + { + "epoch": 0.4675301517011819, + "grad_norm": 0.5515140891075134, + "learning_rate": 1.10181665375616e-05, + "loss": 0.3496, + "step": 20975 + }, + { + "epoch": 0.46764160108180197, + "grad_norm": 0.8796146512031555, + "learning_rate": 1.1014683352866873e-05, + "loss": 0.2802, + "step": 20980 + }, + { + "epoch": 0.46775305046242205, + "grad_norm": 1.1398389339447021, + "learning_rate": 1.1011200043779795e-05, + "loss": 0.3306, + "step": 20985 + }, + { + "epoch": 0.4678644998430421, + "grad_norm": 0.6144253015518188, + "learning_rate": 1.10077166107274e-05, + "loss": 0.3811, + "step": 20990 + }, + { + "epoch": 0.4679759492236622, + "grad_norm": 0.7339516878128052, + "learning_rate": 1.1004233054136726e-05, + "loss": 0.4931, + "step": 20995 + }, + { + "epoch": 0.4680873986042823, + "grad_norm": 0.5207570195198059, + "learning_rate": 1.1000749374434826e-05, + "loss": 0.3024, + "step": 21000 + }, + { + "epoch": 0.4681988479849023, + "grad_norm": 0.5385868549346924, + "learning_rate": 1.0997265572048785e-05, + "loss": 0.333, + "step": 21005 + }, + { + "epoch": 0.4683102973655224, + "grad_norm": 0.6920881271362305, + "learning_rate": 1.0993781647405679e-05, + "loss": 0.2503, + "step": 21010 + }, + { + "epoch": 0.46842174674614245, + "grad_norm": 0.6983755230903625, + "learning_rate": 1.0990297600932614e-05, + "loss": 0.342, + "step": 21015 + }, + { + "epoch": 0.46853319612676253, + "grad_norm": 0.6789685487747192, + "learning_rate": 1.0986813433056707e-05, + "loss": 0.3334, + "step": 21020 + }, + { + "epoch": 0.4686446455073826, + "grad_norm": 0.6402018070220947, + "learning_rate": 1.0983329144205092e-05, + "loss": 0.4244, + "step": 21025 + }, + { + "epoch": 0.4687560948880027, + "grad_norm": 0.671718955039978, + "learning_rate": 1.0979844734804912e-05, + "loss": 0.3912, + "step": 21030 + }, + { + "epoch": 0.4688675442686227, + "grad_norm": 0.5738045573234558, + "learning_rate": 1.097636020528333e-05, + "loss": 0.4052, + "step": 21035 + }, + { + "epoch": 0.4689789936492428, + "grad_norm": 0.6680939197540283, + "learning_rate": 1.0972875556067526e-05, + "loss": 0.3295, + "step": 21040 + }, + { + "epoch": 0.46909044302986286, + "grad_norm": 0.6745684146881104, + "learning_rate": 1.0969390787584683e-05, + "loss": 0.3101, + "step": 21045 + }, + { + "epoch": 0.46920189241048293, + "grad_norm": 0.6045733094215393, + "learning_rate": 1.0965905900262019e-05, + "loss": 0.2997, + "step": 21050 + }, + { + "epoch": 0.469313341791103, + "grad_norm": 0.5858403444290161, + "learning_rate": 1.0962420894526744e-05, + "loss": 0.3838, + "step": 21055 + }, + { + "epoch": 0.4694247911717231, + "grad_norm": 0.9184689521789551, + "learning_rate": 1.0958935770806094e-05, + "loss": 0.3696, + "step": 21060 + }, + { + "epoch": 0.4695362405523431, + "grad_norm": 0.715875506401062, + "learning_rate": 1.0955450529527323e-05, + "loss": 0.2313, + "step": 21065 + }, + { + "epoch": 0.4696476899329632, + "grad_norm": 0.5318100452423096, + "learning_rate": 1.0951965171117691e-05, + "loss": 0.2672, + "step": 21070 + }, + { + "epoch": 0.46975913931358326, + "grad_norm": 0.5228089094161987, + "learning_rate": 1.0948479696004479e-05, + "loss": 0.2672, + "step": 21075 + }, + { + "epoch": 0.46987058869420334, + "grad_norm": 0.5848378539085388, + "learning_rate": 1.0944994104614974e-05, + "loss": 0.3674, + "step": 21080 + }, + { + "epoch": 0.4699820380748234, + "grad_norm": 0.5758753418922424, + "learning_rate": 1.094150839737649e-05, + "loss": 0.246, + "step": 21085 + }, + { + "epoch": 0.4700934874554435, + "grad_norm": 0.4807438850402832, + "learning_rate": 1.093802257471634e-05, + "loss": 0.3168, + "step": 21090 + }, + { + "epoch": 0.4702049368360635, + "grad_norm": 0.8553235530853271, + "learning_rate": 1.0934536637061865e-05, + "loss": 0.3348, + "step": 21095 + }, + { + "epoch": 0.4703163862166836, + "grad_norm": 0.75001460313797, + "learning_rate": 1.0931050584840413e-05, + "loss": 0.3114, + "step": 21100 + }, + { + "epoch": 0.47042783559730367, + "grad_norm": 0.706753134727478, + "learning_rate": 1.0927564418479342e-05, + "loss": 0.3645, + "step": 21105 + }, + { + "epoch": 0.47053928497792374, + "grad_norm": 0.6393307447433472, + "learning_rate": 1.0924078138406037e-05, + "loss": 0.3728, + "step": 21110 + }, + { + "epoch": 0.4706507343585438, + "grad_norm": 0.6542025208473206, + "learning_rate": 1.0920591745047882e-05, + "loss": 0.2668, + "step": 21115 + }, + { + "epoch": 0.47076218373916384, + "grad_norm": 0.7120665311813354, + "learning_rate": 1.0917105238832286e-05, + "loss": 0.2934, + "step": 21120 + }, + { + "epoch": 0.4708736331197839, + "grad_norm": 0.4809533357620239, + "learning_rate": 1.0913618620186666e-05, + "loss": 0.3022, + "step": 21125 + }, + { + "epoch": 0.470985082500404, + "grad_norm": 0.6897178292274475, + "learning_rate": 1.0910131889538457e-05, + "loss": 0.3523, + "step": 21130 + }, + { + "epoch": 0.47109653188102407, + "grad_norm": 0.540787398815155, + "learning_rate": 1.0906645047315103e-05, + "loss": 0.3457, + "step": 21135 + }, + { + "epoch": 0.47120798126164415, + "grad_norm": 0.7425426840782166, + "learning_rate": 1.0903158093944061e-05, + "loss": 0.3859, + "step": 21140 + }, + { + "epoch": 0.4713194306422642, + "grad_norm": 0.5876283049583435, + "learning_rate": 1.0899671029852811e-05, + "loss": 0.3606, + "step": 21145 + }, + { + "epoch": 0.47143088002288425, + "grad_norm": 0.7805973291397095, + "learning_rate": 1.089618385546884e-05, + "loss": 0.2542, + "step": 21150 + }, + { + "epoch": 0.4715423294035043, + "grad_norm": 0.6115480661392212, + "learning_rate": 1.0892696571219644e-05, + "loss": 0.3854, + "step": 21155 + }, + { + "epoch": 0.4716537787841244, + "grad_norm": 0.5104815363883972, + "learning_rate": 1.0889209177532736e-05, + "loss": 0.3008, + "step": 21160 + }, + { + "epoch": 0.4717652281647445, + "grad_norm": 0.5803892016410828, + "learning_rate": 1.0885721674835647e-05, + "loss": 0.3487, + "step": 21165 + }, + { + "epoch": 0.47187667754536455, + "grad_norm": 0.5066009759902954, + "learning_rate": 1.0882234063555918e-05, + "loss": 0.3002, + "step": 21170 + }, + { + "epoch": 0.47198812692598463, + "grad_norm": 0.5209431648254395, + "learning_rate": 1.0878746344121105e-05, + "loss": 0.3443, + "step": 21175 + }, + { + "epoch": 0.47209957630660465, + "grad_norm": 0.5825982093811035, + "learning_rate": 1.087525851695877e-05, + "loss": 0.2403, + "step": 21180 + }, + { + "epoch": 0.47221102568722473, + "grad_norm": 0.5618570446968079, + "learning_rate": 1.0871770582496499e-05, + "loss": 0.2063, + "step": 21185 + }, + { + "epoch": 0.4723224750678448, + "grad_norm": 0.6874715089797974, + "learning_rate": 1.0868282541161882e-05, + "loss": 0.2414, + "step": 21190 + }, + { + "epoch": 0.4724339244484649, + "grad_norm": 0.4891948997974396, + "learning_rate": 1.086479439338253e-05, + "loss": 0.4205, + "step": 21195 + }, + { + "epoch": 0.47254537382908496, + "grad_norm": 0.7241100668907166, + "learning_rate": 1.0861306139586056e-05, + "loss": 0.359, + "step": 21200 + }, + { + "epoch": 0.47265682320970503, + "grad_norm": 0.6043018698692322, + "learning_rate": 1.0857817780200102e-05, + "loss": 0.3749, + "step": 21205 + }, + { + "epoch": 0.47276827259032506, + "grad_norm": 0.7084778547286987, + "learning_rate": 1.085432931565231e-05, + "loss": 0.2436, + "step": 21210 + }, + { + "epoch": 0.47287972197094513, + "grad_norm": 0.817477822303772, + "learning_rate": 1.0850840746370333e-05, + "loss": 0.3669, + "step": 21215 + }, + { + "epoch": 0.4729911713515652, + "grad_norm": 0.7407671809196472, + "learning_rate": 1.0847352072781852e-05, + "loss": 0.3582, + "step": 21220 + }, + { + "epoch": 0.4731026207321853, + "grad_norm": 0.5965331792831421, + "learning_rate": 1.0843863295314549e-05, + "loss": 0.3101, + "step": 21225 + }, + { + "epoch": 0.47321407011280536, + "grad_norm": 0.5705615282058716, + "learning_rate": 1.0840374414396115e-05, + "loss": 0.29, + "step": 21230 + }, + { + "epoch": 0.47332551949342544, + "grad_norm": 0.4807664453983307, + "learning_rate": 1.0836885430454267e-05, + "loss": 0.3527, + "step": 21235 + }, + { + "epoch": 0.47343696887404546, + "grad_norm": 0.6452370882034302, + "learning_rate": 1.0833396343916726e-05, + "loss": 0.3127, + "step": 21240 + }, + { + "epoch": 0.47354841825466554, + "grad_norm": 0.6671549081802368, + "learning_rate": 1.0829907155211224e-05, + "loss": 0.3212, + "step": 21245 + }, + { + "epoch": 0.4736598676352856, + "grad_norm": 0.7048370242118835, + "learning_rate": 1.0826417864765511e-05, + "loss": 0.3144, + "step": 21250 + }, + { + "epoch": 0.4737713170159057, + "grad_norm": 0.47000741958618164, + "learning_rate": 1.0822928473007348e-05, + "loss": 0.2763, + "step": 21255 + }, + { + "epoch": 0.47388276639652577, + "grad_norm": 0.7111234068870544, + "learning_rate": 1.0819438980364504e-05, + "loss": 0.3689, + "step": 21260 + }, + { + "epoch": 0.47399421577714584, + "grad_norm": 0.6803306937217712, + "learning_rate": 1.0815949387264766e-05, + "loss": 0.2767, + "step": 21265 + }, + { + "epoch": 0.47410566515776587, + "grad_norm": 0.7065713405609131, + "learning_rate": 1.0812459694135934e-05, + "loss": 0.2115, + "step": 21270 + }, + { + "epoch": 0.47421711453838594, + "grad_norm": 0.4053919017314911, + "learning_rate": 1.0808969901405816e-05, + "loss": 0.2379, + "step": 21275 + }, + { + "epoch": 0.474328563919006, + "grad_norm": 0.6302461624145508, + "learning_rate": 1.080548000950223e-05, + "loss": 0.3157, + "step": 21280 + }, + { + "epoch": 0.4744400132996261, + "grad_norm": 0.8447433114051819, + "learning_rate": 1.080199001885301e-05, + "loss": 0.3754, + "step": 21285 + }, + { + "epoch": 0.47455146268024617, + "grad_norm": 0.6583344340324402, + "learning_rate": 1.0798499929886008e-05, + "loss": 0.2893, + "step": 21290 + }, + { + "epoch": 0.47466291206086625, + "grad_norm": 0.7092258334159851, + "learning_rate": 1.0795009743029074e-05, + "loss": 0.3731, + "step": 21295 + }, + { + "epoch": 0.47477436144148627, + "grad_norm": 0.9176077842712402, + "learning_rate": 1.0791519458710085e-05, + "loss": 0.2761, + "step": 21300 + }, + { + "epoch": 0.47488581082210635, + "grad_norm": 0.7795170545578003, + "learning_rate": 1.078802907735692e-05, + "loss": 0.2331, + "step": 21305 + }, + { + "epoch": 0.4749972602027264, + "grad_norm": 0.8962420225143433, + "learning_rate": 1.0784538599397472e-05, + "loss": 0.3119, + "step": 21310 + }, + { + "epoch": 0.4751087095833465, + "grad_norm": 0.38173189759254456, + "learning_rate": 1.0781048025259648e-05, + "loss": 0.354, + "step": 21315 + }, + { + "epoch": 0.4752201589639666, + "grad_norm": 0.6000009775161743, + "learning_rate": 1.0777557355371364e-05, + "loss": 0.4249, + "step": 21320 + }, + { + "epoch": 0.4753316083445866, + "grad_norm": 0.7955544590950012, + "learning_rate": 1.0774066590160551e-05, + "loss": 0.3269, + "step": 21325 + }, + { + "epoch": 0.4754430577252067, + "grad_norm": 0.5262717604637146, + "learning_rate": 1.0770575730055149e-05, + "loss": 0.2005, + "step": 21330 + }, + { + "epoch": 0.47555450710582675, + "grad_norm": 0.6059525609016418, + "learning_rate": 1.0767084775483107e-05, + "loss": 0.2447, + "step": 21335 + }, + { + "epoch": 0.47566595648644683, + "grad_norm": 0.4977636933326721, + "learning_rate": 1.0763593726872395e-05, + "loss": 0.2031, + "step": 21340 + }, + { + "epoch": 0.4757774058670669, + "grad_norm": 1.0840247869491577, + "learning_rate": 1.076010258465099e-05, + "loss": 0.3704, + "step": 21345 + }, + { + "epoch": 0.475888855247687, + "grad_norm": 1.2776250839233398, + "learning_rate": 1.0756611349246871e-05, + "loss": 0.1845, + "step": 21350 + }, + { + "epoch": 0.476000304628307, + "grad_norm": 0.8424351215362549, + "learning_rate": 1.0753120021088037e-05, + "loss": 0.2985, + "step": 21355 + }, + { + "epoch": 0.4761117540089271, + "grad_norm": 0.38192832469940186, + "learning_rate": 1.0749628600602509e-05, + "loss": 0.2605, + "step": 21360 + }, + { + "epoch": 0.47622320338954716, + "grad_norm": 0.5329508185386658, + "learning_rate": 1.0746137088218298e-05, + "loss": 0.3446, + "step": 21365 + }, + { + "epoch": 0.47633465277016723, + "grad_norm": 0.5951401591300964, + "learning_rate": 1.0742645484363437e-05, + "loss": 0.2271, + "step": 21370 + }, + { + "epoch": 0.4764461021507873, + "grad_norm": 0.5628944039344788, + "learning_rate": 1.0739153789465974e-05, + "loss": 0.2752, + "step": 21375 + }, + { + "epoch": 0.4765575515314074, + "grad_norm": 0.4774826467037201, + "learning_rate": 1.0735662003953964e-05, + "loss": 0.3852, + "step": 21380 + }, + { + "epoch": 0.4766690009120274, + "grad_norm": 0.612303614616394, + "learning_rate": 1.0732170128255467e-05, + "loss": 0.331, + "step": 21385 + }, + { + "epoch": 0.4767804502926475, + "grad_norm": 0.6990157961845398, + "learning_rate": 1.0728678162798565e-05, + "loss": 0.3232, + "step": 21390 + }, + { + "epoch": 0.47689189967326756, + "grad_norm": 0.6447412967681885, + "learning_rate": 1.0725186108011348e-05, + "loss": 0.3336, + "step": 21395 + }, + { + "epoch": 0.47700334905388764, + "grad_norm": 0.6951749920845032, + "learning_rate": 1.0721693964321906e-05, + "loss": 0.3038, + "step": 21400 + }, + { + "epoch": 0.4771147984345077, + "grad_norm": 0.6879447102546692, + "learning_rate": 1.0718201732158357e-05, + "loss": 0.2384, + "step": 21405 + }, + { + "epoch": 0.4772262478151278, + "grad_norm": 0.5735766887664795, + "learning_rate": 1.071470941194882e-05, + "loss": 0.2334, + "step": 21410 + }, + { + "epoch": 0.4773376971957478, + "grad_norm": 0.6717875599861145, + "learning_rate": 1.0711217004121425e-05, + "loss": 0.2281, + "step": 21415 + }, + { + "epoch": 0.4774491465763679, + "grad_norm": 0.6436781883239746, + "learning_rate": 1.0707724509104318e-05, + "loss": 0.408, + "step": 21420 + }, + { + "epoch": 0.47756059595698797, + "grad_norm": 0.6138870120048523, + "learning_rate": 1.0704231927325646e-05, + "loss": 0.3461, + "step": 21425 + }, + { + "epoch": 0.47767204533760804, + "grad_norm": 0.5195964574813843, + "learning_rate": 1.0700739259213577e-05, + "loss": 0.3413, + "step": 21430 + }, + { + "epoch": 0.4777834947182281, + "grad_norm": 0.4784146249294281, + "learning_rate": 1.0697246505196282e-05, + "loss": 0.3437, + "step": 21435 + }, + { + "epoch": 0.4778949440988482, + "grad_norm": 0.6470903754234314, + "learning_rate": 1.0693753665701949e-05, + "loss": 0.2982, + "step": 21440 + }, + { + "epoch": 0.4780063934794682, + "grad_norm": 0.4970431327819824, + "learning_rate": 1.069026074115877e-05, + "loss": 0.2671, + "step": 21445 + }, + { + "epoch": 0.4781178428600883, + "grad_norm": 0.5180191397666931, + "learning_rate": 1.0686767731994952e-05, + "loss": 0.2682, + "step": 21450 + }, + { + "epoch": 0.47822929224070837, + "grad_norm": 0.5427197813987732, + "learning_rate": 1.0683274638638712e-05, + "loss": 0.2806, + "step": 21455 + }, + { + "epoch": 0.47834074162132845, + "grad_norm": 0.7320225238800049, + "learning_rate": 1.0679781461518275e-05, + "loss": 0.3579, + "step": 21460 + }, + { + "epoch": 0.4784521910019485, + "grad_norm": 0.4966415464878082, + "learning_rate": 1.0676288201061878e-05, + "loss": 0.2459, + "step": 21465 + }, + { + "epoch": 0.4785636403825686, + "grad_norm": 0.5143702626228333, + "learning_rate": 1.0672794857697769e-05, + "loss": 0.3453, + "step": 21470 + }, + { + "epoch": 0.4786750897631886, + "grad_norm": 0.4696958363056183, + "learning_rate": 1.0669301431854203e-05, + "loss": 0.2583, + "step": 21475 + }, + { + "epoch": 0.4787865391438087, + "grad_norm": 0.6180020570755005, + "learning_rate": 1.0665807923959446e-05, + "loss": 0.4172, + "step": 21480 + }, + { + "epoch": 0.4788979885244288, + "grad_norm": 0.9005603790283203, + "learning_rate": 1.0662314334441779e-05, + "loss": 0.294, + "step": 21485 + }, + { + "epoch": 0.47900943790504885, + "grad_norm": 0.43045490980148315, + "learning_rate": 1.0658820663729486e-05, + "loss": 0.244, + "step": 21490 + }, + { + "epoch": 0.47912088728566893, + "grad_norm": 0.7310406565666199, + "learning_rate": 1.0655326912250863e-05, + "loss": 0.252, + "step": 21495 + }, + { + "epoch": 0.47923233666628895, + "grad_norm": 0.6532853841781616, + "learning_rate": 1.0651833080434222e-05, + "loss": 0.397, + "step": 21500 + }, + { + "epoch": 0.479343786046909, + "grad_norm": 0.6540833115577698, + "learning_rate": 1.0648339168707875e-05, + "loss": 0.2388, + "step": 21505 + }, + { + "epoch": 0.4794552354275291, + "grad_norm": 0.3894214928150177, + "learning_rate": 1.0644845177500152e-05, + "loss": 0.2435, + "step": 21510 + }, + { + "epoch": 0.4795666848081492, + "grad_norm": 0.6973043084144592, + "learning_rate": 1.0641351107239384e-05, + "loss": 0.3917, + "step": 21515 + }, + { + "epoch": 0.47967813418876926, + "grad_norm": 0.6568936705589294, + "learning_rate": 1.0637856958353925e-05, + "loss": 0.3837, + "step": 21520 + }, + { + "epoch": 0.47978958356938933, + "grad_norm": 0.426479697227478, + "learning_rate": 1.0634362731272123e-05, + "loss": 0.3434, + "step": 21525 + }, + { + "epoch": 0.47990103295000935, + "grad_norm": 0.8263179659843445, + "learning_rate": 1.0630868426422346e-05, + "loss": 0.1714, + "step": 21530 + }, + { + "epoch": 0.48001248233062943, + "grad_norm": 0.4639967978000641, + "learning_rate": 1.0627374044232973e-05, + "loss": 0.2999, + "step": 21535 + }, + { + "epoch": 0.4801239317112495, + "grad_norm": 0.6227473020553589, + "learning_rate": 1.0623879585132384e-05, + "loss": 0.3419, + "step": 21540 + }, + { + "epoch": 0.4802353810918696, + "grad_norm": 0.47798821330070496, + "learning_rate": 1.0620385049548976e-05, + "loss": 0.2656, + "step": 21545 + }, + { + "epoch": 0.48034683047248966, + "grad_norm": 0.6013455390930176, + "learning_rate": 1.0616890437911146e-05, + "loss": 0.3157, + "step": 21550 + }, + { + "epoch": 0.48045827985310974, + "grad_norm": 0.667288601398468, + "learning_rate": 1.0613395750647308e-05, + "loss": 0.291, + "step": 21555 + }, + { + "epoch": 0.48056972923372976, + "grad_norm": 0.808897852897644, + "learning_rate": 1.060990098818589e-05, + "loss": 0.3802, + "step": 21560 + }, + { + "epoch": 0.48068117861434984, + "grad_norm": 0.49802616238594055, + "learning_rate": 1.0606406150955315e-05, + "loss": 0.3905, + "step": 21565 + }, + { + "epoch": 0.4807926279949699, + "grad_norm": 0.7375686168670654, + "learning_rate": 1.0602911239384028e-05, + "loss": 0.3577, + "step": 21570 + }, + { + "epoch": 0.48090407737559, + "grad_norm": 0.5488283038139343, + "learning_rate": 1.0599416253900478e-05, + "loss": 0.4146, + "step": 21575 + }, + { + "epoch": 0.48101552675621007, + "grad_norm": 0.5572513341903687, + "learning_rate": 1.0595921194933124e-05, + "loss": 0.1912, + "step": 21580 + }, + { + "epoch": 0.48112697613683014, + "grad_norm": 1.139878273010254, + "learning_rate": 1.0592426062910426e-05, + "loss": 0.3301, + "step": 21585 + }, + { + "epoch": 0.48123842551745016, + "grad_norm": 0.5184080004692078, + "learning_rate": 1.0588930858260869e-05, + "loss": 0.4067, + "step": 21590 + }, + { + "epoch": 0.48134987489807024, + "grad_norm": 0.6013784408569336, + "learning_rate": 1.0585435581412933e-05, + "loss": 0.2887, + "step": 21595 + }, + { + "epoch": 0.4814613242786903, + "grad_norm": 0.8848481774330139, + "learning_rate": 1.0581940232795114e-05, + "loss": 0.3174, + "step": 21600 + }, + { + "epoch": 0.4815727736593104, + "grad_norm": 0.8106632232666016, + "learning_rate": 1.0578444812835914e-05, + "loss": 0.2261, + "step": 21605 + }, + { + "epoch": 0.48168422303993047, + "grad_norm": 0.6408175826072693, + "learning_rate": 1.0574949321963845e-05, + "loss": 0.2829, + "step": 21610 + }, + { + "epoch": 0.48179567242055055, + "grad_norm": 0.7407748103141785, + "learning_rate": 1.0571453760607428e-05, + "loss": 0.3721, + "step": 21615 + }, + { + "epoch": 0.48190712180117057, + "grad_norm": 0.6626582145690918, + "learning_rate": 1.056795812919519e-05, + "loss": 0.1896, + "step": 21620 + }, + { + "epoch": 0.48201857118179064, + "grad_norm": 0.5842767357826233, + "learning_rate": 1.0564462428155671e-05, + "loss": 0.3176, + "step": 21625 + }, + { + "epoch": 0.4821300205624107, + "grad_norm": 0.6812451481819153, + "learning_rate": 1.0560966657917414e-05, + "loss": 0.2068, + "step": 21630 + }, + { + "epoch": 0.4822414699430308, + "grad_norm": 0.5823196172714233, + "learning_rate": 1.0557470818908973e-05, + "loss": 0.3458, + "step": 21635 + }, + { + "epoch": 0.4823529193236509, + "grad_norm": 0.5161350965499878, + "learning_rate": 1.0553974911558916e-05, + "loss": 0.3995, + "step": 21640 + }, + { + "epoch": 0.48246436870427095, + "grad_norm": 0.3206993639469147, + "learning_rate": 1.055047893629581e-05, + "loss": 0.2528, + "step": 21645 + }, + { + "epoch": 0.482575818084891, + "grad_norm": 0.5969932675361633, + "learning_rate": 1.0546982893548234e-05, + "loss": 0.2286, + "step": 21650 + }, + { + "epoch": 0.48268726746551105, + "grad_norm": 0.7618280053138733, + "learning_rate": 1.0543486783744777e-05, + "loss": 0.288, + "step": 21655 + }, + { + "epoch": 0.4827987168461311, + "grad_norm": 0.726722002029419, + "learning_rate": 1.0539990607314036e-05, + "loss": 0.3569, + "step": 21660 + }, + { + "epoch": 0.4829101662267512, + "grad_norm": 0.5121591687202454, + "learning_rate": 1.0536494364684615e-05, + "loss": 0.3177, + "step": 21665 + }, + { + "epoch": 0.4830216156073713, + "grad_norm": 0.5842480659484863, + "learning_rate": 1.053299805628513e-05, + "loss": 0.3213, + "step": 21670 + }, + { + "epoch": 0.48313306498799136, + "grad_norm": 0.6678650379180908, + "learning_rate": 1.052950168254419e-05, + "loss": 0.329, + "step": 21675 + }, + { + "epoch": 0.4832445143686114, + "grad_norm": 0.6162123084068298, + "learning_rate": 1.0526005243890434e-05, + "loss": 0.3396, + "step": 21680 + }, + { + "epoch": 0.48335596374923145, + "grad_norm": 0.5280946493148804, + "learning_rate": 1.0522508740752498e-05, + "loss": 0.3658, + "step": 21685 + }, + { + "epoch": 0.48346741312985153, + "grad_norm": 0.4918314814567566, + "learning_rate": 1.051901217355902e-05, + "loss": 0.2598, + "step": 21690 + }, + { + "epoch": 0.4835788625104716, + "grad_norm": 1.1120338439941406, + "learning_rate": 1.0515515542738655e-05, + "loss": 0.2327, + "step": 21695 + }, + { + "epoch": 0.4836903118910917, + "grad_norm": 0.4988137483596802, + "learning_rate": 1.0512018848720067e-05, + "loss": 0.3211, + "step": 21700 + }, + { + "epoch": 0.4838017612717117, + "grad_norm": 0.6864381432533264, + "learning_rate": 1.050852209193192e-05, + "loss": 0.2267, + "step": 21705 + }, + { + "epoch": 0.4839132106523318, + "grad_norm": 0.924082338809967, + "learning_rate": 1.0505025272802885e-05, + "loss": 0.3091, + "step": 21710 + }, + { + "epoch": 0.48402466003295186, + "grad_norm": 0.5768184065818787, + "learning_rate": 1.0501528391761655e-05, + "loss": 0.236, + "step": 21715 + }, + { + "epoch": 0.48413610941357194, + "grad_norm": 0.6007546782493591, + "learning_rate": 1.0498031449236914e-05, + "loss": 0.387, + "step": 21720 + }, + { + "epoch": 0.484247558794192, + "grad_norm": 0.683722198009491, + "learning_rate": 1.0494534445657358e-05, + "loss": 0.3609, + "step": 21725 + }, + { + "epoch": 0.4843590081748121, + "grad_norm": 0.7989122867584229, + "learning_rate": 1.04910373814517e-05, + "loss": 0.3164, + "step": 21730 + }, + { + "epoch": 0.4844704575554321, + "grad_norm": 0.45317602157592773, + "learning_rate": 1.048754025704865e-05, + "loss": 0.3638, + "step": 21735 + }, + { + "epoch": 0.4845819069360522, + "grad_norm": 0.6904126405715942, + "learning_rate": 1.0484043072876927e-05, + "loss": 0.3686, + "step": 21740 + }, + { + "epoch": 0.48469335631667226, + "grad_norm": 0.683846652507782, + "learning_rate": 1.0480545829365257e-05, + "loss": 0.3373, + "step": 21745 + }, + { + "epoch": 0.48480480569729234, + "grad_norm": 0.8189532160758972, + "learning_rate": 1.0477048526942379e-05, + "loss": 0.3134, + "step": 21750 + }, + { + "epoch": 0.4849162550779124, + "grad_norm": 0.7070431113243103, + "learning_rate": 1.0473551166037035e-05, + "loss": 0.2997, + "step": 21755 + }, + { + "epoch": 0.4850277044585325, + "grad_norm": 0.7763333916664124, + "learning_rate": 1.047005374707797e-05, + "loss": 0.316, + "step": 21760 + }, + { + "epoch": 0.4851391538391525, + "grad_norm": 0.6913332343101501, + "learning_rate": 1.0466556270493948e-05, + "loss": 0.3186, + "step": 21765 + }, + { + "epoch": 0.4852506032197726, + "grad_norm": 0.5847534537315369, + "learning_rate": 1.0463058736713728e-05, + "loss": 0.4038, + "step": 21770 + }, + { + "epoch": 0.48536205260039267, + "grad_norm": 0.5487058758735657, + "learning_rate": 1.045956114616608e-05, + "loss": 0.3587, + "step": 21775 + }, + { + "epoch": 0.48547350198101274, + "grad_norm": 0.7386172413825989, + "learning_rate": 1.0456063499279783e-05, + "loss": 0.3604, + "step": 21780 + }, + { + "epoch": 0.4855849513616328, + "grad_norm": 0.4023517966270447, + "learning_rate": 1.0452565796483618e-05, + "loss": 0.2326, + "step": 21785 + }, + { + "epoch": 0.4856964007422529, + "grad_norm": 0.5407540202140808, + "learning_rate": 1.0449068038206385e-05, + "loss": 0.2755, + "step": 21790 + }, + { + "epoch": 0.4858078501228729, + "grad_norm": 0.5274724960327148, + "learning_rate": 1.0445570224876873e-05, + "loss": 0.3664, + "step": 21795 + }, + { + "epoch": 0.485919299503493, + "grad_norm": 1.241453766822815, + "learning_rate": 1.0442072356923893e-05, + "loss": 0.244, + "step": 21800 + }, + { + "epoch": 0.4860307488841131, + "grad_norm": 0.7574704885482788, + "learning_rate": 1.0438574434776255e-05, + "loss": 0.3552, + "step": 21805 + }, + { + "epoch": 0.48614219826473315, + "grad_norm": 0.492057204246521, + "learning_rate": 1.0435076458862774e-05, + "loss": 0.2605, + "step": 21810 + }, + { + "epoch": 0.4862536476453532, + "grad_norm": 0.6264439821243286, + "learning_rate": 1.0431578429612277e-05, + "loss": 0.1561, + "step": 21815 + }, + { + "epoch": 0.4863650970259733, + "grad_norm": 0.5074672102928162, + "learning_rate": 1.0428080347453597e-05, + "loss": 0.3695, + "step": 21820 + }, + { + "epoch": 0.4864765464065933, + "grad_norm": 0.49057939648628235, + "learning_rate": 1.0424582212815572e-05, + "loss": 0.324, + "step": 21825 + }, + { + "epoch": 0.4865879957872134, + "grad_norm": 0.6426665186882019, + "learning_rate": 1.0421084026127043e-05, + "loss": 0.4274, + "step": 21830 + }, + { + "epoch": 0.4866994451678335, + "grad_norm": 0.515211284160614, + "learning_rate": 1.0417585787816861e-05, + "loss": 0.3988, + "step": 21835 + }, + { + "epoch": 0.48681089454845355, + "grad_norm": 0.5060247778892517, + "learning_rate": 1.041408749831389e-05, + "loss": 0.4018, + "step": 21840 + }, + { + "epoch": 0.48692234392907363, + "grad_norm": 0.4912254512310028, + "learning_rate": 1.0410589158046986e-05, + "loss": 0.2778, + "step": 21845 + }, + { + "epoch": 0.4870337933096937, + "grad_norm": 1.7002813816070557, + "learning_rate": 1.0407090767445018e-05, + "loss": 0.3479, + "step": 21850 + }, + { + "epoch": 0.48714524269031373, + "grad_norm": 1.522349238395691, + "learning_rate": 1.0403592326936867e-05, + "loss": 0.2538, + "step": 21855 + }, + { + "epoch": 0.4872566920709338, + "grad_norm": 0.7782891988754272, + "learning_rate": 1.0400093836951412e-05, + "loss": 0.3399, + "step": 21860 + }, + { + "epoch": 0.4873681414515539, + "grad_norm": 0.6137044429779053, + "learning_rate": 1.0396595297917535e-05, + "loss": 0.3428, + "step": 21865 + }, + { + "epoch": 0.48747959083217396, + "grad_norm": 0.5912491679191589, + "learning_rate": 1.0393096710264143e-05, + "loss": 0.2999, + "step": 21870 + }, + { + "epoch": 0.48759104021279404, + "grad_norm": 0.6235821843147278, + "learning_rate": 1.0389598074420126e-05, + "loss": 0.428, + "step": 21875 + }, + { + "epoch": 0.48770248959341406, + "grad_norm": 0.5283631086349487, + "learning_rate": 1.0386099390814393e-05, + "loss": 0.2223, + "step": 21880 + }, + { + "epoch": 0.48781393897403413, + "grad_norm": 0.5237429141998291, + "learning_rate": 1.0382600659875852e-05, + "loss": 0.3552, + "step": 21885 + }, + { + "epoch": 0.4879253883546542, + "grad_norm": 1.0724443197250366, + "learning_rate": 1.0379101882033427e-05, + "loss": 0.4636, + "step": 21890 + }, + { + "epoch": 0.4880368377352743, + "grad_norm": 0.5929918885231018, + "learning_rate": 1.0375603057716035e-05, + "loss": 0.3857, + "step": 21895 + }, + { + "epoch": 0.48814828711589436, + "grad_norm": 0.5061531066894531, + "learning_rate": 1.0372104187352608e-05, + "loss": 0.2441, + "step": 21900 + }, + { + "epoch": 0.48825973649651444, + "grad_norm": 0.8347680568695068, + "learning_rate": 1.0368605271372082e-05, + "loss": 0.3225, + "step": 21905 + }, + { + "epoch": 0.48837118587713446, + "grad_norm": 0.550164520740509, + "learning_rate": 1.0365106310203392e-05, + "loss": 0.3056, + "step": 21910 + }, + { + "epoch": 0.48848263525775454, + "grad_norm": 0.7784298062324524, + "learning_rate": 1.0361607304275487e-05, + "loss": 0.2766, + "step": 21915 + }, + { + "epoch": 0.4885940846383746, + "grad_norm": 0.8520126342773438, + "learning_rate": 1.0358108254017319e-05, + "loss": 0.3047, + "step": 21920 + }, + { + "epoch": 0.4887055340189947, + "grad_norm": 0.5592504739761353, + "learning_rate": 1.035460915985784e-05, + "loss": 0.3923, + "step": 21925 + }, + { + "epoch": 0.48881698339961477, + "grad_norm": 0.44507914781570435, + "learning_rate": 1.0351110022226021e-05, + "loss": 0.2968, + "step": 21930 + }, + { + "epoch": 0.48892843278023485, + "grad_norm": 0.7121692299842834, + "learning_rate": 1.0347610841550821e-05, + "loss": 0.2145, + "step": 21935 + }, + { + "epoch": 0.48903988216085487, + "grad_norm": 0.49924153089523315, + "learning_rate": 1.034411161826121e-05, + "loss": 0.1799, + "step": 21940 + }, + { + "epoch": 0.48915133154147494, + "grad_norm": 0.5004662275314331, + "learning_rate": 1.0340612352786177e-05, + "loss": 0.249, + "step": 21945 + }, + { + "epoch": 0.489262780922095, + "grad_norm": 0.5530099868774414, + "learning_rate": 1.0337113045554696e-05, + "loss": 0.3849, + "step": 21950 + }, + { + "epoch": 0.4893742303027151, + "grad_norm": 0.8083066344261169, + "learning_rate": 1.033361369699576e-05, + "loss": 0.3543, + "step": 21955 + }, + { + "epoch": 0.4894856796833352, + "grad_norm": 0.7340057492256165, + "learning_rate": 1.0330114307538357e-05, + "loss": 0.3615, + "step": 21960 + }, + { + "epoch": 0.48959712906395525, + "grad_norm": 0.5993553400039673, + "learning_rate": 1.032661487761149e-05, + "loss": 0.3492, + "step": 21965 + }, + { + "epoch": 0.48970857844457527, + "grad_norm": 0.708200216293335, + "learning_rate": 1.0323115407644158e-05, + "loss": 0.3004, + "step": 21970 + }, + { + "epoch": 0.48982002782519535, + "grad_norm": 0.5713046789169312, + "learning_rate": 1.031961589806537e-05, + "loss": 0.3034, + "step": 21975 + }, + { + "epoch": 0.4899314772058154, + "grad_norm": 0.4956084191799164, + "learning_rate": 1.0316116349304144e-05, + "loss": 0.2956, + "step": 21980 + }, + { + "epoch": 0.4900429265864355, + "grad_norm": 0.6252642273902893, + "learning_rate": 1.0312616761789492e-05, + "loss": 0.2746, + "step": 21985 + }, + { + "epoch": 0.4901543759670556, + "grad_norm": 0.8498284220695496, + "learning_rate": 1.0309117135950437e-05, + "loss": 0.2398, + "step": 21990 + }, + { + "epoch": 0.49026582534767565, + "grad_norm": 0.5617647767066956, + "learning_rate": 1.0305617472216008e-05, + "loss": 0.3071, + "step": 21995 + }, + { + "epoch": 0.4903772747282957, + "grad_norm": 0.31070756912231445, + "learning_rate": 1.030211777101524e-05, + "loss": 0.2756, + "step": 22000 + }, + { + "epoch": 0.49048872410891575, + "grad_norm": 0.6978440880775452, + "learning_rate": 1.0298618032777159e-05, + "loss": 0.1967, + "step": 22005 + }, + { + "epoch": 0.49060017348953583, + "grad_norm": 0.57720947265625, + "learning_rate": 1.0295118257930813e-05, + "loss": 0.3435, + "step": 22010 + }, + { + "epoch": 0.4907116228701559, + "grad_norm": 0.5618628263473511, + "learning_rate": 1.029161844690525e-05, + "loss": 0.2808, + "step": 22015 + }, + { + "epoch": 0.490823072250776, + "grad_norm": 0.4151650667190552, + "learning_rate": 1.0288118600129511e-05, + "loss": 0.2021, + "step": 22020 + }, + { + "epoch": 0.49093452163139606, + "grad_norm": 0.6705240607261658, + "learning_rate": 1.0284618718032659e-05, + "loss": 0.3617, + "step": 22025 + }, + { + "epoch": 0.4910459710120161, + "grad_norm": 0.4305287003517151, + "learning_rate": 1.028111880104375e-05, + "loss": 0.2537, + "step": 22030 + }, + { + "epoch": 0.49115742039263616, + "grad_norm": 0.48318371176719666, + "learning_rate": 1.0277618849591845e-05, + "loss": 0.3553, + "step": 22035 + }, + { + "epoch": 0.49126886977325623, + "grad_norm": 0.6259504556655884, + "learning_rate": 1.027411886410601e-05, + "loss": 0.3473, + "step": 22040 + }, + { + "epoch": 0.4913803191538763, + "grad_norm": 0.32242828607559204, + "learning_rate": 1.027061884501532e-05, + "loss": 0.3195, + "step": 22045 + }, + { + "epoch": 0.4914917685344964, + "grad_norm": 1.1589083671569824, + "learning_rate": 1.0267118792748846e-05, + "loss": 0.3997, + "step": 22050 + }, + { + "epoch": 0.49160321791511646, + "grad_norm": 0.6759958267211914, + "learning_rate": 1.0263618707735676e-05, + "loss": 0.37, + "step": 22055 + }, + { + "epoch": 0.4917146672957365, + "grad_norm": 0.6928285360336304, + "learning_rate": 1.0260118590404881e-05, + "loss": 0.2775, + "step": 22060 + }, + { + "epoch": 0.49182611667635656, + "grad_norm": 0.7928364276885986, + "learning_rate": 1.0256618441185557e-05, + "loss": 0.2353, + "step": 22065 + }, + { + "epoch": 0.49193756605697664, + "grad_norm": 0.5948072075843811, + "learning_rate": 1.0253118260506793e-05, + "loss": 0.3934, + "step": 22070 + }, + { + "epoch": 0.4920490154375967, + "grad_norm": 0.8399053812026978, + "learning_rate": 1.0249618048797687e-05, + "loss": 0.3526, + "step": 22075 + }, + { + "epoch": 0.4921604648182168, + "grad_norm": 0.44372832775115967, + "learning_rate": 1.0246117806487328e-05, + "loss": 0.3371, + "step": 22080 + }, + { + "epoch": 0.4922719141988368, + "grad_norm": 0.5984147787094116, + "learning_rate": 1.0242617534004831e-05, + "loss": 0.177, + "step": 22085 + }, + { + "epoch": 0.4923833635794569, + "grad_norm": 0.4956534206867218, + "learning_rate": 1.0239117231779297e-05, + "loss": 0.3363, + "step": 22090 + }, + { + "epoch": 0.49249481296007697, + "grad_norm": 0.5281473994255066, + "learning_rate": 1.0235616900239831e-05, + "loss": 0.2418, + "step": 22095 + }, + { + "epoch": 0.49260626234069704, + "grad_norm": 0.5945612788200378, + "learning_rate": 1.0232116539815558e-05, + "loss": 0.3105, + "step": 22100 + }, + { + "epoch": 0.4927177117213171, + "grad_norm": 0.5451594591140747, + "learning_rate": 1.0228616150935587e-05, + "loss": 0.2799, + "step": 22105 + }, + { + "epoch": 0.4928291611019372, + "grad_norm": 0.6400839686393738, + "learning_rate": 1.0225115734029039e-05, + "loss": 0.288, + "step": 22110 + }, + { + "epoch": 0.4929406104825572, + "grad_norm": 0.43941572308540344, + "learning_rate": 1.022161528952504e-05, + "loss": 0.3765, + "step": 22115 + }, + { + "epoch": 0.4930520598631773, + "grad_norm": 0.6126463413238525, + "learning_rate": 1.0218114817852718e-05, + "loss": 0.3841, + "step": 22120 + }, + { + "epoch": 0.49316350924379737, + "grad_norm": 1.0110007524490356, + "learning_rate": 1.0214614319441202e-05, + "loss": 0.3867, + "step": 22125 + }, + { + "epoch": 0.49327495862441745, + "grad_norm": 0.7327465415000916, + "learning_rate": 1.0211113794719627e-05, + "loss": 0.2004, + "step": 22130 + }, + { + "epoch": 0.4933864080050375, + "grad_norm": 0.44659993052482605, + "learning_rate": 1.0207613244117132e-05, + "loss": 0.2642, + "step": 22135 + }, + { + "epoch": 0.4934978573856576, + "grad_norm": 0.701119601726532, + "learning_rate": 1.0204112668062853e-05, + "loss": 0.2889, + "step": 22140 + }, + { + "epoch": 0.4936093067662776, + "grad_norm": 0.42930060625076294, + "learning_rate": 1.020061206698594e-05, + "loss": 0.3402, + "step": 22145 + }, + { + "epoch": 0.4937207561468977, + "grad_norm": 0.6302748322486877, + "learning_rate": 1.0197111441315532e-05, + "loss": 0.3178, + "step": 22150 + }, + { + "epoch": 0.4938322055275178, + "grad_norm": 0.5891337394714355, + "learning_rate": 1.0193610791480789e-05, + "loss": 0.281, + "step": 22155 + }, + { + "epoch": 0.49394365490813785, + "grad_norm": 0.9075787663459778, + "learning_rate": 1.019011011791085e-05, + "loss": 0.3301, + "step": 22160 + }, + { + "epoch": 0.49405510428875793, + "grad_norm": 1.0279183387756348, + "learning_rate": 1.018660942103488e-05, + "loss": 0.2902, + "step": 22165 + }, + { + "epoch": 0.494166553669378, + "grad_norm": 0.4951561689376831, + "learning_rate": 1.018310870128204e-05, + "loss": 0.3365, + "step": 22170 + }, + { + "epoch": 0.494278003049998, + "grad_norm": 0.5057305693626404, + "learning_rate": 1.0179607959081482e-05, + "loss": 0.4527, + "step": 22175 + }, + { + "epoch": 0.4943894524306181, + "grad_norm": 0.640472948551178, + "learning_rate": 1.0176107194862377e-05, + "loss": 0.2863, + "step": 22180 + }, + { + "epoch": 0.4945009018112382, + "grad_norm": 0.7052475214004517, + "learning_rate": 1.0172606409053887e-05, + "loss": 0.4188, + "step": 22185 + }, + { + "epoch": 0.49461235119185826, + "grad_norm": 0.6334758996963501, + "learning_rate": 1.0169105602085182e-05, + "loss": 0.3385, + "step": 22190 + }, + { + "epoch": 0.49472380057247833, + "grad_norm": 0.6031564474105835, + "learning_rate": 1.0165604774385441e-05, + "loss": 0.2381, + "step": 22195 + }, + { + "epoch": 0.4948352499530984, + "grad_norm": 0.6718378067016602, + "learning_rate": 1.0162103926383828e-05, + "loss": 0.2258, + "step": 22200 + }, + { + "epoch": 0.49494669933371843, + "grad_norm": 0.5733515024185181, + "learning_rate": 1.0158603058509527e-05, + "loss": 0.2035, + "step": 22205 + }, + { + "epoch": 0.4950581487143385, + "grad_norm": 0.674278736114502, + "learning_rate": 1.0155102171191717e-05, + "loss": 0.3144, + "step": 22210 + }, + { + "epoch": 0.4951695980949586, + "grad_norm": 0.5201547741889954, + "learning_rate": 1.0151601264859577e-05, + "loss": 0.2628, + "step": 22215 + }, + { + "epoch": 0.49528104747557866, + "grad_norm": 0.5437789559364319, + "learning_rate": 1.0148100339942288e-05, + "loss": 0.1872, + "step": 22220 + }, + { + "epoch": 0.49539249685619874, + "grad_norm": 0.5899885892868042, + "learning_rate": 1.0144599396869046e-05, + "loss": 0.4151, + "step": 22225 + }, + { + "epoch": 0.4955039462368188, + "grad_norm": 1.0787954330444336, + "learning_rate": 1.0141098436069032e-05, + "loss": 0.314, + "step": 22230 + }, + { + "epoch": 0.49561539561743884, + "grad_norm": 0.6664475202560425, + "learning_rate": 1.0137597457971437e-05, + "loss": 0.3002, + "step": 22235 + }, + { + "epoch": 0.4957268449980589, + "grad_norm": 0.5675019025802612, + "learning_rate": 1.0134096463005459e-05, + "loss": 0.4102, + "step": 22240 + }, + { + "epoch": 0.495838294378679, + "grad_norm": 0.6319025754928589, + "learning_rate": 1.0130595451600289e-05, + "loss": 0.2957, + "step": 22245 + }, + { + "epoch": 0.49594974375929907, + "grad_norm": 0.5857518315315247, + "learning_rate": 1.0127094424185122e-05, + "loss": 0.3533, + "step": 22250 + }, + { + "epoch": 0.49606119313991914, + "grad_norm": 0.5915650725364685, + "learning_rate": 1.0123593381189161e-05, + "loss": 0.3708, + "step": 22255 + }, + { + "epoch": 0.49617264252053916, + "grad_norm": 0.5313299894332886, + "learning_rate": 1.0120092323041606e-05, + "loss": 0.3931, + "step": 22260 + }, + { + "epoch": 0.49628409190115924, + "grad_norm": 0.5031399726867676, + "learning_rate": 1.011659125017166e-05, + "loss": 0.4215, + "step": 22265 + }, + { + "epoch": 0.4963955412817793, + "grad_norm": 0.8380836248397827, + "learning_rate": 1.0113090163008525e-05, + "loss": 0.3264, + "step": 22270 + }, + { + "epoch": 0.4965069906623994, + "grad_norm": 0.6201096177101135, + "learning_rate": 1.0109589061981411e-05, + "loss": 0.2423, + "step": 22275 + }, + { + "epoch": 0.49661844004301947, + "grad_norm": 0.7164033651351929, + "learning_rate": 1.0106087947519521e-05, + "loss": 0.2604, + "step": 22280 + }, + { + "epoch": 0.49672988942363955, + "grad_norm": 0.5825961828231812, + "learning_rate": 1.010258682005207e-05, + "loss": 0.3742, + "step": 22285 + }, + { + "epoch": 0.49684133880425957, + "grad_norm": 0.5495718717575073, + "learning_rate": 1.0099085680008265e-05, + "loss": 0.2993, + "step": 22290 + }, + { + "epoch": 0.49695278818487965, + "grad_norm": 0.6401649713516235, + "learning_rate": 1.0095584527817319e-05, + "loss": 0.3758, + "step": 22295 + }, + { + "epoch": 0.4970642375654997, + "grad_norm": 0.5806993246078491, + "learning_rate": 1.0092083363908454e-05, + "loss": 0.3059, + "step": 22300 + }, + { + "epoch": 0.4971756869461198, + "grad_norm": 0.6632343530654907, + "learning_rate": 1.0088582188710876e-05, + "loss": 0.4287, + "step": 22305 + }, + { + "epoch": 0.4972871363267399, + "grad_norm": 0.33431074023246765, + "learning_rate": 1.0085081002653802e-05, + "loss": 0.2605, + "step": 22310 + }, + { + "epoch": 0.49739858570735995, + "grad_norm": 0.5944840312004089, + "learning_rate": 1.0081579806166456e-05, + "loss": 0.3084, + "step": 22315 + }, + { + "epoch": 0.49751003508798, + "grad_norm": 0.5025819540023804, + "learning_rate": 1.0078078599678058e-05, + "loss": 0.2872, + "step": 22320 + }, + { + "epoch": 0.49762148446860005, + "grad_norm": 0.5288582444190979, + "learning_rate": 1.0074577383617822e-05, + "loss": 0.3557, + "step": 22325 + }, + { + "epoch": 0.4977329338492201, + "grad_norm": 0.5158663392066956, + "learning_rate": 1.0071076158414977e-05, + "loss": 0.2734, + "step": 22330 + }, + { + "epoch": 0.4978443832298402, + "grad_norm": 0.7956809401512146, + "learning_rate": 1.0067574924498744e-05, + "loss": 0.3969, + "step": 22335 + }, + { + "epoch": 0.4979558326104603, + "grad_norm": 0.5699462890625, + "learning_rate": 1.0064073682298346e-05, + "loss": 0.2774, + "step": 22340 + }, + { + "epoch": 0.49806728199108036, + "grad_norm": 0.5688024759292603, + "learning_rate": 1.0060572432243007e-05, + "loss": 0.4471, + "step": 22345 + }, + { + "epoch": 0.4981787313717004, + "grad_norm": 0.4060691297054291, + "learning_rate": 1.0057071174761962e-05, + "loss": 0.271, + "step": 22350 + }, + { + "epoch": 0.49829018075232046, + "grad_norm": 0.6036604642868042, + "learning_rate": 1.0053569910284424e-05, + "loss": 0.3759, + "step": 22355 + }, + { + "epoch": 0.49840163013294053, + "grad_norm": 0.6653122305870056, + "learning_rate": 1.0050068639239632e-05, + "loss": 0.1751, + "step": 22360 + }, + { + "epoch": 0.4985130795135606, + "grad_norm": 0.554861843585968, + "learning_rate": 1.0046567362056811e-05, + "loss": 0.3071, + "step": 22365 + }, + { + "epoch": 0.4986245288941807, + "grad_norm": 0.6937253475189209, + "learning_rate": 1.0043066079165194e-05, + "loss": 0.2901, + "step": 22370 + }, + { + "epoch": 0.49873597827480076, + "grad_norm": 0.5724537968635559, + "learning_rate": 1.0039564790994002e-05, + "loss": 0.2876, + "step": 22375 + }, + { + "epoch": 0.4988474276554208, + "grad_norm": 0.5790332555770874, + "learning_rate": 1.0036063497972475e-05, + "loss": 0.3521, + "step": 22380 + }, + { + "epoch": 0.49895887703604086, + "grad_norm": 0.48917481303215027, + "learning_rate": 1.0032562200529843e-05, + "loss": 0.302, + "step": 22385 + }, + { + "epoch": 0.49907032641666094, + "grad_norm": 0.5448917150497437, + "learning_rate": 1.0029060899095335e-05, + "loss": 0.3715, + "step": 22390 + }, + { + "epoch": 0.499181775797281, + "grad_norm": 0.502838134765625, + "learning_rate": 1.0025559594098185e-05, + "loss": 0.2759, + "step": 22395 + }, + { + "epoch": 0.4992932251779011, + "grad_norm": 0.4868786334991455, + "learning_rate": 1.0022058285967627e-05, + "loss": 0.2882, + "step": 22400 + }, + { + "epoch": 0.49940467455852117, + "grad_norm": 0.671261191368103, + "learning_rate": 1.0018556975132891e-05, + "loss": 0.3673, + "step": 22405 + }, + { + "epoch": 0.4995161239391412, + "grad_norm": 0.553508996963501, + "learning_rate": 1.0015055662023217e-05, + "loss": 0.2697, + "step": 22410 + }, + { + "epoch": 0.49962757331976126, + "grad_norm": 0.6089357733726501, + "learning_rate": 1.0011554347067832e-05, + "loss": 0.4386, + "step": 22415 + }, + { + "epoch": 0.49973902270038134, + "grad_norm": 0.7945957183837891, + "learning_rate": 1.0008053030695972e-05, + "loss": 0.2507, + "step": 22420 + }, + { + "epoch": 0.4998504720810014, + "grad_norm": 0.658442497253418, + "learning_rate": 1.0004551713336876e-05, + "loss": 0.2068, + "step": 22425 + }, + { + "epoch": 0.4999619214616215, + "grad_norm": 0.527000367641449, + "learning_rate": 1.000105039541977e-05, + "loss": 0.2783, + "step": 22430 + }, + { + "epoch": 0.5000733708422416, + "grad_norm": 0.4612204432487488, + "learning_rate": 9.997549077373898e-06, + "loss": 0.2835, + "step": 22435 + }, + { + "epoch": 0.5001848202228616, + "grad_norm": 0.3289565145969391, + "learning_rate": 9.994047759628491e-06, + "loss": 0.239, + "step": 22440 + }, + { + "epoch": 0.5002962696034817, + "grad_norm": 0.7831099033355713, + "learning_rate": 9.990546442612783e-06, + "loss": 0.2561, + "step": 22445 + }, + { + "epoch": 0.5004077189841017, + "grad_norm": 0.6953529715538025, + "learning_rate": 9.987045126756005e-06, + "loss": 0.3294, + "step": 22450 + }, + { + "epoch": 0.5005191683647218, + "grad_norm": 0.6163928508758545, + "learning_rate": 9.983543812487397e-06, + "loss": 0.3615, + "step": 22455 + }, + { + "epoch": 0.5006306177453419, + "grad_norm": 0.34678909182548523, + "learning_rate": 9.980042500236187e-06, + "loss": 0.3397, + "step": 22460 + }, + { + "epoch": 0.5007420671259619, + "grad_norm": 0.5026941299438477, + "learning_rate": 9.976541190431616e-06, + "loss": 0.2778, + "step": 22465 + }, + { + "epoch": 0.500853516506582, + "grad_norm": 0.6653352975845337, + "learning_rate": 9.973039883502912e-06, + "loss": 0.2742, + "step": 22470 + }, + { + "epoch": 0.5009649658872021, + "grad_norm": 0.6822400093078613, + "learning_rate": 9.969538579879312e-06, + "loss": 0.3038, + "step": 22475 + }, + { + "epoch": 0.5010764152678221, + "grad_norm": 0.7787721157073975, + "learning_rate": 9.966037279990044e-06, + "loss": 0.2925, + "step": 22480 + }, + { + "epoch": 0.5011878646484422, + "grad_norm": 0.5366182327270508, + "learning_rate": 9.962535984264347e-06, + "loss": 0.3434, + "step": 22485 + }, + { + "epoch": 0.5012993140290622, + "grad_norm": 0.7126238346099854, + "learning_rate": 9.95903469313145e-06, + "loss": 0.3031, + "step": 22490 + }, + { + "epoch": 0.5014107634096824, + "grad_norm": 0.3990797996520996, + "learning_rate": 9.955533407020577e-06, + "loss": 0.4007, + "step": 22495 + }, + { + "epoch": 0.5015222127903024, + "grad_norm": 0.680857241153717, + "learning_rate": 9.952032126360968e-06, + "loss": 0.294, + "step": 22500 + }, + { + "epoch": 0.5016336621709225, + "grad_norm": 0.660984992980957, + "learning_rate": 9.948530851581853e-06, + "loss": 0.3406, + "step": 22505 + }, + { + "epoch": 0.5017451115515426, + "grad_norm": 1.1851459741592407, + "learning_rate": 9.945029583112456e-06, + "loss": 0.3037, + "step": 22510 + }, + { + "epoch": 0.5018565609321626, + "grad_norm": 0.5630490779876709, + "learning_rate": 9.941528321382008e-06, + "loss": 0.2537, + "step": 22515 + }, + { + "epoch": 0.5019680103127827, + "grad_norm": 0.5093672275543213, + "learning_rate": 9.938027066819738e-06, + "loss": 0.3292, + "step": 22520 + }, + { + "epoch": 0.5020794596934027, + "grad_norm": 0.6856085062026978, + "learning_rate": 9.934525819854868e-06, + "loss": 0.3213, + "step": 22525 + }, + { + "epoch": 0.5021909090740229, + "grad_norm": 0.5624552965164185, + "learning_rate": 9.931024580916626e-06, + "loss": 0.3323, + "step": 22530 + }, + { + "epoch": 0.5023023584546429, + "grad_norm": 0.6010044813156128, + "learning_rate": 9.927523350434243e-06, + "loss": 0.3975, + "step": 22535 + }, + { + "epoch": 0.5024138078352629, + "grad_norm": 0.6860697865486145, + "learning_rate": 9.924022128836937e-06, + "loss": 0.3569, + "step": 22540 + }, + { + "epoch": 0.502525257215883, + "grad_norm": 0.6552989482879639, + "learning_rate": 9.92052091655393e-06, + "loss": 0.3626, + "step": 22545 + }, + { + "epoch": 0.5026367065965031, + "grad_norm": 0.5610737204551697, + "learning_rate": 9.917019714014445e-06, + "loss": 0.3625, + "step": 22550 + }, + { + "epoch": 0.5027481559771232, + "grad_norm": 0.7716104388237, + "learning_rate": 9.913518521647702e-06, + "loss": 0.4169, + "step": 22555 + }, + { + "epoch": 0.5028596053577432, + "grad_norm": 0.3971973955631256, + "learning_rate": 9.910017339882924e-06, + "loss": 0.2835, + "step": 22560 + }, + { + "epoch": 0.5029710547383633, + "grad_norm": 0.5556774139404297, + "learning_rate": 9.90651616914932e-06, + "loss": 0.2799, + "step": 22565 + }, + { + "epoch": 0.5030825041189834, + "grad_norm": 0.6435865759849548, + "learning_rate": 9.903015009876115e-06, + "loss": 0.3382, + "step": 22570 + }, + { + "epoch": 0.5031939534996034, + "grad_norm": 0.6683843731880188, + "learning_rate": 9.899513862492521e-06, + "loss": 0.3051, + "step": 22575 + }, + { + "epoch": 0.5033054028802235, + "grad_norm": 0.5448920726776123, + "learning_rate": 9.896012727427754e-06, + "loss": 0.3286, + "step": 22580 + }, + { + "epoch": 0.5034168522608435, + "grad_norm": 0.6063219308853149, + "learning_rate": 9.892511605111024e-06, + "loss": 0.253, + "step": 22585 + }, + { + "epoch": 0.5035283016414637, + "grad_norm": 0.877792477607727, + "learning_rate": 9.889010495971538e-06, + "loss": 0.3547, + "step": 22590 + }, + { + "epoch": 0.5036397510220837, + "grad_norm": 0.4884467124938965, + "learning_rate": 9.885509400438512e-06, + "loss": 0.3147, + "step": 22595 + }, + { + "epoch": 0.5037512004027037, + "grad_norm": 0.6876420974731445, + "learning_rate": 9.882008318941145e-06, + "loss": 0.2954, + "step": 22600 + }, + { + "epoch": 0.5038626497833238, + "grad_norm": 0.5861078500747681, + "learning_rate": 9.87850725190865e-06, + "loss": 0.252, + "step": 22605 + }, + { + "epoch": 0.5039740991639439, + "grad_norm": 0.5254116654396057, + "learning_rate": 9.87500619977023e-06, + "loss": 0.4148, + "step": 22610 + }, + { + "epoch": 0.504085548544564, + "grad_norm": 0.6513392925262451, + "learning_rate": 9.871505162955084e-06, + "loss": 0.2827, + "step": 22615 + }, + { + "epoch": 0.504196997925184, + "grad_norm": 0.5880435705184937, + "learning_rate": 9.868004141892412e-06, + "loss": 0.3372, + "step": 22620 + }, + { + "epoch": 0.5043084473058042, + "grad_norm": 0.5500809550285339, + "learning_rate": 9.864503137011416e-06, + "loss": 0.4289, + "step": 22625 + }, + { + "epoch": 0.5044198966864242, + "grad_norm": 0.7847594022750854, + "learning_rate": 9.86100214874129e-06, + "loss": 0.3283, + "step": 22630 + }, + { + "epoch": 0.5045313460670442, + "grad_norm": 0.46724045276641846, + "learning_rate": 9.85750117751122e-06, + "loss": 0.2784, + "step": 22635 + }, + { + "epoch": 0.5046427954476643, + "grad_norm": 1.0864177942276, + "learning_rate": 9.854000223750412e-06, + "loss": 0.4388, + "step": 22640 + }, + { + "epoch": 0.5047542448282843, + "grad_norm": 0.3599768877029419, + "learning_rate": 9.85049928788805e-06, + "loss": 0.2593, + "step": 22645 + }, + { + "epoch": 0.5048656942089045, + "grad_norm": 0.3865143060684204, + "learning_rate": 9.846998370353318e-06, + "loss": 0.2861, + "step": 22650 + }, + { + "epoch": 0.5049771435895245, + "grad_norm": 0.5876185894012451, + "learning_rate": 9.843497471575409e-06, + "loss": 0.3759, + "step": 22655 + }, + { + "epoch": 0.5050885929701445, + "grad_norm": 0.48538899421691895, + "learning_rate": 9.8399965919835e-06, + "loss": 0.276, + "step": 22660 + }, + { + "epoch": 0.5052000423507647, + "grad_norm": 0.6529558897018433, + "learning_rate": 9.836495732006774e-06, + "loss": 0.4206, + "step": 22665 + }, + { + "epoch": 0.5053114917313847, + "grad_norm": 0.6429377794265747, + "learning_rate": 9.832994892074404e-06, + "loss": 0.3484, + "step": 22670 + }, + { + "epoch": 0.5054229411120048, + "grad_norm": 0.5969396233558655, + "learning_rate": 9.829494072615579e-06, + "loss": 0.2981, + "step": 22675 + }, + { + "epoch": 0.5055343904926248, + "grad_norm": 0.44130802154541016, + "learning_rate": 9.825993274059463e-06, + "loss": 0.1855, + "step": 22680 + }, + { + "epoch": 0.5056458398732449, + "grad_norm": 0.6406823992729187, + "learning_rate": 9.822492496835225e-06, + "loss": 0.3845, + "step": 22685 + }, + { + "epoch": 0.505757289253865, + "grad_norm": 0.7552033066749573, + "learning_rate": 9.81899174137204e-06, + "loss": 0.3, + "step": 22690 + }, + { + "epoch": 0.505868738634485, + "grad_norm": 0.5428248643875122, + "learning_rate": 9.815491008099068e-06, + "loss": 0.2329, + "step": 22695 + }, + { + "epoch": 0.5059801880151051, + "grad_norm": 0.5956423878669739, + "learning_rate": 9.811990297445477e-06, + "loss": 0.2874, + "step": 22700 + }, + { + "epoch": 0.5060916373957252, + "grad_norm": 0.5944362282752991, + "learning_rate": 9.80848960984042e-06, + "loss": 0.2433, + "step": 22705 + }, + { + "epoch": 0.5062030867763453, + "grad_norm": 0.7279775738716125, + "learning_rate": 9.804988945713062e-06, + "loss": 0.3592, + "step": 22710 + }, + { + "epoch": 0.5063145361569653, + "grad_norm": 0.7761232852935791, + "learning_rate": 9.801488305492553e-06, + "loss": 0.4098, + "step": 22715 + }, + { + "epoch": 0.5064259855375853, + "grad_norm": 0.5941323637962341, + "learning_rate": 9.797987689608047e-06, + "loss": 0.2724, + "step": 22720 + }, + { + "epoch": 0.5065374349182055, + "grad_norm": 0.6884803175926208, + "learning_rate": 9.79448709848869e-06, + "loss": 0.3842, + "step": 22725 + }, + { + "epoch": 0.5066488842988255, + "grad_norm": 0.4874166250228882, + "learning_rate": 9.790986532563627e-06, + "loss": 0.2648, + "step": 22730 + }, + { + "epoch": 0.5067603336794456, + "grad_norm": 0.5346237421035767, + "learning_rate": 9.787485992262004e-06, + "loss": 0.3159, + "step": 22735 + }, + { + "epoch": 0.5068717830600656, + "grad_norm": 0.6470242738723755, + "learning_rate": 9.783985478012954e-06, + "loss": 0.3349, + "step": 22740 + }, + { + "epoch": 0.5069832324406857, + "grad_norm": 0.5535328984260559, + "learning_rate": 9.780484990245619e-06, + "loss": 0.2287, + "step": 22745 + }, + { + "epoch": 0.5070946818213058, + "grad_norm": 0.7618942856788635, + "learning_rate": 9.776984529389132e-06, + "loss": 0.3348, + "step": 22750 + }, + { + "epoch": 0.5072061312019258, + "grad_norm": 0.524957001209259, + "learning_rate": 9.77348409587262e-06, + "loss": 0.3553, + "step": 22755 + }, + { + "epoch": 0.507317580582546, + "grad_norm": 0.6703402400016785, + "learning_rate": 9.769983690125208e-06, + "loss": 0.323, + "step": 22760 + }, + { + "epoch": 0.507429029963166, + "grad_norm": 0.7614783048629761, + "learning_rate": 9.766483312576022e-06, + "loss": 0.2614, + "step": 22765 + }, + { + "epoch": 0.5075404793437861, + "grad_norm": 0.6554802060127258, + "learning_rate": 9.762982963654178e-06, + "loss": 0.327, + "step": 22770 + }, + { + "epoch": 0.5076519287244061, + "grad_norm": 0.4557334780693054, + "learning_rate": 9.759482643788792e-06, + "loss": 0.3442, + "step": 22775 + }, + { + "epoch": 0.5077633781050261, + "grad_norm": 0.43107250332832336, + "learning_rate": 9.755982353408976e-06, + "loss": 0.2592, + "step": 22780 + }, + { + "epoch": 0.5078748274856463, + "grad_norm": 0.5819715261459351, + "learning_rate": 9.752482092943844e-06, + "loss": 0.3049, + "step": 22785 + }, + { + "epoch": 0.5079862768662663, + "grad_norm": 0.6508607864379883, + "learning_rate": 9.748981862822494e-06, + "loss": 0.2143, + "step": 22790 + }, + { + "epoch": 0.5080977262468864, + "grad_norm": 0.7444683313369751, + "learning_rate": 9.745481663474033e-06, + "loss": 0.3441, + "step": 22795 + }, + { + "epoch": 0.5082091756275064, + "grad_norm": 0.34373778104782104, + "learning_rate": 9.741981495327555e-06, + "loss": 0.3651, + "step": 22800 + }, + { + "epoch": 0.5083206250081265, + "grad_norm": 0.6767758131027222, + "learning_rate": 9.738481358812152e-06, + "loss": 0.2992, + "step": 22805 + }, + { + "epoch": 0.5084320743887466, + "grad_norm": 0.5160681009292603, + "learning_rate": 9.734981254356913e-06, + "loss": 0.3437, + "step": 22810 + }, + { + "epoch": 0.5085435237693666, + "grad_norm": 0.5730963945388794, + "learning_rate": 9.731481182390932e-06, + "loss": 0.3474, + "step": 22815 + }, + { + "epoch": 0.5086549731499868, + "grad_norm": 0.36844706535339355, + "learning_rate": 9.727981143343284e-06, + "loss": 0.2352, + "step": 22820 + }, + { + "epoch": 0.5087664225306068, + "grad_norm": 0.5087379813194275, + "learning_rate": 9.724481137643047e-06, + "loss": 0.4335, + "step": 22825 + }, + { + "epoch": 0.5088778719112268, + "grad_norm": 0.9651373624801636, + "learning_rate": 9.7209811657193e-06, + "loss": 0.3421, + "step": 22830 + }, + { + "epoch": 0.5089893212918469, + "grad_norm": 0.5327233076095581, + "learning_rate": 9.717481228001103e-06, + "loss": 0.2985, + "step": 22835 + }, + { + "epoch": 0.509100770672467, + "grad_norm": 0.6139131188392639, + "learning_rate": 9.713981324917529e-06, + "loss": 0.3014, + "step": 22840 + }, + { + "epoch": 0.5092122200530871, + "grad_norm": 0.766156017780304, + "learning_rate": 9.710481456897633e-06, + "loss": 0.2784, + "step": 22845 + }, + { + "epoch": 0.5093236694337071, + "grad_norm": 0.5155205130577087, + "learning_rate": 9.706981624370481e-06, + "loss": 0.2782, + "step": 22850 + }, + { + "epoch": 0.5094351188143272, + "grad_norm": 0.5603840947151184, + "learning_rate": 9.703481827765117e-06, + "loss": 0.2385, + "step": 22855 + }, + { + "epoch": 0.5095465681949473, + "grad_norm": 0.9208455085754395, + "learning_rate": 9.699982067510595e-06, + "loss": 0.2422, + "step": 22860 + }, + { + "epoch": 0.5096580175755673, + "grad_norm": 0.5859543681144714, + "learning_rate": 9.696482344035954e-06, + "loss": 0.3954, + "step": 22865 + }, + { + "epoch": 0.5097694669561874, + "grad_norm": 0.6649774312973022, + "learning_rate": 9.692982657770236e-06, + "loss": 0.2863, + "step": 22870 + }, + { + "epoch": 0.5098809163368074, + "grad_norm": 0.6705203652381897, + "learning_rate": 9.689483009142475e-06, + "loss": 0.3382, + "step": 22875 + }, + { + "epoch": 0.5099923657174276, + "grad_norm": 0.46043553948402405, + "learning_rate": 9.685983398581698e-06, + "loss": 0.299, + "step": 22880 + }, + { + "epoch": 0.5101038150980476, + "grad_norm": 0.9133014678955078, + "learning_rate": 9.68248382651693e-06, + "loss": 0.2856, + "step": 22885 + }, + { + "epoch": 0.5102152644786676, + "grad_norm": 0.866908073425293, + "learning_rate": 9.678984293377198e-06, + "loss": 0.3087, + "step": 22890 + }, + { + "epoch": 0.5103267138592877, + "grad_norm": 0.683866024017334, + "learning_rate": 9.675484799591515e-06, + "loss": 0.3811, + "step": 22895 + }, + { + "epoch": 0.5104381632399078, + "grad_norm": 0.585959792137146, + "learning_rate": 9.671985345588887e-06, + "loss": 0.3753, + "step": 22900 + }, + { + "epoch": 0.5105496126205279, + "grad_norm": 0.7192093729972839, + "learning_rate": 9.668485931798327e-06, + "loss": 0.2072, + "step": 22905 + }, + { + "epoch": 0.5106610620011479, + "grad_norm": 0.527054488658905, + "learning_rate": 9.66498655864883e-06, + "loss": 0.3902, + "step": 22910 + }, + { + "epoch": 0.510772511381768, + "grad_norm": 0.5373089909553528, + "learning_rate": 9.661487226569397e-06, + "loss": 0.2589, + "step": 22915 + }, + { + "epoch": 0.5108839607623881, + "grad_norm": 0.458723247051239, + "learning_rate": 9.657987935989014e-06, + "loss": 0.2575, + "step": 22920 + }, + { + "epoch": 0.5109954101430081, + "grad_norm": 0.5586103200912476, + "learning_rate": 9.654488687336673e-06, + "loss": 0.3368, + "step": 22925 + }, + { + "epoch": 0.5111068595236282, + "grad_norm": 1.0093234777450562, + "learning_rate": 9.65098948104135e-06, + "loss": 0.2647, + "step": 22930 + }, + { + "epoch": 0.5112183089042482, + "grad_norm": 0.7439244985580444, + "learning_rate": 9.647490317532026e-06, + "loss": 0.3487, + "step": 22935 + }, + { + "epoch": 0.5113297582848684, + "grad_norm": 0.7570911645889282, + "learning_rate": 9.643991197237668e-06, + "loss": 0.2823, + "step": 22940 + }, + { + "epoch": 0.5114412076654884, + "grad_norm": 0.5762554407119751, + "learning_rate": 9.640492120587237e-06, + "loss": 0.4238, + "step": 22945 + }, + { + "epoch": 0.5115526570461084, + "grad_norm": 0.42711982131004333, + "learning_rate": 9.636993088009703e-06, + "loss": 0.3863, + "step": 22950 + }, + { + "epoch": 0.5116641064267285, + "grad_norm": 0.5918402075767517, + "learning_rate": 9.633494099934008e-06, + "loss": 0.3001, + "step": 22955 + }, + { + "epoch": 0.5117755558073486, + "grad_norm": 0.4869014322757721, + "learning_rate": 9.62999515678911e-06, + "loss": 0.2149, + "step": 22960 + }, + { + "epoch": 0.5118870051879687, + "grad_norm": 0.6010558605194092, + "learning_rate": 9.62649625900395e-06, + "loss": 0.3213, + "step": 22965 + }, + { + "epoch": 0.5119984545685887, + "grad_norm": 0.5538244247436523, + "learning_rate": 9.622997407007467e-06, + "loss": 0.2962, + "step": 22970 + }, + { + "epoch": 0.5121099039492089, + "grad_norm": 0.5053831338882446, + "learning_rate": 9.61949860122859e-06, + "loss": 0.3091, + "step": 22975 + }, + { + "epoch": 0.5122213533298289, + "grad_norm": 0.4428863525390625, + "learning_rate": 9.61599984209625e-06, + "loss": 0.4138, + "step": 22980 + }, + { + "epoch": 0.5123328027104489, + "grad_norm": 0.8053079843521118, + "learning_rate": 9.612501130039364e-06, + "loss": 0.4502, + "step": 22985 + }, + { + "epoch": 0.512444252091069, + "grad_norm": 0.6244291067123413, + "learning_rate": 9.609002465486845e-06, + "loss": 0.3958, + "step": 22990 + }, + { + "epoch": 0.512555701471689, + "grad_norm": 0.6197370290756226, + "learning_rate": 9.605503848867608e-06, + "loss": 0.3291, + "step": 22995 + }, + { + "epoch": 0.5126671508523092, + "grad_norm": 0.7503328323364258, + "learning_rate": 9.602005280610556e-06, + "loss": 0.2703, + "step": 23000 + }, + { + "epoch": 0.5127786002329292, + "grad_norm": 0.608693540096283, + "learning_rate": 9.598506761144582e-06, + "loss": 0.3146, + "step": 23005 + }, + { + "epoch": 0.5128900496135492, + "grad_norm": 0.6147252917289734, + "learning_rate": 9.595008290898582e-06, + "loss": 0.2822, + "step": 23010 + }, + { + "epoch": 0.5130014989941694, + "grad_norm": 0.6361362934112549, + "learning_rate": 9.59150987030144e-06, + "loss": 0.2855, + "step": 23015 + }, + { + "epoch": 0.5131129483747894, + "grad_norm": 0.5929672122001648, + "learning_rate": 9.588011499782033e-06, + "loss": 0.3091, + "step": 23020 + }, + { + "epoch": 0.5132243977554095, + "grad_norm": 0.5976213216781616, + "learning_rate": 9.584513179769233e-06, + "loss": 0.344, + "step": 23025 + }, + { + "epoch": 0.5133358471360295, + "grad_norm": 0.612542450428009, + "learning_rate": 9.581014910691915e-06, + "loss": 0.3292, + "step": 23030 + }, + { + "epoch": 0.5134472965166496, + "grad_norm": 0.6661171317100525, + "learning_rate": 9.577516692978935e-06, + "loss": 0.3059, + "step": 23035 + }, + { + "epoch": 0.5135587458972697, + "grad_norm": 0.6575843691825867, + "learning_rate": 9.574018527059144e-06, + "loss": 0.2872, + "step": 23040 + }, + { + "epoch": 0.5136701952778897, + "grad_norm": 0.5535309314727783, + "learning_rate": 9.570520413361398e-06, + "loss": 0.2884, + "step": 23045 + }, + { + "epoch": 0.5137816446585098, + "grad_norm": 0.7088550925254822, + "learning_rate": 9.56702235231453e-06, + "loss": 0.3696, + "step": 23050 + }, + { + "epoch": 0.5138930940391299, + "grad_norm": 0.6000550985336304, + "learning_rate": 9.563524344347384e-06, + "loss": 0.5036, + "step": 23055 + }, + { + "epoch": 0.51400454341975, + "grad_norm": 0.6340003609657288, + "learning_rate": 9.560026389888777e-06, + "loss": 0.3908, + "step": 23060 + }, + { + "epoch": 0.51411599280037, + "grad_norm": 0.6714855432510376, + "learning_rate": 9.556528489367545e-06, + "loss": 0.2814, + "step": 23065 + }, + { + "epoch": 0.51422744218099, + "grad_norm": 0.7142869830131531, + "learning_rate": 9.553030643212494e-06, + "loss": 0.2587, + "step": 23070 + }, + { + "epoch": 0.5143388915616102, + "grad_norm": 0.667551577091217, + "learning_rate": 9.54953285185244e-06, + "loss": 0.2403, + "step": 23075 + }, + { + "epoch": 0.5144503409422302, + "grad_norm": 0.5600367784500122, + "learning_rate": 9.546035115716178e-06, + "loss": 0.221, + "step": 23080 + }, + { + "epoch": 0.5145617903228503, + "grad_norm": 0.3773113489151001, + "learning_rate": 9.542537435232508e-06, + "loss": 0.2968, + "step": 23085 + }, + { + "epoch": 0.5146732397034703, + "grad_norm": 0.6152325868606567, + "learning_rate": 9.539039810830217e-06, + "loss": 0.3402, + "step": 23090 + }, + { + "epoch": 0.5147846890840904, + "grad_norm": 0.6277587413787842, + "learning_rate": 9.535542242938084e-06, + "loss": 0.3275, + "step": 23095 + }, + { + "epoch": 0.5148961384647105, + "grad_norm": 0.4143803119659424, + "learning_rate": 9.532044731984889e-06, + "loss": 0.2836, + "step": 23100 + }, + { + "epoch": 0.5150075878453305, + "grad_norm": 0.5582551956176758, + "learning_rate": 9.5285472783994e-06, + "loss": 0.4107, + "step": 23105 + }, + { + "epoch": 0.5151190372259506, + "grad_norm": 0.48419493436813354, + "learning_rate": 9.525049882610374e-06, + "loss": 0.2143, + "step": 23110 + }, + { + "epoch": 0.5152304866065707, + "grad_norm": 0.7525806427001953, + "learning_rate": 9.521552545046566e-06, + "loss": 0.3405, + "step": 23115 + }, + { + "epoch": 0.5153419359871908, + "grad_norm": 0.5951476693153381, + "learning_rate": 9.518055266136725e-06, + "loss": 0.2919, + "step": 23120 + }, + { + "epoch": 0.5154533853678108, + "grad_norm": 0.7247080206871033, + "learning_rate": 9.514558046309585e-06, + "loss": 0.3474, + "step": 23125 + }, + { + "epoch": 0.5155648347484308, + "grad_norm": 0.8065999150276184, + "learning_rate": 9.511060885993883e-06, + "loss": 0.2848, + "step": 23130 + }, + { + "epoch": 0.515676284129051, + "grad_norm": 0.5279972553253174, + "learning_rate": 9.507563785618343e-06, + "loss": 0.3519, + "step": 23135 + }, + { + "epoch": 0.515787733509671, + "grad_norm": 0.6969117522239685, + "learning_rate": 9.504066745611682e-06, + "loss": 0.281, + "step": 23140 + }, + { + "epoch": 0.5158991828902911, + "grad_norm": 0.4764590263366699, + "learning_rate": 9.500569766402607e-06, + "loss": 0.1984, + "step": 23145 + }, + { + "epoch": 0.5160106322709112, + "grad_norm": 0.5910069942474365, + "learning_rate": 9.497072848419828e-06, + "loss": 0.3462, + "step": 23150 + }, + { + "epoch": 0.5161220816515312, + "grad_norm": 0.4852128326892853, + "learning_rate": 9.493575992092035e-06, + "loss": 0.245, + "step": 23155 + }, + { + "epoch": 0.5162335310321513, + "grad_norm": 0.629984438419342, + "learning_rate": 9.490079197847915e-06, + "loss": 0.265, + "step": 23160 + }, + { + "epoch": 0.5163449804127713, + "grad_norm": 0.4971965551376343, + "learning_rate": 9.486582466116147e-06, + "loss": 0.1871, + "step": 23165 + }, + { + "epoch": 0.5164564297933915, + "grad_norm": 0.781303346157074, + "learning_rate": 9.483085797325408e-06, + "loss": 0.2245, + "step": 23170 + }, + { + "epoch": 0.5165678791740115, + "grad_norm": 0.6366822719573975, + "learning_rate": 9.47958919190436e-06, + "loss": 0.322, + "step": 23175 + }, + { + "epoch": 0.5166793285546316, + "grad_norm": 0.6994455456733704, + "learning_rate": 9.476092650281661e-06, + "loss": 0.3182, + "step": 23180 + }, + { + "epoch": 0.5167907779352516, + "grad_norm": 0.5690954923629761, + "learning_rate": 9.472596172885962e-06, + "loss": 0.332, + "step": 23185 + }, + { + "epoch": 0.5169022273158717, + "grad_norm": 0.3832198977470398, + "learning_rate": 9.469099760145896e-06, + "loss": 0.3166, + "step": 23190 + }, + { + "epoch": 0.5170136766964918, + "grad_norm": 0.8475557565689087, + "learning_rate": 9.465603412490105e-06, + "loss": 0.2004, + "step": 23195 + }, + { + "epoch": 0.5171251260771118, + "grad_norm": 0.5788425207138062, + "learning_rate": 9.462107130347206e-06, + "loss": 0.2165, + "step": 23200 + }, + { + "epoch": 0.5172365754577319, + "grad_norm": 0.5698291063308716, + "learning_rate": 9.458610914145826e-06, + "loss": 0.3324, + "step": 23205 + }, + { + "epoch": 0.517348024838352, + "grad_norm": 0.5328242778778076, + "learning_rate": 9.455114764314566e-06, + "loss": 0.3755, + "step": 23210 + }, + { + "epoch": 0.517459474218972, + "grad_norm": 0.49963513016700745, + "learning_rate": 9.451618681282034e-06, + "loss": 0.3728, + "step": 23215 + }, + { + "epoch": 0.5175709235995921, + "grad_norm": 0.5842313170433044, + "learning_rate": 9.448122665476814e-06, + "loss": 0.3258, + "step": 23220 + }, + { + "epoch": 0.5176823729802121, + "grad_norm": 0.6654723286628723, + "learning_rate": 9.444626717327499e-06, + "loss": 0.2704, + "step": 23225 + }, + { + "epoch": 0.5177938223608323, + "grad_norm": 0.3355579674243927, + "learning_rate": 9.441130837262662e-06, + "loss": 0.3187, + "step": 23230 + }, + { + "epoch": 0.5179052717414523, + "grad_norm": 0.6794620752334595, + "learning_rate": 9.437635025710863e-06, + "loss": 0.3512, + "step": 23235 + }, + { + "epoch": 0.5180167211220723, + "grad_norm": 0.9019680619239807, + "learning_rate": 9.434139283100674e-06, + "loss": 0.4195, + "step": 23240 + }, + { + "epoch": 0.5181281705026924, + "grad_norm": 0.7227687835693359, + "learning_rate": 9.430643609860644e-06, + "loss": 0.3432, + "step": 23245 + }, + { + "epoch": 0.5182396198833125, + "grad_norm": 0.42706429958343506, + "learning_rate": 9.427148006419312e-06, + "loss": 0.2238, + "step": 23250 + }, + { + "epoch": 0.5183510692639326, + "grad_norm": 0.6106488704681396, + "learning_rate": 9.42365247320521e-06, + "loss": 0.2121, + "step": 23255 + }, + { + "epoch": 0.5184625186445526, + "grad_norm": 0.7338367104530334, + "learning_rate": 9.42015701064687e-06, + "loss": 0.424, + "step": 23260 + }, + { + "epoch": 0.5185739680251727, + "grad_norm": 0.5893292427062988, + "learning_rate": 9.4166616191728e-06, + "loss": 0.3253, + "step": 23265 + }, + { + "epoch": 0.5186854174057928, + "grad_norm": 0.6662803292274475, + "learning_rate": 9.41316629921151e-06, + "loss": 0.4152, + "step": 23270 + }, + { + "epoch": 0.5187968667864128, + "grad_norm": 0.5518497228622437, + "learning_rate": 9.40967105119151e-06, + "loss": 0.3649, + "step": 23275 + }, + { + "epoch": 0.5189083161670329, + "grad_norm": 0.6257652044296265, + "learning_rate": 9.40617587554128e-06, + "loss": 0.4227, + "step": 23280 + }, + { + "epoch": 0.519019765547653, + "grad_norm": 0.7822826504707336, + "learning_rate": 9.402680772689303e-06, + "loss": 0.2985, + "step": 23285 + }, + { + "epoch": 0.5191312149282731, + "grad_norm": 0.6136816143989563, + "learning_rate": 9.399185743064055e-06, + "loss": 0.3674, + "step": 23290 + }, + { + "epoch": 0.5192426643088931, + "grad_norm": 0.7093392014503479, + "learning_rate": 9.395690787093995e-06, + "loss": 0.2932, + "step": 23295 + }, + { + "epoch": 0.5193541136895131, + "grad_norm": 0.5666813254356384, + "learning_rate": 9.392195905207581e-06, + "loss": 0.2642, + "step": 23300 + }, + { + "epoch": 0.5194655630701333, + "grad_norm": 0.5724304914474487, + "learning_rate": 9.388701097833252e-06, + "loss": 0.3962, + "step": 23305 + }, + { + "epoch": 0.5195770124507533, + "grad_norm": 0.5531351566314697, + "learning_rate": 9.385206365399457e-06, + "loss": 0.279, + "step": 23310 + }, + { + "epoch": 0.5196884618313734, + "grad_norm": 0.5691862106323242, + "learning_rate": 9.381711708334613e-06, + "loss": 0.2841, + "step": 23315 + }, + { + "epoch": 0.5197999112119934, + "grad_norm": 0.4270021319389343, + "learning_rate": 9.378217127067144e-06, + "loss": 0.3192, + "step": 23320 + }, + { + "epoch": 0.5199113605926136, + "grad_norm": 0.684860348701477, + "learning_rate": 9.374722622025454e-06, + "loss": 0.3051, + "step": 23325 + }, + { + "epoch": 0.5200228099732336, + "grad_norm": 0.4866192638874054, + "learning_rate": 9.371228193637943e-06, + "loss": 0.3273, + "step": 23330 + }, + { + "epoch": 0.5201342593538536, + "grad_norm": 0.6408313512802124, + "learning_rate": 9.367733842333004e-06, + "loss": 0.34, + "step": 23335 + }, + { + "epoch": 0.5202457087344737, + "grad_norm": 0.8118225336074829, + "learning_rate": 9.364239568539012e-06, + "loss": 0.3525, + "step": 23340 + }, + { + "epoch": 0.5203571581150938, + "grad_norm": 0.5743765830993652, + "learning_rate": 9.360745372684346e-06, + "loss": 0.2737, + "step": 23345 + }, + { + "epoch": 0.5204686074957139, + "grad_norm": 0.47640687227249146, + "learning_rate": 9.35725125519736e-06, + "loss": 0.2322, + "step": 23350 + }, + { + "epoch": 0.5205800568763339, + "grad_norm": 0.4566701352596283, + "learning_rate": 9.353757216506411e-06, + "loss": 0.2681, + "step": 23355 + }, + { + "epoch": 0.5206915062569539, + "grad_norm": 1.2294862270355225, + "learning_rate": 9.350263257039837e-06, + "loss": 0.2964, + "step": 23360 + }, + { + "epoch": 0.5208029556375741, + "grad_norm": 0.4802538752555847, + "learning_rate": 9.346769377225974e-06, + "loss": 0.2547, + "step": 23365 + }, + { + "epoch": 0.5209144050181941, + "grad_norm": 0.6217039227485657, + "learning_rate": 9.343275577493146e-06, + "loss": 0.3537, + "step": 23370 + }, + { + "epoch": 0.5210258543988142, + "grad_norm": 0.41138404607772827, + "learning_rate": 9.339781858269655e-06, + "loss": 0.1877, + "step": 23375 + }, + { + "epoch": 0.5211373037794342, + "grad_norm": 0.6733947396278381, + "learning_rate": 9.336288219983817e-06, + "loss": 0.4094, + "step": 23380 + }, + { + "epoch": 0.5212487531600544, + "grad_norm": 0.7708743810653687, + "learning_rate": 9.332794663063922e-06, + "loss": 0.3424, + "step": 23385 + }, + { + "epoch": 0.5213602025406744, + "grad_norm": 0.5516330003738403, + "learning_rate": 9.329301187938249e-06, + "loss": 0.2601, + "step": 23390 + }, + { + "epoch": 0.5214716519212944, + "grad_norm": 0.8098486661911011, + "learning_rate": 9.325807795035074e-06, + "loss": 0.3064, + "step": 23395 + }, + { + "epoch": 0.5215831013019145, + "grad_norm": 0.628699779510498, + "learning_rate": 9.32231448478266e-06, + "loss": 0.2739, + "step": 23400 + }, + { + "epoch": 0.5216945506825346, + "grad_norm": 0.47968655824661255, + "learning_rate": 9.318821257609256e-06, + "loss": 0.3494, + "step": 23405 + }, + { + "epoch": 0.5218060000631547, + "grad_norm": 0.37704741954803467, + "learning_rate": 9.315328113943111e-06, + "loss": 0.3228, + "step": 23410 + }, + { + "epoch": 0.5219174494437747, + "grad_norm": 0.8004491925239563, + "learning_rate": 9.311835054212452e-06, + "loss": 0.3071, + "step": 23415 + }, + { + "epoch": 0.5220288988243947, + "grad_norm": 0.6016726493835449, + "learning_rate": 9.308342078845506e-06, + "loss": 0.4179, + "step": 23420 + }, + { + "epoch": 0.5221403482050149, + "grad_norm": 0.7621659636497498, + "learning_rate": 9.304849188270481e-06, + "loss": 0.3633, + "step": 23425 + }, + { + "epoch": 0.5222517975856349, + "grad_norm": 0.6040893793106079, + "learning_rate": 9.30135638291558e-06, + "loss": 0.3027, + "step": 23430 + }, + { + "epoch": 0.522363246966255, + "grad_norm": 0.8616956472396851, + "learning_rate": 9.297863663208995e-06, + "loss": 0.2637, + "step": 23435 + }, + { + "epoch": 0.522474696346875, + "grad_norm": 0.5304297208786011, + "learning_rate": 9.294371029578905e-06, + "loss": 0.2171, + "step": 23440 + }, + { + "epoch": 0.5225861457274951, + "grad_norm": 0.6328489780426025, + "learning_rate": 9.290878482453481e-06, + "loss": 0.4316, + "step": 23445 + }, + { + "epoch": 0.5226975951081152, + "grad_norm": 0.4458315074443817, + "learning_rate": 9.287386022260877e-06, + "loss": 0.278, + "step": 23450 + }, + { + "epoch": 0.5228090444887352, + "grad_norm": 0.750149130821228, + "learning_rate": 9.283893649429248e-06, + "loss": 0.3191, + "step": 23455 + }, + { + "epoch": 0.5229204938693554, + "grad_norm": 0.4905281662940979, + "learning_rate": 9.280401364386731e-06, + "loss": 0.2809, + "step": 23460 + }, + { + "epoch": 0.5230319432499754, + "grad_norm": 0.8535996079444885, + "learning_rate": 9.276909167561454e-06, + "loss": 0.3205, + "step": 23465 + }, + { + "epoch": 0.5231433926305955, + "grad_norm": 0.43157458305358887, + "learning_rate": 9.27341705938153e-06, + "loss": 0.1815, + "step": 23470 + }, + { + "epoch": 0.5232548420112155, + "grad_norm": 0.8516057729721069, + "learning_rate": 9.269925040275066e-06, + "loss": 0.2436, + "step": 23475 + }, + { + "epoch": 0.5233662913918355, + "grad_norm": 0.6305704116821289, + "learning_rate": 9.266433110670157e-06, + "loss": 0.2275, + "step": 23480 + }, + { + "epoch": 0.5234777407724557, + "grad_norm": 0.39994460344314575, + "learning_rate": 9.262941270994886e-06, + "loss": 0.3378, + "step": 23485 + }, + { + "epoch": 0.5235891901530757, + "grad_norm": 0.6393985748291016, + "learning_rate": 9.259449521677323e-06, + "loss": 0.3433, + "step": 23490 + }, + { + "epoch": 0.5237006395336958, + "grad_norm": 0.6144167184829712, + "learning_rate": 9.255957863145538e-06, + "loss": 0.3347, + "step": 23495 + }, + { + "epoch": 0.5238120889143159, + "grad_norm": 0.6233029961585999, + "learning_rate": 9.252466295827572e-06, + "loss": 0.3765, + "step": 23500 + }, + { + "epoch": 0.5239235382949359, + "grad_norm": 0.5702547430992126, + "learning_rate": 9.248974820151472e-06, + "loss": 0.2417, + "step": 23505 + }, + { + "epoch": 0.524034987675556, + "grad_norm": 0.47504016757011414, + "learning_rate": 9.24548343654526e-06, + "loss": 0.3524, + "step": 23510 + }, + { + "epoch": 0.524146437056176, + "grad_norm": 0.6024385094642639, + "learning_rate": 9.241992145436953e-06, + "loss": 0.2762, + "step": 23515 + }, + { + "epoch": 0.5242578864367962, + "grad_norm": 0.3721349835395813, + "learning_rate": 9.238500947254558e-06, + "loss": 0.2902, + "step": 23520 + }, + { + "epoch": 0.5243693358174162, + "grad_norm": 0.8836500644683838, + "learning_rate": 9.23500984242607e-06, + "loss": 0.4422, + "step": 23525 + }, + { + "epoch": 0.5244807851980363, + "grad_norm": 0.3741062879562378, + "learning_rate": 9.23151883137947e-06, + "loss": 0.2211, + "step": 23530 + }, + { + "epoch": 0.5245922345786563, + "grad_norm": 0.5023152232170105, + "learning_rate": 9.228027914542733e-06, + "loss": 0.289, + "step": 23535 + }, + { + "epoch": 0.5247036839592764, + "grad_norm": 0.3265573978424072, + "learning_rate": 9.224537092343812e-06, + "loss": 0.2948, + "step": 23540 + }, + { + "epoch": 0.5248151333398965, + "grad_norm": 0.5927016735076904, + "learning_rate": 9.221046365210659e-06, + "loss": 0.4409, + "step": 23545 + }, + { + "epoch": 0.5249265827205165, + "grad_norm": 0.5317661762237549, + "learning_rate": 9.217555733571208e-06, + "loss": 0.4329, + "step": 23550 + }, + { + "epoch": 0.5250380321011366, + "grad_norm": 0.6136202216148376, + "learning_rate": 9.214065197853383e-06, + "loss": 0.2988, + "step": 23555 + }, + { + "epoch": 0.5251494814817567, + "grad_norm": 0.7506454586982727, + "learning_rate": 9.2105747584851e-06, + "loss": 0.2864, + "step": 23560 + }, + { + "epoch": 0.5252609308623767, + "grad_norm": 0.5869132876396179, + "learning_rate": 9.207084415894258e-06, + "loss": 0.375, + "step": 23565 + }, + { + "epoch": 0.5253723802429968, + "grad_norm": 0.6257401704788208, + "learning_rate": 9.203594170508747e-06, + "loss": 0.3417, + "step": 23570 + }, + { + "epoch": 0.5254838296236168, + "grad_norm": 0.7179450392723083, + "learning_rate": 9.200104022756443e-06, + "loss": 0.296, + "step": 23575 + }, + { + "epoch": 0.525595279004237, + "grad_norm": 0.4368216097354889, + "learning_rate": 9.196613973065212e-06, + "loss": 0.2702, + "step": 23580 + }, + { + "epoch": 0.525706728384857, + "grad_norm": 0.5079479813575745, + "learning_rate": 9.193124021862907e-06, + "loss": 0.2096, + "step": 23585 + }, + { + "epoch": 0.5258181777654771, + "grad_norm": 0.6696041226387024, + "learning_rate": 9.189634169577365e-06, + "loss": 0.3816, + "step": 23590 + }, + { + "epoch": 0.5259296271460971, + "grad_norm": 0.8143077492713928, + "learning_rate": 9.186144416636422e-06, + "loss": 0.4566, + "step": 23595 + }, + { + "epoch": 0.5260410765267172, + "grad_norm": 0.8123489618301392, + "learning_rate": 9.18265476346789e-06, + "loss": 0.3496, + "step": 23600 + }, + { + "epoch": 0.5261525259073373, + "grad_norm": 0.405198335647583, + "learning_rate": 9.179165210499579e-06, + "loss": 0.2687, + "step": 23605 + }, + { + "epoch": 0.5262639752879573, + "grad_norm": 0.5814400911331177, + "learning_rate": 9.175675758159273e-06, + "loss": 0.2619, + "step": 23610 + }, + { + "epoch": 0.5263754246685775, + "grad_norm": 0.7175332307815552, + "learning_rate": 9.17218640687476e-06, + "loss": 0.3822, + "step": 23615 + }, + { + "epoch": 0.5264868740491975, + "grad_norm": 0.5167621970176697, + "learning_rate": 9.1686971570738e-06, + "loss": 0.2828, + "step": 23620 + }, + { + "epoch": 0.5265983234298175, + "grad_norm": 0.5517563819885254, + "learning_rate": 9.16520800918415e-06, + "loss": 0.3361, + "step": 23625 + }, + { + "epoch": 0.5267097728104376, + "grad_norm": 1.081498146057129, + "learning_rate": 9.16171896363356e-06, + "loss": 0.2769, + "step": 23630 + }, + { + "epoch": 0.5268212221910576, + "grad_norm": 0.5097198486328125, + "learning_rate": 9.158230020849754e-06, + "loss": 0.2926, + "step": 23635 + }, + { + "epoch": 0.5269326715716778, + "grad_norm": 0.9057488441467285, + "learning_rate": 9.154741181260446e-06, + "loss": 0.2774, + "step": 23640 + }, + { + "epoch": 0.5270441209522978, + "grad_norm": 0.8249532580375671, + "learning_rate": 9.151252445293349e-06, + "loss": 0.2342, + "step": 23645 + }, + { + "epoch": 0.5271555703329178, + "grad_norm": 0.5287266969680786, + "learning_rate": 9.14776381337615e-06, + "loss": 0.276, + "step": 23650 + }, + { + "epoch": 0.527267019713538, + "grad_norm": 0.8385326266288757, + "learning_rate": 9.144275285936527e-06, + "loss": 0.3539, + "step": 23655 + }, + { + "epoch": 0.527378469094158, + "grad_norm": 0.6637365818023682, + "learning_rate": 9.140786863402148e-06, + "loss": 0.3495, + "step": 23660 + }, + { + "epoch": 0.5274899184747781, + "grad_norm": 0.8243251442909241, + "learning_rate": 9.13729854620067e-06, + "loss": 0.3066, + "step": 23665 + }, + { + "epoch": 0.5276013678553981, + "grad_norm": 0.42031049728393555, + "learning_rate": 9.13381033475973e-06, + "loss": 0.3982, + "step": 23670 + }, + { + "epoch": 0.5277128172360183, + "grad_norm": 0.656934380531311, + "learning_rate": 9.130322229506958e-06, + "loss": 0.2919, + "step": 23675 + }, + { + "epoch": 0.5278242666166383, + "grad_norm": 0.742068886756897, + "learning_rate": 9.12683423086997e-06, + "loss": 0.4047, + "step": 23680 + }, + { + "epoch": 0.5279357159972583, + "grad_norm": 0.6763992309570312, + "learning_rate": 9.12334633927636e-06, + "loss": 0.1972, + "step": 23685 + }, + { + "epoch": 0.5280471653778784, + "grad_norm": 0.514451265335083, + "learning_rate": 9.119858555153727e-06, + "loss": 0.2933, + "step": 23690 + }, + { + "epoch": 0.5281586147584985, + "grad_norm": 0.512758731842041, + "learning_rate": 9.116370878929637e-06, + "loss": 0.2829, + "step": 23695 + }, + { + "epoch": 0.5282700641391186, + "grad_norm": 0.5397109389305115, + "learning_rate": 9.112883311031661e-06, + "loss": 0.2403, + "step": 23700 + }, + { + "epoch": 0.5283815135197386, + "grad_norm": 0.5646653771400452, + "learning_rate": 9.10939585188734e-06, + "loss": 0.2965, + "step": 23705 + }, + { + "epoch": 0.5284929629003586, + "grad_norm": 0.4140741527080536, + "learning_rate": 9.105908501924217e-06, + "loss": 0.3376, + "step": 23710 + }, + { + "epoch": 0.5286044122809788, + "grad_norm": 0.5725622177124023, + "learning_rate": 9.102421261569807e-06, + "loss": 0.1281, + "step": 23715 + }, + { + "epoch": 0.5287158616615988, + "grad_norm": 0.42752501368522644, + "learning_rate": 9.098934131251624e-06, + "loss": 0.2581, + "step": 23720 + }, + { + "epoch": 0.5288273110422189, + "grad_norm": 0.5737465023994446, + "learning_rate": 9.095447111397162e-06, + "loss": 0.2171, + "step": 23725 + }, + { + "epoch": 0.5289387604228389, + "grad_norm": 0.9149386882781982, + "learning_rate": 9.091960202433897e-06, + "loss": 0.2777, + "step": 23730 + }, + { + "epoch": 0.5290502098034591, + "grad_norm": 0.5786307454109192, + "learning_rate": 9.088473404789306e-06, + "loss": 0.2615, + "step": 23735 + }, + { + "epoch": 0.5291616591840791, + "grad_norm": 0.8515751957893372, + "learning_rate": 9.08498671889084e-06, + "loss": 0.449, + "step": 23740 + }, + { + "epoch": 0.5292731085646991, + "grad_norm": 0.7798794507980347, + "learning_rate": 9.08150014516594e-06, + "loss": 0.2805, + "step": 23745 + }, + { + "epoch": 0.5293845579453192, + "grad_norm": 0.7772887349128723, + "learning_rate": 9.078013684042032e-06, + "loss": 0.2888, + "step": 23750 + }, + { + "epoch": 0.5294960073259393, + "grad_norm": 0.4915827512741089, + "learning_rate": 9.07452733594653e-06, + "loss": 0.2785, + "step": 23755 + }, + { + "epoch": 0.5296074567065594, + "grad_norm": 0.2885194718837738, + "learning_rate": 9.071041101306832e-06, + "loss": 0.205, + "step": 23760 + }, + { + "epoch": 0.5297189060871794, + "grad_norm": 0.7066339254379272, + "learning_rate": 9.067554980550322e-06, + "loss": 0.3628, + "step": 23765 + }, + { + "epoch": 0.5298303554677994, + "grad_norm": 0.6108125448226929, + "learning_rate": 9.064068974104377e-06, + "loss": 0.2408, + "step": 23770 + }, + { + "epoch": 0.5299418048484196, + "grad_norm": 0.8365517258644104, + "learning_rate": 9.060583082396353e-06, + "loss": 0.3107, + "step": 23775 + }, + { + "epoch": 0.5300532542290396, + "grad_norm": 0.5426750779151917, + "learning_rate": 9.057097305853589e-06, + "loss": 0.2681, + "step": 23780 + }, + { + "epoch": 0.5301647036096597, + "grad_norm": 0.7357817888259888, + "learning_rate": 9.05361164490342e-06, + "loss": 0.3611, + "step": 23785 + }, + { + "epoch": 0.5302761529902797, + "grad_norm": 0.7073951959609985, + "learning_rate": 9.050126099973155e-06, + "loss": 0.2575, + "step": 23790 + }, + { + "epoch": 0.5303876023708998, + "grad_norm": 0.5723925828933716, + "learning_rate": 9.0466406714901e-06, + "loss": 0.4243, + "step": 23795 + }, + { + "epoch": 0.5304990517515199, + "grad_norm": 0.6931876540184021, + "learning_rate": 9.043155359881538e-06, + "loss": 0.2974, + "step": 23800 + }, + { + "epoch": 0.5306105011321399, + "grad_norm": 0.568409264087677, + "learning_rate": 9.039670165574747e-06, + "loss": 0.2453, + "step": 23805 + }, + { + "epoch": 0.53072195051276, + "grad_norm": 0.5373852252960205, + "learning_rate": 9.036185088996978e-06, + "loss": 0.3074, + "step": 23810 + }, + { + "epoch": 0.5308333998933801, + "grad_norm": 0.657902181148529, + "learning_rate": 9.03270013057548e-06, + "loss": 0.304, + "step": 23815 + }, + { + "epoch": 0.5309448492740002, + "grad_norm": 0.5374367833137512, + "learning_rate": 9.02921529073748e-06, + "loss": 0.3381, + "step": 23820 + }, + { + "epoch": 0.5310562986546202, + "grad_norm": 0.579827070236206, + "learning_rate": 9.025730569910189e-06, + "loss": 0.2773, + "step": 23825 + }, + { + "epoch": 0.5311677480352403, + "grad_norm": 0.6020088195800781, + "learning_rate": 9.022245968520812e-06, + "loss": 0.2669, + "step": 23830 + }, + { + "epoch": 0.5312791974158604, + "grad_norm": 0.24233196675777435, + "learning_rate": 9.018761486996529e-06, + "loss": 0.2554, + "step": 23835 + }, + { + "epoch": 0.5313906467964804, + "grad_norm": 1.0846818685531616, + "learning_rate": 9.015277125764515e-06, + "loss": 0.3221, + "step": 23840 + }, + { + "epoch": 0.5315020961771005, + "grad_norm": 0.4313613772392273, + "learning_rate": 9.011792885251926e-06, + "loss": 0.308, + "step": 23845 + }, + { + "epoch": 0.5316135455577206, + "grad_norm": 0.7550637125968933, + "learning_rate": 9.008308765885903e-06, + "loss": 0.3228, + "step": 23850 + }, + { + "epoch": 0.5317249949383406, + "grad_norm": 0.5088363289833069, + "learning_rate": 9.004824768093567e-06, + "loss": 0.2836, + "step": 23855 + }, + { + "epoch": 0.5318364443189607, + "grad_norm": 0.6121699213981628, + "learning_rate": 9.001340892302038e-06, + "loss": 0.2657, + "step": 23860 + }, + { + "epoch": 0.5319478936995807, + "grad_norm": 0.649071991443634, + "learning_rate": 8.997857138938404e-06, + "loss": 0.4241, + "step": 23865 + }, + { + "epoch": 0.5320593430802009, + "grad_norm": 0.7292037010192871, + "learning_rate": 8.994373508429745e-06, + "loss": 0.37, + "step": 23870 + }, + { + "epoch": 0.5321707924608209, + "grad_norm": 0.4739864468574524, + "learning_rate": 8.990890001203136e-06, + "loss": 0.3927, + "step": 23875 + }, + { + "epoch": 0.532282241841441, + "grad_norm": 0.7016490697860718, + "learning_rate": 8.987406617685625e-06, + "loss": 0.3176, + "step": 23880 + }, + { + "epoch": 0.532393691222061, + "grad_norm": 0.5824190974235535, + "learning_rate": 8.983923358304242e-06, + "loss": 0.1939, + "step": 23885 + }, + { + "epoch": 0.5325051406026811, + "grad_norm": 0.7529845237731934, + "learning_rate": 8.980440223486019e-06, + "loss": 0.3123, + "step": 23890 + }, + { + "epoch": 0.5326165899833012, + "grad_norm": 0.47603124380111694, + "learning_rate": 8.976957213657952e-06, + "loss": 0.2305, + "step": 23895 + }, + { + "epoch": 0.5327280393639212, + "grad_norm": 0.7694602012634277, + "learning_rate": 8.973474329247031e-06, + "loss": 0.2075, + "step": 23900 + }, + { + "epoch": 0.5328394887445413, + "grad_norm": 0.6289934515953064, + "learning_rate": 8.969991570680238e-06, + "loss": 0.3209, + "step": 23905 + }, + { + "epoch": 0.5329509381251614, + "grad_norm": 0.5181615352630615, + "learning_rate": 8.966508938384522e-06, + "loss": 0.2942, + "step": 23910 + }, + { + "epoch": 0.5330623875057814, + "grad_norm": 0.714810848236084, + "learning_rate": 8.963026432786839e-06, + "loss": 0.2761, + "step": 23915 + }, + { + "epoch": 0.5331738368864015, + "grad_norm": 0.5382199287414551, + "learning_rate": 8.959544054314107e-06, + "loss": 0.2745, + "step": 23920 + }, + { + "epoch": 0.5332852862670215, + "grad_norm": 0.6060092449188232, + "learning_rate": 8.956061803393246e-06, + "loss": 0.4041, + "step": 23925 + }, + { + "epoch": 0.5333967356476417, + "grad_norm": 0.46849480271339417, + "learning_rate": 8.952579680451147e-06, + "loss": 0.2914, + "step": 23930 + }, + { + "epoch": 0.5335081850282617, + "grad_norm": 0.5679491758346558, + "learning_rate": 8.949097685914698e-06, + "loss": 0.2236, + "step": 23935 + }, + { + "epoch": 0.5336196344088818, + "grad_norm": 0.6544236540794373, + "learning_rate": 8.94561582021076e-06, + "loss": 0.2899, + "step": 23940 + }, + { + "epoch": 0.5337310837895018, + "grad_norm": 0.5217140913009644, + "learning_rate": 8.942134083766182e-06, + "loss": 0.3035, + "step": 23945 + }, + { + "epoch": 0.5338425331701219, + "grad_norm": 0.4291645586490631, + "learning_rate": 8.938652477007799e-06, + "loss": 0.3018, + "step": 23950 + }, + { + "epoch": 0.533953982550742, + "grad_norm": 0.5148656964302063, + "learning_rate": 8.935171000362433e-06, + "loss": 0.2073, + "step": 23955 + }, + { + "epoch": 0.534065431931362, + "grad_norm": 0.629658579826355, + "learning_rate": 8.931689654256883e-06, + "loss": 0.2964, + "step": 23960 + }, + { + "epoch": 0.5341768813119822, + "grad_norm": 0.5942751169204712, + "learning_rate": 8.928208439117932e-06, + "loss": 0.3437, + "step": 23965 + }, + { + "epoch": 0.5342883306926022, + "grad_norm": 0.7543257474899292, + "learning_rate": 8.924727355372356e-06, + "loss": 0.3154, + "step": 23970 + }, + { + "epoch": 0.5343997800732222, + "grad_norm": 0.5518947839736938, + "learning_rate": 8.921246403446907e-06, + "loss": 0.2256, + "step": 23975 + }, + { + "epoch": 0.5345112294538423, + "grad_norm": 0.6838600039482117, + "learning_rate": 8.917765583768316e-06, + "loss": 0.373, + "step": 23980 + }, + { + "epoch": 0.5346226788344624, + "grad_norm": 0.284435898065567, + "learning_rate": 8.914284896763317e-06, + "loss": 0.1701, + "step": 23985 + }, + { + "epoch": 0.5347341282150825, + "grad_norm": 0.5570893287658691, + "learning_rate": 8.91080434285861e-06, + "loss": 0.3315, + "step": 23990 + }, + { + "epoch": 0.5348455775957025, + "grad_norm": 0.6959690451622009, + "learning_rate": 8.90732392248088e-06, + "loss": 0.3038, + "step": 23995 + }, + { + "epoch": 0.5349570269763225, + "grad_norm": 0.8016462326049805, + "learning_rate": 8.903843636056805e-06, + "loss": 0.394, + "step": 24000 + }, + { + "epoch": 0.5350684763569427, + "grad_norm": 0.5051465034484863, + "learning_rate": 8.900363484013041e-06, + "loss": 0.3425, + "step": 24005 + }, + { + "epoch": 0.5351799257375627, + "grad_norm": 0.4948039650917053, + "learning_rate": 8.896883466776223e-06, + "loss": 0.1765, + "step": 24010 + }, + { + "epoch": 0.5352913751181828, + "grad_norm": 0.764173686504364, + "learning_rate": 8.893403584772975e-06, + "loss": 0.2309, + "step": 24015 + }, + { + "epoch": 0.5354028244988028, + "grad_norm": 1.018734097480774, + "learning_rate": 8.889923838429909e-06, + "loss": 0.3934, + "step": 24020 + }, + { + "epoch": 0.535514273879423, + "grad_norm": 0.648385226726532, + "learning_rate": 8.88644422817361e-06, + "loss": 0.2999, + "step": 24025 + }, + { + "epoch": 0.535625723260043, + "grad_norm": 0.44655290246009827, + "learning_rate": 8.882964754430658e-06, + "loss": 0.3726, + "step": 24030 + }, + { + "epoch": 0.535737172640663, + "grad_norm": 0.5479897260665894, + "learning_rate": 8.879485417627602e-06, + "loss": 0.2738, + "step": 24035 + }, + { + "epoch": 0.5358486220212831, + "grad_norm": 0.36989253759384155, + "learning_rate": 8.876006218190985e-06, + "loss": 0.3391, + "step": 24040 + }, + { + "epoch": 0.5359600714019032, + "grad_norm": 0.6240692734718323, + "learning_rate": 8.872527156547329e-06, + "loss": 0.2586, + "step": 24045 + }, + { + "epoch": 0.5360715207825233, + "grad_norm": 0.416449636220932, + "learning_rate": 8.86904823312314e-06, + "loss": 0.2799, + "step": 24050 + }, + { + "epoch": 0.5361829701631433, + "grad_norm": 0.5444673895835876, + "learning_rate": 8.865569448344909e-06, + "loss": 0.3169, + "step": 24055 + }, + { + "epoch": 0.5362944195437633, + "grad_norm": 0.6248395442962646, + "learning_rate": 8.862090802639106e-06, + "loss": 0.2957, + "step": 24060 + }, + { + "epoch": 0.5364058689243835, + "grad_norm": 0.641238272190094, + "learning_rate": 8.85861229643219e-06, + "loss": 0.4187, + "step": 24065 + }, + { + "epoch": 0.5365173183050035, + "grad_norm": 0.8127478957176208, + "learning_rate": 8.855133930150594e-06, + "loss": 0.3227, + "step": 24070 + }, + { + "epoch": 0.5366287676856236, + "grad_norm": 0.7973223924636841, + "learning_rate": 8.851655704220743e-06, + "loss": 0.3224, + "step": 24075 + }, + { + "epoch": 0.5367402170662436, + "grad_norm": 0.7173725962638855, + "learning_rate": 8.848177619069039e-06, + "loss": 0.3197, + "step": 24080 + }, + { + "epoch": 0.5368516664468638, + "grad_norm": 0.5283668637275696, + "learning_rate": 8.844699675121865e-06, + "loss": 0.3195, + "step": 24085 + }, + { + "epoch": 0.5369631158274838, + "grad_norm": 0.5162988901138306, + "learning_rate": 8.841221872805595e-06, + "loss": 0.3324, + "step": 24090 + }, + { + "epoch": 0.5370745652081038, + "grad_norm": 0.807227611541748, + "learning_rate": 8.83774421254658e-06, + "loss": 0.262, + "step": 24095 + }, + { + "epoch": 0.537186014588724, + "grad_norm": 0.5646213889122009, + "learning_rate": 8.834266694771151e-06, + "loss": 0.2243, + "step": 24100 + }, + { + "epoch": 0.537297463969344, + "grad_norm": 0.5768153667449951, + "learning_rate": 8.83078931990563e-06, + "loss": 0.2814, + "step": 24105 + }, + { + "epoch": 0.5374089133499641, + "grad_norm": 0.6003533601760864, + "learning_rate": 8.827312088376315e-06, + "loss": 0.3577, + "step": 24110 + }, + { + "epoch": 0.5375203627305841, + "grad_norm": 0.5767202377319336, + "learning_rate": 8.823835000609482e-06, + "loss": 0.2962, + "step": 24115 + }, + { + "epoch": 0.5376318121112041, + "grad_norm": 0.34857234358787537, + "learning_rate": 8.820358057031397e-06, + "loss": 0.2262, + "step": 24120 + }, + { + "epoch": 0.5377432614918243, + "grad_norm": 0.615218997001648, + "learning_rate": 8.816881258068313e-06, + "loss": 0.2648, + "step": 24125 + }, + { + "epoch": 0.5378547108724443, + "grad_norm": 0.3658369779586792, + "learning_rate": 8.813404604146457e-06, + "loss": 0.2492, + "step": 24130 + }, + { + "epoch": 0.5379661602530644, + "grad_norm": 0.380628764629364, + "learning_rate": 8.809928095692033e-06, + "loss": 0.379, + "step": 24135 + }, + { + "epoch": 0.5380776096336845, + "grad_norm": 0.4054970443248749, + "learning_rate": 8.806451733131241e-06, + "loss": 0.2889, + "step": 24140 + }, + { + "epoch": 0.5381890590143046, + "grad_norm": 0.5545241236686707, + "learning_rate": 8.802975516890252e-06, + "loss": 0.254, + "step": 24145 + }, + { + "epoch": 0.5383005083949246, + "grad_norm": 0.43503686785697937, + "learning_rate": 8.799499447395227e-06, + "loss": 0.2407, + "step": 24150 + }, + { + "epoch": 0.5384119577755446, + "grad_norm": 0.817979633808136, + "learning_rate": 8.7960235250723e-06, + "loss": 0.3682, + "step": 24155 + }, + { + "epoch": 0.5385234071561648, + "grad_norm": 0.45893165469169617, + "learning_rate": 8.792547750347597e-06, + "loss": 0.385, + "step": 24160 + }, + { + "epoch": 0.5386348565367848, + "grad_norm": 0.5781266689300537, + "learning_rate": 8.78907212364722e-06, + "loss": 0.1913, + "step": 24165 + }, + { + "epoch": 0.5387463059174049, + "grad_norm": 0.70738685131073, + "learning_rate": 8.785596645397256e-06, + "loss": 0.2322, + "step": 24170 + }, + { + "epoch": 0.5388577552980249, + "grad_norm": 0.5673659443855286, + "learning_rate": 8.782121316023769e-06, + "loss": 0.3694, + "step": 24175 + }, + { + "epoch": 0.538969204678645, + "grad_norm": 0.48852887749671936, + "learning_rate": 8.778646135952805e-06, + "loss": 0.2731, + "step": 24180 + }, + { + "epoch": 0.5390806540592651, + "grad_norm": 0.7373262643814087, + "learning_rate": 8.7751711056104e-06, + "loss": 0.2755, + "step": 24185 + }, + { + "epoch": 0.5391921034398851, + "grad_norm": 0.5007327795028687, + "learning_rate": 8.771696225422559e-06, + "loss": 0.2156, + "step": 24190 + }, + { + "epoch": 0.5393035528205052, + "grad_norm": 0.7125994563102722, + "learning_rate": 8.768221495815282e-06, + "loss": 0.2494, + "step": 24195 + }, + { + "epoch": 0.5394150022011253, + "grad_norm": 0.5907738208770752, + "learning_rate": 8.764746917214543e-06, + "loss": 0.3115, + "step": 24200 + }, + { + "epoch": 0.5395264515817453, + "grad_norm": 0.6450021862983704, + "learning_rate": 8.761272490046299e-06, + "loss": 0.2341, + "step": 24205 + }, + { + "epoch": 0.5396379009623654, + "grad_norm": 0.5347104072570801, + "learning_rate": 8.757798214736483e-06, + "loss": 0.2313, + "step": 24210 + }, + { + "epoch": 0.5397493503429854, + "grad_norm": 0.5462760329246521, + "learning_rate": 8.754324091711021e-06, + "loss": 0.3637, + "step": 24215 + }, + { + "epoch": 0.5398607997236056, + "grad_norm": 0.39873236417770386, + "learning_rate": 8.75085012139581e-06, + "loss": 0.4349, + "step": 24220 + }, + { + "epoch": 0.5399722491042256, + "grad_norm": 0.5908456444740295, + "learning_rate": 8.747376304216726e-06, + "loss": 0.3994, + "step": 24225 + }, + { + "epoch": 0.5400836984848457, + "grad_norm": 0.6254735589027405, + "learning_rate": 8.74390264059964e-06, + "loss": 0.3037, + "step": 24230 + }, + { + "epoch": 0.5401951478654657, + "grad_norm": 0.7801318168640137, + "learning_rate": 8.740429130970399e-06, + "loss": 0.1759, + "step": 24235 + }, + { + "epoch": 0.5403065972460858, + "grad_norm": 1.0558874607086182, + "learning_rate": 8.736955775754821e-06, + "loss": 0.2624, + "step": 24240 + }, + { + "epoch": 0.5404180466267059, + "grad_norm": 0.5824124813079834, + "learning_rate": 8.733482575378718e-06, + "loss": 0.304, + "step": 24245 + }, + { + "epoch": 0.5405294960073259, + "grad_norm": 0.7328165173530579, + "learning_rate": 8.730009530267876e-06, + "loss": 0.2544, + "step": 24250 + }, + { + "epoch": 0.540640945387946, + "grad_norm": 0.7363235950469971, + "learning_rate": 8.72653664084806e-06, + "loss": 0.2932, + "step": 24255 + }, + { + "epoch": 0.5407523947685661, + "grad_norm": 0.43137842416763306, + "learning_rate": 8.72306390754502e-06, + "loss": 0.3143, + "step": 24260 + }, + { + "epoch": 0.5408638441491861, + "grad_norm": 0.7054587602615356, + "learning_rate": 8.719591330784493e-06, + "loss": 0.3246, + "step": 24265 + }, + { + "epoch": 0.5409752935298062, + "grad_norm": 0.6189272999763489, + "learning_rate": 8.716118910992185e-06, + "loss": 0.2671, + "step": 24270 + }, + { + "epoch": 0.5410867429104262, + "grad_norm": 0.8482842445373535, + "learning_rate": 8.712646648593786e-06, + "loss": 0.3937, + "step": 24275 + }, + { + "epoch": 0.5411981922910464, + "grad_norm": 0.7510126233100891, + "learning_rate": 8.709174544014974e-06, + "loss": 0.2735, + "step": 24280 + }, + { + "epoch": 0.5413096416716664, + "grad_norm": 0.804165780544281, + "learning_rate": 8.705702597681399e-06, + "loss": 0.3779, + "step": 24285 + }, + { + "epoch": 0.5414210910522865, + "grad_norm": 0.6668807864189148, + "learning_rate": 8.702230810018695e-06, + "loss": 0.2744, + "step": 24290 + }, + { + "epoch": 0.5415325404329066, + "grad_norm": 0.33997178077697754, + "learning_rate": 8.698759181452472e-06, + "loss": 0.2683, + "step": 24295 + }, + { + "epoch": 0.5416439898135266, + "grad_norm": 0.5800212621688843, + "learning_rate": 8.695287712408333e-06, + "loss": 0.3001, + "step": 24300 + }, + { + "epoch": 0.5417554391941467, + "grad_norm": 0.7384917140007019, + "learning_rate": 8.691816403311849e-06, + "loss": 0.3023, + "step": 24305 + }, + { + "epoch": 0.5418668885747667, + "grad_norm": 0.42454642057418823, + "learning_rate": 8.688345254588579e-06, + "loss": 0.2291, + "step": 24310 + }, + { + "epoch": 0.5419783379553869, + "grad_norm": 0.7141265869140625, + "learning_rate": 8.684874266664054e-06, + "loss": 0.3026, + "step": 24315 + }, + { + "epoch": 0.5420897873360069, + "grad_norm": 0.49102476239204407, + "learning_rate": 8.681403439963793e-06, + "loss": 0.3357, + "step": 24320 + }, + { + "epoch": 0.5422012367166269, + "grad_norm": 0.7738983631134033, + "learning_rate": 8.677932774913292e-06, + "loss": 0.3483, + "step": 24325 + }, + { + "epoch": 0.542312686097247, + "grad_norm": 0.7534337043762207, + "learning_rate": 8.674462271938026e-06, + "loss": 0.2816, + "step": 24330 + }, + { + "epoch": 0.542424135477867, + "grad_norm": 0.4947792887687683, + "learning_rate": 8.670991931463454e-06, + "loss": 0.3452, + "step": 24335 + }, + { + "epoch": 0.5425355848584872, + "grad_norm": 0.5832251310348511, + "learning_rate": 8.667521753915018e-06, + "loss": 0.3181, + "step": 24340 + }, + { + "epoch": 0.5426470342391072, + "grad_norm": 0.9216941595077515, + "learning_rate": 8.664051739718127e-06, + "loss": 0.4427, + "step": 24345 + }, + { + "epoch": 0.5427584836197273, + "grad_norm": 0.6378898024559021, + "learning_rate": 8.660581889298178e-06, + "loss": 0.2333, + "step": 24350 + }, + { + "epoch": 0.5428699330003474, + "grad_norm": 1.0858154296875, + "learning_rate": 8.657112203080555e-06, + "loss": 0.4015, + "step": 24355 + }, + { + "epoch": 0.5429813823809674, + "grad_norm": 0.6010323166847229, + "learning_rate": 8.653642681490608e-06, + "loss": 0.3105, + "step": 24360 + }, + { + "epoch": 0.5430928317615875, + "grad_norm": 0.7363236546516418, + "learning_rate": 8.650173324953675e-06, + "loss": 0.318, + "step": 24365 + }, + { + "epoch": 0.5432042811422075, + "grad_norm": 0.5955904722213745, + "learning_rate": 8.646704133895074e-06, + "loss": 0.2549, + "step": 24370 + }, + { + "epoch": 0.5433157305228277, + "grad_norm": 0.6519381999969482, + "learning_rate": 8.643235108740102e-06, + "loss": 0.3682, + "step": 24375 + }, + { + "epoch": 0.5434271799034477, + "grad_norm": 0.7452640533447266, + "learning_rate": 8.639766249914032e-06, + "loss": 0.2038, + "step": 24380 + }, + { + "epoch": 0.5435386292840677, + "grad_norm": 0.8365711569786072, + "learning_rate": 8.636297557842122e-06, + "loss": 0.3316, + "step": 24385 + }, + { + "epoch": 0.5436500786646878, + "grad_norm": 0.49274158477783203, + "learning_rate": 8.632829032949604e-06, + "loss": 0.3712, + "step": 24390 + }, + { + "epoch": 0.5437615280453079, + "grad_norm": 0.4177057445049286, + "learning_rate": 8.629360675661693e-06, + "loss": 0.3004, + "step": 24395 + }, + { + "epoch": 0.543872977425928, + "grad_norm": 0.6827482581138611, + "learning_rate": 8.62589248640358e-06, + "loss": 0.295, + "step": 24400 + }, + { + "epoch": 0.543984426806548, + "grad_norm": 0.626742422580719, + "learning_rate": 8.622424465600448e-06, + "loss": 0.1827, + "step": 24405 + }, + { + "epoch": 0.544095876187168, + "grad_norm": 0.6676792502403259, + "learning_rate": 8.618956613677438e-06, + "loss": 0.3464, + "step": 24410 + }, + { + "epoch": 0.5442073255677882, + "grad_norm": 0.5386606454849243, + "learning_rate": 8.615488931059693e-06, + "loss": 0.2527, + "step": 24415 + }, + { + "epoch": 0.5443187749484082, + "grad_norm": 0.6125070452690125, + "learning_rate": 8.612021418172316e-06, + "loss": 0.2888, + "step": 24420 + }, + { + "epoch": 0.5444302243290283, + "grad_norm": 0.8366973996162415, + "learning_rate": 8.608554075440397e-06, + "loss": 0.3497, + "step": 24425 + }, + { + "epoch": 0.5445416737096483, + "grad_norm": 0.41520625352859497, + "learning_rate": 8.605086903289011e-06, + "loss": 0.3532, + "step": 24430 + }, + { + "epoch": 0.5446531230902685, + "grad_norm": 0.5745276212692261, + "learning_rate": 8.601619902143204e-06, + "loss": 0.2881, + "step": 24435 + }, + { + "epoch": 0.5447645724708885, + "grad_norm": 0.7710216641426086, + "learning_rate": 8.598153072427998e-06, + "loss": 0.3121, + "step": 24440 + }, + { + "epoch": 0.5448760218515085, + "grad_norm": 0.5294174551963806, + "learning_rate": 8.594686414568407e-06, + "loss": 0.2459, + "step": 24445 + }, + { + "epoch": 0.5449874712321287, + "grad_norm": 0.43676459789276123, + "learning_rate": 8.591219928989418e-06, + "loss": 0.3917, + "step": 24450 + }, + { + "epoch": 0.5450989206127487, + "grad_norm": 0.49319544434547424, + "learning_rate": 8.587753616115988e-06, + "loss": 0.3877, + "step": 24455 + }, + { + "epoch": 0.5452103699933688, + "grad_norm": 0.5562756061553955, + "learning_rate": 8.584287476373066e-06, + "loss": 0.3236, + "step": 24460 + }, + { + "epoch": 0.5453218193739888, + "grad_norm": 0.5114259719848633, + "learning_rate": 8.580821510185571e-06, + "loss": 0.29, + "step": 24465 + }, + { + "epoch": 0.5454332687546088, + "grad_norm": 0.507517397403717, + "learning_rate": 8.577355717978403e-06, + "loss": 0.2529, + "step": 24470 + }, + { + "epoch": 0.545544718135229, + "grad_norm": 0.6263054609298706, + "learning_rate": 8.57389010017644e-06, + "loss": 0.2756, + "step": 24475 + }, + { + "epoch": 0.545656167515849, + "grad_norm": 0.8092488050460815, + "learning_rate": 8.570424657204548e-06, + "loss": 0.3541, + "step": 24480 + }, + { + "epoch": 0.5457676168964691, + "grad_norm": 0.6228552460670471, + "learning_rate": 8.56695938948756e-06, + "loss": 0.2658, + "step": 24485 + }, + { + "epoch": 0.5458790662770892, + "grad_norm": 0.6549887657165527, + "learning_rate": 8.563494297450285e-06, + "loss": 0.3416, + "step": 24490 + }, + { + "epoch": 0.5459905156577093, + "grad_norm": 0.5719853639602661, + "learning_rate": 8.560029381517524e-06, + "loss": 0.2552, + "step": 24495 + }, + { + "epoch": 0.5461019650383293, + "grad_norm": 0.5304532051086426, + "learning_rate": 8.556564642114044e-06, + "loss": 0.2077, + "step": 24500 + }, + { + "epoch": 0.5462134144189493, + "grad_norm": 0.6370610594749451, + "learning_rate": 8.553100079664598e-06, + "loss": 0.2462, + "step": 24505 + }, + { + "epoch": 0.5463248637995695, + "grad_norm": 0.7064418792724609, + "learning_rate": 8.549635694593911e-06, + "loss": 0.2964, + "step": 24510 + }, + { + "epoch": 0.5464363131801895, + "grad_norm": 0.5713470578193665, + "learning_rate": 8.546171487326698e-06, + "loss": 0.3, + "step": 24515 + }, + { + "epoch": 0.5465477625608096, + "grad_norm": 0.5088363885879517, + "learning_rate": 8.542707458287635e-06, + "loss": 0.4, + "step": 24520 + }, + { + "epoch": 0.5466592119414296, + "grad_norm": 0.6283735632896423, + "learning_rate": 8.539243607901391e-06, + "loss": 0.328, + "step": 24525 + }, + { + "epoch": 0.5467706613220497, + "grad_norm": 0.45291775465011597, + "learning_rate": 8.535779936592607e-06, + "loss": 0.2645, + "step": 24530 + }, + { + "epoch": 0.5468821107026698, + "grad_norm": 0.5215762257575989, + "learning_rate": 8.532316444785898e-06, + "loss": 0.3545, + "step": 24535 + }, + { + "epoch": 0.5469935600832898, + "grad_norm": 0.6499525308609009, + "learning_rate": 8.528853132905865e-06, + "loss": 0.281, + "step": 24540 + }, + { + "epoch": 0.5471050094639099, + "grad_norm": 0.6608521938323975, + "learning_rate": 8.525390001377078e-06, + "loss": 0.3618, + "step": 24545 + }, + { + "epoch": 0.54721645884453, + "grad_norm": 0.7306557297706604, + "learning_rate": 8.521927050624097e-06, + "loss": 0.2263, + "step": 24550 + }, + { + "epoch": 0.54732790822515, + "grad_norm": 0.46093496680259705, + "learning_rate": 8.518464281071454e-06, + "loss": 0.3256, + "step": 24555 + }, + { + "epoch": 0.5474393576057701, + "grad_norm": 0.7847006916999817, + "learning_rate": 8.515001693143654e-06, + "loss": 0.2738, + "step": 24560 + }, + { + "epoch": 0.5475508069863901, + "grad_norm": 0.6807790994644165, + "learning_rate": 8.511539287265183e-06, + "loss": 0.348, + "step": 24565 + }, + { + "epoch": 0.5476622563670103, + "grad_norm": 0.36090344190597534, + "learning_rate": 8.508077063860505e-06, + "loss": 0.3948, + "step": 24570 + }, + { + "epoch": 0.5477737057476303, + "grad_norm": 0.7280330061912537, + "learning_rate": 8.504615023354066e-06, + "loss": 0.3484, + "step": 24575 + }, + { + "epoch": 0.5478851551282504, + "grad_norm": 0.7046535611152649, + "learning_rate": 8.501153166170277e-06, + "loss": 0.3831, + "step": 24580 + }, + { + "epoch": 0.5479966045088704, + "grad_norm": 0.6159283518791199, + "learning_rate": 8.497691492733543e-06, + "loss": 0.2587, + "step": 24585 + }, + { + "epoch": 0.5481080538894905, + "grad_norm": 0.6410984396934509, + "learning_rate": 8.494230003468239e-06, + "loss": 0.2617, + "step": 24590 + }, + { + "epoch": 0.5482195032701106, + "grad_norm": 0.728539228439331, + "learning_rate": 8.49076869879871e-06, + "loss": 0.2402, + "step": 24595 + }, + { + "epoch": 0.5483309526507306, + "grad_norm": 0.42051032185554504, + "learning_rate": 8.487307579149293e-06, + "loss": 0.2624, + "step": 24600 + }, + { + "epoch": 0.5484424020313508, + "grad_norm": 0.5762730240821838, + "learning_rate": 8.483846644944289e-06, + "loss": 0.3665, + "step": 24605 + }, + { + "epoch": 0.5485538514119708, + "grad_norm": 0.6362734436988831, + "learning_rate": 8.480385896607981e-06, + "loss": 0.2072, + "step": 24610 + }, + { + "epoch": 0.5486653007925908, + "grad_norm": 0.7404980063438416, + "learning_rate": 8.476925334564631e-06, + "loss": 0.4376, + "step": 24615 + }, + { + "epoch": 0.5487767501732109, + "grad_norm": 0.6925135254859924, + "learning_rate": 8.473464959238485e-06, + "loss": 0.2248, + "step": 24620 + }, + { + "epoch": 0.548888199553831, + "grad_norm": 0.6552592515945435, + "learning_rate": 8.47000477105375e-06, + "loss": 0.2665, + "step": 24625 + }, + { + "epoch": 0.5489996489344511, + "grad_norm": 0.6084296107292175, + "learning_rate": 8.466544770434617e-06, + "loss": 0.2409, + "step": 24630 + }, + { + "epoch": 0.5491110983150711, + "grad_norm": 0.6576414108276367, + "learning_rate": 8.463084957805263e-06, + "loss": 0.2778, + "step": 24635 + }, + { + "epoch": 0.5492225476956912, + "grad_norm": 0.5849794745445251, + "learning_rate": 8.459625333589828e-06, + "loss": 0.2869, + "step": 24640 + }, + { + "epoch": 0.5493339970763113, + "grad_norm": 0.6730411052703857, + "learning_rate": 8.456165898212438e-06, + "loss": 0.2549, + "step": 24645 + }, + { + "epoch": 0.5494454464569313, + "grad_norm": 0.6720399856567383, + "learning_rate": 8.452706652097187e-06, + "loss": 0.3656, + "step": 24650 + }, + { + "epoch": 0.5495568958375514, + "grad_norm": 0.4191713333129883, + "learning_rate": 8.449247595668164e-06, + "loss": 0.3909, + "step": 24655 + }, + { + "epoch": 0.5496683452181714, + "grad_norm": 0.7045153379440308, + "learning_rate": 8.445788729349412e-06, + "loss": 0.3864, + "step": 24660 + }, + { + "epoch": 0.5497797945987916, + "grad_norm": 0.41460931301116943, + "learning_rate": 8.442330053564969e-06, + "loss": 0.2533, + "step": 24665 + }, + { + "epoch": 0.5498912439794116, + "grad_norm": 1.074849009513855, + "learning_rate": 8.438871568738833e-06, + "loss": 0.256, + "step": 24670 + }, + { + "epoch": 0.5500026933600316, + "grad_norm": 0.7419118881225586, + "learning_rate": 8.435413275294998e-06, + "loss": 0.3451, + "step": 24675 + }, + { + "epoch": 0.5501141427406517, + "grad_norm": 0.6535670757293701, + "learning_rate": 8.431955173657416e-06, + "loss": 0.3289, + "step": 24680 + }, + { + "epoch": 0.5502255921212718, + "grad_norm": 0.6609795689582825, + "learning_rate": 8.428497264250023e-06, + "loss": 0.3248, + "step": 24685 + }, + { + "epoch": 0.5503370415018919, + "grad_norm": 0.35083624720573425, + "learning_rate": 8.425039547496737e-06, + "loss": 0.3534, + "step": 24690 + }, + { + "epoch": 0.5504484908825119, + "grad_norm": 0.7437844276428223, + "learning_rate": 8.421582023821448e-06, + "loss": 0.2265, + "step": 24695 + }, + { + "epoch": 0.550559940263132, + "grad_norm": 0.790406346321106, + "learning_rate": 8.41812469364802e-06, + "loss": 0.3658, + "step": 24700 + }, + { + "epoch": 0.5506713896437521, + "grad_norm": 1.019261121749878, + "learning_rate": 8.414667557400293e-06, + "loss": 0.3364, + "step": 24705 + }, + { + "epoch": 0.5507828390243721, + "grad_norm": 0.5490015149116516, + "learning_rate": 8.411210615502087e-06, + "loss": 0.1999, + "step": 24710 + }, + { + "epoch": 0.5508942884049922, + "grad_norm": 0.6740686297416687, + "learning_rate": 8.407753868377194e-06, + "loss": 0.261, + "step": 24715 + }, + { + "epoch": 0.5510057377856122, + "grad_norm": 0.6299969553947449, + "learning_rate": 8.404297316449387e-06, + "loss": 0.3937, + "step": 24720 + }, + { + "epoch": 0.5511171871662324, + "grad_norm": 0.4909580647945404, + "learning_rate": 8.40084096014241e-06, + "loss": 0.2368, + "step": 24725 + }, + { + "epoch": 0.5512286365468524, + "grad_norm": 0.9306431412696838, + "learning_rate": 8.397384799879993e-06, + "loss": 0.399, + "step": 24730 + }, + { + "epoch": 0.5513400859274724, + "grad_norm": 0.46534305810928345, + "learning_rate": 8.393928836085827e-06, + "loss": 0.2315, + "step": 24735 + }, + { + "epoch": 0.5514515353080925, + "grad_norm": 0.5373953580856323, + "learning_rate": 8.39047306918359e-06, + "loss": 0.355, + "step": 24740 + }, + { + "epoch": 0.5515629846887126, + "grad_norm": 0.6013240814208984, + "learning_rate": 8.38701749959693e-06, + "loss": 0.2399, + "step": 24745 + }, + { + "epoch": 0.5516744340693327, + "grad_norm": 0.5150963664054871, + "learning_rate": 8.383562127749473e-06, + "loss": 0.2574, + "step": 24750 + }, + { + "epoch": 0.5517858834499527, + "grad_norm": 0.5833569169044495, + "learning_rate": 8.38010695406482e-06, + "loss": 0.3758, + "step": 24755 + }, + { + "epoch": 0.5518973328305727, + "grad_norm": 0.4889329969882965, + "learning_rate": 8.376651978966555e-06, + "loss": 0.2779, + "step": 24760 + }, + { + "epoch": 0.5520087822111929, + "grad_norm": 0.7517090439796448, + "learning_rate": 8.373197202878224e-06, + "loss": 0.3864, + "step": 24765 + }, + { + "epoch": 0.5521202315918129, + "grad_norm": 0.7434937357902527, + "learning_rate": 8.369742626223363e-06, + "loss": 0.2573, + "step": 24770 + }, + { + "epoch": 0.552231680972433, + "grad_norm": 0.6837606430053711, + "learning_rate": 8.36628824942547e-06, + "loss": 0.2357, + "step": 24775 + }, + { + "epoch": 0.552343130353053, + "grad_norm": 0.8016228079795837, + "learning_rate": 8.362834072908028e-06, + "loss": 0.3943, + "step": 24780 + }, + { + "epoch": 0.5524545797336732, + "grad_norm": 0.549381673336029, + "learning_rate": 8.35938009709449e-06, + "loss": 0.3435, + "step": 24785 + }, + { + "epoch": 0.5525660291142932, + "grad_norm": 1.0602985620498657, + "learning_rate": 8.355926322408287e-06, + "loss": 0.2506, + "step": 24790 + }, + { + "epoch": 0.5526774784949132, + "grad_norm": 0.526971161365509, + "learning_rate": 8.352472749272831e-06, + "loss": 0.47, + "step": 24795 + }, + { + "epoch": 0.5527889278755334, + "grad_norm": 0.8049795627593994, + "learning_rate": 8.349019378111496e-06, + "loss": 0.298, + "step": 24800 + }, + { + "epoch": 0.5529003772561534, + "grad_norm": 0.5035505294799805, + "learning_rate": 8.345566209347644e-06, + "loss": 0.2329, + "step": 24805 + }, + { + "epoch": 0.5530118266367735, + "grad_norm": 0.6278955340385437, + "learning_rate": 8.342113243404602e-06, + "loss": 0.2987, + "step": 24810 + }, + { + "epoch": 0.5531232760173935, + "grad_norm": 0.38567596673965454, + "learning_rate": 8.338660480705683e-06, + "loss": 0.2863, + "step": 24815 + }, + { + "epoch": 0.5532347253980135, + "grad_norm": 0.6542585492134094, + "learning_rate": 8.335207921674165e-06, + "loss": 0.4058, + "step": 24820 + }, + { + "epoch": 0.5533461747786337, + "grad_norm": 0.8733132481575012, + "learning_rate": 8.3317555667333e-06, + "loss": 0.2498, + "step": 24825 + }, + { + "epoch": 0.5534576241592537, + "grad_norm": 0.46271783113479614, + "learning_rate": 8.328303416306329e-06, + "loss": 0.2544, + "step": 24830 + }, + { + "epoch": 0.5535690735398738, + "grad_norm": 0.8014933466911316, + "learning_rate": 8.324851470816457e-06, + "loss": 0.3084, + "step": 24835 + }, + { + "epoch": 0.5536805229204939, + "grad_norm": 0.5467740893363953, + "learning_rate": 8.321399730686865e-06, + "loss": 0.3583, + "step": 24840 + }, + { + "epoch": 0.553791972301114, + "grad_norm": 0.7579909563064575, + "learning_rate": 8.317948196340706e-06, + "loss": 0.3352, + "step": 24845 + }, + { + "epoch": 0.553903421681734, + "grad_norm": 0.7478350400924683, + "learning_rate": 8.314496868201118e-06, + "loss": 0.4136, + "step": 24850 + }, + { + "epoch": 0.554014871062354, + "grad_norm": 0.46065306663513184, + "learning_rate": 8.3110457466912e-06, + "loss": 0.2237, + "step": 24855 + }, + { + "epoch": 0.5541263204429742, + "grad_norm": 0.5404456257820129, + "learning_rate": 8.307594832234037e-06, + "loss": 0.2682, + "step": 24860 + }, + { + "epoch": 0.5542377698235942, + "grad_norm": 0.7918099761009216, + "learning_rate": 8.304144125252687e-06, + "loss": 0.4182, + "step": 24865 + }, + { + "epoch": 0.5543492192042143, + "grad_norm": 0.4383431673049927, + "learning_rate": 8.300693626170178e-06, + "loss": 0.3092, + "step": 24870 + }, + { + "epoch": 0.5544606685848343, + "grad_norm": 0.424964040517807, + "learning_rate": 8.297243335409511e-06, + "loss": 0.3116, + "step": 24875 + }, + { + "epoch": 0.5545721179654544, + "grad_norm": 0.6739875674247742, + "learning_rate": 8.29379325339367e-06, + "loss": 0.3016, + "step": 24880 + }, + { + "epoch": 0.5546835673460745, + "grad_norm": 0.6436865329742432, + "learning_rate": 8.290343380545606e-06, + "loss": 0.2368, + "step": 24885 + }, + { + "epoch": 0.5547950167266945, + "grad_norm": 0.5790094137191772, + "learning_rate": 8.286893717288246e-06, + "loss": 0.3702, + "step": 24890 + }, + { + "epoch": 0.5549064661073146, + "grad_norm": 0.5759456157684326, + "learning_rate": 8.283444264044491e-06, + "loss": 0.2727, + "step": 24895 + }, + { + "epoch": 0.5550179154879347, + "grad_norm": 0.8519607186317444, + "learning_rate": 8.279995021237223e-06, + "loss": 0.4072, + "step": 24900 + }, + { + "epoch": 0.5551293648685548, + "grad_norm": 0.6983477473258972, + "learning_rate": 8.276545989289286e-06, + "loss": 0.2188, + "step": 24905 + }, + { + "epoch": 0.5552408142491748, + "grad_norm": 0.7973755598068237, + "learning_rate": 8.273097168623511e-06, + "loss": 0.3562, + "step": 24910 + }, + { + "epoch": 0.5553522636297948, + "grad_norm": 1.0104763507843018, + "learning_rate": 8.269648559662694e-06, + "loss": 0.2745, + "step": 24915 + }, + { + "epoch": 0.555463713010415, + "grad_norm": 0.37829214334487915, + "learning_rate": 8.266200162829605e-06, + "loss": 0.3069, + "step": 24920 + }, + { + "epoch": 0.555575162391035, + "grad_norm": 0.5734246969223022, + "learning_rate": 8.262751978546995e-06, + "loss": 0.222, + "step": 24925 + }, + { + "epoch": 0.5556866117716551, + "grad_norm": 0.5025404691696167, + "learning_rate": 8.259304007237578e-06, + "loss": 0.2063, + "step": 24930 + }, + { + "epoch": 0.5557980611522751, + "grad_norm": 0.43454474210739136, + "learning_rate": 8.255856249324058e-06, + "loss": 0.1827, + "step": 24935 + }, + { + "epoch": 0.5559095105328952, + "grad_norm": 0.6172532439231873, + "learning_rate": 8.252408705229098e-06, + "loss": 0.2765, + "step": 24940 + }, + { + "epoch": 0.5560209599135153, + "grad_norm": 0.3773133456707001, + "learning_rate": 8.248961375375341e-06, + "loss": 0.2087, + "step": 24945 + }, + { + "epoch": 0.5561324092941353, + "grad_norm": 0.5615310668945312, + "learning_rate": 8.245514260185403e-06, + "loss": 0.361, + "step": 24950 + }, + { + "epoch": 0.5562438586747555, + "grad_norm": 0.5559342503547668, + "learning_rate": 8.242067360081878e-06, + "loss": 0.3179, + "step": 24955 + }, + { + "epoch": 0.5563553080553755, + "grad_norm": 0.47637277841567993, + "learning_rate": 8.238620675487323e-06, + "loss": 0.3062, + "step": 24960 + }, + { + "epoch": 0.5564667574359955, + "grad_norm": 0.580187976360321, + "learning_rate": 8.235174206824278e-06, + "loss": 0.4512, + "step": 24965 + }, + { + "epoch": 0.5565782068166156, + "grad_norm": 0.2807926833629608, + "learning_rate": 8.231727954515247e-06, + "loss": 0.2401, + "step": 24970 + }, + { + "epoch": 0.5566896561972356, + "grad_norm": 0.6485481858253479, + "learning_rate": 8.228281918982726e-06, + "loss": 0.4448, + "step": 24975 + }, + { + "epoch": 0.5568011055778558, + "grad_norm": 0.42547407746315, + "learning_rate": 8.224836100649166e-06, + "loss": 0.3134, + "step": 24980 + }, + { + "epoch": 0.5569125549584758, + "grad_norm": 0.6035627126693726, + "learning_rate": 8.221390499936997e-06, + "loss": 0.4542, + "step": 24985 + }, + { + "epoch": 0.5570240043390959, + "grad_norm": 0.49706533551216125, + "learning_rate": 8.217945117268624e-06, + "loss": 0.1768, + "step": 24990 + }, + { + "epoch": 0.557135453719716, + "grad_norm": 0.5365151762962341, + "learning_rate": 8.214499953066423e-06, + "loss": 0.2971, + "step": 24995 + }, + { + "epoch": 0.557246903100336, + "grad_norm": 0.7276906967163086, + "learning_rate": 8.211055007752749e-06, + "loss": 0.2899, + "step": 25000 + }, + { + "epoch": 0.5573583524809561, + "grad_norm": 0.6989277601242065, + "learning_rate": 8.207610281749918e-06, + "loss": 0.3771, + "step": 25005 + }, + { + "epoch": 0.5574698018615761, + "grad_norm": 0.8797715306282043, + "learning_rate": 8.204165775480233e-06, + "loss": 0.2723, + "step": 25010 + }, + { + "epoch": 0.5575812512421963, + "grad_norm": 0.5198432207107544, + "learning_rate": 8.200721489365962e-06, + "loss": 0.3965, + "step": 25015 + }, + { + "epoch": 0.5576927006228163, + "grad_norm": 0.6038433313369751, + "learning_rate": 8.197277423829351e-06, + "loss": 0.2742, + "step": 25020 + }, + { + "epoch": 0.5578041500034363, + "grad_norm": 0.6098718047142029, + "learning_rate": 8.19383357929261e-06, + "loss": 0.2459, + "step": 25025 + }, + { + "epoch": 0.5579155993840564, + "grad_norm": 0.5177041292190552, + "learning_rate": 8.190389956177934e-06, + "loss": 0.322, + "step": 25030 + }, + { + "epoch": 0.5580270487646765, + "grad_norm": 0.8710949420928955, + "learning_rate": 8.18694655490748e-06, + "loss": 0.3124, + "step": 25035 + }, + { + "epoch": 0.5581384981452966, + "grad_norm": 0.6304861307144165, + "learning_rate": 8.183503375903378e-06, + "loss": 0.2803, + "step": 25040 + }, + { + "epoch": 0.5582499475259166, + "grad_norm": 0.5343223214149475, + "learning_rate": 8.180060419587746e-06, + "loss": 0.2192, + "step": 25045 + }, + { + "epoch": 0.5583613969065367, + "grad_norm": 1.02570641040802, + "learning_rate": 8.17661768638266e-06, + "loss": 0.1673, + "step": 25050 + }, + { + "epoch": 0.5584728462871568, + "grad_norm": 0.5182636976242065, + "learning_rate": 8.173175176710172e-06, + "loss": 0.3125, + "step": 25055 + }, + { + "epoch": 0.5585842956677768, + "grad_norm": 0.9790181517601013, + "learning_rate": 8.169732890992302e-06, + "loss": 0.447, + "step": 25060 + }, + { + "epoch": 0.5586957450483969, + "grad_norm": 0.646925151348114, + "learning_rate": 8.166290829651056e-06, + "loss": 0.2969, + "step": 25065 + }, + { + "epoch": 0.5588071944290169, + "grad_norm": 0.5242272019386292, + "learning_rate": 8.162848993108396e-06, + "loss": 0.3618, + "step": 25070 + }, + { + "epoch": 0.5589186438096371, + "grad_norm": 0.5497309565544128, + "learning_rate": 8.159407381786268e-06, + "loss": 0.3532, + "step": 25075 + }, + { + "epoch": 0.5590300931902571, + "grad_norm": 0.6220980286598206, + "learning_rate": 8.155965996106594e-06, + "loss": 0.2132, + "step": 25080 + }, + { + "epoch": 0.5591415425708771, + "grad_norm": 0.3434116244316101, + "learning_rate": 8.152524836491254e-06, + "loss": 0.3954, + "step": 25085 + }, + { + "epoch": 0.5592529919514972, + "grad_norm": 0.7324555516242981, + "learning_rate": 8.149083903362105e-06, + "loss": 0.2732, + "step": 25090 + }, + { + "epoch": 0.5593644413321173, + "grad_norm": 0.517315685749054, + "learning_rate": 8.145643197140986e-06, + "loss": 0.2802, + "step": 25095 + }, + { + "epoch": 0.5594758907127374, + "grad_norm": 0.7378593683242798, + "learning_rate": 8.142202718249697e-06, + "loss": 0.3438, + "step": 25100 + }, + { + "epoch": 0.5595873400933574, + "grad_norm": 0.5122184753417969, + "learning_rate": 8.138762467110014e-06, + "loss": 0.3632, + "step": 25105 + }, + { + "epoch": 0.5596987894739776, + "grad_norm": 0.43752196431159973, + "learning_rate": 8.135322444143683e-06, + "loss": 0.2482, + "step": 25110 + }, + { + "epoch": 0.5598102388545976, + "grad_norm": 0.7723203897476196, + "learning_rate": 8.13188264977243e-06, + "loss": 0.259, + "step": 25115 + }, + { + "epoch": 0.5599216882352176, + "grad_norm": 0.7775996327400208, + "learning_rate": 8.128443084417942e-06, + "loss": 0.4302, + "step": 25120 + }, + { + "epoch": 0.5600331376158377, + "grad_norm": 0.33896544575691223, + "learning_rate": 8.125003748501891e-06, + "loss": 0.2196, + "step": 25125 + }, + { + "epoch": 0.5601445869964577, + "grad_norm": 0.6470742225646973, + "learning_rate": 8.121564642445907e-06, + "loss": 0.4144, + "step": 25130 + }, + { + "epoch": 0.5602560363770779, + "grad_norm": 0.4051295518875122, + "learning_rate": 8.118125766671594e-06, + "loss": 0.2509, + "step": 25135 + }, + { + "epoch": 0.5603674857576979, + "grad_norm": 0.4324702024459839, + "learning_rate": 8.114687121600541e-06, + "loss": 0.422, + "step": 25140 + }, + { + "epoch": 0.5604789351383179, + "grad_norm": 0.7880158424377441, + "learning_rate": 8.111248707654288e-06, + "loss": 0.338, + "step": 25145 + }, + { + "epoch": 0.5605903845189381, + "grad_norm": 0.6531829237937927, + "learning_rate": 8.10781052525437e-06, + "loss": 0.2991, + "step": 25150 + }, + { + "epoch": 0.5607018338995581, + "grad_norm": 0.49740514159202576, + "learning_rate": 8.104372574822274e-06, + "loss": 0.3692, + "step": 25155 + }, + { + "epoch": 0.5608132832801782, + "grad_norm": 0.6843011975288391, + "learning_rate": 8.10093485677947e-06, + "loss": 0.306, + "step": 25160 + }, + { + "epoch": 0.5609247326607982, + "grad_norm": 0.9157106876373291, + "learning_rate": 8.097497371547392e-06, + "loss": 0.3139, + "step": 25165 + }, + { + "epoch": 0.5610361820414183, + "grad_norm": 0.39046990871429443, + "learning_rate": 8.094060119547455e-06, + "loss": 0.3031, + "step": 25170 + }, + { + "epoch": 0.5611476314220384, + "grad_norm": 0.5675200819969177, + "learning_rate": 8.090623101201035e-06, + "loss": 0.4418, + "step": 25175 + }, + { + "epoch": 0.5612590808026584, + "grad_norm": 0.56672203540802, + "learning_rate": 8.087186316929479e-06, + "loss": 0.2393, + "step": 25180 + }, + { + "epoch": 0.5613705301832785, + "grad_norm": 0.5712127089500427, + "learning_rate": 8.083749767154118e-06, + "loss": 0.3301, + "step": 25185 + }, + { + "epoch": 0.5614819795638986, + "grad_norm": 0.8723680377006531, + "learning_rate": 8.080313452296246e-06, + "loss": 0.2967, + "step": 25190 + }, + { + "epoch": 0.5615934289445187, + "grad_norm": 0.7300506234169006, + "learning_rate": 8.07687737277713e-06, + "loss": 0.2007, + "step": 25195 + }, + { + "epoch": 0.5617048783251387, + "grad_norm": 0.44683921337127686, + "learning_rate": 8.073441529017998e-06, + "loss": 0.2307, + "step": 25200 + }, + { + "epoch": 0.5618163277057587, + "grad_norm": 0.6372109651565552, + "learning_rate": 8.07000592144007e-06, + "loss": 0.1934, + "step": 25205 + }, + { + "epoch": 0.5619277770863789, + "grad_norm": 0.6536151170730591, + "learning_rate": 8.066570550464515e-06, + "loss": 0.3503, + "step": 25210 + }, + { + "epoch": 0.5620392264669989, + "grad_norm": 0.8571807146072388, + "learning_rate": 8.063135416512483e-06, + "loss": 0.3751, + "step": 25215 + }, + { + "epoch": 0.562150675847619, + "grad_norm": 0.6964419484138489, + "learning_rate": 8.059700520005104e-06, + "loss": 0.3219, + "step": 25220 + }, + { + "epoch": 0.562262125228239, + "grad_norm": 0.5729572772979736, + "learning_rate": 8.056265861363464e-06, + "loss": 0.3504, + "step": 25225 + }, + { + "epoch": 0.5623735746088591, + "grad_norm": 0.701521098613739, + "learning_rate": 8.052831441008626e-06, + "loss": 0.2278, + "step": 25230 + }, + { + "epoch": 0.5624850239894792, + "grad_norm": 0.4984922409057617, + "learning_rate": 8.049397259361622e-06, + "loss": 0.4445, + "step": 25235 + }, + { + "epoch": 0.5625964733700992, + "grad_norm": 0.560855507850647, + "learning_rate": 8.045963316843461e-06, + "loss": 0.3214, + "step": 25240 + }, + { + "epoch": 0.5627079227507193, + "grad_norm": 0.6652450561523438, + "learning_rate": 8.042529613875111e-06, + "loss": 0.2324, + "step": 25245 + }, + { + "epoch": 0.5628193721313394, + "grad_norm": 0.5668606758117676, + "learning_rate": 8.039096150877518e-06, + "loss": 0.3622, + "step": 25250 + }, + { + "epoch": 0.5629308215119595, + "grad_norm": 0.7954594492912292, + "learning_rate": 8.035662928271607e-06, + "loss": 0.3204, + "step": 25255 + }, + { + "epoch": 0.5630422708925795, + "grad_norm": 0.7688658237457275, + "learning_rate": 8.032229946478254e-06, + "loss": 0.3322, + "step": 25260 + }, + { + "epoch": 0.5631537202731995, + "grad_norm": 0.7299761176109314, + "learning_rate": 8.028797205918326e-06, + "loss": 0.353, + "step": 25265 + }, + { + "epoch": 0.5632651696538197, + "grad_norm": 0.6364347338676453, + "learning_rate": 8.025364707012644e-06, + "loss": 0.2966, + "step": 25270 + }, + { + "epoch": 0.5633766190344397, + "grad_norm": 0.7999971508979797, + "learning_rate": 8.021932450182004e-06, + "loss": 0.3605, + "step": 25275 + }, + { + "epoch": 0.5634880684150598, + "grad_norm": 0.5598938465118408, + "learning_rate": 8.01850043584718e-06, + "loss": 0.2989, + "step": 25280 + }, + { + "epoch": 0.5635995177956798, + "grad_norm": 0.6162434220314026, + "learning_rate": 8.015068664428905e-06, + "loss": 0.313, + "step": 25285 + }, + { + "epoch": 0.5637109671762999, + "grad_norm": 0.8101426362991333, + "learning_rate": 8.011637136347892e-06, + "loss": 0.1561, + "step": 25290 + }, + { + "epoch": 0.56382241655692, + "grad_norm": 0.6738321781158447, + "learning_rate": 8.008205852024817e-06, + "loss": 0.3467, + "step": 25295 + }, + { + "epoch": 0.56393386593754, + "grad_norm": 0.5133274793624878, + "learning_rate": 8.004774811880333e-06, + "loss": 0.3507, + "step": 25300 + }, + { + "epoch": 0.5640453153181602, + "grad_norm": 0.4188895523548126, + "learning_rate": 8.001344016335054e-06, + "loss": 0.2576, + "step": 25305 + }, + { + "epoch": 0.5641567646987802, + "grad_norm": 0.520499587059021, + "learning_rate": 7.997913465809575e-06, + "loss": 0.3331, + "step": 25310 + }, + { + "epoch": 0.5642682140794002, + "grad_norm": 0.8256027102470398, + "learning_rate": 7.994483160724449e-06, + "loss": 0.2773, + "step": 25315 + }, + { + "epoch": 0.5643796634600203, + "grad_norm": 0.5647714734077454, + "learning_rate": 7.991053101500204e-06, + "loss": 0.3196, + "step": 25320 + }, + { + "epoch": 0.5644911128406404, + "grad_norm": 0.5048403739929199, + "learning_rate": 7.987623288557343e-06, + "loss": 0.2535, + "step": 25325 + }, + { + "epoch": 0.5646025622212605, + "grad_norm": 0.8115688562393188, + "learning_rate": 7.984193722316338e-06, + "loss": 0.2521, + "step": 25330 + }, + { + "epoch": 0.5647140116018805, + "grad_norm": 0.49011000990867615, + "learning_rate": 7.98076440319762e-06, + "loss": 0.305, + "step": 25335 + }, + { + "epoch": 0.5648254609825006, + "grad_norm": 0.7082293033599854, + "learning_rate": 7.977335331621601e-06, + "loss": 0.3085, + "step": 25340 + }, + { + "epoch": 0.5649369103631207, + "grad_norm": 0.673119843006134, + "learning_rate": 7.973906508008659e-06, + "loss": 0.3354, + "step": 25345 + }, + { + "epoch": 0.5650483597437407, + "grad_norm": 0.6760952472686768, + "learning_rate": 7.970477932779137e-06, + "loss": 0.4585, + "step": 25350 + }, + { + "epoch": 0.5651598091243608, + "grad_norm": 1.0284398794174194, + "learning_rate": 7.967049606353354e-06, + "loss": 0.3098, + "step": 25355 + }, + { + "epoch": 0.5652712585049808, + "grad_norm": 0.5097162127494812, + "learning_rate": 7.963621529151601e-06, + "loss": 0.3228, + "step": 25360 + }, + { + "epoch": 0.565382707885601, + "grad_norm": 0.5417636632919312, + "learning_rate": 7.96019370159413e-06, + "loss": 0.258, + "step": 25365 + }, + { + "epoch": 0.565494157266221, + "grad_norm": 0.47898048162460327, + "learning_rate": 7.956766124101164e-06, + "loss": 0.3204, + "step": 25370 + }, + { + "epoch": 0.565605606646841, + "grad_norm": 0.5756115913391113, + "learning_rate": 7.953338797092902e-06, + "loss": 0.2885, + "step": 25375 + }, + { + "epoch": 0.5657170560274611, + "grad_norm": 0.6981114745140076, + "learning_rate": 7.949911720989503e-06, + "loss": 0.2448, + "step": 25380 + }, + { + "epoch": 0.5658285054080812, + "grad_norm": 0.805111289024353, + "learning_rate": 7.946484896211106e-06, + "loss": 0.2254, + "step": 25385 + }, + { + "epoch": 0.5659399547887013, + "grad_norm": 0.5139040946960449, + "learning_rate": 7.943058323177807e-06, + "loss": 0.177, + "step": 25390 + }, + { + "epoch": 0.5660514041693213, + "grad_norm": 0.5966136455535889, + "learning_rate": 7.939632002309681e-06, + "loss": 0.3185, + "step": 25395 + }, + { + "epoch": 0.5661628535499414, + "grad_norm": 0.8083769679069519, + "learning_rate": 7.936205934026769e-06, + "loss": 0.4075, + "step": 25400 + }, + { + "epoch": 0.5662743029305615, + "grad_norm": 0.45511701703071594, + "learning_rate": 7.93278011874908e-06, + "loss": 0.2769, + "step": 25405 + }, + { + "epoch": 0.5663857523111815, + "grad_norm": 0.8094688653945923, + "learning_rate": 7.929354556896592e-06, + "loss": 0.3354, + "step": 25410 + }, + { + "epoch": 0.5664972016918016, + "grad_norm": 0.5739903450012207, + "learning_rate": 7.92592924888925e-06, + "loss": 0.2835, + "step": 25415 + }, + { + "epoch": 0.5666086510724216, + "grad_norm": 0.7249305248260498, + "learning_rate": 7.922504195146975e-06, + "loss": 0.3656, + "step": 25420 + }, + { + "epoch": 0.5667201004530418, + "grad_norm": 0.7731884121894836, + "learning_rate": 7.919079396089648e-06, + "loss": 0.3511, + "step": 25425 + }, + { + "epoch": 0.5668315498336618, + "grad_norm": 0.652795672416687, + "learning_rate": 7.915654852137124e-06, + "loss": 0.3149, + "step": 25430 + }, + { + "epoch": 0.5669429992142818, + "grad_norm": 0.6516653895378113, + "learning_rate": 7.91223056370923e-06, + "loss": 0.3051, + "step": 25435 + }, + { + "epoch": 0.567054448594902, + "grad_norm": 0.5057012438774109, + "learning_rate": 7.908806531225753e-06, + "loss": 0.2262, + "step": 25440 + }, + { + "epoch": 0.567165897975522, + "grad_norm": 0.513541579246521, + "learning_rate": 7.905382755106452e-06, + "loss": 0.286, + "step": 25445 + }, + { + "epoch": 0.5672773473561421, + "grad_norm": 0.5020678639411926, + "learning_rate": 7.90195923577106e-06, + "loss": 0.2685, + "step": 25450 + }, + { + "epoch": 0.5673887967367621, + "grad_norm": 0.5548171997070312, + "learning_rate": 7.898535973639272e-06, + "loss": 0.3232, + "step": 25455 + }, + { + "epoch": 0.5675002461173823, + "grad_norm": 0.4107208251953125, + "learning_rate": 7.895112969130745e-06, + "loss": 0.238, + "step": 25460 + }, + { + "epoch": 0.5676116954980023, + "grad_norm": 0.6904474496841431, + "learning_rate": 7.891690222665126e-06, + "loss": 0.2172, + "step": 25465 + }, + { + "epoch": 0.5677231448786223, + "grad_norm": 0.3023183345794678, + "learning_rate": 7.888267734662013e-06, + "loss": 0.3178, + "step": 25470 + }, + { + "epoch": 0.5678345942592424, + "grad_norm": 0.6031567454338074, + "learning_rate": 7.884845505540975e-06, + "loss": 0.2792, + "step": 25475 + }, + { + "epoch": 0.5679460436398625, + "grad_norm": 0.5487951636314392, + "learning_rate": 7.881423535721553e-06, + "loss": 0.3045, + "step": 25480 + }, + { + "epoch": 0.5680574930204826, + "grad_norm": 0.6568580865859985, + "learning_rate": 7.87800182562325e-06, + "loss": 0.2478, + "step": 25485 + }, + { + "epoch": 0.5681689424011026, + "grad_norm": 0.6862227916717529, + "learning_rate": 7.874580375665546e-06, + "loss": 0.3681, + "step": 25490 + }, + { + "epoch": 0.5682803917817226, + "grad_norm": 0.18388068675994873, + "learning_rate": 7.871159186267881e-06, + "loss": 0.2117, + "step": 25495 + }, + { + "epoch": 0.5683918411623428, + "grad_norm": 0.4341485798358917, + "learning_rate": 7.867738257849665e-06, + "loss": 0.3259, + "step": 25500 + }, + { + "epoch": 0.5685032905429628, + "grad_norm": 0.7467966079711914, + "learning_rate": 7.864317590830284e-06, + "loss": 0.4328, + "step": 25505 + }, + { + "epoch": 0.5686147399235829, + "grad_norm": 0.867165744304657, + "learning_rate": 7.860897185629078e-06, + "loss": 0.283, + "step": 25510 + }, + { + "epoch": 0.5687261893042029, + "grad_norm": 0.675286591053009, + "learning_rate": 7.857477042665369e-06, + "loss": 0.2941, + "step": 25515 + }, + { + "epoch": 0.568837638684823, + "grad_norm": 0.8349703550338745, + "learning_rate": 7.854057162358436e-06, + "loss": 0.2432, + "step": 25520 + }, + { + "epoch": 0.5689490880654431, + "grad_norm": 0.5063271522521973, + "learning_rate": 7.85063754512753e-06, + "loss": 0.3295, + "step": 25525 + }, + { + "epoch": 0.5690605374460631, + "grad_norm": 0.4811290204524994, + "learning_rate": 7.847218191391873e-06, + "loss": 0.2728, + "step": 25530 + }, + { + "epoch": 0.5691719868266832, + "grad_norm": 0.5109490752220154, + "learning_rate": 7.843799101570645e-06, + "loss": 0.4188, + "step": 25535 + }, + { + "epoch": 0.5692834362073033, + "grad_norm": 0.4581867754459381, + "learning_rate": 7.840380276083002e-06, + "loss": 0.3957, + "step": 25540 + }, + { + "epoch": 0.5693948855879234, + "grad_norm": 0.507814347743988, + "learning_rate": 7.836961715348072e-06, + "loss": 0.2872, + "step": 25545 + }, + { + "epoch": 0.5695063349685434, + "grad_norm": 0.5380224585533142, + "learning_rate": 7.833543419784941e-06, + "loss": 0.2411, + "step": 25550 + }, + { + "epoch": 0.5696177843491634, + "grad_norm": 0.7744241952896118, + "learning_rate": 7.83012538981266e-06, + "loss": 0.3722, + "step": 25555 + }, + { + "epoch": 0.5697292337297836, + "grad_norm": 0.4554356634616852, + "learning_rate": 7.826707625850261e-06, + "loss": 0.2234, + "step": 25560 + }, + { + "epoch": 0.5698406831104036, + "grad_norm": 0.6055052876472473, + "learning_rate": 7.82329012831673e-06, + "loss": 0.3317, + "step": 25565 + }, + { + "epoch": 0.5699521324910237, + "grad_norm": 0.5006867051124573, + "learning_rate": 7.819872897631024e-06, + "loss": 0.2603, + "step": 25570 + }, + { + "epoch": 0.5700635818716437, + "grad_norm": 0.6414822936058044, + "learning_rate": 7.816455934212078e-06, + "loss": 0.3273, + "step": 25575 + }, + { + "epoch": 0.5701750312522638, + "grad_norm": 0.4626764953136444, + "learning_rate": 7.813039238478778e-06, + "loss": 0.2381, + "step": 25580 + }, + { + "epoch": 0.5702864806328839, + "grad_norm": 0.7485530376434326, + "learning_rate": 7.809622810849986e-06, + "loss": 0.2817, + "step": 25585 + }, + { + "epoch": 0.5703979300135039, + "grad_norm": 0.6687384247779846, + "learning_rate": 7.806206651744534e-06, + "loss": 0.4064, + "step": 25590 + }, + { + "epoch": 0.570509379394124, + "grad_norm": 0.6886327862739563, + "learning_rate": 7.80279076158121e-06, + "loss": 0.3425, + "step": 25595 + }, + { + "epoch": 0.5706208287747441, + "grad_norm": 0.47214236855506897, + "learning_rate": 7.799375140778778e-06, + "loss": 0.2447, + "step": 25600 + }, + { + "epoch": 0.5707322781553642, + "grad_norm": 0.9890148639678955, + "learning_rate": 7.795959789755967e-06, + "loss": 0.2846, + "step": 25605 + }, + { + "epoch": 0.5708437275359842, + "grad_norm": 0.6087769269943237, + "learning_rate": 7.792544708931475e-06, + "loss": 0.3436, + "step": 25610 + }, + { + "epoch": 0.5709551769166042, + "grad_norm": 0.7241442799568176, + "learning_rate": 7.789129898723962e-06, + "loss": 0.3694, + "step": 25615 + }, + { + "epoch": 0.5710666262972244, + "grad_norm": 0.6570469737052917, + "learning_rate": 7.78571535955206e-06, + "loss": 0.3063, + "step": 25620 + }, + { + "epoch": 0.5711780756778444, + "grad_norm": 0.561510443687439, + "learning_rate": 7.782301091834363e-06, + "loss": 0.286, + "step": 25625 + }, + { + "epoch": 0.5712895250584645, + "grad_norm": 0.6006831526756287, + "learning_rate": 7.778887095989433e-06, + "loss": 0.4609, + "step": 25630 + }, + { + "epoch": 0.5714009744390846, + "grad_norm": 0.4968348443508148, + "learning_rate": 7.775473372435803e-06, + "loss": 0.2656, + "step": 25635 + }, + { + "epoch": 0.5715124238197046, + "grad_norm": 0.5182333588600159, + "learning_rate": 7.77205992159196e-06, + "loss": 0.2532, + "step": 25640 + }, + { + "epoch": 0.5716238732003247, + "grad_norm": 0.6506955623626709, + "learning_rate": 7.768646743876379e-06, + "loss": 0.2729, + "step": 25645 + }, + { + "epoch": 0.5717353225809447, + "grad_norm": 0.5487266778945923, + "learning_rate": 7.765233839707483e-06, + "loss": 0.234, + "step": 25650 + }, + { + "epoch": 0.5718467719615649, + "grad_norm": 0.564389169216156, + "learning_rate": 7.761821209503669e-06, + "loss": 0.1993, + "step": 25655 + }, + { + "epoch": 0.5719582213421849, + "grad_norm": 0.5629382729530334, + "learning_rate": 7.758408853683296e-06, + "loss": 0.3271, + "step": 25660 + }, + { + "epoch": 0.572069670722805, + "grad_norm": 0.5799145698547363, + "learning_rate": 7.7549967726647e-06, + "loss": 0.2574, + "step": 25665 + }, + { + "epoch": 0.572181120103425, + "grad_norm": 0.3119282126426697, + "learning_rate": 7.75158496686617e-06, + "loss": 0.1811, + "step": 25670 + }, + { + "epoch": 0.572292569484045, + "grad_norm": 0.7346594333648682, + "learning_rate": 7.748173436705962e-06, + "loss": 0.3085, + "step": 25675 + }, + { + "epoch": 0.5724040188646652, + "grad_norm": 0.6520366072654724, + "learning_rate": 7.744762182602313e-06, + "loss": 0.2386, + "step": 25680 + }, + { + "epoch": 0.5725154682452852, + "grad_norm": 0.5619764924049377, + "learning_rate": 7.741351204973414e-06, + "loss": 0.4687, + "step": 25685 + }, + { + "epoch": 0.5726269176259053, + "grad_norm": 1.1763639450073242, + "learning_rate": 7.737940504237421e-06, + "loss": 0.1898, + "step": 25690 + }, + { + "epoch": 0.5727383670065254, + "grad_norm": 0.7920293807983398, + "learning_rate": 7.734530080812463e-06, + "loss": 0.1965, + "step": 25695 + }, + { + "epoch": 0.5728498163871454, + "grad_norm": 0.557349681854248, + "learning_rate": 7.731119935116632e-06, + "loss": 0.3269, + "step": 25700 + }, + { + "epoch": 0.5729612657677655, + "grad_norm": 0.7871140241622925, + "learning_rate": 7.727710067567982e-06, + "loss": 0.2021, + "step": 25705 + }, + { + "epoch": 0.5730727151483855, + "grad_norm": 0.7102549076080322, + "learning_rate": 7.724300478584535e-06, + "loss": 0.3496, + "step": 25710 + }, + { + "epoch": 0.5731841645290057, + "grad_norm": 0.9377176761627197, + "learning_rate": 7.720891168584288e-06, + "loss": 0.3166, + "step": 25715 + }, + { + "epoch": 0.5732956139096257, + "grad_norm": 0.7283439636230469, + "learning_rate": 7.717482137985193e-06, + "loss": 0.4105, + "step": 25720 + }, + { + "epoch": 0.5734070632902457, + "grad_norm": 0.668796956539154, + "learning_rate": 7.714073387205165e-06, + "loss": 0.245, + "step": 25725 + }, + { + "epoch": 0.5735185126708658, + "grad_norm": 0.39624908566474915, + "learning_rate": 7.7106649166621e-06, + "loss": 0.2321, + "step": 25730 + }, + { + "epoch": 0.5736299620514859, + "grad_norm": 0.5537715554237366, + "learning_rate": 7.707256726773841e-06, + "loss": 0.3214, + "step": 25735 + }, + { + "epoch": 0.573741411432106, + "grad_norm": 0.6404997110366821, + "learning_rate": 7.703848817958213e-06, + "loss": 0.3117, + "step": 25740 + }, + { + "epoch": 0.573852860812726, + "grad_norm": 0.6337887048721313, + "learning_rate": 7.700441190632992e-06, + "loss": 0.3031, + "step": 25745 + }, + { + "epoch": 0.5739643101933461, + "grad_norm": 0.667795717716217, + "learning_rate": 7.697033845215936e-06, + "loss": 0.3622, + "step": 25750 + }, + { + "epoch": 0.5740757595739662, + "grad_norm": 0.44926580786705017, + "learning_rate": 7.693626782124751e-06, + "loss": 0.3497, + "step": 25755 + }, + { + "epoch": 0.5741872089545862, + "grad_norm": 0.6890833377838135, + "learning_rate": 7.690220001777124e-06, + "loss": 0.2375, + "step": 25760 + }, + { + "epoch": 0.5742986583352063, + "grad_norm": 0.5325068235397339, + "learning_rate": 7.686813504590693e-06, + "loss": 0.3192, + "step": 25765 + }, + { + "epoch": 0.5744101077158263, + "grad_norm": 0.6580883860588074, + "learning_rate": 7.68340729098307e-06, + "loss": 0.3099, + "step": 25770 + }, + { + "epoch": 0.5745215570964465, + "grad_norm": 0.5879032015800476, + "learning_rate": 7.680001361371837e-06, + "loss": 0.2587, + "step": 25775 + }, + { + "epoch": 0.5746330064770665, + "grad_norm": 0.48277339339256287, + "learning_rate": 7.676595716174522e-06, + "loss": 0.2754, + "step": 25780 + }, + { + "epoch": 0.5747444558576865, + "grad_norm": 0.7396294474601746, + "learning_rate": 7.673190355808643e-06, + "loss": 0.3878, + "step": 25785 + }, + { + "epoch": 0.5748559052383067, + "grad_norm": 0.6438273191452026, + "learning_rate": 7.669785280691667e-06, + "loss": 0.3425, + "step": 25790 + }, + { + "epoch": 0.5749673546189267, + "grad_norm": 0.5543471574783325, + "learning_rate": 7.666380491241027e-06, + "loss": 0.2511, + "step": 25795 + }, + { + "epoch": 0.5750788039995468, + "grad_norm": 0.4841437339782715, + "learning_rate": 7.662975987874127e-06, + "loss": 0.3244, + "step": 25800 + }, + { + "epoch": 0.5751902533801668, + "grad_norm": 0.786459743976593, + "learning_rate": 7.659571771008333e-06, + "loss": 0.3308, + "step": 25805 + }, + { + "epoch": 0.575301702760787, + "grad_norm": 0.700810968875885, + "learning_rate": 7.656167841060975e-06, + "loss": 0.3623, + "step": 25810 + }, + { + "epoch": 0.575413152141407, + "grad_norm": 0.6509367227554321, + "learning_rate": 7.652764198449342e-06, + "loss": 0.3097, + "step": 25815 + }, + { + "epoch": 0.575524601522027, + "grad_norm": 0.5181035995483398, + "learning_rate": 7.649360843590704e-06, + "loss": 0.2473, + "step": 25820 + }, + { + "epoch": 0.5756360509026471, + "grad_norm": 0.8579918146133423, + "learning_rate": 7.645957776902284e-06, + "loss": 0.3062, + "step": 25825 + }, + { + "epoch": 0.5757475002832672, + "grad_norm": 0.5640930533409119, + "learning_rate": 7.642554998801268e-06, + "loss": 0.3244, + "step": 25830 + }, + { + "epoch": 0.5758589496638873, + "grad_norm": 0.6464223861694336, + "learning_rate": 7.639152509704815e-06, + "loss": 0.3029, + "step": 25835 + }, + { + "epoch": 0.5759703990445073, + "grad_norm": 0.6057314276695251, + "learning_rate": 7.635750310030041e-06, + "loss": 0.2792, + "step": 25840 + }, + { + "epoch": 0.5760818484251273, + "grad_norm": 0.4716472029685974, + "learning_rate": 7.632348400194027e-06, + "loss": 0.3339, + "step": 25845 + }, + { + "epoch": 0.5761932978057475, + "grad_norm": 0.5490631461143494, + "learning_rate": 7.6289467806138225e-06, + "loss": 0.2401, + "step": 25850 + }, + { + "epoch": 0.5763047471863675, + "grad_norm": 0.4313923418521881, + "learning_rate": 7.6255454517064466e-06, + "loss": 0.209, + "step": 25855 + }, + { + "epoch": 0.5764161965669876, + "grad_norm": 0.6104652285575867, + "learning_rate": 7.622144413888868e-06, + "loss": 0.3952, + "step": 25860 + }, + { + "epoch": 0.5765276459476076, + "grad_norm": 0.5978262424468994, + "learning_rate": 7.618743667578029e-06, + "loss": 0.2015, + "step": 25865 + }, + { + "epoch": 0.5766390953282278, + "grad_norm": 0.38337233662605286, + "learning_rate": 7.615343213190838e-06, + "loss": 0.2155, + "step": 25870 + }, + { + "epoch": 0.5767505447088478, + "grad_norm": 0.6329131126403809, + "learning_rate": 7.611943051144159e-06, + "loss": 0.3101, + "step": 25875 + }, + { + "epoch": 0.5768619940894678, + "grad_norm": 0.6377989649772644, + "learning_rate": 7.608543181854832e-06, + "loss": 0.3393, + "step": 25880 + }, + { + "epoch": 0.5769734434700879, + "grad_norm": 0.6451283693313599, + "learning_rate": 7.605143605739648e-06, + "loss": 0.2839, + "step": 25885 + }, + { + "epoch": 0.577084892850708, + "grad_norm": 0.571865975856781, + "learning_rate": 7.601744323215376e-06, + "loss": 0.2775, + "step": 25890 + }, + { + "epoch": 0.5771963422313281, + "grad_norm": 0.45400145649909973, + "learning_rate": 7.598345334698737e-06, + "loss": 0.248, + "step": 25895 + }, + { + "epoch": 0.5773077916119481, + "grad_norm": 0.36165305972099304, + "learning_rate": 7.594946640606423e-06, + "loss": 0.33, + "step": 25900 + }, + { + "epoch": 0.5774192409925681, + "grad_norm": 0.5473405122756958, + "learning_rate": 7.591548241355085e-06, + "loss": 0.3083, + "step": 25905 + }, + { + "epoch": 0.5775306903731883, + "grad_norm": 0.5231678485870361, + "learning_rate": 7.588150137361345e-06, + "loss": 0.3748, + "step": 25910 + }, + { + "epoch": 0.5776421397538083, + "grad_norm": 0.9739239811897278, + "learning_rate": 7.58475232904178e-06, + "loss": 0.3063, + "step": 25915 + }, + { + "epoch": 0.5777535891344284, + "grad_norm": 0.7371828556060791, + "learning_rate": 7.5813548168129335e-06, + "loss": 0.3693, + "step": 25920 + }, + { + "epoch": 0.5778650385150484, + "grad_norm": 0.6414753198623657, + "learning_rate": 7.577957601091319e-06, + "loss": 0.3757, + "step": 25925 + }, + { + "epoch": 0.5779764878956685, + "grad_norm": 0.7251477241516113, + "learning_rate": 7.57456068229341e-06, + "loss": 0.4134, + "step": 25930 + }, + { + "epoch": 0.5780879372762886, + "grad_norm": 0.3665677011013031, + "learning_rate": 7.5711640608356405e-06, + "loss": 0.2698, + "step": 25935 + }, + { + "epoch": 0.5781993866569086, + "grad_norm": 0.5627968907356262, + "learning_rate": 7.567767737134406e-06, + "loss": 0.2704, + "step": 25940 + }, + { + "epoch": 0.5783108360375288, + "grad_norm": 0.5136057734489441, + "learning_rate": 7.564371711606075e-06, + "loss": 0.3786, + "step": 25945 + }, + { + "epoch": 0.5784222854181488, + "grad_norm": 0.6626278758049011, + "learning_rate": 7.560975984666971e-06, + "loss": 0.3273, + "step": 25950 + }, + { + "epoch": 0.5785337347987689, + "grad_norm": 0.25327253341674805, + "learning_rate": 7.557580556733384e-06, + "loss": 0.2196, + "step": 25955 + }, + { + "epoch": 0.5786451841793889, + "grad_norm": 0.7648833990097046, + "learning_rate": 7.554185428221569e-06, + "loss": 0.2873, + "step": 25960 + }, + { + "epoch": 0.578756633560009, + "grad_norm": 0.5665774941444397, + "learning_rate": 7.550790599547742e-06, + "loss": 0.2172, + "step": 25965 + }, + { + "epoch": 0.5788680829406291, + "grad_norm": 0.5095011591911316, + "learning_rate": 7.547396071128082e-06, + "loss": 0.2808, + "step": 25970 + }, + { + "epoch": 0.5789795323212491, + "grad_norm": 0.7390519976615906, + "learning_rate": 7.544001843378734e-06, + "loss": 0.3288, + "step": 25975 + }, + { + "epoch": 0.5790909817018692, + "grad_norm": 0.4616258442401886, + "learning_rate": 7.540607916715803e-06, + "loss": 0.1979, + "step": 25980 + }, + { + "epoch": 0.5792024310824893, + "grad_norm": 0.7574527859687805, + "learning_rate": 7.537214291555356e-06, + "loss": 0.2186, + "step": 25985 + }, + { + "epoch": 0.5793138804631093, + "grad_norm": 0.6136702299118042, + "learning_rate": 7.533820968313425e-06, + "loss": 0.2186, + "step": 25990 + }, + { + "epoch": 0.5794253298437294, + "grad_norm": 0.5772611498832703, + "learning_rate": 7.530427947406011e-06, + "loss": 0.334, + "step": 25995 + }, + { + "epoch": 0.5795367792243494, + "grad_norm": 0.4464194178581238, + "learning_rate": 7.527035229249066e-06, + "loss": 0.2133, + "step": 26000 + }, + { + "epoch": 0.5796482286049696, + "grad_norm": 0.473961740732193, + "learning_rate": 7.523642814258516e-06, + "loss": 0.1995, + "step": 26005 + }, + { + "epoch": 0.5797596779855896, + "grad_norm": 0.6521080136299133, + "learning_rate": 7.520250702850242e-06, + "loss": 0.2104, + "step": 26010 + }, + { + "epoch": 0.5798711273662097, + "grad_norm": 0.5367646217346191, + "learning_rate": 7.51685889544009e-06, + "loss": 0.2848, + "step": 26015 + }, + { + "epoch": 0.5799825767468297, + "grad_norm": 0.5130108594894409, + "learning_rate": 7.513467392443872e-06, + "loss": 0.3441, + "step": 26020 + }, + { + "epoch": 0.5800940261274498, + "grad_norm": 0.5340881943702698, + "learning_rate": 7.5100761942773585e-06, + "loss": 0.3413, + "step": 26025 + }, + { + "epoch": 0.5802054755080699, + "grad_norm": 0.7103127241134644, + "learning_rate": 7.50668530135628e-06, + "loss": 0.3688, + "step": 26030 + }, + { + "epoch": 0.5803169248886899, + "grad_norm": 0.6038323640823364, + "learning_rate": 7.503294714096341e-06, + "loss": 0.3647, + "step": 26035 + }, + { + "epoch": 0.58042837426931, + "grad_norm": 0.533886194229126, + "learning_rate": 7.4999044329132e-06, + "loss": 0.426, + "step": 26040 + }, + { + "epoch": 0.5805398236499301, + "grad_norm": 0.48127058148384094, + "learning_rate": 7.496514458222475e-06, + "loss": 0.2537, + "step": 26045 + }, + { + "epoch": 0.5806512730305501, + "grad_norm": 0.5195320248603821, + "learning_rate": 7.4931247904397564e-06, + "loss": 0.2595, + "step": 26050 + }, + { + "epoch": 0.5807627224111702, + "grad_norm": 0.9014118313789368, + "learning_rate": 7.489735429980589e-06, + "loss": 0.348, + "step": 26055 + }, + { + "epoch": 0.5808741717917902, + "grad_norm": 0.7020335793495178, + "learning_rate": 7.4863463772604785e-06, + "loss": 0.2999, + "step": 26060 + }, + { + "epoch": 0.5809856211724104, + "grad_norm": 0.7003258466720581, + "learning_rate": 7.482957632694898e-06, + "loss": 0.2775, + "step": 26065 + }, + { + "epoch": 0.5810970705530304, + "grad_norm": 0.5625821948051453, + "learning_rate": 7.479569196699287e-06, + "loss": 0.2492, + "step": 26070 + }, + { + "epoch": 0.5812085199336504, + "grad_norm": 0.6482052803039551, + "learning_rate": 7.476181069689038e-06, + "loss": 0.2566, + "step": 26075 + }, + { + "epoch": 0.5813199693142705, + "grad_norm": 0.6843783855438232, + "learning_rate": 7.472793252079506e-06, + "loss": 0.3194, + "step": 26080 + }, + { + "epoch": 0.5814314186948906, + "grad_norm": 0.8143817782402039, + "learning_rate": 7.469405744286018e-06, + "loss": 0.3681, + "step": 26085 + }, + { + "epoch": 0.5815428680755107, + "grad_norm": 0.5542212724685669, + "learning_rate": 7.46601854672385e-06, + "loss": 0.207, + "step": 26090 + }, + { + "epoch": 0.5816543174561307, + "grad_norm": 0.4516535699367523, + "learning_rate": 7.462631659808251e-06, + "loss": 0.3429, + "step": 26095 + }, + { + "epoch": 0.5817657668367509, + "grad_norm": 0.4697490930557251, + "learning_rate": 7.45924508395442e-06, + "loss": 0.2176, + "step": 26100 + }, + { + "epoch": 0.5818772162173709, + "grad_norm": 0.6717570424079895, + "learning_rate": 7.455858819577535e-06, + "loss": 0.3194, + "step": 26105 + }, + { + "epoch": 0.5819886655979909, + "grad_norm": 0.8826180100440979, + "learning_rate": 7.452472867092718e-06, + "loss": 0.3914, + "step": 26110 + }, + { + "epoch": 0.582100114978611, + "grad_norm": 0.44467124342918396, + "learning_rate": 7.449087226915066e-06, + "loss": 0.2664, + "step": 26115 + }, + { + "epoch": 0.582211564359231, + "grad_norm": 0.5231371521949768, + "learning_rate": 7.4457018994596305e-06, + "loss": 0.3211, + "step": 26120 + }, + { + "epoch": 0.5823230137398512, + "grad_norm": 0.50714111328125, + "learning_rate": 7.442316885141423e-06, + "loss": 0.2728, + "step": 26125 + }, + { + "epoch": 0.5824344631204712, + "grad_norm": 0.752540111541748, + "learning_rate": 7.438932184375427e-06, + "loss": 0.3526, + "step": 26130 + }, + { + "epoch": 0.5825459125010912, + "grad_norm": 0.8013758063316345, + "learning_rate": 7.435547797576571e-06, + "loss": 0.278, + "step": 26135 + }, + { + "epoch": 0.5826573618817114, + "grad_norm": 0.7866904735565186, + "learning_rate": 7.4321637251597625e-06, + "loss": 0.2892, + "step": 26140 + }, + { + "epoch": 0.5827688112623314, + "grad_norm": 0.5477409362792969, + "learning_rate": 7.428779967539863e-06, + "loss": 0.2566, + "step": 26145 + }, + { + "epoch": 0.5828802606429515, + "grad_norm": 0.5327290892601013, + "learning_rate": 7.425396525131694e-06, + "loss": 0.3314, + "step": 26150 + }, + { + "epoch": 0.5829917100235715, + "grad_norm": 0.8647708892822266, + "learning_rate": 7.422013398350035e-06, + "loss": 0.2462, + "step": 26155 + }, + { + "epoch": 0.5831031594041917, + "grad_norm": 0.7855604290962219, + "learning_rate": 7.418630587609636e-06, + "loss": 0.4267, + "step": 26160 + }, + { + "epoch": 0.5832146087848117, + "grad_norm": 0.6840621829032898, + "learning_rate": 7.415248093325203e-06, + "loss": 0.22, + "step": 26165 + }, + { + "epoch": 0.5833260581654317, + "grad_norm": 0.7037733197212219, + "learning_rate": 7.411865915911397e-06, + "loss": 0.3764, + "step": 26170 + }, + { + "epoch": 0.5834375075460518, + "grad_norm": 0.5002277493476868, + "learning_rate": 7.408484055782854e-06, + "loss": 0.1966, + "step": 26175 + }, + { + "epoch": 0.5835489569266719, + "grad_norm": 0.6145541667938232, + "learning_rate": 7.405102513354166e-06, + "loss": 0.3244, + "step": 26180 + }, + { + "epoch": 0.583660406307292, + "grad_norm": 0.515566349029541, + "learning_rate": 7.4017212890398786e-06, + "loss": 0.343, + "step": 26185 + }, + { + "epoch": 0.583771855687912, + "grad_norm": 0.5336402654647827, + "learning_rate": 7.398340383254507e-06, + "loss": 0.258, + "step": 26190 + }, + { + "epoch": 0.583883305068532, + "grad_norm": 0.5561584830284119, + "learning_rate": 7.394959796412522e-06, + "loss": 0.285, + "step": 26195 + }, + { + "epoch": 0.5839947544491522, + "grad_norm": 0.8551725149154663, + "learning_rate": 7.3915795289283565e-06, + "loss": 0.2469, + "step": 26200 + }, + { + "epoch": 0.5841062038297722, + "grad_norm": 1.5717085599899292, + "learning_rate": 7.388199581216404e-06, + "loss": 0.3854, + "step": 26205 + }, + { + "epoch": 0.5842176532103923, + "grad_norm": 0.5521072745323181, + "learning_rate": 7.384819953691028e-06, + "loss": 0.2584, + "step": 26210 + }, + { + "epoch": 0.5843291025910123, + "grad_norm": 0.4260997474193573, + "learning_rate": 7.38144064676654e-06, + "loss": 0.3264, + "step": 26215 + }, + { + "epoch": 0.5844405519716325, + "grad_norm": 0.7784475684165955, + "learning_rate": 7.378061660857213e-06, + "loss": 0.3542, + "step": 26220 + }, + { + "epoch": 0.5845520013522525, + "grad_norm": 0.5338385701179504, + "learning_rate": 7.374682996377292e-06, + "loss": 0.3085, + "step": 26225 + }, + { + "epoch": 0.5846634507328725, + "grad_norm": 0.5612271428108215, + "learning_rate": 7.371304653740967e-06, + "loss": 0.2138, + "step": 26230 + }, + { + "epoch": 0.5847749001134926, + "grad_norm": 0.5885851979255676, + "learning_rate": 7.367926633362405e-06, + "loss": 0.272, + "step": 26235 + }, + { + "epoch": 0.5848863494941127, + "grad_norm": 0.6166054606437683, + "learning_rate": 7.364548935655717e-06, + "loss": 0.2572, + "step": 26240 + }, + { + "epoch": 0.5849977988747328, + "grad_norm": 0.4326198697090149, + "learning_rate": 7.36117156103499e-06, + "loss": 0.3473, + "step": 26245 + }, + { + "epoch": 0.5851092482553528, + "grad_norm": 0.9689270853996277, + "learning_rate": 7.35779450991426e-06, + "loss": 0.3694, + "step": 26250 + }, + { + "epoch": 0.5852206976359728, + "grad_norm": 0.6381075382232666, + "learning_rate": 7.354417782707529e-06, + "loss": 0.4034, + "step": 26255 + }, + { + "epoch": 0.585332147016593, + "grad_norm": 0.8201387524604797, + "learning_rate": 7.351041379828756e-06, + "loss": 0.3866, + "step": 26260 + }, + { + "epoch": 0.585443596397213, + "grad_norm": 0.39132338762283325, + "learning_rate": 7.347665301691865e-06, + "loss": 0.2545, + "step": 26265 + }, + { + "epoch": 0.5855550457778331, + "grad_norm": 0.8254690766334534, + "learning_rate": 7.344289548710734e-06, + "loss": 0.2786, + "step": 26270 + }, + { + "epoch": 0.5856664951584531, + "grad_norm": 0.697370707988739, + "learning_rate": 7.340914121299203e-06, + "loss": 0.2387, + "step": 26275 + }, + { + "epoch": 0.5857779445390732, + "grad_norm": 0.5445848703384399, + "learning_rate": 7.337539019871078e-06, + "loss": 0.1327, + "step": 26280 + }, + { + "epoch": 0.5858893939196933, + "grad_norm": 0.44236284494400024, + "learning_rate": 7.334164244840118e-06, + "loss": 0.2674, + "step": 26285 + }, + { + "epoch": 0.5860008433003133, + "grad_norm": 0.6897943019866943, + "learning_rate": 7.330789796620047e-06, + "loss": 0.3231, + "step": 26290 + }, + { + "epoch": 0.5861122926809335, + "grad_norm": 0.4481407105922699, + "learning_rate": 7.327415675624541e-06, + "loss": 0.2518, + "step": 26295 + }, + { + "epoch": 0.5862237420615535, + "grad_norm": 0.6133534908294678, + "learning_rate": 7.3240418822672454e-06, + "loss": 0.2758, + "step": 26300 + }, + { + "epoch": 0.5863351914421736, + "grad_norm": 0.6441121697425842, + "learning_rate": 7.320668416961758e-06, + "loss": 0.2207, + "step": 26305 + }, + { + "epoch": 0.5864466408227936, + "grad_norm": 0.9060331583023071, + "learning_rate": 7.317295280121639e-06, + "loss": 0.3035, + "step": 26310 + }, + { + "epoch": 0.5865580902034137, + "grad_norm": 0.3640364408493042, + "learning_rate": 7.313922472160415e-06, + "loss": 0.2682, + "step": 26315 + }, + { + "epoch": 0.5866695395840338, + "grad_norm": 0.5085642337799072, + "learning_rate": 7.310549993491565e-06, + "loss": 0.3085, + "step": 26320 + }, + { + "epoch": 0.5867809889646538, + "grad_norm": 0.6459940671920776, + "learning_rate": 7.307177844528521e-06, + "loss": 0.3039, + "step": 26325 + }, + { + "epoch": 0.5868924383452739, + "grad_norm": 0.7363597750663757, + "learning_rate": 7.303806025684692e-06, + "loss": 0.3756, + "step": 26330 + }, + { + "epoch": 0.587003887725894, + "grad_norm": 0.2980023920536041, + "learning_rate": 7.300434537373433e-06, + "loss": 0.2816, + "step": 26335 + }, + { + "epoch": 0.587115337106514, + "grad_norm": 0.5422041416168213, + "learning_rate": 7.29706338000806e-06, + "loss": 0.3713, + "step": 26340 + }, + { + "epoch": 0.5872267864871341, + "grad_norm": 0.585302472114563, + "learning_rate": 7.29369255400185e-06, + "loss": 0.3396, + "step": 26345 + }, + { + "epoch": 0.5873382358677541, + "grad_norm": 0.7089430689811707, + "learning_rate": 7.290322059768049e-06, + "loss": 0.3635, + "step": 26350 + }, + { + "epoch": 0.5874496852483743, + "grad_norm": 0.6561121344566345, + "learning_rate": 7.286951897719845e-06, + "loss": 0.2654, + "step": 26355 + }, + { + "epoch": 0.5875611346289943, + "grad_norm": 0.43730154633522034, + "learning_rate": 7.283582068270398e-06, + "loss": 0.3302, + "step": 26360 + }, + { + "epoch": 0.5876725840096144, + "grad_norm": 0.7563340663909912, + "learning_rate": 7.280212571832824e-06, + "loss": 0.2343, + "step": 26365 + }, + { + "epoch": 0.5877840333902344, + "grad_norm": 0.9669647216796875, + "learning_rate": 7.2768434088201924e-06, + "loss": 0.3343, + "step": 26370 + }, + { + "epoch": 0.5878954827708545, + "grad_norm": 0.5285956263542175, + "learning_rate": 7.273474579645541e-06, + "loss": 0.4108, + "step": 26375 + }, + { + "epoch": 0.5880069321514746, + "grad_norm": 0.6726597547531128, + "learning_rate": 7.270106084721856e-06, + "loss": 0.4043, + "step": 26380 + }, + { + "epoch": 0.5881183815320946, + "grad_norm": 0.5744706392288208, + "learning_rate": 7.2667379244621e-06, + "loss": 0.2229, + "step": 26385 + }, + { + "epoch": 0.5882298309127147, + "grad_norm": 0.4486239552497864, + "learning_rate": 7.263370099279173e-06, + "loss": 0.2721, + "step": 26390 + }, + { + "epoch": 0.5883412802933348, + "grad_norm": 0.6839447617530823, + "learning_rate": 7.260002609585949e-06, + "loss": 0.4493, + "step": 26395 + }, + { + "epoch": 0.5884527296739548, + "grad_norm": 0.46326157450675964, + "learning_rate": 7.256635455795256e-06, + "loss": 0.2774, + "step": 26400 + }, + { + "epoch": 0.5885641790545749, + "grad_norm": 0.49713730812072754, + "learning_rate": 7.25326863831988e-06, + "loss": 0.2504, + "step": 26405 + }, + { + "epoch": 0.5886756284351949, + "grad_norm": 0.39275237917900085, + "learning_rate": 7.24990215757257e-06, + "loss": 0.2447, + "step": 26410 + }, + { + "epoch": 0.5887870778158151, + "grad_norm": 0.5270885229110718, + "learning_rate": 7.246536013966021e-06, + "loss": 0.2935, + "step": 26415 + }, + { + "epoch": 0.5888985271964351, + "grad_norm": 0.6464756727218628, + "learning_rate": 7.243170207912909e-06, + "loss": 0.327, + "step": 26420 + }, + { + "epoch": 0.5890099765770552, + "grad_norm": 0.9072006344795227, + "learning_rate": 7.239804739825849e-06, + "loss": 0.3532, + "step": 26425 + }, + { + "epoch": 0.5891214259576752, + "grad_norm": 0.5739597678184509, + "learning_rate": 7.2364396101174235e-06, + "loss": 0.3507, + "step": 26430 + }, + { + "epoch": 0.5892328753382953, + "grad_norm": 0.820540189743042, + "learning_rate": 7.233074819200169e-06, + "loss": 0.2704, + "step": 26435 + }, + { + "epoch": 0.5893443247189154, + "grad_norm": 0.6239163279533386, + "learning_rate": 7.229710367486585e-06, + "loss": 0.2627, + "step": 26440 + }, + { + "epoch": 0.5894557740995354, + "grad_norm": 0.6565598249435425, + "learning_rate": 7.226346255389125e-06, + "loss": 0.2404, + "step": 26445 + }, + { + "epoch": 0.5895672234801556, + "grad_norm": 0.7946537137031555, + "learning_rate": 7.222982483320204e-06, + "loss": 0.2855, + "step": 26450 + }, + { + "epoch": 0.5896786728607756, + "grad_norm": 0.85628741979599, + "learning_rate": 7.219619051692198e-06, + "loss": 0.2938, + "step": 26455 + }, + { + "epoch": 0.5897901222413956, + "grad_norm": 0.9541513323783875, + "learning_rate": 7.216255960917435e-06, + "loss": 0.3255, + "step": 26460 + }, + { + "epoch": 0.5899015716220157, + "grad_norm": 0.6425460577011108, + "learning_rate": 7.212893211408203e-06, + "loss": 0.3304, + "step": 26465 + }, + { + "epoch": 0.5900130210026358, + "grad_norm": 0.6187822818756104, + "learning_rate": 7.209530803576752e-06, + "loss": 0.2497, + "step": 26470 + }, + { + "epoch": 0.5901244703832559, + "grad_norm": 0.6000394225120544, + "learning_rate": 7.206168737835284e-06, + "loss": 0.3631, + "step": 26475 + }, + { + "epoch": 0.5902359197638759, + "grad_norm": 0.7396858930587769, + "learning_rate": 7.202807014595962e-06, + "loss": 0.2548, + "step": 26480 + }, + { + "epoch": 0.5903473691444959, + "grad_norm": 0.8461358547210693, + "learning_rate": 7.199445634270908e-06, + "loss": 0.1718, + "step": 26485 + }, + { + "epoch": 0.5904588185251161, + "grad_norm": 0.44261434674263, + "learning_rate": 7.196084597272206e-06, + "loss": 0.2134, + "step": 26490 + }, + { + "epoch": 0.5905702679057361, + "grad_norm": 0.597154438495636, + "learning_rate": 7.1927239040118865e-06, + "loss": 0.2971, + "step": 26495 + }, + { + "epoch": 0.5906817172863562, + "grad_norm": 0.5538870692253113, + "learning_rate": 7.189363554901951e-06, + "loss": 0.2702, + "step": 26500 + }, + { + "epoch": 0.5907931666669762, + "grad_norm": 0.6188107132911682, + "learning_rate": 7.186003550354348e-06, + "loss": 0.3845, + "step": 26505 + }, + { + "epoch": 0.5909046160475964, + "grad_norm": 0.7189980149269104, + "learning_rate": 7.182643890780987e-06, + "loss": 0.2398, + "step": 26510 + }, + { + "epoch": 0.5910160654282164, + "grad_norm": 0.4853551387786865, + "learning_rate": 7.179284576593741e-06, + "loss": 0.3118, + "step": 26515 + }, + { + "epoch": 0.5911275148088364, + "grad_norm": 0.6885067820549011, + "learning_rate": 7.175925608204428e-06, + "loss": 0.1962, + "step": 26520 + }, + { + "epoch": 0.5912389641894565, + "grad_norm": 0.7876604795455933, + "learning_rate": 7.172566986024843e-06, + "loss": 0.3829, + "step": 26525 + }, + { + "epoch": 0.5913504135700766, + "grad_norm": 0.460191011428833, + "learning_rate": 7.169208710466717e-06, + "loss": 0.2921, + "step": 26530 + }, + { + "epoch": 0.5914618629506967, + "grad_norm": 0.7385380864143372, + "learning_rate": 7.165850781941757e-06, + "loss": 0.3389, + "step": 26535 + }, + { + "epoch": 0.5915733123313167, + "grad_norm": 0.7774286270141602, + "learning_rate": 7.162493200861611e-06, + "loss": 0.2618, + "step": 26540 + }, + { + "epoch": 0.5916847617119367, + "grad_norm": 0.4776510000228882, + "learning_rate": 7.159135967637898e-06, + "loss": 0.2776, + "step": 26545 + }, + { + "epoch": 0.5917962110925569, + "grad_norm": 0.42568615078926086, + "learning_rate": 7.155779082682188e-06, + "loss": 0.3234, + "step": 26550 + }, + { + "epoch": 0.5919076604731769, + "grad_norm": 0.620646595954895, + "learning_rate": 7.152422546406007e-06, + "loss": 0.3686, + "step": 26555 + }, + { + "epoch": 0.592019109853797, + "grad_norm": 0.516928493976593, + "learning_rate": 7.14906635922084e-06, + "loss": 0.3181, + "step": 26560 + }, + { + "epoch": 0.592130559234417, + "grad_norm": 0.6994913220405579, + "learning_rate": 7.145710521538134e-06, + "loss": 0.4317, + "step": 26565 + }, + { + "epoch": 0.5922420086150372, + "grad_norm": 0.8918303847312927, + "learning_rate": 7.142355033769286e-06, + "loss": 0.215, + "step": 26570 + }, + { + "epoch": 0.5923534579956572, + "grad_norm": 0.8608641028404236, + "learning_rate": 7.138999896325654e-06, + "loss": 0.3661, + "step": 26575 + }, + { + "epoch": 0.5924649073762772, + "grad_norm": 0.5170380473136902, + "learning_rate": 7.135645109618552e-06, + "loss": 0.2329, + "step": 26580 + }, + { + "epoch": 0.5925763567568973, + "grad_norm": 0.5056709051132202, + "learning_rate": 7.1322906740592476e-06, + "loss": 0.3267, + "step": 26585 + }, + { + "epoch": 0.5926878061375174, + "grad_norm": 0.7833072543144226, + "learning_rate": 7.128936590058973e-06, + "loss": 0.2842, + "step": 26590 + }, + { + "epoch": 0.5927992555181375, + "grad_norm": 0.5779724717140198, + "learning_rate": 7.125582858028908e-06, + "loss": 0.2362, + "step": 26595 + }, + { + "epoch": 0.5929107048987575, + "grad_norm": 0.731823742389679, + "learning_rate": 7.1222294783802015e-06, + "loss": 0.3527, + "step": 26600 + }, + { + "epoch": 0.5930221542793775, + "grad_norm": 0.5753491520881653, + "learning_rate": 7.118876451523946e-06, + "loss": 0.3011, + "step": 26605 + }, + { + "epoch": 0.5931336036599977, + "grad_norm": 0.4684869050979614, + "learning_rate": 7.1155237778712006e-06, + "loss": 0.2555, + "step": 26610 + }, + { + "epoch": 0.5932450530406177, + "grad_norm": 0.5428973436355591, + "learning_rate": 7.112171457832973e-06, + "loss": 0.282, + "step": 26615 + }, + { + "epoch": 0.5933565024212378, + "grad_norm": 0.6345245242118835, + "learning_rate": 7.1088194918202355e-06, + "loss": 0.2939, + "step": 26620 + }, + { + "epoch": 0.5934679518018579, + "grad_norm": 0.8436496257781982, + "learning_rate": 7.105467880243913e-06, + "loss": 0.3502, + "step": 26625 + }, + { + "epoch": 0.593579401182478, + "grad_norm": 0.5694996118545532, + "learning_rate": 7.102116623514881e-06, + "loss": 0.2252, + "step": 26630 + }, + { + "epoch": 0.593690850563098, + "grad_norm": 1.0282306671142578, + "learning_rate": 7.098765722043985e-06, + "loss": 0.3636, + "step": 26635 + }, + { + "epoch": 0.593802299943718, + "grad_norm": 0.7384717464447021, + "learning_rate": 7.09541517624202e-06, + "loss": 0.2732, + "step": 26640 + }, + { + "epoch": 0.5939137493243382, + "grad_norm": 0.7744261622428894, + "learning_rate": 7.0920649865197336e-06, + "loss": 0.4055, + "step": 26645 + }, + { + "epoch": 0.5940251987049582, + "grad_norm": 0.7789164781570435, + "learning_rate": 7.088715153287833e-06, + "loss": 0.3471, + "step": 26650 + }, + { + "epoch": 0.5941366480855783, + "grad_norm": 0.7392175793647766, + "learning_rate": 7.085365676956983e-06, + "loss": 0.4146, + "step": 26655 + }, + { + "epoch": 0.5942480974661983, + "grad_norm": 0.43524813652038574, + "learning_rate": 7.0820165579378035e-06, + "loss": 0.3001, + "step": 26660 + }, + { + "epoch": 0.5943595468468184, + "grad_norm": 0.5696089863777161, + "learning_rate": 7.078667796640868e-06, + "loss": 0.2282, + "step": 26665 + }, + { + "epoch": 0.5944709962274385, + "grad_norm": 0.5634101033210754, + "learning_rate": 7.075319393476716e-06, + "loss": 0.2777, + "step": 26670 + }, + { + "epoch": 0.5945824456080585, + "grad_norm": 0.5696731805801392, + "learning_rate": 7.07197134885583e-06, + "loss": 0.3796, + "step": 26675 + }, + { + "epoch": 0.5946938949886786, + "grad_norm": 0.5457412600517273, + "learning_rate": 7.068623663188654e-06, + "loss": 0.2276, + "step": 26680 + }, + { + "epoch": 0.5948053443692987, + "grad_norm": 0.792374849319458, + "learning_rate": 7.065276336885594e-06, + "loss": 0.3329, + "step": 26685 + }, + { + "epoch": 0.5949167937499187, + "grad_norm": 0.6463175415992737, + "learning_rate": 7.061929370357001e-06, + "loss": 0.2245, + "step": 26690 + }, + { + "epoch": 0.5950282431305388, + "grad_norm": 0.578173041343689, + "learning_rate": 7.058582764013187e-06, + "loss": 0.3041, + "step": 26695 + }, + { + "epoch": 0.5951396925111588, + "grad_norm": 0.6091515421867371, + "learning_rate": 7.055236518264419e-06, + "loss": 0.2944, + "step": 26700 + }, + { + "epoch": 0.595251141891779, + "grad_norm": 0.7556412816047668, + "learning_rate": 7.051890633520928e-06, + "loss": 0.3857, + "step": 26705 + }, + { + "epoch": 0.595362591272399, + "grad_norm": 0.3958643674850464, + "learning_rate": 7.048545110192888e-06, + "loss": 0.2731, + "step": 26710 + }, + { + "epoch": 0.5954740406530191, + "grad_norm": 0.23419268429279327, + "learning_rate": 7.045199948690438e-06, + "loss": 0.2714, + "step": 26715 + }, + { + "epoch": 0.5955854900336391, + "grad_norm": 0.7274675965309143, + "learning_rate": 7.0418551494236665e-06, + "loss": 0.2134, + "step": 26720 + }, + { + "epoch": 0.5956969394142592, + "grad_norm": 0.6608673930168152, + "learning_rate": 7.0385107128026174e-06, + "loss": 0.2355, + "step": 26725 + }, + { + "epoch": 0.5958083887948793, + "grad_norm": 0.8258979916572571, + "learning_rate": 7.035166639237299e-06, + "loss": 0.2381, + "step": 26730 + }, + { + "epoch": 0.5959198381754993, + "grad_norm": 0.4890570044517517, + "learning_rate": 7.031822929137661e-06, + "loss": 0.2922, + "step": 26735 + }, + { + "epoch": 0.5960312875561194, + "grad_norm": 0.5419553518295288, + "learning_rate": 7.028479582913625e-06, + "loss": 0.352, + "step": 26740 + }, + { + "epoch": 0.5961427369367395, + "grad_norm": 0.5165709853172302, + "learning_rate": 7.025136600975054e-06, + "loss": 0.3279, + "step": 26745 + }, + { + "epoch": 0.5962541863173595, + "grad_norm": 0.6278826594352722, + "learning_rate": 7.021793983731775e-06, + "loss": 0.2826, + "step": 26750 + }, + { + "epoch": 0.5963656356979796, + "grad_norm": 0.9872399568557739, + "learning_rate": 7.018451731593564e-06, + "loss": 0.4879, + "step": 26755 + }, + { + "epoch": 0.5964770850785996, + "grad_norm": 0.32107967138290405, + "learning_rate": 7.015109844970158e-06, + "loss": 0.2365, + "step": 26760 + }, + { + "epoch": 0.5965885344592198, + "grad_norm": 0.6850024461746216, + "learning_rate": 7.011768324271246e-06, + "loss": 0.2676, + "step": 26765 + }, + { + "epoch": 0.5966999838398398, + "grad_norm": 0.6033693552017212, + "learning_rate": 7.008427169906466e-06, + "loss": 0.3631, + "step": 26770 + }, + { + "epoch": 0.5968114332204599, + "grad_norm": 0.6413952708244324, + "learning_rate": 7.005086382285426e-06, + "loss": 0.2698, + "step": 26775 + }, + { + "epoch": 0.59692288260108, + "grad_norm": 0.3277706801891327, + "learning_rate": 7.001745961817682e-06, + "loss": 0.267, + "step": 26780 + }, + { + "epoch": 0.5970343319817, + "grad_norm": 0.6097815632820129, + "learning_rate": 6.9984059089127394e-06, + "loss": 0.2035, + "step": 26785 + }, + { + "epoch": 0.5971457813623201, + "grad_norm": 0.736315131187439, + "learning_rate": 6.9950662239800605e-06, + "loss": 0.2213, + "step": 26790 + }, + { + "epoch": 0.5972572307429401, + "grad_norm": 1.0850633382797241, + "learning_rate": 6.991726907429072e-06, + "loss": 0.2451, + "step": 26795 + }, + { + "epoch": 0.5973686801235603, + "grad_norm": 0.6129161715507507, + "learning_rate": 6.988387959669141e-06, + "loss": 0.3486, + "step": 26800 + }, + { + "epoch": 0.5974801295041803, + "grad_norm": 0.8613978624343872, + "learning_rate": 6.985049381109599e-06, + "loss": 0.4207, + "step": 26805 + }, + { + "epoch": 0.5975915788848003, + "grad_norm": 0.615011990070343, + "learning_rate": 6.981711172159733e-06, + "loss": 0.3018, + "step": 26810 + }, + { + "epoch": 0.5977030282654204, + "grad_norm": 0.6447067260742188, + "learning_rate": 6.978373333228782e-06, + "loss": 0.3114, + "step": 26815 + }, + { + "epoch": 0.5978144776460405, + "grad_norm": 0.6088956594467163, + "learning_rate": 6.975035864725934e-06, + "loss": 0.3613, + "step": 26820 + }, + { + "epoch": 0.5979259270266606, + "grad_norm": 0.39137575030326843, + "learning_rate": 6.971698767060342e-06, + "loss": 0.2536, + "step": 26825 + }, + { + "epoch": 0.5980373764072806, + "grad_norm": 0.6503191590309143, + "learning_rate": 6.9683620406411036e-06, + "loss": 0.3193, + "step": 26830 + }, + { + "epoch": 0.5981488257879006, + "grad_norm": 0.46391865611076355, + "learning_rate": 6.965025685877281e-06, + "loss": 0.2987, + "step": 26835 + }, + { + "epoch": 0.5982602751685208, + "grad_norm": 0.5320603847503662, + "learning_rate": 6.961689703177879e-06, + "loss": 0.2605, + "step": 26840 + }, + { + "epoch": 0.5983717245491408, + "grad_norm": 0.6308834552764893, + "learning_rate": 6.95835409295187e-06, + "loss": 0.2241, + "step": 26845 + }, + { + "epoch": 0.5984831739297609, + "grad_norm": 0.5442749857902527, + "learning_rate": 6.955018855608171e-06, + "loss": 0.2204, + "step": 26850 + }, + { + "epoch": 0.5985946233103809, + "grad_norm": 0.7064063549041748, + "learning_rate": 6.951683991555658e-06, + "loss": 0.3439, + "step": 26855 + }, + { + "epoch": 0.5987060726910011, + "grad_norm": 0.9378949999809265, + "learning_rate": 6.948349501203157e-06, + "loss": 0.3433, + "step": 26860 + }, + { + "epoch": 0.5988175220716211, + "grad_norm": 1.0083317756652832, + "learning_rate": 6.945015384959452e-06, + "loss": 0.3341, + "step": 26865 + }, + { + "epoch": 0.5989289714522411, + "grad_norm": 0.7770516872406006, + "learning_rate": 6.941681643233281e-06, + "loss": 0.4177, + "step": 26870 + }, + { + "epoch": 0.5990404208328612, + "grad_norm": 0.7873477935791016, + "learning_rate": 6.93834827643333e-06, + "loss": 0.2319, + "step": 26875 + }, + { + "epoch": 0.5991518702134813, + "grad_norm": 0.4507529139518738, + "learning_rate": 6.9350152849682515e-06, + "loss": 0.2637, + "step": 26880 + }, + { + "epoch": 0.5992633195941014, + "grad_norm": 0.5220516324043274, + "learning_rate": 6.93168266924664e-06, + "loss": 0.3425, + "step": 26885 + }, + { + "epoch": 0.5993747689747214, + "grad_norm": 0.5274975895881653, + "learning_rate": 6.928350429677051e-06, + "loss": 0.2999, + "step": 26890 + }, + { + "epoch": 0.5994862183553414, + "grad_norm": 0.4189387261867523, + "learning_rate": 6.925018566667988e-06, + "loss": 0.2648, + "step": 26895 + }, + { + "epoch": 0.5995976677359616, + "grad_norm": 0.6953780651092529, + "learning_rate": 6.9216870806279156e-06, + "loss": 0.2682, + "step": 26900 + }, + { + "epoch": 0.5997091171165816, + "grad_norm": 0.574653148651123, + "learning_rate": 6.918355971965247e-06, + "loss": 0.2319, + "step": 26905 + }, + { + "epoch": 0.5998205664972017, + "grad_norm": 0.34783512353897095, + "learning_rate": 6.915025241088344e-06, + "loss": 0.3696, + "step": 26910 + }, + { + "epoch": 0.5999320158778217, + "grad_norm": 0.5067845582962036, + "learning_rate": 6.911694888405536e-06, + "loss": 0.2181, + "step": 26915 + }, + { + "epoch": 0.6000434652584419, + "grad_norm": 0.6084128022193909, + "learning_rate": 6.9083649143251e-06, + "loss": 0.4194, + "step": 26920 + }, + { + "epoch": 0.6001549146390619, + "grad_norm": 0.6096634864807129, + "learning_rate": 6.905035319255259e-06, + "loss": 0.3532, + "step": 26925 + }, + { + "epoch": 0.6002663640196819, + "grad_norm": 0.7994610071182251, + "learning_rate": 6.9017061036042e-06, + "loss": 0.3452, + "step": 26930 + }, + { + "epoch": 0.600377813400302, + "grad_norm": 0.5439140796661377, + "learning_rate": 6.898377267780059e-06, + "loss": 0.2444, + "step": 26935 + }, + { + "epoch": 0.6004892627809221, + "grad_norm": 0.5204330086708069, + "learning_rate": 6.895048812190921e-06, + "loss": 0.2858, + "step": 26940 + }, + { + "epoch": 0.6006007121615422, + "grad_norm": 0.5798391699790955, + "learning_rate": 6.8917207372448295e-06, + "loss": 0.4046, + "step": 26945 + }, + { + "epoch": 0.6007121615421622, + "grad_norm": 0.5817235708236694, + "learning_rate": 6.8883930433497884e-06, + "loss": 0.2308, + "step": 26950 + }, + { + "epoch": 0.6008236109227822, + "grad_norm": 0.4607628583908081, + "learning_rate": 6.885065730913741e-06, + "loss": 0.3217, + "step": 26955 + }, + { + "epoch": 0.6009350603034024, + "grad_norm": 0.5161969065666199, + "learning_rate": 6.881738800344591e-06, + "loss": 0.2642, + "step": 26960 + }, + { + "epoch": 0.6010465096840224, + "grad_norm": 0.8000197410583496, + "learning_rate": 6.878412252050196e-06, + "loss": 0.3186, + "step": 26965 + }, + { + "epoch": 0.6011579590646425, + "grad_norm": 0.7184973955154419, + "learning_rate": 6.875086086438363e-06, + "loss": 0.2563, + "step": 26970 + }, + { + "epoch": 0.6012694084452626, + "grad_norm": 0.6139436960220337, + "learning_rate": 6.871760303916855e-06, + "loss": 0.2642, + "step": 26975 + }, + { + "epoch": 0.6013808578258827, + "grad_norm": 0.4902457892894745, + "learning_rate": 6.868434904893385e-06, + "loss": 0.2255, + "step": 26980 + }, + { + "epoch": 0.6014923072065027, + "grad_norm": 0.4996844530105591, + "learning_rate": 6.8651098897756276e-06, + "loss": 0.3407, + "step": 26985 + }, + { + "epoch": 0.6016037565871227, + "grad_norm": 0.48853209614753723, + "learning_rate": 6.861785258971198e-06, + "loss": 0.2316, + "step": 26990 + }, + { + "epoch": 0.6017152059677429, + "grad_norm": 0.7358313202857971, + "learning_rate": 6.858461012887674e-06, + "loss": 0.3769, + "step": 26995 + }, + { + "epoch": 0.6018266553483629, + "grad_norm": 0.5391202569007874, + "learning_rate": 6.85513715193258e-06, + "loss": 0.3337, + "step": 27000 + }, + { + "epoch": 0.601938104728983, + "grad_norm": 0.5830953121185303, + "learning_rate": 6.851813676513397e-06, + "loss": 0.3003, + "step": 27005 + }, + { + "epoch": 0.602049554109603, + "grad_norm": 0.53678297996521, + "learning_rate": 6.848490587037557e-06, + "loss": 0.3026, + "step": 27010 + }, + { + "epoch": 0.6021610034902231, + "grad_norm": 0.7724995017051697, + "learning_rate": 6.845167883912442e-06, + "loss": 0.3513, + "step": 27015 + }, + { + "epoch": 0.6022724528708432, + "grad_norm": 0.7503898739814758, + "learning_rate": 6.8418455675453944e-06, + "loss": 0.3203, + "step": 27020 + }, + { + "epoch": 0.6023839022514632, + "grad_norm": 0.8009513020515442, + "learning_rate": 6.838523638343705e-06, + "loss": 0.2895, + "step": 27025 + }, + { + "epoch": 0.6024953516320833, + "grad_norm": 0.6689680814743042, + "learning_rate": 6.835202096714615e-06, + "loss": 0.3026, + "step": 27030 + }, + { + "epoch": 0.6026068010127034, + "grad_norm": 0.6379620432853699, + "learning_rate": 6.831880943065316e-06, + "loss": 0.3278, + "step": 27035 + }, + { + "epoch": 0.6027182503933234, + "grad_norm": 0.6119515895843506, + "learning_rate": 6.8285601778029635e-06, + "loss": 0.3328, + "step": 27040 + }, + { + "epoch": 0.6028296997739435, + "grad_norm": 0.7183812856674194, + "learning_rate": 6.8252398013346536e-06, + "loss": 0.3237, + "step": 27045 + }, + { + "epoch": 0.6029411491545635, + "grad_norm": 0.5734810829162598, + "learning_rate": 6.821919814067432e-06, + "loss": 0.229, + "step": 27050 + }, + { + "epoch": 0.6030525985351837, + "grad_norm": 0.49161458015441895, + "learning_rate": 6.818600216408314e-06, + "loss": 0.2327, + "step": 27055 + }, + { + "epoch": 0.6031640479158037, + "grad_norm": 0.8137242197990417, + "learning_rate": 6.815281008764255e-06, + "loss": 0.2162, + "step": 27060 + }, + { + "epoch": 0.6032754972964238, + "grad_norm": 0.6309579014778137, + "learning_rate": 6.8119621915421595e-06, + "loss": 0.2806, + "step": 27065 + }, + { + "epoch": 0.6033869466770438, + "grad_norm": 0.5926720499992371, + "learning_rate": 6.808643765148895e-06, + "loss": 0.3955, + "step": 27070 + }, + { + "epoch": 0.6034983960576639, + "grad_norm": 0.5575020909309387, + "learning_rate": 6.805325729991269e-06, + "loss": 0.2786, + "step": 27075 + }, + { + "epoch": 0.603609845438284, + "grad_norm": 0.81143718957901, + "learning_rate": 6.802008086476049e-06, + "loss": 0.3453, + "step": 27080 + }, + { + "epoch": 0.603721294818904, + "grad_norm": 0.8011807799339294, + "learning_rate": 6.798690835009953e-06, + "loss": 0.248, + "step": 27085 + }, + { + "epoch": 0.6038327441995242, + "grad_norm": 0.567529022693634, + "learning_rate": 6.795373975999648e-06, + "loss": 0.3641, + "step": 27090 + }, + { + "epoch": 0.6039441935801442, + "grad_norm": 0.5633978843688965, + "learning_rate": 6.792057509851762e-06, + "loss": 0.2786, + "step": 27095 + }, + { + "epoch": 0.6040556429607642, + "grad_norm": 0.6928328275680542, + "learning_rate": 6.788741436972861e-06, + "loss": 0.2781, + "step": 27100 + }, + { + "epoch": 0.6041670923413843, + "grad_norm": 0.3641568720340729, + "learning_rate": 6.785425757769475e-06, + "loss": 0.2688, + "step": 27105 + }, + { + "epoch": 0.6042785417220043, + "grad_norm": 0.6293769478797913, + "learning_rate": 6.782110472648076e-06, + "loss": 0.2609, + "step": 27110 + }, + { + "epoch": 0.6043899911026245, + "grad_norm": 0.349682480096817, + "learning_rate": 6.778795582015096e-06, + "loss": 0.1502, + "step": 27115 + }, + { + "epoch": 0.6045014404832445, + "grad_norm": 0.5193562507629395, + "learning_rate": 6.7754810862769145e-06, + "loss": 0.1376, + "step": 27120 + }, + { + "epoch": 0.6046128898638646, + "grad_norm": 0.6919031143188477, + "learning_rate": 6.772166985839859e-06, + "loss": 0.3575, + "step": 27125 + }, + { + "epoch": 0.6047243392444847, + "grad_norm": 0.3963978588581085, + "learning_rate": 6.768853281110217e-06, + "loss": 0.3116, + "step": 27130 + }, + { + "epoch": 0.6048357886251047, + "grad_norm": 0.6030378937721252, + "learning_rate": 6.765539972494225e-06, + "loss": 0.2672, + "step": 27135 + }, + { + "epoch": 0.6049472380057248, + "grad_norm": 0.6921826601028442, + "learning_rate": 6.762227060398065e-06, + "loss": 0.4098, + "step": 27140 + }, + { + "epoch": 0.6050586873863448, + "grad_norm": 0.7611590623855591, + "learning_rate": 6.758914545227875e-06, + "loss": 0.435, + "step": 27145 + }, + { + "epoch": 0.605170136766965, + "grad_norm": 0.9621880650520325, + "learning_rate": 6.755602427389746e-06, + "loss": 0.3439, + "step": 27150 + }, + { + "epoch": 0.605281586147585, + "grad_norm": 0.7084947824478149, + "learning_rate": 6.752290707289715e-06, + "loss": 0.3519, + "step": 27155 + }, + { + "epoch": 0.605393035528205, + "grad_norm": 0.6749106645584106, + "learning_rate": 6.748979385333772e-06, + "loss": 0.2787, + "step": 27160 + }, + { + "epoch": 0.6055044849088251, + "grad_norm": 0.47607892751693726, + "learning_rate": 6.745668461927866e-06, + "loss": 0.4617, + "step": 27165 + }, + { + "epoch": 0.6056159342894452, + "grad_norm": 0.7611442804336548, + "learning_rate": 6.742357937477887e-06, + "loss": 0.3738, + "step": 27170 + }, + { + "epoch": 0.6057273836700653, + "grad_norm": 0.800377368927002, + "learning_rate": 6.739047812389679e-06, + "loss": 0.3432, + "step": 27175 + }, + { + "epoch": 0.6058388330506853, + "grad_norm": 0.6022168397903442, + "learning_rate": 6.7357380870690395e-06, + "loss": 0.2759, + "step": 27180 + }, + { + "epoch": 0.6059502824313054, + "grad_norm": 0.6572964191436768, + "learning_rate": 6.732428761921712e-06, + "loss": 0.2319, + "step": 27185 + }, + { + "epoch": 0.6060617318119255, + "grad_norm": 0.5233751535415649, + "learning_rate": 6.7291198373533994e-06, + "loss": 0.2188, + "step": 27190 + }, + { + "epoch": 0.6061731811925455, + "grad_norm": 0.7602106928825378, + "learning_rate": 6.725811313769742e-06, + "loss": 0.2614, + "step": 27195 + }, + { + "epoch": 0.6062846305731656, + "grad_norm": 0.5607755780220032, + "learning_rate": 6.722503191576351e-06, + "loss": 0.2624, + "step": 27200 + }, + { + "epoch": 0.6063960799537856, + "grad_norm": 1.0890930891036987, + "learning_rate": 6.719195471178766e-06, + "loss": 0.3478, + "step": 27205 + }, + { + "epoch": 0.6065075293344058, + "grad_norm": 0.6860947608947754, + "learning_rate": 6.715888152982495e-06, + "loss": 0.264, + "step": 27210 + }, + { + "epoch": 0.6066189787150258, + "grad_norm": 0.6374927759170532, + "learning_rate": 6.712581237392988e-06, + "loss": 0.34, + "step": 27215 + }, + { + "epoch": 0.6067304280956458, + "grad_norm": 0.7704671025276184, + "learning_rate": 6.709274724815643e-06, + "loss": 0.3104, + "step": 27220 + }, + { + "epoch": 0.606841877476266, + "grad_norm": 0.616899847984314, + "learning_rate": 6.705968615655819e-06, + "loss": 0.2688, + "step": 27225 + }, + { + "epoch": 0.606953326856886, + "grad_norm": 0.7842974066734314, + "learning_rate": 6.702662910318814e-06, + "loss": 0.2945, + "step": 27230 + }, + { + "epoch": 0.6070647762375061, + "grad_norm": 0.49267229437828064, + "learning_rate": 6.699357609209884e-06, + "loss": 0.4059, + "step": 27235 + }, + { + "epoch": 0.6071762256181261, + "grad_norm": 0.75706547498703, + "learning_rate": 6.6960527127342375e-06, + "loss": 0.4469, + "step": 27240 + }, + { + "epoch": 0.6072876749987461, + "grad_norm": 0.9641366600990295, + "learning_rate": 6.6927482212970254e-06, + "loss": 0.3216, + "step": 27245 + }, + { + "epoch": 0.6073991243793663, + "grad_norm": 0.5527195930480957, + "learning_rate": 6.689444135303351e-06, + "loss": 0.3861, + "step": 27250 + }, + { + "epoch": 0.6075105737599863, + "grad_norm": 0.8383334279060364, + "learning_rate": 6.686140455158275e-06, + "loss": 0.3467, + "step": 27255 + }, + { + "epoch": 0.6076220231406064, + "grad_norm": 0.4666925072669983, + "learning_rate": 6.682837181266799e-06, + "loss": 0.3701, + "step": 27260 + }, + { + "epoch": 0.6077334725212264, + "grad_norm": 0.6067997217178345, + "learning_rate": 6.679534314033878e-06, + "loss": 0.2877, + "step": 27265 + }, + { + "epoch": 0.6078449219018466, + "grad_norm": 0.561024010181427, + "learning_rate": 6.67623185386442e-06, + "loss": 0.2082, + "step": 27270 + }, + { + "epoch": 0.6079563712824666, + "grad_norm": 0.4686552882194519, + "learning_rate": 6.6729298011632835e-06, + "loss": 0.3118, + "step": 27275 + }, + { + "epoch": 0.6080678206630866, + "grad_norm": 0.6984442472457886, + "learning_rate": 6.6696281563352714e-06, + "loss": 0.2798, + "step": 27280 + }, + { + "epoch": 0.6081792700437068, + "grad_norm": 0.5045234560966492, + "learning_rate": 6.666326919785142e-06, + "loss": 0.3132, + "step": 27285 + }, + { + "epoch": 0.6082907194243268, + "grad_norm": 0.6149911284446716, + "learning_rate": 6.6630260919176e-06, + "loss": 0.2368, + "step": 27290 + }, + { + "epoch": 0.6084021688049469, + "grad_norm": 0.4781946837902069, + "learning_rate": 6.659725673137301e-06, + "loss": 0.3103, + "step": 27295 + }, + { + "epoch": 0.6085136181855669, + "grad_norm": 0.5364055037498474, + "learning_rate": 6.656425663848848e-06, + "loss": 0.2564, + "step": 27300 + }, + { + "epoch": 0.608625067566187, + "grad_norm": 0.9337091445922852, + "learning_rate": 6.653126064456805e-06, + "loss": 0.3776, + "step": 27305 + }, + { + "epoch": 0.6087365169468071, + "grad_norm": 0.6068729162216187, + "learning_rate": 6.649826875365672e-06, + "loss": 0.346, + "step": 27310 + }, + { + "epoch": 0.6088479663274271, + "grad_norm": 0.563932478427887, + "learning_rate": 6.646528096979903e-06, + "loss": 0.3165, + "step": 27315 + }, + { + "epoch": 0.6089594157080472, + "grad_norm": 0.6620405316352844, + "learning_rate": 6.643229729703906e-06, + "loss": 0.4277, + "step": 27320 + }, + { + "epoch": 0.6090708650886673, + "grad_norm": 0.540409505367279, + "learning_rate": 6.639931773942033e-06, + "loss": 0.2479, + "step": 27325 + }, + { + "epoch": 0.6091823144692874, + "grad_norm": 0.6847338080406189, + "learning_rate": 6.636634230098591e-06, + "loss": 0.2437, + "step": 27330 + }, + { + "epoch": 0.6092937638499074, + "grad_norm": 0.6418032646179199, + "learning_rate": 6.633337098577826e-06, + "loss": 0.2825, + "step": 27335 + }, + { + "epoch": 0.6094052132305274, + "grad_norm": 0.7292640805244446, + "learning_rate": 6.630040379783949e-06, + "loss": 0.3244, + "step": 27340 + }, + { + "epoch": 0.6095166626111476, + "grad_norm": 0.6510348916053772, + "learning_rate": 6.62674407412111e-06, + "loss": 0.2052, + "step": 27345 + }, + { + "epoch": 0.6096281119917676, + "grad_norm": 0.3380892872810364, + "learning_rate": 6.6234481819934106e-06, + "loss": 0.2416, + "step": 27350 + }, + { + "epoch": 0.6097395613723877, + "grad_norm": 0.738538920879364, + "learning_rate": 6.6201527038049005e-06, + "loss": 0.3871, + "step": 27355 + }, + { + "epoch": 0.6098510107530077, + "grad_norm": 0.5535542964935303, + "learning_rate": 6.616857639959579e-06, + "loss": 0.2765, + "step": 27360 + }, + { + "epoch": 0.6099624601336278, + "grad_norm": 0.45810580253601074, + "learning_rate": 6.6135629908614e-06, + "loss": 0.2714, + "step": 27365 + }, + { + "epoch": 0.6100739095142479, + "grad_norm": 0.661761999130249, + "learning_rate": 6.610268756914254e-06, + "loss": 0.3585, + "step": 27370 + }, + { + "epoch": 0.6101853588948679, + "grad_norm": 0.9758643507957458, + "learning_rate": 6.606974938521996e-06, + "loss": 0.275, + "step": 27375 + }, + { + "epoch": 0.610296808275488, + "grad_norm": 0.5419698357582092, + "learning_rate": 6.603681536088422e-06, + "loss": 0.2785, + "step": 27380 + }, + { + "epoch": 0.6104082576561081, + "grad_norm": 0.5564716458320618, + "learning_rate": 6.6003885500172775e-06, + "loss": 0.2769, + "step": 27385 + }, + { + "epoch": 0.6105197070367282, + "grad_norm": 1.0348775386810303, + "learning_rate": 6.5970959807122515e-06, + "loss": 0.3956, + "step": 27390 + }, + { + "epoch": 0.6106311564173482, + "grad_norm": 0.6051989793777466, + "learning_rate": 6.593803828576996e-06, + "loss": 0.2114, + "step": 27395 + }, + { + "epoch": 0.6107426057979682, + "grad_norm": 0.699422299861908, + "learning_rate": 6.590512094015098e-06, + "loss": 0.2821, + "step": 27400 + }, + { + "epoch": 0.6108540551785884, + "grad_norm": 0.48418572545051575, + "learning_rate": 6.587220777430097e-06, + "loss": 0.2364, + "step": 27405 + }, + { + "epoch": 0.6109655045592084, + "grad_norm": 0.4478502869606018, + "learning_rate": 6.583929879225487e-06, + "loss": 0.2317, + "step": 27410 + }, + { + "epoch": 0.6110769539398285, + "grad_norm": 0.6911597847938538, + "learning_rate": 6.580639399804709e-06, + "loss": 0.3018, + "step": 27415 + }, + { + "epoch": 0.6111884033204485, + "grad_norm": 0.6295296549797058, + "learning_rate": 6.577349339571144e-06, + "loss": 0.2588, + "step": 27420 + }, + { + "epoch": 0.6112998527010686, + "grad_norm": 0.45536351203918457, + "learning_rate": 6.574059698928133e-06, + "loss": 0.24, + "step": 27425 + }, + { + "epoch": 0.6114113020816887, + "grad_norm": 0.4310155510902405, + "learning_rate": 6.570770478278961e-06, + "loss": 0.2478, + "step": 27430 + }, + { + "epoch": 0.6115227514623087, + "grad_norm": 0.4848770201206207, + "learning_rate": 6.567481678026854e-06, + "loss": 0.3281, + "step": 27435 + }, + { + "epoch": 0.6116342008429289, + "grad_norm": 0.7289760708808899, + "learning_rate": 6.564193298574998e-06, + "loss": 0.464, + "step": 27440 + }, + { + "epoch": 0.6117456502235489, + "grad_norm": 0.5454939603805542, + "learning_rate": 6.560905340326527e-06, + "loss": 0.3648, + "step": 27445 + }, + { + "epoch": 0.6118570996041689, + "grad_norm": 0.5365239977836609, + "learning_rate": 6.557617803684515e-06, + "loss": 0.4343, + "step": 27450 + }, + { + "epoch": 0.611968548984789, + "grad_norm": 0.4437295198440552, + "learning_rate": 6.554330689051987e-06, + "loss": 0.3891, + "step": 27455 + }, + { + "epoch": 0.612079998365409, + "grad_norm": 0.7840902209281921, + "learning_rate": 6.551043996831923e-06, + "loss": 0.1994, + "step": 27460 + }, + { + "epoch": 0.6121914477460292, + "grad_norm": 0.6054477095603943, + "learning_rate": 6.54775772742724e-06, + "loss": 0.294, + "step": 27465 + }, + { + "epoch": 0.6123028971266492, + "grad_norm": 0.7534658312797546, + "learning_rate": 6.544471881240815e-06, + "loss": 0.3178, + "step": 27470 + }, + { + "epoch": 0.6124143465072693, + "grad_norm": 0.576594889163971, + "learning_rate": 6.541186458675461e-06, + "loss": 0.3334, + "step": 27475 + }, + { + "epoch": 0.6125257958878894, + "grad_norm": 0.7999580502510071, + "learning_rate": 6.537901460133953e-06, + "loss": 0.2468, + "step": 27480 + }, + { + "epoch": 0.6126372452685094, + "grad_norm": 0.9341402649879456, + "learning_rate": 6.534616886019e-06, + "loss": 0.3492, + "step": 27485 + }, + { + "epoch": 0.6127486946491295, + "grad_norm": 1.0711827278137207, + "learning_rate": 6.531332736733271e-06, + "loss": 0.3198, + "step": 27490 + }, + { + "epoch": 0.6128601440297495, + "grad_norm": 0.7271906137466431, + "learning_rate": 6.5280490126793724e-06, + "loss": 0.3124, + "step": 27495 + }, + { + "epoch": 0.6129715934103697, + "grad_norm": 0.603204071521759, + "learning_rate": 6.5247657142598686e-06, + "loss": 0.3847, + "step": 27500 + }, + { + "epoch": 0.6130830427909897, + "grad_norm": 0.7971246242523193, + "learning_rate": 6.521482841877263e-06, + "loss": 0.4809, + "step": 27505 + }, + { + "epoch": 0.6131944921716097, + "grad_norm": 0.7611426711082458, + "learning_rate": 6.518200395934004e-06, + "loss": 0.2798, + "step": 27510 + }, + { + "epoch": 0.6133059415522298, + "grad_norm": 0.6165714859962463, + "learning_rate": 6.514918376832506e-06, + "loss": 0.4297, + "step": 27515 + }, + { + "epoch": 0.6134173909328499, + "grad_norm": 0.512054979801178, + "learning_rate": 6.5116367849751146e-06, + "loss": 0.2774, + "step": 27520 + }, + { + "epoch": 0.61352884031347, + "grad_norm": 0.6778582334518433, + "learning_rate": 6.508355620764129e-06, + "loss": 0.3278, + "step": 27525 + }, + { + "epoch": 0.61364028969409, + "grad_norm": 0.6414464116096497, + "learning_rate": 6.505074884601791e-06, + "loss": 0.2077, + "step": 27530 + }, + { + "epoch": 0.6137517390747101, + "grad_norm": 0.3846264183521271, + "learning_rate": 6.501794576890297e-06, + "loss": 0.2684, + "step": 27535 + }, + { + "epoch": 0.6138631884553302, + "grad_norm": 0.5332993268966675, + "learning_rate": 6.4985146980317835e-06, + "loss": 0.2662, + "step": 27540 + }, + { + "epoch": 0.6139746378359502, + "grad_norm": 0.8535371422767639, + "learning_rate": 6.495235248428339e-06, + "loss": 0.2598, + "step": 27545 + }, + { + "epoch": 0.6140860872165703, + "grad_norm": 0.4994587004184723, + "learning_rate": 6.491956228482004e-06, + "loss": 0.3186, + "step": 27550 + }, + { + "epoch": 0.6141975365971903, + "grad_norm": 0.7860655188560486, + "learning_rate": 6.488677638594758e-06, + "loss": 0.246, + "step": 27555 + }, + { + "epoch": 0.6143089859778105, + "grad_norm": 0.5616355538368225, + "learning_rate": 6.485399479168528e-06, + "loss": 0.3059, + "step": 27560 + }, + { + "epoch": 0.6144204353584305, + "grad_norm": 0.7185654640197754, + "learning_rate": 6.4821217506051945e-06, + "loss": 0.2726, + "step": 27565 + }, + { + "epoch": 0.6145318847390505, + "grad_norm": 0.7038310170173645, + "learning_rate": 6.478844453306582e-06, + "loss": 0.4596, + "step": 27570 + }, + { + "epoch": 0.6146433341196706, + "grad_norm": 0.6382894515991211, + "learning_rate": 6.475567587674457e-06, + "loss": 0.2459, + "step": 27575 + }, + { + "epoch": 0.6147547835002907, + "grad_norm": 0.43890437483787537, + "learning_rate": 6.4722911541105435e-06, + "loss": 0.2813, + "step": 27580 + }, + { + "epoch": 0.6148662328809108, + "grad_norm": 0.6502318978309631, + "learning_rate": 6.469015153016502e-06, + "loss": 0.3317, + "step": 27585 + }, + { + "epoch": 0.6149776822615308, + "grad_norm": 0.594805896282196, + "learning_rate": 6.465739584793949e-06, + "loss": 0.2856, + "step": 27590 + }, + { + "epoch": 0.6150891316421508, + "grad_norm": 0.5505577325820923, + "learning_rate": 6.462464449844446e-06, + "loss": 0.3047, + "step": 27595 + }, + { + "epoch": 0.615200581022771, + "grad_norm": 0.6672086119651794, + "learning_rate": 6.459189748569493e-06, + "loss": 0.3137, + "step": 27600 + }, + { + "epoch": 0.615312030403391, + "grad_norm": 0.5111857056617737, + "learning_rate": 6.455915481370545e-06, + "loss": 0.2823, + "step": 27605 + }, + { + "epoch": 0.6154234797840111, + "grad_norm": 0.45580384135246277, + "learning_rate": 6.452641648649006e-06, + "loss": 0.3184, + "step": 27610 + }, + { + "epoch": 0.6155349291646312, + "grad_norm": 0.7010921239852905, + "learning_rate": 6.449368250806218e-06, + "loss": 0.3076, + "step": 27615 + }, + { + "epoch": 0.6156463785452513, + "grad_norm": 0.6821643114089966, + "learning_rate": 6.446095288243473e-06, + "loss": 0.2631, + "step": 27620 + }, + { + "epoch": 0.6157578279258713, + "grad_norm": 0.7827234268188477, + "learning_rate": 6.442822761362015e-06, + "loss": 0.3193, + "step": 27625 + }, + { + "epoch": 0.6158692773064913, + "grad_norm": 0.4832581579685211, + "learning_rate": 6.439550670563031e-06, + "loss": 0.3358, + "step": 27630 + }, + { + "epoch": 0.6159807266871115, + "grad_norm": 0.7612441778182983, + "learning_rate": 6.436279016247652e-06, + "loss": 0.4278, + "step": 27635 + }, + { + "epoch": 0.6160921760677315, + "grad_norm": 0.4451126754283905, + "learning_rate": 6.43300779881696e-06, + "loss": 0.2232, + "step": 27640 + }, + { + "epoch": 0.6162036254483516, + "grad_norm": 0.70438152551651, + "learning_rate": 6.429737018671976e-06, + "loss": 0.2801, + "step": 27645 + }, + { + "epoch": 0.6163150748289716, + "grad_norm": 0.5213625431060791, + "learning_rate": 6.426466676213675e-06, + "loss": 0.3101, + "step": 27650 + }, + { + "epoch": 0.6164265242095917, + "grad_norm": 0.7046170830726624, + "learning_rate": 6.423196771842975e-06, + "loss": 0.2849, + "step": 27655 + }, + { + "epoch": 0.6165379735902118, + "grad_norm": 0.27048227190971375, + "learning_rate": 6.419927305960745e-06, + "loss": 0.3052, + "step": 27660 + }, + { + "epoch": 0.6166494229708318, + "grad_norm": 0.8334628939628601, + "learning_rate": 6.416658278967793e-06, + "loss": 0.2801, + "step": 27665 + }, + { + "epoch": 0.6167608723514519, + "grad_norm": 0.7342699766159058, + "learning_rate": 6.413389691264875e-06, + "loss": 0.2398, + "step": 27670 + }, + { + "epoch": 0.616872321732072, + "grad_norm": 0.703302800655365, + "learning_rate": 6.4101215432526985e-06, + "loss": 0.3231, + "step": 27675 + }, + { + "epoch": 0.6169837711126921, + "grad_norm": 0.6903324127197266, + "learning_rate": 6.40685383533191e-06, + "loss": 0.3583, + "step": 27680 + }, + { + "epoch": 0.6170952204933121, + "grad_norm": 0.5930154323577881, + "learning_rate": 6.403586567903108e-06, + "loss": 0.2944, + "step": 27685 + }, + { + "epoch": 0.6172066698739321, + "grad_norm": 0.41322100162506104, + "learning_rate": 6.400319741366829e-06, + "loss": 0.3527, + "step": 27690 + }, + { + "epoch": 0.6173181192545523, + "grad_norm": 0.8043963313102722, + "learning_rate": 6.3970533561235685e-06, + "loss": 0.3136, + "step": 27695 + }, + { + "epoch": 0.6174295686351723, + "grad_norm": 0.5982251167297363, + "learning_rate": 6.393787412573753e-06, + "loss": 0.2659, + "step": 27700 + }, + { + "epoch": 0.6175410180157924, + "grad_norm": 0.7102477550506592, + "learning_rate": 6.390521911117768e-06, + "loss": 0.3109, + "step": 27705 + }, + { + "epoch": 0.6176524673964124, + "grad_norm": 0.8494806289672852, + "learning_rate": 6.387256852155934e-06, + "loss": 0.2982, + "step": 27710 + }, + { + "epoch": 0.6177639167770325, + "grad_norm": 0.7837997674942017, + "learning_rate": 6.383992236088525e-06, + "loss": 0.3226, + "step": 27715 + }, + { + "epoch": 0.6178753661576526, + "grad_norm": 0.5590851306915283, + "learning_rate": 6.3807280633157565e-06, + "loss": 0.3012, + "step": 27720 + }, + { + "epoch": 0.6179868155382726, + "grad_norm": 0.6607463359832764, + "learning_rate": 6.377464334237786e-06, + "loss": 0.2411, + "step": 27725 + }, + { + "epoch": 0.6180982649188927, + "grad_norm": 0.7981462478637695, + "learning_rate": 6.374201049254731e-06, + "loss": 0.3484, + "step": 27730 + }, + { + "epoch": 0.6182097142995128, + "grad_norm": 0.6522349715232849, + "learning_rate": 6.370938208766642e-06, + "loss": 0.313, + "step": 27735 + }, + { + "epoch": 0.6183211636801329, + "grad_norm": 0.6056199669837952, + "learning_rate": 6.367675813173515e-06, + "loss": 0.3073, + "step": 27740 + }, + { + "epoch": 0.6184326130607529, + "grad_norm": 0.5482789874076843, + "learning_rate": 6.364413862875295e-06, + "loss": 0.3135, + "step": 27745 + }, + { + "epoch": 0.6185440624413729, + "grad_norm": 0.21319426596164703, + "learning_rate": 6.361152358271875e-06, + "loss": 0.2503, + "step": 27750 + }, + { + "epoch": 0.6186555118219931, + "grad_norm": 0.6866772770881653, + "learning_rate": 6.357891299763086e-06, + "loss": 0.1807, + "step": 27755 + }, + { + "epoch": 0.6187669612026131, + "grad_norm": 0.7949379682540894, + "learning_rate": 6.35463068774871e-06, + "loss": 0.3076, + "step": 27760 + }, + { + "epoch": 0.6188784105832332, + "grad_norm": 0.5552226305007935, + "learning_rate": 6.351370522628475e-06, + "loss": 0.3984, + "step": 27765 + }, + { + "epoch": 0.6189898599638533, + "grad_norm": 0.4902004599571228, + "learning_rate": 6.348110804802052e-06, + "loss": 0.3095, + "step": 27770 + }, + { + "epoch": 0.6191013093444733, + "grad_norm": 0.5670919418334961, + "learning_rate": 6.344851534669057e-06, + "loss": 0.2612, + "step": 27775 + }, + { + "epoch": 0.6192127587250934, + "grad_norm": 0.5468574166297913, + "learning_rate": 6.341592712629049e-06, + "loss": 0.2413, + "step": 27780 + }, + { + "epoch": 0.6193242081057134, + "grad_norm": 0.47762176394462585, + "learning_rate": 6.338334339081538e-06, + "loss": 0.3156, + "step": 27785 + }, + { + "epoch": 0.6194356574863336, + "grad_norm": 0.5758314728736877, + "learning_rate": 6.335076414425969e-06, + "loss": 0.1764, + "step": 27790 + }, + { + "epoch": 0.6195471068669536, + "grad_norm": 0.5514389872550964, + "learning_rate": 6.331818939061742e-06, + "loss": 0.2183, + "step": 27795 + }, + { + "epoch": 0.6196585562475736, + "grad_norm": 0.40423551201820374, + "learning_rate": 6.328561913388203e-06, + "loss": 0.3397, + "step": 27800 + }, + { + "epoch": 0.6197700056281937, + "grad_norm": 0.4383307099342346, + "learning_rate": 6.325305337804633e-06, + "loss": 0.3033, + "step": 27805 + }, + { + "epoch": 0.6198814550088138, + "grad_norm": 0.43310850858688354, + "learning_rate": 6.322049212710265e-06, + "loss": 0.3374, + "step": 27810 + }, + { + "epoch": 0.6199929043894339, + "grad_norm": 0.5410407781600952, + "learning_rate": 6.318793538504276e-06, + "loss": 0.2061, + "step": 27815 + }, + { + "epoch": 0.6201043537700539, + "grad_norm": 0.5015875101089478, + "learning_rate": 6.315538315585781e-06, + "loss": 0.176, + "step": 27820 + }, + { + "epoch": 0.620215803150674, + "grad_norm": 0.7569509744644165, + "learning_rate": 6.31228354435385e-06, + "loss": 0.4063, + "step": 27825 + }, + { + "epoch": 0.6203272525312941, + "grad_norm": 0.8083080649375916, + "learning_rate": 6.309029225207489e-06, + "loss": 0.2622, + "step": 27830 + }, + { + "epoch": 0.6204387019119141, + "grad_norm": 0.6677647233009338, + "learning_rate": 6.305775358545659e-06, + "loss": 0.259, + "step": 27835 + }, + { + "epoch": 0.6205501512925342, + "grad_norm": 0.7124935388565063, + "learning_rate": 6.302521944767253e-06, + "loss": 0.2878, + "step": 27840 + }, + { + "epoch": 0.6206616006731542, + "grad_norm": 0.667360782623291, + "learning_rate": 6.2992689842711184e-06, + "loss": 0.2577, + "step": 27845 + }, + { + "epoch": 0.6207730500537744, + "grad_norm": 0.9954953789710999, + "learning_rate": 6.29601647745604e-06, + "loss": 0.3319, + "step": 27850 + }, + { + "epoch": 0.6208844994343944, + "grad_norm": 0.7336387038230896, + "learning_rate": 6.292764424720752e-06, + "loss": 0.2353, + "step": 27855 + }, + { + "epoch": 0.6209959488150144, + "grad_norm": 0.5347248911857605, + "learning_rate": 6.2895128264639295e-06, + "loss": 0.2903, + "step": 27860 + }, + { + "epoch": 0.6211073981956345, + "grad_norm": 0.5746235847473145, + "learning_rate": 6.28626168308419e-06, + "loss": 0.3182, + "step": 27865 + }, + { + "epoch": 0.6212188475762546, + "grad_norm": 0.6404165029525757, + "learning_rate": 6.283010994980106e-06, + "loss": 0.2627, + "step": 27870 + }, + { + "epoch": 0.6213302969568747, + "grad_norm": 0.48679685592651367, + "learning_rate": 6.2797607625501845e-06, + "loss": 0.2974, + "step": 27875 + }, + { + "epoch": 0.6214417463374947, + "grad_norm": 0.9068040251731873, + "learning_rate": 6.276510986192876e-06, + "loss": 0.2187, + "step": 27880 + }, + { + "epoch": 0.6215531957181148, + "grad_norm": 0.905701220035553, + "learning_rate": 6.27326166630658e-06, + "loss": 0.4345, + "step": 27885 + }, + { + "epoch": 0.6216646450987349, + "grad_norm": 0.45995256304740906, + "learning_rate": 6.270012803289639e-06, + "loss": 0.272, + "step": 27890 + }, + { + "epoch": 0.6217760944793549, + "grad_norm": 0.6941897869110107, + "learning_rate": 6.266764397540334e-06, + "loss": 0.3543, + "step": 27895 + }, + { + "epoch": 0.621887543859975, + "grad_norm": 0.7971600890159607, + "learning_rate": 6.263516449456898e-06, + "loss": 0.3165, + "step": 27900 + }, + { + "epoch": 0.621998993240595, + "grad_norm": 0.5076633095741272, + "learning_rate": 6.260268959437507e-06, + "loss": 0.3159, + "step": 27905 + }, + { + "epoch": 0.6221104426212152, + "grad_norm": 0.5933796167373657, + "learning_rate": 6.257021927880276e-06, + "loss": 0.2812, + "step": 27910 + }, + { + "epoch": 0.6222218920018352, + "grad_norm": 0.6237947940826416, + "learning_rate": 6.253775355183261e-06, + "loss": 0.3239, + "step": 27915 + }, + { + "epoch": 0.6223333413824552, + "grad_norm": 0.3354548513889313, + "learning_rate": 6.250529241744475e-06, + "loss": 0.2003, + "step": 27920 + }, + { + "epoch": 0.6224447907630754, + "grad_norm": 0.7078173756599426, + "learning_rate": 6.247283587961862e-06, + "loss": 0.2688, + "step": 27925 + }, + { + "epoch": 0.6225562401436954, + "grad_norm": 0.8186478614807129, + "learning_rate": 6.244038394233313e-06, + "loss": 0.3333, + "step": 27930 + }, + { + "epoch": 0.6226676895243155, + "grad_norm": 0.5947364568710327, + "learning_rate": 6.240793660956662e-06, + "loss": 0.2854, + "step": 27935 + }, + { + "epoch": 0.6227791389049355, + "grad_norm": 0.8700222373008728, + "learning_rate": 6.237549388529696e-06, + "loss": 0.2826, + "step": 27940 + }, + { + "epoch": 0.6228905882855557, + "grad_norm": 0.4603564143180847, + "learning_rate": 6.234305577350133e-06, + "loss": 0.1922, + "step": 27945 + }, + { + "epoch": 0.6230020376661757, + "grad_norm": 0.4369763731956482, + "learning_rate": 6.23106222781564e-06, + "loss": 0.2934, + "step": 27950 + }, + { + "epoch": 0.6231134870467957, + "grad_norm": 0.5998416543006897, + "learning_rate": 6.227819340323826e-06, + "loss": 0.2065, + "step": 27955 + }, + { + "epoch": 0.6232249364274158, + "grad_norm": 0.71073979139328, + "learning_rate": 6.224576915272242e-06, + "loss": 0.3077, + "step": 27960 + }, + { + "epoch": 0.6233363858080359, + "grad_norm": 1.1015069484710693, + "learning_rate": 6.221334953058389e-06, + "loss": 0.3986, + "step": 27965 + }, + { + "epoch": 0.623447835188656, + "grad_norm": 0.227636456489563, + "learning_rate": 6.2180934540797e-06, + "loss": 0.3346, + "step": 27970 + }, + { + "epoch": 0.623559284569276, + "grad_norm": 0.6981043815612793, + "learning_rate": 6.214852418733566e-06, + "loss": 0.3099, + "step": 27975 + }, + { + "epoch": 0.623670733949896, + "grad_norm": 0.45622771978378296, + "learning_rate": 6.211611847417306e-06, + "loss": 0.2065, + "step": 27980 + }, + { + "epoch": 0.6237821833305162, + "grad_norm": 0.5265591740608215, + "learning_rate": 6.208371740528194e-06, + "loss": 0.2861, + "step": 27985 + }, + { + "epoch": 0.6238936327111362, + "grad_norm": 0.8384089469909668, + "learning_rate": 6.205132098463437e-06, + "loss": 0.3414, + "step": 27990 + }, + { + "epoch": 0.6240050820917563, + "grad_norm": 1.005171775817871, + "learning_rate": 6.201892921620197e-06, + "loss": 0.2219, + "step": 27995 + }, + { + "epoch": 0.6241165314723763, + "grad_norm": 0.691701352596283, + "learning_rate": 6.1986542103955646e-06, + "loss": 0.4161, + "step": 28000 + }, + { + "epoch": 0.6242279808529964, + "grad_norm": 0.38245102763175964, + "learning_rate": 6.195415965186582e-06, + "loss": 0.3582, + "step": 28005 + }, + { + "epoch": 0.6243394302336165, + "grad_norm": 0.5451532602310181, + "learning_rate": 6.192178186390237e-06, + "loss": 0.3372, + "step": 28010 + }, + { + "epoch": 0.6244508796142365, + "grad_norm": 0.6231672167778015, + "learning_rate": 6.188940874403456e-06, + "loss": 0.341, + "step": 28015 + }, + { + "epoch": 0.6245623289948566, + "grad_norm": 0.6475543975830078, + "learning_rate": 6.185704029623107e-06, + "loss": 0.2429, + "step": 28020 + }, + { + "epoch": 0.6246737783754767, + "grad_norm": 0.3510415852069855, + "learning_rate": 6.1824676524460015e-06, + "loss": 0.2533, + "step": 28025 + }, + { + "epoch": 0.6247852277560968, + "grad_norm": 0.8009830713272095, + "learning_rate": 6.179231743268896e-06, + "loss": 0.3173, + "step": 28030 + }, + { + "epoch": 0.6248966771367168, + "grad_norm": 0.31814971566200256, + "learning_rate": 6.175996302488485e-06, + "loss": 0.2053, + "step": 28035 + }, + { + "epoch": 0.6250081265173368, + "grad_norm": 0.37845292687416077, + "learning_rate": 6.172761330501409e-06, + "loss": 0.2341, + "step": 28040 + }, + { + "epoch": 0.625119575897957, + "grad_norm": 0.581320583820343, + "learning_rate": 6.169526827704255e-06, + "loss": 0.2602, + "step": 28045 + }, + { + "epoch": 0.625231025278577, + "grad_norm": 0.666366696357727, + "learning_rate": 6.166292794493545e-06, + "loss": 0.4235, + "step": 28050 + }, + { + "epoch": 0.6253424746591971, + "grad_norm": 0.4768614172935486, + "learning_rate": 6.163059231265747e-06, + "loss": 0.2679, + "step": 28055 + }, + { + "epoch": 0.6254539240398171, + "grad_norm": 0.5455437302589417, + "learning_rate": 6.159826138417271e-06, + "loss": 0.2806, + "step": 28060 + }, + { + "epoch": 0.6255653734204372, + "grad_norm": 0.512651264667511, + "learning_rate": 6.1565935163444665e-06, + "loss": 0.2676, + "step": 28065 + }, + { + "epoch": 0.6256768228010573, + "grad_norm": 1.0292948484420776, + "learning_rate": 6.153361365443633e-06, + "loss": 0.326, + "step": 28070 + }, + { + "epoch": 0.6257882721816773, + "grad_norm": 0.43563035130500793, + "learning_rate": 6.1501296861109995e-06, + "loss": 0.2994, + "step": 28075 + }, + { + "epoch": 0.6258997215622975, + "grad_norm": 0.5008480548858643, + "learning_rate": 6.146898478742756e-06, + "loss": 0.354, + "step": 28080 + }, + { + "epoch": 0.6260111709429175, + "grad_norm": 0.650435209274292, + "learning_rate": 6.143667743735013e-06, + "loss": 0.284, + "step": 28085 + }, + { + "epoch": 0.6261226203235376, + "grad_norm": 0.6315786242485046, + "learning_rate": 6.140437481483843e-06, + "loss": 0.2181, + "step": 28090 + }, + { + "epoch": 0.6262340697041576, + "grad_norm": 0.45991653203964233, + "learning_rate": 6.137207692385246e-06, + "loss": 0.3351, + "step": 28095 + }, + { + "epoch": 0.6263455190847776, + "grad_norm": 0.6725899577140808, + "learning_rate": 6.133978376835167e-06, + "loss": 0.2262, + "step": 28100 + }, + { + "epoch": 0.6264569684653978, + "grad_norm": 0.6889006495475769, + "learning_rate": 6.1307495352295e-06, + "loss": 0.3143, + "step": 28105 + }, + { + "epoch": 0.6265684178460178, + "grad_norm": 0.6997217535972595, + "learning_rate": 6.127521167964072e-06, + "loss": 0.2375, + "step": 28110 + }, + { + "epoch": 0.6266798672266379, + "grad_norm": 0.5511771440505981, + "learning_rate": 6.124293275434659e-06, + "loss": 0.2455, + "step": 28115 + }, + { + "epoch": 0.626791316607258, + "grad_norm": 0.8514209389686584, + "learning_rate": 6.121065858036972e-06, + "loss": 0.3233, + "step": 28120 + }, + { + "epoch": 0.626902765987878, + "grad_norm": 0.5625143051147461, + "learning_rate": 6.117838916166674e-06, + "loss": 0.3249, + "step": 28125 + }, + { + "epoch": 0.6270142153684981, + "grad_norm": 0.5047087669372559, + "learning_rate": 6.114612450219356e-06, + "loss": 0.2841, + "step": 28130 + }, + { + "epoch": 0.6271256647491181, + "grad_norm": 0.7418507933616638, + "learning_rate": 6.111386460590564e-06, + "loss": 0.2969, + "step": 28135 + }, + { + "epoch": 0.6272371141297383, + "grad_norm": 0.4277876019477844, + "learning_rate": 6.1081609476757766e-06, + "loss": 0.2599, + "step": 28140 + }, + { + "epoch": 0.6273485635103583, + "grad_norm": 0.43965139985084534, + "learning_rate": 6.104935911870413e-06, + "loss": 0.2074, + "step": 28145 + }, + { + "epoch": 0.6274600128909784, + "grad_norm": 0.42385342717170715, + "learning_rate": 6.101711353569841e-06, + "loss": 0.1684, + "step": 28150 + }, + { + "epoch": 0.6275714622715984, + "grad_norm": 0.5323282480239868, + "learning_rate": 6.098487273169371e-06, + "loss": 0.2288, + "step": 28155 + }, + { + "epoch": 0.6276829116522185, + "grad_norm": 0.6577903032302856, + "learning_rate": 6.095263671064243e-06, + "loss": 0.3944, + "step": 28160 + }, + { + "epoch": 0.6277943610328386, + "grad_norm": 0.5340499877929688, + "learning_rate": 6.0920405476496515e-06, + "loss": 0.312, + "step": 28165 + }, + { + "epoch": 0.6279058104134586, + "grad_norm": 1.0899690389633179, + "learning_rate": 6.088817903320725e-06, + "loss": 0.2366, + "step": 28170 + }, + { + "epoch": 0.6280172597940787, + "grad_norm": 0.7473883628845215, + "learning_rate": 6.085595738472529e-06, + "loss": 0.2867, + "step": 28175 + }, + { + "epoch": 0.6281287091746988, + "grad_norm": 0.4160996377468109, + "learning_rate": 6.082374053500087e-06, + "loss": 0.2274, + "step": 28180 + }, + { + "epoch": 0.6282401585553188, + "grad_norm": 0.6289620995521545, + "learning_rate": 6.07915284879834e-06, + "loss": 0.3257, + "step": 28185 + }, + { + "epoch": 0.6283516079359389, + "grad_norm": 0.367827445268631, + "learning_rate": 6.075932124762194e-06, + "loss": 0.3148, + "step": 28190 + }, + { + "epoch": 0.6284630573165589, + "grad_norm": 0.6318298578262329, + "learning_rate": 6.072711881786477e-06, + "loss": 0.2852, + "step": 28195 + }, + { + "epoch": 0.6285745066971791, + "grad_norm": 0.7384268641471863, + "learning_rate": 6.069492120265974e-06, + "loss": 0.2835, + "step": 28200 + }, + { + "epoch": 0.6286859560777991, + "grad_norm": 0.7134256362915039, + "learning_rate": 6.066272840595394e-06, + "loss": 0.3431, + "step": 28205 + }, + { + "epoch": 0.6287974054584191, + "grad_norm": 0.48338642716407776, + "learning_rate": 6.063054043169403e-06, + "loss": 0.2961, + "step": 28210 + }, + { + "epoch": 0.6289088548390392, + "grad_norm": 0.41167038679122925, + "learning_rate": 6.059835728382597e-06, + "loss": 0.2717, + "step": 28215 + }, + { + "epoch": 0.6290203042196593, + "grad_norm": 0.38216400146484375, + "learning_rate": 6.056617896629515e-06, + "loss": 0.2197, + "step": 28220 + }, + { + "epoch": 0.6291317536002794, + "grad_norm": 0.9967228770256042, + "learning_rate": 6.053400548304641e-06, + "loss": 0.3655, + "step": 28225 + }, + { + "epoch": 0.6292432029808994, + "grad_norm": 0.27996689081192017, + "learning_rate": 6.0501836838024e-06, + "loss": 0.3875, + "step": 28230 + }, + { + "epoch": 0.6293546523615196, + "grad_norm": 0.8248032331466675, + "learning_rate": 6.04696730351715e-06, + "loss": 0.4108, + "step": 28235 + }, + { + "epoch": 0.6294661017421396, + "grad_norm": 0.6017432808876038, + "learning_rate": 6.043751407843195e-06, + "loss": 0.213, + "step": 28240 + }, + { + "epoch": 0.6295775511227596, + "grad_norm": 0.5813285708427429, + "learning_rate": 6.040535997174781e-06, + "loss": 0.2602, + "step": 28245 + }, + { + "epoch": 0.6296890005033797, + "grad_norm": 0.7789047956466675, + "learning_rate": 6.0373210719060894e-06, + "loss": 0.2884, + "step": 28250 + }, + { + "epoch": 0.6298004498839997, + "grad_norm": 1.083784818649292, + "learning_rate": 6.0341066324312455e-06, + "loss": 0.3943, + "step": 28255 + }, + { + "epoch": 0.6299118992646199, + "grad_norm": 0.5226530432701111, + "learning_rate": 6.03089267914432e-06, + "loss": 0.2488, + "step": 28260 + }, + { + "epoch": 0.6300233486452399, + "grad_norm": 0.7086270451545715, + "learning_rate": 6.0276792124393165e-06, + "loss": 0.2931, + "step": 28265 + }, + { + "epoch": 0.6301347980258599, + "grad_norm": 0.5156954526901245, + "learning_rate": 6.024466232710178e-06, + "loss": 0.263, + "step": 28270 + }, + { + "epoch": 0.63024624740648, + "grad_norm": 0.6274219155311584, + "learning_rate": 6.021253740350793e-06, + "loss": 0.3188, + "step": 28275 + }, + { + "epoch": 0.6303576967871001, + "grad_norm": 0.6668805480003357, + "learning_rate": 6.018041735754989e-06, + "loss": 0.3313, + "step": 28280 + }, + { + "epoch": 0.6304691461677202, + "grad_norm": 0.9630991220474243, + "learning_rate": 6.014830219316531e-06, + "loss": 0.3118, + "step": 28285 + }, + { + "epoch": 0.6305805955483402, + "grad_norm": 0.4004746377468109, + "learning_rate": 6.0116191914291255e-06, + "loss": 0.1991, + "step": 28290 + }, + { + "epoch": 0.6306920449289604, + "grad_norm": 0.3665456175804138, + "learning_rate": 6.0084086524864235e-06, + "loss": 0.1839, + "step": 28295 + }, + { + "epoch": 0.6308034943095804, + "grad_norm": 0.5296953320503235, + "learning_rate": 6.005198602882011e-06, + "loss": 0.3783, + "step": 28300 + }, + { + "epoch": 0.6309149436902004, + "grad_norm": 0.659162163734436, + "learning_rate": 6.001989043009415e-06, + "loss": 0.2793, + "step": 28305 + }, + { + "epoch": 0.6310263930708205, + "grad_norm": 0.6633775234222412, + "learning_rate": 5.9987799732621035e-06, + "loss": 0.2492, + "step": 28310 + }, + { + "epoch": 0.6311378424514406, + "grad_norm": 0.5430098176002502, + "learning_rate": 5.995571394033479e-06, + "loss": 0.2439, + "step": 28315 + }, + { + "epoch": 0.6312492918320607, + "grad_norm": 0.7841137647628784, + "learning_rate": 5.9923633057168954e-06, + "loss": 0.355, + "step": 28320 + }, + { + "epoch": 0.6313607412126807, + "grad_norm": 0.7403837442398071, + "learning_rate": 5.989155708705632e-06, + "loss": 0.2966, + "step": 28325 + }, + { + "epoch": 0.6314721905933007, + "grad_norm": 0.5251857042312622, + "learning_rate": 5.985948603392923e-06, + "loss": 0.2467, + "step": 28330 + }, + { + "epoch": 0.6315836399739209, + "grad_norm": 0.5293110609054565, + "learning_rate": 5.982741990171932e-06, + "loss": 0.3541, + "step": 28335 + }, + { + "epoch": 0.6316950893545409, + "grad_norm": 0.6655994057655334, + "learning_rate": 5.979535869435764e-06, + "loss": 0.2069, + "step": 28340 + }, + { + "epoch": 0.631806538735161, + "grad_norm": 0.813114583492279, + "learning_rate": 5.976330241577464e-06, + "loss": 0.2699, + "step": 28345 + }, + { + "epoch": 0.631917988115781, + "grad_norm": 0.35635751485824585, + "learning_rate": 5.973125106990021e-06, + "loss": 0.2387, + "step": 28350 + }, + { + "epoch": 0.6320294374964011, + "grad_norm": 0.6354801058769226, + "learning_rate": 5.9699204660663554e-06, + "loss": 0.3574, + "step": 28355 + }, + { + "epoch": 0.6321408868770212, + "grad_norm": 0.4692534804344177, + "learning_rate": 5.96671631919933e-06, + "loss": 0.3614, + "step": 28360 + }, + { + "epoch": 0.6322523362576412, + "grad_norm": 0.5889471769332886, + "learning_rate": 5.963512666781754e-06, + "loss": 0.3419, + "step": 28365 + }, + { + "epoch": 0.6323637856382613, + "grad_norm": 0.7773043513298035, + "learning_rate": 5.960309509206371e-06, + "loss": 0.3964, + "step": 28370 + }, + { + "epoch": 0.6324752350188814, + "grad_norm": 0.6574989557266235, + "learning_rate": 5.95710684686586e-06, + "loss": 0.2782, + "step": 28375 + }, + { + "epoch": 0.6325866843995015, + "grad_norm": 0.8650151491165161, + "learning_rate": 5.953904680152841e-06, + "loss": 0.2922, + "step": 28380 + }, + { + "epoch": 0.6326981337801215, + "grad_norm": 0.6198773980140686, + "learning_rate": 5.9507030094598794e-06, + "loss": 0.2723, + "step": 28385 + }, + { + "epoch": 0.6328095831607415, + "grad_norm": 0.34654122591018677, + "learning_rate": 5.94750183517947e-06, + "loss": 0.2669, + "step": 28390 + }, + { + "epoch": 0.6329210325413617, + "grad_norm": 0.8061956763267517, + "learning_rate": 5.9443011577040545e-06, + "loss": 0.2312, + "step": 28395 + }, + { + "epoch": 0.6330324819219817, + "grad_norm": 0.799089252948761, + "learning_rate": 5.9411009774260154e-06, + "loss": 0.3356, + "step": 28400 + }, + { + "epoch": 0.6331439313026018, + "grad_norm": 0.45396721363067627, + "learning_rate": 5.937901294737667e-06, + "loss": 0.1851, + "step": 28405 + }, + { + "epoch": 0.6332553806832218, + "grad_norm": 0.5224248766899109, + "learning_rate": 5.934702110031263e-06, + "loss": 0.3263, + "step": 28410 + }, + { + "epoch": 0.6333668300638419, + "grad_norm": 0.6665075421333313, + "learning_rate": 5.931503423699002e-06, + "loss": 0.3244, + "step": 28415 + }, + { + "epoch": 0.633478279444462, + "grad_norm": 0.9394651055335999, + "learning_rate": 5.928305236133016e-06, + "loss": 0.3177, + "step": 28420 + }, + { + "epoch": 0.633589728825082, + "grad_norm": 0.7035577297210693, + "learning_rate": 5.925107547725382e-06, + "loss": 0.3348, + "step": 28425 + }, + { + "epoch": 0.6337011782057022, + "grad_norm": 0.9956908226013184, + "learning_rate": 5.921910358868103e-06, + "loss": 0.2843, + "step": 28430 + }, + { + "epoch": 0.6338126275863222, + "grad_norm": 0.48321256041526794, + "learning_rate": 5.918713669953143e-06, + "loss": 0.2137, + "step": 28435 + }, + { + "epoch": 0.6339240769669423, + "grad_norm": 0.597806453704834, + "learning_rate": 5.915517481372382e-06, + "loss": 0.237, + "step": 28440 + }, + { + "epoch": 0.6340355263475623, + "grad_norm": 0.7039462327957153, + "learning_rate": 5.9123217935176505e-06, + "loss": 0.3598, + "step": 28445 + }, + { + "epoch": 0.6341469757281823, + "grad_norm": 0.7864067554473877, + "learning_rate": 5.909126606780717e-06, + "loss": 0.2633, + "step": 28450 + }, + { + "epoch": 0.6342584251088025, + "grad_norm": 0.49334821105003357, + "learning_rate": 5.905931921553282e-06, + "loss": 0.2726, + "step": 28455 + }, + { + "epoch": 0.6343698744894225, + "grad_norm": 0.6459826827049255, + "learning_rate": 5.9027377382269935e-06, + "loss": 0.4212, + "step": 28460 + }, + { + "epoch": 0.6344813238700426, + "grad_norm": 0.39496904611587524, + "learning_rate": 5.899544057193429e-06, + "loss": 0.3046, + "step": 28465 + }, + { + "epoch": 0.6345927732506627, + "grad_norm": 0.708795964717865, + "learning_rate": 5.896350878844115e-06, + "loss": 0.3406, + "step": 28470 + }, + { + "epoch": 0.6347042226312827, + "grad_norm": 1.2843683958053589, + "learning_rate": 5.893158203570508e-06, + "loss": 0.3394, + "step": 28475 + }, + { + "epoch": 0.6348156720119028, + "grad_norm": 0.5526872277259827, + "learning_rate": 5.889966031764007e-06, + "loss": 0.279, + "step": 28480 + }, + { + "epoch": 0.6349271213925228, + "grad_norm": 0.5944620370864868, + "learning_rate": 5.886774363815944e-06, + "loss": 0.2616, + "step": 28485 + }, + { + "epoch": 0.635038570773143, + "grad_norm": 0.5583816170692444, + "learning_rate": 5.883583200117595e-06, + "loss": 0.3036, + "step": 28490 + }, + { + "epoch": 0.635150020153763, + "grad_norm": 0.36957287788391113, + "learning_rate": 5.880392541060174e-06, + "loss": 0.3225, + "step": 28495 + }, + { + "epoch": 0.6352614695343831, + "grad_norm": 0.6170169711112976, + "learning_rate": 5.877202387034824e-06, + "loss": 0.3002, + "step": 28500 + }, + { + "epoch": 0.6353729189150031, + "grad_norm": 0.4157882332801819, + "learning_rate": 5.874012738432639e-06, + "loss": 0.2491, + "step": 28505 + }, + { + "epoch": 0.6354843682956232, + "grad_norm": 0.4403998553752899, + "learning_rate": 5.870823595644648e-06, + "loss": 0.2913, + "step": 28510 + }, + { + "epoch": 0.6355958176762433, + "grad_norm": 0.5965349078178406, + "learning_rate": 5.8676349590618095e-06, + "loss": 0.3229, + "step": 28515 + }, + { + "epoch": 0.6357072670568633, + "grad_norm": 0.7371415495872498, + "learning_rate": 5.86444682907503e-06, + "loss": 0.2964, + "step": 28520 + }, + { + "epoch": 0.6358187164374834, + "grad_norm": 0.7189732193946838, + "learning_rate": 5.861259206075147e-06, + "loss": 0.3191, + "step": 28525 + }, + { + "epoch": 0.6359301658181035, + "grad_norm": 0.8875950574874878, + "learning_rate": 5.858072090452939e-06, + "loss": 0.3036, + "step": 28530 + }, + { + "epoch": 0.6360416151987235, + "grad_norm": 0.7388046383857727, + "learning_rate": 5.854885482599118e-06, + "loss": 0.3218, + "step": 28535 + }, + { + "epoch": 0.6361530645793436, + "grad_norm": 0.5969105958938599, + "learning_rate": 5.8516993829043465e-06, + "loss": 0.3614, + "step": 28540 + }, + { + "epoch": 0.6362645139599636, + "grad_norm": 0.801123321056366, + "learning_rate": 5.8485137917592115e-06, + "loss": 0.2599, + "step": 28545 + }, + { + "epoch": 0.6363759633405838, + "grad_norm": 0.5359708666801453, + "learning_rate": 5.845328709554237e-06, + "loss": 0.3185, + "step": 28550 + }, + { + "epoch": 0.6364874127212038, + "grad_norm": 0.9001630544662476, + "learning_rate": 5.842144136679897e-06, + "loss": 0.3986, + "step": 28555 + }, + { + "epoch": 0.6365988621018238, + "grad_norm": 0.3510948121547699, + "learning_rate": 5.838960073526589e-06, + "loss": 0.2436, + "step": 28560 + }, + { + "epoch": 0.636710311482444, + "grad_norm": 0.5558143854141235, + "learning_rate": 5.835776520484664e-06, + "loss": 0.3203, + "step": 28565 + }, + { + "epoch": 0.636821760863064, + "grad_norm": 0.7191622257232666, + "learning_rate": 5.832593477944386e-06, + "loss": 0.327, + "step": 28570 + }, + { + "epoch": 0.6369332102436841, + "grad_norm": 0.8400184512138367, + "learning_rate": 5.829410946295981e-06, + "loss": 0.2707, + "step": 28575 + }, + { + "epoch": 0.6370446596243041, + "grad_norm": 0.6664688587188721, + "learning_rate": 5.826228925929607e-06, + "loss": 0.2452, + "step": 28580 + }, + { + "epoch": 0.6371561090049243, + "grad_norm": 0.7116914391517639, + "learning_rate": 5.82304741723535e-06, + "loss": 0.3667, + "step": 28585 + }, + { + "epoch": 0.6372675583855443, + "grad_norm": 0.8650065660476685, + "learning_rate": 5.819866420603237e-06, + "loss": 0.2962, + "step": 28590 + }, + { + "epoch": 0.6373790077661643, + "grad_norm": 0.3095763623714447, + "learning_rate": 5.8166859364232365e-06, + "loss": 0.2899, + "step": 28595 + }, + { + "epoch": 0.6374904571467844, + "grad_norm": 0.6253182291984558, + "learning_rate": 5.813505965085245e-06, + "loss": 0.3305, + "step": 28600 + }, + { + "epoch": 0.6376019065274044, + "grad_norm": 0.41500774025917053, + "learning_rate": 5.810326506979109e-06, + "loss": 0.3192, + "step": 28605 + }, + { + "epoch": 0.6377133559080246, + "grad_norm": 0.4850252568721771, + "learning_rate": 5.8071475624946065e-06, + "loss": 0.2845, + "step": 28610 + }, + { + "epoch": 0.6378248052886446, + "grad_norm": 0.771271824836731, + "learning_rate": 5.8039691320214495e-06, + "loss": 0.3584, + "step": 28615 + }, + { + "epoch": 0.6379362546692646, + "grad_norm": 0.5867313742637634, + "learning_rate": 5.800791215949289e-06, + "loss": 0.4466, + "step": 28620 + }, + { + "epoch": 0.6380477040498848, + "grad_norm": 0.6335778832435608, + "learning_rate": 5.797613814667711e-06, + "loss": 0.4406, + "step": 28625 + }, + { + "epoch": 0.6381591534305048, + "grad_norm": 0.5767678618431091, + "learning_rate": 5.794436928566238e-06, + "loss": 0.303, + "step": 28630 + }, + { + "epoch": 0.6382706028111249, + "grad_norm": 0.666451096534729, + "learning_rate": 5.7912605580343415e-06, + "loss": 0.1921, + "step": 28635 + }, + { + "epoch": 0.6383820521917449, + "grad_norm": 0.5831393003463745, + "learning_rate": 5.7880847034614125e-06, + "loss": 0.298, + "step": 28640 + }, + { + "epoch": 0.6384935015723651, + "grad_norm": 0.6831397414207458, + "learning_rate": 5.784909365236784e-06, + "loss": 0.3438, + "step": 28645 + }, + { + "epoch": 0.6386049509529851, + "grad_norm": 0.7521395087242126, + "learning_rate": 5.781734543749737e-06, + "loss": 0.4049, + "step": 28650 + }, + { + "epoch": 0.6387164003336051, + "grad_norm": 0.46241921186447144, + "learning_rate": 5.7785602393894745e-06, + "loss": 0.3837, + "step": 28655 + }, + { + "epoch": 0.6388278497142252, + "grad_norm": 0.8582910895347595, + "learning_rate": 5.775386452545142e-06, + "loss": 0.2823, + "step": 28660 + }, + { + "epoch": 0.6389392990948453, + "grad_norm": 0.6524258852005005, + "learning_rate": 5.772213183605817e-06, + "loss": 0.3483, + "step": 28665 + }, + { + "epoch": 0.6390507484754654, + "grad_norm": 0.8292897939682007, + "learning_rate": 5.7690404329605255e-06, + "loss": 0.2351, + "step": 28670 + }, + { + "epoch": 0.6391621978560854, + "grad_norm": 0.5087527632713318, + "learning_rate": 5.765868200998219e-06, + "loss": 0.3299, + "step": 28675 + }, + { + "epoch": 0.6392736472367054, + "grad_norm": 0.5977237224578857, + "learning_rate": 5.7626964881077865e-06, + "loss": 0.3581, + "step": 28680 + }, + { + "epoch": 0.6393850966173256, + "grad_norm": 0.6168895959854126, + "learning_rate": 5.75952529467806e-06, + "loss": 0.2462, + "step": 28685 + }, + { + "epoch": 0.6394965459979456, + "grad_norm": 0.6840497255325317, + "learning_rate": 5.756354621097802e-06, + "loss": 0.4014, + "step": 28690 + }, + { + "epoch": 0.6396079953785657, + "grad_norm": 0.6457498073577881, + "learning_rate": 5.753184467755706e-06, + "loss": 0.3981, + "step": 28695 + }, + { + "epoch": 0.6397194447591857, + "grad_norm": 0.6398038268089294, + "learning_rate": 5.7500148350404184e-06, + "loss": 0.3208, + "step": 28700 + }, + { + "epoch": 0.6398308941398059, + "grad_norm": 0.704741358757019, + "learning_rate": 5.746845723340509e-06, + "loss": 0.4027, + "step": 28705 + }, + { + "epoch": 0.6399423435204259, + "grad_norm": 0.5821196436882019, + "learning_rate": 5.743677133044483e-06, + "loss": 0.2992, + "step": 28710 + }, + { + "epoch": 0.6400537929010459, + "grad_norm": 0.7517719268798828, + "learning_rate": 5.7405090645407825e-06, + "loss": 0.2866, + "step": 28715 + }, + { + "epoch": 0.640165242281666, + "grad_norm": 0.5159407258033752, + "learning_rate": 5.737341518217796e-06, + "loss": 0.2866, + "step": 28720 + }, + { + "epoch": 0.6402766916622861, + "grad_norm": 0.701301634311676, + "learning_rate": 5.734174494463834e-06, + "loss": 0.2667, + "step": 28725 + }, + { + "epoch": 0.6403881410429062, + "grad_norm": 0.48736444115638733, + "learning_rate": 5.731007993667155e-06, + "loss": 0.2607, + "step": 28730 + }, + { + "epoch": 0.6404995904235262, + "grad_norm": 0.6046923398971558, + "learning_rate": 5.7278420162159455e-06, + "loss": 0.2917, + "step": 28735 + }, + { + "epoch": 0.6406110398041462, + "grad_norm": 0.6669983267784119, + "learning_rate": 5.724676562498328e-06, + "loss": 0.2034, + "step": 28740 + }, + { + "epoch": 0.6407224891847664, + "grad_norm": 0.6515795588493347, + "learning_rate": 5.721511632902364e-06, + "loss": 0.1844, + "step": 28745 + }, + { + "epoch": 0.6408339385653864, + "grad_norm": 0.38362544775009155, + "learning_rate": 5.718347227816044e-06, + "loss": 0.2074, + "step": 28750 + }, + { + "epoch": 0.6409453879460065, + "grad_norm": 0.8220067620277405, + "learning_rate": 5.715183347627305e-06, + "loss": 0.2547, + "step": 28755 + }, + { + "epoch": 0.6410568373266265, + "grad_norm": 0.6852700710296631, + "learning_rate": 5.7120199927240186e-06, + "loss": 0.4051, + "step": 28760 + }, + { + "epoch": 0.6411682867072466, + "grad_norm": 0.6596646904945374, + "learning_rate": 5.708857163493981e-06, + "loss": 0.2388, + "step": 28765 + }, + { + "epoch": 0.6412797360878667, + "grad_norm": 0.8401342034339905, + "learning_rate": 5.705694860324935e-06, + "loss": 0.301, + "step": 28770 + }, + { + "epoch": 0.6413911854684867, + "grad_norm": 0.7203656435012817, + "learning_rate": 5.702533083604551e-06, + "loss": 0.3037, + "step": 28775 + }, + { + "epoch": 0.6415026348491069, + "grad_norm": 0.509978711605072, + "learning_rate": 5.6993718337204394e-06, + "loss": 0.3031, + "step": 28780 + }, + { + "epoch": 0.6416140842297269, + "grad_norm": 0.6036132574081421, + "learning_rate": 5.696211111060142e-06, + "loss": 0.3665, + "step": 28785 + }, + { + "epoch": 0.641725533610347, + "grad_norm": 0.613450288772583, + "learning_rate": 5.693050916011141e-06, + "loss": 0.3481, + "step": 28790 + }, + { + "epoch": 0.641836982990967, + "grad_norm": 0.8690477609634399, + "learning_rate": 5.68989124896086e-06, + "loss": 0.2436, + "step": 28795 + }, + { + "epoch": 0.641948432371587, + "grad_norm": 0.548732340335846, + "learning_rate": 5.68673211029664e-06, + "loss": 0.2267, + "step": 28800 + }, + { + "epoch": 0.6420598817522072, + "grad_norm": 0.6367558240890503, + "learning_rate": 5.68357350040577e-06, + "loss": 0.2659, + "step": 28805 + }, + { + "epoch": 0.6421713311328272, + "grad_norm": 0.7086297869682312, + "learning_rate": 5.680415419675472e-06, + "loss": 0.2234, + "step": 28810 + }, + { + "epoch": 0.6422827805134473, + "grad_norm": 0.7409600019454956, + "learning_rate": 5.677257868492898e-06, + "loss": 0.3479, + "step": 28815 + }, + { + "epoch": 0.6423942298940674, + "grad_norm": 0.26541799306869507, + "learning_rate": 5.674100847245142e-06, + "loss": 0.1698, + "step": 28820 + }, + { + "epoch": 0.6425056792746874, + "grad_norm": 0.750893235206604, + "learning_rate": 5.6709443563192355e-06, + "loss": 0.2711, + "step": 28825 + }, + { + "epoch": 0.6426171286553075, + "grad_norm": 0.6889334917068481, + "learning_rate": 5.667788396102136e-06, + "loss": 0.3132, + "step": 28830 + }, + { + "epoch": 0.6427285780359275, + "grad_norm": 0.6183418035507202, + "learning_rate": 5.66463296698074e-06, + "loss": 0.2488, + "step": 28835 + }, + { + "epoch": 0.6428400274165477, + "grad_norm": 0.6999242305755615, + "learning_rate": 5.661478069341877e-06, + "loss": 0.4003, + "step": 28840 + }, + { + "epoch": 0.6429514767971677, + "grad_norm": 0.8293443322181702, + "learning_rate": 5.658323703572313e-06, + "loss": 0.2561, + "step": 28845 + }, + { + "epoch": 0.6430629261777878, + "grad_norm": 0.6160246133804321, + "learning_rate": 5.655169870058752e-06, + "loss": 0.2749, + "step": 28850 + }, + { + "epoch": 0.6431743755584078, + "grad_norm": 0.7358884215354919, + "learning_rate": 5.652016569187823e-06, + "loss": 0.2327, + "step": 28855 + }, + { + "epoch": 0.6432858249390279, + "grad_norm": 0.5642139315605164, + "learning_rate": 5.648863801346108e-06, + "loss": 0.3409, + "step": 28860 + }, + { + "epoch": 0.643397274319648, + "grad_norm": 0.47250843048095703, + "learning_rate": 5.645711566920105e-06, + "loss": 0.3037, + "step": 28865 + }, + { + "epoch": 0.643508723700268, + "grad_norm": 0.6742610335350037, + "learning_rate": 5.642559866296253e-06, + "loss": 0.3204, + "step": 28870 + }, + { + "epoch": 0.6436201730808881, + "grad_norm": 0.6448606848716736, + "learning_rate": 5.639408699860927e-06, + "loss": 0.2055, + "step": 28875 + }, + { + "epoch": 0.6437316224615082, + "grad_norm": 0.35170137882232666, + "learning_rate": 5.636258068000433e-06, + "loss": 0.2345, + "step": 28880 + }, + { + "epoch": 0.6438430718421282, + "grad_norm": 0.5862758755683899, + "learning_rate": 5.633107971101019e-06, + "loss": 0.3759, + "step": 28885 + }, + { + "epoch": 0.6439545212227483, + "grad_norm": 0.5817404985427856, + "learning_rate": 5.629958409548859e-06, + "loss": 0.201, + "step": 28890 + }, + { + "epoch": 0.6440659706033683, + "grad_norm": 0.9205470681190491, + "learning_rate": 5.626809383730069e-06, + "loss": 0.3098, + "step": 28895 + }, + { + "epoch": 0.6441774199839885, + "grad_norm": 0.5637964606285095, + "learning_rate": 5.623660894030691e-06, + "loss": 0.3565, + "step": 28900 + }, + { + "epoch": 0.6442888693646085, + "grad_norm": 0.5298755168914795, + "learning_rate": 5.620512940836711e-06, + "loss": 0.3908, + "step": 28905 + }, + { + "epoch": 0.6444003187452286, + "grad_norm": 0.7002055048942566, + "learning_rate": 5.617365524534033e-06, + "loss": 0.2143, + "step": 28910 + }, + { + "epoch": 0.6445117681258486, + "grad_norm": 0.6193220615386963, + "learning_rate": 5.614218645508518e-06, + "loss": 0.2276, + "step": 28915 + }, + { + "epoch": 0.6446232175064687, + "grad_norm": 0.5418049097061157, + "learning_rate": 5.611072304145944e-06, + "loss": 0.4421, + "step": 28920 + }, + { + "epoch": 0.6447346668870888, + "grad_norm": 0.5820499062538147, + "learning_rate": 5.607926500832024e-06, + "loss": 0.2382, + "step": 28925 + }, + { + "epoch": 0.6448461162677088, + "grad_norm": 0.5766317844390869, + "learning_rate": 5.604781235952418e-06, + "loss": 0.2872, + "step": 28930 + }, + { + "epoch": 0.644957565648329, + "grad_norm": 0.896270215511322, + "learning_rate": 5.601636509892706e-06, + "loss": 0.3515, + "step": 28935 + }, + { + "epoch": 0.645069015028949, + "grad_norm": 0.613178551197052, + "learning_rate": 5.5984923230384045e-06, + "loss": 0.2528, + "step": 28940 + }, + { + "epoch": 0.645180464409569, + "grad_norm": 0.7325118184089661, + "learning_rate": 5.595348675774972e-06, + "loss": 0.2991, + "step": 28945 + }, + { + "epoch": 0.6452919137901891, + "grad_norm": 0.6555901169776917, + "learning_rate": 5.5922055684877956e-06, + "loss": 0.2551, + "step": 28950 + }, + { + "epoch": 0.6454033631708092, + "grad_norm": 0.7699524760246277, + "learning_rate": 5.589063001562191e-06, + "loss": 0.3519, + "step": 28955 + }, + { + "epoch": 0.6455148125514293, + "grad_norm": 0.8284532427787781, + "learning_rate": 5.585920975383413e-06, + "loss": 0.2222, + "step": 28960 + }, + { + "epoch": 0.6456262619320493, + "grad_norm": 0.41853615641593933, + "learning_rate": 5.582779490336656e-06, + "loss": 0.2515, + "step": 28965 + }, + { + "epoch": 0.6457377113126693, + "grad_norm": 0.780164361000061, + "learning_rate": 5.579638546807037e-06, + "loss": 0.4116, + "step": 28970 + }, + { + "epoch": 0.6458491606932895, + "grad_norm": 0.6371381878852844, + "learning_rate": 5.5764981451796085e-06, + "loss": 0.2392, + "step": 28975 + }, + { + "epoch": 0.6459606100739095, + "grad_norm": 0.6431651711463928, + "learning_rate": 5.573358285839367e-06, + "loss": 0.3508, + "step": 28980 + }, + { + "epoch": 0.6460720594545296, + "grad_norm": 0.3787180781364441, + "learning_rate": 5.57021896917123e-06, + "loss": 0.2437, + "step": 28985 + }, + { + "epoch": 0.6461835088351496, + "grad_norm": 0.7805829048156738, + "learning_rate": 5.567080195560057e-06, + "loss": 0.1728, + "step": 28990 + }, + { + "epoch": 0.6462949582157698, + "grad_norm": 0.7594588398933411, + "learning_rate": 5.563941965390631e-06, + "loss": 0.3961, + "step": 28995 + }, + { + "epoch": 0.6464064075963898, + "grad_norm": 0.6046558022499084, + "learning_rate": 5.560804279047682e-06, + "loss": 0.3928, + "step": 29000 + }, + { + "epoch": 0.6465178569770098, + "grad_norm": 0.5299666523933411, + "learning_rate": 5.557667136915859e-06, + "loss": 0.2343, + "step": 29005 + }, + { + "epoch": 0.6466293063576299, + "grad_norm": 0.5828954577445984, + "learning_rate": 5.554530539379759e-06, + "loss": 0.2749, + "step": 29010 + }, + { + "epoch": 0.64674075573825, + "grad_norm": 0.7023400664329529, + "learning_rate": 5.551394486823903e-06, + "loss": 0.1958, + "step": 29015 + }, + { + "epoch": 0.6468522051188701, + "grad_norm": 0.7730718851089478, + "learning_rate": 5.548258979632743e-06, + "loss": 0.2952, + "step": 29020 + }, + { + "epoch": 0.6469636544994901, + "grad_norm": 0.4209458529949188, + "learning_rate": 5.545124018190671e-06, + "loss": 0.2644, + "step": 29025 + }, + { + "epoch": 0.6470751038801101, + "grad_norm": 0.526593029499054, + "learning_rate": 5.541989602882003e-06, + "loss": 0.1792, + "step": 29030 + }, + { + "epoch": 0.6471865532607303, + "grad_norm": 0.4635657072067261, + "learning_rate": 5.5388557340909995e-06, + "loss": 0.2978, + "step": 29035 + }, + { + "epoch": 0.6472980026413503, + "grad_norm": 0.4689362347126007, + "learning_rate": 5.535722412201854e-06, + "loss": 0.2753, + "step": 29040 + }, + { + "epoch": 0.6474094520219704, + "grad_norm": 0.532612681388855, + "learning_rate": 5.53258963759868e-06, + "loss": 0.3198, + "step": 29045 + }, + { + "epoch": 0.6475209014025904, + "grad_norm": 0.6630828976631165, + "learning_rate": 5.529457410665533e-06, + "loss": 0.2862, + "step": 29050 + }, + { + "epoch": 0.6476323507832106, + "grad_norm": 0.8753209114074707, + "learning_rate": 5.526325731786402e-06, + "loss": 0.2556, + "step": 29055 + }, + { + "epoch": 0.6477438001638306, + "grad_norm": 0.49083131551742554, + "learning_rate": 5.523194601345199e-06, + "loss": 0.2725, + "step": 29060 + }, + { + "epoch": 0.6478552495444506, + "grad_norm": 0.6012879014015198, + "learning_rate": 5.520064019725787e-06, + "loss": 0.2719, + "step": 29065 + }, + { + "epoch": 0.6479666989250707, + "grad_norm": 0.6715513467788696, + "learning_rate": 5.516933987311942e-06, + "loss": 0.3033, + "step": 29070 + }, + { + "epoch": 0.6480781483056908, + "grad_norm": 0.72255939245224, + "learning_rate": 5.51380450448739e-06, + "loss": 0.301, + "step": 29075 + }, + { + "epoch": 0.6481895976863109, + "grad_norm": 0.6323819756507874, + "learning_rate": 5.5106755716357796e-06, + "loss": 0.326, + "step": 29080 + }, + { + "epoch": 0.6483010470669309, + "grad_norm": 0.5553443431854248, + "learning_rate": 5.50754718914069e-06, + "loss": 0.2591, + "step": 29085 + }, + { + "epoch": 0.648412496447551, + "grad_norm": 0.8041399717330933, + "learning_rate": 5.504419357385639e-06, + "loss": 0.3453, + "step": 29090 + }, + { + "epoch": 0.6485239458281711, + "grad_norm": 0.6085423827171326, + "learning_rate": 5.50129207675407e-06, + "loss": 0.2194, + "step": 29095 + }, + { + "epoch": 0.6486353952087911, + "grad_norm": 0.6338213682174683, + "learning_rate": 5.498165347629367e-06, + "loss": 0.4204, + "step": 29100 + }, + { + "epoch": 0.6487468445894112, + "grad_norm": 0.97102952003479, + "learning_rate": 5.495039170394849e-06, + "loss": 0.2265, + "step": 29105 + }, + { + "epoch": 0.6488582939700313, + "grad_norm": 0.42732176184654236, + "learning_rate": 5.491913545433756e-06, + "loss": 0.3023, + "step": 29110 + }, + { + "epoch": 0.6489697433506513, + "grad_norm": 0.7507066130638123, + "learning_rate": 5.488788473129264e-06, + "loss": 0.2947, + "step": 29115 + }, + { + "epoch": 0.6490811927312714, + "grad_norm": 0.6872125864028931, + "learning_rate": 5.485663953864484e-06, + "loss": 0.2854, + "step": 29120 + }, + { + "epoch": 0.6491926421118914, + "grad_norm": 1.0153363943099976, + "learning_rate": 5.482539988022455e-06, + "loss": 0.3717, + "step": 29125 + }, + { + "epoch": 0.6493040914925116, + "grad_norm": 1.0214664936065674, + "learning_rate": 5.4794165759861565e-06, + "loss": 0.2702, + "step": 29130 + }, + { + "epoch": 0.6494155408731316, + "grad_norm": 0.470058798789978, + "learning_rate": 5.47629371813849e-06, + "loss": 0.2975, + "step": 29135 + }, + { + "epoch": 0.6495269902537517, + "grad_norm": 0.4673933684825897, + "learning_rate": 5.473171414862299e-06, + "loss": 0.3017, + "step": 29140 + }, + { + "epoch": 0.6496384396343717, + "grad_norm": 0.741070032119751, + "learning_rate": 5.470049666540352e-06, + "loss": 0.3026, + "step": 29145 + }, + { + "epoch": 0.6497498890149918, + "grad_norm": 0.710664689540863, + "learning_rate": 5.466928473555347e-06, + "loss": 0.2238, + "step": 29150 + }, + { + "epoch": 0.6498613383956119, + "grad_norm": 0.4912746846675873, + "learning_rate": 5.463807836289921e-06, + "loss": 0.2674, + "step": 29155 + }, + { + "epoch": 0.6499727877762319, + "grad_norm": 0.48481565713882446, + "learning_rate": 5.460687755126641e-06, + "loss": 0.342, + "step": 29160 + }, + { + "epoch": 0.650084237156852, + "grad_norm": 0.6787352561950684, + "learning_rate": 5.457568230448005e-06, + "loss": 0.3587, + "step": 29165 + }, + { + "epoch": 0.6501956865374721, + "grad_norm": 0.5041036605834961, + "learning_rate": 5.454449262636443e-06, + "loss": 0.2712, + "step": 29170 + }, + { + "epoch": 0.6503071359180921, + "grad_norm": 0.8429849743843079, + "learning_rate": 5.4513308520743105e-06, + "loss": 0.3708, + "step": 29175 + }, + { + "epoch": 0.6504185852987122, + "grad_norm": 0.7078865766525269, + "learning_rate": 5.448212999143909e-06, + "loss": 0.3688, + "step": 29180 + }, + { + "epoch": 0.6505300346793322, + "grad_norm": 0.44548001885414124, + "learning_rate": 5.445095704227459e-06, + "loss": 0.2585, + "step": 29185 + }, + { + "epoch": 0.6506414840599524, + "grad_norm": 0.5557127594947815, + "learning_rate": 5.4419789677071135e-06, + "loss": 0.2925, + "step": 29190 + }, + { + "epoch": 0.6507529334405724, + "grad_norm": 0.5642905831336975, + "learning_rate": 5.438862789964969e-06, + "loss": 0.2724, + "step": 29195 + }, + { + "epoch": 0.6508643828211925, + "grad_norm": 0.38652265071868896, + "learning_rate": 5.435747171383039e-06, + "loss": 0.2282, + "step": 29200 + }, + { + "epoch": 0.6509758322018125, + "grad_norm": 0.721820592880249, + "learning_rate": 5.432632112343274e-06, + "loss": 0.3158, + "step": 29205 + }, + { + "epoch": 0.6510872815824326, + "grad_norm": 0.8188399076461792, + "learning_rate": 5.429517613227555e-06, + "loss": 0.4775, + "step": 29210 + }, + { + "epoch": 0.6511987309630527, + "grad_norm": 0.5114537477493286, + "learning_rate": 5.426403674417701e-06, + "loss": 0.246, + "step": 29215 + }, + { + "epoch": 0.6513101803436727, + "grad_norm": 0.6441953182220459, + "learning_rate": 5.423290296295452e-06, + "loss": 0.2664, + "step": 29220 + }, + { + "epoch": 0.6514216297242928, + "grad_norm": 0.7435140013694763, + "learning_rate": 5.420177479242488e-06, + "loss": 0.2502, + "step": 29225 + }, + { + "epoch": 0.6515330791049129, + "grad_norm": 0.6647602319717407, + "learning_rate": 5.4170652236404144e-06, + "loss": 0.2905, + "step": 29230 + }, + { + "epoch": 0.6516445284855329, + "grad_norm": 0.4995235502719879, + "learning_rate": 5.413953529870769e-06, + "loss": 0.3982, + "step": 29235 + }, + { + "epoch": 0.651755977866153, + "grad_norm": 0.8494893312454224, + "learning_rate": 5.410842398315022e-06, + "loss": 0.2187, + "step": 29240 + }, + { + "epoch": 0.651867427246773, + "grad_norm": 0.42740708589553833, + "learning_rate": 5.407731829354572e-06, + "loss": 0.3034, + "step": 29245 + }, + { + "epoch": 0.6519788766273932, + "grad_norm": 0.7548797726631165, + "learning_rate": 5.4046218233707505e-06, + "loss": 0.2383, + "step": 29250 + }, + { + "epoch": 0.6520903260080132, + "grad_norm": 0.5193314552307129, + "learning_rate": 5.4015123807448286e-06, + "loss": 0.3382, + "step": 29255 + }, + { + "epoch": 0.6522017753886333, + "grad_norm": 0.4508189260959625, + "learning_rate": 5.3984035018579924e-06, + "loss": 0.4111, + "step": 29260 + }, + { + "epoch": 0.6523132247692534, + "grad_norm": 0.7551596760749817, + "learning_rate": 5.39529518709137e-06, + "loss": 0.343, + "step": 29265 + }, + { + "epoch": 0.6524246741498734, + "grad_norm": 0.5172317624092102, + "learning_rate": 5.392187436826012e-06, + "loss": 0.3155, + "step": 29270 + }, + { + "epoch": 0.6525361235304935, + "grad_norm": 0.7789238691329956, + "learning_rate": 5.3890802514429045e-06, + "loss": 0.2763, + "step": 29275 + }, + { + "epoch": 0.6526475729111135, + "grad_norm": 0.43609416484832764, + "learning_rate": 5.3859736313229715e-06, + "loss": 0.311, + "step": 29280 + }, + { + "epoch": 0.6527590222917337, + "grad_norm": 0.8336686491966248, + "learning_rate": 5.382867576847053e-06, + "loss": 0.3602, + "step": 29285 + }, + { + "epoch": 0.6528704716723537, + "grad_norm": 0.5604103803634644, + "learning_rate": 5.379762088395935e-06, + "loss": 0.3437, + "step": 29290 + }, + { + "epoch": 0.6529819210529737, + "grad_norm": 0.3999236226081848, + "learning_rate": 5.3766571663503205e-06, + "loss": 0.2477, + "step": 29295 + }, + { + "epoch": 0.6530933704335938, + "grad_norm": 0.5317768454551697, + "learning_rate": 5.373552811090852e-06, + "loss": 0.2595, + "step": 29300 + }, + { + "epoch": 0.6532048198142139, + "grad_norm": 0.6594027280807495, + "learning_rate": 5.3704490229980975e-06, + "loss": 0.2629, + "step": 29305 + }, + { + "epoch": 0.653316269194834, + "grad_norm": 0.7733690738677979, + "learning_rate": 5.367345802452555e-06, + "loss": 0.2808, + "step": 29310 + }, + { + "epoch": 0.653427718575454, + "grad_norm": 0.3656593859195709, + "learning_rate": 5.364243149834658e-06, + "loss": 0.1783, + "step": 29315 + }, + { + "epoch": 0.653539167956074, + "grad_norm": 0.8090857863426208, + "learning_rate": 5.361141065524773e-06, + "loss": 0.2217, + "step": 29320 + }, + { + "epoch": 0.6536506173366942, + "grad_norm": 0.5806332230567932, + "learning_rate": 5.358039549903186e-06, + "loss": 0.283, + "step": 29325 + }, + { + "epoch": 0.6537620667173142, + "grad_norm": 0.4583825170993805, + "learning_rate": 5.354938603350119e-06, + "loss": 0.2812, + "step": 29330 + }, + { + "epoch": 0.6538735160979343, + "grad_norm": 0.4804151952266693, + "learning_rate": 5.3518382262457265e-06, + "loss": 0.2342, + "step": 29335 + }, + { + "epoch": 0.6539849654785543, + "grad_norm": 0.5588163733482361, + "learning_rate": 5.348738418970084e-06, + "loss": 0.2548, + "step": 29340 + }, + { + "epoch": 0.6540964148591745, + "grad_norm": 0.5237212777137756, + "learning_rate": 5.345639181903214e-06, + "loss": 0.2743, + "step": 29345 + }, + { + "epoch": 0.6542078642397945, + "grad_norm": 0.49168357253074646, + "learning_rate": 5.34254051542505e-06, + "loss": 0.2248, + "step": 29350 + }, + { + "epoch": 0.6543193136204145, + "grad_norm": 0.34626704454421997, + "learning_rate": 5.339442419915472e-06, + "loss": 0.3692, + "step": 29355 + }, + { + "epoch": 0.6544307630010346, + "grad_norm": 0.8723349571228027, + "learning_rate": 5.336344895754279e-06, + "loss": 0.3255, + "step": 29360 + }, + { + "epoch": 0.6545422123816547, + "grad_norm": 0.5037609934806824, + "learning_rate": 5.333247943321205e-06, + "loss": 0.2013, + "step": 29365 + }, + { + "epoch": 0.6546536617622748, + "grad_norm": 0.5711085200309753, + "learning_rate": 5.330151562995908e-06, + "loss": 0.2588, + "step": 29370 + }, + { + "epoch": 0.6547651111428948, + "grad_norm": 0.5225061178207397, + "learning_rate": 5.327055755157986e-06, + "loss": 0.2935, + "step": 29375 + }, + { + "epoch": 0.6548765605235148, + "grad_norm": 0.7341488599777222, + "learning_rate": 5.323960520186959e-06, + "loss": 0.2403, + "step": 29380 + }, + { + "epoch": 0.654988009904135, + "grad_norm": 0.5305912494659424, + "learning_rate": 5.3208658584622765e-06, + "loss": 0.3251, + "step": 29385 + }, + { + "epoch": 0.655099459284755, + "grad_norm": 0.5498141646385193, + "learning_rate": 5.317771770363325e-06, + "loss": 0.3644, + "step": 29390 + }, + { + "epoch": 0.6552109086653751, + "grad_norm": 0.9295704960823059, + "learning_rate": 5.314678256269413e-06, + "loss": 0.2262, + "step": 29395 + }, + { + "epoch": 0.6553223580459951, + "grad_norm": 0.522474467754364, + "learning_rate": 5.311585316559782e-06, + "loss": 0.3555, + "step": 29400 + }, + { + "epoch": 0.6554338074266153, + "grad_norm": 0.6643032431602478, + "learning_rate": 5.308492951613597e-06, + "loss": 0.3139, + "step": 29405 + }, + { + "epoch": 0.6555452568072353, + "grad_norm": 0.9323588013648987, + "learning_rate": 5.305401161809969e-06, + "loss": 0.4577, + "step": 29410 + }, + { + "epoch": 0.6556567061878553, + "grad_norm": 0.7106078863143921, + "learning_rate": 5.3023099475279216e-06, + "loss": 0.2728, + "step": 29415 + }, + { + "epoch": 0.6557681555684755, + "grad_norm": 0.6369653344154358, + "learning_rate": 5.299219309146411e-06, + "loss": 0.3196, + "step": 29420 + }, + { + "epoch": 0.6558796049490955, + "grad_norm": 0.5267141461372375, + "learning_rate": 5.29612924704433e-06, + "loss": 0.223, + "step": 29425 + }, + { + "epoch": 0.6559910543297156, + "grad_norm": 0.4096876084804535, + "learning_rate": 5.293039761600496e-06, + "loss": 0.2565, + "step": 29430 + }, + { + "epoch": 0.6561025037103356, + "grad_norm": 0.7868233919143677, + "learning_rate": 5.2899508531936526e-06, + "loss": 0.2941, + "step": 29435 + }, + { + "epoch": 0.6562139530909556, + "grad_norm": 0.8961450457572937, + "learning_rate": 5.28686252220248e-06, + "loss": 0.275, + "step": 29440 + }, + { + "epoch": 0.6563254024715758, + "grad_norm": 0.6578887104988098, + "learning_rate": 5.283774769005585e-06, + "loss": 0.1974, + "step": 29445 + }, + { + "epoch": 0.6564368518521958, + "grad_norm": 0.38261333107948303, + "learning_rate": 5.280687593981497e-06, + "loss": 0.3405, + "step": 29450 + }, + { + "epoch": 0.6565483012328159, + "grad_norm": 0.85776686668396, + "learning_rate": 5.277600997508681e-06, + "loss": 0.3478, + "step": 29455 + }, + { + "epoch": 0.656659750613436, + "grad_norm": 0.46715691685676575, + "learning_rate": 5.274514979965535e-06, + "loss": 0.1674, + "step": 29460 + }, + { + "epoch": 0.6567711999940561, + "grad_norm": 0.4936739206314087, + "learning_rate": 5.2714295417303715e-06, + "loss": 0.3113, + "step": 29465 + }, + { + "epoch": 0.6568826493746761, + "grad_norm": 0.7164607644081116, + "learning_rate": 5.268344683181452e-06, + "loss": 0.3478, + "step": 29470 + }, + { + "epoch": 0.6569940987552961, + "grad_norm": 0.6708316206932068, + "learning_rate": 5.265260404696952e-06, + "loss": 0.2512, + "step": 29475 + }, + { + "epoch": 0.6571055481359163, + "grad_norm": 0.6708416938781738, + "learning_rate": 5.26217670665498e-06, + "loss": 0.3026, + "step": 29480 + }, + { + "epoch": 0.6572169975165363, + "grad_norm": 0.6250762939453125, + "learning_rate": 5.259093589433573e-06, + "loss": 0.2766, + "step": 29485 + }, + { + "epoch": 0.6573284468971564, + "grad_norm": 0.6325827240943909, + "learning_rate": 5.2560110534106944e-06, + "loss": 0.2531, + "step": 29490 + }, + { + "epoch": 0.6574398962777764, + "grad_norm": 0.6853511333465576, + "learning_rate": 5.252929098964246e-06, + "loss": 0.3013, + "step": 29495 + }, + { + "epoch": 0.6575513456583965, + "grad_norm": 0.6903412938117981, + "learning_rate": 5.249847726472044e-06, + "loss": 0.3223, + "step": 29500 + }, + { + "epoch": 0.6576627950390166, + "grad_norm": 0.8328612446784973, + "learning_rate": 5.246766936311849e-06, + "loss": 0.4012, + "step": 29505 + }, + { + "epoch": 0.6577742444196366, + "grad_norm": 0.6383739113807678, + "learning_rate": 5.243686728861339e-06, + "loss": 0.2335, + "step": 29510 + }, + { + "epoch": 0.6578856938002567, + "grad_norm": 0.3972104787826538, + "learning_rate": 5.240607104498123e-06, + "loss": 0.3859, + "step": 29515 + }, + { + "epoch": 0.6579971431808768, + "grad_norm": 0.8156102299690247, + "learning_rate": 5.237528063599739e-06, + "loss": 0.3795, + "step": 29520 + }, + { + "epoch": 0.6581085925614968, + "grad_norm": 0.7300518155097961, + "learning_rate": 5.23444960654365e-06, + "loss": 0.3428, + "step": 29525 + }, + { + "epoch": 0.6582200419421169, + "grad_norm": 0.9030614495277405, + "learning_rate": 5.231371733707253e-06, + "loss": 0.2376, + "step": 29530 + }, + { + "epoch": 0.6583314913227369, + "grad_norm": 0.6202174425125122, + "learning_rate": 5.2282944454678795e-06, + "loss": 0.2795, + "step": 29535 + }, + { + "epoch": 0.6584429407033571, + "grad_norm": 0.6512044072151184, + "learning_rate": 5.225217742202775e-06, + "loss": 0.3253, + "step": 29540 + }, + { + "epoch": 0.6585543900839771, + "grad_norm": 0.42493924498558044, + "learning_rate": 5.222141624289118e-06, + "loss": 0.2762, + "step": 29545 + }, + { + "epoch": 0.6586658394645972, + "grad_norm": 0.5045167207717896, + "learning_rate": 5.219066092104019e-06, + "loss": 0.2826, + "step": 29550 + }, + { + "epoch": 0.6587772888452172, + "grad_norm": 0.5697253346443176, + "learning_rate": 5.21599114602451e-06, + "loss": 0.2514, + "step": 29555 + }, + { + "epoch": 0.6588887382258373, + "grad_norm": 0.6183579564094543, + "learning_rate": 5.212916786427562e-06, + "loss": 0.2274, + "step": 29560 + }, + { + "epoch": 0.6590001876064574, + "grad_norm": 0.4757140278816223, + "learning_rate": 5.2098430136900665e-06, + "loss": 0.2824, + "step": 29565 + }, + { + "epoch": 0.6591116369870774, + "grad_norm": 0.5445846319198608, + "learning_rate": 5.206769828188844e-06, + "loss": 0.3084, + "step": 29570 + }, + { + "epoch": 0.6592230863676976, + "grad_norm": 0.7208926677703857, + "learning_rate": 5.203697230300643e-06, + "loss": 0.2515, + "step": 29575 + }, + { + "epoch": 0.6593345357483176, + "grad_norm": 0.6429301500320435, + "learning_rate": 5.200625220402139e-06, + "loss": 0.3822, + "step": 29580 + }, + { + "epoch": 0.6594459851289376, + "grad_norm": 0.575545608997345, + "learning_rate": 5.197553798869939e-06, + "loss": 0.2806, + "step": 29585 + }, + { + "epoch": 0.6595574345095577, + "grad_norm": 0.5600603222846985, + "learning_rate": 5.1944829660805675e-06, + "loss": 0.3525, + "step": 29590 + }, + { + "epoch": 0.6596688838901777, + "grad_norm": 0.49977508187294006, + "learning_rate": 5.1914127224104935e-06, + "loss": 0.2609, + "step": 29595 + }, + { + "epoch": 0.6597803332707979, + "grad_norm": 0.5002358555793762, + "learning_rate": 5.188343068236106e-06, + "loss": 0.2928, + "step": 29600 + }, + { + "epoch": 0.6598917826514179, + "grad_norm": 0.7009340524673462, + "learning_rate": 5.185274003933719e-06, + "loss": 0.1986, + "step": 29605 + }, + { + "epoch": 0.660003232032038, + "grad_norm": 0.5574341416358948, + "learning_rate": 5.1822055298795744e-06, + "loss": 0.3342, + "step": 29610 + }, + { + "epoch": 0.660114681412658, + "grad_norm": 0.5994423031806946, + "learning_rate": 5.179137646449845e-06, + "loss": 0.2479, + "step": 29615 + }, + { + "epoch": 0.6602261307932781, + "grad_norm": 0.8221856951713562, + "learning_rate": 5.176070354020624e-06, + "loss": 0.2727, + "step": 29620 + }, + { + "epoch": 0.6603375801738982, + "grad_norm": 0.6682111024856567, + "learning_rate": 5.173003652967947e-06, + "loss": 0.2282, + "step": 29625 + }, + { + "epoch": 0.6604490295545182, + "grad_norm": 0.5746262073516846, + "learning_rate": 5.169937543667759e-06, + "loss": 0.2221, + "step": 29630 + }, + { + "epoch": 0.6605604789351384, + "grad_norm": 0.6253163814544678, + "learning_rate": 5.16687202649595e-06, + "loss": 0.265, + "step": 29635 + }, + { + "epoch": 0.6606719283157584, + "grad_norm": 0.5271322131156921, + "learning_rate": 5.163807101828324e-06, + "loss": 0.2783, + "step": 29640 + }, + { + "epoch": 0.6607833776963784, + "grad_norm": 0.7899476289749146, + "learning_rate": 5.160742770040619e-06, + "loss": 0.4335, + "step": 29645 + }, + { + "epoch": 0.6608948270769985, + "grad_norm": 0.5249226093292236, + "learning_rate": 5.157679031508492e-06, + "loss": 0.2563, + "step": 29650 + }, + { + "epoch": 0.6610062764576186, + "grad_norm": 0.8132089376449585, + "learning_rate": 5.154615886607544e-06, + "loss": 0.2834, + "step": 29655 + }, + { + "epoch": 0.6611177258382387, + "grad_norm": 0.8233852386474609, + "learning_rate": 5.151553335713286e-06, + "loss": 0.4334, + "step": 29660 + }, + { + "epoch": 0.6612291752188587, + "grad_norm": 0.6578496098518372, + "learning_rate": 5.148491379201161e-06, + "loss": 0.2377, + "step": 29665 + }, + { + "epoch": 0.6613406245994788, + "grad_norm": 0.6311374306678772, + "learning_rate": 5.145430017446551e-06, + "loss": 0.2669, + "step": 29670 + }, + { + "epoch": 0.6614520739800989, + "grad_norm": 0.761950671672821, + "learning_rate": 5.142369250824747e-06, + "loss": 0.255, + "step": 29675 + }, + { + "epoch": 0.6615635233607189, + "grad_norm": 0.6191284656524658, + "learning_rate": 5.1393090797109745e-06, + "loss": 0.2739, + "step": 29680 + }, + { + "epoch": 0.661674972741339, + "grad_norm": 0.47322070598602295, + "learning_rate": 5.136249504480395e-06, + "loss": 0.2319, + "step": 29685 + }, + { + "epoch": 0.661786422121959, + "grad_norm": 0.8531567454338074, + "learning_rate": 5.133190525508083e-06, + "loss": 0.2633, + "step": 29690 + }, + { + "epoch": 0.6618978715025792, + "grad_norm": 0.32951730489730835, + "learning_rate": 5.130132143169046e-06, + "loss": 0.2739, + "step": 29695 + }, + { + "epoch": 0.6620093208831992, + "grad_norm": 0.760873019695282, + "learning_rate": 5.127074357838218e-06, + "loss": 0.3018, + "step": 29700 + }, + { + "epoch": 0.6621207702638192, + "grad_norm": 0.6249333620071411, + "learning_rate": 5.124017169890458e-06, + "loss": 0.1994, + "step": 29705 + }, + { + "epoch": 0.6622322196444393, + "grad_norm": 0.588768720626831, + "learning_rate": 5.12096057970056e-06, + "loss": 0.3882, + "step": 29710 + }, + { + "epoch": 0.6623436690250594, + "grad_norm": 0.4670078456401825, + "learning_rate": 5.117904587643231e-06, + "loss": 0.3119, + "step": 29715 + }, + { + "epoch": 0.6624551184056795, + "grad_norm": 0.7735868096351624, + "learning_rate": 5.114849194093119e-06, + "loss": 0.3149, + "step": 29720 + }, + { + "epoch": 0.6625665677862995, + "grad_norm": 0.601441502571106, + "learning_rate": 5.1117943994247875e-06, + "loss": 0.2163, + "step": 29725 + }, + { + "epoch": 0.6626780171669195, + "grad_norm": 0.617202877998352, + "learning_rate": 5.108740204012732e-06, + "loss": 0.2872, + "step": 29730 + }, + { + "epoch": 0.6627894665475397, + "grad_norm": 0.626083254814148, + "learning_rate": 5.105686608231373e-06, + "loss": 0.316, + "step": 29735 + }, + { + "epoch": 0.6629009159281597, + "grad_norm": 0.7950212359428406, + "learning_rate": 5.1026336124550545e-06, + "loss": 0.3991, + "step": 29740 + }, + { + "epoch": 0.6630123653087798, + "grad_norm": 0.7873651385307312, + "learning_rate": 5.099581217058052e-06, + "loss": 0.3129, + "step": 29745 + }, + { + "epoch": 0.6631238146893998, + "grad_norm": 0.576119065284729, + "learning_rate": 5.096529422414571e-06, + "loss": 0.351, + "step": 29750 + }, + { + "epoch": 0.66323526407002, + "grad_norm": 0.7593878507614136, + "learning_rate": 5.093478228898734e-06, + "loss": 0.2498, + "step": 29755 + }, + { + "epoch": 0.66334671345064, + "grad_norm": 0.5120570063591003, + "learning_rate": 5.090427636884593e-06, + "loss": 0.2429, + "step": 29760 + }, + { + "epoch": 0.66345816283126, + "grad_norm": 0.24186640977859497, + "learning_rate": 5.087377646746128e-06, + "loss": 0.251, + "step": 29765 + }, + { + "epoch": 0.6635696122118802, + "grad_norm": 0.586061418056488, + "learning_rate": 5.084328258857241e-06, + "loss": 0.2237, + "step": 29770 + }, + { + "epoch": 0.6636810615925002, + "grad_norm": 0.36794933676719666, + "learning_rate": 5.081279473591765e-06, + "loss": 0.2927, + "step": 29775 + }, + { + "epoch": 0.6637925109731203, + "grad_norm": 0.5301176309585571, + "learning_rate": 5.078231291323463e-06, + "loss": 0.3115, + "step": 29780 + }, + { + "epoch": 0.6639039603537403, + "grad_norm": 0.712449848651886, + "learning_rate": 5.075183712426014e-06, + "loss": 0.2553, + "step": 29785 + }, + { + "epoch": 0.6640154097343604, + "grad_norm": 0.803022027015686, + "learning_rate": 5.072136737273029e-06, + "loss": 0.1746, + "step": 29790 + }, + { + "epoch": 0.6641268591149805, + "grad_norm": 0.48893946409225464, + "learning_rate": 5.069090366238042e-06, + "loss": 0.1848, + "step": 29795 + }, + { + "epoch": 0.6642383084956005, + "grad_norm": 0.5302146077156067, + "learning_rate": 5.066044599694515e-06, + "loss": 0.298, + "step": 29800 + }, + { + "epoch": 0.6643497578762206, + "grad_norm": 0.5051549673080444, + "learning_rate": 5.062999438015834e-06, + "loss": 0.2469, + "step": 29805 + }, + { + "epoch": 0.6644612072568407, + "grad_norm": 0.7402228116989136, + "learning_rate": 5.059954881575313e-06, + "loss": 0.3926, + "step": 29810 + }, + { + "epoch": 0.6645726566374608, + "grad_norm": 0.3407340943813324, + "learning_rate": 5.056910930746195e-06, + "loss": 0.2767, + "step": 29815 + }, + { + "epoch": 0.6646841060180808, + "grad_norm": 0.5036594271659851, + "learning_rate": 5.0538675859016425e-06, + "loss": 0.3033, + "step": 29820 + }, + { + "epoch": 0.6647955553987008, + "grad_norm": 0.4008634388446808, + "learning_rate": 5.0508248474147455e-06, + "loss": 0.3629, + "step": 29825 + }, + { + "epoch": 0.664907004779321, + "grad_norm": 0.9136576652526855, + "learning_rate": 5.047782715658523e-06, + "loss": 0.4994, + "step": 29830 + }, + { + "epoch": 0.665018454159941, + "grad_norm": 0.7496482729911804, + "learning_rate": 5.044741191005908e-06, + "loss": 0.3205, + "step": 29835 + }, + { + "epoch": 0.6651299035405611, + "grad_norm": 0.6265192031860352, + "learning_rate": 5.041700273829778e-06, + "loss": 0.2063, + "step": 29840 + }, + { + "epoch": 0.6652413529211811, + "grad_norm": 0.591755747795105, + "learning_rate": 5.038659964502919e-06, + "loss": 0.244, + "step": 29845 + }, + { + "epoch": 0.6653528023018012, + "grad_norm": 0.6225812435150146, + "learning_rate": 5.035620263398056e-06, + "loss": 0.3191, + "step": 29850 + }, + { + "epoch": 0.6654642516824213, + "grad_norm": 0.43200963735580444, + "learning_rate": 5.032581170887831e-06, + "loss": 0.198, + "step": 29855 + }, + { + "epoch": 0.6655757010630413, + "grad_norm": 0.43655383586883545, + "learning_rate": 5.02954268734481e-06, + "loss": 0.3137, + "step": 29860 + }, + { + "epoch": 0.6656871504436614, + "grad_norm": 0.5098513960838318, + "learning_rate": 5.026504813141487e-06, + "loss": 0.3615, + "step": 29865 + }, + { + "epoch": 0.6657985998242815, + "grad_norm": 0.764817476272583, + "learning_rate": 5.023467548650288e-06, + "loss": 0.224, + "step": 29870 + }, + { + "epoch": 0.6659100492049015, + "grad_norm": 0.5321330428123474, + "learning_rate": 5.020430894243556e-06, + "loss": 0.3052, + "step": 29875 + }, + { + "epoch": 0.6660214985855216, + "grad_norm": 0.48160234093666077, + "learning_rate": 5.017394850293553e-06, + "loss": 0.2948, + "step": 29880 + }, + { + "epoch": 0.6661329479661416, + "grad_norm": 0.6200422048568726, + "learning_rate": 5.0143594171724875e-06, + "loss": 0.2417, + "step": 29885 + }, + { + "epoch": 0.6662443973467618, + "grad_norm": 0.7394756078720093, + "learning_rate": 5.011324595252474e-06, + "loss": 0.305, + "step": 29890 + }, + { + "epoch": 0.6663558467273818, + "grad_norm": 0.6181401610374451, + "learning_rate": 5.0082903849055585e-06, + "loss": 0.3193, + "step": 29895 + }, + { + "epoch": 0.6664672961080019, + "grad_norm": 0.46570515632629395, + "learning_rate": 5.0052567865037075e-06, + "loss": 0.2006, + "step": 29900 + }, + { + "epoch": 0.666578745488622, + "grad_norm": 0.6175560355186462, + "learning_rate": 5.002223800418824e-06, + "loss": 0.3857, + "step": 29905 + }, + { + "epoch": 0.666690194869242, + "grad_norm": 0.40701529383659363, + "learning_rate": 4.9991914270227274e-06, + "loss": 0.3133, + "step": 29910 + }, + { + "epoch": 0.6668016442498621, + "grad_norm": 0.529159665107727, + "learning_rate": 4.996159666687156e-06, + "loss": 0.2521, + "step": 29915 + }, + { + "epoch": 0.6669130936304821, + "grad_norm": 0.5332446694374084, + "learning_rate": 4.993128519783791e-06, + "loss": 0.1586, + "step": 29920 + }, + { + "epoch": 0.6670245430111023, + "grad_norm": 0.7735430598258972, + "learning_rate": 4.990097986684221e-06, + "loss": 0.204, + "step": 29925 + }, + { + "epoch": 0.6671359923917223, + "grad_norm": 0.5558745861053467, + "learning_rate": 4.987068067759965e-06, + "loss": 0.361, + "step": 29930 + }, + { + "epoch": 0.6672474417723423, + "grad_norm": 0.6907291412353516, + "learning_rate": 4.984038763382473e-06, + "loss": 0.3075, + "step": 29935 + }, + { + "epoch": 0.6673588911529624, + "grad_norm": 0.3587920069694519, + "learning_rate": 4.981010073923112e-06, + "loss": 0.3012, + "step": 29940 + }, + { + "epoch": 0.6674703405335825, + "grad_norm": 0.6192587018013, + "learning_rate": 4.977981999753174e-06, + "loss": 0.3315, + "step": 29945 + }, + { + "epoch": 0.6675817899142026, + "grad_norm": 0.5411548614501953, + "learning_rate": 4.974954541243876e-06, + "loss": 0.4462, + "step": 29950 + }, + { + "epoch": 0.6676932392948226, + "grad_norm": 0.7996327877044678, + "learning_rate": 4.971927698766367e-06, + "loss": 0.1447, + "step": 29955 + }, + { + "epoch": 0.6678046886754427, + "grad_norm": 0.7160996198654175, + "learning_rate": 4.9689014726917085e-06, + "loss": 0.3103, + "step": 29960 + }, + { + "epoch": 0.6679161380560628, + "grad_norm": 0.5828596353530884, + "learning_rate": 4.965875863390898e-06, + "loss": 0.352, + "step": 29965 + }, + { + "epoch": 0.6680275874366828, + "grad_norm": 0.47592854499816895, + "learning_rate": 4.96285087123485e-06, + "loss": 0.3189, + "step": 29970 + }, + { + "epoch": 0.6681390368173029, + "grad_norm": 0.5826191902160645, + "learning_rate": 4.9598264965944044e-06, + "loss": 0.2964, + "step": 29975 + }, + { + "epoch": 0.6682504861979229, + "grad_norm": 0.5355210304260254, + "learning_rate": 4.956802739840325e-06, + "loss": 0.2218, + "step": 29980 + }, + { + "epoch": 0.6683619355785431, + "grad_norm": 0.9146451950073242, + "learning_rate": 4.953779601343299e-06, + "loss": 0.2733, + "step": 29985 + }, + { + "epoch": 0.6684733849591631, + "grad_norm": 0.916202962398529, + "learning_rate": 4.9507570814739435e-06, + "loss": 0.2596, + "step": 29990 + }, + { + "epoch": 0.6685848343397831, + "grad_norm": 0.873465895652771, + "learning_rate": 4.9477351806028e-06, + "loss": 0.4759, + "step": 29995 + }, + { + "epoch": 0.6686962837204032, + "grad_norm": 0.6375863552093506, + "learning_rate": 4.944713899100324e-06, + "loss": 0.1947, + "step": 30000 + }, + { + "epoch": 0.6688077331010233, + "grad_norm": 0.5613588690757751, + "learning_rate": 4.941693237336904e-06, + "loss": 0.3548, + "step": 30005 + }, + { + "epoch": 0.6689191824816434, + "grad_norm": 0.5151784420013428, + "learning_rate": 4.938673195682849e-06, + "loss": 0.2757, + "step": 30010 + }, + { + "epoch": 0.6690306318622634, + "grad_norm": 0.433434396982193, + "learning_rate": 4.935653774508393e-06, + "loss": 0.2052, + "step": 30015 + }, + { + "epoch": 0.6691420812428835, + "grad_norm": 0.7155526280403137, + "learning_rate": 4.93263497418369e-06, + "loss": 0.2852, + "step": 30020 + }, + { + "epoch": 0.6692535306235036, + "grad_norm": 0.8238219022750854, + "learning_rate": 4.929616795078825e-06, + "loss": 0.3083, + "step": 30025 + }, + { + "epoch": 0.6693649800041236, + "grad_norm": 0.531947135925293, + "learning_rate": 4.926599237563807e-06, + "loss": 0.2996, + "step": 30030 + }, + { + "epoch": 0.6694764293847437, + "grad_norm": 0.6209695339202881, + "learning_rate": 4.923582302008562e-06, + "loss": 0.3227, + "step": 30035 + }, + { + "epoch": 0.6695878787653637, + "grad_norm": 0.6829642057418823, + "learning_rate": 4.920565988782943e-06, + "loss": 0.3823, + "step": 30040 + }, + { + "epoch": 0.6696993281459839, + "grad_norm": 1.1301989555358887, + "learning_rate": 4.917550298256726e-06, + "loss": 0.2317, + "step": 30045 + }, + { + "epoch": 0.6698107775266039, + "grad_norm": 0.6049754023551941, + "learning_rate": 4.914535230799609e-06, + "loss": 0.341, + "step": 30050 + }, + { + "epoch": 0.6699222269072239, + "grad_norm": 0.8974992036819458, + "learning_rate": 4.911520786781218e-06, + "loss": 0.191, + "step": 30055 + }, + { + "epoch": 0.670033676287844, + "grad_norm": 0.7844485640525818, + "learning_rate": 4.908506966571106e-06, + "loss": 0.2516, + "step": 30060 + }, + { + "epoch": 0.6701451256684641, + "grad_norm": 0.5584155321121216, + "learning_rate": 4.905493770538739e-06, + "loss": 0.3154, + "step": 30065 + }, + { + "epoch": 0.6702565750490842, + "grad_norm": 0.5932765603065491, + "learning_rate": 4.902481199053512e-06, + "loss": 0.3324, + "step": 30070 + }, + { + "epoch": 0.6703680244297042, + "grad_norm": 0.6151829957962036, + "learning_rate": 4.899469252484744e-06, + "loss": 0.282, + "step": 30075 + }, + { + "epoch": 0.6704794738103242, + "grad_norm": 0.6490795016288757, + "learning_rate": 4.896457931201671e-06, + "loss": 0.2691, + "step": 30080 + }, + { + "epoch": 0.6705909231909444, + "grad_norm": 0.4294373393058777, + "learning_rate": 4.8934472355734675e-06, + "loss": 0.3277, + "step": 30085 + }, + { + "epoch": 0.6707023725715644, + "grad_norm": 0.48455068469047546, + "learning_rate": 4.890437165969212e-06, + "loss": 0.2612, + "step": 30090 + }, + { + "epoch": 0.6708138219521845, + "grad_norm": 0.6728994250297546, + "learning_rate": 4.887427722757924e-06, + "loss": 0.2395, + "step": 30095 + }, + { + "epoch": 0.6709252713328046, + "grad_norm": 0.431524395942688, + "learning_rate": 4.884418906308533e-06, + "loss": 0.2241, + "step": 30100 + }, + { + "epoch": 0.6710367207134247, + "grad_norm": 0.7784751653671265, + "learning_rate": 4.881410716989899e-06, + "loss": 0.2834, + "step": 30105 + }, + { + "epoch": 0.6711481700940447, + "grad_norm": 0.8710717558860779, + "learning_rate": 4.878403155170801e-06, + "loss": 0.3092, + "step": 30110 + }, + { + "epoch": 0.6712596194746647, + "grad_norm": 0.6971104145050049, + "learning_rate": 4.87539622121994e-06, + "loss": 0.2769, + "step": 30115 + }, + { + "epoch": 0.6713710688552849, + "grad_norm": 0.7268978953361511, + "learning_rate": 4.872389915505951e-06, + "loss": 0.3372, + "step": 30120 + }, + { + "epoch": 0.6714825182359049, + "grad_norm": 0.5553570985794067, + "learning_rate": 4.869384238397375e-06, + "loss": 0.2756, + "step": 30125 + }, + { + "epoch": 0.671593967616525, + "grad_norm": 0.554465115070343, + "learning_rate": 4.866379190262692e-06, + "loss": 0.2625, + "step": 30130 + }, + { + "epoch": 0.671705416997145, + "grad_norm": 0.904084324836731, + "learning_rate": 4.863374771470296e-06, + "loss": 0.2299, + "step": 30135 + }, + { + "epoch": 0.671816866377765, + "grad_norm": 0.6341699957847595, + "learning_rate": 4.860370982388504e-06, + "loss": 0.2554, + "step": 30140 + }, + { + "epoch": 0.6719283157583852, + "grad_norm": 0.4441910684108734, + "learning_rate": 4.8573678233855534e-06, + "loss": 0.2788, + "step": 30145 + }, + { + "epoch": 0.6720397651390052, + "grad_norm": 0.45818713307380676, + "learning_rate": 4.854365294829617e-06, + "loss": 0.2327, + "step": 30150 + }, + { + "epoch": 0.6721512145196253, + "grad_norm": 0.771054744720459, + "learning_rate": 4.851363397088777e-06, + "loss": 0.2874, + "step": 30155 + }, + { + "epoch": 0.6722626639002454, + "grad_norm": 0.7584509253501892, + "learning_rate": 4.848362130531039e-06, + "loss": 0.3167, + "step": 30160 + }, + { + "epoch": 0.6723741132808655, + "grad_norm": 0.45884889364242554, + "learning_rate": 4.845361495524343e-06, + "loss": 0.2909, + "step": 30165 + }, + { + "epoch": 0.6724855626614855, + "grad_norm": 0.36979031562805176, + "learning_rate": 4.842361492436541e-06, + "loss": 0.1748, + "step": 30170 + }, + { + "epoch": 0.6725970120421055, + "grad_norm": 0.7901503443717957, + "learning_rate": 4.839362121635405e-06, + "loss": 0.2478, + "step": 30175 + }, + { + "epoch": 0.6727084614227257, + "grad_norm": 0.6436774134635925, + "learning_rate": 4.836363383488643e-06, + "loss": 0.3461, + "step": 30180 + }, + { + "epoch": 0.6728199108033457, + "grad_norm": 0.30818986892700195, + "learning_rate": 4.833365278363872e-06, + "loss": 0.2585, + "step": 30185 + }, + { + "epoch": 0.6729313601839658, + "grad_norm": 0.6664495468139648, + "learning_rate": 4.830367806628637e-06, + "loss": 0.2956, + "step": 30190 + }, + { + "epoch": 0.6730428095645858, + "grad_norm": 0.5725700855255127, + "learning_rate": 4.827370968650403e-06, + "loss": 0.1498, + "step": 30195 + }, + { + "epoch": 0.6731542589452059, + "grad_norm": 0.552641749382019, + "learning_rate": 4.824374764796565e-06, + "loss": 0.2645, + "step": 30200 + }, + { + "epoch": 0.673265708325826, + "grad_norm": 0.7079885601997375, + "learning_rate": 4.8213791954344315e-06, + "loss": 0.3482, + "step": 30205 + }, + { + "epoch": 0.673377157706446, + "grad_norm": 0.8411067128181458, + "learning_rate": 4.818384260931233e-06, + "loss": 0.3391, + "step": 30210 + }, + { + "epoch": 0.6734886070870661, + "grad_norm": 0.696851909160614, + "learning_rate": 4.81538996165413e-06, + "loss": 0.4519, + "step": 30215 + }, + { + "epoch": 0.6736000564676862, + "grad_norm": 0.6157433986663818, + "learning_rate": 4.8123962979702e-06, + "loss": 0.2456, + "step": 30220 + }, + { + "epoch": 0.6737115058483063, + "grad_norm": 1.0309218168258667, + "learning_rate": 4.809403270246441e-06, + "loss": 0.2318, + "step": 30225 + }, + { + "epoch": 0.6738229552289263, + "grad_norm": 0.4081147015094757, + "learning_rate": 4.806410878849776e-06, + "loss": 0.3775, + "step": 30230 + }, + { + "epoch": 0.6739344046095463, + "grad_norm": 0.8992989659309387, + "learning_rate": 4.803419124147045e-06, + "loss": 0.2688, + "step": 30235 + }, + { + "epoch": 0.6740458539901665, + "grad_norm": 0.4808000326156616, + "learning_rate": 4.800428006505018e-06, + "loss": 0.2405, + "step": 30240 + }, + { + "epoch": 0.6741573033707865, + "grad_norm": 0.3560895323753357, + "learning_rate": 4.797437526290386e-06, + "loss": 0.3898, + "step": 30245 + }, + { + "epoch": 0.6742687527514066, + "grad_norm": 0.4232325851917267, + "learning_rate": 4.794447683869756e-06, + "loss": 0.337, + "step": 30250 + }, + { + "epoch": 0.6743802021320267, + "grad_norm": 0.6125756502151489, + "learning_rate": 4.791458479609661e-06, + "loss": 0.3283, + "step": 30255 + }, + { + "epoch": 0.6744916515126467, + "grad_norm": 0.6041368246078491, + "learning_rate": 4.78846991387655e-06, + "loss": 0.314, + "step": 30260 + }, + { + "epoch": 0.6746031008932668, + "grad_norm": 1.6415461301803589, + "learning_rate": 4.785481987036799e-06, + "loss": 0.2928, + "step": 30265 + }, + { + "epoch": 0.6747145502738868, + "grad_norm": 0.7918848395347595, + "learning_rate": 4.782494699456706e-06, + "loss": 0.2784, + "step": 30270 + }, + { + "epoch": 0.674825999654507, + "grad_norm": 0.638884961605072, + "learning_rate": 4.779508051502492e-06, + "loss": 0.1836, + "step": 30275 + }, + { + "epoch": 0.674937449035127, + "grad_norm": 0.6684740781784058, + "learning_rate": 4.776522043540297e-06, + "loss": 0.3748, + "step": 30280 + }, + { + "epoch": 0.675048898415747, + "grad_norm": 0.5826922655105591, + "learning_rate": 4.773536675936179e-06, + "loss": 0.2958, + "step": 30285 + }, + { + "epoch": 0.6751603477963671, + "grad_norm": 0.4849734902381897, + "learning_rate": 4.770551949056123e-06, + "loss": 0.1645, + "step": 30290 + }, + { + "epoch": 0.6752717971769872, + "grad_norm": 0.6118282079696655, + "learning_rate": 4.767567863266029e-06, + "loss": 0.2709, + "step": 30295 + }, + { + "epoch": 0.6753832465576073, + "grad_norm": 0.5682799816131592, + "learning_rate": 4.764584418931731e-06, + "loss": 0.3137, + "step": 30300 + }, + { + "epoch": 0.6754946959382273, + "grad_norm": 0.7277367115020752, + "learning_rate": 4.761601616418968e-06, + "loss": 0.2859, + "step": 30305 + }, + { + "epoch": 0.6756061453188474, + "grad_norm": 0.7065811157226562, + "learning_rate": 4.758619456093417e-06, + "loss": 0.3519, + "step": 30310 + }, + { + "epoch": 0.6757175946994675, + "grad_norm": 0.39767614006996155, + "learning_rate": 4.755637938320662e-06, + "loss": 0.2527, + "step": 30315 + }, + { + "epoch": 0.6758290440800875, + "grad_norm": 0.6266661286354065, + "learning_rate": 4.752657063466217e-06, + "loss": 0.27, + "step": 30320 + }, + { + "epoch": 0.6759404934607076, + "grad_norm": 0.6225513219833374, + "learning_rate": 4.749676831895512e-06, + "loss": 0.2394, + "step": 30325 + }, + { + "epoch": 0.6760519428413276, + "grad_norm": 0.5919443964958191, + "learning_rate": 4.7466972439738965e-06, + "loss": 0.3048, + "step": 30330 + }, + { + "epoch": 0.6761633922219478, + "grad_norm": 0.9167764186859131, + "learning_rate": 4.7437183000666555e-06, + "loss": 0.2479, + "step": 30335 + }, + { + "epoch": 0.6762748416025678, + "grad_norm": 0.7809853553771973, + "learning_rate": 4.740740000538973e-06, + "loss": 0.2459, + "step": 30340 + }, + { + "epoch": 0.6763862909831878, + "grad_norm": 0.8190988302230835, + "learning_rate": 4.737762345755975e-06, + "loss": 0.309, + "step": 30345 + }, + { + "epoch": 0.6764977403638079, + "grad_norm": 0.9799290895462036, + "learning_rate": 4.734785336082697e-06, + "loss": 0.261, + "step": 30350 + }, + { + "epoch": 0.676609189744428, + "grad_norm": 0.6642264723777771, + "learning_rate": 4.731808971884095e-06, + "loss": 0.2865, + "step": 30355 + }, + { + "epoch": 0.6767206391250481, + "grad_norm": 0.6557360291481018, + "learning_rate": 4.728833253525043e-06, + "loss": 0.3017, + "step": 30360 + }, + { + "epoch": 0.6768320885056681, + "grad_norm": 0.6903407573699951, + "learning_rate": 4.725858181370352e-06, + "loss": 0.2997, + "step": 30365 + }, + { + "epoch": 0.6769435378862882, + "grad_norm": 0.7355604767799377, + "learning_rate": 4.7228837557847385e-06, + "loss": 0.3865, + "step": 30370 + }, + { + "epoch": 0.6770549872669083, + "grad_norm": 0.43351057171821594, + "learning_rate": 4.71990997713284e-06, + "loss": 0.278, + "step": 30375 + }, + { + "epoch": 0.6771664366475283, + "grad_norm": 0.41541969776153564, + "learning_rate": 4.716936845779224e-06, + "loss": 0.3562, + "step": 30380 + }, + { + "epoch": 0.6772778860281484, + "grad_norm": 0.5344095826148987, + "learning_rate": 4.713964362088374e-06, + "loss": 0.1759, + "step": 30385 + }, + { + "epoch": 0.6773893354087684, + "grad_norm": 0.7391599416732788, + "learning_rate": 4.710992526424686e-06, + "loss": 0.2791, + "step": 30390 + }, + { + "epoch": 0.6775007847893886, + "grad_norm": 0.7321291565895081, + "learning_rate": 4.708021339152493e-06, + "loss": 0.3204, + "step": 30395 + }, + { + "epoch": 0.6776122341700086, + "grad_norm": 0.5888240933418274, + "learning_rate": 4.7050508006360365e-06, + "loss": 0.3577, + "step": 30400 + }, + { + "epoch": 0.6777236835506286, + "grad_norm": 0.5763120055198669, + "learning_rate": 4.702080911239482e-06, + "loss": 0.2414, + "step": 30405 + }, + { + "epoch": 0.6778351329312488, + "grad_norm": 0.6024255156517029, + "learning_rate": 4.699111671326909e-06, + "loss": 0.3128, + "step": 30410 + }, + { + "epoch": 0.6779465823118688, + "grad_norm": 0.7076147794723511, + "learning_rate": 4.6961430812623334e-06, + "loss": 0.2945, + "step": 30415 + }, + { + "epoch": 0.6780580316924889, + "grad_norm": 0.6222317218780518, + "learning_rate": 4.693175141409675e-06, + "loss": 0.3651, + "step": 30420 + }, + { + "epoch": 0.6781694810731089, + "grad_norm": 0.5933769941329956, + "learning_rate": 4.69020785213278e-06, + "loss": 0.2369, + "step": 30425 + }, + { + "epoch": 0.6782809304537291, + "grad_norm": 0.669907808303833, + "learning_rate": 4.687241213795419e-06, + "loss": 0.3003, + "step": 30430 + }, + { + "epoch": 0.6783923798343491, + "grad_norm": 0.7214773297309875, + "learning_rate": 4.6842752267612775e-06, + "loss": 0.3474, + "step": 30435 + }, + { + "epoch": 0.6785038292149691, + "grad_norm": 0.6140491962432861, + "learning_rate": 4.6813098913939635e-06, + "loss": 0.313, + "step": 30440 + }, + { + "epoch": 0.6786152785955892, + "grad_norm": 0.40183714032173157, + "learning_rate": 4.678345208056998e-06, + "loss": 0.2156, + "step": 30445 + }, + { + "epoch": 0.6787267279762093, + "grad_norm": 0.765648365020752, + "learning_rate": 4.675381177113837e-06, + "loss": 0.2835, + "step": 30450 + }, + { + "epoch": 0.6788381773568294, + "grad_norm": 0.4940396845340729, + "learning_rate": 4.67241779892784e-06, + "loss": 0.2926, + "step": 30455 + }, + { + "epoch": 0.6789496267374494, + "grad_norm": 0.6889997124671936, + "learning_rate": 4.669455073862302e-06, + "loss": 0.3213, + "step": 30460 + }, + { + "epoch": 0.6790610761180694, + "grad_norm": 0.5608032941818237, + "learning_rate": 4.666493002280426e-06, + "loss": 0.2041, + "step": 30465 + }, + { + "epoch": 0.6791725254986896, + "grad_norm": 0.3555319011211395, + "learning_rate": 4.66353158454534e-06, + "loss": 0.3921, + "step": 30470 + }, + { + "epoch": 0.6792839748793096, + "grad_norm": 0.7849695682525635, + "learning_rate": 4.660570821020091e-06, + "loss": 0.1967, + "step": 30475 + }, + { + "epoch": 0.6793954242599297, + "grad_norm": 0.5523033142089844, + "learning_rate": 4.6576107120676415e-06, + "loss": 0.341, + "step": 30480 + }, + { + "epoch": 0.6795068736405497, + "grad_norm": 0.7489277124404907, + "learning_rate": 4.654651258050881e-06, + "loss": 0.3402, + "step": 30485 + }, + { + "epoch": 0.6796183230211698, + "grad_norm": 0.5791055560112, + "learning_rate": 4.651692459332621e-06, + "loss": 0.3398, + "step": 30490 + }, + { + "epoch": 0.6797297724017899, + "grad_norm": 0.7251898646354675, + "learning_rate": 4.648734316275583e-06, + "loss": 0.2289, + "step": 30495 + }, + { + "epoch": 0.6798412217824099, + "grad_norm": 0.43119266629219055, + "learning_rate": 4.645776829242411e-06, + "loss": 0.323, + "step": 30500 + }, + { + "epoch": 0.67995267116303, + "grad_norm": 0.3917275369167328, + "learning_rate": 4.642819998595672e-06, + "loss": 0.2227, + "step": 30505 + }, + { + "epoch": 0.6800641205436501, + "grad_norm": 0.7229493856430054, + "learning_rate": 4.639863824697848e-06, + "loss": 0.2777, + "step": 30510 + }, + { + "epoch": 0.6801755699242702, + "grad_norm": 1.0337713956832886, + "learning_rate": 4.6369083079113475e-06, + "loss": 0.2911, + "step": 30515 + }, + { + "epoch": 0.6802870193048902, + "grad_norm": 0.5802996754646301, + "learning_rate": 4.633953448598489e-06, + "loss": 0.2701, + "step": 30520 + }, + { + "epoch": 0.6803984686855102, + "grad_norm": 0.43975260853767395, + "learning_rate": 4.63099924712152e-06, + "loss": 0.2523, + "step": 30525 + }, + { + "epoch": 0.6805099180661304, + "grad_norm": 0.5821147561073303, + "learning_rate": 4.628045703842602e-06, + "loss": 0.2978, + "step": 30530 + }, + { + "epoch": 0.6806213674467504, + "grad_norm": 0.9624720215797424, + "learning_rate": 4.625092819123815e-06, + "loss": 0.3141, + "step": 30535 + }, + { + "epoch": 0.6807328168273705, + "grad_norm": 0.5244483351707458, + "learning_rate": 4.622140593327163e-06, + "loss": 0.2627, + "step": 30540 + }, + { + "epoch": 0.6808442662079905, + "grad_norm": 0.5924509763717651, + "learning_rate": 4.619189026814556e-06, + "loss": 0.2249, + "step": 30545 + }, + { + "epoch": 0.6809557155886106, + "grad_norm": 0.5367047786712646, + "learning_rate": 4.616238119947843e-06, + "loss": 0.2293, + "step": 30550 + }, + { + "epoch": 0.6810671649692307, + "grad_norm": 0.5647961497306824, + "learning_rate": 4.613287873088784e-06, + "loss": 0.3609, + "step": 30555 + }, + { + "epoch": 0.6811786143498507, + "grad_norm": 0.7824503183364868, + "learning_rate": 4.610338286599053e-06, + "loss": 0.2663, + "step": 30560 + }, + { + "epoch": 0.6812900637304709, + "grad_norm": 0.9510183930397034, + "learning_rate": 4.607389360840245e-06, + "loss": 0.3119, + "step": 30565 + }, + { + "epoch": 0.6814015131110909, + "grad_norm": 0.4036903381347656, + "learning_rate": 4.604441096173878e-06, + "loss": 0.279, + "step": 30570 + }, + { + "epoch": 0.681512962491711, + "grad_norm": 0.6872173547744751, + "learning_rate": 4.601493492961381e-06, + "loss": 0.3826, + "step": 30575 + }, + { + "epoch": 0.681624411872331, + "grad_norm": 0.5271679162979126, + "learning_rate": 4.5985465515641156e-06, + "loss": 0.2456, + "step": 30580 + }, + { + "epoch": 0.681735861252951, + "grad_norm": 0.7497157454490662, + "learning_rate": 4.595600272343347e-06, + "loss": 0.1737, + "step": 30585 + }, + { + "epoch": 0.6818473106335712, + "grad_norm": 0.23075701296329498, + "learning_rate": 4.592654655660273e-06, + "loss": 0.2424, + "step": 30590 + }, + { + "epoch": 0.6819587600141912, + "grad_norm": 0.6322120428085327, + "learning_rate": 4.589709701875999e-06, + "loss": 0.2902, + "step": 30595 + }, + { + "epoch": 0.6820702093948113, + "grad_norm": 0.7525787353515625, + "learning_rate": 4.586765411351555e-06, + "loss": 0.3234, + "step": 30600 + }, + { + "epoch": 0.6821816587754314, + "grad_norm": 0.7649107575416565, + "learning_rate": 4.583821784447884e-06, + "loss": 0.288, + "step": 30605 + }, + { + "epoch": 0.6822931081560514, + "grad_norm": 0.8376215696334839, + "learning_rate": 4.580878821525859e-06, + "loss": 0.3664, + "step": 30610 + }, + { + "epoch": 0.6824045575366715, + "grad_norm": 0.5996965169906616, + "learning_rate": 4.577936522946261e-06, + "loss": 0.3438, + "step": 30615 + }, + { + "epoch": 0.6825160069172915, + "grad_norm": 0.5214008092880249, + "learning_rate": 4.574994889069791e-06, + "loss": 0.2935, + "step": 30620 + }, + { + "epoch": 0.6826274562979117, + "grad_norm": 0.5297601222991943, + "learning_rate": 4.5720539202570766e-06, + "loss": 0.2882, + "step": 30625 + }, + { + "epoch": 0.6827389056785317, + "grad_norm": 0.6422613263130188, + "learning_rate": 4.569113616868654e-06, + "loss": 0.2625, + "step": 30630 + }, + { + "epoch": 0.6828503550591517, + "grad_norm": 0.4508661925792694, + "learning_rate": 4.566173979264982e-06, + "loss": 0.2579, + "step": 30635 + }, + { + "epoch": 0.6829618044397718, + "grad_norm": 0.3404235541820526, + "learning_rate": 4.563235007806432e-06, + "loss": 0.2441, + "step": 30640 + }, + { + "epoch": 0.6830732538203919, + "grad_norm": 0.8853699564933777, + "learning_rate": 4.560296702853311e-06, + "loss": 0.3594, + "step": 30645 + }, + { + "epoch": 0.683184703201012, + "grad_norm": 1.0352290868759155, + "learning_rate": 4.557359064765825e-06, + "loss": 0.4762, + "step": 30650 + }, + { + "epoch": 0.683296152581632, + "grad_norm": 0.7401602268218994, + "learning_rate": 4.554422093904103e-06, + "loss": 0.3626, + "step": 30655 + }, + { + "epoch": 0.6834076019622521, + "grad_norm": 0.9809353947639465, + "learning_rate": 4.551485790628203e-06, + "loss": 0.264, + "step": 30660 + }, + { + "epoch": 0.6835190513428722, + "grad_norm": 0.6002543568611145, + "learning_rate": 4.548550155298089e-06, + "loss": 0.2303, + "step": 30665 + }, + { + "epoch": 0.6836305007234922, + "grad_norm": 0.6429628729820251, + "learning_rate": 4.545615188273643e-06, + "loss": 0.3119, + "step": 30670 + }, + { + "epoch": 0.6837419501041123, + "grad_norm": 0.7270705699920654, + "learning_rate": 4.542680889914678e-06, + "loss": 0.2012, + "step": 30675 + }, + { + "epoch": 0.6838533994847323, + "grad_norm": 0.4127255976200104, + "learning_rate": 4.539747260580911e-06, + "loss": 0.3944, + "step": 30680 + }, + { + "epoch": 0.6839648488653525, + "grad_norm": 0.5112218260765076, + "learning_rate": 4.536814300631984e-06, + "loss": 0.2155, + "step": 30685 + }, + { + "epoch": 0.6840762982459725, + "grad_norm": 0.620757520198822, + "learning_rate": 4.533882010427451e-06, + "loss": 0.3096, + "step": 30690 + }, + { + "epoch": 0.6841877476265925, + "grad_norm": 0.7408103346824646, + "learning_rate": 4.530950390326794e-06, + "loss": 0.3529, + "step": 30695 + }, + { + "epoch": 0.6842991970072126, + "grad_norm": 0.6948126554489136, + "learning_rate": 4.528019440689401e-06, + "loss": 0.3586, + "step": 30700 + }, + { + "epoch": 0.6844106463878327, + "grad_norm": 0.5368820428848267, + "learning_rate": 4.525089161874592e-06, + "loss": 0.2216, + "step": 30705 + }, + { + "epoch": 0.6845220957684528, + "grad_norm": 0.4925262928009033, + "learning_rate": 4.522159554241591e-06, + "loss": 0.2569, + "step": 30710 + }, + { + "epoch": 0.6846335451490728, + "grad_norm": 0.5296086668968201, + "learning_rate": 4.519230618149547e-06, + "loss": 0.2763, + "step": 30715 + }, + { + "epoch": 0.684744994529693, + "grad_norm": 0.7307214140892029, + "learning_rate": 4.516302353957523e-06, + "loss": 0.3427, + "step": 30720 + }, + { + "epoch": 0.684856443910313, + "grad_norm": 0.7109100222587585, + "learning_rate": 4.513374762024501e-06, + "loss": 0.2628, + "step": 30725 + }, + { + "epoch": 0.684967893290933, + "grad_norm": 0.7910977005958557, + "learning_rate": 4.510447842709386e-06, + "loss": 0.3581, + "step": 30730 + }, + { + "epoch": 0.6850793426715531, + "grad_norm": 0.6768742799758911, + "learning_rate": 4.507521596370987e-06, + "loss": 0.2831, + "step": 30735 + }, + { + "epoch": 0.6851907920521731, + "grad_norm": 0.827703595161438, + "learning_rate": 4.504596023368051e-06, + "loss": 0.337, + "step": 30740 + }, + { + "epoch": 0.6853022414327933, + "grad_norm": 0.6112938523292542, + "learning_rate": 4.501671124059224e-06, + "loss": 0.2836, + "step": 30745 + }, + { + "epoch": 0.6854136908134133, + "grad_norm": 0.43153080344200134, + "learning_rate": 4.498746898803076e-06, + "loss": 0.2882, + "step": 30750 + }, + { + "epoch": 0.6855251401940333, + "grad_norm": 0.6515633463859558, + "learning_rate": 4.4958233479580945e-06, + "loss": 0.2517, + "step": 30755 + }, + { + "epoch": 0.6856365895746535, + "grad_norm": 0.4262503385543823, + "learning_rate": 4.4929004718826815e-06, + "loss": 0.2563, + "step": 30760 + }, + { + "epoch": 0.6857480389552735, + "grad_norm": 0.8875929713249207, + "learning_rate": 4.489978270935164e-06, + "loss": 0.2603, + "step": 30765 + }, + { + "epoch": 0.6858594883358936, + "grad_norm": 0.3773414194583893, + "learning_rate": 4.487056745473781e-06, + "loss": 0.4201, + "step": 30770 + }, + { + "epoch": 0.6859709377165136, + "grad_norm": 0.5433903336524963, + "learning_rate": 4.4841358958566885e-06, + "loss": 0.1744, + "step": 30775 + }, + { + "epoch": 0.6860823870971338, + "grad_norm": 0.6163174510002136, + "learning_rate": 4.481215722441959e-06, + "loss": 0.3012, + "step": 30780 + }, + { + "epoch": 0.6861938364777538, + "grad_norm": 0.5906506776809692, + "learning_rate": 4.4782962255875835e-06, + "loss": 0.3484, + "step": 30785 + }, + { + "epoch": 0.6863052858583738, + "grad_norm": 0.6859926581382751, + "learning_rate": 4.475377405651468e-06, + "loss": 0.242, + "step": 30790 + }, + { + "epoch": 0.6864167352389939, + "grad_norm": 0.49034976959228516, + "learning_rate": 4.472459262991441e-06, + "loss": 0.3221, + "step": 30795 + }, + { + "epoch": 0.686528184619614, + "grad_norm": 0.7555416226387024, + "learning_rate": 4.469541797965238e-06, + "loss": 0.2177, + "step": 30800 + }, + { + "epoch": 0.6866396340002341, + "grad_norm": 0.5270309448242188, + "learning_rate": 4.466625010930526e-06, + "loss": 0.2009, + "step": 30805 + }, + { + "epoch": 0.6867510833808541, + "grad_norm": 0.4625611901283264, + "learning_rate": 4.463708902244878e-06, + "loss": 0.3301, + "step": 30810 + }, + { + "epoch": 0.6868625327614741, + "grad_norm": 0.6135231256484985, + "learning_rate": 4.4607934722657834e-06, + "loss": 0.308, + "step": 30815 + }, + { + "epoch": 0.6869739821420943, + "grad_norm": 0.5562835335731506, + "learning_rate": 4.457878721350653e-06, + "loss": 0.381, + "step": 30820 + }, + { + "epoch": 0.6870854315227143, + "grad_norm": 0.6260277628898621, + "learning_rate": 4.45496464985681e-06, + "loss": 0.2883, + "step": 30825 + }, + { + "epoch": 0.6871968809033344, + "grad_norm": 0.4729725420475006, + "learning_rate": 4.452051258141503e-06, + "loss": 0.3035, + "step": 30830 + }, + { + "epoch": 0.6873083302839544, + "grad_norm": 0.45729824900627136, + "learning_rate": 4.4491385465618846e-06, + "loss": 0.1677, + "step": 30835 + }, + { + "epoch": 0.6874197796645745, + "grad_norm": 0.5871806144714355, + "learning_rate": 4.4462265154750386e-06, + "loss": 0.2998, + "step": 30840 + }, + { + "epoch": 0.6875312290451946, + "grad_norm": 0.4161869287490845, + "learning_rate": 4.443315165237951e-06, + "loss": 0.1802, + "step": 30845 + }, + { + "epoch": 0.6876426784258146, + "grad_norm": 0.6632789373397827, + "learning_rate": 4.440404496207536e-06, + "loss": 0.2441, + "step": 30850 + }, + { + "epoch": 0.6877541278064347, + "grad_norm": 0.424626886844635, + "learning_rate": 4.43749450874061e-06, + "loss": 0.2589, + "step": 30855 + }, + { + "epoch": 0.6878655771870548, + "grad_norm": 0.8885061740875244, + "learning_rate": 4.434585203193927e-06, + "loss": 0.3274, + "step": 30860 + }, + { + "epoch": 0.6879770265676749, + "grad_norm": 0.6270710229873657, + "learning_rate": 4.431676579924139e-06, + "loss": 0.2532, + "step": 30865 + }, + { + "epoch": 0.6880884759482949, + "grad_norm": 0.5457544326782227, + "learning_rate": 4.4287686392878185e-06, + "loss": 0.3691, + "step": 30870 + }, + { + "epoch": 0.6881999253289149, + "grad_norm": 0.5432224273681641, + "learning_rate": 4.425861381641462e-06, + "loss": 0.1776, + "step": 30875 + }, + { + "epoch": 0.6883113747095351, + "grad_norm": 0.5192307233810425, + "learning_rate": 4.4229548073414745e-06, + "loss": 0.3711, + "step": 30880 + }, + { + "epoch": 0.6884228240901551, + "grad_norm": 0.6740368008613586, + "learning_rate": 4.420048916744176e-06, + "loss": 0.4051, + "step": 30885 + }, + { + "epoch": 0.6885342734707752, + "grad_norm": 0.7884381413459778, + "learning_rate": 4.417143710205814e-06, + "loss": 0.2601, + "step": 30890 + }, + { + "epoch": 0.6886457228513952, + "grad_norm": 0.5081226229667664, + "learning_rate": 4.4142391880825386e-06, + "loss": 0.2706, + "step": 30895 + }, + { + "epoch": 0.6887571722320153, + "grad_norm": 0.5837457776069641, + "learning_rate": 4.411335350730425e-06, + "loss": 0.3546, + "step": 30900 + }, + { + "epoch": 0.6888686216126354, + "grad_norm": 0.828184962272644, + "learning_rate": 4.408432198505454e-06, + "loss": 0.1952, + "step": 30905 + }, + { + "epoch": 0.6889800709932554, + "grad_norm": 0.5400125980377197, + "learning_rate": 4.40552973176354e-06, + "loss": 0.2742, + "step": 30910 + }, + { + "epoch": 0.6890915203738756, + "grad_norm": 0.22060178220272064, + "learning_rate": 4.402627950860494e-06, + "loss": 0.2226, + "step": 30915 + }, + { + "epoch": 0.6892029697544956, + "grad_norm": 0.5295904278755188, + "learning_rate": 4.3997268561520615e-06, + "loss": 0.2693, + "step": 30920 + }, + { + "epoch": 0.6893144191351157, + "grad_norm": 0.8292050957679749, + "learning_rate": 4.396826447993887e-06, + "loss": 0.3038, + "step": 30925 + }, + { + "epoch": 0.6894258685157357, + "grad_norm": 0.5134817361831665, + "learning_rate": 4.393926726741541e-06, + "loss": 0.2658, + "step": 30930 + }, + { + "epoch": 0.6895373178963558, + "grad_norm": 0.7239783406257629, + "learning_rate": 4.391027692750506e-06, + "loss": 0.3571, + "step": 30935 + }, + { + "epoch": 0.6896487672769759, + "grad_norm": 0.4626108705997467, + "learning_rate": 4.388129346376177e-06, + "loss": 0.3237, + "step": 30940 + }, + { + "epoch": 0.6897602166575959, + "grad_norm": 0.6538516283035278, + "learning_rate": 4.385231687973878e-06, + "loss": 0.1788, + "step": 30945 + }, + { + "epoch": 0.689871666038216, + "grad_norm": 0.7186077833175659, + "learning_rate": 4.38233471789883e-06, + "loss": 0.2678, + "step": 30950 + }, + { + "epoch": 0.6899831154188361, + "grad_norm": 0.5353816151618958, + "learning_rate": 4.379438436506187e-06, + "loss": 0.2909, + "step": 30955 + }, + { + "epoch": 0.6900945647994561, + "grad_norm": 0.8315975666046143, + "learning_rate": 4.376542844151009e-06, + "loss": 0.2689, + "step": 30960 + }, + { + "epoch": 0.6902060141800762, + "grad_norm": 0.7231079339981079, + "learning_rate": 4.373647941188272e-06, + "loss": 0.2647, + "step": 30965 + }, + { + "epoch": 0.6903174635606962, + "grad_norm": 0.6835340261459351, + "learning_rate": 4.3707537279728674e-06, + "loss": 0.2836, + "step": 30970 + }, + { + "epoch": 0.6904289129413164, + "grad_norm": 0.7447507381439209, + "learning_rate": 4.367860204859601e-06, + "loss": 0.2892, + "step": 30975 + }, + { + "epoch": 0.6905403623219364, + "grad_norm": 0.6612392067909241, + "learning_rate": 4.3649673722032e-06, + "loss": 0.341, + "step": 30980 + }, + { + "epoch": 0.6906518117025565, + "grad_norm": 0.5242640972137451, + "learning_rate": 4.362075230358308e-06, + "loss": 0.3152, + "step": 30985 + }, + { + "epoch": 0.6907632610831765, + "grad_norm": 0.8434734344482422, + "learning_rate": 4.359183779679475e-06, + "loss": 0.4032, + "step": 30990 + }, + { + "epoch": 0.6908747104637966, + "grad_norm": 0.6736511588096619, + "learning_rate": 4.35629302052117e-06, + "loss": 0.204, + "step": 30995 + }, + { + "epoch": 0.6909861598444167, + "grad_norm": 0.5592665672302246, + "learning_rate": 4.353402953237776e-06, + "loss": 0.3241, + "step": 31000 + }, + { + "epoch": 0.6910976092250367, + "grad_norm": 0.49798890948295593, + "learning_rate": 4.350513578183593e-06, + "loss": 0.2848, + "step": 31005 + }, + { + "epoch": 0.6912090586056568, + "grad_norm": 0.5979819297790527, + "learning_rate": 4.347624895712837e-06, + "loss": 0.3554, + "step": 31010 + }, + { + "epoch": 0.6913205079862769, + "grad_norm": 0.6463135480880737, + "learning_rate": 4.344736906179644e-06, + "loss": 0.2998, + "step": 31015 + }, + { + "epoch": 0.6914319573668969, + "grad_norm": 0.8632110953330994, + "learning_rate": 4.341849609938054e-06, + "loss": 0.2684, + "step": 31020 + }, + { + "epoch": 0.691543406747517, + "grad_norm": 0.6110043525695801, + "learning_rate": 4.338963007342027e-06, + "loss": 0.2602, + "step": 31025 + }, + { + "epoch": 0.691654856128137, + "grad_norm": 0.9202972054481506, + "learning_rate": 4.336077098745439e-06, + "loss": 0.2615, + "step": 31030 + }, + { + "epoch": 0.6917663055087572, + "grad_norm": 0.619824230670929, + "learning_rate": 4.3331918845020805e-06, + "loss": 0.2498, + "step": 31035 + }, + { + "epoch": 0.6918777548893772, + "grad_norm": 0.5901528596878052, + "learning_rate": 4.330307364965652e-06, + "loss": 0.2758, + "step": 31040 + }, + { + "epoch": 0.6919892042699972, + "grad_norm": 0.5944219827651978, + "learning_rate": 4.327423540489777e-06, + "loss": 0.4121, + "step": 31045 + }, + { + "epoch": 0.6921006536506173, + "grad_norm": 0.49613940715789795, + "learning_rate": 4.324540411427994e-06, + "loss": 0.3705, + "step": 31050 + }, + { + "epoch": 0.6922121030312374, + "grad_norm": 0.6901610493659973, + "learning_rate": 4.3216579781337485e-06, + "loss": 0.2918, + "step": 31055 + }, + { + "epoch": 0.6923235524118575, + "grad_norm": 0.7814179062843323, + "learning_rate": 4.318776240960406e-06, + "loss": 0.2877, + "step": 31060 + }, + { + "epoch": 0.6924350017924775, + "grad_norm": 0.6709560751914978, + "learning_rate": 4.315895200261243e-06, + "loss": 0.3923, + "step": 31065 + }, + { + "epoch": 0.6925464511730977, + "grad_norm": 0.86536705493927, + "learning_rate": 4.31301485638945e-06, + "loss": 0.3328, + "step": 31070 + }, + { + "epoch": 0.6926579005537177, + "grad_norm": 0.9728611707687378, + "learning_rate": 4.310135209698143e-06, + "loss": 0.3971, + "step": 31075 + }, + { + "epoch": 0.6927693499343377, + "grad_norm": 0.39398086071014404, + "learning_rate": 4.307256260540337e-06, + "loss": 0.279, + "step": 31080 + }, + { + "epoch": 0.6928807993149578, + "grad_norm": 0.4623780846595764, + "learning_rate": 4.304378009268976e-06, + "loss": 0.3175, + "step": 31085 + }, + { + "epoch": 0.6929922486955779, + "grad_norm": 0.4939644932746887, + "learning_rate": 4.301500456236907e-06, + "loss": 0.2116, + "step": 31090 + }, + { + "epoch": 0.693103698076198, + "grad_norm": 0.5080740451812744, + "learning_rate": 4.2986236017968956e-06, + "loss": 0.3565, + "step": 31095 + }, + { + "epoch": 0.693215147456818, + "grad_norm": 0.7656640410423279, + "learning_rate": 4.2957474463016206e-06, + "loss": 0.2778, + "step": 31100 + }, + { + "epoch": 0.693326596837438, + "grad_norm": 0.7635650038719177, + "learning_rate": 4.2928719901036805e-06, + "loss": 0.3594, + "step": 31105 + }, + { + "epoch": 0.6934380462180582, + "grad_norm": 0.6077600717544556, + "learning_rate": 4.289997233555584e-06, + "loss": 0.3699, + "step": 31110 + }, + { + "epoch": 0.6935494955986782, + "grad_norm": 0.7991203665733337, + "learning_rate": 4.287123177009747e-06, + "loss": 0.2534, + "step": 31115 + }, + { + "epoch": 0.6936609449792983, + "grad_norm": 0.9318343997001648, + "learning_rate": 4.284249820818517e-06, + "loss": 0.4215, + "step": 31120 + }, + { + "epoch": 0.6937723943599183, + "grad_norm": 0.4393812119960785, + "learning_rate": 4.2813771653341395e-06, + "loss": 0.2498, + "step": 31125 + }, + { + "epoch": 0.6938838437405385, + "grad_norm": 0.6677595973014832, + "learning_rate": 4.27850521090878e-06, + "loss": 0.3103, + "step": 31130 + }, + { + "epoch": 0.6939952931211585, + "grad_norm": 1.0747525691986084, + "learning_rate": 4.275633957894516e-06, + "loss": 0.4051, + "step": 31135 + }, + { + "epoch": 0.6941067425017785, + "grad_norm": 0.584563672542572, + "learning_rate": 4.2727634066433465e-06, + "loss": 0.2511, + "step": 31140 + }, + { + "epoch": 0.6942181918823986, + "grad_norm": 0.516092836856842, + "learning_rate": 4.269893557507175e-06, + "loss": 0.3257, + "step": 31145 + }, + { + "epoch": 0.6943296412630187, + "grad_norm": 0.6570064425468445, + "learning_rate": 4.267024410837821e-06, + "loss": 0.344, + "step": 31150 + }, + { + "epoch": 0.6944410906436388, + "grad_norm": 0.5576528906822205, + "learning_rate": 4.264155966987026e-06, + "loss": 0.3629, + "step": 31155 + }, + { + "epoch": 0.6945525400242588, + "grad_norm": 0.4276863932609558, + "learning_rate": 4.261288226306436e-06, + "loss": 0.143, + "step": 31160 + }, + { + "epoch": 0.6946639894048788, + "grad_norm": 0.6324317455291748, + "learning_rate": 4.258421189147609e-06, + "loss": 0.2573, + "step": 31165 + }, + { + "epoch": 0.694775438785499, + "grad_norm": 0.4620887041091919, + "learning_rate": 4.2555548558620294e-06, + "loss": 0.1637, + "step": 31170 + }, + { + "epoch": 0.694886888166119, + "grad_norm": 0.39362654089927673, + "learning_rate": 4.2526892268010844e-06, + "loss": 0.1964, + "step": 31175 + }, + { + "epoch": 0.6949983375467391, + "grad_norm": 0.6800695657730103, + "learning_rate": 4.249824302316079e-06, + "loss": 0.3853, + "step": 31180 + }, + { + "epoch": 0.6951097869273591, + "grad_norm": 0.598951518535614, + "learning_rate": 4.246960082758225e-06, + "loss": 0.2922, + "step": 31185 + }, + { + "epoch": 0.6952212363079793, + "grad_norm": 0.5495995879173279, + "learning_rate": 4.244096568478662e-06, + "loss": 0.302, + "step": 31190 + }, + { + "epoch": 0.6953326856885993, + "grad_norm": 0.36194702982902527, + "learning_rate": 4.241233759828426e-06, + "loss": 0.2135, + "step": 31195 + }, + { + "epoch": 0.6954441350692193, + "grad_norm": 0.8240219354629517, + "learning_rate": 4.238371657158486e-06, + "loss": 0.3262, + "step": 31200 + }, + { + "epoch": 0.6955555844498394, + "grad_norm": 0.6485922336578369, + "learning_rate": 4.235510260819707e-06, + "loss": 0.2685, + "step": 31205 + }, + { + "epoch": 0.6956670338304595, + "grad_norm": 0.4704461097717285, + "learning_rate": 4.232649571162874e-06, + "loss": 0.2412, + "step": 31210 + }, + { + "epoch": 0.6957784832110796, + "grad_norm": 0.627560555934906, + "learning_rate": 4.229789588538687e-06, + "loss": 0.2287, + "step": 31215 + }, + { + "epoch": 0.6958899325916996, + "grad_norm": 0.5985351800918579, + "learning_rate": 4.226930313297754e-06, + "loss": 0.2742, + "step": 31220 + }, + { + "epoch": 0.6960013819723196, + "grad_norm": 0.5910149812698364, + "learning_rate": 4.224071745790603e-06, + "loss": 0.3414, + "step": 31225 + }, + { + "epoch": 0.6961128313529398, + "grad_norm": 0.9427435398101807, + "learning_rate": 4.221213886367677e-06, + "loss": 0.3136, + "step": 31230 + }, + { + "epoch": 0.6962242807335598, + "grad_norm": 0.6015107035636902, + "learning_rate": 4.218356735379322e-06, + "loss": 0.2943, + "step": 31235 + }, + { + "epoch": 0.6963357301141799, + "grad_norm": 0.8204703330993652, + "learning_rate": 4.215500293175805e-06, + "loss": 0.2389, + "step": 31240 + }, + { + "epoch": 0.6964471794948, + "grad_norm": 0.7436009645462036, + "learning_rate": 4.212644560107302e-06, + "loss": 0.3549, + "step": 31245 + }, + { + "epoch": 0.69655862887542, + "grad_norm": 0.6757791638374329, + "learning_rate": 4.209789536523905e-06, + "loss": 0.3306, + "step": 31250 + }, + { + "epoch": 0.6966700782560401, + "grad_norm": 0.3792746365070343, + "learning_rate": 4.206935222775612e-06, + "loss": 0.1995, + "step": 31255 + }, + { + "epoch": 0.6967815276366601, + "grad_norm": 0.5856655836105347, + "learning_rate": 4.2040816192123465e-06, + "loss": 0.2582, + "step": 31260 + }, + { + "epoch": 0.6968929770172803, + "grad_norm": 0.4346431791782379, + "learning_rate": 4.20122872618394e-06, + "loss": 0.2341, + "step": 31265 + }, + { + "epoch": 0.6970044263979003, + "grad_norm": 0.6606493592262268, + "learning_rate": 4.198376544040132e-06, + "loss": 0.3211, + "step": 31270 + }, + { + "epoch": 0.6971158757785204, + "grad_norm": 0.6540193557739258, + "learning_rate": 4.195525073130578e-06, + "loss": 0.2868, + "step": 31275 + }, + { + "epoch": 0.6972273251591404, + "grad_norm": 0.686784565448761, + "learning_rate": 4.192674313804847e-06, + "loss": 0.3434, + "step": 31280 + }, + { + "epoch": 0.6973387745397605, + "grad_norm": 0.7588372230529785, + "learning_rate": 4.189824266412416e-06, + "loss": 0.3459, + "step": 31285 + }, + { + "epoch": 0.6974502239203806, + "grad_norm": 0.7323045134544373, + "learning_rate": 4.186974931302685e-06, + "loss": 0.4094, + "step": 31290 + }, + { + "epoch": 0.6975616733010006, + "grad_norm": 0.5145244598388672, + "learning_rate": 4.184126308824954e-06, + "loss": 0.3327, + "step": 31295 + }, + { + "epoch": 0.6976731226816207, + "grad_norm": 0.4681999385356903, + "learning_rate": 4.18127839932845e-06, + "loss": 0.2122, + "step": 31300 + }, + { + "epoch": 0.6977845720622408, + "grad_norm": 0.27826374769210815, + "learning_rate": 4.178431203162301e-06, + "loss": 0.2309, + "step": 31305 + }, + { + "epoch": 0.6978960214428608, + "grad_norm": 0.5219650864601135, + "learning_rate": 4.175584720675551e-06, + "loss": 0.3067, + "step": 31310 + }, + { + "epoch": 0.6980074708234809, + "grad_norm": 0.7450000047683716, + "learning_rate": 4.172738952217151e-06, + "loss": 0.29, + "step": 31315 + }, + { + "epoch": 0.6981189202041009, + "grad_norm": 0.7271034121513367, + "learning_rate": 4.169893898135981e-06, + "loss": 0.3266, + "step": 31320 + }, + { + "epoch": 0.6982303695847211, + "grad_norm": 0.583061158657074, + "learning_rate": 4.167049558780818e-06, + "loss": 0.274, + "step": 31325 + }, + { + "epoch": 0.6983418189653411, + "grad_norm": 0.4795846939086914, + "learning_rate": 4.164205934500351e-06, + "loss": 0.1614, + "step": 31330 + }, + { + "epoch": 0.6984532683459612, + "grad_norm": 0.6440160274505615, + "learning_rate": 4.161363025643196e-06, + "loss": 0.2446, + "step": 31335 + }, + { + "epoch": 0.6985647177265812, + "grad_norm": 0.5630683302879333, + "learning_rate": 4.158520832557866e-06, + "loss": 0.2658, + "step": 31340 + }, + { + "epoch": 0.6986761671072013, + "grad_norm": 0.5534509420394897, + "learning_rate": 4.155679355592792e-06, + "loss": 0.2804, + "step": 31345 + }, + { + "epoch": 0.6987876164878214, + "grad_norm": 0.6446009874343872, + "learning_rate": 4.152838595096316e-06, + "loss": 0.2733, + "step": 31350 + }, + { + "epoch": 0.6988990658684414, + "grad_norm": 0.6161692142486572, + "learning_rate": 4.149998551416697e-06, + "loss": 0.2515, + "step": 31355 + }, + { + "epoch": 0.6990105152490615, + "grad_norm": 0.6872076988220215, + "learning_rate": 4.147159224902101e-06, + "loss": 0.3784, + "step": 31360 + }, + { + "epoch": 0.6991219646296816, + "grad_norm": 0.9411196112632751, + "learning_rate": 4.144320615900603e-06, + "loss": 0.2868, + "step": 31365 + }, + { + "epoch": 0.6992334140103016, + "grad_norm": 0.514918327331543, + "learning_rate": 4.1414827247602016e-06, + "loss": 0.3274, + "step": 31370 + }, + { + "epoch": 0.6993448633909217, + "grad_norm": 0.576237678527832, + "learning_rate": 4.138645551828799e-06, + "loss": 0.2588, + "step": 31375 + }, + { + "epoch": 0.6994563127715417, + "grad_norm": 0.840854287147522, + "learning_rate": 4.135809097454204e-06, + "loss": 0.3152, + "step": 31380 + }, + { + "epoch": 0.6995677621521619, + "grad_norm": 0.8221232891082764, + "learning_rate": 4.1329733619841535e-06, + "loss": 0.3219, + "step": 31385 + }, + { + "epoch": 0.6996792115327819, + "grad_norm": 0.6220507621765137, + "learning_rate": 4.130138345766283e-06, + "loss": 0.3625, + "step": 31390 + }, + { + "epoch": 0.6997906609134019, + "grad_norm": 0.6762942671775818, + "learning_rate": 4.127304049148142e-06, + "loss": 0.2756, + "step": 31395 + }, + { + "epoch": 0.699902110294022, + "grad_norm": 0.5760985016822815, + "learning_rate": 4.12447047247719e-06, + "loss": 0.2999, + "step": 31400 + }, + { + "epoch": 0.7000135596746421, + "grad_norm": 0.927649199962616, + "learning_rate": 4.121637616100811e-06, + "loss": 0.3766, + "step": 31405 + }, + { + "epoch": 0.7001250090552622, + "grad_norm": 1.2276265621185303, + "learning_rate": 4.1188054803662814e-06, + "loss": 0.3655, + "step": 31410 + }, + { + "epoch": 0.7002364584358822, + "grad_norm": 0.5870280265808105, + "learning_rate": 4.115974065620809e-06, + "loss": 0.2976, + "step": 31415 + }, + { + "epoch": 0.7003479078165024, + "grad_norm": 0.5980759263038635, + "learning_rate": 4.113143372211498e-06, + "loss": 0.2414, + "step": 31420 + }, + { + "epoch": 0.7004593571971224, + "grad_norm": 0.766173779964447, + "learning_rate": 4.110313400485369e-06, + "loss": 0.2918, + "step": 31425 + }, + { + "epoch": 0.7005708065777424, + "grad_norm": 0.5566070079803467, + "learning_rate": 4.107484150789356e-06, + "loss": 0.318, + "step": 31430 + }, + { + "epoch": 0.7006822559583625, + "grad_norm": 0.6765812039375305, + "learning_rate": 4.1046556234703e-06, + "loss": 0.2702, + "step": 31435 + }, + { + "epoch": 0.7007937053389826, + "grad_norm": 0.8657772541046143, + "learning_rate": 4.101827818874962e-06, + "loss": 0.3333, + "step": 31440 + }, + { + "epoch": 0.7009051547196027, + "grad_norm": 0.6400044560432434, + "learning_rate": 4.099000737350004e-06, + "loss": 0.3406, + "step": 31445 + }, + { + "epoch": 0.7010166041002227, + "grad_norm": 0.5417999029159546, + "learning_rate": 4.09617437924201e-06, + "loss": 0.2273, + "step": 31450 + }, + { + "epoch": 0.7011280534808427, + "grad_norm": 0.7063996195793152, + "learning_rate": 4.093348744897467e-06, + "loss": 0.2746, + "step": 31455 + }, + { + "epoch": 0.7012395028614629, + "grad_norm": 0.5358220934867859, + "learning_rate": 4.090523834662775e-06, + "loss": 0.2756, + "step": 31460 + }, + { + "epoch": 0.7013509522420829, + "grad_norm": 0.37458446621894836, + "learning_rate": 4.087699648884248e-06, + "loss": 0.2767, + "step": 31465 + }, + { + "epoch": 0.701462401622703, + "grad_norm": 0.7166943550109863, + "learning_rate": 4.084876187908104e-06, + "loss": 0.3122, + "step": 31470 + }, + { + "epoch": 0.701573851003323, + "grad_norm": 0.6304473280906677, + "learning_rate": 4.08205345208048e-06, + "loss": 0.3681, + "step": 31475 + }, + { + "epoch": 0.7016853003839432, + "grad_norm": 0.5616061091423035, + "learning_rate": 4.079231441747428e-06, + "loss": 0.1783, + "step": 31480 + }, + { + "epoch": 0.7017967497645632, + "grad_norm": 0.7790914177894592, + "learning_rate": 4.0764101572549e-06, + "loss": 0.2155, + "step": 31485 + }, + { + "epoch": 0.7019081991451832, + "grad_norm": 0.6581530570983887, + "learning_rate": 4.0735895989487625e-06, + "loss": 0.3486, + "step": 31490 + }, + { + "epoch": 0.7020196485258033, + "grad_norm": 0.7142459750175476, + "learning_rate": 4.070769767174797e-06, + "loss": 0.2639, + "step": 31495 + }, + { + "epoch": 0.7021310979064234, + "grad_norm": 0.6214120388031006, + "learning_rate": 4.067950662278687e-06, + "loss": 0.3789, + "step": 31500 + }, + { + "epoch": 0.7022425472870435, + "grad_norm": 0.4268057942390442, + "learning_rate": 4.065132284606038e-06, + "loss": 0.3095, + "step": 31505 + }, + { + "epoch": 0.7023539966676635, + "grad_norm": 0.6592011451721191, + "learning_rate": 4.062314634502364e-06, + "loss": 0.2152, + "step": 31510 + }, + { + "epoch": 0.7024654460482835, + "grad_norm": 0.4133501648902893, + "learning_rate": 4.059497712313083e-06, + "loss": 0.3253, + "step": 31515 + }, + { + "epoch": 0.7025768954289037, + "grad_norm": 0.9275414943695068, + "learning_rate": 4.0566815183835295e-06, + "loss": 0.2372, + "step": 31520 + }, + { + "epoch": 0.7026883448095237, + "grad_norm": 0.6263979077339172, + "learning_rate": 4.0538660530589466e-06, + "loss": 0.318, + "step": 31525 + }, + { + "epoch": 0.7027997941901438, + "grad_norm": 0.6248124241828918, + "learning_rate": 4.051051316684486e-06, + "loss": 0.2939, + "step": 31530 + }, + { + "epoch": 0.7029112435707638, + "grad_norm": 0.3651900291442871, + "learning_rate": 4.048237309605216e-06, + "loss": 0.2063, + "step": 31535 + }, + { + "epoch": 0.703022692951384, + "grad_norm": 0.5976786017417908, + "learning_rate": 4.04542403216611e-06, + "loss": 0.326, + "step": 31540 + }, + { + "epoch": 0.703134142332004, + "grad_norm": 0.7850168347358704, + "learning_rate": 4.042611484712058e-06, + "loss": 0.4271, + "step": 31545 + }, + { + "epoch": 0.703245591712624, + "grad_norm": 0.9984448552131653, + "learning_rate": 4.039799667587855e-06, + "loss": 0.386, + "step": 31550 + }, + { + "epoch": 0.7033570410932442, + "grad_norm": 0.5956363677978516, + "learning_rate": 4.036988581138206e-06, + "loss": 0.2853, + "step": 31555 + }, + { + "epoch": 0.7034684904738642, + "grad_norm": 0.6415675282478333, + "learning_rate": 4.03417822570773e-06, + "loss": 0.2615, + "step": 31560 + }, + { + "epoch": 0.7035799398544843, + "grad_norm": 0.6364458799362183, + "learning_rate": 4.031368601640951e-06, + "loss": 0.2817, + "step": 31565 + }, + { + "epoch": 0.7036913892351043, + "grad_norm": 0.7403188943862915, + "learning_rate": 4.028559709282314e-06, + "loss": 0.217, + "step": 31570 + }, + { + "epoch": 0.7038028386157243, + "grad_norm": 0.6373766660690308, + "learning_rate": 4.02575154897616e-06, + "loss": 0.1846, + "step": 31575 + }, + { + "epoch": 0.7039142879963445, + "grad_norm": 0.781013548374176, + "learning_rate": 4.022944121066757e-06, + "loss": 0.2312, + "step": 31580 + }, + { + "epoch": 0.7040257373769645, + "grad_norm": 0.48915186524391174, + "learning_rate": 4.020137425898267e-06, + "loss": 0.3111, + "step": 31585 + }, + { + "epoch": 0.7041371867575846, + "grad_norm": 0.7576313614845276, + "learning_rate": 4.017331463814772e-06, + "loss": 0.3434, + "step": 31590 + }, + { + "epoch": 0.7042486361382047, + "grad_norm": 0.4817739725112915, + "learning_rate": 4.014526235160258e-06, + "loss": 0.3492, + "step": 31595 + }, + { + "epoch": 0.7043600855188247, + "grad_norm": 0.6662478446960449, + "learning_rate": 4.011721740278629e-06, + "loss": 0.4293, + "step": 31600 + }, + { + "epoch": 0.7044715348994448, + "grad_norm": 0.576723575592041, + "learning_rate": 4.008917979513692e-06, + "loss": 0.242, + "step": 31605 + }, + { + "epoch": 0.7045829842800648, + "grad_norm": 0.6015423536300659, + "learning_rate": 4.006114953209165e-06, + "loss": 0.2857, + "step": 31610 + }, + { + "epoch": 0.704694433660685, + "grad_norm": 0.6793724298477173, + "learning_rate": 4.0033126617086815e-06, + "loss": 0.2909, + "step": 31615 + }, + { + "epoch": 0.704805883041305, + "grad_norm": 0.5891973376274109, + "learning_rate": 4.0005111053557776e-06, + "loss": 0.2533, + "step": 31620 + }, + { + "epoch": 0.7049173324219251, + "grad_norm": 0.594723105430603, + "learning_rate": 3.997710284493901e-06, + "loss": 0.4283, + "step": 31625 + }, + { + "epoch": 0.7050287818025451, + "grad_norm": 0.5770767331123352, + "learning_rate": 3.994910199466415e-06, + "loss": 0.2623, + "step": 31630 + }, + { + "epoch": 0.7051402311831652, + "grad_norm": 0.5853124260902405, + "learning_rate": 3.992110850616587e-06, + "loss": 0.1828, + "step": 31635 + }, + { + "epoch": 0.7052516805637853, + "grad_norm": 0.3737061023712158, + "learning_rate": 3.989312238287596e-06, + "loss": 0.3689, + "step": 31640 + }, + { + "epoch": 0.7053631299444053, + "grad_norm": 0.5344130992889404, + "learning_rate": 3.986514362822524e-06, + "loss": 0.2212, + "step": 31645 + }, + { + "epoch": 0.7054745793250254, + "grad_norm": 0.581981897354126, + "learning_rate": 3.983717224564378e-06, + "loss": 0.1676, + "step": 31650 + }, + { + "epoch": 0.7055860287056455, + "grad_norm": 0.7280535697937012, + "learning_rate": 3.9809208238560624e-06, + "loss": 0.295, + "step": 31655 + }, + { + "epoch": 0.7056974780862655, + "grad_norm": 0.7952364087104797, + "learning_rate": 3.97812516104039e-06, + "loss": 0.3424, + "step": 31660 + }, + { + "epoch": 0.7058089274668856, + "grad_norm": 0.36092883348464966, + "learning_rate": 3.9753302364600955e-06, + "loss": 0.3084, + "step": 31665 + }, + { + "epoch": 0.7059203768475056, + "grad_norm": 0.5385135412216187, + "learning_rate": 3.972536050457809e-06, + "loss": 0.3352, + "step": 31670 + }, + { + "epoch": 0.7060318262281258, + "grad_norm": 0.503911554813385, + "learning_rate": 3.969742603376079e-06, + "loss": 0.1922, + "step": 31675 + }, + { + "epoch": 0.7061432756087458, + "grad_norm": 0.43139663338661194, + "learning_rate": 3.966949895557355e-06, + "loss": 0.241, + "step": 31680 + }, + { + "epoch": 0.7062547249893659, + "grad_norm": 0.7122260928153992, + "learning_rate": 3.96415792734401e-06, + "loss": 0.318, + "step": 31685 + }, + { + "epoch": 0.7063661743699859, + "grad_norm": 0.7296402454376221, + "learning_rate": 3.961366699078309e-06, + "loss": 0.2224, + "step": 31690 + }, + { + "epoch": 0.706477623750606, + "grad_norm": 0.5677131414413452, + "learning_rate": 3.958576211102445e-06, + "loss": 0.3132, + "step": 31695 + }, + { + "epoch": 0.7065890731312261, + "grad_norm": 0.4357840120792389, + "learning_rate": 3.955786463758503e-06, + "loss": 0.2895, + "step": 31700 + }, + { + "epoch": 0.7067005225118461, + "grad_norm": 0.660984992980957, + "learning_rate": 3.952997457388488e-06, + "loss": 0.2977, + "step": 31705 + }, + { + "epoch": 0.7068119718924663, + "grad_norm": 0.46307283639907837, + "learning_rate": 3.950209192334308e-06, + "loss": 0.2648, + "step": 31710 + }, + { + "epoch": 0.7069234212730863, + "grad_norm": 0.6149296760559082, + "learning_rate": 3.94742166893778e-06, + "loss": 0.3036, + "step": 31715 + }, + { + "epoch": 0.7070348706537063, + "grad_norm": 0.4931444227695465, + "learning_rate": 3.944634887540637e-06, + "loss": 0.2578, + "step": 31720 + }, + { + "epoch": 0.7071463200343264, + "grad_norm": 0.7630481719970703, + "learning_rate": 3.941848848484521e-06, + "loss": 0.2542, + "step": 31725 + }, + { + "epoch": 0.7072577694149464, + "grad_norm": 0.6333499550819397, + "learning_rate": 3.939063552110973e-06, + "loss": 0.281, + "step": 31730 + }, + { + "epoch": 0.7073692187955666, + "grad_norm": 0.6649298071861267, + "learning_rate": 3.9362789987614514e-06, + "loss": 0.3235, + "step": 31735 + }, + { + "epoch": 0.7074806681761866, + "grad_norm": 0.5478550791740417, + "learning_rate": 3.933495188777318e-06, + "loss": 0.243, + "step": 31740 + }, + { + "epoch": 0.7075921175568067, + "grad_norm": 0.7835191488265991, + "learning_rate": 3.930712122499847e-06, + "loss": 0.3897, + "step": 31745 + }, + { + "epoch": 0.7077035669374268, + "grad_norm": 0.6654976606369019, + "learning_rate": 3.9279298002702245e-06, + "loss": 0.2984, + "step": 31750 + }, + { + "epoch": 0.7078150163180468, + "grad_norm": 0.8832136988639832, + "learning_rate": 3.925148222429536e-06, + "loss": 0.4235, + "step": 31755 + }, + { + "epoch": 0.7079264656986669, + "grad_norm": 0.5009804964065552, + "learning_rate": 3.922367389318788e-06, + "loss": 0.2761, + "step": 31760 + }, + { + "epoch": 0.7080379150792869, + "grad_norm": 0.456746906042099, + "learning_rate": 3.919587301278886e-06, + "loss": 0.3327, + "step": 31765 + }, + { + "epoch": 0.7081493644599071, + "grad_norm": 0.2070566713809967, + "learning_rate": 3.916807958650647e-06, + "loss": 0.2253, + "step": 31770 + }, + { + "epoch": 0.7082608138405271, + "grad_norm": 0.8645603656768799, + "learning_rate": 3.914029361774798e-06, + "loss": 0.323, + "step": 31775 + }, + { + "epoch": 0.7083722632211471, + "grad_norm": 0.609553337097168, + "learning_rate": 3.911251510991969e-06, + "loss": 0.2522, + "step": 31780 + }, + { + "epoch": 0.7084837126017672, + "grad_norm": 0.454108327627182, + "learning_rate": 3.90847440664271e-06, + "loss": 0.2475, + "step": 31785 + }, + { + "epoch": 0.7085951619823873, + "grad_norm": 1.1150599718093872, + "learning_rate": 3.905698049067466e-06, + "loss": 0.2728, + "step": 31790 + }, + { + "epoch": 0.7087066113630074, + "grad_norm": 0.6766877174377441, + "learning_rate": 3.902922438606603e-06, + "loss": 0.2768, + "step": 31795 + }, + { + "epoch": 0.7088180607436274, + "grad_norm": 0.7520423531532288, + "learning_rate": 3.9001475756003884e-06, + "loss": 0.4019, + "step": 31800 + }, + { + "epoch": 0.7089295101242474, + "grad_norm": 0.3129575848579407, + "learning_rate": 3.8973734603889965e-06, + "loss": 0.2247, + "step": 31805 + }, + { + "epoch": 0.7090409595048676, + "grad_norm": 0.3450331687927246, + "learning_rate": 3.8946000933125104e-06, + "loss": 0.246, + "step": 31810 + }, + { + "epoch": 0.7091524088854876, + "grad_norm": 0.7495473623275757, + "learning_rate": 3.89182747471093e-06, + "loss": 0.3223, + "step": 31815 + }, + { + "epoch": 0.7092638582661077, + "grad_norm": 0.6589322686195374, + "learning_rate": 3.889055604924152e-06, + "loss": 0.341, + "step": 31820 + }, + { + "epoch": 0.7093753076467277, + "grad_norm": 0.6266372203826904, + "learning_rate": 3.886284484291985e-06, + "loss": 0.2727, + "step": 31825 + }, + { + "epoch": 0.7094867570273479, + "grad_norm": 0.5666084289550781, + "learning_rate": 3.883514113154154e-06, + "loss": 0.2586, + "step": 31830 + }, + { + "epoch": 0.7095982064079679, + "grad_norm": 0.47962307929992676, + "learning_rate": 3.88074449185028e-06, + "loss": 0.3381, + "step": 31835 + }, + { + "epoch": 0.7097096557885879, + "grad_norm": 0.4192776083946228, + "learning_rate": 3.877975620719893e-06, + "loss": 0.22, + "step": 31840 + }, + { + "epoch": 0.709821105169208, + "grad_norm": 0.6102100014686584, + "learning_rate": 3.8752075001024455e-06, + "loss": 0.2446, + "step": 31845 + }, + { + "epoch": 0.7099325545498281, + "grad_norm": 0.801941990852356, + "learning_rate": 3.872440130337282e-06, + "loss": 0.2804, + "step": 31850 + }, + { + "epoch": 0.7100440039304482, + "grad_norm": 0.7876242995262146, + "learning_rate": 3.869673511763661e-06, + "loss": 0.3862, + "step": 31855 + }, + { + "epoch": 0.7101554533110682, + "grad_norm": 0.3692980706691742, + "learning_rate": 3.866907644720744e-06, + "loss": 0.3608, + "step": 31860 + }, + { + "epoch": 0.7102669026916882, + "grad_norm": 0.65399169921875, + "learning_rate": 3.864142529547614e-06, + "loss": 0.27, + "step": 31865 + }, + { + "epoch": 0.7103783520723084, + "grad_norm": 0.6146334409713745, + "learning_rate": 3.861378166583248e-06, + "loss": 0.3073, + "step": 31870 + }, + { + "epoch": 0.7104898014529284, + "grad_norm": 0.7848692536354065, + "learning_rate": 3.858614556166532e-06, + "loss": 0.2501, + "step": 31875 + }, + { + "epoch": 0.7106012508335485, + "grad_norm": 0.5123720765113831, + "learning_rate": 3.855851698636271e-06, + "loss": 0.3775, + "step": 31880 + }, + { + "epoch": 0.7107127002141685, + "grad_norm": 0.823817789554596, + "learning_rate": 3.853089594331168e-06, + "loss": 0.2522, + "step": 31885 + }, + { + "epoch": 0.7108241495947887, + "grad_norm": 0.5935199856758118, + "learning_rate": 3.850328243589832e-06, + "loss": 0.3239, + "step": 31890 + }, + { + "epoch": 0.7109355989754087, + "grad_norm": 0.7279103994369507, + "learning_rate": 3.847567646750782e-06, + "loss": 0.2592, + "step": 31895 + }, + { + "epoch": 0.7110470483560287, + "grad_norm": 0.7195340991020203, + "learning_rate": 3.8448078041524515e-06, + "loss": 0.249, + "step": 31900 + }, + { + "epoch": 0.7111584977366489, + "grad_norm": 0.619198739528656, + "learning_rate": 3.842048716133172e-06, + "loss": 0.3505, + "step": 31905 + }, + { + "epoch": 0.7112699471172689, + "grad_norm": 1.096381425857544, + "learning_rate": 3.8392903830311905e-06, + "loss": 0.262, + "step": 31910 + }, + { + "epoch": 0.711381396497889, + "grad_norm": 0.5376746654510498, + "learning_rate": 3.836532805184654e-06, + "loss": 0.24, + "step": 31915 + }, + { + "epoch": 0.711492845878509, + "grad_norm": 0.7633156180381775, + "learning_rate": 3.833775982931621e-06, + "loss": 0.2999, + "step": 31920 + }, + { + "epoch": 0.711604295259129, + "grad_norm": 0.5743112564086914, + "learning_rate": 3.831019916610057e-06, + "loss": 0.2885, + "step": 31925 + }, + { + "epoch": 0.7117157446397492, + "grad_norm": 0.6933272480964661, + "learning_rate": 3.82826460655783e-06, + "loss": 0.3165, + "step": 31930 + }, + { + "epoch": 0.7118271940203692, + "grad_norm": 0.8295173645019531, + "learning_rate": 3.825510053112724e-06, + "loss": 0.2561, + "step": 31935 + }, + { + "epoch": 0.7119386434009893, + "grad_norm": 0.5710833668708801, + "learning_rate": 3.82275625661243e-06, + "loss": 0.3247, + "step": 31940 + }, + { + "epoch": 0.7120500927816094, + "grad_norm": 0.7485097646713257, + "learning_rate": 3.820003217394537e-06, + "loss": 0.2412, + "step": 31945 + }, + { + "epoch": 0.7121615421622295, + "grad_norm": 0.6061316728591919, + "learning_rate": 3.817250935796547e-06, + "loss": 0.2984, + "step": 31950 + }, + { + "epoch": 0.7122729915428495, + "grad_norm": 0.5282992720603943, + "learning_rate": 3.8144994121558698e-06, + "loss": 0.1346, + "step": 31955 + }, + { + "epoch": 0.7123844409234695, + "grad_norm": 0.680298924446106, + "learning_rate": 3.8117486468098198e-06, + "loss": 0.221, + "step": 31960 + }, + { + "epoch": 0.7124958903040897, + "grad_norm": 0.802907407283783, + "learning_rate": 3.8089986400956156e-06, + "loss": 0.3548, + "step": 31965 + }, + { + "epoch": 0.7126073396847097, + "grad_norm": 0.6295163631439209, + "learning_rate": 3.806249392350392e-06, + "loss": 0.3174, + "step": 31970 + }, + { + "epoch": 0.7127187890653298, + "grad_norm": 0.4468139708042145, + "learning_rate": 3.803500903911187e-06, + "loss": 0.2824, + "step": 31975 + }, + { + "epoch": 0.7128302384459498, + "grad_norm": 0.3595259487628937, + "learning_rate": 3.8007531751149417e-06, + "loss": 0.2313, + "step": 31980 + }, + { + "epoch": 0.7129416878265699, + "grad_norm": 0.6914778351783752, + "learning_rate": 3.7980062062985056e-06, + "loss": 0.2235, + "step": 31985 + }, + { + "epoch": 0.71305313720719, + "grad_norm": 0.939741313457489, + "learning_rate": 3.795259997798638e-06, + "loss": 0.2855, + "step": 31990 + }, + { + "epoch": 0.71316458658781, + "grad_norm": 0.5949002504348755, + "learning_rate": 3.7925145499519967e-06, + "loss": 0.4313, + "step": 31995 + }, + { + "epoch": 0.7132760359684301, + "grad_norm": 0.37591123580932617, + "learning_rate": 3.7897698630951584e-06, + "loss": 0.3378, + "step": 32000 + }, + { + "epoch": 0.7133874853490502, + "grad_norm": 0.6279875040054321, + "learning_rate": 3.787025937564601e-06, + "loss": 0.2906, + "step": 32005 + }, + { + "epoch": 0.7134989347296702, + "grad_norm": 0.37446829676628113, + "learning_rate": 3.784282773696708e-06, + "loss": 0.2244, + "step": 32010 + }, + { + "epoch": 0.7136103841102903, + "grad_norm": 0.8711704611778259, + "learning_rate": 3.78154037182777e-06, + "loss": 0.2275, + "step": 32015 + }, + { + "epoch": 0.7137218334909103, + "grad_norm": 0.5769466161727905, + "learning_rate": 3.778798732293981e-06, + "loss": 0.1817, + "step": 32020 + }, + { + "epoch": 0.7138332828715305, + "grad_norm": 0.6223900318145752, + "learning_rate": 3.7760578554314454e-06, + "loss": 0.4379, + "step": 32025 + }, + { + "epoch": 0.7139447322521505, + "grad_norm": 0.6333432197570801, + "learning_rate": 3.773317741576178e-06, + "loss": 0.2252, + "step": 32030 + }, + { + "epoch": 0.7140561816327706, + "grad_norm": 0.3799491226673126, + "learning_rate": 3.770578391064089e-06, + "loss": 0.2514, + "step": 32035 + }, + { + "epoch": 0.7141676310133906, + "grad_norm": 0.4907926321029663, + "learning_rate": 3.7678398042310106e-06, + "loss": 0.2987, + "step": 32040 + }, + { + "epoch": 0.7142790803940107, + "grad_norm": 0.43275654315948486, + "learning_rate": 3.7651019814126656e-06, + "loss": 0.3212, + "step": 32045 + }, + { + "epoch": 0.7143905297746308, + "grad_norm": 0.8039458394050598, + "learning_rate": 3.7623649229446922e-06, + "loss": 0.31, + "step": 32050 + }, + { + "epoch": 0.7145019791552508, + "grad_norm": 0.6455947160720825, + "learning_rate": 3.759628629162633e-06, + "loss": 0.2957, + "step": 32055 + }, + { + "epoch": 0.714613428535871, + "grad_norm": 0.9271951913833618, + "learning_rate": 3.7568931004019306e-06, + "loss": 0.2133, + "step": 32060 + }, + { + "epoch": 0.714724877916491, + "grad_norm": 0.7006394267082214, + "learning_rate": 3.7541583369979484e-06, + "loss": 0.2565, + "step": 32065 + }, + { + "epoch": 0.714836327297111, + "grad_norm": 0.765282928943634, + "learning_rate": 3.7514243392859406e-06, + "loss": 0.2578, + "step": 32070 + }, + { + "epoch": 0.7149477766777311, + "grad_norm": 0.9573498368263245, + "learning_rate": 3.748691107601081e-06, + "loss": 0.3838, + "step": 32075 + }, + { + "epoch": 0.7150592260583511, + "grad_norm": 0.3499448001384735, + "learning_rate": 3.7459586422784387e-06, + "loss": 0.2668, + "step": 32080 + }, + { + "epoch": 0.7151706754389713, + "grad_norm": 0.5847846269607544, + "learning_rate": 3.7432269436529934e-06, + "loss": 0.3042, + "step": 32085 + }, + { + "epoch": 0.7152821248195913, + "grad_norm": 0.8232019543647766, + "learning_rate": 3.7404960120596256e-06, + "loss": 0.2874, + "step": 32090 + }, + { + "epoch": 0.7153935742002114, + "grad_norm": 0.8658748865127563, + "learning_rate": 3.7377658478331347e-06, + "loss": 0.2938, + "step": 32095 + }, + { + "epoch": 0.7155050235808315, + "grad_norm": 0.7339183688163757, + "learning_rate": 3.7350364513082137e-06, + "loss": 0.2917, + "step": 32100 + }, + { + "epoch": 0.7156164729614515, + "grad_norm": 0.61677086353302, + "learning_rate": 3.732307822819462e-06, + "loss": 0.3273, + "step": 32105 + }, + { + "epoch": 0.7157279223420716, + "grad_norm": 0.9375027418136597, + "learning_rate": 3.7295799627013964e-06, + "loss": 0.3053, + "step": 32110 + }, + { + "epoch": 0.7158393717226916, + "grad_norm": 0.4487803280353546, + "learning_rate": 3.7268528712884254e-06, + "loss": 0.2954, + "step": 32115 + }, + { + "epoch": 0.7159508211033118, + "grad_norm": 0.35287606716156006, + "learning_rate": 3.724126548914869e-06, + "loss": 0.2638, + "step": 32120 + }, + { + "epoch": 0.7160622704839318, + "grad_norm": 0.5714026093482971, + "learning_rate": 3.721400995914959e-06, + "loss": 0.3062, + "step": 32125 + }, + { + "epoch": 0.7161737198645518, + "grad_norm": 0.6228859424591064, + "learning_rate": 3.7186762126228227e-06, + "loss": 0.2557, + "step": 32130 + }, + { + "epoch": 0.7162851692451719, + "grad_norm": 0.761638343334198, + "learning_rate": 3.715952199372499e-06, + "loss": 0.278, + "step": 32135 + }, + { + "epoch": 0.716396618625792, + "grad_norm": 0.8885366320610046, + "learning_rate": 3.7132289564979273e-06, + "loss": 0.3607, + "step": 32140 + }, + { + "epoch": 0.7165080680064121, + "grad_norm": 0.7892611622810364, + "learning_rate": 3.710506484332962e-06, + "loss": 0.2887, + "step": 32145 + }, + { + "epoch": 0.7166195173870321, + "grad_norm": 0.6966431736946106, + "learning_rate": 3.7077847832113502e-06, + "loss": 0.4219, + "step": 32150 + }, + { + "epoch": 0.7167309667676521, + "grad_norm": 0.5574402809143066, + "learning_rate": 3.705063853466759e-06, + "loss": 0.2245, + "step": 32155 + }, + { + "epoch": 0.7168424161482723, + "grad_norm": 0.6996751427650452, + "learning_rate": 3.7023436954327507e-06, + "loss": 0.3271, + "step": 32160 + }, + { + "epoch": 0.7169538655288923, + "grad_norm": 0.8541873693466187, + "learning_rate": 3.6996243094427955e-06, + "loss": 0.391, + "step": 32165 + }, + { + "epoch": 0.7170653149095124, + "grad_norm": 1.0597639083862305, + "learning_rate": 3.6969056958302673e-06, + "loss": 0.3295, + "step": 32170 + }, + { + "epoch": 0.7171767642901324, + "grad_norm": 0.37856525182724, + "learning_rate": 3.694187854928445e-06, + "loss": 0.2672, + "step": 32175 + }, + { + "epoch": 0.7172882136707526, + "grad_norm": 0.46974796056747437, + "learning_rate": 3.6914707870705224e-06, + "loss": 0.2337, + "step": 32180 + }, + { + "epoch": 0.7173996630513726, + "grad_norm": 0.9543710350990295, + "learning_rate": 3.6887544925895826e-06, + "loss": 0.3482, + "step": 32185 + }, + { + "epoch": 0.7175111124319926, + "grad_norm": 0.5527359843254089, + "learning_rate": 3.6860389718186306e-06, + "loss": 0.2452, + "step": 32190 + }, + { + "epoch": 0.7176225618126127, + "grad_norm": 0.5871400833129883, + "learning_rate": 3.6833242250905644e-06, + "loss": 0.3222, + "step": 32195 + }, + { + "epoch": 0.7177340111932328, + "grad_norm": 0.5523154139518738, + "learning_rate": 3.6806102527381916e-06, + "loss": 0.1973, + "step": 32200 + }, + { + "epoch": 0.7178454605738529, + "grad_norm": 0.724926233291626, + "learning_rate": 3.6778970550942227e-06, + "loss": 0.2868, + "step": 32205 + }, + { + "epoch": 0.7179569099544729, + "grad_norm": 0.7483214139938354, + "learning_rate": 3.675184632491272e-06, + "loss": 0.2372, + "step": 32210 + }, + { + "epoch": 0.7180683593350929, + "grad_norm": 0.46413981914520264, + "learning_rate": 3.672472985261866e-06, + "loss": 0.4091, + "step": 32215 + }, + { + "epoch": 0.7181798087157131, + "grad_norm": 0.6509472131729126, + "learning_rate": 3.669762113738434e-06, + "loss": 0.294, + "step": 32220 + }, + { + "epoch": 0.7182912580963331, + "grad_norm": 0.8137038350105286, + "learning_rate": 3.6670520182533054e-06, + "loss": 0.3554, + "step": 32225 + }, + { + "epoch": 0.7184027074769532, + "grad_norm": 0.6024363040924072, + "learning_rate": 3.6643426991387167e-06, + "loss": 0.2791, + "step": 32230 + }, + { + "epoch": 0.7185141568575732, + "grad_norm": 0.7786808609962463, + "learning_rate": 3.661634156726809e-06, + "loss": 0.3118, + "step": 32235 + }, + { + "epoch": 0.7186256062381934, + "grad_norm": 0.3448847234249115, + "learning_rate": 3.6589263913496242e-06, + "loss": 0.2843, + "step": 32240 + }, + { + "epoch": 0.7187370556188134, + "grad_norm": 0.4758356213569641, + "learning_rate": 3.65621940333912e-06, + "loss": 0.2978, + "step": 32245 + }, + { + "epoch": 0.7188485049994334, + "grad_norm": 0.5408560037612915, + "learning_rate": 3.653513193027154e-06, + "loss": 0.2222, + "step": 32250 + }, + { + "epoch": 0.7189599543800536, + "grad_norm": 0.6804062724113464, + "learning_rate": 3.6508077607454818e-06, + "loss": 0.3638, + "step": 32255 + }, + { + "epoch": 0.7190714037606736, + "grad_norm": 0.4784459173679352, + "learning_rate": 3.648103106825771e-06, + "loss": 0.2286, + "step": 32260 + }, + { + "epoch": 0.7191828531412937, + "grad_norm": 0.43720412254333496, + "learning_rate": 3.64539923159959e-06, + "loss": 0.3287, + "step": 32265 + }, + { + "epoch": 0.7192943025219137, + "grad_norm": 0.56275874376297, + "learning_rate": 3.6426961353984125e-06, + "loss": 0.2435, + "step": 32270 + }, + { + "epoch": 0.7194057519025338, + "grad_norm": 0.3388124406337738, + "learning_rate": 3.6399938185536153e-06, + "loss": 0.2135, + "step": 32275 + }, + { + "epoch": 0.7195172012831539, + "grad_norm": 0.8845019936561584, + "learning_rate": 3.637292281396484e-06, + "loss": 0.2409, + "step": 32280 + }, + { + "epoch": 0.7196286506637739, + "grad_norm": 0.6715808510780334, + "learning_rate": 3.6345915242582096e-06, + "loss": 0.3154, + "step": 32285 + }, + { + "epoch": 0.719740100044394, + "grad_norm": 0.6816524863243103, + "learning_rate": 3.631891547469881e-06, + "loss": 0.4458, + "step": 32290 + }, + { + "epoch": 0.7198515494250141, + "grad_norm": 0.427211731672287, + "learning_rate": 3.6291923513624948e-06, + "loss": 0.3249, + "step": 32295 + }, + { + "epoch": 0.7199629988056342, + "grad_norm": 0.7947831749916077, + "learning_rate": 3.6264939362669517e-06, + "loss": 0.2995, + "step": 32300 + }, + { + "epoch": 0.7200744481862542, + "grad_norm": 0.7609732747077942, + "learning_rate": 3.623796302514051e-06, + "loss": 0.2699, + "step": 32305 + }, + { + "epoch": 0.7201858975668742, + "grad_norm": 0.6190392374992371, + "learning_rate": 3.621099450434512e-06, + "loss": 0.2547, + "step": 32310 + }, + { + "epoch": 0.7202973469474944, + "grad_norm": 0.404293030500412, + "learning_rate": 3.618403380358941e-06, + "loss": 0.3716, + "step": 32315 + }, + { + "epoch": 0.7204087963281144, + "grad_norm": 0.45661965012550354, + "learning_rate": 3.6157080926178556e-06, + "loss": 0.3288, + "step": 32320 + }, + { + "epoch": 0.7205202457087345, + "grad_norm": 0.49279457330703735, + "learning_rate": 3.6130135875416816e-06, + "loss": 0.2279, + "step": 32325 + }, + { + "epoch": 0.7206316950893545, + "grad_norm": 0.6576529741287231, + "learning_rate": 3.610319865460742e-06, + "loss": 0.2772, + "step": 32330 + }, + { + "epoch": 0.7207431444699746, + "grad_norm": 0.5785512924194336, + "learning_rate": 3.607626926705262e-06, + "loss": 0.3288, + "step": 32335 + }, + { + "epoch": 0.7208545938505947, + "grad_norm": 0.4410577118396759, + "learning_rate": 3.6049347716053838e-06, + "loss": 0.3005, + "step": 32340 + }, + { + "epoch": 0.7209660432312147, + "grad_norm": 0.8653692007064819, + "learning_rate": 3.60224340049114e-06, + "loss": 0.3509, + "step": 32345 + }, + { + "epoch": 0.7210774926118348, + "grad_norm": 0.6026588082313538, + "learning_rate": 3.599552813692472e-06, + "loss": 0.34, + "step": 32350 + }, + { + "epoch": 0.7211889419924549, + "grad_norm": 0.4423521161079407, + "learning_rate": 3.596863011539221e-06, + "loss": 0.2271, + "step": 32355 + }, + { + "epoch": 0.7213003913730749, + "grad_norm": 0.5537447333335876, + "learning_rate": 3.594173994361144e-06, + "loss": 0.3005, + "step": 32360 + }, + { + "epoch": 0.721411840753695, + "grad_norm": 0.42409682273864746, + "learning_rate": 3.5914857624878898e-06, + "loss": 0.2173, + "step": 32365 + }, + { + "epoch": 0.721523290134315, + "grad_norm": 0.698688268661499, + "learning_rate": 3.5887983162490125e-06, + "loss": 0.2936, + "step": 32370 + }, + { + "epoch": 0.7216347395149352, + "grad_norm": 0.6022817492485046, + "learning_rate": 3.5861116559739772e-06, + "loss": 0.3066, + "step": 32375 + }, + { + "epoch": 0.7217461888955552, + "grad_norm": 0.9220947623252869, + "learning_rate": 3.583425781992146e-06, + "loss": 0.2283, + "step": 32380 + }, + { + "epoch": 0.7218576382761753, + "grad_norm": 0.3861825466156006, + "learning_rate": 3.5807406946327847e-06, + "loss": 0.3059, + "step": 32385 + }, + { + "epoch": 0.7219690876567953, + "grad_norm": 0.3583422899246216, + "learning_rate": 3.5780563942250623e-06, + "loss": 0.2054, + "step": 32390 + }, + { + "epoch": 0.7220805370374154, + "grad_norm": 0.4105013608932495, + "learning_rate": 3.575372881098059e-06, + "loss": 0.2404, + "step": 32395 + }, + { + "epoch": 0.7221919864180355, + "grad_norm": 0.5124438405036926, + "learning_rate": 3.572690155580747e-06, + "loss": 0.179, + "step": 32400 + }, + { + "epoch": 0.7223034357986555, + "grad_norm": 0.3848888874053955, + "learning_rate": 3.5700082180020147e-06, + "loss": 0.3135, + "step": 32405 + }, + { + "epoch": 0.7224148851792757, + "grad_norm": 0.5860435962677002, + "learning_rate": 3.5673270686906424e-06, + "loss": 0.2559, + "step": 32410 + }, + { + "epoch": 0.7225263345598957, + "grad_norm": 0.5059179663658142, + "learning_rate": 3.564646707975319e-06, + "loss": 0.3659, + "step": 32415 + }, + { + "epoch": 0.7226377839405157, + "grad_norm": 0.671142041683197, + "learning_rate": 3.561967136184635e-06, + "loss": 0.3804, + "step": 32420 + }, + { + "epoch": 0.7227492333211358, + "grad_norm": 0.6313089728355408, + "learning_rate": 3.5592883536470836e-06, + "loss": 0.2195, + "step": 32425 + }, + { + "epoch": 0.7228606827017559, + "grad_norm": 0.5060765743255615, + "learning_rate": 3.5566103606910652e-06, + "loss": 0.3807, + "step": 32430 + }, + { + "epoch": 0.722972132082376, + "grad_norm": 0.8583391308784485, + "learning_rate": 3.5539331576448854e-06, + "loss": 0.3069, + "step": 32435 + }, + { + "epoch": 0.723083581462996, + "grad_norm": 0.8933718204498291, + "learning_rate": 3.551256744836743e-06, + "loss": 0.2781, + "step": 32440 + }, + { + "epoch": 0.7231950308436161, + "grad_norm": 0.7085146903991699, + "learning_rate": 3.5485811225947485e-06, + "loss": 0.2548, + "step": 32445 + }, + { + "epoch": 0.7233064802242362, + "grad_norm": 0.6724334359169006, + "learning_rate": 3.54590629124691e-06, + "loss": 0.2972, + "step": 32450 + }, + { + "epoch": 0.7234179296048562, + "grad_norm": 0.5674258470535278, + "learning_rate": 3.5432322511211393e-06, + "loss": 0.3705, + "step": 32455 + }, + { + "epoch": 0.7235293789854763, + "grad_norm": 0.5062234997749329, + "learning_rate": 3.5405590025452565e-06, + "loss": 0.1177, + "step": 32460 + }, + { + "epoch": 0.7236408283660963, + "grad_norm": 0.5411877632141113, + "learning_rate": 3.5378865458469824e-06, + "loss": 0.1958, + "step": 32465 + }, + { + "epoch": 0.7237522777467165, + "grad_norm": 0.6080607771873474, + "learning_rate": 3.53521488135394e-06, + "loss": 0.2997, + "step": 32470 + }, + { + "epoch": 0.7238637271273365, + "grad_norm": 0.3965998888015747, + "learning_rate": 3.5325440093936513e-06, + "loss": 0.2094, + "step": 32475 + }, + { + "epoch": 0.7239751765079565, + "grad_norm": 0.8515360951423645, + "learning_rate": 3.529873930293546e-06, + "loss": 0.2023, + "step": 32480 + }, + { + "epoch": 0.7240866258885766, + "grad_norm": 0.7043160200119019, + "learning_rate": 3.527204644380956e-06, + "loss": 0.2765, + "step": 32485 + }, + { + "epoch": 0.7241980752691967, + "grad_norm": 0.5239383578300476, + "learning_rate": 3.52453615198311e-06, + "loss": 0.2129, + "step": 32490 + }, + { + "epoch": 0.7243095246498168, + "grad_norm": 0.4187348484992981, + "learning_rate": 3.5218684534271497e-06, + "loss": 0.2735, + "step": 32495 + }, + { + "epoch": 0.7244209740304368, + "grad_norm": 0.5309123396873474, + "learning_rate": 3.5192015490401165e-06, + "loss": 0.2541, + "step": 32500 + }, + { + "epoch": 0.724532423411057, + "grad_norm": 0.48658499121665955, + "learning_rate": 3.516535439148949e-06, + "loss": 0.3096, + "step": 32505 + }, + { + "epoch": 0.724643872791677, + "grad_norm": 0.49070295691490173, + "learning_rate": 3.5138701240804927e-06, + "loss": 0.2836, + "step": 32510 + }, + { + "epoch": 0.724755322172297, + "grad_norm": 0.5202540755271912, + "learning_rate": 3.5112056041614927e-06, + "loss": 0.189, + "step": 32515 + }, + { + "epoch": 0.7248667715529171, + "grad_norm": 0.49356088042259216, + "learning_rate": 3.5085418797185977e-06, + "loss": 0.2788, + "step": 32520 + }, + { + "epoch": 0.7249782209335371, + "grad_norm": 0.792779803276062, + "learning_rate": 3.505878951078365e-06, + "loss": 0.3265, + "step": 32525 + }, + { + "epoch": 0.7250896703141573, + "grad_norm": 0.5415936708450317, + "learning_rate": 3.5032168185672423e-06, + "loss": 0.2231, + "step": 32530 + }, + { + "epoch": 0.7252011196947773, + "grad_norm": 0.9408218860626221, + "learning_rate": 3.500555482511594e-06, + "loss": 0.311, + "step": 32535 + }, + { + "epoch": 0.7253125690753973, + "grad_norm": 0.9326852560043335, + "learning_rate": 3.4978949432376753e-06, + "loss": 0.3399, + "step": 32540 + }, + { + "epoch": 0.7254240184560174, + "grad_norm": 0.5972670912742615, + "learning_rate": 3.4952352010716472e-06, + "loss": 0.3099, + "step": 32545 + }, + { + "epoch": 0.7255354678366375, + "grad_norm": 0.4196832478046417, + "learning_rate": 3.4925762563395714e-06, + "loss": 0.2358, + "step": 32550 + }, + { + "epoch": 0.7256469172172576, + "grad_norm": 0.6563181281089783, + "learning_rate": 3.489918109367422e-06, + "loss": 0.2752, + "step": 32555 + }, + { + "epoch": 0.7257583665978776, + "grad_norm": 0.501997172832489, + "learning_rate": 3.4872607604810605e-06, + "loss": 0.26, + "step": 32560 + }, + { + "epoch": 0.7258698159784976, + "grad_norm": 1.001268744468689, + "learning_rate": 3.484604210006256e-06, + "loss": 0.4104, + "step": 32565 + }, + { + "epoch": 0.7259812653591178, + "grad_norm": 0.5787915587425232, + "learning_rate": 3.481948458268688e-06, + "loss": 0.1873, + "step": 32570 + }, + { + "epoch": 0.7260927147397378, + "grad_norm": 1.107448935508728, + "learning_rate": 3.479293505593927e-06, + "loss": 0.3459, + "step": 32575 + }, + { + "epoch": 0.7262041641203579, + "grad_norm": 0.7938663363456726, + "learning_rate": 3.4766393523074504e-06, + "loss": 0.2198, + "step": 32580 + }, + { + "epoch": 0.726315613500978, + "grad_norm": 0.6893438696861267, + "learning_rate": 3.4739859987346325e-06, + "loss": 0.3321, + "step": 32585 + }, + { + "epoch": 0.7264270628815981, + "grad_norm": 0.5795388221740723, + "learning_rate": 3.471333445200762e-06, + "loss": 0.2203, + "step": 32590 + }, + { + "epoch": 0.7265385122622181, + "grad_norm": 0.863497257232666, + "learning_rate": 3.4686816920310175e-06, + "loss": 0.2348, + "step": 32595 + }, + { + "epoch": 0.7266499616428381, + "grad_norm": 0.8609886169433594, + "learning_rate": 3.466030739550481e-06, + "loss": 0.3007, + "step": 32600 + }, + { + "epoch": 0.7267614110234583, + "grad_norm": 0.37407514452934265, + "learning_rate": 3.463380588084143e-06, + "loss": 0.181, + "step": 32605 + }, + { + "epoch": 0.7268728604040783, + "grad_norm": 0.6242061853408813, + "learning_rate": 3.4607312379568913e-06, + "loss": 0.1466, + "step": 32610 + }, + { + "epoch": 0.7269843097846984, + "grad_norm": 0.6473770141601562, + "learning_rate": 3.4580826894935104e-06, + "loss": 0.2335, + "step": 32615 + }, + { + "epoch": 0.7270957591653184, + "grad_norm": 0.6805868148803711, + "learning_rate": 3.4554349430186997e-06, + "loss": 0.343, + "step": 32620 + }, + { + "epoch": 0.7272072085459385, + "grad_norm": 0.897735595703125, + "learning_rate": 3.452787998857048e-06, + "loss": 0.3243, + "step": 32625 + }, + { + "epoch": 0.7273186579265586, + "grad_norm": 0.6950689554214478, + "learning_rate": 3.4501418573330516e-06, + "loss": 0.3108, + "step": 32630 + }, + { + "epoch": 0.7274301073071786, + "grad_norm": 0.8856807351112366, + "learning_rate": 3.447496518771103e-06, + "loss": 0.4225, + "step": 32635 + }, + { + "epoch": 0.7275415566877987, + "grad_norm": 0.5541945695877075, + "learning_rate": 3.4448519834955065e-06, + "loss": 0.3201, + "step": 32640 + }, + { + "epoch": 0.7276530060684188, + "grad_norm": 1.209107756614685, + "learning_rate": 3.4422082518304555e-06, + "loss": 0.3038, + "step": 32645 + }, + { + "epoch": 0.7277644554490389, + "grad_norm": 0.6372049450874329, + "learning_rate": 3.4395653241000584e-06, + "loss": 0.2701, + "step": 32650 + }, + { + "epoch": 0.7278759048296589, + "grad_norm": 0.6735883355140686, + "learning_rate": 3.4369232006283137e-06, + "loss": 0.2318, + "step": 32655 + }, + { + "epoch": 0.7279873542102789, + "grad_norm": 0.5158315896987915, + "learning_rate": 3.4342818817391253e-06, + "loss": 0.3009, + "step": 32660 + }, + { + "epoch": 0.7280988035908991, + "grad_norm": 0.6828005909919739, + "learning_rate": 3.4316413677562976e-06, + "loss": 0.2938, + "step": 32665 + }, + { + "epoch": 0.7282102529715191, + "grad_norm": 0.6131678819656372, + "learning_rate": 3.4290016590035367e-06, + "loss": 0.2791, + "step": 32670 + }, + { + "epoch": 0.7283217023521392, + "grad_norm": 0.4129774272441864, + "learning_rate": 3.4263627558044543e-06, + "loss": 0.318, + "step": 32675 + }, + { + "epoch": 0.7284331517327592, + "grad_norm": 0.5805781483650208, + "learning_rate": 3.4237246584825545e-06, + "loss": 0.2985, + "step": 32680 + }, + { + "epoch": 0.7285446011133793, + "grad_norm": 0.7275981307029724, + "learning_rate": 3.4210873673612534e-06, + "loss": 0.2832, + "step": 32685 + }, + { + "epoch": 0.7286560504939994, + "grad_norm": 0.7103040814399719, + "learning_rate": 3.4184508827638597e-06, + "loss": 0.3154, + "step": 32690 + }, + { + "epoch": 0.7287674998746194, + "grad_norm": 0.899198055267334, + "learning_rate": 3.4158152050135864e-06, + "loss": 0.3092, + "step": 32695 + }, + { + "epoch": 0.7288789492552395, + "grad_norm": 0.6193143725395203, + "learning_rate": 3.413180334433547e-06, + "loss": 0.1996, + "step": 32700 + }, + { + "epoch": 0.7289903986358596, + "grad_norm": 0.46193233132362366, + "learning_rate": 3.410546271346752e-06, + "loss": 0.3823, + "step": 32705 + }, + { + "epoch": 0.7291018480164797, + "grad_norm": 0.46639421582221985, + "learning_rate": 3.4079130160761222e-06, + "loss": 0.2493, + "step": 32710 + }, + { + "epoch": 0.7292132973970997, + "grad_norm": 0.846572756767273, + "learning_rate": 3.4052805689444757e-06, + "loss": 0.3425, + "step": 32715 + }, + { + "epoch": 0.7293247467777197, + "grad_norm": 1.2694923877716064, + "learning_rate": 3.402648930274529e-06, + "loss": 0.2782, + "step": 32720 + }, + { + "epoch": 0.7294361961583399, + "grad_norm": 0.6195218563079834, + "learning_rate": 3.4000181003889e-06, + "loss": 0.2875, + "step": 32725 + }, + { + "epoch": 0.7295476455389599, + "grad_norm": 0.8452773094177246, + "learning_rate": 3.3973880796101067e-06, + "loss": 0.3262, + "step": 32730 + }, + { + "epoch": 0.72965909491958, + "grad_norm": 0.7911295294761658, + "learning_rate": 3.394758868260568e-06, + "loss": 0.3521, + "step": 32735 + }, + { + "epoch": 0.7297705443002, + "grad_norm": 0.6133013367652893, + "learning_rate": 3.3921304666626075e-06, + "loss": 0.3355, + "step": 32740 + }, + { + "epoch": 0.7298819936808201, + "grad_norm": 0.5400652885437012, + "learning_rate": 3.3895028751384495e-06, + "loss": 0.2788, + "step": 32745 + }, + { + "epoch": 0.7299934430614402, + "grad_norm": 0.7221797704696655, + "learning_rate": 3.386876094010214e-06, + "loss": 0.2963, + "step": 32750 + }, + { + "epoch": 0.7301048924420602, + "grad_norm": 0.6148155331611633, + "learning_rate": 3.3842501235999246e-06, + "loss": 0.2763, + "step": 32755 + }, + { + "epoch": 0.7302163418226804, + "grad_norm": 0.6763278245925903, + "learning_rate": 3.381624964229504e-06, + "loss": 0.2316, + "step": 32760 + }, + { + "epoch": 0.7303277912033004, + "grad_norm": 0.6103901863098145, + "learning_rate": 3.3790006162207722e-06, + "loss": 0.3596, + "step": 32765 + }, + { + "epoch": 0.7304392405839204, + "grad_norm": 0.7152795195579529, + "learning_rate": 3.3763770798954633e-06, + "loss": 0.2931, + "step": 32770 + }, + { + "epoch": 0.7305506899645405, + "grad_norm": 0.5556166768074036, + "learning_rate": 3.3737543555751937e-06, + "loss": 0.2399, + "step": 32775 + }, + { + "epoch": 0.7306621393451606, + "grad_norm": 0.6293778419494629, + "learning_rate": 3.3711324435814973e-06, + "loss": 0.284, + "step": 32780 + }, + { + "epoch": 0.7307735887257807, + "grad_norm": 0.37658339738845825, + "learning_rate": 3.3685113442357963e-06, + "loss": 0.3552, + "step": 32785 + }, + { + "epoch": 0.7308850381064007, + "grad_norm": 0.8621169328689575, + "learning_rate": 3.3658910578594173e-06, + "loss": 0.2481, + "step": 32790 + }, + { + "epoch": 0.7309964874870208, + "grad_norm": 0.5625035762786865, + "learning_rate": 3.3632715847735875e-06, + "loss": 0.383, + "step": 32795 + }, + { + "epoch": 0.7311079368676409, + "grad_norm": 0.5894595384597778, + "learning_rate": 3.36065292529943e-06, + "loss": 0.3287, + "step": 32800 + }, + { + "epoch": 0.7312193862482609, + "grad_norm": 0.8198038339614868, + "learning_rate": 3.3580350797579786e-06, + "loss": 0.2684, + "step": 32805 + }, + { + "epoch": 0.731330835628881, + "grad_norm": 0.4880506098270416, + "learning_rate": 3.355418048470156e-06, + "loss": 0.4003, + "step": 32810 + }, + { + "epoch": 0.731442285009501, + "grad_norm": 0.5881021618843079, + "learning_rate": 3.3528018317567954e-06, + "loss": 0.1828, + "step": 32815 + }, + { + "epoch": 0.7315537343901212, + "grad_norm": 0.5141913890838623, + "learning_rate": 3.3501864299386213e-06, + "loss": 0.2842, + "step": 32820 + }, + { + "epoch": 0.7316651837707412, + "grad_norm": 0.6004062294960022, + "learning_rate": 3.3475718433362623e-06, + "loss": 0.3737, + "step": 32825 + }, + { + "epoch": 0.7317766331513612, + "grad_norm": 0.9291187524795532, + "learning_rate": 3.3449580722702434e-06, + "loss": 0.2161, + "step": 32830 + }, + { + "epoch": 0.7318880825319813, + "grad_norm": 0.69871586561203, + "learning_rate": 3.342345117060999e-06, + "loss": 0.297, + "step": 32835 + }, + { + "epoch": 0.7319995319126014, + "grad_norm": 1.0281239748001099, + "learning_rate": 3.3397329780288546e-06, + "loss": 0.2879, + "step": 32840 + }, + { + "epoch": 0.7321109812932215, + "grad_norm": 0.3937816023826599, + "learning_rate": 3.3371216554940367e-06, + "loss": 0.2839, + "step": 32845 + }, + { + "epoch": 0.7322224306738415, + "grad_norm": 2.1417198181152344, + "learning_rate": 3.3345111497766713e-06, + "loss": 0.2963, + "step": 32850 + }, + { + "epoch": 0.7323338800544617, + "grad_norm": 0.7976836562156677, + "learning_rate": 3.3319014611967936e-06, + "loss": 0.2797, + "step": 32855 + }, + { + "epoch": 0.7324453294350817, + "grad_norm": 0.8146697878837585, + "learning_rate": 3.329292590074322e-06, + "loss": 0.1875, + "step": 32860 + }, + { + "epoch": 0.7325567788157017, + "grad_norm": 1.0141352415084839, + "learning_rate": 3.3266845367290934e-06, + "loss": 0.241, + "step": 32865 + }, + { + "epoch": 0.7326682281963218, + "grad_norm": 0.8705369830131531, + "learning_rate": 3.3240773014808303e-06, + "loss": 0.2844, + "step": 32870 + }, + { + "epoch": 0.7327796775769418, + "grad_norm": 0.5327593088150024, + "learning_rate": 3.3214708846491594e-06, + "loss": 0.2567, + "step": 32875 + }, + { + "epoch": 0.732891126957562, + "grad_norm": 0.909140944480896, + "learning_rate": 3.3188652865536074e-06, + "loss": 0.3596, + "step": 32880 + }, + { + "epoch": 0.733002576338182, + "grad_norm": 0.5487768054008484, + "learning_rate": 3.3162605075135988e-06, + "loss": 0.2859, + "step": 32885 + }, + { + "epoch": 0.733114025718802, + "grad_norm": 0.5730482935905457, + "learning_rate": 3.3136565478484638e-06, + "loss": 0.2157, + "step": 32890 + }, + { + "epoch": 0.7332254750994222, + "grad_norm": 0.7908298969268799, + "learning_rate": 3.3110534078774224e-06, + "loss": 0.2703, + "step": 32895 + }, + { + "epoch": 0.7333369244800422, + "grad_norm": 0.5331554412841797, + "learning_rate": 3.3084510879196053e-06, + "loss": 0.2244, + "step": 32900 + }, + { + "epoch": 0.7334483738606623, + "grad_norm": 0.7708745002746582, + "learning_rate": 3.3058495882940344e-06, + "loss": 0.2691, + "step": 32905 + }, + { + "epoch": 0.7335598232412823, + "grad_norm": 0.6561907529830933, + "learning_rate": 3.303248909319633e-06, + "loss": 0.2677, + "step": 32910 + }, + { + "epoch": 0.7336712726219023, + "grad_norm": 0.9045218825340271, + "learning_rate": 3.3006490513152245e-06, + "loss": 0.2301, + "step": 32915 + }, + { + "epoch": 0.7337827220025225, + "grad_norm": 0.6481374502182007, + "learning_rate": 3.2980500145995278e-06, + "loss": 0.3318, + "step": 32920 + }, + { + "epoch": 0.7338941713831425, + "grad_norm": 0.6650016903877258, + "learning_rate": 3.2954517994911684e-06, + "loss": 0.3868, + "step": 32925 + }, + { + "epoch": 0.7340056207637626, + "grad_norm": 0.6965200901031494, + "learning_rate": 3.2928544063086697e-06, + "loss": 0.3053, + "step": 32930 + }, + { + "epoch": 0.7341170701443827, + "grad_norm": 0.7827367782592773, + "learning_rate": 3.290257835370451e-06, + "loss": 0.3013, + "step": 32935 + }, + { + "epoch": 0.7342285195250028, + "grad_norm": 0.45556485652923584, + "learning_rate": 3.2876620869948294e-06, + "loss": 0.259, + "step": 32940 + }, + { + "epoch": 0.7343399689056228, + "grad_norm": 0.8706529140472412, + "learning_rate": 3.285067161500024e-06, + "loss": 0.3253, + "step": 32945 + }, + { + "epoch": 0.7344514182862428, + "grad_norm": 0.7577581405639648, + "learning_rate": 3.2824730592041507e-06, + "loss": 0.2733, + "step": 32950 + }, + { + "epoch": 0.734562867666863, + "grad_norm": 0.7123679518699646, + "learning_rate": 3.27987978042523e-06, + "loss": 0.3404, + "step": 32955 + }, + { + "epoch": 0.734674317047483, + "grad_norm": 0.6272518038749695, + "learning_rate": 3.2772873254811787e-06, + "loss": 0.218, + "step": 32960 + }, + { + "epoch": 0.7347857664281031, + "grad_norm": 0.4759124219417572, + "learning_rate": 3.2746956946898114e-06, + "loss": 0.2832, + "step": 32965 + }, + { + "epoch": 0.7348972158087231, + "grad_norm": 0.7127870917320251, + "learning_rate": 3.2721048883688387e-06, + "loss": 0.2872, + "step": 32970 + }, + { + "epoch": 0.7350086651893432, + "grad_norm": 0.7940077185630798, + "learning_rate": 3.2695149068358765e-06, + "loss": 0.1871, + "step": 32975 + }, + { + "epoch": 0.7351201145699633, + "grad_norm": 0.6166383624076843, + "learning_rate": 3.2669257504084317e-06, + "loss": 0.2697, + "step": 32980 + }, + { + "epoch": 0.7352315639505833, + "grad_norm": 0.5002848505973816, + "learning_rate": 3.264337419403922e-06, + "loss": 0.2979, + "step": 32985 + }, + { + "epoch": 0.7353430133312034, + "grad_norm": 0.5366833209991455, + "learning_rate": 3.2617499141396504e-06, + "loss": 0.2495, + "step": 32990 + }, + { + "epoch": 0.7354544627118235, + "grad_norm": 0.890483558177948, + "learning_rate": 3.2591632349328305e-06, + "loss": 0.3691, + "step": 32995 + }, + { + "epoch": 0.7355659120924436, + "grad_norm": 0.5453323721885681, + "learning_rate": 3.2565773821005663e-06, + "loss": 0.2533, + "step": 33000 + }, + { + "epoch": 0.7356773614730636, + "grad_norm": 0.56355220079422, + "learning_rate": 3.2539923559598654e-06, + "loss": 0.1984, + "step": 33005 + }, + { + "epoch": 0.7357888108536836, + "grad_norm": 0.5132585763931274, + "learning_rate": 3.251408156827629e-06, + "loss": 0.2963, + "step": 33010 + }, + { + "epoch": 0.7359002602343038, + "grad_norm": 0.5574620366096497, + "learning_rate": 3.248824785020659e-06, + "loss": 0.3047, + "step": 33015 + }, + { + "epoch": 0.7360117096149238, + "grad_norm": 0.7912201285362244, + "learning_rate": 3.246242240855663e-06, + "loss": 0.2632, + "step": 33020 + }, + { + "epoch": 0.7361231589955439, + "grad_norm": 0.4491974115371704, + "learning_rate": 3.2436605246492337e-06, + "loss": 0.2854, + "step": 33025 + }, + { + "epoch": 0.736234608376164, + "grad_norm": 0.8769235014915466, + "learning_rate": 3.2410796367178753e-06, + "loss": 0.3817, + "step": 33030 + }, + { + "epoch": 0.736346057756784, + "grad_norm": 0.6464642286300659, + "learning_rate": 3.238499577377984e-06, + "loss": 0.3015, + "step": 33035 + }, + { + "epoch": 0.7364575071374041, + "grad_norm": 1.0190484523773193, + "learning_rate": 3.235920346945852e-06, + "loss": 0.3064, + "step": 33040 + }, + { + "epoch": 0.7365689565180241, + "grad_norm": 0.7795517444610596, + "learning_rate": 3.2333419457376734e-06, + "loss": 0.279, + "step": 33045 + }, + { + "epoch": 0.7366804058986443, + "grad_norm": 0.5025030374526978, + "learning_rate": 3.2307643740695437e-06, + "loss": 0.2187, + "step": 33050 + }, + { + "epoch": 0.7367918552792643, + "grad_norm": 0.5091804265975952, + "learning_rate": 3.228187632257452e-06, + "loss": 0.3016, + "step": 33055 + }, + { + "epoch": 0.7369033046598844, + "grad_norm": 1.0234618186950684, + "learning_rate": 3.225611720617283e-06, + "loss": 0.1708, + "step": 33060 + }, + { + "epoch": 0.7370147540405044, + "grad_norm": 0.7228109836578369, + "learning_rate": 3.223036639464829e-06, + "loss": 0.4115, + "step": 33065 + }, + { + "epoch": 0.7371262034211244, + "grad_norm": 0.5997233390808105, + "learning_rate": 3.220462389115774e-06, + "loss": 0.2367, + "step": 33070 + }, + { + "epoch": 0.7372376528017446, + "grad_norm": 0.7658345103263855, + "learning_rate": 3.2178889698856964e-06, + "loss": 0.2493, + "step": 33075 + }, + { + "epoch": 0.7373491021823646, + "grad_norm": 0.6293799877166748, + "learning_rate": 3.2153163820900844e-06, + "loss": 0.3571, + "step": 33080 + }, + { + "epoch": 0.7374605515629847, + "grad_norm": 0.7071552276611328, + "learning_rate": 3.212744626044315e-06, + "loss": 0.2756, + "step": 33085 + }, + { + "epoch": 0.7375720009436048, + "grad_norm": 0.5751235485076904, + "learning_rate": 3.2101737020636637e-06, + "loss": 0.3348, + "step": 33090 + }, + { + "epoch": 0.7376834503242248, + "grad_norm": 0.8929909467697144, + "learning_rate": 3.2076036104633048e-06, + "loss": 0.348, + "step": 33095 + }, + { + "epoch": 0.7377948997048449, + "grad_norm": 0.6280662417411804, + "learning_rate": 3.205034351558317e-06, + "loss": 0.3748, + "step": 33100 + }, + { + "epoch": 0.7379063490854649, + "grad_norm": 0.7077507972717285, + "learning_rate": 3.202465925663668e-06, + "loss": 0.3992, + "step": 33105 + }, + { + "epoch": 0.7380177984660851, + "grad_norm": 0.7908935546875, + "learning_rate": 3.1998983330942246e-06, + "loss": 0.2469, + "step": 33110 + }, + { + "epoch": 0.7381292478467051, + "grad_norm": 0.39649245142936707, + "learning_rate": 3.1973315741647605e-06, + "loss": 0.155, + "step": 33115 + }, + { + "epoch": 0.7382406972273251, + "grad_norm": 0.9612866640090942, + "learning_rate": 3.194765649189937e-06, + "loss": 0.3344, + "step": 33120 + }, + { + "epoch": 0.7383521466079452, + "grad_norm": 0.5874886512756348, + "learning_rate": 3.1922005584843163e-06, + "loss": 0.2799, + "step": 33125 + }, + { + "epoch": 0.7384635959885653, + "grad_norm": 0.523689866065979, + "learning_rate": 3.1896363023623557e-06, + "loss": 0.1934, + "step": 33130 + }, + { + "epoch": 0.7385750453691854, + "grad_norm": 0.42125797271728516, + "learning_rate": 3.18707288113842e-06, + "loss": 0.2687, + "step": 33135 + }, + { + "epoch": 0.7386864947498054, + "grad_norm": 1.1294206380844116, + "learning_rate": 3.184510295126757e-06, + "loss": 0.3863, + "step": 33140 + }, + { + "epoch": 0.7387979441304255, + "grad_norm": 0.5358449816703796, + "learning_rate": 3.1819485446415287e-06, + "loss": 0.3612, + "step": 33145 + }, + { + "epoch": 0.7389093935110456, + "grad_norm": 1.1763213872909546, + "learning_rate": 3.179387629996782e-06, + "loss": 0.3059, + "step": 33150 + }, + { + "epoch": 0.7390208428916656, + "grad_norm": 0.4582350254058838, + "learning_rate": 3.1768275515064638e-06, + "loss": 0.3229, + "step": 33155 + }, + { + "epoch": 0.7391322922722857, + "grad_norm": 0.4934980571269989, + "learning_rate": 3.1742683094844206e-06, + "loss": 0.4742, + "step": 33160 + }, + { + "epoch": 0.7392437416529057, + "grad_norm": 0.6613831520080566, + "learning_rate": 3.171709904244393e-06, + "loss": 0.2208, + "step": 33165 + }, + { + "epoch": 0.7393551910335259, + "grad_norm": 0.4670477509498596, + "learning_rate": 3.1691523361000265e-06, + "loss": 0.2436, + "step": 33170 + }, + { + "epoch": 0.7394666404141459, + "grad_norm": 0.49927234649658203, + "learning_rate": 3.1665956053648594e-06, + "loss": 0.3054, + "step": 33175 + }, + { + "epoch": 0.7395780897947659, + "grad_norm": 0.6709491610527039, + "learning_rate": 3.164039712352325e-06, + "loss": 0.2551, + "step": 33180 + }, + { + "epoch": 0.739689539175386, + "grad_norm": 0.5243662595748901, + "learning_rate": 3.1614846573757572e-06, + "loss": 0.3787, + "step": 33185 + }, + { + "epoch": 0.7398009885560061, + "grad_norm": 0.7863882184028625, + "learning_rate": 3.1589304407483844e-06, + "loss": 0.2148, + "step": 33190 + }, + { + "epoch": 0.7399124379366262, + "grad_norm": 0.5960976481437683, + "learning_rate": 3.1563770627833356e-06, + "loss": 0.4045, + "step": 33195 + }, + { + "epoch": 0.7400238873172462, + "grad_norm": 0.7527463436126709, + "learning_rate": 3.1538245237936304e-06, + "loss": 0.2915, + "step": 33200 + }, + { + "epoch": 0.7401353366978664, + "grad_norm": 0.4676433801651001, + "learning_rate": 3.1512728240921943e-06, + "loss": 0.3479, + "step": 33205 + }, + { + "epoch": 0.7402467860784864, + "grad_norm": 0.4947379231452942, + "learning_rate": 3.1487219639918487e-06, + "loss": 0.292, + "step": 33210 + }, + { + "epoch": 0.7403582354591064, + "grad_norm": 0.48669198155403137, + "learning_rate": 3.1461719438053073e-06, + "loss": 0.3316, + "step": 33215 + }, + { + "epoch": 0.7404696848397265, + "grad_norm": 0.4809311032295227, + "learning_rate": 3.143622763845181e-06, + "loss": 0.2504, + "step": 33220 + }, + { + "epoch": 0.7405811342203465, + "grad_norm": 0.6982517838478088, + "learning_rate": 3.141074424423982e-06, + "loss": 0.4138, + "step": 33225 + }, + { + "epoch": 0.7406925836009667, + "grad_norm": 0.7339484095573425, + "learning_rate": 3.138526925854112e-06, + "loss": 0.3286, + "step": 33230 + }, + { + "epoch": 0.7408040329815867, + "grad_norm": 0.5350735783576965, + "learning_rate": 3.135980268447879e-06, + "loss": 0.2829, + "step": 33235 + }, + { + "epoch": 0.7409154823622067, + "grad_norm": 0.4297383427619934, + "learning_rate": 3.1334344525174854e-06, + "loss": 0.3504, + "step": 33240 + }, + { + "epoch": 0.7410269317428269, + "grad_norm": 0.8133834004402161, + "learning_rate": 3.1308894783750265e-06, + "loss": 0.2604, + "step": 33245 + }, + { + "epoch": 0.7411383811234469, + "grad_norm": 0.5834691524505615, + "learning_rate": 3.1283453463324966e-06, + "loss": 0.2232, + "step": 33250 + }, + { + "epoch": 0.741249830504067, + "grad_norm": 1.1657015085220337, + "learning_rate": 3.1258020567017855e-06, + "loss": 0.2746, + "step": 33255 + }, + { + "epoch": 0.741361279884687, + "grad_norm": 0.6553075313568115, + "learning_rate": 3.123259609794679e-06, + "loss": 0.196, + "step": 33260 + }, + { + "epoch": 0.7414727292653072, + "grad_norm": 0.4720059633255005, + "learning_rate": 3.1207180059228657e-06, + "loss": 0.3207, + "step": 33265 + }, + { + "epoch": 0.7415841786459272, + "grad_norm": 0.7128260731697083, + "learning_rate": 3.1181772453979242e-06, + "loss": 0.2255, + "step": 33270 + }, + { + "epoch": 0.7416956280265472, + "grad_norm": 0.6636922359466553, + "learning_rate": 3.1156373285313346e-06, + "loss": 0.3006, + "step": 33275 + }, + { + "epoch": 0.7418070774071673, + "grad_norm": 0.5027670860290527, + "learning_rate": 3.113098255634469e-06, + "loss": 0.3242, + "step": 33280 + }, + { + "epoch": 0.7419185267877874, + "grad_norm": 1.0309278964996338, + "learning_rate": 3.1105600270186e-06, + "loss": 0.2365, + "step": 33285 + }, + { + "epoch": 0.7420299761684075, + "grad_norm": 0.7465946674346924, + "learning_rate": 3.108022642994892e-06, + "loss": 0.3953, + "step": 33290 + }, + { + "epoch": 0.7421414255490275, + "grad_norm": 0.4628431797027588, + "learning_rate": 3.1054861038744076e-06, + "loss": 0.3051, + "step": 33295 + }, + { + "epoch": 0.7422528749296475, + "grad_norm": 0.6248934268951416, + "learning_rate": 3.102950409968113e-06, + "loss": 0.2942, + "step": 33300 + }, + { + "epoch": 0.7423643243102677, + "grad_norm": 0.4816969335079193, + "learning_rate": 3.100415561586857e-06, + "loss": 0.1637, + "step": 33305 + }, + { + "epoch": 0.7424757736908877, + "grad_norm": 0.6795361042022705, + "learning_rate": 3.0978815590414e-06, + "loss": 0.195, + "step": 33310 + }, + { + "epoch": 0.7425872230715078, + "grad_norm": 0.8494073152542114, + "learning_rate": 3.0953484026423875e-06, + "loss": 0.2266, + "step": 33315 + }, + { + "epoch": 0.7426986724521278, + "grad_norm": 0.9777247309684753, + "learning_rate": 3.092816092700366e-06, + "loss": 0.2737, + "step": 33320 + }, + { + "epoch": 0.7428101218327479, + "grad_norm": 0.6463773250579834, + "learning_rate": 3.0902846295257715e-06, + "loss": 0.2583, + "step": 33325 + }, + { + "epoch": 0.742921571213368, + "grad_norm": 0.7945129871368408, + "learning_rate": 3.087754013428951e-06, + "loss": 0.3514, + "step": 33330 + }, + { + "epoch": 0.743033020593988, + "grad_norm": 0.6409342288970947, + "learning_rate": 3.0852242447201343e-06, + "loss": 0.2827, + "step": 33335 + }, + { + "epoch": 0.7431444699746081, + "grad_norm": 0.41631874442100525, + "learning_rate": 3.082695323709447e-06, + "loss": 0.1967, + "step": 33340 + }, + { + "epoch": 0.7432559193552282, + "grad_norm": 0.6244701743125916, + "learning_rate": 3.0801672507069237e-06, + "loss": 0.3629, + "step": 33345 + }, + { + "epoch": 0.7433673687358483, + "grad_norm": 0.5667757391929626, + "learning_rate": 3.0776400260224825e-06, + "loss": 0.3233, + "step": 33350 + }, + { + "epoch": 0.7434788181164683, + "grad_norm": 0.5378469824790955, + "learning_rate": 3.0751136499659384e-06, + "loss": 0.1566, + "step": 33355 + }, + { + "epoch": 0.7435902674970883, + "grad_norm": 0.7033135294914246, + "learning_rate": 3.072588122847012e-06, + "loss": 0.3346, + "step": 33360 + }, + { + "epoch": 0.7437017168777085, + "grad_norm": 0.7798947691917419, + "learning_rate": 3.0700634449753097e-06, + "loss": 0.2435, + "step": 33365 + }, + { + "epoch": 0.7438131662583285, + "grad_norm": 0.3774152398109436, + "learning_rate": 3.067539616660339e-06, + "loss": 0.2378, + "step": 33370 + }, + { + "epoch": 0.7439246156389486, + "grad_norm": 0.49303340911865234, + "learning_rate": 3.065016638211501e-06, + "loss": 0.2845, + "step": 33375 + }, + { + "epoch": 0.7440360650195686, + "grad_norm": 0.5648555755615234, + "learning_rate": 3.06249450993809e-06, + "loss": 0.2918, + "step": 33380 + }, + { + "epoch": 0.7441475144001887, + "grad_norm": 0.8499997854232788, + "learning_rate": 3.0599732321493025e-06, + "loss": 0.2115, + "step": 33385 + }, + { + "epoch": 0.7442589637808088, + "grad_norm": 0.6331583857536316, + "learning_rate": 3.057452805154231e-06, + "loss": 0.3151, + "step": 33390 + }, + { + "epoch": 0.7443704131614288, + "grad_norm": 0.5493272542953491, + "learning_rate": 3.054933229261857e-06, + "loss": 0.2756, + "step": 33395 + }, + { + "epoch": 0.744481862542049, + "grad_norm": 0.6646212339401245, + "learning_rate": 3.0524145047810625e-06, + "loss": 0.3277, + "step": 33400 + }, + { + "epoch": 0.744593311922669, + "grad_norm": 0.5860058069229126, + "learning_rate": 3.0498966320206213e-06, + "loss": 0.2758, + "step": 33405 + }, + { + "epoch": 0.7447047613032891, + "grad_norm": 0.6671875715255737, + "learning_rate": 3.047379611289207e-06, + "loss": 0.289, + "step": 33410 + }, + { + "epoch": 0.7448162106839091, + "grad_norm": 0.7462060451507568, + "learning_rate": 3.0448634428953837e-06, + "loss": 0.291, + "step": 33415 + }, + { + "epoch": 0.7449276600645292, + "grad_norm": 0.7190172076225281, + "learning_rate": 3.042348127147616e-06, + "loss": 0.4172, + "step": 33420 + }, + { + "epoch": 0.7450391094451493, + "grad_norm": 0.6614157557487488, + "learning_rate": 3.039833664354268e-06, + "loss": 0.3088, + "step": 33425 + }, + { + "epoch": 0.7451505588257693, + "grad_norm": 0.4733962118625641, + "learning_rate": 3.037320054823587e-06, + "loss": 0.2293, + "step": 33430 + }, + { + "epoch": 0.7452620082063894, + "grad_norm": 0.6521340608596802, + "learning_rate": 3.0348072988637235e-06, + "loss": 0.2871, + "step": 33435 + }, + { + "epoch": 0.7453734575870095, + "grad_norm": 0.6829430460929871, + "learning_rate": 3.032295396782723e-06, + "loss": 0.1454, + "step": 33440 + }, + { + "epoch": 0.7454849069676295, + "grad_norm": 0.6285884380340576, + "learning_rate": 3.0297843488885204e-06, + "loss": 0.2307, + "step": 33445 + }, + { + "epoch": 0.7455963563482496, + "grad_norm": 0.837503969669342, + "learning_rate": 3.0272741554889563e-06, + "loss": 0.2876, + "step": 33450 + }, + { + "epoch": 0.7457078057288696, + "grad_norm": 0.8544986844062805, + "learning_rate": 3.024764816891761e-06, + "loss": 0.2674, + "step": 33455 + }, + { + "epoch": 0.7458192551094898, + "grad_norm": 0.5040220618247986, + "learning_rate": 3.0222563334045596e-06, + "loss": 0.2078, + "step": 33460 + }, + { + "epoch": 0.7459307044901098, + "grad_norm": 0.7368492484092712, + "learning_rate": 3.0197487053348715e-06, + "loss": 0.3709, + "step": 33465 + }, + { + "epoch": 0.7460421538707299, + "grad_norm": 0.9807363748550415, + "learning_rate": 3.0172419329901126e-06, + "loss": 0.3531, + "step": 33470 + }, + { + "epoch": 0.7461536032513499, + "grad_norm": 0.5130937695503235, + "learning_rate": 3.0147360166775907e-06, + "loss": 0.2775, + "step": 33475 + }, + { + "epoch": 0.74626505263197, + "grad_norm": 0.49336349964141846, + "learning_rate": 3.0122309567045194e-06, + "loss": 0.2453, + "step": 33480 + }, + { + "epoch": 0.7463765020125901, + "grad_norm": 0.9051334857940674, + "learning_rate": 3.0097267533779915e-06, + "loss": 0.3578, + "step": 33485 + }, + { + "epoch": 0.7464879513932101, + "grad_norm": 0.3698303997516632, + "learning_rate": 3.007223407005011e-06, + "loss": 0.3242, + "step": 33490 + }, + { + "epoch": 0.7465994007738302, + "grad_norm": 0.6289532780647278, + "learning_rate": 3.004720917892464e-06, + "loss": 0.2575, + "step": 33495 + }, + { + "epoch": 0.7467108501544503, + "grad_norm": 0.5866841673851013, + "learning_rate": 3.002219286347138e-06, + "loss": 0.3025, + "step": 33500 + }, + { + "epoch": 0.7468222995350703, + "grad_norm": 0.7117490768432617, + "learning_rate": 2.999718512675712e-06, + "loss": 0.2121, + "step": 33505 + }, + { + "epoch": 0.7469337489156904, + "grad_norm": 0.6077593564987183, + "learning_rate": 2.997218597184759e-06, + "loss": 0.3158, + "step": 33510 + }, + { + "epoch": 0.7470451982963104, + "grad_norm": 0.6103549003601074, + "learning_rate": 2.9947195401807573e-06, + "loss": 0.2382, + "step": 33515 + }, + { + "epoch": 0.7471566476769306, + "grad_norm": 0.6339691281318665, + "learning_rate": 2.992221341970064e-06, + "loss": 0.2428, + "step": 33520 + }, + { + "epoch": 0.7472680970575506, + "grad_norm": 0.6688328385353088, + "learning_rate": 2.9897240028589447e-06, + "loss": 0.3443, + "step": 33525 + }, + { + "epoch": 0.7473795464381706, + "grad_norm": 0.7099976539611816, + "learning_rate": 2.9872275231535518e-06, + "loss": 0.3767, + "step": 33530 + }, + { + "epoch": 0.7474909958187907, + "grad_norm": 0.49107834696769714, + "learning_rate": 2.9847319031599353e-06, + "loss": 0.3656, + "step": 33535 + }, + { + "epoch": 0.7476024451994108, + "grad_norm": 0.7511234879493713, + "learning_rate": 2.9822371431840346e-06, + "loss": 0.2243, + "step": 33540 + }, + { + "epoch": 0.7477138945800309, + "grad_norm": 0.483070433139801, + "learning_rate": 2.979743243531693e-06, + "loss": 0.2865, + "step": 33545 + }, + { + "epoch": 0.7478253439606509, + "grad_norm": 0.7143993377685547, + "learning_rate": 2.9772502045086438e-06, + "loss": 0.2051, + "step": 33550 + }, + { + "epoch": 0.7479367933412711, + "grad_norm": 0.5198782086372375, + "learning_rate": 2.9747580264205077e-06, + "loss": 0.3119, + "step": 33555 + }, + { + "epoch": 0.7480482427218911, + "grad_norm": 0.8435953855514526, + "learning_rate": 2.9722667095728142e-06, + "loss": 0.2516, + "step": 33560 + }, + { + "epoch": 0.7481596921025111, + "grad_norm": 0.5863881707191467, + "learning_rate": 2.9697762542709764e-06, + "loss": 0.2087, + "step": 33565 + }, + { + "epoch": 0.7482711414831312, + "grad_norm": 0.5257229804992676, + "learning_rate": 2.967286660820302e-06, + "loss": 0.2725, + "step": 33570 + }, + { + "epoch": 0.7483825908637513, + "grad_norm": 0.2609884738922119, + "learning_rate": 2.964797929526002e-06, + "loss": 0.318, + "step": 33575 + }, + { + "epoch": 0.7484940402443714, + "grad_norm": 0.6922463178634644, + "learning_rate": 2.962310060693172e-06, + "loss": 0.3863, + "step": 33580 + }, + { + "epoch": 0.7486054896249914, + "grad_norm": 0.4755772054195404, + "learning_rate": 2.9598230546268057e-06, + "loss": 0.2543, + "step": 33585 + }, + { + "epoch": 0.7487169390056114, + "grad_norm": 0.4754803776741028, + "learning_rate": 2.9573369116317885e-06, + "loss": 0.3667, + "step": 33590 + }, + { + "epoch": 0.7488283883862316, + "grad_norm": 0.7210944294929504, + "learning_rate": 2.9548516320129085e-06, + "loss": 0.2127, + "step": 33595 + }, + { + "epoch": 0.7489398377668516, + "grad_norm": 0.5226638317108154, + "learning_rate": 2.9523672160748375e-06, + "loss": 0.2286, + "step": 33600 + }, + { + "epoch": 0.7490512871474717, + "grad_norm": 0.4636479616165161, + "learning_rate": 2.949883664122143e-06, + "loss": 0.2489, + "step": 33605 + }, + { + "epoch": 0.7491627365280917, + "grad_norm": 0.5760490894317627, + "learning_rate": 2.947400976459297e-06, + "loss": 0.3907, + "step": 33610 + }, + { + "epoch": 0.7492741859087119, + "grad_norm": 0.4980350732803345, + "learning_rate": 2.9449191533906527e-06, + "loss": 0.2128, + "step": 33615 + }, + { + "epoch": 0.7493856352893319, + "grad_norm": 0.41088926792144775, + "learning_rate": 2.9424381952204638e-06, + "loss": 0.2411, + "step": 33620 + }, + { + "epoch": 0.7494970846699519, + "grad_norm": 0.485420286655426, + "learning_rate": 2.9399581022528724e-06, + "loss": 0.2114, + "step": 33625 + }, + { + "epoch": 0.749608534050572, + "grad_norm": 0.648485004901886, + "learning_rate": 2.937478874791926e-06, + "loss": 0.4191, + "step": 33630 + }, + { + "epoch": 0.7497199834311921, + "grad_norm": 0.6990744471549988, + "learning_rate": 2.9350005131415514e-06, + "loss": 0.4039, + "step": 33635 + }, + { + "epoch": 0.7498314328118122, + "grad_norm": 0.8828688859939575, + "learning_rate": 2.932523017605585e-06, + "loss": 0.3454, + "step": 33640 + }, + { + "epoch": 0.7499428821924322, + "grad_norm": 0.4837510287761688, + "learning_rate": 2.9300463884877428e-06, + "loss": 0.2088, + "step": 33645 + }, + { + "epoch": 0.7500543315730522, + "grad_norm": 0.48634445667266846, + "learning_rate": 2.9275706260916427e-06, + "loss": 0.3017, + "step": 33650 + }, + { + "epoch": 0.7501657809536724, + "grad_norm": 0.551918625831604, + "learning_rate": 2.925095730720793e-06, + "loss": 0.1801, + "step": 33655 + }, + { + "epoch": 0.7502772303342924, + "grad_norm": 0.7558825016021729, + "learning_rate": 2.922621702678593e-06, + "loss": 0.3125, + "step": 33660 + }, + { + "epoch": 0.7503886797149125, + "grad_norm": 0.5769863724708557, + "learning_rate": 2.920148542268344e-06, + "loss": 0.2318, + "step": 33665 + }, + { + "epoch": 0.7505001290955325, + "grad_norm": 0.9255049824714661, + "learning_rate": 2.9176762497932376e-06, + "loss": 0.3873, + "step": 33670 + }, + { + "epoch": 0.7506115784761526, + "grad_norm": 0.3768719732761383, + "learning_rate": 2.915204825556357e-06, + "loss": 0.2289, + "step": 33675 + }, + { + "epoch": 0.7507230278567727, + "grad_norm": 0.6153138279914856, + "learning_rate": 2.912734269860679e-06, + "loss": 0.2159, + "step": 33680 + }, + { + "epoch": 0.7508344772373927, + "grad_norm": 0.726574718952179, + "learning_rate": 2.9102645830090724e-06, + "loss": 0.3907, + "step": 33685 + }, + { + "epoch": 0.7509459266180128, + "grad_norm": 0.8838280439376831, + "learning_rate": 2.9077957653043022e-06, + "loss": 0.3025, + "step": 33690 + }, + { + "epoch": 0.7510573759986329, + "grad_norm": 0.5469415187835693, + "learning_rate": 2.9053278170490262e-06, + "loss": 0.2113, + "step": 33695 + }, + { + "epoch": 0.751168825379253, + "grad_norm": 0.7691811919212341, + "learning_rate": 2.902860738545802e-06, + "loss": 0.2403, + "step": 33700 + }, + { + "epoch": 0.751280274759873, + "grad_norm": 0.47786420583724976, + "learning_rate": 2.9003945300970683e-06, + "loss": 0.2594, + "step": 33705 + }, + { + "epoch": 0.751391724140493, + "grad_norm": 0.5924757719039917, + "learning_rate": 2.8979291920051655e-06, + "loss": 0.3142, + "step": 33710 + }, + { + "epoch": 0.7515031735211132, + "grad_norm": 0.5335355997085571, + "learning_rate": 2.8954647245723245e-06, + "loss": 0.2587, + "step": 33715 + }, + { + "epoch": 0.7516146229017332, + "grad_norm": 0.4959110915660858, + "learning_rate": 2.8930011281006685e-06, + "loss": 0.1869, + "step": 33720 + }, + { + "epoch": 0.7517260722823533, + "grad_norm": 0.8827904462814331, + "learning_rate": 2.8905384028922144e-06, + "loss": 0.4612, + "step": 33725 + }, + { + "epoch": 0.7518375216629734, + "grad_norm": 0.3391810655593872, + "learning_rate": 2.888076549248876e-06, + "loss": 0.2682, + "step": 33730 + }, + { + "epoch": 0.7519489710435934, + "grad_norm": 0.9731943607330322, + "learning_rate": 2.8856155674724595e-06, + "loss": 0.2434, + "step": 33735 + }, + { + "epoch": 0.7520604204242135, + "grad_norm": 0.41712039709091187, + "learning_rate": 2.883155457864659e-06, + "loss": 0.3726, + "step": 33740 + }, + { + "epoch": 0.7521718698048335, + "grad_norm": 0.7917229533195496, + "learning_rate": 2.8806962207270673e-06, + "loss": 0.3029, + "step": 33745 + }, + { + "epoch": 0.7522833191854537, + "grad_norm": 0.7232569456100464, + "learning_rate": 2.878237856361166e-06, + "loss": 0.3067, + "step": 33750 + }, + { + "epoch": 0.7523947685660737, + "grad_norm": 0.576991081237793, + "learning_rate": 2.875780365068328e-06, + "loss": 0.3046, + "step": 33755 + }, + { + "epoch": 0.7525062179466938, + "grad_norm": 0.47883665561676025, + "learning_rate": 2.87332374714983e-06, + "loss": 0.2504, + "step": 33760 + }, + { + "epoch": 0.7526176673273138, + "grad_norm": 0.4215628206729889, + "learning_rate": 2.870868002906828e-06, + "loss": 0.184, + "step": 33765 + }, + { + "epoch": 0.7527291167079339, + "grad_norm": 0.6791629791259766, + "learning_rate": 2.868413132640384e-06, + "loss": 0.2995, + "step": 33770 + }, + { + "epoch": 0.752840566088554, + "grad_norm": 0.5074769258499146, + "learning_rate": 2.8659591366514426e-06, + "loss": 0.2565, + "step": 33775 + }, + { + "epoch": 0.752952015469174, + "grad_norm": 0.7514712810516357, + "learning_rate": 2.8635060152408446e-06, + "loss": 0.2893, + "step": 33780 + }, + { + "epoch": 0.7530634648497941, + "grad_norm": 0.7852396368980408, + "learning_rate": 2.86105376870932e-06, + "loss": 0.3205, + "step": 33785 + }, + { + "epoch": 0.7531749142304142, + "grad_norm": 0.42000624537467957, + "learning_rate": 2.8586023973575027e-06, + "loss": 0.3827, + "step": 33790 + }, + { + "epoch": 0.7532863636110342, + "grad_norm": 0.23319512605667114, + "learning_rate": 2.8561519014859087e-06, + "loss": 0.2055, + "step": 33795 + }, + { + "epoch": 0.7533978129916543, + "grad_norm": 0.5697981715202332, + "learning_rate": 2.8537022813949456e-06, + "loss": 0.1737, + "step": 33800 + }, + { + "epoch": 0.7535092623722743, + "grad_norm": 0.5846883058547974, + "learning_rate": 2.851253537384926e-06, + "loss": 0.2479, + "step": 33805 + }, + { + "epoch": 0.7536207117528945, + "grad_norm": 0.56658536195755, + "learning_rate": 2.848805669756042e-06, + "loss": 0.2692, + "step": 33810 + }, + { + "epoch": 0.7537321611335145, + "grad_norm": 0.3844006359577179, + "learning_rate": 2.8463586788083843e-06, + "loss": 0.2243, + "step": 33815 + }, + { + "epoch": 0.7538436105141346, + "grad_norm": 0.5789145827293396, + "learning_rate": 2.843912564841932e-06, + "loss": 0.115, + "step": 33820 + }, + { + "epoch": 0.7539550598947546, + "grad_norm": 0.5536205172538757, + "learning_rate": 2.8414673281565663e-06, + "loss": 0.2793, + "step": 33825 + }, + { + "epoch": 0.7540665092753747, + "grad_norm": 0.8552709221839905, + "learning_rate": 2.83902296905205e-06, + "loss": 0.2786, + "step": 33830 + }, + { + "epoch": 0.7541779586559948, + "grad_norm": 0.5636184215545654, + "learning_rate": 2.8365794878280407e-06, + "loss": 0.3251, + "step": 33835 + }, + { + "epoch": 0.7542894080366148, + "grad_norm": 0.7049769759178162, + "learning_rate": 2.8341368847840968e-06, + "loss": 0.2594, + "step": 33840 + }, + { + "epoch": 0.754400857417235, + "grad_norm": 0.661155104637146, + "learning_rate": 2.8316951602196574e-06, + "loss": 0.2518, + "step": 33845 + }, + { + "epoch": 0.754512306797855, + "grad_norm": 0.5453686118125916, + "learning_rate": 2.829254314434059e-06, + "loss": 0.3564, + "step": 33850 + }, + { + "epoch": 0.754623756178475, + "grad_norm": 0.5124586224555969, + "learning_rate": 2.8268143477265343e-06, + "loss": 0.2007, + "step": 33855 + }, + { + "epoch": 0.7547352055590951, + "grad_norm": 0.7682088613510132, + "learning_rate": 2.8243752603962014e-06, + "loss": 0.3431, + "step": 33860 + }, + { + "epoch": 0.7548466549397151, + "grad_norm": 0.7964093685150146, + "learning_rate": 2.821937052742075e-06, + "loss": 0.2964, + "step": 33865 + }, + { + "epoch": 0.7549581043203353, + "grad_norm": 0.6934346556663513, + "learning_rate": 2.8194997250630574e-06, + "loss": 0.2396, + "step": 33870 + }, + { + "epoch": 0.7550695537009553, + "grad_norm": 0.6436589956283569, + "learning_rate": 2.817063277657951e-06, + "loss": 0.2745, + "step": 33875 + }, + { + "epoch": 0.7551810030815753, + "grad_norm": 0.7439756989479065, + "learning_rate": 2.8146277108254396e-06, + "loss": 0.3674, + "step": 33880 + }, + { + "epoch": 0.7552924524621955, + "grad_norm": 0.6890519857406616, + "learning_rate": 2.8121930248641117e-06, + "loss": 0.235, + "step": 33885 + }, + { + "epoch": 0.7554039018428155, + "grad_norm": 0.8035330176353455, + "learning_rate": 2.809759220072438e-06, + "loss": 0.2283, + "step": 33890 + }, + { + "epoch": 0.7555153512234356, + "grad_norm": 0.7052496075630188, + "learning_rate": 2.8073262967487825e-06, + "loss": 0.3007, + "step": 33895 + }, + { + "epoch": 0.7556268006040556, + "grad_norm": 0.744096577167511, + "learning_rate": 2.804894255191405e-06, + "loss": 0.1672, + "step": 33900 + }, + { + "epoch": 0.7557382499846758, + "grad_norm": 0.8351132869720459, + "learning_rate": 2.802463095698451e-06, + "loss": 0.2797, + "step": 33905 + }, + { + "epoch": 0.7558496993652958, + "grad_norm": 0.47542646527290344, + "learning_rate": 2.8000328185679683e-06, + "loss": 0.483, + "step": 33910 + }, + { + "epoch": 0.7559611487459158, + "grad_norm": 0.5174663066864014, + "learning_rate": 2.7976034240978834e-06, + "loss": 0.2212, + "step": 33915 + }, + { + "epoch": 0.7560725981265359, + "grad_norm": 0.7283490896224976, + "learning_rate": 2.7951749125860274e-06, + "loss": 0.2985, + "step": 33920 + }, + { + "epoch": 0.756184047507156, + "grad_norm": 0.5224509239196777, + "learning_rate": 2.792747284330115e-06, + "loss": 0.2678, + "step": 33925 + }, + { + "epoch": 0.7562954968877761, + "grad_norm": 0.4216907024383545, + "learning_rate": 2.7903205396277546e-06, + "loss": 0.1627, + "step": 33930 + }, + { + "epoch": 0.7564069462683961, + "grad_norm": 0.5221124291419983, + "learning_rate": 2.787894678776445e-06, + "loss": 0.2285, + "step": 33935 + }, + { + "epoch": 0.7565183956490161, + "grad_norm": 0.7804731726646423, + "learning_rate": 2.7854697020735757e-06, + "loss": 0.3794, + "step": 33940 + }, + { + "epoch": 0.7566298450296363, + "grad_norm": 0.5530830025672913, + "learning_rate": 2.783045609816434e-06, + "loss": 0.2065, + "step": 33945 + }, + { + "epoch": 0.7567412944102563, + "grad_norm": 1.1412625312805176, + "learning_rate": 2.7806224023021966e-06, + "loss": 0.3147, + "step": 33950 + }, + { + "epoch": 0.7568527437908764, + "grad_norm": 0.6325271129608154, + "learning_rate": 2.7782000798279276e-06, + "loss": 0.2436, + "step": 33955 + }, + { + "epoch": 0.7569641931714964, + "grad_norm": 0.5092622637748718, + "learning_rate": 2.775778642690585e-06, + "loss": 0.2813, + "step": 33960 + }, + { + "epoch": 0.7570756425521166, + "grad_norm": 0.6763238310813904, + "learning_rate": 2.7733580911870195e-06, + "loss": 0.2637, + "step": 33965 + }, + { + "epoch": 0.7571870919327366, + "grad_norm": 0.40753841400146484, + "learning_rate": 2.770938425613967e-06, + "loss": 0.2771, + "step": 33970 + }, + { + "epoch": 0.7572985413133566, + "grad_norm": 0.5731813311576843, + "learning_rate": 2.768519646268066e-06, + "loss": 0.2672, + "step": 33975 + }, + { + "epoch": 0.7574099906939767, + "grad_norm": 0.6931194067001343, + "learning_rate": 2.7661017534458366e-06, + "loss": 0.2182, + "step": 33980 + }, + { + "epoch": 0.7575214400745968, + "grad_norm": 0.7261427640914917, + "learning_rate": 2.7636847474436968e-06, + "loss": 0.2065, + "step": 33985 + }, + { + "epoch": 0.7576328894552169, + "grad_norm": 0.5500878095626831, + "learning_rate": 2.7612686285579515e-06, + "loss": 0.1377, + "step": 33990 + }, + { + "epoch": 0.7577443388358369, + "grad_norm": 0.6357752680778503, + "learning_rate": 2.758853397084799e-06, + "loss": 0.1856, + "step": 33995 + }, + { + "epoch": 0.7578557882164569, + "grad_norm": 0.5131103992462158, + "learning_rate": 2.756439053320322e-06, + "loss": 0.2762, + "step": 34000 + }, + { + "epoch": 0.7579672375970771, + "grad_norm": 0.7534242868423462, + "learning_rate": 2.7540255975605103e-06, + "loss": 0.2826, + "step": 34005 + }, + { + "epoch": 0.7580786869776971, + "grad_norm": 0.541512668132782, + "learning_rate": 2.751613030101229e-06, + "loss": 0.3079, + "step": 34010 + }, + { + "epoch": 0.7581901363583172, + "grad_norm": 0.83238685131073, + "learning_rate": 2.7492013512382376e-06, + "loss": 0.3747, + "step": 34015 + }, + { + "epoch": 0.7583015857389372, + "grad_norm": 0.4771921634674072, + "learning_rate": 2.7467905612671975e-06, + "loss": 0.2614, + "step": 34020 + }, + { + "epoch": 0.7584130351195574, + "grad_norm": 0.8062953948974609, + "learning_rate": 2.7443806604836474e-06, + "loss": 0.2927, + "step": 34025 + }, + { + "epoch": 0.7585244845001774, + "grad_norm": 1.1034796237945557, + "learning_rate": 2.741971649183024e-06, + "loss": 0.2625, + "step": 34030 + }, + { + "epoch": 0.7586359338807974, + "grad_norm": 0.43950408697128296, + "learning_rate": 2.739563527660649e-06, + "loss": 0.1446, + "step": 34035 + }, + { + "epoch": 0.7587473832614176, + "grad_norm": 0.7604075074195862, + "learning_rate": 2.737156296211748e-06, + "loss": 0.2146, + "step": 34040 + }, + { + "epoch": 0.7588588326420376, + "grad_norm": 0.5220000147819519, + "learning_rate": 2.734749955131424e-06, + "loss": 0.2342, + "step": 34045 + }, + { + "epoch": 0.7589702820226577, + "grad_norm": 0.44721153378486633, + "learning_rate": 2.7323445047146724e-06, + "loss": 0.3044, + "step": 34050 + }, + { + "epoch": 0.7590817314032777, + "grad_norm": 0.3938583731651306, + "learning_rate": 2.7299399452563912e-06, + "loss": 0.3128, + "step": 34055 + }, + { + "epoch": 0.7591931807838977, + "grad_norm": 0.588921070098877, + "learning_rate": 2.727536277051356e-06, + "loss": 0.2765, + "step": 34060 + }, + { + "epoch": 0.7593046301645179, + "grad_norm": 0.4165632128715515, + "learning_rate": 2.725133500394235e-06, + "loss": 0.2777, + "step": 34065 + }, + { + "epoch": 0.7594160795451379, + "grad_norm": 0.638014554977417, + "learning_rate": 2.722731615579597e-06, + "loss": 0.3232, + "step": 34070 + }, + { + "epoch": 0.759527528925758, + "grad_norm": 0.6121829152107239, + "learning_rate": 2.720330622901891e-06, + "loss": 0.2997, + "step": 34075 + }, + { + "epoch": 0.759638978306378, + "grad_norm": 0.6752800941467285, + "learning_rate": 2.7179305226554597e-06, + "loss": 0.3141, + "step": 34080 + }, + { + "epoch": 0.7597504276869981, + "grad_norm": 0.481235533952713, + "learning_rate": 2.7155313151345354e-06, + "loss": 0.2172, + "step": 34085 + }, + { + "epoch": 0.7598618770676182, + "grad_norm": 0.4340780973434448, + "learning_rate": 2.7131330006332467e-06, + "loss": 0.2937, + "step": 34090 + }, + { + "epoch": 0.7599733264482382, + "grad_norm": 0.2692956328392029, + "learning_rate": 2.7107355794456026e-06, + "loss": 0.3429, + "step": 34095 + }, + { + "epoch": 0.7600847758288584, + "grad_norm": 0.5197578072547913, + "learning_rate": 2.7083390518655163e-06, + "loss": 0.2589, + "step": 34100 + }, + { + "epoch": 0.7601962252094784, + "grad_norm": 0.4632870852947235, + "learning_rate": 2.7059434181867783e-06, + "loss": 0.3697, + "step": 34105 + }, + { + "epoch": 0.7603076745900985, + "grad_norm": 0.5069155097007751, + "learning_rate": 2.7035486787030753e-06, + "loss": 0.2879, + "step": 34110 + }, + { + "epoch": 0.7604191239707185, + "grad_norm": 0.2655554711818695, + "learning_rate": 2.7011548337079853e-06, + "loss": 0.2065, + "step": 34115 + }, + { + "epoch": 0.7605305733513386, + "grad_norm": 0.7087981104850769, + "learning_rate": 2.6987618834949715e-06, + "loss": 0.3412, + "step": 34120 + }, + { + "epoch": 0.7606420227319587, + "grad_norm": 0.4668249487876892, + "learning_rate": 2.6963698283573958e-06, + "loss": 0.274, + "step": 34125 + }, + { + "epoch": 0.7607534721125787, + "grad_norm": 0.7287569642066956, + "learning_rate": 2.6939786685885016e-06, + "loss": 0.309, + "step": 34130 + }, + { + "epoch": 0.7608649214931988, + "grad_norm": 0.5991645455360413, + "learning_rate": 2.691588404481431e-06, + "loss": 0.2946, + "step": 34135 + }, + { + "epoch": 0.7609763708738189, + "grad_norm": 0.482779324054718, + "learning_rate": 2.6891990363292107e-06, + "loss": 0.2796, + "step": 34140 + }, + { + "epoch": 0.7610878202544389, + "grad_norm": 0.5827972888946533, + "learning_rate": 2.686810564424758e-06, + "loss": 0.1942, + "step": 34145 + }, + { + "epoch": 0.761199269635059, + "grad_norm": 0.673029899597168, + "learning_rate": 2.6844229890608796e-06, + "loss": 0.3049, + "step": 34150 + }, + { + "epoch": 0.761310719015679, + "grad_norm": 0.5888857841491699, + "learning_rate": 2.682036310530273e-06, + "loss": 0.2724, + "step": 34155 + }, + { + "epoch": 0.7614221683962992, + "grad_norm": 0.6745133399963379, + "learning_rate": 2.679650529125528e-06, + "loss": 0.2954, + "step": 34160 + }, + { + "epoch": 0.7615336177769192, + "grad_norm": 0.6212778091430664, + "learning_rate": 2.6772656451391277e-06, + "loss": 0.3062, + "step": 34165 + }, + { + "epoch": 0.7616450671575393, + "grad_norm": 0.9315680861473083, + "learning_rate": 2.6748816588634363e-06, + "loss": 0.3048, + "step": 34170 + }, + { + "epoch": 0.7617565165381593, + "grad_norm": 0.6840492486953735, + "learning_rate": 2.6724985705907114e-06, + "loss": 0.2465, + "step": 34175 + }, + { + "epoch": 0.7618679659187794, + "grad_norm": 0.7086824774742126, + "learning_rate": 2.670116380613104e-06, + "loss": 0.3416, + "step": 34180 + }, + { + "epoch": 0.7619794152993995, + "grad_norm": 0.330198734998703, + "learning_rate": 2.667735089222645e-06, + "loss": 0.1631, + "step": 34185 + }, + { + "epoch": 0.7620908646800195, + "grad_norm": 0.9546667337417603, + "learning_rate": 2.6653546967112687e-06, + "loss": 0.3951, + "step": 34190 + }, + { + "epoch": 0.7622023140606397, + "grad_norm": 0.6984142065048218, + "learning_rate": 2.6629752033707956e-06, + "loss": 0.1967, + "step": 34195 + }, + { + "epoch": 0.7623137634412597, + "grad_norm": 0.7566418051719666, + "learning_rate": 2.660596609492928e-06, + "loss": 0.3162, + "step": 34200 + }, + { + "epoch": 0.7624252128218797, + "grad_norm": 0.680320143699646, + "learning_rate": 2.658218915369265e-06, + "loss": 0.2, + "step": 34205 + }, + { + "epoch": 0.7625366622024998, + "grad_norm": 0.5272489786148071, + "learning_rate": 2.6558421212912945e-06, + "loss": 0.267, + "step": 34210 + }, + { + "epoch": 0.7626481115831198, + "grad_norm": 0.8857879042625427, + "learning_rate": 2.6534662275503877e-06, + "loss": 0.3184, + "step": 34215 + }, + { + "epoch": 0.76275956096374, + "grad_norm": 0.6333488821983337, + "learning_rate": 2.651091234437817e-06, + "loss": 0.2489, + "step": 34220 + }, + { + "epoch": 0.76287101034436, + "grad_norm": 0.5380803346633911, + "learning_rate": 2.6487171422447344e-06, + "loss": 0.1641, + "step": 34225 + }, + { + "epoch": 0.7629824597249801, + "grad_norm": 0.9589352011680603, + "learning_rate": 2.646343951262189e-06, + "loss": 0.3971, + "step": 34230 + }, + { + "epoch": 0.7630939091056002, + "grad_norm": 0.3139306604862213, + "learning_rate": 2.643971661781114e-06, + "loss": 0.2628, + "step": 34235 + }, + { + "epoch": 0.7632053584862202, + "grad_norm": 0.7108866572380066, + "learning_rate": 2.641600274092334e-06, + "loss": 0.2655, + "step": 34240 + }, + { + "epoch": 0.7633168078668403, + "grad_norm": 0.7292156219482422, + "learning_rate": 2.6392297884865613e-06, + "loss": 0.2955, + "step": 34245 + }, + { + "epoch": 0.7634282572474603, + "grad_norm": 0.3986571729183197, + "learning_rate": 2.636860205254398e-06, + "loss": 0.3701, + "step": 34250 + }, + { + "epoch": 0.7635397066280805, + "grad_norm": 0.41875624656677246, + "learning_rate": 2.634491524686341e-06, + "loss": 0.3138, + "step": 34255 + }, + { + "epoch": 0.7636511560087005, + "grad_norm": 0.8360694050788879, + "learning_rate": 2.6321237470727678e-06, + "loss": 0.2779, + "step": 34260 + }, + { + "epoch": 0.7637626053893205, + "grad_norm": 0.3968079090118408, + "learning_rate": 2.6297568727039547e-06, + "loss": 0.3343, + "step": 34265 + }, + { + "epoch": 0.7638740547699406, + "grad_norm": 0.8168555498123169, + "learning_rate": 2.6273909018700595e-06, + "loss": 0.3282, + "step": 34270 + }, + { + "epoch": 0.7639855041505607, + "grad_norm": 0.8862971663475037, + "learning_rate": 2.625025834861131e-06, + "loss": 0.247, + "step": 34275 + }, + { + "epoch": 0.7640969535311808, + "grad_norm": 0.524175763130188, + "learning_rate": 2.6226616719671074e-06, + "loss": 0.2793, + "step": 34280 + }, + { + "epoch": 0.7642084029118008, + "grad_norm": 0.9022205471992493, + "learning_rate": 2.620298413477821e-06, + "loss": 0.2865, + "step": 34285 + }, + { + "epoch": 0.7643198522924208, + "grad_norm": 0.5455117225646973, + "learning_rate": 2.617936059682986e-06, + "loss": 0.3403, + "step": 34290 + }, + { + "epoch": 0.764431301673041, + "grad_norm": 0.7895170450210571, + "learning_rate": 2.615574610872208e-06, + "loss": 0.3012, + "step": 34295 + }, + { + "epoch": 0.764542751053661, + "grad_norm": 0.4839140474796295, + "learning_rate": 2.6132140673349847e-06, + "loss": 0.3254, + "step": 34300 + }, + { + "epoch": 0.7646542004342811, + "grad_norm": 0.6854641437530518, + "learning_rate": 2.610854429360701e-06, + "loss": 0.301, + "step": 34305 + }, + { + "epoch": 0.7647656498149011, + "grad_norm": 0.6549224853515625, + "learning_rate": 2.6084956972386245e-06, + "loss": 0.3275, + "step": 34310 + }, + { + "epoch": 0.7648770991955213, + "grad_norm": 0.5638486742973328, + "learning_rate": 2.6061378712579255e-06, + "loss": 0.228, + "step": 34315 + }, + { + "epoch": 0.7649885485761413, + "grad_norm": 0.526157557964325, + "learning_rate": 2.603780951707652e-06, + "loss": 0.2646, + "step": 34320 + }, + { + "epoch": 0.7650999979567613, + "grad_norm": 0.5761968493461609, + "learning_rate": 2.6014249388767433e-06, + "loss": 0.2787, + "step": 34325 + }, + { + "epoch": 0.7652114473373814, + "grad_norm": 0.7018436193466187, + "learning_rate": 2.5990698330540267e-06, + "loss": 0.3231, + "step": 34330 + }, + { + "epoch": 0.7653228967180015, + "grad_norm": 0.6506215929985046, + "learning_rate": 2.596715634528224e-06, + "loss": 0.2319, + "step": 34335 + }, + { + "epoch": 0.7654343460986216, + "grad_norm": 0.5280274152755737, + "learning_rate": 2.594362343587941e-06, + "loss": 0.29, + "step": 34340 + }, + { + "epoch": 0.7655457954792416, + "grad_norm": 0.6760926246643066, + "learning_rate": 2.5920099605216677e-06, + "loss": 0.2271, + "step": 34345 + }, + { + "epoch": 0.7656572448598616, + "grad_norm": 0.6700071692466736, + "learning_rate": 2.5896584856177954e-06, + "loss": 0.3054, + "step": 34350 + }, + { + "epoch": 0.7657686942404818, + "grad_norm": 0.5384727120399475, + "learning_rate": 2.5873079191645944e-06, + "loss": 0.2114, + "step": 34355 + }, + { + "epoch": 0.7658801436211018, + "grad_norm": 0.6928741335868835, + "learning_rate": 2.5849582614502255e-06, + "loss": 0.2694, + "step": 34360 + }, + { + "epoch": 0.7659915930017219, + "grad_norm": 0.7100231647491455, + "learning_rate": 2.582609512762735e-06, + "loss": 0.2787, + "step": 34365 + }, + { + "epoch": 0.766103042382342, + "grad_norm": 0.5652828812599182, + "learning_rate": 2.5802616733900674e-06, + "loss": 0.3399, + "step": 34370 + }, + { + "epoch": 0.7662144917629621, + "grad_norm": 0.6810146570205688, + "learning_rate": 2.5779147436200448e-06, + "loss": 0.2125, + "step": 34375 + }, + { + "epoch": 0.7663259411435821, + "grad_norm": 0.5119982361793518, + "learning_rate": 2.5755687237403882e-06, + "loss": 0.2988, + "step": 34380 + }, + { + "epoch": 0.7664373905242021, + "grad_norm": 0.7045231461524963, + "learning_rate": 2.573223614038697e-06, + "loss": 0.235, + "step": 34385 + }, + { + "epoch": 0.7665488399048223, + "grad_norm": 0.6490983963012695, + "learning_rate": 2.5708794148024654e-06, + "loss": 0.2585, + "step": 34390 + }, + { + "epoch": 0.7666602892854423, + "grad_norm": 0.7183946967124939, + "learning_rate": 2.5685361263190735e-06, + "loss": 0.3308, + "step": 34395 + }, + { + "epoch": 0.7667717386660624, + "grad_norm": 0.7707793116569519, + "learning_rate": 2.5661937488757883e-06, + "loss": 0.2189, + "step": 34400 + }, + { + "epoch": 0.7668831880466824, + "grad_norm": 0.335593581199646, + "learning_rate": 2.563852282759768e-06, + "loss": 0.2147, + "step": 34405 + }, + { + "epoch": 0.7669946374273025, + "grad_norm": 0.4331473410129547, + "learning_rate": 2.5615117282580627e-06, + "loss": 0.2491, + "step": 34410 + }, + { + "epoch": 0.7671060868079226, + "grad_norm": 0.8042201399803162, + "learning_rate": 2.559172085657604e-06, + "loss": 0.3988, + "step": 34415 + }, + { + "epoch": 0.7672175361885426, + "grad_norm": 0.5171744227409363, + "learning_rate": 2.556833355245213e-06, + "loss": 0.2492, + "step": 34420 + }, + { + "epoch": 0.7673289855691627, + "grad_norm": 0.6053786873817444, + "learning_rate": 2.5544955373075997e-06, + "loss": 0.3404, + "step": 34425 + }, + { + "epoch": 0.7674404349497828, + "grad_norm": 0.437044620513916, + "learning_rate": 2.552158632131363e-06, + "loss": 0.2578, + "step": 34430 + }, + { + "epoch": 0.7675518843304028, + "grad_norm": 0.8940054178237915, + "learning_rate": 2.5498226400029867e-06, + "loss": 0.2197, + "step": 34435 + }, + { + "epoch": 0.7676633337110229, + "grad_norm": 0.7202461957931519, + "learning_rate": 2.547487561208849e-06, + "loss": 0.2935, + "step": 34440 + }, + { + "epoch": 0.7677747830916429, + "grad_norm": 0.597625732421875, + "learning_rate": 2.545153396035214e-06, + "loss": 0.2747, + "step": 34445 + }, + { + "epoch": 0.7678862324722631, + "grad_norm": 0.5841318368911743, + "learning_rate": 2.54282014476823e-06, + "loss": 0.2042, + "step": 34450 + }, + { + "epoch": 0.7679976818528831, + "grad_norm": 0.7554879784584045, + "learning_rate": 2.5404878076939364e-06, + "loss": 0.3231, + "step": 34455 + }, + { + "epoch": 0.7681091312335032, + "grad_norm": 0.4223545789718628, + "learning_rate": 2.538156385098258e-06, + "loss": 0.2919, + "step": 34460 + }, + { + "epoch": 0.7682205806141232, + "grad_norm": 0.469969242811203, + "learning_rate": 2.5358258772670073e-06, + "loss": 0.2142, + "step": 34465 + }, + { + "epoch": 0.7683320299947433, + "grad_norm": 0.563951313495636, + "learning_rate": 2.5334962844858933e-06, + "loss": 0.2619, + "step": 34470 + }, + { + "epoch": 0.7684434793753634, + "grad_norm": 0.546368420124054, + "learning_rate": 2.531167607040499e-06, + "loss": 0.2622, + "step": 34475 + }, + { + "epoch": 0.7685549287559834, + "grad_norm": 0.5263521671295166, + "learning_rate": 2.5288398452163075e-06, + "loss": 0.2952, + "step": 34480 + }, + { + "epoch": 0.7686663781366035, + "grad_norm": 0.8289400935173035, + "learning_rate": 2.526512999298684e-06, + "loss": 0.2693, + "step": 34485 + }, + { + "epoch": 0.7687778275172236, + "grad_norm": 0.7360613942146301, + "learning_rate": 2.5241870695728788e-06, + "loss": 0.2953, + "step": 34490 + }, + { + "epoch": 0.7688892768978436, + "grad_norm": 0.689541220664978, + "learning_rate": 2.5218620563240324e-06, + "loss": 0.2033, + "step": 34495 + }, + { + "epoch": 0.7690007262784637, + "grad_norm": 0.6824425458908081, + "learning_rate": 2.5195379598371785e-06, + "loss": 0.2722, + "step": 34500 + }, + { + "epoch": 0.7691121756590837, + "grad_norm": 0.5263447761535645, + "learning_rate": 2.51721478039723e-06, + "loss": 0.3732, + "step": 34505 + }, + { + "epoch": 0.7692236250397039, + "grad_norm": 0.39247003197669983, + "learning_rate": 2.514892518288988e-06, + "loss": 0.3426, + "step": 34510 + }, + { + "epoch": 0.7693350744203239, + "grad_norm": 0.6696521639823914, + "learning_rate": 2.5125711737971514e-06, + "loss": 0.3763, + "step": 34515 + }, + { + "epoch": 0.769446523800944, + "grad_norm": 1.1758371591567993, + "learning_rate": 2.510250747206294e-06, + "loss": 0.1982, + "step": 34520 + }, + { + "epoch": 0.769557973181564, + "grad_norm": 0.5801847577095032, + "learning_rate": 2.5079312388008825e-06, + "loss": 0.3051, + "step": 34525 + }, + { + "epoch": 0.7696694225621841, + "grad_norm": 0.5518688559532166, + "learning_rate": 2.5056126488652686e-06, + "loss": 0.3729, + "step": 34530 + }, + { + "epoch": 0.7697808719428042, + "grad_norm": 0.604720413684845, + "learning_rate": 2.503294977683699e-06, + "loss": 0.4624, + "step": 34535 + }, + { + "epoch": 0.7698923213234242, + "grad_norm": 0.9135832190513611, + "learning_rate": 2.5009782255403003e-06, + "loss": 0.4127, + "step": 34540 + }, + { + "epoch": 0.7700037707040444, + "grad_norm": 0.5589683055877686, + "learning_rate": 2.4986623927190834e-06, + "loss": 0.3683, + "step": 34545 + }, + { + "epoch": 0.7701152200846644, + "grad_norm": 0.9304735660552979, + "learning_rate": 2.49634747950396e-06, + "loss": 0.1766, + "step": 34550 + }, + { + "epoch": 0.7702266694652844, + "grad_norm": 0.6019694805145264, + "learning_rate": 2.4940334861787154e-06, + "loss": 0.266, + "step": 34555 + }, + { + "epoch": 0.7703381188459045, + "grad_norm": 0.35014772415161133, + "learning_rate": 2.4917204130270255e-06, + "loss": 0.2729, + "step": 34560 + }, + { + "epoch": 0.7704495682265246, + "grad_norm": 0.44610536098480225, + "learning_rate": 2.489408260332461e-06, + "loss": 0.2708, + "step": 34565 + }, + { + "epoch": 0.7705610176071447, + "grad_norm": 0.5490642786026001, + "learning_rate": 2.48709702837847e-06, + "loss": 0.2285, + "step": 34570 + }, + { + "epoch": 0.7706724669877647, + "grad_norm": 0.5669086575508118, + "learning_rate": 2.484786717448394e-06, + "loss": 0.1863, + "step": 34575 + }, + { + "epoch": 0.7707839163683848, + "grad_norm": 0.5582325458526611, + "learning_rate": 2.4824773278254543e-06, + "loss": 0.369, + "step": 34580 + }, + { + "epoch": 0.7708953657490049, + "grad_norm": 0.39789506793022156, + "learning_rate": 2.480168859792771e-06, + "loss": 0.3403, + "step": 34585 + }, + { + "epoch": 0.7710068151296249, + "grad_norm": 0.5807655453681946, + "learning_rate": 2.4778613136333373e-06, + "loss": 0.2493, + "step": 34590 + }, + { + "epoch": 0.771118264510245, + "grad_norm": 0.6753233075141907, + "learning_rate": 2.4755546896300484e-06, + "loss": 0.1841, + "step": 34595 + }, + { + "epoch": 0.771229713890865, + "grad_norm": 0.6643304228782654, + "learning_rate": 2.4732489880656753e-06, + "loss": 0.3048, + "step": 34600 + }, + { + "epoch": 0.7713411632714852, + "grad_norm": 0.6019219756126404, + "learning_rate": 2.4709442092228773e-06, + "loss": 0.3022, + "step": 34605 + }, + { + "epoch": 0.7714526126521052, + "grad_norm": 0.7255983352661133, + "learning_rate": 2.468640353384205e-06, + "loss": 0.3594, + "step": 34610 + }, + { + "epoch": 0.7715640620327252, + "grad_norm": 0.8732434511184692, + "learning_rate": 2.466337420832089e-06, + "loss": 0.299, + "step": 34615 + }, + { + "epoch": 0.7716755114133453, + "grad_norm": 0.5019058585166931, + "learning_rate": 2.464035411848854e-06, + "loss": 0.2987, + "step": 34620 + }, + { + "epoch": 0.7717869607939654, + "grad_norm": 0.6579979062080383, + "learning_rate": 2.4617343267167126e-06, + "loss": 0.3096, + "step": 34625 + }, + { + "epoch": 0.7718984101745855, + "grad_norm": 0.7526550889015198, + "learning_rate": 2.4594341657177544e-06, + "loss": 0.2986, + "step": 34630 + }, + { + "epoch": 0.7720098595552055, + "grad_norm": 0.7801370024681091, + "learning_rate": 2.457134929133965e-06, + "loss": 0.2556, + "step": 34635 + }, + { + "epoch": 0.7721213089358255, + "grad_norm": 0.4410828649997711, + "learning_rate": 2.4548366172472103e-06, + "loss": 0.1926, + "step": 34640 + }, + { + "epoch": 0.7722327583164457, + "grad_norm": 0.5587494373321533, + "learning_rate": 2.452539230339247e-06, + "loss": 0.2875, + "step": 34645 + }, + { + "epoch": 0.7723442076970657, + "grad_norm": 0.8305307626724243, + "learning_rate": 2.450242768691714e-06, + "loss": 0.3443, + "step": 34650 + }, + { + "epoch": 0.7724556570776858, + "grad_norm": 0.5703745484352112, + "learning_rate": 2.447947232586142e-06, + "loss": 0.2984, + "step": 34655 + }, + { + "epoch": 0.7725671064583058, + "grad_norm": 0.586417019367218, + "learning_rate": 2.445652622303949e-06, + "loss": 0.2462, + "step": 34660 + }, + { + "epoch": 0.772678555838926, + "grad_norm": 0.5886982679367065, + "learning_rate": 2.4433589381264346e-06, + "loss": 0.241, + "step": 34665 + }, + { + "epoch": 0.772790005219546, + "grad_norm": 0.5183648467063904, + "learning_rate": 2.441066180334786e-06, + "loss": 0.2852, + "step": 34670 + }, + { + "epoch": 0.772901454600166, + "grad_norm": 0.5780871510505676, + "learning_rate": 2.4387743492100767e-06, + "loss": 0.2254, + "step": 34675 + }, + { + "epoch": 0.7730129039807861, + "grad_norm": 0.44057005643844604, + "learning_rate": 2.4364834450332662e-06, + "loss": 0.1836, + "step": 34680 + }, + { + "epoch": 0.7731243533614062, + "grad_norm": 0.4620932340621948, + "learning_rate": 2.4341934680852043e-06, + "loss": 0.246, + "step": 34685 + }, + { + "epoch": 0.7732358027420263, + "grad_norm": 0.5267679691314697, + "learning_rate": 2.4319044186466267e-06, + "loss": 0.1548, + "step": 34690 + }, + { + "epoch": 0.7733472521226463, + "grad_norm": 0.5535991787910461, + "learning_rate": 2.4296162969981506e-06, + "loss": 0.2883, + "step": 34695 + }, + { + "epoch": 0.7734587015032663, + "grad_norm": 0.4434576630592346, + "learning_rate": 2.4273291034202828e-06, + "loss": 0.2774, + "step": 34700 + }, + { + "epoch": 0.7735701508838865, + "grad_norm": 0.7226085066795349, + "learning_rate": 2.4250428381934143e-06, + "loss": 0.3007, + "step": 34705 + }, + { + "epoch": 0.7736816002645065, + "grad_norm": 0.48912107944488525, + "learning_rate": 2.4227575015978213e-06, + "loss": 0.2283, + "step": 34710 + }, + { + "epoch": 0.7737930496451266, + "grad_norm": 0.8533339500427246, + "learning_rate": 2.4204730939136733e-06, + "loss": 0.2332, + "step": 34715 + }, + { + "epoch": 0.7739044990257467, + "grad_norm": 0.722251296043396, + "learning_rate": 2.418189615421017e-06, + "loss": 0.2628, + "step": 34720 + }, + { + "epoch": 0.7740159484063668, + "grad_norm": 0.507311999797821, + "learning_rate": 2.415907066399794e-06, + "loss": 0.2101, + "step": 34725 + }, + { + "epoch": 0.7741273977869868, + "grad_norm": 0.8455132842063904, + "learning_rate": 2.413625447129825e-06, + "loss": 0.2839, + "step": 34730 + }, + { + "epoch": 0.7742388471676068, + "grad_norm": 0.4431849420070648, + "learning_rate": 2.411344757890818e-06, + "loss": 0.3066, + "step": 34735 + }, + { + "epoch": 0.774350296548227, + "grad_norm": 0.3015442490577698, + "learning_rate": 2.409064998962368e-06, + "loss": 0.2811, + "step": 34740 + }, + { + "epoch": 0.774461745928847, + "grad_norm": 0.5934215784072876, + "learning_rate": 2.4067861706239524e-06, + "loss": 0.1633, + "step": 34745 + }, + { + "epoch": 0.7745731953094671, + "grad_norm": 0.5405876636505127, + "learning_rate": 2.4045082731549463e-06, + "loss": 0.3663, + "step": 34750 + }, + { + "epoch": 0.7746846446900871, + "grad_norm": 0.6101316213607788, + "learning_rate": 2.4022313068345937e-06, + "loss": 0.3166, + "step": 34755 + }, + { + "epoch": 0.7747960940707072, + "grad_norm": 0.8928530216217041, + "learning_rate": 2.3999552719420403e-06, + "loss": 0.3347, + "step": 34760 + }, + { + "epoch": 0.7749075434513273, + "grad_norm": 0.510225772857666, + "learning_rate": 2.3976801687563077e-06, + "loss": 0.2394, + "step": 34765 + }, + { + "epoch": 0.7750189928319473, + "grad_norm": 0.6352126002311707, + "learning_rate": 2.395405997556305e-06, + "loss": 0.403, + "step": 34770 + }, + { + "epoch": 0.7751304422125674, + "grad_norm": 0.7462135553359985, + "learning_rate": 2.3931327586208252e-06, + "loss": 0.2649, + "step": 34775 + }, + { + "epoch": 0.7752418915931875, + "grad_norm": 0.45706817507743835, + "learning_rate": 2.3908604522285574e-06, + "loss": 0.2739, + "step": 34780 + }, + { + "epoch": 0.7753533409738076, + "grad_norm": 0.47235020995140076, + "learning_rate": 2.3885890786580636e-06, + "loss": 0.2053, + "step": 34785 + }, + { + "epoch": 0.7754647903544276, + "grad_norm": 0.8027734756469727, + "learning_rate": 2.3863186381877944e-06, + "loss": 0.3695, + "step": 34790 + }, + { + "epoch": 0.7755762397350476, + "grad_norm": 0.48161131143569946, + "learning_rate": 2.3840491310960955e-06, + "loss": 0.3491, + "step": 34795 + }, + { + "epoch": 0.7756876891156678, + "grad_norm": 0.5025513768196106, + "learning_rate": 2.381780557661186e-06, + "loss": 0.2582, + "step": 34800 + }, + { + "epoch": 0.7757991384962878, + "grad_norm": 0.8030077219009399, + "learning_rate": 2.379512918161174e-06, + "loss": 0.2948, + "step": 34805 + }, + { + "epoch": 0.7759105878769079, + "grad_norm": 1.0067226886749268, + "learning_rate": 2.3772462128740603e-06, + "loss": 0.2108, + "step": 34810 + }, + { + "epoch": 0.7760220372575279, + "grad_norm": 0.5891587734222412, + "learning_rate": 2.374980442077722e-06, + "loss": 0.4098, + "step": 34815 + }, + { + "epoch": 0.776133486638148, + "grad_norm": 0.7874899506568909, + "learning_rate": 2.3727156060499256e-06, + "loss": 0.3265, + "step": 34820 + }, + { + "epoch": 0.7762449360187681, + "grad_norm": 0.7964795827865601, + "learning_rate": 2.3704517050683196e-06, + "loss": 0.3475, + "step": 34825 + }, + { + "epoch": 0.7763563853993881, + "grad_norm": 0.3685130774974823, + "learning_rate": 2.3681887394104463e-06, + "loss": 0.3647, + "step": 34830 + }, + { + "epoch": 0.7764678347800082, + "grad_norm": 0.5425657033920288, + "learning_rate": 2.3659267093537242e-06, + "loss": 0.3282, + "step": 34835 + }, + { + "epoch": 0.7765792841606283, + "grad_norm": 0.6460210084915161, + "learning_rate": 2.363665615175459e-06, + "loss": 0.2726, + "step": 34840 + }, + { + "epoch": 0.7766907335412483, + "grad_norm": 0.4823559522628784, + "learning_rate": 2.361405457152849e-06, + "loss": 0.2596, + "step": 34845 + }, + { + "epoch": 0.7768021829218684, + "grad_norm": 0.5281304121017456, + "learning_rate": 2.359146235562969e-06, + "loss": 0.2796, + "step": 34850 + }, + { + "epoch": 0.7769136323024884, + "grad_norm": 0.6700007915496826, + "learning_rate": 2.3568879506827826e-06, + "loss": 0.2438, + "step": 34855 + }, + { + "epoch": 0.7770250816831086, + "grad_norm": 0.8023337125778198, + "learning_rate": 2.354630602789134e-06, + "loss": 0.2751, + "step": 34860 + }, + { + "epoch": 0.7771365310637286, + "grad_norm": 0.4087885916233063, + "learning_rate": 2.352374192158764e-06, + "loss": 0.2717, + "step": 34865 + }, + { + "epoch": 0.7772479804443487, + "grad_norm": 0.49791598320007324, + "learning_rate": 2.350118719068284e-06, + "loss": 0.2264, + "step": 34870 + }, + { + "epoch": 0.7773594298249688, + "grad_norm": 0.42472195625305176, + "learning_rate": 2.347864183794204e-06, + "loss": 0.2517, + "step": 34875 + }, + { + "epoch": 0.7774708792055888, + "grad_norm": 0.8098690509796143, + "learning_rate": 2.3456105866129098e-06, + "loss": 0.2864, + "step": 34880 + }, + { + "epoch": 0.7775823285862089, + "grad_norm": 0.4638606309890747, + "learning_rate": 2.343357927800676e-06, + "loss": 0.2375, + "step": 34885 + }, + { + "epoch": 0.7776937779668289, + "grad_norm": 0.2630634307861328, + "learning_rate": 2.341106207633658e-06, + "loss": 0.3394, + "step": 34890 + }, + { + "epoch": 0.7778052273474491, + "grad_norm": 1.4568618535995483, + "learning_rate": 2.3388554263878992e-06, + "loss": 0.2233, + "step": 34895 + }, + { + "epoch": 0.7779166767280691, + "grad_norm": 0.4091511368751526, + "learning_rate": 2.336605584339331e-06, + "loss": 0.2923, + "step": 34900 + }, + { + "epoch": 0.7780281261086891, + "grad_norm": 0.7855178713798523, + "learning_rate": 2.3343566817637674e-06, + "loss": 0.2633, + "step": 34905 + }, + { + "epoch": 0.7781395754893092, + "grad_norm": 0.8028655052185059, + "learning_rate": 2.332108718936905e-06, + "loss": 0.1376, + "step": 34910 + }, + { + "epoch": 0.7782510248699293, + "grad_norm": 0.869860053062439, + "learning_rate": 2.329861696134328e-06, + "loss": 0.4468, + "step": 34915 + }, + { + "epoch": 0.7783624742505494, + "grad_norm": 0.7514600157737732, + "learning_rate": 2.327615613631502e-06, + "loss": 0.2536, + "step": 34920 + }, + { + "epoch": 0.7784739236311694, + "grad_norm": 0.5989588499069214, + "learning_rate": 2.3253704717037763e-06, + "loss": 0.2255, + "step": 34925 + }, + { + "epoch": 0.7785853730117895, + "grad_norm": 0.3953379988670349, + "learning_rate": 2.3231262706263957e-06, + "loss": 0.3115, + "step": 34930 + }, + { + "epoch": 0.7786968223924096, + "grad_norm": 0.49553415179252625, + "learning_rate": 2.3208830106744738e-06, + "loss": 0.291, + "step": 34935 + }, + { + "epoch": 0.7788082717730296, + "grad_norm": 0.6226276755332947, + "learning_rate": 2.3186406921230255e-06, + "loss": 0.3238, + "step": 34940 + }, + { + "epoch": 0.7789197211536497, + "grad_norm": 0.8406127095222473, + "learning_rate": 2.3163993152469365e-06, + "loss": 0.3199, + "step": 34945 + }, + { + "epoch": 0.7790311705342697, + "grad_norm": 0.5463730096817017, + "learning_rate": 2.3141588803209837e-06, + "loss": 0.2737, + "step": 34950 + }, + { + "epoch": 0.7791426199148899, + "grad_norm": 0.46503451466560364, + "learning_rate": 2.3119193876198265e-06, + "loss": 0.1797, + "step": 34955 + }, + { + "epoch": 0.7792540692955099, + "grad_norm": 0.6056923270225525, + "learning_rate": 2.3096808374180056e-06, + "loss": 0.3692, + "step": 34960 + }, + { + "epoch": 0.7793655186761299, + "grad_norm": 0.7003437280654907, + "learning_rate": 2.307443229989957e-06, + "loss": 0.3654, + "step": 34965 + }, + { + "epoch": 0.77947696805675, + "grad_norm": 0.6190140247344971, + "learning_rate": 2.30520656560999e-06, + "loss": 0.2616, + "step": 34970 + }, + { + "epoch": 0.7795884174373701, + "grad_norm": 0.7247709631919861, + "learning_rate": 2.3029708445523048e-06, + "loss": 0.3933, + "step": 34975 + }, + { + "epoch": 0.7796998668179902, + "grad_norm": 0.43216267228126526, + "learning_rate": 2.300736067090982e-06, + "loss": 0.3454, + "step": 34980 + }, + { + "epoch": 0.7798113161986102, + "grad_norm": 0.7808366417884827, + "learning_rate": 2.2985022334999884e-06, + "loss": 0.3402, + "step": 34985 + }, + { + "epoch": 0.7799227655792303, + "grad_norm": 0.4898461699485779, + "learning_rate": 2.2962693440531713e-06, + "loss": 0.297, + "step": 34990 + }, + { + "epoch": 0.7800342149598504, + "grad_norm": 0.62904953956604, + "learning_rate": 2.2940373990242724e-06, + "loss": 0.2683, + "step": 34995 + }, + { + "epoch": 0.7801456643404704, + "grad_norm": 0.425571471452713, + "learning_rate": 2.2918063986869064e-06, + "loss": 0.1824, + "step": 35000 + }, + { + "epoch": 0.7802571137210905, + "grad_norm": 0.7334528565406799, + "learning_rate": 2.289576343314577e-06, + "loss": 0.2736, + "step": 35005 + }, + { + "epoch": 0.7803685631017105, + "grad_norm": 0.4682607054710388, + "learning_rate": 2.2873472331806732e-06, + "loss": 0.2973, + "step": 35010 + }, + { + "epoch": 0.7804800124823307, + "grad_norm": 0.6670015454292297, + "learning_rate": 2.285119068558468e-06, + "loss": 0.369, + "step": 35015 + }, + { + "epoch": 0.7805914618629507, + "grad_norm": 0.7925013303756714, + "learning_rate": 2.2828918497211107e-06, + "loss": 0.2609, + "step": 35020 + }, + { + "epoch": 0.7807029112435707, + "grad_norm": 0.5793088674545288, + "learning_rate": 2.2806655769416496e-06, + "loss": 0.2433, + "step": 35025 + }, + { + "epoch": 0.7808143606241909, + "grad_norm": 0.6716881394386292, + "learning_rate": 2.2784402504930047e-06, + "loss": 0.3465, + "step": 35030 + }, + { + "epoch": 0.7809258100048109, + "grad_norm": 0.6803726553916931, + "learning_rate": 2.2762158706479833e-06, + "loss": 0.211, + "step": 35035 + }, + { + "epoch": 0.781037259385431, + "grad_norm": 0.46885883808135986, + "learning_rate": 2.2739924376792744e-06, + "loss": 0.3056, + "step": 35040 + }, + { + "epoch": 0.781148708766051, + "grad_norm": 0.5113359689712524, + "learning_rate": 2.2717699518594606e-06, + "loss": 0.268, + "step": 35045 + }, + { + "epoch": 0.781260158146671, + "grad_norm": 0.5416445136070251, + "learning_rate": 2.269548413460998e-06, + "loss": 0.2482, + "step": 35050 + }, + { + "epoch": 0.7813716075272912, + "grad_norm": 0.88877272605896, + "learning_rate": 2.2673278227562266e-06, + "loss": 0.2914, + "step": 35055 + }, + { + "epoch": 0.7814830569079112, + "grad_norm": 0.954880952835083, + "learning_rate": 2.26510818001738e-06, + "loss": 0.3863, + "step": 35060 + }, + { + "epoch": 0.7815945062885313, + "grad_norm": 0.8852445483207703, + "learning_rate": 2.262889485516567e-06, + "loss": 0.3352, + "step": 35065 + }, + { + "epoch": 0.7817059556691514, + "grad_norm": 0.4573374092578888, + "learning_rate": 2.2606717395257816e-06, + "loss": 0.225, + "step": 35070 + }, + { + "epoch": 0.7818174050497715, + "grad_norm": 0.7477966547012329, + "learning_rate": 2.258454942316899e-06, + "loss": 0.1913, + "step": 35075 + }, + { + "epoch": 0.7819288544303915, + "grad_norm": 0.732876181602478, + "learning_rate": 2.2562390941616887e-06, + "loss": 0.2876, + "step": 35080 + }, + { + "epoch": 0.7820403038110115, + "grad_norm": 0.6387799382209778, + "learning_rate": 2.254024195331789e-06, + "loss": 0.2778, + "step": 35085 + }, + { + "epoch": 0.7821517531916317, + "grad_norm": 0.8216693997383118, + "learning_rate": 2.2518102460987355e-06, + "loss": 0.1691, + "step": 35090 + }, + { + "epoch": 0.7822632025722517, + "grad_norm": 0.337422639131546, + "learning_rate": 2.249597246733941e-06, + "loss": 0.232, + "step": 35095 + }, + { + "epoch": 0.7823746519528718, + "grad_norm": 0.7069044709205627, + "learning_rate": 2.247385197508698e-06, + "loss": 0.2494, + "step": 35100 + }, + { + "epoch": 0.7824861013334918, + "grad_norm": 0.7120121121406555, + "learning_rate": 2.2451740986941905e-06, + "loss": 0.2518, + "step": 35105 + }, + { + "epoch": 0.7825975507141119, + "grad_norm": 0.5651516318321228, + "learning_rate": 2.2429639505614764e-06, + "loss": 0.2946, + "step": 35110 + }, + { + "epoch": 0.782709000094732, + "grad_norm": 0.5717464685440063, + "learning_rate": 2.2407547533815077e-06, + "loss": 0.2368, + "step": 35115 + }, + { + "epoch": 0.782820449475352, + "grad_norm": 0.7231550812721252, + "learning_rate": 2.2385465074251166e-06, + "loss": 0.2777, + "step": 35120 + }, + { + "epoch": 0.7829318988559721, + "grad_norm": 0.5691319108009338, + "learning_rate": 2.2363392129630147e-06, + "loss": 0.2391, + "step": 35125 + }, + { + "epoch": 0.7830433482365922, + "grad_norm": 0.7855725288391113, + "learning_rate": 2.2341328702657985e-06, + "loss": 0.3428, + "step": 35130 + }, + { + "epoch": 0.7831547976172123, + "grad_norm": 0.48168542981147766, + "learning_rate": 2.2319274796039493e-06, + "loss": 0.2955, + "step": 35135 + }, + { + "epoch": 0.7832662469978323, + "grad_norm": 0.7382097840309143, + "learning_rate": 2.2297230412478275e-06, + "loss": 0.3642, + "step": 35140 + }, + { + "epoch": 0.7833776963784523, + "grad_norm": 0.6649189591407776, + "learning_rate": 2.2275195554676864e-06, + "loss": 0.2334, + "step": 35145 + }, + { + "epoch": 0.7834891457590725, + "grad_norm": 0.7483057379722595, + "learning_rate": 2.2253170225336517e-06, + "loss": 0.2402, + "step": 35150 + }, + { + "epoch": 0.7836005951396925, + "grad_norm": 0.4074045717716217, + "learning_rate": 2.223115442715741e-06, + "loss": 0.2334, + "step": 35155 + }, + { + "epoch": 0.7837120445203126, + "grad_norm": 0.8826455473899841, + "learning_rate": 2.2209148162838477e-06, + "loss": 0.2745, + "step": 35160 + }, + { + "epoch": 0.7838234939009326, + "grad_norm": 0.7632371783256531, + "learning_rate": 2.2187151435077535e-06, + "loss": 0.2734, + "step": 35165 + }, + { + "epoch": 0.7839349432815527, + "grad_norm": 0.9246512055397034, + "learning_rate": 2.2165164246571203e-06, + "loss": 0.2973, + "step": 35170 + }, + { + "epoch": 0.7840463926621728, + "grad_norm": 0.3215585947036743, + "learning_rate": 2.214318660001492e-06, + "loss": 0.4391, + "step": 35175 + }, + { + "epoch": 0.7841578420427928, + "grad_norm": 0.6698297262191772, + "learning_rate": 2.212121849810299e-06, + "loss": 0.3566, + "step": 35180 + }, + { + "epoch": 0.784269291423413, + "grad_norm": 0.9584089517593384, + "learning_rate": 2.2099259943528573e-06, + "loss": 0.3124, + "step": 35185 + }, + { + "epoch": 0.784380740804033, + "grad_norm": 0.8780977129936218, + "learning_rate": 2.2077310938983577e-06, + "loss": 0.4167, + "step": 35190 + }, + { + "epoch": 0.7844921901846531, + "grad_norm": 0.7061375379562378, + "learning_rate": 2.2055371487158796e-06, + "loss": 0.2632, + "step": 35195 + }, + { + "epoch": 0.7846036395652731, + "grad_norm": 0.6107895970344543, + "learning_rate": 2.203344159074383e-06, + "loss": 0.2148, + "step": 35200 + }, + { + "epoch": 0.7847150889458931, + "grad_norm": 0.7566227316856384, + "learning_rate": 2.201152125242708e-06, + "loss": 0.3246, + "step": 35205 + }, + { + "epoch": 0.7848265383265133, + "grad_norm": 0.6975489854812622, + "learning_rate": 2.1989610474895883e-06, + "loss": 0.3739, + "step": 35210 + }, + { + "epoch": 0.7849379877071333, + "grad_norm": 0.6445315480232239, + "learning_rate": 2.196770926083627e-06, + "loss": 0.2056, + "step": 35215 + }, + { + "epoch": 0.7850494370877534, + "grad_norm": 0.47463878989219666, + "learning_rate": 2.1945817612933194e-06, + "loss": 0.2741, + "step": 35220 + }, + { + "epoch": 0.7851608864683735, + "grad_norm": 0.5092577934265137, + "learning_rate": 2.192393553387041e-06, + "loss": 0.2682, + "step": 35225 + }, + { + "epoch": 0.7852723358489935, + "grad_norm": 0.5312575697898865, + "learning_rate": 2.1902063026330466e-06, + "loss": 0.3165, + "step": 35230 + }, + { + "epoch": 0.7853837852296136, + "grad_norm": 0.4544369578361511, + "learning_rate": 2.1880200092994754e-06, + "loss": 0.2641, + "step": 35235 + }, + { + "epoch": 0.7854952346102336, + "grad_norm": 0.46932798624038696, + "learning_rate": 2.1858346736543535e-06, + "loss": 0.2427, + "step": 35240 + }, + { + "epoch": 0.7856066839908538, + "grad_norm": 0.7473596334457397, + "learning_rate": 2.1836502959655847e-06, + "loss": 0.2826, + "step": 35245 + }, + { + "epoch": 0.7857181333714738, + "grad_norm": 0.6829301714897156, + "learning_rate": 2.181466876500954e-06, + "loss": 0.1744, + "step": 35250 + }, + { + "epoch": 0.7858295827520938, + "grad_norm": 0.4222946763038635, + "learning_rate": 2.1792844155281377e-06, + "loss": 0.2619, + "step": 35255 + }, + { + "epoch": 0.7859410321327139, + "grad_norm": 0.5802531838417053, + "learning_rate": 2.1771029133146848e-06, + "loss": 0.3516, + "step": 35260 + }, + { + "epoch": 0.786052481513334, + "grad_norm": 0.7257078289985657, + "learning_rate": 2.174922370128032e-06, + "loss": 0.2959, + "step": 35265 + }, + { + "epoch": 0.7861639308939541, + "grad_norm": 0.7537233233451843, + "learning_rate": 2.172742786235492e-06, + "loss": 0.2253, + "step": 35270 + }, + { + "epoch": 0.7862753802745741, + "grad_norm": 1.001983404159546, + "learning_rate": 2.170564161904274e-06, + "loss": 0.3747, + "step": 35275 + }, + { + "epoch": 0.7863868296551942, + "grad_norm": 0.610106348991394, + "learning_rate": 2.1683864974014545e-06, + "loss": 0.2735, + "step": 35280 + }, + { + "epoch": 0.7864982790358143, + "grad_norm": 0.40139901638031006, + "learning_rate": 2.1662097929939975e-06, + "loss": 0.2136, + "step": 35285 + }, + { + "epoch": 0.7866097284164343, + "grad_norm": 0.6343426704406738, + "learning_rate": 2.164034048948754e-06, + "loss": 0.3794, + "step": 35290 + }, + { + "epoch": 0.7867211777970544, + "grad_norm": 0.6991207599639893, + "learning_rate": 2.161859265532452e-06, + "loss": 0.2002, + "step": 35295 + }, + { + "epoch": 0.7868326271776744, + "grad_norm": 0.6790099143981934, + "learning_rate": 2.159685443011701e-06, + "loss": 0.3706, + "step": 35300 + }, + { + "epoch": 0.7869440765582946, + "grad_norm": 1.4308520555496216, + "learning_rate": 2.1575125816529996e-06, + "loss": 0.2568, + "step": 35305 + }, + { + "epoch": 0.7870555259389146, + "grad_norm": 0.4401121437549591, + "learning_rate": 2.1553406817227194e-06, + "loss": 0.3254, + "step": 35310 + }, + { + "epoch": 0.7871669753195346, + "grad_norm": 1.223118543624878, + "learning_rate": 2.1531697434871215e-06, + "loss": 0.3267, + "step": 35315 + }, + { + "epoch": 0.7872784247001547, + "grad_norm": 0.413013756275177, + "learning_rate": 2.1509997672123418e-06, + "loss": 0.2134, + "step": 35320 + }, + { + "epoch": 0.7873898740807748, + "grad_norm": 0.3138122260570526, + "learning_rate": 2.148830753164408e-06, + "loss": 0.1685, + "step": 35325 + }, + { + "epoch": 0.7875013234613949, + "grad_norm": 0.6393536329269409, + "learning_rate": 2.1466627016092202e-06, + "loss": 0.3473, + "step": 35330 + }, + { + "epoch": 0.7876127728420149, + "grad_norm": 0.8312395811080933, + "learning_rate": 2.1444956128125694e-06, + "loss": 0.3317, + "step": 35335 + }, + { + "epoch": 0.787724222222635, + "grad_norm": 0.6563614010810852, + "learning_rate": 2.1423294870401203e-06, + "loss": 0.3394, + "step": 35340 + }, + { + "epoch": 0.7878356716032551, + "grad_norm": 0.8131313323974609, + "learning_rate": 2.1401643245574244e-06, + "loss": 0.3371, + "step": 35345 + }, + { + "epoch": 0.7879471209838751, + "grad_norm": 0.8174598217010498, + "learning_rate": 2.1380001256299143e-06, + "loss": 0.3269, + "step": 35350 + }, + { + "epoch": 0.7880585703644952, + "grad_norm": 0.5772351026535034, + "learning_rate": 2.135836890522901e-06, + "loss": 0.3529, + "step": 35355 + }, + { + "epoch": 0.7881700197451152, + "grad_norm": 0.42435768246650696, + "learning_rate": 2.1336746195015845e-06, + "loss": 0.2459, + "step": 35360 + }, + { + "epoch": 0.7882814691257354, + "grad_norm": 0.731113076210022, + "learning_rate": 2.1315133128310395e-06, + "loss": 0.3766, + "step": 35365 + }, + { + "epoch": 0.7883929185063554, + "grad_norm": 0.5198209285736084, + "learning_rate": 2.1293529707762284e-06, + "loss": 0.2979, + "step": 35370 + }, + { + "epoch": 0.7885043678869754, + "grad_norm": 0.8131558895111084, + "learning_rate": 2.1271935936019915e-06, + "loss": 0.2811, + "step": 35375 + }, + { + "epoch": 0.7886158172675956, + "grad_norm": 0.5951612591743469, + "learning_rate": 2.1250351815730517e-06, + "loss": 0.2405, + "step": 35380 + }, + { + "epoch": 0.7887272666482156, + "grad_norm": 0.736031174659729, + "learning_rate": 2.122877734954013e-06, + "loss": 0.2372, + "step": 35385 + }, + { + "epoch": 0.7888387160288357, + "grad_norm": 1.0694679021835327, + "learning_rate": 2.120721254009359e-06, + "loss": 0.392, + "step": 35390 + }, + { + "epoch": 0.7889501654094557, + "grad_norm": 0.6037766337394714, + "learning_rate": 2.118565739003461e-06, + "loss": 0.2684, + "step": 35395 + }, + { + "epoch": 0.7890616147900757, + "grad_norm": 0.8734601736068726, + "learning_rate": 2.11641119020057e-06, + "loss": 0.2731, + "step": 35400 + }, + { + "epoch": 0.7891730641706959, + "grad_norm": 0.5328859090805054, + "learning_rate": 2.114257607864816e-06, + "loss": 0.181, + "step": 35405 + }, + { + "epoch": 0.7892845135513159, + "grad_norm": 0.5082289576530457, + "learning_rate": 2.1121049922602098e-06, + "loss": 0.3103, + "step": 35410 + }, + { + "epoch": 0.789395962931936, + "grad_norm": 0.49645161628723145, + "learning_rate": 2.109953343650647e-06, + "loss": 0.2639, + "step": 35415 + }, + { + "epoch": 0.7895074123125561, + "grad_norm": 0.46683311462402344, + "learning_rate": 2.1078026622999006e-06, + "loss": 0.1948, + "step": 35420 + }, + { + "epoch": 0.7896188616931762, + "grad_norm": 0.606785237789154, + "learning_rate": 2.105652948471628e-06, + "loss": 0.303, + "step": 35425 + }, + { + "epoch": 0.7897303110737962, + "grad_norm": 0.42324838042259216, + "learning_rate": 2.1035042024293716e-06, + "loss": 0.2416, + "step": 35430 + }, + { + "epoch": 0.7898417604544162, + "grad_norm": 0.785689651966095, + "learning_rate": 2.101356424436549e-06, + "loss": 0.2966, + "step": 35435 + }, + { + "epoch": 0.7899532098350364, + "grad_norm": 0.771626353263855, + "learning_rate": 2.09920961475646e-06, + "loss": 0.2261, + "step": 35440 + }, + { + "epoch": 0.7900646592156564, + "grad_norm": 0.8682251572608948, + "learning_rate": 2.097063773652288e-06, + "loss": 0.2409, + "step": 35445 + }, + { + "epoch": 0.7901761085962765, + "grad_norm": 0.6335934400558472, + "learning_rate": 2.0949189013870965e-06, + "loss": 0.2482, + "step": 35450 + }, + { + "epoch": 0.7902875579768965, + "grad_norm": 0.9981822967529297, + "learning_rate": 2.0927749982238266e-06, + "loss": 0.2093, + "step": 35455 + }, + { + "epoch": 0.7903990073575166, + "grad_norm": 0.7123530507087708, + "learning_rate": 2.09063206442531e-06, + "loss": 0.3498, + "step": 35460 + }, + { + "epoch": 0.7905104567381367, + "grad_norm": 0.4021548926830292, + "learning_rate": 2.088490100254248e-06, + "loss": 0.2242, + "step": 35465 + }, + { + "epoch": 0.7906219061187567, + "grad_norm": 0.49737146496772766, + "learning_rate": 2.0863491059732366e-06, + "loss": 0.2516, + "step": 35470 + }, + { + "epoch": 0.7907333554993768, + "grad_norm": 0.6410098075866699, + "learning_rate": 2.0842090818447393e-06, + "loss": 0.3899, + "step": 35475 + }, + { + "epoch": 0.7908448048799969, + "grad_norm": 1.1013152599334717, + "learning_rate": 2.082070028131109e-06, + "loss": 0.2475, + "step": 35480 + }, + { + "epoch": 0.790956254260617, + "grad_norm": 0.5911498069763184, + "learning_rate": 2.0799319450945733e-06, + "loss": 0.264, + "step": 35485 + }, + { + "epoch": 0.791067703641237, + "grad_norm": 0.7244366407394409, + "learning_rate": 2.0777948329972497e-06, + "loss": 0.3184, + "step": 35490 + }, + { + "epoch": 0.791179153021857, + "grad_norm": 0.6575037240982056, + "learning_rate": 2.07565869210113e-06, + "loss": 0.3026, + "step": 35495 + }, + { + "epoch": 0.7912906024024772, + "grad_norm": 0.5350658893585205, + "learning_rate": 2.073523522668086e-06, + "loss": 0.261, + "step": 35500 + }, + { + "epoch": 0.7914020517830972, + "grad_norm": 0.6373189091682434, + "learning_rate": 2.0713893249598772e-06, + "loss": 0.2683, + "step": 35505 + }, + { + "epoch": 0.7915135011637173, + "grad_norm": 0.6466808319091797, + "learning_rate": 2.0692560992381373e-06, + "loss": 0.3699, + "step": 35510 + }, + { + "epoch": 0.7916249505443373, + "grad_norm": 0.710489809513092, + "learning_rate": 2.0671238457643817e-06, + "loss": 0.3298, + "step": 35515 + }, + { + "epoch": 0.7917363999249574, + "grad_norm": 0.45638200640678406, + "learning_rate": 2.0649925648000123e-06, + "loss": 0.1977, + "step": 35520 + }, + { + "epoch": 0.7918478493055775, + "grad_norm": 0.6310269236564636, + "learning_rate": 2.0628622566063063e-06, + "loss": 0.2807, + "step": 35525 + }, + { + "epoch": 0.7919592986861975, + "grad_norm": 0.8936885595321655, + "learning_rate": 2.0607329214444216e-06, + "loss": 0.3801, + "step": 35530 + }, + { + "epoch": 0.7920707480668177, + "grad_norm": 0.5122530460357666, + "learning_rate": 2.058604559575397e-06, + "loss": 0.1657, + "step": 35535 + }, + { + "epoch": 0.7921821974474377, + "grad_norm": 0.6624186038970947, + "learning_rate": 2.0564771712601573e-06, + "loss": 0.2397, + "step": 35540 + }, + { + "epoch": 0.7922936468280578, + "grad_norm": 0.7633967399597168, + "learning_rate": 2.0543507567594987e-06, + "loss": 0.2455, + "step": 35545 + }, + { + "epoch": 0.7924050962086778, + "grad_norm": 0.431619256734848, + "learning_rate": 2.05222531633411e-06, + "loss": 0.27, + "step": 35550 + }, + { + "epoch": 0.7925165455892978, + "grad_norm": 0.6180048584938049, + "learning_rate": 2.05010085024455e-06, + "loss": 0.3086, + "step": 35555 + }, + { + "epoch": 0.792627994969918, + "grad_norm": 0.5129493474960327, + "learning_rate": 2.047977358751262e-06, + "loss": 0.0996, + "step": 35560 + }, + { + "epoch": 0.792739444350538, + "grad_norm": 0.2933807373046875, + "learning_rate": 2.0458548421145697e-06, + "loss": 0.2842, + "step": 35565 + }, + { + "epoch": 0.7928508937311581, + "grad_norm": 0.3998599648475647, + "learning_rate": 2.0437333005946736e-06, + "loss": 0.3707, + "step": 35570 + }, + { + "epoch": 0.7929623431117782, + "grad_norm": 0.8523929119110107, + "learning_rate": 2.041612734451666e-06, + "loss": 0.2828, + "step": 35575 + }, + { + "epoch": 0.7930737924923982, + "grad_norm": 0.6669270992279053, + "learning_rate": 2.0394931439455034e-06, + "loss": 0.2341, + "step": 35580 + }, + { + "epoch": 0.7931852418730183, + "grad_norm": 0.8018181324005127, + "learning_rate": 2.037374529336039e-06, + "loss": 0.3917, + "step": 35585 + }, + { + "epoch": 0.7932966912536383, + "grad_norm": 1.1173460483551025, + "learning_rate": 2.035256890882996e-06, + "loss": 0.2755, + "step": 35590 + }, + { + "epoch": 0.7934081406342585, + "grad_norm": 0.4358447194099426, + "learning_rate": 2.033140228845979e-06, + "loss": 0.2951, + "step": 35595 + }, + { + "epoch": 0.7935195900148785, + "grad_norm": 0.9090696573257446, + "learning_rate": 2.0310245434844756e-06, + "loss": 0.2723, + "step": 35600 + }, + { + "epoch": 0.7936310393954985, + "grad_norm": 0.7753550410270691, + "learning_rate": 2.0289098350578493e-06, + "loss": 0.2791, + "step": 35605 + }, + { + "epoch": 0.7937424887761186, + "grad_norm": 0.887015700340271, + "learning_rate": 2.0267961038253503e-06, + "loss": 0.3607, + "step": 35610 + }, + { + "epoch": 0.7938539381567387, + "grad_norm": 0.6061819195747375, + "learning_rate": 2.024683350046107e-06, + "loss": 0.2793, + "step": 35615 + }, + { + "epoch": 0.7939653875373588, + "grad_norm": 0.6215338110923767, + "learning_rate": 2.0225715739791265e-06, + "loss": 0.3664, + "step": 35620 + }, + { + "epoch": 0.7940768369179788, + "grad_norm": 0.7980818748474121, + "learning_rate": 2.020460775883294e-06, + "loss": 0.2984, + "step": 35625 + }, + { + "epoch": 0.7941882862985989, + "grad_norm": 0.5570255517959595, + "learning_rate": 2.0183509560173777e-06, + "loss": 0.4127, + "step": 35630 + }, + { + "epoch": 0.794299735679219, + "grad_norm": 0.47531047463417053, + "learning_rate": 2.0162421146400223e-06, + "loss": 0.2832, + "step": 35635 + }, + { + "epoch": 0.794411185059839, + "grad_norm": 0.5249255895614624, + "learning_rate": 2.0141342520097583e-06, + "loss": 0.2659, + "step": 35640 + }, + { + "epoch": 0.7945226344404591, + "grad_norm": 0.216322660446167, + "learning_rate": 2.0120273683849965e-06, + "loss": 0.2336, + "step": 35645 + }, + { + "epoch": 0.7946340838210791, + "grad_norm": 0.49721142649650574, + "learning_rate": 2.0099214640240227e-06, + "loss": 0.203, + "step": 35650 + }, + { + "epoch": 0.7947455332016993, + "grad_norm": 0.7828408479690552, + "learning_rate": 2.0078165391850026e-06, + "loss": 0.3225, + "step": 35655 + }, + { + "epoch": 0.7948569825823193, + "grad_norm": 0.8212775588035583, + "learning_rate": 2.0057125941259846e-06, + "loss": 0.256, + "step": 35660 + }, + { + "epoch": 0.7949684319629393, + "grad_norm": 1.2922614812850952, + "learning_rate": 2.0036096291048956e-06, + "loss": 0.3137, + "step": 35665 + }, + { + "epoch": 0.7950798813435594, + "grad_norm": 0.9791651964187622, + "learning_rate": 2.001507644379541e-06, + "loss": 0.3149, + "step": 35670 + }, + { + "epoch": 0.7951913307241795, + "grad_norm": 1.0271574258804321, + "learning_rate": 1.999406640207612e-06, + "loss": 0.2303, + "step": 35675 + }, + { + "epoch": 0.7953027801047996, + "grad_norm": 0.6987234950065613, + "learning_rate": 1.997306616846675e-06, + "loss": 0.4131, + "step": 35680 + }, + { + "epoch": 0.7954142294854196, + "grad_norm": 0.31380951404571533, + "learning_rate": 1.9952075745541744e-06, + "loss": 0.2285, + "step": 35685 + }, + { + "epoch": 0.7955256788660398, + "grad_norm": 0.5740078091621399, + "learning_rate": 1.993109513587439e-06, + "loss": 0.3155, + "step": 35690 + }, + { + "epoch": 0.7956371282466598, + "grad_norm": 0.6432341933250427, + "learning_rate": 1.9910124342036742e-06, + "loss": 0.3574, + "step": 35695 + }, + { + "epoch": 0.7957485776272798, + "grad_norm": 0.7405551671981812, + "learning_rate": 1.9889163366599607e-06, + "loss": 0.372, + "step": 35700 + }, + { + "epoch": 0.7958600270078999, + "grad_norm": 0.6038686633110046, + "learning_rate": 1.986821221213272e-06, + "loss": 0.183, + "step": 35705 + }, + { + "epoch": 0.79597147638852, + "grad_norm": 0.5730610489845276, + "learning_rate": 1.9847270881204462e-06, + "loss": 0.3067, + "step": 35710 + }, + { + "epoch": 0.7960829257691401, + "grad_norm": 0.7106767296791077, + "learning_rate": 1.9826339376382144e-06, + "loss": 0.2624, + "step": 35715 + }, + { + "epoch": 0.7961943751497601, + "grad_norm": 0.48092079162597656, + "learning_rate": 1.9805417700231766e-06, + "loss": 0.2149, + "step": 35720 + }, + { + "epoch": 0.7963058245303801, + "grad_norm": 0.692106306552887, + "learning_rate": 1.978450585531817e-06, + "loss": 0.3472, + "step": 35725 + }, + { + "epoch": 0.7964172739110003, + "grad_norm": 0.8615431785583496, + "learning_rate": 1.976360384420496e-06, + "loss": 0.267, + "step": 35730 + }, + { + "epoch": 0.7965287232916203, + "grad_norm": 1.365073561668396, + "learning_rate": 1.974271166945463e-06, + "loss": 0.3025, + "step": 35735 + }, + { + "epoch": 0.7966401726722404, + "grad_norm": 0.6001205444335938, + "learning_rate": 1.972182933362834e-06, + "loss": 0.209, + "step": 35740 + }, + { + "epoch": 0.7967516220528604, + "grad_norm": 0.3347148299217224, + "learning_rate": 1.97009568392861e-06, + "loss": 0.3489, + "step": 35745 + }, + { + "epoch": 0.7968630714334806, + "grad_norm": 0.5797106027603149, + "learning_rate": 1.9680094188986767e-06, + "loss": 0.3299, + "step": 35750 + }, + { + "epoch": 0.7969745208141006, + "grad_norm": 0.5811283588409424, + "learning_rate": 1.965924138528791e-06, + "loss": 0.3119, + "step": 35755 + }, + { + "epoch": 0.7970859701947206, + "grad_norm": 0.543427050113678, + "learning_rate": 1.963839843074593e-06, + "loss": 0.2819, + "step": 35760 + }, + { + "epoch": 0.7971974195753407, + "grad_norm": 0.9709669947624207, + "learning_rate": 1.9617565327915966e-06, + "loss": 0.2298, + "step": 35765 + }, + { + "epoch": 0.7973088689559608, + "grad_norm": 0.5826546549797058, + "learning_rate": 1.959674207935207e-06, + "loss": 0.3116, + "step": 35770 + }, + { + "epoch": 0.7974203183365809, + "grad_norm": 0.4199870526790619, + "learning_rate": 1.9575928687606983e-06, + "loss": 0.3215, + "step": 35775 + }, + { + "epoch": 0.7975317677172009, + "grad_norm": 0.6433164477348328, + "learning_rate": 1.9555125155232223e-06, + "loss": 0.218, + "step": 35780 + }, + { + "epoch": 0.7976432170978209, + "grad_norm": 0.6210421919822693, + "learning_rate": 1.9534331484778212e-06, + "loss": 0.2374, + "step": 35785 + }, + { + "epoch": 0.7977546664784411, + "grad_norm": 0.6035399436950684, + "learning_rate": 1.9513547678794065e-06, + "loss": 0.4207, + "step": 35790 + }, + { + "epoch": 0.7978661158590611, + "grad_norm": 0.5844293832778931, + "learning_rate": 1.949277373982769e-06, + "loss": 0.3032, + "step": 35795 + }, + { + "epoch": 0.7979775652396812, + "grad_norm": 0.7550781965255737, + "learning_rate": 1.947200967042584e-06, + "loss": 0.2134, + "step": 35800 + }, + { + "epoch": 0.7980890146203012, + "grad_norm": 0.4585762619972229, + "learning_rate": 1.9451255473134046e-06, + "loss": 0.2652, + "step": 35805 + }, + { + "epoch": 0.7982004640009213, + "grad_norm": 0.7412415742874146, + "learning_rate": 1.9430511150496576e-06, + "loss": 0.4184, + "step": 35810 + }, + { + "epoch": 0.7983119133815414, + "grad_norm": 0.7571333646774292, + "learning_rate": 1.9409776705056514e-06, + "loss": 0.3278, + "step": 35815 + }, + { + "epoch": 0.7984233627621614, + "grad_norm": 0.6268478035926819, + "learning_rate": 1.93890521393558e-06, + "loss": 0.2156, + "step": 35820 + }, + { + "epoch": 0.7985348121427815, + "grad_norm": 0.7117454409599304, + "learning_rate": 1.936833745593504e-06, + "loss": 0.2145, + "step": 35825 + }, + { + "epoch": 0.7986462615234016, + "grad_norm": 0.42261865735054016, + "learning_rate": 1.934763265733376e-06, + "loss": 0.2468, + "step": 35830 + }, + { + "epoch": 0.7987577109040217, + "grad_norm": 0.547375500202179, + "learning_rate": 1.932693774609017e-06, + "loss": 0.3163, + "step": 35835 + }, + { + "epoch": 0.7988691602846417, + "grad_norm": 0.6775712370872498, + "learning_rate": 1.9306252724741305e-06, + "loss": 0.394, + "step": 35840 + }, + { + "epoch": 0.7989806096652617, + "grad_norm": 1.0392708778381348, + "learning_rate": 1.9285577595823002e-06, + "loss": 0.2424, + "step": 35845 + }, + { + "epoch": 0.7990920590458819, + "grad_norm": 1.004452109336853, + "learning_rate": 1.9264912361869847e-06, + "loss": 0.3627, + "step": 35850 + }, + { + "epoch": 0.7992035084265019, + "grad_norm": 1.1012901067733765, + "learning_rate": 1.9244257025415247e-06, + "loss": 0.2651, + "step": 35855 + }, + { + "epoch": 0.799314957807122, + "grad_norm": 0.5909172296524048, + "learning_rate": 1.922361158899143e-06, + "loss": 0.2275, + "step": 35860 + }, + { + "epoch": 0.799426407187742, + "grad_norm": 0.7913104891777039, + "learning_rate": 1.920297605512933e-06, + "loss": 0.1997, + "step": 35865 + }, + { + "epoch": 0.7995378565683621, + "grad_norm": 0.7826907634735107, + "learning_rate": 1.918235042635871e-06, + "loss": 0.3624, + "step": 35870 + }, + { + "epoch": 0.7996493059489822, + "grad_norm": 0.6586305499076843, + "learning_rate": 1.9161734705208114e-06, + "loss": 0.2755, + "step": 35875 + }, + { + "epoch": 0.7997607553296022, + "grad_norm": 0.6121078133583069, + "learning_rate": 1.9141128894204863e-06, + "loss": 0.2498, + "step": 35880 + }, + { + "epoch": 0.7998722047102224, + "grad_norm": 0.5371085405349731, + "learning_rate": 1.912053299587505e-06, + "loss": 0.2658, + "step": 35885 + }, + { + "epoch": 0.7999836540908424, + "grad_norm": 0.4897899925708771, + "learning_rate": 1.909994701274359e-06, + "loss": 0.2947, + "step": 35890 + }, + { + "epoch": 0.8000951034714625, + "grad_norm": 0.6061673164367676, + "learning_rate": 1.9079370947334218e-06, + "loss": 0.2394, + "step": 35895 + }, + { + "epoch": 0.8002065528520825, + "grad_norm": 0.6482059359550476, + "learning_rate": 1.905880480216934e-06, + "loss": 0.1842, + "step": 35900 + }, + { + "epoch": 0.8003180022327026, + "grad_norm": 0.5653631091117859, + "learning_rate": 1.9038248579770234e-06, + "loss": 0.3537, + "step": 35905 + }, + { + "epoch": 0.8004294516133227, + "grad_norm": 0.4235904812812805, + "learning_rate": 1.9017702282656913e-06, + "loss": 0.3208, + "step": 35910 + }, + { + "epoch": 0.8005409009939427, + "grad_norm": 0.5391014814376831, + "learning_rate": 1.8997165913348191e-06, + "loss": 0.3414, + "step": 35915 + }, + { + "epoch": 0.8006523503745628, + "grad_norm": 0.6648342609405518, + "learning_rate": 1.897663947436167e-06, + "loss": 0.2192, + "step": 35920 + }, + { + "epoch": 0.8007637997551829, + "grad_norm": 0.6008848547935486, + "learning_rate": 1.8956122968213787e-06, + "loss": 0.3137, + "step": 35925 + }, + { + "epoch": 0.8008752491358029, + "grad_norm": 0.9569928050041199, + "learning_rate": 1.8935616397419653e-06, + "loss": 0.4199, + "step": 35930 + }, + { + "epoch": 0.800986698516423, + "grad_norm": 0.8108124136924744, + "learning_rate": 1.8915119764493229e-06, + "loss": 0.2181, + "step": 35935 + }, + { + "epoch": 0.801098147897043, + "grad_norm": 0.6100592017173767, + "learning_rate": 1.8894633071947245e-06, + "loss": 0.2611, + "step": 35940 + }, + { + "epoch": 0.8012095972776632, + "grad_norm": 0.5053665637969971, + "learning_rate": 1.887415632229318e-06, + "loss": 0.3694, + "step": 35945 + }, + { + "epoch": 0.8013210466582832, + "grad_norm": 0.5933337211608887, + "learning_rate": 1.8853689518041385e-06, + "loss": 0.1836, + "step": 35950 + }, + { + "epoch": 0.8014324960389033, + "grad_norm": 0.5199214220046997, + "learning_rate": 1.8833232661700873e-06, + "loss": 0.2262, + "step": 35955 + }, + { + "epoch": 0.8015439454195233, + "grad_norm": 0.790266752243042, + "learning_rate": 1.881278575577955e-06, + "loss": 0.3876, + "step": 35960 + }, + { + "epoch": 0.8016553948001434, + "grad_norm": 0.6568781733512878, + "learning_rate": 1.8792348802784022e-06, + "loss": 0.2912, + "step": 35965 + }, + { + "epoch": 0.8017668441807635, + "grad_norm": 0.7379553914070129, + "learning_rate": 1.8771921805219705e-06, + "loss": 0.2439, + "step": 35970 + }, + { + "epoch": 0.8018782935613835, + "grad_norm": 0.808842122554779, + "learning_rate": 1.87515047655908e-06, + "loss": 0.3986, + "step": 35975 + }, + { + "epoch": 0.8019897429420036, + "grad_norm": 0.2748057544231415, + "learning_rate": 1.8731097686400236e-06, + "loss": 0.2027, + "step": 35980 + }, + { + "epoch": 0.8021011923226237, + "grad_norm": 0.7190350294113159, + "learning_rate": 1.871070057014982e-06, + "loss": 0.2014, + "step": 35985 + }, + { + "epoch": 0.8022126417032437, + "grad_norm": 0.8670672178268433, + "learning_rate": 1.8690313419340055e-06, + "loss": 0.3295, + "step": 35990 + }, + { + "epoch": 0.8023240910838638, + "grad_norm": 0.8307561874389648, + "learning_rate": 1.8669936236470221e-06, + "loss": 0.2391, + "step": 35995 + }, + { + "epoch": 0.8024355404644838, + "grad_norm": 0.5368415117263794, + "learning_rate": 1.8649569024038472e-06, + "loss": 0.2935, + "step": 36000 + }, + { + "epoch": 0.802546989845104, + "grad_norm": 0.6144288182258606, + "learning_rate": 1.8629211784541623e-06, + "loss": 0.2763, + "step": 36005 + }, + { + "epoch": 0.802658439225724, + "grad_norm": 0.662746012210846, + "learning_rate": 1.86088645204753e-06, + "loss": 0.3804, + "step": 36010 + }, + { + "epoch": 0.802769888606344, + "grad_norm": 0.6758759617805481, + "learning_rate": 1.8588527234333963e-06, + "loss": 0.2599, + "step": 36015 + }, + { + "epoch": 0.8028813379869641, + "grad_norm": 0.5104584693908691, + "learning_rate": 1.8568199928610798e-06, + "loss": 0.2994, + "step": 36020 + }, + { + "epoch": 0.8029927873675842, + "grad_norm": 0.5354732275009155, + "learning_rate": 1.8547882605797763e-06, + "loss": 0.3458, + "step": 36025 + }, + { + "epoch": 0.8031042367482043, + "grad_norm": 1.0208121538162231, + "learning_rate": 1.8527575268385566e-06, + "loss": 0.2628, + "step": 36030 + }, + { + "epoch": 0.8032156861288243, + "grad_norm": 0.3906199038028717, + "learning_rate": 1.8507277918863808e-06, + "loss": 0.2572, + "step": 36035 + }, + { + "epoch": 0.8033271355094445, + "grad_norm": 0.7649385333061218, + "learning_rate": 1.848699055972073e-06, + "loss": 0.2585, + "step": 36040 + }, + { + "epoch": 0.8034385848900645, + "grad_norm": 0.7261055111885071, + "learning_rate": 1.8466713193443442e-06, + "loss": 0.2428, + "step": 36045 + }, + { + "epoch": 0.8035500342706845, + "grad_norm": 1.1180109977722168, + "learning_rate": 1.8446445822517778e-06, + "loss": 0.2716, + "step": 36050 + }, + { + "epoch": 0.8036614836513046, + "grad_norm": 0.8348441123962402, + "learning_rate": 1.842618844942836e-06, + "loss": 0.3549, + "step": 36055 + }, + { + "epoch": 0.8037729330319247, + "grad_norm": 0.9582706689834595, + "learning_rate": 1.8405941076658584e-06, + "loss": 0.3277, + "step": 36060 + }, + { + "epoch": 0.8038843824125448, + "grad_norm": 0.7937244176864624, + "learning_rate": 1.8385703706690605e-06, + "loss": 0.3877, + "step": 36065 + }, + { + "epoch": 0.8039958317931648, + "grad_norm": 0.4820943772792816, + "learning_rate": 1.8365476342005407e-06, + "loss": 0.1227, + "step": 36070 + }, + { + "epoch": 0.8041072811737848, + "grad_norm": 1.0704084634780884, + "learning_rate": 1.8345258985082658e-06, + "loss": 0.3512, + "step": 36075 + }, + { + "epoch": 0.804218730554405, + "grad_norm": 0.836803138256073, + "learning_rate": 1.8325051638400903e-06, + "loss": 0.2548, + "step": 36080 + }, + { + "epoch": 0.804330179935025, + "grad_norm": 0.5254769921302795, + "learning_rate": 1.8304854304437391e-06, + "loss": 0.225, + "step": 36085 + }, + { + "epoch": 0.8044416293156451, + "grad_norm": 0.7195600271224976, + "learning_rate": 1.8284666985668142e-06, + "loss": 0.2046, + "step": 36090 + }, + { + "epoch": 0.8045530786962651, + "grad_norm": 0.598730206489563, + "learning_rate": 1.8264489684567987e-06, + "loss": 0.2834, + "step": 36095 + }, + { + "epoch": 0.8046645280768853, + "grad_norm": 0.7976406216621399, + "learning_rate": 1.824432240361046e-06, + "loss": 0.3223, + "step": 36100 + }, + { + "epoch": 0.8047759774575053, + "grad_norm": 0.760412335395813, + "learning_rate": 1.8224165145267947e-06, + "loss": 0.3259, + "step": 36105 + }, + { + "epoch": 0.8048874268381253, + "grad_norm": 0.614362359046936, + "learning_rate": 1.8204017912011606e-06, + "loss": 0.1954, + "step": 36110 + }, + { + "epoch": 0.8049988762187454, + "grad_norm": 0.2477876991033554, + "learning_rate": 1.8183880706311308e-06, + "loss": 0.2068, + "step": 36115 + }, + { + "epoch": 0.8051103255993655, + "grad_norm": 0.6392495632171631, + "learning_rate": 1.8163753530635698e-06, + "loss": 0.2686, + "step": 36120 + }, + { + "epoch": 0.8052217749799856, + "grad_norm": 0.7322916388511658, + "learning_rate": 1.8143636387452236e-06, + "loss": 0.2283, + "step": 36125 + }, + { + "epoch": 0.8053332243606056, + "grad_norm": 0.5651661157608032, + "learning_rate": 1.8123529279227092e-06, + "loss": 0.3185, + "step": 36130 + }, + { + "epoch": 0.8054446737412256, + "grad_norm": 0.5761824250221252, + "learning_rate": 1.8103432208425264e-06, + "loss": 0.2629, + "step": 36135 + }, + { + "epoch": 0.8055561231218458, + "grad_norm": 0.5522593855857849, + "learning_rate": 1.8083345177510536e-06, + "loss": 0.2671, + "step": 36140 + }, + { + "epoch": 0.8056675725024658, + "grad_norm": 0.524655818939209, + "learning_rate": 1.8063268188945382e-06, + "loss": 0.2215, + "step": 36145 + }, + { + "epoch": 0.8057790218830859, + "grad_norm": 0.8033666014671326, + "learning_rate": 1.80432012451911e-06, + "loss": 0.4037, + "step": 36150 + }, + { + "epoch": 0.8058904712637059, + "grad_norm": 0.8125406503677368, + "learning_rate": 1.8023144348707733e-06, + "loss": 0.3197, + "step": 36155 + }, + { + "epoch": 0.806001920644326, + "grad_norm": 0.7958818674087524, + "learning_rate": 1.8003097501954081e-06, + "loss": 0.2377, + "step": 36160 + }, + { + "epoch": 0.8061133700249461, + "grad_norm": 0.5927658677101135, + "learning_rate": 1.798306070738778e-06, + "loss": 0.3152, + "step": 36165 + }, + { + "epoch": 0.8062248194055661, + "grad_norm": 0.7656975984573364, + "learning_rate": 1.7963033967465127e-06, + "loss": 0.3435, + "step": 36170 + }, + { + "epoch": 0.8063362687861862, + "grad_norm": 0.5137174129486084, + "learning_rate": 1.7943017284641317e-06, + "loss": 0.2395, + "step": 36175 + }, + { + "epoch": 0.8064477181668063, + "grad_norm": 0.9598259329795837, + "learning_rate": 1.792301066137021e-06, + "loss": 0.2673, + "step": 36180 + }, + { + "epoch": 0.8065591675474264, + "grad_norm": 0.6322871446609497, + "learning_rate": 1.7903014100104455e-06, + "loss": 0.2864, + "step": 36185 + }, + { + "epoch": 0.8066706169280464, + "grad_norm": 0.7137151956558228, + "learning_rate": 1.7883027603295479e-06, + "loss": 0.1954, + "step": 36190 + }, + { + "epoch": 0.8067820663086664, + "grad_norm": 0.5784446597099304, + "learning_rate": 1.7863051173393442e-06, + "loss": 0.3839, + "step": 36195 + }, + { + "epoch": 0.8068935156892866, + "grad_norm": 1.2818819284439087, + "learning_rate": 1.7843084812847367e-06, + "loss": 0.2935, + "step": 36200 + }, + { + "epoch": 0.8070049650699066, + "grad_norm": 1.0318589210510254, + "learning_rate": 1.7823128524104905e-06, + "loss": 0.3096, + "step": 36205 + }, + { + "epoch": 0.8071164144505267, + "grad_norm": 0.47286149859428406, + "learning_rate": 1.780318230961261e-06, + "loss": 0.321, + "step": 36210 + }, + { + "epoch": 0.8072278638311468, + "grad_norm": 0.7115129828453064, + "learning_rate": 1.7783246171815694e-06, + "loss": 0.3974, + "step": 36215 + }, + { + "epoch": 0.8073393132117668, + "grad_norm": 0.8326082229614258, + "learning_rate": 1.7763320113158188e-06, + "loss": 0.3417, + "step": 36220 + }, + { + "epoch": 0.8074507625923869, + "grad_norm": 0.6177644729614258, + "learning_rate": 1.7743404136082843e-06, + "loss": 0.3237, + "step": 36225 + }, + { + "epoch": 0.8075622119730069, + "grad_norm": 0.5992699861526489, + "learning_rate": 1.7723498243031246e-06, + "loss": 0.3652, + "step": 36230 + }, + { + "epoch": 0.8076736613536271, + "grad_norm": 0.5553713440895081, + "learning_rate": 1.770360243644369e-06, + "loss": 0.3442, + "step": 36235 + }, + { + "epoch": 0.8077851107342471, + "grad_norm": 0.4120601415634155, + "learning_rate": 1.7683716718759224e-06, + "loss": 0.3071, + "step": 36240 + }, + { + "epoch": 0.8078965601148672, + "grad_norm": 0.44997352361679077, + "learning_rate": 1.7663841092415723e-06, + "loss": 0.2397, + "step": 36245 + }, + { + "epoch": 0.8080080094954872, + "grad_norm": 0.9335451126098633, + "learning_rate": 1.7643975559849768e-06, + "loss": 0.3183, + "step": 36250 + }, + { + "epoch": 0.8081194588761073, + "grad_norm": 0.5336930751800537, + "learning_rate": 1.7624120123496702e-06, + "loss": 0.2958, + "step": 36255 + }, + { + "epoch": 0.8082309082567274, + "grad_norm": 0.8036069273948669, + "learning_rate": 1.7604274785790676e-06, + "loss": 0.3444, + "step": 36260 + }, + { + "epoch": 0.8083423576373474, + "grad_norm": 0.5085418224334717, + "learning_rate": 1.7584439549164578e-06, + "loss": 0.2722, + "step": 36265 + }, + { + "epoch": 0.8084538070179675, + "grad_norm": 0.6753281354904175, + "learning_rate": 1.756461441605003e-06, + "loss": 0.2542, + "step": 36270 + }, + { + "epoch": 0.8085652563985876, + "grad_norm": 0.4116097688674927, + "learning_rate": 1.754479938887742e-06, + "loss": 0.2768, + "step": 36275 + }, + { + "epoch": 0.8086767057792076, + "grad_norm": 0.8679396510124207, + "learning_rate": 1.7524994470075985e-06, + "loss": 0.3315, + "step": 36280 + }, + { + "epoch": 0.8087881551598277, + "grad_norm": 0.534657895565033, + "learning_rate": 1.7505199662073624e-06, + "loss": 0.1846, + "step": 36285 + }, + { + "epoch": 0.8088996045404477, + "grad_norm": 0.6001737713813782, + "learning_rate": 1.7485414967296988e-06, + "loss": 0.2151, + "step": 36290 + }, + { + "epoch": 0.8090110539210679, + "grad_norm": 0.6152361631393433, + "learning_rate": 1.7465640388171589e-06, + "loss": 0.3807, + "step": 36295 + }, + { + "epoch": 0.8091225033016879, + "grad_norm": 0.3379671275615692, + "learning_rate": 1.7445875927121602e-06, + "loss": 0.2696, + "step": 36300 + }, + { + "epoch": 0.809233952682308, + "grad_norm": 0.5682633519172668, + "learning_rate": 1.7426121586570023e-06, + "loss": 0.347, + "step": 36305 + }, + { + "epoch": 0.809345402062928, + "grad_norm": 0.8797348141670227, + "learning_rate": 1.7406377368938531e-06, + "loss": 0.2951, + "step": 36310 + }, + { + "epoch": 0.8094568514435481, + "grad_norm": 0.5881412625312805, + "learning_rate": 1.7386643276647674e-06, + "loss": 0.2931, + "step": 36315 + }, + { + "epoch": 0.8095683008241682, + "grad_norm": 0.5972985625267029, + "learning_rate": 1.7366919312116647e-06, + "loss": 0.3303, + "step": 36320 + }, + { + "epoch": 0.8096797502047882, + "grad_norm": 0.5516791343688965, + "learning_rate": 1.7347205477763508e-06, + "loss": 0.1799, + "step": 36325 + }, + { + "epoch": 0.8097911995854084, + "grad_norm": 0.8413097858428955, + "learning_rate": 1.7327501776004995e-06, + "loss": 0.2569, + "step": 36330 + }, + { + "epoch": 0.8099026489660284, + "grad_norm": 0.42086705565452576, + "learning_rate": 1.7307808209256638e-06, + "loss": 0.2209, + "step": 36335 + }, + { + "epoch": 0.8100140983466484, + "grad_norm": 0.69650799036026, + "learning_rate": 1.7288124779932701e-06, + "loss": 0.2003, + "step": 36340 + }, + { + "epoch": 0.8101255477272685, + "grad_norm": 0.5159130692481995, + "learning_rate": 1.7268451490446203e-06, + "loss": 0.2107, + "step": 36345 + }, + { + "epoch": 0.8102369971078885, + "grad_norm": 0.5900051593780518, + "learning_rate": 1.7248788343208966e-06, + "loss": 0.2757, + "step": 36350 + }, + { + "epoch": 0.8103484464885087, + "grad_norm": 0.44346192479133606, + "learning_rate": 1.7229135340631565e-06, + "loss": 0.3421, + "step": 36355 + }, + { + "epoch": 0.8104598958691287, + "grad_norm": 0.7002114057540894, + "learning_rate": 1.7209492485123281e-06, + "loss": 0.2814, + "step": 36360 + }, + { + "epoch": 0.8105713452497487, + "grad_norm": 0.6705095171928406, + "learning_rate": 1.7189859779092166e-06, + "loss": 0.3148, + "step": 36365 + }, + { + "epoch": 0.8106827946303689, + "grad_norm": 0.5612956881523132, + "learning_rate": 1.717023722494504e-06, + "loss": 0.2804, + "step": 36370 + }, + { + "epoch": 0.8107942440109889, + "grad_norm": 0.6677428483963013, + "learning_rate": 1.715062482508747e-06, + "loss": 0.2004, + "step": 36375 + }, + { + "epoch": 0.810905693391609, + "grad_norm": 0.9647729396820068, + "learning_rate": 1.7131022581923818e-06, + "loss": 0.3318, + "step": 36380 + }, + { + "epoch": 0.811017142772229, + "grad_norm": 0.4130144715309143, + "learning_rate": 1.7111430497857118e-06, + "loss": 0.1961, + "step": 36385 + }, + { + "epoch": 0.8111285921528492, + "grad_norm": 0.8154301047325134, + "learning_rate": 1.709184857528927e-06, + "loss": 0.2715, + "step": 36390 + }, + { + "epoch": 0.8112400415334692, + "grad_norm": 0.954910159111023, + "learning_rate": 1.7072276816620825e-06, + "loss": 0.1962, + "step": 36395 + }, + { + "epoch": 0.8113514909140892, + "grad_norm": 0.5167503356933594, + "learning_rate": 1.7052715224251149e-06, + "loss": 0.2053, + "step": 36400 + }, + { + "epoch": 0.8114629402947093, + "grad_norm": 0.8377135396003723, + "learning_rate": 1.7033163800578322e-06, + "loss": 0.3016, + "step": 36405 + }, + { + "epoch": 0.8115743896753294, + "grad_norm": 0.7488682270050049, + "learning_rate": 1.7013622547999187e-06, + "loss": 0.204, + "step": 36410 + }, + { + "epoch": 0.8116858390559495, + "grad_norm": 0.6839303374290466, + "learning_rate": 1.699409146890938e-06, + "loss": 0.2733, + "step": 36415 + }, + { + "epoch": 0.8117972884365695, + "grad_norm": 0.7828803062438965, + "learning_rate": 1.6974570565703263e-06, + "loss": 0.2779, + "step": 36420 + }, + { + "epoch": 0.8119087378171895, + "grad_norm": 0.6850530505180359, + "learning_rate": 1.6955059840773947e-06, + "loss": 0.2373, + "step": 36425 + }, + { + "epoch": 0.8120201871978097, + "grad_norm": 0.5718730092048645, + "learning_rate": 1.6935559296513271e-06, + "loss": 0.2751, + "step": 36430 + }, + { + "epoch": 0.8121316365784297, + "grad_norm": 0.42931827902793884, + "learning_rate": 1.691606893531188e-06, + "loss": 0.3998, + "step": 36435 + }, + { + "epoch": 0.8122430859590498, + "grad_norm": 0.5094924569129944, + "learning_rate": 1.6896588759559096e-06, + "loss": 0.2464, + "step": 36440 + }, + { + "epoch": 0.8123545353396698, + "grad_norm": 0.4700838029384613, + "learning_rate": 1.6877118771643097e-06, + "loss": 0.293, + "step": 36445 + }, + { + "epoch": 0.81246598472029, + "grad_norm": 0.5998474955558777, + "learning_rate": 1.6857658973950709e-06, + "loss": 0.3073, + "step": 36450 + }, + { + "epoch": 0.81257743410091, + "grad_norm": 0.9076191186904907, + "learning_rate": 1.6838209368867586e-06, + "loss": 0.2482, + "step": 36455 + }, + { + "epoch": 0.81268888348153, + "grad_norm": 0.6209660172462463, + "learning_rate": 1.681876995877808e-06, + "loss": 0.279, + "step": 36460 + }, + { + "epoch": 0.8128003328621501, + "grad_norm": 0.7067864537239075, + "learning_rate": 1.679934074606533e-06, + "loss": 0.2419, + "step": 36465 + }, + { + "epoch": 0.8129117822427702, + "grad_norm": 0.5416485667228699, + "learning_rate": 1.677992173311116e-06, + "loss": 0.2501, + "step": 36470 + }, + { + "epoch": 0.8130232316233903, + "grad_norm": 0.8749811053276062, + "learning_rate": 1.6760512922296245e-06, + "loss": 0.1969, + "step": 36475 + }, + { + "epoch": 0.8131346810040103, + "grad_norm": 0.6520072221755981, + "learning_rate": 1.6741114315999952e-06, + "loss": 0.2354, + "step": 36480 + }, + { + "epoch": 0.8132461303846303, + "grad_norm": 0.8475064635276794, + "learning_rate": 1.672172591660035e-06, + "loss": 0.2563, + "step": 36485 + }, + { + "epoch": 0.8133575797652505, + "grad_norm": 0.5565570592880249, + "learning_rate": 1.6702347726474367e-06, + "loss": 0.2641, + "step": 36490 + }, + { + "epoch": 0.8134690291458705, + "grad_norm": 0.5682485699653625, + "learning_rate": 1.6682979747997586e-06, + "loss": 0.234, + "step": 36495 + }, + { + "epoch": 0.8135804785264906, + "grad_norm": 0.44536277651786804, + "learning_rate": 1.6663621983544387e-06, + "loss": 0.2004, + "step": 36500 + }, + { + "epoch": 0.8136919279071106, + "grad_norm": 0.6493122577667236, + "learning_rate": 1.664427443548785e-06, + "loss": 0.4058, + "step": 36505 + }, + { + "epoch": 0.8138033772877308, + "grad_norm": 0.7767170667648315, + "learning_rate": 1.662493710619988e-06, + "loss": 0.1867, + "step": 36510 + }, + { + "epoch": 0.8139148266683508, + "grad_norm": 0.5790889263153076, + "learning_rate": 1.6605609998051064e-06, + "loss": 0.1945, + "step": 36515 + }, + { + "epoch": 0.8140262760489708, + "grad_norm": 0.777140200138092, + "learning_rate": 1.6586293113410756e-06, + "loss": 0.3477, + "step": 36520 + }, + { + "epoch": 0.814137725429591, + "grad_norm": 0.5361962914466858, + "learning_rate": 1.6566986454647039e-06, + "loss": 0.1921, + "step": 36525 + }, + { + "epoch": 0.814249174810211, + "grad_norm": 0.5239769816398621, + "learning_rate": 1.6547690024126796e-06, + "loss": 0.3521, + "step": 36530 + }, + { + "epoch": 0.8143606241908311, + "grad_norm": 0.9107983112335205, + "learning_rate": 1.6528403824215588e-06, + "loss": 0.2483, + "step": 36535 + }, + { + "epoch": 0.8144720735714511, + "grad_norm": 0.9957351088523865, + "learning_rate": 1.6509127857277784e-06, + "loss": 0.2463, + "step": 36540 + }, + { + "epoch": 0.8145835229520711, + "grad_norm": 0.6087895035743713, + "learning_rate": 1.6489862125676459e-06, + "loss": 0.3351, + "step": 36545 + }, + { + "epoch": 0.8146949723326913, + "grad_norm": 0.3920477032661438, + "learning_rate": 1.6470606631773433e-06, + "loss": 0.2002, + "step": 36550 + }, + { + "epoch": 0.8148064217133113, + "grad_norm": 0.7256163358688354, + "learning_rate": 1.6451361377929298e-06, + "loss": 0.3118, + "step": 36555 + }, + { + "epoch": 0.8149178710939314, + "grad_norm": 0.8055761456489563, + "learning_rate": 1.6432126366503331e-06, + "loss": 0.2417, + "step": 36560 + }, + { + "epoch": 0.8150293204745515, + "grad_norm": 0.909164309501648, + "learning_rate": 1.6412901599853626e-06, + "loss": 0.3491, + "step": 36565 + }, + { + "epoch": 0.8151407698551715, + "grad_norm": 0.4847832918167114, + "learning_rate": 1.6393687080337006e-06, + "loss": 0.1905, + "step": 36570 + }, + { + "epoch": 0.8152522192357916, + "grad_norm": 0.8144800066947937, + "learning_rate": 1.6374482810309022e-06, + "loss": 0.3172, + "step": 36575 + }, + { + "epoch": 0.8153636686164116, + "grad_norm": 0.8546509742736816, + "learning_rate": 1.6355288792123947e-06, + "loss": 0.3299, + "step": 36580 + }, + { + "epoch": 0.8154751179970318, + "grad_norm": 0.8366774916648865, + "learning_rate": 1.633610502813483e-06, + "loss": 0.2728, + "step": 36585 + }, + { + "epoch": 0.8155865673776518, + "grad_norm": 0.6456219553947449, + "learning_rate": 1.6316931520693457e-06, + "loss": 0.2615, + "step": 36590 + }, + { + "epoch": 0.8156980167582719, + "grad_norm": 0.5672926306724548, + "learning_rate": 1.6297768272150315e-06, + "loss": 0.3728, + "step": 36595 + }, + { + "epoch": 0.8158094661388919, + "grad_norm": 0.545039713382721, + "learning_rate": 1.6278615284854705e-06, + "loss": 0.1529, + "step": 36600 + }, + { + "epoch": 0.815920915519512, + "grad_norm": 0.7076376080513, + "learning_rate": 1.6259472561154655e-06, + "loss": 0.2669, + "step": 36605 + }, + { + "epoch": 0.8160323649001321, + "grad_norm": 0.6926854848861694, + "learning_rate": 1.624034010339688e-06, + "loss": 0.3351, + "step": 36610 + }, + { + "epoch": 0.8161438142807521, + "grad_norm": 0.8632144331932068, + "learning_rate": 1.622121791392689e-06, + "loss": 0.294, + "step": 36615 + }, + { + "epoch": 0.8162552636613722, + "grad_norm": 0.6294080018997192, + "learning_rate": 1.6202105995088912e-06, + "loss": 0.2915, + "step": 36620 + }, + { + "epoch": 0.8163667130419923, + "grad_norm": 0.8796729445457458, + "learning_rate": 1.6183004349225895e-06, + "loss": 0.2139, + "step": 36625 + }, + { + "epoch": 0.8164781624226123, + "grad_norm": 0.6016055345535278, + "learning_rate": 1.6163912978679587e-06, + "loss": 0.2296, + "step": 36630 + }, + { + "epoch": 0.8165896118032324, + "grad_norm": 0.601929783821106, + "learning_rate": 1.6144831885790435e-06, + "loss": 0.3887, + "step": 36635 + }, + { + "epoch": 0.8167010611838524, + "grad_norm": 0.6190734505653381, + "learning_rate": 1.612576107289765e-06, + "loss": 0.2295, + "step": 36640 + }, + { + "epoch": 0.8168125105644726, + "grad_norm": 0.6690784692764282, + "learning_rate": 1.6106700542339138e-06, + "loss": 0.3049, + "step": 36645 + }, + { + "epoch": 0.8169239599450926, + "grad_norm": 0.5558091998100281, + "learning_rate": 1.6087650296451584e-06, + "loss": 0.2995, + "step": 36650 + }, + { + "epoch": 0.8170354093257127, + "grad_norm": 0.34144356846809387, + "learning_rate": 1.6068610337570378e-06, + "loss": 0.2802, + "step": 36655 + }, + { + "epoch": 0.8171468587063327, + "grad_norm": 0.6739746332168579, + "learning_rate": 1.6049580668029718e-06, + "loss": 0.2603, + "step": 36660 + }, + { + "epoch": 0.8172583080869528, + "grad_norm": 0.7272350788116455, + "learning_rate": 1.603056129016244e-06, + "loss": 0.4396, + "step": 36665 + }, + { + "epoch": 0.8173697574675729, + "grad_norm": 0.3853004276752472, + "learning_rate": 1.6011552206300229e-06, + "loss": 0.2138, + "step": 36670 + }, + { + "epoch": 0.8174812068481929, + "grad_norm": 0.8203393220901489, + "learning_rate": 1.5992553418773438e-06, + "loss": 0.2679, + "step": 36675 + }, + { + "epoch": 0.817592656228813, + "grad_norm": 0.5697523951530457, + "learning_rate": 1.5973564929911144e-06, + "loss": 0.3115, + "step": 36680 + }, + { + "epoch": 0.8177041056094331, + "grad_norm": 0.6337721943855286, + "learning_rate": 1.5954586742041212e-06, + "loss": 0.3143, + "step": 36685 + }, + { + "epoch": 0.8178155549900531, + "grad_norm": 0.7625837922096252, + "learning_rate": 1.5935618857490198e-06, + "loss": 0.2307, + "step": 36690 + }, + { + "epoch": 0.8179270043706732, + "grad_norm": 0.8012816309928894, + "learning_rate": 1.5916661278583444e-06, + "loss": 0.2297, + "step": 36695 + }, + { + "epoch": 0.8180384537512932, + "grad_norm": 0.49943870306015015, + "learning_rate": 1.5897714007644983e-06, + "loss": 0.215, + "step": 36700 + }, + { + "epoch": 0.8181499031319134, + "grad_norm": 1.0728031396865845, + "learning_rate": 1.5878777046997628e-06, + "loss": 0.2713, + "step": 36705 + }, + { + "epoch": 0.8182613525125334, + "grad_norm": 0.7271302342414856, + "learning_rate": 1.5859850398962896e-06, + "loss": 0.2224, + "step": 36710 + }, + { + "epoch": 0.8183728018931535, + "grad_norm": 0.8357278108596802, + "learning_rate": 1.5840934065861047e-06, + "loss": 0.2635, + "step": 36715 + }, + { + "epoch": 0.8184842512737736, + "grad_norm": 0.4184236526489258, + "learning_rate": 1.582202805001104e-06, + "loss": 0.3066, + "step": 36720 + }, + { + "epoch": 0.8185957006543936, + "grad_norm": 0.4964730143547058, + "learning_rate": 1.5803132353730665e-06, + "loss": 0.2489, + "step": 36725 + }, + { + "epoch": 0.8187071500350137, + "grad_norm": 0.7390478849411011, + "learning_rate": 1.578424697933637e-06, + "loss": 0.2571, + "step": 36730 + }, + { + "epoch": 0.8188185994156337, + "grad_norm": 0.42086485028266907, + "learning_rate": 1.5765371929143326e-06, + "loss": 0.3055, + "step": 36735 + }, + { + "epoch": 0.8189300487962539, + "grad_norm": 0.7112220525741577, + "learning_rate": 1.574650720546551e-06, + "loss": 0.359, + "step": 36740 + }, + { + "epoch": 0.8190414981768739, + "grad_norm": 0.828458845615387, + "learning_rate": 1.5727652810615568e-06, + "loss": 0.2786, + "step": 36745 + }, + { + "epoch": 0.8191529475574939, + "grad_norm": 0.2417992502450943, + "learning_rate": 1.5708808746904891e-06, + "loss": 0.3147, + "step": 36750 + }, + { + "epoch": 0.819264396938114, + "grad_norm": 0.22680948674678802, + "learning_rate": 1.5689975016643666e-06, + "loss": 0.3218, + "step": 36755 + }, + { + "epoch": 0.8193758463187341, + "grad_norm": 0.8272700905799866, + "learning_rate": 1.5671151622140723e-06, + "loss": 0.2564, + "step": 36760 + }, + { + "epoch": 0.8194872956993542, + "grad_norm": 0.8969610333442688, + "learning_rate": 1.5652338565703673e-06, + "loss": 0.2409, + "step": 36765 + }, + { + "epoch": 0.8195987450799742, + "grad_norm": 0.769127607345581, + "learning_rate": 1.5633535849638825e-06, + "loss": 0.2854, + "step": 36770 + }, + { + "epoch": 0.8197101944605942, + "grad_norm": 0.5934464931488037, + "learning_rate": 1.5614743476251294e-06, + "loss": 0.3501, + "step": 36775 + }, + { + "epoch": 0.8198216438412144, + "grad_norm": 0.4663730561733246, + "learning_rate": 1.5595961447844843e-06, + "loss": 0.2903, + "step": 36780 + }, + { + "epoch": 0.8199330932218344, + "grad_norm": 0.8412685394287109, + "learning_rate": 1.5577189766722034e-06, + "loss": 0.3296, + "step": 36785 + }, + { + "epoch": 0.8200445426024545, + "grad_norm": 0.8940569162368774, + "learning_rate": 1.5558428435184126e-06, + "loss": 0.2503, + "step": 36790 + }, + { + "epoch": 0.8201559919830745, + "grad_norm": 0.7685565948486328, + "learning_rate": 1.5539677455531088e-06, + "loss": 0.258, + "step": 36795 + }, + { + "epoch": 0.8202674413636947, + "grad_norm": 0.6796576380729675, + "learning_rate": 1.5520936830061672e-06, + "loss": 0.2695, + "step": 36800 + }, + { + "epoch": 0.8203788907443147, + "grad_norm": 0.5190930366516113, + "learning_rate": 1.5502206561073296e-06, + "loss": 0.2867, + "step": 36805 + }, + { + "epoch": 0.8204903401249347, + "grad_norm": 0.35317862033843994, + "learning_rate": 1.54834866508622e-06, + "loss": 0.338, + "step": 36810 + }, + { + "epoch": 0.8206017895055548, + "grad_norm": 0.44570088386535645, + "learning_rate": 1.5464777101723237e-06, + "loss": 0.2515, + "step": 36815 + }, + { + "epoch": 0.8207132388861749, + "grad_norm": 0.9127787947654724, + "learning_rate": 1.5446077915950131e-06, + "loss": 0.2184, + "step": 36820 + }, + { + "epoch": 0.820824688266795, + "grad_norm": 0.8880887031555176, + "learning_rate": 1.542738909583521e-06, + "loss": 0.2583, + "step": 36825 + }, + { + "epoch": 0.820936137647415, + "grad_norm": 0.588434636592865, + "learning_rate": 1.5408710643669578e-06, + "loss": 0.2506, + "step": 36830 + }, + { + "epoch": 0.821047587028035, + "grad_norm": 0.6682761907577515, + "learning_rate": 1.539004256174308e-06, + "loss": 0.2622, + "step": 36835 + }, + { + "epoch": 0.8211590364086552, + "grad_norm": 0.5779880881309509, + "learning_rate": 1.537138485234425e-06, + "loss": 0.2864, + "step": 36840 + }, + { + "epoch": 0.8212704857892752, + "grad_norm": 0.5295442938804626, + "learning_rate": 1.5352737517760407e-06, + "loss": 0.3275, + "step": 36845 + }, + { + "epoch": 0.8213819351698953, + "grad_norm": 0.866150438785553, + "learning_rate": 1.5334100560277599e-06, + "loss": 0.3419, + "step": 36850 + }, + { + "epoch": 0.8214933845505153, + "grad_norm": 1.006531834602356, + "learning_rate": 1.5315473982180528e-06, + "loss": 0.2724, + "step": 36855 + }, + { + "epoch": 0.8216048339311355, + "grad_norm": 0.4456956088542938, + "learning_rate": 1.5296857785752694e-06, + "loss": 0.2642, + "step": 36860 + }, + { + "epoch": 0.8217162833117555, + "grad_norm": 0.5592550039291382, + "learning_rate": 1.5278251973276281e-06, + "loss": 0.2101, + "step": 36865 + }, + { + "epoch": 0.8218277326923755, + "grad_norm": 0.5776540637016296, + "learning_rate": 1.525965654703221e-06, + "loss": 0.2222, + "step": 36870 + }, + { + "epoch": 0.8219391820729957, + "grad_norm": 0.7175986766815186, + "learning_rate": 1.5241071509300143e-06, + "loss": 0.3245, + "step": 36875 + }, + { + "epoch": 0.8220506314536157, + "grad_norm": 0.7773592472076416, + "learning_rate": 1.5222496862358494e-06, + "loss": 0.3101, + "step": 36880 + }, + { + "epoch": 0.8221620808342358, + "grad_norm": 0.6264556646347046, + "learning_rate": 1.520393260848435e-06, + "loss": 0.3531, + "step": 36885 + }, + { + "epoch": 0.8222735302148558, + "grad_norm": 0.6289038062095642, + "learning_rate": 1.5185378749953538e-06, + "loss": 0.2971, + "step": 36890 + }, + { + "epoch": 0.8223849795954759, + "grad_norm": 0.6231863498687744, + "learning_rate": 1.5166835289040626e-06, + "loss": 0.2296, + "step": 36895 + }, + { + "epoch": 0.822496428976096, + "grad_norm": 0.6649795770645142, + "learning_rate": 1.5148302228018897e-06, + "loss": 0.3292, + "step": 36900 + }, + { + "epoch": 0.822607878356716, + "grad_norm": 0.6453999876976013, + "learning_rate": 1.5129779569160342e-06, + "loss": 0.2511, + "step": 36905 + }, + { + "epoch": 0.8227193277373361, + "grad_norm": 0.6257741451263428, + "learning_rate": 1.5111267314735712e-06, + "loss": 0.2089, + "step": 36910 + }, + { + "epoch": 0.8228307771179562, + "grad_norm": 0.4796069860458374, + "learning_rate": 1.509276546701448e-06, + "loss": 0.2014, + "step": 36915 + }, + { + "epoch": 0.8229422264985762, + "grad_norm": 0.784619927406311, + "learning_rate": 1.5074274028264835e-06, + "loss": 0.2584, + "step": 36920 + }, + { + "epoch": 0.8230536758791963, + "grad_norm": 0.4626988470554352, + "learning_rate": 1.505579300075366e-06, + "loss": 0.2602, + "step": 36925 + }, + { + "epoch": 0.8231651252598163, + "grad_norm": 1.1895695924758911, + "learning_rate": 1.503732238674659e-06, + "loss": 0.3199, + "step": 36930 + }, + { + "epoch": 0.8232765746404365, + "grad_norm": 0.4978344142436981, + "learning_rate": 1.5018862188507965e-06, + "loss": 0.1895, + "step": 36935 + }, + { + "epoch": 0.8233880240210565, + "grad_norm": 0.6750534772872925, + "learning_rate": 1.5000412408300914e-06, + "loss": 0.2036, + "step": 36940 + }, + { + "epoch": 0.8234994734016766, + "grad_norm": 0.7439817190170288, + "learning_rate": 1.4981973048387177e-06, + "loss": 0.229, + "step": 36945 + }, + { + "epoch": 0.8236109227822966, + "grad_norm": 0.5261428356170654, + "learning_rate": 1.4963544111027316e-06, + "loss": 0.2848, + "step": 36950 + }, + { + "epoch": 0.8237223721629167, + "grad_norm": 0.5842021703720093, + "learning_rate": 1.494512559848058e-06, + "loss": 0.2968, + "step": 36955 + }, + { + "epoch": 0.8238338215435368, + "grad_norm": 0.6377136707305908, + "learning_rate": 1.4926717513004928e-06, + "loss": 0.3559, + "step": 36960 + }, + { + "epoch": 0.8239452709241568, + "grad_norm": 0.5633856654167175, + "learning_rate": 1.4908319856857012e-06, + "loss": 0.3856, + "step": 36965 + }, + { + "epoch": 0.824056720304777, + "grad_norm": 0.6232241988182068, + "learning_rate": 1.4889932632292292e-06, + "loss": 0.268, + "step": 36970 + }, + { + "epoch": 0.824168169685397, + "grad_norm": 0.48467230796813965, + "learning_rate": 1.4871555841564889e-06, + "loss": 0.2913, + "step": 36975 + }, + { + "epoch": 0.824279619066017, + "grad_norm": 0.643015444278717, + "learning_rate": 1.4853189486927634e-06, + "loss": 0.3131, + "step": 36980 + }, + { + "epoch": 0.8243910684466371, + "grad_norm": 0.612481951713562, + "learning_rate": 1.4834833570632123e-06, + "loss": 0.1504, + "step": 36985 + }, + { + "epoch": 0.8245025178272571, + "grad_norm": 0.4606129229068756, + "learning_rate": 1.481648809492865e-06, + "loss": 0.2039, + "step": 36990 + }, + { + "epoch": 0.8246139672078773, + "grad_norm": 0.8010572195053101, + "learning_rate": 1.4798153062066223e-06, + "loss": 0.3349, + "step": 36995 + }, + { + "epoch": 0.8247254165884973, + "grad_norm": 0.6135943531990051, + "learning_rate": 1.4779828474292545e-06, + "loss": 0.3115, + "step": 37000 + }, + { + "epoch": 0.8248368659691174, + "grad_norm": 0.8414220213890076, + "learning_rate": 1.4761514333854121e-06, + "loss": 0.368, + "step": 37005 + }, + { + "epoch": 0.8249483153497374, + "grad_norm": 0.7408134341239929, + "learning_rate": 1.4743210642996108e-06, + "loss": 0.2246, + "step": 37010 + }, + { + "epoch": 0.8250597647303575, + "grad_norm": 0.5923077464103699, + "learning_rate": 1.4724917403962359e-06, + "loss": 0.2294, + "step": 37015 + }, + { + "epoch": 0.8251712141109776, + "grad_norm": 0.7077345252037048, + "learning_rate": 1.470663461899553e-06, + "loss": 0.3896, + "step": 37020 + }, + { + "epoch": 0.8252826634915976, + "grad_norm": 0.48106616735458374, + "learning_rate": 1.4688362290336944e-06, + "loss": 0.2708, + "step": 37025 + }, + { + "epoch": 0.8253941128722178, + "grad_norm": 0.4276266396045685, + "learning_rate": 1.4670100420226608e-06, + "loss": 0.1939, + "step": 37030 + }, + { + "epoch": 0.8255055622528378, + "grad_norm": 0.5082858204841614, + "learning_rate": 1.465184901090334e-06, + "loss": 0.1899, + "step": 37035 + }, + { + "epoch": 0.8256170116334578, + "grad_norm": 0.6479561924934387, + "learning_rate": 1.4633608064604598e-06, + "loss": 0.2502, + "step": 37040 + }, + { + "epoch": 0.8257284610140779, + "grad_norm": 0.6713269948959351, + "learning_rate": 1.461537758356657e-06, + "loss": 0.3826, + "step": 37045 + }, + { + "epoch": 0.825839910394698, + "grad_norm": 0.7276782393455505, + "learning_rate": 1.459715757002419e-06, + "loss": 0.2784, + "step": 37050 + }, + { + "epoch": 0.8259513597753181, + "grad_norm": 1.0333011150360107, + "learning_rate": 1.457894802621106e-06, + "loss": 0.3731, + "step": 37055 + }, + { + "epoch": 0.8260628091559381, + "grad_norm": 0.4250289797782898, + "learning_rate": 1.456074895435955e-06, + "loss": 0.2192, + "step": 37060 + }, + { + "epoch": 0.8261742585365582, + "grad_norm": 0.6744095683097839, + "learning_rate": 1.4542560356700764e-06, + "loss": 0.2691, + "step": 37065 + }, + { + "epoch": 0.8262857079171783, + "grad_norm": 0.6844059228897095, + "learning_rate": 1.4524382235464429e-06, + "loss": 0.2293, + "step": 37070 + }, + { + "epoch": 0.8263971572977983, + "grad_norm": 0.8135658502578735, + "learning_rate": 1.4506214592879075e-06, + "loss": 0.3692, + "step": 37075 + }, + { + "epoch": 0.8265086066784184, + "grad_norm": 0.43851977586746216, + "learning_rate": 1.4488057431171897e-06, + "loss": 0.2361, + "step": 37080 + }, + { + "epoch": 0.8266200560590384, + "grad_norm": 0.720723569393158, + "learning_rate": 1.4469910752568805e-06, + "loss": 0.3309, + "step": 37085 + }, + { + "epoch": 0.8267315054396586, + "grad_norm": 0.7780085802078247, + "learning_rate": 1.4451774559294463e-06, + "loss": 0.2542, + "step": 37090 + }, + { + "epoch": 0.8268429548202786, + "grad_norm": 0.5620120167732239, + "learning_rate": 1.4433648853572247e-06, + "loss": 0.3395, + "step": 37095 + }, + { + "epoch": 0.8269544042008986, + "grad_norm": 0.2839726507663727, + "learning_rate": 1.4415533637624214e-06, + "loss": 0.2525, + "step": 37100 + }, + { + "epoch": 0.8270658535815187, + "grad_norm": 0.6484590172767639, + "learning_rate": 1.4397428913671153e-06, + "loss": 0.3056, + "step": 37105 + }, + { + "epoch": 0.8271773029621388, + "grad_norm": 0.5020953416824341, + "learning_rate": 1.4379334683932544e-06, + "loss": 0.2167, + "step": 37110 + }, + { + "epoch": 0.8272887523427589, + "grad_norm": 0.7176340818405151, + "learning_rate": 1.4361250950626626e-06, + "loss": 0.2863, + "step": 37115 + }, + { + "epoch": 0.8274002017233789, + "grad_norm": 1.1245310306549072, + "learning_rate": 1.434317771597028e-06, + "loss": 0.3414, + "step": 37120 + }, + { + "epoch": 0.8275116511039989, + "grad_norm": 0.5871909856796265, + "learning_rate": 1.4325114982179177e-06, + "loss": 0.285, + "step": 37125 + }, + { + "epoch": 0.8276231004846191, + "grad_norm": 0.34241896867752075, + "learning_rate": 1.4307062751467693e-06, + "loss": 0.1995, + "step": 37130 + }, + { + "epoch": 0.8277345498652391, + "grad_norm": 0.5553747415542603, + "learning_rate": 1.4289021026048865e-06, + "loss": 0.3491, + "step": 37135 + }, + { + "epoch": 0.8278459992458592, + "grad_norm": 0.6614499092102051, + "learning_rate": 1.4270989808134483e-06, + "loss": 0.3176, + "step": 37140 + }, + { + "epoch": 0.8279574486264792, + "grad_norm": 0.7910409569740295, + "learning_rate": 1.425296909993501e-06, + "loss": 0.2726, + "step": 37145 + }, + { + "epoch": 0.8280688980070994, + "grad_norm": 0.9615556597709656, + "learning_rate": 1.4234958903659645e-06, + "loss": 0.2541, + "step": 37150 + }, + { + "epoch": 0.8281803473877194, + "grad_norm": 0.3836075961589813, + "learning_rate": 1.4216959221516336e-06, + "loss": 0.3331, + "step": 37155 + }, + { + "epoch": 0.8282917967683394, + "grad_norm": 0.40252745151519775, + "learning_rate": 1.4198970055711657e-06, + "loss": 0.1764, + "step": 37160 + }, + { + "epoch": 0.8284032461489595, + "grad_norm": 0.485383540391922, + "learning_rate": 1.4180991408450995e-06, + "loss": 0.1294, + "step": 37165 + }, + { + "epoch": 0.8285146955295796, + "grad_norm": 0.5613443851470947, + "learning_rate": 1.416302328193836e-06, + "loss": 0.1982, + "step": 37170 + }, + { + "epoch": 0.8286261449101997, + "grad_norm": 0.32537856698036194, + "learning_rate": 1.4145065678376512e-06, + "loss": 0.2892, + "step": 37175 + }, + { + "epoch": 0.8287375942908197, + "grad_norm": 0.8405792117118835, + "learning_rate": 1.4127118599966895e-06, + "loss": 0.2527, + "step": 37180 + }, + { + "epoch": 0.8288490436714397, + "grad_norm": 0.4815138876438141, + "learning_rate": 1.410918204890972e-06, + "loss": 0.2303, + "step": 37185 + }, + { + "epoch": 0.8289604930520599, + "grad_norm": 0.5533608198165894, + "learning_rate": 1.4091256027403855e-06, + "loss": 0.1856, + "step": 37190 + }, + { + "epoch": 0.8290719424326799, + "grad_norm": 0.6979275941848755, + "learning_rate": 1.4073340537646863e-06, + "loss": 0.2584, + "step": 37195 + }, + { + "epoch": 0.8291833918133, + "grad_norm": 0.8504383563995361, + "learning_rate": 1.4055435581835086e-06, + "loss": 0.2773, + "step": 37200 + }, + { + "epoch": 0.82929484119392, + "grad_norm": 0.7722695469856262, + "learning_rate": 1.4037541162163515e-06, + "loss": 0.3342, + "step": 37205 + }, + { + "epoch": 0.8294062905745402, + "grad_norm": 0.326023668050766, + "learning_rate": 1.4019657280825883e-06, + "loss": 0.3253, + "step": 37210 + }, + { + "epoch": 0.8295177399551602, + "grad_norm": 0.7090151906013489, + "learning_rate": 1.4001783940014568e-06, + "loss": 0.3364, + "step": 37215 + }, + { + "epoch": 0.8296291893357802, + "grad_norm": 0.5238401293754578, + "learning_rate": 1.3983921141920764e-06, + "loss": 0.1887, + "step": 37220 + }, + { + "epoch": 0.8297406387164004, + "grad_norm": 0.6227529048919678, + "learning_rate": 1.3966068888734285e-06, + "loss": 0.2502, + "step": 37225 + }, + { + "epoch": 0.8298520880970204, + "grad_norm": 0.4360678791999817, + "learning_rate": 1.3948227182643647e-06, + "loss": 0.2717, + "step": 37230 + }, + { + "epoch": 0.8299635374776405, + "grad_norm": 0.5678982734680176, + "learning_rate": 1.393039602583618e-06, + "loss": 0.2633, + "step": 37235 + }, + { + "epoch": 0.8300749868582605, + "grad_norm": 0.2946786880493164, + "learning_rate": 1.3912575420497798e-06, + "loss": 0.2416, + "step": 37240 + }, + { + "epoch": 0.8301864362388806, + "grad_norm": 0.9580309391021729, + "learning_rate": 1.3894765368813145e-06, + "loss": 0.2383, + "step": 37245 + }, + { + "epoch": 0.8302978856195007, + "grad_norm": 0.8293169736862183, + "learning_rate": 1.3876965872965654e-06, + "loss": 0.227, + "step": 37250 + }, + { + "epoch": 0.8304093350001207, + "grad_norm": 0.9708387851715088, + "learning_rate": 1.3859176935137387e-06, + "loss": 0.2806, + "step": 37255 + }, + { + "epoch": 0.8305207843807408, + "grad_norm": 0.6555142402648926, + "learning_rate": 1.3841398557509123e-06, + "loss": 0.2103, + "step": 37260 + }, + { + "epoch": 0.8306322337613609, + "grad_norm": 0.7201140522956848, + "learning_rate": 1.382363074226032e-06, + "loss": 0.4568, + "step": 37265 + }, + { + "epoch": 0.830743683141981, + "grad_norm": 0.6834139823913574, + "learning_rate": 1.3805873491569234e-06, + "loss": 0.2772, + "step": 37270 + }, + { + "epoch": 0.830855132522601, + "grad_norm": 0.5923340320587158, + "learning_rate": 1.3788126807612722e-06, + "loss": 0.332, + "step": 37275 + }, + { + "epoch": 0.830966581903221, + "grad_norm": 0.604604184627533, + "learning_rate": 1.3770390692566438e-06, + "loss": 0.1901, + "step": 37280 + }, + { + "epoch": 0.8310780312838412, + "grad_norm": 0.7125951647758484, + "learning_rate": 1.3752665148604661e-06, + "loss": 0.2456, + "step": 37285 + }, + { + "epoch": 0.8311894806644612, + "grad_norm": 0.6670401692390442, + "learning_rate": 1.3734950177900396e-06, + "loss": 0.2768, + "step": 37290 + }, + { + "epoch": 0.8313009300450813, + "grad_norm": 0.8545556664466858, + "learning_rate": 1.3717245782625389e-06, + "loss": 0.3144, + "step": 37295 + }, + { + "epoch": 0.8314123794257013, + "grad_norm": 0.6221092939376831, + "learning_rate": 1.369955196495003e-06, + "loss": 0.1969, + "step": 37300 + }, + { + "epoch": 0.8315238288063214, + "grad_norm": 0.5654235482215881, + "learning_rate": 1.3681868727043478e-06, + "loss": 0.2783, + "step": 37305 + }, + { + "epoch": 0.8316352781869415, + "grad_norm": 0.3323666751384735, + "learning_rate": 1.3664196071073521e-06, + "loss": 0.2179, + "step": 37310 + }, + { + "epoch": 0.8317467275675615, + "grad_norm": 0.7455230951309204, + "learning_rate": 1.3646533999206745e-06, + "loss": 0.3412, + "step": 37315 + }, + { + "epoch": 0.8318581769481816, + "grad_norm": 0.49438366293907166, + "learning_rate": 1.3628882513608343e-06, + "loss": 0.3679, + "step": 37320 + }, + { + "epoch": 0.8319696263288017, + "grad_norm": 0.831476092338562, + "learning_rate": 1.3611241616442273e-06, + "loss": 0.3278, + "step": 37325 + }, + { + "epoch": 0.8320810757094217, + "grad_norm": 0.5302203297615051, + "learning_rate": 1.3593611309871158e-06, + "loss": 0.33, + "step": 37330 + }, + { + "epoch": 0.8321925250900418, + "grad_norm": 0.6043755412101746, + "learning_rate": 1.3575991596056325e-06, + "loss": 0.2399, + "step": 37335 + }, + { + "epoch": 0.8323039744706618, + "grad_norm": 0.4499483108520508, + "learning_rate": 1.3558382477157828e-06, + "loss": 0.2271, + "step": 37340 + }, + { + "epoch": 0.832415423851282, + "grad_norm": 1.055702805519104, + "learning_rate": 1.3540783955334425e-06, + "loss": 0.3462, + "step": 37345 + }, + { + "epoch": 0.832526873231902, + "grad_norm": 0.5674599409103394, + "learning_rate": 1.3523196032743557e-06, + "loss": 0.3179, + "step": 37350 + }, + { + "epoch": 0.8326383226125221, + "grad_norm": 0.7008332014083862, + "learning_rate": 1.3505618711541358e-06, + "loss": 0.3123, + "step": 37355 + }, + { + "epoch": 0.8327497719931422, + "grad_norm": 0.8845453858375549, + "learning_rate": 1.3488051993882668e-06, + "loss": 0.3611, + "step": 37360 + }, + { + "epoch": 0.8328612213737622, + "grad_norm": 0.4716404676437378, + "learning_rate": 1.3470495881921008e-06, + "loss": 0.2243, + "step": 37365 + }, + { + "epoch": 0.8329726707543823, + "grad_norm": 0.6117929220199585, + "learning_rate": 1.3452950377808648e-06, + "loss": 0.258, + "step": 37370 + }, + { + "epoch": 0.8330841201350023, + "grad_norm": 0.7175026535987854, + "learning_rate": 1.3435415483696556e-06, + "loss": 0.3727, + "step": 37375 + }, + { + "epoch": 0.8331955695156225, + "grad_norm": 0.6029932498931885, + "learning_rate": 1.3417891201734356e-06, + "loss": 0.2152, + "step": 37380 + }, + { + "epoch": 0.8333070188962425, + "grad_norm": 0.7872262001037598, + "learning_rate": 1.3400377534070374e-06, + "loss": 0.2302, + "step": 37385 + }, + { + "epoch": 0.8334184682768625, + "grad_norm": 1.1524052619934082, + "learning_rate": 1.338287448285166e-06, + "loss": 0.4157, + "step": 37390 + }, + { + "epoch": 0.8335299176574826, + "grad_norm": 0.7111510038375854, + "learning_rate": 1.336538205022393e-06, + "loss": 0.3132, + "step": 37395 + }, + { + "epoch": 0.8336413670381027, + "grad_norm": 0.7074114084243774, + "learning_rate": 1.3347900238331668e-06, + "loss": 0.2602, + "step": 37400 + }, + { + "epoch": 0.8337528164187228, + "grad_norm": 0.5388669371604919, + "learning_rate": 1.3330429049317972e-06, + "loss": 0.3328, + "step": 37405 + }, + { + "epoch": 0.8338642657993428, + "grad_norm": 0.4883374869823456, + "learning_rate": 1.3312968485324695e-06, + "loss": 0.2247, + "step": 37410 + }, + { + "epoch": 0.8339757151799629, + "grad_norm": 0.383735716342926, + "learning_rate": 1.329551854849237e-06, + "loss": 0.2401, + "step": 37415 + }, + { + "epoch": 0.834087164560583, + "grad_norm": 0.9760423302650452, + "learning_rate": 1.3278079240960217e-06, + "loss": 0.3357, + "step": 37420 + }, + { + "epoch": 0.834198613941203, + "grad_norm": 0.6999823451042175, + "learning_rate": 1.3260650564866151e-06, + "loss": 0.2286, + "step": 37425 + }, + { + "epoch": 0.8343100633218231, + "grad_norm": 0.7534862756729126, + "learning_rate": 1.3243232522346783e-06, + "loss": 0.3578, + "step": 37430 + }, + { + "epoch": 0.8344215127024431, + "grad_norm": 0.6549674272537231, + "learning_rate": 1.3225825115537472e-06, + "loss": 0.2919, + "step": 37435 + }, + { + "epoch": 0.8345329620830633, + "grad_norm": 0.7035777568817139, + "learning_rate": 1.3208428346572189e-06, + "loss": 0.3564, + "step": 37440 + }, + { + "epoch": 0.8346444114636833, + "grad_norm": 0.7054711580276489, + "learning_rate": 1.3191042217583672e-06, + "loss": 0.3436, + "step": 37445 + }, + { + "epoch": 0.8347558608443033, + "grad_norm": 0.781085193157196, + "learning_rate": 1.3173666730703327e-06, + "loss": 0.2596, + "step": 37450 + }, + { + "epoch": 0.8348673102249234, + "grad_norm": 0.7348710894584656, + "learning_rate": 1.315630188806124e-06, + "loss": 0.3375, + "step": 37455 + }, + { + "epoch": 0.8349787596055435, + "grad_norm": 0.7270821928977966, + "learning_rate": 1.3138947691786185e-06, + "loss": 0.2431, + "step": 37460 + }, + { + "epoch": 0.8350902089861636, + "grad_norm": 0.18632780015468597, + "learning_rate": 1.3121604144005717e-06, + "loss": 0.1656, + "step": 37465 + }, + { + "epoch": 0.8352016583667836, + "grad_norm": 0.4481074810028076, + "learning_rate": 1.3104271246845967e-06, + "loss": 0.2322, + "step": 37470 + }, + { + "epoch": 0.8353131077474037, + "grad_norm": 0.7926612496376038, + "learning_rate": 1.3086949002431815e-06, + "loss": 0.3465, + "step": 37475 + }, + { + "epoch": 0.8354245571280238, + "grad_norm": 0.4460538923740387, + "learning_rate": 1.306963741288687e-06, + "loss": 0.3541, + "step": 37480 + }, + { + "epoch": 0.8355360065086438, + "grad_norm": 0.38178062438964844, + "learning_rate": 1.3052336480333372e-06, + "loss": 0.284, + "step": 37485 + }, + { + "epoch": 0.8356474558892639, + "grad_norm": 0.5777224898338318, + "learning_rate": 1.3035046206892277e-06, + "loss": 0.2561, + "step": 37490 + }, + { + "epoch": 0.835758905269884, + "grad_norm": 0.30997785925865173, + "learning_rate": 1.3017766594683267e-06, + "loss": 0.1854, + "step": 37495 + }, + { + "epoch": 0.8358703546505041, + "grad_norm": 0.6693107485771179, + "learning_rate": 1.3000497645824672e-06, + "loss": 0.262, + "step": 37500 + }, + { + "epoch": 0.8359818040311241, + "grad_norm": 0.5152618885040283, + "learning_rate": 1.2983239362433542e-06, + "loss": 0.2762, + "step": 37505 + }, + { + "epoch": 0.8360932534117441, + "grad_norm": 0.5906385183334351, + "learning_rate": 1.2965991746625561e-06, + "loss": 0.2687, + "step": 37510 + }, + { + "epoch": 0.8362047027923643, + "grad_norm": 0.6585797071456909, + "learning_rate": 1.2948754800515228e-06, + "loss": 0.3197, + "step": 37515 + }, + { + "epoch": 0.8363161521729843, + "grad_norm": 0.7745956182479858, + "learning_rate": 1.293152852621562e-06, + "loss": 0.1805, + "step": 37520 + }, + { + "epoch": 0.8364276015536044, + "grad_norm": 0.9326626062393188, + "learning_rate": 1.2914312925838524e-06, + "loss": 0.2467, + "step": 37525 + }, + { + "epoch": 0.8365390509342244, + "grad_norm": 0.7191392183303833, + "learning_rate": 1.2897108001494496e-06, + "loss": 0.3352, + "step": 37530 + }, + { + "epoch": 0.8366505003148444, + "grad_norm": 0.601437509059906, + "learning_rate": 1.2879913755292683e-06, + "loss": 0.2939, + "step": 37535 + }, + { + "epoch": 0.8367619496954646, + "grad_norm": 0.6129541993141174, + "learning_rate": 1.2862730189340989e-06, + "loss": 0.2128, + "step": 37540 + }, + { + "epoch": 0.8368733990760846, + "grad_norm": 0.5679721236228943, + "learning_rate": 1.284555730574596e-06, + "loss": 0.2247, + "step": 37545 + }, + { + "epoch": 0.8369848484567047, + "grad_norm": 0.7880651950836182, + "learning_rate": 1.282839510661289e-06, + "loss": 0.2246, + "step": 37550 + }, + { + "epoch": 0.8370962978373248, + "grad_norm": 0.6838695406913757, + "learning_rate": 1.2811243594045697e-06, + "loss": 0.2358, + "step": 37555 + }, + { + "epoch": 0.8372077472179449, + "grad_norm": 0.373812735080719, + "learning_rate": 1.2794102770147065e-06, + "loss": 0.2945, + "step": 37560 + }, + { + "epoch": 0.8373191965985649, + "grad_norm": 0.9102007150650024, + "learning_rate": 1.2776972637018314e-06, + "loss": 0.2955, + "step": 37565 + }, + { + "epoch": 0.8374306459791849, + "grad_norm": 0.4074249267578125, + "learning_rate": 1.2759853196759454e-06, + "loss": 0.3346, + "step": 37570 + }, + { + "epoch": 0.8375420953598051, + "grad_norm": 1.002528190612793, + "learning_rate": 1.2742744451469202e-06, + "loss": 0.2436, + "step": 37575 + }, + { + "epoch": 0.8376535447404251, + "grad_norm": 0.7155662775039673, + "learning_rate": 1.272564640324494e-06, + "loss": 0.2921, + "step": 37580 + }, + { + "epoch": 0.8377649941210452, + "grad_norm": 0.41818249225616455, + "learning_rate": 1.2708559054182767e-06, + "loss": 0.3699, + "step": 37585 + }, + { + "epoch": 0.8378764435016652, + "grad_norm": 0.6082835793495178, + "learning_rate": 1.2691482406377499e-06, + "loss": 0.352, + "step": 37590 + }, + { + "epoch": 0.8379878928822853, + "grad_norm": 0.7842504382133484, + "learning_rate": 1.2674416461922555e-06, + "loss": 0.4216, + "step": 37595 + }, + { + "epoch": 0.8380993422629054, + "grad_norm": 0.8266199231147766, + "learning_rate": 1.2657361222910115e-06, + "loss": 0.3116, + "step": 37600 + }, + { + "epoch": 0.8382107916435254, + "grad_norm": 0.6448431611061096, + "learning_rate": 1.264031669143101e-06, + "loss": 0.2408, + "step": 37605 + }, + { + "epoch": 0.8383222410241455, + "grad_norm": 0.9584223628044128, + "learning_rate": 1.2623282869574726e-06, + "loss": 0.2261, + "step": 37610 + }, + { + "epoch": 0.8384336904047656, + "grad_norm": 0.5301266312599182, + "learning_rate": 1.2606259759429562e-06, + "loss": 0.2289, + "step": 37615 + }, + { + "epoch": 0.8385451397853857, + "grad_norm": 0.7165831923484802, + "learning_rate": 1.2589247363082335e-06, + "loss": 0.2479, + "step": 37620 + }, + { + "epoch": 0.8386565891660057, + "grad_norm": 0.9761996865272522, + "learning_rate": 1.25722456826187e-06, + "loss": 0.3339, + "step": 37625 + }, + { + "epoch": 0.8387680385466257, + "grad_norm": 0.30678683519363403, + "learning_rate": 1.255525472012291e-06, + "loss": 0.2204, + "step": 37630 + }, + { + "epoch": 0.8388794879272459, + "grad_norm": 0.45100367069244385, + "learning_rate": 1.2538274477677925e-06, + "loss": 0.2811, + "step": 37635 + }, + { + "epoch": 0.8389909373078659, + "grad_norm": 0.5770091414451599, + "learning_rate": 1.2521304957365388e-06, + "loss": 0.2808, + "step": 37640 + }, + { + "epoch": 0.839102386688486, + "grad_norm": 0.5960776805877686, + "learning_rate": 1.2504346161265602e-06, + "loss": 0.1137, + "step": 37645 + }, + { + "epoch": 0.839213836069106, + "grad_norm": 0.6756134033203125, + "learning_rate": 1.2487398091457637e-06, + "loss": 0.2733, + "step": 37650 + }, + { + "epoch": 0.8393252854497261, + "grad_norm": 0.3762647807598114, + "learning_rate": 1.2470460750019154e-06, + "loss": 0.3683, + "step": 37655 + }, + { + "epoch": 0.8394367348303462, + "grad_norm": 0.6110719442367554, + "learning_rate": 1.2453534139026579e-06, + "loss": 0.2211, + "step": 37660 + }, + { + "epoch": 0.8395481842109662, + "grad_norm": 0.5859352350234985, + "learning_rate": 1.2436618260554955e-06, + "loss": 0.2183, + "step": 37665 + }, + { + "epoch": 0.8396596335915864, + "grad_norm": 0.800769567489624, + "learning_rate": 1.2419713116678056e-06, + "loss": 0.3123, + "step": 37670 + }, + { + "epoch": 0.8397710829722064, + "grad_norm": 0.5613384246826172, + "learning_rate": 1.240281870946829e-06, + "loss": 0.191, + "step": 37675 + }, + { + "epoch": 0.8398825323528264, + "grad_norm": 0.6227254867553711, + "learning_rate": 1.2385935040996833e-06, + "loss": 0.198, + "step": 37680 + }, + { + "epoch": 0.8399939817334465, + "grad_norm": 0.5670225620269775, + "learning_rate": 1.2369062113333453e-06, + "loss": 0.1745, + "step": 37685 + }, + { + "epoch": 0.8401054311140665, + "grad_norm": 0.44551461935043335, + "learning_rate": 1.2352199928546627e-06, + "loss": 0.201, + "step": 37690 + }, + { + "epoch": 0.8402168804946867, + "grad_norm": 0.5814508199691772, + "learning_rate": 1.233534848870358e-06, + "loss": 0.1739, + "step": 37695 + }, + { + "epoch": 0.8403283298753067, + "grad_norm": 0.6766602396965027, + "learning_rate": 1.2318507795870138e-06, + "loss": 0.2415, + "step": 37700 + }, + { + "epoch": 0.8404397792559268, + "grad_norm": 1.014407992362976, + "learning_rate": 1.2301677852110828e-06, + "loss": 0.2986, + "step": 37705 + }, + { + "epoch": 0.8405512286365469, + "grad_norm": 1.3561230897903442, + "learning_rate": 1.2284858659488908e-06, + "loss": 0.3334, + "step": 37710 + }, + { + "epoch": 0.8406626780171669, + "grad_norm": 0.46694424748420715, + "learning_rate": 1.2268050220066251e-06, + "loss": 0.2962, + "step": 37715 + }, + { + "epoch": 0.840774127397787, + "grad_norm": 0.6729729771614075, + "learning_rate": 1.2251252535903457e-06, + "loss": 0.2579, + "step": 37720 + }, + { + "epoch": 0.840885576778407, + "grad_norm": 1.022292971611023, + "learning_rate": 1.2234465609059754e-06, + "loss": 0.307, + "step": 37725 + }, + { + "epoch": 0.8409970261590272, + "grad_norm": 0.6683056950569153, + "learning_rate": 1.221768944159315e-06, + "loss": 0.3139, + "step": 37730 + }, + { + "epoch": 0.8411084755396472, + "grad_norm": 0.6963731646537781, + "learning_rate": 1.2200924035560247e-06, + "loss": 0.2479, + "step": 37735 + }, + { + "epoch": 0.8412199249202672, + "grad_norm": 0.9116354584693909, + "learning_rate": 1.218416939301633e-06, + "loss": 0.2626, + "step": 37740 + }, + { + "epoch": 0.8413313743008873, + "grad_norm": 0.44806307554244995, + "learning_rate": 1.216742551601543e-06, + "loss": 0.3445, + "step": 37745 + }, + { + "epoch": 0.8414428236815074, + "grad_norm": 0.7028947472572327, + "learning_rate": 1.2150692406610199e-06, + "loss": 0.3144, + "step": 37750 + }, + { + "epoch": 0.8415542730621275, + "grad_norm": 0.7792380452156067, + "learning_rate": 1.2133970066851985e-06, + "loss": 0.3816, + "step": 37755 + }, + { + "epoch": 0.8416657224427475, + "grad_norm": 0.34202641248703003, + "learning_rate": 1.2117258498790807e-06, + "loss": 0.24, + "step": 37760 + }, + { + "epoch": 0.8417771718233676, + "grad_norm": 0.4707144796848297, + "learning_rate": 1.2100557704475402e-06, + "loss": 0.2674, + "step": 37765 + }, + { + "epoch": 0.8418886212039877, + "grad_norm": 0.9186710715293884, + "learning_rate": 1.2083867685953121e-06, + "loss": 0.254, + "step": 37770 + }, + { + "epoch": 0.8420000705846077, + "grad_norm": 0.6091528534889221, + "learning_rate": 1.2067188445270074e-06, + "loss": 0.3032, + "step": 37775 + }, + { + "epoch": 0.8421115199652278, + "grad_norm": 0.7481672763824463, + "learning_rate": 1.2050519984470988e-06, + "loss": 0.3216, + "step": 37780 + }, + { + "epoch": 0.8422229693458478, + "grad_norm": 0.7646524310112, + "learning_rate": 1.2033862305599275e-06, + "loss": 0.4078, + "step": 37785 + }, + { + "epoch": 0.842334418726468, + "grad_norm": 0.5433209538459778, + "learning_rate": 1.201721541069706e-06, + "loss": 0.2804, + "step": 37790 + }, + { + "epoch": 0.842445868107088, + "grad_norm": 0.6066043376922607, + "learning_rate": 1.2000579301805093e-06, + "loss": 0.2953, + "step": 37795 + }, + { + "epoch": 0.842557317487708, + "grad_norm": 0.3361576199531555, + "learning_rate": 1.1983953980962848e-06, + "loss": 0.3078, + "step": 37800 + }, + { + "epoch": 0.8426687668683281, + "grad_norm": 0.4943036139011383, + "learning_rate": 1.1967339450208492e-06, + "loss": 0.1272, + "step": 37805 + }, + { + "epoch": 0.8427802162489482, + "grad_norm": 0.9423078894615173, + "learning_rate": 1.195073571157881e-06, + "loss": 0.371, + "step": 37810 + }, + { + "epoch": 0.8428916656295683, + "grad_norm": 0.3357747197151184, + "learning_rate": 1.193414276710928e-06, + "loss": 0.1712, + "step": 37815 + }, + { + "epoch": 0.8430031150101883, + "grad_norm": 0.35928764939308167, + "learning_rate": 1.1917560618834102e-06, + "loss": 0.2544, + "step": 37820 + }, + { + "epoch": 0.8431145643908085, + "grad_norm": 0.8321924805641174, + "learning_rate": 1.190098926878609e-06, + "loss": 0.3164, + "step": 37825 + }, + { + "epoch": 0.8432260137714285, + "grad_norm": 0.6431793570518494, + "learning_rate": 1.1884428718996755e-06, + "loss": 0.3778, + "step": 37830 + }, + { + "epoch": 0.8433374631520485, + "grad_norm": 0.6413928270339966, + "learning_rate": 1.1867878971496305e-06, + "loss": 0.3294, + "step": 37835 + }, + { + "epoch": 0.8434489125326686, + "grad_norm": 0.40377914905548096, + "learning_rate": 1.1851340028313652e-06, + "loss": 0.3043, + "step": 37840 + }, + { + "epoch": 0.8435603619132886, + "grad_norm": 0.6660176515579224, + "learning_rate": 1.1834811891476294e-06, + "loss": 0.1815, + "step": 37845 + }, + { + "epoch": 0.8436718112939088, + "grad_norm": 0.581720232963562, + "learning_rate": 1.181829456301048e-06, + "loss": 0.3054, + "step": 37850 + }, + { + "epoch": 0.8437832606745288, + "grad_norm": 0.28792068362236023, + "learning_rate": 1.1801788044941088e-06, + "loss": 0.2019, + "step": 37855 + }, + { + "epoch": 0.8438947100551488, + "grad_norm": 0.6191038489341736, + "learning_rate": 1.1785292339291677e-06, + "loss": 0.3956, + "step": 37860 + }, + { + "epoch": 0.844006159435769, + "grad_norm": 0.8252318501472473, + "learning_rate": 1.1768807448084507e-06, + "loss": 0.3405, + "step": 37865 + }, + { + "epoch": 0.844117608816389, + "grad_norm": 0.798909068107605, + "learning_rate": 1.175233337334053e-06, + "loss": 0.3404, + "step": 37870 + }, + { + "epoch": 0.8442290581970091, + "grad_norm": 0.5434174537658691, + "learning_rate": 1.1735870117079307e-06, + "loss": 0.1721, + "step": 37875 + }, + { + "epoch": 0.8443405075776291, + "grad_norm": 0.5645771622657776, + "learning_rate": 1.1719417681319123e-06, + "loss": 0.25, + "step": 37880 + }, + { + "epoch": 0.8444519569582492, + "grad_norm": 0.7520797252655029, + "learning_rate": 1.1702976068076898e-06, + "loss": 0.2488, + "step": 37885 + }, + { + "epoch": 0.8445634063388693, + "grad_norm": 0.8642824292182922, + "learning_rate": 1.1686545279368244e-06, + "loss": 0.2841, + "step": 37890 + }, + { + "epoch": 0.8446748557194893, + "grad_norm": 0.6720357537269592, + "learning_rate": 1.1670125317207493e-06, + "loss": 0.3353, + "step": 37895 + }, + { + "epoch": 0.8447863051001094, + "grad_norm": 0.7542029619216919, + "learning_rate": 1.1653716183607544e-06, + "loss": 0.2117, + "step": 37900 + }, + { + "epoch": 0.8448977544807295, + "grad_norm": 0.39105698466300964, + "learning_rate": 1.163731788058009e-06, + "loss": 0.2014, + "step": 37905 + }, + { + "epoch": 0.8450092038613496, + "grad_norm": 0.8331542611122131, + "learning_rate": 1.16209304101354e-06, + "loss": 0.2795, + "step": 37910 + }, + { + "epoch": 0.8451206532419696, + "grad_norm": 0.4050682783126831, + "learning_rate": 1.1604553774282467e-06, + "loss": 0.2269, + "step": 37915 + }, + { + "epoch": 0.8452321026225896, + "grad_norm": 0.556277871131897, + "learning_rate": 1.1588187975028931e-06, + "loss": 0.3418, + "step": 37920 + }, + { + "epoch": 0.8453435520032098, + "grad_norm": 0.8178937435150146, + "learning_rate": 1.1571833014381096e-06, + "loss": 0.2718, + "step": 37925 + }, + { + "epoch": 0.8454550013838298, + "grad_norm": 0.5613789558410645, + "learning_rate": 1.1555488894343991e-06, + "loss": 0.2226, + "step": 37930 + }, + { + "epoch": 0.8455664507644499, + "grad_norm": 0.5579361915588379, + "learning_rate": 1.1539155616921238e-06, + "loss": 0.2663, + "step": 37935 + }, + { + "epoch": 0.8456779001450699, + "grad_norm": 0.7475429773330688, + "learning_rate": 1.1522833184115212e-06, + "loss": 0.3363, + "step": 37940 + }, + { + "epoch": 0.84578934952569, + "grad_norm": 0.49127307534217834, + "learning_rate": 1.1506521597926912e-06, + "loss": 0.2636, + "step": 37945 + }, + { + "epoch": 0.8459007989063101, + "grad_norm": 0.5424503087997437, + "learning_rate": 1.1490220860355983e-06, + "loss": 0.2751, + "step": 37950 + }, + { + "epoch": 0.8460122482869301, + "grad_norm": 0.7346308827400208, + "learning_rate": 1.147393097340077e-06, + "loss": 0.3807, + "step": 37955 + }, + { + "epoch": 0.8461236976675502, + "grad_norm": 0.8214137554168701, + "learning_rate": 1.1457651939058335e-06, + "loss": 0.2832, + "step": 37960 + }, + { + "epoch": 0.8462351470481703, + "grad_norm": 0.8216602206230164, + "learning_rate": 1.144138375932432e-06, + "loss": 0.3962, + "step": 37965 + }, + { + "epoch": 0.8463465964287904, + "grad_norm": 0.5301281809806824, + "learning_rate": 1.1425126436193067e-06, + "loss": 0.2104, + "step": 37970 + }, + { + "epoch": 0.8464580458094104, + "grad_norm": 0.6581772565841675, + "learning_rate": 1.140887997165765e-06, + "loss": 0.3826, + "step": 37975 + }, + { + "epoch": 0.8465694951900304, + "grad_norm": 0.584018886089325, + "learning_rate": 1.1392644367709715e-06, + "loss": 0.3253, + "step": 37980 + }, + { + "epoch": 0.8466809445706506, + "grad_norm": 0.5445225238800049, + "learning_rate": 1.137641962633963e-06, + "loss": 0.334, + "step": 37985 + }, + { + "epoch": 0.8467923939512706, + "grad_norm": 0.9342231154441833, + "learning_rate": 1.1360205749536446e-06, + "loss": 0.1585, + "step": 37990 + }, + { + "epoch": 0.8469038433318907, + "grad_norm": 0.5521963238716125, + "learning_rate": 1.134400273928784e-06, + "loss": 0.3071, + "step": 37995 + }, + { + "epoch": 0.8470152927125107, + "grad_norm": 0.7862924337387085, + "learning_rate": 1.132781059758018e-06, + "loss": 0.2368, + "step": 38000 + }, + { + "epoch": 0.8471267420931308, + "grad_norm": 0.7552818655967712, + "learning_rate": 1.1311629326398477e-06, + "loss": 0.2618, + "step": 38005 + }, + { + "epoch": 0.8472381914737509, + "grad_norm": 0.6027349233627319, + "learning_rate": 1.1295458927726465e-06, + "loss": 0.2241, + "step": 38010 + }, + { + "epoch": 0.8473496408543709, + "grad_norm": 0.4127175509929657, + "learning_rate": 1.1279299403546485e-06, + "loss": 0.2692, + "step": 38015 + }, + { + "epoch": 0.847461090234991, + "grad_norm": 1.0972000360488892, + "learning_rate": 1.1263150755839592e-06, + "loss": 0.3133, + "step": 38020 + }, + { + "epoch": 0.8475725396156111, + "grad_norm": 0.5284757614135742, + "learning_rate": 1.124701298658547e-06, + "loss": 0.3089, + "step": 38025 + }, + { + "epoch": 0.8476839889962312, + "grad_norm": 0.468267560005188, + "learning_rate": 1.1230886097762495e-06, + "loss": 0.2538, + "step": 38030 + }, + { + "epoch": 0.8477954383768512, + "grad_norm": 0.7088356018066406, + "learning_rate": 1.1214770091347694e-06, + "loss": 0.2582, + "step": 38035 + }, + { + "epoch": 0.8479068877574713, + "grad_norm": 0.5855355858802795, + "learning_rate": 1.119866496931673e-06, + "loss": 0.2273, + "step": 38040 + }, + { + "epoch": 0.8480183371380914, + "grad_norm": 0.6103977560997009, + "learning_rate": 1.1182570733644037e-06, + "loss": 0.2108, + "step": 38045 + }, + { + "epoch": 0.8481297865187114, + "grad_norm": 0.66413813829422, + "learning_rate": 1.1166487386302571e-06, + "loss": 0.3404, + "step": 38050 + }, + { + "epoch": 0.8482412358993315, + "grad_norm": 0.6266335844993591, + "learning_rate": 1.1150414929264087e-06, + "loss": 0.2823, + "step": 38055 + }, + { + "epoch": 0.8483526852799516, + "grad_norm": 0.47270530462265015, + "learning_rate": 1.113435336449893e-06, + "loss": 0.2533, + "step": 38060 + }, + { + "epoch": 0.8484641346605716, + "grad_norm": 0.5260603427886963, + "learning_rate": 1.1118302693976103e-06, + "loss": 0.2751, + "step": 38065 + }, + { + "epoch": 0.8485755840411917, + "grad_norm": 0.6224083304405212, + "learning_rate": 1.1102262919663299e-06, + "loss": 0.3943, + "step": 38070 + }, + { + "epoch": 0.8486870334218117, + "grad_norm": 0.6470963954925537, + "learning_rate": 1.1086234043526866e-06, + "loss": 0.1906, + "step": 38075 + }, + { + "epoch": 0.8487984828024319, + "grad_norm": 0.46870410442352295, + "learning_rate": 1.1070216067531825e-06, + "loss": 0.2331, + "step": 38080 + }, + { + "epoch": 0.8489099321830519, + "grad_norm": 0.6851820349693298, + "learning_rate": 1.1054208993641879e-06, + "loss": 0.3887, + "step": 38085 + }, + { + "epoch": 0.8490213815636719, + "grad_norm": 0.5743823647499084, + "learning_rate": 1.103821282381936e-06, + "loss": 0.2285, + "step": 38090 + }, + { + "epoch": 0.849132830944292, + "grad_norm": 0.9163671135902405, + "learning_rate": 1.1022227560025267e-06, + "loss": 0.3448, + "step": 38095 + }, + { + "epoch": 0.8492442803249121, + "grad_norm": 0.44099679589271545, + "learning_rate": 1.1006253204219275e-06, + "loss": 0.1958, + "step": 38100 + }, + { + "epoch": 0.8493557297055322, + "grad_norm": 0.727388858795166, + "learning_rate": 1.0990289758359685e-06, + "loss": 0.27, + "step": 38105 + }, + { + "epoch": 0.8494671790861522, + "grad_norm": 0.5296708941459656, + "learning_rate": 1.0974337224403553e-06, + "loss": 0.3284, + "step": 38110 + }, + { + "epoch": 0.8495786284667723, + "grad_norm": 0.3783695697784424, + "learning_rate": 1.0958395604306482e-06, + "loss": 0.144, + "step": 38115 + }, + { + "epoch": 0.8496900778473924, + "grad_norm": 0.9098923206329346, + "learning_rate": 1.094246490002283e-06, + "loss": 0.2708, + "step": 38120 + }, + { + "epoch": 0.8498015272280124, + "grad_norm": 0.4576197564601898, + "learning_rate": 1.092654511350556e-06, + "loss": 0.2607, + "step": 38125 + }, + { + "epoch": 0.8499129766086325, + "grad_norm": 0.7173288464546204, + "learning_rate": 1.0910636246706318e-06, + "loss": 0.2797, + "step": 38130 + }, + { + "epoch": 0.8500244259892525, + "grad_norm": 0.5603467226028442, + "learning_rate": 1.0894738301575414e-06, + "loss": 0.3129, + "step": 38135 + }, + { + "epoch": 0.8501358753698727, + "grad_norm": 0.4809896945953369, + "learning_rate": 1.0878851280061787e-06, + "loss": 0.3054, + "step": 38140 + }, + { + "epoch": 0.8502473247504927, + "grad_norm": 0.5944100022315979, + "learning_rate": 1.08629751841131e-06, + "loss": 0.1697, + "step": 38145 + }, + { + "epoch": 0.8503587741311127, + "grad_norm": 0.44639232754707336, + "learning_rate": 1.0847110015675599e-06, + "loss": 0.2338, + "step": 38150 + }, + { + "epoch": 0.8504702235117328, + "grad_norm": 0.45551520586013794, + "learning_rate": 1.0831255776694283e-06, + "loss": 0.275, + "step": 38155 + }, + { + "epoch": 0.8505816728923529, + "grad_norm": 0.7087881565093994, + "learning_rate": 1.081541246911273e-06, + "loss": 0.3113, + "step": 38160 + }, + { + "epoch": 0.850693122272973, + "grad_norm": 0.7395936846733093, + "learning_rate": 1.0799580094873208e-06, + "loss": 0.2508, + "step": 38165 + }, + { + "epoch": 0.850804571653593, + "grad_norm": 0.6941021680831909, + "learning_rate": 1.078375865591662e-06, + "loss": 0.2996, + "step": 38170 + }, + { + "epoch": 0.8509160210342132, + "grad_norm": 0.5546767115592957, + "learning_rate": 1.0767948154182606e-06, + "loss": 0.1887, + "step": 38175 + }, + { + "epoch": 0.8510274704148332, + "grad_norm": 0.4076506495475769, + "learning_rate": 1.0752148591609378e-06, + "loss": 0.2515, + "step": 38180 + }, + { + "epoch": 0.8511389197954532, + "grad_norm": 0.5561453104019165, + "learning_rate": 1.0736359970133825e-06, + "loss": 0.2829, + "step": 38185 + }, + { + "epoch": 0.8512503691760733, + "grad_norm": 0.7610055804252625, + "learning_rate": 1.072058229169155e-06, + "loss": 0.2921, + "step": 38190 + }, + { + "epoch": 0.8513618185566934, + "grad_norm": 1.0838948488235474, + "learning_rate": 1.0704815558216752e-06, + "loss": 0.2318, + "step": 38195 + }, + { + "epoch": 0.8514732679373135, + "grad_norm": 0.5526623129844666, + "learning_rate": 1.0689059771642308e-06, + "loss": 0.2769, + "step": 38200 + }, + { + "epoch": 0.8515847173179335, + "grad_norm": 0.4516993761062622, + "learning_rate": 1.0673314933899758e-06, + "loss": 0.2309, + "step": 38205 + }, + { + "epoch": 0.8516961666985535, + "grad_norm": 0.5029829144477844, + "learning_rate": 1.0657581046919318e-06, + "loss": 0.2642, + "step": 38210 + }, + { + "epoch": 0.8518076160791737, + "grad_norm": 0.6774804592132568, + "learning_rate": 1.0641858112629821e-06, + "loss": 0.2934, + "step": 38215 + }, + { + "epoch": 0.8519190654597937, + "grad_norm": 0.6830132603645325, + "learning_rate": 1.062614613295876e-06, + "loss": 0.4301, + "step": 38220 + }, + { + "epoch": 0.8520305148404138, + "grad_norm": 0.44892117381095886, + "learning_rate": 1.0610445109832335e-06, + "loss": 0.2804, + "step": 38225 + }, + { + "epoch": 0.8521419642210338, + "grad_norm": 0.5139585733413696, + "learning_rate": 1.0594755045175363e-06, + "loss": 0.3174, + "step": 38230 + }, + { + "epoch": 0.852253413601654, + "grad_norm": 0.6646450161933899, + "learning_rate": 1.0579075940911309e-06, + "loss": 0.2326, + "step": 38235 + }, + { + "epoch": 0.852364862982274, + "grad_norm": 0.811774492263794, + "learning_rate": 1.0563407798962332e-06, + "loss": 0.2359, + "step": 38240 + }, + { + "epoch": 0.852476312362894, + "grad_norm": 0.450503408908844, + "learning_rate": 1.0547750621249208e-06, + "loss": 0.3047, + "step": 38245 + }, + { + "epoch": 0.8525877617435141, + "grad_norm": 0.7951428890228271, + "learning_rate": 1.05321044096914e-06, + "loss": 0.3498, + "step": 38250 + }, + { + "epoch": 0.8526992111241342, + "grad_norm": 0.8334109783172607, + "learning_rate": 1.051646916620699e-06, + "loss": 0.2847, + "step": 38255 + }, + { + "epoch": 0.8528106605047543, + "grad_norm": 0.7562937140464783, + "learning_rate": 1.050084489271277e-06, + "loss": 0.2878, + "step": 38260 + }, + { + "epoch": 0.8529221098853743, + "grad_norm": 0.5558695793151855, + "learning_rate": 1.0485231591124113e-06, + "loss": 0.2797, + "step": 38265 + }, + { + "epoch": 0.8530335592659943, + "grad_norm": 0.539916455745697, + "learning_rate": 1.046962926335514e-06, + "loss": 0.2059, + "step": 38270 + }, + { + "epoch": 0.8531450086466145, + "grad_norm": 0.856054425239563, + "learning_rate": 1.045403791131855e-06, + "loss": 0.236, + "step": 38275 + }, + { + "epoch": 0.8532564580272345, + "grad_norm": 0.7847248315811157, + "learning_rate": 1.0438457536925728e-06, + "loss": 0.4156, + "step": 38280 + }, + { + "epoch": 0.8533679074078546, + "grad_norm": 0.6048935651779175, + "learning_rate": 1.0422888142086696e-06, + "loss": 0.3339, + "step": 38285 + }, + { + "epoch": 0.8534793567884746, + "grad_norm": 0.8079482913017273, + "learning_rate": 1.0407329728710135e-06, + "loss": 0.2025, + "step": 38290 + }, + { + "epoch": 0.8535908061690947, + "grad_norm": 0.33381929993629456, + "learning_rate": 1.0391782298703391e-06, + "loss": 0.2348, + "step": 38295 + }, + { + "epoch": 0.8537022555497148, + "grad_norm": 0.7819501757621765, + "learning_rate": 1.03762458539725e-06, + "loss": 0.3077, + "step": 38300 + }, + { + "epoch": 0.8538137049303348, + "grad_norm": 0.6421924829483032, + "learning_rate": 1.0360720396422064e-06, + "loss": 0.3291, + "step": 38305 + }, + { + "epoch": 0.853925154310955, + "grad_norm": 0.2999565899372101, + "learning_rate": 1.0345205927955405e-06, + "loss": 0.2765, + "step": 38310 + }, + { + "epoch": 0.854036603691575, + "grad_norm": 0.6653572916984558, + "learning_rate": 1.0329702450474477e-06, + "loss": 0.2799, + "step": 38315 + }, + { + "epoch": 0.8541480530721951, + "grad_norm": 0.8726109266281128, + "learning_rate": 1.0314209965879852e-06, + "loss": 0.3337, + "step": 38320 + }, + { + "epoch": 0.8542595024528151, + "grad_norm": 0.6368909478187561, + "learning_rate": 1.0298728476070818e-06, + "loss": 0.3035, + "step": 38325 + }, + { + "epoch": 0.8543709518334351, + "grad_norm": 0.9166002869606018, + "learning_rate": 1.0283257982945305e-06, + "loss": 0.3818, + "step": 38330 + }, + { + "epoch": 0.8544824012140553, + "grad_norm": 0.6032201051712036, + "learning_rate": 1.026779848839986e-06, + "loss": 0.1349, + "step": 38335 + }, + { + "epoch": 0.8545938505946753, + "grad_norm": 0.7271354794502258, + "learning_rate": 1.025234999432969e-06, + "loss": 0.2524, + "step": 38340 + }, + { + "epoch": 0.8547052999752954, + "grad_norm": 0.5672757029533386, + "learning_rate": 1.023691250262867e-06, + "loss": 0.3017, + "step": 38345 + }, + { + "epoch": 0.8548167493559155, + "grad_norm": 0.48506101965904236, + "learning_rate": 1.0221486015189309e-06, + "loss": 0.277, + "step": 38350 + }, + { + "epoch": 0.8549281987365355, + "grad_norm": 0.5211977362632751, + "learning_rate": 1.0206070533902756e-06, + "loss": 0.2977, + "step": 38355 + }, + { + "epoch": 0.8550396481171556, + "grad_norm": 0.44795161485671997, + "learning_rate": 1.019066606065886e-06, + "loss": 0.2916, + "step": 38360 + }, + { + "epoch": 0.8551510974977756, + "grad_norm": 0.8175597786903381, + "learning_rate": 1.0175272597346097e-06, + "loss": 0.3533, + "step": 38365 + }, + { + "epoch": 0.8552625468783958, + "grad_norm": 0.5160136818885803, + "learning_rate": 1.0159890145851569e-06, + "loss": 0.2598, + "step": 38370 + }, + { + "epoch": 0.8553739962590158, + "grad_norm": 0.7401508092880249, + "learning_rate": 1.0144518708061057e-06, + "loss": 0.1902, + "step": 38375 + }, + { + "epoch": 0.8554854456396359, + "grad_norm": 0.5100545287132263, + "learning_rate": 1.0129158285858975e-06, + "loss": 0.2488, + "step": 38380 + }, + { + "epoch": 0.8555968950202559, + "grad_norm": 0.8144034147262573, + "learning_rate": 1.0113808881128374e-06, + "loss": 0.2565, + "step": 38385 + }, + { + "epoch": 0.855708344400876, + "grad_norm": 0.6111043095588684, + "learning_rate": 1.0098470495751e-06, + "loss": 0.2435, + "step": 38390 + }, + { + "epoch": 0.8558197937814961, + "grad_norm": 0.5661292672157288, + "learning_rate": 1.0083143131607197e-06, + "loss": 0.2558, + "step": 38395 + }, + { + "epoch": 0.8559312431621161, + "grad_norm": 0.8154894709587097, + "learning_rate": 1.0067826790576008e-06, + "loss": 0.3087, + "step": 38400 + }, + { + "epoch": 0.8560426925427362, + "grad_norm": 0.7612628936767578, + "learning_rate": 1.005252147453508e-06, + "loss": 0.3343, + "step": 38405 + }, + { + "epoch": 0.8561541419233563, + "grad_norm": 0.5865107178688049, + "learning_rate": 1.0037227185360743e-06, + "loss": 0.231, + "step": 38410 + }, + { + "epoch": 0.8562655913039763, + "grad_norm": 0.8663761615753174, + "learning_rate": 1.0021943924927924e-06, + "loss": 0.4532, + "step": 38415 + }, + { + "epoch": 0.8563770406845964, + "grad_norm": 0.6198815107345581, + "learning_rate": 1.0006671695110281e-06, + "loss": 0.4073, + "step": 38420 + }, + { + "epoch": 0.8564884900652164, + "grad_norm": 0.658185601234436, + "learning_rate": 9.99141049778004e-07, + "loss": 0.2215, + "step": 38425 + }, + { + "epoch": 0.8565999394458366, + "grad_norm": 0.5928554534912109, + "learning_rate": 9.976160334808094e-07, + "loss": 0.1856, + "step": 38430 + }, + { + "epoch": 0.8567113888264566, + "grad_norm": 1.2877331972122192, + "learning_rate": 9.960921208064023e-07, + "loss": 0.2063, + "step": 38435 + }, + { + "epoch": 0.8568228382070766, + "grad_norm": 0.9091715216636658, + "learning_rate": 9.945693119416033e-07, + "loss": 0.3503, + "step": 38440 + }, + { + "epoch": 0.8569342875876967, + "grad_norm": 0.626483142375946, + "learning_rate": 9.930476070730944e-07, + "loss": 0.2357, + "step": 38445 + }, + { + "epoch": 0.8570457369683168, + "grad_norm": 0.5116456151008606, + "learning_rate": 9.915270063874238e-07, + "loss": 0.2662, + "step": 38450 + }, + { + "epoch": 0.8571571863489369, + "grad_norm": 0.5982497930526733, + "learning_rate": 9.900075100710106e-07, + "loss": 0.3785, + "step": 38455 + }, + { + "epoch": 0.8572686357295569, + "grad_norm": 0.6922852993011475, + "learning_rate": 9.884891183101287e-07, + "loss": 0.3587, + "step": 38460 + }, + { + "epoch": 0.857380085110177, + "grad_norm": 0.4601098597049713, + "learning_rate": 9.869718312909215e-07, + "loss": 0.3211, + "step": 38465 + }, + { + "epoch": 0.8574915344907971, + "grad_norm": 0.2123146653175354, + "learning_rate": 9.854556491994005e-07, + "loss": 0.1296, + "step": 38470 + }, + { + "epoch": 0.8576029838714171, + "grad_norm": 0.546073853969574, + "learning_rate": 9.839405722214345e-07, + "loss": 0.2066, + "step": 38475 + }, + { + "epoch": 0.8577144332520372, + "grad_norm": 0.4517442584037781, + "learning_rate": 9.824266005427584e-07, + "loss": 0.2971, + "step": 38480 + }, + { + "epoch": 0.8578258826326572, + "grad_norm": 0.6086894869804382, + "learning_rate": 9.809137343489793e-07, + "loss": 0.2705, + "step": 38485 + }, + { + "epoch": 0.8579373320132774, + "grad_norm": 0.49917909502983093, + "learning_rate": 9.79401973825559e-07, + "loss": 0.2768, + "step": 38490 + }, + { + "epoch": 0.8580487813938974, + "grad_norm": 0.7907560467720032, + "learning_rate": 9.778913191578287e-07, + "loss": 0.2817, + "step": 38495 + }, + { + "epoch": 0.8581602307745174, + "grad_norm": 0.6295052766799927, + "learning_rate": 9.763817705309808e-07, + "loss": 0.2585, + "step": 38500 + }, + { + "epoch": 0.8582716801551376, + "grad_norm": 0.484733521938324, + "learning_rate": 9.748733281300782e-07, + "loss": 0.3148, + "step": 38505 + }, + { + "epoch": 0.8583831295357576, + "grad_norm": 0.7300118803977966, + "learning_rate": 9.733659921400395e-07, + "loss": 0.2856, + "step": 38510 + }, + { + "epoch": 0.8584945789163777, + "grad_norm": 0.4974004328250885, + "learning_rate": 9.718597627456582e-07, + "loss": 0.2531, + "step": 38515 + }, + { + "epoch": 0.8586060282969977, + "grad_norm": 0.37486526370048523, + "learning_rate": 9.703546401315832e-07, + "loss": 0.2336, + "step": 38520 + }, + { + "epoch": 0.8587174776776179, + "grad_norm": 0.6507949829101562, + "learning_rate": 9.688506244823315e-07, + "loss": 0.2287, + "step": 38525 + }, + { + "epoch": 0.8588289270582379, + "grad_norm": 0.7002065181732178, + "learning_rate": 9.673477159822832e-07, + "loss": 0.2789, + "step": 38530 + }, + { + "epoch": 0.8589403764388579, + "grad_norm": 0.7301084995269775, + "learning_rate": 9.658459148156807e-07, + "loss": 0.3805, + "step": 38535 + }, + { + "epoch": 0.859051825819478, + "grad_norm": 0.6007069945335388, + "learning_rate": 9.643452211666394e-07, + "loss": 0.3356, + "step": 38540 + }, + { + "epoch": 0.859163275200098, + "grad_norm": 0.505754292011261, + "learning_rate": 9.628456352191261e-07, + "loss": 0.2512, + "step": 38545 + }, + { + "epoch": 0.8592747245807182, + "grad_norm": 0.893120288848877, + "learning_rate": 9.613471571569843e-07, + "loss": 0.3404, + "step": 38550 + }, + { + "epoch": 0.8593861739613382, + "grad_norm": 0.5641831755638123, + "learning_rate": 9.598497871639133e-07, + "loss": 0.3276, + "step": 38555 + }, + { + "epoch": 0.8594976233419582, + "grad_norm": 0.5923650860786438, + "learning_rate": 9.5835352542348e-07, + "loss": 0.3105, + "step": 38560 + }, + { + "epoch": 0.8596090727225784, + "grad_norm": 0.6627763509750366, + "learning_rate": 9.568583721191126e-07, + "loss": 0.3515, + "step": 38565 + }, + { + "epoch": 0.8597205221031984, + "grad_norm": 0.5730844140052795, + "learning_rate": 9.55364327434105e-07, + "loss": 0.3103, + "step": 38570 + }, + { + "epoch": 0.8598319714838185, + "grad_norm": 0.9559993743896484, + "learning_rate": 9.538713915516173e-07, + "loss": 0.2309, + "step": 38575 + }, + { + "epoch": 0.8599434208644385, + "grad_norm": 0.6301518082618713, + "learning_rate": 9.52379564654673e-07, + "loss": 0.3103, + "step": 38580 + }, + { + "epoch": 0.8600548702450587, + "grad_norm": 0.6794214844703674, + "learning_rate": 9.508888469261568e-07, + "loss": 0.3756, + "step": 38585 + }, + { + "epoch": 0.8601663196256787, + "grad_norm": 0.5919482111930847, + "learning_rate": 9.493992385488204e-07, + "loss": 0.2822, + "step": 38590 + }, + { + "epoch": 0.8602777690062987, + "grad_norm": 0.7542113065719604, + "learning_rate": 9.479107397052767e-07, + "loss": 0.2241, + "step": 38595 + }, + { + "epoch": 0.8603892183869188, + "grad_norm": 0.6400946974754333, + "learning_rate": 9.46423350578004e-07, + "loss": 0.1845, + "step": 38600 + }, + { + "epoch": 0.8605006677675389, + "grad_norm": 0.5948458909988403, + "learning_rate": 9.449370713493444e-07, + "loss": 0.295, + "step": 38605 + }, + { + "epoch": 0.860612117148159, + "grad_norm": 0.8288118839263916, + "learning_rate": 9.434519022015088e-07, + "loss": 0.3929, + "step": 38610 + }, + { + "epoch": 0.860723566528779, + "grad_norm": 0.6763032078742981, + "learning_rate": 9.419678433165624e-07, + "loss": 0.471, + "step": 38615 + }, + { + "epoch": 0.860835015909399, + "grad_norm": 0.933210015296936, + "learning_rate": 9.404848948764422e-07, + "loss": 0.3071, + "step": 38620 + }, + { + "epoch": 0.8609464652900192, + "grad_norm": 0.5194078683853149, + "learning_rate": 9.39003057062946e-07, + "loss": 0.3275, + "step": 38625 + }, + { + "epoch": 0.8610579146706392, + "grad_norm": 0.5377574563026428, + "learning_rate": 9.375223300577319e-07, + "loss": 0.3088, + "step": 38630 + }, + { + "epoch": 0.8611693640512593, + "grad_norm": 0.6296390891075134, + "learning_rate": 9.3604271404233e-07, + "loss": 0.2647, + "step": 38635 + }, + { + "epoch": 0.8612808134318793, + "grad_norm": 0.7914707064628601, + "learning_rate": 9.345642091981288e-07, + "loss": 0.2754, + "step": 38640 + }, + { + "epoch": 0.8613922628124994, + "grad_norm": 0.7675842046737671, + "learning_rate": 9.330868157063799e-07, + "loss": 0.2453, + "step": 38645 + }, + { + "epoch": 0.8615037121931195, + "grad_norm": 0.9423962831497192, + "learning_rate": 9.316105337482028e-07, + "loss": 0.1748, + "step": 38650 + }, + { + "epoch": 0.8616151615737395, + "grad_norm": 0.6657691597938538, + "learning_rate": 9.301353635045774e-07, + "loss": 0.1363, + "step": 38655 + }, + { + "epoch": 0.8617266109543597, + "grad_norm": 0.5235121846199036, + "learning_rate": 9.286613051563476e-07, + "loss": 0.298, + "step": 38660 + }, + { + "epoch": 0.8618380603349797, + "grad_norm": 0.6590076684951782, + "learning_rate": 9.27188358884219e-07, + "loss": 0.41, + "step": 38665 + }, + { + "epoch": 0.8619495097155998, + "grad_norm": 0.6933293342590332, + "learning_rate": 9.257165248687694e-07, + "loss": 0.2681, + "step": 38670 + }, + { + "epoch": 0.8620609590962198, + "grad_norm": 0.5265421867370605, + "learning_rate": 9.242458032904311e-07, + "loss": 0.2567, + "step": 38675 + }, + { + "epoch": 0.8621724084768398, + "grad_norm": 0.447303831577301, + "learning_rate": 9.22776194329501e-07, + "loss": 0.3959, + "step": 38680 + }, + { + "epoch": 0.86228385785746, + "grad_norm": 0.6949535012245178, + "learning_rate": 9.21307698166145e-07, + "loss": 0.2809, + "step": 38685 + }, + { + "epoch": 0.86239530723808, + "grad_norm": 0.42898672819137573, + "learning_rate": 9.198403149803903e-07, + "loss": 0.2463, + "step": 38690 + }, + { + "epoch": 0.8625067566187001, + "grad_norm": 0.564953088760376, + "learning_rate": 9.183740449521217e-07, + "loss": 0.3246, + "step": 38695 + }, + { + "epoch": 0.8626182059993202, + "grad_norm": 0.5689299702644348, + "learning_rate": 9.169088882610977e-07, + "loss": 0.3268, + "step": 38700 + }, + { + "epoch": 0.8627296553799402, + "grad_norm": 0.6454007625579834, + "learning_rate": 9.154448450869347e-07, + "loss": 0.2103, + "step": 38705 + }, + { + "epoch": 0.8628411047605603, + "grad_norm": 0.8169035315513611, + "learning_rate": 9.139819156091101e-07, + "loss": 0.2811, + "step": 38710 + }, + { + "epoch": 0.8629525541411803, + "grad_norm": 0.47908321022987366, + "learning_rate": 9.125201000069683e-07, + "loss": 0.2798, + "step": 38715 + }, + { + "epoch": 0.8630640035218005, + "grad_norm": 0.42922982573509216, + "learning_rate": 9.110593984597193e-07, + "loss": 0.2175, + "step": 38720 + }, + { + "epoch": 0.8631754529024205, + "grad_norm": 0.5486500263214111, + "learning_rate": 9.0959981114643e-07, + "loss": 0.1905, + "step": 38725 + }, + { + "epoch": 0.8632869022830406, + "grad_norm": 0.911518394947052, + "learning_rate": 9.081413382460391e-07, + "loss": 0.2032, + "step": 38730 + }, + { + "epoch": 0.8633983516636606, + "grad_norm": 0.48162025213241577, + "learning_rate": 9.066839799373417e-07, + "loss": 0.2701, + "step": 38735 + }, + { + "epoch": 0.8635098010442807, + "grad_norm": 0.5854309797286987, + "learning_rate": 9.052277363989981e-07, + "loss": 0.2961, + "step": 38740 + }, + { + "epoch": 0.8636212504249008, + "grad_norm": 0.49761664867401123, + "learning_rate": 9.037726078095344e-07, + "loss": 0.2893, + "step": 38745 + }, + { + "epoch": 0.8637326998055208, + "grad_norm": 0.6846062541007996, + "learning_rate": 9.023185943473345e-07, + "loss": 0.325, + "step": 38750 + }, + { + "epoch": 0.8638441491861409, + "grad_norm": 0.672755777835846, + "learning_rate": 9.008656961906526e-07, + "loss": 0.3281, + "step": 38755 + }, + { + "epoch": 0.863955598566761, + "grad_norm": 0.6164188981056213, + "learning_rate": 8.994139135176005e-07, + "loss": 0.2586, + "step": 38760 + }, + { + "epoch": 0.864067047947381, + "grad_norm": 0.3592098653316498, + "learning_rate": 8.979632465061594e-07, + "loss": 0.2613, + "step": 38765 + }, + { + "epoch": 0.8641784973280011, + "grad_norm": 0.47354990243911743, + "learning_rate": 8.965136953341669e-07, + "loss": 0.33, + "step": 38770 + }, + { + "epoch": 0.8642899467086211, + "grad_norm": 0.7735857367515564, + "learning_rate": 8.950652601793264e-07, + "loss": 0.2444, + "step": 38775 + }, + { + "epoch": 0.8644013960892413, + "grad_norm": 0.7215647101402283, + "learning_rate": 8.93617941219207e-07, + "loss": 0.2783, + "step": 38780 + }, + { + "epoch": 0.8645128454698613, + "grad_norm": 0.8758747577667236, + "learning_rate": 8.921717386312346e-07, + "loss": 0.2587, + "step": 38785 + }, + { + "epoch": 0.8646242948504814, + "grad_norm": 0.8073229193687439, + "learning_rate": 8.90726652592706e-07, + "loss": 0.2216, + "step": 38790 + }, + { + "epoch": 0.8647357442311014, + "grad_norm": 0.4132098853588104, + "learning_rate": 8.892826832807788e-07, + "loss": 0.2941, + "step": 38795 + }, + { + "epoch": 0.8648471936117215, + "grad_norm": 0.5358142852783203, + "learning_rate": 8.8783983087247e-07, + "loss": 0.225, + "step": 38800 + }, + { + "epoch": 0.8649586429923416, + "grad_norm": 0.4428861141204834, + "learning_rate": 8.863980955446639e-07, + "loss": 0.2894, + "step": 38805 + }, + { + "epoch": 0.8650700923729616, + "grad_norm": 0.7807385921478271, + "learning_rate": 8.849574774741044e-07, + "loss": 0.2118, + "step": 38810 + }, + { + "epoch": 0.8651815417535818, + "grad_norm": 0.5106543898582458, + "learning_rate": 8.835179768373992e-07, + "loss": 0.2659, + "step": 38815 + }, + { + "epoch": 0.8652929911342018, + "grad_norm": 0.575259268283844, + "learning_rate": 8.820795938110216e-07, + "loss": 0.2706, + "step": 38820 + }, + { + "epoch": 0.8654044405148218, + "grad_norm": 0.7704864144325256, + "learning_rate": 8.806423285713084e-07, + "loss": 0.348, + "step": 38825 + }, + { + "epoch": 0.8655158898954419, + "grad_norm": 0.37502098083496094, + "learning_rate": 8.792061812944541e-07, + "loss": 0.259, + "step": 38830 + }, + { + "epoch": 0.865627339276062, + "grad_norm": 0.5361759662628174, + "learning_rate": 8.777711521565213e-07, + "loss": 0.2836, + "step": 38835 + }, + { + "epoch": 0.8657387886566821, + "grad_norm": 0.639102041721344, + "learning_rate": 8.763372413334314e-07, + "loss": 0.2414, + "step": 38840 + }, + { + "epoch": 0.8658502380373021, + "grad_norm": 0.7271820306777954, + "learning_rate": 8.749044490009717e-07, + "loss": 0.2522, + "step": 38845 + }, + { + "epoch": 0.8659616874179221, + "grad_norm": 0.6150298714637756, + "learning_rate": 8.734727753347916e-07, + "loss": 0.1579, + "step": 38850 + }, + { + "epoch": 0.8660731367985423, + "grad_norm": 0.7927728295326233, + "learning_rate": 8.720422205104029e-07, + "loss": 0.3589, + "step": 38855 + }, + { + "epoch": 0.8661845861791623, + "grad_norm": 1.0862514972686768, + "learning_rate": 8.70612784703182e-07, + "loss": 0.3257, + "step": 38860 + }, + { + "epoch": 0.8662960355597824, + "grad_norm": 0.499602735042572, + "learning_rate": 8.691844680883654e-07, + "loss": 0.2216, + "step": 38865 + }, + { + "epoch": 0.8664074849404024, + "grad_norm": 1.040918231010437, + "learning_rate": 8.677572708410543e-07, + "loss": 0.3053, + "step": 38870 + }, + { + "epoch": 0.8665189343210226, + "grad_norm": 0.5788599252700806, + "learning_rate": 8.663311931362117e-07, + "loss": 0.3767, + "step": 38875 + }, + { + "epoch": 0.8666303837016426, + "grad_norm": 1.2706820964813232, + "learning_rate": 8.649062351486626e-07, + "loss": 0.3246, + "step": 38880 + }, + { + "epoch": 0.8667418330822626, + "grad_norm": 0.4492451846599579, + "learning_rate": 8.634823970530981e-07, + "loss": 0.3951, + "step": 38885 + }, + { + "epoch": 0.8668532824628827, + "grad_norm": 0.31704801321029663, + "learning_rate": 8.620596790240665e-07, + "loss": 0.2206, + "step": 38890 + }, + { + "epoch": 0.8669647318435028, + "grad_norm": 0.4670735001564026, + "learning_rate": 8.606380812359861e-07, + "loss": 0.2563, + "step": 38895 + }, + { + "epoch": 0.8670761812241229, + "grad_norm": 0.6560717821121216, + "learning_rate": 8.592176038631328e-07, + "loss": 0.2734, + "step": 38900 + }, + { + "epoch": 0.8671876306047429, + "grad_norm": 0.43644067645072937, + "learning_rate": 8.577982470796442e-07, + "loss": 0.2596, + "step": 38905 + }, + { + "epoch": 0.8672990799853629, + "grad_norm": 0.6353582143783569, + "learning_rate": 8.563800110595222e-07, + "loss": 0.2138, + "step": 38910 + }, + { + "epoch": 0.8674105293659831, + "grad_norm": 0.5252775549888611, + "learning_rate": 8.549628959766343e-07, + "loss": 0.2546, + "step": 38915 + }, + { + "epoch": 0.8675219787466031, + "grad_norm": 0.4174504280090332, + "learning_rate": 8.535469020047071e-07, + "loss": 0.1929, + "step": 38920 + }, + { + "epoch": 0.8676334281272232, + "grad_norm": 0.5764095783233643, + "learning_rate": 8.521320293173274e-07, + "loss": 0.2603, + "step": 38925 + }, + { + "epoch": 0.8677448775078432, + "grad_norm": 0.6523967385292053, + "learning_rate": 8.50718278087953e-07, + "loss": 0.2129, + "step": 38930 + }, + { + "epoch": 0.8678563268884634, + "grad_norm": 0.6079277396202087, + "learning_rate": 8.493056484898954e-07, + "loss": 0.3142, + "step": 38935 + }, + { + "epoch": 0.8679677762690834, + "grad_norm": 0.44901034235954285, + "learning_rate": 8.478941406963315e-07, + "loss": 0.358, + "step": 38940 + }, + { + "epoch": 0.8680792256497034, + "grad_norm": 0.30900290608406067, + "learning_rate": 8.464837548803029e-07, + "loss": 0.3144, + "step": 38945 + }, + { + "epoch": 0.8681906750303235, + "grad_norm": 0.6993968486785889, + "learning_rate": 8.450744912147135e-07, + "loss": 0.3674, + "step": 38950 + }, + { + "epoch": 0.8683021244109436, + "grad_norm": 0.8762882351875305, + "learning_rate": 8.436663498723252e-07, + "loss": 0.3504, + "step": 38955 + }, + { + "epoch": 0.8684135737915637, + "grad_norm": 0.7278645038604736, + "learning_rate": 8.422593310257654e-07, + "loss": 0.2525, + "step": 38960 + }, + { + "epoch": 0.8685250231721837, + "grad_norm": 0.8509200215339661, + "learning_rate": 8.40853434847525e-07, + "loss": 0.3666, + "step": 38965 + }, + { + "epoch": 0.8686364725528037, + "grad_norm": 1.1432135105133057, + "learning_rate": 8.394486615099573e-07, + "loss": 0.3129, + "step": 38970 + }, + { + "epoch": 0.8687479219334239, + "grad_norm": 0.5976606011390686, + "learning_rate": 8.380450111852722e-07, + "loss": 0.2232, + "step": 38975 + }, + { + "epoch": 0.8688593713140439, + "grad_norm": 0.27752354741096497, + "learning_rate": 8.366424840455523e-07, + "loss": 0.1715, + "step": 38980 + }, + { + "epoch": 0.868970820694664, + "grad_norm": 0.5606639385223389, + "learning_rate": 8.352410802627331e-07, + "loss": 0.3196, + "step": 38985 + }, + { + "epoch": 0.869082270075284, + "grad_norm": 0.6761105060577393, + "learning_rate": 8.338408000086151e-07, + "loss": 0.2986, + "step": 38990 + }, + { + "epoch": 0.8691937194559042, + "grad_norm": 0.4891190826892853, + "learning_rate": 8.324416434548632e-07, + "loss": 0.2157, + "step": 38995 + }, + { + "epoch": 0.8693051688365242, + "grad_norm": 0.589738130569458, + "learning_rate": 8.310436107730035e-07, + "loss": 0.2274, + "step": 39000 + }, + { + "epoch": 0.8694166182171442, + "grad_norm": 0.6930895447731018, + "learning_rate": 8.296467021344223e-07, + "loss": 0.266, + "step": 39005 + }, + { + "epoch": 0.8695280675977644, + "grad_norm": 0.5216238498687744, + "learning_rate": 8.282509177103737e-07, + "loss": 0.4008, + "step": 39010 + }, + { + "epoch": 0.8696395169783844, + "grad_norm": 0.6839985251426697, + "learning_rate": 8.268562576719663e-07, + "loss": 0.2302, + "step": 39015 + }, + { + "epoch": 0.8697509663590045, + "grad_norm": 0.5241414904594421, + "learning_rate": 8.254627221901767e-07, + "loss": 0.3208, + "step": 39020 + }, + { + "epoch": 0.8698624157396245, + "grad_norm": 0.4129006564617157, + "learning_rate": 8.240703114358405e-07, + "loss": 0.2677, + "step": 39025 + }, + { + "epoch": 0.8699738651202445, + "grad_norm": 0.6128037571907043, + "learning_rate": 8.226790255796557e-07, + "loss": 0.2581, + "step": 39030 + }, + { + "epoch": 0.8700853145008647, + "grad_norm": 0.7427310943603516, + "learning_rate": 8.212888647921835e-07, + "loss": 0.3437, + "step": 39035 + }, + { + "epoch": 0.8701967638814847, + "grad_norm": 0.7483739852905273, + "learning_rate": 8.198998292438498e-07, + "loss": 0.2322, + "step": 39040 + }, + { + "epoch": 0.8703082132621048, + "grad_norm": 0.675360918045044, + "learning_rate": 8.185119191049373e-07, + "loss": 0.2521, + "step": 39045 + }, + { + "epoch": 0.8704196626427249, + "grad_norm": 0.5865544676780701, + "learning_rate": 8.171251345455943e-07, + "loss": 0.2734, + "step": 39050 + }, + { + "epoch": 0.8705311120233449, + "grad_norm": 0.7708106637001038, + "learning_rate": 8.157394757358283e-07, + "loss": 0.3535, + "step": 39055 + }, + { + "epoch": 0.870642561403965, + "grad_norm": 0.8821659684181213, + "learning_rate": 8.1435494284551e-07, + "loss": 0.3533, + "step": 39060 + }, + { + "epoch": 0.870754010784585, + "grad_norm": 0.8066558241844177, + "learning_rate": 8.129715360443724e-07, + "loss": 0.2142, + "step": 39065 + }, + { + "epoch": 0.8708654601652052, + "grad_norm": 0.4113776385784149, + "learning_rate": 8.115892555020122e-07, + "loss": 0.2661, + "step": 39070 + }, + { + "epoch": 0.8709769095458252, + "grad_norm": 0.5391425490379333, + "learning_rate": 8.10208101387886e-07, + "loss": 0.2141, + "step": 39075 + }, + { + "epoch": 0.8710883589264453, + "grad_norm": 0.7116353511810303, + "learning_rate": 8.088280738713139e-07, + "loss": 0.2821, + "step": 39080 + }, + { + "epoch": 0.8711998083070653, + "grad_norm": 0.6435947418212891, + "learning_rate": 8.07449173121474e-07, + "loss": 0.2571, + "step": 39085 + }, + { + "epoch": 0.8713112576876854, + "grad_norm": 0.5934739112854004, + "learning_rate": 8.060713993074099e-07, + "loss": 0.3214, + "step": 39090 + }, + { + "epoch": 0.8714227070683055, + "grad_norm": 0.6468964219093323, + "learning_rate": 8.046947525980242e-07, + "loss": 0.2661, + "step": 39095 + }, + { + "epoch": 0.8715341564489255, + "grad_norm": 0.8659062385559082, + "learning_rate": 8.033192331620843e-07, + "loss": 0.3474, + "step": 39100 + }, + { + "epoch": 0.8716456058295456, + "grad_norm": 0.7774683237075806, + "learning_rate": 8.019448411682218e-07, + "loss": 0.2422, + "step": 39105 + }, + { + "epoch": 0.8717570552101657, + "grad_norm": 0.7808752059936523, + "learning_rate": 8.005715767849231e-07, + "loss": 0.333, + "step": 39110 + }, + { + "epoch": 0.8718685045907857, + "grad_norm": 0.2276986986398697, + "learning_rate": 7.991994401805414e-07, + "loss": 0.176, + "step": 39115 + }, + { + "epoch": 0.8719799539714058, + "grad_norm": 0.5539754629135132, + "learning_rate": 7.978284315232875e-07, + "loss": 0.3066, + "step": 39120 + }, + { + "epoch": 0.8720914033520258, + "grad_norm": 0.6452548503875732, + "learning_rate": 7.964585509812373e-07, + "loss": 0.2689, + "step": 39125 + }, + { + "epoch": 0.872202852732646, + "grad_norm": 0.5837932229042053, + "learning_rate": 7.950897987223305e-07, + "loss": 0.3347, + "step": 39130 + }, + { + "epoch": 0.872314302113266, + "grad_norm": 1.0752758979797363, + "learning_rate": 7.937221749143608e-07, + "loss": 0.2316, + "step": 39135 + }, + { + "epoch": 0.8724257514938861, + "grad_norm": 0.49105966091156006, + "learning_rate": 7.923556797249921e-07, + "loss": 0.2245, + "step": 39140 + }, + { + "epoch": 0.8725372008745061, + "grad_norm": 0.6479565501213074, + "learning_rate": 7.909903133217456e-07, + "loss": 0.1952, + "step": 39145 + }, + { + "epoch": 0.8726486502551262, + "grad_norm": 0.5889627933502197, + "learning_rate": 7.896260758720043e-07, + "loss": 0.1782, + "step": 39150 + }, + { + "epoch": 0.8727600996357463, + "grad_norm": 0.5997331738471985, + "learning_rate": 7.882629675430131e-07, + "loss": 0.2341, + "step": 39155 + }, + { + "epoch": 0.8728715490163663, + "grad_norm": 0.5359553694725037, + "learning_rate": 7.869009885018764e-07, + "loss": 0.2751, + "step": 39160 + }, + { + "epoch": 0.8729829983969865, + "grad_norm": 0.46849164366722107, + "learning_rate": 7.855401389155659e-07, + "loss": 0.2439, + "step": 39165 + }, + { + "epoch": 0.8730944477776065, + "grad_norm": 0.9019102454185486, + "learning_rate": 7.841804189509094e-07, + "loss": 0.1982, + "step": 39170 + }, + { + "epoch": 0.8732058971582265, + "grad_norm": 0.6545245051383972, + "learning_rate": 7.82821828774597e-07, + "loss": 0.2837, + "step": 39175 + }, + { + "epoch": 0.8733173465388466, + "grad_norm": 0.5654610395431519, + "learning_rate": 7.814643685531842e-07, + "loss": 0.1812, + "step": 39180 + }, + { + "epoch": 0.8734287959194666, + "grad_norm": 0.8217921853065491, + "learning_rate": 7.801080384530834e-07, + "loss": 0.3776, + "step": 39185 + }, + { + "epoch": 0.8735402453000868, + "grad_norm": 0.3172108829021454, + "learning_rate": 7.787528386405696e-07, + "loss": 0.3206, + "step": 39190 + }, + { + "epoch": 0.8736516946807068, + "grad_norm": 1.0418602228164673, + "learning_rate": 7.77398769281782e-07, + "loss": 0.2938, + "step": 39195 + }, + { + "epoch": 0.8737631440613268, + "grad_norm": 0.7183475494384766, + "learning_rate": 7.760458305427176e-07, + "loss": 0.2731, + "step": 39200 + }, + { + "epoch": 0.873874593441947, + "grad_norm": 0.40343570709228516, + "learning_rate": 7.746940225892363e-07, + "loss": 0.1553, + "step": 39205 + }, + { + "epoch": 0.873986042822567, + "grad_norm": 0.5755996704101562, + "learning_rate": 7.733433455870575e-07, + "loss": 0.2032, + "step": 39210 + }, + { + "epoch": 0.8740974922031871, + "grad_norm": 0.5467020273208618, + "learning_rate": 7.719937997017679e-07, + "loss": 0.4656, + "step": 39215 + }, + { + "epoch": 0.8742089415838071, + "grad_norm": 0.800784170627594, + "learning_rate": 7.706453850988071e-07, + "loss": 0.3364, + "step": 39220 + }, + { + "epoch": 0.8743203909644273, + "grad_norm": 0.60468590259552, + "learning_rate": 7.692981019434843e-07, + "loss": 0.159, + "step": 39225 + }, + { + "epoch": 0.8744318403450473, + "grad_norm": 0.20965750515460968, + "learning_rate": 7.679519504009636e-07, + "loss": 0.1155, + "step": 39230 + }, + { + "epoch": 0.8745432897256673, + "grad_norm": 0.6301449537277222, + "learning_rate": 7.666069306362733e-07, + "loss": 0.1794, + "step": 39235 + }, + { + "epoch": 0.8746547391062874, + "grad_norm": 0.38496220111846924, + "learning_rate": 7.652630428143038e-07, + "loss": 0.2565, + "step": 39240 + }, + { + "epoch": 0.8747661884869075, + "grad_norm": 0.6018979549407959, + "learning_rate": 7.639202870998008e-07, + "loss": 0.2916, + "step": 39245 + }, + { + "epoch": 0.8748776378675276, + "grad_norm": 0.3267180919647217, + "learning_rate": 7.625786636573795e-07, + "loss": 0.3218, + "step": 39250 + }, + { + "epoch": 0.8749890872481476, + "grad_norm": 0.612133264541626, + "learning_rate": 7.612381726515139e-07, + "loss": 0.4414, + "step": 39255 + }, + { + "epoch": 0.8751005366287676, + "grad_norm": 0.7912502884864807, + "learning_rate": 7.598988142465358e-07, + "loss": 0.2768, + "step": 39260 + }, + { + "epoch": 0.8752119860093878, + "grad_norm": 0.9484602808952332, + "learning_rate": 7.585605886066405e-07, + "loss": 0.2242, + "step": 39265 + }, + { + "epoch": 0.8753234353900078, + "grad_norm": 0.6952139735221863, + "learning_rate": 7.572234958958846e-07, + "loss": 0.203, + "step": 39270 + }, + { + "epoch": 0.8754348847706279, + "grad_norm": 0.4956663250923157, + "learning_rate": 7.558875362781848e-07, + "loss": 0.2395, + "step": 39275 + }, + { + "epoch": 0.8755463341512479, + "grad_norm": 0.6728199124336243, + "learning_rate": 7.545527099173178e-07, + "loss": 0.2816, + "step": 39280 + }, + { + "epoch": 0.8756577835318681, + "grad_norm": 0.5598202347755432, + "learning_rate": 7.532190169769238e-07, + "loss": 0.2929, + "step": 39285 + }, + { + "epoch": 0.8757692329124881, + "grad_norm": 0.5399930477142334, + "learning_rate": 7.518864576205065e-07, + "loss": 0.3053, + "step": 39290 + }, + { + "epoch": 0.8758806822931081, + "grad_norm": 0.6357437372207642, + "learning_rate": 7.505550320114241e-07, + "loss": 0.4018, + "step": 39295 + }, + { + "epoch": 0.8759921316737282, + "grad_norm": 0.667725145816803, + "learning_rate": 7.492247403129016e-07, + "loss": 0.3565, + "step": 39300 + }, + { + "epoch": 0.8761035810543483, + "grad_norm": 0.7579306364059448, + "learning_rate": 7.478955826880197e-07, + "loss": 0.3909, + "step": 39305 + }, + { + "epoch": 0.8762150304349684, + "grad_norm": 0.8091420531272888, + "learning_rate": 7.465675592997223e-07, + "loss": 0.391, + "step": 39310 + }, + { + "epoch": 0.8763264798155884, + "grad_norm": 0.6669242978096008, + "learning_rate": 7.452406703108173e-07, + "loss": 0.3984, + "step": 39315 + }, + { + "epoch": 0.8764379291962084, + "grad_norm": 1.056694507598877, + "learning_rate": 7.43914915883972e-07, + "loss": 0.149, + "step": 39320 + }, + { + "epoch": 0.8765493785768286, + "grad_norm": 0.48080557584762573, + "learning_rate": 7.425902961817111e-07, + "loss": 0.2749, + "step": 39325 + }, + { + "epoch": 0.8766608279574486, + "grad_norm": 0.46817928552627563, + "learning_rate": 7.412668113664245e-07, + "loss": 0.1438, + "step": 39330 + }, + { + "epoch": 0.8767722773380687, + "grad_norm": 0.4309822916984558, + "learning_rate": 7.399444616003604e-07, + "loss": 0.1833, + "step": 39335 + }, + { + "epoch": 0.8768837267186887, + "grad_norm": 0.7805137038230896, + "learning_rate": 7.386232470456268e-07, + "loss": 0.317, + "step": 39340 + }, + { + "epoch": 0.8769951760993089, + "grad_norm": 0.6236761808395386, + "learning_rate": 7.373031678641985e-07, + "loss": 0.2534, + "step": 39345 + }, + { + "epoch": 0.8771066254799289, + "grad_norm": 0.37533897161483765, + "learning_rate": 7.359842242179016e-07, + "loss": 0.3453, + "step": 39350 + }, + { + "epoch": 0.8772180748605489, + "grad_norm": 0.4856351912021637, + "learning_rate": 7.346664162684347e-07, + "loss": 0.3894, + "step": 39355 + }, + { + "epoch": 0.8773295242411691, + "grad_norm": 0.8379895091056824, + "learning_rate": 7.333497441773474e-07, + "loss": 0.2715, + "step": 39360 + }, + { + "epoch": 0.8774409736217891, + "grad_norm": 0.8367623090744019, + "learning_rate": 7.320342081060527e-07, + "loss": 0.2622, + "step": 39365 + }, + { + "epoch": 0.8775524230024092, + "grad_norm": 0.8916360139846802, + "learning_rate": 7.307198082158285e-07, + "loss": 0.2733, + "step": 39370 + }, + { + "epoch": 0.8776638723830292, + "grad_norm": 0.56252521276474, + "learning_rate": 7.294065446678045e-07, + "loss": 0.2643, + "step": 39375 + }, + { + "epoch": 0.8777753217636493, + "grad_norm": 0.48122766613960266, + "learning_rate": 7.280944176229821e-07, + "loss": 0.247, + "step": 39380 + }, + { + "epoch": 0.8778867711442694, + "grad_norm": 0.7334158420562744, + "learning_rate": 7.267834272422148e-07, + "loss": 0.3036, + "step": 39385 + }, + { + "epoch": 0.8779982205248894, + "grad_norm": 0.7368651032447815, + "learning_rate": 7.254735736862228e-07, + "loss": 0.3579, + "step": 39390 + }, + { + "epoch": 0.8781096699055095, + "grad_norm": 0.6399995684623718, + "learning_rate": 7.241648571155824e-07, + "loss": 0.2535, + "step": 39395 + }, + { + "epoch": 0.8782211192861296, + "grad_norm": 0.6023989915847778, + "learning_rate": 7.228572776907317e-07, + "loss": 0.3227, + "step": 39400 + }, + { + "epoch": 0.8783325686667496, + "grad_norm": 0.6582564115524292, + "learning_rate": 7.215508355719681e-07, + "loss": 0.3222, + "step": 39405 + }, + { + "epoch": 0.8784440180473697, + "grad_norm": 0.5214042067527771, + "learning_rate": 7.202455309194556e-07, + "loss": 0.2741, + "step": 39410 + }, + { + "epoch": 0.8785554674279897, + "grad_norm": 0.63289874792099, + "learning_rate": 7.18941363893213e-07, + "loss": 0.2356, + "step": 39415 + }, + { + "epoch": 0.8786669168086099, + "grad_norm": 0.9625816941261292, + "learning_rate": 7.176383346531179e-07, + "loss": 0.3086, + "step": 39420 + }, + { + "epoch": 0.8787783661892299, + "grad_norm": 1.4271131753921509, + "learning_rate": 7.163364433589159e-07, + "loss": 0.2845, + "step": 39425 + }, + { + "epoch": 0.87888981556985, + "grad_norm": 0.8671938180923462, + "learning_rate": 7.150356901702072e-07, + "loss": 0.2468, + "step": 39430 + }, + { + "epoch": 0.87900126495047, + "grad_norm": 0.7523783445358276, + "learning_rate": 7.137360752464517e-07, + "loss": 0.3748, + "step": 39435 + }, + { + "epoch": 0.8791127143310901, + "grad_norm": 0.6828460097312927, + "learning_rate": 7.124375987469767e-07, + "loss": 0.3695, + "step": 39440 + }, + { + "epoch": 0.8792241637117102, + "grad_norm": 0.7740871906280518, + "learning_rate": 7.111402608309637e-07, + "loss": 0.3003, + "step": 39445 + }, + { + "epoch": 0.8793356130923302, + "grad_norm": 0.8277214765548706, + "learning_rate": 7.098440616574542e-07, + "loss": 0.295, + "step": 39450 + }, + { + "epoch": 0.8794470624729503, + "grad_norm": 0.7407256364822388, + "learning_rate": 7.085490013853536e-07, + "loss": 0.2262, + "step": 39455 + }, + { + "epoch": 0.8795585118535704, + "grad_norm": 0.3992447853088379, + "learning_rate": 7.072550801734268e-07, + "loss": 0.2338, + "step": 39460 + }, + { + "epoch": 0.8796699612341904, + "grad_norm": 0.42001864314079285, + "learning_rate": 7.059622981802994e-07, + "loss": 0.2455, + "step": 39465 + }, + { + "epoch": 0.8797814106148105, + "grad_norm": 0.7254727482795715, + "learning_rate": 7.046706555644522e-07, + "loss": 0.2974, + "step": 39470 + }, + { + "epoch": 0.8798928599954305, + "grad_norm": 0.5861344933509827, + "learning_rate": 7.033801524842366e-07, + "loss": 0.3441, + "step": 39475 + }, + { + "epoch": 0.8800043093760507, + "grad_norm": 0.7168605923652649, + "learning_rate": 7.020907890978546e-07, + "loss": 0.225, + "step": 39480 + }, + { + "epoch": 0.8801157587566707, + "grad_norm": 0.5098482966423035, + "learning_rate": 7.008025655633743e-07, + "loss": 0.2328, + "step": 39485 + }, + { + "epoch": 0.8802272081372908, + "grad_norm": 0.5601391792297363, + "learning_rate": 6.995154820387173e-07, + "loss": 0.3758, + "step": 39490 + }, + { + "epoch": 0.8803386575179108, + "grad_norm": 1.00440514087677, + "learning_rate": 6.982295386816762e-07, + "loss": 0.2554, + "step": 39495 + }, + { + "epoch": 0.8804501068985309, + "grad_norm": 0.5242079496383667, + "learning_rate": 6.969447356498938e-07, + "loss": 0.2905, + "step": 39500 + }, + { + "epoch": 0.880561556279151, + "grad_norm": 0.8028491735458374, + "learning_rate": 6.956610731008783e-07, + "loss": 0.2896, + "step": 39505 + }, + { + "epoch": 0.880673005659771, + "grad_norm": 0.2605814039707184, + "learning_rate": 6.943785511919976e-07, + "loss": 0.3147, + "step": 39510 + }, + { + "epoch": 0.8807844550403912, + "grad_norm": 1.1690866947174072, + "learning_rate": 6.930971700804789e-07, + "loss": 0.309, + "step": 39515 + }, + { + "epoch": 0.8808959044210112, + "grad_norm": 0.595558762550354, + "learning_rate": 6.918169299234079e-07, + "loss": 0.2606, + "step": 39520 + }, + { + "epoch": 0.8810073538016312, + "grad_norm": 0.6599328517913818, + "learning_rate": 6.905378308777322e-07, + "loss": 0.2377, + "step": 39525 + }, + { + "epoch": 0.8811188031822513, + "grad_norm": 0.454018771648407, + "learning_rate": 6.892598731002597e-07, + "loss": 0.2882, + "step": 39530 + }, + { + "epoch": 0.8812302525628714, + "grad_norm": 0.7266696691513062, + "learning_rate": 6.879830567476609e-07, + "loss": 0.282, + "step": 39535 + }, + { + "epoch": 0.8813417019434915, + "grad_norm": 0.796342134475708, + "learning_rate": 6.867073819764614e-07, + "loss": 0.1781, + "step": 39540 + }, + { + "epoch": 0.8814531513241115, + "grad_norm": 0.4576019048690796, + "learning_rate": 6.854328489430495e-07, + "loss": 0.3685, + "step": 39545 + }, + { + "epoch": 0.8815646007047316, + "grad_norm": 0.6398374438285828, + "learning_rate": 6.841594578036725e-07, + "loss": 0.2192, + "step": 39550 + }, + { + "epoch": 0.8816760500853517, + "grad_norm": 0.9749293923377991, + "learning_rate": 6.828872087144378e-07, + "loss": 0.3229, + "step": 39555 + }, + { + "epoch": 0.8817874994659717, + "grad_norm": 0.6655979752540588, + "learning_rate": 6.816161018313138e-07, + "loss": 0.2114, + "step": 39560 + }, + { + "epoch": 0.8818989488465918, + "grad_norm": 0.4514460265636444, + "learning_rate": 6.803461373101306e-07, + "loss": 0.2602, + "step": 39565 + }, + { + "epoch": 0.8820103982272118, + "grad_norm": 1.0040894746780396, + "learning_rate": 6.790773153065744e-07, + "loss": 0.2762, + "step": 39570 + }, + { + "epoch": 0.882121847607832, + "grad_norm": 0.5751023888587952, + "learning_rate": 6.778096359761921e-07, + "loss": 0.3602, + "step": 39575 + }, + { + "epoch": 0.882233296988452, + "grad_norm": 0.5020691752433777, + "learning_rate": 6.765430994743926e-07, + "loss": 0.2579, + "step": 39580 + }, + { + "epoch": 0.882344746369072, + "grad_norm": 0.8378996849060059, + "learning_rate": 6.752777059564431e-07, + "loss": 0.2474, + "step": 39585 + }, + { + "epoch": 0.8824561957496921, + "grad_norm": 0.7655625343322754, + "learning_rate": 6.740134555774702e-07, + "loss": 0.2461, + "step": 39590 + }, + { + "epoch": 0.8825676451303122, + "grad_norm": 0.7098322510719299, + "learning_rate": 6.727503484924614e-07, + "loss": 0.2742, + "step": 39595 + }, + { + "epoch": 0.8826790945109323, + "grad_norm": 0.756291925907135, + "learning_rate": 6.714883848562659e-07, + "loss": 0.2654, + "step": 39600 + }, + { + "epoch": 0.8827905438915523, + "grad_norm": 0.9852766394615173, + "learning_rate": 6.70227564823589e-07, + "loss": 0.4138, + "step": 39605 + }, + { + "epoch": 0.8829019932721723, + "grad_norm": 0.6849904656410217, + "learning_rate": 6.689678885489992e-07, + "loss": 0.2707, + "step": 39610 + }, + { + "epoch": 0.8830134426527925, + "grad_norm": 0.7948192358016968, + "learning_rate": 6.677093561869208e-07, + "loss": 0.2577, + "step": 39615 + }, + { + "epoch": 0.8831248920334125, + "grad_norm": 0.5963390469551086, + "learning_rate": 6.664519678916392e-07, + "loss": 0.2616, + "step": 39620 + }, + { + "epoch": 0.8832363414140326, + "grad_norm": 0.623665988445282, + "learning_rate": 6.651957238173046e-07, + "loss": 0.2896, + "step": 39625 + }, + { + "epoch": 0.8833477907946526, + "grad_norm": 0.6460698843002319, + "learning_rate": 6.639406241179192e-07, + "loss": 0.2555, + "step": 39630 + }, + { + "epoch": 0.8834592401752728, + "grad_norm": 0.7518662810325623, + "learning_rate": 6.62686668947351e-07, + "loss": 0.2195, + "step": 39635 + }, + { + "epoch": 0.8835706895558928, + "grad_norm": 0.5728727579116821, + "learning_rate": 6.614338584593249e-07, + "loss": 0.3104, + "step": 39640 + }, + { + "epoch": 0.8836821389365128, + "grad_norm": 0.5908369421958923, + "learning_rate": 6.601821928074248e-07, + "loss": 0.2138, + "step": 39645 + }, + { + "epoch": 0.883793588317133, + "grad_norm": 0.7735019326210022, + "learning_rate": 6.589316721450933e-07, + "loss": 0.3471, + "step": 39650 + }, + { + "epoch": 0.883905037697753, + "grad_norm": 0.7732353210449219, + "learning_rate": 6.576822966256391e-07, + "loss": 0.3198, + "step": 39655 + }, + { + "epoch": 0.8840164870783731, + "grad_norm": 0.5547565221786499, + "learning_rate": 6.56434066402224e-07, + "loss": 0.2367, + "step": 39660 + }, + { + "epoch": 0.8841279364589931, + "grad_norm": 0.9324460029602051, + "learning_rate": 6.551869816278699e-07, + "loss": 0.3295, + "step": 39665 + }, + { + "epoch": 0.8842393858396131, + "grad_norm": 0.7827137112617493, + "learning_rate": 6.539410424554604e-07, + "loss": 0.2771, + "step": 39670 + }, + { + "epoch": 0.8843508352202333, + "grad_norm": 0.35162538290023804, + "learning_rate": 6.526962490377397e-07, + "loss": 0.2388, + "step": 39675 + }, + { + "epoch": 0.8844622846008533, + "grad_norm": 0.8613860607147217, + "learning_rate": 6.514526015273093e-07, + "loss": 0.343, + "step": 39680 + }, + { + "epoch": 0.8845737339814734, + "grad_norm": 0.5753836631774902, + "learning_rate": 6.502101000766281e-07, + "loss": 0.3576, + "step": 39685 + }, + { + "epoch": 0.8846851833620935, + "grad_norm": 0.6556292772293091, + "learning_rate": 6.489687448380211e-07, + "loss": 0.2278, + "step": 39690 + }, + { + "epoch": 0.8847966327427136, + "grad_norm": 0.8420238494873047, + "learning_rate": 6.477285359636676e-07, + "loss": 0.3398, + "step": 39695 + }, + { + "epoch": 0.8849080821233336, + "grad_norm": 0.515235424041748, + "learning_rate": 6.464894736056071e-07, + "loss": 0.3074, + "step": 39700 + }, + { + "epoch": 0.8850195315039536, + "grad_norm": 0.7593355774879456, + "learning_rate": 6.45251557915737e-07, + "loss": 0.3446, + "step": 39705 + }, + { + "epoch": 0.8851309808845738, + "grad_norm": 0.5038763284683228, + "learning_rate": 6.440147890458193e-07, + "loss": 0.3875, + "step": 39710 + }, + { + "epoch": 0.8852424302651938, + "grad_norm": 0.6667838096618652, + "learning_rate": 6.427791671474704e-07, + "loss": 0.3563, + "step": 39715 + }, + { + "epoch": 0.8853538796458139, + "grad_norm": 0.38335031270980835, + "learning_rate": 6.415446923721713e-07, + "loss": 0.2846, + "step": 39720 + }, + { + "epoch": 0.8854653290264339, + "grad_norm": 0.48969194293022156, + "learning_rate": 6.403113648712555e-07, + "loss": 0.197, + "step": 39725 + }, + { + "epoch": 0.885576778407054, + "grad_norm": 0.6656966805458069, + "learning_rate": 6.39079184795921e-07, + "loss": 0.2357, + "step": 39730 + }, + { + "epoch": 0.8856882277876741, + "grad_norm": 0.6289982795715332, + "learning_rate": 6.378481522972236e-07, + "loss": 0.2108, + "step": 39735 + }, + { + "epoch": 0.8857996771682941, + "grad_norm": 0.48437291383743286, + "learning_rate": 6.366182675260768e-07, + "loss": 0.2386, + "step": 39740 + }, + { + "epoch": 0.8859111265489142, + "grad_norm": 0.5738726258277893, + "learning_rate": 6.353895306332558e-07, + "loss": 0.2448, + "step": 39745 + }, + { + "epoch": 0.8860225759295343, + "grad_norm": 0.8107031583786011, + "learning_rate": 6.341619417693956e-07, + "loss": 0.2105, + "step": 39750 + }, + { + "epoch": 0.8861340253101544, + "grad_norm": 0.7784823179244995, + "learning_rate": 6.32935501084988e-07, + "loss": 0.2757, + "step": 39755 + }, + { + "epoch": 0.8862454746907744, + "grad_norm": 0.67021644115448, + "learning_rate": 6.317102087303861e-07, + "loss": 0.3014, + "step": 39760 + }, + { + "epoch": 0.8863569240713944, + "grad_norm": 0.6922481656074524, + "learning_rate": 6.304860648558009e-07, + "loss": 0.3157, + "step": 39765 + }, + { + "epoch": 0.8864683734520146, + "grad_norm": 0.706314742565155, + "learning_rate": 6.292630696113e-07, + "loss": 0.1906, + "step": 39770 + }, + { + "epoch": 0.8865798228326346, + "grad_norm": 0.4553152322769165, + "learning_rate": 6.28041223146818e-07, + "loss": 0.3109, + "step": 39775 + }, + { + "epoch": 0.8866912722132547, + "grad_norm": 0.7962448596954346, + "learning_rate": 6.268205256121396e-07, + "loss": 0.3001, + "step": 39780 + }, + { + "epoch": 0.8868027215938747, + "grad_norm": 0.744176983833313, + "learning_rate": 6.256009771569171e-07, + "loss": 0.2006, + "step": 39785 + }, + { + "epoch": 0.8869141709744948, + "grad_norm": 0.7229616641998291, + "learning_rate": 6.243825779306555e-07, + "loss": 0.2824, + "step": 39790 + }, + { + "epoch": 0.8870256203551149, + "grad_norm": 0.5249221920967102, + "learning_rate": 6.231653280827211e-07, + "loss": 0.3406, + "step": 39795 + }, + { + "epoch": 0.8871370697357349, + "grad_norm": 0.48112252354621887, + "learning_rate": 6.219492277623384e-07, + "loss": 0.1427, + "step": 39800 + }, + { + "epoch": 0.887248519116355, + "grad_norm": 0.5653254985809326, + "learning_rate": 6.20734277118592e-07, + "loss": 0.3752, + "step": 39805 + }, + { + "epoch": 0.8873599684969751, + "grad_norm": 0.6261930465698242, + "learning_rate": 6.19520476300427e-07, + "loss": 0.2933, + "step": 39810 + }, + { + "epoch": 0.8874714178775951, + "grad_norm": 0.2286958545446396, + "learning_rate": 6.183078254566466e-07, + "loss": 0.1827, + "step": 39815 + }, + { + "epoch": 0.8875828672582152, + "grad_norm": 0.706714928150177, + "learning_rate": 6.170963247359119e-07, + "loss": 0.2767, + "step": 39820 + }, + { + "epoch": 0.8876943166388352, + "grad_norm": 0.6469764113426208, + "learning_rate": 6.158859742867418e-07, + "loss": 0.2535, + "step": 39825 + }, + { + "epoch": 0.8878057660194554, + "grad_norm": 0.4051867723464966, + "learning_rate": 6.146767742575178e-07, + "loss": 0.2556, + "step": 39830 + }, + { + "epoch": 0.8879172154000754, + "grad_norm": 0.3999421000480652, + "learning_rate": 6.134687247964754e-07, + "loss": 0.2969, + "step": 39835 + }, + { + "epoch": 0.8880286647806955, + "grad_norm": 0.43835553526878357, + "learning_rate": 6.122618260517166e-07, + "loss": 0.1711, + "step": 39840 + }, + { + "epoch": 0.8881401141613156, + "grad_norm": 0.649053156375885, + "learning_rate": 6.110560781711938e-07, + "loss": 0.3861, + "step": 39845 + }, + { + "epoch": 0.8882515635419356, + "grad_norm": 0.9673153162002563, + "learning_rate": 6.098514813027256e-07, + "loss": 0.2902, + "step": 39850 + }, + { + "epoch": 0.8883630129225557, + "grad_norm": 0.2699918746948242, + "learning_rate": 6.08648035593985e-07, + "loss": 0.1352, + "step": 39855 + }, + { + "epoch": 0.8884744623031757, + "grad_norm": 0.964161217212677, + "learning_rate": 6.074457411925061e-07, + "loss": 0.2884, + "step": 39860 + }, + { + "epoch": 0.8885859116837959, + "grad_norm": 0.8768846392631531, + "learning_rate": 6.062445982456777e-07, + "loss": 0.3667, + "step": 39865 + }, + { + "epoch": 0.8886973610644159, + "grad_norm": 0.6985152363777161, + "learning_rate": 6.050446069007543e-07, + "loss": 0.2767, + "step": 39870 + }, + { + "epoch": 0.8888088104450359, + "grad_norm": 0.7158140540122986, + "learning_rate": 6.038457673048448e-07, + "loss": 0.2725, + "step": 39875 + }, + { + "epoch": 0.888920259825656, + "grad_norm": 0.6063166856765747, + "learning_rate": 6.02648079604915e-07, + "loss": 0.1417, + "step": 39880 + }, + { + "epoch": 0.889031709206276, + "grad_norm": 0.4928998649120331, + "learning_rate": 6.014515439477952e-07, + "loss": 0.2431, + "step": 39885 + }, + { + "epoch": 0.8891431585868962, + "grad_norm": 0.300959050655365, + "learning_rate": 6.002561604801715e-07, + "loss": 0.2616, + "step": 39890 + }, + { + "epoch": 0.8892546079675162, + "grad_norm": 0.9015136361122131, + "learning_rate": 5.990619293485866e-07, + "loss": 0.3326, + "step": 39895 + }, + { + "epoch": 0.8893660573481363, + "grad_norm": 0.6377741694450378, + "learning_rate": 5.978688506994446e-07, + "loss": 0.2272, + "step": 39900 + }, + { + "epoch": 0.8894775067287564, + "grad_norm": 0.7506493926048279, + "learning_rate": 5.966769246790094e-07, + "loss": 0.3169, + "step": 39905 + }, + { + "epoch": 0.8895889561093764, + "grad_norm": 0.6252882480621338, + "learning_rate": 5.954861514333999e-07, + "loss": 0.329, + "step": 39910 + }, + { + "epoch": 0.8897004054899965, + "grad_norm": 0.7580475807189941, + "learning_rate": 5.942965311085957e-07, + "loss": 0.3943, + "step": 39915 + }, + { + "epoch": 0.8898118548706165, + "grad_norm": 0.7877764701843262, + "learning_rate": 5.931080638504382e-07, + "loss": 0.3409, + "step": 39920 + }, + { + "epoch": 0.8899233042512367, + "grad_norm": 0.3911045491695404, + "learning_rate": 5.919207498046209e-07, + "loss": 0.2998, + "step": 39925 + }, + { + "epoch": 0.8900347536318567, + "grad_norm": 0.934512734413147, + "learning_rate": 5.907345891166993e-07, + "loss": 0.2028, + "step": 39930 + }, + { + "epoch": 0.8901462030124767, + "grad_norm": 0.7734924554824829, + "learning_rate": 5.895495819320896e-07, + "loss": 0.3299, + "step": 39935 + }, + { + "epoch": 0.8902576523930968, + "grad_norm": 0.6034185886383057, + "learning_rate": 5.883657283960642e-07, + "loss": 0.3106, + "step": 39940 + }, + { + "epoch": 0.8903691017737169, + "grad_norm": 0.8035076260566711, + "learning_rate": 5.871830286537539e-07, + "loss": 0.329, + "step": 39945 + }, + { + "epoch": 0.890480551154337, + "grad_norm": 0.7638542056083679, + "learning_rate": 5.86001482850147e-07, + "loss": 0.324, + "step": 39950 + }, + { + "epoch": 0.890592000534957, + "grad_norm": 0.7222042679786682, + "learning_rate": 5.848210911300945e-07, + "loss": 0.2364, + "step": 39955 + }, + { + "epoch": 0.890703449915577, + "grad_norm": 0.7595744132995605, + "learning_rate": 5.836418536383015e-07, + "loss": 0.3364, + "step": 39960 + }, + { + "epoch": 0.8908148992961972, + "grad_norm": 0.7978435754776001, + "learning_rate": 5.824637705193348e-07, + "loss": 0.254, + "step": 39965 + }, + { + "epoch": 0.8909263486768172, + "grad_norm": 0.6424942016601562, + "learning_rate": 5.812868419176176e-07, + "loss": 0.3046, + "step": 39970 + }, + { + "epoch": 0.8910377980574373, + "grad_norm": 0.38849854469299316, + "learning_rate": 5.801110679774325e-07, + "loss": 0.2289, + "step": 39975 + }, + { + "epoch": 0.8911492474380573, + "grad_norm": 0.2390509694814682, + "learning_rate": 5.789364488429205e-07, + "loss": 0.3562, + "step": 39980 + }, + { + "epoch": 0.8912606968186775, + "grad_norm": 0.15910890698432922, + "learning_rate": 5.77762984658079e-07, + "loss": 0.2501, + "step": 39985 + }, + { + "epoch": 0.8913721461992975, + "grad_norm": 0.932104766368866, + "learning_rate": 5.765906755667682e-07, + "loss": 0.3042, + "step": 39990 + }, + { + "epoch": 0.8914835955799175, + "grad_norm": 0.34565722942352295, + "learning_rate": 5.754195217127013e-07, + "loss": 0.2862, + "step": 39995 + }, + { + "epoch": 0.8915950449605377, + "grad_norm": 0.7554762959480286, + "learning_rate": 5.742495232394562e-07, + "loss": 0.333, + "step": 40000 + }, + { + "epoch": 0.8917064943411577, + "grad_norm": 0.5055221319198608, + "learning_rate": 5.730806802904631e-07, + "loss": 0.241, + "step": 40005 + }, + { + "epoch": 0.8918179437217778, + "grad_norm": 0.5110387206077576, + "learning_rate": 5.719129930090151e-07, + "loss": 0.3625, + "step": 40010 + }, + { + "epoch": 0.8919293931023978, + "grad_norm": 0.8160702586174011, + "learning_rate": 5.707464615382597e-07, + "loss": 0.1209, + "step": 40015 + }, + { + "epoch": 0.8920408424830178, + "grad_norm": 0.7044849991798401, + "learning_rate": 5.695810860212047e-07, + "loss": 0.3725, + "step": 40020 + }, + { + "epoch": 0.892152291863638, + "grad_norm": 0.6476200819015503, + "learning_rate": 5.684168666007162e-07, + "loss": 0.3231, + "step": 40025 + }, + { + "epoch": 0.892263741244258, + "grad_norm": 0.704243004322052, + "learning_rate": 5.672538034195218e-07, + "loss": 0.3767, + "step": 40030 + }, + { + "epoch": 0.8923751906248781, + "grad_norm": 0.6323098540306091, + "learning_rate": 5.660918966202001e-07, + "loss": 0.2308, + "step": 40035 + }, + { + "epoch": 0.8924866400054982, + "grad_norm": 0.4053361713886261, + "learning_rate": 5.649311463451945e-07, + "loss": 0.2657, + "step": 40040 + }, + { + "epoch": 0.8925980893861183, + "grad_norm": 0.7476621270179749, + "learning_rate": 5.637715527368015e-07, + "loss": 0.3113, + "step": 40045 + }, + { + "epoch": 0.8927095387667383, + "grad_norm": 0.4209010601043701, + "learning_rate": 5.626131159371794e-07, + "loss": 0.2733, + "step": 40050 + }, + { + "epoch": 0.8928209881473583, + "grad_norm": 0.82071453332901, + "learning_rate": 5.614558360883426e-07, + "loss": 0.2244, + "step": 40055 + }, + { + "epoch": 0.8929324375279785, + "grad_norm": 0.9399155378341675, + "learning_rate": 5.602997133321686e-07, + "loss": 0.3318, + "step": 40060 + }, + { + "epoch": 0.8930438869085985, + "grad_norm": 0.5941539406776428, + "learning_rate": 5.591447478103862e-07, + "loss": 0.3065, + "step": 40065 + }, + { + "epoch": 0.8931553362892186, + "grad_norm": 0.8494892716407776, + "learning_rate": 5.579909396645844e-07, + "loss": 0.2649, + "step": 40070 + }, + { + "epoch": 0.8932667856698386, + "grad_norm": 0.5023177266120911, + "learning_rate": 5.568382890362134e-07, + "loss": 0.2635, + "step": 40075 + }, + { + "epoch": 0.8933782350504587, + "grad_norm": 1.0533781051635742, + "learning_rate": 5.556867960665768e-07, + "loss": 0.2732, + "step": 40080 + }, + { + "epoch": 0.8934896844310788, + "grad_norm": 0.6501057744026184, + "learning_rate": 5.545364608968407e-07, + "loss": 0.2353, + "step": 40085 + }, + { + "epoch": 0.8936011338116988, + "grad_norm": 0.7864367961883545, + "learning_rate": 5.533872836680254e-07, + "loss": 0.2648, + "step": 40090 + }, + { + "epoch": 0.8937125831923189, + "grad_norm": 0.5219927430152893, + "learning_rate": 5.52239264521014e-07, + "loss": 0.2553, + "step": 40095 + }, + { + "epoch": 0.893824032572939, + "grad_norm": 0.44724053144454956, + "learning_rate": 5.510924035965437e-07, + "loss": 0.277, + "step": 40100 + }, + { + "epoch": 0.8939354819535591, + "grad_norm": 0.5169790387153625, + "learning_rate": 5.4994670103521e-07, + "loss": 0.2414, + "step": 40105 + }, + { + "epoch": 0.8940469313341791, + "grad_norm": 0.5901083946228027, + "learning_rate": 5.488021569774682e-07, + "loss": 0.2424, + "step": 40110 + }, + { + "epoch": 0.8941583807147991, + "grad_norm": 0.42854076623916626, + "learning_rate": 5.476587715636273e-07, + "loss": 0.2814, + "step": 40115 + }, + { + "epoch": 0.8942698300954193, + "grad_norm": 0.5647521615028381, + "learning_rate": 5.465165449338628e-07, + "loss": 0.331, + "step": 40120 + }, + { + "epoch": 0.8943812794760393, + "grad_norm": 0.4333915710449219, + "learning_rate": 5.453754772281972e-07, + "loss": 0.2925, + "step": 40125 + }, + { + "epoch": 0.8944927288566594, + "grad_norm": 0.3756270706653595, + "learning_rate": 5.442355685865219e-07, + "loss": 0.2895, + "step": 40130 + }, + { + "epoch": 0.8946041782372794, + "grad_norm": 0.6410927176475525, + "learning_rate": 5.430968191485797e-07, + "loss": 0.4226, + "step": 40135 + }, + { + "epoch": 0.8947156276178995, + "grad_norm": 0.5165871977806091, + "learning_rate": 5.4195922905397e-07, + "loss": 0.2558, + "step": 40140 + }, + { + "epoch": 0.8948270769985196, + "grad_norm": 0.5397159457206726, + "learning_rate": 5.408227984421521e-07, + "loss": 0.3183, + "step": 40145 + }, + { + "epoch": 0.8949385263791396, + "grad_norm": 0.7090404629707336, + "learning_rate": 5.396875274524482e-07, + "loss": 0.173, + "step": 40150 + }, + { + "epoch": 0.8950499757597598, + "grad_norm": 0.9219658374786377, + "learning_rate": 5.385534162240303e-07, + "loss": 0.2953, + "step": 40155 + }, + { + "epoch": 0.8951614251403798, + "grad_norm": 0.5412634611129761, + "learning_rate": 5.374204648959314e-07, + "loss": 0.3092, + "step": 40160 + }, + { + "epoch": 0.8952728745209998, + "grad_norm": 0.6510096788406372, + "learning_rate": 5.36288673607045e-07, + "loss": 0.2088, + "step": 40165 + }, + { + "epoch": 0.8953843239016199, + "grad_norm": 0.2909053564071655, + "learning_rate": 5.351580424961178e-07, + "loss": 0.3199, + "step": 40170 + }, + { + "epoch": 0.89549577328224, + "grad_norm": 0.5638686418533325, + "learning_rate": 5.340285717017568e-07, + "loss": 0.2127, + "step": 40175 + }, + { + "epoch": 0.8956072226628601, + "grad_norm": 0.5359810590744019, + "learning_rate": 5.329002613624279e-07, + "loss": 0.2579, + "step": 40180 + }, + { + "epoch": 0.8957186720434801, + "grad_norm": 0.5342158675193787, + "learning_rate": 5.317731116164515e-07, + "loss": 0.2506, + "step": 40185 + }, + { + "epoch": 0.8958301214241002, + "grad_norm": 0.5907591581344604, + "learning_rate": 5.306471226020082e-07, + "loss": 0.1526, + "step": 40190 + }, + { + "epoch": 0.8959415708047203, + "grad_norm": 1.4923869371414185, + "learning_rate": 5.295222944571365e-07, + "loss": 0.3076, + "step": 40195 + }, + { + "epoch": 0.8960530201853403, + "grad_norm": 0.38972416520118713, + "learning_rate": 5.283986273197284e-07, + "loss": 0.2718, + "step": 40200 + }, + { + "epoch": 0.8961644695659604, + "grad_norm": 0.6770171523094177, + "learning_rate": 5.272761213275413e-07, + "loss": 0.3058, + "step": 40205 + }, + { + "epoch": 0.8962759189465804, + "grad_norm": 0.23331047594547272, + "learning_rate": 5.261547766181818e-07, + "loss": 0.271, + "step": 40210 + }, + { + "epoch": 0.8963873683272006, + "grad_norm": 0.611391007900238, + "learning_rate": 5.250345933291201e-07, + "loss": 0.2069, + "step": 40215 + }, + { + "epoch": 0.8964988177078206, + "grad_norm": 0.4834649860858917, + "learning_rate": 5.239155715976829e-07, + "loss": 0.2186, + "step": 40220 + }, + { + "epoch": 0.8966102670884406, + "grad_norm": 0.5032212734222412, + "learning_rate": 5.227977115610528e-07, + "loss": 0.2942, + "step": 40225 + }, + { + "epoch": 0.8967217164690607, + "grad_norm": 0.4694267511367798, + "learning_rate": 5.216810133562689e-07, + "loss": 0.3242, + "step": 40230 + }, + { + "epoch": 0.8968331658496808, + "grad_norm": 0.5364362597465515, + "learning_rate": 5.205654771202317e-07, + "loss": 0.208, + "step": 40235 + }, + { + "epoch": 0.8969446152303009, + "grad_norm": 0.452486127614975, + "learning_rate": 5.194511029896965e-07, + "loss": 0.2727, + "step": 40240 + }, + { + "epoch": 0.8970560646109209, + "grad_norm": 0.24721093475818634, + "learning_rate": 5.183378911012782e-07, + "loss": 0.2061, + "step": 40245 + }, + { + "epoch": 0.897167513991541, + "grad_norm": 0.5605514049530029, + "learning_rate": 5.172258415914478e-07, + "loss": 0.2209, + "step": 40250 + }, + { + "epoch": 0.8972789633721611, + "grad_norm": 0.5760518312454224, + "learning_rate": 5.161149545965328e-07, + "loss": 0.2893, + "step": 40255 + }, + { + "epoch": 0.8973904127527811, + "grad_norm": 0.516250729560852, + "learning_rate": 5.150052302527208e-07, + "loss": 0.2429, + "step": 40260 + }, + { + "epoch": 0.8975018621334012, + "grad_norm": 0.14454306662082672, + "learning_rate": 5.138966686960534e-07, + "loss": 0.2113, + "step": 40265 + }, + { + "epoch": 0.8976133115140212, + "grad_norm": 0.6639364957809448, + "learning_rate": 5.127892700624326e-07, + "loss": 0.2789, + "step": 40270 + }, + { + "epoch": 0.8977247608946414, + "grad_norm": 0.5296926498413086, + "learning_rate": 5.116830344876178e-07, + "loss": 0.2151, + "step": 40275 + }, + { + "epoch": 0.8978362102752614, + "grad_norm": 0.7752794623374939, + "learning_rate": 5.10577962107226e-07, + "loss": 0.3579, + "step": 40280 + }, + { + "epoch": 0.8979476596558814, + "grad_norm": 0.5278540253639221, + "learning_rate": 5.094740530567277e-07, + "loss": 0.1673, + "step": 40285 + }, + { + "epoch": 0.8980591090365015, + "grad_norm": 0.7104523777961731, + "learning_rate": 5.083713074714547e-07, + "loss": 0.2614, + "step": 40290 + }, + { + "epoch": 0.8981705584171216, + "grad_norm": 0.8003318905830383, + "learning_rate": 5.072697254865966e-07, + "loss": 0.4819, + "step": 40295 + }, + { + "epoch": 0.8982820077977417, + "grad_norm": 0.7443469166755676, + "learning_rate": 5.061693072371953e-07, + "loss": 0.1905, + "step": 40300 + }, + { + "epoch": 0.8983934571783617, + "grad_norm": 0.5459474325180054, + "learning_rate": 5.050700528581554e-07, + "loss": 0.3179, + "step": 40305 + }, + { + "epoch": 0.8985049065589819, + "grad_norm": 1.0226274728775024, + "learning_rate": 5.039719624842398e-07, + "loss": 0.3921, + "step": 40310 + }, + { + "epoch": 0.8986163559396019, + "grad_norm": 0.3113342821598053, + "learning_rate": 5.028750362500633e-07, + "loss": 0.3065, + "step": 40315 + }, + { + "epoch": 0.8987278053202219, + "grad_norm": 0.8021506667137146, + "learning_rate": 5.017792742901006e-07, + "loss": 0.3777, + "step": 40320 + }, + { + "epoch": 0.898839254700842, + "grad_norm": 0.36907243728637695, + "learning_rate": 5.006846767386831e-07, + "loss": 0.2731, + "step": 40325 + }, + { + "epoch": 0.898950704081462, + "grad_norm": 0.45655930042266846, + "learning_rate": 4.995912437299999e-07, + "loss": 0.2021, + "step": 40330 + }, + { + "epoch": 0.8990621534620822, + "grad_norm": 0.7580198645591736, + "learning_rate": 4.984989753981007e-07, + "loss": 0.2069, + "step": 40335 + }, + { + "epoch": 0.8991736028427022, + "grad_norm": 0.5341771841049194, + "learning_rate": 4.974078718768837e-07, + "loss": 0.3496, + "step": 40340 + }, + { + "epoch": 0.8992850522233222, + "grad_norm": 0.809218168258667, + "learning_rate": 4.963179333001156e-07, + "loss": 0.2558, + "step": 40345 + }, + { + "epoch": 0.8993965016039424, + "grad_norm": 0.488911509513855, + "learning_rate": 4.952291598014114e-07, + "loss": 0.2288, + "step": 40350 + }, + { + "epoch": 0.8995079509845624, + "grad_norm": 0.5416426062583923, + "learning_rate": 4.941415515142467e-07, + "loss": 0.2644, + "step": 40355 + }, + { + "epoch": 0.8996194003651825, + "grad_norm": 0.9903549551963806, + "learning_rate": 4.930551085719515e-07, + "loss": 0.2549, + "step": 40360 + }, + { + "epoch": 0.8997308497458025, + "grad_norm": 0.4512089490890503, + "learning_rate": 4.919698311077203e-07, + "loss": 0.2033, + "step": 40365 + }, + { + "epoch": 0.8998422991264226, + "grad_norm": 0.7078744769096375, + "learning_rate": 4.908857192545958e-07, + "loss": 0.2584, + "step": 40370 + }, + { + "epoch": 0.8999537485070427, + "grad_norm": 0.7992226481437683, + "learning_rate": 4.898027731454824e-07, + "loss": 0.2492, + "step": 40375 + }, + { + "epoch": 0.9000651978876627, + "grad_norm": 0.7031039595603943, + "learning_rate": 4.887209929131431e-07, + "loss": 0.2458, + "step": 40380 + }, + { + "epoch": 0.9001766472682828, + "grad_norm": 0.5648842453956604, + "learning_rate": 4.876403786901939e-07, + "loss": 0.2639, + "step": 40385 + }, + { + "epoch": 0.9002880966489029, + "grad_norm": 0.8292831182479858, + "learning_rate": 4.865609306091101e-07, + "loss": 0.3474, + "step": 40390 + }, + { + "epoch": 0.900399546029523, + "grad_norm": 0.8248302340507507, + "learning_rate": 4.854826488022235e-07, + "loss": 0.2959, + "step": 40395 + }, + { + "epoch": 0.900510995410143, + "grad_norm": 0.6083874702453613, + "learning_rate": 4.844055334017228e-07, + "loss": 0.2092, + "step": 40400 + }, + { + "epoch": 0.900622444790763, + "grad_norm": 0.8103845715522766, + "learning_rate": 4.833295845396558e-07, + "loss": 0.2423, + "step": 40405 + }, + { + "epoch": 0.9007338941713832, + "grad_norm": 0.6734932661056519, + "learning_rate": 4.822548023479234e-07, + "loss": 0.1936, + "step": 40410 + }, + { + "epoch": 0.9008453435520032, + "grad_norm": 0.7690288424491882, + "learning_rate": 4.811811869582872e-07, + "loss": 0.3171, + "step": 40415 + }, + { + "epoch": 0.9009567929326233, + "grad_norm": 0.8907411098480225, + "learning_rate": 4.801087385023629e-07, + "loss": 0.2316, + "step": 40420 + }, + { + "epoch": 0.9010682423132433, + "grad_norm": 0.8934478759765625, + "learning_rate": 4.79037457111624e-07, + "loss": 0.19, + "step": 40425 + }, + { + "epoch": 0.9011796916938634, + "grad_norm": 0.38658207654953003, + "learning_rate": 4.779673429174036e-07, + "loss": 0.2257, + "step": 40430 + }, + { + "epoch": 0.9012911410744835, + "grad_norm": 0.5999665260314941, + "learning_rate": 4.768983960508888e-07, + "loss": 0.2945, + "step": 40435 + }, + { + "epoch": 0.9014025904551035, + "grad_norm": 0.6051561236381531, + "learning_rate": 4.758306166431226e-07, + "loss": 0.2874, + "step": 40440 + }, + { + "epoch": 0.9015140398357236, + "grad_norm": 0.7040433287620544, + "learning_rate": 4.747640048250068e-07, + "loss": 0.3253, + "step": 40445 + }, + { + "epoch": 0.9016254892163437, + "grad_norm": 0.5505911111831665, + "learning_rate": 4.736985607273026e-07, + "loss": 0.1973, + "step": 40450 + }, + { + "epoch": 0.9017369385969638, + "grad_norm": 0.6418886780738831, + "learning_rate": 4.726342844806209e-07, + "loss": 0.3427, + "step": 40455 + }, + { + "epoch": 0.9018483879775838, + "grad_norm": 0.46738046407699585, + "learning_rate": 4.715711762154362e-07, + "loss": 0.3277, + "step": 40460 + }, + { + "epoch": 0.9019598373582038, + "grad_norm": 1.1336296796798706, + "learning_rate": 4.7050923606207886e-07, + "loss": 0.3439, + "step": 40465 + }, + { + "epoch": 0.902071286738824, + "grad_norm": 0.5736339688301086, + "learning_rate": 4.694484641507324e-07, + "loss": 0.3776, + "step": 40470 + }, + { + "epoch": 0.902182736119444, + "grad_norm": 0.4404354989528656, + "learning_rate": 4.683888606114384e-07, + "loss": 0.2393, + "step": 40475 + }, + { + "epoch": 0.9022941855000641, + "grad_norm": 0.4591207206249237, + "learning_rate": 4.673304255740974e-07, + "loss": 0.3282, + "step": 40480 + }, + { + "epoch": 0.9024056348806841, + "grad_norm": 0.9725980162620544, + "learning_rate": 4.662731591684655e-07, + "loss": 0.2651, + "step": 40485 + }, + { + "epoch": 0.9025170842613042, + "grad_norm": 0.6524075269699097, + "learning_rate": 4.652170615241558e-07, + "loss": 0.3794, + "step": 40490 + }, + { + "epoch": 0.9026285336419243, + "grad_norm": 0.6163941025733948, + "learning_rate": 4.6416213277063693e-07, + "loss": 0.2641, + "step": 40495 + }, + { + "epoch": 0.9027399830225443, + "grad_norm": 0.8426482677459717, + "learning_rate": 4.6310837303723546e-07, + "loss": 0.3196, + "step": 40500 + }, + { + "epoch": 0.9028514324031645, + "grad_norm": 0.47756075859069824, + "learning_rate": 4.6205578245313356e-07, + "loss": 0.294, + "step": 40505 + }, + { + "epoch": 0.9029628817837845, + "grad_norm": 0.5749417543411255, + "learning_rate": 4.6100436114737246e-07, + "loss": 0.2196, + "step": 40510 + }, + { + "epoch": 0.9030743311644046, + "grad_norm": 0.8055911064147949, + "learning_rate": 4.5995410924884464e-07, + "loss": 0.3331, + "step": 40515 + }, + { + "epoch": 0.9031857805450246, + "grad_norm": 0.5792744159698486, + "learning_rate": 4.5890502688630487e-07, + "loss": 0.3535, + "step": 40520 + }, + { + "epoch": 0.9032972299256447, + "grad_norm": 0.31290754675865173, + "learning_rate": 4.5785711418836473e-07, + "loss": 0.4213, + "step": 40525 + }, + { + "epoch": 0.9034086793062648, + "grad_norm": 0.27813515067100525, + "learning_rate": 4.5681037128348816e-07, + "loss": 0.2462, + "step": 40530 + }, + { + "epoch": 0.9035201286868848, + "grad_norm": 0.9764168858528137, + "learning_rate": 4.55764798299998e-07, + "loss": 0.2968, + "step": 40535 + }, + { + "epoch": 0.9036315780675049, + "grad_norm": 0.5820105671882629, + "learning_rate": 4.547203953660728e-07, + "loss": 0.3481, + "step": 40540 + }, + { + "epoch": 0.903743027448125, + "grad_norm": 0.6162011623382568, + "learning_rate": 4.5367716260974916e-07, + "loss": 0.3021, + "step": 40545 + }, + { + "epoch": 0.903854476828745, + "grad_norm": 0.5713285803794861, + "learning_rate": 4.5263510015891796e-07, + "loss": 0.2988, + "step": 40550 + }, + { + "epoch": 0.9039659262093651, + "grad_norm": 0.3402009606361389, + "learning_rate": 4.5159420814133047e-07, + "loss": 0.1795, + "step": 40555 + }, + { + "epoch": 0.9040773755899851, + "grad_norm": 0.6031463146209717, + "learning_rate": 4.505544866845901e-07, + "loss": 0.2681, + "step": 40560 + }, + { + "epoch": 0.9041888249706053, + "grad_norm": 0.746756911277771, + "learning_rate": 4.495159359161605e-07, + "loss": 0.2348, + "step": 40565 + }, + { + "epoch": 0.9043002743512253, + "grad_norm": 1.0207500457763672, + "learning_rate": 4.484785559633575e-07, + "loss": 0.2689, + "step": 40570 + }, + { + "epoch": 0.9044117237318453, + "grad_norm": 0.7450649738311768, + "learning_rate": 4.474423469533562e-07, + "loss": 0.2491, + "step": 40575 + }, + { + "epoch": 0.9045231731124654, + "grad_norm": 0.6365007162094116, + "learning_rate": 4.464073090131904e-07, + "loss": 0.3264, + "step": 40580 + }, + { + "epoch": 0.9046346224930855, + "grad_norm": 0.9207115769386292, + "learning_rate": 4.4537344226974533e-07, + "loss": 0.2552, + "step": 40585 + }, + { + "epoch": 0.9047460718737056, + "grad_norm": 0.5804201364517212, + "learning_rate": 4.4434074684976624e-07, + "loss": 0.2411, + "step": 40590 + }, + { + "epoch": 0.9048575212543256, + "grad_norm": 0.6803749203681946, + "learning_rate": 4.43309222879853e-07, + "loss": 0.3159, + "step": 40595 + }, + { + "epoch": 0.9049689706349457, + "grad_norm": 0.34516510367393494, + "learning_rate": 4.4227887048646335e-07, + "loss": 0.2359, + "step": 40600 + }, + { + "epoch": 0.9050804200155658, + "grad_norm": 0.8290709853172302, + "learning_rate": 4.4124968979590953e-07, + "loss": 0.2248, + "step": 40605 + }, + { + "epoch": 0.9051918693961858, + "grad_norm": 0.6502459049224854, + "learning_rate": 4.402216809343607e-07, + "loss": 0.2735, + "step": 40610 + }, + { + "epoch": 0.9053033187768059, + "grad_norm": 0.6392561793327332, + "learning_rate": 4.3919484402784483e-07, + "loss": 0.2153, + "step": 40615 + }, + { + "epoch": 0.9054147681574259, + "grad_norm": 0.6268975734710693, + "learning_rate": 4.3816917920224134e-07, + "loss": 0.2836, + "step": 40620 + }, + { + "epoch": 0.9055262175380461, + "grad_norm": 0.6285988688468933, + "learning_rate": 4.371446865832918e-07, + "loss": 0.2421, + "step": 40625 + }, + { + "epoch": 0.9056376669186661, + "grad_norm": 0.863253653049469, + "learning_rate": 4.3612136629659017e-07, + "loss": 0.2221, + "step": 40630 + }, + { + "epoch": 0.9057491162992861, + "grad_norm": 0.4006742238998413, + "learning_rate": 4.350992184675873e-07, + "loss": 0.2495, + "step": 40635 + }, + { + "epoch": 0.9058605656799062, + "grad_norm": 0.44644999504089355, + "learning_rate": 4.3407824322158844e-07, + "loss": 0.2173, + "step": 40640 + }, + { + "epoch": 0.9059720150605263, + "grad_norm": 0.7369060516357422, + "learning_rate": 4.3305844068376125e-07, + "loss": 0.2913, + "step": 40645 + }, + { + "epoch": 0.9060834644411464, + "grad_norm": 0.7623095512390137, + "learning_rate": 4.320398109791235e-07, + "loss": 0.2909, + "step": 40650 + }, + { + "epoch": 0.9061949138217664, + "grad_norm": 0.6151301860809326, + "learning_rate": 4.310223542325509e-07, + "loss": 0.1126, + "step": 40655 + }, + { + "epoch": 0.9063063632023866, + "grad_norm": 0.7170618772506714, + "learning_rate": 4.3000607056877697e-07, + "loss": 0.2357, + "step": 40660 + }, + { + "epoch": 0.9064178125830066, + "grad_norm": 0.5875582695007324, + "learning_rate": 4.2899096011238986e-07, + "loss": 0.2537, + "step": 40665 + }, + { + "epoch": 0.9065292619636266, + "grad_norm": 0.5494327545166016, + "learning_rate": 4.2797702298783327e-07, + "loss": 0.3221, + "step": 40670 + }, + { + "epoch": 0.9066407113442467, + "grad_norm": 0.7645689249038696, + "learning_rate": 4.269642593194101e-07, + "loss": 0.2921, + "step": 40675 + }, + { + "epoch": 0.9067521607248668, + "grad_norm": 0.5476593375205994, + "learning_rate": 4.2595266923127654e-07, + "loss": 0.3128, + "step": 40680 + }, + { + "epoch": 0.9068636101054869, + "grad_norm": 0.39104554057121277, + "learning_rate": 4.249422528474456e-07, + "loss": 0.3036, + "step": 40685 + }, + { + "epoch": 0.9069750594861069, + "grad_norm": 0.6466798186302185, + "learning_rate": 4.239330102917838e-07, + "loss": 0.2348, + "step": 40690 + }, + { + "epoch": 0.9070865088667269, + "grad_norm": 0.5702161192893982, + "learning_rate": 4.229249416880221e-07, + "loss": 0.2807, + "step": 40695 + }, + { + "epoch": 0.9071979582473471, + "grad_norm": 0.7513668537139893, + "learning_rate": 4.2191804715973725e-07, + "loss": 0.2557, + "step": 40700 + }, + { + "epoch": 0.9073094076279671, + "grad_norm": 0.6457399725914001, + "learning_rate": 4.209123268303683e-07, + "loss": 0.2531, + "step": 40705 + }, + { + "epoch": 0.9074208570085872, + "grad_norm": 0.683296263217926, + "learning_rate": 4.1990778082320993e-07, + "loss": 0.2555, + "step": 40710 + }, + { + "epoch": 0.9075323063892072, + "grad_norm": 0.4363429844379425, + "learning_rate": 4.189044092614103e-07, + "loss": 0.2557, + "step": 40715 + }, + { + "epoch": 0.9076437557698273, + "grad_norm": 1.0468658208847046, + "learning_rate": 4.179022122679754e-07, + "loss": 0.2291, + "step": 40720 + }, + { + "epoch": 0.9077552051504474, + "grad_norm": 0.5416340231895447, + "learning_rate": 4.169011899657671e-07, + "loss": 0.2986, + "step": 40725 + }, + { + "epoch": 0.9078666545310674, + "grad_norm": 0.45224234461784363, + "learning_rate": 4.159013424775016e-07, + "loss": 0.1981, + "step": 40730 + }, + { + "epoch": 0.9079781039116875, + "grad_norm": 0.7300019264221191, + "learning_rate": 4.14902669925753e-07, + "loss": 0.2268, + "step": 40735 + }, + { + "epoch": 0.9080895532923076, + "grad_norm": 0.6923551559448242, + "learning_rate": 4.1390517243295236e-07, + "loss": 0.3246, + "step": 40740 + }, + { + "epoch": 0.9082010026729277, + "grad_norm": 0.7699552178382874, + "learning_rate": 4.1290885012138514e-07, + "loss": 0.3116, + "step": 40745 + }, + { + "epoch": 0.9083124520535477, + "grad_norm": 0.6291413307189941, + "learning_rate": 4.119137031131926e-07, + "loss": 0.2966, + "step": 40750 + }, + { + "epoch": 0.9084239014341677, + "grad_norm": 0.3823592960834503, + "learning_rate": 4.109197315303703e-07, + "loss": 0.2266, + "step": 40755 + }, + { + "epoch": 0.9085353508147879, + "grad_norm": 0.6914843320846558, + "learning_rate": 4.09926935494771e-07, + "loss": 0.3038, + "step": 40760 + }, + { + "epoch": 0.9086468001954079, + "grad_norm": 0.8302752375602722, + "learning_rate": 4.089353151281061e-07, + "loss": 0.3805, + "step": 40765 + }, + { + "epoch": 0.908758249576028, + "grad_norm": 0.7488309741020203, + "learning_rate": 4.079448705519418e-07, + "loss": 0.2704, + "step": 40770 + }, + { + "epoch": 0.908869698956648, + "grad_norm": 0.5267086029052734, + "learning_rate": 4.069556018876963e-07, + "loss": 0.2928, + "step": 40775 + }, + { + "epoch": 0.9089811483372681, + "grad_norm": 0.5103002190589905, + "learning_rate": 4.059675092566462e-07, + "loss": 0.1929, + "step": 40780 + }, + { + "epoch": 0.9090925977178882, + "grad_norm": 0.476452499628067, + "learning_rate": 4.0498059277992665e-07, + "loss": 0.3, + "step": 40785 + }, + { + "epoch": 0.9092040470985082, + "grad_norm": 0.5397807955741882, + "learning_rate": 4.03994852578522e-07, + "loss": 0.2652, + "step": 40790 + }, + { + "epoch": 0.9093154964791283, + "grad_norm": 0.4290942847728729, + "learning_rate": 4.030102887732801e-07, + "loss": 0.3166, + "step": 40795 + }, + { + "epoch": 0.9094269458597484, + "grad_norm": 0.2386811226606369, + "learning_rate": 4.0202690148489766e-07, + "loss": 0.3247, + "step": 40800 + }, + { + "epoch": 0.9095383952403685, + "grad_norm": 0.898285448551178, + "learning_rate": 4.010446908339338e-07, + "loss": 0.3954, + "step": 40805 + }, + { + "epoch": 0.9096498446209885, + "grad_norm": 0.5822149515151978, + "learning_rate": 4.000636569407978e-07, + "loss": 0.2307, + "step": 40810 + }, + { + "epoch": 0.9097612940016085, + "grad_norm": 0.7847784757614136, + "learning_rate": 3.9908379992575795e-07, + "loss": 0.2917, + "step": 40815 + }, + { + "epoch": 0.9098727433822287, + "grad_norm": 0.6533625721931458, + "learning_rate": 3.9810511990893696e-07, + "loss": 0.2921, + "step": 40820 + }, + { + "epoch": 0.9099841927628487, + "grad_norm": 0.5961136817932129, + "learning_rate": 3.971276170103111e-07, + "loss": 0.3071, + "step": 40825 + }, + { + "epoch": 0.9100956421434688, + "grad_norm": 0.513742983341217, + "learning_rate": 3.9615129134971786e-07, + "loss": 0.2203, + "step": 40830 + }, + { + "epoch": 0.9102070915240889, + "grad_norm": 0.7879976630210876, + "learning_rate": 3.951761430468448e-07, + "loss": 0.2818, + "step": 40835 + }, + { + "epoch": 0.9103185409047089, + "grad_norm": 0.7775445580482483, + "learning_rate": 3.9420217222123967e-07, + "loss": 0.2644, + "step": 40840 + }, + { + "epoch": 0.910429990285329, + "grad_norm": 0.40710213780403137, + "learning_rate": 3.932293789923036e-07, + "loss": 0.4258, + "step": 40845 + }, + { + "epoch": 0.910541439665949, + "grad_norm": 0.8035426139831543, + "learning_rate": 3.9225776347929235e-07, + "loss": 0.318, + "step": 40850 + }, + { + "epoch": 0.9106528890465692, + "grad_norm": 0.41157716512680054, + "learning_rate": 3.9128732580131725e-07, + "loss": 0.2158, + "step": 40855 + }, + { + "epoch": 0.9107643384271892, + "grad_norm": 0.32904186844825745, + "learning_rate": 3.9031806607734977e-07, + "loss": 0.3042, + "step": 40860 + }, + { + "epoch": 0.9108757878078093, + "grad_norm": 0.37337726354599, + "learning_rate": 3.8934998442621165e-07, + "loss": 0.3871, + "step": 40865 + }, + { + "epoch": 0.9109872371884293, + "grad_norm": 0.6850005388259888, + "learning_rate": 3.8838308096658227e-07, + "loss": 0.2787, + "step": 40870 + }, + { + "epoch": 0.9110986865690494, + "grad_norm": 1.088067889213562, + "learning_rate": 3.874173558169969e-07, + "loss": 0.3177, + "step": 40875 + }, + { + "epoch": 0.9112101359496695, + "grad_norm": 0.7726697325706482, + "learning_rate": 3.864528090958475e-07, + "loss": 0.3719, + "step": 40880 + }, + { + "epoch": 0.9113215853302895, + "grad_norm": 0.6238122582435608, + "learning_rate": 3.854894409213761e-07, + "loss": 0.2347, + "step": 40885 + }, + { + "epoch": 0.9114330347109096, + "grad_norm": 0.8500913977622986, + "learning_rate": 3.8452725141168824e-07, + "loss": 0.3518, + "step": 40890 + }, + { + "epoch": 0.9115444840915297, + "grad_norm": 0.8673303723335266, + "learning_rate": 3.835662406847385e-07, + "loss": 0.2818, + "step": 40895 + }, + { + "epoch": 0.9116559334721497, + "grad_norm": 0.590812623500824, + "learning_rate": 3.826064088583414e-07, + "loss": 0.2046, + "step": 40900 + }, + { + "epoch": 0.9117673828527698, + "grad_norm": 0.7240970730781555, + "learning_rate": 3.816477560501619e-07, + "loss": 0.3674, + "step": 40905 + }, + { + "epoch": 0.9118788322333898, + "grad_norm": 0.5596110224723816, + "learning_rate": 3.80690282377727e-07, + "loss": 0.3576, + "step": 40910 + }, + { + "epoch": 0.91199028161401, + "grad_norm": 0.7215169072151184, + "learning_rate": 3.797339879584128e-07, + "loss": 0.2859, + "step": 40915 + }, + { + "epoch": 0.91210173099463, + "grad_norm": 0.6473619937896729, + "learning_rate": 3.7877887290945434e-07, + "loss": 0.2701, + "step": 40920 + }, + { + "epoch": 0.91221318037525, + "grad_norm": 0.760945737361908, + "learning_rate": 3.7782493734794143e-07, + "loss": 0.2415, + "step": 40925 + }, + { + "epoch": 0.9123246297558701, + "grad_norm": 0.5840770602226257, + "learning_rate": 3.768721813908205e-07, + "loss": 0.2049, + "step": 40930 + }, + { + "epoch": 0.9124360791364902, + "grad_norm": 0.779276430606842, + "learning_rate": 3.759206051548914e-07, + "loss": 0.2785, + "step": 40935 + }, + { + "epoch": 0.9125475285171103, + "grad_norm": 0.848493754863739, + "learning_rate": 3.7497020875680746e-07, + "loss": 0.3365, + "step": 40940 + }, + { + "epoch": 0.9126589778977303, + "grad_norm": 0.7321879267692566, + "learning_rate": 3.740209923130844e-07, + "loss": 0.3718, + "step": 40945 + }, + { + "epoch": 0.9127704272783504, + "grad_norm": 0.6048905253410339, + "learning_rate": 3.730729559400847e-07, + "loss": 0.2731, + "step": 40950 + }, + { + "epoch": 0.9128818766589705, + "grad_norm": 0.444602906703949, + "learning_rate": 3.7212609975403325e-07, + "loss": 0.3882, + "step": 40955 + }, + { + "epoch": 0.9129933260395905, + "grad_norm": 0.47648975253105164, + "learning_rate": 3.7118042387100596e-07, + "loss": 0.2263, + "step": 40960 + }, + { + "epoch": 0.9131047754202106, + "grad_norm": 0.7234205007553101, + "learning_rate": 3.7023592840693566e-07, + "loss": 0.3095, + "step": 40965 + }, + { + "epoch": 0.9132162248008306, + "grad_norm": 0.24275638163089752, + "learning_rate": 3.692926134776109e-07, + "loss": 0.3063, + "step": 40970 + }, + { + "epoch": 0.9133276741814508, + "grad_norm": 0.5635351538658142, + "learning_rate": 3.683504791986714e-07, + "loss": 0.3308, + "step": 40975 + }, + { + "epoch": 0.9134391235620708, + "grad_norm": 0.5409741401672363, + "learning_rate": 3.674095256856192e-07, + "loss": 0.2552, + "step": 40980 + }, + { + "epoch": 0.9135505729426908, + "grad_norm": 0.574982762336731, + "learning_rate": 3.664697530538086e-07, + "loss": 0.2514, + "step": 40985 + }, + { + "epoch": 0.913662022323311, + "grad_norm": 0.5560092329978943, + "learning_rate": 3.6553116141844645e-07, + "loss": 0.2257, + "step": 40990 + }, + { + "epoch": 0.913773471703931, + "grad_norm": 0.5724308490753174, + "learning_rate": 3.6459375089459735e-07, + "loss": 0.3164, + "step": 40995 + }, + { + "epoch": 0.9138849210845511, + "grad_norm": 0.5456295609474182, + "learning_rate": 3.636575215971805e-07, + "loss": 0.2546, + "step": 41000 + }, + { + "epoch": 0.9139963704651711, + "grad_norm": 0.5563053488731384, + "learning_rate": 3.6272247364096845e-07, + "loss": 0.3636, + "step": 41005 + }, + { + "epoch": 0.9141078198457913, + "grad_norm": 0.536551296710968, + "learning_rate": 3.6178860714059406e-07, + "loss": 0.2576, + "step": 41010 + }, + { + "epoch": 0.9142192692264113, + "grad_norm": 0.515977144241333, + "learning_rate": 3.608559222105401e-07, + "loss": 0.3612, + "step": 41015 + }, + { + "epoch": 0.9143307186070313, + "grad_norm": 0.6344241499900818, + "learning_rate": 3.599244189651485e-07, + "loss": 0.2691, + "step": 41020 + }, + { + "epoch": 0.9144421679876514, + "grad_norm": 1.1275960206985474, + "learning_rate": 3.589940975186135e-07, + "loss": 0.2796, + "step": 41025 + }, + { + "epoch": 0.9145536173682715, + "grad_norm": 0.8587217926979065, + "learning_rate": 3.5806495798498486e-07, + "loss": 0.4254, + "step": 41030 + }, + { + "epoch": 0.9146650667488916, + "grad_norm": 0.6224008202552795, + "learning_rate": 3.5713700047816715e-07, + "loss": 0.3392, + "step": 41035 + }, + { + "epoch": 0.9147765161295116, + "grad_norm": 0.7869055867195129, + "learning_rate": 3.562102251119215e-07, + "loss": 0.3612, + "step": 41040 + }, + { + "epoch": 0.9148879655101316, + "grad_norm": 0.5396468043327332, + "learning_rate": 3.5528463199986374e-07, + "loss": 0.3232, + "step": 41045 + }, + { + "epoch": 0.9149994148907518, + "grad_norm": 0.7791078090667725, + "learning_rate": 3.543602212554642e-07, + "loss": 0.3343, + "step": 41050 + }, + { + "epoch": 0.9151108642713718, + "grad_norm": 1.2158949375152588, + "learning_rate": 3.5343699299205003e-07, + "loss": 0.1825, + "step": 41055 + }, + { + "epoch": 0.9152223136519919, + "grad_norm": 0.45359018445014954, + "learning_rate": 3.5251494732279957e-07, + "loss": 0.3307, + "step": 41060 + }, + { + "epoch": 0.9153337630326119, + "grad_norm": 0.6725892424583435, + "learning_rate": 3.5159408436075015e-07, + "loss": 0.3159, + "step": 41065 + }, + { + "epoch": 0.9154452124132321, + "grad_norm": 0.4623555839061737, + "learning_rate": 3.506744042187904e-07, + "loss": 0.2987, + "step": 41070 + }, + { + "epoch": 0.9155566617938521, + "grad_norm": 0.9277505278587341, + "learning_rate": 3.497559070096679e-07, + "loss": 0.2633, + "step": 41075 + }, + { + "epoch": 0.9156681111744721, + "grad_norm": 0.7097349762916565, + "learning_rate": 3.4883859284598254e-07, + "loss": 0.2369, + "step": 41080 + }, + { + "epoch": 0.9157795605550922, + "grad_norm": 0.5371698141098022, + "learning_rate": 3.4792246184018997e-07, + "loss": 0.277, + "step": 41085 + }, + { + "epoch": 0.9158910099357123, + "grad_norm": 0.5579074621200562, + "learning_rate": 3.4700751410460255e-07, + "loss": 0.3128, + "step": 41090 + }, + { + "epoch": 0.9160024593163324, + "grad_norm": 0.9541270732879639, + "learning_rate": 3.4609374975138275e-07, + "loss": 0.3005, + "step": 41095 + }, + { + "epoch": 0.9161139086969524, + "grad_norm": 0.6161052584648132, + "learning_rate": 3.4518116889255215e-07, + "loss": 0.3291, + "step": 41100 + }, + { + "epoch": 0.9162253580775724, + "grad_norm": 0.6096290349960327, + "learning_rate": 3.442697716399879e-07, + "loss": 0.1922, + "step": 41105 + }, + { + "epoch": 0.9163368074581926, + "grad_norm": 0.8514531850814819, + "learning_rate": 3.4335955810541834e-07, + "loss": 0.3677, + "step": 41110 + }, + { + "epoch": 0.9164482568388126, + "grad_norm": 0.49064114689826965, + "learning_rate": 3.424505284004276e-07, + "loss": 0.254, + "step": 41115 + }, + { + "epoch": 0.9165597062194327, + "grad_norm": 0.7443819046020508, + "learning_rate": 3.4154268263645983e-07, + "loss": 0.2203, + "step": 41120 + }, + { + "epoch": 0.9166711556000527, + "grad_norm": 1.1286894083023071, + "learning_rate": 3.4063602092480606e-07, + "loss": 0.2697, + "step": 41125 + }, + { + "epoch": 0.9167826049806728, + "grad_norm": 0.33552834391593933, + "learning_rate": 3.3973054337661737e-07, + "loss": 0.2624, + "step": 41130 + }, + { + "epoch": 0.9168940543612929, + "grad_norm": 0.5012408494949341, + "learning_rate": 3.3882625010289717e-07, + "loss": 0.2549, + "step": 41135 + }, + { + "epoch": 0.9170055037419129, + "grad_norm": 0.7399274706840515, + "learning_rate": 3.379231412145079e-07, + "loss": 0.2953, + "step": 41140 + }, + { + "epoch": 0.917116953122533, + "grad_norm": 0.6067442893981934, + "learning_rate": 3.3702121682216094e-07, + "loss": 0.1676, + "step": 41145 + }, + { + "epoch": 0.9172284025031531, + "grad_norm": 1.101588249206543, + "learning_rate": 3.361204770364246e-07, + "loss": 0.3673, + "step": 41150 + }, + { + "epoch": 0.9173398518837732, + "grad_norm": 0.7327398061752319, + "learning_rate": 3.3522092196772605e-07, + "loss": 0.3734, + "step": 41155 + }, + { + "epoch": 0.9174513012643932, + "grad_norm": 0.6925585865974426, + "learning_rate": 3.3432255172634153e-07, + "loss": 0.1508, + "step": 41160 + }, + { + "epoch": 0.9175627506450132, + "grad_norm": 0.6559704542160034, + "learning_rate": 3.3342536642240296e-07, + "loss": 0.3934, + "step": 41165 + }, + { + "epoch": 0.9176742000256334, + "grad_norm": 0.5590237379074097, + "learning_rate": 3.3252936616590125e-07, + "loss": 0.3641, + "step": 41170 + }, + { + "epoch": 0.9177856494062534, + "grad_norm": 0.913287878036499, + "learning_rate": 3.3163455106667854e-07, + "loss": 0.305, + "step": 41175 + }, + { + "epoch": 0.9178970987868735, + "grad_norm": 0.7776272892951965, + "learning_rate": 3.3074092123443036e-07, + "loss": 0.2427, + "step": 41180 + }, + { + "epoch": 0.9180085481674936, + "grad_norm": 0.601886510848999, + "learning_rate": 3.2984847677871025e-07, + "loss": 0.1735, + "step": 41185 + }, + { + "epoch": 0.9181199975481136, + "grad_norm": 0.5641916990280151, + "learning_rate": 3.289572178089251e-07, + "loss": 0.3247, + "step": 41190 + }, + { + "epoch": 0.9182314469287337, + "grad_norm": 0.5231988430023193, + "learning_rate": 3.2806714443433416e-07, + "loss": 0.2847, + "step": 41195 + }, + { + "epoch": 0.9183428963093537, + "grad_norm": 0.83245450258255, + "learning_rate": 3.27178256764058e-07, + "loss": 0.297, + "step": 41200 + }, + { + "epoch": 0.9184543456899739, + "grad_norm": 0.6542052030563354, + "learning_rate": 3.2629055490706386e-07, + "loss": 0.3121, + "step": 41205 + }, + { + "epoch": 0.9185657950705939, + "grad_norm": 0.6979672908782959, + "learning_rate": 3.2540403897217907e-07, + "loss": 0.1994, + "step": 41210 + }, + { + "epoch": 0.918677244451214, + "grad_norm": 0.49956926703453064, + "learning_rate": 3.2451870906808236e-07, + "loss": 0.2602, + "step": 41215 + }, + { + "epoch": 0.918788693831834, + "grad_norm": 0.7163175940513611, + "learning_rate": 3.236345653033068e-07, + "loss": 0.3169, + "step": 41220 + }, + { + "epoch": 0.9189001432124541, + "grad_norm": 0.6471841335296631, + "learning_rate": 3.227516077862447e-07, + "loss": 0.2563, + "step": 41225 + }, + { + "epoch": 0.9190115925930742, + "grad_norm": 0.27869361639022827, + "learning_rate": 3.2186983662513826e-07, + "loss": 0.2896, + "step": 41230 + }, + { + "epoch": 0.9191230419736942, + "grad_norm": 0.7508506774902344, + "learning_rate": 3.209892519280866e-07, + "loss": 0.3923, + "step": 41235 + }, + { + "epoch": 0.9192344913543143, + "grad_norm": 0.47320273518562317, + "learning_rate": 3.2010985380304337e-07, + "loss": 0.237, + "step": 41240 + }, + { + "epoch": 0.9193459407349344, + "grad_norm": 0.3736218214035034, + "learning_rate": 3.1923164235781346e-07, + "loss": 0.3483, + "step": 41245 + }, + { + "epoch": 0.9194573901155544, + "grad_norm": 0.48803550004959106, + "learning_rate": 3.183546177000607e-07, + "loss": 0.2823, + "step": 41250 + }, + { + "epoch": 0.9195688394961745, + "grad_norm": 0.2997181713581085, + "learning_rate": 3.1747877993729916e-07, + "loss": 0.2793, + "step": 41255 + }, + { + "epoch": 0.9196802888767945, + "grad_norm": 0.6676647067070007, + "learning_rate": 3.1660412917690284e-07, + "loss": 0.1896, + "step": 41260 + }, + { + "epoch": 0.9197917382574147, + "grad_norm": 0.770282506942749, + "learning_rate": 3.157306655260961e-07, + "loss": 0.3209, + "step": 41265 + }, + { + "epoch": 0.9199031876380347, + "grad_norm": 0.8639677166938782, + "learning_rate": 3.1485838909195875e-07, + "loss": 0.2109, + "step": 41270 + }, + { + "epoch": 0.9200146370186548, + "grad_norm": 0.4482399523258209, + "learning_rate": 3.139872999814253e-07, + "loss": 0.1985, + "step": 41275 + }, + { + "epoch": 0.9201260863992748, + "grad_norm": 0.8507997393608093, + "learning_rate": 3.131173983012847e-07, + "loss": 0.2521, + "step": 41280 + }, + { + "epoch": 0.9202375357798949, + "grad_norm": 0.6366497278213501, + "learning_rate": 3.122486841581773e-07, + "loss": 0.3673, + "step": 41285 + }, + { + "epoch": 0.920348985160515, + "grad_norm": 0.6055535674095154, + "learning_rate": 3.1138115765860564e-07, + "loss": 0.2085, + "step": 41290 + }, + { + "epoch": 0.920460434541135, + "grad_norm": 0.547455906867981, + "learning_rate": 3.1051481890891797e-07, + "loss": 0.2279, + "step": 41295 + }, + { + "epoch": 0.9205718839217552, + "grad_norm": 0.48422321677207947, + "learning_rate": 3.096496680153238e-07, + "loss": 0.2381, + "step": 41300 + }, + { + "epoch": 0.9206833333023752, + "grad_norm": 0.9877715110778809, + "learning_rate": 3.087857050838816e-07, + "loss": 0.3086, + "step": 41305 + }, + { + "epoch": 0.9207947826829952, + "grad_norm": 0.7008081674575806, + "learning_rate": 3.079229302205078e-07, + "loss": 0.1457, + "step": 41310 + }, + { + "epoch": 0.9209062320636153, + "grad_norm": 0.7891531586647034, + "learning_rate": 3.0706134353097107e-07, + "loss": 0.2196, + "step": 41315 + }, + { + "epoch": 0.9210176814442353, + "grad_norm": 0.44549623131752014, + "learning_rate": 3.062009451208936e-07, + "loss": 0.1889, + "step": 41320 + }, + { + "epoch": 0.9211291308248555, + "grad_norm": 0.8684026002883911, + "learning_rate": 3.053417350957577e-07, + "loss": 0.332, + "step": 41325 + }, + { + "epoch": 0.9212405802054755, + "grad_norm": 0.7550075650215149, + "learning_rate": 3.0448371356089247e-07, + "loss": 0.307, + "step": 41330 + }, + { + "epoch": 0.9213520295860955, + "grad_norm": 0.7287044525146484, + "learning_rate": 3.036268806214882e-07, + "loss": 0.2904, + "step": 41335 + }, + { + "epoch": 0.9214634789667157, + "grad_norm": 0.5059028267860413, + "learning_rate": 3.02771236382583e-07, + "loss": 0.2386, + "step": 41340 + }, + { + "epoch": 0.9215749283473357, + "grad_norm": 0.7069331407546997, + "learning_rate": 3.0191678094907306e-07, + "loss": 0.3352, + "step": 41345 + }, + { + "epoch": 0.9216863777279558, + "grad_norm": 0.6661747694015503, + "learning_rate": 3.010635144257068e-07, + "loss": 0.3008, + "step": 41350 + }, + { + "epoch": 0.9217978271085758, + "grad_norm": 0.9574068784713745, + "learning_rate": 3.0021143691709055e-07, + "loss": 0.3011, + "step": 41355 + }, + { + "epoch": 0.921909276489196, + "grad_norm": 0.45941346883773804, + "learning_rate": 2.993605485276807e-07, + "loss": 0.3542, + "step": 41360 + }, + { + "epoch": 0.922020725869816, + "grad_norm": 0.7826938033103943, + "learning_rate": 2.9851084936179054e-07, + "loss": 0.3547, + "step": 41365 + }, + { + "epoch": 0.922132175250436, + "grad_norm": 0.5727695226669312, + "learning_rate": 2.976623395235867e-07, + "loss": 0.288, + "step": 41370 + }, + { + "epoch": 0.9222436246310561, + "grad_norm": 0.622995138168335, + "learning_rate": 2.968150191170882e-07, + "loss": 0.3068, + "step": 41375 + }, + { + "epoch": 0.9223550740116762, + "grad_norm": 0.5591624975204468, + "learning_rate": 2.95968888246172e-07, + "loss": 0.2044, + "step": 41380 + }, + { + "epoch": 0.9224665233922963, + "grad_norm": 0.40898847579956055, + "learning_rate": 2.9512394701456614e-07, + "loss": 0.257, + "step": 41385 + }, + { + "epoch": 0.9225779727729163, + "grad_norm": 0.690396249294281, + "learning_rate": 2.942801955258556e-07, + "loss": 0.3743, + "step": 41390 + }, + { + "epoch": 0.9226894221535363, + "grad_norm": 0.7114289999008179, + "learning_rate": 2.934376338834755e-07, + "loss": 0.2297, + "step": 41395 + }, + { + "epoch": 0.9228008715341565, + "grad_norm": 0.8272179365158081, + "learning_rate": 2.9259626219071747e-07, + "loss": 0.2813, + "step": 41400 + }, + { + "epoch": 0.9229123209147765, + "grad_norm": 0.4953787922859192, + "learning_rate": 2.917560805507291e-07, + "loss": 0.216, + "step": 41405 + }, + { + "epoch": 0.9230237702953966, + "grad_norm": 0.40638652443885803, + "learning_rate": 2.9091708906650806e-07, + "loss": 0.218, + "step": 41410 + }, + { + "epoch": 0.9231352196760166, + "grad_norm": 0.6107771992683411, + "learning_rate": 2.900792878409109e-07, + "loss": 0.3346, + "step": 41415 + }, + { + "epoch": 0.9232466690566368, + "grad_norm": 0.4116399884223938, + "learning_rate": 2.892426769766432e-07, + "loss": 0.2733, + "step": 41420 + }, + { + "epoch": 0.9233581184372568, + "grad_norm": 0.7331038117408752, + "learning_rate": 2.8840725657626746e-07, + "loss": 0.3631, + "step": 41425 + }, + { + "epoch": 0.9234695678178768, + "grad_norm": 0.44384974241256714, + "learning_rate": 2.8757302674220054e-07, + "loss": 0.259, + "step": 41430 + }, + { + "epoch": 0.923581017198497, + "grad_norm": 0.8607022762298584, + "learning_rate": 2.8673998757671185e-07, + "loss": 0.2981, + "step": 41435 + }, + { + "epoch": 0.923692466579117, + "grad_norm": 0.5234905481338501, + "learning_rate": 2.859081391819263e-07, + "loss": 0.1037, + "step": 41440 + }, + { + "epoch": 0.9238039159597371, + "grad_norm": 0.7386176586151123, + "learning_rate": 2.8507748165982006e-07, + "loss": 0.3014, + "step": 41445 + }, + { + "epoch": 0.9239153653403571, + "grad_norm": 0.5484693050384521, + "learning_rate": 2.8424801511222844e-07, + "loss": 0.2253, + "step": 41450 + }, + { + "epoch": 0.9240268147209771, + "grad_norm": 0.39193445444107056, + "learning_rate": 2.8341973964083557e-07, + "loss": 0.2594, + "step": 41455 + }, + { + "epoch": 0.9241382641015973, + "grad_norm": 0.7376248240470886, + "learning_rate": 2.825926553471825e-07, + "loss": 0.3243, + "step": 41460 + }, + { + "epoch": 0.9242497134822173, + "grad_norm": 0.46132493019104004, + "learning_rate": 2.817667623326625e-07, + "loss": 0.2577, + "step": 41465 + }, + { + "epoch": 0.9243611628628374, + "grad_norm": 0.5108756422996521, + "learning_rate": 2.809420606985236e-07, + "loss": 0.2368, + "step": 41470 + }, + { + "epoch": 0.9244726122434574, + "grad_norm": 0.814017653465271, + "learning_rate": 2.8011855054586925e-07, + "loss": 0.3394, + "step": 41475 + }, + { + "epoch": 0.9245840616240775, + "grad_norm": 0.2798021137714386, + "learning_rate": 2.7929623197565427e-07, + "loss": 0.13, + "step": 41480 + }, + { + "epoch": 0.9246955110046976, + "grad_norm": 0.6604013442993164, + "learning_rate": 2.7847510508869025e-07, + "loss": 0.2438, + "step": 41485 + }, + { + "epoch": 0.9248069603853176, + "grad_norm": 0.47539040446281433, + "learning_rate": 2.776551699856389e-07, + "loss": 0.1872, + "step": 41490 + }, + { + "epoch": 0.9249184097659378, + "grad_norm": 1.0226033926010132, + "learning_rate": 2.768364267670187e-07, + "loss": 0.3309, + "step": 41495 + }, + { + "epoch": 0.9250298591465578, + "grad_norm": 0.5431970357894897, + "learning_rate": 2.760188755332005e-07, + "loss": 0.2802, + "step": 41500 + }, + { + "epoch": 0.9251413085271779, + "grad_norm": 0.5110717415809631, + "learning_rate": 2.7520251638440965e-07, + "loss": 0.3091, + "step": 41505 + }, + { + "epoch": 0.9252527579077979, + "grad_norm": 1.4647572040557861, + "learning_rate": 2.7438734942072833e-07, + "loss": 0.3535, + "step": 41510 + }, + { + "epoch": 0.925364207288418, + "grad_norm": 0.5415831804275513, + "learning_rate": 2.7357337474208767e-07, + "loss": 0.2229, + "step": 41515 + }, + { + "epoch": 0.9254756566690381, + "grad_norm": 0.5373227596282959, + "learning_rate": 2.7276059244827455e-07, + "loss": 0.2529, + "step": 41520 + }, + { + "epoch": 0.9255871060496581, + "grad_norm": 0.6123536825180054, + "learning_rate": 2.7194900263893043e-07, + "loss": 0.372, + "step": 41525 + }, + { + "epoch": 0.9256985554302782, + "grad_norm": 0.8674310445785522, + "learning_rate": 2.7113860541354896e-07, + "loss": 0.1822, + "step": 41530 + }, + { + "epoch": 0.9258100048108983, + "grad_norm": 0.716400146484375, + "learning_rate": 2.7032940087147854e-07, + "loss": 0.2701, + "step": 41535 + }, + { + "epoch": 0.9259214541915183, + "grad_norm": 1.3148468732833862, + "learning_rate": 2.6952138911192196e-07, + "loss": 0.2786, + "step": 41540 + }, + { + "epoch": 0.9260329035721384, + "grad_norm": 0.747526228427887, + "learning_rate": 2.6871457023393667e-07, + "loss": 0.3921, + "step": 41545 + }, + { + "epoch": 0.9261443529527584, + "grad_norm": 0.37125661969184875, + "learning_rate": 2.679089443364313e-07, + "loss": 0.1607, + "step": 41550 + }, + { + "epoch": 0.9262558023333786, + "grad_norm": 0.41526857018470764, + "learning_rate": 2.6710451151816807e-07, + "loss": 0.3007, + "step": 41555 + }, + { + "epoch": 0.9263672517139986, + "grad_norm": 0.7944679260253906, + "learning_rate": 2.663012718777658e-07, + "loss": 0.2242, + "step": 41560 + }, + { + "epoch": 0.9264787010946187, + "grad_norm": 0.7380418181419373, + "learning_rate": 2.6549922551369455e-07, + "loss": 0.3003, + "step": 41565 + }, + { + "epoch": 0.9265901504752387, + "grad_norm": 0.5211284756660461, + "learning_rate": 2.646983725242802e-07, + "loss": 0.3025, + "step": 41570 + }, + { + "epoch": 0.9267015998558588, + "grad_norm": 0.5751848220825195, + "learning_rate": 2.6389871300769865e-07, + "loss": 0.1344, + "step": 41575 + }, + { + "epoch": 0.9268130492364789, + "grad_norm": 0.5620242357254028, + "learning_rate": 2.631002470619848e-07, + "loss": 0.1888, + "step": 41580 + }, + { + "epoch": 0.9269244986170989, + "grad_norm": 0.8619793653488159, + "learning_rate": 2.623029747850236e-07, + "loss": 0.3848, + "step": 41585 + }, + { + "epoch": 0.927035947997719, + "grad_norm": 0.6856527924537659, + "learning_rate": 2.6150689627455486e-07, + "loss": 0.3158, + "step": 41590 + }, + { + "epoch": 0.9271473973783391, + "grad_norm": 0.48377129435539246, + "learning_rate": 2.6071201162816916e-07, + "loss": 0.2322, + "step": 41595 + }, + { + "epoch": 0.9272588467589591, + "grad_norm": 0.6122012734413147, + "learning_rate": 2.5991832094331646e-07, + "loss": 0.2448, + "step": 41600 + }, + { + "epoch": 0.9273702961395792, + "grad_norm": 0.8147674798965454, + "learning_rate": 2.591258243172956e-07, + "loss": 0.2793, + "step": 41605 + }, + { + "epoch": 0.9274817455201992, + "grad_norm": 0.7093601822853088, + "learning_rate": 2.5833452184725995e-07, + "loss": 0.3627, + "step": 41610 + }, + { + "epoch": 0.9275931949008194, + "grad_norm": 0.39448460936546326, + "learning_rate": 2.5754441363021854e-07, + "loss": 0.2745, + "step": 41615 + }, + { + "epoch": 0.9277046442814394, + "grad_norm": 0.6350008249282837, + "learning_rate": 2.567554997630317e-07, + "loss": 0.2382, + "step": 41620 + }, + { + "epoch": 0.9278160936620595, + "grad_norm": 0.7153553366661072, + "learning_rate": 2.559677803424143e-07, + "loss": 0.2696, + "step": 41625 + }, + { + "epoch": 0.9279275430426795, + "grad_norm": 0.5094320774078369, + "learning_rate": 2.5518125546493356e-07, + "loss": 0.256, + "step": 41630 + }, + { + "epoch": 0.9280389924232996, + "grad_norm": 0.6403762698173523, + "learning_rate": 2.543959252270134e-07, + "loss": 0.3522, + "step": 41635 + }, + { + "epoch": 0.9281504418039197, + "grad_norm": 0.4988352656364441, + "learning_rate": 2.5361178972492906e-07, + "loss": 0.3591, + "step": 41640 + }, + { + "epoch": 0.9282618911845397, + "grad_norm": 0.8148736953735352, + "learning_rate": 2.52828849054807e-07, + "loss": 0.2603, + "step": 41645 + }, + { + "epoch": 0.9283733405651599, + "grad_norm": 0.6698065400123596, + "learning_rate": 2.520471033126326e-07, + "loss": 0.3158, + "step": 41650 + }, + { + "epoch": 0.9284847899457799, + "grad_norm": 0.45521312952041626, + "learning_rate": 2.512665525942404e-07, + "loss": 0.2776, + "step": 41655 + }, + { + "epoch": 0.9285962393263999, + "grad_norm": 0.6049147248268127, + "learning_rate": 2.504871969953204e-07, + "loss": 0.2781, + "step": 41660 + }, + { + "epoch": 0.92870768870702, + "grad_norm": 0.6182847619056702, + "learning_rate": 2.497090366114152e-07, + "loss": 0.2291, + "step": 41665 + }, + { + "epoch": 0.92881913808764, + "grad_norm": 0.42223942279815674, + "learning_rate": 2.4893207153792176e-07, + "loss": 0.3403, + "step": 41670 + }, + { + "epoch": 0.9289305874682602, + "grad_norm": 0.5356005430221558, + "learning_rate": 2.4815630187008944e-07, + "loss": 0.252, + "step": 41675 + }, + { + "epoch": 0.9290420368488802, + "grad_norm": 0.6396979689598083, + "learning_rate": 2.4738172770302104e-07, + "loss": 0.278, + "step": 41680 + }, + { + "epoch": 0.9291534862295002, + "grad_norm": 0.5384474396705627, + "learning_rate": 2.466083491316751e-07, + "loss": 0.3089, + "step": 41685 + }, + { + "epoch": 0.9292649356101204, + "grad_norm": 0.5913751125335693, + "learning_rate": 2.45836166250859e-07, + "loss": 0.3418, + "step": 41690 + }, + { + "epoch": 0.9293763849907404, + "grad_norm": 0.7718113660812378, + "learning_rate": 2.4506517915524054e-07, + "loss": 0.2465, + "step": 41695 + }, + { + "epoch": 0.9294878343713605, + "grad_norm": 0.7065486907958984, + "learning_rate": 2.4429538793933506e-07, + "loss": 0.3416, + "step": 41700 + }, + { + "epoch": 0.9295992837519805, + "grad_norm": 0.785386860370636, + "learning_rate": 2.4352679269751154e-07, + "loss": 0.4405, + "step": 41705 + }, + { + "epoch": 0.9297107331326007, + "grad_norm": 0.5587266087532043, + "learning_rate": 2.427593935239947e-07, + "loss": 0.2807, + "step": 41710 + }, + { + "epoch": 0.9298221825132207, + "grad_norm": 1.2147592306137085, + "learning_rate": 2.419931905128614e-07, + "loss": 0.211, + "step": 41715 + }, + { + "epoch": 0.9299336318938407, + "grad_norm": 0.5206283330917358, + "learning_rate": 2.4122818375804215e-07, + "loss": 0.2633, + "step": 41720 + }, + { + "epoch": 0.9300450812744608, + "grad_norm": 0.5563860535621643, + "learning_rate": 2.4046437335332296e-07, + "loss": 0.2558, + "step": 41725 + }, + { + "epoch": 0.9301565306550809, + "grad_norm": 0.5390650629997253, + "learning_rate": 2.39701759392339e-07, + "loss": 0.287, + "step": 41730 + }, + { + "epoch": 0.930267980035701, + "grad_norm": 0.47119244933128357, + "learning_rate": 2.389403419685821e-07, + "loss": 0.3024, + "step": 41735 + }, + { + "epoch": 0.930379429416321, + "grad_norm": 0.6295462846755981, + "learning_rate": 2.3818012117539535e-07, + "loss": 0.2199, + "step": 41740 + }, + { + "epoch": 0.930490878796941, + "grad_norm": 0.7683353424072266, + "learning_rate": 2.374210971059754e-07, + "loss": 0.2737, + "step": 41745 + }, + { + "epoch": 0.9306023281775612, + "grad_norm": 0.7312949299812317, + "learning_rate": 2.3666326985337328e-07, + "loss": 0.2167, + "step": 41750 + }, + { + "epoch": 0.9307137775581812, + "grad_norm": 0.48791399598121643, + "learning_rate": 2.3590663951049141e-07, + "loss": 0.2981, + "step": 41755 + }, + { + "epoch": 0.9308252269388013, + "grad_norm": 0.5298525094985962, + "learning_rate": 2.3515120617009112e-07, + "loss": 0.2264, + "step": 41760 + }, + { + "epoch": 0.9309366763194213, + "grad_norm": 0.5891234874725342, + "learning_rate": 2.3439696992477834e-07, + "loss": 0.39, + "step": 41765 + }, + { + "epoch": 0.9310481257000415, + "grad_norm": 0.5690597891807556, + "learning_rate": 2.336439308670191e-07, + "loss": 0.3731, + "step": 41770 + }, + { + "epoch": 0.9311595750806615, + "grad_norm": 0.5743623971939087, + "learning_rate": 2.3289208908912952e-07, + "loss": 0.2081, + "step": 41775 + }, + { + "epoch": 0.9312710244612815, + "grad_norm": 0.7222797274589539, + "learning_rate": 2.3214144468327703e-07, + "loss": 0.2236, + "step": 41780 + }, + { + "epoch": 0.9313824738419016, + "grad_norm": 0.739750862121582, + "learning_rate": 2.3139199774148912e-07, + "loss": 0.3976, + "step": 41785 + }, + { + "epoch": 0.9314939232225217, + "grad_norm": 0.612297534942627, + "learning_rate": 2.3064374835563896e-07, + "loss": 0.3034, + "step": 41790 + }, + { + "epoch": 0.9316053726031418, + "grad_norm": 0.5315260887145996, + "learning_rate": 2.2989669661745872e-07, + "loss": 0.2088, + "step": 41795 + }, + { + "epoch": 0.9317168219837618, + "grad_norm": 0.6662878394126892, + "learning_rate": 2.291508426185296e-07, + "loss": 0.3021, + "step": 41800 + }, + { + "epoch": 0.9318282713643818, + "grad_norm": 0.973757803440094, + "learning_rate": 2.2840618645028735e-07, + "loss": 0.2573, + "step": 41805 + }, + { + "epoch": 0.931939720745002, + "grad_norm": 0.5593680143356323, + "learning_rate": 2.2766272820402113e-07, + "loss": 0.1697, + "step": 41810 + }, + { + "epoch": 0.932051170125622, + "grad_norm": 0.7627435922622681, + "learning_rate": 2.2692046797087475e-07, + "loss": 0.352, + "step": 41815 + }, + { + "epoch": 0.9321626195062421, + "grad_norm": 0.9235901832580566, + "learning_rate": 2.2617940584184205e-07, + "loss": 0.2901, + "step": 41820 + }, + { + "epoch": 0.9322740688868622, + "grad_norm": 0.9725611805915833, + "learning_rate": 2.2543954190777039e-07, + "loss": 0.3149, + "step": 41825 + }, + { + "epoch": 0.9323855182674823, + "grad_norm": 0.6089190244674683, + "learning_rate": 2.2470087625936498e-07, + "loss": 0.3041, + "step": 41830 + }, + { + "epoch": 0.9324969676481023, + "grad_norm": 0.4506814479827881, + "learning_rate": 2.2396340898717783e-07, + "loss": 0.3103, + "step": 41835 + }, + { + "epoch": 0.9326084170287223, + "grad_norm": 0.5334108471870422, + "learning_rate": 2.2322714018161662e-07, + "loss": 0.1976, + "step": 41840 + }, + { + "epoch": 0.9327198664093425, + "grad_norm": 0.7881041169166565, + "learning_rate": 2.2249206993294249e-07, + "loss": 0.3227, + "step": 41845 + }, + { + "epoch": 0.9328313157899625, + "grad_norm": 0.711259126663208, + "learning_rate": 2.2175819833127111e-07, + "loss": 0.2914, + "step": 41850 + }, + { + "epoch": 0.9329427651705826, + "grad_norm": 0.6935058236122131, + "learning_rate": 2.210255254665683e-07, + "loss": 0.3051, + "step": 41855 + }, + { + "epoch": 0.9330542145512026, + "grad_norm": 0.583777129650116, + "learning_rate": 2.2029405142865225e-07, + "loss": 0.3541, + "step": 41860 + }, + { + "epoch": 0.9331656639318227, + "grad_norm": 0.5853663086891174, + "learning_rate": 2.1956377630719895e-07, + "loss": 0.3056, + "step": 41865 + }, + { + "epoch": 0.9332771133124428, + "grad_norm": 0.7453263998031616, + "learning_rate": 2.1883470019173346e-07, + "loss": 0.2945, + "step": 41870 + }, + { + "epoch": 0.9333885626930628, + "grad_norm": 0.8286758661270142, + "learning_rate": 2.1810682317163323e-07, + "loss": 0.2106, + "step": 41875 + }, + { + "epoch": 0.9335000120736829, + "grad_norm": 0.5287289023399353, + "learning_rate": 2.1738014533613349e-07, + "loss": 0.1769, + "step": 41880 + }, + { + "epoch": 0.933611461454303, + "grad_norm": 0.5840474963188171, + "learning_rate": 2.1665466677431745e-07, + "loss": 0.3766, + "step": 41885 + }, + { + "epoch": 0.933722910834923, + "grad_norm": 0.9760193228721619, + "learning_rate": 2.1593038757512397e-07, + "loss": 0.2558, + "step": 41890 + }, + { + "epoch": 0.9338343602155431, + "grad_norm": 0.5497422814369202, + "learning_rate": 2.15207307827342e-07, + "loss": 0.313, + "step": 41895 + }, + { + "epoch": 0.9339458095961631, + "grad_norm": 0.6463967561721802, + "learning_rate": 2.1448542761961844e-07, + "loss": 0.2442, + "step": 41900 + }, + { + "epoch": 0.9340572589767833, + "grad_norm": 0.6637454628944397, + "learning_rate": 2.1376474704044693e-07, + "loss": 0.2445, + "step": 41905 + }, + { + "epoch": 0.9341687083574033, + "grad_norm": 0.2920180857181549, + "learning_rate": 2.1304526617818121e-07, + "loss": 0.2486, + "step": 41910 + }, + { + "epoch": 0.9342801577380234, + "grad_norm": 0.7941288948059082, + "learning_rate": 2.123269851210219e-07, + "loss": 0.3052, + "step": 41915 + }, + { + "epoch": 0.9343916071186434, + "grad_norm": 0.6346423625946045, + "learning_rate": 2.116099039570252e-07, + "loss": 0.241, + "step": 41920 + }, + { + "epoch": 0.9345030564992635, + "grad_norm": 0.6398741602897644, + "learning_rate": 2.1089402277409855e-07, + "loss": 0.1914, + "step": 41925 + }, + { + "epoch": 0.9346145058798836, + "grad_norm": 0.9809888005256653, + "learning_rate": 2.101793416600051e-07, + "loss": 0.1846, + "step": 41930 + }, + { + "epoch": 0.9347259552605036, + "grad_norm": 0.5712922811508179, + "learning_rate": 2.0946586070235808e-07, + "loss": 0.3663, + "step": 41935 + }, + { + "epoch": 0.9348374046411237, + "grad_norm": 0.7750124335289001, + "learning_rate": 2.0875357998862422e-07, + "loss": 0.2467, + "step": 41940 + }, + { + "epoch": 0.9349488540217438, + "grad_norm": 0.4220404624938965, + "learning_rate": 2.08042499606127e-07, + "loss": 0.1812, + "step": 41945 + }, + { + "epoch": 0.9350603034023638, + "grad_norm": 0.4303109347820282, + "learning_rate": 2.0733261964203556e-07, + "loss": 0.2075, + "step": 41950 + }, + { + "epoch": 0.9351717527829839, + "grad_norm": 0.8041217923164368, + "learning_rate": 2.0662394018337806e-07, + "loss": 0.3333, + "step": 41955 + }, + { + "epoch": 0.9352832021636039, + "grad_norm": 0.5130581855773926, + "learning_rate": 2.0591646131703168e-07, + "loss": 0.2355, + "step": 41960 + }, + { + "epoch": 0.9353946515442241, + "grad_norm": 0.5752612352371216, + "learning_rate": 2.0521018312972818e-07, + "loss": 0.3229, + "step": 41965 + }, + { + "epoch": 0.9355061009248441, + "grad_norm": 0.5263034701347351, + "learning_rate": 2.045051057080516e-07, + "loss": 0.3224, + "step": 41970 + }, + { + "epoch": 0.9356175503054642, + "grad_norm": 0.40126967430114746, + "learning_rate": 2.0380122913843946e-07, + "loss": 0.2269, + "step": 41975 + }, + { + "epoch": 0.9357289996860843, + "grad_norm": 0.5782945156097412, + "learning_rate": 2.0309855350718277e-07, + "loss": 0.3501, + "step": 41980 + }, + { + "epoch": 0.9358404490667043, + "grad_norm": 0.6336562633514404, + "learning_rate": 2.023970789004226e-07, + "loss": 0.2832, + "step": 41985 + }, + { + "epoch": 0.9359518984473244, + "grad_norm": 0.6401287913322449, + "learning_rate": 2.016968054041546e-07, + "loss": 0.3738, + "step": 41990 + }, + { + "epoch": 0.9360633478279444, + "grad_norm": 0.7216722369194031, + "learning_rate": 2.0099773310422676e-07, + "loss": 0.3418, + "step": 41995 + }, + { + "epoch": 0.9361747972085646, + "grad_norm": 0.7964975833892822, + "learning_rate": 2.0029986208633943e-07, + "loss": 0.3576, + "step": 42000 + }, + { + "epoch": 0.9362862465891846, + "grad_norm": 0.40365028381347656, + "learning_rate": 1.9960319243604753e-07, + "loss": 0.3055, + "step": 42005 + }, + { + "epoch": 0.9363976959698046, + "grad_norm": 1.2185330390930176, + "learning_rate": 1.9890772423875715e-07, + "loss": 0.3307, + "step": 42010 + }, + { + "epoch": 0.9365091453504247, + "grad_norm": 0.4133281707763672, + "learning_rate": 1.9821345757972787e-07, + "loss": 0.3099, + "step": 42015 + }, + { + "epoch": 0.9366205947310448, + "grad_norm": 0.5761286020278931, + "learning_rate": 1.975203925440694e-07, + "loss": 0.2464, + "step": 42020 + }, + { + "epoch": 0.9367320441116649, + "grad_norm": 0.4297039806842804, + "learning_rate": 1.96828529216746e-07, + "loss": 0.2333, + "step": 42025 + }, + { + "epoch": 0.9368434934922849, + "grad_norm": 0.4553482234477997, + "learning_rate": 1.9613786768257758e-07, + "loss": 0.3504, + "step": 42030 + }, + { + "epoch": 0.936954942872905, + "grad_norm": 0.6616497039794922, + "learning_rate": 1.95448408026232e-07, + "loss": 0.2023, + "step": 42035 + }, + { + "epoch": 0.9370663922535251, + "grad_norm": 0.4561408758163452, + "learning_rate": 1.9476015033223273e-07, + "loss": 0.2401, + "step": 42040 + }, + { + "epoch": 0.9371778416341451, + "grad_norm": 0.9392362236976624, + "learning_rate": 1.940730946849534e-07, + "loss": 0.2696, + "step": 42045 + }, + { + "epoch": 0.9372892910147652, + "grad_norm": 0.475864976644516, + "learning_rate": 1.9338724116862328e-07, + "loss": 0.2356, + "step": 42050 + }, + { + "epoch": 0.9374007403953852, + "grad_norm": 0.5198137760162354, + "learning_rate": 1.9270258986732181e-07, + "loss": 0.2693, + "step": 42055 + }, + { + "epoch": 0.9375121897760054, + "grad_norm": 0.5708441734313965, + "learning_rate": 1.9201914086498075e-07, + "loss": 0.2114, + "step": 42060 + }, + { + "epoch": 0.9376236391566254, + "grad_norm": 0.585024893283844, + "learning_rate": 1.913368942453886e-07, + "loss": 0.3733, + "step": 42065 + }, + { + "epoch": 0.9377350885372454, + "grad_norm": 0.4550107419490814, + "learning_rate": 1.9065585009218069e-07, + "loss": 0.4031, + "step": 42070 + }, + { + "epoch": 0.9378465379178655, + "grad_norm": 0.6647144556045532, + "learning_rate": 1.8997600848885023e-07, + "loss": 0.1745, + "step": 42075 + }, + { + "epoch": 0.9379579872984856, + "grad_norm": 0.4726318120956421, + "learning_rate": 1.8929736951873946e-07, + "loss": 0.2502, + "step": 42080 + }, + { + "epoch": 0.9380694366791057, + "grad_norm": 0.517724871635437, + "learning_rate": 1.886199332650429e-07, + "loss": 0.2343, + "step": 42085 + }, + { + "epoch": 0.9381808860597257, + "grad_norm": 0.577621579170227, + "learning_rate": 1.8794369981081085e-07, + "loss": 0.268, + "step": 42090 + }, + { + "epoch": 0.9382923354403457, + "grad_norm": 0.811469316482544, + "learning_rate": 1.872686692389436e-07, + "loss": 0.2637, + "step": 42095 + }, + { + "epoch": 0.9384037848209659, + "grad_norm": 0.29971227049827576, + "learning_rate": 1.865948416321961e-07, + "loss": 0.1793, + "step": 42100 + }, + { + "epoch": 0.9385152342015859, + "grad_norm": 0.6150528192520142, + "learning_rate": 1.8592221707317116e-07, + "loss": 0.3394, + "step": 42105 + }, + { + "epoch": 0.938626683582206, + "grad_norm": 0.5606695413589478, + "learning_rate": 1.8525079564433057e-07, + "loss": 0.2622, + "step": 42110 + }, + { + "epoch": 0.938738132962826, + "grad_norm": 0.8158010840415955, + "learning_rate": 1.8458057742798407e-07, + "loss": 0.3565, + "step": 42115 + }, + { + "epoch": 0.9388495823434462, + "grad_norm": 0.6361294984817505, + "learning_rate": 1.8391156250629482e-07, + "loss": 0.2246, + "step": 42120 + }, + { + "epoch": 0.9389610317240662, + "grad_norm": 1.0060545206069946, + "learning_rate": 1.8324375096128056e-07, + "loss": 0.348, + "step": 42125 + }, + { + "epoch": 0.9390724811046862, + "grad_norm": 0.9523029327392578, + "learning_rate": 1.82577142874808e-07, + "loss": 0.301, + "step": 42130 + }, + { + "epoch": 0.9391839304853064, + "grad_norm": 0.4871150553226471, + "learning_rate": 1.8191173832859955e-07, + "loss": 0.267, + "step": 42135 + }, + { + "epoch": 0.9392953798659264, + "grad_norm": 0.499967098236084, + "learning_rate": 1.8124753740422662e-07, + "loss": 0.2269, + "step": 42140 + }, + { + "epoch": 0.9394068292465465, + "grad_norm": 0.7111514210700989, + "learning_rate": 1.8058454018311743e-07, + "loss": 0.257, + "step": 42145 + }, + { + "epoch": 0.9395182786271665, + "grad_norm": 0.8823172450065613, + "learning_rate": 1.7992274674654918e-07, + "loss": 0.306, + "step": 42150 + }, + { + "epoch": 0.9396297280077865, + "grad_norm": 0.3821994662284851, + "learning_rate": 1.7926215717565255e-07, + "loss": 0.256, + "step": 42155 + }, + { + "epoch": 0.9397411773884067, + "grad_norm": 0.683880090713501, + "learning_rate": 1.7860277155141158e-07, + "loss": 0.3176, + "step": 42160 + }, + { + "epoch": 0.9398526267690267, + "grad_norm": 0.7522774338722229, + "learning_rate": 1.7794458995466057e-07, + "loss": 0.3349, + "step": 42165 + }, + { + "epoch": 0.9399640761496468, + "grad_norm": 0.8497912883758545, + "learning_rate": 1.772876124660894e-07, + "loss": 0.3679, + "step": 42170 + }, + { + "epoch": 0.9400755255302669, + "grad_norm": 1.0213016271591187, + "learning_rate": 1.7663183916623584e-07, + "loss": 0.2661, + "step": 42175 + }, + { + "epoch": 0.940186974910887, + "grad_norm": 1.0421154499053955, + "learning_rate": 1.7597727013549448e-07, + "loss": 0.4221, + "step": 42180 + }, + { + "epoch": 0.940298424291507, + "grad_norm": 0.6340685486793518, + "learning_rate": 1.7532390545410894e-07, + "loss": 0.261, + "step": 42185 + }, + { + "epoch": 0.940409873672127, + "grad_norm": 0.6523642539978027, + "learning_rate": 1.7467174520217956e-07, + "loss": 0.2458, + "step": 42190 + }, + { + "epoch": 0.9405213230527472, + "grad_norm": 0.7212027311325073, + "learning_rate": 1.7402078945965352e-07, + "loss": 0.2504, + "step": 42195 + }, + { + "epoch": 0.9406327724333672, + "grad_norm": 0.3738698959350586, + "learning_rate": 1.7337103830633474e-07, + "loss": 0.2207, + "step": 42200 + }, + { + "epoch": 0.9407442218139873, + "grad_norm": 1.0090970993041992, + "learning_rate": 1.7272249182187616e-07, + "loss": 0.3324, + "step": 42205 + }, + { + "epoch": 0.9408556711946073, + "grad_norm": 0.5669004917144775, + "learning_rate": 1.7207515008578424e-07, + "loss": 0.3247, + "step": 42210 + }, + { + "epoch": 0.9409671205752274, + "grad_norm": 0.6298465728759766, + "learning_rate": 1.7142901317741877e-07, + "loss": 0.3369, + "step": 42215 + }, + { + "epoch": 0.9410785699558475, + "grad_norm": 0.8187587261199951, + "learning_rate": 1.7078408117599198e-07, + "loss": 0.3211, + "step": 42220 + }, + { + "epoch": 0.9411900193364675, + "grad_norm": 0.49201130867004395, + "learning_rate": 1.701403541605673e-07, + "loss": 0.3096, + "step": 42225 + }, + { + "epoch": 0.9413014687170876, + "grad_norm": 0.7773210406303406, + "learning_rate": 1.6949783221006044e-07, + "loss": 0.2497, + "step": 42230 + }, + { + "epoch": 0.9414129180977077, + "grad_norm": 0.733504056930542, + "learning_rate": 1.6885651540323954e-07, + "loss": 0.1281, + "step": 42235 + }, + { + "epoch": 0.9415243674783277, + "grad_norm": 0.5793965458869934, + "learning_rate": 1.6821640381872395e-07, + "loss": 0.2486, + "step": 42240 + }, + { + "epoch": 0.9416358168589478, + "grad_norm": 0.7282425761222839, + "learning_rate": 1.6757749753498865e-07, + "loss": 0.2126, + "step": 42245 + }, + { + "epoch": 0.9417472662395678, + "grad_norm": 1.0350067615509033, + "learning_rate": 1.6693979663035653e-07, + "loss": 0.3799, + "step": 42250 + }, + { + "epoch": 0.941858715620188, + "grad_norm": 1.0458147525787354, + "learning_rate": 1.663033011830073e-07, + "loss": 0.2125, + "step": 42255 + }, + { + "epoch": 0.941970165000808, + "grad_norm": 0.5265653133392334, + "learning_rate": 1.6566801127096855e-07, + "loss": 0.2299, + "step": 42260 + }, + { + "epoch": 0.9420816143814281, + "grad_norm": 0.7979380488395691, + "learning_rate": 1.6503392697212128e-07, + "loss": 0.3032, + "step": 42265 + }, + { + "epoch": 0.9421930637620481, + "grad_norm": 0.6244803071022034, + "learning_rate": 1.6440104836420112e-07, + "loss": 0.3416, + "step": 42270 + }, + { + "epoch": 0.9423045131426682, + "grad_norm": 0.9354427456855774, + "learning_rate": 1.6376937552479154e-07, + "loss": 0.2255, + "step": 42275 + }, + { + "epoch": 0.9424159625232883, + "grad_norm": 0.37904810905456543, + "learning_rate": 1.6313890853133397e-07, + "loss": 0.2258, + "step": 42280 + }, + { + "epoch": 0.9425274119039083, + "grad_norm": 0.732758104801178, + "learning_rate": 1.6250964746111654e-07, + "loss": 0.1669, + "step": 42285 + }, + { + "epoch": 0.9426388612845285, + "grad_norm": 0.9100874066352844, + "learning_rate": 1.618815923912842e-07, + "loss": 0.4047, + "step": 42290 + }, + { + "epoch": 0.9427503106651485, + "grad_norm": 0.612760603427887, + "learning_rate": 1.6125474339882874e-07, + "loss": 0.3023, + "step": 42295 + }, + { + "epoch": 0.9428617600457685, + "grad_norm": 0.5571433901786804, + "learning_rate": 1.6062910056059867e-07, + "loss": 0.2317, + "step": 42300 + }, + { + "epoch": 0.9429732094263886, + "grad_norm": 0.6878780722618103, + "learning_rate": 1.6000466395329151e-07, + "loss": 0.3452, + "step": 42305 + }, + { + "epoch": 0.9430846588070086, + "grad_norm": 0.7174660563468933, + "learning_rate": 1.5938143365346048e-07, + "loss": 0.2271, + "step": 42310 + }, + { + "epoch": 0.9431961081876288, + "grad_norm": 0.7692664265632629, + "learning_rate": 1.587594097375078e-07, + "loss": 0.2722, + "step": 42315 + }, + { + "epoch": 0.9433075575682488, + "grad_norm": 0.5780901312828064, + "learning_rate": 1.5813859228168805e-07, + "loss": 0.302, + "step": 42320 + }, + { + "epoch": 0.9434190069488689, + "grad_norm": 0.41825351119041443, + "learning_rate": 1.575189813621103e-07, + "loss": 0.2909, + "step": 42325 + }, + { + "epoch": 0.943530456329489, + "grad_norm": 0.40317502617836, + "learning_rate": 1.569005770547327e-07, + "loss": 0.1812, + "step": 42330 + }, + { + "epoch": 0.943641905710109, + "grad_norm": 0.5926758646965027, + "learning_rate": 1.5628337943536686e-07, + "loss": 0.3533, + "step": 42335 + }, + { + "epoch": 0.9437533550907291, + "grad_norm": 0.9634340405464172, + "learning_rate": 1.556673885796778e-07, + "loss": 0.2677, + "step": 42340 + }, + { + "epoch": 0.9438648044713491, + "grad_norm": 0.805266797542572, + "learning_rate": 1.550526045631795e-07, + "loss": 0.2779, + "step": 42345 + }, + { + "epoch": 0.9439762538519693, + "grad_norm": 0.7578127980232239, + "learning_rate": 1.5443902746124172e-07, + "loss": 0.2362, + "step": 42350 + }, + { + "epoch": 0.9440877032325893, + "grad_norm": 0.5255642533302307, + "learning_rate": 1.5382665734908098e-07, + "loss": 0.2249, + "step": 42355 + }, + { + "epoch": 0.9441991526132093, + "grad_norm": 0.9447386860847473, + "learning_rate": 1.5321549430177275e-07, + "loss": 0.4072, + "step": 42360 + }, + { + "epoch": 0.9443106019938294, + "grad_norm": 0.5444424152374268, + "learning_rate": 1.5260553839424042e-07, + "loss": 0.2678, + "step": 42365 + }, + { + "epoch": 0.9444220513744495, + "grad_norm": 0.8604699969291687, + "learning_rate": 1.5199678970125642e-07, + "loss": 0.3507, + "step": 42370 + }, + { + "epoch": 0.9445335007550696, + "grad_norm": 0.6764824986457825, + "learning_rate": 1.5138924829745217e-07, + "loss": 0.3744, + "step": 42375 + }, + { + "epoch": 0.9446449501356896, + "grad_norm": 0.3252200484275818, + "learning_rate": 1.5078291425730694e-07, + "loss": 0.2754, + "step": 42380 + }, + { + "epoch": 0.9447563995163097, + "grad_norm": 0.2970804274082184, + "learning_rate": 1.5017778765515246e-07, + "loss": 0.1979, + "step": 42385 + }, + { + "epoch": 0.9448678488969298, + "grad_norm": 0.7865724563598633, + "learning_rate": 1.495738685651704e-07, + "loss": 0.2971, + "step": 42390 + }, + { + "epoch": 0.9449792982775498, + "grad_norm": 0.7084057927131653, + "learning_rate": 1.4897115706140052e-07, + "loss": 0.2849, + "step": 42395 + }, + { + "epoch": 0.9450907476581699, + "grad_norm": 0.7051497101783752, + "learning_rate": 1.48369653217727e-07, + "loss": 0.2426, + "step": 42400 + }, + { + "epoch": 0.9452021970387899, + "grad_norm": 0.4434252381324768, + "learning_rate": 1.4776935710789196e-07, + "loss": 0.2957, + "step": 42405 + }, + { + "epoch": 0.9453136464194101, + "grad_norm": 0.6347283720970154, + "learning_rate": 1.4717026880548657e-07, + "loss": 0.3472, + "step": 42410 + }, + { + "epoch": 0.9454250958000301, + "grad_norm": 0.5560967922210693, + "learning_rate": 1.4657238838395315e-07, + "loss": 0.3799, + "step": 42415 + }, + { + "epoch": 0.9455365451806501, + "grad_norm": 0.8887553215026855, + "learning_rate": 1.4597571591658865e-07, + "loss": 0.3139, + "step": 42420 + }, + { + "epoch": 0.9456479945612702, + "grad_norm": 0.4831375181674957, + "learning_rate": 1.45380251476539e-07, + "loss": 0.2862, + "step": 42425 + }, + { + "epoch": 0.9457594439418903, + "grad_norm": 0.6553042531013489, + "learning_rate": 1.4478599513680468e-07, + "loss": 0.2052, + "step": 42430 + }, + { + "epoch": 0.9458708933225104, + "grad_norm": 0.7742103934288025, + "learning_rate": 1.4419294697023745e-07, + "loss": 0.3235, + "step": 42435 + }, + { + "epoch": 0.9459823427031304, + "grad_norm": 0.7551333904266357, + "learning_rate": 1.436011070495402e-07, + "loss": 0.2466, + "step": 42440 + }, + { + "epoch": 0.9460937920837504, + "grad_norm": 0.7697866559028625, + "learning_rate": 1.430104754472672e-07, + "loss": 0.334, + "step": 42445 + }, + { + "epoch": 0.9462052414643706, + "grad_norm": 0.4702761769294739, + "learning_rate": 1.4242105223582602e-07, + "loss": 0.223, + "step": 42450 + }, + { + "epoch": 0.9463166908449906, + "grad_norm": 0.5854142904281616, + "learning_rate": 1.4183283748747446e-07, + "loss": 0.2965, + "step": 42455 + }, + { + "epoch": 0.9464281402256107, + "grad_norm": 0.5609765648841858, + "learning_rate": 1.4124583127432257e-07, + "loss": 0.3774, + "step": 42460 + }, + { + "epoch": 0.9465395896062307, + "grad_norm": 0.642464280128479, + "learning_rate": 1.406600336683339e-07, + "loss": 0.3742, + "step": 42465 + }, + { + "epoch": 0.9466510389868509, + "grad_norm": 0.7575817108154297, + "learning_rate": 1.4007544474132438e-07, + "loss": 0.4532, + "step": 42470 + }, + { + "epoch": 0.9467624883674709, + "grad_norm": 0.32896581292152405, + "learning_rate": 1.3949206456495778e-07, + "loss": 0.2458, + "step": 42475 + }, + { + "epoch": 0.9468739377480909, + "grad_norm": 0.5499560236930847, + "learning_rate": 1.3890989321075244e-07, + "loss": 0.3524, + "step": 42480 + }, + { + "epoch": 0.946985387128711, + "grad_norm": 0.643747866153717, + "learning_rate": 1.3832893075007902e-07, + "loss": 0.3066, + "step": 42485 + }, + { + "epoch": 0.9470968365093311, + "grad_norm": 0.8362199068069458, + "learning_rate": 1.3774917725415727e-07, + "loss": 0.3007, + "step": 42490 + }, + { + "epoch": 0.9472082858899512, + "grad_norm": 0.390722393989563, + "learning_rate": 1.371706327940614e-07, + "loss": 0.3432, + "step": 42495 + }, + { + "epoch": 0.9473197352705712, + "grad_norm": 0.3729709982872009, + "learning_rate": 1.365932974407169e-07, + "loss": 0.2158, + "step": 42500 + }, + { + "epoch": 0.9474311846511912, + "grad_norm": 0.661213219165802, + "learning_rate": 1.3601717126490056e-07, + "loss": 0.3354, + "step": 42505 + }, + { + "epoch": 0.9475426340318114, + "grad_norm": 1.1504225730895996, + "learning_rate": 1.3544225433724024e-07, + "loss": 0.1984, + "step": 42510 + }, + { + "epoch": 0.9476540834124314, + "grad_norm": 0.4445840120315552, + "learning_rate": 1.3486854672821848e-07, + "loss": 0.2474, + "step": 42515 + }, + { + "epoch": 0.9477655327930515, + "grad_norm": 0.7373872399330139, + "learning_rate": 1.3429604850816346e-07, + "loss": 0.4028, + "step": 42520 + }, + { + "epoch": 0.9478769821736716, + "grad_norm": 0.5534309148788452, + "learning_rate": 1.337247597472624e-07, + "loss": 0.2287, + "step": 42525 + }, + { + "epoch": 0.9479884315542917, + "grad_norm": 0.997943103313446, + "learning_rate": 1.3315468051554926e-07, + "loss": 0.3051, + "step": 42530 + }, + { + "epoch": 0.9480998809349117, + "grad_norm": 0.5267981290817261, + "learning_rate": 1.3258581088291257e-07, + "loss": 0.3215, + "step": 42535 + }, + { + "epoch": 0.9482113303155317, + "grad_norm": 0.734476625919342, + "learning_rate": 1.32018150919091e-07, + "loss": 0.2919, + "step": 42540 + }, + { + "epoch": 0.9483227796961519, + "grad_norm": 0.4765477478504181, + "learning_rate": 1.3145170069367552e-07, + "loss": 0.2901, + "step": 42545 + }, + { + "epoch": 0.9484342290767719, + "grad_norm": 0.6583042740821838, + "learning_rate": 1.308864602761073e-07, + "loss": 0.3807, + "step": 42550 + }, + { + "epoch": 0.948545678457392, + "grad_norm": 0.9653348326683044, + "learning_rate": 1.3032242973567976e-07, + "loss": 0.2463, + "step": 42555 + }, + { + "epoch": 0.948657127838012, + "grad_norm": 0.6304477453231812, + "learning_rate": 1.2975960914154207e-07, + "loss": 0.317, + "step": 42560 + }, + { + "epoch": 0.9487685772186321, + "grad_norm": 0.8308120965957642, + "learning_rate": 1.291979985626879e-07, + "loss": 0.2333, + "step": 42565 + }, + { + "epoch": 0.9488800265992522, + "grad_norm": 0.8143225908279419, + "learning_rate": 1.2863759806796993e-07, + "loss": 0.1493, + "step": 42570 + }, + { + "epoch": 0.9489914759798722, + "grad_norm": 0.774811327457428, + "learning_rate": 1.280784077260866e-07, + "loss": 0.2704, + "step": 42575 + }, + { + "epoch": 0.9491029253604923, + "grad_norm": 0.20292221009731293, + "learning_rate": 1.275204276055919e-07, + "loss": 0.1911, + "step": 42580 + }, + { + "epoch": 0.9492143747411124, + "grad_norm": 0.8017863035202026, + "learning_rate": 1.2696365777488673e-07, + "loss": 0.2807, + "step": 42585 + }, + { + "epoch": 0.9493258241217325, + "grad_norm": 0.6566117405891418, + "learning_rate": 1.264080983022309e-07, + "loss": 0.2286, + "step": 42590 + }, + { + "epoch": 0.9494372735023525, + "grad_norm": 0.5979494452476501, + "learning_rate": 1.2585374925572991e-07, + "loss": 0.1646, + "step": 42595 + }, + { + "epoch": 0.9495487228829725, + "grad_norm": 0.6958621740341187, + "learning_rate": 1.2530061070334054e-07, + "loss": 0.2779, + "step": 42600 + }, + { + "epoch": 0.9496601722635927, + "grad_norm": 0.6765291094779968, + "learning_rate": 1.2474868271287745e-07, + "loss": 0.3752, + "step": 42605 + }, + { + "epoch": 0.9497716216442127, + "grad_norm": 0.5178284645080566, + "learning_rate": 1.241979653519998e-07, + "loss": 0.3015, + "step": 42610 + }, + { + "epoch": 0.9498830710248328, + "grad_norm": 0.7208675742149353, + "learning_rate": 1.236484586882214e-07, + "loss": 0.3072, + "step": 42615 + }, + { + "epoch": 0.9499945204054528, + "grad_norm": 0.6745007634162903, + "learning_rate": 1.2310016278890946e-07, + "loss": 0.2314, + "step": 42620 + }, + { + "epoch": 0.9501059697860729, + "grad_norm": 0.9178875684738159, + "learning_rate": 1.2255307772127911e-07, + "loss": 0.2668, + "step": 42625 + }, + { + "epoch": 0.950217419166693, + "grad_norm": 0.5638184547424316, + "learning_rate": 1.2200720355239893e-07, + "loss": 0.2664, + "step": 42630 + }, + { + "epoch": 0.950328868547313, + "grad_norm": 0.5028668642044067, + "learning_rate": 1.214625403491887e-07, + "loss": 0.4063, + "step": 42635 + }, + { + "epoch": 0.9504403179279332, + "grad_norm": 0.840446412563324, + "learning_rate": 1.209190881784217e-07, + "loss": 0.2869, + "step": 42640 + }, + { + "epoch": 0.9505517673085532, + "grad_norm": 0.7156557440757751, + "learning_rate": 1.2037684710671905e-07, + "loss": 0.2172, + "step": 42645 + }, + { + "epoch": 0.9506632166891732, + "grad_norm": 1.0785118341445923, + "learning_rate": 1.1983581720055537e-07, + "loss": 0.2311, + "step": 42650 + }, + { + "epoch": 0.9507746660697933, + "grad_norm": 1.065416932106018, + "learning_rate": 1.1929599852625872e-07, + "loss": 0.2657, + "step": 42655 + }, + { + "epoch": 0.9508861154504133, + "grad_norm": 0.7923719882965088, + "learning_rate": 1.1875739115000506e-07, + "loss": 0.3733, + "step": 42660 + }, + { + "epoch": 0.9509975648310335, + "grad_norm": 0.7212884426116943, + "learning_rate": 1.1821999513782268e-07, + "loss": 0.3436, + "step": 42665 + }, + { + "epoch": 0.9511090142116535, + "grad_norm": 0.6976764798164368, + "learning_rate": 1.176838105555933e-07, + "loss": 0.272, + "step": 42670 + }, + { + "epoch": 0.9512204635922736, + "grad_norm": 0.42368894815444946, + "learning_rate": 1.1714883746904992e-07, + "loss": 0.2463, + "step": 42675 + }, + { + "epoch": 0.9513319129728937, + "grad_norm": 0.6565561294555664, + "learning_rate": 1.1661507594377452e-07, + "loss": 0.2854, + "step": 42680 + }, + { + "epoch": 0.9514433623535137, + "grad_norm": 0.4275035560131073, + "learning_rate": 1.1608252604520364e-07, + "loss": 0.2842, + "step": 42685 + }, + { + "epoch": 0.9515548117341338, + "grad_norm": 0.9225032925605774, + "learning_rate": 1.1555118783862174e-07, + "loss": 0.2786, + "step": 42690 + }, + { + "epoch": 0.9516662611147538, + "grad_norm": 0.5609422922134399, + "learning_rate": 1.1502106138916891e-07, + "loss": 0.2586, + "step": 42695 + }, + { + "epoch": 0.951777710495374, + "grad_norm": 0.9946783781051636, + "learning_rate": 1.1449214676183429e-07, + "loss": 0.2039, + "step": 42700 + }, + { + "epoch": 0.951889159875994, + "grad_norm": 0.5673030614852905, + "learning_rate": 1.1396444402145602e-07, + "loss": 0.2096, + "step": 42705 + }, + { + "epoch": 0.952000609256614, + "grad_norm": 0.5998368263244629, + "learning_rate": 1.1343795323272899e-07, + "loss": 0.296, + "step": 42710 + }, + { + "epoch": 0.9521120586372341, + "grad_norm": 0.6101316213607788, + "learning_rate": 1.1291267446019716e-07, + "loss": 0.2241, + "step": 42715 + }, + { + "epoch": 0.9522235080178542, + "grad_norm": 0.8632827997207642, + "learning_rate": 1.1238860776825456e-07, + "loss": 0.2306, + "step": 42720 + }, + { + "epoch": 0.9523349573984743, + "grad_norm": 0.5945008993148804, + "learning_rate": 1.1186575322114868e-07, + "loss": 0.3053, + "step": 42725 + }, + { + "epoch": 0.9524464067790943, + "grad_norm": 0.5753487348556519, + "learning_rate": 1.1134411088297603e-07, + "loss": 0.2783, + "step": 42730 + }, + { + "epoch": 0.9525578561597144, + "grad_norm": 0.4323602616786957, + "learning_rate": 1.1082368081768657e-07, + "loss": 0.3981, + "step": 42735 + }, + { + "epoch": 0.9526693055403345, + "grad_norm": 0.28826257586479187, + "learning_rate": 1.1030446308908038e-07, + "loss": 0.1441, + "step": 42740 + }, + { + "epoch": 0.9527807549209545, + "grad_norm": 0.6044817566871643, + "learning_rate": 1.0978645776081098e-07, + "loss": 0.2885, + "step": 42745 + }, + { + "epoch": 0.9528922043015746, + "grad_norm": 0.6922546625137329, + "learning_rate": 1.0926966489638202e-07, + "loss": 0.3463, + "step": 42750 + }, + { + "epoch": 0.9530036536821946, + "grad_norm": 0.5683593153953552, + "learning_rate": 1.0875408455914726e-07, + "loss": 0.2736, + "step": 42755 + }, + { + "epoch": 0.9531151030628148, + "grad_norm": 0.41599422693252563, + "learning_rate": 1.0823971681231171e-07, + "loss": 0.3351, + "step": 42760 + }, + { + "epoch": 0.9532265524434348, + "grad_norm": 0.6246147155761719, + "learning_rate": 1.0772656171893603e-07, + "loss": 0.2507, + "step": 42765 + }, + { + "epoch": 0.9533380018240548, + "grad_norm": 0.6976191997528076, + "learning_rate": 1.0721461934192545e-07, + "loss": 0.3827, + "step": 42770 + }, + { + "epoch": 0.953449451204675, + "grad_norm": 0.5291700959205627, + "learning_rate": 1.06703889744042e-07, + "loss": 0.3821, + "step": 42775 + }, + { + "epoch": 0.953560900585295, + "grad_norm": 0.6558244824409485, + "learning_rate": 1.0619437298789781e-07, + "loss": 0.2086, + "step": 42780 + }, + { + "epoch": 0.9536723499659151, + "grad_norm": 0.5279899835586548, + "learning_rate": 1.0568606913595514e-07, + "loss": 0.2535, + "step": 42785 + }, + { + "epoch": 0.9537837993465351, + "grad_norm": 0.4647957384586334, + "learning_rate": 1.0517897825052858e-07, + "loss": 0.2313, + "step": 42790 + }, + { + "epoch": 0.9538952487271553, + "grad_norm": 0.9641315340995789, + "learning_rate": 1.0467310039378287e-07, + "loss": 0.2446, + "step": 42795 + }, + { + "epoch": 0.9540066981077753, + "grad_norm": 0.5629506707191467, + "learning_rate": 1.0416843562773393e-07, + "loss": 0.2346, + "step": 42800 + }, + { + "epoch": 0.9541181474883953, + "grad_norm": 0.786285400390625, + "learning_rate": 1.0366498401425117e-07, + "loss": 0.2091, + "step": 42805 + }, + { + "epoch": 0.9542295968690154, + "grad_norm": 0.8931853771209717, + "learning_rate": 1.0316274561505301e-07, + "loss": 0.3457, + "step": 42810 + }, + { + "epoch": 0.9543410462496354, + "grad_norm": 0.9373008608818054, + "learning_rate": 1.026617204917102e-07, + "loss": 0.1567, + "step": 42815 + }, + { + "epoch": 0.9544524956302556, + "grad_norm": 1.3995561599731445, + "learning_rate": 1.0216190870564579e-07, + "loss": 0.1895, + "step": 42820 + }, + { + "epoch": 0.9545639450108756, + "grad_norm": 0.3858010470867157, + "learning_rate": 1.0166331031813082e-07, + "loss": 0.2095, + "step": 42825 + }, + { + "epoch": 0.9546753943914956, + "grad_norm": 0.8172832131385803, + "learning_rate": 1.011659253902908e-07, + "loss": 0.3237, + "step": 42830 + }, + { + "epoch": 0.9547868437721158, + "grad_norm": 0.40116187930107117, + "learning_rate": 1.0066975398310141e-07, + "loss": 0.2896, + "step": 42835 + }, + { + "epoch": 0.9548982931527358, + "grad_norm": 0.3348509669303894, + "learning_rate": 1.0017479615738957e-07, + "loss": 0.266, + "step": 42840 + }, + { + "epoch": 0.9550097425333559, + "grad_norm": 0.6185810565948486, + "learning_rate": 9.968105197383226e-08, + "loss": 0.2971, + "step": 42845 + }, + { + "epoch": 0.9551211919139759, + "grad_norm": 0.8081035614013672, + "learning_rate": 9.918852149295777e-08, + "loss": 0.2854, + "step": 42850 + }, + { + "epoch": 0.955232641294596, + "grad_norm": 0.5405545830726624, + "learning_rate": 9.869720477514999e-08, + "loss": 0.2292, + "step": 42855 + }, + { + "epoch": 0.9553440906752161, + "grad_norm": 0.4950360953807831, + "learning_rate": 9.820710188063854e-08, + "loss": 0.3015, + "step": 42860 + }, + { + "epoch": 0.9554555400558361, + "grad_norm": 0.43373239040374756, + "learning_rate": 9.771821286950533e-08, + "loss": 0.2929, + "step": 42865 + }, + { + "epoch": 0.9555669894364562, + "grad_norm": 0.4429517388343811, + "learning_rate": 9.723053780168579e-08, + "loss": 0.2409, + "step": 42870 + }, + { + "epoch": 0.9556784388170763, + "grad_norm": 0.7364949584007263, + "learning_rate": 9.674407673696429e-08, + "loss": 0.2495, + "step": 42875 + }, + { + "epoch": 0.9557898881976964, + "grad_norm": 0.6135752201080322, + "learning_rate": 9.625882973497757e-08, + "loss": 0.2881, + "step": 42880 + }, + { + "epoch": 0.9559013375783164, + "grad_norm": 0.5923545956611633, + "learning_rate": 9.577479685521363e-08, + "loss": 0.2911, + "step": 42885 + }, + { + "epoch": 0.9560127869589364, + "grad_norm": 0.4776538908481598, + "learning_rate": 9.529197815701052e-08, + "loss": 0.3263, + "step": 42890 + }, + { + "epoch": 0.9561242363395566, + "grad_norm": 0.7010661363601685, + "learning_rate": 9.481037369955759e-08, + "loss": 0.2749, + "step": 42895 + }, + { + "epoch": 0.9562356857201766, + "grad_norm": 1.0098576545715332, + "learning_rate": 9.43299835418976e-08, + "loss": 0.3255, + "step": 42900 + }, + { + "epoch": 0.9563471351007967, + "grad_norm": 0.6457880139350891, + "learning_rate": 9.385080774292122e-08, + "loss": 0.2645, + "step": 42905 + }, + { + "epoch": 0.9564585844814167, + "grad_norm": 0.2925399839878082, + "learning_rate": 9.337284636137256e-08, + "loss": 0.2653, + "step": 42910 + }, + { + "epoch": 0.9565700338620368, + "grad_norm": 0.4781745672225952, + "learning_rate": 9.289609945584477e-08, + "loss": 0.3069, + "step": 42915 + }, + { + "epoch": 0.9566814832426569, + "grad_norm": 0.5949952006340027, + "learning_rate": 9.242056708478442e-08, + "loss": 0.3137, + "step": 42920 + }, + { + "epoch": 0.9567929326232769, + "grad_norm": 0.8055167198181152, + "learning_rate": 9.19462493064871e-08, + "loss": 0.2474, + "step": 42925 + }, + { + "epoch": 0.956904382003897, + "grad_norm": 0.38028067350387573, + "learning_rate": 9.147314617910186e-08, + "loss": 0.2518, + "step": 42930 + }, + { + "epoch": 0.9570158313845171, + "grad_norm": 0.39675191044807434, + "learning_rate": 9.100125776062673e-08, + "loss": 0.1767, + "step": 42935 + }, + { + "epoch": 0.9571272807651372, + "grad_norm": 0.7536404132843018, + "learning_rate": 9.053058410891214e-08, + "loss": 0.2343, + "step": 42940 + }, + { + "epoch": 0.9572387301457572, + "grad_norm": 0.5819075107574463, + "learning_rate": 9.006112528165855e-08, + "loss": 0.3314, + "step": 42945 + }, + { + "epoch": 0.9573501795263772, + "grad_norm": 0.36783868074417114, + "learning_rate": 8.959288133641664e-08, + "loss": 0.2105, + "step": 42950 + }, + { + "epoch": 0.9574616289069974, + "grad_norm": 0.6799458861351013, + "learning_rate": 8.912585233059157e-08, + "loss": 0.2144, + "step": 42955 + }, + { + "epoch": 0.9575730782876174, + "grad_norm": 0.6199312806129456, + "learning_rate": 8.866003832143644e-08, + "loss": 0.1734, + "step": 42960 + }, + { + "epoch": 0.9576845276682375, + "grad_norm": 0.5303043127059937, + "learning_rate": 8.819543936605779e-08, + "loss": 0.2384, + "step": 42965 + }, + { + "epoch": 0.9577959770488575, + "grad_norm": 0.5857964754104614, + "learning_rate": 8.773205552141118e-08, + "loss": 0.2936, + "step": 42970 + }, + { + "epoch": 0.9579074264294776, + "grad_norm": 0.6527206301689148, + "learning_rate": 8.726988684430227e-08, + "loss": 0.2617, + "step": 42975 + }, + { + "epoch": 0.9580188758100977, + "grad_norm": 0.41805943846702576, + "learning_rate": 8.680893339139241e-08, + "loss": 0.2858, + "step": 42980 + }, + { + "epoch": 0.9581303251907177, + "grad_norm": 0.6815401315689087, + "learning_rate": 8.63491952191875e-08, + "loss": 0.2616, + "step": 42985 + }, + { + "epoch": 0.9582417745713379, + "grad_norm": 0.588553249835968, + "learning_rate": 8.589067238404913e-08, + "loss": 0.3657, + "step": 42990 + }, + { + "epoch": 0.9583532239519579, + "grad_norm": 0.8243670463562012, + "learning_rate": 8.543336494219123e-08, + "loss": 0.289, + "step": 42995 + }, + { + "epoch": 0.9584646733325779, + "grad_norm": 0.6533645987510681, + "learning_rate": 8.49772729496734e-08, + "loss": 0.3099, + "step": 43000 + }, + { + "epoch": 0.958576122713198, + "grad_norm": 0.7068596482276917, + "learning_rate": 8.452239646240867e-08, + "loss": 0.2069, + "step": 43005 + }, + { + "epoch": 0.958687572093818, + "grad_norm": 0.4924030303955078, + "learning_rate": 8.406873553616357e-08, + "loss": 0.2956, + "step": 43010 + }, + { + "epoch": 0.9587990214744382, + "grad_norm": 0.646881639957428, + "learning_rate": 8.361629022655138e-08, + "loss": 0.2566, + "step": 43015 + }, + { + "epoch": 0.9589104708550582, + "grad_norm": 0.5528478622436523, + "learning_rate": 8.316506058903994e-08, + "loss": 0.2912, + "step": 43020 + }, + { + "epoch": 0.9590219202356783, + "grad_norm": 0.6520793437957764, + "learning_rate": 8.271504667894503e-08, + "loss": 0.2392, + "step": 43025 + }, + { + "epoch": 0.9591333696162984, + "grad_norm": 0.8448598980903625, + "learning_rate": 8.226624855143694e-08, + "loss": 0.2468, + "step": 43030 + }, + { + "epoch": 0.9592448189969184, + "grad_norm": 0.5260225534439087, + "learning_rate": 8.181866626153278e-08, + "loss": 0.2171, + "step": 43035 + }, + { + "epoch": 0.9593562683775385, + "grad_norm": 0.4451325237751007, + "learning_rate": 8.137229986410422e-08, + "loss": 0.3226, + "step": 43040 + }, + { + "epoch": 0.9594677177581585, + "grad_norm": 0.5130228400230408, + "learning_rate": 8.092714941387081e-08, + "loss": 0.2164, + "step": 43045 + }, + { + "epoch": 0.9595791671387787, + "grad_norm": 0.5746123194694519, + "learning_rate": 8.048321496540557e-08, + "loss": 0.3993, + "step": 43050 + }, + { + "epoch": 0.9596906165193987, + "grad_norm": 0.4763878583908081, + "learning_rate": 8.004049657313162e-08, + "loss": 0.2981, + "step": 43055 + }, + { + "epoch": 0.9598020659000187, + "grad_norm": 0.6170870661735535, + "learning_rate": 7.959899429132112e-08, + "loss": 0.3722, + "step": 43060 + }, + { + "epoch": 0.9599135152806388, + "grad_norm": 0.458804726600647, + "learning_rate": 7.915870817410188e-08, + "loss": 0.3045, + "step": 43065 + }, + { + "epoch": 0.9600249646612589, + "grad_norm": 0.6858850717544556, + "learning_rate": 7.871963827544738e-08, + "loss": 0.245, + "step": 43070 + }, + { + "epoch": 0.960136414041879, + "grad_norm": 0.5968716144561768, + "learning_rate": 7.828178464918456e-08, + "loss": 0.1815, + "step": 43075 + }, + { + "epoch": 0.960247863422499, + "grad_norm": 0.9967861771583557, + "learning_rate": 7.784514734899052e-08, + "loss": 0.3042, + "step": 43080 + }, + { + "epoch": 0.9603593128031191, + "grad_norm": 0.733881950378418, + "learning_rate": 7.740972642839573e-08, + "loss": 0.2845, + "step": 43085 + }, + { + "epoch": 0.9604707621837392, + "grad_norm": 0.4205930233001709, + "learning_rate": 7.69755219407764e-08, + "loss": 0.28, + "step": 43090 + }, + { + "epoch": 0.9605822115643592, + "grad_norm": 0.5251030325889587, + "learning_rate": 7.65425339393644e-08, + "loss": 0.2224, + "step": 43095 + }, + { + "epoch": 0.9606936609449793, + "grad_norm": 0.43053174018859863, + "learning_rate": 7.611076247724058e-08, + "loss": 0.2235, + "step": 43100 + }, + { + "epoch": 0.9608051103255993, + "grad_norm": 0.9452346563339233, + "learning_rate": 7.568020760733707e-08, + "loss": 0.3017, + "step": 43105 + }, + { + "epoch": 0.9609165597062195, + "grad_norm": 0.821847140789032, + "learning_rate": 7.525086938243498e-08, + "loss": 0.2344, + "step": 43110 + }, + { + "epoch": 0.9610280090868395, + "grad_norm": 0.7619543075561523, + "learning_rate": 7.482274785516996e-08, + "loss": 0.4167, + "step": 43115 + }, + { + "epoch": 0.9611394584674595, + "grad_norm": 0.725729763507843, + "learning_rate": 7.439584307802449e-08, + "loss": 0.2355, + "step": 43120 + }, + { + "epoch": 0.9612509078480796, + "grad_norm": 0.6464846730232239, + "learning_rate": 7.397015510333561e-08, + "loss": 0.3668, + "step": 43125 + }, + { + "epoch": 0.9613623572286997, + "grad_norm": 0.6052849292755127, + "learning_rate": 7.35456839832882e-08, + "loss": 0.1793, + "step": 43130 + }, + { + "epoch": 0.9614738066093198, + "grad_norm": 0.510604202747345, + "learning_rate": 7.312242976991956e-08, + "loss": 0.2486, + "step": 43135 + }, + { + "epoch": 0.9615852559899398, + "grad_norm": 0.9605926275253296, + "learning_rate": 7.270039251511707e-08, + "loss": 0.3757, + "step": 43140 + }, + { + "epoch": 0.96169670537056, + "grad_norm": 0.487678587436676, + "learning_rate": 7.227957227061933e-08, + "loss": 0.2236, + "step": 43145 + }, + { + "epoch": 0.96180815475118, + "grad_norm": 0.420016884803772, + "learning_rate": 7.185996908801618e-08, + "loss": 0.3208, + "step": 43150 + }, + { + "epoch": 0.9619196041318, + "grad_norm": 0.5586323738098145, + "learning_rate": 7.14415830187476e-08, + "loss": 0.2396, + "step": 43155 + }, + { + "epoch": 0.9620310535124201, + "grad_norm": 0.484066903591156, + "learning_rate": 7.102441411410366e-08, + "loss": 0.2765, + "step": 43160 + }, + { + "epoch": 0.9621425028930402, + "grad_norm": 0.9337204694747925, + "learning_rate": 7.060846242522679e-08, + "loss": 0.3034, + "step": 43165 + }, + { + "epoch": 0.9622539522736603, + "grad_norm": 0.5187270045280457, + "learning_rate": 7.019372800310953e-08, + "loss": 0.2147, + "step": 43170 + }, + { + "epoch": 0.9623654016542803, + "grad_norm": 0.6802904605865479, + "learning_rate": 6.978021089859454e-08, + "loss": 0.1796, + "step": 43175 + }, + { + "epoch": 0.9624768510349003, + "grad_norm": 0.3484508693218231, + "learning_rate": 6.936791116237574e-08, + "loss": 0.2456, + "step": 43180 + }, + { + "epoch": 0.9625883004155205, + "grad_norm": 0.5931937098503113, + "learning_rate": 6.895682884499933e-08, + "loss": 0.2011, + "step": 43185 + }, + { + "epoch": 0.9626997497961405, + "grad_norm": 0.8017531037330627, + "learning_rate": 6.854696399685945e-08, + "loss": 0.309, + "step": 43190 + }, + { + "epoch": 0.9628111991767606, + "grad_norm": 0.6763916015625, + "learning_rate": 6.813831666820259e-08, + "loss": 0.2995, + "step": 43195 + }, + { + "epoch": 0.9629226485573806, + "grad_norm": 0.579896092414856, + "learning_rate": 6.773088690912644e-08, + "loss": 0.2025, + "step": 43200 + }, + { + "epoch": 0.9630340979380007, + "grad_norm": 0.6687420606613159, + "learning_rate": 6.73246747695766e-08, + "loss": 0.2448, + "step": 43205 + }, + { + "epoch": 0.9631455473186208, + "grad_norm": 0.8000922203063965, + "learning_rate": 6.691968029935436e-08, + "loss": 0.3776, + "step": 43210 + }, + { + "epoch": 0.9632569966992408, + "grad_norm": 0.8290566802024841, + "learning_rate": 6.651590354810777e-08, + "loss": 0.1601, + "step": 43215 + }, + { + "epoch": 0.9633684460798609, + "grad_norm": 0.5483561754226685, + "learning_rate": 6.611334456533725e-08, + "loss": 0.2459, + "step": 43220 + }, + { + "epoch": 0.963479895460481, + "grad_norm": 0.6955622434616089, + "learning_rate": 6.57120034003922e-08, + "loss": 0.2317, + "step": 43225 + }, + { + "epoch": 0.9635913448411011, + "grad_norm": 0.43610045313835144, + "learning_rate": 6.531188010247436e-08, + "loss": 0.2193, + "step": 43230 + }, + { + "epoch": 0.9637027942217211, + "grad_norm": 0.7788457274436951, + "learning_rate": 6.491297472063563e-08, + "loss": 0.2353, + "step": 43235 + }, + { + "epoch": 0.9638142436023411, + "grad_norm": 0.38781026005744934, + "learning_rate": 6.45152873037802e-08, + "loss": 0.2539, + "step": 43240 + }, + { + "epoch": 0.9639256929829613, + "grad_norm": 0.6308822631835938, + "learning_rate": 6.41188179006591e-08, + "loss": 0.2511, + "step": 43245 + }, + { + "epoch": 0.9640371423635813, + "grad_norm": 0.9062620401382446, + "learning_rate": 6.3723566559879e-08, + "loss": 0.2205, + "step": 43250 + }, + { + "epoch": 0.9641485917442014, + "grad_norm": 1.0278260707855225, + "learning_rate": 6.332953332989334e-08, + "loss": 0.3506, + "step": 43255 + }, + { + "epoch": 0.9642600411248214, + "grad_norm": 0.5123734474182129, + "learning_rate": 6.293671825900571e-08, + "loss": 0.282, + "step": 43260 + }, + { + "epoch": 0.9643714905054415, + "grad_norm": 0.5359479784965515, + "learning_rate": 6.254512139537539e-08, + "loss": 0.2928, + "step": 43265 + }, + { + "epoch": 0.9644829398860616, + "grad_norm": 0.8474448323249817, + "learning_rate": 6.215474278700839e-08, + "loss": 0.2269, + "step": 43270 + }, + { + "epoch": 0.9645943892666816, + "grad_norm": 0.15656907856464386, + "learning_rate": 6.17655824817609e-08, + "loss": 0.1414, + "step": 43275 + }, + { + "epoch": 0.9647058386473017, + "grad_norm": 0.4503397047519684, + "learning_rate": 6.13776405273414e-08, + "loss": 0.2113, + "step": 43280 + }, + { + "epoch": 0.9648172880279218, + "grad_norm": 0.4656757414340973, + "learning_rate": 6.099091697130965e-08, + "loss": 0.2896, + "step": 43285 + }, + { + "epoch": 0.9649287374085419, + "grad_norm": 0.571235716342926, + "learning_rate": 6.060541186107327e-08, + "loss": 0.2686, + "step": 43290 + }, + { + "epoch": 0.9650401867891619, + "grad_norm": 0.5569755434989929, + "learning_rate": 6.022112524389223e-08, + "loss": 0.3137, + "step": 43295 + }, + { + "epoch": 0.965151636169782, + "grad_norm": 0.4425262212753296, + "learning_rate": 5.983805716687996e-08, + "loss": 0.2193, + "step": 43300 + }, + { + "epoch": 0.9652630855504021, + "grad_norm": 0.7848999500274658, + "learning_rate": 5.9456207676993336e-08, + "loss": 0.2234, + "step": 43305 + }, + { + "epoch": 0.9653745349310221, + "grad_norm": 0.9052958488464355, + "learning_rate": 5.9075576821048256e-08, + "loss": 0.2435, + "step": 43310 + }, + { + "epoch": 0.9654859843116422, + "grad_norm": 0.7334598898887634, + "learning_rate": 5.869616464570516e-08, + "loss": 0.2737, + "step": 43315 + }, + { + "epoch": 0.9655974336922623, + "grad_norm": 0.49490249156951904, + "learning_rate": 5.831797119747684e-08, + "loss": 0.228, + "step": 43320 + }, + { + "epoch": 0.9657088830728823, + "grad_norm": 0.7265264987945557, + "learning_rate": 5.794099652272622e-08, + "loss": 0.284, + "step": 43325 + }, + { + "epoch": 0.9658203324535024, + "grad_norm": 0.7840932011604309, + "learning_rate": 5.756524066766966e-08, + "loss": 0.3362, + "step": 43330 + }, + { + "epoch": 0.9659317818341224, + "grad_norm": 0.35997217893600464, + "learning_rate": 5.719070367837032e-08, + "loss": 0.155, + "step": 43335 + }, + { + "epoch": 0.9660432312147426, + "grad_norm": 0.3674221336841583, + "learning_rate": 5.681738560074479e-08, + "loss": 0.2327, + "step": 43340 + }, + { + "epoch": 0.9661546805953626, + "grad_norm": 0.549663245677948, + "learning_rate": 5.6445286480557583e-08, + "loss": 0.2507, + "step": 43345 + }, + { + "epoch": 0.9662661299759827, + "grad_norm": 0.31489622592926025, + "learning_rate": 5.6074406363425534e-08, + "loss": 0.4149, + "step": 43350 + }, + { + "epoch": 0.9663775793566027, + "grad_norm": 0.39642104506492615, + "learning_rate": 5.5704745294815624e-08, + "loss": 0.3571, + "step": 43355 + }, + { + "epoch": 0.9664890287372228, + "grad_norm": 0.7653921842575073, + "learning_rate": 5.533630332004714e-08, + "loss": 0.393, + "step": 43360 + }, + { + "epoch": 0.9666004781178429, + "grad_norm": 0.5353769659996033, + "learning_rate": 5.496908048428618e-08, + "loss": 0.2873, + "step": 43365 + }, + { + "epoch": 0.9667119274984629, + "grad_norm": 0.5851602554321289, + "learning_rate": 5.4603076832552284e-08, + "loss": 0.1736, + "step": 43370 + }, + { + "epoch": 0.966823376879083, + "grad_norm": 0.4860190451145172, + "learning_rate": 5.423829240971401e-08, + "loss": 0.2902, + "step": 43375 + }, + { + "epoch": 0.9669348262597031, + "grad_norm": 0.8070089221000671, + "learning_rate": 5.3874727260491146e-08, + "loss": 0.2503, + "step": 43380 + }, + { + "epoch": 0.9670462756403231, + "grad_norm": 0.6368986964225769, + "learning_rate": 5.3512381429455804e-08, + "loss": 0.2482, + "step": 43385 + }, + { + "epoch": 0.9671577250209432, + "grad_norm": 0.4795011579990387, + "learning_rate": 5.31512549610258e-08, + "loss": 0.2357, + "step": 43390 + }, + { + "epoch": 0.9672691744015632, + "grad_norm": 0.7794646620750427, + "learning_rate": 5.27913478994746e-08, + "loss": 0.327, + "step": 43395 + }, + { + "epoch": 0.9673806237821834, + "grad_norm": 0.5979329347610474, + "learning_rate": 5.2432660288924687e-08, + "loss": 0.2538, + "step": 43400 + }, + { + "epoch": 0.9674920731628034, + "grad_norm": 0.7042983770370483, + "learning_rate": 5.207519217334645e-08, + "loss": 0.2317, + "step": 43405 + }, + { + "epoch": 0.9676035225434234, + "grad_norm": 0.5814474821090698, + "learning_rate": 5.17189435965626e-08, + "loss": 0.3646, + "step": 43410 + }, + { + "epoch": 0.9677149719240435, + "grad_norm": 0.47899243235588074, + "learning_rate": 5.136391460224821e-08, + "loss": 0.1985, + "step": 43415 + }, + { + "epoch": 0.9678264213046636, + "grad_norm": 0.6566064953804016, + "learning_rate": 5.101010523392513e-08, + "loss": 0.3045, + "step": 43420 + }, + { + "epoch": 0.9679378706852837, + "grad_norm": 0.46647369861602783, + "learning_rate": 5.065751553496978e-08, + "loss": 0.294, + "step": 43425 + }, + { + "epoch": 0.9680493200659037, + "grad_norm": 0.5277575254440308, + "learning_rate": 5.030614554860647e-08, + "loss": 0.2, + "step": 43430 + }, + { + "epoch": 0.9681607694465239, + "grad_norm": 0.41413211822509766, + "learning_rate": 4.9955995317908514e-08, + "loss": 0.2468, + "step": 43435 + }, + { + "epoch": 0.9682722188271439, + "grad_norm": 0.5823618173599243, + "learning_rate": 4.96070648858038e-08, + "loss": 0.2698, + "step": 43440 + }, + { + "epoch": 0.9683836682077639, + "grad_norm": 0.4486048221588135, + "learning_rate": 4.925935429506701e-08, + "loss": 0.377, + "step": 43445 + }, + { + "epoch": 0.968495117588384, + "grad_norm": 0.6892697215080261, + "learning_rate": 4.891286358832626e-08, + "loss": 0.3767, + "step": 43450 + }, + { + "epoch": 0.968606566969004, + "grad_norm": 0.8102115988731384, + "learning_rate": 4.8567592808057564e-08, + "loss": 0.3368, + "step": 43455 + }, + { + "epoch": 0.9687180163496242, + "grad_norm": 0.646845817565918, + "learning_rate": 4.8223541996588186e-08, + "loss": 0.2482, + "step": 43460 + }, + { + "epoch": 0.9688294657302442, + "grad_norm": 0.42743000388145447, + "learning_rate": 4.788071119609772e-08, + "loss": 0.2973, + "step": 43465 + }, + { + "epoch": 0.9689409151108642, + "grad_norm": 0.6461029648780823, + "learning_rate": 4.753910044861254e-08, + "loss": 0.2869, + "step": 43470 + }, + { + "epoch": 0.9690523644914844, + "grad_norm": 0.45668455958366394, + "learning_rate": 4.719870979601249e-08, + "loss": 0.3004, + "step": 43475 + }, + { + "epoch": 0.9691638138721044, + "grad_norm": 0.9763020277023315, + "learning_rate": 4.685953928002751e-08, + "loss": 0.2883, + "step": 43480 + }, + { + "epoch": 0.9692752632527245, + "grad_norm": 0.5899277925491333, + "learning_rate": 4.6521588942235466e-08, + "loss": 0.2785, + "step": 43485 + }, + { + "epoch": 0.9693867126333445, + "grad_norm": 0.5160545110702515, + "learning_rate": 4.618485882406876e-08, + "loss": 0.285, + "step": 43490 + }, + { + "epoch": 0.9694981620139647, + "grad_norm": 0.6278097033500671, + "learning_rate": 4.5849348966805485e-08, + "loss": 0.2861, + "step": 43495 + }, + { + "epoch": 0.9696096113945847, + "grad_norm": 0.45948338508605957, + "learning_rate": 4.55150594115783e-08, + "loss": 0.3111, + "step": 43500 + }, + { + "epoch": 0.9697210607752047, + "grad_norm": 0.7423383593559265, + "learning_rate": 4.518199019936886e-08, + "loss": 0.3179, + "step": 43505 + }, + { + "epoch": 0.9698325101558248, + "grad_norm": 0.6052937507629395, + "learning_rate": 4.485014137100674e-08, + "loss": 0.2932, + "step": 43510 + }, + { + "epoch": 0.9699439595364449, + "grad_norm": 0.5899354219436646, + "learning_rate": 4.451951296717605e-08, + "loss": 0.1974, + "step": 43515 + }, + { + "epoch": 0.970055408917065, + "grad_norm": 0.621191143989563, + "learning_rate": 4.41901050284077e-08, + "loss": 0.3295, + "step": 43520 + }, + { + "epoch": 0.970166858297685, + "grad_norm": 0.6658492088317871, + "learning_rate": 4.386191759508607e-08, + "loss": 0.3678, + "step": 43525 + }, + { + "epoch": 0.970278307678305, + "grad_norm": 0.8935458064079285, + "learning_rate": 4.3534950707444515e-08, + "loss": 0.2223, + "step": 43530 + }, + { + "epoch": 0.9703897570589252, + "grad_norm": 0.4643547236919403, + "learning_rate": 4.320920440556542e-08, + "loss": 0.3042, + "step": 43535 + }, + { + "epoch": 0.9705012064395452, + "grad_norm": 0.749759316444397, + "learning_rate": 4.28846787293824e-08, + "loss": 0.2473, + "step": 43540 + }, + { + "epoch": 0.9706126558201653, + "grad_norm": 0.6472769975662231, + "learning_rate": 4.256137371868141e-08, + "loss": 0.2965, + "step": 43545 + }, + { + "epoch": 0.9707241052007853, + "grad_norm": 0.8817629814147949, + "learning_rate": 4.2239289413096296e-08, + "loss": 0.1953, + "step": 43550 + }, + { + "epoch": 0.9708355545814055, + "grad_norm": 0.7503361105918884, + "learning_rate": 4.1918425852111034e-08, + "loss": 0.2381, + "step": 43555 + }, + { + "epoch": 0.9709470039620255, + "grad_norm": 0.524804413318634, + "learning_rate": 4.1598783075064154e-08, + "loss": 0.2617, + "step": 43560 + }, + { + "epoch": 0.9710584533426455, + "grad_norm": 0.804250955581665, + "learning_rate": 4.1280361121137645e-08, + "loss": 0.0857, + "step": 43565 + }, + { + "epoch": 0.9711699027232656, + "grad_norm": 0.9102737903594971, + "learning_rate": 4.0963160029370284e-08, + "loss": 0.2496, + "step": 43570 + }, + { + "epoch": 0.9712813521038857, + "grad_norm": 0.7096487283706665, + "learning_rate": 4.064717983864763e-08, + "loss": 0.3119, + "step": 43575 + }, + { + "epoch": 0.9713928014845058, + "grad_norm": 0.6348690390586853, + "learning_rate": 4.033242058770648e-08, + "loss": 0.253, + "step": 43580 + }, + { + "epoch": 0.9715042508651258, + "grad_norm": 0.6706966161727905, + "learning_rate": 4.0018882315132624e-08, + "loss": 0.3657, + "step": 43585 + }, + { + "epoch": 0.9716157002457458, + "grad_norm": 0.6501227617263794, + "learning_rate": 3.970656505936532e-08, + "loss": 0.2629, + "step": 43590 + }, + { + "epoch": 0.971727149626366, + "grad_norm": 0.6011533737182617, + "learning_rate": 3.939546885869172e-08, + "loss": 0.3037, + "step": 43595 + }, + { + "epoch": 0.971838599006986, + "grad_norm": 0.7536007165908813, + "learning_rate": 3.9085593751249094e-08, + "loss": 0.3844, + "step": 43600 + }, + { + "epoch": 0.9719500483876061, + "grad_norm": 0.8846654891967773, + "learning_rate": 3.877693977502594e-08, + "loss": 0.2288, + "step": 43605 + }, + { + "epoch": 0.9720614977682261, + "grad_norm": 0.35863903164863586, + "learning_rate": 3.8469506967862e-08, + "loss": 0.2713, + "step": 43610 + }, + { + "epoch": 0.9721729471488462, + "grad_norm": 0.658648669719696, + "learning_rate": 3.81632953674449e-08, + "loss": 0.2827, + "step": 43615 + }, + { + "epoch": 0.9722843965294663, + "grad_norm": 0.6056036353111267, + "learning_rate": 3.78583050113146e-08, + "loss": 0.201, + "step": 43620 + }, + { + "epoch": 0.9723958459100863, + "grad_norm": 0.6864349842071533, + "learning_rate": 3.755453593685898e-08, + "loss": 0.3008, + "step": 43625 + }, + { + "epoch": 0.9725072952907065, + "grad_norm": 0.8081383109092712, + "learning_rate": 3.7251988181319365e-08, + "loss": 0.3825, + "step": 43630 + }, + { + "epoch": 0.9726187446713265, + "grad_norm": 0.3625475764274597, + "learning_rate": 3.695066178178608e-08, + "loss": 0.2838, + "step": 43635 + }, + { + "epoch": 0.9727301940519466, + "grad_norm": 0.6773290038108826, + "learning_rate": 3.6650556775198464e-08, + "loss": 0.2896, + "step": 43640 + }, + { + "epoch": 0.9728416434325666, + "grad_norm": 0.603001594543457, + "learning_rate": 3.635167319834709e-08, + "loss": 0.2838, + "step": 43645 + }, + { + "epoch": 0.9729530928131866, + "grad_norm": 1.0788512229919434, + "learning_rate": 3.6054011087873765e-08, + "loss": 0.3954, + "step": 43650 + }, + { + "epoch": 0.9730645421938068, + "grad_norm": 0.49225085973739624, + "learning_rate": 3.575757048026818e-08, + "loss": 0.3231, + "step": 43655 + }, + { + "epoch": 0.9731759915744268, + "grad_norm": 0.5131734013557434, + "learning_rate": 3.546235141187238e-08, + "loss": 0.286, + "step": 43660 + }, + { + "epoch": 0.9732874409550469, + "grad_norm": 0.43520259857177734, + "learning_rate": 3.5168353918877406e-08, + "loss": 0.3706, + "step": 43665 + }, + { + "epoch": 0.973398890335667, + "grad_norm": 0.9183757901191711, + "learning_rate": 3.487557803732555e-08, + "loss": 0.3376, + "step": 43670 + }, + { + "epoch": 0.973510339716287, + "grad_norm": 0.5258111953735352, + "learning_rate": 3.458402380310921e-08, + "loss": 0.2385, + "step": 43675 + }, + { + "epoch": 0.9736217890969071, + "grad_norm": 0.6289880871772766, + "learning_rate": 3.429369125197091e-08, + "loss": 0.2567, + "step": 43680 + }, + { + "epoch": 0.9737332384775271, + "grad_norm": 0.3663708567619324, + "learning_rate": 3.4004580419502164e-08, + "loss": 0.3914, + "step": 43685 + }, + { + "epoch": 0.9738446878581473, + "grad_norm": 0.5350440740585327, + "learning_rate": 3.3716691341146855e-08, + "loss": 0.3602, + "step": 43690 + }, + { + "epoch": 0.9739561372387673, + "grad_norm": 0.33239659667015076, + "learning_rate": 3.343002405219564e-08, + "loss": 0.294, + "step": 43695 + }, + { + "epoch": 0.9740675866193874, + "grad_norm": 0.9363645315170288, + "learning_rate": 3.314457858779485e-08, + "loss": 0.3205, + "step": 43700 + }, + { + "epoch": 0.9741790360000074, + "grad_norm": 0.6830964088439941, + "learning_rate": 3.2860354982935385e-08, + "loss": 0.2822, + "step": 43705 + }, + { + "epoch": 0.9742904853806275, + "grad_norm": 0.8132318258285522, + "learning_rate": 3.25773532724627e-08, + "loss": 0.273, + "step": 43710 + }, + { + "epoch": 0.9744019347612476, + "grad_norm": 0.7080520987510681, + "learning_rate": 3.2295573491070157e-08, + "loss": 0.3019, + "step": 43715 + }, + { + "epoch": 0.9745133841418676, + "grad_norm": 0.9034405946731567, + "learning_rate": 3.201501567330012e-08, + "loss": 0.2615, + "step": 43720 + }, + { + "epoch": 0.9746248335224877, + "grad_norm": 0.5081268548965454, + "learning_rate": 3.173567985354842e-08, + "loss": 0.2833, + "step": 43725 + }, + { + "epoch": 0.9747362829031078, + "grad_norm": 0.7102012038230896, + "learning_rate": 3.145756606605988e-08, + "loss": 0.2291, + "step": 43730 + }, + { + "epoch": 0.9748477322837278, + "grad_norm": 0.8138337135314941, + "learning_rate": 3.118067434492833e-08, + "loss": 0.29, + "step": 43735 + }, + { + "epoch": 0.9749591816643479, + "grad_norm": 0.2943519055843353, + "learning_rate": 3.090500472409774e-08, + "loss": 0.3336, + "step": 43740 + }, + { + "epoch": 0.9750706310449679, + "grad_norm": 0.5932673215866089, + "learning_rate": 3.0630557237365524e-08, + "loss": 0.3135, + "step": 43745 + }, + { + "epoch": 0.9751820804255881, + "grad_norm": 0.9085009694099426, + "learning_rate": 3.0357331918373644e-08, + "loss": 0.2403, + "step": 43750 + }, + { + "epoch": 0.9752935298062081, + "grad_norm": 0.5600362420082092, + "learning_rate": 3.0085328800619763e-08, + "loss": 0.2085, + "step": 43755 + }, + { + "epoch": 0.9754049791868281, + "grad_norm": 0.8048264384269714, + "learning_rate": 2.981454791744831e-08, + "loss": 0.3457, + "step": 43760 + }, + { + "epoch": 0.9755164285674482, + "grad_norm": 0.5526010394096375, + "learning_rate": 2.9544989302056072e-08, + "loss": 0.2698, + "step": 43765 + }, + { + "epoch": 0.9756278779480683, + "grad_norm": 0.5081197023391724, + "learning_rate": 2.927665298748772e-08, + "loss": 0.2491, + "step": 43770 + }, + { + "epoch": 0.9757393273286884, + "grad_norm": 0.5824170708656311, + "learning_rate": 2.9009539006639165e-08, + "loss": 0.2384, + "step": 43775 + }, + { + "epoch": 0.9758507767093084, + "grad_norm": 1.076564908027649, + "learning_rate": 2.8743647392257546e-08, + "loss": 0.1521, + "step": 43780 + }, + { + "epoch": 0.9759622260899286, + "grad_norm": 0.5411296486854553, + "learning_rate": 2.84789781769379e-08, + "loss": 0.3039, + "step": 43785 + }, + { + "epoch": 0.9760736754705486, + "grad_norm": 0.696861982345581, + "learning_rate": 2.8215531393126495e-08, + "loss": 0.3149, + "step": 43790 + }, + { + "epoch": 0.9761851248511686, + "grad_norm": 0.43493250012397766, + "learning_rate": 2.7953307073121936e-08, + "loss": 0.2139, + "step": 43795 + }, + { + "epoch": 0.9762965742317887, + "grad_norm": 0.5109127759933472, + "learning_rate": 2.7692305249068518e-08, + "loss": 0.2602, + "step": 43800 + }, + { + "epoch": 0.9764080236124087, + "grad_norm": 0.7707669734954834, + "learning_rate": 2.7432525952965084e-08, + "loss": 0.1913, + "step": 43805 + }, + { + "epoch": 0.9765194729930289, + "grad_norm": 0.8124564290046692, + "learning_rate": 2.717396921665727e-08, + "loss": 0.265, + "step": 43810 + }, + { + "epoch": 0.9766309223736489, + "grad_norm": 0.5128911137580872, + "learning_rate": 2.6916635071841945e-08, + "loss": 0.2216, + "step": 43815 + }, + { + "epoch": 0.9767423717542689, + "grad_norm": 0.8029731512069702, + "learning_rate": 2.666052355006721e-08, + "loss": 0.3493, + "step": 43820 + }, + { + "epoch": 0.976853821134889, + "grad_norm": 0.5877036452293396, + "learning_rate": 2.6405634682729054e-08, + "loss": 0.1946, + "step": 43825 + }, + { + "epoch": 0.9769652705155091, + "grad_norm": 0.714207112789154, + "learning_rate": 2.615196850107693e-08, + "loss": 0.3386, + "step": 43830 + }, + { + "epoch": 0.9770767198961292, + "grad_norm": 0.5350043773651123, + "learning_rate": 2.5899525036207073e-08, + "loss": 0.2033, + "step": 43835 + }, + { + "epoch": 0.9771881692767492, + "grad_norm": 0.7569406628608704, + "learning_rate": 2.5648304319065843e-08, + "loss": 0.2975, + "step": 43840 + }, + { + "epoch": 0.9772996186573694, + "grad_norm": 0.7935879826545715, + "learning_rate": 2.539830638045415e-08, + "loss": 0.3681, + "step": 43845 + }, + { + "epoch": 0.9774110680379894, + "grad_norm": 0.6853658556938171, + "learning_rate": 2.514953125101638e-08, + "loss": 0.1598, + "step": 43850 + }, + { + "epoch": 0.9775225174186094, + "grad_norm": 0.45932871103286743, + "learning_rate": 2.4901978961253682e-08, + "loss": 0.26, + "step": 43855 + }, + { + "epoch": 0.9776339667992295, + "grad_norm": 0.8988186120986938, + "learning_rate": 2.4655649541510674e-08, + "loss": 0.3586, + "step": 43860 + }, + { + "epoch": 0.9777454161798496, + "grad_norm": 0.6980366110801697, + "learning_rate": 2.4410543021988753e-08, + "loss": 0.2054, + "step": 43865 + }, + { + "epoch": 0.9778568655604697, + "grad_norm": 0.4933127164840698, + "learning_rate": 2.4166659432733884e-08, + "loss": 0.1948, + "step": 43870 + }, + { + "epoch": 0.9779683149410897, + "grad_norm": 0.6166961789131165, + "learning_rate": 2.392399880364438e-08, + "loss": 0.2251, + "step": 43875 + }, + { + "epoch": 0.9780797643217097, + "grad_norm": 0.7168357968330383, + "learning_rate": 2.3682561164469764e-08, + "loss": 0.2675, + "step": 43880 + }, + { + "epoch": 0.9781912137023299, + "grad_norm": 1.0506415367126465, + "learning_rate": 2.3442346544807482e-08, + "loss": 0.2907, + "step": 43885 + }, + { + "epoch": 0.9783026630829499, + "grad_norm": 0.6322489380836487, + "learning_rate": 2.3203354974107305e-08, + "loss": 0.2523, + "step": 43890 + }, + { + "epoch": 0.97841411246357, + "grad_norm": 0.29705050587654114, + "learning_rate": 2.2965586481665804e-08, + "loss": 0.2348, + "step": 43895 + }, + { + "epoch": 0.97852556184419, + "grad_norm": 0.4134402275085449, + "learning_rate": 2.2729041096632987e-08, + "loss": 0.3822, + "step": 43900 + }, + { + "epoch": 0.9786370112248102, + "grad_norm": 0.9109967947006226, + "learning_rate": 2.2493718848006773e-08, + "loss": 0.3193, + "step": 43905 + }, + { + "epoch": 0.9787484606054302, + "grad_norm": 0.6592934727668762, + "learning_rate": 2.22596197646352e-08, + "loss": 0.3314, + "step": 43910 + }, + { + "epoch": 0.9788599099860502, + "grad_norm": 0.9526196122169495, + "learning_rate": 2.2026743875218637e-08, + "loss": 0.2137, + "step": 43915 + }, + { + "epoch": 0.9789713593666703, + "grad_norm": 0.8369274735450745, + "learning_rate": 2.1795091208305363e-08, + "loss": 0.1951, + "step": 43920 + }, + { + "epoch": 0.9790828087472904, + "grad_norm": 0.6565558910369873, + "learning_rate": 2.1564661792293773e-08, + "loss": 0.2329, + "step": 43925 + }, + { + "epoch": 0.9791942581279105, + "grad_norm": 0.7436085343360901, + "learning_rate": 2.133545565543349e-08, + "loss": 0.2862, + "step": 43930 + }, + { + "epoch": 0.9793057075085305, + "grad_norm": 0.6626237630844116, + "learning_rate": 2.110747282582204e-08, + "loss": 0.2831, + "step": 43935 + }, + { + "epoch": 0.9794171568891505, + "grad_norm": 1.3161259889602661, + "learning_rate": 2.0880713331410397e-08, + "loss": 0.3011, + "step": 43940 + }, + { + "epoch": 0.9795286062697707, + "grad_norm": 0.7118861675262451, + "learning_rate": 2.0655177199995214e-08, + "loss": 0.273, + "step": 43945 + }, + { + "epoch": 0.9796400556503907, + "grad_norm": 0.583651065826416, + "learning_rate": 2.0430864459226594e-08, + "loss": 0.31, + "step": 43950 + }, + { + "epoch": 0.9797515050310108, + "grad_norm": 0.956254243850708, + "learning_rate": 2.020777513660366e-08, + "loss": 0.3354, + "step": 43955 + }, + { + "epoch": 0.9798629544116308, + "grad_norm": 0.6450993418693542, + "learning_rate": 1.9985909259475635e-08, + "loss": 0.2298, + "step": 43960 + }, + { + "epoch": 0.9799744037922509, + "grad_norm": 0.8870912790298462, + "learning_rate": 1.9765266855041876e-08, + "loss": 0.3602, + "step": 43965 + }, + { + "epoch": 0.980085853172871, + "grad_norm": 0.7856428623199463, + "learning_rate": 1.9545847950349638e-08, + "loss": 0.2129, + "step": 43970 + }, + { + "epoch": 0.980197302553491, + "grad_norm": 0.4291646480560303, + "learning_rate": 1.9327652572299628e-08, + "loss": 0.1795, + "step": 43975 + }, + { + "epoch": 0.9803087519341112, + "grad_norm": 0.5906414985656738, + "learning_rate": 1.9110680747640442e-08, + "loss": 0.2781, + "step": 43980 + }, + { + "epoch": 0.9804202013147312, + "grad_norm": 0.8675142526626587, + "learning_rate": 1.8894932502970807e-08, + "loss": 0.3785, + "step": 43985 + }, + { + "epoch": 0.9805316506953513, + "grad_norm": 0.3896576166152954, + "learning_rate": 1.868040786474068e-08, + "loss": 0.2942, + "step": 43990 + }, + { + "epoch": 0.9806431000759713, + "grad_norm": 0.6647374629974365, + "learning_rate": 1.8467106859247907e-08, + "loss": 0.3858, + "step": 43995 + }, + { + "epoch": 0.9807545494565914, + "grad_norm": 0.6497798562049866, + "learning_rate": 1.8255029512642686e-08, + "loss": 0.1956, + "step": 44000 + }, + { + "epoch": 0.9808659988372115, + "grad_norm": 0.571378231048584, + "learning_rate": 1.8044175850924215e-08, + "loss": 0.2705, + "step": 44005 + }, + { + "epoch": 0.9809774482178315, + "grad_norm": 0.7403730750083923, + "learning_rate": 1.7834545899939602e-08, + "loss": 0.4463, + "step": 44010 + }, + { + "epoch": 0.9810888975984516, + "grad_norm": 0.8009006381034851, + "learning_rate": 1.7626139685389398e-08, + "loss": 0.2168, + "step": 44015 + }, + { + "epoch": 0.9812003469790717, + "grad_norm": 0.5484341979026794, + "learning_rate": 1.7418957232823164e-08, + "loss": 0.2656, + "step": 44020 + }, + { + "epoch": 0.9813117963596917, + "grad_norm": 0.258148193359375, + "learning_rate": 1.7212998567639473e-08, + "loss": 0.2093, + "step": 44025 + }, + { + "epoch": 0.9814232457403118, + "grad_norm": 0.6334442496299744, + "learning_rate": 1.7008263715085904e-08, + "loss": 0.2161, + "step": 44030 + }, + { + "epoch": 0.9815346951209318, + "grad_norm": 0.43233025074005127, + "learning_rate": 1.6804752700262385e-08, + "loss": 0.3004, + "step": 44035 + }, + { + "epoch": 0.981646144501552, + "grad_norm": 0.5253536105155945, + "learning_rate": 1.660246554811784e-08, + "loss": 0.1689, + "step": 44040 + }, + { + "epoch": 0.981757593882172, + "grad_norm": 0.5353971719741821, + "learning_rate": 1.640140228345133e-08, + "loss": 0.275, + "step": 44045 + }, + { + "epoch": 0.9818690432627921, + "grad_norm": 0.4450036287307739, + "learning_rate": 1.620156293091091e-08, + "loss": 0.328, + "step": 44050 + }, + { + "epoch": 0.9819804926434121, + "grad_norm": 0.76844722032547, + "learning_rate": 1.600294751499587e-08, + "loss": 0.3037, + "step": 44055 + }, + { + "epoch": 0.9820919420240322, + "grad_norm": 0.965364396572113, + "learning_rate": 1.5805556060054517e-08, + "loss": 0.2984, + "step": 44060 + }, + { + "epoch": 0.9822033914046523, + "grad_norm": 0.49304118752479553, + "learning_rate": 1.5609388590286378e-08, + "loss": 0.2602, + "step": 44065 + }, + { + "epoch": 0.9823148407852723, + "grad_norm": 0.8976914882659912, + "learning_rate": 1.5414445129739998e-08, + "loss": 0.2268, + "step": 44070 + }, + { + "epoch": 0.9824262901658924, + "grad_norm": 0.7301560640335083, + "learning_rate": 1.522072570231292e-08, + "loss": 0.2517, + "step": 44075 + }, + { + "epoch": 0.9825377395465125, + "grad_norm": 1.5539482831954956, + "learning_rate": 1.5028230331753935e-08, + "loss": 0.2661, + "step": 44080 + }, + { + "epoch": 0.9826491889271325, + "grad_norm": 0.6516464352607727, + "learning_rate": 1.4836959041661935e-08, + "loss": 0.3588, + "step": 44085 + }, + { + "epoch": 0.9827606383077526, + "grad_norm": 0.5414305925369263, + "learning_rate": 1.4646911855484836e-08, + "loss": 0.2876, + "step": 44090 + }, + { + "epoch": 0.9828720876883726, + "grad_norm": 0.52645343542099, + "learning_rate": 1.4458088796521775e-08, + "loss": 0.2282, + "step": 44095 + }, + { + "epoch": 0.9829835370689928, + "grad_norm": 0.4100842773914337, + "learning_rate": 1.4270489887919792e-08, + "loss": 0.3874, + "step": 44100 + }, + { + "epoch": 0.9830949864496128, + "grad_norm": 0.6244540214538574, + "learning_rate": 1.4084115152679379e-08, + "loss": 0.2705, + "step": 44105 + }, + { + "epoch": 0.9832064358302329, + "grad_norm": 0.6012434959411621, + "learning_rate": 1.3898964613645593e-08, + "loss": 0.3067, + "step": 44110 + }, + { + "epoch": 0.983317885210853, + "grad_norm": 0.5650272369384766, + "learning_rate": 1.3715038293518057e-08, + "loss": 0.3253, + "step": 44115 + }, + { + "epoch": 0.983429334591473, + "grad_norm": 0.7158342003822327, + "learning_rate": 1.3532336214844288e-08, + "loss": 0.3328, + "step": 44120 + }, + { + "epoch": 0.9835407839720931, + "grad_norm": 0.40258294343948364, + "learning_rate": 1.3350858400023036e-08, + "loss": 0.3134, + "step": 44125 + }, + { + "epoch": 0.9836522333527131, + "grad_norm": 0.8680897355079651, + "learning_rate": 1.3170604871300951e-08, + "loss": 0.3529, + "step": 44130 + }, + { + "epoch": 0.9837636827333333, + "grad_norm": 1.1211825609207153, + "learning_rate": 1.2991575650777021e-08, + "loss": 0.2747, + "step": 44135 + }, + { + "epoch": 0.9838751321139533, + "grad_norm": 0.41539865732192993, + "learning_rate": 1.2813770760397027e-08, + "loss": 0.3315, + "step": 44140 + }, + { + "epoch": 0.9839865814945733, + "grad_norm": 0.3326054811477661, + "learning_rate": 1.2637190221960193e-08, + "loss": 0.2775, + "step": 44145 + }, + { + "epoch": 0.9840980308751934, + "grad_norm": 0.5752367377281189, + "learning_rate": 1.2461834057112543e-08, + "loss": 0.32, + "step": 44150 + }, + { + "epoch": 0.9842094802558135, + "grad_norm": 0.6004331111907959, + "learning_rate": 1.2287702287352432e-08, + "loss": 0.2507, + "step": 44155 + }, + { + "epoch": 0.9843209296364336, + "grad_norm": 0.396384596824646, + "learning_rate": 1.211479493402723e-08, + "loss": 0.1781, + "step": 44160 + }, + { + "epoch": 0.9844323790170536, + "grad_norm": 1.2847353219985962, + "learning_rate": 1.1943112018332204e-08, + "loss": 0.1996, + "step": 44165 + }, + { + "epoch": 0.9845438283976736, + "grad_norm": 0.5534709692001343, + "learning_rate": 1.1772653561317183e-08, + "loss": 0.2638, + "step": 44170 + }, + { + "epoch": 0.9846552777782938, + "grad_norm": 1.1106151342391968, + "learning_rate": 1.1603419583876564e-08, + "loss": 0.2106, + "step": 44175 + }, + { + "epoch": 0.9847667271589138, + "grad_norm": 0.5935094952583313, + "learning_rate": 1.1435410106758194e-08, + "loss": 0.1783, + "step": 44180 + }, + { + "epoch": 0.9848781765395339, + "grad_norm": 0.6666005253791809, + "learning_rate": 1.1268625150558931e-08, + "loss": 0.1647, + "step": 44185 + }, + { + "epoch": 0.9849896259201539, + "grad_norm": 0.5934476852416992, + "learning_rate": 1.1103064735725755e-08, + "loss": 0.2673, + "step": 44190 + }, + { + "epoch": 0.9851010753007741, + "grad_norm": 0.2453710436820984, + "learning_rate": 1.0938728882553539e-08, + "loss": 0.2299, + "step": 44195 + }, + { + "epoch": 0.9852125246813941, + "grad_norm": 0.5756257176399231, + "learning_rate": 1.0775617611189504e-08, + "loss": 0.2414, + "step": 44200 + }, + { + "epoch": 0.9853239740620141, + "grad_norm": 0.19844235479831696, + "learning_rate": 1.0613730941629875e-08, + "loss": 0.2291, + "step": 44205 + }, + { + "epoch": 0.9854354234426342, + "grad_norm": 0.5764844417572021, + "learning_rate": 1.0453068893720998e-08, + "loss": 0.2601, + "step": 44210 + }, + { + "epoch": 0.9855468728232543, + "grad_norm": 0.5215807557106018, + "learning_rate": 1.0293631487157119e-08, + "loss": 0.3564, + "step": 44215 + }, + { + "epoch": 0.9856583222038744, + "grad_norm": 0.506155788898468, + "learning_rate": 1.0135418741487046e-08, + "loss": 0.1787, + "step": 44220 + }, + { + "epoch": 0.9857697715844944, + "grad_norm": 0.3792957663536072, + "learning_rate": 9.978430676103047e-09, + "loss": 0.2132, + "step": 44225 + }, + { + "epoch": 0.9858812209651144, + "grad_norm": 0.5018007755279541, + "learning_rate": 9.822667310253054e-09, + "loss": 0.2947, + "step": 44230 + }, + { + "epoch": 0.9859926703457346, + "grad_norm": 0.5258249044418335, + "learning_rate": 9.668128663031795e-09, + "loss": 0.3304, + "step": 44235 + }, + { + "epoch": 0.9861041197263546, + "grad_norm": 0.6357218623161316, + "learning_rate": 9.514814753385226e-09, + "loss": 0.2847, + "step": 44240 + }, + { + "epoch": 0.9862155691069747, + "grad_norm": 0.7918208837509155, + "learning_rate": 9.362725600106094e-09, + "loss": 0.2303, + "step": 44245 + }, + { + "epoch": 0.9863270184875947, + "grad_norm": 0.6385669708251953, + "learning_rate": 9.2118612218417e-09, + "loss": 0.2983, + "step": 44250 + }, + { + "epoch": 0.9864384678682149, + "grad_norm": 0.8675650954246521, + "learning_rate": 9.062221637086143e-09, + "loss": 0.2412, + "step": 44255 + }, + { + "epoch": 0.9865499172488349, + "grad_norm": 0.5685170888900757, + "learning_rate": 8.913806864183638e-09, + "loss": 0.2755, + "step": 44260 + }, + { + "epoch": 0.9866613666294549, + "grad_norm": 1.0873620510101318, + "learning_rate": 8.76661692132963e-09, + "loss": 0.3299, + "step": 44265 + }, + { + "epoch": 0.986772816010075, + "grad_norm": 0.7357842326164246, + "learning_rate": 8.620651826567461e-09, + "loss": 0.3163, + "step": 44270 + }, + { + "epoch": 0.9868842653906951, + "grad_norm": 0.8572162389755249, + "learning_rate": 8.475911597792818e-09, + "loss": 0.268, + "step": 44275 + }, + { + "epoch": 0.9869957147713152, + "grad_norm": 0.34092795848846436, + "learning_rate": 8.332396252747066e-09, + "loss": 0.2398, + "step": 44280 + }, + { + "epoch": 0.9871071641519352, + "grad_norm": 0.5856196284294128, + "learning_rate": 8.190105809026127e-09, + "loss": 0.3095, + "step": 44285 + }, + { + "epoch": 0.9872186135325552, + "grad_norm": 0.6363946795463562, + "learning_rate": 8.049040284073828e-09, + "loss": 0.2011, + "step": 44290 + }, + { + "epoch": 0.9873300629131754, + "grad_norm": 0.6171948313713074, + "learning_rate": 7.909199695183e-09, + "loss": 0.356, + "step": 44295 + }, + { + "epoch": 0.9874415122937954, + "grad_norm": 0.5732865929603577, + "learning_rate": 7.77058405949771e-09, + "loss": 0.3505, + "step": 44300 + }, + { + "epoch": 0.9875529616744155, + "grad_norm": 0.724398672580719, + "learning_rate": 7.633193394009919e-09, + "loss": 0.1241, + "step": 44305 + }, + { + "epoch": 0.9876644110550356, + "grad_norm": 0.7275500893592834, + "learning_rate": 7.49702771556282e-09, + "loss": 0.277, + "step": 44310 + }, + { + "epoch": 0.9877758604356557, + "grad_norm": 0.48448437452316284, + "learning_rate": 7.36208704085084e-09, + "loss": 0.1904, + "step": 44315 + }, + { + "epoch": 0.9878873098162757, + "grad_norm": 0.7425686120986938, + "learning_rate": 7.228371386415189e-09, + "loss": 0.3454, + "step": 44320 + }, + { + "epoch": 0.9879987591968957, + "grad_norm": 0.725597083568573, + "learning_rate": 7.095880768649422e-09, + "loss": 0.3361, + "step": 44325 + }, + { + "epoch": 0.9881102085775159, + "grad_norm": 0.9583325386047363, + "learning_rate": 6.9646152037949884e-09, + "loss": 0.2904, + "step": 44330 + }, + { + "epoch": 0.9882216579581359, + "grad_norm": 0.633183479309082, + "learning_rate": 6.834574707943464e-09, + "loss": 0.451, + "step": 44335 + }, + { + "epoch": 0.988333107338756, + "grad_norm": 0.7976347208023071, + "learning_rate": 6.705759297038761e-09, + "loss": 0.2779, + "step": 44340 + }, + { + "epoch": 0.988444556719376, + "grad_norm": 0.5813356637954712, + "learning_rate": 6.57816898687158e-09, + "loss": 0.2151, + "step": 44345 + }, + { + "epoch": 0.988556006099996, + "grad_norm": 0.7804527282714844, + "learning_rate": 6.451803793082745e-09, + "loss": 0.2853, + "step": 44350 + }, + { + "epoch": 0.9886674554806162, + "grad_norm": 0.839613676071167, + "learning_rate": 6.3266637311654164e-09, + "loss": 0.2664, + "step": 44355 + }, + { + "epoch": 0.9887789048612362, + "grad_norm": 0.3687819540500641, + "learning_rate": 6.202748816458437e-09, + "loss": 0.2546, + "step": 44360 + }, + { + "epoch": 0.9888903542418563, + "grad_norm": 0.5957117676734924, + "learning_rate": 6.080059064155208e-09, + "loss": 0.3163, + "step": 44365 + }, + { + "epoch": 0.9890018036224764, + "grad_norm": 0.45797082781791687, + "learning_rate": 5.958594489295921e-09, + "loss": 0.2898, + "step": 44370 + }, + { + "epoch": 0.9891132530030964, + "grad_norm": 0.7213388085365295, + "learning_rate": 5.8383551067697774e-09, + "loss": 0.1246, + "step": 44375 + }, + { + "epoch": 0.9892247023837165, + "grad_norm": 0.46411189436912537, + "learning_rate": 5.719340931318318e-09, + "loss": 0.3271, + "step": 44380 + }, + { + "epoch": 0.9893361517643365, + "grad_norm": 0.6343370676040649, + "learning_rate": 5.6015519775320935e-09, + "loss": 0.1754, + "step": 44385 + }, + { + "epoch": 0.9894476011449567, + "grad_norm": 0.2115870714187622, + "learning_rate": 5.484988259850665e-09, + "loss": 0.2141, + "step": 44390 + }, + { + "epoch": 0.9895590505255767, + "grad_norm": 0.626330554485321, + "learning_rate": 5.369649792563714e-09, + "loss": 0.3045, + "step": 44395 + }, + { + "epoch": 0.9896704999061968, + "grad_norm": 0.6680053472518921, + "learning_rate": 5.255536589811039e-09, + "loss": 0.2344, + "step": 44400 + }, + { + "epoch": 0.9897819492868168, + "grad_norm": 0.4205387532711029, + "learning_rate": 5.142648665581451e-09, + "loss": 0.2797, + "step": 44405 + }, + { + "epoch": 0.9898933986674369, + "grad_norm": 0.6423759460449219, + "learning_rate": 5.030986033714991e-09, + "loss": 0.3676, + "step": 44410 + }, + { + "epoch": 0.990004848048057, + "grad_norm": 0.3676292598247528, + "learning_rate": 4.920548707900707e-09, + "loss": 0.2164, + "step": 44415 + }, + { + "epoch": 0.990116297428677, + "grad_norm": 0.5738866925239563, + "learning_rate": 4.811336701676661e-09, + "loss": 0.2332, + "step": 44420 + }, + { + "epoch": 0.9902277468092971, + "grad_norm": 0.475180447101593, + "learning_rate": 4.703350028432141e-09, + "loss": 0.1915, + "step": 44425 + }, + { + "epoch": 0.9903391961899172, + "grad_norm": 0.811890184879303, + "learning_rate": 4.596588701404336e-09, + "loss": 0.3585, + "step": 44430 + }, + { + "epoch": 0.9904506455705372, + "grad_norm": 0.5211836695671082, + "learning_rate": 4.491052733682777e-09, + "loss": 0.2742, + "step": 44435 + }, + { + "epoch": 0.9905620949511573, + "grad_norm": 1.0613921880722046, + "learning_rate": 4.386742138203781e-09, + "loss": 0.1681, + "step": 44440 + }, + { + "epoch": 0.9906735443317773, + "grad_norm": 0.4034428596496582, + "learning_rate": 4.283656927757119e-09, + "loss": 0.2306, + "step": 44445 + }, + { + "epoch": 0.9907849937123975, + "grad_norm": 0.4636722505092621, + "learning_rate": 4.181797114978236e-09, + "loss": 0.2559, + "step": 44450 + }, + { + "epoch": 0.9908964430930175, + "grad_norm": 0.7295843362808228, + "learning_rate": 4.081162712354924e-09, + "loss": 0.2294, + "step": 44455 + }, + { + "epoch": 0.9910078924736376, + "grad_norm": 0.5972443222999573, + "learning_rate": 3.981753732225091e-09, + "loss": 0.1937, + "step": 44460 + }, + { + "epoch": 0.9911193418542577, + "grad_norm": 0.43710991740226746, + "learning_rate": 3.883570186774543e-09, + "loss": 0.2792, + "step": 44465 + }, + { + "epoch": 0.9912307912348777, + "grad_norm": 0.8411343097686768, + "learning_rate": 3.78661208804032e-09, + "loss": 0.2378, + "step": 44470 + }, + { + "epoch": 0.9913422406154978, + "grad_norm": 0.7043887972831726, + "learning_rate": 3.6908794479084687e-09, + "loss": 0.2732, + "step": 44475 + }, + { + "epoch": 0.9914536899961178, + "grad_norm": 0.6706881523132324, + "learning_rate": 3.5963722781151568e-09, + "loss": 0.2515, + "step": 44480 + }, + { + "epoch": 0.991565139376738, + "grad_norm": 0.6557788252830505, + "learning_rate": 3.5030905902455615e-09, + "loss": 0.2542, + "step": 44485 + }, + { + "epoch": 0.991676588757358, + "grad_norm": 0.7045385837554932, + "learning_rate": 3.4110343957360904e-09, + "loss": 0.2377, + "step": 44490 + }, + { + "epoch": 0.991788038137978, + "grad_norm": 0.5724929571151733, + "learning_rate": 3.3202037058732707e-09, + "loss": 0.2809, + "step": 44495 + }, + { + "epoch": 0.9918994875185981, + "grad_norm": 0.7064734697341919, + "learning_rate": 3.2305985317893086e-09, + "loss": 0.3664, + "step": 44500 + }, + { + "epoch": 0.9920109368992182, + "grad_norm": 0.8698769211769104, + "learning_rate": 3.142218884472081e-09, + "loss": 0.1696, + "step": 44505 + }, + { + "epoch": 0.9921223862798383, + "grad_norm": 0.723349392414093, + "learning_rate": 3.0550647747540352e-09, + "loss": 0.2655, + "step": 44510 + }, + { + "epoch": 0.9922338356604583, + "grad_norm": 0.5983314514160156, + "learning_rate": 2.9691362133210667e-09, + "loss": 0.3201, + "step": 44515 + }, + { + "epoch": 0.9923452850410783, + "grad_norm": 0.3199304938316345, + "learning_rate": 2.8844332107058615e-09, + "loss": 0.2649, + "step": 44520 + }, + { + "epoch": 0.9924567344216985, + "grad_norm": 0.6981607675552368, + "learning_rate": 2.800955777293446e-09, + "loss": 0.3955, + "step": 44525 + }, + { + "epoch": 0.9925681838023185, + "grad_norm": 1.2841216325759888, + "learning_rate": 2.718703923317856e-09, + "loss": 0.1887, + "step": 44530 + }, + { + "epoch": 0.9926796331829386, + "grad_norm": 0.7746221423149109, + "learning_rate": 2.637677658862137e-09, + "loss": 0.2562, + "step": 44535 + }, + { + "epoch": 0.9927910825635586, + "grad_norm": 0.929040253162384, + "learning_rate": 2.557876993859454e-09, + "loss": 0.3289, + "step": 44540 + }, + { + "epoch": 0.9929025319441788, + "grad_norm": 0.49944812059402466, + "learning_rate": 2.4793019380919825e-09, + "loss": 0.3591, + "step": 44545 + }, + { + "epoch": 0.9930139813247988, + "grad_norm": 0.46589112281799316, + "learning_rate": 2.4019525011931277e-09, + "loss": 0.1766, + "step": 44550 + }, + { + "epoch": 0.9931254307054188, + "grad_norm": 0.7392093539237976, + "learning_rate": 2.3258286926453043e-09, + "loss": 0.345, + "step": 44555 + }, + { + "epoch": 0.9932368800860389, + "grad_norm": 0.6125676035881042, + "learning_rate": 2.2509305217810473e-09, + "loss": 0.2584, + "step": 44560 + }, + { + "epoch": 0.993348329466659, + "grad_norm": 0.7500099539756775, + "learning_rate": 2.177257997781901e-09, + "loss": 0.238, + "step": 44565 + }, + { + "epoch": 0.9934597788472791, + "grad_norm": 0.3287777900695801, + "learning_rate": 2.1048111296795293e-09, + "loss": 0.1774, + "step": 44570 + }, + { + "epoch": 0.9935712282278991, + "grad_norm": 0.5603126883506775, + "learning_rate": 2.0335899263546065e-09, + "loss": 0.1842, + "step": 44575 + }, + { + "epoch": 0.9936826776085191, + "grad_norm": 0.8498858213424683, + "learning_rate": 1.963594396540147e-09, + "loss": 0.3448, + "step": 44580 + }, + { + "epoch": 0.9937941269891393, + "grad_norm": 0.7383705377578735, + "learning_rate": 1.8948245488159543e-09, + "loss": 0.365, + "step": 44585 + }, + { + "epoch": 0.9939055763697593, + "grad_norm": 0.6081027388572693, + "learning_rate": 1.8272803916119519e-09, + "loss": 0.2639, + "step": 44590 + }, + { + "epoch": 0.9940170257503794, + "grad_norm": 0.5102843642234802, + "learning_rate": 1.7609619332104034e-09, + "loss": 0.2042, + "step": 44595 + }, + { + "epoch": 0.9941284751309994, + "grad_norm": 0.9159148335456848, + "learning_rate": 1.6958691817392514e-09, + "loss": 0.3071, + "step": 44600 + }, + { + "epoch": 0.9942399245116196, + "grad_norm": 0.5763835906982422, + "learning_rate": 1.6320021451798895e-09, + "loss": 0.2104, + "step": 44605 + }, + { + "epoch": 0.9943513738922396, + "grad_norm": 0.46994051337242126, + "learning_rate": 1.5693608313616104e-09, + "loss": 0.2715, + "step": 44610 + }, + { + "epoch": 0.9944628232728596, + "grad_norm": 0.6035696268081665, + "learning_rate": 1.5079452479638268e-09, + "loss": 0.3726, + "step": 44615 + }, + { + "epoch": 0.9945742726534798, + "grad_norm": 0.8437037467956543, + "learning_rate": 1.447755402514961e-09, + "loss": 0.3022, + "step": 44620 + }, + { + "epoch": 0.9946857220340998, + "grad_norm": 0.7784526944160461, + "learning_rate": 1.3887913023946652e-09, + "loss": 0.284, + "step": 44625 + }, + { + "epoch": 0.9947971714147199, + "grad_norm": 0.7864732146263123, + "learning_rate": 1.331052954831602e-09, + "loss": 0.2185, + "step": 44630 + }, + { + "epoch": 0.9949086207953399, + "grad_norm": 0.5367758274078369, + "learning_rate": 1.2745403669023327e-09, + "loss": 0.2567, + "step": 44635 + }, + { + "epoch": 0.99502007017596, + "grad_norm": 0.4071984887123108, + "learning_rate": 1.2192535455368692e-09, + "loss": 0.2429, + "step": 44640 + }, + { + "epoch": 0.9951315195565801, + "grad_norm": 0.656913697719574, + "learning_rate": 1.1651924975120133e-09, + "loss": 0.2744, + "step": 44645 + }, + { + "epoch": 0.9952429689372001, + "grad_norm": 0.7914825677871704, + "learning_rate": 1.112357229455796e-09, + "loss": 0.2515, + "step": 44650 + }, + { + "epoch": 0.9953544183178202, + "grad_norm": 1.119311809539795, + "learning_rate": 1.0607477478452588e-09, + "loss": 0.3854, + "step": 44655 + }, + { + "epoch": 0.9954658676984403, + "grad_norm": 0.5164552927017212, + "learning_rate": 1.0103640590064524e-09, + "loss": 0.2533, + "step": 44660 + }, + { + "epoch": 0.9955773170790604, + "grad_norm": 0.962119460105896, + "learning_rate": 9.612061691166575e-10, + "loss": 0.2001, + "step": 44665 + }, + { + "epoch": 0.9956887664596804, + "grad_norm": 0.793340265750885, + "learning_rate": 9.132740842021647e-10, + "loss": 0.3941, + "step": 44670 + }, + { + "epoch": 0.9958002158403004, + "grad_norm": 1.0190134048461914, + "learning_rate": 8.665678101393848e-10, + "loss": 0.2366, + "step": 44675 + }, + { + "epoch": 0.9959116652209206, + "grad_norm": 1.1671634912490845, + "learning_rate": 8.210873526537378e-10, + "loss": 0.2662, + "step": 44680 + }, + { + "epoch": 0.9960231146015406, + "grad_norm": 0.5475580096244812, + "learning_rate": 7.768327173207635e-10, + "loss": 0.2246, + "step": 44685 + }, + { + "epoch": 0.9961345639821607, + "grad_norm": 0.8026213049888611, + "learning_rate": 7.338039095672323e-10, + "loss": 0.1997, + "step": 44690 + }, + { + "epoch": 0.9962460133627807, + "grad_norm": 0.7829786539077759, + "learning_rate": 6.920009346655931e-10, + "loss": 0.3269, + "step": 44695 + }, + { + "epoch": 0.9963574627434008, + "grad_norm": 0.6256038546562195, + "learning_rate": 6.514237977417459e-10, + "loss": 0.2818, + "step": 44700 + }, + { + "epoch": 0.9964689121240209, + "grad_norm": 0.5961411595344543, + "learning_rate": 6.120725037706e-10, + "loss": 0.2174, + "step": 44705 + }, + { + "epoch": 0.9965803615046409, + "grad_norm": 0.7228636741638184, + "learning_rate": 5.739470575760742e-10, + "loss": 0.309, + "step": 44710 + }, + { + "epoch": 0.996691810885261, + "grad_norm": 0.29328784346580505, + "learning_rate": 5.370474638322076e-10, + "loss": 0.2438, + "step": 44715 + }, + { + "epoch": 0.9968032602658811, + "grad_norm": 0.9593527913093567, + "learning_rate": 5.013737270620489e-10, + "loss": 0.2308, + "step": 44720 + }, + { + "epoch": 0.9969147096465011, + "grad_norm": 1.333426594734192, + "learning_rate": 4.669258516387665e-10, + "loss": 0.3419, + "step": 44725 + }, + { + "epoch": 0.9970261590271212, + "grad_norm": 1.4894150495529175, + "learning_rate": 4.3370384178564875e-10, + "loss": 0.2652, + "step": 44730 + }, + { + "epoch": 0.9971376084077412, + "grad_norm": 0.6049108505249023, + "learning_rate": 4.0170770157610393e-10, + "loss": 0.2449, + "step": 44735 + }, + { + "epoch": 0.9972490577883614, + "grad_norm": 0.48116618394851685, + "learning_rate": 3.709374349325501e-10, + "loss": 0.2508, + "step": 44740 + }, + { + "epoch": 0.9973605071689814, + "grad_norm": 0.4338245987892151, + "learning_rate": 3.4139304562641474e-10, + "loss": 0.2205, + "step": 44745 + }, + { + "epoch": 0.9974719565496015, + "grad_norm": 0.7536609172821045, + "learning_rate": 3.130745372803556e-10, + "loss": 0.2546, + "step": 44750 + }, + { + "epoch": 0.9975834059302215, + "grad_norm": 0.8969931602478027, + "learning_rate": 2.8598191336492995e-10, + "loss": 0.2604, + "step": 44755 + }, + { + "epoch": 0.9976948553108416, + "grad_norm": 0.6707092523574829, + "learning_rate": 2.6011517720192505e-10, + "loss": 0.223, + "step": 44760 + }, + { + "epoch": 0.9978063046914617, + "grad_norm": 0.6238420605659485, + "learning_rate": 2.3547433196324796e-10, + "loss": 0.27, + "step": 44765 + }, + { + "epoch": 0.9979177540720817, + "grad_norm": 0.5326305627822876, + "learning_rate": 2.1205938066870547e-10, + "loss": 0.2645, + "step": 44770 + }, + { + "epoch": 0.9980292034527019, + "grad_norm": 0.4190085828304291, + "learning_rate": 1.898703261893342e-10, + "loss": 0.1725, + "step": 44775 + }, + { + "epoch": 0.9981406528333219, + "grad_norm": 0.866177499294281, + "learning_rate": 1.6890717124629085e-10, + "loss": 0.3569, + "step": 44780 + }, + { + "epoch": 0.9982521022139419, + "grad_norm": 0.6426597237586975, + "learning_rate": 1.4916991840641103e-10, + "loss": 0.2912, + "step": 44785 + }, + { + "epoch": 0.998363551594562, + "grad_norm": 0.8013852834701538, + "learning_rate": 1.3065857009331161e-10, + "loss": 0.2824, + "step": 44790 + }, + { + "epoch": 0.998475000975182, + "grad_norm": 0.599602222442627, + "learning_rate": 1.133731285729578e-10, + "loss": 0.2392, + "step": 44795 + }, + { + "epoch": 0.9985864503558022, + "grad_norm": 0.8479488492012024, + "learning_rate": 9.731359596587552e-11, + "loss": 0.4189, + "step": 44800 + }, + { + "epoch": 0.9986978997364222, + "grad_norm": 0.7829699516296387, + "learning_rate": 8.247997424049026e-11, + "loss": 0.2198, + "step": 44805 + }, + { + "epoch": 0.9988093491170423, + "grad_norm": 0.5513267517089844, + "learning_rate": 6.887226521645751e-11, + "loss": 0.2213, + "step": 44810 + }, + { + "epoch": 0.9989207984976624, + "grad_norm": 0.47978201508522034, + "learning_rate": 5.649047056022206e-11, + "loss": 0.2498, + "step": 44815 + }, + { + "epoch": 0.9990322478782824, + "grad_norm": 0.4485434591770172, + "learning_rate": 4.5334591791679204e-11, + "loss": 0.2715, + "step": 44820 + }, + { + "epoch": 0.9991436972589025, + "grad_norm": 0.5298421382904053, + "learning_rate": 3.5404630276403286e-11, + "loss": 0.3027, + "step": 44825 + }, + { + "epoch": 0.9992551466395225, + "grad_norm": 0.6424366235733032, + "learning_rate": 2.670058723230895e-11, + "loss": 0.3339, + "step": 44830 + }, + { + "epoch": 0.9993665960201427, + "grad_norm": 0.5604028701782227, + "learning_rate": 1.922246372743075e-11, + "loss": 0.2231, + "step": 44835 + }, + { + "epoch": 0.9994780454007627, + "grad_norm": 0.5397953391075134, + "learning_rate": 1.2970260677702684e-11, + "loss": 0.2282, + "step": 44840 + }, + { + "epoch": 0.9995894947813827, + "grad_norm": 0.8372544646263123, + "learning_rate": 7.943978850288859e-12, + "loss": 0.3187, + "step": 44845 + }, + { + "epoch": 0.9997009441620028, + "grad_norm": 0.8007095456123352, + "learning_rate": 4.143618860252829e-12, + "loss": 0.2664, + "step": 44850 + }, + { + "epoch": 0.9998123935426229, + "grad_norm": 0.5296596884727478, + "learning_rate": 1.569181173888268e-12, + "loss": 0.1991, + "step": 44855 + }, + { + "epoch": 0.999923842923243, + "grad_norm": 0.5373436212539673, + "learning_rate": 2.2066610760873575e-13, + "loss": 0.3511, + "step": 44860 + }, + { + "epoch": 0.999990712551615, + "step": 44863, + "total_flos": 6.904376768783725e+19, + "train_loss": 0.32993994381826713, + "train_runtime": 381562.2902, + "train_samples_per_second": 2.822, + "train_steps_per_second": 0.118 + } + ], + "logging_steps": 5, + "max_steps": 44863, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.904376768783725e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}