{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9995261786306564, "eval_steps": 500, "global_step": 6330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003158809128958383, "grad_norm": 0.2773555225933436, "learning_rate": 1.5797788309636651e-06, "loss": 1.8216, "step": 1 }, { "epoch": 0.0015794045644791914, "grad_norm": 0.28931175093832445, "learning_rate": 7.898894154818326e-06, "loss": 1.7537, "step": 5 }, { "epoch": 0.003158809128958383, "grad_norm": 0.2732692097833901, "learning_rate": 1.579778830963665e-05, "loss": 1.8079, "step": 10 }, { "epoch": 0.004738213693437574, "grad_norm": 0.16969401949112947, "learning_rate": 2.3696682464454976e-05, "loss": 1.777, "step": 15 }, { "epoch": 0.006317618257916766, "grad_norm": 0.20100238045547714, "learning_rate": 3.15955766192733e-05, "loss": 1.7661, "step": 20 }, { "epoch": 0.007897022822395957, "grad_norm": 0.15474933173037578, "learning_rate": 3.949447077409163e-05, "loss": 1.7012, "step": 25 }, { "epoch": 0.009476427386875147, "grad_norm": 0.17052763447912428, "learning_rate": 4.739336492890995e-05, "loss": 1.6647, "step": 30 }, { "epoch": 0.01105583195135434, "grad_norm": 0.12988597445968217, "learning_rate": 5.529225908372828e-05, "loss": 1.6729, "step": 35 }, { "epoch": 0.012635236515833531, "grad_norm": 0.1199377925438811, "learning_rate": 6.31911532385466e-05, "loss": 1.6219, "step": 40 }, { "epoch": 0.014214641080312722, "grad_norm": 0.08737660996471332, "learning_rate": 7.109004739336493e-05, "loss": 1.6271, "step": 45 }, { "epoch": 0.015794045644791914, "grad_norm": 0.07589192826931294, "learning_rate": 7.898894154818326e-05, "loss": 1.5991, "step": 50 }, { "epoch": 0.017373450209271106, "grad_norm": 0.07856530121275355, "learning_rate": 8.688783570300159e-05, "loss": 1.5694, "step": 55 }, { "epoch": 0.018952854773750295, "grad_norm": 0.0669785540989484, "learning_rate": 9.47867298578199e-05, "loss": 1.6004, "step": 60 }, { "epoch": 0.020532259338229487, "grad_norm": 0.062045435412763444, "learning_rate": 0.00010268562401263823, "loss": 1.4969, "step": 65 }, { "epoch": 0.02211166390270868, "grad_norm": 0.06814622888734555, "learning_rate": 0.00011058451816745656, "loss": 1.5789, "step": 70 }, { "epoch": 0.02369106846718787, "grad_norm": 0.057311280482936794, "learning_rate": 0.00011848341232227489, "loss": 1.5165, "step": 75 }, { "epoch": 0.025270473031667063, "grad_norm": 0.06001367630228125, "learning_rate": 0.0001263823064770932, "loss": 1.5137, "step": 80 }, { "epoch": 0.02684987759614625, "grad_norm": 0.059971236509021995, "learning_rate": 0.00013428120063191154, "loss": 1.5099, "step": 85 }, { "epoch": 0.028429282160625444, "grad_norm": 0.060915181546327565, "learning_rate": 0.00014218009478672987, "loss": 1.5098, "step": 90 }, { "epoch": 0.030008686725104636, "grad_norm": 0.061126490440117635, "learning_rate": 0.0001500789889415482, "loss": 1.4801, "step": 95 }, { "epoch": 0.03158809128958383, "grad_norm": 0.059798143534691414, "learning_rate": 0.00015797788309636652, "loss": 1.4809, "step": 100 }, { "epoch": 0.03316749585406302, "grad_norm": 0.05644578327017066, "learning_rate": 0.00016587677725118482, "loss": 1.5089, "step": 105 }, { "epoch": 0.03474690041854221, "grad_norm": 0.060839631983083176, "learning_rate": 0.00017377567140600318, "loss": 1.4598, "step": 110 }, { "epoch": 0.036326304983021404, "grad_norm": 0.059781523401543135, "learning_rate": 0.00018167456556082148, "loss": 1.4477, "step": 115 }, { "epoch": 0.03790570954750059, "grad_norm": 0.06298024185932508, "learning_rate": 0.0001895734597156398, "loss": 1.5418, "step": 120 }, { "epoch": 0.03948511411197978, "grad_norm": 0.05490407645919546, "learning_rate": 0.00019747235387045813, "loss": 1.5063, "step": 125 }, { "epoch": 0.04106451867645897, "grad_norm": 0.06132689315356882, "learning_rate": 0.00020537124802527646, "loss": 1.4772, "step": 130 }, { "epoch": 0.042643923240938165, "grad_norm": 0.05780240486687013, "learning_rate": 0.0002132701421800948, "loss": 1.4602, "step": 135 }, { "epoch": 0.04422332780541736, "grad_norm": 0.05841083271675262, "learning_rate": 0.00022116903633491312, "loss": 1.5587, "step": 140 }, { "epoch": 0.04580273236989655, "grad_norm": 0.059087762043676245, "learning_rate": 0.00022906793048973144, "loss": 1.3773, "step": 145 }, { "epoch": 0.04738213693437574, "grad_norm": 0.05824617568035495, "learning_rate": 0.00023696682464454977, "loss": 1.4704, "step": 150 }, { "epoch": 0.048961541498854934, "grad_norm": 0.059797325410886844, "learning_rate": 0.0002448657187993681, "loss": 1.4483, "step": 155 }, { "epoch": 0.050540946063334126, "grad_norm": 0.061811932232945614, "learning_rate": 0.0002527646129541864, "loss": 1.404, "step": 160 }, { "epoch": 0.05212035062781331, "grad_norm": 0.055271302768420814, "learning_rate": 0.00026066350710900475, "loss": 1.4331, "step": 165 }, { "epoch": 0.0536997551922925, "grad_norm": 0.06127973155310494, "learning_rate": 0.0002685624012638231, "loss": 1.3906, "step": 170 }, { "epoch": 0.055279159756771695, "grad_norm": 0.0611656198667157, "learning_rate": 0.0002764612954186414, "loss": 1.3621, "step": 175 }, { "epoch": 0.05685856432125089, "grad_norm": 0.06544860435333785, "learning_rate": 0.00028436018957345974, "loss": 1.4101, "step": 180 }, { "epoch": 0.05843796888573008, "grad_norm": 0.057479587488975165, "learning_rate": 0.000292259083728278, "loss": 1.41, "step": 185 }, { "epoch": 0.06001737345020927, "grad_norm": 0.057313553181093345, "learning_rate": 0.0003001579778830964, "loss": 1.4915, "step": 190 }, { "epoch": 0.061596778014688464, "grad_norm": 0.08419773115132484, "learning_rate": 0.0003080568720379147, "loss": 1.4153, "step": 195 }, { "epoch": 0.06317618257916766, "grad_norm": 0.06251914986845002, "learning_rate": 0.00031595576619273305, "loss": 1.4497, "step": 200 }, { "epoch": 0.06475558714364685, "grad_norm": 0.06026891089892474, "learning_rate": 0.0003238546603475513, "loss": 1.4466, "step": 205 }, { "epoch": 0.06633499170812604, "grad_norm": 0.07329324902446606, "learning_rate": 0.00033175355450236965, "loss": 1.4292, "step": 210 }, { "epoch": 0.06791439627260523, "grad_norm": 0.06967215139598891, "learning_rate": 0.000339652448657188, "loss": 1.4646, "step": 215 }, { "epoch": 0.06949380083708442, "grad_norm": 0.06627407385027922, "learning_rate": 0.00034755134281200636, "loss": 1.3833, "step": 220 }, { "epoch": 0.07107320540156362, "grad_norm": 0.07649673232088278, "learning_rate": 0.0003554502369668247, "loss": 1.4884, "step": 225 }, { "epoch": 0.07265260996604281, "grad_norm": 0.07463921725921271, "learning_rate": 0.00036334913112164296, "loss": 1.4663, "step": 230 }, { "epoch": 0.07423201453052199, "grad_norm": 0.08740975904430916, "learning_rate": 0.0003712480252764613, "loss": 1.3895, "step": 235 }, { "epoch": 0.07581141909500118, "grad_norm": 0.09955952791088979, "learning_rate": 0.0003791469194312796, "loss": 1.3993, "step": 240 }, { "epoch": 0.07739082365948037, "grad_norm": 0.08483061712241394, "learning_rate": 0.000387045813586098, "loss": 1.4088, "step": 245 }, { "epoch": 0.07897022822395956, "grad_norm": 0.07195096822204508, "learning_rate": 0.00039494470774091627, "loss": 1.3941, "step": 250 }, { "epoch": 0.08054963278843875, "grad_norm": 0.06331945696078782, "learning_rate": 0.0004028436018957346, "loss": 1.37, "step": 255 }, { "epoch": 0.08212903735291795, "grad_norm": 0.07744042629600942, "learning_rate": 0.0004107424960505529, "loss": 1.4228, "step": 260 }, { "epoch": 0.08370844191739714, "grad_norm": 0.06501691722126363, "learning_rate": 0.00041864139020537125, "loss": 1.37, "step": 265 }, { "epoch": 0.08528784648187633, "grad_norm": 0.05881123500383811, "learning_rate": 0.0004265402843601896, "loss": 1.3846, "step": 270 }, { "epoch": 0.08686725104635552, "grad_norm": 0.055619254855380676, "learning_rate": 0.0004344391785150079, "loss": 1.4792, "step": 275 }, { "epoch": 0.08844665561083472, "grad_norm": 0.06700650945881045, "learning_rate": 0.00044233807266982623, "loss": 1.4188, "step": 280 }, { "epoch": 0.09002606017531391, "grad_norm": 0.07103007405260178, "learning_rate": 0.00045023696682464456, "loss": 1.4763, "step": 285 }, { "epoch": 0.0916054647397931, "grad_norm": 0.07884224807036774, "learning_rate": 0.0004581358609794629, "loss": 1.4171, "step": 290 }, { "epoch": 0.09318486930427229, "grad_norm": 0.06372762385790577, "learning_rate": 0.0004660347551342812, "loss": 1.4223, "step": 295 }, { "epoch": 0.09476427386875148, "grad_norm": 0.06417684919113965, "learning_rate": 0.00047393364928909954, "loss": 1.4714, "step": 300 }, { "epoch": 0.09634367843323068, "grad_norm": 0.07965485308103767, "learning_rate": 0.00048183254344391787, "loss": 1.414, "step": 305 }, { "epoch": 0.09792308299770987, "grad_norm": 0.05996514681208502, "learning_rate": 0.0004897314375987362, "loss": 1.4566, "step": 310 }, { "epoch": 0.09950248756218906, "grad_norm": 0.06867669891166489, "learning_rate": 0.0004976303317535545, "loss": 1.3765, "step": 315 }, { "epoch": 0.10108189212666825, "grad_norm": 0.05650550967049639, "learning_rate": 0.0005055292259083729, "loss": 1.3332, "step": 320 }, { "epoch": 0.10266129669114744, "grad_norm": 0.06499712173827683, "learning_rate": 0.0005134281200631912, "loss": 1.4043, "step": 325 }, { "epoch": 0.10424070125562662, "grad_norm": 0.06864173048670531, "learning_rate": 0.0005213270142180095, "loss": 1.4192, "step": 330 }, { "epoch": 0.10582010582010581, "grad_norm": 0.0690995692470246, "learning_rate": 0.0005292259083728278, "loss": 1.3891, "step": 335 }, { "epoch": 0.107399510384585, "grad_norm": 0.08877428199764238, "learning_rate": 0.0005371248025276462, "loss": 1.449, "step": 340 }, { "epoch": 0.1089789149490642, "grad_norm": 0.0825843090529152, "learning_rate": 0.0005450236966824644, "loss": 1.422, "step": 345 }, { "epoch": 0.11055831951354339, "grad_norm": 0.08843979102541322, "learning_rate": 0.0005529225908372828, "loss": 1.4667, "step": 350 }, { "epoch": 0.11213772407802258, "grad_norm": 0.07425441335803369, "learning_rate": 0.0005608214849921011, "loss": 1.4271, "step": 355 }, { "epoch": 0.11371712864250177, "grad_norm": 0.07493041137307938, "learning_rate": 0.0005687203791469195, "loss": 1.4397, "step": 360 }, { "epoch": 0.11529653320698097, "grad_norm": 0.07354898138830711, "learning_rate": 0.0005766192733017378, "loss": 1.4723, "step": 365 }, { "epoch": 0.11687593777146016, "grad_norm": 0.06451005493101557, "learning_rate": 0.000584518167456556, "loss": 1.4386, "step": 370 }, { "epoch": 0.11845534233593935, "grad_norm": 0.07004067668646244, "learning_rate": 0.0005924170616113745, "loss": 1.3331, "step": 375 }, { "epoch": 0.12003474690041854, "grad_norm": 0.08317592723205765, "learning_rate": 0.0006003159557661928, "loss": 1.3373, "step": 380 }, { "epoch": 0.12161415146489774, "grad_norm": 0.06406427622918581, "learning_rate": 0.0006082148499210111, "loss": 1.4433, "step": 385 }, { "epoch": 0.12319355602937693, "grad_norm": 0.07375220849822373, "learning_rate": 0.0006161137440758294, "loss": 1.3761, "step": 390 }, { "epoch": 0.12477296059385612, "grad_norm": 0.08388225636511741, "learning_rate": 0.0006240126382306477, "loss": 1.3568, "step": 395 }, { "epoch": 0.1263523651583353, "grad_norm": 0.09579117145839189, "learning_rate": 0.0006319115323854661, "loss": 1.3979, "step": 400 }, { "epoch": 0.1279317697228145, "grad_norm": 0.07801632842935728, "learning_rate": 0.0006398104265402843, "loss": 1.4404, "step": 405 }, { "epoch": 0.1295111742872937, "grad_norm": 0.08011548413975278, "learning_rate": 0.0006477093206951026, "loss": 1.426, "step": 410 }, { "epoch": 0.1310905788517729, "grad_norm": 0.07546874573289, "learning_rate": 0.0006556082148499211, "loss": 1.4159, "step": 415 }, { "epoch": 0.13266998341625208, "grad_norm": 0.0697636738243306, "learning_rate": 0.0006635071090047393, "loss": 1.3699, "step": 420 }, { "epoch": 0.13424938798073127, "grad_norm": 0.07028527273409962, "learning_rate": 0.0006714060031595577, "loss": 1.3586, "step": 425 }, { "epoch": 0.13582879254521046, "grad_norm": 0.06882473259849126, "learning_rate": 0.000679304897314376, "loss": 1.3222, "step": 430 }, { "epoch": 0.13740819710968966, "grad_norm": 0.08484822137697375, "learning_rate": 0.0006872037914691943, "loss": 1.3952, "step": 435 }, { "epoch": 0.13898760167416885, "grad_norm": 0.0910555291089557, "learning_rate": 0.0006951026856240127, "loss": 1.3078, "step": 440 }, { "epoch": 0.14056700623864804, "grad_norm": 0.08770781117261345, "learning_rate": 0.0007030015797788309, "loss": 1.3812, "step": 445 }, { "epoch": 0.14214641080312723, "grad_norm": 0.06775592916509725, "learning_rate": 0.0007109004739336494, "loss": 1.4079, "step": 450 }, { "epoch": 0.14372581536760642, "grad_norm": 0.08321641995599154, "learning_rate": 0.0007187993680884676, "loss": 1.4374, "step": 455 }, { "epoch": 0.14530521993208562, "grad_norm": 0.09076328306358049, "learning_rate": 0.0007266982622432859, "loss": 1.3664, "step": 460 }, { "epoch": 0.1468846244965648, "grad_norm": 0.07105220146130643, "learning_rate": 0.0007345971563981043, "loss": 1.365, "step": 465 }, { "epoch": 0.14846402906104397, "grad_norm": 0.06661204538279, "learning_rate": 0.0007424960505529226, "loss": 1.3647, "step": 470 }, { "epoch": 0.15004343362552316, "grad_norm": 0.08076494416132875, "learning_rate": 0.0007503949447077409, "loss": 1.3848, "step": 475 }, { "epoch": 0.15162283819000236, "grad_norm": 0.06637462609376617, "learning_rate": 0.0007582938388625592, "loss": 1.4337, "step": 480 }, { "epoch": 0.15320224275448155, "grad_norm": 0.177827007648548, "learning_rate": 0.0007661927330173775, "loss": 1.4088, "step": 485 }, { "epoch": 0.15478164731896074, "grad_norm": 0.06840716619177854, "learning_rate": 0.000774091627172196, "loss": 1.4452, "step": 490 }, { "epoch": 0.15636105188343993, "grad_norm": 0.06338088183915264, "learning_rate": 0.0007819905213270142, "loss": 1.4294, "step": 495 }, { "epoch": 0.15794045644791913, "grad_norm": 0.06795037427055416, "learning_rate": 0.0007898894154818325, "loss": 1.367, "step": 500 }, { "epoch": 0.15951986101239832, "grad_norm": 0.07028812801350164, "learning_rate": 0.0007977883096366509, "loss": 1.4146, "step": 505 }, { "epoch": 0.1610992655768775, "grad_norm": 0.06275065691271632, "learning_rate": 0.0008056872037914692, "loss": 1.3709, "step": 510 }, { "epoch": 0.1626786701413567, "grad_norm": 0.06095474036700525, "learning_rate": 0.0008135860979462876, "loss": 1.4252, "step": 515 }, { "epoch": 0.1642580747058359, "grad_norm": 0.06035641811243823, "learning_rate": 0.0008214849921011058, "loss": 1.4558, "step": 520 }, { "epoch": 0.16583747927031509, "grad_norm": 0.06694827772265656, "learning_rate": 0.0008293838862559242, "loss": 1.3688, "step": 525 }, { "epoch": 0.16741688383479428, "grad_norm": 0.07621114996528461, "learning_rate": 0.0008372827804107425, "loss": 1.3391, "step": 530 }, { "epoch": 0.16899628839927347, "grad_norm": 0.07725227299520594, "learning_rate": 0.0008451816745655608, "loss": 1.3899, "step": 535 }, { "epoch": 0.17057569296375266, "grad_norm": 0.06900184011771379, "learning_rate": 0.0008530805687203792, "loss": 1.3511, "step": 540 }, { "epoch": 0.17215509752823185, "grad_norm": 0.07836093646329446, "learning_rate": 0.0008609794628751975, "loss": 1.3675, "step": 545 }, { "epoch": 0.17373450209271105, "grad_norm": 0.07944811185147413, "learning_rate": 0.0008688783570300158, "loss": 1.4324, "step": 550 }, { "epoch": 0.17531390665719024, "grad_norm": 0.09268750968081797, "learning_rate": 0.0008767772511848341, "loss": 1.4052, "step": 555 }, { "epoch": 0.17689331122166943, "grad_norm": 0.09150623169103793, "learning_rate": 0.0008846761453396525, "loss": 1.517, "step": 560 }, { "epoch": 0.17847271578614862, "grad_norm": 0.08983040105938492, "learning_rate": 0.0008925750394944708, "loss": 1.4744, "step": 565 }, { "epoch": 0.18005212035062781, "grad_norm": 0.08062631056999027, "learning_rate": 0.0009004739336492891, "loss": 1.384, "step": 570 }, { "epoch": 0.181631524915107, "grad_norm": 0.08813077900018314, "learning_rate": 0.0009083728278041074, "loss": 1.4311, "step": 575 }, { "epoch": 0.1832109294795862, "grad_norm": 0.08512263474558772, "learning_rate": 0.0009162717219589258, "loss": 1.4508, "step": 580 }, { "epoch": 0.1847903340440654, "grad_norm": 0.0740509763992527, "learning_rate": 0.0009241706161137441, "loss": 1.3657, "step": 585 }, { "epoch": 0.18636973860854458, "grad_norm": 0.07739388026554837, "learning_rate": 0.0009320695102685624, "loss": 1.3532, "step": 590 }, { "epoch": 0.18794914317302377, "grad_norm": 0.11455245769024268, "learning_rate": 0.0009399684044233808, "loss": 1.3886, "step": 595 }, { "epoch": 0.18952854773750297, "grad_norm": 0.09960009309795419, "learning_rate": 0.0009478672985781991, "loss": 1.4545, "step": 600 }, { "epoch": 0.19110795230198216, "grad_norm": 0.09022492243983328, "learning_rate": 0.0009557661927330173, "loss": 1.451, "step": 605 }, { "epoch": 0.19268735686646135, "grad_norm": 0.11673156843803718, "learning_rate": 0.0009636650868878357, "loss": 1.5408, "step": 610 }, { "epoch": 0.19426676143094054, "grad_norm": 0.10822876816560646, "learning_rate": 0.0009715639810426541, "loss": 1.5437, "step": 615 }, { "epoch": 0.19584616599541974, "grad_norm": 0.429799255340067, "learning_rate": 0.0009794628751974724, "loss": 1.4183, "step": 620 }, { "epoch": 0.19742557055989893, "grad_norm": 0.10628314572709689, "learning_rate": 0.0009873617693522906, "loss": 1.535, "step": 625 }, { "epoch": 0.19900497512437812, "grad_norm": 0.1907817824440254, "learning_rate": 0.000995260663507109, "loss": 1.4768, "step": 630 }, { "epoch": 0.2005843796888573, "grad_norm": 0.1721795524661134, "learning_rate": 0.0009999996959064125, "loss": 1.4867, "step": 635 }, { "epoch": 0.2021637842533365, "grad_norm": 0.10271076062401162, "learning_rate": 0.0009999962748577986, "loss": 1.5571, "step": 640 }, { "epoch": 0.2037431888178157, "grad_norm": 0.11702964871706054, "learning_rate": 0.0009999890526696813, "loss": 1.4514, "step": 645 }, { "epoch": 0.2053225933822949, "grad_norm": 0.08782257665091696, "learning_rate": 0.0009999780293969657, "loss": 1.4645, "step": 650 }, { "epoch": 0.20690199794677408, "grad_norm": 0.06955501897274423, "learning_rate": 0.0009999632051234547, "loss": 1.471, "step": 655 }, { "epoch": 0.20848140251125324, "grad_norm": 0.12267978814069823, "learning_rate": 0.000999944579961847, "loss": 1.5496, "step": 660 }, { "epoch": 0.21006080707573244, "grad_norm": 0.07234965343544614, "learning_rate": 0.0009999221540537377, "loss": 1.3932, "step": 665 }, { "epoch": 0.21164021164021163, "grad_norm": 0.07983944264098188, "learning_rate": 0.000999895927569616, "loss": 1.5494, "step": 670 }, { "epoch": 0.21321961620469082, "grad_norm": 0.09581628202810977, "learning_rate": 0.0009998659007088642, "loss": 1.4458, "step": 675 }, { "epoch": 0.21479902076917, "grad_norm": 0.07199883438952057, "learning_rate": 0.0009998320736997568, "loss": 1.4157, "step": 680 }, { "epoch": 0.2163784253336492, "grad_norm": 0.11518806310460458, "learning_rate": 0.0009997944467994581, "loss": 1.4431, "step": 685 }, { "epoch": 0.2179578298981284, "grad_norm": 0.3973780634713033, "learning_rate": 0.0009997530202940205, "loss": 1.6694, "step": 690 }, { "epoch": 0.2195372344626076, "grad_norm": 12.603017168823813, "learning_rate": 0.0009997077944983819, "loss": 1.6927, "step": 695 }, { "epoch": 0.22111663902708678, "grad_norm": 0.15664595446878887, "learning_rate": 0.0009996587697563642, "loss": 1.6164, "step": 700 }, { "epoch": 0.22269604359156597, "grad_norm": 0.1772139058995786, "learning_rate": 0.00099960594644067, "loss": 1.5829, "step": 705 }, { "epoch": 0.22427544815604517, "grad_norm": 0.38720671985013083, "learning_rate": 0.0009995493249528795, "loss": 1.5375, "step": 710 }, { "epoch": 0.22585485272052436, "grad_norm": 2.4931602610837733, "learning_rate": 0.0009994889057234487, "loss": 1.521, "step": 715 }, { "epoch": 0.22743425728500355, "grad_norm": 0.19713406574708697, "learning_rate": 0.0009994246892117045, "loss": 1.605, "step": 720 }, { "epoch": 0.22901366184948274, "grad_norm": 0.17427469516453623, "learning_rate": 0.0009993566759058429, "loss": 1.5897, "step": 725 }, { "epoch": 0.23059306641396193, "grad_norm": 0.28989631970775337, "learning_rate": 0.0009992848663229231, "loss": 1.7337, "step": 730 }, { "epoch": 0.23217247097844113, "grad_norm": 0.1543264269080652, "learning_rate": 0.0009992092610088662, "loss": 1.5722, "step": 735 }, { "epoch": 0.23375187554292032, "grad_norm": 0.35698761111769195, "learning_rate": 0.0009991298605384492, "loss": 1.5745, "step": 740 }, { "epoch": 0.2353312801073995, "grad_norm": 0.15647019729916586, "learning_rate": 0.000999046665515301, "loss": 1.4686, "step": 745 }, { "epoch": 0.2369106846718787, "grad_norm": 0.21542643070827774, "learning_rate": 0.0009989596765718981, "loss": 1.5144, "step": 750 }, { "epoch": 0.2384900892363579, "grad_norm": 0.1062682613376919, "learning_rate": 0.0009988688943695595, "loss": 1.5274, "step": 755 }, { "epoch": 0.24006949380083709, "grad_norm": 0.11391756402149217, "learning_rate": 0.000998774319598442, "loss": 1.4762, "step": 760 }, { "epoch": 0.24164889836531628, "grad_norm": 0.24005420199478003, "learning_rate": 0.0009986759529775349, "loss": 1.4434, "step": 765 }, { "epoch": 0.24322830292979547, "grad_norm": 0.11131226417680493, "learning_rate": 0.0009985737952546542, "loss": 1.4688, "step": 770 }, { "epoch": 0.24480770749427466, "grad_norm": 0.0861771743964312, "learning_rate": 0.0009984678472064374, "loss": 1.474, "step": 775 }, { "epoch": 0.24638711205875385, "grad_norm": 0.1345777345161801, "learning_rate": 0.0009983581096383368, "loss": 1.5011, "step": 780 }, { "epoch": 0.24796651662323305, "grad_norm": 0.11440801250960407, "learning_rate": 0.0009982445833846146, "loss": 1.4982, "step": 785 }, { "epoch": 0.24954592118771224, "grad_norm": 0.12102239610078525, "learning_rate": 0.0009981272693083349, "loss": 1.4776, "step": 790 }, { "epoch": 0.25112532575219143, "grad_norm": 0.12426909946490598, "learning_rate": 0.0009980061683013592, "loss": 1.5122, "step": 795 }, { "epoch": 0.2527047303166706, "grad_norm": 0.07627926549732114, "learning_rate": 0.0009978812812843378, "loss": 1.4804, "step": 800 }, { "epoch": 0.2542841348811498, "grad_norm": 0.07354310048802172, "learning_rate": 0.0009977526092067037, "loss": 1.4806, "step": 805 }, { "epoch": 0.255863539445629, "grad_norm": 0.06789530995861061, "learning_rate": 0.0009976201530466655, "loss": 1.4071, "step": 810 }, { "epoch": 0.2574429440101082, "grad_norm": 0.11694583069585406, "learning_rate": 0.0009974839138111988, "loss": 1.384, "step": 815 }, { "epoch": 0.2590223485745874, "grad_norm": 0.057162415760879964, "learning_rate": 0.0009973438925360407, "loss": 1.4368, "step": 820 }, { "epoch": 0.2606017531390666, "grad_norm": 0.05697724204618202, "learning_rate": 0.0009972000902856795, "loss": 1.4376, "step": 825 }, { "epoch": 0.2621811577035458, "grad_norm": 0.05176930731798256, "learning_rate": 0.0009970525081533482, "loss": 1.3819, "step": 830 }, { "epoch": 0.26376056226802497, "grad_norm": 0.11294722336351232, "learning_rate": 0.0009969011472610158, "loss": 1.3811, "step": 835 }, { "epoch": 0.26533996683250416, "grad_norm": 0.7818848281523398, "learning_rate": 0.0009967460087593786, "loss": 1.4179, "step": 840 }, { "epoch": 0.26691937139698335, "grad_norm": 0.13574159197374133, "learning_rate": 0.0009965870938278517, "loss": 1.5507, "step": 845 }, { "epoch": 0.26849877596146254, "grad_norm": 0.08413735495093176, "learning_rate": 0.0009964244036745594, "loss": 1.4094, "step": 850 }, { "epoch": 0.27007818052594174, "grad_norm": 0.0758039750337027, "learning_rate": 0.000996257939536327, "loss": 1.4359, "step": 855 }, { "epoch": 0.2716575850904209, "grad_norm": 0.07810966549358853, "learning_rate": 0.0009960877026786708, "loss": 1.4797, "step": 860 }, { "epoch": 0.2732369896549001, "grad_norm": 0.08280198420353367, "learning_rate": 0.0009959136943957887, "loss": 1.4375, "step": 865 }, { "epoch": 0.2748163942193793, "grad_norm": 0.14800707785985415, "learning_rate": 0.0009957359160105497, "loss": 1.4319, "step": 870 }, { "epoch": 0.2763957987838585, "grad_norm": 0.07356976928292576, "learning_rate": 0.000995554368874485, "loss": 1.4693, "step": 875 }, { "epoch": 0.2779752033483377, "grad_norm": 0.08549222736885605, "learning_rate": 0.0009953690543677768, "loss": 1.4679, "step": 880 }, { "epoch": 0.2795546079128169, "grad_norm": 0.06452177154798494, "learning_rate": 0.0009951799738992485, "loss": 1.4268, "step": 885 }, { "epoch": 0.2811340124772961, "grad_norm": 0.08621008702765878, "learning_rate": 0.0009949871289063525, "loss": 1.3794, "step": 890 }, { "epoch": 0.2827134170417753, "grad_norm": 0.08886259848140046, "learning_rate": 0.000994790520855162, "loss": 1.4348, "step": 895 }, { "epoch": 0.28429282160625446, "grad_norm": 0.08224974086835282, "learning_rate": 0.0009945901512403569, "loss": 1.3813, "step": 900 }, { "epoch": 0.28587222617073366, "grad_norm": 0.11356847182395537, "learning_rate": 0.0009943860215852144, "loss": 1.3709, "step": 905 }, { "epoch": 0.28745163073521285, "grad_norm": 0.05497122722279347, "learning_rate": 0.0009941781334415966, "loss": 1.472, "step": 910 }, { "epoch": 0.28903103529969204, "grad_norm": 0.05741360504195286, "learning_rate": 0.0009939664883899394, "loss": 1.401, "step": 915 }, { "epoch": 0.29061043986417123, "grad_norm": 0.0717558482829115, "learning_rate": 0.0009937510880392386, "loss": 1.3487, "step": 920 }, { "epoch": 0.2921898444286504, "grad_norm": 0.06867838525774687, "learning_rate": 0.0009935319340270408, "loss": 1.4406, "step": 925 }, { "epoch": 0.2937692489931296, "grad_norm": 0.059790192279718914, "learning_rate": 0.0009933090280194279, "loss": 1.4021, "step": 930 }, { "epoch": 0.2953486535576088, "grad_norm": 0.0565736650057571, "learning_rate": 0.0009930823717110065, "loss": 1.3519, "step": 935 }, { "epoch": 0.29692805812208795, "grad_norm": 0.06296130273815809, "learning_rate": 0.0009928519668248937, "loss": 1.3996, "step": 940 }, { "epoch": 0.29850746268656714, "grad_norm": 0.06480042045469316, "learning_rate": 0.0009926178151127049, "loss": 1.4531, "step": 945 }, { "epoch": 0.30008686725104633, "grad_norm": 0.07486630262682008, "learning_rate": 0.0009923799183545398, "loss": 1.3957, "step": 950 }, { "epoch": 0.3016662718155255, "grad_norm": 0.06804117165386249, "learning_rate": 0.0009921382783589696, "loss": 1.3777, "step": 955 }, { "epoch": 0.3032456763800047, "grad_norm": 0.0594825178337055, "learning_rate": 0.0009918928969630228, "loss": 1.3743, "step": 960 }, { "epoch": 0.3048250809444839, "grad_norm": 0.0697591381107328, "learning_rate": 0.0009916437760321708, "loss": 1.371, "step": 965 }, { "epoch": 0.3064044855089631, "grad_norm": 0.07596599793610637, "learning_rate": 0.0009913909174603147, "loss": 1.4314, "step": 970 }, { "epoch": 0.3079838900734423, "grad_norm": 0.06391627515693085, "learning_rate": 0.0009911343231697703, "loss": 1.4424, "step": 975 }, { "epoch": 0.3095632946379215, "grad_norm": 0.0866514168895784, "learning_rate": 0.0009908739951112534, "loss": 1.4468, "step": 980 }, { "epoch": 0.3111426992024007, "grad_norm": 0.17406205077887174, "learning_rate": 0.0009906099352638652, "loss": 1.4441, "step": 985 }, { "epoch": 0.31272210376687987, "grad_norm": 0.05471580954307277, "learning_rate": 0.0009903421456350775, "loss": 1.3498, "step": 990 }, { "epoch": 0.31430150833135906, "grad_norm": 0.1347284368080328, "learning_rate": 0.000990070628260717, "loss": 1.4071, "step": 995 }, { "epoch": 0.31588091289583825, "grad_norm": 0.05573768927039395, "learning_rate": 0.0009897953852049494, "loss": 1.3925, "step": 1000 }, { "epoch": 0.31746031746031744, "grad_norm": 0.07391824800461791, "learning_rate": 0.0009895164185602654, "loss": 1.5068, "step": 1005 }, { "epoch": 0.31903972202479663, "grad_norm": 0.0851934442761289, "learning_rate": 0.0009892337304474629, "loss": 1.4098, "step": 1010 }, { "epoch": 0.3206191265892758, "grad_norm": 0.05976696555429741, "learning_rate": 0.0009889473230156316, "loss": 1.3664, "step": 1015 }, { "epoch": 0.322198531153755, "grad_norm": 0.05415010411791353, "learning_rate": 0.000988657198442137, "loss": 1.3023, "step": 1020 }, { "epoch": 0.3237779357182342, "grad_norm": 0.0644878689551922, "learning_rate": 0.0009883633589326038, "loss": 1.3249, "step": 1025 }, { "epoch": 0.3253573402827134, "grad_norm": 0.06377375860506683, "learning_rate": 0.000988065806720898, "loss": 1.4376, "step": 1030 }, { "epoch": 0.3269367448471926, "grad_norm": 0.05731452611568366, "learning_rate": 0.0009877645440691122, "loss": 1.4098, "step": 1035 }, { "epoch": 0.3285161494116718, "grad_norm": 0.06598926465007522, "learning_rate": 0.0009874595732675454, "loss": 1.4049, "step": 1040 }, { "epoch": 0.330095553976151, "grad_norm": 0.052397035783897344, "learning_rate": 0.0009871508966346882, "loss": 1.3451, "step": 1045 }, { "epoch": 0.33167495854063017, "grad_norm": 0.058296655642935964, "learning_rate": 0.0009868385165172043, "loss": 1.3824, "step": 1050 }, { "epoch": 0.33325436310510936, "grad_norm": 0.08185036740231577, "learning_rate": 0.0009865224352899118, "loss": 1.494, "step": 1055 }, { "epoch": 0.33483376766958856, "grad_norm": 0.060564720604256966, "learning_rate": 0.0009862026553557669, "loss": 1.4297, "step": 1060 }, { "epoch": 0.33641317223406775, "grad_norm": 0.07310897591348295, "learning_rate": 0.000985879179145843, "loss": 1.4039, "step": 1065 }, { "epoch": 0.33799257679854694, "grad_norm": 0.05360248015901546, "learning_rate": 0.0009855520091193158, "loss": 1.3584, "step": 1070 }, { "epoch": 0.33957198136302613, "grad_norm": 0.06982832928097399, "learning_rate": 0.000985221147763441, "loss": 1.3979, "step": 1075 }, { "epoch": 0.3411513859275053, "grad_norm": 0.06740431144919927, "learning_rate": 0.000984886597593538, "loss": 1.3717, "step": 1080 }, { "epoch": 0.3427307904919845, "grad_norm": 0.05254680499298736, "learning_rate": 0.0009845483611529693, "loss": 1.3469, "step": 1085 }, { "epoch": 0.3443101950564637, "grad_norm": 0.058730348685017635, "learning_rate": 0.0009842064410131221, "loss": 1.4287, "step": 1090 }, { "epoch": 0.3458895996209429, "grad_norm": 0.05860133843671843, "learning_rate": 0.000983860839773388, "loss": 1.3657, "step": 1095 }, { "epoch": 0.3474690041854221, "grad_norm": 0.06027773674748218, "learning_rate": 0.0009835115600611434, "loss": 1.3911, "step": 1100 }, { "epoch": 0.3490484087499013, "grad_norm": 0.0589749650175029, "learning_rate": 0.00098315860453173, "loss": 1.3257, "step": 1105 }, { "epoch": 0.3506278133143805, "grad_norm": 0.05074929057571648, "learning_rate": 0.0009828019758684342, "loss": 1.3849, "step": 1110 }, { "epoch": 0.35220721787885967, "grad_norm": 0.05170951870034496, "learning_rate": 0.000982441676782467, "loss": 1.3543, "step": 1115 }, { "epoch": 0.35378662244333886, "grad_norm": 0.055612161894533316, "learning_rate": 0.0009820777100129428, "loss": 1.3724, "step": 1120 }, { "epoch": 0.35536602700781805, "grad_norm": 0.05774351075763022, "learning_rate": 0.0009817100783268591, "loss": 1.387, "step": 1125 }, { "epoch": 0.35694543157229724, "grad_norm": 0.05845012561281911, "learning_rate": 0.0009813387845190756, "loss": 1.3711, "step": 1130 }, { "epoch": 0.35852483613677644, "grad_norm": 0.06299994405223013, "learning_rate": 0.0009809638314122922, "loss": 1.3769, "step": 1135 }, { "epoch": 0.36010424070125563, "grad_norm": 0.06196349446230267, "learning_rate": 0.0009805852218570284, "loss": 1.361, "step": 1140 }, { "epoch": 0.3616836452657348, "grad_norm": 0.05920302094122903, "learning_rate": 0.000980202958731601, "loss": 1.387, "step": 1145 }, { "epoch": 0.363263049830214, "grad_norm": 0.06279816941976255, "learning_rate": 0.0009798170449421028, "loss": 1.2915, "step": 1150 }, { "epoch": 0.3648424543946932, "grad_norm": 0.057961331530386316, "learning_rate": 0.0009794274834223798, "loss": 1.3076, "step": 1155 }, { "epoch": 0.3664218589591724, "grad_norm": 0.05700274002313526, "learning_rate": 0.0009790342771340095, "loss": 1.4456, "step": 1160 }, { "epoch": 0.3680012635236516, "grad_norm": 0.061513414914566, "learning_rate": 0.000978637429066278, "loss": 1.3221, "step": 1165 }, { "epoch": 0.3695806680881308, "grad_norm": 0.05712571059268284, "learning_rate": 0.0009782369422361575, "loss": 1.3788, "step": 1170 }, { "epoch": 0.37116007265261, "grad_norm": 0.15490427831205295, "learning_rate": 0.0009778328196882835, "loss": 1.3198, "step": 1175 }, { "epoch": 0.37273947721708917, "grad_norm": 0.06337489160862961, "learning_rate": 0.000977425064494931, "loss": 1.4458, "step": 1180 }, { "epoch": 0.37431888178156836, "grad_norm": 0.049700719519203164, "learning_rate": 0.000977013679755992, "loss": 1.3772, "step": 1185 }, { "epoch": 0.37589828634604755, "grad_norm": 0.04832412129969085, "learning_rate": 0.0009765986685989513, "loss": 1.3414, "step": 1190 }, { "epoch": 0.37747769091052674, "grad_norm": 0.0514781834853502, "learning_rate": 0.0009761800341788632, "loss": 1.3959, "step": 1195 }, { "epoch": 0.37905709547500593, "grad_norm": 0.06329939670732958, "learning_rate": 0.0009757577796783267, "loss": 1.4113, "step": 1200 }, { "epoch": 0.3806365000394851, "grad_norm": 0.06325030217571598, "learning_rate": 0.0009753319083074625, "loss": 1.3523, "step": 1205 }, { "epoch": 0.3822159046039643, "grad_norm": 0.05982902253765249, "learning_rate": 0.0009749024233038876, "loss": 1.317, "step": 1210 }, { "epoch": 0.3837953091684435, "grad_norm": 0.06969096483938754, "learning_rate": 0.0009744693279326914, "loss": 1.3954, "step": 1215 }, { "epoch": 0.3853747137329227, "grad_norm": 0.053636010387175734, "learning_rate": 0.00097403262548641, "loss": 1.3042, "step": 1220 }, { "epoch": 0.3869541182974019, "grad_norm": 0.06511564777591113, "learning_rate": 0.000973592319285002, "loss": 1.4087, "step": 1225 }, { "epoch": 0.3885335228618811, "grad_norm": 0.06745398496717116, "learning_rate": 0.0009731484126758229, "loss": 1.3631, "step": 1230 }, { "epoch": 0.3901129274263603, "grad_norm": 0.061680149382042346, "learning_rate": 0.0009727009090336001, "loss": 1.407, "step": 1235 }, { "epoch": 0.39169233199083947, "grad_norm": 0.056298795186551356, "learning_rate": 0.000972249811760406, "loss": 1.3418, "step": 1240 }, { "epoch": 0.39327173655531866, "grad_norm": 0.05387617348936539, "learning_rate": 0.0009717951242856338, "loss": 1.3508, "step": 1245 }, { "epoch": 0.39485114111979785, "grad_norm": 0.05400958575618394, "learning_rate": 0.00097133685006597, "loss": 1.4308, "step": 1250 }, { "epoch": 0.39643054568427705, "grad_norm": 0.081868443557475, "learning_rate": 0.0009708749925853695, "loss": 1.3448, "step": 1255 }, { "epoch": 0.39800995024875624, "grad_norm": 0.07209877184930844, "learning_rate": 0.0009704095553550276, "loss": 1.3379, "step": 1260 }, { "epoch": 0.39958935481323543, "grad_norm": 0.06432998472541201, "learning_rate": 0.0009699405419133542, "loss": 1.3579, "step": 1265 }, { "epoch": 0.4011687593777146, "grad_norm": 0.06908063643090269, "learning_rate": 0.0009694679558259472, "loss": 1.4582, "step": 1270 }, { "epoch": 0.4027481639421938, "grad_norm": 0.06735824488281636, "learning_rate": 0.0009689918006855645, "loss": 1.3866, "step": 1275 }, { "epoch": 0.404327568506673, "grad_norm": 0.05310084913481967, "learning_rate": 0.0009685120801120974, "loss": 1.3478, "step": 1280 }, { "epoch": 0.4059069730711522, "grad_norm": 0.051868462886036595, "learning_rate": 0.0009680287977525426, "loss": 1.4168, "step": 1285 }, { "epoch": 0.4074863776356314, "grad_norm": 0.060401547007006964, "learning_rate": 0.0009675419572809748, "loss": 1.3473, "step": 1290 }, { "epoch": 0.4090657822001106, "grad_norm": 0.05186672194065688, "learning_rate": 0.0009670515623985187, "loss": 1.3348, "step": 1295 }, { "epoch": 0.4106451867645898, "grad_norm": 0.04987405652450246, "learning_rate": 0.000966557616833321, "loss": 1.3752, "step": 1300 }, { "epoch": 0.41222459132906897, "grad_norm": 0.05020307416711316, "learning_rate": 0.0009660601243405214, "loss": 1.3698, "step": 1305 }, { "epoch": 0.41380399589354816, "grad_norm": 0.06256388424829058, "learning_rate": 0.000965559088702225, "loss": 1.3236, "step": 1310 }, { "epoch": 0.4153834004580273, "grad_norm": 0.05567162306833656, "learning_rate": 0.0009650545137274727, "loss": 1.3571, "step": 1315 }, { "epoch": 0.4169628050225065, "grad_norm": 0.05360058861591483, "learning_rate": 0.000964546403252213, "loss": 1.3133, "step": 1320 }, { "epoch": 0.4185422095869857, "grad_norm": 0.051561329784840855, "learning_rate": 0.0009640347611392722, "loss": 1.3366, "step": 1325 }, { "epoch": 0.4201216141514649, "grad_norm": 0.0468294187168409, "learning_rate": 0.0009635195912783254, "loss": 1.3452, "step": 1330 }, { "epoch": 0.42170101871594406, "grad_norm": 0.05646901669857216, "learning_rate": 0.0009630008975858666, "loss": 1.3087, "step": 1335 }, { "epoch": 0.42328042328042326, "grad_norm": 0.0532455344861501, "learning_rate": 0.0009624786840051798, "loss": 1.2878, "step": 1340 }, { "epoch": 0.42485982784490245, "grad_norm": 0.05188792079441635, "learning_rate": 0.0009619529545063075, "loss": 1.3128, "step": 1345 }, { "epoch": 0.42643923240938164, "grad_norm": 0.06229727211513282, "learning_rate": 0.000961423713086022, "loss": 1.392, "step": 1350 }, { "epoch": 0.42801863697386083, "grad_norm": 0.05283681112909501, "learning_rate": 0.000960890963767794, "loss": 1.3392, "step": 1355 }, { "epoch": 0.42959804153834, "grad_norm": 0.05294824126078981, "learning_rate": 0.0009603547106017629, "loss": 1.4012, "step": 1360 }, { "epoch": 0.4311774461028192, "grad_norm": 0.047996697271962804, "learning_rate": 0.0009598149576647053, "loss": 1.3293, "step": 1365 }, { "epoch": 0.4327568506672984, "grad_norm": 0.049577098742485164, "learning_rate": 0.0009592717090600039, "loss": 1.3375, "step": 1370 }, { "epoch": 0.4343362552317776, "grad_norm": 0.049800754524267174, "learning_rate": 0.0009587249689176171, "loss": 1.4018, "step": 1375 }, { "epoch": 0.4359156597962568, "grad_norm": 0.05760936714171686, "learning_rate": 0.0009581747413940472, "loss": 1.3844, "step": 1380 }, { "epoch": 0.437495064360736, "grad_norm": 0.06338935926328094, "learning_rate": 0.000957621030672308, "loss": 1.3557, "step": 1385 }, { "epoch": 0.4390744689252152, "grad_norm": 0.058329423427736914, "learning_rate": 0.0009570638409618946, "loss": 1.3709, "step": 1390 }, { "epoch": 0.44065387348969437, "grad_norm": 0.05627593751193695, "learning_rate": 0.0009565031764987502, "loss": 1.393, "step": 1395 }, { "epoch": 0.44223327805417356, "grad_norm": 0.058252972102465196, "learning_rate": 0.000955939041545234, "loss": 1.3956, "step": 1400 }, { "epoch": 0.44381268261865275, "grad_norm": 0.07152212074750997, "learning_rate": 0.0009553714403900897, "loss": 1.3701, "step": 1405 }, { "epoch": 0.44539208718313195, "grad_norm": 0.09816874055530846, "learning_rate": 0.0009548003773484114, "loss": 1.3586, "step": 1410 }, { "epoch": 0.44697149174761114, "grad_norm": 0.06755088536188694, "learning_rate": 0.0009542258567616122, "loss": 1.3369, "step": 1415 }, { "epoch": 0.44855089631209033, "grad_norm": 0.06726253319388574, "learning_rate": 0.0009536478829973902, "loss": 1.4007, "step": 1420 }, { "epoch": 0.4501303008765695, "grad_norm": 0.0750897302165208, "learning_rate": 0.0009530664604496964, "loss": 1.4123, "step": 1425 }, { "epoch": 0.4517097054410487, "grad_norm": 0.058037606929892994, "learning_rate": 0.0009524815935386997, "loss": 1.305, "step": 1430 }, { "epoch": 0.4532891100055279, "grad_norm": 0.05846942736460335, "learning_rate": 0.0009518932867107551, "loss": 1.3692, "step": 1435 }, { "epoch": 0.4548685145700071, "grad_norm": 0.06503626468359491, "learning_rate": 0.0009513015444383682, "loss": 1.2862, "step": 1440 }, { "epoch": 0.4564479191344863, "grad_norm": 0.10037646460230905, "learning_rate": 0.0009507063712201623, "loss": 1.399, "step": 1445 }, { "epoch": 0.4580273236989655, "grad_norm": 0.05210632424586149, "learning_rate": 0.0009501077715808444, "loss": 1.3605, "step": 1450 }, { "epoch": 0.4596067282634447, "grad_norm": 0.060493393253037264, "learning_rate": 0.0009495057500711697, "loss": 1.3468, "step": 1455 }, { "epoch": 0.46118613282792387, "grad_norm": 0.06429171834586586, "learning_rate": 0.0009489003112679075, "loss": 1.3278, "step": 1460 }, { "epoch": 0.46276553739240306, "grad_norm": 0.06085111927575315, "learning_rate": 0.0009482914597738072, "loss": 1.3842, "step": 1465 }, { "epoch": 0.46434494195688225, "grad_norm": 0.3815385243918407, "learning_rate": 0.000947679200217562, "loss": 1.4127, "step": 1470 }, { "epoch": 0.46592434652136144, "grad_norm": 0.07175033796435, "learning_rate": 0.0009470635372537748, "loss": 1.304, "step": 1475 }, { "epoch": 0.46750375108584064, "grad_norm": 0.06930567299999024, "learning_rate": 0.0009464444755629216, "loss": 1.3606, "step": 1480 }, { "epoch": 0.4690831556503198, "grad_norm": 0.0461230697471275, "learning_rate": 0.0009458220198513177, "loss": 1.2905, "step": 1485 }, { "epoch": 0.470662560214799, "grad_norm": 0.06014525878813377, "learning_rate": 0.00094519617485108, "loss": 1.3334, "step": 1490 }, { "epoch": 0.4722419647792782, "grad_norm": 0.05484639310492065, "learning_rate": 0.0009445669453200923, "loss": 1.322, "step": 1495 }, { "epoch": 0.4738213693437574, "grad_norm": 0.054355655091887584, "learning_rate": 0.0009439343360419688, "loss": 1.3599, "step": 1500 }, { "epoch": 0.4754007739082366, "grad_norm": 0.05002072731537366, "learning_rate": 0.0009432983518260174, "loss": 1.3699, "step": 1505 }, { "epoch": 0.4769801784727158, "grad_norm": 0.06307180635325377, "learning_rate": 0.0009426589975072039, "loss": 1.3126, "step": 1510 }, { "epoch": 0.478559583037195, "grad_norm": 0.06731729100238709, "learning_rate": 0.0009420162779461141, "loss": 1.353, "step": 1515 }, { "epoch": 0.48013898760167417, "grad_norm": 0.07225008121051547, "learning_rate": 0.000941370198028918, "loss": 1.4184, "step": 1520 }, { "epoch": 0.48171839216615336, "grad_norm": 0.06318894909778677, "learning_rate": 0.0009407207626673319, "loss": 1.4224, "step": 1525 }, { "epoch": 0.48329779673063256, "grad_norm": 0.05619492363944542, "learning_rate": 0.0009400679767985813, "loss": 1.3037, "step": 1530 }, { "epoch": 0.48487720129511175, "grad_norm": 0.05971906123932661, "learning_rate": 0.000939411845385364, "loss": 1.3682, "step": 1535 }, { "epoch": 0.48645660585959094, "grad_norm": 0.06028454873534571, "learning_rate": 0.0009387523734158106, "loss": 1.3129, "step": 1540 }, { "epoch": 0.48803601042407013, "grad_norm": 0.056613643743635526, "learning_rate": 0.0009380895659034485, "loss": 1.3405, "step": 1545 }, { "epoch": 0.4896154149885493, "grad_norm": 0.05521627410101159, "learning_rate": 0.0009374234278871631, "loss": 1.3663, "step": 1550 }, { "epoch": 0.4911948195530285, "grad_norm": 0.057576203898595496, "learning_rate": 0.0009367539644311591, "loss": 1.2897, "step": 1555 }, { "epoch": 0.4927742241175077, "grad_norm": 0.08239727394288354, "learning_rate": 0.0009360811806249223, "loss": 1.3912, "step": 1560 }, { "epoch": 0.4943536286819869, "grad_norm": 0.06998921812386057, "learning_rate": 0.0009354050815831811, "loss": 1.3227, "step": 1565 }, { "epoch": 0.4959330332464661, "grad_norm": 0.05509978894522918, "learning_rate": 0.0009347256724458674, "loss": 1.3861, "step": 1570 }, { "epoch": 0.4975124378109453, "grad_norm": 0.06485796606861602, "learning_rate": 0.0009340429583780774, "loss": 1.4035, "step": 1575 }, { "epoch": 0.4990918423754245, "grad_norm": 0.055053322095565176, "learning_rate": 0.0009333569445700326, "loss": 1.3428, "step": 1580 }, { "epoch": 0.5006712469399036, "grad_norm": 0.06029817427362133, "learning_rate": 0.0009326676362370404, "loss": 1.3796, "step": 1585 }, { "epoch": 0.5022506515043829, "grad_norm": 0.07135736414187123, "learning_rate": 0.0009319750386194537, "loss": 1.2884, "step": 1590 }, { "epoch": 0.503830056068862, "grad_norm": 0.06188110692810147, "learning_rate": 0.0009312791569826324, "loss": 1.4261, "step": 1595 }, { "epoch": 0.5054094606333412, "grad_norm": 0.0661433762957586, "learning_rate": 0.0009305799966169022, "loss": 1.276, "step": 1600 }, { "epoch": 0.5069888651978204, "grad_norm": 0.05664300882402617, "learning_rate": 0.000929877562837515, "loss": 1.3631, "step": 1605 }, { "epoch": 0.5085682697622996, "grad_norm": 0.06042210655550929, "learning_rate": 0.0009291718609846081, "loss": 1.3338, "step": 1610 }, { "epoch": 0.5101476743267788, "grad_norm": 0.051910822348454816, "learning_rate": 0.0009284628964231635, "loss": 1.3025, "step": 1615 }, { "epoch": 0.511727078891258, "grad_norm": 0.06122265083799073, "learning_rate": 0.0009277506745429682, "loss": 1.3143, "step": 1620 }, { "epoch": 0.5133064834557372, "grad_norm": 0.42843669640669135, "learning_rate": 0.0009270352007585719, "loss": 1.4064, "step": 1625 }, { "epoch": 0.5148858880202164, "grad_norm": 0.053774048108923694, "learning_rate": 0.000926316480509246, "loss": 1.3976, "step": 1630 }, { "epoch": 0.5164652925846955, "grad_norm": 0.0645507983145944, "learning_rate": 0.0009255945192589439, "loss": 1.2992, "step": 1635 }, { "epoch": 0.5180446971491748, "grad_norm": 0.06679881271216846, "learning_rate": 0.0009248693224962567, "loss": 1.4293, "step": 1640 }, { "epoch": 0.5196241017136539, "grad_norm": 0.05811870613951585, "learning_rate": 0.0009241408957343739, "loss": 1.4199, "step": 1645 }, { "epoch": 0.5212035062781332, "grad_norm": 0.0638512337825535, "learning_rate": 0.00092340924451104, "loss": 1.2792, "step": 1650 }, { "epoch": 0.5227829108426123, "grad_norm": 0.06574430985624011, "learning_rate": 0.0009226743743885134, "loss": 1.3148, "step": 1655 }, { "epoch": 0.5243623154070916, "grad_norm": 0.05984300897989231, "learning_rate": 0.0009219362909535234, "loss": 1.3658, "step": 1660 }, { "epoch": 0.5259417199715707, "grad_norm": 0.051872766540715466, "learning_rate": 0.0009211949998172279, "loss": 1.336, "step": 1665 }, { "epoch": 0.5275211245360499, "grad_norm": 0.050809135985864166, "learning_rate": 0.0009204505066151709, "loss": 1.3391, "step": 1670 }, { "epoch": 0.5291005291005291, "grad_norm": 0.055532796673574275, "learning_rate": 0.0009197028170072397, "loss": 1.2895, "step": 1675 }, { "epoch": 0.5306799336650083, "grad_norm": 0.059985118984545824, "learning_rate": 0.0009189519366776217, "loss": 1.4115, "step": 1680 }, { "epoch": 0.5322593382294875, "grad_norm": 0.07065851481819439, "learning_rate": 0.0009181978713347613, "loss": 1.3344, "step": 1685 }, { "epoch": 0.5338387427939667, "grad_norm": 0.050044687687992556, "learning_rate": 0.000917440626711316, "loss": 1.3479, "step": 1690 }, { "epoch": 0.5354181473584458, "grad_norm": 0.05603202329229898, "learning_rate": 0.0009166802085641139, "loss": 1.3427, "step": 1695 }, { "epoch": 0.5369975519229251, "grad_norm": 0.05734710995721161, "learning_rate": 0.0009159166226741088, "loss": 1.4612, "step": 1700 }, { "epoch": 0.5385769564874042, "grad_norm": 0.07922879257598099, "learning_rate": 0.000915149874846337, "loss": 1.2982, "step": 1705 }, { "epoch": 0.5401563610518835, "grad_norm": 0.06765132202861511, "learning_rate": 0.0009143799709098728, "loss": 1.3564, "step": 1710 }, { "epoch": 0.5417357656163626, "grad_norm": 0.0564328785898305, "learning_rate": 0.0009136069167177844, "loss": 1.2715, "step": 1715 }, { "epoch": 0.5433151701808419, "grad_norm": 0.07405336952976153, "learning_rate": 0.0009128307181470893, "loss": 1.3564, "step": 1720 }, { "epoch": 0.544894574745321, "grad_norm": 0.05227215149754093, "learning_rate": 0.0009120513810987094, "loss": 1.3027, "step": 1725 }, { "epoch": 0.5464739793098002, "grad_norm": 0.05460360088843698, "learning_rate": 0.0009112689114974266, "loss": 1.403, "step": 1730 }, { "epoch": 0.5480533838742794, "grad_norm": 0.06411574647684438, "learning_rate": 0.0009104833152918375, "loss": 1.359, "step": 1735 }, { "epoch": 0.5496327884387586, "grad_norm": 0.08989543610436232, "learning_rate": 0.0009096945984543081, "loss": 1.3072, "step": 1740 }, { "epoch": 0.5512121930032378, "grad_norm": 0.08353963463569601, "learning_rate": 0.0009089027669809285, "loss": 1.4444, "step": 1745 }, { "epoch": 0.552791597567717, "grad_norm": 0.08073169166005426, "learning_rate": 0.0009081078268914673, "loss": 1.3854, "step": 1750 }, { "epoch": 0.5543710021321961, "grad_norm": 0.08043999372991253, "learning_rate": 0.000907309784229326, "loss": 1.4082, "step": 1755 }, { "epoch": 0.5559504066966754, "grad_norm": 0.058099392707801546, "learning_rate": 0.0009065086450614928, "loss": 1.3191, "step": 1760 }, { "epoch": 0.5575298112611545, "grad_norm": 0.05484591763646481, "learning_rate": 0.0009057044154784963, "loss": 1.339, "step": 1765 }, { "epoch": 0.5591092158256338, "grad_norm": 0.05932569969357305, "learning_rate": 0.0009048971015943599, "loss": 1.3565, "step": 1770 }, { "epoch": 0.5606886203901129, "grad_norm": 0.12160406198713741, "learning_rate": 0.0009040867095465548, "loss": 1.4133, "step": 1775 }, { "epoch": 0.5622680249545922, "grad_norm": 0.05055583588727909, "learning_rate": 0.0009032732454959533, "loss": 1.2822, "step": 1780 }, { "epoch": 0.5638474295190713, "grad_norm": 0.08539059786451741, "learning_rate": 0.000902456715626782, "loss": 1.535, "step": 1785 }, { "epoch": 0.5654268340835505, "grad_norm": 0.09986143351174494, "learning_rate": 0.0009016371261465752, "loss": 1.3709, "step": 1790 }, { "epoch": 0.5670062386480297, "grad_norm": 0.07875402259168916, "learning_rate": 0.0009008144832861272, "loss": 1.3447, "step": 1795 }, { "epoch": 0.5685856432125089, "grad_norm": 0.06579178895424559, "learning_rate": 0.000899988793299445, "loss": 1.3865, "step": 1800 }, { "epoch": 0.5701650477769881, "grad_norm": 0.28989046149104464, "learning_rate": 0.0008991600624637013, "loss": 1.434, "step": 1805 }, { "epoch": 0.5717444523414673, "grad_norm": 0.08095896895394208, "learning_rate": 0.0008983282970791858, "loss": 1.4016, "step": 1810 }, { "epoch": 0.5733238569059464, "grad_norm": 0.07080396797011522, "learning_rate": 0.0008974935034692583, "loss": 1.3315, "step": 1815 }, { "epoch": 0.5749032614704257, "grad_norm": 0.07006325322470423, "learning_rate": 0.0008966556879802998, "loss": 1.3811, "step": 1820 }, { "epoch": 0.5764826660349048, "grad_norm": 0.056160343060525086, "learning_rate": 0.0008958148569816652, "loss": 1.3424, "step": 1825 }, { "epoch": 0.5780620705993841, "grad_norm": 0.4802176372821301, "learning_rate": 0.0008949710168656337, "loss": 1.3718, "step": 1830 }, { "epoch": 0.5796414751638632, "grad_norm": 0.06182426238552257, "learning_rate": 0.0008941241740473612, "loss": 1.3332, "step": 1835 }, { "epoch": 0.5812208797283425, "grad_norm": 2.5497130449056735, "learning_rate": 0.0008932743349648312, "loss": 1.3643, "step": 1840 }, { "epoch": 0.5828002842928216, "grad_norm": 3.142747117361047, "learning_rate": 0.0008924215060788051, "loss": 1.3952, "step": 1845 }, { "epoch": 0.5843796888573008, "grad_norm": 0.6904318361727916, "learning_rate": 0.000891565693872775, "loss": 1.3634, "step": 1850 }, { "epoch": 0.58595909342178, "grad_norm": 0.1845061678384783, "learning_rate": 0.0008907069048529122, "loss": 1.4294, "step": 1855 }, { "epoch": 0.5875384979862592, "grad_norm": 0.12836727317677424, "learning_rate": 0.000889845145548019, "loss": 1.4301, "step": 1860 }, { "epoch": 0.5891179025507384, "grad_norm": 0.0959935947198231, "learning_rate": 0.000888980422509479, "loss": 1.4341, "step": 1865 }, { "epoch": 0.5906973071152176, "grad_norm": 0.11712871546313482, "learning_rate": 0.0008881127423112072, "loss": 1.4586, "step": 1870 }, { "epoch": 0.5922767116796968, "grad_norm": 0.08365832984442693, "learning_rate": 0.0008872421115495995, "loss": 1.3804, "step": 1875 }, { "epoch": 0.5938561162441759, "grad_norm": 0.058022969848351286, "learning_rate": 0.0008863685368434831, "loss": 1.3798, "step": 1880 }, { "epoch": 0.5954355208086551, "grad_norm": 0.0679807780560635, "learning_rate": 0.0008854920248340662, "loss": 1.3749, "step": 1885 }, { "epoch": 0.5970149253731343, "grad_norm": 0.07287764137695973, "learning_rate": 0.0008846125821848873, "loss": 1.3674, "step": 1890 }, { "epoch": 0.5985943299376135, "grad_norm": 0.05829965435168818, "learning_rate": 0.0008837302155817647, "loss": 1.3536, "step": 1895 }, { "epoch": 0.6001737345020927, "grad_norm": 0.05827657916786376, "learning_rate": 0.0008828449317327452, "loss": 1.3183, "step": 1900 }, { "epoch": 0.6017531390665719, "grad_norm": 0.07962686927216245, "learning_rate": 0.0008819567373680541, "loss": 1.4334, "step": 1905 }, { "epoch": 0.603332543631051, "grad_norm": 0.05662472589481426, "learning_rate": 0.000881065639240043, "loss": 1.3353, "step": 1910 }, { "epoch": 0.6049119481955303, "grad_norm": 0.053211086645031554, "learning_rate": 0.0008801716441231386, "loss": 1.368, "step": 1915 }, { "epoch": 0.6064913527600094, "grad_norm": 0.06448824048141763, "learning_rate": 0.0008792747588137924, "loss": 1.3226, "step": 1920 }, { "epoch": 0.6080707573244887, "grad_norm": 0.0700954227174087, "learning_rate": 0.0008783749901304271, "loss": 1.3777, "step": 1925 }, { "epoch": 0.6096501618889678, "grad_norm": 0.08178718959534338, "learning_rate": 0.0008774723449133866, "loss": 1.3966, "step": 1930 }, { "epoch": 0.6112295664534471, "grad_norm": 0.12225166128667886, "learning_rate": 0.0008765668300248823, "loss": 1.3791, "step": 1935 }, { "epoch": 0.6128089710179262, "grad_norm": 0.09445510763484219, "learning_rate": 0.000875658452348943, "loss": 1.483, "step": 1940 }, { "epoch": 0.6143883755824054, "grad_norm": 0.06361503804444989, "learning_rate": 0.0008747472187913603, "loss": 1.3189, "step": 1945 }, { "epoch": 0.6159677801468846, "grad_norm": 0.056705507703515866, "learning_rate": 0.0008738331362796375, "loss": 1.3492, "step": 1950 }, { "epoch": 0.6175471847113638, "grad_norm": 0.08934334333605545, "learning_rate": 0.0008729162117629368, "loss": 1.3571, "step": 1955 }, { "epoch": 0.619126589275843, "grad_norm": 0.0759936946095067, "learning_rate": 0.0008719964522120261, "loss": 1.315, "step": 1960 }, { "epoch": 0.6207059938403222, "grad_norm": 0.09282267714427453, "learning_rate": 0.0008710738646192262, "loss": 1.3232, "step": 1965 }, { "epoch": 0.6222853984048013, "grad_norm": 0.06567224301740628, "learning_rate": 0.0008701484559983577, "loss": 1.3847, "step": 1970 }, { "epoch": 0.6238648029692806, "grad_norm": 0.06770934921534695, "learning_rate": 0.0008692202333846875, "loss": 1.3648, "step": 1975 }, { "epoch": 0.6254442075337597, "grad_norm": 0.08977981824014389, "learning_rate": 0.0008682892038348756, "loss": 1.3653, "step": 1980 }, { "epoch": 0.627023612098239, "grad_norm": 0.07626120109513759, "learning_rate": 0.0008673553744269207, "loss": 1.3568, "step": 1985 }, { "epoch": 0.6286030166627181, "grad_norm": 0.09282539831610978, "learning_rate": 0.0008664187522601079, "loss": 1.3811, "step": 1990 }, { "epoch": 0.6301824212271974, "grad_norm": 0.06293874169321305, "learning_rate": 0.0008654793444549531, "loss": 1.3356, "step": 1995 }, { "epoch": 0.6317618257916765, "grad_norm": 0.0525054809707794, "learning_rate": 0.0008645371581531497, "loss": 1.3279, "step": 2000 }, { "epoch": 0.6333412303561557, "grad_norm": 0.0614865462121174, "learning_rate": 0.0008635922005175143, "loss": 1.3136, "step": 2005 }, { "epoch": 0.6349206349206349, "grad_norm": 0.05855572702508429, "learning_rate": 0.0008626444787319319, "loss": 1.3117, "step": 2010 }, { "epoch": 0.6365000394851141, "grad_norm": 0.05216893105103073, "learning_rate": 0.0008616940000013016, "loss": 1.3212, "step": 2015 }, { "epoch": 0.6380794440495933, "grad_norm": 0.051238030405012146, "learning_rate": 0.0008607407715514819, "loss": 1.3694, "step": 2020 }, { "epoch": 0.6396588486140725, "grad_norm": 0.06077849262961319, "learning_rate": 0.0008597848006292354, "loss": 1.3523, "step": 2025 }, { "epoch": 0.6412382531785517, "grad_norm": 0.06663726603853434, "learning_rate": 0.0008588260945021737, "loss": 1.3433, "step": 2030 }, { "epoch": 0.6428176577430309, "grad_norm": 0.08802895312613443, "learning_rate": 0.0008578646604587028, "loss": 1.3039, "step": 2035 }, { "epoch": 0.64439706230751, "grad_norm": 0.06477921738484675, "learning_rate": 0.0008569005058079671, "loss": 1.378, "step": 2040 }, { "epoch": 0.6459764668719893, "grad_norm": 0.053867571177889116, "learning_rate": 0.0008559336378797935, "loss": 1.4476, "step": 2045 }, { "epoch": 0.6475558714364684, "grad_norm": 0.05937487175357696, "learning_rate": 0.0008549640640246367, "loss": 1.4036, "step": 2050 }, { "epoch": 0.6491352760009477, "grad_norm": 0.04952143496000843, "learning_rate": 0.0008539917916135227, "loss": 1.2845, "step": 2055 }, { "epoch": 0.6507146805654268, "grad_norm": 0.052203388551482155, "learning_rate": 0.0008530168280379924, "loss": 1.4212, "step": 2060 }, { "epoch": 0.652294085129906, "grad_norm": 0.05362107976814019, "learning_rate": 0.0008520391807100465, "loss": 1.2927, "step": 2065 }, { "epoch": 0.6538734896943852, "grad_norm": 0.05911317194476086, "learning_rate": 0.0008510588570620879, "loss": 1.3302, "step": 2070 }, { "epoch": 0.6554528942588644, "grad_norm": 0.06609257711793792, "learning_rate": 0.0008500758645468661, "loss": 1.368, "step": 2075 }, { "epoch": 0.6570322988233436, "grad_norm": 0.049590709661072443, "learning_rate": 0.0008490902106374202, "loss": 1.3209, "step": 2080 }, { "epoch": 0.6586117033878228, "grad_norm": 0.04582325961245885, "learning_rate": 0.0008481019028270219, "loss": 1.3325, "step": 2085 }, { "epoch": 0.660191107952302, "grad_norm": 0.05491469843317571, "learning_rate": 0.000847110948629119, "loss": 1.3488, "step": 2090 }, { "epoch": 0.6617705125167812, "grad_norm": 0.0710848651703214, "learning_rate": 0.0008461173555772779, "loss": 1.4144, "step": 2095 }, { "epoch": 0.6633499170812603, "grad_norm": 0.04806585472104807, "learning_rate": 0.0008451211312251266, "loss": 1.3046, "step": 2100 }, { "epoch": 0.6649293216457396, "grad_norm": 0.05334217251891196, "learning_rate": 0.0008441222831462967, "loss": 1.3654, "step": 2105 }, { "epoch": 0.6665087262102187, "grad_norm": 0.05256940067787847, "learning_rate": 0.0008431208189343669, "loss": 1.3509, "step": 2110 }, { "epoch": 0.668088130774698, "grad_norm": 0.050579372064479676, "learning_rate": 0.0008421167462028039, "loss": 1.2798, "step": 2115 }, { "epoch": 0.6696675353391771, "grad_norm": 0.059791387190242816, "learning_rate": 0.000841110072584906, "loss": 1.4392, "step": 2120 }, { "epoch": 0.6712469399036564, "grad_norm": 0.06987790930285684, "learning_rate": 0.0008401008057337437, "loss": 1.4216, "step": 2125 }, { "epoch": 0.6728263444681355, "grad_norm": 0.05223687611276848, "learning_rate": 0.0008390889533221025, "loss": 1.4069, "step": 2130 }, { "epoch": 0.6744057490326147, "grad_norm": 0.045700087210283076, "learning_rate": 0.0008380745230424238, "loss": 1.3449, "step": 2135 }, { "epoch": 0.6759851535970939, "grad_norm": 0.06297320780391313, "learning_rate": 0.0008370575226067474, "loss": 1.3283, "step": 2140 }, { "epoch": 0.6775645581615731, "grad_norm": 0.04984913151055349, "learning_rate": 0.0008360379597466518, "loss": 1.2876, "step": 2145 }, { "epoch": 0.6791439627260523, "grad_norm": 0.05949198072918211, "learning_rate": 0.0008350158422131961, "loss": 1.3302, "step": 2150 }, { "epoch": 0.6807233672905315, "grad_norm": 0.05505474243183121, "learning_rate": 0.0008339911777768609, "loss": 1.2872, "step": 2155 }, { "epoch": 0.6823027718550106, "grad_norm": 0.05936066429647378, "learning_rate": 0.0008329639742274892, "loss": 1.3274, "step": 2160 }, { "epoch": 0.6838821764194899, "grad_norm": 0.052379606599607435, "learning_rate": 0.0008319342393742268, "loss": 1.3449, "step": 2165 }, { "epoch": 0.685461580983969, "grad_norm": 0.06241199544438947, "learning_rate": 0.0008309019810454643, "loss": 1.277, "step": 2170 }, { "epoch": 0.6870409855484483, "grad_norm": 0.08119602366115998, "learning_rate": 0.0008298672070887754, "loss": 1.3153, "step": 2175 }, { "epoch": 0.6886203901129274, "grad_norm": 0.06467623502601867, "learning_rate": 0.0008288299253708595, "loss": 1.285, "step": 2180 }, { "epoch": 0.6901997946774067, "grad_norm": 0.06236642572421528, "learning_rate": 0.00082779014377748, "loss": 1.3628, "step": 2185 }, { "epoch": 0.6917791992418858, "grad_norm": 0.08003656524372153, "learning_rate": 0.000826747870213406, "loss": 1.3605, "step": 2190 }, { "epoch": 0.693358603806365, "grad_norm": 0.0599300000503031, "learning_rate": 0.0008257031126023504, "loss": 1.3389, "step": 2195 }, { "epoch": 0.6949380083708442, "grad_norm": 0.05400399127216339, "learning_rate": 0.0008246558788869116, "loss": 1.3415, "step": 2200 }, { "epoch": 0.6965174129353234, "grad_norm": 0.05846033297902367, "learning_rate": 0.0008236061770285119, "loss": 1.3057, "step": 2205 }, { "epoch": 0.6980968174998026, "grad_norm": 0.058617542212508575, "learning_rate": 0.0008225540150073371, "loss": 1.4153, "step": 2210 }, { "epoch": 0.6996762220642818, "grad_norm": 0.0566701871148479, "learning_rate": 0.0008214994008222758, "loss": 1.3365, "step": 2215 }, { "epoch": 0.701255626628761, "grad_norm": 0.0598017003241574, "learning_rate": 0.0008204423424908591, "loss": 1.3246, "step": 2220 }, { "epoch": 0.7028350311932402, "grad_norm": 0.050838197634604274, "learning_rate": 0.0008193828480491995, "loss": 1.3471, "step": 2225 }, { "epoch": 0.7044144357577193, "grad_norm": 0.06922209324444975, "learning_rate": 0.000818320925551929, "loss": 1.2479, "step": 2230 }, { "epoch": 0.7059938403221986, "grad_norm": 0.047153048914755066, "learning_rate": 0.000817256583072139, "loss": 1.247, "step": 2235 }, { "epoch": 0.7075732448866777, "grad_norm": 0.05046553302914744, "learning_rate": 0.0008161898287013184, "loss": 1.2903, "step": 2240 }, { "epoch": 0.709152649451157, "grad_norm": 0.0469831802363631, "learning_rate": 0.0008151206705492919, "loss": 1.3126, "step": 2245 }, { "epoch": 0.7107320540156361, "grad_norm": 0.05095851683552942, "learning_rate": 0.0008140491167441584, "loss": 1.3003, "step": 2250 }, { "epoch": 0.7123114585801152, "grad_norm": 0.052927772230748515, "learning_rate": 0.0008129751754322299, "loss": 1.3276, "step": 2255 }, { "epoch": 0.7138908631445945, "grad_norm": 0.0564748877669435, "learning_rate": 0.0008118988547779687, "loss": 1.32, "step": 2260 }, { "epoch": 0.7154702677090736, "grad_norm": 0.05422349688312572, "learning_rate": 0.0008108201629639256, "loss": 1.3284, "step": 2265 }, { "epoch": 0.7170496722735529, "grad_norm": 0.0501756708570037, "learning_rate": 0.0008097391081906777, "loss": 1.3291, "step": 2270 }, { "epoch": 0.718629076838032, "grad_norm": 0.057745975888228426, "learning_rate": 0.0008086556986767663, "loss": 1.3298, "step": 2275 }, { "epoch": 0.7202084814025113, "grad_norm": 0.07558187383512491, "learning_rate": 0.0008075699426586344, "loss": 1.3304, "step": 2280 }, { "epoch": 0.7217878859669904, "grad_norm": 0.05423189894714324, "learning_rate": 0.0008064818483905634, "loss": 1.3771, "step": 2285 }, { "epoch": 0.7233672905314696, "grad_norm": 0.06112735027865857, "learning_rate": 0.0008053914241446112, "loss": 1.3324, "step": 2290 }, { "epoch": 0.7249466950959488, "grad_norm": 0.06015299949404258, "learning_rate": 0.000804298678210549, "loss": 1.2072, "step": 2295 }, { "epoch": 0.726526099660428, "grad_norm": 0.06050245688605966, "learning_rate": 0.0008032036188957982, "loss": 1.2966, "step": 2300 }, { "epoch": 0.7281055042249072, "grad_norm": 0.05653319086535704, "learning_rate": 0.0008021062545253672, "loss": 1.3206, "step": 2305 }, { "epoch": 0.7296849087893864, "grad_norm": 0.050676847685671905, "learning_rate": 0.0008010065934417881, "loss": 1.3041, "step": 2310 }, { "epoch": 0.7312643133538655, "grad_norm": 0.055817990685382825, "learning_rate": 0.0007999046440050538, "loss": 1.3212, "step": 2315 }, { "epoch": 0.7328437179183448, "grad_norm": 0.04855435233120509, "learning_rate": 0.0007988004145925538, "loss": 1.3186, "step": 2320 }, { "epoch": 0.7344231224828239, "grad_norm": 0.05901670354512037, "learning_rate": 0.0007976939135990106, "loss": 1.2929, "step": 2325 }, { "epoch": 0.7360025270473032, "grad_norm": 0.056402788065410825, "learning_rate": 0.000796585149436416, "loss": 1.2534, "step": 2330 }, { "epoch": 0.7375819316117823, "grad_norm": 0.0525791052774308, "learning_rate": 0.0007954741305339676, "loss": 1.3046, "step": 2335 }, { "epoch": 0.7391613361762616, "grad_norm": 0.058866507995561534, "learning_rate": 0.000794360865338004, "loss": 1.2972, "step": 2340 }, { "epoch": 0.7407407407407407, "grad_norm": 0.049237505963864406, "learning_rate": 0.0007932453623119407, "loss": 1.347, "step": 2345 }, { "epoch": 0.74232014530522, "grad_norm": 0.09963105926497849, "learning_rate": 0.0007921276299362062, "loss": 1.2614, "step": 2350 }, { "epoch": 0.7438995498696991, "grad_norm": 0.06456469060811044, "learning_rate": 0.0007910076767081772, "loss": 1.3771, "step": 2355 }, { "epoch": 0.7454789544341783, "grad_norm": 0.06568628488877269, "learning_rate": 0.0007898855111421139, "loss": 1.2523, "step": 2360 }, { "epoch": 0.7470583589986575, "grad_norm": 0.04981474007074316, "learning_rate": 0.0007887611417690958, "loss": 1.2392, "step": 2365 }, { "epoch": 0.7486377635631367, "grad_norm": 0.06439611345911384, "learning_rate": 0.0007876345771369564, "loss": 1.4903, "step": 2370 }, { "epoch": 0.7502171681276159, "grad_norm": 0.051881744416978126, "learning_rate": 0.0007865058258102177, "loss": 1.2935, "step": 2375 }, { "epoch": 0.7517965726920951, "grad_norm": 0.05357331864302873, "learning_rate": 0.0007853748963700264, "loss": 1.3526, "step": 2380 }, { "epoch": 0.7533759772565742, "grad_norm": 0.06386942141934505, "learning_rate": 0.0007842417974140879, "loss": 1.3086, "step": 2385 }, { "epoch": 0.7549553818210535, "grad_norm": 0.04530779470012284, "learning_rate": 0.0007831065375566004, "loss": 1.2277, "step": 2390 }, { "epoch": 0.7565347863855326, "grad_norm": 0.055617635761542134, "learning_rate": 0.0007819691254281905, "loss": 1.2829, "step": 2395 }, { "epoch": 0.7581141909500119, "grad_norm": 0.05012703398043827, "learning_rate": 0.0007808295696758472, "loss": 1.276, "step": 2400 }, { "epoch": 0.759693595514491, "grad_norm": 0.04975417148729171, "learning_rate": 0.0007796878789628555, "loss": 1.2577, "step": 2405 }, { "epoch": 0.7612730000789703, "grad_norm": 0.059111194132562835, "learning_rate": 0.0007785440619687316, "loss": 1.3151, "step": 2410 }, { "epoch": 0.7628524046434494, "grad_norm": 0.061633118850367014, "learning_rate": 0.0007773981273891562, "loss": 1.2997, "step": 2415 }, { "epoch": 0.7644318092079286, "grad_norm": 0.05127342678663448, "learning_rate": 0.0007762500839359084, "loss": 1.3294, "step": 2420 }, { "epoch": 0.7660112137724078, "grad_norm": 0.050750410222519485, "learning_rate": 0.0007750999403368001, "loss": 1.3007, "step": 2425 }, { "epoch": 0.767590618336887, "grad_norm": 0.055084272724464874, "learning_rate": 0.000773947705335609, "loss": 1.2859, "step": 2430 }, { "epoch": 0.7691700229013662, "grad_norm": 0.06301477368466984, "learning_rate": 0.0007727933876920121, "loss": 1.2889, "step": 2435 }, { "epoch": 0.7707494274658454, "grad_norm": 0.07416103030072384, "learning_rate": 0.0007716369961815199, "loss": 1.3424, "step": 2440 }, { "epoch": 0.7723288320303245, "grad_norm": 0.0636902229287527, "learning_rate": 0.0007704785395954085, "loss": 1.2522, "step": 2445 }, { "epoch": 0.7739082365948038, "grad_norm": 0.061113934430977956, "learning_rate": 0.0007693180267406539, "loss": 1.3076, "step": 2450 }, { "epoch": 0.7754876411592829, "grad_norm": 0.058392939762995144, "learning_rate": 0.000768155466439864, "loss": 1.3074, "step": 2455 }, { "epoch": 0.7770670457237622, "grad_norm": 0.05697368473282818, "learning_rate": 0.0007669908675312128, "loss": 1.3191, "step": 2460 }, { "epoch": 0.7786464502882413, "grad_norm": 0.22426380204225607, "learning_rate": 0.000765824238868372, "loss": 1.3071, "step": 2465 }, { "epoch": 0.7802258548527206, "grad_norm": 0.050952692832539614, "learning_rate": 0.0007646555893204442, "loss": 1.2757, "step": 2470 }, { "epoch": 0.7818052594171997, "grad_norm": 0.049895871874697476, "learning_rate": 0.0007634849277718956, "loss": 1.2902, "step": 2475 }, { "epoch": 0.7833846639816789, "grad_norm": 0.060870514702811115, "learning_rate": 0.0007623122631224881, "loss": 1.2551, "step": 2480 }, { "epoch": 0.7849640685461581, "grad_norm": 0.06418964651348986, "learning_rate": 0.0007611376042872121, "loss": 1.3117, "step": 2485 }, { "epoch": 0.7865434731106373, "grad_norm": 0.06685476400002326, "learning_rate": 0.0007599609601962183, "loss": 1.323, "step": 2490 }, { "epoch": 0.7881228776751165, "grad_norm": 0.05815267274160649, "learning_rate": 0.00075878233979475, "loss": 1.3109, "step": 2495 }, { "epoch": 0.7897022822395957, "grad_norm": 0.06148159308199383, "learning_rate": 0.000757601752043075, "loss": 1.2961, "step": 2500 }, { "epoch": 0.7912816868040748, "grad_norm": 0.048940104859922375, "learning_rate": 0.0007564192059164176, "loss": 1.3022, "step": 2505 }, { "epoch": 0.7928610913685541, "grad_norm": 0.05895366657669295, "learning_rate": 0.0007552347104048908, "loss": 1.3623, "step": 2510 }, { "epoch": 0.7944404959330332, "grad_norm": 0.049777352684372916, "learning_rate": 0.0007540482745134266, "loss": 1.2654, "step": 2515 }, { "epoch": 0.7960199004975125, "grad_norm": 0.05933219102543363, "learning_rate": 0.000752859907261709, "loss": 1.335, "step": 2520 }, { "epoch": 0.7975993050619916, "grad_norm": 0.05898104122576166, "learning_rate": 0.0007516696176841048, "loss": 1.3209, "step": 2525 }, { "epoch": 0.7991787096264709, "grad_norm": 0.06197483095616694, "learning_rate": 0.000750477414829595, "loss": 1.4343, "step": 2530 }, { "epoch": 0.80075811419095, "grad_norm": 0.05583353496744585, "learning_rate": 0.0007492833077617053, "loss": 1.3832, "step": 2535 }, { "epoch": 0.8023375187554292, "grad_norm": 0.056899647556575145, "learning_rate": 0.0007480873055584392, "loss": 1.2935, "step": 2540 }, { "epoch": 0.8039169233199084, "grad_norm": 0.053549074551499466, "learning_rate": 0.0007468894173122063, "loss": 1.3332, "step": 2545 }, { "epoch": 0.8054963278843876, "grad_norm": 0.11833801823929635, "learning_rate": 0.0007456896521297554, "loss": 1.3703, "step": 2550 }, { "epoch": 0.8070757324488668, "grad_norm": 0.05442432208840996, "learning_rate": 0.000744488019132104, "loss": 1.3426, "step": 2555 }, { "epoch": 0.808655137013346, "grad_norm": 0.05398769138904237, "learning_rate": 0.0007432845274544695, "loss": 1.2624, "step": 2560 }, { "epoch": 0.8102345415778252, "grad_norm": 0.0532093395003247, "learning_rate": 0.0007420791862461997, "loss": 1.2494, "step": 2565 }, { "epoch": 0.8118139461423044, "grad_norm": 0.05494748200852774, "learning_rate": 0.0007408720046707027, "loss": 1.2733, "step": 2570 }, { "epoch": 0.8133933507067835, "grad_norm": 0.052022918664222595, "learning_rate": 0.0007396629919053785, "loss": 1.3164, "step": 2575 }, { "epoch": 0.8149727552712628, "grad_norm": 0.09164397548792244, "learning_rate": 0.0007384521571415475, "loss": 1.3717, "step": 2580 }, { "epoch": 0.8165521598357419, "grad_norm": 0.05478777008078351, "learning_rate": 0.0007372395095843823, "loss": 1.2704, "step": 2585 }, { "epoch": 0.8181315644002212, "grad_norm": 0.0633663135818474, "learning_rate": 0.0007360250584528363, "loss": 1.4264, "step": 2590 }, { "epoch": 0.8197109689647003, "grad_norm": 0.05289622321168728, "learning_rate": 0.000734808812979575, "loss": 1.3242, "step": 2595 }, { "epoch": 0.8212903735291796, "grad_norm": 0.05041228681851876, "learning_rate": 0.0007335907824109046, "loss": 1.2925, "step": 2600 }, { "epoch": 0.8228697780936587, "grad_norm": 0.05214184770483889, "learning_rate": 0.0007323709760067023, "loss": 1.2658, "step": 2605 }, { "epoch": 0.8244491826581379, "grad_norm": 0.05055364627627826, "learning_rate": 0.0007311494030403458, "loss": 1.3002, "step": 2610 }, { "epoch": 0.8260285872226171, "grad_norm": 0.06105874504303281, "learning_rate": 0.0007299260727986428, "loss": 1.2909, "step": 2615 }, { "epoch": 0.8276079917870963, "grad_norm": 0.04669923289433585, "learning_rate": 0.0007287009945817605, "loss": 1.2559, "step": 2620 }, { "epoch": 0.8291873963515755, "grad_norm": 0.045667524781441184, "learning_rate": 0.0007274741777031544, "loss": 1.3321, "step": 2625 }, { "epoch": 0.8307668009160546, "grad_norm": 0.05938746464903109, "learning_rate": 0.0007262456314894985, "loss": 1.2775, "step": 2630 }, { "epoch": 0.8323462054805338, "grad_norm": 0.0553091144869122, "learning_rate": 0.0007250153652806133, "loss": 1.3062, "step": 2635 }, { "epoch": 0.833925610045013, "grad_norm": 0.052999806811031884, "learning_rate": 0.0007237833884293955, "loss": 1.2294, "step": 2640 }, { "epoch": 0.8355050146094922, "grad_norm": 0.05085996670338277, "learning_rate": 0.0007225497103017467, "loss": 1.2676, "step": 2645 }, { "epoch": 0.8370844191739714, "grad_norm": 0.057124565018352835, "learning_rate": 0.0007213143402765021, "loss": 1.3006, "step": 2650 }, { "epoch": 0.8386638237384506, "grad_norm": 0.0501978932135094, "learning_rate": 0.0007200772877453593, "loss": 1.3422, "step": 2655 }, { "epoch": 0.8402432283029297, "grad_norm": 0.05325201673490316, "learning_rate": 0.0007188385621128067, "loss": 1.229, "step": 2660 }, { "epoch": 0.841822632867409, "grad_norm": 0.04301259602368203, "learning_rate": 0.0007175981727960526, "loss": 1.2179, "step": 2665 }, { "epoch": 0.8434020374318881, "grad_norm": 0.05135001443269447, "learning_rate": 0.0007163561292249525, "loss": 1.3666, "step": 2670 }, { "epoch": 0.8449814419963674, "grad_norm": 0.06907394272762366, "learning_rate": 0.0007151124408419389, "loss": 1.2576, "step": 2675 }, { "epoch": 0.8465608465608465, "grad_norm": 0.05008472579830961, "learning_rate": 0.0007138671171019481, "loss": 1.3306, "step": 2680 }, { "epoch": 0.8481402511253258, "grad_norm": 0.050261438955579905, "learning_rate": 0.0007126201674723492, "loss": 1.2862, "step": 2685 }, { "epoch": 0.8497196556898049, "grad_norm": 0.055796107658739394, "learning_rate": 0.000711371601432872, "loss": 1.3433, "step": 2690 }, { "epoch": 0.8512990602542841, "grad_norm": 0.07211341298369231, "learning_rate": 0.0007101214284755344, "loss": 1.2952, "step": 2695 }, { "epoch": 0.8528784648187633, "grad_norm": 0.05626358077268197, "learning_rate": 0.000708869658104571, "loss": 1.2881, "step": 2700 }, { "epoch": 0.8544578693832425, "grad_norm": 0.05284449225003599, "learning_rate": 0.0007076162998363603, "loss": 1.2775, "step": 2705 }, { "epoch": 0.8560372739477217, "grad_norm": 0.05004959630878785, "learning_rate": 0.0007063613631993523, "loss": 1.2861, "step": 2710 }, { "epoch": 0.8576166785122009, "grad_norm": 0.05183847623720609, "learning_rate": 0.0007051048577339968, "loss": 1.302, "step": 2715 }, { "epoch": 0.85919608307668, "grad_norm": 0.050897108274268923, "learning_rate": 0.00070384679299267, "loss": 1.2803, "step": 2720 }, { "epoch": 0.8607754876411593, "grad_norm": 0.05161616394129246, "learning_rate": 0.0007025871785396023, "loss": 1.3534, "step": 2725 }, { "epoch": 0.8623548922056384, "grad_norm": 0.06042824204693549, "learning_rate": 0.0007013260239508055, "loss": 1.3633, "step": 2730 }, { "epoch": 0.8639342967701177, "grad_norm": 0.052461769000716627, "learning_rate": 0.0007000633388140002, "loss": 1.363, "step": 2735 }, { "epoch": 0.8655137013345968, "grad_norm": 0.04738855548103082, "learning_rate": 0.0006987991327285425, "loss": 1.2447, "step": 2740 }, { "epoch": 0.8670931058990761, "grad_norm": 0.04455014508469419, "learning_rate": 0.0006975334153053517, "loss": 1.243, "step": 2745 }, { "epoch": 0.8686725104635552, "grad_norm": 0.04474718982439998, "learning_rate": 0.0006962661961668362, "loss": 1.27, "step": 2750 }, { "epoch": 0.8702519150280345, "grad_norm": 0.0498969419339262, "learning_rate": 0.0006949974849468212, "loss": 1.2951, "step": 2755 }, { "epoch": 0.8718313195925136, "grad_norm": 0.043037040813865536, "learning_rate": 0.0006937272912904755, "loss": 1.2889, "step": 2760 }, { "epoch": 0.8734107241569928, "grad_norm": 0.06669453461251541, "learning_rate": 0.0006924556248542373, "loss": 1.3178, "step": 2765 }, { "epoch": 0.874990128721472, "grad_norm": 0.054889866483026135, "learning_rate": 0.0006911824953057419, "loss": 1.2803, "step": 2770 }, { "epoch": 0.8765695332859512, "grad_norm": 0.04564998819772629, "learning_rate": 0.0006899079123237473, "loss": 1.3115, "step": 2775 }, { "epoch": 0.8781489378504304, "grad_norm": 0.05167587647575047, "learning_rate": 0.0006886318855980611, "loss": 1.3558, "step": 2780 }, { "epoch": 0.8797283424149096, "grad_norm": 0.053308365263521236, "learning_rate": 0.0006873544248294671, "loss": 1.2645, "step": 2785 }, { "epoch": 0.8813077469793887, "grad_norm": 0.05518057727149211, "learning_rate": 0.0006860755397296505, "loss": 1.2705, "step": 2790 }, { "epoch": 0.882887151543868, "grad_norm": 0.05935421375050099, "learning_rate": 0.0006847952400211252, "loss": 1.3036, "step": 2795 }, { "epoch": 0.8844665561083471, "grad_norm": 0.057519691887075924, "learning_rate": 0.0006835135354371593, "loss": 1.3206, "step": 2800 }, { "epoch": 0.8860459606728264, "grad_norm": 0.06148386906328311, "learning_rate": 0.0006822304357217013, "loss": 1.2937, "step": 2805 }, { "epoch": 0.8876253652373055, "grad_norm": 0.07348742082785968, "learning_rate": 0.0006809459506293057, "loss": 1.2954, "step": 2810 }, { "epoch": 0.8892047698017848, "grad_norm": 0.05757282373092205, "learning_rate": 0.0006796600899250596, "loss": 1.233, "step": 2815 }, { "epoch": 0.8907841743662639, "grad_norm": 0.045064365699254316, "learning_rate": 0.0006783728633845076, "loss": 1.2929, "step": 2820 }, { "epoch": 0.8923635789307431, "grad_norm": 0.06142818212608352, "learning_rate": 0.0006770842807935777, "loss": 1.3022, "step": 2825 }, { "epoch": 0.8939429834952223, "grad_norm": 0.05002467602169614, "learning_rate": 0.0006757943519485075, "loss": 1.2967, "step": 2830 }, { "epoch": 0.8955223880597015, "grad_norm": 0.0482242154684967, "learning_rate": 0.0006745030866557691, "loss": 1.2781, "step": 2835 }, { "epoch": 0.8971017926241807, "grad_norm": 0.057406693868650424, "learning_rate": 0.0006732104947319942, "loss": 1.2626, "step": 2840 }, { "epoch": 0.8986811971886599, "grad_norm": 0.05409220711152165, "learning_rate": 0.0006719165860039009, "loss": 1.3079, "step": 2845 }, { "epoch": 0.900260601753139, "grad_norm": 0.050353619175419514, "learning_rate": 0.0006706213703082176, "loss": 1.311, "step": 2850 }, { "epoch": 0.9018400063176183, "grad_norm": 0.07093165501889157, "learning_rate": 0.0006693248574916086, "loss": 1.2699, "step": 2855 }, { "epoch": 0.9034194108820974, "grad_norm": 0.04932088359891142, "learning_rate": 0.0006680270574105997, "loss": 1.295, "step": 2860 }, { "epoch": 0.9049988154465767, "grad_norm": 0.05028143825392374, "learning_rate": 0.0006667279799315025, "loss": 1.2551, "step": 2865 }, { "epoch": 0.9065782200110558, "grad_norm": 0.05161132551605002, "learning_rate": 0.0006654276349303402, "loss": 1.2411, "step": 2870 }, { "epoch": 0.9081576245755351, "grad_norm": 0.049656406698105934, "learning_rate": 0.0006641260322927718, "loss": 1.2521, "step": 2875 }, { "epoch": 0.9097370291400142, "grad_norm": 0.05369045417032218, "learning_rate": 0.0006628231819140175, "loss": 1.309, "step": 2880 }, { "epoch": 0.9113164337044934, "grad_norm": 0.060611885480271493, "learning_rate": 0.0006615190936987833, "loss": 1.2808, "step": 2885 }, { "epoch": 0.9128958382689726, "grad_norm": 0.05159903759212243, "learning_rate": 0.0006602137775611853, "loss": 1.2452, "step": 2890 }, { "epoch": 0.9144752428334518, "grad_norm": 0.05429625539910252, "learning_rate": 0.000658907243424675, "loss": 1.2332, "step": 2895 }, { "epoch": 0.916054647397931, "grad_norm": 0.056197351059949, "learning_rate": 0.0006575995012219636, "loss": 1.2958, "step": 2900 }, { "epoch": 0.9176340519624102, "grad_norm": 0.06554213191346638, "learning_rate": 0.000656290560894946, "loss": 1.3888, "step": 2905 }, { "epoch": 0.9192134565268893, "grad_norm": 0.05556131120583228, "learning_rate": 0.0006549804323946261, "loss": 1.2571, "step": 2910 }, { "epoch": 0.9207928610913686, "grad_norm": 0.048005577582656306, "learning_rate": 0.0006536691256810404, "loss": 1.2658, "step": 2915 }, { "epoch": 0.9223722656558477, "grad_norm": 0.061009705763200466, "learning_rate": 0.0006523566507231827, "loss": 1.2675, "step": 2920 }, { "epoch": 0.923951670220327, "grad_norm": 0.05481675089467333, "learning_rate": 0.0006510430174989281, "loss": 1.2358, "step": 2925 }, { "epoch": 0.9255310747848061, "grad_norm": 0.05239077766197509, "learning_rate": 0.0006497282359949574, "loss": 1.2372, "step": 2930 }, { "epoch": 0.9271104793492854, "grad_norm": 0.05038994440682631, "learning_rate": 0.000648412316206681, "loss": 1.2408, "step": 2935 }, { "epoch": 0.9286898839137645, "grad_norm": 0.04933368187746228, "learning_rate": 0.0006470952681381626, "loss": 1.2815, "step": 2940 }, { "epoch": 0.9302692884782437, "grad_norm": 0.058428926376558894, "learning_rate": 0.0006457771018020435, "loss": 1.285, "step": 2945 }, { "epoch": 0.9318486930427229, "grad_norm": 0.04566703502250902, "learning_rate": 0.0006444578272194672, "loss": 1.2971, "step": 2950 }, { "epoch": 0.9334280976072021, "grad_norm": 0.0492754689134332, "learning_rate": 0.0006431374544200013, "loss": 1.2671, "step": 2955 }, { "epoch": 0.9350075021716813, "grad_norm": 0.057064269714189084, "learning_rate": 0.0006418159934415634, "loss": 1.2532, "step": 2960 }, { "epoch": 0.9365869067361605, "grad_norm": 0.04852323477283318, "learning_rate": 0.0006404934543303431, "loss": 1.3066, "step": 2965 }, { "epoch": 0.9381663113006397, "grad_norm": 0.04968318312053378, "learning_rate": 0.0006391698471407269, "loss": 1.2498, "step": 2970 }, { "epoch": 0.9397457158651189, "grad_norm": 0.05661294787265637, "learning_rate": 0.0006378451819352206, "loss": 1.2997, "step": 2975 }, { "epoch": 0.941325120429598, "grad_norm": 0.04818330209059537, "learning_rate": 0.0006365194687843743, "loss": 1.3842, "step": 2980 }, { "epoch": 0.9429045249940773, "grad_norm": 0.052949339413494216, "learning_rate": 0.0006351927177667036, "loss": 1.2445, "step": 2985 }, { "epoch": 0.9444839295585564, "grad_norm": 0.052722385485403214, "learning_rate": 0.0006338649389686157, "loss": 1.329, "step": 2990 }, { "epoch": 0.9460633341230357, "grad_norm": 0.051542067025464385, "learning_rate": 0.0006325361424843304, "loss": 1.3557, "step": 2995 }, { "epoch": 0.9476427386875148, "grad_norm": 0.0529872784793377, "learning_rate": 0.0006312063384158043, "loss": 1.2486, "step": 3000 }, { "epoch": 0.9492221432519939, "grad_norm": 0.0863270122501852, "learning_rate": 0.0006298755368726548, "loss": 1.3602, "step": 3005 }, { "epoch": 0.9508015478164732, "grad_norm": 0.11669922847923993, "learning_rate": 0.0006285437479720817, "loss": 1.3763, "step": 3010 }, { "epoch": 0.9523809523809523, "grad_norm": 0.047995729414332895, "learning_rate": 0.0006272109818387909, "loss": 1.3285, "step": 3015 }, { "epoch": 0.9539603569454316, "grad_norm": 0.6873721495781745, "learning_rate": 0.0006258772486049185, "loss": 1.3633, "step": 3020 }, { "epoch": 0.9555397615099107, "grad_norm": 0.0687577244501574, "learning_rate": 0.0006245425584099518, "loss": 1.3682, "step": 3025 }, { "epoch": 0.95711916607439, "grad_norm": 0.09740126200537633, "learning_rate": 0.0006232069214006536, "loss": 1.2846, "step": 3030 }, { "epoch": 0.9586985706388691, "grad_norm": 0.056010338911490785, "learning_rate": 0.000621870347730985, "loss": 1.3712, "step": 3035 }, { "epoch": 0.9602779752033483, "grad_norm": 0.05918236057004626, "learning_rate": 0.0006205328475620275, "loss": 1.3518, "step": 3040 }, { "epoch": 0.9618573797678275, "grad_norm": 0.05108646024107058, "learning_rate": 0.0006191944310619065, "loss": 1.3347, "step": 3045 }, { "epoch": 0.9634367843323067, "grad_norm": 0.060216327103479884, "learning_rate": 0.0006178551084057134, "loss": 1.3832, "step": 3050 }, { "epoch": 0.9650161888967859, "grad_norm": 0.053003700874160474, "learning_rate": 0.0006165148897754282, "loss": 1.2423, "step": 3055 }, { "epoch": 0.9665955934612651, "grad_norm": 0.05801080935435213, "learning_rate": 0.0006151737853598432, "loss": 1.3104, "step": 3060 }, { "epoch": 0.9681749980257442, "grad_norm": 0.05436846583498851, "learning_rate": 0.0006138318053544842, "loss": 1.2213, "step": 3065 }, { "epoch": 0.9697544025902235, "grad_norm": 0.05331705002327234, "learning_rate": 0.0006124889599615336, "loss": 1.2131, "step": 3070 }, { "epoch": 0.9713338071547026, "grad_norm": 0.10367210422595395, "learning_rate": 0.0006111452593897526, "loss": 1.225, "step": 3075 }, { "epoch": 0.9729132117191819, "grad_norm": 0.07246824562536246, "learning_rate": 0.0006098007138544044, "loss": 1.3255, "step": 3080 }, { "epoch": 0.974492616283661, "grad_norm": 0.05294366902696412, "learning_rate": 0.0006084553335771749, "loss": 1.2815, "step": 3085 }, { "epoch": 0.9760720208481403, "grad_norm": 0.05094564088811718, "learning_rate": 0.0006071091287860972, "loss": 1.2487, "step": 3090 }, { "epoch": 0.9776514254126194, "grad_norm": 0.05503028376365873, "learning_rate": 0.0006057621097154715, "loss": 1.3431, "step": 3095 }, { "epoch": 0.9792308299770986, "grad_norm": 0.05317844093013525, "learning_rate": 0.000604414286605789, "loss": 1.3419, "step": 3100 }, { "epoch": 0.9808102345415778, "grad_norm": 0.04852946839125656, "learning_rate": 0.0006030656697036534, "loss": 1.2479, "step": 3105 }, { "epoch": 0.982389639106057, "grad_norm": 0.0421790199699299, "learning_rate": 0.0006017162692617031, "loss": 1.2835, "step": 3110 }, { "epoch": 0.9839690436705362, "grad_norm": 0.045558337973098266, "learning_rate": 0.0006003660955385331, "loss": 1.301, "step": 3115 }, { "epoch": 0.9855484482350154, "grad_norm": 0.04591963430912712, "learning_rate": 0.0005990151587986171, "loss": 1.3364, "step": 3120 }, { "epoch": 0.9871278527994946, "grad_norm": 0.0455745775042671, "learning_rate": 0.0005976634693122298, "loss": 1.2722, "step": 3125 }, { "epoch": 0.9887072573639738, "grad_norm": 0.06495971833657611, "learning_rate": 0.0005963110373553686, "loss": 1.3027, "step": 3130 }, { "epoch": 0.9902866619284529, "grad_norm": 0.06868434888178966, "learning_rate": 0.0005949578732096746, "loss": 1.2498, "step": 3135 }, { "epoch": 0.9918660664929322, "grad_norm": 0.053918576354851755, "learning_rate": 0.0005936039871623563, "loss": 1.2833, "step": 3140 }, { "epoch": 0.9934454710574113, "grad_norm": 0.07466139277385875, "learning_rate": 0.0005922493895061098, "loss": 1.2109, "step": 3145 }, { "epoch": 0.9950248756218906, "grad_norm": 0.05736613292188317, "learning_rate": 0.0005908940905390408, "loss": 1.2249, "step": 3150 }, { "epoch": 0.9966042801863697, "grad_norm": 0.05383014554119235, "learning_rate": 0.0005895381005645874, "loss": 1.3811, "step": 3155 }, { "epoch": 0.998183684750849, "grad_norm": 0.052781430512047464, "learning_rate": 0.0005881814298914402, "loss": 1.2891, "step": 3160 }, { "epoch": 0.9997630893153281, "grad_norm": 0.055866315504821615, "learning_rate": 0.0005868240888334653, "loss": 1.27, "step": 3165 }, { "epoch": 0.9997630893153281, "eval_loss": 1.267873764038086, "eval_runtime": 202.0814, "eval_samples_per_second": 13.109, "eval_steps_per_second": 3.281, "step": 3165 }, { "epoch": 1.0013424938798072, "grad_norm": 0.05035870911007468, "learning_rate": 0.0005854660877096246, "loss": 1.3401, "step": 3170 }, { "epoch": 1.0029218984442865, "grad_norm": 0.047142259265301634, "learning_rate": 0.000584107436843899, "loss": 1.2192, "step": 3175 }, { "epoch": 1.0045013030087657, "grad_norm": 0.05095255176878271, "learning_rate": 0.0005827481465652079, "loss": 1.1955, "step": 3180 }, { "epoch": 1.006080707573245, "grad_norm": 0.05796765788656545, "learning_rate": 0.0005813882272073325, "loss": 1.1722, "step": 3185 }, { "epoch": 1.007660112137724, "grad_norm": 0.05743385714892482, "learning_rate": 0.0005800276891088362, "loss": 1.1755, "step": 3190 }, { "epoch": 1.0092395167022032, "grad_norm": 0.055621365938881534, "learning_rate": 0.0005786665426129862, "loss": 1.1972, "step": 3195 }, { "epoch": 1.0108189212666825, "grad_norm": 0.05343510159468665, "learning_rate": 0.000577304798067675, "loss": 1.2131, "step": 3200 }, { "epoch": 1.0123983258311617, "grad_norm": 0.05616075327365182, "learning_rate": 0.0005759424658253418, "loss": 1.2413, "step": 3205 }, { "epoch": 1.0139777303956408, "grad_norm": 0.06771875337555067, "learning_rate": 0.0005745795562428936, "loss": 1.2355, "step": 3210 }, { "epoch": 1.01555713496012, "grad_norm": 0.05638970081916908, "learning_rate": 0.0005732160796816266, "loss": 1.183, "step": 3215 }, { "epoch": 1.0171365395245993, "grad_norm": 0.08812048570157387, "learning_rate": 0.000571852046507147, "loss": 1.2842, "step": 3220 }, { "epoch": 1.0187159440890785, "grad_norm": 0.06288541185488564, "learning_rate": 0.0005704874670892929, "loss": 1.2258, "step": 3225 }, { "epoch": 1.0202953486535575, "grad_norm": 0.052806128961488744, "learning_rate": 0.000569122351802055, "loss": 1.2926, "step": 3230 }, { "epoch": 1.0218747532180368, "grad_norm": 0.05813750252759059, "learning_rate": 0.000567756711023498, "loss": 1.2403, "step": 3235 }, { "epoch": 1.023454157782516, "grad_norm": 0.0613807315554161, "learning_rate": 0.0005663905551356816, "loss": 1.2549, "step": 3240 }, { "epoch": 1.0250335623469953, "grad_norm": 0.053285044869060515, "learning_rate": 0.0005650238945245811, "loss": 1.213, "step": 3245 }, { "epoch": 1.0266129669114743, "grad_norm": 0.0712811077009441, "learning_rate": 0.0005636567395800095, "loss": 1.2404, "step": 3250 }, { "epoch": 1.0281923714759535, "grad_norm": 0.04695698793273257, "learning_rate": 0.0005622891006955374, "loss": 1.2869, "step": 3255 }, { "epoch": 1.0297717760404328, "grad_norm": 0.05490571027494214, "learning_rate": 0.0005609209882684147, "loss": 1.1743, "step": 3260 }, { "epoch": 1.031351180604912, "grad_norm": 0.04793676317706356, "learning_rate": 0.0005595524126994912, "loss": 1.2458, "step": 3265 }, { "epoch": 1.032930585169391, "grad_norm": 0.055293709501721786, "learning_rate": 0.0005581833843931377, "loss": 1.2179, "step": 3270 }, { "epoch": 1.0345099897338703, "grad_norm": 0.12300015507981796, "learning_rate": 0.0005568139137571671, "loss": 1.2151, "step": 3275 }, { "epoch": 1.0360893942983496, "grad_norm": 0.05416354144866632, "learning_rate": 0.0005554440112027546, "loss": 1.2032, "step": 3280 }, { "epoch": 1.0376687988628288, "grad_norm": 0.060778677095964746, "learning_rate": 0.0005540736871443595, "loss": 1.2222, "step": 3285 }, { "epoch": 1.0392482034273078, "grad_norm": 0.06703735588611826, "learning_rate": 0.0005527029519996448, "loss": 1.2306, "step": 3290 }, { "epoch": 1.040827607991787, "grad_norm": 0.04998163573837245, "learning_rate": 0.0005513318161893996, "loss": 1.211, "step": 3295 }, { "epoch": 1.0424070125562663, "grad_norm": 0.045145905224527924, "learning_rate": 0.0005499602901374582, "loss": 1.2057, "step": 3300 }, { "epoch": 1.0439864171207456, "grad_norm": 0.04561642723799816, "learning_rate": 0.0005485883842706224, "loss": 1.1644, "step": 3305 }, { "epoch": 1.0455658216852246, "grad_norm": 0.05227030246112335, "learning_rate": 0.0005472161090185806, "loss": 1.2585, "step": 3310 }, { "epoch": 1.0471452262497039, "grad_norm": 0.04462028281094253, "learning_rate": 0.0005458434748138302, "loss": 1.1883, "step": 3315 }, { "epoch": 1.048724630814183, "grad_norm": 0.04991981223265658, "learning_rate": 0.0005444704920915971, "loss": 1.2699, "step": 3320 }, { "epoch": 1.0503040353786623, "grad_norm": 0.04957041044070128, "learning_rate": 0.0005430971712897566, "loss": 1.32, "step": 3325 }, { "epoch": 1.0518834399431414, "grad_norm": 0.057066641733610125, "learning_rate": 0.0005417235228487546, "loss": 1.2507, "step": 3330 }, { "epoch": 1.0534628445076206, "grad_norm": 0.049763415724097905, "learning_rate": 0.0005403495572115275, "loss": 1.2176, "step": 3335 }, { "epoch": 1.0550422490720999, "grad_norm": 0.04814115266612058, "learning_rate": 0.0005389752848234234, "loss": 1.2091, "step": 3340 }, { "epoch": 1.0566216536365791, "grad_norm": 0.05386852437447205, "learning_rate": 0.000537600716132122, "loss": 1.1707, "step": 3345 }, { "epoch": 1.0582010582010581, "grad_norm": 0.046324510368052574, "learning_rate": 0.0005362258615875562, "loss": 1.2973, "step": 3350 }, { "epoch": 1.0597804627655374, "grad_norm": 0.054537381346128636, "learning_rate": 0.0005348507316418313, "loss": 1.2335, "step": 3355 }, { "epoch": 1.0613598673300166, "grad_norm": 0.04881411091627748, "learning_rate": 0.000533475336749147, "loss": 1.2041, "step": 3360 }, { "epoch": 1.0629392718944959, "grad_norm": 0.043326954235590895, "learning_rate": 0.0005320996873657167, "loss": 1.1483, "step": 3365 }, { "epoch": 1.064518676458975, "grad_norm": 0.05750219491994759, "learning_rate": 0.000530723793949689, "loss": 1.2237, "step": 3370 }, { "epoch": 1.0660980810234542, "grad_norm": 0.050600646123255585, "learning_rate": 0.0005293476669610673, "loss": 1.2017, "step": 3375 }, { "epoch": 1.0676774855879334, "grad_norm": 0.0504893464147265, "learning_rate": 0.0005279713168616309, "loss": 1.1337, "step": 3380 }, { "epoch": 1.0692568901524124, "grad_norm": 0.05228384007812538, "learning_rate": 0.0005265947541148553, "loss": 1.2183, "step": 3385 }, { "epoch": 1.0708362947168917, "grad_norm": 0.04327133423430133, "learning_rate": 0.0005252179891858326, "loss": 1.1753, "step": 3390 }, { "epoch": 1.072415699281371, "grad_norm": 0.045133242524802275, "learning_rate": 0.0005238410325411917, "loss": 1.2218, "step": 3395 }, { "epoch": 1.0739951038458502, "grad_norm": 0.057270386313069306, "learning_rate": 0.0005224638946490191, "loss": 1.2235, "step": 3400 }, { "epoch": 1.0755745084103294, "grad_norm": 0.04985205797785902, "learning_rate": 0.0005210865859787794, "loss": 1.2735, "step": 3405 }, { "epoch": 1.0771539129748084, "grad_norm": 0.04995715482190649, "learning_rate": 0.0005197091170012356, "loss": 1.1698, "step": 3410 }, { "epoch": 1.0787333175392877, "grad_norm": 0.04908533294087533, "learning_rate": 0.000518331498188369, "loss": 1.25, "step": 3415 }, { "epoch": 1.080312722103767, "grad_norm": 0.0475787399609696, "learning_rate": 0.0005169537400133002, "loss": 1.2439, "step": 3420 }, { "epoch": 1.081892126668246, "grad_norm": 0.04943615366036836, "learning_rate": 0.0005155758529502095, "loss": 1.2309, "step": 3425 }, { "epoch": 1.0834715312327252, "grad_norm": 0.04845963861117892, "learning_rate": 0.0005141978474742566, "loss": 1.2397, "step": 3430 }, { "epoch": 1.0850509357972045, "grad_norm": 0.046724412684366584, "learning_rate": 0.0005128197340615018, "loss": 1.2136, "step": 3435 }, { "epoch": 1.0866303403616837, "grad_norm": 0.04821348555066909, "learning_rate": 0.0005114415231888257, "loss": 1.1874, "step": 3440 }, { "epoch": 1.0882097449261627, "grad_norm": 0.05221843447417392, "learning_rate": 0.0005100632253338499, "loss": 1.2565, "step": 3445 }, { "epoch": 1.089789149490642, "grad_norm": 0.052114447418510915, "learning_rate": 0.0005086848509748577, "loss": 1.3301, "step": 3450 }, { "epoch": 1.0913685540551212, "grad_norm": 0.05103729232929182, "learning_rate": 0.000507306410590713, "loss": 1.2274, "step": 3455 }, { "epoch": 1.0929479586196005, "grad_norm": 0.04676530977137415, "learning_rate": 0.0005059279146607829, "loss": 1.2522, "step": 3460 }, { "epoch": 1.0945273631840795, "grad_norm": 0.05737033464708882, "learning_rate": 0.0005045493736648556, "loss": 1.2109, "step": 3465 }, { "epoch": 1.0961067677485588, "grad_norm": 0.05360770968312651, "learning_rate": 0.0005031707980830629, "loss": 1.2191, "step": 3470 }, { "epoch": 1.097686172313038, "grad_norm": 0.050856309509504365, "learning_rate": 0.000501792198395799, "loss": 1.1565, "step": 3475 }, { "epoch": 1.0992655768775172, "grad_norm": 0.051885924716512245, "learning_rate": 0.0005004135850836412, "loss": 1.2035, "step": 3480 }, { "epoch": 1.1008449814419963, "grad_norm": 0.050095527252075535, "learning_rate": 0.0004990349686272709, "loss": 1.2293, "step": 3485 }, { "epoch": 1.1024243860064755, "grad_norm": 0.05327285724143124, "learning_rate": 0.0004976563595073929, "loss": 1.248, "step": 3490 }, { "epoch": 1.1040037905709548, "grad_norm": 0.04664971173616498, "learning_rate": 0.0004962777682046565, "loss": 1.2067, "step": 3495 }, { "epoch": 1.105583195135434, "grad_norm": 0.05426737654754492, "learning_rate": 0.0004948992051995756, "loss": 1.2451, "step": 3500 }, { "epoch": 1.107162599699913, "grad_norm": 0.05434094003996787, "learning_rate": 0.0004935206809724488, "loss": 1.1894, "step": 3505 }, { "epoch": 1.1087420042643923, "grad_norm": 0.05815013109132077, "learning_rate": 0.0004921422060032801, "loss": 1.2678, "step": 3510 }, { "epoch": 1.1103214088288715, "grad_norm": 0.0513265845533846, "learning_rate": 0.0004907637907716987, "loss": 1.262, "step": 3515 }, { "epoch": 1.1119008133933508, "grad_norm": 0.054071762992654454, "learning_rate": 0.0004893854457568801, "loss": 1.2709, "step": 3520 }, { "epoch": 1.1134802179578298, "grad_norm": 0.05887334222108996, "learning_rate": 0.0004880071814374656, "loss": 1.163, "step": 3525 }, { "epoch": 1.115059622522309, "grad_norm": 0.058439194083133944, "learning_rate": 0.0004866290082914831, "loss": 1.2863, "step": 3530 }, { "epoch": 1.1166390270867883, "grad_norm": 0.0471215311688571, "learning_rate": 0.00048525093679626746, "loss": 1.2277, "step": 3535 }, { "epoch": 1.1182184316512676, "grad_norm": 0.05643867012777064, "learning_rate": 0.00048387297742838085, "loss": 1.2275, "step": 3540 }, { "epoch": 1.1197978362157466, "grad_norm": 0.06294047972884835, "learning_rate": 0.00048249514066353274, "loss": 1.1598, "step": 3545 }, { "epoch": 1.1213772407802258, "grad_norm": 0.06116682520725429, "learning_rate": 0.0004811174369765008, "loss": 1.16, "step": 3550 }, { "epoch": 1.122956645344705, "grad_norm": 0.04941841448957597, "learning_rate": 0.0004797398768410509, "loss": 1.2275, "step": 3555 }, { "epoch": 1.1245360499091843, "grad_norm": 0.04946526039712102, "learning_rate": 0.0004783624707298574, "loss": 1.2938, "step": 3560 }, { "epoch": 1.1261154544736633, "grad_norm": 0.05245477499304638, "learning_rate": 0.00047698522911442397, "loss": 1.2264, "step": 3565 }, { "epoch": 1.1276948590381426, "grad_norm": 0.0469673225075506, "learning_rate": 0.0004756081624650037, "loss": 1.257, "step": 3570 }, { "epoch": 1.1292742636026218, "grad_norm": 0.06397564318796502, "learning_rate": 0.0004742312812505194, "loss": 1.2016, "step": 3575 }, { "epoch": 1.130853668167101, "grad_norm": 0.05274939739743682, "learning_rate": 0.00047285459593848425, "loss": 1.2339, "step": 3580 }, { "epoch": 1.1324330727315801, "grad_norm": 0.05491560603717785, "learning_rate": 0.00047147811699492227, "loss": 1.2407, "step": 3585 }, { "epoch": 1.1340124772960594, "grad_norm": 0.09116754057360368, "learning_rate": 0.00047010185488428793, "loss": 1.2557, "step": 3590 }, { "epoch": 1.1355918818605386, "grad_norm": 0.05068110504582112, "learning_rate": 0.00046872582006938796, "loss": 1.2021, "step": 3595 }, { "epoch": 1.1371712864250179, "grad_norm": 0.048465699030963726, "learning_rate": 0.00046735002301130093, "loss": 1.1455, "step": 3600 }, { "epoch": 1.1387506909894969, "grad_norm": 0.04804615580495357, "learning_rate": 0.00046597447416929776, "loss": 1.2222, "step": 3605 }, { "epoch": 1.1403300955539761, "grad_norm": 0.047204711378084095, "learning_rate": 0.0004645991840007627, "loss": 1.2177, "step": 3610 }, { "epoch": 1.1419095001184554, "grad_norm": 0.05996137894288037, "learning_rate": 0.00046322416296111296, "loss": 1.2604, "step": 3615 }, { "epoch": 1.1434889046829346, "grad_norm": 0.0527027001045371, "learning_rate": 0.00046184942150372007, "loss": 1.2253, "step": 3620 }, { "epoch": 1.1450683092474137, "grad_norm": 0.05330064972161327, "learning_rate": 0.00046047497007983, "loss": 1.229, "step": 3625 }, { "epoch": 1.146647713811893, "grad_norm": 0.23200136311441616, "learning_rate": 0.0004591008191384838, "loss": 1.1273, "step": 3630 }, { "epoch": 1.1482271183763721, "grad_norm": 0.0626309117846959, "learning_rate": 0.0004577269791264383, "loss": 1.2408, "step": 3635 }, { "epoch": 1.1498065229408514, "grad_norm": 0.05121372603169457, "learning_rate": 0.00045635346048808625, "loss": 1.1992, "step": 3640 }, { "epoch": 1.1513859275053304, "grad_norm": 0.04464962432285668, "learning_rate": 0.0004549802736653775, "loss": 1.2249, "step": 3645 }, { "epoch": 1.1529653320698097, "grad_norm": 0.050175045570946084, "learning_rate": 0.00045360742909773886, "loss": 1.2326, "step": 3650 }, { "epoch": 1.154544736634289, "grad_norm": 0.054473248157898044, "learning_rate": 0.0004522349372219959, "loss": 1.2671, "step": 3655 }, { "epoch": 1.156124141198768, "grad_norm": 0.050333988548290895, "learning_rate": 0.0004508628084722923, "loss": 1.1949, "step": 3660 }, { "epoch": 1.1577035457632472, "grad_norm": 0.04594964781201864, "learning_rate": 0.0004494910532800115, "loss": 1.1776, "step": 3665 }, { "epoch": 1.1592829503277264, "grad_norm": 0.05886231948217266, "learning_rate": 0.00044811968207369675, "loss": 1.2202, "step": 3670 }, { "epoch": 1.1608623548922057, "grad_norm": 0.05448667295237958, "learning_rate": 0.0004467487052789724, "loss": 1.1877, "step": 3675 }, { "epoch": 1.162441759456685, "grad_norm": 0.04787858204740159, "learning_rate": 0.00044537813331846414, "loss": 1.2283, "step": 3680 }, { "epoch": 1.164021164021164, "grad_norm": 0.04293832227492194, "learning_rate": 0.00044400797661172016, "loss": 1.2421, "step": 3685 }, { "epoch": 1.1656005685856432, "grad_norm": 0.048733769912786315, "learning_rate": 0.00044263824557513144, "loss": 1.1808, "step": 3690 }, { "epoch": 1.1671799731501225, "grad_norm": 0.05016340471618699, "learning_rate": 0.00044126895062185324, "loss": 1.3157, "step": 3695 }, { "epoch": 1.1687593777146015, "grad_norm": 0.045822857923059374, "learning_rate": 0.00043990010216172533, "loss": 1.1644, "step": 3700 }, { "epoch": 1.1703387822790807, "grad_norm": 0.05010010843078144, "learning_rate": 0.000438531710601193, "loss": 1.1838, "step": 3705 }, { "epoch": 1.17191818684356, "grad_norm": 0.0558613567460995, "learning_rate": 0.00043716378634322834, "loss": 1.2948, "step": 3710 }, { "epoch": 1.1734975914080392, "grad_norm": 0.08521525810540957, "learning_rate": 0.00043579633978725065, "loss": 1.2011, "step": 3715 }, { "epoch": 1.1750769959725185, "grad_norm": 0.06073099494649541, "learning_rate": 0.00043442938132904767, "loss": 1.1943, "step": 3720 }, { "epoch": 1.1766564005369975, "grad_norm": 0.0488042756703879, "learning_rate": 0.00043306292136069646, "loss": 1.2335, "step": 3725 }, { "epoch": 1.1782358051014767, "grad_norm": 0.04633363315450468, "learning_rate": 0.0004316969702704842, "loss": 1.192, "step": 3730 }, { "epoch": 1.179815209665956, "grad_norm": 0.054397892829131475, "learning_rate": 0.0004303315384428298, "loss": 1.2488, "step": 3735 }, { "epoch": 1.181394614230435, "grad_norm": 0.04890430946153942, "learning_rate": 0.0004289666362582041, "loss": 1.2275, "step": 3740 }, { "epoch": 1.1829740187949143, "grad_norm": 0.05276324666264659, "learning_rate": 0.00042760227409305166, "loss": 1.1646, "step": 3745 }, { "epoch": 1.1845534233593935, "grad_norm": 0.04841086434588433, "learning_rate": 0.0004262384623197116, "loss": 1.2003, "step": 3750 }, { "epoch": 1.1861328279238728, "grad_norm": 0.042224465683642104, "learning_rate": 0.0004248752113063388, "loss": 1.2504, "step": 3755 }, { "epoch": 1.187712232488352, "grad_norm": 0.05591445065437718, "learning_rate": 0.0004235125314168251, "loss": 1.2695, "step": 3760 }, { "epoch": 1.189291637052831, "grad_norm": 0.04246000262674185, "learning_rate": 0.00042215043301072037, "loss": 1.2106, "step": 3765 }, { "epoch": 1.1908710416173103, "grad_norm": 0.039242488853726616, "learning_rate": 0.00042078892644315387, "loss": 1.1321, "step": 3770 }, { "epoch": 1.1924504461817895, "grad_norm": 0.046924780878378945, "learning_rate": 0.0004194280220647556, "loss": 1.2489, "step": 3775 }, { "epoch": 1.1940298507462686, "grad_norm": 0.04465261341407989, "learning_rate": 0.00041806773022157716, "loss": 1.2273, "step": 3780 }, { "epoch": 1.1956092553107478, "grad_norm": 0.046005494043658286, "learning_rate": 0.00041670806125501393, "loss": 1.2281, "step": 3785 }, { "epoch": 1.197188659875227, "grad_norm": 0.04766518543130319, "learning_rate": 0.0004153490255017257, "loss": 1.1463, "step": 3790 }, { "epoch": 1.1987680644397063, "grad_norm": 0.04563834421329947, "learning_rate": 0.00041399063329355853, "loss": 1.2581, "step": 3795 }, { "epoch": 1.2003474690041855, "grad_norm": 0.05071647580007674, "learning_rate": 0.00041263289495746574, "loss": 1.2408, "step": 3800 }, { "epoch": 1.2019268735686646, "grad_norm": 0.05665372272150893, "learning_rate": 0.00041127582081543, "loss": 1.239, "step": 3805 }, { "epoch": 1.2035062781331438, "grad_norm": 0.05096519648779207, "learning_rate": 0.0004099194211843847, "loss": 1.1665, "step": 3810 }, { "epoch": 1.205085682697623, "grad_norm": 0.05541142868692537, "learning_rate": 0.0004085637063761346, "loss": 1.2933, "step": 3815 }, { "epoch": 1.206665087262102, "grad_norm": 0.06191631675915175, "learning_rate": 0.0004072086866972789, "loss": 1.2254, "step": 3820 }, { "epoch": 1.2082444918265813, "grad_norm": 0.04301607408075633, "learning_rate": 0.00040585437244913217, "loss": 1.147, "step": 3825 }, { "epoch": 1.2098238963910606, "grad_norm": 0.05046142131388488, "learning_rate": 0.0004045007739276456, "loss": 1.1995, "step": 3830 }, { "epoch": 1.2114033009555398, "grad_norm": 0.0494053510738388, "learning_rate": 0.0004031479014233297, "loss": 1.1902, "step": 3835 }, { "epoch": 1.212982705520019, "grad_norm": 0.046580989213707145, "learning_rate": 0.0004017957652211753, "loss": 1.1621, "step": 3840 }, { "epoch": 1.214562110084498, "grad_norm": 0.04460881669314604, "learning_rate": 0.00040044437560057567, "loss": 1.2805, "step": 3845 }, { "epoch": 1.2161415146489774, "grad_norm": 0.05196980548749693, "learning_rate": 0.0003990937428352482, "loss": 1.2484, "step": 3850 }, { "epoch": 1.2177209192134566, "grad_norm": 0.048052273444526394, "learning_rate": 0.00039774387719315664, "loss": 1.2795, "step": 3855 }, { "epoch": 1.2193003237779356, "grad_norm": 0.043510167759428266, "learning_rate": 0.00039639478893643257, "loss": 1.2336, "step": 3860 }, { "epoch": 1.2208797283424149, "grad_norm": 0.05787361753131021, "learning_rate": 0.00039504648832129787, "loss": 1.1596, "step": 3865 }, { "epoch": 1.2224591329068941, "grad_norm": 0.04470385130737284, "learning_rate": 0.00039369898559798614, "loss": 1.2257, "step": 3870 }, { "epoch": 1.2240385374713734, "grad_norm": 0.04519656073635029, "learning_rate": 0.0003923522910106656, "loss": 1.2931, "step": 3875 }, { "epoch": 1.2256179420358524, "grad_norm": 0.051898946492035816, "learning_rate": 0.0003910064147973603, "loss": 1.2262, "step": 3880 }, { "epoch": 1.2271973466003316, "grad_norm": 0.04962752107065065, "learning_rate": 0.0003896613671898732, "loss": 1.2302, "step": 3885 }, { "epoch": 1.2287767511648109, "grad_norm": 0.04382782324176238, "learning_rate": 0.00038831715841370745, "loss": 1.1291, "step": 3890 }, { "epoch": 1.2303561557292901, "grad_norm": 0.041316258143942096, "learning_rate": 0.0003869737986879895, "loss": 1.115, "step": 3895 }, { "epoch": 1.2319355602937692, "grad_norm": 0.04739694711784743, "learning_rate": 0.0003856312982253909, "loss": 1.1985, "step": 3900 }, { "epoch": 1.2335149648582484, "grad_norm": 0.043624878938375716, "learning_rate": 0.0003842896672320506, "loss": 1.215, "step": 3905 }, { "epoch": 1.2350943694227277, "grad_norm": 0.04517991655861871, "learning_rate": 0.00038294891590749783, "loss": 1.2334, "step": 3910 }, { "epoch": 1.236673773987207, "grad_norm": 0.05307849772597235, "learning_rate": 0.0003816090544445741, "loss": 1.1373, "step": 3915 }, { "epoch": 1.238253178551686, "grad_norm": 0.04648917580072773, "learning_rate": 0.0003802700930293563, "loss": 1.2171, "step": 3920 }, { "epoch": 1.2398325831161652, "grad_norm": 0.04234443754346936, "learning_rate": 0.00037893204184107803, "loss": 1.1908, "step": 3925 }, { "epoch": 1.2414119876806444, "grad_norm": 0.04561795871505681, "learning_rate": 0.0003775949110520538, "loss": 1.1727, "step": 3930 }, { "epoch": 1.2429913922451237, "grad_norm": 0.04581052168077881, "learning_rate": 0.00037625871082760064, "loss": 1.2348, "step": 3935 }, { "epoch": 1.2445707968096027, "grad_norm": 0.04803461745103152, "learning_rate": 0.00037492345132596113, "loss": 1.2479, "step": 3940 }, { "epoch": 1.246150201374082, "grad_norm": 0.0535848773447006, "learning_rate": 0.0003735891426982262, "loss": 1.217, "step": 3945 }, { "epoch": 1.2477296059385612, "grad_norm": 0.046683351764253464, "learning_rate": 0.00037225579508825803, "loss": 1.2169, "step": 3950 }, { "epoch": 1.2493090105030404, "grad_norm": 0.047863510396069855, "learning_rate": 0.0003709234186326124, "loss": 1.2273, "step": 3955 }, { "epoch": 1.2508884150675195, "grad_norm": 0.04556962809885692, "learning_rate": 0.0003695920234604625, "loss": 1.1632, "step": 3960 }, { "epoch": 1.2524678196319987, "grad_norm": 0.047679886015279296, "learning_rate": 0.00036826161969352137, "loss": 1.1718, "step": 3965 }, { "epoch": 1.254047224196478, "grad_norm": 0.04996504809802695, "learning_rate": 0.00036693221744596476, "loss": 1.2027, "step": 3970 }, { "epoch": 1.255626628760957, "grad_norm": 0.05095536387144604, "learning_rate": 0.000365603826824355, "loss": 1.2738, "step": 3975 }, { "epoch": 1.2572060333254362, "grad_norm": 0.04859791292544802, "learning_rate": 0.00036427645792756335, "loss": 1.2804, "step": 3980 }, { "epoch": 1.2587854378899155, "grad_norm": 0.041920270767893154, "learning_rate": 0.0003629501208466938, "loss": 1.1432, "step": 3985 }, { "epoch": 1.2603648424543947, "grad_norm": 0.04557246913426828, "learning_rate": 0.000361624825665006, "loss": 1.2096, "step": 3990 }, { "epoch": 1.261944247018874, "grad_norm": 0.0437651556127673, "learning_rate": 0.0003603005824578386, "loss": 1.144, "step": 3995 }, { "epoch": 1.263523651583353, "grad_norm": 0.04576083530864447, "learning_rate": 0.00035897740129253296, "loss": 1.2037, "step": 4000 }, { "epoch": 1.2651030561478323, "grad_norm": 0.043571481430130045, "learning_rate": 0.00035765529222835666, "loss": 1.1852, "step": 4005 }, { "epoch": 1.2666824607123115, "grad_norm": 0.049493886829206066, "learning_rate": 0.00035633426531642625, "loss": 1.1967, "step": 4010 }, { "epoch": 1.2682618652767905, "grad_norm": 0.04804942874672699, "learning_rate": 0.00035501433059963194, "loss": 1.1785, "step": 4015 }, { "epoch": 1.2698412698412698, "grad_norm": 0.04487043148291244, "learning_rate": 0.00035369549811256043, "loss": 1.2042, "step": 4020 }, { "epoch": 1.271420674405749, "grad_norm": 0.044329274898826154, "learning_rate": 0.00035237777788141896, "loss": 1.2345, "step": 4025 }, { "epoch": 1.2730000789702283, "grad_norm": 0.056795591957380466, "learning_rate": 0.00035106117992395893, "loss": 1.1738, "step": 4030 }, { "epoch": 1.2745794835347075, "grad_norm": 0.04407855931638142, "learning_rate": 0.00034974571424940007, "loss": 1.2262, "step": 4035 }, { "epoch": 1.2761588880991865, "grad_norm": 0.05758541554750196, "learning_rate": 0.0003484313908583538, "loss": 1.2918, "step": 4040 }, { "epoch": 1.2777382926636658, "grad_norm": 0.045767611132417015, "learning_rate": 0.0003471182197427477, "loss": 1.1998, "step": 4045 }, { "epoch": 1.279317697228145, "grad_norm": 0.042894702331592975, "learning_rate": 0.00034580621088574944, "loss": 1.2274, "step": 4050 }, { "epoch": 1.280897101792624, "grad_norm": 0.04526297676130048, "learning_rate": 0.00034449537426169065, "loss": 1.1973, "step": 4055 }, { "epoch": 1.2824765063571033, "grad_norm": 0.044379850065689556, "learning_rate": 0.00034318571983599146, "loss": 1.2416, "step": 4060 }, { "epoch": 1.2840559109215826, "grad_norm": 0.047110933438321935, "learning_rate": 0.00034187725756508426, "loss": 1.1524, "step": 4065 }, { "epoch": 1.2856353154860618, "grad_norm": 0.046988226072507626, "learning_rate": 0.0003405699973963384, "loss": 1.1369, "step": 4070 }, { "epoch": 1.287214720050541, "grad_norm": 0.10135176216986964, "learning_rate": 0.0003392639492679846, "loss": 1.2553, "step": 4075 }, { "epoch": 1.28879412461502, "grad_norm": 0.04888719205444645, "learning_rate": 0.0003379591231090391, "loss": 1.1773, "step": 4080 }, { "epoch": 1.2903735291794993, "grad_norm": 0.05687605903348492, "learning_rate": 0.00033665552883922815, "loss": 1.2632, "step": 4085 }, { "epoch": 1.2919529337439786, "grad_norm": 0.04233305356158567, "learning_rate": 0.00033535317636891306, "loss": 1.1513, "step": 4090 }, { "epoch": 1.2935323383084576, "grad_norm": 0.042963325597750464, "learning_rate": 0.0003340520755990144, "loss": 1.2029, "step": 4095 }, { "epoch": 1.2951117428729368, "grad_norm": 0.050722019459130685, "learning_rate": 0.0003327522364209369, "loss": 1.1353, "step": 4100 }, { "epoch": 1.296691147437416, "grad_norm": 0.0494180449617406, "learning_rate": 0.0003314536687164944, "loss": 1.2296, "step": 4105 }, { "epoch": 1.2982705520018953, "grad_norm": 0.05936623342720808, "learning_rate": 0.0003301563823578343, "loss": 1.143, "step": 4110 }, { "epoch": 1.2998499565663746, "grad_norm": 0.048506460496429225, "learning_rate": 0.0003288603872073631, "loss": 1.1917, "step": 4115 }, { "epoch": 1.3014293611308536, "grad_norm": 0.05406184336994571, "learning_rate": 0.00032756569311767083, "loss": 1.2349, "step": 4120 }, { "epoch": 1.3030087656953329, "grad_norm": 0.045587554518463316, "learning_rate": 0.00032627230993145643, "loss": 1.1771, "step": 4125 }, { "epoch": 1.304588170259812, "grad_norm": 0.04992494266906095, "learning_rate": 0.0003249802474814532, "loss": 1.2611, "step": 4130 }, { "epoch": 1.3061675748242911, "grad_norm": 0.04653380031315376, "learning_rate": 0.0003236895155903533, "loss": 1.1284, "step": 4135 }, { "epoch": 1.3077469793887704, "grad_norm": 0.051633643931627726, "learning_rate": 0.000322400124070734, "loss": 1.2544, "step": 4140 }, { "epoch": 1.3093263839532496, "grad_norm": 0.25475662140274147, "learning_rate": 0.0003211120827249827, "loss": 1.1783, "step": 4145 }, { "epoch": 1.3109057885177289, "grad_norm": 0.0490546955850076, "learning_rate": 0.0003198254013452214, "loss": 1.2034, "step": 4150 }, { "epoch": 1.3124851930822081, "grad_norm": 0.05262256814953412, "learning_rate": 0.0003185400897132341, "loss": 1.2267, "step": 4155 }, { "epoch": 1.3140645976466871, "grad_norm": 0.04587307245121246, "learning_rate": 0.0003172561576003913, "loss": 1.206, "step": 4160 }, { "epoch": 1.3156440022111664, "grad_norm": 0.046265855783047156, "learning_rate": 0.00031597361476757587, "loss": 1.2063, "step": 4165 }, { "epoch": 1.3172234067756456, "grad_norm": 0.05396095274918719, "learning_rate": 0.0003146924709651089, "loss": 1.2378, "step": 4170 }, { "epoch": 1.3188028113401247, "grad_norm": 0.044551174669143205, "learning_rate": 0.0003134127359326755, "loss": 1.2104, "step": 4175 }, { "epoch": 1.320382215904604, "grad_norm": 0.04765634631480465, "learning_rate": 0.000312134419399251, "loss": 1.2069, "step": 4180 }, { "epoch": 1.3219616204690832, "grad_norm": 0.04527708511821277, "learning_rate": 0.0003108575310830266, "loss": 1.2538, "step": 4185 }, { "epoch": 1.3235410250335624, "grad_norm": 0.04141875863586854, "learning_rate": 0.00030958208069133613, "loss": 1.1605, "step": 4190 }, { "epoch": 1.3251204295980417, "grad_norm": 0.05089718039259843, "learning_rate": 0.00030830807792058137, "loss": 1.1478, "step": 4195 }, { "epoch": 1.3266998341625207, "grad_norm": 0.046012794322654635, "learning_rate": 0.0003070355324561591, "loss": 1.1809, "step": 4200 }, { "epoch": 1.328279238727, "grad_norm": 0.04428128624150087, "learning_rate": 0.0003057644539723871, "loss": 1.1621, "step": 4205 }, { "epoch": 1.3298586432914792, "grad_norm": 0.04875929021417738, "learning_rate": 0.00030449485213243047, "loss": 1.2086, "step": 4210 }, { "epoch": 1.3314380478559582, "grad_norm": 0.050801682045415804, "learning_rate": 0.00030322673658822864, "loss": 1.2216, "step": 4215 }, { "epoch": 1.3330174524204375, "grad_norm": 0.13352593501281276, "learning_rate": 0.0003019601169804216, "loss": 1.204, "step": 4220 }, { "epoch": 1.3345968569849167, "grad_norm": 0.050529420475016344, "learning_rate": 0.00030069500293827676, "loss": 1.1384, "step": 4225 }, { "epoch": 1.336176261549396, "grad_norm": 0.04643436027833839, "learning_rate": 0.00029943140407961565, "loss": 1.2196, "step": 4230 }, { "epoch": 1.3377556661138752, "grad_norm": 0.5129132530884581, "learning_rate": 0.000298169330010741, "loss": 1.1986, "step": 4235 }, { "epoch": 1.3393350706783542, "grad_norm": 0.05570315601264108, "learning_rate": 0.0002969087903263635, "loss": 1.2151, "step": 4240 }, { "epoch": 1.3409144752428335, "grad_norm": 0.053456238534042834, "learning_rate": 0.0002956497946095289, "loss": 1.2177, "step": 4245 }, { "epoch": 1.3424938798073127, "grad_norm": 0.05051596094104913, "learning_rate": 0.0002943923524315451, "loss": 1.2096, "step": 4250 }, { "epoch": 1.3440732843717917, "grad_norm": 0.19489661874496955, "learning_rate": 0.00029313647335190975, "loss": 1.2147, "step": 4255 }, { "epoch": 1.345652688936271, "grad_norm": 0.04723306901388819, "learning_rate": 0.0002918821669182372, "loss": 1.1228, "step": 4260 }, { "epoch": 1.3472320935007502, "grad_norm": 0.04936316111271542, "learning_rate": 0.00029062944266618565, "loss": 1.168, "step": 4265 }, { "epoch": 1.3488114980652295, "grad_norm": 0.04269681900750885, "learning_rate": 0.00028937831011938565, "loss": 1.1859, "step": 4270 }, { "epoch": 1.3503909026297087, "grad_norm": 0.04330226616774357, "learning_rate": 0.0002881287787893666, "loss": 1.1829, "step": 4275 }, { "epoch": 1.3519703071941878, "grad_norm": 0.04368227065736592, "learning_rate": 0.00028688085817548504, "loss": 1.1372, "step": 4280 }, { "epoch": 1.353549711758667, "grad_norm": 0.06362095389826125, "learning_rate": 0.0002856345577648526, "loss": 1.2237, "step": 4285 }, { "epoch": 1.3551291163231463, "grad_norm": 0.04645528492963239, "learning_rate": 0.00028438988703226287, "loss": 1.165, "step": 4290 }, { "epoch": 1.3567085208876253, "grad_norm": 0.052237565892190495, "learning_rate": 0.000283146855440121, "loss": 1.2479, "step": 4295 }, { "epoch": 1.3582879254521045, "grad_norm": 0.04470605148050162, "learning_rate": 0.00028190547243836994, "loss": 1.1449, "step": 4300 }, { "epoch": 1.3598673300165838, "grad_norm": 0.08660485409305897, "learning_rate": 0.0002806657474644204, "loss": 1.1464, "step": 4305 }, { "epoch": 1.361446734581063, "grad_norm": 0.049821540466483724, "learning_rate": 0.00027942768994307734, "loss": 1.1679, "step": 4310 }, { "epoch": 1.363026139145542, "grad_norm": 0.05535567430285917, "learning_rate": 0.0002781913092864699, "loss": 1.241, "step": 4315 }, { "epoch": 1.3646055437100213, "grad_norm": 0.053500066091833116, "learning_rate": 0.0002769566148939787, "loss": 1.1951, "step": 4320 }, { "epoch": 1.3661849482745005, "grad_norm": 0.04921453536505205, "learning_rate": 0.0002757236161521647, "loss": 1.2453, "step": 4325 }, { "epoch": 1.3677643528389796, "grad_norm": 0.04218661572575522, "learning_rate": 0.00027449232243469856, "loss": 1.24, "step": 4330 }, { "epoch": 1.3693437574034588, "grad_norm": 0.051517705694331714, "learning_rate": 0.00027326274310228806, "loss": 1.2165, "step": 4335 }, { "epoch": 1.370923161967938, "grad_norm": 0.046919002432579765, "learning_rate": 0.0002720348875026083, "loss": 1.1331, "step": 4340 }, { "epoch": 1.3725025665324173, "grad_norm": 0.04359109186080066, "learning_rate": 0.0002708087649702294, "loss": 1.1372, "step": 4345 }, { "epoch": 1.3740819710968966, "grad_norm": 0.04171169571610476, "learning_rate": 0.00026958438482654667, "loss": 1.2054, "step": 4350 }, { "epoch": 1.3756613756613756, "grad_norm": 0.046201381641947134, "learning_rate": 0.0002683617563797088, "loss": 1.1977, "step": 4355 }, { "epoch": 1.3772407802258548, "grad_norm": 0.05310743506329303, "learning_rate": 0.0002671408889245475, "loss": 1.1331, "step": 4360 }, { "epoch": 1.378820184790334, "grad_norm": 0.044121030982377414, "learning_rate": 0.0002659217917425071, "loss": 1.1871, "step": 4365 }, { "epoch": 1.380399589354813, "grad_norm": 0.04211060135345686, "learning_rate": 0.00026470447410157353, "loss": 1.1358, "step": 4370 }, { "epoch": 1.3819789939192924, "grad_norm": 0.11718472543144715, "learning_rate": 0.0002634889452562041, "loss": 1.2031, "step": 4375 }, { "epoch": 1.3835583984837716, "grad_norm": 0.04581904727964055, "learning_rate": 0.00026227521444725685, "loss": 1.1755, "step": 4380 }, { "epoch": 1.3851378030482508, "grad_norm": 0.04617645387189203, "learning_rate": 0.0002610632909019211, "loss": 1.2043, "step": 4385 }, { "epoch": 1.38671720761273, "grad_norm": 0.043441151863545385, "learning_rate": 0.0002598531838336461, "loss": 1.1918, "step": 4390 }, { "epoch": 1.3882966121772091, "grad_norm": 0.04677422808092315, "learning_rate": 0.0002586449024420724, "loss": 1.2572, "step": 4395 }, { "epoch": 1.3898760167416884, "grad_norm": 0.047602631667037355, "learning_rate": 0.0002574384559129602, "loss": 1.1751, "step": 4400 }, { "epoch": 1.3914554213061676, "grad_norm": 0.06480301451371907, "learning_rate": 0.00025623385341812135, "loss": 1.1973, "step": 4405 }, { "epoch": 1.3930348258706466, "grad_norm": 0.05186544795837133, "learning_rate": 0.0002550311041153482, "loss": 1.1743, "step": 4410 }, { "epoch": 1.394614230435126, "grad_norm": 0.05263472419199686, "learning_rate": 0.0002538302171483444, "loss": 1.2122, "step": 4415 }, { "epoch": 1.3961936349996051, "grad_norm": 0.046714035664102725, "learning_rate": 0.0002526312016466562, "loss": 1.1936, "step": 4420 }, { "epoch": 1.3977730395640844, "grad_norm": 0.04959284122089215, "learning_rate": 0.0002514340667256014, "loss": 1.1771, "step": 4425 }, { "epoch": 1.3993524441285636, "grad_norm": 0.04428036693602883, "learning_rate": 0.00025023882148620205, "loss": 1.1886, "step": 4430 }, { "epoch": 1.4009318486930427, "grad_norm": 0.04788711277034502, "learning_rate": 0.00024904547501511306, "loss": 1.1257, "step": 4435 }, { "epoch": 1.402511253257522, "grad_norm": 0.05027001199320111, "learning_rate": 0.00024785403638455535, "loss": 1.1414, "step": 4440 }, { "epoch": 1.4040906578220012, "grad_norm": 0.039948820068015915, "learning_rate": 0.000246664514652245, "loss": 1.1801, "step": 4445 }, { "epoch": 1.4056700623864802, "grad_norm": 0.05134243025469217, "learning_rate": 0.0002454769188613254, "loss": 1.1112, "step": 4450 }, { "epoch": 1.4072494669509594, "grad_norm": 0.04486925789171195, "learning_rate": 0.00024429125804029865, "loss": 1.1845, "step": 4455 }, { "epoch": 1.4088288715154387, "grad_norm": 0.04194713391590443, "learning_rate": 0.00024310754120295596, "loss": 1.2171, "step": 4460 }, { "epoch": 1.410408276079918, "grad_norm": 0.04153863172614139, "learning_rate": 0.00024192577734831046, "loss": 1.1884, "step": 4465 }, { "epoch": 1.4119876806443972, "grad_norm": 0.04361043366931193, "learning_rate": 0.00024074597546052713, "loss": 1.1621, "step": 4470 }, { "epoch": 1.4135670852088762, "grad_norm": 0.040256970091352005, "learning_rate": 0.00023956814450885633, "loss": 1.2167, "step": 4475 }, { "epoch": 1.4151464897733554, "grad_norm": 0.05667590501881828, "learning_rate": 0.00023839229344756418, "loss": 1.2413, "step": 4480 }, { "epoch": 1.4167258943378347, "grad_norm": 0.04404745269338672, "learning_rate": 0.00023721843121586505, "loss": 1.1372, "step": 4485 }, { "epoch": 1.4183052989023137, "grad_norm": 0.042142958545581304, "learning_rate": 0.0002360465667378534, "loss": 1.3681, "step": 4490 }, { "epoch": 1.419884703466793, "grad_norm": 0.04470497895953527, "learning_rate": 0.00023487670892243683, "loss": 1.2982, "step": 4495 }, { "epoch": 1.4214641080312722, "grad_norm": 0.04519757812791442, "learning_rate": 0.0002337088666632668, "loss": 1.2074, "step": 4500 }, { "epoch": 1.4230435125957515, "grad_norm": 0.048995084292254784, "learning_rate": 0.00023254304883867205, "loss": 1.1727, "step": 4505 }, { "epoch": 1.4246229171602307, "grad_norm": 0.047539651304436575, "learning_rate": 0.00023137926431159129, "loss": 1.2134, "step": 4510 }, { "epoch": 1.4262023217247097, "grad_norm": 0.04754709178447227, "learning_rate": 0.00023021752192950472, "loss": 1.1471, "step": 4515 }, { "epoch": 1.427781726289189, "grad_norm": 0.051579347578861894, "learning_rate": 0.00022905783052436834, "loss": 1.2198, "step": 4520 }, { "epoch": 1.4293611308536682, "grad_norm": 0.044927236422371813, "learning_rate": 0.00022790019891254506, "loss": 1.2238, "step": 4525 }, { "epoch": 1.4309405354181473, "grad_norm": 0.050537016953069176, "learning_rate": 0.00022674463589473926, "loss": 1.1164, "step": 4530 }, { "epoch": 1.4325199399826265, "grad_norm": 0.042092719652257006, "learning_rate": 0.0002255911502559287, "loss": 1.1854, "step": 4535 }, { "epoch": 1.4340993445471057, "grad_norm": 0.04128060861294385, "learning_rate": 0.0002244397507652982, "loss": 1.2014, "step": 4540 }, { "epoch": 1.435678749111585, "grad_norm": 0.039477493209807944, "learning_rate": 0.00022329044617617355, "loss": 1.142, "step": 4545 }, { "epoch": 1.4372581536760642, "grad_norm": 0.039165707489417216, "learning_rate": 0.0002221432452259536, "loss": 1.1598, "step": 4550 }, { "epoch": 1.4388375582405433, "grad_norm": 0.04196609086299118, "learning_rate": 0.00022099815663604533, "loss": 1.0862, "step": 4555 }, { "epoch": 1.4404169628050225, "grad_norm": 0.043873442257470904, "learning_rate": 0.00021985518911179624, "loss": 1.2392, "step": 4560 }, { "epoch": 1.4419963673695018, "grad_norm": 0.045824141810626655, "learning_rate": 0.0002187143513424295, "loss": 1.1632, "step": 4565 }, { "epoch": 1.4435757719339808, "grad_norm": 0.04490078337805415, "learning_rate": 0.0002175756520009765, "loss": 1.2483, "step": 4570 }, { "epoch": 1.44515517649846, "grad_norm": 0.04076803558562151, "learning_rate": 0.00021643909974421166, "loss": 1.1247, "step": 4575 }, { "epoch": 1.4467345810629393, "grad_norm": 0.04504649426022102, "learning_rate": 0.0002153047032125871, "loss": 1.1649, "step": 4580 }, { "epoch": 1.4483139856274185, "grad_norm": 0.055081629312258525, "learning_rate": 0.00021417247103016563, "loss": 1.2071, "step": 4585 }, { "epoch": 1.4498933901918978, "grad_norm": 0.04362199611214444, "learning_rate": 0.00021304241180455675, "loss": 1.1675, "step": 4590 }, { "epoch": 1.4514727947563768, "grad_norm": 0.0563539906824636, "learning_rate": 0.0002119145341268497, "loss": 1.2064, "step": 4595 }, { "epoch": 1.453052199320856, "grad_norm": 0.04817285585606027, "learning_rate": 0.00021078884657154922, "loss": 1.1497, "step": 4600 }, { "epoch": 1.4546316038853353, "grad_norm": 0.04553218807198147, "learning_rate": 0.0002096653576965098, "loss": 1.2152, "step": 4605 }, { "epoch": 1.4562110084498143, "grad_norm": 0.04654134332084045, "learning_rate": 0.00020854407604287123, "loss": 1.166, "step": 4610 }, { "epoch": 1.4577904130142936, "grad_norm": 0.04381811232269292, "learning_rate": 0.0002074250101349927, "loss": 1.0995, "step": 4615 }, { "epoch": 1.4593698175787728, "grad_norm": 0.04302993983813917, "learning_rate": 0.0002063081684803892, "loss": 1.1658, "step": 4620 }, { "epoch": 1.460949222143252, "grad_norm": 0.04905087780036018, "learning_rate": 0.00020519355956966567, "loss": 1.2449, "step": 4625 }, { "epoch": 1.4625286267077313, "grad_norm": 0.043310176329184366, "learning_rate": 0.000204081191876453, "loss": 1.1623, "step": 4630 }, { "epoch": 1.4641080312722103, "grad_norm": 0.04259169208119241, "learning_rate": 0.0002029710738573441, "loss": 1.136, "step": 4635 }, { "epoch": 1.4656874358366896, "grad_norm": 0.04515571060362108, "learning_rate": 0.00020186321395182838, "loss": 1.1657, "step": 4640 }, { "epoch": 1.4672668404011688, "grad_norm": 0.04240891294406152, "learning_rate": 0.00020075762058222914, "loss": 1.1669, "step": 4645 }, { "epoch": 1.4688462449656479, "grad_norm": 0.04979161781488388, "learning_rate": 0.00019965430215363779, "loss": 1.2395, "step": 4650 }, { "epoch": 1.470425649530127, "grad_norm": 0.04659702106631797, "learning_rate": 0.00019855326705385174, "loss": 1.2362, "step": 4655 }, { "epoch": 1.4720050540946064, "grad_norm": 0.04317957853017187, "learning_rate": 0.00019745452365330923, "loss": 1.1658, "step": 4660 }, { "epoch": 1.4735844586590856, "grad_norm": 0.04485097139432286, "learning_rate": 0.00019635808030502616, "loss": 1.1485, "step": 4665 }, { "epoch": 1.4751638632235649, "grad_norm": 0.04187479335484446, "learning_rate": 0.00019526394534453328, "loss": 1.1794, "step": 4670 }, { "epoch": 1.4767432677880439, "grad_norm": 0.043780558904614034, "learning_rate": 0.00019417212708981146, "loss": 1.1799, "step": 4675 }, { "epoch": 1.4783226723525231, "grad_norm": 0.04492211691853034, "learning_rate": 0.00019308263384122987, "loss": 1.2038, "step": 4680 }, { "epoch": 1.4799020769170022, "grad_norm": 0.04656268240379311, "learning_rate": 0.00019199547388148148, "loss": 1.1681, "step": 4685 }, { "epoch": 1.4814814814814814, "grad_norm": 0.045751216739559945, "learning_rate": 0.0001909106554755216, "loss": 1.2025, "step": 4690 }, { "epoch": 1.4830608860459606, "grad_norm": 0.045517896104072, "learning_rate": 0.00018982818687050368, "loss": 1.1746, "step": 4695 }, { "epoch": 1.48464029061044, "grad_norm": 0.04377414298184028, "learning_rate": 0.00018874807629571722, "loss": 1.1743, "step": 4700 }, { "epoch": 1.4862196951749191, "grad_norm": 0.04351387818424645, "learning_rate": 0.0001876703319625257, "loss": 1.1716, "step": 4705 }, { "epoch": 1.4877990997393982, "grad_norm": 0.041121818632838314, "learning_rate": 0.00018659496206430303, "loss": 1.1196, "step": 4710 }, { "epoch": 1.4893785043038774, "grad_norm": 0.045711119966415976, "learning_rate": 0.0001855219747763723, "loss": 1.1474, "step": 4715 }, { "epoch": 1.4909579088683567, "grad_norm": 0.04374967428774447, "learning_rate": 0.0001844513782559426, "loss": 1.2251, "step": 4720 }, { "epoch": 1.4925373134328357, "grad_norm": 0.042444788984276775, "learning_rate": 0.00018338318064204856, "loss": 1.1587, "step": 4725 }, { "epoch": 1.494116717997315, "grad_norm": 0.04250845900386926, "learning_rate": 0.00018231739005548654, "loss": 1.1185, "step": 4730 }, { "epoch": 1.4956961225617942, "grad_norm": 0.04538456119770677, "learning_rate": 0.00018125401459875474, "loss": 1.1439, "step": 4735 }, { "epoch": 1.4972755271262734, "grad_norm": 0.03912419436253607, "learning_rate": 0.00018019306235598983, "loss": 1.1974, "step": 4740 }, { "epoch": 1.4988549316907527, "grad_norm": 0.0417358462929293, "learning_rate": 0.0001791345413929073, "loss": 1.2257, "step": 4745 }, { "epoch": 1.500434336255232, "grad_norm": 0.041833967268802126, "learning_rate": 0.0001780784597567386, "loss": 1.207, "step": 4750 }, { "epoch": 1.502013740819711, "grad_norm": 0.04117133036016011, "learning_rate": 0.00017702482547617067, "loss": 1.1361, "step": 4755 }, { "epoch": 1.5035931453841902, "grad_norm": 0.0455956494414216, "learning_rate": 0.00017597364656128517, "loss": 1.2292, "step": 4760 }, { "epoch": 1.5051725499486692, "grad_norm": 0.04251795167972643, "learning_rate": 0.0001749249310034969, "loss": 1.1434, "step": 4765 }, { "epoch": 1.5067519545131485, "grad_norm": 0.04280557642038174, "learning_rate": 0.00017387868677549368, "loss": 1.1483, "step": 4770 }, { "epoch": 1.5083313590776277, "grad_norm": 0.04238746153621731, "learning_rate": 0.0001728349218311751, "loss": 1.1492, "step": 4775 }, { "epoch": 1.509910763642107, "grad_norm": 0.04637429650672603, "learning_rate": 0.00017179364410559284, "loss": 1.1852, "step": 4780 }, { "epoch": 1.5114901682065862, "grad_norm": 0.04437304922454032, "learning_rate": 0.00017075486151488955, "loss": 1.1202, "step": 4785 }, { "epoch": 1.5130695727710655, "grad_norm": 0.04285529218130754, "learning_rate": 0.00016971858195623897, "loss": 1.1769, "step": 4790 }, { "epoch": 1.5146489773355445, "grad_norm": 0.047211933672964064, "learning_rate": 0.00016868481330778646, "loss": 1.1208, "step": 4795 }, { "epoch": 1.5162283819000237, "grad_norm": 0.04274521149426863, "learning_rate": 0.00016765356342858794, "loss": 1.0854, "step": 4800 }, { "epoch": 1.5178077864645028, "grad_norm": 0.04677568243826001, "learning_rate": 0.00016662484015855152, "loss": 1.1511, "step": 4805 }, { "epoch": 1.519387191028982, "grad_norm": 0.04961266018548583, "learning_rate": 0.0001655986513183763, "loss": 1.1941, "step": 4810 }, { "epoch": 1.5209665955934613, "grad_norm": 0.04129202370064526, "learning_rate": 0.00016457500470949476, "loss": 1.1585, "step": 4815 }, { "epoch": 1.5225460001579405, "grad_norm": 0.044198758974052066, "learning_rate": 0.00016355390811401176, "loss": 1.1358, "step": 4820 }, { "epoch": 1.5241254047224198, "grad_norm": 0.044072876673132436, "learning_rate": 0.0001625353692946464, "loss": 1.1633, "step": 4825 }, { "epoch": 1.525704809286899, "grad_norm": 0.046872134564505806, "learning_rate": 0.00016151939599467246, "loss": 1.2176, "step": 4830 }, { "epoch": 1.527284213851378, "grad_norm": 0.04382226813101255, "learning_rate": 0.0001605059959378603, "loss": 1.1377, "step": 4835 }, { "epoch": 1.5288636184158573, "grad_norm": 0.04452917735802731, "learning_rate": 0.00015949517682841712, "loss": 1.188, "step": 4840 }, { "epoch": 1.5304430229803363, "grad_norm": 0.04207165462155269, "learning_rate": 0.00015848694635092896, "loss": 1.1435, "step": 4845 }, { "epoch": 1.5320224275448155, "grad_norm": 0.039707154735483516, "learning_rate": 0.00015748131217030258, "loss": 1.2172, "step": 4850 }, { "epoch": 1.5336018321092948, "grad_norm": 0.045829983010965775, "learning_rate": 0.00015647828193170632, "loss": 1.1796, "step": 4855 }, { "epoch": 1.535181236673774, "grad_norm": 0.04211294958876491, "learning_rate": 0.00015547786326051293, "loss": 1.2126, "step": 4860 }, { "epoch": 1.5367606412382533, "grad_norm": 0.044880079367989094, "learning_rate": 0.00015448006376224066, "loss": 1.2015, "step": 4865 }, { "epoch": 1.5383400458027325, "grad_norm": 0.042710320588911777, "learning_rate": 0.00015348489102249657, "loss": 1.2003, "step": 4870 }, { "epoch": 1.5399194503672116, "grad_norm": 0.04338548950267526, "learning_rate": 0.00015249235260691763, "loss": 1.1887, "step": 4875 }, { "epoch": 1.5414988549316906, "grad_norm": 0.04268991615858161, "learning_rate": 0.0001515024560611139, "loss": 1.1312, "step": 4880 }, { "epoch": 1.5430782594961698, "grad_norm": 0.04080384387393374, "learning_rate": 0.00015051520891061143, "loss": 1.1791, "step": 4885 }, { "epoch": 1.544657664060649, "grad_norm": 0.04197892181080458, "learning_rate": 0.0001495306186607942, "loss": 1.1425, "step": 4890 }, { "epoch": 1.5462370686251283, "grad_norm": 0.04558844954447147, "learning_rate": 0.00014854869279684808, "loss": 1.2075, "step": 4895 }, { "epoch": 1.5478164731896076, "grad_norm": 0.0487619594918569, "learning_rate": 0.00014756943878370266, "loss": 1.1886, "step": 4900 }, { "epoch": 1.5493958777540868, "grad_norm": 0.042567127030821036, "learning_rate": 0.000146592864065976, "loss": 1.1243, "step": 4905 }, { "epoch": 1.5509752823185659, "grad_norm": 0.04062902575277818, "learning_rate": 0.00014561897606791673, "loss": 1.1585, "step": 4910 }, { "epoch": 1.552554686883045, "grad_norm": 0.05661674152790821, "learning_rate": 0.00014464778219334812, "loss": 1.3004, "step": 4915 }, { "epoch": 1.5541340914475241, "grad_norm": 0.042203242883656615, "learning_rate": 0.00014367928982561234, "loss": 1.1681, "step": 4920 }, { "epoch": 1.5557134960120034, "grad_norm": 0.043298767397759744, "learning_rate": 0.00014271350632751313, "loss": 1.1437, "step": 4925 }, { "epoch": 1.5572929005764826, "grad_norm": 0.046831477988114115, "learning_rate": 0.00014175043904126117, "loss": 1.1504, "step": 4930 }, { "epoch": 1.5588723051409619, "grad_norm": 0.045905201375766196, "learning_rate": 0.00014079009528841668, "loss": 1.2435, "step": 4935 }, { "epoch": 1.5604517097054411, "grad_norm": 0.03870184358014551, "learning_rate": 0.0001398324823698357, "loss": 1.1649, "step": 4940 }, { "epoch": 1.5620311142699204, "grad_norm": 0.042969682597472206, "learning_rate": 0.00013887760756561268, "loss": 1.2721, "step": 4945 }, { "epoch": 1.5636105188343994, "grad_norm": 0.04625441689016196, "learning_rate": 0.00013792547813502675, "loss": 1.1569, "step": 4950 }, { "epoch": 1.5651899233988786, "grad_norm": 0.041641568088987536, "learning_rate": 0.0001369761013164851, "loss": 1.1864, "step": 4955 }, { "epoch": 1.5667693279633577, "grad_norm": 0.042917792190307856, "learning_rate": 0.00013602948432746916, "loss": 1.2344, "step": 4960 }, { "epoch": 1.568348732527837, "grad_norm": 0.04128606533605788, "learning_rate": 0.000135085634364479, "loss": 1.1529, "step": 4965 }, { "epoch": 1.5699281370923162, "grad_norm": 0.041995572079863625, "learning_rate": 0.00013414455860297865, "loss": 1.1801, "step": 4970 }, { "epoch": 1.5715075416567954, "grad_norm": 0.04048226090339713, "learning_rate": 0.00013320626419734217, "loss": 1.1789, "step": 4975 }, { "epoch": 1.5730869462212747, "grad_norm": 0.04391566170305209, "learning_rate": 0.00013227075828079832, "loss": 1.1803, "step": 4980 }, { "epoch": 1.574666350785754, "grad_norm": 0.039756967410243864, "learning_rate": 0.00013133804796537735, "loss": 1.2034, "step": 4985 }, { "epoch": 1.576245755350233, "grad_norm": 0.045107951526670645, "learning_rate": 0.00013040814034185588, "loss": 1.2619, "step": 4990 }, { "epoch": 1.5778251599147122, "grad_norm": 0.04192825688875311, "learning_rate": 0.000129481042479704, "loss": 1.1576, "step": 4995 }, { "epoch": 1.5794045644791912, "grad_norm": 0.04471083767599147, "learning_rate": 0.00012855676142703077, "loss": 1.2055, "step": 5000 }, { "epoch": 1.5809839690436704, "grad_norm": 0.04382571421373538, "learning_rate": 0.00012763530421053076, "loss": 1.0983, "step": 5005 }, { "epoch": 1.5825633736081497, "grad_norm": 0.04902844954758061, "learning_rate": 0.0001267166778354314, "loss": 1.1946, "step": 5010 }, { "epoch": 1.584142778172629, "grad_norm": 0.04111753151867177, "learning_rate": 0.00012580088928543836, "loss": 1.1874, "step": 5015 }, { "epoch": 1.5857221827371082, "grad_norm": 0.044466780678237214, "learning_rate": 0.00012488794552268395, "loss": 1.1261, "step": 5020 }, { "epoch": 1.5873015873015874, "grad_norm": 0.04833960019927479, "learning_rate": 0.0001239778534876727, "loss": 1.2095, "step": 5025 }, { "epoch": 1.5888809918660665, "grad_norm": 0.04473732928546202, "learning_rate": 0.00012307062009923005, "loss": 1.1835, "step": 5030 }, { "epoch": 1.5904603964305457, "grad_norm": 0.04081688730663804, "learning_rate": 0.0001221662522544486, "loss": 1.1563, "step": 5035 }, { "epoch": 1.5920398009950247, "grad_norm": 0.03988556117185906, "learning_rate": 0.00012126475682863608, "loss": 1.1538, "step": 5040 }, { "epoch": 1.593619205559504, "grad_norm": 0.041622954589453365, "learning_rate": 0.00012036614067526364, "loss": 1.2103, "step": 5045 }, { "epoch": 1.5951986101239832, "grad_norm": 0.03930205000792464, "learning_rate": 0.00011947041062591274, "loss": 1.162, "step": 5050 }, { "epoch": 1.5967780146884625, "grad_norm": 0.0459001691748034, "learning_rate": 0.0001185775734902238, "loss": 1.2562, "step": 5055 }, { "epoch": 1.5983574192529417, "grad_norm": 0.045956274385820284, "learning_rate": 0.00011768763605584437, "loss": 1.1082, "step": 5060 }, { "epoch": 1.599936823817421, "grad_norm": 0.042544054390199586, "learning_rate": 0.0001168006050883777, "loss": 1.1831, "step": 5065 }, { "epoch": 1.6015162283819, "grad_norm": 0.03874744763804557, "learning_rate": 0.0001159164873313307, "loss": 1.1317, "step": 5070 }, { "epoch": 1.6030956329463792, "grad_norm": 0.049451620601898526, "learning_rate": 0.00011503528950606363, "loss": 1.2068, "step": 5075 }, { "epoch": 1.6046750375108583, "grad_norm": 0.04372042755093415, "learning_rate": 0.0001141570183117378, "loss": 1.1337, "step": 5080 }, { "epoch": 1.6062544420753375, "grad_norm": 0.041215068771456684, "learning_rate": 0.00011328168042526594, "loss": 1.2221, "step": 5085 }, { "epoch": 1.6078338466398168, "grad_norm": 0.039639918835393854, "learning_rate": 0.00011240928250126026, "loss": 1.126, "step": 5090 }, { "epoch": 1.609413251204296, "grad_norm": 0.04094267045940786, "learning_rate": 0.00011153983117198252, "loss": 1.1824, "step": 5095 }, { "epoch": 1.6109926557687753, "grad_norm": 0.04000952199994167, "learning_rate": 0.00011067333304729382, "loss": 1.215, "step": 5100 }, { "epoch": 1.6125720603332545, "grad_norm": 0.0450749540652423, "learning_rate": 0.00010980979471460339, "loss": 1.1388, "step": 5105 }, { "epoch": 1.6141514648977335, "grad_norm": 0.038207405000510855, "learning_rate": 0.0001089492227388199, "loss": 1.1775, "step": 5110 }, { "epoch": 1.6157308694622128, "grad_norm": 0.043613443939006846, "learning_rate": 0.00010809162366229996, "loss": 1.239, "step": 5115 }, { "epoch": 1.6173102740266918, "grad_norm": 0.040035558300420285, "learning_rate": 0.00010723700400479997, "loss": 1.2456, "step": 5120 }, { "epoch": 1.618889678591171, "grad_norm": 0.0431411161644887, "learning_rate": 0.00010638537026342515, "loss": 1.1335, "step": 5125 }, { "epoch": 1.6204690831556503, "grad_norm": 0.041932474668096895, "learning_rate": 0.00010553672891258104, "loss": 1.1946, "step": 5130 }, { "epoch": 1.6220484877201296, "grad_norm": 0.05229432296649476, "learning_rate": 0.00010469108640392422, "loss": 1.2687, "step": 5135 }, { "epoch": 1.6236278922846088, "grad_norm": 0.04417009748564109, "learning_rate": 0.00010384844916631264, "loss": 1.1855, "step": 5140 }, { "epoch": 1.625207296849088, "grad_norm": 0.04081655389811582, "learning_rate": 0.00010300882360575775, "loss": 1.184, "step": 5145 }, { "epoch": 1.626786701413567, "grad_norm": 0.03997286011562126, "learning_rate": 0.00010217221610537448, "loss": 1.1878, "step": 5150 }, { "epoch": 1.6283661059780463, "grad_norm": 0.040343592936139795, "learning_rate": 0.0001013386330253343, "loss": 1.2426, "step": 5155 }, { "epoch": 1.6299455105425253, "grad_norm": 0.044727818071515886, "learning_rate": 0.00010050808070281508, "loss": 1.2451, "step": 5160 }, { "epoch": 1.6315249151070046, "grad_norm": 0.041263849955376276, "learning_rate": 9.968056545195476e-05, "loss": 1.2119, "step": 5165 }, { "epoch": 1.6331043196714838, "grad_norm": 0.041714047259588535, "learning_rate": 9.88560935638017e-05, "loss": 1.1997, "step": 5170 }, { "epoch": 1.634683724235963, "grad_norm": 0.03729910549613225, "learning_rate": 9.80346713062682e-05, "loss": 1.1795, "step": 5175 }, { "epoch": 1.6362631288004423, "grad_norm": 0.04029418119764931, "learning_rate": 9.72163049240819e-05, "loss": 1.1421, "step": 5180 }, { "epoch": 1.6378425333649216, "grad_norm": 0.040019018672743864, "learning_rate": 9.640100063873852e-05, "loss": 1.1993, "step": 5185 }, { "epoch": 1.6394219379294006, "grad_norm": 0.038507622250098614, "learning_rate": 9.558876464845517e-05, "loss": 1.1552, "step": 5190 }, { "epoch": 1.6410013424938799, "grad_norm": 0.04062653409832037, "learning_rate": 9.477960312812217e-05, "loss": 1.1656, "step": 5195 }, { "epoch": 1.6425807470583589, "grad_norm": 0.040618196822317196, "learning_rate": 9.397352222925737e-05, "loss": 1.155, "step": 5200 }, { "epoch": 1.6441601516228381, "grad_norm": 0.04503040118594828, "learning_rate": 9.317052807995797e-05, "loss": 1.115, "step": 5205 }, { "epoch": 1.6457395561873174, "grad_norm": 0.04118442692408954, "learning_rate": 9.23706267848553e-05, "loss": 1.2245, "step": 5210 }, { "epoch": 1.6473189607517966, "grad_norm": 0.04514946894636146, "learning_rate": 9.157382442506734e-05, "loss": 1.187, "step": 5215 }, { "epoch": 1.6488983653162759, "grad_norm": 0.03981220388248437, "learning_rate": 9.078012705815297e-05, "loss": 1.185, "step": 5220 }, { "epoch": 1.6504777698807551, "grad_norm": 0.039634242308970356, "learning_rate": 8.998954071806625e-05, "loss": 1.1511, "step": 5225 }, { "epoch": 1.6520571744452341, "grad_norm": 0.043535570747472635, "learning_rate": 8.920207141510962e-05, "loss": 1.1682, "step": 5230 }, { "epoch": 1.6536365790097134, "grad_norm": 0.044177521577120564, "learning_rate": 8.841772513588919e-05, "loss": 1.2097, "step": 5235 }, { "epoch": 1.6552159835741924, "grad_norm": 0.042929086034418966, "learning_rate": 8.763650784326855e-05, "loss": 1.1277, "step": 5240 }, { "epoch": 1.6567953881386717, "grad_norm": 0.0425784170301194, "learning_rate": 8.685842547632395e-05, "loss": 1.1901, "step": 5245 }, { "epoch": 1.658374792703151, "grad_norm": 0.04209752377190743, "learning_rate": 8.608348395029859e-05, "loss": 1.1211, "step": 5250 }, { "epoch": 1.6599541972676302, "grad_norm": 0.04715996987536057, "learning_rate": 8.531168915655785e-05, "loss": 1.1548, "step": 5255 }, { "epoch": 1.6615336018321094, "grad_norm": 0.04309584552152012, "learning_rate": 8.454304696254516e-05, "loss": 1.1504, "step": 5260 }, { "epoch": 1.6631130063965884, "grad_norm": 0.04101875078496104, "learning_rate": 8.377756321173629e-05, "loss": 1.1068, "step": 5265 }, { "epoch": 1.6646924109610677, "grad_norm": 0.039366185934383686, "learning_rate": 8.30152437235957e-05, "loss": 1.1589, "step": 5270 }, { "epoch": 1.6662718155255467, "grad_norm": 0.03983096050733411, "learning_rate": 8.225609429353187e-05, "loss": 1.17, "step": 5275 }, { "epoch": 1.667851220090026, "grad_norm": 0.0424101426469475, "learning_rate": 8.150012069285373e-05, "loss": 1.1804, "step": 5280 }, { "epoch": 1.6694306246545052, "grad_norm": 0.043416967157393835, "learning_rate": 8.074732866872619e-05, "loss": 1.1452, "step": 5285 }, { "epoch": 1.6710100292189844, "grad_norm": 0.03832663662543288, "learning_rate": 7.999772394412713e-05, "loss": 1.1384, "step": 5290 }, { "epoch": 1.6725894337834637, "grad_norm": 0.040079998650866, "learning_rate": 7.925131221780297e-05, "loss": 1.1104, "step": 5295 }, { "epoch": 1.674168838347943, "grad_norm": 0.038742779169049064, "learning_rate": 7.85080991642264e-05, "loss": 1.1493, "step": 5300 }, { "epoch": 1.675748242912422, "grad_norm": 0.04162752997766876, "learning_rate": 7.776809043355254e-05, "loss": 1.221, "step": 5305 }, { "epoch": 1.6773276474769012, "grad_norm": 0.039923080179073166, "learning_rate": 7.703129165157586e-05, "loss": 1.2448, "step": 5310 }, { "epoch": 1.6789070520413802, "grad_norm": 0.04056031771380246, "learning_rate": 7.629770841968837e-05, "loss": 1.2391, "step": 5315 }, { "epoch": 1.6804864566058595, "grad_norm": 0.03861658934485934, "learning_rate": 7.556734631483564e-05, "loss": 1.171, "step": 5320 }, { "epoch": 1.6820658611703387, "grad_norm": 0.043073490784679064, "learning_rate": 7.484021088947591e-05, "loss": 1.197, "step": 5325 }, { "epoch": 1.683645265734818, "grad_norm": 0.03887629799728236, "learning_rate": 7.411630767153643e-05, "loss": 1.1767, "step": 5330 }, { "epoch": 1.6852246702992972, "grad_norm": 0.04109182453210988, "learning_rate": 7.339564216437273e-05, "loss": 1.1576, "step": 5335 }, { "epoch": 1.6868040748637765, "grad_norm": 0.038354359248580626, "learning_rate": 7.267821984672573e-05, "loss": 1.1641, "step": 5340 }, { "epoch": 1.6883834794282555, "grad_norm": 0.041552823901267036, "learning_rate": 7.196404617268059e-05, "loss": 1.2288, "step": 5345 }, { "epoch": 1.6899628839927348, "grad_norm": 0.041688586724448265, "learning_rate": 7.125312657162547e-05, "loss": 1.1794, "step": 5350 }, { "epoch": 1.6915422885572138, "grad_norm": 0.04148547251771257, "learning_rate": 7.054546644820964e-05, "loss": 1.1277, "step": 5355 }, { "epoch": 1.693121693121693, "grad_norm": 0.03982084722584277, "learning_rate": 6.984107118230309e-05, "loss": 1.1563, "step": 5360 }, { "epoch": 1.6947010976861723, "grad_norm": 0.04276039521255618, "learning_rate": 6.91399461289548e-05, "loss": 1.1963, "step": 5365 }, { "epoch": 1.6962805022506515, "grad_norm": 0.037718584105796836, "learning_rate": 6.844209661835299e-05, "loss": 1.1223, "step": 5370 }, { "epoch": 1.6978599068151308, "grad_norm": 0.03932663688239819, "learning_rate": 6.774752795578365e-05, "loss": 1.161, "step": 5375 }, { "epoch": 1.69943931137961, "grad_norm": 0.04095939366471222, "learning_rate": 6.705624542159123e-05, "loss": 1.2073, "step": 5380 }, { "epoch": 1.701018715944089, "grad_norm": 0.037649571693366586, "learning_rate": 6.636825427113718e-05, "loss": 1.1163, "step": 5385 }, { "epoch": 1.7025981205085683, "grad_norm": 0.039891358417953646, "learning_rate": 6.568355973476136e-05, "loss": 1.1191, "step": 5390 }, { "epoch": 1.7041775250730473, "grad_norm": 0.03912471412947293, "learning_rate": 6.500216701774147e-05, "loss": 1.1434, "step": 5395 }, { "epoch": 1.7057569296375266, "grad_norm": 0.03764837877485489, "learning_rate": 6.432408130025347e-05, "loss": 1.1506, "step": 5400 }, { "epoch": 1.7073363342020058, "grad_norm": 0.03916262881781088, "learning_rate": 6.36493077373328e-05, "loss": 1.1757, "step": 5405 }, { "epoch": 1.708915738766485, "grad_norm": 0.04627067588223641, "learning_rate": 6.297785145883439e-05, "loss": 1.1796, "step": 5410 }, { "epoch": 1.7104951433309643, "grad_norm": 0.04096346085191978, "learning_rate": 6.230971756939441e-05, "loss": 1.2768, "step": 5415 }, { "epoch": 1.7120745478954436, "grad_norm": 0.040104990863273335, "learning_rate": 6.164491114839077e-05, "loss": 1.1315, "step": 5420 }, { "epoch": 1.7136539524599226, "grad_norm": 0.04096850036450036, "learning_rate": 6.098343724990524e-05, "loss": 1.115, "step": 5425 }, { "epoch": 1.7152333570244018, "grad_norm": 0.03862297509737605, "learning_rate": 6.032530090268429e-05, "loss": 1.1413, "step": 5430 }, { "epoch": 1.7168127615888809, "grad_norm": 0.03682291111577449, "learning_rate": 5.967050711010119e-05, "loss": 1.147, "step": 5435 }, { "epoch": 1.71839216615336, "grad_norm": 0.041994182108748224, "learning_rate": 5.9019060850118434e-05, "loss": 1.1433, "step": 5440 }, { "epoch": 1.7199715707178393, "grad_norm": 0.04072133696162905, "learning_rate": 5.837096707524886e-05, "loss": 1.087, "step": 5445 }, { "epoch": 1.7215509752823186, "grad_norm": 0.040239365137791645, "learning_rate": 5.772623071251915e-05, "loss": 1.2112, "step": 5450 }, { "epoch": 1.7231303798467978, "grad_norm": 0.21899842282800372, "learning_rate": 5.7084856663431216e-05, "loss": 1.2276, "step": 5455 }, { "epoch": 1.724709784411277, "grad_norm": 0.036912550884332714, "learning_rate": 5.644684980392617e-05, "loss": 1.1085, "step": 5460 }, { "epoch": 1.7262891889757561, "grad_norm": 0.03767233158960728, "learning_rate": 5.5812214984346074e-05, "loss": 1.1807, "step": 5465 }, { "epoch": 1.7278685935402354, "grad_norm": 0.04152274834198619, "learning_rate": 5.518095702939807e-05, "loss": 1.1465, "step": 5470 }, { "epoch": 1.7294479981047144, "grad_norm": 0.03970465859443351, "learning_rate": 5.4553080738116826e-05, "loss": 1.1396, "step": 5475 }, { "epoch": 1.7310274026691936, "grad_norm": 0.03999593368348414, "learning_rate": 5.392859088382856e-05, "loss": 1.0857, "step": 5480 }, { "epoch": 1.7326068072336729, "grad_norm": 0.03548705800539315, "learning_rate": 5.330749221411507e-05, "loss": 1.1714, "step": 5485 }, { "epoch": 1.7341862117981521, "grad_norm": 0.044001166550537214, "learning_rate": 5.268978945077668e-05, "loss": 1.2513, "step": 5490 }, { "epoch": 1.7357656163626314, "grad_norm": 0.03846286895723526, "learning_rate": 5.207548728979716e-05, "loss": 1.1761, "step": 5495 }, { "epoch": 1.7373450209271106, "grad_norm": 0.03725514418989374, "learning_rate": 5.1464590401307684e-05, "loss": 1.1095, "step": 5500 }, { "epoch": 1.7389244254915897, "grad_norm": 0.040904652640728234, "learning_rate": 5.085710342955163e-05, "loss": 1.1701, "step": 5505 }, { "epoch": 1.740503830056069, "grad_norm": 0.038664317622454494, "learning_rate": 5.0253030992848616e-05, "loss": 1.1888, "step": 5510 }, { "epoch": 1.742083234620548, "grad_norm": 0.040135925746465656, "learning_rate": 4.965237768356029e-05, "loss": 1.1112, "step": 5515 }, { "epoch": 1.7436626391850272, "grad_norm": 0.03980170206189093, "learning_rate": 4.905514806805456e-05, "loss": 1.1729, "step": 5520 }, { "epoch": 1.7452420437495064, "grad_norm": 0.03734397589075776, "learning_rate": 4.8461346686671405e-05, "loss": 1.1152, "step": 5525 }, { "epoch": 1.7468214483139857, "grad_norm": 0.03926109942563704, "learning_rate": 4.787097805368839e-05, "loss": 1.1272, "step": 5530 }, { "epoch": 1.748400852878465, "grad_norm": 0.03679956196056413, "learning_rate": 4.728404665728586e-05, "loss": 1.0941, "step": 5535 }, { "epoch": 1.7499802574429442, "grad_norm": 0.04380981279236289, "learning_rate": 4.670055695951342e-05, "loss": 1.1472, "step": 5540 }, { "epoch": 1.7515596620074232, "grad_norm": 0.03741394494631056, "learning_rate": 4.6120513396255446e-05, "loss": 1.1419, "step": 5545 }, { "epoch": 1.7531390665719024, "grad_norm": 0.03876190285432771, "learning_rate": 4.554392037719801e-05, "loss": 1.1393, "step": 5550 }, { "epoch": 1.7547184711363815, "grad_norm": 0.04143074063444235, "learning_rate": 4.4970782285794484e-05, "loss": 1.1781, "step": 5555 }, { "epoch": 1.7562978757008607, "grad_norm": 0.039821088206555576, "learning_rate": 4.440110347923332e-05, "loss": 1.1813, "step": 5560 }, { "epoch": 1.75787728026534, "grad_norm": 0.03858890022868445, "learning_rate": 4.383488828840387e-05, "loss": 1.1398, "step": 5565 }, { "epoch": 1.7594566848298192, "grad_norm": 0.036545000560017596, "learning_rate": 4.327214101786397e-05, "loss": 1.1996, "step": 5570 }, { "epoch": 1.7610360893942985, "grad_norm": 0.03716222191593921, "learning_rate": 4.271286594580748e-05, "loss": 1.132, "step": 5575 }, { "epoch": 1.7626154939587777, "grad_norm": 0.04274826808333374, "learning_rate": 4.215706732403096e-05, "loss": 1.1206, "step": 5580 }, { "epoch": 1.7641948985232567, "grad_norm": 0.03702457278797154, "learning_rate": 4.160474937790232e-05, "loss": 1.1746, "step": 5585 }, { "epoch": 1.765774303087736, "grad_norm": 0.0397002931743353, "learning_rate": 4.105591630632777e-05, "loss": 1.1453, "step": 5590 }, { "epoch": 1.767353707652215, "grad_norm": 0.04463513863229758, "learning_rate": 4.051057228172073e-05, "loss": 1.1402, "step": 5595 }, { "epoch": 1.7689331122166942, "grad_norm": 0.038497251633735234, "learning_rate": 3.996872144996938e-05, "loss": 1.1381, "step": 5600 }, { "epoch": 1.7705125167811735, "grad_norm": 0.03729106510760662, "learning_rate": 3.9430367930405666e-05, "loss": 1.1766, "step": 5605 }, { "epoch": 1.7720919213456527, "grad_norm": 0.03862603218316023, "learning_rate": 3.8895515815773774e-05, "loss": 1.1791, "step": 5610 }, { "epoch": 1.773671325910132, "grad_norm": 0.038360471763054765, "learning_rate": 3.836416917219881e-05, "loss": 1.1463, "step": 5615 }, { "epoch": 1.775250730474611, "grad_norm": 0.04078142535118902, "learning_rate": 3.783633203915654e-05, "loss": 1.1195, "step": 5620 }, { "epoch": 1.7768301350390903, "grad_norm": 0.03837831114793069, "learning_rate": 3.731200842944182e-05, "loss": 1.1299, "step": 5625 }, { "epoch": 1.7784095396035693, "grad_norm": 0.037632727533887717, "learning_rate": 3.6791202329138965e-05, "loss": 1.1132, "step": 5630 }, { "epoch": 1.7799889441680485, "grad_norm": 0.03957322528074159, "learning_rate": 3.6273917697590475e-05, "loss": 1.1836, "step": 5635 }, { "epoch": 1.7815683487325278, "grad_norm": 0.03849063741712455, "learning_rate": 3.576015846736797e-05, "loss": 1.1525, "step": 5640 }, { "epoch": 1.783147753297007, "grad_norm": 0.04036105343418985, "learning_rate": 3.524992854424147e-05, "loss": 1.149, "step": 5645 }, { "epoch": 1.7847271578614863, "grad_norm": 0.03934795365104782, "learning_rate": 3.4743231807150056e-05, "loss": 1.1849, "step": 5650 }, { "epoch": 1.7863065624259655, "grad_norm": 0.03618186490143044, "learning_rate": 3.4240072108172485e-05, "loss": 1.1345, "step": 5655 }, { "epoch": 1.7878859669904446, "grad_norm": 0.03741293392474867, "learning_rate": 3.3740453272497585e-05, "loss": 1.1009, "step": 5660 }, { "epoch": 1.7894653715549238, "grad_norm": 0.03844796658326061, "learning_rate": 3.324437909839556e-05, "loss": 1.1817, "step": 5665 }, { "epoch": 1.7910447761194028, "grad_norm": 0.04195468684299417, "learning_rate": 3.275185335718861e-05, "loss": 1.1783, "step": 5670 }, { "epoch": 1.792624180683882, "grad_norm": 0.047192191176001645, "learning_rate": 3.226287979322295e-05, "loss": 1.1255, "step": 5675 }, { "epoch": 1.7942035852483613, "grad_norm": 0.04105202623653139, "learning_rate": 3.177746212383953e-05, "loss": 1.2267, "step": 5680 }, { "epoch": 1.7957829898128406, "grad_norm": 0.03994382729729455, "learning_rate": 3.1295604039346615e-05, "loss": 1.138, "step": 5685 }, { "epoch": 1.7973623943773198, "grad_norm": 0.03749226714436082, "learning_rate": 3.0817309202990916e-05, "loss": 1.2155, "step": 5690 }, { "epoch": 1.798941798941799, "grad_norm": 0.03879009257148787, "learning_rate": 3.0342581250930368e-05, "loss": 1.2287, "step": 5695 }, { "epoch": 1.800521203506278, "grad_norm": 0.03738041546140523, "learning_rate": 2.9871423792206252e-05, "loss": 1.2137, "step": 5700 }, { "epoch": 1.8021006080707573, "grad_norm": 0.038328223229589876, "learning_rate": 2.940384040871563e-05, "loss": 1.1829, "step": 5705 }, { "epoch": 1.8036800126352364, "grad_norm": 0.04475646525608456, "learning_rate": 2.893983465518446e-05, "loss": 1.1625, "step": 5710 }, { "epoch": 1.8052594171997156, "grad_norm": 0.040567913477594184, "learning_rate": 2.847941005914012e-05, "loss": 1.1517, "step": 5715 }, { "epoch": 1.8068388217641949, "grad_norm": 0.07691995823020609, "learning_rate": 2.8022570120884937e-05, "loss": 1.2255, "step": 5720 }, { "epoch": 1.808418226328674, "grad_norm": 0.04051156523842043, "learning_rate": 2.756931831346937e-05, "loss": 1.1339, "step": 5725 }, { "epoch": 1.8099976308931534, "grad_norm": 0.03656410669128001, "learning_rate": 2.7119658082666034e-05, "loss": 1.1016, "step": 5730 }, { "epoch": 1.8115770354576326, "grad_norm": 0.03835511214441151, "learning_rate": 2.6673592846942707e-05, "loss": 1.2093, "step": 5735 }, { "epoch": 1.8131564400221116, "grad_norm": 0.038050633542823245, "learning_rate": 2.62311259974371e-05, "loss": 1.1291, "step": 5740 }, { "epoch": 1.8147358445865909, "grad_norm": 0.039586803586487224, "learning_rate": 2.579226089793074e-05, "loss": 1.1217, "step": 5745 }, { "epoch": 1.81631524915107, "grad_norm": 0.04064802128521732, "learning_rate": 2.5357000884823344e-05, "loss": 1.191, "step": 5750 }, { "epoch": 1.8178946537155491, "grad_norm": 0.03745252048453876, "learning_rate": 2.4925349267107765e-05, "loss": 1.2138, "step": 5755 }, { "epoch": 1.8194740582800284, "grad_norm": 0.03872938908760248, "learning_rate": 2.4497309326344364e-05, "loss": 1.1717, "step": 5760 }, { "epoch": 1.8210534628445076, "grad_norm": 0.03847000242773801, "learning_rate": 2.4072884316636512e-05, "loss": 1.2192, "step": 5765 }, { "epoch": 1.822632867408987, "grad_norm": 0.0372268013149032, "learning_rate": 2.3652077464605514e-05, "loss": 1.1603, "step": 5770 }, { "epoch": 1.8242122719734661, "grad_norm": 0.04358926754494382, "learning_rate": 2.32348919693664e-05, "loss": 1.1702, "step": 5775 }, { "epoch": 1.8257916765379452, "grad_norm": 0.036667043069080515, "learning_rate": 2.2821331002503276e-05, "loss": 1.1289, "step": 5780 }, { "epoch": 1.8273710811024244, "grad_norm": 0.041083377770693444, "learning_rate": 2.2411397708045346e-05, "loss": 1.1004, "step": 5785 }, { "epoch": 1.8289504856669034, "grad_norm": 0.039968118879742026, "learning_rate": 2.200509520244326e-05, "loss": 1.177, "step": 5790 }, { "epoch": 1.8305298902313827, "grad_norm": 0.03863271682127176, "learning_rate": 2.1602426574544863e-05, "loss": 1.13, "step": 5795 }, { "epoch": 1.832109294795862, "grad_norm": 0.038105678081707686, "learning_rate": 2.1203394885572436e-05, "loss": 1.1997, "step": 5800 }, { "epoch": 1.8336886993603412, "grad_norm": 0.03688161470356274, "learning_rate": 2.0808003169098587e-05, "loss": 1.2342, "step": 5805 }, { "epoch": 1.8352681039248204, "grad_norm": 0.039102585737269144, "learning_rate": 2.0416254431024073e-05, "loss": 1.1117, "step": 5810 }, { "epoch": 1.8368475084892997, "grad_norm": 0.04001738022857146, "learning_rate": 2.0028151649554126e-05, "loss": 1.1494, "step": 5815 }, { "epoch": 1.8384269130537787, "grad_norm": 0.03715434498313437, "learning_rate": 1.964369777517644e-05, "loss": 1.1657, "step": 5820 }, { "epoch": 1.840006317618258, "grad_norm": 0.03807302565433147, "learning_rate": 1.9262895730638387e-05, "loss": 1.1365, "step": 5825 }, { "epoch": 1.841585722182737, "grad_norm": 0.04016295883330026, "learning_rate": 1.8885748410924884e-05, "loss": 1.1503, "step": 5830 }, { "epoch": 1.8431651267472162, "grad_norm": 0.039690799467456385, "learning_rate": 1.8512258683236525e-05, "loss": 1.1382, "step": 5835 }, { "epoch": 1.8447445313116955, "grad_norm": 0.03663772995034225, "learning_rate": 1.8142429386967473e-05, "loss": 1.1395, "step": 5840 }, { "epoch": 1.8463239358761747, "grad_norm": 0.03734107530523833, "learning_rate": 1.7776263333684316e-05, "loss": 1.134, "step": 5845 }, { "epoch": 1.847903340440654, "grad_norm": 0.03599365398109658, "learning_rate": 1.7413763307104092e-05, "loss": 1.1999, "step": 5850 }, { "epoch": 1.8494827450051332, "grad_norm": 0.037571792249809915, "learning_rate": 1.70549320630739e-05, "loss": 1.1451, "step": 5855 }, { "epoch": 1.8510621495696122, "grad_norm": 0.040855679493839454, "learning_rate": 1.669977232954911e-05, "loss": 1.1862, "step": 5860 }, { "epoch": 1.8526415541340915, "grad_norm": 0.03901162538583201, "learning_rate": 1.6348286806573354e-05, "loss": 1.131, "step": 5865 }, { "epoch": 1.8542209586985705, "grad_norm": 0.036223266625389, "learning_rate": 1.6000478166257494e-05, "loss": 1.1496, "step": 5870 }, { "epoch": 1.8558003632630498, "grad_norm": 0.04076041836978, "learning_rate": 1.5656349052759533e-05, "loss": 1.264, "step": 5875 }, { "epoch": 1.857379767827529, "grad_norm": 0.03818834527855486, "learning_rate": 1.5315902082264577e-05, "loss": 1.2272, "step": 5880 }, { "epoch": 1.8589591723920083, "grad_norm": 0.037674941328260624, "learning_rate": 1.4979139842964674e-05, "loss": 1.1546, "step": 5885 }, { "epoch": 1.8605385769564875, "grad_norm": 0.04014314876199107, "learning_rate": 1.4646064895039502e-05, "loss": 1.107, "step": 5890 }, { "epoch": 1.8621179815209667, "grad_norm": 0.03963765460127949, "learning_rate": 1.4316679770636498e-05, "loss": 1.2056, "step": 5895 }, { "epoch": 1.8636973860854458, "grad_norm": 0.039998141196438394, "learning_rate": 1.3990986973852039e-05, "loss": 1.1682, "step": 5900 }, { "epoch": 1.865276790649925, "grad_norm": 0.03775072941040079, "learning_rate": 1.3668988980712005e-05, "loss": 1.1465, "step": 5905 }, { "epoch": 1.866856195214404, "grad_norm": 0.037245576393570595, "learning_rate": 1.3350688239153196e-05, "loss": 1.14, "step": 5910 }, { "epoch": 1.8684355997788833, "grad_norm": 0.05103653897583511, "learning_rate": 1.303608716900484e-05, "loss": 1.275, "step": 5915 }, { "epoch": 1.8700150043433625, "grad_norm": 0.039521837283658545, "learning_rate": 1.2725188161969659e-05, "loss": 1.1519, "step": 5920 }, { "epoch": 1.8715944089078418, "grad_norm": 0.03583759103876015, "learning_rate": 1.2417993581606447e-05, "loss": 1.2045, "step": 5925 }, { "epoch": 1.873173813472321, "grad_norm": 0.03882613376722163, "learning_rate": 1.2114505763311356e-05, "loss": 1.1371, "step": 5930 }, { "epoch": 1.8747532180368003, "grad_norm": 0.037457570907026776, "learning_rate": 1.1814727014300807e-05, "loss": 1.1083, "step": 5935 }, { "epoch": 1.8763326226012793, "grad_norm": 0.037491417519635264, "learning_rate": 1.151865961359333e-05, "loss": 1.1285, "step": 5940 }, { "epoch": 1.8779120271657586, "grad_norm": 0.037677620328568986, "learning_rate": 1.1226305811992743e-05, "loss": 1.136, "step": 5945 }, { "epoch": 1.8794914317302376, "grad_norm": 0.03780508985042414, "learning_rate": 1.093766783207073e-05, "loss": 1.2466, "step": 5950 }, { "epoch": 1.8810708362947168, "grad_norm": 0.03798590268953426, "learning_rate": 1.0652747868150125e-05, "loss": 1.1242, "step": 5955 }, { "epoch": 1.882650240859196, "grad_norm": 0.04026326809361378, "learning_rate": 1.0371548086288207e-05, "loss": 1.1346, "step": 5960 }, { "epoch": 1.8842296454236753, "grad_norm": 0.041594258355052545, "learning_rate": 1.0094070624259877e-05, "loss": 1.1412, "step": 5965 }, { "epoch": 1.8858090499881546, "grad_norm": 0.03845161567995432, "learning_rate": 9.820317591542172e-06, "loss": 1.1512, "step": 5970 }, { "epoch": 1.8873884545526338, "grad_norm": 0.04104243452282617, "learning_rate": 9.550291069297445e-06, "loss": 1.1303, "step": 5975 }, { "epoch": 1.8889678591171128, "grad_norm": 0.049978082677842185, "learning_rate": 9.283993110357936e-06, "loss": 1.1293, "step": 5980 }, { "epoch": 1.890547263681592, "grad_norm": 0.04193636399281768, "learning_rate": 9.021425739210054e-06, "loss": 1.2249, "step": 5985 }, { "epoch": 1.8921266682460711, "grad_norm": 0.03652411747142491, "learning_rate": 8.762590951979232e-06, "loss": 1.1556, "step": 5990 }, { "epoch": 1.8937060728105504, "grad_norm": 0.03660227811040269, "learning_rate": 8.507490716414269e-06, "loss": 1.0974, "step": 5995 }, { "epoch": 1.8952854773750296, "grad_norm": 0.03861183723906082, "learning_rate": 8.256126971872835e-06, "loss": 1.1357, "step": 6000 }, { "epoch": 1.8968648819395089, "grad_norm": 0.041946138441062666, "learning_rate": 8.008501629306497e-06, "loss": 1.1581, "step": 6005 }, { "epoch": 1.898444286503988, "grad_norm": 0.03938463570257878, "learning_rate": 7.764616571246218e-06, "loss": 1.1671, "step": 6010 }, { "epoch": 1.9000236910684671, "grad_norm": 0.0400784138910666, "learning_rate": 7.524473651788044e-06, "loss": 1.1995, "step": 6015 }, { "epoch": 1.9016030956329464, "grad_norm": 0.03770136245233989, "learning_rate": 7.288074696578995e-06, "loss": 1.1363, "step": 6020 }, { "epoch": 1.9031825001974254, "grad_norm": 0.03678268049224077, "learning_rate": 7.055421502803416e-06, "loss": 1.231, "step": 6025 }, { "epoch": 1.9047619047619047, "grad_norm": 0.039890798319339324, "learning_rate": 6.826515839168934e-06, "loss": 1.1717, "step": 6030 }, { "epoch": 1.906341309326384, "grad_norm": 0.036090368946140185, "learning_rate": 6.6013594458931295e-06, "loss": 1.1658, "step": 6035 }, { "epoch": 1.9079207138908632, "grad_norm": 0.036701188527316304, "learning_rate": 6.379954034690605e-06, "loss": 1.2034, "step": 6040 }, { "epoch": 1.9095001184553424, "grad_norm": 0.03780086733024395, "learning_rate": 6.162301288759498e-06, "loss": 1.1627, "step": 6045 }, { "epoch": 1.9110795230198216, "grad_norm": 0.03610287526888933, "learning_rate": 5.9484028627692085e-06, "loss": 1.2156, "step": 6050 }, { "epoch": 1.9126589275843007, "grad_norm": 0.03696748233519709, "learning_rate": 5.738260382847193e-06, "loss": 1.1476, "step": 6055 }, { "epoch": 1.91423833214878, "grad_norm": 0.03709245576725398, "learning_rate": 5.531875446567136e-06, "loss": 1.2037, "step": 6060 }, { "epoch": 1.915817736713259, "grad_norm": 0.036785850535957045, "learning_rate": 5.3292496229366824e-06, "loss": 1.2351, "step": 6065 }, { "epoch": 1.9173971412777382, "grad_norm": 0.04381622727708026, "learning_rate": 5.130384452385339e-06, "loss": 1.1328, "step": 6070 }, { "epoch": 1.9189765458422174, "grad_norm": 0.034665386562424426, "learning_rate": 4.93528144675276e-06, "loss": 1.1031, "step": 6075 }, { "epoch": 1.9205559504066967, "grad_norm": 0.040347791381984875, "learning_rate": 4.743942089277642e-06, "loss": 1.2293, "step": 6080 }, { "epoch": 1.922135354971176, "grad_norm": 0.03535462255763783, "learning_rate": 4.556367834585961e-06, "loss": 1.1572, "step": 6085 }, { "epoch": 1.9237147595356552, "grad_norm": 0.038666331327442766, "learning_rate": 4.3725601086800345e-06, "loss": 1.117, "step": 6090 }, { "epoch": 1.9252941641001342, "grad_norm": 0.03760761569395126, "learning_rate": 4.192520308928083e-06, "loss": 1.1803, "step": 6095 }, { "epoch": 1.9268735686646135, "grad_norm": 0.03714151137896293, "learning_rate": 4.016249804052907e-06, "loss": 1.1373, "step": 6100 }, { "epoch": 1.9284529732290925, "grad_norm": 0.03632988142084885, "learning_rate": 3.843749934122231e-06, "loss": 1.1782, "step": 6105 }, { "epoch": 1.9300323777935717, "grad_norm": 0.04104327576412698, "learning_rate": 3.6750220105378206e-06, "loss": 1.1346, "step": 6110 }, { "epoch": 1.931611782358051, "grad_norm": 0.03733076417370514, "learning_rate": 3.5100673160260442e-06, "loss": 1.21, "step": 6115 }, { "epoch": 1.9331911869225302, "grad_norm": 0.039119994135142215, "learning_rate": 3.3488871046278844e-06, "loss": 1.121, "step": 6120 }, { "epoch": 1.9347705914870095, "grad_norm": 0.039999245984236695, "learning_rate": 3.191482601689333e-06, "loss": 1.1533, "step": 6125 }, { "epoch": 1.9363499960514887, "grad_norm": 0.03932239723477913, "learning_rate": 3.0378550038522855e-06, "loss": 1.1172, "step": 6130 }, { "epoch": 1.9379294006159677, "grad_norm": 0.03757406245976793, "learning_rate": 2.8880054790453304e-06, "loss": 1.1515, "step": 6135 }, { "epoch": 1.939508805180447, "grad_norm": 0.042216263151382855, "learning_rate": 2.741935166474807e-06, "loss": 1.1803, "step": 6140 }, { "epoch": 1.941088209744926, "grad_norm": 0.038395584705011525, "learning_rate": 2.5996451766163165e-06, "loss": 1.2214, "step": 6145 }, { "epoch": 1.9426676143094053, "grad_norm": 0.03753622498972159, "learning_rate": 2.4611365912061143e-06, "loss": 1.1245, "step": 6150 }, { "epoch": 1.9442470188738845, "grad_norm": 0.03919269422544959, "learning_rate": 2.3264104632328974e-06, "loss": 1.1983, "step": 6155 }, { "epoch": 1.9458264234383638, "grad_norm": 0.03761146974286723, "learning_rate": 2.1954678169299745e-06, "loss": 1.1767, "step": 6160 }, { "epoch": 1.947405828002843, "grad_norm": 0.036249410566739665, "learning_rate": 2.0683096477672747e-06, "loss": 1.2084, "step": 6165 }, { "epoch": 1.9489852325673223, "grad_norm": 0.03695961304710696, "learning_rate": 1.9449369224438517e-06, "loss": 1.1832, "step": 6170 }, { "epoch": 1.9505646371318013, "grad_norm": 0.03669270640305222, "learning_rate": 1.8253505788806136e-06, "loss": 1.2063, "step": 6175 }, { "epoch": 1.9521440416962805, "grad_norm": 0.03653941867619756, "learning_rate": 1.7095515262129935e-06, "loss": 1.1484, "step": 6180 }, { "epoch": 1.9537234462607596, "grad_norm": 0.0369150963449936, "learning_rate": 1.597540644784401e-06, "loss": 1.1574, "step": 6185 }, { "epoch": 1.9553028508252388, "grad_norm": 0.03800927169900743, "learning_rate": 1.48931878613906e-06, "loss": 1.0989, "step": 6190 }, { "epoch": 1.956882255389718, "grad_norm": 0.0373871509882197, "learning_rate": 1.3848867730158476e-06, "loss": 1.1976, "step": 6195 }, { "epoch": 1.9584616599541973, "grad_norm": 0.0363558661229063, "learning_rate": 1.2842453993420765e-06, "loss": 1.1176, "step": 6200 }, { "epoch": 1.9600410645186765, "grad_norm": 0.03751431102095036, "learning_rate": 1.1873954302271118e-06, "loss": 1.1903, "step": 6205 }, { "epoch": 1.9616204690831558, "grad_norm": 0.03543597065724976, "learning_rate": 1.0943376019570962e-06, "loss": 1.1695, "step": 6210 }, { "epoch": 1.9631998736476348, "grad_norm": 0.03821700357837637, "learning_rate": 1.0050726219886785e-06, "loss": 1.2151, "step": 6215 }, { "epoch": 1.964779278212114, "grad_norm": 0.0362223423431424, "learning_rate": 9.196011689444061e-07, "loss": 1.1192, "step": 6220 }, { "epoch": 1.966358682776593, "grad_norm": 0.03645046656565166, "learning_rate": 8.379238926067845e-07, "loss": 1.1382, "step": 6225 }, { "epoch": 1.9679380873410723, "grad_norm": 0.03729643275196246, "learning_rate": 7.600414139139477e-07, "loss": 1.1773, "step": 6230 }, { "epoch": 1.9695174919055516, "grad_norm": 0.038100647971481355, "learning_rate": 6.859543249546074e-07, "loss": 1.1665, "step": 6235 }, { "epoch": 1.9710968964700308, "grad_norm": 0.0354025767185051, "learning_rate": 6.156631889637776e-07, "loss": 1.112, "step": 6240 }, { "epoch": 1.97267630103451, "grad_norm": 0.03589122877529856, "learning_rate": 5.491685403181679e-07, "loss": 1.1537, "step": 6245 }, { "epoch": 1.9742557055989893, "grad_norm": 0.03880572138041069, "learning_rate": 4.864708845324639e-07, "loss": 1.1933, "step": 6250 }, { "epoch": 1.9758351101634684, "grad_norm": 0.03756094411119688, "learning_rate": 4.275706982552752e-07, "loss": 1.1429, "step": 6255 }, { "epoch": 1.9774145147279476, "grad_norm": 0.03773518314575328, "learning_rate": 3.724684292655822e-07, "loss": 1.1493, "step": 6260 }, { "epoch": 1.9789939192924266, "grad_norm": 0.043865705578300905, "learning_rate": 3.21164496469295e-07, "loss": 1.1996, "step": 6265 }, { "epoch": 1.9805733238569059, "grad_norm": 0.03592053349570639, "learning_rate": 2.736592898961998e-07, "loss": 1.1658, "step": 6270 }, { "epoch": 1.9821527284213851, "grad_norm": 0.03764633001417179, "learning_rate": 2.29953170696795e-07, "loss": 1.2013, "step": 6275 }, { "epoch": 1.9837321329858644, "grad_norm": 0.036997027795423014, "learning_rate": 1.900464711396821e-07, "loss": 1.2161, "step": 6280 }, { "epoch": 1.9853115375503436, "grad_norm": 0.03827984206512634, "learning_rate": 1.5393949460895674e-07, "loss": 1.1047, "step": 6285 }, { "epoch": 1.9868909421148229, "grad_norm": 0.03753114314213189, "learning_rate": 1.2163251560198817e-07, "loss": 1.1817, "step": 6290 }, { "epoch": 1.988470346679302, "grad_norm": 0.037723165063936664, "learning_rate": 9.312577972725444e-08, "loss": 1.1969, "step": 6295 }, { "epoch": 1.9900497512437811, "grad_norm": 0.03696062503887895, "learning_rate": 6.841950370256589e-08, "loss": 1.1069, "step": 6300 }, { "epoch": 1.9916291558082602, "grad_norm": 0.03556299730363652, "learning_rate": 4.751387535328888e-08, "loss": 1.1539, "step": 6305 }, { "epoch": 1.9932085603727394, "grad_norm": 0.03767686268892061, "learning_rate": 3.0409053611013534e-08, "loss": 1.0877, "step": 6310 }, { "epoch": 1.9947879649372187, "grad_norm": 0.046846433133667216, "learning_rate": 1.7105168512443482e-08, "loss": 1.1207, "step": 6315 }, { "epoch": 1.996367369501698, "grad_norm": 0.03618621908358588, "learning_rate": 7.602321198063589e-09, "loss": 1.1671, "step": 6320 }, { "epoch": 1.9979467740661772, "grad_norm": 0.036327845826664686, "learning_rate": 1.900583911751408e-09, "loss": 1.1746, "step": 6325 }, { "epoch": 1.9995261786306564, "grad_norm": 0.03824369147522914, "learning_rate": 0.0, "loss": 1.1937, "step": 6330 }, { "epoch": 1.9995261786306564, "eval_loss": 1.199708104133606, "eval_runtime": 201.9535, "eval_samples_per_second": 13.117, "eval_steps_per_second": 3.283, "step": 6330 }, { "epoch": 1.9995261786306564, "step": 6330, "total_flos": 3.965582567150715e+17, "train_loss": 1.2795274726210801, "train_runtime": 31064.4256, "train_samples_per_second": 3.261, "train_steps_per_second": 0.204 } ], "logging_steps": 5, "max_steps": 6330, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.965582567150715e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }