{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00032, "grad_norm": 5.380013163878014, "learning_rate": 6.389776357827476e-08, "loss": 1.6778, "step": 1 }, { "epoch": 0.0016, "grad_norm": 7.154954424714241, "learning_rate": 3.194888178913738e-07, "loss": 1.7708, "step": 5 }, { "epoch": 0.0032, "grad_norm": 7.791599834214756, "learning_rate": 6.389776357827476e-07, "loss": 1.7103, "step": 10 }, { "epoch": 0.0048, "grad_norm": 6.699971718110182, "learning_rate": 9.584664536741215e-07, "loss": 1.7232, "step": 15 }, { "epoch": 0.0064, "grad_norm": 3.9169595462951747, "learning_rate": 1.2779552715654952e-06, "loss": 1.6546, "step": 20 }, { "epoch": 0.008, "grad_norm": 3.101793089665147, "learning_rate": 1.5974440894568691e-06, "loss": 1.5645, "step": 25 }, { "epoch": 0.0096, "grad_norm": 2.7658653539543763, "learning_rate": 1.916932907348243e-06, "loss": 1.5006, "step": 30 }, { "epoch": 0.0112, "grad_norm": 2.58791028618642, "learning_rate": 2.2364217252396165e-06, "loss": 1.4441, "step": 35 }, { "epoch": 0.0128, "grad_norm": 2.387936368495932, "learning_rate": 2.5559105431309904e-06, "loss": 1.3735, "step": 40 }, { "epoch": 0.0144, "grad_norm": 3.4026602824136853, "learning_rate": 2.8753993610223648e-06, "loss": 1.376, "step": 45 }, { "epoch": 0.016, "grad_norm": 2.142758159307882, "learning_rate": 3.1948881789137383e-06, "loss": 1.3881, "step": 50 }, { "epoch": 0.0176, "grad_norm": 1.9148304687436095, "learning_rate": 3.514376996805112e-06, "loss": 1.3168, "step": 55 }, { "epoch": 0.0192, "grad_norm": 1.9530348908355757, "learning_rate": 3.833865814696486e-06, "loss": 1.291, "step": 60 }, { "epoch": 0.0208, "grad_norm": 1.8415398556654625, "learning_rate": 4.15335463258786e-06, "loss": 1.3104, "step": 65 }, { "epoch": 0.0224, "grad_norm": 1.7817860784739772, "learning_rate": 4.472843450479233e-06, "loss": 1.2407, "step": 70 }, { "epoch": 0.024, "grad_norm": 2.1828092103330525, "learning_rate": 4.792332268370608e-06, "loss": 1.3373, "step": 75 }, { "epoch": 0.0256, "grad_norm": 1.9985913803690254, "learning_rate": 5.111821086261981e-06, "loss": 1.3142, "step": 80 }, { "epoch": 0.0272, "grad_norm": 1.8975874492385296, "learning_rate": 5.431309904153355e-06, "loss": 1.3153, "step": 85 }, { "epoch": 0.0288, "grad_norm": 1.899876204791897, "learning_rate": 5.7507987220447296e-06, "loss": 1.2999, "step": 90 }, { "epoch": 0.0304, "grad_norm": 1.9731664000232103, "learning_rate": 6.070287539936103e-06, "loss": 1.316, "step": 95 }, { "epoch": 0.032, "grad_norm": 1.9783644806483818, "learning_rate": 6.3897763578274765e-06, "loss": 1.2941, "step": 100 }, { "epoch": 0.0336, "grad_norm": 1.8654721876084297, "learning_rate": 6.709265175718851e-06, "loss": 1.258, "step": 105 }, { "epoch": 0.0352, "grad_norm": 1.9381328393426596, "learning_rate": 7.028753993610224e-06, "loss": 1.3014, "step": 110 }, { "epoch": 0.0368, "grad_norm": 2.0523469625422837, "learning_rate": 7.348242811501598e-06, "loss": 1.3002, "step": 115 }, { "epoch": 0.0384, "grad_norm": 2.366592890574014, "learning_rate": 7.667731629392972e-06, "loss": 1.3191, "step": 120 }, { "epoch": 0.04, "grad_norm": 2.12894027085166, "learning_rate": 7.987220447284347e-06, "loss": 1.3172, "step": 125 }, { "epoch": 0.0416, "grad_norm": 2.0911519904253697, "learning_rate": 8.30670926517572e-06, "loss": 1.3373, "step": 130 }, { "epoch": 0.0432, "grad_norm": 1.772653906417285, "learning_rate": 8.626198083067093e-06, "loss": 1.2892, "step": 135 }, { "epoch": 0.0448, "grad_norm": 1.980097120697081, "learning_rate": 8.945686900958466e-06, "loss": 1.3247, "step": 140 }, { "epoch": 0.0464, "grad_norm": 1.8130822133530395, "learning_rate": 9.265175718849841e-06, "loss": 1.2726, "step": 145 }, { "epoch": 0.048, "grad_norm": 2.0005704509850535, "learning_rate": 9.584664536741216e-06, "loss": 1.341, "step": 150 }, { "epoch": 0.0496, "grad_norm": 1.8312219323701686, "learning_rate": 9.904153354632589e-06, "loss": 1.3129, "step": 155 }, { "epoch": 0.0512, "grad_norm": 1.860430043441668, "learning_rate": 1.0223642172523962e-05, "loss": 1.2528, "step": 160 }, { "epoch": 0.0528, "grad_norm": 2.20023386766188, "learning_rate": 1.0543130990415335e-05, "loss": 1.2941, "step": 165 }, { "epoch": 0.0544, "grad_norm": 1.6861230689645437, "learning_rate": 1.086261980830671e-05, "loss": 1.3426, "step": 170 }, { "epoch": 0.056, "grad_norm": 2.007980253767886, "learning_rate": 1.1182108626198084e-05, "loss": 1.3037, "step": 175 }, { "epoch": 0.0576, "grad_norm": 1.9588218482561024, "learning_rate": 1.1501597444089459e-05, "loss": 1.3085, "step": 180 }, { "epoch": 0.0592, "grad_norm": 1.9618854104093122, "learning_rate": 1.1821086261980832e-05, "loss": 1.2801, "step": 185 }, { "epoch": 0.0608, "grad_norm": 1.968612767712385, "learning_rate": 1.2140575079872205e-05, "loss": 1.313, "step": 190 }, { "epoch": 0.0624, "grad_norm": 2.082550933114954, "learning_rate": 1.2460063897763578e-05, "loss": 1.3016, "step": 195 }, { "epoch": 0.064, "grad_norm": 1.974091566866228, "learning_rate": 1.2779552715654953e-05, "loss": 1.3114, "step": 200 }, { "epoch": 0.0656, "grad_norm": 1.8136676164367442, "learning_rate": 1.3099041533546326e-05, "loss": 1.2743, "step": 205 }, { "epoch": 0.0672, "grad_norm": 1.8778934083028735, "learning_rate": 1.3418530351437703e-05, "loss": 1.3171, "step": 210 }, { "epoch": 0.0688, "grad_norm": 1.467699188879101, "learning_rate": 1.3738019169329076e-05, "loss": 1.3216, "step": 215 }, { "epoch": 0.0704, "grad_norm": 1.7382657403610697, "learning_rate": 1.4057507987220449e-05, "loss": 1.2981, "step": 220 }, { "epoch": 0.072, "grad_norm": 1.791627651976965, "learning_rate": 1.4376996805111822e-05, "loss": 1.2806, "step": 225 }, { "epoch": 0.0736, "grad_norm": 2.015211239239423, "learning_rate": 1.4696485623003197e-05, "loss": 1.3526, "step": 230 }, { "epoch": 0.0752, "grad_norm": 2.136502772907875, "learning_rate": 1.501597444089457e-05, "loss": 1.3195, "step": 235 }, { "epoch": 0.0768, "grad_norm": 1.9520516774863617, "learning_rate": 1.5335463258785944e-05, "loss": 1.3071, "step": 240 }, { "epoch": 0.0784, "grad_norm": 1.8711313201866286, "learning_rate": 1.5654952076677316e-05, "loss": 1.3351, "step": 245 }, { "epoch": 0.08, "grad_norm": 1.819194958547012, "learning_rate": 1.5974440894568694e-05, "loss": 1.327, "step": 250 }, { "epoch": 0.0816, "grad_norm": 1.9048156957813254, "learning_rate": 1.6293929712460065e-05, "loss": 1.2793, "step": 255 }, { "epoch": 0.0832, "grad_norm": 1.9845792400469384, "learning_rate": 1.661341853035144e-05, "loss": 1.3465, "step": 260 }, { "epoch": 0.0848, "grad_norm": 1.919105144980939, "learning_rate": 1.693290734824281e-05, "loss": 1.3163, "step": 265 }, { "epoch": 0.0864, "grad_norm": 1.7231178079220175, "learning_rate": 1.7252396166134186e-05, "loss": 1.3691, "step": 270 }, { "epoch": 0.088, "grad_norm": 1.832687595575637, "learning_rate": 1.757188498402556e-05, "loss": 1.2865, "step": 275 }, { "epoch": 0.0896, "grad_norm": 1.925669375224468, "learning_rate": 1.7891373801916932e-05, "loss": 1.3287, "step": 280 }, { "epoch": 0.0912, "grad_norm": 1.8911106868341585, "learning_rate": 1.8210862619808307e-05, "loss": 1.2987, "step": 285 }, { "epoch": 0.0928, "grad_norm": 2.0391764950163727, "learning_rate": 1.8530351437699682e-05, "loss": 1.3085, "step": 290 }, { "epoch": 0.0944, "grad_norm": 1.912606105588985, "learning_rate": 1.8849840255591057e-05, "loss": 1.3109, "step": 295 }, { "epoch": 0.096, "grad_norm": 1.759711536935351, "learning_rate": 1.916932907348243e-05, "loss": 1.2756, "step": 300 }, { "epoch": 0.0976, "grad_norm": 1.6330615416823773, "learning_rate": 1.9488817891373803e-05, "loss": 1.3157, "step": 305 }, { "epoch": 0.0992, "grad_norm": 1.6909082438094512, "learning_rate": 1.9808306709265177e-05, "loss": 1.3371, "step": 310 }, { "epoch": 0.1008, "grad_norm": 1.8764600920235877, "learning_rate": 1.9999975036876365e-05, "loss": 1.317, "step": 315 }, { "epoch": 0.1024, "grad_norm": 1.7276579376060885, "learning_rate": 1.9999694203166786e-05, "loss": 1.2713, "step": 320 }, { "epoch": 0.104, "grad_norm": 1.974512249507493, "learning_rate": 1.999910134063538e-05, "loss": 1.2426, "step": 325 }, { "epoch": 0.1056, "grad_norm": 1.7428846383005079, "learning_rate": 1.9998196467781738e-05, "loss": 1.3445, "step": 330 }, { "epoch": 0.1072, "grad_norm": 1.771888386422436, "learning_rate": 1.999697961284136e-05, "loss": 1.3481, "step": 335 }, { "epoch": 0.1088, "grad_norm": 1.5983194771126878, "learning_rate": 1.9995450813784785e-05, "loss": 1.2856, "step": 340 }, { "epoch": 0.1104, "grad_norm": 1.6706666268150037, "learning_rate": 1.9993610118316417e-05, "loss": 1.3784, "step": 345 }, { "epoch": 0.112, "grad_norm": 1.904172246047488, "learning_rate": 1.999145758387301e-05, "loss": 1.3535, "step": 350 }, { "epoch": 0.1136, "grad_norm": 1.874557989004744, "learning_rate": 1.99889932776219e-05, "loss": 1.3398, "step": 355 }, { "epoch": 0.1152, "grad_norm": 1.939890268910186, "learning_rate": 1.9986217276458898e-05, "loss": 1.2418, "step": 360 }, { "epoch": 0.1168, "grad_norm": 1.6108251342944133, "learning_rate": 1.9983129667005887e-05, "loss": 1.3565, "step": 365 }, { "epoch": 0.1184, "grad_norm": 1.6681638910048406, "learning_rate": 1.9979730545608128e-05, "loss": 1.3346, "step": 370 }, { "epoch": 0.12, "grad_norm": 1.8889089224364535, "learning_rate": 1.9976020018331244e-05, "loss": 1.2951, "step": 375 }, { "epoch": 0.1216, "grad_norm": 1.7535744639209123, "learning_rate": 1.997199820095793e-05, "loss": 1.3554, "step": 380 }, { "epoch": 0.1232, "grad_norm": 1.7559131968042405, "learning_rate": 1.9967665218984308e-05, "loss": 1.3151, "step": 385 }, { "epoch": 0.1248, "grad_norm": 1.9028965780306437, "learning_rate": 1.996302120761605e-05, "loss": 1.3275, "step": 390 }, { "epoch": 0.1264, "grad_norm": 2.0713968273760583, "learning_rate": 1.9958066311764115e-05, "loss": 1.291, "step": 395 }, { "epoch": 0.128, "grad_norm": 1.9141121083760397, "learning_rate": 1.9952800686040268e-05, "loss": 1.3347, "step": 400 }, { "epoch": 0.1296, "grad_norm": 1.8026051249333723, "learning_rate": 1.9947224494752236e-05, "loss": 1.3091, "step": 405 }, { "epoch": 0.1312, "grad_norm": 1.5591453639315092, "learning_rate": 1.994133791189857e-05, "loss": 1.2852, "step": 410 }, { "epoch": 0.1328, "grad_norm": 1.85678459188368, "learning_rate": 1.993514112116325e-05, "loss": 1.3334, "step": 415 }, { "epoch": 0.1344, "grad_norm": 1.8168447146218092, "learning_rate": 1.992863431590991e-05, "loss": 1.3699, "step": 420 }, { "epoch": 0.136, "grad_norm": 1.8767846974527935, "learning_rate": 1.9921817699175844e-05, "loss": 1.2851, "step": 425 }, { "epoch": 0.1376, "grad_norm": 2.159652642734844, "learning_rate": 1.991469148366564e-05, "loss": 1.2593, "step": 430 }, { "epoch": 0.1392, "grad_norm": 1.7557187500253024, "learning_rate": 1.9907255891744562e-05, "loss": 1.3055, "step": 435 }, { "epoch": 0.1408, "grad_norm": 1.707945627233032, "learning_rate": 1.989951115543161e-05, "loss": 1.3316, "step": 440 }, { "epoch": 0.1424, "grad_norm": 1.5740192854498385, "learning_rate": 1.9891457516392257e-05, "loss": 1.3062, "step": 445 }, { "epoch": 0.144, "grad_norm": 1.7612290748845418, "learning_rate": 1.988309522593095e-05, "loss": 1.2971, "step": 450 }, { "epoch": 0.1456, "grad_norm": 1.7818559505170088, "learning_rate": 1.9874424544983224e-05, "loss": 1.3689, "step": 455 }, { "epoch": 0.1472, "grad_norm": 2.2400957230622502, "learning_rate": 1.9865445744107593e-05, "loss": 1.2945, "step": 460 }, { "epoch": 0.1488, "grad_norm": 1.676934020935459, "learning_rate": 1.9856159103477085e-05, "loss": 1.3703, "step": 465 }, { "epoch": 0.1504, "grad_norm": 2.3670411316522397, "learning_rate": 1.9846564912870523e-05, "loss": 1.2795, "step": 470 }, { "epoch": 0.152, "grad_norm": 1.7638582325729109, "learning_rate": 1.9836663471663454e-05, "loss": 1.3814, "step": 475 }, { "epoch": 0.1536, "grad_norm": 1.8203791958774775, "learning_rate": 1.9826455088818832e-05, "loss": 1.3512, "step": 480 }, { "epoch": 0.1552, "grad_norm": 1.7378642960480102, "learning_rate": 1.9815940082877367e-05, "loss": 1.349, "step": 485 }, { "epoch": 0.1568, "grad_norm": 1.6598789370856017, "learning_rate": 1.980511878194758e-05, "loss": 1.353, "step": 490 }, { "epoch": 0.1584, "grad_norm": 1.7553782224508565, "learning_rate": 1.9793991523695578e-05, "loss": 1.4104, "step": 495 }, { "epoch": 0.16, "grad_norm": 1.5810443772689395, "learning_rate": 1.9782558655334505e-05, "loss": 1.2999, "step": 500 }, { "epoch": 0.1616, "grad_norm": 1.8997518690087405, "learning_rate": 1.9770820533613716e-05, "loss": 1.2878, "step": 505 }, { "epoch": 0.1632, "grad_norm": 1.8228928896234957, "learning_rate": 1.9758777524807636e-05, "loss": 1.2879, "step": 510 }, { "epoch": 0.1648, "grad_norm": 1.8091333485065508, "learning_rate": 1.9746430004704353e-05, "loss": 1.3015, "step": 515 }, { "epoch": 0.1664, "grad_norm": 1.9649918630289211, "learning_rate": 1.9733778358593852e-05, "loss": 1.3328, "step": 520 }, { "epoch": 0.168, "grad_norm": 1.9894962267510252, "learning_rate": 1.9720822981256034e-05, "loss": 1.3153, "step": 525 }, { "epoch": 0.1696, "grad_norm": 1.7808145407893163, "learning_rate": 1.970756427694837e-05, "loss": 1.2829, "step": 530 }, { "epoch": 0.1712, "grad_norm": 1.7515882126470221, "learning_rate": 1.9694002659393306e-05, "loss": 1.3619, "step": 535 }, { "epoch": 0.1728, "grad_norm": 1.5718334413525132, "learning_rate": 1.9680138551765335e-05, "loss": 1.3065, "step": 540 }, { "epoch": 0.1744, "grad_norm": 1.7690133151070757, "learning_rate": 1.9665972386677796e-05, "loss": 1.2674, "step": 545 }, { "epoch": 0.176, "grad_norm": 2.199658649039188, "learning_rate": 1.9651504606169395e-05, "loss": 1.2992, "step": 550 }, { "epoch": 0.1776, "grad_norm": 1.6398244591111029, "learning_rate": 1.9636735661690385e-05, "loss": 1.3264, "step": 555 }, { "epoch": 0.1792, "grad_norm": 1.8537758083729077, "learning_rate": 1.9621666014088495e-05, "loss": 1.2723, "step": 560 }, { "epoch": 0.1808, "grad_norm": 1.5832571532059136, "learning_rate": 1.960629613359454e-05, "loss": 1.2468, "step": 565 }, { "epoch": 0.1824, "grad_norm": 1.794754973029939, "learning_rate": 1.959062649980776e-05, "loss": 1.3792, "step": 570 }, { "epoch": 0.184, "grad_norm": 1.6633346677715306, "learning_rate": 1.957465760168084e-05, "loss": 1.2727, "step": 575 }, { "epoch": 0.1856, "grad_norm": 1.6635296858145219, "learning_rate": 1.9558389937504664e-05, "loss": 1.3161, "step": 580 }, { "epoch": 0.1872, "grad_norm": 1.5868309701733159, "learning_rate": 1.954182401489277e-05, "loss": 1.294, "step": 585 }, { "epoch": 0.1888, "grad_norm": 2.448010518906839, "learning_rate": 1.952496035076549e-05, "loss": 1.3428, "step": 590 }, { "epoch": 0.1904, "grad_norm": 1.700054486538892, "learning_rate": 1.9507799471333842e-05, "loss": 1.2318, "step": 595 }, { "epoch": 0.192, "grad_norm": 1.7537382851531023, "learning_rate": 1.9490341912083103e-05, "loss": 1.2683, "step": 600 }, { "epoch": 0.1936, "grad_norm": 1.7114383221179421, "learning_rate": 1.947258821775609e-05, "loss": 1.3159, "step": 605 }, { "epoch": 0.1952, "grad_norm": 1.8124301862280392, "learning_rate": 1.945453894233618e-05, "loss": 1.3088, "step": 610 }, { "epoch": 0.1968, "grad_norm": 1.66741462868946, "learning_rate": 1.9436194649030006e-05, "loss": 1.2965, "step": 615 }, { "epoch": 0.1984, "grad_norm": 1.5482764128668798, "learning_rate": 1.9417555910249905e-05, "loss": 1.2376, "step": 620 }, { "epoch": 0.2, "grad_norm": 1.7141357661533019, "learning_rate": 1.939862330759602e-05, "loss": 1.3216, "step": 625 }, { "epoch": 0.2016, "grad_norm": 1.5376773876440648, "learning_rate": 1.9379397431838194e-05, "loss": 1.2884, "step": 630 }, { "epoch": 0.2032, "grad_norm": 1.6851675187201824, "learning_rate": 1.935987888289751e-05, "loss": 1.2566, "step": 635 }, { "epoch": 0.2048, "grad_norm": 1.7016334884940523, "learning_rate": 1.9340068269827567e-05, "loss": 1.2951, "step": 640 }, { "epoch": 0.2064, "grad_norm": 1.7247862131971499, "learning_rate": 1.93199662107955e-05, "loss": 1.2653, "step": 645 }, { "epoch": 0.208, "grad_norm": 1.7033995256282368, "learning_rate": 1.929957333306267e-05, "loss": 1.3108, "step": 650 }, { "epoch": 0.2096, "grad_norm": 1.8827313390243443, "learning_rate": 1.9278890272965097e-05, "loss": 1.2657, "step": 655 }, { "epoch": 0.2112, "grad_norm": 1.6733377924283832, "learning_rate": 1.92579176758936e-05, "loss": 1.2827, "step": 660 }, { "epoch": 0.2128, "grad_norm": 1.840766457212207, "learning_rate": 1.9236656196273676e-05, "loss": 1.326, "step": 665 }, { "epoch": 0.2144, "grad_norm": 1.7449069883828616, "learning_rate": 1.9215106497545047e-05, "loss": 1.3451, "step": 670 }, { "epoch": 0.216, "grad_norm": 2.2211033209952764, "learning_rate": 1.919326925214099e-05, "loss": 1.3346, "step": 675 }, { "epoch": 0.2176, "grad_norm": 1.7660918637921583, "learning_rate": 1.9171145141467336e-05, "loss": 1.2732, "step": 680 }, { "epoch": 0.2192, "grad_norm": 1.656419286715992, "learning_rate": 1.9148734855881218e-05, "loss": 1.287, "step": 685 }, { "epoch": 0.2208, "grad_norm": 1.9288769042527318, "learning_rate": 1.912603909466952e-05, "loss": 1.2421, "step": 690 }, { "epoch": 0.2224, "grad_norm": 1.6610200617662882, "learning_rate": 1.9103058566027062e-05, "loss": 1.3026, "step": 695 }, { "epoch": 0.224, "grad_norm": 1.592725503044272, "learning_rate": 1.9079793987034497e-05, "loss": 1.3198, "step": 700 }, { "epoch": 0.2256, "grad_norm": 1.618909217118846, "learning_rate": 1.9056246083635943e-05, "loss": 1.2889, "step": 705 }, { "epoch": 0.2272, "grad_norm": 2.093307012413756, "learning_rate": 1.9032415590616323e-05, "loss": 1.2585, "step": 710 }, { "epoch": 0.2288, "grad_norm": 1.7125520677411492, "learning_rate": 1.9008303251578445e-05, "loss": 1.3471, "step": 715 }, { "epoch": 0.2304, "grad_norm": 1.5771195817804284, "learning_rate": 1.898390981891979e-05, "loss": 1.2767, "step": 720 }, { "epoch": 0.232, "grad_norm": 1.7224651305827967, "learning_rate": 1.895923605380904e-05, "loss": 1.2838, "step": 725 }, { "epoch": 0.2336, "grad_norm": 1.651868549410077, "learning_rate": 1.8934282726162325e-05, "loss": 1.2608, "step": 730 }, { "epoch": 0.2352, "grad_norm": 1.7779288249589362, "learning_rate": 1.8909050614619197e-05, "loss": 1.3043, "step": 735 }, { "epoch": 0.2368, "grad_norm": 1.7159478994485102, "learning_rate": 1.8883540506518336e-05, "loss": 1.3111, "step": 740 }, { "epoch": 0.2384, "grad_norm": 1.6669198423947549, "learning_rate": 1.885775319787298e-05, "loss": 1.2545, "step": 745 }, { "epoch": 0.24, "grad_norm": 1.7691857206581028, "learning_rate": 1.8831689493346095e-05, "loss": 1.312, "step": 750 }, { "epoch": 0.2416, "grad_norm": 1.7254769730933195, "learning_rate": 1.880535020622525e-05, "loss": 1.2103, "step": 755 }, { "epoch": 0.2432, "grad_norm": 1.7929696816668284, "learning_rate": 1.8778736158397244e-05, "loss": 1.2415, "step": 760 }, { "epoch": 0.2448, "grad_norm": 1.7687833276945535, "learning_rate": 1.8751848180322476e-05, "loss": 1.256, "step": 765 }, { "epoch": 0.2464, "grad_norm": 1.523135869670507, "learning_rate": 1.872468711100902e-05, "loss": 1.3283, "step": 770 }, { "epoch": 0.248, "grad_norm": 1.5819216704287675, "learning_rate": 1.869725379798643e-05, "loss": 1.2522, "step": 775 }, { "epoch": 0.2496, "grad_norm": 1.7534114000609573, "learning_rate": 1.866954909727932e-05, "loss": 1.2363, "step": 780 }, { "epoch": 0.2512, "grad_norm": 1.8065991869619924, "learning_rate": 1.864157387338064e-05, "loss": 1.238, "step": 785 }, { "epoch": 0.2528, "grad_norm": 1.9623600517198603, "learning_rate": 1.86133289992247e-05, "loss": 1.2621, "step": 790 }, { "epoch": 0.2544, "grad_norm": 1.688313930137959, "learning_rate": 1.8584815356159932e-05, "loss": 1.2951, "step": 795 }, { "epoch": 0.256, "grad_norm": 1.7895896911496316, "learning_rate": 1.8556033833921386e-05, "loss": 1.341, "step": 800 }, { "epoch": 0.2576, "grad_norm": 1.7069236636660754, "learning_rate": 1.8526985330602973e-05, "loss": 1.2933, "step": 805 }, { "epoch": 0.2592, "grad_norm": 1.7617869289902588, "learning_rate": 1.8497670752629437e-05, "loss": 1.3519, "step": 810 }, { "epoch": 0.2608, "grad_norm": 1.6893459200812457, "learning_rate": 1.8468091014728076e-05, "loss": 1.2979, "step": 815 }, { "epoch": 0.2624, "grad_norm": 1.7662458125058196, "learning_rate": 1.843824703990019e-05, "loss": 1.3553, "step": 820 }, { "epoch": 0.264, "grad_norm": 1.795874796308196, "learning_rate": 1.840813975939229e-05, "loss": 1.2724, "step": 825 }, { "epoch": 0.2656, "grad_norm": 1.8716865943726197, "learning_rate": 1.8377770112667024e-05, "loss": 1.2549, "step": 830 }, { "epoch": 0.2672, "grad_norm": 1.614725819172478, "learning_rate": 1.8347139047373885e-05, "loss": 1.327, "step": 835 }, { "epoch": 0.2688, "grad_norm": 1.7333366904814536, "learning_rate": 1.8316247519319625e-05, "loss": 1.3207, "step": 840 }, { "epoch": 0.2704, "grad_norm": 1.7960248128811127, "learning_rate": 1.8285096492438424e-05, "loss": 1.2684, "step": 845 }, { "epoch": 0.272, "grad_norm": 1.795897867338955, "learning_rate": 1.825368693876183e-05, "loss": 1.2876, "step": 850 }, { "epoch": 0.2736, "grad_norm": 1.6933535548648537, "learning_rate": 1.8222019838388422e-05, "loss": 1.3026, "step": 855 }, { "epoch": 0.2752, "grad_norm": 1.7011641233792223, "learning_rate": 1.8190096179453213e-05, "loss": 1.2817, "step": 860 }, { "epoch": 0.2768, "grad_norm": 1.838849063069479, "learning_rate": 1.8157916958096837e-05, "loss": 1.2175, "step": 865 }, { "epoch": 0.2784, "grad_norm": 1.759404249427789, "learning_rate": 1.8125483178434448e-05, "loss": 1.2753, "step": 870 }, { "epoch": 0.28, "grad_norm": 1.460112891178273, "learning_rate": 1.8092795852524404e-05, "loss": 1.2237, "step": 875 }, { "epoch": 0.2816, "grad_norm": 1.7869153052439726, "learning_rate": 1.8059856000336675e-05, "loss": 1.2511, "step": 880 }, { "epoch": 0.2832, "grad_norm": 1.871448851752217, "learning_rate": 1.8026664649721016e-05, "loss": 1.2343, "step": 885 }, { "epoch": 0.2848, "grad_norm": 1.9271483042842514, "learning_rate": 1.7993222836374904e-05, "loss": 1.2854, "step": 890 }, { "epoch": 0.2864, "grad_norm": 1.6289226300811197, "learning_rate": 1.795953160381121e-05, "loss": 1.2937, "step": 895 }, { "epoch": 0.288, "grad_norm": 1.8758056412493365, "learning_rate": 1.792559200332564e-05, "loss": 1.2724, "step": 900 }, { "epoch": 0.2896, "grad_norm": 1.5451348288786018, "learning_rate": 1.789140509396394e-05, "loss": 1.22, "step": 905 }, { "epoch": 0.2912, "grad_norm": 1.6559432936902352, "learning_rate": 1.7856971942488826e-05, "loss": 1.2534, "step": 910 }, { "epoch": 0.2928, "grad_norm": 1.790726692473781, "learning_rate": 1.7822293623346736e-05, "loss": 1.3062, "step": 915 }, { "epoch": 0.2944, "grad_norm": 2.194952348147864, "learning_rate": 1.7787371218634263e-05, "loss": 1.3124, "step": 920 }, { "epoch": 0.296, "grad_norm": 1.6780139738136952, "learning_rate": 1.77522058180644e-05, "loss": 1.3287, "step": 925 }, { "epoch": 0.2976, "grad_norm": 1.6906692728554042, "learning_rate": 1.7716798518932564e-05, "loss": 1.2643, "step": 930 }, { "epoch": 0.2992, "grad_norm": 1.6263007038760662, "learning_rate": 1.7681150426082322e-05, "loss": 1.2642, "step": 935 }, { "epoch": 0.3008, "grad_norm": 1.6713466205735732, "learning_rate": 1.7645262651870926e-05, "loss": 1.336, "step": 940 }, { "epoch": 0.3024, "grad_norm": 1.8062137829573375, "learning_rate": 1.7609136316134616e-05, "loss": 1.2208, "step": 945 }, { "epoch": 0.304, "grad_norm": 2.259643177699934, "learning_rate": 1.7572772546153657e-05, "loss": 1.3017, "step": 950 }, { "epoch": 0.3056, "grad_norm": 1.6769477406746989, "learning_rate": 1.7536172476617183e-05, "loss": 1.2838, "step": 955 }, { "epoch": 0.3072, "grad_norm": 1.6123312692570513, "learning_rate": 1.749933724958777e-05, "loss": 1.2807, "step": 960 }, { "epoch": 0.3088, "grad_norm": 1.5663927724948588, "learning_rate": 1.746226801446582e-05, "loss": 1.2584, "step": 965 }, { "epoch": 0.3104, "grad_norm": 1.923634371695638, "learning_rate": 1.742496592795368e-05, "loss": 1.26, "step": 970 }, { "epoch": 0.312, "grad_norm": 1.578709710130241, "learning_rate": 1.738743215401955e-05, "loss": 1.2628, "step": 975 }, { "epoch": 0.3136, "grad_norm": 1.725657648126581, "learning_rate": 1.7349667863861175e-05, "loss": 1.2467, "step": 980 }, { "epoch": 0.3152, "grad_norm": 2.0583502894307837, "learning_rate": 1.7311674235869285e-05, "loss": 1.2852, "step": 985 }, { "epoch": 0.3168, "grad_norm": 1.6767392968856305, "learning_rate": 1.7273452455590835e-05, "loss": 1.2287, "step": 990 }, { "epoch": 0.3184, "grad_norm": 1.848336594052823, "learning_rate": 1.7235003715691996e-05, "loss": 1.2921, "step": 995 }, { "epoch": 0.32, "grad_norm": 1.7879385354450363, "learning_rate": 1.7196329215920963e-05, "loss": 1.2159, "step": 1000 }, { "epoch": 0.3216, "grad_norm": 1.6698556257326769, "learning_rate": 1.71574301630705e-05, "loss": 1.2357, "step": 1005 }, { "epoch": 0.3232, "grad_norm": 1.8995483769752146, "learning_rate": 1.711830777094028e-05, "loss": 1.2481, "step": 1010 }, { "epoch": 0.3248, "grad_norm": 1.773657115134248, "learning_rate": 1.707896326029903e-05, "loss": 1.3403, "step": 1015 }, { "epoch": 0.3264, "grad_norm": 1.699934912774303, "learning_rate": 1.7039397858846428e-05, "loss": 1.2121, "step": 1020 }, { "epoch": 0.328, "grad_norm": 1.8804415731212936, "learning_rate": 1.6999612801174782e-05, "loss": 1.2022, "step": 1025 }, { "epoch": 0.3296, "grad_norm": 1.6751309771308898, "learning_rate": 1.6959609328730526e-05, "loss": 1.2286, "step": 1030 }, { "epoch": 0.3312, "grad_norm": 1.6618184846871142, "learning_rate": 1.6919388689775463e-05, "loss": 1.2339, "step": 1035 }, { "epoch": 0.3328, "grad_norm": 1.6948121531735394, "learning_rate": 1.6878952139347834e-05, "loss": 1.2594, "step": 1040 }, { "epoch": 0.3344, "grad_norm": 1.709768346802401, "learning_rate": 1.6838300939223144e-05, "loss": 1.2386, "step": 1045 }, { "epoch": 0.336, "grad_norm": 2.0294841083107955, "learning_rate": 1.679743635787479e-05, "loss": 1.2593, "step": 1050 }, { "epoch": 0.3376, "grad_norm": 1.7526618866348942, "learning_rate": 1.6756359670434478e-05, "loss": 1.2645, "step": 1055 }, { "epoch": 0.3392, "grad_norm": 1.8609686803511076, "learning_rate": 1.6715072158652444e-05, "loss": 1.3261, "step": 1060 }, { "epoch": 0.3408, "grad_norm": 2.250771531262466, "learning_rate": 1.6673575110857457e-05, "loss": 1.2702, "step": 1065 }, { "epoch": 0.3424, "grad_norm": 1.8244852666861542, "learning_rate": 1.6631869821916602e-05, "loss": 1.2426, "step": 1070 }, { "epoch": 0.344, "grad_norm": 1.6273641049291432, "learning_rate": 1.6589957593194887e-05, "loss": 1.2285, "step": 1075 }, { "epoch": 0.3456, "grad_norm": 1.7482882018914985, "learning_rate": 1.6547839732514646e-05, "loss": 1.2331, "step": 1080 }, { "epoch": 0.3472, "grad_norm": 1.732658382810363, "learning_rate": 1.650551755411471e-05, "loss": 1.2513, "step": 1085 }, { "epoch": 0.3488, "grad_norm": 2.383840659615826, "learning_rate": 1.646299237860941e-05, "loss": 1.2799, "step": 1090 }, { "epoch": 0.3504, "grad_norm": 1.9133936804042024, "learning_rate": 1.6420265532947364e-05, "loss": 1.1864, "step": 1095 }, { "epoch": 0.352, "grad_norm": 1.815124039273757, "learning_rate": 1.6377338350370077e-05, "loss": 1.2287, "step": 1100 }, { "epoch": 0.3536, "grad_norm": 1.5451596792444091, "learning_rate": 1.6334212170370323e-05, "loss": 1.2859, "step": 1105 }, { "epoch": 0.3552, "grad_norm": 1.867947516409044, "learning_rate": 1.6290888338650373e-05, "loss": 1.1812, "step": 1110 }, { "epoch": 0.3568, "grad_norm": 1.7288525270163788, "learning_rate": 1.624736820707998e-05, "loss": 1.2508, "step": 1115 }, { "epoch": 0.3584, "grad_norm": 1.5209625000723592, "learning_rate": 1.6203653133654213e-05, "loss": 1.3289, "step": 1120 }, { "epoch": 0.36, "grad_norm": 1.607075472981109, "learning_rate": 1.615974448245107e-05, "loss": 1.2189, "step": 1125 }, { "epoch": 0.3616, "grad_norm": 1.7026832199922037, "learning_rate": 1.6115643623588915e-05, "loss": 1.2712, "step": 1130 }, { "epoch": 0.3632, "grad_norm": 1.7251428350551408, "learning_rate": 1.6071351933183736e-05, "loss": 1.2386, "step": 1135 }, { "epoch": 0.3648, "grad_norm": 1.662077062418413, "learning_rate": 1.602687079330619e-05, "loss": 1.2502, "step": 1140 }, { "epoch": 0.3664, "grad_norm": 1.4725115350669602, "learning_rate": 1.5982201591938496e-05, "loss": 1.2845, "step": 1145 }, { "epoch": 0.368, "grad_norm": 1.6912100253848203, "learning_rate": 1.5937345722931098e-05, "loss": 1.2973, "step": 1150 }, { "epoch": 0.3696, "grad_norm": 1.6811085090955742, "learning_rate": 1.5892304585959193e-05, "loss": 1.3189, "step": 1155 }, { "epoch": 0.3712, "grad_norm": 1.7117099512102294, "learning_rate": 1.5847079586479052e-05, "loss": 1.2956, "step": 1160 }, { "epoch": 0.3728, "grad_norm": 1.7616439315210002, "learning_rate": 1.580167213568416e-05, "loss": 1.2607, "step": 1165 }, { "epoch": 0.3744, "grad_norm": 1.958383374292918, "learning_rate": 1.575608365046118e-05, "loss": 1.2068, "step": 1170 }, { "epoch": 0.376, "grad_norm": 1.9451465464226556, "learning_rate": 1.571031555334575e-05, "loss": 1.2012, "step": 1175 }, { "epoch": 0.3776, "grad_norm": 1.5953267218545113, "learning_rate": 1.566436927247808e-05, "loss": 1.2158, "step": 1180 }, { "epoch": 0.3792, "grad_norm": 1.5713426095091674, "learning_rate": 1.5618246241558402e-05, "loss": 1.2816, "step": 1185 }, { "epoch": 0.3808, "grad_norm": 1.9288837080365826, "learning_rate": 1.5571947899802227e-05, "loss": 1.275, "step": 1190 }, { "epoch": 0.3824, "grad_norm": 1.6821430859336053, "learning_rate": 1.5525475691895438e-05, "loss": 1.2094, "step": 1195 }, { "epoch": 0.384, "grad_norm": 1.8584643575754158, "learning_rate": 1.5478831067949203e-05, "loss": 1.2612, "step": 1200 }, { "epoch": 0.3856, "grad_norm": 1.7570035994109643, "learning_rate": 1.5432015483454736e-05, "loss": 1.2333, "step": 1205 }, { "epoch": 0.3872, "grad_norm": 1.755216395574071, "learning_rate": 1.5385030399237878e-05, "loss": 1.2846, "step": 1210 }, { "epoch": 0.3888, "grad_norm": 1.6911329602711573, "learning_rate": 1.533787728141351e-05, "loss": 1.2371, "step": 1215 }, { "epoch": 0.3904, "grad_norm": 1.7403304486779092, "learning_rate": 1.5290557601339807e-05, "loss": 1.2939, "step": 1220 }, { "epoch": 0.392, "grad_norm": 1.6780909672189053, "learning_rate": 1.5243072835572319e-05, "loss": 1.3205, "step": 1225 }, { "epoch": 0.3936, "grad_norm": 1.6847916812460502, "learning_rate": 1.5195424465817911e-05, "loss": 1.217, "step": 1230 }, { "epoch": 0.3952, "grad_norm": 1.8655333892655066, "learning_rate": 1.5147613978888514e-05, "loss": 1.1974, "step": 1235 }, { "epoch": 0.3968, "grad_norm": 1.6806313810696407, "learning_rate": 1.5099642866654747e-05, "loss": 1.2482, "step": 1240 }, { "epoch": 0.3984, "grad_norm": 1.6973582035544275, "learning_rate": 1.505151262599934e-05, "loss": 1.2144, "step": 1245 }, { "epoch": 0.4, "grad_norm": 1.7257749200986132, "learning_rate": 1.5003224758770447e-05, "loss": 1.2354, "step": 1250 }, { "epoch": 0.4016, "grad_norm": 1.8407907459615103, "learning_rate": 1.4954780771734783e-05, "loss": 1.3085, "step": 1255 }, { "epoch": 0.4032, "grad_norm": 1.7057717859268757, "learning_rate": 1.4906182176530588e-05, "loss": 1.2917, "step": 1260 }, { "epoch": 0.4048, "grad_norm": 1.9373101107938622, "learning_rate": 1.4857430489620476e-05, "loss": 1.3003, "step": 1265 }, { "epoch": 0.4064, "grad_norm": 1.6293692369800394, "learning_rate": 1.4808527232244113e-05, "loss": 1.2204, "step": 1270 }, { "epoch": 0.408, "grad_norm": 1.7182679564190864, "learning_rate": 1.4759473930370738e-05, "loss": 1.3049, "step": 1275 }, { "epoch": 0.4096, "grad_norm": 1.988569581799617, "learning_rate": 1.4710272114651555e-05, "loss": 1.2144, "step": 1280 }, { "epoch": 0.4112, "grad_norm": 2.02181577533545, "learning_rate": 1.4660923320371974e-05, "loss": 1.2398, "step": 1285 }, { "epoch": 0.4128, "grad_norm": 1.8621040661004704, "learning_rate": 1.4611429087403695e-05, "loss": 1.2488, "step": 1290 }, { "epoch": 0.4144, "grad_norm": 1.6373000584535569, "learning_rate": 1.456179096015667e-05, "loss": 1.3347, "step": 1295 }, { "epoch": 0.416, "grad_norm": 1.8364279420285126, "learning_rate": 1.4512010487530899e-05, "loss": 1.2771, "step": 1300 }, { "epoch": 0.4176, "grad_norm": 1.7329746714101526, "learning_rate": 1.4462089222868099e-05, "loss": 1.196, "step": 1305 }, { "epoch": 0.4192, "grad_norm": 1.7674384567407724, "learning_rate": 1.4412028723903251e-05, "loss": 1.2556, "step": 1310 }, { "epoch": 0.4208, "grad_norm": 1.8925569918272147, "learning_rate": 1.4361830552715973e-05, "loss": 1.2809, "step": 1315 }, { "epoch": 0.4224, "grad_norm": 1.9179539850525023, "learning_rate": 1.4311496275681785e-05, "loss": 1.1543, "step": 1320 }, { "epoch": 0.424, "grad_norm": 1.7074805466183205, "learning_rate": 1.4261027463423232e-05, "loss": 1.183, "step": 1325 }, { "epoch": 0.4256, "grad_norm": 1.5387005155764308, "learning_rate": 1.4210425690760876e-05, "loss": 1.2124, "step": 1330 }, { "epoch": 0.4272, "grad_norm": 1.8069876048050215, "learning_rate": 1.4159692536664147e-05, "loss": 1.2541, "step": 1335 }, { "epoch": 0.4288, "grad_norm": 1.7208228857424368, "learning_rate": 1.410882958420209e-05, "loss": 1.2699, "step": 1340 }, { "epoch": 0.4304, "grad_norm": 1.8589108006950263, "learning_rate": 1.405783842049395e-05, "loss": 1.2242, "step": 1345 }, { "epoch": 0.432, "grad_norm": 1.6054110828693493, "learning_rate": 1.4006720636659656e-05, "loss": 1.2153, "step": 1350 }, { "epoch": 0.4336, "grad_norm": 1.87504292037103, "learning_rate": 1.3955477827770174e-05, "loss": 1.2861, "step": 1355 }, { "epoch": 0.4352, "grad_norm": 1.8315462144083805, "learning_rate": 1.3904111592797724e-05, "loss": 1.2106, "step": 1360 }, { "epoch": 0.4368, "grad_norm": 1.7728614708361805, "learning_rate": 1.3852623534565901e-05, "loss": 1.1785, "step": 1365 }, { "epoch": 0.4384, "grad_norm": 1.8114433322383807, "learning_rate": 1.3801015259699648e-05, "loss": 1.2453, "step": 1370 }, { "epoch": 0.44, "grad_norm": 1.757844125550881, "learning_rate": 1.3749288378575133e-05, "loss": 1.1936, "step": 1375 }, { "epoch": 0.4416, "grad_norm": 1.6877536751707114, "learning_rate": 1.3697444505269489e-05, "loss": 1.306, "step": 1380 }, { "epoch": 0.4432, "grad_norm": 1.7304940666238933, "learning_rate": 1.3645485257510456e-05, "loss": 1.2906, "step": 1385 }, { "epoch": 0.4448, "grad_norm": 1.6185528808856406, "learning_rate": 1.3593412256625898e-05, "loss": 1.1702, "step": 1390 }, { "epoch": 0.4464, "grad_norm": 1.7654642882266411, "learning_rate": 1.3541227127493218e-05, "loss": 1.2037, "step": 1395 }, { "epoch": 0.448, "grad_norm": 1.8559307749117275, "learning_rate": 1.348893149848865e-05, "loss": 1.2182, "step": 1400 }, { "epoch": 0.4496, "grad_norm": 1.7417002408754572, "learning_rate": 1.3436527001436437e-05, "loss": 1.1373, "step": 1405 }, { "epoch": 0.4512, "grad_norm": 1.7038358431614877, "learning_rate": 1.3384015271557938e-05, "loss": 1.1929, "step": 1410 }, { "epoch": 0.4528, "grad_norm": 1.6284467730858174, "learning_rate": 1.3331397947420578e-05, "loss": 1.1521, "step": 1415 }, { "epoch": 0.4544, "grad_norm": 1.7274539110687543, "learning_rate": 1.3278676670886728e-05, "loss": 1.2055, "step": 1420 }, { "epoch": 0.456, "grad_norm": 2.000897147406469, "learning_rate": 1.3225853087062481e-05, "loss": 1.2287, "step": 1425 }, { "epoch": 0.4576, "grad_norm": 1.737899727543391, "learning_rate": 1.3172928844246297e-05, "loss": 1.2065, "step": 1430 }, { "epoch": 0.4592, "grad_norm": 1.5774280537084664, "learning_rate": 1.3119905593877593e-05, "loss": 1.2003, "step": 1435 }, { "epoch": 0.4608, "grad_norm": 1.7604835736651805, "learning_rate": 1.3066784990485202e-05, "loss": 1.1642, "step": 1440 }, { "epoch": 0.4624, "grad_norm": 1.8245805480052915, "learning_rate": 1.3013568691635733e-05, "loss": 1.1785, "step": 1445 }, { "epoch": 0.464, "grad_norm": 1.669261473180978, "learning_rate": 1.2960258357881875e-05, "loss": 1.2096, "step": 1450 }, { "epoch": 0.4656, "grad_norm": 1.7979692561347327, "learning_rate": 1.2906855652710557e-05, "loss": 1.2972, "step": 1455 }, { "epoch": 0.4672, "grad_norm": 1.5939760767715185, "learning_rate": 1.2853362242491054e-05, "loss": 1.2427, "step": 1460 }, { "epoch": 0.4688, "grad_norm": 1.7306914665269681, "learning_rate": 1.279977979642299e-05, "loss": 1.1997, "step": 1465 }, { "epoch": 0.4704, "grad_norm": 1.7139864987990199, "learning_rate": 1.2746109986484236e-05, "loss": 1.157, "step": 1470 }, { "epoch": 0.472, "grad_norm": 1.8866391900020831, "learning_rate": 1.2692354487378768e-05, "loss": 1.2547, "step": 1475 }, { "epoch": 0.4736, "grad_norm": 1.498672501554436, "learning_rate": 1.2638514976484384e-05, "loss": 1.2136, "step": 1480 }, { "epoch": 0.4752, "grad_norm": 1.4399336555942077, "learning_rate": 1.2584593133800374e-05, "loss": 1.1856, "step": 1485 }, { "epoch": 0.4768, "grad_norm": 1.6133842530330442, "learning_rate": 1.2530590641895089e-05, "loss": 1.2328, "step": 1490 }, { "epoch": 0.4784, "grad_norm": 1.6259809990458836, "learning_rate": 1.2476509185853456e-05, "loss": 1.239, "step": 1495 }, { "epoch": 0.48, "grad_norm": 1.9530196475968609, "learning_rate": 1.242235045322438e-05, "loss": 1.1775, "step": 1500 }, { "epoch": 0.4816, "grad_norm": 2.056215916871571, "learning_rate": 1.2368116133968091e-05, "loss": 1.201, "step": 1505 }, { "epoch": 0.4832, "grad_norm": 1.7814846654045295, "learning_rate": 1.2313807920403419e-05, "loss": 1.2011, "step": 1510 }, { "epoch": 0.4848, "grad_norm": 1.8306687091155145, "learning_rate": 1.2259427507154964e-05, "loss": 1.154, "step": 1515 }, { "epoch": 0.4864, "grad_norm": 1.7357778684621021, "learning_rate": 1.2204976591100253e-05, "loss": 1.2003, "step": 1520 }, { "epoch": 0.488, "grad_norm": 1.7969413341692224, "learning_rate": 1.2150456871316758e-05, "loss": 1.1421, "step": 1525 }, { "epoch": 0.4896, "grad_norm": 1.5319800362284848, "learning_rate": 1.2095870049028898e-05, "loss": 1.1696, "step": 1530 }, { "epoch": 0.4912, "grad_norm": 1.9854512691507675, "learning_rate": 1.2041217827554939e-05, "loss": 1.1722, "step": 1535 }, { "epoch": 0.4928, "grad_norm": 1.6666700649037207, "learning_rate": 1.1986501912253863e-05, "loss": 1.206, "step": 1540 }, { "epoch": 0.4944, "grad_norm": 1.9594133173835664, "learning_rate": 1.1931724010472135e-05, "loss": 1.2486, "step": 1545 }, { "epoch": 0.496, "grad_norm": 1.9394583758238864, "learning_rate": 1.1876885831490442e-05, "loss": 1.2641, "step": 1550 }, { "epoch": 0.4976, "grad_norm": 1.8867772326625134, "learning_rate": 1.1821989086470349e-05, "loss": 1.2133, "step": 1555 }, { "epoch": 0.4992, "grad_norm": 1.9360885869863398, "learning_rate": 1.1767035488400903e-05, "loss": 1.2219, "step": 1560 }, { "epoch": 0.5008, "grad_norm": 1.7712443022979265, "learning_rate": 1.1712026752045189e-05, "loss": 1.2163, "step": 1565 }, { "epoch": 0.5024, "grad_norm": 2.1937615392335155, "learning_rate": 1.1656964593886819e-05, "loss": 1.1115, "step": 1570 }, { "epoch": 0.504, "grad_norm": 1.811983018037607, "learning_rate": 1.1601850732076361e-05, "loss": 1.2648, "step": 1575 }, { "epoch": 0.5056, "grad_norm": 1.831478492450499, "learning_rate": 1.1546686886377745e-05, "loss": 1.2058, "step": 1580 }, { "epoch": 0.5072, "grad_norm": 1.8203216031603568, "learning_rate": 1.1491474778114588e-05, "loss": 1.1626, "step": 1585 }, { "epoch": 0.5088, "grad_norm": 2.020122182406412, "learning_rate": 1.143621613011648e-05, "loss": 1.2599, "step": 1590 }, { "epoch": 0.5104, "grad_norm": 1.7964307131894044, "learning_rate": 1.1380912666665234e-05, "loss": 1.1822, "step": 1595 }, { "epoch": 0.512, "grad_norm": 1.5557553071786303, "learning_rate": 1.1325566113441074e-05, "loss": 1.3104, "step": 1600 }, { "epoch": 0.5136, "grad_norm": 1.755041923303687, "learning_rate": 1.1270178197468788e-05, "loss": 1.2181, "step": 1605 }, { "epoch": 0.5152, "grad_norm": 1.6227097264031225, "learning_rate": 1.121475064706385e-05, "loss": 1.1752, "step": 1610 }, { "epoch": 0.5168, "grad_norm": 1.6195409838722366, "learning_rate": 1.1159285191778473e-05, "loss": 1.2025, "step": 1615 }, { "epoch": 0.5184, "grad_norm": 1.7210526886711655, "learning_rate": 1.1103783562347642e-05, "loss": 1.1475, "step": 1620 }, { "epoch": 0.52, "grad_norm": 1.8002985653421948, "learning_rate": 1.1048247490635133e-05, "loss": 1.208, "step": 1625 }, { "epoch": 0.5216, "grad_norm": 1.8881024047703057, "learning_rate": 1.099267870957943e-05, "loss": 1.2602, "step": 1630 }, { "epoch": 0.5232, "grad_norm": 1.6454570477183756, "learning_rate": 1.0937078953139691e-05, "loss": 1.1865, "step": 1635 }, { "epoch": 0.5248, "grad_norm": 1.9745914198427592, "learning_rate": 1.0881449956241616e-05, "loss": 1.0327, "step": 1640 }, { "epoch": 0.5264, "grad_norm": 1.873322732875923, "learning_rate": 1.0825793454723325e-05, "loss": 1.2125, "step": 1645 }, { "epoch": 0.528, "grad_norm": 2.356958452510717, "learning_rate": 1.0770111185281182e-05, "loss": 1.194, "step": 1650 }, { "epoch": 0.5296, "grad_norm": 1.8047582116918872, "learning_rate": 1.071440488541562e-05, "loss": 1.185, "step": 1655 }, { "epoch": 0.5312, "grad_norm": 1.8658110439674953, "learning_rate": 1.0658676293376894e-05, "loss": 1.1692, "step": 1660 }, { "epoch": 0.5328, "grad_norm": 1.8500780961524974, "learning_rate": 1.0602927148110882e-05, "loss": 1.1241, "step": 1665 }, { "epoch": 0.5344, "grad_norm": 1.9016895300800236, "learning_rate": 1.0547159189204788e-05, "loss": 1.1445, "step": 1670 }, { "epoch": 0.536, "grad_norm": 1.7979165639498904, "learning_rate": 1.0491374156832875e-05, "loss": 1.1541, "step": 1675 }, { "epoch": 0.5376, "grad_norm": 1.7700277222491425, "learning_rate": 1.043557379170217e-05, "loss": 1.1978, "step": 1680 }, { "epoch": 0.5392, "grad_norm": 1.5777221895998852, "learning_rate": 1.0379759834998133e-05, "loss": 1.1876, "step": 1685 }, { "epoch": 0.5408, "grad_norm": 1.6820301175255123, "learning_rate": 1.0323934028330337e-05, "loss": 1.2454, "step": 1690 }, { "epoch": 0.5424, "grad_norm": 1.5758571229056562, "learning_rate": 1.0268098113678124e-05, "loss": 1.2172, "step": 1695 }, { "epoch": 0.544, "grad_norm": 1.4960013175419522, "learning_rate": 1.0212253833336237e-05, "loss": 1.2248, "step": 1700 }, { "epoch": 0.5456, "grad_norm": 1.6797061729785308, "learning_rate": 1.015640292986046e-05, "loss": 1.1263, "step": 1705 }, { "epoch": 0.5472, "grad_norm": 1.980968334084065, "learning_rate": 1.0100547146013252e-05, "loss": 1.2231, "step": 1710 }, { "epoch": 0.5488, "grad_norm": 1.747611597582233, "learning_rate": 1.0044688224709346e-05, "loss": 1.1406, "step": 1715 }, { "epoch": 0.5504, "grad_norm": 1.7989270293601793, "learning_rate": 9.988827908961392e-06, "loss": 1.2152, "step": 1720 }, { "epoch": 0.552, "grad_norm": 1.9852221230685252, "learning_rate": 9.932967941825539e-06, "loss": 1.1508, "step": 1725 }, { "epoch": 0.5536, "grad_norm": 1.6179292477548883, "learning_rate": 9.87711006634706e-06, "loss": 1.264, "step": 1730 }, { "epoch": 0.5552, "grad_norm": 1.6402075483843201, "learning_rate": 9.821256025505964e-06, "loss": 1.1641, "step": 1735 }, { "epoch": 0.5568, "grad_norm": 1.8546896875801224, "learning_rate": 9.765407562162606e-06, "loss": 1.1882, "step": 1740 }, { "epoch": 0.5584, "grad_norm": 1.9516116691456564, "learning_rate": 9.709566419003292e-06, "loss": 1.1943, "step": 1745 }, { "epoch": 0.56, "grad_norm": 1.903713613286444, "learning_rate": 9.653734338485924e-06, "loss": 1.1536, "step": 1750 }, { "epoch": 0.5616, "grad_norm": 2.14302842986629, "learning_rate": 9.597913062785603e-06, "loss": 1.153, "step": 1755 }, { "epoch": 0.5632, "grad_norm": 2.0015452119346904, "learning_rate": 9.54210433374028e-06, "loss": 1.1768, "step": 1760 }, { "epoch": 0.5648, "grad_norm": 1.7261014873427656, "learning_rate": 9.486309892796413e-06, "loss": 1.1223, "step": 1765 }, { "epoch": 0.5664, "grad_norm": 1.7829074712427893, "learning_rate": 9.430531480954605e-06, "loss": 1.1655, "step": 1770 }, { "epoch": 0.568, "grad_norm": 1.8448110808767446, "learning_rate": 9.374770838715289e-06, "loss": 1.1326, "step": 1775 }, { "epoch": 0.5696, "grad_norm": 1.521175463781666, "learning_rate": 9.319029706024428e-06, "loss": 1.2032, "step": 1780 }, { "epoch": 0.5712, "grad_norm": 1.7716801204207469, "learning_rate": 9.2633098222192e-06, "loss": 1.1556, "step": 1785 }, { "epoch": 0.5728, "grad_norm": 1.9575858433192608, "learning_rate": 9.20761292597375e-06, "loss": 1.1994, "step": 1790 }, { "epoch": 0.5744, "grad_norm": 1.7856666095928753, "learning_rate": 9.151940755244912e-06, "loss": 1.1845, "step": 1795 }, { "epoch": 0.576, "grad_norm": 1.7605163069385534, "learning_rate": 9.096295047217988e-06, "loss": 1.0999, "step": 1800 }, { "epoch": 0.5776, "grad_norm": 1.7689342903268934, "learning_rate": 9.040677538252555e-06, "loss": 1.1588, "step": 1805 }, { "epoch": 0.5792, "grad_norm": 1.6639324896801173, "learning_rate": 8.985089963828262e-06, "loss": 1.2633, "step": 1810 }, { "epoch": 0.5808, "grad_norm": 1.7769903604934556, "learning_rate": 8.929534058490682e-06, "loss": 1.0862, "step": 1815 }, { "epoch": 0.5824, "grad_norm": 2.2536341843052523, "learning_rate": 8.8740115557972e-06, "loss": 1.1634, "step": 1820 }, { "epoch": 0.584, "grad_norm": 1.9316512113483337, "learning_rate": 8.8185241882629e-06, "loss": 1.2348, "step": 1825 }, { "epoch": 0.5856, "grad_norm": 1.7122297082902351, "learning_rate": 8.763073687306523e-06, "loss": 1.0607, "step": 1830 }, { "epoch": 0.5872, "grad_norm": 1.870260682219499, "learning_rate": 8.707661783196432e-06, "loss": 1.1746, "step": 1835 }, { "epoch": 0.5888, "grad_norm": 2.0935425545694435, "learning_rate": 8.652290204996613e-06, "loss": 1.2206, "step": 1840 }, { "epoch": 0.5904, "grad_norm": 1.717721044329421, "learning_rate": 8.59696068051273e-06, "loss": 1.1471, "step": 1845 }, { "epoch": 0.592, "grad_norm": 1.9182370411555087, "learning_rate": 8.541674936238219e-06, "loss": 1.2312, "step": 1850 }, { "epoch": 0.5936, "grad_norm": 1.9856188585357646, "learning_rate": 8.486434697300394e-06, "loss": 1.191, "step": 1855 }, { "epoch": 0.5952, "grad_norm": 1.6979401036336261, "learning_rate": 8.431241687406631e-06, "loss": 1.1778, "step": 1860 }, { "epoch": 0.5968, "grad_norm": 1.7333920807373167, "learning_rate": 8.376097628790586e-06, "loss": 1.1722, "step": 1865 }, { "epoch": 0.5984, "grad_norm": 1.9709991930061477, "learning_rate": 8.321004242158439e-06, "loss": 1.1648, "step": 1870 }, { "epoch": 0.6, "grad_norm": 1.5364345407471778, "learning_rate": 8.265963246635212e-06, "loss": 1.1027, "step": 1875 }, { "epoch": 0.6016, "grad_norm": 1.7584698049912544, "learning_rate": 8.210976359711124e-06, "loss": 1.1584, "step": 1880 }, { "epoch": 0.6032, "grad_norm": 1.8792898730879708, "learning_rate": 8.156045297187994e-06, "loss": 1.1035, "step": 1885 }, { "epoch": 0.6048, "grad_norm": 1.6188642240330084, "learning_rate": 8.101171773125716e-06, "loss": 1.1365, "step": 1890 }, { "epoch": 0.6064, "grad_norm": 1.7750013978198032, "learning_rate": 8.046357499788757e-06, "loss": 1.1839, "step": 1895 }, { "epoch": 0.608, "grad_norm": 2.119708886048633, "learning_rate": 7.991604187592732e-06, "loss": 1.2787, "step": 1900 }, { "epoch": 0.6096, "grad_norm": 1.7186682287635704, "learning_rate": 7.93691354505103e-06, "loss": 1.214, "step": 1905 }, { "epoch": 0.6112, "grad_norm": 1.6183001303197078, "learning_rate": 7.882287278721523e-06, "loss": 1.1909, "step": 1910 }, { "epoch": 0.6128, "grad_norm": 2.143170628940746, "learning_rate": 7.82772709315328e-06, "loss": 1.1455, "step": 1915 }, { "epoch": 0.6144, "grad_norm": 1.497411501505339, "learning_rate": 7.77323469083341e-06, "loss": 1.1781, "step": 1920 }, { "epoch": 0.616, "grad_norm": 1.662140808683762, "learning_rate": 7.718811772133918e-06, "loss": 1.0994, "step": 1925 }, { "epoch": 0.6176, "grad_norm": 1.7552052195766064, "learning_rate": 7.664460035258651e-06, "loss": 1.2607, "step": 1930 }, { "epoch": 0.6192, "grad_norm": 1.7023983361776165, "learning_rate": 7.610181176190318e-06, "loss": 1.0845, "step": 1935 }, { "epoch": 0.6208, "grad_norm": 1.8362961723743005, "learning_rate": 7.555976888637556e-06, "loss": 1.15, "step": 1940 }, { "epoch": 0.6224, "grad_norm": 1.9014772801655833, "learning_rate": 7.501848863982082e-06, "loss": 1.1966, "step": 1945 }, { "epoch": 0.624, "grad_norm": 1.8996856327164577, "learning_rate": 7.447798791225925e-06, "loss": 1.1323, "step": 1950 }, { "epoch": 0.6256, "grad_norm": 2.1146462861770656, "learning_rate": 7.393828356938709e-06, "loss": 1.1523, "step": 1955 }, { "epoch": 0.6272, "grad_norm": 1.8013711821158906, "learning_rate": 7.3399392452050385e-06, "loss": 1.2109, "step": 1960 }, { "epoch": 0.6288, "grad_norm": 1.9068508880313984, "learning_rate": 7.286133137571938e-06, "loss": 1.1267, "step": 1965 }, { "epoch": 0.6304, "grad_norm": 1.7428851981643643, "learning_rate": 7.2324117129963815e-06, "loss": 1.1257, "step": 1970 }, { "epoch": 0.632, "grad_norm": 1.8247252498397877, "learning_rate": 7.178776647792918e-06, "loss": 1.1672, "step": 1975 }, { "epoch": 0.6336, "grad_norm": 1.9260484501951087, "learning_rate": 7.125229615581346e-06, "loss": 1.0862, "step": 1980 }, { "epoch": 0.6352, "grad_norm": 1.6199644672243705, "learning_rate": 7.071772287234497e-06, "loss": 1.1253, "step": 1985 }, { "epoch": 0.6368, "grad_norm": 1.8245871462549041, "learning_rate": 7.018406330826096e-06, "loss": 1.143, "step": 1990 }, { "epoch": 0.6384, "grad_norm": 1.9108992862414564, "learning_rate": 6.96513341157872e-06, "loss": 1.1871, "step": 1995 }, { "epoch": 0.64, "grad_norm": 2.1891002529615133, "learning_rate": 6.911955191811819e-06, "loss": 1.1825, "step": 2000 }, { "epoch": 0.6416, "grad_norm": 1.7462341130055452, "learning_rate": 6.858873330889868e-06, "loss": 1.1545, "step": 2005 }, { "epoch": 0.6432, "grad_norm": 1.9883977700425417, "learning_rate": 6.8058894851705655e-06, "loss": 1.1614, "step": 2010 }, { "epoch": 0.6448, "grad_norm": 2.0475807120880503, "learning_rate": 6.7530053079531664e-06, "loss": 1.16, "step": 2015 }, { "epoch": 0.6464, "grad_norm": 1.7165548410652414, "learning_rate": 6.700222449426885e-06, "loss": 1.2106, "step": 2020 }, { "epoch": 0.648, "grad_norm": 1.8955565543894581, "learning_rate": 6.6475425566194006e-06, "loss": 1.2206, "step": 2025 }, { "epoch": 0.6496, "grad_norm": 1.4744837625300449, "learning_rate": 6.59496727334547e-06, "loss": 1.1281, "step": 2030 }, { "epoch": 0.6512, "grad_norm": 1.7836613268629666, "learning_rate": 6.5424982401556305e-06, "loss": 1.1447, "step": 2035 }, { "epoch": 0.6528, "grad_norm": 1.7417841029299312, "learning_rate": 6.490137094285008e-06, "loss": 1.1264, "step": 2040 }, { "epoch": 0.6544, "grad_norm": 1.814473803239514, "learning_rate": 6.437885469602235e-06, "loss": 1.1684, "step": 2045 }, { "epoch": 0.656, "grad_norm": 1.9782606451237765, "learning_rate": 6.385744996558456e-06, "loss": 1.1581, "step": 2050 }, { "epoch": 0.6576, "grad_norm": 1.7876517212973595, "learning_rate": 6.333717302136457e-06, "loss": 1.1654, "step": 2055 }, { "epoch": 0.6592, "grad_norm": 1.8597944182566617, "learning_rate": 6.28180400979991e-06, "loss": 1.125, "step": 2060 }, { "epoch": 0.6608, "grad_norm": 1.7778089909036836, "learning_rate": 6.230006739442692e-06, "loss": 1.0536, "step": 2065 }, { "epoch": 0.6624, "grad_norm": 1.6834263332344224, "learning_rate": 6.178327107338353e-06, "loss": 1.2547, "step": 2070 }, { "epoch": 0.664, "grad_norm": 1.8674289724621407, "learning_rate": 6.1267667260896755e-06, "loss": 1.1382, "step": 2075 }, { "epoch": 0.6656, "grad_norm": 2.26294469324537, "learning_rate": 6.075327204578363e-06, "loss": 1.1593, "step": 2080 }, { "epoch": 0.6672, "grad_norm": 2.5345359642009697, "learning_rate": 6.024010147914826e-06, "loss": 1.1229, "step": 2085 }, { "epoch": 0.6688, "grad_norm": 1.783951584011371, "learning_rate": 5.972817157388106e-06, "loss": 1.161, "step": 2090 }, { "epoch": 0.6704, "grad_norm": 1.9698405640252596, "learning_rate": 5.921749830415905e-06, "loss": 1.1532, "step": 2095 }, { "epoch": 0.672, "grad_norm": 1.8272399298446076, "learning_rate": 5.870809760494734e-06, "loss": 1.158, "step": 2100 }, { "epoch": 0.6736, "grad_norm": 1.8653487403840077, "learning_rate": 5.819998537150203e-06, "loss": 1.1145, "step": 2105 }, { "epoch": 0.6752, "grad_norm": 1.7973076440807894, "learning_rate": 5.769317745887413e-06, "loss": 1.1443, "step": 2110 }, { "epoch": 0.6768, "grad_norm": 1.679633492442338, "learning_rate": 5.718768968141482e-06, "loss": 1.1489, "step": 2115 }, { "epoch": 0.6784, "grad_norm": 1.8895834451470048, "learning_rate": 5.668353781228193e-06, "loss": 1.2102, "step": 2120 }, { "epoch": 0.68, "grad_norm": 1.6998299086665334, "learning_rate": 5.618073758294802e-06, "loss": 1.0882, "step": 2125 }, { "epoch": 0.6816, "grad_norm": 1.6972997419835827, "learning_rate": 5.567930468270911e-06, "loss": 1.1649, "step": 2130 }, { "epoch": 0.6832, "grad_norm": 1.6791063989529862, "learning_rate": 5.517925475819539e-06, "loss": 1.1799, "step": 2135 }, { "epoch": 0.6848, "grad_norm": 1.8188742853275077, "learning_rate": 5.468060341288286e-06, "loss": 1.1407, "step": 2140 }, { "epoch": 0.6864, "grad_norm": 1.7041718631526295, "learning_rate": 5.418336620660658e-06, "loss": 1.1303, "step": 2145 }, { "epoch": 0.688, "grad_norm": 1.8156936848418812, "learning_rate": 5.36875586550749e-06, "loss": 1.0786, "step": 2150 }, { "epoch": 0.6896, "grad_norm": 1.892869625914812, "learning_rate": 5.319319622938563e-06, "loss": 1.0928, "step": 2155 }, { "epoch": 0.6912, "grad_norm": 1.6893231213950322, "learning_rate": 5.270029435554295e-06, "loss": 1.1827, "step": 2160 }, { "epoch": 0.6928, "grad_norm": 1.8653568240793923, "learning_rate": 5.22088684139763e-06, "loss": 1.1745, "step": 2165 }, { "epoch": 0.6944, "grad_norm": 2.0406820023972134, "learning_rate": 5.171893373906036e-06, "loss": 1.1345, "step": 2170 }, { "epoch": 0.696, "grad_norm": 1.9382864649581717, "learning_rate": 5.1230505618636575e-06, "loss": 1.1739, "step": 2175 }, { "epoch": 0.6976, "grad_norm": 1.7660398218066646, "learning_rate": 5.074359929353604e-06, "loss": 1.0716, "step": 2180 }, { "epoch": 0.6992, "grad_norm": 2.074791669844254, "learning_rate": 5.025822995710414e-06, "loss": 1.1208, "step": 2185 }, { "epoch": 0.7008, "grad_norm": 1.8615920455866772, "learning_rate": 4.977441275472622e-06, "loss": 1.2122, "step": 2190 }, { "epoch": 0.7024, "grad_norm": 1.826025310797338, "learning_rate": 4.929216278335508e-06, "loss": 1.1664, "step": 2195 }, { "epoch": 0.704, "grad_norm": 1.7635277537631735, "learning_rate": 4.881149509103993e-06, "loss": 1.1626, "step": 2200 }, { "epoch": 0.7056, "grad_norm": 1.7116574967246256, "learning_rate": 4.833242467645677e-06, "loss": 1.2775, "step": 2205 }, { "epoch": 0.7072, "grad_norm": 2.1747827955243486, "learning_rate": 4.785496648844049e-06, "loss": 1.0902, "step": 2210 }, { "epoch": 0.7088, "grad_norm": 1.7332465164925657, "learning_rate": 4.737913542551824e-06, "loss": 1.1867, "step": 2215 }, { "epoch": 0.7104, "grad_norm": 1.878633892152507, "learning_rate": 4.690494633544466e-06, "loss": 1.1527, "step": 2220 }, { "epoch": 0.712, "grad_norm": 1.672009681273859, "learning_rate": 4.643241401473849e-06, "loss": 1.1545, "step": 2225 }, { "epoch": 0.7136, "grad_norm": 1.6606360695261788, "learning_rate": 4.596155320822103e-06, "loss": 1.1695, "step": 2230 }, { "epoch": 0.7152, "grad_norm": 1.8547958065224872, "learning_rate": 4.549237860855578e-06, "loss": 1.1323, "step": 2235 }, { "epoch": 0.7168, "grad_norm": 1.832050436632546, "learning_rate": 4.502490485579024e-06, "loss": 1.144, "step": 2240 }, { "epoch": 0.7184, "grad_norm": 2.1045933795560505, "learning_rate": 4.455914653689889e-06, "loss": 1.1535, "step": 2245 }, { "epoch": 0.72, "grad_norm": 1.7148356097343802, "learning_rate": 4.409511818532809e-06, "loss": 1.183, "step": 2250 }, { "epoch": 0.7216, "grad_norm": 2.3818826597813407, "learning_rate": 4.363283428054262e-06, "loss": 1.1364, "step": 2255 }, { "epoch": 0.7232, "grad_norm": 1.8692471600990392, "learning_rate": 4.317230924757379e-06, "loss": 1.1015, "step": 2260 }, { "epoch": 0.7248, "grad_norm": 1.7792530120975099, "learning_rate": 4.271355745656934e-06, "loss": 1.1086, "step": 2265 }, { "epoch": 0.7264, "grad_norm": 1.9066954032906234, "learning_rate": 4.2256593222345185e-06, "loss": 1.1734, "step": 2270 }, { "epoch": 0.728, "grad_norm": 1.7541470829000243, "learning_rate": 4.1801430803938496e-06, "loss": 1.155, "step": 2275 }, { "epoch": 0.7296, "grad_norm": 2.0413408604416476, "learning_rate": 4.1348084404162895e-06, "loss": 1.0836, "step": 2280 }, { "epoch": 0.7312, "grad_norm": 1.4779853410277413, "learning_rate": 4.089656816916525e-06, "loss": 1.1409, "step": 2285 }, { "epoch": 0.7328, "grad_norm": 1.9343727057904159, "learning_rate": 4.0446896187984275e-06, "loss": 1.0762, "step": 2290 }, { "epoch": 0.7344, "grad_norm": 1.8294071209689928, "learning_rate": 3.999908249211096e-06, "loss": 1.1651, "step": 2295 }, { "epoch": 0.736, "grad_norm": 2.032456774400865, "learning_rate": 3.955314105505056e-06, "loss": 1.053, "step": 2300 }, { "epoch": 0.7376, "grad_norm": 2.1233112429944265, "learning_rate": 3.910908579188672e-06, "loss": 1.1577, "step": 2305 }, { "epoch": 0.7392, "grad_norm": 2.0592110290446937, "learning_rate": 3.866693055884723e-06, "loss": 1.1549, "step": 2310 }, { "epoch": 0.7408, "grad_norm": 2.0050807354127076, "learning_rate": 3.8226689152871576e-06, "loss": 1.1511, "step": 2315 }, { "epoch": 0.7424, "grad_norm": 1.8596633831404055, "learning_rate": 3.7788375311180624e-06, "loss": 1.0813, "step": 2320 }, { "epoch": 0.744, "grad_norm": 1.58672559228326, "learning_rate": 3.735200271084779e-06, "loss": 1.1295, "step": 2325 }, { "epoch": 0.7456, "grad_norm": 1.8434949261948346, "learning_rate": 3.691758496837228e-06, "loss": 1.1977, "step": 2330 }, { "epoch": 0.7472, "grad_norm": 1.940207307549445, "learning_rate": 3.6485135639254234e-06, "loss": 1.0689, "step": 2335 }, { "epoch": 0.7488, "grad_norm": 1.8189064690257457, "learning_rate": 3.6054668217571774e-06, "loss": 1.1431, "step": 2340 }, { "epoch": 0.7504, "grad_norm": 1.6450125833114577, "learning_rate": 3.5626196135559898e-06, "loss": 1.249, "step": 2345 }, { "epoch": 0.752, "grad_norm": 1.9956205082954976, "learning_rate": 3.5199732763191317e-06, "loss": 1.1859, "step": 2350 }, { "epoch": 0.7536, "grad_norm": 1.6691387559775372, "learning_rate": 3.4775291407759393e-06, "loss": 1.1724, "step": 2355 }, { "epoch": 0.7552, "grad_norm": 1.5830809167877844, "learning_rate": 3.435288531346269e-06, "loss": 1.0217, "step": 2360 }, { "epoch": 0.7568, "grad_norm": 1.8523069996699053, "learning_rate": 3.3932527660991877e-06, "loss": 1.1154, "step": 2365 }, { "epoch": 0.7584, "grad_norm": 1.763904598184111, "learning_rate": 3.351423156711836e-06, "loss": 1.0468, "step": 2370 }, { "epoch": 0.76, "grad_norm": 1.914001722903738, "learning_rate": 3.309801008428498e-06, "loss": 1.0914, "step": 2375 }, { "epoch": 0.7616, "grad_norm": 1.7352331291973462, "learning_rate": 3.268387620019885e-06, "loss": 1.0735, "step": 2380 }, { "epoch": 0.7632, "grad_norm": 1.7246348422927915, "learning_rate": 3.2271842837425917e-06, "loss": 1.1295, "step": 2385 }, { "epoch": 0.7648, "grad_norm": 1.7026571693578514, "learning_rate": 3.1861922852987794e-06, "loss": 1.1347, "step": 2390 }, { "epoch": 0.7664, "grad_norm": 2.1293768786628204, "learning_rate": 3.1454129037960614e-06, "loss": 1.1555, "step": 2395 }, { "epoch": 0.768, "grad_norm": 1.6902344486971177, "learning_rate": 3.1048474117075834e-06, "loss": 1.1157, "step": 2400 }, { "epoch": 0.7696, "grad_norm": 2.10155608959706, "learning_rate": 3.0644970748323253e-06, "loss": 1.1737, "step": 2405 }, { "epoch": 0.7712, "grad_norm": 2.046652148355265, "learning_rate": 3.0243631522556027e-06, "loss": 1.0971, "step": 2410 }, { "epoch": 0.7728, "grad_norm": 1.883796149960297, "learning_rate": 2.984446896309764e-06, "loss": 1.161, "step": 2415 }, { "epoch": 0.7744, "grad_norm": 1.5732088062133587, "learning_rate": 2.94474955253513e-06, "loss": 1.156, "step": 2420 }, { "epoch": 0.776, "grad_norm": 2.1402289082011214, "learning_rate": 2.9052723596411194e-06, "loss": 1.1168, "step": 2425 }, { "epoch": 0.7776, "grad_norm": 1.5899796015059864, "learning_rate": 2.866016549467602e-06, "loss": 1.0812, "step": 2430 }, { "epoch": 0.7792, "grad_norm": 1.982290951409506, "learning_rate": 2.82698334694645e-06, "loss": 1.0825, "step": 2435 }, { "epoch": 0.7808, "grad_norm": 1.838017246032037, "learning_rate": 2.7881739700633382e-06, "loss": 1.0768, "step": 2440 }, { "epoch": 0.7824, "grad_norm": 1.8430683341212708, "learning_rate": 2.749589629819708e-06, "loss": 1.071, "step": 2445 }, { "epoch": 0.784, "grad_norm": 1.5710673613261896, "learning_rate": 2.7112315301949986e-06, "loss": 1.1206, "step": 2450 }, { "epoch": 0.7856, "grad_norm": 1.8029790473147127, "learning_rate": 2.6731008681090763e-06, "loss": 1.1289, "step": 2455 }, { "epoch": 0.7872, "grad_norm": 1.7207901355936954, "learning_rate": 2.6351988333848787e-06, "loss": 1.1068, "step": 2460 }, { "epoch": 0.7888, "grad_norm": 1.8160098112373757, "learning_rate": 2.5975266087113015e-06, "loss": 1.0642, "step": 2465 }, { "epoch": 0.7904, "grad_norm": 1.8253722230987668, "learning_rate": 2.5600853696062766e-06, "loss": 1.1683, "step": 2470 }, { "epoch": 0.792, "grad_norm": 1.6728480604226394, "learning_rate": 2.5228762843801047e-06, "loss": 1.1113, "step": 2475 }, { "epoch": 0.7936, "grad_norm": 2.0500652203837326, "learning_rate": 2.485900514098991e-06, "loss": 1.1313, "step": 2480 }, { "epoch": 0.7952, "grad_norm": 1.7659568393983387, "learning_rate": 2.4491592125488206e-06, "loss": 0.9846, "step": 2485 }, { "epoch": 0.7968, "grad_norm": 1.9000513891551978, "learning_rate": 2.4126535261991577e-06, "loss": 1.1371, "step": 2490 }, { "epoch": 0.7984, "grad_norm": 1.9436701672713421, "learning_rate": 2.3763845941674703e-06, "loss": 1.0047, "step": 2495 }, { "epoch": 0.8, "grad_norm": 1.99523909245642, "learning_rate": 2.340353548183575e-06, "loss": 1.1731, "step": 2500 }, { "epoch": 0.8016, "grad_norm": 2.1047916239553635, "learning_rate": 2.3045615125543353e-06, "loss": 1.0808, "step": 2505 }, { "epoch": 0.8032, "grad_norm": 1.8842322209384812, "learning_rate": 2.2690096041285757e-06, "loss": 1.1455, "step": 2510 }, { "epoch": 0.8048, "grad_norm": 1.8358242756030556, "learning_rate": 2.2336989322622306e-06, "loss": 1.0937, "step": 2515 }, { "epoch": 0.8064, "grad_norm": 1.841737147368622, "learning_rate": 2.198630598783723e-06, "loss": 1.1291, "step": 2520 }, { "epoch": 0.808, "grad_norm": 1.9997734451558542, "learning_rate": 2.1638056979596012e-06, "loss": 1.1503, "step": 2525 }, { "epoch": 0.8096, "grad_norm": 1.7953424174224597, "learning_rate": 2.1292253164603673e-06, "loss": 1.1404, "step": 2530 }, { "epoch": 0.8112, "grad_norm": 1.8801447137331397, "learning_rate": 2.094890533326589e-06, "loss": 1.1515, "step": 2535 }, { "epoch": 0.8128, "grad_norm": 1.7874394762295498, "learning_rate": 2.0608024199352216e-06, "loss": 1.1384, "step": 2540 }, { "epoch": 0.8144, "grad_norm": 1.9891630859598495, "learning_rate": 2.026962039966176e-06, "loss": 1.0683, "step": 2545 }, { "epoch": 0.816, "grad_norm": 1.75544090120103, "learning_rate": 1.9933704493691354e-06, "loss": 1.0515, "step": 2550 }, { "epoch": 0.8176, "grad_norm": 2.0623645710529543, "learning_rate": 1.960028696330596e-06, "loss": 1.1414, "step": 2555 }, { "epoch": 0.8192, "grad_norm": 1.9888969656304096, "learning_rate": 1.926937821241164e-06, "loss": 1.104, "step": 2560 }, { "epoch": 0.8208, "grad_norm": 1.960116467134126, "learning_rate": 1.8940988566630903e-06, "loss": 1.1162, "step": 2565 }, { "epoch": 0.8224, "grad_norm": 1.9681904476690104, "learning_rate": 1.861512827298051e-06, "loss": 1.0104, "step": 2570 }, { "epoch": 0.824, "grad_norm": 1.8129607235308978, "learning_rate": 1.8291807499551772e-06, "loss": 1.1511, "step": 2575 }, { "epoch": 0.8256, "grad_norm": 1.839592043059129, "learning_rate": 1.7971036335193249e-06, "loss": 1.0645, "step": 2580 }, { "epoch": 0.8272, "grad_norm": 2.1633329214440518, "learning_rate": 1.7652824789195811e-06, "loss": 1.1242, "step": 2585 }, { "epoch": 0.8288, "grad_norm": 1.7472689726208692, "learning_rate": 1.73371827909805e-06, "loss": 1.1294, "step": 2590 }, { "epoch": 0.8304, "grad_norm": 1.8402949682889143, "learning_rate": 1.7024120189788573e-06, "loss": 1.0553, "step": 2595 }, { "epoch": 0.832, "grad_norm": 1.8532842400093568, "learning_rate": 1.6713646754374225e-06, "loss": 1.0766, "step": 2600 }, { "epoch": 0.8336, "grad_norm": 1.6432746688593902, "learning_rate": 1.6405772172699696e-06, "loss": 1.0594, "step": 2605 }, { "epoch": 0.8352, "grad_norm": 2.3171100768307142, "learning_rate": 1.6100506051633136e-06, "loss": 1.0881, "step": 2610 }, { "epoch": 0.8368, "grad_norm": 2.292555784474286, "learning_rate": 1.5797857916648596e-06, "loss": 1.1406, "step": 2615 }, { "epoch": 0.8384, "grad_norm": 1.83904059864528, "learning_rate": 1.5497837211528965e-06, "loss": 1.1157, "step": 2620 }, { "epoch": 0.84, "grad_norm": 1.842732899139006, "learning_rate": 1.5200453298071238e-06, "loss": 1.1591, "step": 2625 }, { "epoch": 0.8416, "grad_norm": 2.0241636706347745, "learning_rate": 1.4905715455794379e-06, "loss": 1.1926, "step": 2630 }, { "epoch": 0.8432, "grad_norm": 1.9850253439234942, "learning_rate": 1.461363288164983e-06, "loss": 1.133, "step": 2635 }, { "epoch": 0.8448, "grad_norm": 2.1701113199525235, "learning_rate": 1.432421468973444e-06, "loss": 1.0782, "step": 2640 }, { "epoch": 0.8464, "grad_norm": 1.744192954189927, "learning_rate": 1.4037469911006096e-06, "loss": 1.1418, "step": 2645 }, { "epoch": 0.848, "grad_norm": 1.8657163774393626, "learning_rate": 1.3753407493001968e-06, "loss": 1.1093, "step": 2650 }, { "epoch": 0.8496, "grad_norm": 1.7719983881482222, "learning_rate": 1.3472036299559255e-06, "loss": 1.0442, "step": 2655 }, { "epoch": 0.8512, "grad_norm": 1.6776555631045273, "learning_rate": 1.3193365110538647e-06, "loss": 1.1486, "step": 2660 }, { "epoch": 0.8528, "grad_norm": 1.8998463417598888, "learning_rate": 1.2917402621550369e-06, "loss": 1.0329, "step": 2665 }, { "epoch": 0.8544, "grad_norm": 2.2377097430365103, "learning_rate": 1.2644157443682737e-06, "loss": 1.1627, "step": 2670 }, { "epoch": 0.856, "grad_norm": 2.0295709661659074, "learning_rate": 1.23736381032336e-06, "loss": 1.1122, "step": 2675 }, { "epoch": 0.8576, "grad_norm": 1.6206230512932172, "learning_rate": 1.2105853041444172e-06, "loss": 1.0873, "step": 2680 }, { "epoch": 0.8592, "grad_norm": 1.8815121724770376, "learning_rate": 1.184081061423572e-06, "loss": 1.1498, "step": 2685 }, { "epoch": 0.8608, "grad_norm": 1.70840552316187, "learning_rate": 1.157851909194876e-06, "loss": 1.0604, "step": 2690 }, { "epoch": 0.8624, "grad_norm": 1.733576825498409, "learning_rate": 1.1318986659085062e-06, "loss": 1.1606, "step": 2695 }, { "epoch": 0.864, "grad_norm": 1.8215703284486364, "learning_rate": 1.10622214140522e-06, "loss": 1.1633, "step": 2700 }, { "epoch": 0.8656, "grad_norm": 2.2057043672602146, "learning_rate": 1.080823136891086e-06, "loss": 1.0168, "step": 2705 }, { "epoch": 0.8672, "grad_norm": 1.9030202419284716, "learning_rate": 1.0557024449124854e-06, "loss": 1.0994, "step": 2710 }, { "epoch": 0.8688, "grad_norm": 2.054620122467027, "learning_rate": 1.0308608493313776e-06, "loss": 1.1867, "step": 2715 }, { "epoch": 0.8704, "grad_norm": 2.085133317203706, "learning_rate": 1.0062991253008525e-06, "loss": 1.1113, "step": 2720 }, { "epoch": 0.872, "grad_norm": 1.9247540859207555, "learning_rate": 9.820180392409252e-07, "loss": 1.1283, "step": 2725 }, { "epoch": 0.8736, "grad_norm": 2.047733810088453, "learning_rate": 9.580183488146323e-07, "loss": 1.1248, "step": 2730 }, { "epoch": 0.8752, "grad_norm": 2.158110798059805, "learning_rate": 9.343008029043876e-07, "loss": 1.0999, "step": 2735 }, { "epoch": 0.8768, "grad_norm": 1.845312901522814, "learning_rate": 9.108661415886111e-07, "loss": 1.126, "step": 2740 }, { "epoch": 0.8784, "grad_norm": 1.966660062347526, "learning_rate": 8.87715096118642e-07, "loss": 1.1502, "step": 2745 }, { "epoch": 0.88, "grad_norm": 1.9638207121108124, "learning_rate": 8.64848388895917e-07, "loss": 1.1105, "step": 2750 }, { "epoch": 0.8816, "grad_norm": 1.8408954115702174, "learning_rate": 8.42266733449425e-07, "loss": 1.0827, "step": 2755 }, { "epoch": 0.8832, "grad_norm": 1.9784816125061366, "learning_rate": 8.199708344134493e-07, "loss": 1.0597, "step": 2760 }, { "epoch": 0.8848, "grad_norm": 1.7886393959332718, "learning_rate": 7.979613875055736e-07, "loss": 1.13, "step": 2765 }, { "epoch": 0.8864, "grad_norm": 1.8818256568392642, "learning_rate": 7.76239079504979e-07, "loss": 1.1338, "step": 2770 }, { "epoch": 0.888, "grad_norm": 2.03917278718352, "learning_rate": 7.548045882310084e-07, "loss": 1.0638, "step": 2775 }, { "epoch": 0.8896, "grad_norm": 1.728570632522873, "learning_rate": 7.336585825220244e-07, "loss": 1.099, "step": 2780 }, { "epoch": 0.8912, "grad_norm": 1.8973111110751473, "learning_rate": 7.128017222145267e-07, "loss": 1.122, "step": 2785 }, { "epoch": 0.8928, "grad_norm": 1.8553054400673492, "learning_rate": 6.922346581225725e-07, "loss": 1.05, "step": 2790 }, { "epoch": 0.8944, "grad_norm": 1.8575846882621156, "learning_rate": 6.719580320174657e-07, "loss": 1.073, "step": 2795 }, { "epoch": 0.896, "grad_norm": 1.9697599176581626, "learning_rate": 6.519724766077262e-07, "loss": 1.1004, "step": 2800 }, { "epoch": 0.8976, "grad_norm": 1.9664020191533902, "learning_rate": 6.322786155193594e-07, "loss": 1.1143, "step": 2805 }, { "epoch": 0.8992, "grad_norm": 1.8364722136365312, "learning_rate": 6.128770632763825e-07, "loss": 1.0633, "step": 2810 }, { "epoch": 0.9008, "grad_norm": 1.7916866191719836, "learning_rate": 5.937684252816578e-07, "loss": 1.1479, "step": 2815 }, { "epoch": 0.9024, "grad_norm": 1.8568854615924408, "learning_rate": 5.749532977979977e-07, "loss": 1.0435, "step": 2820 }, { "epoch": 0.904, "grad_norm": 1.899626316470779, "learning_rate": 5.564322679295619e-07, "loss": 1.0628, "step": 2825 }, { "epoch": 0.9056, "grad_norm": 1.8845971712370737, "learning_rate": 5.382059136035389e-07, "loss": 1.069, "step": 2830 }, { "epoch": 0.9072, "grad_norm": 2.0520274328815034, "learning_rate": 5.202748035521021e-07, "loss": 1.1332, "step": 2835 }, { "epoch": 0.9088, "grad_norm": 1.675275773929361, "learning_rate": 5.026394972946813e-07, "loss": 1.0898, "step": 2840 }, { "epoch": 0.9104, "grad_norm": 1.6244094514464815, "learning_rate": 4.85300545120484e-07, "loss": 1.1416, "step": 2845 }, { "epoch": 0.912, "grad_norm": 1.9952673823871188, "learning_rate": 4.6825848807133813e-07, "loss": 1.1914, "step": 2850 }, { "epoch": 0.9136, "grad_norm": 1.6950199027000228, "learning_rate": 4.515138579248035e-07, "loss": 1.1402, "step": 2855 }, { "epoch": 0.9152, "grad_norm": 1.7423672169557656, "learning_rate": 4.350671771775772e-07, "loss": 1.0582, "step": 2860 }, { "epoch": 0.9168, "grad_norm": 1.637145060041263, "learning_rate": 4.189189590291975e-07, "loss": 1.1407, "step": 2865 }, { "epoch": 0.9184, "grad_norm": 2.2109547890040573, "learning_rate": 4.030697073660217e-07, "loss": 1.1378, "step": 2870 }, { "epoch": 0.92, "grad_norm": 1.751115581242929, "learning_rate": 3.875199167455035e-07, "loss": 1.2019, "step": 2875 }, { "epoch": 0.9216, "grad_norm": 1.702528166658884, "learning_rate": 3.7227007238076596e-07, "loss": 1.0969, "step": 2880 }, { "epoch": 0.9232, "grad_norm": 1.9358334095804801, "learning_rate": 3.573206501254556e-07, "loss": 1.1201, "step": 2885 }, { "epoch": 0.9248, "grad_norm": 1.757773075781555, "learning_rate": 3.4267211645890306e-07, "loss": 1.128, "step": 2890 }, { "epoch": 0.9264, "grad_norm": 1.5821683316522455, "learning_rate": 3.283249284715528e-07, "loss": 1.113, "step": 2895 }, { "epoch": 0.928, "grad_norm": 1.7872342107395125, "learning_rate": 3.1427953385071207e-07, "loss": 1.1363, "step": 2900 }, { "epoch": 0.9296, "grad_norm": 1.902792184635527, "learning_rate": 3.005363708665765e-07, "loss": 1.0592, "step": 2905 }, { "epoch": 0.9312, "grad_norm": 1.643618393734696, "learning_rate": 2.870958683585545e-07, "loss": 1.0648, "step": 2910 }, { "epoch": 0.9328, "grad_norm": 1.5934457079446989, "learning_rate": 2.7395844572188915e-07, "loss": 1.0791, "step": 2915 }, { "epoch": 0.9344, "grad_norm": 1.81290378725883, "learning_rate": 2.6112451289456495e-07, "loss": 1.0802, "step": 2920 }, { "epoch": 0.936, "grad_norm": 1.7445293374799862, "learning_rate": 2.4859447034452424e-07, "loss": 1.039, "step": 2925 }, { "epoch": 0.9376, "grad_norm": 1.8709547433026, "learning_rate": 2.3636870905716424e-07, "loss": 1.0219, "step": 2930 }, { "epoch": 0.9392, "grad_norm": 1.5925084030063732, "learning_rate": 2.2444761052313857e-07, "loss": 1.0096, "step": 2935 }, { "epoch": 0.9408, "grad_norm": 1.8672177358904116, "learning_rate": 2.1283154672645522e-07, "loss": 1.1292, "step": 2940 }, { "epoch": 0.9424, "grad_norm": 1.9316161422980194, "learning_rate": 2.015208801328694e-07, "loss": 1.0738, "step": 2945 }, { "epoch": 0.944, "grad_norm": 1.990823133110935, "learning_rate": 1.905159636785714e-07, "loss": 1.0774, "step": 2950 }, { "epoch": 0.9456, "grad_norm": 1.9199541456390363, "learning_rate": 1.79817140759172e-07, "loss": 1.0964, "step": 2955 }, { "epoch": 0.9472, "grad_norm": 1.647055497128993, "learning_rate": 1.6942474521899232e-07, "loss": 1.0761, "step": 2960 }, { "epoch": 0.9488, "grad_norm": 1.7968779508466248, "learning_rate": 1.5933910134064202e-07, "loss": 1.1182, "step": 2965 }, { "epoch": 0.9504, "grad_norm": 1.910903760906081, "learning_rate": 1.4956052383490295e-07, "loss": 1.1169, "step": 2970 }, { "epoch": 0.952, "grad_norm": 1.6490510222220343, "learning_rate": 1.4008931783090707e-07, "loss": 1.059, "step": 2975 }, { "epoch": 0.9536, "grad_norm": 2.24424509198347, "learning_rate": 1.309257788666174e-07, "loss": 1.0512, "step": 2980 }, { "epoch": 0.9552, "grad_norm": 2.049955056587493, "learning_rate": 1.220701928796042e-07, "loss": 1.1129, "step": 2985 }, { "epoch": 0.9568, "grad_norm": 1.7528902034656786, "learning_rate": 1.1352283619812443e-07, "loss": 1.0786, "step": 2990 }, { "epoch": 0.9584, "grad_norm": 1.7534015177678282, "learning_rate": 1.0528397553249636e-07, "loss": 1.0915, "step": 2995 }, { "epoch": 0.96, "grad_norm": 1.870800227625946, "learning_rate": 9.73538679667807e-08, "loss": 1.1904, "step": 3000 }, { "epoch": 0.9616, "grad_norm": 1.7310669557763085, "learning_rate": 8.97327609507559e-08, "loss": 1.0884, "step": 3005 }, { "epoch": 0.9632, "grad_norm": 2.0772302938552922, "learning_rate": 8.242089229219984e-08, "loss": 1.1054, "step": 3010 }, { "epoch": 0.9648, "grad_norm": 2.0323863667111617, "learning_rate": 7.541849014946479e-08, "loss": 1.1456, "step": 3015 }, { "epoch": 0.9664, "grad_norm": 1.6123892604072234, "learning_rate": 6.872577302436179e-08, "loss": 1.1386, "step": 3020 }, { "epoch": 0.968, "grad_norm": 1.7625265804507368, "learning_rate": 6.234294975534183e-08, "loss": 1.012, "step": 3025 }, { "epoch": 0.9696, "grad_norm": 1.950177210169516, "learning_rate": 5.6270219510975445e-08, "loss": 1.1221, "step": 3030 }, { "epoch": 0.9712, "grad_norm": 1.9203504711525439, "learning_rate": 5.050777178374544e-08, "loss": 1.0957, "step": 3035 }, { "epoch": 0.9728, "grad_norm": 2.051292258021081, "learning_rate": 4.505578638412722e-08, "loss": 1.0989, "step": 3040 }, { "epoch": 0.9744, "grad_norm": 1.9736219426741475, "learning_rate": 3.9914433434982135e-08, "loss": 1.0677, "step": 3045 }, { "epoch": 0.976, "grad_norm": 1.751558982888776, "learning_rate": 3.508387336624619e-08, "loss": 1.0821, "step": 3050 }, { "epoch": 0.9776, "grad_norm": 2.237832352970378, "learning_rate": 3.056425690992404e-08, "loss": 1.1549, "step": 3055 }, { "epoch": 0.9792, "grad_norm": 1.9643541211885889, "learning_rate": 2.6355725095389416e-08, "loss": 1.1824, "step": 3060 }, { "epoch": 0.9808, "grad_norm": 1.8354358443935066, "learning_rate": 2.2458409244979772e-08, "loss": 1.1193, "step": 3065 }, { "epoch": 0.9824, "grad_norm": 1.7302872781788123, "learning_rate": 1.8872430969901766e-08, "loss": 1.1368, "step": 3070 }, { "epoch": 0.984, "grad_norm": 1.8096293993453403, "learning_rate": 1.559790216643542e-08, "loss": 1.0408, "step": 3075 }, { "epoch": 0.9856, "grad_norm": 1.7391377484296338, "learning_rate": 1.2634925012440235e-08, "loss": 1.117, "step": 3080 }, { "epoch": 0.9872, "grad_norm": 1.859241011487041, "learning_rate": 9.983591964171091e-09, "loss": 1.1645, "step": 3085 }, { "epoch": 0.9888, "grad_norm": 1.8444594250518096, "learning_rate": 7.643985753390537e-09, "loss": 1.0267, "step": 3090 }, { "epoch": 0.9904, "grad_norm": 1.7799504207338483, "learning_rate": 5.616179384788645e-09, "loss": 1.1312, "step": 3095 }, { "epoch": 0.992, "grad_norm": 2.080638343582993, "learning_rate": 3.900236133703717e-09, "loss": 1.0585, "step": 3100 }, { "epoch": 0.9936, "grad_norm": 2.008020425036196, "learning_rate": 2.496209544147199e-09, "loss": 1.1387, "step": 3105 }, { "epoch": 0.9952, "grad_norm": 1.9952427004758182, "learning_rate": 1.4041434271350184e-09, "loss": 1.1208, "step": 3110 }, { "epoch": 0.9968, "grad_norm": 1.9744394661998583, "learning_rate": 6.240718593208961e-10, "loss": 1.0075, "step": 3115 }, { "epoch": 0.9984, "grad_norm": 2.0577935920007486, "learning_rate": 1.5601918192942322e-10, "loss": 1.0806, "step": 3120 }, { "epoch": 1.0, "grad_norm": 1.889866298129456, "learning_rate": 0.0, "loss": 1.1152, "step": 3125 }, { "epoch": 1.0, "eval_loss": 1.1343333721160889, "eval_runtime": 46.5533, "eval_samples_per_second": 10.74, "eval_steps_per_second": 0.687, "step": 3125 }, { "epoch": 1.0, "step": 3125, "total_flos": 41677458800640.0, "train_loss": 1.2090233922958373, "train_runtime": 11586.5134, "train_samples_per_second": 4.315, "train_steps_per_second": 0.27 } ], "logging_steps": 5, "max_steps": 3125, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 41677458800640.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }