{ "best_metric": null, "best_model_checkpoint": null, "epoch": 49.65335094090459, "eval_steps": 200, "global_step": 18800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002641135688345989, "grad_norm": 20401.947265625, "learning_rate": 5e-06, "loss": 39.1249, "step": 1 }, { "epoch": 0.002641135688345989, "eval_loss": 10.096823692321777, "eval_runtime": 2.0856, "eval_samples_per_second": 237.343, "eval_steps_per_second": 29.728, "step": 1 }, { "epoch": 0.005282271376691978, "grad_norm": 11222.62890625, "learning_rate": 1e-05, "loss": 36.867, "step": 2 }, { "epoch": 0.007923407065037967, "grad_norm": 26398.576171875, "learning_rate": 1.5e-05, "loss": 39.1706, "step": 3 }, { "epoch": 0.010564542753383956, "grad_norm": 14163.0771484375, "learning_rate": 2e-05, "loss": 38.6691, "step": 4 }, { "epoch": 0.013205678441729944, "grad_norm": 15992.083984375, "learning_rate": 2.5e-05, "loss": 37.2986, "step": 5 }, { "epoch": 0.015846814130075933, "grad_norm": 20127.369140625, "learning_rate": 3e-05, "loss": 38.7236, "step": 6 }, { "epoch": 0.01848794981842192, "grad_norm": 55544.01171875, "learning_rate": 3.5000000000000004e-05, "loss": 38.1819, "step": 7 }, { "epoch": 0.02112908550676791, "grad_norm": 17545.71484375, "learning_rate": 4e-05, "loss": 37.7058, "step": 8 }, { "epoch": 0.0237702211951139, "grad_norm": 37625.41796875, "learning_rate": 4.4999999999999996e-05, "loss": 37.5564, "step": 9 }, { "epoch": 0.02641135688345989, "grad_norm": 17234.775390625, "learning_rate": 5e-05, "loss": 38.7241, "step": 10 }, { "epoch": 0.029052492571805876, "grad_norm": 15973.6484375, "learning_rate": 5.5e-05, "loss": 37.3825, "step": 11 }, { "epoch": 0.03169362826015187, "grad_norm": 15441.8349609375, "learning_rate": 6e-05, "loss": 36.5148, "step": 12 }, { "epoch": 0.034334763948497854, "grad_norm": 7875.17822265625, "learning_rate": 6.500000000000001e-05, "loss": 38.1769, "step": 13 }, { "epoch": 0.03697589963684384, "grad_norm": 13307.8583984375, "learning_rate": 7.000000000000001e-05, "loss": 37.58, "step": 14 }, { "epoch": 0.03961703532518983, "grad_norm": 10824.8330078125, "learning_rate": 7.5e-05, "loss": 36.3215, "step": 15 }, { "epoch": 0.04225817101353582, "grad_norm": 26653.587890625, "learning_rate": 8e-05, "loss": 36.3776, "step": 16 }, { "epoch": 0.04489930670188181, "grad_norm": 5778.88427734375, "learning_rate": 8.5e-05, "loss": 36.1487, "step": 17 }, { "epoch": 0.0475404423902278, "grad_norm": 25909.529296875, "learning_rate": 8.999999999999999e-05, "loss": 35.109, "step": 18 }, { "epoch": 0.050181578078573784, "grad_norm": 18328.03125, "learning_rate": 9.5e-05, "loss": 36.8002, "step": 19 }, { "epoch": 0.05282271376691978, "grad_norm": 11749.251953125, "learning_rate": 0.0001, "loss": 34.5267, "step": 20 }, { "epoch": 0.055463849455265765, "grad_norm": 3418.73388671875, "learning_rate": 0.000105, "loss": 34.0945, "step": 21 }, { "epoch": 0.05810498514361175, "grad_norm": 14018.82421875, "learning_rate": 0.00011, "loss": 37.2818, "step": 22 }, { "epoch": 0.06074612083195774, "grad_norm": 121021.71875, "learning_rate": 0.000115, "loss": 147.535, "step": 23 }, { "epoch": 0.06338725652030373, "grad_norm": 288293.75, "learning_rate": 0.00012, "loss": 300.9805, "step": 24 }, { "epoch": 0.06602839220864971, "grad_norm": 606745.75, "learning_rate": 0.000125, "loss": 356.0137, "step": 25 }, { "epoch": 0.06866952789699571, "grad_norm": 423939.90625, "learning_rate": 0.00013000000000000002, "loss": 269.3881, "step": 26 }, { "epoch": 0.0713106635853417, "grad_norm": 208153.0, "learning_rate": 0.000135, "loss": 299.9242, "step": 27 }, { "epoch": 0.07395179927368768, "grad_norm": 241398.359375, "learning_rate": 0.00014000000000000001, "loss": 306.4643, "step": 28 }, { "epoch": 0.07659293496203368, "grad_norm": 186255.296875, "learning_rate": 0.000145, "loss": 404.5039, "step": 29 }, { "epoch": 0.07923407065037966, "grad_norm": 290052.40625, "learning_rate": 0.00015, "loss": 337.7012, "step": 30 }, { "epoch": 0.08187520633872565, "grad_norm": 287612.9375, "learning_rate": 0.000155, "loss": 262.9381, "step": 31 }, { "epoch": 0.08451634202707164, "grad_norm": 184418.84375, "learning_rate": 0.00016, "loss": 267.4774, "step": 32 }, { "epoch": 0.08715747771541763, "grad_norm": 105977.4375, "learning_rate": 0.000165, "loss": 73.3582, "step": 33 }, { "epoch": 0.08979861340376362, "grad_norm": 7956.8671875, "learning_rate": 0.00017, "loss": 34.3819, "step": 34 }, { "epoch": 0.09243974909210961, "grad_norm": 5408.1884765625, "learning_rate": 0.000175, "loss": 34.0772, "step": 35 }, { "epoch": 0.0950808847804556, "grad_norm": 6824.720703125, "learning_rate": 0.00017999999999999998, "loss": 33.4205, "step": 36 }, { "epoch": 0.09772202046880159, "grad_norm": 7688.1181640625, "learning_rate": 0.000185, "loss": 35.0794, "step": 37 }, { "epoch": 0.10036315615714757, "grad_norm": 6591.5224609375, "learning_rate": 0.00019, "loss": 34.1295, "step": 38 }, { "epoch": 0.10300429184549356, "grad_norm": 7902.45361328125, "learning_rate": 0.00019500000000000002, "loss": 34.9128, "step": 39 }, { "epoch": 0.10564542753383956, "grad_norm": 6205.4072265625, "learning_rate": 0.0002, "loss": 33.2611, "step": 40 }, { "epoch": 0.10828656322218554, "grad_norm": 8808.6064453125, "learning_rate": 0.000205, "loss": 33.561, "step": 41 }, { "epoch": 0.11092769891053153, "grad_norm": 6377.85498046875, "learning_rate": 0.00021, "loss": 34.3651, "step": 42 }, { "epoch": 0.11356883459887751, "grad_norm": 19159.29296875, "learning_rate": 0.000215, "loss": 33.2094, "step": 43 }, { "epoch": 0.1162099702872235, "grad_norm": 6177.31005859375, "learning_rate": 0.00022, "loss": 34.1021, "step": 44 }, { "epoch": 0.1188511059755695, "grad_norm": 7164.431640625, "learning_rate": 0.00022500000000000002, "loss": 33.6585, "step": 45 }, { "epoch": 0.12149224166391548, "grad_norm": 6496.5478515625, "learning_rate": 0.00023, "loss": 33.442, "step": 46 }, { "epoch": 0.12413337735226147, "grad_norm": 7890.564453125, "learning_rate": 0.000235, "loss": 34.2228, "step": 47 }, { "epoch": 0.12677451304060747, "grad_norm": 6550.92333984375, "learning_rate": 0.00024, "loss": 34.898, "step": 48 }, { "epoch": 0.12941564872895345, "grad_norm": 12506.0625, "learning_rate": 0.000245, "loss": 36.6297, "step": 49 }, { "epoch": 0.13205678441729943, "grad_norm": 25460.58203125, "learning_rate": 0.00025, "loss": 40.1671, "step": 50 }, { "epoch": 0.13469792010564544, "grad_norm": 5015.23486328125, "learning_rate": 0.000255, "loss": 36.6511, "step": 51 }, { "epoch": 0.13733905579399142, "grad_norm": 7093.125, "learning_rate": 0.00026000000000000003, "loss": 36.4618, "step": 52 }, { "epoch": 0.1399801914823374, "grad_norm": 4966.2080078125, "learning_rate": 0.00026500000000000004, "loss": 35.2807, "step": 53 }, { "epoch": 0.1426213271706834, "grad_norm": 8010.041015625, "learning_rate": 0.00027, "loss": 36.761, "step": 54 }, { "epoch": 0.14526246285902938, "grad_norm": 4930.85791015625, "learning_rate": 0.000275, "loss": 37.0344, "step": 55 }, { "epoch": 0.14790359854737536, "grad_norm": 8304.25, "learning_rate": 0.00028000000000000003, "loss": 36.5246, "step": 56 }, { "epoch": 0.15054473423572137, "grad_norm": 7921.43212890625, "learning_rate": 0.000285, "loss": 36.0162, "step": 57 }, { "epoch": 0.15318586992406735, "grad_norm": 5330.0830078125, "learning_rate": 0.00029, "loss": 36.03, "step": 58 }, { "epoch": 0.15582700561241333, "grad_norm": 7872.869140625, "learning_rate": 0.000295, "loss": 36.7273, "step": 59 }, { "epoch": 0.1584681413007593, "grad_norm": 5387.7744140625, "learning_rate": 0.0003, "loss": 33.5007, "step": 60 }, { "epoch": 0.16110927698910532, "grad_norm": 5675.3984375, "learning_rate": 0.000305, "loss": 35.7257, "step": 61 }, { "epoch": 0.1637504126774513, "grad_norm": 5046.18896484375, "learning_rate": 0.00031, "loss": 34.6394, "step": 62 }, { "epoch": 0.16639154836579728, "grad_norm": 5588.13623046875, "learning_rate": 0.000315, "loss": 33.9325, "step": 63 }, { "epoch": 0.1690326840541433, "grad_norm": 4954.25830078125, "learning_rate": 0.00032, "loss": 33.8484, "step": 64 }, { "epoch": 0.17167381974248927, "grad_norm": 5773.2421875, "learning_rate": 0.00032500000000000004, "loss": 33.1426, "step": 65 }, { "epoch": 0.17431495543083525, "grad_norm": 13806.1943359375, "learning_rate": 0.00033, "loss": 32.6295, "step": 66 }, { "epoch": 0.17695609111918126, "grad_norm": 4271.505859375, "learning_rate": 0.000335, "loss": 32.32, "step": 67 }, { "epoch": 0.17959722680752724, "grad_norm": 2729.16357421875, "learning_rate": 0.00034, "loss": 31.9276, "step": 68 }, { "epoch": 0.18223836249587322, "grad_norm": 3449.347412109375, "learning_rate": 0.000345, "loss": 32.2519, "step": 69 }, { "epoch": 0.18487949818421923, "grad_norm": 5923.7685546875, "learning_rate": 0.00035, "loss": 32.6102, "step": 70 }, { "epoch": 0.1875206338725652, "grad_norm": 4537.07275390625, "learning_rate": 0.000355, "loss": 32.3208, "step": 71 }, { "epoch": 0.1901617695609112, "grad_norm": 12403.1279296875, "learning_rate": 0.00035999999999999997, "loss": 39.5107, "step": 72 }, { "epoch": 0.19280290524925717, "grad_norm": 188205.34375, "learning_rate": 0.000365, "loss": 330.6115, "step": 73 }, { "epoch": 0.19544404093760318, "grad_norm": 206972.125, "learning_rate": 0.00037, "loss": 330.4473, "step": 74 }, { "epoch": 0.19808517662594916, "grad_norm": 212709.90625, "learning_rate": 0.000375, "loss": 257.2068, "step": 75 }, { "epoch": 0.20072631231429514, "grad_norm": 116455.09375, "learning_rate": 0.00038, "loss": 272.9365, "step": 76 }, { "epoch": 0.20336744800264114, "grad_norm": 212611.484375, "learning_rate": 0.00038500000000000003, "loss": 229.3413, "step": 77 }, { "epoch": 0.20600858369098712, "grad_norm": 246496.375, "learning_rate": 0.00039000000000000005, "loss": 236.1894, "step": 78 }, { "epoch": 0.2086497193793331, "grad_norm": 198330.125, "learning_rate": 0.000395, "loss": 198.5551, "step": 79 }, { "epoch": 0.2112908550676791, "grad_norm": 366560.71875, "learning_rate": 0.0004, "loss": 158.8219, "step": 80 }, { "epoch": 0.2139319907560251, "grad_norm": 273746.5, "learning_rate": 0.00040500000000000003, "loss": 102.4051, "step": 81 }, { "epoch": 0.21657312644437107, "grad_norm": 264478.4375, "learning_rate": 0.00041, "loss": 78.7057, "step": 82 }, { "epoch": 0.21921426213271708, "grad_norm": 65915.703125, "learning_rate": 0.000415, "loss": 152.2347, "step": 83 }, { "epoch": 0.22185539782106306, "grad_norm": 62420.66796875, "learning_rate": 0.00042, "loss": 120.5832, "step": 84 }, { "epoch": 0.22449653350940904, "grad_norm": 44273.0859375, "learning_rate": 0.000425, "loss": 70.6126, "step": 85 }, { "epoch": 0.22713766919775502, "grad_norm": 23947.40625, "learning_rate": 0.00043, "loss": 49.9754, "step": 86 }, { "epoch": 0.22977880488610103, "grad_norm": 10803.9013671875, "learning_rate": 0.000435, "loss": 44.2373, "step": 87 }, { "epoch": 0.232419940574447, "grad_norm": 10775.537109375, "learning_rate": 0.00044, "loss": 40.4109, "step": 88 }, { "epoch": 0.235061076262793, "grad_norm": 10595.3876953125, "learning_rate": 0.00044500000000000003, "loss": 37.5687, "step": 89 }, { "epoch": 0.237702211951139, "grad_norm": 6692.46826171875, "learning_rate": 0.00045000000000000004, "loss": 36.3338, "step": 90 }, { "epoch": 0.24034334763948498, "grad_norm": 8030.1494140625, "learning_rate": 0.000455, "loss": 37.783, "step": 91 }, { "epoch": 0.24298448332783096, "grad_norm": 8749.814453125, "learning_rate": 0.00046, "loss": 38.0031, "step": 92 }, { "epoch": 0.24562561901617697, "grad_norm": 8516.0859375, "learning_rate": 0.000465, "loss": 37.4491, "step": 93 }, { "epoch": 0.24826675470452295, "grad_norm": 10548.6953125, "learning_rate": 0.00047, "loss": 35.8004, "step": 94 }, { "epoch": 0.2509078903928689, "grad_norm": 7086.8037109375, "learning_rate": 0.000475, "loss": 35.2328, "step": 95 }, { "epoch": 0.25354902608121493, "grad_norm": 7524.8134765625, "learning_rate": 0.00048, "loss": 32.7064, "step": 96 }, { "epoch": 0.2561901617695609, "grad_norm": 5815.705078125, "learning_rate": 0.00048499999999999997, "loss": 33.7066, "step": 97 }, { "epoch": 0.2588312974579069, "grad_norm": 14853.759765625, "learning_rate": 0.00049, "loss": 35.0899, "step": 98 }, { "epoch": 0.2614724331462529, "grad_norm": 8826.2041015625, "learning_rate": 0.000495, "loss": 35.74, "step": 99 }, { "epoch": 0.26411356883459886, "grad_norm": 7126.2978515625, "learning_rate": 0.0005, "loss": 40.4878, "step": 100 }, { "epoch": 0.26675470452294486, "grad_norm": 9713.095703125, "learning_rate": 0.0004999999965094484, "loss": 34.1725, "step": 101 }, { "epoch": 0.26939584021129087, "grad_norm": 3374.9189453125, "learning_rate": 0.0004999999860377938, "loss": 33.6172, "step": 102 }, { "epoch": 0.2720369758996368, "grad_norm": 3657.26904296875, "learning_rate": 0.0004999999685850365, "loss": 33.0719, "step": 103 }, { "epoch": 0.27467811158798283, "grad_norm": 4509.818359375, "learning_rate": 0.0004999999441511768, "loss": 33.8813, "step": 104 }, { "epoch": 0.27731924727632884, "grad_norm": 3591.63525390625, "learning_rate": 0.0004999999127362156, "loss": 34.2678, "step": 105 }, { "epoch": 0.2799603829646748, "grad_norm": 3155.579833984375, "learning_rate": 0.0004999998743401537, "loss": 32.6803, "step": 106 }, { "epoch": 0.2826015186530208, "grad_norm": 3885.622314453125, "learning_rate": 0.0004999998289629921, "loss": 32.3832, "step": 107 }, { "epoch": 0.2852426543413668, "grad_norm": 5366.67919921875, "learning_rate": 0.0004999997766047322, "loss": 34.8086, "step": 108 }, { "epoch": 0.28788379002971276, "grad_norm": 5436.2255859375, "learning_rate": 0.0004999997172653755, "loss": 34.6929, "step": 109 }, { "epoch": 0.29052492571805877, "grad_norm": 5621.5419921875, "learning_rate": 0.0004999996509449233, "loss": 35.4193, "step": 110 }, { "epoch": 0.2931660614064048, "grad_norm": 8839.0380859375, "learning_rate": 0.0004999995776433779, "loss": 37.2234, "step": 111 }, { "epoch": 0.29580719709475073, "grad_norm": 9621.759765625, "learning_rate": 0.000499999497360741, "loss": 40.976, "step": 112 }, { "epoch": 0.29844833278309674, "grad_norm": 9217.13671875, "learning_rate": 0.0004999994100970152, "loss": 39.9566, "step": 113 }, { "epoch": 0.30108946847144274, "grad_norm": 8095.18017578125, "learning_rate": 0.0004999993158522026, "loss": 38.1321, "step": 114 }, { "epoch": 0.3037306041597887, "grad_norm": 6514.04931640625, "learning_rate": 0.000499999214626306, "loss": 38.1738, "step": 115 }, { "epoch": 0.3063717398481347, "grad_norm": 5448.1474609375, "learning_rate": 0.0004999991064193281, "loss": 39.4817, "step": 116 }, { "epoch": 0.3090128755364807, "grad_norm": 5978.3369140625, "learning_rate": 0.0004999989912312721, "loss": 39.0551, "step": 117 }, { "epoch": 0.31165401122482667, "grad_norm": 6135.671875, "learning_rate": 0.000499998869062141, "loss": 38.8773, "step": 118 }, { "epoch": 0.3142951469131727, "grad_norm": 5256.56298828125, "learning_rate": 0.0004999987399119384, "loss": 37.7619, "step": 119 }, { "epoch": 0.3169362826015186, "grad_norm": 5508.1787109375, "learning_rate": 0.000499998603780668, "loss": 39.2471, "step": 120 }, { "epoch": 0.31957741828986463, "grad_norm": 5058.90625, "learning_rate": 0.0004999984606683333, "loss": 39.1154, "step": 121 }, { "epoch": 0.32221855397821064, "grad_norm": 11336.3583984375, "learning_rate": 0.0004999983105749385, "loss": 41.8295, "step": 122 }, { "epoch": 0.3248596896665566, "grad_norm": 133908.875, "learning_rate": 0.0004999981535004877, "loss": 193.6222, "step": 123 }, { "epoch": 0.3275008253549026, "grad_norm": 165386.0, "learning_rate": 0.0004999979894449853, "loss": 335.3867, "step": 124 }, { "epoch": 0.3301419610432486, "grad_norm": 276884.4375, "learning_rate": 0.0004999978184084359, "loss": 288.0732, "step": 125 }, { "epoch": 0.33278309673159456, "grad_norm": 248015.5625, "learning_rate": 0.0004999976403908444, "loss": 235.9549, "step": 126 }, { "epoch": 0.33542423241994057, "grad_norm": 300828.25, "learning_rate": 0.0004999974553922155, "loss": 231.0662, "step": 127 }, { "epoch": 0.3380653681082866, "grad_norm": 506374.21875, "learning_rate": 0.0004999972634125545, "loss": 174.7595, "step": 128 }, { "epoch": 0.34070650379663253, "grad_norm": 79721.9765625, "learning_rate": 0.000499997064451867, "loss": 134.991, "step": 129 }, { "epoch": 0.34334763948497854, "grad_norm": 251664.421875, "learning_rate": 0.0004999968585101581, "loss": 112.0794, "step": 130 }, { "epoch": 0.34598877517332455, "grad_norm": 58107.7109375, "learning_rate": 0.0004999966455874338, "loss": 87.7634, "step": 131 }, { "epoch": 0.3486299108616705, "grad_norm": 90693.90625, "learning_rate": 0.0004999964256837002, "loss": 305.8202, "step": 132 }, { "epoch": 0.3512710465500165, "grad_norm": 94757.8046875, "learning_rate": 0.000499996198798963, "loss": 586.045, "step": 133 }, { "epoch": 0.3539121822383625, "grad_norm": 51174.39453125, "learning_rate": 0.0004999959649332291, "loss": 256.3214, "step": 134 }, { "epoch": 0.35655331792670847, "grad_norm": 20202.298828125, "learning_rate": 0.0004999957240865046, "loss": 88.475, "step": 135 }, { "epoch": 0.3591944536150545, "grad_norm": 15118.0810546875, "learning_rate": 0.0004999954762587963, "loss": 63.626, "step": 136 }, { "epoch": 0.3618355893034005, "grad_norm": 22502.21875, "learning_rate": 0.0004999952214501112, "loss": 82.6103, "step": 137 }, { "epoch": 0.36447672499174644, "grad_norm": 34358.15234375, "learning_rate": 0.0004999949596604565, "loss": 98.621, "step": 138 }, { "epoch": 0.36711786068009244, "grad_norm": 20239.955078125, "learning_rate": 0.0004999946908898393, "loss": 72.8164, "step": 139 }, { "epoch": 0.36975899636843845, "grad_norm": 16102.712890625, "learning_rate": 0.0004999944151382673, "loss": 56.7182, "step": 140 }, { "epoch": 0.3724001320567844, "grad_norm": 7010.42236328125, "learning_rate": 0.0004999941324057481, "loss": 50.195, "step": 141 }, { "epoch": 0.3750412677451304, "grad_norm": 21389.037109375, "learning_rate": 0.0004999938426922895, "loss": 55.4606, "step": 142 }, { "epoch": 0.3776824034334764, "grad_norm": 22398.826171875, "learning_rate": 0.0004999935459978998, "loss": 73.4327, "step": 143 }, { "epoch": 0.3803235391218224, "grad_norm": 16782.296875, "learning_rate": 0.0004999932423225871, "loss": 66.3248, "step": 144 }, { "epoch": 0.3829646748101684, "grad_norm": 13487.7998046875, "learning_rate": 0.00049999293166636, "loss": 51.9716, "step": 145 }, { "epoch": 0.38560581049851433, "grad_norm": 5084.9228515625, "learning_rate": 0.0004999926140292272, "loss": 44.4684, "step": 146 }, { "epoch": 0.38824694618686034, "grad_norm": 6226.92724609375, "learning_rate": 0.0004999922894111975, "loss": 46.5129, "step": 147 }, { "epoch": 0.39088808187520635, "grad_norm": 7436.89990234375, "learning_rate": 0.0004999919578122799, "loss": 48.0447, "step": 148 }, { "epoch": 0.3935292175635523, "grad_norm": 17381.50390625, "learning_rate": 0.0004999916192324837, "loss": 49.0736, "step": 149 }, { "epoch": 0.3961703532518983, "grad_norm": 11431.2275390625, "learning_rate": 0.0004999912736718185, "loss": 51.7422, "step": 150 }, { "epoch": 0.3988114889402443, "grad_norm": 3297.097900390625, "learning_rate": 0.0004999909211302937, "loss": 47.5256, "step": 151 }, { "epoch": 0.40145262462859027, "grad_norm": 4007.915771484375, "learning_rate": 0.0004999905616079194, "loss": 45.6299, "step": 152 }, { "epoch": 0.4040937603169363, "grad_norm": 4548.51953125, "learning_rate": 0.0004999901951047055, "loss": 46.4982, "step": 153 }, { "epoch": 0.4067348960052823, "grad_norm": 4367.970703125, "learning_rate": 0.0004999898216206622, "loss": 46.6904, "step": 154 }, { "epoch": 0.40937603169362824, "grad_norm": 5156.05126953125, "learning_rate": 0.0004999894411558, "loss": 47.2823, "step": 155 }, { "epoch": 0.41201716738197425, "grad_norm": 3587.627685546875, "learning_rate": 0.0004999890537101296, "loss": 47.3553, "step": 156 }, { "epoch": 0.41465830307032026, "grad_norm": 3437.5068359375, "learning_rate": 0.0004999886592836616, "loss": 47.0665, "step": 157 }, { "epoch": 0.4172994387586662, "grad_norm": 3258.25634765625, "learning_rate": 0.0004999882578764072, "loss": 46.5962, "step": 158 }, { "epoch": 0.4199405744470122, "grad_norm": 7969.96728515625, "learning_rate": 0.0004999878494883775, "loss": 48.108, "step": 159 }, { "epoch": 0.4225817101353582, "grad_norm": 4251.72509765625, "learning_rate": 0.000499987434119584, "loss": 47.8146, "step": 160 }, { "epoch": 0.4252228458237042, "grad_norm": 5230.79736328125, "learning_rate": 0.0004999870117700382, "loss": 47.1518, "step": 161 }, { "epoch": 0.4278639815120502, "grad_norm": 13383.7490234375, "learning_rate": 0.000499986582439752, "loss": 51.0644, "step": 162 }, { "epoch": 0.4305051172003962, "grad_norm": 8808.83203125, "learning_rate": 0.0004999861461287374, "loss": 49.7648, "step": 163 }, { "epoch": 0.43314625288874214, "grad_norm": 5793.46337890625, "learning_rate": 0.0004999857028370065, "loss": 49.797, "step": 164 }, { "epoch": 0.43578738857708815, "grad_norm": 8460.2255859375, "learning_rate": 0.0004999852525645716, "loss": 59.8583, "step": 165 }, { "epoch": 0.43842852426543416, "grad_norm": 14854.1015625, "learning_rate": 0.0004999847953114454, "loss": 70.4776, "step": 166 }, { "epoch": 0.4410696599537801, "grad_norm": 12337.0546875, "learning_rate": 0.0004999843310776406, "loss": 86.0081, "step": 167 }, { "epoch": 0.4437107956421261, "grad_norm": 14159.4453125, "learning_rate": 0.0004999838598631703, "loss": 93.3891, "step": 168 }, { "epoch": 0.44635193133047213, "grad_norm": 10333.9296875, "learning_rate": 0.0004999833816680476, "loss": 87.7452, "step": 169 }, { "epoch": 0.4489930670188181, "grad_norm": 5021.7578125, "learning_rate": 0.0004999828964922856, "loss": 84.7686, "step": 170 }, { "epoch": 0.4516342027071641, "grad_norm": 7778.98388671875, "learning_rate": 0.0004999824043358983, "loss": 84.3262, "step": 171 }, { "epoch": 0.45427533839551004, "grad_norm": 3972.649658203125, "learning_rate": 0.0004999819051988991, "loss": 84.2497, "step": 172 }, { "epoch": 0.45691647408385605, "grad_norm": 36349.73828125, "learning_rate": 0.000499981399081302, "loss": 118.0644, "step": 173 }, { "epoch": 0.45955760977220206, "grad_norm": 150201.875, "learning_rate": 0.0004999808859831213, "loss": 213.5293, "step": 174 }, { "epoch": 0.462198745460548, "grad_norm": 96097.5546875, "learning_rate": 0.0004999803659043712, "loss": 179.621, "step": 175 }, { "epoch": 0.464839881148894, "grad_norm": 175282.28125, "learning_rate": 0.0004999798388450661, "loss": 169.4311, "step": 176 }, { "epoch": 0.46748101683724, "grad_norm": 22516.44140625, "learning_rate": 0.0004999793048052209, "loss": 153.8926, "step": 177 }, { "epoch": 0.470122152525586, "grad_norm": 20279.24609375, "learning_rate": 0.0004999787637848505, "loss": 134.3672, "step": 178 }, { "epoch": 0.472763288213932, "grad_norm": 5812.947265625, "learning_rate": 0.0004999782157839699, "loss": 115.291, "step": 179 }, { "epoch": 0.475404423902278, "grad_norm": 7150.7841796875, "learning_rate": 0.0004999776608025946, "loss": 90.9138, "step": 180 }, { "epoch": 0.47804555959062395, "grad_norm": 13029.873046875, "learning_rate": 0.0004999770988407398, "loss": 95.3843, "step": 181 }, { "epoch": 0.48068669527896996, "grad_norm": 13345.4248046875, "learning_rate": 0.0004999765298984214, "loss": 174.4616, "step": 182 }, { "epoch": 0.48332783096731596, "grad_norm": 9002.2744140625, "learning_rate": 0.0004999759539756554, "loss": 349.1008, "step": 183 }, { "epoch": 0.4859689666556619, "grad_norm": 9823.4130859375, "learning_rate": 0.0004999753710724575, "loss": 170.5262, "step": 184 }, { "epoch": 0.4886101023440079, "grad_norm": 5980.7314453125, "learning_rate": 0.0004999747811888443, "loss": 143.1429, "step": 185 }, { "epoch": 0.49125123803235393, "grad_norm": 8309.677734375, "learning_rate": 0.0004999741843248322, "loss": 162.2347, "step": 186 }, { "epoch": 0.4938923737206999, "grad_norm": 7666.529296875, "learning_rate": 0.0004999735804804378, "loss": 114.9692, "step": 187 }, { "epoch": 0.4965335094090459, "grad_norm": 4305.162109375, "learning_rate": 0.000499972969655678, "loss": 91.7958, "step": 188 }, { "epoch": 0.4991746450973919, "grad_norm": 5754.7705078125, "learning_rate": 0.00049997235185057, "loss": 112.1112, "step": 189 }, { "epoch": 0.5018157807857379, "grad_norm": 6118.48486328125, "learning_rate": 0.0004999717270651309, "loss": 83.6814, "step": 190 }, { "epoch": 0.5044569164740839, "grad_norm": 4862.92529296875, "learning_rate": 0.000499971095299378, "loss": 63.0685, "step": 191 }, { "epoch": 0.5070980521624299, "grad_norm": 7934.3408203125, "learning_rate": 0.0004999704565533292, "loss": 77.7848, "step": 192 }, { "epoch": 0.5097391878507759, "grad_norm": 6936.11865234375, "learning_rate": 0.0004999698108270022, "loss": 81.8821, "step": 193 }, { "epoch": 0.5123803235391218, "grad_norm": 4528.42822265625, "learning_rate": 0.0004999691581204152, "loss": 70.8716, "step": 194 }, { "epoch": 0.5150214592274678, "grad_norm": 3894.0302734375, "learning_rate": 0.0004999684984335861, "loss": 65.5104, "step": 195 }, { "epoch": 0.5176625949158138, "grad_norm": 4164.306640625, "learning_rate": 0.0004999678317665337, "loss": 67.1822, "step": 196 }, { "epoch": 0.5203037306041598, "grad_norm": 2536.161865234375, "learning_rate": 0.0004999671581192764, "loss": 69.5719, "step": 197 }, { "epoch": 0.5229448662925058, "grad_norm": 3463.102783203125, "learning_rate": 0.000499966477491833, "loss": 61.839, "step": 198 }, { "epoch": 0.5255860019808518, "grad_norm": 3733.467529296875, "learning_rate": 0.0004999657898842225, "loss": 84.8052, "step": 199 }, { "epoch": 0.5282271376691977, "grad_norm": 5331.29443359375, "learning_rate": 0.0004999650952964643, "loss": 71.4111, "step": 200 }, { "epoch": 0.5282271376691977, "eval_loss": 9.3247709274292, "eval_runtime": 2.1161, "eval_samples_per_second": 233.916, "eval_steps_per_second": 29.299, "step": 200 }, { "epoch": 0.5308682733575437, "grad_norm": 2796.521484375, "learning_rate": 0.0004999643937285775, "loss": 58.7677, "step": 201 }, { "epoch": 0.5335094090458897, "grad_norm": 3957.806640625, "learning_rate": 0.000499963685180582, "loss": 61.6235, "step": 202 }, { "epoch": 0.5361505447342357, "grad_norm": 2709.49169921875, "learning_rate": 0.0004999629696524974, "loss": 60.4042, "step": 203 }, { "epoch": 0.5387916804225817, "grad_norm": 2554.22802734375, "learning_rate": 0.0004999622471443436, "loss": 63.2459, "step": 204 }, { "epoch": 0.5414328161109278, "grad_norm": 4521.96484375, "learning_rate": 0.0004999615176561409, "loss": 71.1443, "step": 205 }, { "epoch": 0.5440739517992736, "grad_norm": 2441.78857421875, "learning_rate": 0.0004999607811879096, "loss": 74.177, "step": 206 }, { "epoch": 0.5467150874876197, "grad_norm": 2357.4833984375, "learning_rate": 0.0004999600377396705, "loss": 61.3788, "step": 207 }, { "epoch": 0.5493562231759657, "grad_norm": 2660.37109375, "learning_rate": 0.0004999592873114441, "loss": 65.9906, "step": 208 }, { "epoch": 0.5519973588643117, "grad_norm": 2375.468017578125, "learning_rate": 0.0004999585299032514, "loss": 62.7134, "step": 209 }, { "epoch": 0.5546384945526577, "grad_norm": 2204.840087890625, "learning_rate": 0.0004999577655151136, "loss": 59.1076, "step": 210 }, { "epoch": 0.5572796302410036, "grad_norm": 1768.3609619140625, "learning_rate": 0.0004999569941470521, "loss": 57.0235, "step": 211 }, { "epoch": 0.5599207659293496, "grad_norm": 1545.7469482421875, "learning_rate": 0.0004999562157990883, "loss": 54.6932, "step": 212 }, { "epoch": 0.5625619016176956, "grad_norm": 1323.4130859375, "learning_rate": 0.000499955430471244, "loss": 49.757, "step": 213 }, { "epoch": 0.5652030373060416, "grad_norm": 1819.5794677734375, "learning_rate": 0.0004999546381635412, "loss": 48.9746, "step": 214 }, { "epoch": 0.5678441729943876, "grad_norm": 1648.066650390625, "learning_rate": 0.0004999538388760019, "loss": 49.6996, "step": 215 }, { "epoch": 0.5704853086827336, "grad_norm": 2218.70703125, "learning_rate": 0.0004999530326086484, "loss": 49.2004, "step": 216 }, { "epoch": 0.5731264443710795, "grad_norm": 2097.824951171875, "learning_rate": 0.0004999522193615035, "loss": 43.5925, "step": 217 }, { "epoch": 0.5757675800594255, "grad_norm": 963.8384399414062, "learning_rate": 0.0004999513991345896, "loss": 43.9562, "step": 218 }, { "epoch": 0.5784087157477715, "grad_norm": 2932.369873046875, "learning_rate": 0.0004999505719279297, "loss": 43.7959, "step": 219 }, { "epoch": 0.5810498514361175, "grad_norm": 1217.371337890625, "learning_rate": 0.000499949737741547, "loss": 46.7271, "step": 220 }, { "epoch": 0.5836909871244635, "grad_norm": 1616.615234375, "learning_rate": 0.0004999488965754645, "loss": 47.6594, "step": 221 }, { "epoch": 0.5863321228128096, "grad_norm": 17641.650390625, "learning_rate": 0.000499948048429706, "loss": 86.0985, "step": 222 }, { "epoch": 0.5889732585011554, "grad_norm": 33040.4296875, "learning_rate": 0.0004999471933042951, "loss": 172.7441, "step": 223 }, { "epoch": 0.5916143941895015, "grad_norm": 33757.98046875, "learning_rate": 0.0004999463311992557, "loss": 145.7816, "step": 224 }, { "epoch": 0.5942555298778475, "grad_norm": 25608.65625, "learning_rate": 0.0004999454621146118, "loss": 144.5488, "step": 225 }, { "epoch": 0.5968966655661935, "grad_norm": 88068.9765625, "learning_rate": 0.0004999445860503876, "loss": 126.3383, "step": 226 }, { "epoch": 0.5995378012545395, "grad_norm": 36546.9140625, "learning_rate": 0.0004999437030066078, "loss": 107.2461, "step": 227 }, { "epoch": 0.6021789369428855, "grad_norm": 42848.90625, "learning_rate": 0.0004999428129832967, "loss": 97.0156, "step": 228 }, { "epoch": 0.6048200726312314, "grad_norm": 39999.0546875, "learning_rate": 0.0004999419159804796, "loss": 78.5137, "step": 229 }, { "epoch": 0.6074612083195774, "grad_norm": 25039.015625, "learning_rate": 0.0004999410119981812, "loss": 56.9064, "step": 230 }, { "epoch": 0.6101023440079234, "grad_norm": 41022.16796875, "learning_rate": 0.0004999401010364269, "loss": 65.4115, "step": 231 }, { "epoch": 0.6127434796962694, "grad_norm": 11290.1201171875, "learning_rate": 0.0004999391830952421, "loss": 127.3712, "step": 232 }, { "epoch": 0.6153846153846154, "grad_norm": 11360.6826171875, "learning_rate": 0.0004999382581746525, "loss": 88.1978, "step": 233 }, { "epoch": 0.6180257510729614, "grad_norm": 3777.018310546875, "learning_rate": 0.0004999373262746837, "loss": 58.3822, "step": 234 }, { "epoch": 0.6206668867613073, "grad_norm": 2697.8720703125, "learning_rate": 0.000499936387395362, "loss": 52.9492, "step": 235 }, { "epoch": 0.6233080224496533, "grad_norm": 5631.3759765625, "learning_rate": 0.0004999354415367134, "loss": 65.4678, "step": 236 }, { "epoch": 0.6259491581379993, "grad_norm": 4904.7421875, "learning_rate": 0.0004999344886987646, "loss": 58.5996, "step": 237 }, { "epoch": 0.6285902938263453, "grad_norm": 4000.262451171875, "learning_rate": 0.0004999335288815419, "loss": 46.9789, "step": 238 }, { "epoch": 0.6312314295146914, "grad_norm": 3316.953857421875, "learning_rate": 0.0004999325620850722, "loss": 46.2916, "step": 239 }, { "epoch": 0.6338725652030373, "grad_norm": 4618.51806640625, "learning_rate": 0.0004999315883093826, "loss": 50.7303, "step": 240 }, { "epoch": 0.6365137008913833, "grad_norm": 7926.99560546875, "learning_rate": 0.0004999306075545002, "loss": 56.3237, "step": 241 }, { "epoch": 0.6391548365797293, "grad_norm": 8567.333984375, "learning_rate": 0.0004999296198204523, "loss": 57.8494, "step": 242 }, { "epoch": 0.6417959722680753, "grad_norm": 11318.8466796875, "learning_rate": 0.0004999286251072667, "loss": 56.2077, "step": 243 }, { "epoch": 0.6444371079564213, "grad_norm": 3837.3388671875, "learning_rate": 0.000499927623414971, "loss": 54.2061, "step": 244 }, { "epoch": 0.6470782436447673, "grad_norm": 3667.240966796875, "learning_rate": 0.0004999266147435932, "loss": 50.3968, "step": 245 }, { "epoch": 0.6497193793331132, "grad_norm": 4050.201416015625, "learning_rate": 0.0004999255990931616, "loss": 49.1856, "step": 246 }, { "epoch": 0.6523605150214592, "grad_norm": 4911.85986328125, "learning_rate": 0.0004999245764637045, "loss": 49.9588, "step": 247 }, { "epoch": 0.6550016507098052, "grad_norm": 6207.18798828125, "learning_rate": 0.0004999235468552503, "loss": 52.5268, "step": 248 }, { "epoch": 0.6576427863981512, "grad_norm": 11193.6943359375, "learning_rate": 0.0004999225102678279, "loss": 56.8916, "step": 249 }, { "epoch": 0.6602839220864972, "grad_norm": 5139.15625, "learning_rate": 0.0004999214667014662, "loss": 60.6908, "step": 250 }, { "epoch": 0.6629250577748432, "grad_norm": 5854.3916015625, "learning_rate": 0.0004999204161561943, "loss": 50.0914, "step": 251 }, { "epoch": 0.6655661934631891, "grad_norm": 4811.49658203125, "learning_rate": 0.0004999193586320416, "loss": 53.0386, "step": 252 }, { "epoch": 0.6682073291515351, "grad_norm": 5437.96435546875, "learning_rate": 0.0004999182941290378, "loss": 53.3382, "step": 253 }, { "epoch": 0.6708484648398811, "grad_norm": 3205.693115234375, "learning_rate": 0.0004999172226472123, "loss": 51.1284, "step": 254 }, { "epoch": 0.6734896005282272, "grad_norm": 3348.385986328125, "learning_rate": 0.0004999161441865952, "loss": 53.7648, "step": 255 }, { "epoch": 0.6761307362165732, "grad_norm": 7013.53564453125, "learning_rate": 0.0004999150587472166, "loss": 56.8994, "step": 256 }, { "epoch": 0.6787718719049192, "grad_norm": 3873.184814453125, "learning_rate": 0.0004999139663291067, "loss": 60.4127, "step": 257 }, { "epoch": 0.6814130075932651, "grad_norm": 2731.37646484375, "learning_rate": 0.0004999128669322962, "loss": 54.5608, "step": 258 }, { "epoch": 0.6840541432816111, "grad_norm": 2844.651123046875, "learning_rate": 0.0004999117605568156, "loss": 51.3033, "step": 259 }, { "epoch": 0.6866952789699571, "grad_norm": 3361.22900390625, "learning_rate": 0.000499910647202696, "loss": 51.4113, "step": 260 }, { "epoch": 0.6893364146583031, "grad_norm": 2154.63720703125, "learning_rate": 0.0004999095268699682, "loss": 51.1206, "step": 261 }, { "epoch": 0.6919775503466491, "grad_norm": 3131.78466796875, "learning_rate": 0.0004999083995586639, "loss": 49.6677, "step": 262 }, { "epoch": 0.6946186860349951, "grad_norm": 2690.947509765625, "learning_rate": 0.0004999072652688142, "loss": 48.9571, "step": 263 }, { "epoch": 0.697259821723341, "grad_norm": 2018.7230224609375, "learning_rate": 0.000499906124000451, "loss": 46.5242, "step": 264 }, { "epoch": 0.699900957411687, "grad_norm": 4373.845703125, "learning_rate": 0.0004999049757536061, "loss": 46.5766, "step": 265 }, { "epoch": 0.702542093100033, "grad_norm": 2761.393310546875, "learning_rate": 0.0004999038205283114, "loss": 49.8086, "step": 266 }, { "epoch": 0.705183228788379, "grad_norm": 1716.7911376953125, "learning_rate": 0.0004999026583245995, "loss": 43.5669, "step": 267 }, { "epoch": 0.707824364476725, "grad_norm": 2879.282470703125, "learning_rate": 0.0004999014891425025, "loss": 43.0523, "step": 268 }, { "epoch": 0.7104655001650709, "grad_norm": 2365.137939453125, "learning_rate": 0.0004999003129820534, "loss": 42.1102, "step": 269 }, { "epoch": 0.7131066358534169, "grad_norm": 4848.61083984375, "learning_rate": 0.0004998991298432847, "loss": 42.6761, "step": 270 }, { "epoch": 0.715747771541763, "grad_norm": 2512.700439453125, "learning_rate": 0.0004998979397262297, "loss": 47.9838, "step": 271 }, { "epoch": 0.718388907230109, "grad_norm": 16371.736328125, "learning_rate": 0.0004998967426309215, "loss": 120.9951, "step": 272 }, { "epoch": 0.721030042918455, "grad_norm": 57735.4609375, "learning_rate": 0.0004998955385573935, "loss": 226.3049, "step": 273 }, { "epoch": 0.723671178606801, "grad_norm": 36172.90625, "learning_rate": 0.0004998943275056794, "loss": 191.7664, "step": 274 }, { "epoch": 0.7263123142951469, "grad_norm": 67116.4453125, "learning_rate": 0.0004998931094758131, "loss": 212.752, "step": 275 }, { "epoch": 0.7289534499834929, "grad_norm": 30639.3984375, "learning_rate": 0.0004998918844678284, "loss": 112.6465, "step": 276 }, { "epoch": 0.7315945856718389, "grad_norm": 33489.94140625, "learning_rate": 0.0004998906524817597, "loss": 93.2881, "step": 277 }, { "epoch": 0.7342357213601849, "grad_norm": 30865.501953125, "learning_rate": 0.0004998894135176414, "loss": 71.5283, "step": 278 }, { "epoch": 0.7368768570485309, "grad_norm": 6314.75439453125, "learning_rate": 0.0004998881675755078, "loss": 60.2012, "step": 279 }, { "epoch": 0.7395179927368769, "grad_norm": 15773.6064453125, "learning_rate": 0.000499886914655394, "loss": 229.8064, "step": 280 }, { "epoch": 0.7421591284252228, "grad_norm": 8100.28955078125, "learning_rate": 0.000499885654757335, "loss": 44.6821, "step": 281 }, { "epoch": 0.7448002641135688, "grad_norm": 53421.453125, "learning_rate": 0.0004998843878813658, "loss": 1432.4661, "step": 282 }, { "epoch": 0.7474413998019148, "grad_norm": 64292.02734375, "learning_rate": 0.0004998831140275218, "loss": 1177.1903, "step": 283 }, { "epoch": 0.7500825354902608, "grad_norm": 12210.9765625, "learning_rate": 0.0004998818331958388, "loss": 437.8733, "step": 284 }, { "epoch": 0.7527236711786068, "grad_norm": 9416.611328125, "learning_rate": 0.0004998805453863523, "loss": 187.6019, "step": 285 }, { "epoch": 0.7553648068669528, "grad_norm": 4085.390869140625, "learning_rate": 0.0004998792505990984, "loss": 81.8461, "step": 286 }, { "epoch": 0.7580059425552987, "grad_norm": 3688.35595703125, "learning_rate": 0.0004998779488341131, "loss": 62.8984, "step": 287 }, { "epoch": 0.7606470782436447, "grad_norm": 3483.939208984375, "learning_rate": 0.0004998766400914329, "loss": 78.7311, "step": 288 }, { "epoch": 0.7632882139319908, "grad_norm": 6274.22998046875, "learning_rate": 0.0004998753243710943, "loss": 80.5934, "step": 289 }, { "epoch": 0.7659293496203368, "grad_norm": 3734.331787109375, "learning_rate": 0.0004998740016731342, "loss": 76.0509, "step": 290 }, { "epoch": 0.7685704853086828, "grad_norm": 2442.794921875, "learning_rate": 0.0004998726719975892, "loss": 65.6246, "step": 291 }, { "epoch": 0.7712116209970287, "grad_norm": 1536.7857666015625, "learning_rate": 0.0004998713353444967, "loss": 57.6838, "step": 292 }, { "epoch": 0.7738527566853747, "grad_norm": 4096.09814453125, "learning_rate": 0.000499869991713894, "loss": 61.0592, "step": 293 }, { "epoch": 0.7764938923737207, "grad_norm": 2245.004638671875, "learning_rate": 0.0004998686411058184, "loss": 57.9205, "step": 294 }, { "epoch": 0.7791350280620667, "grad_norm": 1506.6851806640625, "learning_rate": 0.000499867283520308, "loss": 58.696, "step": 295 }, { "epoch": 0.7817761637504127, "grad_norm": 3001.488525390625, "learning_rate": 0.0004998659189574004, "loss": 66.2079, "step": 296 }, { "epoch": 0.7844172994387587, "grad_norm": 1701.214599609375, "learning_rate": 0.0004998645474171337, "loss": 72.7522, "step": 297 }, { "epoch": 0.7870584351271046, "grad_norm": 2758.474609375, "learning_rate": 0.0004998631688995463, "loss": 73.3997, "step": 298 }, { "epoch": 0.7896995708154506, "grad_norm": 3236.857421875, "learning_rate": 0.0004998617834046767, "loss": 73.6557, "step": 299 }, { "epoch": 0.7923407065037966, "grad_norm": 3866.06494140625, "learning_rate": 0.0004998603909325636, "loss": 74.3018, "step": 300 }, { "epoch": 0.7949818421921426, "grad_norm": 2081.338623046875, "learning_rate": 0.0004998589914832459, "loss": 61.5527, "step": 301 }, { "epoch": 0.7976229778804886, "grad_norm": 1187.7021484375, "learning_rate": 0.0004998575850567626, "loss": 61.5598, "step": 302 }, { "epoch": 0.8002641135688346, "grad_norm": 1573.0159912109375, "learning_rate": 0.0004998561716531531, "loss": 60.3658, "step": 303 }, { "epoch": 0.8029052492571805, "grad_norm": 1873.841552734375, "learning_rate": 0.0004998547512724567, "loss": 73.2816, "step": 304 }, { "epoch": 0.8055463849455266, "grad_norm": 1112.3157958984375, "learning_rate": 0.0004998533239147131, "loss": 71.2628, "step": 305 }, { "epoch": 0.8081875206338726, "grad_norm": 741.0847778320312, "learning_rate": 0.0004998518895799623, "loss": 66.1208, "step": 306 }, { "epoch": 0.8108286563222186, "grad_norm": 1611.5086669921875, "learning_rate": 0.0004998504482682442, "loss": 69.679, "step": 307 }, { "epoch": 0.8134697920105646, "grad_norm": 636.1163940429688, "learning_rate": 0.0004998489999795991, "loss": 65.0243, "step": 308 }, { "epoch": 0.8161109276989106, "grad_norm": 1358.7314453125, "learning_rate": 0.0004998475447140673, "loss": 62.3139, "step": 309 }, { "epoch": 0.8187520633872565, "grad_norm": 1004.1213989257812, "learning_rate": 0.0004998460824716898, "loss": 59.7747, "step": 310 }, { "epoch": 0.8213931990756025, "grad_norm": 819.4826049804688, "learning_rate": 0.0004998446132525072, "loss": 60.8384, "step": 311 }, { "epoch": 0.8240343347639485, "grad_norm": 1330.79345703125, "learning_rate": 0.0004998431370565605, "loss": 57.1831, "step": 312 }, { "epoch": 0.8266754704522945, "grad_norm": 352.2709655761719, "learning_rate": 0.0004998416538838909, "loss": 52.6507, "step": 313 }, { "epoch": 0.8293166061406405, "grad_norm": 584.5061645507812, "learning_rate": 0.0004998401637345399, "loss": 47.8401, "step": 314 }, { "epoch": 0.8319577418289865, "grad_norm": 840.9014282226562, "learning_rate": 0.0004998386666085491, "loss": 48.721, "step": 315 }, { "epoch": 0.8345988775173324, "grad_norm": 579.9915161132812, "learning_rate": 0.0004998371625059604, "loss": 50.113, "step": 316 }, { "epoch": 0.8372400132056784, "grad_norm": 545.55224609375, "learning_rate": 0.0004998356514268156, "loss": 45.2377, "step": 317 }, { "epoch": 0.8398811488940244, "grad_norm": 292.0387268066406, "learning_rate": 0.000499834133371157, "loss": 46.8563, "step": 318 }, { "epoch": 0.8425222845823704, "grad_norm": 473.66876220703125, "learning_rate": 0.000499832608339027, "loss": 46.3929, "step": 319 }, { "epoch": 0.8451634202707164, "grad_norm": 720.73779296875, "learning_rate": 0.0004998310763304682, "loss": 46.8629, "step": 320 }, { "epoch": 0.8478045559590623, "grad_norm": 685.930419921875, "learning_rate": 0.0004998295373455234, "loss": 46.0852, "step": 321 }, { "epoch": 0.8504456916474084, "grad_norm": 2401.373046875, "learning_rate": 0.0004998279913842354, "loss": 52.6046, "step": 322 }, { "epoch": 0.8530868273357544, "grad_norm": 5285.125, "learning_rate": 0.0004998264384466476, "loss": 97.3308, "step": 323 }, { "epoch": 0.8557279630241004, "grad_norm": 6421.681640625, "learning_rate": 0.0004998248785328031, "loss": 101.5967, "step": 324 }, { "epoch": 0.8583690987124464, "grad_norm": 4085.70849609375, "learning_rate": 0.0004998233116427457, "loss": 86.373, "step": 325 }, { "epoch": 0.8610102344007924, "grad_norm": 10426.2177734375, "learning_rate": 0.0004998217377765191, "loss": 68.7578, "step": 326 }, { "epoch": 0.8636513700891383, "grad_norm": 4288.19287109375, "learning_rate": 0.0004998201569341674, "loss": 55.9492, "step": 327 }, { "epoch": 0.8662925057774843, "grad_norm": 5265.4306640625, "learning_rate": 0.0004998185691157343, "loss": 44.5479, "step": 328 }, { "epoch": 0.8689336414658303, "grad_norm": 4283.0322265625, "learning_rate": 0.0004998169743212646, "loss": 79.1904, "step": 329 }, { "epoch": 0.8715747771541763, "grad_norm": 5330.3544921875, "learning_rate": 0.0004998153725508025, "loss": 28.6396, "step": 330 }, { "epoch": 0.8742159128425223, "grad_norm": 2514.101806640625, "learning_rate": 0.0004998137638043929, "loss": 30.3107, "step": 331 }, { "epoch": 0.8768570485308683, "grad_norm": 2371.185546875, "learning_rate": 0.0004998121480820808, "loss": 91.4339, "step": 332 }, { "epoch": 0.8794981842192142, "grad_norm": 4134.07080078125, "learning_rate": 0.0004998105253839111, "loss": 245.4532, "step": 333 }, { "epoch": 0.8821393199075602, "grad_norm": 3798.6396484375, "learning_rate": 0.0004998088957099293, "loss": 184.3061, "step": 334 }, { "epoch": 0.8847804555959062, "grad_norm": 3005.31689453125, "learning_rate": 0.0004998072590601808, "loss": 103.6607, "step": 335 }, { "epoch": 0.8874215912842522, "grad_norm": 1437.6070556640625, "learning_rate": 0.0004998056154347113, "loss": 71.4409, "step": 336 }, { "epoch": 0.8900627269725983, "grad_norm": 4776.62939453125, "learning_rate": 0.0004998039648335667, "loss": 123.63, "step": 337 }, { "epoch": 0.8927038626609443, "grad_norm": 4217.09130859375, "learning_rate": 0.0004998023072567932, "loss": 121.9807, "step": 338 }, { "epoch": 0.8953449983492902, "grad_norm": 2413.409423828125, "learning_rate": 0.000499800642704437, "loss": 73.8151, "step": 339 }, { "epoch": 0.8979861340376362, "grad_norm": 1566.010986328125, "learning_rate": 0.0004997989711765446, "loss": 54.8245, "step": 340 }, { "epoch": 0.9006272697259822, "grad_norm": 2188.10888671875, "learning_rate": 0.0004997972926731627, "loss": 63.7021, "step": 341 }, { "epoch": 0.9032684054143282, "grad_norm": 2702.580322265625, "learning_rate": 0.0004997956071943381, "loss": 71.8303, "step": 342 }, { "epoch": 0.9059095411026742, "grad_norm": 3262.1796875, "learning_rate": 0.0004997939147401179, "loss": 76.8448, "step": 343 }, { "epoch": 0.9085506767910201, "grad_norm": 2275.38232421875, "learning_rate": 0.0004997922153105493, "loss": 69.7659, "step": 344 }, { "epoch": 0.9111918124793661, "grad_norm": 1388.806884765625, "learning_rate": 0.0004997905089056799, "loss": 59.2313, "step": 345 }, { "epoch": 0.9138329481677121, "grad_norm": 1541.6566162109375, "learning_rate": 0.0004997887955255574, "loss": 50.5105, "step": 346 }, { "epoch": 0.9164740838560581, "grad_norm": 1590.149658203125, "learning_rate": 0.0004997870751702293, "loss": 51.5989, "step": 347 }, { "epoch": 0.9191152195444041, "grad_norm": 7822.51123046875, "learning_rate": 0.000499785347839744, "loss": 56.359, "step": 348 }, { "epoch": 0.9217563552327501, "grad_norm": 7707.18017578125, "learning_rate": 0.0004997836135341495, "loss": 68.4879, "step": 349 }, { "epoch": 0.924397490921096, "grad_norm": 8746.6943359375, "learning_rate": 0.0004997818722534944, "loss": 73.6874, "step": 350 }, { "epoch": 0.927038626609442, "grad_norm": 3766.367431640625, "learning_rate": 0.0004997801239978272, "loss": 78.4761, "step": 351 }, { "epoch": 0.929679762297788, "grad_norm": 3463.178955078125, "learning_rate": 0.0004997783687671968, "loss": 71.6326, "step": 352 }, { "epoch": 0.932320897986134, "grad_norm": 4809.09033203125, "learning_rate": 0.0004997766065616521, "loss": 69.1884, "step": 353 }, { "epoch": 0.93496203367448, "grad_norm": 6228.24609375, "learning_rate": 0.0004997748373812425, "loss": 65.0472, "step": 354 }, { "epoch": 0.9376031693628261, "grad_norm": 7708.1982421875, "learning_rate": 0.0004997730612260172, "loss": 61.2752, "step": 355 }, { "epoch": 0.940244305051172, "grad_norm": 6855.93359375, "learning_rate": 0.000499771278096026, "loss": 64.7672, "step": 356 }, { "epoch": 0.942885440739518, "grad_norm": 7265.1826171875, "learning_rate": 0.0004997694879913185, "loss": 58.9159, "step": 357 }, { "epoch": 0.945526576427864, "grad_norm": 8104.8291015625, "learning_rate": 0.0004997676909119448, "loss": 53.6124, "step": 358 }, { "epoch": 0.94816771211621, "grad_norm": 4278.5673828125, "learning_rate": 0.000499765886857955, "loss": 55.6288, "step": 359 }, { "epoch": 0.950808847804556, "grad_norm": 11907.177734375, "learning_rate": 0.0004997640758293995, "loss": 55.2491, "step": 360 }, { "epoch": 0.953449983492902, "grad_norm": 5599.8759765625, "learning_rate": 0.000499762257826329, "loss": 55.2029, "step": 361 }, { "epoch": 0.9560911191812479, "grad_norm": 6403.51953125, "learning_rate": 0.0004997604328487942, "loss": 54.2789, "step": 362 }, { "epoch": 0.9587322548695939, "grad_norm": 31018.0078125, "learning_rate": 0.000499758600896846, "loss": 131.8263, "step": 363 }, { "epoch": 0.9613733905579399, "grad_norm": 32613.20703125, "learning_rate": 0.0004997567619705355, "loss": 189.7539, "step": 364 }, { "epoch": 0.9640145262462859, "grad_norm": 29060.837890625, "learning_rate": 0.0004997549160699142, "loss": 169.189, "step": 365 }, { "epoch": 0.9666556619346319, "grad_norm": 23381.0390625, "learning_rate": 0.0004997530631950335, "loss": 166.2148, "step": 366 }, { "epoch": 0.9692967976229779, "grad_norm": 9796.73828125, "learning_rate": 0.0004997512033459453, "loss": 155.0996, "step": 367 }, { "epoch": 0.9719379333113238, "grad_norm": 38940.359375, "learning_rate": 0.0004997493365227015, "loss": 175.8623, "step": 368 }, { "epoch": 0.9745790689996698, "grad_norm": 18331.919921875, "learning_rate": 0.0004997474627253542, "loss": 61.7454, "step": 369 }, { "epoch": 0.9772202046880158, "grad_norm": 4923.82470703125, "learning_rate": 0.0004997455819539556, "loss": 63.3026, "step": 370 }, { "epoch": 0.9798613403763619, "grad_norm": 12278.3720703125, "learning_rate": 0.0004997436942085583, "loss": 63.4896, "step": 371 }, { "epoch": 0.9825024760647079, "grad_norm": 9419.904296875, "learning_rate": 0.0004997417994892152, "loss": 63.2351, "step": 372 }, { "epoch": 0.9851436117530538, "grad_norm": 7863.21875, "learning_rate": 0.0004997398977959791, "loss": 57.4991, "step": 373 }, { "epoch": 0.9877847474413998, "grad_norm": 5677.07177734375, "learning_rate": 0.0004997379891289029, "loss": 54.8447, "step": 374 }, { "epoch": 0.9904258831297458, "grad_norm": 3484.48876953125, "learning_rate": 0.0004997360734880401, "loss": 51.6911, "step": 375 }, { "epoch": 0.9930670188180918, "grad_norm": 7194.0595703125, "learning_rate": 0.0004997341508734441, "loss": 47.4053, "step": 376 }, { "epoch": 0.9957081545064378, "grad_norm": 5090.0185546875, "learning_rate": 0.0004997322212851687, "loss": 49.681, "step": 377 }, { "epoch": 0.9983492901947838, "grad_norm": 5444.7880859375, "learning_rate": 0.0004997302847232677, "loss": 51.0672, "step": 378 }, { "epoch": 1.0009904258831297, "grad_norm": 7423.53564453125, "learning_rate": 0.0004997283411877953, "loss": 51.9964, "step": 379 }, { "epoch": 1.0036315615714757, "grad_norm": 10409.9765625, "learning_rate": 0.0004997263906788055, "loss": 52.6635, "step": 380 }, { "epoch": 1.0062726972598217, "grad_norm": 8029.009765625, "learning_rate": 0.000499724433196353, "loss": 49.814, "step": 381 }, { "epoch": 1.0089138329481677, "grad_norm": 5955.08837890625, "learning_rate": 0.0004997224687404926, "loss": 52.1557, "step": 382 }, { "epoch": 1.0115549686365137, "grad_norm": 12651.8564453125, "learning_rate": 0.0004997204973112787, "loss": 56.5734, "step": 383 }, { "epoch": 1.0141961043248597, "grad_norm": 11424.328125, "learning_rate": 0.0004997185189087667, "loss": 59.9592, "step": 384 }, { "epoch": 1.0168372400132057, "grad_norm": 10582.7421875, "learning_rate": 0.0004997165335330117, "loss": 61.2396, "step": 385 }, { "epoch": 1.0194783757015518, "grad_norm": 8767.802734375, "learning_rate": 0.0004997145411840692, "loss": 56.9444, "step": 386 }, { "epoch": 1.0221195113898978, "grad_norm": 16624.921875, "learning_rate": 0.0004997125418619949, "loss": 57.315, "step": 387 }, { "epoch": 1.0247606470782435, "grad_norm": 7268.57275390625, "learning_rate": 0.0004997105355668446, "loss": 57.318, "step": 388 }, { "epoch": 1.0274017827665896, "grad_norm": 8246.8564453125, "learning_rate": 0.0004997085222986741, "loss": 52.4045, "step": 389 }, { "epoch": 1.0300429184549356, "grad_norm": 9282.318359375, "learning_rate": 0.0004997065020575399, "loss": 51.5624, "step": 390 }, { "epoch": 1.0326840541432816, "grad_norm": 15276.265625, "learning_rate": 0.0004997044748434983, "loss": 52.2461, "step": 391 }, { "epoch": 1.0353251898316276, "grad_norm": 7414.84716796875, "learning_rate": 0.000499702440656606, "loss": 48.6172, "step": 392 }, { "epoch": 1.0379663255199736, "grad_norm": 5877.34619140625, "learning_rate": 0.0004997003994969195, "loss": 46.4119, "step": 393 }, { "epoch": 1.0406074612083196, "grad_norm": 4334.9736328125, "learning_rate": 0.0004996983513644963, "loss": 48.0705, "step": 394 }, { "epoch": 1.0432485968966656, "grad_norm": 4868.3583984375, "learning_rate": 0.0004996962962593932, "loss": 45.4562, "step": 395 }, { "epoch": 1.0458897325850116, "grad_norm": 5387.1083984375, "learning_rate": 0.0004996942341816677, "loss": 45.2076, "step": 396 }, { "epoch": 1.0485308682733576, "grad_norm": 3308.402099609375, "learning_rate": 0.0004996921651313775, "loss": 42.5609, "step": 397 }, { "epoch": 1.0511720039617036, "grad_norm": 3992.4365234375, "learning_rate": 0.0004996900891085801, "loss": 44.2654, "step": 398 }, { "epoch": 1.0538131396500496, "grad_norm": 2647.18115234375, "learning_rate": 0.0004996880061133338, "loss": 43.8749, "step": 399 }, { "epoch": 1.0564542753383954, "grad_norm": 3075.710205078125, "learning_rate": 0.0004996859161456965, "loss": 44.3177, "step": 400 }, { "epoch": 1.0564542753383954, "eval_loss": 8.147329330444336, "eval_runtime": 2.1176, "eval_samples_per_second": 233.756, "eval_steps_per_second": 29.279, "step": 400 }, { "epoch": 1.0590954110267414, "grad_norm": 16605.0234375, "learning_rate": 0.0004996838192057266, "loss": 79.9442, "step": 401 }, { "epoch": 1.0617365467150874, "grad_norm": 25329.361328125, "learning_rate": 0.0004996817152934828, "loss": 144.2154, "step": 402 }, { "epoch": 1.0643776824034334, "grad_norm": 14432.0498046875, "learning_rate": 0.0004996796044090239, "loss": 129.6633, "step": 403 }, { "epoch": 1.0670188180917795, "grad_norm": 13798.5341796875, "learning_rate": 0.0004996774865524085, "loss": 131.9882, "step": 404 }, { "epoch": 1.0696599537801255, "grad_norm": 29038.947265625, "learning_rate": 0.000499675361723696, "loss": 135.7388, "step": 405 }, { "epoch": 1.0723010894684715, "grad_norm": 9699.5751953125, "learning_rate": 0.0004996732299229458, "loss": 122.4023, "step": 406 }, { "epoch": 1.0749422251568175, "grad_norm": 14256.76171875, "learning_rate": 0.0004996710911502173, "loss": 97.5645, "step": 407 }, { "epoch": 1.0775833608451635, "grad_norm": 10029.158203125, "learning_rate": 0.0004996689454055701, "loss": 95.6055, "step": 408 }, { "epoch": 1.0802244965335095, "grad_norm": 22435.41796875, "learning_rate": 0.0004996667926890643, "loss": 82.5234, "step": 409 }, { "epoch": 1.0828656322218555, "grad_norm": 9285.9873046875, "learning_rate": 0.0004996646330007601, "loss": 80.5098, "step": 410 }, { "epoch": 1.0855067679102013, "grad_norm": 4406.46337890625, "learning_rate": 0.0004996624663407177, "loss": 84.2732, "step": 411 }, { "epoch": 1.0881479035985473, "grad_norm": 3524.113525390625, "learning_rate": 0.0004996602927089975, "loss": 82.7528, "step": 412 }, { "epoch": 1.0907890392868933, "grad_norm": 2806.48046875, "learning_rate": 0.0004996581121056603, "loss": 73.6846, "step": 413 }, { "epoch": 1.0934301749752393, "grad_norm": 2142.010009765625, "learning_rate": 0.0004996559245307669, "loss": 55.8907, "step": 414 }, { "epoch": 1.0960713106635853, "grad_norm": 3916.144287109375, "learning_rate": 0.0004996537299843786, "loss": 46.2318, "step": 415 }, { "epoch": 1.0987124463519313, "grad_norm": 2185.61669921875, "learning_rate": 0.0004996515284665565, "loss": 45.4101, "step": 416 }, { "epoch": 1.1013535820402773, "grad_norm": 3138.462890625, "learning_rate": 0.0004996493199773621, "loss": 49.1023, "step": 417 }, { "epoch": 1.1039947177286233, "grad_norm": 5794.13916015625, "learning_rate": 0.0004996471045168571, "loss": 60.5381, "step": 418 }, { "epoch": 1.1066358534169694, "grad_norm": 2643.186767578125, "learning_rate": 0.0004996448820851032, "loss": 66.7618, "step": 419 }, { "epoch": 1.1092769891053154, "grad_norm": 3830.488525390625, "learning_rate": 0.0004996426526821629, "loss": 60.7875, "step": 420 }, { "epoch": 1.1119181247936614, "grad_norm": 7337.04052734375, "learning_rate": 0.000499640416308098, "loss": 55.203, "step": 421 }, { "epoch": 1.1145592604820074, "grad_norm": 4681.22607421875, "learning_rate": 0.0004996381729629712, "loss": 49.3558, "step": 422 }, { "epoch": 1.1172003961703532, "grad_norm": 9538.955078125, "learning_rate": 0.000499635922646845, "loss": 46.7217, "step": 423 }, { "epoch": 1.1198415318586992, "grad_norm": 2420.781494140625, "learning_rate": 0.0004996336653597822, "loss": 44.4068, "step": 424 }, { "epoch": 1.1224826675470452, "grad_norm": 4225.2490234375, "learning_rate": 0.0004996314011018462, "loss": 45.3308, "step": 425 }, { "epoch": 1.1251238032353912, "grad_norm": 2804.796142578125, "learning_rate": 0.0004996291298730998, "loss": 45.2971, "step": 426 }, { "epoch": 1.1277649389237372, "grad_norm": 3813.326904296875, "learning_rate": 0.0004996268516736066, "loss": 48.9122, "step": 427 }, { "epoch": 1.1304060746120832, "grad_norm": 3948.127685546875, "learning_rate": 0.0004996245665034303, "loss": 52.9978, "step": 428 }, { "epoch": 1.1330472103004292, "grad_norm": 5007.6376953125, "learning_rate": 0.0004996222743626345, "loss": 53.679, "step": 429 }, { "epoch": 1.1356883459887752, "grad_norm": 2612.2451171875, "learning_rate": 0.0004996199752512834, "loss": 47.7507, "step": 430 }, { "epoch": 1.1383294816771212, "grad_norm": 4742.66552734375, "learning_rate": 0.000499617669169441, "loss": 46.7293, "step": 431 }, { "epoch": 1.1409706173654672, "grad_norm": 2433.10302734375, "learning_rate": 0.000499615356117172, "loss": 45.6965, "step": 432 }, { "epoch": 1.143611753053813, "grad_norm": 2576.901611328125, "learning_rate": 0.0004996130360945406, "loss": 48.0056, "step": 433 }, { "epoch": 1.146252888742159, "grad_norm": 4015.420654296875, "learning_rate": 0.000499610709101612, "loss": 55.8114, "step": 434 }, { "epoch": 1.148894024430505, "grad_norm": 3131.446533203125, "learning_rate": 0.0004996083751384509, "loss": 52.478, "step": 435 }, { "epoch": 1.151535160118851, "grad_norm": 2793.3291015625, "learning_rate": 0.0004996060342051227, "loss": 48.8852, "step": 436 }, { "epoch": 1.154176295807197, "grad_norm": 2100.110107421875, "learning_rate": 0.0004996036863016924, "loss": 50.7588, "step": 437 }, { "epoch": 1.156817431495543, "grad_norm": 2351.662841796875, "learning_rate": 0.0004996013314282258, "loss": 47.412, "step": 438 }, { "epoch": 1.159458567183889, "grad_norm": 2357.10498046875, "learning_rate": 0.0004995989695847887, "loss": 49.3401, "step": 439 }, { "epoch": 1.162099702872235, "grad_norm": 1971.196044921875, "learning_rate": 0.0004995966007714471, "loss": 48.1966, "step": 440 }, { "epoch": 1.164740838560581, "grad_norm": 1422.1444091796875, "learning_rate": 0.0004995942249882669, "loss": 44.2923, "step": 441 }, { "epoch": 1.167381974248927, "grad_norm": 1437.2418212890625, "learning_rate": 0.0004995918422353147, "loss": 46.1145, "step": 442 }, { "epoch": 1.170023109937273, "grad_norm": 1270.9195556640625, "learning_rate": 0.0004995894525126567, "loss": 44.5735, "step": 443 }, { "epoch": 1.172664245625619, "grad_norm": 1111.537353515625, "learning_rate": 0.0004995870558203602, "loss": 42.5019, "step": 444 }, { "epoch": 1.1753053813139651, "grad_norm": 1020.627685546875, "learning_rate": 0.0004995846521584915, "loss": 43.8832, "step": 445 }, { "epoch": 1.177946517002311, "grad_norm": 1019.0801391601562, "learning_rate": 0.0004995822415271181, "loss": 42.5139, "step": 446 }, { "epoch": 1.180587652690657, "grad_norm": 1492.573974609375, "learning_rate": 0.0004995798239263072, "loss": 44.2544, "step": 447 }, { "epoch": 1.183228788379003, "grad_norm": 1011.8543701171875, "learning_rate": 0.0004995773993561264, "loss": 41.1282, "step": 448 }, { "epoch": 1.185869924067349, "grad_norm": 1557.2435302734375, "learning_rate": 0.0004995749678166433, "loss": 42.204, "step": 449 }, { "epoch": 1.188511059755695, "grad_norm": 2339.2412109375, "learning_rate": 0.0004995725293079257, "loss": 44.2497, "step": 450 }, { "epoch": 1.191152195444041, "grad_norm": 7424.9716796875, "learning_rate": 0.0004995700838300419, "loss": 54.8381, "step": 451 }, { "epoch": 1.193793331132387, "grad_norm": 13564.4462890625, "learning_rate": 0.0004995676313830602, "loss": 126.2832, "step": 452 }, { "epoch": 1.196434466820733, "grad_norm": 20306.1015625, "learning_rate": 0.000499565171967049, "loss": 114.584, "step": 453 }, { "epoch": 1.199075602509079, "grad_norm": 18799.34765625, "learning_rate": 0.0004995627055820769, "loss": 87.7366, "step": 454 }, { "epoch": 1.201716738197425, "grad_norm": 22542.572265625, "learning_rate": 0.0004995602322282128, "loss": 98.0518, "step": 455 }, { "epoch": 1.2043578738857708, "grad_norm": 41804.93359375, "learning_rate": 0.000499557751905526, "loss": 79.7927, "step": 456 }, { "epoch": 1.2069990095741168, "grad_norm": 15913.638671875, "learning_rate": 0.0004995552646140854, "loss": 72.537, "step": 457 }, { "epoch": 1.2096401452624628, "grad_norm": 29981.4296875, "learning_rate": 0.0004995527703539606, "loss": 67.9288, "step": 458 }, { "epoch": 1.2122812809508088, "grad_norm": 12425.8154296875, "learning_rate": 0.0004995502691252214, "loss": 53.504, "step": 459 }, { "epoch": 1.2149224166391548, "grad_norm": 18735.111328125, "learning_rate": 0.0004995477609279376, "loss": 43.066, "step": 460 }, { "epoch": 1.2175635523275008, "grad_norm": 7316.2880859375, "learning_rate": 0.0004995452457621791, "loss": 83.7501, "step": 461 }, { "epoch": 1.2202046880158468, "grad_norm": 4239.92333984375, "learning_rate": 0.0004995427236280161, "loss": 95.4458, "step": 462 }, { "epoch": 1.2228458237041928, "grad_norm": 22697.484375, "learning_rate": 0.0004995401945255192, "loss": 83.4394, "step": 463 }, { "epoch": 1.2254869593925388, "grad_norm": 3946.581787109375, "learning_rate": 0.0004995376584547589, "loss": 71.7107, "step": 464 }, { "epoch": 1.2281280950808848, "grad_norm": 3096.7255859375, "learning_rate": 0.0004995351154158062, "loss": 61.1091, "step": 465 }, { "epoch": 1.2307692307692308, "grad_norm": 2780.000244140625, "learning_rate": 0.0004995325654087319, "loss": 56.2209, "step": 466 }, { "epoch": 1.2334103664575768, "grad_norm": 4352.5478515625, "learning_rate": 0.0004995300084336073, "loss": 51.5966, "step": 467 }, { "epoch": 1.2360515021459229, "grad_norm": 8447.5595703125, "learning_rate": 0.0004995274444905039, "loss": 51.8074, "step": 468 }, { "epoch": 1.2386926378342686, "grad_norm": 9883.47265625, "learning_rate": 0.000499524873579493, "loss": 54.9866, "step": 469 }, { "epoch": 1.2413337735226146, "grad_norm": 7821.091796875, "learning_rate": 0.0004995222957006468, "loss": 57.0134, "step": 470 }, { "epoch": 1.2439749092109607, "grad_norm": 5459.39990234375, "learning_rate": 0.0004995197108540369, "loss": 56.4015, "step": 471 }, { "epoch": 1.2466160448993067, "grad_norm": 12328.42578125, "learning_rate": 0.0004995171190397357, "loss": 62.9803, "step": 472 }, { "epoch": 1.2492571805876527, "grad_norm": 8671.453125, "learning_rate": 0.0004995145202578156, "loss": 60.6457, "step": 473 }, { "epoch": 1.2518983162759987, "grad_norm": 8542.6376953125, "learning_rate": 0.0004995119145083489, "loss": 57.553, "step": 474 }, { "epoch": 1.2545394519643447, "grad_norm": 7681.24853515625, "learning_rate": 0.0004995093017914088, "loss": 58.733, "step": 475 }, { "epoch": 1.2571805876526907, "grad_norm": 9965.6708984375, "learning_rate": 0.0004995066821070679, "loss": 64.4564, "step": 476 }, { "epoch": 1.2598217233410367, "grad_norm": 8499.2158203125, "learning_rate": 0.0004995040554553995, "loss": 57.217, "step": 477 }, { "epoch": 1.2624628590293827, "grad_norm": 16765.43359375, "learning_rate": 0.0004995014218364768, "loss": 56.0627, "step": 478 }, { "epoch": 1.2651039947177285, "grad_norm": 19967.9453125, "learning_rate": 0.0004994987812503737, "loss": 56.653, "step": 479 }, { "epoch": 1.2677451304060745, "grad_norm": 11240.9580078125, "learning_rate": 0.0004994961336971635, "loss": 50.6414, "step": 480 }, { "epoch": 1.2703862660944205, "grad_norm": 5956.078125, "learning_rate": 0.0004994934791769203, "loss": 49.3475, "step": 481 }, { "epoch": 1.2730274017827665, "grad_norm": 8427.8876953125, "learning_rate": 0.0004994908176897183, "loss": 50.8914, "step": 482 }, { "epoch": 1.2756685374711125, "grad_norm": 7899.435546875, "learning_rate": 0.0004994881492356319, "loss": 52.3463, "step": 483 }, { "epoch": 1.2783096731594585, "grad_norm": 9327.703125, "learning_rate": 0.0004994854738147354, "loss": 51.9647, "step": 484 }, { "epoch": 1.2809508088478045, "grad_norm": 9134.2783203125, "learning_rate": 0.0004994827914271036, "loss": 55.6261, "step": 485 }, { "epoch": 1.2835919445361506, "grad_norm": 5185.26025390625, "learning_rate": 0.0004994801020728114, "loss": 50.8304, "step": 486 }, { "epoch": 1.2862330802244966, "grad_norm": 6586.0634765625, "learning_rate": 0.0004994774057519339, "loss": 50.7393, "step": 487 }, { "epoch": 1.2888742159128426, "grad_norm": 5045.9580078125, "learning_rate": 0.0004994747024645464, "loss": 49.692, "step": 488 }, { "epoch": 1.2915153516011886, "grad_norm": 5303.17138671875, "learning_rate": 0.0004994719922107244, "loss": 48.9539, "step": 489 }, { "epoch": 1.2941564872895346, "grad_norm": 5645.74755859375, "learning_rate": 0.0004994692749905435, "loss": 47.9993, "step": 490 }, { "epoch": 1.2967976229778806, "grad_norm": 6330.240234375, "learning_rate": 0.0004994665508040798, "loss": 48.1565, "step": 491 }, { "epoch": 1.2994387586662266, "grad_norm": 4152.6552734375, "learning_rate": 0.000499463819651409, "loss": 45.8828, "step": 492 }, { "epoch": 1.3020798943545724, "grad_norm": 3841.1416015625, "learning_rate": 0.0004994610815326077, "loss": 44.3639, "step": 493 }, { "epoch": 1.3047210300429184, "grad_norm": 3289.11181640625, "learning_rate": 0.0004994583364477522, "loss": 45.6737, "step": 494 }, { "epoch": 1.3073621657312644, "grad_norm": 3646.172119140625, "learning_rate": 0.0004994555843969193, "loss": 45.572, "step": 495 }, { "epoch": 1.3100033014196104, "grad_norm": 2844.275390625, "learning_rate": 0.0004994528253801857, "loss": 46.3507, "step": 496 }, { "epoch": 1.3126444371079564, "grad_norm": 3148.06884765625, "learning_rate": 0.0004994500593976283, "loss": 48.7044, "step": 497 }, { "epoch": 1.3152855727963024, "grad_norm": 2899.010498046875, "learning_rate": 0.0004994472864493247, "loss": 44.4537, "step": 498 }, { "epoch": 1.3179267084846484, "grad_norm": 2117.779541015625, "learning_rate": 0.0004994445065353522, "loss": 43.0583, "step": 499 }, { "epoch": 1.3205678441729944, "grad_norm": 4640.22119140625, "learning_rate": 0.0004994417196557883, "loss": 48.3768, "step": 500 }, { "epoch": 1.3232089798613405, "grad_norm": 58048.63671875, "learning_rate": 0.0004994389258107109, "loss": 101.3468, "step": 501 }, { "epoch": 1.3258501155496862, "grad_norm": 38644.48828125, "learning_rate": 0.000499436125000198, "loss": 104.536, "step": 502 }, { "epoch": 1.3284912512380322, "grad_norm": 51349.16015625, "learning_rate": 0.0004994333172243279, "loss": 107.2685, "step": 503 }, { "epoch": 1.3311323869263783, "grad_norm": 52654.97265625, "learning_rate": 0.000499430502483179, "loss": 121.7319, "step": 504 }, { "epoch": 1.3337735226147243, "grad_norm": 29755.24609375, "learning_rate": 0.0004994276807768298, "loss": 140.2886, "step": 505 }, { "epoch": 1.3364146583030703, "grad_norm": 79252.2890625, "learning_rate": 0.0004994248521053591, "loss": 133.5311, "step": 506 }, { "epoch": 1.3390557939914163, "grad_norm": 36871.0078125, "learning_rate": 0.0004994220164688458, "loss": 150.2399, "step": 507 }, { "epoch": 1.3416969296797623, "grad_norm": 51938.15234375, "learning_rate": 0.0004994191738673695, "loss": 141.3797, "step": 508 }, { "epoch": 1.3443380653681083, "grad_norm": 32136.3671875, "learning_rate": 0.000499416324301009, "loss": 112.0729, "step": 509 }, { "epoch": 1.3469792010564543, "grad_norm": 16743.86328125, "learning_rate": 0.0004994134677698444, "loss": 114.1638, "step": 510 }, { "epoch": 1.3496203367448003, "grad_norm": 11779.8671875, "learning_rate": 0.0004994106042739551, "loss": 66.4663, "step": 511 }, { "epoch": 1.3522614724331463, "grad_norm": 1853.3721923828125, "learning_rate": 0.0004994077338134212, "loss": 44.0832, "step": 512 }, { "epoch": 1.3549026081214923, "grad_norm": 1720.296142578125, "learning_rate": 0.0004994048563883228, "loss": 45.0265, "step": 513 }, { "epoch": 1.3575437438098383, "grad_norm": 1919.02099609375, "learning_rate": 0.0004994019719987404, "loss": 46.0567, "step": 514 }, { "epoch": 1.3601848794981843, "grad_norm": 2378.523681640625, "learning_rate": 0.0004993990806447544, "loss": 45.8317, "step": 515 }, { "epoch": 1.3628260151865301, "grad_norm": 2132.94091796875, "learning_rate": 0.0004993961823264455, "loss": 47.6574, "step": 516 }, { "epoch": 1.3654671508748761, "grad_norm": 2495.041748046875, "learning_rate": 0.0004993932770438948, "loss": 43.9656, "step": 517 }, { "epoch": 1.3681082865632221, "grad_norm": 2669.94580078125, "learning_rate": 0.0004993903647971833, "loss": 46.8583, "step": 518 }, { "epoch": 1.3707494222515682, "grad_norm": 2995.564697265625, "learning_rate": 0.0004993874455863923, "loss": 43.0483, "step": 519 }, { "epoch": 1.3733905579399142, "grad_norm": 2154.090576171875, "learning_rate": 0.0004993845194116035, "loss": 42.5667, "step": 520 }, { "epoch": 1.3760316936282602, "grad_norm": 2002.6939697265625, "learning_rate": 0.0004993815862728984, "loss": 41.7963, "step": 521 }, { "epoch": 1.3786728293166062, "grad_norm": 2438.119873046875, "learning_rate": 0.0004993786461703591, "loss": 45.6839, "step": 522 }, { "epoch": 1.3813139650049522, "grad_norm": 2218.1494140625, "learning_rate": 0.0004993756991040675, "loss": 42.8831, "step": 523 }, { "epoch": 1.3839551006932982, "grad_norm": 2478.381591796875, "learning_rate": 0.000499372745074106, "loss": 44.0374, "step": 524 }, { "epoch": 1.386596236381644, "grad_norm": 2655.193603515625, "learning_rate": 0.0004993697840805573, "loss": 45.5458, "step": 525 }, { "epoch": 1.38923737206999, "grad_norm": 4535.33544921875, "learning_rate": 0.0004993668161235035, "loss": 44.4815, "step": 526 }, { "epoch": 1.391878507758336, "grad_norm": 7490.2197265625, "learning_rate": 0.0004993638412030281, "loss": 46.4061, "step": 527 }, { "epoch": 1.394519643446682, "grad_norm": 4644.63232421875, "learning_rate": 0.0004993608593192138, "loss": 46.6225, "step": 528 }, { "epoch": 1.397160779135028, "grad_norm": 12344.05078125, "learning_rate": 0.0004993578704721441, "loss": 51.5709, "step": 529 }, { "epoch": 1.399801914823374, "grad_norm": 1496.318359375, "learning_rate": 0.0004993548746619022, "loss": 43.9672, "step": 530 }, { "epoch": 1.40244305051172, "grad_norm": 1497.429931640625, "learning_rate": 0.000499351871888572, "loss": 44.686, "step": 531 }, { "epoch": 1.405084186200066, "grad_norm": 7020.529296875, "learning_rate": 0.0004993488621522373, "loss": 46.7022, "step": 532 }, { "epoch": 1.407725321888412, "grad_norm": 2337.841064453125, "learning_rate": 0.000499345845452982, "loss": 48.123, "step": 533 }, { "epoch": 1.410366457576758, "grad_norm": 1818.02294921875, "learning_rate": 0.0004993428217908905, "loss": 48.1205, "step": 534 }, { "epoch": 1.413007593265104, "grad_norm": 1475.47314453125, "learning_rate": 0.0004993397911660472, "loss": 47.6012, "step": 535 }, { "epoch": 1.41564872895345, "grad_norm": 1855.9935302734375, "learning_rate": 0.0004993367535785366, "loss": 47.0017, "step": 536 }, { "epoch": 1.418289864641796, "grad_norm": 1419.2432861328125, "learning_rate": 0.0004993337090284436, "loss": 48.8278, "step": 537 }, { "epoch": 1.420931000330142, "grad_norm": 1147.0777587890625, "learning_rate": 0.0004993306575158534, "loss": 47.3918, "step": 538 }, { "epoch": 1.423572136018488, "grad_norm": 4708.24462890625, "learning_rate": 0.000499327599040851, "loss": 48.3891, "step": 539 }, { "epoch": 1.4262132717068339, "grad_norm": 2563.889892578125, "learning_rate": 0.0004993245336035218, "loss": 47.3172, "step": 540 }, { "epoch": 1.4288544073951799, "grad_norm": 1204.1533203125, "learning_rate": 0.0004993214612039515, "loss": 45.8471, "step": 541 }, { "epoch": 1.431495543083526, "grad_norm": 989.9984130859375, "learning_rate": 0.0004993183818422259, "loss": 44.0731, "step": 542 }, { "epoch": 1.434136678771872, "grad_norm": 1431.69677734375, "learning_rate": 0.0004993152955184309, "loss": 42.449, "step": 543 }, { "epoch": 1.436777814460218, "grad_norm": 1838.745849609375, "learning_rate": 0.0004993122022326526, "loss": 42.9901, "step": 544 }, { "epoch": 1.439418950148564, "grad_norm": 2904.9609375, "learning_rate": 0.0004993091019849777, "loss": 42.9963, "step": 545 }, { "epoch": 1.44206008583691, "grad_norm": 1373.23193359375, "learning_rate": 0.0004993059947754926, "loss": 40.9209, "step": 546 }, { "epoch": 1.444701221525256, "grad_norm": 1595.357666015625, "learning_rate": 0.0004993028806042839, "loss": 42.5112, "step": 547 }, { "epoch": 1.4473423572136017, "grad_norm": 4851.8359375, "learning_rate": 0.0004992997594714388, "loss": 51.1997, "step": 548 }, { "epoch": 1.4499834929019477, "grad_norm": 2564.22509765625, "learning_rate": 0.0004992966313770443, "loss": 43.3538, "step": 549 }, { "epoch": 1.4526246285902937, "grad_norm": 1803.5380859375, "learning_rate": 0.0004992934963211879, "loss": 44.9489, "step": 550 }, { "epoch": 1.4552657642786397, "grad_norm": 6406.27685546875, "learning_rate": 0.0004992903543039568, "loss": 62.9571, "step": 551 }, { "epoch": 1.4579068999669857, "grad_norm": 40401.81640625, "learning_rate": 0.0004992872053254392, "loss": 110.6426, "step": 552 }, { "epoch": 1.4605480356553318, "grad_norm": 16106.076171875, "learning_rate": 0.000499284049385723, "loss": 101.9951, "step": 553 }, { "epoch": 1.4631891713436778, "grad_norm": 13740.8388671875, "learning_rate": 0.0004992808864848959, "loss": 94.3564, "step": 554 }, { "epoch": 1.4658303070320238, "grad_norm": 39808.06640625, "learning_rate": 0.0004992777166230465, "loss": 64.7802, "step": 555 }, { "epoch": 1.4684714427203698, "grad_norm": 12414.3564453125, "learning_rate": 0.0004992745398002634, "loss": 56.3851, "step": 556 }, { "epoch": 1.4711125784087158, "grad_norm": 19673.326171875, "learning_rate": 0.0004992713560166353, "loss": 51.892, "step": 557 }, { "epoch": 1.4737537140970618, "grad_norm": 11759.212890625, "learning_rate": 0.0004992681652722507, "loss": 55.4282, "step": 558 }, { "epoch": 1.4763948497854078, "grad_norm": 29535.52734375, "learning_rate": 0.0004992649675671992, "loss": 57.7565, "step": 559 }, { "epoch": 1.4790359854737538, "grad_norm": 30784.97265625, "learning_rate": 0.00049926176290157, "loss": 57.6497, "step": 560 }, { "epoch": 1.4816771211620998, "grad_norm": 76636.8046875, "learning_rate": 0.0004992585512754524, "loss": 2130.0991, "step": 561 }, { "epoch": 1.4843182568504458, "grad_norm": 166824.984375, "learning_rate": 0.0004992553326889362, "loss": 1960.1235, "step": 562 }, { "epoch": 1.4869593925387916, "grad_norm": 120991.828125, "learning_rate": 0.0004992521071421112, "loss": 1129.7675, "step": 563 }, { "epoch": 1.4896005282271376, "grad_norm": 29678.2265625, "learning_rate": 0.0004992488746350675, "loss": 482.0458, "step": 564 }, { "epoch": 1.4922416639154836, "grad_norm": 10743.318359375, "learning_rate": 0.0004992456351678954, "loss": 166.5363, "step": 565 }, { "epoch": 1.4948827996038296, "grad_norm": 3496.19140625, "learning_rate": 0.0004992423887406855, "loss": 69.3998, "step": 566 }, { "epoch": 1.4975239352921756, "grad_norm": 8524.09375, "learning_rate": 0.0004992391353535281, "loss": 102.4746, "step": 567 }, { "epoch": 1.5001650709805217, "grad_norm": 7087.0361328125, "learning_rate": 0.0004992358750065145, "loss": 152.5989, "step": 568 }, { "epoch": 1.5028062066688677, "grad_norm": 6642.1455078125, "learning_rate": 0.0004992326076997353, "loss": 112.9166, "step": 569 }, { "epoch": 1.5054473423572134, "grad_norm": 2552.900390625, "learning_rate": 0.000499229333433282, "loss": 57.8674, "step": 570 }, { "epoch": 1.5080884780455595, "grad_norm": 2728.0634765625, "learning_rate": 0.000499226052207246, "loss": 75.5507, "step": 571 }, { "epoch": 1.5107296137339055, "grad_norm": 2573.36474609375, "learning_rate": 0.0004992227640217188, "loss": 83.5482, "step": 572 }, { "epoch": 1.5133707494222515, "grad_norm": 3035.279296875, "learning_rate": 0.0004992194688767924, "loss": 62.4149, "step": 573 }, { "epoch": 1.5160118851105975, "grad_norm": 1182.1949462890625, "learning_rate": 0.0004992161667725588, "loss": 55.6978, "step": 574 }, { "epoch": 1.5186530207989435, "grad_norm": 2614.19873046875, "learning_rate": 0.00049921285770911, "loss": 62.5243, "step": 575 }, { "epoch": 1.5212941564872895, "grad_norm": 2771.626220703125, "learning_rate": 0.0004992095416865386, "loss": 54.6841, "step": 576 }, { "epoch": 1.5239352921756355, "grad_norm": 1350.084716796875, "learning_rate": 0.0004992062187049371, "loss": 52.8816, "step": 577 }, { "epoch": 1.5265764278639815, "grad_norm": 2194.294921875, "learning_rate": 0.0004992028887643984, "loss": 58.9793, "step": 578 }, { "epoch": 1.5292175635523275, "grad_norm": 1771.9962158203125, "learning_rate": 0.0004991995518650155, "loss": 55.6315, "step": 579 }, { "epoch": 1.5318586992406735, "grad_norm": 2302.69873046875, "learning_rate": 0.0004991962080068813, "loss": 51.8921, "step": 580 }, { "epoch": 1.5344998349290195, "grad_norm": 4353.18359375, "learning_rate": 0.0004991928571900894, "loss": 54.7606, "step": 581 }, { "epoch": 1.5371409706173655, "grad_norm": 6050.09716796875, "learning_rate": 0.0004991894994147335, "loss": 60.9764, "step": 582 }, { "epoch": 1.5397821063057116, "grad_norm": 8682.447265625, "learning_rate": 0.0004991861346809071, "loss": 66.9826, "step": 583 }, { "epoch": 1.5424232419940576, "grad_norm": 4067.413330078125, "learning_rate": 0.0004991827629887044, "loss": 66.6987, "step": 584 }, { "epoch": 1.5450643776824036, "grad_norm": 11716.4462890625, "learning_rate": 0.0004991793843382192, "loss": 63.6009, "step": 585 }, { "epoch": 1.5477055133707496, "grad_norm": 18359.931640625, "learning_rate": 0.0004991759987295463, "loss": 68.5556, "step": 586 }, { "epoch": 1.5503466490590954, "grad_norm": 4886.9375, "learning_rate": 0.0004991726061627798, "loss": 59.4787, "step": 587 }, { "epoch": 1.5529877847474414, "grad_norm": 7067.82470703125, "learning_rate": 0.0004991692066380148, "loss": 55.4636, "step": 588 }, { "epoch": 1.5556289204357874, "grad_norm": 6505.671875, "learning_rate": 0.0004991658001553459, "loss": 60.0569, "step": 589 }, { "epoch": 1.5582700561241334, "grad_norm": 7624.03662109375, "learning_rate": 0.0004991623867148686, "loss": 58.689, "step": 590 }, { "epoch": 1.5609111918124794, "grad_norm": 2548.069091796875, "learning_rate": 0.0004991589663166779, "loss": 52.9711, "step": 591 }, { "epoch": 1.5635523275008254, "grad_norm": 2187.1708984375, "learning_rate": 0.0004991555389608695, "loss": 50.2995, "step": 592 }, { "epoch": 1.5661934631891712, "grad_norm": 1547.9122314453125, "learning_rate": 0.000499152104647539, "loss": 49.3683, "step": 593 }, { "epoch": 1.5688345988775172, "grad_norm": 6219.3359375, "learning_rate": 0.0004991486633767824, "loss": 50.5522, "step": 594 }, { "epoch": 1.5714757345658632, "grad_norm": 3163.43212890625, "learning_rate": 0.0004991452151486956, "loss": 49.8966, "step": 595 }, { "epoch": 1.5741168702542092, "grad_norm": 2209.248779296875, "learning_rate": 0.000499141759963375, "loss": 47.2412, "step": 596 }, { "epoch": 1.5767580059425552, "grad_norm": 1153.2919921875, "learning_rate": 0.0004991382978209173, "loss": 47.0185, "step": 597 }, { "epoch": 1.5793991416309012, "grad_norm": 1866.169189453125, "learning_rate": 0.0004991348287214189, "loss": 45.9035, "step": 598 }, { "epoch": 1.5820402773192472, "grad_norm": 6848.564453125, "learning_rate": 0.0004991313526649767, "loss": 46.9573, "step": 599 }, { "epoch": 1.5846814130075932, "grad_norm": 1469.7354736328125, "learning_rate": 0.0004991278696516879, "loss": 49.1185, "step": 600 }, { "epoch": 1.5846814130075932, "eval_loss": 6.99809455871582, "eval_runtime": 2.0971, "eval_samples_per_second": 236.043, "eval_steps_per_second": 29.565, "step": 600 }, { "epoch": 1.5873225486959393, "grad_norm": 3431.05810546875, "learning_rate": 0.0004991243796816496, "loss": 50.9703, "step": 601 }, { "epoch": 1.5899636843842853, "grad_norm": 17163.427734375, "learning_rate": 0.0004991208827549594, "loss": 82.2, "step": 602 }, { "epoch": 1.5926048200726313, "grad_norm": 5232.05615234375, "learning_rate": 0.0004991173788717149, "loss": 80.2578, "step": 603 }, { "epoch": 1.5952459557609773, "grad_norm": 7430.76416015625, "learning_rate": 0.0004991138680320139, "loss": 65.8535, "step": 604 }, { "epoch": 1.5978870914493233, "grad_norm": 5313.9208984375, "learning_rate": 0.0004991103502359545, "loss": 59.7012, "step": 605 }, { "epoch": 1.6005282271376693, "grad_norm": 5033.74072265625, "learning_rate": 0.0004991068254836349, "loss": 43.7041, "step": 606 }, { "epoch": 1.6031693628260153, "grad_norm": 1208.118896484375, "learning_rate": 0.0004991032937751536, "loss": 37.2197, "step": 607 }, { "epoch": 1.6058104985143613, "grad_norm": 1769.3668212890625, "learning_rate": 0.000499099755110609, "loss": 38.7926, "step": 608 }, { "epoch": 1.6084516342027073, "grad_norm": 995.1265869140625, "learning_rate": 0.0004990962094901002, "loss": 43.9545, "step": 609 }, { "epoch": 1.611092769891053, "grad_norm": 3154.42333984375, "learning_rate": 0.000499092656913726, "loss": 21.801, "step": 610 }, { "epoch": 1.613733905579399, "grad_norm": 5431.11376953125, "learning_rate": 0.0004990890973815857, "loss": 98.1815, "step": 611 }, { "epoch": 1.6163750412677451, "grad_norm": 6964.67529296875, "learning_rate": 0.0004990855308937787, "loss": 142.0075, "step": 612 }, { "epoch": 1.6190161769560911, "grad_norm": 12163.259765625, "learning_rate": 0.0004990819574504046, "loss": 129.3517, "step": 613 }, { "epoch": 1.6216573126444371, "grad_norm": 6045.28271484375, "learning_rate": 0.0004990783770515631, "loss": 109.702, "step": 614 }, { "epoch": 1.6242984483327831, "grad_norm": 11491.3486328125, "learning_rate": 0.0004990747896973542, "loss": 85.0297, "step": 615 }, { "epoch": 1.626939584021129, "grad_norm": 5696.29296875, "learning_rate": 0.0004990711953878782, "loss": 71.7666, "step": 616 }, { "epoch": 1.629580719709475, "grad_norm": 6095.884765625, "learning_rate": 0.0004990675941232354, "loss": 74.4166, "step": 617 }, { "epoch": 1.632221855397821, "grad_norm": 4717.54052734375, "learning_rate": 0.0004990639859035262, "loss": 85.7458, "step": 618 }, { "epoch": 1.634862991086167, "grad_norm": 11082.0859375, "learning_rate": 0.0004990603707288515, "loss": 89.2636, "step": 619 }, { "epoch": 1.637504126774513, "grad_norm": 9063.1650390625, "learning_rate": 0.0004990567485993124, "loss": 88.2417, "step": 620 }, { "epoch": 1.640145262462859, "grad_norm": 8414.904296875, "learning_rate": 0.0004990531195150098, "loss": 82.3487, "step": 621 }, { "epoch": 1.642786398151205, "grad_norm": 12348.89453125, "learning_rate": 0.0004990494834760453, "loss": 73.4464, "step": 622 }, { "epoch": 1.645427533839551, "grad_norm": 5157.98681640625, "learning_rate": 0.0004990458404825201, "loss": 64.9995, "step": 623 }, { "epoch": 1.648068669527897, "grad_norm": 11554.2783203125, "learning_rate": 0.0004990421905345362, "loss": 61.3901, "step": 624 }, { "epoch": 1.650709805216243, "grad_norm": 6427.49755859375, "learning_rate": 0.0004990385336321953, "loss": 59.4023, "step": 625 }, { "epoch": 1.653350940904589, "grad_norm": 31144.81640625, "learning_rate": 0.0004990348697755997, "loss": 61.8908, "step": 626 }, { "epoch": 1.655992076592935, "grad_norm": 11088.3916015625, "learning_rate": 0.0004990311989648517, "loss": 65.5014, "step": 627 }, { "epoch": 1.658633212281281, "grad_norm": 13734.884765625, "learning_rate": 0.0004990275212000538, "loss": 66.4676, "step": 628 }, { "epoch": 1.661274347969627, "grad_norm": 19680.77734375, "learning_rate": 0.0004990238364813085, "loss": 69.0276, "step": 629 }, { "epoch": 1.663915483657973, "grad_norm": 7206.85302734375, "learning_rate": 0.000499020144808719, "loss": 71.5599, "step": 630 }, { "epoch": 1.666556619346319, "grad_norm": 5677.92529296875, "learning_rate": 0.0004990164461823881, "loss": 66.4158, "step": 631 }, { "epoch": 1.669197755034665, "grad_norm": 6108.72216796875, "learning_rate": 0.0004990127406024194, "loss": 58.1643, "step": 632 }, { "epoch": 1.6718388907230108, "grad_norm": 6867.95458984375, "learning_rate": 0.0004990090280689161, "loss": 58.951, "step": 633 }, { "epoch": 1.6744800264113568, "grad_norm": 8204.84765625, "learning_rate": 0.000499005308581982, "loss": 56.7117, "step": 634 }, { "epoch": 1.6771211620997029, "grad_norm": 5434.18994140625, "learning_rate": 0.0004990015821417209, "loss": 60.6797, "step": 635 }, { "epoch": 1.6797622977880489, "grad_norm": 6154.45703125, "learning_rate": 0.0004989978487482369, "loss": 60.8866, "step": 636 }, { "epoch": 1.6824034334763949, "grad_norm": 6963.5107421875, "learning_rate": 0.0004989941084016341, "loss": 61.1997, "step": 637 }, { "epoch": 1.6850445691647409, "grad_norm": 4081.23876953125, "learning_rate": 0.0004989903611020173, "loss": 59.8061, "step": 638 }, { "epoch": 1.6876857048530867, "grad_norm": 8756.810546875, "learning_rate": 0.0004989866068494908, "loss": 56.9937, "step": 639 }, { "epoch": 1.6903268405414327, "grad_norm": 7590.0595703125, "learning_rate": 0.0004989828456441597, "loss": 55.0535, "step": 640 }, { "epoch": 1.6929679762297787, "grad_norm": 4130.9189453125, "learning_rate": 0.0004989790774861287, "loss": 53.3109, "step": 641 }, { "epoch": 1.6956091119181247, "grad_norm": 3490.13330078125, "learning_rate": 0.0004989753023755034, "loss": 53.5827, "step": 642 }, { "epoch": 1.6982502476064707, "grad_norm": 4209.16064453125, "learning_rate": 0.000498971520312389, "loss": 49.583, "step": 643 }, { "epoch": 1.7008913832948167, "grad_norm": 3814.8486328125, "learning_rate": 0.000498967731296891, "loss": 49.2547, "step": 644 }, { "epoch": 1.7035325189831627, "grad_norm": 2649.128173828125, "learning_rate": 0.0004989639353291155, "loss": 45.9787, "step": 645 }, { "epoch": 1.7061736546715087, "grad_norm": 2743.810791015625, "learning_rate": 0.0004989601324091684, "loss": 43.6474, "step": 646 }, { "epoch": 1.7088147903598547, "grad_norm": 2427.1708984375, "learning_rate": 0.0004989563225371557, "loss": 45.4601, "step": 647 }, { "epoch": 1.7114559260482007, "grad_norm": 2862.26611328125, "learning_rate": 0.0004989525057131841, "loss": 47.015, "step": 648 }, { "epoch": 1.7140970617365467, "grad_norm": 2844.143798828125, "learning_rate": 0.0004989486819373599, "loss": 45.6783, "step": 649 }, { "epoch": 1.7167381974248928, "grad_norm": 4225.1953125, "learning_rate": 0.00049894485120979, "loss": 47.289, "step": 650 }, { "epoch": 1.7193793331132388, "grad_norm": 53853.36328125, "learning_rate": 0.0004989410135305814, "loss": 61.5363, "step": 651 }, { "epoch": 1.7220204688015848, "grad_norm": 15278.529296875, "learning_rate": 0.0004989371688998412, "loss": 86.3667, "step": 652 }, { "epoch": 1.7246616044899308, "grad_norm": 38323.68359375, "learning_rate": 0.0004989333173176769, "loss": 72.7863, "step": 653 }, { "epoch": 1.7273027401782768, "grad_norm": 28816.3984375, "learning_rate": 0.0004989294587841958, "loss": 84.6035, "step": 654 }, { "epoch": 1.7299438758666228, "grad_norm": 17648.578125, "learning_rate": 0.000498925593299506, "loss": 75.8962, "step": 655 }, { "epoch": 1.7325850115549688, "grad_norm": 32389.74609375, "learning_rate": 0.000498921720863715, "loss": 71.0917, "step": 656 }, { "epoch": 1.7352261472433146, "grad_norm": 28354.43359375, "learning_rate": 0.0004989178414769313, "loss": 63.073, "step": 657 }, { "epoch": 1.7378672829316606, "grad_norm": 35173.171875, "learning_rate": 0.0004989139551392631, "loss": 47.2548, "step": 658 }, { "epoch": 1.7405084186200066, "grad_norm": 15567.58984375, "learning_rate": 0.0004989100618508188, "loss": 47.2205, "step": 659 }, { "epoch": 1.7431495543083526, "grad_norm": 14656.748046875, "learning_rate": 0.0004989061616117073, "loss": 43.033, "step": 660 }, { "epoch": 1.7457906899966986, "grad_norm": 20719.66015625, "learning_rate": 0.0004989022544220375, "loss": 52.066, "step": 661 }, { "epoch": 1.7484318256850444, "grad_norm": 3261.359130859375, "learning_rate": 0.0004988983402819185, "loss": 72.645, "step": 662 }, { "epoch": 1.7510729613733904, "grad_norm": 3280.561767578125, "learning_rate": 0.0004988944191914595, "loss": 63.7631, "step": 663 }, { "epoch": 1.7537140970617364, "grad_norm": 5239.81298828125, "learning_rate": 0.00049889049115077, "loss": 60.3456, "step": 664 }, { "epoch": 1.7563552327500824, "grad_norm": 5855.068359375, "learning_rate": 0.0004988865561599598, "loss": 56.932, "step": 665 }, { "epoch": 1.7589963684384284, "grad_norm": 326595.0625, "learning_rate": 0.0004988826142191387, "loss": 61.5557, "step": 666 }, { "epoch": 1.7616375041267744, "grad_norm": 13604.9951171875, "learning_rate": 0.0004988786653284168, "loss": 56.5235, "step": 667 }, { "epoch": 1.7642786398151205, "grad_norm": 128558.203125, "learning_rate": 0.0004988747094879045, "loss": 68.4093, "step": 668 }, { "epoch": 1.7669197755034665, "grad_norm": 61036.56640625, "learning_rate": 0.0004988707466977119, "loss": 74.234, "step": 669 }, { "epoch": 1.7695609111918125, "grad_norm": 21371.783203125, "learning_rate": 0.0004988667769579501, "loss": 79.8561, "step": 670 }, { "epoch": 1.7722020468801585, "grad_norm": 31060.388671875, "learning_rate": 0.0004988628002687296, "loss": 78.5891, "step": 671 }, { "epoch": 1.7748431825685045, "grad_norm": 6724.67138671875, "learning_rate": 0.0004988588166301616, "loss": 79.8933, "step": 672 }, { "epoch": 1.7774843182568505, "grad_norm": 7499.2412109375, "learning_rate": 0.0004988548260423574, "loss": 74.4493, "step": 673 }, { "epoch": 1.7801254539451965, "grad_norm": 15037.0439453125, "learning_rate": 0.0004988508285054284, "loss": 72.0274, "step": 674 }, { "epoch": 1.7827665896335425, "grad_norm": 4084.240234375, "learning_rate": 0.0004988468240194862, "loss": 70.6132, "step": 675 }, { "epoch": 1.7854077253218885, "grad_norm": 7315.84423828125, "learning_rate": 0.0004988428125846425, "loss": 63.7729, "step": 676 }, { "epoch": 1.7880488610102345, "grad_norm": 5050.5712890625, "learning_rate": 0.0004988387942010096, "loss": 60.5483, "step": 677 }, { "epoch": 1.7906899966985805, "grad_norm": 6068.150390625, "learning_rate": 0.0004988347688686994, "loss": 57.8991, "step": 678 }, { "epoch": 1.7933311323869265, "grad_norm": 7239.93115234375, "learning_rate": 0.0004988307365878245, "loss": 56.3082, "step": 679 }, { "epoch": 1.7959722680752723, "grad_norm": 2274.79541015625, "learning_rate": 0.0004988266973584974, "loss": 51.1331, "step": 680 }, { "epoch": 1.7986134037636183, "grad_norm": 2565.087890625, "learning_rate": 0.000498822651180831, "loss": 47.8752, "step": 681 }, { "epoch": 1.8012545394519643, "grad_norm": 4348.37255859375, "learning_rate": 0.0004988185980549382, "loss": 51.7085, "step": 682 }, { "epoch": 1.8038956751403104, "grad_norm": 3907.55322265625, "learning_rate": 0.0004988145379809322, "loss": 56.1084, "step": 683 }, { "epoch": 1.8065368108286564, "grad_norm": 5691.13232421875, "learning_rate": 0.0004988104709589265, "loss": 62.4343, "step": 684 }, { "epoch": 1.8091779465170021, "grad_norm": 5721.7421875, "learning_rate": 0.0004988063969890345, "loss": 56.3448, "step": 685 }, { "epoch": 1.8118190822053482, "grad_norm": 4348.34619140625, "learning_rate": 0.00049880231607137, "loss": 58.6188, "step": 686 }, { "epoch": 1.8144602178936942, "grad_norm": 4954.25390625, "learning_rate": 0.000498798228206047, "loss": 56.1906, "step": 687 }, { "epoch": 1.8171013535820402, "grad_norm": 5521.60400390625, "learning_rate": 0.0004987941333931796, "loss": 56.2707, "step": 688 }, { "epoch": 1.8197424892703862, "grad_norm": 5209.28515625, "learning_rate": 0.0004987900316328822, "loss": 55.4515, "step": 689 }, { "epoch": 1.8223836249587322, "grad_norm": 4385.04150390625, "learning_rate": 0.0004987859229252694, "loss": 52.0273, "step": 690 }, { "epoch": 1.8250247606470782, "grad_norm": 2972.471435546875, "learning_rate": 0.0004987818072704557, "loss": 48.833, "step": 691 }, { "epoch": 1.8276658963354242, "grad_norm": 3954.517578125, "learning_rate": 0.0004987776846685562, "loss": 52.0944, "step": 692 }, { "epoch": 1.8303070320237702, "grad_norm": 2036.3094482421875, "learning_rate": 0.000498773555119686, "loss": 45.9277, "step": 693 }, { "epoch": 1.8329481677121162, "grad_norm": 2619.429443359375, "learning_rate": 0.0004987694186239605, "loss": 51.4335, "step": 694 }, { "epoch": 1.8355893034004622, "grad_norm": 1963.20263671875, "learning_rate": 0.0004987652751814951, "loss": 44.8605, "step": 695 }, { "epoch": 1.8382304390888082, "grad_norm": 2253.099609375, "learning_rate": 0.0004987611247924054, "loss": 46.387, "step": 696 }, { "epoch": 1.8408715747771542, "grad_norm": 2643.1298828125, "learning_rate": 0.0004987569674568075, "loss": 46.2716, "step": 697 }, { "epoch": 1.8435127104655002, "grad_norm": 1911.4715576171875, "learning_rate": 0.0004987528031748174, "loss": 43.6176, "step": 698 }, { "epoch": 1.8461538461538463, "grad_norm": 1944.1480712890625, "learning_rate": 0.0004987486319465514, "loss": 43.6033, "step": 699 }, { "epoch": 1.8487949818421923, "grad_norm": 3364.28125, "learning_rate": 0.0004987444537721261, "loss": 46.2094, "step": 700 }, { "epoch": 1.8514361175305383, "grad_norm": 21517.029296875, "learning_rate": 0.0004987402686516579, "loss": 66.6176, "step": 701 }, { "epoch": 1.8540772532188843, "grad_norm": 19333.025390625, "learning_rate": 0.0004987360765852638, "loss": 78.193, "step": 702 }, { "epoch": 1.85671838890723, "grad_norm": 13818.70703125, "learning_rate": 0.000498731877573061, "loss": 61.5092, "step": 703 }, { "epoch": 1.859359524595576, "grad_norm": 22082.044921875, "learning_rate": 0.0004987276716151665, "loss": 54.3263, "step": 704 }, { "epoch": 1.862000660283922, "grad_norm": 10396.0869140625, "learning_rate": 0.0004987234587116979, "loss": 56.9946, "step": 705 }, { "epoch": 1.864641795972268, "grad_norm": 11619.021484375, "learning_rate": 0.0004987192388627729, "loss": 49.1877, "step": 706 }, { "epoch": 1.867282931660614, "grad_norm": 14104.4765625, "learning_rate": 0.0004987150120685092, "loss": 47.8657, "step": 707 }, { "epoch": 1.86992406734896, "grad_norm": 15942.8193359375, "learning_rate": 0.0004987107783290249, "loss": 39.54, "step": 708 }, { "epoch": 1.872565203037306, "grad_norm": 8911.33984375, "learning_rate": 0.0004987065376444383, "loss": 42.391, "step": 709 }, { "epoch": 1.875206338725652, "grad_norm": 3348.16259765625, "learning_rate": 0.0004987022900148677, "loss": 47.9853, "step": 710 }, { "epoch": 1.877847474413998, "grad_norm": 5297.25341796875, "learning_rate": 0.0004986980354404316, "loss": 79.39, "step": 711 }, { "epoch": 1.880488610102344, "grad_norm": 7763.13525390625, "learning_rate": 0.0004986937739212491, "loss": 72.944, "step": 712 }, { "epoch": 1.88312974579069, "grad_norm": 4460.0703125, "learning_rate": 0.000498689505457439, "loss": 57.2453, "step": 713 }, { "epoch": 1.885770881479036, "grad_norm": 3230.712646484375, "learning_rate": 0.0004986852300491206, "loss": 48.4888, "step": 714 }, { "epoch": 1.888412017167382, "grad_norm": 4664.31787109375, "learning_rate": 0.0004986809476964131, "loss": 48.0037, "step": 715 }, { "epoch": 1.891053152855728, "grad_norm": 5207.767578125, "learning_rate": 0.0004986766583994364, "loss": 58.2083, "step": 716 }, { "epoch": 1.893694288544074, "grad_norm": 4591.78173828125, "learning_rate": 0.00049867236215831, "loss": 63.3571, "step": 717 }, { "epoch": 1.89633542423242, "grad_norm": 6200.36474609375, "learning_rate": 0.000498668058973154, "loss": 64.5042, "step": 718 }, { "epoch": 1.898976559920766, "grad_norm": 4197.7216796875, "learning_rate": 0.0004986637488440885, "loss": 58.7902, "step": 719 }, { "epoch": 1.901617695609112, "grad_norm": 10305.955078125, "learning_rate": 0.0004986594317712339, "loss": 52.9853, "step": 720 }, { "epoch": 1.904258831297458, "grad_norm": 3019.9453125, "learning_rate": 0.0004986551077547107, "loss": 48.0443, "step": 721 }, { "epoch": 1.906899966985804, "grad_norm": 4844.4609375, "learning_rate": 0.0004986507767946398, "loss": 49.3235, "step": 722 }, { "epoch": 1.90954110267415, "grad_norm": 10530.9501953125, "learning_rate": 0.0004986464388911419, "loss": 49.9362, "step": 723 }, { "epoch": 1.912182238362496, "grad_norm": 6195.30908203125, "learning_rate": 0.0004986420940443383, "loss": 51.8087, "step": 724 }, { "epoch": 1.914823374050842, "grad_norm": 5786.3115234375, "learning_rate": 0.0004986377422543502, "loss": 54.8957, "step": 725 }, { "epoch": 1.9174645097391878, "grad_norm": 5402.87060546875, "learning_rate": 0.0004986333835212994, "loss": 51.1439, "step": 726 }, { "epoch": 1.9201056454275338, "grad_norm": 5844.3818359375, "learning_rate": 0.0004986290178453073, "loss": 49.358, "step": 727 }, { "epoch": 1.9227467811158798, "grad_norm": 5909.25732421875, "learning_rate": 0.0004986246452264959, "loss": 53.7161, "step": 728 }, { "epoch": 1.9253879168042258, "grad_norm": 6941.48388671875, "learning_rate": 0.0004986202656649874, "loss": 48.78, "step": 729 }, { "epoch": 1.9280290524925718, "grad_norm": 5566.73193359375, "learning_rate": 0.0004986158791609039, "loss": 47.1944, "step": 730 }, { "epoch": 1.9306701881809178, "grad_norm": 6028.0634765625, "learning_rate": 0.0004986114857143682, "loss": 50.7673, "step": 731 }, { "epoch": 1.9333113238692636, "grad_norm": 37353.25390625, "learning_rate": 0.0004986070853255027, "loss": 58.9364, "step": 732 }, { "epoch": 1.9359524595576096, "grad_norm": 6141.4775390625, "learning_rate": 0.0004986026779944304, "loss": 55.7248, "step": 733 }, { "epoch": 1.9385935952459556, "grad_norm": 7841.7744140625, "learning_rate": 0.0004985982637212744, "loss": 56.0536, "step": 734 }, { "epoch": 1.9412347309343017, "grad_norm": 5055.28173828125, "learning_rate": 0.0004985938425061578, "loss": 53.0901, "step": 735 }, { "epoch": 1.9438758666226477, "grad_norm": 4373.79931640625, "learning_rate": 0.0004985894143492043, "loss": 51.2773, "step": 736 }, { "epoch": 1.9465170023109937, "grad_norm": 6576.74853515625, "learning_rate": 0.0004985849792505374, "loss": 44.7831, "step": 737 }, { "epoch": 1.9491581379993397, "grad_norm": 5702.4375, "learning_rate": 0.0004985805372102809, "loss": 45.9135, "step": 738 }, { "epoch": 1.9517992736876857, "grad_norm": 7063.09912109375, "learning_rate": 0.000498576088228559, "loss": 46.264, "step": 739 }, { "epoch": 1.9544404093760317, "grad_norm": 3646.25048828125, "learning_rate": 0.0004985716323054959, "loss": 42.7763, "step": 740 }, { "epoch": 1.9570815450643777, "grad_norm": 4527.68994140625, "learning_rate": 0.0004985671694412159, "loss": 44.3452, "step": 741 }, { "epoch": 1.9597226807527237, "grad_norm": 10270.3984375, "learning_rate": 0.0004985626996358438, "loss": 53.6739, "step": 742 }, { "epoch": 1.9623638164410697, "grad_norm": 79696.7109375, "learning_rate": 0.0004985582228895042, "loss": 72.3184, "step": 743 }, { "epoch": 1.9650049521294157, "grad_norm": 8530.6708984375, "learning_rate": 0.0004985537392023224, "loss": 63.3945, "step": 744 }, { "epoch": 1.9676460878177617, "grad_norm": 7036.5234375, "learning_rate": 0.0004985492485744233, "loss": 55.6855, "step": 745 }, { "epoch": 1.9702872235061077, "grad_norm": 55628.671875, "learning_rate": 0.0004985447510059325, "loss": 48.1279, "step": 746 }, { "epoch": 1.9729283591944538, "grad_norm": 6893.4677734375, "learning_rate": 0.0004985402464969755, "loss": 46.4378, "step": 747 }, { "epoch": 1.9755694948827998, "grad_norm": 8782.5791015625, "learning_rate": 0.0004985357350476781, "loss": 67.6096, "step": 748 }, { "epoch": 1.9782106305711455, "grad_norm": 4256.57373046875, "learning_rate": 0.0004985312166581663, "loss": 69.3722, "step": 749 }, { "epoch": 1.9808517662594916, "grad_norm": 8381.263671875, "learning_rate": 0.0004985266913285664, "loss": 58.7245, "step": 750 }, { "epoch": 1.9834929019478376, "grad_norm": 4434.2919921875, "learning_rate": 0.0004985221590590045, "loss": 53.2572, "step": 751 }, { "epoch": 1.9861340376361836, "grad_norm": 4376.60400390625, "learning_rate": 0.0004985176198496074, "loss": 44.6373, "step": 752 }, { "epoch": 1.9887751733245296, "grad_norm": 3482.80126953125, "learning_rate": 0.0004985130737005017, "loss": 45.3378, "step": 753 }, { "epoch": 1.9914163090128756, "grad_norm": 4171.46435546875, "learning_rate": 0.0004985085206118144, "loss": 45.5693, "step": 754 }, { "epoch": 1.9940574447012214, "grad_norm": 13472.3291015625, "learning_rate": 0.0004985039605836727, "loss": 49.0317, "step": 755 }, { "epoch": 1.9966985803895674, "grad_norm": 8040.88037109375, "learning_rate": 0.0004984993936162038, "loss": 54.3061, "step": 756 }, { "epoch": 1.9993397160779134, "grad_norm": 15179.0791015625, "learning_rate": 0.0004984948197095352, "loss": 59.6898, "step": 757 }, { "epoch": 2.0019808517662594, "grad_norm": 16920.38671875, "learning_rate": 0.0004984902388637949, "loss": 63.601, "step": 758 }, { "epoch": 2.0046219874546054, "grad_norm": 3290.345947265625, "learning_rate": 0.0004984856510791107, "loss": 59.5728, "step": 759 }, { "epoch": 2.0072631231429514, "grad_norm": 3846.7822265625, "learning_rate": 0.0004984810563556106, "loss": 54.589, "step": 760 }, { "epoch": 2.0099042588312974, "grad_norm": 8633.1162109375, "learning_rate": 0.0004984764546934229, "loss": 51.1414, "step": 761 }, { "epoch": 2.0125453945196434, "grad_norm": 6463.7099609375, "learning_rate": 0.0004984718460926762, "loss": 54.9377, "step": 762 }, { "epoch": 2.0151865302079894, "grad_norm": 4955.33203125, "learning_rate": 0.0004984672305534991, "loss": 54.2658, "step": 763 }, { "epoch": 2.0178276658963354, "grad_norm": 10784.2626953125, "learning_rate": 0.0004984626080760206, "loss": 62.2645, "step": 764 }, { "epoch": 2.0204688015846815, "grad_norm": 5264.59130859375, "learning_rate": 0.0004984579786603698, "loss": 64.449, "step": 765 }, { "epoch": 2.0231099372730275, "grad_norm": 4103.5263671875, "learning_rate": 0.0004984533423066758, "loss": 61.6303, "step": 766 }, { "epoch": 2.0257510729613735, "grad_norm": 5733.615234375, "learning_rate": 0.0004984486990150681, "loss": 59.1625, "step": 767 }, { "epoch": 2.0283922086497195, "grad_norm": 4586.11474609375, "learning_rate": 0.0004984440487856766, "loss": 56.2713, "step": 768 }, { "epoch": 2.0310333443380655, "grad_norm": 4040.83935546875, "learning_rate": 0.0004984393916186309, "loss": 51.1101, "step": 769 }, { "epoch": 2.0336744800264115, "grad_norm": 4356.3623046875, "learning_rate": 0.0004984347275140611, "loss": 55.3298, "step": 770 }, { "epoch": 2.0363156157147575, "grad_norm": 3045.473388671875, "learning_rate": 0.0004984300564720976, "loss": 44.8101, "step": 771 }, { "epoch": 2.0389567514031035, "grad_norm": 2764.11181640625, "learning_rate": 0.0004984253784928707, "loss": 44.9701, "step": 772 }, { "epoch": 2.0415978870914495, "grad_norm": 5006.359375, "learning_rate": 0.0004984206935765109, "loss": 49.2696, "step": 773 }, { "epoch": 2.0442390227797955, "grad_norm": 3179.804931640625, "learning_rate": 0.0004984160017231492, "loss": 45.223, "step": 774 }, { "epoch": 2.046880158468141, "grad_norm": 4876.7724609375, "learning_rate": 0.0004984113029329166, "loss": 48.1028, "step": 775 }, { "epoch": 2.049521294156487, "grad_norm": 4208.8662109375, "learning_rate": 0.0004984065972059443, "loss": 44.1137, "step": 776 }, { "epoch": 2.052162429844833, "grad_norm": 2414.38525390625, "learning_rate": 0.0004984018845423636, "loss": 43.7742, "step": 777 }, { "epoch": 2.054803565533179, "grad_norm": 2893.984619140625, "learning_rate": 0.0004983971649423063, "loss": 46.065, "step": 778 }, { "epoch": 2.057444701221525, "grad_norm": 3087.80517578125, "learning_rate": 0.0004983924384059041, "loss": 47.8694, "step": 779 }, { "epoch": 2.060085836909871, "grad_norm": 8955.09375, "learning_rate": 0.0004983877049332889, "loss": 65.4067, "step": 780 }, { "epoch": 2.062726972598217, "grad_norm": 17953.544921875, "learning_rate": 0.000498382964524593, "loss": 54.2446, "step": 781 }, { "epoch": 2.065368108286563, "grad_norm": 8791.3369140625, "learning_rate": 0.0004983782171799487, "loss": 51.1592, "step": 782 }, { "epoch": 2.068009243974909, "grad_norm": 3096.499267578125, "learning_rate": 0.0004983734628994886, "loss": 51.7802, "step": 783 }, { "epoch": 2.070650379663255, "grad_norm": 2617.982421875, "learning_rate": 0.0004983687016833454, "loss": 34.809, "step": 784 }, { "epoch": 2.073291515351601, "grad_norm": 4505.33447265625, "learning_rate": 0.0004983639335316522, "loss": 26.1957, "step": 785 }, { "epoch": 2.075932651039947, "grad_norm": 4715.9443359375, "learning_rate": 0.000498359158444542, "loss": 29.8248, "step": 786 }, { "epoch": 2.078573786728293, "grad_norm": 3414.466552734375, "learning_rate": 0.0004983543764221482, "loss": 21.1729, "step": 787 }, { "epoch": 2.081214922416639, "grad_norm": 1715.8192138671875, "learning_rate": 0.0004983495874646043, "loss": 21.7846, "step": 788 }, { "epoch": 2.083856058104985, "grad_norm": 25920.88671875, "learning_rate": 0.0004983447915720442, "loss": 20.0568, "step": 789 }, { "epoch": 2.086497193793331, "grad_norm": 2558.26123046875, "learning_rate": 0.0004983399887446015, "loss": 142.4169, "step": 790 }, { "epoch": 2.089138329481677, "grad_norm": 3041.406982421875, "learning_rate": 0.0004983351789824105, "loss": 163.6262, "step": 791 }, { "epoch": 2.0917794651700232, "grad_norm": 2408.6357421875, "learning_rate": 0.0004983303622856057, "loss": 123.9533, "step": 792 }, { "epoch": 2.0944206008583692, "grad_norm": 1664.95361328125, "learning_rate": 0.0004983255386543212, "loss": 71.0559, "step": 793 }, { "epoch": 2.0970617365467152, "grad_norm": 1791.0489501953125, "learning_rate": 0.000498320708088692, "loss": 62.7049, "step": 794 }, { "epoch": 2.0997028722350612, "grad_norm": 3774.051025390625, "learning_rate": 0.0004983158705888529, "loss": 108.5899, "step": 795 }, { "epoch": 2.1023440079234073, "grad_norm": 3875.3115234375, "learning_rate": 0.000498311026154939, "loss": 126.3212, "step": 796 }, { "epoch": 2.1049851436117533, "grad_norm": 3909.94482421875, "learning_rate": 0.0004983061747870855, "loss": 108.7758, "step": 797 }, { "epoch": 2.1076262793000993, "grad_norm": 2480.038330078125, "learning_rate": 0.000498301316485428, "loss": 67.9433, "step": 798 }, { "epoch": 2.110267414988445, "grad_norm": 2965.914306640625, "learning_rate": 0.000498296451250102, "loss": 48.4712, "step": 799 }, { "epoch": 2.112908550676791, "grad_norm": 1512.230224609375, "learning_rate": 0.0004982915790812436, "loss": 50.9228, "step": 800 }, { "epoch": 2.112908550676791, "eval_loss": 6.656986713409424, "eval_runtime": 2.2016, "eval_samples_per_second": 224.833, "eval_steps_per_second": 28.161, "step": 800 }, { "epoch": 2.115549686365137, "grad_norm": 1923.8453369140625, "learning_rate": 0.0004982866999789885, "loss": 61.089, "step": 801 }, { "epoch": 2.118190822053483, "grad_norm": 2177.495849609375, "learning_rate": 0.0004982818139434734, "loss": 64.7589, "step": 802 }, { "epoch": 2.120831957741829, "grad_norm": 3735.92138671875, "learning_rate": 0.0004982769209748343, "loss": 55.7224, "step": 803 }, { "epoch": 2.123473093430175, "grad_norm": 2127.158203125, "learning_rate": 0.0004982720210732081, "loss": 48.1209, "step": 804 }, { "epoch": 2.126114229118521, "grad_norm": 4871.24609375, "learning_rate": 0.0004982671142387316, "loss": 50.0947, "step": 805 }, { "epoch": 2.128755364806867, "grad_norm": 2225.236572265625, "learning_rate": 0.0004982622004715417, "loss": 51.426, "step": 806 }, { "epoch": 2.131396500495213, "grad_norm": 3903.15283203125, "learning_rate": 0.0004982572797717757, "loss": 57.5206, "step": 807 }, { "epoch": 2.134037636183559, "grad_norm": 2502.179931640625, "learning_rate": 0.000498252352139571, "loss": 55.7122, "step": 808 }, { "epoch": 2.136678771871905, "grad_norm": 1413.394287109375, "learning_rate": 0.0004982474175750653, "loss": 56.1485, "step": 809 }, { "epoch": 2.139319907560251, "grad_norm": 1197.210693359375, "learning_rate": 0.0004982424760783962, "loss": 52.1682, "step": 810 }, { "epoch": 2.141961043248597, "grad_norm": 1438.5213623046875, "learning_rate": 0.0004982375276497017, "loss": 50.8119, "step": 811 }, { "epoch": 2.144602178936943, "grad_norm": 1381.654052734375, "learning_rate": 0.0004982325722891202, "loss": 54.8875, "step": 812 }, { "epoch": 2.147243314625289, "grad_norm": 1171.0806884765625, "learning_rate": 0.0004982276099967899, "loss": 59.7042, "step": 813 }, { "epoch": 2.149884450313635, "grad_norm": 1450.1650390625, "learning_rate": 0.0004982226407728494, "loss": 61.1751, "step": 814 }, { "epoch": 2.152525586001981, "grad_norm": 1075.09912109375, "learning_rate": 0.0004982176646174375, "loss": 55.7839, "step": 815 }, { "epoch": 2.155166721690327, "grad_norm": 1168.44970703125, "learning_rate": 0.0004982126815306931, "loss": 50.2578, "step": 816 }, { "epoch": 2.157807857378673, "grad_norm": 992.0784912109375, "learning_rate": 0.0004982076915127553, "loss": 51.9245, "step": 817 }, { "epoch": 2.160448993067019, "grad_norm": 1092.492431640625, "learning_rate": 0.0004982026945637637, "loss": 49.0982, "step": 818 }, { "epoch": 2.163090128755365, "grad_norm": 976.0292358398438, "learning_rate": 0.0004981976906838575, "loss": 49.7268, "step": 819 }, { "epoch": 2.165731264443711, "grad_norm": 813.813232421875, "learning_rate": 0.0004981926798731766, "loss": 48.0006, "step": 820 }, { "epoch": 2.168372400132057, "grad_norm": 735.9486083984375, "learning_rate": 0.0004981876621318609, "loss": 46.3538, "step": 821 }, { "epoch": 2.1710135358204026, "grad_norm": 647.3016357421875, "learning_rate": 0.0004981826374600507, "loss": 42.8891, "step": 822 }, { "epoch": 2.1736546715087486, "grad_norm": 621.3690795898438, "learning_rate": 0.0004981776058578859, "loss": 44.1523, "step": 823 }, { "epoch": 2.1762958071970946, "grad_norm": 543.9110717773438, "learning_rate": 0.0004981725673255074, "loss": 42.9403, "step": 824 }, { "epoch": 2.1789369428854406, "grad_norm": 465.08721923828125, "learning_rate": 0.0004981675218630557, "loss": 43.8901, "step": 825 }, { "epoch": 2.1815780785737866, "grad_norm": 680.343505859375, "learning_rate": 0.0004981624694706718, "loss": 43.0611, "step": 826 }, { "epoch": 2.1842192142621326, "grad_norm": 469.0665588378906, "learning_rate": 0.0004981574101484966, "loss": 44.8933, "step": 827 }, { "epoch": 2.1868603499504786, "grad_norm": 324.87628173828125, "learning_rate": 0.0004981523438966715, "loss": 41.3353, "step": 828 }, { "epoch": 2.1895014856388246, "grad_norm": 468.1785888671875, "learning_rate": 0.0004981472707153381, "loss": 44.0802, "step": 829 }, { "epoch": 2.1921426213271706, "grad_norm": 5288.3173828125, "learning_rate": 0.0004981421906046378, "loss": 80.5534, "step": 830 }, { "epoch": 2.1947837570155166, "grad_norm": 14774.7626953125, "learning_rate": 0.0004981371035647126, "loss": 113.1553, "step": 831 }, { "epoch": 2.1974248927038627, "grad_norm": 8547.54296875, "learning_rate": 0.0004981320095957046, "loss": 97.6812, "step": 832 }, { "epoch": 2.2000660283922087, "grad_norm": 15379.76171875, "learning_rate": 0.000498126908697756, "loss": 74.8066, "step": 833 }, { "epoch": 2.2027071640805547, "grad_norm": 10761.6982421875, "learning_rate": 0.0004981218008710092, "loss": 66.8394, "step": 834 }, { "epoch": 2.2053482997689007, "grad_norm": 6421.5361328125, "learning_rate": 0.0004981166861156069, "loss": 50.5909, "step": 835 }, { "epoch": 2.2079894354572467, "grad_norm": 6108.9287109375, "learning_rate": 0.0004981115644316919, "loss": 42.7711, "step": 836 }, { "epoch": 2.2106305711455927, "grad_norm": 5975.95849609375, "learning_rate": 0.0004981064358194072, "loss": 45.5426, "step": 837 }, { "epoch": 2.2132717068339387, "grad_norm": 7029.28515625, "learning_rate": 0.000498101300278896, "loss": 43.5637, "step": 838 }, { "epoch": 2.2159128425222847, "grad_norm": 13844.0673828125, "learning_rate": 0.0004980961578103017, "loss": 36.1684, "step": 839 }, { "epoch": 2.2185539782106307, "grad_norm": 5073.7177734375, "learning_rate": 0.0004980910084137679, "loss": 132.1968, "step": 840 }, { "epoch": 2.2211951138989767, "grad_norm": 3762.1064453125, "learning_rate": 0.0004980858520894386, "loss": 192.167, "step": 841 }, { "epoch": 2.2238362495873227, "grad_norm": 3522.813720703125, "learning_rate": 0.0004980806888374575, "loss": 152.4062, "step": 842 }, { "epoch": 2.2264773852756687, "grad_norm": 3419.346923828125, "learning_rate": 0.000498075518657969, "loss": 90.3799, "step": 843 }, { "epoch": 2.2291185209640147, "grad_norm": 1433.4471435546875, "learning_rate": 0.0004980703415511172, "loss": 54.2499, "step": 844 }, { "epoch": 2.2317596566523603, "grad_norm": 941.4274291992188, "learning_rate": 0.0004980651575170469, "loss": 50.3592, "step": 845 }, { "epoch": 2.2344007923407063, "grad_norm": 1339.1605224609375, "learning_rate": 0.0004980599665559028, "loss": 56.3898, "step": 846 }, { "epoch": 2.2370419280290523, "grad_norm": 1496.6754150390625, "learning_rate": 0.0004980547686678299, "loss": 61.4616, "step": 847 }, { "epoch": 2.2396830637173983, "grad_norm": 1756.7459716796875, "learning_rate": 0.0004980495638529733, "loss": 56.3563, "step": 848 }, { "epoch": 2.2423241994057443, "grad_norm": 932.3367309570312, "learning_rate": 0.0004980443521114782, "loss": 49.7341, "step": 849 }, { "epoch": 2.2449653350940904, "grad_norm": 1046.7099609375, "learning_rate": 0.0004980391334434904, "loss": 47.1256, "step": 850 }, { "epoch": 2.2476064707824364, "grad_norm": 983.9674682617188, "learning_rate": 0.0004980339078491556, "loss": 48.5616, "step": 851 }, { "epoch": 2.2502476064707824, "grad_norm": 1178.541259765625, "learning_rate": 0.0004980286753286195, "loss": 52.609, "step": 852 }, { "epoch": 2.2528887421591284, "grad_norm": 1331.48779296875, "learning_rate": 0.0004980234358820284, "loss": 51.1861, "step": 853 }, { "epoch": 2.2555298778474744, "grad_norm": 762.24365234375, "learning_rate": 0.0004980181895095284, "loss": 46.2645, "step": 854 }, { "epoch": 2.2581710135358204, "grad_norm": 940.01025390625, "learning_rate": 0.0004980129362112663, "loss": 48.3836, "step": 855 }, { "epoch": 2.2608121492241664, "grad_norm": 1274.54443359375, "learning_rate": 0.0004980076759873886, "loss": 48.5354, "step": 856 }, { "epoch": 2.2634532849125124, "grad_norm": 1725.726806640625, "learning_rate": 0.0004980024088380423, "loss": 50.7572, "step": 857 }, { "epoch": 2.2660944206008584, "grad_norm": 1044.289306640625, "learning_rate": 0.0004979971347633743, "loss": 51.2914, "step": 858 }, { "epoch": 2.2687355562892044, "grad_norm": 790.3731689453125, "learning_rate": 0.000497991853763532, "loss": 51.1394, "step": 859 }, { "epoch": 2.2713766919775504, "grad_norm": 891.7321166992188, "learning_rate": 0.000497986565838663, "loss": 47.2535, "step": 860 }, { "epoch": 2.2740178276658964, "grad_norm": 844.716552734375, "learning_rate": 0.0004979812709889147, "loss": 48.7528, "step": 861 }, { "epoch": 2.2766589633542424, "grad_norm": 1026.21923828125, "learning_rate": 0.0004979759692144351, "loss": 51.1391, "step": 862 }, { "epoch": 2.2793000990425885, "grad_norm": 624.37646484375, "learning_rate": 0.0004979706605153722, "loss": 57.2572, "step": 863 }, { "epoch": 2.2819412347309345, "grad_norm": 719.60009765625, "learning_rate": 0.0004979653448918743, "loss": 54.5105, "step": 864 }, { "epoch": 2.2845823704192805, "grad_norm": 930.1363525390625, "learning_rate": 0.0004979600223440899, "loss": 52.8651, "step": 865 }, { "epoch": 2.287223506107626, "grad_norm": 861.9552001953125, "learning_rate": 0.0004979546928721674, "loss": 48.3074, "step": 866 }, { "epoch": 2.2898646417959725, "grad_norm": 791.7761840820312, "learning_rate": 0.0004979493564762559, "loss": 50.6847, "step": 867 }, { "epoch": 2.292505777484318, "grad_norm": 503.0054931640625, "learning_rate": 0.0004979440131565042, "loss": 47.1543, "step": 868 }, { "epoch": 2.295146913172664, "grad_norm": 1875.2425537109375, "learning_rate": 0.0004979386629130615, "loss": 47.0437, "step": 869 }, { "epoch": 2.29778804886101, "grad_norm": 589.3012084960938, "learning_rate": 0.0004979333057460774, "loss": 45.0988, "step": 870 }, { "epoch": 2.300429184549356, "grad_norm": 628.1223754882812, "learning_rate": 0.0004979279416557013, "loss": 43.2072, "step": 871 }, { "epoch": 2.303070320237702, "grad_norm": 601.3417358398438, "learning_rate": 0.0004979225706420831, "loss": 43.5924, "step": 872 }, { "epoch": 2.305711455926048, "grad_norm": 540.061767578125, "learning_rate": 0.0004979171927053729, "loss": 42.3137, "step": 873 }, { "epoch": 2.308352591614394, "grad_norm": 448.17730712890625, "learning_rate": 0.0004979118078457205, "loss": 41.595, "step": 874 }, { "epoch": 2.31099372730274, "grad_norm": 433.3888854980469, "learning_rate": 0.0004979064160632766, "loss": 40.8392, "step": 875 }, { "epoch": 2.313634862991086, "grad_norm": 707.3760986328125, "learning_rate": 0.0004979010173581916, "loss": 40.7938, "step": 876 }, { "epoch": 2.316275998679432, "grad_norm": 823.6949462890625, "learning_rate": 0.0004978956117306164, "loss": 40.6859, "step": 877 }, { "epoch": 2.318917134367778, "grad_norm": 472.45281982421875, "learning_rate": 0.0004978901991807018, "loss": 40.4482, "step": 878 }, { "epoch": 2.321558270056124, "grad_norm": 3379.951416015625, "learning_rate": 0.0004978847797085991, "loss": 45.63, "step": 879 }, { "epoch": 2.32419940574447, "grad_norm": 4149.8876953125, "learning_rate": 0.0004978793533144594, "loss": 61.3649, "step": 880 }, { "epoch": 2.326840541432816, "grad_norm": 12656.142578125, "learning_rate": 0.0004978739199984344, "loss": 146.9531, "step": 881 }, { "epoch": 2.329481677121162, "grad_norm": 5928.58349609375, "learning_rate": 0.0004978684797606759, "loss": 115.2616, "step": 882 }, { "epoch": 2.332122812809508, "grad_norm": 39259.9765625, "learning_rate": 0.0004978630326013357, "loss": 117.7471, "step": 883 }, { "epoch": 2.334763948497854, "grad_norm": 17597.501953125, "learning_rate": 0.0004978575785205659, "loss": 113.4521, "step": 884 }, { "epoch": 2.3374050841862, "grad_norm": 8422.029296875, "learning_rate": 0.0004978521175185187, "loss": 96.416, "step": 885 }, { "epoch": 2.340046219874546, "grad_norm": 9584.26171875, "learning_rate": 0.0004978466495953468, "loss": 83.5532, "step": 886 }, { "epoch": 2.342687355562892, "grad_norm": 11971.7666015625, "learning_rate": 0.0004978411747512028, "loss": 65.3389, "step": 887 }, { "epoch": 2.345328491251238, "grad_norm": 3372.618896484375, "learning_rate": 0.0004978356929862396, "loss": 51.0853, "step": 888 }, { "epoch": 2.3479696269395838, "grad_norm": 3023.967529296875, "learning_rate": 0.0004978302043006103, "loss": 50.1955, "step": 889 }, { "epoch": 2.3506107626279302, "grad_norm": 2592.014892578125, "learning_rate": 0.000497824708694468, "loss": 90.7744, "step": 890 }, { "epoch": 2.353251898316276, "grad_norm": 2088.29150390625, "learning_rate": 0.0004978192061679664, "loss": 75.2597, "step": 891 }, { "epoch": 2.355893034004622, "grad_norm": 955.3435668945312, "learning_rate": 0.000497813696721259, "loss": 54.413, "step": 892 }, { "epoch": 2.358534169692968, "grad_norm": 662.9666748046875, "learning_rate": 0.0004978081803544998, "loss": 50.5168, "step": 893 }, { "epoch": 2.361175305381314, "grad_norm": 983.45556640625, "learning_rate": 0.0004978026570678425, "loss": 48.01, "step": 894 }, { "epoch": 2.36381644106966, "grad_norm": 1455.9417724609375, "learning_rate": 0.0004977971268614417, "loss": 55.2932, "step": 895 }, { "epoch": 2.366457576758006, "grad_norm": 1234.8876953125, "learning_rate": 0.0004977915897354517, "loss": 50.5142, "step": 896 }, { "epoch": 2.369098712446352, "grad_norm": 910.5372924804688, "learning_rate": 0.0004977860456900271, "loss": 44.3037, "step": 897 }, { "epoch": 2.371739848134698, "grad_norm": 761.840576171875, "learning_rate": 0.0004977804947253228, "loss": 42.3249, "step": 898 }, { "epoch": 2.374380983823044, "grad_norm": 664.0020141601562, "learning_rate": 0.0004977749368414937, "loss": 42.208, "step": 899 }, { "epoch": 2.37702211951139, "grad_norm": 897.4065551757812, "learning_rate": 0.000497769372038695, "loss": 44.8033, "step": 900 }, { "epoch": 2.379663255199736, "grad_norm": 1837.2054443359375, "learning_rate": 0.0004977638003170822, "loss": 47.5055, "step": 901 }, { "epoch": 2.382304390888082, "grad_norm": 1779.8277587890625, "learning_rate": 0.0004977582216768108, "loss": 50.9636, "step": 902 }, { "epoch": 2.384945526576428, "grad_norm": 1932.489990234375, "learning_rate": 0.0004977526361180366, "loss": 52.3484, "step": 903 }, { "epoch": 2.387586662264774, "grad_norm": 2127.357666015625, "learning_rate": 0.0004977470436409155, "loss": 53.5957, "step": 904 }, { "epoch": 2.39022779795312, "grad_norm": 2692.701171875, "learning_rate": 0.0004977414442456038, "loss": 52.6386, "step": 905 }, { "epoch": 2.392868933641466, "grad_norm": 2402.197265625, "learning_rate": 0.0004977358379322579, "loss": 51.7275, "step": 906 }, { "epoch": 2.395510069329812, "grad_norm": 3137.0703125, "learning_rate": 0.0004977302247010341, "loss": 54.6455, "step": 907 }, { "epoch": 2.398151205018158, "grad_norm": 1776.6024169921875, "learning_rate": 0.0004977246045520893, "loss": 52.3004, "step": 908 }, { "epoch": 2.400792340706504, "grad_norm": 1596.3446044921875, "learning_rate": 0.0004977189774855805, "loss": 46.7446, "step": 909 }, { "epoch": 2.40343347639485, "grad_norm": 1104.4781494140625, "learning_rate": 0.0004977133435016647, "loss": 46.5782, "step": 910 }, { "epoch": 2.406074612083196, "grad_norm": 1251.3248291015625, "learning_rate": 0.0004977077026004995, "loss": 46.3785, "step": 911 }, { "epoch": 2.4087157477715415, "grad_norm": 1536.5950927734375, "learning_rate": 0.000497702054782242, "loss": 52.7569, "step": 912 }, { "epoch": 2.411356883459888, "grad_norm": 1545.5113525390625, "learning_rate": 0.0004976964000470502, "loss": 50.3627, "step": 913 }, { "epoch": 2.4139980191482335, "grad_norm": 1098.829345703125, "learning_rate": 0.0004976907383950819, "loss": 52.3733, "step": 914 }, { "epoch": 2.4166391548365795, "grad_norm": 1854.09765625, "learning_rate": 0.0004976850698264953, "loss": 51.5217, "step": 915 }, { "epoch": 2.4192802905249255, "grad_norm": 1505.9815673828125, "learning_rate": 0.0004976793943414486, "loss": 53.7191, "step": 916 }, { "epoch": 2.4219214262132716, "grad_norm": 1433.7919921875, "learning_rate": 0.0004976737119401003, "loss": 51.4074, "step": 917 }, { "epoch": 2.4245625619016176, "grad_norm": 2935.814453125, "learning_rate": 0.0004976680226226091, "loss": 50.6269, "step": 918 }, { "epoch": 2.4272036975899636, "grad_norm": 1502.413818359375, "learning_rate": 0.0004976623263891339, "loss": 51.4348, "step": 919 }, { "epoch": 2.4298448332783096, "grad_norm": 1580.8377685546875, "learning_rate": 0.0004976566232398336, "loss": 46.2607, "step": 920 }, { "epoch": 2.4324859689666556, "grad_norm": 2378.097412109375, "learning_rate": 0.0004976509131748676, "loss": 47.0047, "step": 921 }, { "epoch": 2.4351271046550016, "grad_norm": 5225.2041015625, "learning_rate": 0.0004976451961943954, "loss": 44.9136, "step": 922 }, { "epoch": 2.4377682403433476, "grad_norm": 2399.796875, "learning_rate": 0.0004976394722985766, "loss": 45.8615, "step": 923 }, { "epoch": 2.4404093760316936, "grad_norm": 1406.83349609375, "learning_rate": 0.0004976337414875709, "loss": 45.2609, "step": 924 }, { "epoch": 2.4430505117200396, "grad_norm": 1799.4644775390625, "learning_rate": 0.0004976280037615385, "loss": 46.861, "step": 925 }, { "epoch": 2.4456916474083856, "grad_norm": 1457.8094482421875, "learning_rate": 0.0004976222591206395, "loss": 44.5744, "step": 926 }, { "epoch": 2.4483327830967316, "grad_norm": 981.1307983398438, "learning_rate": 0.0004976165075650344, "loss": 43.9632, "step": 927 }, { "epoch": 2.4509739187850776, "grad_norm": 1171.395751953125, "learning_rate": 0.0004976107490948839, "loss": 43.6299, "step": 928 }, { "epoch": 2.4536150544734237, "grad_norm": 12901.4619140625, "learning_rate": 0.0004976049837103485, "loss": 62.2957, "step": 929 }, { "epoch": 2.4562561901617697, "grad_norm": 22835.8359375, "learning_rate": 0.0004975992114115895, "loss": 117.0161, "step": 930 }, { "epoch": 2.4588973258501157, "grad_norm": 38483.9921875, "learning_rate": 0.0004975934321987679, "loss": 108.6254, "step": 931 }, { "epoch": 2.4615384615384617, "grad_norm": 12247.8076171875, "learning_rate": 0.0004975876460720452, "loss": 122.0288, "step": 932 }, { "epoch": 2.4641795972268077, "grad_norm": 39805.08984375, "learning_rate": 0.000497581853031583, "loss": 101.9282, "step": 933 }, { "epoch": 2.4668207329151537, "grad_norm": 16936.681640625, "learning_rate": 0.0004975760530775428, "loss": 97.5828, "step": 934 }, { "epoch": 2.4694618686034997, "grad_norm": 14681.083984375, "learning_rate": 0.0004975702462100868, "loss": 84.5559, "step": 935 }, { "epoch": 2.4721030042918457, "grad_norm": 11460.2158203125, "learning_rate": 0.0004975644324293772, "loss": 58.8843, "step": 936 }, { "epoch": 2.4747441399801913, "grad_norm": 13477.7412109375, "learning_rate": 0.0004975586117355763, "loss": 49.6717, "step": 937 }, { "epoch": 2.4773852756685373, "grad_norm": 9767.66796875, "learning_rate": 0.0004975527841288464, "loss": 43.2587, "step": 938 }, { "epoch": 2.4800264113568833, "grad_norm": 2190.083984375, "learning_rate": 0.0004975469496093506, "loss": 74.8775, "step": 939 }, { "epoch": 2.4826675470452293, "grad_norm": 2061.56494140625, "learning_rate": 0.0004975411081772516, "loss": 72.4745, "step": 940 }, { "epoch": 2.4853086827335753, "grad_norm": 1365.140869140625, "learning_rate": 0.0004975352598327125, "loss": 56.3801, "step": 941 }, { "epoch": 2.4879498184219213, "grad_norm": 555.170654296875, "learning_rate": 0.0004975294045758968, "loss": 44.1874, "step": 942 }, { "epoch": 2.4905909541102673, "grad_norm": 556.7086791992188, "learning_rate": 0.0004975235424069677, "loss": 47.6431, "step": 943 }, { "epoch": 2.4932320897986133, "grad_norm": 997.5745849609375, "learning_rate": 0.0004975176733260892, "loss": 51.8131, "step": 944 }, { "epoch": 2.4958732254869593, "grad_norm": 1178.323974609375, "learning_rate": 0.0004975117973334252, "loss": 51.4012, "step": 945 }, { "epoch": 2.4985143611753053, "grad_norm": 787.5917358398438, "learning_rate": 0.0004975059144291394, "loss": 46.6565, "step": 946 }, { "epoch": 2.5011554968636514, "grad_norm": 993.6249389648438, "learning_rate": 0.0004975000246133964, "loss": 42.2946, "step": 947 }, { "epoch": 2.5037966325519974, "grad_norm": 447.3503723144531, "learning_rate": 0.0004974941278863607, "loss": 40.735, "step": 948 }, { "epoch": 2.5064377682403434, "grad_norm": 534.2039794921875, "learning_rate": 0.0004974882242481969, "loss": 43.5393, "step": 949 }, { "epoch": 2.5090789039286894, "grad_norm": 687.9132080078125, "learning_rate": 0.0004974823136990697, "loss": 44.6161, "step": 950 }, { "epoch": 2.5117200396170354, "grad_norm": 632.210205078125, "learning_rate": 0.0004974763962391443, "loss": 45.7989, "step": 951 }, { "epoch": 2.5143611753053814, "grad_norm": 579.841064453125, "learning_rate": 0.0004974704718685858, "loss": 42.7932, "step": 952 }, { "epoch": 2.5170023109937274, "grad_norm": 497.93743896484375, "learning_rate": 0.0004974645405875599, "loss": 41.0038, "step": 953 }, { "epoch": 2.5196434466820734, "grad_norm": 579.4608154296875, "learning_rate": 0.0004974586023962321, "loss": 41.4162, "step": 954 }, { "epoch": 2.5222845823704194, "grad_norm": 820.5999145507812, "learning_rate": 0.0004974526572947681, "loss": 41.8778, "step": 955 }, { "epoch": 2.5249257180587654, "grad_norm": 716.393310546875, "learning_rate": 0.000497446705283334, "loss": 45.0976, "step": 956 }, { "epoch": 2.5275668537471114, "grad_norm": 1122.4349365234375, "learning_rate": 0.0004974407463620961, "loss": 46.755, "step": 957 }, { "epoch": 2.530207989435457, "grad_norm": 823.6389770507812, "learning_rate": 0.0004974347805312207, "loss": 48.3097, "step": 958 }, { "epoch": 2.5328491251238034, "grad_norm": 568.2186889648438, "learning_rate": 0.0004974288077908743, "loss": 45.3751, "step": 959 }, { "epoch": 2.535490260812149, "grad_norm": 511.43658447265625, "learning_rate": 0.0004974228281412239, "loss": 45.6259, "step": 960 }, { "epoch": 2.5381313965004955, "grad_norm": 383.42218017578125, "learning_rate": 0.0004974168415824363, "loss": 45.5703, "step": 961 }, { "epoch": 2.540772532188841, "grad_norm": 526.7448120117188, "learning_rate": 0.0004974108481146787, "loss": 47.1941, "step": 962 }, { "epoch": 2.543413667877187, "grad_norm": 591.7235717773438, "learning_rate": 0.0004974048477381185, "loss": 50.0344, "step": 963 }, { "epoch": 2.546054803565533, "grad_norm": 574.3789672851562, "learning_rate": 0.0004973988404529234, "loss": 47.3036, "step": 964 }, { "epoch": 2.548695939253879, "grad_norm": 530.748779296875, "learning_rate": 0.0004973928262592609, "loss": 43.5661, "step": 965 }, { "epoch": 2.551337074942225, "grad_norm": 736.674560546875, "learning_rate": 0.0004973868051572991, "loss": 47.9148, "step": 966 }, { "epoch": 2.553978210630571, "grad_norm": 425.7079162597656, "learning_rate": 0.0004973807771472061, "loss": 42.1207, "step": 967 }, { "epoch": 2.556619346318917, "grad_norm": 592.14013671875, "learning_rate": 0.0004973747422291502, "loss": 46.4082, "step": 968 }, { "epoch": 2.559260482007263, "grad_norm": 1388.626953125, "learning_rate": 0.0004973687004032999, "loss": 45.5058, "step": 969 }, { "epoch": 2.561901617695609, "grad_norm": 1111.8948974609375, "learning_rate": 0.000497362651669824, "loss": 43.6234, "step": 970 }, { "epoch": 2.564542753383955, "grad_norm": 393.2334899902344, "learning_rate": 0.0004973565960288914, "loss": 41.2619, "step": 971 }, { "epoch": 2.567183889072301, "grad_norm": 426.6219177246094, "learning_rate": 0.000497350533480671, "loss": 42.5396, "step": 972 }, { "epoch": 2.569825024760647, "grad_norm": 711.80859375, "learning_rate": 0.0004973444640253324, "loss": 41.8494, "step": 973 }, { "epoch": 2.572466160448993, "grad_norm": 488.3661804199219, "learning_rate": 0.000497338387663045, "loss": 40.5168, "step": 974 }, { "epoch": 2.575107296137339, "grad_norm": 997.534423828125, "learning_rate": 0.0004973323043939783, "loss": 39.6488, "step": 975 }, { "epoch": 2.577748431825685, "grad_norm": 660.8893432617188, "learning_rate": 0.0004973262142183023, "loss": 40.2579, "step": 976 }, { "epoch": 2.580389567514031, "grad_norm": 666.5127563476562, "learning_rate": 0.000497320117136187, "loss": 38.6286, "step": 977 }, { "epoch": 2.583030703202377, "grad_norm": 472.4532470703125, "learning_rate": 0.0004973140131478027, "loss": 40.0832, "step": 978 }, { "epoch": 2.585671838890723, "grad_norm": 1036.885009765625, "learning_rate": 0.00049730790225332, "loss": 40.2901, "step": 979 }, { "epoch": 2.588312974579069, "grad_norm": 9116.4501953125, "learning_rate": 0.0004973017844529094, "loss": 102.1344, "step": 980 }, { "epoch": 2.5909541102674147, "grad_norm": 9377.8125, "learning_rate": 0.0004972956597467416, "loss": 135.8804, "step": 981 }, { "epoch": 2.593595245955761, "grad_norm": 11048.1669921875, "learning_rate": 0.0004972895281349879, "loss": 129.2205, "step": 982 }, { "epoch": 2.5962363816441067, "grad_norm": 10686.25, "learning_rate": 0.0004972833896178194, "loss": 93.7034, "step": 983 }, { "epoch": 2.598877517332453, "grad_norm": 3677.113525390625, "learning_rate": 0.0004972772441954075, "loss": 76.9434, "step": 984 }, { "epoch": 2.6015186530207988, "grad_norm": 12289.51953125, "learning_rate": 0.0004972710918679237, "loss": 85.763, "step": 985 }, { "epoch": 2.6041597887091448, "grad_norm": 5914.7373046875, "learning_rate": 0.0004972649326355401, "loss": 58.169, "step": 986 }, { "epoch": 2.606800924397491, "grad_norm": 8301.9521484375, "learning_rate": 0.0004972587664984283, "loss": 46.6302, "step": 987 }, { "epoch": 2.609442060085837, "grad_norm": 5590.36669921875, "learning_rate": 0.0004972525934567607, "loss": 44.1251, "step": 988 }, { "epoch": 2.612083195774183, "grad_norm": 3133.982666015625, "learning_rate": 0.0004972464135107098, "loss": 42.146, "step": 989 }, { "epoch": 2.614724331462529, "grad_norm": 2652.946533203125, "learning_rate": 0.0004972402266604481, "loss": 88.4525, "step": 990 }, { "epoch": 2.617365467150875, "grad_norm": 2208.004150390625, "learning_rate": 0.0004972340329061482, "loss": 81.958, "step": 991 }, { "epoch": 2.620006602839221, "grad_norm": 1975.3199462890625, "learning_rate": 0.0004972278322479831, "loss": 59.3539, "step": 992 }, { "epoch": 2.622647738527567, "grad_norm": 916.926513671875, "learning_rate": 0.0004972216246861262, "loss": 45.9479, "step": 993 }, { "epoch": 2.625288874215913, "grad_norm": 1854.23876953125, "learning_rate": 0.0004972154102207506, "loss": 42.5605, "step": 994 }, { "epoch": 2.627930009904259, "grad_norm": 2034.8585205078125, "learning_rate": 0.0004972091888520297, "loss": 42.6189, "step": 995 }, { "epoch": 2.630571145592605, "grad_norm": 1468.1639404296875, "learning_rate": 0.0004972029605801375, "loss": 52.3456, "step": 996 }, { "epoch": 2.633212281280951, "grad_norm": 1465.531494140625, "learning_rate": 0.000497196725405248, "loss": 54.8668, "step": 997 }, { "epoch": 2.635853416969297, "grad_norm": 1582.976806640625, "learning_rate": 0.000497190483327535, "loss": 50.4497, "step": 998 }, { "epoch": 2.638494552657643, "grad_norm": 1089.869140625, "learning_rate": 0.0004971842343471731, "loss": 43.575, "step": 999 }, { "epoch": 2.641135688345989, "grad_norm": 1064.6009521484375, "learning_rate": 0.0004971779784643366, "loss": 40.8153, "step": 1000 }, { "epoch": 2.641135688345989, "eval_loss": 6.843137741088867, "eval_runtime": 2.2108, "eval_samples_per_second": 223.9, "eval_steps_per_second": 28.044, "step": 1000 }, { "epoch": 2.643776824034335, "grad_norm": 670.3017578125, "learning_rate": 0.0004971717156792002, "loss": 40.511, "step": 1001 }, { "epoch": 2.646417959722681, "grad_norm": 448.58612060546875, "learning_rate": 0.0004971654459919388, "loss": 40.3159, "step": 1002 }, { "epoch": 2.649059095411027, "grad_norm": 707.8433837890625, "learning_rate": 0.0004971591694027276, "loss": 41.5087, "step": 1003 }, { "epoch": 2.6517002310993725, "grad_norm": 614.696533203125, "learning_rate": 0.0004971528859117418, "loss": 45.3692, "step": 1004 }, { "epoch": 2.654341366787719, "grad_norm": 1275.7783203125, "learning_rate": 0.0004971465955191567, "loss": 45.9475, "step": 1005 }, { "epoch": 2.6569825024760645, "grad_norm": 863.8468017578125, "learning_rate": 0.0004971402982251483, "loss": 45.7032, "step": 1006 }, { "epoch": 2.659623638164411, "grad_norm": 2311.845703125, "learning_rate": 0.0004971339940298921, "loss": 47.5713, "step": 1007 }, { "epoch": 2.6622647738527565, "grad_norm": 591.3589477539062, "learning_rate": 0.0004971276829335644, "loss": 46.3707, "step": 1008 }, { "epoch": 2.6649059095411025, "grad_norm": 409.5941467285156, "learning_rate": 0.0004971213649363412, "loss": 43.3169, "step": 1009 }, { "epoch": 2.6675470452294485, "grad_norm": 288.0762023925781, "learning_rate": 0.0004971150400383992, "loss": 43.81, "step": 1010 }, { "epoch": 2.6701881809177945, "grad_norm": 397.51116943359375, "learning_rate": 0.0004971087082399148, "loss": 43.6043, "step": 1011 }, { "epoch": 2.6728293166061405, "grad_norm": 472.59814453125, "learning_rate": 0.0004971023695410648, "loss": 46.2928, "step": 1012 }, { "epoch": 2.6754704522944865, "grad_norm": 341.5582275390625, "learning_rate": 0.0004970960239420265, "loss": 44.2, "step": 1013 }, { "epoch": 2.6781115879828326, "grad_norm": 686.7794799804688, "learning_rate": 0.0004970896714429767, "loss": 47.2703, "step": 1014 }, { "epoch": 2.6807527236711786, "grad_norm": 538.9121704101562, "learning_rate": 0.000497083312044093, "loss": 46.9903, "step": 1015 }, { "epoch": 2.6833938593595246, "grad_norm": 346.97979736328125, "learning_rate": 0.000497076945745553, "loss": 47.6282, "step": 1016 }, { "epoch": 2.6860349950478706, "grad_norm": 525.6483764648438, "learning_rate": 0.0004970705725475344, "loss": 46.4966, "step": 1017 }, { "epoch": 2.6886761307362166, "grad_norm": 470.6677551269531, "learning_rate": 0.0004970641924502151, "loss": 44.3072, "step": 1018 }, { "epoch": 2.6913172664245626, "grad_norm": 609.5536499023438, "learning_rate": 0.0004970578054537735, "loss": 43.8331, "step": 1019 }, { "epoch": 2.6939584021129086, "grad_norm": 354.22393798828125, "learning_rate": 0.0004970514115583878, "loss": 43.6256, "step": 1020 }, { "epoch": 2.6965995378012546, "grad_norm": 514.214599609375, "learning_rate": 0.0004970450107642365, "loss": 41.771, "step": 1021 }, { "epoch": 2.6992406734896006, "grad_norm": 469.3710632324219, "learning_rate": 0.0004970386030714984, "loss": 40.9855, "step": 1022 }, { "epoch": 2.7018818091779466, "grad_norm": 384.7980041503906, "learning_rate": 0.0004970321884803524, "loss": 40.3724, "step": 1023 }, { "epoch": 2.7045229448662926, "grad_norm": 566.6923217773438, "learning_rate": 0.0004970257669909778, "loss": 38.9453, "step": 1024 }, { "epoch": 2.7071640805546386, "grad_norm": 382.6417236328125, "learning_rate": 0.0004970193386035537, "loss": 39.4092, "step": 1025 }, { "epoch": 2.7098052162429846, "grad_norm": 589.7938232421875, "learning_rate": 0.0004970129033182596, "loss": 41.9191, "step": 1026 }, { "epoch": 2.71244635193133, "grad_norm": 676.723388671875, "learning_rate": 0.0004970064611352754, "loss": 39.4171, "step": 1027 }, { "epoch": 2.7150874876196767, "grad_norm": 642.3772583007812, "learning_rate": 0.0004970000120547808, "loss": 38.7971, "step": 1028 }, { "epoch": 2.7177286233080222, "grad_norm": 1464.673583984375, "learning_rate": 0.0004969935560769559, "loss": 68.3289, "step": 1029 }, { "epoch": 2.7203697589963687, "grad_norm": 11650.0, "learning_rate": 0.0004969870932019811, "loss": 101.4917, "step": 1030 }, { "epoch": 2.7230108946847142, "grad_norm": 9821.57421875, "learning_rate": 0.0004969806234300368, "loss": 121.8688, "step": 1031 }, { "epoch": 2.7256520303730603, "grad_norm": 7391.96728515625, "learning_rate": 0.0004969741467613038, "loss": 102.662, "step": 1032 }, { "epoch": 2.7282931660614063, "grad_norm": 5486.34228515625, "learning_rate": 0.0004969676631959626, "loss": 103.299, "step": 1033 }, { "epoch": 2.7309343017497523, "grad_norm": 12743.4296875, "learning_rate": 0.0004969611727341947, "loss": 97.3459, "step": 1034 }, { "epoch": 2.7335754374380983, "grad_norm": 3010.659423828125, "learning_rate": 0.000496954675376181, "loss": 74.6185, "step": 1035 }, { "epoch": 2.7362165731264443, "grad_norm": 5797.662109375, "learning_rate": 0.0004969481711221031, "loss": 62.5657, "step": 1036 }, { "epoch": 2.7388577088147903, "grad_norm": 3788.537353515625, "learning_rate": 0.0004969416599721426, "loss": 52.3973, "step": 1037 }, { "epoch": 2.7414988445031363, "grad_norm": 3965.294921875, "learning_rate": 0.0004969351419264813, "loss": 34.9233, "step": 1038 }, { "epoch": 2.7441399801914823, "grad_norm": 5331.11962890625, "learning_rate": 0.0004969286169853012, "loss": 88.6594, "step": 1039 }, { "epoch": 2.7467811158798283, "grad_norm": 4223.7998046875, "learning_rate": 0.0004969220851487844, "loss": 151.6907, "step": 1040 }, { "epoch": 2.7494222515681743, "grad_norm": 3814.271728515625, "learning_rate": 0.0004969155464171136, "loss": 141.3803, "step": 1041 }, { "epoch": 2.7520633872565203, "grad_norm": 2752.46337890625, "learning_rate": 0.0004969090007904711, "loss": 121.1252, "step": 1042 }, { "epoch": 2.7547045229448663, "grad_norm": 2512.876953125, "learning_rate": 0.0004969024482690399, "loss": 83.5357, "step": 1043 }, { "epoch": 2.7573456586332123, "grad_norm": 1076.9376220703125, "learning_rate": 0.0004968958888530028, "loss": 51.5977, "step": 1044 }, { "epoch": 2.7599867943215584, "grad_norm": 554.9221801757812, "learning_rate": 0.0004968893225425431, "loss": 45.3462, "step": 1045 }, { "epoch": 2.7626279300099044, "grad_norm": 1024.6243896484375, "learning_rate": 0.0004968827493378441, "loss": 48.0577, "step": 1046 }, { "epoch": 2.7652690656982504, "grad_norm": 1974.019775390625, "learning_rate": 0.0004968761692390893, "loss": 60.7225, "step": 1047 }, { "epoch": 2.7679102013865964, "grad_norm": 2055.240234375, "learning_rate": 0.0004968695822464625, "loss": 66.7328, "step": 1048 }, { "epoch": 2.7705513370749424, "grad_norm": 2720.55126953125, "learning_rate": 0.0004968629883601476, "loss": 61.4284, "step": 1049 }, { "epoch": 2.773192472763288, "grad_norm": 2372.9990234375, "learning_rate": 0.0004968563875803288, "loss": 50.9072, "step": 1050 }, { "epoch": 2.7758336084516344, "grad_norm": 1664.4588623046875, "learning_rate": 0.0004968497799071904, "loss": 42.8138, "step": 1051 }, { "epoch": 2.77847474413998, "grad_norm": 1156.6829833984375, "learning_rate": 0.0004968431653409169, "loss": 43.344, "step": 1052 }, { "epoch": 2.7811158798283264, "grad_norm": 2118.796875, "learning_rate": 0.000496836543881693, "loss": 40.6835, "step": 1053 }, { "epoch": 2.783757015516672, "grad_norm": 2008.954833984375, "learning_rate": 0.0004968299155297035, "loss": 42.0075, "step": 1054 }, { "epoch": 2.786398151205018, "grad_norm": 1931.84423828125, "learning_rate": 0.0004968232802851338, "loss": 42.8404, "step": 1055 }, { "epoch": 2.789039286893364, "grad_norm": 4062.859130859375, "learning_rate": 0.0004968166381481689, "loss": 43.0792, "step": 1056 }, { "epoch": 2.79168042258171, "grad_norm": 4258.8310546875, "learning_rate": 0.0004968099891189945, "loss": 49.5572, "step": 1057 }, { "epoch": 2.794321558270056, "grad_norm": 7563.748046875, "learning_rate": 0.0004968033331977959, "loss": 50.5061, "step": 1058 }, { "epoch": 2.796962693958402, "grad_norm": 2369.664794921875, "learning_rate": 0.0004967966703847594, "loss": 53.1859, "step": 1059 }, { "epoch": 2.799603829646748, "grad_norm": 1993.9925537109375, "learning_rate": 0.0004967900006800708, "loss": 49.5732, "step": 1060 }, { "epoch": 2.802244965335094, "grad_norm": 1786.736328125, "learning_rate": 0.0004967833240839164, "loss": 49.677, "step": 1061 }, { "epoch": 2.80488610102344, "grad_norm": 2670.6220703125, "learning_rate": 0.0004967766405964825, "loss": 51.0085, "step": 1062 }, { "epoch": 2.807527236711786, "grad_norm": 1491.6849365234375, "learning_rate": 0.0004967699502179561, "loss": 46.7144, "step": 1063 }, { "epoch": 2.810168372400132, "grad_norm": 1861.5369873046875, "learning_rate": 0.0004967632529485237, "loss": 50.0312, "step": 1064 }, { "epoch": 2.812809508088478, "grad_norm": 1902.092041015625, "learning_rate": 0.0004967565487883724, "loss": 48.4288, "step": 1065 }, { "epoch": 2.815450643776824, "grad_norm": 1838.88232421875, "learning_rate": 0.0004967498377376894, "loss": 47.5272, "step": 1066 }, { "epoch": 2.81809177946517, "grad_norm": 1996.6112060546875, "learning_rate": 0.0004967431197966622, "loss": 45.8158, "step": 1067 }, { "epoch": 2.820732915153516, "grad_norm": 1269.458740234375, "learning_rate": 0.0004967363949654783, "loss": 47.3819, "step": 1068 }, { "epoch": 2.823374050841862, "grad_norm": 1519.6514892578125, "learning_rate": 0.0004967296632443255, "loss": 47.0412, "step": 1069 }, { "epoch": 2.826015186530208, "grad_norm": 1694.1500244140625, "learning_rate": 0.0004967229246333918, "loss": 43.9608, "step": 1070 }, { "epoch": 2.828656322218554, "grad_norm": 2698.113037109375, "learning_rate": 0.0004967161791328654, "loss": 43.5426, "step": 1071 }, { "epoch": 2.8312974579069, "grad_norm": 1429.089599609375, "learning_rate": 0.0004967094267429345, "loss": 42.3999, "step": 1072 }, { "epoch": 2.8339385935952457, "grad_norm": 2117.93359375, "learning_rate": 0.0004967026674637879, "loss": 43.4806, "step": 1073 }, { "epoch": 2.836579729283592, "grad_norm": 2022.9913330078125, "learning_rate": 0.0004966959012956142, "loss": 42.4086, "step": 1074 }, { "epoch": 2.8392208649719377, "grad_norm": 1371.4183349609375, "learning_rate": 0.0004966891282386024, "loss": 40.662, "step": 1075 }, { "epoch": 2.841862000660284, "grad_norm": 1726.80859375, "learning_rate": 0.0004966823482929416, "loss": 41.329, "step": 1076 }, { "epoch": 2.8445031363486297, "grad_norm": 1325.6900634765625, "learning_rate": 0.0004966755614588211, "loss": 40.4912, "step": 1077 }, { "epoch": 2.847144272036976, "grad_norm": 1562.118896484375, "learning_rate": 0.0004966687677364304, "loss": 40.9216, "step": 1078 }, { "epoch": 2.8497854077253217, "grad_norm": 4633.3212890625, "learning_rate": 0.0004966619671259594, "loss": 43.9957, "step": 1079 }, { "epoch": 2.8524265434136677, "grad_norm": 19190.716796875, "learning_rate": 0.0004966551596275979, "loss": 78.9063, "step": 1080 }, { "epoch": 2.8550676791020138, "grad_norm": 23028.74609375, "learning_rate": 0.0004966483452415358, "loss": 142.3311, "step": 1081 }, { "epoch": 2.8577088147903598, "grad_norm": 12668.6806640625, "learning_rate": 0.0004966415239679636, "loss": 115.7871, "step": 1082 }, { "epoch": 2.8603499504787058, "grad_norm": 33588.06640625, "learning_rate": 0.0004966346958070717, "loss": 120.9838, "step": 1083 }, { "epoch": 2.862991086167052, "grad_norm": 31146.490234375, "learning_rate": 0.0004966278607590509, "loss": 105.7038, "step": 1084 }, { "epoch": 2.865632221855398, "grad_norm": 24830.828125, "learning_rate": 0.0004966210188240918, "loss": 107.7856, "step": 1085 }, { "epoch": 2.868273357543744, "grad_norm": 33409.7265625, "learning_rate": 0.0004966141700023858, "loss": 105.6757, "step": 1086 }, { "epoch": 2.87091449323209, "grad_norm": 28711.974609375, "learning_rate": 0.0004966073142941239, "loss": 90.0238, "step": 1087 }, { "epoch": 2.873555628920436, "grad_norm": 13363.61328125, "learning_rate": 0.0004966004516994976, "loss": 88.5883, "step": 1088 }, { "epoch": 2.876196764608782, "grad_norm": 8796.7421875, "learning_rate": 0.0004965935822186985, "loss": 67.8372, "step": 1089 }, { "epoch": 2.878837900297128, "grad_norm": 4319.8125, "learning_rate": 0.0004965867058519186, "loss": 50.4728, "step": 1090 }, { "epoch": 2.881479035985474, "grad_norm": 2096.369873046875, "learning_rate": 0.0004965798225993497, "loss": 48.809, "step": 1091 }, { "epoch": 2.88412017167382, "grad_norm": 3177.37255859375, "learning_rate": 0.0004965729324611841, "loss": 48.8648, "step": 1092 }, { "epoch": 2.886761307362166, "grad_norm": 2098.833251953125, "learning_rate": 0.0004965660354376143, "loss": 48.7962, "step": 1093 }, { "epoch": 2.889402443050512, "grad_norm": 1736.83056640625, "learning_rate": 0.0004965591315288327, "loss": 46.9389, "step": 1094 }, { "epoch": 2.892043578738858, "grad_norm": 2197.329345703125, "learning_rate": 0.0004965522207350322, "loss": 44.1071, "step": 1095 }, { "epoch": 2.8946847144272034, "grad_norm": 3322.1787109375, "learning_rate": 0.0004965453030564057, "loss": 52.4775, "step": 1096 }, { "epoch": 2.89732585011555, "grad_norm": 1544.2576904296875, "learning_rate": 0.0004965383784931467, "loss": 43.8759, "step": 1097 }, { "epoch": 2.8999669858038954, "grad_norm": 1967.0216064453125, "learning_rate": 0.0004965314470454481, "loss": 45.5598, "step": 1098 }, { "epoch": 2.902608121492242, "grad_norm": 1856.359130859375, "learning_rate": 0.0004965245087135036, "loss": 47.0832, "step": 1099 }, { "epoch": 2.9052492571805875, "grad_norm": 2440.903076171875, "learning_rate": 0.0004965175634975071, "loss": 45.7784, "step": 1100 }, { "epoch": 2.907890392868934, "grad_norm": 3146.29296875, "learning_rate": 0.0004965106113976525, "loss": 51.4013, "step": 1101 }, { "epoch": 2.9105315285572795, "grad_norm": 2311.159423828125, "learning_rate": 0.0004965036524141339, "loss": 50.7456, "step": 1102 }, { "epoch": 2.9131726642456255, "grad_norm": 4558.01611328125, "learning_rate": 0.0004964966865471455, "loss": 53.7173, "step": 1103 }, { "epoch": 2.9158137999339715, "grad_norm": 4134.078125, "learning_rate": 0.000496489713796882, "loss": 58.4398, "step": 1104 }, { "epoch": 2.9184549356223175, "grad_norm": 3973.144287109375, "learning_rate": 0.000496482734163538, "loss": 52.9127, "step": 1105 }, { "epoch": 2.9210960713106635, "grad_norm": 3401.874267578125, "learning_rate": 0.0004964757476473084, "loss": 52.9192, "step": 1106 }, { "epoch": 2.9237372069990095, "grad_norm": 4113.7802734375, "learning_rate": 0.0004964687542483885, "loss": 59.7885, "step": 1107 }, { "epoch": 2.9263783426873555, "grad_norm": 3023.4619140625, "learning_rate": 0.0004964617539669733, "loss": 60.4897, "step": 1108 }, { "epoch": 2.9290194783757015, "grad_norm": 1366.0579833984375, "learning_rate": 0.0004964547468032585, "loss": 53.6623, "step": 1109 }, { "epoch": 2.9316606140640475, "grad_norm": 1904.215576171875, "learning_rate": 0.0004964477327574395, "loss": 57.5011, "step": 1110 }, { "epoch": 2.9343017497523936, "grad_norm": 2252.56298828125, "learning_rate": 0.0004964407118297124, "loss": 60.3195, "step": 1111 }, { "epoch": 2.9369428854407396, "grad_norm": 1378.0029296875, "learning_rate": 0.0004964336840202733, "loss": 55.3411, "step": 1112 }, { "epoch": 2.9395840211290856, "grad_norm": 2063.6044921875, "learning_rate": 0.0004964266493293182, "loss": 55.0084, "step": 1113 }, { "epoch": 2.9422251568174316, "grad_norm": 1617.4495849609375, "learning_rate": 0.0004964196077570437, "loss": 51.6546, "step": 1114 }, { "epoch": 2.9448662925057776, "grad_norm": 2357.4697265625, "learning_rate": 0.0004964125593036464, "loss": 52.6764, "step": 1115 }, { "epoch": 2.9475074281941236, "grad_norm": 988.1797485351562, "learning_rate": 0.0004964055039693232, "loss": 48.2454, "step": 1116 }, { "epoch": 2.9501485638824696, "grad_norm": 1163.0733642578125, "learning_rate": 0.000496398441754271, "loss": 47.6997, "step": 1117 }, { "epoch": 2.9527896995708156, "grad_norm": 1242.6046142578125, "learning_rate": 0.0004963913726586871, "loss": 45.8979, "step": 1118 }, { "epoch": 2.955430835259161, "grad_norm": 706.8726196289062, "learning_rate": 0.0004963842966827687, "loss": 45.7897, "step": 1119 }, { "epoch": 2.9580719709475076, "grad_norm": 3094.599853515625, "learning_rate": 0.0004963772138267137, "loss": 55.8292, "step": 1120 }, { "epoch": 2.960713106635853, "grad_norm": 10988.2783203125, "learning_rate": 0.0004963701240907198, "loss": 79.4898, "step": 1121 }, { "epoch": 2.9633542423241996, "grad_norm": 12655.6005859375, "learning_rate": 0.0004963630274749847, "loss": 97.4233, "step": 1122 }, { "epoch": 2.965995378012545, "grad_norm": 7384.478515625, "learning_rate": 0.000496355923979707, "loss": 78.549, "step": 1123 }, { "epoch": 2.9686365137008917, "grad_norm": 8295.392578125, "learning_rate": 0.0004963488136050846, "loss": 81.205, "step": 1124 }, { "epoch": 2.971277649389237, "grad_norm": 16935.646484375, "learning_rate": 0.0004963416963513165, "loss": 60.6238, "step": 1125 }, { "epoch": 2.9739187850775832, "grad_norm": 2593.73974609375, "learning_rate": 0.0004963345722186011, "loss": 50.0655, "step": 1126 }, { "epoch": 2.9765599207659292, "grad_norm": 1659.99560546875, "learning_rate": 0.0004963274412071375, "loss": 52.3155, "step": 1127 }, { "epoch": 2.9792010564542752, "grad_norm": 1289.3348388671875, "learning_rate": 0.000496320303317125, "loss": 47.6925, "step": 1128 }, { "epoch": 2.9818421921426213, "grad_norm": 716.3699340820312, "learning_rate": 0.0004963131585487625, "loss": 44.2885, "step": 1129 }, { "epoch": 2.9844833278309673, "grad_norm": 1002.048583984375, "learning_rate": 0.0004963060069022499, "loss": 41.6146, "step": 1130 }, { "epoch": 2.9871244635193133, "grad_norm": 654.7083129882812, "learning_rate": 0.0004962988483777866, "loss": 44.6448, "step": 1131 }, { "epoch": 2.9897655992076593, "grad_norm": 1540.9473876953125, "learning_rate": 0.0004962916829755727, "loss": 45.3882, "step": 1132 }, { "epoch": 2.9924067348960053, "grad_norm": 1756.814697265625, "learning_rate": 0.0004962845106958083, "loss": 42.9612, "step": 1133 }, { "epoch": 2.9950478705843513, "grad_norm": 1689.3543701171875, "learning_rate": 0.0004962773315386935, "loss": 46.3205, "step": 1134 }, { "epoch": 2.9976890062726973, "grad_norm": 3278.44287109375, "learning_rate": 0.000496270145504429, "loss": 48.2008, "step": 1135 }, { "epoch": 3.0003301419610433, "grad_norm": 6322.8154296875, "learning_rate": 0.0004962629525932152, "loss": 50.474, "step": 1136 }, { "epoch": 3.0029712776493893, "grad_norm": 1203.4599609375, "learning_rate": 0.0004962557528052532, "loss": 49.947, "step": 1137 }, { "epoch": 3.0056124133377353, "grad_norm": 1249.6802978515625, "learning_rate": 0.000496248546140744, "loss": 47.1, "step": 1138 }, { "epoch": 3.0082535490260813, "grad_norm": 1365.59228515625, "learning_rate": 0.0004962413325998887, "loss": 43.3355, "step": 1139 }, { "epoch": 3.0108946847144273, "grad_norm": 1438.812255859375, "learning_rate": 0.000496234112182889, "loss": 48.6868, "step": 1140 }, { "epoch": 3.0135358204027733, "grad_norm": 1825.2008056640625, "learning_rate": 0.0004962268848899463, "loss": 48.1143, "step": 1141 }, { "epoch": 3.0161769560911194, "grad_norm": 1570.59716796875, "learning_rate": 0.0004962196507212624, "loss": 49.1076, "step": 1142 }, { "epoch": 3.0188180917794654, "grad_norm": 1740.674560546875, "learning_rate": 0.0004962124096770394, "loss": 46.5911, "step": 1143 }, { "epoch": 3.0214592274678114, "grad_norm": 1103.89306640625, "learning_rate": 0.0004962051617574796, "loss": 47.1753, "step": 1144 }, { "epoch": 3.024100363156157, "grad_norm": 1948.69580078125, "learning_rate": 0.0004961979069627851, "loss": 44.694, "step": 1145 }, { "epoch": 3.026741498844503, "grad_norm": 1103.9776611328125, "learning_rate": 0.0004961906452931588, "loss": 43.6126, "step": 1146 }, { "epoch": 3.029382634532849, "grad_norm": 935.1823120117188, "learning_rate": 0.0004961833767488034, "loss": 43.8191, "step": 1147 }, { "epoch": 3.032023770221195, "grad_norm": 1664.0433349609375, "learning_rate": 0.0004961761013299217, "loss": 43.9539, "step": 1148 }, { "epoch": 3.034664905909541, "grad_norm": 932.815673828125, "learning_rate": 0.0004961688190367171, "loss": 43.4785, "step": 1149 }, { "epoch": 3.037306041597887, "grad_norm": 1459.0126953125, "learning_rate": 0.0004961615298693928, "loss": 43.1988, "step": 1150 }, { "epoch": 3.039947177286233, "grad_norm": 1614.170166015625, "learning_rate": 0.0004961542338281523, "loss": 41.4544, "step": 1151 }, { "epoch": 3.042588312974579, "grad_norm": 883.5819702148438, "learning_rate": 0.0004961469309131995, "loss": 42.9655, "step": 1152 }, { "epoch": 3.045229448662925, "grad_norm": 1811.19189453125, "learning_rate": 0.0004961396211247382, "loss": 39.0687, "step": 1153 }, { "epoch": 3.047870584351271, "grad_norm": 1309.9769287109375, "learning_rate": 0.0004961323044629727, "loss": 39.0981, "step": 1154 }, { "epoch": 3.050511720039617, "grad_norm": 2635.9541015625, "learning_rate": 0.000496124980928107, "loss": 42.447, "step": 1155 }, { "epoch": 3.053152855727963, "grad_norm": 800.419677734375, "learning_rate": 0.0004961176505203458, "loss": 39.6476, "step": 1156 }, { "epoch": 3.055793991416309, "grad_norm": 991.4110717773438, "learning_rate": 0.0004961103132398939, "loss": 41.2625, "step": 1157 }, { "epoch": 3.058435127104655, "grad_norm": 6681.52099609375, "learning_rate": 0.0004961029690869561, "loss": 41.8367, "step": 1158 }, { "epoch": 3.061076262793001, "grad_norm": 2765.531005859375, "learning_rate": 0.0004960956180617373, "loss": 97.5632, "step": 1159 }, { "epoch": 3.063717398481347, "grad_norm": 6718.06103515625, "learning_rate": 0.000496088260164443, "loss": 77.6158, "step": 1160 }, { "epoch": 3.066358534169693, "grad_norm": 6814.390625, "learning_rate": 0.0004960808953952786, "loss": 83.4456, "step": 1161 }, { "epoch": 3.068999669858039, "grad_norm": 5757.328125, "learning_rate": 0.0004960735237544498, "loss": 62.3987, "step": 1162 }, { "epoch": 3.071640805546385, "grad_norm": 4376.91650390625, "learning_rate": 0.0004960661452421625, "loss": 51.7927, "step": 1163 }, { "epoch": 3.074281941234731, "grad_norm": 3303.999755859375, "learning_rate": 0.0004960587598586224, "loss": 46.0671, "step": 1164 }, { "epoch": 3.076923076923077, "grad_norm": 6130.509765625, "learning_rate": 0.0004960513676040361, "loss": 35.5992, "step": 1165 }, { "epoch": 3.079564212611423, "grad_norm": 3513.67822265625, "learning_rate": 0.0004960439684786099, "loss": 30.2822, "step": 1166 }, { "epoch": 3.082205348299769, "grad_norm": 1447.3255615234375, "learning_rate": 0.0004960365624825504, "loss": 21.9496, "step": 1167 }, { "epoch": 3.084846483988115, "grad_norm": 7903.9951171875, "learning_rate": 0.0004960291496160644, "loss": 72.602, "step": 1168 }, { "epoch": 3.0874876196764607, "grad_norm": 1890.4637451171875, "learning_rate": 0.0004960217298793589, "loss": 119.3472, "step": 1169 }, { "epoch": 3.0901287553648067, "grad_norm": 2767.2060546875, "learning_rate": 0.0004960143032726412, "loss": 115.0728, "step": 1170 }, { "epoch": 3.0927698910531527, "grad_norm": 1806.0372314453125, "learning_rate": 0.0004960068697961185, "loss": 87.7557, "step": 1171 }, { "epoch": 3.0954110267414987, "grad_norm": 1104.9686279296875, "learning_rate": 0.0004959994294499986, "loss": 53.1512, "step": 1172 }, { "epoch": 3.0980521624298447, "grad_norm": 1060.580810546875, "learning_rate": 0.000495991982234489, "loss": 45.9231, "step": 1173 }, { "epoch": 3.1006932981181907, "grad_norm": 1338.659912109375, "learning_rate": 0.0004959845281497979, "loss": 48.3476, "step": 1174 }, { "epoch": 3.1033344338065367, "grad_norm": 1331.66552734375, "learning_rate": 0.0004959770671961333, "loss": 61.9094, "step": 1175 }, { "epoch": 3.1059755694948827, "grad_norm": 1536.34375, "learning_rate": 0.0004959695993737036, "loss": 66.5627, "step": 1176 }, { "epoch": 3.1086167051832287, "grad_norm": 1286.490234375, "learning_rate": 0.0004959621246827173, "loss": 55.8509, "step": 1177 }, { "epoch": 3.1112578408715748, "grad_norm": 2500.14794921875, "learning_rate": 0.0004959546431233833, "loss": 44.0311, "step": 1178 }, { "epoch": 3.1138989765599208, "grad_norm": 1447.317138671875, "learning_rate": 0.0004959471546959102, "loss": 42.305, "step": 1179 }, { "epoch": 3.1165401122482668, "grad_norm": 940.467041015625, "learning_rate": 0.0004959396594005073, "loss": 40.9325, "step": 1180 }, { "epoch": 3.1191812479366128, "grad_norm": 980.5391845703125, "learning_rate": 0.000495932157237384, "loss": 42.1383, "step": 1181 }, { "epoch": 3.121822383624959, "grad_norm": 1171.756591796875, "learning_rate": 0.0004959246482067496, "loss": 45.2919, "step": 1182 }, { "epoch": 3.124463519313305, "grad_norm": 1496.9459228515625, "learning_rate": 0.0004959171323088139, "loss": 46.7969, "step": 1183 }, { "epoch": 3.127104655001651, "grad_norm": 1251.85546875, "learning_rate": 0.0004959096095437866, "loss": 48.5729, "step": 1184 }, { "epoch": 3.129745790689997, "grad_norm": 1786.5543212890625, "learning_rate": 0.0004959020799118781, "loss": 49.4479, "step": 1185 }, { "epoch": 3.132386926378343, "grad_norm": 9491.7578125, "learning_rate": 0.0004958945434132984, "loss": 48.6758, "step": 1186 }, { "epoch": 3.135028062066689, "grad_norm": 933.37451171875, "learning_rate": 0.000495887000048258, "loss": 43.4499, "step": 1187 }, { "epoch": 3.137669197755035, "grad_norm": 702.785888671875, "learning_rate": 0.0004958794498169675, "loss": 45.0845, "step": 1188 }, { "epoch": 3.140310333443381, "grad_norm": 2853.26513671875, "learning_rate": 0.000495871892719638, "loss": 42.5103, "step": 1189 }, { "epoch": 3.142951469131727, "grad_norm": 817.8250732421875, "learning_rate": 0.0004958643287564801, "loss": 46.4587, "step": 1190 }, { "epoch": 3.145592604820073, "grad_norm": 863.828125, "learning_rate": 0.0004958567579277053, "loss": 49.9814, "step": 1191 }, { "epoch": 3.1482337405084184, "grad_norm": 857.3563842773438, "learning_rate": 0.0004958491802335251, "loss": 48.5604, "step": 1192 }, { "epoch": 3.1508748761967644, "grad_norm": 724.4857788085938, "learning_rate": 0.0004958415956741509, "loss": 49.0844, "step": 1193 }, { "epoch": 3.1535160118851104, "grad_norm": 734.047119140625, "learning_rate": 0.0004958340042497945, "loss": 49.2723, "step": 1194 }, { "epoch": 3.1561571475734564, "grad_norm": 916.9022827148438, "learning_rate": 0.000495826405960668, "loss": 49.0347, "step": 1195 }, { "epoch": 3.1587982832618025, "grad_norm": 679.5859985351562, "learning_rate": 0.0004958188008069835, "loss": 47.3559, "step": 1196 }, { "epoch": 3.1614394189501485, "grad_norm": 700.8048095703125, "learning_rate": 0.0004958111887889534, "loss": 49.0636, "step": 1197 }, { "epoch": 3.1640805546384945, "grad_norm": 1185.5665283203125, "learning_rate": 0.0004958035699067901, "loss": 46.6602, "step": 1198 }, { "epoch": 3.1667216903268405, "grad_norm": 2382.923583984375, "learning_rate": 0.0004957959441607067, "loss": 45.2783, "step": 1199 }, { "epoch": 3.1693628260151865, "grad_norm": 891.3073120117188, "learning_rate": 0.0004957883115509159, "loss": 42.4526, "step": 1200 }, { "epoch": 3.1693628260151865, "eval_loss": 7.793237209320068, "eval_runtime": 2.0929, "eval_samples_per_second": 236.51, "eval_steps_per_second": 29.623, "step": 1200 }, { "epoch": 3.1720039617035325, "grad_norm": 701.9120483398438, "learning_rate": 0.0004957806720776309, "loss": 41.401, "step": 1201 }, { "epoch": 3.1746450973918785, "grad_norm": 769.1671752929688, "learning_rate": 0.0004957730257410649, "loss": 43.1417, "step": 1202 }, { "epoch": 3.1772862330802245, "grad_norm": 929.6668090820312, "learning_rate": 0.0004957653725414316, "loss": 40.5971, "step": 1203 }, { "epoch": 3.1799273687685705, "grad_norm": 1081.0169677734375, "learning_rate": 0.0004957577124789446, "loss": 40.6899, "step": 1204 }, { "epoch": 3.1825685044569165, "grad_norm": 1125.03759765625, "learning_rate": 0.000495750045553818, "loss": 42.11, "step": 1205 }, { "epoch": 3.1852096401452625, "grad_norm": 1883.401611328125, "learning_rate": 0.0004957423717662655, "loss": 42.8647, "step": 1206 }, { "epoch": 3.1878507758336085, "grad_norm": 1015.6378784179688, "learning_rate": 0.0004957346911165019, "loss": 41.622, "step": 1207 }, { "epoch": 3.1904919115219545, "grad_norm": 9835.5732421875, "learning_rate": 0.0004957270036047412, "loss": 63.2773, "step": 1208 }, { "epoch": 3.1931330472103006, "grad_norm": 11198.8193359375, "learning_rate": 0.0004957193092311983, "loss": 104.5935, "step": 1209 }, { "epoch": 3.1957741828986466, "grad_norm": 7115.2314453125, "learning_rate": 0.0004957116079960881, "loss": 113.9482, "step": 1210 }, { "epoch": 3.1984153185869926, "grad_norm": 5222.09326171875, "learning_rate": 0.0004957038998996256, "loss": 132.8193, "step": 1211 }, { "epoch": 3.2010564542753386, "grad_norm": 109176.2734375, "learning_rate": 0.000495696184942026, "loss": 120.0652, "step": 1212 }, { "epoch": 3.2036975899636846, "grad_norm": 27485.19921875, "learning_rate": 0.0004956884631235047, "loss": 87.0622, "step": 1213 }, { "epoch": 3.2063387256520306, "grad_norm": 9587.9697265625, "learning_rate": 0.0004956807344442774, "loss": 105.1016, "step": 1214 }, { "epoch": 3.208979861340376, "grad_norm": 7301.68017578125, "learning_rate": 0.0004956729989045599, "loss": 79.7099, "step": 1215 }, { "epoch": 3.211620997028722, "grad_norm": 28596.693359375, "learning_rate": 0.0004956652565045684, "loss": 72.581, "step": 1216 }, { "epoch": 3.214262132717068, "grad_norm": 13680.498046875, "learning_rate": 0.0004956575072445188, "loss": 60.2452, "step": 1217 }, { "epoch": 3.216903268405414, "grad_norm": 5339.484375, "learning_rate": 0.0004956497511246276, "loss": 56.9915, "step": 1218 }, { "epoch": 3.21954440409376, "grad_norm": 6011.4453125, "learning_rate": 0.0004956419881451114, "loss": 129.5269, "step": 1219 }, { "epoch": 3.222185539782106, "grad_norm": 5550.623046875, "learning_rate": 0.000495634218306187, "loss": 111.0073, "step": 1220 }, { "epoch": 3.224826675470452, "grad_norm": 2617.3017578125, "learning_rate": 0.0004956264416080714, "loss": 71.654, "step": 1221 }, { "epoch": 3.227467811158798, "grad_norm": 1717.783935546875, "learning_rate": 0.0004956186580509816, "loss": 54.4673, "step": 1222 }, { "epoch": 3.2301089468471442, "grad_norm": 1408.877685546875, "learning_rate": 0.000495610867635135, "loss": 43.81, "step": 1223 }, { "epoch": 3.2327500825354902, "grad_norm": 1769.8314208984375, "learning_rate": 0.0004956030703607494, "loss": 42.5116, "step": 1224 }, { "epoch": 3.2353912182238362, "grad_norm": 1391.1820068359375, "learning_rate": 0.0004955952662280421, "loss": 41.7443, "step": 1225 }, { "epoch": 3.2380323539121822, "grad_norm": 887.2571411132812, "learning_rate": 0.0004955874552372315, "loss": 42.714, "step": 1226 }, { "epoch": 3.2406734896005283, "grad_norm": 2057.600830078125, "learning_rate": 0.0004955796373885354, "loss": 42.5635, "step": 1227 }, { "epoch": 3.2433146252888743, "grad_norm": 1885.1461181640625, "learning_rate": 0.0004955718126821722, "loss": 44.9812, "step": 1228 }, { "epoch": 3.2459557609772203, "grad_norm": 1532.996826171875, "learning_rate": 0.0004955639811183604, "loss": 47.9394, "step": 1229 }, { "epoch": 3.2485968966655663, "grad_norm": 1641.4324951171875, "learning_rate": 0.0004955561426973186, "loss": 46.7042, "step": 1230 }, { "epoch": 3.2512380323539123, "grad_norm": 2160.170654296875, "learning_rate": 0.0004955482974192658, "loss": 44.0774, "step": 1231 }, { "epoch": 3.2538791680422583, "grad_norm": 5480.1298828125, "learning_rate": 0.000495540445284421, "loss": 43.4987, "step": 1232 }, { "epoch": 3.2565203037306043, "grad_norm": 5391.3818359375, "learning_rate": 0.0004955325862930035, "loss": 45.7081, "step": 1233 }, { "epoch": 3.2591614394189503, "grad_norm": 3333.2919921875, "learning_rate": 0.0004955247204452329, "loss": 44.3987, "step": 1234 }, { "epoch": 3.2618025751072963, "grad_norm": 5206.70849609375, "learning_rate": 0.0004955168477413285, "loss": 48.864, "step": 1235 }, { "epoch": 3.264443710795642, "grad_norm": 5202.14453125, "learning_rate": 0.0004955089681815105, "loss": 48.9464, "step": 1236 }, { "epoch": 3.2670848464839883, "grad_norm": 7160.82666015625, "learning_rate": 0.0004955010817659988, "loss": 43.1027, "step": 1237 }, { "epoch": 3.269725982172334, "grad_norm": 2142.715576171875, "learning_rate": 0.0004954931884950135, "loss": 45.2031, "step": 1238 }, { "epoch": 3.27236711786068, "grad_norm": 3211.5224609375, "learning_rate": 0.0004954852883687752, "loss": 43.8307, "step": 1239 }, { "epoch": 3.275008253549026, "grad_norm": 2463.414794921875, "learning_rate": 0.0004954773813875044, "loss": 47.4471, "step": 1240 }, { "epoch": 3.277649389237372, "grad_norm": 2193.271240234375, "learning_rate": 0.000495469467551422, "loss": 52.3017, "step": 1241 }, { "epoch": 3.280290524925718, "grad_norm": 1859.9140625, "learning_rate": 0.0004954615468607488, "loss": 51.752, "step": 1242 }, { "epoch": 3.282931660614064, "grad_norm": 2543.233642578125, "learning_rate": 0.0004954536193157061, "loss": 52.8425, "step": 1243 }, { "epoch": 3.28557279630241, "grad_norm": 976.06494140625, "learning_rate": 0.0004954456849165153, "loss": 47.1325, "step": 1244 }, { "epoch": 3.288213931990756, "grad_norm": 1969.8770751953125, "learning_rate": 0.000495437743663398, "loss": 48.3718, "step": 1245 }, { "epoch": 3.290855067679102, "grad_norm": 1383.21044921875, "learning_rate": 0.0004954297955565758, "loss": 46.8557, "step": 1246 }, { "epoch": 3.293496203367448, "grad_norm": 1112.529296875, "learning_rate": 0.0004954218405962707, "loss": 43.2437, "step": 1247 }, { "epoch": 3.296137339055794, "grad_norm": 848.9940185546875, "learning_rate": 0.0004954138787827049, "loss": 43.6046, "step": 1248 }, { "epoch": 3.29877847474414, "grad_norm": 1029.7203369140625, "learning_rate": 0.0004954059101161008, "loss": 43.4161, "step": 1249 }, { "epoch": 3.301419610432486, "grad_norm": 690.328369140625, "learning_rate": 0.0004953979345966807, "loss": 40.8318, "step": 1250 }, { "epoch": 3.304060746120832, "grad_norm": 1189.6741943359375, "learning_rate": 0.0004953899522246675, "loss": 42.2704, "step": 1251 }, { "epoch": 3.306701881809178, "grad_norm": 996.3287963867188, "learning_rate": 0.000495381963000284, "loss": 38.8765, "step": 1252 }, { "epoch": 3.309343017497524, "grad_norm": 1067.236328125, "learning_rate": 0.0004953739669237533, "loss": 42.2873, "step": 1253 }, { "epoch": 3.31198415318587, "grad_norm": 1356.884033203125, "learning_rate": 0.0004953659639952988, "loss": 39.6609, "step": 1254 }, { "epoch": 3.314625288874216, "grad_norm": 696.213134765625, "learning_rate": 0.0004953579542151438, "loss": 39.6861, "step": 1255 }, { "epoch": 3.317266424562562, "grad_norm": 1255.010498046875, "learning_rate": 0.0004953499375835122, "loss": 42.241, "step": 1256 }, { "epoch": 3.319907560250908, "grad_norm": 6368.25537109375, "learning_rate": 0.0004953419141006276, "loss": 50.3164, "step": 1257 }, { "epoch": 3.322548695939254, "grad_norm": 39553.37890625, "learning_rate": 0.0004953338837667141, "loss": 86.3763, "step": 1258 }, { "epoch": 3.3251898316275996, "grad_norm": 7713.86572265625, "learning_rate": 0.0004953258465819961, "loss": 84.587, "step": 1259 }, { "epoch": 3.327830967315946, "grad_norm": 9857.37109375, "learning_rate": 0.000495317802546698, "loss": 86.9319, "step": 1260 }, { "epoch": 3.3304721030042916, "grad_norm": 16377.240234375, "learning_rate": 0.0004953097516610444, "loss": 69.8059, "step": 1261 }, { "epoch": 3.3331132386926376, "grad_norm": 10177.5322265625, "learning_rate": 0.00049530169392526, "loss": 92.3415, "step": 1262 }, { "epoch": 3.3357543743809837, "grad_norm": 14821.6904296875, "learning_rate": 0.0004952936293395699, "loss": 69.1114, "step": 1263 }, { "epoch": 3.3383955100693297, "grad_norm": 10480.4873046875, "learning_rate": 0.0004952855579041992, "loss": 60.5152, "step": 1264 }, { "epoch": 3.3410366457576757, "grad_norm": 7742.29296875, "learning_rate": 0.0004952774796193735, "loss": 59.0519, "step": 1265 }, { "epoch": 3.3436777814460217, "grad_norm": 5110.5087890625, "learning_rate": 0.0004952693944853183, "loss": 49.6257, "step": 1266 }, { "epoch": 3.3463189171343677, "grad_norm": 6033.02392578125, "learning_rate": 0.0004952613025022593, "loss": 40.0366, "step": 1267 }, { "epoch": 3.3489600528227137, "grad_norm": 1944.402099609375, "learning_rate": 0.0004952532036704225, "loss": 58.1123, "step": 1268 }, { "epoch": 3.3516011885110597, "grad_norm": 1687.5765380859375, "learning_rate": 0.000495245097990034, "loss": 57.0466, "step": 1269 }, { "epoch": 3.3542423241994057, "grad_norm": 1072.2269287109375, "learning_rate": 0.0004952369854613203, "loss": 45.9809, "step": 1270 }, { "epoch": 3.3568834598877517, "grad_norm": 879.9452514648438, "learning_rate": 0.0004952288660845078, "loss": 42.8522, "step": 1271 }, { "epoch": 3.3595245955760977, "grad_norm": 621.3159790039062, "learning_rate": 0.0004952207398598232, "loss": 40.542, "step": 1272 }, { "epoch": 3.3621657312644437, "grad_norm": 1183.3013916015625, "learning_rate": 0.0004952126067874936, "loss": 42.5066, "step": 1273 }, { "epoch": 3.3648068669527897, "grad_norm": 561.5198974609375, "learning_rate": 0.000495204466867746, "loss": 44.0252, "step": 1274 }, { "epoch": 3.3674480026411358, "grad_norm": 863.3513793945312, "learning_rate": 0.0004951963201008077, "loss": 44.3061, "step": 1275 }, { "epoch": 3.3700891383294818, "grad_norm": 1973.792236328125, "learning_rate": 0.0004951881664869061, "loss": 43.0803, "step": 1276 }, { "epoch": 3.3727302740178278, "grad_norm": 980.9153442382812, "learning_rate": 0.000495180006026269, "loss": 45.0798, "step": 1277 }, { "epoch": 3.3753714097061738, "grad_norm": 1931.8914794921875, "learning_rate": 0.0004951718387191243, "loss": 42.7558, "step": 1278 }, { "epoch": 3.37801254539452, "grad_norm": 1080.4237060546875, "learning_rate": 0.0004951636645657001, "loss": 43.4875, "step": 1279 }, { "epoch": 3.380653681082866, "grad_norm": 872.8278198242188, "learning_rate": 0.0004951554835662244, "loss": 42.017, "step": 1280 }, { "epoch": 3.383294816771212, "grad_norm": 853.2974853515625, "learning_rate": 0.000495147295720926, "loss": 40.3937, "step": 1281 }, { "epoch": 3.3859359524595574, "grad_norm": 973.394775390625, "learning_rate": 0.0004951391010300333, "loss": 40.8997, "step": 1282 }, { "epoch": 3.388577088147904, "grad_norm": 1220.098876953125, "learning_rate": 0.0004951308994937752, "loss": 43.9323, "step": 1283 }, { "epoch": 3.3912182238362494, "grad_norm": 1976.5130615234375, "learning_rate": 0.0004951226911123807, "loss": 43.3638, "step": 1284 }, { "epoch": 3.3938593595245954, "grad_norm": 2440.0380859375, "learning_rate": 0.0004951144758860792, "loss": 45.6857, "step": 1285 }, { "epoch": 3.3965004952129414, "grad_norm": 1268.97265625, "learning_rate": 0.0004951062538150997, "loss": 49.5527, "step": 1286 }, { "epoch": 3.3991416309012874, "grad_norm": 712.1531982421875, "learning_rate": 0.0004950980248996723, "loss": 42.4543, "step": 1287 }, { "epoch": 3.4017827665896334, "grad_norm": 1060.131103515625, "learning_rate": 0.0004950897891400263, "loss": 44.5308, "step": 1288 }, { "epoch": 3.4044239022779794, "grad_norm": 1965.4420166015625, "learning_rate": 0.0004950815465363921, "loss": 46.462, "step": 1289 }, { "epoch": 3.4070650379663254, "grad_norm": 1055.35498046875, "learning_rate": 0.0004950732970889996, "loss": 47.7585, "step": 1290 }, { "epoch": 3.4097061736546714, "grad_norm": 1702.1798095703125, "learning_rate": 0.0004950650407980793, "loss": 48.5501, "step": 1291 }, { "epoch": 3.4123473093430174, "grad_norm": 956.5536499023438, "learning_rate": 0.0004950567776638616, "loss": 45.4662, "step": 1292 }, { "epoch": 3.4149884450313635, "grad_norm": 1106.8753662109375, "learning_rate": 0.0004950485076865773, "loss": 48.8336, "step": 1293 }, { "epoch": 3.4176295807197095, "grad_norm": 978.4710083007812, "learning_rate": 0.0004950402308664575, "loss": 47.9601, "step": 1294 }, { "epoch": 3.4202707164080555, "grad_norm": 1489.135009765625, "learning_rate": 0.0004950319472037333, "loss": 46.7231, "step": 1295 }, { "epoch": 3.4229118520964015, "grad_norm": 1780.13818359375, "learning_rate": 0.0004950236566986357, "loss": 46.6708, "step": 1296 }, { "epoch": 3.4255529877847475, "grad_norm": 1571.4432373046875, "learning_rate": 0.0004950153593513964, "loss": 46.3391, "step": 1297 }, { "epoch": 3.4281941234730935, "grad_norm": 1341.392822265625, "learning_rate": 0.0004950070551622473, "loss": 45.2072, "step": 1298 }, { "epoch": 3.4308352591614395, "grad_norm": 1296.5506591796875, "learning_rate": 0.00049499874413142, "loss": 47.1239, "step": 1299 }, { "epoch": 3.4334763948497855, "grad_norm": 1708.85009765625, "learning_rate": 0.0004949904262591467, "loss": 43.6182, "step": 1300 }, { "epoch": 3.4361175305381315, "grad_norm": 1077.37744140625, "learning_rate": 0.0004949821015456597, "loss": 40.473, "step": 1301 }, { "epoch": 3.4387586662264775, "grad_norm": 1130.248046875, "learning_rate": 0.0004949737699911914, "loss": 42.7582, "step": 1302 }, { "epoch": 3.4413998019148235, "grad_norm": 1038.9942626953125, "learning_rate": 0.0004949654315959744, "loss": 41.345, "step": 1303 }, { "epoch": 3.4440409376031695, "grad_norm": 722.3726806640625, "learning_rate": 0.0004949570863602417, "loss": 41.9433, "step": 1304 }, { "epoch": 3.446682073291515, "grad_norm": 874.8749389648438, "learning_rate": 0.0004949487342842264, "loss": 40.7714, "step": 1305 }, { "epoch": 3.4493232089798616, "grad_norm": 495.7511901855469, "learning_rate": 0.0004949403753681614, "loss": 40.4015, "step": 1306 }, { "epoch": 3.451964344668207, "grad_norm": 565.7025756835938, "learning_rate": 0.0004949320096122803, "loss": 41.2062, "step": 1307 }, { "epoch": 3.454605480356553, "grad_norm": 6516.353515625, "learning_rate": 0.0004949236370168168, "loss": 60.7283, "step": 1308 }, { "epoch": 3.457246616044899, "grad_norm": 10732.671875, "learning_rate": 0.0004949152575820045, "loss": 77.9181, "step": 1309 }, { "epoch": 3.459887751733245, "grad_norm": 7775.72509765625, "learning_rate": 0.0004949068713080776, "loss": 113.4026, "step": 1310 }, { "epoch": 3.462528887421591, "grad_norm": 6061.3369140625, "learning_rate": 0.0004948984781952702, "loss": 86.8072, "step": 1311 }, { "epoch": 3.465170023109937, "grad_norm": 4889.548828125, "learning_rate": 0.0004948900782438166, "loss": 90.7314, "step": 1312 }, { "epoch": 3.467811158798283, "grad_norm": 23521.607421875, "learning_rate": 0.0004948816714539514, "loss": 71.6597, "step": 1313 }, { "epoch": 3.470452294486629, "grad_norm": 6845.46044921875, "learning_rate": 0.0004948732578259094, "loss": 64.0872, "step": 1314 }, { "epoch": 3.473093430174975, "grad_norm": 4074.4853515625, "learning_rate": 0.0004948648373599256, "loss": 54.1474, "step": 1315 }, { "epoch": 3.475734565863321, "grad_norm": 8481.1435546875, "learning_rate": 0.000494856410056235, "loss": 44.4963, "step": 1316 }, { "epoch": 3.478375701551667, "grad_norm": 2029.987548828125, "learning_rate": 0.0004948479759150729, "loss": 38.162, "step": 1317 }, { "epoch": 3.481016837240013, "grad_norm": 4423.9580078125, "learning_rate": 0.000494839534936675, "loss": 61.0752, "step": 1318 }, { "epoch": 3.483657972928359, "grad_norm": 2116.896728515625, "learning_rate": 0.0004948310871212769, "loss": 87.253, "step": 1319 }, { "epoch": 3.4862991086167052, "grad_norm": 1749.7850341796875, "learning_rate": 0.0004948226324691145, "loss": 85.5631, "step": 1320 }, { "epoch": 3.4889402443050512, "grad_norm": 3184.0654296875, "learning_rate": 0.0004948141709804239, "loss": 65.244, "step": 1321 }, { "epoch": 3.4915813799933972, "grad_norm": 1478.3267822265625, "learning_rate": 0.0004948057026554415, "loss": 50.6076, "step": 1322 }, { "epoch": 3.4942225156817432, "grad_norm": 867.5013427734375, "learning_rate": 0.0004947972274944035, "loss": 45.5808, "step": 1323 }, { "epoch": 3.4968636513700893, "grad_norm": 722.5043334960938, "learning_rate": 0.0004947887454975469, "loss": 46.0401, "step": 1324 }, { "epoch": 3.4995047870584353, "grad_norm": 1119.6444091796875, "learning_rate": 0.0004947802566651082, "loss": 49.1881, "step": 1325 }, { "epoch": 3.5021459227467813, "grad_norm": 1248.5013427734375, "learning_rate": 0.0004947717609973247, "loss": 50.6897, "step": 1326 }, { "epoch": 3.5047870584351273, "grad_norm": 1815.82373046875, "learning_rate": 0.0004947632584944335, "loss": 51.0538, "step": 1327 }, { "epoch": 3.507428194123473, "grad_norm": 1363.710205078125, "learning_rate": 0.0004947547491566722, "loss": 52.4095, "step": 1328 }, { "epoch": 3.5100693298118193, "grad_norm": 1125.75439453125, "learning_rate": 0.0004947462329842783, "loss": 49.0257, "step": 1329 }, { "epoch": 3.512710465500165, "grad_norm": 1543.284423828125, "learning_rate": 0.0004947377099774894, "loss": 45.0597, "step": 1330 }, { "epoch": 3.5153516011885113, "grad_norm": 1405.1917724609375, "learning_rate": 0.0004947291801365439, "loss": 45.7387, "step": 1331 }, { "epoch": 3.517992736876857, "grad_norm": 1535.689208984375, "learning_rate": 0.0004947206434616798, "loss": 40.0467, "step": 1332 }, { "epoch": 3.520633872565203, "grad_norm": 1707.0308837890625, "learning_rate": 0.0004947120999531355, "loss": 41.2683, "step": 1333 }, { "epoch": 3.523275008253549, "grad_norm": 2198.938720703125, "learning_rate": 0.0004947035496111495, "loss": 44.1036, "step": 1334 }, { "epoch": 3.525916143941895, "grad_norm": 1350.5933837890625, "learning_rate": 0.0004946949924359606, "loss": 45.6895, "step": 1335 }, { "epoch": 3.528557279630241, "grad_norm": 4064.488525390625, "learning_rate": 0.0004946864284278079, "loss": 54.4587, "step": 1336 }, { "epoch": 3.531198415318587, "grad_norm": 3759.24853515625, "learning_rate": 0.0004946778575869303, "loss": 49.827, "step": 1337 }, { "epoch": 3.533839551006933, "grad_norm": 3585.4296875, "learning_rate": 0.0004946692799135674, "loss": 49.898, "step": 1338 }, { "epoch": 3.536480686695279, "grad_norm": 912.3295288085938, "learning_rate": 0.0004946606954079584, "loss": 49.4679, "step": 1339 }, { "epoch": 3.539121822383625, "grad_norm": 688.1571655273438, "learning_rate": 0.0004946521040703433, "loss": 45.9655, "step": 1340 }, { "epoch": 3.541762958071971, "grad_norm": 906.834228515625, "learning_rate": 0.000494643505900962, "loss": 50.171, "step": 1341 }, { "epoch": 3.544404093760317, "grad_norm": 642.949462890625, "learning_rate": 0.0004946349009000544, "loss": 50.2759, "step": 1342 }, { "epoch": 3.547045229448663, "grad_norm": 703.1409301757812, "learning_rate": 0.0004946262890678609, "loss": 45.3427, "step": 1343 }, { "epoch": 3.549686365137009, "grad_norm": 442.8167724609375, "learning_rate": 0.000494617670404622, "loss": 46.2216, "step": 1344 }, { "epoch": 3.552327500825355, "grad_norm": 675.7138061523438, "learning_rate": 0.0004946090449105783, "loss": 47.8405, "step": 1345 }, { "epoch": 3.554968636513701, "grad_norm": 990.2127075195312, "learning_rate": 0.0004946004125859708, "loss": 46.5672, "step": 1346 }, { "epoch": 3.557609772202047, "grad_norm": 1232.1627197265625, "learning_rate": 0.0004945917734310404, "loss": 45.9192, "step": 1347 }, { "epoch": 3.560250907890393, "grad_norm": 744.7518920898438, "learning_rate": 0.0004945831274460285, "loss": 45.6604, "step": 1348 }, { "epoch": 3.562892043578739, "grad_norm": 733.6963500976562, "learning_rate": 0.0004945744746311763, "loss": 44.3804, "step": 1349 }, { "epoch": 3.565533179267085, "grad_norm": 602.2418823242188, "learning_rate": 0.0004945658149867257, "loss": 42.6002, "step": 1350 }, { "epoch": 3.5681743149554306, "grad_norm": 849.23291015625, "learning_rate": 0.0004945571485129183, "loss": 40.276, "step": 1351 }, { "epoch": 3.570815450643777, "grad_norm": 980.417236328125, "learning_rate": 0.0004945484752099963, "loss": 40.5098, "step": 1352 }, { "epoch": 3.5734565863321226, "grad_norm": 662.5512084960938, "learning_rate": 0.0004945397950782016, "loss": 39.9364, "step": 1353 }, { "epoch": 3.576097722020469, "grad_norm": 2849.76416015625, "learning_rate": 0.0004945311081177769, "loss": 40.1179, "step": 1354 }, { "epoch": 3.5787388577088146, "grad_norm": 1225.9100341796875, "learning_rate": 0.0004945224143289646, "loss": 40.6344, "step": 1355 }, { "epoch": 3.5813799933971606, "grad_norm": 956.34423828125, "learning_rate": 0.0004945137137120076, "loss": 40.9567, "step": 1356 }, { "epoch": 3.5840211290855066, "grad_norm": 2465.3056640625, "learning_rate": 0.0004945050062671487, "loss": 40.762, "step": 1357 }, { "epoch": 3.5866622647738526, "grad_norm": 8671.880859375, "learning_rate": 0.0004944962919946311, "loss": 53.9607, "step": 1358 }, { "epoch": 3.5893034004621986, "grad_norm": 6239.52490234375, "learning_rate": 0.0004944875708946982, "loss": 78.2303, "step": 1359 }, { "epoch": 3.5919445361505447, "grad_norm": 10792.8955078125, "learning_rate": 0.0004944788429675936, "loss": 84.7757, "step": 1360 }, { "epoch": 3.5945856718388907, "grad_norm": 30060.0546875, "learning_rate": 0.0004944701082135608, "loss": 79.4937, "step": 1361 }, { "epoch": 3.5972268075272367, "grad_norm": 25690.63671875, "learning_rate": 0.000494461366632844, "loss": 68.5853, "step": 1362 }, { "epoch": 3.5998679432155827, "grad_norm": 12841.18359375, "learning_rate": 0.0004944526182256869, "loss": 68.1478, "step": 1363 }, { "epoch": 3.6025090789039287, "grad_norm": 8767.1337890625, "learning_rate": 0.0004944438629923343, "loss": 68.0669, "step": 1364 }, { "epoch": 3.6051502145922747, "grad_norm": 12683.837890625, "learning_rate": 0.0004944351009330302, "loss": 54.0777, "step": 1365 }, { "epoch": 3.6077913502806207, "grad_norm": 43564.66796875, "learning_rate": 0.0004944263320480196, "loss": 62.6876, "step": 1366 }, { "epoch": 3.6104324859689667, "grad_norm": 36704.03125, "learning_rate": 0.0004944175563375472, "loss": 51.3188, "step": 1367 }, { "epoch": 3.6130736216573127, "grad_norm": 1906.3214111328125, "learning_rate": 0.0004944087738018582, "loss": 67.656, "step": 1368 }, { "epoch": 3.6157147573456587, "grad_norm": 1959.71435546875, "learning_rate": 0.0004943999844411977, "loss": 60.9899, "step": 1369 }, { "epoch": 3.6183558930340047, "grad_norm": 2402.016357421875, "learning_rate": 0.0004943911882558112, "loss": 48.3712, "step": 1370 }, { "epoch": 3.6209970287223507, "grad_norm": 1211.4344482421875, "learning_rate": 0.0004943823852459443, "loss": 46.0134, "step": 1371 }, { "epoch": 3.6236381644106967, "grad_norm": 740.7788696289062, "learning_rate": 0.000494373575411843, "loss": 41.0027, "step": 1372 }, { "epoch": 3.6262793000990428, "grad_norm": 604.844482421875, "learning_rate": 0.000494364758753753, "loss": 40.203, "step": 1373 }, { "epoch": 3.6289204357873883, "grad_norm": 1075.80908203125, "learning_rate": 0.0004943559352719208, "loss": 46.0648, "step": 1374 }, { "epoch": 3.6315615714757348, "grad_norm": 754.5244140625, "learning_rate": 0.0004943471049665925, "loss": 43.7159, "step": 1375 }, { "epoch": 3.6342027071640803, "grad_norm": 657.5764770507812, "learning_rate": 0.000494338267838015, "loss": 41.6339, "step": 1376 }, { "epoch": 3.636843842852427, "grad_norm": 971.9566650390625, "learning_rate": 0.0004943294238864348, "loss": 40.2271, "step": 1377 }, { "epoch": 3.6394849785407724, "grad_norm": 1408.013916015625, "learning_rate": 0.0004943205731120989, "loss": 42.1859, "step": 1378 }, { "epoch": 3.6421261142291184, "grad_norm": 700.0525512695312, "learning_rate": 0.0004943117155152546, "loss": 42.9698, "step": 1379 }, { "epoch": 3.6447672499174644, "grad_norm": 974.2921752929688, "learning_rate": 0.0004943028510961491, "loss": 44.672, "step": 1380 }, { "epoch": 3.6474083856058104, "grad_norm": 3246.768310546875, "learning_rate": 0.0004942939798550302, "loss": 49.0595, "step": 1381 }, { "epoch": 3.6500495212941564, "grad_norm": 1084.7978515625, "learning_rate": 0.0004942851017921451, "loss": 41.802, "step": 1382 }, { "epoch": 3.6526906569825024, "grad_norm": 1127.0296630859375, "learning_rate": 0.0004942762169077423, "loss": 42.1815, "step": 1383 }, { "epoch": 3.6553317926708484, "grad_norm": 2646.927734375, "learning_rate": 0.0004942673252020695, "loss": 43.1447, "step": 1384 }, { "epoch": 3.6579729283591944, "grad_norm": 3813.26416015625, "learning_rate": 0.0004942584266753752, "loss": 45.8015, "step": 1385 }, { "epoch": 3.6606140640475404, "grad_norm": 10412.9326171875, "learning_rate": 0.0004942495213279078, "loss": 50.4673, "step": 1386 }, { "epoch": 3.6632551997358864, "grad_norm": 581.6724853515625, "learning_rate": 0.0004942406091599159, "loss": 40.804, "step": 1387 }, { "epoch": 3.6658963354242324, "grad_norm": 1192.6298828125, "learning_rate": 0.0004942316901716486, "loss": 42.7365, "step": 1388 }, { "epoch": 3.6685374711125784, "grad_norm": 627.3260498046875, "learning_rate": 0.0004942227643633548, "loss": 42.8589, "step": 1389 }, { "epoch": 3.6711786068009244, "grad_norm": 913.6702880859375, "learning_rate": 0.0004942138317352837, "loss": 44.812, "step": 1390 }, { "epoch": 3.6738197424892705, "grad_norm": 794.9915161132812, "learning_rate": 0.0004942048922876849, "loss": 47.7547, "step": 1391 }, { "epoch": 3.6764608781776165, "grad_norm": 816.4144287109375, "learning_rate": 0.0004941959460208078, "loss": 45.472, "step": 1392 }, { "epoch": 3.6791020138659625, "grad_norm": 666.3663940429688, "learning_rate": 0.0004941869929349026, "loss": 45.4777, "step": 1393 }, { "epoch": 3.6817431495543085, "grad_norm": 701.1419067382812, "learning_rate": 0.0004941780330302188, "loss": 44.2179, "step": 1394 }, { "epoch": 3.6843842852426545, "grad_norm": 1001.0601806640625, "learning_rate": 0.000494169066307007, "loss": 47.1556, "step": 1395 }, { "epoch": 3.6870254209310005, "grad_norm": 883.8333129882812, "learning_rate": 0.0004941600927655174, "loss": 45.8755, "step": 1396 }, { "epoch": 3.689666556619346, "grad_norm": 787.4949340820312, "learning_rate": 0.0004941511124060007, "loss": 44.1875, "step": 1397 }, { "epoch": 3.6923076923076925, "grad_norm": 528.1007690429688, "learning_rate": 0.0004941421252287077, "loss": 45.9079, "step": 1398 }, { "epoch": 3.694948827996038, "grad_norm": 1085.2730712890625, "learning_rate": 0.000494133131233889, "loss": 42.9431, "step": 1399 }, { "epoch": 3.6975899636843845, "grad_norm": 541.5074462890625, "learning_rate": 0.0004941241304217962, "loss": 39.7586, "step": 1400 }, { "epoch": 3.6975899636843845, "eval_loss": 6.229172706604004, "eval_runtime": 2.2358, "eval_samples_per_second": 221.395, "eval_steps_per_second": 27.73, "step": 1400 }, { "epoch": 3.70023109937273, "grad_norm": 392.1261291503906, "learning_rate": 0.0004941151227926805, "loss": 39.5549, "step": 1401 }, { "epoch": 3.702872235061076, "grad_norm": 523.9341430664062, "learning_rate": 0.0004941061083467932, "loss": 40.7553, "step": 1402 }, { "epoch": 3.705513370749422, "grad_norm": 1175.22607421875, "learning_rate": 0.0004940970870843863, "loss": 38.6074, "step": 1403 }, { "epoch": 3.708154506437768, "grad_norm": 631.500244140625, "learning_rate": 0.0004940880590057117, "loss": 38.5513, "step": 1404 }, { "epoch": 3.710795642126114, "grad_norm": 541.426513671875, "learning_rate": 0.0004940790241110215, "loss": 38.5976, "step": 1405 }, { "epoch": 3.71343677781446, "grad_norm": 436.00848388671875, "learning_rate": 0.0004940699824005679, "loss": 39.536, "step": 1406 }, { "epoch": 3.716077913502806, "grad_norm": 646.9573364257812, "learning_rate": 0.0004940609338746032, "loss": 39.1712, "step": 1407 }, { "epoch": 3.718719049191152, "grad_norm": 2069.99365234375, "learning_rate": 0.0004940518785333806, "loss": 61.511, "step": 1408 }, { "epoch": 3.721360184879498, "grad_norm": 11324.400390625, "learning_rate": 0.0004940428163771525, "loss": 85.8988, "step": 1409 }, { "epoch": 3.724001320567844, "grad_norm": 8503.0244140625, "learning_rate": 0.0004940337474061722, "loss": 95.534, "step": 1410 }, { "epoch": 3.72664245625619, "grad_norm": 11534.24609375, "learning_rate": 0.0004940246716206927, "loss": 79.034, "step": 1411 }, { "epoch": 3.729283591944536, "grad_norm": 4507.61181640625, "learning_rate": 0.0004940155890209676, "loss": 80.8433, "step": 1412 }, { "epoch": 3.731924727632882, "grad_norm": 5017.57958984375, "learning_rate": 0.0004940064996072506, "loss": 62.897, "step": 1413 }, { "epoch": 3.734565863321228, "grad_norm": 5061.40869140625, "learning_rate": 0.0004939974033797955, "loss": 49.9759, "step": 1414 }, { "epoch": 3.737206999009574, "grad_norm": 11513.119140625, "learning_rate": 0.0004939883003388561, "loss": 43.1898, "step": 1415 }, { "epoch": 3.73984813469792, "grad_norm": 3485.0546875, "learning_rate": 0.0004939791904846869, "loss": 38.0407, "step": 1416 }, { "epoch": 3.742489270386266, "grad_norm": 6117.533203125, "learning_rate": 0.000493970073817542, "loss": 41.991, "step": 1417 }, { "epoch": 3.7451304060746122, "grad_norm": 2274.865966796875, "learning_rate": 0.0004939609503376762, "loss": 52.0271, "step": 1418 }, { "epoch": 3.7477715417629582, "grad_norm": 2655.669189453125, "learning_rate": 0.0004939518200453441, "loss": 93.4456, "step": 1419 }, { "epoch": 3.750412677451304, "grad_norm": 2542.621337890625, "learning_rate": 0.0004939426829408007, "loss": 88.1889, "step": 1420 }, { "epoch": 3.7530538131396503, "grad_norm": 3659.952392578125, "learning_rate": 0.0004939335390243012, "loss": 73.3621, "step": 1421 }, { "epoch": 3.755694948827996, "grad_norm": 2463.96142578125, "learning_rate": 0.000493924388296101, "loss": 53.7965, "step": 1422 }, { "epoch": 3.7583360845163423, "grad_norm": 2718.474365234375, "learning_rate": 0.0004939152307564554, "loss": 44.7724, "step": 1423 }, { "epoch": 3.760977220204688, "grad_norm": 647.5338745117188, "learning_rate": 0.0004939060664056204, "loss": 42.6283, "step": 1424 }, { "epoch": 3.763618355893034, "grad_norm": 1372.169921875, "learning_rate": 0.0004938968952438518, "loss": 43.6322, "step": 1425 }, { "epoch": 3.76625949158138, "grad_norm": 1470.2919921875, "learning_rate": 0.0004938877172714055, "loss": 47.2875, "step": 1426 }, { "epoch": 3.768900627269726, "grad_norm": 1964.597900390625, "learning_rate": 0.0004938785324885381, "loss": 49.0609, "step": 1427 }, { "epoch": 3.771541762958072, "grad_norm": 1415.9771728515625, "learning_rate": 0.0004938693408955058, "loss": 49.884, "step": 1428 }, { "epoch": 3.774182898646418, "grad_norm": 3321.373291015625, "learning_rate": 0.0004938601424925656, "loss": 48.2657, "step": 1429 }, { "epoch": 3.776824034334764, "grad_norm": 1065.3643798828125, "learning_rate": 0.0004938509372799741, "loss": 43.3875, "step": 1430 }, { "epoch": 3.77946517002311, "grad_norm": 1438.181884765625, "learning_rate": 0.0004938417252579884, "loss": 41.3828, "step": 1431 }, { "epoch": 3.782106305711456, "grad_norm": 1169.4095458984375, "learning_rate": 0.0004938325064268658, "loss": 41.4898, "step": 1432 }, { "epoch": 3.784747441399802, "grad_norm": 698.364990234375, "learning_rate": 0.0004938232807868636, "loss": 40.6331, "step": 1433 }, { "epoch": 3.787388577088148, "grad_norm": 2472.00390625, "learning_rate": 0.0004938140483382396, "loss": 42.51, "step": 1434 }, { "epoch": 3.790029712776494, "grad_norm": 2663.828369140625, "learning_rate": 0.0004938048090812515, "loss": 44.3657, "step": 1435 }, { "epoch": 3.79267084846484, "grad_norm": 4820.67919921875, "learning_rate": 0.0004937955630161573, "loss": 48.9615, "step": 1436 }, { "epoch": 3.795311984153186, "grad_norm": 1040.351806640625, "learning_rate": 0.0004937863101432153, "loss": 45.3354, "step": 1437 }, { "epoch": 3.797953119841532, "grad_norm": 1161.235595703125, "learning_rate": 0.0004937770504626838, "loss": 45.5917, "step": 1438 }, { "epoch": 3.800594255529878, "grad_norm": 806.2498168945312, "learning_rate": 0.0004937677839748212, "loss": 43.5523, "step": 1439 }, { "epoch": 3.803235391218224, "grad_norm": 967.9638061523438, "learning_rate": 0.0004937585106798866, "loss": 46.3986, "step": 1440 }, { "epoch": 3.80587652690657, "grad_norm": 1374.24755859375, "learning_rate": 0.0004937492305781386, "loss": 45.7523, "step": 1441 }, { "epoch": 3.808517662594916, "grad_norm": 1328.041259765625, "learning_rate": 0.0004937399436698367, "loss": 46.3677, "step": 1442 }, { "epoch": 3.8111587982832615, "grad_norm": 1254.5062255859375, "learning_rate": 0.00049373064995524, "loss": 45.8027, "step": 1443 }, { "epoch": 3.813799933971608, "grad_norm": 1092.40771484375, "learning_rate": 0.0004937213494346081, "loss": 48.5736, "step": 1444 }, { "epoch": 3.8164410696599536, "grad_norm": 817.121826171875, "learning_rate": 0.0004937120421082006, "loss": 45.63, "step": 1445 }, { "epoch": 3.8190822053483, "grad_norm": 690.509521484375, "learning_rate": 0.0004937027279762776, "loss": 45.1403, "step": 1446 }, { "epoch": 3.8217233410366456, "grad_norm": 1166.6495361328125, "learning_rate": 0.0004936934070390991, "loss": 44.0714, "step": 1447 }, { "epoch": 3.824364476724992, "grad_norm": 1660.225830078125, "learning_rate": 0.0004936840792969251, "loss": 41.3165, "step": 1448 }, { "epoch": 3.8270056124133376, "grad_norm": 584.9605102539062, "learning_rate": 0.0004936747447500166, "loss": 41.73, "step": 1449 }, { "epoch": 3.8296467481016836, "grad_norm": 638.0139770507812, "learning_rate": 0.000493665403398634, "loss": 39.8063, "step": 1450 }, { "epoch": 3.8322878837900296, "grad_norm": 950.7485961914062, "learning_rate": 0.000493656055243038, "loss": 41.0567, "step": 1451 }, { "epoch": 3.8349290194783756, "grad_norm": 638.3499145507812, "learning_rate": 0.0004936467002834898, "loss": 39.4001, "step": 1452 }, { "epoch": 3.8375701551667216, "grad_norm": 927.4039916992188, "learning_rate": 0.0004936373385202507, "loss": 39.463, "step": 1453 }, { "epoch": 3.8402112908550676, "grad_norm": 775.9244995117188, "learning_rate": 0.000493627969953582, "loss": 39.0761, "step": 1454 }, { "epoch": 3.8428524265434136, "grad_norm": 1156.39453125, "learning_rate": 0.0004936185945837453, "loss": 40.3654, "step": 1455 }, { "epoch": 3.8454935622317596, "grad_norm": 934.4600219726562, "learning_rate": 0.0004936092124110026, "loss": 40.6758, "step": 1456 }, { "epoch": 3.8481346979201057, "grad_norm": 1122.4742431640625, "learning_rate": 0.0004935998234356156, "loss": 42.3039, "step": 1457 }, { "epoch": 3.8507758336084517, "grad_norm": 14596.3466796875, "learning_rate": 0.0004935904276578468, "loss": 46.9809, "step": 1458 }, { "epoch": 3.8534169692967977, "grad_norm": 15225.3369140625, "learning_rate": 0.0004935810250779583, "loss": 79.1032, "step": 1459 }, { "epoch": 3.8560581049851437, "grad_norm": 10102.56640625, "learning_rate": 0.0004935716156962127, "loss": 85.9251, "step": 1460 }, { "epoch": 3.8586992406734897, "grad_norm": 11658.4931640625, "learning_rate": 0.0004935621995128729, "loss": 67.4967, "step": 1461 }, { "epoch": 3.8613403763618357, "grad_norm": 5131.83203125, "learning_rate": 0.0004935527765282017, "loss": 75.9526, "step": 1462 }, { "epoch": 3.8639815120501817, "grad_norm": 8004.1708984375, "learning_rate": 0.0004935433467424624, "loss": 67.8714, "step": 1463 }, { "epoch": 3.8666226477385277, "grad_norm": 6697.64697265625, "learning_rate": 0.0004935339101559181, "loss": 56.0895, "step": 1464 }, { "epoch": 3.8692637834268737, "grad_norm": 16532.58984375, "learning_rate": 0.0004935244667688324, "loss": 52.4966, "step": 1465 }, { "epoch": 3.8719049191152193, "grad_norm": 11668.5234375, "learning_rate": 0.0004935150165814691, "loss": 44.6439, "step": 1466 }, { "epoch": 3.8745460548035657, "grad_norm": 10116.9580078125, "learning_rate": 0.000493505559594092, "loss": 38.8484, "step": 1467 }, { "epoch": 3.8771871904919113, "grad_norm": 9433.0166015625, "learning_rate": 0.0004934960958069651, "loss": 56.4591, "step": 1468 }, { "epoch": 3.8798283261802577, "grad_norm": 3586.513671875, "learning_rate": 0.0004934866252203528, "loss": 90.2097, "step": 1469 }, { "epoch": 3.8824694618686033, "grad_norm": 5447.76611328125, "learning_rate": 0.0004934771478345196, "loss": 81.8686, "step": 1470 }, { "epoch": 3.8851105975569498, "grad_norm": 11309.046875, "learning_rate": 0.0004934676636497299, "loss": 70.123, "step": 1471 }, { "epoch": 3.8877517332452953, "grad_norm": 87251.8046875, "learning_rate": 0.0004934581726662489, "loss": 64.1462, "step": 1472 }, { "epoch": 3.8903928689336413, "grad_norm": 2581.40283203125, "learning_rate": 0.0004934486748843414, "loss": 54.1393, "step": 1473 }, { "epoch": 3.8930340046219873, "grad_norm": 1449.21240234375, "learning_rate": 0.0004934391703042727, "loss": 48.6953, "step": 1474 }, { "epoch": 3.8956751403103334, "grad_norm": 2484.354736328125, "learning_rate": 0.000493429658926308, "loss": 41.2107, "step": 1475 }, { "epoch": 3.8983162759986794, "grad_norm": 1057.947265625, "learning_rate": 0.0004934201407507132, "loss": 40.2737, "step": 1476 }, { "epoch": 3.9009574116870254, "grad_norm": 1339.9168701171875, "learning_rate": 0.000493410615777754, "loss": 40.4053, "step": 1477 }, { "epoch": 3.9035985473753714, "grad_norm": 1182.9263916015625, "learning_rate": 0.0004934010840076963, "loss": 42.5407, "step": 1478 }, { "epoch": 3.9062396830637174, "grad_norm": 905.4882202148438, "learning_rate": 0.0004933915454408063, "loss": 41.2914, "step": 1479 }, { "epoch": 3.9088808187520634, "grad_norm": 3700.651611328125, "learning_rate": 0.0004933820000773503, "loss": 41.4313, "step": 1480 }, { "epoch": 3.9115219544404094, "grad_norm": 1274.091796875, "learning_rate": 0.000493372447917595, "loss": 43.909, "step": 1481 }, { "epoch": 3.9141630901287554, "grad_norm": 1234.7745361328125, "learning_rate": 0.0004933628889618071, "loss": 42.6683, "step": 1482 }, { "epoch": 3.9168042258171014, "grad_norm": 950.7550048828125, "learning_rate": 0.0004933533232102534, "loss": 42.4669, "step": 1483 }, { "epoch": 3.9194453615054474, "grad_norm": 1421.108642578125, "learning_rate": 0.0004933437506632011, "loss": 43.8868, "step": 1484 }, { "epoch": 3.9220864971937934, "grad_norm": 1637.24267578125, "learning_rate": 0.0004933341713209175, "loss": 42.7557, "step": 1485 }, { "epoch": 3.9247276328821394, "grad_norm": 3259.13525390625, "learning_rate": 0.0004933245851836701, "loss": 44.8909, "step": 1486 }, { "epoch": 3.9273687685704854, "grad_norm": 1309.0469970703125, "learning_rate": 0.0004933149922517268, "loss": 39.2061, "step": 1487 }, { "epoch": 3.9300099042588315, "grad_norm": 905.396240234375, "learning_rate": 0.0004933053925253551, "loss": 44.0229, "step": 1488 }, { "epoch": 3.932651039947177, "grad_norm": 684.1885986328125, "learning_rate": 0.0004932957860048232, "loss": 47.5071, "step": 1489 }, { "epoch": 3.9352921756355235, "grad_norm": 940.9295654296875, "learning_rate": 0.0004932861726903995, "loss": 47.6781, "step": 1490 }, { "epoch": 3.937933311323869, "grad_norm": 1200.91357421875, "learning_rate": 0.0004932765525823522, "loss": 46.3698, "step": 1491 }, { "epoch": 3.9405744470122155, "grad_norm": 730.3759765625, "learning_rate": 0.0004932669256809502, "loss": 44.241, "step": 1492 }, { "epoch": 3.943215582700561, "grad_norm": 784.46630859375, "learning_rate": 0.0004932572919864622, "loss": 43.334, "step": 1493 }, { "epoch": 3.9458567183889075, "grad_norm": 994.7269287109375, "learning_rate": 0.0004932476514991572, "loss": 42.3295, "step": 1494 }, { "epoch": 3.948497854077253, "grad_norm": 728.2947387695312, "learning_rate": 0.0004932380042193044, "loss": 40.6598, "step": 1495 }, { "epoch": 3.951138989765599, "grad_norm": 584.9619140625, "learning_rate": 0.0004932283501471733, "loss": 38.6282, "step": 1496 }, { "epoch": 3.953780125453945, "grad_norm": 1198.6883544921875, "learning_rate": 0.0004932186892830332, "loss": 38.5442, "step": 1497 }, { "epoch": 3.956421261142291, "grad_norm": 783.4630126953125, "learning_rate": 0.0004932090216271543, "loss": 39.5384, "step": 1498 }, { "epoch": 3.959062396830637, "grad_norm": 8056.69677734375, "learning_rate": 0.0004931993471798062, "loss": 52.6835, "step": 1499 }, { "epoch": 3.961703532518983, "grad_norm": 16114.19921875, "learning_rate": 0.0004931896659412593, "loss": 87.9953, "step": 1500 }, { "epoch": 3.964344668207329, "grad_norm": 14551.572265625, "learning_rate": 0.0004931799779117838, "loss": 74.9067, "step": 1501 }, { "epoch": 3.966985803895675, "grad_norm": 7261.18994140625, "learning_rate": 0.0004931702830916502, "loss": 76.987, "step": 1502 }, { "epoch": 3.969626939584021, "grad_norm": 46439.0390625, "learning_rate": 0.0004931605814811294, "loss": 81.4869, "step": 1503 }, { "epoch": 3.972268075272367, "grad_norm": 20111.017578125, "learning_rate": 0.0004931508730804922, "loss": 77.2337, "step": 1504 }, { "epoch": 3.974909210960713, "grad_norm": 1154.89697265625, "learning_rate": 0.0004931411578900096, "loss": 38.7959, "step": 1505 }, { "epoch": 3.977550346649059, "grad_norm": 1353.2706298828125, "learning_rate": 0.000493131435909953, "loss": 40.7577, "step": 1506 }, { "epoch": 3.980191482337405, "grad_norm": 1432.0174560546875, "learning_rate": 0.0004931217071405939, "loss": 38.5414, "step": 1507 }, { "epoch": 3.982832618025751, "grad_norm": 1098.754150390625, "learning_rate": 0.000493111971582204, "loss": 41.4393, "step": 1508 }, { "epoch": 3.985473753714097, "grad_norm": 1069.7821044921875, "learning_rate": 0.000493102229235055, "loss": 40.6039, "step": 1509 }, { "epoch": 3.988114889402443, "grad_norm": 747.291748046875, "learning_rate": 0.0004930924800994192, "loss": 39.5674, "step": 1510 }, { "epoch": 3.990756025090789, "grad_norm": 1372.225341796875, "learning_rate": 0.0004930827241755685, "loss": 41.5965, "step": 1511 }, { "epoch": 3.9933971607791348, "grad_norm": 3467.607421875, "learning_rate": 0.0004930729614637757, "loss": 39.5515, "step": 1512 }, { "epoch": 3.996038296467481, "grad_norm": 1039.3388671875, "learning_rate": 0.0004930631919643131, "loss": 40.0585, "step": 1513 }, { "epoch": 3.9986794321558268, "grad_norm": 1826.7330322265625, "learning_rate": 0.0004930534156774539, "loss": 44.5235, "step": 1514 }, { "epoch": 4.001320567844173, "grad_norm": 2184.44482421875, "learning_rate": 0.0004930436326034706, "loss": 48.355, "step": 1515 }, { "epoch": 4.003961703532519, "grad_norm": 1055.990478515625, "learning_rate": 0.0004930338427426367, "loss": 40.372, "step": 1516 }, { "epoch": 4.006602839220865, "grad_norm": 851.187744140625, "learning_rate": 0.0004930240460952255, "loss": 42.7905, "step": 1517 }, { "epoch": 4.009243974909211, "grad_norm": 1450.66259765625, "learning_rate": 0.0004930142426615105, "loss": 45.9382, "step": 1518 }, { "epoch": 4.011885110597557, "grad_norm": 756.2283325195312, "learning_rate": 0.0004930044324417657, "loss": 46.4536, "step": 1519 }, { "epoch": 4.014526246285903, "grad_norm": 788.4192504882812, "learning_rate": 0.0004929946154362647, "loss": 45.3561, "step": 1520 }, { "epoch": 4.017167381974249, "grad_norm": 889.8265380859375, "learning_rate": 0.0004929847916452819, "loss": 49.0868, "step": 1521 }, { "epoch": 4.019808517662595, "grad_norm": 1415.7374267578125, "learning_rate": 0.0004929749610690916, "loss": 44.1158, "step": 1522 }, { "epoch": 4.022449653350941, "grad_norm": 2367.350830078125, "learning_rate": 0.0004929651237079681, "loss": 47.0966, "step": 1523 }, { "epoch": 4.025090789039287, "grad_norm": 704.4037475585938, "learning_rate": 0.0004929552795621863, "loss": 44.1998, "step": 1524 }, { "epoch": 4.027731924727633, "grad_norm": 1058.278564453125, "learning_rate": 0.0004929454286320211, "loss": 46.7498, "step": 1525 }, { "epoch": 4.030373060415979, "grad_norm": 725.4813842773438, "learning_rate": 0.0004929355709177474, "loss": 43.0953, "step": 1526 }, { "epoch": 4.033014196104324, "grad_norm": 593.431884765625, "learning_rate": 0.0004929257064196407, "loss": 45.0746, "step": 1527 }, { "epoch": 4.035655331792671, "grad_norm": 572.0050048828125, "learning_rate": 0.0004929158351379763, "loss": 42.6794, "step": 1528 }, { "epoch": 4.0382964674810164, "grad_norm": 443.2017517089844, "learning_rate": 0.0004929059570730298, "loss": 40.7112, "step": 1529 }, { "epoch": 4.040937603169363, "grad_norm": 556.7572021484375, "learning_rate": 0.0004928960722250774, "loss": 40.6311, "step": 1530 }, { "epoch": 4.0435787388577085, "grad_norm": 608.9827880859375, "learning_rate": 0.0004928861805943947, "loss": 39.6746, "step": 1531 }, { "epoch": 4.046219874546055, "grad_norm": 598.2105102539062, "learning_rate": 0.000492876282181258, "loss": 38.0506, "step": 1532 }, { "epoch": 4.0488610102344005, "grad_norm": 412.9190979003906, "learning_rate": 0.000492866376985944, "loss": 38.2878, "step": 1533 }, { "epoch": 4.051502145922747, "grad_norm": 425.35223388671875, "learning_rate": 0.000492856465008729, "loss": 39.6825, "step": 1534 }, { "epoch": 4.0541432816110925, "grad_norm": 443.9073181152344, "learning_rate": 0.0004928465462498897, "loss": 39.8797, "step": 1535 }, { "epoch": 4.056784417299439, "grad_norm": 355.26416015625, "learning_rate": 0.0004928366207097034, "loss": 40.0527, "step": 1536 }, { "epoch": 4.0594255529877845, "grad_norm": 2360.784423828125, "learning_rate": 0.0004928266883884473, "loss": 46.9304, "step": 1537 }, { "epoch": 4.062066688676131, "grad_norm": 7155.40869140625, "learning_rate": 0.0004928167492863983, "loss": 71.261, "step": 1538 }, { "epoch": 4.0647078243644765, "grad_norm": 27863.37890625, "learning_rate": 0.0004928068034038343, "loss": 71.7428, "step": 1539 }, { "epoch": 4.067348960052823, "grad_norm": 8729.9853515625, "learning_rate": 0.000492796850741033, "loss": 58.2446, "step": 1540 }, { "epoch": 4.0699900957411685, "grad_norm": 4844.0537109375, "learning_rate": 0.0004927868912982722, "loss": 63.8549, "step": 1541 }, { "epoch": 4.072631231429515, "grad_norm": 6864.69091796875, "learning_rate": 0.0004927769250758301, "loss": 59.2049, "step": 1542 }, { "epoch": 4.075272367117861, "grad_norm": 3402.654541015625, "learning_rate": 0.000492766952073985, "loss": 50.5234, "step": 1543 }, { "epoch": 4.077913502806207, "grad_norm": 3503.338134765625, "learning_rate": 0.0004927569722930153, "loss": 40.917, "step": 1544 }, { "epoch": 4.080554638494553, "grad_norm": 1868.1944580078125, "learning_rate": 0.0004927469857331999, "loss": 38.0444, "step": 1545 }, { "epoch": 4.083195774182899, "grad_norm": 48569.14453125, "learning_rate": 0.0004927369923948174, "loss": 35.6827, "step": 1546 }, { "epoch": 4.085836909871245, "grad_norm": 5738.58154296875, "learning_rate": 0.0004927269922781471, "loss": 116.5993, "step": 1547 }, { "epoch": 4.088478045559591, "grad_norm": 4262.474609375, "learning_rate": 0.000492716985383468, "loss": 133.1989, "step": 1548 }, { "epoch": 4.091119181247937, "grad_norm": 3085.093505859375, "learning_rate": 0.0004927069717110597, "loss": 117.6441, "step": 1549 }, { "epoch": 4.093760316936282, "grad_norm": 1947.2366943359375, "learning_rate": 0.0004926969512612018, "loss": 85.2022, "step": 1550 }, { "epoch": 4.096401452624629, "grad_norm": 1760.8145751953125, "learning_rate": 0.000492686924034174, "loss": 57.0652, "step": 1551 }, { "epoch": 4.099042588312974, "grad_norm": 558.0374145507812, "learning_rate": 0.0004926768900302566, "loss": 44.3407, "step": 1552 }, { "epoch": 4.101683724001321, "grad_norm": 957.1454467773438, "learning_rate": 0.0004926668492497295, "loss": 50.1984, "step": 1553 }, { "epoch": 4.104324859689666, "grad_norm": 2303.201904296875, "learning_rate": 0.0004926568016928732, "loss": 53.1126, "step": 1554 }, { "epoch": 4.106965995378013, "grad_norm": 1654.3519287109375, "learning_rate": 0.0004926467473599682, "loss": 49.8198, "step": 1555 }, { "epoch": 4.109607131066358, "grad_norm": 1636.0206298828125, "learning_rate": 0.0004926366862512954, "loss": 41.7732, "step": 1556 }, { "epoch": 4.112248266754705, "grad_norm": 621.9669189453125, "learning_rate": 0.0004926266183671356, "loss": 43.9553, "step": 1557 }, { "epoch": 4.11488940244305, "grad_norm": 516.4110107421875, "learning_rate": 0.0004926165437077701, "loss": 43.0518, "step": 1558 }, { "epoch": 4.117530538131397, "grad_norm": 574.6517944335938, "learning_rate": 0.00049260646227348, "loss": 45.1688, "step": 1559 }, { "epoch": 4.120171673819742, "grad_norm": 573.45751953125, "learning_rate": 0.0004925963740645471, "loss": 42.9552, "step": 1560 }, { "epoch": 4.122812809508089, "grad_norm": 360.7505798339844, "learning_rate": 0.0004925862790812528, "loss": 41.8827, "step": 1561 }, { "epoch": 4.125453945196434, "grad_norm": 521.3345947265625, "learning_rate": 0.0004925761773238794, "loss": 43.3203, "step": 1562 }, { "epoch": 4.128095080884781, "grad_norm": 796.8755493164062, "learning_rate": 0.0004925660687927086, "loss": 44.5733, "step": 1563 }, { "epoch": 4.130736216573126, "grad_norm": 860.1605834960938, "learning_rate": 0.0004925559534880228, "loss": 48.0024, "step": 1564 }, { "epoch": 4.133377352261473, "grad_norm": 963.8115844726562, "learning_rate": 0.0004925458314101045, "loss": 48.214, "step": 1565 }, { "epoch": 4.136018487949818, "grad_norm": 421.76153564453125, "learning_rate": 0.0004925357025592364, "loss": 43.1153, "step": 1566 }, { "epoch": 4.138659623638165, "grad_norm": 378.4088134765625, "learning_rate": 0.0004925255669357011, "loss": 43.3883, "step": 1567 }, { "epoch": 4.14130075932651, "grad_norm": 345.6076965332031, "learning_rate": 0.000492515424539782, "loss": 44.5568, "step": 1568 }, { "epoch": 4.143941895014857, "grad_norm": 587.745361328125, "learning_rate": 0.000492505275371762, "loss": 48.7716, "step": 1569 }, { "epoch": 4.146583030703202, "grad_norm": 663.4636840820312, "learning_rate": 0.0004924951194319249, "loss": 44.8433, "step": 1570 }, { "epoch": 4.149224166391549, "grad_norm": 441.9830627441406, "learning_rate": 0.0004924849567205538, "loss": 48.0399, "step": 1571 }, { "epoch": 4.151865302079894, "grad_norm": 715.9013671875, "learning_rate": 0.0004924747872379327, "loss": 46.2425, "step": 1572 }, { "epoch": 4.154506437768241, "grad_norm": 610.5821533203125, "learning_rate": 0.0004924646109843457, "loss": 45.2533, "step": 1573 }, { "epoch": 4.157147573456586, "grad_norm": 3629.3056640625, "learning_rate": 0.0004924544279600768, "loss": 44.3073, "step": 1574 }, { "epoch": 4.159788709144932, "grad_norm": 758.4008178710938, "learning_rate": 0.0004924442381654105, "loss": 43.3922, "step": 1575 }, { "epoch": 4.162429844833278, "grad_norm": 639.306640625, "learning_rate": 0.0004924340416006312, "loss": 43.2824, "step": 1576 }, { "epoch": 4.165070980521624, "grad_norm": 481.6611022949219, "learning_rate": 0.0004924238382660237, "loss": 42.3711, "step": 1577 }, { "epoch": 4.16771211620997, "grad_norm": 527.7535400390625, "learning_rate": 0.000492413628161873, "loss": 44.0774, "step": 1578 }, { "epoch": 4.170353251898316, "grad_norm": 608.364990234375, "learning_rate": 0.0004924034112884642, "loss": 39.2281, "step": 1579 }, { "epoch": 4.172994387586662, "grad_norm": 521.40087890625, "learning_rate": 0.0004923931876460824, "loss": 42.9242, "step": 1580 }, { "epoch": 4.175635523275008, "grad_norm": 758.8186645507812, "learning_rate": 0.0004923829572350132, "loss": 39.2979, "step": 1581 }, { "epoch": 4.178276658963354, "grad_norm": 1507.8509521484375, "learning_rate": 0.0004923727200555425, "loss": 40.9214, "step": 1582 }, { "epoch": 4.1809177946517, "grad_norm": 2210.26220703125, "learning_rate": 0.0004923624761079558, "loss": 39.9794, "step": 1583 }, { "epoch": 4.1835589303400464, "grad_norm": 609.2326049804688, "learning_rate": 0.0004923522253925393, "loss": 42.4219, "step": 1584 }, { "epoch": 4.186200066028392, "grad_norm": 649.5781860351562, "learning_rate": 0.0004923419679095794, "loss": 37.7452, "step": 1585 }, { "epoch": 4.1888412017167385, "grad_norm": 753.3731079101562, "learning_rate": 0.0004923317036593623, "loss": 42.1164, "step": 1586 }, { "epoch": 4.191482337405084, "grad_norm": 4493.6796875, "learning_rate": 0.0004923214326421748, "loss": 66.5831, "step": 1587 }, { "epoch": 4.1941234730934305, "grad_norm": 32335.9375, "learning_rate": 0.0004923111548583035, "loss": 96.8277, "step": 1588 }, { "epoch": 4.196764608781776, "grad_norm": 12412.3115234375, "learning_rate": 0.0004923008703080357, "loss": 98.6379, "step": 1589 }, { "epoch": 4.1994057444701225, "grad_norm": 7940.61865234375, "learning_rate": 0.0004922905789916584, "loss": 89.2679, "step": 1590 }, { "epoch": 4.202046880158468, "grad_norm": 23218.8828125, "learning_rate": 0.000492280280909459, "loss": 95.9534, "step": 1591 }, { "epoch": 4.2046880158468145, "grad_norm": 66858.4765625, "learning_rate": 0.0004922699760617251, "loss": 92.8087, "step": 1592 }, { "epoch": 4.20732915153516, "grad_norm": 12613.279296875, "learning_rate": 0.0004922596644487444, "loss": 82.982, "step": 1593 }, { "epoch": 4.2099702872235065, "grad_norm": 10904.6513671875, "learning_rate": 0.0004922493460708049, "loss": 72.1162, "step": 1594 }, { "epoch": 4.212611422911852, "grad_norm": 16301.87890625, "learning_rate": 0.0004922390209281946, "loss": 64.3076, "step": 1595 }, { "epoch": 4.2152525586001985, "grad_norm": 33857.296875, "learning_rate": 0.0004922286890212021, "loss": 58.8416, "step": 1596 }, { "epoch": 4.217893694288544, "grad_norm": 2438.244873046875, "learning_rate": 0.0004922183503501157, "loss": 49.5762, "step": 1597 }, { "epoch": 4.22053482997689, "grad_norm": 1783.349609375, "learning_rate": 0.0004922080049152241, "loss": 44.899, "step": 1598 }, { "epoch": 4.223175965665236, "grad_norm": 1561.496337890625, "learning_rate": 0.0004921976527168164, "loss": 45.7948, "step": 1599 }, { "epoch": 4.225817101353582, "grad_norm": 1929.7452392578125, "learning_rate": 0.0004921872937551814, "loss": 48.8855, "step": 1600 }, { "epoch": 4.225817101353582, "eval_loss": 5.445690155029297, "eval_runtime": 2.166, "eval_samples_per_second": 228.535, "eval_steps_per_second": 28.625, "step": 1600 }, { "epoch": 4.228458237041928, "grad_norm": 1343.175537109375, "learning_rate": 0.0004921769280306084, "loss": 42.126, "step": 1601 }, { "epoch": 4.231099372730274, "grad_norm": 924.0390014648438, "learning_rate": 0.000492166555543387, "loss": 41.1278, "step": 1602 }, { "epoch": 4.23374050841862, "grad_norm": 1502.5823974609375, "learning_rate": 0.0004921561762938069, "loss": 42.0888, "step": 1603 }, { "epoch": 4.236381644106966, "grad_norm": 1232.1533203125, "learning_rate": 0.0004921457902821578, "loss": 42.4099, "step": 1604 }, { "epoch": 4.239022779795312, "grad_norm": 1054.8580322265625, "learning_rate": 0.0004921353975087297, "loss": 41.5533, "step": 1605 }, { "epoch": 4.241663915483658, "grad_norm": 1911.3525390625, "learning_rate": 0.0004921249979738129, "loss": 43.7778, "step": 1606 }, { "epoch": 4.244305051172004, "grad_norm": 1181.72607421875, "learning_rate": 0.0004921145916776977, "loss": 42.7153, "step": 1607 }, { "epoch": 4.24694618686035, "grad_norm": 1446.1981201171875, "learning_rate": 0.0004921041786206748, "loss": 41.8918, "step": 1608 }, { "epoch": 4.249587322548696, "grad_norm": 2063.88671875, "learning_rate": 0.0004920937588030349, "loss": 42.4798, "step": 1609 }, { "epoch": 4.252228458237042, "grad_norm": 1576.2864990234375, "learning_rate": 0.0004920833322250691, "loss": 40.2519, "step": 1610 }, { "epoch": 4.254869593925388, "grad_norm": 1567.892822265625, "learning_rate": 0.0004920728988870684, "loss": 41.2966, "step": 1611 }, { "epoch": 4.257510729613734, "grad_norm": 1659.2830810546875, "learning_rate": 0.0004920624587893242, "loss": 40.7444, "step": 1612 }, { "epoch": 4.26015186530208, "grad_norm": 1779.5322265625, "learning_rate": 0.000492052011932128, "loss": 44.3887, "step": 1613 }, { "epoch": 4.262793000990426, "grad_norm": 1944.67236328125, "learning_rate": 0.0004920415583157716, "loss": 46.3617, "step": 1614 }, { "epoch": 4.265434136678772, "grad_norm": 1485.105712890625, "learning_rate": 0.0004920310979405468, "loss": 45.7549, "step": 1615 }, { "epoch": 4.268075272367118, "grad_norm": 912.8294067382812, "learning_rate": 0.0004920206308067458, "loss": 41.3154, "step": 1616 }, { "epoch": 4.270716408055464, "grad_norm": 1002.8642578125, "learning_rate": 0.0004920101569146609, "loss": 42.5236, "step": 1617 }, { "epoch": 4.27335754374381, "grad_norm": 1042.388916015625, "learning_rate": 0.0004919996762645845, "loss": 41.669, "step": 1618 }, { "epoch": 4.275998679432156, "grad_norm": 1798.9320068359375, "learning_rate": 0.0004919891888568093, "loss": 43.4721, "step": 1619 }, { "epoch": 4.278639815120502, "grad_norm": 731.042724609375, "learning_rate": 0.0004919786946916281, "loss": 45.7159, "step": 1620 }, { "epoch": 4.281280950808847, "grad_norm": 1518.0999755859375, "learning_rate": 0.0004919681937693341, "loss": 49.7016, "step": 1621 }, { "epoch": 4.283922086497194, "grad_norm": 880.864990234375, "learning_rate": 0.0004919576860902203, "loss": 51.1277, "step": 1622 }, { "epoch": 4.286563222185539, "grad_norm": 2406.7978515625, "learning_rate": 0.0004919471716545804, "loss": 50.0791, "step": 1623 }, { "epoch": 4.289204357873886, "grad_norm": 1206.1199951171875, "learning_rate": 0.0004919366504627078, "loss": 46.427, "step": 1624 }, { "epoch": 4.291845493562231, "grad_norm": 1508.3486328125, "learning_rate": 0.0004919261225148963, "loss": 44.1393, "step": 1625 }, { "epoch": 4.294486629250578, "grad_norm": 1084.5855712890625, "learning_rate": 0.0004919155878114399, "loss": 45.5197, "step": 1626 }, { "epoch": 4.2971277649389235, "grad_norm": 1301.67138671875, "learning_rate": 0.000491905046352633, "loss": 44.3993, "step": 1627 }, { "epoch": 4.29976890062727, "grad_norm": 1293.2689208984375, "learning_rate": 0.0004918944981387696, "loss": 42.6101, "step": 1628 }, { "epoch": 4.3024100363156155, "grad_norm": 934.3630981445312, "learning_rate": 0.0004918839431701447, "loss": 41.8585, "step": 1629 }, { "epoch": 4.305051172003962, "grad_norm": 830.1573486328125, "learning_rate": 0.0004918733814470525, "loss": 39.0858, "step": 1630 }, { "epoch": 4.3076923076923075, "grad_norm": 1178.9166259765625, "learning_rate": 0.0004918628129697883, "loss": 40.7945, "step": 1631 }, { "epoch": 4.310333443380654, "grad_norm": 792.4247436523438, "learning_rate": 0.0004918522377386471, "loss": 40.3484, "step": 1632 }, { "epoch": 4.3129745790689995, "grad_norm": 1464.6392822265625, "learning_rate": 0.0004918416557539243, "loss": 39.0975, "step": 1633 }, { "epoch": 4.315615714757346, "grad_norm": 900.1143188476562, "learning_rate": 0.0004918310670159153, "loss": 39.4514, "step": 1634 }, { "epoch": 4.3182568504456915, "grad_norm": 936.521484375, "learning_rate": 0.0004918204715249158, "loss": 40.0108, "step": 1635 }, { "epoch": 4.320897986134038, "grad_norm": 772.832763671875, "learning_rate": 0.0004918098692812217, "loss": 37.936, "step": 1636 }, { "epoch": 4.3235391218223835, "grad_norm": 4753.86865234375, "learning_rate": 0.000491799260285129, "loss": 49.5302, "step": 1637 }, { "epoch": 4.32618025751073, "grad_norm": 18451.095703125, "learning_rate": 0.0004917886445369341, "loss": 84.0741, "step": 1638 }, { "epoch": 4.3288213931990756, "grad_norm": 21239.017578125, "learning_rate": 0.0004917780220369333, "loss": 88.075, "step": 1639 }, { "epoch": 4.331462528887422, "grad_norm": 8009.4775390625, "learning_rate": 0.0004917673927854232, "loss": 85.0808, "step": 1640 }, { "epoch": 4.334103664575768, "grad_norm": 7598.8203125, "learning_rate": 0.0004917567567827007, "loss": 97.172, "step": 1641 }, { "epoch": 4.336744800264114, "grad_norm": 33373.234375, "learning_rate": 0.0004917461140290628, "loss": 82.4406, "step": 1642 }, { "epoch": 4.33938593595246, "grad_norm": 8697.908203125, "learning_rate": 0.0004917354645248066, "loss": 75.5878, "step": 1643 }, { "epoch": 4.342027071640805, "grad_norm": 6433.41455078125, "learning_rate": 0.0004917248082702297, "loss": 67.6967, "step": 1644 }, { "epoch": 4.344668207329152, "grad_norm": 43803.1796875, "learning_rate": 0.0004917141452656294, "loss": 61.0094, "step": 1645 }, { "epoch": 4.347309343017497, "grad_norm": 1586.7449951171875, "learning_rate": 0.0004917034755113036, "loss": 44.0262, "step": 1646 }, { "epoch": 4.349950478705844, "grad_norm": 1852.5281982421875, "learning_rate": 0.0004916927990075504, "loss": 44.5903, "step": 1647 }, { "epoch": 4.352591614394189, "grad_norm": 1054.8751220703125, "learning_rate": 0.0004916821157546676, "loss": 49.4803, "step": 1648 }, { "epoch": 4.355232750082536, "grad_norm": 990.2027587890625, "learning_rate": 0.0004916714257529537, "loss": 45.4461, "step": 1649 }, { "epoch": 4.357873885770881, "grad_norm": 751.5443725585938, "learning_rate": 0.0004916607290027072, "loss": 43.8169, "step": 1650 }, { "epoch": 4.360515021459228, "grad_norm": 450.97027587890625, "learning_rate": 0.0004916500255042268, "loss": 40.1321, "step": 1651 }, { "epoch": 4.363156157147573, "grad_norm": 526.2379150390625, "learning_rate": 0.0004916393152578115, "loss": 39.8515, "step": 1652 }, { "epoch": 4.36579729283592, "grad_norm": 500.67333984375, "learning_rate": 0.0004916285982637602, "loss": 42.5823, "step": 1653 }, { "epoch": 4.368438428524265, "grad_norm": 624.0676879882812, "learning_rate": 0.0004916178745223721, "loss": 40.4453, "step": 1654 }, { "epoch": 4.371079564212612, "grad_norm": 702.5093383789062, "learning_rate": 0.000491607144033947, "loss": 41.0699, "step": 1655 }, { "epoch": 4.373720699900957, "grad_norm": 629.8414306640625, "learning_rate": 0.0004915964067987842, "loss": 41.2751, "step": 1656 }, { "epoch": 4.376361835589304, "grad_norm": 723.1345825195312, "learning_rate": 0.0004915856628171837, "loss": 43.8339, "step": 1657 }, { "epoch": 4.379002971277649, "grad_norm": 748.7872314453125, "learning_rate": 0.0004915749120894455, "loss": 39.6324, "step": 1658 }, { "epoch": 4.381644106965996, "grad_norm": 697.0361328125, "learning_rate": 0.0004915641546158698, "loss": 39.9642, "step": 1659 }, { "epoch": 4.384285242654341, "grad_norm": 779.9351196289062, "learning_rate": 0.0004915533903967568, "loss": 43.1958, "step": 1660 }, { "epoch": 4.386926378342688, "grad_norm": 841.23095703125, "learning_rate": 0.0004915426194324075, "loss": 38.7276, "step": 1661 }, { "epoch": 4.389567514031033, "grad_norm": 448.54608154296875, "learning_rate": 0.0004915318417231223, "loss": 42.7096, "step": 1662 }, { "epoch": 4.39220864971938, "grad_norm": 563.5016479492188, "learning_rate": 0.0004915210572692025, "loss": 41.748, "step": 1663 }, { "epoch": 4.394849785407725, "grad_norm": 719.0343627929688, "learning_rate": 0.0004915102660709488, "loss": 43.8423, "step": 1664 }, { "epoch": 4.397490921096072, "grad_norm": 662.469482421875, "learning_rate": 0.000491499468128663, "loss": 47.5327, "step": 1665 }, { "epoch": 4.400132056784417, "grad_norm": 624.25048828125, "learning_rate": 0.0004914886634426463, "loss": 43.2657, "step": 1666 }, { "epoch": 4.402773192472763, "grad_norm": 352.40557861328125, "learning_rate": 0.0004914778520132006, "loss": 43.0452, "step": 1667 }, { "epoch": 4.405414328161109, "grad_norm": 501.1050109863281, "learning_rate": 0.0004914670338406277, "loss": 43.93, "step": 1668 }, { "epoch": 4.408055463849455, "grad_norm": 565.2608642578125, "learning_rate": 0.0004914562089252297, "loss": 49.5194, "step": 1669 }, { "epoch": 4.410696599537801, "grad_norm": 447.5917053222656, "learning_rate": 0.0004914453772673089, "loss": 48.4459, "step": 1670 }, { "epoch": 4.413337735226147, "grad_norm": 390.73944091796875, "learning_rate": 0.0004914345388671678, "loss": 43.696, "step": 1671 }, { "epoch": 4.415978870914493, "grad_norm": 452.7425231933594, "learning_rate": 0.000491423693725109, "loss": 45.3163, "step": 1672 }, { "epoch": 4.418620006602839, "grad_norm": 436.2398681640625, "learning_rate": 0.0004914128418414354, "loss": 45.5107, "step": 1673 }, { "epoch": 4.421261142291185, "grad_norm": 423.54296875, "learning_rate": 0.00049140198321645, "loss": 44.5737, "step": 1674 }, { "epoch": 4.423902277979531, "grad_norm": 569.5210571289062, "learning_rate": 0.000491391117850456, "loss": 45.309, "step": 1675 }, { "epoch": 4.426543413667877, "grad_norm": 386.33184814453125, "learning_rate": 0.000491380245743757, "loss": 44.159, "step": 1676 }, { "epoch": 4.429184549356223, "grad_norm": 310.943359375, "learning_rate": 0.0004913693668966562, "loss": 42.9151, "step": 1677 }, { "epoch": 4.431825685044569, "grad_norm": 346.4234924316406, "learning_rate": 0.0004913584813094577, "loss": 41.8727, "step": 1678 }, { "epoch": 4.434466820732915, "grad_norm": 331.90350341796875, "learning_rate": 0.0004913475889824655, "loss": 38.7068, "step": 1679 }, { "epoch": 4.437107956421261, "grad_norm": 303.5743713378906, "learning_rate": 0.0004913366899159835, "loss": 41.1695, "step": 1680 }, { "epoch": 4.439749092109607, "grad_norm": 381.688720703125, "learning_rate": 0.0004913257841103162, "loss": 40.3734, "step": 1681 }, { "epoch": 4.4423902277979535, "grad_norm": 298.18115234375, "learning_rate": 0.0004913148715657683, "loss": 37.4906, "step": 1682 }, { "epoch": 4.445031363486299, "grad_norm": 236.26901245117188, "learning_rate": 0.0004913039522826442, "loss": 38.32, "step": 1683 }, { "epoch": 4.4476724991746455, "grad_norm": 264.477783203125, "learning_rate": 0.0004912930262612491, "loss": 38.0227, "step": 1684 }, { "epoch": 4.450313634862991, "grad_norm": 274.4019775390625, "learning_rate": 0.0004912820935018879, "loss": 38.1314, "step": 1685 }, { "epoch": 4.4529547705513375, "grad_norm": 476.60272216796875, "learning_rate": 0.000491271154004866, "loss": 40.5344, "step": 1686 }, { "epoch": 4.455595906239683, "grad_norm": 2889.080810546875, "learning_rate": 0.0004912602077704888, "loss": 75.0948, "step": 1687 }, { "epoch": 4.4582370419280295, "grad_norm": 3287.278564453125, "learning_rate": 0.0004912492547990622, "loss": 74.7247, "step": 1688 }, { "epoch": 4.460878177616375, "grad_norm": 4502.04931640625, "learning_rate": 0.0004912382950908918, "loss": 79.1327, "step": 1689 }, { "epoch": 4.463519313304721, "grad_norm": 4222.31005859375, "learning_rate": 0.0004912273286462836, "loss": 63.7496, "step": 1690 }, { "epoch": 4.466160448993067, "grad_norm": 5661.81591796875, "learning_rate": 0.0004912163554655441, "loss": 54.3065, "step": 1691 }, { "epoch": 4.468801584681413, "grad_norm": 3767.63525390625, "learning_rate": 0.0004912053755489795, "loss": 46.4454, "step": 1692 }, { "epoch": 4.471442720369759, "grad_norm": 2573.834716796875, "learning_rate": 0.0004911943888968965, "loss": 36.465, "step": 1693 }, { "epoch": 4.474083856058105, "grad_norm": 1298.54443359375, "learning_rate": 0.0004911833955096018, "loss": 32.0652, "step": 1694 }, { "epoch": 4.476724991746451, "grad_norm": 681.1434936523438, "learning_rate": 0.0004911723953874025, "loss": 30.1556, "step": 1695 }, { "epoch": 4.479366127434797, "grad_norm": 1273.8209228515625, "learning_rate": 0.0004911613885306057, "loss": 16.8537, "step": 1696 }, { "epoch": 4.482007263123143, "grad_norm": 1538.9061279296875, "learning_rate": 0.000491150374939519, "loss": 113.2054, "step": 1697 }, { "epoch": 4.484648398811489, "grad_norm": 1620.103759765625, "learning_rate": 0.0004911393546144495, "loss": 111.9477, "step": 1698 }, { "epoch": 4.487289534499835, "grad_norm": 1724.8199462890625, "learning_rate": 0.0004911283275557053, "loss": 88.8341, "step": 1699 }, { "epoch": 4.489930670188181, "grad_norm": 939.620849609375, "learning_rate": 0.0004911172937635942, "loss": 55.0623, "step": 1700 }, { "epoch": 4.492571805876527, "grad_norm": 358.2616271972656, "learning_rate": 0.0004911062532384242, "loss": 49.6072, "step": 1701 }, { "epoch": 4.495212941564873, "grad_norm": 479.5273132324219, "learning_rate": 0.0004910952059805038, "loss": 47.9094, "step": 1702 }, { "epoch": 4.497854077253219, "grad_norm": 722.2835083007812, "learning_rate": 0.0004910841519901414, "loss": 51.876, "step": 1703 }, { "epoch": 4.500495212941565, "grad_norm": 667.79248046875, "learning_rate": 0.0004910730912676456, "loss": 47.2268, "step": 1704 }, { "epoch": 4.503136348629911, "grad_norm": 385.7362976074219, "learning_rate": 0.0004910620238133255, "loss": 40.792, "step": 1705 }, { "epoch": 4.505777484318257, "grad_norm": 356.4949951171875, "learning_rate": 0.0004910509496274899, "loss": 38.8635, "step": 1706 }, { "epoch": 4.508418620006603, "grad_norm": 446.2564392089844, "learning_rate": 0.0004910398687104481, "loss": 39.506, "step": 1707 }, { "epoch": 4.511059755694949, "grad_norm": 608.2757568359375, "learning_rate": 0.0004910287810625096, "loss": 44.2052, "step": 1708 }, { "epoch": 4.513700891383294, "grad_norm": 513.5753173828125, "learning_rate": 0.0004910176866839839, "loss": 44.7471, "step": 1709 }, { "epoch": 4.516342027071641, "grad_norm": 880.7156372070312, "learning_rate": 0.000491006585575181, "loss": 41.4931, "step": 1710 }, { "epoch": 4.518983162759987, "grad_norm": 500.200439453125, "learning_rate": 0.0004909954777364107, "loss": 42.6993, "step": 1711 }, { "epoch": 4.521624298448333, "grad_norm": 857.0380859375, "learning_rate": 0.0004909843631679832, "loss": 41.7703, "step": 1712 }, { "epoch": 4.524265434136678, "grad_norm": 779.0283203125, "learning_rate": 0.000490973241870209, "loss": 42.7715, "step": 1713 }, { "epoch": 4.526906569825025, "grad_norm": 650.2067260742188, "learning_rate": 0.0004909621138433986, "loss": 44.1131, "step": 1714 }, { "epoch": 4.52954770551337, "grad_norm": 1981.8306884765625, "learning_rate": 0.0004909509790878626, "loss": 45.6474, "step": 1715 }, { "epoch": 4.532188841201717, "grad_norm": 670.0183715820312, "learning_rate": 0.0004909398376039123, "loss": 42.994, "step": 1716 }, { "epoch": 4.534829976890062, "grad_norm": 2609.916748046875, "learning_rate": 0.0004909286893918583, "loss": 43.7116, "step": 1717 }, { "epoch": 4.537471112578409, "grad_norm": 388.9154052734375, "learning_rate": 0.0004909175344520123, "loss": 44.3788, "step": 1718 }, { "epoch": 4.540112248266754, "grad_norm": 357.4376525878906, "learning_rate": 0.0004909063727846857, "loss": 45.627, "step": 1719 }, { "epoch": 4.542753383955101, "grad_norm": 626.6651000976562, "learning_rate": 0.0004908952043901901, "loss": 46.8512, "step": 1720 }, { "epoch": 4.545394519643446, "grad_norm": 345.1490478515625, "learning_rate": 0.0004908840292688374, "loss": 47.3941, "step": 1721 }, { "epoch": 4.548035655331793, "grad_norm": 332.3099670410156, "learning_rate": 0.0004908728474209398, "loss": 44.8193, "step": 1722 }, { "epoch": 4.5506767910201384, "grad_norm": 283.5177001953125, "learning_rate": 0.0004908616588468093, "loss": 45.4137, "step": 1723 }, { "epoch": 4.553317926708485, "grad_norm": 314.51287841796875, "learning_rate": 0.0004908504635467585, "loss": 43.2757, "step": 1724 }, { "epoch": 4.5559590623968305, "grad_norm": 325.13970947265625, "learning_rate": 0.0004908392615211, "loss": 43.5406, "step": 1725 }, { "epoch": 4.558600198085177, "grad_norm": 288.8905029296875, "learning_rate": 0.0004908280527701467, "loss": 42.4452, "step": 1726 }, { "epoch": 4.5612413337735225, "grad_norm": 212.27365112304688, "learning_rate": 0.0004908168372942113, "loss": 41.7147, "step": 1727 }, { "epoch": 4.563882469461869, "grad_norm": 293.05987548828125, "learning_rate": 0.0004908056150936073, "loss": 42.2506, "step": 1728 }, { "epoch": 4.5665236051502145, "grad_norm": 404.1700134277344, "learning_rate": 0.0004907943861686481, "loss": 40.9176, "step": 1729 }, { "epoch": 4.569164740838561, "grad_norm": 289.8724060058594, "learning_rate": 0.0004907831505196468, "loss": 39.1503, "step": 1730 }, { "epoch": 4.5718058765269065, "grad_norm": 272.8359375, "learning_rate": 0.0004907719081469177, "loss": 39.7878, "step": 1731 }, { "epoch": 4.574447012215252, "grad_norm": 253.34132385253906, "learning_rate": 0.0004907606590507744, "loss": 39.5899, "step": 1732 }, { "epoch": 4.5770881479035985, "grad_norm": 239.1757354736328, "learning_rate": 0.0004907494032315311, "loss": 38.4865, "step": 1733 }, { "epoch": 4.579729283591945, "grad_norm": 312.7656555175781, "learning_rate": 0.0004907381406895023, "loss": 38.4418, "step": 1734 }, { "epoch": 4.5823704192802905, "grad_norm": 377.4314880371094, "learning_rate": 0.0004907268714250022, "loss": 39.5456, "step": 1735 }, { "epoch": 4.585011554968636, "grad_norm": 399.2476806640625, "learning_rate": 0.0004907155954383456, "loss": 41.8767, "step": 1736 }, { "epoch": 4.587652690656983, "grad_norm": 3708.409912109375, "learning_rate": 0.0004907043127298475, "loss": 71.6795, "step": 1737 }, { "epoch": 4.590293826345328, "grad_norm": 2617.30078125, "learning_rate": 0.0004906930232998228, "loss": 124.8805, "step": 1738 }, { "epoch": 4.592934962033675, "grad_norm": 10405.4912109375, "learning_rate": 0.0004906817271485869, "loss": 143.8917, "step": 1739 }, { "epoch": 4.59557609772202, "grad_norm": 7210.9853515625, "learning_rate": 0.0004906704242764551, "loss": 113.4782, "step": 1740 }, { "epoch": 4.598217233410367, "grad_norm": 13133.1015625, "learning_rate": 0.000490659114683743, "loss": 115.9073, "step": 1741 }, { "epoch": 4.600858369098712, "grad_norm": 1859.23974609375, "learning_rate": 0.0004906477983707666, "loss": 86.7979, "step": 1742 }, { "epoch": 4.603499504787059, "grad_norm": 8408.9541015625, "learning_rate": 0.0004906364753378418, "loss": 70.8951, "step": 1743 }, { "epoch": 4.606140640475404, "grad_norm": 2865.433837890625, "learning_rate": 0.0004906251455852847, "loss": 55.4841, "step": 1744 }, { "epoch": 4.608781776163751, "grad_norm": 1704.87744140625, "learning_rate": 0.0004906138091134118, "loss": 46.938, "step": 1745 }, { "epoch": 4.611422911852096, "grad_norm": 1659.7034912109375, "learning_rate": 0.0004906024659225396, "loss": 49.2349, "step": 1746 }, { "epoch": 4.614064047540443, "grad_norm": 1634.830322265625, "learning_rate": 0.000490591116012985, "loss": 98.2955, "step": 1747 }, { "epoch": 4.616705183228788, "grad_norm": 1875.607666015625, "learning_rate": 0.0004905797593850647, "loss": 98.9979, "step": 1748 }, { "epoch": 4.619346318917135, "grad_norm": 1425.0068359375, "learning_rate": 0.000490568396039096, "loss": 69.4627, "step": 1749 }, { "epoch": 4.62198745460548, "grad_norm": 698.6114501953125, "learning_rate": 0.0004905570259753961, "loss": 48.5761, "step": 1750 }, { "epoch": 4.624628590293827, "grad_norm": 465.8067626953125, "learning_rate": 0.0004905456491942825, "loss": 41.491, "step": 1751 }, { "epoch": 4.627269725982172, "grad_norm": 408.4719543457031, "learning_rate": 0.000490534265696073, "loss": 42.1909, "step": 1752 }, { "epoch": 4.629910861670519, "grad_norm": 1002.6422119140625, "learning_rate": 0.0004905228754810856, "loss": 49.0329, "step": 1753 }, { "epoch": 4.632551997358864, "grad_norm": 973.4755859375, "learning_rate": 0.000490511478549638, "loss": 52.2018, "step": 1754 }, { "epoch": 4.63519313304721, "grad_norm": 827.2559814453125, "learning_rate": 0.0004905000749020487, "loss": 47.5926, "step": 1755 }, { "epoch": 4.637834268735556, "grad_norm": 691.8071899414062, "learning_rate": 0.0004904886645386361, "loss": 41.6818, "step": 1756 }, { "epoch": 4.640475404423903, "grad_norm": 289.2928466796875, "learning_rate": 0.0004904772474597187, "loss": 40.4614, "step": 1757 }, { "epoch": 4.643116540112248, "grad_norm": 1511.1605224609375, "learning_rate": 0.0004904658236656157, "loss": 40.8587, "step": 1758 }, { "epoch": 4.645757675800594, "grad_norm": 736.1949462890625, "learning_rate": 0.0004904543931566456, "loss": 40.2844, "step": 1759 }, { "epoch": 4.64839881148894, "grad_norm": 551.6571044921875, "learning_rate": 0.0004904429559331279, "loss": 41.3618, "step": 1760 }, { "epoch": 4.651039947177286, "grad_norm": 758.5021362304688, "learning_rate": 0.0004904315119953818, "loss": 41.5621, "step": 1761 }, { "epoch": 4.653681082865632, "grad_norm": 575.1744995117188, "learning_rate": 0.0004904200613437271, "loss": 41.6145, "step": 1762 }, { "epoch": 4.656322218553978, "grad_norm": 962.9451293945312, "learning_rate": 0.0004904086039784834, "loss": 44.3247, "step": 1763 }, { "epoch": 4.658963354242324, "grad_norm": 10481.236328125, "learning_rate": 0.0004903971398999706, "loss": 47.0272, "step": 1764 }, { "epoch": 4.66160448993067, "grad_norm": 1846.972412109375, "learning_rate": 0.0004903856691085089, "loss": 51.633, "step": 1765 }, { "epoch": 4.664245625619016, "grad_norm": 1990.39501953125, "learning_rate": 0.0004903741916044185, "loss": 46.0863, "step": 1766 }, { "epoch": 4.666886761307362, "grad_norm": 984.0360107421875, "learning_rate": 0.0004903627073880203, "loss": 46.9908, "step": 1767 }, { "epoch": 4.669527896995708, "grad_norm": 743.5486450195312, "learning_rate": 0.0004903512164596344, "loss": 45.709, "step": 1768 }, { "epoch": 4.672169032684054, "grad_norm": 885.028076171875, "learning_rate": 0.0004903397188195821, "loss": 45.1637, "step": 1769 }, { "epoch": 4.6748101683724, "grad_norm": 552.4254150390625, "learning_rate": 0.0004903282144681844, "loss": 44.0546, "step": 1770 }, { "epoch": 4.677451304060746, "grad_norm": 531.0657958984375, "learning_rate": 0.0004903167034057623, "loss": 48.6708, "step": 1771 }, { "epoch": 4.680092439749092, "grad_norm": 587.9725952148438, "learning_rate": 0.0004903051856326375, "loss": 47.0614, "step": 1772 }, { "epoch": 4.682733575437438, "grad_norm": 645.5308837890625, "learning_rate": 0.0004902936611491316, "loss": 46.8807, "step": 1773 }, { "epoch": 4.685374711125784, "grad_norm": 525.904296875, "learning_rate": 0.0004902821299555664, "loss": 44.7513, "step": 1774 }, { "epoch": 4.68801584681413, "grad_norm": 667.923583984375, "learning_rate": 0.0004902705920522638, "loss": 45.1738, "step": 1775 }, { "epoch": 4.690656982502476, "grad_norm": 587.5829467773438, "learning_rate": 0.000490259047439546, "loss": 46.8238, "step": 1776 }, { "epoch": 4.693298118190822, "grad_norm": 504.3078308105469, "learning_rate": 0.0004902474961177355, "loss": 44.4861, "step": 1777 }, { "epoch": 4.6959392538791676, "grad_norm": 503.318359375, "learning_rate": 0.0004902359380871548, "loss": 42.6258, "step": 1778 }, { "epoch": 4.698580389567514, "grad_norm": 444.9319763183594, "learning_rate": 0.0004902243733481268, "loss": 41.258, "step": 1779 }, { "epoch": 4.7012215252558605, "grad_norm": 1129.916259765625, "learning_rate": 0.0004902128019009741, "loss": 40.6169, "step": 1780 }, { "epoch": 4.703862660944206, "grad_norm": 545.4232788085938, "learning_rate": 0.0004902012237460201, "loss": 41.8983, "step": 1781 }, { "epoch": 4.706503796632552, "grad_norm": 681.7348022460938, "learning_rate": 0.0004901896388835879, "loss": 40.1803, "step": 1782 }, { "epoch": 4.709144932320898, "grad_norm": 635.7662963867188, "learning_rate": 0.0004901780473140013, "loss": 39.2711, "step": 1783 }, { "epoch": 4.711786068009244, "grad_norm": 602.8827514648438, "learning_rate": 0.0004901664490375837, "loss": 39.6721, "step": 1784 }, { "epoch": 4.71442720369759, "grad_norm": 656.5072631835938, "learning_rate": 0.0004901548440546592, "loss": 40.4224, "step": 1785 }, { "epoch": 4.717068339385936, "grad_norm": 2005.0064697265625, "learning_rate": 0.0004901432323655516, "loss": 51.8053, "step": 1786 }, { "epoch": 4.719709475074282, "grad_norm": 16202.7060546875, "learning_rate": 0.0004901316139705854, "loss": 103.9643, "step": 1787 }, { "epoch": 4.722350610762628, "grad_norm": 8632.5341796875, "learning_rate": 0.0004901199888700849, "loss": 126.069, "step": 1788 }, { "epoch": 4.724991746450974, "grad_norm": 9472.673828125, "learning_rate": 0.0004901083570643749, "loss": 104.3149, "step": 1789 }, { "epoch": 4.72763288213932, "grad_norm": 13605.400390625, "learning_rate": 0.0004900967185537799, "loss": 94.7823, "step": 1790 }, { "epoch": 4.730274017827666, "grad_norm": 30374.25390625, "learning_rate": 0.0004900850733386251, "loss": 101.9446, "step": 1791 }, { "epoch": 4.732915153516012, "grad_norm": 4232.98193359375, "learning_rate": 0.0004900734214192358, "loss": 102.9051, "step": 1792 }, { "epoch": 4.735556289204358, "grad_norm": 6696.49169921875, "learning_rate": 0.0004900617627959372, "loss": 87.9321, "step": 1793 }, { "epoch": 4.738197424892704, "grad_norm": 3202.95166015625, "learning_rate": 0.0004900500974690548, "loss": 85.7624, "step": 1794 }, { "epoch": 4.74083856058105, "grad_norm": 4039.060791015625, "learning_rate": 0.0004900384254389144, "loss": 70.4402, "step": 1795 }, { "epoch": 4.743479696269396, "grad_norm": 7169.0712890625, "learning_rate": 0.0004900267467058421, "loss": 66.7078, "step": 1796 }, { "epoch": 4.746120831957742, "grad_norm": 14334.0009765625, "learning_rate": 0.0004900150612701639, "loss": 41.8891, "step": 1797 }, { "epoch": 4.748761967646088, "grad_norm": 1628.365234375, "learning_rate": 0.000490003369132206, "loss": 44.3281, "step": 1798 }, { "epoch": 4.751403103334434, "grad_norm": 1075.8426513671875, "learning_rate": 0.000489991670292295, "loss": 46.4635, "step": 1799 }, { "epoch": 4.75404423902278, "grad_norm": 1084.9285888671875, "learning_rate": 0.0004899799647507577, "loss": 44.0622, "step": 1800 }, { "epoch": 4.75404423902278, "eval_loss": 4.977060794830322, "eval_runtime": 2.1235, "eval_samples_per_second": 233.102, "eval_steps_per_second": 29.197, "step": 1800 }, { "epoch": 4.756685374711125, "grad_norm": 711.0065307617188, "learning_rate": 0.0004899682525079208, "loss": 39.806, "step": 1801 }, { "epoch": 4.759326510399472, "grad_norm": 749.3961791992188, "learning_rate": 0.0004899565335641115, "loss": 39.7984, "step": 1802 }, { "epoch": 4.761967646087818, "grad_norm": 691.3072509765625, "learning_rate": 0.0004899448079196568, "loss": 39.5378, "step": 1803 }, { "epoch": 4.764608781776164, "grad_norm": 429.1302490234375, "learning_rate": 0.0004899330755748844, "loss": 39.0148, "step": 1804 }, { "epoch": 4.767249917464509, "grad_norm": 536.3724365234375, "learning_rate": 0.0004899213365301217, "loss": 40.2692, "step": 1805 }, { "epoch": 4.769891053152856, "grad_norm": 646.1104736328125, "learning_rate": 0.0004899095907856968, "loss": 40.1689, "step": 1806 }, { "epoch": 4.772532188841201, "grad_norm": 564.6528930664062, "learning_rate": 0.0004898978383419373, "loss": 39.1352, "step": 1807 }, { "epoch": 4.775173324529548, "grad_norm": 598.0877075195312, "learning_rate": 0.0004898860791991717, "loss": 39.3534, "step": 1808 }, { "epoch": 4.777814460217893, "grad_norm": 469.28204345703125, "learning_rate": 0.0004898743133577283, "loss": 38.7297, "step": 1809 }, { "epoch": 4.78045559590624, "grad_norm": 1156.0618896484375, "learning_rate": 0.0004898625408179354, "loss": 38.0776, "step": 1810 }, { "epoch": 4.783096731594585, "grad_norm": 1407.002685546875, "learning_rate": 0.0004898507615801222, "loss": 38.3568, "step": 1811 }, { "epoch": 4.785737867282932, "grad_norm": 4185.158203125, "learning_rate": 0.0004898389756446172, "loss": 41.2308, "step": 1812 }, { "epoch": 4.788379002971277, "grad_norm": 1642.6700439453125, "learning_rate": 0.0004898271830117499, "loss": 41.6272, "step": 1813 }, { "epoch": 4.791020138659624, "grad_norm": 1820.1954345703125, "learning_rate": 0.0004898153836818492, "loss": 44.7545, "step": 1814 }, { "epoch": 4.793661274347969, "grad_norm": 1287.8043212890625, "learning_rate": 0.0004898035776552448, "loss": 43.5337, "step": 1815 }, { "epoch": 4.796302410036316, "grad_norm": 680.0623779296875, "learning_rate": 0.0004897917649322666, "loss": 42.9759, "step": 1816 }, { "epoch": 4.798943545724661, "grad_norm": 612.1532592773438, "learning_rate": 0.0004897799455132438, "loss": 41.3876, "step": 1817 }, { "epoch": 4.801584681413008, "grad_norm": 783.5950927734375, "learning_rate": 0.0004897681193985073, "loss": 44.7611, "step": 1818 }, { "epoch": 4.804225817101353, "grad_norm": 976.695068359375, "learning_rate": 0.0004897562865883866, "loss": 48.7679, "step": 1819 }, { "epoch": 4.8068669527897, "grad_norm": 2075.695068359375, "learning_rate": 0.0004897444470832126, "loss": 47.2048, "step": 1820 }, { "epoch": 4.8095080884780455, "grad_norm": 715.4586181640625, "learning_rate": 0.0004897326008833158, "loss": 42.5663, "step": 1821 }, { "epoch": 4.812149224166392, "grad_norm": 1379.659912109375, "learning_rate": 0.0004897207479890269, "loss": 46.5635, "step": 1822 }, { "epoch": 4.8147903598547375, "grad_norm": 413.6947021484375, "learning_rate": 0.0004897088884006769, "loss": 44.767, "step": 1823 }, { "epoch": 4.817431495543083, "grad_norm": 624.3335571289062, "learning_rate": 0.0004896970221185969, "loss": 44.5743, "step": 1824 }, { "epoch": 4.8200726312314295, "grad_norm": 892.364990234375, "learning_rate": 0.0004896851491431184, "loss": 43.5511, "step": 1825 }, { "epoch": 4.822713766919776, "grad_norm": 475.97796630859375, "learning_rate": 0.0004896732694745731, "loss": 42.7945, "step": 1826 }, { "epoch": 4.8253549026081215, "grad_norm": 560.688720703125, "learning_rate": 0.0004896613831132924, "loss": 38.7632, "step": 1827 }, { "epoch": 4.827996038296467, "grad_norm": 789.5831909179688, "learning_rate": 0.0004896494900596083, "loss": 38.3493, "step": 1828 }, { "epoch": 4.8306371739848135, "grad_norm": 668.2655029296875, "learning_rate": 0.0004896375903138532, "loss": 39.267, "step": 1829 }, { "epoch": 4.833278309673159, "grad_norm": 862.3458862304688, "learning_rate": 0.0004896256838763589, "loss": 37.9088, "step": 1830 }, { "epoch": 4.8359194453615055, "grad_norm": 446.2868957519531, "learning_rate": 0.0004896137707474582, "loss": 39.2464, "step": 1831 }, { "epoch": 4.838560581049851, "grad_norm": 1241.62548828125, "learning_rate": 0.0004896018509274837, "loss": 36.8144, "step": 1832 }, { "epoch": 4.8412017167381975, "grad_norm": 696.8209228515625, "learning_rate": 0.0004895899244167684, "loss": 37.9875, "step": 1833 }, { "epoch": 4.843842852426543, "grad_norm": 440.3138732910156, "learning_rate": 0.0004895779912156452, "loss": 38.3784, "step": 1834 }, { "epoch": 4.84648398811489, "grad_norm": 734.4601440429688, "learning_rate": 0.0004895660513244472, "loss": 38.0905, "step": 1835 }, { "epoch": 4.849125123803235, "grad_norm": 1897.23046875, "learning_rate": 0.000489554104743508, "loss": 50.0166, "step": 1836 }, { "epoch": 4.851766259491582, "grad_norm": 11336.1005859375, "learning_rate": 0.0004895421514731612, "loss": 68.6278, "step": 1837 }, { "epoch": 4.854407395179927, "grad_norm": 7081.79638671875, "learning_rate": 0.0004895301915137405, "loss": 86.4976, "step": 1838 }, { "epoch": 4.857048530868274, "grad_norm": 7528.81005859375, "learning_rate": 0.0004895182248655798, "loss": 83.9435, "step": 1839 }, { "epoch": 4.859689666556619, "grad_norm": 5837.73779296875, "learning_rate": 0.0004895062515290135, "loss": 75.6024, "step": 1840 }, { "epoch": 4.862330802244966, "grad_norm": 15095.494140625, "learning_rate": 0.0004894942715043758, "loss": 67.3549, "step": 1841 }, { "epoch": 4.864971937933311, "grad_norm": 6342.50634765625, "learning_rate": 0.0004894822847920011, "loss": 64.199, "step": 1842 }, { "epoch": 4.867613073621658, "grad_norm": 15550.19921875, "learning_rate": 0.0004894702913922245, "loss": 57.1938, "step": 1843 }, { "epoch": 4.870254209310003, "grad_norm": 5958.98828125, "learning_rate": 0.0004894582913053805, "loss": 48.8805, "step": 1844 }, { "epoch": 4.87289534499835, "grad_norm": 3946.158203125, "learning_rate": 0.0004894462845318045, "loss": 36.3649, "step": 1845 }, { "epoch": 4.875536480686695, "grad_norm": 1362.6124267578125, "learning_rate": 0.0004894342710718317, "loss": 37.0547, "step": 1846 }, { "epoch": 4.878177616375041, "grad_norm": 1957.1611328125, "learning_rate": 0.0004894222509257975, "loss": 55.7746, "step": 1847 }, { "epoch": 4.880818752063387, "grad_norm": 1529.355712890625, "learning_rate": 0.0004894102240940375, "loss": 56.3254, "step": 1848 }, { "epoch": 4.883459887751734, "grad_norm": 2293.306640625, "learning_rate": 0.0004893981905768878, "loss": 48.7723, "step": 1849 }, { "epoch": 4.886101023440079, "grad_norm": 4179.6337890625, "learning_rate": 0.000489386150374684, "loss": 43.4925, "step": 1850 }, { "epoch": 4.888742159128425, "grad_norm": 2349.592041015625, "learning_rate": 0.0004893741034877628, "loss": 41.9409, "step": 1851 }, { "epoch": 4.891383294816771, "grad_norm": 1266.0787353515625, "learning_rate": 0.0004893620499164603, "loss": 41.6429, "step": 1852 }, { "epoch": 4.894024430505117, "grad_norm": 1099.129638671875, "learning_rate": 0.0004893499896611131, "loss": 42.8958, "step": 1853 }, { "epoch": 4.896665566193463, "grad_norm": 1063.2535400390625, "learning_rate": 0.0004893379227220581, "loss": 40.6661, "step": 1854 }, { "epoch": 4.899306701881809, "grad_norm": 1453.6328125, "learning_rate": 0.0004893258490996323, "loss": 40.1418, "step": 1855 }, { "epoch": 4.901947837570155, "grad_norm": 1163.1693115234375, "learning_rate": 0.0004893137687941725, "loss": 40.7759, "step": 1856 }, { "epoch": 4.904588973258501, "grad_norm": 593.0659790039062, "learning_rate": 0.0004893016818060165, "loss": 42.324, "step": 1857 }, { "epoch": 4.907230108946847, "grad_norm": 1463.19140625, "learning_rate": 0.0004892895881355016, "loss": 40.7583, "step": 1858 }, { "epoch": 4.909871244635193, "grad_norm": 718.3444213867188, "learning_rate": 0.0004892774877829655, "loss": 39.657, "step": 1859 }, { "epoch": 4.912512380323539, "grad_norm": 709.5885620117188, "learning_rate": 0.000489265380748746, "loss": 38.2574, "step": 1860 }, { "epoch": 4.915153516011885, "grad_norm": 907.7120971679688, "learning_rate": 0.0004892532670331814, "loss": 38.9807, "step": 1861 }, { "epoch": 4.917794651700231, "grad_norm": 856.7130126953125, "learning_rate": 0.0004892411466366098, "loss": 40.6776, "step": 1862 }, { "epoch": 4.920435787388577, "grad_norm": 1292.0360107421875, "learning_rate": 0.0004892290195593698, "loss": 41.8849, "step": 1863 }, { "epoch": 4.923076923076923, "grad_norm": 1144.0968017578125, "learning_rate": 0.0004892168858017999, "loss": 43.4656, "step": 1864 }, { "epoch": 4.925718058765269, "grad_norm": 4849.21875, "learning_rate": 0.000489204745364239, "loss": 45.9906, "step": 1865 }, { "epoch": 4.928359194453615, "grad_norm": 648.0402221679688, "learning_rate": 0.0004891925982470261, "loss": 44.9889, "step": 1866 }, { "epoch": 4.931000330141961, "grad_norm": 1069.705322265625, "learning_rate": 0.0004891804444505005, "loss": 44.8407, "step": 1867 }, { "epoch": 4.933641465830307, "grad_norm": 759.8941650390625, "learning_rate": 0.0004891682839750014, "loss": 46.8709, "step": 1868 }, { "epoch": 4.936282601518653, "grad_norm": 768.7850952148438, "learning_rate": 0.0004891561168208684, "loss": 49.1373, "step": 1869 }, { "epoch": 4.938923737206999, "grad_norm": 745.7367553710938, "learning_rate": 0.0004891439429884414, "loss": 44.8474, "step": 1870 }, { "epoch": 4.941564872895345, "grad_norm": 402.2779846191406, "learning_rate": 0.0004891317624780603, "loss": 44.6108, "step": 1871 }, { "epoch": 4.944206008583691, "grad_norm": 846.5027465820312, "learning_rate": 0.0004891195752900651, "loss": 42.2596, "step": 1872 }, { "epoch": 4.946847144272037, "grad_norm": 850.4214477539062, "learning_rate": 0.0004891073814247963, "loss": 41.7525, "step": 1873 }, { "epoch": 4.9494882799603825, "grad_norm": 384.6832275390625, "learning_rate": 0.0004890951808825943, "loss": 39.6683, "step": 1874 }, { "epoch": 4.952129415648729, "grad_norm": 530.1210327148438, "learning_rate": 0.0004890829736637999, "loss": 41.6936, "step": 1875 }, { "epoch": 4.954770551337075, "grad_norm": 590.25244140625, "learning_rate": 0.0004890707597687539, "loss": 41.2275, "step": 1876 }, { "epoch": 4.957411687025421, "grad_norm": 778.0222778320312, "learning_rate": 0.0004890585391977973, "loss": 46.8979, "step": 1877 }, { "epoch": 4.960052822713767, "grad_norm": 7151.78076171875, "learning_rate": 0.0004890463119512715, "loss": 88.1349, "step": 1878 }, { "epoch": 4.962693958402113, "grad_norm": 10355.55859375, "learning_rate": 0.0004890340780295178, "loss": 72.3904, "step": 1879 }, { "epoch": 4.965335094090459, "grad_norm": 4441.9052734375, "learning_rate": 0.0004890218374328779, "loss": 60.6427, "step": 1880 }, { "epoch": 4.967976229778805, "grad_norm": 4058.59423828125, "learning_rate": 0.0004890095901616936, "loss": 64.9953, "step": 1881 }, { "epoch": 4.970617365467151, "grad_norm": 3525.45849609375, "learning_rate": 0.0004889973362163069, "loss": 77.3927, "step": 1882 }, { "epoch": 4.973258501155497, "grad_norm": 1663.990234375, "learning_rate": 0.0004889850755970599, "loss": 44.9509, "step": 1883 }, { "epoch": 4.975899636843843, "grad_norm": 624.9330444335938, "learning_rate": 0.0004889728083042952, "loss": 40.0845, "step": 1884 }, { "epoch": 4.978540772532189, "grad_norm": 622.09423828125, "learning_rate": 0.0004889605343383552, "loss": 42.3963, "step": 1885 }, { "epoch": 4.981181908220535, "grad_norm": 1078.319091796875, "learning_rate": 0.0004889482536995825, "loss": 41.671, "step": 1886 }, { "epoch": 4.983823043908881, "grad_norm": 1520.3768310546875, "learning_rate": 0.0004889359663883203, "loss": 41.443, "step": 1887 }, { "epoch": 4.986464179597227, "grad_norm": 1296.8389892578125, "learning_rate": 0.0004889236724049116, "loss": 41.326, "step": 1888 }, { "epoch": 4.989105315285573, "grad_norm": 857.2449340820312, "learning_rate": 0.0004889113717496997, "loss": 40.0428, "step": 1889 }, { "epoch": 4.991746450973919, "grad_norm": 1273.9539794921875, "learning_rate": 0.000488899064423028, "loss": 40.2891, "step": 1890 }, { "epoch": 4.994387586662265, "grad_norm": 2593.69287109375, "learning_rate": 0.0004888867504252403, "loss": 38.4862, "step": 1891 }, { "epoch": 4.997028722350611, "grad_norm": 3637.495849609375, "learning_rate": 0.0004888744297566805, "loss": 41.6799, "step": 1892 }, { "epoch": 4.999669858038957, "grad_norm": 3779.611083984375, "learning_rate": 0.0004888621024176924, "loss": 45.9614, "step": 1893 }, { "epoch": 5.002310993727303, "grad_norm": 796.310302734375, "learning_rate": 0.0004888497684086207, "loss": 42.1556, "step": 1894 }, { "epoch": 5.004952129415649, "grad_norm": 766.4441528320312, "learning_rate": 0.0004888374277298092, "loss": 41.8784, "step": 1895 }, { "epoch": 5.007593265103995, "grad_norm": 674.682373046875, "learning_rate": 0.0004888250803816031, "loss": 42.2495, "step": 1896 }, { "epoch": 5.01023440079234, "grad_norm": 1219.2049560546875, "learning_rate": 0.0004888127263643467, "loss": 45.9738, "step": 1897 }, { "epoch": 5.012875536480687, "grad_norm": 715.2596435546875, "learning_rate": 0.0004888003656783854, "loss": 47.536, "step": 1898 }, { "epoch": 5.015516672169032, "grad_norm": 666.7476806640625, "learning_rate": 0.0004887879983240641, "loss": 45.3932, "step": 1899 }, { "epoch": 5.018157807857379, "grad_norm": 774.3878173828125, "learning_rate": 0.0004887756243017281, "loss": 44.9428, "step": 1900 }, { "epoch": 5.020798943545724, "grad_norm": 634.1074829101562, "learning_rate": 0.0004887632436117232, "loss": 46.1164, "step": 1901 }, { "epoch": 5.023440079234071, "grad_norm": 776.1900634765625, "learning_rate": 0.0004887508562543949, "loss": 44.9343, "step": 1902 }, { "epoch": 5.026081214922416, "grad_norm": 496.15277099609375, "learning_rate": 0.0004887384622300892, "loss": 46.6416, "step": 1903 }, { "epoch": 5.028722350610763, "grad_norm": 703.2459106445312, "learning_rate": 0.0004887260615391522, "loss": 44.9782, "step": 1904 }, { "epoch": 5.031363486299108, "grad_norm": 797.2261352539062, "learning_rate": 0.0004887136541819302, "loss": 43.9367, "step": 1905 }, { "epoch": 5.034004621987455, "grad_norm": 681.6609497070312, "learning_rate": 0.0004887012401587696, "loss": 42.2594, "step": 1906 }, { "epoch": 5.0366457576758, "grad_norm": 351.5408630371094, "learning_rate": 0.000488688819470017, "loss": 41.7729, "step": 1907 }, { "epoch": 5.039286893364147, "grad_norm": 662.9354858398438, "learning_rate": 0.0004886763921160195, "loss": 40.5267, "step": 1908 }, { "epoch": 5.041928029052492, "grad_norm": 392.9298095703125, "learning_rate": 0.0004886639580971238, "loss": 40.1288, "step": 1909 }, { "epoch": 5.044569164740839, "grad_norm": 591.1055908203125, "learning_rate": 0.0004886515174136774, "loss": 39.8169, "step": 1910 }, { "epoch": 5.047210300429184, "grad_norm": 1117.54833984375, "learning_rate": 0.0004886390700660274, "loss": 39.7884, "step": 1911 }, { "epoch": 5.049851436117531, "grad_norm": 273.91973876953125, "learning_rate": 0.0004886266160545218, "loss": 38.0693, "step": 1912 }, { "epoch": 5.052492571805876, "grad_norm": 620.9154052734375, "learning_rate": 0.0004886141553795079, "loss": 42.2773, "step": 1913 }, { "epoch": 5.055133707494223, "grad_norm": 367.7938537597656, "learning_rate": 0.000488601688041334, "loss": 38.4411, "step": 1914 }, { "epoch": 5.057774843182568, "grad_norm": 1813.1478271484375, "learning_rate": 0.0004885892140403482, "loss": 39.5048, "step": 1915 }, { "epoch": 5.060415978870915, "grad_norm": 4945.5908203125, "learning_rate": 0.0004885767333768986, "loss": 56.3757, "step": 1916 }, { "epoch": 5.06305711455926, "grad_norm": 3249.620849609375, "learning_rate": 0.0004885642460513339, "loss": 60.5495, "step": 1917 }, { "epoch": 5.065698250247607, "grad_norm": 3874.5546875, "learning_rate": 0.0004885517520640028, "loss": 48.3879, "step": 1918 }, { "epoch": 5.0683393859359525, "grad_norm": 2133.539794921875, "learning_rate": 0.0004885392514152541, "loss": 44.7883, "step": 1919 }, { "epoch": 5.070980521624298, "grad_norm": 3508.93798828125, "learning_rate": 0.000488526744105437, "loss": 32.5272, "step": 1920 }, { "epoch": 5.0736216573126445, "grad_norm": 2760.0625, "learning_rate": 0.0004885142301349006, "loss": 31.2309, "step": 1921 }, { "epoch": 5.07626279300099, "grad_norm": 1453.743408203125, "learning_rate": 0.0004885017095039945, "loss": 32.8532, "step": 1922 }, { "epoch": 5.0789039286893365, "grad_norm": 1193.30517578125, "learning_rate": 0.0004884891822130682, "loss": 25.7228, "step": 1923 }, { "epoch": 5.081545064377682, "grad_norm": 1299.1993408203125, "learning_rate": 0.0004884766482624718, "loss": 21.1368, "step": 1924 }, { "epoch": 5.0841862000660285, "grad_norm": 1036.339111328125, "learning_rate": 0.0004884641076525549, "loss": 23.8041, "step": 1925 }, { "epoch": 5.086827335754374, "grad_norm": 1791.40478515625, "learning_rate": 0.0004884515603836678, "loss": 85.5138, "step": 1926 }, { "epoch": 5.0894684714427205, "grad_norm": 1477.954833984375, "learning_rate": 0.0004884390064561611, "loss": 84.6718, "step": 1927 }, { "epoch": 5.092109607131066, "grad_norm": 1265.8009033203125, "learning_rate": 0.0004884264458703852, "loss": 61.3938, "step": 1928 }, { "epoch": 5.0947507428194125, "grad_norm": 382.28094482421875, "learning_rate": 0.0004884138786266908, "loss": 45.2891, "step": 1929 }, { "epoch": 5.097391878507758, "grad_norm": 593.1123046875, "learning_rate": 0.0004884013047254289, "loss": 42.2546, "step": 1930 }, { "epoch": 5.1000330141961046, "grad_norm": 572.4325561523438, "learning_rate": 0.0004883887241669507, "loss": 43.2007, "step": 1931 }, { "epoch": 5.10267414988445, "grad_norm": 496.0643005371094, "learning_rate": 0.0004883761369516073, "loss": 43.0338, "step": 1932 }, { "epoch": 5.105315285572797, "grad_norm": 509.3218078613281, "learning_rate": 0.0004883635430797502, "loss": 41.5925, "step": 1933 }, { "epoch": 5.107956421261142, "grad_norm": 583.7424926757812, "learning_rate": 0.0004883509425517315, "loss": 42.5421, "step": 1934 }, { "epoch": 5.110597556949489, "grad_norm": 440.0190124511719, "learning_rate": 0.0004883383353679026, "loss": 40.279, "step": 1935 }, { "epoch": 5.113238692637834, "grad_norm": 571.6030883789062, "learning_rate": 0.0004883257215286156, "loss": 42.3793, "step": 1936 }, { "epoch": 5.115879828326181, "grad_norm": 805.83837890625, "learning_rate": 0.0004883131010342229, "loss": 41.7954, "step": 1937 }, { "epoch": 5.118520964014526, "grad_norm": 1144.6248779296875, "learning_rate": 0.0004883004738850768, "loss": 40.9118, "step": 1938 }, { "epoch": 5.121162099702873, "grad_norm": 1463.116455078125, "learning_rate": 0.00048828784008153, "loss": 49.027, "step": 1939 }, { "epoch": 5.123803235391218, "grad_norm": 1725.820556640625, "learning_rate": 0.0004882751996239352, "loss": 44.0694, "step": 1940 }, { "epoch": 5.126444371079565, "grad_norm": 2851.516845703125, "learning_rate": 0.0004882625525126455, "loss": 46.2961, "step": 1941 }, { "epoch": 5.12908550676791, "grad_norm": 3486.246337890625, "learning_rate": 0.0004882498987480139, "loss": 46.7898, "step": 1942 }, { "epoch": 5.131726642456256, "grad_norm": 4474.74462890625, "learning_rate": 0.0004882372383303939, "loss": 50.7685, "step": 1943 }, { "epoch": 5.134367778144602, "grad_norm": 6461.2978515625, "learning_rate": 0.0004882245712601389, "loss": 48.3898, "step": 1944 }, { "epoch": 5.137008913832948, "grad_norm": 2612.945068359375, "learning_rate": 0.00048821189753760273, "loss": 47.2441, "step": 1945 }, { "epoch": 5.139650049521294, "grad_norm": 1574.53271484375, "learning_rate": 0.00048819921716313923, "loss": 46.67, "step": 1946 }, { "epoch": 5.14229118520964, "grad_norm": 3446.103271484375, "learning_rate": 0.00048818653013710256, "loss": 45.3888, "step": 1947 }, { "epoch": 5.144932320897986, "grad_norm": 2396.623046875, "learning_rate": 0.0004881738364598469, "loss": 51.1597, "step": 1948 }, { "epoch": 5.147573456586332, "grad_norm": 1807.99365234375, "learning_rate": 0.00048816113613172685, "loss": 48.7577, "step": 1949 }, { "epoch": 5.150214592274678, "grad_norm": 2076.756103515625, "learning_rate": 0.0004881484291530969, "loss": 49.1561, "step": 1950 }, { "epoch": 5.152855727963024, "grad_norm": 2600.005859375, "learning_rate": 0.000488135715524312, "loss": 47.7454, "step": 1951 }, { "epoch": 5.15549686365137, "grad_norm": 3764.70947265625, "learning_rate": 0.0004881229952457271, "loss": 44.9771, "step": 1952 }, { "epoch": 5.158137999339716, "grad_norm": 2328.906982421875, "learning_rate": 0.0004881102683176975, "loss": 42.5035, "step": 1953 }, { "epoch": 5.160779135028062, "grad_norm": 1901.8394775390625, "learning_rate": 0.0004880975347405784, "loss": 43.9967, "step": 1954 }, { "epoch": 5.163420270716408, "grad_norm": 4916.6875, "learning_rate": 0.00048808479451472567, "loss": 46.1169, "step": 1955 }, { "epoch": 5.166061406404754, "grad_norm": 1709.192626953125, "learning_rate": 0.00048807204764049483, "loss": 42.5945, "step": 1956 }, { "epoch": 5.1687025420931, "grad_norm": 3523.63330078125, "learning_rate": 0.0004880592941182419, "loss": 39.7814, "step": 1957 }, { "epoch": 5.171343677781446, "grad_norm": 2892.2861328125, "learning_rate": 0.00048804653394832307, "loss": 39.7167, "step": 1958 }, { "epoch": 5.173984813469792, "grad_norm": 2993.585693359375, "learning_rate": 0.0004880337671310946, "loss": 40.1224, "step": 1959 }, { "epoch": 5.176625949158138, "grad_norm": 1634.759765625, "learning_rate": 0.000488020993666913, "loss": 38.7455, "step": 1960 }, { "epoch": 5.179267084846484, "grad_norm": 1655.2425537109375, "learning_rate": 0.00048800821355613497, "loss": 38.1446, "step": 1961 }, { "epoch": 5.18190822053483, "grad_norm": 2757.15966796875, "learning_rate": 0.00048799542679911733, "loss": 38.6588, "step": 1962 }, { "epoch": 5.184549356223176, "grad_norm": 3634.447509765625, "learning_rate": 0.0004879826333962173, "loss": 40.0852, "step": 1963 }, { "epoch": 5.187190491911522, "grad_norm": 4632.25927734375, "learning_rate": 0.00048796983334779197, "loss": 39.1965, "step": 1964 }, { "epoch": 5.189831627599868, "grad_norm": 4374.828125, "learning_rate": 0.0004879570266541989, "loss": 42.2155, "step": 1965 }, { "epoch": 5.1924727632882135, "grad_norm": 17438.62890625, "learning_rate": 0.0004879442133157956, "loss": 114.5326, "step": 1966 }, { "epoch": 5.19511389897656, "grad_norm": 6758.19482421875, "learning_rate": 0.0004879313933329399, "loss": 119.0483, "step": 1967 }, { "epoch": 5.1977550346649055, "grad_norm": 35934.3984375, "learning_rate": 0.00048791856670598986, "loss": 178.0082, "step": 1968 }, { "epoch": 5.200396170353252, "grad_norm": 6678.24658203125, "learning_rate": 0.0004879057334353036, "loss": 133.8877, "step": 1969 }, { "epoch": 5.2030373060415975, "grad_norm": 6978.48681640625, "learning_rate": 0.00048789289352123944, "loss": 125.4874, "step": 1970 }, { "epoch": 5.205678441729944, "grad_norm": 123369.6796875, "learning_rate": 0.000487880046964156, "loss": 118.4655, "step": 1971 }, { "epoch": 5.2083195774182895, "grad_norm": 94672.203125, "learning_rate": 0.000487867193764412, "loss": 120.533, "step": 1972 }, { "epoch": 5.210960713106636, "grad_norm": 11903.1298828125, "learning_rate": 0.00048785433392236634, "loss": 107.1118, "step": 1973 }, { "epoch": 5.213601848794982, "grad_norm": 19280.77734375, "learning_rate": 0.0004878414674383781, "loss": 90.3913, "step": 1974 }, { "epoch": 5.216242984483328, "grad_norm": 8138.83154296875, "learning_rate": 0.0004878285943128067, "loss": 88.8205, "step": 1975 }, { "epoch": 5.218884120171674, "grad_norm": 2870.478759765625, "learning_rate": 0.0004878157145460114, "loss": 50.52, "step": 1976 }, { "epoch": 5.22152525586002, "grad_norm": 2746.872314453125, "learning_rate": 0.00048780282813835196, "loss": 44.1053, "step": 1977 }, { "epoch": 5.224166391548366, "grad_norm": 2574.194091796875, "learning_rate": 0.0004877899350901883, "loss": 45.2008, "step": 1978 }, { "epoch": 5.226807527236712, "grad_norm": 2568.953857421875, "learning_rate": 0.0004877770354018803, "loss": 45.7543, "step": 1979 }, { "epoch": 5.229448662925058, "grad_norm": 2493.135498046875, "learning_rate": 0.0004877641290737884, "loss": 43.7022, "step": 1980 }, { "epoch": 5.232089798613404, "grad_norm": 2942.098876953125, "learning_rate": 0.0004877512161062728, "loss": 40.8607, "step": 1981 }, { "epoch": 5.23473093430175, "grad_norm": 2537.938720703125, "learning_rate": 0.00048773829649969414, "loss": 41.0033, "step": 1982 }, { "epoch": 5.237372069990096, "grad_norm": 2868.87451171875, "learning_rate": 0.0004877253702544132, "loss": 37.6172, "step": 1983 }, { "epoch": 5.240013205678442, "grad_norm": 1859.2889404296875, "learning_rate": 0.00048771243737079096, "loss": 38.7787, "step": 1984 }, { "epoch": 5.242654341366788, "grad_norm": 2574.8349609375, "learning_rate": 0.00048769949784918854, "loss": 40.0517, "step": 1985 }, { "epoch": 5.245295477055134, "grad_norm": 4487.62646484375, "learning_rate": 0.00048768655168996734, "loss": 40.159, "step": 1986 }, { "epoch": 5.24793661274348, "grad_norm": 3858.9296875, "learning_rate": 0.0004876735988934887, "loss": 40.5242, "step": 1987 }, { "epoch": 5.250577748431826, "grad_norm": 3610.739990234375, "learning_rate": 0.00048766063946011453, "loss": 39.6816, "step": 1988 }, { "epoch": 5.253218884120171, "grad_norm": 4727.02294921875, "learning_rate": 0.00048764767339020657, "loss": 40.7895, "step": 1989 }, { "epoch": 5.255860019808518, "grad_norm": 2936.527587890625, "learning_rate": 0.00048763470068412685, "loss": 38.9515, "step": 1990 }, { "epoch": 5.258501155496863, "grad_norm": 3345.8818359375, "learning_rate": 0.0004876217213422378, "loss": 39.7754, "step": 1991 }, { "epoch": 5.26114229118521, "grad_norm": 4240.26025390625, "learning_rate": 0.0004876087353649018, "loss": 40.4418, "step": 1992 }, { "epoch": 5.263783426873555, "grad_norm": 61946.27734375, "learning_rate": 0.0004875957427524814, "loss": 47.3338, "step": 1993 }, { "epoch": 5.266424562561902, "grad_norm": 1318.1866455078125, "learning_rate": 0.0004875827435053394, "loss": 41.7576, "step": 1994 }, { "epoch": 5.269065698250247, "grad_norm": 1999.7906494140625, "learning_rate": 0.00048756973762383894, "loss": 43.0175, "step": 1995 }, { "epoch": 5.271706833938594, "grad_norm": 2474.958251953125, "learning_rate": 0.00048755672510834304, "loss": 42.337, "step": 1996 }, { "epoch": 5.274347969626939, "grad_norm": 3494.036865234375, "learning_rate": 0.00048754370595921514, "loss": 45.4384, "step": 1997 }, { "epoch": 5.276989105315286, "grad_norm": 2248.307861328125, "learning_rate": 0.00048753068017681886, "loss": 47.9663, "step": 1998 }, { "epoch": 5.279630241003631, "grad_norm": 2584.77587890625, "learning_rate": 0.00048751764776151785, "loss": 47.1945, "step": 1999 }, { "epoch": 5.282271376691978, "grad_norm": 4234.43603515625, "learning_rate": 0.000487504608713676, "loss": 45.8481, "step": 2000 }, { "epoch": 5.282271376691978, "eval_loss": 6.369674205780029, "eval_runtime": 2.211, "eval_samples_per_second": 223.883, "eval_steps_per_second": 28.042, "step": 2000 }, { "epoch": 5.284912512380323, "grad_norm": 1799.20263671875, "learning_rate": 0.0004874915630336575, "loss": 46.4322, "step": 2001 }, { "epoch": 5.28755364806867, "grad_norm": 1304.071044921875, "learning_rate": 0.00048747851072182666, "loss": 45.5208, "step": 2002 }, { "epoch": 5.290194783757015, "grad_norm": 1709.2982177734375, "learning_rate": 0.00048746545177854786, "loss": 43.1325, "step": 2003 }, { "epoch": 5.292835919445362, "grad_norm": 3098.7080078125, "learning_rate": 0.0004874523862041858, "loss": 43.0686, "step": 2004 }, { "epoch": 5.295477055133707, "grad_norm": 2044.3682861328125, "learning_rate": 0.00048743931399910536, "loss": 41.6898, "step": 2005 }, { "epoch": 5.298118190822054, "grad_norm": 1238.202880859375, "learning_rate": 0.0004874262351636716, "loss": 41.5709, "step": 2006 }, { "epoch": 5.300759326510399, "grad_norm": 1970.096923828125, "learning_rate": 0.0004874131496982496, "loss": 38.3935, "step": 2007 }, { "epoch": 5.303400462198746, "grad_norm": 4515.09619140625, "learning_rate": 0.00048740005760320495, "loss": 40.1485, "step": 2008 }, { "epoch": 5.306041597887091, "grad_norm": 1784.1639404296875, "learning_rate": 0.00048738695887890306, "loss": 39.0318, "step": 2009 }, { "epoch": 5.308682733575438, "grad_norm": 1828.2628173828125, "learning_rate": 0.00048737385352570984, "loss": 38.8949, "step": 2010 }, { "epoch": 5.311323869263783, "grad_norm": 1246.7919921875, "learning_rate": 0.00048736074154399125, "loss": 37.9793, "step": 2011 }, { "epoch": 5.313965004952129, "grad_norm": 1926.3719482421875, "learning_rate": 0.00048734762293411326, "loss": 38.5469, "step": 2012 }, { "epoch": 5.316606140640475, "grad_norm": 1443.648193359375, "learning_rate": 0.00048733449769644246, "loss": 38.8256, "step": 2013 }, { "epoch": 5.319247276328821, "grad_norm": 1201.653564453125, "learning_rate": 0.0004873213658313452, "loss": 39.6533, "step": 2014 }, { "epoch": 5.3218884120171674, "grad_norm": 29694.2890625, "learning_rate": 0.0004873082273391881, "loss": 63.1012, "step": 2015 }, { "epoch": 5.324529547705513, "grad_norm": 4747.32275390625, "learning_rate": 0.00048729508222033823, "loss": 83.2067, "step": 2016 }, { "epoch": 5.3271706833938595, "grad_norm": 36036.6015625, "learning_rate": 0.0004872819304751627, "loss": 73.1217, "step": 2017 }, { "epoch": 5.329811819082205, "grad_norm": 5668.689453125, "learning_rate": 0.00048726877210402846, "loss": 68.8694, "step": 2018 }, { "epoch": 5.3324529547705515, "grad_norm": 19005.0390625, "learning_rate": 0.0004872556071073032, "loss": 73.1325, "step": 2019 }, { "epoch": 5.335094090458897, "grad_norm": 5687.5576171875, "learning_rate": 0.0004872424354853545, "loss": 51.2926, "step": 2020 }, { "epoch": 5.3377352261472435, "grad_norm": 4662.142578125, "learning_rate": 0.00048722925723855017, "loss": 45.9417, "step": 2021 }, { "epoch": 5.340376361835589, "grad_norm": 6718.05419921875, "learning_rate": 0.0004872160723672582, "loss": 38.2974, "step": 2022 }, { "epoch": 5.3430174975239355, "grad_norm": 5147.7646484375, "learning_rate": 0.0004872028808718467, "loss": 29.5163, "step": 2023 }, { "epoch": 5.345658633212281, "grad_norm": 18648.78515625, "learning_rate": 0.00048718968275268416, "loss": 25.3994, "step": 2024 }, { "epoch": 5.3482997689006275, "grad_norm": 4223.45751953125, "learning_rate": 0.000487176478010139, "loss": 78.9162, "step": 2025 }, { "epoch": 5.350940904588973, "grad_norm": 6465.35888671875, "learning_rate": 0.00048716326664458004, "loss": 82.4063, "step": 2026 }, { "epoch": 5.3535820402773195, "grad_norm": 4351.017578125, "learning_rate": 0.0004871500486563761, "loss": 74.0535, "step": 2027 }, { "epoch": 5.356223175965665, "grad_norm": 3515.26318359375, "learning_rate": 0.00048713682404589656, "loss": 56.6079, "step": 2028 }, { "epoch": 5.358864311654012, "grad_norm": 3812.277099609375, "learning_rate": 0.0004871235928135103, "loss": 46.5203, "step": 2029 }, { "epoch": 5.361505447342357, "grad_norm": 2132.301513671875, "learning_rate": 0.00048711035495958713, "loss": 41.5612, "step": 2030 }, { "epoch": 5.364146583030704, "grad_norm": 3424.572021484375, "learning_rate": 0.0004870971104844966, "loss": 40.3879, "step": 2031 }, { "epoch": 5.366787718719049, "grad_norm": 3899.810791015625, "learning_rate": 0.0004870838593886084, "loss": 42.3167, "step": 2032 }, { "epoch": 5.369428854407396, "grad_norm": 2416.852294921875, "learning_rate": 0.00048707060167229276, "loss": 42.6515, "step": 2033 }, { "epoch": 5.372069990095741, "grad_norm": 4347.53662109375, "learning_rate": 0.0004870573373359199, "loss": 42.4487, "step": 2034 }, { "epoch": 5.374711125784087, "grad_norm": 6938.53076171875, "learning_rate": 0.0004870440663798601, "loss": 46.7768, "step": 2035 }, { "epoch": 5.377352261472433, "grad_norm": 3884.035400390625, "learning_rate": 0.000487030788804484, "loss": 46.1469, "step": 2036 }, { "epoch": 5.379993397160779, "grad_norm": 4646.224609375, "learning_rate": 0.0004870175046101624, "loss": 46.9345, "step": 2037 }, { "epoch": 5.382634532849125, "grad_norm": 3648.950439453125, "learning_rate": 0.0004870042137972661, "loss": 43.9393, "step": 2038 }, { "epoch": 5.385275668537471, "grad_norm": 2243.331787109375, "learning_rate": 0.00048699091636616646, "loss": 42.1309, "step": 2039 }, { "epoch": 5.387916804225817, "grad_norm": 4203.9931640625, "learning_rate": 0.0004869776123172346, "loss": 42.7358, "step": 2040 }, { "epoch": 5.390557939914163, "grad_norm": 2643.47509765625, "learning_rate": 0.0004869643016508422, "loss": 42.9661, "step": 2041 }, { "epoch": 5.393199075602509, "grad_norm": 5613.892578125, "learning_rate": 0.00048695098436736084, "loss": 41.4949, "step": 2042 }, { "epoch": 5.395840211290855, "grad_norm": 10427.48046875, "learning_rate": 0.00048693766046716246, "loss": 48.1696, "step": 2043 }, { "epoch": 5.398481346979201, "grad_norm": 1927.599609375, "learning_rate": 0.0004869243299506191, "loss": 43.2876, "step": 2044 }, { "epoch": 5.401122482667547, "grad_norm": 2375.511962890625, "learning_rate": 0.000486910992818103, "loss": 40.8715, "step": 2045 }, { "epoch": 5.403763618355893, "grad_norm": 1252.0404052734375, "learning_rate": 0.0004868976490699866, "loss": 43.1371, "step": 2046 }, { "epoch": 5.406404754044239, "grad_norm": 1763.26953125, "learning_rate": 0.0004868842987066425, "loss": 43.1415, "step": 2047 }, { "epoch": 5.409045889732585, "grad_norm": 1441.6248779296875, "learning_rate": 0.00048687094172844347, "loss": 47.0779, "step": 2048 }, { "epoch": 5.411687025420931, "grad_norm": 1318.6334228515625, "learning_rate": 0.0004868575781357626, "loss": 48.1088, "step": 2049 }, { "epoch": 5.414328161109277, "grad_norm": 1421.7208251953125, "learning_rate": 0.000486844207928973, "loss": 46.5855, "step": 2050 }, { "epoch": 5.416969296797623, "grad_norm": 1016.9189453125, "learning_rate": 0.000486830831108448, "loss": 45.8222, "step": 2051 }, { "epoch": 5.419610432485969, "grad_norm": 1161.158935546875, "learning_rate": 0.00048681744767456117, "loss": 45.0068, "step": 2052 }, { "epoch": 5.422251568174315, "grad_norm": 1334.9813232421875, "learning_rate": 0.00048680405762768623, "loss": 42.5225, "step": 2053 }, { "epoch": 5.424892703862661, "grad_norm": 1607.8177490234375, "learning_rate": 0.0004867906609681971, "loss": 45.4879, "step": 2054 }, { "epoch": 5.427533839551007, "grad_norm": 1163.5706787109375, "learning_rate": 0.0004867772576964678, "loss": 43.105, "step": 2055 }, { "epoch": 5.430174975239353, "grad_norm": 585.7183227539062, "learning_rate": 0.0004867638478128728, "loss": 40.4427, "step": 2056 }, { "epoch": 5.432816110927699, "grad_norm": 517.2630615234375, "learning_rate": 0.00048675043131778627, "loss": 39.1762, "step": 2057 }, { "epoch": 5.4354572466160445, "grad_norm": 1436.26171875, "learning_rate": 0.00048673700821158305, "loss": 40.883, "step": 2058 }, { "epoch": 5.438098382304391, "grad_norm": 1049.279052734375, "learning_rate": 0.000486723578494638, "loss": 40.8511, "step": 2059 }, { "epoch": 5.4407395179927365, "grad_norm": 1231.194091796875, "learning_rate": 0.0004867101421673261, "loss": 40.8846, "step": 2060 }, { "epoch": 5.443380653681083, "grad_norm": 754.8832397460938, "learning_rate": 0.00048669669923002247, "loss": 38.931, "step": 2061 }, { "epoch": 5.4460217893694285, "grad_norm": 1874.0343017578125, "learning_rate": 0.0004866832496831026, "loss": 38.3984, "step": 2062 }, { "epoch": 5.448662925057775, "grad_norm": 846.151611328125, "learning_rate": 0.000486669793526942, "loss": 38.1793, "step": 2063 }, { "epoch": 5.4513040607461205, "grad_norm": 763.9612426757812, "learning_rate": 0.0004866563307619164, "loss": 38.0221, "step": 2064 }, { "epoch": 5.453945196434467, "grad_norm": 1583.5762939453125, "learning_rate": 0.00048664286138840187, "loss": 48.0755, "step": 2065 }, { "epoch": 5.4565863321228125, "grad_norm": 5586.24853515625, "learning_rate": 0.0004866293854067744, "loss": 75.0818, "step": 2066 }, { "epoch": 5.459227467811159, "grad_norm": 3105.241943359375, "learning_rate": 0.0004866159028174104, "loss": 65.1336, "step": 2067 }, { "epoch": 5.4618686034995045, "grad_norm": 22695.951171875, "learning_rate": 0.00048660241362068625, "loss": 77.6185, "step": 2068 }, { "epoch": 5.464509739187851, "grad_norm": 12254.5654296875, "learning_rate": 0.00048658891781697865, "loss": 77.9428, "step": 2069 }, { "epoch": 5.4671508748761966, "grad_norm": 46917.40625, "learning_rate": 0.0004865754154066646, "loss": 60.7088, "step": 2070 }, { "epoch": 5.469792010564543, "grad_norm": 1569.4595947265625, "learning_rate": 0.00048656190639012094, "loss": 55.1535, "step": 2071 }, { "epoch": 5.472433146252889, "grad_norm": 2828.366455078125, "learning_rate": 0.00048654839076772504, "loss": 48.409, "step": 2072 }, { "epoch": 5.475074281941235, "grad_norm": 3429.790283203125, "learning_rate": 0.0004865348685398543, "loss": 35.3963, "step": 2073 }, { "epoch": 5.477715417629581, "grad_norm": 3439.99169921875, "learning_rate": 0.00048652133970688633, "loss": 28.1355, "step": 2074 }, { "epoch": 5.480356553317927, "grad_norm": 1288.9912109375, "learning_rate": 0.0004865078042691989, "loss": 36.2615, "step": 2075 }, { "epoch": 5.482997689006273, "grad_norm": 5506.8671875, "learning_rate": 0.0004864942622271699, "loss": 71.473, "step": 2076 }, { "epoch": 5.485638824694619, "grad_norm": 2664.86962890625, "learning_rate": 0.0004864807135811776, "loss": 76.8888, "step": 2077 }, { "epoch": 5.488279960382965, "grad_norm": 2104.52490234375, "learning_rate": 0.00048646715833160025, "loss": 73.9889, "step": 2078 }, { "epoch": 5.490921096071311, "grad_norm": 2186.445556640625, "learning_rate": 0.0004864535964788165, "loss": 59.6667, "step": 2079 }, { "epoch": 5.493562231759657, "grad_norm": 1391.4820556640625, "learning_rate": 0.00048644002802320483, "loss": 46.9047, "step": 2080 }, { "epoch": 5.496203367448002, "grad_norm": 3143.209716796875, "learning_rate": 0.00048642645296514443, "loss": 43.8416, "step": 2081 }, { "epoch": 5.498844503136349, "grad_norm": 2213.74072265625, "learning_rate": 0.0004864128713050141, "loss": 46.4619, "step": 2082 }, { "epoch": 5.501485638824695, "grad_norm": 2610.855224609375, "learning_rate": 0.00048639928304319326, "loss": 48.7895, "step": 2083 }, { "epoch": 5.504126774513041, "grad_norm": 2591.71826171875, "learning_rate": 0.0004863856881800613, "loss": 54.8697, "step": 2084 }, { "epoch": 5.506767910201386, "grad_norm": 2957.364990234375, "learning_rate": 0.0004863720867159979, "loss": 58.9136, "step": 2085 }, { "epoch": 5.509409045889733, "grad_norm": 2947.177490234375, "learning_rate": 0.00048635847865138274, "loss": 55.2333, "step": 2086 }, { "epoch": 5.512050181578078, "grad_norm": 2111.304931640625, "learning_rate": 0.00048634486398659596, "loss": 50.6957, "step": 2087 }, { "epoch": 5.514691317266425, "grad_norm": 2007.0521240234375, "learning_rate": 0.00048633124272201767, "loss": 42.6973, "step": 2088 }, { "epoch": 5.51733245295477, "grad_norm": 3205.111328125, "learning_rate": 0.00048631761485802826, "loss": 39.187, "step": 2089 }, { "epoch": 5.519973588643117, "grad_norm": 2986.971923828125, "learning_rate": 0.0004863039803950083, "loss": 39.9768, "step": 2090 }, { "epoch": 5.522614724331462, "grad_norm": 2346.667724609375, "learning_rate": 0.00048629033933333845, "loss": 40.8117, "step": 2091 }, { "epoch": 5.525255860019809, "grad_norm": 2477.12646484375, "learning_rate": 0.0004862766916733997, "loss": 41.7591, "step": 2092 }, { "epoch": 5.527896995708154, "grad_norm": 3384.927734375, "learning_rate": 0.0004862630374155732, "loss": 47.1716, "step": 2093 }, { "epoch": 5.530538131396501, "grad_norm": 892.193359375, "learning_rate": 0.0004862493765602401, "loss": 44.7643, "step": 2094 }, { "epoch": 5.533179267084846, "grad_norm": 1236.4019775390625, "learning_rate": 0.00048623570910778194, "loss": 46.2271, "step": 2095 }, { "epoch": 5.535820402773193, "grad_norm": 1260.8538818359375, "learning_rate": 0.0004862220350585804, "loss": 41.7058, "step": 2096 }, { "epoch": 5.538461538461538, "grad_norm": 831.1176147460938, "learning_rate": 0.00048620835441301724, "loss": 44.1669, "step": 2097 }, { "epoch": 5.541102674149885, "grad_norm": 1661.45556640625, "learning_rate": 0.00048619466717147466, "loss": 48.487, "step": 2098 }, { "epoch": 5.54374380983823, "grad_norm": 1475.427001953125, "learning_rate": 0.00048618097333433464, "loss": 46.0774, "step": 2099 }, { "epoch": 5.546384945526577, "grad_norm": 950.3251342773438, "learning_rate": 0.0004861672729019797, "loss": 45.0449, "step": 2100 }, { "epoch": 5.549026081214922, "grad_norm": 826.40625, "learning_rate": 0.0004861535658747924, "loss": 45.093, "step": 2101 }, { "epoch": 5.551667216903269, "grad_norm": 910.88720703125, "learning_rate": 0.0004861398522531555, "loss": 44.2597, "step": 2102 }, { "epoch": 5.554308352591614, "grad_norm": 980.8921508789062, "learning_rate": 0.0004861261320374519, "loss": 45.7471, "step": 2103 }, { "epoch": 5.55694948827996, "grad_norm": 1151.4757080078125, "learning_rate": 0.0004861124052280649, "loss": 46.9293, "step": 2104 }, { "epoch": 5.559590623968306, "grad_norm": 1095.73828125, "learning_rate": 0.0004860986718253776, "loss": 42.5702, "step": 2105 }, { "epoch": 5.562231759656653, "grad_norm": 901.4998168945312, "learning_rate": 0.0004860849318297735, "loss": 43.0507, "step": 2106 }, { "epoch": 5.564872895344998, "grad_norm": 809.9603881835938, "learning_rate": 0.0004860711852416365, "loss": 40.7586, "step": 2107 }, { "epoch": 5.567514031033344, "grad_norm": 1107.46240234375, "learning_rate": 0.00048605743206135026, "loss": 41.0823, "step": 2108 }, { "epoch": 5.57015516672169, "grad_norm": 855.0401000976562, "learning_rate": 0.0004860436722892989, "loss": 43.6074, "step": 2109 }, { "epoch": 5.572796302410036, "grad_norm": 919.3406372070312, "learning_rate": 0.00048602990592586673, "loss": 37.7147, "step": 2110 }, { "epoch": 5.575437438098382, "grad_norm": 699.8070068359375, "learning_rate": 0.000486016132971438, "loss": 38.1291, "step": 2111 }, { "epoch": 5.578078573786728, "grad_norm": 729.9718017578125, "learning_rate": 0.00048600235342639744, "loss": 39.6999, "step": 2112 }, { "epoch": 5.5807197094750745, "grad_norm": 522.4100952148438, "learning_rate": 0.00048598856729112973, "loss": 38.6278, "step": 2113 }, { "epoch": 5.58336084516342, "grad_norm": 767.3037109375, "learning_rate": 0.00048597477456601996, "loss": 40.3247, "step": 2114 }, { "epoch": 5.5860019808517665, "grad_norm": 5291.193359375, "learning_rate": 0.00048596097525145323, "loss": 57.5921, "step": 2115 }, { "epoch": 5.588643116540112, "grad_norm": 7787.39111328125, "learning_rate": 0.0004859471693478149, "loss": 84.0049, "step": 2116 }, { "epoch": 5.5912842522284585, "grad_norm": 4272.24462890625, "learning_rate": 0.0004859333568554904, "loss": 83.3573, "step": 2117 }, { "epoch": 5.593925387916804, "grad_norm": 10652.294921875, "learning_rate": 0.0004859195377748655, "loss": 72.8572, "step": 2118 }, { "epoch": 5.5965665236051505, "grad_norm": 2125.90576171875, "learning_rate": 0.0004859057121063262, "loss": 67.6571, "step": 2119 }, { "epoch": 5.599207659293496, "grad_norm": 5283.32568359375, "learning_rate": 0.0004858918798502584, "loss": 69.6974, "step": 2120 }, { "epoch": 5.6018487949818425, "grad_norm": 4561.3583984375, "learning_rate": 0.0004858780410070484, "loss": 49.7338, "step": 2121 }, { "epoch": 5.604489930670188, "grad_norm": 3072.471435546875, "learning_rate": 0.0004858641955770827, "loss": 45.3049, "step": 2122 }, { "epoch": 5.6071310663585345, "grad_norm": 3295.273681640625, "learning_rate": 0.00048585034356074785, "loss": 35.3249, "step": 2123 }, { "epoch": 5.60977220204688, "grad_norm": 5757.46533203125, "learning_rate": 0.0004858364849584308, "loss": 26.3215, "step": 2124 }, { "epoch": 5.6124133377352265, "grad_norm": 3683.49169921875, "learning_rate": 0.00048582261977051834, "loss": 43.6367, "step": 2125 }, { "epoch": 5.615054473423572, "grad_norm": 2341.104736328125, "learning_rate": 0.0004858087479973978, "loss": 85.3976, "step": 2126 }, { "epoch": 5.617695609111918, "grad_norm": 2671.183837890625, "learning_rate": 0.00048579486963945653, "loss": 85.0086, "step": 2127 }, { "epoch": 5.620336744800264, "grad_norm": 1962.147705078125, "learning_rate": 0.00048578098469708196, "loss": 77.8542, "step": 2128 }, { "epoch": 5.622977880488611, "grad_norm": 2238.1162109375, "learning_rate": 0.00048576709317066195, "loss": 65.4433, "step": 2129 }, { "epoch": 5.625619016176956, "grad_norm": 1643.8389892578125, "learning_rate": 0.00048575319506058434, "loss": 49.8533, "step": 2130 }, { "epoch": 5.628260151865302, "grad_norm": 1115.7620849609375, "learning_rate": 0.0004857392903672373, "loss": 42.7522, "step": 2131 }, { "epoch": 5.630901287553648, "grad_norm": 1446.2552490234375, "learning_rate": 0.000485725379091009, "loss": 42.4977, "step": 2132 }, { "epoch": 5.633542423241994, "grad_norm": 1696.107421875, "learning_rate": 0.00048571146123228793, "loss": 39.7382, "step": 2133 }, { "epoch": 5.63618355893034, "grad_norm": 1554.8768310546875, "learning_rate": 0.0004856975367914629, "loss": 40.1315, "step": 2134 }, { "epoch": 5.638824694618686, "grad_norm": 955.4298095703125, "learning_rate": 0.00048568360576892243, "loss": 42.3384, "step": 2135 }, { "epoch": 5.641465830307032, "grad_norm": 1504.271484375, "learning_rate": 0.0004856696681650559, "loss": 42.3692, "step": 2136 }, { "epoch": 5.644106965995378, "grad_norm": 1747.5126953125, "learning_rate": 0.00048565572398025214, "loss": 43.851, "step": 2137 }, { "epoch": 5.646748101683724, "grad_norm": 1672.248291015625, "learning_rate": 0.0004856417732149009, "loss": 46.0038, "step": 2138 }, { "epoch": 5.64938923737207, "grad_norm": 3891.97216796875, "learning_rate": 0.00048562781586939145, "loss": 45.6147, "step": 2139 }, { "epoch": 5.652030373060416, "grad_norm": 1959.55078125, "learning_rate": 0.00048561385194411364, "loss": 43.7963, "step": 2140 }, { "epoch": 5.654671508748762, "grad_norm": 1473.4161376953125, "learning_rate": 0.0004855998814394575, "loss": 45.65, "step": 2141 }, { "epoch": 5.657312644437108, "grad_norm": 1835.5653076171875, "learning_rate": 0.0004855859043558131, "loss": 45.95, "step": 2142 }, { "epoch": 5.659953780125454, "grad_norm": 3269.027587890625, "learning_rate": 0.0004855719206935706, "loss": 46.0962, "step": 2143 }, { "epoch": 5.6625949158138, "grad_norm": 2369.590087890625, "learning_rate": 0.0004855579304531207, "loss": 43.5447, "step": 2144 }, { "epoch": 5.665236051502146, "grad_norm": 775.2514038085938, "learning_rate": 0.0004855439336348539, "loss": 43.2455, "step": 2145 }, { "epoch": 5.667877187190492, "grad_norm": 939.2423095703125, "learning_rate": 0.00048552993023916115, "loss": 42.0635, "step": 2146 }, { "epoch": 5.670518322878838, "grad_norm": 1203.185302734375, "learning_rate": 0.0004855159202664334, "loss": 44.8474, "step": 2147 }, { "epoch": 5.673159458567184, "grad_norm": 988.8873291015625, "learning_rate": 0.000485501903717062, "loss": 44.895, "step": 2148 }, { "epoch": 5.67580059425553, "grad_norm": 634.7340698242188, "learning_rate": 0.00048548788059143825, "loss": 46.3295, "step": 2149 }, { "epoch": 5.678441729943875, "grad_norm": 1198.881591796875, "learning_rate": 0.00048547385088995376, "loss": 48.1222, "step": 2150 }, { "epoch": 5.681082865632222, "grad_norm": 755.68994140625, "learning_rate": 0.0004854598146130003, "loss": 43.0783, "step": 2151 }, { "epoch": 5.683724001320568, "grad_norm": 1017.38623046875, "learning_rate": 0.00048544577176096984, "loss": 45.9197, "step": 2152 }, { "epoch": 5.686365137008914, "grad_norm": 953.1434936523438, "learning_rate": 0.0004854317223342545, "loss": 44.4687, "step": 2153 }, { "epoch": 5.6890062726972594, "grad_norm": 1774.81982421875, "learning_rate": 0.0004854176663332467, "loss": 45.1305, "step": 2154 }, { "epoch": 5.691647408385606, "grad_norm": 1040.8013916015625, "learning_rate": 0.0004854036037583388, "loss": 43.8196, "step": 2155 }, { "epoch": 5.6942885440739515, "grad_norm": 894.6622314453125, "learning_rate": 0.0004853895346099235, "loss": 41.5939, "step": 2156 }, { "epoch": 5.696929679762298, "grad_norm": 1213.6353759765625, "learning_rate": 0.0004853754588883937, "loss": 43.745, "step": 2157 }, { "epoch": 5.6995708154506435, "grad_norm": 791.9669189453125, "learning_rate": 0.0004853613765941426, "loss": 39.3246, "step": 2158 }, { "epoch": 5.70221195113899, "grad_norm": 692.21484375, "learning_rate": 0.00048534728772756325, "loss": 39.8989, "step": 2159 }, { "epoch": 5.7048530868273355, "grad_norm": 619.09912109375, "learning_rate": 0.00048533319228904917, "loss": 38.2869, "step": 2160 }, { "epoch": 5.707494222515682, "grad_norm": 565.7427368164062, "learning_rate": 0.0004853190902789939, "loss": 40.1876, "step": 2161 }, { "epoch": 5.7101353582040275, "grad_norm": 494.9908752441406, "learning_rate": 0.0004853049816977913, "loss": 40.7686, "step": 2162 }, { "epoch": 5.712776493892374, "grad_norm": 447.1717224121094, "learning_rate": 0.00048529086654583534, "loss": 39.6789, "step": 2163 }, { "epoch": 5.7154176295807195, "grad_norm": 629.2985229492188, "learning_rate": 0.00048527674482352, "loss": 39.1463, "step": 2164 }, { "epoch": 5.718058765269066, "grad_norm": 1205.184814453125, "learning_rate": 0.00048526261653123994, "loss": 41.6209, "step": 2165 }, { "epoch": 5.7206999009574115, "grad_norm": 8154.68994140625, "learning_rate": 0.00048524848166938946, "loss": 68.2997, "step": 2166 }, { "epoch": 5.723341036645758, "grad_norm": 12795.736328125, "learning_rate": 0.00048523434023836333, "loss": 80.1291, "step": 2167 }, { "epoch": 5.725982172334104, "grad_norm": 3625.622314453125, "learning_rate": 0.0004852201922385564, "loss": 68.7365, "step": 2168 }, { "epoch": 5.72862330802245, "grad_norm": 4724.54736328125, "learning_rate": 0.00048520603767036375, "loss": 79.5789, "step": 2169 }, { "epoch": 5.731264443710796, "grad_norm": 5903.4970703125, "learning_rate": 0.00048519187653418074, "loss": 69.1752, "step": 2170 }, { "epoch": 5.733905579399142, "grad_norm": 9544.3916015625, "learning_rate": 0.0004851777088304027, "loss": 54.519, "step": 2171 }, { "epoch": 5.736546715087488, "grad_norm": 4025.054443359375, "learning_rate": 0.0004851635345594253, "loss": 45.3599, "step": 2172 }, { "epoch": 5.739187850775833, "grad_norm": 8264.494140625, "learning_rate": 0.0004851493537216443, "loss": 39.0677, "step": 2173 }, { "epoch": 5.74182898646418, "grad_norm": 2597.177490234375, "learning_rate": 0.0004851351663174558, "loss": 29.0417, "step": 2174 }, { "epoch": 5.744470122152526, "grad_norm": 2178.726318359375, "learning_rate": 0.00048512097234725584, "loss": 34.0224, "step": 2175 }, { "epoch": 5.747111257840872, "grad_norm": 1826.085205078125, "learning_rate": 0.0004851067718114409, "loss": 55.5237, "step": 2176 }, { "epoch": 5.749752393529217, "grad_norm": 1670.9915771484375, "learning_rate": 0.0004850925647104074, "loss": 59.3205, "step": 2177 }, { "epoch": 5.752393529217564, "grad_norm": 9887.693359375, "learning_rate": 0.0004850783510445522, "loss": 56.5708, "step": 2178 }, { "epoch": 5.755034664905909, "grad_norm": 1360.1507568359375, "learning_rate": 0.0004850641308142721, "loss": 52.6981, "step": 2179 }, { "epoch": 5.757675800594256, "grad_norm": 742.73388671875, "learning_rate": 0.0004850499040199643, "loss": 45.3506, "step": 2180 }, { "epoch": 5.760316936282601, "grad_norm": 806.9213256835938, "learning_rate": 0.0004850356706620259, "loss": 42.7403, "step": 2181 }, { "epoch": 5.762958071970948, "grad_norm": 553.0529174804688, "learning_rate": 0.0004850214307408546, "loss": 41.9596, "step": 2182 }, { "epoch": 5.765599207659293, "grad_norm": 766.0611572265625, "learning_rate": 0.00048500718425684776, "loss": 40.28, "step": 2183 }, { "epoch": 5.76824034334764, "grad_norm": 443.6015930175781, "learning_rate": 0.00048499293121040345, "loss": 40.6105, "step": 2184 }, { "epoch": 5.770881479035985, "grad_norm": 630.5136108398438, "learning_rate": 0.00048497867160191954, "loss": 41.4695, "step": 2185 }, { "epoch": 5.773522614724332, "grad_norm": 1118.7349853515625, "learning_rate": 0.0004849644054317942, "loss": 42.2678, "step": 2186 }, { "epoch": 5.776163750412677, "grad_norm": 515.1630859375, "learning_rate": 0.000484950132700426, "loss": 41.1903, "step": 2187 }, { "epoch": 5.778804886101024, "grad_norm": 519.4701538085938, "learning_rate": 0.0004849358534082132, "loss": 41.2016, "step": 2188 }, { "epoch": 5.781446021789369, "grad_norm": 742.1712646484375, "learning_rate": 0.00048492156755555483, "loss": 40.0249, "step": 2189 }, { "epoch": 5.784087157477716, "grad_norm": 746.06787109375, "learning_rate": 0.00048490727514284966, "loss": 40.7755, "step": 2190 }, { "epoch": 5.786728293166061, "grad_norm": 1122.341796875, "learning_rate": 0.00048489297617049676, "loss": 43.1953, "step": 2191 }, { "epoch": 5.789369428854408, "grad_norm": 934.4976806640625, "learning_rate": 0.00048487867063889555, "loss": 45.5811, "step": 2192 }, { "epoch": 5.792010564542753, "grad_norm": 988.7752685546875, "learning_rate": 0.00048486435854844543, "loss": 45.4317, "step": 2193 }, { "epoch": 5.7946517002311, "grad_norm": 650.14404296875, "learning_rate": 0.00048485003989954606, "loss": 43.4827, "step": 2194 }, { "epoch": 5.797292835919445, "grad_norm": 388.2742004394531, "learning_rate": 0.0004848357146925973, "loss": 43.8359, "step": 2195 }, { "epoch": 5.799933971607791, "grad_norm": 464.3681335449219, "learning_rate": 0.0004848213829279992, "loss": 43.7793, "step": 2196 }, { "epoch": 5.802575107296137, "grad_norm": 396.108642578125, "learning_rate": 0.00048480704460615175, "loss": 44.7052, "step": 2197 }, { "epoch": 5.805216242984484, "grad_norm": 341.6488342285156, "learning_rate": 0.00048479269972745567, "loss": 46.7084, "step": 2198 }, { "epoch": 5.807857378672829, "grad_norm": 428.6275939941406, "learning_rate": 0.0004847783482923113, "loss": 45.127, "step": 2199 }, { "epoch": 5.810498514361175, "grad_norm": 679.6702880859375, "learning_rate": 0.0004847639903011195, "loss": 47.8421, "step": 2200 }, { "epoch": 5.810498514361175, "eval_loss": 5.837680339813232, "eval_runtime": 2.1598, "eval_samples_per_second": 229.19, "eval_steps_per_second": 28.707, "step": 2200 }, { "epoch": 5.813139650049521, "grad_norm": 474.55584716796875, "learning_rate": 0.00048474962575428116, "loss": 45.4296, "step": 2201 }, { "epoch": 5.815780785737867, "grad_norm": 382.8533630371094, "learning_rate": 0.00048473525465219747, "loss": 44.6764, "step": 2202 }, { "epoch": 5.818421921426213, "grad_norm": 582.6358032226562, "learning_rate": 0.0004847208769952696, "loss": 44.8364, "step": 2203 }, { "epoch": 5.821063057114559, "grad_norm": 309.5724182128906, "learning_rate": 0.00048470649278389925, "loss": 44.3798, "step": 2204 }, { "epoch": 5.823704192802905, "grad_norm": 583.346923828125, "learning_rate": 0.00048469210201848784, "loss": 40.5568, "step": 2205 }, { "epoch": 5.826345328491251, "grad_norm": 593.0388793945312, "learning_rate": 0.0004846777046994374, "loss": 42.9946, "step": 2206 }, { "epoch": 5.828986464179597, "grad_norm": 430.433349609375, "learning_rate": 0.00048466330082714995, "loss": 41.2709, "step": 2207 }, { "epoch": 5.831627599867943, "grad_norm": 524.1536254882812, "learning_rate": 0.0004846488904020275, "loss": 37.907, "step": 2208 }, { "epoch": 5.834268735556289, "grad_norm": 398.2991638183594, "learning_rate": 0.00048463447342447286, "loss": 40.012, "step": 2209 }, { "epoch": 5.836909871244635, "grad_norm": 305.6785888671875, "learning_rate": 0.0004846200498948883, "loss": 38.665, "step": 2210 }, { "epoch": 5.8395510069329815, "grad_norm": 334.84735107421875, "learning_rate": 0.00048460561981367655, "loss": 37.0165, "step": 2211 }, { "epoch": 5.842192142621327, "grad_norm": 443.9411926269531, "learning_rate": 0.00048459118318124083, "loss": 39.6135, "step": 2212 }, { "epoch": 5.8448332783096735, "grad_norm": 607.9732055664062, "learning_rate": 0.0004845767399979841, "loss": 39.4406, "step": 2213 }, { "epoch": 5.847474413998019, "grad_norm": 542.1907958984375, "learning_rate": 0.00048456229026430967, "loss": 37.9776, "step": 2214 }, { "epoch": 5.8501155496863655, "grad_norm": 1466.8515625, "learning_rate": 0.0004845478339806211, "loss": 56.1756, "step": 2215 }, { "epoch": 5.852756685374711, "grad_norm": 13525.6162109375, "learning_rate": 0.000484533371147322, "loss": 87.5884, "step": 2216 }, { "epoch": 5.8553978210630575, "grad_norm": 5869.68701171875, "learning_rate": 0.0004845189017648163, "loss": 90.155, "step": 2217 }, { "epoch": 5.858038956751403, "grad_norm": 9822.9140625, "learning_rate": 0.000484504425833508, "loss": 97.9773, "step": 2218 }, { "epoch": 5.860680092439749, "grad_norm": 3068.03564453125, "learning_rate": 0.00048448994335380144, "loss": 88.6015, "step": 2219 }, { "epoch": 5.863321228128095, "grad_norm": 2202.78759765625, "learning_rate": 0.0004844754543261009, "loss": 72.9062, "step": 2220 }, { "epoch": 5.8659623638164415, "grad_norm": 3674.72216796875, "learning_rate": 0.0004844609587508111, "loss": 61.7485, "step": 2221 }, { "epoch": 5.868603499504787, "grad_norm": 4984.052734375, "learning_rate": 0.0004844464566283367, "loss": 46.3808, "step": 2222 }, { "epoch": 5.871244635193133, "grad_norm": 5336.8330078125, "learning_rate": 0.0004844319479590827, "loss": 33.9777, "step": 2223 }, { "epoch": 5.873885770881479, "grad_norm": 1944.662841796875, "learning_rate": 0.00048441743274345436, "loss": 29.5632, "step": 2224 }, { "epoch": 5.876526906569825, "grad_norm": 2079.058349609375, "learning_rate": 0.00048440291098185683, "loss": 48.5765, "step": 2225 }, { "epoch": 5.879168042258171, "grad_norm": 1342.9541015625, "learning_rate": 0.00048438838267469574, "loss": 75.0292, "step": 2226 }, { "epoch": 5.881809177946517, "grad_norm": 1418.876953125, "learning_rate": 0.0004843738478223768, "loss": 72.0076, "step": 2227 }, { "epoch": 5.884450313634863, "grad_norm": 1232.8199462890625, "learning_rate": 0.0004843593064253057, "loss": 56.5177, "step": 2228 }, { "epoch": 5.887091449323209, "grad_norm": 542.27490234375, "learning_rate": 0.00048434475848388876, "loss": 44.4267, "step": 2229 }, { "epoch": 5.889732585011555, "grad_norm": 386.1504821777344, "learning_rate": 0.00048433020399853204, "loss": 42.0087, "step": 2230 }, { "epoch": 5.892373720699901, "grad_norm": 449.4161682128906, "learning_rate": 0.000484315642969642, "loss": 42.1517, "step": 2231 }, { "epoch": 5.895014856388247, "grad_norm": 511.8898010253906, "learning_rate": 0.00048430107539762534, "loss": 40.9394, "step": 2232 }, { "epoch": 5.897655992076593, "grad_norm": 618.8399047851562, "learning_rate": 0.0004842865012828888, "loss": 45.1683, "step": 2233 }, { "epoch": 5.900297127764939, "grad_norm": 542.6587524414062, "learning_rate": 0.0004842719206258394, "loss": 42.5959, "step": 2234 }, { "epoch": 5.902938263453285, "grad_norm": 902.0257568359375, "learning_rate": 0.00048425733342688407, "loss": 39.8535, "step": 2235 }, { "epoch": 5.905579399141631, "grad_norm": 687.5231323242188, "learning_rate": 0.0004842427396864304, "loss": 39.6646, "step": 2236 }, { "epoch": 5.908220534829977, "grad_norm": 436.04583740234375, "learning_rate": 0.0004842281394048858, "loss": 38.7519, "step": 2237 }, { "epoch": 5.910861670518323, "grad_norm": 476.8952941894531, "learning_rate": 0.000484213532582658, "loss": 41.5079, "step": 2238 }, { "epoch": 5.913502806206669, "grad_norm": 435.462158203125, "learning_rate": 0.0004841989192201549, "loss": 39.688, "step": 2239 }, { "epoch": 5.916143941895015, "grad_norm": 656.1444702148438, "learning_rate": 0.00048418429931778456, "loss": 44.5656, "step": 2240 }, { "epoch": 5.918785077583361, "grad_norm": 715.3164672851562, "learning_rate": 0.00048416967287595526, "loss": 45.6828, "step": 2241 }, { "epoch": 5.921426213271706, "grad_norm": 1631.4534912109375, "learning_rate": 0.00048415503989507536, "loss": 50.4877, "step": 2242 }, { "epoch": 5.924067348960053, "grad_norm": 1235.808837890625, "learning_rate": 0.0004841404003755535, "loss": 46.3886, "step": 2243 }, { "epoch": 5.926708484648399, "grad_norm": 480.8287353515625, "learning_rate": 0.0004841257543177986, "loss": 45.407, "step": 2244 }, { "epoch": 5.929349620336745, "grad_norm": 359.52734375, "learning_rate": 0.0004841111017222194, "loss": 42.33, "step": 2245 }, { "epoch": 5.93199075602509, "grad_norm": 339.9226379394531, "learning_rate": 0.0004840964425892253, "loss": 44.8969, "step": 2246 }, { "epoch": 5.934631891713437, "grad_norm": 367.3833923339844, "learning_rate": 0.00048408177691922554, "loss": 45.2408, "step": 2247 }, { "epoch": 5.937273027401782, "grad_norm": 340.0387268066406, "learning_rate": 0.0004840671047126297, "loss": 46.2277, "step": 2248 }, { "epoch": 5.939914163090129, "grad_norm": 469.5791015625, "learning_rate": 0.00048405242596984746, "loss": 44.1485, "step": 2249 }, { "epoch": 5.942555298778474, "grad_norm": 305.562255859375, "learning_rate": 0.0004840377406912886, "loss": 42.3707, "step": 2250 }, { "epoch": 5.945196434466821, "grad_norm": 229.82534790039062, "learning_rate": 0.0004840230488773634, "loss": 43.3141, "step": 2251 }, { "epoch": 5.9478375701551665, "grad_norm": 270.2068786621094, "learning_rate": 0.000484008350528482, "loss": 40.0004, "step": 2252 }, { "epoch": 5.950478705843513, "grad_norm": 352.8110656738281, "learning_rate": 0.000483993645645055, "loss": 39.0534, "step": 2253 }, { "epoch": 5.9531198415318585, "grad_norm": 412.40338134765625, "learning_rate": 0.0004839789342274927, "loss": 39.6873, "step": 2254 }, { "epoch": 5.955760977220205, "grad_norm": 432.226318359375, "learning_rate": 0.00048396421627620623, "loss": 39.4818, "step": 2255 }, { "epoch": 5.9584021129085505, "grad_norm": 8438.0693359375, "learning_rate": 0.0004839494917916065, "loss": 71.1672, "step": 2256 }, { "epoch": 5.961043248596897, "grad_norm": 5882.5302734375, "learning_rate": 0.00048393476077410457, "loss": 122.728, "step": 2257 }, { "epoch": 5.9636843842852425, "grad_norm": 4983.5732421875, "learning_rate": 0.00048392002322411186, "loss": 114.2183, "step": 2258 }, { "epoch": 5.966325519973589, "grad_norm": 2244.464599609375, "learning_rate": 0.00048390527914203996, "loss": 114.9769, "step": 2259 }, { "epoch": 5.9689666556619345, "grad_norm": 3349.8447265625, "learning_rate": 0.0004838905285283005, "loss": 107.3331, "step": 2260 }, { "epoch": 5.971607791350281, "grad_norm": 3668.5849609375, "learning_rate": 0.0004838757713833054, "loss": 110.5747, "step": 2261 }, { "epoch": 5.9742489270386265, "grad_norm": 2304.746337890625, "learning_rate": 0.00048386100770746686, "loss": 53.0753, "step": 2262 }, { "epoch": 5.976890062726973, "grad_norm": 583.2804565429688, "learning_rate": 0.000483846237501197, "loss": 41.3765, "step": 2263 }, { "epoch": 5.9795311984153185, "grad_norm": 611.4117431640625, "learning_rate": 0.0004838314607649084, "loss": 45.5098, "step": 2264 }, { "epoch": 5.982172334103664, "grad_norm": 739.2782592773438, "learning_rate": 0.00048381667749901346, "loss": 43.5601, "step": 2265 }, { "epoch": 5.984813469792011, "grad_norm": 829.8128051757812, "learning_rate": 0.00048380188770392524, "loss": 43.1706, "step": 2266 }, { "epoch": 5.987454605480357, "grad_norm": 527.1699829101562, "learning_rate": 0.0004837870913800567, "loss": 40.6601, "step": 2267 }, { "epoch": 5.990095741168703, "grad_norm": 847.624267578125, "learning_rate": 0.0004837722885278209, "loss": 41.0128, "step": 2268 }, { "epoch": 5.992736876857048, "grad_norm": 583.6757202148438, "learning_rate": 0.00048375747914763126, "loss": 40.0402, "step": 2269 }, { "epoch": 5.995378012545395, "grad_norm": 881.7681884765625, "learning_rate": 0.00048374266323990137, "loss": 39.4058, "step": 2270 }, { "epoch": 5.99801914823374, "grad_norm": 534.5806274414062, "learning_rate": 0.00048372784080504493, "loss": 43.9106, "step": 2271 }, { "epoch": 6.000660283922087, "grad_norm": 943.0654907226562, "learning_rate": 0.00048371301184347583, "loss": 45.4953, "step": 2272 }, { "epoch": 6.003301419610432, "grad_norm": 6157.94775390625, "learning_rate": 0.0004836981763556081, "loss": 41.0118, "step": 2273 }, { "epoch": 6.005942555298779, "grad_norm": 348.31072998046875, "learning_rate": 0.0004836833343418561, "loss": 41.9632, "step": 2274 }, { "epoch": 6.008583690987124, "grad_norm": 587.468505859375, "learning_rate": 0.0004836684858026343, "loss": 43.5778, "step": 2275 }, { "epoch": 6.011224826675471, "grad_norm": 448.4602355957031, "learning_rate": 0.00048365363073835734, "loss": 43.621, "step": 2276 }, { "epoch": 6.013865962363816, "grad_norm": 427.034423828125, "learning_rate": 0.0004836387691494399, "loss": 45.6325, "step": 2277 }, { "epoch": 6.016507098052163, "grad_norm": 341.36029052734375, "learning_rate": 0.0004836239010362971, "loss": 46.7097, "step": 2278 }, { "epoch": 6.019148233740508, "grad_norm": 382.0007629394531, "learning_rate": 0.0004836090263993441, "loss": 48.0096, "step": 2279 }, { "epoch": 6.021789369428855, "grad_norm": 426.7140808105469, "learning_rate": 0.0004835941452389963, "loss": 46.0918, "step": 2280 }, { "epoch": 6.0244305051172, "grad_norm": 611.2520751953125, "learning_rate": 0.0004835792575556692, "loss": 44.568, "step": 2281 }, { "epoch": 6.027071640805547, "grad_norm": 505.4507751464844, "learning_rate": 0.00048356436334977856, "loss": 42.1495, "step": 2282 }, { "epoch": 6.029712776493892, "grad_norm": 355.88140869140625, "learning_rate": 0.0004835494626217403, "loss": 43.8153, "step": 2283 }, { "epoch": 6.032353912182239, "grad_norm": 519.4979248046875, "learning_rate": 0.00048353455537197045, "loss": 40.8338, "step": 2284 }, { "epoch": 6.034995047870584, "grad_norm": 397.1269836425781, "learning_rate": 0.0004835196416008853, "loss": 41.2454, "step": 2285 }, { "epoch": 6.037636183558931, "grad_norm": 502.6148376464844, "learning_rate": 0.00048350472130890133, "loss": 39.4562, "step": 2286 }, { "epoch": 6.040277319247276, "grad_norm": 489.3353576660156, "learning_rate": 0.00048348979449643525, "loss": 40.6089, "step": 2287 }, { "epoch": 6.042918454935623, "grad_norm": 1469.704345703125, "learning_rate": 0.0004834748611639038, "loss": 39.0665, "step": 2288 }, { "epoch": 6.045559590623968, "grad_norm": 431.2269287109375, "learning_rate": 0.000483459921311724, "loss": 39.1817, "step": 2289 }, { "epoch": 6.048200726312314, "grad_norm": 966.3207397460938, "learning_rate": 0.000483444974940313, "loss": 38.7006, "step": 2290 }, { "epoch": 6.05084186200066, "grad_norm": 577.0074462890625, "learning_rate": 0.00048343002205008823, "loss": 40.6196, "step": 2291 }, { "epoch": 6.053482997689006, "grad_norm": 657.8614501953125, "learning_rate": 0.0004834150626414672, "loss": 39.7932, "step": 2292 }, { "epoch": 6.056124133377352, "grad_norm": 585.601318359375, "learning_rate": 0.0004834000967148677, "loss": 40.379, "step": 2293 }, { "epoch": 6.058765269065698, "grad_norm": 3570.6328125, "learning_rate": 0.0004833851242707076, "loss": 58.4727, "step": 2294 }, { "epoch": 6.061406404754044, "grad_norm": 17429.51953125, "learning_rate": 0.000483370145309405, "loss": 61.7598, "step": 2295 }, { "epoch": 6.06404754044239, "grad_norm": 9285.4130859375, "learning_rate": 0.00048335515983137826, "loss": 63.1334, "step": 2296 }, { "epoch": 6.066688676130736, "grad_norm": 14132.4482421875, "learning_rate": 0.0004833401678370456, "loss": 66.5085, "step": 2297 }, { "epoch": 6.069329811819082, "grad_norm": 11830.9833984375, "learning_rate": 0.0004833251693268259, "loss": 58.3942, "step": 2298 }, { "epoch": 6.071970947507428, "grad_norm": 3953.404052734375, "learning_rate": 0.00048331016430113794, "loss": 57.5136, "step": 2299 }, { "epoch": 6.074612083195774, "grad_norm": 4444.58447265625, "learning_rate": 0.00048329515276040064, "loss": 53.479, "step": 2300 }, { "epoch": 6.07725321888412, "grad_norm": 4080.06396484375, "learning_rate": 0.0004832801347050333, "loss": 52.4098, "step": 2301 }, { "epoch": 6.079894354572466, "grad_norm": 11200.90234375, "learning_rate": 0.0004832651101354552, "loss": 39.1347, "step": 2302 }, { "epoch": 6.082535490260812, "grad_norm": 9544.62109375, "learning_rate": 0.00048325007905208596, "loss": 35.963, "step": 2303 }, { "epoch": 6.085176625949158, "grad_norm": 1727.6531982421875, "learning_rate": 0.0004832350414553452, "loss": 60.053, "step": 2304 }, { "epoch": 6.087817761637504, "grad_norm": 1793.9461669921875, "learning_rate": 0.000483219997345653, "loss": 57.3332, "step": 2305 }, { "epoch": 6.09045889732585, "grad_norm": 1676.1031494140625, "learning_rate": 0.00048320494672342935, "loss": 53.0497, "step": 2306 }, { "epoch": 6.0931000330141964, "grad_norm": 917.6702880859375, "learning_rate": 0.0004831898895890945, "loss": 45.0599, "step": 2307 }, { "epoch": 6.095741168702542, "grad_norm": 1529.3082275390625, "learning_rate": 0.00048317482594306904, "loss": 41.9398, "step": 2308 }, { "epoch": 6.0983823043908885, "grad_norm": 1504.316650390625, "learning_rate": 0.0004831597557857735, "loss": 40.3602, "step": 2309 }, { "epoch": 6.101023440079234, "grad_norm": 942.801513671875, "learning_rate": 0.00048314467911762885, "loss": 41.5277, "step": 2310 }, { "epoch": 6.1036645757675805, "grad_norm": 709.4506225585938, "learning_rate": 0.00048312959593905586, "loss": 40.3281, "step": 2311 }, { "epoch": 6.106305711455926, "grad_norm": 864.9071655273438, "learning_rate": 0.00048311450625047593, "loss": 40.1975, "step": 2312 }, { "epoch": 6.108946847144272, "grad_norm": 857.0675659179688, "learning_rate": 0.00048309941005231027, "loss": 39.4389, "step": 2313 }, { "epoch": 6.111587982832618, "grad_norm": 854.4354248046875, "learning_rate": 0.0004830843073449805, "loss": 40.301, "step": 2314 }, { "epoch": 6.114229118520964, "grad_norm": 944.863037109375, "learning_rate": 0.0004830691981289085, "loss": 40.4995, "step": 2315 }, { "epoch": 6.11687025420931, "grad_norm": 781.4998168945312, "learning_rate": 0.000483054082404516, "loss": 40.9354, "step": 2316 }, { "epoch": 6.119511389897656, "grad_norm": 1990.0208740234375, "learning_rate": 0.00048303896017222513, "loss": 40.965, "step": 2317 }, { "epoch": 6.122152525586002, "grad_norm": 1333.2071533203125, "learning_rate": 0.0004830238314324582, "loss": 40.8383, "step": 2318 }, { "epoch": 6.124793661274348, "grad_norm": 956.6931762695312, "learning_rate": 0.00048300869618563767, "loss": 42.7874, "step": 2319 }, { "epoch": 6.127434796962694, "grad_norm": 808.9555053710938, "learning_rate": 0.00048299355443218617, "loss": 41.2534, "step": 2320 }, { "epoch": 6.13007593265104, "grad_norm": 1176.8763427734375, "learning_rate": 0.0004829784061725265, "loss": 44.3993, "step": 2321 }, { "epoch": 6.132717068339386, "grad_norm": 1749.8004150390625, "learning_rate": 0.0004829632514070818, "loss": 46.5581, "step": 2322 }, { "epoch": 6.135358204027732, "grad_norm": 875.4750366210938, "learning_rate": 0.00048294809013627504, "loss": 42.4168, "step": 2323 }, { "epoch": 6.137999339716078, "grad_norm": 746.5895385742188, "learning_rate": 0.00048293292236052974, "loss": 42.803, "step": 2324 }, { "epoch": 6.140640475404424, "grad_norm": 799.7488403320312, "learning_rate": 0.0004829177480802694, "loss": 40.1402, "step": 2325 }, { "epoch": 6.14328161109277, "grad_norm": 635.5452880859375, "learning_rate": 0.00048290256729591777, "loss": 46.0616, "step": 2326 }, { "epoch": 6.145922746781116, "grad_norm": 543.2235717773438, "learning_rate": 0.0004828873800078988, "loss": 47.7241, "step": 2327 }, { "epoch": 6.148563882469462, "grad_norm": 657.936279296875, "learning_rate": 0.00048287218621663653, "loss": 48.2536, "step": 2328 }, { "epoch": 6.151205018157808, "grad_norm": 457.11083984375, "learning_rate": 0.0004828569859225552, "loss": 45.1161, "step": 2329 }, { "epoch": 6.153846153846154, "grad_norm": 686.60498046875, "learning_rate": 0.00048284177912607943, "loss": 47.5115, "step": 2330 }, { "epoch": 6.1564872895345, "grad_norm": 543.044677734375, "learning_rate": 0.0004828265658276337, "loss": 44.7672, "step": 2331 }, { "epoch": 6.159128425222846, "grad_norm": 425.0478820800781, "learning_rate": 0.0004828113460276429, "loss": 43.149, "step": 2332 }, { "epoch": 6.161769560911192, "grad_norm": 527.502197265625, "learning_rate": 0.00048279611972653204, "loss": 44.3906, "step": 2333 }, { "epoch": 6.164410696599538, "grad_norm": 529.286865234375, "learning_rate": 0.0004827808869247262, "loss": 45.222, "step": 2334 }, { "epoch": 6.167051832287884, "grad_norm": 737.755126953125, "learning_rate": 0.00048276564762265094, "loss": 43.9492, "step": 2335 }, { "epoch": 6.16969296797623, "grad_norm": 490.29486083984375, "learning_rate": 0.0004827504018207317, "loss": 40.1178, "step": 2336 }, { "epoch": 6.172334103664576, "grad_norm": 400.6864013671875, "learning_rate": 0.00048273514951939413, "loss": 39.859, "step": 2337 }, { "epoch": 6.174975239352921, "grad_norm": 324.2041015625, "learning_rate": 0.0004827198907190643, "loss": 41.6101, "step": 2338 }, { "epoch": 6.177616375041268, "grad_norm": 362.897705078125, "learning_rate": 0.0004827046254201682, "loss": 39.5089, "step": 2339 }, { "epoch": 6.180257510729613, "grad_norm": 367.3500671386719, "learning_rate": 0.00048268935362313215, "loss": 38.7363, "step": 2340 }, { "epoch": 6.18289864641796, "grad_norm": 345.95855712890625, "learning_rate": 0.0004826740753283826, "loss": 39.0191, "step": 2341 }, { "epoch": 6.185539782106305, "grad_norm": 275.2092590332031, "learning_rate": 0.0004826587905363461, "loss": 36.9909, "step": 2342 }, { "epoch": 6.188180917794652, "grad_norm": 256.6844482421875, "learning_rate": 0.00048264349924744964, "loss": 39.0794, "step": 2343 }, { "epoch": 6.190822053482997, "grad_norm": 1252.0670166015625, "learning_rate": 0.00048262820146212007, "loss": 57.9671, "step": 2344 }, { "epoch": 6.193463189171344, "grad_norm": 2839.57470703125, "learning_rate": 0.0004826128971807847, "loss": 71.4396, "step": 2345 }, { "epoch": 6.196104324859689, "grad_norm": 3618.19970703125, "learning_rate": 0.00048259758640387075, "loss": 73.5634, "step": 2346 }, { "epoch": 6.198745460548036, "grad_norm": 3248.666748046875, "learning_rate": 0.0004825822691318058, "loss": 59.9894, "step": 2347 }, { "epoch": 6.201386596236381, "grad_norm": 6418.37939453125, "learning_rate": 0.0004825669453650177, "loss": 58.8741, "step": 2348 }, { "epoch": 6.204027731924728, "grad_norm": 3673.85400390625, "learning_rate": 0.0004825516151039342, "loss": 46.7169, "step": 2349 }, { "epoch": 6.2066688676130735, "grad_norm": 2382.983642578125, "learning_rate": 0.0004825362783489835, "loss": 37.1664, "step": 2350 }, { "epoch": 6.20931000330142, "grad_norm": 6395.1435546875, "learning_rate": 0.0004825209351005938, "loss": 27.8026, "step": 2351 }, { "epoch": 6.2119511389897655, "grad_norm": 4498.70068359375, "learning_rate": 0.0004825055853591935, "loss": 30.2142, "step": 2352 }, { "epoch": 6.214592274678112, "grad_norm": 5076.7490234375, "learning_rate": 0.00048249022912521145, "loss": 18.8111, "step": 2353 }, { "epoch": 6.2172334103664575, "grad_norm": 6581.50390625, "learning_rate": 0.0004824748663990763, "loss": 18.2708, "step": 2354 }, { "epoch": 6.219874546054804, "grad_norm": 2366.94580078125, "learning_rate": 0.00048245949718121706, "loss": 89.7822, "step": 2355 }, { "epoch": 6.2225156817431495, "grad_norm": 1791.5677490234375, "learning_rate": 0.00048244412147206283, "loss": 94.1076, "step": 2356 }, { "epoch": 6.225156817431496, "grad_norm": 2071.794677734375, "learning_rate": 0.00048242873927204313, "loss": 84.8467, "step": 2357 }, { "epoch": 6.2277979531198415, "grad_norm": 1730.591064453125, "learning_rate": 0.0004824133505815874, "loss": 61.4477, "step": 2358 }, { "epoch": 6.230439088808188, "grad_norm": 886.9180297851562, "learning_rate": 0.0004823979554011254, "loss": 51.985, "step": 2359 }, { "epoch": 6.2330802244965335, "grad_norm": 688.2098388671875, "learning_rate": 0.00048238255373108703, "loss": 43.4007, "step": 2360 }, { "epoch": 6.235721360184879, "grad_norm": 2394.253662109375, "learning_rate": 0.00048236714557190234, "loss": 43.9181, "step": 2361 }, { "epoch": 6.2383624958732256, "grad_norm": 1877.661376953125, "learning_rate": 0.0004823517309240016, "loss": 55.0759, "step": 2362 }, { "epoch": 6.241003631561571, "grad_norm": 2324.734619140625, "learning_rate": 0.0004823363097878153, "loss": 67.6621, "step": 2363 }, { "epoch": 6.243644767249918, "grad_norm": 2060.418212890625, "learning_rate": 0.00048232088216377395, "loss": 70.4786, "step": 2364 }, { "epoch": 6.246285902938263, "grad_norm": 1914.5567626953125, "learning_rate": 0.00048230544805230846, "loss": 59.583, "step": 2365 }, { "epoch": 6.24892703862661, "grad_norm": 1510.7724609375, "learning_rate": 0.0004822900074538499, "loss": 49.177, "step": 2366 }, { "epoch": 6.251568174314955, "grad_norm": 887.6524047851562, "learning_rate": 0.00048227456036882923, "loss": 45.9556, "step": 2367 }, { "epoch": 6.254209310003302, "grad_norm": 867.2651977539062, "learning_rate": 0.0004822591067976779, "loss": 43.9124, "step": 2368 }, { "epoch": 6.256850445691647, "grad_norm": 694.0419921875, "learning_rate": 0.0004822436467408275, "loss": 46.0542, "step": 2369 }, { "epoch": 6.259491581379994, "grad_norm": 1492.5177001953125, "learning_rate": 0.00048222818019870965, "loss": 48.1928, "step": 2370 }, { "epoch": 6.262132717068339, "grad_norm": 2902.5634765625, "learning_rate": 0.00048221270717175636, "loss": 52.3489, "step": 2371 }, { "epoch": 6.264773852756686, "grad_norm": 6279.625, "learning_rate": 0.0004821972276603995, "loss": 60.8845, "step": 2372 }, { "epoch": 6.267414988445031, "grad_norm": 1160.849853515625, "learning_rate": 0.00048218174166507157, "loss": 48.544, "step": 2373 }, { "epoch": 6.270056124133378, "grad_norm": 1097.651123046875, "learning_rate": 0.0004821662491862049, "loss": 47.9942, "step": 2374 }, { "epoch": 6.272697259821723, "grad_norm": 3473.316162109375, "learning_rate": 0.00048215075022423206, "loss": 48.1199, "step": 2375 }, { "epoch": 6.27533839551007, "grad_norm": 952.898193359375, "learning_rate": 0.00048213524477958594, "loss": 49.075, "step": 2376 }, { "epoch": 6.277979531198415, "grad_norm": 722.2518920898438, "learning_rate": 0.0004821197328526994, "loss": 48.829, "step": 2377 }, { "epoch": 6.280620666886762, "grad_norm": 784.2052001953125, "learning_rate": 0.0004821042144440057, "loss": 44.8102, "step": 2378 }, { "epoch": 6.283261802575107, "grad_norm": 1439.9027099609375, "learning_rate": 0.00048208868955393813, "loss": 50.9065, "step": 2379 }, { "epoch": 6.285902938263454, "grad_norm": 1329.84375, "learning_rate": 0.0004820731581829303, "loss": 47.3897, "step": 2380 }, { "epoch": 6.288544073951799, "grad_norm": 625.0285034179688, "learning_rate": 0.0004820576203314159, "loss": 47.0226, "step": 2381 }, { "epoch": 6.291185209640146, "grad_norm": 812.9695434570312, "learning_rate": 0.0004820420759998286, "loss": 46.2299, "step": 2382 }, { "epoch": 6.293826345328491, "grad_norm": 908.65771484375, "learning_rate": 0.00048202652518860277, "loss": 45.2774, "step": 2383 }, { "epoch": 6.296467481016837, "grad_norm": 1318.831787109375, "learning_rate": 0.00048201096789817253, "loss": 44.2968, "step": 2384 }, { "epoch": 6.299108616705183, "grad_norm": 703.249755859375, "learning_rate": 0.00048199540412897224, "loss": 42.8447, "step": 2385 }, { "epoch": 6.301749752393529, "grad_norm": 1317.9140625, "learning_rate": 0.0004819798338814366, "loss": 43.3724, "step": 2386 }, { "epoch": 6.304390888081875, "grad_norm": 767.978271484375, "learning_rate": 0.00048196425715600036, "loss": 38.8134, "step": 2387 }, { "epoch": 6.307032023770221, "grad_norm": 643.6076049804688, "learning_rate": 0.0004819486739530985, "loss": 38.3951, "step": 2388 }, { "epoch": 6.309673159458567, "grad_norm": 1047.471435546875, "learning_rate": 0.00048193308427316617, "loss": 39.2872, "step": 2389 }, { "epoch": 6.312314295146913, "grad_norm": 506.8388977050781, "learning_rate": 0.00048191748811663874, "loss": 37.5911, "step": 2390 }, { "epoch": 6.314955430835259, "grad_norm": 991.6057739257812, "learning_rate": 0.00048190188548395164, "loss": 39.4547, "step": 2391 }, { "epoch": 6.317596566523605, "grad_norm": 997.2918090820312, "learning_rate": 0.0004818862763755407, "loss": 37.7785, "step": 2392 }, { "epoch": 6.320237702211951, "grad_norm": 1074.49755859375, "learning_rate": 0.0004818706607918416, "loss": 40.7421, "step": 2393 }, { "epoch": 6.322878837900297, "grad_norm": 22939.73828125, "learning_rate": 0.0004818550387332906, "loss": 58.0528, "step": 2394 }, { "epoch": 6.325519973588643, "grad_norm": 7364.1767578125, "learning_rate": 0.0004818394102003238, "loss": 65.8051, "step": 2395 }, { "epoch": 6.328161109276989, "grad_norm": 28842.01171875, "learning_rate": 0.0004818237751933777, "loss": 58.6696, "step": 2396 }, { "epoch": 6.330802244965335, "grad_norm": 7597.83837890625, "learning_rate": 0.00048180813371288876, "loss": 57.8495, "step": 2397 }, { "epoch": 6.333443380653681, "grad_norm": 6608.02685546875, "learning_rate": 0.0004817924857592939, "loss": 56.4981, "step": 2398 }, { "epoch": 6.336084516342027, "grad_norm": 7603.21533203125, "learning_rate": 0.0004817768313330301, "loss": 49.9904, "step": 2399 }, { "epoch": 6.338725652030373, "grad_norm": 17693.193359375, "learning_rate": 0.00048176117043453436, "loss": 43.9395, "step": 2400 }, { "epoch": 6.338725652030373, "eval_loss": 5.122340679168701, "eval_runtime": 2.1171, "eval_samples_per_second": 233.806, "eval_steps_per_second": 29.285, "step": 2400 }, { "epoch": 6.341366787718719, "grad_norm": 12744.251953125, "learning_rate": 0.00048174550306424413, "loss": 39.3199, "step": 2401 }, { "epoch": 6.344007923407065, "grad_norm": 2702.08984375, "learning_rate": 0.0004817298292225969, "loss": 40.1417, "step": 2402 }, { "epoch": 6.346649059095411, "grad_norm": 3948.4736328125, "learning_rate": 0.0004817141489100302, "loss": 36.6853, "step": 2403 }, { "epoch": 6.349290194783757, "grad_norm": 2310.332763671875, "learning_rate": 0.0004816984621269821, "loss": 41.3176, "step": 2404 }, { "epoch": 6.3519313304721035, "grad_norm": 3692.28955078125, "learning_rate": 0.0004816827688738905, "loss": 56.4013, "step": 2405 }, { "epoch": 6.354572466160449, "grad_norm": 5520.9091796875, "learning_rate": 0.00048166706915119374, "loss": 55.887, "step": 2406 }, { "epoch": 6.357213601848795, "grad_norm": 1916.066650390625, "learning_rate": 0.0004816513629593301, "loss": 56.2164, "step": 2407 }, { "epoch": 6.359854737537141, "grad_norm": 2767.151611328125, "learning_rate": 0.00048163565029873826, "loss": 53.3464, "step": 2408 }, { "epoch": 6.362495873225487, "grad_norm": 2206.83544921875, "learning_rate": 0.0004816199311698569, "loss": 44.7476, "step": 2409 }, { "epoch": 6.365137008913833, "grad_norm": 2103.451416015625, "learning_rate": 0.00048160420557312504, "loss": 43.7381, "step": 2410 }, { "epoch": 6.367778144602179, "grad_norm": 1979.781494140625, "learning_rate": 0.00048158847350898183, "loss": 40.243, "step": 2411 }, { "epoch": 6.370419280290525, "grad_norm": 932.767578125, "learning_rate": 0.0004815727349778666, "loss": 41.9221, "step": 2412 }, { "epoch": 6.373060415978871, "grad_norm": 1421.6873779296875, "learning_rate": 0.0004815569899802187, "loss": 40.6091, "step": 2413 }, { "epoch": 6.375701551667217, "grad_norm": 2299.0341796875, "learning_rate": 0.00048154123851647777, "loss": 42.4494, "step": 2414 }, { "epoch": 6.378342687355563, "grad_norm": 1313.3624267578125, "learning_rate": 0.00048152548058708397, "loss": 41.7907, "step": 2415 }, { "epoch": 6.380983823043909, "grad_norm": 2240.642822265625, "learning_rate": 0.00048150971619247704, "loss": 42.2741, "step": 2416 }, { "epoch": 6.383624958732255, "grad_norm": 1322.1051025390625, "learning_rate": 0.0004814939453330973, "loss": 38.9616, "step": 2417 }, { "epoch": 6.386266094420601, "grad_norm": 2976.261474609375, "learning_rate": 0.000481478168009385, "loss": 42.7424, "step": 2418 }, { "epoch": 6.388907230108947, "grad_norm": 1408.9256591796875, "learning_rate": 0.000481462384221781, "loss": 40.1643, "step": 2419 }, { "epoch": 6.391548365797293, "grad_norm": 2687.1328125, "learning_rate": 0.00048144659397072586, "loss": 41.4842, "step": 2420 }, { "epoch": 6.394189501485639, "grad_norm": 2278.03564453125, "learning_rate": 0.0004814307972566605, "loss": 44.0539, "step": 2421 }, { "epoch": 6.396830637173985, "grad_norm": 3087.126708984375, "learning_rate": 0.0004814149940800261, "loss": 45.1375, "step": 2422 }, { "epoch": 6.399471772862331, "grad_norm": 1020.227294921875, "learning_rate": 0.0004813991844412639, "loss": 40.3044, "step": 2423 }, { "epoch": 6.402112908550677, "grad_norm": 1166.5723876953125, "learning_rate": 0.0004813833683408154, "loss": 44.4808, "step": 2424 }, { "epoch": 6.404754044239023, "grad_norm": 977.7232666015625, "learning_rate": 0.0004813675457791223, "loss": 42.1521, "step": 2425 }, { "epoch": 6.407395179927369, "grad_norm": 1134.0352783203125, "learning_rate": 0.0004813517167566264, "loss": 43.9072, "step": 2426 }, { "epoch": 6.410036315615715, "grad_norm": 1350.979248046875, "learning_rate": 0.0004813358812737697, "loss": 45.1386, "step": 2427 }, { "epoch": 6.412677451304061, "grad_norm": 1813.4404296875, "learning_rate": 0.00048132003933099444, "loss": 46.8448, "step": 2428 }, { "epoch": 6.415318586992407, "grad_norm": 1076.62255859375, "learning_rate": 0.00048130419092874287, "loss": 44.7488, "step": 2429 }, { "epoch": 6.417959722680752, "grad_norm": 806.731201171875, "learning_rate": 0.00048128833606745773, "loss": 43.8487, "step": 2430 }, { "epoch": 6.420600858369099, "grad_norm": 1096.067626953125, "learning_rate": 0.00048127247474758167, "loss": 45.8594, "step": 2431 }, { "epoch": 6.423241994057444, "grad_norm": 569.7830810546875, "learning_rate": 0.00048125660696955757, "loss": 46.1407, "step": 2432 }, { "epoch": 6.425883129745791, "grad_norm": 770.477294921875, "learning_rate": 0.0004812407327338286, "loss": 44.2136, "step": 2433 }, { "epoch": 6.428524265434136, "grad_norm": 860.4324340820312, "learning_rate": 0.000481224852040838, "loss": 41.1804, "step": 2434 }, { "epoch": 6.431165401122483, "grad_norm": 1111.7183837890625, "learning_rate": 0.0004812089648910292, "loss": 42.7423, "step": 2435 }, { "epoch": 6.433806536810828, "grad_norm": 3499.59375, "learning_rate": 0.0004811930712848459, "loss": 40.7945, "step": 2436 }, { "epoch": 6.436447672499175, "grad_norm": 600.4011840820312, "learning_rate": 0.00048117717122273196, "loss": 39.4142, "step": 2437 }, { "epoch": 6.43908880818752, "grad_norm": 784.2939453125, "learning_rate": 0.0004811612647051312, "loss": 39.0594, "step": 2438 }, { "epoch": 6.441729943875867, "grad_norm": 1609.5545654296875, "learning_rate": 0.00048114535173248796, "loss": 39.0483, "step": 2439 }, { "epoch": 6.444371079564212, "grad_norm": 529.2343139648438, "learning_rate": 0.0004811294323052466, "loss": 38.9581, "step": 2440 }, { "epoch": 6.447012215252559, "grad_norm": 1682.030029296875, "learning_rate": 0.00048111350642385156, "loss": 40.0869, "step": 2441 }, { "epoch": 6.449653350940904, "grad_norm": 419.42877197265625, "learning_rate": 0.00048109757408874767, "loss": 39.511, "step": 2442 }, { "epoch": 6.452294486629251, "grad_norm": 579.1577758789062, "learning_rate": 0.00048108163530037975, "loss": 39.7201, "step": 2443 }, { "epoch": 6.454935622317596, "grad_norm": 7370.48046875, "learning_rate": 0.00048106569005919287, "loss": 60.0465, "step": 2444 }, { "epoch": 6.457576758005943, "grad_norm": 3093.021484375, "learning_rate": 0.00048104973836563236, "loss": 64.0777, "step": 2445 }, { "epoch": 6.4602178936942884, "grad_norm": 5224.6396484375, "learning_rate": 0.00048103378022014366, "loss": 58.3957, "step": 2446 }, { "epoch": 6.462859029382635, "grad_norm": 8923.01953125, "learning_rate": 0.00048101781562317233, "loss": 53.591, "step": 2447 }, { "epoch": 6.4655001650709805, "grad_norm": 3504.486083984375, "learning_rate": 0.0004810018445751642, "loss": 46.7907, "step": 2448 }, { "epoch": 6.468141300759327, "grad_norm": 13220.890625, "learning_rate": 0.0004809858670765653, "loss": 37.8163, "step": 2449 }, { "epoch": 6.4707824364476725, "grad_norm": 22653.5859375, "learning_rate": 0.0004809698831278217, "loss": 29.7946, "step": 2450 }, { "epoch": 6.473423572136019, "grad_norm": 2196.061767578125, "learning_rate": 0.00048095389272937984, "loss": 23.5287, "step": 2451 }, { "epoch": 6.4760647078243645, "grad_norm": 4227.74853515625, "learning_rate": 0.00048093789588168613, "loss": 19.9349, "step": 2452 }, { "epoch": 6.47870584351271, "grad_norm": 1324.71728515625, "learning_rate": 0.0004809218925851874, "loss": 41.3081, "step": 2453 }, { "epoch": 6.4813469792010565, "grad_norm": 1323.8544921875, "learning_rate": 0.0004809058828403304, "loss": 60.1058, "step": 2454 }, { "epoch": 6.483988114889402, "grad_norm": 1177.8458251953125, "learning_rate": 0.00048088986664756233, "loss": 53.4994, "step": 2455 }, { "epoch": 6.4866292505777485, "grad_norm": 1153.6650390625, "learning_rate": 0.0004808738440073304, "loss": 44.5569, "step": 2456 }, { "epoch": 6.489270386266094, "grad_norm": 904.970703125, "learning_rate": 0.0004808578149200819, "loss": 44.4881, "step": 2457 }, { "epoch": 6.4919115219544405, "grad_norm": 1112.621337890625, "learning_rate": 0.0004808417793862646, "loss": 40.0422, "step": 2458 }, { "epoch": 6.494552657642786, "grad_norm": 1334.1190185546875, "learning_rate": 0.0004808257374063262, "loss": 43.9631, "step": 2459 }, { "epoch": 6.497193793331133, "grad_norm": 1762.855224609375, "learning_rate": 0.00048080968898071467, "loss": 42.2539, "step": 2460 }, { "epoch": 6.499834929019478, "grad_norm": 1488.3045654296875, "learning_rate": 0.00048079363410987813, "loss": 45.3327, "step": 2461 }, { "epoch": 6.502476064707825, "grad_norm": 2964.520263671875, "learning_rate": 0.000480777572794265, "loss": 44.0438, "step": 2462 }, { "epoch": 6.50511720039617, "grad_norm": 1959.4493408203125, "learning_rate": 0.0004807615050343236, "loss": 46.3237, "step": 2463 }, { "epoch": 6.507758336084517, "grad_norm": 1653.410400390625, "learning_rate": 0.00048074543083050287, "loss": 46.8646, "step": 2464 }, { "epoch": 6.510399471772862, "grad_norm": 4860.294921875, "learning_rate": 0.0004807293501832515, "loss": 48.6688, "step": 2465 }, { "epoch": 6.513040607461209, "grad_norm": 7375.931640625, "learning_rate": 0.00048071326309301847, "loss": 51.4041, "step": 2466 }, { "epoch": 6.515681743149554, "grad_norm": 9851.6650390625, "learning_rate": 0.0004806971695602532, "loss": 49.1474, "step": 2467 }, { "epoch": 6.518322878837901, "grad_norm": 2135.523193359375, "learning_rate": 0.00048068106958540493, "loss": 47.5836, "step": 2468 }, { "epoch": 6.520964014526246, "grad_norm": 4619.5087890625, "learning_rate": 0.00048066496316892326, "loss": 49.2017, "step": 2469 }, { "epoch": 6.523605150214593, "grad_norm": 2843.40478515625, "learning_rate": 0.0004806488503112581, "loss": 49.405, "step": 2470 }, { "epoch": 6.526246285902938, "grad_norm": 8615.3310546875, "learning_rate": 0.00048063273101285923, "loss": 50.2118, "step": 2471 }, { "epoch": 6.528887421591284, "grad_norm": 2475.70849609375, "learning_rate": 0.0004806166052741768, "loss": 50.7047, "step": 2472 }, { "epoch": 6.53152855727963, "grad_norm": 2645.690185546875, "learning_rate": 0.00048060047309566125, "loss": 49.9283, "step": 2473 }, { "epoch": 6.534169692967977, "grad_norm": 5533.83837890625, "learning_rate": 0.00048058433447776286, "loss": 48.6347, "step": 2474 }, { "epoch": 6.536810828656322, "grad_norm": 3702.70654296875, "learning_rate": 0.0004805681894209324, "loss": 49.7585, "step": 2475 }, { "epoch": 6.539451964344668, "grad_norm": 1485.1455078125, "learning_rate": 0.0004805520379256207, "loss": 50.0251, "step": 2476 }, { "epoch": 6.542093100033014, "grad_norm": 2217.712646484375, "learning_rate": 0.0004805358799922788, "loss": 55.3572, "step": 2477 }, { "epoch": 6.54473423572136, "grad_norm": 1805.3507080078125, "learning_rate": 0.00048051971562135774, "loss": 53.1073, "step": 2478 }, { "epoch": 6.547375371409706, "grad_norm": 1692.596435546875, "learning_rate": 0.0004805035448133092, "loss": 49.2496, "step": 2479 }, { "epoch": 6.550016507098052, "grad_norm": 2121.675048828125, "learning_rate": 0.0004804873675685845, "loss": 51.8912, "step": 2480 }, { "epoch": 6.552657642786398, "grad_norm": 1344.4134521484375, "learning_rate": 0.0004804711838876354, "loss": 49.9689, "step": 2481 }, { "epoch": 6.555298778474744, "grad_norm": 2274.88037109375, "learning_rate": 0.000480454993770914, "loss": 49.0714, "step": 2482 }, { "epoch": 6.55793991416309, "grad_norm": 1224.68505859375, "learning_rate": 0.00048043879721887216, "loss": 46.6003, "step": 2483 }, { "epoch": 6.560581049851436, "grad_norm": 1003.1577758789062, "learning_rate": 0.00048042259423196245, "loss": 44.59, "step": 2484 }, { "epoch": 6.563222185539782, "grad_norm": 1052.5670166015625, "learning_rate": 0.000480406384810637, "loss": 43.8369, "step": 2485 }, { "epoch": 6.565863321228128, "grad_norm": 851.4990234375, "learning_rate": 0.0004803901689553486, "loss": 42.2983, "step": 2486 }, { "epoch": 6.568504456916474, "grad_norm": 1229.85400390625, "learning_rate": 0.00048037394666655014, "loss": 43.4697, "step": 2487 }, { "epoch": 6.57114559260482, "grad_norm": 749.0133666992188, "learning_rate": 0.0004803577179446945, "loss": 41.665, "step": 2488 }, { "epoch": 6.573786728293166, "grad_norm": 2206.34326171875, "learning_rate": 0.00048034148279023494, "loss": 41.3444, "step": 2489 }, { "epoch": 6.576427863981512, "grad_norm": 580.84716796875, "learning_rate": 0.00048032524120362476, "loss": 40.0116, "step": 2490 }, { "epoch": 6.579068999669858, "grad_norm": 999.0355224609375, "learning_rate": 0.0004803089931853175, "loss": 41.3493, "step": 2491 }, { "epoch": 6.581710135358204, "grad_norm": 1092.7789306640625, "learning_rate": 0.00048029273873576695, "loss": 41.4335, "step": 2492 }, { "epoch": 6.58435127104655, "grad_norm": 1007.6641235351562, "learning_rate": 0.00048027647785542696, "loss": 42.4018, "step": 2493 }, { "epoch": 6.586992406734896, "grad_norm": 2896.825927734375, "learning_rate": 0.00048026021054475154, "loss": 57.9938, "step": 2494 }, { "epoch": 6.5896335424232415, "grad_norm": 5479.490234375, "learning_rate": 0.000480243936804195, "loss": 79.699, "step": 2495 }, { "epoch": 6.592274678111588, "grad_norm": 8667.1435546875, "learning_rate": 0.00048022765663421186, "loss": 65.1452, "step": 2496 }, { "epoch": 6.594915813799934, "grad_norm": 11590.3330078125, "learning_rate": 0.0004802113700352566, "loss": 58.6867, "step": 2497 }, { "epoch": 6.59755694948828, "grad_norm": 2485.232177734375, "learning_rate": 0.00048019507700778404, "loss": 51.9295, "step": 2498 }, { "epoch": 6.6001980851766255, "grad_norm": 8150.74658203125, "learning_rate": 0.0004801787775522492, "loss": 44.4554, "step": 2499 }, { "epoch": 6.602839220864972, "grad_norm": 4719.05615234375, "learning_rate": 0.0004801624716691072, "loss": 30.7411, "step": 2500 }, { "epoch": 6.6054803565533176, "grad_norm": 1590.2655029296875, "learning_rate": 0.0004801461593588134, "loss": 32.3347, "step": 2501 }, { "epoch": 6.608121492241664, "grad_norm": 7536.51318359375, "learning_rate": 0.0004801298406218233, "loss": 24.6865, "step": 2502 }, { "epoch": 6.61076262793001, "grad_norm": 4724.13427734375, "learning_rate": 0.00048011351545859253, "loss": 22.848, "step": 2503 }, { "epoch": 6.613403763618356, "grad_norm": 1313.52734375, "learning_rate": 0.00048009718386957714, "loss": 62.2952, "step": 2504 }, { "epoch": 6.616044899306702, "grad_norm": 1626.9544677734375, "learning_rate": 0.00048008084585523294, "loss": 81.9147, "step": 2505 }, { "epoch": 6.618686034995048, "grad_norm": 1358.84130859375, "learning_rate": 0.00048006450141601634, "loss": 60.8187, "step": 2506 }, { "epoch": 6.621327170683394, "grad_norm": 986.1347045898438, "learning_rate": 0.0004800481505523836, "loss": 47.5845, "step": 2507 }, { "epoch": 6.62396830637174, "grad_norm": 3338.103515625, "learning_rate": 0.0004800317932647915, "loss": 41.8674, "step": 2508 }, { "epoch": 6.626609442060086, "grad_norm": 1125.5809326171875, "learning_rate": 0.00048001542955369657, "loss": 44.0821, "step": 2509 }, { "epoch": 6.629250577748432, "grad_norm": 950.8338012695312, "learning_rate": 0.000479999059419556, "loss": 47.9367, "step": 2510 }, { "epoch": 6.631891713436778, "grad_norm": 4548.43505859375, "learning_rate": 0.00047998268286282683, "loss": 51.713, "step": 2511 }, { "epoch": 6.634532849125124, "grad_norm": 994.5459594726562, "learning_rate": 0.0004799662998839663, "loss": 53.7319, "step": 2512 }, { "epoch": 6.63717398481347, "grad_norm": 1345.832275390625, "learning_rate": 0.0004799499104834319, "loss": 53.1509, "step": 2513 }, { "epoch": 6.639815120501816, "grad_norm": 1160.4202880859375, "learning_rate": 0.0004799335146616813, "loss": 51.558, "step": 2514 }, { "epoch": 6.642456256190162, "grad_norm": 1356.1640625, "learning_rate": 0.00047991711241917245, "loss": 50.6149, "step": 2515 }, { "epoch": 6.645097391878508, "grad_norm": 938.2459106445312, "learning_rate": 0.0004799007037563633, "loss": 49.0908, "step": 2516 }, { "epoch": 6.647738527566854, "grad_norm": 639.3724365234375, "learning_rate": 0.000479884288673712, "loss": 45.1041, "step": 2517 }, { "epoch": 6.650379663255199, "grad_norm": 629.6267700195312, "learning_rate": 0.000479867867171677, "loss": 42.8055, "step": 2518 }, { "epoch": 6.653020798943546, "grad_norm": 1166.895263671875, "learning_rate": 0.00047985143925071685, "loss": 41.1311, "step": 2519 }, { "epoch": 6.655661934631892, "grad_norm": 1278.84765625, "learning_rate": 0.0004798350049112903, "loss": 42.6446, "step": 2520 }, { "epoch": 6.658303070320238, "grad_norm": 976.27685546875, "learning_rate": 0.0004798185641538562, "loss": 44.3114, "step": 2521 }, { "epoch": 6.660944206008583, "grad_norm": 4724.3759765625, "learning_rate": 0.00047980211697887373, "loss": 49.39, "step": 2522 }, { "epoch": 6.66358534169693, "grad_norm": 1283.169921875, "learning_rate": 0.00047978566338680213, "loss": 54.3049, "step": 2523 }, { "epoch": 6.666226477385275, "grad_norm": 825.3270263671875, "learning_rate": 0.0004797692033781008, "loss": 50.225, "step": 2524 }, { "epoch": 6.668867613073622, "grad_norm": 641.312255859375, "learning_rate": 0.0004797527369532295, "loss": 44.756, "step": 2525 }, { "epoch": 6.671508748761967, "grad_norm": 710.7972412109375, "learning_rate": 0.000479736264112648, "loss": 45.8149, "step": 2526 }, { "epoch": 6.674149884450314, "grad_norm": 690.6287841796875, "learning_rate": 0.00047971978485681634, "loss": 47.4944, "step": 2527 }, { "epoch": 6.676791020138659, "grad_norm": 728.8192749023438, "learning_rate": 0.0004797032991861945, "loss": 51.6606, "step": 2528 }, { "epoch": 6.679432155827006, "grad_norm": 787.7779541015625, "learning_rate": 0.000479686807101243, "loss": 50.9127, "step": 2529 }, { "epoch": 6.682073291515351, "grad_norm": 582.42138671875, "learning_rate": 0.0004796703086024224, "loss": 47.9984, "step": 2530 }, { "epoch": 6.684714427203698, "grad_norm": 750.7901000976562, "learning_rate": 0.0004796538036901933, "loss": 45.8306, "step": 2531 }, { "epoch": 6.687355562892043, "grad_norm": 814.6594848632812, "learning_rate": 0.0004796372923650166, "loss": 45.7325, "step": 2532 }, { "epoch": 6.68999669858039, "grad_norm": 686.5460205078125, "learning_rate": 0.0004796207746273535, "loss": 46.0277, "step": 2533 }, { "epoch": 6.692637834268735, "grad_norm": 751.3591918945312, "learning_rate": 0.0004796042504776651, "loss": 42.3863, "step": 2534 }, { "epoch": 6.695278969957082, "grad_norm": 479.0381774902344, "learning_rate": 0.0004795877199164129, "loss": 42.4516, "step": 2535 }, { "epoch": 6.697920105645427, "grad_norm": 474.97216796875, "learning_rate": 0.00047957118294405843, "loss": 39.3426, "step": 2536 }, { "epoch": 6.700561241333774, "grad_norm": 618.6951904296875, "learning_rate": 0.0004795546395610636, "loss": 40.6485, "step": 2537 }, { "epoch": 6.703202377022119, "grad_norm": 511.5299377441406, "learning_rate": 0.00047953808976789033, "loss": 40.2032, "step": 2538 }, { "epoch": 6.705843512710466, "grad_norm": 462.8622741699219, "learning_rate": 0.0004795215335650007, "loss": 41.1065, "step": 2539 }, { "epoch": 6.708484648398811, "grad_norm": 397.24127197265625, "learning_rate": 0.0004795049709528571, "loss": 39.8931, "step": 2540 }, { "epoch": 6.711125784087157, "grad_norm": 731.2764892578125, "learning_rate": 0.00047948840193192196, "loss": 39.1879, "step": 2541 }, { "epoch": 6.713766919775503, "grad_norm": 532.0653686523438, "learning_rate": 0.00047947182650265796, "loss": 40.2453, "step": 2542 }, { "epoch": 6.71640805546385, "grad_norm": 494.18701171875, "learning_rate": 0.00047945524466552807, "loss": 40.1101, "step": 2543 }, { "epoch": 6.7190491911521955, "grad_norm": 1307.774658203125, "learning_rate": 0.00047943865642099525, "loss": 47.0884, "step": 2544 }, { "epoch": 6.721690326840541, "grad_norm": 3435.43017578125, "learning_rate": 0.00047942206176952273, "loss": 78.4816, "step": 2545 }, { "epoch": 6.7243314625288875, "grad_norm": 3671.157958984375, "learning_rate": 0.000479405460711574, "loss": 103.938, "step": 2546 }, { "epoch": 6.726972598217233, "grad_norm": 3305.8642578125, "learning_rate": 0.0004793888532476124, "loss": 84.1707, "step": 2547 }, { "epoch": 6.7296137339055795, "grad_norm": 4754.70751953125, "learning_rate": 0.0004793722393781019, "loss": 74.7136, "step": 2548 }, { "epoch": 6.732254869593925, "grad_norm": 4435.138671875, "learning_rate": 0.00047935561910350636, "loss": 60.9608, "step": 2549 }, { "epoch": 6.7348960052822715, "grad_norm": 5481.64013671875, "learning_rate": 0.0004793389924242898, "loss": 47.2883, "step": 2550 }, { "epoch": 6.737537140970617, "grad_norm": 2052.166748046875, "learning_rate": 0.00047932235934091674, "loss": 45.417, "step": 2551 }, { "epoch": 6.7401782766589635, "grad_norm": 2998.51220703125, "learning_rate": 0.0004793057198538514, "loss": 41.1958, "step": 2552 }, { "epoch": 6.742819412347309, "grad_norm": 2203.02490234375, "learning_rate": 0.0004792890739635585, "loss": 30.9693, "step": 2553 }, { "epoch": 6.7454605480356555, "grad_norm": 2078.239013671875, "learning_rate": 0.00047927242167050303, "loss": 39.546, "step": 2554 }, { "epoch": 6.748101683724001, "grad_norm": 1352.4052734375, "learning_rate": 0.00047925576297514974, "loss": 87.3227, "step": 2555 }, { "epoch": 6.7507428194123476, "grad_norm": 1408.924560546875, "learning_rate": 0.000479239097877964, "loss": 88.6905, "step": 2556 }, { "epoch": 6.753383955100693, "grad_norm": 1366.0162353515625, "learning_rate": 0.0004792224263794112, "loss": 76.0994, "step": 2557 }, { "epoch": 6.75602509078904, "grad_norm": 784.2796630859375, "learning_rate": 0.00047920574847995663, "loss": 57.1047, "step": 2558 }, { "epoch": 6.758666226477385, "grad_norm": 503.1458740234375, "learning_rate": 0.00047918906418006625, "loss": 46.1888, "step": 2559 }, { "epoch": 6.761307362165732, "grad_norm": 485.1630554199219, "learning_rate": 0.00047917237348020595, "loss": 41.8018, "step": 2560 }, { "epoch": 6.763948497854077, "grad_norm": 640.5099487304688, "learning_rate": 0.0004791556763808416, "loss": 42.639, "step": 2561 }, { "epoch": 6.766589633542424, "grad_norm": 688.1859741210938, "learning_rate": 0.0004791389728824397, "loss": 42.9332, "step": 2562 }, { "epoch": 6.769230769230769, "grad_norm": 554.978759765625, "learning_rate": 0.0004791222629854666, "loss": 43.854, "step": 2563 }, { "epoch": 6.771871904919115, "grad_norm": 657.1712646484375, "learning_rate": 0.0004791055466903889, "loss": 48.4555, "step": 2564 }, { "epoch": 6.774513040607461, "grad_norm": 922.487548828125, "learning_rate": 0.0004790888239976733, "loss": 48.783, "step": 2565 }, { "epoch": 6.777154176295808, "grad_norm": 646.6986083984375, "learning_rate": 0.0004790720949077869, "loss": 45.8369, "step": 2566 }, { "epoch": 6.779795311984153, "grad_norm": 583.21923828125, "learning_rate": 0.0004790553594211969, "loss": 43.0843, "step": 2567 }, { "epoch": 6.782436447672499, "grad_norm": 517.5355224609375, "learning_rate": 0.0004790386175383704, "loss": 39.5182, "step": 2568 }, { "epoch": 6.785077583360845, "grad_norm": 708.8458251953125, "learning_rate": 0.0004790218692597752, "loss": 42.179, "step": 2569 }, { "epoch": 6.787718719049191, "grad_norm": 1136.61767578125, "learning_rate": 0.0004790051145858788, "loss": 42.0841, "step": 2570 }, { "epoch": 6.790359854737537, "grad_norm": 914.6806030273438, "learning_rate": 0.000478988353517149, "loss": 46.4148, "step": 2571 }, { "epoch": 6.793000990425883, "grad_norm": 926.322509765625, "learning_rate": 0.000478971586054054, "loss": 49.8201, "step": 2572 }, { "epoch": 6.795642126114229, "grad_norm": 384.1777038574219, "learning_rate": 0.000478954812197062, "loss": 48.7688, "step": 2573 }, { "epoch": 6.798283261802575, "grad_norm": 646.2730102539062, "learning_rate": 0.00047893803194664134, "loss": 45.3222, "step": 2574 }, { "epoch": 6.800924397490921, "grad_norm": 572.1664428710938, "learning_rate": 0.00047892124530326066, "loss": 48.3286, "step": 2575 }, { "epoch": 6.803565533179267, "grad_norm": 535.7612915039062, "learning_rate": 0.00047890445226738863, "loss": 49.7991, "step": 2576 }, { "epoch": 6.806206668867613, "grad_norm": 704.3690795898438, "learning_rate": 0.0004788876528394942, "loss": 49.7292, "step": 2577 }, { "epoch": 6.808847804555959, "grad_norm": 1492.0540771484375, "learning_rate": 0.00047887084702004655, "loss": 49.0913, "step": 2578 }, { "epoch": 6.811488940244305, "grad_norm": 845.1064453125, "learning_rate": 0.000478854034809515, "loss": 49.8377, "step": 2579 }, { "epoch": 6.814130075932651, "grad_norm": 563.0088500976562, "learning_rate": 0.00047883721620836894, "loss": 49.369, "step": 2580 }, { "epoch": 6.816771211620997, "grad_norm": 603.5857543945312, "learning_rate": 0.000478820391217078, "loss": 47.7355, "step": 2581 }, { "epoch": 6.819412347309343, "grad_norm": 586.3301391601562, "learning_rate": 0.0004788035598361121, "loss": 47.5429, "step": 2582 }, { "epoch": 6.822053482997689, "grad_norm": 555.6299438476562, "learning_rate": 0.0004787867220659412, "loss": 47.2763, "step": 2583 }, { "epoch": 6.824694618686035, "grad_norm": 441.5813293457031, "learning_rate": 0.0004787698779070354, "loss": 47.5029, "step": 2584 }, { "epoch": 6.827335754374381, "grad_norm": 840.0755004882812, "learning_rate": 0.00047875302735986524, "loss": 47.1147, "step": 2585 }, { "epoch": 6.829976890062727, "grad_norm": 470.1798400878906, "learning_rate": 0.00047873617042490114, "loss": 42.5287, "step": 2586 }, { "epoch": 6.8326180257510725, "grad_norm": 389.8511657714844, "learning_rate": 0.00047871930710261386, "loss": 39.852, "step": 2587 }, { "epoch": 6.835259161439419, "grad_norm": 898.2392578125, "learning_rate": 0.0004787024373934743, "loss": 42.2617, "step": 2588 }, { "epoch": 6.837900297127765, "grad_norm": 354.42974853515625, "learning_rate": 0.0004786855612979535, "loss": 39.486, "step": 2589 }, { "epoch": 6.840541432816111, "grad_norm": 550.1604614257812, "learning_rate": 0.0004786686788165227, "loss": 39.9605, "step": 2590 }, { "epoch": 6.8431825685044565, "grad_norm": 320.0852355957031, "learning_rate": 0.0004786517899496534, "loss": 40.4157, "step": 2591 }, { "epoch": 6.845823704192803, "grad_norm": 324.0950622558594, "learning_rate": 0.00047863489469781716, "loss": 41.5892, "step": 2592 }, { "epoch": 6.8484648398811485, "grad_norm": 379.6390380859375, "learning_rate": 0.00047861799306148583, "loss": 40.523, "step": 2593 }, { "epoch": 6.851105975569495, "grad_norm": 2616.722900390625, "learning_rate": 0.00047860108504113136, "loss": 62.8488, "step": 2594 }, { "epoch": 6.8537471112578405, "grad_norm": 10497.0771484375, "learning_rate": 0.0004785841706372258, "loss": 92.8246, "step": 2595 }, { "epoch": 6.856388246946187, "grad_norm": 9865.1630859375, "learning_rate": 0.0004785672498502416, "loss": 93.9775, "step": 2596 }, { "epoch": 6.8590293826345325, "grad_norm": 7582.673828125, "learning_rate": 0.00047855032268065125, "loss": 96.0573, "step": 2597 }, { "epoch": 6.861670518322879, "grad_norm": 13434.8349609375, "learning_rate": 0.0004785333891289274, "loss": 91.505, "step": 2598 }, { "epoch": 6.864311654011225, "grad_norm": 6865.19873046875, "learning_rate": 0.0004785164491955428, "loss": 98.3208, "step": 2599 }, { "epoch": 6.866952789699571, "grad_norm": 4766.650390625, "learning_rate": 0.00047849950288097066, "loss": 85.549, "step": 2600 }, { "epoch": 6.866952789699571, "eval_loss": 6.150360584259033, "eval_runtime": 2.0678, "eval_samples_per_second": 239.384, "eval_steps_per_second": 29.983, "step": 2600 }, { "epoch": 6.869593925387917, "grad_norm": 3216.043212890625, "learning_rate": 0.0004784825501856842, "loss": 78.0074, "step": 2601 }, { "epoch": 6.872235061076263, "grad_norm": 5937.2861328125, "learning_rate": 0.00047846559111015667, "loss": 61.7906, "step": 2602 }, { "epoch": 6.874876196764609, "grad_norm": 1984.984375, "learning_rate": 0.0004784486256548617, "loss": 49.3055, "step": 2603 }, { "epoch": 6.877517332452955, "grad_norm": 1962.5032958984375, "learning_rate": 0.00047843165382027307, "loss": 44.2482, "step": 2604 }, { "epoch": 6.880158468141301, "grad_norm": 1055.9669189453125, "learning_rate": 0.0004784146756068647, "loss": 54.0373, "step": 2605 }, { "epoch": 6.882799603829647, "grad_norm": 1015.0669555664062, "learning_rate": 0.0004783976910151107, "loss": 54.614, "step": 2606 }, { "epoch": 6.885440739517993, "grad_norm": 635.96337890625, "learning_rate": 0.0004783807000454854, "loss": 49.556, "step": 2607 }, { "epoch": 6.888081875206339, "grad_norm": 478.1396789550781, "learning_rate": 0.00047836370269846315, "loss": 45.8113, "step": 2608 }, { "epoch": 6.890723010894685, "grad_norm": 414.12554931640625, "learning_rate": 0.00047834669897451866, "loss": 41.9832, "step": 2609 }, { "epoch": 6.89336414658303, "grad_norm": 973.1522216796875, "learning_rate": 0.0004783296888741267, "loss": 41.6715, "step": 2610 }, { "epoch": 6.896005282271377, "grad_norm": 1166.4315185546875, "learning_rate": 0.0004783126723977623, "loss": 40.9235, "step": 2611 }, { "epoch": 6.898646417959723, "grad_norm": 370.7779846191406, "learning_rate": 0.00047829564954590074, "loss": 40.3246, "step": 2612 }, { "epoch": 6.901287553648069, "grad_norm": 442.9255065917969, "learning_rate": 0.00047827862031901713, "loss": 41.637, "step": 2613 }, { "epoch": 6.903928689336414, "grad_norm": 476.917724609375, "learning_rate": 0.0004782615847175873, "loss": 41.5695, "step": 2614 }, { "epoch": 6.906569825024761, "grad_norm": 602.3760986328125, "learning_rate": 0.00047824454274208674, "loss": 40.2578, "step": 2615 }, { "epoch": 6.909210960713106, "grad_norm": 744.6712036132812, "learning_rate": 0.0004782274943929914, "loss": 40.6872, "step": 2616 }, { "epoch": 6.911852096401453, "grad_norm": 494.33038330078125, "learning_rate": 0.00047821043967077734, "loss": 42.868, "step": 2617 }, { "epoch": 6.914493232089798, "grad_norm": 604.6388549804688, "learning_rate": 0.00047819337857592085, "loss": 39.6034, "step": 2618 }, { "epoch": 6.917134367778145, "grad_norm": 997.2782592773438, "learning_rate": 0.00047817631110889827, "loss": 42.052, "step": 2619 }, { "epoch": 6.91977550346649, "grad_norm": 1068.7322998046875, "learning_rate": 0.00047815923727018617, "loss": 43.8889, "step": 2620 }, { "epoch": 6.922416639154837, "grad_norm": 528.17578125, "learning_rate": 0.00047814215706026153, "loss": 46.083, "step": 2621 }, { "epoch": 6.925057774843182, "grad_norm": 1192.98779296875, "learning_rate": 0.0004781250704796011, "loss": 47.3932, "step": 2622 }, { "epoch": 6.927698910531529, "grad_norm": 246.7221221923828, "learning_rate": 0.00047810797752868205, "loss": 43.8456, "step": 2623 }, { "epoch": 6.930340046219874, "grad_norm": 177.2631072998047, "learning_rate": 0.0004780908782079818, "loss": 43.1311, "step": 2624 }, { "epoch": 6.932981181908221, "grad_norm": 377.6715393066406, "learning_rate": 0.00047807377251797773, "loss": 50.6451, "step": 2625 }, { "epoch": 6.935622317596566, "grad_norm": 355.82843017578125, "learning_rate": 0.00047805666045914753, "loss": 46.3333, "step": 2626 }, { "epoch": 6.938263453284913, "grad_norm": 299.2281494140625, "learning_rate": 0.0004780395420319691, "loss": 44.0925, "step": 2627 }, { "epoch": 6.940904588973258, "grad_norm": 497.8837585449219, "learning_rate": 0.0004780224172369204, "loss": 43.7856, "step": 2628 }, { "epoch": 6.943545724661605, "grad_norm": 334.92169189453125, "learning_rate": 0.00047800528607447963, "loss": 42.681, "step": 2629 }, { "epoch": 6.94618686034995, "grad_norm": 1556.3485107421875, "learning_rate": 0.0004779881485451252, "loss": 40.6506, "step": 2630 }, { "epoch": 6.948827996038297, "grad_norm": 299.22467041015625, "learning_rate": 0.00047797100464933565, "loss": 42.1542, "step": 2631 }, { "epoch": 6.951469131726642, "grad_norm": 183.6276092529297, "learning_rate": 0.00047795385438758963, "loss": 40.1825, "step": 2632 }, { "epoch": 6.954110267414989, "grad_norm": 253.34664916992188, "learning_rate": 0.0004779366977603663, "loss": 40.0662, "step": 2633 }, { "epoch": 6.956751403103334, "grad_norm": 340.2393493652344, "learning_rate": 0.0004779195347681444, "loss": 42.1842, "step": 2634 }, { "epoch": 6.959392538791681, "grad_norm": 2449.440185546875, "learning_rate": 0.00047790236541140355, "loss": 80.2494, "step": 2635 }, { "epoch": 6.962033674480026, "grad_norm": 6153.90673828125, "learning_rate": 0.0004778851896906229, "loss": 88.0691, "step": 2636 }, { "epoch": 6.964674810168372, "grad_norm": 3187.2939453125, "learning_rate": 0.00047786800760628225, "loss": 80.6738, "step": 2637 }, { "epoch": 6.967315945856718, "grad_norm": 13575.0185546875, "learning_rate": 0.0004778508191588613, "loss": 83.0937, "step": 2638 }, { "epoch": 6.969957081545064, "grad_norm": 2408.589599609375, "learning_rate": 0.00047783362434884014, "loss": 71.782, "step": 2639 }, { "epoch": 6.9725982172334104, "grad_norm": 4763.5166015625, "learning_rate": 0.0004778164231766988, "loss": 46.5246, "step": 2640 }, { "epoch": 6.975239352921756, "grad_norm": 335.12969970703125, "learning_rate": 0.00047779921564291774, "loss": 46.0342, "step": 2641 }, { "epoch": 6.9778804886101025, "grad_norm": 419.5324401855469, "learning_rate": 0.0004777820017479773, "loss": 46.1712, "step": 2642 }, { "epoch": 6.980521624298448, "grad_norm": 283.46630859375, "learning_rate": 0.00047776478149235834, "loss": 42.7204, "step": 2643 }, { "epoch": 6.9831627599867945, "grad_norm": 412.14862060546875, "learning_rate": 0.00047774755487654157, "loss": 39.6329, "step": 2644 }, { "epoch": 6.98580389567514, "grad_norm": 265.70916748046875, "learning_rate": 0.0004777303219010082, "loss": 39.9936, "step": 2645 }, { "epoch": 6.9884450313634865, "grad_norm": 298.87213134765625, "learning_rate": 0.0004777130825662394, "loss": 39.064, "step": 2646 }, { "epoch": 6.991086167051832, "grad_norm": 225.20822143554688, "learning_rate": 0.00047769583687271646, "loss": 36.8679, "step": 2647 }, { "epoch": 6.9937273027401785, "grad_norm": 348.94866943359375, "learning_rate": 0.00047767858482092095, "loss": 39.7074, "step": 2648 }, { "epoch": 6.996368438428524, "grad_norm": 566.8523559570312, "learning_rate": 0.00047766132641133486, "loss": 42.7466, "step": 2649 }, { "epoch": 6.9990095741168705, "grad_norm": 494.343505859375, "learning_rate": 0.00047764406164443993, "loss": 42.8116, "step": 2650 }, { "epoch": 7.001650709805216, "grad_norm": 4011.9169921875, "learning_rate": 0.0004776267905207182, "loss": 47.7242, "step": 2651 }, { "epoch": 7.0042918454935625, "grad_norm": 299.0360412597656, "learning_rate": 0.0004776095130406522, "loss": 42.6034, "step": 2652 }, { "epoch": 7.006932981181908, "grad_norm": 373.0767822265625, "learning_rate": 0.0004775922292047242, "loss": 43.5809, "step": 2653 }, { "epoch": 7.009574116870255, "grad_norm": 260.2775573730469, "learning_rate": 0.00047757493901341686, "loss": 45.7865, "step": 2654 }, { "epoch": 7.0122152525586, "grad_norm": 423.5856018066406, "learning_rate": 0.0004775576424672131, "loss": 50.9852, "step": 2655 }, { "epoch": 7.014856388246947, "grad_norm": 346.03570556640625, "learning_rate": 0.0004775403395665958, "loss": 49.0573, "step": 2656 }, { "epoch": 7.017497523935292, "grad_norm": 229.99337768554688, "learning_rate": 0.0004775230303120482, "loss": 44.4354, "step": 2657 }, { "epoch": 7.020138659623639, "grad_norm": 302.3257141113281, "learning_rate": 0.00047750571470405356, "loss": 46.9447, "step": 2658 }, { "epoch": 7.022779795311984, "grad_norm": 295.80950927734375, "learning_rate": 0.00047748839274309557, "loss": 47.6329, "step": 2659 }, { "epoch": 7.02542093100033, "grad_norm": 305.4395446777344, "learning_rate": 0.0004774710644296578, "loss": 44.9436, "step": 2660 }, { "epoch": 7.028062066688676, "grad_norm": 354.17901611328125, "learning_rate": 0.0004774537297642242, "loss": 47.2263, "step": 2661 }, { "epoch": 7.030703202377022, "grad_norm": 369.8688049316406, "learning_rate": 0.00047743638874727886, "loss": 44.5086, "step": 2662 }, { "epoch": 7.033344338065368, "grad_norm": 204.46820068359375, "learning_rate": 0.00047741904137930594, "loss": 43.9715, "step": 2663 }, { "epoch": 7.035985473753714, "grad_norm": 394.8818054199219, "learning_rate": 0.0004774016876607898, "loss": 41.9745, "step": 2664 }, { "epoch": 7.03862660944206, "grad_norm": 266.4621276855469, "learning_rate": 0.0004773843275922152, "loss": 41.3027, "step": 2665 }, { "epoch": 7.041267745130406, "grad_norm": 226.98362731933594, "learning_rate": 0.00047736696117406683, "loss": 42.022, "step": 2666 }, { "epoch": 7.043908880818752, "grad_norm": 173.4537353515625, "learning_rate": 0.0004773495884068296, "loss": 39.4882, "step": 2667 }, { "epoch": 7.046550016507098, "grad_norm": 188.98739624023438, "learning_rate": 0.00047733220929098864, "loss": 39.1169, "step": 2668 }, { "epoch": 7.049191152195444, "grad_norm": 221.00289916992188, "learning_rate": 0.00047731482382702933, "loss": 40.6979, "step": 2669 }, { "epoch": 7.05183228788379, "grad_norm": 400.5524597167969, "learning_rate": 0.00047729743201543705, "loss": 40.6878, "step": 2670 }, { "epoch": 7.054473423572136, "grad_norm": 283.92315673828125, "learning_rate": 0.00047728003385669757, "loss": 40.1232, "step": 2671 }, { "epoch": 7.057114559260482, "grad_norm": 282.2577819824219, "learning_rate": 0.0004772626293512966, "loss": 40.9866, "step": 2672 }, { "epoch": 7.059755694948828, "grad_norm": 3106.540771484375, "learning_rate": 0.00047724521849972027, "loss": 66.3202, "step": 2673 }, { "epoch": 7.062396830637174, "grad_norm": 3522.85009765625, "learning_rate": 0.00047722780130245455, "loss": 69.6717, "step": 2674 }, { "epoch": 7.06503796632552, "grad_norm": 8179.203125, "learning_rate": 0.0004772103777599861, "loss": 74.148, "step": 2675 }, { "epoch": 7.067679102013866, "grad_norm": 4091.35595703125, "learning_rate": 0.00047719294787280133, "loss": 59.3736, "step": 2676 }, { "epoch": 7.070320237702212, "grad_norm": 7924.900390625, "learning_rate": 0.00047717551164138685, "loss": 63.1007, "step": 2677 }, { "epoch": 7.072961373390558, "grad_norm": 3420.4052734375, "learning_rate": 0.0004771580690662297, "loss": 49.8994, "step": 2678 }, { "epoch": 7.075602509078904, "grad_norm": 3115.3330078125, "learning_rate": 0.00047714062014781697, "loss": 40.6028, "step": 2679 }, { "epoch": 7.07824364476725, "grad_norm": 2165.186767578125, "learning_rate": 0.00047712316488663574, "loss": 30.4321, "step": 2680 }, { "epoch": 7.080884780455596, "grad_norm": 2099.293701171875, "learning_rate": 0.0004771057032831737, "loss": 31.4531, "step": 2681 }, { "epoch": 7.083525916143942, "grad_norm": 4052.2998046875, "learning_rate": 0.00047708823533791823, "loss": 31.0675, "step": 2682 }, { "epoch": 7.0861670518322875, "grad_norm": 1862.707763671875, "learning_rate": 0.00047707076105135715, "loss": 64.5195, "step": 2683 }, { "epoch": 7.088808187520634, "grad_norm": 1211.5181884765625, "learning_rate": 0.00047705328042397843, "loss": 61.772, "step": 2684 }, { "epoch": 7.0914493232089795, "grad_norm": 832.0191650390625, "learning_rate": 0.00047703579345627036, "loss": 50.1552, "step": 2685 }, { "epoch": 7.094090458897326, "grad_norm": 493.63238525390625, "learning_rate": 0.00047701830014872104, "loss": 45.882, "step": 2686 }, { "epoch": 7.0967315945856715, "grad_norm": 339.0014343261719, "learning_rate": 0.00047700080050181907, "loss": 43.1797, "step": 2687 }, { "epoch": 7.099372730274018, "grad_norm": 252.35067749023438, "learning_rate": 0.00047698329451605304, "loss": 42.1041, "step": 2688 }, { "epoch": 7.1020138659623635, "grad_norm": 298.3693542480469, "learning_rate": 0.00047696578219191187, "loss": 39.9883, "step": 2689 }, { "epoch": 7.10465500165071, "grad_norm": 376.7059631347656, "learning_rate": 0.00047694826352988464, "loss": 41.2614, "step": 2690 }, { "epoch": 7.1072961373390555, "grad_norm": 279.6436767578125, "learning_rate": 0.00047693073853046045, "loss": 40.1928, "step": 2691 }, { "epoch": 7.109937273027402, "grad_norm": 422.6330261230469, "learning_rate": 0.0004769132071941286, "loss": 38.4597, "step": 2692 }, { "epoch": 7.1125784087157475, "grad_norm": 405.1978759765625, "learning_rate": 0.00047689566952137873, "loss": 42.6122, "step": 2693 }, { "epoch": 7.115219544404094, "grad_norm": 457.3885192871094, "learning_rate": 0.0004768781255127007, "loss": 39.5292, "step": 2694 }, { "epoch": 7.1178606800924396, "grad_norm": 299.83123779296875, "learning_rate": 0.00047686057516858416, "loss": 41.8618, "step": 2695 }, { "epoch": 7.120501815780786, "grad_norm": 267.08062744140625, "learning_rate": 0.0004768430184895194, "loss": 40.3067, "step": 2696 }, { "epoch": 7.123142951469132, "grad_norm": 268.8684387207031, "learning_rate": 0.0004768254554759966, "loss": 42.1662, "step": 2697 }, { "epoch": 7.125784087157478, "grad_norm": 273.91668701171875, "learning_rate": 0.0004768078861285062, "loss": 42.7774, "step": 2698 }, { "epoch": 7.128425222845824, "grad_norm": 403.90643310546875, "learning_rate": 0.00047679031044753876, "loss": 43.1621, "step": 2699 }, { "epoch": 7.13106635853417, "grad_norm": 1291.0689697265625, "learning_rate": 0.0004767727284335852, "loss": 44.7442, "step": 2700 }, { "epoch": 7.133707494222516, "grad_norm": 4094.1748046875, "learning_rate": 0.0004767551400871364, "loss": 47.3736, "step": 2701 }, { "epoch": 7.136348629910862, "grad_norm": 240.3263702392578, "learning_rate": 0.00047673754540868347, "loss": 45.9654, "step": 2702 }, { "epoch": 7.138989765599208, "grad_norm": 254.40013122558594, "learning_rate": 0.00047671994439871777, "loss": 43.4946, "step": 2703 }, { "epoch": 7.141630901287554, "grad_norm": 272.60107421875, "learning_rate": 0.00047670233705773086, "loss": 45.8633, "step": 2704 }, { "epoch": 7.1442720369759, "grad_norm": 223.7222900390625, "learning_rate": 0.0004766847233862144, "loss": 46.2498, "step": 2705 }, { "epoch": 7.146913172664245, "grad_norm": 299.0388488769531, "learning_rate": 0.00047666710338466005, "loss": 49.0246, "step": 2706 }, { "epoch": 7.149554308352592, "grad_norm": 226.21250915527344, "learning_rate": 0.0004766494770535601, "loss": 49.4387, "step": 2707 }, { "epoch": 7.152195444040937, "grad_norm": 241.80845642089844, "learning_rate": 0.00047663184439340663, "loss": 46.6173, "step": 2708 }, { "epoch": 7.154836579729284, "grad_norm": 224.1820068359375, "learning_rate": 0.00047661420540469205, "loss": 45.1355, "step": 2709 }, { "epoch": 7.157477715417629, "grad_norm": 207.310302734375, "learning_rate": 0.00047659656008790886, "loss": 45.9545, "step": 2710 }, { "epoch": 7.160118851105976, "grad_norm": 163.61737060546875, "learning_rate": 0.0004765789084435499, "loss": 43.3507, "step": 2711 }, { "epoch": 7.162759986794321, "grad_norm": 232.382080078125, "learning_rate": 0.00047656125047210797, "loss": 43.3033, "step": 2712 }, { "epoch": 7.165401122482668, "grad_norm": 201.4738006591797, "learning_rate": 0.00047654358617407625, "loss": 45.3597, "step": 2713 }, { "epoch": 7.168042258171013, "grad_norm": 292.84783935546875, "learning_rate": 0.00047652591554994786, "loss": 40.8066, "step": 2714 }, { "epoch": 7.17068339385936, "grad_norm": 177.78033447265625, "learning_rate": 0.0004765082386002164, "loss": 40.8268, "step": 2715 }, { "epoch": 7.173324529547705, "grad_norm": 482.7901916503906, "learning_rate": 0.00047649055532537557, "loss": 42.9739, "step": 2716 }, { "epoch": 7.175965665236052, "grad_norm": 368.4410095214844, "learning_rate": 0.0004764728657259189, "loss": 41.428, "step": 2717 }, { "epoch": 7.178606800924397, "grad_norm": 241.18153381347656, "learning_rate": 0.0004764551698023405, "loss": 40.9182, "step": 2718 }, { "epoch": 7.181247936612744, "grad_norm": 250.7117919921875, "learning_rate": 0.0004764374675551345, "loss": 40.7033, "step": 2719 }, { "epoch": 7.183889072301089, "grad_norm": 722.8627319335938, "learning_rate": 0.00047641975898479527, "loss": 39.7633, "step": 2720 }, { "epoch": 7.186530207989436, "grad_norm": 313.0584411621094, "learning_rate": 0.0004764020440918172, "loss": 39.3582, "step": 2721 }, { "epoch": 7.189171343677781, "grad_norm": 358.1639709472656, "learning_rate": 0.00047638432287669516, "loss": 42.3436, "step": 2722 }, { "epoch": 7.191812479366128, "grad_norm": 2577.765625, "learning_rate": 0.00047636659533992383, "loss": 69.2747, "step": 2723 }, { "epoch": 7.194453615054473, "grad_norm": 2594.705810546875, "learning_rate": 0.0004763488614819983, "loss": 124.0585, "step": 2724 }, { "epoch": 7.19709475074282, "grad_norm": 5585.48046875, "learning_rate": 0.00047633112130341385, "loss": 139.8992, "step": 2725 }, { "epoch": 7.199735886431165, "grad_norm": 3590.64990234375, "learning_rate": 0.0004763133748046656, "loss": 127.3607, "step": 2726 }, { "epoch": 7.202377022119512, "grad_norm": 2544.7802734375, "learning_rate": 0.00047629562198624946, "loss": 106.6256, "step": 2727 }, { "epoch": 7.205018157807857, "grad_norm": 2732.61474609375, "learning_rate": 0.00047627786284866106, "loss": 90.9133, "step": 2728 }, { "epoch": 7.207659293496203, "grad_norm": 2520.992919921875, "learning_rate": 0.0004762600973923961, "loss": 91.201, "step": 2729 }, { "epoch": 7.210300429184549, "grad_norm": 1934.0462646484375, "learning_rate": 0.00047624232561795094, "loss": 62.9677, "step": 2730 }, { "epoch": 7.212941564872895, "grad_norm": 1424.1209716796875, "learning_rate": 0.00047622454752582175, "loss": 49.0481, "step": 2731 }, { "epoch": 7.215582700561241, "grad_norm": 1097.9146728515625, "learning_rate": 0.0004762067631165049, "loss": 38.7455, "step": 2732 }, { "epoch": 7.218223836249587, "grad_norm": 1878.102783203125, "learning_rate": 0.00047618897239049706, "loss": 74.7223, "step": 2733 }, { "epoch": 7.220864971937933, "grad_norm": 1341.471435546875, "learning_rate": 0.00047617117534829513, "loss": 69.9964, "step": 2734 }, { "epoch": 7.223506107626279, "grad_norm": 906.912109375, "learning_rate": 0.0004761533719903959, "loss": 61.5829, "step": 2735 }, { "epoch": 7.226147243314625, "grad_norm": 579.8167114257812, "learning_rate": 0.0004761355623172966, "loss": 49.1484, "step": 2736 }, { "epoch": 7.228788379002971, "grad_norm": 589.9385375976562, "learning_rate": 0.00047611774632949465, "loss": 43.8535, "step": 2737 }, { "epoch": 7.2314295146913175, "grad_norm": 352.563720703125, "learning_rate": 0.0004760999240274874, "loss": 44.4512, "step": 2738 }, { "epoch": 7.234070650379663, "grad_norm": 422.77154541015625, "learning_rate": 0.0004760820954117726, "loss": 43.4327, "step": 2739 }, { "epoch": 7.2367117860680095, "grad_norm": 503.5856018066406, "learning_rate": 0.00047606426048284813, "loss": 42.104, "step": 2740 }, { "epoch": 7.239352921756355, "grad_norm": 409.34490966796875, "learning_rate": 0.00047604641924121194, "loss": 40.6705, "step": 2741 }, { "epoch": 7.2419940574447015, "grad_norm": 656.1419677734375, "learning_rate": 0.00047602857168736225, "loss": 42.8728, "step": 2742 }, { "epoch": 7.244635193133047, "grad_norm": 511.09954833984375, "learning_rate": 0.0004760107178217975, "loss": 40.7985, "step": 2743 }, { "epoch": 7.2472763288213935, "grad_norm": 411.43499755859375, "learning_rate": 0.00047599285764501626, "loss": 42.0107, "step": 2744 }, { "epoch": 7.249917464509739, "grad_norm": 436.754638671875, "learning_rate": 0.00047597499115751717, "loss": 41.2067, "step": 2745 }, { "epoch": 7.2525586001980855, "grad_norm": 365.1582336425781, "learning_rate": 0.0004759571183597993, "loss": 39.1562, "step": 2746 }, { "epoch": 7.255199735886431, "grad_norm": 491.01458740234375, "learning_rate": 0.0004759392392523615, "loss": 40.7154, "step": 2747 }, { "epoch": 7.2578408715747775, "grad_norm": 532.8539428710938, "learning_rate": 0.0004759213538357032, "loss": 38.9868, "step": 2748 }, { "epoch": 7.260482007263123, "grad_norm": 806.1033935546875, "learning_rate": 0.00047590346211032387, "loss": 42.1091, "step": 2749 }, { "epoch": 7.2631231429514695, "grad_norm": 870.9827270507812, "learning_rate": 0.000475885564076723, "loss": 44.6181, "step": 2750 }, { "epoch": 7.265764278639815, "grad_norm": 930.36083984375, "learning_rate": 0.00047586765973540047, "loss": 53.634, "step": 2751 }, { "epoch": 7.268405414328161, "grad_norm": 725.174560546875, "learning_rate": 0.00047584974908685627, "loss": 49.0624, "step": 2752 }, { "epoch": 7.271046550016507, "grad_norm": 368.8458251953125, "learning_rate": 0.00047583183213159043, "loss": 42.7024, "step": 2753 }, { "epoch": 7.273687685704853, "grad_norm": 303.6537170410156, "learning_rate": 0.0004758139088701033, "loss": 43.8499, "step": 2754 }, { "epoch": 7.276328821393199, "grad_norm": 368.5811462402344, "learning_rate": 0.00047579597930289555, "loss": 45.2683, "step": 2755 }, { "epoch": 7.278969957081545, "grad_norm": 426.5256042480469, "learning_rate": 0.0004757780434304676, "loss": 47.1664, "step": 2756 }, { "epoch": 7.281611092769891, "grad_norm": 335.000244140625, "learning_rate": 0.0004757601012533205, "loss": 49.1015, "step": 2757 }, { "epoch": 7.284252228458237, "grad_norm": 451.4483947753906, "learning_rate": 0.0004757421527719551, "loss": 46.7867, "step": 2758 }, { "epoch": 7.286893364146583, "grad_norm": 397.7605285644531, "learning_rate": 0.0004757241979868728, "loss": 45.7272, "step": 2759 }, { "epoch": 7.289534499834929, "grad_norm": 397.2437744140625, "learning_rate": 0.00047570623689857476, "loss": 46.8548, "step": 2760 }, { "epoch": 7.292175635523275, "grad_norm": 544.5848388671875, "learning_rate": 0.00047568826950756274, "loss": 44.5882, "step": 2761 }, { "epoch": 7.294816771211621, "grad_norm": 496.7432556152344, "learning_rate": 0.0004756702958143383, "loss": 43.9418, "step": 2762 }, { "epoch": 7.297457906899967, "grad_norm": 326.7575988769531, "learning_rate": 0.0004756523158194034, "loss": 42.999, "step": 2763 }, { "epoch": 7.300099042588313, "grad_norm": 595.3753051757812, "learning_rate": 0.00047563432952326025, "loss": 42.1061, "step": 2764 }, { "epoch": 7.302740178276659, "grad_norm": 328.1588439941406, "learning_rate": 0.00047561633692641086, "loss": 40.9645, "step": 2765 }, { "epoch": 7.305381313965005, "grad_norm": 346.5023193359375, "learning_rate": 0.00047559833802935785, "loss": 39.9636, "step": 2766 }, { "epoch": 7.308022449653351, "grad_norm": 382.62420654296875, "learning_rate": 0.00047558033283260384, "loss": 40.4899, "step": 2767 }, { "epoch": 7.310663585341697, "grad_norm": 407.2397155761719, "learning_rate": 0.00047556232133665147, "loss": 39.6179, "step": 2768 }, { "epoch": 7.313304721030043, "grad_norm": 370.2457275390625, "learning_rate": 0.00047554430354200376, "loss": 39.1817, "step": 2769 }, { "epoch": 7.315945856718389, "grad_norm": 359.1654968261719, "learning_rate": 0.00047552627944916394, "loss": 41.2309, "step": 2770 }, { "epoch": 7.318586992406735, "grad_norm": 766.4949340820312, "learning_rate": 0.00047550824905863524, "loss": 40.0446, "step": 2771 }, { "epoch": 7.321228128095081, "grad_norm": 540.2572631835938, "learning_rate": 0.00047549021237092105, "loss": 42.2347, "step": 2772 }, { "epoch": 7.323869263783427, "grad_norm": 2745.8232421875, "learning_rate": 0.00047547216938652526, "loss": 60.0504, "step": 2773 }, { "epoch": 7.326510399471773, "grad_norm": 7168.1279296875, "learning_rate": 0.0004754541201059515, "loss": 92.6508, "step": 2774 }, { "epoch": 7.329151535160118, "grad_norm": 9654.65625, "learning_rate": 0.00047543606452970393, "loss": 95.8999, "step": 2775 }, { "epoch": 7.331792670848465, "grad_norm": 10631.953125, "learning_rate": 0.0004754180026582867, "loss": 110.4674, "step": 2776 }, { "epoch": 7.33443380653681, "grad_norm": 2914.312744140625, "learning_rate": 0.0004753999344922041, "loss": 121.4997, "step": 2777 }, { "epoch": 7.337074942225157, "grad_norm": 8710.3251953125, "learning_rate": 0.00047538186003196083, "loss": 104.173, "step": 2778 }, { "epoch": 7.3397160779135024, "grad_norm": 8344.7421875, "learning_rate": 0.00047536377927806143, "loss": 94.2815, "step": 2779 }, { "epoch": 7.342357213601849, "grad_norm": 9823.087890625, "learning_rate": 0.00047534569223101086, "loss": 81.1817, "step": 2780 }, { "epoch": 7.3449983492901945, "grad_norm": 19536.1875, "learning_rate": 0.00047532759889131425, "loss": 73.7919, "step": 2781 }, { "epoch": 7.347639484978541, "grad_norm": 3632.77685546875, "learning_rate": 0.00047530949925947684, "loss": 65.0161, "step": 2782 }, { "epoch": 7.3502806206668865, "grad_norm": 3028.8916015625, "learning_rate": 0.00047529139333600404, "loss": 54.7101, "step": 2783 }, { "epoch": 7.352921756355233, "grad_norm": 407.8616027832031, "learning_rate": 0.00047527328112140135, "loss": 45.42, "step": 2784 }, { "epoch": 7.3555628920435785, "grad_norm": 493.0317077636719, "learning_rate": 0.00047525516261617465, "loss": 44.5139, "step": 2785 }, { "epoch": 7.358204027731925, "grad_norm": 430.92059326171875, "learning_rate": 0.00047523703782082985, "loss": 42.3784, "step": 2786 }, { "epoch": 7.3608451634202705, "grad_norm": 551.5164794921875, "learning_rate": 0.0004752189067358731, "loss": 41.2773, "step": 2787 }, { "epoch": 7.363486299108617, "grad_norm": 692.3590698242188, "learning_rate": 0.00047520076936181065, "loss": 41.902, "step": 2788 }, { "epoch": 7.3661274347969625, "grad_norm": 348.2611999511719, "learning_rate": 0.00047518262569914906, "loss": 41.4657, "step": 2789 }, { "epoch": 7.368768570485309, "grad_norm": 692.2703857421875, "learning_rate": 0.0004751644757483948, "loss": 40.8386, "step": 2790 }, { "epoch": 7.3714097061736545, "grad_norm": 901.8109130859375, "learning_rate": 0.00047514631951005494, "loss": 40.0212, "step": 2791 }, { "epoch": 7.374050841862001, "grad_norm": 600.1516723632812, "learning_rate": 0.00047512815698463634, "loss": 40.0461, "step": 2792 }, { "epoch": 7.376691977550347, "grad_norm": 320.00225830078125, "learning_rate": 0.0004751099881726462, "loss": 39.56, "step": 2793 }, { "epoch": 7.379333113238693, "grad_norm": 489.5521545410156, "learning_rate": 0.0004750918130745919, "loss": 39.1008, "step": 2794 }, { "epoch": 7.381974248927039, "grad_norm": 454.1921081542969, "learning_rate": 0.0004750736316909809, "loss": 40.8049, "step": 2795 }, { "epoch": 7.384615384615385, "grad_norm": 501.5694580078125, "learning_rate": 0.0004750554440223209, "loss": 40.2047, "step": 2796 }, { "epoch": 7.387256520303731, "grad_norm": 821.4988403320312, "learning_rate": 0.00047503725006911996, "loss": 39.6371, "step": 2797 }, { "epoch": 7.389897655992076, "grad_norm": 1703.4942626953125, "learning_rate": 0.00047501904983188593, "loss": 41.3298, "step": 2798 }, { "epoch": 7.392538791680423, "grad_norm": 654.0762329101562, "learning_rate": 0.00047500084331112715, "loss": 42.5145, "step": 2799 }, { "epoch": 7.395179927368768, "grad_norm": 1218.98193359375, "learning_rate": 0.000474982630507352, "loss": 47.3447, "step": 2800 }, { "epoch": 7.395179927368768, "eval_loss": 5.7667388916015625, "eval_runtime": 2.2527, "eval_samples_per_second": 219.738, "eval_steps_per_second": 27.523, "step": 2800 }, { "epoch": 7.397821063057115, "grad_norm": 2049.01953125, "learning_rate": 0.000474964411421069, "loss": 44.7118, "step": 2801 }, { "epoch": 7.40046219874546, "grad_norm": 571.55859375, "learning_rate": 0.00047494618605278696, "loss": 44.9474, "step": 2802 }, { "epoch": 7.403103334433807, "grad_norm": 378.2970886230469, "learning_rate": 0.00047492795440301483, "loss": 42.811, "step": 2803 }, { "epoch": 7.405744470122152, "grad_norm": 362.8630676269531, "learning_rate": 0.00047490971647226176, "loss": 43.1771, "step": 2804 }, { "epoch": 7.408385605810499, "grad_norm": 343.76739501953125, "learning_rate": 0.0004748914722610369, "loss": 44.4581, "step": 2805 }, { "epoch": 7.411026741498844, "grad_norm": 349.8832702636719, "learning_rate": 0.0004748732217698499, "loss": 45.866, "step": 2806 }, { "epoch": 7.413667877187191, "grad_norm": 444.8079528808594, "learning_rate": 0.0004748549649992102, "loss": 47.3317, "step": 2807 }, { "epoch": 7.416309012875536, "grad_norm": 459.17578125, "learning_rate": 0.0004748367019496277, "loss": 46.2664, "step": 2808 }, { "epoch": 7.418950148563883, "grad_norm": 307.4496154785156, "learning_rate": 0.00047481843262161226, "loss": 44.5799, "step": 2809 }, { "epoch": 7.421591284252228, "grad_norm": 363.0213928222656, "learning_rate": 0.0004748001570156744, "loss": 44.4598, "step": 2810 }, { "epoch": 7.424232419940575, "grad_norm": 312.91705322265625, "learning_rate": 0.00047478187513232394, "loss": 44.9479, "step": 2811 }, { "epoch": 7.42687355562892, "grad_norm": 427.72271728515625, "learning_rate": 0.0004747635869720718, "loss": 44.8951, "step": 2812 }, { "epoch": 7.429514691317267, "grad_norm": 377.909912109375, "learning_rate": 0.00047474529253542855, "loss": 43.0989, "step": 2813 }, { "epoch": 7.432155827005612, "grad_norm": 450.937255859375, "learning_rate": 0.000474726991822905, "loss": 42.1288, "step": 2814 }, { "epoch": 7.434796962693959, "grad_norm": 268.6912536621094, "learning_rate": 0.0004747086848350122, "loss": 41.3334, "step": 2815 }, { "epoch": 7.437438098382304, "grad_norm": 308.704833984375, "learning_rate": 0.0004746903715722614, "loss": 39.704, "step": 2816 }, { "epoch": 7.440079234070651, "grad_norm": 277.6412658691406, "learning_rate": 0.0004746720520351639, "loss": 41.6707, "step": 2817 }, { "epoch": 7.442720369758996, "grad_norm": 301.7303161621094, "learning_rate": 0.0004746537262242314, "loss": 41.0525, "step": 2818 }, { "epoch": 7.445361505447343, "grad_norm": 264.89581298828125, "learning_rate": 0.0004746353941399756, "loss": 39.7455, "step": 2819 }, { "epoch": 7.448002641135688, "grad_norm": 565.2534790039062, "learning_rate": 0.00047461705578290833, "loss": 41.7027, "step": 2820 }, { "epoch": 7.450643776824034, "grad_norm": 354.978759765625, "learning_rate": 0.0004745987111535417, "loss": 41.0023, "step": 2821 }, { "epoch": 7.45328491251238, "grad_norm": 2874.228759765625, "learning_rate": 0.00047458036025238803, "loss": 55.2251, "step": 2822 }, { "epoch": 7.455926048200726, "grad_norm": 11809.7509765625, "learning_rate": 0.00047456200307995967, "loss": 76.3931, "step": 2823 }, { "epoch": 7.458567183889072, "grad_norm": 5440.23583984375, "learning_rate": 0.00047454363963676935, "loss": 84.8063, "step": 2824 }, { "epoch": 7.461208319577418, "grad_norm": 3210.923583984375, "learning_rate": 0.0004745252699233298, "loss": 89.7642, "step": 2825 }, { "epoch": 7.463849455265764, "grad_norm": 5623.3486328125, "learning_rate": 0.0004745068939401539, "loss": 75.3675, "step": 2826 }, { "epoch": 7.46649059095411, "grad_norm": 6229.8115234375, "learning_rate": 0.00047448851168775495, "loss": 68.8687, "step": 2827 }, { "epoch": 7.469131726642456, "grad_norm": 2320.5859375, "learning_rate": 0.00047447012316664616, "loss": 69.5483, "step": 2828 }, { "epoch": 7.471772862330802, "grad_norm": 4572.767578125, "learning_rate": 0.000474451728377341, "loss": 53.4936, "step": 2829 }, { "epoch": 7.474413998019148, "grad_norm": 2416.03271484375, "learning_rate": 0.0004744333273203533, "loss": 43.5206, "step": 2830 }, { "epoch": 7.477055133707494, "grad_norm": 1975.5390625, "learning_rate": 0.0004744149199961966, "loss": 35.6345, "step": 2831 }, { "epoch": 7.47969626939584, "grad_norm": 1360.6842041015625, "learning_rate": 0.0004743965064053852, "loss": 50.8003, "step": 2832 }, { "epoch": 7.482337405084186, "grad_norm": 1836.4625244140625, "learning_rate": 0.00047437808654843315, "loss": 71.0303, "step": 2833 }, { "epoch": 7.484978540772532, "grad_norm": 1431.42041015625, "learning_rate": 0.0004743596604258549, "loss": 71.7313, "step": 2834 }, { "epoch": 7.487619676460878, "grad_norm": 977.2224731445312, "learning_rate": 0.0004743412280381648, "loss": 56.9419, "step": 2835 }, { "epoch": 7.4902608121492245, "grad_norm": 698.2857666015625, "learning_rate": 0.0004743227893858778, "loss": 46.8857, "step": 2836 }, { "epoch": 7.49290194783757, "grad_norm": 748.4661254882812, "learning_rate": 0.0004743043444695086, "loss": 43.0788, "step": 2837 }, { "epoch": 7.4955430835259165, "grad_norm": 282.2397155761719, "learning_rate": 0.0004742858932895724, "loss": 42.6579, "step": 2838 }, { "epoch": 7.498184219214262, "grad_norm": 213.45970153808594, "learning_rate": 0.00047426743584658436, "loss": 41.6042, "step": 2839 }, { "epoch": 7.5008253549026085, "grad_norm": 447.23992919921875, "learning_rate": 0.00047424897214105996, "loss": 39.6627, "step": 2840 }, { "epoch": 7.503466490590954, "grad_norm": 455.3643493652344, "learning_rate": 0.0004742305021735147, "loss": 41.4246, "step": 2841 }, { "epoch": 7.5061076262793005, "grad_norm": 473.129150390625, "learning_rate": 0.0004742120259444644, "loss": 42.7239, "step": 2842 }, { "epoch": 7.508748761967646, "grad_norm": 805.9973754882812, "learning_rate": 0.00047419354345442494, "loss": 43.3983, "step": 2843 }, { "epoch": 7.511389897655992, "grad_norm": 530.653076171875, "learning_rate": 0.00047417505470391253, "loss": 40.7568, "step": 2844 }, { "epoch": 7.514031033344338, "grad_norm": 548.7603149414062, "learning_rate": 0.0004741565596934434, "loss": 43.3411, "step": 2845 }, { "epoch": 7.5166721690326845, "grad_norm": 884.116455078125, "learning_rate": 0.000474138058423534, "loss": 41.494, "step": 2846 }, { "epoch": 7.51931330472103, "grad_norm": 1265.360107421875, "learning_rate": 0.00047411955089470106, "loss": 42.0184, "step": 2847 }, { "epoch": 7.521954440409376, "grad_norm": 542.6777954101562, "learning_rate": 0.0004741010371074612, "loss": 42.9496, "step": 2848 }, { "epoch": 7.524595576097722, "grad_norm": 701.5136108398438, "learning_rate": 0.00047408251706233156, "loss": 44.0269, "step": 2849 }, { "epoch": 7.527236711786068, "grad_norm": 2032.999267578125, "learning_rate": 0.0004740639907598293, "loss": 46.516, "step": 2850 }, { "epoch": 7.529877847474414, "grad_norm": 951.2576904296875, "learning_rate": 0.0004740454582004717, "loss": 46.1283, "step": 2851 }, { "epoch": 7.53251898316276, "grad_norm": 341.21795654296875, "learning_rate": 0.00047402691938477627, "loss": 44.2291, "step": 2852 }, { "epoch": 7.535160118851106, "grad_norm": 344.6230163574219, "learning_rate": 0.0004740083743132607, "loss": 47.7371, "step": 2853 }, { "epoch": 7.537801254539452, "grad_norm": 425.7012634277344, "learning_rate": 0.00047398982298644296, "loss": 47.9902, "step": 2854 }, { "epoch": 7.540442390227798, "grad_norm": 427.3322448730469, "learning_rate": 0.0004739712654048409, "loss": 49.9819, "step": 2855 }, { "epoch": 7.543083525916144, "grad_norm": 480.840576171875, "learning_rate": 0.00047395270156897287, "loss": 49.6486, "step": 2856 }, { "epoch": 7.54572466160449, "grad_norm": 320.3793029785156, "learning_rate": 0.00047393413147935715, "loss": 46.2456, "step": 2857 }, { "epoch": 7.548365797292836, "grad_norm": 331.64471435546875, "learning_rate": 0.00047391555513651243, "loss": 46.704, "step": 2858 }, { "epoch": 7.551006932981182, "grad_norm": 393.87158203125, "learning_rate": 0.0004738969725409573, "loss": 47.087, "step": 2859 }, { "epoch": 7.553648068669528, "grad_norm": 284.8318786621094, "learning_rate": 0.0004738783836932108, "loss": 44.9737, "step": 2860 }, { "epoch": 7.556289204357874, "grad_norm": 309.36395263671875, "learning_rate": 0.0004738597885937919, "loss": 43.9619, "step": 2861 }, { "epoch": 7.55893034004622, "grad_norm": 347.7061462402344, "learning_rate": 0.00047384118724321997, "loss": 43.1526, "step": 2862 }, { "epoch": 7.561571475734566, "grad_norm": 240.33477783203125, "learning_rate": 0.0004738225796420143, "loss": 43.6041, "step": 2863 }, { "epoch": 7.564212611422912, "grad_norm": 231.29441833496094, "learning_rate": 0.0004738039657906946, "loss": 42.7914, "step": 2864 }, { "epoch": 7.566853747111258, "grad_norm": 558.8091430664062, "learning_rate": 0.0004737853456897807, "loss": 42.556, "step": 2865 }, { "epoch": 7.569494882799604, "grad_norm": 307.15167236328125, "learning_rate": 0.00047376671933979235, "loss": 43.368, "step": 2866 }, { "epoch": 7.572136018487949, "grad_norm": 336.4648132324219, "learning_rate": 0.00047374808674124994, "loss": 40.9638, "step": 2867 }, { "epoch": 7.574777154176296, "grad_norm": 361.2081604003906, "learning_rate": 0.00047372944789467354, "loss": 40.2938, "step": 2868 }, { "epoch": 7.577418289864642, "grad_norm": 316.122314453125, "learning_rate": 0.0004737108028005838, "loss": 40.9381, "step": 2869 }, { "epoch": 7.580059425552988, "grad_norm": 490.9631042480469, "learning_rate": 0.00047369215145950125, "loss": 41.7362, "step": 2870 }, { "epoch": 7.582700561241333, "grad_norm": 412.373291015625, "learning_rate": 0.0004736734938719468, "loss": 39.7077, "step": 2871 }, { "epoch": 7.58534169692968, "grad_norm": 3613.510009765625, "learning_rate": 0.0004736548300384414, "loss": 51.2636, "step": 2872 }, { "epoch": 7.587982832618025, "grad_norm": 3577.678955078125, "learning_rate": 0.00047363615995950624, "loss": 101.8116, "step": 2873 }, { "epoch": 7.590623968306372, "grad_norm": 15975.474609375, "learning_rate": 0.0004736174836356628, "loss": 105.6101, "step": 2874 }, { "epoch": 7.593265103994717, "grad_norm": 2523.74560546875, "learning_rate": 0.0004735988010674324, "loss": 88.1379, "step": 2875 }, { "epoch": 7.595906239683064, "grad_norm": 8672.0224609375, "learning_rate": 0.00047358011225533684, "loss": 82.0247, "step": 2876 }, { "epoch": 7.5985473753714095, "grad_norm": 11727.1806640625, "learning_rate": 0.000473561417199898, "loss": 76.9829, "step": 2877 }, { "epoch": 7.601188511059756, "grad_norm": 9403.5146484375, "learning_rate": 0.0004735427159016379, "loss": 75.0261, "step": 2878 }, { "epoch": 7.6038296467481015, "grad_norm": 5412.4150390625, "learning_rate": 0.0004735240083610787, "loss": 65.9251, "step": 2879 }, { "epoch": 7.606470782436448, "grad_norm": 2751.158203125, "learning_rate": 0.000473505294578743, "loss": 57.0461, "step": 2880 }, { "epoch": 7.6091119181247935, "grad_norm": 3279.4189453125, "learning_rate": 0.0004734865745551532, "loss": 45.2061, "step": 2881 }, { "epoch": 7.61175305381314, "grad_norm": 2997.638916015625, "learning_rate": 0.000473467848290832, "loss": 41.2974, "step": 2882 }, { "epoch": 7.6143941895014855, "grad_norm": 1062.0257568359375, "learning_rate": 0.00047344911578630256, "loss": 56.51, "step": 2883 }, { "epoch": 7.617035325189832, "grad_norm": 1208.9322509765625, "learning_rate": 0.0004734303770420877, "loss": 53.224, "step": 2884 }, { "epoch": 7.6196764608781775, "grad_norm": 645.2787475585938, "learning_rate": 0.00047341163205871084, "loss": 44.8431, "step": 2885 }, { "epoch": 7.622317596566524, "grad_norm": 613.64501953125, "learning_rate": 0.0004733928808366954, "loss": 44.3285, "step": 2886 }, { "epoch": 7.6249587322548695, "grad_norm": 460.9313049316406, "learning_rate": 0.000473374123376565, "loss": 39.8307, "step": 2887 }, { "epoch": 7.627599867943216, "grad_norm": 396.0033264160156, "learning_rate": 0.00047335535967884347, "loss": 39.5098, "step": 2888 }, { "epoch": 7.6302410036315615, "grad_norm": 304.4489440917969, "learning_rate": 0.0004733365897440547, "loss": 40.3869, "step": 2889 }, { "epoch": 7.632882139319907, "grad_norm": 606.9195556640625, "learning_rate": 0.00047331781357272287, "loss": 39.8675, "step": 2890 }, { "epoch": 7.635523275008254, "grad_norm": 696.5289306640625, "learning_rate": 0.0004732990311653722, "loss": 39.8356, "step": 2891 }, { "epoch": 7.6381644106966, "grad_norm": 449.7454833984375, "learning_rate": 0.0004732802425225273, "loss": 40.9645, "step": 2892 }, { "epoch": 7.640805546384946, "grad_norm": 582.4727172851562, "learning_rate": 0.0004732614476447128, "loss": 42.0387, "step": 2893 }, { "epoch": 7.643446682073291, "grad_norm": 690.1043090820312, "learning_rate": 0.0004732426465324535, "loss": 43.1985, "step": 2894 }, { "epoch": 7.646087817761638, "grad_norm": 776.3034057617188, "learning_rate": 0.0004732238391862745, "loss": 43.7513, "step": 2895 }, { "epoch": 7.648728953449983, "grad_norm": 384.5823669433594, "learning_rate": 0.0004732050256067009, "loss": 40.818, "step": 2896 }, { "epoch": 7.65137008913833, "grad_norm": 491.7218933105469, "learning_rate": 0.000473186205794258, "loss": 40.7002, "step": 2897 }, { "epoch": 7.654011224826675, "grad_norm": 773.0963134765625, "learning_rate": 0.00047316737974947143, "loss": 42.1252, "step": 2898 }, { "epoch": 7.656652360515022, "grad_norm": 743.5973510742188, "learning_rate": 0.00047314854747286696, "loss": 43.6137, "step": 2899 }, { "epoch": 7.659293496203367, "grad_norm": 2374.0224609375, "learning_rate": 0.0004731297089649703, "loss": 43.6975, "step": 2900 }, { "epoch": 7.661934631891714, "grad_norm": 883.5426025390625, "learning_rate": 0.00047311086422630765, "loss": 44.8669, "step": 2901 }, { "epoch": 7.664575767580059, "grad_norm": 1722.1304931640625, "learning_rate": 0.00047309201325740515, "loss": 45.8024, "step": 2902 }, { "epoch": 7.667216903268406, "grad_norm": 454.7875061035156, "learning_rate": 0.0004730731560587892, "loss": 42.8326, "step": 2903 }, { "epoch": 7.669858038956751, "grad_norm": 964.2589721679688, "learning_rate": 0.0004730542926309865, "loss": 43.3707, "step": 2904 }, { "epoch": 7.672499174645098, "grad_norm": 677.546142578125, "learning_rate": 0.00047303542297452364, "loss": 48.1312, "step": 2905 }, { "epoch": 7.675140310333443, "grad_norm": 655.2567749023438, "learning_rate": 0.0004730165470899276, "loss": 50.7172, "step": 2906 }, { "epoch": 7.67778144602179, "grad_norm": 379.40350341796875, "learning_rate": 0.00047299766497772556, "loss": 47.9954, "step": 2907 }, { "epoch": 7.680422581710135, "grad_norm": 433.5335693359375, "learning_rate": 0.0004729787766384446, "loss": 48.7291, "step": 2908 }, { "epoch": 7.683063717398482, "grad_norm": 550.7196655273438, "learning_rate": 0.00047295988207261244, "loss": 48.0225, "step": 2909 }, { "epoch": 7.685704853086827, "grad_norm": 591.2416381835938, "learning_rate": 0.0004729409812807564, "loss": 45.5602, "step": 2910 }, { "epoch": 7.688345988775174, "grad_norm": 547.3509521484375, "learning_rate": 0.0004729220742634045, "loss": 47.2999, "step": 2911 }, { "epoch": 7.690987124463519, "grad_norm": 813.3056030273438, "learning_rate": 0.00047290316102108464, "loss": 46.4304, "step": 2912 }, { "epoch": 7.693628260151865, "grad_norm": 429.73138427734375, "learning_rate": 0.0004728842415543249, "loss": 42.4179, "step": 2913 }, { "epoch": 7.696269395840211, "grad_norm": 532.6132202148438, "learning_rate": 0.0004728653158636537, "loss": 43.6508, "step": 2914 }, { "epoch": 7.698910531528558, "grad_norm": 591.1275024414062, "learning_rate": 0.00047284638394959946, "loss": 40.8389, "step": 2915 }, { "epoch": 7.701551667216903, "grad_norm": 480.6085510253906, "learning_rate": 0.0004728274458126908, "loss": 41.4988, "step": 2916 }, { "epoch": 7.704192802905249, "grad_norm": 385.31390380859375, "learning_rate": 0.0004728085014534567, "loss": 40.6639, "step": 2917 }, { "epoch": 7.706833938593595, "grad_norm": 341.1507873535156, "learning_rate": 0.00047278955087242595, "loss": 40.7704, "step": 2918 }, { "epoch": 7.709475074281941, "grad_norm": 460.50250244140625, "learning_rate": 0.00047277059407012796, "loss": 41.2399, "step": 2919 }, { "epoch": 7.712116209970287, "grad_norm": 826.7686157226562, "learning_rate": 0.00047275163104709196, "loss": 39.7116, "step": 2920 }, { "epoch": 7.714757345658633, "grad_norm": 384.95660400390625, "learning_rate": 0.0004727326618038476, "loss": 40.6556, "step": 2921 }, { "epoch": 7.717398481346979, "grad_norm": 1350.6038818359375, "learning_rate": 0.0004727136863409244, "loss": 51.6404, "step": 2922 }, { "epoch": 7.720039617035325, "grad_norm": 2544.950439453125, "learning_rate": 0.00047269470465885233, "loss": 108.6896, "step": 2923 }, { "epoch": 7.722680752723671, "grad_norm": 2872.638916015625, "learning_rate": 0.0004726757167581615, "loss": 112.1714, "step": 2924 }, { "epoch": 7.725321888412017, "grad_norm": 3478.776611328125, "learning_rate": 0.0004726567226393821, "loss": 103.3525, "step": 2925 }, { "epoch": 7.727963024100363, "grad_norm": 3005.124267578125, "learning_rate": 0.00047263772230304445, "loss": 117.3203, "step": 2926 }, { "epoch": 7.730604159788709, "grad_norm": 11875.52734375, "learning_rate": 0.0004726187157496793, "loss": 94.4484, "step": 2927 }, { "epoch": 7.733245295477055, "grad_norm": 11223.5869140625, "learning_rate": 0.0004725997029798172, "loss": 82.4767, "step": 2928 }, { "epoch": 7.735886431165401, "grad_norm": 6312.43798828125, "learning_rate": 0.0004725806839939891, "loss": 74.9209, "step": 2929 }, { "epoch": 7.738527566853747, "grad_norm": 1541.213134765625, "learning_rate": 0.0004725616587927263, "loss": 79.61, "step": 2930 }, { "epoch": 7.741168702542093, "grad_norm": 5709.44921875, "learning_rate": 0.00047254262737655983, "loss": 56.1667, "step": 2931 }, { "epoch": 7.7438098382304394, "grad_norm": 1425.786865234375, "learning_rate": 0.0004725235897460212, "loss": 45.332, "step": 2932 }, { "epoch": 7.746450973918785, "grad_norm": 1129.931884765625, "learning_rate": 0.0004725045459016421, "loss": 42.888, "step": 2933 }, { "epoch": 7.7490921096071315, "grad_norm": 861.6367797851562, "learning_rate": 0.00047248549584395425, "loss": 52.9767, "step": 2934 }, { "epoch": 7.751733245295477, "grad_norm": 1212.43212890625, "learning_rate": 0.0004724664395734896, "loss": 58.4905, "step": 2935 }, { "epoch": 7.754374380983823, "grad_norm": 888.7150268554688, "learning_rate": 0.00047244737709078034, "loss": 53.6648, "step": 2936 }, { "epoch": 7.757015516672169, "grad_norm": 582.2825927734375, "learning_rate": 0.0004724283083963587, "loss": 47.2255, "step": 2937 }, { "epoch": 7.7596566523605155, "grad_norm": 671.6316528320312, "learning_rate": 0.00047240923349075724, "loss": 43.3025, "step": 2938 }, { "epoch": 7.762297788048861, "grad_norm": 481.11785888671875, "learning_rate": 0.00047239015237450857, "loss": 44.4414, "step": 2939 }, { "epoch": 7.764938923737207, "grad_norm": 1393.6702880859375, "learning_rate": 0.00047237106504814555, "loss": 40.5096, "step": 2940 }, { "epoch": 7.767580059425553, "grad_norm": 992.269287109375, "learning_rate": 0.0004723519715122012, "loss": 42.8036, "step": 2941 }, { "epoch": 7.770221195113899, "grad_norm": 593.376708984375, "learning_rate": 0.00047233287176720855, "loss": 41.2931, "step": 2942 }, { "epoch": 7.772862330802245, "grad_norm": 2357.7607421875, "learning_rate": 0.00047231376581370117, "loss": 40.2474, "step": 2943 }, { "epoch": 7.775503466490591, "grad_norm": 1270.8458251953125, "learning_rate": 0.00047229465365221244, "loss": 40.9718, "step": 2944 }, { "epoch": 7.778144602178937, "grad_norm": 1322.290283203125, "learning_rate": 0.00047227553528327607, "loss": 42.5837, "step": 2945 }, { "epoch": 7.780785737867283, "grad_norm": 1422.6419677734375, "learning_rate": 0.00047225641070742597, "loss": 40.8577, "step": 2946 }, { "epoch": 7.783426873555629, "grad_norm": 2459.962646484375, "learning_rate": 0.00047223727992519617, "loss": 40.8807, "step": 2947 }, { "epoch": 7.786068009243975, "grad_norm": 3147.142822265625, "learning_rate": 0.00047221814293712085, "loss": 42.3455, "step": 2948 }, { "epoch": 7.788709144932321, "grad_norm": 1176.686279296875, "learning_rate": 0.0004721989997437345, "loss": 44.8375, "step": 2949 }, { "epoch": 7.791350280620667, "grad_norm": 6065.1455078125, "learning_rate": 0.00047217985034557155, "loss": 46.8077, "step": 2950 }, { "epoch": 7.793991416309013, "grad_norm": 1106.640380859375, "learning_rate": 0.00047216069474316676, "loss": 46.1683, "step": 2951 }, { "epoch": 7.796632551997359, "grad_norm": 451.9765930175781, "learning_rate": 0.0004721415329370551, "loss": 44.4299, "step": 2952 }, { "epoch": 7.799273687685705, "grad_norm": 844.3341064453125, "learning_rate": 0.0004721223649277716, "loss": 46.0391, "step": 2953 }, { "epoch": 7.801914823374051, "grad_norm": 854.2704467773438, "learning_rate": 0.0004721031907158516, "loss": 45.4431, "step": 2954 }, { "epoch": 7.804555959062397, "grad_norm": 909.8058471679688, "learning_rate": 0.00047208401030183046, "loss": 48.3995, "step": 2955 }, { "epoch": 7.807197094750743, "grad_norm": 1220.9295654296875, "learning_rate": 0.00047206482368624374, "loss": 45.767, "step": 2956 }, { "epoch": 7.809838230439089, "grad_norm": 704.345703125, "learning_rate": 0.00047204563086962736, "loss": 49.7641, "step": 2957 }, { "epoch": 7.812479366127435, "grad_norm": 843.1123046875, "learning_rate": 0.0004720264318525171, "loss": 47.6505, "step": 2958 }, { "epoch": 7.81512050181578, "grad_norm": 1295.3553466796875, "learning_rate": 0.00047200722663544916, "loss": 46.0185, "step": 2959 }, { "epoch": 7.817761637504127, "grad_norm": 1089.9176025390625, "learning_rate": 0.00047198801521895984, "loss": 44.6026, "step": 2960 }, { "epoch": 7.820402773192473, "grad_norm": 589.7762451171875, "learning_rate": 0.00047196879760358563, "loss": 44.8452, "step": 2961 }, { "epoch": 7.823043908880819, "grad_norm": 665.5540161132812, "learning_rate": 0.00047194957378986316, "loss": 43.8463, "step": 2962 }, { "epoch": 7.825685044569164, "grad_norm": 795.0679321289062, "learning_rate": 0.0004719303437783291, "loss": 43.8781, "step": 2963 }, { "epoch": 7.828326180257511, "grad_norm": 651.0238037109375, "learning_rate": 0.0004719111075695206, "loss": 42.7788, "step": 2964 }, { "epoch": 7.830967315945856, "grad_norm": 534.3692016601562, "learning_rate": 0.00047189186516397486, "loss": 40.2623, "step": 2965 }, { "epoch": 7.833608451634203, "grad_norm": 610.5177612304688, "learning_rate": 0.0004718726165622291, "loss": 41.3731, "step": 2966 }, { "epoch": 7.836249587322548, "grad_norm": 775.544677734375, "learning_rate": 0.00047185336176482084, "loss": 40.427, "step": 2967 }, { "epoch": 7.838890723010895, "grad_norm": 575.5336303710938, "learning_rate": 0.0004718341007722877, "loss": 40.4306, "step": 2968 }, { "epoch": 7.84153185869924, "grad_norm": 568.974853515625, "learning_rate": 0.0004718148335851677, "loss": 39.5287, "step": 2969 }, { "epoch": 7.844172994387587, "grad_norm": 610.0752563476562, "learning_rate": 0.0004717955602039988, "loss": 41.3237, "step": 2970 }, { "epoch": 7.846814130075932, "grad_norm": 854.1422729492188, "learning_rate": 0.0004717762806293191, "loss": 39.6356, "step": 2971 }, { "epoch": 7.849455265764279, "grad_norm": 1237.4674072265625, "learning_rate": 0.0004717569948616671, "loss": 50.9013, "step": 2972 }, { "epoch": 7.852096401452624, "grad_norm": 10071.8046875, "learning_rate": 0.00047173770290158125, "loss": 73.7236, "step": 2973 }, { "epoch": 7.854737537140971, "grad_norm": 14068.7109375, "learning_rate": 0.0004717184047496003, "loss": 86.3579, "step": 2974 }, { "epoch": 7.8573786728293165, "grad_norm": 7813.79541015625, "learning_rate": 0.0004716991004062632, "loss": 86.1103, "step": 2975 }, { "epoch": 7.860019808517663, "grad_norm": 11057.5048828125, "learning_rate": 0.00047167978987210893, "loss": 71.8067, "step": 2976 }, { "epoch": 7.8626609442060085, "grad_norm": 2941.62841796875, "learning_rate": 0.0004716604731476767, "loss": 68.3269, "step": 2977 }, { "epoch": 7.865302079894355, "grad_norm": 8240.20703125, "learning_rate": 0.00047164115023350594, "loss": 62.2852, "step": 2978 }, { "epoch": 7.8679432155827005, "grad_norm": 4042.561767578125, "learning_rate": 0.0004716218211301363, "loss": 58.6555, "step": 2979 }, { "epoch": 7.870584351271047, "grad_norm": 5013.2001953125, "learning_rate": 0.0004716024858381075, "loss": 56.4566, "step": 2980 }, { "epoch": 7.8732254869593925, "grad_norm": 2445.501953125, "learning_rate": 0.0004715831443579595, "loss": 44.1054, "step": 2981 }, { "epoch": 7.875866622647738, "grad_norm": 7724.484375, "learning_rate": 0.0004715637966902323, "loss": 40.5246, "step": 2982 }, { "epoch": 7.8785077583360845, "grad_norm": 4102.26025390625, "learning_rate": 0.0004715444428354663, "loss": 40.301, "step": 2983 }, { "epoch": 7.881148894024431, "grad_norm": 697.1181640625, "learning_rate": 0.0004715250827942018, "loss": 42.3506, "step": 2984 }, { "epoch": 7.8837900297127765, "grad_norm": 779.1642456054688, "learning_rate": 0.0004715057165669795, "loss": 44.6926, "step": 2985 }, { "epoch": 7.886431165401122, "grad_norm": 1028.787841796875, "learning_rate": 0.00047148634415434024, "loss": 43.8739, "step": 2986 }, { "epoch": 7.8890723010894686, "grad_norm": 617.3734741210938, "learning_rate": 0.0004714669655568249, "loss": 40.952, "step": 2987 }, { "epoch": 7.891713436777814, "grad_norm": 916.58056640625, "learning_rate": 0.0004714475807749746, "loss": 40.9478, "step": 2988 }, { "epoch": 7.894354572466161, "grad_norm": 712.227294921875, "learning_rate": 0.00047142818980933077, "loss": 41.859, "step": 2989 }, { "epoch": 7.896995708154506, "grad_norm": 558.9727172851562, "learning_rate": 0.0004714087926604347, "loss": 39.2013, "step": 2990 }, { "epoch": 7.899636843842853, "grad_norm": 933.861328125, "learning_rate": 0.00047138938932882825, "loss": 40.5143, "step": 2991 }, { "epoch": 7.902277979531198, "grad_norm": 733.9913330078125, "learning_rate": 0.0004713699798150531, "loss": 41.0206, "step": 2992 }, { "epoch": 7.904919115219545, "grad_norm": 837.3720703125, "learning_rate": 0.0004713505641196514, "loss": 39.0316, "step": 2993 }, { "epoch": 7.90756025090789, "grad_norm": 974.1793212890625, "learning_rate": 0.0004713311422431652, "loss": 39.893, "step": 2994 }, { "epoch": 7.910201386596237, "grad_norm": 724.8908081054688, "learning_rate": 0.00047131171418613685, "loss": 39.6467, "step": 2995 }, { "epoch": 7.912842522284582, "grad_norm": 1074.640380859375, "learning_rate": 0.0004712922799491088, "loss": 39.6043, "step": 2996 }, { "epoch": 7.915483657972929, "grad_norm": 756.3291015625, "learning_rate": 0.0004712728395326239, "loss": 40.2994, "step": 2997 }, { "epoch": 7.918124793661274, "grad_norm": 974.2147827148438, "learning_rate": 0.00047125339293722495, "loss": 41.7843, "step": 2998 }, { "epoch": 7.920765929349621, "grad_norm": 933.677001953125, "learning_rate": 0.000471233940163455, "loss": 42.8585, "step": 2999 }, { "epoch": 7.923407065037966, "grad_norm": 4574.57958984375, "learning_rate": 0.00047121448121185716, "loss": 43.8816, "step": 3000 }, { "epoch": 7.923407065037966, "eval_loss": 5.3429975509643555, "eval_runtime": 2.1043, "eval_samples_per_second": 235.233, "eval_steps_per_second": 29.463, "step": 3000 }, { "epoch": 7.926048200726313, "grad_norm": 828.4193725585938, "learning_rate": 0.0004711950160829749, "loss": 45.5398, "step": 3001 }, { "epoch": 7.928689336414658, "grad_norm": 590.0009765625, "learning_rate": 0.0004711755447773518, "loss": 42.3523, "step": 3002 }, { "epoch": 7.931330472103005, "grad_norm": 862.437744140625, "learning_rate": 0.00047115606729553153, "loss": 43.138, "step": 3003 }, { "epoch": 7.93397160779135, "grad_norm": 522.7597045898438, "learning_rate": 0.0004711365836380579, "loss": 46.6504, "step": 3004 }, { "epoch": 7.936612743479696, "grad_norm": 685.9568481445312, "learning_rate": 0.00047111709380547517, "loss": 46.2779, "step": 3005 }, { "epoch": 7.939253879168042, "grad_norm": 738.7718505859375, "learning_rate": 0.00047109759779832746, "loss": 43.2494, "step": 3006 }, { "epoch": 7.941895014856389, "grad_norm": 1274.4844970703125, "learning_rate": 0.0004710780956171592, "loss": 42.7351, "step": 3007 }, { "epoch": 7.944536150544734, "grad_norm": 573.2459716796875, "learning_rate": 0.000471058587262515, "loss": 41.9733, "step": 3008 }, { "epoch": 7.94717728623308, "grad_norm": 438.829833984375, "learning_rate": 0.0004710390727349396, "loss": 41.2322, "step": 3009 }, { "epoch": 7.949818421921426, "grad_norm": 525.8713989257812, "learning_rate": 0.00047101955203497794, "loss": 41.155, "step": 3010 }, { "epoch": 7.952459557609772, "grad_norm": 428.24761962890625, "learning_rate": 0.0004710000251631751, "loss": 39.0713, "step": 3011 }, { "epoch": 7.955100693298118, "grad_norm": 286.5125427246094, "learning_rate": 0.0004709804921200764, "loss": 38.8572, "step": 3012 }, { "epoch": 7.957741828986464, "grad_norm": 698.8197631835938, "learning_rate": 0.0004709609529062273, "loss": 40.6671, "step": 3013 }, { "epoch": 7.96038296467481, "grad_norm": 3957.8671875, "learning_rate": 0.0004709414075221734, "loss": 43.3257, "step": 3014 }, { "epoch": 7.963024100363156, "grad_norm": 8966.525390625, "learning_rate": 0.0004709218559684604, "loss": 59.1989, "step": 3015 }, { "epoch": 7.965665236051502, "grad_norm": 32139.7265625, "learning_rate": 0.0004709022982456344, "loss": 59.4886, "step": 3016 }, { "epoch": 7.968306371739848, "grad_norm": 14675.9677734375, "learning_rate": 0.00047088273435424153, "loss": 53.0822, "step": 3017 }, { "epoch": 7.970947507428194, "grad_norm": 13182.96875, "learning_rate": 0.000470863164294828, "loss": 48.8487, "step": 3018 }, { "epoch": 7.97358864311654, "grad_norm": 1762.395263671875, "learning_rate": 0.00047084358806794035, "loss": 39.4766, "step": 3019 }, { "epoch": 7.976229778804886, "grad_norm": 688.557373046875, "learning_rate": 0.0004708240056741252, "loss": 41.201, "step": 3020 }, { "epoch": 7.978870914493232, "grad_norm": 406.6363830566406, "learning_rate": 0.0004708044171139295, "loss": 39.5351, "step": 3021 }, { "epoch": 7.981512050181578, "grad_norm": 395.2772216796875, "learning_rate": 0.0004707848223879001, "loss": 40.9678, "step": 3022 }, { "epoch": 7.984153185869924, "grad_norm": 495.8891296386719, "learning_rate": 0.00047076522149658425, "loss": 40.1411, "step": 3023 }, { "epoch": 7.98679432155827, "grad_norm": 356.2596435546875, "learning_rate": 0.0004707456144405292, "loss": 38.7812, "step": 3024 }, { "epoch": 7.989435457246616, "grad_norm": 490.7709655761719, "learning_rate": 0.00047072600122028264, "loss": 40.5224, "step": 3025 }, { "epoch": 7.992076592934962, "grad_norm": 1185.498046875, "learning_rate": 0.00047070638183639214, "loss": 39.4627, "step": 3026 }, { "epoch": 7.994717728623308, "grad_norm": 687.9810180664062, "learning_rate": 0.0004706867562894056, "loss": 39.8637, "step": 3027 }, { "epoch": 7.9973588643116535, "grad_norm": 699.7904052734375, "learning_rate": 0.0004706671245798709, "loss": 40.6641, "step": 3028 }, { "epoch": 8.0, "grad_norm": 606.6514892578125, "learning_rate": 0.00047064748670833654, "loss": 43.7715, "step": 3029 }, { "epoch": 8.002641135688346, "grad_norm": 1148.4100341796875, "learning_rate": 0.00047062784267535066, "loss": 53.9245, "step": 3030 }, { "epoch": 8.005282271376691, "grad_norm": 699.3089599609375, "learning_rate": 0.00047060819248146183, "loss": 46.7761, "step": 3031 }, { "epoch": 8.007923407065038, "grad_norm": 345.5414123535156, "learning_rate": 0.0004705885361272189, "loss": 42.8405, "step": 3032 }, { "epoch": 8.010564542753384, "grad_norm": 383.8101501464844, "learning_rate": 0.00047056887361317056, "loss": 45.7574, "step": 3033 }, { "epoch": 8.01320567844173, "grad_norm": 400.0096740722656, "learning_rate": 0.0004705492049398661, "loss": 48.0722, "step": 3034 }, { "epoch": 8.015846814130075, "grad_norm": 307.10198974609375, "learning_rate": 0.00047052953010785466, "loss": 44.7529, "step": 3035 }, { "epoch": 8.018487949818422, "grad_norm": 1419.2535400390625, "learning_rate": 0.0004705098491176856, "loss": 48.4861, "step": 3036 }, { "epoch": 8.021129085506768, "grad_norm": 280.48486328125, "learning_rate": 0.0004704901619699085, "loss": 46.4938, "step": 3037 }, { "epoch": 8.023770221195115, "grad_norm": 231.98760986328125, "learning_rate": 0.0004704704686650732, "loss": 46.7579, "step": 3038 }, { "epoch": 8.02641135688346, "grad_norm": 270.31353759765625, "learning_rate": 0.00047045076920372954, "loss": 46.2409, "step": 3039 }, { "epoch": 8.029052492571806, "grad_norm": 227.04290771484375, "learning_rate": 0.0004704310635864277, "loss": 46.3765, "step": 3040 }, { "epoch": 8.031693628260152, "grad_norm": 192.85238647460938, "learning_rate": 0.00047041135181371787, "loss": 44.0607, "step": 3041 }, { "epoch": 8.034334763948499, "grad_norm": 178.92613220214844, "learning_rate": 0.00047039163388615045, "loss": 43.3735, "step": 3042 }, { "epoch": 8.036975899636843, "grad_norm": 203.94073486328125, "learning_rate": 0.00047037190980427627, "loss": 41.342, "step": 3043 }, { "epoch": 8.03961703532519, "grad_norm": 247.78802490234375, "learning_rate": 0.00047035217956864587, "loss": 40.2813, "step": 3044 }, { "epoch": 8.042258171013536, "grad_norm": 268.3840026855469, "learning_rate": 0.0004703324431798103, "loss": 41.3274, "step": 3045 }, { "epoch": 8.044899306701883, "grad_norm": 170.71128845214844, "learning_rate": 0.00047031270063832066, "loss": 39.5075, "step": 3046 }, { "epoch": 8.047540442390227, "grad_norm": 166.9001922607422, "learning_rate": 0.0004702929519447283, "loss": 40.1941, "step": 3047 }, { "epoch": 8.050181578078574, "grad_norm": 234.2989044189453, "learning_rate": 0.00047027319709958465, "loss": 40.0717, "step": 3048 }, { "epoch": 8.05282271376692, "grad_norm": 415.47021484375, "learning_rate": 0.0004702534361034414, "loss": 40.4258, "step": 3049 }, { "epoch": 8.055463849455267, "grad_norm": 736.62060546875, "learning_rate": 0.0004702336689568503, "loss": 39.2052, "step": 3050 }, { "epoch": 8.058104985143611, "grad_norm": 603.3228149414062, "learning_rate": 0.0004702138956603634, "loss": 48.9874, "step": 3051 }, { "epoch": 8.060746120831958, "grad_norm": 3847.41455078125, "learning_rate": 0.0004701941162145328, "loss": 111.8326, "step": 3052 }, { "epoch": 8.063387256520304, "grad_norm": 4978.70849609375, "learning_rate": 0.0004701743306199108, "loss": 114.2377, "step": 3053 }, { "epoch": 8.066028392208649, "grad_norm": 5173.14794921875, "learning_rate": 0.00047015453887705005, "loss": 114.6182, "step": 3054 }, { "epoch": 8.068669527896995, "grad_norm": 10267.1318359375, "learning_rate": 0.0004701347409865031, "loss": 121.0366, "step": 3055 }, { "epoch": 8.071310663585342, "grad_norm": 10506.9033203125, "learning_rate": 0.00047011493694882277, "loss": 106.9411, "step": 3056 }, { "epoch": 8.073951799273688, "grad_norm": 10254.1796875, "learning_rate": 0.00047009512676456224, "loss": 84.9917, "step": 3057 }, { "epoch": 8.076592934962033, "grad_norm": 3940.34423828125, "learning_rate": 0.00047007531043427445, "loss": 76.2088, "step": 3058 }, { "epoch": 8.07923407065038, "grad_norm": 3780.55810546875, "learning_rate": 0.00047005548795851294, "loss": 71.7404, "step": 3059 }, { "epoch": 8.081875206338726, "grad_norm": 2805.58837890625, "learning_rate": 0.00047003565933783123, "loss": 77.0224, "step": 3060 }, { "epoch": 8.084516342027072, "grad_norm": 726.5261840820312, "learning_rate": 0.000470015824572783, "loss": 42.9137, "step": 3061 }, { "epoch": 8.087157477715417, "grad_norm": 750.685546875, "learning_rate": 0.00046999598366392203, "loss": 42.0934, "step": 3062 }, { "epoch": 8.089798613403763, "grad_norm": 748.8922119140625, "learning_rate": 0.00046997613661180253, "loss": 43.4918, "step": 3063 }, { "epoch": 8.09243974909211, "grad_norm": 676.8190307617188, "learning_rate": 0.00046995628341697863, "loss": 41.2146, "step": 3064 }, { "epoch": 8.095080884780456, "grad_norm": 847.3533935546875, "learning_rate": 0.0004699364240800047, "loss": 40.1749, "step": 3065 }, { "epoch": 8.097722020468801, "grad_norm": 641.2313842773438, "learning_rate": 0.00046991655860143536, "loss": 40.7571, "step": 3066 }, { "epoch": 8.100363156157147, "grad_norm": 370.7998962402344, "learning_rate": 0.0004698966869818253, "loss": 39.5353, "step": 3067 }, { "epoch": 8.103004291845494, "grad_norm": 3246.43701171875, "learning_rate": 0.0004698768092217294, "loss": 40.6469, "step": 3068 }, { "epoch": 8.10564542753384, "grad_norm": 814.070068359375, "learning_rate": 0.0004698569253217028, "loss": 40.7614, "step": 3069 }, { "epoch": 8.108286563222185, "grad_norm": 803.130126953125, "learning_rate": 0.00046983703528230065, "loss": 40.5017, "step": 3070 }, { "epoch": 8.110927698910531, "grad_norm": 732.1276245117188, "learning_rate": 0.0004698171391040785, "loss": 40.5079, "step": 3071 }, { "epoch": 8.113568834598878, "grad_norm": 461.0784912109375, "learning_rate": 0.0004697972367875918, "loss": 39.1857, "step": 3072 }, { "epoch": 8.116209970287224, "grad_norm": 707.4703369140625, "learning_rate": 0.00046977732833339644, "loss": 39.4584, "step": 3073 }, { "epoch": 8.118851105975569, "grad_norm": 579.92578125, "learning_rate": 0.0004697574137420482, "loss": 39.8038, "step": 3074 }, { "epoch": 8.121492241663915, "grad_norm": 580.7116088867188, "learning_rate": 0.0004697374930141034, "loss": 37.7846, "step": 3075 }, { "epoch": 8.124133377352262, "grad_norm": 1265.088134765625, "learning_rate": 0.0004697175661501181, "loss": 39.8922, "step": 3076 }, { "epoch": 8.126774513040607, "grad_norm": 461.37646484375, "learning_rate": 0.00046969763315064887, "loss": 41.9817, "step": 3077 }, { "epoch": 8.129415648728953, "grad_norm": 721.961181640625, "learning_rate": 0.0004696776940162523, "loss": 40.1629, "step": 3078 }, { "epoch": 8.1320567844173, "grad_norm": 1708.5128173828125, "learning_rate": 0.00046965774874748513, "loss": 43.1049, "step": 3079 }, { "epoch": 8.134697920105646, "grad_norm": 649.9091186523438, "learning_rate": 0.00046963779734490437, "loss": 42.6434, "step": 3080 }, { "epoch": 8.13733905579399, "grad_norm": 361.2000732421875, "learning_rate": 0.00046961783980906715, "loss": 44.0943, "step": 3081 }, { "epoch": 8.139980191482337, "grad_norm": 801.2976684570312, "learning_rate": 0.0004695978761405307, "loss": 43.9978, "step": 3082 }, { "epoch": 8.142621327170684, "grad_norm": 482.2872009277344, "learning_rate": 0.0004695779063398527, "loss": 43.1553, "step": 3083 }, { "epoch": 8.14526246285903, "grad_norm": 481.34613037109375, "learning_rate": 0.00046955793040759055, "loss": 50.9247, "step": 3084 }, { "epoch": 8.147903598547375, "grad_norm": 607.8963623046875, "learning_rate": 0.0004695379483443022, "loss": 49.3769, "step": 3085 }, { "epoch": 8.150544734235721, "grad_norm": 432.5633850097656, "learning_rate": 0.0004695179601505456, "loss": 44.997, "step": 3086 }, { "epoch": 8.153185869924068, "grad_norm": 441.9025573730469, "learning_rate": 0.0004694979658268789, "loss": 47.5852, "step": 3087 }, { "epoch": 8.155827005612414, "grad_norm": 339.50390625, "learning_rate": 0.0004694779653738605, "loss": 42.2206, "step": 3088 }, { "epoch": 8.158468141300759, "grad_norm": 492.0137634277344, "learning_rate": 0.0004694579587920488, "loss": 43.4657, "step": 3089 }, { "epoch": 8.161109276989105, "grad_norm": 629.2236328125, "learning_rate": 0.0004694379460820025, "loss": 42.9031, "step": 3090 }, { "epoch": 8.163750412677452, "grad_norm": 442.7400207519531, "learning_rate": 0.0004694179272442805, "loss": 43.4144, "step": 3091 }, { "epoch": 8.166391548365798, "grad_norm": 630.5338745117188, "learning_rate": 0.0004693979022794418, "loss": 40.1983, "step": 3092 }, { "epoch": 8.169032684054143, "grad_norm": 295.2851257324219, "learning_rate": 0.00046937787118804547, "loss": 40.0226, "step": 3093 }, { "epoch": 8.17167381974249, "grad_norm": 331.0938415527344, "learning_rate": 0.000469357833970651, "loss": 39.8287, "step": 3094 }, { "epoch": 8.174314955430836, "grad_norm": 576.46142578125, "learning_rate": 0.0004693377906278179, "loss": 39.4039, "step": 3095 }, { "epoch": 8.176956091119182, "grad_norm": 206.6712646484375, "learning_rate": 0.00046931774116010586, "loss": 39.0527, "step": 3096 }, { "epoch": 8.179597226807527, "grad_norm": 695.5819702148438, "learning_rate": 0.0004692976855680746, "loss": 38.6438, "step": 3097 }, { "epoch": 8.182238362495873, "grad_norm": 343.0960388183594, "learning_rate": 0.00046927762385228443, "loss": 38.0701, "step": 3098 }, { "epoch": 8.18487949818422, "grad_norm": 400.9284973144531, "learning_rate": 0.0004692575560132954, "loss": 39.2615, "step": 3099 }, { "epoch": 8.187520633872564, "grad_norm": 329.19012451171875, "learning_rate": 0.0004692374820516679, "loss": 41.5182, "step": 3100 }, { "epoch": 8.19016176956091, "grad_norm": 986.535400390625, "learning_rate": 0.00046921740196796246, "loss": 63.2837, "step": 3101 }, { "epoch": 8.192802905249257, "grad_norm": 6517.595703125, "learning_rate": 0.00046919731576273993, "loss": 67.9644, "step": 3102 }, { "epoch": 8.195444040937604, "grad_norm": 2183.336181640625, "learning_rate": 0.00046917722343656107, "loss": 71.049, "step": 3103 }, { "epoch": 8.198085176625948, "grad_norm": 1548.758056640625, "learning_rate": 0.000469157124989987, "loss": 59.5046, "step": 3104 }, { "epoch": 8.200726312314295, "grad_norm": 28033.0, "learning_rate": 0.00046913702042357896, "loss": 54.5317, "step": 3105 }, { "epoch": 8.203367448002641, "grad_norm": 2359.114013671875, "learning_rate": 0.00046911690973789835, "loss": 45.0951, "step": 3106 }, { "epoch": 8.206008583690988, "grad_norm": 1699.8388671875, "learning_rate": 0.00046909679293350673, "loss": 38.9081, "step": 3107 }, { "epoch": 8.208649719379332, "grad_norm": 1734.1021728515625, "learning_rate": 0.0004690766700109659, "loss": 26.9547, "step": 3108 }, { "epoch": 8.211290855067679, "grad_norm": 476.4400939941406, "learning_rate": 0.0004690565409708377, "loss": 25.9266, "step": 3109 }, { "epoch": 8.213931990756025, "grad_norm": 895.3294677734375, "learning_rate": 0.00046903640581368435, "loss": 27.0588, "step": 3110 }, { "epoch": 8.216573126444372, "grad_norm": 5206.93310546875, "learning_rate": 0.00046901626454006807, "loss": 57.2038, "step": 3111 }, { "epoch": 8.219214262132716, "grad_norm": 1787.7103271484375, "learning_rate": 0.00046899611715055115, "loss": 97.2051, "step": 3112 }, { "epoch": 8.221855397821063, "grad_norm": 1497.646484375, "learning_rate": 0.0004689759636456964, "loss": 91.9546, "step": 3113 }, { "epoch": 8.22449653350941, "grad_norm": 2285.409912109375, "learning_rate": 0.0004689558040260664, "loss": 80.3973, "step": 3114 }, { "epoch": 8.227137669197756, "grad_norm": 1635.597900390625, "learning_rate": 0.0004689356382922243, "loss": 62.6172, "step": 3115 }, { "epoch": 8.2297788048861, "grad_norm": 1158.7822265625, "learning_rate": 0.000468915466444733, "loss": 47.0549, "step": 3116 }, { "epoch": 8.232419940574447, "grad_norm": 608.4374389648438, "learning_rate": 0.000468895288484156, "loss": 47.8112, "step": 3117 }, { "epoch": 8.235061076262793, "grad_norm": 547.205810546875, "learning_rate": 0.00046887510441105657, "loss": 45.9796, "step": 3118 }, { "epoch": 8.23770221195114, "grad_norm": 363.8179931640625, "learning_rate": 0.00046885491422599845, "loss": 44.0649, "step": 3119 }, { "epoch": 8.240343347639485, "grad_norm": 451.7872619628906, "learning_rate": 0.00046883471792954546, "loss": 42.6362, "step": 3120 }, { "epoch": 8.242984483327831, "grad_norm": 812.8533935546875, "learning_rate": 0.00046881451552226145, "loss": 41.1448, "step": 3121 }, { "epoch": 8.245625619016177, "grad_norm": 720.3878784179688, "learning_rate": 0.0004687943070047107, "loss": 42.0888, "step": 3122 }, { "epoch": 8.248266754704522, "grad_norm": 581.0066528320312, "learning_rate": 0.0004687740923774574, "loss": 41.2675, "step": 3123 }, { "epoch": 8.250907890392869, "grad_norm": 655.2254638671875, "learning_rate": 0.0004687538716410661, "loss": 42.2351, "step": 3124 }, { "epoch": 8.253549026081215, "grad_norm": 1168.175048828125, "learning_rate": 0.0004687336447961015, "loss": 42.0504, "step": 3125 }, { "epoch": 8.256190161769561, "grad_norm": 886.1305541992188, "learning_rate": 0.00046871341184312823, "loss": 41.2787, "step": 3126 }, { "epoch": 8.258831297457906, "grad_norm": 484.6871032714844, "learning_rate": 0.0004686931727827115, "loss": 40.6195, "step": 3127 }, { "epoch": 8.261472433146253, "grad_norm": 857.0933227539062, "learning_rate": 0.00046867292761541634, "loss": 44.9224, "step": 3128 }, { "epoch": 8.264113568834599, "grad_norm": 973.7351684570312, "learning_rate": 0.0004686526763418082, "loss": 49.0828, "step": 3129 }, { "epoch": 8.266754704522945, "grad_norm": 329.5014343261719, "learning_rate": 0.0004686324189624525, "loss": 44.9938, "step": 3130 }, { "epoch": 8.26939584021129, "grad_norm": 309.4750061035156, "learning_rate": 0.0004686121554779149, "loss": 44.5217, "step": 3131 }, { "epoch": 8.272036975899637, "grad_norm": 257.3720703125, "learning_rate": 0.0004685918858887613, "loss": 43.9642, "step": 3132 }, { "epoch": 8.274678111587983, "grad_norm": 331.0533447265625, "learning_rate": 0.0004685716101955577, "loss": 47.5834, "step": 3133 }, { "epoch": 8.27731924727633, "grad_norm": 449.6252136230469, "learning_rate": 0.00046855132839887027, "loss": 47.4789, "step": 3134 }, { "epoch": 8.279960382964674, "grad_norm": 537.1668701171875, "learning_rate": 0.00046853104049926535, "loss": 47.8484, "step": 3135 }, { "epoch": 8.28260151865302, "grad_norm": 507.8381042480469, "learning_rate": 0.0004685107464973095, "loss": 46.8546, "step": 3136 }, { "epoch": 8.285242654341367, "grad_norm": 389.9399108886719, "learning_rate": 0.00046849044639356947, "loss": 45.7273, "step": 3137 }, { "epoch": 8.287883790029714, "grad_norm": 330.1365966796875, "learning_rate": 0.000468470140188612, "loss": 48.1848, "step": 3138 }, { "epoch": 8.290524925718058, "grad_norm": 308.8583679199219, "learning_rate": 0.0004684498278830043, "loss": 47.8902, "step": 3139 }, { "epoch": 8.293166061406405, "grad_norm": 319.4422912597656, "learning_rate": 0.0004684295094773134, "loss": 44.7341, "step": 3140 }, { "epoch": 8.295807197094751, "grad_norm": 359.3221130371094, "learning_rate": 0.0004684091849721068, "loss": 45.4149, "step": 3141 }, { "epoch": 8.298448332783098, "grad_norm": 247.6615447998047, "learning_rate": 0.00046838885436795196, "loss": 43.7764, "step": 3142 }, { "epoch": 8.301089468471442, "grad_norm": 466.1419372558594, "learning_rate": 0.00046836851766541665, "loss": 42.4497, "step": 3143 }, { "epoch": 8.303730604159789, "grad_norm": 280.5438232421875, "learning_rate": 0.00046834817486506887, "loss": 40.7497, "step": 3144 }, { "epoch": 8.306371739848135, "grad_norm": 272.26947021484375, "learning_rate": 0.0004683278259674765, "loss": 41.8898, "step": 3145 }, { "epoch": 8.309012875536482, "grad_norm": 351.8935241699219, "learning_rate": 0.0004683074709732078, "loss": 39.5221, "step": 3146 }, { "epoch": 8.311654011224826, "grad_norm": 206.68263244628906, "learning_rate": 0.00046828710988283125, "loss": 38.9593, "step": 3147 }, { "epoch": 8.314295146913173, "grad_norm": 269.7945861816406, "learning_rate": 0.00046826674269691537, "loss": 39.0318, "step": 3148 }, { "epoch": 8.31693628260152, "grad_norm": 239.61192321777344, "learning_rate": 0.0004682463694160289, "loss": 43.1666, "step": 3149 }, { "epoch": 8.319577418289864, "grad_norm": 373.39056396484375, "learning_rate": 0.00046822599004074086, "loss": 40.6822, "step": 3150 }, { "epoch": 8.32221855397821, "grad_norm": 546.9072875976562, "learning_rate": 0.0004682056045716202, "loss": 50.9195, "step": 3151 }, { "epoch": 8.324859689666557, "grad_norm": 3970.204345703125, "learning_rate": 0.0004681852130092362, "loss": 89.3051, "step": 3152 }, { "epoch": 8.327500825354903, "grad_norm": 1646.7020263671875, "learning_rate": 0.0004681648153541583, "loss": 128.4858, "step": 3153 }, { "epoch": 8.330141961043248, "grad_norm": 1450.159912109375, "learning_rate": 0.0004681444116069561, "loss": 108.4168, "step": 3154 }, { "epoch": 8.332783096731594, "grad_norm": 3322.34326171875, "learning_rate": 0.0004681240017681993, "loss": 93.3688, "step": 3155 }, { "epoch": 8.33542423241994, "grad_norm": 4221.70849609375, "learning_rate": 0.0004681035858384579, "loss": 65.084, "step": 3156 }, { "epoch": 8.338065368108287, "grad_norm": 3561.515380859375, "learning_rate": 0.000468083163818302, "loss": 48.7041, "step": 3157 }, { "epoch": 8.340706503796632, "grad_norm": 688.2584228515625, "learning_rate": 0.0004680627357083019, "loss": 32.9357, "step": 3158 }, { "epoch": 8.343347639484978, "grad_norm": 413.22747802734375, "learning_rate": 0.0004680423015090279, "loss": 22.3242, "step": 3159 }, { "epoch": 8.345988775173325, "grad_norm": 621.9296875, "learning_rate": 0.00046802186122105085, "loss": 22.7526, "step": 3160 }, { "epoch": 8.348629910861671, "grad_norm": 2856.8330078125, "learning_rate": 0.0004680014148449413, "loss": 93.0124, "step": 3161 }, { "epoch": 8.351271046550016, "grad_norm": 3595.041015625, "learning_rate": 0.0004679809623812703, "loss": 160.6175, "step": 3162 }, { "epoch": 8.353912182238362, "grad_norm": 2197.303955078125, "learning_rate": 0.00046796050383060903, "loss": 117.0031, "step": 3163 }, { "epoch": 8.356553317926709, "grad_norm": 1579.1875, "learning_rate": 0.00046794003919352865, "loss": 77.8993, "step": 3164 }, { "epoch": 8.359194453615055, "grad_norm": 744.1189575195312, "learning_rate": 0.0004679195684706008, "loss": 52.3697, "step": 3165 }, { "epoch": 8.3618355893034, "grad_norm": 711.7191162109375, "learning_rate": 0.000467899091662397, "loss": 49.5385, "step": 3166 }, { "epoch": 8.364476724991746, "grad_norm": 561.4614868164062, "learning_rate": 0.00046787860876948894, "loss": 48.5484, "step": 3167 }, { "epoch": 8.367117860680093, "grad_norm": 314.76324462890625, "learning_rate": 0.00046785811979244883, "loss": 47.2883, "step": 3168 }, { "epoch": 8.36975899636844, "grad_norm": 547.149658203125, "learning_rate": 0.0004678376247318487, "loss": 45.4327, "step": 3169 }, { "epoch": 8.372400132056784, "grad_norm": 473.3979797363281, "learning_rate": 0.0004678171235882609, "loss": 44.2835, "step": 3170 }, { "epoch": 8.37504126774513, "grad_norm": 509.6723327636719, "learning_rate": 0.0004677966163622578, "loss": 44.1031, "step": 3171 }, { "epoch": 8.377682403433477, "grad_norm": 591.1434326171875, "learning_rate": 0.0004677761030544121, "loss": 42.4487, "step": 3172 }, { "epoch": 8.380323539121822, "grad_norm": 588.9575805664062, "learning_rate": 0.0004677555836652967, "loss": 41.8085, "step": 3173 }, { "epoch": 8.382964674810168, "grad_norm": 474.97113037109375, "learning_rate": 0.00046773505819548456, "loss": 41.0588, "step": 3174 }, { "epoch": 8.385605810498515, "grad_norm": 566.578125, "learning_rate": 0.00046771452664554883, "loss": 42.2881, "step": 3175 }, { "epoch": 8.388246946186861, "grad_norm": 597.8026733398438, "learning_rate": 0.0004676939890160628, "loss": 41.4797, "step": 3176 }, { "epoch": 8.390888081875206, "grad_norm": 722.1470336914062, "learning_rate": 0.00046767344530760003, "loss": 42.0879, "step": 3177 }, { "epoch": 8.393529217563552, "grad_norm": 1023.9784545898438, "learning_rate": 0.00046765289552073416, "loss": 43.3865, "step": 3178 }, { "epoch": 8.396170353251899, "grad_norm": 2322.359619140625, "learning_rate": 0.000467632339656039, "loss": 47.6651, "step": 3179 }, { "epoch": 8.398811488940245, "grad_norm": 312.9858093261719, "learning_rate": 0.0004676117777140887, "loss": 44.3619, "step": 3180 }, { "epoch": 8.40145262462859, "grad_norm": 469.3870544433594, "learning_rate": 0.00046759120969545724, "loss": 43.9762, "step": 3181 }, { "epoch": 8.404093760316936, "grad_norm": 1496.3135986328125, "learning_rate": 0.0004675706356007191, "loss": 44.623, "step": 3182 }, { "epoch": 8.406734896005283, "grad_norm": 466.92022705078125, "learning_rate": 0.0004675500554304488, "loss": 45.51, "step": 3183 }, { "epoch": 8.409376031693629, "grad_norm": 427.0769958496094, "learning_rate": 0.000467529469185221, "loss": 44.4374, "step": 3184 }, { "epoch": 8.412017167381974, "grad_norm": 393.4505920410156, "learning_rate": 0.0004675088768656105, "loss": 45.4794, "step": 3185 }, { "epoch": 8.41465830307032, "grad_norm": 512.6494750976562, "learning_rate": 0.0004674882784721924, "loss": 44.9479, "step": 3186 }, { "epoch": 8.417299438758667, "grad_norm": 404.5897216796875, "learning_rate": 0.0004674676740055419, "loss": 48.06, "step": 3187 }, { "epoch": 8.419940574447013, "grad_norm": 494.13800048828125, "learning_rate": 0.00046744706346623434, "loss": 45.1544, "step": 3188 }, { "epoch": 8.422581710135358, "grad_norm": 496.1769714355469, "learning_rate": 0.00046742644685484523, "loss": 43.2696, "step": 3189 }, { "epoch": 8.425222845823704, "grad_norm": 820.063232421875, "learning_rate": 0.0004674058241719504, "loss": 43.6233, "step": 3190 }, { "epoch": 8.42786398151205, "grad_norm": 366.9087829589844, "learning_rate": 0.00046738519541812565, "loss": 43.1062, "step": 3191 }, { "epoch": 8.430505117200397, "grad_norm": 923.6129760742188, "learning_rate": 0.0004673645605939469, "loss": 46.476, "step": 3192 }, { "epoch": 8.433146252888742, "grad_norm": 417.63970947265625, "learning_rate": 0.0004673439196999906, "loss": 42.9578, "step": 3193 }, { "epoch": 8.435787388577088, "grad_norm": 515.7982177734375, "learning_rate": 0.00046732327273683297, "loss": 42.508, "step": 3194 }, { "epoch": 8.438428524265435, "grad_norm": 712.07421875, "learning_rate": 0.00046730261970505063, "loss": 41.1228, "step": 3195 }, { "epoch": 8.44106965995378, "grad_norm": 373.3747863769531, "learning_rate": 0.0004672819606052202, "loss": 40.6923, "step": 3196 }, { "epoch": 8.443710795642126, "grad_norm": 330.38262939453125, "learning_rate": 0.0004672612954379187, "loss": 40.1628, "step": 3197 }, { "epoch": 8.446351931330472, "grad_norm": 322.1870422363281, "learning_rate": 0.0004672406242037233, "loss": 41.1931, "step": 3198 }, { "epoch": 8.448993067018819, "grad_norm": 477.06964111328125, "learning_rate": 0.0004672199469032109, "loss": 41.7077, "step": 3199 }, { "epoch": 8.451634202707163, "grad_norm": 426.8905029296875, "learning_rate": 0.00046719926353695914, "loss": 40.3727, "step": 3200 }, { "epoch": 8.451634202707163, "eval_loss": 6.10969877243042, "eval_runtime": 2.1876, "eval_samples_per_second": 226.274, "eval_steps_per_second": 28.341, "step": 3200 }, { "epoch": 8.45427533839551, "grad_norm": 950.416015625, "learning_rate": 0.00046717857410554555, "loss": 53.0954, "step": 3201 }, { "epoch": 8.456916474083856, "grad_norm": 5505.978515625, "learning_rate": 0.00046715787860954785, "loss": 89.0462, "step": 3202 }, { "epoch": 8.459557609772203, "grad_norm": 3872.430419921875, "learning_rate": 0.0004671371770495439, "loss": 89.2626, "step": 3203 }, { "epoch": 8.462198745460547, "grad_norm": 3302.763427734375, "learning_rate": 0.0004671164694261119, "loss": 75.399, "step": 3204 }, { "epoch": 8.464839881148894, "grad_norm": 9391.55859375, "learning_rate": 0.00046709575573983004, "loss": 70.1828, "step": 3205 }, { "epoch": 8.46748101683724, "grad_norm": 4433.31884765625, "learning_rate": 0.00046707503599127665, "loss": 67.3738, "step": 3206 }, { "epoch": 8.470122152525587, "grad_norm": 4075.292236328125, "learning_rate": 0.0004670543101810305, "loss": 59.1691, "step": 3207 }, { "epoch": 8.472763288213931, "grad_norm": 3635.619873046875, "learning_rate": 0.00046703357830967017, "loss": 52.0295, "step": 3208 }, { "epoch": 8.475404423902278, "grad_norm": 4139.8984375, "learning_rate": 0.0004670128403777747, "loss": 39.6512, "step": 3209 }, { "epoch": 8.478045559590624, "grad_norm": 1584.4344482421875, "learning_rate": 0.0004669920963859231, "loss": 33.4538, "step": 3210 }, { "epoch": 8.48068669527897, "grad_norm": 877.1651000976562, "learning_rate": 0.00046697134633469475, "loss": 29.9987, "step": 3211 }, { "epoch": 8.483327830967315, "grad_norm": 1305.5296630859375, "learning_rate": 0.0004669505902246689, "loss": 66.8447, "step": 3212 }, { "epoch": 8.485968966655662, "grad_norm": 1232.0189208984375, "learning_rate": 0.0004669298280564254, "loss": 66.1495, "step": 3213 }, { "epoch": 8.488610102344008, "grad_norm": 1075.2691650390625, "learning_rate": 0.0004669090598305438, "loss": 58.1162, "step": 3214 }, { "epoch": 8.491251238032355, "grad_norm": 576.7533569335938, "learning_rate": 0.0004668882855476041, "loss": 46.366, "step": 3215 }, { "epoch": 8.4938923737207, "grad_norm": 374.26910400390625, "learning_rate": 0.0004668675052081864, "loss": 43.3394, "step": 3216 }, { "epoch": 8.496533509409046, "grad_norm": 288.1693420410156, "learning_rate": 0.00046684671881287113, "loss": 41.5662, "step": 3217 }, { "epoch": 8.499174645097392, "grad_norm": 950.845947265625, "learning_rate": 0.0004668259263622386, "loss": 44.4133, "step": 3218 }, { "epoch": 8.501815780785737, "grad_norm": 480.61529541015625, "learning_rate": 0.00046680512785686934, "loss": 41.6819, "step": 3219 }, { "epoch": 8.504456916474084, "grad_norm": 319.44219970703125, "learning_rate": 0.00046678432329734434, "loss": 40.1152, "step": 3220 }, { "epoch": 8.50709805216243, "grad_norm": 411.2447204589844, "learning_rate": 0.0004667635126842444, "loss": 41.872, "step": 3221 }, { "epoch": 8.509739187850776, "grad_norm": 1127.6177978515625, "learning_rate": 0.0004667426960181508, "loss": 43.1239, "step": 3222 }, { "epoch": 8.512380323539121, "grad_norm": 433.3896789550781, "learning_rate": 0.00046672187329964464, "loss": 43.2118, "step": 3223 }, { "epoch": 8.515021459227468, "grad_norm": 497.50018310546875, "learning_rate": 0.0004667010445293075, "loss": 40.4218, "step": 3224 }, { "epoch": 8.517662594915814, "grad_norm": 364.7608642578125, "learning_rate": 0.000466680209707721, "loss": 42.2486, "step": 3225 }, { "epoch": 8.52030373060416, "grad_norm": 546.67529296875, "learning_rate": 0.00046665936883546696, "loss": 41.3425, "step": 3226 }, { "epoch": 8.522944866292505, "grad_norm": 437.7571105957031, "learning_rate": 0.00046663852191312726, "loss": 42.4684, "step": 3227 }, { "epoch": 8.525586001980852, "grad_norm": 1070.4974365234375, "learning_rate": 0.0004666176689412841, "loss": 44.1613, "step": 3228 }, { "epoch": 8.528227137669198, "grad_norm": 2313.283935546875, "learning_rate": 0.00046659680992051976, "loss": 47.1269, "step": 3229 }, { "epoch": 8.530868273357544, "grad_norm": 245.602294921875, "learning_rate": 0.00046657594485141683, "loss": 44.0913, "step": 3230 }, { "epoch": 8.53350940904589, "grad_norm": 306.9241943359375, "learning_rate": 0.00046655507373455786, "loss": 44.8815, "step": 3231 }, { "epoch": 8.536150544734236, "grad_norm": 332.485595703125, "learning_rate": 0.0004665341965705256, "loss": 44.71, "step": 3232 }, { "epoch": 8.538791680422582, "grad_norm": 398.1158447265625, "learning_rate": 0.0004665133133599031, "loss": 43.8868, "step": 3233 }, { "epoch": 8.541432816110929, "grad_norm": 305.2078552246094, "learning_rate": 0.00046649242410327354, "loss": 45.2542, "step": 3234 }, { "epoch": 8.544073951799273, "grad_norm": 447.6011962890625, "learning_rate": 0.00046647152880122025, "loss": 48.4074, "step": 3235 }, { "epoch": 8.54671508748762, "grad_norm": 367.43731689453125, "learning_rate": 0.00046645062745432654, "loss": 48.5841, "step": 3236 }, { "epoch": 8.549356223175966, "grad_norm": 365.3160705566406, "learning_rate": 0.0004664297200631763, "loss": 45.0066, "step": 3237 }, { "epoch": 8.551997358864313, "grad_norm": 1441.1248779296875, "learning_rate": 0.0004664088066283533, "loss": 45.4003, "step": 3238 }, { "epoch": 8.554638494552657, "grad_norm": 294.6477355957031, "learning_rate": 0.00046638788715044145, "loss": 43.5222, "step": 3239 }, { "epoch": 8.557279630241004, "grad_norm": 257.2624206542969, "learning_rate": 0.000466366961630025, "loss": 43.6045, "step": 3240 }, { "epoch": 8.55992076592935, "grad_norm": 294.34967041015625, "learning_rate": 0.00046634603006768827, "loss": 43.9796, "step": 3241 }, { "epoch": 8.562561901617695, "grad_norm": 260.7157287597656, "learning_rate": 0.0004663250924640157, "loss": 41.4751, "step": 3242 }, { "epoch": 8.565203037306041, "grad_norm": 270.89398193359375, "learning_rate": 0.0004663041488195919, "loss": 41.7528, "step": 3243 }, { "epoch": 8.567844172994388, "grad_norm": 376.9460754394531, "learning_rate": 0.00046628319913500195, "loss": 39.9502, "step": 3244 }, { "epoch": 8.570485308682734, "grad_norm": 491.70684814453125, "learning_rate": 0.00046626224341083066, "loss": 40.6682, "step": 3245 }, { "epoch": 8.573126444371079, "grad_norm": 259.18634033203125, "learning_rate": 0.0004662412816476633, "loss": 41.2121, "step": 3246 }, { "epoch": 8.575767580059425, "grad_norm": 198.85418701171875, "learning_rate": 0.0004662203138460852, "loss": 40.7606, "step": 3247 }, { "epoch": 8.578408715747772, "grad_norm": 325.35162353515625, "learning_rate": 0.00046619934000668176, "loss": 39.0675, "step": 3248 }, { "epoch": 8.581049851436118, "grad_norm": 254.6140899658203, "learning_rate": 0.0004661783601300388, "loss": 40.6956, "step": 3249 }, { "epoch": 8.583690987124463, "grad_norm": 265.424072265625, "learning_rate": 0.0004661573742167421, "loss": 40.3736, "step": 3250 }, { "epoch": 8.58633212281281, "grad_norm": 789.9482421875, "learning_rate": 0.00046613638226737777, "loss": 52.0453, "step": 3251 }, { "epoch": 8.588973258501156, "grad_norm": 4200.03515625, "learning_rate": 0.00046611538428253187, "loss": 97.4556, "step": 3252 }, { "epoch": 8.591614394189502, "grad_norm": 4136.248046875, "learning_rate": 0.00046609438026279083, "loss": 128.617, "step": 3253 }, { "epoch": 8.594255529877847, "grad_norm": 2920.19873046875, "learning_rate": 0.00046607337020874106, "loss": 125.2005, "step": 3254 }, { "epoch": 8.596896665566193, "grad_norm": 5207.48388671875, "learning_rate": 0.0004660523541209695, "loss": 123.0203, "step": 3255 }, { "epoch": 8.59953780125454, "grad_norm": 2065.71533203125, "learning_rate": 0.0004660313320000628, "loss": 125.1582, "step": 3256 }, { "epoch": 8.602178936942886, "grad_norm": 5789.23291015625, "learning_rate": 0.00046601030384660813, "loss": 91.9893, "step": 3257 }, { "epoch": 8.604820072631231, "grad_norm": 3445.42041015625, "learning_rate": 0.00046598926966119256, "loss": 96.8273, "step": 3258 }, { "epoch": 8.607461208319577, "grad_norm": 6104.8271484375, "learning_rate": 0.00046596822944440356, "loss": 99.5507, "step": 3259 }, { "epoch": 8.610102344007924, "grad_norm": 2243.61083984375, "learning_rate": 0.0004659471831968285, "loss": 73.9605, "step": 3260 }, { "epoch": 8.61274347969627, "grad_norm": 2132.016845703125, "learning_rate": 0.00046592613091905535, "loss": 55.5396, "step": 3261 }, { "epoch": 8.615384615384615, "grad_norm": 374.8540954589844, "learning_rate": 0.0004659050726116717, "loss": 43.1689, "step": 3262 }, { "epoch": 8.618025751072961, "grad_norm": 418.41943359375, "learning_rate": 0.0004658840082752658, "loss": 41.3052, "step": 3263 }, { "epoch": 8.620666886761308, "grad_norm": 435.8674011230469, "learning_rate": 0.0004658629379104258, "loss": 42.2635, "step": 3264 }, { "epoch": 8.623308022449653, "grad_norm": 448.87908935546875, "learning_rate": 0.00046584186151774, "loss": 44.7061, "step": 3265 }, { "epoch": 8.625949158137999, "grad_norm": 254.32846069335938, "learning_rate": 0.00046582077909779706, "loss": 41.8819, "step": 3266 }, { "epoch": 8.628590293826345, "grad_norm": 619.023681640625, "learning_rate": 0.00046579969065118563, "loss": 42.5969, "step": 3267 }, { "epoch": 8.631231429514692, "grad_norm": 372.8478698730469, "learning_rate": 0.0004657785961784946, "loss": 39.1633, "step": 3268 }, { "epoch": 8.633872565203037, "grad_norm": 323.9705505371094, "learning_rate": 0.0004657574956803131, "loss": 40.3108, "step": 3269 }, { "epoch": 8.636513700891383, "grad_norm": 375.0330505371094, "learning_rate": 0.0004657363891572302, "loss": 38.3554, "step": 3270 }, { "epoch": 8.63915483657973, "grad_norm": 466.8223571777344, "learning_rate": 0.00046571527660983536, "loss": 40.2084, "step": 3271 }, { "epoch": 8.641795972268076, "grad_norm": 305.6424865722656, "learning_rate": 0.0004656941580387182, "loss": 40.2245, "step": 3272 }, { "epoch": 8.64443710795642, "grad_norm": 385.22930908203125, "learning_rate": 0.00046567303344446833, "loss": 40.6269, "step": 3273 }, { "epoch": 8.647078243644767, "grad_norm": 474.09771728515625, "learning_rate": 0.00046565190282767566, "loss": 39.5628, "step": 3274 }, { "epoch": 8.649719379333114, "grad_norm": 782.16552734375, "learning_rate": 0.0004656307661889304, "loss": 39.2855, "step": 3275 }, { "epoch": 8.65236051502146, "grad_norm": 605.80517578125, "learning_rate": 0.0004656096235288225, "loss": 40.3536, "step": 3276 }, { "epoch": 8.655001650709805, "grad_norm": 458.7707824707031, "learning_rate": 0.00046558847484794267, "loss": 40.3839, "step": 3277 }, { "epoch": 8.657642786398151, "grad_norm": 840.9451904296875, "learning_rate": 0.00046556732014688124, "loss": 42.3192, "step": 3278 }, { "epoch": 8.660283922086498, "grad_norm": 941.3553466796875, "learning_rate": 0.00046554615942622906, "loss": 44.4412, "step": 3279 }, { "epoch": 8.662925057774844, "grad_norm": 930.1746826171875, "learning_rate": 0.000465524992686577, "loss": 49.1826, "step": 3280 }, { "epoch": 8.665566193463189, "grad_norm": 513.5194091796875, "learning_rate": 0.00046550381992851607, "loss": 46.6952, "step": 3281 }, { "epoch": 8.668207329151535, "grad_norm": 306.29608154296875, "learning_rate": 0.00046548264115263763, "loss": 42.1056, "step": 3282 }, { "epoch": 8.670848464839882, "grad_norm": 290.016845703125, "learning_rate": 0.000465461456359533, "loss": 42.222, "step": 3283 }, { "epoch": 8.673489600528228, "grad_norm": 217.21119689941406, "learning_rate": 0.0004654402655497938, "loss": 46.1415, "step": 3284 }, { "epoch": 8.676130736216573, "grad_norm": 283.3821716308594, "learning_rate": 0.0004654190687240116, "loss": 46.4079, "step": 3285 }, { "epoch": 8.67877187190492, "grad_norm": 244.12489318847656, "learning_rate": 0.00046539786588277866, "loss": 48.5349, "step": 3286 }, { "epoch": 8.681413007593266, "grad_norm": 318.24481201171875, "learning_rate": 0.0004653766570266867, "loss": 45.5922, "step": 3287 }, { "epoch": 8.68405414328161, "grad_norm": 320.2382507324219, "learning_rate": 0.0004653554421563282, "loss": 44.011, "step": 3288 }, { "epoch": 8.686695278969957, "grad_norm": 315.1008605957031, "learning_rate": 0.00046533422127229544, "loss": 45.2277, "step": 3289 }, { "epoch": 8.689336414658303, "grad_norm": 319.72296142578125, "learning_rate": 0.000465312994375181, "loss": 44.3171, "step": 3290 }, { "epoch": 8.69197755034665, "grad_norm": 233.09315490722656, "learning_rate": 0.0004652917614655777, "loss": 42.8149, "step": 3291 }, { "epoch": 8.694618686034994, "grad_norm": 203.03359985351562, "learning_rate": 0.0004652705225440784, "loss": 43.8182, "step": 3292 }, { "epoch": 8.69725982172334, "grad_norm": 291.3266906738281, "learning_rate": 0.00046524927761127635, "loss": 40.2601, "step": 3293 }, { "epoch": 8.699900957411687, "grad_norm": 255.8974151611328, "learning_rate": 0.00046522802666776455, "loss": 41.4445, "step": 3294 }, { "epoch": 8.702542093100034, "grad_norm": 248.0733184814453, "learning_rate": 0.0004652067697141366, "loss": 41.1309, "step": 3295 }, { "epoch": 8.705183228788378, "grad_norm": 240.8189697265625, "learning_rate": 0.0004651855067509859, "loss": 39.058, "step": 3296 }, { "epoch": 8.707824364476725, "grad_norm": 194.79052734375, "learning_rate": 0.00046516423777890643, "loss": 40.4613, "step": 3297 }, { "epoch": 8.710465500165071, "grad_norm": 312.3437805175781, "learning_rate": 0.000465142962798492, "loss": 41.2668, "step": 3298 }, { "epoch": 8.713106635853418, "grad_norm": 288.2276306152344, "learning_rate": 0.00046512168181033674, "loss": 38.6169, "step": 3299 }, { "epoch": 8.715747771541762, "grad_norm": 189.72305297851562, "learning_rate": 0.00046510039481503486, "loss": 38.938, "step": 3300 }, { "epoch": 8.718388907230109, "grad_norm": 917.4389038085938, "learning_rate": 0.0004650791018131808, "loss": 59.6123, "step": 3301 }, { "epoch": 8.721030042918455, "grad_norm": 1551.988037109375, "learning_rate": 0.00046505780280536917, "loss": 105.7348, "step": 3302 }, { "epoch": 8.723671178606802, "grad_norm": 6262.85791015625, "learning_rate": 0.0004650364977921948, "loss": 99.3544, "step": 3303 }, { "epoch": 8.726312314295146, "grad_norm": 1613.3587646484375, "learning_rate": 0.00046501518677425247, "loss": 97.5589, "step": 3304 }, { "epoch": 8.728953449983493, "grad_norm": 3953.449951171875, "learning_rate": 0.00046499386975213736, "loss": 93.6275, "step": 3305 }, { "epoch": 8.73159458567184, "grad_norm": 3384.24658203125, "learning_rate": 0.00046497254672644474, "loss": 99.3103, "step": 3306 }, { "epoch": 8.734235721360186, "grad_norm": 1860.0419921875, "learning_rate": 0.00046495121769777004, "loss": 83.5823, "step": 3307 }, { "epoch": 8.73687685704853, "grad_norm": 2383.782958984375, "learning_rate": 0.00046492988266670887, "loss": 79.5202, "step": 3308 }, { "epoch": 8.739517992736877, "grad_norm": 2111.8642578125, "learning_rate": 0.000464908541633857, "loss": 66.0104, "step": 3309 }, { "epoch": 8.742159128425223, "grad_norm": 1658.5240478515625, "learning_rate": 0.00046488719459981026, "loss": 53.7327, "step": 3310 }, { "epoch": 8.744800264113568, "grad_norm": 942.9588012695312, "learning_rate": 0.00046486584156516485, "loss": 47.8246, "step": 3311 }, { "epoch": 8.747441399801914, "grad_norm": 455.7585144042969, "learning_rate": 0.0004648444825305171, "loss": 48.1651, "step": 3312 }, { "epoch": 8.750082535490261, "grad_norm": 726.7997436523438, "learning_rate": 0.0004648231174964634, "loss": 50.0855, "step": 3313 }, { "epoch": 8.752723671178607, "grad_norm": 818.970703125, "learning_rate": 0.0004648017464636003, "loss": 52.6014, "step": 3314 }, { "epoch": 8.755364806866952, "grad_norm": 675.06591796875, "learning_rate": 0.00046478036943252464, "loss": 49.2626, "step": 3315 }, { "epoch": 8.758005942555299, "grad_norm": 329.7138366699219, "learning_rate": 0.0004647589864038333, "loss": 42.5627, "step": 3316 }, { "epoch": 8.760647078243645, "grad_norm": 294.3465881347656, "learning_rate": 0.0004647375973781234, "loss": 40.381, "step": 3317 }, { "epoch": 8.763288213931991, "grad_norm": 256.6517333984375, "learning_rate": 0.00046471620235599226, "loss": 40.8224, "step": 3318 }, { "epoch": 8.765929349620336, "grad_norm": 232.9967041015625, "learning_rate": 0.0004646948013380373, "loss": 39.2412, "step": 3319 }, { "epoch": 8.768570485308683, "grad_norm": 285.4788818359375, "learning_rate": 0.0004646733943248561, "loss": 39.2891, "step": 3320 }, { "epoch": 8.771211620997029, "grad_norm": 316.3724670410156, "learning_rate": 0.00046465198131704656, "loss": 40.048, "step": 3321 }, { "epoch": 8.773852756685375, "grad_norm": 372.67303466796875, "learning_rate": 0.0004646305623152065, "loss": 41.1454, "step": 3322 }, { "epoch": 8.77649389237372, "grad_norm": 274.8150939941406, "learning_rate": 0.000464609137319934, "loss": 39.6915, "step": 3323 }, { "epoch": 8.779135028062067, "grad_norm": 262.927490234375, "learning_rate": 0.0004645877063318275, "loss": 40.8401, "step": 3324 }, { "epoch": 8.781776163750413, "grad_norm": 231.60948181152344, "learning_rate": 0.00046456626935148526, "loss": 39.4122, "step": 3325 }, { "epoch": 8.78441729943876, "grad_norm": 533.36279296875, "learning_rate": 0.00046454482637950606, "loss": 40.5086, "step": 3326 }, { "epoch": 8.787058435127104, "grad_norm": 794.1074829101562, "learning_rate": 0.00046452337741648863, "loss": 41.0634, "step": 3327 }, { "epoch": 8.78969957081545, "grad_norm": 764.6676025390625, "learning_rate": 0.0004645019224630319, "loss": 41.7537, "step": 3328 }, { "epoch": 8.792340706503797, "grad_norm": 851.8583374023438, "learning_rate": 0.0004644804615197349, "loss": 44.5455, "step": 3329 }, { "epoch": 8.794981842192144, "grad_norm": 199.40257263183594, "learning_rate": 0.0004644589945871971, "loss": 43.3919, "step": 3330 }, { "epoch": 8.797622977880488, "grad_norm": 269.8446044921875, "learning_rate": 0.00046443752166601784, "loss": 43.8483, "step": 3331 }, { "epoch": 8.800264113568835, "grad_norm": 186.01522827148438, "learning_rate": 0.00046441604275679674, "loss": 41.4517, "step": 3332 }, { "epoch": 8.802905249257181, "grad_norm": 259.15478515625, "learning_rate": 0.00046439455786013355, "loss": 43.8249, "step": 3333 }, { "epoch": 8.805546384945526, "grad_norm": 226.8269500732422, "learning_rate": 0.00046437306697662837, "loss": 46.3052, "step": 3334 }, { "epoch": 8.808187520633872, "grad_norm": 195.4812469482422, "learning_rate": 0.00046435157010688114, "loss": 43.114, "step": 3335 }, { "epoch": 8.810828656322219, "grad_norm": 166.53126525878906, "learning_rate": 0.0004643300672514923, "loss": 44.6766, "step": 3336 }, { "epoch": 8.813469792010565, "grad_norm": 296.80206298828125, "learning_rate": 0.00046430855841106216, "loss": 46.0471, "step": 3337 }, { "epoch": 8.81611092769891, "grad_norm": 253.2646484375, "learning_rate": 0.00046428704358619143, "loss": 43.3867, "step": 3338 }, { "epoch": 8.818752063387256, "grad_norm": 233.62757873535156, "learning_rate": 0.000464265522777481, "loss": 43.8009, "step": 3339 }, { "epoch": 8.821393199075603, "grad_norm": 323.4447326660156, "learning_rate": 0.00046424399598553155, "loss": 45.2532, "step": 3340 }, { "epoch": 8.82403433476395, "grad_norm": 261.8585205078125, "learning_rate": 0.0004642224632109445, "loss": 41.2732, "step": 3341 }, { "epoch": 8.826675470452294, "grad_norm": 302.77001953125, "learning_rate": 0.00046420092445432094, "loss": 41.3607, "step": 3342 }, { "epoch": 8.82931660614064, "grad_norm": 369.4048767089844, "learning_rate": 0.00046417937971626245, "loss": 40.1424, "step": 3343 }, { "epoch": 8.831957741828987, "grad_norm": 390.2236633300781, "learning_rate": 0.0004641578289973705, "loss": 40.384, "step": 3344 }, { "epoch": 8.834598877517333, "grad_norm": 254.12364196777344, "learning_rate": 0.0004641362722982471, "loss": 40.9147, "step": 3345 }, { "epoch": 8.837240013205678, "grad_norm": 280.21502685546875, "learning_rate": 0.000464114709619494, "loss": 39.5977, "step": 3346 }, { "epoch": 8.839881148894024, "grad_norm": 318.7642517089844, "learning_rate": 0.0004640931409617134, "loss": 39.3958, "step": 3347 }, { "epoch": 8.84252228458237, "grad_norm": 660.0353393554688, "learning_rate": 0.00046407156632550763, "loss": 39.9416, "step": 3348 }, { "epoch": 8.845163420270717, "grad_norm": 694.6184692382812, "learning_rate": 0.00046404998571147914, "loss": 39.4464, "step": 3349 }, { "epoch": 8.847804555959062, "grad_norm": 291.6764831542969, "learning_rate": 0.00046402839912023053, "loss": 40.0712, "step": 3350 }, { "epoch": 8.850445691647408, "grad_norm": 1325.1541748046875, "learning_rate": 0.0004640068065523646, "loss": 52.2657, "step": 3351 }, { "epoch": 8.853086827335755, "grad_norm": 8815.0361328125, "learning_rate": 0.00046398520800848434, "loss": 77.208, "step": 3352 }, { "epoch": 8.855727963024101, "grad_norm": 3739.779052734375, "learning_rate": 0.00046396360348919277, "loss": 87.3296, "step": 3353 }, { "epoch": 8.858369098712446, "grad_norm": 8045.2080078125, "learning_rate": 0.0004639419929950933, "loss": 80.4966, "step": 3354 }, { "epoch": 8.861010234400792, "grad_norm": 3343.99755859375, "learning_rate": 0.0004639203765267894, "loss": 82.8595, "step": 3355 }, { "epoch": 8.863651370089139, "grad_norm": 6289.765625, "learning_rate": 0.00046389875408488457, "loss": 67.0581, "step": 3356 }, { "epoch": 8.866292505777484, "grad_norm": 7325.65771484375, "learning_rate": 0.00046387712566998274, "loss": 63.6331, "step": 3357 }, { "epoch": 8.86893364146583, "grad_norm": 6135.46923828125, "learning_rate": 0.00046385549128268776, "loss": 57.6194, "step": 3358 }, { "epoch": 8.871574777154176, "grad_norm": 9007.8974609375, "learning_rate": 0.00046383385092360385, "loss": 61.2325, "step": 3359 }, { "epoch": 8.874215912842523, "grad_norm": 3613.287841796875, "learning_rate": 0.0004638122045933353, "loss": 54.3201, "step": 3360 }, { "epoch": 8.876857048530868, "grad_norm": 2806.767333984375, "learning_rate": 0.0004637905522924865, "loss": 44.2687, "step": 3361 }, { "epoch": 8.879498184219214, "grad_norm": 495.3524169921875, "learning_rate": 0.00046376889402166213, "loss": 45.6682, "step": 3362 }, { "epoch": 8.88213931990756, "grad_norm": 957.2639770507812, "learning_rate": 0.00046374722978146694, "loss": 45.8057, "step": 3363 }, { "epoch": 8.884780455595907, "grad_norm": 441.25628662109375, "learning_rate": 0.00046372555957250596, "loss": 42.3931, "step": 3364 }, { "epoch": 8.887421591284252, "grad_norm": 479.85491943359375, "learning_rate": 0.00046370388339538423, "loss": 43.6255, "step": 3365 }, { "epoch": 8.890062726972598, "grad_norm": 326.94622802734375, "learning_rate": 0.00046368220125070716, "loss": 40.5983, "step": 3366 }, { "epoch": 8.892703862660944, "grad_norm": 734.3434448242188, "learning_rate": 0.00046366051313908007, "loss": 42.9657, "step": 3367 }, { "epoch": 8.895344998349291, "grad_norm": 409.182861328125, "learning_rate": 0.00046363881906110865, "loss": 41.435, "step": 3368 }, { "epoch": 8.897986134037636, "grad_norm": 280.15557861328125, "learning_rate": 0.0004636171190173988, "loss": 41.4844, "step": 3369 }, { "epoch": 8.900627269725982, "grad_norm": 563.8793334960938, "learning_rate": 0.00046359541300855636, "loss": 41.2797, "step": 3370 }, { "epoch": 8.903268405414329, "grad_norm": 321.9936828613281, "learning_rate": 0.00046357370103518737, "loss": 42.5553, "step": 3371 }, { "epoch": 8.905909541102675, "grad_norm": 345.59417724609375, "learning_rate": 0.00046355198309789837, "loss": 41.1506, "step": 3372 }, { "epoch": 8.90855067679102, "grad_norm": 607.0554809570312, "learning_rate": 0.00046353025919729565, "loss": 40.3375, "step": 3373 }, { "epoch": 8.911191812479366, "grad_norm": 304.4551086425781, "learning_rate": 0.00046350852933398583, "loss": 40.0645, "step": 3374 }, { "epoch": 8.913832948167713, "grad_norm": 334.6591491699219, "learning_rate": 0.00046348679350857584, "loss": 39.2715, "step": 3375 }, { "epoch": 8.916474083856059, "grad_norm": 329.0321960449219, "learning_rate": 0.0004634650517216725, "loss": 40.2672, "step": 3376 }, { "epoch": 8.919115219544404, "grad_norm": 670.4813232421875, "learning_rate": 0.000463443303973883, "loss": 43.516, "step": 3377 }, { "epoch": 8.92175635523275, "grad_norm": 393.6397399902344, "learning_rate": 0.00046342155026581457, "loss": 41.9767, "step": 3378 }, { "epoch": 8.924397490921097, "grad_norm": 1720.709716796875, "learning_rate": 0.0004633997905980748, "loss": 45.8869, "step": 3379 }, { "epoch": 8.927038626609441, "grad_norm": 208.1929473876953, "learning_rate": 0.00046337802497127117, "loss": 42.4358, "step": 3380 }, { "epoch": 8.929679762297788, "grad_norm": 215.06439208984375, "learning_rate": 0.00046335625338601155, "loss": 42.9078, "step": 3381 }, { "epoch": 8.932320897986134, "grad_norm": 242.05575561523438, "learning_rate": 0.0004633344758429039, "loss": 46.7567, "step": 3382 }, { "epoch": 8.93496203367448, "grad_norm": 248.35140991210938, "learning_rate": 0.00046331269234255634, "loss": 50.2845, "step": 3383 }, { "epoch": 8.937603169362825, "grad_norm": 221.2177734375, "learning_rate": 0.0004632909028855771, "loss": 46.9574, "step": 3384 }, { "epoch": 8.940244305051172, "grad_norm": 252.25286865234375, "learning_rate": 0.00046326910747257477, "loss": 43.6835, "step": 3385 }, { "epoch": 8.942885440739518, "grad_norm": 370.69183349609375, "learning_rate": 0.0004632473061041579, "loss": 42.1313, "step": 3386 }, { "epoch": 8.945526576427865, "grad_norm": 385.4154968261719, "learning_rate": 0.0004632254987809352, "loss": 41.8759, "step": 3387 }, { "epoch": 8.94816771211621, "grad_norm": 239.1663818359375, "learning_rate": 0.0004632036855035157, "loss": 40.2207, "step": 3388 }, { "epoch": 8.950808847804556, "grad_norm": 195.4352569580078, "learning_rate": 0.0004631818662725086, "loss": 39.5632, "step": 3389 }, { "epoch": 8.953449983492902, "grad_norm": 298.3498840332031, "learning_rate": 0.00046316004108852305, "loss": 39.942, "step": 3390 }, { "epoch": 8.956091119181249, "grad_norm": 1333.9892578125, "learning_rate": 0.0004631382099521686, "loss": 40.004, "step": 3391 }, { "epoch": 8.958732254869593, "grad_norm": 829.8394775390625, "learning_rate": 0.00046311637286405486, "loss": 53.66, "step": 3392 }, { "epoch": 8.96137339055794, "grad_norm": 6411.8837890625, "learning_rate": 0.00046309452982479156, "loss": 71.7821, "step": 3393 }, { "epoch": 8.964014526246286, "grad_norm": 5018.96142578125, "learning_rate": 0.00046307268083498873, "loss": 70.4389, "step": 3394 }, { "epoch": 8.966655661934633, "grad_norm": 4955.20458984375, "learning_rate": 0.0004630508258952564, "loss": 69.1963, "step": 3395 }, { "epoch": 8.969296797622977, "grad_norm": 6412.27001953125, "learning_rate": 0.000463028965006205, "loss": 63.0449, "step": 3396 }, { "epoch": 8.971937933311324, "grad_norm": 8533.1328125, "learning_rate": 0.00046300709816844476, "loss": 63.4756, "step": 3397 }, { "epoch": 8.97457906899967, "grad_norm": 1146.678955078125, "learning_rate": 0.0004629852253825865, "loss": 51.2366, "step": 3398 }, { "epoch": 8.977220204688017, "grad_norm": 593.1256713867188, "learning_rate": 0.000462963346649241, "loss": 40.359, "step": 3399 }, { "epoch": 8.979861340376361, "grad_norm": 481.3659362792969, "learning_rate": 0.00046294146196901905, "loss": 41.1325, "step": 3400 }, { "epoch": 8.979861340376361, "eval_loss": 5.1159257888793945, "eval_runtime": 2.1511, "eval_samples_per_second": 230.115, "eval_steps_per_second": 28.822, "step": 3400 }, { "epoch": 8.982502476064708, "grad_norm": 764.2770385742188, "learning_rate": 0.00046291957134253194, "loss": 41.4818, "step": 3401 }, { "epoch": 8.985143611753054, "grad_norm": 983.9605102539062, "learning_rate": 0.0004628976747703908, "loss": 40.0928, "step": 3402 }, { "epoch": 8.987784747441399, "grad_norm": 494.1728210449219, "learning_rate": 0.0004628757722532072, "loss": 39.9898, "step": 3403 }, { "epoch": 8.990425883129745, "grad_norm": 369.8144836425781, "learning_rate": 0.00046285386379159266, "loss": 41.5028, "step": 3404 }, { "epoch": 8.993067018818092, "grad_norm": 643.3720092773438, "learning_rate": 0.00046283194938615907, "loss": 39.6291, "step": 3405 }, { "epoch": 8.995708154506438, "grad_norm": 456.2147521972656, "learning_rate": 0.00046281002903751834, "loss": 39.2412, "step": 3406 }, { "epoch": 8.998349290194783, "grad_norm": 1046.2884521484375, "learning_rate": 0.00046278810274628246, "loss": 41.4445, "step": 3407 }, { "epoch": 9.00099042588313, "grad_norm": 2782.33642578125, "learning_rate": 0.00046276617051306394, "loss": 48.2456, "step": 3408 }, { "epoch": 9.003631561571476, "grad_norm": 1010.6439208984375, "learning_rate": 0.00046274423233847497, "loss": 49.0174, "step": 3409 }, { "epoch": 9.006272697259822, "grad_norm": 891.5706176757812, "learning_rate": 0.00046272228822312835, "loss": 47.0195, "step": 3410 }, { "epoch": 9.008913832948167, "grad_norm": 478.4961242675781, "learning_rate": 0.00046270033816763677, "loss": 45.7976, "step": 3411 }, { "epoch": 9.011554968636514, "grad_norm": 352.5732116699219, "learning_rate": 0.00046267838217261317, "loss": 47.172, "step": 3412 }, { "epoch": 9.01419610432486, "grad_norm": 384.3170166015625, "learning_rate": 0.0004626564202386707, "loss": 46.7156, "step": 3413 }, { "epoch": 9.016837240013206, "grad_norm": 368.66387939453125, "learning_rate": 0.00046263445236642266, "loss": 48.4951, "step": 3414 }, { "epoch": 9.019478375701551, "grad_norm": 292.7618713378906, "learning_rate": 0.0004626124785564824, "loss": 46.5119, "step": 3415 }, { "epoch": 9.022119511389898, "grad_norm": 357.080810546875, "learning_rate": 0.0004625904988094636, "loss": 47.2507, "step": 3416 }, { "epoch": 9.024760647078244, "grad_norm": 285.5204772949219, "learning_rate": 0.00046256851312598, "loss": 45.1707, "step": 3417 }, { "epoch": 9.02740178276659, "grad_norm": 270.4403076171875, "learning_rate": 0.0004625465215066456, "loss": 43.1399, "step": 3418 }, { "epoch": 9.030042918454935, "grad_norm": 304.3245849609375, "learning_rate": 0.00046252452395207435, "loss": 43.4469, "step": 3419 }, { "epoch": 9.032684054143282, "grad_norm": 256.7631530761719, "learning_rate": 0.00046250252046288053, "loss": 42.3878, "step": 3420 }, { "epoch": 9.035325189831628, "grad_norm": 332.5658874511719, "learning_rate": 0.0004624805110396788, "loss": 40.9176, "step": 3421 }, { "epoch": 9.037966325519974, "grad_norm": 260.9934997558594, "learning_rate": 0.0004624584956830836, "loss": 40.8218, "step": 3422 }, { "epoch": 9.04060746120832, "grad_norm": 292.38739013671875, "learning_rate": 0.0004624364743937097, "loss": 39.8506, "step": 3423 }, { "epoch": 9.043248596896666, "grad_norm": 269.0086364746094, "learning_rate": 0.0004624144471721721, "loss": 39.308, "step": 3424 }, { "epoch": 9.045889732585012, "grad_norm": 338.2366027832031, "learning_rate": 0.0004623924140190857, "loss": 39.7377, "step": 3425 }, { "epoch": 9.048530868273357, "grad_norm": 340.3506774902344, "learning_rate": 0.00046237037493506594, "loss": 40.117, "step": 3426 }, { "epoch": 9.051172003961703, "grad_norm": 1145.522705078125, "learning_rate": 0.0004623483299207283, "loss": 42.6066, "step": 3427 }, { "epoch": 9.05381313965005, "grad_norm": 277.80084228515625, "learning_rate": 0.0004623262789766882, "loss": 39.3685, "step": 3428 }, { "epoch": 9.056454275338396, "grad_norm": 306.73565673828125, "learning_rate": 0.0004623042221035616, "loss": 38.559, "step": 3429 }, { "epoch": 9.05909541102674, "grad_norm": 705.3339233398438, "learning_rate": 0.00046228215930196426, "loss": 46.8615, "step": 3430 }, { "epoch": 9.061736546715087, "grad_norm": 3722.167724609375, "learning_rate": 0.0004622600905725123, "loss": 106.0095, "step": 3431 }, { "epoch": 9.064377682403434, "grad_norm": 4389.06005859375, "learning_rate": 0.00046223801591582195, "loss": 108.1234, "step": 3432 }, { "epoch": 9.06701881809178, "grad_norm": 6482.9248046875, "learning_rate": 0.0004622159353325097, "loss": 111.3799, "step": 3433 }, { "epoch": 9.069659953780125, "grad_norm": 3275.80859375, "learning_rate": 0.0004621938488231922, "loss": 81.1719, "step": 3434 }, { "epoch": 9.072301089468471, "grad_norm": 5866.39306640625, "learning_rate": 0.00046217175638848607, "loss": 91.7826, "step": 3435 }, { "epoch": 9.074942225156818, "grad_norm": 1740.029296875, "learning_rate": 0.0004621496580290083, "loss": 79.6393, "step": 3436 }, { "epoch": 9.077583360845164, "grad_norm": 6078.00537109375, "learning_rate": 0.00046212755374537594, "loss": 85.0459, "step": 3437 }, { "epoch": 9.080224496533509, "grad_norm": 2180.321044921875, "learning_rate": 0.00046210544353820625, "loss": 61.7988, "step": 3438 }, { "epoch": 9.082865632221855, "grad_norm": 1358.771240234375, "learning_rate": 0.00046208332740811664, "loss": 48.51, "step": 3439 }, { "epoch": 9.085506767910202, "grad_norm": 360.4154357910156, "learning_rate": 0.0004620612053557247, "loss": 40.6124, "step": 3440 }, { "epoch": 9.088147903598548, "grad_norm": 291.4140930175781, "learning_rate": 0.0004620390773816482, "loss": 38.8792, "step": 3441 }, { "epoch": 9.090789039286893, "grad_norm": 252.85279846191406, "learning_rate": 0.00046201694348650495, "loss": 41.915, "step": 3442 }, { "epoch": 9.09343017497524, "grad_norm": 230.0819549560547, "learning_rate": 0.00046199480367091315, "loss": 40.1903, "step": 3443 }, { "epoch": 9.096071310663586, "grad_norm": 307.69561767578125, "learning_rate": 0.00046197265793549103, "loss": 41.1566, "step": 3444 }, { "epoch": 9.098712446351932, "grad_norm": 434.45611572265625, "learning_rate": 0.0004619505062808569, "loss": 41.3697, "step": 3445 }, { "epoch": 9.101353582040277, "grad_norm": 274.3328552246094, "learning_rate": 0.00046192834870762935, "loss": 43.4914, "step": 3446 }, { "epoch": 9.103994717728623, "grad_norm": 299.0403747558594, "learning_rate": 0.00046190618521642726, "loss": 40.2222, "step": 3447 }, { "epoch": 9.10663585341697, "grad_norm": 880.7030029296875, "learning_rate": 0.0004618840158078693, "loss": 41.2584, "step": 3448 }, { "epoch": 9.109276989105314, "grad_norm": 400.433349609375, "learning_rate": 0.0004618618404825748, "loss": 38.6623, "step": 3449 }, { "epoch": 9.111918124793661, "grad_norm": 279.84130859375, "learning_rate": 0.00046183965924116277, "loss": 39.1387, "step": 3450 }, { "epoch": 9.114559260482007, "grad_norm": 256.02191162109375, "learning_rate": 0.0004618174720842527, "loss": 40.1197, "step": 3451 }, { "epoch": 9.117200396170354, "grad_norm": 400.5591125488281, "learning_rate": 0.0004617952790124642, "loss": 41.037, "step": 3452 }, { "epoch": 9.119841531858699, "grad_norm": 298.23565673828125, "learning_rate": 0.00046177308002641693, "loss": 38.8116, "step": 3453 }, { "epoch": 9.122482667547045, "grad_norm": 951.7030029296875, "learning_rate": 0.0004617508751267308, "loss": 39.7591, "step": 3454 }, { "epoch": 9.125123803235391, "grad_norm": 608.4554443359375, "learning_rate": 0.0004617286643140259, "loss": 40.477, "step": 3455 }, { "epoch": 9.127764938923738, "grad_norm": 620.6325073242188, "learning_rate": 0.0004617064475889224, "loss": 41.5498, "step": 3456 }, { "epoch": 9.130406074612083, "grad_norm": 747.69384765625, "learning_rate": 0.00046168422495204063, "loss": 45.0741, "step": 3457 }, { "epoch": 9.133047210300429, "grad_norm": 1013.218505859375, "learning_rate": 0.00046166199640400134, "loss": 44.7789, "step": 3458 }, { "epoch": 9.135688345988775, "grad_norm": 151.78848266601562, "learning_rate": 0.0004616397619454251, "loss": 43.0318, "step": 3459 }, { "epoch": 9.138329481677122, "grad_norm": 136.87847900390625, "learning_rate": 0.00046161752157693284, "loss": 43.5373, "step": 3460 }, { "epoch": 9.140970617365467, "grad_norm": 245.07542419433594, "learning_rate": 0.0004615952752991456, "loss": 43.2831, "step": 3461 }, { "epoch": 9.143611753053813, "grad_norm": 145.6239013671875, "learning_rate": 0.00046157302311268457, "loss": 43.7877, "step": 3462 }, { "epoch": 9.14625288874216, "grad_norm": 141.03114318847656, "learning_rate": 0.00046155076501817123, "loss": 45.1876, "step": 3463 }, { "epoch": 9.148894024430506, "grad_norm": 179.55906677246094, "learning_rate": 0.000461528501016227, "loss": 45.9036, "step": 3464 }, { "epoch": 9.15153516011885, "grad_norm": 182.741455078125, "learning_rate": 0.0004615062311074736, "loss": 47.3276, "step": 3465 }, { "epoch": 9.154176295807197, "grad_norm": 170.24371337890625, "learning_rate": 0.00046148395529253296, "loss": 45.5384, "step": 3466 }, { "epoch": 9.156817431495544, "grad_norm": 303.4893493652344, "learning_rate": 0.0004614616735720272, "loss": 44.1148, "step": 3467 }, { "epoch": 9.15945856718389, "grad_norm": 168.16531372070312, "learning_rate": 0.0004614393859465783, "loss": 44.6825, "step": 3468 }, { "epoch": 9.162099702872235, "grad_norm": 255.5096435546875, "learning_rate": 0.00046141709241680873, "loss": 44.6851, "step": 3469 }, { "epoch": 9.164740838560581, "grad_norm": 223.6467742919922, "learning_rate": 0.00046139479298334116, "loss": 43.4218, "step": 3470 }, { "epoch": 9.167381974248928, "grad_norm": 141.59320068359375, "learning_rate": 0.00046137248764679815, "loss": 41.3328, "step": 3471 }, { "epoch": 9.170023109937272, "grad_norm": 205.37051391601562, "learning_rate": 0.0004613501764078025, "loss": 41.1278, "step": 3472 }, { "epoch": 9.172664245625619, "grad_norm": 123.4813232421875, "learning_rate": 0.0004613278592669774, "loss": 39.1922, "step": 3473 }, { "epoch": 9.175305381313965, "grad_norm": 201.70443725585938, "learning_rate": 0.00046130553622494597, "loss": 38.6765, "step": 3474 }, { "epoch": 9.177946517002312, "grad_norm": 244.10104370117188, "learning_rate": 0.0004612832072823316, "loss": 39.7539, "step": 3475 }, { "epoch": 9.180587652690656, "grad_norm": 175.6833038330078, "learning_rate": 0.0004612608724397577, "loss": 39.2789, "step": 3476 }, { "epoch": 9.183228788379003, "grad_norm": 154.7112274169922, "learning_rate": 0.0004612385316978481, "loss": 39.4529, "step": 3477 }, { "epoch": 9.18586992406735, "grad_norm": 168.1526336669922, "learning_rate": 0.0004612161850572266, "loss": 39.9133, "step": 3478 }, { "epoch": 9.188511059755696, "grad_norm": 156.37310791015625, "learning_rate": 0.00046119383251851716, "loss": 39.7658, "step": 3479 }, { "epoch": 9.19115219544404, "grad_norm": 1583.83447265625, "learning_rate": 0.00046117147408234404, "loss": 71.5518, "step": 3480 }, { "epoch": 9.193793331132387, "grad_norm": 2574.942138671875, "learning_rate": 0.00046114910974933153, "loss": 59.9531, "step": 3481 }, { "epoch": 9.196434466820733, "grad_norm": 7877.5107421875, "learning_rate": 0.0004611267395201042, "loss": 54.2815, "step": 3482 }, { "epoch": 9.19907560250908, "grad_norm": 2191.111572265625, "learning_rate": 0.0004611043633952867, "loss": 42.0157, "step": 3483 }, { "epoch": 9.201716738197424, "grad_norm": 1616.337890625, "learning_rate": 0.00046108198137550377, "loss": 38.9656, "step": 3484 }, { "epoch": 9.20435787388577, "grad_norm": 5273.224609375, "learning_rate": 0.0004610595934613806, "loss": 34.2135, "step": 3485 }, { "epoch": 9.206999009574117, "grad_norm": 705.0436401367188, "learning_rate": 0.0004610371996535422, "loss": 29.9518, "step": 3486 }, { "epoch": 9.209640145262464, "grad_norm": 3240.11669921875, "learning_rate": 0.000461014799952614, "loss": 28.9729, "step": 3487 }, { "epoch": 9.212281280950808, "grad_norm": 754.5531616210938, "learning_rate": 0.00046099239435922146, "loss": 30.6936, "step": 3488 }, { "epoch": 9.214922416639155, "grad_norm": 530.7709350585938, "learning_rate": 0.0004609699828739903, "loss": 23.4153, "step": 3489 }, { "epoch": 9.217563552327501, "grad_norm": 817.3278198242188, "learning_rate": 0.0004609475654975462, "loss": 54.2893, "step": 3490 }, { "epoch": 9.220204688015848, "grad_norm": 807.8184204101562, "learning_rate": 0.0004609251422305154, "loss": 55.2655, "step": 3491 }, { "epoch": 9.222845823704192, "grad_norm": 431.25836181640625, "learning_rate": 0.00046090271307352385, "loss": 46.0163, "step": 3492 }, { "epoch": 9.225486959392539, "grad_norm": 198.57261657714844, "learning_rate": 0.00046088027802719783, "loss": 44.2043, "step": 3493 }, { "epoch": 9.228128095080885, "grad_norm": 194.97793579101562, "learning_rate": 0.000460857837092164, "loss": 41.7305, "step": 3494 }, { "epoch": 9.23076923076923, "grad_norm": 182.06988525390625, "learning_rate": 0.00046083539026904895, "loss": 40.3623, "step": 3495 }, { "epoch": 9.233410366457576, "grad_norm": 229.9884490966797, "learning_rate": 0.0004608129375584794, "loss": 42.1451, "step": 3496 }, { "epoch": 9.236051502145923, "grad_norm": 176.85394287109375, "learning_rate": 0.00046079047896108253, "loss": 38.7111, "step": 3497 }, { "epoch": 9.23869263783427, "grad_norm": 191.90536499023438, "learning_rate": 0.0004607680144774853, "loss": 39.8659, "step": 3498 }, { "epoch": 9.241333773522614, "grad_norm": 212.00064086914062, "learning_rate": 0.000460745544108315, "loss": 40.9101, "step": 3499 }, { "epoch": 9.24397490921096, "grad_norm": 227.01287841796875, "learning_rate": 0.00046072306785419927, "loss": 40.4838, "step": 3500 }, { "epoch": 9.246616044899307, "grad_norm": 282.92071533203125, "learning_rate": 0.0004607005857157657, "loss": 40.7436, "step": 3501 }, { "epoch": 9.249257180587653, "grad_norm": 176.62417602539062, "learning_rate": 0.00046067809769364187, "loss": 40.1049, "step": 3502 }, { "epoch": 9.251898316275998, "grad_norm": 185.15817260742188, "learning_rate": 0.00046065560378845605, "loss": 38.5065, "step": 3503 }, { "epoch": 9.254539451964344, "grad_norm": 178.0997772216797, "learning_rate": 0.00046063310400083627, "loss": 41.046, "step": 3504 }, { "epoch": 9.257180587652691, "grad_norm": 191.79347229003906, "learning_rate": 0.0004606105983314107, "loss": 41.3778, "step": 3505 }, { "epoch": 9.259821723341037, "grad_norm": 336.5450744628906, "learning_rate": 0.00046058808678080797, "loss": 42.1967, "step": 3506 }, { "epoch": 9.262462859029382, "grad_norm": 317.07379150390625, "learning_rate": 0.0004605655693496565, "loss": 41.8758, "step": 3507 }, { "epoch": 9.265103994717728, "grad_norm": 559.9943237304688, "learning_rate": 0.0004605430460385853, "loss": 46.6196, "step": 3508 }, { "epoch": 9.267745130406075, "grad_norm": 137.94815063476562, "learning_rate": 0.00046052051684822325, "loss": 43.7852, "step": 3509 }, { "epoch": 9.270386266094421, "grad_norm": 138.4364776611328, "learning_rate": 0.0004604979817791994, "loss": 41.5461, "step": 3510 }, { "epoch": 9.273027401782766, "grad_norm": 241.1132354736328, "learning_rate": 0.0004604754408321431, "loss": 42.3502, "step": 3511 }, { "epoch": 9.275668537471113, "grad_norm": 190.24813842773438, "learning_rate": 0.00046045289400768367, "loss": 43.2808, "step": 3512 }, { "epoch": 9.278309673159459, "grad_norm": 125.28234100341797, "learning_rate": 0.00046043034130645083, "loss": 42.9857, "step": 3513 }, { "epoch": 9.280950808847805, "grad_norm": 178.94422912597656, "learning_rate": 0.00046040778272907434, "loss": 46.318, "step": 3514 }, { "epoch": 9.28359194453615, "grad_norm": 135.6380615234375, "learning_rate": 0.00046038521827618413, "loss": 44.7751, "step": 3515 }, { "epoch": 9.286233080224497, "grad_norm": 130.65150451660156, "learning_rate": 0.0004603626479484102, "loss": 45.3921, "step": 3516 }, { "epoch": 9.288874215912843, "grad_norm": 137.4483642578125, "learning_rate": 0.00046034007174638303, "loss": 43.2364, "step": 3517 }, { "epoch": 9.291515351601188, "grad_norm": 166.8400421142578, "learning_rate": 0.00046031748967073285, "loss": 43.4595, "step": 3518 }, { "epoch": 9.294156487289534, "grad_norm": 152.28504943847656, "learning_rate": 0.0004602949017220903, "loss": 40.7915, "step": 3519 }, { "epoch": 9.29679762297788, "grad_norm": 280.9032897949219, "learning_rate": 0.0004602723079010862, "loss": 42.7689, "step": 3520 }, { "epoch": 9.299438758666227, "grad_norm": 163.10890197753906, "learning_rate": 0.0004602497082083513, "loss": 39.0161, "step": 3521 }, { "epoch": 9.302079894354572, "grad_norm": 171.54248046875, "learning_rate": 0.000460227102644517, "loss": 39.0591, "step": 3522 }, { "epoch": 9.304721030042918, "grad_norm": 259.7905578613281, "learning_rate": 0.00046020449121021424, "loss": 38.8931, "step": 3523 }, { "epoch": 9.307362165731265, "grad_norm": 236.22019958496094, "learning_rate": 0.00046018187390607456, "loss": 39.149, "step": 3524 }, { "epoch": 9.310003301419611, "grad_norm": 183.24423217773438, "learning_rate": 0.00046015925073272947, "loss": 38.5306, "step": 3525 }, { "epoch": 9.312644437107956, "grad_norm": 286.13726806640625, "learning_rate": 0.00046013662169081085, "loss": 39.767, "step": 3526 }, { "epoch": 9.315285572796302, "grad_norm": 214.6374053955078, "learning_rate": 0.0004601139867809504, "loss": 39.7501, "step": 3527 }, { "epoch": 9.317926708484649, "grad_norm": 358.03729248046875, "learning_rate": 0.00046009134600378046, "loss": 39.6035, "step": 3528 }, { "epoch": 9.320567844172995, "grad_norm": 321.9617614746094, "learning_rate": 0.0004600686993599329, "loss": 40.0017, "step": 3529 }, { "epoch": 9.32320897986134, "grad_norm": 1311.8563232421875, "learning_rate": 0.0004600460468500405, "loss": 58.3976, "step": 3530 }, { "epoch": 9.325850115549686, "grad_norm": 2372.744873046875, "learning_rate": 0.00046002338847473545, "loss": 101.2391, "step": 3531 }, { "epoch": 9.328491251238033, "grad_norm": 3244.292724609375, "learning_rate": 0.0004600007242346508, "loss": 116.4106, "step": 3532 }, { "epoch": 9.33113238692638, "grad_norm": 7817.8720703125, "learning_rate": 0.0004599780541304192, "loss": 126.3867, "step": 3533 }, { "epoch": 9.333773522614724, "grad_norm": 4414.52001953125, "learning_rate": 0.0004599553781626738, "loss": 90.66, "step": 3534 }, { "epoch": 9.33641465830307, "grad_norm": 2964.6025390625, "learning_rate": 0.0004599326963320478, "loss": 90.0844, "step": 3535 }, { "epoch": 9.339055793991417, "grad_norm": 1900.75244140625, "learning_rate": 0.00045991000863917467, "loss": 73.3382, "step": 3536 }, { "epoch": 9.341696929679763, "grad_norm": 4898.4384765625, "learning_rate": 0.00045988731508468775, "loss": 68.6553, "step": 3537 }, { "epoch": 9.344338065368108, "grad_norm": 2356.228759765625, "learning_rate": 0.00045986461566922083, "loss": 58.1092, "step": 3538 }, { "epoch": 9.346979201056454, "grad_norm": 1308.6043701171875, "learning_rate": 0.00045984191039340785, "loss": 45.3502, "step": 3539 }, { "epoch": 9.3496203367448, "grad_norm": 1982.3681640625, "learning_rate": 0.0004598191992578828, "loss": 42.444, "step": 3540 }, { "epoch": 9.352261472433145, "grad_norm": 697.540771484375, "learning_rate": 0.0004597964822632799, "loss": 45.2951, "step": 3541 }, { "epoch": 9.354902608121492, "grad_norm": 609.3800659179688, "learning_rate": 0.00045977375941023334, "loss": 47.7566, "step": 3542 }, { "epoch": 9.357543743809838, "grad_norm": 568.0659790039062, "learning_rate": 0.0004597510306993778, "loss": 45.1011, "step": 3543 }, { "epoch": 9.360184879498185, "grad_norm": 454.70599365234375, "learning_rate": 0.0004597282961313481, "loss": 43.0368, "step": 3544 }, { "epoch": 9.36282601518653, "grad_norm": 421.51116943359375, "learning_rate": 0.00045970555570677886, "loss": 40.4958, "step": 3545 }, { "epoch": 9.365467150874876, "grad_norm": 371.52264404296875, "learning_rate": 0.00045968280942630516, "loss": 39.6369, "step": 3546 }, { "epoch": 9.368108286563222, "grad_norm": 334.9019775390625, "learning_rate": 0.0004596600572905621, "loss": 38.4761, "step": 3547 }, { "epoch": 9.370749422251569, "grad_norm": 676.1591796875, "learning_rate": 0.00045963729930018516, "loss": 39.5429, "step": 3548 }, { "epoch": 9.373390557939913, "grad_norm": 504.129638671875, "learning_rate": 0.0004596145354558098, "loss": 39.133, "step": 3549 }, { "epoch": 9.37603169362826, "grad_norm": 233.53091430664062, "learning_rate": 0.0004595917657580716, "loss": 39.292, "step": 3550 }, { "epoch": 9.378672829316606, "grad_norm": 446.4356689453125, "learning_rate": 0.00045956899020760655, "loss": 39.9568, "step": 3551 }, { "epoch": 9.381313965004953, "grad_norm": 327.6898498535156, "learning_rate": 0.00045954620880505053, "loss": 40.1838, "step": 3552 }, { "epoch": 9.383955100693298, "grad_norm": 214.47947692871094, "learning_rate": 0.0004595234215510397, "loss": 39.358, "step": 3553 }, { "epoch": 9.386596236381644, "grad_norm": 351.9453125, "learning_rate": 0.00045950062844621044, "loss": 38.7904, "step": 3554 }, { "epoch": 9.38923737206999, "grad_norm": 224.73226928710938, "learning_rate": 0.0004594778294911992, "loss": 39.8589, "step": 3555 }, { "epoch": 9.391878507758337, "grad_norm": 595.2722778320312, "learning_rate": 0.0004594550246866426, "loss": 40.7396, "step": 3556 }, { "epoch": 9.394519643446682, "grad_norm": 436.7476806640625, "learning_rate": 0.0004594322140331775, "loss": 42.1794, "step": 3557 }, { "epoch": 9.397160779135028, "grad_norm": 859.10693359375, "learning_rate": 0.0004594093975314408, "loss": 46.8949, "step": 3558 }, { "epoch": 9.399801914823374, "grad_norm": 472.9529724121094, "learning_rate": 0.0004593865751820697, "loss": 45.7082, "step": 3559 }, { "epoch": 9.402443050511721, "grad_norm": 214.2949676513672, "learning_rate": 0.0004593637469857015, "loss": 43.4421, "step": 3560 }, { "epoch": 9.405084186200066, "grad_norm": 403.6089782714844, "learning_rate": 0.00045934091294297364, "loss": 43.9477, "step": 3561 }, { "epoch": 9.407725321888412, "grad_norm": 328.7206115722656, "learning_rate": 0.00045931807305452376, "loss": 44.9913, "step": 3562 }, { "epoch": 9.410366457576758, "grad_norm": 286.5734558105469, "learning_rate": 0.0004592952273209897, "loss": 46.7696, "step": 3563 }, { "epoch": 9.413007593265103, "grad_norm": 399.1374206542969, "learning_rate": 0.00045927237574300934, "loss": 45.0297, "step": 3564 }, { "epoch": 9.41564872895345, "grad_norm": 290.31427001953125, "learning_rate": 0.00045924951832122086, "loss": 45.2843, "step": 3565 }, { "epoch": 9.418289864641796, "grad_norm": 466.346435546875, "learning_rate": 0.00045922665505626246, "loss": 44.8095, "step": 3566 }, { "epoch": 9.420931000330143, "grad_norm": 312.4383544921875, "learning_rate": 0.0004592037859487725, "loss": 41.8135, "step": 3567 }, { "epoch": 9.423572136018487, "grad_norm": 153.74464416503906, "learning_rate": 0.0004591809109993899, "loss": 43.4767, "step": 3568 }, { "epoch": 9.426213271706834, "grad_norm": 428.14599609375, "learning_rate": 0.0004591580302087531, "loss": 41.6218, "step": 3569 }, { "epoch": 9.42885440739518, "grad_norm": 224.33433532714844, "learning_rate": 0.0004591351435775013, "loss": 42.3729, "step": 3570 }, { "epoch": 9.431495543083527, "grad_norm": 146.48849487304688, "learning_rate": 0.00045911225110627343, "loss": 40.8937, "step": 3571 }, { "epoch": 9.434136678771871, "grad_norm": 150.78733825683594, "learning_rate": 0.00045908935279570875, "loss": 39.0518, "step": 3572 }, { "epoch": 9.436777814460218, "grad_norm": 374.25445556640625, "learning_rate": 0.00045906644864644676, "loss": 39.2611, "step": 3573 }, { "epoch": 9.439418950148564, "grad_norm": 210.6667938232422, "learning_rate": 0.000459043538659127, "loss": 39.976, "step": 3574 }, { "epoch": 9.44206008583691, "grad_norm": 202.10960388183594, "learning_rate": 0.0004590206228343892, "loss": 39.3608, "step": 3575 }, { "epoch": 9.444701221525255, "grad_norm": 327.0333251953125, "learning_rate": 0.0004589977011728733, "loss": 40.2401, "step": 3576 }, { "epoch": 9.447342357213602, "grad_norm": 155.31773376464844, "learning_rate": 0.00045897477367521934, "loss": 38.8469, "step": 3577 }, { "epoch": 9.449983492901948, "grad_norm": 173.9582061767578, "learning_rate": 0.0004589518403420676, "loss": 39.9205, "step": 3578 }, { "epoch": 9.452624628590295, "grad_norm": 4822.23388671875, "learning_rate": 0.00045892890117405844, "loss": 57.3203, "step": 3579 }, { "epoch": 9.45526576427864, "grad_norm": 1498.21240234375, "learning_rate": 0.00045890595617183253, "loss": 95.9568, "step": 3580 }, { "epoch": 9.457906899966986, "grad_norm": 2550.24951171875, "learning_rate": 0.0004588830053360304, "loss": 104.5197, "step": 3581 }, { "epoch": 9.460548035655332, "grad_norm": 1319.193359375, "learning_rate": 0.0004588600486672931, "loss": 101.3796, "step": 3582 }, { "epoch": 9.463189171343679, "grad_norm": 6261.72021484375, "learning_rate": 0.00045883708616626165, "loss": 84.0033, "step": 3583 }, { "epoch": 9.465830307032023, "grad_norm": 1329.8037109375, "learning_rate": 0.0004588141178335772, "loss": 76.7351, "step": 3584 }, { "epoch": 9.46847144272037, "grad_norm": 1870.5113525390625, "learning_rate": 0.00045879114366988123, "loss": 66.6262, "step": 3585 }, { "epoch": 9.471112578408716, "grad_norm": 2248.226318359375, "learning_rate": 0.0004587681636758152, "loss": 53.5625, "step": 3586 }, { "epoch": 9.473753714097061, "grad_norm": 933.8338012695312, "learning_rate": 0.00045874517785202086, "loss": 47.5514, "step": 3587 }, { "epoch": 9.476394849785407, "grad_norm": 1539.0333251953125, "learning_rate": 0.00045872218619914, "loss": 32.4487, "step": 3588 }, { "epoch": 9.479035985473754, "grad_norm": 1241.34326171875, "learning_rate": 0.0004586991887178147, "loss": 30.8264, "step": 3589 }, { "epoch": 9.4816771211621, "grad_norm": 1044.4005126953125, "learning_rate": 0.0004586761854086872, "loss": 49.8182, "step": 3590 }, { "epoch": 9.484318256850445, "grad_norm": 892.129150390625, "learning_rate": 0.0004586531762723998, "loss": 55.2971, "step": 3591 }, { "epoch": 9.486959392538791, "grad_norm": 910.0462036132812, "learning_rate": 0.000458630161309595, "loss": 55.2813, "step": 3592 }, { "epoch": 9.489600528227138, "grad_norm": 930.52392578125, "learning_rate": 0.0004586071405209155, "loss": 50.9028, "step": 3593 }, { "epoch": 9.492241663915484, "grad_norm": 558.4236450195312, "learning_rate": 0.00045858411390700416, "loss": 42.8645, "step": 3594 }, { "epoch": 9.494882799603829, "grad_norm": 246.8265838623047, "learning_rate": 0.00045856108146850394, "loss": 42.5548, "step": 3595 }, { "epoch": 9.497523935292175, "grad_norm": 640.0421752929688, "learning_rate": 0.00045853804320605807, "loss": 41.1413, "step": 3596 }, { "epoch": 9.500165070980522, "grad_norm": 361.9815979003906, "learning_rate": 0.00045851499912030985, "loss": 40.6301, "step": 3597 }, { "epoch": 9.502806206668868, "grad_norm": 328.0917053222656, "learning_rate": 0.00045849194921190276, "loss": 39.4543, "step": 3598 }, { "epoch": 9.505447342357213, "grad_norm": 530.1658325195312, "learning_rate": 0.00045846889348148044, "loss": 39.342, "step": 3599 }, { "epoch": 9.50808847804556, "grad_norm": 221.73390197753906, "learning_rate": 0.00045844583192968674, "loss": 40.9136, "step": 3600 }, { "epoch": 9.50808847804556, "eval_loss": 4.580142021179199, "eval_runtime": 2.127, "eval_samples_per_second": 232.72, "eval_steps_per_second": 29.149, "step": 3600 }, { "epoch": 9.510729613733906, "grad_norm": 403.747802734375, "learning_rate": 0.0004584227645571657, "loss": 38.0422, "step": 3601 }, { "epoch": 9.513370749422252, "grad_norm": 1179.836181640625, "learning_rate": 0.00045839969136456126, "loss": 38.6644, "step": 3602 }, { "epoch": 9.516011885110597, "grad_norm": 502.77392578125, "learning_rate": 0.00045837661235251795, "loss": 37.8343, "step": 3603 }, { "epoch": 9.518653020798943, "grad_norm": 522.9061889648438, "learning_rate": 0.00045835352752168014, "loss": 40.2833, "step": 3604 }, { "epoch": 9.52129415648729, "grad_norm": 316.0401916503906, "learning_rate": 0.00045833043687269244, "loss": 39.1529, "step": 3605 }, { "epoch": 9.523935292175636, "grad_norm": 397.1333312988281, "learning_rate": 0.0004583073404061997, "loss": 42.5846, "step": 3606 }, { "epoch": 9.526576427863981, "grad_norm": 1955.4525146484375, "learning_rate": 0.0004582842381228468, "loss": 43.1589, "step": 3607 }, { "epoch": 9.529217563552328, "grad_norm": 561.2981567382812, "learning_rate": 0.00045826113002327894, "loss": 45.4388, "step": 3608 }, { "epoch": 9.531858699240674, "grad_norm": 430.89794921875, "learning_rate": 0.0004582380161081414, "loss": 42.7053, "step": 3609 }, { "epoch": 9.534499834929019, "grad_norm": 475.274169921875, "learning_rate": 0.00045821489637807943, "loss": 42.9642, "step": 3610 }, { "epoch": 9.537140970617365, "grad_norm": 281.540283203125, "learning_rate": 0.00045819177083373895, "loss": 41.6022, "step": 3611 }, { "epoch": 9.539782106305712, "grad_norm": 227.3629913330078, "learning_rate": 0.00045816863947576553, "loss": 42.6435, "step": 3612 }, { "epoch": 9.542423241994058, "grad_norm": 421.82818603515625, "learning_rate": 0.00045814550230480514, "loss": 46.1, "step": 3613 }, { "epoch": 9.545064377682403, "grad_norm": 171.2279815673828, "learning_rate": 0.0004581223593215038, "loss": 45.0752, "step": 3614 }, { "epoch": 9.54770551337075, "grad_norm": 297.7975158691406, "learning_rate": 0.00045809921052650796, "loss": 44.1723, "step": 3615 }, { "epoch": 9.550346649059096, "grad_norm": 345.9429016113281, "learning_rate": 0.0004580760559204638, "loss": 42.2599, "step": 3616 }, { "epoch": 9.552987784747442, "grad_norm": 253.6796112060547, "learning_rate": 0.0004580528955040181, "loss": 43.2949, "step": 3617 }, { "epoch": 9.555628920435787, "grad_norm": 385.0306701660156, "learning_rate": 0.0004580297292778174, "loss": 42.6957, "step": 3618 }, { "epoch": 9.558270056124133, "grad_norm": 300.57244873046875, "learning_rate": 0.00045800655724250876, "loss": 41.2686, "step": 3619 }, { "epoch": 9.56091119181248, "grad_norm": 311.3326416015625, "learning_rate": 0.00045798337939873923, "loss": 39.2353, "step": 3620 }, { "epoch": 9.563552327500826, "grad_norm": 282.3966064453125, "learning_rate": 0.000457960195747156, "loss": 39.103, "step": 3621 }, { "epoch": 9.56619346318917, "grad_norm": 216.34066772460938, "learning_rate": 0.00045793700628840643, "loss": 38.2097, "step": 3622 }, { "epoch": 9.568834598877517, "grad_norm": 422.6346435546875, "learning_rate": 0.00045791381102313814, "loss": 39.8552, "step": 3623 }, { "epoch": 9.571475734565864, "grad_norm": 310.47930908203125, "learning_rate": 0.0004578906099519988, "loss": 39.0364, "step": 3624 }, { "epoch": 9.57411687025421, "grad_norm": 468.18212890625, "learning_rate": 0.00045786740307563633, "loss": 38.53, "step": 3625 }, { "epoch": 9.576758005942555, "grad_norm": 424.6642761230469, "learning_rate": 0.0004578441903946987, "loss": 37.4728, "step": 3626 }, { "epoch": 9.579399141630901, "grad_norm": 162.21180725097656, "learning_rate": 0.0004578209719098342, "loss": 38.1568, "step": 3627 }, { "epoch": 9.582040277319248, "grad_norm": 400.4797058105469, "learning_rate": 0.00045779774762169103, "loss": 37.8937, "step": 3628 }, { "epoch": 9.584681413007594, "grad_norm": 1081.3292236328125, "learning_rate": 0.0004577745175309179, "loss": 46.6059, "step": 3629 }, { "epoch": 9.587322548695939, "grad_norm": 1428.9410400390625, "learning_rate": 0.0004577512816381634, "loss": 65.9005, "step": 3630 }, { "epoch": 9.589963684384285, "grad_norm": 3931.68798828125, "learning_rate": 0.0004577280399440764, "loss": 85.8895, "step": 3631 }, { "epoch": 9.592604820072632, "grad_norm": 2201.061767578125, "learning_rate": 0.00045770479244930596, "loss": 86.7603, "step": 3632 }, { "epoch": 9.595245955760976, "grad_norm": 3855.48828125, "learning_rate": 0.0004576815391545012, "loss": 82.185, "step": 3633 }, { "epoch": 9.597887091449323, "grad_norm": 2123.558349609375, "learning_rate": 0.0004576582800603114, "loss": 76.1049, "step": 3634 }, { "epoch": 9.60052822713767, "grad_norm": 2939.70458984375, "learning_rate": 0.00045763501516738614, "loss": 54.8732, "step": 3635 }, { "epoch": 9.603169362826016, "grad_norm": 3775.660888671875, "learning_rate": 0.00045761174447637504, "loss": 53.762, "step": 3636 }, { "epoch": 9.60581049851436, "grad_norm": 4411.8447265625, "learning_rate": 0.00045758846798792796, "loss": 39.1062, "step": 3637 }, { "epoch": 9.608451634202707, "grad_norm": 1862.3946533203125, "learning_rate": 0.00045756518570269487, "loss": 29.0984, "step": 3638 }, { "epoch": 9.611092769891053, "grad_norm": 694.2131958007812, "learning_rate": 0.0004575418976213258, "loss": 34.7275, "step": 3639 }, { "epoch": 9.6137339055794, "grad_norm": 3296.94970703125, "learning_rate": 0.0004575186037444713, "loss": 89.5501, "step": 3640 }, { "epoch": 9.616375041267744, "grad_norm": 2696.084228515625, "learning_rate": 0.00045749530407278164, "loss": 79.4842, "step": 3641 }, { "epoch": 9.619016176956091, "grad_norm": 912.4937133789062, "learning_rate": 0.00045747199860690744, "loss": 56.4192, "step": 3642 }, { "epoch": 9.621657312644437, "grad_norm": 433.976806640625, "learning_rate": 0.00045744868734749956, "loss": 45.8992, "step": 3643 }, { "epoch": 9.624298448332784, "grad_norm": 326.22393798828125, "learning_rate": 0.000457425370295209, "loss": 45.6251, "step": 3644 }, { "epoch": 9.626939584021128, "grad_norm": 284.3357849121094, "learning_rate": 0.0004574020474506868, "loss": 41.862, "step": 3645 }, { "epoch": 9.629580719709475, "grad_norm": 245.61373901367188, "learning_rate": 0.0004573787188145843, "loss": 41.7869, "step": 3646 }, { "epoch": 9.632221855397821, "grad_norm": 207.5970458984375, "learning_rate": 0.0004573553843875529, "loss": 40.3809, "step": 3647 }, { "epoch": 9.634862991086168, "grad_norm": 137.335693359375, "learning_rate": 0.0004573320441702441, "loss": 38.3808, "step": 3648 }, { "epoch": 9.637504126774513, "grad_norm": 158.6073760986328, "learning_rate": 0.0004573086981633098, "loss": 41.2878, "step": 3649 }, { "epoch": 9.640145262462859, "grad_norm": 175.77647399902344, "learning_rate": 0.0004572853463674019, "loss": 38.9967, "step": 3650 }, { "epoch": 9.642786398151205, "grad_norm": 168.0815887451172, "learning_rate": 0.0004572619887831725, "loss": 41.8742, "step": 3651 }, { "epoch": 9.645427533839552, "grad_norm": 426.0965881347656, "learning_rate": 0.0004572386254112738, "loss": 39.1418, "step": 3652 }, { "epoch": 9.648068669527897, "grad_norm": 146.8704833984375, "learning_rate": 0.0004572152562523582, "loss": 38.2725, "step": 3653 }, { "epoch": 9.650709805216243, "grad_norm": 109.03627014160156, "learning_rate": 0.00045719188130707833, "loss": 39.771, "step": 3654 }, { "epoch": 9.65335094090459, "grad_norm": 621.380615234375, "learning_rate": 0.00045716850057608686, "loss": 40.0048, "step": 3655 }, { "epoch": 9.655992076592934, "grad_norm": 176.92471313476562, "learning_rate": 0.00045714511406003664, "loss": 40.8894, "step": 3656 }, { "epoch": 9.65863321228128, "grad_norm": 441.1344909667969, "learning_rate": 0.0004571217217595809, "loss": 42.9331, "step": 3657 }, { "epoch": 9.661274347969627, "grad_norm": 695.1343383789062, "learning_rate": 0.00045709832367537277, "loss": 58.6373, "step": 3658 }, { "epoch": 9.663915483657973, "grad_norm": 2849.25341796875, "learning_rate": 0.0004570749198080655, "loss": 64.7155, "step": 3659 }, { "epoch": 9.666556619346318, "grad_norm": 737.9303588867188, "learning_rate": 0.0004570515101583128, "loss": 53.3721, "step": 3660 }, { "epoch": 9.669197755034665, "grad_norm": 221.65943908691406, "learning_rate": 0.0004570280947267683, "loss": 44.6661, "step": 3661 }, { "epoch": 9.671838890723011, "grad_norm": 151.7459716796875, "learning_rate": 0.0004570046735140859, "loss": 45.5819, "step": 3662 }, { "epoch": 9.674480026411358, "grad_norm": 271.0122375488281, "learning_rate": 0.00045698124652091953, "loss": 49.9661, "step": 3663 }, { "epoch": 9.677121162099702, "grad_norm": 206.83795166015625, "learning_rate": 0.00045695781374792345, "loss": 47.4791, "step": 3664 }, { "epoch": 9.679762297788049, "grad_norm": 227.39208984375, "learning_rate": 0.00045693437519575206, "loss": 45.0931, "step": 3665 }, { "epoch": 9.682403433476395, "grad_norm": 215.72811889648438, "learning_rate": 0.00045691093086505977, "loss": 43.1376, "step": 3666 }, { "epoch": 9.685044569164742, "grad_norm": 170.63160705566406, "learning_rate": 0.00045688748075650124, "loss": 42.865, "step": 3667 }, { "epoch": 9.687685704853086, "grad_norm": 166.25271606445312, "learning_rate": 0.0004568640248707314, "loss": 42.7982, "step": 3668 }, { "epoch": 9.690326840541433, "grad_norm": 219.53707885742188, "learning_rate": 0.0004568405632084052, "loss": 44.527, "step": 3669 }, { "epoch": 9.69296797622978, "grad_norm": 227.7503662109375, "learning_rate": 0.00045681709577017764, "loss": 42.527, "step": 3670 }, { "epoch": 9.695609111918126, "grad_norm": 212.88198852539062, "learning_rate": 0.0004567936225567043, "loss": 43.2786, "step": 3671 }, { "epoch": 9.69825024760647, "grad_norm": 173.7348175048828, "learning_rate": 0.00045677014356864043, "loss": 42.0733, "step": 3672 }, { "epoch": 9.700891383294817, "grad_norm": 198.2044677734375, "learning_rate": 0.0004567466588066419, "loss": 39.2364, "step": 3673 }, { "epoch": 9.703532518983163, "grad_norm": 235.7629852294922, "learning_rate": 0.00045672316827136426, "loss": 40.7614, "step": 3674 }, { "epoch": 9.70617365467151, "grad_norm": 134.6682586669922, "learning_rate": 0.00045669967196346363, "loss": 36.8886, "step": 3675 }, { "epoch": 9.708814790359854, "grad_norm": 165.8765411376953, "learning_rate": 0.00045667616988359606, "loss": 36.9905, "step": 3676 }, { "epoch": 9.7114559260482, "grad_norm": 159.930908203125, "learning_rate": 0.00045665266203241786, "loss": 38.7067, "step": 3677 }, { "epoch": 9.714097061736547, "grad_norm": 161.6524658203125, "learning_rate": 0.0004566291484105854, "loss": 38.6685, "step": 3678 }, { "epoch": 9.716738197424892, "grad_norm": 293.12371826171875, "learning_rate": 0.00045660562901875544, "loss": 39.808, "step": 3679 }, { "epoch": 9.719379333113238, "grad_norm": 823.651123046875, "learning_rate": 0.0004565821038575847, "loss": 49.8106, "step": 3680 }, { "epoch": 9.722020468801585, "grad_norm": 2435.820068359375, "learning_rate": 0.00045655857292773, "loss": 152.1469, "step": 3681 }, { "epoch": 9.724661604489931, "grad_norm": 2324.6220703125, "learning_rate": 0.0004565350362298485, "loss": 153.3353, "step": 3682 }, { "epoch": 9.727302740178276, "grad_norm": 1154.595947265625, "learning_rate": 0.00045651149376459745, "loss": 136.2553, "step": 3683 }, { "epoch": 9.729943875866622, "grad_norm": 2510.24560546875, "learning_rate": 0.00045648794553263417, "loss": 133.8679, "step": 3684 }, { "epoch": 9.732585011554969, "grad_norm": 1404.5111083984375, "learning_rate": 0.0004564643915346164, "loss": 109.4881, "step": 3685 }, { "epoch": 9.735226147243315, "grad_norm": 2007.2154541015625, "learning_rate": 0.0004564408317712018, "loss": 104.0107, "step": 3686 }, { "epoch": 9.73786728293166, "grad_norm": 1300.462646484375, "learning_rate": 0.00045641726624304815, "loss": 99.539, "step": 3687 }, { "epoch": 9.740508418620006, "grad_norm": 1388.40771484375, "learning_rate": 0.0004563936949508136, "loss": 93.7687, "step": 3688 }, { "epoch": 9.743149554308353, "grad_norm": 3601.110595703125, "learning_rate": 0.0004563701178951564, "loss": 75.2211, "step": 3689 }, { "epoch": 9.7457906899967, "grad_norm": 1701.7030029296875, "learning_rate": 0.0004563465350767349, "loss": 42.2594, "step": 3690 }, { "epoch": 9.748431825685044, "grad_norm": 923.8509521484375, "learning_rate": 0.0004563229464962076, "loss": 65.7891, "step": 3691 }, { "epoch": 9.75107296137339, "grad_norm": 1057.701171875, "learning_rate": 0.00045629935215423325, "loss": 73.3848, "step": 3692 }, { "epoch": 9.753714097061737, "grad_norm": 1130.4364013671875, "learning_rate": 0.00045627575205147065, "loss": 67.1981, "step": 3693 }, { "epoch": 9.756355232750083, "grad_norm": 1407.7005615234375, "learning_rate": 0.0004562521461885788, "loss": 62.6665, "step": 3694 }, { "epoch": 9.758996368438428, "grad_norm": 655.4122314453125, "learning_rate": 0.00045622853456621707, "loss": 51.8949, "step": 3695 }, { "epoch": 9.761637504126774, "grad_norm": 528.5010986328125, "learning_rate": 0.00045620491718504453, "loss": 47.5943, "step": 3696 }, { "epoch": 9.764278639815121, "grad_norm": 426.51568603515625, "learning_rate": 0.0004561812940457208, "loss": 46.1342, "step": 3697 }, { "epoch": 9.766919775503467, "grad_norm": 383.0081481933594, "learning_rate": 0.00045615766514890563, "loss": 44.5245, "step": 3698 }, { "epoch": 9.769560911191812, "grad_norm": 358.7542419433594, "learning_rate": 0.0004561340304952587, "loss": 42.4593, "step": 3699 }, { "epoch": 9.772202046880158, "grad_norm": 4339.64111328125, "learning_rate": 0.0004561103900854401, "loss": 43.7661, "step": 3700 }, { "epoch": 9.774843182568505, "grad_norm": 1207.5906982421875, "learning_rate": 0.00045608674392010995, "loss": 41.6986, "step": 3701 }, { "epoch": 9.77748431825685, "grad_norm": 604.2755737304688, "learning_rate": 0.00045606309199992844, "loss": 41.7851, "step": 3702 }, { "epoch": 9.780125453945196, "grad_norm": 946.5127563476562, "learning_rate": 0.0004560394343255563, "loss": 41.1211, "step": 3703 }, { "epoch": 9.782766589633543, "grad_norm": 495.8558349609375, "learning_rate": 0.0004560157708976538, "loss": 42.2483, "step": 3704 }, { "epoch": 9.785407725321889, "grad_norm": 358.5557556152344, "learning_rate": 0.00045599210171688205, "loss": 41.9923, "step": 3705 }, { "epoch": 9.788048861010234, "grad_norm": 650.3411865234375, "learning_rate": 0.00045596842678390185, "loss": 43.197, "step": 3706 }, { "epoch": 9.79068999669858, "grad_norm": 563.0848388671875, "learning_rate": 0.00045594474609937424, "loss": 45.4869, "step": 3707 }, { "epoch": 9.793331132386927, "grad_norm": 573.1712036132812, "learning_rate": 0.0004559210596639607, "loss": 46.4935, "step": 3708 }, { "epoch": 9.795972268075273, "grad_norm": 280.75439453125, "learning_rate": 0.00045589736747832243, "loss": 45.0347, "step": 3709 }, { "epoch": 9.798613403763618, "grad_norm": 258.573486328125, "learning_rate": 0.00045587366954312117, "loss": 43.7462, "step": 3710 }, { "epoch": 9.801254539451964, "grad_norm": 518.12841796875, "learning_rate": 0.0004558499658590186, "loss": 44.5455, "step": 3711 }, { "epoch": 9.80389567514031, "grad_norm": 729.1087646484375, "learning_rate": 0.00045582625642667665, "loss": 47.8501, "step": 3712 }, { "epoch": 9.806536810828657, "grad_norm": 399.0218505859375, "learning_rate": 0.0004558025412467575, "loss": 46.9599, "step": 3713 }, { "epoch": 9.809177946517002, "grad_norm": 8811.9990234375, "learning_rate": 0.0004557788203199231, "loss": 52.6953, "step": 3714 }, { "epoch": 9.811819082205348, "grad_norm": 645.0108642578125, "learning_rate": 0.0004557550936468361, "loss": 48.2019, "step": 3715 }, { "epoch": 9.814460217893695, "grad_norm": 471.6072998046875, "learning_rate": 0.00045573136122815906, "loss": 48.6751, "step": 3716 }, { "epoch": 9.817101353582041, "grad_norm": 603.2041015625, "learning_rate": 0.00045570762306455456, "loss": 44.2743, "step": 3717 }, { "epoch": 9.819742489270386, "grad_norm": 634.1831665039062, "learning_rate": 0.0004556838791566855, "loss": 43.502, "step": 3718 }, { "epoch": 9.822383624958732, "grad_norm": 539.0808715820312, "learning_rate": 0.00045566012950521497, "loss": 42.7297, "step": 3719 }, { "epoch": 9.825024760647079, "grad_norm": 574.5101928710938, "learning_rate": 0.0004556363741108062, "loss": 44.2766, "step": 3720 }, { "epoch": 9.827665896335425, "grad_norm": 539.5939331054688, "learning_rate": 0.0004556126129741223, "loss": 41.6635, "step": 3721 }, { "epoch": 9.83030703202377, "grad_norm": 714.2122802734375, "learning_rate": 0.00045558884609582707, "loss": 42.5868, "step": 3722 }, { "epoch": 9.832948167712116, "grad_norm": 631.4068603515625, "learning_rate": 0.0004555650734765841, "loss": 41.2205, "step": 3723 }, { "epoch": 9.835589303400463, "grad_norm": 980.9544677734375, "learning_rate": 0.00045554129511705716, "loss": 42.7047, "step": 3724 }, { "epoch": 9.838230439088807, "grad_norm": 506.39654541015625, "learning_rate": 0.00045551751101791034, "loss": 40.5217, "step": 3725 }, { "epoch": 9.840871574777154, "grad_norm": 644.9725952148438, "learning_rate": 0.0004554937211798078, "loss": 40.2577, "step": 3726 }, { "epoch": 9.8435127104655, "grad_norm": 616.2069091796875, "learning_rate": 0.0004554699256034137, "loss": 40.4456, "step": 3727 }, { "epoch": 9.846153846153847, "grad_norm": 1511.4786376953125, "learning_rate": 0.0004554461242893927, "loss": 39.3295, "step": 3728 }, { "epoch": 9.848794981842191, "grad_norm": 793.4638671875, "learning_rate": 0.00045542231723840935, "loss": 41.7943, "step": 3729 }, { "epoch": 9.851436117530538, "grad_norm": 7062.80859375, "learning_rate": 0.0004553985044511284, "loss": 68.7784, "step": 3730 }, { "epoch": 9.854077253218884, "grad_norm": 11935.4775390625, "learning_rate": 0.000455374685928215, "loss": 124.2582, "step": 3731 }, { "epoch": 9.85671838890723, "grad_norm": 26190.298828125, "learning_rate": 0.000455350861670334, "loss": 127.4141, "step": 3732 }, { "epoch": 9.859359524595575, "grad_norm": 30148.54296875, "learning_rate": 0.0004553270316781508, "loss": 136.2246, "step": 3733 }, { "epoch": 9.862000660283922, "grad_norm": 8542.470703125, "learning_rate": 0.00045530319595233097, "loss": 105.3781, "step": 3734 }, { "epoch": 9.864641795972268, "grad_norm": 7701.79248046875, "learning_rate": 0.00045527935449353994, "loss": 112.6717, "step": 3735 }, { "epoch": 9.867282931660615, "grad_norm": 10049.1494140625, "learning_rate": 0.0004552555073024436, "loss": 138.0025, "step": 3736 }, { "epoch": 9.86992406734896, "grad_norm": 13220.7529296875, "learning_rate": 0.0004552316543797077, "loss": 97.5947, "step": 3737 }, { "epoch": 9.872565203037306, "grad_norm": 28932.408203125, "learning_rate": 0.00045520779572599845, "loss": 122.1338, "step": 3738 }, { "epoch": 9.875206338725652, "grad_norm": 3585.453857421875, "learning_rate": 0.00045518393134198206, "loss": 110.6209, "step": 3739 }, { "epoch": 9.877847474413999, "grad_norm": 3755.223388671875, "learning_rate": 0.0004551600612283249, "loss": 65.1908, "step": 3740 }, { "epoch": 9.880488610102343, "grad_norm": 1350.93017578125, "learning_rate": 0.0004551361853856936, "loss": 41.9807, "step": 3741 }, { "epoch": 9.88312974579069, "grad_norm": 730.5662841796875, "learning_rate": 0.00045511230381475474, "loss": 41.0444, "step": 3742 }, { "epoch": 9.885770881479036, "grad_norm": 558.427734375, "learning_rate": 0.0004550884165161753, "loss": 42.0823, "step": 3743 }, { "epoch": 9.888412017167383, "grad_norm": 976.4923706054688, "learning_rate": 0.0004550645234906223, "loss": 44.9093, "step": 3744 }, { "epoch": 9.891053152855728, "grad_norm": 619.62353515625, "learning_rate": 0.00045504062473876304, "loss": 39.7101, "step": 3745 }, { "epoch": 9.893694288544074, "grad_norm": 472.1983337402344, "learning_rate": 0.0004550167202612647, "loss": 39.4464, "step": 3746 }, { "epoch": 9.89633542423242, "grad_norm": 645.7005615234375, "learning_rate": 0.0004549928100587949, "loss": 39.5803, "step": 3747 }, { "epoch": 9.898976559920765, "grad_norm": 666.0624389648438, "learning_rate": 0.00045496889413202123, "loss": 38.8373, "step": 3748 }, { "epoch": 9.901617695609112, "grad_norm": 436.43499755859375, "learning_rate": 0.0004549449724816117, "loss": 38.3081, "step": 3749 }, { "epoch": 9.904258831297458, "grad_norm": 733.7010498046875, "learning_rate": 0.00045492104510823415, "loss": 38.7426, "step": 3750 }, { "epoch": 9.906899966985804, "grad_norm": 476.27777099609375, "learning_rate": 0.0004548971120125568, "loss": 38.8145, "step": 3751 }, { "epoch": 9.90954110267415, "grad_norm": 753.0869140625, "learning_rate": 0.000454873173195248, "loss": 40.3306, "step": 3752 }, { "epoch": 9.912182238362496, "grad_norm": 595.5208740234375, "learning_rate": 0.0004548492286569761, "loss": 40.1235, "step": 3753 }, { "epoch": 9.914823374050842, "grad_norm": 413.04473876953125, "learning_rate": 0.0004548252783984099, "loss": 39.0008, "step": 3754 }, { "epoch": 9.917464509739188, "grad_norm": 378.88238525390625, "learning_rate": 0.0004548013224202181, "loss": 39.7621, "step": 3755 }, { "epoch": 9.920105645427533, "grad_norm": 713.6878051757812, "learning_rate": 0.0004547773607230697, "loss": 42.8589, "step": 3756 }, { "epoch": 9.92274678111588, "grad_norm": 778.9638671875, "learning_rate": 0.00045475339330763376, "loss": 42.0158, "step": 3757 }, { "epoch": 9.925387916804226, "grad_norm": 1373.228515625, "learning_rate": 0.0004547294201745796, "loss": 45.163, "step": 3758 }, { "epoch": 9.928029052492573, "grad_norm": 575.89111328125, "learning_rate": 0.0004547054413245766, "loss": 42.3348, "step": 3759 }, { "epoch": 9.930670188180917, "grad_norm": 455.701171875, "learning_rate": 0.0004546814567582945, "loss": 42.8069, "step": 3760 }, { "epoch": 9.933311323869264, "grad_norm": 545.291015625, "learning_rate": 0.00045465746647640294, "loss": 48.3606, "step": 3761 }, { "epoch": 9.93595245955761, "grad_norm": 310.7208557128906, "learning_rate": 0.00045463347047957183, "loss": 43.9653, "step": 3762 }, { "epoch": 9.938593595245957, "grad_norm": 410.54193115234375, "learning_rate": 0.0004546094687684712, "loss": 45.0702, "step": 3763 }, { "epoch": 9.941234730934301, "grad_norm": 419.9945373535156, "learning_rate": 0.0004545854613437714, "loss": 45.2823, "step": 3764 }, { "epoch": 9.943875866622648, "grad_norm": 306.906494140625, "learning_rate": 0.00045456144820614275, "loss": 41.2515, "step": 3765 }, { "epoch": 9.946517002310994, "grad_norm": 436.0570983886719, "learning_rate": 0.0004545374293562559, "loss": 40.6105, "step": 3766 }, { "epoch": 9.94915813799934, "grad_norm": 364.86419677734375, "learning_rate": 0.00045451340479478144, "loss": 40.8522, "step": 3767 }, { "epoch": 9.951799273687685, "grad_norm": 279.07244873046875, "learning_rate": 0.00045448937452239024, "loss": 39.4425, "step": 3768 }, { "epoch": 9.954440409376032, "grad_norm": 245.8607635498047, "learning_rate": 0.00045446533853975345, "loss": 38.2026, "step": 3769 }, { "epoch": 9.957081545064378, "grad_norm": 631.119384765625, "learning_rate": 0.00045444129684754213, "loss": 38.7296, "step": 3770 }, { "epoch": 9.959722680752723, "grad_norm": 5925.181640625, "learning_rate": 0.00045441724944642774, "loss": 60.7873, "step": 3771 }, { "epoch": 9.96236381644107, "grad_norm": 17700.794921875, "learning_rate": 0.0004543931963370817, "loss": 77.3906, "step": 3772 }, { "epoch": 9.965004952129416, "grad_norm": 5512.9091796875, "learning_rate": 0.0004543691375201758, "loss": 66.3719, "step": 3773 }, { "epoch": 9.967646087817762, "grad_norm": 3217.5537109375, "learning_rate": 0.0004543450729963817, "loss": 70.8992, "step": 3774 }, { "epoch": 9.970287223506107, "grad_norm": 3628.184326171875, "learning_rate": 0.0004543210027663715, "loss": 66.2802, "step": 3775 }, { "epoch": 9.972928359194453, "grad_norm": 1485.8204345703125, "learning_rate": 0.00045429692683081736, "loss": 51.0521, "step": 3776 }, { "epoch": 9.9755694948828, "grad_norm": 423.89410400390625, "learning_rate": 0.00045427284519039145, "loss": 39.0667, "step": 3777 }, { "epoch": 9.978210630571146, "grad_norm": 475.142822265625, "learning_rate": 0.0004542487578457665, "loss": 39.9701, "step": 3778 }, { "epoch": 9.98085176625949, "grad_norm": 421.39312744140625, "learning_rate": 0.00045422466479761483, "loss": 39.4032, "step": 3779 }, { "epoch": 9.983492901947837, "grad_norm": 462.27667236328125, "learning_rate": 0.0004542005660466094, "loss": 38.9965, "step": 3780 }, { "epoch": 9.986134037636184, "grad_norm": 382.4969177246094, "learning_rate": 0.0004541764615934231, "loss": 39.0382, "step": 3781 }, { "epoch": 9.98877517332453, "grad_norm": 451.3133850097656, "learning_rate": 0.0004541523514387291, "loss": 38.631, "step": 3782 }, { "epoch": 9.991416309012875, "grad_norm": 341.0440979003906, "learning_rate": 0.0004541282355832006, "loss": 40.4393, "step": 3783 }, { "epoch": 9.994057444701221, "grad_norm": 301.4442138671875, "learning_rate": 0.000454104114027511, "loss": 38.7176, "step": 3784 }, { "epoch": 9.996698580389568, "grad_norm": 832.2748413085938, "learning_rate": 0.0004540799867723339, "loss": 38.5071, "step": 3785 }, { "epoch": 9.999339716077914, "grad_norm": 623.305419921875, "learning_rate": 0.00045405585381834315, "loss": 41.8819, "step": 3786 }, { "epoch": 10.001980851766259, "grad_norm": 822.127685546875, "learning_rate": 0.00045403171516621247, "loss": 47.892, "step": 3787 }, { "epoch": 10.004621987454605, "grad_norm": 683.8168334960938, "learning_rate": 0.00045400757081661596, "loss": 45.9526, "step": 3788 }, { "epoch": 10.007263123142952, "grad_norm": 266.9868469238281, "learning_rate": 0.00045398342077022794, "loss": 41.9205, "step": 3789 }, { "epoch": 10.009904258831298, "grad_norm": 380.72198486328125, "learning_rate": 0.00045395926502772273, "loss": 42.5503, "step": 3790 }, { "epoch": 10.012545394519643, "grad_norm": 256.4933776855469, "learning_rate": 0.00045393510358977487, "loss": 43.7469, "step": 3791 }, { "epoch": 10.01518653020799, "grad_norm": 280.317626953125, "learning_rate": 0.00045391093645705895, "loss": 46.7212, "step": 3792 }, { "epoch": 10.017827665896336, "grad_norm": 344.889892578125, "learning_rate": 0.00045388676363024993, "loss": 46.3928, "step": 3793 }, { "epoch": 10.02046880158468, "grad_norm": 238.6133575439453, "learning_rate": 0.00045386258511002285, "loss": 44.981, "step": 3794 }, { "epoch": 10.023109937273027, "grad_norm": 229.0686492919922, "learning_rate": 0.0004538384008970529, "loss": 42.8449, "step": 3795 }, { "epoch": 10.025751072961373, "grad_norm": 229.96864318847656, "learning_rate": 0.00045381421099201523, "loss": 43.0268, "step": 3796 }, { "epoch": 10.02839220864972, "grad_norm": 227.02894592285156, "learning_rate": 0.0004537900153955855, "loss": 45.2533, "step": 3797 }, { "epoch": 10.031033344338065, "grad_norm": 363.63134765625, "learning_rate": 0.0004537658141084393, "loss": 41.6353, "step": 3798 }, { "epoch": 10.033674480026411, "grad_norm": 207.8053436279297, "learning_rate": 0.00045374160713125246, "loss": 42.8751, "step": 3799 }, { "epoch": 10.036315615714757, "grad_norm": 206.0843963623047, "learning_rate": 0.00045371739446470083, "loss": 41.1352, "step": 3800 }, { "epoch": 10.036315615714757, "eval_loss": 6.2687835693359375, "eval_runtime": 2.1568, "eval_samples_per_second": 229.503, "eval_steps_per_second": 28.746, "step": 3800 }, { "epoch": 10.038956751403104, "grad_norm": 262.23529052734375, "learning_rate": 0.00045369317610946075, "loss": 39.0725, "step": 3801 }, { "epoch": 10.041597887091449, "grad_norm": 150.5678253173828, "learning_rate": 0.0004536689520662083, "loss": 40.4438, "step": 3802 }, { "epoch": 10.044239022779795, "grad_norm": 245.80258178710938, "learning_rate": 0.00045364472233562004, "loss": 39.236, "step": 3803 }, { "epoch": 10.046880158468142, "grad_norm": 239.38157653808594, "learning_rate": 0.00045362048691837255, "loss": 39.8934, "step": 3804 }, { "epoch": 10.049521294156488, "grad_norm": 167.3960418701172, "learning_rate": 0.00045359624581514256, "loss": 38.9249, "step": 3805 }, { "epoch": 10.052162429844833, "grad_norm": 282.90460205078125, "learning_rate": 0.000453571999026607, "loss": 39.0875, "step": 3806 }, { "epoch": 10.054803565533179, "grad_norm": 239.04708862304688, "learning_rate": 0.0004535477465534429, "loss": 39.6073, "step": 3807 }, { "epoch": 10.057444701221526, "grad_norm": 358.58843994140625, "learning_rate": 0.00045352348839632767, "loss": 45.2253, "step": 3808 }, { "epoch": 10.060085836909872, "grad_norm": 1735.2086181640625, "learning_rate": 0.0004534992245559385, "loss": 100.2246, "step": 3809 }, { "epoch": 10.062726972598217, "grad_norm": 2873.4228515625, "learning_rate": 0.00045347495503295305, "loss": 110.6354, "step": 3810 }, { "epoch": 10.065368108286563, "grad_norm": 2366.427734375, "learning_rate": 0.000453450679828049, "loss": 92.2312, "step": 3811 }, { "epoch": 10.06800924397491, "grad_norm": 2007.862060546875, "learning_rate": 0.00045342639894190424, "loss": 117.4118, "step": 3812 }, { "epoch": 10.070650379663256, "grad_norm": 1847.426025390625, "learning_rate": 0.0004534021123751968, "loss": 101.8705, "step": 3813 }, { "epoch": 10.0732915153516, "grad_norm": 3281.719970703125, "learning_rate": 0.0004533778201286048, "loss": 82.2649, "step": 3814 }, { "epoch": 10.075932651039947, "grad_norm": 13350.7119140625, "learning_rate": 0.0004533535222028067, "loss": 82.2695, "step": 3815 }, { "epoch": 10.078573786728294, "grad_norm": 2528.663330078125, "learning_rate": 0.0004533292185984809, "loss": 62.5898, "step": 3816 }, { "epoch": 10.081214922416638, "grad_norm": 3330.38623046875, "learning_rate": 0.0004533049093163062, "loss": 54.9284, "step": 3817 }, { "epoch": 10.083856058104985, "grad_norm": 6420.1318359375, "learning_rate": 0.00045328059435696124, "loss": 43.7483, "step": 3818 }, { "epoch": 10.086497193793331, "grad_norm": 484.5692138671875, "learning_rate": 0.0004532562737211251, "loss": 41.7527, "step": 3819 }, { "epoch": 10.089138329481678, "grad_norm": 462.3982849121094, "learning_rate": 0.00045323194740947695, "loss": 41.8656, "step": 3820 }, { "epoch": 10.091779465170022, "grad_norm": 513.4441528320312, "learning_rate": 0.00045320761542269605, "loss": 43.1134, "step": 3821 }, { "epoch": 10.094420600858369, "grad_norm": 427.5337219238281, "learning_rate": 0.00045318327776146185, "loss": 41.4018, "step": 3822 }, { "epoch": 10.097061736546715, "grad_norm": 438.01080322265625, "learning_rate": 0.000453158934426454, "loss": 40.7331, "step": 3823 }, { "epoch": 10.099702872235062, "grad_norm": 728.3533935546875, "learning_rate": 0.00045313458541835225, "loss": 38.6156, "step": 3824 }, { "epoch": 10.102344007923406, "grad_norm": 1489.3272705078125, "learning_rate": 0.0004531102307378365, "loss": 39.7885, "step": 3825 }, { "epoch": 10.104985143611753, "grad_norm": 591.2474365234375, "learning_rate": 0.0004530858703855869, "loss": 38.9496, "step": 3826 }, { "epoch": 10.1076262793001, "grad_norm": 473.1734924316406, "learning_rate": 0.00045306150436228365, "loss": 40.2308, "step": 3827 }, { "epoch": 10.110267414988446, "grad_norm": 730.185546875, "learning_rate": 0.00045303713266860716, "loss": 39.8843, "step": 3828 }, { "epoch": 10.11290855067679, "grad_norm": 770.6106567382812, "learning_rate": 0.000453012755305238, "loss": 39.6308, "step": 3829 }, { "epoch": 10.115549686365137, "grad_norm": 1542.5521240234375, "learning_rate": 0.00045298837227285694, "loss": 39.2163, "step": 3830 }, { "epoch": 10.118190822053483, "grad_norm": 554.6454467773438, "learning_rate": 0.00045296398357214487, "loss": 40.4819, "step": 3831 }, { "epoch": 10.12083195774183, "grad_norm": 1223.2469482421875, "learning_rate": 0.0004529395892037827, "loss": 41.1827, "step": 3832 }, { "epoch": 10.123473093430174, "grad_norm": 739.099853515625, "learning_rate": 0.00045291518916845176, "loss": 41.0132, "step": 3833 }, { "epoch": 10.12611422911852, "grad_norm": 1785.1900634765625, "learning_rate": 0.0004528907834668333, "loss": 42.2204, "step": 3834 }, { "epoch": 10.128755364806867, "grad_norm": 521.3050537109375, "learning_rate": 0.000452866372099609, "loss": 42.2345, "step": 3835 }, { "epoch": 10.131396500495214, "grad_norm": 1158.820068359375, "learning_rate": 0.0004528419550674604, "loss": 43.6818, "step": 3836 }, { "epoch": 10.134037636183558, "grad_norm": 880.2210083007812, "learning_rate": 0.00045281753237106926, "loss": 42.7939, "step": 3837 }, { "epoch": 10.136678771871905, "grad_norm": 387.462646484375, "learning_rate": 0.0004527931040111178, "loss": 42.6609, "step": 3838 }, { "epoch": 10.139319907560251, "grad_norm": 412.0037536621094, "learning_rate": 0.0004527686699882879, "loss": 42.9568, "step": 3839 }, { "epoch": 10.141961043248596, "grad_norm": 399.8983154296875, "learning_rate": 0.0004527442303032622, "loss": 43.9506, "step": 3840 }, { "epoch": 10.144602178936942, "grad_norm": 321.3785095214844, "learning_rate": 0.00045271978495672286, "loss": 45.7462, "step": 3841 }, { "epoch": 10.147243314625289, "grad_norm": 256.851318359375, "learning_rate": 0.0004526953339493525, "loss": 43.993, "step": 3842 }, { "epoch": 10.149884450313635, "grad_norm": 215.20101928710938, "learning_rate": 0.0004526708772818342, "loss": 48.7396, "step": 3843 }, { "epoch": 10.15252558600198, "grad_norm": 275.6069641113281, "learning_rate": 0.0004526464149548506, "loss": 45.8917, "step": 3844 }, { "epoch": 10.155166721690327, "grad_norm": 384.6188659667969, "learning_rate": 0.00045262194696908486, "loss": 44.9933, "step": 3845 }, { "epoch": 10.157807857378673, "grad_norm": 322.7388916015625, "learning_rate": 0.0004525974733252204, "loss": 44.5594, "step": 3846 }, { "epoch": 10.16044899306702, "grad_norm": 303.6708679199219, "learning_rate": 0.0004525729940239404, "loss": 44.0872, "step": 3847 }, { "epoch": 10.163090128755364, "grad_norm": 381.39483642578125, "learning_rate": 0.0004525485090659286, "loss": 41.6802, "step": 3848 }, { "epoch": 10.16573126444371, "grad_norm": 226.2705841064453, "learning_rate": 0.0004525240184518687, "loss": 40.9696, "step": 3849 }, { "epoch": 10.168372400132057, "grad_norm": 251.0211944580078, "learning_rate": 0.0004524995221824445, "loss": 40.0507, "step": 3850 }, { "epoch": 10.171013535820403, "grad_norm": 250.09786987304688, "learning_rate": 0.00045247502025834007, "loss": 39.0616, "step": 3851 }, { "epoch": 10.173654671508748, "grad_norm": 236.37063598632812, "learning_rate": 0.00045245051268023975, "loss": 38.903, "step": 3852 }, { "epoch": 10.176295807197095, "grad_norm": 203.32408142089844, "learning_rate": 0.0004524259994488277, "loss": 38.8262, "step": 3853 }, { "epoch": 10.178936942885441, "grad_norm": 515.907958984375, "learning_rate": 0.00045240148056478855, "loss": 38.7022, "step": 3854 }, { "epoch": 10.181578078573787, "grad_norm": 383.9682922363281, "learning_rate": 0.0004523769560288069, "loss": 40.2755, "step": 3855 }, { "epoch": 10.184219214262132, "grad_norm": 273.96868896484375, "learning_rate": 0.0004523524258415677, "loss": 37.43, "step": 3856 }, { "epoch": 10.186860349950479, "grad_norm": 345.4823303222656, "learning_rate": 0.00045232789000375584, "loss": 38.5912, "step": 3857 }, { "epoch": 10.189501485638825, "grad_norm": 795.1988525390625, "learning_rate": 0.0004523033485160566, "loss": 39.5612, "step": 3858 }, { "epoch": 10.192142621327172, "grad_norm": 2594.491943359375, "learning_rate": 0.00045227880137915506, "loss": 49.0651, "step": 3859 }, { "epoch": 10.194783757015516, "grad_norm": 19673.833984375, "learning_rate": 0.0004522542485937369, "loss": 51.6371, "step": 3860 }, { "epoch": 10.197424892703863, "grad_norm": 7584.4345703125, "learning_rate": 0.0004522296901604876, "loss": 48.8614, "step": 3861 }, { "epoch": 10.200066028392209, "grad_norm": 6703.10546875, "learning_rate": 0.000452205126080093, "loss": 53.3466, "step": 3862 }, { "epoch": 10.202707164080554, "grad_norm": 9418.642578125, "learning_rate": 0.0004521805563532391, "loss": 48.8747, "step": 3863 }, { "epoch": 10.2053482997689, "grad_norm": 36864.1640625, "learning_rate": 0.0004521559809806119, "loss": 40.2564, "step": 3864 }, { "epoch": 10.207989435457247, "grad_norm": 7438.8779296875, "learning_rate": 0.00045213139996289764, "loss": 41.99, "step": 3865 }, { "epoch": 10.210630571145593, "grad_norm": 1700.20751953125, "learning_rate": 0.00045210681330078286, "loss": 39.2314, "step": 3866 }, { "epoch": 10.213271706833938, "grad_norm": 1759.35498046875, "learning_rate": 0.00045208222099495397, "loss": 37.2669, "step": 3867 }, { "epoch": 10.215912842522284, "grad_norm": 2185.630859375, "learning_rate": 0.00045205762304609774, "loss": 35.4468, "step": 3868 }, { "epoch": 10.21855397821063, "grad_norm": 438.3274230957031, "learning_rate": 0.0004520330194549012, "loss": 43.0146, "step": 3869 }, { "epoch": 10.221195113898977, "grad_norm": 396.74407958984375, "learning_rate": 0.0004520084102220512, "loss": 41.5445, "step": 3870 }, { "epoch": 10.223836249587322, "grad_norm": 310.6490173339844, "learning_rate": 0.00045198379534823507, "loss": 41.9543, "step": 3871 }, { "epoch": 10.226477385275668, "grad_norm": 361.799072265625, "learning_rate": 0.00045195917483414005, "loss": 43.1153, "step": 3872 }, { "epoch": 10.229118520964015, "grad_norm": 283.3182067871094, "learning_rate": 0.0004519345486804537, "loss": 39.2575, "step": 3873 }, { "epoch": 10.231759656652361, "grad_norm": 362.1432800292969, "learning_rate": 0.0004519099168878637, "loss": 39.6007, "step": 3874 }, { "epoch": 10.234400792340706, "grad_norm": 178.8114013671875, "learning_rate": 0.00045188527945705793, "loss": 40.1005, "step": 3875 }, { "epoch": 10.237041928029052, "grad_norm": 328.56195068359375, "learning_rate": 0.00045186063638872427, "loss": 38.9969, "step": 3876 }, { "epoch": 10.239683063717399, "grad_norm": 324.3384704589844, "learning_rate": 0.0004518359876835509, "loss": 38.2427, "step": 3877 }, { "epoch": 10.242324199405745, "grad_norm": 242.83984375, "learning_rate": 0.00045181133334222626, "loss": 38.8206, "step": 3878 }, { "epoch": 10.24496533509409, "grad_norm": 168.89617919921875, "learning_rate": 0.0004517866733654385, "loss": 39.2655, "step": 3879 }, { "epoch": 10.247606470782436, "grad_norm": 182.2276153564453, "learning_rate": 0.0004517620077538766, "loss": 39.0195, "step": 3880 }, { "epoch": 10.250247606470783, "grad_norm": 360.47979736328125, "learning_rate": 0.00045173733650822907, "loss": 38.8051, "step": 3881 }, { "epoch": 10.25288874215913, "grad_norm": 377.6777038574219, "learning_rate": 0.00045171265962918493, "loss": 39.3982, "step": 3882 }, { "epoch": 10.255529877847474, "grad_norm": 233.5725555419922, "learning_rate": 0.0004516879771174333, "loss": 38.1225, "step": 3883 }, { "epoch": 10.25817101353582, "grad_norm": 301.0205383300781, "learning_rate": 0.0004516632889736634, "loss": 38.0415, "step": 3884 }, { "epoch": 10.260812149224167, "grad_norm": 290.89630126953125, "learning_rate": 0.00045163859519856456, "loss": 40.9647, "step": 3885 }, { "epoch": 10.263453284912512, "grad_norm": 496.3010559082031, "learning_rate": 0.00045161389579282643, "loss": 41.9972, "step": 3886 }, { "epoch": 10.266094420600858, "grad_norm": 309.8701477050781, "learning_rate": 0.0004515891907571386, "loss": 41.4841, "step": 3887 }, { "epoch": 10.268735556289204, "grad_norm": 180.74798583984375, "learning_rate": 0.0004515644800921911, "loss": 41.0221, "step": 3888 }, { "epoch": 10.27137669197755, "grad_norm": 303.2255554199219, "learning_rate": 0.00045153976379867393, "loss": 42.0172, "step": 3889 }, { "epoch": 10.274017827665896, "grad_norm": 262.1221923828125, "learning_rate": 0.0004515150418772772, "loss": 40.5929, "step": 3890 }, { "epoch": 10.276658963354242, "grad_norm": 221.67233276367188, "learning_rate": 0.00045149031432869136, "loss": 45.7927, "step": 3891 }, { "epoch": 10.279300099042588, "grad_norm": 611.5584716796875, "learning_rate": 0.00045146558115360686, "loss": 41.6302, "step": 3892 }, { "epoch": 10.281941234730935, "grad_norm": 170.13072204589844, "learning_rate": 0.00045144084235271433, "loss": 45.5992, "step": 3893 }, { "epoch": 10.28458237041928, "grad_norm": 269.3602600097656, "learning_rate": 0.00045141609792670456, "loss": 44.7288, "step": 3894 }, { "epoch": 10.287223506107626, "grad_norm": 301.1896057128906, "learning_rate": 0.0004513913478762687, "loss": 44.7237, "step": 3895 }, { "epoch": 10.289864641795972, "grad_norm": 210.42196655273438, "learning_rate": 0.0004513665922020976, "loss": 41.496, "step": 3896 }, { "epoch": 10.292505777484319, "grad_norm": 266.88409423828125, "learning_rate": 0.0004513418309048828, "loss": 41.9903, "step": 3897 }, { "epoch": 10.295146913172664, "grad_norm": 495.75726318359375, "learning_rate": 0.0004513170639853156, "loss": 41.5748, "step": 3898 }, { "epoch": 10.29778804886101, "grad_norm": 206.83387756347656, "learning_rate": 0.00045129229144408767, "loss": 41.2143, "step": 3899 }, { "epoch": 10.300429184549357, "grad_norm": 283.2553405761719, "learning_rate": 0.0004512675132818908, "loss": 40.8007, "step": 3900 }, { "epoch": 10.303070320237703, "grad_norm": 291.8153381347656, "learning_rate": 0.0004512427294994167, "loss": 37.4282, "step": 3901 }, { "epoch": 10.305711455926048, "grad_norm": 289.1590576171875, "learning_rate": 0.0004512179400973577, "loss": 38.8893, "step": 3902 }, { "epoch": 10.308352591614394, "grad_norm": 215.65774536132812, "learning_rate": 0.000451193145076406, "loss": 38.9589, "step": 3903 }, { "epoch": 10.31099372730274, "grad_norm": 160.89834594726562, "learning_rate": 0.00045116834443725373, "loss": 39.0713, "step": 3904 }, { "epoch": 10.313634862991087, "grad_norm": 206.75555419921875, "learning_rate": 0.0004511435381805937, "loss": 37.5952, "step": 3905 }, { "epoch": 10.316275998679432, "grad_norm": 358.9605407714844, "learning_rate": 0.0004511187263071186, "loss": 38.2827, "step": 3906 }, { "epoch": 10.318917134367778, "grad_norm": 386.3522033691406, "learning_rate": 0.0004510939088175211, "loss": 38.6081, "step": 3907 }, { "epoch": 10.321558270056125, "grad_norm": 336.7215576171875, "learning_rate": 0.0004510690857124944, "loss": 37.8108, "step": 3908 }, { "epoch": 10.32419940574447, "grad_norm": 996.5487060546875, "learning_rate": 0.00045104425699273157, "loss": 48.0082, "step": 3909 }, { "epoch": 10.326840541432816, "grad_norm": 33094.171875, "learning_rate": 0.00045101942265892596, "loss": 71.2755, "step": 3910 }, { "epoch": 10.329481677121162, "grad_norm": 2186.2724609375, "learning_rate": 0.00045099458271177106, "loss": 70.1601, "step": 3911 }, { "epoch": 10.332122812809509, "grad_norm": 7920.8857421875, "learning_rate": 0.0004509697371519605, "loss": 60.586, "step": 3912 }, { "epoch": 10.334763948497853, "grad_norm": 3965.31591796875, "learning_rate": 0.00045094488598018814, "loss": 66.8954, "step": 3913 }, { "epoch": 10.3374050841862, "grad_norm": 4170.2421875, "learning_rate": 0.0004509200291971478, "loss": 57.8475, "step": 3914 }, { "epoch": 10.340046219874546, "grad_norm": 5181.8837890625, "learning_rate": 0.0004508951668035337, "loss": 55.7083, "step": 3915 }, { "epoch": 10.342687355562893, "grad_norm": 3235.65380859375, "learning_rate": 0.00045087029880004, "loss": 42.6084, "step": 3916 }, { "epoch": 10.345328491251237, "grad_norm": 5065.392578125, "learning_rate": 0.0004508454251873614, "loss": 41.3866, "step": 3917 }, { "epoch": 10.347969626939584, "grad_norm": 3632.73583984375, "learning_rate": 0.0004508205459661922, "loss": 36.1978, "step": 3918 }, { "epoch": 10.35061076262793, "grad_norm": 1454.711669921875, "learning_rate": 0.00045079566113722714, "loss": 39.9191, "step": 3919 }, { "epoch": 10.353251898316277, "grad_norm": 396.01800537109375, "learning_rate": 0.0004507707707011612, "loss": 41.3955, "step": 3920 }, { "epoch": 10.355893034004621, "grad_norm": 323.76324462890625, "learning_rate": 0.00045074587465868956, "loss": 38.8782, "step": 3921 }, { "epoch": 10.358534169692968, "grad_norm": 633.6897583007812, "learning_rate": 0.0004507209730105072, "loss": 39.1973, "step": 3922 }, { "epoch": 10.361175305381314, "grad_norm": 473.1686706542969, "learning_rate": 0.0004506960657573096, "loss": 39.1601, "step": 3923 }, { "epoch": 10.36381644106966, "grad_norm": 362.0717468261719, "learning_rate": 0.0004506711528997922, "loss": 39.9392, "step": 3924 }, { "epoch": 10.366457576758005, "grad_norm": 675.3978271484375, "learning_rate": 0.0004506462344386508, "loss": 42.4584, "step": 3925 }, { "epoch": 10.369098712446352, "grad_norm": 279.37188720703125, "learning_rate": 0.0004506213103745812, "loss": 39.1554, "step": 3926 }, { "epoch": 10.371739848134698, "grad_norm": 807.1853637695312, "learning_rate": 0.00045059638070827924, "loss": 38.0477, "step": 3927 }, { "epoch": 10.374380983823045, "grad_norm": 401.0590515136719, "learning_rate": 0.00045057144544044124, "loss": 40.2885, "step": 3928 }, { "epoch": 10.37702211951139, "grad_norm": 513.495849609375, "learning_rate": 0.0004505465045717635, "loss": 40.655, "step": 3929 }, { "epoch": 10.379663255199736, "grad_norm": 411.97235107421875, "learning_rate": 0.00045052155810294245, "loss": 39.0951, "step": 3930 }, { "epoch": 10.382304390888082, "grad_norm": 559.1839599609375, "learning_rate": 0.0004504966060346746, "loss": 37.9764, "step": 3931 }, { "epoch": 10.384945526576427, "grad_norm": 541.2291870117188, "learning_rate": 0.00045047164836765685, "loss": 37.067, "step": 3932 }, { "epoch": 10.387586662264773, "grad_norm": 691.5557861328125, "learning_rate": 0.000450446685102586, "loss": 38.4979, "step": 3933 }, { "epoch": 10.39022779795312, "grad_norm": 724.7285766601562, "learning_rate": 0.0004504217162401593, "loss": 39.2882, "step": 3934 }, { "epoch": 10.392868933641466, "grad_norm": 559.592041015625, "learning_rate": 0.00045039674178107395, "loss": 39.9979, "step": 3935 }, { "epoch": 10.395510069329811, "grad_norm": 1236.96142578125, "learning_rate": 0.0004503717617260272, "loss": 43.1148, "step": 3936 }, { "epoch": 10.398151205018157, "grad_norm": 676.4827270507812, "learning_rate": 0.0004503467760757167, "loss": 44.0655, "step": 3937 }, { "epoch": 10.400792340706504, "grad_norm": 575.6954956054688, "learning_rate": 0.0004503217848308403, "loss": 43.9695, "step": 3938 }, { "epoch": 10.40343347639485, "grad_norm": 462.10595703125, "learning_rate": 0.0004502967879920956, "loss": 42.5172, "step": 3939 }, { "epoch": 10.406074612083195, "grad_norm": 475.9786682128906, "learning_rate": 0.0004502717855601809, "loss": 42.9514, "step": 3940 }, { "epoch": 10.408715747771542, "grad_norm": 368.930419921875, "learning_rate": 0.0004502467775357941, "loss": 46.0327, "step": 3941 }, { "epoch": 10.411356883459888, "grad_norm": 363.5591735839844, "learning_rate": 0.0004502217639196337, "loss": 45.3121, "step": 3942 }, { "epoch": 10.413998019148234, "grad_norm": 315.5859680175781, "learning_rate": 0.0004501967447123982, "loss": 44.7156, "step": 3943 }, { "epoch": 10.416639154836579, "grad_norm": 423.4031677246094, "learning_rate": 0.0004501717199147862, "loss": 46.504, "step": 3944 }, { "epoch": 10.419280290524926, "grad_norm": 291.990478515625, "learning_rate": 0.00045014668952749647, "loss": 43.8074, "step": 3945 }, { "epoch": 10.421921426213272, "grad_norm": 282.2054748535156, "learning_rate": 0.00045012165355122805, "loss": 40.9332, "step": 3946 }, { "epoch": 10.424562561901618, "grad_norm": 326.75494384765625, "learning_rate": 0.00045009661198667996, "loss": 43.2733, "step": 3947 }, { "epoch": 10.427203697589963, "grad_norm": 291.6916809082031, "learning_rate": 0.00045007156483455155, "loss": 42.285, "step": 3948 }, { "epoch": 10.42984483327831, "grad_norm": 1726.41455078125, "learning_rate": 0.0004500465120955422, "loss": 40.1115, "step": 3949 }, { "epoch": 10.432485968966656, "grad_norm": 330.8326416015625, "learning_rate": 0.00045002145377035153, "loss": 39.9721, "step": 3950 }, { "epoch": 10.435127104655002, "grad_norm": 317.88751220703125, "learning_rate": 0.0004499963898596793, "loss": 39.0143, "step": 3951 }, { "epoch": 10.437768240343347, "grad_norm": 527.4280395507812, "learning_rate": 0.00044997132036422535, "loss": 39.2025, "step": 3952 }, { "epoch": 10.440409376031694, "grad_norm": 293.4837646484375, "learning_rate": 0.00044994624528468975, "loss": 38.7504, "step": 3953 }, { "epoch": 10.44305051172004, "grad_norm": 239.1688995361328, "learning_rate": 0.0004499211646217727, "loss": 38.6323, "step": 3954 }, { "epoch": 10.445691647408385, "grad_norm": 253.00454711914062, "learning_rate": 0.0004498960783761745, "loss": 38.7739, "step": 3955 }, { "epoch": 10.448332783096731, "grad_norm": 320.5574951171875, "learning_rate": 0.0004498709865485958, "loss": 39.3382, "step": 3956 }, { "epoch": 10.450973918785078, "grad_norm": 493.6973876953125, "learning_rate": 0.00044984588913973725, "loss": 38.5423, "step": 3957 }, { "epoch": 10.453615054473424, "grad_norm": 319.1230163574219, "learning_rate": 0.0004498207861502995, "loss": 39.6988, "step": 3958 }, { "epoch": 10.456256190161769, "grad_norm": 4307.68115234375, "learning_rate": 0.00044979567758098385, "loss": 50.2814, "step": 3959 }, { "epoch": 10.458897325850115, "grad_norm": 2680.0576171875, "learning_rate": 0.0004497705634324912, "loss": 81.8632, "step": 3960 }, { "epoch": 10.461538461538462, "grad_norm": 5831.60986328125, "learning_rate": 0.0004497454437055229, "loss": 120.4423, "step": 3961 }, { "epoch": 10.464179597226808, "grad_norm": 5046.58984375, "learning_rate": 0.00044972031840078043, "loss": 97.3413, "step": 3962 }, { "epoch": 10.466820732915153, "grad_norm": 20508.7421875, "learning_rate": 0.00044969518751896537, "loss": 98.2231, "step": 3963 }, { "epoch": 10.4694618686035, "grad_norm": 16357.119140625, "learning_rate": 0.0004496700510607795, "loss": 96.7211, "step": 3964 }, { "epoch": 10.472103004291846, "grad_norm": 3254.083984375, "learning_rate": 0.00044964490902692477, "loss": 98.0901, "step": 3965 }, { "epoch": 10.474744139980192, "grad_norm": 2762.221923828125, "learning_rate": 0.00044961976141810327, "loss": 80.4364, "step": 3966 }, { "epoch": 10.477385275668537, "grad_norm": 6832.34326171875, "learning_rate": 0.0004495946082350172, "loss": 84.6587, "step": 3967 }, { "epoch": 10.480026411356883, "grad_norm": 6556.3408203125, "learning_rate": 0.0004495694494783689, "loss": 58.1595, "step": 3968 }, { "epoch": 10.48266754704523, "grad_norm": 352.49615478515625, "learning_rate": 0.0004495442851488609, "loss": 38.8965, "step": 3969 }, { "epoch": 10.485308682733576, "grad_norm": 288.47137451171875, "learning_rate": 0.00044951911524719607, "loss": 38.2283, "step": 3970 }, { "epoch": 10.48794981842192, "grad_norm": 256.6803894042969, "learning_rate": 0.0004494939397740771, "loss": 38.9195, "step": 3971 }, { "epoch": 10.490590954110267, "grad_norm": 391.89886474609375, "learning_rate": 0.0004494687587302071, "loss": 38.007, "step": 3972 }, { "epoch": 10.493232089798614, "grad_norm": 190.52276611328125, "learning_rate": 0.0004494435721162891, "loss": 38.1358, "step": 3973 }, { "epoch": 10.49587322548696, "grad_norm": 147.6156768798828, "learning_rate": 0.0004494183799330266, "loss": 37.6928, "step": 3974 }, { "epoch": 10.498514361175305, "grad_norm": 220.5774688720703, "learning_rate": 0.00044939318218112286, "loss": 39.6511, "step": 3975 }, { "epoch": 10.501155496863651, "grad_norm": 120.26123046875, "learning_rate": 0.00044936797886128174, "loss": 39.2617, "step": 3976 }, { "epoch": 10.503796632551998, "grad_norm": 147.61041259765625, "learning_rate": 0.0004493427699742069, "loss": 39.2107, "step": 3977 }, { "epoch": 10.506437768240342, "grad_norm": 294.0646057128906, "learning_rate": 0.00044931755552060225, "loss": 38.9095, "step": 3978 }, { "epoch": 10.509078903928689, "grad_norm": 217.86056518554688, "learning_rate": 0.00044929233550117204, "loss": 38.6187, "step": 3979 }, { "epoch": 10.511720039617035, "grad_norm": 159.12351989746094, "learning_rate": 0.00044926710991662034, "loss": 39.3534, "step": 3980 }, { "epoch": 10.514361175305382, "grad_norm": 261.6485290527344, "learning_rate": 0.00044924187876765176, "loss": 38.2011, "step": 3981 }, { "epoch": 10.517002310993727, "grad_norm": 245.95379638671875, "learning_rate": 0.0004492166420549707, "loss": 37.3734, "step": 3982 }, { "epoch": 10.519643446682073, "grad_norm": 243.43406677246094, "learning_rate": 0.00044919139977928193, "loss": 39.7451, "step": 3983 }, { "epoch": 10.52228458237042, "grad_norm": 278.3465881347656, "learning_rate": 0.0004491661519412903, "loss": 39.3076, "step": 3984 }, { "epoch": 10.524925718058766, "grad_norm": 501.3578186035156, "learning_rate": 0.00044914089854170095, "loss": 40.6435, "step": 3985 }, { "epoch": 10.52756685374711, "grad_norm": 415.91064453125, "learning_rate": 0.0004491156395812189, "loss": 43.279, "step": 3986 }, { "epoch": 10.530207989435457, "grad_norm": 383.12078857421875, "learning_rate": 0.00044909037506054975, "loss": 43.3573, "step": 3987 }, { "epoch": 10.532849125123803, "grad_norm": 159.23731994628906, "learning_rate": 0.0004490651049803987, "loss": 42.9949, "step": 3988 }, { "epoch": 10.53549026081215, "grad_norm": 188.3250732421875, "learning_rate": 0.00044903982934147146, "loss": 42.1057, "step": 3989 }, { "epoch": 10.538131396500495, "grad_norm": 553.0594482421875, "learning_rate": 0.00044901454814447406, "loss": 44.2694, "step": 3990 }, { "epoch": 10.540772532188841, "grad_norm": 242.44027709960938, "learning_rate": 0.0004489892613901122, "loss": 44.0602, "step": 3991 }, { "epoch": 10.543413667877187, "grad_norm": 439.88372802734375, "learning_rate": 0.00044896396907909223, "loss": 48.8888, "step": 3992 }, { "epoch": 10.546054803565534, "grad_norm": 526.1802978515625, "learning_rate": 0.0004489386712121202, "loss": 43.7602, "step": 3993 }, { "epoch": 10.548695939253879, "grad_norm": 341.88494873046875, "learning_rate": 0.00044891336778990265, "loss": 46.3432, "step": 3994 }, { "epoch": 10.551337074942225, "grad_norm": 185.23008728027344, "learning_rate": 0.00044888805881314613, "loss": 43.3317, "step": 3995 }, { "epoch": 10.553978210630572, "grad_norm": 321.0625305175781, "learning_rate": 0.0004488627442825575, "loss": 45.2237, "step": 3996 }, { "epoch": 10.556619346318918, "grad_norm": 269.01214599609375, "learning_rate": 0.0004488374241988434, "loss": 42.9077, "step": 3997 }, { "epoch": 10.559260482007263, "grad_norm": 291.06787109375, "learning_rate": 0.00044881209856271115, "loss": 42.3302, "step": 3998 }, { "epoch": 10.561901617695609, "grad_norm": 216.21865844726562, "learning_rate": 0.0004487867673748678, "loss": 40.8751, "step": 3999 }, { "epoch": 10.564542753383956, "grad_norm": 209.73785400390625, "learning_rate": 0.00044876143063602076, "loss": 39.6023, "step": 4000 }, { "epoch": 10.564542753383956, "eval_loss": 5.190937519073486, "eval_runtime": 2.131, "eval_samples_per_second": 232.291, "eval_steps_per_second": 29.095, "step": 4000 }, { "epoch": 10.5671838890723, "grad_norm": 154.81234741210938, "learning_rate": 0.00044873608834687754, "loss": 38.0614, "step": 4001 }, { "epoch": 10.569825024760647, "grad_norm": 173.68931579589844, "learning_rate": 0.00044871074050814575, "loss": 39.2712, "step": 4002 }, { "epoch": 10.572466160448993, "grad_norm": 202.14739990234375, "learning_rate": 0.0004486853871205333, "loss": 37.3418, "step": 4003 }, { "epoch": 10.57510729613734, "grad_norm": 251.03407287597656, "learning_rate": 0.0004486600281847482, "loss": 38.1131, "step": 4004 }, { "epoch": 10.577748431825684, "grad_norm": 238.24728393554688, "learning_rate": 0.0004486346637014984, "loss": 38.7589, "step": 4005 }, { "epoch": 10.58038956751403, "grad_norm": 273.18023681640625, "learning_rate": 0.0004486092936714923, "loss": 38.5251, "step": 4006 }, { "epoch": 10.583030703202377, "grad_norm": 233.04762268066406, "learning_rate": 0.00044858391809543834, "loss": 38.2392, "step": 4007 }, { "epoch": 10.585671838890724, "grad_norm": 219.4154815673828, "learning_rate": 0.00044855853697404514, "loss": 39.5895, "step": 4008 }, { "epoch": 10.588312974579068, "grad_norm": 1408.895263671875, "learning_rate": 0.00044853315030802144, "loss": 62.4632, "step": 4009 }, { "epoch": 10.590954110267415, "grad_norm": 2027.8033447265625, "learning_rate": 0.0004485077580980762, "loss": 52.7583, "step": 4010 }, { "epoch": 10.593595245955761, "grad_norm": 6257.5419921875, "learning_rate": 0.00044848236034491834, "loss": 46.9638, "step": 4011 }, { "epoch": 10.596236381644108, "grad_norm": 1719.7667236328125, "learning_rate": 0.0004484569570492572, "loss": 45.6076, "step": 4012 }, { "epoch": 10.598877517332452, "grad_norm": 3494.13330078125, "learning_rate": 0.0004484315482118021, "loss": 31.4196, "step": 4013 }, { "epoch": 10.601518653020799, "grad_norm": 2970.76904296875, "learning_rate": 0.0004484061338332626, "loss": 31.0632, "step": 4014 }, { "epoch": 10.604159788709145, "grad_norm": 411.50579833984375, "learning_rate": 0.0004483807139143484, "loss": 23.9203, "step": 4015 }, { "epoch": 10.606800924397492, "grad_norm": 679.9797973632812, "learning_rate": 0.00044835528845576925, "loss": 29.8555, "step": 4016 }, { "epoch": 10.609442060085836, "grad_norm": 1033.2852783203125, "learning_rate": 0.0004483298574582352, "loss": 25.4486, "step": 4017 }, { "epoch": 10.612083195774183, "grad_norm": 674.4724731445312, "learning_rate": 0.0004483044209224564, "loss": 36.8104, "step": 4018 }, { "epoch": 10.61472433146253, "grad_norm": 1158.5262451171875, "learning_rate": 0.00044827897884914305, "loss": 55.8826, "step": 4019 }, { "epoch": 10.617365467150876, "grad_norm": 968.438232421875, "learning_rate": 0.0004482535312390058, "loss": 50.2396, "step": 4020 }, { "epoch": 10.62000660283922, "grad_norm": 502.2772521972656, "learning_rate": 0.00044822807809275514, "loss": 44.8657, "step": 4021 }, { "epoch": 10.622647738527567, "grad_norm": 320.421875, "learning_rate": 0.0004482026194111018, "loss": 44.4899, "step": 4022 }, { "epoch": 10.625288874215913, "grad_norm": 313.5997009277344, "learning_rate": 0.0004481771551947567, "loss": 43.7678, "step": 4023 }, { "epoch": 10.627930009904258, "grad_norm": 325.61566162109375, "learning_rate": 0.00044815168544443107, "loss": 42.0827, "step": 4024 }, { "epoch": 10.630571145592604, "grad_norm": 154.5929412841797, "learning_rate": 0.0004481262101608359, "loss": 40.6997, "step": 4025 }, { "epoch": 10.63321228128095, "grad_norm": 334.365478515625, "learning_rate": 0.00044810072934468284, "loss": 39.0157, "step": 4026 }, { "epoch": 10.635853416969297, "grad_norm": 183.95082092285156, "learning_rate": 0.0004480752429966832, "loss": 38.9271, "step": 4027 }, { "epoch": 10.638494552657642, "grad_norm": 211.1219940185547, "learning_rate": 0.00044804975111754875, "loss": 39.3098, "step": 4028 }, { "epoch": 10.641135688345988, "grad_norm": 445.1644287109375, "learning_rate": 0.0004480242537079914, "loss": 40.1944, "step": 4029 }, { "epoch": 10.643776824034335, "grad_norm": 344.72442626953125, "learning_rate": 0.000447998750768723, "loss": 39.2828, "step": 4030 }, { "epoch": 10.646417959722681, "grad_norm": 278.92236328125, "learning_rate": 0.00044797324230045585, "loss": 40.5435, "step": 4031 }, { "epoch": 10.649059095411026, "grad_norm": 1344.7225341796875, "learning_rate": 0.0004479477283039022, "loss": 41.541, "step": 4032 }, { "epoch": 10.651700231099372, "grad_norm": 324.4949951171875, "learning_rate": 0.0004479222087797745, "loss": 39.1924, "step": 4033 }, { "epoch": 10.654341366787719, "grad_norm": 554.3349609375, "learning_rate": 0.0004478966837287854, "loss": 40.3616, "step": 4034 }, { "epoch": 10.656982502476065, "grad_norm": 360.72442626953125, "learning_rate": 0.0004478711531516477, "loss": 39.8372, "step": 4035 }, { "epoch": 10.65962363816441, "grad_norm": 518.1167602539062, "learning_rate": 0.0004478456170490742, "loss": 45.9998, "step": 4036 }, { "epoch": 10.662264773852757, "grad_norm": 505.97100830078125, "learning_rate": 0.0004478200754217781, "loss": 42.8899, "step": 4037 }, { "epoch": 10.664905909541103, "grad_norm": 273.33154296875, "learning_rate": 0.0004477945282704726, "loss": 42.8385, "step": 4038 }, { "epoch": 10.66754704522945, "grad_norm": 272.26776123046875, "learning_rate": 0.00044776897559587115, "loss": 41.3195, "step": 4039 }, { "epoch": 10.670188180917794, "grad_norm": 253.38949584960938, "learning_rate": 0.00044774341739868713, "loss": 42.305, "step": 4040 }, { "epoch": 10.67282931660614, "grad_norm": 218.10757446289062, "learning_rate": 0.00044771785367963434, "loss": 46.839, "step": 4041 }, { "epoch": 10.675470452294487, "grad_norm": 275.40325927734375, "learning_rate": 0.0004476922844394268, "loss": 44.2689, "step": 4042 }, { "epoch": 10.678111587982833, "grad_norm": 174.83592224121094, "learning_rate": 0.00044766670967877814, "loss": 45.0393, "step": 4043 }, { "epoch": 10.680752723671178, "grad_norm": 400.9130554199219, "learning_rate": 0.00044764112939840283, "loss": 42.1525, "step": 4044 }, { "epoch": 10.683393859359525, "grad_norm": 156.74124145507812, "learning_rate": 0.00044761554359901503, "loss": 43.3973, "step": 4045 }, { "epoch": 10.686034995047871, "grad_norm": 260.9909973144531, "learning_rate": 0.0004475899522813294, "loss": 42.184, "step": 4046 }, { "epoch": 10.688676130736216, "grad_norm": 163.87667846679688, "learning_rate": 0.00044756435544606024, "loss": 41.4939, "step": 4047 }, { "epoch": 10.691317266424562, "grad_norm": 185.98583984375, "learning_rate": 0.0004475387530939226, "loss": 41.072, "step": 4048 }, { "epoch": 10.693958402112909, "grad_norm": 197.4683380126953, "learning_rate": 0.00044751314522563135, "loss": 42.3255, "step": 4049 }, { "epoch": 10.696599537801255, "grad_norm": 133.27816772460938, "learning_rate": 0.0004474875318419015, "loss": 40.165, "step": 4050 }, { "epoch": 10.6992406734896, "grad_norm": 193.50448608398438, "learning_rate": 0.0004474619129434483, "loss": 39.3515, "step": 4051 }, { "epoch": 10.701881809177946, "grad_norm": 207.72720336914062, "learning_rate": 0.00044743628853098725, "loss": 39.1964, "step": 4052 }, { "epoch": 10.704522944866293, "grad_norm": 149.70101928710938, "learning_rate": 0.0004474106586052338, "loss": 40.2601, "step": 4053 }, { "epoch": 10.707164080554639, "grad_norm": 154.10690307617188, "learning_rate": 0.0004473850231669037, "loss": 38.0526, "step": 4054 }, { "epoch": 10.709805216242984, "grad_norm": 263.61077880859375, "learning_rate": 0.0004473593822167127, "loss": 37.782, "step": 4055 }, { "epoch": 10.71244635193133, "grad_norm": 200.6097869873047, "learning_rate": 0.0004473337357553769, "loss": 39.2944, "step": 4056 }, { "epoch": 10.715087487619677, "grad_norm": 204.5486297607422, "learning_rate": 0.0004473080837836124, "loss": 40.2096, "step": 4057 }, { "epoch": 10.717728623308023, "grad_norm": 1721.49560546875, "learning_rate": 0.00044728242630213565, "loss": 81.6413, "step": 4058 }, { "epoch": 10.720369758996368, "grad_norm": 1862.4951171875, "learning_rate": 0.0004472567633116631, "loss": 91.1971, "step": 4059 }, { "epoch": 10.723010894684714, "grad_norm": 2220.6005859375, "learning_rate": 0.00044723109481291124, "loss": 81.4553, "step": 4060 }, { "epoch": 10.72565203037306, "grad_norm": 2245.86474609375, "learning_rate": 0.00044720542080659687, "loss": 67.741, "step": 4061 }, { "epoch": 10.728293166061407, "grad_norm": 4246.9130859375, "learning_rate": 0.0004471797412934371, "loss": 57.3858, "step": 4062 }, { "epoch": 10.730934301749752, "grad_norm": 2044.18212890625, "learning_rate": 0.00044715405627414876, "loss": 50.7635, "step": 4063 }, { "epoch": 10.733575437438098, "grad_norm": 4571.02880859375, "learning_rate": 0.00044712836574944926, "loss": 37.8827, "step": 4064 }, { "epoch": 10.736216573126445, "grad_norm": 1390.9847412109375, "learning_rate": 0.00044710266972005595, "loss": 28.3619, "step": 4065 }, { "epoch": 10.738857708814791, "grad_norm": 1198.3861083984375, "learning_rate": 0.0004470769681866863, "loss": 29.5542, "step": 4066 }, { "epoch": 10.741498844503136, "grad_norm": 814.3239135742188, "learning_rate": 0.00044705126115005813, "loss": 20.0855, "step": 4067 }, { "epoch": 10.744139980191482, "grad_norm": 369.8492126464844, "learning_rate": 0.0004470255486108893, "loss": 35.771, "step": 4068 }, { "epoch": 10.746781115879829, "grad_norm": 524.4271240234375, "learning_rate": 0.0004469998305698977, "loss": 50.9064, "step": 4069 }, { "epoch": 10.749422251568173, "grad_norm": 592.4837646484375, "learning_rate": 0.0004469741070278016, "loss": 52.1457, "step": 4070 }, { "epoch": 10.75206338725652, "grad_norm": 328.6641540527344, "learning_rate": 0.0004469483779853192, "loss": 45.5579, "step": 4071 }, { "epoch": 10.754704522944866, "grad_norm": 217.02188110351562, "learning_rate": 0.0004469226434431691, "loss": 41.5535, "step": 4072 }, { "epoch": 10.757345658633213, "grad_norm": 237.90090942382812, "learning_rate": 0.0004468969034020699, "loss": 39.6762, "step": 4073 }, { "epoch": 10.759986794321557, "grad_norm": 177.21206665039062, "learning_rate": 0.00044687115786274024, "loss": 40.7652, "step": 4074 }, { "epoch": 10.762627930009904, "grad_norm": 218.08688354492188, "learning_rate": 0.0004468454068258992, "loss": 37.5928, "step": 4075 }, { "epoch": 10.76526906569825, "grad_norm": 310.25457763671875, "learning_rate": 0.0004468196502922658, "loss": 37.7802, "step": 4076 }, { "epoch": 10.767910201386597, "grad_norm": 362.1049499511719, "learning_rate": 0.0004467938882625593, "loss": 40.24, "step": 4077 }, { "epoch": 10.770551337074942, "grad_norm": 267.25341796875, "learning_rate": 0.000446768120737499, "loss": 38.0832, "step": 4078 }, { "epoch": 10.773192472763288, "grad_norm": 240.22891235351562, "learning_rate": 0.0004467423477178046, "loss": 38.9628, "step": 4079 }, { "epoch": 10.775833608451634, "grad_norm": 348.68927001953125, "learning_rate": 0.00044671656920419566, "loss": 39.2362, "step": 4080 }, { "epoch": 10.77847474413998, "grad_norm": 283.308837890625, "learning_rate": 0.00044669078519739215, "loss": 37.5703, "step": 4081 }, { "epoch": 10.781115879828326, "grad_norm": 543.721923828125, "learning_rate": 0.0004466649956981139, "loss": 38.7452, "step": 4082 }, { "epoch": 10.783757015516672, "grad_norm": 300.5260009765625, "learning_rate": 0.00044663920070708125, "loss": 38.8872, "step": 4083 }, { "epoch": 10.786398151205018, "grad_norm": 288.15789794921875, "learning_rate": 0.00044661340022501437, "loss": 40.0597, "step": 4084 }, { "epoch": 10.789039286893365, "grad_norm": 428.4024353027344, "learning_rate": 0.00044658759425263383, "loss": 39.4043, "step": 4085 }, { "epoch": 10.79168042258171, "grad_norm": 397.35791015625, "learning_rate": 0.00044656178279066025, "loss": 42.9037, "step": 4086 }, { "epoch": 10.794321558270056, "grad_norm": 181.40635681152344, "learning_rate": 0.00044653596583981434, "loss": 43.0525, "step": 4087 }, { "epoch": 10.796962693958402, "grad_norm": 187.37010192871094, "learning_rate": 0.000446510143400817, "loss": 41.5764, "step": 4088 }, { "epoch": 10.799603829646749, "grad_norm": 277.7115173339844, "learning_rate": 0.00044648431547438926, "loss": 41.5998, "step": 4089 }, { "epoch": 10.802244965335094, "grad_norm": 166.43081665039062, "learning_rate": 0.00044645848206125257, "loss": 44.4735, "step": 4090 }, { "epoch": 10.80488610102344, "grad_norm": 368.808349609375, "learning_rate": 0.00044643264316212807, "loss": 44.0775, "step": 4091 }, { "epoch": 10.807527236711787, "grad_norm": 205.32582092285156, "learning_rate": 0.00044640679877773747, "loss": 47.0062, "step": 4092 }, { "epoch": 10.810168372400131, "grad_norm": 279.7489929199219, "learning_rate": 0.00044638094890880236, "loss": 44.4062, "step": 4093 }, { "epoch": 10.812809508088478, "grad_norm": 262.392333984375, "learning_rate": 0.00044635509355604464, "loss": 43.0659, "step": 4094 }, { "epoch": 10.815450643776824, "grad_norm": 364.9067077636719, "learning_rate": 0.0004463292327201862, "loss": 44.3644, "step": 4095 }, { "epoch": 10.81809177946517, "grad_norm": 335.19189453125, "learning_rate": 0.00044630336640194936, "loss": 44.0713, "step": 4096 }, { "epoch": 10.820732915153515, "grad_norm": 185.21151733398438, "learning_rate": 0.0004462774946020563, "loss": 42.8397, "step": 4097 }, { "epoch": 10.823374050841862, "grad_norm": 173.04090881347656, "learning_rate": 0.0004462516173212295, "loss": 39.6833, "step": 4098 }, { "epoch": 10.826015186530208, "grad_norm": 279.04364013671875, "learning_rate": 0.00044622573456019155, "loss": 41.2857, "step": 4099 }, { "epoch": 10.828656322218555, "grad_norm": 349.71905517578125, "learning_rate": 0.00044619984631966527, "loss": 39.4439, "step": 4100 }, { "epoch": 10.8312974579069, "grad_norm": 162.2829132080078, "learning_rate": 0.00044617395260037354, "loss": 39.4748, "step": 4101 }, { "epoch": 10.833938593595246, "grad_norm": 99.62092590332031, "learning_rate": 0.0004461480534030393, "loss": 38.7467, "step": 4102 }, { "epoch": 10.836579729283592, "grad_norm": 222.9463348388672, "learning_rate": 0.0004461221487283861, "loss": 37.8039, "step": 4103 }, { "epoch": 10.839220864971939, "grad_norm": 196.4410858154297, "learning_rate": 0.0004460962385771369, "loss": 38.2556, "step": 4104 }, { "epoch": 10.841862000660283, "grad_norm": 178.05169677734375, "learning_rate": 0.00044607032295001554, "loss": 36.7202, "step": 4105 }, { "epoch": 10.84450313634863, "grad_norm": 106.30889129638672, "learning_rate": 0.0004460444018477456, "loss": 38.6175, "step": 4106 }, { "epoch": 10.847144272036976, "grad_norm": 242.69772338867188, "learning_rate": 0.0004460184752710509, "loss": 38.4789, "step": 4107 }, { "epoch": 10.849785407725323, "grad_norm": 571.1672973632812, "learning_rate": 0.00044599254322065534, "loss": 58.291, "step": 4108 }, { "epoch": 10.852426543413667, "grad_norm": 1494.7607421875, "learning_rate": 0.0004459666056972832, "loss": 85.4623, "step": 4109 }, { "epoch": 10.855067679102014, "grad_norm": 1944.026123046875, "learning_rate": 0.0004459406627016587, "loss": 81.9182, "step": 4110 }, { "epoch": 10.85770881479036, "grad_norm": 1411.5413818359375, "learning_rate": 0.00044591471423450634, "loss": 76.7136, "step": 4111 }, { "epoch": 10.860349950478707, "grad_norm": 1328.5421142578125, "learning_rate": 0.0004458887602965507, "loss": 58.9832, "step": 4112 }, { "epoch": 10.862991086167051, "grad_norm": 3017.698486328125, "learning_rate": 0.00044586280088851636, "loss": 50.8182, "step": 4113 }, { "epoch": 10.865632221855398, "grad_norm": 1085.6859130859375, "learning_rate": 0.00044583683601112845, "loss": 36.8734, "step": 4114 }, { "epoch": 10.868273357543744, "grad_norm": 950.5670776367188, "learning_rate": 0.00044581086566511196, "loss": 24.5331, "step": 4115 }, { "epoch": 10.870914493232089, "grad_norm": 490.8705139160156, "learning_rate": 0.00044578488985119204, "loss": 22.339, "step": 4116 }, { "epoch": 10.873555628920435, "grad_norm": 1101.2750244140625, "learning_rate": 0.00044575890857009404, "loss": 28.6848, "step": 4117 }, { "epoch": 10.876196764608782, "grad_norm": 667.5849609375, "learning_rate": 0.00044573292182254354, "loss": 47.5001, "step": 4118 }, { "epoch": 10.878837900297128, "grad_norm": 838.7893676757812, "learning_rate": 0.0004457069296092662, "loss": 55.3331, "step": 4119 }, { "epoch": 10.881479035985473, "grad_norm": 737.8836059570312, "learning_rate": 0.00044568093193098784, "loss": 49.4437, "step": 4120 }, { "epoch": 10.88412017167382, "grad_norm": 273.5928955078125, "learning_rate": 0.00044565492878843437, "loss": 42.4895, "step": 4121 }, { "epoch": 10.886761307362166, "grad_norm": 172.0184326171875, "learning_rate": 0.00044562892018233197, "loss": 42.0576, "step": 4122 }, { "epoch": 10.889402443050512, "grad_norm": 126.30522918701172, "learning_rate": 0.0004456029061134069, "loss": 41.2046, "step": 4123 }, { "epoch": 10.892043578738857, "grad_norm": 430.9688720703125, "learning_rate": 0.0004455768865823855, "loss": 39.7919, "step": 4124 }, { "epoch": 10.894684714427203, "grad_norm": 179.47972106933594, "learning_rate": 0.00044555086158999446, "loss": 38.924, "step": 4125 }, { "epoch": 10.89732585011555, "grad_norm": 181.7604522705078, "learning_rate": 0.0004455248311369605, "loss": 38.5599, "step": 4126 }, { "epoch": 10.899966985803896, "grad_norm": 135.71226501464844, "learning_rate": 0.00044549879522401057, "loss": 37.4015, "step": 4127 }, { "epoch": 10.902608121492241, "grad_norm": 292.3005065917969, "learning_rate": 0.0004454727538518715, "loss": 39.3457, "step": 4128 }, { "epoch": 10.905249257180587, "grad_norm": 150.58944702148438, "learning_rate": 0.00044544670702127063, "loss": 41.6452, "step": 4129 }, { "epoch": 10.907890392868934, "grad_norm": 241.89047241210938, "learning_rate": 0.0004454206547329354, "loss": 39.2667, "step": 4130 }, { "epoch": 10.91053152855728, "grad_norm": 228.01425170898438, "learning_rate": 0.000445394596987593, "loss": 39.5592, "step": 4131 }, { "epoch": 10.913172664245625, "grad_norm": 207.5089569091797, "learning_rate": 0.00044536853378597134, "loss": 39.2829, "step": 4132 }, { "epoch": 10.915813799933971, "grad_norm": 289.5299072265625, "learning_rate": 0.00044534246512879815, "loss": 37.6675, "step": 4133 }, { "epoch": 10.918454935622318, "grad_norm": 363.7677917480469, "learning_rate": 0.0004453163910168014, "loss": 39.6885, "step": 4134 }, { "epoch": 10.921096071310664, "grad_norm": 152.5277099609375, "learning_rate": 0.00044529031145070907, "loss": 38.8413, "step": 4135 }, { "epoch": 10.923737206999009, "grad_norm": 235.01548767089844, "learning_rate": 0.00044526422643124963, "loss": 41.4729, "step": 4136 }, { "epoch": 10.926378342687356, "grad_norm": 312.9405517578125, "learning_rate": 0.0004452381359591513, "loss": 43.2417, "step": 4137 }, { "epoch": 10.929019478375702, "grad_norm": 177.0260772705078, "learning_rate": 0.00044521204003514274, "loss": 42.3452, "step": 4138 }, { "epoch": 10.931660614064047, "grad_norm": 131.7273406982422, "learning_rate": 0.0004451859386599526, "loss": 42.1951, "step": 4139 }, { "epoch": 10.934301749752393, "grad_norm": 142.0334014892578, "learning_rate": 0.00044515983183430986, "loss": 43.0209, "step": 4140 }, { "epoch": 10.93694288544074, "grad_norm": 117.36882781982422, "learning_rate": 0.00044513371955894336, "loss": 41.801, "step": 4141 }, { "epoch": 10.939584021129086, "grad_norm": 160.8968963623047, "learning_rate": 0.0004451076018345824, "loss": 41.0689, "step": 4142 }, { "epoch": 10.94222515681743, "grad_norm": 183.59242248535156, "learning_rate": 0.0004450814786619564, "loss": 42.2931, "step": 4143 }, { "epoch": 10.944866292505777, "grad_norm": 100.41038513183594, "learning_rate": 0.0004450553500417945, "loss": 39.8635, "step": 4144 }, { "epoch": 10.947507428194124, "grad_norm": 97.22856140136719, "learning_rate": 0.00044502921597482667, "loss": 38.8493, "step": 4145 }, { "epoch": 10.95014856388247, "grad_norm": 94.73809814453125, "learning_rate": 0.0004450030764617825, "loss": 37.561, "step": 4146 }, { "epoch": 10.952789699570815, "grad_norm": 97.17437744140625, "learning_rate": 0.0004449769315033919, "loss": 37.4117, "step": 4147 }, { "epoch": 10.955430835259161, "grad_norm": 82.73017120361328, "learning_rate": 0.0004449507811003851, "loss": 36.5611, "step": 4148 }, { "epoch": 10.958071970947508, "grad_norm": 961.617919921875, "learning_rate": 0.0004449246252534923, "loss": 63.7447, "step": 4149 }, { "epoch": 10.960713106635854, "grad_norm": 2021.0103759765625, "learning_rate": 0.0004448984639634438, "loss": 77.8695, "step": 4150 }, { "epoch": 10.963354242324199, "grad_norm": 1259.802490234375, "learning_rate": 0.0004448722972309702, "loss": 81.2199, "step": 4151 }, { "epoch": 10.965995378012545, "grad_norm": 4268.7939453125, "learning_rate": 0.0004448461250568021, "loss": 64.5124, "step": 4152 }, { "epoch": 10.968636513700892, "grad_norm": 1401.1666259765625, "learning_rate": 0.0004448199474416705, "loss": 53.8122, "step": 4153 }, { "epoch": 10.971277649389238, "grad_norm": 1753.1717529296875, "learning_rate": 0.00044479376438630633, "loss": 46.0066, "step": 4154 }, { "epoch": 10.973918785077583, "grad_norm": 590.483642578125, "learning_rate": 0.0004447675758914407, "loss": 39.9309, "step": 4155 }, { "epoch": 10.97655992076593, "grad_norm": 157.0719451904297, "learning_rate": 0.00044474138195780487, "loss": 40.4717, "step": 4156 }, { "epoch": 10.979201056454276, "grad_norm": 177.15621948242188, "learning_rate": 0.00044471518258613034, "loss": 39.4044, "step": 4157 }, { "epoch": 10.981842192142622, "grad_norm": 236.73597717285156, "learning_rate": 0.0004446889777771488, "loss": 39.7763, "step": 4158 }, { "epoch": 10.984483327830967, "grad_norm": 124.30425262451172, "learning_rate": 0.00044466276753159185, "loss": 39.2561, "step": 4159 }, { "epoch": 10.987124463519313, "grad_norm": 156.6599884033203, "learning_rate": 0.00044463655185019146, "loss": 36.9692, "step": 4160 }, { "epoch": 10.98976559920766, "grad_norm": 156.1059112548828, "learning_rate": 0.0004446103307336797, "loss": 38.5042, "step": 4161 }, { "epoch": 10.992406734896004, "grad_norm": 181.2158660888672, "learning_rate": 0.00044458410418278884, "loss": 37.8499, "step": 4162 }, { "epoch": 10.99504787058435, "grad_norm": 269.2628479003906, "learning_rate": 0.00044455787219825117, "loss": 37.3619, "step": 4163 }, { "epoch": 10.997689006272697, "grad_norm": 554.244140625, "learning_rate": 0.0004445316347807992, "loss": 39.6766, "step": 4164 }, { "epoch": 11.000330141961044, "grad_norm": 544.17333984375, "learning_rate": 0.00044450539193116557, "loss": 55.4815, "step": 4165 }, { "epoch": 11.002971277649388, "grad_norm": 1455.69140625, "learning_rate": 0.0004444791436500831, "loss": 90.9052, "step": 4166 }, { "epoch": 11.005612413337735, "grad_norm": 1212.762451171875, "learning_rate": 0.0004444528899382848, "loss": 76.4164, "step": 4167 }, { "epoch": 11.008253549026081, "grad_norm": 816.6371459960938, "learning_rate": 0.0004444266307965038, "loss": 50.6447, "step": 4168 }, { "epoch": 11.010894684714428, "grad_norm": 195.79298400878906, "learning_rate": 0.0004444003662254734, "loss": 45.338, "step": 4169 }, { "epoch": 11.013535820402772, "grad_norm": 172.62684631347656, "learning_rate": 0.0004443740962259269, "loss": 42.8867, "step": 4170 }, { "epoch": 11.016176956091119, "grad_norm": 214.98101806640625, "learning_rate": 0.000444347820798598, "loss": 42.104, "step": 4171 }, { "epoch": 11.018818091779465, "grad_norm": 130.90859985351562, "learning_rate": 0.0004443215399442203, "loss": 45.6109, "step": 4172 }, { "epoch": 11.021459227467812, "grad_norm": 121.78900146484375, "learning_rate": 0.0004442952536635277, "loss": 44.5716, "step": 4173 }, { "epoch": 11.024100363156156, "grad_norm": 167.57765197753906, "learning_rate": 0.0004442689619572544, "loss": 44.2284, "step": 4174 }, { "epoch": 11.026741498844503, "grad_norm": 127.38481140136719, "learning_rate": 0.00044424266482613443, "loss": 40.5501, "step": 4175 }, { "epoch": 11.02938263453285, "grad_norm": 147.17434692382812, "learning_rate": 0.0004442163622709022, "loss": 40.8344, "step": 4176 }, { "epoch": 11.032023770221196, "grad_norm": 158.06982421875, "learning_rate": 0.0004441900542922921, "loss": 39.7822, "step": 4177 }, { "epoch": 11.03466490590954, "grad_norm": 147.1549530029297, "learning_rate": 0.0004441637408910387, "loss": 41.5521, "step": 4178 }, { "epoch": 11.037306041597887, "grad_norm": 59.09796905517578, "learning_rate": 0.000444137422067877, "loss": 38.2747, "step": 4179 }, { "epoch": 11.039947177286233, "grad_norm": 81.009033203125, "learning_rate": 0.0004441110978235418, "loss": 37.9112, "step": 4180 }, { "epoch": 11.04258831297458, "grad_norm": 103.68826293945312, "learning_rate": 0.00044408476815876823, "loss": 39.3287, "step": 4181 }, { "epoch": 11.045229448662925, "grad_norm": 76.50444793701172, "learning_rate": 0.0004440584330742915, "loss": 38.3034, "step": 4182 }, { "epoch": 11.047870584351271, "grad_norm": 254.7351837158203, "learning_rate": 0.0004440320925708471, "loss": 37.2415, "step": 4183 }, { "epoch": 11.050511720039617, "grad_norm": 114.00656127929688, "learning_rate": 0.0004440057466491704, "loss": 39.2838, "step": 4184 }, { "epoch": 11.053152855727962, "grad_norm": 72.39910125732422, "learning_rate": 0.0004439793953099972, "loss": 37.1456, "step": 4185 }, { "epoch": 11.055793991416309, "grad_norm": 252.9908447265625, "learning_rate": 0.00044395303855406345, "loss": 38.9917, "step": 4186 }, { "epoch": 11.058435127104655, "grad_norm": 627.9036254882812, "learning_rate": 0.00044392667638210487, "loss": 62.6708, "step": 4187 }, { "epoch": 11.061076262793001, "grad_norm": 1501.3328857421875, "learning_rate": 0.00044390030879485785, "loss": 122.4178, "step": 4188 }, { "epoch": 11.063717398481346, "grad_norm": 1004.9485473632812, "learning_rate": 0.0004438739357930586, "loss": 116.7045, "step": 4189 }, { "epoch": 11.066358534169693, "grad_norm": 4438.23095703125, "learning_rate": 0.00044384755737744366, "loss": 103.1891, "step": 4190 }, { "epoch": 11.068999669858039, "grad_norm": 1499.5443115234375, "learning_rate": 0.0004438211735487494, "loss": 124.8908, "step": 4191 }, { "epoch": 11.071640805546386, "grad_norm": 1148.3558349609375, "learning_rate": 0.00044379478430771286, "loss": 90.9008, "step": 4192 }, { "epoch": 11.07428194123473, "grad_norm": 2257.913818359375, "learning_rate": 0.0004437683896550707, "loss": 87.6497, "step": 4193 }, { "epoch": 11.076923076923077, "grad_norm": 1029.2493896484375, "learning_rate": 0.0004437419895915602, "loss": 74.5694, "step": 4194 }, { "epoch": 11.079564212611423, "grad_norm": 911.5936889648438, "learning_rate": 0.00044371558411791833, "loss": 60.9613, "step": 4195 }, { "epoch": 11.08220534829977, "grad_norm": 1160.222412109375, "learning_rate": 0.0004436891732348827, "loss": 40.2685, "step": 4196 }, { "epoch": 11.084846483988114, "grad_norm": 569.940673828125, "learning_rate": 0.0004436627569431906, "loss": 35.7612, "step": 4197 }, { "epoch": 11.08748761967646, "grad_norm": 981.3412475585938, "learning_rate": 0.00044363633524357977, "loss": 60.6991, "step": 4198 }, { "epoch": 11.090128755364807, "grad_norm": 1211.3720703125, "learning_rate": 0.000443609908136788, "loss": 76.2771, "step": 4199 }, { "epoch": 11.092769891053154, "grad_norm": 1327.40478515625, "learning_rate": 0.0004435834756235534, "loss": 80.3153, "step": 4200 }, { "epoch": 11.092769891053154, "eval_loss": 6.460549831390381, "eval_runtime": 2.2312, "eval_samples_per_second": 221.85, "eval_steps_per_second": 27.787, "step": 4200 }, { "epoch": 11.095411026741498, "grad_norm": 1014.016357421875, "learning_rate": 0.00044355703770461387, "loss": 64.0615, "step": 4201 }, { "epoch": 11.098052162429845, "grad_norm": 743.9364624023438, "learning_rate": 0.00044353059438070777, "loss": 52.8333, "step": 4202 }, { "epoch": 11.100693298118191, "grad_norm": 283.79840087890625, "learning_rate": 0.0004435041456525735, "loss": 45.5595, "step": 4203 }, { "epoch": 11.103334433806538, "grad_norm": 197.5261993408203, "learning_rate": 0.0004434776915209496, "loss": 40.9766, "step": 4204 }, { "epoch": 11.105975569494882, "grad_norm": 258.0011291503906, "learning_rate": 0.00044345123198657495, "loss": 39.9067, "step": 4205 }, { "epoch": 11.108616705183229, "grad_norm": 169.4458770751953, "learning_rate": 0.0004434247670501882, "loss": 39.7363, "step": 4206 }, { "epoch": 11.111257840871575, "grad_norm": 305.0285339355469, "learning_rate": 0.0004433982967125285, "loss": 39.5597, "step": 4207 }, { "epoch": 11.11389897655992, "grad_norm": 215.94300842285156, "learning_rate": 0.00044337182097433487, "loss": 39.0486, "step": 4208 }, { "epoch": 11.116540112248266, "grad_norm": 175.06668090820312, "learning_rate": 0.0004433453398363468, "loss": 38.8538, "step": 4209 }, { "epoch": 11.119181247936613, "grad_norm": 179.7303924560547, "learning_rate": 0.00044331885329930367, "loss": 38.1909, "step": 4210 }, { "epoch": 11.12182238362496, "grad_norm": 424.495849609375, "learning_rate": 0.00044329236136394514, "loss": 38.5219, "step": 4211 }, { "epoch": 11.124463519313304, "grad_norm": 312.8796081542969, "learning_rate": 0.000443265864031011, "loss": 38.8274, "step": 4212 }, { "epoch": 11.12710465500165, "grad_norm": 184.31161499023438, "learning_rate": 0.0004432393613012411, "loss": 39.782, "step": 4213 }, { "epoch": 11.129745790689997, "grad_norm": 367.04473876953125, "learning_rate": 0.0004432128531753755, "loss": 41.0212, "step": 4214 }, { "epoch": 11.132386926378343, "grad_norm": 409.0860595703125, "learning_rate": 0.00044318633965415456, "loss": 45.5512, "step": 4215 }, { "epoch": 11.135028062066688, "grad_norm": 193.08169555664062, "learning_rate": 0.00044315982073831855, "loss": 42.4386, "step": 4216 }, { "epoch": 11.137669197755034, "grad_norm": 301.2242736816406, "learning_rate": 0.000443133296428608, "loss": 42.001, "step": 4217 }, { "epoch": 11.14031033344338, "grad_norm": 131.35397338867188, "learning_rate": 0.0004431067667257636, "loss": 39.7676, "step": 4218 }, { "epoch": 11.142951469131727, "grad_norm": 182.76695251464844, "learning_rate": 0.00044308023163052617, "loss": 42.548, "step": 4219 }, { "epoch": 11.145592604820072, "grad_norm": 126.93172454833984, "learning_rate": 0.0004430536911436367, "loss": 45.5464, "step": 4220 }, { "epoch": 11.148233740508418, "grad_norm": 285.5293884277344, "learning_rate": 0.00044302714526583633, "loss": 44.9755, "step": 4221 }, { "epoch": 11.150874876196765, "grad_norm": 198.3924102783203, "learning_rate": 0.0004430005939978663, "loss": 45.9202, "step": 4222 }, { "epoch": 11.153516011885111, "grad_norm": 127.02302551269531, "learning_rate": 0.000442974037340468, "loss": 42.613, "step": 4223 }, { "epoch": 11.156157147573456, "grad_norm": 174.1609344482422, "learning_rate": 0.0004429474752943832, "loss": 43.0001, "step": 4224 }, { "epoch": 11.158798283261802, "grad_norm": 179.58119201660156, "learning_rate": 0.0004429209078603534, "loss": 43.7434, "step": 4225 }, { "epoch": 11.161439418950149, "grad_norm": 159.7641143798828, "learning_rate": 0.00044289433503912057, "loss": 40.588, "step": 4226 }, { "epoch": 11.164080554638495, "grad_norm": 178.57237243652344, "learning_rate": 0.0004428677568314268, "loss": 41.2012, "step": 4227 }, { "epoch": 11.16672169032684, "grad_norm": 97.11273956298828, "learning_rate": 0.00044284117323801413, "loss": 39.5581, "step": 4228 }, { "epoch": 11.169362826015186, "grad_norm": 212.1147918701172, "learning_rate": 0.0004428145842596251, "loss": 39.3505, "step": 4229 }, { "epoch": 11.172003961703533, "grad_norm": 137.43504333496094, "learning_rate": 0.000442787989897002, "loss": 37.37, "step": 4230 }, { "epoch": 11.174645097391878, "grad_norm": 107.81166076660156, "learning_rate": 0.0004427613901508875, "loss": 37.4835, "step": 4231 }, { "epoch": 11.177286233080224, "grad_norm": 100.01378631591797, "learning_rate": 0.00044273478502202446, "loss": 37.1435, "step": 4232 }, { "epoch": 11.17992736876857, "grad_norm": 146.6248321533203, "learning_rate": 0.00044270817451115573, "loss": 37.2018, "step": 4233 }, { "epoch": 11.182568504456917, "grad_norm": 110.15731811523438, "learning_rate": 0.00044268155861902446, "loss": 38.2251, "step": 4234 }, { "epoch": 11.185209640145262, "grad_norm": 121.5240249633789, "learning_rate": 0.00044265493734637387, "loss": 38.424, "step": 4235 }, { "epoch": 11.187850775833608, "grad_norm": 152.1005859375, "learning_rate": 0.0004426283106939473, "loss": 37.065, "step": 4236 }, { "epoch": 11.190491911521955, "grad_norm": 97.36339569091797, "learning_rate": 0.00044260167866248834, "loss": 39.5149, "step": 4237 }, { "epoch": 11.193133047210301, "grad_norm": 1226.4251708984375, "learning_rate": 0.0004425750412527406, "loss": 84.9192, "step": 4238 }, { "epoch": 11.195774182898646, "grad_norm": 2964.288330078125, "learning_rate": 0.00044254839846544805, "loss": 112.9237, "step": 4239 }, { "epoch": 11.198415318586992, "grad_norm": 1435.4410400390625, "learning_rate": 0.00044252175030135444, "loss": 113.6573, "step": 4240 }, { "epoch": 11.201056454275339, "grad_norm": 979.8338623046875, "learning_rate": 0.00044249509676120414, "loss": 90.1071, "step": 4241 }, { "epoch": 11.203697589963685, "grad_norm": 760.2485961914062, "learning_rate": 0.00044246843784574133, "loss": 75.0559, "step": 4242 }, { "epoch": 11.20633872565203, "grad_norm": 2236.578125, "learning_rate": 0.0004424417735557105, "loss": 80.5492, "step": 4243 }, { "epoch": 11.208979861340376, "grad_norm": 4413.765625, "learning_rate": 0.00044241510389185604, "loss": 72.1387, "step": 4244 }, { "epoch": 11.211620997028723, "grad_norm": 1982.9195556640625, "learning_rate": 0.0004423884288549229, "loss": 50.8636, "step": 4245 }, { "epoch": 11.214262132717069, "grad_norm": 4080.74169921875, "learning_rate": 0.000442361748445656, "loss": 37.6039, "step": 4246 }, { "epoch": 11.216903268405414, "grad_norm": 855.6854858398438, "learning_rate": 0.0004423350626648002, "loss": 35.4382, "step": 4247 }, { "epoch": 11.21954440409376, "grad_norm": 1023.7556762695312, "learning_rate": 0.0004423083715131007, "loss": 52.9561, "step": 4248 }, { "epoch": 11.222185539782107, "grad_norm": 883.3020629882812, "learning_rate": 0.000442281674991303, "loss": 52.8595, "step": 4249 }, { "epoch": 11.224826675470453, "grad_norm": 637.9661254882812, "learning_rate": 0.0004422549731001524, "loss": 47.0534, "step": 4250 }, { "epoch": 11.227467811158798, "grad_norm": 253.02984619140625, "learning_rate": 0.00044222826584039464, "loss": 41.4011, "step": 4251 }, { "epoch": 11.230108946847144, "grad_norm": 449.7094421386719, "learning_rate": 0.0004422015532127755, "loss": 40.0284, "step": 4252 }, { "epoch": 11.23275008253549, "grad_norm": 253.59243774414062, "learning_rate": 0.0004421748352180409, "loss": 38.3352, "step": 4253 }, { "epoch": 11.235391218223835, "grad_norm": 452.1617431640625, "learning_rate": 0.00044214811185693686, "loss": 38.9548, "step": 4254 }, { "epoch": 11.238032353912182, "grad_norm": 789.491943359375, "learning_rate": 0.0004421213831302097, "loss": 39.0349, "step": 4255 }, { "epoch": 11.240673489600528, "grad_norm": 597.8129272460938, "learning_rate": 0.00044209464903860576, "loss": 38.8262, "step": 4256 }, { "epoch": 11.243314625288875, "grad_norm": 336.85162353515625, "learning_rate": 0.0004420679095828716, "loss": 38.6842, "step": 4257 }, { "epoch": 11.24595576097722, "grad_norm": 520.9883422851562, "learning_rate": 0.0004420411647637539, "loss": 37.8596, "step": 4258 }, { "epoch": 11.248596896665566, "grad_norm": 619.891845703125, "learning_rate": 0.00044201441458199955, "loss": 38.1897, "step": 4259 }, { "epoch": 11.251238032353912, "grad_norm": 720.1528930664062, "learning_rate": 0.0004419876590383554, "loss": 39.3274, "step": 4260 }, { "epoch": 11.253879168042259, "grad_norm": 596.288330078125, "learning_rate": 0.0004419608981335685, "loss": 39.3393, "step": 4261 }, { "epoch": 11.256520303730603, "grad_norm": 457.1181640625, "learning_rate": 0.0004419341318683865, "loss": 39.4084, "step": 4262 }, { "epoch": 11.25916143941895, "grad_norm": 554.5546875, "learning_rate": 0.0004419073602435566, "loss": 39.6968, "step": 4263 }, { "epoch": 11.261802575107296, "grad_norm": 684.2613525390625, "learning_rate": 0.0004418805832598263, "loss": 41.0926, "step": 4264 }, { "epoch": 11.264443710795643, "grad_norm": 2470.23291015625, "learning_rate": 0.0004418538009179435, "loss": 43.4923, "step": 4265 }, { "epoch": 11.267084846483987, "grad_norm": 213.4482879638672, "learning_rate": 0.000441827013218656, "loss": 39.6803, "step": 4266 }, { "epoch": 11.269725982172334, "grad_norm": 311.8735656738281, "learning_rate": 0.0004418002201627118, "loss": 41.2915, "step": 4267 }, { "epoch": 11.27236711786068, "grad_norm": 641.8482666015625, "learning_rate": 0.00044177342175085913, "loss": 42.252, "step": 4268 }, { "epoch": 11.275008253549027, "grad_norm": 361.6531677246094, "learning_rate": 0.00044174661798384636, "loss": 43.0634, "step": 4269 }, { "epoch": 11.277649389237371, "grad_norm": 349.73663330078125, "learning_rate": 0.00044171980886242184, "loss": 46.8584, "step": 4270 }, { "epoch": 11.280290524925718, "grad_norm": 318.8775634765625, "learning_rate": 0.00044169299438733434, "loss": 45.6661, "step": 4271 }, { "epoch": 11.282931660614064, "grad_norm": 465.792724609375, "learning_rate": 0.00044166617455933255, "loss": 43.8913, "step": 4272 }, { "epoch": 11.28557279630241, "grad_norm": 396.19842529296875, "learning_rate": 0.0004416393493791655, "loss": 46.2043, "step": 4273 }, { "epoch": 11.288213931990756, "grad_norm": 477.5589294433594, "learning_rate": 0.0004416125188475821, "loss": 42.6548, "step": 4274 }, { "epoch": 11.290855067679102, "grad_norm": 268.17974853515625, "learning_rate": 0.00044158568296533173, "loss": 42.3731, "step": 4275 }, { "epoch": 11.293496203367448, "grad_norm": 260.1064453125, "learning_rate": 0.0004415588417331637, "loss": 41.5122, "step": 4276 }, { "epoch": 11.296137339055793, "grad_norm": 178.46502685546875, "learning_rate": 0.00044153199515182753, "loss": 40.3842, "step": 4277 }, { "epoch": 11.29877847474414, "grad_norm": 325.4002380371094, "learning_rate": 0.0004415051432220729, "loss": 39.6724, "step": 4278 }, { "epoch": 11.301419610432486, "grad_norm": 608.8048706054688, "learning_rate": 0.00044147828594464966, "loss": 39.1577, "step": 4279 }, { "epoch": 11.304060746120832, "grad_norm": 265.04718017578125, "learning_rate": 0.0004414514233203077, "loss": 38.2238, "step": 4280 }, { "epoch": 11.306701881809177, "grad_norm": 381.07781982421875, "learning_rate": 0.0004414245553497973, "loss": 38.7883, "step": 4281 }, { "epoch": 11.309343017497524, "grad_norm": 345.9266662597656, "learning_rate": 0.0004413976820338686, "loss": 38.2367, "step": 4282 }, { "epoch": 11.31198415318587, "grad_norm": 371.3165588378906, "learning_rate": 0.00044137080337327205, "loss": 37.098, "step": 4283 }, { "epoch": 11.314625288874216, "grad_norm": 210.47947692871094, "learning_rate": 0.0004413439193687583, "loss": 38.2303, "step": 4284 }, { "epoch": 11.317266424562561, "grad_norm": 359.3103942871094, "learning_rate": 0.0004413170300210779, "loss": 37.7676, "step": 4285 }, { "epoch": 11.319907560250908, "grad_norm": 293.5397033691406, "learning_rate": 0.0004412901353309819, "loss": 38.0426, "step": 4286 }, { "epoch": 11.322548695939254, "grad_norm": 1644.04833984375, "learning_rate": 0.0004412632352992212, "loss": 53.7209, "step": 4287 }, { "epoch": 11.3251898316276, "grad_norm": 1404.52978515625, "learning_rate": 0.000441236329926547, "loss": 61.349, "step": 4288 }, { "epoch": 11.327830967315945, "grad_norm": 3677.6962890625, "learning_rate": 0.0004412094192137106, "loss": 55.765, "step": 4289 }, { "epoch": 11.330472103004292, "grad_norm": 1852.0589599609375, "learning_rate": 0.0004411825031614636, "loss": 51.3923, "step": 4290 }, { "epoch": 11.333113238692638, "grad_norm": 1171.84326171875, "learning_rate": 0.00044115558177055744, "loss": 44.6891, "step": 4291 }, { "epoch": 11.335754374380985, "grad_norm": 1640.947021484375, "learning_rate": 0.00044112865504174393, "loss": 37.9744, "step": 4292 }, { "epoch": 11.33839551006933, "grad_norm": 8042.75244140625, "learning_rate": 0.00044110172297577506, "loss": 25.255, "step": 4293 }, { "epoch": 11.341036645757676, "grad_norm": 752.420654296875, "learning_rate": 0.00044107478557340273, "loss": 26.2433, "step": 4294 }, { "epoch": 11.343677781446022, "grad_norm": 3280.201171875, "learning_rate": 0.00044104784283537936, "loss": 24.987, "step": 4295 }, { "epoch": 11.346318917134369, "grad_norm": 978.0949096679688, "learning_rate": 0.0004410208947624572, "loss": 24.8533, "step": 4296 }, { "epoch": 11.348960052822713, "grad_norm": 672.707763671875, "learning_rate": 0.00044099394135538875, "loss": 42.1588, "step": 4297 }, { "epoch": 11.35160118851106, "grad_norm": 800.3187255859375, "learning_rate": 0.00044096698261492665, "loss": 46.3749, "step": 4298 }, { "epoch": 11.354242324199406, "grad_norm": 535.9456787109375, "learning_rate": 0.00044094001854182375, "loss": 42.7211, "step": 4299 }, { "epoch": 11.35688345988775, "grad_norm": 423.21832275390625, "learning_rate": 0.00044091304913683303, "loss": 44.6829, "step": 4300 }, { "epoch": 11.359524595576097, "grad_norm": 234.87857055664062, "learning_rate": 0.00044088607440070747, "loss": 40.3527, "step": 4301 }, { "epoch": 11.362165731264444, "grad_norm": 278.9092712402344, "learning_rate": 0.0004408590943342006, "loss": 39.5647, "step": 4302 }, { "epoch": 11.36480686695279, "grad_norm": 434.4763488769531, "learning_rate": 0.0004408321089380655, "loss": 39.7465, "step": 4303 }, { "epoch": 11.367448002641135, "grad_norm": 494.8058776855469, "learning_rate": 0.00044080511821305583, "loss": 39.7313, "step": 4304 }, { "epoch": 11.370089138329481, "grad_norm": 257.88861083984375, "learning_rate": 0.0004407781221599254, "loss": 38.3556, "step": 4305 }, { "epoch": 11.372730274017828, "grad_norm": 225.00328063964844, "learning_rate": 0.00044075112077942793, "loss": 38.419, "step": 4306 }, { "epoch": 11.375371409706174, "grad_norm": 171.18710327148438, "learning_rate": 0.0004407241140723175, "loss": 38.6755, "step": 4307 }, { "epoch": 11.378012545394519, "grad_norm": 400.0480651855469, "learning_rate": 0.0004406971020393482, "loss": 39.4072, "step": 4308 }, { "epoch": 11.380653681082865, "grad_norm": 281.7795104980469, "learning_rate": 0.00044067008468127436, "loss": 38.5241, "step": 4309 }, { "epoch": 11.383294816771212, "grad_norm": 180.61033630371094, "learning_rate": 0.0004406430619988504, "loss": 37.3276, "step": 4310 }, { "epoch": 11.385935952459558, "grad_norm": 571.5989379882812, "learning_rate": 0.0004406160339928309, "loss": 38.8011, "step": 4311 }, { "epoch": 11.388577088147903, "grad_norm": 325.88067626953125, "learning_rate": 0.0004405890006639707, "loss": 38.6956, "step": 4312 }, { "epoch": 11.39121822383625, "grad_norm": 314.6018371582031, "learning_rate": 0.00044056196201302454, "loss": 39.7599, "step": 4313 }, { "epoch": 11.393859359524596, "grad_norm": 270.276123046875, "learning_rate": 0.0004405349180407475, "loss": 40.867, "step": 4314 }, { "epoch": 11.396500495212942, "grad_norm": 461.662109375, "learning_rate": 0.00044050786874789494, "loss": 46.8527, "step": 4315 }, { "epoch": 11.399141630901287, "grad_norm": 836.26025390625, "learning_rate": 0.00044048081413522195, "loss": 54.5614, "step": 4316 }, { "epoch": 11.401782766589633, "grad_norm": 431.18890380859375, "learning_rate": 0.00044045375420348416, "loss": 48.1375, "step": 4317 }, { "epoch": 11.40442390227798, "grad_norm": 266.2716979980469, "learning_rate": 0.0004404266889534371, "loss": 42.2313, "step": 4318 }, { "epoch": 11.407065037966326, "grad_norm": 296.6770935058594, "learning_rate": 0.0004403996183858367, "loss": 42.5916, "step": 4319 }, { "epoch": 11.409706173654671, "grad_norm": 266.8823547363281, "learning_rate": 0.00044037254250143876, "loss": 44.9593, "step": 4320 }, { "epoch": 11.412347309343017, "grad_norm": 456.544189453125, "learning_rate": 0.0004403454613009994, "loss": 44.6503, "step": 4321 }, { "epoch": 11.414988445031364, "grad_norm": 149.25697326660156, "learning_rate": 0.0004403183747852748, "loss": 45.5833, "step": 4322 }, { "epoch": 11.417629580719709, "grad_norm": 343.6122131347656, "learning_rate": 0.00044029128295502154, "loss": 44.398, "step": 4323 }, { "epoch": 11.420270716408055, "grad_norm": 135.43699645996094, "learning_rate": 0.0004402641858109958, "loss": 41.9579, "step": 4324 }, { "epoch": 11.422911852096401, "grad_norm": 441.9931640625, "learning_rate": 0.0004402370833539545, "loss": 41.9645, "step": 4325 }, { "epoch": 11.425552987784748, "grad_norm": 123.0609359741211, "learning_rate": 0.00044020997558465446, "loss": 41.9835, "step": 4326 }, { "epoch": 11.428194123473093, "grad_norm": 219.0078125, "learning_rate": 0.0004401828625038525, "loss": 39.3397, "step": 4327 }, { "epoch": 11.430835259161439, "grad_norm": 304.4501953125, "learning_rate": 0.00044015574411230597, "loss": 40.724, "step": 4328 }, { "epoch": 11.433476394849786, "grad_norm": 172.84161376953125, "learning_rate": 0.0004401286204107718, "loss": 38.8592, "step": 4329 }, { "epoch": 11.436117530538132, "grad_norm": 191.85885620117188, "learning_rate": 0.0004401014914000078, "loss": 40.1138, "step": 4330 }, { "epoch": 11.438758666226477, "grad_norm": 218.70680236816406, "learning_rate": 0.00044007435708077114, "loss": 39.4668, "step": 4331 }, { "epoch": 11.441399801914823, "grad_norm": 269.7578125, "learning_rate": 0.0004400472174538198, "loss": 37.6549, "step": 4332 }, { "epoch": 11.44404093760317, "grad_norm": 221.52552795410156, "learning_rate": 0.0004400200725199115, "loss": 37.9819, "step": 4333 }, { "epoch": 11.446682073291516, "grad_norm": 251.16744995117188, "learning_rate": 0.00043999292227980434, "loss": 37.7219, "step": 4334 }, { "epoch": 11.44932320897986, "grad_norm": 394.2164001464844, "learning_rate": 0.0004399657667342565, "loss": 38.0665, "step": 4335 }, { "epoch": 11.451964344668207, "grad_norm": 188.70669555664062, "learning_rate": 0.0004399386058840261, "loss": 38.6126, "step": 4336 }, { "epoch": 11.454605480356554, "grad_norm": 607.6909790039062, "learning_rate": 0.00043991143972987173, "loss": 39.7038, "step": 4337 }, { "epoch": 11.4572466160449, "grad_norm": 2318.066162109375, "learning_rate": 0.000439884268272552, "loss": 126.3678, "step": 4338 }, { "epoch": 11.459887751733245, "grad_norm": 2177.455078125, "learning_rate": 0.0004398570915128256, "loss": 157.351, "step": 4339 }, { "epoch": 11.462528887421591, "grad_norm": 4818.19140625, "learning_rate": 0.00043982990945145146, "loss": 125.4781, "step": 4340 }, { "epoch": 11.465170023109938, "grad_norm": 4028.034423828125, "learning_rate": 0.0004398027220891886, "loss": 117.4101, "step": 4341 }, { "epoch": 11.467811158798284, "grad_norm": 4057.03271484375, "learning_rate": 0.00043977552942679624, "loss": 134.76, "step": 4342 }, { "epoch": 11.470452294486629, "grad_norm": 3194.43212890625, "learning_rate": 0.0004397483314650337, "loss": 111.4524, "step": 4343 }, { "epoch": 11.473093430174975, "grad_norm": 1427.842041015625, "learning_rate": 0.00043972112820466044, "loss": 84.7663, "step": 4344 }, { "epoch": 11.475734565863322, "grad_norm": 2190.82568359375, "learning_rate": 0.0004396939196464361, "loss": 95.7002, "step": 4345 }, { "epoch": 11.478375701551666, "grad_norm": 3251.81982421875, "learning_rate": 0.00043966670579112055, "loss": 112.123, "step": 4346 }, { "epoch": 11.481016837240013, "grad_norm": 1138.652587890625, "learning_rate": 0.00043963948663947364, "loss": 64.7057, "step": 4347 }, { "epoch": 11.48365797292836, "grad_norm": 188.87879943847656, "learning_rate": 0.00043961226219225546, "loss": 38.1879, "step": 4348 }, { "epoch": 11.486299108616706, "grad_norm": 253.63075256347656, "learning_rate": 0.00043958503245022625, "loss": 38.5022, "step": 4349 }, { "epoch": 11.48894024430505, "grad_norm": 127.19835662841797, "learning_rate": 0.00043955779741414636, "loss": 38.0402, "step": 4350 }, { "epoch": 11.491581379993397, "grad_norm": 180.9385223388672, "learning_rate": 0.0004395305570847763, "loss": 37.3004, "step": 4351 }, { "epoch": 11.494222515681743, "grad_norm": 428.4337158203125, "learning_rate": 0.0004395033114628768, "loss": 36.5962, "step": 4352 }, { "epoch": 11.49686365137009, "grad_norm": 418.4631042480469, "learning_rate": 0.00043947606054920866, "loss": 38.3738, "step": 4353 }, { "epoch": 11.499504787058434, "grad_norm": 403.3025207519531, "learning_rate": 0.00043944880434453287, "loss": 38.4651, "step": 4354 }, { "epoch": 11.50214592274678, "grad_norm": 304.2173156738281, "learning_rate": 0.00043942154284961043, "loss": 38.0377, "step": 4355 }, { "epoch": 11.504787058435127, "grad_norm": 173.08572387695312, "learning_rate": 0.0004393942760652028, "loss": 38.8201, "step": 4356 }, { "epoch": 11.507428194123474, "grad_norm": 426.3858947753906, "learning_rate": 0.0004393670039920711, "loss": 38.898, "step": 4357 }, { "epoch": 11.510069329811818, "grad_norm": 462.3197937011719, "learning_rate": 0.00043933972663097714, "loss": 41.1152, "step": 4358 }, { "epoch": 11.512710465500165, "grad_norm": 172.04359436035156, "learning_rate": 0.00043931244398268257, "loss": 37.9899, "step": 4359 }, { "epoch": 11.515351601188511, "grad_norm": 302.6434020996094, "learning_rate": 0.00043928515604794917, "loss": 37.1898, "step": 4360 }, { "epoch": 11.517992736876858, "grad_norm": 207.218505859375, "learning_rate": 0.000439257862827539, "loss": 36.6381, "step": 4361 }, { "epoch": 11.520633872565202, "grad_norm": 286.8313293457031, "learning_rate": 0.00043923056432221416, "loss": 37.596, "step": 4362 }, { "epoch": 11.523275008253549, "grad_norm": 952.3929443359375, "learning_rate": 0.000439203260532737, "loss": 38.6364, "step": 4363 }, { "epoch": 11.525916143941895, "grad_norm": 361.9759216308594, "learning_rate": 0.00043917595145986986, "loss": 40.9139, "step": 4364 }, { "epoch": 11.528557279630242, "grad_norm": 193.8699493408203, "learning_rate": 0.0004391486371043755, "loss": 41.9016, "step": 4365 }, { "epoch": 11.531198415318586, "grad_norm": 156.3402557373047, "learning_rate": 0.00043912131746701643, "loss": 39.4955, "step": 4366 }, { "epoch": 11.533839551006933, "grad_norm": 172.4318389892578, "learning_rate": 0.0004390939925485558, "loss": 39.8349, "step": 4367 }, { "epoch": 11.53648068669528, "grad_norm": 180.73406982421875, "learning_rate": 0.0004390666623497565, "loss": 39.7573, "step": 4368 }, { "epoch": 11.539121822383624, "grad_norm": 108.82060241699219, "learning_rate": 0.00043903932687138164, "loss": 43.0227, "step": 4369 }, { "epoch": 11.54176295807197, "grad_norm": 100.15670776367188, "learning_rate": 0.0004390119861141947, "loss": 43.5854, "step": 4370 }, { "epoch": 11.544404093760317, "grad_norm": 126.32666015625, "learning_rate": 0.000438984640078959, "loss": 43.6956, "step": 4371 }, { "epoch": 11.547045229448663, "grad_norm": 116.00304412841797, "learning_rate": 0.00043895728876643824, "loss": 41.363, "step": 4372 }, { "epoch": 11.549686365137008, "grad_norm": 142.46148681640625, "learning_rate": 0.0004389299321773963, "loss": 43.509, "step": 4373 }, { "epoch": 11.552327500825355, "grad_norm": 97.63660430908203, "learning_rate": 0.000438902570312597, "loss": 42.6862, "step": 4374 }, { "epoch": 11.554968636513701, "grad_norm": 105.5244140625, "learning_rate": 0.00043887520317280424, "loss": 40.0623, "step": 4375 }, { "epoch": 11.557609772202047, "grad_norm": 99.74370574951172, "learning_rate": 0.0004388478307587824, "loss": 43.1783, "step": 4376 }, { "epoch": 11.560250907890392, "grad_norm": 165.53204345703125, "learning_rate": 0.0004388204530712959, "loss": 40.0711, "step": 4377 }, { "epoch": 11.562892043578739, "grad_norm": 120.72510528564453, "learning_rate": 0.0004387930701111091, "loss": 38.356, "step": 4378 }, { "epoch": 11.565533179267085, "grad_norm": 84.81449127197266, "learning_rate": 0.0004387656818789868, "loss": 38.3048, "step": 4379 }, { "epoch": 11.568174314955431, "grad_norm": 233.658447265625, "learning_rate": 0.00043873828837569375, "loss": 37.9601, "step": 4380 }, { "epoch": 11.570815450643776, "grad_norm": 176.2926025390625, "learning_rate": 0.0004387108896019948, "loss": 38.0702, "step": 4381 }, { "epoch": 11.573456586332123, "grad_norm": 181.78042602539062, "learning_rate": 0.0004386834855586551, "loss": 36.5975, "step": 4382 }, { "epoch": 11.576097722020469, "grad_norm": 211.961669921875, "learning_rate": 0.0004386560762464399, "loss": 39.2569, "step": 4383 }, { "epoch": 11.578738857708816, "grad_norm": 222.903564453125, "learning_rate": 0.0004386286616661146, "loss": 37.3713, "step": 4384 }, { "epoch": 11.58137999339716, "grad_norm": 225.39820861816406, "learning_rate": 0.0004386012418184447, "loss": 38.0641, "step": 4385 }, { "epoch": 11.584021129085507, "grad_norm": 239.76524353027344, "learning_rate": 0.000438573816704196, "loss": 37.3905, "step": 4386 }, { "epoch": 11.586662264773853, "grad_norm": 223.58493041992188, "learning_rate": 0.00043854638632413425, "loss": 42.3412, "step": 4387 }, { "epoch": 11.5893034004622, "grad_norm": 2414.264404296875, "learning_rate": 0.00043851895067902535, "loss": 57.9383, "step": 4388 }, { "epoch": 11.591944536150544, "grad_norm": 2497.79541015625, "learning_rate": 0.00043849150976963547, "loss": 54.9075, "step": 4389 }, { "epoch": 11.59458567183889, "grad_norm": 2137.36767578125, "learning_rate": 0.00043846406359673097, "loss": 47.1828, "step": 4390 }, { "epoch": 11.597226807527237, "grad_norm": 1504.336181640625, "learning_rate": 0.00043843661216107823, "loss": 47.8173, "step": 4391 }, { "epoch": 11.599867943215582, "grad_norm": 2162.619873046875, "learning_rate": 0.00043840915546344367, "loss": 36.1601, "step": 4392 }, { "epoch": 11.602509078903928, "grad_norm": 3597.84814453125, "learning_rate": 0.00043838169350459425, "loss": 35.9342, "step": 4393 }, { "epoch": 11.605150214592275, "grad_norm": 4339.27587890625, "learning_rate": 0.00043835422628529666, "loss": 27.0849, "step": 4394 }, { "epoch": 11.607791350280621, "grad_norm": 845.8353881835938, "learning_rate": 0.0004383267538063179, "loss": 30.9536, "step": 4395 }, { "epoch": 11.610432485968968, "grad_norm": 1993.3060302734375, "learning_rate": 0.0004382992760684252, "loss": 27.2334, "step": 4396 }, { "epoch": 11.613073621657312, "grad_norm": 1146.016357421875, "learning_rate": 0.0004382717930723858, "loss": 32.9591, "step": 4397 }, { "epoch": 11.615714757345659, "grad_norm": 934.5614624023438, "learning_rate": 0.00043824430481896723, "loss": 52.1395, "step": 4398 }, { "epoch": 11.618355893034005, "grad_norm": 837.4677734375, "learning_rate": 0.000438216811308937, "loss": 48.8842, "step": 4399 }, { "epoch": 11.62099702872235, "grad_norm": 755.1490478515625, "learning_rate": 0.00043818931254306284, "loss": 45.0038, "step": 4400 }, { "epoch": 11.62099702872235, "eval_loss": 4.579347133636475, "eval_runtime": 2.2079, "eval_samples_per_second": 224.198, "eval_steps_per_second": 28.081, "step": 4400 }, { "epoch": 11.623638164410696, "grad_norm": 454.74798583984375, "learning_rate": 0.0004381618085221127, "loss": 40.874, "step": 4401 }, { "epoch": 11.626279300099043, "grad_norm": 263.1895751953125, "learning_rate": 0.00043813429924685463, "loss": 39.231, "step": 4402 }, { "epoch": 11.62892043578739, "grad_norm": 376.8038635253906, "learning_rate": 0.00043810678471805677, "loss": 37.5612, "step": 4403 }, { "epoch": 11.631561571475734, "grad_norm": 366.6222229003906, "learning_rate": 0.0004380792649364874, "loss": 37.7517, "step": 4404 }, { "epoch": 11.63420270716408, "grad_norm": 364.23114013671875, "learning_rate": 0.00043805173990291504, "loss": 39.2681, "step": 4405 }, { "epoch": 11.636843842852427, "grad_norm": 282.0042724609375, "learning_rate": 0.00043802420961810837, "loss": 38.5812, "step": 4406 }, { "epoch": 11.639484978540773, "grad_norm": 570.87255859375, "learning_rate": 0.00043799667408283606, "loss": 38.5957, "step": 4407 }, { "epoch": 11.642126114229118, "grad_norm": 357.8546447753906, "learning_rate": 0.000437969133297867, "loss": 37.5466, "step": 4408 }, { "epoch": 11.644767249917464, "grad_norm": 347.86474609375, "learning_rate": 0.00043794158726397043, "loss": 37.9922, "step": 4409 }, { "epoch": 11.64740838560581, "grad_norm": 231.3607177734375, "learning_rate": 0.0004379140359819154, "loss": 37.171, "step": 4410 }, { "epoch": 11.650049521294157, "grad_norm": 743.1427612304688, "learning_rate": 0.00043788647945247123, "loss": 37.7656, "step": 4411 }, { "epoch": 11.652690656982502, "grad_norm": 381.88250732421875, "learning_rate": 0.0004378589176764076, "loss": 38.8591, "step": 4412 }, { "epoch": 11.655331792670848, "grad_norm": 693.8150024414062, "learning_rate": 0.00043783135065449396, "loss": 38.7031, "step": 4413 }, { "epoch": 11.657972928359195, "grad_norm": 910.866943359375, "learning_rate": 0.0004378037783875002, "loss": 38.8109, "step": 4414 }, { "epoch": 11.66061406404754, "grad_norm": 845.8284912109375, "learning_rate": 0.0004377762008761963, "loss": 45.5586, "step": 4415 }, { "epoch": 11.663255199735886, "grad_norm": 306.173828125, "learning_rate": 0.00043774861812135224, "loss": 40.9439, "step": 4416 }, { "epoch": 11.665896335424232, "grad_norm": 282.1875915527344, "learning_rate": 0.0004377210301237384, "loss": 42.2599, "step": 4417 }, { "epoch": 11.668537471112579, "grad_norm": 338.7309875488281, "learning_rate": 0.00043769343688412497, "loss": 40.9756, "step": 4418 }, { "epoch": 11.671178606800925, "grad_norm": 603.60791015625, "learning_rate": 0.0004376658384032826, "loss": 41.1861, "step": 4419 }, { "epoch": 11.67381974248927, "grad_norm": 355.744873046875, "learning_rate": 0.0004376382346819819, "loss": 42.5777, "step": 4420 }, { "epoch": 11.676460878177616, "grad_norm": 273.7424621582031, "learning_rate": 0.0004376106257209938, "loss": 42.4124, "step": 4421 }, { "epoch": 11.679102013865963, "grad_norm": 251.36004638671875, "learning_rate": 0.00043758301152108916, "loss": 43.4127, "step": 4422 }, { "epoch": 11.681743149554308, "grad_norm": 214.35665893554688, "learning_rate": 0.00043755539208303906, "loss": 41.9764, "step": 4423 }, { "epoch": 11.684384285242654, "grad_norm": 186.88616943359375, "learning_rate": 0.0004375277674076149, "loss": 40.1006, "step": 4424 }, { "epoch": 11.687025420931, "grad_norm": 442.549072265625, "learning_rate": 0.00043750013749558795, "loss": 42.1938, "step": 4425 }, { "epoch": 11.689666556619347, "grad_norm": 263.8553771972656, "learning_rate": 0.0004374725023477298, "loss": 40.3947, "step": 4426 }, { "epoch": 11.692307692307692, "grad_norm": 236.68563842773438, "learning_rate": 0.00043744486196481213, "loss": 40.3613, "step": 4427 }, { "epoch": 11.694948827996038, "grad_norm": 247.45352172851562, "learning_rate": 0.0004374172163476069, "loss": 39.8732, "step": 4428 }, { "epoch": 11.697589963684385, "grad_norm": 134.34857177734375, "learning_rate": 0.0004373895654968859, "loss": 38.3608, "step": 4429 }, { "epoch": 11.700231099372731, "grad_norm": 488.3821105957031, "learning_rate": 0.00043736190941342135, "loss": 38.1854, "step": 4430 }, { "epoch": 11.702872235061076, "grad_norm": 130.6168670654297, "learning_rate": 0.0004373342480979856, "loss": 38.1528, "step": 4431 }, { "epoch": 11.705513370749422, "grad_norm": 211.33819580078125, "learning_rate": 0.00043730658155135096, "loss": 36.7701, "step": 4432 }, { "epoch": 11.708154506437769, "grad_norm": 162.98895263671875, "learning_rate": 0.00043727890977429007, "loss": 37.9516, "step": 4433 }, { "epoch": 11.710795642126115, "grad_norm": 396.3399353027344, "learning_rate": 0.0004372512327675757, "loss": 38.4747, "step": 4434 }, { "epoch": 11.71343677781446, "grad_norm": 114.38463592529297, "learning_rate": 0.0004372235505319806, "loss": 37.9287, "step": 4435 }, { "epoch": 11.716077913502806, "grad_norm": 262.0226135253906, "learning_rate": 0.00043719586306827785, "loss": 38.343, "step": 4436 }, { "epoch": 11.718719049191153, "grad_norm": 440.9118347167969, "learning_rate": 0.0004371681703772406, "loss": 47.8123, "step": 4437 }, { "epoch": 11.721360184879497, "grad_norm": 2165.438232421875, "learning_rate": 0.00043714047245964213, "loss": 91.5328, "step": 4438 }, { "epoch": 11.724001320567844, "grad_norm": 1410.71435546875, "learning_rate": 0.00043711276931625585, "loss": 90.0292, "step": 4439 }, { "epoch": 11.72664245625619, "grad_norm": 3532.356201171875, "learning_rate": 0.0004370850609478555, "loss": 88.2702, "step": 4440 }, { "epoch": 11.729283591944537, "grad_norm": 3054.617431640625, "learning_rate": 0.00043705734735521466, "loss": 69.0245, "step": 4441 }, { "epoch": 11.731924727632883, "grad_norm": 1775.4371337890625, "learning_rate": 0.00043702962853910734, "loss": 70.267, "step": 4442 }, { "epoch": 11.734565863321228, "grad_norm": 1524.010986328125, "learning_rate": 0.0004370019045003074, "loss": 59.7308, "step": 4443 }, { "epoch": 11.737206999009574, "grad_norm": 1470.918212890625, "learning_rate": 0.0004369741752395892, "loss": 44.9302, "step": 4444 }, { "epoch": 11.73984813469792, "grad_norm": 1507.5311279296875, "learning_rate": 0.000436946440757727, "loss": 33.9893, "step": 4445 }, { "epoch": 11.742489270386265, "grad_norm": 398.4513244628906, "learning_rate": 0.00043691870105549524, "loss": 20.1174, "step": 4446 }, { "epoch": 11.745130406074612, "grad_norm": 4734.43505859375, "learning_rate": 0.00043689095613366855, "loss": 104.7355, "step": 4447 }, { "epoch": 11.747771541762958, "grad_norm": 3606.365234375, "learning_rate": 0.0004368632059930217, "loss": 118.4823, "step": 4448 }, { "epoch": 11.750412677451305, "grad_norm": 2496.54248046875, "learning_rate": 0.0004368354506343296, "loss": 95.9561, "step": 4449 }, { "epoch": 11.75305381313965, "grad_norm": 1308.328369140625, "learning_rate": 0.00043680769005836727, "loss": 63.8971, "step": 4450 }, { "epoch": 11.755694948827996, "grad_norm": 724.6001586914062, "learning_rate": 0.00043677992426590994, "loss": 50.4616, "step": 4451 }, { "epoch": 11.758336084516342, "grad_norm": 614.071533203125, "learning_rate": 0.000436752153257733, "loss": 43.8272, "step": 4452 }, { "epoch": 11.760977220204689, "grad_norm": 436.49078369140625, "learning_rate": 0.00043672437703461174, "loss": 43.8726, "step": 4453 }, { "epoch": 11.763618355893033, "grad_norm": 738.3623657226562, "learning_rate": 0.0004366965955973221, "loss": 44.8704, "step": 4454 }, { "epoch": 11.76625949158138, "grad_norm": 507.0439758300781, "learning_rate": 0.00043666880894663963, "loss": 41.9961, "step": 4455 }, { "epoch": 11.768900627269726, "grad_norm": 339.376220703125, "learning_rate": 0.00043664101708334037, "loss": 39.8599, "step": 4456 }, { "epoch": 11.771541762958073, "grad_norm": 935.6715698242188, "learning_rate": 0.00043661322000820027, "loss": 40.627, "step": 4457 }, { "epoch": 11.774182898646417, "grad_norm": 324.3068542480469, "learning_rate": 0.0004365854177219957, "loss": 40.9997, "step": 4458 }, { "epoch": 11.776824034334764, "grad_norm": 328.4519348144531, "learning_rate": 0.0004365576102255029, "loss": 39.8751, "step": 4459 }, { "epoch": 11.77946517002311, "grad_norm": 1001.7463989257812, "learning_rate": 0.0004365297975194984, "loss": 38.9083, "step": 4460 }, { "epoch": 11.782106305711455, "grad_norm": 542.4823608398438, "learning_rate": 0.0004365019796047589, "loss": 37.7664, "step": 4461 }, { "epoch": 11.784747441399801, "grad_norm": 313.2491149902344, "learning_rate": 0.00043647415648206126, "loss": 38.1138, "step": 4462 }, { "epoch": 11.787388577088148, "grad_norm": 727.6185913085938, "learning_rate": 0.00043644632815218224, "loss": 39.4139, "step": 4463 }, { "epoch": 11.790029712776494, "grad_norm": 4051.82080078125, "learning_rate": 0.000436418494615899, "loss": 43.0988, "step": 4464 }, { "epoch": 11.79267084846484, "grad_norm": 776.7000732421875, "learning_rate": 0.00043639065587398884, "loss": 43.9364, "step": 4465 }, { "epoch": 11.795311984153185, "grad_norm": 199.41226196289062, "learning_rate": 0.00043636281192722915, "loss": 44.5691, "step": 4466 }, { "epoch": 11.797953119841532, "grad_norm": 233.3283233642578, "learning_rate": 0.00043633496277639737, "loss": 42.9455, "step": 4467 }, { "epoch": 11.800594255529878, "grad_norm": 645.1508178710938, "learning_rate": 0.0004363071084222713, "loss": 42.6179, "step": 4468 }, { "epoch": 11.803235391218223, "grad_norm": 299.9791259765625, "learning_rate": 0.0004362792488656286, "loss": 41.6753, "step": 4469 }, { "epoch": 11.80587652690657, "grad_norm": 364.3859558105469, "learning_rate": 0.0004362513841072473, "loss": 45.1994, "step": 4470 }, { "epoch": 11.808517662594916, "grad_norm": 142.5248565673828, "learning_rate": 0.0004362235141479055, "loss": 44.5279, "step": 4471 }, { "epoch": 11.811158798283262, "grad_norm": 649.604736328125, "learning_rate": 0.0004361956389883815, "loss": 42.9788, "step": 4472 }, { "epoch": 11.813799933971607, "grad_norm": 305.8474426269531, "learning_rate": 0.00043616775862945364, "loss": 43.4232, "step": 4473 }, { "epoch": 11.816441069659954, "grad_norm": 305.4406433105469, "learning_rate": 0.0004361398730719005, "loss": 42.4462, "step": 4474 }, { "epoch": 11.8190822053483, "grad_norm": 389.110595703125, "learning_rate": 0.00043611198231650075, "loss": 42.7858, "step": 4475 }, { "epoch": 11.821723341036646, "grad_norm": 277.5581970214844, "learning_rate": 0.00043608408636403316, "loss": 42.4524, "step": 4476 }, { "epoch": 11.824364476724991, "grad_norm": 318.8314208984375, "learning_rate": 0.0004360561852152768, "loss": 40.4243, "step": 4477 }, { "epoch": 11.827005612413338, "grad_norm": 226.994873046875, "learning_rate": 0.00043602827887101084, "loss": 40.6337, "step": 4478 }, { "epoch": 11.829646748101684, "grad_norm": 232.90545654296875, "learning_rate": 0.00043600036733201433, "loss": 37.6224, "step": 4479 }, { "epoch": 11.83228788379003, "grad_norm": 730.0094604492188, "learning_rate": 0.0004359724505990669, "loss": 38.1617, "step": 4480 }, { "epoch": 11.834929019478375, "grad_norm": 126.05863952636719, "learning_rate": 0.00043594452867294804, "loss": 39.1584, "step": 4481 }, { "epoch": 11.837570155166722, "grad_norm": 230.8749542236328, "learning_rate": 0.00043591660155443745, "loss": 38.1492, "step": 4482 }, { "epoch": 11.840211290855068, "grad_norm": 167.95774841308594, "learning_rate": 0.00043588866924431497, "loss": 36.4619, "step": 4483 }, { "epoch": 11.842852426543413, "grad_norm": 408.2431640625, "learning_rate": 0.0004358607317433606, "loss": 38.1404, "step": 4484 }, { "epoch": 11.84549356223176, "grad_norm": 486.95751953125, "learning_rate": 0.00043583278905235447, "loss": 37.4117, "step": 4485 }, { "epoch": 11.848134697920106, "grad_norm": 198.2930908203125, "learning_rate": 0.00043580484117207685, "loss": 38.2789, "step": 4486 }, { "epoch": 11.850775833608452, "grad_norm": 781.1264038085938, "learning_rate": 0.0004357768881033082, "loss": 49.9945, "step": 4487 }, { "epoch": 11.853416969296799, "grad_norm": 2336.222900390625, "learning_rate": 0.0004357489298468291, "loss": 55.7685, "step": 4488 }, { "epoch": 11.856058104985143, "grad_norm": 1804.028076171875, "learning_rate": 0.00043572096640342024, "loss": 52.3651, "step": 4489 }, { "epoch": 11.85869924067349, "grad_norm": 1858.08447265625, "learning_rate": 0.0004356929977738625, "loss": 48.1848, "step": 4490 }, { "epoch": 11.861340376361836, "grad_norm": 1899.197509765625, "learning_rate": 0.0004356650239589368, "loss": 38.7079, "step": 4491 }, { "epoch": 11.86398151205018, "grad_norm": 1631.4566650390625, "learning_rate": 0.0004356370449594245, "loss": 34.4904, "step": 4492 }, { "epoch": 11.866622647738527, "grad_norm": 2619.74365234375, "learning_rate": 0.0004356090607761067, "loss": 21.9449, "step": 4493 }, { "epoch": 11.869263783426874, "grad_norm": 779.3370971679688, "learning_rate": 0.00043558107140976486, "loss": 25.3863, "step": 4494 }, { "epoch": 11.87190491911522, "grad_norm": 2156.022216796875, "learning_rate": 0.0004355530768611807, "loss": 19.8818, "step": 4495 }, { "epoch": 11.874546054803565, "grad_norm": 573.7858276367188, "learning_rate": 0.00043552507713113587, "loss": 21.885, "step": 4496 }, { "epoch": 11.877187190491911, "grad_norm": 506.4678955078125, "learning_rate": 0.00043549707222041224, "loss": 34.5212, "step": 4497 }, { "epoch": 11.879828326180258, "grad_norm": 1141.30908203125, "learning_rate": 0.0004354690621297918, "loss": 48.7763, "step": 4498 }, { "epoch": 11.882469461868604, "grad_norm": 617.50927734375, "learning_rate": 0.0004354410468600567, "loss": 45.2472, "step": 4499 }, { "epoch": 11.885110597556949, "grad_norm": 433.4910583496094, "learning_rate": 0.00043541302641198946, "loss": 44.806, "step": 4500 }, { "epoch": 11.887751733245295, "grad_norm": 444.81524658203125, "learning_rate": 0.0004353850007863722, "loss": 42.4544, "step": 4501 }, { "epoch": 11.890392868933642, "grad_norm": 214.8372344970703, "learning_rate": 0.0004353569699839878, "loss": 40.2985, "step": 4502 }, { "epoch": 11.893034004621988, "grad_norm": 144.2434539794922, "learning_rate": 0.0004353289340056189, "loss": 39.6911, "step": 4503 }, { "epoch": 11.895675140310333, "grad_norm": 162.44558715820312, "learning_rate": 0.00043530089285204833, "loss": 37.9419, "step": 4504 }, { "epoch": 11.89831627599868, "grad_norm": 163.3802032470703, "learning_rate": 0.0004352728465240592, "loss": 37.3858, "step": 4505 }, { "epoch": 11.900957411687026, "grad_norm": 260.8974304199219, "learning_rate": 0.00043524479502243465, "loss": 36.8684, "step": 4506 }, { "epoch": 11.90359854737537, "grad_norm": 154.0093536376953, "learning_rate": 0.00043521673834795807, "loss": 36.3324, "step": 4507 }, { "epoch": 11.906239683063717, "grad_norm": 566.5517578125, "learning_rate": 0.0004351886765014128, "loss": 39.8319, "step": 4508 }, { "epoch": 11.908880818752063, "grad_norm": 407.9395751953125, "learning_rate": 0.0004351606094835826, "loss": 37.9412, "step": 4509 }, { "epoch": 11.91152195444041, "grad_norm": 186.97760009765625, "learning_rate": 0.0004351325372952511, "loss": 38.3289, "step": 4510 }, { "epoch": 11.914163090128756, "grad_norm": 516.5010375976562, "learning_rate": 0.00043510445993720227, "loss": 35.9535, "step": 4511 }, { "epoch": 11.916804225817101, "grad_norm": 703.6259765625, "learning_rate": 0.0004350763774102201, "loss": 37.7727, "step": 4512 }, { "epoch": 11.919445361505447, "grad_norm": 564.0025024414062, "learning_rate": 0.0004350482897150889, "loss": 40.236, "step": 4513 }, { "epoch": 11.922086497193794, "grad_norm": 581.6343383789062, "learning_rate": 0.0004350201968525928, "loss": 39.608, "step": 4514 }, { "epoch": 11.924727632882139, "grad_norm": 1546.80029296875, "learning_rate": 0.0004349920988235164, "loss": 43.5275, "step": 4515 }, { "epoch": 11.927368768570485, "grad_norm": 353.91986083984375, "learning_rate": 0.00043496399562864443, "loss": 43.8542, "step": 4516 }, { "epoch": 11.930009904258831, "grad_norm": 388.6835632324219, "learning_rate": 0.00043493588726876146, "loss": 44.6585, "step": 4517 }, { "epoch": 11.932651039947178, "grad_norm": 319.741943359375, "learning_rate": 0.00043490777374465244, "loss": 48.3655, "step": 4518 }, { "epoch": 11.935292175635523, "grad_norm": 313.8059997558594, "learning_rate": 0.00043487965505710247, "loss": 43.3066, "step": 4519 }, { "epoch": 11.937933311323869, "grad_norm": 197.53872680664062, "learning_rate": 0.0004348515312068968, "loss": 41.9596, "step": 4520 }, { "epoch": 11.940574447012215, "grad_norm": 195.12648010253906, "learning_rate": 0.0004348234021948206, "loss": 42.7001, "step": 4521 }, { "epoch": 11.943215582700562, "grad_norm": 190.1983184814453, "learning_rate": 0.0004347952680216596, "loss": 40.6784, "step": 4522 }, { "epoch": 11.945856718388907, "grad_norm": 509.81597900390625, "learning_rate": 0.0004347671286881992, "loss": 38.6119, "step": 4523 }, { "epoch": 11.948497854077253, "grad_norm": 137.90243530273438, "learning_rate": 0.00043473898419522526, "loss": 39.7733, "step": 4524 }, { "epoch": 11.9511389897656, "grad_norm": 171.61280822753906, "learning_rate": 0.0004347108345435238, "loss": 36.6347, "step": 4525 }, { "epoch": 11.953780125453946, "grad_norm": 226.11715698242188, "learning_rate": 0.0004346826797338807, "loss": 37.8034, "step": 4526 }, { "epoch": 11.95642126114229, "grad_norm": 1230.8968505859375, "learning_rate": 0.00043465451976708227, "loss": 48.5453, "step": 4527 }, { "epoch": 11.959062396830637, "grad_norm": 1985.408203125, "learning_rate": 0.00043462635464391484, "loss": 85.6313, "step": 4528 }, { "epoch": 11.961703532518984, "grad_norm": 1698.5498046875, "learning_rate": 0.00043459818436516497, "loss": 85.039, "step": 4529 }, { "epoch": 11.964344668207328, "grad_norm": 2464.139892578125, "learning_rate": 0.00043457000893161923, "loss": 71.0801, "step": 4530 }, { "epoch": 11.966985803895675, "grad_norm": 1230.0169677734375, "learning_rate": 0.00043454182834406444, "loss": 70.1172, "step": 4531 }, { "epoch": 11.969626939584021, "grad_norm": 1167.76611328125, "learning_rate": 0.0004345136426032874, "loss": 58.5785, "step": 4532 }, { "epoch": 11.972268075272368, "grad_norm": 1389.9097900390625, "learning_rate": 0.00043448545171007535, "loss": 46.2355, "step": 4533 }, { "epoch": 11.974909210960714, "grad_norm": 194.35824584960938, "learning_rate": 0.0004344572556652154, "loss": 39.6397, "step": 4534 }, { "epoch": 11.977550346649059, "grad_norm": 183.60675048828125, "learning_rate": 0.000434429054469495, "loss": 38.415, "step": 4535 }, { "epoch": 11.980191482337405, "grad_norm": 163.00059509277344, "learning_rate": 0.0004344008481237015, "loss": 38.7737, "step": 4536 }, { "epoch": 11.982832618025752, "grad_norm": 150.5061492919922, "learning_rate": 0.0004343726366286227, "loss": 38.0182, "step": 4537 }, { "epoch": 11.985473753714096, "grad_norm": 175.1493682861328, "learning_rate": 0.0004343444199850464, "loss": 38.0423, "step": 4538 }, { "epoch": 11.988114889402443, "grad_norm": 227.23245239257812, "learning_rate": 0.00043431619819376036, "loss": 37.7034, "step": 4539 }, { "epoch": 11.99075602509079, "grad_norm": 237.61668395996094, "learning_rate": 0.00043428797125555276, "loss": 37.8709, "step": 4540 }, { "epoch": 11.993397160779136, "grad_norm": 475.15118408203125, "learning_rate": 0.00043425973917121175, "loss": 38.6504, "step": 4541 }, { "epoch": 11.99603829646748, "grad_norm": 660.6506958007812, "learning_rate": 0.0004342315019415259, "loss": 37.3555, "step": 4542 }, { "epoch": 11.998679432155827, "grad_norm": 587.3457641601562, "learning_rate": 0.00043420325956728355, "loss": 39.2327, "step": 4543 }, { "epoch": 12.001320567844173, "grad_norm": 824.8370971679688, "learning_rate": 0.00043417501204927333, "loss": 59.8838, "step": 4544 }, { "epoch": 12.00396170353252, "grad_norm": 1091.1849365234375, "learning_rate": 0.0004341467593882842, "loss": 64.2769, "step": 4545 }, { "epoch": 12.006602839220864, "grad_norm": 987.346435546875, "learning_rate": 0.0004341185015851049, "loss": 59.9373, "step": 4546 }, { "epoch": 12.00924397490921, "grad_norm": 421.73895263671875, "learning_rate": 0.00043409023864052454, "loss": 47.4578, "step": 4547 }, { "epoch": 12.011885110597557, "grad_norm": 379.2757568359375, "learning_rate": 0.0004340619705553325, "loss": 45.2972, "step": 4548 }, { "epoch": 12.014526246285904, "grad_norm": 190.7756805419922, "learning_rate": 0.00043403369733031806, "loss": 45.7045, "step": 4549 }, { "epoch": 12.017167381974248, "grad_norm": 339.50482177734375, "learning_rate": 0.00043400541896627064, "loss": 47.0588, "step": 4550 }, { "epoch": 12.019808517662595, "grad_norm": 145.5802459716797, "learning_rate": 0.0004339771354639801, "loss": 43.9892, "step": 4551 }, { "epoch": 12.022449653350941, "grad_norm": 375.03387451171875, "learning_rate": 0.0004339488468242361, "loss": 43.1189, "step": 4552 }, { "epoch": 12.025090789039288, "grad_norm": 246.66357421875, "learning_rate": 0.0004339205530478286, "loss": 42.0819, "step": 4553 }, { "epoch": 12.027731924727632, "grad_norm": 361.0267333984375, "learning_rate": 0.0004338922541355477, "loss": 41.7852, "step": 4554 }, { "epoch": 12.030373060415979, "grad_norm": 545.6790161132812, "learning_rate": 0.00043386395008818357, "loss": 39.8048, "step": 4555 }, { "epoch": 12.033014196104325, "grad_norm": 458.5499267578125, "learning_rate": 0.00043383564090652667, "loss": 40.6294, "step": 4556 }, { "epoch": 12.03565533179267, "grad_norm": 161.3148193359375, "learning_rate": 0.00043380732659136753, "loss": 42.4031, "step": 4557 }, { "epoch": 12.038296467481016, "grad_norm": 338.859619140625, "learning_rate": 0.0004337790071434967, "loss": 39.3556, "step": 4558 }, { "epoch": 12.040937603169363, "grad_norm": 278.95501708984375, "learning_rate": 0.0004337506825637051, "loss": 38.9152, "step": 4559 }, { "epoch": 12.04357873885771, "grad_norm": 250.11294555664062, "learning_rate": 0.0004337223528527836, "loss": 39.0511, "step": 4560 }, { "epoch": 12.046219874546054, "grad_norm": 274.0605163574219, "learning_rate": 0.0004336940180115234, "loss": 38.177, "step": 4561 }, { "epoch": 12.0488610102344, "grad_norm": 707.7633666992188, "learning_rate": 0.00043366567804071555, "loss": 38.107, "step": 4562 }, { "epoch": 12.051502145922747, "grad_norm": 233.3881378173828, "learning_rate": 0.0004336373329411516, "loss": 37.9115, "step": 4563 }, { "epoch": 12.054143281611093, "grad_norm": 1228.9603271484375, "learning_rate": 0.00043360898271362303, "loss": 37.6645, "step": 4564 }, { "epoch": 12.056784417299438, "grad_norm": 364.8267822265625, "learning_rate": 0.0004335806273589214, "loss": 39.2942, "step": 4565 }, { "epoch": 12.059425552987785, "grad_norm": 1930.16064453125, "learning_rate": 0.0004335522668778386, "loss": 75.3176, "step": 4566 }, { "epoch": 12.062066688676131, "grad_norm": 9626.916015625, "learning_rate": 0.0004335239012711666, "loss": 150.0301, "step": 4567 }, { "epoch": 12.064707824364477, "grad_norm": 1063.9381103515625, "learning_rate": 0.00043349553053969746, "loss": 109.4197, "step": 4568 }, { "epoch": 12.067348960052822, "grad_norm": 30717.40625, "learning_rate": 0.0004334671546842234, "loss": 106.1837, "step": 4569 }, { "epoch": 12.069990095741169, "grad_norm": 1807.024169921875, "learning_rate": 0.00043343877370553683, "loss": 118.7445, "step": 4570 }, { "epoch": 12.072631231429515, "grad_norm": 2019.10546875, "learning_rate": 0.0004334103876044303, "loss": 125.0641, "step": 4571 }, { "epoch": 12.075272367117861, "grad_norm": 867.5408325195312, "learning_rate": 0.0004333819963816964, "loss": 90.6746, "step": 4572 }, { "epoch": 12.077913502806206, "grad_norm": 2866.613525390625, "learning_rate": 0.00043335360003812795, "loss": 80.0109, "step": 4573 }, { "epoch": 12.080554638494553, "grad_norm": 2020.6552734375, "learning_rate": 0.000433325198574518, "loss": 80.1067, "step": 4574 }, { "epoch": 12.083195774182899, "grad_norm": 2354.283447265625, "learning_rate": 0.0004332967919916595, "loss": 63.4831, "step": 4575 }, { "epoch": 12.085836909871245, "grad_norm": 1711.28955078125, "learning_rate": 0.0004332683802903458, "loss": 45.06, "step": 4576 }, { "epoch": 12.08847804555959, "grad_norm": 446.1355895996094, "learning_rate": 0.0004332399634713703, "loss": 43.4349, "step": 4577 }, { "epoch": 12.091119181247937, "grad_norm": 309.64129638671875, "learning_rate": 0.00043321154153552635, "loss": 40.0963, "step": 4578 }, { "epoch": 12.093760316936283, "grad_norm": 311.7984619140625, "learning_rate": 0.0004331831144836078, "loss": 40.592, "step": 4579 }, { "epoch": 12.096401452624628, "grad_norm": 361.539794921875, "learning_rate": 0.00043315468231640834, "loss": 38.3431, "step": 4580 }, { "epoch": 12.099042588312974, "grad_norm": 325.8556823730469, "learning_rate": 0.000433126245034722, "loss": 37.851, "step": 4581 }, { "epoch": 12.10168372400132, "grad_norm": 407.74371337890625, "learning_rate": 0.00043309780263934276, "loss": 37.4751, "step": 4582 }, { "epoch": 12.104324859689667, "grad_norm": 502.1434326171875, "learning_rate": 0.00043306935513106503, "loss": 37.6417, "step": 4583 }, { "epoch": 12.106965995378012, "grad_norm": 213.71319580078125, "learning_rate": 0.0004330409025106831, "loss": 38.1493, "step": 4584 }, { "epoch": 12.109607131066358, "grad_norm": 941.1172485351562, "learning_rate": 0.0004330124447789914, "loss": 38.5301, "step": 4585 }, { "epoch": 12.112248266754705, "grad_norm": 331.7996520996094, "learning_rate": 0.00043298398193678477, "loss": 36.5362, "step": 4586 }, { "epoch": 12.114889402443051, "grad_norm": 560.3101806640625, "learning_rate": 0.0004329555139848579, "loss": 37.6028, "step": 4587 }, { "epoch": 12.117530538131396, "grad_norm": 585.3292846679688, "learning_rate": 0.00043292704092400577, "loss": 37.1067, "step": 4588 }, { "epoch": 12.120171673819742, "grad_norm": 312.1689758300781, "learning_rate": 0.0004328985627550235, "loss": 38.6778, "step": 4589 }, { "epoch": 12.122812809508089, "grad_norm": 1121.4986572265625, "learning_rate": 0.00043287007947870624, "loss": 38.1933, "step": 4590 }, { "epoch": 12.125453945196435, "grad_norm": 382.045654296875, "learning_rate": 0.0004328415910958495, "loss": 37.0041, "step": 4591 }, { "epoch": 12.12809508088478, "grad_norm": 279.2748107910156, "learning_rate": 0.00043281309760724877, "loss": 39.5897, "step": 4592 }, { "epoch": 12.130736216573126, "grad_norm": 1765.7655029296875, "learning_rate": 0.0004327845990136996, "loss": 40.5544, "step": 4593 }, { "epoch": 12.133377352261473, "grad_norm": 348.41448974609375, "learning_rate": 0.00043275609531599795, "loss": 45.6053, "step": 4594 }, { "epoch": 12.13601848794982, "grad_norm": 247.01976013183594, "learning_rate": 0.00043272758651493964, "loss": 42.1932, "step": 4595 }, { "epoch": 12.138659623638164, "grad_norm": 205.95127868652344, "learning_rate": 0.0004326990726113209, "loss": 43.4093, "step": 4596 }, { "epoch": 12.14130075932651, "grad_norm": 125.74959564208984, "learning_rate": 0.00043267055360593775, "loss": 41.2423, "step": 4597 }, { "epoch": 12.143941895014857, "grad_norm": 127.501953125, "learning_rate": 0.00043264202949958676, "loss": 43.4482, "step": 4598 }, { "epoch": 12.146583030703203, "grad_norm": 128.63619995117188, "learning_rate": 0.00043261350029306445, "loss": 44.0509, "step": 4599 }, { "epoch": 12.149224166391548, "grad_norm": 76.44379425048828, "learning_rate": 0.00043258496598716734, "loss": 41.6005, "step": 4600 }, { "epoch": 12.149224166391548, "eval_loss": 4.990532398223877, "eval_runtime": 2.0663, "eval_samples_per_second": 239.555, "eval_steps_per_second": 30.005, "step": 4600 }, { "epoch": 12.151865302079894, "grad_norm": 107.9751205444336, "learning_rate": 0.00043255642658269236, "loss": 44.7329, "step": 4601 }, { "epoch": 12.15450643776824, "grad_norm": 259.58514404296875, "learning_rate": 0.00043252788208043635, "loss": 42.0315, "step": 4602 }, { "epoch": 12.157147573456585, "grad_norm": 147.54638671875, "learning_rate": 0.0004324993324811964, "loss": 40.7146, "step": 4603 }, { "epoch": 12.159788709144932, "grad_norm": 151.95338439941406, "learning_rate": 0.00043247077778576994, "loss": 40.0261, "step": 4604 }, { "epoch": 12.162429844833278, "grad_norm": 369.7939453125, "learning_rate": 0.0004324422179949542, "loss": 40.9737, "step": 4605 }, { "epoch": 12.165070980521625, "grad_norm": 225.84783935546875, "learning_rate": 0.0004324136531095466, "loss": 40.2144, "step": 4606 }, { "epoch": 12.16771211620997, "grad_norm": 221.48439025878906, "learning_rate": 0.0004323850831303449, "loss": 40.9245, "step": 4607 }, { "epoch": 12.170353251898316, "grad_norm": 130.4600372314453, "learning_rate": 0.00043235650805814694, "loss": 38.6584, "step": 4608 }, { "epoch": 12.172994387586662, "grad_norm": 294.99981689453125, "learning_rate": 0.0004323279278937506, "loss": 39.4248, "step": 4609 }, { "epoch": 12.175635523275009, "grad_norm": 165.4444580078125, "learning_rate": 0.00043229934263795395, "loss": 37.8882, "step": 4610 }, { "epoch": 12.178276658963354, "grad_norm": 140.6112823486328, "learning_rate": 0.0004322707522915553, "loss": 37.3775, "step": 4611 }, { "epoch": 12.1809177946517, "grad_norm": 249.16136169433594, "learning_rate": 0.00043224215685535287, "loss": 36.9247, "step": 4612 }, { "epoch": 12.183558930340046, "grad_norm": 298.56939697265625, "learning_rate": 0.00043221355633014534, "loss": 36.9283, "step": 4613 }, { "epoch": 12.186200066028393, "grad_norm": 327.7862854003906, "learning_rate": 0.00043218495071673126, "loss": 37.6243, "step": 4614 }, { "epoch": 12.188841201716738, "grad_norm": 4306.927734375, "learning_rate": 0.00043215634001590947, "loss": 47.7185, "step": 4615 }, { "epoch": 12.191482337405084, "grad_norm": 1121.3876953125, "learning_rate": 0.0004321277242284789, "loss": 78.2458, "step": 4616 }, { "epoch": 12.19412347309343, "grad_norm": 3680.39892578125, "learning_rate": 0.0004320991033552386, "loss": 90.6895, "step": 4617 }, { "epoch": 12.196764608781777, "grad_norm": 1362.8905029296875, "learning_rate": 0.0004320704773969878, "loss": 95.3327, "step": 4618 }, { "epoch": 12.199405744470122, "grad_norm": 1295.722900390625, "learning_rate": 0.00043204184635452594, "loss": 110.089, "step": 4619 }, { "epoch": 12.202046880158468, "grad_norm": 2182.067138671875, "learning_rate": 0.0004320132102286524, "loss": 90.7316, "step": 4620 }, { "epoch": 12.204688015846815, "grad_norm": 1671.7745361328125, "learning_rate": 0.0004319845690201668, "loss": 83.1132, "step": 4621 }, { "epoch": 12.207329151535161, "grad_norm": 1725.6126708984375, "learning_rate": 0.00043195592272986916, "loss": 71.8488, "step": 4622 }, { "epoch": 12.209970287223506, "grad_norm": 1270.215576171875, "learning_rate": 0.0004319272713585592, "loss": 55.7391, "step": 4623 }, { "epoch": 12.212611422911852, "grad_norm": 1270.06982421875, "learning_rate": 0.0004318986149070371, "loss": 39.6472, "step": 4624 }, { "epoch": 12.215252558600199, "grad_norm": 6081.306640625, "learning_rate": 0.000431869953376103, "loss": 34.5698, "step": 4625 }, { "epoch": 12.217893694288543, "grad_norm": 658.9833984375, "learning_rate": 0.0004318412867665572, "loss": 45.3802, "step": 4626 }, { "epoch": 12.22053482997689, "grad_norm": 847.490478515625, "learning_rate": 0.0004318126150792004, "loss": 49.0756, "step": 4627 }, { "epoch": 12.223175965665236, "grad_norm": 1239.921630859375, "learning_rate": 0.0004317839383148331, "loss": 48.9945, "step": 4628 }, { "epoch": 12.225817101353583, "grad_norm": 1038.0810546875, "learning_rate": 0.0004317552564742561, "loss": 48.8515, "step": 4629 }, { "epoch": 12.228458237041927, "grad_norm": 585.988037109375, "learning_rate": 0.00043172656955827035, "loss": 41.655, "step": 4630 }, { "epoch": 12.231099372730274, "grad_norm": 627.9915161132812, "learning_rate": 0.0004316978775676769, "loss": 38.6824, "step": 4631 }, { "epoch": 12.23374050841862, "grad_norm": 384.617431640625, "learning_rate": 0.0004316691805032769, "loss": 39.3112, "step": 4632 }, { "epoch": 12.236381644106967, "grad_norm": 569.060302734375, "learning_rate": 0.00043164047836587177, "loss": 37.9198, "step": 4633 }, { "epoch": 12.239022779795311, "grad_norm": 281.5377502441406, "learning_rate": 0.00043161177115626307, "loss": 38.3523, "step": 4634 }, { "epoch": 12.241663915483658, "grad_norm": 714.2742309570312, "learning_rate": 0.00043158305887525225, "loss": 38.8861, "step": 4635 }, { "epoch": 12.244305051172004, "grad_norm": 430.4718933105469, "learning_rate": 0.00043155434152364114, "loss": 40.1911, "step": 4636 }, { "epoch": 12.24694618686035, "grad_norm": 750.0943603515625, "learning_rate": 0.00043152561910223176, "loss": 39.1249, "step": 4637 }, { "epoch": 12.249587322548695, "grad_norm": 425.0622253417969, "learning_rate": 0.00043149689161182616, "loss": 38.0016, "step": 4638 }, { "epoch": 12.252228458237042, "grad_norm": 837.6129150390625, "learning_rate": 0.0004314681590532264, "loss": 38.8509, "step": 4639 }, { "epoch": 12.254869593925388, "grad_norm": 436.1637878417969, "learning_rate": 0.0004314394214272349, "loss": 37.6477, "step": 4640 }, { "epoch": 12.257510729613735, "grad_norm": 306.34222412109375, "learning_rate": 0.00043141067873465416, "loss": 40.4515, "step": 4641 }, { "epoch": 12.26015186530208, "grad_norm": 200.53440856933594, "learning_rate": 0.00043138193097628683, "loss": 39.2957, "step": 4642 }, { "epoch": 12.262793000990426, "grad_norm": 1417.33349609375, "learning_rate": 0.00043135317815293553, "loss": 42.581, "step": 4643 }, { "epoch": 12.265434136678772, "grad_norm": 257.08795166015625, "learning_rate": 0.0004313244202654033, "loss": 42.5722, "step": 4644 }, { "epoch": 12.268075272367119, "grad_norm": 195.12503051757812, "learning_rate": 0.00043129565731449314, "loss": 41.9812, "step": 4645 }, { "epoch": 12.270716408055463, "grad_norm": 140.24940490722656, "learning_rate": 0.00043126688930100835, "loss": 41.1858, "step": 4646 }, { "epoch": 12.27335754374381, "grad_norm": 112.69154357910156, "learning_rate": 0.000431238116225752, "loss": 41.3703, "step": 4647 }, { "epoch": 12.275998679432156, "grad_norm": 462.3638916015625, "learning_rate": 0.0004312093380895278, "loss": 44.8051, "step": 4648 }, { "epoch": 12.278639815120501, "grad_norm": 373.0461730957031, "learning_rate": 0.00043118055489313933, "loss": 46.1021, "step": 4649 }, { "epoch": 12.281280950808847, "grad_norm": 180.1009979248047, "learning_rate": 0.0004311517666373902, "loss": 44.4644, "step": 4650 }, { "epoch": 12.283922086497194, "grad_norm": 227.27609252929688, "learning_rate": 0.0004311229733230845, "loss": 42.991, "step": 4651 }, { "epoch": 12.28656322218554, "grad_norm": 145.1341552734375, "learning_rate": 0.0004310941749510261, "loss": 41.3225, "step": 4652 }, { "epoch": 12.289204357873885, "grad_norm": 219.44845581054688, "learning_rate": 0.00043106537152201927, "loss": 40.6581, "step": 4653 }, { "epoch": 12.291845493562231, "grad_norm": 124.54893493652344, "learning_rate": 0.0004310365630368682, "loss": 42.1802, "step": 4654 }, { "epoch": 12.294486629250578, "grad_norm": 142.58139038085938, "learning_rate": 0.0004310077494963776, "loss": 41.5922, "step": 4655 }, { "epoch": 12.297127764938924, "grad_norm": 201.06980895996094, "learning_rate": 0.00043097893090135187, "loss": 41.3679, "step": 4656 }, { "epoch": 12.299768900627269, "grad_norm": 284.4271240234375, "learning_rate": 0.0004309501072525958, "loss": 37.982, "step": 4657 }, { "epoch": 12.302410036315615, "grad_norm": 168.93809509277344, "learning_rate": 0.0004309212785509143, "loss": 38.3383, "step": 4658 }, { "epoch": 12.305051172003962, "grad_norm": 228.854248046875, "learning_rate": 0.00043089244479711233, "loss": 37.4753, "step": 4659 }, { "epoch": 12.307692307692308, "grad_norm": 249.2740936279297, "learning_rate": 0.00043086360599199516, "loss": 37.1564, "step": 4660 }, { "epoch": 12.310333443380653, "grad_norm": 127.70690155029297, "learning_rate": 0.00043083476213636796, "loss": 38.0784, "step": 4661 }, { "epoch": 12.312974579069, "grad_norm": 310.6531982421875, "learning_rate": 0.00043080591323103634, "loss": 38.2611, "step": 4662 }, { "epoch": 12.315615714757346, "grad_norm": 98.44426727294922, "learning_rate": 0.00043077705927680577, "loss": 37.4731, "step": 4663 }, { "epoch": 12.318256850445692, "grad_norm": 132.99244689941406, "learning_rate": 0.000430748200274482, "loss": 38.9552, "step": 4664 }, { "epoch": 12.320897986134037, "grad_norm": 144.65072631835938, "learning_rate": 0.0004307193362248709, "loss": 40.3515, "step": 4665 }, { "epoch": 12.323539121822384, "grad_norm": 3346.05029296875, "learning_rate": 0.0004306904671287786, "loss": 67.8712, "step": 4666 }, { "epoch": 12.32618025751073, "grad_norm": 2701.72998046875, "learning_rate": 0.00043066159298701105, "loss": 57.8779, "step": 4667 }, { "epoch": 12.328821393199076, "grad_norm": 1357.8983154296875, "learning_rate": 0.0004306327138003747, "loss": 58.8702, "step": 4668 }, { "epoch": 12.331462528887421, "grad_norm": 2496.2958984375, "learning_rate": 0.00043060382956967586, "loss": 50.1227, "step": 4669 }, { "epoch": 12.334103664575768, "grad_norm": 1761.0009765625, "learning_rate": 0.0004305749402957212, "loss": 45.9405, "step": 4670 }, { "epoch": 12.336744800264114, "grad_norm": 1708.3804931640625, "learning_rate": 0.00043054604597931743, "loss": 37.7388, "step": 4671 }, { "epoch": 12.33938593595246, "grad_norm": 5087.1279296875, "learning_rate": 0.00043051714662127133, "loss": 37.9889, "step": 4672 }, { "epoch": 12.342027071640805, "grad_norm": 639.492919921875, "learning_rate": 0.00043048824222239, "loss": 28.8118, "step": 4673 }, { "epoch": 12.344668207329152, "grad_norm": 3093.248046875, "learning_rate": 0.0004304593327834805, "loss": 25.3624, "step": 4674 }, { "epoch": 12.347309343017498, "grad_norm": 3393.5126953125, "learning_rate": 0.0004304304183053502, "loss": 27.783, "step": 4675 }, { "epoch": 12.349950478705843, "grad_norm": 331.5249328613281, "learning_rate": 0.0004304014987888064, "loss": 42.0844, "step": 4676 }, { "epoch": 12.35259161439419, "grad_norm": 231.9332275390625, "learning_rate": 0.0004303725742346568, "loss": 42.2873, "step": 4677 }, { "epoch": 12.355232750082536, "grad_norm": 314.9468994140625, "learning_rate": 0.0004303436446437089, "loss": 38.3577, "step": 4678 }, { "epoch": 12.357873885770882, "grad_norm": 343.1004333496094, "learning_rate": 0.00043031471001677075, "loss": 38.1132, "step": 4679 }, { "epoch": 12.360515021459227, "grad_norm": 228.5577392578125, "learning_rate": 0.0004302857703546502, "loss": 37.607, "step": 4680 }, { "epoch": 12.363156157147573, "grad_norm": 250.98458862304688, "learning_rate": 0.00043025682565815546, "loss": 37.0499, "step": 4681 }, { "epoch": 12.36579729283592, "grad_norm": 1023.5079956054688, "learning_rate": 0.0004302278759280947, "loss": 37.5074, "step": 4682 }, { "epoch": 12.368438428524266, "grad_norm": 234.4066619873047, "learning_rate": 0.00043019892116527634, "loss": 37.5443, "step": 4683 }, { "epoch": 12.37107956421261, "grad_norm": 145.6867218017578, "learning_rate": 0.00043016996137050913, "loss": 37.1969, "step": 4684 }, { "epoch": 12.373720699900957, "grad_norm": 1199.50537109375, "learning_rate": 0.0004301409965446014, "loss": 37.7114, "step": 4685 }, { "epoch": 12.376361835589304, "grad_norm": 184.14456176757812, "learning_rate": 0.0004301120266883623, "loss": 38.1963, "step": 4686 }, { "epoch": 12.37900297127765, "grad_norm": 143.27392578125, "learning_rate": 0.00043008305180260056, "loss": 38.9843, "step": 4687 }, { "epoch": 12.381644106965995, "grad_norm": 412.9300842285156, "learning_rate": 0.00043005407188812547, "loss": 38.17, "step": 4688 }, { "epoch": 12.384285242654341, "grad_norm": 438.0623474121094, "learning_rate": 0.0004300250869457462, "loss": 35.887, "step": 4689 }, { "epoch": 12.386926378342688, "grad_norm": 269.82611083984375, "learning_rate": 0.000429996096976272, "loss": 37.1147, "step": 4690 }, { "epoch": 12.389567514031034, "grad_norm": 583.0418090820312, "learning_rate": 0.0004299671019805126, "loss": 37.6449, "step": 4691 }, { "epoch": 12.392208649719379, "grad_norm": 615.0950927734375, "learning_rate": 0.00042993810195927765, "loss": 40.026, "step": 4692 }, { "epoch": 12.394849785407725, "grad_norm": 320.4130554199219, "learning_rate": 0.0004299090969133768, "loss": 40.0417, "step": 4693 }, { "epoch": 12.397490921096072, "grad_norm": 610.86328125, "learning_rate": 0.0004298800868436202, "loss": 44.9943, "step": 4694 }, { "epoch": 12.400132056784418, "grad_norm": 187.5879669189453, "learning_rate": 0.00042985107175081777, "loss": 42.0561, "step": 4695 }, { "epoch": 12.402773192472763, "grad_norm": 433.2693786621094, "learning_rate": 0.0004298220516357798, "loss": 42.4621, "step": 4696 }, { "epoch": 12.40541432816111, "grad_norm": 175.03274536132812, "learning_rate": 0.0004297930264993167, "loss": 42.6376, "step": 4697 }, { "epoch": 12.408055463849456, "grad_norm": 205.83026123046875, "learning_rate": 0.00042976399634223907, "loss": 42.1821, "step": 4698 }, { "epoch": 12.4106965995378, "grad_norm": 286.6769104003906, "learning_rate": 0.00042973496116535724, "loss": 42.8595, "step": 4699 }, { "epoch": 12.413337735226147, "grad_norm": 717.1072998046875, "learning_rate": 0.0004297059209694824, "loss": 40.9672, "step": 4700 }, { "epoch": 12.415978870914493, "grad_norm": 350.2613525390625, "learning_rate": 0.00042967687575542514, "loss": 43.9239, "step": 4701 }, { "epoch": 12.41862000660284, "grad_norm": 380.5751037597656, "learning_rate": 0.0004296478255239967, "loss": 41.583, "step": 4702 }, { "epoch": 12.421261142291184, "grad_norm": 173.29144287109375, "learning_rate": 0.0004296187702760083, "loss": 42.1692, "step": 4703 }, { "epoch": 12.423902277979531, "grad_norm": 808.41015625, "learning_rate": 0.0004295897100122712, "loss": 42.2223, "step": 4704 }, { "epoch": 12.426543413667877, "grad_norm": 207.04966735839844, "learning_rate": 0.00042956064473359697, "loss": 40.6364, "step": 4705 }, { "epoch": 12.429184549356224, "grad_norm": 212.83096313476562, "learning_rate": 0.0004295315744407972, "loss": 42.5211, "step": 4706 }, { "epoch": 12.431825685044569, "grad_norm": 202.70803833007812, "learning_rate": 0.00042950249913468373, "loss": 38.6015, "step": 4707 }, { "epoch": 12.434466820732915, "grad_norm": 132.2440948486328, "learning_rate": 0.0004294734188160684, "loss": 37.6682, "step": 4708 }, { "epoch": 12.437107956421261, "grad_norm": 135.74993896484375, "learning_rate": 0.00042944433348576316, "loss": 37.9271, "step": 4709 }, { "epoch": 12.439749092109608, "grad_norm": 99.08940124511719, "learning_rate": 0.00042941524314458047, "loss": 37.2212, "step": 4710 }, { "epoch": 12.442390227797953, "grad_norm": 135.81298828125, "learning_rate": 0.00042938614779333245, "loss": 37.4817, "step": 4711 }, { "epoch": 12.445031363486299, "grad_norm": 207.05418395996094, "learning_rate": 0.00042935704743283165, "loss": 37.5794, "step": 4712 }, { "epoch": 12.447672499174645, "grad_norm": 127.18132781982422, "learning_rate": 0.0004293279420638907, "loss": 38.1693, "step": 4713 }, { "epoch": 12.450313634862992, "grad_norm": 212.13739013671875, "learning_rate": 0.0004292988316873222, "loss": 38.288, "step": 4714 }, { "epoch": 12.452954770551337, "grad_norm": 136.1768798828125, "learning_rate": 0.00042926971630393926, "loss": 36.7254, "step": 4715 }, { "epoch": 12.455595906239683, "grad_norm": 2220.3115234375, "learning_rate": 0.00042924059591455474, "loss": 71.3709, "step": 4716 }, { "epoch": 12.45823704192803, "grad_norm": 1163.2396240234375, "learning_rate": 0.0004292114705199819, "loss": 93.4528, "step": 4717 }, { "epoch": 12.460878177616376, "grad_norm": 1759.989013671875, "learning_rate": 0.00042918234012103394, "loss": 84.7007, "step": 4718 }, { "epoch": 12.46351931330472, "grad_norm": 2261.583740234375, "learning_rate": 0.00042915320471852446, "loss": 78.1346, "step": 4719 }, { "epoch": 12.466160448993067, "grad_norm": 1180.7821044921875, "learning_rate": 0.000429124064313267, "loss": 74.1827, "step": 4720 }, { "epoch": 12.468801584681414, "grad_norm": 1378.590087890625, "learning_rate": 0.00042909491890607524, "loss": 75.1977, "step": 4721 }, { "epoch": 12.471442720369758, "grad_norm": 1632.435546875, "learning_rate": 0.000429065768497763, "loss": 64.2333, "step": 4722 }, { "epoch": 12.474083856058105, "grad_norm": 3281.895263671875, "learning_rate": 0.0004290366130891445, "loss": 56.3619, "step": 4723 }, { "epoch": 12.476724991746451, "grad_norm": 1411.159423828125, "learning_rate": 0.0004290074526810337, "loss": 36.2961, "step": 4724 }, { "epoch": 12.479366127434798, "grad_norm": 1457.404052734375, "learning_rate": 0.0004289782872742449, "loss": 33.0215, "step": 4725 }, { "epoch": 12.482007263123142, "grad_norm": 313.89019775390625, "learning_rate": 0.0004289491168695926, "loss": 40.2004, "step": 4726 }, { "epoch": 12.484648398811489, "grad_norm": 580.2703857421875, "learning_rate": 0.0004289199414678914, "loss": 42.7921, "step": 4727 }, { "epoch": 12.487289534499835, "grad_norm": 590.9381103515625, "learning_rate": 0.0004288907610699558, "loss": 42.3055, "step": 4728 }, { "epoch": 12.489930670188182, "grad_norm": 465.6142883300781, "learning_rate": 0.00042886157567660084, "loss": 40.6564, "step": 4729 }, { "epoch": 12.492571805876526, "grad_norm": 235.31971740722656, "learning_rate": 0.00042883238528864146, "loss": 39.2744, "step": 4730 }, { "epoch": 12.495212941564873, "grad_norm": 504.3174133300781, "learning_rate": 0.00042880318990689274, "loss": 38.0911, "step": 4731 }, { "epoch": 12.49785407725322, "grad_norm": 232.5786895751953, "learning_rate": 0.00042877398953217, "loss": 39.3787, "step": 4732 }, { "epoch": 12.500495212941566, "grad_norm": 703.1875, "learning_rate": 0.00042874478416528864, "loss": 37.9133, "step": 4733 }, { "epoch": 12.50313634862991, "grad_norm": 209.67330932617188, "learning_rate": 0.0004287155738070642, "loss": 37.6602, "step": 4734 }, { "epoch": 12.505777484318257, "grad_norm": 524.645751953125, "learning_rate": 0.00042868635845831225, "loss": 37.6121, "step": 4735 }, { "epoch": 12.508418620006603, "grad_norm": 232.23703002929688, "learning_rate": 0.00042865713811984877, "loss": 37.5064, "step": 4736 }, { "epoch": 12.51105975569495, "grad_norm": 353.76641845703125, "learning_rate": 0.00042862791279248965, "loss": 37.5793, "step": 4737 }, { "epoch": 12.513700891383294, "grad_norm": 397.2656555175781, "learning_rate": 0.00042859868247705103, "loss": 36.7059, "step": 4738 }, { "epoch": 12.51634202707164, "grad_norm": 285.2645568847656, "learning_rate": 0.0004285694471743491, "loss": 37.3577, "step": 4739 }, { "epoch": 12.518983162759987, "grad_norm": 487.7076721191406, "learning_rate": 0.0004285402068852002, "loss": 37.3844, "step": 4740 }, { "epoch": 12.521624298448334, "grad_norm": 469.581298828125, "learning_rate": 0.0004285109616104209, "loss": 37.5593, "step": 4741 }, { "epoch": 12.524265434136678, "grad_norm": 597.359375, "learning_rate": 0.00042848171135082794, "loss": 37.7224, "step": 4742 }, { "epoch": 12.526906569825025, "grad_norm": 757.2730102539062, "learning_rate": 0.000428452456107238, "loss": 39.9663, "step": 4743 }, { "epoch": 12.529547705513371, "grad_norm": 2388.791015625, "learning_rate": 0.0004284231958804681, "loss": 42.338, "step": 4744 }, { "epoch": 12.532188841201716, "grad_norm": 263.5386047363281, "learning_rate": 0.00042839393067133514, "loss": 41.2586, "step": 4745 }, { "epoch": 12.534829976890062, "grad_norm": 305.0296325683594, "learning_rate": 0.0004283646604806566, "loss": 41.1953, "step": 4746 }, { "epoch": 12.537471112578409, "grad_norm": 522.0488891601562, "learning_rate": 0.00042833538530924963, "loss": 39.1108, "step": 4747 }, { "epoch": 12.540112248266755, "grad_norm": 315.5683288574219, "learning_rate": 0.0004283061051579318, "loss": 43.1211, "step": 4748 }, { "epoch": 12.5427533839551, "grad_norm": 261.78082275390625, "learning_rate": 0.00042827682002752066, "loss": 44.1394, "step": 4749 }, { "epoch": 12.545394519643446, "grad_norm": 240.21487426757812, "learning_rate": 0.0004282475299188341, "loss": 45.3297, "step": 4750 }, { "epoch": 12.548035655331793, "grad_norm": 192.1096954345703, "learning_rate": 0.00042821823483269, "loss": 44.0642, "step": 4751 }, { "epoch": 12.55067679102014, "grad_norm": 174.73379516601562, "learning_rate": 0.0004281889347699063, "loss": 44.1428, "step": 4752 }, { "epoch": 12.553317926708484, "grad_norm": 302.8716735839844, "learning_rate": 0.00042815962973130134, "loss": 40.8849, "step": 4753 }, { "epoch": 12.55595906239683, "grad_norm": 164.64451599121094, "learning_rate": 0.0004281303197176934, "loss": 42.528, "step": 4754 }, { "epoch": 12.558600198085177, "grad_norm": 240.918212890625, "learning_rate": 0.0004281010047299009, "loss": 41.6608, "step": 4755 }, { "epoch": 12.561241333773523, "grad_norm": 248.95794677734375, "learning_rate": 0.00042807168476874244, "loss": 38.9943, "step": 4756 }, { "epoch": 12.563882469461868, "grad_norm": 216.2438507080078, "learning_rate": 0.00042804235983503683, "loss": 37.8322, "step": 4757 }, { "epoch": 12.566523605150214, "grad_norm": 316.9333801269531, "learning_rate": 0.00042801302992960283, "loss": 38.1385, "step": 4758 }, { "epoch": 12.569164740838561, "grad_norm": 192.77987670898438, "learning_rate": 0.0004279836950532596, "loss": 37.2901, "step": 4759 }, { "epoch": 12.571805876526907, "grad_norm": 207.61831665039062, "learning_rate": 0.0004279543552068262, "loss": 39.1072, "step": 4760 }, { "epoch": 12.574447012215252, "grad_norm": 560.865478515625, "learning_rate": 0.000427925010391122, "loss": 37.0187, "step": 4761 }, { "epoch": 12.577088147903599, "grad_norm": 229.76123046875, "learning_rate": 0.0004278956606069664, "loss": 38.2397, "step": 4762 }, { "epoch": 12.579729283591945, "grad_norm": 158.502197265625, "learning_rate": 0.00042786630585517897, "loss": 37.9838, "step": 4763 }, { "epoch": 12.582370419280291, "grad_norm": 315.4051818847656, "learning_rate": 0.0004278369461365794, "loss": 37.4627, "step": 4764 }, { "epoch": 12.585011554968636, "grad_norm": 265.826416015625, "learning_rate": 0.0004278075814519877, "loss": 37.8825, "step": 4765 }, { "epoch": 12.587652690656983, "grad_norm": 721.1143188476562, "learning_rate": 0.00042777821180222357, "loss": 48.085, "step": 4766 }, { "epoch": 12.590293826345329, "grad_norm": 1586.0389404296875, "learning_rate": 0.0004277488371881074, "loss": 73.0356, "step": 4767 }, { "epoch": 12.592934962033674, "grad_norm": 1226.6148681640625, "learning_rate": 0.00042771945761045936, "loss": 61.1488, "step": 4768 }, { "epoch": 12.59557609772202, "grad_norm": 1954.52587890625, "learning_rate": 0.0004276900730700999, "loss": 54.7295, "step": 4769 }, { "epoch": 12.598217233410367, "grad_norm": 1728.980224609375, "learning_rate": 0.00042766068356784947, "loss": 51.0622, "step": 4770 }, { "epoch": 12.600858369098713, "grad_norm": 1296.7359619140625, "learning_rate": 0.0004276312891045289, "loss": 41.3939, "step": 4771 }, { "epoch": 12.603499504787058, "grad_norm": 994.8030395507812, "learning_rate": 0.0004276018896809588, "loss": 41.352, "step": 4772 }, { "epoch": 12.606140640475404, "grad_norm": 579.1591186523438, "learning_rate": 0.0004275724852979603, "loss": 27.0873, "step": 4773 }, { "epoch": 12.60878177616375, "grad_norm": 5721.9619140625, "learning_rate": 0.00042754307595635455, "loss": 26.9662, "step": 4774 }, { "epoch": 12.611422911852097, "grad_norm": 548.3441772460938, "learning_rate": 0.0004275136616569626, "loss": 26.7357, "step": 4775 }, { "epoch": 12.614064047540442, "grad_norm": 375.3779602050781, "learning_rate": 0.000427484242400606, "loss": 42.1668, "step": 4776 }, { "epoch": 12.616705183228788, "grad_norm": 1476.6072998046875, "learning_rate": 0.0004274548181881061, "loss": 43.6071, "step": 4777 }, { "epoch": 12.619346318917135, "grad_norm": 970.2721557617188, "learning_rate": 0.00042742538902028473, "loss": 42.0829, "step": 4778 }, { "epoch": 12.621987454605481, "grad_norm": 979.1839599609375, "learning_rate": 0.00042739595489796355, "loss": 41.7984, "step": 4779 }, { "epoch": 12.624628590293826, "grad_norm": 705.1743774414062, "learning_rate": 0.0004273665158219645, "loss": 42.118, "step": 4780 }, { "epoch": 12.627269725982172, "grad_norm": 694.2454223632812, "learning_rate": 0.0004273370717931098, "loss": 40.0445, "step": 4781 }, { "epoch": 12.629910861670519, "grad_norm": 1259.5069580078125, "learning_rate": 0.0004273076228122215, "loss": 40.5559, "step": 4782 }, { "epoch": 12.632551997358865, "grad_norm": 947.3713989257812, "learning_rate": 0.0004272781688801219, "loss": 41.1217, "step": 4783 }, { "epoch": 12.63519313304721, "grad_norm": 1176.7060546875, "learning_rate": 0.00042724870999763363, "loss": 38.3636, "step": 4784 }, { "epoch": 12.637834268735556, "grad_norm": 637.9274291992188, "learning_rate": 0.0004272192461655793, "loss": 38.5415, "step": 4785 }, { "epoch": 12.640475404423903, "grad_norm": 1804.178466796875, "learning_rate": 0.0004271897773847816, "loss": 39.6141, "step": 4786 }, { "epoch": 12.64311654011225, "grad_norm": 739.5028076171875, "learning_rate": 0.0004271603036560634, "loss": 39.0716, "step": 4787 }, { "epoch": 12.645757675800594, "grad_norm": 754.42578125, "learning_rate": 0.00042713082498024786, "loss": 38.0708, "step": 4788 }, { "epoch": 12.64839881148894, "grad_norm": 1273.7303466796875, "learning_rate": 0.00042710134135815804, "loss": 38.1929, "step": 4789 }, { "epoch": 12.651039947177287, "grad_norm": 1046.11865234375, "learning_rate": 0.0004270718527906173, "loss": 38.6706, "step": 4790 }, { "epoch": 12.653681082865631, "grad_norm": 709.7261352539062, "learning_rate": 0.00042704235927844915, "loss": 38.2104, "step": 4791 }, { "epoch": 12.656322218553978, "grad_norm": 1046.0323486328125, "learning_rate": 0.00042701286082247704, "loss": 41.006, "step": 4792 }, { "epoch": 12.658963354242324, "grad_norm": 752.421630859375, "learning_rate": 0.00042698335742352483, "loss": 42.3496, "step": 4793 }, { "epoch": 12.66160448993067, "grad_norm": 1576.8338623046875, "learning_rate": 0.0004269538490824163, "loss": 45.2353, "step": 4794 }, { "epoch": 12.664245625619015, "grad_norm": 484.03424072265625, "learning_rate": 0.00042692433579997547, "loss": 45.8922, "step": 4795 }, { "epoch": 12.666886761307362, "grad_norm": 414.578125, "learning_rate": 0.0004268948175770265, "loss": 45.3989, "step": 4796 }, { "epoch": 12.669527896995708, "grad_norm": 479.8261413574219, "learning_rate": 0.0004268652944143937, "loss": 43.654, "step": 4797 }, { "epoch": 12.672169032684055, "grad_norm": 670.4197387695312, "learning_rate": 0.00042683576631290146, "loss": 44.1475, "step": 4798 }, { "epoch": 12.6748101683724, "grad_norm": 377.057861328125, "learning_rate": 0.0004268062332733743, "loss": 42.0901, "step": 4799 }, { "epoch": 12.677451304060746, "grad_norm": 301.82159423828125, "learning_rate": 0.00042677669529663686, "loss": 45.3766, "step": 4800 }, { "epoch": 12.677451304060746, "eval_loss": 5.083944797515869, "eval_runtime": 2.0773, "eval_samples_per_second": 238.29, "eval_steps_per_second": 29.846, "step": 4800 }, { "epoch": 12.680092439749092, "grad_norm": 361.02069091796875, "learning_rate": 0.0004267471523835141, "loss": 44.2292, "step": 4801 }, { "epoch": 12.682733575437439, "grad_norm": 452.2074279785156, "learning_rate": 0.000426717604534831, "loss": 43.1173, "step": 4802 }, { "epoch": 12.685374711125784, "grad_norm": 371.47283935546875, "learning_rate": 0.00042668805175141255, "loss": 43.8847, "step": 4803 }, { "epoch": 12.68801584681413, "grad_norm": 268.60443115234375, "learning_rate": 0.0004266584940340841, "loss": 41.6668, "step": 4804 }, { "epoch": 12.690656982502476, "grad_norm": 476.9060363769531, "learning_rate": 0.0004266289313836709, "loss": 41.8745, "step": 4805 }, { "epoch": 12.693298118190823, "grad_norm": 512.37890625, "learning_rate": 0.0004265993638009986, "loss": 40.4111, "step": 4806 }, { "epoch": 12.695939253879168, "grad_norm": 254.0316619873047, "learning_rate": 0.0004265697912868928, "loss": 40.669, "step": 4807 }, { "epoch": 12.698580389567514, "grad_norm": 436.6776428222656, "learning_rate": 0.0004265402138421792, "loss": 38.1757, "step": 4808 }, { "epoch": 12.70122152525586, "grad_norm": 380.1092224121094, "learning_rate": 0.000426510631467684, "loss": 39.1319, "step": 4809 }, { "epoch": 12.703862660944207, "grad_norm": 419.9729919433594, "learning_rate": 0.00042648104416423297, "loss": 38.4568, "step": 4810 }, { "epoch": 12.706503796632552, "grad_norm": 244.53550720214844, "learning_rate": 0.00042645145193265254, "loss": 38.1819, "step": 4811 }, { "epoch": 12.709144932320898, "grad_norm": 493.535888671875, "learning_rate": 0.0004264218547737689, "loss": 37.6911, "step": 4812 }, { "epoch": 12.711786068009244, "grad_norm": 328.4656677246094, "learning_rate": 0.00042639225268840866, "loss": 38.2253, "step": 4813 }, { "epoch": 12.71442720369759, "grad_norm": 382.7575988769531, "learning_rate": 0.00042636264567739833, "loss": 39.3266, "step": 4814 }, { "epoch": 12.717068339385936, "grad_norm": 593.5936279296875, "learning_rate": 0.00042633303374156476, "loss": 37.3722, "step": 4815 }, { "epoch": 12.719709475074282, "grad_norm": 639.4523315429688, "learning_rate": 0.0004263034168817348, "loss": 39.6182, "step": 4816 }, { "epoch": 12.722350610762629, "grad_norm": 4980.07373046875, "learning_rate": 0.00042627379509873546, "loss": 83.4626, "step": 4817 }, { "epoch": 12.724991746450973, "grad_norm": 4885.66796875, "learning_rate": 0.00042624416839339395, "loss": 87.0367, "step": 4818 }, { "epoch": 12.72763288213932, "grad_norm": 4306.6748046875, "learning_rate": 0.00042621453676653764, "loss": 79.0595, "step": 4819 }, { "epoch": 12.730274017827666, "grad_norm": 6472.61474609375, "learning_rate": 0.00042618490021899383, "loss": 87.2568, "step": 4820 }, { "epoch": 12.732915153516013, "grad_norm": 17904.794921875, "learning_rate": 0.00042615525875159025, "loss": 87.919, "step": 4821 }, { "epoch": 12.735556289204357, "grad_norm": 15168.62890625, "learning_rate": 0.0004261256123651545, "loss": 73.8465, "step": 4822 }, { "epoch": 12.738197424892704, "grad_norm": 2750.796875, "learning_rate": 0.0004260959610605144, "loss": 79.8579, "step": 4823 }, { "epoch": 12.74083856058105, "grad_norm": 5234.197265625, "learning_rate": 0.0004260663048384981, "loss": 71.6614, "step": 4824 }, { "epoch": 12.743479696269397, "grad_norm": 5632.00048828125, "learning_rate": 0.0004260366436999337, "loss": 63.4749, "step": 4825 }, { "epoch": 12.746120831957741, "grad_norm": 2484.295654296875, "learning_rate": 0.0004260069776456494, "loss": 54.6422, "step": 4826 }, { "epoch": 12.748761967646088, "grad_norm": 457.55902099609375, "learning_rate": 0.00042597730667647373, "loss": 41.1602, "step": 4827 }, { "epoch": 12.751403103334434, "grad_norm": 441.73248291015625, "learning_rate": 0.00042594763079323506, "loss": 38.6781, "step": 4828 }, { "epoch": 12.75404423902278, "grad_norm": 375.0263671875, "learning_rate": 0.00042591794999676213, "loss": 39.3623, "step": 4829 }, { "epoch": 12.756685374711125, "grad_norm": 577.5059814453125, "learning_rate": 0.00042588826428788387, "loss": 39.6824, "step": 4830 }, { "epoch": 12.759326510399472, "grad_norm": 335.12579345703125, "learning_rate": 0.00042585857366742903, "loss": 38.4588, "step": 4831 }, { "epoch": 12.761967646087818, "grad_norm": 506.4570007324219, "learning_rate": 0.00042582887813622693, "loss": 39.1347, "step": 4832 }, { "epoch": 12.764608781776165, "grad_norm": 441.25885009765625, "learning_rate": 0.00042579917769510666, "loss": 37.8025, "step": 4833 }, { "epoch": 12.76724991746451, "grad_norm": 558.7540893554688, "learning_rate": 0.0004257694723448976, "loss": 36.7311, "step": 4834 }, { "epoch": 12.769891053152856, "grad_norm": 503.77423095703125, "learning_rate": 0.00042573976208642935, "loss": 38.786, "step": 4835 }, { "epoch": 12.772532188841202, "grad_norm": 556.9520874023438, "learning_rate": 0.0004257100469205314, "loss": 38.7829, "step": 4836 }, { "epoch": 12.775173324529547, "grad_norm": 1774.6712646484375, "learning_rate": 0.0004256803268480337, "loss": 37.6461, "step": 4837 }, { "epoch": 12.777814460217893, "grad_norm": 371.1172790527344, "learning_rate": 0.00042565060186976596, "loss": 38.1404, "step": 4838 }, { "epoch": 12.78045559590624, "grad_norm": 667.2598876953125, "learning_rate": 0.0004256208719865584, "loss": 37.923, "step": 4839 }, { "epoch": 12.783096731594586, "grad_norm": 411.1222839355469, "learning_rate": 0.00042559113719924113, "loss": 37.8885, "step": 4840 }, { "epoch": 12.785737867282931, "grad_norm": 724.138427734375, "learning_rate": 0.0004255613975086445, "loss": 38.0199, "step": 4841 }, { "epoch": 12.788379002971277, "grad_norm": 862.92626953125, "learning_rate": 0.000425531652915599, "loss": 39.1594, "step": 4842 }, { "epoch": 12.791020138659624, "grad_norm": 1188.033935546875, "learning_rate": 0.00042550190342093523, "loss": 41.3884, "step": 4843 }, { "epoch": 12.79366127434797, "grad_norm": 1238.6824951171875, "learning_rate": 0.00042547214902548394, "loss": 43.3968, "step": 4844 }, { "epoch": 12.796302410036315, "grad_norm": 279.34967041015625, "learning_rate": 0.0004254423897300759, "loss": 40.1065, "step": 4845 }, { "epoch": 12.798943545724661, "grad_norm": 380.4222412109375, "learning_rate": 0.00042541262553554216, "loss": 41.9206, "step": 4846 }, { "epoch": 12.801584681413008, "grad_norm": 442.7679138183594, "learning_rate": 0.000425382856442714, "loss": 43.355, "step": 4847 }, { "epoch": 12.804225817101354, "grad_norm": 478.6844482421875, "learning_rate": 0.00042535308245242244, "loss": 44.4303, "step": 4848 }, { "epoch": 12.806866952789699, "grad_norm": 353.8058776855469, "learning_rate": 0.0004253233035654992, "loss": 46.2546, "step": 4849 }, { "epoch": 12.809508088478045, "grad_norm": 304.673828125, "learning_rate": 0.0004252935197827756, "loss": 44.015, "step": 4850 }, { "epoch": 12.812149224166392, "grad_norm": 346.55047607421875, "learning_rate": 0.00042526373110508353, "loss": 43.8268, "step": 4851 }, { "epoch": 12.814790359854738, "grad_norm": 531.2142944335938, "learning_rate": 0.00042523393753325466, "loss": 43.0639, "step": 4852 }, { "epoch": 12.817431495543083, "grad_norm": 542.24658203125, "learning_rate": 0.00042520413906812105, "loss": 42.4041, "step": 4853 }, { "epoch": 12.82007263123143, "grad_norm": 469.8409729003906, "learning_rate": 0.0004251743357105148, "loss": 42.0255, "step": 4854 }, { "epoch": 12.822713766919776, "grad_norm": 838.2225341796875, "learning_rate": 0.000425144527461268, "loss": 41.6059, "step": 4855 }, { "epoch": 12.825354902608122, "grad_norm": 352.6860656738281, "learning_rate": 0.0004251147143212133, "loss": 39.9563, "step": 4856 }, { "epoch": 12.827996038296467, "grad_norm": 228.66094970703125, "learning_rate": 0.000425084896291183, "loss": 39.2126, "step": 4857 }, { "epoch": 12.830637173984814, "grad_norm": 282.413330078125, "learning_rate": 0.00042505507337200983, "loss": 37.6608, "step": 4858 }, { "epoch": 12.83327830967316, "grad_norm": 264.2486877441406, "learning_rate": 0.0004250252455645266, "loss": 39.28, "step": 4859 }, { "epoch": 12.835919445361505, "grad_norm": 420.6194763183594, "learning_rate": 0.0004249954128695662, "loss": 39.6765, "step": 4860 }, { "epoch": 12.838560581049851, "grad_norm": 318.70428466796875, "learning_rate": 0.00042496557528796166, "loss": 37.5561, "step": 4861 }, { "epoch": 12.841201716738198, "grad_norm": 953.0073852539062, "learning_rate": 0.0004249357328205462, "loss": 38.0093, "step": 4862 }, { "epoch": 12.843842852426544, "grad_norm": 583.9441528320312, "learning_rate": 0.0004249058854681532, "loss": 38.5551, "step": 4863 }, { "epoch": 12.846483988114889, "grad_norm": 361.22857666015625, "learning_rate": 0.00042487603323161606, "loss": 38.6917, "step": 4864 }, { "epoch": 12.849125123803235, "grad_norm": 736.2733154296875, "learning_rate": 0.00042484617611176844, "loss": 40.4421, "step": 4865 }, { "epoch": 12.851766259491582, "grad_norm": 2048.880126953125, "learning_rate": 0.0004248163141094441, "loss": 54.1885, "step": 4866 }, { "epoch": 12.854407395179928, "grad_norm": 5227.96044921875, "learning_rate": 0.00042478644722547683, "loss": 50.5137, "step": 4867 }, { "epoch": 12.857048530868273, "grad_norm": 3360.5546875, "learning_rate": 0.00042475657546070067, "loss": 49.0676, "step": 4868 }, { "epoch": 12.85968966655662, "grad_norm": 2688.56982421875, "learning_rate": 0.0004247266988159498, "loss": 46.5499, "step": 4869 }, { "epoch": 12.862330802244966, "grad_norm": 1559.300537109375, "learning_rate": 0.00042469681729205864, "loss": 38.4972, "step": 4870 }, { "epoch": 12.864971937933312, "grad_norm": 1778.1082763671875, "learning_rate": 0.00042466693088986137, "loss": 30.2572, "step": 4871 }, { "epoch": 12.867613073621657, "grad_norm": 3252.766845703125, "learning_rate": 0.0004246370396101927, "loss": 28.543, "step": 4872 }, { "epoch": 12.870254209310003, "grad_norm": 962.1363525390625, "learning_rate": 0.0004246071434538872, "loss": 31.2029, "step": 4873 }, { "epoch": 12.87289534499835, "grad_norm": 2351.68310546875, "learning_rate": 0.00042457724242177985, "loss": 27.3337, "step": 4874 }, { "epoch": 12.875536480686696, "grad_norm": 1215.560791015625, "learning_rate": 0.0004245473365147056, "loss": 20.9846, "step": 4875 }, { "epoch": 12.87817761637504, "grad_norm": 689.3510131835938, "learning_rate": 0.00042451742573349947, "loss": 38.5177, "step": 4876 }, { "epoch": 12.880818752063387, "grad_norm": 1009.4086303710938, "learning_rate": 0.00042448751007899676, "loss": 42.6693, "step": 4877 }, { "epoch": 12.883459887751734, "grad_norm": 792.9686889648438, "learning_rate": 0.0004244575895520328, "loss": 43.757, "step": 4878 }, { "epoch": 12.88610102344008, "grad_norm": 763.7418823242188, "learning_rate": 0.00042442766415344313, "loss": 42.787, "step": 4879 }, { "epoch": 12.888742159128425, "grad_norm": 1060.2349853515625, "learning_rate": 0.00042439773388406346, "loss": 41.7125, "step": 4880 }, { "epoch": 12.891383294816771, "grad_norm": 635.3150634765625, "learning_rate": 0.00042436779874472947, "loss": 40.7139, "step": 4881 }, { "epoch": 12.894024430505118, "grad_norm": 558.4193115234375, "learning_rate": 0.00042433785873627713, "loss": 40.2113, "step": 4882 }, { "epoch": 12.896665566193462, "grad_norm": 523.2048950195312, "learning_rate": 0.00042430791385954247, "loss": 41.4728, "step": 4883 }, { "epoch": 12.899306701881809, "grad_norm": 729.4921875, "learning_rate": 0.0004242779641153618, "loss": 39.0276, "step": 4884 }, { "epoch": 12.901947837570155, "grad_norm": 1869.268310546875, "learning_rate": 0.00042424800950457124, "loss": 39.7855, "step": 4885 }, { "epoch": 12.904588973258502, "grad_norm": 405.832275390625, "learning_rate": 0.0004242180500280075, "loss": 39.5618, "step": 4886 }, { "epoch": 12.907230108946846, "grad_norm": 631.4027099609375, "learning_rate": 0.0004241880856865069, "loss": 39.0131, "step": 4887 }, { "epoch": 12.909871244635193, "grad_norm": 923.0225219726562, "learning_rate": 0.00042415811648090646, "loss": 38.2401, "step": 4888 }, { "epoch": 12.91251238032354, "grad_norm": 920.51708984375, "learning_rate": 0.0004241281424120429, "loss": 39.3193, "step": 4889 }, { "epoch": 12.915153516011886, "grad_norm": 503.4462585449219, "learning_rate": 0.0004240981634807532, "loss": 39.5792, "step": 4890 }, { "epoch": 12.91779465170023, "grad_norm": 1676.954345703125, "learning_rate": 0.0004240681796878746, "loss": 39.1094, "step": 4891 }, { "epoch": 12.920435787388577, "grad_norm": 874.9702758789062, "learning_rate": 0.00042403819103424437, "loss": 39.4223, "step": 4892 }, { "epoch": 12.923076923076923, "grad_norm": 947.8944702148438, "learning_rate": 0.00042400819752069985, "loss": 42.0177, "step": 4893 }, { "epoch": 12.92571805876527, "grad_norm": 2513.3603515625, "learning_rate": 0.00042397819914807855, "loss": 43.1065, "step": 4894 }, { "epoch": 12.928359194453614, "grad_norm": 361.4359130859375, "learning_rate": 0.0004239481959172183, "loss": 43.1555, "step": 4895 }, { "epoch": 12.931000330141961, "grad_norm": 260.5422058105469, "learning_rate": 0.000423918187828957, "loss": 41.9331, "step": 4896 }, { "epoch": 12.933641465830307, "grad_norm": 452.75775146484375, "learning_rate": 0.00042388817488413227, "loss": 45.3306, "step": 4897 }, { "epoch": 12.936282601518654, "grad_norm": 347.426025390625, "learning_rate": 0.00042385815708358257, "loss": 44.5411, "step": 4898 }, { "epoch": 12.938923737206999, "grad_norm": 319.4422912597656, "learning_rate": 0.0004238281344281458, "loss": 41.9878, "step": 4899 }, { "epoch": 12.941564872895345, "grad_norm": 412.9517822265625, "learning_rate": 0.00042379810691866064, "loss": 40.1593, "step": 4900 }, { "epoch": 12.944206008583691, "grad_norm": 648.8578491210938, "learning_rate": 0.00042376807455596534, "loss": 38.7975, "step": 4901 }, { "epoch": 12.946847144272038, "grad_norm": 381.6836242675781, "learning_rate": 0.0004237380373408986, "loss": 38.7997, "step": 4902 }, { "epoch": 12.949488279960383, "grad_norm": 354.8941955566406, "learning_rate": 0.00042370799527429934, "loss": 37.4157, "step": 4903 }, { "epoch": 12.952129415648729, "grad_norm": 335.48358154296875, "learning_rate": 0.0004236779483570062, "loss": 38.2781, "step": 4904 }, { "epoch": 12.954770551337075, "grad_norm": 291.9018249511719, "learning_rate": 0.0004236478965898586, "loss": 37.1935, "step": 4905 }, { "epoch": 12.95741168702542, "grad_norm": 2122.161865234375, "learning_rate": 0.0004236178399736953, "loss": 47.2286, "step": 4906 }, { "epoch": 12.960052822713767, "grad_norm": 3824.21875, "learning_rate": 0.00042358777850935595, "loss": 60.8873, "step": 4907 }, { "epoch": 12.962693958402113, "grad_norm": 8197.7080078125, "learning_rate": 0.0004235577121976797, "loss": 53.7918, "step": 4908 }, { "epoch": 12.96533509409046, "grad_norm": 3827.230712890625, "learning_rate": 0.00042352764103950635, "loss": 52.4923, "step": 4909 }, { "epoch": 12.967976229778804, "grad_norm": 1461.85888671875, "learning_rate": 0.0004234975650356756, "loss": 33.5539, "step": 4910 }, { "epoch": 12.97061736546715, "grad_norm": 2488.974853515625, "learning_rate": 0.0004234674841870273, "loss": 37.5471, "step": 4911 }, { "epoch": 12.973258501155497, "grad_norm": 730.5457153320312, "learning_rate": 0.0004234373984944013, "loss": 35.0121, "step": 4912 }, { "epoch": 12.975899636843844, "grad_norm": 391.6708984375, "learning_rate": 0.0004234073079586379, "loss": 38.196, "step": 4913 }, { "epoch": 12.978540772532188, "grad_norm": 340.5655212402344, "learning_rate": 0.0004233772125805773, "loss": 37.6634, "step": 4914 }, { "epoch": 12.981181908220535, "grad_norm": 382.2851257324219, "learning_rate": 0.0004233471123610598, "loss": 39.3318, "step": 4915 }, { "epoch": 12.983823043908881, "grad_norm": 873.0800170898438, "learning_rate": 0.00042331700730092607, "loss": 43.3652, "step": 4916 }, { "epoch": 12.986464179597228, "grad_norm": 530.3013305664062, "learning_rate": 0.00042328689740101676, "loss": 37.9007, "step": 4917 }, { "epoch": 12.989105315285572, "grad_norm": 488.4697570800781, "learning_rate": 0.0004232567826621726, "loss": 39.8528, "step": 4918 }, { "epoch": 12.991746450973919, "grad_norm": 1156.6123046875, "learning_rate": 0.00042322666308523463, "loss": 38.9789, "step": 4919 }, { "epoch": 12.994387586662265, "grad_norm": 782.60595703125, "learning_rate": 0.0004231965386710437, "loss": 37.4954, "step": 4920 }, { "epoch": 12.997028722350612, "grad_norm": 1242.1982421875, "learning_rate": 0.0004231664094204413, "loss": 39.7946, "step": 4921 }, { "epoch": 12.999669858038956, "grad_norm": 1232.7789306640625, "learning_rate": 0.0004231362753342686, "loss": 44.0491, "step": 4922 }, { "epoch": 13.002310993727303, "grad_norm": 581.5228881835938, "learning_rate": 0.00042310613641336706, "loss": 46.0965, "step": 4923 }, { "epoch": 13.00495212941565, "grad_norm": 506.5477294921875, "learning_rate": 0.00042307599265857846, "loss": 46.2469, "step": 4924 }, { "epoch": 13.007593265103996, "grad_norm": 641.2460327148438, "learning_rate": 0.00042304584407074433, "loss": 47.6065, "step": 4925 }, { "epoch": 13.01023440079234, "grad_norm": 496.69354248046875, "learning_rate": 0.0004230156906507067, "loss": 44.7415, "step": 4926 }, { "epoch": 13.012875536480687, "grad_norm": 322.9624328613281, "learning_rate": 0.00042298553239930755, "loss": 44.4105, "step": 4927 }, { "epoch": 13.015516672169033, "grad_norm": 282.4483337402344, "learning_rate": 0.000422955369317389, "loss": 45.1546, "step": 4928 }, { "epoch": 13.018157807857378, "grad_norm": 439.4166259765625, "learning_rate": 0.0004229252014057934, "loss": 44.6852, "step": 4929 }, { "epoch": 13.020798943545724, "grad_norm": 482.1062316894531, "learning_rate": 0.0004228950286653631, "loss": 43.3371, "step": 4930 }, { "epoch": 13.02344007923407, "grad_norm": 620.0262451171875, "learning_rate": 0.0004228648510969407, "loss": 43.2153, "step": 4931 }, { "epoch": 13.026081214922417, "grad_norm": 575.6287841796875, "learning_rate": 0.0004228346687013689, "loss": 42.4885, "step": 4932 }, { "epoch": 13.028722350610762, "grad_norm": 561.9248657226562, "learning_rate": 0.0004228044814794905, "loss": 40.7125, "step": 4933 }, { "epoch": 13.031363486299108, "grad_norm": 722.7451782226562, "learning_rate": 0.00042277428943214844, "loss": 40.0306, "step": 4934 }, { "epoch": 13.034004621987455, "grad_norm": 914.9729614257812, "learning_rate": 0.0004227440925601859, "loss": 40.1839, "step": 4935 }, { "epoch": 13.036645757675801, "grad_norm": 447.8088073730469, "learning_rate": 0.0004227138908644459, "loss": 38.6788, "step": 4936 }, { "epoch": 13.039286893364146, "grad_norm": 1055.7154541015625, "learning_rate": 0.0004226836843457721, "loss": 37.0084, "step": 4937 }, { "epoch": 13.041928029052492, "grad_norm": 518.2670288085938, "learning_rate": 0.0004226534730050078, "loss": 38.2456, "step": 4938 }, { "epoch": 13.044569164740839, "grad_norm": 518.3629150390625, "learning_rate": 0.00042262325684299675, "loss": 38.4939, "step": 4939 }, { "epoch": 13.047210300429185, "grad_norm": 461.7813415527344, "learning_rate": 0.0004225930358605826, "loss": 37.6886, "step": 4940 }, { "epoch": 13.04985143611753, "grad_norm": 562.7626342773438, "learning_rate": 0.0004225628100586093, "loss": 38.505, "step": 4941 }, { "epoch": 13.052492571805876, "grad_norm": 905.5946655273438, "learning_rate": 0.00042253257943792097, "loss": 38.142, "step": 4942 }, { "epoch": 13.055133707494223, "grad_norm": 1294.7142333984375, "learning_rate": 0.0004225023439993616, "loss": 37.9785, "step": 4943 }, { "epoch": 13.05777484318257, "grad_norm": 3404.297607421875, "learning_rate": 0.00042247210374377563, "loss": 47.9181, "step": 4944 }, { "epoch": 13.060415978870914, "grad_norm": 1774.8515625, "learning_rate": 0.0004224418586720076, "loss": 75.3899, "step": 4945 }, { "epoch": 13.06305711455926, "grad_norm": 15780.4912109375, "learning_rate": 0.0004224116087849018, "loss": 83.9169, "step": 4946 }, { "epoch": 13.065698250247607, "grad_norm": 5779.02294921875, "learning_rate": 0.00042238135408330325, "loss": 103.6882, "step": 4947 }, { "epoch": 13.068339385935953, "grad_norm": 2472.5859375, "learning_rate": 0.0004223510945680565, "loss": 89.1145, "step": 4948 }, { "epoch": 13.070980521624298, "grad_norm": 4324.14453125, "learning_rate": 0.0004223208302400068, "loss": 88.2292, "step": 4949 }, { "epoch": 13.073621657312644, "grad_norm": 4588.26416015625, "learning_rate": 0.00042229056109999906, "loss": 75.2432, "step": 4950 }, { "epoch": 13.076262793000991, "grad_norm": 2508.876953125, "learning_rate": 0.00042226028714887866, "loss": 72.5221, "step": 4951 }, { "epoch": 13.078903928689336, "grad_norm": 7187.29638671875, "learning_rate": 0.000422230008387491, "loss": 62.5287, "step": 4952 }, { "epoch": 13.081545064377682, "grad_norm": 1987.8411865234375, "learning_rate": 0.0004221997248166814, "loss": 58.4677, "step": 4953 }, { "epoch": 13.084186200066029, "grad_norm": 3472.69580078125, "learning_rate": 0.0004221694364372957, "loss": 53.496, "step": 4954 }, { "epoch": 13.086827335754375, "grad_norm": 738.446044921875, "learning_rate": 0.00042213914325017965, "loss": 44.5168, "step": 4955 }, { "epoch": 13.08946847144272, "grad_norm": 610.0072631835938, "learning_rate": 0.0004221088452561791, "loss": 40.1596, "step": 4956 }, { "epoch": 13.092109607131066, "grad_norm": 556.3998413085938, "learning_rate": 0.00042207854245614017, "loss": 41.5474, "step": 4957 }, { "epoch": 13.094750742819413, "grad_norm": 458.6668395996094, "learning_rate": 0.000422048234850909, "loss": 40.9367, "step": 4958 }, { "epoch": 13.097391878507759, "grad_norm": 484.6396179199219, "learning_rate": 0.000422017922441332, "loss": 39.7964, "step": 4959 }, { "epoch": 13.100033014196104, "grad_norm": 516.1243286132812, "learning_rate": 0.00042198760522825545, "loss": 38.8476, "step": 4960 }, { "epoch": 13.10267414988445, "grad_norm": 343.0889892578125, "learning_rate": 0.00042195728321252617, "loss": 39.1144, "step": 4961 }, { "epoch": 13.105315285572797, "grad_norm": 560.9722900390625, "learning_rate": 0.0004219269563949907, "loss": 38.7291, "step": 4962 }, { "epoch": 13.107956421261143, "grad_norm": 504.7649230957031, "learning_rate": 0.00042189662477649606, "loss": 39.4437, "step": 4963 }, { "epoch": 13.110597556949488, "grad_norm": 392.3074035644531, "learning_rate": 0.000421866288357889, "loss": 39.3186, "step": 4964 }, { "epoch": 13.113238692637834, "grad_norm": 493.6226501464844, "learning_rate": 0.00042183594714001693, "loss": 40.0487, "step": 4965 }, { "epoch": 13.11587982832618, "grad_norm": 465.50048828125, "learning_rate": 0.0004218056011237269, "loss": 39.529, "step": 4966 }, { "epoch": 13.118520964014527, "grad_norm": 501.5942077636719, "learning_rate": 0.0004217752503098664, "loss": 37.6678, "step": 4967 }, { "epoch": 13.121162099702872, "grad_norm": 371.3668518066406, "learning_rate": 0.000421744894699283, "loss": 39.4624, "step": 4968 }, { "epoch": 13.123803235391218, "grad_norm": 1119.6292724609375, "learning_rate": 0.00042171453429282424, "loss": 37.1128, "step": 4969 }, { "epoch": 13.126444371079565, "grad_norm": 1091.1298828125, "learning_rate": 0.000421684169091338, "loss": 38.1767, "step": 4970 }, { "epoch": 13.129085506767911, "grad_norm": 742.1924438476562, "learning_rate": 0.0004216537990956722, "loss": 39.6825, "step": 4971 }, { "epoch": 13.131726642456256, "grad_norm": 3058.659423828125, "learning_rate": 0.0004216234243066749, "loss": 43.6711, "step": 4972 }, { "epoch": 13.134367778144602, "grad_norm": 702.1382446289062, "learning_rate": 0.00042159304472519425, "loss": 42.4535, "step": 4973 }, { "epoch": 13.137008913832949, "grad_norm": 217.85986328125, "learning_rate": 0.0004215626603520787, "loss": 42.2645, "step": 4974 }, { "epoch": 13.139650049521293, "grad_norm": 429.3426208496094, "learning_rate": 0.0004215322711881766, "loss": 41.0353, "step": 4975 }, { "epoch": 13.14229118520964, "grad_norm": 461.11468505859375, "learning_rate": 0.0004215018772343365, "loss": 41.8465, "step": 4976 }, { "epoch": 13.144932320897986, "grad_norm": 343.30743408203125, "learning_rate": 0.00042147147849140733, "loss": 45.6257, "step": 4977 }, { "epoch": 13.147573456586333, "grad_norm": 246.90371704101562, "learning_rate": 0.0004214410749602379, "loss": 45.2121, "step": 4978 }, { "epoch": 13.150214592274677, "grad_norm": 478.2987060546875, "learning_rate": 0.000421410666641677, "loss": 43.8474, "step": 4979 }, { "epoch": 13.152855727963024, "grad_norm": 351.00250244140625, "learning_rate": 0.00042138025353657407, "loss": 43.2808, "step": 4980 }, { "epoch": 13.15549686365137, "grad_norm": 271.427001953125, "learning_rate": 0.00042134983564577817, "loss": 46.4295, "step": 4981 }, { "epoch": 13.158137999339717, "grad_norm": 276.7729797363281, "learning_rate": 0.00042131941297013874, "loss": 43.2815, "step": 4982 }, { "epoch": 13.160779135028061, "grad_norm": 303.9843444824219, "learning_rate": 0.0004212889855105054, "loss": 41.3656, "step": 4983 }, { "epoch": 13.163420270716408, "grad_norm": 287.41705322265625, "learning_rate": 0.0004212585532677278, "loss": 40.6162, "step": 4984 }, { "epoch": 13.166061406404754, "grad_norm": 558.5026245117188, "learning_rate": 0.00042122811624265564, "loss": 40.4644, "step": 4985 }, { "epoch": 13.1687025420931, "grad_norm": 285.19482421875, "learning_rate": 0.0004211976744361389, "loss": 40.2421, "step": 4986 }, { "epoch": 13.171343677781445, "grad_norm": 442.144775390625, "learning_rate": 0.00042116722784902775, "loss": 40.6544, "step": 4987 }, { "epoch": 13.173984813469792, "grad_norm": 426.5894775390625, "learning_rate": 0.0004211367764821722, "loss": 40.014, "step": 4988 }, { "epoch": 13.176625949158138, "grad_norm": 888.3052978515625, "learning_rate": 0.00042110632033642273, "loss": 39.3063, "step": 4989 }, { "epoch": 13.179267084846485, "grad_norm": 277.1825866699219, "learning_rate": 0.00042107585941262985, "loss": 39.6134, "step": 4990 }, { "epoch": 13.18190822053483, "grad_norm": 314.93682861328125, "learning_rate": 0.00042104539371164395, "loss": 37.8582, "step": 4991 }, { "epoch": 13.184549356223176, "grad_norm": 625.8118896484375, "learning_rate": 0.00042101492323431603, "loss": 38.8925, "step": 4992 }, { "epoch": 13.187190491911522, "grad_norm": 495.3768615722656, "learning_rate": 0.0004209844479814968, "loss": 37.5222, "step": 4993 }, { "epoch": 13.189831627599869, "grad_norm": 370.51800537109375, "learning_rate": 0.0004209539679540373, "loss": 38.3501, "step": 4994 }, { "epoch": 13.192472763288213, "grad_norm": 985.994873046875, "learning_rate": 0.0004209234831527887, "loss": 49.0833, "step": 4995 }, { "epoch": 13.19511389897656, "grad_norm": 6353.24658203125, "learning_rate": 0.00042089299357860223, "loss": 56.6911, "step": 4996 }, { "epoch": 13.197755034664906, "grad_norm": 4616.6513671875, "learning_rate": 0.00042086249923232923, "loss": 52.3653, "step": 4997 }, { "epoch": 13.200396170353251, "grad_norm": 3426.326416015625, "learning_rate": 0.00042083200011482135, "loss": 53.9101, "step": 4998 }, { "epoch": 13.203037306041598, "grad_norm": 3162.85498046875, "learning_rate": 0.0004208014962269303, "loss": 53.2608, "step": 4999 }, { "epoch": 13.205678441729944, "grad_norm": 2099.33544921875, "learning_rate": 0.0004207709875695077, "loss": 47.2055, "step": 5000 }, { "epoch": 13.205678441729944, "eval_loss": 4.83643102645874, "eval_runtime": 2.2473, "eval_samples_per_second": 220.267, "eval_steps_per_second": 27.589, "step": 5000 }, { "epoch": 13.20831957741829, "grad_norm": 3575.465087890625, "learning_rate": 0.00042074047414340565, "loss": 40.0201, "step": 5001 }, { "epoch": 13.210960713106635, "grad_norm": 3714.200927734375, "learning_rate": 0.00042070995594947617, "loss": 41.2117, "step": 5002 }, { "epoch": 13.213601848794982, "grad_norm": 12987.466796875, "learning_rate": 0.00042067943298857135, "loss": 36.3709, "step": 5003 }, { "epoch": 13.216242984483328, "grad_norm": 5245.8271484375, "learning_rate": 0.0004206489052615437, "loss": 27.6734, "step": 5004 }, { "epoch": 13.218884120171674, "grad_norm": 1055.0496826171875, "learning_rate": 0.0004206183727692456, "loss": 30.0619, "step": 5005 }, { "epoch": 13.22152525586002, "grad_norm": 1569.36376953125, "learning_rate": 0.00042058783551252965, "loss": 39.3647, "step": 5006 }, { "epoch": 13.224166391548366, "grad_norm": 450.7438049316406, "learning_rate": 0.00042055729349224863, "loss": 37.222, "step": 5007 }, { "epoch": 13.226807527236712, "grad_norm": 424.77471923828125, "learning_rate": 0.0004205267467092554, "loss": 41.0745, "step": 5008 }, { "epoch": 13.229448662925058, "grad_norm": 759.7691040039062, "learning_rate": 0.0004204961951644029, "loss": 39.9106, "step": 5009 }, { "epoch": 13.232089798613403, "grad_norm": 483.1945495605469, "learning_rate": 0.0004204656388585443, "loss": 39.073, "step": 5010 }, { "epoch": 13.23473093430175, "grad_norm": 513.5589599609375, "learning_rate": 0.0004204350777925329, "loss": 39.7732, "step": 5011 }, { "epoch": 13.237372069990096, "grad_norm": 906.0276489257812, "learning_rate": 0.00042040451196722207, "loss": 38.66, "step": 5012 }, { "epoch": 13.240013205678443, "grad_norm": 411.7055358886719, "learning_rate": 0.0004203739413834653, "loss": 39.369, "step": 5013 }, { "epoch": 13.242654341366787, "grad_norm": 563.2393188476562, "learning_rate": 0.0004203433660421164, "loss": 37.85, "step": 5014 }, { "epoch": 13.245295477055134, "grad_norm": 651.3539428710938, "learning_rate": 0.000420312785944029, "loss": 40.3624, "step": 5015 }, { "epoch": 13.24793661274348, "grad_norm": 1091.46484375, "learning_rate": 0.0004202822010900571, "loss": 40.154, "step": 5016 }, { "epoch": 13.250577748431827, "grad_norm": 1318.259033203125, "learning_rate": 0.0004202516114810547, "loss": 39.6744, "step": 5017 }, { "epoch": 13.253218884120171, "grad_norm": 542.1820068359375, "learning_rate": 0.0004202210171178762, "loss": 40.0673, "step": 5018 }, { "epoch": 13.255860019808518, "grad_norm": 1528.1746826171875, "learning_rate": 0.0004201904180013757, "loss": 37.5643, "step": 5019 }, { "epoch": 13.258501155496864, "grad_norm": 484.8447265625, "learning_rate": 0.0004201598141324078, "loss": 40.8786, "step": 5020 }, { "epoch": 13.261142291185209, "grad_norm": 859.8868408203125, "learning_rate": 0.000420129205511827, "loss": 39.3107, "step": 5021 }, { "epoch": 13.263783426873555, "grad_norm": 1141.6298828125, "learning_rate": 0.0004200985921404881, "loss": 43.5957, "step": 5022 }, { "epoch": 13.266424562561902, "grad_norm": 2704.436279296875, "learning_rate": 0.0004200679740192459, "loss": 43.0811, "step": 5023 }, { "epoch": 13.269065698250248, "grad_norm": 275.3863830566406, "learning_rate": 0.00042003735114895546, "loss": 42.1295, "step": 5024 }, { "epoch": 13.271706833938593, "grad_norm": 953.1244506835938, "learning_rate": 0.00042000672353047187, "loss": 40.7511, "step": 5025 }, { "epoch": 13.27434796962694, "grad_norm": 358.1284484863281, "learning_rate": 0.0004199760911646504, "loss": 42.598, "step": 5026 }, { "epoch": 13.276989105315286, "grad_norm": 693.0626220703125, "learning_rate": 0.00041994545405234647, "loss": 44.0505, "step": 5027 }, { "epoch": 13.279630241003632, "grad_norm": 943.361572265625, "learning_rate": 0.00041991481219441553, "loss": 43.4776, "step": 5028 }, { "epoch": 13.282271376691977, "grad_norm": 366.1666564941406, "learning_rate": 0.0004198841655917133, "loss": 44.2379, "step": 5029 }, { "epoch": 13.284912512380323, "grad_norm": 639.05078125, "learning_rate": 0.00041985351424509543, "loss": 44.129, "step": 5030 }, { "epoch": 13.28755364806867, "grad_norm": 414.1295166015625, "learning_rate": 0.000419822858155418, "loss": 42.9797, "step": 5031 }, { "epoch": 13.290194783757016, "grad_norm": 335.8523254394531, "learning_rate": 0.0004197921973235371, "loss": 44.0663, "step": 5032 }, { "epoch": 13.292835919445361, "grad_norm": 729.2303466796875, "learning_rate": 0.00041976153175030877, "loss": 42.8171, "step": 5033 }, { "epoch": 13.295477055133707, "grad_norm": 569.7051391601562, "learning_rate": 0.00041973086143658943, "loss": 40.821, "step": 5034 }, { "epoch": 13.298118190822054, "grad_norm": 377.61273193359375, "learning_rate": 0.00041970018638323546, "loss": 40.7006, "step": 5035 }, { "epoch": 13.3007593265104, "grad_norm": 401.3155517578125, "learning_rate": 0.0004196695065911035, "loss": 39.5837, "step": 5036 }, { "epoch": 13.303400462198745, "grad_norm": 3698.365478515625, "learning_rate": 0.0004196388220610502, "loss": 38.44, "step": 5037 }, { "epoch": 13.306041597887091, "grad_norm": 367.1592712402344, "learning_rate": 0.0004196081327939324, "loss": 39.3359, "step": 5038 }, { "epoch": 13.308682733575438, "grad_norm": 386.5728759765625, "learning_rate": 0.00041957743879060717, "loss": 38.2323, "step": 5039 }, { "epoch": 13.311323869263784, "grad_norm": 387.8990783691406, "learning_rate": 0.0004195467400519316, "loss": 37.449, "step": 5040 }, { "epoch": 13.313965004952129, "grad_norm": 443.2782287597656, "learning_rate": 0.0004195160365787629, "loss": 37.3826, "step": 5041 }, { "epoch": 13.316606140640475, "grad_norm": 361.41473388671875, "learning_rate": 0.00041948532837195846, "loss": 38.042, "step": 5042 }, { "epoch": 13.319247276328822, "grad_norm": 344.48724365234375, "learning_rate": 0.00041945461543237584, "loss": 38.2913, "step": 5043 }, { "epoch": 13.321888412017167, "grad_norm": 1332.8369140625, "learning_rate": 0.0004194238977608725, "loss": 40.5958, "step": 5044 }, { "epoch": 13.324529547705513, "grad_norm": 1502.505615234375, "learning_rate": 0.0004193931753583064, "loss": 54.9946, "step": 5045 }, { "epoch": 13.32717068339386, "grad_norm": 6295.77392578125, "learning_rate": 0.0004193624482255354, "loss": 48.3486, "step": 5046 }, { "epoch": 13.329811819082206, "grad_norm": 2886.09033203125, "learning_rate": 0.0004193317163634175, "loss": 43.764, "step": 5047 }, { "epoch": 13.33245295477055, "grad_norm": 2687.663330078125, "learning_rate": 0.00041930097977281095, "loss": 41.6909, "step": 5048 }, { "epoch": 13.335094090458897, "grad_norm": 1889.379150390625, "learning_rate": 0.0004192702384545739, "loss": 41.8285, "step": 5049 }, { "epoch": 13.337735226147243, "grad_norm": 1665.285888671875, "learning_rate": 0.00041923949240956496, "loss": 38.5112, "step": 5050 }, { "epoch": 13.34037636183559, "grad_norm": 964.7704467773438, "learning_rate": 0.0004192087416386426, "loss": 31.2882, "step": 5051 }, { "epoch": 13.343017497523935, "grad_norm": 1475.7392578125, "learning_rate": 0.0004191779861426654, "loss": 21.2819, "step": 5052 }, { "epoch": 13.345658633212281, "grad_norm": 525.437255859375, "learning_rate": 0.00041914722592249255, "loss": 21.1733, "step": 5053 }, { "epoch": 13.348299768900628, "grad_norm": 1487.3851318359375, "learning_rate": 0.0004191164609789826, "loss": 23.2461, "step": 5054 }, { "epoch": 13.350940904588974, "grad_norm": 514.3805541992188, "learning_rate": 0.0004190856913129949, "loss": 43.535, "step": 5055 }, { "epoch": 13.353582040277319, "grad_norm": 490.28814697265625, "learning_rate": 0.00041905491692538857, "loss": 44.3433, "step": 5056 }, { "epoch": 13.356223175965665, "grad_norm": 442.40985107421875, "learning_rate": 0.000419024137817023, "loss": 44.5523, "step": 5057 }, { "epoch": 13.358864311654012, "grad_norm": 302.2409362792969, "learning_rate": 0.0004189933539887576, "loss": 41.8397, "step": 5058 }, { "epoch": 13.361505447342358, "grad_norm": 293.2207946777344, "learning_rate": 0.00041896256544145217, "loss": 40.9951, "step": 5059 }, { "epoch": 13.364146583030703, "grad_norm": 226.64505004882812, "learning_rate": 0.00041893177217596633, "loss": 40.1621, "step": 5060 }, { "epoch": 13.36678771871905, "grad_norm": 163.30711364746094, "learning_rate": 0.00041890097419316, "loss": 40.4418, "step": 5061 }, { "epoch": 13.369428854407396, "grad_norm": 203.55703735351562, "learning_rate": 0.0004188701714938931, "loss": 41.2102, "step": 5062 }, { "epoch": 13.372069990095742, "grad_norm": 190.450439453125, "learning_rate": 0.0004188393640790259, "loss": 40.7029, "step": 5063 }, { "epoch": 13.374711125784087, "grad_norm": 208.352294921875, "learning_rate": 0.0004188085519494187, "loss": 38.8052, "step": 5064 }, { "epoch": 13.377352261472433, "grad_norm": 133.6958770751953, "learning_rate": 0.00041877773510593175, "loss": 38.0903, "step": 5065 }, { "epoch": 13.37999339716078, "grad_norm": 156.20872497558594, "learning_rate": 0.00041874691354942573, "loss": 39.2281, "step": 5066 }, { "epoch": 13.382634532849124, "grad_norm": 376.4897155761719, "learning_rate": 0.00041871608728076125, "loss": 38.9717, "step": 5067 }, { "epoch": 13.38527566853747, "grad_norm": 167.6680145263672, "learning_rate": 0.00041868525630079924, "loss": 38.301, "step": 5068 }, { "epoch": 13.387916804225817, "grad_norm": 581.3779296875, "learning_rate": 0.0004186544206104005, "loss": 39.5571, "step": 5069 }, { "epoch": 13.390557939914164, "grad_norm": 146.11627197265625, "learning_rate": 0.0004186235802104261, "loss": 40.0857, "step": 5070 }, { "epoch": 13.393199075602508, "grad_norm": 219.94818115234375, "learning_rate": 0.00041859273510173737, "loss": 41.566, "step": 5071 }, { "epoch": 13.395840211290855, "grad_norm": 826.9295043945312, "learning_rate": 0.0004185618852851954, "loss": 43.2378, "step": 5072 }, { "epoch": 13.398481346979201, "grad_norm": 189.0330810546875, "learning_rate": 0.00041853103076166186, "loss": 44.1367, "step": 5073 }, { "epoch": 13.401122482667548, "grad_norm": 117.50833129882812, "learning_rate": 0.00041850017153199833, "loss": 41.2201, "step": 5074 }, { "epoch": 13.403763618355892, "grad_norm": 155.43502807617188, "learning_rate": 0.00041846930759706645, "loss": 42.3305, "step": 5075 }, { "epoch": 13.406404754044239, "grad_norm": 164.98326110839844, "learning_rate": 0.00041843843895772817, "loss": 42.1056, "step": 5076 }, { "epoch": 13.409045889732585, "grad_norm": 243.7605438232422, "learning_rate": 0.0004184075656148454, "loss": 47.5432, "step": 5077 }, { "epoch": 13.411687025420932, "grad_norm": 186.86337280273438, "learning_rate": 0.00041837668756928025, "loss": 43.0605, "step": 5078 }, { "epoch": 13.414328161109276, "grad_norm": 335.2673034667969, "learning_rate": 0.00041834580482189504, "loss": 47.7145, "step": 5079 }, { "epoch": 13.416969296797623, "grad_norm": 194.46359252929688, "learning_rate": 0.0004183149173735521, "loss": 45.5259, "step": 5080 }, { "epoch": 13.41961043248597, "grad_norm": 145.12705993652344, "learning_rate": 0.000418284025225114, "loss": 43.8856, "step": 5081 }, { "epoch": 13.422251568174316, "grad_norm": 204.806640625, "learning_rate": 0.00041825312837744333, "loss": 42.8835, "step": 5082 }, { "epoch": 13.42489270386266, "grad_norm": 132.01885986328125, "learning_rate": 0.00041822222683140294, "loss": 44.0959, "step": 5083 }, { "epoch": 13.427533839551007, "grad_norm": 255.4943389892578, "learning_rate": 0.0004181913205878556, "loss": 40.6268, "step": 5084 }, { "epoch": 13.430174975239353, "grad_norm": 128.51296997070312, "learning_rate": 0.00041816040964766447, "loss": 40.9737, "step": 5085 }, { "epoch": 13.4328161109277, "grad_norm": 161.0417938232422, "learning_rate": 0.00041812949401169277, "loss": 38.838, "step": 5086 }, { "epoch": 13.435457246616044, "grad_norm": 139.97254943847656, "learning_rate": 0.00041809857368080354, "loss": 37.325, "step": 5087 }, { "epoch": 13.438098382304391, "grad_norm": 162.08993530273438, "learning_rate": 0.00041806764865586057, "loss": 38.7729, "step": 5088 }, { "epoch": 13.440739517992737, "grad_norm": 242.7154998779297, "learning_rate": 0.0004180367189377271, "loss": 37.6584, "step": 5089 }, { "epoch": 13.443380653681082, "grad_norm": 212.305419921875, "learning_rate": 0.000418005784527267, "loss": 37.1614, "step": 5090 }, { "epoch": 13.446021789369428, "grad_norm": 239.97613525390625, "learning_rate": 0.0004179748454253441, "loss": 37.732, "step": 5091 }, { "epoch": 13.448662925057775, "grad_norm": 209.99176025390625, "learning_rate": 0.00041794390163282227, "loss": 38.0127, "step": 5092 }, { "epoch": 13.451304060746121, "grad_norm": 207.87890625, "learning_rate": 0.00041791295315056563, "loss": 38.9328, "step": 5093 }, { "epoch": 13.453945196434466, "grad_norm": 1203.7567138671875, "learning_rate": 0.00041788199997943844, "loss": 57.6794, "step": 5094 }, { "epoch": 13.456586332122813, "grad_norm": 2188.236572265625, "learning_rate": 0.0004178510421203051, "loss": 59.4687, "step": 5095 }, { "epoch": 13.459227467811159, "grad_norm": 1453.3985595703125, "learning_rate": 0.0004178200795740299, "loss": 55.3726, "step": 5096 }, { "epoch": 13.461868603499505, "grad_norm": 1549.4639892578125, "learning_rate": 0.0004177891123414776, "loss": 51.516, "step": 5097 }, { "epoch": 13.46450973918785, "grad_norm": 1627.537841796875, "learning_rate": 0.00041775814042351283, "loss": 41.2577, "step": 5098 }, { "epoch": 13.467150874876197, "grad_norm": 2564.28564453125, "learning_rate": 0.0004177271638210006, "loss": 35.4097, "step": 5099 }, { "epoch": 13.469792010564543, "grad_norm": 1597.3724365234375, "learning_rate": 0.0004176961825348059, "loss": 33.0083, "step": 5100 }, { "epoch": 13.47243314625289, "grad_norm": 1692.8480224609375, "learning_rate": 0.0004176651965657937, "loss": 27.4386, "step": 5101 }, { "epoch": 13.475074281941234, "grad_norm": 1841.515380859375, "learning_rate": 0.00041763420591482946, "loss": 22.1557, "step": 5102 }, { "epoch": 13.47771541762958, "grad_norm": 1173.68505859375, "learning_rate": 0.0004176032105827785, "loss": 22.7822, "step": 5103 }, { "epoch": 13.480356553317927, "grad_norm": 710.5396118164062, "learning_rate": 0.0004175722105705063, "loss": 34.5433, "step": 5104 }, { "epoch": 13.482997689006273, "grad_norm": 421.6883239746094, "learning_rate": 0.00041754120587887857, "loss": 45.1409, "step": 5105 }, { "epoch": 13.485638824694618, "grad_norm": 543.7853393554688, "learning_rate": 0.00041751019650876104, "loss": 43.0775, "step": 5106 }, { "epoch": 13.488279960382965, "grad_norm": 405.66290283203125, "learning_rate": 0.00041747918246101977, "loss": 40.1512, "step": 5107 }, { "epoch": 13.490921096071311, "grad_norm": 475.8853454589844, "learning_rate": 0.0004174481637365206, "loss": 39.1398, "step": 5108 }, { "epoch": 13.493562231759658, "grad_norm": 296.0535888671875, "learning_rate": 0.00041741714033612993, "loss": 38.6906, "step": 5109 }, { "epoch": 13.496203367448002, "grad_norm": 454.6282958984375, "learning_rate": 0.00041738611226071386, "loss": 38.5834, "step": 5110 }, { "epoch": 13.498844503136349, "grad_norm": 527.34326171875, "learning_rate": 0.00041735507951113905, "loss": 38.2596, "step": 5111 }, { "epoch": 13.501485638824695, "grad_norm": 311.6153259277344, "learning_rate": 0.0004173240420882719, "loss": 37.147, "step": 5112 }, { "epoch": 13.50412677451304, "grad_norm": 611.2308959960938, "learning_rate": 0.0004172929999929791, "loss": 37.2845, "step": 5113 }, { "epoch": 13.506767910201386, "grad_norm": 910.9912719726562, "learning_rate": 0.00041726195322612763, "loss": 39.1666, "step": 5114 }, { "epoch": 13.509409045889733, "grad_norm": 378.4812927246094, "learning_rate": 0.0004172309017885844, "loss": 38.1241, "step": 5115 }, { "epoch": 13.51205018157808, "grad_norm": 495.8267822265625, "learning_rate": 0.0004171998456812164, "loss": 37.4364, "step": 5116 }, { "epoch": 13.514691317266424, "grad_norm": 440.6478576660156, "learning_rate": 0.0004171687849048909, "loss": 38.4135, "step": 5117 }, { "epoch": 13.51733245295477, "grad_norm": 571.7146606445312, "learning_rate": 0.00041713771946047534, "loss": 39.3306, "step": 5118 }, { "epoch": 13.519973588643117, "grad_norm": 580.5181274414062, "learning_rate": 0.0004171066493488371, "loss": 38.3602, "step": 5119 }, { "epoch": 13.522614724331463, "grad_norm": 458.6819763183594, "learning_rate": 0.0004170755745708439, "loss": 39.5323, "step": 5120 }, { "epoch": 13.525255860019808, "grad_norm": 662.8701171875, "learning_rate": 0.0004170444951273634, "loss": 37.8691, "step": 5121 }, { "epoch": 13.527896995708154, "grad_norm": 1142.6768798828125, "learning_rate": 0.0004170134110192635, "loss": 43.3037, "step": 5122 }, { "epoch": 13.5305381313965, "grad_norm": 891.966064453125, "learning_rate": 0.0004169823222474123, "loss": 43.8169, "step": 5123 }, { "epoch": 13.533179267084847, "grad_norm": 397.9765319824219, "learning_rate": 0.00041695122881267767, "loss": 45.2166, "step": 5124 }, { "epoch": 13.535820402773192, "grad_norm": 419.22088623046875, "learning_rate": 0.00041692013071592814, "loss": 42.3851, "step": 5125 }, { "epoch": 13.538461538461538, "grad_norm": 319.9222412109375, "learning_rate": 0.000416889027958032, "loss": 43.0166, "step": 5126 }, { "epoch": 13.541102674149885, "grad_norm": 236.74588012695312, "learning_rate": 0.00041685792053985785, "loss": 43.6664, "step": 5127 }, { "epoch": 13.543743809838231, "grad_norm": 294.76837158203125, "learning_rate": 0.00041682680846227415, "loss": 47.3152, "step": 5128 }, { "epoch": 13.546384945526576, "grad_norm": 384.4709777832031, "learning_rate": 0.00041679569172614996, "loss": 43.941, "step": 5129 }, { "epoch": 13.549026081214922, "grad_norm": 295.7247314453125, "learning_rate": 0.000416764570332354, "loss": 44.0761, "step": 5130 }, { "epoch": 13.551667216903269, "grad_norm": 284.3724365234375, "learning_rate": 0.00041673344428175534, "loss": 41.0821, "step": 5131 }, { "epoch": 13.554308352591615, "grad_norm": 246.98397827148438, "learning_rate": 0.00041670231357522326, "loss": 41.9057, "step": 5132 }, { "epoch": 13.55694948827996, "grad_norm": 382.61572265625, "learning_rate": 0.00041667117821362693, "loss": 41.7097, "step": 5133 }, { "epoch": 13.559590623968306, "grad_norm": 267.3830871582031, "learning_rate": 0.0004166400381978359, "loss": 41.2291, "step": 5134 }, { "epoch": 13.562231759656653, "grad_norm": 364.1096496582031, "learning_rate": 0.0004166088935287197, "loss": 40.6105, "step": 5135 }, { "epoch": 13.564872895344998, "grad_norm": 314.93157958984375, "learning_rate": 0.000416577744207148, "loss": 38.7004, "step": 5136 }, { "epoch": 13.567514031033344, "grad_norm": 304.9834899902344, "learning_rate": 0.00041654659023399065, "loss": 40.1218, "step": 5137 }, { "epoch": 13.57015516672169, "grad_norm": 379.2454833984375, "learning_rate": 0.0004165154316101176, "loss": 39.2246, "step": 5138 }, { "epoch": 13.572796302410037, "grad_norm": 275.96588134765625, "learning_rate": 0.00041648426833639895, "loss": 37.4609, "step": 5139 }, { "epoch": 13.575437438098382, "grad_norm": 332.2690124511719, "learning_rate": 0.0004164531004137049, "loss": 39.3652, "step": 5140 }, { "epoch": 13.578078573786728, "grad_norm": 327.5970153808594, "learning_rate": 0.00041642192784290585, "loss": 38.3544, "step": 5141 }, { "epoch": 13.580719709475074, "grad_norm": 282.54473876953125, "learning_rate": 0.0004163907506248722, "loss": 37.7327, "step": 5142 }, { "epoch": 13.583360845163421, "grad_norm": 579.9797973632812, "learning_rate": 0.0004163595687604745, "loss": 37.7918, "step": 5143 }, { "epoch": 13.586001980851766, "grad_norm": 14819.78125, "learning_rate": 0.00041632838225058366, "loss": 64.6574, "step": 5144 }, { "epoch": 13.588643116540112, "grad_norm": 3830.73828125, "learning_rate": 0.00041629719109607043, "loss": 103.4527, "step": 5145 }, { "epoch": 13.591284252228458, "grad_norm": 9246.9267578125, "learning_rate": 0.00041626599529780584, "loss": 102.3984, "step": 5146 }, { "epoch": 13.593925387916805, "grad_norm": 2954.966552734375, "learning_rate": 0.00041623479485666093, "loss": 78.8455, "step": 5147 }, { "epoch": 13.59656652360515, "grad_norm": 2055.234375, "learning_rate": 0.0004162035897735071, "loss": 106.4704, "step": 5148 }, { "epoch": 13.599207659293496, "grad_norm": 1906.12158203125, "learning_rate": 0.00041617238004921565, "loss": 99.1201, "step": 5149 }, { "epoch": 13.601848794981843, "grad_norm": 4158.54931640625, "learning_rate": 0.000416141165684658, "loss": 84.1739, "step": 5150 }, { "epoch": 13.604489930670189, "grad_norm": 5445.7158203125, "learning_rate": 0.00041610994668070606, "loss": 78.7862, "step": 5151 }, { "epoch": 13.607131066358534, "grad_norm": 3506.86083984375, "learning_rate": 0.00041607872303823135, "loss": 70.1902, "step": 5152 }, { "epoch": 13.60977220204688, "grad_norm": 2620.247314453125, "learning_rate": 0.00041604749475810586, "loss": 58.8132, "step": 5153 }, { "epoch": 13.612413337735227, "grad_norm": 1550.937744140625, "learning_rate": 0.0004160162618412016, "loss": 52.3676, "step": 5154 }, { "epoch": 13.615054473423573, "grad_norm": 235.31834411621094, "learning_rate": 0.0004159850242883908, "loss": 37.7885, "step": 5155 }, { "epoch": 13.617695609111918, "grad_norm": 374.97674560546875, "learning_rate": 0.00041595378210054567, "loss": 39.0244, "step": 5156 }, { "epoch": 13.620336744800264, "grad_norm": 310.22332763671875, "learning_rate": 0.00041592253527853866, "loss": 36.9876, "step": 5157 }, { "epoch": 13.62297788048861, "grad_norm": 333.34759521484375, "learning_rate": 0.00041589128382324235, "loss": 37.7212, "step": 5158 }, { "epoch": 13.625619016176955, "grad_norm": 366.1679382324219, "learning_rate": 0.00041586002773552936, "loss": 38.9903, "step": 5159 }, { "epoch": 13.628260151865302, "grad_norm": 477.41375732421875, "learning_rate": 0.0004158287670162725, "loss": 38.3106, "step": 5160 }, { "epoch": 13.630901287553648, "grad_norm": 415.79290771484375, "learning_rate": 0.0004157975016663448, "loss": 40.1579, "step": 5161 }, { "epoch": 13.633542423241995, "grad_norm": 494.23907470703125, "learning_rate": 0.00041576623168661925, "loss": 39.1381, "step": 5162 }, { "epoch": 13.63618355893034, "grad_norm": 598.8728637695312, "learning_rate": 0.00041573495707796903, "loss": 37.3132, "step": 5163 }, { "epoch": 13.638824694618686, "grad_norm": 580.1464233398438, "learning_rate": 0.0004157036778412675, "loss": 36.948, "step": 5164 }, { "epoch": 13.641465830307032, "grad_norm": 388.0495910644531, "learning_rate": 0.0004156723939773881, "loss": 37.4974, "step": 5165 }, { "epoch": 13.644106965995379, "grad_norm": 1794.9534912109375, "learning_rate": 0.0004156411054872045, "loss": 38.3508, "step": 5166 }, { "epoch": 13.646748101683723, "grad_norm": 270.4270935058594, "learning_rate": 0.0004156098123715902, "loss": 36.7325, "step": 5167 }, { "epoch": 13.64938923737207, "grad_norm": 292.80657958984375, "learning_rate": 0.0004155785146314193, "loss": 36.7082, "step": 5168 }, { "epoch": 13.652030373060416, "grad_norm": 718.6485595703125, "learning_rate": 0.00041554721226756556, "loss": 36.1588, "step": 5169 }, { "epoch": 13.654671508748763, "grad_norm": 659.6080932617188, "learning_rate": 0.0004155159052809031, "loss": 38.7426, "step": 5170 }, { "epoch": 13.657312644437107, "grad_norm": 432.4489440917969, "learning_rate": 0.0004154845936723063, "loss": 40.5345, "step": 5171 }, { "epoch": 13.659953780125454, "grad_norm": 608.739501953125, "learning_rate": 0.0004154532774426495, "loss": 42.6133, "step": 5172 }, { "epoch": 13.6625949158138, "grad_norm": 354.82623291015625, "learning_rate": 0.000415421956592807, "loss": 40.9246, "step": 5173 }, { "epoch": 13.665236051502147, "grad_norm": 234.17808532714844, "learning_rate": 0.00041539063112365363, "loss": 40.5293, "step": 5174 }, { "epoch": 13.667877187190491, "grad_norm": 145.8466339111328, "learning_rate": 0.000415359301036064, "loss": 41.7179, "step": 5175 }, { "epoch": 13.670518322878838, "grad_norm": 181.68588256835938, "learning_rate": 0.00041532796633091297, "loss": 43.2086, "step": 5176 }, { "epoch": 13.673159458567184, "grad_norm": 229.1260986328125, "learning_rate": 0.0004152966270090757, "loss": 41.9507, "step": 5177 }, { "epoch": 13.67580059425553, "grad_norm": 292.4814758300781, "learning_rate": 0.00041526528307142714, "loss": 44.0817, "step": 5178 }, { "epoch": 13.678441729943875, "grad_norm": 274.6791687011719, "learning_rate": 0.00041523393451884267, "loss": 42.9275, "step": 5179 }, { "epoch": 13.681082865632222, "grad_norm": 199.7200927734375, "learning_rate": 0.00041520258135219765, "loss": 43.5571, "step": 5180 }, { "epoch": 13.683724001320568, "grad_norm": 245.7757110595703, "learning_rate": 0.0004151712235723676, "loss": 42.4882, "step": 5181 }, { "epoch": 13.686365137008913, "grad_norm": 126.67755126953125, "learning_rate": 0.0004151398611802281, "loss": 39.4943, "step": 5182 }, { "epoch": 13.68900627269726, "grad_norm": 192.453125, "learning_rate": 0.00041510849417665497, "loss": 41.0163, "step": 5183 }, { "epoch": 13.691647408385606, "grad_norm": 241.00888061523438, "learning_rate": 0.0004150771225625243, "loss": 40.7496, "step": 5184 }, { "epoch": 13.694288544073952, "grad_norm": 188.92381286621094, "learning_rate": 0.0004150457463387117, "loss": 39.4454, "step": 5185 }, { "epoch": 13.696929679762297, "grad_norm": 310.45184326171875, "learning_rate": 0.0004150143655060938, "loss": 38.1358, "step": 5186 }, { "epoch": 13.699570815450643, "grad_norm": 195.31655883789062, "learning_rate": 0.00041498298006554656, "loss": 38.8749, "step": 5187 }, { "epoch": 13.70221195113899, "grad_norm": 152.59535217285156, "learning_rate": 0.0004149515900179466, "loss": 38.8284, "step": 5188 }, { "epoch": 13.704853086827336, "grad_norm": 197.23397827148438, "learning_rate": 0.00041492019536417036, "loss": 38.2836, "step": 5189 }, { "epoch": 13.707494222515681, "grad_norm": 362.2752380371094, "learning_rate": 0.0004148887961050945, "loss": 35.9797, "step": 5190 }, { "epoch": 13.710135358204028, "grad_norm": 314.5331726074219, "learning_rate": 0.0004148573922415959, "loss": 36.1399, "step": 5191 }, { "epoch": 13.712776493892374, "grad_norm": 2971.9228515625, "learning_rate": 0.0004148259837745515, "loss": 38.8897, "step": 5192 }, { "epoch": 13.71541762958072, "grad_norm": 503.2937927246094, "learning_rate": 0.00041479457070483827, "loss": 37.8057, "step": 5193 }, { "epoch": 13.718058765269065, "grad_norm": 419.8975830078125, "learning_rate": 0.00041476315303333347, "loss": 39.9292, "step": 5194 }, { "epoch": 13.720699900957412, "grad_norm": 1369.98974609375, "learning_rate": 0.0004147317307609144, "loss": 54.3998, "step": 5195 }, { "epoch": 13.723341036645758, "grad_norm": 2882.194580078125, "learning_rate": 0.00041470030388845856, "loss": 67.2982, "step": 5196 }, { "epoch": 13.725982172334104, "grad_norm": 1798.4503173828125, "learning_rate": 0.00041466887241684345, "loss": 53.2886, "step": 5197 }, { "epoch": 13.72862330802245, "grad_norm": 1144.250732421875, "learning_rate": 0.00041463743634694684, "loss": 55.3233, "step": 5198 }, { "epoch": 13.731264443710796, "grad_norm": 5490.70703125, "learning_rate": 0.0004146059956796465, "loss": 44.8834, "step": 5199 }, { "epoch": 13.733905579399142, "grad_norm": 1414.46484375, "learning_rate": 0.0004145745504158204, "loss": 39.6311, "step": 5200 }, { "epoch": 13.733905579399142, "eval_loss": 4.498843669891357, "eval_runtime": 2.1442, "eval_samples_per_second": 230.859, "eval_steps_per_second": 28.916, "step": 5200 }, { "epoch": 13.736546715087488, "grad_norm": 1761.677490234375, "learning_rate": 0.0004145431005563467, "loss": 32.7461, "step": 5201 }, { "epoch": 13.739187850775833, "grad_norm": 567.9356689453125, "learning_rate": 0.00041451164610210357, "loss": 31.7929, "step": 5202 }, { "epoch": 13.74182898646418, "grad_norm": 1631.1434326171875, "learning_rate": 0.0004144801870539694, "loss": 26.3142, "step": 5203 }, { "epoch": 13.744470122152526, "grad_norm": 2120.47216796875, "learning_rate": 0.0004144487234128226, "loss": 29.4644, "step": 5204 }, { "epoch": 13.74711125784087, "grad_norm": 911.6950073242188, "learning_rate": 0.00041441725517954176, "loss": 54.5753, "step": 5205 }, { "epoch": 13.749752393529217, "grad_norm": 953.4873657226562, "learning_rate": 0.0004143857823550057, "loss": 54.9913, "step": 5206 }, { "epoch": 13.752393529217564, "grad_norm": 825.7830200195312, "learning_rate": 0.00041435430494009317, "loss": 51.8302, "step": 5207 }, { "epoch": 13.75503466490591, "grad_norm": 647.6296997070312, "learning_rate": 0.00041432282293568326, "loss": 45.4142, "step": 5208 }, { "epoch": 13.757675800594255, "grad_norm": 280.5292663574219, "learning_rate": 0.000414291336342655, "loss": 40.1831, "step": 5209 }, { "epoch": 13.760316936282601, "grad_norm": 334.3447265625, "learning_rate": 0.0004142598451618878, "loss": 38.5046, "step": 5210 }, { "epoch": 13.762958071970948, "grad_norm": 372.3764343261719, "learning_rate": 0.0004142283493942608, "loss": 40.2129, "step": 5211 }, { "epoch": 13.765599207659294, "grad_norm": 210.947265625, "learning_rate": 0.00041419684904065373, "loss": 38.4634, "step": 5212 }, { "epoch": 13.768240343347639, "grad_norm": 344.19085693359375, "learning_rate": 0.0004141653441019461, "loss": 38.3125, "step": 5213 }, { "epoch": 13.770881479035985, "grad_norm": 248.71347045898438, "learning_rate": 0.00041413383457901756, "loss": 39.18, "step": 5214 }, { "epoch": 13.773522614724332, "grad_norm": 277.8101806640625, "learning_rate": 0.0004141023204727481, "loss": 40.1869, "step": 5215 }, { "epoch": 13.776163750412678, "grad_norm": 361.53143310546875, "learning_rate": 0.00041407080178401787, "loss": 38.0893, "step": 5216 }, { "epoch": 13.778804886101023, "grad_norm": 181.1862335205078, "learning_rate": 0.00041403927851370683, "loss": 37.4799, "step": 5217 }, { "epoch": 13.78144602178937, "grad_norm": 225.2546844482422, "learning_rate": 0.00041400775066269523, "loss": 37.9008, "step": 5218 }, { "epoch": 13.784087157477716, "grad_norm": 188.04698181152344, "learning_rate": 0.0004139762182318636, "loss": 37.4472, "step": 5219 }, { "epoch": 13.786728293166062, "grad_norm": 301.5449523925781, "learning_rate": 0.0004139446812220924, "loss": 39.9685, "step": 5220 }, { "epoch": 13.789369428854407, "grad_norm": 243.21336364746094, "learning_rate": 0.00041391313963426223, "loss": 39.7646, "step": 5221 }, { "epoch": 13.792010564542753, "grad_norm": 660.5564575195312, "learning_rate": 0.00041388159346925405, "loss": 41.433, "step": 5222 }, { "epoch": 13.7946517002311, "grad_norm": 147.83978271484375, "learning_rate": 0.00041385004272794846, "loss": 42.8997, "step": 5223 }, { "epoch": 13.797292835919446, "grad_norm": 195.7646026611328, "learning_rate": 0.00041381848741122687, "loss": 41.0831, "step": 5224 }, { "epoch": 13.799933971607791, "grad_norm": 261.3430480957031, "learning_rate": 0.0004137869275199701, "loss": 42.8809, "step": 5225 }, { "epoch": 13.802575107296137, "grad_norm": 92.18681335449219, "learning_rate": 0.0004137553630550597, "loss": 41.5123, "step": 5226 }, { "epoch": 13.805216242984484, "grad_norm": 139.94564819335938, "learning_rate": 0.00041372379401737687, "loss": 44.4206, "step": 5227 }, { "epoch": 13.807857378672828, "grad_norm": 93.04174041748047, "learning_rate": 0.0004136922204078033, "loss": 40.1862, "step": 5228 }, { "epoch": 13.810498514361175, "grad_norm": 137.6465606689453, "learning_rate": 0.00041366064222722067, "loss": 45.0001, "step": 5229 }, { "epoch": 13.813139650049521, "grad_norm": 114.91822814941406, "learning_rate": 0.00041362905947651073, "loss": 41.0601, "step": 5230 }, { "epoch": 13.815780785737868, "grad_norm": 94.23973846435547, "learning_rate": 0.0004135974721565555, "loss": 42.4902, "step": 5231 }, { "epoch": 13.818421921426213, "grad_norm": 253.13134765625, "learning_rate": 0.00041356588026823684, "loss": 41.8807, "step": 5232 }, { "epoch": 13.821063057114559, "grad_norm": 125.44652557373047, "learning_rate": 0.0004135342838124371, "loss": 41.274, "step": 5233 }, { "epoch": 13.823704192802905, "grad_norm": 197.4928436279297, "learning_rate": 0.00041350268279003854, "loss": 40.7413, "step": 5234 }, { "epoch": 13.826345328491252, "grad_norm": 141.0206756591797, "learning_rate": 0.00041347107720192367, "loss": 38.9634, "step": 5235 }, { "epoch": 13.828986464179597, "grad_norm": 116.67232513427734, "learning_rate": 0.00041343946704897494, "loss": 37.9164, "step": 5236 }, { "epoch": 13.831627599867943, "grad_norm": 140.2429656982422, "learning_rate": 0.0004134078523320751, "loss": 38.0473, "step": 5237 }, { "epoch": 13.83426873555629, "grad_norm": 121.57496643066406, "learning_rate": 0.00041337623305210704, "loss": 37.5365, "step": 5238 }, { "epoch": 13.836909871244636, "grad_norm": 166.10882568359375, "learning_rate": 0.0004133446092099536, "loss": 36.8451, "step": 5239 }, { "epoch": 13.83955100693298, "grad_norm": 160.8760223388672, "learning_rate": 0.0004133129808064979, "loss": 37.7064, "step": 5240 }, { "epoch": 13.842192142621327, "grad_norm": 121.96522521972656, "learning_rate": 0.0004132813478426232, "loss": 36.3604, "step": 5241 }, { "epoch": 13.844833278309673, "grad_norm": 129.89199829101562, "learning_rate": 0.00041324971031921266, "loss": 37.1274, "step": 5242 }, { "epoch": 13.84747441399802, "grad_norm": 217.18389892578125, "learning_rate": 0.0004132180682371499, "loss": 37.9729, "step": 5243 }, { "epoch": 13.850115549686365, "grad_norm": 874.4033813476562, "learning_rate": 0.00041318642159731843, "loss": 50.5001, "step": 5244 }, { "epoch": 13.852756685374711, "grad_norm": 2558.1484375, "learning_rate": 0.00041315477040060214, "loss": 99.9716, "step": 5245 }, { "epoch": 13.855397821063058, "grad_norm": 1777.4759521484375, "learning_rate": 0.00041312311464788466, "loss": 90.1396, "step": 5246 }, { "epoch": 13.858038956751404, "grad_norm": 2254.718505859375, "learning_rate": 0.00041309145434005004, "loss": 85.767, "step": 5247 }, { "epoch": 13.860680092439749, "grad_norm": 1150.394775390625, "learning_rate": 0.0004130597894779823, "loss": 72.3529, "step": 5248 }, { "epoch": 13.863321228128095, "grad_norm": 1682.15625, "learning_rate": 0.00041302812006256575, "loss": 74.0345, "step": 5249 }, { "epoch": 13.865962363816442, "grad_norm": 1339.21484375, "learning_rate": 0.0004129964460946847, "loss": 55.1124, "step": 5250 }, { "epoch": 13.868603499504786, "grad_norm": 4272.87060546875, "learning_rate": 0.0004129647675752237, "loss": 40.968, "step": 5251 }, { "epoch": 13.871244635193133, "grad_norm": 749.8217163085938, "learning_rate": 0.00041293308450506726, "loss": 36.7623, "step": 5252 }, { "epoch": 13.87388577088148, "grad_norm": 1246.4970703125, "learning_rate": 0.0004129013968851001, "loss": 20.7936, "step": 5253 }, { "epoch": 13.876526906569826, "grad_norm": 356.3553466796875, "learning_rate": 0.0004128697047162072, "loss": 38.4983, "step": 5254 }, { "epoch": 13.87916804225817, "grad_norm": 637.9224853515625, "learning_rate": 0.0004128380079992734, "loss": 43.1916, "step": 5255 }, { "epoch": 13.881809177946517, "grad_norm": 367.63604736328125, "learning_rate": 0.00041280630673518396, "loss": 41.4299, "step": 5256 }, { "epoch": 13.884450313634863, "grad_norm": 455.9267883300781, "learning_rate": 0.000412774600924824, "loss": 41.0892, "step": 5257 }, { "epoch": 13.88709144932321, "grad_norm": 1134.8184814453125, "learning_rate": 0.00041274289056907894, "loss": 39.9979, "step": 5258 }, { "epoch": 13.889732585011554, "grad_norm": 611.2168579101562, "learning_rate": 0.0004127111756688343, "loss": 40.2018, "step": 5259 }, { "epoch": 13.8923737206999, "grad_norm": 309.31390380859375, "learning_rate": 0.0004126794562249756, "loss": 38.0378, "step": 5260 }, { "epoch": 13.895014856388247, "grad_norm": 231.71405029296875, "learning_rate": 0.0004126477322383887, "loss": 37.4699, "step": 5261 }, { "epoch": 13.897655992076594, "grad_norm": 1306.7337646484375, "learning_rate": 0.00041261600370995935, "loss": 37.7858, "step": 5262 }, { "epoch": 13.900297127764938, "grad_norm": 295.3404541015625, "learning_rate": 0.0004125842706405736, "loss": 37.1973, "step": 5263 }, { "epoch": 13.902938263453285, "grad_norm": 175.7779083251953, "learning_rate": 0.00041255253303111773, "loss": 37.7659, "step": 5264 }, { "epoch": 13.905579399141631, "grad_norm": 334.8846435546875, "learning_rate": 0.0004125207908824777, "loss": 37.2829, "step": 5265 }, { "epoch": 13.908220534829978, "grad_norm": 313.432373046875, "learning_rate": 0.0004124890441955402, "loss": 38.8137, "step": 5266 }, { "epoch": 13.910861670518322, "grad_norm": 427.0295715332031, "learning_rate": 0.0004124572929711916, "loss": 38.2168, "step": 5267 }, { "epoch": 13.913502806206669, "grad_norm": 250.91024780273438, "learning_rate": 0.00041242553721031845, "loss": 37.1063, "step": 5268 }, { "epoch": 13.916143941895015, "grad_norm": 495.5728454589844, "learning_rate": 0.00041239377691380764, "loss": 37.9344, "step": 5269 }, { "epoch": 13.918785077583362, "grad_norm": 563.5223388671875, "learning_rate": 0.0004123620120825459, "loss": 37.4943, "step": 5270 }, { "epoch": 13.921426213271706, "grad_norm": 369.0342102050781, "learning_rate": 0.00041233024271742047, "loss": 39.0402, "step": 5271 }, { "epoch": 13.924067348960053, "grad_norm": 670.196044921875, "learning_rate": 0.00041229846881931834, "loss": 41.7646, "step": 5272 }, { "epoch": 13.9267084846484, "grad_norm": 771.355712890625, "learning_rate": 0.0004122666903891268, "loss": 50.4639, "step": 5273 }, { "epoch": 13.929349620336744, "grad_norm": 518.3516845703125, "learning_rate": 0.0004122349074277332, "loss": 48.5228, "step": 5274 }, { "epoch": 13.93199075602509, "grad_norm": 529.8887329101562, "learning_rate": 0.0004122031199360251, "loss": 46.3382, "step": 5275 }, { "epoch": 13.934631891713437, "grad_norm": 195.12379455566406, "learning_rate": 0.0004121713279148902, "loss": 48.3333, "step": 5276 }, { "epoch": 13.937273027401783, "grad_norm": 158.73846435546875, "learning_rate": 0.0004121395313652163, "loss": 41.5947, "step": 5277 }, { "epoch": 13.939914163090128, "grad_norm": 423.8229064941406, "learning_rate": 0.0004121077302878912, "loss": 40.2708, "step": 5278 }, { "epoch": 13.942555298778474, "grad_norm": 286.17022705078125, "learning_rate": 0.00041207592468380286, "loss": 39.6763, "step": 5279 }, { "epoch": 13.945196434466821, "grad_norm": 311.10565185546875, "learning_rate": 0.00041204411455383964, "loss": 39.8271, "step": 5280 }, { "epoch": 13.947837570155167, "grad_norm": 273.7900390625, "learning_rate": 0.00041201229989888964, "loss": 37.5167, "step": 5281 }, { "epoch": 13.950478705843512, "grad_norm": 239.42608642578125, "learning_rate": 0.0004119804807198414, "loss": 37.4152, "step": 5282 }, { "epoch": 13.953119841531858, "grad_norm": 416.0164794921875, "learning_rate": 0.00041194865701758333, "loss": 37.6417, "step": 5283 }, { "epoch": 13.955760977220205, "grad_norm": 443.259521484375, "learning_rate": 0.0004119168287930042, "loss": 37.7985, "step": 5284 }, { "epoch": 13.958402112908551, "grad_norm": 1333.9844970703125, "learning_rate": 0.0004118849960469927, "loss": 48.2809, "step": 5285 }, { "epoch": 13.961043248596896, "grad_norm": 1266.062744140625, "learning_rate": 0.00041185315878043786, "loss": 85.8697, "step": 5286 }, { "epoch": 13.963684384285243, "grad_norm": 5175.64697265625, "learning_rate": 0.00041182131699422854, "loss": 87.587, "step": 5287 }, { "epoch": 13.966325519973589, "grad_norm": 14570.7861328125, "learning_rate": 0.00041178947068925404, "loss": 88.0222, "step": 5288 }, { "epoch": 13.968966655661935, "grad_norm": 3749.7236328125, "learning_rate": 0.00041175761986640366, "loss": 81.7458, "step": 5289 }, { "epoch": 13.97160779135028, "grad_norm": 1714.423095703125, "learning_rate": 0.00041172576452656667, "loss": 80.6471, "step": 5290 }, { "epoch": 13.974248927038627, "grad_norm": 267.4094543457031, "learning_rate": 0.0004116939046706327, "loss": 38.0933, "step": 5291 }, { "epoch": 13.976890062726973, "grad_norm": 223.06776428222656, "learning_rate": 0.0004116620402994916, "loss": 36.911, "step": 5292 }, { "epoch": 13.97953119841532, "grad_norm": 795.3379516601562, "learning_rate": 0.00041163017141403283, "loss": 36.9907, "step": 5293 }, { "epoch": 13.982172334103664, "grad_norm": 504.02911376953125, "learning_rate": 0.0004115982980151465, "loss": 39.2835, "step": 5294 }, { "epoch": 13.98481346979201, "grad_norm": 607.999755859375, "learning_rate": 0.0004115664201037227, "loss": 36.7093, "step": 5295 }, { "epoch": 13.987454605480357, "grad_norm": 397.3961486816406, "learning_rate": 0.00041153453768065143, "loss": 37.2711, "step": 5296 }, { "epoch": 13.990095741168702, "grad_norm": 236.6573028564453, "learning_rate": 0.00041150265074682313, "loss": 37.2973, "step": 5297 }, { "epoch": 13.992736876857048, "grad_norm": 854.3328247070312, "learning_rate": 0.0004114707593031282, "loss": 36.8573, "step": 5298 }, { "epoch": 13.995378012545395, "grad_norm": 401.05206298828125, "learning_rate": 0.0004114388633504571, "loss": 38.1856, "step": 5299 }, { "epoch": 13.998019148233741, "grad_norm": 765.9391479492188, "learning_rate": 0.0004114069628897006, "loss": 39.8742, "step": 5300 }, { "epoch": 14.000660283922086, "grad_norm": 590.51708984375, "learning_rate": 0.0004113750579217495, "loss": 42.8782, "step": 5301 }, { "epoch": 14.003301419610432, "grad_norm": 375.422119140625, "learning_rate": 0.0004113431484474947, "loss": 41.2784, "step": 5302 }, { "epoch": 14.005942555298779, "grad_norm": 147.830322265625, "learning_rate": 0.00041131123446782724, "loss": 39.6468, "step": 5303 }, { "epoch": 14.008583690987125, "grad_norm": 127.54167938232422, "learning_rate": 0.00041127931598363836, "loss": 40.648, "step": 5304 }, { "epoch": 14.01122482667547, "grad_norm": 251.79428100585938, "learning_rate": 0.00041124739299581927, "loss": 43.0658, "step": 5305 }, { "epoch": 14.013865962363816, "grad_norm": 162.0753173828125, "learning_rate": 0.0004112154655052615, "loss": 43.9729, "step": 5306 }, { "epoch": 14.016507098052163, "grad_norm": 169.95335388183594, "learning_rate": 0.00041118353351285653, "loss": 44.9452, "step": 5307 }, { "epoch": 14.01914823374051, "grad_norm": 247.0986328125, "learning_rate": 0.00041115159701949613, "loss": 42.8768, "step": 5308 }, { "epoch": 14.021789369428854, "grad_norm": 301.1118469238281, "learning_rate": 0.00041111965602607194, "loss": 43.4517, "step": 5309 }, { "epoch": 14.0244305051172, "grad_norm": 381.79583740234375, "learning_rate": 0.00041108771053347605, "loss": 39.2092, "step": 5310 }, { "epoch": 14.027071640805547, "grad_norm": 219.7157440185547, "learning_rate": 0.00041105576054260054, "loss": 38.8662, "step": 5311 }, { "epoch": 14.029712776493893, "grad_norm": 350.12933349609375, "learning_rate": 0.0004110238060543374, "loss": 40.8051, "step": 5312 }, { "epoch": 14.032353912182238, "grad_norm": 191.58859252929688, "learning_rate": 0.0004109918470695792, "loss": 39.3102, "step": 5313 }, { "epoch": 14.034995047870584, "grad_norm": 426.01593017578125, "learning_rate": 0.0004109598835892181, "loss": 40.5957, "step": 5314 }, { "epoch": 14.03763618355893, "grad_norm": 515.9833984375, "learning_rate": 0.00041092791561414695, "loss": 38.9683, "step": 5315 }, { "epoch": 14.040277319247277, "grad_norm": 242.0401153564453, "learning_rate": 0.0004108959431452581, "loss": 36.8087, "step": 5316 }, { "epoch": 14.042918454935622, "grad_norm": 239.73867797851562, "learning_rate": 0.00041086396618344475, "loss": 38.1469, "step": 5317 }, { "epoch": 14.045559590623968, "grad_norm": 283.8434143066406, "learning_rate": 0.0004108319847295996, "loss": 38.3935, "step": 5318 }, { "epoch": 14.048200726312315, "grad_norm": 199.19129943847656, "learning_rate": 0.0004107999987846157, "loss": 36.7394, "step": 5319 }, { "epoch": 14.05084186200066, "grad_norm": 200.26019287109375, "learning_rate": 0.00041076800834938634, "loss": 36.9551, "step": 5320 }, { "epoch": 14.053482997689006, "grad_norm": 236.04347229003906, "learning_rate": 0.0004107360134248047, "loss": 37.2532, "step": 5321 }, { "epoch": 14.056124133377352, "grad_norm": 214.05401611328125, "learning_rate": 0.00041070401401176437, "loss": 37.0965, "step": 5322 }, { "epoch": 14.058765269065699, "grad_norm": 753.8381958007812, "learning_rate": 0.0004106720101111588, "loss": 43.4924, "step": 5323 }, { "epoch": 14.061406404754043, "grad_norm": 2212.33349609375, "learning_rate": 0.0004106400017238818, "loss": 51.1239, "step": 5324 }, { "epoch": 14.06404754044239, "grad_norm": 1613.0780029296875, "learning_rate": 0.00041060798885082713, "loss": 50.3344, "step": 5325 }, { "epoch": 14.066688676130736, "grad_norm": 2541.373046875, "learning_rate": 0.0004105759714928887, "loss": 47.0371, "step": 5326 }, { "epoch": 14.069329811819083, "grad_norm": 2408.1826171875, "learning_rate": 0.00041054394965096055, "loss": 43.4201, "step": 5327 }, { "epoch": 14.071970947507427, "grad_norm": 3042.169921875, "learning_rate": 0.0004105119233259369, "loss": 40.5952, "step": 5328 }, { "epoch": 14.074612083195774, "grad_norm": 1150.6597900390625, "learning_rate": 0.0004104798925187122, "loss": 31.1811, "step": 5329 }, { "epoch": 14.07725321888412, "grad_norm": 2123.404296875, "learning_rate": 0.00041044785723018065, "loss": 29.6711, "step": 5330 }, { "epoch": 14.079894354572467, "grad_norm": 6436.55615234375, "learning_rate": 0.00041041581746123696, "loss": 24.3655, "step": 5331 }, { "epoch": 14.082535490260812, "grad_norm": 1453.679443359375, "learning_rate": 0.0004103837732127759, "loss": 27.2049, "step": 5332 }, { "epoch": 14.085176625949158, "grad_norm": 1225.2808837890625, "learning_rate": 0.000410351724485692, "loss": 27.5236, "step": 5333 }, { "epoch": 14.087817761637504, "grad_norm": 197.36431884765625, "learning_rate": 0.00041031967128088057, "loss": 38.272, "step": 5334 }, { "epoch": 14.09045889732585, "grad_norm": 286.851318359375, "learning_rate": 0.0004102876135992364, "loss": 38.4902, "step": 5335 }, { "epoch": 14.093100033014196, "grad_norm": 414.3047180175781, "learning_rate": 0.0004102555514416548, "loss": 37.9484, "step": 5336 }, { "epoch": 14.095741168702542, "grad_norm": 480.501953125, "learning_rate": 0.00041022348480903106, "loss": 38.4332, "step": 5337 }, { "epoch": 14.098382304390888, "grad_norm": 403.657470703125, "learning_rate": 0.00041019141370226067, "loss": 36.6064, "step": 5338 }, { "epoch": 14.101023440079235, "grad_norm": 212.6046142578125, "learning_rate": 0.00041015933812223914, "loss": 37.8841, "step": 5339 }, { "epoch": 14.10366457576758, "grad_norm": 262.5412902832031, "learning_rate": 0.00041012725806986207, "loss": 38.6507, "step": 5340 }, { "epoch": 14.106305711455926, "grad_norm": 252.0068817138672, "learning_rate": 0.00041009517354602556, "loss": 36.485, "step": 5341 }, { "epoch": 14.108946847144272, "grad_norm": 1886.8544921875, "learning_rate": 0.0004100630845516252, "loss": 37.0206, "step": 5342 }, { "epoch": 14.111587982832617, "grad_norm": 510.1051330566406, "learning_rate": 0.0004100309910875574, "loss": 37.479, "step": 5343 }, { "epoch": 14.114229118520964, "grad_norm": 281.6647033691406, "learning_rate": 0.00040999889315471806, "loss": 38.4837, "step": 5344 }, { "epoch": 14.11687025420931, "grad_norm": 466.8079833984375, "learning_rate": 0.0004099667907540036, "loss": 37.1102, "step": 5345 }, { "epoch": 14.119511389897657, "grad_norm": 322.5397644042969, "learning_rate": 0.00040993468388631063, "loss": 36.0698, "step": 5346 }, { "epoch": 14.122152525586001, "grad_norm": 564.9363403320312, "learning_rate": 0.0004099025725525354, "loss": 36.7887, "step": 5347 }, { "epoch": 14.124793661274348, "grad_norm": 267.0990905761719, "learning_rate": 0.0004098704567535748, "loss": 35.946, "step": 5348 }, { "epoch": 14.127434796962694, "grad_norm": 613.6519165039062, "learning_rate": 0.00040983833649032566, "loss": 38.4305, "step": 5349 }, { "epoch": 14.13007593265104, "grad_norm": 473.5823669433594, "learning_rate": 0.00040980621176368485, "loss": 40.3301, "step": 5350 }, { "epoch": 14.132717068339385, "grad_norm": 326.5218811035156, "learning_rate": 0.0004097740825745494, "loss": 43.8915, "step": 5351 }, { "epoch": 14.135358204027732, "grad_norm": 330.5227966308594, "learning_rate": 0.0004097419489238166, "loss": 40.4403, "step": 5352 }, { "epoch": 14.137999339716078, "grad_norm": 375.9547119140625, "learning_rate": 0.0004097098108123837, "loss": 45.2235, "step": 5353 }, { "epoch": 14.140640475404425, "grad_norm": 438.29345703125, "learning_rate": 0.0004096776682411482, "loss": 44.7729, "step": 5354 }, { "epoch": 14.14328161109277, "grad_norm": 230.0542755126953, "learning_rate": 0.00040964552121100755, "loss": 43.2152, "step": 5355 }, { "epoch": 14.145922746781116, "grad_norm": 195.1163330078125, "learning_rate": 0.00040961336972285956, "loss": 43.6367, "step": 5356 }, { "epoch": 14.148563882469462, "grad_norm": 282.2037658691406, "learning_rate": 0.00040958121377760186, "loss": 45.1268, "step": 5357 }, { "epoch": 14.151205018157809, "grad_norm": 216.84690856933594, "learning_rate": 0.0004095490533761326, "loss": 45.8793, "step": 5358 }, { "epoch": 14.153846153846153, "grad_norm": 285.7198181152344, "learning_rate": 0.00040951688851934974, "loss": 41.9546, "step": 5359 }, { "epoch": 14.1564872895345, "grad_norm": 285.395263671875, "learning_rate": 0.0004094847192081516, "loss": 40.3646, "step": 5360 }, { "epoch": 14.159128425222846, "grad_norm": 262.3214111328125, "learning_rate": 0.0004094525454434361, "loss": 40.777, "step": 5361 }, { "epoch": 14.161769560911193, "grad_norm": 148.86181640625, "learning_rate": 0.00040942036722610214, "loss": 42.3753, "step": 5362 }, { "epoch": 14.164410696599537, "grad_norm": 353.7479553222656, "learning_rate": 0.000409388184557048, "loss": 39.378, "step": 5363 }, { "epoch": 14.167051832287884, "grad_norm": 483.4755859375, "learning_rate": 0.00040935599743717243, "loss": 38.834, "step": 5364 }, { "epoch": 14.16969296797623, "grad_norm": 132.72560119628906, "learning_rate": 0.00040932380586737425, "loss": 37.5501, "step": 5365 }, { "epoch": 14.172334103664575, "grad_norm": 145.4012451171875, "learning_rate": 0.0004092916098485524, "loss": 37.1788, "step": 5366 }, { "epoch": 14.174975239352921, "grad_norm": 569.556640625, "learning_rate": 0.00040925940938160596, "loss": 37.9821, "step": 5367 }, { "epoch": 14.177616375041268, "grad_norm": 241.7764129638672, "learning_rate": 0.000409227204467434, "loss": 37.0475, "step": 5368 }, { "epoch": 14.180257510729614, "grad_norm": 131.27096557617188, "learning_rate": 0.0004091949951069359, "loss": 35.886, "step": 5369 }, { "epoch": 14.182898646417959, "grad_norm": 184.51951599121094, "learning_rate": 0.00040916278130101105, "loss": 37.1726, "step": 5370 }, { "epoch": 14.185539782106305, "grad_norm": 226.41867065429688, "learning_rate": 0.0004091305630505591, "loss": 36.8331, "step": 5371 }, { "epoch": 14.188180917794652, "grad_norm": 268.1504821777344, "learning_rate": 0.0004090983403564796, "loss": 37.118, "step": 5372 }, { "epoch": 14.190822053482998, "grad_norm": 1526.0106201171875, "learning_rate": 0.00040906611321967233, "loss": 57.2441, "step": 5373 }, { "epoch": 14.193463189171343, "grad_norm": 1917.0953369140625, "learning_rate": 0.0004090338816410375, "loss": 105.5043, "step": 5374 }, { "epoch": 14.19610432485969, "grad_norm": 3014.334716796875, "learning_rate": 0.0004090016456214748, "loss": 94.5447, "step": 5375 }, { "epoch": 14.198745460548036, "grad_norm": 5661.240234375, "learning_rate": 0.0004089694051618846, "loss": 87.1577, "step": 5376 }, { "epoch": 14.201386596236382, "grad_norm": 3097.32666015625, "learning_rate": 0.00040893716026316705, "loss": 75.3127, "step": 5377 }, { "epoch": 14.204027731924727, "grad_norm": 1480.759521484375, "learning_rate": 0.0004089049109262227, "loss": 80.2818, "step": 5378 }, { "epoch": 14.206668867613073, "grad_norm": 1813.372802734375, "learning_rate": 0.00040887265715195217, "loss": 85.3334, "step": 5379 }, { "epoch": 14.20931000330142, "grad_norm": 1233.2872314453125, "learning_rate": 0.0004088403989412559, "loss": 76.1388, "step": 5380 }, { "epoch": 14.211951138989766, "grad_norm": 4261.52099609375, "learning_rate": 0.00040880813629503486, "loss": 64.0426, "step": 5381 }, { "epoch": 14.214592274678111, "grad_norm": 2170.196044921875, "learning_rate": 0.00040877586921418983, "loss": 59.249, "step": 5382 }, { "epoch": 14.217233410366457, "grad_norm": 3862.18115234375, "learning_rate": 0.00040874359769962204, "loss": 45.1644, "step": 5383 }, { "epoch": 14.219874546054804, "grad_norm": 296.912353515625, "learning_rate": 0.00040871132175223247, "loss": 38.5831, "step": 5384 }, { "epoch": 14.22251568174315, "grad_norm": 314.7358703613281, "learning_rate": 0.00040867904137292246, "loss": 36.8587, "step": 5385 }, { "epoch": 14.225156817431495, "grad_norm": 486.7153015136719, "learning_rate": 0.0004086467565625934, "loss": 38.8094, "step": 5386 }, { "epoch": 14.227797953119842, "grad_norm": 225.62094116210938, "learning_rate": 0.0004086144673221469, "loss": 38.4256, "step": 5387 }, { "epoch": 14.230439088808188, "grad_norm": 327.06890869140625, "learning_rate": 0.00040858217365248464, "loss": 36.2925, "step": 5388 }, { "epoch": 14.233080224496533, "grad_norm": 353.007568359375, "learning_rate": 0.00040854987555450826, "loss": 37.8012, "step": 5389 }, { "epoch": 14.235721360184879, "grad_norm": 573.9970703125, "learning_rate": 0.0004085175730291197, "loss": 36.3122, "step": 5390 }, { "epoch": 14.238362495873226, "grad_norm": 586.9494018554688, "learning_rate": 0.0004084852660772211, "loss": 36.8878, "step": 5391 }, { "epoch": 14.241003631561572, "grad_norm": 493.1592712402344, "learning_rate": 0.00040845295469971453, "loss": 36.9333, "step": 5392 }, { "epoch": 14.243644767249917, "grad_norm": 680.9443969726562, "learning_rate": 0.00040842063889750223, "loss": 38.4272, "step": 5393 }, { "epoch": 14.246285902938263, "grad_norm": 375.2386169433594, "learning_rate": 0.00040838831867148675, "loss": 38.0713, "step": 5394 }, { "epoch": 14.24892703862661, "grad_norm": 350.706787109375, "learning_rate": 0.00040835599402257053, "loss": 37.1727, "step": 5395 }, { "epoch": 14.251568174314956, "grad_norm": 281.55914306640625, "learning_rate": 0.00040832366495165606, "loss": 36.7775, "step": 5396 }, { "epoch": 14.2542093100033, "grad_norm": 553.5082397460938, "learning_rate": 0.0004082913314596464, "loss": 36.6415, "step": 5397 }, { "epoch": 14.256850445691647, "grad_norm": 399.8324890136719, "learning_rate": 0.0004082589935474442, "loss": 38.1728, "step": 5398 }, { "epoch": 14.259491581379994, "grad_norm": 481.8021545410156, "learning_rate": 0.00040822665121595256, "loss": 37.1604, "step": 5399 }, { "epoch": 14.26213271706834, "grad_norm": 1676.79296875, "learning_rate": 0.00040819430446607465, "loss": 38.6004, "step": 5400 }, { "epoch": 14.26213271706834, "eval_loss": 4.344143390655518, "eval_runtime": 2.0591, "eval_samples_per_second": 240.394, "eval_steps_per_second": 30.11, "step": 5400 }, { "epoch": 14.264773852756685, "grad_norm": 933.5306396484375, "learning_rate": 0.00040816195329871377, "loss": 41.8461, "step": 5401 }, { "epoch": 14.267414988445031, "grad_norm": 370.52545166015625, "learning_rate": 0.0004081295977147732, "loss": 42.0453, "step": 5402 }, { "epoch": 14.270056124133378, "grad_norm": 322.19488525390625, "learning_rate": 0.0004080972377151565, "loss": 41.6639, "step": 5403 }, { "epoch": 14.272697259821724, "grad_norm": 319.7577819824219, "learning_rate": 0.00040806487330076734, "loss": 40.7694, "step": 5404 }, { "epoch": 14.275338395510069, "grad_norm": 196.4357147216797, "learning_rate": 0.0004080325044725094, "loss": 43.9976, "step": 5405 }, { "epoch": 14.277979531198415, "grad_norm": 236.992919921875, "learning_rate": 0.00040800013123128663, "loss": 42.056, "step": 5406 }, { "epoch": 14.280620666886762, "grad_norm": 196.62413024902344, "learning_rate": 0.000407967753578003, "loss": 40.0326, "step": 5407 }, { "epoch": 14.283261802575108, "grad_norm": 269.7807312011719, "learning_rate": 0.00040793537151356265, "loss": 42.0446, "step": 5408 }, { "epoch": 14.285902938263453, "grad_norm": 217.9604034423828, "learning_rate": 0.0004079029850388699, "loss": 43.3459, "step": 5409 }, { "epoch": 14.2885440739518, "grad_norm": 158.87484741210938, "learning_rate": 0.00040787059415482897, "loss": 40.0521, "step": 5410 }, { "epoch": 14.291185209640146, "grad_norm": 193.80052185058594, "learning_rate": 0.00040783819886234445, "loss": 41.0713, "step": 5411 }, { "epoch": 14.29382634532849, "grad_norm": 189.45867919921875, "learning_rate": 0.00040780579916232097, "loss": 39.6086, "step": 5412 }, { "epoch": 14.296467481016837, "grad_norm": 231.9961395263672, "learning_rate": 0.0004077733950556632, "loss": 39.036, "step": 5413 }, { "epoch": 14.299108616705183, "grad_norm": 182.85618591308594, "learning_rate": 0.0004077409865432761, "loss": 39.1797, "step": 5414 }, { "epoch": 14.30174975239353, "grad_norm": 154.17138671875, "learning_rate": 0.0004077085736260646, "loss": 38.7663, "step": 5415 }, { "epoch": 14.304390888081874, "grad_norm": 310.9510803222656, "learning_rate": 0.00040767615630493383, "loss": 37.4069, "step": 5416 }, { "epoch": 14.30703202377022, "grad_norm": 239.0245819091797, "learning_rate": 0.00040764373458078895, "loss": 37.4161, "step": 5417 }, { "epoch": 14.309673159458567, "grad_norm": 124.25415802001953, "learning_rate": 0.0004076113084545355, "loss": 37.6213, "step": 5418 }, { "epoch": 14.312314295146914, "grad_norm": 175.32615661621094, "learning_rate": 0.00040757887792707877, "loss": 36.6935, "step": 5419 }, { "epoch": 14.314955430835258, "grad_norm": 170.5276336669922, "learning_rate": 0.0004075464429993244, "loss": 36.7229, "step": 5420 }, { "epoch": 14.317596566523605, "grad_norm": 132.1531982421875, "learning_rate": 0.00040751400367217827, "loss": 36.3486, "step": 5421 }, { "epoch": 14.320237702211951, "grad_norm": 281.71478271484375, "learning_rate": 0.00040748155994654606, "loss": 38.5615, "step": 5422 }, { "epoch": 14.322878837900298, "grad_norm": 990.8863525390625, "learning_rate": 0.00040744911182333384, "loss": 64.4165, "step": 5423 }, { "epoch": 14.325519973588642, "grad_norm": 1342.07568359375, "learning_rate": 0.00040741665930344763, "loss": 73.7479, "step": 5424 }, { "epoch": 14.328161109276989, "grad_norm": 1523.7213134765625, "learning_rate": 0.0004073842023877936, "loss": 66.5221, "step": 5425 }, { "epoch": 14.330802244965335, "grad_norm": 2227.386474609375, "learning_rate": 0.00040735174107727834, "loss": 68.1232, "step": 5426 }, { "epoch": 14.333443380653682, "grad_norm": 4635.31103515625, "learning_rate": 0.000407319275372808, "loss": 73.7191, "step": 5427 }, { "epoch": 14.336084516342027, "grad_norm": 2364.97998046875, "learning_rate": 0.0004072868052752894, "loss": 57.2811, "step": 5428 }, { "epoch": 14.338725652030373, "grad_norm": 1962.872802734375, "learning_rate": 0.0004072543307856291, "loss": 52.6058, "step": 5429 }, { "epoch": 14.34136678771872, "grad_norm": 1327.8436279296875, "learning_rate": 0.000407221851904734, "loss": 42.9204, "step": 5430 }, { "epoch": 14.344007923407066, "grad_norm": 1345.33056640625, "learning_rate": 0.0004071893686335111, "loss": 36.2001, "step": 5431 }, { "epoch": 14.34664905909541, "grad_norm": 4591.533203125, "learning_rate": 0.00040715688097286735, "loss": 34.5221, "step": 5432 }, { "epoch": 14.349290194783757, "grad_norm": 303.1421813964844, "learning_rate": 0.00040712438892371005, "loss": 40.3827, "step": 5433 }, { "epoch": 14.351931330472103, "grad_norm": 277.1851501464844, "learning_rate": 0.00040709189248694643, "loss": 39.2065, "step": 5434 }, { "epoch": 14.354572466160448, "grad_norm": 794.6296997070312, "learning_rate": 0.0004070593916634841, "loss": 39.7212, "step": 5435 }, { "epoch": 14.357213601848795, "grad_norm": 253.6390838623047, "learning_rate": 0.00040702688645423044, "loss": 37.625, "step": 5436 }, { "epoch": 14.359854737537141, "grad_norm": 271.0153503417969, "learning_rate": 0.0004069943768600932, "loss": 38.1903, "step": 5437 }, { "epoch": 14.362495873225487, "grad_norm": 620.0083618164062, "learning_rate": 0.0004069618628819803, "loss": 38.3405, "step": 5438 }, { "epoch": 14.365137008913832, "grad_norm": 221.63552856445312, "learning_rate": 0.0004069293445207995, "loss": 38.1023, "step": 5439 }, { "epoch": 14.367778144602179, "grad_norm": 224.48709106445312, "learning_rate": 0.00040689682177745903, "loss": 37.6427, "step": 5440 }, { "epoch": 14.370419280290525, "grad_norm": 496.7649841308594, "learning_rate": 0.0004068642946528669, "loss": 38.4515, "step": 5441 }, { "epoch": 14.373060415978872, "grad_norm": 304.21063232421875, "learning_rate": 0.0004068317631479315, "loss": 36.8763, "step": 5442 }, { "epoch": 14.375701551667216, "grad_norm": 327.69384765625, "learning_rate": 0.0004067992272635612, "loss": 38.1636, "step": 5443 }, { "epoch": 14.378342687355563, "grad_norm": 413.2613525390625, "learning_rate": 0.0004067666870006647, "loss": 37.9732, "step": 5444 }, { "epoch": 14.380983823043909, "grad_norm": 377.1940612792969, "learning_rate": 0.0004067341423601505, "loss": 37.7668, "step": 5445 }, { "epoch": 14.383624958732256, "grad_norm": 362.693603515625, "learning_rate": 0.0004067015933429274, "loss": 37.7764, "step": 5446 }, { "epoch": 14.3862660944206, "grad_norm": 914.44970703125, "learning_rate": 0.00040666903994990446, "loss": 36.125, "step": 5447 }, { "epoch": 14.388907230108947, "grad_norm": 231.04232788085938, "learning_rate": 0.00040663648218199045, "loss": 37.0113, "step": 5448 }, { "epoch": 14.391548365797293, "grad_norm": 319.0538635253906, "learning_rate": 0.00040660392004009485, "loss": 40.5469, "step": 5449 }, { "epoch": 14.39418950148564, "grad_norm": 669.2515869140625, "learning_rate": 0.0004065713535251267, "loss": 42.1678, "step": 5450 }, { "epoch": 14.396830637173984, "grad_norm": 436.3216552734375, "learning_rate": 0.0004065387826379955, "loss": 41.0653, "step": 5451 }, { "epoch": 14.39947177286233, "grad_norm": 126.83684539794922, "learning_rate": 0.00040650620737961077, "loss": 41.5877, "step": 5452 }, { "epoch": 14.402112908550677, "grad_norm": 184.64537048339844, "learning_rate": 0.00040647362775088206, "loss": 41.6875, "step": 5453 }, { "epoch": 14.404754044239024, "grad_norm": 184.33387756347656, "learning_rate": 0.0004064410437527193, "loss": 41.7845, "step": 5454 }, { "epoch": 14.407395179927368, "grad_norm": 198.25099182128906, "learning_rate": 0.0004064084553860322, "loss": 41.2805, "step": 5455 }, { "epoch": 14.410036315615715, "grad_norm": 216.8796844482422, "learning_rate": 0.0004063758626517309, "loss": 45.1153, "step": 5456 }, { "epoch": 14.412677451304061, "grad_norm": 188.9040069580078, "learning_rate": 0.00040634326555072554, "loss": 42.1914, "step": 5457 }, { "epoch": 14.415318586992406, "grad_norm": 193.8314971923828, "learning_rate": 0.00040631066408392636, "loss": 43.5244, "step": 5458 }, { "epoch": 14.417959722680752, "grad_norm": 264.3770446777344, "learning_rate": 0.0004062780582522436, "loss": 42.0858, "step": 5459 }, { "epoch": 14.420600858369099, "grad_norm": 219.64016723632812, "learning_rate": 0.00040624544805658794, "loss": 45.8637, "step": 5460 }, { "epoch": 14.423241994057445, "grad_norm": 223.3534393310547, "learning_rate": 0.0004062128334978699, "loss": 39.34, "step": 5461 }, { "epoch": 14.42588312974579, "grad_norm": 233.30490112304688, "learning_rate": 0.00040618021457700023, "loss": 39.6483, "step": 5462 }, { "epoch": 14.428524265434136, "grad_norm": 160.26910400390625, "learning_rate": 0.00040614759129488983, "loss": 39.7824, "step": 5463 }, { "epoch": 14.431165401122483, "grad_norm": 235.05271911621094, "learning_rate": 0.0004061149636524497, "loss": 38.0008, "step": 5464 }, { "epoch": 14.43380653681083, "grad_norm": 239.95999145507812, "learning_rate": 0.0004060823316505908, "loss": 37.6666, "step": 5465 }, { "epoch": 14.436447672499174, "grad_norm": 242.54673767089844, "learning_rate": 0.0004060496952902246, "loss": 36.7322, "step": 5466 }, { "epoch": 14.43908880818752, "grad_norm": 259.33294677734375, "learning_rate": 0.00040601705457226234, "loss": 36.8111, "step": 5467 }, { "epoch": 14.441729943875867, "grad_norm": 191.067138671875, "learning_rate": 0.0004059844094976154, "loss": 37.0815, "step": 5468 }, { "epoch": 14.444371079564213, "grad_norm": 196.2432098388672, "learning_rate": 0.0004059517600671955, "loss": 37.3475, "step": 5469 }, { "epoch": 14.447012215252558, "grad_norm": 142.41183471679688, "learning_rate": 0.0004059191062819143, "loss": 37.915, "step": 5470 }, { "epoch": 14.449653350940904, "grad_norm": 172.3330078125, "learning_rate": 0.0004058864481426836, "loss": 36.9468, "step": 5471 }, { "epoch": 14.45229448662925, "grad_norm": 210.68450927734375, "learning_rate": 0.0004058537856504155, "loss": 37.1183, "step": 5472 }, { "epoch": 14.454935622317597, "grad_norm": 472.6793518066406, "learning_rate": 0.0004058211188060219, "loss": 47.5684, "step": 5473 }, { "epoch": 14.457576758005942, "grad_norm": 1866.3182373046875, "learning_rate": 0.0004057884476104152, "loss": 101.1597, "step": 5474 }, { "epoch": 14.460217893694288, "grad_norm": 1143.675048828125, "learning_rate": 0.00040575577206450753, "loss": 89.4627, "step": 5475 }, { "epoch": 14.462859029382635, "grad_norm": 1717.03125, "learning_rate": 0.0004057230921692114, "loss": 85.0148, "step": 5476 }, { "epoch": 14.465500165070981, "grad_norm": 2051.470458984375, "learning_rate": 0.0004056904079254395, "loss": 76.0583, "step": 5477 }, { "epoch": 14.468141300759326, "grad_norm": 1350.31396484375, "learning_rate": 0.00040565771933410435, "loss": 84.291, "step": 5478 }, { "epoch": 14.470782436447672, "grad_norm": 2437.36083984375, "learning_rate": 0.00040562502639611886, "loss": 63.9311, "step": 5479 }, { "epoch": 14.473423572136019, "grad_norm": 998.283935546875, "learning_rate": 0.00040559232911239584, "loss": 61.7499, "step": 5480 }, { "epoch": 14.476064707824364, "grad_norm": 2194.90234375, "learning_rate": 0.0004055596274838485, "loss": 52.4916, "step": 5481 }, { "epoch": 14.47870584351271, "grad_norm": 2452.331787109375, "learning_rate": 0.00040552692151138995, "loss": 39.6586, "step": 5482 }, { "epoch": 14.481346979201057, "grad_norm": 1518.44970703125, "learning_rate": 0.0004054942111959334, "loss": 29.6802, "step": 5483 }, { "epoch": 14.483988114889403, "grad_norm": 288.4370422363281, "learning_rate": 0.00040546149653839246, "loss": 39.4546, "step": 5484 }, { "epoch": 14.486629250577748, "grad_norm": 365.3255920410156, "learning_rate": 0.00040542877753968053, "loss": 39.1829, "step": 5485 }, { "epoch": 14.489270386266094, "grad_norm": 279.85137939453125, "learning_rate": 0.0004053960542007112, "loss": 39.0306, "step": 5486 }, { "epoch": 14.49191152195444, "grad_norm": 342.680908203125, "learning_rate": 0.0004053633265223985, "loss": 39.4261, "step": 5487 }, { "epoch": 14.494552657642787, "grad_norm": 386.21026611328125, "learning_rate": 0.000405330594505656, "loss": 38.3573, "step": 5488 }, { "epoch": 14.497193793331132, "grad_norm": 379.3042907714844, "learning_rate": 0.00040529785815139797, "loss": 39.1604, "step": 5489 }, { "epoch": 14.499834929019478, "grad_norm": 307.0923156738281, "learning_rate": 0.0004052651174605385, "loss": 37.7364, "step": 5490 }, { "epoch": 14.502476064707825, "grad_norm": 391.0886535644531, "learning_rate": 0.0004052323724339918, "loss": 36.3001, "step": 5491 }, { "epoch": 14.505117200396171, "grad_norm": 177.70350646972656, "learning_rate": 0.00040519962307267225, "loss": 36.8233, "step": 5492 }, { "epoch": 14.507758336084516, "grad_norm": 456.4559020996094, "learning_rate": 0.00040516686937749446, "loss": 37.6022, "step": 5493 }, { "epoch": 14.510399471772862, "grad_norm": 366.08441162109375, "learning_rate": 0.000405134111349373, "loss": 36.858, "step": 5494 }, { "epoch": 14.513040607461209, "grad_norm": 402.3110046386719, "learning_rate": 0.00040510134898922255, "loss": 37.9491, "step": 5495 }, { "epoch": 14.515681743149555, "grad_norm": 348.07049560546875, "learning_rate": 0.00040506858229795813, "loss": 38.5796, "step": 5496 }, { "epoch": 14.5183228788379, "grad_norm": 278.89215087890625, "learning_rate": 0.0004050358112764946, "loss": 37.0214, "step": 5497 }, { "epoch": 14.520964014526246, "grad_norm": 652.1532592773438, "learning_rate": 0.00040500303592574705, "loss": 37.3298, "step": 5498 }, { "epoch": 14.523605150214593, "grad_norm": 541.06982421875, "learning_rate": 0.00040497025624663084, "loss": 38.3945, "step": 5499 }, { "epoch": 14.526246285902939, "grad_norm": 676.2547607421875, "learning_rate": 0.0004049374722400612, "loss": 38.5739, "step": 5500 }, { "epoch": 14.528887421591284, "grad_norm": 595.5825805664062, "learning_rate": 0.0004049046839069538, "loss": 43.5634, "step": 5501 }, { "epoch": 14.53152855727963, "grad_norm": 221.95864868164062, "learning_rate": 0.00040487189124822394, "loss": 42.0443, "step": 5502 }, { "epoch": 14.534169692967977, "grad_norm": 235.19801330566406, "learning_rate": 0.0004048390942647876, "loss": 42.0836, "step": 5503 }, { "epoch": 14.536810828656321, "grad_norm": 253.49404907226562, "learning_rate": 0.00040480629295756046, "loss": 42.3047, "step": 5504 }, { "epoch": 14.539451964344668, "grad_norm": 234.707275390625, "learning_rate": 0.00040477348732745853, "loss": 43.7527, "step": 5505 }, { "epoch": 14.542093100033014, "grad_norm": 304.43157958984375, "learning_rate": 0.00040474067737539786, "loss": 46.0972, "step": 5506 }, { "epoch": 14.54473423572136, "grad_norm": 210.52407836914062, "learning_rate": 0.0004047078631022947, "loss": 44.1185, "step": 5507 }, { "epoch": 14.547375371409705, "grad_norm": 264.16827392578125, "learning_rate": 0.00040467504450906534, "loss": 43.3614, "step": 5508 }, { "epoch": 14.550016507098052, "grad_norm": 253.31851196289062, "learning_rate": 0.00040464222159662623, "loss": 41.1093, "step": 5509 }, { "epoch": 14.552657642786398, "grad_norm": 216.99871826171875, "learning_rate": 0.00040460939436589395, "loss": 42.9316, "step": 5510 }, { "epoch": 14.555298778474745, "grad_norm": 191.42279052734375, "learning_rate": 0.0004045765628177851, "loss": 41.8388, "step": 5511 }, { "epoch": 14.55793991416309, "grad_norm": 522.44189453125, "learning_rate": 0.00040454372695321664, "loss": 38.7045, "step": 5512 }, { "epoch": 14.560581049851436, "grad_norm": 195.09512329101562, "learning_rate": 0.00040451088677310527, "loss": 38.8106, "step": 5513 }, { "epoch": 14.563222185539782, "grad_norm": 265.4656066894531, "learning_rate": 0.00040447804227836814, "loss": 38.072, "step": 5514 }, { "epoch": 14.565863321228129, "grad_norm": 181.15023803710938, "learning_rate": 0.00040444519346992256, "loss": 36.6522, "step": 5515 }, { "epoch": 14.568504456916473, "grad_norm": 166.5741424560547, "learning_rate": 0.0004044123403486856, "loss": 37.1653, "step": 5516 }, { "epoch": 14.57114559260482, "grad_norm": 166.2372589111328, "learning_rate": 0.0004043794829155747, "loss": 37.6632, "step": 5517 }, { "epoch": 14.573786728293166, "grad_norm": 188.5160369873047, "learning_rate": 0.0004043466211715075, "loss": 36.3297, "step": 5518 }, { "epoch": 14.576427863981513, "grad_norm": 301.613037109375, "learning_rate": 0.00040431375511740155, "loss": 37.78, "step": 5519 }, { "epoch": 14.579068999669857, "grad_norm": 158.4596710205078, "learning_rate": 0.0004042808847541746, "loss": 36.4899, "step": 5520 }, { "epoch": 14.581710135358204, "grad_norm": 175.51657104492188, "learning_rate": 0.0004042480100827446, "loss": 37.5209, "step": 5521 }, { "epoch": 14.58435127104655, "grad_norm": 161.99893188476562, "learning_rate": 0.0004042151311040295, "loss": 38.0515, "step": 5522 }, { "epoch": 14.586992406734897, "grad_norm": 229.07684326171875, "learning_rate": 0.00040418224781894755, "loss": 37.2457, "step": 5523 }, { "epoch": 14.589633542423242, "grad_norm": 1814.76171875, "learning_rate": 0.0004041493602284168, "loss": 58.7243, "step": 5524 }, { "epoch": 14.592274678111588, "grad_norm": 2131.3125, "learning_rate": 0.00040411646833335576, "loss": 67.431, "step": 5525 }, { "epoch": 14.594915813799934, "grad_norm": 1927.9244384765625, "learning_rate": 0.0004040835721346829, "loss": 58.5113, "step": 5526 }, { "epoch": 14.597556949488279, "grad_norm": 2495.350341796875, "learning_rate": 0.00040405067163331676, "loss": 58.8514, "step": 5527 }, { "epoch": 14.600198085176626, "grad_norm": 6214.326171875, "learning_rate": 0.0004040177668301761, "loss": 60.922, "step": 5528 }, { "epoch": 14.602839220864972, "grad_norm": 1208.8619384765625, "learning_rate": 0.00040398485772617977, "loss": 52.2644, "step": 5529 }, { "epoch": 14.605480356553318, "grad_norm": 2606.757080078125, "learning_rate": 0.0004039519443222468, "loss": 44.8248, "step": 5530 }, { "epoch": 14.608121492241663, "grad_norm": 1412.031494140625, "learning_rate": 0.0004039190266192962, "loss": 41.6175, "step": 5531 }, { "epoch": 14.61076262793001, "grad_norm": 3510.244873046875, "learning_rate": 0.00040388610461824717, "loss": 31.6534, "step": 5532 }, { "epoch": 14.613403763618356, "grad_norm": 3205.835693359375, "learning_rate": 0.0004038531783200191, "loss": 39.5542, "step": 5533 }, { "epoch": 14.616044899306702, "grad_norm": 420.93243408203125, "learning_rate": 0.0004038202477255314, "loss": 39.4031, "step": 5534 }, { "epoch": 14.618686034995047, "grad_norm": 589.1253051757812, "learning_rate": 0.0004037873128357036, "loss": 36.6365, "step": 5535 }, { "epoch": 14.621327170683394, "grad_norm": 292.360595703125, "learning_rate": 0.00040375437365145546, "loss": 38.3921, "step": 5536 }, { "epoch": 14.62396830637174, "grad_norm": 296.2291259765625, "learning_rate": 0.0004037214301737068, "loss": 37.5179, "step": 5537 }, { "epoch": 14.626609442060087, "grad_norm": 241.56692504882812, "learning_rate": 0.0004036884824033774, "loss": 37.7963, "step": 5538 }, { "epoch": 14.629250577748431, "grad_norm": 250.53662109375, "learning_rate": 0.00040365553034138745, "loss": 37.0484, "step": 5539 }, { "epoch": 14.631891713436778, "grad_norm": 292.21026611328125, "learning_rate": 0.00040362257398865713, "loss": 36.9162, "step": 5540 }, { "epoch": 14.634532849125124, "grad_norm": 269.0523681640625, "learning_rate": 0.00040358961334610664, "loss": 38.14, "step": 5541 }, { "epoch": 14.63717398481347, "grad_norm": 306.76092529296875, "learning_rate": 0.0004035566484146564, "loss": 37.5483, "step": 5542 }, { "epoch": 14.639815120501815, "grad_norm": 325.33929443359375, "learning_rate": 0.000403523679195227, "loss": 36.5657, "step": 5543 }, { "epoch": 14.642456256190162, "grad_norm": 227.9186553955078, "learning_rate": 0.000403490705688739, "loss": 36.285, "step": 5544 }, { "epoch": 14.645097391878508, "grad_norm": 415.4536437988281, "learning_rate": 0.00040345772789611325, "loss": 37.7919, "step": 5545 }, { "epoch": 14.647738527566855, "grad_norm": 255.3971710205078, "learning_rate": 0.0004034247458182706, "loss": 37.044, "step": 5546 }, { "epoch": 14.6503796632552, "grad_norm": 266.6968994140625, "learning_rate": 0.000403391759456132, "loss": 36.4347, "step": 5547 }, { "epoch": 14.653020798943546, "grad_norm": 284.1669006347656, "learning_rate": 0.0004033587688106186, "loss": 36.227, "step": 5548 }, { "epoch": 14.655661934631892, "grad_norm": 234.38685607910156, "learning_rate": 0.00040332577388265176, "loss": 38.1191, "step": 5549 }, { "epoch": 14.658303070320237, "grad_norm": 1292.8724365234375, "learning_rate": 0.00040329277467315275, "loss": 38.3417, "step": 5550 }, { "epoch": 14.660944206008583, "grad_norm": 723.8731079101562, "learning_rate": 0.00040325977118304293, "loss": 44.8257, "step": 5551 }, { "epoch": 14.66358534169693, "grad_norm": 372.8329772949219, "learning_rate": 0.00040322676341324415, "loss": 46.0017, "step": 5552 }, { "epoch": 14.666226477385276, "grad_norm": 401.6424560546875, "learning_rate": 0.00040319375136467793, "loss": 46.6479, "step": 5553 }, { "epoch": 14.66886761307362, "grad_norm": 351.4795227050781, "learning_rate": 0.0004031607350382662, "loss": 43.2276, "step": 5554 }, { "epoch": 14.671508748761967, "grad_norm": 305.5983581542969, "learning_rate": 0.000403127714434931, "loss": 45.1993, "step": 5555 }, { "epoch": 14.674149884450314, "grad_norm": 254.50613403320312, "learning_rate": 0.0004030946895555943, "loss": 43.3208, "step": 5556 }, { "epoch": 14.67679102013866, "grad_norm": 213.325927734375, "learning_rate": 0.0004030616604011783, "loss": 44.9228, "step": 5557 }, { "epoch": 14.679432155827005, "grad_norm": 335.92803955078125, "learning_rate": 0.00040302862697260533, "loss": 42.709, "step": 5558 }, { "epoch": 14.682073291515351, "grad_norm": 289.7813415527344, "learning_rate": 0.0004029955892707979, "loss": 44.1573, "step": 5559 }, { "epoch": 14.684714427203698, "grad_norm": 326.1020812988281, "learning_rate": 0.0004029625472966785, "loss": 41.1287, "step": 5560 }, { "epoch": 14.687355562892044, "grad_norm": 313.35467529296875, "learning_rate": 0.00040292950105116974, "loss": 42.0123, "step": 5561 }, { "epoch": 14.689996698580389, "grad_norm": 245.88462829589844, "learning_rate": 0.00040289645053519463, "loss": 39.2888, "step": 5562 }, { "epoch": 14.692637834268735, "grad_norm": 467.0791931152344, "learning_rate": 0.0004028633957496759, "loss": 40.8599, "step": 5563 }, { "epoch": 14.695278969957082, "grad_norm": 244.74818420410156, "learning_rate": 0.00040283033669553665, "loss": 41.3185, "step": 5564 }, { "epoch": 14.697920105645428, "grad_norm": 278.9380187988281, "learning_rate": 0.00040279727337369997, "loss": 38.3474, "step": 5565 }, { "epoch": 14.700561241333773, "grad_norm": 343.04486083984375, "learning_rate": 0.0004027642057850893, "loss": 40.6049, "step": 5566 }, { "epoch": 14.70320237702212, "grad_norm": 336.8914489746094, "learning_rate": 0.0004027311339306278, "loss": 37.0878, "step": 5567 }, { "epoch": 14.705843512710466, "grad_norm": 384.63360595703125, "learning_rate": 0.00040269805781123914, "loss": 37.9363, "step": 5568 }, { "epoch": 14.708484648398812, "grad_norm": 235.78404235839844, "learning_rate": 0.0004026649774278469, "loss": 36.3619, "step": 5569 }, { "epoch": 14.711125784087157, "grad_norm": 253.5797576904297, "learning_rate": 0.0004026318927813749, "loss": 37.133, "step": 5570 }, { "epoch": 14.713766919775503, "grad_norm": 283.78973388671875, "learning_rate": 0.00040259880387274696, "loss": 37.754, "step": 5571 }, { "epoch": 14.71640805546385, "grad_norm": 259.4492492675781, "learning_rate": 0.000402565710702887, "loss": 38.9499, "step": 5572 }, { "epoch": 14.719049191152195, "grad_norm": 839.730224609375, "learning_rate": 0.0004025326132727192, "loss": 56.927, "step": 5573 }, { "epoch": 14.721690326840541, "grad_norm": 3267.75341796875, "learning_rate": 0.00040249951158316777, "loss": 95.8117, "step": 5574 }, { "epoch": 14.724331462528887, "grad_norm": 1144.4100341796875, "learning_rate": 0.0004024664056351571, "loss": 85.8587, "step": 5575 }, { "epoch": 14.726972598217234, "grad_norm": 4717.603515625, "learning_rate": 0.00040243329542961166, "loss": 87.0199, "step": 5576 }, { "epoch": 14.729613733905579, "grad_norm": 5243.9052734375, "learning_rate": 0.0004024001809674559, "loss": 91.7599, "step": 5577 }, { "epoch": 14.732254869593925, "grad_norm": 1279.0299072265625, "learning_rate": 0.0004023670622496147, "loss": 81.4286, "step": 5578 }, { "epoch": 14.734896005282272, "grad_norm": 1851.015625, "learning_rate": 0.00040233393927701267, "loss": 89.4722, "step": 5579 }, { "epoch": 14.737537140970618, "grad_norm": 1765.86181640625, "learning_rate": 0.000402300812050575, "loss": 71.7193, "step": 5580 }, { "epoch": 14.740178276658963, "grad_norm": 1449.470458984375, "learning_rate": 0.00040226768057122653, "loss": 54.2262, "step": 5581 }, { "epoch": 14.742819412347309, "grad_norm": 3582.93505859375, "learning_rate": 0.0004022345448398925, "loss": 41.1793, "step": 5582 }, { "epoch": 14.745460548035656, "grad_norm": 562.4647216796875, "learning_rate": 0.00040220140485749836, "loss": 38.7962, "step": 5583 }, { "epoch": 14.748101683724002, "grad_norm": 322.682373046875, "learning_rate": 0.0004021682606249693, "loss": 37.7686, "step": 5584 }, { "epoch": 14.750742819412347, "grad_norm": 244.0998992919922, "learning_rate": 0.00040213511214323103, "loss": 36.7005, "step": 5585 }, { "epoch": 14.753383955100693, "grad_norm": 446.5534362792969, "learning_rate": 0.00040210195941320907, "loss": 37.8807, "step": 5586 }, { "epoch": 14.75602509078904, "grad_norm": 782.8748168945312, "learning_rate": 0.0004020688024358292, "loss": 38.9771, "step": 5587 }, { "epoch": 14.758666226477386, "grad_norm": 592.3160400390625, "learning_rate": 0.0004020356412120174, "loss": 40.269, "step": 5588 }, { "epoch": 14.76130736216573, "grad_norm": 412.36553955078125, "learning_rate": 0.0004020024757426996, "loss": 41.7108, "step": 5589 }, { "epoch": 14.763948497854077, "grad_norm": 488.6809997558594, "learning_rate": 0.000401969306028802, "loss": 40.1898, "step": 5590 }, { "epoch": 14.766589633542424, "grad_norm": 733.9113159179688, "learning_rate": 0.0004019361320712508, "loss": 39.4916, "step": 5591 }, { "epoch": 14.76923076923077, "grad_norm": 366.25640869140625, "learning_rate": 0.0004019029538709724, "loss": 37.4557, "step": 5592 }, { "epoch": 14.771871904919115, "grad_norm": 239.38632202148438, "learning_rate": 0.00040186977142889316, "loss": 38.1348, "step": 5593 }, { "epoch": 14.774513040607461, "grad_norm": 368.3595275878906, "learning_rate": 0.00040183658474593975, "loss": 37.3354, "step": 5594 }, { "epoch": 14.777154176295808, "grad_norm": 433.4449157714844, "learning_rate": 0.00040180339382303896, "loss": 37.087, "step": 5595 }, { "epoch": 14.779795311984152, "grad_norm": 454.0767517089844, "learning_rate": 0.00040177019866111753, "loss": 36.5875, "step": 5596 }, { "epoch": 14.782436447672499, "grad_norm": 512.9886474609375, "learning_rate": 0.0004017369992611025, "loss": 37.6186, "step": 5597 }, { "epoch": 14.785077583360845, "grad_norm": 396.12799072265625, "learning_rate": 0.0004017037956239209, "loss": 38.8992, "step": 5598 }, { "epoch": 14.787718719049192, "grad_norm": 730.18017578125, "learning_rate": 0.00040167058775049993, "loss": 37.6327, "step": 5599 }, { "epoch": 14.790359854737536, "grad_norm": 724.6292724609375, "learning_rate": 0.0004016373756417668, "loss": 40.9808, "step": 5600 }, { "epoch": 14.790359854737536, "eval_loss": 4.439844608306885, "eval_runtime": 2.164, "eval_samples_per_second": 228.744, "eval_steps_per_second": 28.651, "step": 5600 }, { "epoch": 14.793000990425883, "grad_norm": 667.091064453125, "learning_rate": 0.00040160415929864926, "loss": 42.5813, "step": 5601 }, { "epoch": 14.79564212611423, "grad_norm": 260.9648742675781, "learning_rate": 0.0004015709387220744, "loss": 41.4196, "step": 5602 }, { "epoch": 14.798283261802576, "grad_norm": 750.6450805664062, "learning_rate": 0.00040153771391297023, "loss": 42.3602, "step": 5603 }, { "epoch": 14.80092439749092, "grad_norm": 1291.53125, "learning_rate": 0.0004015044848722643, "loss": 40.7418, "step": 5604 }, { "epoch": 14.803565533179267, "grad_norm": 443.28173828125, "learning_rate": 0.00040147125160088474, "loss": 40.3521, "step": 5605 }, { "epoch": 14.806206668867613, "grad_norm": 438.0926208496094, "learning_rate": 0.00040143801409975945, "loss": 43.9766, "step": 5606 }, { "epoch": 14.80884780455596, "grad_norm": 901.4999389648438, "learning_rate": 0.00040140477236981655, "loss": 43.4698, "step": 5607 }, { "epoch": 14.811488940244304, "grad_norm": 784.054443359375, "learning_rate": 0.0004013715264119844, "loss": 43.0582, "step": 5608 }, { "epoch": 14.81413007593265, "grad_norm": 439.7117614746094, "learning_rate": 0.0004013382762271911, "loss": 41.0386, "step": 5609 }, { "epoch": 14.816771211620997, "grad_norm": 359.660400390625, "learning_rate": 0.00040130502181636547, "loss": 45.1973, "step": 5610 }, { "epoch": 14.819412347309344, "grad_norm": 548.9259033203125, "learning_rate": 0.00040127176318043603, "loss": 39.3133, "step": 5611 }, { "epoch": 14.822053482997688, "grad_norm": 354.61602783203125, "learning_rate": 0.0004012385003203314, "loss": 40.9257, "step": 5612 }, { "epoch": 14.824694618686035, "grad_norm": 419.571533203125, "learning_rate": 0.0004012052332369804, "loss": 38.9156, "step": 5613 }, { "epoch": 14.827335754374381, "grad_norm": 479.83111572265625, "learning_rate": 0.00040117196193131224, "loss": 37.6633, "step": 5614 }, { "epoch": 14.829976890062728, "grad_norm": 317.7039794921875, "learning_rate": 0.0004011386864042558, "loss": 37.4967, "step": 5615 }, { "epoch": 14.832618025751072, "grad_norm": 462.40203857421875, "learning_rate": 0.00040110540665674036, "loss": 38.357, "step": 5616 }, { "epoch": 14.835259161439419, "grad_norm": 470.30645751953125, "learning_rate": 0.0004010721226896951, "loss": 36.1525, "step": 5617 }, { "epoch": 14.837900297127765, "grad_norm": 548.6552124023438, "learning_rate": 0.0004010388345040497, "loss": 37.1834, "step": 5618 }, { "epoch": 14.84054143281611, "grad_norm": 496.3546447753906, "learning_rate": 0.00040100554210073335, "loss": 38.0144, "step": 5619 }, { "epoch": 14.843182568504456, "grad_norm": 1532.377685546875, "learning_rate": 0.00040097224548067613, "loss": 36.7884, "step": 5620 }, { "epoch": 14.845823704192803, "grad_norm": 474.466796875, "learning_rate": 0.0004009389446448076, "loss": 37.978, "step": 5621 }, { "epoch": 14.84846483988115, "grad_norm": 794.2891235351562, "learning_rate": 0.0004009056395940577, "loss": 44.2423, "step": 5622 }, { "epoch": 14.851105975569494, "grad_norm": 1798.1224365234375, "learning_rate": 0.00040087233032935646, "loss": 52.4857, "step": 5623 }, { "epoch": 14.85374711125784, "grad_norm": 6751.45947265625, "learning_rate": 0.000400839016851634, "loss": 52.0182, "step": 5624 }, { "epoch": 14.856388246946187, "grad_norm": 1535.81396484375, "learning_rate": 0.0004008056991618206, "loss": 47.8621, "step": 5625 }, { "epoch": 14.859029382634533, "grad_norm": 3898.07275390625, "learning_rate": 0.0004007723772608467, "loss": 36.8409, "step": 5626 }, { "epoch": 14.861670518322878, "grad_norm": 4162.6142578125, "learning_rate": 0.00040073905114964274, "loss": 34.317, "step": 5627 }, { "epoch": 14.864311654011225, "grad_norm": 2030.7286376953125, "learning_rate": 0.0004007057208291393, "loss": 37.786, "step": 5628 }, { "epoch": 14.866952789699571, "grad_norm": 1917.438232421875, "learning_rate": 0.00040067238630026703, "loss": 27.7274, "step": 5629 }, { "epoch": 14.869593925387917, "grad_norm": 3519.165771484375, "learning_rate": 0.00040063904756395695, "loss": 27.3517, "step": 5630 }, { "epoch": 14.872235061076262, "grad_norm": 2784.136962890625, "learning_rate": 0.00040060570462114003, "loss": 21.2803, "step": 5631 }, { "epoch": 14.874876196764609, "grad_norm": 2110.906494140625, "learning_rate": 0.00040057235747274723, "loss": 26.099, "step": 5632 }, { "epoch": 14.877517332452955, "grad_norm": 675.8025512695312, "learning_rate": 0.00040053900611970977, "loss": 39.6528, "step": 5633 }, { "epoch": 14.880158468141301, "grad_norm": 480.0870666503906, "learning_rate": 0.0004005056505629591, "loss": 40.1529, "step": 5634 }, { "epoch": 14.882799603829646, "grad_norm": 467.966064453125, "learning_rate": 0.0004004722908034264, "loss": 38.0903, "step": 5635 }, { "epoch": 14.885440739517993, "grad_norm": 307.2192687988281, "learning_rate": 0.0004004389268420434, "loss": 37.9829, "step": 5636 }, { "epoch": 14.888081875206339, "grad_norm": 421.4923400878906, "learning_rate": 0.00040040555867974185, "loss": 38.5682, "step": 5637 }, { "epoch": 14.890723010894686, "grad_norm": 881.877197265625, "learning_rate": 0.0004003721863174533, "loss": 37.6618, "step": 5638 }, { "epoch": 14.89336414658303, "grad_norm": 1152.4735107421875, "learning_rate": 0.0004003388097561099, "loss": 39.1686, "step": 5639 }, { "epoch": 14.896005282271377, "grad_norm": 606.11962890625, "learning_rate": 0.00040030542899664344, "loss": 37.7674, "step": 5640 }, { "epoch": 14.898646417959723, "grad_norm": 616.20751953125, "learning_rate": 0.0004002720440399863, "loss": 37.4452, "step": 5641 }, { "epoch": 14.901287553648068, "grad_norm": 651.6256713867188, "learning_rate": 0.00040023865488707043, "loss": 38.5571, "step": 5642 }, { "epoch": 14.903928689336414, "grad_norm": 667.93408203125, "learning_rate": 0.0004002052615388285, "loss": 38.9921, "step": 5643 }, { "epoch": 14.90656982502476, "grad_norm": 515.9508056640625, "learning_rate": 0.0004001718639961929, "loss": 36.7024, "step": 5644 }, { "epoch": 14.909210960713107, "grad_norm": 692.1105346679688, "learning_rate": 0.0004001384622600961, "loss": 38.6424, "step": 5645 }, { "epoch": 14.911852096401452, "grad_norm": 517.414306640625, "learning_rate": 0.00040010505633147106, "loss": 37.5625, "step": 5646 }, { "epoch": 14.914493232089798, "grad_norm": 1332.0706787109375, "learning_rate": 0.0004000716462112504, "loss": 37.3143, "step": 5647 }, { "epoch": 14.917134367778145, "grad_norm": 1503.4444580078125, "learning_rate": 0.00040003823190036726, "loss": 37.1305, "step": 5648 }, { "epoch": 14.919775503466491, "grad_norm": 551.1129760742188, "learning_rate": 0.0004000048133997546, "loss": 38.0031, "step": 5649 }, { "epoch": 14.922416639154836, "grad_norm": 1133.962890625, "learning_rate": 0.00039997139071034553, "loss": 40.0545, "step": 5650 }, { "epoch": 14.925057774843182, "grad_norm": 2578.65576171875, "learning_rate": 0.00039993796383307363, "loss": 47.2877, "step": 5651 }, { "epoch": 14.927698910531529, "grad_norm": 925.0799560546875, "learning_rate": 0.00039990453276887207, "loss": 49.1361, "step": 5652 }, { "epoch": 14.930340046219875, "grad_norm": 751.7777099609375, "learning_rate": 0.00039987109751867455, "loss": 48.1523, "step": 5653 }, { "epoch": 14.93298118190822, "grad_norm": 607.6380615234375, "learning_rate": 0.00039983765808341464, "loss": 48.4893, "step": 5654 }, { "epoch": 14.935622317596566, "grad_norm": 1776.3084716796875, "learning_rate": 0.00039980421446402615, "loss": 44.2094, "step": 5655 }, { "epoch": 14.938263453284913, "grad_norm": 634.803955078125, "learning_rate": 0.000399770766661443, "loss": 43.6942, "step": 5656 }, { "epoch": 14.94090458897326, "grad_norm": 473.6617431640625, "learning_rate": 0.0003997373146765991, "loss": 41.1863, "step": 5657 }, { "epoch": 14.943545724661604, "grad_norm": 488.1770324707031, "learning_rate": 0.00039970385851042877, "loss": 42.6165, "step": 5658 }, { "epoch": 14.94618686034995, "grad_norm": 365.65509033203125, "learning_rate": 0.000399670398163866, "loss": 39.6098, "step": 5659 }, { "epoch": 14.948827996038297, "grad_norm": 306.5569152832031, "learning_rate": 0.0003996369336378454, "loss": 38.8139, "step": 5660 }, { "epoch": 14.951469131726643, "grad_norm": 346.97796630859375, "learning_rate": 0.0003996034649333012, "loss": 36.2981, "step": 5661 }, { "epoch": 14.954110267414988, "grad_norm": 292.58892822265625, "learning_rate": 0.0003995699920511683, "loss": 36.7243, "step": 5662 }, { "epoch": 14.956751403103334, "grad_norm": 524.9547119140625, "learning_rate": 0.0003995365149923811, "loss": 37.8885, "step": 5663 }, { "epoch": 14.95939253879168, "grad_norm": 4566.1318359375, "learning_rate": 0.00039950303375787453, "loss": 41.2991, "step": 5664 }, { "epoch": 14.962033674480026, "grad_norm": 1329.574462890625, "learning_rate": 0.00039946954834858367, "loss": 43.6267, "step": 5665 }, { "epoch": 14.964674810168372, "grad_norm": 2600.179443359375, "learning_rate": 0.00039943605876544344, "loss": 39.8571, "step": 5666 }, { "epoch": 14.967315945856718, "grad_norm": 3496.763427734375, "learning_rate": 0.0003994025650093891, "loss": 42.8045, "step": 5667 }, { "epoch": 14.969957081545065, "grad_norm": 1715.685791015625, "learning_rate": 0.0003993690670813558, "loss": 32.9397, "step": 5668 }, { "epoch": 14.97259821723341, "grad_norm": 467.0994873046875, "learning_rate": 0.00039933556498227907, "loss": 38.1399, "step": 5669 }, { "epoch": 14.975239352921756, "grad_norm": 943.7830810546875, "learning_rate": 0.0003993020587130944, "loss": 39.3033, "step": 5670 }, { "epoch": 14.977880488610102, "grad_norm": 386.3266906738281, "learning_rate": 0.00039926854827473745, "loss": 38.2558, "step": 5671 }, { "epoch": 14.980521624298449, "grad_norm": 600.9826049804688, "learning_rate": 0.0003992350336681441, "loss": 37.6767, "step": 5672 }, { "epoch": 14.983162759986794, "grad_norm": 796.2494506835938, "learning_rate": 0.0003992015148942499, "loss": 37.3435, "step": 5673 }, { "epoch": 14.98580389567514, "grad_norm": 318.12066650390625, "learning_rate": 0.00039916799195399125, "loss": 39.1746, "step": 5674 }, { "epoch": 14.988445031363486, "grad_norm": 401.0943298339844, "learning_rate": 0.0003991344648483039, "loss": 37.1678, "step": 5675 }, { "epoch": 14.991086167051833, "grad_norm": 289.831298828125, "learning_rate": 0.00039910093357812426, "loss": 37.9701, "step": 5676 }, { "epoch": 14.993727302740178, "grad_norm": 289.1233825683594, "learning_rate": 0.0003990673981443886, "loss": 37.7014, "step": 5677 }, { "epoch": 14.996368438428524, "grad_norm": 1267.8099365234375, "learning_rate": 0.0003990338585480335, "loss": 39.0543, "step": 5678 }, { "epoch": 14.99900957411687, "grad_norm": 1304.169677734375, "learning_rate": 0.00039900031478999544, "loss": 42.1204, "step": 5679 }, { "epoch": 15.001650709805217, "grad_norm": 436.1067199707031, "learning_rate": 0.000398966766871211, "loss": 42.6425, "step": 5680 }, { "epoch": 15.004291845493562, "grad_norm": 337.63201904296875, "learning_rate": 0.00039893321479261726, "loss": 42.5606, "step": 5681 }, { "epoch": 15.006932981181908, "grad_norm": 387.17578125, "learning_rate": 0.00039889965855515087, "loss": 44.646, "step": 5682 }, { "epoch": 15.009574116870255, "grad_norm": 327.8307189941406, "learning_rate": 0.00039886609815974905, "loss": 43.4465, "step": 5683 }, { "epoch": 15.012215252558601, "grad_norm": 259.6901550292969, "learning_rate": 0.00039883253360734885, "loss": 43.0631, "step": 5684 }, { "epoch": 15.014856388246946, "grad_norm": 203.62290954589844, "learning_rate": 0.0003987989648988876, "loss": 41.9503, "step": 5685 }, { "epoch": 15.017497523935292, "grad_norm": 234.0961151123047, "learning_rate": 0.00039876539203530264, "loss": 42.3769, "step": 5686 }, { "epoch": 15.020138659623639, "grad_norm": 1032.99072265625, "learning_rate": 0.00039873181501753153, "loss": 41.892, "step": 5687 }, { "epoch": 15.022779795311983, "grad_norm": 226.61245727539062, "learning_rate": 0.0003986982338465118, "loss": 43.6847, "step": 5688 }, { "epoch": 15.02542093100033, "grad_norm": 464.2698974609375, "learning_rate": 0.0003986646485231813, "loss": 40.3116, "step": 5689 }, { "epoch": 15.028062066688676, "grad_norm": 288.3354187011719, "learning_rate": 0.0003986310590484778, "loss": 42.1854, "step": 5690 }, { "epoch": 15.030703202377023, "grad_norm": 199.769287109375, "learning_rate": 0.0003985974654233393, "loss": 38.4697, "step": 5691 }, { "epoch": 15.033344338065367, "grad_norm": 237.03564453125, "learning_rate": 0.00039856386764870386, "loss": 39.7227, "step": 5692 }, { "epoch": 15.035985473753714, "grad_norm": 237.82427978515625, "learning_rate": 0.00039853026572550965, "loss": 37.469, "step": 5693 }, { "epoch": 15.03862660944206, "grad_norm": 222.12380981445312, "learning_rate": 0.00039849665965469505, "loss": 38.7365, "step": 5694 }, { "epoch": 15.041267745130407, "grad_norm": 349.42022705078125, "learning_rate": 0.00039846304943719847, "loss": 38.0285, "step": 5695 }, { "epoch": 15.043908880818751, "grad_norm": 266.4547424316406, "learning_rate": 0.00039842943507395847, "loss": 36.9199, "step": 5696 }, { "epoch": 15.046550016507098, "grad_norm": 290.86212158203125, "learning_rate": 0.00039839581656591364, "loss": 37.1576, "step": 5697 }, { "epoch": 15.049191152195444, "grad_norm": 261.9786682128906, "learning_rate": 0.0003983621939140028, "loss": 36.5687, "step": 5698 }, { "epoch": 15.05183228788379, "grad_norm": 236.036376953125, "learning_rate": 0.0003983285671191649, "loss": 37.5955, "step": 5699 }, { "epoch": 15.054473423572135, "grad_norm": 372.9809265136719, "learning_rate": 0.0003982949361823388, "loss": 38.0055, "step": 5700 }, { "epoch": 15.057114559260482, "grad_norm": 568.1593017578125, "learning_rate": 0.0003982613011044638, "loss": 37.6058, "step": 5701 }, { "epoch": 15.059755694948828, "grad_norm": 5579.97314453125, "learning_rate": 0.0003982276618864791, "loss": 59.7951, "step": 5702 }, { "epoch": 15.062396830637175, "grad_norm": 10737.5859375, "learning_rate": 0.0003981940185293239, "loss": 64.7485, "step": 5703 }, { "epoch": 15.06503796632552, "grad_norm": 1677.2021484375, "learning_rate": 0.00039816037103393784, "loss": 64.4053, "step": 5704 }, { "epoch": 15.067679102013866, "grad_norm": 3851.433837890625, "learning_rate": 0.00039812671940126046, "loss": 74.4179, "step": 5705 }, { "epoch": 15.070320237702212, "grad_norm": 2461.105712890625, "learning_rate": 0.0003980930636322314, "loss": 62.2437, "step": 5706 }, { "epoch": 15.072961373390559, "grad_norm": 2208.874267578125, "learning_rate": 0.00039805940372779065, "loss": 67.1192, "step": 5707 }, { "epoch": 15.075602509078903, "grad_norm": 9609.0673828125, "learning_rate": 0.00039802573968887795, "loss": 61.7108, "step": 5708 }, { "epoch": 15.07824364476725, "grad_norm": 1659.3724365234375, "learning_rate": 0.00039799207151643346, "loss": 54.0657, "step": 5709 }, { "epoch": 15.080884780455596, "grad_norm": 3410.561767578125, "learning_rate": 0.0003979583992113973, "loss": 53.5991, "step": 5710 }, { "epoch": 15.083525916143941, "grad_norm": 1638.6280517578125, "learning_rate": 0.0003979247227747097, "loss": 39.1181, "step": 5711 }, { "epoch": 15.086167051832287, "grad_norm": 625.9146118164062, "learning_rate": 0.0003978910422073112, "loss": 42.2972, "step": 5712 }, { "epoch": 15.088808187520634, "grad_norm": 846.236572265625, "learning_rate": 0.00039785735751014215, "loss": 37.7649, "step": 5713 }, { "epoch": 15.09144932320898, "grad_norm": 649.1726684570312, "learning_rate": 0.00039782366868414327, "loss": 38.2438, "step": 5714 }, { "epoch": 15.094090458897325, "grad_norm": 239.6650848388672, "learning_rate": 0.0003977899757302553, "loss": 37.2477, "step": 5715 }, { "epoch": 15.096731594585671, "grad_norm": 190.68865966796875, "learning_rate": 0.00039775627864941905, "loss": 37.2761, "step": 5716 }, { "epoch": 15.099372730274018, "grad_norm": 246.37200927734375, "learning_rate": 0.0003977225774425755, "loss": 37.5996, "step": 5717 }, { "epoch": 15.102013865962364, "grad_norm": 338.43524169921875, "learning_rate": 0.00039768887211066574, "loss": 38.6299, "step": 5718 }, { "epoch": 15.104655001650709, "grad_norm": 235.27394104003906, "learning_rate": 0.0003976551626546311, "loss": 37.9836, "step": 5719 }, { "epoch": 15.107296137339056, "grad_norm": 296.7807312011719, "learning_rate": 0.0003976214490754126, "loss": 37.6804, "step": 5720 }, { "epoch": 15.109937273027402, "grad_norm": 346.1487121582031, "learning_rate": 0.000397587731373952, "loss": 37.5678, "step": 5721 }, { "epoch": 15.112578408715748, "grad_norm": 286.8078308105469, "learning_rate": 0.0003975540095511906, "loss": 37.105, "step": 5722 }, { "epoch": 15.115219544404093, "grad_norm": 319.15936279296875, "learning_rate": 0.00039752028360807024, "loss": 37.5836, "step": 5723 }, { "epoch": 15.11786068009244, "grad_norm": 382.6737365722656, "learning_rate": 0.0003974865535455326, "loss": 37.9881, "step": 5724 }, { "epoch": 15.120501815780786, "grad_norm": 416.7841796875, "learning_rate": 0.0003974528193645196, "loss": 37.4976, "step": 5725 }, { "epoch": 15.123142951469132, "grad_norm": 537.4453735351562, "learning_rate": 0.00039741908106597325, "loss": 38.8557, "step": 5726 }, { "epoch": 15.125784087157477, "grad_norm": 398.1562194824219, "learning_rate": 0.0003973853386508356, "loss": 37.5656, "step": 5727 }, { "epoch": 15.128425222845824, "grad_norm": 350.2348327636719, "learning_rate": 0.00039735159212004895, "loss": 38.1567, "step": 5728 }, { "epoch": 15.13106635853417, "grad_norm": 515.3190307617188, "learning_rate": 0.0003973178414745556, "loss": 40.601, "step": 5729 }, { "epoch": 15.133707494222516, "grad_norm": 1830.20166015625, "learning_rate": 0.00039728408671529827, "loss": 43.7739, "step": 5730 }, { "epoch": 15.136348629910861, "grad_norm": 225.4771270751953, "learning_rate": 0.00039725032784321913, "loss": 42.213, "step": 5731 }, { "epoch": 15.138989765599208, "grad_norm": 282.7248229980469, "learning_rate": 0.0003972165648592612, "loss": 41.8888, "step": 5732 }, { "epoch": 15.141630901287554, "grad_norm": 204.54714965820312, "learning_rate": 0.0003971827977643671, "loss": 41.9661, "step": 5733 }, { "epoch": 15.144272036975899, "grad_norm": 193.67962646484375, "learning_rate": 0.0003971490265594799, "loss": 44.5577, "step": 5734 }, { "epoch": 15.146913172664245, "grad_norm": 335.057373046875, "learning_rate": 0.00039711525124554257, "loss": 41.8296, "step": 5735 }, { "epoch": 15.149554308352592, "grad_norm": 209.02349853515625, "learning_rate": 0.00039708147182349816, "loss": 42.9238, "step": 5736 }, { "epoch": 15.152195444040938, "grad_norm": 219.69613647460938, "learning_rate": 0.00039704768829429014, "loss": 41.2718, "step": 5737 }, { "epoch": 15.154836579729283, "grad_norm": 204.1127471923828, "learning_rate": 0.00039701390065886177, "loss": 40.2109, "step": 5738 }, { "epoch": 15.15747771541763, "grad_norm": 240.70436096191406, "learning_rate": 0.0003969801089181566, "loss": 40.3295, "step": 5739 }, { "epoch": 15.160118851105976, "grad_norm": 199.691162109375, "learning_rate": 0.0003969463130731183, "loss": 38.8887, "step": 5740 }, { "epoch": 15.162759986794322, "grad_norm": 255.61044311523438, "learning_rate": 0.00039691251312469044, "loss": 38.8426, "step": 5741 }, { "epoch": 15.165401122482667, "grad_norm": 240.57723999023438, "learning_rate": 0.000396878709073817, "loss": 40.9813, "step": 5742 }, { "epoch": 15.168042258171013, "grad_norm": 173.88931274414062, "learning_rate": 0.00039684490092144187, "loss": 38.0103, "step": 5743 }, { "epoch": 15.17068339385936, "grad_norm": 196.88270568847656, "learning_rate": 0.0003968110886685092, "loss": 37.3703, "step": 5744 }, { "epoch": 15.173324529547706, "grad_norm": 148.7333221435547, "learning_rate": 0.00039677727231596303, "loss": 37.4086, "step": 5745 }, { "epoch": 15.17596566523605, "grad_norm": 189.10569763183594, "learning_rate": 0.00039674345186474773, "loss": 37.2817, "step": 5746 }, { "epoch": 15.178606800924397, "grad_norm": 331.4669494628906, "learning_rate": 0.00039670962731580783, "loss": 37.488, "step": 5747 }, { "epoch": 15.181247936612744, "grad_norm": 287.162841796875, "learning_rate": 0.0003966757986700877, "loss": 36.1929, "step": 5748 }, { "epoch": 15.18388907230109, "grad_norm": 306.1023254394531, "learning_rate": 0.00039664196592853213, "loss": 37.4036, "step": 5749 }, { "epoch": 15.186530207989435, "grad_norm": 291.97540283203125, "learning_rate": 0.00039660812909208577, "loss": 38.7749, "step": 5750 }, { "epoch": 15.189171343677781, "grad_norm": 8349.814453125, "learning_rate": 0.0003965742881616935, "loss": 45.6179, "step": 5751 }, { "epoch": 15.191812479366128, "grad_norm": 3907.587646484375, "learning_rate": 0.0003965404431383004, "loss": 64.3127, "step": 5752 }, { "epoch": 15.194453615054474, "grad_norm": 1035.72265625, "learning_rate": 0.0003965065940228514, "loss": 61.8094, "step": 5753 }, { "epoch": 15.197094750742819, "grad_norm": 5582.576171875, "learning_rate": 0.0003964727408162919, "loss": 60.9024, "step": 5754 }, { "epoch": 15.199735886431165, "grad_norm": 2278.852294921875, "learning_rate": 0.0003964388835195671, "loss": 56.8441, "step": 5755 }, { "epoch": 15.202377022119512, "grad_norm": 9019.66015625, "learning_rate": 0.00039640502213362264, "loss": 53.4827, "step": 5756 }, { "epoch": 15.205018157807856, "grad_norm": 2811.995849609375, "learning_rate": 0.00039637115665940384, "loss": 41.9497, "step": 5757 }, { "epoch": 15.207659293496203, "grad_norm": 2060.924072265625, "learning_rate": 0.0003963372870978564, "loss": 42.3113, "step": 5758 }, { "epoch": 15.21030042918455, "grad_norm": 2029.0030517578125, "learning_rate": 0.0003963034134499263, "loss": 37.7059, "step": 5759 }, { "epoch": 15.212941564872896, "grad_norm": 1337.12158203125, "learning_rate": 0.00039626953571655926, "loss": 29.8717, "step": 5760 }, { "epoch": 15.21558270056124, "grad_norm": 354.3843994140625, "learning_rate": 0.0003962356538987014, "loss": 34.9203, "step": 5761 }, { "epoch": 15.218223836249587, "grad_norm": 361.232421875, "learning_rate": 0.0003962017679972988, "loss": 39.0523, "step": 5762 }, { "epoch": 15.220864971937933, "grad_norm": 452.79278564453125, "learning_rate": 0.0003961678780132977, "loss": 38.0374, "step": 5763 }, { "epoch": 15.22350610762628, "grad_norm": 386.1086730957031, "learning_rate": 0.00039613398394764445, "loss": 36.7072, "step": 5764 }, { "epoch": 15.226147243314625, "grad_norm": 405.8882751464844, "learning_rate": 0.00039610008580128554, "loss": 38.0968, "step": 5765 }, { "epoch": 15.228788379002971, "grad_norm": 330.7980651855469, "learning_rate": 0.00039606618357516753, "loss": 37.9635, "step": 5766 }, { "epoch": 15.231429514691317, "grad_norm": 264.4001159667969, "learning_rate": 0.00039603227727023725, "loss": 37.1738, "step": 5767 }, { "epoch": 15.234070650379664, "grad_norm": 330.2308044433594, "learning_rate": 0.0003959983668874414, "loss": 38.0768, "step": 5768 }, { "epoch": 15.236711786068009, "grad_norm": 405.5726623535156, "learning_rate": 0.00039596445242772685, "loss": 35.9575, "step": 5769 }, { "epoch": 15.239352921756355, "grad_norm": 201.16030883789062, "learning_rate": 0.0003959305338920407, "loss": 36.7295, "step": 5770 }, { "epoch": 15.241994057444701, "grad_norm": 439.1127624511719, "learning_rate": 0.00039589661128133015, "loss": 38.2243, "step": 5771 }, { "epoch": 15.244635193133048, "grad_norm": 248.01968383789062, "learning_rate": 0.0003958626845965424, "loss": 37.379, "step": 5772 }, { "epoch": 15.247276328821393, "grad_norm": 444.4195251464844, "learning_rate": 0.0003958287538386249, "loss": 37.5446, "step": 5773 }, { "epoch": 15.249917464509739, "grad_norm": 295.5962829589844, "learning_rate": 0.0003957948190085252, "loss": 37.0165, "step": 5774 }, { "epoch": 15.252558600198086, "grad_norm": 424.9654541015625, "learning_rate": 0.0003957608801071907, "loss": 37.754, "step": 5775 }, { "epoch": 15.255199735886432, "grad_norm": 232.47360229492188, "learning_rate": 0.0003957269371355693, "loss": 36.9924, "step": 5776 }, { "epoch": 15.257840871574777, "grad_norm": 362.7882385253906, "learning_rate": 0.0003956929900946088, "loss": 38.5936, "step": 5777 }, { "epoch": 15.260482007263123, "grad_norm": 1004.7485961914062, "learning_rate": 0.0003956590389852571, "loss": 39.1874, "step": 5778 }, { "epoch": 15.26312314295147, "grad_norm": 1198.3603515625, "learning_rate": 0.0003956250838084623, "loss": 41.2583, "step": 5779 }, { "epoch": 15.265764278639814, "grad_norm": 363.9424743652344, "learning_rate": 0.0003955911245651726, "loss": 39.9356, "step": 5780 }, { "epoch": 15.26840541432816, "grad_norm": 264.7147521972656, "learning_rate": 0.00039555716125633624, "loss": 43.3732, "step": 5781 }, { "epoch": 15.271046550016507, "grad_norm": 293.41015625, "learning_rate": 0.0003955231938829017, "loss": 42.2632, "step": 5782 }, { "epoch": 15.273687685704854, "grad_norm": 331.3219299316406, "learning_rate": 0.0003954892224458174, "loss": 42.0621, "step": 5783 }, { "epoch": 15.276328821393198, "grad_norm": 384.46728515625, "learning_rate": 0.00039545524694603205, "loss": 41.699, "step": 5784 }, { "epoch": 15.278969957081545, "grad_norm": 663.9901733398438, "learning_rate": 0.0003954212673844944, "loss": 46.8783, "step": 5785 }, { "epoch": 15.281611092769891, "grad_norm": 223.2167205810547, "learning_rate": 0.0003953872837621533, "loss": 43.0853, "step": 5786 }, { "epoch": 15.284252228458238, "grad_norm": 270.3028869628906, "learning_rate": 0.0003953532960799577, "loss": 42.3551, "step": 5787 }, { "epoch": 15.286893364146582, "grad_norm": 238.2912139892578, "learning_rate": 0.0003953193043388566, "loss": 41.2492, "step": 5788 }, { "epoch": 15.289534499834929, "grad_norm": 299.5049743652344, "learning_rate": 0.0003952853085397994, "loss": 41.1007, "step": 5789 }, { "epoch": 15.292175635523275, "grad_norm": 296.83648681640625, "learning_rate": 0.00039525130868373526, "loss": 41.3787, "step": 5790 }, { "epoch": 15.294816771211622, "grad_norm": 323.9659423828125, "learning_rate": 0.00039521730477161366, "loss": 40.611, "step": 5791 }, { "epoch": 15.297457906899966, "grad_norm": 241.50299072265625, "learning_rate": 0.00039518329680438414, "loss": 39.3285, "step": 5792 }, { "epoch": 15.300099042588313, "grad_norm": 287.96380615234375, "learning_rate": 0.00039514928478299625, "loss": 39.383, "step": 5793 }, { "epoch": 15.30274017827666, "grad_norm": 334.57342529296875, "learning_rate": 0.00039511526870839996, "loss": 37.864, "step": 5794 }, { "epoch": 15.305381313965006, "grad_norm": 279.1084899902344, "learning_rate": 0.00039508124858154503, "loss": 37.2224, "step": 5795 }, { "epoch": 15.30802244965335, "grad_norm": 326.77581787109375, "learning_rate": 0.00039504722440338144, "loss": 38.6169, "step": 5796 }, { "epoch": 15.310663585341697, "grad_norm": 235.86207580566406, "learning_rate": 0.0003950131961748593, "loss": 36.4562, "step": 5797 }, { "epoch": 15.313304721030043, "grad_norm": 233.0327911376953, "learning_rate": 0.00039497916389692885, "loss": 36.6524, "step": 5798 }, { "epoch": 15.31594585671839, "grad_norm": 589.4620361328125, "learning_rate": 0.00039494512757054035, "loss": 37.959, "step": 5799 }, { "epoch": 15.318586992406734, "grad_norm": 406.3992004394531, "learning_rate": 0.0003949110871966444, "loss": 37.3181, "step": 5800 }, { "epoch": 15.318586992406734, "eval_loss": 5.10468053817749, "eval_runtime": 2.1623, "eval_samples_per_second": 228.924, "eval_steps_per_second": 28.673, "step": 5800 }, { "epoch": 15.32122812809508, "grad_norm": 361.3656311035156, "learning_rate": 0.0003948770427761914, "loss": 37.2567, "step": 5801 }, { "epoch": 15.323869263783427, "grad_norm": 1240.23193359375, "learning_rate": 0.0003948429943101322, "loss": 46.5154, "step": 5802 }, { "epoch": 15.326510399471772, "grad_norm": 2433.556640625, "learning_rate": 0.0003948089417994173, "loss": 71.0121, "step": 5803 }, { "epoch": 15.329151535160118, "grad_norm": 3241.239501953125, "learning_rate": 0.00039477488524499783, "loss": 63.2293, "step": 5804 }, { "epoch": 15.331792670848465, "grad_norm": 3058.2568359375, "learning_rate": 0.0003947408246478248, "loss": 67.0723, "step": 5805 }, { "epoch": 15.334433806536811, "grad_norm": 2133.59619140625, "learning_rate": 0.0003947067600088492, "loss": 51.5625, "step": 5806 }, { "epoch": 15.337074942225156, "grad_norm": 10924.0419921875, "learning_rate": 0.00039467269132902234, "loss": 52.1243, "step": 5807 }, { "epoch": 15.339716077913502, "grad_norm": 11050.1884765625, "learning_rate": 0.0003946386186092956, "loss": 48.3664, "step": 5808 }, { "epoch": 15.342357213601849, "grad_norm": 2228.3798828125, "learning_rate": 0.0003946045418506203, "loss": 37.3618, "step": 5809 }, { "epoch": 15.344998349290195, "grad_norm": 2639.481689453125, "learning_rate": 0.0003945704610539483, "loss": 37.9618, "step": 5810 }, { "epoch": 15.34763948497854, "grad_norm": 3389.7958984375, "learning_rate": 0.000394536376220231, "loss": 34.408, "step": 5811 }, { "epoch": 15.350280620666886, "grad_norm": 1321.26953125, "learning_rate": 0.0003945022873504203, "loss": 32.7262, "step": 5812 }, { "epoch": 15.352921756355233, "grad_norm": 286.89129638671875, "learning_rate": 0.00039446819444546806, "loss": 39.6514, "step": 5813 }, { "epoch": 15.35556289204358, "grad_norm": 371.12408447265625, "learning_rate": 0.00039443409750632635, "loss": 37.216, "step": 5814 }, { "epoch": 15.358204027731924, "grad_norm": 310.6380920410156, "learning_rate": 0.0003943999965339474, "loss": 37.4846, "step": 5815 }, { "epoch": 15.36084516342027, "grad_norm": 308.7300720214844, "learning_rate": 0.0003943658915292833, "loss": 37.8828, "step": 5816 }, { "epoch": 15.363486299108617, "grad_norm": 280.0924377441406, "learning_rate": 0.00039433178249328657, "loss": 37.1735, "step": 5817 }, { "epoch": 15.366127434796963, "grad_norm": 236.79307556152344, "learning_rate": 0.00039429766942690946, "loss": 37.4485, "step": 5818 }, { "epoch": 15.368768570485308, "grad_norm": 334.466552734375, "learning_rate": 0.0003942635523311048, "loss": 36.4968, "step": 5819 }, { "epoch": 15.371409706173655, "grad_norm": 298.88372802734375, "learning_rate": 0.0003942294312068252, "loss": 36.6876, "step": 5820 }, { "epoch": 15.374050841862001, "grad_norm": 669.4081420898438, "learning_rate": 0.0003941953060550234, "loss": 37.0893, "step": 5821 }, { "epoch": 15.376691977550347, "grad_norm": 240.63040161132812, "learning_rate": 0.00039416117687665245, "loss": 37.4035, "step": 5822 }, { "epoch": 15.379333113238692, "grad_norm": 251.49598693847656, "learning_rate": 0.0003941270436726653, "loss": 36.3802, "step": 5823 }, { "epoch": 15.381974248927039, "grad_norm": 402.427734375, "learning_rate": 0.0003940929064440151, "loss": 37.3054, "step": 5824 }, { "epoch": 15.384615384615385, "grad_norm": 375.7248229980469, "learning_rate": 0.0003940587651916551, "loss": 36.2948, "step": 5825 }, { "epoch": 15.38725652030373, "grad_norm": 397.8468017578125, "learning_rate": 0.0003940246199165387, "loss": 37.3006, "step": 5826 }, { "epoch": 15.389897655992076, "grad_norm": 432.4656677246094, "learning_rate": 0.00039399047061961943, "loss": 37.7686, "step": 5827 }, { "epoch": 15.392538791680423, "grad_norm": 289.2859802246094, "learning_rate": 0.0003939563173018509, "loss": 39.6336, "step": 5828 }, { "epoch": 15.395179927368769, "grad_norm": 473.8230285644531, "learning_rate": 0.0003939221599641867, "loss": 38.996, "step": 5829 }, { "epoch": 15.397821063057114, "grad_norm": 502.8184509277344, "learning_rate": 0.00039388799860758073, "loss": 41.7424, "step": 5830 }, { "epoch": 15.40046219874546, "grad_norm": 269.4997253417969, "learning_rate": 0.00039385383323298695, "loss": 41.9871, "step": 5831 }, { "epoch": 15.403103334433807, "grad_norm": 356.4634704589844, "learning_rate": 0.00039381966384135944, "loss": 42.1393, "step": 5832 }, { "epoch": 15.405744470122153, "grad_norm": 291.73028564453125, "learning_rate": 0.0003937854904336522, "loss": 42.1091, "step": 5833 }, { "epoch": 15.408385605810498, "grad_norm": 257.0148620605469, "learning_rate": 0.0003937513130108197, "loss": 44.6726, "step": 5834 }, { "epoch": 15.411026741498844, "grad_norm": 716.44921875, "learning_rate": 0.0003937171315738161, "loss": 45.1758, "step": 5835 }, { "epoch": 15.41366787718719, "grad_norm": 190.1684112548828, "learning_rate": 0.0003936829461235961, "loss": 44.5064, "step": 5836 }, { "epoch": 15.416309012875537, "grad_norm": 212.50962829589844, "learning_rate": 0.0003936487566611142, "loss": 42.5818, "step": 5837 }, { "epoch": 15.418950148563882, "grad_norm": 200.6001434326172, "learning_rate": 0.00039361456318732526, "loss": 41.0769, "step": 5838 }, { "epoch": 15.421591284252228, "grad_norm": 292.9892883300781, "learning_rate": 0.0003935803657031839, "loss": 42.0304, "step": 5839 }, { "epoch": 15.424232419940575, "grad_norm": 264.9844055175781, "learning_rate": 0.00039354616420964517, "loss": 41.0243, "step": 5840 }, { "epoch": 15.426873555628921, "grad_norm": 181.51255798339844, "learning_rate": 0.00039351195870766423, "loss": 39.8118, "step": 5841 }, { "epoch": 15.429514691317266, "grad_norm": 221.79957580566406, "learning_rate": 0.00039347774919819604, "loss": 38.8518, "step": 5842 }, { "epoch": 15.432155827005612, "grad_norm": 169.3321075439453, "learning_rate": 0.000393443535682196, "loss": 39.0914, "step": 5843 }, { "epoch": 15.434796962693959, "grad_norm": 137.4297332763672, "learning_rate": 0.00039340931816061954, "loss": 36.7564, "step": 5844 }, { "epoch": 15.437438098382305, "grad_norm": 941.8692626953125, "learning_rate": 0.0003933750966344221, "loss": 37.1039, "step": 5845 }, { "epoch": 15.44007923407065, "grad_norm": 217.33168029785156, "learning_rate": 0.0003933408711045593, "loss": 36.2486, "step": 5846 }, { "epoch": 15.442720369758996, "grad_norm": 173.24111938476562, "learning_rate": 0.0003933066415719869, "loss": 35.7888, "step": 5847 }, { "epoch": 15.445361505447343, "grad_norm": 162.51055908203125, "learning_rate": 0.0003932724080376607, "loss": 35.9873, "step": 5848 }, { "epoch": 15.448002641135687, "grad_norm": 186.6444091796875, "learning_rate": 0.00039323817050253664, "loss": 37.1861, "step": 5849 }, { "epoch": 15.450643776824034, "grad_norm": 125.30313110351562, "learning_rate": 0.00039320392896757085, "loss": 36.3074, "step": 5850 }, { "epoch": 15.45328491251238, "grad_norm": 192.0481719970703, "learning_rate": 0.0003931696834337194, "loss": 37.8103, "step": 5851 }, { "epoch": 15.455926048200727, "grad_norm": 1775.52099609375, "learning_rate": 0.0003931354339019387, "loss": 77.1407, "step": 5852 }, { "epoch": 15.458567183889071, "grad_norm": 1610.1741943359375, "learning_rate": 0.0003931011803731851, "loss": 72.2542, "step": 5853 }, { "epoch": 15.461208319577418, "grad_norm": 2080.427490234375, "learning_rate": 0.00039306692284841506, "loss": 63.2407, "step": 5854 }, { "epoch": 15.463849455265764, "grad_norm": 2345.903076171875, "learning_rate": 0.0003930326613285853, "loss": 61.7077, "step": 5855 }, { "epoch": 15.46649059095411, "grad_norm": 2460.23095703125, "learning_rate": 0.00039299839581465246, "loss": 48.5464, "step": 5856 }, { "epoch": 15.469131726642456, "grad_norm": 2471.86572265625, "learning_rate": 0.0003929641263075734, "loss": 45.5047, "step": 5857 }, { "epoch": 15.471772862330802, "grad_norm": 1449.711669921875, "learning_rate": 0.00039292985280830507, "loss": 41.6513, "step": 5858 }, { "epoch": 15.474413998019148, "grad_norm": 1490.361083984375, "learning_rate": 0.0003928955753178046, "loss": 29.9119, "step": 5859 }, { "epoch": 15.477055133707495, "grad_norm": 4322.4990234375, "learning_rate": 0.0003928612938370292, "loss": 27.3607, "step": 5860 }, { "epoch": 15.47969626939584, "grad_norm": 525.0249633789062, "learning_rate": 0.000392827008366936, "loss": 20.1405, "step": 5861 }, { "epoch": 15.482337405084186, "grad_norm": 230.1781768798828, "learning_rate": 0.0003927927189084825, "loss": 35.586, "step": 5862 }, { "epoch": 15.484978540772532, "grad_norm": 205.08143615722656, "learning_rate": 0.00039275842546262617, "loss": 39.0851, "step": 5863 }, { "epoch": 15.487619676460879, "grad_norm": 215.2645263671875, "learning_rate": 0.00039272412803032473, "loss": 39.6429, "step": 5864 }, { "epoch": 15.490260812149224, "grad_norm": 428.3683776855469, "learning_rate": 0.0003926898266125359, "loss": 41.1571, "step": 5865 }, { "epoch": 15.49290194783757, "grad_norm": 221.53775024414062, "learning_rate": 0.0003926555212102174, "loss": 39.7147, "step": 5866 }, { "epoch": 15.495543083525916, "grad_norm": 528.591552734375, "learning_rate": 0.0003926212118243273, "loss": 38.6401, "step": 5867 }, { "epoch": 15.498184219214263, "grad_norm": 389.65869140625, "learning_rate": 0.0003925868984558236, "loss": 37.3564, "step": 5868 }, { "epoch": 15.500825354902608, "grad_norm": 307.4845275878906, "learning_rate": 0.0003925525811056646, "loss": 37.9717, "step": 5869 }, { "epoch": 15.503466490590954, "grad_norm": 191.8729248046875, "learning_rate": 0.0003925182597748085, "loss": 37.4324, "step": 5870 }, { "epoch": 15.5061076262793, "grad_norm": 265.4689025878906, "learning_rate": 0.0003924839344642137, "loss": 37.7982, "step": 5871 }, { "epoch": 15.508748761967645, "grad_norm": 214.8079376220703, "learning_rate": 0.0003924496051748387, "loss": 38.8009, "step": 5872 }, { "epoch": 15.511389897655992, "grad_norm": 232.58505249023438, "learning_rate": 0.00039241527190764214, "loss": 37.6639, "step": 5873 }, { "epoch": 15.514031033344338, "grad_norm": 312.53509521484375, "learning_rate": 0.0003923809346635828, "loss": 36.8747, "step": 5874 }, { "epoch": 15.516672169032685, "grad_norm": 285.5728759765625, "learning_rate": 0.0003923465934436195, "loss": 36.2466, "step": 5875 }, { "epoch": 15.51931330472103, "grad_norm": 316.0765686035156, "learning_rate": 0.00039231224824871127, "loss": 35.7914, "step": 5876 }, { "epoch": 15.521954440409376, "grad_norm": 444.10791015625, "learning_rate": 0.00039227789907981696, "loss": 37.5659, "step": 5877 }, { "epoch": 15.524595576097722, "grad_norm": 404.53900146484375, "learning_rate": 0.00039224354593789604, "loss": 37.5586, "step": 5878 }, { "epoch": 15.527236711786069, "grad_norm": 2372.5908203125, "learning_rate": 0.00039220918882390754, "loss": 40.4274, "step": 5879 }, { "epoch": 15.529877847474413, "grad_norm": 508.9284362792969, "learning_rate": 0.0003921748277388111, "loss": 43.8213, "step": 5880 }, { "epoch": 15.53251898316276, "grad_norm": 219.59559631347656, "learning_rate": 0.000392140462683566, "loss": 42.1287, "step": 5881 }, { "epoch": 15.535160118851106, "grad_norm": 436.4244384765625, "learning_rate": 0.000392106093659132, "loss": 43.5374, "step": 5882 }, { "epoch": 15.537801254539453, "grad_norm": 274.94476318359375, "learning_rate": 0.0003920717206664689, "loss": 43.1906, "step": 5883 }, { "epoch": 15.540442390227797, "grad_norm": 251.96401977539062, "learning_rate": 0.0003920373437065363, "loss": 46.2362, "step": 5884 }, { "epoch": 15.543083525916144, "grad_norm": 146.18862915039062, "learning_rate": 0.00039200296278029447, "loss": 43.5802, "step": 5885 }, { "epoch": 15.54572466160449, "grad_norm": 267.5779113769531, "learning_rate": 0.0003919685778887032, "loss": 45.3435, "step": 5886 }, { "epoch": 15.548365797292837, "grad_norm": 236.4609832763672, "learning_rate": 0.0003919341890327229, "loss": 42.3331, "step": 5887 }, { "epoch": 15.551006932981181, "grad_norm": 210.97923278808594, "learning_rate": 0.0003918997962133137, "loss": 43.5984, "step": 5888 }, { "epoch": 15.553648068669528, "grad_norm": 272.0746765136719, "learning_rate": 0.00039186539943143604, "loss": 43.1025, "step": 5889 }, { "epoch": 15.556289204357874, "grad_norm": 239.1708984375, "learning_rate": 0.00039183099868805046, "loss": 39.8644, "step": 5890 }, { "epoch": 15.55893034004622, "grad_norm": 247.54827880859375, "learning_rate": 0.0003917965939841175, "loss": 39.5879, "step": 5891 }, { "epoch": 15.561571475734565, "grad_norm": 253.3896026611328, "learning_rate": 0.00039176218532059804, "loss": 40.643, "step": 5892 }, { "epoch": 15.564212611422912, "grad_norm": 168.2385711669922, "learning_rate": 0.0003917277726984527, "loss": 38.2791, "step": 5893 }, { "epoch": 15.566853747111258, "grad_norm": 184.45115661621094, "learning_rate": 0.00039169335611864264, "loss": 36.838, "step": 5894 }, { "epoch": 15.569494882799603, "grad_norm": 286.5032043457031, "learning_rate": 0.00039165893558212883, "loss": 39.0817, "step": 5895 }, { "epoch": 15.57213601848795, "grad_norm": 263.21075439453125, "learning_rate": 0.0003916245110898725, "loss": 39.8258, "step": 5896 }, { "epoch": 15.574777154176296, "grad_norm": 190.00355529785156, "learning_rate": 0.0003915900826428349, "loss": 37.1678, "step": 5897 }, { "epoch": 15.577418289864642, "grad_norm": 192.7726287841797, "learning_rate": 0.00039155565024197737, "loss": 36.6738, "step": 5898 }, { "epoch": 15.580059425552989, "grad_norm": 170.6693572998047, "learning_rate": 0.00039152121388826147, "loss": 36.2195, "step": 5899 }, { "epoch": 15.582700561241333, "grad_norm": 208.76296997070312, "learning_rate": 0.0003914867735826488, "loss": 37.0038, "step": 5900 }, { "epoch": 15.58534169692968, "grad_norm": 424.1411437988281, "learning_rate": 0.0003914523293261011, "loss": 38.0963, "step": 5901 }, { "epoch": 15.587982832618026, "grad_norm": 1787.3555908203125, "learning_rate": 0.0003914178811195802, "loss": 70.3139, "step": 5902 }, { "epoch": 15.590623968306371, "grad_norm": 1819.96240234375, "learning_rate": 0.000391383428964048, "loss": 85.5022, "step": 5903 }, { "epoch": 15.593265103994717, "grad_norm": 2399.45556640625, "learning_rate": 0.0003913489728604667, "loss": 76.5886, "step": 5904 }, { "epoch": 15.595906239683064, "grad_norm": 2001.1885986328125, "learning_rate": 0.0003913145128097983, "loss": 72.7975, "step": 5905 }, { "epoch": 15.59854737537141, "grad_norm": 1611.8236083984375, "learning_rate": 0.00039128004881300516, "loss": 70.7394, "step": 5906 }, { "epoch": 15.601188511059755, "grad_norm": 2675.3134765625, "learning_rate": 0.0003912455808710497, "loss": 69.6132, "step": 5907 }, { "epoch": 15.603829646748101, "grad_norm": 1945.6788330078125, "learning_rate": 0.0003912111089848942, "loss": 61.9651, "step": 5908 }, { "epoch": 15.606470782436448, "grad_norm": 2922.8603515625, "learning_rate": 0.0003911766331555016, "loss": 61.3647, "step": 5909 }, { "epoch": 15.609111918124794, "grad_norm": 2948.83935546875, "learning_rate": 0.00039114215338383436, "loss": 51.5844, "step": 5910 }, { "epoch": 15.611753053813139, "grad_norm": 1422.198486328125, "learning_rate": 0.0003911076696708554, "loss": 39.4507, "step": 5911 }, { "epoch": 15.614394189501485, "grad_norm": 469.3312072753906, "learning_rate": 0.0003910731820175277, "loss": 39.0691, "step": 5912 }, { "epoch": 15.617035325189832, "grad_norm": 269.08197021484375, "learning_rate": 0.0003910386904248143, "loss": 37.8307, "step": 5913 }, { "epoch": 15.619676460878178, "grad_norm": 326.876953125, "learning_rate": 0.0003910041948936782, "loss": 37.3319, "step": 5914 }, { "epoch": 15.622317596566523, "grad_norm": 262.2302551269531, "learning_rate": 0.0003909696954250829, "loss": 36.8429, "step": 5915 }, { "epoch": 15.62495873225487, "grad_norm": 204.3318328857422, "learning_rate": 0.0003909351920199916, "loss": 36.4202, "step": 5916 }, { "epoch": 15.627599867943216, "grad_norm": 221.2900848388672, "learning_rate": 0.0003909006846793679, "loss": 36.48, "step": 5917 }, { "epoch": 15.63024100363156, "grad_norm": 264.78790283203125, "learning_rate": 0.0003908661734041753, "loss": 37.6644, "step": 5918 }, { "epoch": 15.632882139319907, "grad_norm": 294.24169921875, "learning_rate": 0.0003908316581953776, "loss": 37.4987, "step": 5919 }, { "epoch": 15.635523275008254, "grad_norm": 236.41226196289062, "learning_rate": 0.0003907971390539385, "loss": 36.5337, "step": 5920 }, { "epoch": 15.6381644106966, "grad_norm": 545.6978759765625, "learning_rate": 0.0003907626159808221, "loss": 37.123, "step": 5921 }, { "epoch": 15.640805546384946, "grad_norm": 491.5599670410156, "learning_rate": 0.00039072808897699224, "loss": 37.3116, "step": 5922 }, { "epoch": 15.643446682073291, "grad_norm": 291.431396484375, "learning_rate": 0.00039069355804341325, "loss": 38.2451, "step": 5923 }, { "epoch": 15.646087817761638, "grad_norm": 285.22467041015625, "learning_rate": 0.00039065902318104927, "loss": 37.6542, "step": 5924 }, { "epoch": 15.648728953449984, "grad_norm": 262.87310791015625, "learning_rate": 0.00039062448439086466, "loss": 35.6977, "step": 5925 }, { "epoch": 15.651370089138329, "grad_norm": 184.23829650878906, "learning_rate": 0.000390589941673824, "loss": 36.6123, "step": 5926 }, { "epoch": 15.654011224826675, "grad_norm": 555.0333251953125, "learning_rate": 0.0003905553950308918, "loss": 37.6582, "step": 5927 }, { "epoch": 15.656652360515022, "grad_norm": 462.8868408203125, "learning_rate": 0.00039052084446303264, "loss": 38.5711, "step": 5928 }, { "epoch": 15.659293496203368, "grad_norm": 1569.8035888671875, "learning_rate": 0.00039048628997121154, "loss": 41.3849, "step": 5929 }, { "epoch": 15.661934631891713, "grad_norm": 494.1920166015625, "learning_rate": 0.00039045173155639336, "loss": 42.7711, "step": 5930 }, { "epoch": 15.66457576758006, "grad_norm": 1056.4925537109375, "learning_rate": 0.00039041716921954295, "loss": 45.6359, "step": 5931 }, { "epoch": 15.667216903268406, "grad_norm": 2812.6318359375, "learning_rate": 0.0003903826029616257, "loss": 47.7511, "step": 5932 }, { "epoch": 15.669858038956752, "grad_norm": 779.846923828125, "learning_rate": 0.0003903480327836067, "loss": 47.2599, "step": 5933 }, { "epoch": 15.672499174645097, "grad_norm": 326.2760314941406, "learning_rate": 0.0003903134586864513, "loss": 48.2066, "step": 5934 }, { "epoch": 15.675140310333443, "grad_norm": 323.1400451660156, "learning_rate": 0.00039027888067112503, "loss": 47.6105, "step": 5935 }, { "epoch": 15.67778144602179, "grad_norm": 266.9851989746094, "learning_rate": 0.00039024429873859336, "loss": 46.088, "step": 5936 }, { "epoch": 15.680422581710136, "grad_norm": 296.970947265625, "learning_rate": 0.0003902097128898221, "loss": 45.0691, "step": 5937 }, { "epoch": 15.68306371739848, "grad_norm": 334.9802551269531, "learning_rate": 0.0003901751231257769, "loss": 42.4955, "step": 5938 }, { "epoch": 15.685704853086827, "grad_norm": 207.52369689941406, "learning_rate": 0.0003901405294474238, "loss": 41.2172, "step": 5939 }, { "epoch": 15.688345988775174, "grad_norm": 349.6665344238281, "learning_rate": 0.00039010593185572867, "loss": 41.3781, "step": 5940 }, { "epoch": 15.690987124463518, "grad_norm": 178.8085174560547, "learning_rate": 0.0003900713303516578, "loss": 40.051, "step": 5941 }, { "epoch": 15.693628260151865, "grad_norm": 335.8957214355469, "learning_rate": 0.0003900367249361772, "loss": 38.0687, "step": 5942 }, { "epoch": 15.696269395840211, "grad_norm": 243.22210693359375, "learning_rate": 0.00039000211561025333, "loss": 36.4687, "step": 5943 }, { "epoch": 15.698910531528558, "grad_norm": 407.8404846191406, "learning_rate": 0.00038996750237485267, "loss": 37.6708, "step": 5944 }, { "epoch": 15.701551667216904, "grad_norm": 291.9508972167969, "learning_rate": 0.0003899328852309417, "loss": 38.5426, "step": 5945 }, { "epoch": 15.704192802905249, "grad_norm": 230.8794403076172, "learning_rate": 0.00038989826417948716, "loss": 37.4486, "step": 5946 }, { "epoch": 15.706833938593595, "grad_norm": 249.56588745117188, "learning_rate": 0.0003898636392214557, "loss": 36.9457, "step": 5947 }, { "epoch": 15.709475074281942, "grad_norm": 562.0996704101562, "learning_rate": 0.0003898290103578144, "loss": 36.3476, "step": 5948 }, { "epoch": 15.712116209970286, "grad_norm": 219.3847198486328, "learning_rate": 0.00038979437758952996, "loss": 35.7188, "step": 5949 }, { "epoch": 15.714757345658633, "grad_norm": 377.1235656738281, "learning_rate": 0.0003897597409175697, "loss": 37.2062, "step": 5950 }, { "epoch": 15.71739848134698, "grad_norm": 1158.175048828125, "learning_rate": 0.00038972510034290087, "loss": 46.5618, "step": 5951 }, { "epoch": 15.720039617035326, "grad_norm": 17340.029296875, "learning_rate": 0.0003896904558664906, "loss": 80.0566, "step": 5952 }, { "epoch": 15.72268075272367, "grad_norm": 3361.14404296875, "learning_rate": 0.00038965580748930647, "loss": 87.1237, "step": 5953 }, { "epoch": 15.725321888412017, "grad_norm": 4008.3056640625, "learning_rate": 0.0003896211552123159, "loss": 70.1709, "step": 5954 }, { "epoch": 15.727963024100363, "grad_norm": 3162.51708984375, "learning_rate": 0.0003895864990364866, "loss": 77.7829, "step": 5955 }, { "epoch": 15.73060415978871, "grad_norm": 3409.64453125, "learning_rate": 0.0003895518389627863, "loss": 73.9501, "step": 5956 }, { "epoch": 15.733245295477055, "grad_norm": 3029.969970703125, "learning_rate": 0.0003895171749921829, "loss": 77.7155, "step": 5957 }, { "epoch": 15.735886431165401, "grad_norm": 3025.46728515625, "learning_rate": 0.0003894825071256443, "loss": 67.9432, "step": 5958 }, { "epoch": 15.738527566853747, "grad_norm": 1619.812744140625, "learning_rate": 0.00038944783536413866, "loss": 55.3265, "step": 5959 }, { "epoch": 15.741168702542094, "grad_norm": 1519.792236328125, "learning_rate": 0.00038941315970863414, "loss": 46.5052, "step": 5960 }, { "epoch": 15.743809838230439, "grad_norm": 2151.091064453125, "learning_rate": 0.00038937848016009903, "loss": 40.8123, "step": 5961 }, { "epoch": 15.746450973918785, "grad_norm": 345.328369140625, "learning_rate": 0.00038934379671950174, "loss": 38.1451, "step": 5962 }, { "epoch": 15.749092109607131, "grad_norm": 923.52783203125, "learning_rate": 0.0003893091093878107, "loss": 36.7235, "step": 5963 }, { "epoch": 15.751733245295476, "grad_norm": 377.4470520019531, "learning_rate": 0.0003892744181659947, "loss": 37.3949, "step": 5964 }, { "epoch": 15.754374380983823, "grad_norm": 387.6208190917969, "learning_rate": 0.0003892397230550224, "loss": 37.2351, "step": 5965 }, { "epoch": 15.757015516672169, "grad_norm": 347.6411437988281, "learning_rate": 0.0003892050240558626, "loss": 38.689, "step": 5966 }, { "epoch": 15.759656652360515, "grad_norm": 409.914306640625, "learning_rate": 0.00038917032116948433, "loss": 36.972, "step": 5967 }, { "epoch": 15.762297788048862, "grad_norm": 434.9236145019531, "learning_rate": 0.0003891356143968565, "loss": 36.588, "step": 5968 }, { "epoch": 15.764938923737207, "grad_norm": 392.5713806152344, "learning_rate": 0.00038910090373894834, "loss": 36.617, "step": 5969 }, { "epoch": 15.767580059425553, "grad_norm": 430.90972900390625, "learning_rate": 0.00038906618919672933, "loss": 36.7449, "step": 5970 }, { "epoch": 15.7702211951139, "grad_norm": 455.78564453125, "learning_rate": 0.0003890314707711685, "loss": 36.971, "step": 5971 }, { "epoch": 15.772862330802244, "grad_norm": 556.0492553710938, "learning_rate": 0.00038899674846323563, "loss": 37.338, "step": 5972 }, { "epoch": 15.77550346649059, "grad_norm": 630.1760864257812, "learning_rate": 0.0003889620222739002, "loss": 37.0806, "step": 5973 }, { "epoch": 15.778144602178937, "grad_norm": 584.86474609375, "learning_rate": 0.00038892729220413194, "loss": 37.4716, "step": 5974 }, { "epoch": 15.780785737867284, "grad_norm": 251.3419647216797, "learning_rate": 0.00038889255825490053, "loss": 35.0014, "step": 5975 }, { "epoch": 15.783426873555628, "grad_norm": 442.20892333984375, "learning_rate": 0.00038885782042717617, "loss": 37.0474, "step": 5976 }, { "epoch": 15.786068009243975, "grad_norm": 539.0763549804688, "learning_rate": 0.00038882307872192866, "loss": 38.7165, "step": 5977 }, { "epoch": 15.788709144932321, "grad_norm": 286.00421142578125, "learning_rate": 0.0003887883331401282, "loss": 38.275, "step": 5978 }, { "epoch": 15.791350280620668, "grad_norm": 1556.056396484375, "learning_rate": 0.0003887535836827452, "loss": 43.6234, "step": 5979 }, { "epoch": 15.793991416309012, "grad_norm": 365.59954833984375, "learning_rate": 0.0003887188303507497, "loss": 44.0737, "step": 5980 }, { "epoch": 15.796632551997359, "grad_norm": 238.61131286621094, "learning_rate": 0.00038868407314511256, "loss": 41.8766, "step": 5981 }, { "epoch": 15.799273687685705, "grad_norm": 333.3055419921875, "learning_rate": 0.000388649312066804, "loss": 42.9036, "step": 5982 }, { "epoch": 15.801914823374052, "grad_norm": 293.0674743652344, "learning_rate": 0.00038861454711679487, "loss": 42.8799, "step": 5983 }, { "epoch": 15.804555959062396, "grad_norm": 229.90773010253906, "learning_rate": 0.000388579778296056, "loss": 44.238, "step": 5984 }, { "epoch": 15.807197094750743, "grad_norm": 226.64857482910156, "learning_rate": 0.0003885450056055582, "loss": 41.758, "step": 5985 }, { "epoch": 15.80983823043909, "grad_norm": 251.21945190429688, "learning_rate": 0.0003885102290462724, "loss": 41.295, "step": 5986 }, { "epoch": 15.812479366127434, "grad_norm": 195.13475036621094, "learning_rate": 0.00038847544861916997, "loss": 42.2569, "step": 5987 }, { "epoch": 15.81512050181578, "grad_norm": 239.376953125, "learning_rate": 0.0003884406643252219, "loss": 43.2159, "step": 5988 }, { "epoch": 15.817761637504127, "grad_norm": 745.1929321289062, "learning_rate": 0.0003884058761653996, "loss": 42.9007, "step": 5989 }, { "epoch": 15.820402773192473, "grad_norm": 229.24525451660156, "learning_rate": 0.0003883710841406745, "loss": 40.0579, "step": 5990 }, { "epoch": 15.82304390888082, "grad_norm": 366.5471496582031, "learning_rate": 0.0003883362882520182, "loss": 38.5513, "step": 5991 }, { "epoch": 15.825685044569164, "grad_norm": 334.67596435546875, "learning_rate": 0.0003883014885004023, "loss": 40.4589, "step": 5992 }, { "epoch": 15.82832618025751, "grad_norm": 464.9457092285156, "learning_rate": 0.0003882666848867986, "loss": 38.4974, "step": 5993 }, { "epoch": 15.830967315945857, "grad_norm": 400.67431640625, "learning_rate": 0.00038823187741217894, "loss": 37.9554, "step": 5994 }, { "epoch": 15.833608451634202, "grad_norm": 309.5360107421875, "learning_rate": 0.00038819706607751527, "loss": 37.0209, "step": 5995 }, { "epoch": 15.836249587322548, "grad_norm": 233.43809509277344, "learning_rate": 0.0003881622508837798, "loss": 36.0544, "step": 5996 }, { "epoch": 15.838890723010895, "grad_norm": 333.2075500488281, "learning_rate": 0.0003881274318319445, "loss": 37.7272, "step": 5997 }, { "epoch": 15.841531858699241, "grad_norm": 392.83416748046875, "learning_rate": 0.00038809260892298184, "loss": 37.4385, "step": 5998 }, { "epoch": 15.844172994387586, "grad_norm": 403.45001220703125, "learning_rate": 0.0003880577821578642, "loss": 38.0839, "step": 5999 }, { "epoch": 15.846814130075932, "grad_norm": 374.94415283203125, "learning_rate": 0.00038802295153756415, "loss": 36.6136, "step": 6000 }, { "epoch": 15.846814130075932, "eval_loss": 5.488409519195557, "eval_runtime": 2.2074, "eval_samples_per_second": 224.242, "eval_steps_per_second": 28.087, "step": 6000 }, { "epoch": 15.849455265764279, "grad_norm": 525.3743896484375, "learning_rate": 0.00038798811706305426, "loss": 37.8304, "step": 6001 }, { "epoch": 15.852096401452625, "grad_norm": 918.5164794921875, "learning_rate": 0.00038795327873530726, "loss": 49.0353, "step": 6002 }, { "epoch": 15.85473753714097, "grad_norm": 5682.072265625, "learning_rate": 0.0003879184365552959, "loss": 82.771, "step": 6003 }, { "epoch": 15.857378672829316, "grad_norm": 1690.3900146484375, "learning_rate": 0.00038788359052399326, "loss": 76.2467, "step": 6004 }, { "epoch": 15.860019808517663, "grad_norm": 2026.156982421875, "learning_rate": 0.0003878487406423724, "loss": 79.139, "step": 6005 }, { "epoch": 15.86266094420601, "grad_norm": 7261.6611328125, "learning_rate": 0.0003878138869114064, "loss": 71.0807, "step": 6006 }, { "epoch": 15.865302079894354, "grad_norm": 8433.8720703125, "learning_rate": 0.00038777902933206855, "loss": 73.0884, "step": 6007 }, { "epoch": 15.8679432155827, "grad_norm": 2583.523681640625, "learning_rate": 0.0003877441679053323, "loss": 65.3219, "step": 6008 }, { "epoch": 15.870584351271047, "grad_norm": 3303.448486328125, "learning_rate": 0.000387709302632171, "loss": 60.6788, "step": 6009 }, { "epoch": 15.873225486959392, "grad_norm": 1439.0079345703125, "learning_rate": 0.00038767443351355835, "loss": 65.6599, "step": 6010 }, { "epoch": 15.875866622647738, "grad_norm": 10417.427734375, "learning_rate": 0.00038763956055046803, "loss": 53.9467, "step": 6011 }, { "epoch": 15.878507758336085, "grad_norm": 536.7849731445312, "learning_rate": 0.00038760468374387385, "loss": 39.1425, "step": 6012 }, { "epoch": 15.881148894024431, "grad_norm": 685.2000732421875, "learning_rate": 0.00038756980309474965, "loss": 36.7035, "step": 6013 }, { "epoch": 15.883790029712777, "grad_norm": 1181.9879150390625, "learning_rate": 0.0003875349186040696, "loss": 38.0972, "step": 6014 }, { "epoch": 15.886431165401122, "grad_norm": 707.9356079101562, "learning_rate": 0.0003875000302728077, "loss": 38.6492, "step": 6015 }, { "epoch": 15.889072301089469, "grad_norm": 490.330322265625, "learning_rate": 0.0003874651381019382, "loss": 37.6716, "step": 6016 }, { "epoch": 15.891713436777815, "grad_norm": 496.066650390625, "learning_rate": 0.0003874302420924355, "loss": 37.7742, "step": 6017 }, { "epoch": 15.89435457246616, "grad_norm": 531.6822509765625, "learning_rate": 0.00038739534224527396, "loss": 36.8669, "step": 6018 }, { "epoch": 15.896995708154506, "grad_norm": 795.3026733398438, "learning_rate": 0.0003873604385614282, "loss": 38.3809, "step": 6019 }, { "epoch": 15.899636843842853, "grad_norm": 935.2015380859375, "learning_rate": 0.00038732553104187296, "loss": 37.4977, "step": 6020 }, { "epoch": 15.902277979531199, "grad_norm": 416.1978454589844, "learning_rate": 0.00038729061968758297, "loss": 37.5368, "step": 6021 }, { "epoch": 15.904919115219544, "grad_norm": 790.9063720703125, "learning_rate": 0.00038725570449953296, "loss": 36.9183, "step": 6022 }, { "epoch": 15.90756025090789, "grad_norm": 717.6805419921875, "learning_rate": 0.00038722078547869806, "loss": 36.9005, "step": 6023 }, { "epoch": 15.910201386596237, "grad_norm": 825.4923706054688, "learning_rate": 0.0003871858626260534, "loss": 36.7063, "step": 6024 }, { "epoch": 15.912842522284583, "grad_norm": 888.4327392578125, "learning_rate": 0.000387150935942574, "loss": 37.3034, "step": 6025 }, { "epoch": 15.915483657972928, "grad_norm": 714.7401733398438, "learning_rate": 0.00038711600542923536, "loss": 35.8961, "step": 6026 }, { "epoch": 15.918124793661274, "grad_norm": 1115.4349365234375, "learning_rate": 0.0003870810710870128, "loss": 35.8471, "step": 6027 }, { "epoch": 15.92076592934962, "grad_norm": 852.2251586914062, "learning_rate": 0.00038704613291688187, "loss": 39.0466, "step": 6028 }, { "epoch": 15.923407065037967, "grad_norm": 1256.7421875, "learning_rate": 0.0003870111909198182, "loss": 40.2743, "step": 6029 }, { "epoch": 15.926048200726312, "grad_norm": 489.7148742675781, "learning_rate": 0.00038697624509679743, "loss": 41.1957, "step": 6030 }, { "epoch": 15.928689336414658, "grad_norm": 561.5980834960938, "learning_rate": 0.00038694129544879555, "loss": 42.2161, "step": 6031 }, { "epoch": 15.931330472103005, "grad_norm": 427.4500427246094, "learning_rate": 0.00038690634197678837, "loss": 42.8536, "step": 6032 }, { "epoch": 15.93397160779135, "grad_norm": 624.7421875, "learning_rate": 0.0003868713846817521, "loss": 45.4797, "step": 6033 }, { "epoch": 15.936612743479696, "grad_norm": 479.14208984375, "learning_rate": 0.0003868364235646628, "loss": 43.9177, "step": 6034 }, { "epoch": 15.939253879168042, "grad_norm": 738.6083374023438, "learning_rate": 0.0003868014586264967, "loss": 43.2301, "step": 6035 }, { "epoch": 15.941895014856389, "grad_norm": 582.1879272460938, "learning_rate": 0.00038676648986823027, "loss": 43.0828, "step": 6036 }, { "epoch": 15.944536150544735, "grad_norm": 452.748779296875, "learning_rate": 0.0003867315172908399, "loss": 39.591, "step": 6037 }, { "epoch": 15.94717728623308, "grad_norm": 455.0257263183594, "learning_rate": 0.00038669654089530225, "loss": 38.4908, "step": 6038 }, { "epoch": 15.949818421921426, "grad_norm": 436.2026062011719, "learning_rate": 0.000386661560682594, "loss": 37.284, "step": 6039 }, { "epoch": 15.952459557609773, "grad_norm": 871.1309204101562, "learning_rate": 0.000386626576653692, "loss": 36.7402, "step": 6040 }, { "epoch": 15.955100693298117, "grad_norm": 891.765869140625, "learning_rate": 0.000386591588809573, "loss": 37.7729, "step": 6041 }, { "epoch": 15.957741828986464, "grad_norm": 650.9501342773438, "learning_rate": 0.00038655659715121425, "loss": 36.7314, "step": 6042 }, { "epoch": 15.96038296467481, "grad_norm": 2686.6982421875, "learning_rate": 0.00038652160167959267, "loss": 53.874, "step": 6043 }, { "epoch": 15.963024100363157, "grad_norm": 3791.36181640625, "learning_rate": 0.00038648660239568554, "loss": 30.6452, "step": 6044 }, { "epoch": 15.965665236051501, "grad_norm": 9320.9287109375, "learning_rate": 0.00038645159930047026, "loss": 31.1108, "step": 6045 }, { "epoch": 15.968306371739848, "grad_norm": 1675.0953369140625, "learning_rate": 0.00038641659239492423, "loss": 33.4593, "step": 6046 }, { "epoch": 15.970947507428194, "grad_norm": 4006.00634765625, "learning_rate": 0.000386381581680025, "loss": 28.7187, "step": 6047 }, { "epoch": 15.97358864311654, "grad_norm": 5452.775390625, "learning_rate": 0.0003863465671567501, "loss": 38.7315, "step": 6048 }, { "epoch": 15.976229778804885, "grad_norm": 1016.141357421875, "learning_rate": 0.00038631154882607754, "loss": 37.2223, "step": 6049 }, { "epoch": 15.978870914493232, "grad_norm": 863.2342529296875, "learning_rate": 0.000386276526688985, "loss": 37.6121, "step": 6050 }, { "epoch": 15.981512050181578, "grad_norm": 825.256591796875, "learning_rate": 0.0003862415007464505, "loss": 36.8834, "step": 6051 }, { "epoch": 15.984153185869925, "grad_norm": 642.4773559570312, "learning_rate": 0.00038620647099945216, "loss": 38.8221, "step": 6052 }, { "epoch": 15.98679432155827, "grad_norm": 606.3431396484375, "learning_rate": 0.0003861714374489681, "loss": 38.1676, "step": 6053 }, { "epoch": 15.989435457246616, "grad_norm": 924.4703369140625, "learning_rate": 0.0003861364000959766, "loss": 39.4095, "step": 6054 }, { "epoch": 15.992076592934962, "grad_norm": 1492.258056640625, "learning_rate": 0.0003861013589414562, "loss": 39.9968, "step": 6055 }, { "epoch": 15.994717728623307, "grad_norm": 1041.8922119140625, "learning_rate": 0.00038606631398638525, "loss": 37.6665, "step": 6056 }, { "epoch": 15.997358864311654, "grad_norm": 1254.2396240234375, "learning_rate": 0.0003860312652317424, "loss": 40.2544, "step": 6057 }, { "epoch": 16.0, "grad_norm": 4001.4287109375, "learning_rate": 0.00038599621267850637, "loss": 45.5061, "step": 6058 }, { "epoch": 16.002641135688346, "grad_norm": 914.1931762695312, "learning_rate": 0.00038596115632765605, "loss": 42.1865, "step": 6059 }, { "epoch": 16.005282271376693, "grad_norm": 843.1884155273438, "learning_rate": 0.0003859260961801702, "loss": 43.7024, "step": 6060 }, { "epoch": 16.00792340706504, "grad_norm": 554.1885986328125, "learning_rate": 0.00038589103223702807, "loss": 43.8815, "step": 6061 }, { "epoch": 16.010564542753382, "grad_norm": 528.176513671875, "learning_rate": 0.00038585596449920863, "loss": 44.2739, "step": 6062 }, { "epoch": 16.01320567844173, "grad_norm": 796.7938232421875, "learning_rate": 0.0003858208929676912, "loss": 48.0707, "step": 6063 }, { "epoch": 16.015846814130075, "grad_norm": 737.465087890625, "learning_rate": 0.0003857858176434551, "loss": 46.6764, "step": 6064 }, { "epoch": 16.01848794981842, "grad_norm": 567.7354125976562, "learning_rate": 0.0003857507385274798, "loss": 48.7217, "step": 6065 }, { "epoch": 16.021129085506768, "grad_norm": 470.30609130859375, "learning_rate": 0.00038571565562074494, "loss": 46.445, "step": 6066 }, { "epoch": 16.023770221195115, "grad_norm": 550.4747924804688, "learning_rate": 0.0003856805689242301, "loss": 44.2664, "step": 6067 }, { "epoch": 16.02641135688346, "grad_norm": 425.3103332519531, "learning_rate": 0.00038564547843891506, "loss": 44.1815, "step": 6068 }, { "epoch": 16.029052492571807, "grad_norm": 508.21728515625, "learning_rate": 0.0003856103841657797, "loss": 42.3259, "step": 6069 }, { "epoch": 16.03169362826015, "grad_norm": 551.4007568359375, "learning_rate": 0.00038557528610580407, "loss": 42.8328, "step": 6070 }, { "epoch": 16.034334763948497, "grad_norm": 474.2994689941406, "learning_rate": 0.00038554018425996815, "loss": 42.117, "step": 6071 }, { "epoch": 16.036975899636843, "grad_norm": 606.6430053710938, "learning_rate": 0.0003855050786292522, "loss": 38.9614, "step": 6072 }, { "epoch": 16.03961703532519, "grad_norm": 758.2822875976562, "learning_rate": 0.0003854699692146366, "loss": 37.4666, "step": 6073 }, { "epoch": 16.042258171013536, "grad_norm": 317.616455078125, "learning_rate": 0.00038543485601710164, "loss": 39.4913, "step": 6074 }, { "epoch": 16.044899306701883, "grad_norm": 945.3771362304688, "learning_rate": 0.00038539973903762794, "loss": 38.5688, "step": 6075 }, { "epoch": 16.04754044239023, "grad_norm": 608.1839599609375, "learning_rate": 0.00038536461827719606, "loss": 38.4354, "step": 6076 }, { "epoch": 16.050181578078575, "grad_norm": 321.77484130859375, "learning_rate": 0.00038532949373678665, "loss": 37.9781, "step": 6077 }, { "epoch": 16.05282271376692, "grad_norm": 418.3764343261719, "learning_rate": 0.0003852943654173807, "loss": 38.8898, "step": 6078 }, { "epoch": 16.055463849455265, "grad_norm": 1297.48046875, "learning_rate": 0.00038525923331995905, "loss": 37.6484, "step": 6079 }, { "epoch": 16.05810498514361, "grad_norm": 2511.2255859375, "learning_rate": 0.0003852240974455028, "loss": 44.1107, "step": 6080 }, { "epoch": 16.060746120831958, "grad_norm": 2705.66064453125, "learning_rate": 0.00038518895779499296, "loss": 44.4275, "step": 6081 }, { "epoch": 16.063387256520304, "grad_norm": 1885.7579345703125, "learning_rate": 0.000385153814369411, "loss": 33.662, "step": 6082 }, { "epoch": 16.06602839220865, "grad_norm": 2868.667724609375, "learning_rate": 0.00038511866716973807, "loss": 29.989, "step": 6083 }, { "epoch": 16.068669527896997, "grad_norm": 1541.4122314453125, "learning_rate": 0.0003850835161969558, "loss": 28.532, "step": 6084 }, { "epoch": 16.07131066358534, "grad_norm": 4135.84521484375, "learning_rate": 0.0003850483614520457, "loss": 31.143, "step": 6085 }, { "epoch": 16.073951799273686, "grad_norm": 1152.7847900390625, "learning_rate": 0.0003850132029359893, "loss": 22.6401, "step": 6086 }, { "epoch": 16.076592934962033, "grad_norm": 6008.62158203125, "learning_rate": 0.0003849780406497687, "loss": 27.3929, "step": 6087 }, { "epoch": 16.07923407065038, "grad_norm": 9393.4560546875, "learning_rate": 0.00038494287459436556, "loss": 23.4229, "step": 6088 }, { "epoch": 16.081875206338726, "grad_norm": 593.1703491210938, "learning_rate": 0.0003849077047707619, "loss": 25.3753, "step": 6089 }, { "epoch": 16.084516342027072, "grad_norm": 2859.761962890625, "learning_rate": 0.0003848725311799398, "loss": 21.1245, "step": 6090 }, { "epoch": 16.08715747771542, "grad_norm": 530.6705322265625, "learning_rate": 0.00038483735382288155, "loss": 36.0622, "step": 6091 }, { "epoch": 16.089798613403765, "grad_norm": 196.35507202148438, "learning_rate": 0.0003848021727005694, "loss": 39.1235, "step": 6092 }, { "epoch": 16.092439749092108, "grad_norm": 263.4659118652344, "learning_rate": 0.00038476698781398565, "loss": 38.2926, "step": 6093 }, { "epoch": 16.095080884780455, "grad_norm": 404.733154296875, "learning_rate": 0.00038473179916411306, "loss": 38.4964, "step": 6094 }, { "epoch": 16.0977220204688, "grad_norm": 404.7265319824219, "learning_rate": 0.00038469660675193414, "loss": 37.4959, "step": 6095 }, { "epoch": 16.100363156157147, "grad_norm": 452.78228759765625, "learning_rate": 0.0003846614105784315, "loss": 38.2636, "step": 6096 }, { "epoch": 16.103004291845494, "grad_norm": 369.27642822265625, "learning_rate": 0.00038462621064458814, "loss": 38.8001, "step": 6097 }, { "epoch": 16.10564542753384, "grad_norm": 309.6981506347656, "learning_rate": 0.0003845910069513869, "loss": 36.913, "step": 6098 }, { "epoch": 16.108286563222187, "grad_norm": 372.00421142578125, "learning_rate": 0.0003845557994998109, "loss": 37.1089, "step": 6099 }, { "epoch": 16.110927698910533, "grad_norm": 376.42962646484375, "learning_rate": 0.0003845205882908432, "loss": 38.4477, "step": 6100 }, { "epoch": 16.113568834598876, "grad_norm": 528.1492309570312, "learning_rate": 0.00038448537332546706, "loss": 37.773, "step": 6101 }, { "epoch": 16.116209970287223, "grad_norm": 430.2724304199219, "learning_rate": 0.0003844501546046659, "loss": 36.8466, "step": 6102 }, { "epoch": 16.11885110597557, "grad_norm": 460.5201416015625, "learning_rate": 0.00038441493212942326, "loss": 37.3125, "step": 6103 }, { "epoch": 16.121492241663915, "grad_norm": 966.8856201171875, "learning_rate": 0.00038437970590072244, "loss": 37.0226, "step": 6104 }, { "epoch": 16.124133377352262, "grad_norm": 581.806640625, "learning_rate": 0.0003843444759195474, "loss": 36.3699, "step": 6105 }, { "epoch": 16.12677451304061, "grad_norm": 622.403564453125, "learning_rate": 0.00038430924218688166, "loss": 37.6064, "step": 6106 }, { "epoch": 16.129415648728955, "grad_norm": 993.8897705078125, "learning_rate": 0.00038427400470370925, "loss": 39.3296, "step": 6107 }, { "epoch": 16.132056784417298, "grad_norm": 517.66357421875, "learning_rate": 0.00038423876347101424, "loss": 43.39, "step": 6108 }, { "epoch": 16.134697920105644, "grad_norm": 1074.2681884765625, "learning_rate": 0.0003842035184897805, "loss": 42.0338, "step": 6109 }, { "epoch": 16.13733905579399, "grad_norm": 493.24169921875, "learning_rate": 0.0003841682697609925, "loss": 42.0177, "step": 6110 }, { "epoch": 16.139980191482337, "grad_norm": 405.9346618652344, "learning_rate": 0.00038413301728563423, "loss": 41.0129, "step": 6111 }, { "epoch": 16.142621327170684, "grad_norm": 308.3611755371094, "learning_rate": 0.00038409776106469024, "loss": 41.8148, "step": 6112 }, { "epoch": 16.14526246285903, "grad_norm": 428.06549072265625, "learning_rate": 0.0003840625010991452, "loss": 42.375, "step": 6113 }, { "epoch": 16.147903598547376, "grad_norm": 308.8721923828125, "learning_rate": 0.00038402723738998343, "loss": 43.9363, "step": 6114 }, { "epoch": 16.150544734235723, "grad_norm": 348.86553955078125, "learning_rate": 0.00038399196993818985, "loss": 40.7747, "step": 6115 }, { "epoch": 16.153185869924066, "grad_norm": 542.5014038085938, "learning_rate": 0.00038395669874474915, "loss": 42.5961, "step": 6116 }, { "epoch": 16.155827005612412, "grad_norm": 266.5709228515625, "learning_rate": 0.0003839214238106464, "loss": 42.3162, "step": 6117 }, { "epoch": 16.15846814130076, "grad_norm": 455.8856506347656, "learning_rate": 0.00038388614513686656, "loss": 40.1146, "step": 6118 }, { "epoch": 16.161109276989105, "grad_norm": 378.7201843261719, "learning_rate": 0.0003838508627243947, "loss": 39.7483, "step": 6119 }, { "epoch": 16.16375041267745, "grad_norm": 189.4478759765625, "learning_rate": 0.0003838155765742162, "loss": 38.1484, "step": 6120 }, { "epoch": 16.166391548365798, "grad_norm": 265.60736083984375, "learning_rate": 0.00038378028668731626, "loss": 39.7519, "step": 6121 }, { "epoch": 16.169032684054145, "grad_norm": 310.6053161621094, "learning_rate": 0.00038374499306468045, "loss": 38.2808, "step": 6122 }, { "epoch": 16.17167381974249, "grad_norm": 244.49220275878906, "learning_rate": 0.0003837096957072943, "loss": 37.8832, "step": 6123 }, { "epoch": 16.174314955430834, "grad_norm": 329.98822021484375, "learning_rate": 0.0003836743946161434, "loss": 39.1603, "step": 6124 }, { "epoch": 16.17695609111918, "grad_norm": 345.44061279296875, "learning_rate": 0.0003836390897922135, "loss": 37.9824, "step": 6125 }, { "epoch": 16.179597226807527, "grad_norm": 222.1686553955078, "learning_rate": 0.00038360378123649064, "loss": 36.6527, "step": 6126 }, { "epoch": 16.182238362495873, "grad_norm": 255.43508911132812, "learning_rate": 0.00038356846894996066, "loss": 36.4482, "step": 6127 }, { "epoch": 16.18487949818422, "grad_norm": 255.1405029296875, "learning_rate": 0.0003835331529336096, "loss": 36.2614, "step": 6128 }, { "epoch": 16.187520633872566, "grad_norm": 221.05152893066406, "learning_rate": 0.0003834978331884237, "loss": 37.4897, "step": 6129 }, { "epoch": 16.190161769560913, "grad_norm": 262.71466064453125, "learning_rate": 0.0003834625097153892, "loss": 37.0045, "step": 6130 }, { "epoch": 16.192802905249255, "grad_norm": 877.4600219726562, "learning_rate": 0.00038342718251549256, "loss": 51.2853, "step": 6131 }, { "epoch": 16.195444040937602, "grad_norm": 2697.4609375, "learning_rate": 0.0003833918515897202, "loss": 60.7036, "step": 6132 }, { "epoch": 16.19808517662595, "grad_norm": 12770.8671875, "learning_rate": 0.00038335651693905873, "loss": 59.9434, "step": 6133 }, { "epoch": 16.200726312314295, "grad_norm": 1549.97998046875, "learning_rate": 0.00038332117856449484, "loss": 40.6967, "step": 6134 }, { "epoch": 16.20336744800264, "grad_norm": 4348.33203125, "learning_rate": 0.0003832858364670154, "loss": 44.7809, "step": 6135 }, { "epoch": 16.206008583690988, "grad_norm": 4156.9609375, "learning_rate": 0.0003832504906476073, "loss": 40.0797, "step": 6136 }, { "epoch": 16.208649719379334, "grad_norm": 7066.8349609375, "learning_rate": 0.00038321514110725745, "loss": 33.0029, "step": 6137 }, { "epoch": 16.21129085506768, "grad_norm": 1017.2330322265625, "learning_rate": 0.0003831797878469531, "loss": 30.157, "step": 6138 }, { "epoch": 16.213931990756024, "grad_norm": 1857.19970703125, "learning_rate": 0.0003831444308676814, "loss": 28.9443, "step": 6139 }, { "epoch": 16.21657312644437, "grad_norm": 1313.5965576171875, "learning_rate": 0.0003831090701704296, "loss": 24.6278, "step": 6140 }, { "epoch": 16.219214262132716, "grad_norm": 428.9931640625, "learning_rate": 0.00038307370575618537, "loss": 35.2072, "step": 6141 }, { "epoch": 16.221855397821063, "grad_norm": 304.4910888671875, "learning_rate": 0.00038303833762593593, "loss": 37.5909, "step": 6142 }, { "epoch": 16.22449653350941, "grad_norm": 575.948974609375, "learning_rate": 0.00038300296578066916, "loss": 37.8302, "step": 6143 }, { "epoch": 16.227137669197756, "grad_norm": 263.3163757324219, "learning_rate": 0.0003829675902213726, "loss": 38.0523, "step": 6144 }, { "epoch": 16.229778804886102, "grad_norm": 309.8327941894531, "learning_rate": 0.0003829322109490343, "loss": 36.1217, "step": 6145 }, { "epoch": 16.23241994057445, "grad_norm": 466.2748718261719, "learning_rate": 0.000382896827964642, "loss": 36.6774, "step": 6146 }, { "epoch": 16.23506107626279, "grad_norm": 335.9361572265625, "learning_rate": 0.00038286144126918386, "loss": 38.1129, "step": 6147 }, { "epoch": 16.237702211951138, "grad_norm": 298.6313171386719, "learning_rate": 0.0003828260508636482, "loss": 36.9677, "step": 6148 }, { "epoch": 16.240343347639485, "grad_norm": 339.4700012207031, "learning_rate": 0.00038279065674902293, "loss": 37.2139, "step": 6149 }, { "epoch": 16.24298448332783, "grad_norm": 356.32318115234375, "learning_rate": 0.0003827552589262966, "loss": 38.1512, "step": 6150 }, { "epoch": 16.245625619016177, "grad_norm": 234.21237182617188, "learning_rate": 0.0003827198573964576, "loss": 36.5011, "step": 6151 }, { "epoch": 16.248266754704524, "grad_norm": 275.17730712890625, "learning_rate": 0.00038268445216049464, "loss": 37.1451, "step": 6152 }, { "epoch": 16.25090789039287, "grad_norm": 371.8175354003906, "learning_rate": 0.0003826490432193962, "loss": 37.5633, "step": 6153 }, { "epoch": 16.253549026081213, "grad_norm": 259.71038818359375, "learning_rate": 0.00038261363057415123, "loss": 36.3132, "step": 6154 }, { "epoch": 16.25619016176956, "grad_norm": 293.2767333984375, "learning_rate": 0.0003825782142257485, "loss": 37.0314, "step": 6155 }, { "epoch": 16.258831297457906, "grad_norm": 298.66741943359375, "learning_rate": 0.000382542794175177, "loss": 37.5509, "step": 6156 }, { "epoch": 16.261472433146253, "grad_norm": 562.204345703125, "learning_rate": 0.00038250737042342585, "loss": 38.5698, "step": 6157 }, { "epoch": 16.2641135688346, "grad_norm": 744.6715698242188, "learning_rate": 0.00038247194297148427, "loss": 43.682, "step": 6158 }, { "epoch": 16.266754704522945, "grad_norm": 371.25469970703125, "learning_rate": 0.00038243651182034137, "loss": 41.1435, "step": 6159 }, { "epoch": 16.269395840211292, "grad_norm": 359.5370178222656, "learning_rate": 0.0003824010769709868, "loss": 40.8207, "step": 6160 }, { "epoch": 16.27203697589964, "grad_norm": 305.0095520019531, "learning_rate": 0.00038236563842440987, "loss": 41.0895, "step": 6161 }, { "epoch": 16.27467811158798, "grad_norm": 447.78448486328125, "learning_rate": 0.00038233019618160027, "loss": 41.1599, "step": 6162 }, { "epoch": 16.277319247276328, "grad_norm": 336.1880187988281, "learning_rate": 0.00038229475024354766, "loss": 43.7338, "step": 6163 }, { "epoch": 16.279960382964674, "grad_norm": 327.1413879394531, "learning_rate": 0.0003822593006112419, "loss": 44.5208, "step": 6164 }, { "epoch": 16.28260151865302, "grad_norm": 390.50604248046875, "learning_rate": 0.00038222384728567283, "loss": 42.5969, "step": 6165 }, { "epoch": 16.285242654341367, "grad_norm": 604.226806640625, "learning_rate": 0.00038218839026783047, "loss": 43.7045, "step": 6166 }, { "epoch": 16.287883790029714, "grad_norm": 708.3975830078125, "learning_rate": 0.000382152929558705, "loss": 42.3176, "step": 6167 }, { "epoch": 16.29052492571806, "grad_norm": 514.030029296875, "learning_rate": 0.0003821174651592866, "loss": 41.5615, "step": 6168 }, { "epoch": 16.293166061406406, "grad_norm": 346.4398498535156, "learning_rate": 0.0003820819970705656, "loss": 39.3875, "step": 6169 }, { "epoch": 16.29580719709475, "grad_norm": 420.12017822265625, "learning_rate": 0.00038204652529353245, "loss": 40.3091, "step": 6170 }, { "epoch": 16.298448332783096, "grad_norm": 295.8365783691406, "learning_rate": 0.0003820110498291776, "loss": 38.4502, "step": 6171 }, { "epoch": 16.301089468471442, "grad_norm": 382.41339111328125, "learning_rate": 0.00038197557067849177, "loss": 37.831, "step": 6172 }, { "epoch": 16.30373060415979, "grad_norm": 251.8983154296875, "learning_rate": 0.0003819400878424656, "loss": 36.153, "step": 6173 }, { "epoch": 16.306371739848135, "grad_norm": 1020.4642333984375, "learning_rate": 0.00038190460132209, "loss": 36.5704, "step": 6174 }, { "epoch": 16.30901287553648, "grad_norm": 316.1288757324219, "learning_rate": 0.00038186911111835585, "loss": 36.5318, "step": 6175 }, { "epoch": 16.311654011224828, "grad_norm": 605.3110961914062, "learning_rate": 0.00038183361723225434, "loss": 37.0476, "step": 6176 }, { "epoch": 16.31429514691317, "grad_norm": 822.23974609375, "learning_rate": 0.00038179811966477644, "loss": 36.9889, "step": 6177 }, { "epoch": 16.316936282601517, "grad_norm": 594.1703491210938, "learning_rate": 0.0003817626184169135, "loss": 36.0941, "step": 6178 }, { "epoch": 16.319577418289864, "grad_norm": 365.2333679199219, "learning_rate": 0.0003817271134896569, "loss": 35.4276, "step": 6179 }, { "epoch": 16.32221855397821, "grad_norm": 886.3919067382812, "learning_rate": 0.0003816916048839979, "loss": 44.7297, "step": 6180 }, { "epoch": 16.324859689666557, "grad_norm": 1142.65185546875, "learning_rate": 0.00038165609260092825, "loss": 42.9513, "step": 6181 }, { "epoch": 16.327500825354903, "grad_norm": 4181.923828125, "learning_rate": 0.00038162057664143956, "loss": 46.4428, "step": 6182 }, { "epoch": 16.33014196104325, "grad_norm": 1127.350830078125, "learning_rate": 0.0003815850570065237, "loss": 47.8812, "step": 6183 }, { "epoch": 16.332783096731596, "grad_norm": 3345.735595703125, "learning_rate": 0.00038154953369717225, "loss": 41.3382, "step": 6184 }, { "epoch": 16.33542423241994, "grad_norm": 2877.58203125, "learning_rate": 0.00038151400671437743, "loss": 37.5456, "step": 6185 }, { "epoch": 16.338065368108285, "grad_norm": 2686.7109375, "learning_rate": 0.0003814784760591312, "loss": 37.073, "step": 6186 }, { "epoch": 16.340706503796632, "grad_norm": 3586.7392578125, "learning_rate": 0.0003814429417324258, "loss": 29.8657, "step": 6187 }, { "epoch": 16.34334763948498, "grad_norm": 1573.9083251953125, "learning_rate": 0.00038140740373525346, "loss": 24.7754, "step": 6188 }, { "epoch": 16.345988775173325, "grad_norm": 1112.6864013671875, "learning_rate": 0.00038137186206860653, "loss": 22.5313, "step": 6189 }, { "epoch": 16.34862991086167, "grad_norm": 1032.3131103515625, "learning_rate": 0.0003813363167334775, "loss": 33.799, "step": 6190 }, { "epoch": 16.351271046550018, "grad_norm": 557.6571655273438, "learning_rate": 0.000381300767730859, "loss": 38.6813, "step": 6191 }, { "epoch": 16.353912182238364, "grad_norm": 535.6976928710938, "learning_rate": 0.0003812652150617438, "loss": 39.2883, "step": 6192 }, { "epoch": 16.356553317926707, "grad_norm": 711.69580078125, "learning_rate": 0.00038122965872712446, "loss": 38.0585, "step": 6193 }, { "epoch": 16.359194453615054, "grad_norm": 431.6228332519531, "learning_rate": 0.00038119409872799394, "loss": 38.8558, "step": 6194 }, { "epoch": 16.3618355893034, "grad_norm": 1091.88232421875, "learning_rate": 0.00038115853506534537, "loss": 38.0573, "step": 6195 }, { "epoch": 16.364476724991746, "grad_norm": 330.35882568359375, "learning_rate": 0.00038112296774017176, "loss": 38.9163, "step": 6196 }, { "epoch": 16.367117860680093, "grad_norm": 635.1190795898438, "learning_rate": 0.00038108739675346624, "loss": 37.2402, "step": 6197 }, { "epoch": 16.36975899636844, "grad_norm": 284.858642578125, "learning_rate": 0.00038105182210622227, "loss": 38.5016, "step": 6198 }, { "epoch": 16.372400132056786, "grad_norm": 350.5694274902344, "learning_rate": 0.00038101624379943303, "loss": 37.8719, "step": 6199 }, { "epoch": 16.37504126774513, "grad_norm": 454.5061950683594, "learning_rate": 0.0003809806618340922, "loss": 37.8118, "step": 6200 }, { "epoch": 16.37504126774513, "eval_loss": 4.355167865753174, "eval_runtime": 2.1892, "eval_samples_per_second": 226.111, "eval_steps_per_second": 28.321, "step": 6200 }, { "epoch": 16.377682403433475, "grad_norm": 786.2684326171875, "learning_rate": 0.00038094507621119333, "loss": 38.4454, "step": 6201 }, { "epoch": 16.38032353912182, "grad_norm": 1022.5571899414062, "learning_rate": 0.00038090948693173015, "loss": 38.3533, "step": 6202 }, { "epoch": 16.382964674810168, "grad_norm": 653.6176147460938, "learning_rate": 0.00038087389399669644, "loss": 38.7207, "step": 6203 }, { "epoch": 16.385605810498515, "grad_norm": 660.4783935546875, "learning_rate": 0.0003808382974070861, "loss": 35.9741, "step": 6204 }, { "epoch": 16.38824694618686, "grad_norm": 1519.5455322265625, "learning_rate": 0.0003808026971638932, "loss": 37.5233, "step": 6205 }, { "epoch": 16.390888081875207, "grad_norm": 780.5739135742188, "learning_rate": 0.0003807670932681118, "loss": 38.9923, "step": 6206 }, { "epoch": 16.393529217563554, "grad_norm": 1383.718017578125, "learning_rate": 0.00038073148572073613, "loss": 38.8939, "step": 6207 }, { "epoch": 16.396170353251897, "grad_norm": 1856.296875, "learning_rate": 0.00038069587452276043, "loss": 42.3601, "step": 6208 }, { "epoch": 16.398811488940243, "grad_norm": 450.2565002441406, "learning_rate": 0.00038066025967517937, "loss": 41.128, "step": 6209 }, { "epoch": 16.40145262462859, "grad_norm": 443.7144775390625, "learning_rate": 0.0003806246411789872, "loss": 42.7051, "step": 6210 }, { "epoch": 16.404093760316936, "grad_norm": 594.4367065429688, "learning_rate": 0.00038058901903517873, "loss": 43.0967, "step": 6211 }, { "epoch": 16.406734896005283, "grad_norm": 554.8883666992188, "learning_rate": 0.00038055339324474856, "loss": 44.0312, "step": 6212 }, { "epoch": 16.40937603169363, "grad_norm": 661.2871704101562, "learning_rate": 0.0003805177638086916, "loss": 47.9267, "step": 6213 }, { "epoch": 16.412017167381975, "grad_norm": 597.2293090820312, "learning_rate": 0.0003804821307280028, "loss": 40.8016, "step": 6214 }, { "epoch": 16.414658303070322, "grad_norm": 872.8804931640625, "learning_rate": 0.00038044649400367705, "loss": 44.9535, "step": 6215 }, { "epoch": 16.417299438758665, "grad_norm": 516.0518188476562, "learning_rate": 0.0003804108536367097, "loss": 44.0495, "step": 6216 }, { "epoch": 16.41994057444701, "grad_norm": 953.3701171875, "learning_rate": 0.00038037520962809577, "loss": 41.7908, "step": 6217 }, { "epoch": 16.422581710135358, "grad_norm": 692.1351318359375, "learning_rate": 0.0003803395619788308, "loss": 40.5542, "step": 6218 }, { "epoch": 16.425222845823704, "grad_norm": 568.977783203125, "learning_rate": 0.00038030391068991, "loss": 42.0308, "step": 6219 }, { "epoch": 16.42786398151205, "grad_norm": 681.3132934570312, "learning_rate": 0.000380268255762329, "loss": 40.4801, "step": 6220 }, { "epoch": 16.430505117200397, "grad_norm": 470.25225830078125, "learning_rate": 0.0003802325971970837, "loss": 41.0322, "step": 6221 }, { "epoch": 16.433146252888744, "grad_norm": 630.5338134765625, "learning_rate": 0.0003801969349951695, "loss": 38.6841, "step": 6222 }, { "epoch": 16.435787388577086, "grad_norm": 471.9179992675781, "learning_rate": 0.0003801612691575824, "loss": 37.4054, "step": 6223 }, { "epoch": 16.438428524265433, "grad_norm": 610.6727294921875, "learning_rate": 0.00038012559968531826, "loss": 39.3565, "step": 6224 }, { "epoch": 16.44106965995378, "grad_norm": 314.4000244140625, "learning_rate": 0.00038008992657937325, "loss": 37.3503, "step": 6225 }, { "epoch": 16.443710795642126, "grad_norm": 377.4306335449219, "learning_rate": 0.00038005424984074337, "loss": 37.2067, "step": 6226 }, { "epoch": 16.446351931330472, "grad_norm": 709.2787475585938, "learning_rate": 0.00038001856947042506, "loss": 37.7979, "step": 6227 }, { "epoch": 16.44899306701882, "grad_norm": 374.3955993652344, "learning_rate": 0.00037998288546941454, "loss": 38.1965, "step": 6228 }, { "epoch": 16.451634202707165, "grad_norm": 642.45654296875, "learning_rate": 0.0003799471978387083, "loss": 38.9701, "step": 6229 }, { "epoch": 16.45427533839551, "grad_norm": 1802.9427490234375, "learning_rate": 0.0003799115065793029, "loss": 45.8071, "step": 6230 }, { "epoch": 16.456916474083854, "grad_norm": 1587.9293212890625, "learning_rate": 0.00037987581169219496, "loss": 50.3666, "step": 6231 }, { "epoch": 16.4595576097722, "grad_norm": 1657.0810546875, "learning_rate": 0.00037984011317838143, "loss": 43.9836, "step": 6232 }, { "epoch": 16.462198745460547, "grad_norm": 14438.6572265625, "learning_rate": 0.0003798044110388588, "loss": 39.6236, "step": 6233 }, { "epoch": 16.464839881148894, "grad_norm": 2075.722900390625, "learning_rate": 0.00037976870527462424, "loss": 38.3965, "step": 6234 }, { "epoch": 16.46748101683724, "grad_norm": 9156.029296875, "learning_rate": 0.00037973299588667497, "loss": 29.0486, "step": 6235 }, { "epoch": 16.470122152525587, "grad_norm": 2646.883544921875, "learning_rate": 0.0003796972828760079, "loss": 27.3239, "step": 6236 }, { "epoch": 16.472763288213933, "grad_norm": 4270.35302734375, "learning_rate": 0.00037966156624362046, "loss": 27.1323, "step": 6237 }, { "epoch": 16.47540442390228, "grad_norm": 2117.269287109375, "learning_rate": 0.00037962584599050993, "loss": 27.1211, "step": 6238 }, { "epoch": 16.478045559590623, "grad_norm": 12164.865234375, "learning_rate": 0.00037959012211767374, "loss": 28.6068, "step": 6239 }, { "epoch": 16.48068669527897, "grad_norm": 2218.984130859375, "learning_rate": 0.0003795543946261096, "loss": 21.301, "step": 6240 }, { "epoch": 16.483327830967315, "grad_norm": 536.9093627929688, "learning_rate": 0.000379518663516815, "loss": 38.7895, "step": 6241 }, { "epoch": 16.485968966655662, "grad_norm": 377.9259338378906, "learning_rate": 0.00037948292879078793, "loss": 36.834, "step": 6242 }, { "epoch": 16.48861010234401, "grad_norm": 1964.759765625, "learning_rate": 0.000379447190449026, "loss": 37.6785, "step": 6243 }, { "epoch": 16.491251238032355, "grad_norm": 256.8405456542969, "learning_rate": 0.0003794114484925275, "loss": 39.8894, "step": 6244 }, { "epoch": 16.4938923737207, "grad_norm": 570.07568359375, "learning_rate": 0.00037937570292229014, "loss": 36.4867, "step": 6245 }, { "epoch": 16.496533509409044, "grad_norm": 398.5157165527344, "learning_rate": 0.0003793399537393124, "loss": 36.4493, "step": 6246 }, { "epoch": 16.49917464509739, "grad_norm": 304.2563781738281, "learning_rate": 0.0003793042009445923, "loss": 36.9654, "step": 6247 }, { "epoch": 16.501815780785737, "grad_norm": 544.6727294921875, "learning_rate": 0.0003792684445391284, "loss": 37.5129, "step": 6248 }, { "epoch": 16.504456916474084, "grad_norm": 617.327880859375, "learning_rate": 0.0003792326845239191, "loss": 35.7492, "step": 6249 }, { "epoch": 16.50709805216243, "grad_norm": 579.6605834960938, "learning_rate": 0.00037919692089996305, "loss": 37.167, "step": 6250 }, { "epoch": 16.509739187850776, "grad_norm": 624.1376953125, "learning_rate": 0.0003791611536682589, "loss": 39.105, "step": 6251 }, { "epoch": 16.512380323539123, "grad_norm": 4866.73779296875, "learning_rate": 0.00037912538282980536, "loss": 37.5983, "step": 6252 }, { "epoch": 16.51502145922747, "grad_norm": 505.80377197265625, "learning_rate": 0.0003790896083856014, "loss": 37.3984, "step": 6253 }, { "epoch": 16.517662594915812, "grad_norm": 438.42584228515625, "learning_rate": 0.00037905383033664587, "loss": 36.8588, "step": 6254 }, { "epoch": 16.52030373060416, "grad_norm": 643.3865356445312, "learning_rate": 0.00037901804868393797, "loss": 38.3554, "step": 6255 }, { "epoch": 16.522944866292505, "grad_norm": 445.6590881347656, "learning_rate": 0.00037898226342847693, "loss": 38.5383, "step": 6256 }, { "epoch": 16.52558600198085, "grad_norm": 1339.96533203125, "learning_rate": 0.00037894647457126186, "loss": 37.6661, "step": 6257 }, { "epoch": 16.528227137669198, "grad_norm": 1075.5589599609375, "learning_rate": 0.0003789106821132923, "loss": 43.011, "step": 6258 }, { "epoch": 16.530868273357544, "grad_norm": 430.08984375, "learning_rate": 0.00037887488605556764, "loss": 44.2726, "step": 6259 }, { "epoch": 16.53350940904589, "grad_norm": 333.7040710449219, "learning_rate": 0.0003788390863990875, "loss": 42.346, "step": 6260 }, { "epoch": 16.536150544734237, "grad_norm": 624.4558715820312, "learning_rate": 0.00037880328314485146, "loss": 44.6896, "step": 6261 }, { "epoch": 16.53879168042258, "grad_norm": 367.7952575683594, "learning_rate": 0.00037876747629385956, "loss": 44.9915, "step": 6262 }, { "epoch": 16.541432816110927, "grad_norm": 478.4957580566406, "learning_rate": 0.0003787316658471115, "loss": 45.9192, "step": 6263 }, { "epoch": 16.544073951799273, "grad_norm": 355.9396057128906, "learning_rate": 0.0003786958518056072, "loss": 45.6642, "step": 6264 }, { "epoch": 16.54671508748762, "grad_norm": 256.23590087890625, "learning_rate": 0.0003786600341703469, "loss": 44.2645, "step": 6265 }, { "epoch": 16.549356223175966, "grad_norm": 377.8072814941406, "learning_rate": 0.00037862421294233075, "loss": 42.9307, "step": 6266 }, { "epoch": 16.551997358864313, "grad_norm": 346.654296875, "learning_rate": 0.0003785883881225589, "loss": 41.4085, "step": 6267 }, { "epoch": 16.55463849455266, "grad_norm": 308.9209289550781, "learning_rate": 0.000378552559712032, "loss": 41.6009, "step": 6268 }, { "epoch": 16.557279630241002, "grad_norm": 506.6177673339844, "learning_rate": 0.0003785167277117502, "loss": 40.4524, "step": 6269 }, { "epoch": 16.55992076592935, "grad_norm": 359.19281005859375, "learning_rate": 0.0003784808921227144, "loss": 41.3503, "step": 6270 }, { "epoch": 16.562561901617695, "grad_norm": 261.4582824707031, "learning_rate": 0.00037844505294592514, "loss": 37.7395, "step": 6271 }, { "epoch": 16.56520303730604, "grad_norm": 374.354248046875, "learning_rate": 0.0003784092101823833, "loss": 38.3356, "step": 6272 }, { "epoch": 16.567844172994388, "grad_norm": 283.4086608886719, "learning_rate": 0.00037837336383308957, "loss": 37.5785, "step": 6273 }, { "epoch": 16.570485308682734, "grad_norm": 242.03269958496094, "learning_rate": 0.00037833751389904514, "loss": 36.0037, "step": 6274 }, { "epoch": 16.57312644437108, "grad_norm": 209.06304931640625, "learning_rate": 0.00037830166038125103, "loss": 35.8777, "step": 6275 }, { "epoch": 16.575767580059427, "grad_norm": 293.3053283691406, "learning_rate": 0.00037826580328070836, "loss": 36.3745, "step": 6276 }, { "epoch": 16.57840871574777, "grad_norm": 362.8944396972656, "learning_rate": 0.00037822994259841854, "loss": 37.5393, "step": 6277 }, { "epoch": 16.581049851436116, "grad_norm": 248.44273376464844, "learning_rate": 0.0003781940783353828, "loss": 37.7309, "step": 6278 }, { "epoch": 16.583690987124463, "grad_norm": 504.5776062011719, "learning_rate": 0.0003781582104926029, "loss": 36.9112, "step": 6279 }, { "epoch": 16.58633212281281, "grad_norm": 985.6865234375, "learning_rate": 0.0003781223390710802, "loss": 43.3213, "step": 6280 }, { "epoch": 16.588973258501156, "grad_norm": 13623.9130859375, "learning_rate": 0.0003780864640718164, "loss": 63.9106, "step": 6281 }, { "epoch": 16.591614394189502, "grad_norm": 8571.421875, "learning_rate": 0.0003780505854958133, "loss": 60.5609, "step": 6282 }, { "epoch": 16.59425552987785, "grad_norm": 3224.12646484375, "learning_rate": 0.00037801470334407285, "loss": 62.0932, "step": 6283 }, { "epoch": 16.596896665566195, "grad_norm": 8764.9130859375, "learning_rate": 0.000377978817617597, "loss": 73.6522, "step": 6284 }, { "epoch": 16.599537801254538, "grad_norm": 2794.239990234375, "learning_rate": 0.00037794292831738784, "loss": 74.2141, "step": 6285 }, { "epoch": 16.602178936942884, "grad_norm": 3963.333984375, "learning_rate": 0.00037790703544444764, "loss": 56.5147, "step": 6286 }, { "epoch": 16.60482007263123, "grad_norm": 1398.6802978515625, "learning_rate": 0.00037787113899977854, "loss": 54.5069, "step": 6287 }, { "epoch": 16.607461208319577, "grad_norm": 3422.736083984375, "learning_rate": 0.00037783523898438304, "loss": 47.5897, "step": 6288 }, { "epoch": 16.610102344007924, "grad_norm": 1106.9954833984375, "learning_rate": 0.0003777993353992636, "loss": 40.2858, "step": 6289 }, { "epoch": 16.61274347969627, "grad_norm": 3584.8505859375, "learning_rate": 0.0003777634282454227, "loss": 33.0506, "step": 6290 }, { "epoch": 16.615384615384617, "grad_norm": 588.4129028320312, "learning_rate": 0.0003777275175238632, "loss": 39.0948, "step": 6291 }, { "epoch": 16.618025751072963, "grad_norm": 411.38079833984375, "learning_rate": 0.0003776916032355878, "loss": 37.2151, "step": 6292 }, { "epoch": 16.620666886761306, "grad_norm": 450.0474853515625, "learning_rate": 0.00037765568538159936, "loss": 35.886, "step": 6293 }, { "epoch": 16.623308022449653, "grad_norm": 291.8737487792969, "learning_rate": 0.00037761976396290085, "loss": 37.2707, "step": 6294 }, { "epoch": 16.625949158138, "grad_norm": 591.3463745117188, "learning_rate": 0.00037758383898049556, "loss": 37.9366, "step": 6295 }, { "epoch": 16.628590293826345, "grad_norm": 653.8583374023438, "learning_rate": 0.00037754791043538636, "loss": 36.9294, "step": 6296 }, { "epoch": 16.631231429514692, "grad_norm": 450.4312438964844, "learning_rate": 0.00037751197832857676, "loss": 38.5019, "step": 6297 }, { "epoch": 16.63387256520304, "grad_norm": 480.79974365234375, "learning_rate": 0.00037747604266107006, "loss": 37.7171, "step": 6298 }, { "epoch": 16.636513700891385, "grad_norm": 356.12896728515625, "learning_rate": 0.0003774401034338697, "loss": 36.2178, "step": 6299 }, { "epoch": 16.639154836579728, "grad_norm": 918.18408203125, "learning_rate": 0.00037740416064797937, "loss": 38.9953, "step": 6300 }, { "epoch": 16.641795972268074, "grad_norm": 508.576171875, "learning_rate": 0.0003773682143044027, "loss": 37.6348, "step": 6301 }, { "epoch": 16.64443710795642, "grad_norm": 486.2532043457031, "learning_rate": 0.0003773322644041434, "loss": 38.0419, "step": 6302 }, { "epoch": 16.647078243644767, "grad_norm": 561.7691040039062, "learning_rate": 0.0003772963109482055, "loss": 37.9551, "step": 6303 }, { "epoch": 16.649719379333114, "grad_norm": 928.4147338867188, "learning_rate": 0.00037726035393759286, "loss": 37.6437, "step": 6304 }, { "epoch": 16.65236051502146, "grad_norm": 897.2867431640625, "learning_rate": 0.00037722439337330957, "loss": 38.3862, "step": 6305 }, { "epoch": 16.655001650709806, "grad_norm": 880.4070434570312, "learning_rate": 0.0003771884292563599, "loss": 38.955, "step": 6306 }, { "epoch": 16.657642786398153, "grad_norm": 550.3218383789062, "learning_rate": 0.00037715246158774796, "loss": 38.9211, "step": 6307 }, { "epoch": 16.660283922086496, "grad_norm": 948.2816162109375, "learning_rate": 0.0003771164903684783, "loss": 42.6539, "step": 6308 }, { "epoch": 16.662925057774842, "grad_norm": 261.4675598144531, "learning_rate": 0.00037708051559955525, "loss": 39.7253, "step": 6309 }, { "epoch": 16.66556619346319, "grad_norm": 498.7019348144531, "learning_rate": 0.0003770445372819835, "loss": 42.6511, "step": 6310 }, { "epoch": 16.668207329151535, "grad_norm": 408.78125, "learning_rate": 0.00037700855541676767, "loss": 42.3475, "step": 6311 }, { "epoch": 16.67084846483988, "grad_norm": 416.4279479980469, "learning_rate": 0.0003769725700049125, "loss": 41.8906, "step": 6312 }, { "epoch": 16.673489600528228, "grad_norm": 364.26568603515625, "learning_rate": 0.0003769365810474229, "loss": 45.0854, "step": 6313 }, { "epoch": 16.676130736216574, "grad_norm": 449.3486328125, "learning_rate": 0.00037690058854530396, "loss": 43.8395, "step": 6314 }, { "epoch": 16.67877187190492, "grad_norm": 487.29949951171875, "learning_rate": 0.0003768645924995605, "loss": 41.8321, "step": 6315 }, { "epoch": 16.681413007593264, "grad_norm": 484.33819580078125, "learning_rate": 0.00037682859291119785, "loss": 42.166, "step": 6316 }, { "epoch": 16.68405414328161, "grad_norm": 603.7760620117188, "learning_rate": 0.00037679258978122134, "loss": 42.8028, "step": 6317 }, { "epoch": 16.686695278969957, "grad_norm": 469.3656311035156, "learning_rate": 0.0003767565831106362, "loss": 41.4635, "step": 6318 }, { "epoch": 16.689336414658303, "grad_norm": 554.373291015625, "learning_rate": 0.0003767205729004479, "loss": 40.3261, "step": 6319 }, { "epoch": 16.69197755034665, "grad_norm": 594.6992797851562, "learning_rate": 0.000376684559151662, "loss": 38.4843, "step": 6320 }, { "epoch": 16.694618686034996, "grad_norm": 465.3691101074219, "learning_rate": 0.00037664854186528424, "loss": 41.7046, "step": 6321 }, { "epoch": 16.697259821723343, "grad_norm": 494.3134765625, "learning_rate": 0.00037661252104232037, "loss": 37.0655, "step": 6322 }, { "epoch": 16.699900957411685, "grad_norm": 398.37213134765625, "learning_rate": 0.0003765764966837762, "loss": 36.3128, "step": 6323 }, { "epoch": 16.702542093100032, "grad_norm": 416.7842712402344, "learning_rate": 0.00037654046879065774, "loss": 37.5446, "step": 6324 }, { "epoch": 16.70518322878838, "grad_norm": 1238.3189697265625, "learning_rate": 0.000376504437363971, "loss": 36.3665, "step": 6325 }, { "epoch": 16.707824364476725, "grad_norm": 471.68511962890625, "learning_rate": 0.00037646840240472215, "loss": 37.2632, "step": 6326 }, { "epoch": 16.71046550016507, "grad_norm": 597.9033203125, "learning_rate": 0.00037643236391391745, "loss": 35.9414, "step": 6327 }, { "epoch": 16.713106635853418, "grad_norm": 549.5546875, "learning_rate": 0.00037639632189256336, "loss": 36.7874, "step": 6328 }, { "epoch": 16.715747771541764, "grad_norm": 742.9067993164062, "learning_rate": 0.0003763602763416661, "loss": 37.5228, "step": 6329 }, { "epoch": 16.71838890723011, "grad_norm": 10889.8642578125, "learning_rate": 0.00037632422726223235, "loss": 53.4112, "step": 6330 }, { "epoch": 16.721030042918454, "grad_norm": 2434.09619140625, "learning_rate": 0.00037628817465526883, "loss": 41.4017, "step": 6331 }, { "epoch": 16.7236711786068, "grad_norm": 3482.480712890625, "learning_rate": 0.0003762521185217822, "loss": 45.2511, "step": 6332 }, { "epoch": 16.726312314295146, "grad_norm": 5515.193359375, "learning_rate": 0.00037621605886277936, "loss": 45.3347, "step": 6333 }, { "epoch": 16.728953449983493, "grad_norm": 2267.4658203125, "learning_rate": 0.0003761799956792672, "loss": 37.6999, "step": 6334 }, { "epoch": 16.73159458567184, "grad_norm": 1393.8511962890625, "learning_rate": 0.00037614392897225275, "loss": 39.3948, "step": 6335 }, { "epoch": 16.734235721360186, "grad_norm": 1878.357421875, "learning_rate": 0.00037610785874274324, "loss": 31.2157, "step": 6336 }, { "epoch": 16.736876857048532, "grad_norm": 8042.93310546875, "learning_rate": 0.00037607178499174577, "loss": 26.6701, "step": 6337 }, { "epoch": 16.73951799273688, "grad_norm": 3161.474365234375, "learning_rate": 0.00037603570772026785, "loss": 31.0458, "step": 6338 }, { "epoch": 16.74215912842522, "grad_norm": 754.7647094726562, "learning_rate": 0.00037599962692931675, "loss": 25.0595, "step": 6339 }, { "epoch": 16.744800264113568, "grad_norm": 782.7627563476562, "learning_rate": 0.00037596354261990007, "loss": 31.0252, "step": 6340 }, { "epoch": 16.747441399801914, "grad_norm": 575.24267578125, "learning_rate": 0.0003759274547930256, "loss": 37.297, "step": 6341 }, { "epoch": 16.75008253549026, "grad_norm": 559.9315795898438, "learning_rate": 0.0003758913634497008, "loss": 37.4602, "step": 6342 }, { "epoch": 16.752723671178607, "grad_norm": 1128.447021484375, "learning_rate": 0.00037585526859093365, "loss": 39.674, "step": 6343 }, { "epoch": 16.755364806866954, "grad_norm": 801.9541625976562, "learning_rate": 0.0003758191702177321, "loss": 38.4732, "step": 6344 }, { "epoch": 16.7580059425553, "grad_norm": 474.9579772949219, "learning_rate": 0.0003757830683311041, "loss": 38.5178, "step": 6345 }, { "epoch": 16.760647078243643, "grad_norm": 619.1551513671875, "learning_rate": 0.0003757469629320578, "loss": 36.739, "step": 6346 }, { "epoch": 16.76328821393199, "grad_norm": 995.513671875, "learning_rate": 0.00037571085402160145, "loss": 38.1952, "step": 6347 }, { "epoch": 16.765929349620336, "grad_norm": 482.0097961425781, "learning_rate": 0.0003756747416007433, "loss": 36.2286, "step": 6348 }, { "epoch": 16.768570485308683, "grad_norm": 1644.7093505859375, "learning_rate": 0.0003756386256704919, "loss": 37.9456, "step": 6349 }, { "epoch": 16.77121162099703, "grad_norm": 845.8798217773438, "learning_rate": 0.00037560250623185564, "loss": 37.2488, "step": 6350 }, { "epoch": 16.773852756685375, "grad_norm": 538.8759155273438, "learning_rate": 0.00037556638328584314, "loss": 37.6816, "step": 6351 }, { "epoch": 16.776493892373722, "grad_norm": 602.4103393554688, "learning_rate": 0.0003755302568334632, "loss": 37.3538, "step": 6352 }, { "epoch": 16.77913502806207, "grad_norm": 586.3916015625, "learning_rate": 0.0003754941268757246, "loss": 36.6627, "step": 6353 }, { "epoch": 16.78177616375041, "grad_norm": 834.49169921875, "learning_rate": 0.0003754579934136362, "loss": 37.962, "step": 6354 }, { "epoch": 16.784417299438758, "grad_norm": 815.94873046875, "learning_rate": 0.0003754218564482069, "loss": 36.6516, "step": 6355 }, { "epoch": 16.787058435127104, "grad_norm": 1757.0535888671875, "learning_rate": 0.0003753857159804461, "loss": 36.4669, "step": 6356 }, { "epoch": 16.78969957081545, "grad_norm": 1656.1131591796875, "learning_rate": 0.0003753495720113628, "loss": 41.4718, "step": 6357 }, { "epoch": 16.792340706503797, "grad_norm": 2552.1953125, "learning_rate": 0.0003753134245419663, "loss": 42.9272, "step": 6358 }, { "epoch": 16.794981842192144, "grad_norm": 489.1360168457031, "learning_rate": 0.00037527727357326607, "loss": 42.3238, "step": 6359 }, { "epoch": 16.79762297788049, "grad_norm": 439.9766845703125, "learning_rate": 0.00037524111910627157, "loss": 41.6941, "step": 6360 }, { "epoch": 16.800264113568836, "grad_norm": 560.1973266601562, "learning_rate": 0.0003752049611419923, "loss": 43.6754, "step": 6361 }, { "epoch": 16.80290524925718, "grad_norm": 2267.245361328125, "learning_rate": 0.0003751687996814382, "loss": 43.9797, "step": 6362 }, { "epoch": 16.805546384945526, "grad_norm": 508.06884765625, "learning_rate": 0.00037513263472561877, "loss": 44.9126, "step": 6363 }, { "epoch": 16.808187520633872, "grad_norm": 367.62042236328125, "learning_rate": 0.00037509646627554406, "loss": 45.8429, "step": 6364 }, { "epoch": 16.81082865632222, "grad_norm": 421.5746765136719, "learning_rate": 0.000375060294332224, "loss": 42.7033, "step": 6365 }, { "epoch": 16.813469792010565, "grad_norm": 615.75341796875, "learning_rate": 0.0003750241188966687, "loss": 42.8054, "step": 6366 }, { "epoch": 16.81611092769891, "grad_norm": 675.5474853515625, "learning_rate": 0.00037498793996988835, "loss": 41.7325, "step": 6367 }, { "epoch": 16.818752063387258, "grad_norm": 939.5045776367188, "learning_rate": 0.00037495175755289324, "loss": 41.7502, "step": 6368 }, { "epoch": 16.8213931990756, "grad_norm": 1241.0992431640625, "learning_rate": 0.00037491557164669355, "loss": 41.5605, "step": 6369 }, { "epoch": 16.824034334763947, "grad_norm": 1391.586181640625, "learning_rate": 0.0003748793822523, "loss": 38.2456, "step": 6370 }, { "epoch": 16.826675470452294, "grad_norm": 662.6670532226562, "learning_rate": 0.00037484318937072307, "loss": 39.324, "step": 6371 }, { "epoch": 16.82931660614064, "grad_norm": 796.2350463867188, "learning_rate": 0.0003748069930029734, "loss": 37.7054, "step": 6372 }, { "epoch": 16.831957741828987, "grad_norm": 851.200927734375, "learning_rate": 0.00037477079315006176, "loss": 38.3796, "step": 6373 }, { "epoch": 16.834598877517333, "grad_norm": 477.9299011230469, "learning_rate": 0.00037473458981299894, "loss": 36.9029, "step": 6374 }, { "epoch": 16.83724001320568, "grad_norm": 403.87396240234375, "learning_rate": 0.0003746983829927961, "loss": 37.5728, "step": 6375 }, { "epoch": 16.839881148894026, "grad_norm": 870.1527099609375, "learning_rate": 0.0003746621726904641, "loss": 36.587, "step": 6376 }, { "epoch": 16.84252228458237, "grad_norm": 593.7183837890625, "learning_rate": 0.00037462595890701413, "loss": 36.3183, "step": 6377 }, { "epoch": 16.845163420270715, "grad_norm": 917.7791748046875, "learning_rate": 0.0003745897416434575, "loss": 36.029, "step": 6378 }, { "epoch": 16.847804555959062, "grad_norm": 1286.4677734375, "learning_rate": 0.0003745535209008056, "loss": 55.8267, "step": 6379 }, { "epoch": 16.85044569164741, "grad_norm": 2520.69482421875, "learning_rate": 0.0003745172966800697, "loss": 45.0248, "step": 6380 }, { "epoch": 16.853086827335755, "grad_norm": 3238.660888671875, "learning_rate": 0.00037448106898226144, "loss": 50.6216, "step": 6381 }, { "epoch": 16.8557279630241, "grad_norm": 2833.625, "learning_rate": 0.0003744448378083925, "loss": 47.0075, "step": 6382 }, { "epoch": 16.858369098712448, "grad_norm": 3519.56982421875, "learning_rate": 0.0003744086031594745, "loss": 34.4888, "step": 6383 }, { "epoch": 16.861010234400794, "grad_norm": 2807.29833984375, "learning_rate": 0.0003743723650365194, "loss": 44.4184, "step": 6384 }, { "epoch": 16.863651370089137, "grad_norm": 1943.743408203125, "learning_rate": 0.000374336123440539, "loss": 33.9429, "step": 6385 }, { "epoch": 16.866292505777484, "grad_norm": 3210.13623046875, "learning_rate": 0.0003742998783725454, "loss": 24.5905, "step": 6386 }, { "epoch": 16.86893364146583, "grad_norm": 4342.8984375, "learning_rate": 0.00037426362983355077, "loss": 26.9091, "step": 6387 }, { "epoch": 16.871574777154176, "grad_norm": 1344.4442138671875, "learning_rate": 0.0003742273778245673, "loss": 21.396, "step": 6388 }, { "epoch": 16.874215912842523, "grad_norm": 1658.377685546875, "learning_rate": 0.00037419112234660725, "loss": 22.2843, "step": 6389 }, { "epoch": 16.87685704853087, "grad_norm": 1028.362548828125, "learning_rate": 0.000374154863400683, "loss": 38.2399, "step": 6390 }, { "epoch": 16.879498184219216, "grad_norm": 1306.6685791015625, "learning_rate": 0.00037411860098780715, "loss": 37.4318, "step": 6391 }, { "epoch": 16.88213931990756, "grad_norm": 875.0740966796875, "learning_rate": 0.0003740823351089923, "loss": 37.6077, "step": 6392 }, { "epoch": 16.884780455595905, "grad_norm": 658.2835693359375, "learning_rate": 0.00037404606576525116, "loss": 37.5536, "step": 6393 }, { "epoch": 16.88742159128425, "grad_norm": 989.3173828125, "learning_rate": 0.00037400979295759647, "loss": 38.1919, "step": 6394 }, { "epoch": 16.890062726972598, "grad_norm": 957.0130004882812, "learning_rate": 0.0003739735166870412, "loss": 38.7259, "step": 6395 }, { "epoch": 16.892703862660944, "grad_norm": 918.5581665039062, "learning_rate": 0.0003739372369545982, "loss": 37.4093, "step": 6396 }, { "epoch": 16.89534499834929, "grad_norm": 1155.14794921875, "learning_rate": 0.00037390095376128076, "loss": 36.7741, "step": 6397 }, { "epoch": 16.897986134037637, "grad_norm": 779.8397827148438, "learning_rate": 0.0003738646671081019, "loss": 37.6131, "step": 6398 }, { "epoch": 16.900627269725984, "grad_norm": 1059.350341796875, "learning_rate": 0.000373828376996075, "loss": 37.8221, "step": 6399 }, { "epoch": 16.903268405414327, "grad_norm": 1307.3348388671875, "learning_rate": 0.0003737920834262134, "loss": 36.0667, "step": 6400 }, { "epoch": 16.903268405414327, "eval_loss": 4.331569671630859, "eval_runtime": 2.211, "eval_samples_per_second": 223.884, "eval_steps_per_second": 28.042, "step": 6400 }, { "epoch": 16.905909541102673, "grad_norm": 722.5147094726562, "learning_rate": 0.0003737557863995306, "loss": 37.5586, "step": 6401 }, { "epoch": 16.90855067679102, "grad_norm": 2299.92138671875, "learning_rate": 0.00037371948591704017, "loss": 37.0624, "step": 6402 }, { "epoch": 16.911191812479366, "grad_norm": 973.1285400390625, "learning_rate": 0.0003736831819797558, "loss": 36.6855, "step": 6403 }, { "epoch": 16.913832948167713, "grad_norm": 1499.1802978515625, "learning_rate": 0.0003736468745886912, "loss": 36.3257, "step": 6404 }, { "epoch": 16.91647408385606, "grad_norm": 1704.17236328125, "learning_rate": 0.0003736105637448603, "loss": 36.8411, "step": 6405 }, { "epoch": 16.919115219544405, "grad_norm": 1347.2130126953125, "learning_rate": 0.000373574249449277, "loss": 40.2016, "step": 6406 }, { "epoch": 16.921756355232752, "grad_norm": 2771.393310546875, "learning_rate": 0.0003735379317029553, "loss": 40.0715, "step": 6407 }, { "epoch": 16.924397490921095, "grad_norm": 1616.440185546875, "learning_rate": 0.0003735016105069095, "loss": 44.185, "step": 6408 }, { "epoch": 16.92703862660944, "grad_norm": 499.4847412109375, "learning_rate": 0.00037346528586215373, "loss": 42.7331, "step": 6409 }, { "epoch": 16.929679762297788, "grad_norm": 645.0137329101562, "learning_rate": 0.00037342895776970245, "loss": 47.403, "step": 6410 }, { "epoch": 16.932320897986134, "grad_norm": 949.0625610351562, "learning_rate": 0.00037339262623057, "loss": 47.2552, "step": 6411 }, { "epoch": 16.93496203367448, "grad_norm": 787.8806762695312, "learning_rate": 0.0003733562912457709, "loss": 44.8046, "step": 6412 }, { "epoch": 16.937603169362827, "grad_norm": 679.2576904296875, "learning_rate": 0.0003733199528163199, "loss": 45.0449, "step": 6413 }, { "epoch": 16.940244305051174, "grad_norm": 1084.9730224609375, "learning_rate": 0.00037328361094323163, "loss": 41.0759, "step": 6414 }, { "epoch": 16.942885440739516, "grad_norm": 796.7198486328125, "learning_rate": 0.000373247265627521, "loss": 40.4354, "step": 6415 }, { "epoch": 16.945526576427863, "grad_norm": 722.9692993164062, "learning_rate": 0.00037321091687020277, "loss": 38.6054, "step": 6416 }, { "epoch": 16.94816771211621, "grad_norm": 905.0545654296875, "learning_rate": 0.00037317456467229204, "loss": 39.8592, "step": 6417 }, { "epoch": 16.950808847804556, "grad_norm": 760.8477783203125, "learning_rate": 0.0003731382090348041, "loss": 37.6565, "step": 6418 }, { "epoch": 16.953449983492902, "grad_norm": 761.7744750976562, "learning_rate": 0.0003731018499587539, "loss": 38.1412, "step": 6419 }, { "epoch": 16.95609111918125, "grad_norm": 827.3031616210938, "learning_rate": 0.0003730654874451569, "loss": 38.582, "step": 6420 }, { "epoch": 16.958732254869595, "grad_norm": 1837.160888671875, "learning_rate": 0.00037302912149502843, "loss": 34.7289, "step": 6421 }, { "epoch": 16.96137339055794, "grad_norm": 25198.20703125, "learning_rate": 0.00037299275210938405, "loss": 24.1933, "step": 6422 }, { "epoch": 16.964014526246284, "grad_norm": 1326.5228271484375, "learning_rate": 0.0003729563792892393, "loss": 25.5654, "step": 6423 }, { "epoch": 16.96665566193463, "grad_norm": 5929.17138671875, "learning_rate": 0.00037292000303560985, "loss": 31.2184, "step": 6424 }, { "epoch": 16.969296797622977, "grad_norm": 2207.601318359375, "learning_rate": 0.00037288362334951156, "loss": 24.3735, "step": 6425 }, { "epoch": 16.971937933311324, "grad_norm": 1716.0205078125, "learning_rate": 0.00037284724023196024, "loss": 26.1416, "step": 6426 }, { "epoch": 16.97457906899967, "grad_norm": 1055.044921875, "learning_rate": 0.00037281085368397193, "loss": 39.7041, "step": 6427 }, { "epoch": 16.977220204688017, "grad_norm": 1077.7998046875, "learning_rate": 0.0003727744637065627, "loss": 36.6218, "step": 6428 }, { "epoch": 16.979861340376363, "grad_norm": 1074.527099609375, "learning_rate": 0.0003727380703007487, "loss": 36.9588, "step": 6429 }, { "epoch": 16.98250247606471, "grad_norm": 1454.146484375, "learning_rate": 0.0003727016734675461, "loss": 37.6579, "step": 6430 }, { "epoch": 16.985143611753053, "grad_norm": 995.7008666992188, "learning_rate": 0.0003726652732079714, "loss": 37.375, "step": 6431 }, { "epoch": 16.9877847474414, "grad_norm": 1810.1236572265625, "learning_rate": 0.00037262886952304104, "loss": 37.456, "step": 6432 }, { "epoch": 16.990425883129745, "grad_norm": 1507.15771484375, "learning_rate": 0.00037259246241377143, "loss": 37.8872, "step": 6433 }, { "epoch": 16.993067018818092, "grad_norm": 1867.7171630859375, "learning_rate": 0.00037255605188117944, "loss": 36.4744, "step": 6434 }, { "epoch": 16.99570815450644, "grad_norm": 1069.3541259765625, "learning_rate": 0.0003725196379262816, "loss": 37.5323, "step": 6435 }, { "epoch": 16.998349290194785, "grad_norm": 1380.041748046875, "learning_rate": 0.00037248322055009485, "loss": 38.1633, "step": 6436 }, { "epoch": 17.00099042588313, "grad_norm": 1464.344482421875, "learning_rate": 0.0003724467997536362, "loss": 40.6041, "step": 6437 }, { "epoch": 17.003631561571474, "grad_norm": 467.2341613769531, "learning_rate": 0.00037241037553792253, "loss": 40.397, "step": 6438 }, { "epoch": 17.00627269725982, "grad_norm": 628.3887329101562, "learning_rate": 0.00037237394790397104, "loss": 39.7705, "step": 6439 }, { "epoch": 17.008913832948167, "grad_norm": 758.5719604492188, "learning_rate": 0.000372337516852799, "loss": 41.322, "step": 6440 }, { "epoch": 17.011554968636514, "grad_norm": 2090.9951171875, "learning_rate": 0.0003723010823854236, "loss": 43.9193, "step": 6441 }, { "epoch": 17.01419610432486, "grad_norm": 919.9518432617188, "learning_rate": 0.0003722646445028623, "loss": 45.1613, "step": 6442 }, { "epoch": 17.016837240013206, "grad_norm": 609.5640258789062, "learning_rate": 0.0003722282032061327, "loss": 42.2226, "step": 6443 }, { "epoch": 17.019478375701553, "grad_norm": 1863.947509765625, "learning_rate": 0.0003721917584962522, "loss": 41.0528, "step": 6444 }, { "epoch": 17.0221195113899, "grad_norm": 757.2589721679688, "learning_rate": 0.0003721553103742388, "loss": 40.6169, "step": 6445 }, { "epoch": 17.024760647078242, "grad_norm": 1034.313232421875, "learning_rate": 0.00037211885884110994, "loss": 40.3824, "step": 6446 }, { "epoch": 17.02740178276659, "grad_norm": 658.3348388671875, "learning_rate": 0.00037208240389788376, "loss": 41.1296, "step": 6447 }, { "epoch": 17.030042918454935, "grad_norm": 1186.646484375, "learning_rate": 0.00037204594554557824, "loss": 40.9262, "step": 6448 }, { "epoch": 17.03268405414328, "grad_norm": 629.83251953125, "learning_rate": 0.0003720094837852113, "loss": 38.451, "step": 6449 }, { "epoch": 17.035325189831628, "grad_norm": 847.6072998046875, "learning_rate": 0.00037197301861780125, "loss": 38.6899, "step": 6450 }, { "epoch": 17.037966325519974, "grad_norm": 788.4738159179688, "learning_rate": 0.00037193655004436624, "loss": 37.469, "step": 6451 }, { "epoch": 17.04060746120832, "grad_norm": 984.6025390625, "learning_rate": 0.0003719000780659247, "loss": 37.815, "step": 6452 }, { "epoch": 17.043248596896664, "grad_norm": 798.5681762695312, "learning_rate": 0.0003718636026834951, "loss": 37.2292, "step": 6453 }, { "epoch": 17.04588973258501, "grad_norm": 1323.5537109375, "learning_rate": 0.000371827123898096, "loss": 37.0517, "step": 6454 }, { "epoch": 17.048530868273357, "grad_norm": 672.7445068359375, "learning_rate": 0.00037179064171074605, "loss": 36.653, "step": 6455 }, { "epoch": 17.051172003961703, "grad_norm": 703.939453125, "learning_rate": 0.00037175415612246393, "loss": 36.8402, "step": 6456 }, { "epoch": 17.05381313965005, "grad_norm": 999.0853271484375, "learning_rate": 0.0003717176671342686, "loss": 37.6106, "step": 6457 }, { "epoch": 17.056454275338396, "grad_norm": 1125.7557373046875, "learning_rate": 0.00037168117474717887, "loss": 42.0565, "step": 6458 }, { "epoch": 17.059095411026743, "grad_norm": 3133.0380859375, "learning_rate": 0.0003716446789622138, "loss": 29.9911, "step": 6459 }, { "epoch": 17.06173654671509, "grad_norm": 2001.590087890625, "learning_rate": 0.0003716081797803925, "loss": 25.9522, "step": 6460 }, { "epoch": 17.064377682403432, "grad_norm": 3433.1396484375, "learning_rate": 0.0003715716772027342, "loss": 23.2375, "step": 6461 }, { "epoch": 17.06701881809178, "grad_norm": 3755.520263671875, "learning_rate": 0.0003715351712302583, "loss": 23.3344, "step": 6462 }, { "epoch": 17.069659953780125, "grad_norm": 900.887451171875, "learning_rate": 0.0003714986618639842, "loss": 21.6931, "step": 6463 }, { "epoch": 17.07230108946847, "grad_norm": 1285.0711669921875, "learning_rate": 0.00037146214910493123, "loss": 14.2682, "step": 6464 }, { "epoch": 17.074942225156818, "grad_norm": 875.899658203125, "learning_rate": 0.00037142563295411906, "loss": 17.0937, "step": 6465 }, { "epoch": 17.077583360845164, "grad_norm": 1209.2528076171875, "learning_rate": 0.0003713891134125675, "loss": 13.5298, "step": 6466 }, { "epoch": 17.08022449653351, "grad_norm": 1407.569091796875, "learning_rate": 0.00037135259048129627, "loss": 21.434, "step": 6467 }, { "epoch": 17.082865632221857, "grad_norm": 676.9480590820312, "learning_rate": 0.0003713160641613252, "loss": 13.4033, "step": 6468 }, { "epoch": 17.0855067679102, "grad_norm": 821.75537109375, "learning_rate": 0.0003712795344536743, "loss": 34.7965, "step": 6469 }, { "epoch": 17.088147903598546, "grad_norm": 940.9038696289062, "learning_rate": 0.0003712430013593636, "loss": 40.4904, "step": 6470 }, { "epoch": 17.090789039286893, "grad_norm": 787.1813354492188, "learning_rate": 0.0003712064648794134, "loss": 38.6341, "step": 6471 }, { "epoch": 17.09343017497524, "grad_norm": 548.838623046875, "learning_rate": 0.0003711699250148439, "loss": 39.8553, "step": 6472 }, { "epoch": 17.096071310663586, "grad_norm": 1352.975341796875, "learning_rate": 0.0003711333817666753, "loss": 40.1653, "step": 6473 }, { "epoch": 17.098712446351932, "grad_norm": 1177.8289794921875, "learning_rate": 0.00037109683513592825, "loss": 39.4858, "step": 6474 }, { "epoch": 17.10135358204028, "grad_norm": 2631.461669921875, "learning_rate": 0.00037106028512362316, "loss": 38.9216, "step": 6475 }, { "epoch": 17.10399471772862, "grad_norm": 940.5543212890625, "learning_rate": 0.0003710237317307808, "loss": 38.667, "step": 6476 }, { "epoch": 17.106635853416968, "grad_norm": 1312.68896484375, "learning_rate": 0.0003709871749584217, "loss": 39.1112, "step": 6477 }, { "epoch": 17.109276989105314, "grad_norm": 685.70654296875, "learning_rate": 0.0003709506148075669, "loss": 38.5429, "step": 6478 }, { "epoch": 17.11191812479366, "grad_norm": 691.21484375, "learning_rate": 0.0003709140512792372, "loss": 37.9394, "step": 6479 }, { "epoch": 17.114559260482007, "grad_norm": 767.6426391601562, "learning_rate": 0.00037087748437445367, "loss": 38.7012, "step": 6480 }, { "epoch": 17.117200396170354, "grad_norm": 591.9429321289062, "learning_rate": 0.0003708409140942374, "loss": 38.4717, "step": 6481 }, { "epoch": 17.1198415318587, "grad_norm": 794.4052124023438, "learning_rate": 0.0003708043404396096, "loss": 36.6979, "step": 6482 }, { "epoch": 17.122482667547047, "grad_norm": 589.2150268554688, "learning_rate": 0.0003707677634115916, "loss": 36.8682, "step": 6483 }, { "epoch": 17.12512380323539, "grad_norm": 3366.4921875, "learning_rate": 0.00037073118301120467, "loss": 36.9486, "step": 6484 }, { "epoch": 17.127764938923736, "grad_norm": 1136.068603515625, "learning_rate": 0.00037069459923947035, "loss": 38.0796, "step": 6485 }, { "epoch": 17.130406074612083, "grad_norm": 1250.4776611328125, "learning_rate": 0.0003706580120974103, "loss": 40.9765, "step": 6486 }, { "epoch": 17.13304721030043, "grad_norm": 3206.562255859375, "learning_rate": 0.0003706214215860461, "loss": 44.0283, "step": 6487 }, { "epoch": 17.135688345988775, "grad_norm": 435.8665466308594, "learning_rate": 0.0003705848277063996, "loss": 41.5952, "step": 6488 }, { "epoch": 17.138329481677122, "grad_norm": 1222.88330078125, "learning_rate": 0.00037054823045949264, "loss": 42.6781, "step": 6489 }, { "epoch": 17.14097061736547, "grad_norm": 546.1701049804688, "learning_rate": 0.00037051162984634713, "loss": 44.2578, "step": 6490 }, { "epoch": 17.143611753053815, "grad_norm": 462.7916564941406, "learning_rate": 0.00037047502586798516, "loss": 41.744, "step": 6491 }, { "epoch": 17.146252888742158, "grad_norm": 616.9076538085938, "learning_rate": 0.0003704384185254288, "loss": 42.3058, "step": 6492 }, { "epoch": 17.148894024430504, "grad_norm": 616.4176635742188, "learning_rate": 0.0003704018078197005, "loss": 46.9468, "step": 6493 }, { "epoch": 17.15153516011885, "grad_norm": 945.7684326171875, "learning_rate": 0.0003703651937518223, "loss": 43.0868, "step": 6494 }, { "epoch": 17.154176295807197, "grad_norm": 772.0945434570312, "learning_rate": 0.0003703285763228168, "loss": 43.9414, "step": 6495 }, { "epoch": 17.156817431495544, "grad_norm": 539.291015625, "learning_rate": 0.0003702919555337065, "loss": 41.7222, "step": 6496 }, { "epoch": 17.15945856718389, "grad_norm": 597.0450439453125, "learning_rate": 0.000370255331385514, "loss": 41.5909, "step": 6497 }, { "epoch": 17.162099702872236, "grad_norm": 421.08380126953125, "learning_rate": 0.00037021870387926207, "loss": 39.8778, "step": 6498 }, { "epoch": 17.16474083856058, "grad_norm": 489.2596435546875, "learning_rate": 0.0003701820730159734, "loss": 39.245, "step": 6499 }, { "epoch": 17.167381974248926, "grad_norm": 607.2840576171875, "learning_rate": 0.00037014543879667093, "loss": 39.5518, "step": 6500 }, { "epoch": 17.170023109937272, "grad_norm": 377.3681640625, "learning_rate": 0.0003701088012223777, "loss": 38.0262, "step": 6501 }, { "epoch": 17.17266424562562, "grad_norm": 425.3075256347656, "learning_rate": 0.0003700721602941168, "loss": 37.1008, "step": 6502 }, { "epoch": 17.175305381313965, "grad_norm": 448.523681640625, "learning_rate": 0.0003700355160129112, "loss": 37.4196, "step": 6503 }, { "epoch": 17.17794651700231, "grad_norm": 359.2659606933594, "learning_rate": 0.0003699988683797845, "loss": 37.305, "step": 6504 }, { "epoch": 17.180587652690658, "grad_norm": 340.94970703125, "learning_rate": 0.00036996221739575975, "loss": 38.7329, "step": 6505 }, { "epoch": 17.183228788379004, "grad_norm": 533.818603515625, "learning_rate": 0.00036992556306186065, "loss": 37.3362, "step": 6506 }, { "epoch": 17.185869924067347, "grad_norm": 623.9136352539062, "learning_rate": 0.00036988890537911057, "loss": 36.946, "step": 6507 }, { "epoch": 17.188511059755694, "grad_norm": 433.8808288574219, "learning_rate": 0.00036985224434853326, "loss": 36.217, "step": 6508 }, { "epoch": 17.19115219544404, "grad_norm": 1057.0667724609375, "learning_rate": 0.00036981557997115244, "loss": 39.1843, "step": 6509 }, { "epoch": 17.193793331132387, "grad_norm": 8245.95703125, "learning_rate": 0.00036977891224799196, "loss": 64.2495, "step": 6510 }, { "epoch": 17.196434466820733, "grad_norm": 1388.5576171875, "learning_rate": 0.00036974224118007575, "loss": 49.8281, "step": 6511 }, { "epoch": 17.19907560250908, "grad_norm": 5547.04052734375, "learning_rate": 0.0003697055667684277, "loss": 42.653, "step": 6512 }, { "epoch": 17.201716738197426, "grad_norm": 1993.6954345703125, "learning_rate": 0.000369668889014072, "loss": 33.9905, "step": 6513 }, { "epoch": 17.204357873885773, "grad_norm": 3197.77587890625, "learning_rate": 0.000369632207918033, "loss": 32.1383, "step": 6514 }, { "epoch": 17.206999009574115, "grad_norm": 1065.4794921875, "learning_rate": 0.0003695955234813348, "loss": 24.3248, "step": 6515 }, { "epoch": 17.209640145262462, "grad_norm": 1335.021728515625, "learning_rate": 0.00036955883570500187, "loss": 27.3331, "step": 6516 }, { "epoch": 17.21228128095081, "grad_norm": 759.5545043945312, "learning_rate": 0.0003695221445900586, "loss": 18.0081, "step": 6517 }, { "epoch": 17.214922416639155, "grad_norm": 9436.4501953125, "learning_rate": 0.00036948545013752975, "loss": 19.7971, "step": 6518 }, { "epoch": 17.2175635523275, "grad_norm": 3126.86572265625, "learning_rate": 0.0003694487523484399, "loss": 14.6662, "step": 6519 }, { "epoch": 17.220204688015848, "grad_norm": 926.6644287109375, "learning_rate": 0.00036941205122381377, "loss": 39.5099, "step": 6520 }, { "epoch": 17.222845823704194, "grad_norm": 493.74127197265625, "learning_rate": 0.00036937534676467635, "loss": 40.9918, "step": 6521 }, { "epoch": 17.225486959392537, "grad_norm": 719.4754028320312, "learning_rate": 0.00036933863897205233, "loss": 41.3587, "step": 6522 }, { "epoch": 17.228128095080883, "grad_norm": 1680.1060791015625, "learning_rate": 0.00036930192784696706, "loss": 39.7343, "step": 6523 }, { "epoch": 17.23076923076923, "grad_norm": 553.0454711914062, "learning_rate": 0.0003692652133904454, "loss": 40.7694, "step": 6524 }, { "epoch": 17.233410366457576, "grad_norm": 576.9655151367188, "learning_rate": 0.0003692284956035129, "loss": 39.7946, "step": 6525 }, { "epoch": 17.236051502145923, "grad_norm": 1559.5321044921875, "learning_rate": 0.00036919177448719453, "loss": 40.4184, "step": 6526 }, { "epoch": 17.23869263783427, "grad_norm": 645.9541015625, "learning_rate": 0.00036915505004251595, "loss": 38.6861, "step": 6527 }, { "epoch": 17.241333773522616, "grad_norm": 382.5498046875, "learning_rate": 0.00036911832227050264, "loss": 38.1701, "step": 6528 }, { "epoch": 17.243974909210962, "grad_norm": 314.1971740722656, "learning_rate": 0.00036908159117218006, "loss": 38.841, "step": 6529 }, { "epoch": 17.246616044899305, "grad_norm": 290.13397216796875, "learning_rate": 0.000369044856748574, "loss": 36.4511, "step": 6530 }, { "epoch": 17.24925718058765, "grad_norm": 428.7754821777344, "learning_rate": 0.0003690081190007103, "loss": 38.6752, "step": 6531 }, { "epoch": 17.251898316275998, "grad_norm": 329.28533935546875, "learning_rate": 0.0003689713779296148, "loss": 37.5111, "step": 6532 }, { "epoch": 17.254539451964344, "grad_norm": 393.7846984863281, "learning_rate": 0.0003689346335363134, "loss": 38.2696, "step": 6533 }, { "epoch": 17.25718058765269, "grad_norm": 326.7133483886719, "learning_rate": 0.00036889788582183226, "loss": 37.3582, "step": 6534 }, { "epoch": 17.259821723341037, "grad_norm": 283.45684814453125, "learning_rate": 0.0003688611347871975, "loss": 38.6834, "step": 6535 }, { "epoch": 17.262462859029384, "grad_norm": 490.17388916015625, "learning_rate": 0.0003688243804334353, "loss": 40.5906, "step": 6536 }, { "epoch": 17.26510399471773, "grad_norm": 649.6276245117188, "learning_rate": 0.0003687876227615722, "loss": 43.5296, "step": 6537 }, { "epoch": 17.267745130406073, "grad_norm": 190.6903839111328, "learning_rate": 0.00036875086177263447, "loss": 40.7443, "step": 6538 }, { "epoch": 17.27038626609442, "grad_norm": 229.76145935058594, "learning_rate": 0.0003687140974676486, "loss": 42.391, "step": 6539 }, { "epoch": 17.273027401782766, "grad_norm": 303.4750061035156, "learning_rate": 0.0003686773298476414, "loss": 42.1746, "step": 6540 }, { "epoch": 17.275668537471113, "grad_norm": 229.31558227539062, "learning_rate": 0.00036864055891363943, "loss": 42.1119, "step": 6541 }, { "epoch": 17.27830967315946, "grad_norm": 232.06492614746094, "learning_rate": 0.00036860378466666965, "loss": 43.8167, "step": 6542 }, { "epoch": 17.280950808847805, "grad_norm": 240.74276733398438, "learning_rate": 0.0003685670071077587, "loss": 44.7356, "step": 6543 }, { "epoch": 17.283591944536152, "grad_norm": 233.42977905273438, "learning_rate": 0.0003685302262379339, "loss": 42.842, "step": 6544 }, { "epoch": 17.2862330802245, "grad_norm": 130.80386352539062, "learning_rate": 0.000368493442058222, "loss": 43.3313, "step": 6545 }, { "epoch": 17.28887421591284, "grad_norm": 208.79164123535156, "learning_rate": 0.0003684566545696504, "loss": 42.637, "step": 6546 }, { "epoch": 17.291515351601188, "grad_norm": 223.19580078125, "learning_rate": 0.00036841986377324627, "loss": 41.6898, "step": 6547 }, { "epoch": 17.294156487289534, "grad_norm": 267.751953125, "learning_rate": 0.000368383069670037, "loss": 41.0698, "step": 6548 }, { "epoch": 17.29679762297788, "grad_norm": 370.7958679199219, "learning_rate": 0.0003683462722610501, "loss": 40.0298, "step": 6549 }, { "epoch": 17.299438758666227, "grad_norm": 208.1413116455078, "learning_rate": 0.00036830947154731306, "loss": 38.8389, "step": 6550 }, { "epoch": 17.302079894354573, "grad_norm": 155.56678771972656, "learning_rate": 0.00036827266752985355, "loss": 37.2426, "step": 6551 }, { "epoch": 17.30472103004292, "grad_norm": 141.07603454589844, "learning_rate": 0.0003682358602096992, "loss": 37.7498, "step": 6552 }, { "epoch": 17.307362165731263, "grad_norm": 246.36865234375, "learning_rate": 0.00036819904958787796, "loss": 38.6274, "step": 6553 }, { "epoch": 17.31000330141961, "grad_norm": 165.55001831054688, "learning_rate": 0.00036816223566541774, "loss": 36.7122, "step": 6554 }, { "epoch": 17.312644437107956, "grad_norm": 217.44979858398438, "learning_rate": 0.00036812541844334645, "loss": 36.6259, "step": 6555 }, { "epoch": 17.315285572796302, "grad_norm": 201.1334991455078, "learning_rate": 0.00036808859792269224, "loss": 36.9087, "step": 6556 }, { "epoch": 17.31792670848465, "grad_norm": 233.06597900390625, "learning_rate": 0.00036805177410448325, "loss": 37.8207, "step": 6557 }, { "epoch": 17.320567844172995, "grad_norm": 284.4275207519531, "learning_rate": 0.0003680149469897479, "loss": 38.9394, "step": 6558 }, { "epoch": 17.32320897986134, "grad_norm": 1669.48681640625, "learning_rate": 0.00036797811657951447, "loss": 53.0591, "step": 6559 }, { "epoch": 17.325850115549688, "grad_norm": 1327.9560546875, "learning_rate": 0.0003679412828748114, "loss": 74.7165, "step": 6560 }, { "epoch": 17.32849125123803, "grad_norm": 1248.0865478515625, "learning_rate": 0.0003679044458766673, "loss": 81.8575, "step": 6561 }, { "epoch": 17.331132386926377, "grad_norm": 6817.8359375, "learning_rate": 0.00036786760558611085, "loss": 79.5595, "step": 6562 }, { "epoch": 17.333773522614724, "grad_norm": 2567.724365234375, "learning_rate": 0.00036783076200417073, "loss": 73.5922, "step": 6563 }, { "epoch": 17.33641465830307, "grad_norm": 1891.3836669921875, "learning_rate": 0.00036779391513187576, "loss": 55.6444, "step": 6564 }, { "epoch": 17.339055793991417, "grad_norm": 1364.974853515625, "learning_rate": 0.00036775706497025495, "loss": 47.7392, "step": 6565 }, { "epoch": 17.341696929679763, "grad_norm": 1384.171875, "learning_rate": 0.00036772021152033725, "loss": 39.1403, "step": 6566 }, { "epoch": 17.34433806536811, "grad_norm": 1637.422607421875, "learning_rate": 0.0003676833547831518, "loss": 26.8751, "step": 6567 }, { "epoch": 17.346979201056456, "grad_norm": 979.588134765625, "learning_rate": 0.0003676464947597278, "loss": 23.8359, "step": 6568 }, { "epoch": 17.3496203367448, "grad_norm": 271.2937927246094, "learning_rate": 0.00036760963145109456, "loss": 26.5823, "step": 6569 }, { "epoch": 17.352261472433145, "grad_norm": 876.6717529296875, "learning_rate": 0.00036757276485828146, "loss": 50.6447, "step": 6570 }, { "epoch": 17.354902608121492, "grad_norm": 954.3099365234375, "learning_rate": 0.00036753589498231785, "loss": 49.1683, "step": 6571 }, { "epoch": 17.35754374380984, "grad_norm": 608.9263916015625, "learning_rate": 0.00036749902182423364, "loss": 48.1697, "step": 6572 }, { "epoch": 17.360184879498185, "grad_norm": 413.64483642578125, "learning_rate": 0.0003674621453850581, "loss": 44.5184, "step": 6573 }, { "epoch": 17.36282601518653, "grad_norm": 436.8431091308594, "learning_rate": 0.00036742526566582116, "loss": 40.0608, "step": 6574 }, { "epoch": 17.365467150874878, "grad_norm": 510.28924560546875, "learning_rate": 0.0003673883826675527, "loss": 38.6961, "step": 6575 }, { "epoch": 17.36810828656322, "grad_norm": 323.5348815917969, "learning_rate": 0.0003673514963912826, "loss": 39.95, "step": 6576 }, { "epoch": 17.370749422251567, "grad_norm": 1468.421875, "learning_rate": 0.0003673146068380409, "loss": 38.1254, "step": 6577 }, { "epoch": 17.373390557939913, "grad_norm": 328.0318298339844, "learning_rate": 0.0003672777140088577, "loss": 39.0575, "step": 6578 }, { "epoch": 17.37603169362826, "grad_norm": 736.2506103515625, "learning_rate": 0.00036724081790476325, "loss": 38.4753, "step": 6579 }, { "epoch": 17.378672829316606, "grad_norm": 420.5440673828125, "learning_rate": 0.0003672039185267878, "loss": 39.3087, "step": 6580 }, { "epoch": 17.381313965004953, "grad_norm": 408.14837646484375, "learning_rate": 0.0003671670158759618, "loss": 38.8283, "step": 6581 }, { "epoch": 17.3839551006933, "grad_norm": 1065.095458984375, "learning_rate": 0.00036713010995331573, "loss": 37.7717, "step": 6582 }, { "epoch": 17.386596236381646, "grad_norm": 599.9276733398438, "learning_rate": 0.0003670932007598801, "loss": 37.8501, "step": 6583 }, { "epoch": 17.38923737206999, "grad_norm": 353.38140869140625, "learning_rate": 0.00036705628829668567, "loss": 37.5926, "step": 6584 }, { "epoch": 17.391878507758335, "grad_norm": 323.1408996582031, "learning_rate": 0.00036701937256476317, "loss": 38.6514, "step": 6585 }, { "epoch": 17.39451964344668, "grad_norm": 1357.3509521484375, "learning_rate": 0.00036698245356514336, "loss": 40.8189, "step": 6586 }, { "epoch": 17.397160779135028, "grad_norm": 396.3932189941406, "learning_rate": 0.00036694553129885726, "loss": 41.9855, "step": 6587 }, { "epoch": 17.399801914823374, "grad_norm": 143.58274841308594, "learning_rate": 0.0003669086057669359, "loss": 40.4567, "step": 6588 }, { "epoch": 17.40244305051172, "grad_norm": 257.1139831542969, "learning_rate": 0.0003668716769704105, "loss": 41.8979, "step": 6589 }, { "epoch": 17.405084186200067, "grad_norm": 186.1780242919922, "learning_rate": 0.00036683474491031205, "loss": 40.4682, "step": 6590 }, { "epoch": 17.407725321888414, "grad_norm": 162.40318298339844, "learning_rate": 0.00036679780958767205, "loss": 42.315, "step": 6591 }, { "epoch": 17.410366457576757, "grad_norm": 181.5269012451172, "learning_rate": 0.0003667608710035218, "loss": 44.8516, "step": 6592 }, { "epoch": 17.413007593265103, "grad_norm": 115.96819305419922, "learning_rate": 0.0003667239291588928, "loss": 43.4902, "step": 6593 }, { "epoch": 17.41564872895345, "grad_norm": 139.70326232910156, "learning_rate": 0.0003666869840548167, "loss": 41.5869, "step": 6594 }, { "epoch": 17.418289864641796, "grad_norm": 201.57933044433594, "learning_rate": 0.000366650035692325, "loss": 42.407, "step": 6595 }, { "epoch": 17.420931000330143, "grad_norm": 238.6509552001953, "learning_rate": 0.00036661308407244975, "loss": 41.2269, "step": 6596 }, { "epoch": 17.42357213601849, "grad_norm": 174.07852172851562, "learning_rate": 0.0003665761291962225, "loss": 42.6327, "step": 6597 }, { "epoch": 17.426213271706835, "grad_norm": 158.5216064453125, "learning_rate": 0.0003665391710646754, "loss": 38.9, "step": 6598 }, { "epoch": 17.42885440739518, "grad_norm": 167.02548217773438, "learning_rate": 0.00036650220967884037, "loss": 39.6008, "step": 6599 }, { "epoch": 17.431495543083525, "grad_norm": 1155.5, "learning_rate": 0.0003664652450397495, "loss": 37.5132, "step": 6600 }, { "epoch": 17.431495543083525, "eval_loss": 4.886215686798096, "eval_runtime": 2.1642, "eval_samples_per_second": 228.722, "eval_steps_per_second": 28.648, "step": 6600 }, { "epoch": 17.43413667877187, "grad_norm": 242.12477111816406, "learning_rate": 0.0003664282771484352, "loss": 38.1354, "step": 6601 }, { "epoch": 17.436777814460218, "grad_norm": 230.365966796875, "learning_rate": 0.0003663913060059296, "loss": 38.7182, "step": 6602 }, { "epoch": 17.439418950148564, "grad_norm": 199.68463134765625, "learning_rate": 0.00036635433161326516, "loss": 38.3485, "step": 6603 }, { "epoch": 17.44206008583691, "grad_norm": 517.6157836914062, "learning_rate": 0.0003663173539714743, "loss": 37.4251, "step": 6604 }, { "epoch": 17.444701221525257, "grad_norm": 153.5864715576172, "learning_rate": 0.0003662803730815898, "loss": 35.9552, "step": 6605 }, { "epoch": 17.447342357213603, "grad_norm": 153.2327117919922, "learning_rate": 0.00036624338894464405, "loss": 36.7966, "step": 6606 }, { "epoch": 17.449983492901946, "grad_norm": 117.7197494506836, "learning_rate": 0.00036620640156167003, "loss": 38.1665, "step": 6607 }, { "epoch": 17.452624628590293, "grad_norm": 162.96658325195312, "learning_rate": 0.0003661694109337005, "loss": 37.174, "step": 6608 }, { "epoch": 17.45526576427864, "grad_norm": 1895.1783447265625, "learning_rate": 0.0003661324170617683, "loss": 51.1111, "step": 6609 }, { "epoch": 17.457906899966986, "grad_norm": 4365.13037109375, "learning_rate": 0.0003660954199469067, "loss": 59.2719, "step": 6610 }, { "epoch": 17.460548035655332, "grad_norm": 1228.68310546875, "learning_rate": 0.0003660584195901487, "loss": 48.9834, "step": 6611 }, { "epoch": 17.46318917134368, "grad_norm": 1004.040771484375, "learning_rate": 0.0003660214159925275, "loss": 39.188, "step": 6612 }, { "epoch": 17.465830307032025, "grad_norm": 4764.48583984375, "learning_rate": 0.0003659844091550763, "loss": 40.8434, "step": 6613 }, { "epoch": 17.46847144272037, "grad_norm": 2905.850341796875, "learning_rate": 0.00036594739907882867, "loss": 29.1096, "step": 6614 }, { "epoch": 17.471112578408714, "grad_norm": 565.9572143554688, "learning_rate": 0.00036591038576481804, "loss": 31.0683, "step": 6615 }, { "epoch": 17.47375371409706, "grad_norm": 1145.9718017578125, "learning_rate": 0.00036587336921407805, "loss": 27.9429, "step": 6616 }, { "epoch": 17.476394849785407, "grad_norm": 795.0176391601562, "learning_rate": 0.00036583634942764223, "loss": 25.2304, "step": 6617 }, { "epoch": 17.479035985473754, "grad_norm": 312.3695373535156, "learning_rate": 0.00036579932640654436, "loss": 33.4975, "step": 6618 }, { "epoch": 17.4816771211621, "grad_norm": 216.591064453125, "learning_rate": 0.0003657623001518184, "loss": 39.0664, "step": 6619 }, { "epoch": 17.484318256850447, "grad_norm": 459.3465881347656, "learning_rate": 0.00036572527066449815, "loss": 37.1663, "step": 6620 }, { "epoch": 17.486959392538793, "grad_norm": 207.27984619140625, "learning_rate": 0.0003656882379456177, "loss": 38.1179, "step": 6621 }, { "epoch": 17.489600528227136, "grad_norm": 227.9271697998047, "learning_rate": 0.0003656512019962112, "loss": 39.1033, "step": 6622 }, { "epoch": 17.492241663915483, "grad_norm": 222.419921875, "learning_rate": 0.0003656141628173128, "loss": 38.9286, "step": 6623 }, { "epoch": 17.49488279960383, "grad_norm": 207.1656494140625, "learning_rate": 0.00036557712040995687, "loss": 36.445, "step": 6624 }, { "epoch": 17.497523935292175, "grad_norm": 241.9269256591797, "learning_rate": 0.0003655400747751777, "loss": 37.565, "step": 6625 }, { "epoch": 17.500165070980522, "grad_norm": 213.9453125, "learning_rate": 0.00036550302591400986, "loss": 37.1632, "step": 6626 }, { "epoch": 17.50280620666887, "grad_norm": 334.1177673339844, "learning_rate": 0.00036546597382748776, "loss": 35.8793, "step": 6627 }, { "epoch": 17.505447342357215, "grad_norm": 958.91015625, "learning_rate": 0.00036542891851664626, "loss": 37.9947, "step": 6628 }, { "epoch": 17.50808847804556, "grad_norm": 600.90869140625, "learning_rate": 0.00036539185998252, "loss": 38.3606, "step": 6629 }, { "epoch": 17.510729613733904, "grad_norm": 351.1955261230469, "learning_rate": 0.0003653547982261438, "loss": 37.457, "step": 6630 }, { "epoch": 17.51337074942225, "grad_norm": 618.7466430664062, "learning_rate": 0.0003653177332485527, "loss": 37.6646, "step": 6631 }, { "epoch": 17.516011885110597, "grad_norm": 168.72434997558594, "learning_rate": 0.00036528066505078153, "loss": 36.9453, "step": 6632 }, { "epoch": 17.518653020798943, "grad_norm": 223.14089965820312, "learning_rate": 0.0003652435936338656, "loss": 36.607, "step": 6633 }, { "epoch": 17.52129415648729, "grad_norm": 272.23431396484375, "learning_rate": 0.00036520651899884, "loss": 36.936, "step": 6634 }, { "epoch": 17.523935292175636, "grad_norm": 170.1662139892578, "learning_rate": 0.00036516944114674, "loss": 39.1194, "step": 6635 }, { "epoch": 17.526576427863983, "grad_norm": 362.0066833496094, "learning_rate": 0.000365132360078601, "loss": 39.4322, "step": 6636 }, { "epoch": 17.52921756355233, "grad_norm": 238.90838623046875, "learning_rate": 0.00036509527579545853, "loss": 44.5159, "step": 6637 }, { "epoch": 17.531858699240672, "grad_norm": 178.69630432128906, "learning_rate": 0.00036505818829834813, "loss": 42.3151, "step": 6638 }, { "epoch": 17.53449983492902, "grad_norm": 324.4364929199219, "learning_rate": 0.00036502109758830526, "loss": 45.6614, "step": 6639 }, { "epoch": 17.537140970617365, "grad_norm": 373.7210693359375, "learning_rate": 0.000364984003666366, "loss": 43.5715, "step": 6640 }, { "epoch": 17.53978210630571, "grad_norm": 462.3702392578125, "learning_rate": 0.00036494690653356577, "loss": 46.9574, "step": 6641 }, { "epoch": 17.542423241994058, "grad_norm": 231.6293487548828, "learning_rate": 0.00036490980619094084, "loss": 43.4101, "step": 6642 }, { "epoch": 17.545064377682404, "grad_norm": 209.94525146484375, "learning_rate": 0.0003648727026395271, "loss": 44.4216, "step": 6643 }, { "epoch": 17.54770551337075, "grad_norm": 196.9573974609375, "learning_rate": 0.0003648355958803605, "loss": 42.7837, "step": 6644 }, { "epoch": 17.550346649059094, "grad_norm": 138.396240234375, "learning_rate": 0.0003647984859144774, "loss": 42.4028, "step": 6645 }, { "epoch": 17.55298778474744, "grad_norm": 162.4833984375, "learning_rate": 0.000364761372742914, "loss": 41.8353, "step": 6646 }, { "epoch": 17.555628920435787, "grad_norm": 187.60238647460938, "learning_rate": 0.0003647242563667068, "loss": 42.3584, "step": 6647 }, { "epoch": 17.558270056124133, "grad_norm": 403.9679260253906, "learning_rate": 0.00036468713678689197, "loss": 39.4772, "step": 6648 }, { "epoch": 17.56091119181248, "grad_norm": 237.93756103515625, "learning_rate": 0.00036465001400450627, "loss": 38.294, "step": 6649 }, { "epoch": 17.563552327500826, "grad_norm": 153.8393096923828, "learning_rate": 0.00036461288802058626, "loss": 37.5188, "step": 6650 }, { "epoch": 17.566193463189173, "grad_norm": 98.95536041259766, "learning_rate": 0.00036457575883616877, "loss": 37.7964, "step": 6651 }, { "epoch": 17.56883459887752, "grad_norm": 134.84262084960938, "learning_rate": 0.00036453862645229046, "loss": 36.0763, "step": 6652 }, { "epoch": 17.571475734565862, "grad_norm": 135.04296875, "learning_rate": 0.0003645014908699883, "loss": 37.095, "step": 6653 }, { "epoch": 17.57411687025421, "grad_norm": 168.73770141601562, "learning_rate": 0.00036446435209029924, "loss": 37.21, "step": 6654 }, { "epoch": 17.576758005942555, "grad_norm": 206.97830200195312, "learning_rate": 0.0003644272101142604, "loss": 36.3448, "step": 6655 }, { "epoch": 17.5793991416309, "grad_norm": 264.1046142578125, "learning_rate": 0.00036439006494290893, "loss": 37.2638, "step": 6656 }, { "epoch": 17.582040277319248, "grad_norm": 151.09466552734375, "learning_rate": 0.00036435291657728214, "loss": 37.1899, "step": 6657 }, { "epoch": 17.584681413007594, "grad_norm": 2520.65869140625, "learning_rate": 0.00036431576501841725, "loss": 73.786, "step": 6658 }, { "epoch": 17.58732254869594, "grad_norm": 1087.736328125, "learning_rate": 0.0003642786102673519, "loss": 78.9005, "step": 6659 }, { "epoch": 17.589963684384287, "grad_norm": 2553.794189453125, "learning_rate": 0.00036424145232512333, "loss": 69.2457, "step": 6660 }, { "epoch": 17.59260482007263, "grad_norm": 2603.56103515625, "learning_rate": 0.00036420429119276936, "loss": 72.297, "step": 6661 }, { "epoch": 17.595245955760976, "grad_norm": 1057.9844970703125, "learning_rate": 0.0003641671268713277, "loss": 65.1374, "step": 6662 }, { "epoch": 17.597887091449323, "grad_norm": 1999.508056640625, "learning_rate": 0.0003641299593618361, "loss": 55.7302, "step": 6663 }, { "epoch": 17.60052822713767, "grad_norm": 1279.571044921875, "learning_rate": 0.0003640927886653324, "loss": 38.0653, "step": 6664 }, { "epoch": 17.603169362826016, "grad_norm": 2683.9697265625, "learning_rate": 0.0003640556147828545, "loss": 29.4912, "step": 6665 }, { "epoch": 17.605810498514362, "grad_norm": 1231.1900634765625, "learning_rate": 0.00036401843771544074, "loss": 22.7711, "step": 6666 }, { "epoch": 17.60845163420271, "grad_norm": 440.0444030761719, "learning_rate": 0.000363981257464129, "loss": 22.7683, "step": 6667 }, { "epoch": 17.61109276989105, "grad_norm": 504.06085205078125, "learning_rate": 0.00036394407402995756, "loss": 38.7614, "step": 6668 }, { "epoch": 17.613733905579398, "grad_norm": 351.67620849609375, "learning_rate": 0.00036390688741396484, "loss": 38.4681, "step": 6669 }, { "epoch": 17.616375041267744, "grad_norm": 227.7058868408203, "learning_rate": 0.0003638696976171891, "loss": 39.2231, "step": 6670 }, { "epoch": 17.61901617695609, "grad_norm": 277.18951416015625, "learning_rate": 0.00036383250464066906, "loss": 38.261, "step": 6671 }, { "epoch": 17.621657312644437, "grad_norm": 403.1344299316406, "learning_rate": 0.0003637953084854432, "loss": 40.5141, "step": 6672 }, { "epoch": 17.624298448332784, "grad_norm": 725.3721923828125, "learning_rate": 0.0003637581091525502, "loss": 37.7105, "step": 6673 }, { "epoch": 17.62693958402113, "grad_norm": 175.7461395263672, "learning_rate": 0.00036372090664302877, "loss": 38.8464, "step": 6674 }, { "epoch": 17.629580719709477, "grad_norm": 406.10443115234375, "learning_rate": 0.0003636837009579178, "loss": 38.5515, "step": 6675 }, { "epoch": 17.63222185539782, "grad_norm": 282.6061096191406, "learning_rate": 0.00036364649209825626, "loss": 38.5701, "step": 6676 }, { "epoch": 17.634862991086166, "grad_norm": 519.997314453125, "learning_rate": 0.0003636092800650833, "loss": 37.1437, "step": 6677 }, { "epoch": 17.637504126774513, "grad_norm": 543.121337890625, "learning_rate": 0.0003635720648594379, "loss": 37.1317, "step": 6678 }, { "epoch": 17.64014526246286, "grad_norm": 292.8321228027344, "learning_rate": 0.00036353484648235924, "loss": 37.7675, "step": 6679 }, { "epoch": 17.642786398151205, "grad_norm": 276.59051513671875, "learning_rate": 0.00036349762493488667, "loss": 36.1332, "step": 6680 }, { "epoch": 17.645427533839552, "grad_norm": 268.2702331542969, "learning_rate": 0.0003634604002180597, "loss": 36.945, "step": 6681 }, { "epoch": 17.6480686695279, "grad_norm": 241.43179321289062, "learning_rate": 0.0003634231723329176, "loss": 35.934, "step": 6682 }, { "epoch": 17.650709805216245, "grad_norm": 201.91265869140625, "learning_rate": 0.00036338594128050014, "loss": 36.7432, "step": 6683 }, { "epoch": 17.653350940904588, "grad_norm": 1036.0247802734375, "learning_rate": 0.0003633487070618467, "loss": 37.2157, "step": 6684 }, { "epoch": 17.655992076592934, "grad_norm": 285.9254150390625, "learning_rate": 0.00036331146967799745, "loss": 36.1354, "step": 6685 }, { "epoch": 17.65863321228128, "grad_norm": 886.4259033203125, "learning_rate": 0.0003632742291299918, "loss": 39.9272, "step": 6686 }, { "epoch": 17.661274347969627, "grad_norm": 383.9967346191406, "learning_rate": 0.00036323698541886995, "loss": 43.1125, "step": 6687 }, { "epoch": 17.663915483657973, "grad_norm": 367.060546875, "learning_rate": 0.0003631997385456717, "loss": 39.1558, "step": 6688 }, { "epoch": 17.66655661934632, "grad_norm": 343.86199951171875, "learning_rate": 0.0003631624885114373, "loss": 40.5208, "step": 6689 }, { "epoch": 17.669197755034666, "grad_norm": 218.50442504882812, "learning_rate": 0.00036312523531720686, "loss": 39.5928, "step": 6690 }, { "epoch": 17.67183889072301, "grad_norm": 237.06875610351562, "learning_rate": 0.0003630879789640207, "loss": 40.8553, "step": 6691 }, { "epoch": 17.674480026411356, "grad_norm": 160.62376403808594, "learning_rate": 0.00036305071945291913, "loss": 42.2483, "step": 6692 }, { "epoch": 17.677121162099702, "grad_norm": 187.34136962890625, "learning_rate": 0.00036301345678494264, "loss": 42.5822, "step": 6693 }, { "epoch": 17.67976229778805, "grad_norm": 153.52503967285156, "learning_rate": 0.0003629761909611318, "loss": 43.9465, "step": 6694 }, { "epoch": 17.682403433476395, "grad_norm": 188.8361358642578, "learning_rate": 0.0003629389219825271, "loss": 41.4286, "step": 6695 }, { "epoch": 17.68504456916474, "grad_norm": 324.8582763671875, "learning_rate": 0.0003629016498501694, "loss": 40.0188, "step": 6696 }, { "epoch": 17.687685704853088, "grad_norm": 301.3804016113281, "learning_rate": 0.00036286437456509944, "loss": 39.1659, "step": 6697 }, { "epoch": 17.690326840541434, "grad_norm": 440.7856750488281, "learning_rate": 0.0003628270961283582, "loss": 38.716, "step": 6698 }, { "epoch": 17.692967976229777, "grad_norm": 171.26182556152344, "learning_rate": 0.0003627898145409865, "loss": 38.0466, "step": 6699 }, { "epoch": 17.695609111918124, "grad_norm": 147.24966430664062, "learning_rate": 0.00036275252980402545, "loss": 40.0496, "step": 6700 }, { "epoch": 17.69825024760647, "grad_norm": 172.04592895507812, "learning_rate": 0.00036271524191851636, "loss": 38.8443, "step": 6701 }, { "epoch": 17.700891383294817, "grad_norm": 186.5936279296875, "learning_rate": 0.0003626779508855002, "loss": 37.2028, "step": 6702 }, { "epoch": 17.703532518983163, "grad_norm": 168.12274169921875, "learning_rate": 0.0003626406567060185, "loss": 36.8718, "step": 6703 }, { "epoch": 17.70617365467151, "grad_norm": 217.0692596435547, "learning_rate": 0.00036260335938111273, "loss": 36.2966, "step": 6704 }, { "epoch": 17.708814790359856, "grad_norm": 204.6612548828125, "learning_rate": 0.0003625660589118242, "loss": 35.8226, "step": 6705 }, { "epoch": 17.711455926048203, "grad_norm": 319.7032165527344, "learning_rate": 0.00036252875529919464, "loss": 37.0881, "step": 6706 }, { "epoch": 17.714097061736545, "grad_norm": 230.6657257080078, "learning_rate": 0.0003624914485442657, "loss": 35.7257, "step": 6707 }, { "epoch": 17.716738197424892, "grad_norm": 323.99188232421875, "learning_rate": 0.00036245413864807914, "loss": 36.2931, "step": 6708 }, { "epoch": 17.71937933311324, "grad_norm": 1265.8885498046875, "learning_rate": 0.0003624168256116768, "loss": 49.704, "step": 6709 }, { "epoch": 17.722020468801585, "grad_norm": 1117.959716796875, "learning_rate": 0.00036237950943610053, "loss": 55.6639, "step": 6710 }, { "epoch": 17.72466160448993, "grad_norm": 2797.403564453125, "learning_rate": 0.0003623421901223927, "loss": 58.4824, "step": 6711 }, { "epoch": 17.727302740178278, "grad_norm": 995.9281005859375, "learning_rate": 0.00036230486767159503, "loss": 47.9545, "step": 6712 }, { "epoch": 17.729943875866624, "grad_norm": 1159.8555908203125, "learning_rate": 0.00036226754208474996, "loss": 47.9486, "step": 6713 }, { "epoch": 17.732585011554967, "grad_norm": 1048.470947265625, "learning_rate": 0.00036223021336289973, "loss": 41.1235, "step": 6714 }, { "epoch": 17.735226147243313, "grad_norm": 639.937255859375, "learning_rate": 0.00036219288150708673, "loss": 22.6753, "step": 6715 }, { "epoch": 17.73786728293166, "grad_norm": 1034.1812744140625, "learning_rate": 0.00036215554651835343, "loss": 21.4004, "step": 6716 }, { "epoch": 17.740508418620006, "grad_norm": 696.2310180664062, "learning_rate": 0.0003621182083977423, "loss": 21.7006, "step": 6717 }, { "epoch": 17.743149554308353, "grad_norm": 695.3160400390625, "learning_rate": 0.0003620808671462961, "loss": 15.3323, "step": 6718 }, { "epoch": 17.7457906899967, "grad_norm": 256.9900207519531, "learning_rate": 0.0003620435227650575, "loss": 26.7767, "step": 6719 }, { "epoch": 17.748431825685046, "grad_norm": 607.551513671875, "learning_rate": 0.0003620061752550694, "loss": 40.9259, "step": 6720 }, { "epoch": 17.751072961373392, "grad_norm": 454.2593688964844, "learning_rate": 0.00036196882461737466, "loss": 40.4501, "step": 6721 }, { "epoch": 17.753714097061735, "grad_norm": 535.2890625, "learning_rate": 0.0003619314708530163, "loss": 40.4378, "step": 6722 }, { "epoch": 17.75635523275008, "grad_norm": 402.7552490234375, "learning_rate": 0.0003618941139630373, "loss": 41.0718, "step": 6723 }, { "epoch": 17.758996368438428, "grad_norm": 641.4647216796875, "learning_rate": 0.0003618567539484809, "loss": 41.0953, "step": 6724 }, { "epoch": 17.761637504126774, "grad_norm": 318.3255310058594, "learning_rate": 0.0003618193908103904, "loss": 40.0841, "step": 6725 }, { "epoch": 17.76427863981512, "grad_norm": 932.9686279296875, "learning_rate": 0.000361782024549809, "loss": 37.8327, "step": 6726 }, { "epoch": 17.766919775503467, "grad_norm": 495.0477294921875, "learning_rate": 0.0003617446551677803, "loss": 36.8996, "step": 6727 }, { "epoch": 17.769560911191814, "grad_norm": 227.2021942138672, "learning_rate": 0.0003617072826653477, "loss": 39.4799, "step": 6728 }, { "epoch": 17.77220204688016, "grad_norm": 522.0311889648438, "learning_rate": 0.0003616699070435549, "loss": 37.0902, "step": 6729 }, { "epoch": 17.774843182568503, "grad_norm": 388.7218322753906, "learning_rate": 0.00036163252830344556, "loss": 37.8959, "step": 6730 }, { "epoch": 17.77748431825685, "grad_norm": 960.1210327148438, "learning_rate": 0.0003615951464460634, "loss": 37.5758, "step": 6731 }, { "epoch": 17.780125453945196, "grad_norm": 266.4852600097656, "learning_rate": 0.00036155776147245235, "loss": 38.7564, "step": 6732 }, { "epoch": 17.782766589633543, "grad_norm": 1081.9066162109375, "learning_rate": 0.00036152037338365634, "loss": 37.5546, "step": 6733 }, { "epoch": 17.78540772532189, "grad_norm": 325.35675048828125, "learning_rate": 0.00036148298218071945, "loss": 38.3746, "step": 6734 }, { "epoch": 17.788048861010235, "grad_norm": 765.4342041015625, "learning_rate": 0.0003614455878646857, "loss": 37.3146, "step": 6735 }, { "epoch": 17.790689996698582, "grad_norm": 1283.4959716796875, "learning_rate": 0.00036140819043659934, "loss": 39.9085, "step": 6736 }, { "epoch": 17.793331132386925, "grad_norm": 786.6159057617188, "learning_rate": 0.00036137078989750484, "loss": 42.3909, "step": 6737 }, { "epoch": 17.79597226807527, "grad_norm": 274.3076477050781, "learning_rate": 0.00036133338624844634, "loss": 41.5015, "step": 6738 }, { "epoch": 17.798613403763618, "grad_norm": 305.5714111328125, "learning_rate": 0.0003612959794904685, "loss": 42.8963, "step": 6739 }, { "epoch": 17.801254539451964, "grad_norm": 371.7842102050781, "learning_rate": 0.0003612585696246158, "loss": 41.4698, "step": 6740 }, { "epoch": 17.80389567514031, "grad_norm": 241.22799682617188, "learning_rate": 0.0003612211566519329, "loss": 42.199, "step": 6741 }, { "epoch": 17.806536810828657, "grad_norm": 546.8875122070312, "learning_rate": 0.0003611837405734646, "loss": 45.0797, "step": 6742 }, { "epoch": 17.809177946517003, "grad_norm": 280.2182312011719, "learning_rate": 0.0003611463213902555, "loss": 42.5861, "step": 6743 }, { "epoch": 17.81181908220535, "grad_norm": 646.7526245117188, "learning_rate": 0.00036110889910335076, "loss": 43.0033, "step": 6744 }, { "epoch": 17.814460217893693, "grad_norm": 406.7886047363281, "learning_rate": 0.00036107147371379527, "loss": 41.706, "step": 6745 }, { "epoch": 17.81710135358204, "grad_norm": 193.9873809814453, "learning_rate": 0.00036103404522263416, "loss": 40.6279, "step": 6746 }, { "epoch": 17.819742489270386, "grad_norm": 231.677001953125, "learning_rate": 0.0003609966136309125, "loss": 39.947, "step": 6747 }, { "epoch": 17.822383624958732, "grad_norm": 206.1978302001953, "learning_rate": 0.0003609591789396757, "loss": 39.2671, "step": 6748 }, { "epoch": 17.82502476064708, "grad_norm": 230.40003967285156, "learning_rate": 0.0003609217411499689, "loss": 41.2489, "step": 6749 }, { "epoch": 17.827665896335425, "grad_norm": 207.91497802734375, "learning_rate": 0.0003608843002628377, "loss": 38.8326, "step": 6750 }, { "epoch": 17.83030703202377, "grad_norm": 134.7882843017578, "learning_rate": 0.00036084685627932756, "loss": 37.2792, "step": 6751 }, { "epoch": 17.832948167712118, "grad_norm": 146.44229125976562, "learning_rate": 0.000360809409200484, "loss": 36.5133, "step": 6752 }, { "epoch": 17.83558930340046, "grad_norm": 378.0213623046875, "learning_rate": 0.00036077195902735283, "loss": 38.0971, "step": 6753 }, { "epoch": 17.838230439088807, "grad_norm": 243.63438415527344, "learning_rate": 0.0003607345057609798, "loss": 35.7685, "step": 6754 }, { "epoch": 17.840871574777154, "grad_norm": 269.5248107910156, "learning_rate": 0.00036069704940241065, "loss": 36.9252, "step": 6755 }, { "epoch": 17.8435127104655, "grad_norm": 254.8719940185547, "learning_rate": 0.00036065958995269156, "loss": 37.6887, "step": 6756 }, { "epoch": 17.846153846153847, "grad_norm": 311.7680358886719, "learning_rate": 0.0003606221274128683, "loss": 37.3419, "step": 6757 }, { "epoch": 17.848794981842193, "grad_norm": 230.00808715820312, "learning_rate": 0.0003605846617839872, "loss": 36.2397, "step": 6758 }, { "epoch": 17.85143611753054, "grad_norm": 831.8417358398438, "learning_rate": 0.0003605471930670944, "loss": 48.4481, "step": 6759 }, { "epoch": 17.854077253218883, "grad_norm": 1671.7886962890625, "learning_rate": 0.0003605097212632361, "loss": 89.8438, "step": 6760 }, { "epoch": 17.85671838890723, "grad_norm": 3119.547119140625, "learning_rate": 0.00036047224637345877, "loss": 70.3372, "step": 6761 }, { "epoch": 17.859359524595575, "grad_norm": 6317.388671875, "learning_rate": 0.00036043476839880885, "loss": 75.4821, "step": 6762 }, { "epoch": 17.862000660283922, "grad_norm": 5536.95703125, "learning_rate": 0.0003603972873403329, "loss": 58.7255, "step": 6763 }, { "epoch": 17.86464179597227, "grad_norm": 1563.7412109375, "learning_rate": 0.0003603598031990776, "loss": 61.7634, "step": 6764 }, { "epoch": 17.867282931660615, "grad_norm": 1365.02685546875, "learning_rate": 0.00036032231597608953, "loss": 42.1893, "step": 6765 }, { "epoch": 17.86992406734896, "grad_norm": 3899.211181640625, "learning_rate": 0.0003602848256724157, "loss": 30.2149, "step": 6766 }, { "epoch": 17.872565203037308, "grad_norm": 935.7935791015625, "learning_rate": 0.00036024733228910276, "loss": 23.8506, "step": 6767 }, { "epoch": 17.87520633872565, "grad_norm": 1220.3590087890625, "learning_rate": 0.000360209835827198, "loss": 17.8956, "step": 6768 }, { "epoch": 17.877847474413997, "grad_norm": 274.22760009765625, "learning_rate": 0.0003601723362877482, "loss": 34.2685, "step": 6769 }, { "epoch": 17.880488610102343, "grad_norm": 322.3304443359375, "learning_rate": 0.00036013483367180067, "loss": 38.2687, "step": 6770 }, { "epoch": 17.88312974579069, "grad_norm": 397.83929443359375, "learning_rate": 0.0003600973279804025, "loss": 39.2222, "step": 6771 }, { "epoch": 17.885770881479036, "grad_norm": 604.73876953125, "learning_rate": 0.0003600598192146013, "loss": 39.4755, "step": 6772 }, { "epoch": 17.888412017167383, "grad_norm": 345.46026611328125, "learning_rate": 0.00036002230737544415, "loss": 38.3354, "step": 6773 }, { "epoch": 17.89105315285573, "grad_norm": 382.58160400390625, "learning_rate": 0.0003599847924639788, "loss": 38.4306, "step": 6774 }, { "epoch": 17.893694288544076, "grad_norm": 233.7359161376953, "learning_rate": 0.00035994727448125267, "loss": 39.7095, "step": 6775 }, { "epoch": 17.89633542423242, "grad_norm": 355.16387939453125, "learning_rate": 0.00035990975342831355, "loss": 36.6594, "step": 6776 }, { "epoch": 17.898976559920765, "grad_norm": 289.4728088378906, "learning_rate": 0.0003598722293062091, "loss": 37.1594, "step": 6777 }, { "epoch": 17.90161769560911, "grad_norm": 310.6521301269531, "learning_rate": 0.0003598347021159871, "loss": 35.7082, "step": 6778 }, { "epoch": 17.904258831297458, "grad_norm": 429.3592834472656, "learning_rate": 0.0003597971718586957, "loss": 35.8121, "step": 6779 }, { "epoch": 17.906899966985804, "grad_norm": 177.17164611816406, "learning_rate": 0.0003597596385353827, "loss": 37.8831, "step": 6780 }, { "epoch": 17.90954110267415, "grad_norm": 312.68768310546875, "learning_rate": 0.00035972210214709627, "loss": 37.2691, "step": 6781 }, { "epoch": 17.912182238362497, "grad_norm": 199.8417510986328, "learning_rate": 0.0003596845626948846, "loss": 36.8427, "step": 6782 }, { "epoch": 17.91482337405084, "grad_norm": 201.62062072753906, "learning_rate": 0.000359647020179796, "loss": 36.827, "step": 6783 }, { "epoch": 17.917464509739187, "grad_norm": 217.96810913085938, "learning_rate": 0.0003596094746028787, "loss": 36.9537, "step": 6784 }, { "epoch": 17.920105645427533, "grad_norm": 1062.6416015625, "learning_rate": 0.00035957192596518126, "loss": 38.2758, "step": 6785 }, { "epoch": 17.92274678111588, "grad_norm": 708.5445556640625, "learning_rate": 0.00035953437426775216, "loss": 40.2889, "step": 6786 }, { "epoch": 17.925387916804226, "grad_norm": 872.9161376953125, "learning_rate": 0.0003594968195116399, "loss": 42.1491, "step": 6787 }, { "epoch": 17.928029052492573, "grad_norm": 203.32009887695312, "learning_rate": 0.0003594592616978934, "loss": 42.0431, "step": 6788 }, { "epoch": 17.93067018818092, "grad_norm": 308.3658447265625, "learning_rate": 0.0003594217008275612, "loss": 44.2682, "step": 6789 }, { "epoch": 17.933311323869265, "grad_norm": 203.06727600097656, "learning_rate": 0.0003593841369016924, "loss": 46.9899, "step": 6790 }, { "epoch": 17.93595245955761, "grad_norm": 286.99432373046875, "learning_rate": 0.00035934656992133574, "loss": 43.5035, "step": 6791 }, { "epoch": 17.938593595245955, "grad_norm": 176.90008544921875, "learning_rate": 0.0003593089998875404, "loss": 44.4862, "step": 6792 }, { "epoch": 17.9412347309343, "grad_norm": 190.05914306640625, "learning_rate": 0.0003592714268013554, "loss": 41.7694, "step": 6793 }, { "epoch": 17.943875866622648, "grad_norm": 211.97732543945312, "learning_rate": 0.0003592338506638301, "loss": 39.8782, "step": 6794 }, { "epoch": 17.946517002310994, "grad_norm": 269.2523498535156, "learning_rate": 0.00035919627147601365, "loss": 39.1988, "step": 6795 }, { "epoch": 17.94915813799934, "grad_norm": 115.005126953125, "learning_rate": 0.0003591586892389554, "loss": 35.9893, "step": 6796 }, { "epoch": 17.951799273687687, "grad_norm": 245.23194885253906, "learning_rate": 0.0003591211039537049, "loss": 36.1655, "step": 6797 }, { "epoch": 17.954440409376033, "grad_norm": 155.1767578125, "learning_rate": 0.0003590835156213117, "loss": 36.0467, "step": 6798 }, { "epoch": 17.957081545064376, "grad_norm": 150.02841186523438, "learning_rate": 0.0003590459242428254, "loss": 38.6308, "step": 6799 }, { "epoch": 17.959722680752723, "grad_norm": 534.7611694335938, "learning_rate": 0.0003590083298192957, "loss": 50.9856, "step": 6800 }, { "epoch": 17.959722680752723, "eval_loss": 5.566368103027344, "eval_runtime": 2.1288, "eval_samples_per_second": 232.522, "eval_steps_per_second": 29.124, "step": 6800 }, { "epoch": 17.96236381644107, "grad_norm": 1291.9559326171875, "learning_rate": 0.0003589707323517724, "loss": 71.9357, "step": 6801 }, { "epoch": 17.965004952129416, "grad_norm": 1902.2706298828125, "learning_rate": 0.0003589331318413055, "loss": 73.4801, "step": 6802 }, { "epoch": 17.967646087817762, "grad_norm": 1317.3876953125, "learning_rate": 0.00035889552828894484, "loss": 70.1371, "step": 6803 }, { "epoch": 17.97028722350611, "grad_norm": 1575.1959228515625, "learning_rate": 0.0003588579216957405, "loss": 67.8226, "step": 6804 }, { "epoch": 17.972928359194455, "grad_norm": 1421.208740234375, "learning_rate": 0.0003588203120627427, "loss": 59.0537, "step": 6805 }, { "epoch": 17.975569494882798, "grad_norm": 401.903564453125, "learning_rate": 0.0003587826993910015, "loss": 39.0716, "step": 6806 }, { "epoch": 17.978210630571144, "grad_norm": 324.5088806152344, "learning_rate": 0.0003587450836815674, "loss": 38.3112, "step": 6807 }, { "epoch": 17.98085176625949, "grad_norm": 403.770263671875, "learning_rate": 0.0003587074649354907, "loss": 36.7715, "step": 6808 }, { "epoch": 17.983492901947837, "grad_norm": 161.16368103027344, "learning_rate": 0.0003586698431538219, "loss": 36.8602, "step": 6809 }, { "epoch": 17.986134037636184, "grad_norm": 231.15277099609375, "learning_rate": 0.0003586322183376116, "loss": 36.8651, "step": 6810 }, { "epoch": 17.98877517332453, "grad_norm": 139.40467834472656, "learning_rate": 0.00035859459048791036, "loss": 38.1092, "step": 6811 }, { "epoch": 17.991416309012877, "grad_norm": 204.84622192382812, "learning_rate": 0.0003585569596057691, "loss": 36.4068, "step": 6812 }, { "epoch": 17.994057444701223, "grad_norm": 194.83859252929688, "learning_rate": 0.0003585193256922384, "loss": 36.2764, "step": 6813 }, { "epoch": 17.996698580389566, "grad_norm": 146.0400390625, "learning_rate": 0.0003584816887483693, "loss": 36.4634, "step": 6814 }, { "epoch": 17.999339716077912, "grad_norm": 164.12789916992188, "learning_rate": 0.0003584440487752127, "loss": 39.239, "step": 6815 }, { "epoch": 18.00198085176626, "grad_norm": 631.5355834960938, "learning_rate": 0.00035840640577381984, "loss": 44.9406, "step": 6816 }, { "epoch": 18.004621987454605, "grad_norm": 357.58978271484375, "learning_rate": 0.00035836875974524173, "loss": 43.8281, "step": 6817 }, { "epoch": 18.007263123142952, "grad_norm": 388.47308349609375, "learning_rate": 0.0003583311106905297, "loss": 43.2339, "step": 6818 }, { "epoch": 18.0099042588313, "grad_norm": 409.18072509765625, "learning_rate": 0.00035829345861073503, "loss": 42.4296, "step": 6819 }, { "epoch": 18.012545394519645, "grad_norm": 1070.8143310546875, "learning_rate": 0.0003582558035069091, "loss": 44.1032, "step": 6820 }, { "epoch": 18.01518653020799, "grad_norm": 198.16184997558594, "learning_rate": 0.00035821814538010356, "loss": 44.0584, "step": 6821 }, { "epoch": 18.017827665896334, "grad_norm": 202.88565063476562, "learning_rate": 0.0003581804842313697, "loss": 42.8242, "step": 6822 }, { "epoch": 18.02046880158468, "grad_norm": 236.40310668945312, "learning_rate": 0.00035814282006175953, "loss": 42.5458, "step": 6823 }, { "epoch": 18.023109937273027, "grad_norm": 102.9897232055664, "learning_rate": 0.00035810515287232455, "loss": 40.2088, "step": 6824 }, { "epoch": 18.025751072961373, "grad_norm": 123.54155731201172, "learning_rate": 0.0003580674826641167, "loss": 42.1362, "step": 6825 }, { "epoch": 18.02839220864972, "grad_norm": 154.16006469726562, "learning_rate": 0.0003580298094381879, "loss": 40.3762, "step": 6826 }, { "epoch": 18.031033344338066, "grad_norm": 256.3985900878906, "learning_rate": 0.0003579921331955901, "loss": 39.0134, "step": 6827 }, { "epoch": 18.033674480026413, "grad_norm": 116.47545623779297, "learning_rate": 0.0003579544539373755, "loss": 38.9259, "step": 6828 }, { "epoch": 18.036315615714756, "grad_norm": 157.91561889648438, "learning_rate": 0.0003579167716645961, "loss": 37.8025, "step": 6829 }, { "epoch": 18.038956751403102, "grad_norm": 225.67893981933594, "learning_rate": 0.0003578790863783043, "loss": 36.2675, "step": 6830 }, { "epoch": 18.04159788709145, "grad_norm": 440.8684997558594, "learning_rate": 0.0003578413980795523, "loss": 37.5669, "step": 6831 }, { "epoch": 18.044239022779795, "grad_norm": 203.8704071044922, "learning_rate": 0.00035780370676939265, "loss": 35.8424, "step": 6832 }, { "epoch": 18.04688015846814, "grad_norm": 113.63232421875, "learning_rate": 0.00035776601244887784, "loss": 36.6338, "step": 6833 }, { "epoch": 18.049521294156488, "grad_norm": 314.9732971191406, "learning_rate": 0.00035772831511906043, "loss": 35.8847, "step": 6834 }, { "epoch": 18.052162429844834, "grad_norm": 177.2148895263672, "learning_rate": 0.0003576906147809931, "loss": 36.4636, "step": 6835 }, { "epoch": 18.05480356553318, "grad_norm": 123.76679992675781, "learning_rate": 0.00035765291143572867, "loss": 36.3475, "step": 6836 }, { "epoch": 18.057444701221524, "grad_norm": 469.4510498046875, "learning_rate": 0.00035761520508431984, "loss": 58.9816, "step": 6837 }, { "epoch": 18.06008583690987, "grad_norm": 1663.212158203125, "learning_rate": 0.00035757749572781967, "loss": 60.8304, "step": 6838 }, { "epoch": 18.062726972598217, "grad_norm": 1733.037109375, "learning_rate": 0.0003575397833672811, "loss": 63.9468, "step": 6839 }, { "epoch": 18.065368108286563, "grad_norm": 3836.931884765625, "learning_rate": 0.0003575020680037573, "loss": 58.5668, "step": 6840 }, { "epoch": 18.06800924397491, "grad_norm": 2243.48828125, "learning_rate": 0.00035746434963830127, "loss": 57.6757, "step": 6841 }, { "epoch": 18.070650379663256, "grad_norm": 1307.3631591796875, "learning_rate": 0.00035742662827196656, "loss": 46.9142, "step": 6842 }, { "epoch": 18.073291515351602, "grad_norm": 2321.37744140625, "learning_rate": 0.0003573889039058063, "loss": 43.8836, "step": 6843 }, { "epoch": 18.07593265103995, "grad_norm": 3470.0537109375, "learning_rate": 0.000357351176540874, "loss": 38.2977, "step": 6844 }, { "epoch": 18.078573786728292, "grad_norm": 3045.449951171875, "learning_rate": 0.0003573134461782231, "loss": 29.9276, "step": 6845 }, { "epoch": 18.08121492241664, "grad_norm": 854.6876220703125, "learning_rate": 0.00035727571281890725, "loss": 28.3945, "step": 6846 }, { "epoch": 18.083856058104985, "grad_norm": 483.57415771484375, "learning_rate": 0.00035723797646398025, "loss": 18.7852, "step": 6847 }, { "epoch": 18.08649719379333, "grad_norm": 183.7161865234375, "learning_rate": 0.0003572002371144957, "loss": 36.334, "step": 6848 }, { "epoch": 18.089138329481678, "grad_norm": 390.69708251953125, "learning_rate": 0.0003571624947715074, "loss": 38.7353, "step": 6849 }, { "epoch": 18.091779465170024, "grad_norm": 274.8089904785156, "learning_rate": 0.00035712474943606947, "loss": 37.5313, "step": 6850 }, { "epoch": 18.09442060085837, "grad_norm": 205.32749938964844, "learning_rate": 0.0003570870011092359, "loss": 38.9569, "step": 6851 }, { "epoch": 18.097061736546713, "grad_norm": 205.3043975830078, "learning_rate": 0.00035704924979206064, "loss": 36.8814, "step": 6852 }, { "epoch": 18.09970287223506, "grad_norm": 242.52439880371094, "learning_rate": 0.000357011495485598, "loss": 37.5364, "step": 6853 }, { "epoch": 18.102344007923406, "grad_norm": 230.64569091796875, "learning_rate": 0.0003569737381909022, "loss": 37.7244, "step": 6854 }, { "epoch": 18.104985143611753, "grad_norm": 241.2430419921875, "learning_rate": 0.0003569359779090276, "loss": 37.2663, "step": 6855 }, { "epoch": 18.1076262793001, "grad_norm": 187.9485321044922, "learning_rate": 0.00035689821464102866, "loss": 37.2066, "step": 6856 }, { "epoch": 18.110267414988446, "grad_norm": 248.00555419921875, "learning_rate": 0.0003568604483879598, "loss": 36.3791, "step": 6857 }, { "epoch": 18.112908550676792, "grad_norm": 387.362548828125, "learning_rate": 0.00035682267915087575, "loss": 36.802, "step": 6858 }, { "epoch": 18.11554968636514, "grad_norm": 183.8302764892578, "learning_rate": 0.0003567849069308311, "loss": 36.4163, "step": 6859 }, { "epoch": 18.11819082205348, "grad_norm": 219.79415893554688, "learning_rate": 0.0003567471317288807, "loss": 36.5999, "step": 6860 }, { "epoch": 18.120831957741828, "grad_norm": 210.93067932128906, "learning_rate": 0.0003567093535460794, "loss": 35.9816, "step": 6861 }, { "epoch": 18.123473093430174, "grad_norm": 387.7620849609375, "learning_rate": 0.00035667157238348195, "loss": 36.2811, "step": 6862 }, { "epoch": 18.12611422911852, "grad_norm": 330.0669860839844, "learning_rate": 0.0003566337882421436, "loss": 37.4326, "step": 6863 }, { "epoch": 18.128755364806867, "grad_norm": 287.55633544921875, "learning_rate": 0.0003565960011231194, "loss": 36.6802, "step": 6864 }, { "epoch": 18.131396500495214, "grad_norm": 727.4823608398438, "learning_rate": 0.0003565582110274643, "loss": 38.8991, "step": 6865 }, { "epoch": 18.13403763618356, "grad_norm": 317.29876708984375, "learning_rate": 0.0003565204179562339, "loss": 44.5156, "step": 6866 }, { "epoch": 18.136678771871907, "grad_norm": 280.68731689453125, "learning_rate": 0.0003564826219104834, "loss": 44.0399, "step": 6867 }, { "epoch": 18.13931990756025, "grad_norm": 324.62347412109375, "learning_rate": 0.0003564448228912682, "loss": 45.4812, "step": 6868 }, { "epoch": 18.141961043248596, "grad_norm": 313.6701965332031, "learning_rate": 0.0003564070208996438, "loss": 43.7557, "step": 6869 }, { "epoch": 18.144602178936942, "grad_norm": 232.26295471191406, "learning_rate": 0.00035636921593666594, "loss": 43.6702, "step": 6870 }, { "epoch": 18.14724331462529, "grad_norm": 147.8546600341797, "learning_rate": 0.0003563314080033901, "loss": 42.6332, "step": 6871 }, { "epoch": 18.149884450313635, "grad_norm": 284.3149719238281, "learning_rate": 0.00035629359710087226, "loss": 43.8956, "step": 6872 }, { "epoch": 18.152525586001982, "grad_norm": 173.73773193359375, "learning_rate": 0.00035625578323016815, "loss": 41.3013, "step": 6873 }, { "epoch": 18.15516672169033, "grad_norm": 172.55108642578125, "learning_rate": 0.00035621796639233365, "loss": 44.7498, "step": 6874 }, { "epoch": 18.15780785737867, "grad_norm": 214.43441772460938, "learning_rate": 0.0003561801465884249, "loss": 40.9321, "step": 6875 }, { "epoch": 18.160448993067018, "grad_norm": 248.9014129638672, "learning_rate": 0.0003561423238194979, "loss": 41.2586, "step": 6876 }, { "epoch": 18.163090128755364, "grad_norm": 231.90011596679688, "learning_rate": 0.00035610449808660884, "loss": 40.1182, "step": 6877 }, { "epoch": 18.16573126444371, "grad_norm": 1222.6473388671875, "learning_rate": 0.00035606666939081403, "loss": 39.0789, "step": 6878 }, { "epoch": 18.168372400132057, "grad_norm": 173.5012664794922, "learning_rate": 0.00035602883773316977, "loss": 36.7763, "step": 6879 }, { "epoch": 18.171013535820403, "grad_norm": 242.12173461914062, "learning_rate": 0.00035599100311473254, "loss": 37.3597, "step": 6880 }, { "epoch": 18.17365467150875, "grad_norm": 236.89012145996094, "learning_rate": 0.0003559531655365587, "loss": 37.565, "step": 6881 }, { "epoch": 18.176295807197096, "grad_norm": 203.7154998779297, "learning_rate": 0.00035591532499970515, "loss": 38.2311, "step": 6882 }, { "epoch": 18.17893694288544, "grad_norm": 378.60455322265625, "learning_rate": 0.0003558774815052282, "loss": 36.9072, "step": 6883 }, { "epoch": 18.181578078573786, "grad_norm": 434.11163330078125, "learning_rate": 0.0003558396350541848, "loss": 37.3737, "step": 6884 }, { "epoch": 18.184219214262132, "grad_norm": 181.6985626220703, "learning_rate": 0.0003558017856476318, "loss": 36.3617, "step": 6885 }, { "epoch": 18.18686034995048, "grad_norm": 636.874755859375, "learning_rate": 0.00035576393328662603, "loss": 36.707, "step": 6886 }, { "epoch": 18.189501485638825, "grad_norm": 235.62489318847656, "learning_rate": 0.00035572607797222465, "loss": 38.0988, "step": 6887 }, { "epoch": 18.19214262132717, "grad_norm": 448.736572265625, "learning_rate": 0.00035568821970548455, "loss": 41.3605, "step": 6888 }, { "epoch": 18.194783757015518, "grad_norm": 3530.2080078125, "learning_rate": 0.0003556503584874631, "loss": 101.422, "step": 6889 }, { "epoch": 18.197424892703864, "grad_norm": 4550.28955078125, "learning_rate": 0.0003556124943192174, "loss": 82.7256, "step": 6890 }, { "epoch": 18.200066028392207, "grad_norm": 2786.838134765625, "learning_rate": 0.0003555746272018048, "loss": 77.8596, "step": 6891 }, { "epoch": 18.202707164080554, "grad_norm": 8984.005859375, "learning_rate": 0.00035553675713628275, "loss": 82.1583, "step": 6892 }, { "epoch": 18.2053482997689, "grad_norm": 3159.243408203125, "learning_rate": 0.0003554988841237088, "loss": 62.5281, "step": 6893 }, { "epoch": 18.207989435457247, "grad_norm": 7276.3349609375, "learning_rate": 0.0003554610081651405, "loss": 63.8743, "step": 6894 }, { "epoch": 18.210630571145593, "grad_norm": 2656.485595703125, "learning_rate": 0.00035542312926163547, "loss": 63.6764, "step": 6895 }, { "epoch": 18.21327170683394, "grad_norm": 2886.900146484375, "learning_rate": 0.00035538524741425147, "loss": 58.8042, "step": 6896 }, { "epoch": 18.215912842522286, "grad_norm": 1570.2662353515625, "learning_rate": 0.00035534736262404635, "loss": 42.809, "step": 6897 }, { "epoch": 18.21855397821063, "grad_norm": 444.33306884765625, "learning_rate": 0.000355309474892078, "loss": 38.1848, "step": 6898 }, { "epoch": 18.221195113898975, "grad_norm": 605.4149169921875, "learning_rate": 0.00035527158421940446, "loss": 39.1619, "step": 6899 }, { "epoch": 18.223836249587322, "grad_norm": 1203.3763427734375, "learning_rate": 0.0003552336906070838, "loss": 38.3942, "step": 6900 }, { "epoch": 18.22647738527567, "grad_norm": 248.7318115234375, "learning_rate": 0.0003551957940561741, "loss": 36.0862, "step": 6901 }, { "epoch": 18.229118520964015, "grad_norm": 1034.677490234375, "learning_rate": 0.0003551578945677336, "loss": 37.1222, "step": 6902 }, { "epoch": 18.23175965665236, "grad_norm": 365.07220458984375, "learning_rate": 0.0003551199921428207, "loss": 37.2169, "step": 6903 }, { "epoch": 18.234400792340708, "grad_norm": 273.85113525390625, "learning_rate": 0.00035508208678249384, "loss": 36.9332, "step": 6904 }, { "epoch": 18.237041928029054, "grad_norm": 347.1279296875, "learning_rate": 0.0003550441784878114, "loss": 37.1745, "step": 6905 }, { "epoch": 18.239683063717397, "grad_norm": 403.3643798828125, "learning_rate": 0.00035500626725983194, "loss": 37.3944, "step": 6906 }, { "epoch": 18.242324199405743, "grad_norm": 724.751953125, "learning_rate": 0.0003549683530996141, "loss": 36.7942, "step": 6907 }, { "epoch": 18.24496533509409, "grad_norm": 249.7607421875, "learning_rate": 0.00035493043600821683, "loss": 37.3926, "step": 6908 }, { "epoch": 18.247606470782436, "grad_norm": 218.297607421875, "learning_rate": 0.0003548925159866986, "loss": 36.6395, "step": 6909 }, { "epoch": 18.250247606470783, "grad_norm": 1290.4632568359375, "learning_rate": 0.0003548545930361186, "loss": 36.8786, "step": 6910 }, { "epoch": 18.25288874215913, "grad_norm": 277.5931091308594, "learning_rate": 0.00035481666715753564, "loss": 37.6814, "step": 6911 }, { "epoch": 18.255529877847476, "grad_norm": 323.2451477050781, "learning_rate": 0.0003547787383520088, "loss": 36.2141, "step": 6912 }, { "epoch": 18.258171013535822, "grad_norm": 586.3724365234375, "learning_rate": 0.00035474080662059734, "loss": 39.2884, "step": 6913 }, { "epoch": 18.260812149224165, "grad_norm": 413.8885192871094, "learning_rate": 0.00035470287196436023, "loss": 37.9647, "step": 6914 }, { "epoch": 18.26345328491251, "grad_norm": 1329.39013671875, "learning_rate": 0.00035466493438435703, "loss": 42.4568, "step": 6915 }, { "epoch": 18.266094420600858, "grad_norm": 348.70367431640625, "learning_rate": 0.000354626993881647, "loss": 41.7312, "step": 6916 }, { "epoch": 18.268735556289204, "grad_norm": 246.65879821777344, "learning_rate": 0.00035458905045728965, "loss": 40.7216, "step": 6917 }, { "epoch": 18.27137669197755, "grad_norm": 351.3370361328125, "learning_rate": 0.0003545511041123445, "loss": 42.9411, "step": 6918 }, { "epoch": 18.274017827665897, "grad_norm": 241.33447265625, "learning_rate": 0.00035451315484787116, "loss": 43.1389, "step": 6919 }, { "epoch": 18.276658963354244, "grad_norm": 176.2541961669922, "learning_rate": 0.0003544752026649293, "loss": 43.5298, "step": 6920 }, { "epoch": 18.279300099042587, "grad_norm": 248.26272583007812, "learning_rate": 0.00035443724756457884, "loss": 43.7334, "step": 6921 }, { "epoch": 18.281941234730933, "grad_norm": 218.22906494140625, "learning_rate": 0.00035439928954787963, "loss": 44.9163, "step": 6922 }, { "epoch": 18.28458237041928, "grad_norm": 199.7729034423828, "learning_rate": 0.00035436132861589153, "loss": 40.2226, "step": 6923 }, { "epoch": 18.287223506107626, "grad_norm": 270.58258056640625, "learning_rate": 0.0003543233647696747, "loss": 40.2461, "step": 6924 }, { "epoch": 18.289864641795972, "grad_norm": 194.68426513671875, "learning_rate": 0.00035428539801028914, "loss": 41.3082, "step": 6925 }, { "epoch": 18.29250577748432, "grad_norm": 265.69390869140625, "learning_rate": 0.00035424742833879507, "loss": 41.0311, "step": 6926 }, { "epoch": 18.295146913172665, "grad_norm": 165.96749877929688, "learning_rate": 0.00035420945575625286, "loss": 40.3872, "step": 6927 }, { "epoch": 18.297788048861012, "grad_norm": 208.12710571289062, "learning_rate": 0.0003541714802637228, "loss": 39.2136, "step": 6928 }, { "epoch": 18.300429184549355, "grad_norm": 424.5712585449219, "learning_rate": 0.00035413350186226536, "loss": 38.6019, "step": 6929 }, { "epoch": 18.3030703202377, "grad_norm": 123.80462646484375, "learning_rate": 0.00035409552055294105, "loss": 37.5837, "step": 6930 }, { "epoch": 18.305711455926048, "grad_norm": 198.50433349609375, "learning_rate": 0.0003540575363368105, "loss": 38.0456, "step": 6931 }, { "epoch": 18.308352591614394, "grad_norm": 277.8392639160156, "learning_rate": 0.0003540195492149343, "loss": 38.5212, "step": 6932 }, { "epoch": 18.31099372730274, "grad_norm": 460.8122863769531, "learning_rate": 0.00035398155918837324, "loss": 36.336, "step": 6933 }, { "epoch": 18.313634862991087, "grad_norm": 164.77333068847656, "learning_rate": 0.0003539435662581884, "loss": 37.0162, "step": 6934 }, { "epoch": 18.316275998679433, "grad_norm": 155.77149963378906, "learning_rate": 0.0003539055704254404, "loss": 38.1309, "step": 6935 }, { "epoch": 18.31891713436778, "grad_norm": 258.7120056152344, "learning_rate": 0.0003538675716911905, "loss": 38.2281, "step": 6936 }, { "epoch": 18.321558270056123, "grad_norm": 554.3073120117188, "learning_rate": 0.0003538295700564995, "loss": 52.7907, "step": 6937 }, { "epoch": 18.32419940574447, "grad_norm": 2804.981201171875, "learning_rate": 0.0003537915655224289, "loss": 65.6005, "step": 6938 }, { "epoch": 18.326840541432816, "grad_norm": 1308.640380859375, "learning_rate": 0.0003537535580900398, "loss": 66.8493, "step": 6939 }, { "epoch": 18.329481677121162, "grad_norm": 2200.672607421875, "learning_rate": 0.00035371554776039346, "loss": 54.8234, "step": 6940 }, { "epoch": 18.33212281280951, "grad_norm": 1312.39501953125, "learning_rate": 0.00035367753453455144, "loss": 55.465, "step": 6941 }, { "epoch": 18.334763948497855, "grad_norm": 1147.3958740234375, "learning_rate": 0.00035363951841357514, "loss": 45.0051, "step": 6942 }, { "epoch": 18.3374050841862, "grad_norm": 1506.6881103515625, "learning_rate": 0.00035360149939852616, "loss": 36.0527, "step": 6943 }, { "epoch": 18.340046219874544, "grad_norm": 1723.6514892578125, "learning_rate": 0.00035356347749046616, "loss": 30.6631, "step": 6944 }, { "epoch": 18.34268735556289, "grad_norm": 3002.165771484375, "learning_rate": 0.00035352545269045694, "loss": 27.8082, "step": 6945 }, { "epoch": 18.345328491251237, "grad_norm": 815.0447998046875, "learning_rate": 0.0003534874249995602, "loss": 26.5221, "step": 6946 }, { "epoch": 18.347969626939584, "grad_norm": 332.1131896972656, "learning_rate": 0.00035344939441883796, "loss": 39.0715, "step": 6947 }, { "epoch": 18.35061076262793, "grad_norm": 498.4893493652344, "learning_rate": 0.0003534113609493521, "loss": 40.3316, "step": 6948 }, { "epoch": 18.353251898316277, "grad_norm": 421.45843505859375, "learning_rate": 0.00035337332459216476, "loss": 38.436, "step": 6949 }, { "epoch": 18.355893034004623, "grad_norm": 281.367919921875, "learning_rate": 0.0003533352853483381, "loss": 40.1111, "step": 6950 }, { "epoch": 18.35853416969297, "grad_norm": 433.804931640625, "learning_rate": 0.00035329724321893417, "loss": 39.0633, "step": 6951 }, { "epoch": 18.361175305381312, "grad_norm": 668.336181640625, "learning_rate": 0.00035325919820501563, "loss": 37.894, "step": 6952 }, { "epoch": 18.36381644106966, "grad_norm": 225.5714111328125, "learning_rate": 0.00035322115030764445, "loss": 36.7285, "step": 6953 }, { "epoch": 18.366457576758005, "grad_norm": 969.1740112304688, "learning_rate": 0.0003531830995278833, "loss": 37.1415, "step": 6954 }, { "epoch": 18.369098712446352, "grad_norm": 250.05377197265625, "learning_rate": 0.00035314504586679476, "loss": 36.7212, "step": 6955 }, { "epoch": 18.3717398481347, "grad_norm": 244.6406707763672, "learning_rate": 0.0003531069893254414, "loss": 37.8722, "step": 6956 }, { "epoch": 18.374380983823045, "grad_norm": 234.4036865234375, "learning_rate": 0.00035306892990488595, "loss": 38.185, "step": 6957 }, { "epoch": 18.37702211951139, "grad_norm": 698.1635131835938, "learning_rate": 0.0003530308676061911, "loss": 36.777, "step": 6958 }, { "epoch": 18.379663255199738, "grad_norm": 415.6015625, "learning_rate": 0.00035299280243041985, "loss": 37.6723, "step": 6959 }, { "epoch": 18.38230439088808, "grad_norm": 770.2377319335938, "learning_rate": 0.000352954734378635, "loss": 37.1007, "step": 6960 }, { "epoch": 18.384945526576427, "grad_norm": 489.6528015136719, "learning_rate": 0.0003529166634518998, "loss": 38.0713, "step": 6961 }, { "epoch": 18.387586662264773, "grad_norm": 411.6640930175781, "learning_rate": 0.00035287858965127723, "loss": 36.6829, "step": 6962 }, { "epoch": 18.39022779795312, "grad_norm": 235.38441467285156, "learning_rate": 0.0003528405129778304, "loss": 38.3439, "step": 6963 }, { "epoch": 18.392868933641466, "grad_norm": 428.1925964355469, "learning_rate": 0.0003528024334326227, "loss": 37.7441, "step": 6964 }, { "epoch": 18.395510069329813, "grad_norm": 1388.1834716796875, "learning_rate": 0.00035276435101671747, "loss": 41.667, "step": 6965 }, { "epoch": 18.39815120501816, "grad_norm": 2890.976806640625, "learning_rate": 0.00035272626573117804, "loss": 43.372, "step": 6966 }, { "epoch": 18.400792340706502, "grad_norm": 195.9245147705078, "learning_rate": 0.000352688177577068, "loss": 42.9915, "step": 6967 }, { "epoch": 18.40343347639485, "grad_norm": 347.4018249511719, "learning_rate": 0.0003526500865554509, "loss": 44.2734, "step": 6968 }, { "epoch": 18.406074612083195, "grad_norm": 313.5688171386719, "learning_rate": 0.0003526119926673905, "loss": 42.7206, "step": 6969 }, { "epoch": 18.40871574777154, "grad_norm": 461.7015380859375, "learning_rate": 0.0003525738959139504, "loss": 46.5509, "step": 6970 }, { "epoch": 18.411356883459888, "grad_norm": 219.12380981445312, "learning_rate": 0.0003525357962961946, "loss": 45.056, "step": 6971 }, { "epoch": 18.413998019148234, "grad_norm": 383.597900390625, "learning_rate": 0.0003524976938151868, "loss": 45.1339, "step": 6972 }, { "epoch": 18.41663915483658, "grad_norm": 410.34027099609375, "learning_rate": 0.0003524595884719912, "loss": 43.0221, "step": 6973 }, { "epoch": 18.419280290524927, "grad_norm": 588.153076171875, "learning_rate": 0.00035242148026767183, "loss": 42.5632, "step": 6974 }, { "epoch": 18.42192142621327, "grad_norm": 385.1394958496094, "learning_rate": 0.0003523833692032927, "loss": 43.7019, "step": 6975 }, { "epoch": 18.424562561901617, "grad_norm": 366.5780944824219, "learning_rate": 0.0003523452552799182, "loss": 39.4713, "step": 6976 }, { "epoch": 18.427203697589963, "grad_norm": 331.96002197265625, "learning_rate": 0.0003523071384986125, "loss": 41.4555, "step": 6977 }, { "epoch": 18.42984483327831, "grad_norm": 268.1607360839844, "learning_rate": 0.0003522690188604401, "loss": 40.7645, "step": 6978 }, { "epoch": 18.432485968966656, "grad_norm": 408.1178894042969, "learning_rate": 0.0003522308963664654, "loss": 38.3216, "step": 6979 }, { "epoch": 18.435127104655002, "grad_norm": 244.3515625, "learning_rate": 0.000352192771017753, "loss": 37.1754, "step": 6980 }, { "epoch": 18.43776824034335, "grad_norm": 304.376708984375, "learning_rate": 0.0003521546428153674, "loss": 37.3318, "step": 6981 }, { "epoch": 18.440409376031695, "grad_norm": 331.4186706542969, "learning_rate": 0.00035211651176037353, "loss": 37.0456, "step": 6982 }, { "epoch": 18.44305051172004, "grad_norm": 331.3724670410156, "learning_rate": 0.00035207837785383603, "loss": 37.5804, "step": 6983 }, { "epoch": 18.445691647408385, "grad_norm": 394.400146484375, "learning_rate": 0.0003520402410968198, "loss": 36.7892, "step": 6984 }, { "epoch": 18.44833278309673, "grad_norm": 234.19064331054688, "learning_rate": 0.0003520021014903897, "loss": 37.0848, "step": 6985 }, { "epoch": 18.450973918785078, "grad_norm": 297.3032531738281, "learning_rate": 0.00035196395903561086, "loss": 36.1567, "step": 6986 }, { "epoch": 18.453615054473424, "grad_norm": 391.7376708984375, "learning_rate": 0.0003519258137335484, "loss": 36.5213, "step": 6987 }, { "epoch": 18.45625619016177, "grad_norm": 1048.1436767578125, "learning_rate": 0.00035188766558526746, "loss": 36.8084, "step": 6988 }, { "epoch": 18.458897325850117, "grad_norm": 1702.7213134765625, "learning_rate": 0.0003518495145918333, "loss": 58.5904, "step": 6989 }, { "epoch": 18.46153846153846, "grad_norm": 5953.7333984375, "learning_rate": 0.0003518113607543113, "loss": 56.3283, "step": 6990 }, { "epoch": 18.464179597226806, "grad_norm": 2980.637451171875, "learning_rate": 0.00035177320407376683, "loss": 51.9635, "step": 6991 }, { "epoch": 18.466820732915153, "grad_norm": 2088.228271484375, "learning_rate": 0.00035173504455126545, "loss": 53.4392, "step": 6992 }, { "epoch": 18.4694618686035, "grad_norm": 7976.4453125, "learning_rate": 0.00035169688218787264, "loss": 48.4511, "step": 6993 }, { "epoch": 18.472103004291846, "grad_norm": 35054.29296875, "learning_rate": 0.0003516587169846542, "loss": 33.5935, "step": 6994 }, { "epoch": 18.474744139980192, "grad_norm": 2178.1201171875, "learning_rate": 0.0003516205489426758, "loss": 33.1685, "step": 6995 }, { "epoch": 18.47738527566854, "grad_norm": 3688.824951171875, "learning_rate": 0.00035158237806300324, "loss": 30.1757, "step": 6996 }, { "epoch": 18.480026411356885, "grad_norm": 3331.49658203125, "learning_rate": 0.0003515442043467025, "loss": 33.9987, "step": 6997 }, { "epoch": 18.482667547045228, "grad_norm": 784.1060180664062, "learning_rate": 0.0003515060277948394, "loss": 31.7624, "step": 6998 }, { "epoch": 18.485308682733574, "grad_norm": 486.7588195800781, "learning_rate": 0.00035146784840848024, "loss": 37.0432, "step": 6999 }, { "epoch": 18.48794981842192, "grad_norm": 704.999755859375, "learning_rate": 0.0003514296661886909, "loss": 36.4944, "step": 7000 }, { "epoch": 18.48794981842192, "eval_loss": 4.217146873474121, "eval_runtime": 2.2601, "eval_samples_per_second": 219.02, "eval_steps_per_second": 27.433, "step": 7000 }, { "epoch": 18.490590954110267, "grad_norm": 267.484619140625, "learning_rate": 0.0003513914811365377, "loss": 36.8413, "step": 7001 }, { "epoch": 18.493232089798614, "grad_norm": 362.4431457519531, "learning_rate": 0.00035135329325308706, "loss": 39.468, "step": 7002 }, { "epoch": 18.49587322548696, "grad_norm": 547.4967041015625, "learning_rate": 0.0003513151025394052, "loss": 37.1076, "step": 7003 }, { "epoch": 18.498514361175307, "grad_norm": 413.61212158203125, "learning_rate": 0.00035127690899655866, "loss": 36.9039, "step": 7004 }, { "epoch": 18.501155496863653, "grad_norm": 638.8807983398438, "learning_rate": 0.0003512387126256139, "loss": 36.7687, "step": 7005 }, { "epoch": 18.503796632551996, "grad_norm": 467.7843933105469, "learning_rate": 0.00035120051342763756, "loss": 36.54, "step": 7006 }, { "epoch": 18.506437768240342, "grad_norm": 302.51678466796875, "learning_rate": 0.00035116231140369626, "loss": 37.7105, "step": 7007 }, { "epoch": 18.50907890392869, "grad_norm": 216.42356872558594, "learning_rate": 0.0003511241065548569, "loss": 36.5809, "step": 7008 }, { "epoch": 18.511720039617035, "grad_norm": 231.43991088867188, "learning_rate": 0.0003510858988821863, "loss": 37.3222, "step": 7009 }, { "epoch": 18.514361175305382, "grad_norm": 247.88314819335938, "learning_rate": 0.00035104768838675125, "loss": 37.2419, "step": 7010 }, { "epoch": 18.51700231099373, "grad_norm": 261.7864074707031, "learning_rate": 0.000351009475069619, "loss": 36.3486, "step": 7011 }, { "epoch": 18.519643446682075, "grad_norm": 378.890625, "learning_rate": 0.0003509712589318563, "loss": 37.3346, "step": 7012 }, { "epoch": 18.522284582370418, "grad_norm": 482.58953857421875, "learning_rate": 0.00035093303997453073, "loss": 37.0502, "step": 7013 }, { "epoch": 18.524925718058764, "grad_norm": 395.4397277832031, "learning_rate": 0.00035089481819870923, "loss": 39.1669, "step": 7014 }, { "epoch": 18.52756685374711, "grad_norm": 248.62550354003906, "learning_rate": 0.00035085659360545914, "loss": 41.1874, "step": 7015 }, { "epoch": 18.530207989435457, "grad_norm": 179.18690490722656, "learning_rate": 0.0003508183661958479, "loss": 41.6833, "step": 7016 }, { "epoch": 18.532849125123803, "grad_norm": 187.4378662109375, "learning_rate": 0.00035078013597094314, "loss": 42.1357, "step": 7017 }, { "epoch": 18.53549026081215, "grad_norm": 264.0503234863281, "learning_rate": 0.0003507419029318122, "loss": 42.72, "step": 7018 }, { "epoch": 18.538131396500496, "grad_norm": 319.5157165527344, "learning_rate": 0.0003507036670795228, "loss": 42.0648, "step": 7019 }, { "epoch": 18.540772532188843, "grad_norm": 243.12405395507812, "learning_rate": 0.0003506654284151427, "loss": 43.3238, "step": 7020 }, { "epoch": 18.543413667877186, "grad_norm": 326.8499755859375, "learning_rate": 0.0003506271869397396, "loss": 45.8087, "step": 7021 }, { "epoch": 18.546054803565532, "grad_norm": 160.19837951660156, "learning_rate": 0.00035058894265438146, "loss": 42.839, "step": 7022 }, { "epoch": 18.54869593925388, "grad_norm": 139.88145446777344, "learning_rate": 0.0003505506955601361, "loss": 42.6343, "step": 7023 }, { "epoch": 18.551337074942225, "grad_norm": 139.2359161376953, "learning_rate": 0.00035051244565807173, "loss": 41.7058, "step": 7024 }, { "epoch": 18.55397821063057, "grad_norm": 233.61264038085938, "learning_rate": 0.0003504741929492563, "loss": 41.3984, "step": 7025 }, { "epoch": 18.556619346318918, "grad_norm": 245.98744201660156, "learning_rate": 0.0003504359374347581, "loss": 38.7054, "step": 7026 }, { "epoch": 18.559260482007264, "grad_norm": 190.30023193359375, "learning_rate": 0.00035039767911564535, "loss": 38.8165, "step": 7027 }, { "epoch": 18.56190161769561, "grad_norm": 133.0536346435547, "learning_rate": 0.0003503594179929863, "loss": 39.1663, "step": 7028 }, { "epoch": 18.564542753383954, "grad_norm": 118.55645751953125, "learning_rate": 0.00035032115406784947, "loss": 37.8791, "step": 7029 }, { "epoch": 18.5671838890723, "grad_norm": 213.97866821289062, "learning_rate": 0.0003502828873413034, "loss": 38.0065, "step": 7030 }, { "epoch": 18.569825024760647, "grad_norm": 150.55702209472656, "learning_rate": 0.00035024461781441664, "loss": 37.3888, "step": 7031 }, { "epoch": 18.572466160448993, "grad_norm": 161.69911193847656, "learning_rate": 0.00035020634548825773, "loss": 36.1993, "step": 7032 }, { "epoch": 18.57510729613734, "grad_norm": 204.30868530273438, "learning_rate": 0.0003501680703638955, "loss": 35.8173, "step": 7033 }, { "epoch": 18.577748431825686, "grad_norm": 293.3546142578125, "learning_rate": 0.00035012979244239883, "loss": 35.593, "step": 7034 }, { "epoch": 18.580389567514032, "grad_norm": 271.5693359375, "learning_rate": 0.00035009151172483646, "loss": 36.5442, "step": 7035 }, { "epoch": 18.583030703202375, "grad_norm": 293.7117614746094, "learning_rate": 0.00035005322821227744, "loss": 36.1846, "step": 7036 }, { "epoch": 18.585671838890722, "grad_norm": 317.60980224609375, "learning_rate": 0.0003500149419057908, "loss": 36.4356, "step": 7037 }, { "epoch": 18.58831297457907, "grad_norm": 2798.739013671875, "learning_rate": 0.00034997665280644556, "loss": 62.0673, "step": 7038 }, { "epoch": 18.590954110267415, "grad_norm": 2548.11376953125, "learning_rate": 0.0003499383609153112, "loss": 59.1023, "step": 7039 }, { "epoch": 18.59359524595576, "grad_norm": 2827.181884765625, "learning_rate": 0.0003499000662334567, "loss": 57.049, "step": 7040 }, { "epoch": 18.596236381644108, "grad_norm": 12280.71875, "learning_rate": 0.0003498617687619516, "loss": 53.6125, "step": 7041 }, { "epoch": 18.598877517332454, "grad_norm": 4712.802734375, "learning_rate": 0.0003498234685018652, "loss": 45.8282, "step": 7042 }, { "epoch": 18.6015186530208, "grad_norm": 2694.661865234375, "learning_rate": 0.00034978516545426707, "loss": 50.9881, "step": 7043 }, { "epoch": 18.604159788709143, "grad_norm": 1680.1217041015625, "learning_rate": 0.0003497468596202269, "loss": 48.1376, "step": 7044 }, { "epoch": 18.60680092439749, "grad_norm": 14093.80078125, "learning_rate": 0.00034970855100081416, "loss": 35.6345, "step": 7045 }, { "epoch": 18.609442060085836, "grad_norm": 1459.717041015625, "learning_rate": 0.00034967023959709884, "loss": 29.8872, "step": 7046 }, { "epoch": 18.612083195774183, "grad_norm": 2460.638671875, "learning_rate": 0.00034963192541015046, "loss": 23.251, "step": 7047 }, { "epoch": 18.61472433146253, "grad_norm": 516.294677734375, "learning_rate": 0.0003495936084410393, "loss": 38.1613, "step": 7048 }, { "epoch": 18.617365467150876, "grad_norm": 370.1392822265625, "learning_rate": 0.00034955528869083495, "loss": 38.2431, "step": 7049 }, { "epoch": 18.620006602839222, "grad_norm": 581.2576293945312, "learning_rate": 0.0003495169661606077, "loss": 36.5349, "step": 7050 }, { "epoch": 18.62264773852757, "grad_norm": 631.8758544921875, "learning_rate": 0.00034947864085142766, "loss": 36.6233, "step": 7051 }, { "epoch": 18.62528887421591, "grad_norm": 331.6054382324219, "learning_rate": 0.000349440312764365, "loss": 37.232, "step": 7052 }, { "epoch": 18.627930009904258, "grad_norm": 398.18450927734375, "learning_rate": 0.00034940198190049, "loss": 36.3484, "step": 7053 }, { "epoch": 18.630571145592604, "grad_norm": 520.5009155273438, "learning_rate": 0.0003493636482608731, "loss": 38.7667, "step": 7054 }, { "epoch": 18.63321228128095, "grad_norm": 342.30419921875, "learning_rate": 0.0003493253118465846, "loss": 36.7504, "step": 7055 }, { "epoch": 18.635853416969297, "grad_norm": 498.92962646484375, "learning_rate": 0.00034928697265869515, "loss": 35.595, "step": 7056 }, { "epoch": 18.638494552657644, "grad_norm": 481.0252685546875, "learning_rate": 0.0003492486306982753, "loss": 37.1048, "step": 7057 }, { "epoch": 18.64113568834599, "grad_norm": 963.0735473632812, "learning_rate": 0.0003492102859663958, "loss": 36.3129, "step": 7058 }, { "epoch": 18.643776824034333, "grad_norm": 522.527099609375, "learning_rate": 0.0003491719384641273, "loss": 35.9628, "step": 7059 }, { "epoch": 18.64641795972268, "grad_norm": 690.1793212890625, "learning_rate": 0.0003491335881925407, "loss": 37.9322, "step": 7060 }, { "epoch": 18.649059095411026, "grad_norm": 385.47064208984375, "learning_rate": 0.00034909523515270694, "loss": 35.4041, "step": 7061 }, { "epoch": 18.651700231099372, "grad_norm": 270.4382019042969, "learning_rate": 0.0003490568793456968, "loss": 36.2904, "step": 7062 }, { "epoch": 18.65434136678772, "grad_norm": 782.9710693359375, "learning_rate": 0.00034901852077258156, "loss": 36.3225, "step": 7063 }, { "epoch": 18.656982502476065, "grad_norm": 340.84149169921875, "learning_rate": 0.0003489801594344323, "loss": 38.7544, "step": 7064 }, { "epoch": 18.659623638164412, "grad_norm": 1184.4373779296875, "learning_rate": 0.0003489417953323203, "loss": 41.7183, "step": 7065 }, { "epoch": 18.66226477385276, "grad_norm": 3108.187255859375, "learning_rate": 0.0003489034284673167, "loss": 43.538, "step": 7066 }, { "epoch": 18.6649059095411, "grad_norm": 397.9750061035156, "learning_rate": 0.00034886505884049304, "loss": 43.1258, "step": 7067 }, { "epoch": 18.667547045229448, "grad_norm": 378.5554504394531, "learning_rate": 0.00034882668645292055, "loss": 42.169, "step": 7068 }, { "epoch": 18.670188180917794, "grad_norm": 324.911865234375, "learning_rate": 0.000348788311305671, "loss": 43.0724, "step": 7069 }, { "epoch": 18.67282931660614, "grad_norm": 188.30931091308594, "learning_rate": 0.00034874993339981586, "loss": 45.9928, "step": 7070 }, { "epoch": 18.675470452294487, "grad_norm": 231.02743530273438, "learning_rate": 0.00034871155273642686, "loss": 41.5127, "step": 7071 }, { "epoch": 18.678111587982833, "grad_norm": 266.7607116699219, "learning_rate": 0.0003486731693165757, "loss": 42.6531, "step": 7072 }, { "epoch": 18.68075272367118, "grad_norm": 412.17901611328125, "learning_rate": 0.00034863478314133424, "loss": 43.2894, "step": 7073 }, { "epoch": 18.683393859359526, "grad_norm": 197.28309631347656, "learning_rate": 0.0003485963942117745, "loss": 39.6519, "step": 7074 }, { "epoch": 18.68603499504787, "grad_norm": 314.679443359375, "learning_rate": 0.0003485580025289683, "loss": 40.4866, "step": 7075 }, { "epoch": 18.688676130736216, "grad_norm": 376.3508605957031, "learning_rate": 0.00034851960809398775, "loss": 40.3227, "step": 7076 }, { "epoch": 18.691317266424562, "grad_norm": 252.4982147216797, "learning_rate": 0.000348481210907905, "loss": 38.6833, "step": 7077 }, { "epoch": 18.69395840211291, "grad_norm": 476.03662109375, "learning_rate": 0.0003484428109717924, "loss": 40.0871, "step": 7078 }, { "epoch": 18.696599537801255, "grad_norm": 302.7790832519531, "learning_rate": 0.00034840440828672205, "loss": 37.0545, "step": 7079 }, { "epoch": 18.6992406734896, "grad_norm": 264.64752197265625, "learning_rate": 0.0003483660028537664, "loss": 36.2367, "step": 7080 }, { "epoch": 18.701881809177948, "grad_norm": 293.088623046875, "learning_rate": 0.0003483275946739979, "loss": 37.0911, "step": 7081 }, { "epoch": 18.70452294486629, "grad_norm": 296.7691650390625, "learning_rate": 0.00034828918374848913, "loss": 35.4843, "step": 7082 }, { "epoch": 18.707164080554637, "grad_norm": 288.3581848144531, "learning_rate": 0.0003482507700783126, "loss": 36.3444, "step": 7083 }, { "epoch": 18.709805216242984, "grad_norm": 231.28018188476562, "learning_rate": 0.00034821235366454105, "loss": 36.192, "step": 7084 }, { "epoch": 18.71244635193133, "grad_norm": 433.28875732421875, "learning_rate": 0.00034817393450824717, "loss": 38.7753, "step": 7085 }, { "epoch": 18.715087487619677, "grad_norm": 251.34756469726562, "learning_rate": 0.0003481355126105039, "loss": 37.4194, "step": 7086 }, { "epoch": 18.717728623308023, "grad_norm": 2869.66650390625, "learning_rate": 0.00034809708797238416, "loss": 52.6466, "step": 7087 }, { "epoch": 18.72036975899637, "grad_norm": 2230.47998046875, "learning_rate": 0.0003480586605949608, "loss": 60.0002, "step": 7088 }, { "epoch": 18.723010894684716, "grad_norm": 3651.365478515625, "learning_rate": 0.00034802023047930686, "loss": 52.1718, "step": 7089 }, { "epoch": 18.72565203037306, "grad_norm": 2000.1859130859375, "learning_rate": 0.0003479817976264956, "loss": 61.066, "step": 7090 }, { "epoch": 18.728293166061405, "grad_norm": 2648.480224609375, "learning_rate": 0.00034794336203760023, "loss": 48.9102, "step": 7091 }, { "epoch": 18.730934301749752, "grad_norm": 1946.524169921875, "learning_rate": 0.000347904923713694, "loss": 52.9362, "step": 7092 }, { "epoch": 18.7335754374381, "grad_norm": 1374.7542724609375, "learning_rate": 0.0003478664826558503, "loss": 47.1047, "step": 7093 }, { "epoch": 18.736216573126445, "grad_norm": 1293.9306640625, "learning_rate": 0.0003478280388651425, "loss": 41.4361, "step": 7094 }, { "epoch": 18.73885770881479, "grad_norm": 2524.07275390625, "learning_rate": 0.00034778959234264427, "loss": 30.059, "step": 7095 }, { "epoch": 18.741498844503138, "grad_norm": 989.9658203125, "learning_rate": 0.0003477511430894291, "loss": 29.3421, "step": 7096 }, { "epoch": 18.744139980191484, "grad_norm": 1255.8648681640625, "learning_rate": 0.00034771269110657065, "loss": 26.6753, "step": 7097 }, { "epoch": 18.746781115879827, "grad_norm": 4810.3037109375, "learning_rate": 0.00034767423639514274, "loss": 38.5813, "step": 7098 }, { "epoch": 18.749422251568173, "grad_norm": 168.83631896972656, "learning_rate": 0.0003476357789562191, "loss": 36.6013, "step": 7099 }, { "epoch": 18.75206338725652, "grad_norm": 213.8062286376953, "learning_rate": 0.0003475973187908737, "loss": 35.7609, "step": 7100 }, { "epoch": 18.754704522944866, "grad_norm": 251.21136474609375, "learning_rate": 0.00034755885590018057, "loss": 37.58, "step": 7101 }, { "epoch": 18.757345658633213, "grad_norm": 208.564453125, "learning_rate": 0.00034752039028521363, "loss": 36.3169, "step": 7102 }, { "epoch": 18.75998679432156, "grad_norm": 241.97071838378906, "learning_rate": 0.0003474819219470471, "loss": 36.9987, "step": 7103 }, { "epoch": 18.762627930009906, "grad_norm": 402.85296630859375, "learning_rate": 0.00034744345088675514, "loss": 35.5719, "step": 7104 }, { "epoch": 18.76526906569825, "grad_norm": 368.815185546875, "learning_rate": 0.0003474049771054121, "loss": 36.1653, "step": 7105 }, { "epoch": 18.767910201386595, "grad_norm": 340.1468811035156, "learning_rate": 0.00034736650060409224, "loss": 35.9102, "step": 7106 }, { "epoch": 18.77055133707494, "grad_norm": 365.06982421875, "learning_rate": 0.0003473280213838701, "loss": 35.9141, "step": 7107 }, { "epoch": 18.773192472763288, "grad_norm": 360.2379150390625, "learning_rate": 0.0003472895394458201, "loss": 37.1152, "step": 7108 }, { "epoch": 18.775833608451634, "grad_norm": 222.670654296875, "learning_rate": 0.0003472510547910169, "loss": 38.1422, "step": 7109 }, { "epoch": 18.77847474413998, "grad_norm": 1606.642822265625, "learning_rate": 0.0003472125674205351, "loss": 35.318, "step": 7110 }, { "epoch": 18.781115879828327, "grad_norm": 434.9007568359375, "learning_rate": 0.0003471740773354494, "loss": 36.5859, "step": 7111 }, { "epoch": 18.783757015516674, "grad_norm": 317.4244384765625, "learning_rate": 0.0003471355845368347, "loss": 37.0847, "step": 7112 }, { "epoch": 18.786398151205017, "grad_norm": 380.8766784667969, "learning_rate": 0.00034709708902576597, "loss": 35.9864, "step": 7113 }, { "epoch": 18.789039286893363, "grad_norm": 815.0540771484375, "learning_rate": 0.000347058590803318, "loss": 38.4341, "step": 7114 }, { "epoch": 18.79168042258171, "grad_norm": 536.4606323242188, "learning_rate": 0.00034702008987056587, "loss": 41.8162, "step": 7115 }, { "epoch": 18.794321558270056, "grad_norm": 353.51715087890625, "learning_rate": 0.0003469815862285848, "loss": 41.1783, "step": 7116 }, { "epoch": 18.796962693958402, "grad_norm": 248.3135986328125, "learning_rate": 0.0003469430798784498, "loss": 42.5337, "step": 7117 }, { "epoch": 18.79960382964675, "grad_norm": 289.6338806152344, "learning_rate": 0.0003469045708212363, "loss": 44.2506, "step": 7118 }, { "epoch": 18.802244965335095, "grad_norm": 281.696533203125, "learning_rate": 0.0003468660590580195, "loss": 42.6495, "step": 7119 }, { "epoch": 18.804886101023442, "grad_norm": 251.28656005859375, "learning_rate": 0.00034682754458987495, "loss": 43.0908, "step": 7120 }, { "epoch": 18.807527236711785, "grad_norm": 264.875, "learning_rate": 0.0003467890274178782, "loss": 45.4158, "step": 7121 }, { "epoch": 18.81016837240013, "grad_norm": 235.8667755126953, "learning_rate": 0.00034675050754310466, "loss": 42.3555, "step": 7122 }, { "epoch": 18.812809508088478, "grad_norm": 173.05160522460938, "learning_rate": 0.00034671198496663, "loss": 41.9365, "step": 7123 }, { "epoch": 18.815450643776824, "grad_norm": 325.72283935546875, "learning_rate": 0.00034667345968952996, "loss": 40.6624, "step": 7124 }, { "epoch": 18.81809177946517, "grad_norm": 226.87196350097656, "learning_rate": 0.00034663493171288037, "loss": 42.2463, "step": 7125 }, { "epoch": 18.820732915153517, "grad_norm": 513.0947265625, "learning_rate": 0.0003465964010377571, "loss": 39.4402, "step": 7126 }, { "epoch": 18.823374050841863, "grad_norm": 383.5038757324219, "learning_rate": 0.000346557867665236, "loss": 37.0474, "step": 7127 }, { "epoch": 18.826015186530206, "grad_norm": 351.159912109375, "learning_rate": 0.00034651933159639324, "loss": 38.0424, "step": 7128 }, { "epoch": 18.828656322218553, "grad_norm": 409.21575927734375, "learning_rate": 0.00034648079283230483, "loss": 37.2409, "step": 7129 }, { "epoch": 18.8312974579069, "grad_norm": 270.7707824707031, "learning_rate": 0.00034644225137404693, "loss": 35.3389, "step": 7130 }, { "epoch": 18.833938593595246, "grad_norm": 378.1299133300781, "learning_rate": 0.0003464037072226959, "loss": 37.3375, "step": 7131 }, { "epoch": 18.836579729283592, "grad_norm": 511.4823913574219, "learning_rate": 0.0003463651603793279, "loss": 34.7451, "step": 7132 }, { "epoch": 18.83922086497194, "grad_norm": 546.3992919921875, "learning_rate": 0.00034632661084501936, "loss": 35.615, "step": 7133 }, { "epoch": 18.841862000660285, "grad_norm": 338.9766540527344, "learning_rate": 0.0003462880586208468, "loss": 35.1384, "step": 7134 }, { "epoch": 18.84450313634863, "grad_norm": 519.453857421875, "learning_rate": 0.0003462495037078869, "loss": 35.4981, "step": 7135 }, { "epoch": 18.847144272036974, "grad_norm": 358.6070556640625, "learning_rate": 0.000346210946107216, "loss": 35.2569, "step": 7136 }, { "epoch": 18.84978540772532, "grad_norm": 1285.6185302734375, "learning_rate": 0.000346172385819911, "loss": 45.3644, "step": 7137 }, { "epoch": 18.852426543413667, "grad_norm": 1864.2752685546875, "learning_rate": 0.0003461338228470486, "loss": 56.8858, "step": 7138 }, { "epoch": 18.855067679102014, "grad_norm": 7827.87060546875, "learning_rate": 0.0003460952571897058, "loss": 72.1003, "step": 7139 }, { "epoch": 18.85770881479036, "grad_norm": 1928.2939453125, "learning_rate": 0.0003460566888489593, "loss": 67.8016, "step": 7140 }, { "epoch": 18.860349950478707, "grad_norm": 2677.71142578125, "learning_rate": 0.00034601811782588617, "loss": 62.465, "step": 7141 }, { "epoch": 18.862991086167053, "grad_norm": 3458.400146484375, "learning_rate": 0.0003459795441215635, "loss": 54.2858, "step": 7142 }, { "epoch": 18.8656322218554, "grad_norm": 5232.2841796875, "learning_rate": 0.00034594096773706847, "loss": 50.4477, "step": 7143 }, { "epoch": 18.868273357543742, "grad_norm": 1591.182861328125, "learning_rate": 0.0003459023886734782, "loss": 39.1136, "step": 7144 }, { "epoch": 18.87091449323209, "grad_norm": 5352.86376953125, "learning_rate": 0.0003458638069318702, "loss": 37.4014, "step": 7145 }, { "epoch": 18.873555628920435, "grad_norm": 2752.3154296875, "learning_rate": 0.0003458252225133216, "loss": 33.3202, "step": 7146 }, { "epoch": 18.876196764608782, "grad_norm": 1322.4090576171875, "learning_rate": 0.00034578663541891, "loss": 31.8577, "step": 7147 }, { "epoch": 18.87883790029713, "grad_norm": 594.1236572265625, "learning_rate": 0.00034574804564971285, "loss": 35.6817, "step": 7148 }, { "epoch": 18.881479035985475, "grad_norm": 734.7824096679688, "learning_rate": 0.0003457094532068078, "loss": 38.0218, "step": 7149 }, { "epoch": 18.88412017167382, "grad_norm": 423.5558166503906, "learning_rate": 0.0003456708580912725, "loss": 38.7769, "step": 7150 }, { "epoch": 18.886761307362164, "grad_norm": 357.10894775390625, "learning_rate": 0.0003456322603041846, "loss": 36.5246, "step": 7151 }, { "epoch": 18.88940244305051, "grad_norm": 769.0978393554688, "learning_rate": 0.00034559365984662205, "loss": 35.5859, "step": 7152 }, { "epoch": 18.892043578738857, "grad_norm": 483.46600341796875, "learning_rate": 0.00034555505671966276, "loss": 35.7742, "step": 7153 }, { "epoch": 18.894684714427203, "grad_norm": 464.677978515625, "learning_rate": 0.0003455164509243846, "loss": 37.3587, "step": 7154 }, { "epoch": 18.89732585011555, "grad_norm": 453.3910217285156, "learning_rate": 0.00034547784246186564, "loss": 37.1346, "step": 7155 }, { "epoch": 18.899966985803896, "grad_norm": 424.34765625, "learning_rate": 0.00034543923133318406, "loss": 36.9581, "step": 7156 }, { "epoch": 18.902608121492243, "grad_norm": 417.2919006347656, "learning_rate": 0.000345400617539418, "loss": 36.7066, "step": 7157 }, { "epoch": 18.90524925718059, "grad_norm": 441.6119384765625, "learning_rate": 0.0003453620010816457, "loss": 36.5486, "step": 7158 }, { "epoch": 18.907890392868932, "grad_norm": 657.6688232421875, "learning_rate": 0.0003453233819609456, "loss": 36.9737, "step": 7159 }, { "epoch": 18.91053152855728, "grad_norm": 324.0419921875, "learning_rate": 0.00034528476017839594, "loss": 35.0856, "step": 7160 }, { "epoch": 18.913172664245625, "grad_norm": 427.861083984375, "learning_rate": 0.0003452461357350755, "loss": 36.1747, "step": 7161 }, { "epoch": 18.91581379993397, "grad_norm": 371.12188720703125, "learning_rate": 0.0003452075086320625, "loss": 36.5212, "step": 7162 }, { "epoch": 18.918454935622318, "grad_norm": 496.586669921875, "learning_rate": 0.00034516887887043587, "loss": 35.9332, "step": 7163 }, { "epoch": 18.921096071310664, "grad_norm": 508.48931884765625, "learning_rate": 0.0003451302464512741, "loss": 40.1729, "step": 7164 }, { "epoch": 18.92373720699901, "grad_norm": 3311.0859375, "learning_rate": 0.00034509161137565616, "loss": 40.7588, "step": 7165 }, { "epoch": 18.926378342687357, "grad_norm": 1425.1279296875, "learning_rate": 0.0003450529736446608, "loss": 40.0446, "step": 7166 }, { "epoch": 18.9290194783757, "grad_norm": 210.41651916503906, "learning_rate": 0.000345014333259367, "loss": 42.5243, "step": 7167 }, { "epoch": 18.931660614064047, "grad_norm": 420.0691223144531, "learning_rate": 0.00034497569022085375, "loss": 43.8182, "step": 7168 }, { "epoch": 18.934301749752393, "grad_norm": 187.97657775878906, "learning_rate": 0.00034493704453020013, "loss": 43.6954, "step": 7169 }, { "epoch": 18.93694288544074, "grad_norm": 211.11634826660156, "learning_rate": 0.00034489839618848534, "loss": 46.7124, "step": 7170 }, { "epoch": 18.939584021129086, "grad_norm": 370.0015563964844, "learning_rate": 0.0003448597451967886, "loss": 40.5723, "step": 7171 }, { "epoch": 18.942225156817432, "grad_norm": 316.7171630859375, "learning_rate": 0.0003448210915561891, "loss": 39.2008, "step": 7172 }, { "epoch": 18.94486629250578, "grad_norm": 142.1846466064453, "learning_rate": 0.0003447824352677664, "loss": 38.1767, "step": 7173 }, { "epoch": 18.947507428194122, "grad_norm": 156.65126037597656, "learning_rate": 0.0003447437763325999, "loss": 37.3259, "step": 7174 }, { "epoch": 18.95014856388247, "grad_norm": 291.2027282714844, "learning_rate": 0.00034470511475176907, "loss": 38.7579, "step": 7175 }, { "epoch": 18.952789699570815, "grad_norm": 246.8443145751953, "learning_rate": 0.00034466645052635346, "loss": 36.6399, "step": 7176 }, { "epoch": 18.95543083525916, "grad_norm": 142.4567413330078, "learning_rate": 0.00034462778365743296, "loss": 37.7086, "step": 7177 }, { "epoch": 18.958071970947508, "grad_norm": 835.3235473632812, "learning_rate": 0.0003445891141460871, "loss": 43.4766, "step": 7178 }, { "epoch": 18.960713106635854, "grad_norm": 1555.8670654296875, "learning_rate": 0.0003445504419933959, "loss": 35.9093, "step": 7179 }, { "epoch": 18.9633542423242, "grad_norm": 4829.55810546875, "learning_rate": 0.000344511767200439, "loss": 34.8107, "step": 7180 }, { "epoch": 18.965995378012547, "grad_norm": 1823.3277587890625, "learning_rate": 0.0003444730897682966, "loss": 42.3198, "step": 7181 }, { "epoch": 18.96863651370089, "grad_norm": 3173.125, "learning_rate": 0.00034443440969804867, "loss": 39.698, "step": 7182 }, { "epoch": 18.971277649389236, "grad_norm": 5766.76708984375, "learning_rate": 0.0003443957269907753, "loss": 36.6171, "step": 7183 }, { "epoch": 18.973918785077583, "grad_norm": 1719.8504638671875, "learning_rate": 0.0003443570416475567, "loss": 37.0765, "step": 7184 }, { "epoch": 18.97655992076593, "grad_norm": 322.8064880371094, "learning_rate": 0.0003443183536694732, "loss": 36.0216, "step": 7185 }, { "epoch": 18.979201056454276, "grad_norm": 366.99432373046875, "learning_rate": 0.000344279663057605, "loss": 36.1814, "step": 7186 }, { "epoch": 18.981842192142622, "grad_norm": 252.95758056640625, "learning_rate": 0.00034424096981303267, "loss": 36.2022, "step": 7187 }, { "epoch": 18.98448332783097, "grad_norm": 458.04522705078125, "learning_rate": 0.0003442022739368366, "loss": 36.4957, "step": 7188 }, { "epoch": 18.987124463519315, "grad_norm": 647.132568359375, "learning_rate": 0.0003441635754300974, "loss": 35.52, "step": 7189 }, { "epoch": 18.989765599207658, "grad_norm": 364.2440490722656, "learning_rate": 0.0003441248742938955, "loss": 37.5186, "step": 7190 }, { "epoch": 18.992406734896004, "grad_norm": 314.4647521972656, "learning_rate": 0.0003440861705293119, "loss": 35.8243, "step": 7191 }, { "epoch": 18.99504787058435, "grad_norm": 346.9446105957031, "learning_rate": 0.00034404746413742727, "loss": 35.8638, "step": 7192 }, { "epoch": 18.997689006272697, "grad_norm": 467.1021728515625, "learning_rate": 0.0003440087551193224, "loss": 36.4984, "step": 7193 }, { "epoch": 19.000330141961044, "grad_norm": 799.7078247070312, "learning_rate": 0.0003439700434760783, "loss": 38.4094, "step": 7194 }, { "epoch": 19.00297127764939, "grad_norm": 165.12762451171875, "learning_rate": 0.0003439313292087759, "loss": 39.1868, "step": 7195 }, { "epoch": 19.005612413337737, "grad_norm": 1102.875244140625, "learning_rate": 0.0003438926123184964, "loss": 40.6172, "step": 7196 }, { "epoch": 19.00825354902608, "grad_norm": 438.2513122558594, "learning_rate": 0.00034385389280632077, "loss": 40.0589, "step": 7197 }, { "epoch": 19.010894684714426, "grad_norm": 228.96795654296875, "learning_rate": 0.00034381517067333037, "loss": 40.9843, "step": 7198 }, { "epoch": 19.013535820402772, "grad_norm": 239.96499633789062, "learning_rate": 0.00034377644592060644, "loss": 42.3768, "step": 7199 }, { "epoch": 19.01617695609112, "grad_norm": 243.5162811279297, "learning_rate": 0.0003437377185492303, "loss": 45.6295, "step": 7200 }, { "epoch": 19.01617695609112, "eval_loss": 4.182600498199463, "eval_runtime": 2.2472, "eval_samples_per_second": 220.273, "eval_steps_per_second": 27.59, "step": 7200 }, { "epoch": 19.018818091779465, "grad_norm": 214.9833526611328, "learning_rate": 0.00034369898856028347, "loss": 42.0588, "step": 7201 }, { "epoch": 19.021459227467812, "grad_norm": 255.09878540039062, "learning_rate": 0.0003436602559548474, "loss": 42.208, "step": 7202 }, { "epoch": 19.02410036315616, "grad_norm": 438.0495910644531, "learning_rate": 0.0003436215207340037, "loss": 40.8075, "step": 7203 }, { "epoch": 19.026741498844505, "grad_norm": 680.6847534179688, "learning_rate": 0.000343582782898834, "loss": 39.7096, "step": 7204 }, { "epoch": 19.029382634532848, "grad_norm": 234.32589721679688, "learning_rate": 0.0003435440424504201, "loss": 38.042, "step": 7205 }, { "epoch": 19.032023770221194, "grad_norm": 291.5975036621094, "learning_rate": 0.00034350529938984383, "loss": 39.2942, "step": 7206 }, { "epoch": 19.03466490590954, "grad_norm": 420.49102783203125, "learning_rate": 0.0003434665537181869, "loss": 36.5976, "step": 7207 }, { "epoch": 19.037306041597887, "grad_norm": 369.6666259765625, "learning_rate": 0.0003434278054365314, "loss": 35.6911, "step": 7208 }, { "epoch": 19.039947177286233, "grad_norm": 235.23489379882812, "learning_rate": 0.00034338905454595937, "loss": 37.4788, "step": 7209 }, { "epoch": 19.04258831297458, "grad_norm": 176.2379608154297, "learning_rate": 0.00034335030104755285, "loss": 36.9487, "step": 7210 }, { "epoch": 19.045229448662926, "grad_norm": 207.08091735839844, "learning_rate": 0.00034331154494239384, "loss": 36.1826, "step": 7211 }, { "epoch": 19.047870584351273, "grad_norm": 240.7685546875, "learning_rate": 0.00034327278623156487, "loss": 36.3846, "step": 7212 }, { "epoch": 19.050511720039616, "grad_norm": 177.67648315429688, "learning_rate": 0.0003432340249161482, "loss": 36.8437, "step": 7213 }, { "epoch": 19.053152855727962, "grad_norm": 238.15235900878906, "learning_rate": 0.0003431952609972261, "loss": 36.2319, "step": 7214 }, { "epoch": 19.05579399141631, "grad_norm": 717.76806640625, "learning_rate": 0.00034315649447588104, "loss": 37.2584, "step": 7215 }, { "epoch": 19.058435127104655, "grad_norm": 1933.84228515625, "learning_rate": 0.0003431177253531956, "loss": 46.6066, "step": 7216 }, { "epoch": 19.061076262793, "grad_norm": 3847.596435546875, "learning_rate": 0.0003430789536302524, "loss": 38.8856, "step": 7217 }, { "epoch": 19.063717398481348, "grad_norm": 1501.3543701171875, "learning_rate": 0.0003430401793081341, "loss": 32.7528, "step": 7218 }, { "epoch": 19.066358534169694, "grad_norm": 1070.7198486328125, "learning_rate": 0.0003430014023879235, "loss": 30.1789, "step": 7219 }, { "epoch": 19.068999669858037, "grad_norm": 1083.641357421875, "learning_rate": 0.00034296262287070335, "loss": 26.2311, "step": 7220 }, { "epoch": 19.071640805546384, "grad_norm": 1618.4951171875, "learning_rate": 0.0003429238407575564, "loss": 19.6137, "step": 7221 }, { "epoch": 19.07428194123473, "grad_norm": 808.19287109375, "learning_rate": 0.000342885056049566, "loss": 26.9831, "step": 7222 }, { "epoch": 19.076923076923077, "grad_norm": 12384.376953125, "learning_rate": 0.0003428462687478149, "loss": 19.1316, "step": 7223 }, { "epoch": 19.079564212611423, "grad_norm": 368.44488525390625, "learning_rate": 0.0003428074788533863, "loss": 21.8601, "step": 7224 }, { "epoch": 19.08220534829977, "grad_norm": 1101.0504150390625, "learning_rate": 0.00034276868636736336, "loss": 20.0945, "step": 7225 }, { "epoch": 19.084846483988116, "grad_norm": 1243.5931396484375, "learning_rate": 0.00034272989129082934, "loss": 29.2723, "step": 7226 }, { "epoch": 19.087487619676462, "grad_norm": 1019.5775756835938, "learning_rate": 0.0003426910936248676, "loss": 40.8051, "step": 7227 }, { "epoch": 19.090128755364805, "grad_norm": 480.9029846191406, "learning_rate": 0.0003426522933705615, "loss": 39.5599, "step": 7228 }, { "epoch": 19.092769891053152, "grad_norm": 309.5763854980469, "learning_rate": 0.0003426134905289945, "loss": 41.4084, "step": 7229 }, { "epoch": 19.0954110267415, "grad_norm": 572.8245239257812, "learning_rate": 0.0003425746851012502, "loss": 38.7232, "step": 7230 }, { "epoch": 19.098052162429845, "grad_norm": 1751.7315673828125, "learning_rate": 0.0003425358770884122, "loss": 38.7357, "step": 7231 }, { "epoch": 19.10069329811819, "grad_norm": 427.3213195800781, "learning_rate": 0.00034249706649156425, "loss": 39.2843, "step": 7232 }, { "epoch": 19.103334433806538, "grad_norm": 341.9447021484375, "learning_rate": 0.00034245825331178997, "loss": 37.3618, "step": 7233 }, { "epoch": 19.105975569494884, "grad_norm": 202.15318298339844, "learning_rate": 0.00034241943755017334, "loss": 38.2556, "step": 7234 }, { "epoch": 19.10861670518323, "grad_norm": 528.2062377929688, "learning_rate": 0.00034238061920779825, "loss": 37.7431, "step": 7235 }, { "epoch": 19.111257840871573, "grad_norm": 215.44113159179688, "learning_rate": 0.0003423417982857485, "loss": 37.8216, "step": 7236 }, { "epoch": 19.11389897655992, "grad_norm": 320.6522216796875, "learning_rate": 0.0003423029747851084, "loss": 36.8247, "step": 7237 }, { "epoch": 19.116540112248266, "grad_norm": 488.0525817871094, "learning_rate": 0.0003422641487069619, "loss": 36.3861, "step": 7238 }, { "epoch": 19.119181247936613, "grad_norm": 402.3039855957031, "learning_rate": 0.0003422253200523932, "loss": 37.448, "step": 7239 }, { "epoch": 19.12182238362496, "grad_norm": 246.19053649902344, "learning_rate": 0.00034218648882248663, "loss": 36.5896, "step": 7240 }, { "epoch": 19.124463519313306, "grad_norm": 454.9844970703125, "learning_rate": 0.00034214765501832664, "loss": 37.9283, "step": 7241 }, { "epoch": 19.127104655001652, "grad_norm": 943.5689086914062, "learning_rate": 0.00034210881864099737, "loss": 38.0885, "step": 7242 }, { "epoch": 19.129745790689995, "grad_norm": 229.87451171875, "learning_rate": 0.00034206997969158347, "loss": 38.7939, "step": 7243 }, { "epoch": 19.13238692637834, "grad_norm": 291.9731140136719, "learning_rate": 0.00034203113817116957, "loss": 42.7905, "step": 7244 }, { "epoch": 19.135028062066688, "grad_norm": 154.27032470703125, "learning_rate": 0.0003419922940808401, "loss": 41.1816, "step": 7245 }, { "epoch": 19.137669197755034, "grad_norm": 237.2357177734375, "learning_rate": 0.0003419534474216798, "loss": 41.0718, "step": 7246 }, { "epoch": 19.14031033344338, "grad_norm": 269.5833740234375, "learning_rate": 0.00034191459819477363, "loss": 41.1001, "step": 7247 }, { "epoch": 19.142951469131727, "grad_norm": 176.37820434570312, "learning_rate": 0.0003418757464012062, "loss": 43.112, "step": 7248 }, { "epoch": 19.145592604820074, "grad_norm": 165.89028930664062, "learning_rate": 0.0003418368920420626, "loss": 44.4656, "step": 7249 }, { "epoch": 19.14823374050842, "grad_norm": 211.80397033691406, "learning_rate": 0.00034179803511842773, "loss": 41.7375, "step": 7250 }, { "epoch": 19.150874876196763, "grad_norm": 158.48985290527344, "learning_rate": 0.00034175917563138657, "loss": 41.7623, "step": 7251 }, { "epoch": 19.15351601188511, "grad_norm": 214.39158630371094, "learning_rate": 0.0003417203135820244, "loss": 42.4947, "step": 7252 }, { "epoch": 19.156157147573456, "grad_norm": 203.41363525390625, "learning_rate": 0.00034168144897142637, "loss": 41.4338, "step": 7253 }, { "epoch": 19.158798283261802, "grad_norm": 249.91650390625, "learning_rate": 0.0003416425818006777, "loss": 39.5952, "step": 7254 }, { "epoch": 19.16143941895015, "grad_norm": 224.7513885498047, "learning_rate": 0.0003416037120708638, "loss": 40.2979, "step": 7255 }, { "epoch": 19.164080554638495, "grad_norm": 247.7198486328125, "learning_rate": 0.00034156483978307, "loss": 38.2772, "step": 7256 }, { "epoch": 19.166721690326842, "grad_norm": 203.6414031982422, "learning_rate": 0.0003415259649383819, "loss": 37.651, "step": 7257 }, { "epoch": 19.16936282601519, "grad_norm": 403.3721618652344, "learning_rate": 0.000341487087537885, "loss": 37.7197, "step": 7258 }, { "epoch": 19.17200396170353, "grad_norm": 180.86729431152344, "learning_rate": 0.0003414482075826648, "loss": 35.9659, "step": 7259 }, { "epoch": 19.174645097391878, "grad_norm": 373.01873779296875, "learning_rate": 0.0003414093250738072, "loss": 36.9611, "step": 7260 }, { "epoch": 19.177286233080224, "grad_norm": 237.76663208007812, "learning_rate": 0.0003413704400123979, "loss": 35.7312, "step": 7261 }, { "epoch": 19.17992736876857, "grad_norm": 330.811279296875, "learning_rate": 0.00034133155239952283, "loss": 37.6379, "step": 7262 }, { "epoch": 19.182568504456917, "grad_norm": 373.8287353515625, "learning_rate": 0.00034129266223626765, "loss": 35.7026, "step": 7263 }, { "epoch": 19.185209640145263, "grad_norm": 279.4390869140625, "learning_rate": 0.00034125376952371864, "loss": 36.4977, "step": 7264 }, { "epoch": 19.18785077583361, "grad_norm": 321.2301940917969, "learning_rate": 0.0003412148742629616, "loss": 35.7038, "step": 7265 }, { "epoch": 19.190491911521953, "grad_norm": 1759.2918701171875, "learning_rate": 0.00034117597645508285, "loss": 58.1645, "step": 7266 }, { "epoch": 19.1931330472103, "grad_norm": 3910.597900390625, "learning_rate": 0.0003411370761011685, "loss": 74.0734, "step": 7267 }, { "epoch": 19.195774182898646, "grad_norm": 2015.3404541015625, "learning_rate": 0.0003410981732023048, "loss": 65.1053, "step": 7268 }, { "epoch": 19.198415318586992, "grad_norm": 2667.501953125, "learning_rate": 0.0003410592677595782, "loss": 61.672, "step": 7269 }, { "epoch": 19.20105645427534, "grad_norm": 7806.33935546875, "learning_rate": 0.00034102035977407504, "loss": 60.9746, "step": 7270 }, { "epoch": 19.203697589963685, "grad_norm": 5250.271484375, "learning_rate": 0.0003409814492468818, "loss": 52.581, "step": 7271 }, { "epoch": 19.20633872565203, "grad_norm": 3143.7060546875, "learning_rate": 0.0003409425361790849, "loss": 44.3136, "step": 7272 }, { "epoch": 19.208979861340378, "grad_norm": 4053.038330078125, "learning_rate": 0.00034090362057177115, "loss": 47.7128, "step": 7273 }, { "epoch": 19.21162099702872, "grad_norm": 2030.5606689453125, "learning_rate": 0.0003408647024260273, "loss": 39.1364, "step": 7274 }, { "epoch": 19.214262132717067, "grad_norm": 1217.6221923828125, "learning_rate": 0.00034082578174293997, "loss": 41.1505, "step": 7275 }, { "epoch": 19.216903268405414, "grad_norm": 472.503173828125, "learning_rate": 0.00034078685852359606, "loss": 34.7577, "step": 7276 }, { "epoch": 19.21954440409376, "grad_norm": 613.5003662109375, "learning_rate": 0.0003407479327690824, "loss": 36.4651, "step": 7277 }, { "epoch": 19.222185539782107, "grad_norm": 370.77801513671875, "learning_rate": 0.00034070900448048604, "loss": 36.7496, "step": 7278 }, { "epoch": 19.224826675470453, "grad_norm": 254.5529327392578, "learning_rate": 0.00034067007365889404, "loss": 37.131, "step": 7279 }, { "epoch": 19.2274678111588, "grad_norm": 347.8147277832031, "learning_rate": 0.00034063114030539343, "loss": 37.2992, "step": 7280 }, { "epoch": 19.230108946847146, "grad_norm": 475.069580078125, "learning_rate": 0.0003405922044210715, "loss": 35.4297, "step": 7281 }, { "epoch": 19.23275008253549, "grad_norm": 529.8347778320312, "learning_rate": 0.0003405532660070155, "loss": 36.0424, "step": 7282 }, { "epoch": 19.235391218223835, "grad_norm": 1132.5379638671875, "learning_rate": 0.0003405143250643128, "loss": 36.8534, "step": 7283 }, { "epoch": 19.238032353912182, "grad_norm": 818.280517578125, "learning_rate": 0.00034047538159405065, "loss": 36.5223, "step": 7284 }, { "epoch": 19.24067348960053, "grad_norm": 354.8565979003906, "learning_rate": 0.0003404364355973167, "loss": 35.7978, "step": 7285 }, { "epoch": 19.243314625288875, "grad_norm": 689.1845092773438, "learning_rate": 0.0003403974870751983, "loss": 35.3781, "step": 7286 }, { "epoch": 19.24595576097722, "grad_norm": 429.70819091796875, "learning_rate": 0.0003403585360287832, "loss": 36.153, "step": 7287 }, { "epoch": 19.248596896665568, "grad_norm": 483.4678649902344, "learning_rate": 0.0003403195824591592, "loss": 37.0561, "step": 7288 }, { "epoch": 19.25123803235391, "grad_norm": 1113.557861328125, "learning_rate": 0.00034028062636741376, "loss": 36.6205, "step": 7289 }, { "epoch": 19.253879168042257, "grad_norm": 581.956298828125, "learning_rate": 0.00034024166775463487, "loss": 37.1341, "step": 7290 }, { "epoch": 19.256520303730603, "grad_norm": 1149.9127197265625, "learning_rate": 0.00034020270662191046, "loss": 36.7399, "step": 7291 }, { "epoch": 19.25916143941895, "grad_norm": 994.5842895507812, "learning_rate": 0.0003401637429703285, "loss": 37.371, "step": 7292 }, { "epoch": 19.261802575107296, "grad_norm": 709.4341430664062, "learning_rate": 0.0003401247768009769, "loss": 38.3886, "step": 7293 }, { "epoch": 19.264443710795643, "grad_norm": 655.5719604492188, "learning_rate": 0.0003400858081149439, "loss": 42.677, "step": 7294 }, { "epoch": 19.26708484648399, "grad_norm": 515.9841918945312, "learning_rate": 0.0003400468369133176, "loss": 40.013, "step": 7295 }, { "epoch": 19.269725982172336, "grad_norm": 579.00732421875, "learning_rate": 0.00034000786319718623, "loss": 43.1742, "step": 7296 }, { "epoch": 19.27236711786068, "grad_norm": 512.1796875, "learning_rate": 0.0003399688869676383, "loss": 42.707, "step": 7297 }, { "epoch": 19.275008253549025, "grad_norm": 430.54693603515625, "learning_rate": 0.00033992990822576195, "loss": 42.2298, "step": 7298 }, { "epoch": 19.27764938923737, "grad_norm": 489.5904846191406, "learning_rate": 0.0003398909269726458, "loss": 43.2385, "step": 7299 }, { "epoch": 19.280290524925718, "grad_norm": 298.5907897949219, "learning_rate": 0.0003398519432093782, "loss": 45.5059, "step": 7300 }, { "epoch": 19.282931660614064, "grad_norm": 403.18359375, "learning_rate": 0.00033981295693704793, "loss": 43.2, "step": 7301 }, { "epoch": 19.28557279630241, "grad_norm": 603.3379516601562, "learning_rate": 0.00033977396815674366, "loss": 44.2052, "step": 7302 }, { "epoch": 19.288213931990757, "grad_norm": 618.6876831054688, "learning_rate": 0.000339734976869554, "loss": 40.9351, "step": 7303 }, { "epoch": 19.290855067679104, "grad_norm": 390.0279541015625, "learning_rate": 0.00033969598307656784, "loss": 41.7775, "step": 7304 }, { "epoch": 19.293496203367447, "grad_norm": 439.692138671875, "learning_rate": 0.0003396569867788741, "loss": 39.5188, "step": 7305 }, { "epoch": 19.296137339055793, "grad_norm": 338.985595703125, "learning_rate": 0.0003396179879775616, "loss": 41.8064, "step": 7306 }, { "epoch": 19.29877847474414, "grad_norm": 226.2009735107422, "learning_rate": 0.00033957898667371945, "loss": 37.7992, "step": 7307 }, { "epoch": 19.301419610432486, "grad_norm": 1016.7015380859375, "learning_rate": 0.0003395399828684366, "loss": 37.412, "step": 7308 }, { "epoch": 19.304060746120832, "grad_norm": 539.7320556640625, "learning_rate": 0.0003395009765628025, "loss": 37.488, "step": 7309 }, { "epoch": 19.30670188180918, "grad_norm": 565.5055541992188, "learning_rate": 0.00033946196775790614, "loss": 35.8271, "step": 7310 }, { "epoch": 19.309343017497525, "grad_norm": 273.5400695800781, "learning_rate": 0.00033942295645483697, "loss": 34.8375, "step": 7311 }, { "epoch": 19.31198415318587, "grad_norm": 313.5791015625, "learning_rate": 0.0003393839426546841, "loss": 34.897, "step": 7312 }, { "epoch": 19.314625288874215, "grad_norm": 438.1047058105469, "learning_rate": 0.00033934492635853724, "loss": 36.4798, "step": 7313 }, { "epoch": 19.31726642456256, "grad_norm": 480.9938049316406, "learning_rate": 0.0003393059075674858, "loss": 36.9605, "step": 7314 }, { "epoch": 19.319907560250908, "grad_norm": 1184.8419189453125, "learning_rate": 0.0003392668862826193, "loss": 36.8501, "step": 7315 }, { "epoch": 19.322548695939254, "grad_norm": 1079.3800048828125, "learning_rate": 0.0003392278625050275, "loss": 51.9038, "step": 7316 }, { "epoch": 19.3251898316276, "grad_norm": 2275.0283203125, "learning_rate": 0.0003391888362358, "loss": 32.8924, "step": 7317 }, { "epoch": 19.327830967315947, "grad_norm": 3319.297607421875, "learning_rate": 0.0003391498074760267, "loss": 29.2645, "step": 7318 }, { "epoch": 19.330472103004293, "grad_norm": 4674.0693359375, "learning_rate": 0.00033911077622679735, "loss": 34.8427, "step": 7319 }, { "epoch": 19.333113238692636, "grad_norm": 1942.49462890625, "learning_rate": 0.00033907174248920194, "loss": 28.5885, "step": 7320 }, { "epoch": 19.335754374380983, "grad_norm": 2312.513671875, "learning_rate": 0.00033903270626433046, "loss": 24.0209, "step": 7321 }, { "epoch": 19.33839551006933, "grad_norm": 2336.99951171875, "learning_rate": 0.00033899366755327286, "loss": 28.4303, "step": 7322 }, { "epoch": 19.341036645757676, "grad_norm": 12231.0107421875, "learning_rate": 0.0003389546263571195, "loss": 20.3666, "step": 7323 }, { "epoch": 19.343677781446022, "grad_norm": 7692.6435546875, "learning_rate": 0.0003389155826769604, "loss": 17.678, "step": 7324 }, { "epoch": 19.34631891713437, "grad_norm": 2782.859375, "learning_rate": 0.00033887653651388594, "loss": 22.846, "step": 7325 }, { "epoch": 19.348960052822715, "grad_norm": 308.54296875, "learning_rate": 0.00033883748786898635, "loss": 30.0416, "step": 7326 }, { "epoch": 19.35160118851106, "grad_norm": 435.3580322265625, "learning_rate": 0.0003387984367433521, "loss": 36.4561, "step": 7327 }, { "epoch": 19.354242324199404, "grad_norm": 1134.93212890625, "learning_rate": 0.0003387593831380736, "loss": 35.8923, "step": 7328 }, { "epoch": 19.35688345988775, "grad_norm": 563.0441284179688, "learning_rate": 0.0003387203270542416, "loss": 39.2283, "step": 7329 }, { "epoch": 19.359524595576097, "grad_norm": 385.98590087890625, "learning_rate": 0.0003386812684929465, "loss": 36.427, "step": 7330 }, { "epoch": 19.362165731264444, "grad_norm": 327.8623962402344, "learning_rate": 0.0003386422074552791, "loss": 36.1601, "step": 7331 }, { "epoch": 19.36480686695279, "grad_norm": 555.21044921875, "learning_rate": 0.00033860314394233015, "loss": 36.9824, "step": 7332 }, { "epoch": 19.367448002641137, "grad_norm": 1096.8345947265625, "learning_rate": 0.0003385640779551904, "loss": 35.9894, "step": 7333 }, { "epoch": 19.370089138329483, "grad_norm": 265.2846374511719, "learning_rate": 0.0003385250094949508, "loss": 37.9761, "step": 7334 }, { "epoch": 19.372730274017826, "grad_norm": 770.7849731445312, "learning_rate": 0.0003384859385627023, "loss": 35.7009, "step": 7335 }, { "epoch": 19.375371409706172, "grad_norm": 860.9058227539062, "learning_rate": 0.00033844686515953603, "loss": 36.5067, "step": 7336 }, { "epoch": 19.37801254539452, "grad_norm": 451.5001525878906, "learning_rate": 0.00033840778928654296, "loss": 37.1804, "step": 7337 }, { "epoch": 19.380653681082865, "grad_norm": 417.13519287109375, "learning_rate": 0.00033836871094481433, "loss": 37.3028, "step": 7338 }, { "epoch": 19.383294816771212, "grad_norm": 720.5863647460938, "learning_rate": 0.00033832963013544134, "loss": 35.3945, "step": 7339 }, { "epoch": 19.38593595245956, "grad_norm": 1851.6175537109375, "learning_rate": 0.00033829054685951534, "loss": 34.7496, "step": 7340 }, { "epoch": 19.388577088147905, "grad_norm": 724.3504638671875, "learning_rate": 0.00033825146111812754, "loss": 36.4757, "step": 7341 }, { "epoch": 19.39121822383625, "grad_norm": 612.9051513671875, "learning_rate": 0.00033821237291236966, "loss": 36.0053, "step": 7342 }, { "epoch": 19.393859359524594, "grad_norm": 879.3419189453125, "learning_rate": 0.000338173282243333, "loss": 37.911, "step": 7343 }, { "epoch": 19.39650049521294, "grad_norm": 1588.27001953125, "learning_rate": 0.00033813418911210935, "loss": 41.2, "step": 7344 }, { "epoch": 19.399141630901287, "grad_norm": 271.7469177246094, "learning_rate": 0.0003380950935197902, "loss": 40.9483, "step": 7345 }, { "epoch": 19.401782766589633, "grad_norm": 555.7592163085938, "learning_rate": 0.00033805599546746733, "loss": 41.567, "step": 7346 }, { "epoch": 19.40442390227798, "grad_norm": 301.1236267089844, "learning_rate": 0.00033801689495623245, "loss": 40.6837, "step": 7347 }, { "epoch": 19.407065037966326, "grad_norm": 550.613037109375, "learning_rate": 0.0003379777919871775, "loss": 42.5538, "step": 7348 }, { "epoch": 19.409706173654673, "grad_norm": 963.5126342773438, "learning_rate": 0.00033793868656139446, "loss": 44.4464, "step": 7349 }, { "epoch": 19.41234730934302, "grad_norm": 282.81475830078125, "learning_rate": 0.00033789957867997515, "loss": 44.7382, "step": 7350 }, { "epoch": 19.414988445031362, "grad_norm": 457.8644714355469, "learning_rate": 0.00033786046834401184, "loss": 43.5221, "step": 7351 }, { "epoch": 19.41762958071971, "grad_norm": 342.7183532714844, "learning_rate": 0.0003378213555545965, "loss": 42.597, "step": 7352 }, { "epoch": 19.420270716408055, "grad_norm": 318.1738586425781, "learning_rate": 0.0003377822403128214, "loss": 42.1944, "step": 7353 }, { "epoch": 19.4229118520964, "grad_norm": 349.7456970214844, "learning_rate": 0.00033774312261977885, "loss": 39.6077, "step": 7354 }, { "epoch": 19.425552987784748, "grad_norm": 2584.058837890625, "learning_rate": 0.0003377040024765611, "loss": 40.1143, "step": 7355 }, { "epoch": 19.428194123473094, "grad_norm": 397.2752990722656, "learning_rate": 0.0003376648798842606, "loss": 40.064, "step": 7356 }, { "epoch": 19.43083525916144, "grad_norm": 802.5820922851562, "learning_rate": 0.0003376257548439698, "loss": 36.3858, "step": 7357 }, { "epoch": 19.433476394849784, "grad_norm": 530.8153686523438, "learning_rate": 0.00033758662735678134, "loss": 38.9411, "step": 7358 }, { "epoch": 19.43611753053813, "grad_norm": 961.634521484375, "learning_rate": 0.00033754749742378776, "loss": 37.4667, "step": 7359 }, { "epoch": 19.438758666226477, "grad_norm": 402.7406921386719, "learning_rate": 0.0003375083650460817, "loss": 36.9831, "step": 7360 }, { "epoch": 19.441399801914823, "grad_norm": 468.855712890625, "learning_rate": 0.00033746923022475594, "loss": 36.7472, "step": 7361 }, { "epoch": 19.44404093760317, "grad_norm": 517.4923706054688, "learning_rate": 0.0003374300929609033, "loss": 36.3753, "step": 7362 }, { "epoch": 19.446682073291516, "grad_norm": 280.4987487792969, "learning_rate": 0.00033739095325561677, "loss": 37.9743, "step": 7363 }, { "epoch": 19.449323208979862, "grad_norm": 424.18310546875, "learning_rate": 0.0003373518111099891, "loss": 35.1645, "step": 7364 }, { "epoch": 19.45196434466821, "grad_norm": 477.7861328125, "learning_rate": 0.0003373126665251135, "loss": 36.1262, "step": 7365 }, { "epoch": 19.454605480356552, "grad_norm": 338.15301513671875, "learning_rate": 0.0003372735195020829, "loss": 37.9423, "step": 7366 }, { "epoch": 19.4572466160449, "grad_norm": 1464.8380126953125, "learning_rate": 0.00033723437004199065, "loss": 39.2259, "step": 7367 }, { "epoch": 19.459887751733245, "grad_norm": 2503.4951171875, "learning_rate": 0.00033719521814592975, "loss": 33.8109, "step": 7368 }, { "epoch": 19.46252888742159, "grad_norm": 3278.68896484375, "learning_rate": 0.00033715606381499357, "loss": 29.4169, "step": 7369 }, { "epoch": 19.465170023109938, "grad_norm": 4570.1220703125, "learning_rate": 0.0003371169070502756, "loss": 29.3649, "step": 7370 }, { "epoch": 19.467811158798284, "grad_norm": 951.5264892578125, "learning_rate": 0.0003370777478528692, "loss": 24.2311, "step": 7371 }, { "epoch": 19.47045229448663, "grad_norm": 638.7213745117188, "learning_rate": 0.0003370385862238678, "loss": 22.4707, "step": 7372 }, { "epoch": 19.473093430174977, "grad_norm": 925.8685913085938, "learning_rate": 0.000336999422164365, "loss": 18.1714, "step": 7373 }, { "epoch": 19.47573456586332, "grad_norm": 1131.4705810546875, "learning_rate": 0.0003369602556754544, "loss": 17.6767, "step": 7374 }, { "epoch": 19.478375701551666, "grad_norm": 4692.0107421875, "learning_rate": 0.00033692108675822973, "loss": 14.4929, "step": 7375 }, { "epoch": 19.481016837240013, "grad_norm": 2415.30224609375, "learning_rate": 0.0003368819154137849, "loss": 15.5094, "step": 7376 }, { "epoch": 19.48365797292836, "grad_norm": 403.3702697753906, "learning_rate": 0.00033684274164321354, "loss": 37.165, "step": 7377 }, { "epoch": 19.486299108616706, "grad_norm": 451.6563415527344, "learning_rate": 0.0003368035654476096, "loss": 37.4855, "step": 7378 }, { "epoch": 19.488940244305052, "grad_norm": 306.57635498046875, "learning_rate": 0.00033676438682806717, "loss": 39.0154, "step": 7379 }, { "epoch": 19.4915813799934, "grad_norm": 288.11834716796875, "learning_rate": 0.0003367252057856802, "loss": 37.6504, "step": 7380 }, { "epoch": 19.49422251568174, "grad_norm": 331.80755615234375, "learning_rate": 0.0003366860223215427, "loss": 37.9868, "step": 7381 }, { "epoch": 19.496863651370088, "grad_norm": 372.3703918457031, "learning_rate": 0.00033664683643674903, "loss": 36.4131, "step": 7382 }, { "epoch": 19.499504787058434, "grad_norm": 259.19134521484375, "learning_rate": 0.0003366076481323933, "loss": 38.4278, "step": 7383 }, { "epoch": 19.50214592274678, "grad_norm": 309.43109130859375, "learning_rate": 0.00033656845740957, "loss": 35.3858, "step": 7384 }, { "epoch": 19.504787058435127, "grad_norm": 434.9239196777344, "learning_rate": 0.0003365292642693733, "loss": 35.0668, "step": 7385 }, { "epoch": 19.507428194123474, "grad_norm": 346.7585754394531, "learning_rate": 0.0003364900687128977, "loss": 36.3142, "step": 7386 }, { "epoch": 19.51006932981182, "grad_norm": 382.74078369140625, "learning_rate": 0.0003364508707412377, "loss": 36.6587, "step": 7387 }, { "epoch": 19.512710465500167, "grad_norm": 353.62115478515625, "learning_rate": 0.000336411670355488, "loss": 36.1819, "step": 7388 }, { "epoch": 19.51535160118851, "grad_norm": 430.2603759765625, "learning_rate": 0.00033637246755674314, "loss": 35.8802, "step": 7389 }, { "epoch": 19.517992736876856, "grad_norm": 492.6023254394531, "learning_rate": 0.00033633326234609783, "loss": 35.6791, "step": 7390 }, { "epoch": 19.520633872565202, "grad_norm": 225.17111206054688, "learning_rate": 0.00033629405472464696, "loss": 34.6374, "step": 7391 }, { "epoch": 19.52327500825355, "grad_norm": 341.3864440917969, "learning_rate": 0.0003362548446934852, "loss": 35.8998, "step": 7392 }, { "epoch": 19.525916143941895, "grad_norm": 594.6458740234375, "learning_rate": 0.00033621563225370773, "loss": 38.6663, "step": 7393 }, { "epoch": 19.528557279630242, "grad_norm": 516.4776611328125, "learning_rate": 0.00033617641740640923, "loss": 42.8496, "step": 7394 }, { "epoch": 19.53119841531859, "grad_norm": 243.3544464111328, "learning_rate": 0.0003361372001526849, "loss": 39.8397, "step": 7395 }, { "epoch": 19.533839551006935, "grad_norm": 364.7626647949219, "learning_rate": 0.00033609798049362996, "loss": 40.7943, "step": 7396 }, { "epoch": 19.536480686695278, "grad_norm": 308.8230285644531, "learning_rate": 0.0003360587584303394, "loss": 42.3329, "step": 7397 }, { "epoch": 19.539121822383624, "grad_norm": 460.9798278808594, "learning_rate": 0.00033601953396390863, "loss": 42.5316, "step": 7398 }, { "epoch": 19.54176295807197, "grad_norm": 375.4728698730469, "learning_rate": 0.0003359803070954328, "loss": 43.5644, "step": 7399 }, { "epoch": 19.544404093760317, "grad_norm": 276.7141418457031, "learning_rate": 0.00033594107782600755, "loss": 42.4406, "step": 7400 }, { "epoch": 19.544404093760317, "eval_loss": 4.059871196746826, "eval_runtime": 2.154, "eval_samples_per_second": 229.803, "eval_steps_per_second": 28.783, "step": 7400 }, { "epoch": 19.547045229448663, "grad_norm": 278.48095703125, "learning_rate": 0.000335901846156728, "loss": 42.9753, "step": 7401 }, { "epoch": 19.54968636513701, "grad_norm": 227.05165100097656, "learning_rate": 0.00033586261208869, "loss": 42.886, "step": 7402 }, { "epoch": 19.552327500825356, "grad_norm": 253.054443359375, "learning_rate": 0.000335823375622989, "loss": 40.6794, "step": 7403 }, { "epoch": 19.5549686365137, "grad_norm": 538.1354370117188, "learning_rate": 0.0003357841367607206, "loss": 40.375, "step": 7404 }, { "epoch": 19.557609772202046, "grad_norm": 309.8332214355469, "learning_rate": 0.00033574489550298054, "loss": 38.9109, "step": 7405 }, { "epoch": 19.560250907890392, "grad_norm": 584.8917236328125, "learning_rate": 0.00033570565185086473, "loss": 40.1844, "step": 7406 }, { "epoch": 19.56289204357874, "grad_norm": 329.8917236328125, "learning_rate": 0.0003356664058054689, "loss": 38.4133, "step": 7407 }, { "epoch": 19.565533179267085, "grad_norm": 305.4219665527344, "learning_rate": 0.000335627157367889, "loss": 38.9984, "step": 7408 }, { "epoch": 19.56817431495543, "grad_norm": 300.04180908203125, "learning_rate": 0.000335587906539221, "loss": 35.976, "step": 7409 }, { "epoch": 19.570815450643778, "grad_norm": 357.6717224121094, "learning_rate": 0.00033554865332056103, "loss": 35.8976, "step": 7410 }, { "epoch": 19.573456586332124, "grad_norm": 298.6019592285156, "learning_rate": 0.00033550939771300516, "loss": 37.038, "step": 7411 }, { "epoch": 19.576097722020467, "grad_norm": 177.99493408203125, "learning_rate": 0.0003354701397176496, "loss": 34.9993, "step": 7412 }, { "epoch": 19.578738857708814, "grad_norm": 180.56825256347656, "learning_rate": 0.00033543087933559057, "loss": 36.4656, "step": 7413 }, { "epoch": 19.58137999339716, "grad_norm": 231.90878295898438, "learning_rate": 0.0003353916165679244, "loss": 36.6051, "step": 7414 }, { "epoch": 19.584021129085507, "grad_norm": 261.17083740234375, "learning_rate": 0.0003353523514157476, "loss": 37.4016, "step": 7415 }, { "epoch": 19.586662264773853, "grad_norm": 2368.81298828125, "learning_rate": 0.00033531308388015646, "loss": 51.6886, "step": 7416 }, { "epoch": 19.5893034004622, "grad_norm": 1052.867431640625, "learning_rate": 0.00033527381396224757, "loss": 46.3782, "step": 7417 }, { "epoch": 19.591944536150546, "grad_norm": 2401.92431640625, "learning_rate": 0.0003352345416631175, "loss": 43.0564, "step": 7418 }, { "epoch": 19.594585671838892, "grad_norm": 1414.3365478515625, "learning_rate": 0.000335195266983863, "loss": 44.925, "step": 7419 }, { "epoch": 19.597226807527235, "grad_norm": 2369.390380859375, "learning_rate": 0.0003351559899255806, "loss": 35.3518, "step": 7420 }, { "epoch": 19.599867943215582, "grad_norm": 2262.956298828125, "learning_rate": 0.0003351167104893673, "loss": 37.9207, "step": 7421 }, { "epoch": 19.60250907890393, "grad_norm": 2463.793701171875, "learning_rate": 0.0003350774286763198, "loss": 31.8377, "step": 7422 }, { "epoch": 19.605150214592275, "grad_norm": 1695.2379150390625, "learning_rate": 0.0003350381444875351, "loss": 30.6544, "step": 7423 }, { "epoch": 19.60779135028062, "grad_norm": 865.2066650390625, "learning_rate": 0.0003349988579241102, "loss": 23.1188, "step": 7424 }, { "epoch": 19.610432485968968, "grad_norm": 819.013671875, "learning_rate": 0.0003349595689871421, "loss": 16.5744, "step": 7425 }, { "epoch": 19.613073621657314, "grad_norm": 671.3240966796875, "learning_rate": 0.0003349202776777279, "loss": 25.7503, "step": 7426 }, { "epoch": 19.615714757345657, "grad_norm": 894.0667114257812, "learning_rate": 0.0003348809839969649, "loss": 38.5225, "step": 7427 }, { "epoch": 19.618355893034003, "grad_norm": 298.88922119140625, "learning_rate": 0.00033484168794595026, "loss": 35.39, "step": 7428 }, { "epoch": 19.62099702872235, "grad_norm": 193.86788940429688, "learning_rate": 0.0003348023895257813, "loss": 35.1326, "step": 7429 }, { "epoch": 19.623638164410696, "grad_norm": 280.0637512207031, "learning_rate": 0.0003347630887375554, "loss": 37.3944, "step": 7430 }, { "epoch": 19.626279300099043, "grad_norm": 325.65911865234375, "learning_rate": 0.0003347237855823701, "loss": 37.0862, "step": 7431 }, { "epoch": 19.62892043578739, "grad_norm": 208.6654052734375, "learning_rate": 0.0003346844800613229, "loss": 36.77, "step": 7432 }, { "epoch": 19.631561571475736, "grad_norm": 584.7177734375, "learning_rate": 0.00033464517217551127, "loss": 37.467, "step": 7433 }, { "epoch": 19.634202707164082, "grad_norm": 162.4768524169922, "learning_rate": 0.0003346058619260329, "loss": 35.1892, "step": 7434 }, { "epoch": 19.636843842852425, "grad_norm": 305.2442932128906, "learning_rate": 0.00033456654931398564, "loss": 37.0148, "step": 7435 }, { "epoch": 19.63948497854077, "grad_norm": 559.4111328125, "learning_rate": 0.00033452723434046705, "loss": 37.0311, "step": 7436 }, { "epoch": 19.642126114229118, "grad_norm": 242.9644775390625, "learning_rate": 0.0003344879170065752, "loss": 36.4391, "step": 7437 }, { "epoch": 19.644767249917464, "grad_norm": 1565.38671875, "learning_rate": 0.0003344485973134078, "loss": 35.7394, "step": 7438 }, { "epoch": 19.64740838560581, "grad_norm": 480.1116638183594, "learning_rate": 0.000334409275262063, "loss": 36.8258, "step": 7439 }, { "epoch": 19.650049521294157, "grad_norm": 332.8730773925781, "learning_rate": 0.00033436995085363877, "loss": 34.6056, "step": 7440 }, { "epoch": 19.652690656982504, "grad_norm": 655.0591430664062, "learning_rate": 0.0003343306240892332, "loss": 36.9409, "step": 7441 }, { "epoch": 19.65533179267085, "grad_norm": 391.1048889160156, "learning_rate": 0.0003342912949699445, "loss": 38.3163, "step": 7442 }, { "epoch": 19.657972928359193, "grad_norm": 312.6840515136719, "learning_rate": 0.00033425196349687086, "loss": 38.1688, "step": 7443 }, { "epoch": 19.66061406404754, "grad_norm": 424.2295837402344, "learning_rate": 0.0003342126296711107, "loss": 43.1519, "step": 7444 }, { "epoch": 19.663255199735886, "grad_norm": 396.7959289550781, "learning_rate": 0.0003341732934937622, "loss": 41.5425, "step": 7445 }, { "epoch": 19.665896335424232, "grad_norm": 820.96484375, "learning_rate": 0.00033413395496592407, "loss": 43.5748, "step": 7446 }, { "epoch": 19.66853747111258, "grad_norm": 302.9134521484375, "learning_rate": 0.00033409461408869467, "loss": 42.8559, "step": 7447 }, { "epoch": 19.671178606800925, "grad_norm": 277.404296875, "learning_rate": 0.0003340552708631725, "loss": 47.0661, "step": 7448 }, { "epoch": 19.673819742489272, "grad_norm": 224.78277587890625, "learning_rate": 0.00033401592529045635, "loss": 46.1818, "step": 7449 }, { "epoch": 19.676460878177615, "grad_norm": 440.7366027832031, "learning_rate": 0.0003339765773716448, "loss": 44.5777, "step": 7450 }, { "epoch": 19.67910201386596, "grad_norm": 260.45745849609375, "learning_rate": 0.0003339372271078366, "loss": 40.6068, "step": 7451 }, { "epoch": 19.681743149554308, "grad_norm": 603.754150390625, "learning_rate": 0.00033389787450013067, "loss": 41.5691, "step": 7452 }, { "epoch": 19.684384285242654, "grad_norm": 177.17909240722656, "learning_rate": 0.00033385851954962584, "loss": 42.2592, "step": 7453 }, { "epoch": 19.687025420931, "grad_norm": 240.72764587402344, "learning_rate": 0.0003338191622574212, "loss": 40.2491, "step": 7454 }, { "epoch": 19.689666556619347, "grad_norm": 229.54461669921875, "learning_rate": 0.00033377980262461567, "loss": 38.7406, "step": 7455 }, { "epoch": 19.692307692307693, "grad_norm": 407.1549072265625, "learning_rate": 0.0003337404406523083, "loss": 38.0523, "step": 7456 }, { "epoch": 19.69494882799604, "grad_norm": 183.54656982421875, "learning_rate": 0.00033370107634159835, "loss": 38.2906, "step": 7457 }, { "epoch": 19.697589963684383, "grad_norm": 1165.2210693359375, "learning_rate": 0.00033366170969358506, "loss": 36.4921, "step": 7458 }, { "epoch": 19.70023109937273, "grad_norm": 234.12939453125, "learning_rate": 0.0003336223407093676, "loss": 37.2837, "step": 7459 }, { "epoch": 19.702872235061076, "grad_norm": 265.850830078125, "learning_rate": 0.00033358296939004547, "loss": 36.9171, "step": 7460 }, { "epoch": 19.705513370749422, "grad_norm": 200.2274932861328, "learning_rate": 0.000333543595736718, "loss": 36.8196, "step": 7461 }, { "epoch": 19.70815450643777, "grad_norm": 520.7882690429688, "learning_rate": 0.0003335042197504846, "loss": 35.9493, "step": 7462 }, { "epoch": 19.710795642126115, "grad_norm": 255.56588745117188, "learning_rate": 0.00033346484143244505, "loss": 35.6377, "step": 7463 }, { "epoch": 19.71343677781446, "grad_norm": 510.2916564941406, "learning_rate": 0.00033342546078369884, "loss": 36.2045, "step": 7464 }, { "epoch": 19.716077913502808, "grad_norm": 333.9627990722656, "learning_rate": 0.0003333860778053455, "loss": 36.892, "step": 7465 }, { "epoch": 19.71871904919115, "grad_norm": 982.2466430664062, "learning_rate": 0.00033334669249848503, "loss": 59.766, "step": 7466 }, { "epoch": 19.721360184879497, "grad_norm": 2000.933349609375, "learning_rate": 0.0003333073048642171, "loss": 44.327, "step": 7467 }, { "epoch": 19.724001320567844, "grad_norm": 19364.75, "learning_rate": 0.0003332679149036417, "loss": 45.7006, "step": 7468 }, { "epoch": 19.72664245625619, "grad_norm": 4073.2646484375, "learning_rate": 0.0003332285226178586, "loss": 47.1068, "step": 7469 }, { "epoch": 19.729283591944537, "grad_norm": 2555.2607421875, "learning_rate": 0.00033318912800796784, "loss": 44.9497, "step": 7470 }, { "epoch": 19.731924727632883, "grad_norm": 3886.14306640625, "learning_rate": 0.00033314973107506965, "loss": 50.1308, "step": 7471 }, { "epoch": 19.73456586332123, "grad_norm": 1505.5662841796875, "learning_rate": 0.00033311033182026404, "loss": 35.2254, "step": 7472 }, { "epoch": 19.737206999009572, "grad_norm": 4780.43115234375, "learning_rate": 0.0003330709302446513, "loss": 30.587, "step": 7473 }, { "epoch": 19.73984813469792, "grad_norm": 4305.3369140625, "learning_rate": 0.00033303152634933154, "loss": 27.7542, "step": 7474 }, { "epoch": 19.742489270386265, "grad_norm": 651.819091796875, "learning_rate": 0.0003329921201354052, "loss": 23.8376, "step": 7475 }, { "epoch": 19.745130406074612, "grad_norm": 815.9229736328125, "learning_rate": 0.00033295271160397265, "loss": 37.0748, "step": 7476 }, { "epoch": 19.74777154176296, "grad_norm": 819.5665893554688, "learning_rate": 0.00033291330075613435, "loss": 36.974, "step": 7477 }, { "epoch": 19.750412677451305, "grad_norm": 579.9901123046875, "learning_rate": 0.0003328738875929909, "loss": 37.1078, "step": 7478 }, { "epoch": 19.75305381313965, "grad_norm": 240.35604858398438, "learning_rate": 0.0003328344721156427, "loss": 36.492, "step": 7479 }, { "epoch": 19.755694948827998, "grad_norm": 190.73196411132812, "learning_rate": 0.00033279505432519066, "loss": 35.9965, "step": 7480 }, { "epoch": 19.75833608451634, "grad_norm": 250.59616088867188, "learning_rate": 0.0003327556342227353, "loss": 37.0891, "step": 7481 }, { "epoch": 19.760977220204687, "grad_norm": 134.61138916015625, "learning_rate": 0.0003327162118093775, "loss": 35.9626, "step": 7482 }, { "epoch": 19.763618355893033, "grad_norm": 579.0947875976562, "learning_rate": 0.000332676787086218, "loss": 36.0058, "step": 7483 }, { "epoch": 19.76625949158138, "grad_norm": 290.68853759765625, "learning_rate": 0.00033263736005435785, "loss": 37.2036, "step": 7484 }, { "epoch": 19.768900627269726, "grad_norm": 281.57904052734375, "learning_rate": 0.00033259793071489796, "loss": 38.0116, "step": 7485 }, { "epoch": 19.771541762958073, "grad_norm": 207.34266662597656, "learning_rate": 0.0003325584990689394, "loss": 35.9522, "step": 7486 }, { "epoch": 19.77418289864642, "grad_norm": 516.2290649414062, "learning_rate": 0.00033251906511758323, "loss": 37.6974, "step": 7487 }, { "epoch": 19.776824034334766, "grad_norm": 204.24632263183594, "learning_rate": 0.0003324796288619306, "loss": 37.5028, "step": 7488 }, { "epoch": 19.77946517002311, "grad_norm": 963.8366088867188, "learning_rate": 0.00033244019030308284, "loss": 34.5507, "step": 7489 }, { "epoch": 19.782106305711455, "grad_norm": 658.6143188476562, "learning_rate": 0.00033240074944214116, "loss": 36.4655, "step": 7490 }, { "epoch": 19.7847474413998, "grad_norm": 300.7995910644531, "learning_rate": 0.00033236130628020696, "loss": 36.8775, "step": 7491 }, { "epoch": 19.787388577088148, "grad_norm": 438.38531494140625, "learning_rate": 0.0003323218608183817, "loss": 36.8974, "step": 7492 }, { "epoch": 19.790029712776494, "grad_norm": 546.6461181640625, "learning_rate": 0.00033228241305776674, "loss": 38.7757, "step": 7493 }, { "epoch": 19.79267084846484, "grad_norm": 630.6079711914062, "learning_rate": 0.0003322429629994638, "loss": 41.1192, "step": 7494 }, { "epoch": 19.795311984153187, "grad_norm": 166.4779052734375, "learning_rate": 0.0003322035106445744, "loss": 40.292, "step": 7495 }, { "epoch": 19.79795311984153, "grad_norm": 263.570068359375, "learning_rate": 0.0003321640559942003, "loss": 41.981, "step": 7496 }, { "epoch": 19.800594255529877, "grad_norm": 319.9884033203125, "learning_rate": 0.00033212459904944313, "loss": 40.9531, "step": 7497 }, { "epoch": 19.803235391218223, "grad_norm": 224.820556640625, "learning_rate": 0.0003320851398114049, "loss": 41.6007, "step": 7498 }, { "epoch": 19.80587652690657, "grad_norm": 262.40521240234375, "learning_rate": 0.0003320456782811873, "loss": 44.3137, "step": 7499 }, { "epoch": 19.808517662594916, "grad_norm": 268.7491455078125, "learning_rate": 0.00033200621445989226, "loss": 44.5502, "step": 7500 }, { "epoch": 19.811158798283262, "grad_norm": 476.6345520019531, "learning_rate": 0.00033196674834862195, "loss": 41.5193, "step": 7501 }, { "epoch": 19.81379993397161, "grad_norm": 257.33734130859375, "learning_rate": 0.00033192727994847833, "loss": 41.1539, "step": 7502 }, { "epoch": 19.816441069659955, "grad_norm": 439.0334777832031, "learning_rate": 0.0003318878092605635, "loss": 40.3044, "step": 7503 }, { "epoch": 19.8190822053483, "grad_norm": 286.1539001464844, "learning_rate": 0.00033184833628597973, "loss": 40.1399, "step": 7504 }, { "epoch": 19.821723341036645, "grad_norm": 352.5527648925781, "learning_rate": 0.0003318088610258292, "loss": 40.6977, "step": 7505 }, { "epoch": 19.82436447672499, "grad_norm": 294.04437255859375, "learning_rate": 0.00033176938348121433, "loss": 39.9511, "step": 7506 }, { "epoch": 19.827005612413338, "grad_norm": 328.98291015625, "learning_rate": 0.0003317299036532374, "loss": 38.4202, "step": 7507 }, { "epoch": 19.829646748101684, "grad_norm": 313.38934326171875, "learning_rate": 0.000331690421543001, "loss": 36.2697, "step": 7508 }, { "epoch": 19.83228788379003, "grad_norm": 596.3222045898438, "learning_rate": 0.0003316509371516075, "loss": 38.143, "step": 7509 }, { "epoch": 19.834929019478377, "grad_norm": 314.05621337890625, "learning_rate": 0.0003316114504801596, "loss": 36.3596, "step": 7510 }, { "epoch": 19.837570155166723, "grad_norm": 399.2556457519531, "learning_rate": 0.0003315719615297599, "loss": 35.4924, "step": 7511 }, { "epoch": 19.840211290855066, "grad_norm": 329.5427551269531, "learning_rate": 0.000331532470301511, "loss": 35.6567, "step": 7512 }, { "epoch": 19.842852426543413, "grad_norm": 224.92962646484375, "learning_rate": 0.0003314929767965159, "loss": 36.4909, "step": 7513 }, { "epoch": 19.84549356223176, "grad_norm": 245.4032745361328, "learning_rate": 0.00033145348101587714, "loss": 36.0619, "step": 7514 }, { "epoch": 19.848134697920106, "grad_norm": 368.2260437011719, "learning_rate": 0.00033141398296069794, "loss": 36.8734, "step": 7515 }, { "epoch": 19.850775833608452, "grad_norm": 434.8218688964844, "learning_rate": 0.00033137448263208095, "loss": 37.2676, "step": 7516 }, { "epoch": 19.8534169692968, "grad_norm": 1623.151123046875, "learning_rate": 0.0003313349800311294, "loss": 52.6749, "step": 7517 }, { "epoch": 19.856058104985145, "grad_norm": 1359.044921875, "learning_rate": 0.00033129547515894627, "loss": 55.3118, "step": 7518 }, { "epoch": 19.858699240673488, "grad_norm": 2690.8642578125, "learning_rate": 0.00033125596801663476, "loss": 49.6741, "step": 7519 }, { "epoch": 19.861340376361834, "grad_norm": 5603.14208984375, "learning_rate": 0.00033121645860529817, "loss": 36.184, "step": 7520 }, { "epoch": 19.86398151205018, "grad_norm": 2940.387451171875, "learning_rate": 0.0003311769469260395, "loss": 30.6679, "step": 7521 }, { "epoch": 19.866622647738527, "grad_norm": 1928.2852783203125, "learning_rate": 0.00033113743297996244, "loss": 28.5576, "step": 7522 }, { "epoch": 19.869263783426874, "grad_norm": 3513.859619140625, "learning_rate": 0.00033109791676817015, "loss": 19.6293, "step": 7523 }, { "epoch": 19.87190491911522, "grad_norm": 1418.6036376953125, "learning_rate": 0.0003310583982917662, "loss": 21.5953, "step": 7524 }, { "epoch": 19.874546054803567, "grad_norm": 1833.5068359375, "learning_rate": 0.0003310188775518541, "loss": 21.601, "step": 7525 }, { "epoch": 19.877187190491913, "grad_norm": 2902.418701171875, "learning_rate": 0.00033097935454953737, "loss": 24.3275, "step": 7526 }, { "epoch": 19.879828326180256, "grad_norm": 440.9394836425781, "learning_rate": 0.0003309398292859198, "loss": 38.807, "step": 7527 }, { "epoch": 19.882469461868602, "grad_norm": 311.56707763671875, "learning_rate": 0.000330900301762105, "loss": 37.3752, "step": 7528 }, { "epoch": 19.88511059755695, "grad_norm": 249.01861572265625, "learning_rate": 0.00033086077197919686, "loss": 36.8598, "step": 7529 }, { "epoch": 19.887751733245295, "grad_norm": 317.91998291015625, "learning_rate": 0.0003308212399382991, "loss": 35.3097, "step": 7530 }, { "epoch": 19.890392868933642, "grad_norm": 466.0320739746094, "learning_rate": 0.0003307817056405157, "loss": 36.3885, "step": 7531 }, { "epoch": 19.89303400462199, "grad_norm": 286.75732421875, "learning_rate": 0.00033074216908695063, "loss": 36.6907, "step": 7532 }, { "epoch": 19.895675140310335, "grad_norm": 178.07827758789062, "learning_rate": 0.00033070263027870796, "loss": 35.5926, "step": 7533 }, { "epoch": 19.89831627599868, "grad_norm": 740.8466796875, "learning_rate": 0.0003306630892168917, "loss": 35.4064, "step": 7534 }, { "epoch": 19.900957411687024, "grad_norm": 268.4031677246094, "learning_rate": 0.00033062354590260605, "loss": 35.4308, "step": 7535 }, { "epoch": 19.90359854737537, "grad_norm": 448.96234130859375, "learning_rate": 0.00033058400033695527, "loss": 35.8039, "step": 7536 }, { "epoch": 19.906239683063717, "grad_norm": 464.2582092285156, "learning_rate": 0.00033054445252104364, "loss": 37.3171, "step": 7537 }, { "epoch": 19.908880818752063, "grad_norm": 259.11993408203125, "learning_rate": 0.0003305049024559754, "loss": 35.3649, "step": 7538 }, { "epoch": 19.91152195444041, "grad_norm": 553.3370361328125, "learning_rate": 0.0003304653501428552, "loss": 36.3732, "step": 7539 }, { "epoch": 19.914163090128756, "grad_norm": 474.1895751953125, "learning_rate": 0.00033042579558278717, "loss": 36.9801, "step": 7540 }, { "epoch": 19.916804225817103, "grad_norm": 341.205078125, "learning_rate": 0.00033038623877687626, "loss": 35.7366, "step": 7541 }, { "epoch": 19.919445361505446, "grad_norm": 755.4842529296875, "learning_rate": 0.0003303466797262267, "loss": 38.3494, "step": 7542 }, { "epoch": 19.922086497193792, "grad_norm": 189.35391235351562, "learning_rate": 0.0003303071184319434, "loss": 40.5109, "step": 7543 }, { "epoch": 19.92472763288214, "grad_norm": 861.1663818359375, "learning_rate": 0.0003302675548951309, "loss": 40.6563, "step": 7544 }, { "epoch": 19.927368768570485, "grad_norm": 258.773681640625, "learning_rate": 0.0003302279891168941, "loss": 43.6524, "step": 7545 }, { "epoch": 19.93000990425883, "grad_norm": 277.6313781738281, "learning_rate": 0.0003301884210983379, "loss": 43.9774, "step": 7546 }, { "epoch": 19.932651039947178, "grad_norm": 379.5459899902344, "learning_rate": 0.00033014885084056705, "loss": 45.8659, "step": 7547 }, { "epoch": 19.935292175635524, "grad_norm": 477.9105529785156, "learning_rate": 0.00033010927834468674, "loss": 43.8634, "step": 7548 }, { "epoch": 19.93793331132387, "grad_norm": 263.6606750488281, "learning_rate": 0.0003300697036118018, "loss": 42.3911, "step": 7549 }, { "epoch": 19.940574447012214, "grad_norm": 191.9362030029297, "learning_rate": 0.00033003012664301745, "loss": 40.0795, "step": 7550 }, { "epoch": 19.94321558270056, "grad_norm": 188.0053253173828, "learning_rate": 0.0003299905474394388, "loss": 39.7312, "step": 7551 }, { "epoch": 19.945856718388907, "grad_norm": 161.9982452392578, "learning_rate": 0.00032995096600217113, "loss": 37.3791, "step": 7552 }, { "epoch": 19.948497854077253, "grad_norm": 177.92478942871094, "learning_rate": 0.00032991138233231973, "loss": 36.6333, "step": 7553 }, { "epoch": 19.9511389897656, "grad_norm": 118.84905242919922, "learning_rate": 0.00032987179643098987, "loss": 37.3303, "step": 7554 }, { "epoch": 19.953780125453946, "grad_norm": 185.1211700439453, "learning_rate": 0.00032983220829928706, "loss": 35.4488, "step": 7555 }, { "epoch": 19.956421261142292, "grad_norm": 144.59426879882812, "learning_rate": 0.00032979261793831665, "loss": 35.6772, "step": 7556 }, { "epoch": 19.95906239683064, "grad_norm": 445.9217224121094, "learning_rate": 0.00032975302534918435, "loss": 37.4319, "step": 7557 }, { "epoch": 19.96170353251898, "grad_norm": 18451.193359375, "learning_rate": 0.00032971343053299557, "loss": 36.8826, "step": 7558 }, { "epoch": 19.96434466820733, "grad_norm": 1427.27099609375, "learning_rate": 0.0003296738334908561, "loss": 41.9538, "step": 7559 }, { "epoch": 19.966985803895675, "grad_norm": 3066.914794921875, "learning_rate": 0.0003296342342238717, "loss": 43.7098, "step": 7560 }, { "epoch": 19.96962693958402, "grad_norm": 2670.11767578125, "learning_rate": 0.000329594632733148, "loss": 44.9958, "step": 7561 }, { "epoch": 19.972268075272368, "grad_norm": 2381.6171875, "learning_rate": 0.000329555029019791, "loss": 31.6967, "step": 7562 }, { "epoch": 19.974909210960714, "grad_norm": 1714.3963623046875, "learning_rate": 0.0003295154230849066, "loss": 33.528, "step": 7563 }, { "epoch": 19.97755034664906, "grad_norm": 154.13038635253906, "learning_rate": 0.0003294758149296006, "loss": 37.1674, "step": 7564 }, { "epoch": 19.980191482337403, "grad_norm": 238.39695739746094, "learning_rate": 0.0003294362045549792, "loss": 37.518, "step": 7565 }, { "epoch": 19.98283261802575, "grad_norm": 239.19834899902344, "learning_rate": 0.0003293965919621484, "loss": 37.1384, "step": 7566 }, { "epoch": 19.985473753714096, "grad_norm": 188.89759826660156, "learning_rate": 0.00032935697715221444, "loss": 34.6459, "step": 7567 }, { "epoch": 19.988114889402443, "grad_norm": 252.7106475830078, "learning_rate": 0.0003293173601262836, "loss": 36.4629, "step": 7568 }, { "epoch": 19.99075602509079, "grad_norm": 261.1814880371094, "learning_rate": 0.00032927774088546203, "loss": 35.5483, "step": 7569 }, { "epoch": 19.993397160779136, "grad_norm": 308.131591796875, "learning_rate": 0.00032923811943085603, "loss": 35.8158, "step": 7570 }, { "epoch": 19.996038296467482, "grad_norm": 214.93984985351562, "learning_rate": 0.0003291984957635722, "loss": 36.3097, "step": 7571 }, { "epoch": 19.99867943215583, "grad_norm": 429.95013427734375, "learning_rate": 0.0003291588698847169, "loss": 38.6043, "step": 7572 }, { "epoch": 20.00132056784417, "grad_norm": 202.4595184326172, "learning_rate": 0.00032911924179539653, "loss": 41.3453, "step": 7573 }, { "epoch": 20.003961703532518, "grad_norm": 163.8855438232422, "learning_rate": 0.0003290796114967179, "loss": 39.8685, "step": 7574 }, { "epoch": 20.006602839220864, "grad_norm": 148.09506225585938, "learning_rate": 0.00032903997898978756, "loss": 39.329, "step": 7575 }, { "epoch": 20.00924397490921, "grad_norm": 161.46656799316406, "learning_rate": 0.00032900034427571224, "loss": 41.8648, "step": 7576 }, { "epoch": 20.011885110597557, "grad_norm": 214.23448181152344, "learning_rate": 0.00032896070735559873, "loss": 43.7718, "step": 7577 }, { "epoch": 20.014526246285904, "grad_norm": 361.383056640625, "learning_rate": 0.0003289210682305538, "loss": 42.4193, "step": 7578 }, { "epoch": 20.01716738197425, "grad_norm": 118.25663757324219, "learning_rate": 0.0003288814269016844, "loss": 43.128, "step": 7579 }, { "epoch": 20.019808517662597, "grad_norm": 145.0161895751953, "learning_rate": 0.00032884178337009765, "loss": 42.7301, "step": 7580 }, { "epoch": 20.02244965335094, "grad_norm": 126.77027130126953, "learning_rate": 0.00032880213763690025, "loss": 42.0115, "step": 7581 }, { "epoch": 20.025090789039286, "grad_norm": 129.873291015625, "learning_rate": 0.00032876248970319945, "loss": 41.2449, "step": 7582 }, { "epoch": 20.027731924727632, "grad_norm": 167.9395294189453, "learning_rate": 0.00032872283957010245, "loss": 39.3971, "step": 7583 }, { "epoch": 20.03037306041598, "grad_norm": 151.07928466796875, "learning_rate": 0.00032868318723871636, "loss": 41.5596, "step": 7584 }, { "epoch": 20.033014196104325, "grad_norm": 145.48104858398438, "learning_rate": 0.0003286435327101485, "loss": 38.1047, "step": 7585 }, { "epoch": 20.03565533179267, "grad_norm": 133.25857543945312, "learning_rate": 0.0003286038759855062, "loss": 36.4121, "step": 7586 }, { "epoch": 20.03829646748102, "grad_norm": 143.00535583496094, "learning_rate": 0.0003285642170658968, "loss": 36.0148, "step": 7587 }, { "epoch": 20.04093760316936, "grad_norm": 344.4967346191406, "learning_rate": 0.00032852455595242793, "loss": 37.1952, "step": 7588 }, { "epoch": 20.043578738857708, "grad_norm": 226.83575439453125, "learning_rate": 0.0003284848926462068, "loss": 36.1391, "step": 7589 }, { "epoch": 20.046219874546054, "grad_norm": 160.1512451171875, "learning_rate": 0.0003284452271483412, "loss": 36.6644, "step": 7590 }, { "epoch": 20.0488610102344, "grad_norm": 299.60302734375, "learning_rate": 0.0003284055594599387, "loss": 36.9589, "step": 7591 }, { "epoch": 20.051502145922747, "grad_norm": 312.15130615234375, "learning_rate": 0.000328365889582107, "loss": 35.7386, "step": 7592 }, { "epoch": 20.054143281611093, "grad_norm": 255.5709991455078, "learning_rate": 0.0003283262175159539, "loss": 36.1347, "step": 7593 }, { "epoch": 20.05678441729944, "grad_norm": 1425.8287353515625, "learning_rate": 0.0003282865432625872, "loss": 38.5029, "step": 7594 }, { "epoch": 20.059425552987786, "grad_norm": 5817.1318359375, "learning_rate": 0.0003282468668231148, "loss": 57.0264, "step": 7595 }, { "epoch": 20.06206668867613, "grad_norm": 4218.115234375, "learning_rate": 0.0003282071881986445, "loss": 35.2353, "step": 7596 }, { "epoch": 20.064707824364476, "grad_norm": 3419.312255859375, "learning_rate": 0.0003281675073902845, "loss": 38.9035, "step": 7597 }, { "epoch": 20.067348960052822, "grad_norm": 1062.5953369140625, "learning_rate": 0.00032812782439914275, "loss": 33.1477, "step": 7598 }, { "epoch": 20.06999009574117, "grad_norm": 1575.2982177734375, "learning_rate": 0.00032808813922632733, "loss": 26.4334, "step": 7599 }, { "epoch": 20.072631231429515, "grad_norm": 1062.7989501953125, "learning_rate": 0.0003280484518729466, "loss": 26.7199, "step": 7600 }, { "epoch": 20.072631231429515, "eval_loss": 4.106680870056152, "eval_runtime": 2.2231, "eval_samples_per_second": 222.664, "eval_steps_per_second": 27.889, "step": 7600 }, { "epoch": 20.07527236711786, "grad_norm": 1324.8948974609375, "learning_rate": 0.00032800876234010865, "loss": 27.9209, "step": 7601 }, { "epoch": 20.077913502806208, "grad_norm": 758.3114013671875, "learning_rate": 0.0003279690706289218, "loss": 20.7971, "step": 7602 }, { "epoch": 20.080554638494554, "grad_norm": 12529.0283203125, "learning_rate": 0.00032792937674049457, "loss": 24.1635, "step": 7603 }, { "epoch": 20.083195774182897, "grad_norm": 910.6559448242188, "learning_rate": 0.00032788968067593515, "loss": 17.9442, "step": 7604 }, { "epoch": 20.085836909871244, "grad_norm": 516.8895263671875, "learning_rate": 0.0003278499824363522, "loss": 20.1767, "step": 7605 }, { "epoch": 20.08847804555959, "grad_norm": 324.076171875, "learning_rate": 0.0003278102820228542, "loss": 36.6437, "step": 7606 }, { "epoch": 20.091119181247937, "grad_norm": 5895.166015625, "learning_rate": 0.00032777057943654986, "loss": 38.8937, "step": 7607 }, { "epoch": 20.093760316936283, "grad_norm": 254.58291625976562, "learning_rate": 0.00032773087467854767, "loss": 38.2032, "step": 7608 }, { "epoch": 20.09640145262463, "grad_norm": 275.2153625488281, "learning_rate": 0.0003276911677499566, "loss": 37.5636, "step": 7609 }, { "epoch": 20.099042588312976, "grad_norm": 306.5406799316406, "learning_rate": 0.0003276514586518851, "loss": 36.7109, "step": 7610 }, { "epoch": 20.10168372400132, "grad_norm": 178.7208709716797, "learning_rate": 0.00032761174738544244, "loss": 37.3128, "step": 7611 }, { "epoch": 20.104324859689665, "grad_norm": 259.1585998535156, "learning_rate": 0.00032757203395173724, "loss": 36.0308, "step": 7612 }, { "epoch": 20.10696599537801, "grad_norm": 208.8627166748047, "learning_rate": 0.0003275323183518785, "loss": 36.5515, "step": 7613 }, { "epoch": 20.109607131066358, "grad_norm": 239.85784912109375, "learning_rate": 0.00032749260058697536, "loss": 37.0005, "step": 7614 }, { "epoch": 20.112248266754705, "grad_norm": 552.0098876953125, "learning_rate": 0.00032745288065813696, "loss": 37.42, "step": 7615 }, { "epoch": 20.11488940244305, "grad_norm": 200.6132049560547, "learning_rate": 0.0003274131585664723, "loss": 35.527, "step": 7616 }, { "epoch": 20.117530538131398, "grad_norm": 521.3973388671875, "learning_rate": 0.00032737343431309064, "loss": 36.2813, "step": 7617 }, { "epoch": 20.120171673819744, "grad_norm": 194.48385620117188, "learning_rate": 0.0003273337078991013, "loss": 35.7019, "step": 7618 }, { "epoch": 20.122812809508087, "grad_norm": 227.4093017578125, "learning_rate": 0.00032729397932561354, "loss": 36.0966, "step": 7619 }, { "epoch": 20.125453945196433, "grad_norm": 313.8783874511719, "learning_rate": 0.00032725424859373687, "loss": 36.4894, "step": 7620 }, { "epoch": 20.12809508088478, "grad_norm": 434.7150573730469, "learning_rate": 0.0003272145157045807, "loss": 37.9776, "step": 7621 }, { "epoch": 20.130736216573126, "grad_norm": 895.0326538085938, "learning_rate": 0.0003271747806592545, "loss": 38.5914, "step": 7622 }, { "epoch": 20.133377352261473, "grad_norm": 205.40370178222656, "learning_rate": 0.0003271350434588679, "loss": 41.6336, "step": 7623 }, { "epoch": 20.13601848794982, "grad_norm": 421.2994079589844, "learning_rate": 0.0003270953041045305, "loss": 40.7896, "step": 7624 }, { "epoch": 20.138659623638166, "grad_norm": 167.12440490722656, "learning_rate": 0.00032705556259735215, "loss": 40.2118, "step": 7625 }, { "epoch": 20.141300759326512, "grad_norm": 471.2852783203125, "learning_rate": 0.0003270158189384423, "loss": 41.1496, "step": 7626 }, { "epoch": 20.143941895014855, "grad_norm": 141.27938842773438, "learning_rate": 0.000326976073128911, "loss": 41.6397, "step": 7627 }, { "epoch": 20.1465830307032, "grad_norm": 264.967529296875, "learning_rate": 0.0003269363251698681, "loss": 42.0872, "step": 7628 }, { "epoch": 20.149224166391548, "grad_norm": 234.92251586914062, "learning_rate": 0.00032689657506242354, "loss": 43.1467, "step": 7629 }, { "epoch": 20.151865302079894, "grad_norm": 161.99107360839844, "learning_rate": 0.00032685682280768726, "loss": 41.4172, "step": 7630 }, { "epoch": 20.15450643776824, "grad_norm": 212.93785095214844, "learning_rate": 0.0003268170684067693, "loss": 43.7661, "step": 7631 }, { "epoch": 20.157147573456587, "grad_norm": 184.09251403808594, "learning_rate": 0.00032677731186078, "loss": 38.7328, "step": 7632 }, { "epoch": 20.159788709144934, "grad_norm": 136.18067932128906, "learning_rate": 0.0003267375531708292, "loss": 39.2097, "step": 7633 }, { "epoch": 20.162429844833277, "grad_norm": 383.3724060058594, "learning_rate": 0.0003266977923380273, "loss": 39.3932, "step": 7634 }, { "epoch": 20.165070980521623, "grad_norm": 185.96482849121094, "learning_rate": 0.0003266580293634847, "loss": 37.6698, "step": 7635 }, { "epoch": 20.16771211620997, "grad_norm": 122.79054260253906, "learning_rate": 0.00032661826424831164, "loss": 37.3746, "step": 7636 }, { "epoch": 20.170353251898316, "grad_norm": 137.02272033691406, "learning_rate": 0.0003265784969936185, "loss": 36.6584, "step": 7637 }, { "epoch": 20.172994387586662, "grad_norm": 168.2884063720703, "learning_rate": 0.0003265387276005159, "loss": 37.7517, "step": 7638 }, { "epoch": 20.17563552327501, "grad_norm": 125.20411682128906, "learning_rate": 0.00032649895607011424, "loss": 35.4622, "step": 7639 }, { "epoch": 20.178276658963355, "grad_norm": 153.6974639892578, "learning_rate": 0.00032645918240352417, "loss": 35.4239, "step": 7640 }, { "epoch": 20.1809177946517, "grad_norm": 186.70648193359375, "learning_rate": 0.00032641940660185634, "loss": 35.8277, "step": 7641 }, { "epoch": 20.183558930340045, "grad_norm": 109.5548095703125, "learning_rate": 0.0003263796286662216, "loss": 36.0838, "step": 7642 }, { "epoch": 20.18620006602839, "grad_norm": 122.04216766357422, "learning_rate": 0.0003263398485977304, "loss": 35.1587, "step": 7643 }, { "epoch": 20.188841201716738, "grad_norm": 306.206298828125, "learning_rate": 0.00032630006639749386, "loss": 43.4889, "step": 7644 }, { "epoch": 20.191482337405084, "grad_norm": 2598.97314453125, "learning_rate": 0.0003262602820666228, "loss": 57.2832, "step": 7645 }, { "epoch": 20.19412347309343, "grad_norm": 4460.2841796875, "learning_rate": 0.00032622049560622815, "loss": 50.6837, "step": 7646 }, { "epoch": 20.196764608781777, "grad_norm": 1960.640869140625, "learning_rate": 0.000326180707017421, "loss": 53.3715, "step": 7647 }, { "epoch": 20.199405744470123, "grad_norm": 1711.59228515625, "learning_rate": 0.0003261409163013122, "loss": 37.6686, "step": 7648 }, { "epoch": 20.20204688015847, "grad_norm": 5437.197265625, "learning_rate": 0.00032610112345901317, "loss": 35.7265, "step": 7649 }, { "epoch": 20.204688015846813, "grad_norm": 1878.0673828125, "learning_rate": 0.00032606132849163493, "loss": 30.0039, "step": 7650 }, { "epoch": 20.20732915153516, "grad_norm": 1125.27001953125, "learning_rate": 0.0003260215314002888, "loss": 23.3243, "step": 7651 }, { "epoch": 20.209970287223506, "grad_norm": 507.0677185058594, "learning_rate": 0.000325981732186086, "loss": 18.6244, "step": 7652 }, { "epoch": 20.212611422911852, "grad_norm": 1427.8895263671875, "learning_rate": 0.00032594193085013797, "loss": 22.4389, "step": 7653 }, { "epoch": 20.2152525586002, "grad_norm": 493.3594055175781, "learning_rate": 0.0003259021273935562, "loss": 32.8129, "step": 7654 }, { "epoch": 20.217893694288545, "grad_norm": 516.1626586914062, "learning_rate": 0.0003258623218174521, "loss": 37.2601, "step": 7655 }, { "epoch": 20.22053482997689, "grad_norm": 258.2445068359375, "learning_rate": 0.0003258225141229372, "loss": 38.0713, "step": 7656 }, { "epoch": 20.223175965665234, "grad_norm": 237.0260772705078, "learning_rate": 0.0003257827043111231, "loss": 39.6884, "step": 7657 }, { "epoch": 20.22581710135358, "grad_norm": 193.6735076904297, "learning_rate": 0.00032574289238312157, "loss": 37.5843, "step": 7658 }, { "epoch": 20.228458237041927, "grad_norm": 190.05615234375, "learning_rate": 0.0003257030783400442, "loss": 36.4911, "step": 7659 }, { "epoch": 20.231099372730274, "grad_norm": 193.47157287597656, "learning_rate": 0.00032566326218300286, "loss": 37.1858, "step": 7660 }, { "epoch": 20.23374050841862, "grad_norm": 97.74513244628906, "learning_rate": 0.0003256234439131094, "loss": 36.9927, "step": 7661 }, { "epoch": 20.236381644106967, "grad_norm": 106.53952026367188, "learning_rate": 0.00032558362353147564, "loss": 36.4025, "step": 7662 }, { "epoch": 20.239022779795313, "grad_norm": 140.36595153808594, "learning_rate": 0.0003255438010392136, "loss": 34.7682, "step": 7663 }, { "epoch": 20.24166391548366, "grad_norm": 154.62359619140625, "learning_rate": 0.00032550397643743535, "loss": 36.4877, "step": 7664 }, { "epoch": 20.244305051172002, "grad_norm": 169.66961669921875, "learning_rate": 0.00032546414972725287, "loss": 37.1427, "step": 7665 }, { "epoch": 20.24694618686035, "grad_norm": 342.3319091796875, "learning_rate": 0.0003254243209097783, "loss": 37.1139, "step": 7666 }, { "epoch": 20.249587322548695, "grad_norm": 209.43692016601562, "learning_rate": 0.0003253844899861239, "loss": 35.1535, "step": 7667 }, { "epoch": 20.25222845823704, "grad_norm": 168.82568359375, "learning_rate": 0.00032534465695740203, "loss": 35.6593, "step": 7668 }, { "epoch": 20.254869593925388, "grad_norm": 132.62184143066406, "learning_rate": 0.00032530482182472465, "loss": 35.4431, "step": 7669 }, { "epoch": 20.257510729613735, "grad_norm": 224.99224853515625, "learning_rate": 0.0003252649845892045, "loss": 36.9631, "step": 7670 }, { "epoch": 20.26015186530208, "grad_norm": 216.8449249267578, "learning_rate": 0.0003252251452519538, "loss": 37.5887, "step": 7671 }, { "epoch": 20.262793000990428, "grad_norm": 244.70326232910156, "learning_rate": 0.0003251853038140852, "loss": 40.079, "step": 7672 }, { "epoch": 20.26543413667877, "grad_norm": 6815.9931640625, "learning_rate": 0.0003251454602767111, "loss": 41.6096, "step": 7673 }, { "epoch": 20.268075272367117, "grad_norm": 115.888427734375, "learning_rate": 0.0003251056146409441, "loss": 38.9721, "step": 7674 }, { "epoch": 20.270716408055463, "grad_norm": 141.60328674316406, "learning_rate": 0.0003250657669078971, "loss": 40.2727, "step": 7675 }, { "epoch": 20.27335754374381, "grad_norm": 134.52279663085938, "learning_rate": 0.00032502591707868247, "loss": 40.7816, "step": 7676 }, { "epoch": 20.275998679432156, "grad_norm": 104.8785400390625, "learning_rate": 0.0003249860651544133, "loss": 42.129, "step": 7677 }, { "epoch": 20.278639815120503, "grad_norm": 217.3083953857422, "learning_rate": 0.0003249462111362023, "loss": 42.1524, "step": 7678 }, { "epoch": 20.28128095080885, "grad_norm": 64.93672180175781, "learning_rate": 0.00032490635502516234, "loss": 40.5187, "step": 7679 }, { "epoch": 20.283922086497192, "grad_norm": 642.3863525390625, "learning_rate": 0.00032486649682240645, "loss": 42.8428, "step": 7680 }, { "epoch": 20.28656322218554, "grad_norm": 84.80598449707031, "learning_rate": 0.0003248266365290476, "loss": 38.7031, "step": 7681 }, { "epoch": 20.289204357873885, "grad_norm": 131.08018493652344, "learning_rate": 0.00032478677414619884, "loss": 38.6832, "step": 7682 }, { "epoch": 20.29184549356223, "grad_norm": 98.22313690185547, "learning_rate": 0.00032474690967497337, "loss": 39.3398, "step": 7683 }, { "epoch": 20.294486629250578, "grad_norm": 85.90760803222656, "learning_rate": 0.0003247070431164844, "loss": 37.7538, "step": 7684 }, { "epoch": 20.297127764938924, "grad_norm": 120.5302734375, "learning_rate": 0.0003246671744718451, "loss": 38.1679, "step": 7685 }, { "epoch": 20.29976890062727, "grad_norm": 169.66200256347656, "learning_rate": 0.00032462730374216886, "loss": 38.493, "step": 7686 }, { "epoch": 20.302410036315617, "grad_norm": 176.08421325683594, "learning_rate": 0.0003245874309285689, "loss": 36.9024, "step": 7687 }, { "epoch": 20.30505117200396, "grad_norm": 70.88851165771484, "learning_rate": 0.0003245475560321588, "loss": 36.9646, "step": 7688 }, { "epoch": 20.307692307692307, "grad_norm": 76.42431640625, "learning_rate": 0.000324507679054052, "loss": 35.1499, "step": 7689 }, { "epoch": 20.310333443380653, "grad_norm": 205.8919677734375, "learning_rate": 0.000324467799995362, "loss": 35.7114, "step": 7690 }, { "epoch": 20.312974579069, "grad_norm": 111.99324035644531, "learning_rate": 0.0003244279188572025, "loss": 35.3891, "step": 7691 }, { "epoch": 20.315615714757346, "grad_norm": 154.97061157226562, "learning_rate": 0.000324388035640687, "loss": 35.7859, "step": 7692 }, { "epoch": 20.318256850445692, "grad_norm": 105.29151153564453, "learning_rate": 0.00032434815034692937, "loss": 37.4004, "step": 7693 }, { "epoch": 20.32089798613404, "grad_norm": 270.85443115234375, "learning_rate": 0.0003243082629770433, "loss": 44.3218, "step": 7694 }, { "epoch": 20.323539121822385, "grad_norm": 1125.968994140625, "learning_rate": 0.0003242683735321426, "loss": 61.7943, "step": 7695 }, { "epoch": 20.326180257510728, "grad_norm": 1243.990234375, "learning_rate": 0.00032422848201334127, "loss": 56.9364, "step": 7696 }, { "epoch": 20.328821393199075, "grad_norm": 1035.3668212890625, "learning_rate": 0.0003241885884217531, "loss": 56.687, "step": 7697 }, { "epoch": 20.33146252888742, "grad_norm": 3625.75634765625, "learning_rate": 0.00032414869275849224, "loss": 47.0563, "step": 7698 }, { "epoch": 20.334103664575768, "grad_norm": 2473.748046875, "learning_rate": 0.00032410879502467265, "loss": 38.0556, "step": 7699 }, { "epoch": 20.336744800264114, "grad_norm": 1452.712158203125, "learning_rate": 0.0003240688952214085, "loss": 35.4515, "step": 7700 }, { "epoch": 20.33938593595246, "grad_norm": 1168.5272216796875, "learning_rate": 0.00032402899334981394, "loss": 29.8205, "step": 7701 }, { "epoch": 20.342027071640807, "grad_norm": 1726.7958984375, "learning_rate": 0.00032398908941100324, "loss": 22.6192, "step": 7702 }, { "epoch": 20.34466820732915, "grad_norm": 3668.355224609375, "learning_rate": 0.0003239491834060908, "loss": 23.8251, "step": 7703 }, { "epoch": 20.347309343017496, "grad_norm": 235.78720092773438, "learning_rate": 0.00032390927533619074, "loss": 27.8816, "step": 7704 }, { "epoch": 20.349950478705843, "grad_norm": 105.71586608886719, "learning_rate": 0.00032386936520241764, "loss": 37.5305, "step": 7705 }, { "epoch": 20.35259161439419, "grad_norm": 191.27200317382812, "learning_rate": 0.00032382945300588577, "loss": 36.2145, "step": 7706 }, { "epoch": 20.355232750082536, "grad_norm": 157.55514526367188, "learning_rate": 0.00032378953874770996, "loss": 35.3308, "step": 7707 }, { "epoch": 20.357873885770882, "grad_norm": 353.1784362792969, "learning_rate": 0.00032374962242900457, "loss": 35.0874, "step": 7708 }, { "epoch": 20.36051502145923, "grad_norm": 133.1815185546875, "learning_rate": 0.00032370970405088426, "loss": 35.7552, "step": 7709 }, { "epoch": 20.363156157147575, "grad_norm": 130.53562927246094, "learning_rate": 0.0003236697836144638, "loss": 35.4276, "step": 7710 }, { "epoch": 20.365797292835918, "grad_norm": 300.014404296875, "learning_rate": 0.00032362986112085786, "loss": 36.5223, "step": 7711 }, { "epoch": 20.368438428524264, "grad_norm": 144.85134887695312, "learning_rate": 0.0003235899365711814, "loss": 35.9157, "step": 7712 }, { "epoch": 20.37107956421261, "grad_norm": 260.8979797363281, "learning_rate": 0.0003235500099665491, "loss": 35.6265, "step": 7713 }, { "epoch": 20.373720699900957, "grad_norm": 102.78553771972656, "learning_rate": 0.000323510081308076, "loss": 35.3373, "step": 7714 }, { "epoch": 20.376361835589304, "grad_norm": 129.17311096191406, "learning_rate": 0.00032347015059687706, "loss": 34.8744, "step": 7715 }, { "epoch": 20.37900297127765, "grad_norm": 192.80726623535156, "learning_rate": 0.00032343021783406734, "loss": 37.2821, "step": 7716 }, { "epoch": 20.381644106965997, "grad_norm": 389.80792236328125, "learning_rate": 0.00032339028302076194, "loss": 35.4688, "step": 7717 }, { "epoch": 20.384285242654343, "grad_norm": 73.40228271484375, "learning_rate": 0.00032335034615807597, "loss": 34.4702, "step": 7718 }, { "epoch": 20.386926378342686, "grad_norm": 154.31906127929688, "learning_rate": 0.0003233104072471247, "loss": 35.5183, "step": 7719 }, { "epoch": 20.389567514031032, "grad_norm": 148.73985290527344, "learning_rate": 0.00032327046628902336, "loss": 35.9302, "step": 7720 }, { "epoch": 20.39220864971938, "grad_norm": 100.3431167602539, "learning_rate": 0.00032323052328488724, "loss": 36.858, "step": 7721 }, { "epoch": 20.394849785407725, "grad_norm": 213.13153076171875, "learning_rate": 0.00032319057823583183, "loss": 38.2541, "step": 7722 }, { "epoch": 20.39749092109607, "grad_norm": 223.0154266357422, "learning_rate": 0.00032315063114297243, "loss": 42.8703, "step": 7723 }, { "epoch": 20.400132056784418, "grad_norm": 198.44285583496094, "learning_rate": 0.0003231106820074247, "loss": 40.3491, "step": 7724 }, { "epoch": 20.402773192472765, "grad_norm": 178.33836364746094, "learning_rate": 0.00032307073083030416, "loss": 40.4591, "step": 7725 }, { "epoch": 20.405414328161108, "grad_norm": 249.08819580078125, "learning_rate": 0.00032303077761272636, "loss": 41.1591, "step": 7726 }, { "epoch": 20.408055463849454, "grad_norm": 160.17665100097656, "learning_rate": 0.000322990822355807, "loss": 43.3662, "step": 7727 }, { "epoch": 20.4106965995378, "grad_norm": 106.76828002929688, "learning_rate": 0.00032295086506066174, "loss": 43.4346, "step": 7728 }, { "epoch": 20.413337735226147, "grad_norm": 490.0293884277344, "learning_rate": 0.00032291090572840654, "loss": 41.5909, "step": 7729 }, { "epoch": 20.415978870914493, "grad_norm": 112.09744262695312, "learning_rate": 0.00032287094436015697, "loss": 39.3318, "step": 7730 }, { "epoch": 20.41862000660284, "grad_norm": 115.71880340576172, "learning_rate": 0.0003228309809570292, "loss": 41.6634, "step": 7731 }, { "epoch": 20.421261142291186, "grad_norm": 146.75296020507812, "learning_rate": 0.000322791015520139, "loss": 40.1553, "step": 7732 }, { "epoch": 20.423902277979533, "grad_norm": 174.4634552001953, "learning_rate": 0.00032275104805060253, "loss": 39.0662, "step": 7733 }, { "epoch": 20.426543413667876, "grad_norm": 176.22618103027344, "learning_rate": 0.00032271107854953574, "loss": 40.48, "step": 7734 }, { "epoch": 20.429184549356222, "grad_norm": 113.05332946777344, "learning_rate": 0.0003226711070180548, "loss": 37.5817, "step": 7735 }, { "epoch": 20.43182568504457, "grad_norm": 116.76354217529297, "learning_rate": 0.0003226311334572759, "loss": 38.2303, "step": 7736 }, { "epoch": 20.434466820732915, "grad_norm": 176.68743896484375, "learning_rate": 0.0003225911578683152, "loss": 38.5218, "step": 7737 }, { "epoch": 20.43710795642126, "grad_norm": 133.38455200195312, "learning_rate": 0.0003225511802522892, "loss": 36.6511, "step": 7738 }, { "epoch": 20.439749092109608, "grad_norm": 74.97927856445312, "learning_rate": 0.000322511200610314, "loss": 36.7179, "step": 7739 }, { "epoch": 20.442390227797954, "grad_norm": 180.76170349121094, "learning_rate": 0.00032247121894350613, "loss": 36.8941, "step": 7740 }, { "epoch": 20.4450313634863, "grad_norm": 128.0946502685547, "learning_rate": 0.000322431235252982, "loss": 35.746, "step": 7741 }, { "epoch": 20.447672499174644, "grad_norm": 273.2564392089844, "learning_rate": 0.0003223912495398583, "loss": 37.1823, "step": 7742 }, { "epoch": 20.45031363486299, "grad_norm": 406.3354797363281, "learning_rate": 0.0003223512618052514, "loss": 35.7871, "step": 7743 }, { "epoch": 20.452954770551337, "grad_norm": 85.43754577636719, "learning_rate": 0.000322311272050278, "loss": 35.5633, "step": 7744 }, { "epoch": 20.455595906239683, "grad_norm": 554.5789184570312, "learning_rate": 0.0003222712802760548, "loss": 43.9689, "step": 7745 }, { "epoch": 20.45823704192803, "grad_norm": 1308.7672119140625, "learning_rate": 0.00032223128648369855, "loss": 61.8469, "step": 7746 }, { "epoch": 20.460878177616376, "grad_norm": 3371.87548828125, "learning_rate": 0.0003221912906743262, "loss": 71.4561, "step": 7747 }, { "epoch": 20.463519313304722, "grad_norm": 3985.70166015625, "learning_rate": 0.00032215129284905426, "loss": 75.4301, "step": 7748 }, { "epoch": 20.466160448993065, "grad_norm": 1710.2965087890625, "learning_rate": 0.0003221112930089999, "loss": 59.6906, "step": 7749 }, { "epoch": 20.46880158468141, "grad_norm": 2546.578857421875, "learning_rate": 0.0003220712911552801, "loss": 43.4182, "step": 7750 }, { "epoch": 20.471442720369758, "grad_norm": 1571.208740234375, "learning_rate": 0.0003220312872890119, "loss": 36.5208, "step": 7751 }, { "epoch": 20.474083856058105, "grad_norm": 842.62109375, "learning_rate": 0.00032199128141131215, "loss": 29.7078, "step": 7752 }, { "epoch": 20.47672499174645, "grad_norm": 5537.4765625, "learning_rate": 0.0003219512735232982, "loss": 22.773, "step": 7753 }, { "epoch": 20.479366127434798, "grad_norm": 1432.61474609375, "learning_rate": 0.0003219112636260873, "loss": 21.558, "step": 7754 }, { "epoch": 20.482007263123144, "grad_norm": 111.72416687011719, "learning_rate": 0.00032187125172079655, "loss": 37.1287, "step": 7755 }, { "epoch": 20.48464839881149, "grad_norm": 329.77117919921875, "learning_rate": 0.00032183123780854327, "loss": 36.9989, "step": 7756 }, { "epoch": 20.487289534499833, "grad_norm": 155.43756103515625, "learning_rate": 0.00032179122189044497, "loss": 36.5399, "step": 7757 }, { "epoch": 20.48993067018818, "grad_norm": 163.3948516845703, "learning_rate": 0.00032175120396761884, "loss": 36.0071, "step": 7758 }, { "epoch": 20.492571805876526, "grad_norm": 169.70875549316406, "learning_rate": 0.0003217111840411826, "loss": 35.3804, "step": 7759 }, { "epoch": 20.495212941564873, "grad_norm": 109.52317810058594, "learning_rate": 0.0003216711621122537, "loss": 37.05, "step": 7760 }, { "epoch": 20.49785407725322, "grad_norm": 132.6793670654297, "learning_rate": 0.0003216311381819496, "loss": 35.2697, "step": 7761 }, { "epoch": 20.500495212941566, "grad_norm": 283.8396911621094, "learning_rate": 0.00032159111225138807, "loss": 35.8135, "step": 7762 }, { "epoch": 20.503136348629912, "grad_norm": 149.10618591308594, "learning_rate": 0.0003215510843216868, "loss": 34.5008, "step": 7763 }, { "epoch": 20.50577748431826, "grad_norm": 93.00524139404297, "learning_rate": 0.00032151105439396353, "loss": 34.8238, "step": 7764 }, { "epoch": 20.5084186200066, "grad_norm": 83.3645248413086, "learning_rate": 0.00032147102246933606, "loss": 36.4869, "step": 7765 }, { "epoch": 20.511059755694948, "grad_norm": 725.3027954101562, "learning_rate": 0.0003214309885489223, "loss": 34.9829, "step": 7766 }, { "epoch": 20.513700891383294, "grad_norm": 138.35763549804688, "learning_rate": 0.0003213909526338401, "loss": 34.6884, "step": 7767 }, { "epoch": 20.51634202707164, "grad_norm": 108.8589859008789, "learning_rate": 0.0003213509147252075, "loss": 35.1821, "step": 7768 }, { "epoch": 20.518983162759987, "grad_norm": 262.72314453125, "learning_rate": 0.00032131087482414254, "loss": 34.3261, "step": 7769 }, { "epoch": 20.521624298448334, "grad_norm": 276.22564697265625, "learning_rate": 0.0003212708329317633, "loss": 35.64, "step": 7770 }, { "epoch": 20.52426543413668, "grad_norm": 250.50942993164062, "learning_rate": 0.0003212307890491879, "loss": 36.8945, "step": 7771 }, { "epoch": 20.526906569825023, "grad_norm": 222.57525634765625, "learning_rate": 0.0003211907431775345, "loss": 38.7825, "step": 7772 }, { "epoch": 20.52954770551337, "grad_norm": 108.93848419189453, "learning_rate": 0.0003211506953179216, "loss": 41.2237, "step": 7773 }, { "epoch": 20.532188841201716, "grad_norm": 98.74085235595703, "learning_rate": 0.0003211106454714671, "loss": 39.2145, "step": 7774 }, { "epoch": 20.534829976890062, "grad_norm": 160.2722930908203, "learning_rate": 0.0003210705936392897, "loss": 40.9681, "step": 7775 }, { "epoch": 20.53747111257841, "grad_norm": 141.37229919433594, "learning_rate": 0.00032103053982250775, "loss": 40.9167, "step": 7776 }, { "epoch": 20.540112248266755, "grad_norm": 111.12305450439453, "learning_rate": 0.00032099048402223966, "loss": 41.8184, "step": 7777 }, { "epoch": 20.5427533839551, "grad_norm": 124.6285171508789, "learning_rate": 0.00032095042623960407, "loss": 43.6394, "step": 7778 }, { "epoch": 20.545394519643448, "grad_norm": 166.89122009277344, "learning_rate": 0.00032091036647571944, "loss": 44.1523, "step": 7779 }, { "epoch": 20.54803565533179, "grad_norm": 106.99757385253906, "learning_rate": 0.00032087030473170445, "loss": 41.8169, "step": 7780 }, { "epoch": 20.550676791020138, "grad_norm": 105.94007110595703, "learning_rate": 0.00032083024100867783, "loss": 39.8688, "step": 7781 }, { "epoch": 20.553317926708484, "grad_norm": 129.05023193359375, "learning_rate": 0.00032079017530775843, "loss": 40.0198, "step": 7782 }, { "epoch": 20.55595906239683, "grad_norm": 107.22564697265625, "learning_rate": 0.000320750107630065, "loss": 39.3285, "step": 7783 }, { "epoch": 20.558600198085177, "grad_norm": 85.61442565917969, "learning_rate": 0.00032071003797671627, "loss": 38.2459, "step": 7784 }, { "epoch": 20.561241333773523, "grad_norm": 102.51970672607422, "learning_rate": 0.0003206699663488313, "loss": 38.5641, "step": 7785 }, { "epoch": 20.56388246946187, "grad_norm": 77.46428680419922, "learning_rate": 0.00032062989274752907, "loss": 36.5875, "step": 7786 }, { "epoch": 20.566523605150216, "grad_norm": 121.81597137451172, "learning_rate": 0.00032058981717392854, "loss": 35.0881, "step": 7787 }, { "epoch": 20.56916474083856, "grad_norm": 130.0885772705078, "learning_rate": 0.0003205497396291488, "loss": 36.5364, "step": 7788 }, { "epoch": 20.571805876526906, "grad_norm": 136.5607147216797, "learning_rate": 0.000320509660114309, "loss": 36.179, "step": 7789 }, { "epoch": 20.574447012215252, "grad_norm": 93.79354095458984, "learning_rate": 0.00032046957863052846, "loss": 35.6361, "step": 7790 }, { "epoch": 20.5770881479036, "grad_norm": 73.1540756225586, "learning_rate": 0.0003204294951789263, "loss": 34.6509, "step": 7791 }, { "epoch": 20.579729283591945, "grad_norm": 108.16254425048828, "learning_rate": 0.0003203894097606219, "loss": 35.2555, "step": 7792 }, { "epoch": 20.58237041928029, "grad_norm": 141.85464477539062, "learning_rate": 0.0003203493223767344, "loss": 35.831, "step": 7793 }, { "epoch": 20.585011554968638, "grad_norm": 131.9561767578125, "learning_rate": 0.00032030923302838355, "loss": 35.6408, "step": 7794 }, { "epoch": 20.58765269065698, "grad_norm": 104.44185638427734, "learning_rate": 0.0003202691417166887, "loss": 36.2393, "step": 7795 }, { "epoch": 20.590293826345327, "grad_norm": 2609.8916015625, "learning_rate": 0.0003202290484427692, "loss": 73.3816, "step": 7796 }, { "epoch": 20.592934962033674, "grad_norm": 2458.0, "learning_rate": 0.00032018895320774495, "loss": 80.298, "step": 7797 }, { "epoch": 20.59557609772202, "grad_norm": 2558.887451171875, "learning_rate": 0.0003201488560127353, "loss": 77.149, "step": 7798 }, { "epoch": 20.598217233410367, "grad_norm": 13682.18359375, "learning_rate": 0.00032010875685886014, "loss": 65.7986, "step": 7799 }, { "epoch": 20.600858369098713, "grad_norm": 1158.1549072265625, "learning_rate": 0.000320068655747239, "loss": 54.9829, "step": 7800 }, { "epoch": 20.600858369098713, "eval_loss": 4.862379550933838, "eval_runtime": 2.1318, "eval_samples_per_second": 232.195, "eval_steps_per_second": 29.083, "step": 7800 }, { "epoch": 20.60349950478706, "grad_norm": 1032.4141845703125, "learning_rate": 0.00032002855267899196, "loss": 64.1979, "step": 7801 }, { "epoch": 20.606140640475406, "grad_norm": 797.5202026367188, "learning_rate": 0.0003199884476552386, "loss": 44.5186, "step": 7802 }, { "epoch": 20.60878177616375, "grad_norm": 1336.5677490234375, "learning_rate": 0.000319948340677099, "loss": 35.6856, "step": 7803 }, { "epoch": 20.611422911852095, "grad_norm": 1427.4168701171875, "learning_rate": 0.0003199082317456931, "loss": 31.6191, "step": 7804 }, { "epoch": 20.61406404754044, "grad_norm": 1004.087158203125, "learning_rate": 0.00031986812086214083, "loss": 29.4925, "step": 7805 }, { "epoch": 20.616705183228788, "grad_norm": 108.22225189208984, "learning_rate": 0.0003198280080275624, "loss": 36.415, "step": 7806 }, { "epoch": 20.619346318917135, "grad_norm": 144.40634155273438, "learning_rate": 0.0003197878932430778, "loss": 35.689, "step": 7807 }, { "epoch": 20.62198745460548, "grad_norm": 188.12574768066406, "learning_rate": 0.00031974777650980735, "loss": 38.2146, "step": 7808 }, { "epoch": 20.624628590293828, "grad_norm": 207.51736450195312, "learning_rate": 0.0003197076578288712, "loss": 35.998, "step": 7809 }, { "epoch": 20.627269725982174, "grad_norm": 208.1840057373047, "learning_rate": 0.00031966753720138953, "loss": 35.3778, "step": 7810 }, { "epoch": 20.629910861670517, "grad_norm": 213.74179077148438, "learning_rate": 0.0003196274146284829, "loss": 36.2548, "step": 7811 }, { "epoch": 20.632551997358863, "grad_norm": 160.84902954101562, "learning_rate": 0.0003195872901112717, "loss": 37.1147, "step": 7812 }, { "epoch": 20.63519313304721, "grad_norm": 875.2734985351562, "learning_rate": 0.0003195471636508762, "loss": 35.7504, "step": 7813 }, { "epoch": 20.637834268735556, "grad_norm": 135.44973754882812, "learning_rate": 0.000319507035248417, "loss": 35.5707, "step": 7814 }, { "epoch": 20.640475404423903, "grad_norm": 119.31359100341797, "learning_rate": 0.0003194669049050147, "loss": 36.6716, "step": 7815 }, { "epoch": 20.64311654011225, "grad_norm": 325.10675048828125, "learning_rate": 0.0003194267726217899, "loss": 35.4076, "step": 7816 }, { "epoch": 20.645757675800596, "grad_norm": 237.3900604248047, "learning_rate": 0.00031938663839986325, "loss": 36.4581, "step": 7817 }, { "epoch": 20.64839881148894, "grad_norm": 248.6508331298828, "learning_rate": 0.00031934650224035556, "loss": 35.5719, "step": 7818 }, { "epoch": 20.651039947177285, "grad_norm": 201.0950164794922, "learning_rate": 0.0003193063641443874, "loss": 35.7749, "step": 7819 }, { "epoch": 20.65368108286563, "grad_norm": 149.4627685546875, "learning_rate": 0.00031926622411307984, "loss": 34.7036, "step": 7820 }, { "epoch": 20.656322218553978, "grad_norm": 1240.95751953125, "learning_rate": 0.00031922608214755355, "loss": 35.4869, "step": 7821 }, { "epoch": 20.658963354242324, "grad_norm": 449.6322326660156, "learning_rate": 0.0003191859382489297, "loss": 39.6427, "step": 7822 }, { "epoch": 20.66160448993067, "grad_norm": 415.5215759277344, "learning_rate": 0.0003191457924183291, "loss": 40.1771, "step": 7823 }, { "epoch": 20.664245625619017, "grad_norm": 116.91801452636719, "learning_rate": 0.0003191056446568728, "loss": 39.8381, "step": 7824 }, { "epoch": 20.666886761307364, "grad_norm": 166.28533935546875, "learning_rate": 0.00031906549496568204, "loss": 42.4995, "step": 7825 }, { "epoch": 20.669527896995707, "grad_norm": 241.34432983398438, "learning_rate": 0.00031902534334587795, "loss": 41.2363, "step": 7826 }, { "epoch": 20.672169032684053, "grad_norm": 187.59713745117188, "learning_rate": 0.0003189851897985817, "loss": 44.9868, "step": 7827 }, { "epoch": 20.6748101683724, "grad_norm": 151.02084350585938, "learning_rate": 0.0003189450343249144, "loss": 40.6287, "step": 7828 }, { "epoch": 20.677451304060746, "grad_norm": 110.5277328491211, "learning_rate": 0.0003189048769259977, "loss": 46.0664, "step": 7829 }, { "epoch": 20.680092439749092, "grad_norm": 139.9405975341797, "learning_rate": 0.0003188647176029527, "loss": 41.9763, "step": 7830 }, { "epoch": 20.68273357543744, "grad_norm": 122.4825210571289, "learning_rate": 0.00031882455635690087, "loss": 40.674, "step": 7831 }, { "epoch": 20.685374711125785, "grad_norm": 174.952392578125, "learning_rate": 0.00031878439318896377, "loss": 39.9397, "step": 7832 }, { "epoch": 20.68801584681413, "grad_norm": 121.83260345458984, "learning_rate": 0.0003187442281002629, "loss": 39.7326, "step": 7833 }, { "epoch": 20.690656982502475, "grad_norm": 181.98883056640625, "learning_rate": 0.0003187040610919199, "loss": 38.6626, "step": 7834 }, { "epoch": 20.69329811819082, "grad_norm": 119.1464614868164, "learning_rate": 0.0003186638921650563, "loss": 38.2615, "step": 7835 }, { "epoch": 20.695939253879168, "grad_norm": 87.44525909423828, "learning_rate": 0.0003186237213207938, "loss": 38.0369, "step": 7836 }, { "epoch": 20.698580389567514, "grad_norm": 214.8970184326172, "learning_rate": 0.0003185835485602543, "loss": 35.398, "step": 7837 }, { "epoch": 20.70122152525586, "grad_norm": 369.2488708496094, "learning_rate": 0.00031854337388455945, "loss": 36.3929, "step": 7838 }, { "epoch": 20.703862660944207, "grad_norm": 103.5701904296875, "learning_rate": 0.0003185031972948311, "loss": 36.1116, "step": 7839 }, { "epoch": 20.706503796632553, "grad_norm": 100.42804718017578, "learning_rate": 0.0003184630187921913, "loss": 35.7385, "step": 7840 }, { "epoch": 20.709144932320896, "grad_norm": 112.76209259033203, "learning_rate": 0.0003184228383777618, "loss": 35.5103, "step": 7841 }, { "epoch": 20.711786068009243, "grad_norm": 98.46780395507812, "learning_rate": 0.00031838265605266475, "loss": 34.0336, "step": 7842 }, { "epoch": 20.71442720369759, "grad_norm": 102.30675506591797, "learning_rate": 0.0003183424718180223, "loss": 35.1628, "step": 7843 }, { "epoch": 20.717068339385936, "grad_norm": 127.89922332763672, "learning_rate": 0.00031830228567495647, "loss": 36.6928, "step": 7844 }, { "epoch": 20.719709475074282, "grad_norm": 812.0861206054688, "learning_rate": 0.0003182620976245893, "loss": 77.0217, "step": 7845 }, { "epoch": 20.72235061076263, "grad_norm": 1460.2996826171875, "learning_rate": 0.00031822190766804325, "loss": 91.4421, "step": 7846 }, { "epoch": 20.724991746450975, "grad_norm": 1885.7611083984375, "learning_rate": 0.0003181817158064405, "loss": 84.9633, "step": 7847 }, { "epoch": 20.72763288213932, "grad_norm": 2735.626708984375, "learning_rate": 0.00031814152204090343, "loss": 68.1283, "step": 7848 }, { "epoch": 20.730274017827664, "grad_norm": 1731.6259765625, "learning_rate": 0.0003181013263725543, "loss": 62.8301, "step": 7849 }, { "epoch": 20.73291515351601, "grad_norm": 1125.9046630859375, "learning_rate": 0.0003180611288025156, "loss": 46.7035, "step": 7850 }, { "epoch": 20.735556289204357, "grad_norm": 6451.68212890625, "learning_rate": 0.00031802092933190995, "loss": 41.7207, "step": 7851 }, { "epoch": 20.738197424892704, "grad_norm": 1726.4276123046875, "learning_rate": 0.0003179807279618598, "loss": 34.0492, "step": 7852 }, { "epoch": 20.74083856058105, "grad_norm": 1310.121826171875, "learning_rate": 0.0003179405246934878, "loss": 26.534, "step": 7853 }, { "epoch": 20.743479696269397, "grad_norm": 2761.359375, "learning_rate": 0.0003179003195279164, "loss": 21.5024, "step": 7854 }, { "epoch": 20.746120831957743, "grad_norm": 739.0012817382812, "learning_rate": 0.00031786011246626855, "loss": 22.4153, "step": 7855 }, { "epoch": 20.74876196764609, "grad_norm": 186.04718017578125, "learning_rate": 0.000317819903509667, "loss": 36.9326, "step": 7856 }, { "epoch": 20.751403103334432, "grad_norm": 162.64041137695312, "learning_rate": 0.00031777969265923435, "loss": 38.4169, "step": 7857 }, { "epoch": 20.75404423902278, "grad_norm": 191.63192749023438, "learning_rate": 0.0003177394799160937, "loss": 35.8709, "step": 7858 }, { "epoch": 20.756685374711125, "grad_norm": 302.8233337402344, "learning_rate": 0.0003176992652813678, "loss": 38.315, "step": 7859 }, { "epoch": 20.75932651039947, "grad_norm": 127.68251037597656, "learning_rate": 0.00031765904875617973, "loss": 36.3162, "step": 7860 }, { "epoch": 20.761967646087818, "grad_norm": 620.2579956054688, "learning_rate": 0.0003176188303416525, "loss": 36.0664, "step": 7861 }, { "epoch": 20.764608781776165, "grad_norm": 190.2633514404297, "learning_rate": 0.0003175786100389091, "loss": 37.5274, "step": 7862 }, { "epoch": 20.76724991746451, "grad_norm": 103.12435913085938, "learning_rate": 0.0003175383878490727, "loss": 36.5137, "step": 7863 }, { "epoch": 20.769891053152854, "grad_norm": 223.1306610107422, "learning_rate": 0.00031749816377326653, "loss": 34.6985, "step": 7864 }, { "epoch": 20.7725321888412, "grad_norm": 281.7974853515625, "learning_rate": 0.0003174579378126138, "loss": 36.612, "step": 7865 }, { "epoch": 20.775173324529547, "grad_norm": 271.9704284667969, "learning_rate": 0.0003174177099682377, "loss": 34.709, "step": 7866 }, { "epoch": 20.777814460217893, "grad_norm": 221.5959930419922, "learning_rate": 0.0003173774802412617, "loss": 37.1648, "step": 7867 }, { "epoch": 20.78045559590624, "grad_norm": 160.21884155273438, "learning_rate": 0.00031733724863280917, "loss": 35.4636, "step": 7868 }, { "epoch": 20.783096731594586, "grad_norm": 146.16799926757812, "learning_rate": 0.00031729701514400353, "loss": 35.0625, "step": 7869 }, { "epoch": 20.785737867282933, "grad_norm": 99.76471710205078, "learning_rate": 0.0003172567797759682, "loss": 36.3299, "step": 7870 }, { "epoch": 20.78837900297128, "grad_norm": 385.0705261230469, "learning_rate": 0.0003172165425298269, "loss": 36.4258, "step": 7871 }, { "epoch": 20.791020138659622, "grad_norm": 156.92388916015625, "learning_rate": 0.0003171763034067031, "loss": 37.7646, "step": 7872 }, { "epoch": 20.79366127434797, "grad_norm": 865.696044921875, "learning_rate": 0.00031713606240772046, "loss": 41.5518, "step": 7873 }, { "epoch": 20.796302410036315, "grad_norm": 90.95207214355469, "learning_rate": 0.0003170958195340028, "loss": 39.5391, "step": 7874 }, { "epoch": 20.79894354572466, "grad_norm": 260.1847839355469, "learning_rate": 0.00031705557478667367, "loss": 39.5218, "step": 7875 }, { "epoch": 20.801584681413008, "grad_norm": 158.07212829589844, "learning_rate": 0.00031701532816685705, "loss": 39.5145, "step": 7876 }, { "epoch": 20.804225817101354, "grad_norm": 504.27618408203125, "learning_rate": 0.0003169750796756768, "loss": 40.9743, "step": 7877 }, { "epoch": 20.8068669527897, "grad_norm": 147.81031799316406, "learning_rate": 0.00031693482931425675, "loss": 43.6026, "step": 7878 }, { "epoch": 20.809508088478047, "grad_norm": 440.10009765625, "learning_rate": 0.00031689457708372094, "loss": 40.825, "step": 7879 }, { "epoch": 20.81214922416639, "grad_norm": 432.98455810546875, "learning_rate": 0.0003168543229851933, "loss": 40.6938, "step": 7880 }, { "epoch": 20.814790359854737, "grad_norm": 112.94600677490234, "learning_rate": 0.0003168140670197981, "loss": 40.154, "step": 7881 }, { "epoch": 20.817431495543083, "grad_norm": 160.14390563964844, "learning_rate": 0.00031677380918865923, "loss": 40.2861, "step": 7882 }, { "epoch": 20.82007263123143, "grad_norm": 139.6979217529297, "learning_rate": 0.00031673354949290103, "loss": 39.5967, "step": 7883 }, { "epoch": 20.822713766919776, "grad_norm": 290.6186828613281, "learning_rate": 0.0003166932879336475, "loss": 38.6679, "step": 7884 }, { "epoch": 20.825354902608122, "grad_norm": 185.21902465820312, "learning_rate": 0.0003166530245120232, "loss": 38.0162, "step": 7885 }, { "epoch": 20.82799603829647, "grad_norm": 277.5670166015625, "learning_rate": 0.00031661275922915234, "loss": 35.6567, "step": 7886 }, { "epoch": 20.83063717398481, "grad_norm": 217.84906005859375, "learning_rate": 0.00031657249208615935, "loss": 36.3985, "step": 7887 }, { "epoch": 20.833278309673158, "grad_norm": 427.4508056640625, "learning_rate": 0.00031653222308416855, "loss": 35.6521, "step": 7888 }, { "epoch": 20.835919445361505, "grad_norm": 497.3858337402344, "learning_rate": 0.00031649195222430447, "loss": 35.2456, "step": 7889 }, { "epoch": 20.83856058104985, "grad_norm": 306.9926452636719, "learning_rate": 0.00031645167950769185, "loss": 35.0096, "step": 7890 }, { "epoch": 20.841201716738198, "grad_norm": 183.63986206054688, "learning_rate": 0.00031641140493545496, "loss": 34.7808, "step": 7891 }, { "epoch": 20.843842852426544, "grad_norm": 399.1923828125, "learning_rate": 0.0003163711285087186, "loss": 36.3548, "step": 7892 }, { "epoch": 20.84648398811489, "grad_norm": 910.59033203125, "learning_rate": 0.00031633085022860755, "loss": 35.8466, "step": 7893 }, { "epoch": 20.849125123803237, "grad_norm": 422.1288757324219, "learning_rate": 0.0003162905700962464, "loss": 36.8009, "step": 7894 }, { "epoch": 20.85176625949158, "grad_norm": 793.6275024414062, "learning_rate": 0.00031625028811276007, "loss": 49.1033, "step": 7895 }, { "epoch": 20.854407395179926, "grad_norm": 1376.5345458984375, "learning_rate": 0.00031621000427927337, "loss": 46.1297, "step": 7896 }, { "epoch": 20.857048530868273, "grad_norm": 1525.876953125, "learning_rate": 0.0003161697185969112, "loss": 44.278, "step": 7897 }, { "epoch": 20.85968966655662, "grad_norm": 920.3646850585938, "learning_rate": 0.0003161294310667984, "loss": 38.7275, "step": 7898 }, { "epoch": 20.862330802244966, "grad_norm": 1013.38818359375, "learning_rate": 0.0003160891416900602, "loss": 34.1592, "step": 7899 }, { "epoch": 20.864971937933312, "grad_norm": 1056.27197265625, "learning_rate": 0.0003160488504678216, "loss": 29.3762, "step": 7900 }, { "epoch": 20.86761307362166, "grad_norm": 8111.64794921875, "learning_rate": 0.0003160085574012075, "loss": 22.6027, "step": 7901 }, { "epoch": 20.870254209310005, "grad_norm": 2830.935302734375, "learning_rate": 0.00031596826249134324, "loss": 17.8333, "step": 7902 }, { "epoch": 20.872895344998348, "grad_norm": 1718.1368408203125, "learning_rate": 0.000315927965739354, "loss": 17.1269, "step": 7903 }, { "epoch": 20.875536480686694, "grad_norm": 607.7705688476562, "learning_rate": 0.00031588766714636505, "loss": 18.0277, "step": 7904 }, { "epoch": 20.87817761637504, "grad_norm": 291.9044494628906, "learning_rate": 0.0003158473667135018, "loss": 37.2991, "step": 7905 }, { "epoch": 20.880818752063387, "grad_norm": 444.51812744140625, "learning_rate": 0.0003158070644418894, "loss": 38.8605, "step": 7906 }, { "epoch": 20.883459887751734, "grad_norm": 253.30523681640625, "learning_rate": 0.00031576676033265343, "loss": 37.7957, "step": 7907 }, { "epoch": 20.88610102344008, "grad_norm": 204.4633026123047, "learning_rate": 0.0003157264543869193, "loss": 37.053, "step": 7908 }, { "epoch": 20.888742159128427, "grad_norm": 305.8465576171875, "learning_rate": 0.0003156861466058126, "loss": 36.0324, "step": 7909 }, { "epoch": 20.89138329481677, "grad_norm": 791.34619140625, "learning_rate": 0.00031564583699045874, "loss": 35.8984, "step": 7910 }, { "epoch": 20.894024430505116, "grad_norm": 289.53619384765625, "learning_rate": 0.00031560552554198346, "loss": 35.9974, "step": 7911 }, { "epoch": 20.896665566193462, "grad_norm": 282.6357116699219, "learning_rate": 0.0003155652122615125, "loss": 34.3339, "step": 7912 }, { "epoch": 20.89930670188181, "grad_norm": 247.7781219482422, "learning_rate": 0.0003155248971501714, "loss": 35.4256, "step": 7913 }, { "epoch": 20.901947837570155, "grad_norm": 986.1806640625, "learning_rate": 0.0003154845802090861, "loss": 36.7359, "step": 7914 }, { "epoch": 20.9045889732585, "grad_norm": 331.31640625, "learning_rate": 0.0003154442614393823, "loss": 37.4704, "step": 7915 }, { "epoch": 20.907230108946848, "grad_norm": 187.68251037597656, "learning_rate": 0.000315403940842186, "loss": 35.9438, "step": 7916 }, { "epoch": 20.909871244635195, "grad_norm": 472.828125, "learning_rate": 0.00031536361841862315, "loss": 35.4824, "step": 7917 }, { "epoch": 20.912512380323538, "grad_norm": 418.32281494140625, "learning_rate": 0.00031532329416981953, "loss": 35.5822, "step": 7918 }, { "epoch": 20.915153516011884, "grad_norm": 413.2804260253906, "learning_rate": 0.00031528296809690137, "loss": 34.9384, "step": 7919 }, { "epoch": 20.91779465170023, "grad_norm": 482.15625, "learning_rate": 0.00031524264020099457, "loss": 35.8269, "step": 7920 }, { "epoch": 20.920435787388577, "grad_norm": 488.94512939453125, "learning_rate": 0.00031520231048322544, "loss": 37.0196, "step": 7921 }, { "epoch": 20.923076923076923, "grad_norm": 524.7352905273438, "learning_rate": 0.00031516197894472015, "loss": 39.8204, "step": 7922 }, { "epoch": 20.92571805876527, "grad_norm": 826.2592163085938, "learning_rate": 0.0003151216455866048, "loss": 41.2533, "step": 7923 }, { "epoch": 20.928359194453616, "grad_norm": 217.37396240234375, "learning_rate": 0.00031508131041000574, "loss": 41.3712, "step": 7924 }, { "epoch": 20.931000330141963, "grad_norm": 293.9978942871094, "learning_rate": 0.00031504097341604936, "loss": 42.0816, "step": 7925 }, { "epoch": 20.933641465830306, "grad_norm": 440.678466796875, "learning_rate": 0.000315000634605862, "loss": 44.012, "step": 7926 }, { "epoch": 20.936282601518652, "grad_norm": 254.3311004638672, "learning_rate": 0.00031496029398057006, "loss": 44.2007, "step": 7927 }, { "epoch": 20.938923737207, "grad_norm": 181.7843475341797, "learning_rate": 0.00031491995154130017, "loss": 41.1341, "step": 7928 }, { "epoch": 20.941564872895345, "grad_norm": 197.30209350585938, "learning_rate": 0.00031487960728917864, "loss": 39.2714, "step": 7929 }, { "epoch": 20.94420600858369, "grad_norm": 285.92401123046875, "learning_rate": 0.00031483926122533224, "loss": 37.2753, "step": 7930 }, { "epoch": 20.946847144272038, "grad_norm": 167.38941955566406, "learning_rate": 0.00031479891335088763, "loss": 37.1027, "step": 7931 }, { "epoch": 20.949488279960384, "grad_norm": 130.71102905273438, "learning_rate": 0.00031475856366697133, "loss": 35.8968, "step": 7932 }, { "epoch": 20.952129415648727, "grad_norm": 228.08433532714844, "learning_rate": 0.0003147182121747103, "loss": 36.3313, "step": 7933 }, { "epoch": 20.954770551337074, "grad_norm": 289.8608703613281, "learning_rate": 0.0003146778588752311, "loss": 36.0962, "step": 7934 }, { "epoch": 20.95741168702542, "grad_norm": 497.3238525390625, "learning_rate": 0.00031463750376966074, "loss": 41.6462, "step": 7935 }, { "epoch": 20.960052822713767, "grad_norm": 2790.7900390625, "learning_rate": 0.000314597146859126, "loss": 31.9845, "step": 7936 }, { "epoch": 20.962693958402113, "grad_norm": 703.2827758789062, "learning_rate": 0.0003145567881447539, "loss": 34.8969, "step": 7937 }, { "epoch": 20.96533509409046, "grad_norm": 1631.522705078125, "learning_rate": 0.00031451642762767143, "loss": 39.3667, "step": 7938 }, { "epoch": 20.967976229778806, "grad_norm": 2795.138671875, "learning_rate": 0.00031447606530900566, "loss": 29.4466, "step": 7939 }, { "epoch": 20.970617365467152, "grad_norm": 3127.6044921875, "learning_rate": 0.00031443570118988356, "loss": 26.2968, "step": 7940 }, { "epoch": 20.973258501155495, "grad_norm": 529.45751953125, "learning_rate": 0.0003143953352714324, "loss": 29.0028, "step": 7941 }, { "epoch": 20.97589963684384, "grad_norm": 348.73193359375, "learning_rate": 0.00031435496755477933, "loss": 35.4478, "step": 7942 }, { "epoch": 20.978540772532188, "grad_norm": 1236.8963623046875, "learning_rate": 0.0003143145980410516, "loss": 35.2819, "step": 7943 }, { "epoch": 20.981181908220535, "grad_norm": 325.8311767578125, "learning_rate": 0.00031427422673137653, "loss": 35.6172, "step": 7944 }, { "epoch": 20.98382304390888, "grad_norm": 286.70245361328125, "learning_rate": 0.0003142338536268813, "loss": 36.8503, "step": 7945 }, { "epoch": 20.986464179597228, "grad_norm": 407.8120422363281, "learning_rate": 0.0003141934787286935, "loss": 35.857, "step": 7946 }, { "epoch": 20.989105315285574, "grad_norm": 443.5566101074219, "learning_rate": 0.00031415310203794056, "loss": 35.0515, "step": 7947 }, { "epoch": 20.99174645097392, "grad_norm": 476.0083923339844, "learning_rate": 0.00031411272355574995, "loss": 35.9494, "step": 7948 }, { "epoch": 20.994387586662263, "grad_norm": 511.2718811035156, "learning_rate": 0.0003140723432832492, "loss": 35.3354, "step": 7949 }, { "epoch": 20.99702872235061, "grad_norm": 472.9263000488281, "learning_rate": 0.0003140319612215659, "loss": 38.088, "step": 7950 }, { "epoch": 20.999669858038956, "grad_norm": 893.31396484375, "learning_rate": 0.0003139915773718276, "loss": 39.5075, "step": 7951 }, { "epoch": 21.002310993727303, "grad_norm": 303.9479064941406, "learning_rate": 0.0003139511917351622, "loss": 40.4549, "step": 7952 }, { "epoch": 21.00495212941565, "grad_norm": 312.9767150878906, "learning_rate": 0.00031391080431269726, "loss": 41.4964, "step": 7953 }, { "epoch": 21.007593265103996, "grad_norm": 300.5788269042969, "learning_rate": 0.0003138704151055607, "loss": 41.0681, "step": 7954 }, { "epoch": 21.010234400792342, "grad_norm": 599.1240234375, "learning_rate": 0.00031383002411488025, "loss": 41.2893, "step": 7955 }, { "epoch": 21.012875536480685, "grad_norm": 373.2067565917969, "learning_rate": 0.00031378963134178395, "loss": 43.2183, "step": 7956 }, { "epoch": 21.01551667216903, "grad_norm": 166.51597595214844, "learning_rate": 0.0003137492367873997, "loss": 41.5679, "step": 7957 }, { "epoch": 21.018157807857378, "grad_norm": 430.85919189453125, "learning_rate": 0.0003137088404528554, "loss": 44.7334, "step": 7958 }, { "epoch": 21.020798943545724, "grad_norm": 184.6895751953125, "learning_rate": 0.00031366844233927916, "loss": 42.8459, "step": 7959 }, { "epoch": 21.02344007923407, "grad_norm": 422.2144775390625, "learning_rate": 0.00031362804244779906, "loss": 39.2463, "step": 7960 }, { "epoch": 21.026081214922417, "grad_norm": 406.9034423828125, "learning_rate": 0.0003135876407795433, "loss": 39.5491, "step": 7961 }, { "epoch": 21.028722350610764, "grad_norm": 242.42971801757812, "learning_rate": 0.0003135472373356399, "loss": 37.8459, "step": 7962 }, { "epoch": 21.03136348629911, "grad_norm": 230.5771484375, "learning_rate": 0.00031350683211721743, "loss": 37.098, "step": 7963 }, { "epoch": 21.034004621987453, "grad_norm": 397.1234130859375, "learning_rate": 0.0003134664251254038, "loss": 36.4759, "step": 7964 }, { "epoch": 21.0366457576758, "grad_norm": 177.496826171875, "learning_rate": 0.0003134260163613276, "loss": 36.5274, "step": 7965 }, { "epoch": 21.039286893364146, "grad_norm": 534.1036987304688, "learning_rate": 0.0003133856058261172, "loss": 36.4256, "step": 7966 }, { "epoch": 21.041928029052492, "grad_norm": 735.0631103515625, "learning_rate": 0.00031334519352090083, "loss": 36.198, "step": 7967 }, { "epoch": 21.04456916474084, "grad_norm": 171.25599670410156, "learning_rate": 0.0003133047794468073, "loss": 35.8917, "step": 7968 }, { "epoch": 21.047210300429185, "grad_norm": 239.83340454101562, "learning_rate": 0.000313264363604965, "loss": 35.0353, "step": 7969 }, { "epoch": 21.04985143611753, "grad_norm": 305.8096923828125, "learning_rate": 0.00031322394599650244, "loss": 34.1383, "step": 7970 }, { "epoch": 21.052492571805878, "grad_norm": 477.3667297363281, "learning_rate": 0.0003131835266225483, "loss": 36.384, "step": 7971 }, { "epoch": 21.05513370749422, "grad_norm": 316.09832763671875, "learning_rate": 0.0003131431054842313, "loss": 36.136, "step": 7972 }, { "epoch": 21.057774843182568, "grad_norm": 861.2808227539062, "learning_rate": 0.0003131026825826802, "loss": 43.7257, "step": 7973 }, { "epoch": 21.060415978870914, "grad_norm": 9186.27734375, "learning_rate": 0.00031306225791902383, "loss": 30.7559, "step": 7974 }, { "epoch": 21.06305711455926, "grad_norm": 5843.1337890625, "learning_rate": 0.0003130218314943909, "loss": 34.8146, "step": 7975 }, { "epoch": 21.065698250247607, "grad_norm": 1536.3170166015625, "learning_rate": 0.00031298140330991025, "loss": 27.7477, "step": 7976 }, { "epoch": 21.068339385935953, "grad_norm": 1337.0838623046875, "learning_rate": 0.000312940973366711, "loss": 29.4753, "step": 7977 }, { "epoch": 21.0709805216243, "grad_norm": 2939.24072265625, "learning_rate": 0.0003129005416659221, "loss": 25.3775, "step": 7978 }, { "epoch": 21.073621657312643, "grad_norm": 899.7219848632812, "learning_rate": 0.00031286010820867237, "loss": 23.3494, "step": 7979 }, { "epoch": 21.07626279300099, "grad_norm": 1632.486572265625, "learning_rate": 0.0003128196729960912, "loss": 18.4857, "step": 7980 }, { "epoch": 21.078903928689336, "grad_norm": 10313.9306640625, "learning_rate": 0.0003127792360293075, "loss": 22.5646, "step": 7981 }, { "epoch": 21.081545064377682, "grad_norm": 949.1658325195312, "learning_rate": 0.00031273879730945046, "loss": 15.5733, "step": 7982 }, { "epoch": 21.08418620006603, "grad_norm": 586.7965087890625, "learning_rate": 0.00031269835683764947, "loss": 34.3872, "step": 7983 }, { "epoch": 21.086827335754375, "grad_norm": 724.892822265625, "learning_rate": 0.0003126579146150336, "loss": 36.8098, "step": 7984 }, { "epoch": 21.08946847144272, "grad_norm": 395.60870361328125, "learning_rate": 0.00031261747064273226, "loss": 36.1435, "step": 7985 }, { "epoch": 21.092109607131068, "grad_norm": 562.8125, "learning_rate": 0.0003125770249218749, "loss": 36.6808, "step": 7986 }, { "epoch": 21.09475074281941, "grad_norm": 769.1927490234375, "learning_rate": 0.00031253657745359094, "loss": 36.9777, "step": 7987 }, { "epoch": 21.097391878507757, "grad_norm": 747.8685302734375, "learning_rate": 0.00031249612823900963, "loss": 35.9452, "step": 7988 }, { "epoch": 21.100033014196104, "grad_norm": 781.3179931640625, "learning_rate": 0.0003124556772792608, "loss": 36.6748, "step": 7989 }, { "epoch": 21.10267414988445, "grad_norm": 412.3851623535156, "learning_rate": 0.0003124152245754738, "loss": 36.654, "step": 7990 }, { "epoch": 21.105315285572797, "grad_norm": 376.57171630859375, "learning_rate": 0.0003123747701287783, "loss": 37.0339, "step": 7991 }, { "epoch": 21.107956421261143, "grad_norm": 939.0514526367188, "learning_rate": 0.00031233431394030403, "loss": 35.6905, "step": 7992 }, { "epoch": 21.11059755694949, "grad_norm": 480.8186340332031, "learning_rate": 0.0003122938560111806, "loss": 35.8143, "step": 7993 }, { "epoch": 21.113238692637836, "grad_norm": 455.2934875488281, "learning_rate": 0.00031225339634253786, "loss": 35.6461, "step": 7994 }, { "epoch": 21.11587982832618, "grad_norm": 326.71630859375, "learning_rate": 0.0003122129349355056, "loss": 35.2074, "step": 7995 }, { "epoch": 21.118520964014525, "grad_norm": 574.3265380859375, "learning_rate": 0.0003121724717912138, "loss": 36.6418, "step": 7996 }, { "epoch": 21.12116209970287, "grad_norm": 288.46380615234375, "learning_rate": 0.00031213200691079205, "loss": 34.6525, "step": 7997 }, { "epoch": 21.123803235391218, "grad_norm": 342.4916687011719, "learning_rate": 0.00031209154029537064, "loss": 36.6078, "step": 7998 }, { "epoch": 21.126444371079565, "grad_norm": 1538.55859375, "learning_rate": 0.0003120510719460793, "loss": 35.4668, "step": 7999 }, { "epoch": 21.12908550676791, "grad_norm": 452.6728210449219, "learning_rate": 0.00031201060186404833, "loss": 39.5695, "step": 8000 }, { "epoch": 21.12908550676791, "eval_loss": 4.072892665863037, "eval_runtime": 2.0779, "eval_samples_per_second": 238.217, "eval_steps_per_second": 29.837, "step": 8000 }, { "epoch": 21.131726642456258, "grad_norm": 798.3375854492188, "learning_rate": 0.00031197013005040774, "loss": 42.0082, "step": 8001 }, { "epoch": 21.1343677781446, "grad_norm": 365.9707946777344, "learning_rate": 0.0003119296565062876, "loss": 42.6957, "step": 8002 }, { "epoch": 21.137008913832947, "grad_norm": 323.341552734375, "learning_rate": 0.0003118891812328183, "loss": 41.8381, "step": 8003 }, { "epoch": 21.139650049521293, "grad_norm": 519.330810546875, "learning_rate": 0.0003118487042311299, "loss": 42.09, "step": 8004 }, { "epoch": 21.14229118520964, "grad_norm": 310.2235412597656, "learning_rate": 0.00031180822550235275, "loss": 42.912, "step": 8005 }, { "epoch": 21.144932320897986, "grad_norm": 315.3227844238281, "learning_rate": 0.00031176774504761725, "loss": 45.49, "step": 8006 }, { "epoch": 21.147573456586333, "grad_norm": 226.36778259277344, "learning_rate": 0.00031172726286805364, "loss": 43.2225, "step": 8007 }, { "epoch": 21.15021459227468, "grad_norm": 281.5495300292969, "learning_rate": 0.0003116867789647926, "loss": 41.7555, "step": 8008 }, { "epoch": 21.152855727963026, "grad_norm": 315.0810546875, "learning_rate": 0.0003116462933389645, "loss": 40.8425, "step": 8009 }, { "epoch": 21.15549686365137, "grad_norm": 522.8634033203125, "learning_rate": 0.0003116058059916999, "loss": 40.7647, "step": 8010 }, { "epoch": 21.158137999339715, "grad_norm": 467.9088439941406, "learning_rate": 0.0003115653169241292, "loss": 42.2834, "step": 8011 }, { "epoch": 21.16077913502806, "grad_norm": 360.0850524902344, "learning_rate": 0.0003115248261373833, "loss": 39.6474, "step": 8012 }, { "epoch": 21.163420270716408, "grad_norm": 609.0686645507812, "learning_rate": 0.00031148433363259275, "loss": 36.9786, "step": 8013 }, { "epoch": 21.166061406404754, "grad_norm": 452.6014099121094, "learning_rate": 0.0003114438394108883, "loss": 37.7655, "step": 8014 }, { "epoch": 21.1687025420931, "grad_norm": 698.690673828125, "learning_rate": 0.00031140334347340074, "loss": 36.5395, "step": 8015 }, { "epoch": 21.171343677781447, "grad_norm": 510.8963317871094, "learning_rate": 0.00031136284582126083, "loss": 35.7274, "step": 8016 }, { "epoch": 21.173984813469794, "grad_norm": 264.0738830566406, "learning_rate": 0.00031132234645559956, "loss": 37.0079, "step": 8017 }, { "epoch": 21.176625949158137, "grad_norm": 418.62908935546875, "learning_rate": 0.00031128184537754775, "loss": 35.9689, "step": 8018 }, { "epoch": 21.179267084846483, "grad_norm": 506.5116882324219, "learning_rate": 0.0003112413425882365, "loss": 35.5618, "step": 8019 }, { "epoch": 21.18190822053483, "grad_norm": 358.40484619140625, "learning_rate": 0.00031120083808879663, "loss": 35.7374, "step": 8020 }, { "epoch": 21.184549356223176, "grad_norm": 404.1617126464844, "learning_rate": 0.00031116033188035924, "loss": 35.1125, "step": 8021 }, { "epoch": 21.187190491911522, "grad_norm": 365.3127136230469, "learning_rate": 0.0003111198239640557, "loss": 36.6159, "step": 8022 }, { "epoch": 21.18983162759987, "grad_norm": 1566.013916015625, "learning_rate": 0.0003110793143410169, "loss": 46.9147, "step": 8023 }, { "epoch": 21.192472763288215, "grad_norm": 1449.0374755859375, "learning_rate": 0.00031103880301237407, "loss": 39.6622, "step": 8024 }, { "epoch": 21.195113898976558, "grad_norm": 2369.590576171875, "learning_rate": 0.00031099828997925853, "loss": 34.7527, "step": 8025 }, { "epoch": 21.197755034664905, "grad_norm": 1370.755126953125, "learning_rate": 0.00031095777524280165, "loss": 32.169, "step": 8026 }, { "epoch": 21.20039617035325, "grad_norm": 2115.534423828125, "learning_rate": 0.00031091725880413466, "loss": 29.4216, "step": 8027 }, { "epoch": 21.203037306041598, "grad_norm": 1588.2283935546875, "learning_rate": 0.000310876740664389, "loss": 24.5134, "step": 8028 }, { "epoch": 21.205678441729944, "grad_norm": 6167.642578125, "learning_rate": 0.00031083622082469614, "loss": 22.6541, "step": 8029 }, { "epoch": 21.20831957741829, "grad_norm": 1833.1229248046875, "learning_rate": 0.00031079569928618754, "loss": 23.5167, "step": 8030 }, { "epoch": 21.210960713106637, "grad_norm": 981.1682739257812, "learning_rate": 0.00031075517604999475, "loss": 22.4273, "step": 8031 }, { "epoch": 21.213601848794983, "grad_norm": 644.2537841796875, "learning_rate": 0.00031071465111724933, "loss": 15.5485, "step": 8032 }, { "epoch": 21.216242984483326, "grad_norm": 730.3925170898438, "learning_rate": 0.00031067412448908293, "loss": 24.9471, "step": 8033 }, { "epoch": 21.218884120171673, "grad_norm": 381.4124755859375, "learning_rate": 0.00031063359616662734, "loss": 36.7559, "step": 8034 }, { "epoch": 21.22152525586002, "grad_norm": 775.2112426757812, "learning_rate": 0.00031059306615101415, "loss": 36.7684, "step": 8035 }, { "epoch": 21.224166391548366, "grad_norm": 576.0255126953125, "learning_rate": 0.00031055253444337515, "loss": 37.8919, "step": 8036 }, { "epoch": 21.226807527236712, "grad_norm": 862.4802856445312, "learning_rate": 0.0003105120010448422, "loss": 35.8692, "step": 8037 }, { "epoch": 21.22944866292506, "grad_norm": 347.1155700683594, "learning_rate": 0.0003104714659565472, "loss": 37.4671, "step": 8038 }, { "epoch": 21.232089798613405, "grad_norm": 652.205322265625, "learning_rate": 0.0003104309291796221, "loss": 36.6942, "step": 8039 }, { "epoch": 21.23473093430175, "grad_norm": 908.2239990234375, "learning_rate": 0.00031039039071519865, "loss": 37.1269, "step": 8040 }, { "epoch": 21.237372069990094, "grad_norm": 576.3546142578125, "learning_rate": 0.00031034985056440907, "loss": 35.4838, "step": 8041 }, { "epoch": 21.24001320567844, "grad_norm": 250.75108337402344, "learning_rate": 0.00031030930872838534, "loss": 36.8486, "step": 8042 }, { "epoch": 21.242654341366787, "grad_norm": 1438.22314453125, "learning_rate": 0.0003102687652082597, "loss": 37.2792, "step": 8043 }, { "epoch": 21.245295477055134, "grad_norm": 1619.50830078125, "learning_rate": 0.0003102282200051641, "loss": 38.2218, "step": 8044 }, { "epoch": 21.24793661274348, "grad_norm": 1466.6343994140625, "learning_rate": 0.0003101876731202309, "loss": 37.5146, "step": 8045 }, { "epoch": 21.250577748431827, "grad_norm": 420.4725341796875, "learning_rate": 0.0003101471245545922, "loss": 35.751, "step": 8046 }, { "epoch": 21.253218884120173, "grad_norm": 465.3676452636719, "learning_rate": 0.00031010657430938037, "loss": 36.7876, "step": 8047 }, { "epoch": 21.255860019808516, "grad_norm": 568.7908325195312, "learning_rate": 0.0003100660223857279, "loss": 36.2693, "step": 8048 }, { "epoch": 21.258501155496862, "grad_norm": 495.3664855957031, "learning_rate": 0.00031002546878476693, "loss": 36.0179, "step": 8049 }, { "epoch": 21.26114229118521, "grad_norm": 326.02484130859375, "learning_rate": 0.0003099849135076301, "loss": 37.7904, "step": 8050 }, { "epoch": 21.263783426873555, "grad_norm": 1013.4765014648438, "learning_rate": 0.00030994435655544965, "loss": 39.6649, "step": 8051 }, { "epoch": 21.2664245625619, "grad_norm": 565.0308227539062, "learning_rate": 0.0003099037979293584, "loss": 42.6689, "step": 8052 }, { "epoch": 21.269065698250248, "grad_norm": 200.18023681640625, "learning_rate": 0.0003098632376304887, "loss": 41.4647, "step": 8053 }, { "epoch": 21.271706833938595, "grad_norm": 303.59002685546875, "learning_rate": 0.00030982267565997324, "loss": 43.4768, "step": 8054 }, { "epoch": 21.27434796962694, "grad_norm": 281.6217956542969, "learning_rate": 0.0003097821120189447, "loss": 42.0594, "step": 8055 }, { "epoch": 21.276989105315284, "grad_norm": 643.2981567382812, "learning_rate": 0.0003097415467085359, "loss": 44.063, "step": 8056 }, { "epoch": 21.27963024100363, "grad_norm": 337.96563720703125, "learning_rate": 0.00030970097972987947, "loss": 44.7021, "step": 8057 }, { "epoch": 21.282271376691977, "grad_norm": 323.21002197265625, "learning_rate": 0.0003096604110841082, "loss": 45.2924, "step": 8058 }, { "epoch": 21.284912512380323, "grad_norm": 258.3374938964844, "learning_rate": 0.0003096198407723551, "loss": 41.7915, "step": 8059 }, { "epoch": 21.28755364806867, "grad_norm": 231.40890502929688, "learning_rate": 0.0003095792687957528, "loss": 41.2608, "step": 8060 }, { "epoch": 21.290194783757016, "grad_norm": 178.82859802246094, "learning_rate": 0.00030953869515543445, "loss": 42.5991, "step": 8061 }, { "epoch": 21.292835919445363, "grad_norm": 185.89979553222656, "learning_rate": 0.00030949811985253313, "loss": 41.1257, "step": 8062 }, { "epoch": 21.29547705513371, "grad_norm": 354.80096435546875, "learning_rate": 0.0003094575428881817, "loss": 38.0702, "step": 8063 }, { "epoch": 21.298118190822052, "grad_norm": 268.09033203125, "learning_rate": 0.0003094169642635133, "loss": 39.2295, "step": 8064 }, { "epoch": 21.3007593265104, "grad_norm": 290.5319519042969, "learning_rate": 0.0003093763839796611, "loss": 39.5117, "step": 8065 }, { "epoch": 21.303400462198745, "grad_norm": 365.0279541015625, "learning_rate": 0.0003093358020377583, "loss": 35.1378, "step": 8066 }, { "epoch": 21.30604159788709, "grad_norm": 304.4820556640625, "learning_rate": 0.000309295218438938, "loss": 36.429, "step": 8067 }, { "epoch": 21.308682733575438, "grad_norm": 288.3876953125, "learning_rate": 0.0003092546331843335, "loss": 37.3043, "step": 8068 }, { "epoch": 21.311323869263784, "grad_norm": 223.0193328857422, "learning_rate": 0.0003092140462750783, "loss": 35.4583, "step": 8069 }, { "epoch": 21.31396500495213, "grad_norm": 218.6533203125, "learning_rate": 0.00030917345771230565, "loss": 35.5709, "step": 8070 }, { "epoch": 21.316606140640474, "grad_norm": 341.1078796386719, "learning_rate": 0.0003091328674971489, "loss": 35.0468, "step": 8071 }, { "epoch": 21.31924727632882, "grad_norm": 248.58261108398438, "learning_rate": 0.00030909227563074155, "loss": 35.1889, "step": 8072 }, { "epoch": 21.321888412017167, "grad_norm": 231.6096954345703, "learning_rate": 0.0003090516821142171, "loss": 36.9895, "step": 8073 }, { "epoch": 21.324529547705513, "grad_norm": 410.7417907714844, "learning_rate": 0.00030901108694870923, "loss": 38.3611, "step": 8074 }, { "epoch": 21.32717068339386, "grad_norm": 5318.57177734375, "learning_rate": 0.0003089704901353513, "loss": 51.1691, "step": 8075 }, { "epoch": 21.329811819082206, "grad_norm": 9310.1826171875, "learning_rate": 0.0003089298916752771, "loss": 55.6107, "step": 8076 }, { "epoch": 21.332452954770552, "grad_norm": 3225.391845703125, "learning_rate": 0.0003088892915696203, "loss": 57.7938, "step": 8077 }, { "epoch": 21.3350940904589, "grad_norm": 2290.6162109375, "learning_rate": 0.00030884868981951463, "loss": 50.8968, "step": 8078 }, { "epoch": 21.33773522614724, "grad_norm": 3347.3408203125, "learning_rate": 0.0003088080864260939, "loss": 49.2907, "step": 8079 }, { "epoch": 21.340376361835588, "grad_norm": 4651.39697265625, "learning_rate": 0.0003087674813904919, "loss": 50.7224, "step": 8080 }, { "epoch": 21.343017497523935, "grad_norm": 2238.01513671875, "learning_rate": 0.0003087268747138425, "loss": 41.2155, "step": 8081 }, { "epoch": 21.34565863321228, "grad_norm": 3111.720458984375, "learning_rate": 0.0003086862663972795, "loss": 40.3899, "step": 8082 }, { "epoch": 21.348299768900628, "grad_norm": 4669.80224609375, "learning_rate": 0.0003086456564419372, "loss": 34.478, "step": 8083 }, { "epoch": 21.350940904588974, "grad_norm": 1397.8861083984375, "learning_rate": 0.00030860504484894925, "loss": 34.7095, "step": 8084 }, { "epoch": 21.35358204027732, "grad_norm": 550.1766967773438, "learning_rate": 0.0003085644316194499, "loss": 36.0327, "step": 8085 }, { "epoch": 21.356223175965667, "grad_norm": 750.0299072265625, "learning_rate": 0.00030852381675457315, "loss": 36.3074, "step": 8086 }, { "epoch": 21.35886431165401, "grad_norm": 1709.9642333984375, "learning_rate": 0.00030848320025545325, "loss": 36.5682, "step": 8087 }, { "epoch": 21.361505447342356, "grad_norm": 489.4355773925781, "learning_rate": 0.00030844258212322427, "loss": 35.3099, "step": 8088 }, { "epoch": 21.364146583030703, "grad_norm": 457.89324951171875, "learning_rate": 0.0003084019623590206, "loss": 35.2843, "step": 8089 }, { "epoch": 21.36678771871905, "grad_norm": 816.0443115234375, "learning_rate": 0.0003083613409639764, "loss": 35.8816, "step": 8090 }, { "epoch": 21.369428854407396, "grad_norm": 398.7054138183594, "learning_rate": 0.0003083207179392259, "loss": 36.9266, "step": 8091 }, { "epoch": 21.372069990095742, "grad_norm": 427.70867919921875, "learning_rate": 0.00030828009328590387, "loss": 35.7518, "step": 8092 }, { "epoch": 21.37471112578409, "grad_norm": 310.70318603515625, "learning_rate": 0.00030823946700514433, "loss": 35.3361, "step": 8093 }, { "epoch": 21.37735226147243, "grad_norm": 757.8091430664062, "learning_rate": 0.00030819883909808196, "loss": 35.1339, "step": 8094 }, { "epoch": 21.379993397160778, "grad_norm": 563.4588012695312, "learning_rate": 0.00030815820956585105, "loss": 36.1528, "step": 8095 }, { "epoch": 21.382634532849124, "grad_norm": 571.1627197265625, "learning_rate": 0.00030811757840958645, "loss": 36.0809, "step": 8096 }, { "epoch": 21.38527566853747, "grad_norm": 349.35565185546875, "learning_rate": 0.00030807694563042255, "loss": 36.3975, "step": 8097 }, { "epoch": 21.387916804225817, "grad_norm": 420.6034240722656, "learning_rate": 0.000308036311229494, "loss": 36.9691, "step": 8098 }, { "epoch": 21.390557939914164, "grad_norm": 620.8261108398438, "learning_rate": 0.00030799567520793566, "loss": 37.5541, "step": 8099 }, { "epoch": 21.39319907560251, "grad_norm": 1077.5428466796875, "learning_rate": 0.0003079550375668821, "loss": 36.9309, "step": 8100 }, { "epoch": 21.395840211290857, "grad_norm": 620.9872436523438, "learning_rate": 0.0003079143983074681, "loss": 40.4603, "step": 8101 }, { "epoch": 21.3984813469792, "grad_norm": 500.4020080566406, "learning_rate": 0.0003078737574308287, "loss": 40.5859, "step": 8102 }, { "epoch": 21.401122482667546, "grad_norm": 322.8009033203125, "learning_rate": 0.0003078331149380985, "loss": 39.966, "step": 8103 }, { "epoch": 21.403763618355892, "grad_norm": 428.5740051269531, "learning_rate": 0.00030779247083041256, "loss": 40.7118, "step": 8104 }, { "epoch": 21.40640475404424, "grad_norm": 943.4024658203125, "learning_rate": 0.0003077518251089059, "loss": 39.2688, "step": 8105 }, { "epoch": 21.409045889732585, "grad_norm": 641.2691650390625, "learning_rate": 0.0003077111777747134, "loss": 41.9274, "step": 8106 }, { "epoch": 21.41168702542093, "grad_norm": 552.4464111328125, "learning_rate": 0.0003076705288289701, "loss": 43.5237, "step": 8107 }, { "epoch": 21.414328161109278, "grad_norm": 336.6502990722656, "learning_rate": 0.0003076298782728112, "loss": 42.4968, "step": 8108 }, { "epoch": 21.416969296797625, "grad_norm": 682.4485473632812, "learning_rate": 0.0003075892261073719, "loss": 41.7659, "step": 8109 }, { "epoch": 21.419610432485968, "grad_norm": 449.7858581542969, "learning_rate": 0.0003075485723337871, "loss": 40.8766, "step": 8110 }, { "epoch": 21.422251568174314, "grad_norm": 423.8988342285156, "learning_rate": 0.00030750791695319235, "loss": 39.493, "step": 8111 }, { "epoch": 21.42489270386266, "grad_norm": 414.2210388183594, "learning_rate": 0.00030746725996672275, "loss": 40.1632, "step": 8112 }, { "epoch": 21.427533839551007, "grad_norm": 543.5509643554688, "learning_rate": 0.00030742660137551375, "loss": 39.1286, "step": 8113 }, { "epoch": 21.430174975239353, "grad_norm": 390.03741455078125, "learning_rate": 0.0003073859411807006, "loss": 38.6729, "step": 8114 }, { "epoch": 21.4328161109277, "grad_norm": 463.059326171875, "learning_rate": 0.00030734527938341874, "loss": 37.643, "step": 8115 }, { "epoch": 21.435457246616046, "grad_norm": 254.29974365234375, "learning_rate": 0.00030730461598480364, "loss": 37.5875, "step": 8116 }, { "epoch": 21.43809838230439, "grad_norm": 317.4508361816406, "learning_rate": 0.0003072639509859908, "loss": 36.7273, "step": 8117 }, { "epoch": 21.440739517992736, "grad_norm": 318.5875549316406, "learning_rate": 0.00030722328438811584, "loss": 35.8227, "step": 8118 }, { "epoch": 21.443380653681082, "grad_norm": 613.8399658203125, "learning_rate": 0.00030718261619231416, "loss": 37.0051, "step": 8119 }, { "epoch": 21.44602178936943, "grad_norm": 297.81591796875, "learning_rate": 0.0003071419463997216, "loss": 33.9821, "step": 8120 }, { "epoch": 21.448662925057775, "grad_norm": 252.7701416015625, "learning_rate": 0.00030710127501147366, "loss": 35.4297, "step": 8121 }, { "epoch": 21.45130406074612, "grad_norm": 325.0713195800781, "learning_rate": 0.00030706060202870624, "loss": 36.4398, "step": 8122 }, { "epoch": 21.453945196434468, "grad_norm": 1000.5637817382812, "learning_rate": 0.00030701992745255503, "loss": 40.524, "step": 8123 }, { "epoch": 21.456586332122814, "grad_norm": 496.4938049316406, "learning_rate": 0.0003069792512841558, "loss": 33.069, "step": 8124 }, { "epoch": 21.459227467811157, "grad_norm": 1115.9986572265625, "learning_rate": 0.0003069385735246444, "loss": 22.3629, "step": 8125 }, { "epoch": 21.461868603499504, "grad_norm": 1992.697021484375, "learning_rate": 0.0003068978941751569, "loss": 26.0113, "step": 8126 }, { "epoch": 21.46450973918785, "grad_norm": 1237.9827880859375, "learning_rate": 0.00030685721323682913, "loss": 22.2107, "step": 8127 }, { "epoch": 21.467150874876197, "grad_norm": 2064.187255859375, "learning_rate": 0.00030681653071079693, "loss": 25.1389, "step": 8128 }, { "epoch": 21.469792010564543, "grad_norm": 787.257080078125, "learning_rate": 0.00030677584659819657, "loss": 14.9375, "step": 8129 }, { "epoch": 21.47243314625289, "grad_norm": 12529.8173828125, "learning_rate": 0.0003067351609001641, "loss": 16.3412, "step": 8130 }, { "epoch": 21.475074281941236, "grad_norm": 2145.8212890625, "learning_rate": 0.0003066944736178356, "loss": 18.8652, "step": 8131 }, { "epoch": 21.477715417629582, "grad_norm": 2966.85595703125, "learning_rate": 0.0003066537847523472, "loss": 11.2021, "step": 8132 }, { "epoch": 21.480356553317925, "grad_norm": 9514.501953125, "learning_rate": 0.00030661309430483506, "loss": 15.6687, "step": 8133 }, { "epoch": 21.48299768900627, "grad_norm": 691.9060668945312, "learning_rate": 0.00030657240227643554, "loss": 35.4794, "step": 8134 }, { "epoch": 21.485638824694618, "grad_norm": 498.6492919921875, "learning_rate": 0.00030653170866828494, "loss": 35.1382, "step": 8135 }, { "epoch": 21.488279960382965, "grad_norm": 581.8998413085938, "learning_rate": 0.00030649101348151966, "loss": 36.6427, "step": 8136 }, { "epoch": 21.49092109607131, "grad_norm": 534.5865478515625, "learning_rate": 0.000306450316717276, "loss": 35.5721, "step": 8137 }, { "epoch": 21.493562231759658, "grad_norm": 562.76953125, "learning_rate": 0.0003064096183766903, "loss": 36.1346, "step": 8138 }, { "epoch": 21.496203367448004, "grad_norm": 261.5889587402344, "learning_rate": 0.0003063689184608992, "loss": 36.2388, "step": 8139 }, { "epoch": 21.498844503136347, "grad_norm": 317.2455139160156, "learning_rate": 0.0003063282169710392, "loss": 36.1442, "step": 8140 }, { "epoch": 21.501485638824693, "grad_norm": 318.81927490234375, "learning_rate": 0.00030628751390824677, "loss": 35.9909, "step": 8141 }, { "epoch": 21.50412677451304, "grad_norm": 552.3731689453125, "learning_rate": 0.0003062468092736586, "loss": 35.2087, "step": 8142 }, { "epoch": 21.506767910201386, "grad_norm": 607.5955810546875, "learning_rate": 0.0003062061030684113, "loss": 36.7075, "step": 8143 }, { "epoch": 21.509409045889733, "grad_norm": 384.74688720703125, "learning_rate": 0.0003061653952936417, "loss": 35.2878, "step": 8144 }, { "epoch": 21.51205018157808, "grad_norm": 616.2892456054688, "learning_rate": 0.00030612468595048626, "loss": 35.1133, "step": 8145 }, { "epoch": 21.514691317266426, "grad_norm": 1041.9796142578125, "learning_rate": 0.00030608397504008206, "loss": 36.008, "step": 8146 }, { "epoch": 21.517332452954772, "grad_norm": 380.6595153808594, "learning_rate": 0.00030604326256356576, "loss": 35.3725, "step": 8147 }, { "epoch": 21.519973588643115, "grad_norm": 698.4161987304688, "learning_rate": 0.00030600254852207427, "loss": 36.0532, "step": 8148 }, { "epoch": 21.52261472433146, "grad_norm": 912.0230102539062, "learning_rate": 0.0003059618329167445, "loss": 35.0107, "step": 8149 }, { "epoch": 21.525255860019808, "grad_norm": 988.4190673828125, "learning_rate": 0.00030592111574871347, "loss": 38.0449, "step": 8150 }, { "epoch": 21.527896995708154, "grad_norm": 1490.727783203125, "learning_rate": 0.00030588039701911807, "loss": 38.0089, "step": 8151 }, { "epoch": 21.5305381313965, "grad_norm": 909.9227905273438, "learning_rate": 0.0003058396767290954, "loss": 40.7429, "step": 8152 }, { "epoch": 21.533179267084847, "grad_norm": 519.4076538085938, "learning_rate": 0.0003057989548797827, "loss": 40.0081, "step": 8153 }, { "epoch": 21.535820402773194, "grad_norm": 494.869140625, "learning_rate": 0.00030575823147231684, "loss": 41.8871, "step": 8154 }, { "epoch": 21.53846153846154, "grad_norm": 333.2418212890625, "learning_rate": 0.00030571750650783513, "loss": 41.9902, "step": 8155 }, { "epoch": 21.541102674149883, "grad_norm": 545.0033569335938, "learning_rate": 0.0003056767799874748, "loss": 43.0806, "step": 8156 }, { "epoch": 21.54374380983823, "grad_norm": 316.59747314453125, "learning_rate": 0.0003056360519123731, "loss": 44.1431, "step": 8157 }, { "epoch": 21.546384945526576, "grad_norm": 562.3798828125, "learning_rate": 0.00030559532228366734, "loss": 42.4486, "step": 8158 }, { "epoch": 21.549026081214922, "grad_norm": 390.7737121582031, "learning_rate": 0.0003055545911024948, "loss": 42.4622, "step": 8159 }, { "epoch": 21.55166721690327, "grad_norm": 763.1212158203125, "learning_rate": 0.000305513858369993, "loss": 40.2327, "step": 8160 }, { "epoch": 21.554308352591615, "grad_norm": 705.5169677734375, "learning_rate": 0.00030547312408729925, "loss": 40.1325, "step": 8161 }, { "epoch": 21.55694948827996, "grad_norm": 448.4164733886719, "learning_rate": 0.00030543238825555115, "loss": 38.1199, "step": 8162 }, { "epoch": 21.559590623968305, "grad_norm": 479.672119140625, "learning_rate": 0.0003053916508758862, "loss": 39.037, "step": 8163 }, { "epoch": 21.56223175965665, "grad_norm": 578.5130615234375, "learning_rate": 0.0003053509119494419, "loss": 36.6824, "step": 8164 }, { "epoch": 21.564872895344998, "grad_norm": 466.3874816894531, "learning_rate": 0.0003053101714773559, "loss": 38.3684, "step": 8165 }, { "epoch": 21.567514031033344, "grad_norm": 358.72515869140625, "learning_rate": 0.0003052694294607659, "loss": 36.7663, "step": 8166 }, { "epoch": 21.57015516672169, "grad_norm": 611.9024047851562, "learning_rate": 0.0003052286859008095, "loss": 36.9303, "step": 8167 }, { "epoch": 21.572796302410037, "grad_norm": 833.4638671875, "learning_rate": 0.00030518794079862445, "loss": 34.7672, "step": 8168 }, { "epoch": 21.575437438098383, "grad_norm": 466.4683532714844, "learning_rate": 0.00030514719415534856, "loss": 36.324, "step": 8169 }, { "epoch": 21.57807857378673, "grad_norm": 377.0718078613281, "learning_rate": 0.0003051064459721197, "loss": 37.2697, "step": 8170 }, { "epoch": 21.580719709475073, "grad_norm": 1047.562744140625, "learning_rate": 0.0003050656962500758, "loss": 34.8295, "step": 8171 }, { "epoch": 21.58336084516342, "grad_norm": 1385.4276123046875, "learning_rate": 0.0003050249449903546, "loss": 36.3477, "step": 8172 }, { "epoch": 21.586001980851766, "grad_norm": 1458.0517578125, "learning_rate": 0.00030498419219409414, "loss": 43.7336, "step": 8173 }, { "epoch": 21.588643116540112, "grad_norm": 4066.84375, "learning_rate": 0.00030494343786243247, "loss": 32.163, "step": 8174 }, { "epoch": 21.59128425222846, "grad_norm": 2195.2568359375, "learning_rate": 0.0003049026819965075, "loss": 24.5308, "step": 8175 }, { "epoch": 21.593925387916805, "grad_norm": 21774.8203125, "learning_rate": 0.0003048619245974574, "loss": 26.0861, "step": 8176 }, { "epoch": 21.59656652360515, "grad_norm": 965.0050048828125, "learning_rate": 0.00030482116566642024, "loss": 26.0415, "step": 8177 }, { "epoch": 21.599207659293498, "grad_norm": 11212.91796875, "learning_rate": 0.0003047804052045343, "loss": 20.6647, "step": 8178 }, { "epoch": 21.60184879498184, "grad_norm": 3600.854736328125, "learning_rate": 0.00030473964321293777, "loss": 22.9346, "step": 8179 }, { "epoch": 21.604489930670187, "grad_norm": 3870.453125, "learning_rate": 0.00030469887969276877, "loss": 19.5534, "step": 8180 }, { "epoch": 21.607131066358534, "grad_norm": 1441.816162109375, "learning_rate": 0.0003046581146451657, "loss": 18.7592, "step": 8181 }, { "epoch": 21.60977220204688, "grad_norm": 841.194580078125, "learning_rate": 0.0003046173480712669, "loss": 19.6107, "step": 8182 }, { "epoch": 21.612413337735227, "grad_norm": 569.892822265625, "learning_rate": 0.00030457657997221075, "loss": 23.9012, "step": 8183 }, { "epoch": 21.615054473423573, "grad_norm": 420.3096008300781, "learning_rate": 0.0003045358103491357, "loss": 36.7852, "step": 8184 }, { "epoch": 21.61769560911192, "grad_norm": 632.94482421875, "learning_rate": 0.00030449503920318004, "loss": 34.9684, "step": 8185 }, { "epoch": 21.620336744800262, "grad_norm": 2437.333251953125, "learning_rate": 0.0003044542665354826, "loss": 36.2703, "step": 8186 }, { "epoch": 21.62297788048861, "grad_norm": 843.951171875, "learning_rate": 0.00030441349234718165, "loss": 37.5795, "step": 8187 }, { "epoch": 21.625619016176955, "grad_norm": 298.8426208496094, "learning_rate": 0.00030437271663941596, "loss": 37.5463, "step": 8188 }, { "epoch": 21.6282601518653, "grad_norm": 499.4682922363281, "learning_rate": 0.000304331939413324, "loss": 36.6487, "step": 8189 }, { "epoch": 21.630901287553648, "grad_norm": 504.395263671875, "learning_rate": 0.0003042911606700446, "loss": 36.5483, "step": 8190 }, { "epoch": 21.633542423241995, "grad_norm": 456.4308776855469, "learning_rate": 0.0003042503804107164, "loss": 34.8007, "step": 8191 }, { "epoch": 21.63618355893034, "grad_norm": 697.1819458007812, "learning_rate": 0.0003042095986364782, "loss": 34.9281, "step": 8192 }, { "epoch": 21.638824694618688, "grad_norm": 280.8713073730469, "learning_rate": 0.0003041688153484689, "loss": 35.6262, "step": 8193 }, { "epoch": 21.64146583030703, "grad_norm": 281.89068603515625, "learning_rate": 0.00030412803054782715, "loss": 37.016, "step": 8194 }, { "epoch": 21.644106965995377, "grad_norm": 425.441650390625, "learning_rate": 0.00030408724423569203, "loss": 34.6123, "step": 8195 }, { "epoch": 21.646748101683723, "grad_norm": 485.01422119140625, "learning_rate": 0.0003040464564132023, "loss": 35.2471, "step": 8196 }, { "epoch": 21.64938923737207, "grad_norm": 480.1241760253906, "learning_rate": 0.000304005667081497, "loss": 35.3442, "step": 8197 }, { "epoch": 21.652030373060416, "grad_norm": 434.5090026855469, "learning_rate": 0.00030396487624171525, "loss": 34.6702, "step": 8198 }, { "epoch": 21.654671508748763, "grad_norm": 659.1851806640625, "learning_rate": 0.000303924083894996, "loss": 38.1051, "step": 8199 }, { "epoch": 21.65731264443711, "grad_norm": 646.12939453125, "learning_rate": 0.0003038832900424784, "loss": 37.4214, "step": 8200 }, { "epoch": 21.65731264443711, "eval_loss": 3.9377875328063965, "eval_runtime": 2.0615, "eval_samples_per_second": 240.112, "eval_steps_per_second": 30.075, "step": 8200 }, { "epoch": 21.659953780125456, "grad_norm": 2806.157958984375, "learning_rate": 0.00030384249468530164, "loss": 41.5965, "step": 8201 }, { "epoch": 21.6625949158138, "grad_norm": 313.3565368652344, "learning_rate": 0.00030380169782460477, "loss": 42.1352, "step": 8202 }, { "epoch": 21.665236051502145, "grad_norm": 422.7454528808594, "learning_rate": 0.00030376089946152705, "loss": 40.4354, "step": 8203 }, { "epoch": 21.66787718719049, "grad_norm": 345.2088623046875, "learning_rate": 0.0003037200995972078, "loss": 40.0287, "step": 8204 }, { "epoch": 21.670518322878838, "grad_norm": 206.5756378173828, "learning_rate": 0.0003036792982327864, "loss": 42.0753, "step": 8205 }, { "epoch": 21.673159458567184, "grad_norm": 405.2364501953125, "learning_rate": 0.0003036384953694021, "loss": 44.9354, "step": 8206 }, { "epoch": 21.67580059425553, "grad_norm": 483.2583923339844, "learning_rate": 0.00030359769100819436, "loss": 42.2687, "step": 8207 }, { "epoch": 21.678441729943877, "grad_norm": 548.9083251953125, "learning_rate": 0.0003035568851503025, "loss": 42.0264, "step": 8208 }, { "epoch": 21.68108286563222, "grad_norm": 223.04087829589844, "learning_rate": 0.00030351607779686616, "loss": 40.1268, "step": 8209 }, { "epoch": 21.683724001320567, "grad_norm": 176.71080017089844, "learning_rate": 0.00030347526894902477, "loss": 40.8029, "step": 8210 }, { "epoch": 21.686365137008913, "grad_norm": 209.7482452392578, "learning_rate": 0.00030343445860791786, "loss": 39.3008, "step": 8211 }, { "epoch": 21.68900627269726, "grad_norm": 327.5287780761719, "learning_rate": 0.00030339364677468513, "loss": 39.8219, "step": 8212 }, { "epoch": 21.691647408385606, "grad_norm": 496.77349853515625, "learning_rate": 0.0003033528334504661, "loss": 37.7365, "step": 8213 }, { "epoch": 21.694288544073952, "grad_norm": 177.1134490966797, "learning_rate": 0.00030331201863640066, "loss": 38.2465, "step": 8214 }, { "epoch": 21.6969296797623, "grad_norm": 211.5409393310547, "learning_rate": 0.00030327120233362834, "loss": 35.0532, "step": 8215 }, { "epoch": 21.699570815450645, "grad_norm": 394.5475769042969, "learning_rate": 0.000303230384543289, "loss": 36.7796, "step": 8216 }, { "epoch": 21.702211951138988, "grad_norm": 197.32037353515625, "learning_rate": 0.00030318956526652243, "loss": 36.5015, "step": 8217 }, { "epoch": 21.704853086827335, "grad_norm": 281.3800354003906, "learning_rate": 0.00030314874450446854, "loss": 35.0793, "step": 8218 }, { "epoch": 21.70749422251568, "grad_norm": 710.77880859375, "learning_rate": 0.00030310792225826715, "loss": 36.0462, "step": 8219 }, { "epoch": 21.710135358204028, "grad_norm": 216.74037170410156, "learning_rate": 0.00030306709852905825, "loss": 35.627, "step": 8220 }, { "epoch": 21.712776493892374, "grad_norm": 181.3594970703125, "learning_rate": 0.00030302627331798177, "loss": 33.5873, "step": 8221 }, { "epoch": 21.71541762958072, "grad_norm": 638.903564453125, "learning_rate": 0.0003029854466261777, "loss": 37.2596, "step": 8222 }, { "epoch": 21.718058765269067, "grad_norm": 3425.73779296875, "learning_rate": 0.00030294461845478633, "loss": 46.3834, "step": 8223 }, { "epoch": 21.720699900957413, "grad_norm": 3702.256103515625, "learning_rate": 0.0003029037888049474, "loss": 34.2764, "step": 8224 }, { "epoch": 21.723341036645756, "grad_norm": 1672.58642578125, "learning_rate": 0.0003028629576778013, "loss": 33.5883, "step": 8225 }, { "epoch": 21.725982172334103, "grad_norm": 18550.06640625, "learning_rate": 0.0003028221250744882, "loss": 32.3852, "step": 8226 }, { "epoch": 21.72862330802245, "grad_norm": 2725.854736328125, "learning_rate": 0.0003027812909961483, "loss": 31.2413, "step": 8227 }, { "epoch": 21.731264443710796, "grad_norm": 2321.170654296875, "learning_rate": 0.00030274045544392185, "loss": 32.0581, "step": 8228 }, { "epoch": 21.733905579399142, "grad_norm": 7909.38330078125, "learning_rate": 0.0003026996184189491, "loss": 26.2059, "step": 8229 }, { "epoch": 21.73654671508749, "grad_norm": 1050.3350830078125, "learning_rate": 0.00030265877992237053, "loss": 27.5159, "step": 8230 }, { "epoch": 21.739187850775835, "grad_norm": 4275.40966796875, "learning_rate": 0.0003026179399553264, "loss": 18.235, "step": 8231 }, { "epoch": 21.741828986464178, "grad_norm": 3787.876953125, "learning_rate": 0.00030257709851895726, "loss": 23.8036, "step": 8232 }, { "epoch": 21.744470122152524, "grad_norm": 816.8889770507812, "learning_rate": 0.00030253625561440356, "loss": 25.024, "step": 8233 }, { "epoch": 21.74711125784087, "grad_norm": 323.37286376953125, "learning_rate": 0.00030249541124280566, "loss": 36.4108, "step": 8234 }, { "epoch": 21.749752393529217, "grad_norm": 758.904296875, "learning_rate": 0.00030245456540530435, "loss": 37.2837, "step": 8235 }, { "epoch": 21.752393529217564, "grad_norm": 313.1193542480469, "learning_rate": 0.00030241371810304016, "loss": 35.921, "step": 8236 }, { "epoch": 21.75503466490591, "grad_norm": 340.6794738769531, "learning_rate": 0.0003023728693371535, "loss": 36.6641, "step": 8237 }, { "epoch": 21.757675800594257, "grad_norm": 433.7482604980469, "learning_rate": 0.0003023320191087854, "loss": 36.0533, "step": 8238 }, { "epoch": 21.760316936282603, "grad_norm": 443.19970703125, "learning_rate": 0.00030229116741907627, "loss": 35.1987, "step": 8239 }, { "epoch": 21.762958071970946, "grad_norm": 316.4296569824219, "learning_rate": 0.0003022503142691671, "loss": 35.2704, "step": 8240 }, { "epoch": 21.765599207659292, "grad_norm": 398.7014465332031, "learning_rate": 0.0003022094596601986, "loss": 35.4864, "step": 8241 }, { "epoch": 21.76824034334764, "grad_norm": 723.828369140625, "learning_rate": 0.00030216860359331163, "loss": 35.3252, "step": 8242 }, { "epoch": 21.770881479035985, "grad_norm": 323.94744873046875, "learning_rate": 0.00030212774606964694, "loss": 35.696, "step": 8243 }, { "epoch": 21.77352261472433, "grad_norm": 1043.942626953125, "learning_rate": 0.00030208688709034565, "loss": 37.301, "step": 8244 }, { "epoch": 21.776163750412678, "grad_norm": 451.492431640625, "learning_rate": 0.00030204602665654865, "loss": 35.4517, "step": 8245 }, { "epoch": 21.778804886101025, "grad_norm": 347.1617126464844, "learning_rate": 0.00030200516476939685, "loss": 34.4385, "step": 8246 }, { "epoch": 21.78144602178937, "grad_norm": 500.4003601074219, "learning_rate": 0.0003019643014300314, "loss": 35.2681, "step": 8247 }, { "epoch": 21.784087157477714, "grad_norm": 400.6777038574219, "learning_rate": 0.0003019234366395933, "loss": 34.6736, "step": 8248 }, { "epoch": 21.78672829316606, "grad_norm": 558.2153930664062, "learning_rate": 0.0003018825703992239, "loss": 36.5079, "step": 8249 }, { "epoch": 21.789369428854407, "grad_norm": 623.4026489257812, "learning_rate": 0.00030184170271006414, "loss": 37.045, "step": 8250 }, { "epoch": 21.792010564542753, "grad_norm": 3692.583984375, "learning_rate": 0.00030180083357325514, "loss": 39.7429, "step": 8251 }, { "epoch": 21.7946517002311, "grad_norm": 451.6702880859375, "learning_rate": 0.00030175996298993846, "loss": 38.5178, "step": 8252 }, { "epoch": 21.797292835919446, "grad_norm": 660.5073852539062, "learning_rate": 0.00030171909096125517, "loss": 40.3122, "step": 8253 }, { "epoch": 21.799933971607793, "grad_norm": 300.9979248046875, "learning_rate": 0.00030167821748834666, "loss": 39.726, "step": 8254 }, { "epoch": 21.802575107296136, "grad_norm": 382.26513671875, "learning_rate": 0.0003016373425723542, "loss": 42.5393, "step": 8255 }, { "epoch": 21.805216242984482, "grad_norm": 471.8289794921875, "learning_rate": 0.0003015964662144194, "loss": 42.9997, "step": 8256 }, { "epoch": 21.80785737867283, "grad_norm": 433.68212890625, "learning_rate": 0.0003015555884156835, "loss": 42.1488, "step": 8257 }, { "epoch": 21.810498514361175, "grad_norm": 186.52915954589844, "learning_rate": 0.00030151470917728814, "loss": 40.9998, "step": 8258 }, { "epoch": 21.81313965004952, "grad_norm": 387.20513916015625, "learning_rate": 0.00030147382850037477, "loss": 41.9129, "step": 8259 }, { "epoch": 21.815780785737868, "grad_norm": 209.24021911621094, "learning_rate": 0.00030143294638608487, "loss": 40.7832, "step": 8260 }, { "epoch": 21.818421921426214, "grad_norm": 255.10073852539062, "learning_rate": 0.0003013920628355603, "loss": 40.8077, "step": 8261 }, { "epoch": 21.82106305711456, "grad_norm": 328.41229248046875, "learning_rate": 0.00030135117784994256, "loss": 38.4729, "step": 8262 }, { "epoch": 21.823704192802904, "grad_norm": 310.03466796875, "learning_rate": 0.0003013102914303733, "loss": 37.4648, "step": 8263 }, { "epoch": 21.82634532849125, "grad_norm": 118.87606048583984, "learning_rate": 0.00030126940357799427, "loss": 37.9381, "step": 8264 }, { "epoch": 21.828986464179597, "grad_norm": 202.51913452148438, "learning_rate": 0.0003012285142939473, "loss": 35.3724, "step": 8265 }, { "epoch": 21.831627599867943, "grad_norm": 517.6845092773438, "learning_rate": 0.00030118762357937413, "loss": 36.3352, "step": 8266 }, { "epoch": 21.83426873555629, "grad_norm": 216.77293395996094, "learning_rate": 0.00030114673143541665, "loss": 36.3388, "step": 8267 }, { "epoch": 21.836909871244636, "grad_norm": 170.81617736816406, "learning_rate": 0.00030110583786321677, "loss": 34.3854, "step": 8268 }, { "epoch": 21.839551006932982, "grad_norm": 343.6143798828125, "learning_rate": 0.0003010649428639163, "loss": 36.1189, "step": 8269 }, { "epoch": 21.84219214262133, "grad_norm": 262.4176940917969, "learning_rate": 0.0003010240464386574, "loss": 36.3638, "step": 8270 }, { "epoch": 21.84483327830967, "grad_norm": 190.05064392089844, "learning_rate": 0.0003009831485885819, "loss": 35.8748, "step": 8271 }, { "epoch": 21.847474413998018, "grad_norm": 286.4758605957031, "learning_rate": 0.00030094224931483197, "loss": 35.8289, "step": 8272 }, { "epoch": 21.850115549686365, "grad_norm": 6695.4970703125, "learning_rate": 0.0003009013486185497, "loss": 40.6669, "step": 8273 }, { "epoch": 21.85275668537471, "grad_norm": 1455.6602783203125, "learning_rate": 0.0003008604465008771, "loss": 35.8454, "step": 8274 }, { "epoch": 21.855397821063058, "grad_norm": 9618.8935546875, "learning_rate": 0.00030081954296295646, "loss": 23.5709, "step": 8275 }, { "epoch": 21.858038956751404, "grad_norm": 1290.089599609375, "learning_rate": 0.00030077863800592985, "loss": 30.4994, "step": 8276 }, { "epoch": 21.86068009243975, "grad_norm": 2351.59521484375, "learning_rate": 0.00030073773163093973, "loss": 29.0741, "step": 8277 }, { "epoch": 21.863321228128093, "grad_norm": 23520.033203125, "learning_rate": 0.0003006968238391281, "loss": 27.1879, "step": 8278 }, { "epoch": 21.86596236381644, "grad_norm": 1618.3438720703125, "learning_rate": 0.00030065591463163756, "loss": 21.2442, "step": 8279 }, { "epoch": 21.868603499504786, "grad_norm": 1084.371826171875, "learning_rate": 0.0003006150040096104, "loss": 26.9238, "step": 8280 }, { "epoch": 21.871244635193133, "grad_norm": 1104.6754150390625, "learning_rate": 0.00030057409197418886, "loss": 24.098, "step": 8281 }, { "epoch": 21.87388577088148, "grad_norm": 755.7086791992188, "learning_rate": 0.0003005331785265156, "loss": 15.2662, "step": 8282 }, { "epoch": 21.876526906569826, "grad_norm": 1055.7890625, "learning_rate": 0.0003004922636677329, "loss": 20.8413, "step": 8283 }, { "epoch": 21.879168042258172, "grad_norm": 370.40472412109375, "learning_rate": 0.0003004513473989836, "loss": 36.4039, "step": 8284 }, { "epoch": 21.88180917794652, "grad_norm": 239.61663818359375, "learning_rate": 0.00030041042972140986, "loss": 36.7116, "step": 8285 }, { "epoch": 21.88445031363486, "grad_norm": 377.50885009765625, "learning_rate": 0.00030036951063615455, "loss": 35.3235, "step": 8286 }, { "epoch": 21.887091449323208, "grad_norm": 337.5924072265625, "learning_rate": 0.00030032859014436023, "loss": 37.0606, "step": 8287 }, { "epoch": 21.889732585011554, "grad_norm": 460.7380676269531, "learning_rate": 0.00030028766824716967, "loss": 34.7922, "step": 8288 }, { "epoch": 21.8923737206999, "grad_norm": 372.3096008300781, "learning_rate": 0.0003002467449457255, "loss": 35.27, "step": 8289 }, { "epoch": 21.895014856388247, "grad_norm": 1691.4854736328125, "learning_rate": 0.00030020582024117045, "loss": 36.9495, "step": 8290 }, { "epoch": 21.897655992076594, "grad_norm": 169.24635314941406, "learning_rate": 0.00030016489413464735, "loss": 35.7759, "step": 8291 }, { "epoch": 21.90029712776494, "grad_norm": 278.92083740234375, "learning_rate": 0.00030012396662729905, "loss": 36.9973, "step": 8292 }, { "epoch": 21.902938263453287, "grad_norm": 872.9880981445312, "learning_rate": 0.00030008303772026846, "loss": 35.7503, "step": 8293 }, { "epoch": 21.90557939914163, "grad_norm": 574.8671875, "learning_rate": 0.0003000421074146985, "loss": 37.2063, "step": 8294 }, { "epoch": 21.908220534829976, "grad_norm": 314.74139404296875, "learning_rate": 0.000300001175711732, "loss": 36.4769, "step": 8295 }, { "epoch": 21.910861670518322, "grad_norm": 366.2242736816406, "learning_rate": 0.0002999602426125121, "loss": 36.2164, "step": 8296 }, { "epoch": 21.91350280620667, "grad_norm": 303.83197021484375, "learning_rate": 0.00029991930811818174, "loss": 36.0598, "step": 8297 }, { "epoch": 21.916143941895015, "grad_norm": 303.4806823730469, "learning_rate": 0.000299878372229884, "loss": 35.6486, "step": 8298 }, { "epoch": 21.91878507758336, "grad_norm": 458.4897155761719, "learning_rate": 0.0002998374349487621, "loss": 36.9565, "step": 8299 }, { "epoch": 21.921426213271708, "grad_norm": 816.5763549804688, "learning_rate": 0.000299796496275959, "loss": 36.6275, "step": 8300 }, { "epoch": 21.92406734896005, "grad_norm": 330.1514587402344, "learning_rate": 0.0002997555562126181, "loss": 40.7995, "step": 8301 }, { "epoch": 21.926708484648398, "grad_norm": 313.8084411621094, "learning_rate": 0.0002997146147598825, "loss": 41.431, "step": 8302 }, { "epoch": 21.929349620336744, "grad_norm": 304.286376953125, "learning_rate": 0.00029967367191889544, "loss": 41.0628, "step": 8303 }, { "epoch": 21.93199075602509, "grad_norm": 371.41339111328125, "learning_rate": 0.00029963272769080023, "loss": 42.5902, "step": 8304 }, { "epoch": 21.934631891713437, "grad_norm": 278.24041748046875, "learning_rate": 0.0002995917820767404, "loss": 44.4936, "step": 8305 }, { "epoch": 21.937273027401783, "grad_norm": 254.51930236816406, "learning_rate": 0.00029955083507785907, "loss": 42.3715, "step": 8306 }, { "epoch": 21.93991416309013, "grad_norm": 265.0382385253906, "learning_rate": 0.00029950988669529975, "loss": 42.5159, "step": 8307 }, { "epoch": 21.942555298778476, "grad_norm": 486.35137939453125, "learning_rate": 0.00029946893693020605, "loss": 40.2022, "step": 8308 }, { "epoch": 21.94519643446682, "grad_norm": 354.415771484375, "learning_rate": 0.00029942798578372126, "loss": 39.5792, "step": 8309 }, { "epoch": 21.947837570155166, "grad_norm": 250.15267944335938, "learning_rate": 0.00029938703325698904, "loss": 36.3211, "step": 8310 }, { "epoch": 21.950478705843512, "grad_norm": 802.5132446289062, "learning_rate": 0.00029934607935115296, "loss": 36.7858, "step": 8311 }, { "epoch": 21.95311984153186, "grad_norm": 323.43792724609375, "learning_rate": 0.00029930512406735647, "loss": 35.7677, "step": 8312 }, { "epoch": 21.955760977220205, "grad_norm": 143.39389038085938, "learning_rate": 0.00029926416740674346, "loss": 36.6493, "step": 8313 }, { "epoch": 21.95840211290855, "grad_norm": 257.829345703125, "learning_rate": 0.00029922320937045754, "loss": 37.773, "step": 8314 }, { "epoch": 21.961043248596898, "grad_norm": 1260.3834228515625, "learning_rate": 0.0002991822499596424, "loss": 26.6678, "step": 8315 }, { "epoch": 21.963684384285244, "grad_norm": 2034.9420166015625, "learning_rate": 0.00029914128917544173, "loss": 20.6156, "step": 8316 }, { "epoch": 21.966325519973587, "grad_norm": 32907.0859375, "learning_rate": 0.0002991003270189995, "loss": 25.7997, "step": 8317 }, { "epoch": 21.968966655661934, "grad_norm": 1429.79541015625, "learning_rate": 0.00029905936349145944, "loss": 29.6, "step": 8318 }, { "epoch": 21.97160779135028, "grad_norm": 2419.08447265625, "learning_rate": 0.0002990183985939655, "loss": 20.0123, "step": 8319 }, { "epoch": 21.974248927038627, "grad_norm": 716.15576171875, "learning_rate": 0.0002989774323276616, "loss": 31.3751, "step": 8320 }, { "epoch": 21.976890062726973, "grad_norm": 328.1409606933594, "learning_rate": 0.00029893646469369165, "loss": 36.3147, "step": 8321 }, { "epoch": 21.97953119841532, "grad_norm": 417.00018310546875, "learning_rate": 0.0002988954956931997, "loss": 35.8503, "step": 8322 }, { "epoch": 21.982172334103666, "grad_norm": 423.93206787109375, "learning_rate": 0.0002988545253273297, "loss": 34.5856, "step": 8323 }, { "epoch": 21.98481346979201, "grad_norm": 173.99026489257812, "learning_rate": 0.00029881355359722583, "loss": 36.1564, "step": 8324 }, { "epoch": 21.987454605480355, "grad_norm": 194.068359375, "learning_rate": 0.0002987725805040321, "loss": 36.6113, "step": 8325 }, { "epoch": 21.9900957411687, "grad_norm": 385.01214599609375, "learning_rate": 0.0002987316060488927, "loss": 35.5309, "step": 8326 }, { "epoch": 21.992736876857048, "grad_norm": 185.78321838378906, "learning_rate": 0.0002986906302329518, "loss": 34.3252, "step": 8327 }, { "epoch": 21.995378012545395, "grad_norm": 689.3461303710938, "learning_rate": 0.0002986496530573538, "loss": 36.4926, "step": 8328 }, { "epoch": 21.99801914823374, "grad_norm": 373.9440612792969, "learning_rate": 0.0002986086745232428, "loss": 36.9019, "step": 8329 }, { "epoch": 22.000660283922088, "grad_norm": 310.4923400878906, "learning_rate": 0.000298567694631763, "loss": 40.6164, "step": 8330 }, { "epoch": 22.003301419610434, "grad_norm": 158.9080810546875, "learning_rate": 0.0002985267133840589, "loss": 39.7404, "step": 8331 }, { "epoch": 22.005942555298777, "grad_norm": 184.8914031982422, "learning_rate": 0.0002984857307812748, "loss": 41.1035, "step": 8332 }, { "epoch": 22.008583690987123, "grad_norm": 180.97640991210938, "learning_rate": 0.0002984447468245552, "loss": 40.2996, "step": 8333 }, { "epoch": 22.01122482667547, "grad_norm": 202.59393310546875, "learning_rate": 0.0002984037615150445, "loss": 41.9228, "step": 8334 }, { "epoch": 22.013865962363816, "grad_norm": 221.10763549804688, "learning_rate": 0.0002983627748538872, "loss": 44.3256, "step": 8335 }, { "epoch": 22.016507098052163, "grad_norm": 247.1941680908203, "learning_rate": 0.00029832178684222776, "loss": 42.2607, "step": 8336 }, { "epoch": 22.01914823374051, "grad_norm": 173.15615844726562, "learning_rate": 0.0002982807974812109, "loss": 42.8166, "step": 8337 }, { "epoch": 22.021789369428856, "grad_norm": 339.2604675292969, "learning_rate": 0.00029823980677198115, "loss": 39.7782, "step": 8338 }, { "epoch": 22.024430505117202, "grad_norm": 234.74069213867188, "learning_rate": 0.000298198814715683, "loss": 39.7745, "step": 8339 }, { "epoch": 22.027071640805545, "grad_norm": 204.8360595703125, "learning_rate": 0.00029815782131346137, "loss": 41.6144, "step": 8340 }, { "epoch": 22.02971277649389, "grad_norm": 193.8770294189453, "learning_rate": 0.00029811682656646086, "loss": 39.9526, "step": 8341 }, { "epoch": 22.032353912182238, "grad_norm": 174.97911071777344, "learning_rate": 0.00029807583047582615, "loss": 38.2318, "step": 8342 }, { "epoch": 22.034995047870584, "grad_norm": 266.9058837890625, "learning_rate": 0.0002980348330427022, "loss": 37.0497, "step": 8343 }, { "epoch": 22.03763618355893, "grad_norm": 477.335693359375, "learning_rate": 0.0002979938342682337, "loss": 36.7268, "step": 8344 }, { "epoch": 22.040277319247277, "grad_norm": 303.6522216796875, "learning_rate": 0.0002979528341535656, "loss": 37.5029, "step": 8345 }, { "epoch": 22.042918454935624, "grad_norm": 460.7578430175781, "learning_rate": 0.00029791183269984277, "loss": 37.1275, "step": 8346 }, { "epoch": 22.045559590623967, "grad_norm": 372.3319396972656, "learning_rate": 0.0002978708299082102, "loss": 36.1422, "step": 8347 }, { "epoch": 22.048200726312313, "grad_norm": 263.253662109375, "learning_rate": 0.00029782982577981275, "loss": 35.4032, "step": 8348 }, { "epoch": 22.05084186200066, "grad_norm": 206.79600524902344, "learning_rate": 0.00029778882031579555, "loss": 35.8749, "step": 8349 }, { "epoch": 22.053482997689006, "grad_norm": 205.01588439941406, "learning_rate": 0.00029774781351730366, "loss": 35.653, "step": 8350 }, { "epoch": 22.056124133377352, "grad_norm": 229.90614318847656, "learning_rate": 0.00029770680538548206, "loss": 35.3075, "step": 8351 }, { "epoch": 22.0587652690657, "grad_norm": 2761.149169921875, "learning_rate": 0.000297665795921476, "loss": 49.2398, "step": 8352 }, { "epoch": 22.061406404754045, "grad_norm": 4494.265625, "learning_rate": 0.00029762478512643047, "loss": 41.1904, "step": 8353 }, { "epoch": 22.06404754044239, "grad_norm": 3944.381103515625, "learning_rate": 0.0002975837730014909, "loss": 44.1866, "step": 8354 }, { "epoch": 22.066688676130735, "grad_norm": 6480.46337890625, "learning_rate": 0.00029754275954780245, "loss": 46.5107, "step": 8355 }, { "epoch": 22.06932981181908, "grad_norm": 5424.76708984375, "learning_rate": 0.0002975017447665103, "loss": 39.3851, "step": 8356 }, { "epoch": 22.071970947507427, "grad_norm": 2682.090576171875, "learning_rate": 0.00029746072865875984, "loss": 31.6694, "step": 8357 }, { "epoch": 22.074612083195774, "grad_norm": 2631.043701171875, "learning_rate": 0.00029741971122569644, "loss": 30.6413, "step": 8358 }, { "epoch": 22.07725321888412, "grad_norm": 7752.49609375, "learning_rate": 0.0002973786924684654, "loss": 29.905, "step": 8359 }, { "epoch": 22.079894354572467, "grad_norm": 895.8388671875, "learning_rate": 0.0002973376723882122, "loss": 30.3541, "step": 8360 }, { "epoch": 22.082535490260813, "grad_norm": 2875.36474609375, "learning_rate": 0.0002972966509860824, "loss": 26.5897, "step": 8361 }, { "epoch": 22.08517662594916, "grad_norm": 2494.768798828125, "learning_rate": 0.00029725562826322135, "loss": 27.3434, "step": 8362 }, { "epoch": 22.087817761637503, "grad_norm": 212.07986450195312, "learning_rate": 0.00029721460422077467, "loss": 36.5006, "step": 8363 }, { "epoch": 22.09045889732585, "grad_norm": 478.6232604980469, "learning_rate": 0.0002971735788598879, "loss": 36.9087, "step": 8364 }, { "epoch": 22.093100033014196, "grad_norm": 647.6249389648438, "learning_rate": 0.0002971325521817066, "loss": 36.6304, "step": 8365 }, { "epoch": 22.095741168702542, "grad_norm": 676.0175170898438, "learning_rate": 0.0002970915241873764, "loss": 35.9713, "step": 8366 }, { "epoch": 22.09838230439089, "grad_norm": 415.03216552734375, "learning_rate": 0.00029705049487804326, "loss": 36.6115, "step": 8367 }, { "epoch": 22.101023440079235, "grad_norm": 350.59478759765625, "learning_rate": 0.00029700946425485254, "loss": 35.0907, "step": 8368 }, { "epoch": 22.10366457576758, "grad_norm": 268.18316650390625, "learning_rate": 0.0002969684323189502, "loss": 35.238, "step": 8369 }, { "epoch": 22.106305711455924, "grad_norm": 216.9114990234375, "learning_rate": 0.0002969273990714819, "loss": 34.8728, "step": 8370 }, { "epoch": 22.10894684714427, "grad_norm": 300.2666320800781, "learning_rate": 0.00029688636451359363, "loss": 35.0134, "step": 8371 }, { "epoch": 22.111587982832617, "grad_norm": 341.78887939453125, "learning_rate": 0.0002968453286464312, "loss": 34.4623, "step": 8372 }, { "epoch": 22.114229118520964, "grad_norm": 535.1244506835938, "learning_rate": 0.0002968042914711404, "loss": 38.0952, "step": 8373 }, { "epoch": 22.11687025420931, "grad_norm": 320.4387512207031, "learning_rate": 0.0002967632529888673, "loss": 34.3003, "step": 8374 }, { "epoch": 22.119511389897657, "grad_norm": 534.7425537109375, "learning_rate": 0.0002967222132007578, "loss": 36.1389, "step": 8375 }, { "epoch": 22.122152525586003, "grad_norm": 251.18804931640625, "learning_rate": 0.000296681172107958, "loss": 35.9581, "step": 8376 }, { "epoch": 22.12479366127435, "grad_norm": 352.3895568847656, "learning_rate": 0.0002966401297116139, "loss": 35.7039, "step": 8377 }, { "epoch": 22.127434796962692, "grad_norm": 659.8284912109375, "learning_rate": 0.0002965990860128715, "loss": 35.5702, "step": 8378 }, { "epoch": 22.13007593265104, "grad_norm": 628.260986328125, "learning_rate": 0.000296558041012877, "loss": 38.5693, "step": 8379 }, { "epoch": 22.132717068339385, "grad_norm": 318.4929504394531, "learning_rate": 0.0002965169947127766, "loss": 39.8816, "step": 8380 }, { "epoch": 22.13535820402773, "grad_norm": 168.7434844970703, "learning_rate": 0.00029647594711371643, "loss": 39.5475, "step": 8381 }, { "epoch": 22.137999339716078, "grad_norm": 191.3740234375, "learning_rate": 0.00029643489821684274, "loss": 40.9682, "step": 8382 }, { "epoch": 22.140640475404425, "grad_norm": 169.7427215576172, "learning_rate": 0.00029639384802330175, "loss": 40.1528, "step": 8383 }, { "epoch": 22.14328161109277, "grad_norm": 197.05154418945312, "learning_rate": 0.00029635279653423993, "loss": 42.2479, "step": 8384 }, { "epoch": 22.145922746781117, "grad_norm": 879.4843139648438, "learning_rate": 0.0002963117437508034, "loss": 40.613, "step": 8385 }, { "epoch": 22.14856388246946, "grad_norm": 350.6630554199219, "learning_rate": 0.0002962706896741386, "loss": 40.9954, "step": 8386 }, { "epoch": 22.151205018157807, "grad_norm": 310.0904541015625, "learning_rate": 0.000296229634305392, "loss": 42.7255, "step": 8387 }, { "epoch": 22.153846153846153, "grad_norm": 302.67547607421875, "learning_rate": 0.00029618857764571004, "loss": 41.9528, "step": 8388 }, { "epoch": 22.1564872895345, "grad_norm": 294.4707336425781, "learning_rate": 0.0002961475196962392, "loss": 41.0129, "step": 8389 }, { "epoch": 22.159128425222846, "grad_norm": 196.39683532714844, "learning_rate": 0.00029610646045812603, "loss": 40.2926, "step": 8390 }, { "epoch": 22.161769560911193, "grad_norm": 542.056640625, "learning_rate": 0.00029606539993251693, "loss": 38.4001, "step": 8391 }, { "epoch": 22.16441069659954, "grad_norm": 369.0796813964844, "learning_rate": 0.0002960243381205586, "loss": 38.0655, "step": 8392 }, { "epoch": 22.167051832287882, "grad_norm": 147.795654296875, "learning_rate": 0.0002959832750233977, "loss": 39.6944, "step": 8393 }, { "epoch": 22.16969296797623, "grad_norm": 340.1771240234375, "learning_rate": 0.00029594221064218083, "loss": 36.5007, "step": 8394 }, { "epoch": 22.172334103664575, "grad_norm": 282.1785583496094, "learning_rate": 0.00029590114497805473, "loss": 35.3672, "step": 8395 }, { "epoch": 22.17497523935292, "grad_norm": 207.38926696777344, "learning_rate": 0.0002958600780321661, "loss": 34.8599, "step": 8396 }, { "epoch": 22.177616375041268, "grad_norm": 326.2638244628906, "learning_rate": 0.0002958190098056618, "loss": 35.0973, "step": 8397 }, { "epoch": 22.180257510729614, "grad_norm": 345.9947509765625, "learning_rate": 0.0002957779402996886, "loss": 36.0883, "step": 8398 }, { "epoch": 22.18289864641796, "grad_norm": 257.450439453125, "learning_rate": 0.00029573686951539323, "loss": 34.2724, "step": 8399 }, { "epoch": 22.185539782106307, "grad_norm": 174.90707397460938, "learning_rate": 0.0002956957974539226, "loss": 36.9187, "step": 8400 }, { "epoch": 22.185539782106307, "eval_loss": 4.314826011657715, "eval_runtime": 2.147, "eval_samples_per_second": 230.559, "eval_steps_per_second": 28.878, "step": 8400 }, { "epoch": 22.18818091779465, "grad_norm": 1688.421875, "learning_rate": 0.0002956547241164237, "loss": 34.8644, "step": 8401 }, { "epoch": 22.190822053482997, "grad_norm": 922.3491821289062, "learning_rate": 0.00029561364950404356, "loss": 38.4229, "step": 8402 }, { "epoch": 22.193463189171343, "grad_norm": 4326.302734375, "learning_rate": 0.0002955725736179289, "loss": 41.1306, "step": 8403 }, { "epoch": 22.19610432485969, "grad_norm": 5399.1845703125, "learning_rate": 0.000295531496459227, "loss": 46.3686, "step": 8404 }, { "epoch": 22.198745460548036, "grad_norm": 2326.743408203125, "learning_rate": 0.00029549041802908474, "loss": 36.5739, "step": 8405 }, { "epoch": 22.201386596236382, "grad_norm": 1747.8577880859375, "learning_rate": 0.0002954493383286493, "loss": 41.4275, "step": 8406 }, { "epoch": 22.20402773192473, "grad_norm": 6807.39111328125, "learning_rate": 0.0002954082573590678, "loss": 33.3788, "step": 8407 }, { "epoch": 22.206668867613075, "grad_norm": 2134.91796875, "learning_rate": 0.00029536717512148735, "loss": 33.0534, "step": 8408 }, { "epoch": 22.209310003301418, "grad_norm": 2341.883544921875, "learning_rate": 0.0002953260916170552, "loss": 28.6097, "step": 8409 }, { "epoch": 22.211951138989765, "grad_norm": 2055.388427734375, "learning_rate": 0.0002952850068469186, "loss": 23.8124, "step": 8410 }, { "epoch": 22.21459227467811, "grad_norm": 4968.75, "learning_rate": 0.0002952439208122248, "loss": 23.0252, "step": 8411 }, { "epoch": 22.217233410366457, "grad_norm": 767.1857299804688, "learning_rate": 0.000295202833514121, "loss": 17.641, "step": 8412 }, { "epoch": 22.219874546054804, "grad_norm": 257.4449768066406, "learning_rate": 0.0002951617449537547, "loss": 35.5661, "step": 8413 }, { "epoch": 22.22251568174315, "grad_norm": 866.2459106445312, "learning_rate": 0.00029512065513227314, "loss": 36.0365, "step": 8414 }, { "epoch": 22.225156817431497, "grad_norm": 430.2967834472656, "learning_rate": 0.00029507956405082386, "loss": 34.9082, "step": 8415 }, { "epoch": 22.22779795311984, "grad_norm": 539.6392822265625, "learning_rate": 0.00029503847171055426, "loss": 35.495, "step": 8416 }, { "epoch": 22.230439088808186, "grad_norm": 306.5121765136719, "learning_rate": 0.00029499737811261166, "loss": 34.7849, "step": 8417 }, { "epoch": 22.233080224496533, "grad_norm": 298.7595520019531, "learning_rate": 0.00029495628325814386, "loss": 36.2033, "step": 8418 }, { "epoch": 22.23572136018488, "grad_norm": 359.0038757324219, "learning_rate": 0.0002949151871482982, "loss": 36.4072, "step": 8419 }, { "epoch": 22.238362495873226, "grad_norm": 155.85597229003906, "learning_rate": 0.0002948740897842223, "loss": 35.3408, "step": 8420 }, { "epoch": 22.241003631561572, "grad_norm": 320.4159851074219, "learning_rate": 0.0002948329911670638, "loss": 34.9562, "step": 8421 }, { "epoch": 22.24364476724992, "grad_norm": 812.4486083984375, "learning_rate": 0.00029479189129797036, "loss": 35.5802, "step": 8422 }, { "epoch": 22.246285902938265, "grad_norm": 268.5194396972656, "learning_rate": 0.0002947507901780897, "loss": 34.5214, "step": 8423 }, { "epoch": 22.248927038626608, "grad_norm": 730.525146484375, "learning_rate": 0.00029470968780856947, "loss": 35.5536, "step": 8424 }, { "epoch": 22.251568174314954, "grad_norm": 493.6590270996094, "learning_rate": 0.0002946685841905575, "loss": 35.6214, "step": 8425 }, { "epoch": 22.2542093100033, "grad_norm": 222.63133239746094, "learning_rate": 0.00029462747932520154, "loss": 34.5978, "step": 8426 }, { "epoch": 22.256850445691647, "grad_norm": 310.7442626953125, "learning_rate": 0.0002945863732136494, "loss": 35.3822, "step": 8427 }, { "epoch": 22.259491581379994, "grad_norm": 500.7985534667969, "learning_rate": 0.0002945452658570491, "loss": 37.2089, "step": 8428 }, { "epoch": 22.26213271706834, "grad_norm": 249.7920379638672, "learning_rate": 0.00029450415725654834, "loss": 35.4452, "step": 8429 }, { "epoch": 22.264773852756687, "grad_norm": 991.2028198242188, "learning_rate": 0.00029446304741329515, "loss": 41.4731, "step": 8430 }, { "epoch": 22.267414988445033, "grad_norm": 236.08978271484375, "learning_rate": 0.0002944219363284374, "loss": 39.8878, "step": 8431 }, { "epoch": 22.270056124133376, "grad_norm": 267.8977355957031, "learning_rate": 0.0002943808240031233, "loss": 41.0719, "step": 8432 }, { "epoch": 22.272697259821722, "grad_norm": 313.75830078125, "learning_rate": 0.00029433971043850074, "loss": 40.0972, "step": 8433 }, { "epoch": 22.27533839551007, "grad_norm": 594.9404907226562, "learning_rate": 0.0002942985956357177, "loss": 42.6247, "step": 8434 }, { "epoch": 22.277979531198415, "grad_norm": 388.99176025390625, "learning_rate": 0.00029425747959592246, "loss": 43.4451, "step": 8435 }, { "epoch": 22.28062066688676, "grad_norm": 238.76678466796875, "learning_rate": 0.0002942163623202631, "loss": 44.5371, "step": 8436 }, { "epoch": 22.283261802575108, "grad_norm": 242.36073303222656, "learning_rate": 0.0002941752438098879, "loss": 41.2482, "step": 8437 }, { "epoch": 22.285902938263455, "grad_norm": 391.04083251953125, "learning_rate": 0.00029413412406594484, "loss": 39.8167, "step": 8438 }, { "epoch": 22.288544073951797, "grad_norm": 265.5522766113281, "learning_rate": 0.0002940930030895823, "loss": 40.3155, "step": 8439 }, { "epoch": 22.291185209640144, "grad_norm": 415.7916564941406, "learning_rate": 0.0002940518808819486, "loss": 38.2431, "step": 8440 }, { "epoch": 22.29382634532849, "grad_norm": 286.5270690917969, "learning_rate": 0.000294010757444192, "loss": 37.4664, "step": 8441 }, { "epoch": 22.296467481016837, "grad_norm": 305.6883239746094, "learning_rate": 0.0002939696327774608, "loss": 37.406, "step": 8442 }, { "epoch": 22.299108616705183, "grad_norm": 435.7538146972656, "learning_rate": 0.00029392850688290343, "loss": 36.4368, "step": 8443 }, { "epoch": 22.30174975239353, "grad_norm": 301.5271301269531, "learning_rate": 0.0002938873797616684, "loss": 35.9158, "step": 8444 }, { "epoch": 22.304390888081876, "grad_norm": 230.31298828125, "learning_rate": 0.00029384625141490404, "loss": 36.2358, "step": 8445 }, { "epoch": 22.307032023770223, "grad_norm": 586.9121704101562, "learning_rate": 0.00029380512184375883, "loss": 34.703, "step": 8446 }, { "epoch": 22.309673159458566, "grad_norm": 175.3349609375, "learning_rate": 0.00029376399104938134, "loss": 35.8249, "step": 8447 }, { "epoch": 22.312314295146912, "grad_norm": 272.7773742675781, "learning_rate": 0.00029372285903292005, "loss": 35.6426, "step": 8448 }, { "epoch": 22.31495543083526, "grad_norm": 184.41453552246094, "learning_rate": 0.00029368172579552373, "loss": 35.7449, "step": 8449 }, { "epoch": 22.317596566523605, "grad_norm": 187.147705078125, "learning_rate": 0.00029364059133834083, "loss": 34.874, "step": 8450 }, { "epoch": 22.32023770221195, "grad_norm": 1068.1180419921875, "learning_rate": 0.00029359945566252, "loss": 42.0978, "step": 8451 }, { "epoch": 22.322878837900298, "grad_norm": 2357.9658203125, "learning_rate": 0.00029355831876921, "loss": 54.3618, "step": 8452 }, { "epoch": 22.325519973588644, "grad_norm": 5547.01708984375, "learning_rate": 0.00029351718065955955, "loss": 41.5442, "step": 8453 }, { "epoch": 22.32816110927699, "grad_norm": 3547.033203125, "learning_rate": 0.0002934760413347174, "loss": 47.7539, "step": 8454 }, { "epoch": 22.330802244965334, "grad_norm": 2481.5751953125, "learning_rate": 0.00029343490079583237, "loss": 39.3768, "step": 8455 }, { "epoch": 22.33344338065368, "grad_norm": 2130.689208984375, "learning_rate": 0.00029339375904405317, "loss": 37.8697, "step": 8456 }, { "epoch": 22.336084516342027, "grad_norm": 935.95751953125, "learning_rate": 0.00029335261608052875, "loss": 18.2257, "step": 8457 }, { "epoch": 22.338725652030373, "grad_norm": 1019.4254150390625, "learning_rate": 0.00029331147190640803, "loss": 13.6316, "step": 8458 }, { "epoch": 22.34136678771872, "grad_norm": 2318.104248046875, "learning_rate": 0.0002932703265228399, "loss": 18.281, "step": 8459 }, { "epoch": 22.344007923407066, "grad_norm": 7732.01025390625, "learning_rate": 0.0002932291799309734, "loss": 20.7058, "step": 8460 }, { "epoch": 22.346649059095412, "grad_norm": 381.49468994140625, "learning_rate": 0.00029318803213195725, "loss": 11.3912, "step": 8461 }, { "epoch": 22.349290194783755, "grad_norm": 650.5052490234375, "learning_rate": 0.00029314688312694076, "loss": 24.6003, "step": 8462 }, { "epoch": 22.3519313304721, "grad_norm": 3867.259033203125, "learning_rate": 0.000293105732917073, "loss": 37.793, "step": 8463 }, { "epoch": 22.354572466160448, "grad_norm": 323.4693298339844, "learning_rate": 0.0002930645815035029, "loss": 35.3995, "step": 8464 }, { "epoch": 22.357213601848795, "grad_norm": 548.656982421875, "learning_rate": 0.00029302342888737966, "loss": 38.8749, "step": 8465 }, { "epoch": 22.35985473753714, "grad_norm": 300.0752258300781, "learning_rate": 0.0002929822750698524, "loss": 36.2588, "step": 8466 }, { "epoch": 22.362495873225487, "grad_norm": 218.25399780273438, "learning_rate": 0.0002929411200520704, "loss": 34.7694, "step": 8467 }, { "epoch": 22.365137008913834, "grad_norm": 217.6888885498047, "learning_rate": 0.00029289996383518284, "loss": 36.953, "step": 8468 }, { "epoch": 22.36777814460218, "grad_norm": 174.4749755859375, "learning_rate": 0.000292858806420339, "loss": 34.4611, "step": 8469 }, { "epoch": 22.370419280290523, "grad_norm": 776.1744995117188, "learning_rate": 0.00029281764780868816, "loss": 35.0774, "step": 8470 }, { "epoch": 22.37306041597887, "grad_norm": 1046.5113525390625, "learning_rate": 0.0002927764880013796, "loss": 34.3628, "step": 8471 }, { "epoch": 22.375701551667216, "grad_norm": 395.9505310058594, "learning_rate": 0.00029273532699956287, "loss": 36.9506, "step": 8472 }, { "epoch": 22.378342687355563, "grad_norm": 227.49464416503906, "learning_rate": 0.0002926941648043872, "loss": 34.7508, "step": 8473 }, { "epoch": 22.38098382304391, "grad_norm": 158.02316284179688, "learning_rate": 0.00029265300141700196, "loss": 35.3867, "step": 8474 }, { "epoch": 22.383624958732256, "grad_norm": 504.9655456542969, "learning_rate": 0.00029261183683855675, "loss": 35.6005, "step": 8475 }, { "epoch": 22.386266094420602, "grad_norm": 146.751708984375, "learning_rate": 0.00029257067107020106, "loss": 34.7152, "step": 8476 }, { "epoch": 22.38890723010895, "grad_norm": 377.39910888671875, "learning_rate": 0.0002925295041130844, "loss": 35.6824, "step": 8477 }, { "epoch": 22.39154836579729, "grad_norm": 404.0184631347656, "learning_rate": 0.00029248833596835625, "loss": 36.951, "step": 8478 }, { "epoch": 22.394189501485638, "grad_norm": 366.193359375, "learning_rate": 0.00029244716663716635, "loss": 37.8203, "step": 8479 }, { "epoch": 22.396830637173984, "grad_norm": 598.7274169921875, "learning_rate": 0.0002924059961206643, "loss": 39.5108, "step": 8480 }, { "epoch": 22.39947177286233, "grad_norm": 227.28146362304688, "learning_rate": 0.00029236482441999964, "loss": 39.3007, "step": 8481 }, { "epoch": 22.402112908550677, "grad_norm": 197.15322875976562, "learning_rate": 0.0002923236515363221, "loss": 39.9748, "step": 8482 }, { "epoch": 22.404754044239024, "grad_norm": 627.8633422851562, "learning_rate": 0.0002922824774707815, "loss": 40.1691, "step": 8483 }, { "epoch": 22.40739517992737, "grad_norm": 181.32518005371094, "learning_rate": 0.00029224130222452757, "loss": 42.2787, "step": 8484 }, { "epoch": 22.410036315615713, "grad_norm": 250.44122314453125, "learning_rate": 0.00029220012579871015, "loss": 43.4387, "step": 8485 }, { "epoch": 22.41267745130406, "grad_norm": 297.09954833984375, "learning_rate": 0.0002921589481944789, "loss": 42.1501, "step": 8486 }, { "epoch": 22.415318586992406, "grad_norm": 161.5186004638672, "learning_rate": 0.0002921177694129838, "loss": 39.7777, "step": 8487 }, { "epoch": 22.417959722680752, "grad_norm": 211.2624969482422, "learning_rate": 0.0002920765894553748, "loss": 40.5305, "step": 8488 }, { "epoch": 22.4206008583691, "grad_norm": 156.59620666503906, "learning_rate": 0.00029203540832280174, "loss": 38.3708, "step": 8489 }, { "epoch": 22.423241994057445, "grad_norm": 199.16600036621094, "learning_rate": 0.0002919942260164145, "loss": 40.002, "step": 8490 }, { "epoch": 22.42588312974579, "grad_norm": 192.89268493652344, "learning_rate": 0.00029195304253736324, "loss": 38.7243, "step": 8491 }, { "epoch": 22.428524265434138, "grad_norm": 349.2945556640625, "learning_rate": 0.0002919118578867979, "loss": 38.4049, "step": 8492 }, { "epoch": 22.43116540112248, "grad_norm": 643.5361328125, "learning_rate": 0.00029187067206586857, "loss": 37.5102, "step": 8493 }, { "epoch": 22.433806536810827, "grad_norm": 237.23570251464844, "learning_rate": 0.0002918294850757253, "loss": 35.9349, "step": 8494 }, { "epoch": 22.436447672499174, "grad_norm": 277.3971252441406, "learning_rate": 0.00029178829691751826, "loss": 36.6908, "step": 8495 }, { "epoch": 22.43908880818752, "grad_norm": 156.11346435546875, "learning_rate": 0.0002917471075923975, "loss": 36.6896, "step": 8496 }, { "epoch": 22.441729943875867, "grad_norm": 251.21255493164062, "learning_rate": 0.00029170591710151334, "loss": 35.5531, "step": 8497 }, { "epoch": 22.444371079564213, "grad_norm": 360.5844421386719, "learning_rate": 0.00029166472544601605, "loss": 34.2045, "step": 8498 }, { "epoch": 22.44701221525256, "grad_norm": 189.22000122070312, "learning_rate": 0.0002916235326270556, "loss": 34.4444, "step": 8499 }, { "epoch": 22.449653350940906, "grad_norm": 323.3016662597656, "learning_rate": 0.00029158233864578256, "loss": 35.1869, "step": 8500 }, { "epoch": 22.45229448662925, "grad_norm": 214.27059936523438, "learning_rate": 0.00029154114350334704, "loss": 35.653, "step": 8501 }, { "epoch": 22.454935622317596, "grad_norm": 965.1487426757812, "learning_rate": 0.00029149994720089955, "loss": 40.2581, "step": 8502 }, { "epoch": 22.457576758005942, "grad_norm": 5934.1943359375, "learning_rate": 0.0002914587497395905, "loss": 48.1379, "step": 8503 }, { "epoch": 22.46021789369429, "grad_norm": 1497.7906494140625, "learning_rate": 0.00029141755112057006, "loss": 41.0671, "step": 8504 }, { "epoch": 22.462859029382635, "grad_norm": 5665.52734375, "learning_rate": 0.0002913763513449889, "loss": 34.1779, "step": 8505 }, { "epoch": 22.46550016507098, "grad_norm": 2011.2816162109375, "learning_rate": 0.00029133515041399743, "loss": 40.4823, "step": 8506 }, { "epoch": 22.468141300759328, "grad_norm": 1202.89013671875, "learning_rate": 0.00029129394832874624, "loss": 40.9388, "step": 8507 }, { "epoch": 22.47078243644767, "grad_norm": 2684.095947265625, "learning_rate": 0.00029125274509038575, "loss": 35.1029, "step": 8508 }, { "epoch": 22.473423572136017, "grad_norm": 2443.37109375, "learning_rate": 0.0002912115407000665, "loss": 27.3479, "step": 8509 }, { "epoch": 22.476064707824364, "grad_norm": 2424.34375, "learning_rate": 0.00029117033515893933, "loss": 24.4768, "step": 8510 }, { "epoch": 22.47870584351271, "grad_norm": 1770.934814453125, "learning_rate": 0.0002911291284681546, "loss": 17.6635, "step": 8511 }, { "epoch": 22.481346979201057, "grad_norm": 565.058349609375, "learning_rate": 0.0002910879206288632, "loss": 33.2143, "step": 8512 }, { "epoch": 22.483988114889403, "grad_norm": 267.8642883300781, "learning_rate": 0.00029104671164221574, "loss": 37.3109, "step": 8513 }, { "epoch": 22.48662925057775, "grad_norm": 381.2462463378906, "learning_rate": 0.000291005501509363, "loss": 36.0691, "step": 8514 }, { "epoch": 22.489270386266096, "grad_norm": 378.48175048828125, "learning_rate": 0.00029096429023145573, "loss": 35.6722, "step": 8515 }, { "epoch": 22.49191152195444, "grad_norm": 336.24310302734375, "learning_rate": 0.0002909230778096446, "loss": 35.8636, "step": 8516 }, { "epoch": 22.494552657642785, "grad_norm": 396.0838928222656, "learning_rate": 0.00029088186424508066, "loss": 35.5048, "step": 8517 }, { "epoch": 22.49719379333113, "grad_norm": 538.7273559570312, "learning_rate": 0.00029084064953891457, "loss": 34.3864, "step": 8518 }, { "epoch": 22.499834929019478, "grad_norm": 329.1448974609375, "learning_rate": 0.00029079943369229747, "loss": 37.9241, "step": 8519 }, { "epoch": 22.502476064707825, "grad_norm": 227.2936553955078, "learning_rate": 0.0002907582167063801, "loss": 35.15, "step": 8520 }, { "epoch": 22.50511720039617, "grad_norm": 361.4136047363281, "learning_rate": 0.00029071699858231346, "loss": 35.1015, "step": 8521 }, { "epoch": 22.507758336084517, "grad_norm": 380.00201416015625, "learning_rate": 0.00029067577932124854, "loss": 35.0761, "step": 8522 }, { "epoch": 22.510399471772864, "grad_norm": 257.23651123046875, "learning_rate": 0.0002906345589243364, "loss": 35.4091, "step": 8523 }, { "epoch": 22.513040607461207, "grad_norm": 442.7409362792969, "learning_rate": 0.00029059333739272814, "loss": 37.112, "step": 8524 }, { "epoch": 22.515681743149553, "grad_norm": 1031.5367431640625, "learning_rate": 0.00029055211472757463, "loss": 34.0858, "step": 8525 }, { "epoch": 22.5183228788379, "grad_norm": 277.5330505371094, "learning_rate": 0.00029051089093002724, "loss": 35.8739, "step": 8526 }, { "epoch": 22.520964014526246, "grad_norm": 301.2928771972656, "learning_rate": 0.000290469666001237, "loss": 34.903, "step": 8527 }, { "epoch": 22.523605150214593, "grad_norm": 1151.5179443359375, "learning_rate": 0.00029042843994235513, "loss": 35.6416, "step": 8528 }, { "epoch": 22.52624628590294, "grad_norm": 959.9163818359375, "learning_rate": 0.00029038721275453276, "loss": 37.0774, "step": 8529 }, { "epoch": 22.528887421591286, "grad_norm": 2079.08642578125, "learning_rate": 0.00029034598443892124, "loss": 41.4405, "step": 8530 }, { "epoch": 22.53152855727963, "grad_norm": 287.6624755859375, "learning_rate": 0.0002903047549966718, "loss": 39.2137, "step": 8531 }, { "epoch": 22.534169692967975, "grad_norm": 329.63397216796875, "learning_rate": 0.00029026352442893574, "loss": 40.7592, "step": 8532 }, { "epoch": 22.53681082865632, "grad_norm": 244.13864135742188, "learning_rate": 0.00029022229273686445, "loss": 41.3675, "step": 8533 }, { "epoch": 22.539451964344668, "grad_norm": 243.81500244140625, "learning_rate": 0.00029018105992160927, "loss": 42.4227, "step": 8534 }, { "epoch": 22.542093100033014, "grad_norm": 356.8961486816406, "learning_rate": 0.00029013982598432154, "loss": 42.7546, "step": 8535 }, { "epoch": 22.54473423572136, "grad_norm": 300.3351135253906, "learning_rate": 0.00029009859092615276, "loss": 42.6567, "step": 8536 }, { "epoch": 22.547375371409707, "grad_norm": 632.5995483398438, "learning_rate": 0.0002900573547482544, "loss": 45.7717, "step": 8537 }, { "epoch": 22.550016507098054, "grad_norm": 219.85971069335938, "learning_rate": 0.000290016117451778, "loss": 43.1625, "step": 8538 }, { "epoch": 22.552657642786397, "grad_norm": 627.5389404296875, "learning_rate": 0.0002899748790378749, "loss": 41.8727, "step": 8539 }, { "epoch": 22.555298778474743, "grad_norm": 341.6162414550781, "learning_rate": 0.00028993363950769683, "loss": 38.2777, "step": 8540 }, { "epoch": 22.55793991416309, "grad_norm": 421.8855285644531, "learning_rate": 0.0002898923988623953, "loss": 38.8401, "step": 8541 }, { "epoch": 22.560581049851436, "grad_norm": 381.89752197265625, "learning_rate": 0.00028985115710312216, "loss": 39.5143, "step": 8542 }, { "epoch": 22.563222185539782, "grad_norm": 230.98843383789062, "learning_rate": 0.00028980991423102864, "loss": 36.4249, "step": 8543 }, { "epoch": 22.56586332122813, "grad_norm": 438.6866149902344, "learning_rate": 0.00028976867024726674, "loss": 37.9312, "step": 8544 }, { "epoch": 22.568504456916475, "grad_norm": 153.78512573242188, "learning_rate": 0.00028972742515298805, "loss": 35.5404, "step": 8545 }, { "epoch": 22.57114559260482, "grad_norm": 481.36053466796875, "learning_rate": 0.0002896861789493445, "loss": 36.4576, "step": 8546 }, { "epoch": 22.573786728293165, "grad_norm": 435.1902160644531, "learning_rate": 0.00028964493163748757, "loss": 36.1177, "step": 8547 }, { "epoch": 22.57642786398151, "grad_norm": 333.40435791015625, "learning_rate": 0.00028960368321856916, "loss": 35.5389, "step": 8548 }, { "epoch": 22.579068999669857, "grad_norm": 364.0855407714844, "learning_rate": 0.0002895624336937413, "loss": 35.2977, "step": 8549 }, { "epoch": 22.581710135358204, "grad_norm": 669.8021240234375, "learning_rate": 0.00028952118306415557, "loss": 35.5697, "step": 8550 }, { "epoch": 22.58435127104655, "grad_norm": 1101.2828369140625, "learning_rate": 0.00028947993133096416, "loss": 37.0528, "step": 8551 }, { "epoch": 22.586992406734897, "grad_norm": 1283.8167724609375, "learning_rate": 0.00028943867849531884, "loss": 50.0532, "step": 8552 }, { "epoch": 22.589633542423243, "grad_norm": 4037.11083984375, "learning_rate": 0.0002893974245583715, "loss": 33.6994, "step": 8553 }, { "epoch": 22.592274678111586, "grad_norm": 4500.8662109375, "learning_rate": 0.0002893561695212743, "loss": 42.2127, "step": 8554 }, { "epoch": 22.594915813799933, "grad_norm": 3463.1865234375, "learning_rate": 0.00028931491338517917, "loss": 42.875, "step": 8555 }, { "epoch": 22.59755694948828, "grad_norm": 2475.076171875, "learning_rate": 0.0002892736561512382, "loss": 35.6025, "step": 8556 }, { "epoch": 22.600198085176626, "grad_norm": 6680.65869140625, "learning_rate": 0.0002892323978206034, "loss": 30.5436, "step": 8557 }, { "epoch": 22.602839220864972, "grad_norm": 1544.456787109375, "learning_rate": 0.00028919113839442694, "loss": 31.9915, "step": 8558 }, { "epoch": 22.60548035655332, "grad_norm": 1626.2349853515625, "learning_rate": 0.00028914987787386105, "loss": 23.4024, "step": 8559 }, { "epoch": 22.608121492241665, "grad_norm": 1419.87255859375, "learning_rate": 0.00028910861626005774, "loss": 24.4714, "step": 8560 }, { "epoch": 22.61076262793001, "grad_norm": 1731.42041015625, "learning_rate": 0.00028906735355416934, "loss": 27.5839, "step": 8561 }, { "epoch": 22.613403763618354, "grad_norm": 1822.8201904296875, "learning_rate": 0.00028902608975734794, "loss": 31.0708, "step": 8562 }, { "epoch": 22.6160448993067, "grad_norm": 759.8280029296875, "learning_rate": 0.00028898482487074603, "loss": 35.7297, "step": 8563 }, { "epoch": 22.618686034995047, "grad_norm": 381.77008056640625, "learning_rate": 0.00028894355889551575, "loss": 33.6626, "step": 8564 }, { "epoch": 22.621327170683394, "grad_norm": 546.6328735351562, "learning_rate": 0.0002889022918328094, "loss": 37.044, "step": 8565 }, { "epoch": 22.62396830637174, "grad_norm": 385.1479797363281, "learning_rate": 0.0002888610236837795, "loss": 35.8779, "step": 8566 }, { "epoch": 22.626609442060087, "grad_norm": 1028.871826171875, "learning_rate": 0.0002888197544495783, "loss": 35.2222, "step": 8567 }, { "epoch": 22.629250577748433, "grad_norm": 564.1788940429688, "learning_rate": 0.00028877848413135827, "loss": 35.0316, "step": 8568 }, { "epoch": 22.63189171343678, "grad_norm": 467.4166259765625, "learning_rate": 0.00028873721273027183, "loss": 35.8822, "step": 8569 }, { "epoch": 22.634532849125122, "grad_norm": 875.8065795898438, "learning_rate": 0.0002886959402474714, "loss": 34.8552, "step": 8570 }, { "epoch": 22.63717398481347, "grad_norm": 312.4656982421875, "learning_rate": 0.0002886546666841097, "loss": 36.3652, "step": 8571 }, { "epoch": 22.639815120501815, "grad_norm": 444.49884033203125, "learning_rate": 0.0002886133920413391, "loss": 34.9615, "step": 8572 }, { "epoch": 22.64245625619016, "grad_norm": 646.4826049804688, "learning_rate": 0.0002885721163203123, "loss": 34.8178, "step": 8573 }, { "epoch": 22.645097391878508, "grad_norm": 264.2083435058594, "learning_rate": 0.0002885308395221816, "loss": 35.5388, "step": 8574 }, { "epoch": 22.647738527566855, "grad_norm": 422.6962585449219, "learning_rate": 0.00028848956164810005, "loss": 34.7405, "step": 8575 }, { "epoch": 22.6503796632552, "grad_norm": 328.5664367675781, "learning_rate": 0.0002884482826992199, "loss": 34.289, "step": 8576 }, { "epoch": 22.653020798943544, "grad_norm": 577.1526489257812, "learning_rate": 0.00028840700267669417, "loss": 36.0156, "step": 8577 }, { "epoch": 22.65566193463189, "grad_norm": 284.2899169921875, "learning_rate": 0.0002883657215816755, "loss": 36.5021, "step": 8578 }, { "epoch": 22.658303070320237, "grad_norm": 706.339111328125, "learning_rate": 0.0002883244394153165, "loss": 38.4762, "step": 8579 }, { "epoch": 22.660944206008583, "grad_norm": 736.1837768554688, "learning_rate": 0.00028828315617877, "loss": 41.1445, "step": 8580 }, { "epoch": 22.66358534169693, "grad_norm": 210.19554138183594, "learning_rate": 0.000288241871873189, "loss": 39.0472, "step": 8581 }, { "epoch": 22.666226477385276, "grad_norm": 499.00616455078125, "learning_rate": 0.00028820058649972613, "loss": 42.0159, "step": 8582 }, { "epoch": 22.668867613073623, "grad_norm": 451.44366455078125, "learning_rate": 0.00028815930005953426, "loss": 41.1486, "step": 8583 }, { "epoch": 22.67150874876197, "grad_norm": 580.4337768554688, "learning_rate": 0.00028811801255376637, "loss": 41.9812, "step": 8584 }, { "epoch": 22.674149884450312, "grad_norm": 960.5958251953125, "learning_rate": 0.0002880767239835754, "loss": 45.0617, "step": 8585 }, { "epoch": 22.67679102013866, "grad_norm": 318.04205322265625, "learning_rate": 0.0002880354343501143, "loss": 43.3157, "step": 8586 }, { "epoch": 22.679432155827005, "grad_norm": 639.0299072265625, "learning_rate": 0.00028799414365453604, "loss": 42.2406, "step": 8587 }, { "epoch": 22.68207329151535, "grad_norm": 908.9718627929688, "learning_rate": 0.00028795285189799356, "loss": 42.8092, "step": 8588 }, { "epoch": 22.684714427203698, "grad_norm": 377.26556396484375, "learning_rate": 0.00028791155908164005, "loss": 39.8414, "step": 8589 }, { "epoch": 22.687355562892044, "grad_norm": 377.92266845703125, "learning_rate": 0.00028787026520662854, "loss": 40.6778, "step": 8590 }, { "epoch": 22.68999669858039, "grad_norm": 450.79736328125, "learning_rate": 0.0002878289702741121, "loss": 37.7345, "step": 8591 }, { "epoch": 22.692637834268737, "grad_norm": 282.3731689453125, "learning_rate": 0.00028778767428524387, "loss": 37.5766, "step": 8592 }, { "epoch": 22.69527896995708, "grad_norm": 344.5379943847656, "learning_rate": 0.000287746377241177, "loss": 36.9206, "step": 8593 }, { "epoch": 22.697920105645427, "grad_norm": 297.8644714355469, "learning_rate": 0.0002877050791430648, "loss": 35.7904, "step": 8594 }, { "epoch": 22.700561241333773, "grad_norm": 468.95684814453125, "learning_rate": 0.0002876637799920604, "loss": 35.341, "step": 8595 }, { "epoch": 22.70320237702212, "grad_norm": 923.272705078125, "learning_rate": 0.0002876224797893171, "loss": 36.3774, "step": 8596 }, { "epoch": 22.705843512710466, "grad_norm": 257.659912109375, "learning_rate": 0.000287581178535988, "loss": 33.8263, "step": 8597 }, { "epoch": 22.708484648398812, "grad_norm": 219.5772247314453, "learning_rate": 0.00028753987623322667, "loss": 35.0588, "step": 8598 }, { "epoch": 22.71112578408716, "grad_norm": 668.5338134765625, "learning_rate": 0.00028749857288218637, "loss": 35.4899, "step": 8599 }, { "epoch": 22.7137669197755, "grad_norm": 394.9412841796875, "learning_rate": 0.00028745726848402036, "loss": 35.4205, "step": 8600 }, { "epoch": 22.7137669197755, "eval_loss": 4.1647515296936035, "eval_runtime": 2.1203, "eval_samples_per_second": 233.455, "eval_steps_per_second": 29.241, "step": 8600 }, { "epoch": 22.716408055463848, "grad_norm": 917.740234375, "learning_rate": 0.0002874159630398823, "loss": 37.0478, "step": 8601 }, { "epoch": 22.719049191152195, "grad_norm": 3069.310302734375, "learning_rate": 0.00028737465655092524, "loss": 46.1927, "step": 8602 }, { "epoch": 22.72169032684054, "grad_norm": 1877.2447509765625, "learning_rate": 0.00028733334901830305, "loss": 36.7719, "step": 8603 }, { "epoch": 22.724331462528887, "grad_norm": 1078.4285888671875, "learning_rate": 0.00028729204044316886, "loss": 30.0088, "step": 8604 }, { "epoch": 22.726972598217234, "grad_norm": 3613.35888671875, "learning_rate": 0.00028725073082667643, "loss": 27.2153, "step": 8605 }, { "epoch": 22.72961373390558, "grad_norm": 6763.6787109375, "learning_rate": 0.0002872094201699792, "loss": 25.6644, "step": 8606 }, { "epoch": 22.732254869593927, "grad_norm": 4235.71728515625, "learning_rate": 0.0002871681084742308, "loss": 21.0825, "step": 8607 }, { "epoch": 22.73489600528227, "grad_norm": 1180.3306884765625, "learning_rate": 0.00028712679574058477, "loss": 18.7515, "step": 8608 }, { "epoch": 22.737537140970616, "grad_norm": 4098.14501953125, "learning_rate": 0.00028708548197019473, "loss": 17.6355, "step": 8609 }, { "epoch": 22.740178276658963, "grad_norm": 2475.213134765625, "learning_rate": 0.0002870441671642145, "loss": 17.7596, "step": 8610 }, { "epoch": 22.74281941234731, "grad_norm": 947.0032958984375, "learning_rate": 0.00028700285132379755, "loss": 15.0003, "step": 8611 }, { "epoch": 22.745460548035656, "grad_norm": 1284.2281494140625, "learning_rate": 0.0002869615344500978, "loss": 22.8707, "step": 8612 }, { "epoch": 22.748101683724002, "grad_norm": 1586.6092529296875, "learning_rate": 0.0002869202165442689, "loss": 37.1986, "step": 8613 }, { "epoch": 22.75074281941235, "grad_norm": 2445.039306640625, "learning_rate": 0.0002868788976074646, "loss": 36.5517, "step": 8614 }, { "epoch": 22.753383955100695, "grad_norm": 702.990478515625, "learning_rate": 0.0002868375776408388, "loss": 36.5687, "step": 8615 }, { "epoch": 22.756025090789038, "grad_norm": 1089.6676025390625, "learning_rate": 0.00028679625664554533, "loss": 35.775, "step": 8616 }, { "epoch": 22.758666226477384, "grad_norm": 1096.6876220703125, "learning_rate": 0.00028675493462273794, "loss": 35.4549, "step": 8617 }, { "epoch": 22.76130736216573, "grad_norm": 1015.0945434570312, "learning_rate": 0.00028671361157357056, "loss": 36.2744, "step": 8618 }, { "epoch": 22.763948497854077, "grad_norm": 1126.721923828125, "learning_rate": 0.0002866722874991971, "loss": 35.9421, "step": 8619 }, { "epoch": 22.766589633542424, "grad_norm": 526.2467651367188, "learning_rate": 0.00028663096240077166, "loss": 35.9646, "step": 8620 }, { "epoch": 22.76923076923077, "grad_norm": 460.7110290527344, "learning_rate": 0.0002865896362794481, "loss": 34.8493, "step": 8621 }, { "epoch": 22.771871904919117, "grad_norm": 763.000732421875, "learning_rate": 0.00028654830913638047, "loss": 35.6529, "step": 8622 }, { "epoch": 22.77451304060746, "grad_norm": 699.5167236328125, "learning_rate": 0.00028650698097272266, "loss": 36.8565, "step": 8623 }, { "epoch": 22.777154176295806, "grad_norm": 633.4168090820312, "learning_rate": 0.00028646565178962896, "loss": 34.6551, "step": 8624 }, { "epoch": 22.779795311984152, "grad_norm": 595.2700805664062, "learning_rate": 0.00028642432158825334, "loss": 34.6236, "step": 8625 }, { "epoch": 22.7824364476725, "grad_norm": 542.052490234375, "learning_rate": 0.00028638299036974987, "loss": 35.7081, "step": 8626 }, { "epoch": 22.785077583360845, "grad_norm": 862.9315185546875, "learning_rate": 0.0002863416581352728, "loss": 34.44, "step": 8627 }, { "epoch": 22.78771871904919, "grad_norm": 774.6289672851562, "learning_rate": 0.0002863003248859762, "loss": 36.6522, "step": 8628 }, { "epoch": 22.790359854737538, "grad_norm": 815.9254760742188, "learning_rate": 0.00028625899062301456, "loss": 39.4327, "step": 8629 }, { "epoch": 22.793000990425885, "grad_norm": 2129.009765625, "learning_rate": 0.00028621765534754176, "loss": 44.4591, "step": 8630 }, { "epoch": 22.795642126114227, "grad_norm": 482.91851806640625, "learning_rate": 0.00028617631906071217, "loss": 41.6563, "step": 8631 }, { "epoch": 22.798283261802574, "grad_norm": 350.9093017578125, "learning_rate": 0.00028613498176368023, "loss": 42.3449, "step": 8632 }, { "epoch": 22.80092439749092, "grad_norm": 2453.678955078125, "learning_rate": 0.00028609364345760015, "loss": 41.6986, "step": 8633 }, { "epoch": 22.803565533179267, "grad_norm": 812.0777587890625, "learning_rate": 0.0002860523041436262, "loss": 42.9441, "step": 8634 }, { "epoch": 22.806206668867613, "grad_norm": 249.77415466308594, "learning_rate": 0.00028601096382291287, "loss": 44.8995, "step": 8635 }, { "epoch": 22.80884780455596, "grad_norm": 836.8030395507812, "learning_rate": 0.0002859696224966145, "loss": 44.6635, "step": 8636 }, { "epoch": 22.811488940244306, "grad_norm": 452.26336669921875, "learning_rate": 0.00028592828016588557, "loss": 42.6019, "step": 8637 }, { "epoch": 22.814130075932653, "grad_norm": 400.36968994140625, "learning_rate": 0.0002858869368318805, "loss": 42.0751, "step": 8638 }, { "epoch": 22.816771211620996, "grad_norm": 371.3652038574219, "learning_rate": 0.0002858455924957539, "loss": 40.5424, "step": 8639 }, { "epoch": 22.819412347309342, "grad_norm": 410.9153137207031, "learning_rate": 0.0002858042471586601, "loss": 40.5758, "step": 8640 }, { "epoch": 22.82205348299769, "grad_norm": 249.2348175048828, "learning_rate": 0.0002857629008217538, "loss": 39.9192, "step": 8641 }, { "epoch": 22.824694618686035, "grad_norm": 215.62184143066406, "learning_rate": 0.00028572155348618945, "loss": 36.7211, "step": 8642 }, { "epoch": 22.82733575437438, "grad_norm": 236.7309112548828, "learning_rate": 0.0002856802051531217, "loss": 37.1209, "step": 8643 }, { "epoch": 22.829976890062728, "grad_norm": 653.15283203125, "learning_rate": 0.00028563885582370515, "loss": 35.8737, "step": 8644 }, { "epoch": 22.832618025751074, "grad_norm": 246.6115264892578, "learning_rate": 0.00028559750549909447, "loss": 35.9727, "step": 8645 }, { "epoch": 22.835259161439417, "grad_norm": 223.30812072753906, "learning_rate": 0.00028555615418044447, "loss": 35.6441, "step": 8646 }, { "epoch": 22.837900297127764, "grad_norm": 253.22943115234375, "learning_rate": 0.00028551480186890966, "loss": 35.4189, "step": 8647 }, { "epoch": 22.84054143281611, "grad_norm": 259.90380859375, "learning_rate": 0.0002854734485656449, "loss": 35.6887, "step": 8648 }, { "epoch": 22.843182568504456, "grad_norm": 237.71580505371094, "learning_rate": 0.00028543209427180495, "loss": 34.8335, "step": 8649 }, { "epoch": 22.845823704192803, "grad_norm": 221.6401824951172, "learning_rate": 0.00028539073898854457, "loss": 35.7536, "step": 8650 }, { "epoch": 22.84846483988115, "grad_norm": 301.6711120605469, "learning_rate": 0.0002853493827170186, "loss": 36.062, "step": 8651 }, { "epoch": 22.851105975569496, "grad_norm": 1688.513427734375, "learning_rate": 0.00028530802545838184, "loss": 40.5257, "step": 8652 }, { "epoch": 22.853747111257842, "grad_norm": 3353.4951171875, "learning_rate": 0.00028526666721378927, "loss": 25.4441, "step": 8653 }, { "epoch": 22.856388246946185, "grad_norm": 892.042236328125, "learning_rate": 0.00028522530798439564, "loss": 36.1392, "step": 8654 }, { "epoch": 22.85902938263453, "grad_norm": 1726.1810302734375, "learning_rate": 0.0002851839477713561, "loss": 23.3919, "step": 8655 }, { "epoch": 22.861670518322878, "grad_norm": 1295.5361328125, "learning_rate": 0.0002851425865758254, "loss": 25.7821, "step": 8656 }, { "epoch": 22.864311654011225, "grad_norm": 1226.305419921875, "learning_rate": 0.0002851012243989587, "loss": 17.7688, "step": 8657 }, { "epoch": 22.86695278969957, "grad_norm": 1325.0853271484375, "learning_rate": 0.0002850598612419108, "loss": 16.8698, "step": 8658 }, { "epoch": 22.869593925387917, "grad_norm": 2775.132080078125, "learning_rate": 0.00028501849710583696, "loss": 19.1842, "step": 8659 }, { "epoch": 22.872235061076264, "grad_norm": 1187.0421142578125, "learning_rate": 0.0002849771319918922, "loss": 13.0361, "step": 8660 }, { "epoch": 22.87487619676461, "grad_norm": 334.9435119628906, "learning_rate": 0.00028493576590123143, "loss": 18.1346, "step": 8661 }, { "epoch": 22.877517332452953, "grad_norm": 218.2841339111328, "learning_rate": 0.00028489439883501, "loss": 36.0928, "step": 8662 }, { "epoch": 22.8801584681413, "grad_norm": 603.295166015625, "learning_rate": 0.00028485303079438294, "loss": 35.5004, "step": 8663 }, { "epoch": 22.882799603829646, "grad_norm": 2033.7091064453125, "learning_rate": 0.0002848116617805056, "loss": 35.9095, "step": 8664 }, { "epoch": 22.885440739517993, "grad_norm": 326.66033935546875, "learning_rate": 0.000284770291794533, "loss": 35.6037, "step": 8665 }, { "epoch": 22.88808187520634, "grad_norm": 446.3265380859375, "learning_rate": 0.0002847289208376203, "loss": 35.1431, "step": 8666 }, { "epoch": 22.890723010894686, "grad_norm": 464.3775634765625, "learning_rate": 0.000284687548910923, "loss": 36.7189, "step": 8667 }, { "epoch": 22.893364146583032, "grad_norm": 355.0977783203125, "learning_rate": 0.0002846461760155963, "loss": 35.4091, "step": 8668 }, { "epoch": 22.896005282271375, "grad_norm": 221.1808624267578, "learning_rate": 0.00028460480215279546, "loss": 35.0553, "step": 8669 }, { "epoch": 22.89864641795972, "grad_norm": 297.1434326171875, "learning_rate": 0.0002845634273236758, "loss": 36.1472, "step": 8670 }, { "epoch": 22.901287553648068, "grad_norm": 1055.745849609375, "learning_rate": 0.00028452205152939273, "loss": 35.3925, "step": 8671 }, { "epoch": 22.903928689336414, "grad_norm": 760.3895263671875, "learning_rate": 0.00028448067477110167, "loss": 35.8086, "step": 8672 }, { "epoch": 22.90656982502476, "grad_norm": 503.65716552734375, "learning_rate": 0.0002844392970499581, "loss": 36.0817, "step": 8673 }, { "epoch": 22.909210960713107, "grad_norm": 252.44772338867188, "learning_rate": 0.0002843979183671173, "loss": 36.481, "step": 8674 }, { "epoch": 22.911852096401454, "grad_norm": 627.0155029296875, "learning_rate": 0.0002843565387237348, "loss": 36.3038, "step": 8675 }, { "epoch": 22.9144932320898, "grad_norm": 477.487548828125, "learning_rate": 0.00028431515812096624, "loss": 34.7115, "step": 8676 }, { "epoch": 22.917134367778143, "grad_norm": 689.3750610351562, "learning_rate": 0.0002842737765599671, "loss": 36.2807, "step": 8677 }, { "epoch": 22.91977550346649, "grad_norm": 334.33795166015625, "learning_rate": 0.0002842323940418928, "loss": 37.7596, "step": 8678 }, { "epoch": 22.922416639154836, "grad_norm": 354.6820983886719, "learning_rate": 0.0002841910105678989, "loss": 39.3567, "step": 8679 }, { "epoch": 22.925057774843182, "grad_norm": 328.1991271972656, "learning_rate": 0.0002841496261391412, "loss": 42.0432, "step": 8680 }, { "epoch": 22.92769891053153, "grad_norm": 501.4020080566406, "learning_rate": 0.0002841082407567753, "loss": 41.6341, "step": 8681 }, { "epoch": 22.930340046219875, "grad_norm": 928.68017578125, "learning_rate": 0.00028406685442195677, "loss": 42.1409, "step": 8682 }, { "epoch": 22.93298118190822, "grad_norm": 529.3074951171875, "learning_rate": 0.0002840254671358414, "loss": 42.1581, "step": 8683 }, { "epoch": 22.935622317596568, "grad_norm": 929.0535888671875, "learning_rate": 0.0002839840788995847, "loss": 42.8675, "step": 8684 }, { "epoch": 22.93826345328491, "grad_norm": 397.7246398925781, "learning_rate": 0.0002839426897143427, "loss": 42.422, "step": 8685 }, { "epoch": 22.940904588973257, "grad_norm": 875.8801879882812, "learning_rate": 0.00028390129958127096, "loss": 40.4452, "step": 8686 }, { "epoch": 22.943545724661604, "grad_norm": 482.1387939453125, "learning_rate": 0.0002838599085015254, "loss": 40.0391, "step": 8687 }, { "epoch": 22.94618686034995, "grad_norm": 451.5480041503906, "learning_rate": 0.00028381851647626174, "loss": 39.3764, "step": 8688 }, { "epoch": 22.948827996038297, "grad_norm": 373.969970703125, "learning_rate": 0.0002837771235066359, "loss": 37.2908, "step": 8689 }, { "epoch": 22.951469131726643, "grad_norm": 363.89141845703125, "learning_rate": 0.0002837357295938037, "loss": 36.0721, "step": 8690 }, { "epoch": 22.95411026741499, "grad_norm": 499.24090576171875, "learning_rate": 0.00028369433473892107, "loss": 35.8702, "step": 8691 }, { "epoch": 22.956751403103333, "grad_norm": 217.34120178222656, "learning_rate": 0.0002836529389431439, "loss": 36.5784, "step": 8692 }, { "epoch": 22.95939253879168, "grad_norm": 614.810791015625, "learning_rate": 0.0002836115422076283, "loss": 24.9631, "step": 8693 }, { "epoch": 22.962033674480026, "grad_norm": 427.36376953125, "learning_rate": 0.00028357014453353, "loss": 11.086, "step": 8694 }, { "epoch": 22.964674810168372, "grad_norm": 3982.04736328125, "learning_rate": 0.0002835287459220053, "loss": 15.9358, "step": 8695 }, { "epoch": 22.96731594585672, "grad_norm": 17289.58203125, "learning_rate": 0.0002834873463742099, "loss": 10.8828, "step": 8696 }, { "epoch": 22.969957081545065, "grad_norm": 420.6706848144531, "learning_rate": 0.0002834459458913001, "loss": 16.9438, "step": 8697 }, { "epoch": 22.97259821723341, "grad_norm": 566.7324829101562, "learning_rate": 0.0002834045444744318, "loss": 31.7931, "step": 8698 }, { "epoch": 22.975239352921758, "grad_norm": 647.619873046875, "learning_rate": 0.0002833631421247614, "loss": 35.826, "step": 8699 }, { "epoch": 22.9778804886101, "grad_norm": 218.1534881591797, "learning_rate": 0.00028332173884344477, "loss": 35.6418, "step": 8700 }, { "epoch": 22.980521624298447, "grad_norm": 229.28648376464844, "learning_rate": 0.0002832803346316381, "loss": 34.8248, "step": 8701 }, { "epoch": 22.983162759986794, "grad_norm": 461.83660888671875, "learning_rate": 0.0002832389294904977, "loss": 35.6566, "step": 8702 }, { "epoch": 22.98580389567514, "grad_norm": 234.41432189941406, "learning_rate": 0.0002831975234211797, "loss": 34.8157, "step": 8703 }, { "epoch": 22.988445031363486, "grad_norm": 1092.12548828125, "learning_rate": 0.00028315611642484043, "loss": 36.6769, "step": 8704 }, { "epoch": 22.991086167051833, "grad_norm": 725.8175659179688, "learning_rate": 0.00028311470850263595, "loss": 36.8133, "step": 8705 }, { "epoch": 22.99372730274018, "grad_norm": 332.3294372558594, "learning_rate": 0.0002830732996557227, "loss": 35.102, "step": 8706 }, { "epoch": 22.996368438428526, "grad_norm": 216.93978881835938, "learning_rate": 0.00028303188988525706, "loss": 35.49, "step": 8707 }, { "epoch": 22.99900957411687, "grad_norm": 606.2385864257812, "learning_rate": 0.00028299047919239534, "loss": 38.5785, "step": 8708 }, { "epoch": 23.001650709805215, "grad_norm": 501.9562072753906, "learning_rate": 0.0002829490675782938, "loss": 40.5689, "step": 8709 }, { "epoch": 23.00429184549356, "grad_norm": 177.79380798339844, "learning_rate": 0.00028290765504410885, "loss": 40.4316, "step": 8710 }, { "epoch": 23.006932981181908, "grad_norm": 601.940185546875, "learning_rate": 0.00028286624159099704, "loss": 39.4782, "step": 8711 }, { "epoch": 23.009574116870255, "grad_norm": 169.39903259277344, "learning_rate": 0.00028282482722011477, "loss": 39.6296, "step": 8712 }, { "epoch": 23.0122152525586, "grad_norm": 250.89491271972656, "learning_rate": 0.00028278341193261843, "loss": 41.1892, "step": 8713 }, { "epoch": 23.014856388246947, "grad_norm": 246.76071166992188, "learning_rate": 0.00028274199572966456, "loss": 43.325, "step": 8714 }, { "epoch": 23.017497523935294, "grad_norm": 822.7919921875, "learning_rate": 0.0002827005786124097, "loss": 43.4114, "step": 8715 }, { "epoch": 23.020138659623637, "grad_norm": 251.5475311279297, "learning_rate": 0.0002826591605820104, "loss": 42.8374, "step": 8716 }, { "epoch": 23.022779795311983, "grad_norm": 283.3432922363281, "learning_rate": 0.0002826177416396233, "loss": 39.7614, "step": 8717 }, { "epoch": 23.02542093100033, "grad_norm": 262.96337890625, "learning_rate": 0.00028257632178640485, "loss": 42.4422, "step": 8718 }, { "epoch": 23.028062066688676, "grad_norm": 281.0896301269531, "learning_rate": 0.0002825349010235117, "loss": 40.3657, "step": 8719 }, { "epoch": 23.030703202377023, "grad_norm": 219.00865173339844, "learning_rate": 0.00028249347935210064, "loss": 38.9004, "step": 8720 }, { "epoch": 23.03334433806537, "grad_norm": 234.70751953125, "learning_rate": 0.00028245205677332827, "loss": 38.7197, "step": 8721 }, { "epoch": 23.035985473753716, "grad_norm": 292.8930358886719, "learning_rate": 0.00028241063328835115, "loss": 36.8572, "step": 8722 }, { "epoch": 23.03862660944206, "grad_norm": 311.4075927734375, "learning_rate": 0.00028236920889832624, "loss": 37.0634, "step": 8723 }, { "epoch": 23.041267745130405, "grad_norm": 365.056884765625, "learning_rate": 0.0002823277836044101, "loss": 35.8073, "step": 8724 }, { "epoch": 23.04390888081875, "grad_norm": 197.0491943359375, "learning_rate": 0.0002822863574077597, "loss": 35.6639, "step": 8725 }, { "epoch": 23.046550016507098, "grad_norm": 237.5565948486328, "learning_rate": 0.00028224493030953174, "loss": 35.3297, "step": 8726 }, { "epoch": 23.049191152195444, "grad_norm": 271.08740234375, "learning_rate": 0.0002822035023108829, "loss": 34.6235, "step": 8727 }, { "epoch": 23.05183228788379, "grad_norm": 421.88653564453125, "learning_rate": 0.00028216207341297034, "loss": 36.2443, "step": 8728 }, { "epoch": 23.054473423572137, "grad_norm": 238.19142150878906, "learning_rate": 0.0002821206436169507, "loss": 35.5634, "step": 8729 }, { "epoch": 23.057114559260484, "grad_norm": 707.533203125, "learning_rate": 0.000282079212923981, "loss": 36.8833, "step": 8730 }, { "epoch": 23.059755694948826, "grad_norm": 1667.82666015625, "learning_rate": 0.000282037781335218, "loss": 46.1021, "step": 8731 }, { "epoch": 23.062396830637173, "grad_norm": 2358.4248046875, "learning_rate": 0.0002819963488518189, "loss": 31.4327, "step": 8732 }, { "epoch": 23.06503796632552, "grad_norm": 6523.11328125, "learning_rate": 0.00028195491547494043, "loss": 28.1348, "step": 8733 }, { "epoch": 23.067679102013866, "grad_norm": 1902.674072265625, "learning_rate": 0.0002819134812057398, "loss": 34.5761, "step": 8734 }, { "epoch": 23.070320237702212, "grad_norm": 1476.765869140625, "learning_rate": 0.000281872046045374, "loss": 22.4745, "step": 8735 }, { "epoch": 23.07296137339056, "grad_norm": 798.0320434570312, "learning_rate": 0.000281830609995, "loss": 22.1012, "step": 8736 }, { "epoch": 23.075602509078905, "grad_norm": 4637.89501953125, "learning_rate": 0.0002817891730557749, "loss": 17.3866, "step": 8737 }, { "epoch": 23.07824364476725, "grad_norm": 888.5277099609375, "learning_rate": 0.0002817477352288558, "loss": 10.9039, "step": 8738 }, { "epoch": 23.080884780455595, "grad_norm": 1934.370361328125, "learning_rate": 0.0002817062965153999, "loss": 17.3296, "step": 8739 }, { "epoch": 23.08352591614394, "grad_norm": 685.8988037109375, "learning_rate": 0.00028166485691656423, "loss": 17.6587, "step": 8740 }, { "epoch": 23.086167051832287, "grad_norm": 482.944580078125, "learning_rate": 0.00028162341643350606, "loss": 36.978, "step": 8741 }, { "epoch": 23.088808187520634, "grad_norm": 323.1009521484375, "learning_rate": 0.0002815819750673826, "loss": 36.5638, "step": 8742 }, { "epoch": 23.09144932320898, "grad_norm": 375.13824462890625, "learning_rate": 0.00028154053281935097, "loss": 36.2929, "step": 8743 }, { "epoch": 23.094090458897327, "grad_norm": 514.9178466796875, "learning_rate": 0.00028149908969056855, "loss": 36.7078, "step": 8744 }, { "epoch": 23.096731594585673, "grad_norm": 481.18621826171875, "learning_rate": 0.0002814576456821925, "loss": 35.3464, "step": 8745 }, { "epoch": 23.099372730274016, "grad_norm": 341.3115234375, "learning_rate": 0.0002814162007953802, "loss": 36.11, "step": 8746 }, { "epoch": 23.102013865962363, "grad_norm": 519.4296264648438, "learning_rate": 0.00028137475503128893, "loss": 36.6168, "step": 8747 }, { "epoch": 23.10465500165071, "grad_norm": 556.228515625, "learning_rate": 0.00028133330839107606, "loss": 36.4495, "step": 8748 }, { "epoch": 23.107296137339056, "grad_norm": 251.27499389648438, "learning_rate": 0.00028129186087589897, "loss": 34.5944, "step": 8749 }, { "epoch": 23.109937273027402, "grad_norm": 293.4910888671875, "learning_rate": 0.00028125041248691505, "loss": 35.8447, "step": 8750 }, { "epoch": 23.11257840871575, "grad_norm": 505.22235107421875, "learning_rate": 0.00028120896322528173, "loss": 35.7422, "step": 8751 }, { "epoch": 23.115219544404095, "grad_norm": 344.4521179199219, "learning_rate": 0.0002811675130921564, "loss": 36.1073, "step": 8752 }, { "epoch": 23.11786068009244, "grad_norm": 488.8525390625, "learning_rate": 0.00028112606208869663, "loss": 34.8839, "step": 8753 }, { "epoch": 23.120501815780784, "grad_norm": 189.6537322998047, "learning_rate": 0.00028108461021605983, "loss": 34.967, "step": 8754 }, { "epoch": 23.12314295146913, "grad_norm": 537.4869384765625, "learning_rate": 0.0002810431574754036, "loss": 35.9755, "step": 8755 }, { "epoch": 23.125784087157477, "grad_norm": 382.3701477050781, "learning_rate": 0.0002810017038678854, "loss": 35.7887, "step": 8756 }, { "epoch": 23.128425222845824, "grad_norm": 635.926513671875, "learning_rate": 0.0002809602493946628, "loss": 36.8157, "step": 8757 }, { "epoch": 23.13106635853417, "grad_norm": 871.4188842773438, "learning_rate": 0.0002809187940568935, "loss": 39.3958, "step": 8758 }, { "epoch": 23.133707494222516, "grad_norm": 579.2510986328125, "learning_rate": 0.0002808773378557349, "loss": 41.9519, "step": 8759 }, { "epoch": 23.136348629910863, "grad_norm": 737.9061889648438, "learning_rate": 0.0002808358807923448, "loss": 42.3219, "step": 8760 }, { "epoch": 23.13898976559921, "grad_norm": 574.70263671875, "learning_rate": 0.00028079442286788097, "loss": 42.2158, "step": 8761 }, { "epoch": 23.141630901287552, "grad_norm": 336.5176696777344, "learning_rate": 0.00028075296408350086, "loss": 43.2756, "step": 8762 }, { "epoch": 23.1442720369759, "grad_norm": 646.2844848632812, "learning_rate": 0.00028071150444036237, "loss": 43.7837, "step": 8763 }, { "epoch": 23.146913172664245, "grad_norm": 435.9888610839844, "learning_rate": 0.00028067004393962316, "loss": 43.0871, "step": 8764 }, { "epoch": 23.14955430835259, "grad_norm": 371.746337890625, "learning_rate": 0.000280628582582441, "loss": 42.7928, "step": 8765 }, { "epoch": 23.152195444040938, "grad_norm": 856.5863037109375, "learning_rate": 0.0002805871203699736, "loss": 42.9143, "step": 8766 }, { "epoch": 23.154836579729285, "grad_norm": 439.491943359375, "learning_rate": 0.00028054565730337876, "loss": 39.9433, "step": 8767 }, { "epoch": 23.15747771541763, "grad_norm": 901.3368530273438, "learning_rate": 0.0002805041933838145, "loss": 41.117, "step": 8768 }, { "epoch": 23.160118851105974, "grad_norm": 519.8350830078125, "learning_rate": 0.00028046272861243854, "loss": 42.5194, "step": 8769 }, { "epoch": 23.16275998679432, "grad_norm": 355.0358581542969, "learning_rate": 0.00028042126299040885, "loss": 40.3003, "step": 8770 }, { "epoch": 23.165401122482667, "grad_norm": 215.014892578125, "learning_rate": 0.0002803797965188831, "loss": 39.4317, "step": 8771 }, { "epoch": 23.168042258171013, "grad_norm": 293.06866455078125, "learning_rate": 0.0002803383291990195, "loss": 37.0315, "step": 8772 }, { "epoch": 23.17068339385936, "grad_norm": 247.88888549804688, "learning_rate": 0.0002802968610319759, "loss": 37.4345, "step": 8773 }, { "epoch": 23.173324529547706, "grad_norm": 322.9653015136719, "learning_rate": 0.0002802553920189101, "loss": 34.5307, "step": 8774 }, { "epoch": 23.175965665236053, "grad_norm": 458.6505432128906, "learning_rate": 0.0002802139221609804, "loss": 35.4034, "step": 8775 }, { "epoch": 23.1786068009244, "grad_norm": 214.7537078857422, "learning_rate": 0.00028017245145934454, "loss": 35.1929, "step": 8776 }, { "epoch": 23.181247936612742, "grad_norm": 385.0619201660156, "learning_rate": 0.0002801309799151608, "loss": 36.4104, "step": 8777 }, { "epoch": 23.18388907230109, "grad_norm": 506.60003662109375, "learning_rate": 0.00028008950752958717, "loss": 35.5524, "step": 8778 }, { "epoch": 23.186530207989435, "grad_norm": 302.9767761230469, "learning_rate": 0.0002800480343037817, "loss": 35.9381, "step": 8779 }, { "epoch": 23.18917134367778, "grad_norm": 292.75811767578125, "learning_rate": 0.00028000656023890243, "loss": 37.2083, "step": 8780 }, { "epoch": 23.191812479366128, "grad_norm": 500.97430419921875, "learning_rate": 0.0002799650853361076, "loss": 39.0858, "step": 8781 }, { "epoch": 23.194453615054474, "grad_norm": 5381.3896484375, "learning_rate": 0.0002799236095965555, "loss": 44.6676, "step": 8782 }, { "epoch": 23.19709475074282, "grad_norm": 1816.9168701171875, "learning_rate": 0.0002798821330214041, "loss": 31.8075, "step": 8783 }, { "epoch": 23.199735886431167, "grad_norm": 6565.787109375, "learning_rate": 0.00027984065561181164, "loss": 35.3721, "step": 8784 }, { "epoch": 23.20237702211951, "grad_norm": 7559.81201171875, "learning_rate": 0.00027979917736893644, "loss": 31.4374, "step": 8785 }, { "epoch": 23.205018157807856, "grad_norm": 2366.50634765625, "learning_rate": 0.0002797576982939368, "loss": 24.8713, "step": 8786 }, { "epoch": 23.207659293496203, "grad_norm": 4965.1708984375, "learning_rate": 0.0002797162183879708, "loss": 32.3325, "step": 8787 }, { "epoch": 23.21030042918455, "grad_norm": 990.6084594726562, "learning_rate": 0.00027967473765219686, "loss": 19.7413, "step": 8788 }, { "epoch": 23.212941564872896, "grad_norm": 1856.165283203125, "learning_rate": 0.0002796332560877733, "loss": 19.5722, "step": 8789 }, { "epoch": 23.215582700561242, "grad_norm": 1504.7984619140625, "learning_rate": 0.00027959177369585856, "loss": 17.7217, "step": 8790 }, { "epoch": 23.21822383624959, "grad_norm": 327.32080078125, "learning_rate": 0.0002795502904776109, "loss": 26.1791, "step": 8791 }, { "epoch": 23.22086497193793, "grad_norm": 955.575439453125, "learning_rate": 0.0002795088064341887, "loss": 36.1397, "step": 8792 }, { "epoch": 23.223506107626278, "grad_norm": 290.23919677734375, "learning_rate": 0.0002794673215667505, "loss": 36.1471, "step": 8793 }, { "epoch": 23.226147243314625, "grad_norm": 479.1543273925781, "learning_rate": 0.00027942583587645456, "loss": 37.3705, "step": 8794 }, { "epoch": 23.22878837900297, "grad_norm": 664.2098999023438, "learning_rate": 0.00027938434936445943, "loss": 35.6923, "step": 8795 }, { "epoch": 23.231429514691317, "grad_norm": 462.2098388671875, "learning_rate": 0.0002793428620319237, "loss": 35.8186, "step": 8796 }, { "epoch": 23.234070650379664, "grad_norm": 281.36322021484375, "learning_rate": 0.00027930137388000565, "loss": 37.1529, "step": 8797 }, { "epoch": 23.23671178606801, "grad_norm": 208.09707641601562, "learning_rate": 0.00027925988490986404, "loss": 34.9427, "step": 8798 }, { "epoch": 23.239352921756357, "grad_norm": 314.79791259765625, "learning_rate": 0.0002792183951226573, "loss": 36.2288, "step": 8799 }, { "epoch": 23.2419940574447, "grad_norm": 242.68406677246094, "learning_rate": 0.0002791769045195441, "loss": 35.7372, "step": 8800 }, { "epoch": 23.2419940574447, "eval_loss": 4.016531944274902, "eval_runtime": 2.0812, "eval_samples_per_second": 237.841, "eval_steps_per_second": 29.79, "step": 8800 }, { "epoch": 23.244635193133046, "grad_norm": 818.8218383789062, "learning_rate": 0.00027913541310168287, "loss": 36.2847, "step": 8801 }, { "epoch": 23.247276328821393, "grad_norm": 785.203857421875, "learning_rate": 0.0002790939208702324, "loss": 36.1479, "step": 8802 }, { "epoch": 23.24991746450974, "grad_norm": 348.7536926269531, "learning_rate": 0.00027905242782635134, "loss": 36.0675, "step": 8803 }, { "epoch": 23.252558600198086, "grad_norm": 666.95751953125, "learning_rate": 0.0002790109339711982, "loss": 34.7845, "step": 8804 }, { "epoch": 23.255199735886432, "grad_norm": 272.37725830078125, "learning_rate": 0.0002789694393059318, "loss": 36.0048, "step": 8805 }, { "epoch": 23.25784087157478, "grad_norm": 431.62530517578125, "learning_rate": 0.0002789279438317108, "loss": 34.8734, "step": 8806 }, { "epoch": 23.260482007263125, "grad_norm": 1731.8052978515625, "learning_rate": 0.0002788864475496941, "loss": 37.3336, "step": 8807 }, { "epoch": 23.263123142951468, "grad_norm": 1653.608154296875, "learning_rate": 0.0002788449504610402, "loss": 40.9636, "step": 8808 }, { "epoch": 23.265764278639814, "grad_norm": 470.9421691894531, "learning_rate": 0.00027880345256690804, "loss": 40.9827, "step": 8809 }, { "epoch": 23.26840541432816, "grad_norm": 347.4349060058594, "learning_rate": 0.00027876195386845637, "loss": 41.6673, "step": 8810 }, { "epoch": 23.271046550016507, "grad_norm": 361.6109924316406, "learning_rate": 0.000278720454366844, "loss": 43.0696, "step": 8811 }, { "epoch": 23.273687685704854, "grad_norm": 642.7182006835938, "learning_rate": 0.00027867895406322987, "loss": 41.2759, "step": 8812 }, { "epoch": 23.2763288213932, "grad_norm": 1043.133544921875, "learning_rate": 0.0002786374529587728, "loss": 44.3174, "step": 8813 }, { "epoch": 23.278969957081546, "grad_norm": 729.7957153320312, "learning_rate": 0.00027859595105463167, "loss": 43.4185, "step": 8814 }, { "epoch": 23.28161109276989, "grad_norm": 693.4087524414062, "learning_rate": 0.00027855444835196534, "loss": 43.6876, "step": 8815 }, { "epoch": 23.284252228458236, "grad_norm": 604.9615478515625, "learning_rate": 0.0002785129448519328, "loss": 43.9464, "step": 8816 }, { "epoch": 23.286893364146582, "grad_norm": 1889.4229736328125, "learning_rate": 0.0002784714405556932, "loss": 40.9186, "step": 8817 }, { "epoch": 23.28953449983493, "grad_norm": 576.1091918945312, "learning_rate": 0.0002784299354644052, "loss": 41.0915, "step": 8818 }, { "epoch": 23.292175635523275, "grad_norm": 384.26031494140625, "learning_rate": 0.000278388429579228, "loss": 39.7902, "step": 8819 }, { "epoch": 23.29481677121162, "grad_norm": 357.0392761230469, "learning_rate": 0.00027834692290132053, "loss": 38.5557, "step": 8820 }, { "epoch": 23.297457906899968, "grad_norm": 356.37188720703125, "learning_rate": 0.00027830541543184193, "loss": 38.9602, "step": 8821 }, { "epoch": 23.300099042588315, "grad_norm": 319.3775939941406, "learning_rate": 0.0002782639071719513, "loss": 38.9154, "step": 8822 }, { "epoch": 23.302740178276657, "grad_norm": 343.95343017578125, "learning_rate": 0.0002782223981228075, "loss": 36.9409, "step": 8823 }, { "epoch": 23.305381313965004, "grad_norm": 463.9439392089844, "learning_rate": 0.00027818088828556993, "loss": 36.5002, "step": 8824 }, { "epoch": 23.30802244965335, "grad_norm": 646.9937133789062, "learning_rate": 0.00027813937766139765, "loss": 36.9127, "step": 8825 }, { "epoch": 23.310663585341697, "grad_norm": 289.3427429199219, "learning_rate": 0.00027809786625144974, "loss": 35.4857, "step": 8826 }, { "epoch": 23.313304721030043, "grad_norm": 242.9993896484375, "learning_rate": 0.00027805635405688537, "loss": 35.2325, "step": 8827 }, { "epoch": 23.31594585671839, "grad_norm": 623.7344970703125, "learning_rate": 0.0002780148410788637, "loss": 35.2953, "step": 8828 }, { "epoch": 23.318586992406736, "grad_norm": 211.16830444335938, "learning_rate": 0.0002779733273185442, "loss": 34.6309, "step": 8829 }, { "epoch": 23.321228128095083, "grad_norm": 400.5089416503906, "learning_rate": 0.00027793181277708596, "loss": 35.7339, "step": 8830 }, { "epoch": 23.323869263783426, "grad_norm": 555.0662231445312, "learning_rate": 0.00027789029745564826, "loss": 36.2143, "step": 8831 }, { "epoch": 23.326510399471772, "grad_norm": 1965.9407958984375, "learning_rate": 0.0002778487813553903, "loss": 31.2599, "step": 8832 }, { "epoch": 23.32915153516012, "grad_norm": 2908.4033203125, "learning_rate": 0.0002778072644774716, "loss": 31.0778, "step": 8833 }, { "epoch": 23.331792670848465, "grad_norm": 1502.07421875, "learning_rate": 0.00027776574682305134, "loss": 25.446, "step": 8834 }, { "epoch": 23.33443380653681, "grad_norm": 1980.0582275390625, "learning_rate": 0.0002777242283932888, "loss": 32.6602, "step": 8835 }, { "epoch": 23.337074942225158, "grad_norm": 985.6865234375, "learning_rate": 0.00027768270918934355, "loss": 28.9311, "step": 8836 }, { "epoch": 23.339716077913504, "grad_norm": 580.5571899414062, "learning_rate": 0.0002776411892123749, "loss": 25.4693, "step": 8837 }, { "epoch": 23.342357213601847, "grad_norm": 2492.5810546875, "learning_rate": 0.00027759966846354234, "loss": 18.4252, "step": 8838 }, { "epoch": 23.344998349290194, "grad_norm": 10797.392578125, "learning_rate": 0.0002775581469440052, "loss": 16.7057, "step": 8839 }, { "epoch": 23.34763948497854, "grad_norm": 3129.69970703125, "learning_rate": 0.000277516624654923, "loss": 15.1038, "step": 8840 }, { "epoch": 23.350280620666886, "grad_norm": 2557.75537109375, "learning_rate": 0.00027747510159745523, "loss": 24.2123, "step": 8841 }, { "epoch": 23.352921756355233, "grad_norm": 1144.622314453125, "learning_rate": 0.0002774335777727613, "loss": 38.3494, "step": 8842 }, { "epoch": 23.35556289204358, "grad_norm": 424.7168273925781, "learning_rate": 0.00027739205318200096, "loss": 36.3024, "step": 8843 }, { "epoch": 23.358204027731926, "grad_norm": 263.8346862792969, "learning_rate": 0.0002773505278263335, "loss": 38.5412, "step": 8844 }, { "epoch": 23.360845163420272, "grad_norm": 239.3277130126953, "learning_rate": 0.00027730900170691873, "loss": 37.004, "step": 8845 }, { "epoch": 23.363486299108615, "grad_norm": 497.49896240234375, "learning_rate": 0.000277267474824916, "loss": 37.0802, "step": 8846 }, { "epoch": 23.36612743479696, "grad_norm": 394.8294982910156, "learning_rate": 0.0002772259471814852, "loss": 36.907, "step": 8847 }, { "epoch": 23.368768570485308, "grad_norm": 442.1584167480469, "learning_rate": 0.00027718441877778576, "loss": 36.1563, "step": 8848 }, { "epoch": 23.371409706173655, "grad_norm": 442.0547790527344, "learning_rate": 0.00027714288961497736, "loss": 35.7874, "step": 8849 }, { "epoch": 23.374050841862, "grad_norm": 544.3693237304688, "learning_rate": 0.00027710135969421975, "loss": 36.4017, "step": 8850 }, { "epoch": 23.376691977550347, "grad_norm": 314.219970703125, "learning_rate": 0.00027705982901667255, "loss": 36.698, "step": 8851 }, { "epoch": 23.379333113238694, "grad_norm": 354.2569580078125, "learning_rate": 0.00027701829758349563, "loss": 37.2526, "step": 8852 }, { "epoch": 23.38197424892704, "grad_norm": 387.4010009765625, "learning_rate": 0.0002769767653958485, "loss": 36.6188, "step": 8853 }, { "epoch": 23.384615384615383, "grad_norm": 479.7021484375, "learning_rate": 0.00027693523245489113, "loss": 35.8369, "step": 8854 }, { "epoch": 23.38725652030373, "grad_norm": 1945.73095703125, "learning_rate": 0.0002768936987617831, "loss": 35.8308, "step": 8855 }, { "epoch": 23.389897655992076, "grad_norm": 691.4675903320312, "learning_rate": 0.00027685216431768447, "loss": 36.5633, "step": 8856 }, { "epoch": 23.392538791680423, "grad_norm": 976.5257568359375, "learning_rate": 0.00027681062912375483, "loss": 36.9545, "step": 8857 }, { "epoch": 23.39517992736877, "grad_norm": 1177.0665283203125, "learning_rate": 0.0002767690931811541, "loss": 39.1336, "step": 8858 }, { "epoch": 23.397821063057116, "grad_norm": 528.7267456054688, "learning_rate": 0.00027672755649104226, "loss": 41.8559, "step": 8859 }, { "epoch": 23.400462198745462, "grad_norm": 212.0080108642578, "learning_rate": 0.0002766860190545791, "loss": 41.5148, "step": 8860 }, { "epoch": 23.403103334433805, "grad_norm": 230.54417419433594, "learning_rate": 0.0002766444808729245, "loss": 42.4734, "step": 8861 }, { "epoch": 23.40574447012215, "grad_norm": 249.49398803710938, "learning_rate": 0.00027660294194723834, "loss": 42.7845, "step": 8862 }, { "epoch": 23.408385605810498, "grad_norm": 456.5840148925781, "learning_rate": 0.00027656140227868073, "loss": 43.7211, "step": 8863 }, { "epoch": 23.411026741498844, "grad_norm": 300.3740234375, "learning_rate": 0.0002765198618684116, "loss": 42.1491, "step": 8864 }, { "epoch": 23.41366787718719, "grad_norm": 1031.7935791015625, "learning_rate": 0.00027647832071759086, "loss": 45.9291, "step": 8865 }, { "epoch": 23.416309012875537, "grad_norm": 606.8561401367188, "learning_rate": 0.0002764367788273786, "loss": 42.2607, "step": 8866 }, { "epoch": 23.418950148563884, "grad_norm": 215.49143981933594, "learning_rate": 0.00027639523619893473, "loss": 42.1978, "step": 8867 }, { "epoch": 23.42159128425223, "grad_norm": 178.04476928710938, "learning_rate": 0.00027635369283341956, "loss": 39.2449, "step": 8868 }, { "epoch": 23.424232419940573, "grad_norm": 1250.2757568359375, "learning_rate": 0.0002763121487319929, "loss": 39.6556, "step": 8869 }, { "epoch": 23.42687355562892, "grad_norm": 286.2371520996094, "learning_rate": 0.00027627060389581493, "loss": 39.2142, "step": 8870 }, { "epoch": 23.429514691317266, "grad_norm": 500.5489501953125, "learning_rate": 0.0002762290583260458, "loss": 38.315, "step": 8871 }, { "epoch": 23.432155827005612, "grad_norm": 328.50665283203125, "learning_rate": 0.0002761875120238456, "loss": 37.7259, "step": 8872 }, { "epoch": 23.43479696269396, "grad_norm": 374.2314147949219, "learning_rate": 0.00027614596499037455, "loss": 36.1261, "step": 8873 }, { "epoch": 23.437438098382305, "grad_norm": 332.8160095214844, "learning_rate": 0.00027610441722679277, "loss": 36.2781, "step": 8874 }, { "epoch": 23.44007923407065, "grad_norm": 416.66229248046875, "learning_rate": 0.00027606286873426046, "loss": 36.9046, "step": 8875 }, { "epoch": 23.442720369758998, "grad_norm": 300.8540344238281, "learning_rate": 0.0002760213195139378, "loss": 34.7423, "step": 8876 }, { "epoch": 23.44536150544734, "grad_norm": 203.47560119628906, "learning_rate": 0.00027597976956698515, "loss": 35.9263, "step": 8877 }, { "epoch": 23.448002641135687, "grad_norm": 446.8605041503906, "learning_rate": 0.0002759382188945628, "loss": 36.3231, "step": 8878 }, { "epoch": 23.450643776824034, "grad_norm": 859.9263305664062, "learning_rate": 0.0002758966674978307, "loss": 35.5301, "step": 8879 }, { "epoch": 23.45328491251238, "grad_norm": 3807.735595703125, "learning_rate": 0.0002758551153779495, "loss": 45.0342, "step": 8880 }, { "epoch": 23.455926048200727, "grad_norm": 10210.6826171875, "learning_rate": 0.0002758135625360794, "loss": 38.337, "step": 8881 }, { "epoch": 23.458567183889073, "grad_norm": 16338.775390625, "learning_rate": 0.00027577200897338066, "loss": 34.9717, "step": 8882 }, { "epoch": 23.46120831957742, "grad_norm": 2378.178955078125, "learning_rate": 0.0002757304546910138, "loss": 34.1182, "step": 8883 }, { "epoch": 23.463849455265763, "grad_norm": 2808.296875, "learning_rate": 0.0002756888996901391, "loss": 34.1716, "step": 8884 }, { "epoch": 23.46649059095411, "grad_norm": 2887.466552734375, "learning_rate": 0.0002756473439719169, "loss": 40.0377, "step": 8885 }, { "epoch": 23.469131726642456, "grad_norm": 1719.3905029296875, "learning_rate": 0.00027560578753750765, "loss": 30.6879, "step": 8886 }, { "epoch": 23.471772862330802, "grad_norm": 9496.3330078125, "learning_rate": 0.000275564230388072, "loss": 30.6471, "step": 8887 }, { "epoch": 23.47441399801915, "grad_norm": 925.0084838867188, "learning_rate": 0.00027552267252477, "loss": 27.853, "step": 8888 }, { "epoch": 23.477055133707495, "grad_norm": 691.2125244140625, "learning_rate": 0.00027548111394876254, "loss": 18.9321, "step": 8889 }, { "epoch": 23.47969626939584, "grad_norm": 2241.09326171875, "learning_rate": 0.00027543955466120986, "loss": 20.5936, "step": 8890 }, { "epoch": 23.482337405084188, "grad_norm": 325.00726318359375, "learning_rate": 0.00027539799466327257, "loss": 37.4707, "step": 8891 }, { "epoch": 23.48497854077253, "grad_norm": 605.7012329101562, "learning_rate": 0.00027535643395611126, "loss": 35.4663, "step": 8892 }, { "epoch": 23.487619676460877, "grad_norm": 482.21917724609375, "learning_rate": 0.0002753148725408863, "loss": 36.844, "step": 8893 }, { "epoch": 23.490260812149224, "grad_norm": 373.535888671875, "learning_rate": 0.00027527331041875855, "loss": 35.8624, "step": 8894 }, { "epoch": 23.49290194783757, "grad_norm": 373.5015563964844, "learning_rate": 0.00027523174759088836, "loss": 35.9898, "step": 8895 }, { "epoch": 23.495543083525916, "grad_norm": 432.9563293457031, "learning_rate": 0.00027519018405843643, "loss": 35.1627, "step": 8896 }, { "epoch": 23.498184219214263, "grad_norm": 291.53082275390625, "learning_rate": 0.0002751486198225635, "loss": 36.4465, "step": 8897 }, { "epoch": 23.50082535490261, "grad_norm": 348.0343322753906, "learning_rate": 0.00027510705488443, "loss": 34.4248, "step": 8898 }, { "epoch": 23.503466490590956, "grad_norm": 290.7660827636719, "learning_rate": 0.0002750654892451969, "loss": 36.8665, "step": 8899 }, { "epoch": 23.5061076262793, "grad_norm": 451.95941162109375, "learning_rate": 0.0002750239229060246, "loss": 36.3397, "step": 8900 }, { "epoch": 23.508748761967645, "grad_norm": 350.77130126953125, "learning_rate": 0.000274982355868074, "loss": 35.5311, "step": 8901 }, { "epoch": 23.51138989765599, "grad_norm": 303.4825134277344, "learning_rate": 0.00027494078813250576, "loss": 36.1708, "step": 8902 }, { "epoch": 23.514031033344338, "grad_norm": 668.6532592773438, "learning_rate": 0.00027489921970048073, "loss": 35.5975, "step": 8903 }, { "epoch": 23.516672169032685, "grad_norm": 524.448974609375, "learning_rate": 0.00027485765057315957, "loss": 36.4264, "step": 8904 }, { "epoch": 23.51931330472103, "grad_norm": 1183.249755859375, "learning_rate": 0.0002748160807517031, "loss": 34.2641, "step": 8905 }, { "epoch": 23.521954440409377, "grad_norm": 485.5596618652344, "learning_rate": 0.00027477451023727223, "loss": 35.5471, "step": 8906 }, { "epoch": 23.52459557609772, "grad_norm": 5016.490234375, "learning_rate": 0.00027473293903102766, "loss": 37.2864, "step": 8907 }, { "epoch": 23.527236711786067, "grad_norm": 883.9940185546875, "learning_rate": 0.0002746913671341303, "loss": 39.6975, "step": 8908 }, { "epoch": 23.529877847474413, "grad_norm": 2192.86328125, "learning_rate": 0.00027464979454774104, "loss": 41.0203, "step": 8909 }, { "epoch": 23.53251898316276, "grad_norm": 558.3253173828125, "learning_rate": 0.0002746082212730207, "loss": 39.9211, "step": 8910 }, { "epoch": 23.535160118851106, "grad_norm": 252.6592254638672, "learning_rate": 0.0002745666473111303, "loss": 43.2849, "step": 8911 }, { "epoch": 23.537801254539453, "grad_norm": 1114.7186279296875, "learning_rate": 0.00027452507266323065, "loss": 42.5235, "step": 8912 }, { "epoch": 23.5404423902278, "grad_norm": 375.46649169921875, "learning_rate": 0.0002744834973304828, "loss": 44.3303, "step": 8913 }, { "epoch": 23.543083525916146, "grad_norm": 666.9158935546875, "learning_rate": 0.0002744419213140476, "loss": 44.3321, "step": 8914 }, { "epoch": 23.54572466160449, "grad_norm": 374.6075134277344, "learning_rate": 0.0002744003446150862, "loss": 43.5963, "step": 8915 }, { "epoch": 23.548365797292835, "grad_norm": 1149.3759765625, "learning_rate": 0.0002743587672347594, "loss": 43.0388, "step": 8916 }, { "epoch": 23.55100693298118, "grad_norm": 457.5811767578125, "learning_rate": 0.00027431718917422836, "loss": 42.4551, "step": 8917 }, { "epoch": 23.553648068669528, "grad_norm": 291.91790771484375, "learning_rate": 0.0002742756104346542, "loss": 40.105, "step": 8918 }, { "epoch": 23.556289204357874, "grad_norm": 411.595703125, "learning_rate": 0.00027423403101719775, "loss": 40.7074, "step": 8919 }, { "epoch": 23.55893034004622, "grad_norm": 408.62835693359375, "learning_rate": 0.0002741924509230203, "loss": 38.4114, "step": 8920 }, { "epoch": 23.561571475734567, "grad_norm": 1038.65771484375, "learning_rate": 0.00027415087015328284, "loss": 37.3503, "step": 8921 }, { "epoch": 23.564212611422914, "grad_norm": 287.2536315917969, "learning_rate": 0.00027410928870914657, "loss": 38.9669, "step": 8922 }, { "epoch": 23.566853747111256, "grad_norm": 316.9952697753906, "learning_rate": 0.00027406770659177247, "loss": 37.5789, "step": 8923 }, { "epoch": 23.569494882799603, "grad_norm": 314.38104248046875, "learning_rate": 0.0002740261238023218, "loss": 35.9868, "step": 8924 }, { "epoch": 23.57213601848795, "grad_norm": 336.5721740722656, "learning_rate": 0.00027398454034195587, "loss": 35.5272, "step": 8925 }, { "epoch": 23.574777154176296, "grad_norm": 602.8035278320312, "learning_rate": 0.0002739429562118357, "loss": 36.2841, "step": 8926 }, { "epoch": 23.577418289864642, "grad_norm": 647.8701782226562, "learning_rate": 0.0002739013714131225, "loss": 35.4324, "step": 8927 }, { "epoch": 23.58005942555299, "grad_norm": 1493.4649658203125, "learning_rate": 0.0002738597859469775, "loss": 36.0128, "step": 8928 }, { "epoch": 23.582700561241335, "grad_norm": 185.3161163330078, "learning_rate": 0.0002738181998145621, "loss": 35.1951, "step": 8929 }, { "epoch": 23.585341696929678, "grad_norm": 1486.1551513671875, "learning_rate": 0.0002737766130170374, "loss": 42.7863, "step": 8930 }, { "epoch": 23.587982832618025, "grad_norm": 3167.963134765625, "learning_rate": 0.0002737350255555647, "loss": 37.5877, "step": 8931 }, { "epoch": 23.59062396830637, "grad_norm": 2302.3671875, "learning_rate": 0.00027369343743130546, "loss": 23.3993, "step": 8932 }, { "epoch": 23.593265103994717, "grad_norm": 2089.654541015625, "learning_rate": 0.0002736518486454208, "loss": 21.7836, "step": 8933 }, { "epoch": 23.595906239683064, "grad_norm": 1977.4776611328125, "learning_rate": 0.00027361025919907223, "loss": 30.09, "step": 8934 }, { "epoch": 23.59854737537141, "grad_norm": 3234.682861328125, "learning_rate": 0.00027356866909342104, "loss": 17.4859, "step": 8935 }, { "epoch": 23.601188511059757, "grad_norm": 6527.98828125, "learning_rate": 0.0002735270783296286, "loss": 16.9367, "step": 8936 }, { "epoch": 23.603829646748103, "grad_norm": 540.8558349609375, "learning_rate": 0.00027348548690885635, "loss": 18.8325, "step": 8937 }, { "epoch": 23.606470782436446, "grad_norm": 3026.78759765625, "learning_rate": 0.0002734438948322656, "loss": 16.255, "step": 8938 }, { "epoch": 23.609111918124793, "grad_norm": 960.40185546875, "learning_rate": 0.00027340230210101793, "loss": 14.8628, "step": 8939 }, { "epoch": 23.61175305381314, "grad_norm": 544.5585327148438, "learning_rate": 0.0002733607087162747, "loss": 14.4746, "step": 8940 }, { "epoch": 23.614394189501485, "grad_norm": 2059.172119140625, "learning_rate": 0.00027331911467919735, "loss": 36.4192, "step": 8941 }, { "epoch": 23.617035325189832, "grad_norm": 593.18017578125, "learning_rate": 0.00027327751999094743, "loss": 35.128, "step": 8942 }, { "epoch": 23.61967646087818, "grad_norm": 975.3379516601562, "learning_rate": 0.0002732359246526865, "loss": 37.0111, "step": 8943 }, { "epoch": 23.622317596566525, "grad_norm": 294.7489013671875, "learning_rate": 0.00027319432866557597, "loss": 35.2928, "step": 8944 }, { "epoch": 23.62495873225487, "grad_norm": 954.753173828125, "learning_rate": 0.0002731527320307774, "loss": 35.4084, "step": 8945 }, { "epoch": 23.627599867943214, "grad_norm": 242.6295928955078, "learning_rate": 0.00027311113474945246, "loss": 34.892, "step": 8946 }, { "epoch": 23.63024100363156, "grad_norm": 351.5466003417969, "learning_rate": 0.00027306953682276264, "loss": 36.2913, "step": 8947 }, { "epoch": 23.632882139319907, "grad_norm": 404.31488037109375, "learning_rate": 0.0002730279382518696, "loss": 36.3275, "step": 8948 }, { "epoch": 23.635523275008254, "grad_norm": 331.0942077636719, "learning_rate": 0.00027298633903793484, "loss": 34.89, "step": 8949 }, { "epoch": 23.6381644106966, "grad_norm": 813.3432006835938, "learning_rate": 0.0002729447391821201, "loss": 35.1886, "step": 8950 }, { "epoch": 23.640805546384946, "grad_norm": 401.2343444824219, "learning_rate": 0.00027290313868558695, "loss": 35.5648, "step": 8951 }, { "epoch": 23.643446682073293, "grad_norm": 1718.374267578125, "learning_rate": 0.0002728615375494972, "loss": 35.1783, "step": 8952 }, { "epoch": 23.646087817761636, "grad_norm": 270.6428527832031, "learning_rate": 0.00027281993577501243, "loss": 35.0677, "step": 8953 }, { "epoch": 23.648728953449982, "grad_norm": 607.9004516601562, "learning_rate": 0.00027277833336329427, "loss": 34.7595, "step": 8954 }, { "epoch": 23.65137008913833, "grad_norm": 512.6288452148438, "learning_rate": 0.0002727367303155046, "loss": 37.0576, "step": 8955 }, { "epoch": 23.654011224826675, "grad_norm": 426.1629638671875, "learning_rate": 0.00027269512663280504, "loss": 36.3767, "step": 8956 }, { "epoch": 23.65665236051502, "grad_norm": 436.4564208984375, "learning_rate": 0.00027265352231635746, "loss": 36.601, "step": 8957 }, { "epoch": 23.659293496203368, "grad_norm": 845.9805908203125, "learning_rate": 0.00027261191736732357, "loss": 38.6202, "step": 8958 }, { "epoch": 23.661934631891715, "grad_norm": 3510.865478515625, "learning_rate": 0.00027257031178686507, "loss": 41.0284, "step": 8959 }, { "epoch": 23.66457576758006, "grad_norm": 379.9098205566406, "learning_rate": 0.000272528705576144, "loss": 41.0173, "step": 8960 }, { "epoch": 23.667216903268404, "grad_norm": 1262.1053466796875, "learning_rate": 0.00027248709873632204, "loss": 42.3088, "step": 8961 }, { "epoch": 23.66985803895675, "grad_norm": 3782.137939453125, "learning_rate": 0.00027244549126856105, "loss": 42.2142, "step": 8962 }, { "epoch": 23.672499174645097, "grad_norm": 451.5382995605469, "learning_rate": 0.00027240388317402285, "loss": 46.3031, "step": 8963 }, { "epoch": 23.675140310333443, "grad_norm": 345.4093017578125, "learning_rate": 0.0002723622744538694, "loss": 44.6163, "step": 8964 }, { "epoch": 23.67778144602179, "grad_norm": 477.4657897949219, "learning_rate": 0.0002723206651092626, "loss": 44.5387, "step": 8965 }, { "epoch": 23.680422581710136, "grad_norm": 670.272216796875, "learning_rate": 0.0002722790551413643, "loss": 44.4965, "step": 8966 }, { "epoch": 23.683063717398483, "grad_norm": 332.250732421875, "learning_rate": 0.00027223744455133654, "loss": 42.4534, "step": 8967 }, { "epoch": 23.68570485308683, "grad_norm": 293.78424072265625, "learning_rate": 0.0002721958333403411, "loss": 41.9795, "step": 8968 }, { "epoch": 23.688345988775172, "grad_norm": 388.9925842285156, "learning_rate": 0.00027215422150954013, "loss": 39.0422, "step": 8969 }, { "epoch": 23.69098712446352, "grad_norm": 299.0621643066406, "learning_rate": 0.00027211260906009557, "loss": 38.7328, "step": 8970 }, { "epoch": 23.693628260151865, "grad_norm": 497.5458984375, "learning_rate": 0.00027207099599316925, "loss": 38.9185, "step": 8971 }, { "epoch": 23.69626939584021, "grad_norm": 406.8272399902344, "learning_rate": 0.0002720293823099235, "loss": 37.506, "step": 8972 }, { "epoch": 23.698910531528558, "grad_norm": 2780.9765625, "learning_rate": 0.00027198776801152, "loss": 35.4331, "step": 8973 }, { "epoch": 23.701551667216904, "grad_norm": 288.5928955078125, "learning_rate": 0.0002719461530991212, "loss": 38.1973, "step": 8974 }, { "epoch": 23.70419280290525, "grad_norm": 1714.42529296875, "learning_rate": 0.00027190453757388884, "loss": 36.5882, "step": 8975 }, { "epoch": 23.706833938593594, "grad_norm": 711.29443359375, "learning_rate": 0.0002718629214369852, "loss": 35.1134, "step": 8976 }, { "epoch": 23.70947507428194, "grad_norm": 1063.212646484375, "learning_rate": 0.00027182130468957224, "loss": 36.0155, "step": 8977 }, { "epoch": 23.712116209970286, "grad_norm": 365.392578125, "learning_rate": 0.00027177968733281224, "loss": 35.3854, "step": 8978 }, { "epoch": 23.714757345658633, "grad_norm": 337.2193603515625, "learning_rate": 0.0002717380693678673, "loss": 36.2972, "step": 8979 }, { "epoch": 23.71739848134698, "grad_norm": 2138.518310546875, "learning_rate": 0.0002716964507958994, "loss": 41.4563, "step": 8980 }, { "epoch": 23.720039617035326, "grad_norm": 1850.955322265625, "learning_rate": 0.00027165483161807093, "loss": 42.7922, "step": 8981 }, { "epoch": 23.722680752723672, "grad_norm": 7515.19970703125, "learning_rate": 0.000271613211835544, "loss": 27.9352, "step": 8982 }, { "epoch": 23.72532188841202, "grad_norm": 1348.14697265625, "learning_rate": 0.0002715715914494809, "loss": 22.4017, "step": 8983 }, { "epoch": 23.72796302410036, "grad_norm": 2484.291259765625, "learning_rate": 0.00027152997046104373, "loss": 25.9258, "step": 8984 }, { "epoch": 23.730604159788708, "grad_norm": 2564.0126953125, "learning_rate": 0.00027148834887139473, "loss": 28.7383, "step": 8985 }, { "epoch": 23.733245295477055, "grad_norm": 6103.75048828125, "learning_rate": 0.0002714467266816962, "loss": 24.6875, "step": 8986 }, { "epoch": 23.7358864311654, "grad_norm": 3512.43408203125, "learning_rate": 0.0002714051038931106, "loss": 22.5357, "step": 8987 }, { "epoch": 23.738527566853747, "grad_norm": 2154.78515625, "learning_rate": 0.0002713634805067999, "loss": 19.8178, "step": 8988 }, { "epoch": 23.741168702542094, "grad_norm": 1338.91748046875, "learning_rate": 0.00027132185652392653, "loss": 17.4222, "step": 8989 }, { "epoch": 23.74380983823044, "grad_norm": 398.2796630859375, "learning_rate": 0.00027128023194565294, "loss": 37.1787, "step": 8990 }, { "epoch": 23.746450973918787, "grad_norm": 243.8106689453125, "learning_rate": 0.00027123860677314126, "loss": 36.5862, "step": 8991 }, { "epoch": 23.74909210960713, "grad_norm": 747.3992309570312, "learning_rate": 0.0002711969810075541, "loss": 36.7144, "step": 8992 }, { "epoch": 23.751733245295476, "grad_norm": 474.1430358886719, "learning_rate": 0.0002711553546500536, "loss": 35.4683, "step": 8993 }, { "epoch": 23.754374380983823, "grad_norm": 318.5148010253906, "learning_rate": 0.00027111372770180226, "loss": 35.8114, "step": 8994 }, { "epoch": 23.75701551667217, "grad_norm": 1976.9564208984375, "learning_rate": 0.0002710721001639625, "loss": 35.9814, "step": 8995 }, { "epoch": 23.759656652360515, "grad_norm": 427.85443115234375, "learning_rate": 0.00027103047203769676, "loss": 36.0388, "step": 8996 }, { "epoch": 23.762297788048862, "grad_norm": 275.9825134277344, "learning_rate": 0.0002709888433241674, "loss": 36.2671, "step": 8997 }, { "epoch": 23.76493892373721, "grad_norm": 613.658203125, "learning_rate": 0.00027094721402453687, "loss": 35.9312, "step": 8998 }, { "epoch": 23.76758005942555, "grad_norm": 475.1626892089844, "learning_rate": 0.00027090558413996773, "loss": 34.821, "step": 8999 }, { "epoch": 23.770221195113898, "grad_norm": 423.4107360839844, "learning_rate": 0.00027086395367162245, "loss": 35.1088, "step": 9000 }, { "epoch": 23.770221195113898, "eval_loss": 3.8705921173095703, "eval_runtime": 2.1937, "eval_samples_per_second": 225.641, "eval_steps_per_second": 28.262, "step": 9000 }, { "epoch": 23.772862330802244, "grad_norm": 215.06813049316406, "learning_rate": 0.0002708223226206635, "loss": 34.9846, "step": 9001 }, { "epoch": 23.77550346649059, "grad_norm": 469.7164611816406, "learning_rate": 0.00027078069098825347, "loss": 34.6846, "step": 9002 }, { "epoch": 23.778144602178937, "grad_norm": 1122.773681640625, "learning_rate": 0.00027073905877555483, "loss": 37.042, "step": 9003 }, { "epoch": 23.780785737867284, "grad_norm": 268.192138671875, "learning_rate": 0.00027069742598373014, "loss": 34.3863, "step": 9004 }, { "epoch": 23.78342687355563, "grad_norm": 495.34918212890625, "learning_rate": 0.000270655792613942, "loss": 35.7889, "step": 9005 }, { "epoch": 23.786068009243976, "grad_norm": 1021.471923828125, "learning_rate": 0.00027061415866735293, "loss": 36.1006, "step": 9006 }, { "epoch": 23.78870914493232, "grad_norm": 675.207763671875, "learning_rate": 0.0002705725241451257, "loss": 37.4669, "step": 9007 }, { "epoch": 23.791350280620666, "grad_norm": 1582.563720703125, "learning_rate": 0.0002705308890484227, "loss": 39.0443, "step": 9008 }, { "epoch": 23.793991416309012, "grad_norm": 553.9302978515625, "learning_rate": 0.0002704892533784068, "loss": 41.4504, "step": 9009 }, { "epoch": 23.79663255199736, "grad_norm": 129.92092895507812, "learning_rate": 0.00027044761713624043, "loss": 40.3396, "step": 9010 }, { "epoch": 23.799273687685705, "grad_norm": 823.80126953125, "learning_rate": 0.00027040598032308636, "loss": 39.3836, "step": 9011 }, { "epoch": 23.80191482337405, "grad_norm": 386.8561706542969, "learning_rate": 0.00027036434294010734, "loss": 40.3781, "step": 9012 }, { "epoch": 23.804555959062398, "grad_norm": 563.0883178710938, "learning_rate": 0.0002703227049884661, "loss": 42.6314, "step": 9013 }, { "epoch": 23.807197094750745, "grad_norm": 239.83084106445312, "learning_rate": 0.0002702810664693251, "loss": 42.0644, "step": 9014 }, { "epoch": 23.809838230439087, "grad_norm": 188.97109985351562, "learning_rate": 0.00027023942738384725, "loss": 40.7559, "step": 9015 }, { "epoch": 23.812479366127434, "grad_norm": 402.2982482910156, "learning_rate": 0.00027019778773319537, "loss": 41.3959, "step": 9016 }, { "epoch": 23.81512050181578, "grad_norm": 149.73956298828125, "learning_rate": 0.000270156147518532, "loss": 38.6231, "step": 9017 }, { "epoch": 23.817761637504127, "grad_norm": 354.1707763671875, "learning_rate": 0.0002701145067410202, "loss": 38.0915, "step": 9018 }, { "epoch": 23.820402773192473, "grad_norm": 357.3100891113281, "learning_rate": 0.0002700728654018225, "loss": 38.4189, "step": 9019 }, { "epoch": 23.82304390888082, "grad_norm": 479.9960021972656, "learning_rate": 0.00027003122350210187, "loss": 37.7707, "step": 9020 }, { "epoch": 23.825685044569166, "grad_norm": 275.2333679199219, "learning_rate": 0.0002699895810430211, "loss": 35.2871, "step": 9021 }, { "epoch": 23.82832618025751, "grad_norm": 281.9359130859375, "learning_rate": 0.000269947938025743, "loss": 36.4494, "step": 9022 }, { "epoch": 23.830967315945855, "grad_norm": 281.2836608886719, "learning_rate": 0.0002699062944514305, "loss": 36.6316, "step": 9023 }, { "epoch": 23.833608451634202, "grad_norm": 240.48745727539062, "learning_rate": 0.0002698646503212464, "loss": 36.026, "step": 9024 }, { "epoch": 23.83624958732255, "grad_norm": 646.9261474609375, "learning_rate": 0.0002698230056363535, "loss": 36.4118, "step": 9025 }, { "epoch": 23.838890723010895, "grad_norm": 234.64877319335938, "learning_rate": 0.0002697813603979149, "loss": 35.0456, "step": 9026 }, { "epoch": 23.84153185869924, "grad_norm": 414.4435119628906, "learning_rate": 0.00026973971460709346, "loss": 35.1824, "step": 9027 }, { "epoch": 23.844172994387588, "grad_norm": 395.3463134765625, "learning_rate": 0.00026969806826505205, "loss": 35.6964, "step": 9028 }, { "epoch": 23.846814130075934, "grad_norm": 205.40025329589844, "learning_rate": 0.00026965642137295363, "loss": 37.136, "step": 9029 }, { "epoch": 23.849455265764277, "grad_norm": 1162.447021484375, "learning_rate": 0.00026961477393196127, "loss": 39.6584, "step": 9030 }, { "epoch": 23.852096401452624, "grad_norm": 8905.4970703125, "learning_rate": 0.00026957312594323783, "loss": 36.6668, "step": 9031 }, { "epoch": 23.85473753714097, "grad_norm": 2269.09814453125, "learning_rate": 0.00026953147740794634, "loss": 22.5948, "step": 9032 }, { "epoch": 23.857378672829316, "grad_norm": 2592.1083984375, "learning_rate": 0.00026948982832724976, "loss": 27.6361, "step": 9033 }, { "epoch": 23.860019808517663, "grad_norm": 7311.181640625, "learning_rate": 0.0002694481787023112, "loss": 21.2887, "step": 9034 }, { "epoch": 23.86266094420601, "grad_norm": 3072.137939453125, "learning_rate": 0.0002694065285342938, "loss": 23.7037, "step": 9035 }, { "epoch": 23.865302079894356, "grad_norm": 1061.2137451171875, "learning_rate": 0.00026936487782436037, "loss": 18.5363, "step": 9036 }, { "epoch": 23.867943215582702, "grad_norm": 1614.171142578125, "learning_rate": 0.0002693232265736741, "loss": 19.7994, "step": 9037 }, { "epoch": 23.870584351271045, "grad_norm": 1723.44482421875, "learning_rate": 0.0002692815747833981, "loss": 19.4558, "step": 9038 }, { "epoch": 23.87322548695939, "grad_norm": 1344.0228271484375, "learning_rate": 0.0002692399224546955, "loss": 13.9765, "step": 9039 }, { "epoch": 23.875866622647738, "grad_norm": 668.7467651367188, "learning_rate": 0.0002691982695887293, "loss": 16.8495, "step": 9040 }, { "epoch": 23.878507758336085, "grad_norm": 6121.9765625, "learning_rate": 0.0002691566161866627, "loss": 29.4343, "step": 9041 }, { "epoch": 23.88114889402443, "grad_norm": 363.2242431640625, "learning_rate": 0.00026911496224965884, "loss": 35.9428, "step": 9042 }, { "epoch": 23.883790029712777, "grad_norm": 400.92108154296875, "learning_rate": 0.00026907330777888093, "loss": 35.2076, "step": 9043 }, { "epoch": 23.886431165401124, "grad_norm": 393.0230712890625, "learning_rate": 0.0002690316527754921, "loss": 34.9321, "step": 9044 }, { "epoch": 23.889072301089467, "grad_norm": 495.1220397949219, "learning_rate": 0.00026898999724065553, "loss": 36.1786, "step": 9045 }, { "epoch": 23.891713436777813, "grad_norm": 500.2127380371094, "learning_rate": 0.00026894834117553447, "loss": 36.5231, "step": 9046 }, { "epoch": 23.89435457246616, "grad_norm": 4409.1279296875, "learning_rate": 0.00026890668458129206, "loss": 36.1764, "step": 9047 }, { "epoch": 23.896995708154506, "grad_norm": 239.68634033203125, "learning_rate": 0.00026886502745909166, "loss": 35.0925, "step": 9048 }, { "epoch": 23.899636843842853, "grad_norm": 1169.9998779296875, "learning_rate": 0.0002688233698100964, "loss": 35.1251, "step": 9049 }, { "epoch": 23.9022779795312, "grad_norm": 842.678466796875, "learning_rate": 0.00026878171163546963, "loss": 36.8519, "step": 9050 }, { "epoch": 23.904919115219545, "grad_norm": 436.8217468261719, "learning_rate": 0.00026874005293637453, "loss": 37.3709, "step": 9051 }, { "epoch": 23.907560250907892, "grad_norm": 447.1668701171875, "learning_rate": 0.00026869839371397456, "loss": 37.4528, "step": 9052 }, { "epoch": 23.910201386596235, "grad_norm": 297.359619140625, "learning_rate": 0.00026865673396943285, "loss": 36.2076, "step": 9053 }, { "epoch": 23.91284252228458, "grad_norm": 461.81988525390625, "learning_rate": 0.0002686150737039128, "loss": 35.55, "step": 9054 }, { "epoch": 23.915483657972928, "grad_norm": 1222.2022705078125, "learning_rate": 0.0002685734129185778, "loss": 35.9908, "step": 9055 }, { "epoch": 23.918124793661274, "grad_norm": 297.8985900878906, "learning_rate": 0.00026853175161459117, "loss": 36.4913, "step": 9056 }, { "epoch": 23.92076592934962, "grad_norm": 647.5958862304688, "learning_rate": 0.00026849008979311617, "loss": 37.951, "step": 9057 }, { "epoch": 23.923407065037967, "grad_norm": 1041.9620361328125, "learning_rate": 0.0002684484274553164, "loss": 39.6165, "step": 9058 }, { "epoch": 23.926048200726314, "grad_norm": 343.3116149902344, "learning_rate": 0.000268406764602355, "loss": 40.0994, "step": 9059 }, { "epoch": 23.92868933641466, "grad_norm": 641.8831787109375, "learning_rate": 0.0002683651012353955, "loss": 41.456, "step": 9060 }, { "epoch": 23.931330472103003, "grad_norm": 634.6017456054688, "learning_rate": 0.00026832343735560144, "loss": 42.9477, "step": 9061 }, { "epoch": 23.93397160779135, "grad_norm": 292.526123046875, "learning_rate": 0.00026828177296413615, "loss": 46.8527, "step": 9062 }, { "epoch": 23.936612743479696, "grad_norm": 899.9619140625, "learning_rate": 0.0002682401080621631, "loss": 44.8123, "step": 9063 }, { "epoch": 23.939253879168042, "grad_norm": 495.2588806152344, "learning_rate": 0.00026819844265084565, "loss": 41.7185, "step": 9064 }, { "epoch": 23.94189501485639, "grad_norm": 1174.8123779296875, "learning_rate": 0.00026815677673134745, "loss": 40.7981, "step": 9065 }, { "epoch": 23.944536150544735, "grad_norm": 395.5329895019531, "learning_rate": 0.0002681151103048319, "loss": 38.929, "step": 9066 }, { "epoch": 23.94717728623308, "grad_norm": 267.2776794433594, "learning_rate": 0.00026807344337246257, "loss": 37.4093, "step": 9067 }, { "epoch": 23.949818421921425, "grad_norm": 494.8185729980469, "learning_rate": 0.00026803177593540296, "loss": 37.5381, "step": 9068 }, { "epoch": 23.95245955760977, "grad_norm": 557.8453369140625, "learning_rate": 0.00026799010799481655, "loss": 36.4469, "step": 9069 }, { "epoch": 23.955100693298117, "grad_norm": 827.9326782226562, "learning_rate": 0.00026794843955186697, "loss": 35.0963, "step": 9070 }, { "epoch": 23.957741828986464, "grad_norm": 378.64447021484375, "learning_rate": 0.0002679067706077177, "loss": 30.2015, "step": 9071 }, { "epoch": 23.96038296467481, "grad_norm": 653.0863037109375, "learning_rate": 0.0002678651011635325, "loss": 15.6355, "step": 9072 }, { "epoch": 23.963024100363157, "grad_norm": 2691.0556640625, "learning_rate": 0.00026782343122047473, "loss": 12.19, "step": 9073 }, { "epoch": 23.965665236051503, "grad_norm": 457.31689453125, "learning_rate": 0.0002677817607797082, "loss": 13.2592, "step": 9074 }, { "epoch": 23.96830637173985, "grad_norm": 14493.5478515625, "learning_rate": 0.0002677400898423964, "loss": 12.5994, "step": 9075 }, { "epoch": 23.970947507428193, "grad_norm": 932.0010986328125, "learning_rate": 0.000267698418409703, "loss": 17.7218, "step": 9076 }, { "epoch": 23.97358864311654, "grad_norm": 4637.07666015625, "learning_rate": 0.0002676567464827917, "loss": 32.433, "step": 9077 }, { "epoch": 23.976229778804885, "grad_norm": 1000.4429931640625, "learning_rate": 0.0002676150740628261, "loss": 36.5006, "step": 9078 }, { "epoch": 23.978870914493232, "grad_norm": 382.55615234375, "learning_rate": 0.00026757340115096987, "loss": 35.9638, "step": 9079 }, { "epoch": 23.98151205018158, "grad_norm": 228.78582763671875, "learning_rate": 0.00026753172774838683, "loss": 34.6979, "step": 9080 }, { "epoch": 23.984153185869925, "grad_norm": 419.140380859375, "learning_rate": 0.0002674900538562405, "loss": 35.0853, "step": 9081 }, { "epoch": 23.98679432155827, "grad_norm": 309.2196960449219, "learning_rate": 0.00026744837947569473, "loss": 35.6384, "step": 9082 }, { "epoch": 23.989435457246618, "grad_norm": 517.5664672851562, "learning_rate": 0.0002674067046079133, "loss": 35.341, "step": 9083 }, { "epoch": 23.99207659293496, "grad_norm": 273.71746826171875, "learning_rate": 0.00026736502925405977, "loss": 35.5903, "step": 9084 }, { "epoch": 23.994717728623307, "grad_norm": 405.5719909667969, "learning_rate": 0.000267323353415298, "loss": 34.812, "step": 9085 }, { "epoch": 23.997358864311654, "grad_norm": 881.8012084960938, "learning_rate": 0.0002672816770927917, "loss": 35.9182, "step": 9086 }, { "epoch": 24.0, "grad_norm": 1021.076171875, "learning_rate": 0.0002672400002877048, "loss": 41.7443, "step": 9087 }, { "epoch": 24.002641135688346, "grad_norm": 277.76177978515625, "learning_rate": 0.000267198323001201, "loss": 39.8901, "step": 9088 }, { "epoch": 24.005282271376693, "grad_norm": 304.00408935546875, "learning_rate": 0.00026715664523444416, "loss": 41.7085, "step": 9089 }, { "epoch": 24.00792340706504, "grad_norm": 289.4375915527344, "learning_rate": 0.00026711496698859807, "loss": 39.8041, "step": 9090 }, { "epoch": 24.010564542753382, "grad_norm": 294.8310546875, "learning_rate": 0.00026707328826482656, "loss": 41.2141, "step": 9091 }, { "epoch": 24.01320567844173, "grad_norm": 462.09893798828125, "learning_rate": 0.00026703160906429356, "loss": 43.5048, "step": 9092 }, { "epoch": 24.015846814130075, "grad_norm": 243.62509155273438, "learning_rate": 0.0002669899293881628, "loss": 43.4973, "step": 9093 }, { "epoch": 24.01848794981842, "grad_norm": 254.517333984375, "learning_rate": 0.0002669482492375983, "loss": 40.3762, "step": 9094 }, { "epoch": 24.021129085506768, "grad_norm": 474.568359375, "learning_rate": 0.00026690656861376384, "loss": 41.7224, "step": 9095 }, { "epoch": 24.023770221195115, "grad_norm": 263.7966613769531, "learning_rate": 0.00026686488751782346, "loss": 39.9516, "step": 9096 }, { "epoch": 24.02641135688346, "grad_norm": 327.6964416503906, "learning_rate": 0.00026682320595094094, "loss": 38.7661, "step": 9097 }, { "epoch": 24.029052492571807, "grad_norm": 509.3265686035156, "learning_rate": 0.00026678152391428035, "loss": 39.3892, "step": 9098 }, { "epoch": 24.03169362826015, "grad_norm": 342.801025390625, "learning_rate": 0.00026673984140900545, "loss": 39.7556, "step": 9099 }, { "epoch": 24.034334763948497, "grad_norm": 285.1516418457031, "learning_rate": 0.0002666981584362804, "loss": 37.6038, "step": 9100 }, { "epoch": 24.036975899636843, "grad_norm": 547.1983642578125, "learning_rate": 0.0002666564749972691, "loss": 35.9027, "step": 9101 }, { "epoch": 24.03961703532519, "grad_norm": 204.28564453125, "learning_rate": 0.0002666147910931355, "loss": 35.369, "step": 9102 }, { "epoch": 24.042258171013536, "grad_norm": 341.0447692871094, "learning_rate": 0.00026657310672504355, "loss": 36.2876, "step": 9103 }, { "epoch": 24.044899306701883, "grad_norm": 536.6439819335938, "learning_rate": 0.00026653142189415733, "loss": 36.3197, "step": 9104 }, { "epoch": 24.04754044239023, "grad_norm": 246.2998504638672, "learning_rate": 0.0002664897366016409, "loss": 36.3632, "step": 9105 }, { "epoch": 24.050181578078575, "grad_norm": 428.4355773925781, "learning_rate": 0.00026644805084865837, "loss": 36.1479, "step": 9106 }, { "epoch": 24.05282271376692, "grad_norm": 311.1357421875, "learning_rate": 0.0002664063646363735, "loss": 36.7694, "step": 9107 }, { "epoch": 24.055463849455265, "grad_norm": 828.0021362304688, "learning_rate": 0.00026636467796595063, "loss": 36.8076, "step": 9108 }, { "epoch": 24.05810498514361, "grad_norm": 338.9824523925781, "learning_rate": 0.0002663229908385538, "loss": 35.5536, "step": 9109 }, { "epoch": 24.060746120831958, "grad_norm": 381.2406921386719, "learning_rate": 0.000266281303255347, "loss": 43.4109, "step": 9110 }, { "epoch": 24.063387256520304, "grad_norm": 8647.708984375, "learning_rate": 0.0002662396152174943, "loss": 24.5997, "step": 9111 }, { "epoch": 24.06602839220865, "grad_norm": 17425.84375, "learning_rate": 0.0002661979267261599, "loss": 23.7037, "step": 9112 }, { "epoch": 24.068669527896997, "grad_norm": 1174.535888671875, "learning_rate": 0.00026615623778250796, "loss": 29.7349, "step": 9113 }, { "epoch": 24.07131066358534, "grad_norm": 8371.048828125, "learning_rate": 0.00026611454838770256, "loss": 22.3136, "step": 9114 }, { "epoch": 24.073951799273686, "grad_norm": 1482.0718994140625, "learning_rate": 0.00026607285854290784, "loss": 31.5755, "step": 9115 }, { "epoch": 24.076592934962033, "grad_norm": 2298.026611328125, "learning_rate": 0.00026603116824928805, "loss": 20.0616, "step": 9116 }, { "epoch": 24.07923407065038, "grad_norm": 3049.43115234375, "learning_rate": 0.0002659894775080072, "loss": 18.3117, "step": 9117 }, { "epoch": 24.081875206338726, "grad_norm": 1094.015625, "learning_rate": 0.0002659477863202297, "loss": 13.4922, "step": 9118 }, { "epoch": 24.084516342027072, "grad_norm": 630.2213134765625, "learning_rate": 0.0002659060946871196, "loss": 16.0085, "step": 9119 }, { "epoch": 24.08715747771542, "grad_norm": 290.8477783203125, "learning_rate": 0.00026586440260984107, "loss": 35.2773, "step": 9120 }, { "epoch": 24.089798613403765, "grad_norm": 977.8167724609375, "learning_rate": 0.0002658227100895584, "loss": 36.7783, "step": 9121 }, { "epoch": 24.092439749092108, "grad_norm": 161.01588439941406, "learning_rate": 0.00026578101712743595, "loss": 35.9386, "step": 9122 }, { "epoch": 24.095080884780455, "grad_norm": 245.3394012451172, "learning_rate": 0.00026573932372463783, "loss": 37.575, "step": 9123 }, { "epoch": 24.0977220204688, "grad_norm": 198.37521362304688, "learning_rate": 0.0002656976298823284, "loss": 35.5817, "step": 9124 }, { "epoch": 24.100363156157147, "grad_norm": 891.949462890625, "learning_rate": 0.00026565593560167177, "loss": 36.2867, "step": 9125 }, { "epoch": 24.103004291845494, "grad_norm": 399.4884338378906, "learning_rate": 0.0002656142408838324, "loss": 34.9505, "step": 9126 }, { "epoch": 24.10564542753384, "grad_norm": 282.5657653808594, "learning_rate": 0.0002655725457299745, "loss": 36.6217, "step": 9127 }, { "epoch": 24.108286563222187, "grad_norm": 250.3912353515625, "learning_rate": 0.00026553085014126244, "loss": 36.3124, "step": 9128 }, { "epoch": 24.110927698910533, "grad_norm": 703.5540771484375, "learning_rate": 0.00026548915411886046, "loss": 35.7815, "step": 9129 }, { "epoch": 24.113568834598876, "grad_norm": 196.36000061035156, "learning_rate": 0.000265447457663933, "loss": 37.8441, "step": 9130 }, { "epoch": 24.116209970287223, "grad_norm": 1127.34130859375, "learning_rate": 0.00026540576077764436, "loss": 34.2867, "step": 9131 }, { "epoch": 24.11885110597557, "grad_norm": 322.78472900390625, "learning_rate": 0.0002653640634611589, "loss": 36.7905, "step": 9132 }, { "epoch": 24.121492241663915, "grad_norm": 275.4847717285156, "learning_rate": 0.00026532236571564095, "loss": 35.3524, "step": 9133 }, { "epoch": 24.124133377352262, "grad_norm": 1546.156005859375, "learning_rate": 0.00026528066754225493, "loss": 35.4419, "step": 9134 }, { "epoch": 24.12677451304061, "grad_norm": 414.954833984375, "learning_rate": 0.0002652389689421653, "loss": 37.7451, "step": 9135 }, { "epoch": 24.129415648728955, "grad_norm": 870.99169921875, "learning_rate": 0.00026519726991653644, "loss": 38.5093, "step": 9136 }, { "epoch": 24.132056784417298, "grad_norm": 358.1219482421875, "learning_rate": 0.0002651555704665327, "loss": 42.3216, "step": 9137 }, { "epoch": 24.134697920105644, "grad_norm": 181.6311798095703, "learning_rate": 0.0002651138705933185, "loss": 41.1868, "step": 9138 }, { "epoch": 24.13733905579399, "grad_norm": 180.4313507080078, "learning_rate": 0.00026507217029805836, "loss": 41.7003, "step": 9139 }, { "epoch": 24.139980191482337, "grad_norm": 379.87872314453125, "learning_rate": 0.0002650304695819168, "loss": 40.8302, "step": 9140 }, { "epoch": 24.142621327170684, "grad_norm": 326.07574462890625, "learning_rate": 0.00026498876844605816, "loss": 42.9276, "step": 9141 }, { "epoch": 24.14526246285903, "grad_norm": 249.67372131347656, "learning_rate": 0.0002649470668916469, "loss": 44.1223, "step": 9142 }, { "epoch": 24.147903598547376, "grad_norm": 171.1880340576172, "learning_rate": 0.00026490536491984766, "loss": 44.4697, "step": 9143 }, { "epoch": 24.150544734235723, "grad_norm": 218.72886657714844, "learning_rate": 0.0002648636625318249, "loss": 42.4922, "step": 9144 }, { "epoch": 24.153185869924066, "grad_norm": 127.93699645996094, "learning_rate": 0.0002648219597287431, "loss": 40.585, "step": 9145 }, { "epoch": 24.155827005612412, "grad_norm": 183.78958129882812, "learning_rate": 0.00026478025651176663, "loss": 39.1437, "step": 9146 }, { "epoch": 24.15846814130076, "grad_norm": 229.3134765625, "learning_rate": 0.0002647385528820602, "loss": 38.8723, "step": 9147 }, { "epoch": 24.161109276989105, "grad_norm": 183.9147491455078, "learning_rate": 0.0002646968488407885, "loss": 39.0654, "step": 9148 }, { "epoch": 24.16375041267745, "grad_norm": 288.2423400878906, "learning_rate": 0.0002646551443891158, "loss": 38.8272, "step": 9149 }, { "epoch": 24.166391548365798, "grad_norm": 245.89317321777344, "learning_rate": 0.0002646134395282069, "loss": 37.4286, "step": 9150 }, { "epoch": 24.169032684054145, "grad_norm": 142.3569793701172, "learning_rate": 0.0002645717342592262, "loss": 36.1927, "step": 9151 }, { "epoch": 24.17167381974249, "grad_norm": 246.9098663330078, "learning_rate": 0.0002645300285833384, "loss": 36.3173, "step": 9152 }, { "epoch": 24.174314955430834, "grad_norm": 297.7392272949219, "learning_rate": 0.0002644883225017081, "loss": 35.0969, "step": 9153 }, { "epoch": 24.17695609111918, "grad_norm": 202.30979919433594, "learning_rate": 0.0002644466160154999, "loss": 36.074, "step": 9154 }, { "epoch": 24.179597226807527, "grad_norm": 466.9269104003906, "learning_rate": 0.00026440490912587846, "loss": 36.5078, "step": 9155 }, { "epoch": 24.182238362495873, "grad_norm": 256.6102294921875, "learning_rate": 0.0002643632018340083, "loss": 35.7636, "step": 9156 }, { "epoch": 24.18487949818422, "grad_norm": 244.5244140625, "learning_rate": 0.0002643214941410543, "loss": 35.1128, "step": 9157 }, { "epoch": 24.187520633872566, "grad_norm": 176.2557830810547, "learning_rate": 0.00026427978604818094, "loss": 37.9884, "step": 9158 }, { "epoch": 24.190161769560913, "grad_norm": 4242.4951171875, "learning_rate": 0.0002642380775565529, "loss": 61.5172, "step": 9159 }, { "epoch": 24.192802905249255, "grad_norm": 3403.9931640625, "learning_rate": 0.00026419636866733485, "loss": 44.7064, "step": 9160 }, { "epoch": 24.195444040937602, "grad_norm": 1358.1435546875, "learning_rate": 0.00026415465938169167, "loss": 48.8058, "step": 9161 }, { "epoch": 24.19808517662595, "grad_norm": 1904.452392578125, "learning_rate": 0.0002641129497007879, "loss": 45.5941, "step": 9162 }, { "epoch": 24.200726312314295, "grad_norm": 1802.537353515625, "learning_rate": 0.00026407123962578834, "loss": 41.6982, "step": 9163 }, { "epoch": 24.20336744800264, "grad_norm": 2558.245849609375, "learning_rate": 0.0002640295291578576, "loss": 34.4995, "step": 9164 }, { "epoch": 24.206008583690988, "grad_norm": 3514.2119140625, "learning_rate": 0.00026398781829816055, "loss": 24.5271, "step": 9165 }, { "epoch": 24.208649719379334, "grad_norm": 1264.050048828125, "learning_rate": 0.0002639461070478619, "loss": 23.6173, "step": 9166 }, { "epoch": 24.21129085506768, "grad_norm": 1701.0888671875, "learning_rate": 0.0002639043954081264, "loss": 18.4512, "step": 9167 }, { "epoch": 24.213931990756024, "grad_norm": 2114.42919921875, "learning_rate": 0.0002638626833801188, "loss": 21.6523, "step": 9168 }, { "epoch": 24.21657312644437, "grad_norm": 493.1608581542969, "learning_rate": 0.0002638209709650039, "loss": 32.3575, "step": 9169 }, { "epoch": 24.219214262132716, "grad_norm": 6927.36083984375, "learning_rate": 0.00026377925816394656, "loss": 36.612, "step": 9170 }, { "epoch": 24.221855397821063, "grad_norm": 557.5582275390625, "learning_rate": 0.00026373754497811147, "loss": 36.4273, "step": 9171 }, { "epoch": 24.22449653350941, "grad_norm": 347.2105407714844, "learning_rate": 0.0002636958314086635, "loss": 38.0467, "step": 9172 }, { "epoch": 24.227137669197756, "grad_norm": 440.9910583496094, "learning_rate": 0.00026365411745676756, "loss": 35.9881, "step": 9173 }, { "epoch": 24.229778804886102, "grad_norm": 266.40008544921875, "learning_rate": 0.0002636124031235883, "loss": 36.3786, "step": 9174 }, { "epoch": 24.23241994057445, "grad_norm": 374.87554931640625, "learning_rate": 0.0002635706884102908, "loss": 36.5044, "step": 9175 }, { "epoch": 24.23506107626279, "grad_norm": 845.7483520507812, "learning_rate": 0.00026352897331803973, "loss": 35.8305, "step": 9176 }, { "epoch": 24.237702211951138, "grad_norm": 1328.25048828125, "learning_rate": 0.00026348725784799997, "loss": 35.6554, "step": 9177 }, { "epoch": 24.240343347639485, "grad_norm": 225.64869689941406, "learning_rate": 0.00026344554200133656, "loss": 36.212, "step": 9178 }, { "epoch": 24.24298448332783, "grad_norm": 987.0000610351562, "learning_rate": 0.00026340382577921427, "loss": 35.9712, "step": 9179 }, { "epoch": 24.245625619016177, "grad_norm": 224.16322326660156, "learning_rate": 0.00026336210918279806, "loss": 36.1352, "step": 9180 }, { "epoch": 24.248266754704524, "grad_norm": 643.7598876953125, "learning_rate": 0.0002633203922132527, "loss": 34.8861, "step": 9181 }, { "epoch": 24.25090789039287, "grad_norm": 571.7896118164062, "learning_rate": 0.0002632786748717432, "loss": 35.617, "step": 9182 }, { "epoch": 24.253549026081213, "grad_norm": 591.9136352539062, "learning_rate": 0.0002632369571594346, "loss": 34.7317, "step": 9183 }, { "epoch": 24.25619016176956, "grad_norm": 331.3962707519531, "learning_rate": 0.0002631952390774917, "loss": 35.3867, "step": 9184 }, { "epoch": 24.258831297457906, "grad_norm": 765.8751831054688, "learning_rate": 0.00026315352062707955, "loss": 37.3872, "step": 9185 }, { "epoch": 24.261472433146253, "grad_norm": 6286.91650390625, "learning_rate": 0.000263111801809363, "loss": 36.985, "step": 9186 }, { "epoch": 24.2641135688346, "grad_norm": 916.1663208007812, "learning_rate": 0.00026307008262550714, "loss": 43.2077, "step": 9187 }, { "epoch": 24.266754704522945, "grad_norm": 360.06317138671875, "learning_rate": 0.00026302836307667685, "loss": 39.3208, "step": 9188 }, { "epoch": 24.269395840211292, "grad_norm": 305.3462829589844, "learning_rate": 0.00026298664316403716, "loss": 41.3303, "step": 9189 }, { "epoch": 24.27203697589964, "grad_norm": 955.2461547851562, "learning_rate": 0.00026294492288875314, "loss": 41.7614, "step": 9190 }, { "epoch": 24.27467811158798, "grad_norm": 498.46551513671875, "learning_rate": 0.0002629032022519897, "loss": 42.3273, "step": 9191 }, { "epoch": 24.277319247276328, "grad_norm": 353.84930419921875, "learning_rate": 0.000262861481254912, "loss": 43.3869, "step": 9192 }, { "epoch": 24.279960382964674, "grad_norm": 584.8117065429688, "learning_rate": 0.00026281975989868496, "loss": 42.5466, "step": 9193 }, { "epoch": 24.28260151865302, "grad_norm": 479.61920166015625, "learning_rate": 0.0002627780381844737, "loss": 42.4256, "step": 9194 }, { "epoch": 24.285242654341367, "grad_norm": 265.8860778808594, "learning_rate": 0.00026273631611344305, "loss": 39.6532, "step": 9195 }, { "epoch": 24.287883790029714, "grad_norm": 322.3811340332031, "learning_rate": 0.00026269459368675834, "loss": 42.4908, "step": 9196 }, { "epoch": 24.29052492571806, "grad_norm": 227.5099334716797, "learning_rate": 0.0002626528709055847, "loss": 39.9767, "step": 9197 }, { "epoch": 24.293166061406406, "grad_norm": 249.94020080566406, "learning_rate": 0.0002626111477710869, "loss": 40.2884, "step": 9198 }, { "epoch": 24.29580719709475, "grad_norm": 425.482177734375, "learning_rate": 0.00026256942428443033, "loss": 40.2819, "step": 9199 }, { "epoch": 24.298448332783096, "grad_norm": 321.4885559082031, "learning_rate": 0.0002625277004467798, "loss": 39.2135, "step": 9200 }, { "epoch": 24.298448332783096, "eval_loss": 4.003201484680176, "eval_runtime": 2.1463, "eval_samples_per_second": 230.634, "eval_steps_per_second": 28.887, "step": 9200 }, { "epoch": 24.301089468471442, "grad_norm": 490.5353088378906, "learning_rate": 0.00026248597625930074, "loss": 37.849, "step": 9201 }, { "epoch": 24.30373060415979, "grad_norm": 459.7245178222656, "learning_rate": 0.00026244425172315813, "loss": 36.9927, "step": 9202 }, { "epoch": 24.306371739848135, "grad_norm": 1113.9901123046875, "learning_rate": 0.00026240252683951707, "loss": 37.409, "step": 9203 }, { "epoch": 24.30901287553648, "grad_norm": 455.0721130371094, "learning_rate": 0.0002623608016095427, "loss": 35.7711, "step": 9204 }, { "epoch": 24.311654011224828, "grad_norm": 547.4840698242188, "learning_rate": 0.00026231907603440025, "loss": 35.3988, "step": 9205 }, { "epoch": 24.31429514691317, "grad_norm": 570.0902709960938, "learning_rate": 0.0002622773501152548, "loss": 35.7708, "step": 9206 }, { "epoch": 24.316936282601517, "grad_norm": 617.2108154296875, "learning_rate": 0.0002622356238532716, "loss": 36.8604, "step": 9207 }, { "epoch": 24.319577418289864, "grad_norm": 433.6865539550781, "learning_rate": 0.0002621938972496157, "loss": 42.6667, "step": 9208 }, { "epoch": 24.32221855397821, "grad_norm": 3773.8544921875, "learning_rate": 0.00026215217030545245, "loss": 40.0932, "step": 9209 }, { "epoch": 24.324859689666557, "grad_norm": 1943.7049560546875, "learning_rate": 0.00026211044302194706, "loss": 24.1641, "step": 9210 }, { "epoch": 24.327500825354903, "grad_norm": 1487.4718017578125, "learning_rate": 0.0002620687154002646, "loss": 19.5176, "step": 9211 }, { "epoch": 24.33014196104325, "grad_norm": 2693.735595703125, "learning_rate": 0.0002620269874415703, "loss": 17.9115, "step": 9212 }, { "epoch": 24.332783096731596, "grad_norm": 1297.529541015625, "learning_rate": 0.0002619852591470295, "loss": 17.9964, "step": 9213 }, { "epoch": 24.33542423241994, "grad_norm": 1180.77978515625, "learning_rate": 0.0002619435305178074, "loss": 14.781, "step": 9214 }, { "epoch": 24.338065368108285, "grad_norm": 2204.0810546875, "learning_rate": 0.00026190180155506916, "loss": 25.3559, "step": 9215 }, { "epoch": 24.340706503796632, "grad_norm": 862.1535034179688, "learning_rate": 0.00026186007225998016, "loss": 11.891, "step": 9216 }, { "epoch": 24.34334763948498, "grad_norm": 3292.02294921875, "learning_rate": 0.0002618183426337055, "loss": 10.9369, "step": 9217 }, { "epoch": 24.345988775173325, "grad_norm": 2972.414306640625, "learning_rate": 0.00026177661267741067, "loss": 17.1784, "step": 9218 }, { "epoch": 24.34862991086167, "grad_norm": 3382.50732421875, "learning_rate": 0.00026173488239226083, "loss": 32.5407, "step": 9219 }, { "epoch": 24.351271046550018, "grad_norm": 410.046875, "learning_rate": 0.00026169315177942135, "loss": 35.1533, "step": 9220 }, { "epoch": 24.353912182238364, "grad_norm": 1265.716552734375, "learning_rate": 0.0002616514208400573, "loss": 36.0149, "step": 9221 }, { "epoch": 24.356553317926707, "grad_norm": 429.8586730957031, "learning_rate": 0.0002616096895753343, "loss": 36.9822, "step": 9222 }, { "epoch": 24.359194453615054, "grad_norm": 442.1864013671875, "learning_rate": 0.0002615679579864176, "loss": 34.501, "step": 9223 }, { "epoch": 24.3618355893034, "grad_norm": 524.7454223632812, "learning_rate": 0.00026152622607447234, "loss": 35.9384, "step": 9224 }, { "epoch": 24.364476724991746, "grad_norm": 530.625, "learning_rate": 0.00026148449384066403, "loss": 37.7442, "step": 9225 }, { "epoch": 24.367117860680093, "grad_norm": 739.9453735351562, "learning_rate": 0.00026144276128615794, "loss": 37.2056, "step": 9226 }, { "epoch": 24.36975899636844, "grad_norm": 290.34771728515625, "learning_rate": 0.0002614010284121195, "loss": 34.8551, "step": 9227 }, { "epoch": 24.372400132056786, "grad_norm": 248.88729858398438, "learning_rate": 0.0002613592952197141, "loss": 34.8828, "step": 9228 }, { "epoch": 24.37504126774513, "grad_norm": 541.798828125, "learning_rate": 0.000261317561710107, "loss": 34.7258, "step": 9229 }, { "epoch": 24.377682403433475, "grad_norm": 533.7413330078125, "learning_rate": 0.0002612758278844636, "loss": 35.8052, "step": 9230 }, { "epoch": 24.38032353912182, "grad_norm": 569.7318725585938, "learning_rate": 0.00026123409374394934, "loss": 36.024, "step": 9231 }, { "epoch": 24.382964674810168, "grad_norm": 289.3833923339844, "learning_rate": 0.00026119235928972974, "loss": 35.7456, "step": 9232 }, { "epoch": 24.385605810498515, "grad_norm": 308.6817626953125, "learning_rate": 0.0002611506245229699, "loss": 35.6459, "step": 9233 }, { "epoch": 24.38824694618686, "grad_norm": 490.14013671875, "learning_rate": 0.0002611088894448355, "loss": 35.3436, "step": 9234 }, { "epoch": 24.390888081875207, "grad_norm": 513.602783203125, "learning_rate": 0.0002610671540564919, "loss": 36.9689, "step": 9235 }, { "epoch": 24.393529217563554, "grad_norm": 928.6381225585938, "learning_rate": 0.0002610254183591045, "loss": 37.7548, "step": 9236 }, { "epoch": 24.396170353251897, "grad_norm": 1169.134521484375, "learning_rate": 0.0002609836823538388, "loss": 41.4693, "step": 9237 }, { "epoch": 24.398811488940243, "grad_norm": 360.4246520996094, "learning_rate": 0.0002609419460418602, "loss": 39.8004, "step": 9238 }, { "epoch": 24.40145262462859, "grad_norm": 760.50537109375, "learning_rate": 0.00026090020942433416, "loss": 40.7205, "step": 9239 }, { "epoch": 24.404093760316936, "grad_norm": 668.5769653320312, "learning_rate": 0.0002608584725024262, "loss": 40.653, "step": 9240 }, { "epoch": 24.406734896005283, "grad_norm": 255.96368408203125, "learning_rate": 0.0002608167352773018, "loss": 40.3737, "step": 9241 }, { "epoch": 24.40937603169363, "grad_norm": 570.6239013671875, "learning_rate": 0.00026077499775012636, "loss": 45.2338, "step": 9242 }, { "epoch": 24.412017167381975, "grad_norm": 374.0791320800781, "learning_rate": 0.00026073325992206545, "loss": 43.6615, "step": 9243 }, { "epoch": 24.414658303070322, "grad_norm": 393.5888671875, "learning_rate": 0.0002606915217942846, "loss": 43.3099, "step": 9244 }, { "epoch": 24.417299438758665, "grad_norm": 3527.81103515625, "learning_rate": 0.0002606497833679493, "loss": 43.0677, "step": 9245 }, { "epoch": 24.41994057444701, "grad_norm": 1590.6392822265625, "learning_rate": 0.00026060804464422507, "loss": 40.7487, "step": 9246 }, { "epoch": 24.422581710135358, "grad_norm": 236.68739318847656, "learning_rate": 0.0002605663056242773, "loss": 38.1008, "step": 9247 }, { "epoch": 24.425222845823704, "grad_norm": 235.1626434326172, "learning_rate": 0.00026052456630927176, "loss": 40.1107, "step": 9248 }, { "epoch": 24.42786398151205, "grad_norm": 703.4735107421875, "learning_rate": 0.0002604828267003739, "loss": 36.3926, "step": 9249 }, { "epoch": 24.430505117200397, "grad_norm": 352.66961669921875, "learning_rate": 0.00026044108679874916, "loss": 37.2709, "step": 9250 }, { "epoch": 24.433146252888744, "grad_norm": 206.29673767089844, "learning_rate": 0.0002603993466055633, "loss": 37.8735, "step": 9251 }, { "epoch": 24.435787388577086, "grad_norm": 693.716796875, "learning_rate": 0.0002603576061219817, "loss": 36.4586, "step": 9252 }, { "epoch": 24.438428524265433, "grad_norm": 252.5102996826172, "learning_rate": 0.00026031586534917015, "loss": 36.8953, "step": 9253 }, { "epoch": 24.44106965995378, "grad_norm": 313.2264404296875, "learning_rate": 0.0002602741242882941, "loss": 35.1017, "step": 9254 }, { "epoch": 24.443710795642126, "grad_norm": 672.5521850585938, "learning_rate": 0.0002602323829405191, "loss": 35.8131, "step": 9255 }, { "epoch": 24.446351931330472, "grad_norm": 1283.1060791015625, "learning_rate": 0.0002601906413070108, "loss": 36.016, "step": 9256 }, { "epoch": 24.44899306701882, "grad_norm": 934.5150756835938, "learning_rate": 0.0002601488993889349, "loss": 35.1555, "step": 9257 }, { "epoch": 24.451634202707165, "grad_norm": 943.2974853515625, "learning_rate": 0.000260107157187457, "loss": 40.6324, "step": 9258 }, { "epoch": 24.45427533839551, "grad_norm": 13471.044921875, "learning_rate": 0.0002600654147037425, "loss": 24.6048, "step": 9259 }, { "epoch": 24.456916474083854, "grad_norm": 5853.66357421875, "learning_rate": 0.00026002367193895734, "loss": 19.395, "step": 9260 }, { "epoch": 24.4595576097722, "grad_norm": 3560.25, "learning_rate": 0.0002599819288942669, "loss": 19.1888, "step": 9261 }, { "epoch": 24.462198745460547, "grad_norm": 1432.421630859375, "learning_rate": 0.0002599401855708371, "loss": 17.1672, "step": 9262 }, { "epoch": 24.464839881148894, "grad_norm": 1757.2728271484375, "learning_rate": 0.0002598984419698334, "loss": 19.1602, "step": 9263 }, { "epoch": 24.46748101683724, "grad_norm": 1191.7166748046875, "learning_rate": 0.00025985669809242153, "loss": 13.5311, "step": 9264 }, { "epoch": 24.470122152525587, "grad_norm": 1255.8426513671875, "learning_rate": 0.00025981495393976716, "loss": 21.3616, "step": 9265 }, { "epoch": 24.472763288213933, "grad_norm": 1046.9046630859375, "learning_rate": 0.00025977320951303597, "loss": 15.8882, "step": 9266 }, { "epoch": 24.47540442390228, "grad_norm": 2916.8828125, "learning_rate": 0.00025973146481339364, "loss": 15.8212, "step": 9267 }, { "epoch": 24.478045559590623, "grad_norm": 1543.136474609375, "learning_rate": 0.00025968971984200584, "loss": 13.3919, "step": 9268 }, { "epoch": 24.48068669527897, "grad_norm": 894.0491333007812, "learning_rate": 0.00025964797460003837, "loss": 30.7859, "step": 9269 }, { "epoch": 24.483327830967315, "grad_norm": 279.8694763183594, "learning_rate": 0.00025960622908865685, "loss": 38.3785, "step": 9270 }, { "epoch": 24.485968966655662, "grad_norm": 322.53875732421875, "learning_rate": 0.00025956448330902697, "loss": 35.0484, "step": 9271 }, { "epoch": 24.48861010234401, "grad_norm": 320.76165771484375, "learning_rate": 0.0002595227372623146, "loss": 35.7769, "step": 9272 }, { "epoch": 24.491251238032355, "grad_norm": 809.2149047851562, "learning_rate": 0.0002594809909496853, "loss": 35.0827, "step": 9273 }, { "epoch": 24.4938923737207, "grad_norm": 391.41375732421875, "learning_rate": 0.000259439244372305, "loss": 35.3261, "step": 9274 }, { "epoch": 24.496533509409044, "grad_norm": 2939.773193359375, "learning_rate": 0.0002593974975313393, "loss": 34.4254, "step": 9275 }, { "epoch": 24.49917464509739, "grad_norm": 284.9442138671875, "learning_rate": 0.00025935575042795395, "loss": 35.3525, "step": 9276 }, { "epoch": 24.501815780785737, "grad_norm": 411.51092529296875, "learning_rate": 0.0002593140030633148, "loss": 35.6505, "step": 9277 }, { "epoch": 24.504456916474084, "grad_norm": 401.38433837890625, "learning_rate": 0.00025927225543858755, "loss": 34.5886, "step": 9278 }, { "epoch": 24.50709805216243, "grad_norm": 280.8448181152344, "learning_rate": 0.0002592305075549381, "loss": 37.143, "step": 9279 }, { "epoch": 24.509739187850776, "grad_norm": 783.2456665039062, "learning_rate": 0.0002591887594135321, "loss": 36.7554, "step": 9280 }, { "epoch": 24.512380323539123, "grad_norm": 463.17626953125, "learning_rate": 0.0002591470110155354, "loss": 35.0697, "step": 9281 }, { "epoch": 24.51502145922747, "grad_norm": 572.6328735351562, "learning_rate": 0.0002591052623621138, "loss": 35.6266, "step": 9282 }, { "epoch": 24.517662594915812, "grad_norm": 502.9585876464844, "learning_rate": 0.000259063513454433, "loss": 34.685, "step": 9283 }, { "epoch": 24.52030373060416, "grad_norm": 697.94482421875, "learning_rate": 0.0002590217642936591, "loss": 35.825, "step": 9284 }, { "epoch": 24.522944866292505, "grad_norm": 1168.467041015625, "learning_rate": 0.0002589800148809576, "loss": 36.8619, "step": 9285 }, { "epoch": 24.52558600198085, "grad_norm": 1619.002685546875, "learning_rate": 0.00025893826521749446, "loss": 36.6986, "step": 9286 }, { "epoch": 24.528227137669198, "grad_norm": 13924.822265625, "learning_rate": 0.0002588965153044355, "loss": 39.8223, "step": 9287 }, { "epoch": 24.530868273357544, "grad_norm": 637.4862670898438, "learning_rate": 0.00025885476514294665, "loss": 39.3654, "step": 9288 }, { "epoch": 24.53350940904589, "grad_norm": 335.2846984863281, "learning_rate": 0.0002588130147341937, "loss": 42.0778, "step": 9289 }, { "epoch": 24.536150544734237, "grad_norm": 646.4586791992188, "learning_rate": 0.00025877126407934236, "loss": 39.9237, "step": 9290 }, { "epoch": 24.53879168042258, "grad_norm": 528.3385009765625, "learning_rate": 0.0002587295131795587, "loss": 42.7638, "step": 9291 }, { "epoch": 24.541432816110927, "grad_norm": 378.4797058105469, "learning_rate": 0.0002586877620360085, "loss": 42.756, "step": 9292 }, { "epoch": 24.544073951799273, "grad_norm": 254.78089904785156, "learning_rate": 0.00025864601064985775, "loss": 42.0272, "step": 9293 }, { "epoch": 24.54671508748762, "grad_norm": 569.2401733398438, "learning_rate": 0.0002586042590222721, "loss": 44.564, "step": 9294 }, { "epoch": 24.549356223175966, "grad_norm": 373.276611328125, "learning_rate": 0.00025856250715441763, "loss": 42.6369, "step": 9295 }, { "epoch": 24.551997358864313, "grad_norm": 377.1991882324219, "learning_rate": 0.00025852075504746014, "loss": 43.822, "step": 9296 }, { "epoch": 24.55463849455266, "grad_norm": 865.6290893554688, "learning_rate": 0.00025847900270256565, "loss": 38.6567, "step": 9297 }, { "epoch": 24.557279630241002, "grad_norm": 435.03778076171875, "learning_rate": 0.0002584372501209, "loss": 41.1996, "step": 9298 }, { "epoch": 24.55992076592935, "grad_norm": 388.7137145996094, "learning_rate": 0.0002583954973036289, "loss": 37.6067, "step": 9299 }, { "epoch": 24.562561901617695, "grad_norm": 878.3382568359375, "learning_rate": 0.0002583537442519187, "loss": 39.1899, "step": 9300 }, { "epoch": 24.56520303730604, "grad_norm": 422.0450134277344, "learning_rate": 0.0002583119909669349, "loss": 37.4858, "step": 9301 }, { "epoch": 24.567844172994388, "grad_norm": 165.94747924804688, "learning_rate": 0.00025827023744984386, "loss": 37.0981, "step": 9302 }, { "epoch": 24.570485308682734, "grad_norm": 435.9466857910156, "learning_rate": 0.00025822848370181113, "loss": 35.6014, "step": 9303 }, { "epoch": 24.57312644437108, "grad_norm": 752.4287719726562, "learning_rate": 0.00025818672972400283, "loss": 36.679, "step": 9304 }, { "epoch": 24.575767580059427, "grad_norm": 524.203369140625, "learning_rate": 0.000258144975517585, "loss": 35.1056, "step": 9305 }, { "epoch": 24.57840871574777, "grad_norm": 468.3339538574219, "learning_rate": 0.00025810322108372346, "loss": 36.3299, "step": 9306 }, { "epoch": 24.581049851436116, "grad_norm": 332.2691650390625, "learning_rate": 0.00025806146642358424, "loss": 35.1789, "step": 9307 }, { "epoch": 24.583690987124463, "grad_norm": 243.23220825195312, "learning_rate": 0.00025801971153833326, "loss": 34.8344, "step": 9308 }, { "epoch": 24.58633212281281, "grad_norm": 810.0825805664062, "learning_rate": 0.00025797795642913663, "loss": 36.4414, "step": 9309 }, { "epoch": 24.588973258501156, "grad_norm": 994.4607543945312, "learning_rate": 0.00025793620109716014, "loss": 44.2037, "step": 9310 }, { "epoch": 24.591614394189502, "grad_norm": 1549.8375244140625, "learning_rate": 0.00025789444554357, "loss": 19.7141, "step": 9311 }, { "epoch": 24.59425552987785, "grad_norm": 5828.314453125, "learning_rate": 0.00025785268976953206, "loss": 22.697, "step": 9312 }, { "epoch": 24.596896665566195, "grad_norm": 2181.264892578125, "learning_rate": 0.0002578109337762124, "loss": 25.0986, "step": 9313 }, { "epoch": 24.599537801254538, "grad_norm": 1859.784423828125, "learning_rate": 0.00025776917756477704, "loss": 13.9898, "step": 9314 }, { "epoch": 24.602178936942884, "grad_norm": 9000.48046875, "learning_rate": 0.0002577274211363919, "loss": 17.2807, "step": 9315 }, { "epoch": 24.60482007263123, "grad_norm": 1149.5831298828125, "learning_rate": 0.0002576856644922231, "loss": 13.3498, "step": 9316 }, { "epoch": 24.607461208319577, "grad_norm": 1465.03564453125, "learning_rate": 0.00025764390763343656, "loss": 11.721, "step": 9317 }, { "epoch": 24.610102344007924, "grad_norm": 600.99462890625, "learning_rate": 0.0002576021505611984, "loss": 16.3535, "step": 9318 }, { "epoch": 24.61274347969627, "grad_norm": 1367.25048828125, "learning_rate": 0.00025756039327667473, "loss": 14.2447, "step": 9319 }, { "epoch": 24.615384615384617, "grad_norm": 380.96697998046875, "learning_rate": 0.0002575186357810315, "loss": 35.0206, "step": 9320 }, { "epoch": 24.618025751072963, "grad_norm": 453.7112731933594, "learning_rate": 0.00025747687807543476, "loss": 36.8591, "step": 9321 }, { "epoch": 24.620666886761306, "grad_norm": 319.9150085449219, "learning_rate": 0.0002574351201610505, "loss": 36.4959, "step": 9322 }, { "epoch": 24.623308022449653, "grad_norm": 460.05126953125, "learning_rate": 0.00025739336203904503, "loss": 37.3885, "step": 9323 }, { "epoch": 24.625949158138, "grad_norm": 703.7431640625, "learning_rate": 0.0002573516037105842, "loss": 34.596, "step": 9324 }, { "epoch": 24.628590293826345, "grad_norm": 358.2337646484375, "learning_rate": 0.00025730984517683417, "loss": 36.5404, "step": 9325 }, { "epoch": 24.631231429514692, "grad_norm": 485.4752197265625, "learning_rate": 0.000257268086438961, "loss": 37.3175, "step": 9326 }, { "epoch": 24.63387256520304, "grad_norm": 477.4969177246094, "learning_rate": 0.00025722632749813077, "loss": 35.2978, "step": 9327 }, { "epoch": 24.636513700891385, "grad_norm": 626.77783203125, "learning_rate": 0.0002571845683555096, "loss": 35.4567, "step": 9328 }, { "epoch": 24.639154836579728, "grad_norm": 376.34686279296875, "learning_rate": 0.00025714280901226357, "loss": 35.537, "step": 9329 }, { "epoch": 24.641795972268074, "grad_norm": 569.904296875, "learning_rate": 0.0002571010494695588, "loss": 34.8989, "step": 9330 }, { "epoch": 24.64443710795642, "grad_norm": 442.5648498535156, "learning_rate": 0.0002570592897285614, "loss": 35.0404, "step": 9331 }, { "epoch": 24.647078243644767, "grad_norm": 376.8979797363281, "learning_rate": 0.0002570175297904375, "loss": 35.4949, "step": 9332 }, { "epoch": 24.649719379333114, "grad_norm": 648.09375, "learning_rate": 0.00025697576965635324, "loss": 35.5047, "step": 9333 }, { "epoch": 24.65236051502146, "grad_norm": 788.081787109375, "learning_rate": 0.00025693400932747456, "loss": 34.3517, "step": 9334 }, { "epoch": 24.655001650709806, "grad_norm": 895.4010009765625, "learning_rate": 0.0002568922488049679, "loss": 36.1194, "step": 9335 }, { "epoch": 24.657642786398153, "grad_norm": 672.8462524414062, "learning_rate": 0.0002568504880899991, "loss": 37.29, "step": 9336 }, { "epoch": 24.660283922086496, "grad_norm": 1965.7384033203125, "learning_rate": 0.00025680872718373453, "loss": 43.4586, "step": 9337 }, { "epoch": 24.662925057774842, "grad_norm": 279.49945068359375, "learning_rate": 0.0002567669660873403, "loss": 39.5718, "step": 9338 }, { "epoch": 24.66556619346319, "grad_norm": 291.6850280761719, "learning_rate": 0.0002567252048019824, "loss": 40.4367, "step": 9339 }, { "epoch": 24.668207329151535, "grad_norm": 297.9434509277344, "learning_rate": 0.0002566834433288272, "loss": 40.1047, "step": 9340 }, { "epoch": 24.67084846483988, "grad_norm": 285.46185302734375, "learning_rate": 0.00025664168166904077, "loss": 40.5059, "step": 9341 }, { "epoch": 24.673489600528228, "grad_norm": 397.02728271484375, "learning_rate": 0.0002565999198237893, "loss": 42.6544, "step": 9342 }, { "epoch": 24.676130736216574, "grad_norm": 379.8876037597656, "learning_rate": 0.0002565581577942389, "loss": 45.5553, "step": 9343 }, { "epoch": 24.67877187190492, "grad_norm": 241.8949737548828, "learning_rate": 0.0002565163955815557, "loss": 41.4588, "step": 9344 }, { "epoch": 24.681413007593264, "grad_norm": 336.2810974121094, "learning_rate": 0.0002564746331869061, "loss": 41.5712, "step": 9345 }, { "epoch": 24.68405414328161, "grad_norm": 413.93731689453125, "learning_rate": 0.00025643287061145615, "loss": 38.6262, "step": 9346 }, { "epoch": 24.686695278969957, "grad_norm": 217.94395446777344, "learning_rate": 0.0002563911078563721, "loss": 39.128, "step": 9347 }, { "epoch": 24.689336414658303, "grad_norm": 585.0250244140625, "learning_rate": 0.00025634934492282, "loss": 38.5597, "step": 9348 }, { "epoch": 24.69197755034665, "grad_norm": 399.62786865234375, "learning_rate": 0.00025630758181196633, "loss": 37.4652, "step": 9349 }, { "epoch": 24.694618686034996, "grad_norm": 491.84344482421875, "learning_rate": 0.0002562658185249771, "loss": 36.7209, "step": 9350 }, { "epoch": 24.697259821723343, "grad_norm": 560.1140747070312, "learning_rate": 0.0002562240550630185, "loss": 36.4052, "step": 9351 }, { "epoch": 24.699900957411685, "grad_norm": 297.1780700683594, "learning_rate": 0.0002561822914272569, "loss": 35.4643, "step": 9352 }, { "epoch": 24.702542093100032, "grad_norm": 696.7481689453125, "learning_rate": 0.0002561405276188583, "loss": 36.3988, "step": 9353 }, { "epoch": 24.70518322878838, "grad_norm": 504.9494934082031, "learning_rate": 0.00025609876363898924, "loss": 34.7877, "step": 9354 }, { "epoch": 24.707824364476725, "grad_norm": 210.70611572265625, "learning_rate": 0.0002560569994888157, "loss": 35.3838, "step": 9355 }, { "epoch": 24.71046550016507, "grad_norm": 263.2250061035156, "learning_rate": 0.00025601523516950405, "loss": 34.7756, "step": 9356 }, { "epoch": 24.713106635853418, "grad_norm": 432.2035217285156, "learning_rate": 0.0002559734706822204, "loss": 34.4196, "step": 9357 }, { "epoch": 24.715747771541764, "grad_norm": 259.898193359375, "learning_rate": 0.0002559317060281312, "loss": 35.8442, "step": 9358 }, { "epoch": 24.71838890723011, "grad_norm": 295.4598388671875, "learning_rate": 0.0002558899412084026, "loss": 36.3205, "step": 9359 }, { "epoch": 24.721030042918454, "grad_norm": 980.1126708984375, "learning_rate": 0.00025584817622420074, "loss": 33.9547, "step": 9360 }, { "epoch": 24.7236711786068, "grad_norm": 2115.505615234375, "learning_rate": 0.00025580641107669213, "loss": 27.3483, "step": 9361 }, { "epoch": 24.726312314295146, "grad_norm": 1974.876220703125, "learning_rate": 0.0002557646457670427, "loss": 22.9998, "step": 9362 }, { "epoch": 24.728953449983493, "grad_norm": 3214.2080078125, "learning_rate": 0.00025572288029641915, "loss": 25.25, "step": 9363 }, { "epoch": 24.73159458567184, "grad_norm": 1143.660888671875, "learning_rate": 0.00025568111466598743, "loss": 18.151, "step": 9364 }, { "epoch": 24.734235721360186, "grad_norm": 562.4105224609375, "learning_rate": 0.00025563934887691385, "loss": 12.9656, "step": 9365 }, { "epoch": 24.736876857048532, "grad_norm": 486.8499450683594, "learning_rate": 0.0002555975829303649, "loss": 15.3775, "step": 9366 }, { "epoch": 24.73951799273688, "grad_norm": 1332.185546875, "learning_rate": 0.00025555581682750665, "loss": 18.3773, "step": 9367 }, { "epoch": 24.74215912842522, "grad_norm": 456.570068359375, "learning_rate": 0.00025551405056950554, "loss": 9.6703, "step": 9368 }, { "epoch": 24.744800264113568, "grad_norm": 314.5206298828125, "learning_rate": 0.00025547228415752777, "loss": 19.0792, "step": 9369 }, { "epoch": 24.747441399801914, "grad_norm": 331.21881103515625, "learning_rate": 0.0002554305175927397, "loss": 37.5231, "step": 9370 }, { "epoch": 24.75008253549026, "grad_norm": 243.06893920898438, "learning_rate": 0.00025538875087630753, "loss": 33.5353, "step": 9371 }, { "epoch": 24.752723671178607, "grad_norm": 977.0672607421875, "learning_rate": 0.00025534698400939777, "loss": 36.3449, "step": 9372 }, { "epoch": 24.755364806866954, "grad_norm": 726.2858276367188, "learning_rate": 0.00025530521699317657, "loss": 35.2247, "step": 9373 }, { "epoch": 24.7580059425553, "grad_norm": 983.5985717773438, "learning_rate": 0.00025526344982881025, "loss": 35.7066, "step": 9374 }, { "epoch": 24.760647078243643, "grad_norm": 464.0201721191406, "learning_rate": 0.0002552216825174653, "loss": 34.7341, "step": 9375 }, { "epoch": 24.76328821393199, "grad_norm": 341.11895751953125, "learning_rate": 0.00025517991506030787, "loss": 35.2449, "step": 9376 }, { "epoch": 24.765929349620336, "grad_norm": 727.9418334960938, "learning_rate": 0.0002551381474585044, "loss": 34.8532, "step": 9377 }, { "epoch": 24.768570485308683, "grad_norm": 379.80865478515625, "learning_rate": 0.00025509637971322106, "loss": 34.415, "step": 9378 }, { "epoch": 24.77121162099703, "grad_norm": 1050.7630615234375, "learning_rate": 0.0002550546118256244, "loss": 35.4384, "step": 9379 }, { "epoch": 24.773852756685375, "grad_norm": 1230.103759765625, "learning_rate": 0.00025501284379688067, "loss": 36.6234, "step": 9380 }, { "epoch": 24.776493892373722, "grad_norm": 840.509521484375, "learning_rate": 0.00025497107562815624, "loss": 36.2438, "step": 9381 }, { "epoch": 24.77913502806207, "grad_norm": 852.9190063476562, "learning_rate": 0.0002549293073206174, "loss": 35.4909, "step": 9382 }, { "epoch": 24.78177616375041, "grad_norm": 366.938232421875, "learning_rate": 0.0002548875388754305, "loss": 35.771, "step": 9383 }, { "epoch": 24.784417299438758, "grad_norm": 399.926513671875, "learning_rate": 0.0002548457702937621, "loss": 34.1179, "step": 9384 }, { "epoch": 24.787058435127104, "grad_norm": 883.7149047851562, "learning_rate": 0.00025480400157677834, "loss": 36.3343, "step": 9385 }, { "epoch": 24.78969957081545, "grad_norm": 3631.897705078125, "learning_rate": 0.00025476223272564557, "loss": 36.9757, "step": 9386 }, { "epoch": 24.792340706503797, "grad_norm": 1031.0667724609375, "learning_rate": 0.0002547204637415304, "loss": 40.2908, "step": 9387 }, { "epoch": 24.794981842192144, "grad_norm": 570.1069946289062, "learning_rate": 0.0002546786946255989, "loss": 41.8946, "step": 9388 }, { "epoch": 24.79762297788049, "grad_norm": 517.7555541992188, "learning_rate": 0.0002546369253790178, "loss": 41.3996, "step": 9389 }, { "epoch": 24.800264113568836, "grad_norm": 799.6539306640625, "learning_rate": 0.00025459515600295305, "loss": 42.9061, "step": 9390 }, { "epoch": 24.80290524925718, "grad_norm": 290.64501953125, "learning_rate": 0.00025455338649857146, "loss": 42.6737, "step": 9391 }, { "epoch": 24.805546384945526, "grad_norm": 1119.364501953125, "learning_rate": 0.00025451161686703913, "loss": 45.0123, "step": 9392 }, { "epoch": 24.808187520633872, "grad_norm": 412.6994323730469, "learning_rate": 0.0002544698471095225, "loss": 43.3822, "step": 9393 }, { "epoch": 24.81082865632222, "grad_norm": 1082.099609375, "learning_rate": 0.00025442807722718814, "loss": 41.8956, "step": 9394 }, { "epoch": 24.813469792010565, "grad_norm": 479.6311340332031, "learning_rate": 0.00025438630722120225, "loss": 41.9892, "step": 9395 }, { "epoch": 24.81611092769891, "grad_norm": 625.1589965820312, "learning_rate": 0.00025434453709273134, "loss": 40.1628, "step": 9396 }, { "epoch": 24.818752063387258, "grad_norm": 328.8375549316406, "learning_rate": 0.0002543027668429417, "loss": 42.685, "step": 9397 }, { "epoch": 24.8213931990756, "grad_norm": 253.03697204589844, "learning_rate": 0.000254260996473, "loss": 38.8654, "step": 9398 }, { "epoch": 24.824034334763947, "grad_norm": 1598.000732421875, "learning_rate": 0.0002542192259840723, "loss": 38.8583, "step": 9399 }, { "epoch": 24.826675470452294, "grad_norm": 246.3529815673828, "learning_rate": 0.0002541774553773252, "loss": 37.4056, "step": 9400 }, { "epoch": 24.826675470452294, "eval_loss": 3.9009642601013184, "eval_runtime": 2.0563, "eval_samples_per_second": 240.726, "eval_steps_per_second": 30.152, "step": 9400 }, { "epoch": 24.82931660614064, "grad_norm": 576.1364135742188, "learning_rate": 0.00025413568465392525, "loss": 36.7675, "step": 9401 }, { "epoch": 24.831957741828987, "grad_norm": 399.1614685058594, "learning_rate": 0.00025409391381503866, "loss": 36.6244, "step": 9402 }, { "epoch": 24.834598877517333, "grad_norm": 311.8328857421875, "learning_rate": 0.0002540521428618319, "loss": 35.5069, "step": 9403 }, { "epoch": 24.83724001320568, "grad_norm": 286.7568359375, "learning_rate": 0.00025401037179547146, "loss": 34.1757, "step": 9404 }, { "epoch": 24.839881148894026, "grad_norm": 843.743896484375, "learning_rate": 0.00025396860061712367, "loss": 35.1792, "step": 9405 }, { "epoch": 24.84252228458237, "grad_norm": 701.9169311523438, "learning_rate": 0.0002539268293279552, "loss": 35.5695, "step": 9406 }, { "epoch": 24.845163420270715, "grad_norm": 351.1475830078125, "learning_rate": 0.0002538850579291322, "loss": 35.4811, "step": 9407 }, { "epoch": 24.847804555959062, "grad_norm": 1403.718505859375, "learning_rate": 0.0002538432864218213, "loss": 36.092, "step": 9408 }, { "epoch": 24.85044569164741, "grad_norm": 462.28619384765625, "learning_rate": 0.00025380151480718885, "loss": 36.7318, "step": 9409 }, { "epoch": 24.853086827335755, "grad_norm": 12551.974609375, "learning_rate": 0.0002537597430864014, "loss": 34.1869, "step": 9410 }, { "epoch": 24.8557279630241, "grad_norm": 4009.67431640625, "learning_rate": 0.00025371797126062524, "loss": 29.475, "step": 9411 }, { "epoch": 24.858369098712448, "grad_norm": 6940.34375, "learning_rate": 0.00025367619933102693, "loss": 23.4297, "step": 9412 }, { "epoch": 24.861010234400794, "grad_norm": 10780.0419921875, "learning_rate": 0.00025363442729877295, "loss": 26.1357, "step": 9413 }, { "epoch": 24.863651370089137, "grad_norm": 2476.318115234375, "learning_rate": 0.0002535926551650297, "loss": 21.8187, "step": 9414 }, { "epoch": 24.866292505777484, "grad_norm": 3900.88916015625, "learning_rate": 0.0002535508829309637, "loss": 20.7358, "step": 9415 }, { "epoch": 24.86893364146583, "grad_norm": 1584.7076416015625, "learning_rate": 0.0002535091105977413, "loss": 23.1773, "step": 9416 }, { "epoch": 24.871574777154176, "grad_norm": 1652.3172607421875, "learning_rate": 0.0002534673381665292, "loss": 14.7489, "step": 9417 }, { "epoch": 24.874215912842523, "grad_norm": 821.2532958984375, "learning_rate": 0.00025342556563849356, "loss": 14.7903, "step": 9418 }, { "epoch": 24.87685704853087, "grad_norm": 6130.74267578125, "learning_rate": 0.0002533837930148011, "loss": 20.4497, "step": 9419 }, { "epoch": 24.879498184219216, "grad_norm": 568.45751953125, "learning_rate": 0.00025334202029661816, "loss": 36.4637, "step": 9420 }, { "epoch": 24.88213931990756, "grad_norm": 524.2603149414062, "learning_rate": 0.0002533002474851113, "loss": 35.7187, "step": 9421 }, { "epoch": 24.884780455595905, "grad_norm": 445.48828125, "learning_rate": 0.00025325847458144697, "loss": 34.4198, "step": 9422 }, { "epoch": 24.88742159128425, "grad_norm": 2663.47705078125, "learning_rate": 0.0002532167015867916, "loss": 34.7695, "step": 9423 }, { "epoch": 24.890062726972598, "grad_norm": 861.1591796875, "learning_rate": 0.0002531749285023118, "loss": 35.3114, "step": 9424 }, { "epoch": 24.892703862660944, "grad_norm": 980.3685913085938, "learning_rate": 0.00025313315532917395, "loss": 34.06, "step": 9425 }, { "epoch": 24.89534499834929, "grad_norm": 1094.6827392578125, "learning_rate": 0.00025309138206854455, "loss": 35.8773, "step": 9426 }, { "epoch": 24.897986134037637, "grad_norm": 575.0640869140625, "learning_rate": 0.00025304960872159015, "loss": 34.2489, "step": 9427 }, { "epoch": 24.900627269725984, "grad_norm": 522.2175903320312, "learning_rate": 0.00025300783528947726, "loss": 35.1031, "step": 9428 }, { "epoch": 24.903268405414327, "grad_norm": 986.3118896484375, "learning_rate": 0.0002529660617733723, "loss": 34.5669, "step": 9429 }, { "epoch": 24.905909541102673, "grad_norm": 614.2421875, "learning_rate": 0.0002529242881744418, "loss": 36.9196, "step": 9430 }, { "epoch": 24.90855067679102, "grad_norm": 1015.4884033203125, "learning_rate": 0.0002528825144938523, "loss": 35.9961, "step": 9431 }, { "epoch": 24.911191812479366, "grad_norm": 413.4164123535156, "learning_rate": 0.0002528407407327702, "loss": 34.943, "step": 9432 }, { "epoch": 24.913832948167713, "grad_norm": 550.5525512695312, "learning_rate": 0.00025279896689236217, "loss": 33.9384, "step": 9433 }, { "epoch": 24.91647408385606, "grad_norm": 463.14166259765625, "learning_rate": 0.00025275719297379454, "loss": 35.212, "step": 9434 }, { "epoch": 24.919115219544405, "grad_norm": 1093.0997314453125, "learning_rate": 0.000252715418978234, "loss": 35.5677, "step": 9435 }, { "epoch": 24.921756355232752, "grad_norm": 3599.05322265625, "learning_rate": 0.00025267364490684697, "loss": 36.8112, "step": 9436 }, { "epoch": 24.924397490921095, "grad_norm": 2366.669189453125, "learning_rate": 0.00025263187076079996, "loss": 39.5333, "step": 9437 }, { "epoch": 24.92703862660944, "grad_norm": 288.0251770019531, "learning_rate": 0.00025259009654125954, "loss": 38.2481, "step": 9438 }, { "epoch": 24.929679762297788, "grad_norm": 984.4505004882812, "learning_rate": 0.0002525483222493921, "loss": 40.5408, "step": 9439 }, { "epoch": 24.932320897986134, "grad_norm": 292.6039733886719, "learning_rate": 0.0002525065478863643, "loss": 41.7583, "step": 9440 }, { "epoch": 24.93496203367448, "grad_norm": 461.3492126464844, "learning_rate": 0.0002524647734533427, "loss": 42.3489, "step": 9441 }, { "epoch": 24.937603169362827, "grad_norm": 574.4833374023438, "learning_rate": 0.00025242299895149366, "loss": 41.865, "step": 9442 }, { "epoch": 24.940244305051174, "grad_norm": 252.27891540527344, "learning_rate": 0.00025238122438198384, "loss": 42.7823, "step": 9443 }, { "epoch": 24.942885440739516, "grad_norm": 547.8043823242188, "learning_rate": 0.00025233944974597974, "loss": 40.2753, "step": 9444 }, { "epoch": 24.945526576427863, "grad_norm": 1673.611572265625, "learning_rate": 0.00025229767504464785, "loss": 38.0824, "step": 9445 }, { "epoch": 24.94816771211621, "grad_norm": 379.5542297363281, "learning_rate": 0.0002522559002791547, "loss": 36.3771, "step": 9446 }, { "epoch": 24.950808847804556, "grad_norm": 331.7046813964844, "learning_rate": 0.0002522141254506669, "loss": 36.0268, "step": 9447 }, { "epoch": 24.953449983492902, "grad_norm": 201.37628173828125, "learning_rate": 0.000252172350560351, "loss": 35.1943, "step": 9448 }, { "epoch": 24.95609111918125, "grad_norm": 518.6072387695312, "learning_rate": 0.00025213057560937336, "loss": 35.9838, "step": 9449 }, { "epoch": 24.958732254869595, "grad_norm": 5188.96533203125, "learning_rate": 0.0002520888005989008, "loss": 37.8798, "step": 9450 }, { "epoch": 24.96137339055794, "grad_norm": 1576.5438232421875, "learning_rate": 0.0002520470255300996, "loss": 14.0211, "step": 9451 }, { "epoch": 24.964014526246284, "grad_norm": 1137.4033203125, "learning_rate": 0.00025200525040413647, "loss": 12.1133, "step": 9452 }, { "epoch": 24.96665566193463, "grad_norm": 550.1065673828125, "learning_rate": 0.0002519634752221778, "loss": 10.2285, "step": 9453 }, { "epoch": 24.969296797622977, "grad_norm": 775.4476318359375, "learning_rate": 0.0002519216999853904, "loss": 18.2115, "step": 9454 }, { "epoch": 24.971937933311324, "grad_norm": 1180.103759765625, "learning_rate": 0.00025187992469494056, "loss": 11.8061, "step": 9455 }, { "epoch": 24.97457906899967, "grad_norm": 823.925537109375, "learning_rate": 0.0002518381493519949, "loss": 36.5806, "step": 9456 }, { "epoch": 24.977220204688017, "grad_norm": 375.3466796875, "learning_rate": 0.0002517963739577201, "loss": 33.6278, "step": 9457 }, { "epoch": 24.979861340376363, "grad_norm": 1523.439208984375, "learning_rate": 0.0002517545985132825, "loss": 35.799, "step": 9458 }, { "epoch": 24.98250247606471, "grad_norm": 334.0960693359375, "learning_rate": 0.0002517128230198489, "loss": 35.1182, "step": 9459 }, { "epoch": 24.985143611753053, "grad_norm": 929.5379638671875, "learning_rate": 0.0002516710474785856, "loss": 35.168, "step": 9460 }, { "epoch": 24.9877847474414, "grad_norm": 1326.036376953125, "learning_rate": 0.00025162927189065925, "loss": 36.742, "step": 9461 }, { "epoch": 24.990425883129745, "grad_norm": 1238.62890625, "learning_rate": 0.00025158749625723653, "loss": 34.5597, "step": 9462 }, { "epoch": 24.993067018818092, "grad_norm": 1724.4593505859375, "learning_rate": 0.00025154572057948393, "loss": 34.596, "step": 9463 }, { "epoch": 24.99570815450644, "grad_norm": 781.7833862304688, "learning_rate": 0.000251503944858568, "loss": 35.3664, "step": 9464 }, { "epoch": 24.998349290194785, "grad_norm": 676.0555419921875, "learning_rate": 0.00025146216909565516, "loss": 36.3572, "step": 9465 }, { "epoch": 25.00099042588313, "grad_norm": 603.4251708984375, "learning_rate": 0.00025142039329191225, "loss": 41.9533, "step": 9466 }, { "epoch": 25.003631561571474, "grad_norm": 726.4046020507812, "learning_rate": 0.0002513786174485056, "loss": 39.4682, "step": 9467 }, { "epoch": 25.00627269725982, "grad_norm": 447.4329528808594, "learning_rate": 0.0002513368415666019, "loss": 39.1338, "step": 9468 }, { "epoch": 25.008913832948167, "grad_norm": 405.15765380859375, "learning_rate": 0.0002512950656473677, "loss": 40.2834, "step": 9469 }, { "epoch": 25.011554968636514, "grad_norm": 1072.7166748046875, "learning_rate": 0.0002512532896919695, "loss": 43.2182, "step": 9470 }, { "epoch": 25.01419610432486, "grad_norm": 474.7765197753906, "learning_rate": 0.00025121151370157395, "loss": 39.0469, "step": 9471 }, { "epoch": 25.016837240013206, "grad_norm": 434.9425354003906, "learning_rate": 0.0002511697376773476, "loss": 41.949, "step": 9472 }, { "epoch": 25.019478375701553, "grad_norm": 411.9859619140625, "learning_rate": 0.00025112796162045706, "loss": 40.8251, "step": 9473 }, { "epoch": 25.0221195113899, "grad_norm": 1496.5733642578125, "learning_rate": 0.0002510861855320688, "loss": 42.3023, "step": 9474 }, { "epoch": 25.024760647078242, "grad_norm": 596.9638671875, "learning_rate": 0.00025104440941334943, "loss": 40.9704, "step": 9475 }, { "epoch": 25.02740178276659, "grad_norm": 694.1116943359375, "learning_rate": 0.00025100263326546557, "loss": 39.1916, "step": 9476 }, { "epoch": 25.030042918454935, "grad_norm": 396.1424865722656, "learning_rate": 0.00025096085708958376, "loss": 39.0825, "step": 9477 }, { "epoch": 25.03268405414328, "grad_norm": 233.84811401367188, "learning_rate": 0.00025091908088687066, "loss": 37.5623, "step": 9478 }, { "epoch": 25.035325189831628, "grad_norm": 594.9880981445312, "learning_rate": 0.0002508773046584926, "loss": 36.742, "step": 9479 }, { "epoch": 25.037966325519974, "grad_norm": 669.3203125, "learning_rate": 0.00025083552840561644, "loss": 36.4145, "step": 9480 }, { "epoch": 25.04060746120832, "grad_norm": 690.9224243164062, "learning_rate": 0.00025079375212940864, "loss": 35.7808, "step": 9481 }, { "epoch": 25.043248596896664, "grad_norm": 434.58013916015625, "learning_rate": 0.0002507519758310357, "loss": 36.4397, "step": 9482 }, { "epoch": 25.04588973258501, "grad_norm": 964.4848022460938, "learning_rate": 0.00025071019951166437, "loss": 35.7739, "step": 9483 }, { "epoch": 25.048530868273357, "grad_norm": 534.6029663085938, "learning_rate": 0.000250668423172461, "loss": 34.1642, "step": 9484 }, { "epoch": 25.051172003961703, "grad_norm": 482.6654968261719, "learning_rate": 0.00025062664681459244, "loss": 34.8995, "step": 9485 }, { "epoch": 25.05381313965005, "grad_norm": 580.6810913085938, "learning_rate": 0.00025058487043922513, "loss": 35.0536, "step": 9486 }, { "epoch": 25.056454275338396, "grad_norm": 311.6623229980469, "learning_rate": 0.00025054309404752556, "loss": 36.115, "step": 9487 }, { "epoch": 25.059095411026743, "grad_norm": 2360.947021484375, "learning_rate": 0.00025050131764066055, "loss": 40.5833, "step": 9488 }, { "epoch": 25.06173654671509, "grad_norm": 758.7177124023438, "learning_rate": 0.00025045954121979643, "loss": 15.8971, "step": 9489 }, { "epoch": 25.064377682403432, "grad_norm": 1403.40380859375, "learning_rate": 0.00025041776478609997, "loss": 16.6248, "step": 9490 }, { "epoch": 25.06701881809178, "grad_norm": 5409.0556640625, "learning_rate": 0.00025037598834073754, "loss": 12.1598, "step": 9491 }, { "epoch": 25.069659953780125, "grad_norm": 513.4307250976562, "learning_rate": 0.00025033421188487597, "loss": 12.1303, "step": 9492 }, { "epoch": 25.07230108946847, "grad_norm": 2361.94091796875, "learning_rate": 0.00025029243541968164, "loss": 13.6828, "step": 9493 }, { "epoch": 25.074942225156818, "grad_norm": 10082.0107421875, "learning_rate": 0.0002502506589463213, "loss": 11.6066, "step": 9494 }, { "epoch": 25.077583360845164, "grad_norm": 3280.123046875, "learning_rate": 0.00025020888246596144, "loss": 15.5516, "step": 9495 }, { "epoch": 25.08022449653351, "grad_norm": 632.3145751953125, "learning_rate": 0.00025016710597976867, "loss": 11.3631, "step": 9496 }, { "epoch": 25.082865632221857, "grad_norm": 5019.3486328125, "learning_rate": 0.00025012532948890954, "loss": 14.3679, "step": 9497 }, { "epoch": 25.0855067679102, "grad_norm": 407.71502685546875, "learning_rate": 0.0002500835529945507, "loss": 22.6484, "step": 9498 }, { "epoch": 25.088147903598546, "grad_norm": 1153.7340087890625, "learning_rate": 0.0002500417764978587, "loss": 38.2749, "step": 9499 }, { "epoch": 25.090789039286893, "grad_norm": 937.9046630859375, "learning_rate": 0.00025, "loss": 35.1048, "step": 9500 }, { "epoch": 25.09343017497524, "grad_norm": 654.7449340820312, "learning_rate": 0.00024995822350214143, "loss": 36.3754, "step": 9501 }, { "epoch": 25.096071310663586, "grad_norm": 839.3206787109375, "learning_rate": 0.00024991644700544935, "loss": 35.4061, "step": 9502 }, { "epoch": 25.098712446351932, "grad_norm": 586.322265625, "learning_rate": 0.00024987467051109047, "loss": 35.9601, "step": 9503 }, { "epoch": 25.10135358204028, "grad_norm": 499.5295104980469, "learning_rate": 0.0002498328940202314, "loss": 34.5185, "step": 9504 }, { "epoch": 25.10399471772862, "grad_norm": 611.3779907226562, "learning_rate": 0.0002497911175340386, "loss": 34.1401, "step": 9505 }, { "epoch": 25.106635853416968, "grad_norm": 854.5836791992188, "learning_rate": 0.0002497493410536787, "loss": 35.3885, "step": 9506 }, { "epoch": 25.109276989105314, "grad_norm": 1302.5645751953125, "learning_rate": 0.00024970756458031837, "loss": 37.3595, "step": 9507 }, { "epoch": 25.11191812479366, "grad_norm": 574.968017578125, "learning_rate": 0.00024966578811512415, "loss": 35.2542, "step": 9508 }, { "epoch": 25.114559260482007, "grad_norm": 629.740478515625, "learning_rate": 0.0002496240116592624, "loss": 36.0756, "step": 9509 }, { "epoch": 25.117200396170354, "grad_norm": 759.2334594726562, "learning_rate": 0.0002495822352139001, "loss": 37.4843, "step": 9510 }, { "epoch": 25.1198415318587, "grad_norm": 893.3779296875, "learning_rate": 0.0002495404587802036, "loss": 34.5812, "step": 9511 }, { "epoch": 25.122482667547047, "grad_norm": 713.3433837890625, "learning_rate": 0.00024949868235933957, "loss": 37.3702, "step": 9512 }, { "epoch": 25.12512380323539, "grad_norm": 289.5477600097656, "learning_rate": 0.0002494569059524744, "loss": 35.3629, "step": 9513 }, { "epoch": 25.127764938923736, "grad_norm": 360.1915588378906, "learning_rate": 0.00024941512956077493, "loss": 36.4303, "step": 9514 }, { "epoch": 25.130406074612083, "grad_norm": 492.2436828613281, "learning_rate": 0.00024937335318540757, "loss": 37.999, "step": 9515 }, { "epoch": 25.13304721030043, "grad_norm": 632.3561401367188, "learning_rate": 0.00024933157682753896, "loss": 40.5406, "step": 9516 }, { "epoch": 25.135688345988775, "grad_norm": 987.2322387695312, "learning_rate": 0.0002492898004883357, "loss": 40.0595, "step": 9517 }, { "epoch": 25.138329481677122, "grad_norm": 714.896240234375, "learning_rate": 0.00024924802416896434, "loss": 40.2077, "step": 9518 }, { "epoch": 25.14097061736547, "grad_norm": 639.9889526367188, "learning_rate": 0.0002492062478705915, "loss": 40.3687, "step": 9519 }, { "epoch": 25.143611753053815, "grad_norm": 607.7054443359375, "learning_rate": 0.0002491644715943836, "loss": 42.9118, "step": 9520 }, { "epoch": 25.146252888742158, "grad_norm": 493.3522033691406, "learning_rate": 0.0002491226953415074, "loss": 45.0329, "step": 9521 }, { "epoch": 25.148894024430504, "grad_norm": 484.6133117675781, "learning_rate": 0.00024908091911312946, "loss": 43.5314, "step": 9522 }, { "epoch": 25.15153516011885, "grad_norm": 396.6326904296875, "learning_rate": 0.0002490391429104162, "loss": 42.4159, "step": 9523 }, { "epoch": 25.154176295807197, "grad_norm": 586.5709228515625, "learning_rate": 0.00024899736673453444, "loss": 41.6272, "step": 9524 }, { "epoch": 25.156817431495544, "grad_norm": 859.0074462890625, "learning_rate": 0.00024895559058665063, "loss": 39.6756, "step": 9525 }, { "epoch": 25.15945856718389, "grad_norm": 1025.541259765625, "learning_rate": 0.00024891381446793127, "loss": 39.9393, "step": 9526 }, { "epoch": 25.162099702872236, "grad_norm": 280.40679931640625, "learning_rate": 0.00024887203837954295, "loss": 38.2663, "step": 9527 }, { "epoch": 25.16474083856058, "grad_norm": 388.1039123535156, "learning_rate": 0.0002488302623226524, "loss": 38.7264, "step": 9528 }, { "epoch": 25.167381974248926, "grad_norm": 337.84368896484375, "learning_rate": 0.0002487884862984261, "loss": 38.0031, "step": 9529 }, { "epoch": 25.170023109937272, "grad_norm": 335.89239501953125, "learning_rate": 0.0002487467103080305, "loss": 36.9632, "step": 9530 }, { "epoch": 25.17266424562562, "grad_norm": 255.45840454101562, "learning_rate": 0.00024870493435263236, "loss": 36.3501, "step": 9531 }, { "epoch": 25.175305381313965, "grad_norm": 579.4744262695312, "learning_rate": 0.00024866315843339817, "loss": 36.4535, "step": 9532 }, { "epoch": 25.17794651700231, "grad_norm": 358.73236083984375, "learning_rate": 0.0002486213825514945, "loss": 36.5825, "step": 9533 }, { "epoch": 25.180587652690658, "grad_norm": 239.34397888183594, "learning_rate": 0.0002485796067080878, "loss": 35.724, "step": 9534 }, { "epoch": 25.183228788379004, "grad_norm": 200.65614318847656, "learning_rate": 0.00024853783090434485, "loss": 34.9001, "step": 9535 }, { "epoch": 25.185869924067347, "grad_norm": 305.9940490722656, "learning_rate": 0.0002484960551414321, "loss": 35.6239, "step": 9536 }, { "epoch": 25.188511059755694, "grad_norm": 278.1329650878906, "learning_rate": 0.0002484542794205161, "loss": 33.7516, "step": 9537 }, { "epoch": 25.19115219544404, "grad_norm": 1359.8399658203125, "learning_rate": 0.0002484125037427635, "loss": 45.0612, "step": 9538 }, { "epoch": 25.193793331132387, "grad_norm": 8951.658203125, "learning_rate": 0.00024837072810934076, "loss": 21.1154, "step": 9539 }, { "epoch": 25.196434466820733, "grad_norm": 1335.9417724609375, "learning_rate": 0.0002483289525214145, "loss": 16.8794, "step": 9540 }, { "epoch": 25.19907560250908, "grad_norm": 2500.403076171875, "learning_rate": 0.00024828717698015116, "loss": 16.4149, "step": 9541 }, { "epoch": 25.201716738197426, "grad_norm": 2738.8466796875, "learning_rate": 0.0002482454014867176, "loss": 15.765, "step": 9542 }, { "epoch": 25.204357873885773, "grad_norm": 1185.351806640625, "learning_rate": 0.00024820362604228004, "loss": 18.0574, "step": 9543 }, { "epoch": 25.206999009574115, "grad_norm": 5971.09765625, "learning_rate": 0.00024816185064800505, "loss": 10.606, "step": 9544 }, { "epoch": 25.209640145262462, "grad_norm": 436.3620300292969, "learning_rate": 0.0002481200753050595, "loss": 15.5852, "step": 9545 }, { "epoch": 25.21228128095081, "grad_norm": 753.8600463867188, "learning_rate": 0.00024807830001460966, "loss": 9.868, "step": 9546 }, { "epoch": 25.214922416639155, "grad_norm": 614.2459106445312, "learning_rate": 0.00024803652477782225, "loss": 15.9661, "step": 9547 }, { "epoch": 25.2175635523275, "grad_norm": 489.88861083984375, "learning_rate": 0.0002479947495958636, "loss": 37.0782, "step": 9548 }, { "epoch": 25.220204688015848, "grad_norm": 437.51373291015625, "learning_rate": 0.0002479529744699004, "loss": 35.0462, "step": 9549 }, { "epoch": 25.222845823704194, "grad_norm": 967.2049560546875, "learning_rate": 0.0002479111994010993, "loss": 34.46, "step": 9550 }, { "epoch": 25.225486959392537, "grad_norm": 457.2424621582031, "learning_rate": 0.0002478694243906266, "loss": 35.7425, "step": 9551 }, { "epoch": 25.228128095080883, "grad_norm": 448.7281188964844, "learning_rate": 0.00024782764943964903, "loss": 36.0133, "step": 9552 }, { "epoch": 25.23076923076923, "grad_norm": 749.3480224609375, "learning_rate": 0.0002477858745493331, "loss": 36.3453, "step": 9553 }, { "epoch": 25.233410366457576, "grad_norm": 591.5284423828125, "learning_rate": 0.00024774409972084534, "loss": 36.7285, "step": 9554 }, { "epoch": 25.236051502145923, "grad_norm": 485.65838623046875, "learning_rate": 0.0002477023249553522, "loss": 35.6506, "step": 9555 }, { "epoch": 25.23869263783427, "grad_norm": 344.6423034667969, "learning_rate": 0.0002476605502540203, "loss": 34.7533, "step": 9556 }, { "epoch": 25.241333773522616, "grad_norm": 858.4865112304688, "learning_rate": 0.0002476187756180162, "loss": 35.9835, "step": 9557 }, { "epoch": 25.243974909210962, "grad_norm": 481.6986083984375, "learning_rate": 0.0002475770010485063, "loss": 36.2593, "step": 9558 }, { "epoch": 25.246616044899305, "grad_norm": 979.0288696289062, "learning_rate": 0.00024753522654665736, "loss": 35.6741, "step": 9559 }, { "epoch": 25.24925718058765, "grad_norm": 711.2951049804688, "learning_rate": 0.00024749345211363575, "loss": 35.7602, "step": 9560 }, { "epoch": 25.251898316275998, "grad_norm": 475.3460388183594, "learning_rate": 0.00024745167775060795, "loss": 34.3697, "step": 9561 }, { "epoch": 25.254539451964344, "grad_norm": 568.0121459960938, "learning_rate": 0.0002474099034587405, "loss": 34.6347, "step": 9562 }, { "epoch": 25.25718058765269, "grad_norm": 3902.48046875, "learning_rate": 0.00024736812923920005, "loss": 35.1041, "step": 9563 }, { "epoch": 25.259821723341037, "grad_norm": 3437.33740234375, "learning_rate": 0.0002473263550931531, "loss": 36.9116, "step": 9564 }, { "epoch": 25.262462859029384, "grad_norm": 849.5951538085938, "learning_rate": 0.000247284581021766, "loss": 38.2992, "step": 9565 }, { "epoch": 25.26510399471773, "grad_norm": 2916.27197265625, "learning_rate": 0.00024724280702620547, "loss": 40.4773, "step": 9566 }, { "epoch": 25.267745130406073, "grad_norm": 1370.6737060546875, "learning_rate": 0.0002472010331076379, "loss": 40.1594, "step": 9567 }, { "epoch": 25.27038626609442, "grad_norm": 574.1068725585938, "learning_rate": 0.0002471592592672298, "loss": 40.0617, "step": 9568 }, { "epoch": 25.273027401782766, "grad_norm": 318.0805969238281, "learning_rate": 0.00024711748550614775, "loss": 40.8939, "step": 9569 }, { "epoch": 25.275668537471113, "grad_norm": 282.48944091796875, "learning_rate": 0.0002470757118255582, "loss": 41.3053, "step": 9570 }, { "epoch": 25.27830967315946, "grad_norm": 763.5269775390625, "learning_rate": 0.0002470339382266278, "loss": 41.9774, "step": 9571 }, { "epoch": 25.280950808847805, "grad_norm": 723.21435546875, "learning_rate": 0.00024699216471052275, "loss": 41.627, "step": 9572 }, { "epoch": 25.283591944536152, "grad_norm": 946.2518310546875, "learning_rate": 0.00024695039127840986, "loss": 43.9037, "step": 9573 }, { "epoch": 25.2862330802245, "grad_norm": 473.75103759765625, "learning_rate": 0.0002469086179314555, "loss": 41.8058, "step": 9574 }, { "epoch": 25.28887421591284, "grad_norm": 376.5799560546875, "learning_rate": 0.000246866844670826, "loss": 39.5268, "step": 9575 }, { "epoch": 25.291515351601188, "grad_norm": 854.25244140625, "learning_rate": 0.0002468250714976882, "loss": 39.5919, "step": 9576 }, { "epoch": 25.294156487289534, "grad_norm": 408.45550537109375, "learning_rate": 0.0002467832984132084, "loss": 40.486, "step": 9577 }, { "epoch": 25.29679762297788, "grad_norm": 617.3741455078125, "learning_rate": 0.00024674152541855315, "loss": 37.575, "step": 9578 }, { "epoch": 25.299438758666227, "grad_norm": 572.9140014648438, "learning_rate": 0.0002466997525148887, "loss": 38.9326, "step": 9579 }, { "epoch": 25.302079894354573, "grad_norm": 861.7550048828125, "learning_rate": 0.00024665797970338185, "loss": 35.3685, "step": 9580 }, { "epoch": 25.30472103004292, "grad_norm": 552.7175903320312, "learning_rate": 0.00024661620698519896, "loss": 37.2692, "step": 9581 }, { "epoch": 25.307362165731263, "grad_norm": 435.2423095703125, "learning_rate": 0.0002465744343615064, "loss": 36.9237, "step": 9582 }, { "epoch": 25.31000330141961, "grad_norm": 347.69256591796875, "learning_rate": 0.0002465326618334709, "loss": 34.9876, "step": 9583 }, { "epoch": 25.312644437107956, "grad_norm": 754.4638061523438, "learning_rate": 0.0002464908894022587, "loss": 36.1778, "step": 9584 }, { "epoch": 25.315285572796302, "grad_norm": 800.4290771484375, "learning_rate": 0.00024644911706903636, "loss": 36.1081, "step": 9585 }, { "epoch": 25.31792670848465, "grad_norm": 508.4793701171875, "learning_rate": 0.0002464073448349703, "loss": 36.5523, "step": 9586 }, { "epoch": 25.320567844172995, "grad_norm": 331.4502868652344, "learning_rate": 0.00024636557270122706, "loss": 36.0996, "step": 9587 }, { "epoch": 25.32320897986134, "grad_norm": 790.4620361328125, "learning_rate": 0.0002463238006689731, "loss": 43.2261, "step": 9588 }, { "epoch": 25.325850115549688, "grad_norm": 2127.23779296875, "learning_rate": 0.00024628202873937477, "loss": 20.2313, "step": 9589 }, { "epoch": 25.32849125123803, "grad_norm": 6030.1806640625, "learning_rate": 0.0002462402569135987, "loss": 18.6896, "step": 9590 }, { "epoch": 25.331132386926377, "grad_norm": 1856.003173828125, "learning_rate": 0.00024619848519281116, "loss": 18.5155, "step": 9591 }, { "epoch": 25.333773522614724, "grad_norm": 6162.828125, "learning_rate": 0.0002461567135781788, "loss": 16.1685, "step": 9592 }, { "epoch": 25.33641465830307, "grad_norm": 6106.3017578125, "learning_rate": 0.0002461149420708678, "loss": 16.1571, "step": 9593 }, { "epoch": 25.339055793991417, "grad_norm": 7184.056640625, "learning_rate": 0.0002460731706720449, "loss": 15.019, "step": 9594 }, { "epoch": 25.341696929679763, "grad_norm": 3952.50244140625, "learning_rate": 0.00024603139938287634, "loss": 13.2945, "step": 9595 }, { "epoch": 25.34433806536811, "grad_norm": 2406.1982421875, "learning_rate": 0.00024598962820452855, "loss": 18.342, "step": 9596 }, { "epoch": 25.346979201056456, "grad_norm": 1954.9869384765625, "learning_rate": 0.0002459478571381681, "loss": 19.1835, "step": 9597 }, { "epoch": 25.3496203367448, "grad_norm": 3419.968017578125, "learning_rate": 0.0002459060861849614, "loss": 26.4434, "step": 9598 }, { "epoch": 25.352261472433145, "grad_norm": 1039.411376953125, "learning_rate": 0.0002458643153460748, "loss": 35.1152, "step": 9599 }, { "epoch": 25.354902608121492, "grad_norm": 667.756103515625, "learning_rate": 0.00024582254462267474, "loss": 34.5682, "step": 9600 }, { "epoch": 25.354902608121492, "eval_loss": 3.84483003616333, "eval_runtime": 2.1823, "eval_samples_per_second": 226.825, "eval_steps_per_second": 28.41, "step": 9600 }, { "epoch": 25.35754374380984, "grad_norm": 1017.6758422851562, "learning_rate": 0.00024578077401592776, "loss": 36.9818, "step": 9601 }, { "epoch": 25.360184879498185, "grad_norm": 2036.3612060546875, "learning_rate": 0.0002457390035270001, "loss": 34.1943, "step": 9602 }, { "epoch": 25.36282601518653, "grad_norm": 987.655517578125, "learning_rate": 0.0002456972331570583, "loss": 35.8644, "step": 9603 }, { "epoch": 25.365467150874878, "grad_norm": 792.977783203125, "learning_rate": 0.0002456554629072687, "loss": 35.1567, "step": 9604 }, { "epoch": 25.36810828656322, "grad_norm": 1615.9698486328125, "learning_rate": 0.0002456136927787978, "loss": 35.1927, "step": 9605 }, { "epoch": 25.370749422251567, "grad_norm": 620.724365234375, "learning_rate": 0.000245571922772812, "loss": 35.373, "step": 9606 }, { "epoch": 25.373390557939913, "grad_norm": 1443.17822265625, "learning_rate": 0.0002455301528904775, "loss": 35.5329, "step": 9607 }, { "epoch": 25.37603169362826, "grad_norm": 1146.6527099609375, "learning_rate": 0.00024548838313296094, "loss": 36.5353, "step": 9608 }, { "epoch": 25.378672829316606, "grad_norm": 911.658935546875, "learning_rate": 0.00024544661350142866, "loss": 35.9708, "step": 9609 }, { "epoch": 25.381313965004953, "grad_norm": 667.6551513671875, "learning_rate": 0.0002454048439970469, "loss": 34.6529, "step": 9610 }, { "epoch": 25.3839551006933, "grad_norm": 1377.469482421875, "learning_rate": 0.00024536307462098226, "loss": 35.0229, "step": 9611 }, { "epoch": 25.386596236381646, "grad_norm": 1014.5332641601562, "learning_rate": 0.0002453213053744011, "loss": 34.9347, "step": 9612 }, { "epoch": 25.38923737206999, "grad_norm": 1156.5576171875, "learning_rate": 0.0002452795362584697, "loss": 35.7566, "step": 9613 }, { "epoch": 25.391878507758335, "grad_norm": 1219.25732421875, "learning_rate": 0.0002452377672743544, "loss": 35.4984, "step": 9614 }, { "epoch": 25.39451964344668, "grad_norm": 1475.5496826171875, "learning_rate": 0.00024519599842322173, "loss": 38.0348, "step": 9615 }, { "epoch": 25.397160779135028, "grad_norm": 932.336181640625, "learning_rate": 0.00024515422970623795, "loss": 43.5414, "step": 9616 }, { "epoch": 25.399801914823374, "grad_norm": 809.8297729492188, "learning_rate": 0.00024511246112456943, "loss": 41.2125, "step": 9617 }, { "epoch": 25.40244305051172, "grad_norm": 518.7332763671875, "learning_rate": 0.0002450706926793826, "loss": 39.6047, "step": 9618 }, { "epoch": 25.405084186200067, "grad_norm": 421.1401062011719, "learning_rate": 0.0002450289243718438, "loss": 40.6229, "step": 9619 }, { "epoch": 25.407725321888414, "grad_norm": 766.6721801757812, "learning_rate": 0.00024498715620311935, "loss": 41.4378, "step": 9620 }, { "epoch": 25.410366457576757, "grad_norm": 639.9434204101562, "learning_rate": 0.0002449453881743756, "loss": 42.0478, "step": 9621 }, { "epoch": 25.413007593265103, "grad_norm": 746.0946655273438, "learning_rate": 0.00024490362028677895, "loss": 43.6641, "step": 9622 }, { "epoch": 25.41564872895345, "grad_norm": 2536.498046875, "learning_rate": 0.0002448618525414957, "loss": 42.2594, "step": 9623 }, { "epoch": 25.418289864641796, "grad_norm": 1460.0660400390625, "learning_rate": 0.00024482008493969214, "loss": 39.0277, "step": 9624 }, { "epoch": 25.420931000330143, "grad_norm": 788.7477416992188, "learning_rate": 0.00024477831748253475, "loss": 40.4846, "step": 9625 }, { "epoch": 25.42357213601849, "grad_norm": 602.748046875, "learning_rate": 0.0002447365501711898, "loss": 38.0108, "step": 9626 }, { "epoch": 25.426213271706835, "grad_norm": 629.4589233398438, "learning_rate": 0.0002446947830068235, "loss": 38.8278, "step": 9627 }, { "epoch": 25.42885440739518, "grad_norm": 1150.5362548828125, "learning_rate": 0.00024465301599060224, "loss": 39.6477, "step": 9628 }, { "epoch": 25.431495543083525, "grad_norm": 672.85986328125, "learning_rate": 0.00024461124912369253, "loss": 37.188, "step": 9629 }, { "epoch": 25.43413667877187, "grad_norm": 500.0916442871094, "learning_rate": 0.00024456948240726044, "loss": 36.7216, "step": 9630 }, { "epoch": 25.436777814460218, "grad_norm": 1042.579345703125, "learning_rate": 0.00024452771584247224, "loss": 37.0316, "step": 9631 }, { "epoch": 25.439418950148564, "grad_norm": 2467.825439453125, "learning_rate": 0.00024448594943049447, "loss": 35.0515, "step": 9632 }, { "epoch": 25.44206008583691, "grad_norm": 756.578369140625, "learning_rate": 0.00024444418317249336, "loss": 36.7642, "step": 9633 }, { "epoch": 25.444701221525257, "grad_norm": 637.714599609375, "learning_rate": 0.00024440241706963513, "loss": 34.9916, "step": 9634 }, { "epoch": 25.447342357213603, "grad_norm": 1207.7451171875, "learning_rate": 0.0002443606511230861, "loss": 35.6613, "step": 9635 }, { "epoch": 25.449983492901946, "grad_norm": 599.9757690429688, "learning_rate": 0.00024431888533401263, "loss": 36.1939, "step": 9636 }, { "epoch": 25.452624628590293, "grad_norm": 712.6520385742188, "learning_rate": 0.0002442771197035809, "loss": 35.7313, "step": 9637 }, { "epoch": 25.45526576427864, "grad_norm": 7148.3291015625, "learning_rate": 0.00024423535423295724, "loss": 30.9722, "step": 9638 }, { "epoch": 25.457906899966986, "grad_norm": 5663.2568359375, "learning_rate": 0.00024419358892330793, "loss": 14.5951, "step": 9639 }, { "epoch": 25.460548035655332, "grad_norm": 11014.3876953125, "learning_rate": 0.00024415182377579927, "loss": 13.2339, "step": 9640 }, { "epoch": 25.46318917134368, "grad_norm": 1701.8935546875, "learning_rate": 0.0002441100587915975, "loss": 21.1431, "step": 9641 }, { "epoch": 25.465830307032025, "grad_norm": 941.354736328125, "learning_rate": 0.00024406829397186883, "loss": 15.4657, "step": 9642 }, { "epoch": 25.46847144272037, "grad_norm": 4786.50830078125, "learning_rate": 0.0002440265293177796, "loss": 21.4922, "step": 9643 }, { "epoch": 25.471112578408714, "grad_norm": 5277.287109375, "learning_rate": 0.00024398476483049607, "loss": 12.0175, "step": 9644 }, { "epoch": 25.47375371409706, "grad_norm": 1359.392822265625, "learning_rate": 0.0002439430005111843, "loss": 13.4925, "step": 9645 }, { "epoch": 25.476394849785407, "grad_norm": 1904.53125, "learning_rate": 0.00024390123636101077, "loss": 17.2655, "step": 9646 }, { "epoch": 25.479035985473754, "grad_norm": 2807.89306640625, "learning_rate": 0.00024385947238114174, "loss": 11.7979, "step": 9647 }, { "epoch": 25.4816771211621, "grad_norm": 554.2650756835938, "learning_rate": 0.00024381770857274325, "loss": 32.5654, "step": 9648 }, { "epoch": 25.484318256850447, "grad_norm": 551.1719360351562, "learning_rate": 0.00024377594493698153, "loss": 36.3784, "step": 9649 }, { "epoch": 25.486959392538793, "grad_norm": 656.0079956054688, "learning_rate": 0.00024373418147502298, "loss": 35.8646, "step": 9650 }, { "epoch": 25.489600528227136, "grad_norm": 1110.2120361328125, "learning_rate": 0.00024369241818803373, "loss": 36.6367, "step": 9651 }, { "epoch": 25.492241663915483, "grad_norm": 898.0217895507812, "learning_rate": 0.00024365065507717993, "loss": 35.6731, "step": 9652 }, { "epoch": 25.49488279960383, "grad_norm": 404.636962890625, "learning_rate": 0.00024360889214362794, "loss": 34.8245, "step": 9653 }, { "epoch": 25.497523935292175, "grad_norm": 517.3562622070312, "learning_rate": 0.00024356712938854386, "loss": 36.059, "step": 9654 }, { "epoch": 25.500165070980522, "grad_norm": 853.8151245117188, "learning_rate": 0.00024352536681309395, "loss": 35.2939, "step": 9655 }, { "epoch": 25.50280620666887, "grad_norm": 533.84716796875, "learning_rate": 0.0002434836044184443, "loss": 35.5323, "step": 9656 }, { "epoch": 25.505447342357215, "grad_norm": 637.78759765625, "learning_rate": 0.0002434418422057612, "loss": 34.3306, "step": 9657 }, { "epoch": 25.50808847804556, "grad_norm": 4440.41552734375, "learning_rate": 0.0002434000801762108, "loss": 34.2197, "step": 9658 }, { "epoch": 25.510729613733904, "grad_norm": 782.248291015625, "learning_rate": 0.00024335831833095925, "loss": 37.0611, "step": 9659 }, { "epoch": 25.51337074942225, "grad_norm": 789.945556640625, "learning_rate": 0.0002433165566711728, "loss": 34.6492, "step": 9660 }, { "epoch": 25.516011885110597, "grad_norm": 754.0993041992188, "learning_rate": 0.0002432747951980176, "loss": 35.6273, "step": 9661 }, { "epoch": 25.518653020798943, "grad_norm": 483.8314208984375, "learning_rate": 0.0002432330339126598, "loss": 34.8152, "step": 9662 }, { "epoch": 25.52129415648729, "grad_norm": 663.4259033203125, "learning_rate": 0.0002431912728162654, "loss": 35.5033, "step": 9663 }, { "epoch": 25.523935292175636, "grad_norm": 716.2755737304688, "learning_rate": 0.0002431495119100009, "loss": 37.2505, "step": 9664 }, { "epoch": 25.526576427863983, "grad_norm": 713.8256225585938, "learning_rate": 0.0002431077511950322, "loss": 38.0002, "step": 9665 }, { "epoch": 25.52921756355233, "grad_norm": 641.4161376953125, "learning_rate": 0.0002430659906725254, "loss": 41.7187, "step": 9666 }, { "epoch": 25.531858699240672, "grad_norm": 228.12876892089844, "learning_rate": 0.00024302423034364685, "loss": 37.8971, "step": 9667 }, { "epoch": 25.53449983492902, "grad_norm": 416.2582092285156, "learning_rate": 0.00024298247020956254, "loss": 39.9772, "step": 9668 }, { "epoch": 25.537140970617365, "grad_norm": 301.2333679199219, "learning_rate": 0.00024294071027143864, "loss": 39.7556, "step": 9669 }, { "epoch": 25.53978210630571, "grad_norm": 939.2313232421875, "learning_rate": 0.00024289895053044119, "loss": 40.1655, "step": 9670 }, { "epoch": 25.542423241994058, "grad_norm": 642.006591796875, "learning_rate": 0.00024285719098773645, "loss": 44.5769, "step": 9671 }, { "epoch": 25.545064377682404, "grad_norm": 525.9799194335938, "learning_rate": 0.00024281543164449045, "loss": 46.0128, "step": 9672 }, { "epoch": 25.54770551337075, "grad_norm": 382.5596008300781, "learning_rate": 0.00024277367250186927, "loss": 41.3015, "step": 9673 }, { "epoch": 25.550346649059094, "grad_norm": 522.6394653320312, "learning_rate": 0.00024273191356103904, "loss": 42.2347, "step": 9674 }, { "epoch": 25.55298778474744, "grad_norm": 236.09994506835938, "learning_rate": 0.00024269015482316592, "loss": 39.0506, "step": 9675 }, { "epoch": 25.555628920435787, "grad_norm": 861.8197021484375, "learning_rate": 0.00024264839628941587, "loss": 39.521, "step": 9676 }, { "epoch": 25.558270056124133, "grad_norm": 877.677734375, "learning_rate": 0.00024260663796095503, "loss": 39.9539, "step": 9677 }, { "epoch": 25.56091119181248, "grad_norm": 264.1480407714844, "learning_rate": 0.00024256487983894952, "loss": 38.4789, "step": 9678 }, { "epoch": 25.563552327500826, "grad_norm": 257.27032470703125, "learning_rate": 0.00024252312192456536, "loss": 37.4664, "step": 9679 }, { "epoch": 25.566193463189173, "grad_norm": 694.4932861328125, "learning_rate": 0.00024248136421896853, "loss": 37.1112, "step": 9680 }, { "epoch": 25.56883459887752, "grad_norm": 289.4815673828125, "learning_rate": 0.0002424396067233253, "loss": 34.8736, "step": 9681 }, { "epoch": 25.571475734565862, "grad_norm": 594.9862060546875, "learning_rate": 0.00024239784943880168, "loss": 35.7819, "step": 9682 }, { "epoch": 25.57411687025421, "grad_norm": 301.0420837402344, "learning_rate": 0.00024235609236656356, "loss": 37.0212, "step": 9683 }, { "epoch": 25.576758005942555, "grad_norm": 289.84002685546875, "learning_rate": 0.00024231433550777695, "loss": 36.0455, "step": 9684 }, { "epoch": 25.5793991416309, "grad_norm": 514.5908813476562, "learning_rate": 0.00024227257886360814, "loss": 36.1483, "step": 9685 }, { "epoch": 25.582040277319248, "grad_norm": 273.67303466796875, "learning_rate": 0.00024223082243522305, "loss": 35.4044, "step": 9686 }, { "epoch": 25.584681413007594, "grad_norm": 439.3143310546875, "learning_rate": 0.0002421890662237876, "loss": 36.3691, "step": 9687 }, { "epoch": 25.58732254869594, "grad_norm": 868.870361328125, "learning_rate": 0.00024214731023046793, "loss": 49.0535, "step": 9688 }, { "epoch": 25.589963684384287, "grad_norm": 1265.52880859375, "learning_rate": 0.00024210555445643004, "loss": 17.8403, "step": 9689 }, { "epoch": 25.59260482007263, "grad_norm": 831.1594848632812, "learning_rate": 0.0002420637989028399, "loss": 16.3068, "step": 9690 }, { "epoch": 25.595245955760976, "grad_norm": 747.990478515625, "learning_rate": 0.0002420220435708634, "loss": 14.4115, "step": 9691 }, { "epoch": 25.597887091449323, "grad_norm": 6511.56982421875, "learning_rate": 0.00024198028846166678, "loss": 13.4168, "step": 9692 }, { "epoch": 25.60052822713767, "grad_norm": 1526.0457763671875, "learning_rate": 0.00024193853357641585, "loss": 11.7497, "step": 9693 }, { "epoch": 25.603169362826016, "grad_norm": 973.4998168945312, "learning_rate": 0.0002418967789162766, "loss": 14.9878, "step": 9694 }, { "epoch": 25.605810498514362, "grad_norm": 780.747802734375, "learning_rate": 0.00024185502448241509, "loss": 15.0382, "step": 9695 }, { "epoch": 25.60845163420271, "grad_norm": 2162.437744140625, "learning_rate": 0.00024181327027599723, "loss": 14.1271, "step": 9696 }, { "epoch": 25.61109276989105, "grad_norm": 215.41156005859375, "learning_rate": 0.00024177151629818885, "loss": 12.5511, "step": 9697 }, { "epoch": 25.613733905579398, "grad_norm": 1852.104736328125, "learning_rate": 0.0002417297625501562, "loss": 22.6147, "step": 9698 }, { "epoch": 25.616375041267744, "grad_norm": 953.2186889648438, "learning_rate": 0.00024168800903306512, "loss": 36.7045, "step": 9699 }, { "epoch": 25.61901617695609, "grad_norm": 819.2453002929688, "learning_rate": 0.00024164625574808144, "loss": 36.4446, "step": 9700 }, { "epoch": 25.621657312644437, "grad_norm": 441.0847473144531, "learning_rate": 0.00024160450269637105, "loss": 36.0162, "step": 9701 }, { "epoch": 25.624298448332784, "grad_norm": 890.8834228515625, "learning_rate": 0.0002415627498791001, "loss": 36.3482, "step": 9702 }, { "epoch": 25.62693958402113, "grad_norm": 383.4891357421875, "learning_rate": 0.0002415209972974344, "loss": 36.4179, "step": 9703 }, { "epoch": 25.629580719709477, "grad_norm": 502.99114990234375, "learning_rate": 0.00024147924495253982, "loss": 36.7762, "step": 9704 }, { "epoch": 25.63222185539782, "grad_norm": 395.47039794921875, "learning_rate": 0.00024143749284558235, "loss": 36.3538, "step": 9705 }, { "epoch": 25.634862991086166, "grad_norm": 436.6257019042969, "learning_rate": 0.0002413957409777279, "loss": 35.6578, "step": 9706 }, { "epoch": 25.637504126774513, "grad_norm": 523.091064453125, "learning_rate": 0.00024135398935014232, "loss": 35.57, "step": 9707 }, { "epoch": 25.64014526246286, "grad_norm": 461.2916564941406, "learning_rate": 0.00024131223796399146, "loss": 35.8486, "step": 9708 }, { "epoch": 25.642786398151205, "grad_norm": 389.3424072265625, "learning_rate": 0.00024127048682044127, "loss": 35.8927, "step": 9709 }, { "epoch": 25.645427533839552, "grad_norm": 335.3671875, "learning_rate": 0.00024122873592065763, "loss": 35.1661, "step": 9710 }, { "epoch": 25.6480686695279, "grad_norm": 1150.975830078125, "learning_rate": 0.00024118698526580634, "loss": 35.394, "step": 9711 }, { "epoch": 25.650709805216245, "grad_norm": 528.4398803710938, "learning_rate": 0.00024114523485705336, "loss": 35.098, "step": 9712 }, { "epoch": 25.653350940904588, "grad_norm": 778.2303466796875, "learning_rate": 0.0002411034846955645, "loss": 35.2651, "step": 9713 }, { "epoch": 25.655992076592934, "grad_norm": 424.43389892578125, "learning_rate": 0.0002410617347825056, "loss": 37.8479, "step": 9714 }, { "epoch": 25.65863321228128, "grad_norm": 877.7481079101562, "learning_rate": 0.00024101998511904242, "loss": 37.0165, "step": 9715 }, { "epoch": 25.661274347969627, "grad_norm": 2107.5234375, "learning_rate": 0.00024097823570634096, "loss": 43.5156, "step": 9716 }, { "epoch": 25.663915483657973, "grad_norm": 419.83221435546875, "learning_rate": 0.00024093648654556702, "loss": 40.5992, "step": 9717 }, { "epoch": 25.66655661934632, "grad_norm": 503.3124694824219, "learning_rate": 0.0002408947376378862, "loss": 40.6515, "step": 9718 }, { "epoch": 25.669197755034666, "grad_norm": 693.7396240234375, "learning_rate": 0.0002408529889844646, "loss": 42.2717, "step": 9719 }, { "epoch": 25.67183889072301, "grad_norm": 301.7452697753906, "learning_rate": 0.0002408112405864679, "loss": 42.9272, "step": 9720 }, { "epoch": 25.674480026411356, "grad_norm": 803.7210693359375, "learning_rate": 0.00024076949244506194, "loss": 43.8463, "step": 9721 }, { "epoch": 25.677121162099702, "grad_norm": 355.95831298828125, "learning_rate": 0.0002407277445614124, "loss": 43.4029, "step": 9722 }, { "epoch": 25.67976229778805, "grad_norm": 327.216064453125, "learning_rate": 0.00024068599693668523, "loss": 41.4909, "step": 9723 }, { "epoch": 25.682403433476395, "grad_norm": 1023.0106201171875, "learning_rate": 0.0002406442495720461, "loss": 41.1852, "step": 9724 }, { "epoch": 25.68504456916474, "grad_norm": 394.24725341796875, "learning_rate": 0.00024060250246866075, "loss": 42.2951, "step": 9725 }, { "epoch": 25.687685704853088, "grad_norm": 281.4678039550781, "learning_rate": 0.00024056075562769506, "loss": 39.5435, "step": 9726 }, { "epoch": 25.690326840541434, "grad_norm": 837.0789794921875, "learning_rate": 0.00024051900905031472, "loss": 38.8037, "step": 9727 }, { "epoch": 25.692967976229777, "grad_norm": 302.9608154296875, "learning_rate": 0.00024047726273768546, "loss": 39.7702, "step": 9728 }, { "epoch": 25.695609111918124, "grad_norm": 260.9415283203125, "learning_rate": 0.00024043551669097302, "loss": 38.1598, "step": 9729 }, { "epoch": 25.69825024760647, "grad_norm": 225.397216796875, "learning_rate": 0.0002403937709113432, "loss": 36.6083, "step": 9730 }, { "epoch": 25.700891383294817, "grad_norm": 436.08746337890625, "learning_rate": 0.0002403520253999617, "loss": 36.5467, "step": 9731 }, { "epoch": 25.703532518983163, "grad_norm": 706.8341674804688, "learning_rate": 0.00024031028015799412, "loss": 36.5712, "step": 9732 }, { "epoch": 25.70617365467151, "grad_norm": 201.2654266357422, "learning_rate": 0.00024026853518660635, "loss": 34.6829, "step": 9733 }, { "epoch": 25.708814790359856, "grad_norm": 330.4819641113281, "learning_rate": 0.00024022679048696412, "loss": 35.2006, "step": 9734 }, { "epoch": 25.711455926048203, "grad_norm": 312.4542236328125, "learning_rate": 0.00024018504606023293, "loss": 35.4939, "step": 9735 }, { "epoch": 25.714097061736545, "grad_norm": 274.3722839355469, "learning_rate": 0.00024014330190757845, "loss": 35.2985, "step": 9736 }, { "epoch": 25.716738197424892, "grad_norm": 343.1868896484375, "learning_rate": 0.00024010155803016661, "loss": 37.6176, "step": 9737 }, { "epoch": 25.71937933311324, "grad_norm": 1017.2772827148438, "learning_rate": 0.00024005981442916293, "loss": 49.6362, "step": 9738 }, { "epoch": 25.722020468801585, "grad_norm": 2863.701416015625, "learning_rate": 0.00024001807110573303, "loss": 41.5863, "step": 9739 }, { "epoch": 25.72466160448993, "grad_norm": 1800.94873046875, "learning_rate": 0.00023997632806104272, "loss": 15.1327, "step": 9740 }, { "epoch": 25.727302740178278, "grad_norm": 7495.59912109375, "learning_rate": 0.00023993458529625753, "loss": 23.4464, "step": 9741 }, { "epoch": 25.729943875866624, "grad_norm": 2964.497314453125, "learning_rate": 0.00023989284281254313, "loss": 19.2385, "step": 9742 }, { "epoch": 25.732585011554967, "grad_norm": 985.4930419921875, "learning_rate": 0.00023985110061106512, "loss": 19.2944, "step": 9743 }, { "epoch": 25.735226147243313, "grad_norm": 953.3027954101562, "learning_rate": 0.00023980935869298922, "loss": 11.802, "step": 9744 }, { "epoch": 25.73786728293166, "grad_norm": 659.3256225585938, "learning_rate": 0.000239767617059481, "loss": 17.8154, "step": 9745 }, { "epoch": 25.740508418620006, "grad_norm": 1942.9166259765625, "learning_rate": 0.000239725875711706, "loss": 16.5556, "step": 9746 }, { "epoch": 25.743149554308353, "grad_norm": 2205.136474609375, "learning_rate": 0.00023968413465082992, "loss": 18.2711, "step": 9747 }, { "epoch": 25.7457906899967, "grad_norm": 1470.204833984375, "learning_rate": 0.00023964239387801834, "loss": 32.1328, "step": 9748 }, { "epoch": 25.748431825685046, "grad_norm": 453.1188659667969, "learning_rate": 0.0002396006533944368, "loss": 36.1532, "step": 9749 }, { "epoch": 25.751072961373392, "grad_norm": 678.5189208984375, "learning_rate": 0.00023955891320125085, "loss": 36.6299, "step": 9750 }, { "epoch": 25.753714097061735, "grad_norm": 529.646728515625, "learning_rate": 0.00023951717329962616, "loss": 35.9047, "step": 9751 }, { "epoch": 25.75635523275008, "grad_norm": 671.7742919921875, "learning_rate": 0.00023947543369072833, "loss": 36.1854, "step": 9752 }, { "epoch": 25.758996368438428, "grad_norm": 2313.537353515625, "learning_rate": 0.0002394336943757227, "loss": 35.3815, "step": 9753 }, { "epoch": 25.761637504126774, "grad_norm": 405.261962890625, "learning_rate": 0.00023939195535577503, "loss": 36.0741, "step": 9754 }, { "epoch": 25.76427863981512, "grad_norm": 258.4381408691406, "learning_rate": 0.00023935021663205075, "loss": 36.9065, "step": 9755 }, { "epoch": 25.766919775503467, "grad_norm": 817.641357421875, "learning_rate": 0.00023930847820571543, "loss": 36.8827, "step": 9756 }, { "epoch": 25.769560911191814, "grad_norm": 733.4962768554688, "learning_rate": 0.00023926674007793453, "loss": 34.1745, "step": 9757 }, { "epoch": 25.77220204688016, "grad_norm": 407.2287292480469, "learning_rate": 0.00023922500224987368, "loss": 35.8105, "step": 9758 }, { "epoch": 25.774843182568503, "grad_norm": 801.2236328125, "learning_rate": 0.00023918326472269826, "loss": 36.348, "step": 9759 }, { "epoch": 25.77748431825685, "grad_norm": 287.5462646484375, "learning_rate": 0.0002391415274975738, "loss": 36.6272, "step": 9760 }, { "epoch": 25.780125453945196, "grad_norm": 797.314453125, "learning_rate": 0.00023909979057566588, "loss": 36.6021, "step": 9761 }, { "epoch": 25.782766589633543, "grad_norm": 804.093505859375, "learning_rate": 0.00023905805395813988, "loss": 34.0562, "step": 9762 }, { "epoch": 25.78540772532189, "grad_norm": 329.8608093261719, "learning_rate": 0.0002390163176461613, "loss": 36.3311, "step": 9763 }, { "epoch": 25.788048861010235, "grad_norm": 422.57354736328125, "learning_rate": 0.00023897458164089553, "loss": 36.3775, "step": 9764 }, { "epoch": 25.790689996698582, "grad_norm": 518.6683959960938, "learning_rate": 0.00023893284594350816, "loss": 40.347, "step": 9765 }, { "epoch": 25.793331132386925, "grad_norm": 351.53692626953125, "learning_rate": 0.00023889111055516458, "loss": 43.3632, "step": 9766 }, { "epoch": 25.79597226807527, "grad_norm": 227.74725341796875, "learning_rate": 0.0002388493754770301, "loss": 39.4898, "step": 9767 }, { "epoch": 25.798613403763618, "grad_norm": 449.4461364746094, "learning_rate": 0.00023880764071027033, "loss": 41.7037, "step": 9768 }, { "epoch": 25.801254539451964, "grad_norm": 241.65496826171875, "learning_rate": 0.00023876590625605072, "loss": 42.7173, "step": 9769 }, { "epoch": 25.80389567514031, "grad_norm": 305.11407470703125, "learning_rate": 0.00023872417211553648, "loss": 43.1458, "step": 9770 }, { "epoch": 25.806536810828657, "grad_norm": 1019.309326171875, "learning_rate": 0.00023868243828989303, "loss": 44.1618, "step": 9771 }, { "epoch": 25.809177946517003, "grad_norm": 286.18597412109375, "learning_rate": 0.00023864070478028592, "loss": 43.5529, "step": 9772 }, { "epoch": 25.81181908220535, "grad_norm": 270.6082763671875, "learning_rate": 0.00023859897158788048, "loss": 40.1278, "step": 9773 }, { "epoch": 25.814460217893693, "grad_norm": 255.32923889160156, "learning_rate": 0.00023855723871384201, "loss": 41.4567, "step": 9774 }, { "epoch": 25.81710135358204, "grad_norm": 314.7471008300781, "learning_rate": 0.00023851550615933598, "loss": 42.1953, "step": 9775 }, { "epoch": 25.819742489270386, "grad_norm": 384.8096923828125, "learning_rate": 0.00023847377392552773, "loss": 39.9316, "step": 9776 }, { "epoch": 25.822383624958732, "grad_norm": 329.6541442871094, "learning_rate": 0.0002384320420135825, "loss": 39.4169, "step": 9777 }, { "epoch": 25.82502476064708, "grad_norm": 250.2515411376953, "learning_rate": 0.00023839031042466568, "loss": 37.1083, "step": 9778 }, { "epoch": 25.827665896335425, "grad_norm": 256.0419616699219, "learning_rate": 0.00023834857915994268, "loss": 37.0809, "step": 9779 }, { "epoch": 25.83030703202377, "grad_norm": 255.00521850585938, "learning_rate": 0.00023830684822057877, "loss": 35.8365, "step": 9780 }, { "epoch": 25.832948167712118, "grad_norm": 285.44830322265625, "learning_rate": 0.0002382651176077392, "loss": 35.368, "step": 9781 }, { "epoch": 25.83558930340046, "grad_norm": 231.882568359375, "learning_rate": 0.00023822338732258937, "loss": 35.2016, "step": 9782 }, { "epoch": 25.838230439088807, "grad_norm": 434.0447082519531, "learning_rate": 0.00023818165736629453, "loss": 35.9291, "step": 9783 }, { "epoch": 25.840871574777154, "grad_norm": 315.4424133300781, "learning_rate": 0.00023813992774001996, "loss": 34.1277, "step": 9784 }, { "epoch": 25.8435127104655, "grad_norm": 392.5423278808594, "learning_rate": 0.00023809819844493085, "loss": 36.8003, "step": 9785 }, { "epoch": 25.846153846153847, "grad_norm": 383.17767333984375, "learning_rate": 0.00023805646948219265, "loss": 36.1959, "step": 9786 }, { "epoch": 25.848794981842193, "grad_norm": 7651.4755859375, "learning_rate": 0.0002380147408529706, "loss": 36.0714, "step": 9787 }, { "epoch": 25.85143611753054, "grad_norm": 2561.38916015625, "learning_rate": 0.00023797301255842969, "loss": 38.0223, "step": 9788 }, { "epoch": 25.854077253218883, "grad_norm": 3905.67626953125, "learning_rate": 0.00023793128459973544, "loss": 32.0522, "step": 9789 }, { "epoch": 25.85671838890723, "grad_norm": 1456.0689697265625, "learning_rate": 0.00023788955697805298, "loss": 28.6297, "step": 9790 }, { "epoch": 25.859359524595575, "grad_norm": 1620.1689453125, "learning_rate": 0.00023784782969454754, "loss": 26.0523, "step": 9791 }, { "epoch": 25.862000660283922, "grad_norm": 1751.353515625, "learning_rate": 0.00023780610275038428, "loss": 25.5958, "step": 9792 }, { "epoch": 25.86464179597227, "grad_norm": 977.9639282226562, "learning_rate": 0.00023776437614672847, "loss": 22.7836, "step": 9793 }, { "epoch": 25.867282931660615, "grad_norm": 679.1505126953125, "learning_rate": 0.00023772264988474527, "loss": 18.48, "step": 9794 }, { "epoch": 25.86992406734896, "grad_norm": 1033.5609130859375, "learning_rate": 0.0002376809239655998, "loss": 19.3207, "step": 9795 }, { "epoch": 25.872565203037308, "grad_norm": 395.9263000488281, "learning_rate": 0.00023763919839045732, "loss": 15.2481, "step": 9796 }, { "epoch": 25.87520633872565, "grad_norm": 10615.453125, "learning_rate": 0.00023759747316048303, "loss": 15.7077, "step": 9797 }, { "epoch": 25.877847474413997, "grad_norm": 331.1806640625, "learning_rate": 0.000237555748276842, "loss": 37.7716, "step": 9798 }, { "epoch": 25.880488610102343, "grad_norm": 849.179443359375, "learning_rate": 0.00023751402374069927, "loss": 37.2858, "step": 9799 }, { "epoch": 25.88312974579069, "grad_norm": 827.6597900390625, "learning_rate": 0.00023747229955322022, "loss": 36.7296, "step": 9800 }, { "epoch": 25.88312974579069, "eval_loss": 4.082005023956299, "eval_runtime": 2.2584, "eval_samples_per_second": 219.182, "eval_steps_per_second": 27.453, "step": 9800 }, { "epoch": 25.885770881479036, "grad_norm": 308.9967346191406, "learning_rate": 0.00023743057571556982, "loss": 35.8073, "step": 9801 }, { "epoch": 25.888412017167383, "grad_norm": 391.19940185546875, "learning_rate": 0.0002373888522289131, "loss": 35.9073, "step": 9802 }, { "epoch": 25.89105315285573, "grad_norm": 420.3232421875, "learning_rate": 0.00023734712909441536, "loss": 36.4893, "step": 9803 }, { "epoch": 25.893694288544076, "grad_norm": 675.367431640625, "learning_rate": 0.0002373054063132417, "loss": 37.531, "step": 9804 }, { "epoch": 25.89633542423242, "grad_norm": 377.8209228515625, "learning_rate": 0.00023726368388655704, "loss": 35.4628, "step": 9805 }, { "epoch": 25.898976559920765, "grad_norm": 461.9979553222656, "learning_rate": 0.00023722196181552642, "loss": 35.7577, "step": 9806 }, { "epoch": 25.90161769560911, "grad_norm": 364.1037902832031, "learning_rate": 0.0002371802401013151, "loss": 36.6168, "step": 9807 }, { "epoch": 25.904258831297458, "grad_norm": 364.8203430175781, "learning_rate": 0.00023713851874508808, "loss": 36.2177, "step": 9808 }, { "epoch": 25.906899966985804, "grad_norm": 497.7905578613281, "learning_rate": 0.00023709679774801026, "loss": 34.2463, "step": 9809 }, { "epoch": 25.90954110267415, "grad_norm": 418.56011962890625, "learning_rate": 0.00023705507711124687, "loss": 35.5255, "step": 9810 }, { "epoch": 25.912182238362497, "grad_norm": 329.0584411621094, "learning_rate": 0.00023701335683596288, "loss": 36.157, "step": 9811 }, { "epoch": 25.91482337405084, "grad_norm": 362.4494934082031, "learning_rate": 0.00023697163692332324, "loss": 36.5246, "step": 9812 }, { "epoch": 25.917464509739187, "grad_norm": 602.2438354492188, "learning_rate": 0.00023692991737449292, "loss": 36.9472, "step": 9813 }, { "epoch": 25.920105645427533, "grad_norm": 2858.4365234375, "learning_rate": 0.00023688819819063706, "loss": 37.0898, "step": 9814 }, { "epoch": 25.92274678111588, "grad_norm": 856.13525390625, "learning_rate": 0.00023684647937292054, "loss": 38.3322, "step": 9815 }, { "epoch": 25.925387916804226, "grad_norm": 657.6893310546875, "learning_rate": 0.0002368047609225083, "loss": 43.0228, "step": 9816 }, { "epoch": 25.928029052492573, "grad_norm": 707.6056518554688, "learning_rate": 0.00023676304284056543, "loss": 40.7232, "step": 9817 }, { "epoch": 25.93067018818092, "grad_norm": 676.2068481445312, "learning_rate": 0.00023672132512825682, "loss": 43.3393, "step": 9818 }, { "epoch": 25.933311323869265, "grad_norm": 596.6720581054688, "learning_rate": 0.00023667960778674742, "loss": 45.0472, "step": 9819 }, { "epoch": 25.93595245955761, "grad_norm": 424.736083984375, "learning_rate": 0.000236637890817202, "loss": 43.0196, "step": 9820 }, { "epoch": 25.938593595245955, "grad_norm": 352.9418640136719, "learning_rate": 0.00023659617422078574, "loss": 43.5769, "step": 9821 }, { "epoch": 25.9412347309343, "grad_norm": 307.6009521484375, "learning_rate": 0.0002365544579986635, "loss": 41.0798, "step": 9822 }, { "epoch": 25.943875866622648, "grad_norm": 234.73751831054688, "learning_rate": 0.00023651274215199996, "loss": 40.2192, "step": 9823 }, { "epoch": 25.946517002310994, "grad_norm": 492.69012451171875, "learning_rate": 0.0002364710266819603, "loss": 37.9326, "step": 9824 }, { "epoch": 25.94915813799934, "grad_norm": 291.3377685546875, "learning_rate": 0.00023642931158970927, "loss": 37.0329, "step": 9825 }, { "epoch": 25.951799273687687, "grad_norm": 221.92831420898438, "learning_rate": 0.00023638759687641172, "loss": 36.0231, "step": 9826 }, { "epoch": 25.954440409376033, "grad_norm": 222.90206909179688, "learning_rate": 0.0002363458825432325, "loss": 36.3778, "step": 9827 }, { "epoch": 25.957081545064376, "grad_norm": 407.59423828125, "learning_rate": 0.0002363041685913365, "loss": 42.8802, "step": 9828 }, { "epoch": 25.959722680752723, "grad_norm": 1062.1727294921875, "learning_rate": 0.00023626245502188863, "loss": 29.5161, "step": 9829 }, { "epoch": 25.96236381644107, "grad_norm": 1070.3125, "learning_rate": 0.00023622074183605348, "loss": 11.2188, "step": 9830 }, { "epoch": 25.965004952129416, "grad_norm": 3397.7646484375, "learning_rate": 0.00023617902903499613, "loss": 16.3883, "step": 9831 }, { "epoch": 25.967646087817762, "grad_norm": 1331.85791015625, "learning_rate": 0.00023613731661988128, "loss": 12.7532, "step": 9832 }, { "epoch": 25.97028722350611, "grad_norm": 671.9064331054688, "learning_rate": 0.0002360956045918736, "loss": 22.3636, "step": 9833 }, { "epoch": 25.972928359194455, "grad_norm": 1215.09228515625, "learning_rate": 0.00023605389295213816, "loss": 13.6254, "step": 9834 }, { "epoch": 25.975569494882798, "grad_norm": 479.7579650878906, "learning_rate": 0.00023601218170183952, "loss": 36.1521, "step": 9835 }, { "epoch": 25.978210630571144, "grad_norm": 313.0935974121094, "learning_rate": 0.00023597047084214246, "loss": 35.2579, "step": 9836 }, { "epoch": 25.98085176625949, "grad_norm": 459.24029541015625, "learning_rate": 0.00023592876037421167, "loss": 34.639, "step": 9837 }, { "epoch": 25.983492901947837, "grad_norm": 379.1714172363281, "learning_rate": 0.0002358870502992121, "loss": 34.9718, "step": 9838 }, { "epoch": 25.986134037636184, "grad_norm": 427.9230041503906, "learning_rate": 0.0002358453406183084, "loss": 33.5085, "step": 9839 }, { "epoch": 25.98877517332453, "grad_norm": 545.9833984375, "learning_rate": 0.00023580363133266508, "loss": 36.086, "step": 9840 }, { "epoch": 25.991416309012877, "grad_norm": 732.1455688476562, "learning_rate": 0.00023576192244344714, "loss": 36.4764, "step": 9841 }, { "epoch": 25.994057444701223, "grad_norm": 347.3183288574219, "learning_rate": 0.00023572021395181915, "loss": 35.1866, "step": 9842 }, { "epoch": 25.996698580389566, "grad_norm": 486.70556640625, "learning_rate": 0.00023567850585894579, "loss": 34.8091, "step": 9843 }, { "epoch": 25.999339716077912, "grad_norm": 889.1964721679688, "learning_rate": 0.00023563679816599167, "loss": 38.6844, "step": 9844 }, { "epoch": 26.00198085176626, "grad_norm": 516.9164428710938, "learning_rate": 0.00023559509087412163, "loss": 41.0731, "step": 9845 }, { "epoch": 26.004621987454605, "grad_norm": 192.24288940429688, "learning_rate": 0.00023555338398450015, "loss": 39.9914, "step": 9846 }, { "epoch": 26.007263123142952, "grad_norm": 458.00323486328125, "learning_rate": 0.0002355116774982919, "loss": 39.4411, "step": 9847 }, { "epoch": 26.0099042588313, "grad_norm": 718.3760986328125, "learning_rate": 0.00023546997141666163, "loss": 39.5651, "step": 9848 }, { "epoch": 26.012545394519645, "grad_norm": 253.1524200439453, "learning_rate": 0.00023542826574077385, "loss": 42.1754, "step": 9849 }, { "epoch": 26.01518653020799, "grad_norm": 572.2765502929688, "learning_rate": 0.00023538656047179322, "loss": 41.6052, "step": 9850 }, { "epoch": 26.017827665896334, "grad_norm": 586.9624633789062, "learning_rate": 0.00023534485561088413, "loss": 43.9359, "step": 9851 }, { "epoch": 26.02046880158468, "grad_norm": 676.6436157226562, "learning_rate": 0.00023530315115921158, "loss": 39.7294, "step": 9852 }, { "epoch": 26.023109937273027, "grad_norm": 284.6319885253906, "learning_rate": 0.0002352614471179398, "loss": 39.0758, "step": 9853 }, { "epoch": 26.025751072961373, "grad_norm": 266.4393310546875, "learning_rate": 0.00023521974348823335, "loss": 40.4819, "step": 9854 }, { "epoch": 26.02839220864972, "grad_norm": 231.6123504638672, "learning_rate": 0.000235178040271257, "loss": 39.4304, "step": 9855 }, { "epoch": 26.031033344338066, "grad_norm": 449.3609619140625, "learning_rate": 0.00023513633746817514, "loss": 38.2477, "step": 9856 }, { "epoch": 26.033674480026413, "grad_norm": 345.76934814453125, "learning_rate": 0.0002350946350801524, "loss": 36.4963, "step": 9857 }, { "epoch": 26.036315615714756, "grad_norm": 297.66912841796875, "learning_rate": 0.00023505293310835303, "loss": 35.5485, "step": 9858 }, { "epoch": 26.038956751403102, "grad_norm": 262.76800537109375, "learning_rate": 0.00023501123155394188, "loss": 36.3039, "step": 9859 }, { "epoch": 26.04159788709145, "grad_norm": 440.5920715332031, "learning_rate": 0.00023496953041808325, "loss": 36.9038, "step": 9860 }, { "epoch": 26.044239022779795, "grad_norm": 307.0908203125, "learning_rate": 0.00023492782970194163, "loss": 34.0577, "step": 9861 }, { "epoch": 26.04688015846814, "grad_norm": 179.41566467285156, "learning_rate": 0.00023488612940668154, "loss": 34.1754, "step": 9862 }, { "epoch": 26.049521294156488, "grad_norm": 193.85243225097656, "learning_rate": 0.00023484442953346742, "loss": 35.0922, "step": 9863 }, { "epoch": 26.052162429844834, "grad_norm": 268.5100402832031, "learning_rate": 0.00023480273008346368, "loss": 36.5751, "step": 9864 }, { "epoch": 26.05480356553318, "grad_norm": 229.03781127929688, "learning_rate": 0.00023476103105783474, "loss": 37.3191, "step": 9865 }, { "epoch": 26.057444701221524, "grad_norm": 1424.3856201171875, "learning_rate": 0.0002347193324577451, "loss": 45.4037, "step": 9866 }, { "epoch": 26.06008583690987, "grad_norm": 2287.496337890625, "learning_rate": 0.00023467763428435914, "loss": 18.4048, "step": 9867 }, { "epoch": 26.062726972598217, "grad_norm": 1201.4267578125, "learning_rate": 0.0002346359365388411, "loss": 18.1759, "step": 9868 }, { "epoch": 26.065368108286563, "grad_norm": 788.999267578125, "learning_rate": 0.0002345942392223557, "loss": 12.3719, "step": 9869 }, { "epoch": 26.06800924397491, "grad_norm": 1896.6063232421875, "learning_rate": 0.00023455254233606707, "loss": 19.4093, "step": 9870 }, { "epoch": 26.070650379663256, "grad_norm": 4031.561767578125, "learning_rate": 0.0002345108458811396, "loss": 14.5191, "step": 9871 }, { "epoch": 26.073291515351602, "grad_norm": 13163.2861328125, "learning_rate": 0.00023446914985873757, "loss": 10.0579, "step": 9872 }, { "epoch": 26.07593265103995, "grad_norm": 1173.532958984375, "learning_rate": 0.0002344274542700255, "loss": 14.6392, "step": 9873 }, { "epoch": 26.078573786728292, "grad_norm": 2478.786376953125, "learning_rate": 0.0002343857591161677, "loss": 11.5149, "step": 9874 }, { "epoch": 26.08121492241664, "grad_norm": 703.4743041992188, "learning_rate": 0.00023434406439832821, "loss": 13.8663, "step": 9875 }, { "epoch": 26.083856058104985, "grad_norm": 1472.2916259765625, "learning_rate": 0.00023430237011767165, "loss": 22.0466, "step": 9876 }, { "epoch": 26.08649719379333, "grad_norm": 1779.2235107421875, "learning_rate": 0.00023426067627536218, "loss": 35.5844, "step": 9877 }, { "epoch": 26.089138329481678, "grad_norm": 277.0855712890625, "learning_rate": 0.0002342189828725641, "loss": 35.0365, "step": 9878 }, { "epoch": 26.091779465170024, "grad_norm": 627.5877685546875, "learning_rate": 0.00023417728991044156, "loss": 34.8938, "step": 9879 }, { "epoch": 26.09442060085837, "grad_norm": 311.9959716796875, "learning_rate": 0.000234135597390159, "loss": 34.9055, "step": 9880 }, { "epoch": 26.097061736546713, "grad_norm": 171.7127685546875, "learning_rate": 0.0002340939053128805, "loss": 35.2245, "step": 9881 }, { "epoch": 26.09970287223506, "grad_norm": 349.7207946777344, "learning_rate": 0.00023405221367977036, "loss": 34.2182, "step": 9882 }, { "epoch": 26.102344007923406, "grad_norm": 239.75454711914062, "learning_rate": 0.0002340105224919928, "loss": 35.4609, "step": 9883 }, { "epoch": 26.104985143611753, "grad_norm": 575.1367797851562, "learning_rate": 0.00023396883175071204, "loss": 34.8039, "step": 9884 }, { "epoch": 26.1076262793001, "grad_norm": 351.80731201171875, "learning_rate": 0.0002339271414570922, "loss": 35.74, "step": 9885 }, { "epoch": 26.110267414988446, "grad_norm": 375.2819519042969, "learning_rate": 0.00023388545161229745, "loss": 34.6556, "step": 9886 }, { "epoch": 26.112908550676792, "grad_norm": 544.3418579101562, "learning_rate": 0.0002338437622174921, "loss": 35.2444, "step": 9887 }, { "epoch": 26.11554968636514, "grad_norm": 447.0566101074219, "learning_rate": 0.00023380207327384014, "loss": 34.7843, "step": 9888 }, { "epoch": 26.11819082205348, "grad_norm": 420.3234558105469, "learning_rate": 0.0002337603847825057, "loss": 35.0471, "step": 9889 }, { "epoch": 26.120831957741828, "grad_norm": 289.0466613769531, "learning_rate": 0.00023371869674465305, "loss": 35.3312, "step": 9890 }, { "epoch": 26.123473093430174, "grad_norm": 602.9617309570312, "learning_rate": 0.00023367700916144624, "loss": 34.299, "step": 9891 }, { "epoch": 26.12611422911852, "grad_norm": 429.20599365234375, "learning_rate": 0.0002336353220340494, "loss": 37.1895, "step": 9892 }, { "epoch": 26.128755364806867, "grad_norm": 404.6891174316406, "learning_rate": 0.00023359363536362647, "loss": 38.0808, "step": 9893 }, { "epoch": 26.131396500495214, "grad_norm": 412.3213195800781, "learning_rate": 0.0002335519491513417, "loss": 39.0864, "step": 9894 }, { "epoch": 26.13403763618356, "grad_norm": 198.60525512695312, "learning_rate": 0.0002335102633983591, "loss": 39.3206, "step": 9895 }, { "epoch": 26.136678771871907, "grad_norm": 248.26356506347656, "learning_rate": 0.00023346857810584262, "loss": 39.4453, "step": 9896 }, { "epoch": 26.13931990756025, "grad_norm": 561.3629150390625, "learning_rate": 0.00023342689327495648, "loss": 40.5251, "step": 9897 }, { "epoch": 26.141961043248596, "grad_norm": 306.3916015625, "learning_rate": 0.0002333852089068646, "loss": 40.5016, "step": 9898 }, { "epoch": 26.144602178936942, "grad_norm": 294.3307800292969, "learning_rate": 0.00023334352500273102, "loss": 41.9525, "step": 9899 }, { "epoch": 26.14724331462529, "grad_norm": 268.2700500488281, "learning_rate": 0.0002333018415637196, "loss": 43.3184, "step": 9900 }, { "epoch": 26.149884450313635, "grad_norm": 387.4908447265625, "learning_rate": 0.00023326015859099456, "loss": 42.9003, "step": 9901 }, { "epoch": 26.152525586001982, "grad_norm": 250.32521057128906, "learning_rate": 0.00023321847608571975, "loss": 41.5937, "step": 9902 }, { "epoch": 26.15516672169033, "grad_norm": 234.07725524902344, "learning_rate": 0.00023317679404905904, "loss": 39.7584, "step": 9903 }, { "epoch": 26.15780785737867, "grad_norm": 253.22938537597656, "learning_rate": 0.0002331351124821766, "loss": 38.9876, "step": 9904 }, { "epoch": 26.160448993067018, "grad_norm": 389.4393005371094, "learning_rate": 0.0002330934313862362, "loss": 37.2722, "step": 9905 }, { "epoch": 26.163090128755364, "grad_norm": 216.61697387695312, "learning_rate": 0.00023305175076240178, "loss": 37.7404, "step": 9906 }, { "epoch": 26.16573126444371, "grad_norm": 209.73876953125, "learning_rate": 0.00023301007061183717, "loss": 37.6429, "step": 9907 }, { "epoch": 26.168372400132057, "grad_norm": 355.8127746582031, "learning_rate": 0.00023296839093570648, "loss": 37.1222, "step": 9908 }, { "epoch": 26.171013535820403, "grad_norm": 338.2188415527344, "learning_rate": 0.0002329267117351735, "loss": 35.3465, "step": 9909 }, { "epoch": 26.17365467150875, "grad_norm": 209.8291015625, "learning_rate": 0.00023288503301140194, "loss": 35.6062, "step": 9910 }, { "epoch": 26.176295807197096, "grad_norm": 226.57081604003906, "learning_rate": 0.00023284335476555585, "loss": 35.9512, "step": 9911 }, { "epoch": 26.17893694288544, "grad_norm": 202.52166748046875, "learning_rate": 0.00023280167699879902, "loss": 35.193, "step": 9912 }, { "epoch": 26.181578078573786, "grad_norm": 266.3714294433594, "learning_rate": 0.00023275999971229526, "loss": 35.7752, "step": 9913 }, { "epoch": 26.184219214262132, "grad_norm": 405.2455139160156, "learning_rate": 0.0002327183229072083, "loss": 35.1707, "step": 9914 }, { "epoch": 26.18686034995048, "grad_norm": 210.9546356201172, "learning_rate": 0.0002326766465847021, "loss": 35.5662, "step": 9915 }, { "epoch": 26.189501485638825, "grad_norm": 3509.75927734375, "learning_rate": 0.00023263497074594035, "loss": 48.8028, "step": 9916 }, { "epoch": 26.19214262132717, "grad_norm": 2850.33544921875, "learning_rate": 0.00023259329539208678, "loss": 28.1452, "step": 9917 }, { "epoch": 26.194783757015518, "grad_norm": 1062.822998046875, "learning_rate": 0.00023255162052430528, "loss": 28.1794, "step": 9918 }, { "epoch": 26.197424892703864, "grad_norm": 1070.2027587890625, "learning_rate": 0.00023250994614375952, "loss": 20.2164, "step": 9919 }, { "epoch": 26.200066028392207, "grad_norm": 1251.0972900390625, "learning_rate": 0.00023246827225161326, "loss": 19.8336, "step": 9920 }, { "epoch": 26.202707164080554, "grad_norm": 894.9100341796875, "learning_rate": 0.00023242659884903008, "loss": 16.8152, "step": 9921 }, { "epoch": 26.2053482997689, "grad_norm": 326.1106262207031, "learning_rate": 0.00023238492593717397, "loss": 18.0052, "step": 9922 }, { "epoch": 26.207989435457247, "grad_norm": 2785.066650390625, "learning_rate": 0.0002323432535172084, "loss": 11.4892, "step": 9923 }, { "epoch": 26.210630571145593, "grad_norm": 1090.0008544921875, "learning_rate": 0.000232301581590297, "loss": 11.9361, "step": 9924 }, { "epoch": 26.21327170683394, "grad_norm": 811.366943359375, "learning_rate": 0.00023225991015760362, "loss": 14.4696, "step": 9925 }, { "epoch": 26.215912842522286, "grad_norm": 509.70916748046875, "learning_rate": 0.00023221823922029184, "loss": 25.8785, "step": 9926 }, { "epoch": 26.21855397821063, "grad_norm": 517.59765625, "learning_rate": 0.00023217656877952533, "loss": 36.3082, "step": 9927 }, { "epoch": 26.221195113898975, "grad_norm": 387.73175048828125, "learning_rate": 0.00023213489883646755, "loss": 36.8569, "step": 9928 }, { "epoch": 26.223836249587322, "grad_norm": 395.451416015625, "learning_rate": 0.00023209322939228227, "loss": 37.3656, "step": 9929 }, { "epoch": 26.22647738527567, "grad_norm": 248.390380859375, "learning_rate": 0.00023205156044813307, "loss": 35.8426, "step": 9930 }, { "epoch": 26.229118520964015, "grad_norm": 761.4664306640625, "learning_rate": 0.00023200989200518346, "loss": 35.5575, "step": 9931 }, { "epoch": 26.23175965665236, "grad_norm": 425.7212219238281, "learning_rate": 0.00023196822406459708, "loss": 37.2016, "step": 9932 }, { "epoch": 26.234400792340708, "grad_norm": 428.594970703125, "learning_rate": 0.00023192655662753747, "loss": 36.794, "step": 9933 }, { "epoch": 26.237041928029054, "grad_norm": 339.12835693359375, "learning_rate": 0.00023188488969516814, "loss": 35.9822, "step": 9934 }, { "epoch": 26.239683063717397, "grad_norm": 471.1688537597656, "learning_rate": 0.00023184322326865256, "loss": 36.9773, "step": 9935 }, { "epoch": 26.242324199405743, "grad_norm": 257.71881103515625, "learning_rate": 0.00023180155734915436, "loss": 34.4108, "step": 9936 }, { "epoch": 26.24496533509409, "grad_norm": 828.6177368164062, "learning_rate": 0.000231759891937837, "loss": 35.323, "step": 9937 }, { "epoch": 26.247606470782436, "grad_norm": 340.8194885253906, "learning_rate": 0.0002317182270358638, "loss": 35.3405, "step": 9938 }, { "epoch": 26.250247606470783, "grad_norm": 563.2904663085938, "learning_rate": 0.00023167656264439857, "loss": 36.9788, "step": 9939 }, { "epoch": 26.25288874215913, "grad_norm": 422.1213073730469, "learning_rate": 0.0002316348987646045, "loss": 35.4946, "step": 9940 }, { "epoch": 26.255529877847476, "grad_norm": 283.9878234863281, "learning_rate": 0.0002315932353976451, "loss": 35.122, "step": 9941 }, { "epoch": 26.258171013535822, "grad_norm": 256.96759033203125, "learning_rate": 0.00023155157254468368, "loss": 37.9962, "step": 9942 }, { "epoch": 26.260812149224165, "grad_norm": 486.46282958984375, "learning_rate": 0.00023150991020688384, "loss": 37.7739, "step": 9943 }, { "epoch": 26.26345328491251, "grad_norm": 313.16827392578125, "learning_rate": 0.00023146824838540898, "loss": 41.1657, "step": 9944 }, { "epoch": 26.266094420600858, "grad_norm": 446.5528869628906, "learning_rate": 0.00023142658708142223, "loss": 40.2564, "step": 9945 }, { "epoch": 26.268735556289204, "grad_norm": 226.24952697753906, "learning_rate": 0.0002313849262960872, "loss": 40.9439, "step": 9946 }, { "epoch": 26.27137669197755, "grad_norm": 356.7280578613281, "learning_rate": 0.00023134326603056722, "loss": 40.8529, "step": 9947 }, { "epoch": 26.274017827665897, "grad_norm": 520.3714599609375, "learning_rate": 0.00023130160628602553, "loss": 42.7021, "step": 9948 }, { "epoch": 26.276658963354244, "grad_norm": 390.0721740722656, "learning_rate": 0.00023125994706362546, "loss": 42.132, "step": 9949 }, { "epoch": 26.279300099042587, "grad_norm": 208.41363525390625, "learning_rate": 0.00023121828836453044, "loss": 42.6323, "step": 9950 }, { "epoch": 26.281941234730933, "grad_norm": 166.0548095703125, "learning_rate": 0.00023117663018990366, "loss": 41.9827, "step": 9951 }, { "epoch": 26.28458237041928, "grad_norm": 144.5190887451172, "learning_rate": 0.00023113497254090837, "loss": 42.716, "step": 9952 }, { "epoch": 26.287223506107626, "grad_norm": 283.2150573730469, "learning_rate": 0.00023109331541870795, "loss": 40.198, "step": 9953 }, { "epoch": 26.289864641795972, "grad_norm": 1164.9847412109375, "learning_rate": 0.0002310516588244656, "loss": 37.6383, "step": 9954 }, { "epoch": 26.29250577748432, "grad_norm": 319.76116943359375, "learning_rate": 0.00023101000275934454, "loss": 39.3487, "step": 9955 }, { "epoch": 26.295146913172665, "grad_norm": 208.1700439453125, "learning_rate": 0.00023096834722450787, "loss": 38.5492, "step": 9956 }, { "epoch": 26.297788048861012, "grad_norm": 821.7924194335938, "learning_rate": 0.0002309266922211191, "loss": 37.5305, "step": 9957 }, { "epoch": 26.300429184549355, "grad_norm": 270.96368408203125, "learning_rate": 0.0002308850377503412, "loss": 36.5709, "step": 9958 }, { "epoch": 26.3030703202377, "grad_norm": 823.7708129882812, "learning_rate": 0.00023084338381333729, "loss": 36.2321, "step": 9959 }, { "epoch": 26.305711455926048, "grad_norm": 191.97193908691406, "learning_rate": 0.00023080173041127073, "loss": 35.5361, "step": 9960 }, { "epoch": 26.308352591614394, "grad_norm": 1135.9915771484375, "learning_rate": 0.00023076007754530456, "loss": 36.6514, "step": 9961 }, { "epoch": 26.31099372730274, "grad_norm": 236.07247924804688, "learning_rate": 0.00023071842521660187, "loss": 36.68, "step": 9962 }, { "epoch": 26.313634862991087, "grad_norm": 174.71337890625, "learning_rate": 0.0002306767734263259, "loss": 35.2812, "step": 9963 }, { "epoch": 26.316275998679433, "grad_norm": 302.10467529296875, "learning_rate": 0.0002306351221756397, "loss": 35.0079, "step": 9964 }, { "epoch": 26.31891713436778, "grad_norm": 267.89044189453125, "learning_rate": 0.0002305934714657063, "loss": 35.6868, "step": 9965 }, { "epoch": 26.321558270056123, "grad_norm": 233.6531524658203, "learning_rate": 0.00023055182129768875, "loss": 36.2079, "step": 9966 }, { "epoch": 26.32419940574447, "grad_norm": 2414.182373046875, "learning_rate": 0.00023051017167275025, "loss": 42.5504, "step": 9967 }, { "epoch": 26.326840541432816, "grad_norm": 2277.74658203125, "learning_rate": 0.00023046852259205375, "loss": 37.8512, "step": 9968 }, { "epoch": 26.329481677121162, "grad_norm": 4521.0625, "learning_rate": 0.0002304268740567622, "loss": 23.3925, "step": 9969 }, { "epoch": 26.33212281280951, "grad_norm": 2747.065185546875, "learning_rate": 0.0002303852260680388, "loss": 25.7778, "step": 9970 }, { "epoch": 26.334763948497855, "grad_norm": 1509.8226318359375, "learning_rate": 0.00023034357862704638, "loss": 29.061, "step": 9971 }, { "epoch": 26.3374050841862, "grad_norm": 5600.07568359375, "learning_rate": 0.00023030193173494801, "loss": 28.3008, "step": 9972 }, { "epoch": 26.340046219874544, "grad_norm": 2341.9033203125, "learning_rate": 0.00023026028539290653, "loss": 22.1216, "step": 9973 }, { "epoch": 26.34268735556289, "grad_norm": 3194.9638671875, "learning_rate": 0.00023021863960208513, "loss": 13.8888, "step": 9974 }, { "epoch": 26.345328491251237, "grad_norm": 1624.391357421875, "learning_rate": 0.00023017699436364656, "loss": 19.6504, "step": 9975 }, { "epoch": 26.347969626939584, "grad_norm": 3161.30078125, "learning_rate": 0.00023013534967875366, "loss": 13.3117, "step": 9976 }, { "epoch": 26.35061076262793, "grad_norm": 473.25494384765625, "learning_rate": 0.00023009370554856955, "loss": 28.7808, "step": 9977 }, { "epoch": 26.353251898316277, "grad_norm": 473.3243408203125, "learning_rate": 0.00023005206197425704, "loss": 35.8052, "step": 9978 }, { "epoch": 26.355893034004623, "grad_norm": 467.8722839355469, "learning_rate": 0.00023001041895697897, "loss": 34.9996, "step": 9979 }, { "epoch": 26.35853416969297, "grad_norm": 351.6464538574219, "learning_rate": 0.00022996877649789812, "loss": 35.7166, "step": 9980 }, { "epoch": 26.361175305381312, "grad_norm": 340.1560363769531, "learning_rate": 0.00022992713459817752, "loss": 35.6255, "step": 9981 }, { "epoch": 26.36381644106966, "grad_norm": 267.38275146484375, "learning_rate": 0.0002298854932589799, "loss": 34.4698, "step": 9982 }, { "epoch": 26.366457576758005, "grad_norm": 285.9410400390625, "learning_rate": 0.00022984385248146796, "loss": 35.1044, "step": 9983 }, { "epoch": 26.369098712446352, "grad_norm": 322.3912658691406, "learning_rate": 0.0002298022122668047, "loss": 35.3033, "step": 9984 }, { "epoch": 26.3717398481347, "grad_norm": 528.822265625, "learning_rate": 0.00022976057261615279, "loss": 34.8924, "step": 9985 }, { "epoch": 26.374380983823045, "grad_norm": 338.0948791503906, "learning_rate": 0.000229718933530675, "loss": 34.724, "step": 9986 }, { "epoch": 26.37702211951139, "grad_norm": 224.7971649169922, "learning_rate": 0.00022967729501153402, "loss": 35.1453, "step": 9987 }, { "epoch": 26.379663255199738, "grad_norm": 259.1646423339844, "learning_rate": 0.00022963565705989267, "loss": 35.4574, "step": 9988 }, { "epoch": 26.38230439088808, "grad_norm": 483.09112548828125, "learning_rate": 0.00022959401967691365, "loss": 35.1714, "step": 9989 }, { "epoch": 26.384945526576427, "grad_norm": 409.9547119140625, "learning_rate": 0.00022955238286375955, "loss": 34.5443, "step": 9990 }, { "epoch": 26.387586662264773, "grad_norm": 283.9546813964844, "learning_rate": 0.0002295107466215932, "loss": 35.1258, "step": 9991 }, { "epoch": 26.39022779795312, "grad_norm": 645.16455078125, "learning_rate": 0.00022946911095157735, "loss": 35.2904, "step": 9992 }, { "epoch": 26.392868933641466, "grad_norm": 904.1246337890625, "learning_rate": 0.00022942747585487442, "loss": 35.4714, "step": 9993 }, { "epoch": 26.395510069329813, "grad_norm": 643.66259765625, "learning_rate": 0.00022938584133264702, "loss": 38.7843, "step": 9994 }, { "epoch": 26.39815120501816, "grad_norm": 325.1429138183594, "learning_rate": 0.000229344207386058, "loss": 40.2316, "step": 9995 }, { "epoch": 26.400792340706502, "grad_norm": 314.67340087890625, "learning_rate": 0.0002293025740162699, "loss": 39.8123, "step": 9996 }, { "epoch": 26.40343347639485, "grad_norm": 335.2361755371094, "learning_rate": 0.00022926094122444518, "loss": 40.8068, "step": 9997 }, { "epoch": 26.406074612083195, "grad_norm": 578.0187377929688, "learning_rate": 0.00022921930901174655, "loss": 40.3543, "step": 9998 }, { "epoch": 26.40871574777154, "grad_norm": 249.95364379882812, "learning_rate": 0.00022917767737933652, "loss": 43.347, "step": 9999 }, { "epoch": 26.411356883459888, "grad_norm": 454.3074951171875, "learning_rate": 0.00022913604632837759, "loss": 42.8511, "step": 10000 }, { "epoch": 26.411356883459888, "eval_loss": 3.921495199203491, "eval_runtime": 2.126, "eval_samples_per_second": 232.829, "eval_steps_per_second": 29.162, "step": 10000 }, { "epoch": 26.413998019148234, "grad_norm": 387.3442077636719, "learning_rate": 0.00022909441586003228, "loss": 44.5458, "step": 10001 }, { "epoch": 26.41663915483658, "grad_norm": 757.1671142578125, "learning_rate": 0.00022905278597546314, "loss": 43.4681, "step": 10002 }, { "epoch": 26.419280290524927, "grad_norm": 257.8835144042969, "learning_rate": 0.0002290111566758327, "loss": 40.016, "step": 10003 }, { "epoch": 26.42192142621327, "grad_norm": 216.41932678222656, "learning_rate": 0.0002289695279623033, "loss": 39.7558, "step": 10004 }, { "epoch": 26.424562561901617, "grad_norm": 418.47698974609375, "learning_rate": 0.00022892789983603755, "loss": 40.282, "step": 10005 }, { "epoch": 26.427203697589963, "grad_norm": 665.90625, "learning_rate": 0.00022888627229819777, "loss": 39.9555, "step": 10006 }, { "epoch": 26.42984483327831, "grad_norm": 419.5240783691406, "learning_rate": 0.0002288446453499465, "loss": 39.3564, "step": 10007 }, { "epoch": 26.432485968966656, "grad_norm": 259.4598388671875, "learning_rate": 0.0002288030189924459, "loss": 37.378, "step": 10008 }, { "epoch": 26.435127104655002, "grad_norm": 393.4548645019531, "learning_rate": 0.00022876139322685878, "loss": 35.3989, "step": 10009 }, { "epoch": 26.43776824034335, "grad_norm": 184.88148498535156, "learning_rate": 0.00022871976805434715, "loss": 35.522, "step": 10010 }, { "epoch": 26.440409376031695, "grad_norm": 377.1524658203125, "learning_rate": 0.00022867814347607342, "loss": 35.2094, "step": 10011 }, { "epoch": 26.44305051172004, "grad_norm": 259.7772521972656, "learning_rate": 0.00022863651949320016, "loss": 35.5415, "step": 10012 }, { "epoch": 26.445691647408385, "grad_norm": 213.90692138671875, "learning_rate": 0.0002285948961068895, "loss": 35.2364, "step": 10013 }, { "epoch": 26.44833278309673, "grad_norm": 339.6542053222656, "learning_rate": 0.0002285532733183038, "loss": 34.7823, "step": 10014 }, { "epoch": 26.450973918785078, "grad_norm": 520.7203979492188, "learning_rate": 0.00022851165112860528, "loss": 34.7757, "step": 10015 }, { "epoch": 26.453615054473424, "grad_norm": 553.3724365234375, "learning_rate": 0.00022847002953895634, "loss": 39.3868, "step": 10016 }, { "epoch": 26.45625619016177, "grad_norm": 1508.6123046875, "learning_rate": 0.00022842840855051918, "loss": 25.4509, "step": 10017 }, { "epoch": 26.458897325850117, "grad_norm": 1426.1475830078125, "learning_rate": 0.000228386788164456, "loss": 20.1281, "step": 10018 }, { "epoch": 26.46153846153846, "grad_norm": 1144.126220703125, "learning_rate": 0.00022834516838192908, "loss": 21.1944, "step": 10019 }, { "epoch": 26.464179597226806, "grad_norm": 656.1824340820312, "learning_rate": 0.00022830354920410064, "loss": 20.8988, "step": 10020 }, { "epoch": 26.466820732915153, "grad_norm": 3074.83447265625, "learning_rate": 0.00022826193063213285, "loss": 19.8932, "step": 10021 }, { "epoch": 26.4694618686035, "grad_norm": 9167.1005859375, "learning_rate": 0.00022822031266718783, "loss": 14.8507, "step": 10022 }, { "epoch": 26.472103004291846, "grad_norm": 2408.223388671875, "learning_rate": 0.0002281786953104278, "loss": 14.0991, "step": 10023 }, { "epoch": 26.474744139980192, "grad_norm": 503.7789611816406, "learning_rate": 0.0002281370785630149, "loss": 14.7453, "step": 10024 }, { "epoch": 26.47738527566854, "grad_norm": 870.3728637695312, "learning_rate": 0.00022809546242611115, "loss": 13.3067, "step": 10025 }, { "epoch": 26.480026411356885, "grad_norm": 617.8472290039062, "learning_rate": 0.00022805384690087883, "loss": 20.1405, "step": 10026 }, { "epoch": 26.482667547045228, "grad_norm": 1699.8873291015625, "learning_rate": 0.00022801223198848, "loss": 36.6915, "step": 10027 }, { "epoch": 26.485308682733574, "grad_norm": 292.9767761230469, "learning_rate": 0.00022797061769007666, "loss": 34.0073, "step": 10028 }, { "epoch": 26.48794981842192, "grad_norm": 308.87530517578125, "learning_rate": 0.00022792900400683073, "loss": 35.7147, "step": 10029 }, { "epoch": 26.490590954110267, "grad_norm": 375.8299255371094, "learning_rate": 0.00022788739093990452, "loss": 35.1949, "step": 10030 }, { "epoch": 26.493232089798614, "grad_norm": 308.137939453125, "learning_rate": 0.0002278457784904599, "loss": 35.172, "step": 10031 }, { "epoch": 26.49587322548696, "grad_norm": 619.2215576171875, "learning_rate": 0.00022780416665965888, "loss": 35.651, "step": 10032 }, { "epoch": 26.498514361175307, "grad_norm": 280.6566467285156, "learning_rate": 0.0002277625554486635, "loss": 34.8926, "step": 10033 }, { "epoch": 26.501155496863653, "grad_norm": 490.0456237792969, "learning_rate": 0.00022772094485863574, "loss": 34.8382, "step": 10034 }, { "epoch": 26.503796632551996, "grad_norm": 626.8161010742188, "learning_rate": 0.00022767933489073748, "loss": 34.6076, "step": 10035 }, { "epoch": 26.506437768240342, "grad_norm": 210.18666076660156, "learning_rate": 0.0002276377255461306, "loss": 36.5137, "step": 10036 }, { "epoch": 26.50907890392869, "grad_norm": 323.9216613769531, "learning_rate": 0.00022759611682597716, "loss": 35.3284, "step": 10037 }, { "epoch": 26.511720039617035, "grad_norm": 400.7099609375, "learning_rate": 0.00022755450873143904, "loss": 36.1132, "step": 10038 }, { "epoch": 26.514361175305382, "grad_norm": 3469.6787109375, "learning_rate": 0.000227512901263678, "loss": 35.5255, "step": 10039 }, { "epoch": 26.51700231099373, "grad_norm": 326.4914245605469, "learning_rate": 0.00022747129442385604, "loss": 34.1422, "step": 10040 }, { "epoch": 26.519643446682075, "grad_norm": 570.9197998046875, "learning_rate": 0.0002274296882131349, "loss": 36.4039, "step": 10041 }, { "epoch": 26.522284582370418, "grad_norm": 402.890625, "learning_rate": 0.00022738808263267652, "loss": 36.8312, "step": 10042 }, { "epoch": 26.524925718058764, "grad_norm": 445.6131286621094, "learning_rate": 0.00022734647768364252, "loss": 35.7924, "step": 10043 }, { "epoch": 26.52756685374711, "grad_norm": 474.8123474121094, "learning_rate": 0.000227304873367195, "loss": 40.0702, "step": 10044 }, { "epoch": 26.530207989435457, "grad_norm": 1550.7486572265625, "learning_rate": 0.00022726326968449547, "loss": 40.9187, "step": 10045 }, { "epoch": 26.532849125123803, "grad_norm": 412.7057800292969, "learning_rate": 0.00022722166663670572, "loss": 40.2814, "step": 10046 }, { "epoch": 26.53549026081215, "grad_norm": 425.6887512207031, "learning_rate": 0.00022718006422498764, "loss": 39.8368, "step": 10047 }, { "epoch": 26.538131396500496, "grad_norm": 345.8193664550781, "learning_rate": 0.00022713846245050282, "loss": 40.4682, "step": 10048 }, { "epoch": 26.540772532188843, "grad_norm": 653.34228515625, "learning_rate": 0.00022709686131441303, "loss": 43.979, "step": 10049 }, { "epoch": 26.543413667877186, "grad_norm": 620.3806762695312, "learning_rate": 0.00022705526081787992, "loss": 42.3468, "step": 10050 }, { "epoch": 26.546054803565532, "grad_norm": 467.1194763183594, "learning_rate": 0.0002270136609620652, "loss": 43.104, "step": 10051 }, { "epoch": 26.54869593925388, "grad_norm": 358.0138244628906, "learning_rate": 0.00022697206174813046, "loss": 41.6357, "step": 10052 }, { "epoch": 26.551337074942225, "grad_norm": 394.5254211425781, "learning_rate": 0.00022693046317723734, "loss": 40.7412, "step": 10053 }, { "epoch": 26.55397821063057, "grad_norm": 286.0265197753906, "learning_rate": 0.00022688886525054755, "loss": 42.5965, "step": 10054 }, { "epoch": 26.556619346318918, "grad_norm": 324.6941223144531, "learning_rate": 0.00022684726796922262, "loss": 39.4762, "step": 10055 }, { "epoch": 26.559260482007264, "grad_norm": 887.540771484375, "learning_rate": 0.00022680567133442412, "loss": 38.1815, "step": 10056 }, { "epoch": 26.56190161769561, "grad_norm": 803.08154296875, "learning_rate": 0.00022676407534731358, "loss": 38.1948, "step": 10057 }, { "epoch": 26.564542753383954, "grad_norm": 308.27044677734375, "learning_rate": 0.00022672248000905264, "loss": 36.7962, "step": 10058 }, { "epoch": 26.5671838890723, "grad_norm": 267.4241638183594, "learning_rate": 0.00022668088532080274, "loss": 36.2624, "step": 10059 }, { "epoch": 26.569825024760647, "grad_norm": 751.0646362304688, "learning_rate": 0.00022663929128372535, "loss": 36.7994, "step": 10060 }, { "epoch": 26.572466160448993, "grad_norm": 366.40399169921875, "learning_rate": 0.00022659769789898213, "loss": 36.7177, "step": 10061 }, { "epoch": 26.57510729613734, "grad_norm": 291.99188232421875, "learning_rate": 0.00022655610516773448, "loss": 35.7929, "step": 10062 }, { "epoch": 26.577748431825686, "grad_norm": 272.1979064941406, "learning_rate": 0.0002265145130911438, "loss": 35.4069, "step": 10063 }, { "epoch": 26.580389567514032, "grad_norm": 587.1124267578125, "learning_rate": 0.00022647292167037142, "loss": 36.0577, "step": 10064 }, { "epoch": 26.583030703202375, "grad_norm": 772.7613525390625, "learning_rate": 0.00022643133090657897, "loss": 35.1742, "step": 10065 }, { "epoch": 26.585671838890722, "grad_norm": 864.3765869140625, "learning_rate": 0.00022638974080092778, "loss": 41.4293, "step": 10066 }, { "epoch": 26.58831297457907, "grad_norm": 676.291015625, "learning_rate": 0.00022634815135457917, "loss": 38.4443, "step": 10067 }, { "epoch": 26.590954110267415, "grad_norm": 9767.9521484375, "learning_rate": 0.00022630656256869458, "loss": 16.7227, "step": 10068 }, { "epoch": 26.59359524595576, "grad_norm": 992.3932495117188, "learning_rate": 0.0002262649744444353, "loss": 18.6302, "step": 10069 }, { "epoch": 26.596236381644108, "grad_norm": 2704.50244140625, "learning_rate": 0.0002262233869829627, "loss": 18.5177, "step": 10070 }, { "epoch": 26.598877517332454, "grad_norm": 1194.183349609375, "learning_rate": 0.00022618180018543792, "loss": 11.4307, "step": 10071 }, { "epoch": 26.6015186530208, "grad_norm": 2899.636474609375, "learning_rate": 0.0002261402140530225, "loss": 16.6188, "step": 10072 }, { "epoch": 26.604159788709143, "grad_norm": 591.6388549804688, "learning_rate": 0.00022609862858687758, "loss": 13.3205, "step": 10073 }, { "epoch": 26.60680092439749, "grad_norm": 5370.9931640625, "learning_rate": 0.00022605704378816436, "loss": 16.4422, "step": 10074 }, { "epoch": 26.609442060085836, "grad_norm": 779.5655517578125, "learning_rate": 0.00022601545965804417, "loss": 13.0174, "step": 10075 }, { "epoch": 26.612083195774183, "grad_norm": 2026.7880859375, "learning_rate": 0.00022597387619767821, "loss": 17.5171, "step": 10076 }, { "epoch": 26.61472433146253, "grad_norm": 411.8468017578125, "learning_rate": 0.0002259322934082276, "loss": 36.3229, "step": 10077 }, { "epoch": 26.617365467150876, "grad_norm": 348.6265563964844, "learning_rate": 0.00022589071129085347, "loss": 36.254, "step": 10078 }, { "epoch": 26.620006602839222, "grad_norm": 2382.08349609375, "learning_rate": 0.00022584912984671723, "loss": 35.1556, "step": 10079 }, { "epoch": 26.62264773852757, "grad_norm": 459.85089111328125, "learning_rate": 0.00022580754907697978, "loss": 34.1597, "step": 10080 }, { "epoch": 26.62528887421591, "grad_norm": 493.641845703125, "learning_rate": 0.0002257659689828022, "loss": 36.2299, "step": 10081 }, { "epoch": 26.627930009904258, "grad_norm": 479.6994934082031, "learning_rate": 0.00022572438956534587, "loss": 35.4054, "step": 10082 }, { "epoch": 26.630571145592604, "grad_norm": 606.2343139648438, "learning_rate": 0.00022568281082577162, "loss": 34.4214, "step": 10083 }, { "epoch": 26.63321228128095, "grad_norm": 371.928955078125, "learning_rate": 0.00022564123276524065, "loss": 35.1858, "step": 10084 }, { "epoch": 26.635853416969297, "grad_norm": 456.57672119140625, "learning_rate": 0.00022559965538491387, "loss": 34.5018, "step": 10085 }, { "epoch": 26.638494552657644, "grad_norm": 353.2557067871094, "learning_rate": 0.00022555807868595245, "loss": 34.6543, "step": 10086 }, { "epoch": 26.64113568834599, "grad_norm": 791.778564453125, "learning_rate": 0.0002255165026695173, "loss": 37.1425, "step": 10087 }, { "epoch": 26.643776824034333, "grad_norm": 546.9942016601562, "learning_rate": 0.0002254749273367694, "loss": 36.0182, "step": 10088 }, { "epoch": 26.64641795972268, "grad_norm": 630.5552368164062, "learning_rate": 0.00022543335268886976, "loss": 34.4372, "step": 10089 }, { "epoch": 26.649059095411026, "grad_norm": 548.4630126953125, "learning_rate": 0.00022539177872697935, "loss": 34.0002, "step": 10090 }, { "epoch": 26.651700231099372, "grad_norm": 1144.764892578125, "learning_rate": 0.00022535020545225902, "loss": 35.1231, "step": 10091 }, { "epoch": 26.65434136678772, "grad_norm": 725.8726806640625, "learning_rate": 0.00022530863286586975, "loss": 35.6396, "step": 10092 }, { "epoch": 26.656982502476065, "grad_norm": 836.0032958984375, "learning_rate": 0.0002252670609689724, "loss": 37.6513, "step": 10093 }, { "epoch": 26.659623638164412, "grad_norm": 1268.3636474609375, "learning_rate": 0.00022522548976272786, "loss": 40.0589, "step": 10094 }, { "epoch": 26.66226477385276, "grad_norm": 374.0074768066406, "learning_rate": 0.00022518391924829684, "loss": 39.5451, "step": 10095 }, { "epoch": 26.6649059095411, "grad_norm": 658.2039184570312, "learning_rate": 0.00022514234942684042, "loss": 40.4955, "step": 10096 }, { "epoch": 26.667547045229448, "grad_norm": 332.36431884765625, "learning_rate": 0.00022510078029951936, "loss": 39.4293, "step": 10097 }, { "epoch": 26.670188180917794, "grad_norm": 628.2826538085938, "learning_rate": 0.0002250592118674942, "loss": 40.393, "step": 10098 }, { "epoch": 26.67282931660614, "grad_norm": 488.5462951660156, "learning_rate": 0.00022501764413192598, "loss": 41.9871, "step": 10099 }, { "epoch": 26.675470452294487, "grad_norm": 336.9096984863281, "learning_rate": 0.0002249760770939754, "loss": 42.2509, "step": 10100 }, { "epoch": 26.678111587982833, "grad_norm": 1835.72021484375, "learning_rate": 0.0002249345107548032, "loss": 41.1424, "step": 10101 }, { "epoch": 26.68075272367118, "grad_norm": 634.84130859375, "learning_rate": 0.00022489294511556995, "loss": 40.0419, "step": 10102 }, { "epoch": 26.683393859359526, "grad_norm": 394.6315612792969, "learning_rate": 0.00022485138017743655, "loss": 41.0135, "step": 10103 }, { "epoch": 26.68603499504787, "grad_norm": 1060.598876953125, "learning_rate": 0.00022480981594156358, "loss": 39.6203, "step": 10104 }, { "epoch": 26.688676130736216, "grad_norm": 421.06671142578125, "learning_rate": 0.00022476825240911165, "loss": 37.0154, "step": 10105 }, { "epoch": 26.691317266424562, "grad_norm": 377.8204650878906, "learning_rate": 0.0002247266895812415, "loss": 39.4725, "step": 10106 }, { "epoch": 26.69395840211291, "grad_norm": 429.357666015625, "learning_rate": 0.00022468512745911368, "loss": 36.882, "step": 10107 }, { "epoch": 26.696599537801255, "grad_norm": 674.3328247070312, "learning_rate": 0.00022464356604388883, "loss": 36.0346, "step": 10108 }, { "epoch": 26.6992406734896, "grad_norm": 501.5240478515625, "learning_rate": 0.0002246020053367274, "loss": 36.2451, "step": 10109 }, { "epoch": 26.701881809177948, "grad_norm": 1156.2684326171875, "learning_rate": 0.00022456044533879016, "loss": 36.4736, "step": 10110 }, { "epoch": 26.70452294486629, "grad_norm": 456.268310546875, "learning_rate": 0.00022451888605123756, "loss": 34.027, "step": 10111 }, { "epoch": 26.707164080554637, "grad_norm": 601.6456909179688, "learning_rate": 0.00022447732747522996, "loss": 35.4014, "step": 10112 }, { "epoch": 26.709805216242984, "grad_norm": 1225.6962890625, "learning_rate": 0.00022443576961192808, "loss": 35.7559, "step": 10113 }, { "epoch": 26.71244635193133, "grad_norm": 475.90509033203125, "learning_rate": 0.0002243942124624924, "loss": 34.8049, "step": 10114 }, { "epoch": 26.715087487619677, "grad_norm": 864.1088256835938, "learning_rate": 0.0002243526560280832, "loss": 35.332, "step": 10115 }, { "epoch": 26.717728623308023, "grad_norm": 543.4190673828125, "learning_rate": 0.00022431110030986096, "loss": 36.6372, "step": 10116 }, { "epoch": 26.72036975899637, "grad_norm": 4673.74755859375, "learning_rate": 0.00022426954530898626, "loss": 41.476, "step": 10117 }, { "epoch": 26.723010894684716, "grad_norm": 2642.2412109375, "learning_rate": 0.00022422799102661935, "loss": 19.8733, "step": 10118 }, { "epoch": 26.72565203037306, "grad_norm": 4680.669921875, "learning_rate": 0.0002241864374639206, "loss": 21.8718, "step": 10119 }, { "epoch": 26.728293166061405, "grad_norm": 4876.24658203125, "learning_rate": 0.0002241448846220505, "loss": 17.1412, "step": 10120 }, { "epoch": 26.730934301749752, "grad_norm": 2777.47021484375, "learning_rate": 0.0002241033325021693, "loss": 17.3048, "step": 10121 }, { "epoch": 26.7335754374381, "grad_norm": 1142.0853271484375, "learning_rate": 0.00022406178110543734, "loss": 18.4391, "step": 10122 }, { "epoch": 26.736216573126445, "grad_norm": 2068.43359375, "learning_rate": 0.00022402023043301483, "loss": 18.8691, "step": 10123 }, { "epoch": 26.73885770881479, "grad_norm": 6555.92578125, "learning_rate": 0.0002239786804860622, "loss": 13.9185, "step": 10124 }, { "epoch": 26.741498844503138, "grad_norm": 8297.5400390625, "learning_rate": 0.0002239371312657396, "loss": 11.138, "step": 10125 }, { "epoch": 26.744139980191484, "grad_norm": 1590.9237060546875, "learning_rate": 0.0002238955827732073, "loss": 15.9005, "step": 10126 }, { "epoch": 26.746781115879827, "grad_norm": 423.01300048828125, "learning_rate": 0.00022385403500962552, "loss": 35.006, "step": 10127 }, { "epoch": 26.749422251568173, "grad_norm": 2152.87646484375, "learning_rate": 0.00022381248797615448, "loss": 35.3893, "step": 10128 }, { "epoch": 26.75206338725652, "grad_norm": 493.66021728515625, "learning_rate": 0.00022377094167395432, "loss": 34.8323, "step": 10129 }, { "epoch": 26.754704522944866, "grad_norm": 242.34390258789062, "learning_rate": 0.00022372939610418508, "loss": 33.9608, "step": 10130 }, { "epoch": 26.757345658633213, "grad_norm": 1256.8297119140625, "learning_rate": 0.00022368785126800715, "loss": 35.2898, "step": 10131 }, { "epoch": 26.75998679432156, "grad_norm": 317.2818603515625, "learning_rate": 0.0002236463071665806, "loss": 35.5866, "step": 10132 }, { "epoch": 26.762627930009906, "grad_norm": 482.12908935546875, "learning_rate": 0.0002236047638010652, "loss": 35.0009, "step": 10133 }, { "epoch": 26.76526906569825, "grad_norm": 420.4753723144531, "learning_rate": 0.00022356322117262142, "loss": 34.2296, "step": 10134 }, { "epoch": 26.767910201386595, "grad_norm": 501.43743896484375, "learning_rate": 0.00022352167928240917, "loss": 36.0673, "step": 10135 }, { "epoch": 26.77055133707494, "grad_norm": 432.5773010253906, "learning_rate": 0.00022348013813158846, "loss": 38.7124, "step": 10136 }, { "epoch": 26.773192472763288, "grad_norm": 304.2409973144531, "learning_rate": 0.00022343859772131928, "loss": 35.1351, "step": 10137 }, { "epoch": 26.775833608451634, "grad_norm": 734.634033203125, "learning_rate": 0.00022339705805276167, "loss": 35.1103, "step": 10138 }, { "epoch": 26.77847474413998, "grad_norm": 776.8221435546875, "learning_rate": 0.00022335551912707561, "loss": 34.7675, "step": 10139 }, { "epoch": 26.781115879828327, "grad_norm": 785.7705078125, "learning_rate": 0.00022331398094542097, "loss": 34.1247, "step": 10140 }, { "epoch": 26.783757015516674, "grad_norm": 370.99969482421875, "learning_rate": 0.00022327244350895778, "loss": 34.3771, "step": 10141 }, { "epoch": 26.786398151205017, "grad_norm": 428.3028259277344, "learning_rate": 0.00022323090681884591, "loss": 35.2725, "step": 10142 }, { "epoch": 26.789039286893363, "grad_norm": 481.9070129394531, "learning_rate": 0.00022318937087624526, "loss": 35.6825, "step": 10143 }, { "epoch": 26.79168042258171, "grad_norm": 1051.0062255859375, "learning_rate": 0.00022314783568231562, "loss": 37.6173, "step": 10144 }, { "epoch": 26.794321558270056, "grad_norm": 561.3711547851562, "learning_rate": 0.00022310630123821692, "loss": 42.1547, "step": 10145 }, { "epoch": 26.796962693958402, "grad_norm": 333.8743896484375, "learning_rate": 0.000223064767545109, "loss": 39.343, "step": 10146 }, { "epoch": 26.79960382964675, "grad_norm": 410.0670471191406, "learning_rate": 0.0002230232346041515, "loss": 39.8299, "step": 10147 }, { "epoch": 26.802244965335095, "grad_norm": 437.43035888671875, "learning_rate": 0.00022298170241650444, "loss": 40.238, "step": 10148 }, { "epoch": 26.804886101023442, "grad_norm": 260.95806884765625, "learning_rate": 0.0002229401709833275, "loss": 41.7474, "step": 10149 }, { "epoch": 26.807527236711785, "grad_norm": 388.71612548828125, "learning_rate": 0.00022289864030578034, "loss": 43.5907, "step": 10150 }, { "epoch": 26.81016837240013, "grad_norm": 394.9241943359375, "learning_rate": 0.00022285711038502263, "loss": 43.6438, "step": 10151 }, { "epoch": 26.812809508088478, "grad_norm": 305.3597412109375, "learning_rate": 0.00022281558122221425, "loss": 40.4177, "step": 10152 }, { "epoch": 26.815450643776824, "grad_norm": 446.241943359375, "learning_rate": 0.00022277405281851485, "loss": 41.8827, "step": 10153 }, { "epoch": 26.81809177946517, "grad_norm": 560.6286010742188, "learning_rate": 0.00022273252517508393, "loss": 39.3793, "step": 10154 }, { "epoch": 26.820732915153517, "grad_norm": 1269.7039794921875, "learning_rate": 0.0002226909982930813, "loss": 40.9348, "step": 10155 }, { "epoch": 26.823374050841863, "grad_norm": 247.65133666992188, "learning_rate": 0.00022264947217366651, "loss": 38.3996, "step": 10156 }, { "epoch": 26.826015186530206, "grad_norm": 769.6239624023438, "learning_rate": 0.00022260794681799913, "loss": 38.3606, "step": 10157 }, { "epoch": 26.828656322218553, "grad_norm": 720.7368774414062, "learning_rate": 0.00022256642222723868, "loss": 38.2876, "step": 10158 }, { "epoch": 26.8312974579069, "grad_norm": 582.8023071289062, "learning_rate": 0.00022252489840254486, "loss": 37.0925, "step": 10159 }, { "epoch": 26.833938593595246, "grad_norm": 281.37261962890625, "learning_rate": 0.00022248337534507709, "loss": 35.6383, "step": 10160 }, { "epoch": 26.836579729283592, "grad_norm": 446.9889831542969, "learning_rate": 0.00022244185305599485, "loss": 36.1595, "step": 10161 }, { "epoch": 26.83922086497194, "grad_norm": 296.5177307128906, "learning_rate": 0.00022240033153645772, "loss": 35.8807, "step": 10162 }, { "epoch": 26.841862000660285, "grad_norm": 812.3253784179688, "learning_rate": 0.00022235881078762514, "loss": 34.7175, "step": 10163 }, { "epoch": 26.84450313634863, "grad_norm": 715.1119995117188, "learning_rate": 0.0002223172908106565, "loss": 34.7484, "step": 10164 }, { "epoch": 26.847144272036974, "grad_norm": 380.83612060546875, "learning_rate": 0.00022227577160671116, "loss": 33.8103, "step": 10165 }, { "epoch": 26.84978540772532, "grad_norm": 540.3289794921875, "learning_rate": 0.00022223425317694873, "loss": 35.0448, "step": 10166 }, { "epoch": 26.852426543413667, "grad_norm": 1728.007568359375, "learning_rate": 0.0002221927355225285, "loss": 38.4312, "step": 10167 }, { "epoch": 26.855067679102014, "grad_norm": 540.7081909179688, "learning_rate": 0.00022215121864460965, "loss": 17.556, "step": 10168 }, { "epoch": 26.85770881479036, "grad_norm": 903.064453125, "learning_rate": 0.00022210970254435178, "loss": 12.6138, "step": 10169 }, { "epoch": 26.860349950478707, "grad_norm": 1253.994873046875, "learning_rate": 0.00022206818722291405, "loss": 16.2444, "step": 10170 }, { "epoch": 26.862991086167053, "grad_norm": 1069.2584228515625, "learning_rate": 0.00022202667268145582, "loss": 11.4448, "step": 10171 }, { "epoch": 26.8656322218554, "grad_norm": 1329.6861572265625, "learning_rate": 0.00022198515892113624, "loss": 9.8664, "step": 10172 }, { "epoch": 26.868273357543742, "grad_norm": 1099.80029296875, "learning_rate": 0.00022194364594311472, "loss": 11.1993, "step": 10173 }, { "epoch": 26.87091449323209, "grad_norm": 295.5238342285156, "learning_rate": 0.0002219021337485504, "loss": 14.3952, "step": 10174 }, { "epoch": 26.873555628920435, "grad_norm": 1102.5928955078125, "learning_rate": 0.00022186062233860244, "loss": 12.2084, "step": 10175 }, { "epoch": 26.876196764608782, "grad_norm": 600.80810546875, "learning_rate": 0.00022181911171443008, "loss": 17.0825, "step": 10176 }, { "epoch": 26.87883790029713, "grad_norm": 551.913330078125, "learning_rate": 0.0002217776018771925, "loss": 35.4308, "step": 10177 }, { "epoch": 26.881479035985475, "grad_norm": 1454.72607421875, "learning_rate": 0.00022173609282804882, "loss": 35.1205, "step": 10178 }, { "epoch": 26.88412017167382, "grad_norm": 389.8366394042969, "learning_rate": 0.0002216945845681581, "loss": 37.0869, "step": 10179 }, { "epoch": 26.886761307362164, "grad_norm": 443.46917724609375, "learning_rate": 0.0002216530770986795, "loss": 35.2477, "step": 10180 }, { "epoch": 26.88940244305051, "grad_norm": 221.78616333007812, "learning_rate": 0.0002216115704207721, "loss": 35.0018, "step": 10181 }, { "epoch": 26.892043578738857, "grad_norm": 318.61834716796875, "learning_rate": 0.0002215700645355948, "loss": 35.3207, "step": 10182 }, { "epoch": 26.894684714427203, "grad_norm": 387.33868408203125, "learning_rate": 0.00022152855944430685, "loss": 35.6808, "step": 10183 }, { "epoch": 26.89732585011555, "grad_norm": 1040.7288818359375, "learning_rate": 0.0002214870551480672, "loss": 33.7727, "step": 10184 }, { "epoch": 26.899966985803896, "grad_norm": 1060.142822265625, "learning_rate": 0.00022144555164803475, "loss": 34.944, "step": 10185 }, { "epoch": 26.902608121492243, "grad_norm": 407.03424072265625, "learning_rate": 0.00022140404894536837, "loss": 34.082, "step": 10186 }, { "epoch": 26.90524925718059, "grad_norm": 379.7122497558594, "learning_rate": 0.00022136254704122724, "loss": 35.3609, "step": 10187 }, { "epoch": 26.907890392868932, "grad_norm": 1183.5767822265625, "learning_rate": 0.0002213210459367702, "loss": 35.0683, "step": 10188 }, { "epoch": 26.91053152855728, "grad_norm": 903.3491821289062, "learning_rate": 0.000221279545633156, "loss": 33.4948, "step": 10189 }, { "epoch": 26.913172664245625, "grad_norm": 691.0319213867188, "learning_rate": 0.0002212380461315437, "loss": 35.1157, "step": 10190 }, { "epoch": 26.91581379993397, "grad_norm": 2968.457763671875, "learning_rate": 0.00022119654743309202, "loss": 33.8761, "step": 10191 }, { "epoch": 26.918454935622318, "grad_norm": 761.0626831054688, "learning_rate": 0.00022115504953895988, "loss": 33.757, "step": 10192 }, { "epoch": 26.921096071310664, "grad_norm": 1802.4000244140625, "learning_rate": 0.00022111355245030596, "loss": 35.7496, "step": 10193 }, { "epoch": 26.92373720699901, "grad_norm": 503.4334716796875, "learning_rate": 0.00022107205616828918, "loss": 38.1707, "step": 10194 }, { "epoch": 26.926378342687357, "grad_norm": 2618.371826171875, "learning_rate": 0.00022103056069406823, "loss": 39.5867, "step": 10195 }, { "epoch": 26.9290194783757, "grad_norm": 549.9456176757812, "learning_rate": 0.00022098906602880183, "loss": 38.719, "step": 10196 }, { "epoch": 26.931660614064047, "grad_norm": 1136.5853271484375, "learning_rate": 0.00022094757217364875, "loss": 40.9496, "step": 10197 }, { "epoch": 26.934301749752393, "grad_norm": 318.7315673828125, "learning_rate": 0.00022090607912976764, "loss": 41.0964, "step": 10198 }, { "epoch": 26.93694288544074, "grad_norm": 400.2726135253906, "learning_rate": 0.0002208645868983172, "loss": 43.4615, "step": 10199 }, { "epoch": 26.939584021129086, "grad_norm": 362.19677734375, "learning_rate": 0.00022082309548045596, "loss": 40.7808, "step": 10200 }, { "epoch": 26.939584021129086, "eval_loss": 3.8901681900024414, "eval_runtime": 2.1665, "eval_samples_per_second": 228.474, "eval_steps_per_second": 28.617, "step": 10200 }, { "epoch": 26.942225156817432, "grad_norm": 489.3386535644531, "learning_rate": 0.0002207816048773427, "loss": 39.2027, "step": 10201 }, { "epoch": 26.94486629250578, "grad_norm": 363.4309387207031, "learning_rate": 0.00022074011509013605, "loss": 37.4698, "step": 10202 }, { "epoch": 26.947507428194122, "grad_norm": 551.6636962890625, "learning_rate": 0.00022069862611999433, "loss": 35.846, "step": 10203 }, { "epoch": 26.95014856388247, "grad_norm": 433.5449523925781, "learning_rate": 0.0002206571379680764, "loss": 35.1283, "step": 10204 }, { "epoch": 26.952789699570815, "grad_norm": 430.82110595703125, "learning_rate": 0.00022061565063554063, "loss": 35.9306, "step": 10205 }, { "epoch": 26.95543083525916, "grad_norm": 309.677978515625, "learning_rate": 0.00022057416412354553, "loss": 35.614, "step": 10206 }, { "epoch": 26.958071970947508, "grad_norm": 595.679443359375, "learning_rate": 0.00022053267843324958, "loss": 33.2456, "step": 10207 }, { "epoch": 26.960713106635854, "grad_norm": 25759.6328125, "learning_rate": 0.00022049119356581132, "loss": 15.0441, "step": 10208 }, { "epoch": 26.9633542423242, "grad_norm": 3210.432373046875, "learning_rate": 0.00022044970952238917, "loss": 19.8758, "step": 10209 }, { "epoch": 26.965995378012547, "grad_norm": 1229.8548583984375, "learning_rate": 0.00022040822630414142, "loss": 11.1724, "step": 10210 }, { "epoch": 26.96863651370089, "grad_norm": 3246.50146484375, "learning_rate": 0.00022036674391222666, "loss": 14.997, "step": 10211 }, { "epoch": 26.971277649389236, "grad_norm": 19258.33203125, "learning_rate": 0.00022032526234780318, "loss": 18.3051, "step": 10212 }, { "epoch": 26.973918785077583, "grad_norm": 729.4208374023438, "learning_rate": 0.00022028378161202927, "loss": 26.9576, "step": 10213 }, { "epoch": 26.97655992076593, "grad_norm": 303.5162658691406, "learning_rate": 0.0002202423017060633, "loss": 34.7801, "step": 10214 }, { "epoch": 26.979201056454276, "grad_norm": 491.57818603515625, "learning_rate": 0.0002202008226310636, "loss": 36.72, "step": 10215 }, { "epoch": 26.981842192142622, "grad_norm": 540.75537109375, "learning_rate": 0.00022015934438818842, "loss": 34.3785, "step": 10216 }, { "epoch": 26.98448332783097, "grad_norm": 2453.635498046875, "learning_rate": 0.00022011786697859594, "loss": 35.126, "step": 10217 }, { "epoch": 26.987124463519315, "grad_norm": 501.3531494140625, "learning_rate": 0.00022007639040344456, "loss": 35.6853, "step": 10218 }, { "epoch": 26.989765599207658, "grad_norm": 1075.5963134765625, "learning_rate": 0.00022003491466389242, "loss": 34.7994, "step": 10219 }, { "epoch": 26.992406734896004, "grad_norm": 395.7532043457031, "learning_rate": 0.00021999343976109755, "loss": 35.5051, "step": 10220 }, { "epoch": 26.99504787058435, "grad_norm": 3010.687255859375, "learning_rate": 0.00021995196569621835, "loss": 35.6068, "step": 10221 }, { "epoch": 26.997689006272697, "grad_norm": 1645.778564453125, "learning_rate": 0.0002199104924704129, "loss": 35.6311, "step": 10222 }, { "epoch": 27.000330141961044, "grad_norm": 767.13720703125, "learning_rate": 0.00021986902008483923, "loss": 38.4368, "step": 10223 }, { "epoch": 27.00297127764939, "grad_norm": 454.60772705078125, "learning_rate": 0.00021982754854065544, "loss": 39.824, "step": 10224 }, { "epoch": 27.005612413337737, "grad_norm": 948.8088989257812, "learning_rate": 0.00021978607783901965, "loss": 37.6618, "step": 10225 }, { "epoch": 27.00825354902608, "grad_norm": 884.6514282226562, "learning_rate": 0.0002197446079810899, "loss": 38.7825, "step": 10226 }, { "epoch": 27.010894684714426, "grad_norm": 473.16839599609375, "learning_rate": 0.00021970313896802417, "loss": 40.3973, "step": 10227 }, { "epoch": 27.013535820402772, "grad_norm": 434.881591796875, "learning_rate": 0.00021966167080098053, "loss": 40.0707, "step": 10228 }, { "epoch": 27.01617695609112, "grad_norm": 1204.0390625, "learning_rate": 0.00021962020348111693, "loss": 42.7179, "step": 10229 }, { "epoch": 27.018818091779465, "grad_norm": 1022.9898071289062, "learning_rate": 0.00021957873700959127, "loss": 43.0758, "step": 10230 }, { "epoch": 27.021459227467812, "grad_norm": 820.9922485351562, "learning_rate": 0.0002195372713875614, "loss": 40.7064, "step": 10231 }, { "epoch": 27.02410036315616, "grad_norm": 527.9517211914062, "learning_rate": 0.00021949580661618552, "loss": 39.4574, "step": 10232 }, { "epoch": 27.026741498844505, "grad_norm": 758.6876831054688, "learning_rate": 0.00021945434269662127, "loss": 39.7178, "step": 10233 }, { "epoch": 27.029382634532848, "grad_norm": 475.9188537597656, "learning_rate": 0.00021941287963002642, "loss": 37.4802, "step": 10234 }, { "epoch": 27.032023770221194, "grad_norm": 259.29620361328125, "learning_rate": 0.00021937141741755907, "loss": 38.2685, "step": 10235 }, { "epoch": 27.03466490590954, "grad_norm": 289.0583801269531, "learning_rate": 0.00021932995606037688, "loss": 36.3355, "step": 10236 }, { "epoch": 27.037306041597887, "grad_norm": 414.6866760253906, "learning_rate": 0.0002192884955596377, "loss": 34.4877, "step": 10237 }, { "epoch": 27.039947177286233, "grad_norm": 689.8402099609375, "learning_rate": 0.00021924703591649907, "loss": 36.4618, "step": 10238 }, { "epoch": 27.04258831297458, "grad_norm": 323.615966796875, "learning_rate": 0.00021920557713211904, "loss": 34.5113, "step": 10239 }, { "epoch": 27.045229448662926, "grad_norm": 502.9082946777344, "learning_rate": 0.00021916411920765516, "loss": 36.2558, "step": 10240 }, { "epoch": 27.047870584351273, "grad_norm": 729.7155151367188, "learning_rate": 0.0002191226621442651, "loss": 35.1768, "step": 10241 }, { "epoch": 27.050511720039616, "grad_norm": 475.19647216796875, "learning_rate": 0.0002190812059431066, "loss": 34.7481, "step": 10242 }, { "epoch": 27.053152855727962, "grad_norm": 390.00079345703125, "learning_rate": 0.00021903975060533727, "loss": 34.8794, "step": 10243 }, { "epoch": 27.05579399141631, "grad_norm": 1182.943359375, "learning_rate": 0.00021899829613211469, "loss": 35.3878, "step": 10244 }, { "epoch": 27.058435127104655, "grad_norm": 1430.09765625, "learning_rate": 0.00021895684252459647, "loss": 40.3149, "step": 10245 }, { "epoch": 27.061076262793, "grad_norm": 1348.7769775390625, "learning_rate": 0.0002189153897839402, "loss": 14.7066, "step": 10246 }, { "epoch": 27.063717398481348, "grad_norm": 2087.47607421875, "learning_rate": 0.0002188739379113034, "loss": 21.7688, "step": 10247 }, { "epoch": 27.066358534169694, "grad_norm": 1242.920166015625, "learning_rate": 0.00021883248690784355, "loss": 18.6062, "step": 10248 }, { "epoch": 27.068999669858037, "grad_norm": 4201.28515625, "learning_rate": 0.0002187910367747183, "loss": 18.3652, "step": 10249 }, { "epoch": 27.071640805546384, "grad_norm": 3211.173095703125, "learning_rate": 0.000218749587513085, "loss": 13.6603, "step": 10250 }, { "epoch": 27.07428194123473, "grad_norm": 627.6304321289062, "learning_rate": 0.0002187081391241011, "loss": 21.5877, "step": 10251 }, { "epoch": 27.076923076923077, "grad_norm": 3869.21630859375, "learning_rate": 0.00021866669160892392, "loss": 13.8863, "step": 10252 }, { "epoch": 27.079564212611423, "grad_norm": 1232.4627685546875, "learning_rate": 0.00021862524496871108, "loss": 10.6435, "step": 10253 }, { "epoch": 27.08220534829977, "grad_norm": 1809.767822265625, "learning_rate": 0.00021858379920461991, "loss": 11.856, "step": 10254 }, { "epoch": 27.084846483988116, "grad_norm": 1306.6494140625, "learning_rate": 0.0002185423543178075, "loss": 10.3027, "step": 10255 }, { "epoch": 27.087487619676462, "grad_norm": 544.8922119140625, "learning_rate": 0.00021850091030943148, "loss": 35.5003, "step": 10256 }, { "epoch": 27.090128755364805, "grad_norm": 637.8751220703125, "learning_rate": 0.00021845946718064907, "loss": 33.8304, "step": 10257 }, { "epoch": 27.092769891053152, "grad_norm": 1172.04931640625, "learning_rate": 0.00021841802493261748, "loss": 34.3039, "step": 10258 }, { "epoch": 27.0954110267415, "grad_norm": 1010.841552734375, "learning_rate": 0.00021837658356649395, "loss": 35.1306, "step": 10259 }, { "epoch": 27.098052162429845, "grad_norm": 559.9378051757812, "learning_rate": 0.0002183351430834358, "loss": 33.8436, "step": 10260 }, { "epoch": 27.10069329811819, "grad_norm": 481.3129577636719, "learning_rate": 0.00021829370348460018, "loss": 36.3065, "step": 10261 }, { "epoch": 27.103334433806538, "grad_norm": 349.3316955566406, "learning_rate": 0.0002182522647711442, "loss": 34.8492, "step": 10262 }, { "epoch": 27.105975569494884, "grad_norm": 275.6000061035156, "learning_rate": 0.0002182108269442251, "loss": 36.0977, "step": 10263 }, { "epoch": 27.10861670518323, "grad_norm": 497.15472412109375, "learning_rate": 0.00021816939000500005, "loss": 34.3234, "step": 10264 }, { "epoch": 27.111257840871573, "grad_norm": 1269.778564453125, "learning_rate": 0.00021812795395462607, "loss": 35.1168, "step": 10265 }, { "epoch": 27.11389897655992, "grad_norm": 517.7770385742188, "learning_rate": 0.00021808651879426011, "loss": 35.2022, "step": 10266 }, { "epoch": 27.116540112248266, "grad_norm": 384.8587341308594, "learning_rate": 0.00021804508452505955, "loss": 34.9524, "step": 10267 }, { "epoch": 27.119181247936613, "grad_norm": 344.9941101074219, "learning_rate": 0.0002180036511481812, "loss": 34.4728, "step": 10268 }, { "epoch": 27.12182238362496, "grad_norm": 433.7922058105469, "learning_rate": 0.00021796221866478193, "loss": 35.4212, "step": 10269 }, { "epoch": 27.124463519313306, "grad_norm": 613.856689453125, "learning_rate": 0.00021792078707601905, "loss": 34.7968, "step": 10270 }, { "epoch": 27.127104655001652, "grad_norm": 712.3602294921875, "learning_rate": 0.00021787935638304935, "loss": 35.3236, "step": 10271 }, { "epoch": 27.129745790689995, "grad_norm": 704.4705810546875, "learning_rate": 0.00021783792658702978, "loss": 36.3194, "step": 10272 }, { "epoch": 27.13238692637834, "grad_norm": 965.912353515625, "learning_rate": 0.00021779649768911705, "loss": 40.2637, "step": 10273 }, { "epoch": 27.135028062066688, "grad_norm": 347.9169006347656, "learning_rate": 0.00021775506969046833, "loss": 38.056, "step": 10274 }, { "epoch": 27.137669197755034, "grad_norm": 583.322509765625, "learning_rate": 0.00021771364259224032, "loss": 40.0071, "step": 10275 }, { "epoch": 27.14031033344338, "grad_norm": 527.9020385742188, "learning_rate": 0.00021767221639558984, "loss": 39.9643, "step": 10276 }, { "epoch": 27.142951469131727, "grad_norm": 634.965087890625, "learning_rate": 0.00021763079110167377, "loss": 41.5973, "step": 10277 }, { "epoch": 27.145592604820074, "grad_norm": 583.245361328125, "learning_rate": 0.00021758936671164886, "loss": 43.0706, "step": 10278 }, { "epoch": 27.14823374050842, "grad_norm": 315.5676574707031, "learning_rate": 0.00021754794322667185, "loss": 41.3365, "step": 10279 }, { "epoch": 27.150874876196763, "grad_norm": 538.7821655273438, "learning_rate": 0.00021750652064789937, "loss": 41.8392, "step": 10280 }, { "epoch": 27.15351601188511, "grad_norm": 589.0797119140625, "learning_rate": 0.00021746509897648832, "loss": 40.9461, "step": 10281 }, { "epoch": 27.156157147573456, "grad_norm": 541.9484252929688, "learning_rate": 0.00021742367821359524, "loss": 39.8986, "step": 10282 }, { "epoch": 27.158798283261802, "grad_norm": 625.6246337890625, "learning_rate": 0.0002173822583603767, "loss": 38.3735, "step": 10283 }, { "epoch": 27.16143941895015, "grad_norm": 652.0928344726562, "learning_rate": 0.0002173408394179896, "loss": 40.6804, "step": 10284 }, { "epoch": 27.164080554638495, "grad_norm": 274.3187561035156, "learning_rate": 0.00021729942138759036, "loss": 37.1292, "step": 10285 }, { "epoch": 27.166721690326842, "grad_norm": 498.7613830566406, "learning_rate": 0.0002172580042703355, "loss": 37.2797, "step": 10286 }, { "epoch": 27.16936282601519, "grad_norm": 407.8238830566406, "learning_rate": 0.00021721658806738158, "loss": 37.2462, "step": 10287 }, { "epoch": 27.17200396170353, "grad_norm": 478.9040222167969, "learning_rate": 0.00021717517277988524, "loss": 36.1056, "step": 10288 }, { "epoch": 27.174645097391878, "grad_norm": 394.6604919433594, "learning_rate": 0.00021713375840900302, "loss": 35.8118, "step": 10289 }, { "epoch": 27.177286233080224, "grad_norm": 422.033203125, "learning_rate": 0.00021709234495589108, "loss": 34.9155, "step": 10290 }, { "epoch": 27.17992736876857, "grad_norm": 227.83558654785156, "learning_rate": 0.00021705093242170624, "loss": 36.9471, "step": 10291 }, { "epoch": 27.182568504456917, "grad_norm": 341.8893127441406, "learning_rate": 0.00021700952080760472, "loss": 35.6022, "step": 10292 }, { "epoch": 27.185209640145263, "grad_norm": 924.910400390625, "learning_rate": 0.00021696811011474295, "loss": 35.7015, "step": 10293 }, { "epoch": 27.18785077583361, "grad_norm": 436.107421875, "learning_rate": 0.00021692670034427724, "loss": 35.1268, "step": 10294 }, { "epoch": 27.190491911521953, "grad_norm": 1432.7066650390625, "learning_rate": 0.00021688529149736408, "loss": 41.8655, "step": 10295 }, { "epoch": 27.1931330472103, "grad_norm": 3290.794189453125, "learning_rate": 0.00021684388357515972, "loss": 19.663, "step": 10296 }, { "epoch": 27.195774182898646, "grad_norm": 4066.929931640625, "learning_rate": 0.00021680247657882031, "loss": 18.8265, "step": 10297 }, { "epoch": 27.198415318586992, "grad_norm": 3343.8359375, "learning_rate": 0.00021676107050950234, "loss": 20.8944, "step": 10298 }, { "epoch": 27.20105645427534, "grad_norm": 3774.3828125, "learning_rate": 0.00021671966536836195, "loss": 20.998, "step": 10299 }, { "epoch": 27.203697589963685, "grad_norm": 7433.37548828125, "learning_rate": 0.00021667826115655538, "loss": 23.3513, "step": 10300 }, { "epoch": 27.20633872565203, "grad_norm": 928.3593139648438, "learning_rate": 0.00021663685787523862, "loss": 15.7053, "step": 10301 }, { "epoch": 27.208979861340378, "grad_norm": 1399.7703857421875, "learning_rate": 0.0002165954555255682, "loss": 19.4396, "step": 10302 }, { "epoch": 27.21162099702872, "grad_norm": 5516.96923828125, "learning_rate": 0.00021655405410869998, "loss": 17.6404, "step": 10303 }, { "epoch": 27.214262132717067, "grad_norm": 3257.94921875, "learning_rate": 0.00021651265362579008, "loss": 14.639, "step": 10304 }, { "epoch": 27.216903268405414, "grad_norm": 1670.962646484375, "learning_rate": 0.0002164712540779948, "loss": 23.6579, "step": 10305 }, { "epoch": 27.21954440409376, "grad_norm": 894.7132568359375, "learning_rate": 0.00021642985546646999, "loss": 36.7767, "step": 10306 }, { "epoch": 27.222185539782107, "grad_norm": 631.7633666992188, "learning_rate": 0.00021638845779237183, "loss": 35.6878, "step": 10307 }, { "epoch": 27.224826675470453, "grad_norm": 658.1669921875, "learning_rate": 0.00021634706105685606, "loss": 35.3575, "step": 10308 }, { "epoch": 27.2274678111588, "grad_norm": 486.38104248046875, "learning_rate": 0.00021630566526107894, "loss": 34.5494, "step": 10309 }, { "epoch": 27.230108946847146, "grad_norm": 585.3006591796875, "learning_rate": 0.00021626427040619635, "loss": 34.1401, "step": 10310 }, { "epoch": 27.23275008253549, "grad_norm": 1425.3175048828125, "learning_rate": 0.00021622287649336415, "loss": 34.7958, "step": 10311 }, { "epoch": 27.235391218223835, "grad_norm": 1333.7086181640625, "learning_rate": 0.0002161814835237383, "loss": 34.6894, "step": 10312 }, { "epoch": 27.238032353912182, "grad_norm": 636.9788818359375, "learning_rate": 0.00021614009149847465, "loss": 35.5598, "step": 10313 }, { "epoch": 27.24067348960053, "grad_norm": 708.4896850585938, "learning_rate": 0.00021609870041872908, "loss": 35.5683, "step": 10314 }, { "epoch": 27.243314625288875, "grad_norm": 1282.1513671875, "learning_rate": 0.00021605731028565735, "loss": 36.6121, "step": 10315 }, { "epoch": 27.24595576097722, "grad_norm": 586.3878173828125, "learning_rate": 0.00021601592110041533, "loss": 35.6907, "step": 10316 }, { "epoch": 27.248596896665568, "grad_norm": 959.858154296875, "learning_rate": 0.00021597453286415874, "loss": 34.5189, "step": 10317 }, { "epoch": 27.25123803235391, "grad_norm": 649.7307739257812, "learning_rate": 0.0002159331455780432, "loss": 35.5042, "step": 10318 }, { "epoch": 27.253879168042257, "grad_norm": 521.1798095703125, "learning_rate": 0.00021589175924322478, "loss": 35.5916, "step": 10319 }, { "epoch": 27.256520303730603, "grad_norm": 1151.929443359375, "learning_rate": 0.00021585037386085884, "loss": 35.6869, "step": 10320 }, { "epoch": 27.25916143941895, "grad_norm": 735.7586669921875, "learning_rate": 0.0002158089894321012, "loss": 36.9015, "step": 10321 }, { "epoch": 27.261802575107296, "grad_norm": 974.3328857421875, "learning_rate": 0.0002157676059581073, "loss": 39.1554, "step": 10322 }, { "epoch": 27.264443710795643, "grad_norm": 3770.16748046875, "learning_rate": 0.000215726223440033, "loss": 41.2257, "step": 10323 }, { "epoch": 27.26708484648399, "grad_norm": 488.1509094238281, "learning_rate": 0.00021568484187903383, "loss": 40.4504, "step": 10324 }, { "epoch": 27.269725982172336, "grad_norm": 512.8577270507812, "learning_rate": 0.00021564346127626516, "loss": 40.5456, "step": 10325 }, { "epoch": 27.27236711786068, "grad_norm": 1519.6207275390625, "learning_rate": 0.00021560208163288276, "loss": 40.1588, "step": 10326 }, { "epoch": 27.275008253549025, "grad_norm": 501.9693603515625, "learning_rate": 0.00021556070295004198, "loss": 40.3792, "step": 10327 }, { "epoch": 27.27764938923737, "grad_norm": 1346.6556396484375, "learning_rate": 0.00021551932522889837, "loss": 41.3852, "step": 10328 }, { "epoch": 27.280290524925718, "grad_norm": 704.4264526367188, "learning_rate": 0.00021547794847060728, "loss": 45.1406, "step": 10329 }, { "epoch": 27.282931660614064, "grad_norm": 531.1487426757812, "learning_rate": 0.00021543657267632428, "loss": 42.2755, "step": 10330 }, { "epoch": 27.28557279630241, "grad_norm": 1236.80908203125, "learning_rate": 0.00021539519784720466, "loss": 42.6334, "step": 10331 }, { "epoch": 27.288213931990757, "grad_norm": 804.459228515625, "learning_rate": 0.00021535382398440376, "loss": 40.1853, "step": 10332 }, { "epoch": 27.290855067679104, "grad_norm": 1741.2977294921875, "learning_rate": 0.00021531245108907704, "loss": 40.3876, "step": 10333 }, { "epoch": 27.293496203367447, "grad_norm": 1367.53662109375, "learning_rate": 0.00021527107916237973, "loss": 39.5888, "step": 10334 }, { "epoch": 27.296137339055793, "grad_norm": 1316.5205078125, "learning_rate": 0.00021522970820546714, "loss": 39.0782, "step": 10335 }, { "epoch": 27.29877847474414, "grad_norm": 567.7938842773438, "learning_rate": 0.0002151883382194944, "loss": 37.1987, "step": 10336 }, { "epoch": 27.301419610432486, "grad_norm": 986.3611450195312, "learning_rate": 0.00021514696920561704, "loss": 37.3059, "step": 10337 }, { "epoch": 27.304060746120832, "grad_norm": 534.6068725585938, "learning_rate": 0.00021510560116499007, "loss": 37.0094, "step": 10338 }, { "epoch": 27.30670188180918, "grad_norm": 616.7809448242188, "learning_rate": 0.00021506423409876855, "loss": 38.0413, "step": 10339 }, { "epoch": 27.309343017497525, "grad_norm": 859.1865234375, "learning_rate": 0.0002150228680081079, "loss": 36.0045, "step": 10340 }, { "epoch": 27.31198415318587, "grad_norm": 1137.3646240234375, "learning_rate": 0.00021498150289416307, "loss": 35.551, "step": 10341 }, { "epoch": 27.314625288874215, "grad_norm": 599.9019165039062, "learning_rate": 0.0002149401387580893, "loss": 34.1455, "step": 10342 }, { "epoch": 27.31726642456256, "grad_norm": 833.7528686523438, "learning_rate": 0.0002148987756010414, "loss": 35.671, "step": 10343 }, { "epoch": 27.319907560250908, "grad_norm": 691.8887939453125, "learning_rate": 0.00021485741342417463, "loss": 36.2321, "step": 10344 }, { "epoch": 27.322548695939254, "grad_norm": 3647.150634765625, "learning_rate": 0.000214816052228644, "loss": 39.9863, "step": 10345 }, { "epoch": 27.3251898316276, "grad_norm": 1679.6658935546875, "learning_rate": 0.00021477469201560434, "loss": 24.038, "step": 10346 }, { "epoch": 27.327830967315947, "grad_norm": 8517.6025390625, "learning_rate": 0.0002147333327862108, "loss": 19.8509, "step": 10347 }, { "epoch": 27.330472103004293, "grad_norm": 2193.380126953125, "learning_rate": 0.0002146919745416182, "loss": 15.9586, "step": 10348 }, { "epoch": 27.333113238692636, "grad_norm": 10248.689453125, "learning_rate": 0.00021465061728298144, "loss": 16.6079, "step": 10349 }, { "epoch": 27.335754374380983, "grad_norm": 4070.887939453125, "learning_rate": 0.00021460926101145547, "loss": 12.4713, "step": 10350 }, { "epoch": 27.33839551006933, "grad_norm": 2191.33837890625, "learning_rate": 0.0002145679057281951, "loss": 18.578, "step": 10351 }, { "epoch": 27.341036645757676, "grad_norm": 5832.55224609375, "learning_rate": 0.00021452655143435514, "loss": 25.0609, "step": 10352 }, { "epoch": 27.343677781446022, "grad_norm": 3524.950927734375, "learning_rate": 0.0002144851981310903, "loss": 18.8782, "step": 10353 }, { "epoch": 27.34631891713437, "grad_norm": 627.3440551757812, "learning_rate": 0.0002144438458195556, "loss": 14.686, "step": 10354 }, { "epoch": 27.348960052822715, "grad_norm": 9004.265625, "learning_rate": 0.00021440249450090554, "loss": 23.0029, "step": 10355 }, { "epoch": 27.35160118851106, "grad_norm": 808.3221435546875, "learning_rate": 0.0002143611441762948, "loss": 36.3034, "step": 10356 }, { "epoch": 27.354242324199404, "grad_norm": 679.0690307617188, "learning_rate": 0.00021431979484687832, "loss": 35.6811, "step": 10357 }, { "epoch": 27.35688345988775, "grad_norm": 1877.9449462890625, "learning_rate": 0.0002142784465138106, "loss": 36.6335, "step": 10358 }, { "epoch": 27.359524595576097, "grad_norm": 944.1171875, "learning_rate": 0.0002142370991782463, "loss": 35.3327, "step": 10359 }, { "epoch": 27.362165731264444, "grad_norm": 647.04150390625, "learning_rate": 0.00021419575284133992, "loss": 34.2581, "step": 10360 }, { "epoch": 27.36480686695279, "grad_norm": 752.8772583007812, "learning_rate": 0.00021415440750424615, "loss": 34.1859, "step": 10361 }, { "epoch": 27.367448002641137, "grad_norm": 739.1809692382812, "learning_rate": 0.00021411306316811949, "loss": 37.0336, "step": 10362 }, { "epoch": 27.370089138329483, "grad_norm": 2925.058349609375, "learning_rate": 0.0002140717198341144, "loss": 35.3221, "step": 10363 }, { "epoch": 27.372730274017826, "grad_norm": 1974.8807373046875, "learning_rate": 0.0002140303775033855, "loss": 34.6288, "step": 10364 }, { "epoch": 27.375371409706172, "grad_norm": 819.7526245117188, "learning_rate": 0.00021398903617708716, "loss": 35.1484, "step": 10365 }, { "epoch": 27.37801254539452, "grad_norm": 1108.4544677734375, "learning_rate": 0.00021394769585637385, "loss": 37.0058, "step": 10366 }, { "epoch": 27.380653681082865, "grad_norm": 1069.8280029296875, "learning_rate": 0.00021390635654239991, "loss": 36.0622, "step": 10367 }, { "epoch": 27.383294816771212, "grad_norm": 1323.9727783203125, "learning_rate": 0.00021386501823631983, "loss": 34.9692, "step": 10368 }, { "epoch": 27.38593595245956, "grad_norm": 628.5791625976562, "learning_rate": 0.00021382368093928784, "loss": 35.1736, "step": 10369 }, { "epoch": 27.388577088147905, "grad_norm": 1103.2708740234375, "learning_rate": 0.00021378234465245822, "loss": 36.1567, "step": 10370 }, { "epoch": 27.39121822383625, "grad_norm": 859.253173828125, "learning_rate": 0.00021374100937698545, "loss": 37.513, "step": 10371 }, { "epoch": 27.393859359524594, "grad_norm": 980.3148193359375, "learning_rate": 0.00021369967511402377, "loss": 38.8258, "step": 10372 }, { "epoch": 27.39650049521294, "grad_norm": 847.2576293945312, "learning_rate": 0.00021365834186472726, "loss": 41.1004, "step": 10373 }, { "epoch": 27.399141630901287, "grad_norm": 348.57098388671875, "learning_rate": 0.00021361700963025014, "loss": 38.8663, "step": 10374 }, { "epoch": 27.401782766589633, "grad_norm": 794.7417602539062, "learning_rate": 0.0002135756784117467, "loss": 39.9999, "step": 10375 }, { "epoch": 27.40442390227798, "grad_norm": 1100.255126953125, "learning_rate": 0.0002135343482103711, "loss": 40.1468, "step": 10376 }, { "epoch": 27.407065037966326, "grad_norm": 655.1250610351562, "learning_rate": 0.0002134930190272773, "loss": 39.8298, "step": 10377 }, { "epoch": 27.409706173654673, "grad_norm": 1092.108154296875, "learning_rate": 0.0002134516908636196, "loss": 41.7973, "step": 10378 }, { "epoch": 27.41234730934302, "grad_norm": 2099.639892578125, "learning_rate": 0.00021341036372055195, "loss": 44.0006, "step": 10379 }, { "epoch": 27.414988445031362, "grad_norm": 702.499267578125, "learning_rate": 0.00021336903759922837, "loss": 42.4188, "step": 10380 }, { "epoch": 27.41762958071971, "grad_norm": 1056.1175537109375, "learning_rate": 0.00021332771250080288, "loss": 41.2033, "step": 10381 }, { "epoch": 27.420270716408055, "grad_norm": 923.6319580078125, "learning_rate": 0.0002132863884264295, "loss": 41.5436, "step": 10382 }, { "epoch": 27.4229118520964, "grad_norm": 485.7117004394531, "learning_rate": 0.00021324506537726218, "loss": 39.5235, "step": 10383 }, { "epoch": 27.425552987784748, "grad_norm": 487.77252197265625, "learning_rate": 0.00021320374335445476, "loss": 38.5739, "step": 10384 }, { "epoch": 27.428194123473094, "grad_norm": 839.6506958007812, "learning_rate": 0.0002131624223591612, "loss": 37.5166, "step": 10385 }, { "epoch": 27.43083525916144, "grad_norm": 460.8053894042969, "learning_rate": 0.00021312110239253543, "loss": 37.9358, "step": 10386 }, { "epoch": 27.433476394849784, "grad_norm": 963.3076171875, "learning_rate": 0.00021307978345573117, "loss": 37.53, "step": 10387 }, { "epoch": 27.43611753053813, "grad_norm": 597.1748046875, "learning_rate": 0.00021303846554990218, "loss": 35.633, "step": 10388 }, { "epoch": 27.438758666226477, "grad_norm": 641.7100830078125, "learning_rate": 0.00021299714867620246, "loss": 36.3774, "step": 10389 }, { "epoch": 27.441399801914823, "grad_norm": 853.0494995117188, "learning_rate": 0.0002129558328357856, "loss": 36.1919, "step": 10390 }, { "epoch": 27.44404093760317, "grad_norm": 1073.8970947265625, "learning_rate": 0.00021291451802980522, "loss": 34.8485, "step": 10391 }, { "epoch": 27.446682073291516, "grad_norm": 491.760009765625, "learning_rate": 0.00021287320425941524, "loss": 34.4158, "step": 10392 }, { "epoch": 27.449323208979862, "grad_norm": 612.0150146484375, "learning_rate": 0.00021283189152576927, "loss": 35.4834, "step": 10393 }, { "epoch": 27.45196434466821, "grad_norm": 1004.3206787109375, "learning_rate": 0.00021279057983002086, "loss": 35.4442, "step": 10394 }, { "epoch": 27.454605480356552, "grad_norm": 2835.18994140625, "learning_rate": 0.0002127492691733236, "loss": 39.1712, "step": 10395 }, { "epoch": 27.4572466160449, "grad_norm": 1426.43212890625, "learning_rate": 0.00021270795955683118, "loss": 27.6229, "step": 10396 }, { "epoch": 27.459887751733245, "grad_norm": 5060.02978515625, "learning_rate": 0.00021266665098169707, "loss": 13.3561, "step": 10397 }, { "epoch": 27.46252888742159, "grad_norm": 1113.700927734375, "learning_rate": 0.00021262534344907474, "loss": 11.4792, "step": 10398 }, { "epoch": 27.465170023109938, "grad_norm": 22929.05859375, "learning_rate": 0.0002125840369601178, "loss": 13.1775, "step": 10399 }, { "epoch": 27.467811158798284, "grad_norm": 2029.7340087890625, "learning_rate": 0.00021254273151597963, "loss": 15.1421, "step": 10400 }, { "epoch": 27.467811158798284, "eval_loss": 3.7762093544006348, "eval_runtime": 2.1219, "eval_samples_per_second": 233.285, "eval_steps_per_second": 29.219, "step": 10400 }, { "epoch": 27.47045229448663, "grad_norm": 2861.456298828125, "learning_rate": 0.00021250142711781372, "loss": 16.2416, "step": 10401 }, { "epoch": 27.473093430174977, "grad_norm": 6897.443359375, "learning_rate": 0.00021246012376677334, "loss": 17.4712, "step": 10402 }, { "epoch": 27.47573456586332, "grad_norm": 2605.982666015625, "learning_rate": 0.00021241882146401205, "loss": 16.4499, "step": 10403 }, { "epoch": 27.478375701551666, "grad_norm": 5481.17919921875, "learning_rate": 0.00021237752021068305, "loss": 15.9396, "step": 10404 }, { "epoch": 27.481016837240013, "grad_norm": 1495.4984130859375, "learning_rate": 0.0002123362200079396, "loss": 20.0269, "step": 10405 }, { "epoch": 27.48365797292836, "grad_norm": 1079.4129638671875, "learning_rate": 0.0002122949208569352, "loss": 36.8526, "step": 10406 }, { "epoch": 27.486299108616706, "grad_norm": 377.8696594238281, "learning_rate": 0.00021225362275882303, "loss": 35.6328, "step": 10407 }, { "epoch": 27.488940244305052, "grad_norm": 959.376708984375, "learning_rate": 0.00021221232571475623, "loss": 35.3543, "step": 10408 }, { "epoch": 27.4915813799934, "grad_norm": 415.3604736328125, "learning_rate": 0.00021217102972588791, "loss": 36.5336, "step": 10409 }, { "epoch": 27.49422251568174, "grad_norm": 487.9943542480469, "learning_rate": 0.0002121297347933715, "loss": 35.2166, "step": 10410 }, { "epoch": 27.496863651370088, "grad_norm": 1085.32080078125, "learning_rate": 0.00021208844091835996, "loss": 35.032, "step": 10411 }, { "epoch": 27.499504787058434, "grad_norm": 475.9812316894531, "learning_rate": 0.0002120471481020064, "loss": 37.0369, "step": 10412 }, { "epoch": 27.50214592274678, "grad_norm": 618.6546630859375, "learning_rate": 0.000212005856345464, "loss": 34.5035, "step": 10413 }, { "epoch": 27.504787058435127, "grad_norm": 5205.37060546875, "learning_rate": 0.00021196456564988571, "loss": 35.7629, "step": 10414 }, { "epoch": 27.507428194123474, "grad_norm": 608.267822265625, "learning_rate": 0.00021192327601642462, "loss": 34.4085, "step": 10415 }, { "epoch": 27.51006932981182, "grad_norm": 800.3834838867188, "learning_rate": 0.00021188198744623364, "loss": 35.5705, "step": 10416 }, { "epoch": 27.512710465500167, "grad_norm": 310.05328369140625, "learning_rate": 0.00021184069994046578, "loss": 34.4027, "step": 10417 }, { "epoch": 27.51535160118851, "grad_norm": 642.9067993164062, "learning_rate": 0.000211799413500274, "loss": 35.3687, "step": 10418 }, { "epoch": 27.517992736876856, "grad_norm": 997.189697265625, "learning_rate": 0.00021175812812681107, "loss": 34.0898, "step": 10419 }, { "epoch": 27.520633872565202, "grad_norm": 513.9119873046875, "learning_rate": 0.00021171684382123, "loss": 35.4009, "step": 10420 }, { "epoch": 27.52327500825355, "grad_norm": 1078.131103515625, "learning_rate": 0.00021167556058468358, "loss": 35.5859, "step": 10421 }, { "epoch": 27.525916143941895, "grad_norm": 2197.708740234375, "learning_rate": 0.00021163427841832462, "loss": 35.8679, "step": 10422 }, { "epoch": 27.528557279630242, "grad_norm": 771.9027099609375, "learning_rate": 0.00021159299732330578, "loss": 41.5539, "step": 10423 }, { "epoch": 27.53119841531859, "grad_norm": 507.9363098144531, "learning_rate": 0.0002115517173007801, "loss": 40.3996, "step": 10424 }, { "epoch": 27.533839551006935, "grad_norm": 398.56854248046875, "learning_rate": 0.0002115104383519001, "loss": 40.16, "step": 10425 }, { "epoch": 27.536480686695278, "grad_norm": 300.4814147949219, "learning_rate": 0.00021146916047781836, "loss": 39.9491, "step": 10426 }, { "epoch": 27.539121822383624, "grad_norm": 1060.577880859375, "learning_rate": 0.00021142788367968778, "loss": 41.7671, "step": 10427 }, { "epoch": 27.54176295807197, "grad_norm": 598.6190185546875, "learning_rate": 0.0002113866079586609, "loss": 44.0661, "step": 10428 }, { "epoch": 27.544404093760317, "grad_norm": 481.5611267089844, "learning_rate": 0.00021134533331589034, "loss": 43.9894, "step": 10429 }, { "epoch": 27.547045229448663, "grad_norm": 392.4596862792969, "learning_rate": 0.00021130405975252855, "loss": 41.6017, "step": 10430 }, { "epoch": 27.54968636513701, "grad_norm": 491.8753662109375, "learning_rate": 0.00021126278726972823, "loss": 40.9471, "step": 10431 }, { "epoch": 27.552327500825356, "grad_norm": 1068.93310546875, "learning_rate": 0.0002112215158686418, "loss": 41.5635, "step": 10432 }, { "epoch": 27.5549686365137, "grad_norm": 832.085693359375, "learning_rate": 0.00021118024555042173, "loss": 38.1809, "step": 10433 }, { "epoch": 27.557609772202046, "grad_norm": 658.826904296875, "learning_rate": 0.00021113897631622056, "loss": 39.5511, "step": 10434 }, { "epoch": 27.560250907890392, "grad_norm": 407.7840881347656, "learning_rate": 0.00021109770816719062, "loss": 38.7109, "step": 10435 }, { "epoch": 27.56289204357874, "grad_norm": 708.197021484375, "learning_rate": 0.00021105644110448437, "loss": 37.1674, "step": 10436 }, { "epoch": 27.565533179267085, "grad_norm": 448.043212890625, "learning_rate": 0.00021101517512925403, "loss": 36.2932, "step": 10437 }, { "epoch": 27.56817431495543, "grad_norm": 487.5832824707031, "learning_rate": 0.00021097391024265207, "loss": 36.2967, "step": 10438 }, { "epoch": 27.570815450643778, "grad_norm": 497.9703063964844, "learning_rate": 0.00021093264644583078, "loss": 36.5096, "step": 10439 }, { "epoch": 27.573456586332124, "grad_norm": 407.8791198730469, "learning_rate": 0.00021089138373994224, "loss": 34.912, "step": 10440 }, { "epoch": 27.576097722020467, "grad_norm": 434.5149841308594, "learning_rate": 0.00021085012212613896, "loss": 33.8962, "step": 10441 }, { "epoch": 27.578738857708814, "grad_norm": 295.6499938964844, "learning_rate": 0.00021080886160557312, "loss": 37.0036, "step": 10442 }, { "epoch": 27.58137999339716, "grad_norm": 216.54617309570312, "learning_rate": 0.00021076760217939668, "loss": 35.5138, "step": 10443 }, { "epoch": 27.584021129085507, "grad_norm": 1088.8629150390625, "learning_rate": 0.00021072634384876183, "loss": 36.3515, "step": 10444 }, { "epoch": 27.586662264773853, "grad_norm": 1534.37255859375, "learning_rate": 0.00021068508661482082, "loss": 43.5018, "step": 10445 }, { "epoch": 27.5893034004622, "grad_norm": 1431.724365234375, "learning_rate": 0.00021064383047872574, "loss": 34.6049, "step": 10446 }, { "epoch": 27.591944536150546, "grad_norm": 1215.2117919921875, "learning_rate": 0.00021060257544162848, "loss": 21.1824, "step": 10447 }, { "epoch": 27.594585671838892, "grad_norm": 9840.330078125, "learning_rate": 0.0002105613215046812, "loss": 15.9635, "step": 10448 }, { "epoch": 27.597226807527235, "grad_norm": 1679.2945556640625, "learning_rate": 0.00021052006866903588, "loss": 17.5542, "step": 10449 }, { "epoch": 27.599867943215582, "grad_norm": 637.0630493164062, "learning_rate": 0.00021047881693584444, "loss": 11.2074, "step": 10450 }, { "epoch": 27.60250907890393, "grad_norm": 1082.6282958984375, "learning_rate": 0.00021043756630625875, "loss": 15.1826, "step": 10451 }, { "epoch": 27.605150214592275, "grad_norm": 1043.3809814453125, "learning_rate": 0.00021039631678143088, "loss": 13.534, "step": 10452 }, { "epoch": 27.60779135028062, "grad_norm": 4146.9833984375, "learning_rate": 0.00021035506836251255, "loss": 10.5723, "step": 10453 }, { "epoch": 27.610432485968968, "grad_norm": 1194.9273681640625, "learning_rate": 0.0002103138210506556, "loss": 11.6769, "step": 10454 }, { "epoch": 27.613073621657314, "grad_norm": 1645.4794921875, "learning_rate": 0.00021027257484701196, "loss": 35.8848, "step": 10455 }, { "epoch": 27.615714757345657, "grad_norm": 304.1392822265625, "learning_rate": 0.00021023132975273333, "loss": 35.7192, "step": 10456 }, { "epoch": 27.618355893034003, "grad_norm": 535.7894287109375, "learning_rate": 0.00021019008576897145, "loss": 36.7412, "step": 10457 }, { "epoch": 27.62099702872235, "grad_norm": 419.25225830078125, "learning_rate": 0.0002101488428968779, "loss": 37.2614, "step": 10458 }, { "epoch": 27.623638164410696, "grad_norm": 710.9033203125, "learning_rate": 0.00021010760113760471, "loss": 35.5293, "step": 10459 }, { "epoch": 27.626279300099043, "grad_norm": 486.5763854980469, "learning_rate": 0.00021006636049230326, "loss": 35.1344, "step": 10460 }, { "epoch": 27.62892043578739, "grad_norm": 478.6801452636719, "learning_rate": 0.0002100251209621251, "loss": 36.2394, "step": 10461 }, { "epoch": 27.631561571475736, "grad_norm": 2217.764892578125, "learning_rate": 0.0002099838825482221, "loss": 35.4176, "step": 10462 }, { "epoch": 27.634202707164082, "grad_norm": 1019.0238647460938, "learning_rate": 0.00020994264525174563, "loss": 35.0708, "step": 10463 }, { "epoch": 27.636843842852425, "grad_norm": 419.7914733886719, "learning_rate": 0.0002099014090738473, "loss": 34.6279, "step": 10464 }, { "epoch": 27.63948497854077, "grad_norm": 301.7767639160156, "learning_rate": 0.0002098601740156785, "loss": 35.516, "step": 10465 }, { "epoch": 27.642126114229118, "grad_norm": 486.34375, "learning_rate": 0.0002098189400783908, "loss": 34.9863, "step": 10466 }, { "epoch": 27.644767249917464, "grad_norm": 1148.6961669921875, "learning_rate": 0.0002097777072631356, "loss": 35.6192, "step": 10467 }, { "epoch": 27.64740838560581, "grad_norm": 437.3202209472656, "learning_rate": 0.00020973647557106427, "loss": 35.4383, "step": 10468 }, { "epoch": 27.650049521294157, "grad_norm": 503.3974609375, "learning_rate": 0.00020969524500332823, "loss": 34.3374, "step": 10469 }, { "epoch": 27.652690656982504, "grad_norm": 883.9683837890625, "learning_rate": 0.00020965401556107883, "loss": 34.8536, "step": 10470 }, { "epoch": 27.65533179267085, "grad_norm": 1139.522216796875, "learning_rate": 0.0002096127872454673, "loss": 37.2521, "step": 10471 }, { "epoch": 27.657972928359193, "grad_norm": 643.8981323242188, "learning_rate": 0.00020957156005764493, "loss": 38.2491, "step": 10472 }, { "epoch": 27.66061406404754, "grad_norm": 7319.2080078125, "learning_rate": 0.00020953033399876306, "loss": 41.2308, "step": 10473 }, { "epoch": 27.663255199735886, "grad_norm": 396.0033874511719, "learning_rate": 0.00020948910906997285, "loss": 39.4674, "step": 10474 }, { "epoch": 27.665896335424232, "grad_norm": 301.2138366699219, "learning_rate": 0.00020944788527242533, "loss": 41.1247, "step": 10475 }, { "epoch": 27.66853747111258, "grad_norm": 676.4877319335938, "learning_rate": 0.00020940666260727193, "loss": 39.8092, "step": 10476 }, { "epoch": 27.671178606800925, "grad_norm": 272.95562744140625, "learning_rate": 0.00020936544107566366, "loss": 42.1934, "step": 10477 }, { "epoch": 27.673819742489272, "grad_norm": 737.1470336914062, "learning_rate": 0.00020932422067875155, "loss": 42.6012, "step": 10478 }, { "epoch": 27.676460878177615, "grad_norm": 530.3883056640625, "learning_rate": 0.00020928300141768652, "loss": 40.7356, "step": 10479 }, { "epoch": 27.67910201386596, "grad_norm": 347.9985046386719, "learning_rate": 0.00020924178329361992, "loss": 43.5024, "step": 10480 }, { "epoch": 27.681743149554308, "grad_norm": 352.34039306640625, "learning_rate": 0.00020920056630770257, "loss": 40.8212, "step": 10481 }, { "epoch": 27.684384285242654, "grad_norm": 401.6942138671875, "learning_rate": 0.00020915935046108536, "loss": 39.831, "step": 10482 }, { "epoch": 27.687025420931, "grad_norm": 801.7085571289062, "learning_rate": 0.0002091181357549194, "loss": 41.5305, "step": 10483 }, { "epoch": 27.689666556619347, "grad_norm": 603.9293823242188, "learning_rate": 0.00020907692219035545, "loss": 39.6396, "step": 10484 }, { "epoch": 27.692307692307693, "grad_norm": 2502.32470703125, "learning_rate": 0.00020903570976854436, "loss": 39.3256, "step": 10485 }, { "epoch": 27.69494882799604, "grad_norm": 362.245849609375, "learning_rate": 0.00020899449849063706, "loss": 38.2645, "step": 10486 }, { "epoch": 27.697589963684383, "grad_norm": 261.20965576171875, "learning_rate": 0.0002089532883577843, "loss": 37.1471, "step": 10487 }, { "epoch": 27.70023109937273, "grad_norm": 297.52044677734375, "learning_rate": 0.00020891207937113684, "loss": 35.8636, "step": 10488 }, { "epoch": 27.702872235061076, "grad_norm": 474.7991027832031, "learning_rate": 0.0002088708715318454, "loss": 35.5267, "step": 10489 }, { "epoch": 27.705513370749422, "grad_norm": 330.0349426269531, "learning_rate": 0.00020882966484106076, "loss": 35.401, "step": 10490 }, { "epoch": 27.70815450643777, "grad_norm": 235.8453826904297, "learning_rate": 0.00020878845929993354, "loss": 34.9024, "step": 10491 }, { "epoch": 27.710795642126115, "grad_norm": 298.3626708984375, "learning_rate": 0.00020874725490961426, "loss": 35.4937, "step": 10492 }, { "epoch": 27.71343677781446, "grad_norm": 573.0230712890625, "learning_rate": 0.00020870605167125377, "loss": 36.4275, "step": 10493 }, { "epoch": 27.716077913502808, "grad_norm": 512.0449829101562, "learning_rate": 0.0002086648495860026, "loss": 35.5555, "step": 10494 }, { "epoch": 27.71871904919115, "grad_norm": 1480.3272705078125, "learning_rate": 0.00020862364865501114, "loss": 37.9702, "step": 10495 }, { "epoch": 27.721360184879497, "grad_norm": 2330.423828125, "learning_rate": 0.00020858244887942993, "loss": 17.6467, "step": 10496 }, { "epoch": 27.724001320567844, "grad_norm": 1800.333740234375, "learning_rate": 0.00020854125026040956, "loss": 16.7056, "step": 10497 }, { "epoch": 27.72664245625619, "grad_norm": 15216.1337890625, "learning_rate": 0.00020850005279910046, "loss": 18.6449, "step": 10498 }, { "epoch": 27.729283591944537, "grad_norm": 2101.7265625, "learning_rate": 0.00020845885649665294, "loss": 16.8664, "step": 10499 }, { "epoch": 27.731924727632883, "grad_norm": 1256.7471923828125, "learning_rate": 0.0002084176613542175, "loss": 15.4013, "step": 10500 }, { "epoch": 27.73456586332123, "grad_norm": 1778.370849609375, "learning_rate": 0.00020837646737294444, "loss": 16.09, "step": 10501 }, { "epoch": 27.737206999009572, "grad_norm": 3779.27197265625, "learning_rate": 0.00020833527455398407, "loss": 12.8392, "step": 10502 }, { "epoch": 27.73984813469792, "grad_norm": 840.2037963867188, "learning_rate": 0.00020829408289848664, "loss": 10.9121, "step": 10503 }, { "epoch": 27.742489270386265, "grad_norm": 473.74908447265625, "learning_rate": 0.0002082528924076025, "loss": 15.0533, "step": 10504 }, { "epoch": 27.745130406074612, "grad_norm": 493.8183288574219, "learning_rate": 0.0002082117030824818, "loss": 35.9422, "step": 10505 }, { "epoch": 27.74777154176296, "grad_norm": 456.3131103515625, "learning_rate": 0.00020817051492427472, "loss": 36.7676, "step": 10506 }, { "epoch": 27.750412677451305, "grad_norm": 761.57666015625, "learning_rate": 0.00020812932793413147, "loss": 33.4411, "step": 10507 }, { "epoch": 27.75305381313965, "grad_norm": 713.8381958007812, "learning_rate": 0.00020808814211320214, "loss": 35.2109, "step": 10508 }, { "epoch": 27.755694948827998, "grad_norm": 692.2062377929688, "learning_rate": 0.00020804695746263682, "loss": 36.3507, "step": 10509 }, { "epoch": 27.75833608451634, "grad_norm": 601.3519897460938, "learning_rate": 0.00020800577398358546, "loss": 35.937, "step": 10510 }, { "epoch": 27.760977220204687, "grad_norm": 548.0100708007812, "learning_rate": 0.00020796459167719833, "loss": 35.9553, "step": 10511 }, { "epoch": 27.763618355893033, "grad_norm": 504.353271484375, "learning_rate": 0.00020792341054462534, "loss": 34.5163, "step": 10512 }, { "epoch": 27.76625949158138, "grad_norm": 2862.65478515625, "learning_rate": 0.00020788223058701617, "loss": 34.0216, "step": 10513 }, { "epoch": 27.768900627269726, "grad_norm": 459.1842956542969, "learning_rate": 0.0002078410518055211, "loss": 35.3592, "step": 10514 }, { "epoch": 27.771541762958073, "grad_norm": 587.4310913085938, "learning_rate": 0.00020779987420128994, "loss": 34.1063, "step": 10515 }, { "epoch": 27.77418289864642, "grad_norm": 454.46881103515625, "learning_rate": 0.00020775869777547247, "loss": 35.9694, "step": 10516 }, { "epoch": 27.776824034334766, "grad_norm": 427.7170715332031, "learning_rate": 0.0002077175225292185, "loss": 35.6664, "step": 10517 }, { "epoch": 27.77946517002311, "grad_norm": 618.0374145507812, "learning_rate": 0.00020767634846367792, "loss": 36.7162, "step": 10518 }, { "epoch": 27.782106305711455, "grad_norm": 1545.95654296875, "learning_rate": 0.00020763517558000045, "loss": 34.4204, "step": 10519 }, { "epoch": 27.7847474413998, "grad_norm": 1229.4320068359375, "learning_rate": 0.00020759400387933578, "loss": 36.1134, "step": 10520 }, { "epoch": 27.787388577088148, "grad_norm": 592.20068359375, "learning_rate": 0.00020755283336283366, "loss": 35.3844, "step": 10521 }, { "epoch": 27.790029712776494, "grad_norm": 651.291259765625, "learning_rate": 0.00020751166403164378, "loss": 38.2285, "step": 10522 }, { "epoch": 27.79267084846484, "grad_norm": 1983.4410400390625, "learning_rate": 0.0002074704958869157, "loss": 41.6964, "step": 10523 }, { "epoch": 27.795311984153187, "grad_norm": 587.8732299804688, "learning_rate": 0.00020742932892979895, "loss": 38.7424, "step": 10524 }, { "epoch": 27.79795311984153, "grad_norm": 412.98504638671875, "learning_rate": 0.0002073881631614433, "loss": 40.2557, "step": 10525 }, { "epoch": 27.800594255529877, "grad_norm": 768.880859375, "learning_rate": 0.00020734699858299813, "loss": 40.3275, "step": 10526 }, { "epoch": 27.803235391218223, "grad_norm": 254.55694580078125, "learning_rate": 0.00020730583519561286, "loss": 41.5288, "step": 10527 }, { "epoch": 27.80587652690657, "grad_norm": 745.4658813476562, "learning_rate": 0.00020726467300043717, "loss": 45.3762, "step": 10528 }, { "epoch": 27.808517662594916, "grad_norm": 819.3748168945312, "learning_rate": 0.00020722351199862045, "loss": 42.0118, "step": 10529 }, { "epoch": 27.811158798283262, "grad_norm": 544.1099243164062, "learning_rate": 0.00020718235219131196, "loss": 41.8023, "step": 10530 }, { "epoch": 27.81379993397161, "grad_norm": 467.861083984375, "learning_rate": 0.00020714119357966102, "loss": 41.1496, "step": 10531 }, { "epoch": 27.816441069659955, "grad_norm": 352.08233642578125, "learning_rate": 0.00020710003616481717, "loss": 40.101, "step": 10532 }, { "epoch": 27.8190822053483, "grad_norm": 644.5222778320312, "learning_rate": 0.00020705887994792966, "loss": 39.2473, "step": 10533 }, { "epoch": 27.821723341036645, "grad_norm": 1043.082275390625, "learning_rate": 0.00020701772493014758, "loss": 37.1612, "step": 10534 }, { "epoch": 27.82436447672499, "grad_norm": 559.3202514648438, "learning_rate": 0.00020697657111262038, "loss": 39.3747, "step": 10535 }, { "epoch": 27.827005612413338, "grad_norm": 487.8025207519531, "learning_rate": 0.00020693541849649716, "loss": 36.2718, "step": 10536 }, { "epoch": 27.829646748101684, "grad_norm": 257.8153381347656, "learning_rate": 0.0002068942670829271, "loss": 35.3866, "step": 10537 }, { "epoch": 27.83228788379003, "grad_norm": 449.5181579589844, "learning_rate": 0.00020685311687305922, "loss": 34.6848, "step": 10538 }, { "epoch": 27.834929019478377, "grad_norm": 693.3624267578125, "learning_rate": 0.00020681196786804276, "loss": 35.4399, "step": 10539 }, { "epoch": 27.837570155166723, "grad_norm": 455.7016296386719, "learning_rate": 0.00020677082006902674, "loss": 36.449, "step": 10540 }, { "epoch": 27.840211290855066, "grad_norm": 766.4063110351562, "learning_rate": 0.0002067296734771601, "loss": 34.5445, "step": 10541 }, { "epoch": 27.842852426543413, "grad_norm": 843.4524536132812, "learning_rate": 0.00020668852809359198, "loss": 34.8732, "step": 10542 }, { "epoch": 27.84549356223176, "grad_norm": 1780.9141845703125, "learning_rate": 0.00020664738391947126, "loss": 35.19, "step": 10543 }, { "epoch": 27.848134697920106, "grad_norm": 1983.83984375, "learning_rate": 0.0002066062409559469, "loss": 34.4105, "step": 10544 }, { "epoch": 27.850775833608452, "grad_norm": 3001.85693359375, "learning_rate": 0.00020656509920416767, "loss": 42.586, "step": 10545 }, { "epoch": 27.8534169692968, "grad_norm": 2085.28955078125, "learning_rate": 0.00020652395866528262, "loss": 19.7031, "step": 10546 }, { "epoch": 27.856058104985145, "grad_norm": 4915.1416015625, "learning_rate": 0.0002064828193404405, "loss": 21.3803, "step": 10547 }, { "epoch": 27.858699240673488, "grad_norm": 1529.916015625, "learning_rate": 0.00020644168123078998, "loss": 21.5925, "step": 10548 }, { "epoch": 27.861340376361834, "grad_norm": 2152.8203125, "learning_rate": 0.00020640054433748001, "loss": 15.4051, "step": 10549 }, { "epoch": 27.86398151205018, "grad_norm": 5153.27392578125, "learning_rate": 0.00020635940866165926, "loss": 13.4796, "step": 10550 }, { "epoch": 27.866622647738527, "grad_norm": 5418.58837890625, "learning_rate": 0.00020631827420447636, "loss": 15.418, "step": 10551 }, { "epoch": 27.869263783426874, "grad_norm": 1961.2960205078125, "learning_rate": 0.0002062771409670799, "loss": 16.8495, "step": 10552 }, { "epoch": 27.87190491911522, "grad_norm": 3082.6552734375, "learning_rate": 0.0002062360089506187, "loss": 17.5932, "step": 10553 }, { "epoch": 27.874546054803567, "grad_norm": 2922.386962890625, "learning_rate": 0.0002061948781562412, "loss": 10.9911, "step": 10554 }, { "epoch": 27.877187190491913, "grad_norm": 396.245849609375, "learning_rate": 0.000206153748585096, "loss": 27.2786, "step": 10555 }, { "epoch": 27.879828326180256, "grad_norm": 1371.756103515625, "learning_rate": 0.00020611262023833164, "loss": 35.985, "step": 10556 }, { "epoch": 27.882469461868602, "grad_norm": 4677.1064453125, "learning_rate": 0.00020607149311709658, "loss": 34.4416, "step": 10557 }, { "epoch": 27.88511059755695, "grad_norm": 419.7066345214844, "learning_rate": 0.00020603036722253927, "loss": 35.1351, "step": 10558 }, { "epoch": 27.887751733245295, "grad_norm": 494.39794921875, "learning_rate": 0.00020598924255580805, "loss": 37.0652, "step": 10559 }, { "epoch": 27.890392868933642, "grad_norm": 1044.3721923828125, "learning_rate": 0.00020594811911805146, "loss": 36.3168, "step": 10560 }, { "epoch": 27.89303400462199, "grad_norm": 475.3913269042969, "learning_rate": 0.00020590699691041775, "loss": 35.3921, "step": 10561 }, { "epoch": 27.895675140310335, "grad_norm": 781.3034057617188, "learning_rate": 0.00020586587593405514, "loss": 35.0854, "step": 10562 }, { "epoch": 27.89831627599868, "grad_norm": 732.216796875, "learning_rate": 0.00020582475619011214, "loss": 35.1287, "step": 10563 }, { "epoch": 27.900957411687024, "grad_norm": 683.7031860351562, "learning_rate": 0.00020578363767973694, "loss": 36.846, "step": 10564 }, { "epoch": 27.90359854737537, "grad_norm": 621.5389404296875, "learning_rate": 0.0002057425204040776, "loss": 36.4349, "step": 10565 }, { "epoch": 27.906239683063717, "grad_norm": 369.4410400390625, "learning_rate": 0.00020570140436428227, "loss": 36.7965, "step": 10566 }, { "epoch": 27.908880818752063, "grad_norm": 345.8061218261719, "learning_rate": 0.00020566028956149935, "loss": 34.7718, "step": 10567 }, { "epoch": 27.91152195444041, "grad_norm": 693.2421875, "learning_rate": 0.00020561917599687675, "loss": 34.9022, "step": 10568 }, { "epoch": 27.914163090128756, "grad_norm": 727.1898193359375, "learning_rate": 0.00020557806367156256, "loss": 33.6745, "step": 10569 }, { "epoch": 27.916804225817103, "grad_norm": 562.4916381835938, "learning_rate": 0.00020553695258670492, "loss": 35.8281, "step": 10570 }, { "epoch": 27.919445361505446, "grad_norm": 529.2643432617188, "learning_rate": 0.0002054958427434517, "loss": 37.1216, "step": 10571 }, { "epoch": 27.922086497193792, "grad_norm": 3588.597412109375, "learning_rate": 0.00020545473414295095, "loss": 37.0655, "step": 10572 }, { "epoch": 27.92472763288214, "grad_norm": 759.2399291992188, "learning_rate": 0.00020541362678635057, "loss": 40.3348, "step": 10573 }, { "epoch": 27.927368768570485, "grad_norm": 625.6346435546875, "learning_rate": 0.0002053725206747985, "loss": 39.9138, "step": 10574 }, { "epoch": 27.93000990425883, "grad_norm": 628.4934692382812, "learning_rate": 0.00020533141580944257, "loss": 39.3649, "step": 10575 }, { "epoch": 27.932651039947178, "grad_norm": 433.2944641113281, "learning_rate": 0.00020529031219143057, "loss": 44.0937, "step": 10576 }, { "epoch": 27.935292175635524, "grad_norm": 1156.6048583984375, "learning_rate": 0.00020524920982191037, "loss": 41.7096, "step": 10577 }, { "epoch": 27.93793331132387, "grad_norm": 658.5535278320312, "learning_rate": 0.0002052081087020297, "loss": 40.0623, "step": 10578 }, { "epoch": 27.940574447012214, "grad_norm": 645.835693359375, "learning_rate": 0.00020516700883293633, "loss": 41.0218, "step": 10579 }, { "epoch": 27.94321558270056, "grad_norm": 525.95166015625, "learning_rate": 0.00020512591021577773, "loss": 38.9701, "step": 10580 }, { "epoch": 27.945856718388907, "grad_norm": 609.3032836914062, "learning_rate": 0.00020508481285170185, "loss": 37.2911, "step": 10581 }, { "epoch": 27.948497854077253, "grad_norm": 835.6380004882812, "learning_rate": 0.00020504371674185626, "loss": 38.3178, "step": 10582 }, { "epoch": 27.9511389897656, "grad_norm": 1017.253173828125, "learning_rate": 0.0002050026218873883, "loss": 35.8378, "step": 10583 }, { "epoch": 27.953780125453946, "grad_norm": 1610.3768310546875, "learning_rate": 0.0002049615282894458, "loss": 37.5302, "step": 10584 }, { "epoch": 27.956421261142292, "grad_norm": 885.6055908203125, "learning_rate": 0.00020492043594917618, "loss": 37.4594, "step": 10585 }, { "epoch": 27.95906239683064, "grad_norm": 971.6080322265625, "learning_rate": 0.0002048793448677269, "loss": 37.4877, "step": 10586 }, { "epoch": 27.96170353251898, "grad_norm": 1224.861328125, "learning_rate": 0.00020483825504624532, "loss": 12.7817, "step": 10587 }, { "epoch": 27.96434466820733, "grad_norm": 1345.5262451171875, "learning_rate": 0.00020479716648587904, "loss": 20.6402, "step": 10588 }, { "epoch": 27.966985803895675, "grad_norm": 4486.12353515625, "learning_rate": 0.00020475607918777532, "loss": 9.2096, "step": 10589 }, { "epoch": 27.96962693958402, "grad_norm": 1029.907958984375, "learning_rate": 0.0002047149931530814, "loss": 12.636, "step": 10590 }, { "epoch": 27.972268075272368, "grad_norm": 1643.02734375, "learning_rate": 0.00020467390838294484, "loss": 14.5708, "step": 10591 }, { "epoch": 27.974909210960714, "grad_norm": 744.41943359375, "learning_rate": 0.0002046328248785127, "loss": 35.805, "step": 10592 }, { "epoch": 27.97755034664906, "grad_norm": 1104.5751953125, "learning_rate": 0.00020459174264093232, "loss": 36.6073, "step": 10593 }, { "epoch": 27.980191482337403, "grad_norm": 1456.43603515625, "learning_rate": 0.00020455066167135075, "loss": 33.824, "step": 10594 }, { "epoch": 27.98283261802575, "grad_norm": 2801.041259765625, "learning_rate": 0.00020450958197091532, "loss": 35.3028, "step": 10595 }, { "epoch": 27.985473753714096, "grad_norm": 715.8236083984375, "learning_rate": 0.00020446850354077312, "loss": 33.9399, "step": 10596 }, { "epoch": 27.988114889402443, "grad_norm": 887.4833374023438, "learning_rate": 0.00020442742638207106, "loss": 35.2385, "step": 10597 }, { "epoch": 27.99075602509079, "grad_norm": 785.6209106445312, "learning_rate": 0.0002043863504959565, "loss": 36.0061, "step": 10598 }, { "epoch": 27.993397160779136, "grad_norm": 616.4111938476562, "learning_rate": 0.00020434527588357636, "loss": 34.5022, "step": 10599 }, { "epoch": 27.996038296467482, "grad_norm": 1994.1484375, "learning_rate": 0.00020430420254607747, "loss": 34.2062, "step": 10600 }, { "epoch": 27.996038296467482, "eval_loss": 3.841506242752075, "eval_runtime": 2.177, "eval_samples_per_second": 227.375, "eval_steps_per_second": 28.479, "step": 10600 }, { "epoch": 27.99867943215583, "grad_norm": 3171.18701171875, "learning_rate": 0.0002042631304846068, "loss": 37.5482, "step": 10601 }, { "epoch": 28.00132056784417, "grad_norm": 2074.631103515625, "learning_rate": 0.00020422205970031145, "loss": 40.9665, "step": 10602 }, { "epoch": 28.003961703532518, "grad_norm": 1050.379150390625, "learning_rate": 0.0002041809901943382, "loss": 38.8753, "step": 10603 }, { "epoch": 28.006602839220864, "grad_norm": 572.134033203125, "learning_rate": 0.00020413992196783386, "loss": 39.053, "step": 10604 }, { "epoch": 28.00924397490921, "grad_norm": 831.8797607421875, "learning_rate": 0.00020409885502194525, "loss": 38.9445, "step": 10605 }, { "epoch": 28.011885110597557, "grad_norm": 485.50689697265625, "learning_rate": 0.00020405778935781918, "loss": 42.5596, "step": 10606 }, { "epoch": 28.014526246285904, "grad_norm": 690.8204345703125, "learning_rate": 0.00020401672497660236, "loss": 40.3387, "step": 10607 }, { "epoch": 28.01716738197425, "grad_norm": 709.7884521484375, "learning_rate": 0.00020397566187944143, "loss": 41.0594, "step": 10608 }, { "epoch": 28.019808517662597, "grad_norm": 517.4149780273438, "learning_rate": 0.00020393460006748316, "loss": 41.4691, "step": 10609 }, { "epoch": 28.02244965335094, "grad_norm": 363.90594482421875, "learning_rate": 0.00020389353954187412, "loss": 39.1957, "step": 10610 }, { "epoch": 28.025090789039286, "grad_norm": 877.4991455078125, "learning_rate": 0.00020385248030376082, "loss": 39.3739, "step": 10611 }, { "epoch": 28.027731924727632, "grad_norm": 977.4172973632812, "learning_rate": 0.00020381142235428997, "loss": 39.723, "step": 10612 }, { "epoch": 28.03037306041598, "grad_norm": 312.7755432128906, "learning_rate": 0.00020377036569460804, "loss": 38.5895, "step": 10613 }, { "epoch": 28.033014196104325, "grad_norm": 858.8355712890625, "learning_rate": 0.00020372931032586138, "loss": 39.032, "step": 10614 }, { "epoch": 28.03565533179267, "grad_norm": 1600.3953857421875, "learning_rate": 0.00020368825624919663, "loss": 36.1438, "step": 10615 }, { "epoch": 28.03829646748102, "grad_norm": 349.1685791015625, "learning_rate": 0.00020364720346576016, "loss": 35.2804, "step": 10616 }, { "epoch": 28.04093760316936, "grad_norm": 378.9905090332031, "learning_rate": 0.00020360615197669832, "loss": 36.833, "step": 10617 }, { "epoch": 28.043578738857708, "grad_norm": 618.9228515625, "learning_rate": 0.0002035651017831573, "loss": 35.6453, "step": 10618 }, { "epoch": 28.046219874546054, "grad_norm": 574.5123901367188, "learning_rate": 0.0002035240528862836, "loss": 36.0358, "step": 10619 }, { "epoch": 28.0488610102344, "grad_norm": 699.6314086914062, "learning_rate": 0.00020348300528722348, "loss": 35.0312, "step": 10620 }, { "epoch": 28.051502145922747, "grad_norm": 1743.1153564453125, "learning_rate": 0.000203441958987123, "loss": 34.5989, "step": 10621 }, { "epoch": 28.054143281611093, "grad_norm": 708.8858032226562, "learning_rate": 0.00020340091398712856, "loss": 35.6651, "step": 10622 }, { "epoch": 28.05678441729944, "grad_norm": 1302.0826416015625, "learning_rate": 0.0002033598702883862, "loss": 37.4409, "step": 10623 }, { "epoch": 28.059425552987786, "grad_norm": 1690.1734619140625, "learning_rate": 0.00020331882789204205, "loss": 24.7555, "step": 10624 }, { "epoch": 28.06206668867613, "grad_norm": 2575.275634765625, "learning_rate": 0.00020327778679924219, "loss": 11.2739, "step": 10625 }, { "epoch": 28.064707824364476, "grad_norm": 1780.6824951171875, "learning_rate": 0.00020323674701113272, "loss": 13.5778, "step": 10626 }, { "epoch": 28.067348960052822, "grad_norm": 2728.41455078125, "learning_rate": 0.00020319570852885967, "loss": 14.2417, "step": 10627 }, { "epoch": 28.06999009574117, "grad_norm": 3165.642822265625, "learning_rate": 0.0002031546713535688, "loss": 16.9833, "step": 10628 }, { "epoch": 28.072631231429515, "grad_norm": 1070.10693359375, "learning_rate": 0.00020311363548640638, "loss": 12.1769, "step": 10629 }, { "epoch": 28.07527236711786, "grad_norm": 1366.324951171875, "learning_rate": 0.00020307260092851812, "loss": 17.3956, "step": 10630 }, { "epoch": 28.077913502806208, "grad_norm": 1747.100830078125, "learning_rate": 0.0002030315676810499, "loss": 12.1794, "step": 10631 }, { "epoch": 28.080554638494554, "grad_norm": 2236.650634765625, "learning_rate": 0.00020299053574514744, "loss": 12.4487, "step": 10632 }, { "epoch": 28.083195774182897, "grad_norm": 5206.14990234375, "learning_rate": 0.00020294950512195678, "loss": 16.5587, "step": 10633 }, { "epoch": 28.085836909871244, "grad_norm": 1019.8571166992188, "learning_rate": 0.0002029084758126236, "loss": 35.3282, "step": 10634 }, { "epoch": 28.08847804555959, "grad_norm": 570.0744018554688, "learning_rate": 0.00020286744781829343, "loss": 35.4479, "step": 10635 }, { "epoch": 28.091119181247937, "grad_norm": 472.57989501953125, "learning_rate": 0.00020282642114011215, "loss": 34.7359, "step": 10636 }, { "epoch": 28.093760316936283, "grad_norm": 2188.033935546875, "learning_rate": 0.0002027853957792254, "loss": 34.7161, "step": 10637 }, { "epoch": 28.09640145262463, "grad_norm": 1054.9107666015625, "learning_rate": 0.00020274437173677868, "loss": 35.3563, "step": 10638 }, { "epoch": 28.099042588312976, "grad_norm": 1091.85107421875, "learning_rate": 0.0002027033490139176, "loss": 36.4923, "step": 10639 }, { "epoch": 28.10168372400132, "grad_norm": 650.76171875, "learning_rate": 0.00020266232761178776, "loss": 34.2161, "step": 10640 }, { "epoch": 28.104324859689665, "grad_norm": 549.8685913085938, "learning_rate": 0.00020262130753153463, "loss": 33.7433, "step": 10641 }, { "epoch": 28.10696599537801, "grad_norm": 677.7518920898438, "learning_rate": 0.0002025802887743036, "loss": 34.4219, "step": 10642 }, { "epoch": 28.109607131066358, "grad_norm": 1753.1395263671875, "learning_rate": 0.0002025392713412402, "loss": 35.6859, "step": 10643 }, { "epoch": 28.112248266754705, "grad_norm": 479.4626159667969, "learning_rate": 0.00020249825523348974, "loss": 34.8165, "step": 10644 }, { "epoch": 28.11488940244305, "grad_norm": 1907.916015625, "learning_rate": 0.00020245724045219767, "loss": 34.8401, "step": 10645 }, { "epoch": 28.117530538131398, "grad_norm": 1251.5400390625, "learning_rate": 0.00020241622699850907, "loss": 36.1348, "step": 10646 }, { "epoch": 28.120171673819744, "grad_norm": 363.35205078125, "learning_rate": 0.0002023752148735695, "loss": 34.9965, "step": 10647 }, { "epoch": 28.122812809508087, "grad_norm": 414.09765625, "learning_rate": 0.0002023342040785241, "loss": 35.7953, "step": 10648 }, { "epoch": 28.125453945196433, "grad_norm": 492.30859375, "learning_rate": 0.00020229319461451792, "loss": 36.163, "step": 10649 }, { "epoch": 28.12809508088478, "grad_norm": 1131.4573974609375, "learning_rate": 0.00020225218648269635, "loss": 36.2435, "step": 10650 }, { "epoch": 28.130736216573126, "grad_norm": 1529.708984375, "learning_rate": 0.00020221117968420444, "loss": 39.19, "step": 10651 }, { "epoch": 28.133377352261473, "grad_norm": 705.1182250976562, "learning_rate": 0.00020217017422018732, "loss": 40.6398, "step": 10652 }, { "epoch": 28.13601848794982, "grad_norm": 938.7706298828125, "learning_rate": 0.00020212917009178983, "loss": 39.1206, "step": 10653 }, { "epoch": 28.138659623638166, "grad_norm": 539.1363525390625, "learning_rate": 0.00020208816730015721, "loss": 40.1688, "step": 10654 }, { "epoch": 28.141300759326512, "grad_norm": 896.236572265625, "learning_rate": 0.00020204716584643442, "loss": 39.2025, "step": 10655 }, { "epoch": 28.143941895014855, "grad_norm": 4260.4619140625, "learning_rate": 0.00020200616573176627, "loss": 43.2025, "step": 10656 }, { "epoch": 28.1465830307032, "grad_norm": 1166.0279541015625, "learning_rate": 0.00020196516695729784, "loss": 41.6392, "step": 10657 }, { "epoch": 28.149224166391548, "grad_norm": 1941.0753173828125, "learning_rate": 0.00020192416952417386, "loss": 39.9345, "step": 10658 }, { "epoch": 28.151865302079894, "grad_norm": 352.0368347167969, "learning_rate": 0.00020188317343353924, "loss": 42.8201, "step": 10659 }, { "epoch": 28.15450643776824, "grad_norm": 842.870361328125, "learning_rate": 0.00020184217868653867, "loss": 40.726, "step": 10660 }, { "epoch": 28.157147573456587, "grad_norm": 938.2047729492188, "learning_rate": 0.000201801185284317, "loss": 37.9394, "step": 10661 }, { "epoch": 28.159788709144934, "grad_norm": 658.834716796875, "learning_rate": 0.00020176019322801897, "loss": 37.4905, "step": 10662 }, { "epoch": 28.162429844833277, "grad_norm": 680.517822265625, "learning_rate": 0.00020171920251878906, "loss": 38.6391, "step": 10663 }, { "epoch": 28.165070980521623, "grad_norm": 583.5377807617188, "learning_rate": 0.00020167821315777225, "loss": 39.316, "step": 10664 }, { "epoch": 28.16771211620997, "grad_norm": 445.3019104003906, "learning_rate": 0.0002016372251461129, "loss": 36.3483, "step": 10665 }, { "epoch": 28.170353251898316, "grad_norm": 1589.649658203125, "learning_rate": 0.00020159623848495556, "loss": 36.1819, "step": 10666 }, { "epoch": 28.172994387586662, "grad_norm": 352.2743835449219, "learning_rate": 0.0002015552531754448, "loss": 36.6965, "step": 10667 }, { "epoch": 28.17563552327501, "grad_norm": 689.374267578125, "learning_rate": 0.00020151426921872522, "loss": 36.8838, "step": 10668 }, { "epoch": 28.178276658963355, "grad_norm": 869.4014282226562, "learning_rate": 0.0002014732866159412, "loss": 35.0846, "step": 10669 }, { "epoch": 28.1809177946517, "grad_norm": 725.0062255859375, "learning_rate": 0.00020143230536823702, "loss": 37.8157, "step": 10670 }, { "epoch": 28.183558930340045, "grad_norm": 654.5965576171875, "learning_rate": 0.0002013913254767573, "loss": 35.8056, "step": 10671 }, { "epoch": 28.18620006602839, "grad_norm": 493.345947265625, "learning_rate": 0.00020135034694264626, "loss": 34.8737, "step": 10672 }, { "epoch": 28.188841201716738, "grad_norm": 1692.3975830078125, "learning_rate": 0.00020130936976704816, "loss": 36.7279, "step": 10673 }, { "epoch": 28.191482337405084, "grad_norm": 3028.59912109375, "learning_rate": 0.0002012683939511073, "loss": 42.7909, "step": 10674 }, { "epoch": 28.19412347309343, "grad_norm": 1361.1695556640625, "learning_rate": 0.00020122741949596797, "loss": 37.4888, "step": 10675 }, { "epoch": 28.196764608781777, "grad_norm": 2016.710205078125, "learning_rate": 0.0002011864464027743, "loss": 18.1877, "step": 10676 }, { "epoch": 28.199405744470123, "grad_norm": 1016.4195556640625, "learning_rate": 0.00020114547467267032, "loss": 12.054, "step": 10677 }, { "epoch": 28.20204688015847, "grad_norm": 116741.7421875, "learning_rate": 0.00020110450430680038, "loss": 13.2762, "step": 10678 }, { "epoch": 28.204688015846813, "grad_norm": 1015.2744140625, "learning_rate": 0.00020106353530630842, "loss": 14.0282, "step": 10679 }, { "epoch": 28.20732915153516, "grad_norm": 972.494384765625, "learning_rate": 0.00020102256767233847, "loss": 13.8253, "step": 10680 }, { "epoch": 28.209970287223506, "grad_norm": 873.0304565429688, "learning_rate": 0.00020098160140603446, "loss": 16.8329, "step": 10681 }, { "epoch": 28.212611422911852, "grad_norm": 11543.4267578125, "learning_rate": 0.0002009406365085406, "loss": 10.4835, "step": 10682 }, { "epoch": 28.2152525586002, "grad_norm": 724.0035400390625, "learning_rate": 0.0002008996729810006, "loss": 9.5621, "step": 10683 }, { "epoch": 28.217893694288545, "grad_norm": 1616.267578125, "learning_rate": 0.00020085871082455825, "loss": 22.2036, "step": 10684 }, { "epoch": 28.22053482997689, "grad_norm": 316.9254455566406, "learning_rate": 0.00020081775004035765, "loss": 36.324, "step": 10685 }, { "epoch": 28.223175965665234, "grad_norm": 741.2373657226562, "learning_rate": 0.00020077679062954252, "loss": 35.3669, "step": 10686 }, { "epoch": 28.22581710135358, "grad_norm": 1335.377197265625, "learning_rate": 0.0002007358325932566, "loss": 35.7218, "step": 10687 }, { "epoch": 28.228458237041927, "grad_norm": 712.0152587890625, "learning_rate": 0.0002006948759326435, "loss": 35.0843, "step": 10688 }, { "epoch": 28.231099372730274, "grad_norm": 1368.867919921875, "learning_rate": 0.0002006539206488471, "loss": 34.2506, "step": 10689 }, { "epoch": 28.23374050841862, "grad_norm": 605.9955444335938, "learning_rate": 0.000200612966743011, "loss": 35.9702, "step": 10690 }, { "epoch": 28.236381644106967, "grad_norm": 371.10101318359375, "learning_rate": 0.00020057201421627875, "loss": 34.8685, "step": 10691 }, { "epoch": 28.239022779795313, "grad_norm": 877.54638671875, "learning_rate": 0.00020053106306979398, "loss": 35.6128, "step": 10692 }, { "epoch": 28.24166391548366, "grad_norm": 2532.338134765625, "learning_rate": 0.00020049011330470023, "loss": 35.7784, "step": 10693 }, { "epoch": 28.244305051172002, "grad_norm": 1560.0667724609375, "learning_rate": 0.000200449164922141, "loss": 35.2534, "step": 10694 }, { "epoch": 28.24694618686035, "grad_norm": 1814.2340087890625, "learning_rate": 0.00020040821792325966, "loss": 34.5398, "step": 10695 }, { "epoch": 28.249587322548695, "grad_norm": 344.212158203125, "learning_rate": 0.00020036727230919975, "loss": 35.4496, "step": 10696 }, { "epoch": 28.25222845823704, "grad_norm": 1025.4395751953125, "learning_rate": 0.00020032632808110465, "loss": 33.8541, "step": 10697 }, { "epoch": 28.254869593925388, "grad_norm": 421.84002685546875, "learning_rate": 0.00020028538524011753, "loss": 34.4866, "step": 10698 }, { "epoch": 28.257510729613735, "grad_norm": 478.7667541503906, "learning_rate": 0.00020024444378738195, "loss": 36.4297, "step": 10699 }, { "epoch": 28.26015186530208, "grad_norm": 1529.3604736328125, "learning_rate": 0.00020020350372404103, "loss": 36.6688, "step": 10700 }, { "epoch": 28.262793000990428, "grad_norm": 1762.9041748046875, "learning_rate": 0.00020016256505123802, "loss": 37.2233, "step": 10701 }, { "epoch": 28.26543413667877, "grad_norm": 933.7909545898438, "learning_rate": 0.00020012162777011596, "loss": 40.7865, "step": 10702 }, { "epoch": 28.268075272367117, "grad_norm": 517.8753051757812, "learning_rate": 0.0002000806918818183, "loss": 39.2313, "step": 10703 }, { "epoch": 28.270716408055463, "grad_norm": 782.3048706054688, "learning_rate": 0.000200039757387488, "loss": 40.5479, "step": 10704 }, { "epoch": 28.27335754374381, "grad_norm": 1143.9432373046875, "learning_rate": 0.000199998824288268, "loss": 40.1379, "step": 10705 }, { "epoch": 28.275998679432156, "grad_norm": 937.4865112304688, "learning_rate": 0.00019995789258530155, "loss": 43.0623, "step": 10706 }, { "epoch": 28.278639815120503, "grad_norm": 666.4412841796875, "learning_rate": 0.00019991696227973155, "loss": 44.6452, "step": 10707 }, { "epoch": 28.28128095080885, "grad_norm": 1481.625, "learning_rate": 0.000199876033372701, "loss": 42.8602, "step": 10708 }, { "epoch": 28.283922086497192, "grad_norm": 1100.2628173828125, "learning_rate": 0.00019983510586535266, "loss": 42.5872, "step": 10709 }, { "epoch": 28.28656322218554, "grad_norm": 887.0995483398438, "learning_rate": 0.0001997941797588296, "loss": 41.6083, "step": 10710 }, { "epoch": 28.289204357873885, "grad_norm": 648.8895263671875, "learning_rate": 0.0001997532550542746, "loss": 40.0139, "step": 10711 }, { "epoch": 28.29184549356223, "grad_norm": 728.2783813476562, "learning_rate": 0.00019971233175283034, "loss": 39.4205, "step": 10712 }, { "epoch": 28.294486629250578, "grad_norm": 796.1244506835938, "learning_rate": 0.00019967140985563975, "loss": 38.8821, "step": 10713 }, { "epoch": 28.297127764938924, "grad_norm": 597.5930786132812, "learning_rate": 0.0001996304893638455, "loss": 38.8589, "step": 10714 }, { "epoch": 28.29976890062727, "grad_norm": 432.7640686035156, "learning_rate": 0.0001995895702785902, "loss": 38.7302, "step": 10715 }, { "epoch": 28.302410036315617, "grad_norm": 383.24761962890625, "learning_rate": 0.00019954865260101646, "loss": 36.5441, "step": 10716 }, { "epoch": 28.30505117200396, "grad_norm": 308.0867919921875, "learning_rate": 0.00019950773633226713, "loss": 36.8337, "step": 10717 }, { "epoch": 28.307692307692307, "grad_norm": 662.4080200195312, "learning_rate": 0.0001994668214734845, "loss": 35.6937, "step": 10718 }, { "epoch": 28.310333443380653, "grad_norm": 1923.29150390625, "learning_rate": 0.00019942590802581113, "loss": 36.4881, "step": 10719 }, { "epoch": 28.312974579069, "grad_norm": 998.947021484375, "learning_rate": 0.00019938499599038968, "loss": 35.7739, "step": 10720 }, { "epoch": 28.315615714757346, "grad_norm": 1534.737060546875, "learning_rate": 0.00019934408536836245, "loss": 35.5862, "step": 10721 }, { "epoch": 28.318256850445692, "grad_norm": 466.3896789550781, "learning_rate": 0.00019930317616087195, "loss": 36.8914, "step": 10722 }, { "epoch": 28.32089798613404, "grad_norm": 654.7301025390625, "learning_rate": 0.00019926226836906034, "loss": 35.8841, "step": 10723 }, { "epoch": 28.323539121822385, "grad_norm": 4072.369140625, "learning_rate": 0.00019922136199407016, "loss": 40.2064, "step": 10724 }, { "epoch": 28.326180257510728, "grad_norm": 3602.418212890625, "learning_rate": 0.00019918045703704363, "loss": 14.5163, "step": 10725 }, { "epoch": 28.328821393199075, "grad_norm": 1064.918212890625, "learning_rate": 0.00019913955349912292, "loss": 15.2069, "step": 10726 }, { "epoch": 28.33146252888742, "grad_norm": 2756.106689453125, "learning_rate": 0.00019909865138145034, "loss": 21.4132, "step": 10727 }, { "epoch": 28.334103664575768, "grad_norm": 1301.3321533203125, "learning_rate": 0.00019905775068516804, "loss": 11.522, "step": 10728 }, { "epoch": 28.336744800264114, "grad_norm": 1693.50048828125, "learning_rate": 0.00019901685141141811, "loss": 17.04, "step": 10729 }, { "epoch": 28.33938593595246, "grad_norm": 3399.2177734375, "learning_rate": 0.00019897595356134262, "loss": 15.089, "step": 10730 }, { "epoch": 28.342027071640807, "grad_norm": 2177.221435546875, "learning_rate": 0.00019893505713608373, "loss": 10.5169, "step": 10731 }, { "epoch": 28.34466820732915, "grad_norm": 3122.845458984375, "learning_rate": 0.00019889416213678333, "loss": 10.2938, "step": 10732 }, { "epoch": 28.347309343017496, "grad_norm": 5094.0263671875, "learning_rate": 0.00019885326856458336, "loss": 13.6142, "step": 10733 }, { "epoch": 28.349950478705843, "grad_norm": 1657.5128173828125, "learning_rate": 0.00019881237642062593, "loss": 29.3301, "step": 10734 }, { "epoch": 28.35259161439419, "grad_norm": 669.564453125, "learning_rate": 0.0001987714857060528, "loss": 36.8536, "step": 10735 }, { "epoch": 28.355232750082536, "grad_norm": 483.60577392578125, "learning_rate": 0.00019873059642200582, "loss": 35.1297, "step": 10736 }, { "epoch": 28.357873885770882, "grad_norm": 1693.095703125, "learning_rate": 0.00019868970856962677, "loss": 37.1365, "step": 10737 }, { "epoch": 28.36051502145923, "grad_norm": 596.5609741210938, "learning_rate": 0.0001986488221500575, "loss": 36.6034, "step": 10738 }, { "epoch": 28.363156157147575, "grad_norm": 466.5419921875, "learning_rate": 0.00019860793716443972, "loss": 35.6227, "step": 10739 }, { "epoch": 28.365797292835918, "grad_norm": 504.09661865234375, "learning_rate": 0.0001985670536139151, "loss": 35.7321, "step": 10740 }, { "epoch": 28.368438428524264, "grad_norm": 463.2750549316406, "learning_rate": 0.0001985261714996253, "loss": 34.5598, "step": 10741 }, { "epoch": 28.37107956421261, "grad_norm": 716.469482421875, "learning_rate": 0.00019848529082271195, "loss": 35.8596, "step": 10742 }, { "epoch": 28.373720699900957, "grad_norm": 473.8251647949219, "learning_rate": 0.0001984444115843165, "loss": 35.9138, "step": 10743 }, { "epoch": 28.376361835589304, "grad_norm": 550.2059936523438, "learning_rate": 0.00019840353378558067, "loss": 35.9339, "step": 10744 }, { "epoch": 28.37900297127765, "grad_norm": 702.880126953125, "learning_rate": 0.00019836265742764582, "loss": 36.7192, "step": 10745 }, { "epoch": 28.381644106965997, "grad_norm": 928.87451171875, "learning_rate": 0.00019832178251165346, "loss": 35.8465, "step": 10746 }, { "epoch": 28.384285242654343, "grad_norm": 405.74072265625, "learning_rate": 0.00019828090903874486, "loss": 33.9533, "step": 10747 }, { "epoch": 28.386926378342686, "grad_norm": 2385.900146484375, "learning_rate": 0.0001982400370100616, "loss": 35.6775, "step": 10748 }, { "epoch": 28.389567514031032, "grad_norm": 554.4389038085938, "learning_rate": 0.00019819916642674487, "loss": 36.8462, "step": 10749 }, { "epoch": 28.39220864971938, "grad_norm": 1298.877685546875, "learning_rate": 0.0001981582972899359, "loss": 36.5039, "step": 10750 }, { "epoch": 28.394849785407725, "grad_norm": 638.3887329101562, "learning_rate": 0.0001981174296007761, "loss": 38.7447, "step": 10751 }, { "epoch": 28.39749092109607, "grad_norm": 681.6220092773438, "learning_rate": 0.00019807656336040668, "loss": 40.6373, "step": 10752 }, { "epoch": 28.400132056784418, "grad_norm": 1507.4168701171875, "learning_rate": 0.00019803569856996866, "loss": 40.2357, "step": 10753 }, { "epoch": 28.402773192472765, "grad_norm": 528.6216430664062, "learning_rate": 0.00019799483523060316, "loss": 39.1404, "step": 10754 }, { "epoch": 28.405414328161108, "grad_norm": 412.40399169921875, "learning_rate": 0.00019795397334345142, "loss": 41.4328, "step": 10755 }, { "epoch": 28.408055463849454, "grad_norm": 574.0863037109375, "learning_rate": 0.00019791311290965442, "loss": 43.1671, "step": 10756 }, { "epoch": 28.4106965995378, "grad_norm": 903.1192626953125, "learning_rate": 0.00019787225393035304, "loss": 44.4956, "step": 10757 }, { "epoch": 28.413337735226147, "grad_norm": 677.8308715820312, "learning_rate": 0.00019783139640668846, "loss": 42.8284, "step": 10758 }, { "epoch": 28.415978870914493, "grad_norm": 435.6869201660156, "learning_rate": 0.00019779054033980143, "loss": 40.6923, "step": 10759 }, { "epoch": 28.41862000660284, "grad_norm": 778.6190185546875, "learning_rate": 0.00019774968573083292, "loss": 40.1045, "step": 10760 }, { "epoch": 28.421261142291186, "grad_norm": 418.05572509765625, "learning_rate": 0.0001977088325809237, "loss": 41.3364, "step": 10761 }, { "epoch": 28.423902277979533, "grad_norm": 463.72869873046875, "learning_rate": 0.00019766798089121466, "loss": 39.8913, "step": 10762 }, { "epoch": 28.426543413667876, "grad_norm": 1516.54052734375, "learning_rate": 0.0001976271306628465, "loss": 37.9367, "step": 10763 }, { "epoch": 28.429184549356222, "grad_norm": 566.012451171875, "learning_rate": 0.00019758628189695993, "loss": 37.8226, "step": 10764 }, { "epoch": 28.43182568504457, "grad_norm": 349.7416687011719, "learning_rate": 0.0001975454345946957, "loss": 37.4037, "step": 10765 }, { "epoch": 28.434466820732915, "grad_norm": 436.29852294921875, "learning_rate": 0.00019750458875719435, "loss": 35.7104, "step": 10766 }, { "epoch": 28.43710795642126, "grad_norm": 1145.097412109375, "learning_rate": 0.00019746374438559656, "loss": 36.4979, "step": 10767 }, { "epoch": 28.439749092109608, "grad_norm": 421.5682678222656, "learning_rate": 0.0001974229014810427, "loss": 35.8524, "step": 10768 }, { "epoch": 28.442390227797954, "grad_norm": 525.9927978515625, "learning_rate": 0.00019738206004467362, "loss": 35.4026, "step": 10769 }, { "epoch": 28.4450313634863, "grad_norm": 368.61126708984375, "learning_rate": 0.00019734122007762956, "loss": 35.3292, "step": 10770 }, { "epoch": 28.447672499174644, "grad_norm": 371.3792419433594, "learning_rate": 0.00019730038158105088, "loss": 34.8247, "step": 10771 }, { "epoch": 28.45031363486299, "grad_norm": 311.1861572265625, "learning_rate": 0.00019725954455607818, "loss": 35.3944, "step": 10772 }, { "epoch": 28.452954770551337, "grad_norm": 1154.5206298828125, "learning_rate": 0.00019721870900385173, "loss": 34.9888, "step": 10773 }, { "epoch": 28.455595906239683, "grad_norm": 1403.521484375, "learning_rate": 0.0001971778749255118, "loss": 44.9587, "step": 10774 }, { "epoch": 28.45823704192803, "grad_norm": 1850.2176513671875, "learning_rate": 0.00019713704232219868, "loss": 21.8196, "step": 10775 }, { "epoch": 28.460878177616376, "grad_norm": 1444.9229736328125, "learning_rate": 0.00019709621119505261, "loss": 20.9001, "step": 10776 }, { "epoch": 28.463519313304722, "grad_norm": 1312.496826171875, "learning_rate": 0.0001970553815452138, "loss": 15.6162, "step": 10777 }, { "epoch": 28.466160448993065, "grad_norm": 1314.8292236328125, "learning_rate": 0.0001970145533738223, "loss": 16.6672, "step": 10778 }, { "epoch": 28.46880158468141, "grad_norm": 1585.180419921875, "learning_rate": 0.0001969737266820183, "loss": 11.3319, "step": 10779 }, { "epoch": 28.471442720369758, "grad_norm": 1101.8828125, "learning_rate": 0.00019693290147094184, "loss": 17.0519, "step": 10780 }, { "epoch": 28.474083856058105, "grad_norm": 10300.7734375, "learning_rate": 0.00019689207774173292, "loss": 12.122, "step": 10781 }, { "epoch": 28.47672499174645, "grad_norm": 706.6307983398438, "learning_rate": 0.0001968512554955315, "loss": 11.6505, "step": 10782 }, { "epoch": 28.479366127434798, "grad_norm": 789.9776000976562, "learning_rate": 0.0001968104347334776, "loss": 14.6863, "step": 10783 }, { "epoch": 28.482007263123144, "grad_norm": 835.6613159179688, "learning_rate": 0.00019676961545671108, "loss": 32.9954, "step": 10784 }, { "epoch": 28.48464839881149, "grad_norm": 413.4331970214844, "learning_rate": 0.00019672879766637164, "loss": 35.1099, "step": 10785 }, { "epoch": 28.487289534499833, "grad_norm": 540.1195678710938, "learning_rate": 0.00019668798136359935, "loss": 35.6414, "step": 10786 }, { "epoch": 28.48993067018818, "grad_norm": 293.19140625, "learning_rate": 0.00019664716654953395, "loss": 35.35, "step": 10787 }, { "epoch": 28.492571805876526, "grad_norm": 494.6001281738281, "learning_rate": 0.00019660635322531499, "loss": 34.7276, "step": 10788 }, { "epoch": 28.495212941564873, "grad_norm": 399.9692077636719, "learning_rate": 0.00019656554139208215, "loss": 34.4669, "step": 10789 }, { "epoch": 28.49785407725322, "grad_norm": 297.9041748046875, "learning_rate": 0.0001965247310509753, "loss": 37.3689, "step": 10790 }, { "epoch": 28.500495212941566, "grad_norm": 507.8460693359375, "learning_rate": 0.0001964839222031339, "loss": 34.847, "step": 10791 }, { "epoch": 28.503136348629912, "grad_norm": 567.4299926757812, "learning_rate": 0.0001964431148496975, "loss": 35.9461, "step": 10792 }, { "epoch": 28.50577748431826, "grad_norm": 507.8136901855469, "learning_rate": 0.0001964023089918057, "loss": 33.6938, "step": 10793 }, { "epoch": 28.5084186200066, "grad_norm": 236.72952270507812, "learning_rate": 0.00019636150463059793, "loss": 35.3297, "step": 10794 }, { "epoch": 28.511059755694948, "grad_norm": 483.0977783203125, "learning_rate": 0.00019632070176721364, "loss": 38.0845, "step": 10795 }, { "epoch": 28.513700891383294, "grad_norm": 340.0946044921875, "learning_rate": 0.0001962799004027922, "loss": 36.5301, "step": 10796 }, { "epoch": 28.51634202707164, "grad_norm": 410.26513671875, "learning_rate": 0.000196239100538473, "loss": 34.3976, "step": 10797 }, { "epoch": 28.518983162759987, "grad_norm": 305.23260498046875, "learning_rate": 0.00019619830217539532, "loss": 34.4379, "step": 10798 }, { "epoch": 28.521624298448334, "grad_norm": 313.745361328125, "learning_rate": 0.00019615750531469845, "loss": 34.6964, "step": 10799 }, { "epoch": 28.52426543413668, "grad_norm": 489.6083068847656, "learning_rate": 0.00019611670995752162, "loss": 36.2686, "step": 10800 }, { "epoch": 28.52426543413668, "eval_loss": 3.8350625038146973, "eval_runtime": 2.1642, "eval_samples_per_second": 228.721, "eval_steps_per_second": 28.648, "step": 10800 }, { "epoch": 28.526906569825023, "grad_norm": 535.0438842773438, "learning_rate": 0.00019607591610500404, "loss": 37.0784, "step": 10801 }, { "epoch": 28.52954770551337, "grad_norm": 474.3359375, "learning_rate": 0.00019603512375828482, "loss": 38.9222, "step": 10802 }, { "epoch": 28.532188841201716, "grad_norm": 263.7882995605469, "learning_rate": 0.00019599433291850294, "loss": 40.0785, "step": 10803 }, { "epoch": 28.534829976890062, "grad_norm": 316.2320251464844, "learning_rate": 0.0001959535435867978, "loss": 41.2192, "step": 10804 }, { "epoch": 28.53747111257841, "grad_norm": 428.9290771484375, "learning_rate": 0.0001959127557643081, "loss": 39.6152, "step": 10805 }, { "epoch": 28.540112248266755, "grad_norm": 415.3280334472656, "learning_rate": 0.00019587196945217283, "loss": 41.079, "step": 10806 }, { "epoch": 28.5427533839551, "grad_norm": 860.8972778320312, "learning_rate": 0.00019583118465153114, "loss": 42.9208, "step": 10807 }, { "epoch": 28.545394519643448, "grad_norm": 329.7312927246094, "learning_rate": 0.00019579040136352178, "loss": 45.632, "step": 10808 }, { "epoch": 28.54803565533179, "grad_norm": 189.6781768798828, "learning_rate": 0.0001957496195892836, "loss": 39.9749, "step": 10809 }, { "epoch": 28.550676791020138, "grad_norm": 621.54345703125, "learning_rate": 0.00019570883932995542, "loss": 40.8032, "step": 10810 }, { "epoch": 28.553317926708484, "grad_norm": 340.2553405761719, "learning_rate": 0.00019566806058667603, "loss": 40.6135, "step": 10811 }, { "epoch": 28.55595906239683, "grad_norm": 478.1294250488281, "learning_rate": 0.00019562728336058413, "loss": 39.6565, "step": 10812 }, { "epoch": 28.558600198085177, "grad_norm": 1900.5452880859375, "learning_rate": 0.0001955865076528184, "loss": 38.2423, "step": 10813 }, { "epoch": 28.561241333773523, "grad_norm": 413.70574951171875, "learning_rate": 0.00019554573346451746, "loss": 38.1706, "step": 10814 }, { "epoch": 28.56388246946187, "grad_norm": 399.2471923828125, "learning_rate": 0.00019550496079681994, "loss": 37.7514, "step": 10815 }, { "epoch": 28.566523605150216, "grad_norm": 286.40960693359375, "learning_rate": 0.00019546418965086444, "loss": 35.2959, "step": 10816 }, { "epoch": 28.56916474083856, "grad_norm": 320.37255859375, "learning_rate": 0.00019542342002778929, "loss": 36.5163, "step": 10817 }, { "epoch": 28.571805876526906, "grad_norm": 271.69232177734375, "learning_rate": 0.00019538265192873313, "loss": 35.9986, "step": 10818 }, { "epoch": 28.574447012215252, "grad_norm": 322.4010925292969, "learning_rate": 0.00019534188535483437, "loss": 35.8093, "step": 10819 }, { "epoch": 28.5770881479036, "grad_norm": 241.5449676513672, "learning_rate": 0.0001953011203072312, "loss": 35.5215, "step": 10820 }, { "epoch": 28.579729283591945, "grad_norm": 391.0058288574219, "learning_rate": 0.00019526035678706227, "loss": 36.1552, "step": 10821 }, { "epoch": 28.58237041928029, "grad_norm": 431.51104736328125, "learning_rate": 0.00019521959479546576, "loss": 36.4633, "step": 10822 }, { "epoch": 28.585011554968638, "grad_norm": 11407.0634765625, "learning_rate": 0.0001951788343335798, "loss": 45.3363, "step": 10823 }, { "epoch": 28.58765269065698, "grad_norm": 3261.365966796875, "learning_rate": 0.0001951380754025426, "loss": 18.1576, "step": 10824 }, { "epoch": 28.590293826345327, "grad_norm": 2058.112548828125, "learning_rate": 0.00019509731800349252, "loss": 27.1119, "step": 10825 }, { "epoch": 28.592934962033674, "grad_norm": 2424.8212890625, "learning_rate": 0.00019505656213756763, "loss": 21.2112, "step": 10826 }, { "epoch": 28.59557609772202, "grad_norm": 7419.4013671875, "learning_rate": 0.00019501580780590584, "loss": 19.5856, "step": 10827 }, { "epoch": 28.598217233410367, "grad_norm": 4603.34619140625, "learning_rate": 0.00019497505500964542, "loss": 18.8726, "step": 10828 }, { "epoch": 28.600858369098713, "grad_norm": 4568.82177734375, "learning_rate": 0.00019493430374992423, "loss": 17.8232, "step": 10829 }, { "epoch": 28.60349950478706, "grad_norm": 2332.60302734375, "learning_rate": 0.0001948935540278803, "loss": 14.969, "step": 10830 }, { "epoch": 28.606140640475406, "grad_norm": 565.0730590820312, "learning_rate": 0.00019485280584465142, "loss": 13.2159, "step": 10831 }, { "epoch": 28.60878177616375, "grad_norm": 1266.46484375, "learning_rate": 0.0001948120592013756, "loss": 13.2302, "step": 10832 }, { "epoch": 28.611422911852095, "grad_norm": 558.4273681640625, "learning_rate": 0.0001947713140991906, "loss": 23.2214, "step": 10833 }, { "epoch": 28.61406404754044, "grad_norm": 515.1412963867188, "learning_rate": 0.00019473057053923418, "loss": 35.3768, "step": 10834 }, { "epoch": 28.616705183228788, "grad_norm": 481.4014587402344, "learning_rate": 0.00019468982852264416, "loss": 34.1884, "step": 10835 }, { "epoch": 28.619346318917135, "grad_norm": 552.415771484375, "learning_rate": 0.00019464908805055816, "loss": 35.6525, "step": 10836 }, { "epoch": 28.62198745460548, "grad_norm": 416.269775390625, "learning_rate": 0.0001946083491241139, "loss": 35.4805, "step": 10837 }, { "epoch": 28.624628590293828, "grad_norm": 450.7423400878906, "learning_rate": 0.0001945676117444488, "loss": 33.9051, "step": 10838 }, { "epoch": 28.627269725982174, "grad_norm": 501.621826171875, "learning_rate": 0.00019452687591270078, "loss": 35.1884, "step": 10839 }, { "epoch": 28.629910861670517, "grad_norm": 330.9325256347656, "learning_rate": 0.0001944861416300071, "loss": 35.9886, "step": 10840 }, { "epoch": 28.632551997358863, "grad_norm": 2097.87841796875, "learning_rate": 0.0001944454088975052, "loss": 35.3681, "step": 10841 }, { "epoch": 28.63519313304721, "grad_norm": 531.3524780273438, "learning_rate": 0.00019440467771633272, "loss": 33.3883, "step": 10842 }, { "epoch": 28.637834268735556, "grad_norm": 525.038330078125, "learning_rate": 0.00019436394808762698, "loss": 36.7153, "step": 10843 }, { "epoch": 28.640475404423903, "grad_norm": 1988.8448486328125, "learning_rate": 0.00019432322001252528, "loss": 35.0258, "step": 10844 }, { "epoch": 28.64311654011225, "grad_norm": 267.0218200683594, "learning_rate": 0.0001942824934921649, "loss": 34.8174, "step": 10845 }, { "epoch": 28.645757675800596, "grad_norm": 386.2565612792969, "learning_rate": 0.00019424176852768322, "loss": 34.7145, "step": 10846 }, { "epoch": 28.64839881148894, "grad_norm": 267.12078857421875, "learning_rate": 0.00019420104512021742, "loss": 35.6009, "step": 10847 }, { "epoch": 28.651039947177285, "grad_norm": 525.3320922851562, "learning_rate": 0.00019416032327090454, "loss": 34.695, "step": 10848 }, { "epoch": 28.65368108286563, "grad_norm": 488.0237731933594, "learning_rate": 0.00019411960298088194, "loss": 36.1791, "step": 10849 }, { "epoch": 28.656322218553978, "grad_norm": 455.8106689453125, "learning_rate": 0.00019407888425128662, "loss": 37.173, "step": 10850 }, { "epoch": 28.658963354242324, "grad_norm": 529.5252685546875, "learning_rate": 0.00019403816708325554, "loss": 39.2478, "step": 10851 }, { "epoch": 28.66160448993067, "grad_norm": 2138.03955078125, "learning_rate": 0.00019399745147792577, "loss": 41.2595, "step": 10852 }, { "epoch": 28.664245625619017, "grad_norm": 614.9776611328125, "learning_rate": 0.0001939567374364343, "loss": 38.9689, "step": 10853 }, { "epoch": 28.666886761307364, "grad_norm": 265.7425231933594, "learning_rate": 0.00019391602495991803, "loss": 40.2002, "step": 10854 }, { "epoch": 28.669527896995707, "grad_norm": 949.8306884765625, "learning_rate": 0.0001938753140495137, "loss": 40.475, "step": 10855 }, { "epoch": 28.672169032684053, "grad_norm": 488.2051696777344, "learning_rate": 0.00019383460470635835, "loss": 40.6093, "step": 10856 }, { "epoch": 28.6748101683724, "grad_norm": 232.1141815185547, "learning_rate": 0.00019379389693158875, "loss": 42.5939, "step": 10857 }, { "epoch": 28.677451304060746, "grad_norm": 257.7731018066406, "learning_rate": 0.0001937531907263415, "loss": 42.84, "step": 10858 }, { "epoch": 28.680092439749092, "grad_norm": 287.4084777832031, "learning_rate": 0.00019371248609175324, "loss": 42.704, "step": 10859 }, { "epoch": 28.68273357543744, "grad_norm": 918.1902465820312, "learning_rate": 0.00019367178302896084, "loss": 39.5328, "step": 10860 }, { "epoch": 28.685374711125785, "grad_norm": 386.3226013183594, "learning_rate": 0.00019363108153910087, "loss": 39.6775, "step": 10861 }, { "epoch": 28.68801584681413, "grad_norm": 284.496337890625, "learning_rate": 0.00019359038162330972, "loss": 40.4858, "step": 10862 }, { "epoch": 28.690656982502475, "grad_norm": 275.60986328125, "learning_rate": 0.0001935496832827241, "loss": 37.5643, "step": 10863 }, { "epoch": 28.69329811819082, "grad_norm": 269.4586181640625, "learning_rate": 0.0001935089865184804, "loss": 37.1256, "step": 10864 }, { "epoch": 28.695939253879168, "grad_norm": 359.0251159667969, "learning_rate": 0.00019346829133171507, "loss": 36.7802, "step": 10865 }, { "epoch": 28.698580389567514, "grad_norm": 156.31707763671875, "learning_rate": 0.00019342759772356444, "loss": 36.0108, "step": 10866 }, { "epoch": 28.70122152525586, "grad_norm": 330.5562438964844, "learning_rate": 0.00019338690569516498, "loss": 34.5726, "step": 10867 }, { "epoch": 28.703862660944207, "grad_norm": 402.3415222167969, "learning_rate": 0.00019334621524765293, "loss": 34.7779, "step": 10868 }, { "epoch": 28.706503796632553, "grad_norm": 265.6134338378906, "learning_rate": 0.00019330552638216448, "loss": 34.3714, "step": 10869 }, { "epoch": 28.709144932320896, "grad_norm": 567.593017578125, "learning_rate": 0.00019326483909983593, "loss": 34.3435, "step": 10870 }, { "epoch": 28.711786068009243, "grad_norm": 572.1686401367188, "learning_rate": 0.00019322415340180345, "loss": 34.5422, "step": 10871 }, { "epoch": 28.71442720369759, "grad_norm": 528.8600463867188, "learning_rate": 0.000193183469289203, "loss": 35.0518, "step": 10872 }, { "epoch": 28.717068339385936, "grad_norm": 370.3533935546875, "learning_rate": 0.00019314278676317093, "loss": 34.7716, "step": 10873 }, { "epoch": 28.719709475074282, "grad_norm": 600.9869384765625, "learning_rate": 0.00019310210582484317, "loss": 33.4371, "step": 10874 }, { "epoch": 28.72235061076263, "grad_norm": 1752.8277587890625, "learning_rate": 0.00019306142647535564, "loss": 21.3789, "step": 10875 }, { "epoch": 28.724991746450975, "grad_norm": 1346.9334716796875, "learning_rate": 0.00019302074871584422, "loss": 23.1823, "step": 10876 }, { "epoch": 28.72763288213932, "grad_norm": 5979.70654296875, "learning_rate": 0.000192980072547445, "loss": 17.8523, "step": 10877 }, { "epoch": 28.730274017827664, "grad_norm": 778.0538330078125, "learning_rate": 0.0001929393979712938, "loss": 18.7542, "step": 10878 }, { "epoch": 28.73291515351601, "grad_norm": 361.7986145019531, "learning_rate": 0.00019289872498852632, "loss": 13.6419, "step": 10879 }, { "epoch": 28.735556289204357, "grad_norm": 380.82666015625, "learning_rate": 0.00019285805360027847, "loss": 11.9057, "step": 10880 }, { "epoch": 28.738197424892704, "grad_norm": 392.2677001953125, "learning_rate": 0.00019281738380768588, "loss": 13.9551, "step": 10881 }, { "epoch": 28.74083856058105, "grad_norm": 1826.8466796875, "learning_rate": 0.00019277671561188428, "loss": 9.6592, "step": 10882 }, { "epoch": 28.743479696269397, "grad_norm": 408.1372985839844, "learning_rate": 0.0001927360490140092, "loss": 18.9356, "step": 10883 }, { "epoch": 28.746120831957743, "grad_norm": 542.0947265625, "learning_rate": 0.0001926953840151964, "loss": 36.8923, "step": 10884 }, { "epoch": 28.74876196764609, "grad_norm": 422.3715515136719, "learning_rate": 0.00019265472061658133, "loss": 34.5494, "step": 10885 }, { "epoch": 28.751403103334432, "grad_norm": 1168.3160400390625, "learning_rate": 0.00019261405881929944, "loss": 35.1331, "step": 10886 }, { "epoch": 28.75404423902278, "grad_norm": 943.8421630859375, "learning_rate": 0.0001925733986244863, "loss": 35.9703, "step": 10887 }, { "epoch": 28.756685374711125, "grad_norm": 2365.8046875, "learning_rate": 0.00019253274003327726, "loss": 35.6547, "step": 10888 }, { "epoch": 28.75932651039947, "grad_norm": 600.0653076171875, "learning_rate": 0.0001924920830468077, "loss": 35.8757, "step": 10889 }, { "epoch": 28.761967646087818, "grad_norm": 358.46673583984375, "learning_rate": 0.00019245142766621286, "loss": 35.0767, "step": 10890 }, { "epoch": 28.764608781776165, "grad_norm": 338.00384521484375, "learning_rate": 0.00019241077389262819, "loss": 36.2867, "step": 10891 }, { "epoch": 28.76724991746451, "grad_norm": 287.5875549316406, "learning_rate": 0.00019237012172718888, "loss": 34.9608, "step": 10892 }, { "epoch": 28.769891053152854, "grad_norm": 418.5331115722656, "learning_rate": 0.00019232947117102988, "loss": 34.9629, "step": 10893 }, { "epoch": 28.7725321888412, "grad_norm": 773.4744873046875, "learning_rate": 0.00019228882222528665, "loss": 34.6741, "step": 10894 }, { "epoch": 28.775173324529547, "grad_norm": 370.5439453125, "learning_rate": 0.00019224817489109415, "loss": 34.8555, "step": 10895 }, { "epoch": 28.777814460217893, "grad_norm": 746.5650634765625, "learning_rate": 0.00019220752916958745, "loss": 34.9216, "step": 10896 }, { "epoch": 28.78045559590624, "grad_norm": 812.5414428710938, "learning_rate": 0.0001921668850619015, "loss": 35.3623, "step": 10897 }, { "epoch": 28.783096731594586, "grad_norm": 1065.5191650390625, "learning_rate": 0.00019212624256917136, "loss": 33.027, "step": 10898 }, { "epoch": 28.785737867282933, "grad_norm": 302.2998962402344, "learning_rate": 0.0001920856016925319, "loss": 35.0528, "step": 10899 }, { "epoch": 28.78837900297128, "grad_norm": 882.2318725585938, "learning_rate": 0.00019204496243311792, "loss": 37.3652, "step": 10900 }, { "epoch": 28.791020138659622, "grad_norm": 1783.29833984375, "learning_rate": 0.0001920043247920644, "loss": 37.6018, "step": 10901 }, { "epoch": 28.79366127434797, "grad_norm": 1243.817626953125, "learning_rate": 0.000191963688770506, "loss": 42.3571, "step": 10902 }, { "epoch": 28.796302410036315, "grad_norm": 300.9328918457031, "learning_rate": 0.00019192305436957756, "loss": 39.3776, "step": 10903 }, { "epoch": 28.79894354572466, "grad_norm": 283.52911376953125, "learning_rate": 0.00019188242159041364, "loss": 40.8371, "step": 10904 }, { "epoch": 28.801584681413008, "grad_norm": 657.2103271484375, "learning_rate": 0.00019184179043414898, "loss": 40.4745, "step": 10905 }, { "epoch": 28.804225817101354, "grad_norm": 476.221435546875, "learning_rate": 0.0001918011609019182, "loss": 43.5813, "step": 10906 }, { "epoch": 28.8068669527897, "grad_norm": 839.5110473632812, "learning_rate": 0.00019176053299485566, "loss": 42.6108, "step": 10907 }, { "epoch": 28.809508088478047, "grad_norm": 468.4004821777344, "learning_rate": 0.00019171990671409614, "loss": 43.5152, "step": 10908 }, { "epoch": 28.81214922416639, "grad_norm": 824.4879760742188, "learning_rate": 0.00019167928206077406, "loss": 43.2689, "step": 10909 }, { "epoch": 28.814790359854737, "grad_norm": 324.1143798828125, "learning_rate": 0.00019163865903602372, "loss": 42.2646, "step": 10910 }, { "epoch": 28.817431495543083, "grad_norm": 337.059814453125, "learning_rate": 0.00019159803764097944, "loss": 41.0767, "step": 10911 }, { "epoch": 28.82007263123143, "grad_norm": 883.1610717773438, "learning_rate": 0.0001915574178767757, "loss": 39.2016, "step": 10912 }, { "epoch": 28.822713766919776, "grad_norm": 374.8809814453125, "learning_rate": 0.00019151679974454682, "loss": 39.5017, "step": 10913 }, { "epoch": 28.825354902608122, "grad_norm": 341.44012451171875, "learning_rate": 0.00019147618324542688, "loss": 36.6242, "step": 10914 }, { "epoch": 28.82799603829647, "grad_norm": 410.5568542480469, "learning_rate": 0.00019143556838055014, "loss": 37.3957, "step": 10915 }, { "epoch": 28.83063717398481, "grad_norm": 307.9275817871094, "learning_rate": 0.0001913949551510508, "loss": 36.8445, "step": 10916 }, { "epoch": 28.833278309673158, "grad_norm": 375.1118469238281, "learning_rate": 0.0001913543435580629, "loss": 35.4037, "step": 10917 }, { "epoch": 28.835919445361505, "grad_norm": 323.09228515625, "learning_rate": 0.00019131373360272046, "loss": 36.6509, "step": 10918 }, { "epoch": 28.83856058104985, "grad_norm": 218.6006622314453, "learning_rate": 0.00019127312528615758, "loss": 34.6956, "step": 10919 }, { "epoch": 28.841201716738198, "grad_norm": 343.5614929199219, "learning_rate": 0.00019123251860950818, "loss": 34.3721, "step": 10920 }, { "epoch": 28.843842852426544, "grad_norm": 457.76422119140625, "learning_rate": 0.00019119191357390612, "loss": 35.7753, "step": 10921 }, { "epoch": 28.84648398811489, "grad_norm": 652.0157470703125, "learning_rate": 0.00019115131018048538, "loss": 35.1428, "step": 10922 }, { "epoch": 28.849125123803237, "grad_norm": 837.0068969726562, "learning_rate": 0.00019111070843037974, "loss": 36.8961, "step": 10923 }, { "epoch": 28.85176625949158, "grad_norm": 1796.29638671875, "learning_rate": 0.00019107010832472297, "loss": 41.559, "step": 10924 }, { "epoch": 28.854407395179926, "grad_norm": 1672.037109375, "learning_rate": 0.0001910295098646487, "loss": 27.9274, "step": 10925 }, { "epoch": 28.857048530868273, "grad_norm": 3311.4658203125, "learning_rate": 0.00019098891305129083, "loss": 23.7322, "step": 10926 }, { "epoch": 28.85968966655662, "grad_norm": 2823.754638671875, "learning_rate": 0.00019094831788578297, "loss": 22.3697, "step": 10927 }, { "epoch": 28.862330802244966, "grad_norm": 1166.6854248046875, "learning_rate": 0.00019090772436925846, "loss": 17.4281, "step": 10928 }, { "epoch": 28.864971937933312, "grad_norm": 437.0995178222656, "learning_rate": 0.00019086713250285114, "loss": 17.128, "step": 10929 }, { "epoch": 28.86761307362166, "grad_norm": 472.038818359375, "learning_rate": 0.0001908265422876944, "loss": 13.0497, "step": 10930 }, { "epoch": 28.870254209310005, "grad_norm": 582.2748413085938, "learning_rate": 0.00019078595372492175, "loss": 10.4902, "step": 10931 }, { "epoch": 28.872895344998348, "grad_norm": 527.5643310546875, "learning_rate": 0.00019074536681566644, "loss": 8.8782, "step": 10932 }, { "epoch": 28.875536480686694, "grad_norm": 1325.4219970703125, "learning_rate": 0.00019070478156106208, "loss": 18.2854, "step": 10933 }, { "epoch": 28.87817761637504, "grad_norm": 1241.634033203125, "learning_rate": 0.00019066419796224182, "loss": 31.83, "step": 10934 }, { "epoch": 28.880818752063387, "grad_norm": 464.3650817871094, "learning_rate": 0.00019062361602033891, "loss": 36.3266, "step": 10935 }, { "epoch": 28.883459887751734, "grad_norm": 286.9295349121094, "learning_rate": 0.00019058303573648674, "loss": 35.3015, "step": 10936 }, { "epoch": 28.88610102344008, "grad_norm": 302.33953857421875, "learning_rate": 0.00019054245711181837, "loss": 37.7827, "step": 10937 }, { "epoch": 28.888742159128427, "grad_norm": 538.1773681640625, "learning_rate": 0.00019050188014746694, "loss": 35.0693, "step": 10938 }, { "epoch": 28.89138329481677, "grad_norm": 374.7483825683594, "learning_rate": 0.00019046130484456553, "loss": 35.3293, "step": 10939 }, { "epoch": 28.894024430505116, "grad_norm": 645.2488403320312, "learning_rate": 0.00019042073120424728, "loss": 34.1888, "step": 10940 }, { "epoch": 28.896665566193462, "grad_norm": 403.596923828125, "learning_rate": 0.00019038015922764507, "loss": 35.0527, "step": 10941 }, { "epoch": 28.89930670188181, "grad_norm": 247.0836944580078, "learning_rate": 0.0001903395889158918, "loss": 34.9456, "step": 10942 }, { "epoch": 28.901947837570155, "grad_norm": 683.6395874023438, "learning_rate": 0.00019029902027012057, "loss": 36.2596, "step": 10943 }, { "epoch": 28.9045889732585, "grad_norm": 272.9915466308594, "learning_rate": 0.00019025845329146419, "loss": 35.0798, "step": 10944 }, { "epoch": 28.907230108946848, "grad_norm": 576.7091064453125, "learning_rate": 0.00019021788798105532, "loss": 35.3508, "step": 10945 }, { "epoch": 28.909871244635195, "grad_norm": 550.2239990234375, "learning_rate": 0.00019017732434002674, "loss": 35.4353, "step": 10946 }, { "epoch": 28.912512380323538, "grad_norm": 571.919189453125, "learning_rate": 0.00019013676236951133, "loss": 36.1252, "step": 10947 }, { "epoch": 28.915153516011884, "grad_norm": 272.2864074707031, "learning_rate": 0.00019009620207064166, "loss": 34.9415, "step": 10948 }, { "epoch": 28.91779465170023, "grad_norm": 523.5857543945312, "learning_rate": 0.0001900556434445503, "loss": 34.4745, "step": 10949 }, { "epoch": 28.920435787388577, "grad_norm": 1440.732177734375, "learning_rate": 0.00019001508649237, "loss": 35.5887, "step": 10950 }, { "epoch": 28.923076923076923, "grad_norm": 869.7168579101562, "learning_rate": 0.00018997453121523308, "loss": 36.9799, "step": 10951 }, { "epoch": 28.92571805876527, "grad_norm": 491.67388916015625, "learning_rate": 0.00018993397761427218, "loss": 42.4615, "step": 10952 }, { "epoch": 28.928359194453616, "grad_norm": 982.4248046875, "learning_rate": 0.00018989342569061956, "loss": 41.6456, "step": 10953 }, { "epoch": 28.931000330141963, "grad_norm": 385.0172119140625, "learning_rate": 0.00018985287544540783, "loss": 41.749, "step": 10954 }, { "epoch": 28.933641465830306, "grad_norm": 379.12066650390625, "learning_rate": 0.0001898123268797692, "loss": 42.9281, "step": 10955 }, { "epoch": 28.936282601518652, "grad_norm": 264.75726318359375, "learning_rate": 0.0001897717799948359, "loss": 42.9622, "step": 10956 }, { "epoch": 28.938923737207, "grad_norm": 303.10498046875, "learning_rate": 0.00018973123479174036, "loss": 43.5653, "step": 10957 }, { "epoch": 28.941564872895345, "grad_norm": 296.87200927734375, "learning_rate": 0.00018969069127161467, "loss": 40.5693, "step": 10958 }, { "epoch": 28.94420600858369, "grad_norm": 599.327392578125, "learning_rate": 0.000189650149435591, "loss": 38.8109, "step": 10959 }, { "epoch": 28.946847144272038, "grad_norm": 543.4644775390625, "learning_rate": 0.00018960960928480136, "loss": 37.2967, "step": 10960 }, { "epoch": 28.949488279960384, "grad_norm": 644.0479125976562, "learning_rate": 0.000189569070820378, "loss": 37.0836, "step": 10961 }, { "epoch": 28.952129415648727, "grad_norm": 204.77157592773438, "learning_rate": 0.00018952853404345288, "loss": 34.9884, "step": 10962 }, { "epoch": 28.954770551337074, "grad_norm": 815.5321655273438, "learning_rate": 0.00018948799895515778, "loss": 35.1289, "step": 10963 }, { "epoch": 28.95741168702542, "grad_norm": 390.7596435546875, "learning_rate": 0.00018944746555662486, "loss": 36.6604, "step": 10964 }, { "epoch": 28.960052822713767, "grad_norm": 2049.79736328125, "learning_rate": 0.00018940693384898591, "loss": 30.1375, "step": 10965 }, { "epoch": 28.962693958402113, "grad_norm": 1174.3948974609375, "learning_rate": 0.00018936640383337272, "loss": 11.3878, "step": 10966 }, { "epoch": 28.96533509409046, "grad_norm": 1898.6474609375, "learning_rate": 0.00018932587551091703, "loss": 18.936, "step": 10967 }, { "epoch": 28.967976229778806, "grad_norm": 1401.0213623046875, "learning_rate": 0.0001892853488827507, "loss": 18.9569, "step": 10968 }, { "epoch": 28.970617365467152, "grad_norm": 1192.052001953125, "learning_rate": 0.00018924482395000532, "loss": 13.4208, "step": 10969 }, { "epoch": 28.973258501155495, "grad_norm": 1040.507080078125, "learning_rate": 0.0001892043007138125, "loss": 19.838, "step": 10970 }, { "epoch": 28.97589963684384, "grad_norm": 743.2376708984375, "learning_rate": 0.00018916377917530393, "loss": 35.5653, "step": 10971 }, { "epoch": 28.978540772532188, "grad_norm": 279.1946105957031, "learning_rate": 0.00018912325933561106, "loss": 35.1021, "step": 10972 }, { "epoch": 28.981181908220535, "grad_norm": 433.6942443847656, "learning_rate": 0.0001890827411958654, "loss": 35.2223, "step": 10973 }, { "epoch": 28.98382304390888, "grad_norm": 264.39556884765625, "learning_rate": 0.0001890422247571984, "loss": 35.7925, "step": 10974 }, { "epoch": 28.986464179597228, "grad_norm": 921.2327270507812, "learning_rate": 0.00018900171002074148, "loss": 34.9389, "step": 10975 }, { "epoch": 28.989105315285574, "grad_norm": 503.7756042480469, "learning_rate": 0.00018896119698762602, "loss": 34.8576, "step": 10976 }, { "epoch": 28.99174645097392, "grad_norm": 701.4953002929688, "learning_rate": 0.00018892068565898312, "loss": 34.7746, "step": 10977 }, { "epoch": 28.994387586662263, "grad_norm": 384.4337158203125, "learning_rate": 0.0001888801760359443, "loss": 34.2291, "step": 10978 }, { "epoch": 28.99702872235061, "grad_norm": 1582.52587890625, "learning_rate": 0.00018883966811964077, "loss": 34.7887, "step": 10979 }, { "epoch": 28.999669858038956, "grad_norm": 501.95989990234375, "learning_rate": 0.00018879916191120349, "loss": 38.3724, "step": 10980 }, { "epoch": 29.002310993727303, "grad_norm": 419.0776672363281, "learning_rate": 0.00018875865741176356, "loss": 39.6754, "step": 10981 }, { "epoch": 29.00495212941565, "grad_norm": 442.2843017578125, "learning_rate": 0.00018871815462245226, "loss": 39.8634, "step": 10982 }, { "epoch": 29.007593265103996, "grad_norm": 287.3890075683594, "learning_rate": 0.00018867765354440048, "loss": 39.1174, "step": 10983 }, { "epoch": 29.010234400792342, "grad_norm": 562.086181640625, "learning_rate": 0.00018863715417873916, "loss": 40.1066, "step": 10984 }, { "epoch": 29.012875536480685, "grad_norm": 295.5762634277344, "learning_rate": 0.0001885966565265993, "loss": 41.8482, "step": 10985 }, { "epoch": 29.01551667216903, "grad_norm": 826.7101440429688, "learning_rate": 0.00018855616058911177, "loss": 41.1028, "step": 10986 }, { "epoch": 29.018157807857378, "grad_norm": 1149.8441162109375, "learning_rate": 0.00018851566636740732, "loss": 39.9907, "step": 10987 }, { "epoch": 29.020798943545724, "grad_norm": 537.105712890625, "learning_rate": 0.00018847517386261675, "loss": 41.6897, "step": 10988 }, { "epoch": 29.02344007923407, "grad_norm": 196.84271240234375, "learning_rate": 0.00018843468307587087, "loss": 37.5946, "step": 10989 }, { "epoch": 29.026081214922417, "grad_norm": 702.3473510742188, "learning_rate": 0.00018839419400830027, "loss": 42.0339, "step": 10990 }, { "epoch": 29.028722350610764, "grad_norm": 397.18096923828125, "learning_rate": 0.00018835370666103553, "loss": 38.4675, "step": 10991 }, { "epoch": 29.03136348629911, "grad_norm": 383.26116943359375, "learning_rate": 0.00018831322103520743, "loss": 38.0375, "step": 10992 }, { "epoch": 29.034004621987453, "grad_norm": 934.8701171875, "learning_rate": 0.00018827273713194637, "loss": 37.6768, "step": 10993 }, { "epoch": 29.0366457576758, "grad_norm": 371.4425964355469, "learning_rate": 0.00018823225495238286, "loss": 35.8796, "step": 10994 }, { "epoch": 29.039286893364146, "grad_norm": 319.4157409667969, "learning_rate": 0.00018819177449764726, "loss": 36.4471, "step": 10995 }, { "epoch": 29.041928029052492, "grad_norm": 592.1325073242188, "learning_rate": 0.00018815129576887012, "loss": 35.4457, "step": 10996 }, { "epoch": 29.04456916474084, "grad_norm": 1320.789306640625, "learning_rate": 0.0001881108187671818, "loss": 35.3112, "step": 10997 }, { "epoch": 29.047210300429185, "grad_norm": 229.30555725097656, "learning_rate": 0.00018807034349371235, "loss": 34.181, "step": 10998 }, { "epoch": 29.04985143611753, "grad_norm": 247.5134735107422, "learning_rate": 0.00018802986994959227, "loss": 35.1573, "step": 10999 }, { "epoch": 29.052492571805878, "grad_norm": 265.327880859375, "learning_rate": 0.00018798939813595168, "loss": 36.9976, "step": 11000 }, { "epoch": 29.052492571805878, "eval_loss": 3.779315233230591, "eval_runtime": 2.0411, "eval_samples_per_second": 242.516, "eval_steps_per_second": 30.376, "step": 11000 }, { "epoch": 29.05513370749422, "grad_norm": 530.884521484375, "learning_rate": 0.00018794892805392072, "loss": 35.6315, "step": 11001 }, { "epoch": 29.057774843182568, "grad_norm": 691.7421264648438, "learning_rate": 0.00018790845970462943, "loss": 40.1315, "step": 11002 }, { "epoch": 29.060415978870914, "grad_norm": 775.5342407226562, "learning_rate": 0.000187867993089208, "loss": 17.5281, "step": 11003 }, { "epoch": 29.06305711455926, "grad_norm": 1709.7789306640625, "learning_rate": 0.00018782752820878634, "loss": 10.0893, "step": 11004 }, { "epoch": 29.065698250247607, "grad_norm": 471.11627197265625, "learning_rate": 0.0001877870650644944, "loss": 13.8362, "step": 11005 }, { "epoch": 29.068339385935953, "grad_norm": 790.9659423828125, "learning_rate": 0.00018774660365746215, "loss": 14.2434, "step": 11006 }, { "epoch": 29.0709805216243, "grad_norm": 5281.51806640625, "learning_rate": 0.00018770614398881948, "loss": 14.6176, "step": 11007 }, { "epoch": 29.073621657312643, "grad_norm": 3442.739990234375, "learning_rate": 0.00018766568605969598, "loss": 11.1339, "step": 11008 }, { "epoch": 29.07626279300099, "grad_norm": 541.7053833007812, "learning_rate": 0.00018762522987122174, "loss": 10.2957, "step": 11009 }, { "epoch": 29.078903928689336, "grad_norm": 14911.1572265625, "learning_rate": 0.0001875847754245263, "loss": 11.1911, "step": 11010 }, { "epoch": 29.081545064377682, "grad_norm": 6537.3818359375, "learning_rate": 0.0001875443227207393, "loss": 9.4016, "step": 11011 }, { "epoch": 29.08418620006603, "grad_norm": 1904.7572021484375, "learning_rate": 0.00018750387176099032, "loss": 27.3692, "step": 11012 }, { "epoch": 29.086827335754375, "grad_norm": 392.1922607421875, "learning_rate": 0.00018746342254640912, "loss": 35.7497, "step": 11013 }, { "epoch": 29.08946847144272, "grad_norm": 401.0101318359375, "learning_rate": 0.00018742297507812517, "loss": 36.2639, "step": 11014 }, { "epoch": 29.092109607131068, "grad_norm": 1035.6865234375, "learning_rate": 0.00018738252935726767, "loss": 35.5708, "step": 11015 }, { "epoch": 29.09475074281941, "grad_norm": 262.99676513671875, "learning_rate": 0.0001873420853849664, "loss": 36.6035, "step": 11016 }, { "epoch": 29.097391878507757, "grad_norm": 568.259033203125, "learning_rate": 0.0001873016431623506, "loss": 34.9036, "step": 11017 }, { "epoch": 29.100033014196104, "grad_norm": 396.39556884765625, "learning_rate": 0.00018726120269054955, "loss": 34.0405, "step": 11018 }, { "epoch": 29.10267414988445, "grad_norm": 918.7633056640625, "learning_rate": 0.00018722076397069255, "loss": 35.6423, "step": 11019 }, { "epoch": 29.105315285572797, "grad_norm": 954.5467529296875, "learning_rate": 0.00018718032700390885, "loss": 36.0536, "step": 11020 }, { "epoch": 29.107956421261143, "grad_norm": 440.8206481933594, "learning_rate": 0.00018713989179132762, "loss": 34.3825, "step": 11021 }, { "epoch": 29.11059755694949, "grad_norm": 300.45220947265625, "learning_rate": 0.00018709945833407794, "loss": 35.4842, "step": 11022 }, { "epoch": 29.113238692637836, "grad_norm": 296.7980651855469, "learning_rate": 0.000187059026633289, "loss": 36.1168, "step": 11023 }, { "epoch": 29.11587982832618, "grad_norm": 287.4378967285156, "learning_rate": 0.00018701859669008976, "loss": 37.4173, "step": 11024 }, { "epoch": 29.118520964014525, "grad_norm": 272.9539489746094, "learning_rate": 0.00018697816850560923, "loss": 35.0592, "step": 11025 }, { "epoch": 29.12116209970287, "grad_norm": 874.1095581054688, "learning_rate": 0.00018693774208097618, "loss": 33.7643, "step": 11026 }, { "epoch": 29.123803235391218, "grad_norm": 816.5919189453125, "learning_rate": 0.0001868973174173198, "loss": 33.9503, "step": 11027 }, { "epoch": 29.126444371079565, "grad_norm": 434.1478576660156, "learning_rate": 0.00018685689451576872, "loss": 35.7762, "step": 11028 }, { "epoch": 29.12908550676791, "grad_norm": 419.516357421875, "learning_rate": 0.00018681647337745168, "loss": 38.1131, "step": 11029 }, { "epoch": 29.131726642456258, "grad_norm": 399.5688171386719, "learning_rate": 0.00018677605400349763, "loss": 38.3103, "step": 11030 }, { "epoch": 29.1343677781446, "grad_norm": 259.93182373046875, "learning_rate": 0.00018673563639503508, "loss": 42.1095, "step": 11031 }, { "epoch": 29.137008913832947, "grad_norm": 602.1558837890625, "learning_rate": 0.00018669522055319278, "loss": 40.8697, "step": 11032 }, { "epoch": 29.139650049521293, "grad_norm": 765.6524658203125, "learning_rate": 0.0001866548064790991, "loss": 39.8447, "step": 11033 }, { "epoch": 29.14229118520964, "grad_norm": 220.71063232421875, "learning_rate": 0.00018661439417388286, "loss": 40.911, "step": 11034 }, { "epoch": 29.144932320897986, "grad_norm": 348.86029052734375, "learning_rate": 0.00018657398363867246, "loss": 42.5507, "step": 11035 }, { "epoch": 29.147573456586333, "grad_norm": 349.2120666503906, "learning_rate": 0.0001865335748745962, "loss": 41.3939, "step": 11036 }, { "epoch": 29.15021459227468, "grad_norm": 633.6917724609375, "learning_rate": 0.00018649316788278263, "loss": 42.8171, "step": 11037 }, { "epoch": 29.152855727963026, "grad_norm": 282.0126647949219, "learning_rate": 0.00018645276266436006, "loss": 40.4496, "step": 11038 }, { "epoch": 29.15549686365137, "grad_norm": 295.8717041015625, "learning_rate": 0.0001864123592204568, "loss": 39.6801, "step": 11039 }, { "epoch": 29.158137999339715, "grad_norm": 257.5666809082031, "learning_rate": 0.00018637195755220095, "loss": 38.4547, "step": 11040 }, { "epoch": 29.16077913502806, "grad_norm": 289.15509033203125, "learning_rate": 0.0001863315576607209, "loss": 36.2322, "step": 11041 }, { "epoch": 29.163420270716408, "grad_norm": 448.88726806640625, "learning_rate": 0.00018629115954714464, "loss": 38.8358, "step": 11042 }, { "epoch": 29.166061406404754, "grad_norm": 351.6520080566406, "learning_rate": 0.0001862507632126003, "loss": 37.3991, "step": 11043 }, { "epoch": 29.1687025420931, "grad_norm": 321.58624267578125, "learning_rate": 0.00018621036865821606, "loss": 35.9336, "step": 11044 }, { "epoch": 29.171343677781447, "grad_norm": 256.3047180175781, "learning_rate": 0.00018616997588511976, "loss": 36.0845, "step": 11045 }, { "epoch": 29.173984813469794, "grad_norm": 1264.8299560546875, "learning_rate": 0.00018612958489443938, "loss": 36.6839, "step": 11046 }, { "epoch": 29.176625949158137, "grad_norm": 425.0970153808594, "learning_rate": 0.00018608919568730272, "loss": 34.6757, "step": 11047 }, { "epoch": 29.179267084846483, "grad_norm": 267.9573974609375, "learning_rate": 0.00018604880826483784, "loss": 34.8277, "step": 11048 }, { "epoch": 29.18190822053483, "grad_norm": 295.6809387207031, "learning_rate": 0.00018600842262817246, "loss": 34.8183, "step": 11049 }, { "epoch": 29.184549356223176, "grad_norm": 878.8816528320312, "learning_rate": 0.00018596803877843416, "loss": 35.8848, "step": 11050 }, { "epoch": 29.187190491911522, "grad_norm": 338.6939697265625, "learning_rate": 0.00018592765671675081, "loss": 35.3583, "step": 11051 }, { "epoch": 29.18983162759987, "grad_norm": 1253.0953369140625, "learning_rate": 0.00018588727644425006, "loss": 39.675, "step": 11052 }, { "epoch": 29.192472763288215, "grad_norm": 2504.47998046875, "learning_rate": 0.00018584689796205946, "loss": 17.9829, "step": 11053 }, { "epoch": 29.195113898976558, "grad_norm": 1404.09765625, "learning_rate": 0.00018580652127130648, "loss": 12.338, "step": 11054 }, { "epoch": 29.197755034664905, "grad_norm": 769.7220458984375, "learning_rate": 0.00018576614637311873, "loss": 16.4763, "step": 11055 }, { "epoch": 29.20039617035325, "grad_norm": 2931.150390625, "learning_rate": 0.0001857257732686236, "loss": 17.3628, "step": 11056 }, { "epoch": 29.203037306041598, "grad_norm": 766.96630859375, "learning_rate": 0.00018568540195894846, "loss": 17.7611, "step": 11057 }, { "epoch": 29.205678441729944, "grad_norm": 6656.56884765625, "learning_rate": 0.00018564503244522073, "loss": 17.3521, "step": 11058 }, { "epoch": 29.20831957741829, "grad_norm": 6175.6767578125, "learning_rate": 0.00018560466472856767, "loss": 13.9139, "step": 11059 }, { "epoch": 29.210960713106637, "grad_norm": 667.2337036132812, "learning_rate": 0.00018556429881011656, "loss": 11.5889, "step": 11060 }, { "epoch": 29.213601848794983, "grad_norm": 1750.4957275390625, "learning_rate": 0.00018552393469099438, "loss": 13.6357, "step": 11061 }, { "epoch": 29.216242984483326, "grad_norm": 329.82989501953125, "learning_rate": 0.00018548357237232866, "loss": 17.1756, "step": 11062 }, { "epoch": 29.218884120171673, "grad_norm": 394.3479919433594, "learning_rate": 0.0001854432118552462, "loss": 34.9437, "step": 11063 }, { "epoch": 29.22152525586002, "grad_norm": 496.49005126953125, "learning_rate": 0.000185402853140874, "loss": 34.9364, "step": 11064 }, { "epoch": 29.224166391548366, "grad_norm": 453.1318664550781, "learning_rate": 0.00018536249623033932, "loss": 36.0334, "step": 11065 }, { "epoch": 29.226807527236712, "grad_norm": 309.01483154296875, "learning_rate": 0.00018532214112476896, "loss": 35.7666, "step": 11066 }, { "epoch": 29.22944866292506, "grad_norm": 984.73486328125, "learning_rate": 0.00018528178782528987, "loss": 35.2764, "step": 11067 }, { "epoch": 29.232089798613405, "grad_norm": 319.9355163574219, "learning_rate": 0.00018524143633302865, "loss": 33.8551, "step": 11068 }, { "epoch": 29.23473093430175, "grad_norm": 422.62652587890625, "learning_rate": 0.0001852010866491124, "loss": 35.3116, "step": 11069 }, { "epoch": 29.237372069990094, "grad_norm": 341.4410400390625, "learning_rate": 0.00018516073877466777, "loss": 34.0596, "step": 11070 }, { "epoch": 29.24001320567844, "grad_norm": 488.8677062988281, "learning_rate": 0.00018512039271082134, "loss": 34.4667, "step": 11071 }, { "epoch": 29.242654341366787, "grad_norm": 529.9269409179688, "learning_rate": 0.00018508004845869987, "loss": 36.0246, "step": 11072 }, { "epoch": 29.245295477055134, "grad_norm": 1114.2115478515625, "learning_rate": 0.00018503970601942992, "loss": 36.4322, "step": 11073 }, { "epoch": 29.24793661274348, "grad_norm": 461.5663757324219, "learning_rate": 0.00018499936539413805, "loss": 34.3857, "step": 11074 }, { "epoch": 29.250577748431827, "grad_norm": 388.38629150390625, "learning_rate": 0.00018495902658395065, "loss": 34.3697, "step": 11075 }, { "epoch": 29.253218884120173, "grad_norm": 337.6130676269531, "learning_rate": 0.00018491868958999427, "loss": 35.197, "step": 11076 }, { "epoch": 29.255860019808516, "grad_norm": 1077.4312744140625, "learning_rate": 0.00018487835441339525, "loss": 33.6216, "step": 11077 }, { "epoch": 29.258501155496862, "grad_norm": 305.5246887207031, "learning_rate": 0.00018483802105527986, "loss": 35.8549, "step": 11078 }, { "epoch": 29.26114229118521, "grad_norm": 2242.6708984375, "learning_rate": 0.00018479768951677457, "loss": 36.2753, "step": 11079 }, { "epoch": 29.263783426873555, "grad_norm": 882.9261474609375, "learning_rate": 0.00018475735979900544, "loss": 39.6759, "step": 11080 }, { "epoch": 29.2664245625619, "grad_norm": 274.0986633300781, "learning_rate": 0.00018471703190309875, "loss": 40.2454, "step": 11081 }, { "epoch": 29.269065698250248, "grad_norm": 616.3594360351562, "learning_rate": 0.00018467670583018045, "loss": 39.1796, "step": 11082 }, { "epoch": 29.271706833938595, "grad_norm": 396.3511657714844, "learning_rate": 0.00018463638158137691, "loss": 39.3687, "step": 11083 }, { "epoch": 29.27434796962694, "grad_norm": 262.7916564941406, "learning_rate": 0.00018459605915781402, "loss": 40.249, "step": 11084 }, { "epoch": 29.276989105315284, "grad_norm": 262.6884765625, "learning_rate": 0.00018455573856061765, "loss": 45.9586, "step": 11085 }, { "epoch": 29.27963024100363, "grad_norm": 451.4105529785156, "learning_rate": 0.0001845154197909139, "loss": 41.4797, "step": 11086 }, { "epoch": 29.282271376691977, "grad_norm": 361.21728515625, "learning_rate": 0.00018447510284982864, "loss": 44.3803, "step": 11087 }, { "epoch": 29.284912512380323, "grad_norm": 634.4456787109375, "learning_rate": 0.0001844347877384876, "loss": 40.708, "step": 11088 }, { "epoch": 29.28755364806867, "grad_norm": 323.1626892089844, "learning_rate": 0.00018439447445801655, "loss": 41.9449, "step": 11089 }, { "epoch": 29.290194783757016, "grad_norm": 755.2767944335938, "learning_rate": 0.00018435416300954132, "loss": 41.8057, "step": 11090 }, { "epoch": 29.292835919445363, "grad_norm": 184.3681640625, "learning_rate": 0.00018431385339418753, "loss": 39.3709, "step": 11091 }, { "epoch": 29.29547705513371, "grad_norm": 282.96612548828125, "learning_rate": 0.00018427354561308075, "loss": 36.4237, "step": 11092 }, { "epoch": 29.298118190822052, "grad_norm": 551.1918334960938, "learning_rate": 0.0001842332396673466, "loss": 37.7118, "step": 11093 }, { "epoch": 29.3007593265104, "grad_norm": 366.49517822265625, "learning_rate": 0.00018419293555811065, "loss": 36.4709, "step": 11094 }, { "epoch": 29.303400462198745, "grad_norm": 417.60430908203125, "learning_rate": 0.0001841526332864983, "loss": 35.5872, "step": 11095 }, { "epoch": 29.30604159788709, "grad_norm": 556.4608154296875, "learning_rate": 0.00018411233285363487, "loss": 35.9323, "step": 11096 }, { "epoch": 29.308682733575438, "grad_norm": 256.1919860839844, "learning_rate": 0.00018407203426064602, "loss": 34.4488, "step": 11097 }, { "epoch": 29.311323869263784, "grad_norm": 655.3599853515625, "learning_rate": 0.00018403173750865685, "loss": 35.4413, "step": 11098 }, { "epoch": 29.31396500495213, "grad_norm": 207.1988067626953, "learning_rate": 0.00018399144259879252, "loss": 34.7621, "step": 11099 }, { "epoch": 29.316606140640474, "grad_norm": 532.1217041015625, "learning_rate": 0.0001839511495321785, "loss": 35.6135, "step": 11100 }, { "epoch": 29.31924727632882, "grad_norm": 349.1621398925781, "learning_rate": 0.0001839108583099398, "loss": 36.0966, "step": 11101 }, { "epoch": 29.321888412017167, "grad_norm": 654.6876220703125, "learning_rate": 0.00018387056893320164, "loss": 36.1369, "step": 11102 }, { "epoch": 29.324529547705513, "grad_norm": 811.5839233398438, "learning_rate": 0.00018383028140308883, "loss": 30.5934, "step": 11103 }, { "epoch": 29.32717068339386, "grad_norm": 3907.41650390625, "learning_rate": 0.00018378999572072667, "loss": 19.9537, "step": 11104 }, { "epoch": 29.329811819082206, "grad_norm": 1825.463134765625, "learning_rate": 0.00018374971188723997, "loss": 22.4364, "step": 11105 }, { "epoch": 29.332452954770552, "grad_norm": 1301.603271484375, "learning_rate": 0.00018370942990375357, "loss": 17.5255, "step": 11106 }, { "epoch": 29.3350940904589, "grad_norm": 645.1001586914062, "learning_rate": 0.0001836691497713925, "loss": 13.9747, "step": 11107 }, { "epoch": 29.33773522614724, "grad_norm": 3315.6875, "learning_rate": 0.00018362887149128143, "loss": 9.9915, "step": 11108 }, { "epoch": 29.340376361835588, "grad_norm": 1858.4649658203125, "learning_rate": 0.00018358859506454513, "loss": 13.8828, "step": 11109 }, { "epoch": 29.343017497523935, "grad_norm": 323.67950439453125, "learning_rate": 0.00018354832049230824, "loss": 10.2307, "step": 11110 }, { "epoch": 29.34565863321228, "grad_norm": 644.3701171875, "learning_rate": 0.00018350804777569551, "loss": 10.5956, "step": 11111 }, { "epoch": 29.348299768900628, "grad_norm": 304.1355285644531, "learning_rate": 0.00018346777691583157, "loss": 22.0657, "step": 11112 }, { "epoch": 29.350940904588974, "grad_norm": 651.2887573242188, "learning_rate": 0.00018342750791384066, "loss": 38.0863, "step": 11113 }, { "epoch": 29.35358204027732, "grad_norm": 267.13140869140625, "learning_rate": 0.0001833872407708477, "loss": 35.5537, "step": 11114 }, { "epoch": 29.356223175965667, "grad_norm": 291.4630432128906, "learning_rate": 0.00018334697548797686, "loss": 34.2844, "step": 11115 }, { "epoch": 29.35886431165401, "grad_norm": 514.2410278320312, "learning_rate": 0.00018330671206635257, "loss": 33.169, "step": 11116 }, { "epoch": 29.361505447342356, "grad_norm": 421.8961181640625, "learning_rate": 0.00018326645050709906, "loss": 34.5752, "step": 11117 }, { "epoch": 29.364146583030703, "grad_norm": 606.2244873046875, "learning_rate": 0.0001832261908113408, "loss": 35.0984, "step": 11118 }, { "epoch": 29.36678771871905, "grad_norm": 728.2184448242188, "learning_rate": 0.00018318593298020204, "loss": 35.1021, "step": 11119 }, { "epoch": 29.369428854407396, "grad_norm": 710.7893676757812, "learning_rate": 0.00018314567701480664, "loss": 35.1688, "step": 11120 }, { "epoch": 29.372069990095742, "grad_norm": 225.30535888671875, "learning_rate": 0.00018310542291627907, "loss": 34.8175, "step": 11121 }, { "epoch": 29.37471112578409, "grad_norm": 287.0409240722656, "learning_rate": 0.00018306517068574326, "loss": 35.6282, "step": 11122 }, { "epoch": 29.37735226147243, "grad_norm": 515.982177734375, "learning_rate": 0.00018302492032432326, "loss": 34.4867, "step": 11123 }, { "epoch": 29.379993397160778, "grad_norm": 242.6053466796875, "learning_rate": 0.00018298467183314293, "loss": 34.14, "step": 11124 }, { "epoch": 29.382634532849124, "grad_norm": 411.6214904785156, "learning_rate": 0.00018294442521332634, "loss": 34.7504, "step": 11125 }, { "epoch": 29.38527566853747, "grad_norm": 204.1327362060547, "learning_rate": 0.00018290418046599733, "loss": 34.257, "step": 11126 }, { "epoch": 29.387916804225817, "grad_norm": 394.8511962890625, "learning_rate": 0.00018286393759227955, "loss": 34.3273, "step": 11127 }, { "epoch": 29.390557939914164, "grad_norm": 397.69403076171875, "learning_rate": 0.00018282369659329694, "loss": 34.9474, "step": 11128 }, { "epoch": 29.39319907560251, "grad_norm": 721.0634765625, "learning_rate": 0.00018278345747017315, "loss": 35.1699, "step": 11129 }, { "epoch": 29.395840211290857, "grad_norm": 653.8502197265625, "learning_rate": 0.00018274322022403185, "loss": 38.5036, "step": 11130 }, { "epoch": 29.3984813469792, "grad_norm": 397.79345703125, "learning_rate": 0.00018270298485599645, "loss": 39.4394, "step": 11131 }, { "epoch": 29.401122482667546, "grad_norm": 397.68121337890625, "learning_rate": 0.0001826627513671909, "loss": 39.8444, "step": 11132 }, { "epoch": 29.403763618355892, "grad_norm": 589.3433227539062, "learning_rate": 0.00018262251975873837, "loss": 39.9794, "step": 11133 }, { "epoch": 29.40640475404424, "grad_norm": 857.114990234375, "learning_rate": 0.0001825822900317623, "loss": 40.3149, "step": 11134 }, { "epoch": 29.409045889732585, "grad_norm": 163.46621704101562, "learning_rate": 0.00018254206218738626, "loss": 41.6563, "step": 11135 }, { "epoch": 29.41168702542093, "grad_norm": 240.0555419921875, "learning_rate": 0.0001825018362267335, "loss": 41.3427, "step": 11136 }, { "epoch": 29.414328161109278, "grad_norm": 256.8756408691406, "learning_rate": 0.0001824616121509273, "loss": 43.0865, "step": 11137 }, { "epoch": 29.416969296797625, "grad_norm": 393.9174499511719, "learning_rate": 0.00018242138996109092, "loss": 39.8015, "step": 11138 }, { "epoch": 29.419610432485968, "grad_norm": 218.7537841796875, "learning_rate": 0.00018238116965834755, "loss": 39.6243, "step": 11139 }, { "epoch": 29.422251568174314, "grad_norm": 206.17002868652344, "learning_rate": 0.0001823409512438203, "loss": 39.7116, "step": 11140 }, { "epoch": 29.42489270386266, "grad_norm": 359.45562744140625, "learning_rate": 0.00018230073471863217, "loss": 39.1688, "step": 11141 }, { "epoch": 29.427533839551007, "grad_norm": 254.2259063720703, "learning_rate": 0.00018226052008390634, "loss": 36.5506, "step": 11142 }, { "epoch": 29.430174975239353, "grad_norm": 167.23748779296875, "learning_rate": 0.00018222030734076566, "loss": 38.5918, "step": 11143 }, { "epoch": 29.4328161109277, "grad_norm": 491.08758544921875, "learning_rate": 0.00018218009649033308, "loss": 36.95, "step": 11144 }, { "epoch": 29.435457246616046, "grad_norm": 307.5212707519531, "learning_rate": 0.00018213988753373146, "loss": 35.106, "step": 11145 }, { "epoch": 29.43809838230439, "grad_norm": 278.3515625, "learning_rate": 0.00018209968047208365, "loss": 35.0667, "step": 11146 }, { "epoch": 29.440739517992736, "grad_norm": 249.25640869140625, "learning_rate": 0.00018205947530651234, "loss": 35.4671, "step": 11147 }, { "epoch": 29.443380653681082, "grad_norm": 419.8179626464844, "learning_rate": 0.0001820192720381402, "loss": 36.4912, "step": 11148 }, { "epoch": 29.44602178936943, "grad_norm": 389.0391540527344, "learning_rate": 0.00018197907066809011, "loss": 34.6786, "step": 11149 }, { "epoch": 29.448662925057775, "grad_norm": 436.02752685546875, "learning_rate": 0.00018193887119748444, "loss": 34.2015, "step": 11150 }, { "epoch": 29.45130406074612, "grad_norm": 121.87616729736328, "learning_rate": 0.00018189867362744573, "loss": 35.0952, "step": 11151 }, { "epoch": 29.453945196434468, "grad_norm": 322.7328186035156, "learning_rate": 0.00018185847795909666, "loss": 37.0907, "step": 11152 }, { "epoch": 29.456586332122814, "grad_norm": 1921.9737548828125, "learning_rate": 0.00018181828419355957, "loss": 37.6405, "step": 11153 }, { "epoch": 29.459227467811157, "grad_norm": 1081.6287841796875, "learning_rate": 0.0001817780923319568, "loss": 19.1774, "step": 11154 }, { "epoch": 29.461868603499504, "grad_norm": 3303.303466796875, "learning_rate": 0.0001817379023754107, "loss": 23.8892, "step": 11155 }, { "epoch": 29.46450973918785, "grad_norm": 6809.4189453125, "learning_rate": 0.00018169771432504362, "loss": 17.8461, "step": 11156 }, { "epoch": 29.467150874876197, "grad_norm": 2270.856201171875, "learning_rate": 0.00018165752818197775, "loss": 13.7526, "step": 11157 }, { "epoch": 29.469792010564543, "grad_norm": 644.0167846679688, "learning_rate": 0.0001816173439473352, "loss": 16.6763, "step": 11158 }, { "epoch": 29.47243314625289, "grad_norm": 870.3238525390625, "learning_rate": 0.00018157716162223823, "loss": 13.0106, "step": 11159 }, { "epoch": 29.475074281941236, "grad_norm": 634.7130737304688, "learning_rate": 0.00018153698120780882, "loss": 15.385, "step": 11160 }, { "epoch": 29.477715417629582, "grad_norm": 4681.765625, "learning_rate": 0.00018149680270516897, "loss": 8.623, "step": 11161 }, { "epoch": 29.480356553317925, "grad_norm": 358.811767578125, "learning_rate": 0.00018145662611544061, "loss": 28.3479, "step": 11162 }, { "epoch": 29.48299768900627, "grad_norm": 442.6336669921875, "learning_rate": 0.00018141645143974577, "loss": 35.0502, "step": 11163 }, { "epoch": 29.485638824694618, "grad_norm": 179.2297821044922, "learning_rate": 0.00018137627867920625, "loss": 34.8053, "step": 11164 }, { "epoch": 29.488279960382965, "grad_norm": 226.76461791992188, "learning_rate": 0.0001813361078349437, "loss": 37.008, "step": 11165 }, { "epoch": 29.49092109607131, "grad_norm": 219.6723175048828, "learning_rate": 0.00018129593890808013, "loss": 34.7782, "step": 11166 }, { "epoch": 29.493562231759658, "grad_norm": 409.94866943359375, "learning_rate": 0.00018125577189973712, "loss": 34.4776, "step": 11167 }, { "epoch": 29.496203367448004, "grad_norm": 1054.63427734375, "learning_rate": 0.0001812156068110363, "loss": 36.6997, "step": 11168 }, { "epoch": 29.498844503136347, "grad_norm": 3933.684326171875, "learning_rate": 0.00018117544364309912, "loss": 35.6679, "step": 11169 }, { "epoch": 29.501485638824693, "grad_norm": 445.9559020996094, "learning_rate": 0.00018113528239704736, "loss": 34.805, "step": 11170 }, { "epoch": 29.50412677451304, "grad_norm": 471.7331848144531, "learning_rate": 0.0001810951230740024, "loss": 35.1913, "step": 11171 }, { "epoch": 29.506767910201386, "grad_norm": 772.1503295898438, "learning_rate": 0.00018105496567508557, "loss": 34.2512, "step": 11172 }, { "epoch": 29.509409045889733, "grad_norm": 763.637939453125, "learning_rate": 0.0001810148102014184, "loss": 35.5949, "step": 11173 }, { "epoch": 29.51205018157808, "grad_norm": 324.0919189453125, "learning_rate": 0.00018097465665412212, "loss": 35.4165, "step": 11174 }, { "epoch": 29.514691317266426, "grad_norm": 298.0108642578125, "learning_rate": 0.00018093450503431797, "loss": 35.0241, "step": 11175 }, { "epoch": 29.517332452954772, "grad_norm": 234.71507263183594, "learning_rate": 0.0001808943553431272, "loss": 34.8913, "step": 11176 }, { "epoch": 29.519973588643115, "grad_norm": 302.3192443847656, "learning_rate": 0.00018085420758167097, "loss": 35.2962, "step": 11177 }, { "epoch": 29.52261472433146, "grad_norm": 446.8777160644531, "learning_rate": 0.0001808140617510704, "loss": 36.3537, "step": 11178 }, { "epoch": 29.525255860019808, "grad_norm": 255.75660705566406, "learning_rate": 0.00018077391785244646, "loss": 38.1289, "step": 11179 }, { "epoch": 29.527896995708154, "grad_norm": 515.0046997070312, "learning_rate": 0.00018073377588692026, "loss": 38.8114, "step": 11180 }, { "epoch": 29.5305381313965, "grad_norm": 269.46478271484375, "learning_rate": 0.00018069363585561266, "loss": 39.5904, "step": 11181 }, { "epoch": 29.533179267084847, "grad_norm": 409.12396240234375, "learning_rate": 0.00018065349775964456, "loss": 39.2176, "step": 11182 }, { "epoch": 29.535820402773194, "grad_norm": 807.1159057617188, "learning_rate": 0.0001806133616001367, "loss": 40.828, "step": 11183 }, { "epoch": 29.53846153846154, "grad_norm": 330.1885986328125, "learning_rate": 0.00018057322737821013, "loss": 39.8652, "step": 11184 }, { "epoch": 29.541102674149883, "grad_norm": 354.80865478515625, "learning_rate": 0.00018053309509498533, "loss": 44.8405, "step": 11185 }, { "epoch": 29.54374380983823, "grad_norm": 214.1388397216797, "learning_rate": 0.00018049296475158294, "loss": 44.1182, "step": 11186 }, { "epoch": 29.546384945526576, "grad_norm": 352.571533203125, "learning_rate": 0.0001804528363491238, "loss": 42.039, "step": 11187 }, { "epoch": 29.549026081214922, "grad_norm": 436.9131774902344, "learning_rate": 0.00018041270988872836, "loss": 42.5017, "step": 11188 }, { "epoch": 29.55166721690327, "grad_norm": 318.54510498046875, "learning_rate": 0.00018037258537151706, "loss": 41.7196, "step": 11189 }, { "epoch": 29.554308352591615, "grad_norm": 1789.1630859375, "learning_rate": 0.00018033246279861042, "loss": 40.1668, "step": 11190 }, { "epoch": 29.55694948827996, "grad_norm": 431.2928771972656, "learning_rate": 0.0001802923421711289, "loss": 38.9517, "step": 11191 }, { "epoch": 29.559590623968305, "grad_norm": 625.5386962890625, "learning_rate": 0.0001802522234901927, "loss": 38.295, "step": 11192 }, { "epoch": 29.56223175965665, "grad_norm": 319.7022399902344, "learning_rate": 0.00018021210675692217, "loss": 35.8842, "step": 11193 }, { "epoch": 29.564872895344998, "grad_norm": 1799.71728515625, "learning_rate": 0.00018017199197243764, "loss": 37.2683, "step": 11194 }, { "epoch": 29.567514031033344, "grad_norm": 289.7334899902344, "learning_rate": 0.0001801318791378592, "loss": 35.9231, "step": 11195 }, { "epoch": 29.57015516672169, "grad_norm": 670.0664672851562, "learning_rate": 0.00018009176825430702, "loss": 35.6445, "step": 11196 }, { "epoch": 29.572796302410037, "grad_norm": 345.59423828125, "learning_rate": 0.00018005165932290102, "loss": 35.3327, "step": 11197 }, { "epoch": 29.575437438098383, "grad_norm": 416.25457763671875, "learning_rate": 0.00018001155234476148, "loss": 35.2625, "step": 11198 }, { "epoch": 29.57807857378673, "grad_norm": 263.7693786621094, "learning_rate": 0.00017997144732100816, "loss": 35.6209, "step": 11199 }, { "epoch": 29.580719709475073, "grad_norm": 1062.4893798828125, "learning_rate": 0.00017993134425276094, "loss": 35.2614, "step": 11200 }, { "epoch": 29.580719709475073, "eval_loss": 3.8270294666290283, "eval_runtime": 2.2319, "eval_samples_per_second": 221.784, "eval_steps_per_second": 27.779, "step": 11200 }, { "epoch": 29.58336084516342, "grad_norm": 253.57305908203125, "learning_rate": 0.00017989124314113993, "loss": 35.1627, "step": 11201 }, { "epoch": 29.586001980851766, "grad_norm": 383.4979248046875, "learning_rate": 0.00017985114398726476, "loss": 40.7077, "step": 11202 }, { "epoch": 29.588643116540112, "grad_norm": 1144.4263916015625, "learning_rate": 0.00017981104679225515, "loss": 32.2695, "step": 11203 }, { "epoch": 29.59128425222846, "grad_norm": 816.0383911132812, "learning_rate": 0.0001797709515572307, "loss": 16.4567, "step": 11204 }, { "epoch": 29.593925387916805, "grad_norm": 885.9412841796875, "learning_rate": 0.00017973085828331135, "loss": 12.6057, "step": 11205 }, { "epoch": 29.59656652360515, "grad_norm": 583.5675048828125, "learning_rate": 0.00017969076697161646, "loss": 11.6245, "step": 11206 }, { "epoch": 29.599207659293498, "grad_norm": 915.1746215820312, "learning_rate": 0.00017965067762326553, "loss": 12.2758, "step": 11207 }, { "epoch": 29.60184879498184, "grad_norm": 5140.927734375, "learning_rate": 0.0001796105902393782, "loss": 12.1102, "step": 11208 }, { "epoch": 29.604489930670187, "grad_norm": 1310.9869384765625, "learning_rate": 0.00017957050482107377, "loss": 15.6138, "step": 11209 }, { "epoch": 29.607131066358534, "grad_norm": 931.9530029296875, "learning_rate": 0.0001795304213694716, "loss": 11.4565, "step": 11210 }, { "epoch": 29.60977220204688, "grad_norm": 2011.971923828125, "learning_rate": 0.00017949033988569097, "loss": 20.0116, "step": 11211 }, { "epoch": 29.612413337735227, "grad_norm": 1120.8692626953125, "learning_rate": 0.00017945026037085126, "loss": 19.8114, "step": 11212 }, { "epoch": 29.615054473423573, "grad_norm": 695.4345703125, "learning_rate": 0.00017941018282607157, "loss": 36.1421, "step": 11213 }, { "epoch": 29.61769560911192, "grad_norm": 306.53265380859375, "learning_rate": 0.000179370107252471, "loss": 36.1879, "step": 11214 }, { "epoch": 29.620336744800262, "grad_norm": 696.7673950195312, "learning_rate": 0.00017933003365116874, "loss": 35.1603, "step": 11215 }, { "epoch": 29.62297788048861, "grad_norm": 298.4958801269531, "learning_rate": 0.0001792899620232838, "loss": 35.6087, "step": 11216 }, { "epoch": 29.625619016176955, "grad_norm": 647.943115234375, "learning_rate": 0.0001792498923699351, "loss": 34.9878, "step": 11217 }, { "epoch": 29.6282601518653, "grad_norm": 767.7568969726562, "learning_rate": 0.0001792098246922415, "loss": 35.6842, "step": 11218 }, { "epoch": 29.630901287553648, "grad_norm": 500.7084045410156, "learning_rate": 0.00017916975899132215, "loss": 34.5401, "step": 11219 }, { "epoch": 29.633542423241995, "grad_norm": 214.62791442871094, "learning_rate": 0.00017912969526829559, "loss": 33.5645, "step": 11220 }, { "epoch": 29.63618355893034, "grad_norm": 230.58876037597656, "learning_rate": 0.00017908963352428057, "loss": 35.1616, "step": 11221 }, { "epoch": 29.638824694618688, "grad_norm": 451.2610778808594, "learning_rate": 0.000179049573760396, "loss": 35.0718, "step": 11222 }, { "epoch": 29.64146583030703, "grad_norm": 546.6537475585938, "learning_rate": 0.00017900951597776035, "loss": 34.8676, "step": 11223 }, { "epoch": 29.644106965995377, "grad_norm": 502.6434631347656, "learning_rate": 0.0001789694601774923, "loss": 34.6557, "step": 11224 }, { "epoch": 29.646748101683723, "grad_norm": 324.7727355957031, "learning_rate": 0.00017892940636071028, "loss": 35.4089, "step": 11225 }, { "epoch": 29.64938923737207, "grad_norm": 1256.56689453125, "learning_rate": 0.0001788893545285329, "loss": 34.6332, "step": 11226 }, { "epoch": 29.652030373060416, "grad_norm": 841.1053466796875, "learning_rate": 0.00017884930468207853, "loss": 34.6804, "step": 11227 }, { "epoch": 29.654671508748763, "grad_norm": 486.4619445800781, "learning_rate": 0.00017880925682246546, "loss": 33.9578, "step": 11228 }, { "epoch": 29.65731264443711, "grad_norm": 338.58099365234375, "learning_rate": 0.00017876921095081214, "loss": 36.8632, "step": 11229 }, { "epoch": 29.659953780125456, "grad_norm": 676.1008911132812, "learning_rate": 0.00017872916706823677, "loss": 40.7501, "step": 11230 }, { "epoch": 29.6625949158138, "grad_norm": 632.8070068359375, "learning_rate": 0.0001786891251758575, "loss": 39.2528, "step": 11231 }, { "epoch": 29.665236051502145, "grad_norm": 235.23123168945312, "learning_rate": 0.0001786490852747925, "loss": 39.7239, "step": 11232 }, { "epoch": 29.66787718719049, "grad_norm": 575.92919921875, "learning_rate": 0.00017860904736615997, "loss": 41.0352, "step": 11233 }, { "epoch": 29.670518322878838, "grad_norm": 332.66583251953125, "learning_rate": 0.0001785690114510778, "loss": 43.1758, "step": 11234 }, { "epoch": 29.673159458567184, "grad_norm": 547.037109375, "learning_rate": 0.00017852897753066393, "loss": 43.3833, "step": 11235 }, { "epoch": 29.67580059425553, "grad_norm": 442.2765808105469, "learning_rate": 0.00017848894560603648, "loss": 42.7336, "step": 11236 }, { "epoch": 29.678441729943877, "grad_norm": 340.6794128417969, "learning_rate": 0.00017844891567831328, "loss": 43.6427, "step": 11237 }, { "epoch": 29.68108286563222, "grad_norm": 346.6161193847656, "learning_rate": 0.00017840888774861202, "loss": 42.39, "step": 11238 }, { "epoch": 29.683724001320567, "grad_norm": 289.92144775390625, "learning_rate": 0.0001783688618180504, "loss": 40.3175, "step": 11239 }, { "epoch": 29.686365137008913, "grad_norm": 450.056396484375, "learning_rate": 0.00017832883788774636, "loss": 39.7876, "step": 11240 }, { "epoch": 29.68900627269726, "grad_norm": 387.1024475097656, "learning_rate": 0.0001782888159588174, "loss": 38.8689, "step": 11241 }, { "epoch": 29.691647408385606, "grad_norm": 223.58221435546875, "learning_rate": 0.0001782487960323811, "loss": 39.8509, "step": 11242 }, { "epoch": 29.694288544073952, "grad_norm": 458.4283142089844, "learning_rate": 0.00017820877810955505, "loss": 37.1093, "step": 11243 }, { "epoch": 29.6969296797623, "grad_norm": 474.74835205078125, "learning_rate": 0.0001781687621914567, "loss": 35.8743, "step": 11244 }, { "epoch": 29.699570815450645, "grad_norm": 490.2605285644531, "learning_rate": 0.0001781287482792035, "loss": 35.5366, "step": 11245 }, { "epoch": 29.702211951138988, "grad_norm": 378.9906921386719, "learning_rate": 0.0001780887363739127, "loss": 35.9301, "step": 11246 }, { "epoch": 29.704853086827335, "grad_norm": 151.85781860351562, "learning_rate": 0.00017804872647670178, "loss": 35.9531, "step": 11247 }, { "epoch": 29.70749422251568, "grad_norm": 675.5677490234375, "learning_rate": 0.00017800871858868788, "loss": 36.1658, "step": 11248 }, { "epoch": 29.710135358204028, "grad_norm": 280.5035095214844, "learning_rate": 0.00017796871271098818, "loss": 34.9495, "step": 11249 }, { "epoch": 29.712776493892374, "grad_norm": 220.3152618408203, "learning_rate": 0.0001779287088447199, "loss": 35.3923, "step": 11250 }, { "epoch": 29.71541762958072, "grad_norm": 899.8474731445312, "learning_rate": 0.00017788870699100013, "loss": 34.8152, "step": 11251 }, { "epoch": 29.718058765269067, "grad_norm": 236.60763549804688, "learning_rate": 0.0001778487071509458, "loss": 37.4883, "step": 11252 }, { "epoch": 29.720699900957413, "grad_norm": 605.3970947265625, "learning_rate": 0.00017780870932567388, "loss": 24.3015, "step": 11253 }, { "epoch": 29.723341036645756, "grad_norm": 7844.75439453125, "learning_rate": 0.00017776871351630149, "loss": 13.7819, "step": 11254 }, { "epoch": 29.725982172334103, "grad_norm": 6650.4375, "learning_rate": 0.00017772871972394528, "loss": 16.923, "step": 11255 }, { "epoch": 29.72862330802245, "grad_norm": 411.1484680175781, "learning_rate": 0.00017768872794972202, "loss": 16.724, "step": 11256 }, { "epoch": 29.731264443710796, "grad_norm": 1944.906494140625, "learning_rate": 0.00017764873819474866, "loss": 12.442, "step": 11257 }, { "epoch": 29.733905579399142, "grad_norm": 813.3095092773438, "learning_rate": 0.0001776087504601418, "loss": 12.8062, "step": 11258 }, { "epoch": 29.73654671508749, "grad_norm": 333.0063781738281, "learning_rate": 0.00017756876474701804, "loss": 13.7303, "step": 11259 }, { "epoch": 29.739187850775835, "grad_norm": 1687.9512939453125, "learning_rate": 0.00017752878105649388, "loss": 13.2133, "step": 11260 }, { "epoch": 29.741828986464178, "grad_norm": 1873.20556640625, "learning_rate": 0.00017748879938968607, "loss": 9.8531, "step": 11261 }, { "epoch": 29.744470122152524, "grad_norm": 526.8546752929688, "learning_rate": 0.0001774488197477109, "loss": 14.6911, "step": 11262 }, { "epoch": 29.74711125784087, "grad_norm": 266.15472412109375, "learning_rate": 0.0001774088421316848, "loss": 36.192, "step": 11263 }, { "epoch": 29.749752393529217, "grad_norm": 1095.8966064453125, "learning_rate": 0.00017736886654272417, "loss": 33.8212, "step": 11264 }, { "epoch": 29.752393529217564, "grad_norm": 253.8108367919922, "learning_rate": 0.00017732889298194523, "loss": 34.4617, "step": 11265 }, { "epoch": 29.75503466490591, "grad_norm": 359.5855407714844, "learning_rate": 0.00017728892145046427, "loss": 35.8075, "step": 11266 }, { "epoch": 29.757675800594257, "grad_norm": 393.9140625, "learning_rate": 0.0001772489519493975, "loss": 35.225, "step": 11267 }, { "epoch": 29.760316936282603, "grad_norm": 564.1697998046875, "learning_rate": 0.000177208984479861, "loss": 34.3757, "step": 11268 }, { "epoch": 29.762958071970946, "grad_norm": 280.54559326171875, "learning_rate": 0.0001771690190429709, "loss": 34.8097, "step": 11269 }, { "epoch": 29.765599207659292, "grad_norm": 393.22369384765625, "learning_rate": 0.00017712905563984298, "loss": 34.5794, "step": 11270 }, { "epoch": 29.76824034334764, "grad_norm": 300.3481140136719, "learning_rate": 0.00017708909427159353, "loss": 33.6178, "step": 11271 }, { "epoch": 29.770881479035985, "grad_norm": 561.131591796875, "learning_rate": 0.00017704913493933833, "loss": 34.3614, "step": 11272 }, { "epoch": 29.77352261472433, "grad_norm": 383.44989013671875, "learning_rate": 0.00017700917764419303, "loss": 35.9389, "step": 11273 }, { "epoch": 29.776163750412678, "grad_norm": 1580.23046875, "learning_rate": 0.00017696922238727368, "loss": 34.4135, "step": 11274 }, { "epoch": 29.778804886101025, "grad_norm": 480.2593688964844, "learning_rate": 0.00017692926916969588, "loss": 35.2409, "step": 11275 }, { "epoch": 29.78144602178937, "grad_norm": 717.3609619140625, "learning_rate": 0.00017688931799257529, "loss": 34.053, "step": 11276 }, { "epoch": 29.784087157477714, "grad_norm": 603.9893188476562, "learning_rate": 0.00017684936885702755, "loss": 35.3798, "step": 11277 }, { "epoch": 29.78672829316606, "grad_norm": 351.1831359863281, "learning_rate": 0.00017680942176416823, "loss": 35.3259, "step": 11278 }, { "epoch": 29.789369428854407, "grad_norm": 790.1341552734375, "learning_rate": 0.00017676947671511282, "loss": 37.6575, "step": 11279 }, { "epoch": 29.792010564542753, "grad_norm": 807.6943359375, "learning_rate": 0.0001767295337109767, "loss": 38.2049, "step": 11280 }, { "epoch": 29.7946517002311, "grad_norm": 634.8447875976562, "learning_rate": 0.00017668959275287538, "loss": 39.8226, "step": 11281 }, { "epoch": 29.797292835919446, "grad_norm": 648.6904907226562, "learning_rate": 0.00017664965384192412, "loss": 41.1408, "step": 11282 }, { "epoch": 29.799933971607793, "grad_norm": 274.0166931152344, "learning_rate": 0.00017660971697923815, "loss": 39.8872, "step": 11283 }, { "epoch": 29.802575107296136, "grad_norm": 393.70001220703125, "learning_rate": 0.00017656978216593267, "loss": 39.7511, "step": 11284 }, { "epoch": 29.805216242984482, "grad_norm": 498.6604309082031, "learning_rate": 0.00017652984940312298, "loss": 41.5746, "step": 11285 }, { "epoch": 29.80785737867283, "grad_norm": 397.8293151855469, "learning_rate": 0.00017648991869192405, "loss": 41.4891, "step": 11286 }, { "epoch": 29.810498514361175, "grad_norm": 478.2894287109375, "learning_rate": 0.00017644999003345088, "loss": 43.4944, "step": 11287 }, { "epoch": 29.81313965004952, "grad_norm": 263.031494140625, "learning_rate": 0.00017641006342881862, "loss": 41.9076, "step": 11288 }, { "epoch": 29.815780785737868, "grad_norm": 218.76686096191406, "learning_rate": 0.0001763701388791422, "loss": 39.9673, "step": 11289 }, { "epoch": 29.818421921426214, "grad_norm": 425.27777099609375, "learning_rate": 0.00017633021638553632, "loss": 39.1261, "step": 11290 }, { "epoch": 29.82106305711456, "grad_norm": 419.4116516113281, "learning_rate": 0.00017629029594911575, "loss": 40.32, "step": 11291 }, { "epoch": 29.823704192802904, "grad_norm": 505.199951171875, "learning_rate": 0.0001762503775709955, "loss": 38.5115, "step": 11292 }, { "epoch": 29.82634532849125, "grad_norm": 215.59347534179688, "learning_rate": 0.0001762104612522901, "loss": 38.5747, "step": 11293 }, { "epoch": 29.828986464179597, "grad_norm": 479.6504211425781, "learning_rate": 0.0001761705469941142, "loss": 37.46, "step": 11294 }, { "epoch": 29.831627599867943, "grad_norm": 661.8500366210938, "learning_rate": 0.00017613063479758246, "loss": 35.4141, "step": 11295 }, { "epoch": 29.83426873555629, "grad_norm": 182.4176025390625, "learning_rate": 0.00017609072466380932, "loss": 36.876, "step": 11296 }, { "epoch": 29.836909871244636, "grad_norm": 387.9740295410156, "learning_rate": 0.0001760508165939093, "loss": 35.0932, "step": 11297 }, { "epoch": 29.839551006932982, "grad_norm": 193.91073608398438, "learning_rate": 0.00017601091058899672, "loss": 34.0554, "step": 11298 }, { "epoch": 29.84219214262133, "grad_norm": 600.7193603515625, "learning_rate": 0.00017597100665018607, "loss": 35.4566, "step": 11299 }, { "epoch": 29.84483327830967, "grad_norm": 912.7197265625, "learning_rate": 0.00017593110477859153, "loss": 33.7544, "step": 11300 }, { "epoch": 29.847474413998018, "grad_norm": 243.54844665527344, "learning_rate": 0.00017589120497532736, "loss": 33.7814, "step": 11301 }, { "epoch": 29.850115549686365, "grad_norm": 233.51280212402344, "learning_rate": 0.00017585130724150782, "loss": 37.0917, "step": 11302 }, { "epoch": 29.85275668537471, "grad_norm": 6566.83935546875, "learning_rate": 0.00017581141157824696, "loss": 30.8321, "step": 11303 }, { "epoch": 29.855397821063058, "grad_norm": 1551.1710205078125, "learning_rate": 0.00017577151798665885, "loss": 17.3061, "step": 11304 }, { "epoch": 29.858038956751404, "grad_norm": 665.1370849609375, "learning_rate": 0.0001757316264678574, "loss": 11.1301, "step": 11305 }, { "epoch": 29.86068009243975, "grad_norm": 931.5036010742188, "learning_rate": 0.0001756917370229567, "loss": 17.2089, "step": 11306 }, { "epoch": 29.863321228128093, "grad_norm": 496.8948059082031, "learning_rate": 0.00017565184965307075, "loss": 12.7913, "step": 11307 }, { "epoch": 29.86596236381644, "grad_norm": 1662.8665771484375, "learning_rate": 0.00017561196435931297, "loss": 14.8773, "step": 11308 }, { "epoch": 29.868603499504786, "grad_norm": 563.8680419921875, "learning_rate": 0.00017557208114279754, "loss": 16.7579, "step": 11309 }, { "epoch": 29.871244635193133, "grad_norm": 552.9932861328125, "learning_rate": 0.000175532200004638, "loss": 13.3893, "step": 11310 }, { "epoch": 29.87388577088148, "grad_norm": 2041.584716796875, "learning_rate": 0.00017549232094594806, "loss": 11.9864, "step": 11311 }, { "epoch": 29.876526906569826, "grad_norm": 6421.23583984375, "learning_rate": 0.0001754524439678412, "loss": 13.7588, "step": 11312 }, { "epoch": 29.879168042258172, "grad_norm": 872.9312133789062, "learning_rate": 0.00017541256907143113, "loss": 31.3813, "step": 11313 }, { "epoch": 29.88180917794652, "grad_norm": 556.2342529296875, "learning_rate": 0.00017537269625783126, "loss": 34.5317, "step": 11314 }, { "epoch": 29.88445031363486, "grad_norm": 862.8954467773438, "learning_rate": 0.00017533282552815494, "loss": 35.1143, "step": 11315 }, { "epoch": 29.887091449323208, "grad_norm": 490.9978332519531, "learning_rate": 0.00017529295688351566, "loss": 34.82, "step": 11316 }, { "epoch": 29.889732585011554, "grad_norm": 818.1834716796875, "learning_rate": 0.00017525309032502667, "loss": 34.5609, "step": 11317 }, { "epoch": 29.8923737206999, "grad_norm": 473.1070556640625, "learning_rate": 0.00017521322585380123, "loss": 34.3449, "step": 11318 }, { "epoch": 29.895014856388247, "grad_norm": 352.1924743652344, "learning_rate": 0.00017517336347095248, "loss": 35.9864, "step": 11319 }, { "epoch": 29.897655992076594, "grad_norm": 266.23248291015625, "learning_rate": 0.00017513350317759365, "loss": 33.9413, "step": 11320 }, { "epoch": 29.90029712776494, "grad_norm": 367.619140625, "learning_rate": 0.00017509364497483772, "loss": 35.2686, "step": 11321 }, { "epoch": 29.902938263453287, "grad_norm": 294.9030456542969, "learning_rate": 0.00017505378886379768, "loss": 34.0436, "step": 11322 }, { "epoch": 29.90557939914163, "grad_norm": 452.5902404785156, "learning_rate": 0.0001750139348455867, "loss": 35.2098, "step": 11323 }, { "epoch": 29.908220534829976, "grad_norm": 660.2965698242188, "learning_rate": 0.00017497408292131757, "loss": 35.3216, "step": 11324 }, { "epoch": 29.910861670518322, "grad_norm": 656.7147827148438, "learning_rate": 0.00017493423309210304, "loss": 34.9698, "step": 11325 }, { "epoch": 29.91350280620667, "grad_norm": 958.8786010742188, "learning_rate": 0.00017489438535905582, "loss": 35.5231, "step": 11326 }, { "epoch": 29.916143941895015, "grad_norm": 942.9075927734375, "learning_rate": 0.0001748545397232889, "loss": 35.2861, "step": 11327 }, { "epoch": 29.91878507758336, "grad_norm": 552.408447265625, "learning_rate": 0.00017481469618591482, "loss": 36.798, "step": 11328 }, { "epoch": 29.921426213271708, "grad_norm": 384.2944030761719, "learning_rate": 0.00017477485474804618, "loss": 35.0066, "step": 11329 }, { "epoch": 29.92406734896005, "grad_norm": 4813.92333984375, "learning_rate": 0.0001747350154107955, "loss": 40.1716, "step": 11330 }, { "epoch": 29.926708484648398, "grad_norm": 736.773681640625, "learning_rate": 0.00017469517817527536, "loss": 40.711, "step": 11331 }, { "epoch": 29.929349620336744, "grad_norm": 1288.153076171875, "learning_rate": 0.0001746553430425981, "loss": 38.9896, "step": 11332 }, { "epoch": 29.93199075602509, "grad_norm": 1104.7967529296875, "learning_rate": 0.0001746155100138761, "loss": 42.1899, "step": 11333 }, { "epoch": 29.934631891713437, "grad_norm": 196.50164794921875, "learning_rate": 0.0001745756790902217, "loss": 43.1823, "step": 11334 }, { "epoch": 29.937273027401783, "grad_norm": 246.3275909423828, "learning_rate": 0.0001745358502727472, "loss": 41.4996, "step": 11335 }, { "epoch": 29.93991416309013, "grad_norm": 391.1701965332031, "learning_rate": 0.00017449602356256472, "loss": 39.1242, "step": 11336 }, { "epoch": 29.942555298778476, "grad_norm": 377.4785461425781, "learning_rate": 0.00017445619896078643, "loss": 39.4714, "step": 11337 }, { "epoch": 29.94519643446682, "grad_norm": 578.518310546875, "learning_rate": 0.00017441637646852442, "loss": 36.8169, "step": 11338 }, { "epoch": 29.947837570155166, "grad_norm": 472.2310485839844, "learning_rate": 0.00017437655608689074, "loss": 36.3835, "step": 11339 }, { "epoch": 29.950478705843512, "grad_norm": 756.185546875, "learning_rate": 0.00017433673781699715, "loss": 36.0154, "step": 11340 }, { "epoch": 29.95311984153186, "grad_norm": 370.87530517578125, "learning_rate": 0.00017429692165995582, "loss": 35.1031, "step": 11341 }, { "epoch": 29.955760977220205, "grad_norm": 235.44456481933594, "learning_rate": 0.00017425710761687858, "loss": 35.3229, "step": 11342 }, { "epoch": 29.95840211290855, "grad_norm": 2295.069580078125, "learning_rate": 0.0001742172956888769, "loss": 43.4771, "step": 11343 }, { "epoch": 29.961043248596898, "grad_norm": 5740.76318359375, "learning_rate": 0.00017417748587706284, "loss": 10.6162, "step": 11344 }, { "epoch": 29.963684384285244, "grad_norm": 6261.93115234375, "learning_rate": 0.00017413767818254798, "loss": 14.3939, "step": 11345 }, { "epoch": 29.966325519973587, "grad_norm": 1760.0400390625, "learning_rate": 0.00017409787260644385, "loss": 15.9848, "step": 11346 }, { "epoch": 29.968966655661934, "grad_norm": 14904.9580078125, "learning_rate": 0.000174058069149862, "loss": 22.3653, "step": 11347 }, { "epoch": 29.97160779135028, "grad_norm": 4814.68017578125, "learning_rate": 0.00017401826781391402, "loss": 16.2136, "step": 11348 }, { "epoch": 29.974248927038627, "grad_norm": 924.5035400390625, "learning_rate": 0.0001739784685997113, "loss": 28.5066, "step": 11349 }, { "epoch": 29.976890062726973, "grad_norm": 1100.8502197265625, "learning_rate": 0.00017393867150836508, "loss": 34.6541, "step": 11350 }, { "epoch": 29.97953119841532, "grad_norm": 346.01849365234375, "learning_rate": 0.00017389887654098687, "loss": 33.962, "step": 11351 }, { "epoch": 29.982172334103666, "grad_norm": 306.5210266113281, "learning_rate": 0.00017385908369868782, "loss": 34.7084, "step": 11352 }, { "epoch": 29.98481346979201, "grad_norm": 474.1645202636719, "learning_rate": 0.00017381929298257914, "loss": 34.839, "step": 11353 }, { "epoch": 29.987454605480355, "grad_norm": 673.2625122070312, "learning_rate": 0.00017377950439377188, "loss": 35.3053, "step": 11354 }, { "epoch": 29.9900957411687, "grad_norm": 1185.547119140625, "learning_rate": 0.00017373971793337723, "loss": 35.9756, "step": 11355 }, { "epoch": 29.992736876857048, "grad_norm": 401.3995056152344, "learning_rate": 0.00017369993360250618, "loss": 34.851, "step": 11356 }, { "epoch": 29.995378012545395, "grad_norm": 576.1262817382812, "learning_rate": 0.00017366015140226956, "loss": 35.0246, "step": 11357 }, { "epoch": 29.99801914823374, "grad_norm": 774.2583618164062, "learning_rate": 0.00017362037133377849, "loss": 35.1499, "step": 11358 }, { "epoch": 30.000660283922088, "grad_norm": 468.7923889160156, "learning_rate": 0.0001735805933981437, "loss": 39.1938, "step": 11359 }, { "epoch": 30.003301419610434, "grad_norm": 281.9361267089844, "learning_rate": 0.0001735408175964759, "loss": 39.2717, "step": 11360 }, { "epoch": 30.005942555298777, "grad_norm": 245.3278350830078, "learning_rate": 0.00017350104392988575, "loss": 38.8156, "step": 11361 }, { "epoch": 30.008583690987123, "grad_norm": 289.6549377441406, "learning_rate": 0.00017346127239948413, "loss": 39.724, "step": 11362 }, { "epoch": 30.01122482667547, "grad_norm": 470.6976013183594, "learning_rate": 0.0001734215030063815, "loss": 41.187, "step": 11363 }, { "epoch": 30.013865962363816, "grad_norm": 285.9899597167969, "learning_rate": 0.00017338173575168837, "loss": 40.616, "step": 11364 }, { "epoch": 30.016507098052163, "grad_norm": 312.0750732421875, "learning_rate": 0.0001733419706365153, "loss": 41.51, "step": 11365 }, { "epoch": 30.01914823374051, "grad_norm": 309.03375244140625, "learning_rate": 0.00017330220766197268, "loss": 40.9674, "step": 11366 }, { "epoch": 30.021789369428856, "grad_norm": 876.3776245117188, "learning_rate": 0.00017326244682917087, "loss": 40.5863, "step": 11367 }, { "epoch": 30.024430505117202, "grad_norm": 286.2333984375, "learning_rate": 0.0001732226881392201, "loss": 40.7707, "step": 11368 }, { "epoch": 30.027071640805545, "grad_norm": 320.32501220703125, "learning_rate": 0.0001731829315932307, "loss": 38.8096, "step": 11369 }, { "epoch": 30.02971277649389, "grad_norm": 236.99717712402344, "learning_rate": 0.00017314317719231283, "loss": 37.2665, "step": 11370 }, { "epoch": 30.032353912182238, "grad_norm": 432.240234375, "learning_rate": 0.00017310342493757653, "loss": 36.9363, "step": 11371 }, { "epoch": 30.034995047870584, "grad_norm": 323.13031005859375, "learning_rate": 0.00017306367483013195, "loss": 35.8862, "step": 11372 }, { "epoch": 30.03763618355893, "grad_norm": 732.9210815429688, "learning_rate": 0.00017302392687108906, "loss": 35.6581, "step": 11373 }, { "epoch": 30.040277319247277, "grad_norm": 207.01461791992188, "learning_rate": 0.00017298418106155778, "loss": 34.9779, "step": 11374 }, { "epoch": 30.042918454935624, "grad_norm": 637.7095336914062, "learning_rate": 0.00017294443740264792, "loss": 35.4192, "step": 11375 }, { "epoch": 30.045559590623967, "grad_norm": 439.7964782714844, "learning_rate": 0.0001729046958954695, "loss": 36.3709, "step": 11376 }, { "epoch": 30.048200726312313, "grad_norm": 374.90740966796875, "learning_rate": 0.0001728649565411322, "loss": 35.6014, "step": 11377 }, { "epoch": 30.05084186200066, "grad_norm": 292.0330810546875, "learning_rate": 0.00017282521934074553, "loss": 33.5381, "step": 11378 }, { "epoch": 30.053482997689006, "grad_norm": 792.8583374023438, "learning_rate": 0.00017278548429541934, "loss": 36.1345, "step": 11379 }, { "epoch": 30.056124133377352, "grad_norm": 370.0148620605469, "learning_rate": 0.00017274575140626317, "loss": 37.1603, "step": 11380 }, { "epoch": 30.0587652690657, "grad_norm": 1531.18994140625, "learning_rate": 0.0001727060206743865, "loss": 40.7778, "step": 11381 }, { "epoch": 30.061406404754045, "grad_norm": 1856.85498046875, "learning_rate": 0.00017266629210089873, "loss": 23.8744, "step": 11382 }, { "epoch": 30.06404754044239, "grad_norm": 7221.546875, "learning_rate": 0.00017262656568690942, "loss": 26.0413, "step": 11383 }, { "epoch": 30.066688676130735, "grad_norm": 7188.03076171875, "learning_rate": 0.0001725868414335278, "loss": 21.2781, "step": 11384 }, { "epoch": 30.06932981181908, "grad_norm": 3793.075927734375, "learning_rate": 0.0001725471193418631, "loss": 22.5999, "step": 11385 }, { "epoch": 30.071970947507427, "grad_norm": 2025.93603515625, "learning_rate": 0.00017250739941302462, "loss": 18.5894, "step": 11386 }, { "epoch": 30.074612083195774, "grad_norm": 2633.207763671875, "learning_rate": 0.00017246768164812153, "loss": 13.3015, "step": 11387 }, { "epoch": 30.07725321888412, "grad_norm": 1046.54638671875, "learning_rate": 0.00017242796604826288, "loss": 17.7374, "step": 11388 }, { "epoch": 30.079894354572467, "grad_norm": 2772.398681640625, "learning_rate": 0.00017238825261455765, "loss": 15.2729, "step": 11389 }, { "epoch": 30.082535490260813, "grad_norm": 30088.49609375, "learning_rate": 0.0001723485413481149, "loss": 9.2195, "step": 11390 }, { "epoch": 30.08517662594916, "grad_norm": 2491.576904296875, "learning_rate": 0.00017230883225004357, "loss": 28.3957, "step": 11391 }, { "epoch": 30.087817761637503, "grad_norm": 511.7801513671875, "learning_rate": 0.0001722691253214523, "loss": 37.0063, "step": 11392 }, { "epoch": 30.09045889732585, "grad_norm": 305.2440185546875, "learning_rate": 0.00017222942056345018, "loss": 34.951, "step": 11393 }, { "epoch": 30.093100033014196, "grad_norm": 434.353759765625, "learning_rate": 0.00017218971797714586, "loss": 34.8535, "step": 11394 }, { "epoch": 30.095741168702542, "grad_norm": 447.9736328125, "learning_rate": 0.00017215001756364778, "loss": 36.8247, "step": 11395 }, { "epoch": 30.09838230439089, "grad_norm": 231.62957763671875, "learning_rate": 0.00017211031932406483, "loss": 35.0532, "step": 11396 }, { "epoch": 30.101023440079235, "grad_norm": 649.1861572265625, "learning_rate": 0.0001720706232595055, "loss": 34.6948, "step": 11397 }, { "epoch": 30.10366457576758, "grad_norm": 316.7367248535156, "learning_rate": 0.00017203092937107821, "loss": 34.0427, "step": 11398 }, { "epoch": 30.106305711455924, "grad_norm": 366.01104736328125, "learning_rate": 0.00017199123765989136, "loss": 34.4725, "step": 11399 }, { "epoch": 30.10894684714427, "grad_norm": 328.1031188964844, "learning_rate": 0.00017195154812705342, "loss": 34.4366, "step": 11400 }, { "epoch": 30.10894684714427, "eval_loss": 3.7928712368011475, "eval_runtime": 2.1137, "eval_samples_per_second": 234.183, "eval_steps_per_second": 29.332, "step": 11400 }, { "epoch": 30.111587982832617, "grad_norm": 627.2921752929688, "learning_rate": 0.00017191186077367265, "loss": 34.464, "step": 11401 }, { "epoch": 30.114229118520964, "grad_norm": 265.007080078125, "learning_rate": 0.00017187217560085728, "loss": 35.1869, "step": 11402 }, { "epoch": 30.11687025420931, "grad_norm": 410.70428466796875, "learning_rate": 0.00017183249260971554, "loss": 34.5683, "step": 11403 }, { "epoch": 30.119511389897657, "grad_norm": 947.199462890625, "learning_rate": 0.00017179281180135553, "loss": 34.1409, "step": 11404 }, { "epoch": 30.122152525586003, "grad_norm": 970.2008666992188, "learning_rate": 0.00017175313317688528, "loss": 33.9108, "step": 11405 }, { "epoch": 30.12479366127435, "grad_norm": 1189.900146484375, "learning_rate": 0.00017171345673741277, "loss": 33.4925, "step": 11406 }, { "epoch": 30.127434796962692, "grad_norm": 667.5577392578125, "learning_rate": 0.0001716737824840461, "loss": 37.0314, "step": 11407 }, { "epoch": 30.13007593265104, "grad_norm": 549.9835815429688, "learning_rate": 0.000171634110417893, "loss": 37.489, "step": 11408 }, { "epoch": 30.132717068339385, "grad_norm": 1096.1231689453125, "learning_rate": 0.00017159444054006126, "loss": 41.2918, "step": 11409 }, { "epoch": 30.13535820402773, "grad_norm": 127.00936889648438, "learning_rate": 0.0001715547728516588, "loss": 39.1651, "step": 11410 }, { "epoch": 30.137999339716078, "grad_norm": 347.1514587402344, "learning_rate": 0.00017151510735379321, "loss": 39.6801, "step": 11411 }, { "epoch": 30.140640475404425, "grad_norm": 297.9109802246094, "learning_rate": 0.0001714754440475722, "loss": 38.8246, "step": 11412 }, { "epoch": 30.14328161109277, "grad_norm": 220.07594299316406, "learning_rate": 0.00017143578293410313, "loss": 41.8898, "step": 11413 }, { "epoch": 30.145922746781117, "grad_norm": 557.9159545898438, "learning_rate": 0.00017139612401449378, "loss": 39.9808, "step": 11414 }, { "epoch": 30.14856388246946, "grad_norm": 392.2851867675781, "learning_rate": 0.00017135646728985148, "loss": 40.5697, "step": 11415 }, { "epoch": 30.151205018157807, "grad_norm": 371.04986572265625, "learning_rate": 0.00017131681276128365, "loss": 42.9593, "step": 11416 }, { "epoch": 30.153846153846153, "grad_norm": 235.40423583984375, "learning_rate": 0.00017127716042989754, "loss": 40.4108, "step": 11417 }, { "epoch": 30.1564872895345, "grad_norm": 267.9524841308594, "learning_rate": 0.00017123751029680056, "loss": 40.514, "step": 11418 }, { "epoch": 30.159128425222846, "grad_norm": 212.7513427734375, "learning_rate": 0.00017119786236309981, "loss": 39.6806, "step": 11419 }, { "epoch": 30.161769560911193, "grad_norm": 330.61517333984375, "learning_rate": 0.00017115821662990244, "loss": 39.0052, "step": 11420 }, { "epoch": 30.16441069659954, "grad_norm": 389.3532409667969, "learning_rate": 0.00017111857309831557, "loss": 38.5242, "step": 11421 }, { "epoch": 30.167051832287882, "grad_norm": 370.5462951660156, "learning_rate": 0.00017107893176944623, "loss": 38.1202, "step": 11422 }, { "epoch": 30.16969296797623, "grad_norm": 269.1800537109375, "learning_rate": 0.00017103929264440128, "loss": 34.8742, "step": 11423 }, { "epoch": 30.172334103664575, "grad_norm": 745.25439453125, "learning_rate": 0.0001709996557242878, "loss": 35.601, "step": 11424 }, { "epoch": 30.17497523935292, "grad_norm": 456.5425109863281, "learning_rate": 0.00017096002101021248, "loss": 35.9029, "step": 11425 }, { "epoch": 30.177616375041268, "grad_norm": 317.6254577636719, "learning_rate": 0.00017092038850328217, "loss": 34.9477, "step": 11426 }, { "epoch": 30.180257510729614, "grad_norm": 493.05133056640625, "learning_rate": 0.00017088075820460348, "loss": 34.0802, "step": 11427 }, { "epoch": 30.18289864641796, "grad_norm": 543.732666015625, "learning_rate": 0.00017084113011528319, "loss": 35.5197, "step": 11428 }, { "epoch": 30.185539782106307, "grad_norm": 328.9358215332031, "learning_rate": 0.00017080150423642791, "loss": 34.4876, "step": 11429 }, { "epoch": 30.18818091779465, "grad_norm": 222.70970153808594, "learning_rate": 0.00017076188056914393, "loss": 36.1714, "step": 11430 }, { "epoch": 30.190822053482997, "grad_norm": 1395.6513671875, "learning_rate": 0.00017072225911453804, "loss": 44.4182, "step": 11431 }, { "epoch": 30.193463189171343, "grad_norm": 1485.7703857421875, "learning_rate": 0.00017068263987371645, "loss": 16.3975, "step": 11432 }, { "epoch": 30.19610432485969, "grad_norm": 1512.2265625, "learning_rate": 0.00017064302284778554, "loss": 22.547, "step": 11433 }, { "epoch": 30.198745460548036, "grad_norm": 679.3670043945312, "learning_rate": 0.00017060340803785157, "loss": 17.8556, "step": 11434 }, { "epoch": 30.201386596236382, "grad_norm": 1227.019775390625, "learning_rate": 0.00017056379544502085, "loss": 13.039, "step": 11435 }, { "epoch": 30.20402773192473, "grad_norm": 1130.1763916015625, "learning_rate": 0.00017052418507039946, "loss": 14.4873, "step": 11436 }, { "epoch": 30.206668867613075, "grad_norm": 1320.64111328125, "learning_rate": 0.0001704845769150935, "loss": 11.5842, "step": 11437 }, { "epoch": 30.209310003301418, "grad_norm": 2645.305419921875, "learning_rate": 0.00017044497098020903, "loss": 9.9437, "step": 11438 }, { "epoch": 30.211951138989765, "grad_norm": 7383.6787109375, "learning_rate": 0.00017040536726685201, "loss": 14.2962, "step": 11439 }, { "epoch": 30.21459227467811, "grad_norm": 902.7761840820312, "learning_rate": 0.0001703657657761284, "loss": 12.0788, "step": 11440 }, { "epoch": 30.217233410366457, "grad_norm": 2377.590576171875, "learning_rate": 0.00017032616650914384, "loss": 18.284, "step": 11441 }, { "epoch": 30.219874546054804, "grad_norm": 821.3089599609375, "learning_rate": 0.0001702865694670045, "loss": 34.8352, "step": 11442 }, { "epoch": 30.22251568174315, "grad_norm": 365.5699768066406, "learning_rate": 0.0001702469746508158, "loss": 34.6305, "step": 11443 }, { "epoch": 30.225156817431497, "grad_norm": 831.7108764648438, "learning_rate": 0.00017020738206168333, "loss": 35.5421, "step": 11444 }, { "epoch": 30.22779795311984, "grad_norm": 339.35693359375, "learning_rate": 0.000170167791700713, "loss": 34.361, "step": 11445 }, { "epoch": 30.230439088808186, "grad_norm": 350.9980163574219, "learning_rate": 0.00017012820356901014, "loss": 34.7413, "step": 11446 }, { "epoch": 30.233080224496533, "grad_norm": 289.9776306152344, "learning_rate": 0.00017008861766768042, "loss": 35.236, "step": 11447 }, { "epoch": 30.23572136018488, "grad_norm": 486.0855407714844, "learning_rate": 0.00017004903399782888, "loss": 35.1494, "step": 11448 }, { "epoch": 30.238362495873226, "grad_norm": 360.3231506347656, "learning_rate": 0.00017000945256056123, "loss": 34.622, "step": 11449 }, { "epoch": 30.241003631561572, "grad_norm": 416.3341369628906, "learning_rate": 0.0001699698733569826, "loss": 34.4024, "step": 11450 }, { "epoch": 30.24364476724992, "grad_norm": 490.7485046386719, "learning_rate": 0.00016993029638819824, "loss": 34.5229, "step": 11451 }, { "epoch": 30.246285902938265, "grad_norm": 615.8084716796875, "learning_rate": 0.00016989072165531332, "loss": 34.2111, "step": 11452 }, { "epoch": 30.248927038626608, "grad_norm": 566.8674926757812, "learning_rate": 0.00016985114915943296, "loss": 34.9288, "step": 11453 }, { "epoch": 30.251568174314954, "grad_norm": 626.5137329101562, "learning_rate": 0.0001698115789016622, "loss": 34.8649, "step": 11454 }, { "epoch": 30.2542093100033, "grad_norm": 487.74664306640625, "learning_rate": 0.00016977201088310589, "loss": 34.1972, "step": 11455 }, { "epoch": 30.256850445691647, "grad_norm": 349.7456970214844, "learning_rate": 0.00016973244510486916, "loss": 35.209, "step": 11456 }, { "epoch": 30.259491581379994, "grad_norm": 1140.1490478515625, "learning_rate": 0.00016969288156805671, "loss": 37.6279, "step": 11457 }, { "epoch": 30.26213271706834, "grad_norm": 963.7529907226562, "learning_rate": 0.0001696533202737733, "loss": 36.491, "step": 11458 }, { "epoch": 30.264773852756687, "grad_norm": 498.6017761230469, "learning_rate": 0.00016961376122312383, "loss": 40.1678, "step": 11459 }, { "epoch": 30.267414988445033, "grad_norm": 257.0113830566406, "learning_rate": 0.00016957420441721284, "loss": 38.7915, "step": 11460 }, { "epoch": 30.270056124133376, "grad_norm": 528.6461181640625, "learning_rate": 0.00016953464985714492, "loss": 40.6763, "step": 11461 }, { "epoch": 30.272697259821722, "grad_norm": 503.45306396484375, "learning_rate": 0.00016949509754402454, "loss": 39.2966, "step": 11462 }, { "epoch": 30.27533839551007, "grad_norm": 555.5396118164062, "learning_rate": 0.00016945554747895637, "loss": 40.5351, "step": 11463 }, { "epoch": 30.277979531198415, "grad_norm": 610.9622192382812, "learning_rate": 0.0001694159996630448, "loss": 44.1927, "step": 11464 }, { "epoch": 30.28062066688676, "grad_norm": 443.22113037109375, "learning_rate": 0.00016937645409739394, "loss": 42.9045, "step": 11465 }, { "epoch": 30.283261802575108, "grad_norm": 328.3077392578125, "learning_rate": 0.00016933691078310832, "loss": 41.2327, "step": 11466 }, { "epoch": 30.285902938263455, "grad_norm": 238.45204162597656, "learning_rate": 0.00016929736972129208, "loss": 39.8305, "step": 11467 }, { "epoch": 30.288544073951797, "grad_norm": 295.5035095214844, "learning_rate": 0.0001692578309130494, "loss": 38.5759, "step": 11468 }, { "epoch": 30.291185209640144, "grad_norm": 781.6595458984375, "learning_rate": 0.00016921829435948433, "loss": 40.8629, "step": 11469 }, { "epoch": 30.29382634532849, "grad_norm": 274.9848937988281, "learning_rate": 0.00016917876006170097, "loss": 40.0284, "step": 11470 }, { "epoch": 30.296467481016837, "grad_norm": 375.41375732421875, "learning_rate": 0.00016913922802080323, "loss": 37.1011, "step": 11471 }, { "epoch": 30.299108616705183, "grad_norm": 1121.43017578125, "learning_rate": 0.000169099698237895, "loss": 37.3135, "step": 11472 }, { "epoch": 30.30174975239353, "grad_norm": 680.1224365234375, "learning_rate": 0.00016906017071408025, "loss": 36.0475, "step": 11473 }, { "epoch": 30.304390888081876, "grad_norm": 811.4808349609375, "learning_rate": 0.0001690206454504627, "loss": 36.5145, "step": 11474 }, { "epoch": 30.307032023770223, "grad_norm": 253.57044982910156, "learning_rate": 0.000168981122448146, "loss": 36.0923, "step": 11475 }, { "epoch": 30.309673159458566, "grad_norm": 846.1730346679688, "learning_rate": 0.0001689416017082338, "loss": 34.7789, "step": 11476 }, { "epoch": 30.312314295146912, "grad_norm": 576.730712890625, "learning_rate": 0.00016890208323182992, "loss": 35.8423, "step": 11477 }, { "epoch": 30.31495543083526, "grad_norm": 431.3196105957031, "learning_rate": 0.00016886256702003765, "loss": 34.995, "step": 11478 }, { "epoch": 30.317596566523605, "grad_norm": 295.10626220703125, "learning_rate": 0.00016882305307396045, "loss": 35.2851, "step": 11479 }, { "epoch": 30.32023770221195, "grad_norm": 646.252685546875, "learning_rate": 0.00016878354139470192, "loss": 35.4667, "step": 11480 }, { "epoch": 30.322878837900298, "grad_norm": 1563.5816650390625, "learning_rate": 0.00016874403198336525, "loss": 38.75, "step": 11481 }, { "epoch": 30.325519973588644, "grad_norm": 4739.9228515625, "learning_rate": 0.0001687045248410538, "loss": 20.4661, "step": 11482 }, { "epoch": 30.32816110927699, "grad_norm": 5336.1962890625, "learning_rate": 0.00016866501996887063, "loss": 10.2321, "step": 11483 }, { "epoch": 30.330802244965334, "grad_norm": 722.6278076171875, "learning_rate": 0.0001686255173679191, "loss": 11.027, "step": 11484 }, { "epoch": 30.33344338065368, "grad_norm": 3673.450439453125, "learning_rate": 0.00016858601703930215, "loss": 12.7509, "step": 11485 }, { "epoch": 30.336084516342027, "grad_norm": 1543.7752685546875, "learning_rate": 0.00016854651898412282, "loss": 18.9862, "step": 11486 }, { "epoch": 30.338725652030373, "grad_norm": 1208.9571533203125, "learning_rate": 0.00016850702320348414, "loss": 11.0355, "step": 11487 }, { "epoch": 30.34136678771872, "grad_norm": 2376.758056640625, "learning_rate": 0.000168467529698489, "loss": 15.9102, "step": 11488 }, { "epoch": 30.344007923407066, "grad_norm": 1878.2330322265625, "learning_rate": 0.00016842803847024018, "loss": 13.8723, "step": 11489 }, { "epoch": 30.346649059095412, "grad_norm": 4768.01904296875, "learning_rate": 0.00016838854951984039, "loss": 14.8379, "step": 11490 }, { "epoch": 30.349290194783755, "grad_norm": 1836.6041259765625, "learning_rate": 0.00016834906284839252, "loss": 26.5664, "step": 11491 }, { "epoch": 30.3519313304721, "grad_norm": 305.9479064941406, "learning_rate": 0.00016830957845699907, "loss": 36.0559, "step": 11492 }, { "epoch": 30.354572466160448, "grad_norm": 894.59814453125, "learning_rate": 0.00016827009634676256, "loss": 35.0459, "step": 11493 }, { "epoch": 30.357213601848795, "grad_norm": 498.1817321777344, "learning_rate": 0.00016823061651878574, "loss": 35.0271, "step": 11494 }, { "epoch": 30.35985473753714, "grad_norm": 735.5601196289062, "learning_rate": 0.00016819113897417087, "loss": 34.3405, "step": 11495 }, { "epoch": 30.362495873225487, "grad_norm": 541.552001953125, "learning_rate": 0.0001681516637140204, "loss": 34.9866, "step": 11496 }, { "epoch": 30.365137008913834, "grad_norm": 1416.71728515625, "learning_rate": 0.0001681121907394365, "loss": 36.2156, "step": 11497 }, { "epoch": 30.36777814460218, "grad_norm": 264.9786071777344, "learning_rate": 0.00016807272005152173, "loss": 33.9995, "step": 11498 }, { "epoch": 30.370419280290523, "grad_norm": 293.0167236328125, "learning_rate": 0.00016803325165137814, "loss": 34.0804, "step": 11499 }, { "epoch": 30.37306041597887, "grad_norm": 574.3018798828125, "learning_rate": 0.00016799378554010772, "loss": 34.4134, "step": 11500 }, { "epoch": 30.375701551667216, "grad_norm": 926.72314453125, "learning_rate": 0.00016795432171881275, "loss": 34.1538, "step": 11501 }, { "epoch": 30.378342687355563, "grad_norm": 525.5244140625, "learning_rate": 0.00016791486018859515, "loss": 35.37, "step": 11502 }, { "epoch": 30.38098382304391, "grad_norm": 380.0943603515625, "learning_rate": 0.00016787540095055686, "loss": 34.5248, "step": 11503 }, { "epoch": 30.383624958732256, "grad_norm": 416.14581298828125, "learning_rate": 0.0001678359440057997, "loss": 34.8505, "step": 11504 }, { "epoch": 30.386266094420602, "grad_norm": 388.4526672363281, "learning_rate": 0.0001677964893554256, "loss": 34.7854, "step": 11505 }, { "epoch": 30.38890723010895, "grad_norm": 789.7333984375, "learning_rate": 0.00016775703700053625, "loss": 34.7541, "step": 11506 }, { "epoch": 30.39154836579729, "grad_norm": 1550.9149169921875, "learning_rate": 0.00016771758694223327, "loss": 36.4687, "step": 11507 }, { "epoch": 30.394189501485638, "grad_norm": 407.7308654785156, "learning_rate": 0.0001676781391816184, "loss": 37.6182, "step": 11508 }, { "epoch": 30.396830637173984, "grad_norm": 601.628173828125, "learning_rate": 0.00016763869371979313, "loss": 41.6587, "step": 11509 }, { "epoch": 30.39947177286233, "grad_norm": 493.507568359375, "learning_rate": 0.00016759925055785893, "loss": 38.594, "step": 11510 }, { "epoch": 30.402112908550677, "grad_norm": 921.8613891601562, "learning_rate": 0.0001675598096969172, "loss": 39.3426, "step": 11511 }, { "epoch": 30.404754044239024, "grad_norm": 729.8378295898438, "learning_rate": 0.00016752037113806944, "loss": 38.8359, "step": 11512 }, { "epoch": 30.40739517992737, "grad_norm": 408.810546875, "learning_rate": 0.00016748093488241689, "loss": 38.8288, "step": 11513 }, { "epoch": 30.410036315615713, "grad_norm": 665.522705078125, "learning_rate": 0.00016744150093106063, "loss": 40.6915, "step": 11514 }, { "epoch": 30.41267745130406, "grad_norm": 313.14605712890625, "learning_rate": 0.00016740206928510205, "loss": 40.0656, "step": 11515 }, { "epoch": 30.415318586992406, "grad_norm": 685.8394165039062, "learning_rate": 0.00016736263994564216, "loss": 43.7831, "step": 11516 }, { "epoch": 30.417959722680752, "grad_norm": 281.4272766113281, "learning_rate": 0.00016732321291378205, "loss": 39.7373, "step": 11517 }, { "epoch": 30.4206008583691, "grad_norm": 697.8084716796875, "learning_rate": 0.0001672837881906225, "loss": 40.0195, "step": 11518 }, { "epoch": 30.423241994057445, "grad_norm": 356.1116943359375, "learning_rate": 0.00016724436577726472, "loss": 39.4228, "step": 11519 }, { "epoch": 30.42588312974579, "grad_norm": 3260.76513671875, "learning_rate": 0.00016720494567480938, "loss": 37.2629, "step": 11520 }, { "epoch": 30.428524265434138, "grad_norm": 391.2727355957031, "learning_rate": 0.00016716552788435723, "loss": 36.9639, "step": 11521 }, { "epoch": 30.43116540112248, "grad_norm": 493.7130126953125, "learning_rate": 0.00016712611240700916, "loss": 36.342, "step": 11522 }, { "epoch": 30.433806536810827, "grad_norm": 408.348876953125, "learning_rate": 0.00016708669924386566, "loss": 36.4267, "step": 11523 }, { "epoch": 30.436447672499174, "grad_norm": 1370.7724609375, "learning_rate": 0.00016704728839602733, "loss": 35.1397, "step": 11524 }, { "epoch": 30.43908880818752, "grad_norm": 369.44366455078125, "learning_rate": 0.00016700787986459483, "loss": 35.9948, "step": 11525 }, { "epoch": 30.441729943875867, "grad_norm": 641.6045532226562, "learning_rate": 0.00016696847365066852, "loss": 34.6661, "step": 11526 }, { "epoch": 30.444371079564213, "grad_norm": 435.7768249511719, "learning_rate": 0.00016692906975534884, "loss": 35.7018, "step": 11527 }, { "epoch": 30.44701221525256, "grad_norm": 514.79345703125, "learning_rate": 0.00016688966817973591, "loss": 34.7069, "step": 11528 }, { "epoch": 30.449653350940906, "grad_norm": 545.4210815429688, "learning_rate": 0.00016685026892493038, "loss": 35.2196, "step": 11529 }, { "epoch": 30.45229448662925, "grad_norm": 372.5634765625, "learning_rate": 0.0001668108719920322, "loss": 35.5757, "step": 11530 }, { "epoch": 30.454935622317596, "grad_norm": 496.7613220214844, "learning_rate": 0.0001667714773821414, "loss": 38.2094, "step": 11531 }, { "epoch": 30.457576758005942, "grad_norm": 1484.82373046875, "learning_rate": 0.00016673208509635835, "loss": 27.5441, "step": 11532 }, { "epoch": 30.46021789369429, "grad_norm": 1011.45849609375, "learning_rate": 0.0001666926951357829, "loss": 14.6593, "step": 11533 }, { "epoch": 30.462859029382635, "grad_norm": 2455.010986328125, "learning_rate": 0.00016665330750151498, "loss": 14.6842, "step": 11534 }, { "epoch": 30.46550016507098, "grad_norm": 502.1005859375, "learning_rate": 0.00016661392219465446, "loss": 13.6394, "step": 11535 }, { "epoch": 30.468141300759328, "grad_norm": 1870.25830078125, "learning_rate": 0.00016657453921630123, "loss": 15.5999, "step": 11536 }, { "epoch": 30.47078243644767, "grad_norm": 2554.033203125, "learning_rate": 0.00016653515856755496, "loss": 15.2791, "step": 11537 }, { "epoch": 30.473423572136017, "grad_norm": 1330.7794189453125, "learning_rate": 0.00016649578024951534, "loss": 10.8596, "step": 11538 }, { "epoch": 30.476064707824364, "grad_norm": 4768.8798828125, "learning_rate": 0.00016645640426328205, "loss": 10.902, "step": 11539 }, { "epoch": 30.47870584351271, "grad_norm": 2586.045166015625, "learning_rate": 0.00016641703060995457, "loss": 15.716, "step": 11540 }, { "epoch": 30.481346979201057, "grad_norm": 1138.5902099609375, "learning_rate": 0.00016637765929063242, "loss": 33.7343, "step": 11541 }, { "epoch": 30.483988114889403, "grad_norm": 1023.2827758789062, "learning_rate": 0.00016633829030641498, "loss": 35.3613, "step": 11542 }, { "epoch": 30.48662925057775, "grad_norm": 569.0360107421875, "learning_rate": 0.00016629892365840166, "loss": 35.5021, "step": 11543 }, { "epoch": 30.489270386266096, "grad_norm": 282.30224609375, "learning_rate": 0.00016625955934769176, "loss": 34.4989, "step": 11544 }, { "epoch": 30.49191152195444, "grad_norm": 327.85272216796875, "learning_rate": 0.00016622019737538436, "loss": 33.6858, "step": 11545 }, { "epoch": 30.494552657642785, "grad_norm": 624.916015625, "learning_rate": 0.0001661808377425788, "loss": 35.6387, "step": 11546 }, { "epoch": 30.49719379333113, "grad_norm": 374.0199279785156, "learning_rate": 0.0001661414804503742, "loss": 35.0603, "step": 11547 }, { "epoch": 30.499834929019478, "grad_norm": 1100.204833984375, "learning_rate": 0.00016610212549986945, "loss": 35.8162, "step": 11548 }, { "epoch": 30.502476064707825, "grad_norm": 658.1936645507812, "learning_rate": 0.00016606277289216343, "loss": 35.1267, "step": 11549 }, { "epoch": 30.50511720039617, "grad_norm": 542.8171997070312, "learning_rate": 0.00016602342262835528, "loss": 34.8321, "step": 11550 }, { "epoch": 30.507758336084517, "grad_norm": 415.88726806640625, "learning_rate": 0.00016598407470954374, "loss": 35.6948, "step": 11551 }, { "epoch": 30.510399471772864, "grad_norm": 1673.2119140625, "learning_rate": 0.0001659447291368275, "loss": 34.0689, "step": 11552 }, { "epoch": 30.513040607461207, "grad_norm": 934.7672119140625, "learning_rate": 0.00016590538591130537, "loss": 34.8041, "step": 11553 }, { "epoch": 30.515681743149553, "grad_norm": 1126.2723388671875, "learning_rate": 0.00016586604503407594, "loss": 34.1607, "step": 11554 }, { "epoch": 30.5183228788379, "grad_norm": 697.0814819335938, "learning_rate": 0.00016582670650623776, "loss": 35.4078, "step": 11555 }, { "epoch": 30.520964014526246, "grad_norm": 296.7439880371094, "learning_rate": 0.00016578737032888936, "loss": 34.8459, "step": 11556 }, { "epoch": 30.523605150214593, "grad_norm": 1014.8530883789062, "learning_rate": 0.00016574803650312917, "loss": 34.9669, "step": 11557 }, { "epoch": 30.52624628590294, "grad_norm": 1763.4290771484375, "learning_rate": 0.00016570870503005558, "loss": 38.4372, "step": 11558 }, { "epoch": 30.528887421591286, "grad_norm": 1094.007568359375, "learning_rate": 0.00016566937591076686, "loss": 39.8408, "step": 11559 }, { "epoch": 30.53152855727963, "grad_norm": 624.329345703125, "learning_rate": 0.00016563004914636132, "loss": 38.12, "step": 11560 }, { "epoch": 30.534169692967975, "grad_norm": 435.0545654296875, "learning_rate": 0.00016559072473793708, "loss": 39.6091, "step": 11561 }, { "epoch": 30.53681082865632, "grad_norm": 308.74371337890625, "learning_rate": 0.00016555140268659224, "loss": 40.581, "step": 11562 }, { "epoch": 30.539451964344668, "grad_norm": 381.9983215332031, "learning_rate": 0.00016551208299342483, "loss": 39.2747, "step": 11563 }, { "epoch": 30.542093100033014, "grad_norm": 406.6407775878906, "learning_rate": 0.000165472765659533, "loss": 45.3278, "step": 11564 }, { "epoch": 30.54473423572136, "grad_norm": 667.2078247070312, "learning_rate": 0.00016543345068601448, "loss": 42.6953, "step": 11565 }, { "epoch": 30.547375371409707, "grad_norm": 349.48760986328125, "learning_rate": 0.00016539413807396708, "loss": 42.5029, "step": 11566 }, { "epoch": 30.550016507098054, "grad_norm": 627.1104125976562, "learning_rate": 0.00016535482782448877, "loss": 40.8294, "step": 11567 }, { "epoch": 30.552657642786397, "grad_norm": 455.7877197265625, "learning_rate": 0.00016531551993867715, "loss": 38.736, "step": 11568 }, { "epoch": 30.555298778474743, "grad_norm": 433.05853271484375, "learning_rate": 0.0001652762144176299, "loss": 40.7042, "step": 11569 }, { "epoch": 30.55793991416309, "grad_norm": 494.03082275390625, "learning_rate": 0.00016523691126244455, "loss": 36.9385, "step": 11570 }, { "epoch": 30.560581049851436, "grad_norm": 520.8511352539062, "learning_rate": 0.00016519761047421874, "loss": 36.2508, "step": 11571 }, { "epoch": 30.563222185539782, "grad_norm": 639.8685302734375, "learning_rate": 0.0001651583120540498, "loss": 36.6974, "step": 11572 }, { "epoch": 30.56586332122813, "grad_norm": 773.4994506835938, "learning_rate": 0.00016511901600303514, "loss": 35.5995, "step": 11573 }, { "epoch": 30.568504456916475, "grad_norm": 442.5214538574219, "learning_rate": 0.00016507972232227214, "loss": 35.8063, "step": 11574 }, { "epoch": 30.57114559260482, "grad_norm": 315.40740966796875, "learning_rate": 0.00016504043101285798, "loss": 35.5952, "step": 11575 }, { "epoch": 30.573786728293165, "grad_norm": 291.0187683105469, "learning_rate": 0.00016500114207588989, "loss": 35.3196, "step": 11576 }, { "epoch": 30.57642786398151, "grad_norm": 624.9857788085938, "learning_rate": 0.00016496185551246494, "loss": 35.562, "step": 11577 }, { "epoch": 30.579068999669857, "grad_norm": 482.9898376464844, "learning_rate": 0.00016492257132368028, "loss": 34.6131, "step": 11578 }, { "epoch": 30.581710135358204, "grad_norm": 563.9775390625, "learning_rate": 0.0001648832895106328, "loss": 35.6886, "step": 11579 }, { "epoch": 30.58435127104655, "grad_norm": 747.7648315429688, "learning_rate": 0.00016484401007441936, "loss": 35.9698, "step": 11580 }, { "epoch": 30.586992406734897, "grad_norm": 816.7426147460938, "learning_rate": 0.00016480473301613705, "loss": 38.683, "step": 11581 }, { "epoch": 30.589633542423243, "grad_norm": 6732.8173828125, "learning_rate": 0.00016476545833688257, "loss": 15.6289, "step": 11582 }, { "epoch": 30.592274678111586, "grad_norm": 6085.97900390625, "learning_rate": 0.00016472618603775253, "loss": 17.4371, "step": 11583 }, { "epoch": 30.594915813799933, "grad_norm": 4095.82958984375, "learning_rate": 0.00016468691611984355, "loss": 20.497, "step": 11584 }, { "epoch": 30.59755694948828, "grad_norm": 5697.0751953125, "learning_rate": 0.00016464764858425244, "loss": 12.6906, "step": 11585 }, { "epoch": 30.600198085176626, "grad_norm": 31653.115234375, "learning_rate": 0.00016460838343207557, "loss": 17.8163, "step": 11586 }, { "epoch": 30.602839220864972, "grad_norm": 3673.533935546875, "learning_rate": 0.00016456912066440944, "loss": 13.8477, "step": 11587 }, { "epoch": 30.60548035655332, "grad_norm": 1635.3408203125, "learning_rate": 0.0001645298602823504, "loss": 14.9868, "step": 11588 }, { "epoch": 30.608121492241665, "grad_norm": 1321.809326171875, "learning_rate": 0.00016449060228699487, "loss": 20.1814, "step": 11589 }, { "epoch": 30.61076262793001, "grad_norm": 8371.7568359375, "learning_rate": 0.000164451346679439, "loss": 19.1214, "step": 11590 }, { "epoch": 30.613403763618354, "grad_norm": 349.0552062988281, "learning_rate": 0.000164412093460779, "loss": 35.8173, "step": 11591 }, { "epoch": 30.6160448993067, "grad_norm": 798.7359008789062, "learning_rate": 0.00016437284263211106, "loss": 35.6859, "step": 11592 }, { "epoch": 30.618686034995047, "grad_norm": 593.1331787109375, "learning_rate": 0.0001643335941945312, "loss": 32.9578, "step": 11593 }, { "epoch": 30.621327170683394, "grad_norm": 318.50067138671875, "learning_rate": 0.0001642943481491353, "loss": 34.6035, "step": 11594 }, { "epoch": 30.62396830637174, "grad_norm": 310.9840087890625, "learning_rate": 0.00016425510449701947, "loss": 34.2242, "step": 11595 }, { "epoch": 30.626609442060087, "grad_norm": 398.62890625, "learning_rate": 0.00016421586323927948, "loss": 33.8502, "step": 11596 }, { "epoch": 30.629250577748433, "grad_norm": 1443.455810546875, "learning_rate": 0.00016417662437701112, "loss": 34.5932, "step": 11597 }, { "epoch": 30.63189171343678, "grad_norm": 636.38671875, "learning_rate": 0.00016413738791131, "loss": 35.2063, "step": 11598 }, { "epoch": 30.634532849125122, "grad_norm": 239.59730529785156, "learning_rate": 0.000164098153843272, "loss": 34.3865, "step": 11599 }, { "epoch": 30.63717398481347, "grad_norm": 599.3663330078125, "learning_rate": 0.0001640589221739926, "loss": 34.2567, "step": 11600 }, { "epoch": 30.63717398481347, "eval_loss": 3.816429376602173, "eval_runtime": 2.0531, "eval_samples_per_second": 241.094, "eval_steps_per_second": 30.198, "step": 11600 }, { "epoch": 30.639815120501815, "grad_norm": 445.37738037109375, "learning_rate": 0.00016401969290456717, "loss": 35.9853, "step": 11601 }, { "epoch": 30.64245625619016, "grad_norm": 692.1782836914062, "learning_rate": 0.0001639804660360914, "loss": 35.2808, "step": 11602 }, { "epoch": 30.645097391878508, "grad_norm": 672.3690795898438, "learning_rate": 0.00016394124156966062, "loss": 34.4691, "step": 11603 }, { "epoch": 30.647738527566855, "grad_norm": 351.3919982910156, "learning_rate": 0.00016390201950637013, "loss": 34.0667, "step": 11604 }, { "epoch": 30.6503796632552, "grad_norm": 756.2464599609375, "learning_rate": 0.0001638627998473151, "loss": 34.2833, "step": 11605 }, { "epoch": 30.653020798943544, "grad_norm": 441.34857177734375, "learning_rate": 0.0001638235825935908, "loss": 35.089, "step": 11606 }, { "epoch": 30.65566193463189, "grad_norm": 768.3705444335938, "learning_rate": 0.0001637843677462924, "loss": 35.28, "step": 11607 }, { "epoch": 30.658303070320237, "grad_norm": 489.671142578125, "learning_rate": 0.00016374515530651477, "loss": 37.3164, "step": 11608 }, { "epoch": 30.660944206008583, "grad_norm": 894.5435180664062, "learning_rate": 0.0001637059452753531, "loss": 41.8684, "step": 11609 }, { "epoch": 30.66358534169693, "grad_norm": 863.2798461914062, "learning_rate": 0.00016366673765390218, "loss": 38.3947, "step": 11610 }, { "epoch": 30.666226477385276, "grad_norm": 929.998779296875, "learning_rate": 0.00016362753244325692, "loss": 38.2026, "step": 11611 }, { "epoch": 30.668867613073623, "grad_norm": 615.3384399414062, "learning_rate": 0.00016358832964451204, "loss": 38.5103, "step": 11612 }, { "epoch": 30.67150874876197, "grad_norm": 514.8729248046875, "learning_rate": 0.0001635491292587623, "loss": 41.7654, "step": 11613 }, { "epoch": 30.674149884450312, "grad_norm": 428.80535888671875, "learning_rate": 0.0001635099312871024, "loss": 41.1881, "step": 11614 }, { "epoch": 30.67679102013866, "grad_norm": 600.2818603515625, "learning_rate": 0.0001634707357306267, "loss": 43.622, "step": 11615 }, { "epoch": 30.679432155827005, "grad_norm": 644.0096435546875, "learning_rate": 0.00016343154259043004, "loss": 40.0187, "step": 11616 }, { "epoch": 30.68207329151535, "grad_norm": 547.5567626953125, "learning_rate": 0.0001633923518676067, "loss": 39.7371, "step": 11617 }, { "epoch": 30.684714427203698, "grad_norm": 305.5939025878906, "learning_rate": 0.00016335316356325103, "loss": 39.5101, "step": 11618 }, { "epoch": 30.687355562892044, "grad_norm": 2289.449951171875, "learning_rate": 0.0001633139776784573, "loss": 39.5753, "step": 11619 }, { "epoch": 30.68999669858039, "grad_norm": 787.140869140625, "learning_rate": 0.00016327479421431983, "loss": 38.8231, "step": 11620 }, { "epoch": 30.692637834268737, "grad_norm": 487.8183288574219, "learning_rate": 0.00016323561317193287, "loss": 38.018, "step": 11621 }, { "epoch": 30.69527896995708, "grad_norm": 811.669189453125, "learning_rate": 0.00016319643455239036, "loss": 37.3793, "step": 11622 }, { "epoch": 30.697920105645427, "grad_norm": 383.5159606933594, "learning_rate": 0.0001631572583567865, "loss": 36.248, "step": 11623 }, { "epoch": 30.700561241333773, "grad_norm": 456.4971923828125, "learning_rate": 0.00016311808458621517, "loss": 35.6095, "step": 11624 }, { "epoch": 30.70320237702212, "grad_norm": 350.5683288574219, "learning_rate": 0.00016307891324177028, "loss": 35.2385, "step": 11625 }, { "epoch": 30.705843512710466, "grad_norm": 432.5056457519531, "learning_rate": 0.00016303974432454562, "loss": 34.1521, "step": 11626 }, { "epoch": 30.708484648398812, "grad_norm": 2433.46728515625, "learning_rate": 0.0001630005778356351, "loss": 34.3767, "step": 11627 }, { "epoch": 30.71112578408716, "grad_norm": 479.4820861816406, "learning_rate": 0.00016296141377613232, "loss": 34.6065, "step": 11628 }, { "epoch": 30.7137669197755, "grad_norm": 279.9892578125, "learning_rate": 0.00016292225214713086, "loss": 34.5141, "step": 11629 }, { "epoch": 30.716408055463848, "grad_norm": 437.69036865234375, "learning_rate": 0.00016288309294972442, "loss": 35.7737, "step": 11630 }, { "epoch": 30.719049191152195, "grad_norm": 3059.73974609375, "learning_rate": 0.00016284393618500644, "loss": 45.5061, "step": 11631 }, { "epoch": 30.72169032684054, "grad_norm": 3082.627685546875, "learning_rate": 0.00016280478185407037, "loss": 20.336, "step": 11632 }, { "epoch": 30.724331462528887, "grad_norm": 1468.0928955078125, "learning_rate": 0.0001627656299580094, "loss": 17.8926, "step": 11633 }, { "epoch": 30.726972598217234, "grad_norm": 841.1458740234375, "learning_rate": 0.00016272648049791717, "loss": 17.8164, "step": 11634 }, { "epoch": 30.72961373390558, "grad_norm": 1957.7088623046875, "learning_rate": 0.00016268733347488663, "loss": 19.3789, "step": 11635 }, { "epoch": 30.732254869593927, "grad_norm": 7369.01025390625, "learning_rate": 0.00016264818889001088, "loss": 10.4517, "step": 11636 }, { "epoch": 30.73489600528227, "grad_norm": 4235.6298828125, "learning_rate": 0.00016260904674438327, "loss": 18.2727, "step": 11637 }, { "epoch": 30.737537140970616, "grad_norm": 1091.430908203125, "learning_rate": 0.00016256990703909669, "loss": 11.3517, "step": 11638 }, { "epoch": 30.740178276658963, "grad_norm": 3556.326416015625, "learning_rate": 0.00016253076977524412, "loss": 12.5119, "step": 11639 }, { "epoch": 30.74281941234731, "grad_norm": 391.9276123046875, "learning_rate": 0.00016249163495391834, "loss": 9.8857, "step": 11640 }, { "epoch": 30.745460548035656, "grad_norm": 1380.2659912109375, "learning_rate": 0.0001624525025762123, "loss": 34.1216, "step": 11641 }, { "epoch": 30.748101683724002, "grad_norm": 602.521484375, "learning_rate": 0.0001624133726432187, "loss": 35.3876, "step": 11642 }, { "epoch": 30.75074281941235, "grad_norm": 611.011474609375, "learning_rate": 0.00016237424515603018, "loss": 36.2619, "step": 11643 }, { "epoch": 30.753383955100695, "grad_norm": 954.3313598632812, "learning_rate": 0.00016233512011573943, "loss": 35.9521, "step": 11644 }, { "epoch": 30.756025090789038, "grad_norm": 1090.128662109375, "learning_rate": 0.00016229599752343898, "loss": 35.188, "step": 11645 }, { "epoch": 30.758666226477384, "grad_norm": 473.952880859375, "learning_rate": 0.00016225687738022124, "loss": 34.8718, "step": 11646 }, { "epoch": 30.76130736216573, "grad_norm": 1120.4365234375, "learning_rate": 0.00016221775968717862, "loss": 34.5215, "step": 11647 }, { "epoch": 30.763948497854077, "grad_norm": 500.35107421875, "learning_rate": 0.00016217864444540355, "loss": 35.1439, "step": 11648 }, { "epoch": 30.766589633542424, "grad_norm": 451.15130615234375, "learning_rate": 0.00016213953165598825, "loss": 35.1299, "step": 11649 }, { "epoch": 30.76923076923077, "grad_norm": 1301.2613525390625, "learning_rate": 0.0001621004213200248, "loss": 36.1797, "step": 11650 }, { "epoch": 30.771871904919117, "grad_norm": 704.1284790039062, "learning_rate": 0.00016206131343860557, "loss": 36.0636, "step": 11651 }, { "epoch": 30.77451304060746, "grad_norm": 611.6323852539062, "learning_rate": 0.00016202220801282252, "loss": 35.7699, "step": 11652 }, { "epoch": 30.777154176295806, "grad_norm": 1453.083740234375, "learning_rate": 0.0001619831050437676, "loss": 34.7175, "step": 11653 }, { "epoch": 30.779795311984152, "grad_norm": 1148.628662109375, "learning_rate": 0.00016194400453253268, "loss": 34.9946, "step": 11654 }, { "epoch": 30.7824364476725, "grad_norm": 342.6692199707031, "learning_rate": 0.0001619049064802098, "loss": 34.7958, "step": 11655 }, { "epoch": 30.785077583360845, "grad_norm": 1387.5513916015625, "learning_rate": 0.00016186581088789066, "loss": 34.8492, "step": 11656 }, { "epoch": 30.78771871904919, "grad_norm": 1908.1683349609375, "learning_rate": 0.0001618267177566669, "loss": 35.4946, "step": 11657 }, { "epoch": 30.790359854737538, "grad_norm": 526.4359130859375, "learning_rate": 0.00016178762708763035, "loss": 35.969, "step": 11658 }, { "epoch": 30.793000990425885, "grad_norm": 1688.9305419921875, "learning_rate": 0.00016174853888187247, "loss": 40.888, "step": 11659 }, { "epoch": 30.795642126114227, "grad_norm": 867.0115356445312, "learning_rate": 0.00016170945314048475, "loss": 38.9273, "step": 11660 }, { "epoch": 30.798283261802574, "grad_norm": 821.3202514648438, "learning_rate": 0.00016167036986455876, "loss": 40.1855, "step": 11661 }, { "epoch": 30.80092439749092, "grad_norm": 629.158935546875, "learning_rate": 0.00016163128905518576, "loss": 39.4491, "step": 11662 }, { "epoch": 30.803565533179267, "grad_norm": 743.4435424804688, "learning_rate": 0.0001615922107134571, "loss": 41.8045, "step": 11663 }, { "epoch": 30.806206668867613, "grad_norm": 430.1174011230469, "learning_rate": 0.00016155313484046403, "loss": 39.8811, "step": 11664 }, { "epoch": 30.80884780455596, "grad_norm": 702.7217407226562, "learning_rate": 0.00016151406143729769, "loss": 41.7456, "step": 11665 }, { "epoch": 30.811488940244306, "grad_norm": 328.0506591796875, "learning_rate": 0.00016147499050504926, "loss": 42.4101, "step": 11666 }, { "epoch": 30.814130075932653, "grad_norm": 422.9954833984375, "learning_rate": 0.0001614359220448096, "loss": 41.0088, "step": 11667 }, { "epoch": 30.816771211620996, "grad_norm": 517.1026611328125, "learning_rate": 0.00016139685605766989, "loss": 38.8593, "step": 11668 }, { "epoch": 30.819412347309342, "grad_norm": 562.7623291015625, "learning_rate": 0.00016135779254472095, "loss": 36.8051, "step": 11669 }, { "epoch": 30.82205348299769, "grad_norm": 562.0330810546875, "learning_rate": 0.00016131873150705356, "loss": 37.4153, "step": 11670 }, { "epoch": 30.824694618686035, "grad_norm": 901.4487915039062, "learning_rate": 0.0001612796729457584, "loss": 38.266, "step": 11671 }, { "epoch": 30.82733575437438, "grad_norm": 2506.04150390625, "learning_rate": 0.00016124061686192636, "loss": 36.8727, "step": 11672 }, { "epoch": 30.829976890062728, "grad_norm": 314.9282531738281, "learning_rate": 0.00016120156325664795, "loss": 35.8828, "step": 11673 }, { "epoch": 30.832618025751074, "grad_norm": 421.6378173828125, "learning_rate": 0.0001611625121310137, "loss": 35.5911, "step": 11674 }, { "epoch": 30.835259161439417, "grad_norm": 671.2399291992188, "learning_rate": 0.00016112346348611412, "loss": 35.4857, "step": 11675 }, { "epoch": 30.837900297127764, "grad_norm": 859.97900390625, "learning_rate": 0.00016108441732303963, "loss": 34.1906, "step": 11676 }, { "epoch": 30.84054143281611, "grad_norm": 402.8672180175781, "learning_rate": 0.0001610453736428805, "loss": 34.2137, "step": 11677 }, { "epoch": 30.843182568504456, "grad_norm": 386.2143249511719, "learning_rate": 0.00016100633244672707, "loss": 35.3054, "step": 11678 }, { "epoch": 30.845823704192803, "grad_norm": 2014.6549072265625, "learning_rate": 0.00016096729373566958, "loss": 34.8863, "step": 11679 }, { "epoch": 30.84846483988115, "grad_norm": 716.7357177734375, "learning_rate": 0.0001609282575107981, "loss": 36.6635, "step": 11680 }, { "epoch": 30.851105975569496, "grad_norm": 7021.837890625, "learning_rate": 0.00016088922377320263, "loss": 32.3197, "step": 11681 }, { "epoch": 30.853747111257842, "grad_norm": 1145.398193359375, "learning_rate": 0.00016085019252397333, "loss": 18.8833, "step": 11682 }, { "epoch": 30.856388246946185, "grad_norm": 4143.28955078125, "learning_rate": 0.00016081116376420002, "loss": 17.662, "step": 11683 }, { "epoch": 30.85902938263453, "grad_norm": 2510.98193359375, "learning_rate": 0.00016077213749497257, "loss": 17.8978, "step": 11684 }, { "epoch": 30.861670518322878, "grad_norm": 654.2964477539062, "learning_rate": 0.00016073311371738068, "loss": 13.7542, "step": 11685 }, { "epoch": 30.864311654011225, "grad_norm": 1003.623291015625, "learning_rate": 0.00016069409243251421, "loss": 14.5248, "step": 11686 }, { "epoch": 30.86695278969957, "grad_norm": 707.442138671875, "learning_rate": 0.0001606550736414628, "loss": 12.6778, "step": 11687 }, { "epoch": 30.869593925387917, "grad_norm": 2547.307861328125, "learning_rate": 0.00016061605734531587, "loss": 11.3443, "step": 11688 }, { "epoch": 30.872235061076264, "grad_norm": 1069.39501953125, "learning_rate": 0.00016057704354516313, "loss": 12.0746, "step": 11689 }, { "epoch": 30.87487619676461, "grad_norm": 3621.222412109375, "learning_rate": 0.00016053803224209387, "loss": 9.9239, "step": 11690 }, { "epoch": 30.877517332452953, "grad_norm": 1435.8990478515625, "learning_rate": 0.00016049902343719753, "loss": 18.5525, "step": 11691 }, { "epoch": 30.8801584681413, "grad_norm": 584.8074951171875, "learning_rate": 0.00016046001713156333, "loss": 35.3086, "step": 11692 }, { "epoch": 30.882799603829646, "grad_norm": 576.3783569335938, "learning_rate": 0.00016042101332628062, "loss": 34.2749, "step": 11693 }, { "epoch": 30.885440739517993, "grad_norm": 408.18798828125, "learning_rate": 0.0001603820120224385, "loss": 35.4108, "step": 11694 }, { "epoch": 30.88808187520634, "grad_norm": 716.5244750976562, "learning_rate": 0.00016034301322112596, "loss": 35.2667, "step": 11695 }, { "epoch": 30.890723010894686, "grad_norm": 495.8454895019531, "learning_rate": 0.0001603040169234322, "loss": 34.908, "step": 11696 }, { "epoch": 30.893364146583032, "grad_norm": 794.5575561523438, "learning_rate": 0.00016026502313044606, "loss": 35.0549, "step": 11697 }, { "epoch": 30.896005282271375, "grad_norm": 459.5784912109375, "learning_rate": 0.0001602260318432564, "loss": 34.9611, "step": 11698 }, { "epoch": 30.89864641795972, "grad_norm": 458.3370666503906, "learning_rate": 0.00016018704306295205, "loss": 35.3595, "step": 11699 }, { "epoch": 30.901287553648068, "grad_norm": 374.00390625, "learning_rate": 0.00016014805679062183, "loss": 34.9544, "step": 11700 }, { "epoch": 30.903928689336414, "grad_norm": 1327.78076171875, "learning_rate": 0.00016010907302735434, "loss": 35.7549, "step": 11701 }, { "epoch": 30.90656982502476, "grad_norm": 556.8425903320312, "learning_rate": 0.00016007009177423803, "loss": 35.8056, "step": 11702 }, { "epoch": 30.909210960713107, "grad_norm": 1449.24755859375, "learning_rate": 0.00016003111303236172, "loss": 33.4737, "step": 11703 }, { "epoch": 30.911852096401454, "grad_norm": 356.2830810546875, "learning_rate": 0.00015999213680281378, "loss": 36.1817, "step": 11704 }, { "epoch": 30.9144932320898, "grad_norm": 493.8127136230469, "learning_rate": 0.0001599531630866825, "loss": 33.9986, "step": 11705 }, { "epoch": 30.917134367778143, "grad_norm": 1353.4219970703125, "learning_rate": 0.00015991419188505612, "loss": 35.4938, "step": 11706 }, { "epoch": 30.91977550346649, "grad_norm": 644.0614013671875, "learning_rate": 0.00015987522319902314, "loss": 33.9848, "step": 11707 }, { "epoch": 30.922416639154836, "grad_norm": 879.418701171875, "learning_rate": 0.0001598362570296716, "loss": 37.8342, "step": 11708 }, { "epoch": 30.925057774843182, "grad_norm": 2291.268310546875, "learning_rate": 0.00015979729337808955, "loss": 40.8574, "step": 11709 }, { "epoch": 30.92769891053153, "grad_norm": 782.9966430664062, "learning_rate": 0.00015975833224536514, "loss": 39.1873, "step": 11710 }, { "epoch": 30.930340046219875, "grad_norm": 570.2599487304688, "learning_rate": 0.00015971937363258633, "loss": 39.4996, "step": 11711 }, { "epoch": 30.93298118190822, "grad_norm": 527.310546875, "learning_rate": 0.00015968041754084095, "loss": 44.151, "step": 11712 }, { "epoch": 30.935622317596568, "grad_norm": 528.0776977539062, "learning_rate": 0.0001596414639712168, "loss": 42.3967, "step": 11713 }, { "epoch": 30.93826345328491, "grad_norm": 2345.435791015625, "learning_rate": 0.00015960251292480176, "loss": 40.1319, "step": 11714 }, { "epoch": 30.940904588973257, "grad_norm": 496.3359375, "learning_rate": 0.00015956356440268343, "loss": 40.4367, "step": 11715 }, { "epoch": 30.943545724661604, "grad_norm": 380.6175842285156, "learning_rate": 0.00015952461840594939, "loss": 39.7076, "step": 11716 }, { "epoch": 30.94618686034995, "grad_norm": 463.60198974609375, "learning_rate": 0.00015948567493568726, "loss": 38.2867, "step": 11717 }, { "epoch": 30.948827996038297, "grad_norm": 435.4183349609375, "learning_rate": 0.00015944673399298455, "loss": 37.5193, "step": 11718 }, { "epoch": 30.951469131726643, "grad_norm": 2228.708984375, "learning_rate": 0.00015940779557892853, "loss": 35.4851, "step": 11719 }, { "epoch": 30.95411026741499, "grad_norm": 602.3682861328125, "learning_rate": 0.00015936885969460656, "loss": 36.3453, "step": 11720 }, { "epoch": 30.956751403103333, "grad_norm": 331.95526123046875, "learning_rate": 0.000159329926341106, "loss": 36.3129, "step": 11721 }, { "epoch": 30.95939253879168, "grad_norm": 1925.658203125, "learning_rate": 0.00015929099551951405, "loss": 37.325, "step": 11722 }, { "epoch": 30.962033674480026, "grad_norm": 1618.3018798828125, "learning_rate": 0.0001592520672309176, "loss": 11.4321, "step": 11723 }, { "epoch": 30.964674810168372, "grad_norm": 947.0350952148438, "learning_rate": 0.00015921314147640398, "loss": 11.9995, "step": 11724 }, { "epoch": 30.96731594585672, "grad_norm": 692.6607666015625, "learning_rate": 0.00015917421825706007, "loss": 15.8006, "step": 11725 }, { "epoch": 30.969957081545065, "grad_norm": 1676.2950439453125, "learning_rate": 0.00015913529757397273, "loss": 15.1637, "step": 11726 }, { "epoch": 30.97259821723341, "grad_norm": 860.313720703125, "learning_rate": 0.0001590963794282288, "loss": 11.1296, "step": 11727 }, { "epoch": 30.975239352921758, "grad_norm": 663.4318237304688, "learning_rate": 0.00015905746382091513, "loss": 36.3613, "step": 11728 }, { "epoch": 30.9778804886101, "grad_norm": 692.6356811523438, "learning_rate": 0.00015901855075311832, "loss": 34.4906, "step": 11729 }, { "epoch": 30.980521624298447, "grad_norm": 452.8460693359375, "learning_rate": 0.00015897964022592505, "loss": 35.2721, "step": 11730 }, { "epoch": 30.983162759986794, "grad_norm": 407.4716491699219, "learning_rate": 0.00015894073224042183, "loss": 34.8906, "step": 11731 }, { "epoch": 30.98580389567514, "grad_norm": 571.0964965820312, "learning_rate": 0.00015890182679769522, "loss": 33.289, "step": 11732 }, { "epoch": 30.988445031363486, "grad_norm": 920.9212646484375, "learning_rate": 0.00015886292389883158, "loss": 33.915, "step": 11733 }, { "epoch": 30.991086167051833, "grad_norm": 343.2620544433594, "learning_rate": 0.00015882402354491722, "loss": 35.1379, "step": 11734 }, { "epoch": 30.99372730274018, "grad_norm": 939.3124389648438, "learning_rate": 0.00015878512573703845, "loss": 36.0967, "step": 11735 }, { "epoch": 30.996368438428526, "grad_norm": 2480.538330078125, "learning_rate": 0.00015874623047628148, "loss": 35.0129, "step": 11736 }, { "epoch": 30.99900957411687, "grad_norm": 514.2723999023438, "learning_rate": 0.0001587073377637323, "loss": 37.9025, "step": 11737 }, { "epoch": 31.001650709805215, "grad_norm": 485.9093017578125, "learning_rate": 0.0001586684476004772, "loss": 41.2733, "step": 11738 }, { "epoch": 31.00429184549356, "grad_norm": 715.0105590820312, "learning_rate": 0.00015862955998760214, "loss": 40.3721, "step": 11739 }, { "epoch": 31.006932981181908, "grad_norm": 845.0299682617188, "learning_rate": 0.00015859067492619283, "loss": 39.6565, "step": 11740 }, { "epoch": 31.009574116870255, "grad_norm": 812.9136352539062, "learning_rate": 0.00015855179241733517, "loss": 38.7578, "step": 11741 }, { "epoch": 31.0122152525586, "grad_norm": 568.1171875, "learning_rate": 0.00015851291246211506, "loss": 42.5328, "step": 11742 }, { "epoch": 31.014856388246947, "grad_norm": 3495.307373046875, "learning_rate": 0.00015847403506161813, "loss": 40.8752, "step": 11743 }, { "epoch": 31.017497523935294, "grad_norm": 936.4544067382812, "learning_rate": 0.00015843516021692998, "loss": 42.6093, "step": 11744 }, { "epoch": 31.020138659623637, "grad_norm": 325.03363037109375, "learning_rate": 0.00015839628792913623, "loss": 42.3927, "step": 11745 }, { "epoch": 31.022779795311983, "grad_norm": 612.6188354492188, "learning_rate": 0.00015835741819932231, "loss": 43.5088, "step": 11746 }, { "epoch": 31.02542093100033, "grad_norm": 549.6780395507812, "learning_rate": 0.00015831855102857367, "loss": 40.0124, "step": 11747 }, { "epoch": 31.028062066688676, "grad_norm": 817.2539672851562, "learning_rate": 0.00015827968641797557, "loss": 38.1995, "step": 11748 }, { "epoch": 31.030703202377023, "grad_norm": 601.3956909179688, "learning_rate": 0.00015824082436861341, "loss": 38.4001, "step": 11749 }, { "epoch": 31.03334433806537, "grad_norm": 802.7496948242188, "learning_rate": 0.00015820196488157236, "loss": 37.543, "step": 11750 }, { "epoch": 31.035985473753716, "grad_norm": 309.0021057128906, "learning_rate": 0.0001581631079579374, "loss": 37.0314, "step": 11751 }, { "epoch": 31.03862660944206, "grad_norm": 579.3078002929688, "learning_rate": 0.0001581242535987938, "loss": 35.0227, "step": 11752 }, { "epoch": 31.041267745130405, "grad_norm": 554.7482299804688, "learning_rate": 0.00015808540180522646, "loss": 36.2235, "step": 11753 }, { "epoch": 31.04390888081875, "grad_norm": 426.310302734375, "learning_rate": 0.00015804655257832024, "loss": 35.29, "step": 11754 }, { "epoch": 31.046550016507098, "grad_norm": 411.3284606933594, "learning_rate": 0.00015800770591915995, "loss": 35.1881, "step": 11755 }, { "epoch": 31.049191152195444, "grad_norm": 627.56640625, "learning_rate": 0.00015796886182883053, "loss": 34.6419, "step": 11756 }, { "epoch": 31.05183228788379, "grad_norm": 301.8323669433594, "learning_rate": 0.0001579300203084166, "loss": 35.7399, "step": 11757 }, { "epoch": 31.054473423572137, "grad_norm": 623.0087280273438, "learning_rate": 0.00015789118135900267, "loss": 35.0249, "step": 11758 }, { "epoch": 31.057114559260484, "grad_norm": 1608.053466796875, "learning_rate": 0.00015785234498167346, "loss": 38.4806, "step": 11759 }, { "epoch": 31.059755694948826, "grad_norm": 1126.468505859375, "learning_rate": 0.00015781351117751335, "loss": 19.7068, "step": 11760 }, { "epoch": 31.062396830637173, "grad_norm": 1801.5650634765625, "learning_rate": 0.00015777467994760686, "loss": 17.036, "step": 11761 }, { "epoch": 31.06503796632552, "grad_norm": 5671.53857421875, "learning_rate": 0.00015773585129303814, "loss": 18.1608, "step": 11762 }, { "epoch": 31.067679102013866, "grad_norm": 8634.845703125, "learning_rate": 0.00015769702521489166, "loss": 12.4094, "step": 11763 }, { "epoch": 31.070320237702212, "grad_norm": 863.3614501953125, "learning_rate": 0.00015765820171425156, "loss": 13.7465, "step": 11764 }, { "epoch": 31.07296137339056, "grad_norm": 3112.012939453125, "learning_rate": 0.00015761938079220184, "loss": 12.8584, "step": 11765 }, { "epoch": 31.075602509078905, "grad_norm": 1866.4638671875, "learning_rate": 0.0001575805624498267, "loss": 17.0839, "step": 11766 }, { "epoch": 31.07824364476725, "grad_norm": 435.00482177734375, "learning_rate": 0.00015754174668821007, "loss": 11.7896, "step": 11767 }, { "epoch": 31.080884780455595, "grad_norm": 21963.91796875, "learning_rate": 0.00015750293350843585, "loss": 11.6423, "step": 11768 }, { "epoch": 31.08352591614394, "grad_norm": 686.5189819335938, "learning_rate": 0.0001574641229115878, "loss": 15.4203, "step": 11769 }, { "epoch": 31.086167051832287, "grad_norm": 463.5863037109375, "learning_rate": 0.00015742531489874983, "loss": 34.441, "step": 11770 }, { "epoch": 31.088808187520634, "grad_norm": 905.3977661132812, "learning_rate": 0.00015738650947100559, "loss": 35.1347, "step": 11771 }, { "epoch": 31.09144932320898, "grad_norm": 742.23486328125, "learning_rate": 0.00015734770662943854, "loss": 35.0587, "step": 11772 }, { "epoch": 31.094090458897327, "grad_norm": 393.3357849121094, "learning_rate": 0.00015730890637513246, "loss": 34.4872, "step": 11773 }, { "epoch": 31.096731594585673, "grad_norm": 1421.96728515625, "learning_rate": 0.00015727010870917075, "loss": 33.4603, "step": 11774 }, { "epoch": 31.099372730274016, "grad_norm": 878.7171630859375, "learning_rate": 0.00015723131363263676, "loss": 34.6091, "step": 11775 }, { "epoch": 31.102013865962363, "grad_norm": 1618.8927001953125, "learning_rate": 0.0001571925211466137, "loss": 34.9924, "step": 11776 }, { "epoch": 31.10465500165071, "grad_norm": 423.62054443359375, "learning_rate": 0.00015715373125218512, "loss": 35.6265, "step": 11777 }, { "epoch": 31.107296137339056, "grad_norm": 1075.1612548828125, "learning_rate": 0.000157114943950434, "loss": 34.7517, "step": 11778 }, { "epoch": 31.109937273027402, "grad_norm": 729.5691528320312, "learning_rate": 0.0001570761592424435, "loss": 33.0454, "step": 11779 }, { "epoch": 31.11257840871575, "grad_norm": 667.1948852539062, "learning_rate": 0.0001570373771292967, "loss": 34.8402, "step": 11780 }, { "epoch": 31.115219544404095, "grad_norm": 1272.927978515625, "learning_rate": 0.00015699859761207657, "loss": 36.3664, "step": 11781 }, { "epoch": 31.11786068009244, "grad_norm": 444.3436279296875, "learning_rate": 0.00015695982069186592, "loss": 34.568, "step": 11782 }, { "epoch": 31.120501815780784, "grad_norm": 487.68792724609375, "learning_rate": 0.0001569210463697476, "loss": 33.6805, "step": 11783 }, { "epoch": 31.12314295146913, "grad_norm": 380.1989440917969, "learning_rate": 0.00015688227464680444, "loss": 34.5869, "step": 11784 }, { "epoch": 31.125784087157477, "grad_norm": 531.980712890625, "learning_rate": 0.00015684350552411903, "loss": 35.5168, "step": 11785 }, { "epoch": 31.128425222845824, "grad_norm": 770.9573974609375, "learning_rate": 0.00015680473900277399, "loss": 36.4123, "step": 11786 }, { "epoch": 31.13106635853417, "grad_norm": 432.3338623046875, "learning_rate": 0.00015676597508385189, "loss": 38.2584, "step": 11787 }, { "epoch": 31.133707494222516, "grad_norm": 1479.726806640625, "learning_rate": 0.0001567272137684352, "loss": 40.6392, "step": 11788 }, { "epoch": 31.136348629910863, "grad_norm": 351.9877624511719, "learning_rate": 0.0001566884550576061, "loss": 40.398, "step": 11789 }, { "epoch": 31.13898976559921, "grad_norm": 698.4280395507812, "learning_rate": 0.00015664969895244724, "loss": 38.8972, "step": 11790 }, { "epoch": 31.141630901287552, "grad_norm": 569.34814453125, "learning_rate": 0.0001566109454540407, "loss": 40.4354, "step": 11791 }, { "epoch": 31.1442720369759, "grad_norm": 668.5159301757812, "learning_rate": 0.00015657219456346867, "loss": 39.8322, "step": 11792 }, { "epoch": 31.146913172664245, "grad_norm": 779.6992797851562, "learning_rate": 0.0001565334462818131, "loss": 42.4467, "step": 11793 }, { "epoch": 31.14955430835259, "grad_norm": 733.591064453125, "learning_rate": 0.00015649470061015623, "loss": 44.0383, "step": 11794 }, { "epoch": 31.152195444040938, "grad_norm": 399.1832275390625, "learning_rate": 0.00015645595754957992, "loss": 40.1575, "step": 11795 }, { "epoch": 31.154836579729285, "grad_norm": 1089.70751953125, "learning_rate": 0.00015641721710116596, "loss": 39.6249, "step": 11796 }, { "epoch": 31.15747771541763, "grad_norm": 294.7997741699219, "learning_rate": 0.00015637847926599634, "loss": 38.4664, "step": 11797 }, { "epoch": 31.160118851105974, "grad_norm": 1511.48291015625, "learning_rate": 0.00015633974404515266, "loss": 39.8355, "step": 11798 }, { "epoch": 31.16275998679432, "grad_norm": 357.81317138671875, "learning_rate": 0.00015630101143971663, "loss": 39.1065, "step": 11799 }, { "epoch": 31.165401122482667, "grad_norm": 398.9563293457031, "learning_rate": 0.00015626228145076976, "loss": 40.2896, "step": 11800 }, { "epoch": 31.165401122482667, "eval_loss": 3.770617723464966, "eval_runtime": 2.1777, "eval_samples_per_second": 227.303, "eval_steps_per_second": 28.47, "step": 11800 }, { "epoch": 31.168042258171013, "grad_norm": 589.0744018554688, "learning_rate": 0.00015622355407939365, "loss": 35.9588, "step": 11801 }, { "epoch": 31.17068339385936, "grad_norm": 511.5504455566406, "learning_rate": 0.00015618482932666972, "loss": 36.6844, "step": 11802 }, { "epoch": 31.173324529547706, "grad_norm": 402.66839599609375, "learning_rate": 0.0001561461071936792, "loss": 36.4959, "step": 11803 }, { "epoch": 31.175965665236053, "grad_norm": 421.99578857421875, "learning_rate": 0.00015610738768150368, "loss": 35.0994, "step": 11804 }, { "epoch": 31.1786068009244, "grad_norm": 486.7666931152344, "learning_rate": 0.00015606867079122412, "loss": 35.2184, "step": 11805 }, { "epoch": 31.181247936612742, "grad_norm": 309.6825256347656, "learning_rate": 0.00015602995652392178, "loss": 35.197, "step": 11806 }, { "epoch": 31.18388907230109, "grad_norm": 394.684814453125, "learning_rate": 0.0001559912448806776, "loss": 34.752, "step": 11807 }, { "epoch": 31.186530207989435, "grad_norm": 796.5833129882812, "learning_rate": 0.00015595253586257274, "loss": 34.379, "step": 11808 }, { "epoch": 31.18917134367778, "grad_norm": 635.99267578125, "learning_rate": 0.00015591382947068816, "loss": 36.9616, "step": 11809 }, { "epoch": 31.191812479366128, "grad_norm": 4737.38037109375, "learning_rate": 0.0001558751257061045, "loss": 31.1308, "step": 11810 }, { "epoch": 31.194453615054474, "grad_norm": 1987.791259765625, "learning_rate": 0.00015583642456990272, "loss": 12.1624, "step": 11811 }, { "epoch": 31.19709475074282, "grad_norm": 650.79833984375, "learning_rate": 0.00015579772606316348, "loss": 8.872, "step": 11812 }, { "epoch": 31.199735886431167, "grad_norm": 717.814208984375, "learning_rate": 0.0001557590301869674, "loss": 15.6585, "step": 11813 }, { "epoch": 31.20237702211951, "grad_norm": 1056.5574951171875, "learning_rate": 0.00015572033694239503, "loss": 11.8545, "step": 11814 }, { "epoch": 31.205018157807856, "grad_norm": 2044.1180419921875, "learning_rate": 0.00015568164633052685, "loss": 9.3842, "step": 11815 }, { "epoch": 31.207659293496203, "grad_norm": 1820.381103515625, "learning_rate": 0.00015564295835244333, "loss": 10.9283, "step": 11816 }, { "epoch": 31.21030042918455, "grad_norm": 1193.3297119140625, "learning_rate": 0.0001556042730092247, "loss": 12.486, "step": 11817 }, { "epoch": 31.212941564872896, "grad_norm": 2452.6875, "learning_rate": 0.0001555655903019514, "loss": 15.0864, "step": 11818 }, { "epoch": 31.215582700561242, "grad_norm": 20719.1171875, "learning_rate": 0.00015552691023170345, "loss": 9.899, "step": 11819 }, { "epoch": 31.21822383624959, "grad_norm": 572.6615600585938, "learning_rate": 0.00015548823279956103, "loss": 33.646, "step": 11820 }, { "epoch": 31.22086497193793, "grad_norm": 708.6595458984375, "learning_rate": 0.00015544955800660416, "loss": 35.7596, "step": 11821 }, { "epoch": 31.223506107626278, "grad_norm": 2255.271728515625, "learning_rate": 0.00015541088585391293, "loss": 35.2937, "step": 11822 }, { "epoch": 31.226147243314625, "grad_norm": 562.8909301757812, "learning_rate": 0.00015537221634256713, "loss": 35.461, "step": 11823 }, { "epoch": 31.22878837900297, "grad_norm": 483.25933837890625, "learning_rate": 0.00015533354947364647, "loss": 36.4305, "step": 11824 }, { "epoch": 31.231429514691317, "grad_norm": 2504.5615234375, "learning_rate": 0.00015529488524823097, "loss": 35.4654, "step": 11825 }, { "epoch": 31.234070650379664, "grad_norm": 578.5032958984375, "learning_rate": 0.00015525622366740016, "loss": 34.9316, "step": 11826 }, { "epoch": 31.23671178606801, "grad_norm": 376.42047119140625, "learning_rate": 0.00015521756473223366, "loss": 33.7746, "step": 11827 }, { "epoch": 31.239352921756357, "grad_norm": 387.5173645019531, "learning_rate": 0.00015517890844381088, "loss": 34.0472, "step": 11828 }, { "epoch": 31.2419940574447, "grad_norm": 387.35040283203125, "learning_rate": 0.00015514025480321147, "loss": 34.7324, "step": 11829 }, { "epoch": 31.244635193133046, "grad_norm": 674.77783203125, "learning_rate": 0.00015510160381151472, "loss": 34.8871, "step": 11830 }, { "epoch": 31.247276328821393, "grad_norm": 553.2755737304688, "learning_rate": 0.00015506295546979988, "loss": 34.7757, "step": 11831 }, { "epoch": 31.24991746450974, "grad_norm": 705.4880981445312, "learning_rate": 0.0001550243097791463, "loss": 34.8024, "step": 11832 }, { "epoch": 31.252558600198086, "grad_norm": 527.5364379882812, "learning_rate": 0.00015498566674063306, "loss": 34.2256, "step": 11833 }, { "epoch": 31.255199735886432, "grad_norm": 756.25048828125, "learning_rate": 0.00015494702635533925, "loss": 34.5601, "step": 11834 }, { "epoch": 31.25784087157478, "grad_norm": 759.1094360351562, "learning_rate": 0.00015490838862434388, "loss": 34.2735, "step": 11835 }, { "epoch": 31.260482007263125, "grad_norm": 495.6990051269531, "learning_rate": 0.00015486975354872594, "loss": 36.7836, "step": 11836 }, { "epoch": 31.263123142951468, "grad_norm": 1271.8616943359375, "learning_rate": 0.00015483112112956422, "loss": 38.6059, "step": 11837 }, { "epoch": 31.265764278639814, "grad_norm": 1282.8001708984375, "learning_rate": 0.00015479249136793747, "loss": 39.2491, "step": 11838 }, { "epoch": 31.26840541432816, "grad_norm": 310.5227355957031, "learning_rate": 0.00015475386426492462, "loss": 38.5181, "step": 11839 }, { "epoch": 31.271046550016507, "grad_norm": 427.5605163574219, "learning_rate": 0.00015471523982160408, "loss": 39.6637, "step": 11840 }, { "epoch": 31.273687685704854, "grad_norm": 2871.722412109375, "learning_rate": 0.00015467661803905453, "loss": 38.979, "step": 11841 }, { "epoch": 31.2763288213932, "grad_norm": 377.89532470703125, "learning_rate": 0.0001546379989183543, "loss": 40.6288, "step": 11842 }, { "epoch": 31.278969957081546, "grad_norm": 526.3246459960938, "learning_rate": 0.00015459938246058203, "loss": 42.0178, "step": 11843 }, { "epoch": 31.28161109276989, "grad_norm": 809.8905639648438, "learning_rate": 0.000154560768666816, "loss": 42.3584, "step": 11844 }, { "epoch": 31.284252228458236, "grad_norm": 429.6329345703125, "learning_rate": 0.00015452215753813435, "loss": 40.2028, "step": 11845 }, { "epoch": 31.286893364146582, "grad_norm": 332.7565612792969, "learning_rate": 0.0001544835490756154, "loss": 38.6134, "step": 11846 }, { "epoch": 31.28953449983493, "grad_norm": 327.2239685058594, "learning_rate": 0.00015444494328033728, "loss": 38.7745, "step": 11847 }, { "epoch": 31.292175635523275, "grad_norm": 321.80877685546875, "learning_rate": 0.00015440634015337796, "loss": 36.2961, "step": 11848 }, { "epoch": 31.29481677121162, "grad_norm": 418.9794921875, "learning_rate": 0.0001543677396958154, "loss": 36.9703, "step": 11849 }, { "epoch": 31.297457906899968, "grad_norm": 584.9432983398438, "learning_rate": 0.00015432914190872756, "loss": 39.3651, "step": 11850 }, { "epoch": 31.300099042588315, "grad_norm": 539.907958984375, "learning_rate": 0.00015429054679319226, "loss": 37.3276, "step": 11851 }, { "epoch": 31.302740178276657, "grad_norm": 353.33648681640625, "learning_rate": 0.00015425195435028716, "loss": 34.4474, "step": 11852 }, { "epoch": 31.305381313965004, "grad_norm": 312.9964294433594, "learning_rate": 0.00015421336458109003, "loss": 34.716, "step": 11853 }, { "epoch": 31.30802244965335, "grad_norm": 1452.9852294921875, "learning_rate": 0.00015417477748667842, "loss": 35.0567, "step": 11854 }, { "epoch": 31.310663585341697, "grad_norm": 332.8565673828125, "learning_rate": 0.00015413619306812987, "loss": 36.1899, "step": 11855 }, { "epoch": 31.313304721030043, "grad_norm": 665.4714965820312, "learning_rate": 0.00015409761132652172, "loss": 33.6655, "step": 11856 }, { "epoch": 31.31594585671839, "grad_norm": 637.6964111328125, "learning_rate": 0.00015405903226293162, "loss": 35.5888, "step": 11857 }, { "epoch": 31.318586992406736, "grad_norm": 578.79833984375, "learning_rate": 0.00015402045587843658, "loss": 35.5726, "step": 11858 }, { "epoch": 31.321228128095083, "grad_norm": 974.924560546875, "learning_rate": 0.00015398188217411384, "loss": 44.0865, "step": 11859 }, { "epoch": 31.323869263783426, "grad_norm": 2057.674072265625, "learning_rate": 0.00015394331115104075, "loss": 23.4051, "step": 11860 }, { "epoch": 31.326510399471772, "grad_norm": 6120.24365234375, "learning_rate": 0.00015390474281029427, "loss": 15.8205, "step": 11861 }, { "epoch": 31.32915153516012, "grad_norm": 1531.427734375, "learning_rate": 0.00015386617715295141, "loss": 11.8896, "step": 11862 }, { "epoch": 31.331792670848465, "grad_norm": 8857.4384765625, "learning_rate": 0.00015382761418008894, "loss": 14.1047, "step": 11863 }, { "epoch": 31.33443380653681, "grad_norm": 2105.588623046875, "learning_rate": 0.000153789053892784, "loss": 16.4341, "step": 11864 }, { "epoch": 31.337074942225158, "grad_norm": 1054.9237060546875, "learning_rate": 0.0001537504962921132, "loss": 9.4094, "step": 11865 }, { "epoch": 31.339716077913504, "grad_norm": 1536.730712890625, "learning_rate": 0.00015371194137915318, "loss": 15.3669, "step": 11866 }, { "epoch": 31.342357213601847, "grad_norm": 475.7119140625, "learning_rate": 0.00015367338915498065, "loss": 9.5675, "step": 11867 }, { "epoch": 31.344998349290194, "grad_norm": 2066.1533203125, "learning_rate": 0.0001536348396206722, "loss": 18.1667, "step": 11868 }, { "epoch": 31.34763948497854, "grad_norm": 2304.51171875, "learning_rate": 0.00015359629277730423, "loss": 16.4924, "step": 11869 }, { "epoch": 31.350280620666886, "grad_norm": 909.8825073242188, "learning_rate": 0.00015355774862595308, "loss": 36.6051, "step": 11870 }, { "epoch": 31.352921756355233, "grad_norm": 420.4331970214844, "learning_rate": 0.00015351920716769524, "loss": 34.7766, "step": 11871 }, { "epoch": 31.35556289204358, "grad_norm": 768.3724365234375, "learning_rate": 0.0001534806684036068, "loss": 36.0398, "step": 11872 }, { "epoch": 31.358204027731926, "grad_norm": 504.5380554199219, "learning_rate": 0.00015344213233476394, "loss": 33.5715, "step": 11873 }, { "epoch": 31.360845163420272, "grad_norm": 915.678466796875, "learning_rate": 0.00015340359896224298, "loss": 34.0198, "step": 11874 }, { "epoch": 31.363486299108615, "grad_norm": 472.6131896972656, "learning_rate": 0.00015336506828711973, "loss": 34.3323, "step": 11875 }, { "epoch": 31.36612743479696, "grad_norm": 350.0528869628906, "learning_rate": 0.0001533265403104701, "loss": 34.4374, "step": 11876 }, { "epoch": 31.368768570485308, "grad_norm": 1140.2337646484375, "learning_rate": 0.00015328801503337004, "loss": 34.6986, "step": 11877 }, { "epoch": 31.371409706173655, "grad_norm": 358.7889099121094, "learning_rate": 0.0001532494924568954, "loss": 34.2909, "step": 11878 }, { "epoch": 31.374050841862, "grad_norm": 684.1426391601562, "learning_rate": 0.0001532109725821219, "loss": 35.3033, "step": 11879 }, { "epoch": 31.376691977550347, "grad_norm": 550.473388671875, "learning_rate": 0.00015317245541012498, "loss": 33.7122, "step": 11880 }, { "epoch": 31.379333113238694, "grad_norm": 699.7581176757812, "learning_rate": 0.00015313394094198046, "loss": 34.1506, "step": 11881 }, { "epoch": 31.38197424892704, "grad_norm": 481.0455627441406, "learning_rate": 0.00015309542917876377, "loss": 34.3992, "step": 11882 }, { "epoch": 31.384615384615383, "grad_norm": 476.10400390625, "learning_rate": 0.00015305692012155026, "loss": 33.7753, "step": 11883 }, { "epoch": 31.38725652030373, "grad_norm": 366.46392822265625, "learning_rate": 0.0001530184137714153, "loss": 35.5447, "step": 11884 }, { "epoch": 31.389897655992076, "grad_norm": 3414.71484375, "learning_rate": 0.0001529799101294342, "loss": 35.1088, "step": 11885 }, { "epoch": 31.392538791680423, "grad_norm": 406.9238586425781, "learning_rate": 0.0001529414091966821, "loss": 36.661, "step": 11886 }, { "epoch": 31.39517992736877, "grad_norm": 796.5253295898438, "learning_rate": 0.0001529029109742341, "loss": 40.0386, "step": 11887 }, { "epoch": 31.397821063057116, "grad_norm": 548.9298706054688, "learning_rate": 0.00015286441546316526, "loss": 42.6841, "step": 11888 }, { "epoch": 31.400462198745462, "grad_norm": 370.18231201171875, "learning_rate": 0.00015282592266455065, "loss": 39.7489, "step": 11889 }, { "epoch": 31.403103334433805, "grad_norm": 845.547607421875, "learning_rate": 0.00015278743257946502, "loss": 40.2011, "step": 11890 }, { "epoch": 31.40574447012215, "grad_norm": 2101.113525390625, "learning_rate": 0.00015274894520898313, "loss": 40.623, "step": 11891 }, { "epoch": 31.408385605810498, "grad_norm": 732.5573120117188, "learning_rate": 0.00015271046055417996, "loss": 44.0898, "step": 11892 }, { "epoch": 31.411026741498844, "grad_norm": 871.6068115234375, "learning_rate": 0.00015267197861613, "loss": 42.1846, "step": 11893 }, { "epoch": 31.41366787718719, "grad_norm": 1285.7501220703125, "learning_rate": 0.00015263349939590777, "loss": 42.8723, "step": 11894 }, { "epoch": 31.416309012875537, "grad_norm": 728.2650756835938, "learning_rate": 0.00015259502289458794, "loss": 43.611, "step": 11895 }, { "epoch": 31.418950148563884, "grad_norm": 505.95843505859375, "learning_rate": 0.0001525565491132449, "loss": 39.019, "step": 11896 }, { "epoch": 31.42159128425223, "grad_norm": 445.82513427734375, "learning_rate": 0.000152518078052953, "loss": 39.2504, "step": 11897 }, { "epoch": 31.424232419940573, "grad_norm": 388.4133605957031, "learning_rate": 0.00015247960971478638, "loss": 39.2428, "step": 11898 }, { "epoch": 31.42687355562892, "grad_norm": 409.7354736328125, "learning_rate": 0.0001524411440998195, "loss": 37.3454, "step": 11899 }, { "epoch": 31.429514691317266, "grad_norm": 552.7164306640625, "learning_rate": 0.00015240268120912632, "loss": 38.2845, "step": 11900 }, { "epoch": 31.432155827005612, "grad_norm": 489.6278381347656, "learning_rate": 0.0001523642210437809, "loss": 37.7211, "step": 11901 }, { "epoch": 31.43479696269396, "grad_norm": 324.8143005371094, "learning_rate": 0.00015232576360485733, "loss": 34.3912, "step": 11902 }, { "epoch": 31.437438098382305, "grad_norm": 449.39923095703125, "learning_rate": 0.0001522873088934294, "loss": 36.2311, "step": 11903 }, { "epoch": 31.44007923407065, "grad_norm": 810.2955322265625, "learning_rate": 0.000152248856910571, "loss": 36.3442, "step": 11904 }, { "epoch": 31.442720369758998, "grad_norm": 623.5410766601562, "learning_rate": 0.00015221040765735577, "loss": 34.4486, "step": 11905 }, { "epoch": 31.44536150544734, "grad_norm": 470.3873291015625, "learning_rate": 0.0001521719611348575, "loss": 34.9515, "step": 11906 }, { "epoch": 31.448002641135687, "grad_norm": 491.1092529296875, "learning_rate": 0.00015213351734414977, "loss": 34.959, "step": 11907 }, { "epoch": 31.450643776824034, "grad_norm": 497.4880676269531, "learning_rate": 0.00015209507628630598, "loss": 36.2592, "step": 11908 }, { "epoch": 31.45328491251238, "grad_norm": 1699.537353515625, "learning_rate": 0.00015205663796239984, "loss": 40.0655, "step": 11909 }, { "epoch": 31.455926048200727, "grad_norm": 6786.80859375, "learning_rate": 0.00015201820237350448, "loss": 22.2534, "step": 11910 }, { "epoch": 31.458567183889073, "grad_norm": 762.1692504882812, "learning_rate": 0.00015197976952069326, "loss": 13.7931, "step": 11911 }, { "epoch": 31.46120831957742, "grad_norm": 578.2427978515625, "learning_rate": 0.00015194133940503929, "loss": 8.5716, "step": 11912 }, { "epoch": 31.463849455265763, "grad_norm": 1276.8924560546875, "learning_rate": 0.0001519029120276159, "loss": 8.9204, "step": 11913 }, { "epoch": 31.46649059095411, "grad_norm": 1353.4874267578125, "learning_rate": 0.0001518644873894961, "loss": 13.0651, "step": 11914 }, { "epoch": 31.469131726642456, "grad_norm": 1906.4561767578125, "learning_rate": 0.00015182606549175276, "loss": 16.3064, "step": 11915 }, { "epoch": 31.471772862330802, "grad_norm": 2165.753662109375, "learning_rate": 0.00015178764633545894, "loss": 14.256, "step": 11916 }, { "epoch": 31.47441399801915, "grad_norm": 4017.53759765625, "learning_rate": 0.00015174922992168744, "loss": 12.4267, "step": 11917 }, { "epoch": 31.477055133707495, "grad_norm": 1129.0694580078125, "learning_rate": 0.00015171081625151088, "loss": 9.9159, "step": 11918 }, { "epoch": 31.47969626939584, "grad_norm": 588.7328491210938, "learning_rate": 0.00015167240532600209, "loss": 24.9903, "step": 11919 }, { "epoch": 31.482337405084188, "grad_norm": 591.9722900390625, "learning_rate": 0.00015163399714623364, "loss": 34.1026, "step": 11920 }, { "epoch": 31.48497854077253, "grad_norm": 513.9570922851562, "learning_rate": 0.000151595591713278, "loss": 33.9539, "step": 11921 }, { "epoch": 31.487619676460877, "grad_norm": 1061.2158203125, "learning_rate": 0.00015155718902820764, "loss": 35.3612, "step": 11922 }, { "epoch": 31.490260812149224, "grad_norm": 796.1035766601562, "learning_rate": 0.00015151878909209498, "loss": 33.8204, "step": 11923 }, { "epoch": 31.49290194783757, "grad_norm": 567.744140625, "learning_rate": 0.00015148039190601231, "loss": 35.4116, "step": 11924 }, { "epoch": 31.495543083525916, "grad_norm": 365.68414306640625, "learning_rate": 0.0001514419974710317, "loss": 34.9386, "step": 11925 }, { "epoch": 31.498184219214263, "grad_norm": 2379.872802734375, "learning_rate": 0.00015140360578822553, "loss": 33.8838, "step": 11926 }, { "epoch": 31.50082535490261, "grad_norm": 1057.123779296875, "learning_rate": 0.00015136521685866578, "loss": 34.5716, "step": 11927 }, { "epoch": 31.503466490590956, "grad_norm": 1102.8681640625, "learning_rate": 0.00015132683068342438, "loss": 34.6842, "step": 11928 }, { "epoch": 31.5061076262793, "grad_norm": 436.34942626953125, "learning_rate": 0.00015128844726357315, "loss": 36.0605, "step": 11929 }, { "epoch": 31.508748761967645, "grad_norm": 653.3214111328125, "learning_rate": 0.00015125006660018415, "loss": 35.5213, "step": 11930 }, { "epoch": 31.51138989765599, "grad_norm": 1932.4031982421875, "learning_rate": 0.00015121168869432905, "loss": 35.3632, "step": 11931 }, { "epoch": 31.514031033344338, "grad_norm": 819.1239013671875, "learning_rate": 0.0001511733135470794, "loss": 36.0749, "step": 11932 }, { "epoch": 31.516672169032685, "grad_norm": 657.3931274414062, "learning_rate": 0.00015113494115950705, "loss": 34.2029, "step": 11933 }, { "epoch": 31.51931330472103, "grad_norm": 788.7821044921875, "learning_rate": 0.00015109657153268335, "loss": 35.285, "step": 11934 }, { "epoch": 31.521954440409377, "grad_norm": 1256.062744140625, "learning_rate": 0.00015105820466767978, "loss": 35.6379, "step": 11935 }, { "epoch": 31.52459557609772, "grad_norm": 1091.493896484375, "learning_rate": 0.00015101984056556766, "loss": 35.6299, "step": 11936 }, { "epoch": 31.527236711786067, "grad_norm": 1410.6611328125, "learning_rate": 0.00015098147922741845, "loss": 39.285, "step": 11937 }, { "epoch": 31.529877847474413, "grad_norm": 1051.1441650390625, "learning_rate": 0.00015094312065430323, "loss": 39.4849, "step": 11938 }, { "epoch": 31.53251898316276, "grad_norm": 997.0089111328125, "learning_rate": 0.00015090476484729316, "loss": 38.3496, "step": 11939 }, { "epoch": 31.535160118851106, "grad_norm": 740.220458984375, "learning_rate": 0.00015086641180745932, "loss": 38.2455, "step": 11940 }, { "epoch": 31.537801254539453, "grad_norm": 466.7442626953125, "learning_rate": 0.00015082806153587274, "loss": 39.1072, "step": 11941 }, { "epoch": 31.5404423902278, "grad_norm": 337.2657165527344, "learning_rate": 0.00015078971403360427, "loss": 41.2342, "step": 11942 }, { "epoch": 31.543083525916146, "grad_norm": 3009.149658203125, "learning_rate": 0.00015075136930172464, "loss": 42.9942, "step": 11943 }, { "epoch": 31.54572466160449, "grad_norm": 868.5889282226562, "learning_rate": 0.00015071302734130488, "loss": 40.5926, "step": 11944 }, { "epoch": 31.548365797292835, "grad_norm": 476.0954284667969, "learning_rate": 0.00015067468815341546, "loss": 40.472, "step": 11945 }, { "epoch": 31.55100693298118, "grad_norm": 388.6041259765625, "learning_rate": 0.00015063635173912692, "loss": 40.349, "step": 11946 }, { "epoch": 31.553648068669528, "grad_norm": 410.72027587890625, "learning_rate": 0.00015059801809951, "loss": 40.7452, "step": 11947 }, { "epoch": 31.556289204357874, "grad_norm": 859.091796875, "learning_rate": 0.00015055968723563505, "loss": 39.3123, "step": 11948 }, { "epoch": 31.55893034004622, "grad_norm": 1839.5828857421875, "learning_rate": 0.0001505213591485724, "loss": 38.5129, "step": 11949 }, { "epoch": 31.561571475734567, "grad_norm": 994.81396484375, "learning_rate": 0.0001504830338393923, "loss": 37.0564, "step": 11950 }, { "epoch": 31.564212611422914, "grad_norm": 647.7457275390625, "learning_rate": 0.00015044471130916506, "loss": 36.3182, "step": 11951 }, { "epoch": 31.566853747111256, "grad_norm": 782.4671630859375, "learning_rate": 0.0001504063915589608, "loss": 35.3848, "step": 11952 }, { "epoch": 31.569494882799603, "grad_norm": 442.00885009765625, "learning_rate": 0.0001503680745898495, "loss": 35.0047, "step": 11953 }, { "epoch": 31.57213601848795, "grad_norm": 775.9569091796875, "learning_rate": 0.00015032976040290125, "loss": 34.7372, "step": 11954 }, { "epoch": 31.574777154176296, "grad_norm": 383.700439453125, "learning_rate": 0.00015029144899918586, "loss": 34.8238, "step": 11955 }, { "epoch": 31.577418289864642, "grad_norm": 1015.8504638671875, "learning_rate": 0.0001502531403797732, "loss": 35.5563, "step": 11956 }, { "epoch": 31.58005942555299, "grad_norm": 696.3571166992188, "learning_rate": 0.00015021483454573292, "loss": 35.2826, "step": 11957 }, { "epoch": 31.582700561241335, "grad_norm": 653.4765014648438, "learning_rate": 0.00015017653149813488, "loss": 35.2952, "step": 11958 }, { "epoch": 31.585341696929678, "grad_norm": 844.6091918945312, "learning_rate": 0.00015013823123804853, "loss": 35.6603, "step": 11959 }, { "epoch": 31.587982832618025, "grad_norm": 980.7599487304688, "learning_rate": 0.00015009993376654328, "loss": 34.8213, "step": 11960 }, { "epoch": 31.59062396830637, "grad_norm": 2050.892333984375, "learning_rate": 0.00015006163908468884, "loss": 16.677, "step": 11961 }, { "epoch": 31.593265103994717, "grad_norm": 3071.4287109375, "learning_rate": 0.00015002334719355445, "loss": 15.7883, "step": 11962 }, { "epoch": 31.595906239683064, "grad_norm": 1538.721435546875, "learning_rate": 0.00014998505809420932, "loss": 15.7789, "step": 11963 }, { "epoch": 31.59854737537141, "grad_norm": 2346.251220703125, "learning_rate": 0.0001499467717877226, "loss": 16.3509, "step": 11964 }, { "epoch": 31.601188511059757, "grad_norm": 3946.378662109375, "learning_rate": 0.00014990848827516358, "loss": 16.9165, "step": 11965 }, { "epoch": 31.603829646748103, "grad_norm": 1302.0980224609375, "learning_rate": 0.00014987020755760123, "loss": 9.4733, "step": 11966 }, { "epoch": 31.606470782436446, "grad_norm": 1852.8475341796875, "learning_rate": 0.0001498319296361045, "loss": 12.1926, "step": 11967 }, { "epoch": 31.609111918124793, "grad_norm": 729.24609375, "learning_rate": 0.00014979365451174226, "loss": 13.454, "step": 11968 }, { "epoch": 31.61175305381314, "grad_norm": 965.2940673828125, "learning_rate": 0.00014975538218558342, "loss": 16.0771, "step": 11969 }, { "epoch": 31.614394189501485, "grad_norm": 412.7833557128906, "learning_rate": 0.00014971711265869664, "loss": 34.9287, "step": 11970 }, { "epoch": 31.617035325189832, "grad_norm": 1085.442626953125, "learning_rate": 0.00014967884593215054, "loss": 35.6158, "step": 11971 }, { "epoch": 31.61967646087818, "grad_norm": 644.2136840820312, "learning_rate": 0.00014964058200701375, "loss": 34.7875, "step": 11972 }, { "epoch": 31.622317596566525, "grad_norm": 1213.9464111328125, "learning_rate": 0.00014960232088435477, "loss": 36.2038, "step": 11973 }, { "epoch": 31.62495873225487, "grad_norm": 772.1779174804688, "learning_rate": 0.00014956406256524197, "loss": 34.673, "step": 11974 }, { "epoch": 31.627599867943214, "grad_norm": 1703.589111328125, "learning_rate": 0.00014952580705074377, "loss": 34.6168, "step": 11975 }, { "epoch": 31.63024100363156, "grad_norm": 781.4306030273438, "learning_rate": 0.00014948755434192833, "loss": 35.5078, "step": 11976 }, { "epoch": 31.632882139319907, "grad_norm": 778.2421875, "learning_rate": 0.00014944930443986393, "loss": 34.6755, "step": 11977 }, { "epoch": 31.635523275008254, "grad_norm": 1068.3853759765625, "learning_rate": 0.00014941105734561855, "loss": 36.1914, "step": 11978 }, { "epoch": 31.6381644106966, "grad_norm": 1166.9796142578125, "learning_rate": 0.00014937281306026047, "loss": 34.0291, "step": 11979 }, { "epoch": 31.640805546384946, "grad_norm": 1002.121826171875, "learning_rate": 0.0001493345715848574, "loss": 34.5027, "step": 11980 }, { "epoch": 31.643446682073293, "grad_norm": 459.29962158203125, "learning_rate": 0.00014929633292047717, "loss": 35.7384, "step": 11981 }, { "epoch": 31.646087817761636, "grad_norm": 772.5812377929688, "learning_rate": 0.0001492580970681878, "loss": 34.1292, "step": 11982 }, { "epoch": 31.648728953449982, "grad_norm": 516.9810180664062, "learning_rate": 0.0001492198640290569, "loss": 35.8677, "step": 11983 }, { "epoch": 31.65137008913833, "grad_norm": 851.876220703125, "learning_rate": 0.00014918163380415206, "loss": 34.5936, "step": 11984 }, { "epoch": 31.654011224826675, "grad_norm": 814.172607421875, "learning_rate": 0.00014914340639454087, "loss": 35.4429, "step": 11985 }, { "epoch": 31.65665236051502, "grad_norm": 6044.095703125, "learning_rate": 0.00014910518180129084, "loss": 36.5614, "step": 11986 }, { "epoch": 31.659293496203368, "grad_norm": 572.9071044921875, "learning_rate": 0.00014906696002546933, "loss": 37.2924, "step": 11987 }, { "epoch": 31.661934631891715, "grad_norm": 983.5037841796875, "learning_rate": 0.00014902874106814361, "loss": 40.2979, "step": 11988 }, { "epoch": 31.66457576758006, "grad_norm": 786.0078125, "learning_rate": 0.00014899052493038106, "loss": 38.4576, "step": 11989 }, { "epoch": 31.667216903268404, "grad_norm": 469.3016357421875, "learning_rate": 0.00014895231161324876, "loss": 39.5846, "step": 11990 }, { "epoch": 31.66985803895675, "grad_norm": 1851.012451171875, "learning_rate": 0.0001489141011178138, "loss": 37.8916, "step": 11991 }, { "epoch": 31.672499174645097, "grad_norm": 1078.5738525390625, "learning_rate": 0.00014887589344514312, "loss": 42.1983, "step": 11992 }, { "epoch": 31.675140310333443, "grad_norm": 388.1827697753906, "learning_rate": 0.00014883768859630378, "loss": 41.4505, "step": 11993 }, { "epoch": 31.67778144602179, "grad_norm": 957.551025390625, "learning_rate": 0.00014879948657236256, "loss": 40.737, "step": 11994 }, { "epoch": 31.680422581710136, "grad_norm": 681.97412109375, "learning_rate": 0.0001487612873743861, "loss": 41.6219, "step": 11995 }, { "epoch": 31.683063717398483, "grad_norm": 381.99029541015625, "learning_rate": 0.00014872309100344137, "loss": 41.8587, "step": 11996 }, { "epoch": 31.68570485308683, "grad_norm": 527.11376953125, "learning_rate": 0.00014868489746059483, "loss": 39.8626, "step": 11997 }, { "epoch": 31.688345988775172, "grad_norm": 1125.5118408203125, "learning_rate": 0.000148646706746913, "loss": 38.509, "step": 11998 }, { "epoch": 31.69098712446352, "grad_norm": 1879.7841796875, "learning_rate": 0.00014860851886346223, "loss": 39.1346, "step": 11999 }, { "epoch": 31.693628260151865, "grad_norm": 586.5960083007812, "learning_rate": 0.0001485703338113091, "loss": 37.7728, "step": 12000 }, { "epoch": 31.693628260151865, "eval_loss": 3.7790510654449463, "eval_runtime": 2.2106, "eval_samples_per_second": 223.919, "eval_steps_per_second": 28.046, "step": 12000 }, { "epoch": 31.69626939584021, "grad_norm": 486.1837463378906, "learning_rate": 0.00014853215159151985, "loss": 36.7969, "step": 12001 }, { "epoch": 31.698910531528558, "grad_norm": 718.3618774414062, "learning_rate": 0.00014849397220516056, "loss": 35.5973, "step": 12002 }, { "epoch": 31.701551667216904, "grad_norm": 581.2941284179688, "learning_rate": 0.00014845579565329753, "loss": 35.0456, "step": 12003 }, { "epoch": 31.70419280290525, "grad_norm": 569.0777587890625, "learning_rate": 0.00014841762193699677, "loss": 35.6893, "step": 12004 }, { "epoch": 31.706833938593594, "grad_norm": 610.3575439453125, "learning_rate": 0.00014837945105732425, "loss": 35.3898, "step": 12005 }, { "epoch": 31.70947507428194, "grad_norm": 864.2611083984375, "learning_rate": 0.00014834128301534583, "loss": 34.8787, "step": 12006 }, { "epoch": 31.712116209970286, "grad_norm": 1417.598388671875, "learning_rate": 0.00014830311781212737, "loss": 35.2281, "step": 12007 }, { "epoch": 31.714757345658633, "grad_norm": 275.428466796875, "learning_rate": 0.00014826495544873464, "loss": 34.6456, "step": 12008 }, { "epoch": 31.71739848134698, "grad_norm": 1589.3492431640625, "learning_rate": 0.00014822679592623323, "loss": 35.7933, "step": 12009 }, { "epoch": 31.720039617035326, "grad_norm": 1724.25341796875, "learning_rate": 0.00014818863924568877, "loss": 41.0237, "step": 12010 }, { "epoch": 31.722680752723672, "grad_norm": 1377.461669921875, "learning_rate": 0.00014815048540816678, "loss": 12.2235, "step": 12011 }, { "epoch": 31.72532188841202, "grad_norm": 2182.24755859375, "learning_rate": 0.00014811233441473263, "loss": 11.767, "step": 12012 }, { "epoch": 31.72796302410036, "grad_norm": 1093.6805419921875, "learning_rate": 0.00014807418626645158, "loss": 13.4376, "step": 12013 }, { "epoch": 31.730604159788708, "grad_norm": 1240.9608154296875, "learning_rate": 0.00014803604096438918, "loss": 17.838, "step": 12014 }, { "epoch": 31.733245295477055, "grad_norm": 1137.5611572265625, "learning_rate": 0.00014799789850961038, "loss": 10.9819, "step": 12015 }, { "epoch": 31.7358864311654, "grad_norm": 1596.3702392578125, "learning_rate": 0.00014795975890318026, "loss": 12.1083, "step": 12016 }, { "epoch": 31.738527566853747, "grad_norm": 341.6136169433594, "learning_rate": 0.000147921622146164, "loss": 11.7832, "step": 12017 }, { "epoch": 31.741168702542094, "grad_norm": 1090.2532958984375, "learning_rate": 0.0001478834882396265, "loss": 14.1445, "step": 12018 }, { "epoch": 31.74380983823044, "grad_norm": 2973.657958984375, "learning_rate": 0.0001478453571846326, "loss": 15.3277, "step": 12019 }, { "epoch": 31.746450973918787, "grad_norm": 1512.5257568359375, "learning_rate": 0.00014780722898224708, "loss": 32.9315, "step": 12020 }, { "epoch": 31.74909210960713, "grad_norm": 4679.90087890625, "learning_rate": 0.00014776910363353466, "loss": 35.6904, "step": 12021 }, { "epoch": 31.751733245295476, "grad_norm": 1669.8377685546875, "learning_rate": 0.00014773098113955997, "loss": 33.0454, "step": 12022 }, { "epoch": 31.754374380983823, "grad_norm": 5724.56103515625, "learning_rate": 0.00014769286150138753, "loss": 35.9384, "step": 12023 }, { "epoch": 31.75701551667217, "grad_norm": 448.1291809082031, "learning_rate": 0.00014765474472008187, "loss": 35.1005, "step": 12024 }, { "epoch": 31.759656652360515, "grad_norm": 492.9501037597656, "learning_rate": 0.00014761663079670734, "loss": 33.6067, "step": 12025 }, { "epoch": 31.762297788048862, "grad_norm": 805.6570434570312, "learning_rate": 0.00014757851973232824, "loss": 35.9325, "step": 12026 }, { "epoch": 31.76493892373721, "grad_norm": 472.38519287109375, "learning_rate": 0.0001475404115280088, "loss": 34.2222, "step": 12027 }, { "epoch": 31.76758005942555, "grad_norm": 1271.65283203125, "learning_rate": 0.0001475023061848132, "loss": 35.2151, "step": 12028 }, { "epoch": 31.770221195113898, "grad_norm": 450.4981994628906, "learning_rate": 0.0001474642037038055, "loss": 37.2358, "step": 12029 }, { "epoch": 31.772862330802244, "grad_norm": 2404.069580078125, "learning_rate": 0.00014742610408604957, "loss": 36.0227, "step": 12030 }, { "epoch": 31.77550346649059, "grad_norm": 3951.547607421875, "learning_rate": 0.00014738800733260954, "loss": 34.8172, "step": 12031 }, { "epoch": 31.778144602178937, "grad_norm": 771.26318359375, "learning_rate": 0.00014734991344454915, "loss": 35.1709, "step": 12032 }, { "epoch": 31.780785737867284, "grad_norm": 565.1847534179688, "learning_rate": 0.00014731182242293212, "loss": 34.5945, "step": 12033 }, { "epoch": 31.78342687355563, "grad_norm": 785.671142578125, "learning_rate": 0.000147273734268822, "loss": 35.4955, "step": 12034 }, { "epoch": 31.786068009243976, "grad_norm": 1608.1341552734375, "learning_rate": 0.0001472356489832826, "loss": 36.1382, "step": 12035 }, { "epoch": 31.78870914493232, "grad_norm": 1123.9249267578125, "learning_rate": 0.0001471975665673773, "loss": 35.5712, "step": 12036 }, { "epoch": 31.791350280620666, "grad_norm": 1348.6485595703125, "learning_rate": 0.0001471594870221696, "loss": 39.2578, "step": 12037 }, { "epoch": 31.793991416309012, "grad_norm": 653.6434936523438, "learning_rate": 0.0001471214103487228, "loss": 40.5557, "step": 12038 }, { "epoch": 31.79663255199736, "grad_norm": 538.72314453125, "learning_rate": 0.0001470833365481002, "loss": 37.9315, "step": 12039 }, { "epoch": 31.799273687685705, "grad_norm": 620.4283447265625, "learning_rate": 0.000147045265621365, "loss": 39.5645, "step": 12040 }, { "epoch": 31.80191482337405, "grad_norm": 1481.3214111328125, "learning_rate": 0.00014700719756958019, "loss": 40.0303, "step": 12041 }, { "epoch": 31.804555959062398, "grad_norm": 644.512451171875, "learning_rate": 0.00014696913239380896, "loss": 42.0304, "step": 12042 }, { "epoch": 31.807197094750745, "grad_norm": 508.8630065917969, "learning_rate": 0.00014693107009511417, "loss": 44.0236, "step": 12043 }, { "epoch": 31.809838230439087, "grad_norm": 862.4542236328125, "learning_rate": 0.00014689301067455864, "loss": 42.7851, "step": 12044 }, { "epoch": 31.812479366127434, "grad_norm": 938.1173706054688, "learning_rate": 0.0001468549541332053, "loss": 42.374, "step": 12045 }, { "epoch": 31.81512050181578, "grad_norm": 1016.5126953125, "learning_rate": 0.00014681690047211676, "loss": 41.6087, "step": 12046 }, { "epoch": 31.817761637504127, "grad_norm": 1280.1993408203125, "learning_rate": 0.00014677884969235556, "loss": 37.8821, "step": 12047 }, { "epoch": 31.820402773192473, "grad_norm": 3086.08837890625, "learning_rate": 0.00014674080179498446, "loss": 41.7559, "step": 12048 }, { "epoch": 31.82304390888082, "grad_norm": 860.2265625, "learning_rate": 0.0001467027567810658, "loss": 39.3555, "step": 12049 }, { "epoch": 31.825685044569166, "grad_norm": 1059.0579833984375, "learning_rate": 0.000146664714651662, "loss": 36.7719, "step": 12050 }, { "epoch": 31.82832618025751, "grad_norm": 545.1806640625, "learning_rate": 0.00014662667540783523, "loss": 36.3404, "step": 12051 }, { "epoch": 31.830967315945855, "grad_norm": 537.419677734375, "learning_rate": 0.0001465886390506479, "loss": 36.429, "step": 12052 }, { "epoch": 31.833608451634202, "grad_norm": 405.5992736816406, "learning_rate": 0.0001465506055811621, "loss": 35.4552, "step": 12053 }, { "epoch": 31.83624958732255, "grad_norm": 799.9274291992188, "learning_rate": 0.0001465125750004398, "loss": 36.7604, "step": 12054 }, { "epoch": 31.838890723010895, "grad_norm": 527.8311157226562, "learning_rate": 0.00014647454730954313, "loss": 36.2175, "step": 12055 }, { "epoch": 31.84153185869924, "grad_norm": 535.396484375, "learning_rate": 0.00014643652250953388, "loss": 35.0694, "step": 12056 }, { "epoch": 31.844172994387588, "grad_norm": 428.8114013671875, "learning_rate": 0.0001463985006014739, "loss": 35.2127, "step": 12057 }, { "epoch": 31.846814130075934, "grad_norm": 299.0433349609375, "learning_rate": 0.0001463604815864249, "loss": 36.1665, "step": 12058 }, { "epoch": 31.849455265764277, "grad_norm": 477.7793273925781, "learning_rate": 0.00014632246546544863, "loss": 34.9638, "step": 12059 }, { "epoch": 31.852096401452624, "grad_norm": 933.4197998046875, "learning_rate": 0.0001462844522396066, "loss": 35.993, "step": 12060 }, { "epoch": 31.85473753714097, "grad_norm": 4669.25390625, "learning_rate": 0.00014624644190996025, "loss": 28.5974, "step": 12061 }, { "epoch": 31.857378672829316, "grad_norm": 2497.777099609375, "learning_rate": 0.00014620843447757113, "loss": 13.7911, "step": 12062 }, { "epoch": 31.860019808517663, "grad_norm": 8198.328125, "learning_rate": 0.0001461704299435005, "loss": 11.8891, "step": 12063 }, { "epoch": 31.86266094420601, "grad_norm": 3066.855712890625, "learning_rate": 0.00014613242830880963, "loss": 16.5708, "step": 12064 }, { "epoch": 31.865302079894356, "grad_norm": 3100.809814453125, "learning_rate": 0.00014609442957455953, "loss": 14.997, "step": 12065 }, { "epoch": 31.867943215582702, "grad_norm": 1182.8134765625, "learning_rate": 0.0001460564337418116, "loss": 18.2787, "step": 12066 }, { "epoch": 31.870584351271045, "grad_norm": 3528.55419921875, "learning_rate": 0.00014601844081162677, "loss": 16.4559, "step": 12067 }, { "epoch": 31.87322548695939, "grad_norm": 1318.2369384765625, "learning_rate": 0.00014598045078506572, "loss": 13.3287, "step": 12068 }, { "epoch": 31.875866622647738, "grad_norm": 1793.552734375, "learning_rate": 0.00014594246366318955, "loss": 13.1217, "step": 12069 }, { "epoch": 31.878507758336085, "grad_norm": 810.267578125, "learning_rate": 0.000145904479447059, "loss": 24.9805, "step": 12070 }, { "epoch": 31.88114889402443, "grad_norm": 563.8057861328125, "learning_rate": 0.00014586649813773468, "loss": 35.5507, "step": 12071 }, { "epoch": 31.883790029712777, "grad_norm": 746.9653930664062, "learning_rate": 0.00014582851973627716, "loss": 35.9678, "step": 12072 }, { "epoch": 31.886431165401124, "grad_norm": 717.3477172851562, "learning_rate": 0.0001457905442437472, "loss": 35.2284, "step": 12073 }, { "epoch": 31.889072301089467, "grad_norm": 4945.248046875, "learning_rate": 0.00014575257166120497, "loss": 34.332, "step": 12074 }, { "epoch": 31.891713436777813, "grad_norm": 1420.62255859375, "learning_rate": 0.00014571460198971087, "loss": 34.1545, "step": 12075 }, { "epoch": 31.89435457246616, "grad_norm": 838.103271484375, "learning_rate": 0.00014567663523032534, "loss": 35.4473, "step": 12076 }, { "epoch": 31.896995708154506, "grad_norm": 827.0696411132812, "learning_rate": 0.00014563867138410848, "loss": 35.057, "step": 12077 }, { "epoch": 31.899636843842853, "grad_norm": 1239.039794921875, "learning_rate": 0.00014560071045212043, "loss": 34.5351, "step": 12078 }, { "epoch": 31.9022779795312, "grad_norm": 565.4935302734375, "learning_rate": 0.0001455627524354211, "loss": 34.3536, "step": 12079 }, { "epoch": 31.904919115219545, "grad_norm": 3927.72509765625, "learning_rate": 0.00014552479733507067, "loss": 35.0567, "step": 12080 }, { "epoch": 31.907560250907892, "grad_norm": 916.8865966796875, "learning_rate": 0.0001454868451521289, "loss": 35.1972, "step": 12081 }, { "epoch": 31.910201386596235, "grad_norm": 1875.937744140625, "learning_rate": 0.00014544889588765557, "loss": 35.0227, "step": 12082 }, { "epoch": 31.91284252228458, "grad_norm": 739.6204833984375, "learning_rate": 0.00014541094954271042, "loss": 34.2137, "step": 12083 }, { "epoch": 31.915483657972928, "grad_norm": 742.9683837890625, "learning_rate": 0.00014537300611835308, "loss": 33.4212, "step": 12084 }, { "epoch": 31.918124793661274, "grad_norm": 2194.787353515625, "learning_rate": 0.00014533506561564306, "loss": 33.7573, "step": 12085 }, { "epoch": 31.92076592934962, "grad_norm": 961.0106201171875, "learning_rate": 0.00014529712803563975, "loss": 36.527, "step": 12086 }, { "epoch": 31.923407065037967, "grad_norm": 2174.298583984375, "learning_rate": 0.00014525919337940275, "loss": 38.8116, "step": 12087 }, { "epoch": 31.926048200726314, "grad_norm": 2271.18505859375, "learning_rate": 0.0001452212616479912, "loss": 40.7528, "step": 12088 }, { "epoch": 31.92868933641466, "grad_norm": 470.3843688964844, "learning_rate": 0.00014518333284246432, "loss": 40.1969, "step": 12089 }, { "epoch": 31.931330472103003, "grad_norm": 1504.2437744140625, "learning_rate": 0.0001451454069638815, "loss": 40.707, "step": 12090 }, { "epoch": 31.93397160779135, "grad_norm": 432.93145751953125, "learning_rate": 0.00014510748401330145, "loss": 40.0882, "step": 12091 }, { "epoch": 31.936612743479696, "grad_norm": 987.4118041992188, "learning_rate": 0.00014506956399178335, "loss": 40.1003, "step": 12092 }, { "epoch": 31.939253879168042, "grad_norm": 505.43890380859375, "learning_rate": 0.00014503164690038585, "loss": 39.5061, "step": 12093 }, { "epoch": 31.94189501485639, "grad_norm": 399.5195617675781, "learning_rate": 0.00014499373274016813, "loss": 37.403, "step": 12094 }, { "epoch": 31.944536150544735, "grad_norm": 1160.58544921875, "learning_rate": 0.00014495582151218873, "loss": 36.6763, "step": 12095 }, { "epoch": 31.94717728623308, "grad_norm": 1076.2142333984375, "learning_rate": 0.00014491791321750617, "loss": 37.6614, "step": 12096 }, { "epoch": 31.949818421921425, "grad_norm": 715.5043334960938, "learning_rate": 0.00014488000785717924, "loss": 35.5424, "step": 12097 }, { "epoch": 31.95245955760977, "grad_norm": 872.9754638671875, "learning_rate": 0.00014484210543226639, "loss": 34.7595, "step": 12098 }, { "epoch": 31.955100693298117, "grad_norm": 380.13360595703125, "learning_rate": 0.000144804205943826, "loss": 34.8385, "step": 12099 }, { "epoch": 31.957741828986464, "grad_norm": 3467.50830078125, "learning_rate": 0.0001447663093929163, "loss": 35.4643, "step": 12100 }, { "epoch": 31.96038296467481, "grad_norm": 3288.198974609375, "learning_rate": 0.00014472841578059558, "loss": 9.4714, "step": 12101 }, { "epoch": 31.963024100363157, "grad_norm": 816.7288818359375, "learning_rate": 0.00014469052510792207, "loss": 11.5838, "step": 12102 }, { "epoch": 31.965665236051503, "grad_norm": 2080.834228515625, "learning_rate": 0.00014465263737595363, "loss": 10.2118, "step": 12103 }, { "epoch": 31.96830637173985, "grad_norm": 4479.32080078125, "learning_rate": 0.00014461475258574854, "loss": 11.4502, "step": 12104 }, { "epoch": 31.970947507428193, "grad_norm": 4195.31884765625, "learning_rate": 0.00014457687073836457, "loss": 17.4883, "step": 12105 }, { "epoch": 31.97358864311654, "grad_norm": 508.4117431640625, "learning_rate": 0.00014453899183485957, "loss": 31.5539, "step": 12106 }, { "epoch": 31.976229778804885, "grad_norm": 706.76171875, "learning_rate": 0.00014450111587629118, "loss": 34.5257, "step": 12107 }, { "epoch": 31.978870914493232, "grad_norm": 991.900146484375, "learning_rate": 0.0001444632428637173, "loss": 34.9218, "step": 12108 }, { "epoch": 31.98151205018158, "grad_norm": 847.473388671875, "learning_rate": 0.0001444253727981953, "loss": 34.666, "step": 12109 }, { "epoch": 31.984153185869925, "grad_norm": 583.883056640625, "learning_rate": 0.00014438750568078265, "loss": 34.6037, "step": 12110 }, { "epoch": 31.98679432155827, "grad_norm": 1169.1519775390625, "learning_rate": 0.000144349641512537, "loss": 34.2118, "step": 12111 }, { "epoch": 31.989435457246618, "grad_norm": 425.7148132324219, "learning_rate": 0.00014431178029451546, "loss": 34.9305, "step": 12112 }, { "epoch": 31.99207659293496, "grad_norm": 431.1771545410156, "learning_rate": 0.00014427392202777544, "loss": 34.1989, "step": 12113 }, { "epoch": 31.994717728623307, "grad_norm": 705.6990966796875, "learning_rate": 0.00014423606671337393, "loss": 34.5478, "step": 12114 }, { "epoch": 31.997358864311654, "grad_norm": 1802.2427978515625, "learning_rate": 0.00014419821435236822, "loss": 35.1833, "step": 12115 }, { "epoch": 32.0, "grad_norm": 2218.279541015625, "learning_rate": 0.00014416036494581521, "loss": 40.6987, "step": 12116 }, { "epoch": 32.00264113568834, "grad_norm": 712.4857788085938, "learning_rate": 0.00014412251849477186, "loss": 39.25, "step": 12117 }, { "epoch": 32.00528227137669, "grad_norm": 635.12109375, "learning_rate": 0.000144084675000295, "loss": 38.8262, "step": 12118 }, { "epoch": 32.007923407065036, "grad_norm": 427.17864990234375, "learning_rate": 0.0001440468344634413, "loss": 38.5259, "step": 12119 }, { "epoch": 32.010564542753386, "grad_norm": 716.6590576171875, "learning_rate": 0.00014400899688526755, "loss": 39.7171, "step": 12120 }, { "epoch": 32.01320567844173, "grad_norm": 705.0234375, "learning_rate": 0.00014397116226683022, "loss": 42.6218, "step": 12121 }, { "epoch": 32.01584681413008, "grad_norm": 1079.136962890625, "learning_rate": 0.00014393333060918598, "loss": 42.5813, "step": 12122 }, { "epoch": 32.01848794981842, "grad_norm": 763.9215698242188, "learning_rate": 0.00014389550191339117, "loss": 41.0388, "step": 12123 }, { "epoch": 32.021129085506765, "grad_norm": 461.9720153808594, "learning_rate": 0.00014385767618050205, "loss": 40.4452, "step": 12124 }, { "epoch": 32.023770221195115, "grad_norm": 1090.3270263671875, "learning_rate": 0.00014381985341157516, "loss": 40.7274, "step": 12125 }, { "epoch": 32.02641135688346, "grad_norm": 796.19287109375, "learning_rate": 0.0001437820336076664, "loss": 38.9947, "step": 12126 }, { "epoch": 32.02905249257181, "grad_norm": 365.47918701171875, "learning_rate": 0.00014374421676983197, "loss": 36.8878, "step": 12127 }, { "epoch": 32.03169362826015, "grad_norm": 435.6695861816406, "learning_rate": 0.00014370640289912773, "loss": 37.9126, "step": 12128 }, { "epoch": 32.0343347639485, "grad_norm": 857.0391235351562, "learning_rate": 0.00014366859199660986, "loss": 35.5872, "step": 12129 }, { "epoch": 32.03697589963684, "grad_norm": 436.5298767089844, "learning_rate": 0.00014363078406333412, "loss": 36.1276, "step": 12130 }, { "epoch": 32.03961703532519, "grad_norm": 554.660400390625, "learning_rate": 0.00014359297910035616, "loss": 35.9316, "step": 12131 }, { "epoch": 32.042258171013536, "grad_norm": 556.2432250976562, "learning_rate": 0.00014355517710873183, "loss": 35.0241, "step": 12132 }, { "epoch": 32.04489930670188, "grad_norm": 1166.9654541015625, "learning_rate": 0.00014351737808951666, "loss": 34.7311, "step": 12133 }, { "epoch": 32.04754044239023, "grad_norm": 526.9652099609375, "learning_rate": 0.00014347958204376612, "loss": 34.2454, "step": 12134 }, { "epoch": 32.05018157807857, "grad_norm": 634.200927734375, "learning_rate": 0.0001434417889725357, "loss": 34.6508, "step": 12135 }, { "epoch": 32.05282271376692, "grad_norm": 896.5753173828125, "learning_rate": 0.00014340399887688074, "loss": 33.9398, "step": 12136 }, { "epoch": 32.055463849455265, "grad_norm": 415.74493408203125, "learning_rate": 0.00014336621175785647, "loss": 35.6353, "step": 12137 }, { "epoch": 32.058104985143615, "grad_norm": 1969.9464111328125, "learning_rate": 0.000143328427616518, "loss": 47.2148, "step": 12138 }, { "epoch": 32.06074612083196, "grad_norm": 6402.9189453125, "learning_rate": 0.00014329064645392066, "loss": 12.4898, "step": 12139 }, { "epoch": 32.0633872565203, "grad_norm": 3341.9150390625, "learning_rate": 0.0001432528682711193, "loss": 16.2872, "step": 12140 }, { "epoch": 32.06602839220865, "grad_norm": 1691.764404296875, "learning_rate": 0.0001432150930691689, "loss": 16.1319, "step": 12141 }, { "epoch": 32.06866952789699, "grad_norm": 1468.416015625, "learning_rate": 0.0001431773208491242, "loss": 10.0717, "step": 12142 }, { "epoch": 32.07131066358534, "grad_norm": 1114.9288330078125, "learning_rate": 0.00014313955161204025, "loss": 11.5344, "step": 12143 }, { "epoch": 32.073951799273686, "grad_norm": 2013.4888916015625, "learning_rate": 0.00014310178535897146, "loss": 12.5292, "step": 12144 }, { "epoch": 32.07659293496204, "grad_norm": 1271.7669677734375, "learning_rate": 0.0001430640220909724, "loss": 9.5802, "step": 12145 }, { "epoch": 32.07923407065038, "grad_norm": 1061.297119140625, "learning_rate": 0.00014302626180909785, "loss": 13.5837, "step": 12146 }, { "epoch": 32.08187520633872, "grad_norm": 1962.01025390625, "learning_rate": 0.00014298850451440208, "loss": 21.1371, "step": 12147 }, { "epoch": 32.08451634202707, "grad_norm": 546.698974609375, "learning_rate": 0.00014295075020793942, "loss": 35.2546, "step": 12148 }, { "epoch": 32.087157477715415, "grad_norm": 611.9085693359375, "learning_rate": 0.0001429129988907641, "loss": 34.2609, "step": 12149 }, { "epoch": 32.089798613403765, "grad_norm": 933.9723510742188, "learning_rate": 0.0001428752505639305, "loss": 35.888, "step": 12150 }, { "epoch": 32.09243974909211, "grad_norm": 790.4387817382812, "learning_rate": 0.0001428375052284926, "loss": 34.7346, "step": 12151 }, { "epoch": 32.09508088478046, "grad_norm": 537.6499633789062, "learning_rate": 0.0001427997628855044, "loss": 34.9825, "step": 12152 }, { "epoch": 32.0977220204688, "grad_norm": 1374.9224853515625, "learning_rate": 0.00014276202353601984, "loss": 33.9858, "step": 12153 }, { "epoch": 32.10036315615715, "grad_norm": 4696.744140625, "learning_rate": 0.0001427242871810928, "loss": 35.4516, "step": 12154 }, { "epoch": 32.103004291845494, "grad_norm": 930.5995483398438, "learning_rate": 0.00014268655382177698, "loss": 33.7858, "step": 12155 }, { "epoch": 32.10564542753384, "grad_norm": 734.2909545898438, "learning_rate": 0.00014264882345912605, "loss": 33.6452, "step": 12156 }, { "epoch": 32.10828656322219, "grad_norm": 5017.982421875, "learning_rate": 0.00014261109609419375, "loss": 34.8351, "step": 12157 }, { "epoch": 32.11092769891053, "grad_norm": 393.92657470703125, "learning_rate": 0.0001425733717280335, "loss": 35.1651, "step": 12158 }, { "epoch": 32.11356883459888, "grad_norm": 1215.4398193359375, "learning_rate": 0.00014253565036169863, "loss": 35.0339, "step": 12159 }, { "epoch": 32.11620997028722, "grad_norm": 762.8419189453125, "learning_rate": 0.00014249793199624282, "loss": 34.6743, "step": 12160 }, { "epoch": 32.11885110597557, "grad_norm": 530.3524169921875, "learning_rate": 0.00014246021663271902, "loss": 33.3271, "step": 12161 }, { "epoch": 32.121492241663915, "grad_norm": 1563.593505859375, "learning_rate": 0.00014242250427218045, "loss": 33.606, "step": 12162 }, { "epoch": 32.12413337735226, "grad_norm": 579.625, "learning_rate": 0.0001423847949156802, "loss": 34.5745, "step": 12163 }, { "epoch": 32.12677451304061, "grad_norm": 1229.2857666015625, "learning_rate": 0.00014234708856427142, "loss": 35.8328, "step": 12164 }, { "epoch": 32.12941564872895, "grad_norm": 2197.927490234375, "learning_rate": 0.00014230938521900694, "loss": 34.883, "step": 12165 }, { "epoch": 32.1320567844173, "grad_norm": 1563.547119140625, "learning_rate": 0.00014227168488093955, "loss": 39.2, "step": 12166 }, { "epoch": 32.134697920105644, "grad_norm": 526.9262084960938, "learning_rate": 0.00014223398755112217, "loss": 39.1864, "step": 12167 }, { "epoch": 32.137339055793994, "grad_norm": 887.877197265625, "learning_rate": 0.00014219629323060734, "loss": 39.8397, "step": 12168 }, { "epoch": 32.13998019148234, "grad_norm": 724.404541015625, "learning_rate": 0.0001421586019204477, "loss": 39.1703, "step": 12169 }, { "epoch": 32.14262132717068, "grad_norm": 1274.0146484375, "learning_rate": 0.0001421209136216958, "loss": 40.9844, "step": 12170 }, { "epoch": 32.14526246285903, "grad_norm": 493.2265625, "learning_rate": 0.00014208322833540398, "loss": 42.2481, "step": 12171 }, { "epoch": 32.14790359854737, "grad_norm": 473.2431640625, "learning_rate": 0.00014204554606262463, "loss": 39.5091, "step": 12172 }, { "epoch": 32.15054473423572, "grad_norm": 963.1893920898438, "learning_rate": 0.0001420078668044099, "loss": 41.9441, "step": 12173 }, { "epoch": 32.153185869924066, "grad_norm": 542.927490234375, "learning_rate": 0.0001419701905618121, "loss": 40.0783, "step": 12174 }, { "epoch": 32.155827005612416, "grad_norm": 2100.561767578125, "learning_rate": 0.0001419325173358833, "loss": 40.9725, "step": 12175 }, { "epoch": 32.15846814130076, "grad_norm": 775.9593505859375, "learning_rate": 0.00014189484712767548, "loss": 39.0033, "step": 12176 }, { "epoch": 32.16110927698911, "grad_norm": 883.1022338867188, "learning_rate": 0.00014185717993824045, "loss": 39.9197, "step": 12177 }, { "epoch": 32.16375041267745, "grad_norm": 1063.04150390625, "learning_rate": 0.00014181951576863034, "loss": 38.3561, "step": 12178 }, { "epoch": 32.166391548365795, "grad_norm": 852.4481811523438, "learning_rate": 0.0001417818546198966, "loss": 37.3005, "step": 12179 }, { "epoch": 32.169032684054145, "grad_norm": 629.644287109375, "learning_rate": 0.00014174419649309089, "loss": 36.4965, "step": 12180 }, { "epoch": 32.17167381974249, "grad_norm": 652.4221801757812, "learning_rate": 0.00014170654138926503, "loss": 35.3255, "step": 12181 }, { "epoch": 32.17431495543084, "grad_norm": 857.7116088867188, "learning_rate": 0.00014166888930947037, "loss": 35.0681, "step": 12182 }, { "epoch": 32.17695609111918, "grad_norm": 1440.944580078125, "learning_rate": 0.0001416312402547582, "loss": 35.5136, "step": 12183 }, { "epoch": 32.17959722680753, "grad_norm": 942.9962158203125, "learning_rate": 0.00014159359422618014, "loss": 35.479, "step": 12184 }, { "epoch": 32.18223836249587, "grad_norm": 861.1096801757812, "learning_rate": 0.00014155595122478725, "loss": 34.7587, "step": 12185 }, { "epoch": 32.184879498184216, "grad_norm": 489.3499450683594, "learning_rate": 0.00014151831125163076, "loss": 34.1554, "step": 12186 }, { "epoch": 32.187520633872566, "grad_norm": 1336.35986328125, "learning_rate": 0.00014148067430776168, "loss": 35.5167, "step": 12187 }, { "epoch": 32.19016176956091, "grad_norm": 1548.5533447265625, "learning_rate": 0.000141443040394231, "loss": 35.5778, "step": 12188 }, { "epoch": 32.19280290524926, "grad_norm": 1272.092529296875, "learning_rate": 0.00014140540951208968, "loss": 33.1604, "step": 12189 }, { "epoch": 32.1954440409376, "grad_norm": 3651.8994140625, "learning_rate": 0.0001413677816623884, "loss": 10.2596, "step": 12190 }, { "epoch": 32.19808517662595, "grad_norm": 2071.074951171875, "learning_rate": 0.00014133015684617806, "loss": 16.7104, "step": 12191 }, { "epoch": 32.200726312314295, "grad_norm": 343.5858459472656, "learning_rate": 0.00014129253506450928, "loss": 7.2905, "step": 12192 }, { "epoch": 32.20336744800264, "grad_norm": 1370.722412109375, "learning_rate": 0.00014125491631843262, "loss": 18.4206, "step": 12193 }, { "epoch": 32.20600858369099, "grad_norm": 981.6693725585938, "learning_rate": 0.00014121730060899845, "loss": 8.7046, "step": 12194 }, { "epoch": 32.20864971937933, "grad_norm": 6376.4736328125, "learning_rate": 0.00014117968793725744, "loss": 15.7716, "step": 12195 }, { "epoch": 32.21129085506768, "grad_norm": 870.04638671875, "learning_rate": 0.00014114207830425959, "loss": 10.0817, "step": 12196 }, { "epoch": 32.21393199075602, "grad_norm": 87375.4375, "learning_rate": 0.0001411044717110552, "loss": 12.3379, "step": 12197 }, { "epoch": 32.21657312644437, "grad_norm": 485.40655517578125, "learning_rate": 0.00014106686815869453, "loss": 29.4638, "step": 12198 }, { "epoch": 32.219214262132716, "grad_norm": 2034.5135498046875, "learning_rate": 0.0001410292676482276, "loss": 35.1248, "step": 12199 }, { "epoch": 32.221855397821066, "grad_norm": 1286.6754150390625, "learning_rate": 0.00014099167018070437, "loss": 35.5756, "step": 12200 }, { "epoch": 32.221855397821066, "eval_loss": 3.7962210178375244, "eval_runtime": 2.1509, "eval_samples_per_second": 230.139, "eval_steps_per_second": 28.826, "step": 12200 }, { "epoch": 32.22449653350941, "grad_norm": 1493.4691162109375, "learning_rate": 0.00014095407575717461, "loss": 35.5378, "step": 12201 }, { "epoch": 32.22713766919775, "grad_norm": 739.6961059570312, "learning_rate": 0.00014091648437868833, "loss": 34.4006, "step": 12202 }, { "epoch": 32.2297788048861, "grad_norm": 3825.380859375, "learning_rate": 0.00014087889604629512, "loss": 34.8248, "step": 12203 }, { "epoch": 32.232419940574445, "grad_norm": 801.38134765625, "learning_rate": 0.00014084131076104464, "loss": 34.6929, "step": 12204 }, { "epoch": 32.235061076262795, "grad_norm": 575.5792236328125, "learning_rate": 0.00014080372852398644, "loss": 34.4986, "step": 12205 }, { "epoch": 32.23770221195114, "grad_norm": 1062.57666015625, "learning_rate": 0.00014076614933617, "loss": 34.1658, "step": 12206 }, { "epoch": 32.24034334763949, "grad_norm": 1311.792236328125, "learning_rate": 0.00014072857319864462, "loss": 34.5979, "step": 12207 }, { "epoch": 32.24298448332783, "grad_norm": 1156.1378173828125, "learning_rate": 0.0001406910001124596, "loss": 34.9686, "step": 12208 }, { "epoch": 32.245625619016174, "grad_norm": 3498.230224609375, "learning_rate": 0.00014065343007866427, "loss": 37.2131, "step": 12209 }, { "epoch": 32.248266754704524, "grad_norm": 1713.4232177734375, "learning_rate": 0.00014061586309830766, "loss": 35.5822, "step": 12210 }, { "epoch": 32.25090789039287, "grad_norm": 2090.821533203125, "learning_rate": 0.00014057829917243876, "loss": 33.8833, "step": 12211 }, { "epoch": 32.25354902608122, "grad_norm": 2054.08642578125, "learning_rate": 0.00014054073830210663, "loss": 34.5744, "step": 12212 }, { "epoch": 32.25619016176956, "grad_norm": 1585.307373046875, "learning_rate": 0.0001405031804883602, "loss": 33.8881, "step": 12213 }, { "epoch": 32.25883129745791, "grad_norm": 1920.439453125, "learning_rate": 0.00014046562573224802, "loss": 35.5952, "step": 12214 }, { "epoch": 32.26147243314625, "grad_norm": 1573.8798828125, "learning_rate": 0.0001404280740348188, "loss": 37.2991, "step": 12215 }, { "epoch": 32.264113568834595, "grad_norm": 9918.4248046875, "learning_rate": 0.00014039052539712135, "loss": 41.1222, "step": 12216 }, { "epoch": 32.266754704522945, "grad_norm": 634.6954956054688, "learning_rate": 0.00014035297982020409, "loss": 40.3909, "step": 12217 }, { "epoch": 32.26939584021129, "grad_norm": 686.4661865234375, "learning_rate": 0.00014031543730511537, "loss": 39.3024, "step": 12218 }, { "epoch": 32.27203697589964, "grad_norm": 829.98876953125, "learning_rate": 0.00014027789785290375, "loss": 38.9081, "step": 12219 }, { "epoch": 32.27467811158798, "grad_norm": 573.3073120117188, "learning_rate": 0.00014024036146461732, "loss": 40.9082, "step": 12220 }, { "epoch": 32.27731924727633, "grad_norm": 587.898193359375, "learning_rate": 0.00014020282814130436, "loss": 43.5901, "step": 12221 }, { "epoch": 32.279960382964674, "grad_norm": 583.7857055664062, "learning_rate": 0.00014016529788401288, "loss": 43.1098, "step": 12222 }, { "epoch": 32.282601518653024, "grad_norm": 441.00689697265625, "learning_rate": 0.00014012777069379102, "loss": 40.5815, "step": 12223 }, { "epoch": 32.28524265434137, "grad_norm": 499.6117248535156, "learning_rate": 0.00014009024657168657, "loss": 43.6012, "step": 12224 }, { "epoch": 32.28788379002971, "grad_norm": 1302.2913818359375, "learning_rate": 0.00014005272551874731, "loss": 41.0662, "step": 12225 }, { "epoch": 32.29052492571806, "grad_norm": 827.0116577148438, "learning_rate": 0.0001400152075360212, "loss": 38.8216, "step": 12226 }, { "epoch": 32.2931660614064, "grad_norm": 444.72113037109375, "learning_rate": 0.00013997769262455583, "loss": 38.4882, "step": 12227 }, { "epoch": 32.29580719709475, "grad_norm": 672.8353881835938, "learning_rate": 0.00013994018078539877, "loss": 37.7413, "step": 12228 }, { "epoch": 32.298448332783096, "grad_norm": 712.3863525390625, "learning_rate": 0.00013990267201959738, "loss": 36.4118, "step": 12229 }, { "epoch": 32.301089468471446, "grad_norm": 681.659423828125, "learning_rate": 0.00013986516632819945, "loss": 37.5172, "step": 12230 }, { "epoch": 32.30373060415979, "grad_norm": 526.7298583984375, "learning_rate": 0.0001398276637122519, "loss": 35.9837, "step": 12231 }, { "epoch": 32.30637173984813, "grad_norm": 770.3709106445312, "learning_rate": 0.00013979016417280205, "loss": 35.8924, "step": 12232 }, { "epoch": 32.30901287553648, "grad_norm": 706.2981567382812, "learning_rate": 0.00013975266771089722, "loss": 34.343, "step": 12233 }, { "epoch": 32.311654011224825, "grad_norm": 1060.7027587890625, "learning_rate": 0.0001397151743275844, "loss": 35.2117, "step": 12234 }, { "epoch": 32.314295146913175, "grad_norm": 810.2196044921875, "learning_rate": 0.0001396776840239105, "loss": 34.5812, "step": 12235 }, { "epoch": 32.31693628260152, "grad_norm": 755.246337890625, "learning_rate": 0.00013964019680092244, "loss": 35.6558, "step": 12236 }, { "epoch": 32.31957741828987, "grad_norm": 416.3034362792969, "learning_rate": 0.00013960271265966713, "loss": 34.7998, "step": 12237 }, { "epoch": 32.32221855397821, "grad_norm": 7739.1943359375, "learning_rate": 0.00013956523160119116, "loss": 44.0105, "step": 12238 }, { "epoch": 32.32485968966655, "grad_norm": 467.5049743652344, "learning_rate": 0.0001395277536265413, "loss": 17.2609, "step": 12239 }, { "epoch": 32.3275008253549, "grad_norm": 2003.640380859375, "learning_rate": 0.00013949027873676396, "loss": 13.3585, "step": 12240 }, { "epoch": 32.330141961043246, "grad_norm": 724.8034057617188, "learning_rate": 0.0001394528069329057, "loss": 9.6617, "step": 12241 }, { "epoch": 32.332783096731596, "grad_norm": 987.0281372070312, "learning_rate": 0.0001394153382160129, "loss": 15.0797, "step": 12242 }, { "epoch": 32.33542423241994, "grad_norm": 794.8633422851562, "learning_rate": 0.0001393778725871317, "loss": 16.1475, "step": 12243 }, { "epoch": 32.33806536810829, "grad_norm": 3744.466796875, "learning_rate": 0.0001393404100473085, "loss": 12.4524, "step": 12244 }, { "epoch": 32.34070650379663, "grad_norm": 602.9390258789062, "learning_rate": 0.00013930295059758936, "loss": 12.2072, "step": 12245 }, { "epoch": 32.34334763948498, "grad_norm": 2328.9560546875, "learning_rate": 0.0001392654942390202, "loss": 13.9826, "step": 12246 }, { "epoch": 32.345988775173325, "grad_norm": 1723.251708984375, "learning_rate": 0.00013922804097264713, "loss": 11.0997, "step": 12247 }, { "epoch": 32.34862991086167, "grad_norm": 509.5841369628906, "learning_rate": 0.0001391905907995161, "loss": 25.005, "step": 12248 }, { "epoch": 32.35127104655002, "grad_norm": 602.9991455078125, "learning_rate": 0.0001391531437206726, "loss": 36.9061, "step": 12249 }, { "epoch": 32.35391218223836, "grad_norm": 550.5824584960938, "learning_rate": 0.00013911569973716235, "loss": 35.996, "step": 12250 }, { "epoch": 32.35655331792671, "grad_norm": 1706.4615478515625, "learning_rate": 0.00013907825885003114, "loss": 36.7439, "step": 12251 }, { "epoch": 32.35919445361505, "grad_norm": 406.9903564453125, "learning_rate": 0.0001390408210603244, "loss": 34.4043, "step": 12252 }, { "epoch": 32.3618355893034, "grad_norm": 1178.7152099609375, "learning_rate": 0.00013900338636908748, "loss": 36.1112, "step": 12253 }, { "epoch": 32.364476724991746, "grad_norm": 628.8358154296875, "learning_rate": 0.00013896595477736585, "loss": 35.0439, "step": 12254 }, { "epoch": 32.36711786068009, "grad_norm": 695.2954711914062, "learning_rate": 0.00013892852628620472, "loss": 32.6237, "step": 12255 }, { "epoch": 32.36975899636844, "grad_norm": 648.6275634765625, "learning_rate": 0.00013889110089664925, "loss": 34.4781, "step": 12256 }, { "epoch": 32.37240013205678, "grad_norm": 982.4111938476562, "learning_rate": 0.00013885367860974452, "loss": 33.8424, "step": 12257 }, { "epoch": 32.37504126774513, "grad_norm": 566.7881469726562, "learning_rate": 0.00013881625942653553, "loss": 34.3832, "step": 12258 }, { "epoch": 32.377682403433475, "grad_norm": 609.246826171875, "learning_rate": 0.00013877884334806718, "loss": 36.6783, "step": 12259 }, { "epoch": 32.380323539121825, "grad_norm": 1523.254150390625, "learning_rate": 0.00013874143037538418, "loss": 35.9881, "step": 12260 }, { "epoch": 32.38296467481017, "grad_norm": 1958.5172119140625, "learning_rate": 0.00013870402050953148, "loss": 35.1511, "step": 12261 }, { "epoch": 32.38560581049851, "grad_norm": 551.3795166015625, "learning_rate": 0.00013866661375155364, "loss": 35.0394, "step": 12262 }, { "epoch": 32.38824694618686, "grad_norm": 804.2717895507812, "learning_rate": 0.00013862921010249522, "loss": 34.852, "step": 12263 }, { "epoch": 32.390888081875204, "grad_norm": 1233.4931640625, "learning_rate": 0.00013859180956340056, "loss": 34.8238, "step": 12264 }, { "epoch": 32.393529217563554, "grad_norm": 1687.2613525390625, "learning_rate": 0.00013855441213531428, "loss": 36.2577, "step": 12265 }, { "epoch": 32.3961703532519, "grad_norm": 845.615234375, "learning_rate": 0.0001385170178192807, "loss": 42.3203, "step": 12266 }, { "epoch": 32.39881148894025, "grad_norm": 470.2928161621094, "learning_rate": 0.00013847962661634367, "loss": 38.4918, "step": 12267 }, { "epoch": 32.40145262462859, "grad_norm": 402.1431579589844, "learning_rate": 0.00013844223852754766, "loss": 39.7072, "step": 12268 }, { "epoch": 32.40409376031694, "grad_norm": 1790.959716796875, "learning_rate": 0.00013840485355393666, "loss": 38.814, "step": 12269 }, { "epoch": 32.40673489600528, "grad_norm": 820.9305419921875, "learning_rate": 0.00013836747169655454, "loss": 39.938, "step": 12270 }, { "epoch": 32.409376031693625, "grad_norm": 399.59881591796875, "learning_rate": 0.00013833009295644507, "loss": 42.3761, "step": 12271 }, { "epoch": 32.412017167381975, "grad_norm": 574.8585205078125, "learning_rate": 0.00013829271733465227, "loss": 42.2345, "step": 12272 }, { "epoch": 32.41465830307032, "grad_norm": 401.01422119140625, "learning_rate": 0.00013825534483221974, "loss": 44.8873, "step": 12273 }, { "epoch": 32.41729943875867, "grad_norm": 788.85400390625, "learning_rate": 0.00013821797545019105, "loss": 40.7779, "step": 12274 }, { "epoch": 32.41994057444701, "grad_norm": 1551.565673828125, "learning_rate": 0.00013818060918960974, "loss": 39.882, "step": 12275 }, { "epoch": 32.42258171013536, "grad_norm": 3598.739990234375, "learning_rate": 0.0001381432460515192, "loss": 39.6535, "step": 12276 }, { "epoch": 32.425222845823704, "grad_norm": 727.2115478515625, "learning_rate": 0.00013810588603696284, "loss": 40.489, "step": 12277 }, { "epoch": 32.42786398151205, "grad_norm": 2431.46533203125, "learning_rate": 0.00013806852914698375, "loss": 37.9184, "step": 12278 }, { "epoch": 32.4305051172004, "grad_norm": 714.262939453125, "learning_rate": 0.00013803117538262533, "loss": 37.0973, "step": 12279 }, { "epoch": 32.43314625288874, "grad_norm": 1684.142822265625, "learning_rate": 0.0001379938247449306, "loss": 36.9128, "step": 12280 }, { "epoch": 32.43578738857709, "grad_norm": 740.0056762695312, "learning_rate": 0.0001379564772349424, "loss": 36.5514, "step": 12281 }, { "epoch": 32.43842852426543, "grad_norm": 389.33416748046875, "learning_rate": 0.00013791913285370386, "loss": 34.8791, "step": 12282 }, { "epoch": 32.44106965995378, "grad_norm": 925.73828125, "learning_rate": 0.00013788179160225777, "loss": 35.1673, "step": 12283 }, { "epoch": 32.443710795642126, "grad_norm": 964.7802124023438, "learning_rate": 0.00013784445348164674, "loss": 34.8047, "step": 12284 }, { "epoch": 32.44635193133047, "grad_norm": 654.2106323242188, "learning_rate": 0.00013780711849291333, "loss": 34.7188, "step": 12285 }, { "epoch": 32.44899306701882, "grad_norm": 1603.7010498046875, "learning_rate": 0.00013776978663710034, "loss": 35.6772, "step": 12286 }, { "epoch": 32.45163420270716, "grad_norm": 1267.1575927734375, "learning_rate": 0.0001377324579152501, "loss": 34.861, "step": 12287 }, { "epoch": 32.45427533839551, "grad_norm": 738.0974731445312, "learning_rate": 0.00013769513232840495, "loss": 35.8191, "step": 12288 }, { "epoch": 32.456916474083854, "grad_norm": 2388.471435546875, "learning_rate": 0.00013765780987760735, "loss": 25.0849, "step": 12289 }, { "epoch": 32.459557609772205, "grad_norm": 1678.6627197265625, "learning_rate": 0.00013762049056389943, "loss": 10.413, "step": 12290 }, { "epoch": 32.46219874546055, "grad_norm": 1204.125, "learning_rate": 0.0001375831743883233, "loss": 13.6518, "step": 12291 }, { "epoch": 32.4648398811489, "grad_norm": 3488.5654296875, "learning_rate": 0.00013754586135192093, "loss": 12.3187, "step": 12292 }, { "epoch": 32.46748101683724, "grad_norm": 971.8365478515625, "learning_rate": 0.00013750855145573437, "loss": 10.9314, "step": 12293 }, { "epoch": 32.47012215252558, "grad_norm": 1378.1029052734375, "learning_rate": 0.00013747124470080545, "loss": 10.9218, "step": 12294 }, { "epoch": 32.47276328821393, "grad_norm": 2050.572509765625, "learning_rate": 0.00013743394108817577, "loss": 15.338, "step": 12295 }, { "epoch": 32.475404423902276, "grad_norm": 3651.71875, "learning_rate": 0.00013739664061888728, "loss": 9.3062, "step": 12296 }, { "epoch": 32.478045559590626, "grad_norm": 2371.451171875, "learning_rate": 0.00013735934329398147, "loss": 11.8761, "step": 12297 }, { "epoch": 32.48068669527897, "grad_norm": 2325.49560546875, "learning_rate": 0.00013732204911449982, "loss": 14.1441, "step": 12298 }, { "epoch": 32.48332783096732, "grad_norm": 999.086181640625, "learning_rate": 0.00013728475808148365, "loss": 36.2124, "step": 12299 }, { "epoch": 32.48596896665566, "grad_norm": 843.1417236328125, "learning_rate": 0.0001372474701959745, "loss": 35.459, "step": 12300 }, { "epoch": 32.488610102344005, "grad_norm": 1582.79638671875, "learning_rate": 0.00013721018545901365, "loss": 34.6348, "step": 12301 }, { "epoch": 32.491251238032355, "grad_norm": 883.7091674804688, "learning_rate": 0.00013717290387164186, "loss": 34.7435, "step": 12302 }, { "epoch": 32.4938923737207, "grad_norm": 1791.168212890625, "learning_rate": 0.00013713562543490057, "loss": 33.7693, "step": 12303 }, { "epoch": 32.49653350940905, "grad_norm": 930.4835815429688, "learning_rate": 0.00013709835014983062, "loss": 36.0341, "step": 12304 }, { "epoch": 32.49917464509739, "grad_norm": 729.6206665039062, "learning_rate": 0.00013706107801747297, "loss": 35.6218, "step": 12305 }, { "epoch": 32.50181578078574, "grad_norm": 839.6556396484375, "learning_rate": 0.00013702380903886824, "loss": 36.0014, "step": 12306 }, { "epoch": 32.50445691647408, "grad_norm": 1795.4598388671875, "learning_rate": 0.00013698654321505737, "loss": 34.0113, "step": 12307 }, { "epoch": 32.507098052162426, "grad_norm": 1242.1368408203125, "learning_rate": 0.00013694928054708088, "loss": 34.8551, "step": 12308 }, { "epoch": 32.509739187850776, "grad_norm": 1706.5623779296875, "learning_rate": 0.00013691202103597937, "loss": 34.3651, "step": 12309 }, { "epoch": 32.51238032353912, "grad_norm": 2041.98583984375, "learning_rate": 0.0001368747646827932, "loss": 35.7989, "step": 12310 }, { "epoch": 32.51502145922747, "grad_norm": 1470.587158203125, "learning_rate": 0.0001368375114885628, "loss": 34.4596, "step": 12311 }, { "epoch": 32.51766259491581, "grad_norm": 588.7734985351562, "learning_rate": 0.0001368002614543283, "loss": 33.6253, "step": 12312 }, { "epoch": 32.52030373060416, "grad_norm": 763.4391479492188, "learning_rate": 0.00013676301458113012, "loss": 35.66, "step": 12313 }, { "epoch": 32.522944866292505, "grad_norm": 1805.8538818359375, "learning_rate": 0.0001367257708700082, "loss": 35.7814, "step": 12314 }, { "epoch": 32.525586001980855, "grad_norm": 7927.30615234375, "learning_rate": 0.00013668853032200264, "loss": 35.9451, "step": 12315 }, { "epoch": 32.5282271376692, "grad_norm": 962.6067504882812, "learning_rate": 0.00013665129293815315, "loss": 39.9647, "step": 12316 }, { "epoch": 32.53086827335754, "grad_norm": 974.9136962890625, "learning_rate": 0.00013661405871949987, "loss": 37.3059, "step": 12317 }, { "epoch": 32.53350940904589, "grad_norm": 749.5453491210938, "learning_rate": 0.00013657682766708245, "loss": 38.9091, "step": 12318 }, { "epoch": 32.536150544734234, "grad_norm": 1089.348876953125, "learning_rate": 0.00013653959978194034, "loss": 39.4398, "step": 12319 }, { "epoch": 32.538791680422584, "grad_norm": 512.1325073242188, "learning_rate": 0.00013650237506511331, "loss": 38.6637, "step": 12320 }, { "epoch": 32.54143281611093, "grad_norm": 403.884033203125, "learning_rate": 0.00013646515351764082, "loss": 41.0728, "step": 12321 }, { "epoch": 32.54407395179928, "grad_norm": 1287.6693115234375, "learning_rate": 0.0001364279351405622, "loss": 43.4984, "step": 12322 }, { "epoch": 32.54671508748762, "grad_norm": 413.3851013183594, "learning_rate": 0.0001363907199349167, "loss": 41.4996, "step": 12323 }, { "epoch": 32.54935622317596, "grad_norm": 1074.0374755859375, "learning_rate": 0.00013635350790174367, "loss": 41.2089, "step": 12324 }, { "epoch": 32.55199735886431, "grad_norm": 1467.5076904296875, "learning_rate": 0.0001363162990420822, "loss": 40.6966, "step": 12325 }, { "epoch": 32.554638494552655, "grad_norm": 1203.3673095703125, "learning_rate": 0.0001362790933569713, "loss": 40.1728, "step": 12326 }, { "epoch": 32.557279630241005, "grad_norm": 1078.9600830078125, "learning_rate": 0.0001362418908474499, "loss": 40.2366, "step": 12327 }, { "epoch": 32.55992076592935, "grad_norm": 1108.484619140625, "learning_rate": 0.0001362046915145569, "loss": 38.709, "step": 12328 }, { "epoch": 32.5625619016177, "grad_norm": 709.67333984375, "learning_rate": 0.00013616749535933097, "loss": 36.8362, "step": 12329 }, { "epoch": 32.56520303730604, "grad_norm": 1236.1229248046875, "learning_rate": 0.00013613030238281083, "loss": 37.3955, "step": 12330 }, { "epoch": 32.567844172994384, "grad_norm": 1008.8990478515625, "learning_rate": 0.0001360931125860352, "loss": 36.2625, "step": 12331 }, { "epoch": 32.570485308682734, "grad_norm": 1000.3048095703125, "learning_rate": 0.00013605592597004245, "loss": 36.7007, "step": 12332 }, { "epoch": 32.57312644437108, "grad_norm": 1145.187744140625, "learning_rate": 0.000136018742535871, "loss": 35.7798, "step": 12333 }, { "epoch": 32.57576758005943, "grad_norm": 606.2276611328125, "learning_rate": 0.00013598156228455927, "loss": 36.0449, "step": 12334 }, { "epoch": 32.57840871574777, "grad_norm": 980.8646240234375, "learning_rate": 0.0001359443852171454, "loss": 36.6593, "step": 12335 }, { "epoch": 32.58104985143612, "grad_norm": 969.781982421875, "learning_rate": 0.0001359072113346677, "loss": 35.8172, "step": 12336 }, { "epoch": 32.58369098712446, "grad_norm": 1553.9140625, "learning_rate": 0.00013587004063816393, "loss": 35.2816, "step": 12337 }, { "epoch": 32.58633212281281, "grad_norm": 2141.10888671875, "learning_rate": 0.00013583287312867232, "loss": 38.8864, "step": 12338 }, { "epoch": 32.588973258501156, "grad_norm": 6392.3408203125, "learning_rate": 0.00013579570880723063, "loss": 15.7743, "step": 12339 }, { "epoch": 32.5916143941895, "grad_norm": 1144.4432373046875, "learning_rate": 0.0001357585476748766, "loss": 11.324, "step": 12340 }, { "epoch": 32.59425552987785, "grad_norm": 460.38934326171875, "learning_rate": 0.00013572138973264814, "loss": 8.3584, "step": 12341 }, { "epoch": 32.59689666556619, "grad_norm": 1312.49462890625, "learning_rate": 0.00013568423498158273, "loss": 13.6827, "step": 12342 }, { "epoch": 32.59953780125454, "grad_norm": 1295.5196533203125, "learning_rate": 0.00013564708342271792, "loss": 10.756, "step": 12343 }, { "epoch": 32.602178936942884, "grad_norm": 3074.8583984375, "learning_rate": 0.0001356099350570911, "loss": 12.7428, "step": 12344 }, { "epoch": 32.604820072631234, "grad_norm": 4441.16455078125, "learning_rate": 0.00013557278988573967, "loss": 16.3659, "step": 12345 }, { "epoch": 32.60746120831958, "grad_norm": 960.37255859375, "learning_rate": 0.00013553564790970085, "loss": 9.4325, "step": 12346 }, { "epoch": 32.61010234400792, "grad_norm": 2087.077392578125, "learning_rate": 0.0001354985091300117, "loss": 8.7003, "step": 12347 }, { "epoch": 32.61274347969627, "grad_norm": 550.56201171875, "learning_rate": 0.00013546137354770958, "loss": 13.6001, "step": 12348 }, { "epoch": 32.61538461538461, "grad_norm": 1032.6474609375, "learning_rate": 0.0001354242411638313, "loss": 36.4671, "step": 12349 }, { "epoch": 32.61802575107296, "grad_norm": 3108.505126953125, "learning_rate": 0.00013538711197941372, "loss": 35.2902, "step": 12350 }, { "epoch": 32.620666886761306, "grad_norm": 1089.1468505859375, "learning_rate": 0.00013534998599549366, "loss": 32.9631, "step": 12351 }, { "epoch": 32.623308022449656, "grad_norm": 1282.22900390625, "learning_rate": 0.000135312863213108, "loss": 34.5744, "step": 12352 }, { "epoch": 32.625949158138, "grad_norm": 1117.8369140625, "learning_rate": 0.00013527574363329337, "loss": 34.2988, "step": 12353 }, { "epoch": 32.62859029382634, "grad_norm": 2264.52099609375, "learning_rate": 0.00013523862725708596, "loss": 34.0695, "step": 12354 }, { "epoch": 32.63123142951469, "grad_norm": 946.4656372070312, "learning_rate": 0.00013520151408552263, "loss": 34.888, "step": 12355 }, { "epoch": 32.633872565203035, "grad_norm": 4179.81298828125, "learning_rate": 0.00013516440411963954, "loss": 34.4273, "step": 12356 }, { "epoch": 32.636513700891385, "grad_norm": 932.9711303710938, "learning_rate": 0.00013512729736047302, "loss": 34.3194, "step": 12357 }, { "epoch": 32.63915483657973, "grad_norm": 863.3092041015625, "learning_rate": 0.00013509019380905912, "loss": 36.4969, "step": 12358 }, { "epoch": 32.64179597226808, "grad_norm": 1580.3157958984375, "learning_rate": 0.0001350530934664342, "loss": 36.6028, "step": 12359 }, { "epoch": 32.64443710795642, "grad_norm": 1042.51513671875, "learning_rate": 0.0001350159963336341, "loss": 33.732, "step": 12360 }, { "epoch": 32.64707824364477, "grad_norm": 1461.6904296875, "learning_rate": 0.00013497890241169475, "loss": 35.6847, "step": 12361 }, { "epoch": 32.64971937933311, "grad_norm": 1289.439697265625, "learning_rate": 0.00013494181170165199, "loss": 34.4204, "step": 12362 }, { "epoch": 32.652360515021456, "grad_norm": 1617.97509765625, "learning_rate": 0.00013490472420454153, "loss": 35.3875, "step": 12363 }, { "epoch": 32.655001650709806, "grad_norm": 3225.9072265625, "learning_rate": 0.00013486763992139904, "loss": 36.7829, "step": 12364 }, { "epoch": 32.65764278639815, "grad_norm": 1631.731689453125, "learning_rate": 0.00013483055885326, "loss": 37.9831, "step": 12365 }, { "epoch": 32.6602839220865, "grad_norm": 7283.41015625, "learning_rate": 0.00013479348100116004, "loss": 39.7859, "step": 12366 }, { "epoch": 32.66292505777484, "grad_norm": 475.59930419921875, "learning_rate": 0.00013475640636613446, "loss": 37.8218, "step": 12367 }, { "epoch": 32.66556619346319, "grad_norm": 487.9519958496094, "learning_rate": 0.0001347193349492184, "loss": 39.9753, "step": 12368 }, { "epoch": 32.668207329151535, "grad_norm": 483.7272033691406, "learning_rate": 0.0001346822667514473, "loss": 38.6062, "step": 12369 }, { "epoch": 32.67084846483988, "grad_norm": 455.85516357421875, "learning_rate": 0.0001346452017738562, "loss": 41.3562, "step": 12370 }, { "epoch": 32.67348960052823, "grad_norm": 761.077880859375, "learning_rate": 0.00013460814001748012, "loss": 44.4279, "step": 12371 }, { "epoch": 32.67613073621657, "grad_norm": 1123.0579833984375, "learning_rate": 0.00013457108148335378, "loss": 41.2482, "step": 12372 }, { "epoch": 32.67877187190492, "grad_norm": 542.0755004882812, "learning_rate": 0.00013453402617251225, "loss": 40.4357, "step": 12373 }, { "epoch": 32.681413007593264, "grad_norm": 658.92578125, "learning_rate": 0.00013449697408599026, "loss": 40.0071, "step": 12374 }, { "epoch": 32.684054143281614, "grad_norm": 558.0750122070312, "learning_rate": 0.00013445992522482232, "loss": 39.0476, "step": 12375 }, { "epoch": 32.68669527896996, "grad_norm": 656.2588500976562, "learning_rate": 0.00013442287959004312, "loss": 38.1714, "step": 12376 }, { "epoch": 32.6893364146583, "grad_norm": 366.65277099609375, "learning_rate": 0.00013438583718268718, "loss": 39.1015, "step": 12377 }, { "epoch": 32.69197755034665, "grad_norm": 523.100341796875, "learning_rate": 0.0001343487980037888, "loss": 38.7468, "step": 12378 }, { "epoch": 32.69461868603499, "grad_norm": 453.4934387207031, "learning_rate": 0.00013431176205438232, "loss": 37.5356, "step": 12379 }, { "epoch": 32.69725982172334, "grad_norm": 849.6903686523438, "learning_rate": 0.00013427472933550189, "loss": 36.0446, "step": 12380 }, { "epoch": 32.699900957411685, "grad_norm": 1556.5902099609375, "learning_rate": 0.00013423769984818168, "loss": 36.0198, "step": 12381 }, { "epoch": 32.702542093100035, "grad_norm": 1293.01220703125, "learning_rate": 0.00013420067359345563, "loss": 34.4207, "step": 12382 }, { "epoch": 32.70518322878838, "grad_norm": 1762.0955810546875, "learning_rate": 0.0001341636505723578, "loss": 36.8104, "step": 12383 }, { "epoch": 32.70782436447673, "grad_norm": 704.4073486328125, "learning_rate": 0.00013412663078592199, "loss": 34.3815, "step": 12384 }, { "epoch": 32.71046550016507, "grad_norm": 717.3561401367188, "learning_rate": 0.00013408961423518195, "loss": 34.6595, "step": 12385 }, { "epoch": 32.713106635853414, "grad_norm": 416.2254943847656, "learning_rate": 0.00013405260092117129, "loss": 34.185, "step": 12386 }, { "epoch": 32.715747771541764, "grad_norm": 1227.5074462890625, "learning_rate": 0.00013401559084492368, "loss": 35.586, "step": 12387 }, { "epoch": 32.71838890723011, "grad_norm": 41075.0234375, "learning_rate": 0.00013397858400747268, "loss": 40.6402, "step": 12388 }, { "epoch": 32.72103004291846, "grad_norm": 2620.78857421875, "learning_rate": 0.00013394158040985136, "loss": 20.4863, "step": 12389 }, { "epoch": 32.7236711786068, "grad_norm": 108087.203125, "learning_rate": 0.00013390458005309332, "loss": 15.1193, "step": 12390 }, { "epoch": 32.72631231429515, "grad_norm": 2482.447998046875, "learning_rate": 0.0001338675829382317, "loss": 17.146, "step": 12391 }, { "epoch": 32.72895344998349, "grad_norm": 3882.771484375, "learning_rate": 0.00013383058906629963, "loss": 14.0407, "step": 12392 }, { "epoch": 32.731594585671836, "grad_norm": 689.7031860351562, "learning_rate": 0.00013379359843833, "loss": 11.1173, "step": 12393 }, { "epoch": 32.734235721360186, "grad_norm": 205872.046875, "learning_rate": 0.00013375661105535596, "loss": 15.5263, "step": 12394 }, { "epoch": 32.73687685704853, "grad_norm": 1075.0787353515625, "learning_rate": 0.00013371962691841028, "loss": 14.1817, "step": 12395 }, { "epoch": 32.73951799273688, "grad_norm": 1094.6873779296875, "learning_rate": 0.0001336826460285257, "loss": 11.5053, "step": 12396 }, { "epoch": 32.74215912842522, "grad_norm": 1684.4403076171875, "learning_rate": 0.00013364566838673493, "loss": 13.6849, "step": 12397 }, { "epoch": 32.74480026411357, "grad_norm": 1127.140869140625, "learning_rate": 0.0001336086939940705, "loss": 16.899, "step": 12398 }, { "epoch": 32.747441399801914, "grad_norm": 1307.025634765625, "learning_rate": 0.0001335717228515649, "loss": 34.1943, "step": 12399 }, { "epoch": 32.75008253549026, "grad_norm": 843.2037963867188, "learning_rate": 0.00013353475496025047, "loss": 34.3571, "step": 12400 }, { "epoch": 32.75008253549026, "eval_loss": 3.87363862991333, "eval_runtime": 2.2187, "eval_samples_per_second": 223.103, "eval_steps_per_second": 27.944, "step": 12400 }, { "epoch": 32.75272367117861, "grad_norm": 731.2911376953125, "learning_rate": 0.0001334977903211597, "loss": 35.4375, "step": 12401 }, { "epoch": 32.75536480686695, "grad_norm": 1276.3758544921875, "learning_rate": 0.0001334608289353247, "loss": 36.0448, "step": 12402 }, { "epoch": 32.7580059425553, "grad_norm": 1871.25537109375, "learning_rate": 0.00013342387080377747, "loss": 34.8421, "step": 12403 }, { "epoch": 32.76064707824364, "grad_norm": 2668.091064453125, "learning_rate": 0.00013338691592755026, "loss": 34.5541, "step": 12404 }, { "epoch": 32.76328821393199, "grad_norm": 607.4151611328125, "learning_rate": 0.00013334996430767494, "loss": 35.0188, "step": 12405 }, { "epoch": 32.765929349620336, "grad_norm": 722.1642456054688, "learning_rate": 0.00013331301594518344, "loss": 35.2092, "step": 12406 }, { "epoch": 32.768570485308686, "grad_norm": 811.744873046875, "learning_rate": 0.00013327607084110722, "loss": 35.3687, "step": 12407 }, { "epoch": 32.77121162099703, "grad_norm": 2871.463623046875, "learning_rate": 0.00013323912899647827, "loss": 34.3011, "step": 12408 }, { "epoch": 32.77385275668537, "grad_norm": 622.4263305664062, "learning_rate": 0.00013320219041232804, "loss": 33.8498, "step": 12409 }, { "epoch": 32.77649389237372, "grad_norm": 861.0094604492188, "learning_rate": 0.0001331652550896879, "loss": 34.6596, "step": 12410 }, { "epoch": 32.779135028062065, "grad_norm": 626.8021240234375, "learning_rate": 0.00013312832302958956, "loss": 35.11, "step": 12411 }, { "epoch": 32.781776163750415, "grad_norm": 869.5277709960938, "learning_rate": 0.00013309139423306408, "loss": 34.034, "step": 12412 }, { "epoch": 32.78441729943876, "grad_norm": 707.6112670898438, "learning_rate": 0.00013305446870114275, "loss": 33.6595, "step": 12413 }, { "epoch": 32.78705843512711, "grad_norm": 1443.0302734375, "learning_rate": 0.0001330175464348567, "loss": 35.9931, "step": 12414 }, { "epoch": 32.78969957081545, "grad_norm": 1109.763916015625, "learning_rate": 0.00013298062743523692, "loss": 36.9581, "step": 12415 }, { "epoch": 32.79234070650379, "grad_norm": 3124.88916015625, "learning_rate": 0.00013294371170331442, "loss": 41.5276, "step": 12416 }, { "epoch": 32.79498184219214, "grad_norm": 765.0674438476562, "learning_rate": 0.00013290679924011988, "loss": 37.5109, "step": 12417 }, { "epoch": 32.797622977880486, "grad_norm": 461.2922058105469, "learning_rate": 0.00013286989004668428, "loss": 39.1763, "step": 12418 }, { "epoch": 32.800264113568836, "grad_norm": 715.3441162109375, "learning_rate": 0.00013283298412403822, "loss": 40.0289, "step": 12419 }, { "epoch": 32.80290524925718, "grad_norm": 812.7244873046875, "learning_rate": 0.00013279608147321223, "loss": 41.4558, "step": 12420 }, { "epoch": 32.80554638494553, "grad_norm": 2041.2783203125, "learning_rate": 0.00013275918209523674, "loss": 40.6891, "step": 12421 }, { "epoch": 32.80818752063387, "grad_norm": 2187.2734375, "learning_rate": 0.00013272228599114227, "loss": 39.6126, "step": 12422 }, { "epoch": 32.810828656322215, "grad_norm": 686.5364990234375, "learning_rate": 0.00013268539316195921, "loss": 42.6439, "step": 12423 }, { "epoch": 32.813469792010565, "grad_norm": 679.4469604492188, "learning_rate": 0.00013264850360871744, "loss": 38.6393, "step": 12424 }, { "epoch": 32.81611092769891, "grad_norm": 541.1776123046875, "learning_rate": 0.00013261161733244738, "loss": 38.8312, "step": 12425 }, { "epoch": 32.81875206338726, "grad_norm": 848.3509521484375, "learning_rate": 0.00013257473433417888, "loss": 41.1851, "step": 12426 }, { "epoch": 32.8213931990756, "grad_norm": 651.4778442382812, "learning_rate": 0.000132537854614942, "loss": 37.1908, "step": 12427 }, { "epoch": 32.82403433476395, "grad_norm": 329.0972595214844, "learning_rate": 0.00013250097817576643, "loss": 37.5043, "step": 12428 }, { "epoch": 32.826675470452294, "grad_norm": 586.905029296875, "learning_rate": 0.00013246410501768208, "loss": 37.3059, "step": 12429 }, { "epoch": 32.829316606140644, "grad_norm": 570.4904174804688, "learning_rate": 0.0001324272351417186, "loss": 35.9133, "step": 12430 }, { "epoch": 32.83195774182899, "grad_norm": 1608.7674560546875, "learning_rate": 0.00013239036854890548, "loss": 36.01, "step": 12431 }, { "epoch": 32.83459887751733, "grad_norm": 706.0025634765625, "learning_rate": 0.00013235350524027227, "loss": 35.8526, "step": 12432 }, { "epoch": 32.83724001320568, "grad_norm": 508.4577941894531, "learning_rate": 0.00013231664521684826, "loss": 35.0794, "step": 12433 }, { "epoch": 32.83988114889402, "grad_norm": 1161.4794921875, "learning_rate": 0.00013227978847966284, "loss": 35.8821, "step": 12434 }, { "epoch": 32.84252228458237, "grad_norm": 369.12689208984375, "learning_rate": 0.0001322429350297451, "loss": 35.3621, "step": 12435 }, { "epoch": 32.845163420270715, "grad_norm": 588.9650268554688, "learning_rate": 0.00013220608486812428, "loss": 35.4892, "step": 12436 }, { "epoch": 32.847804555959065, "grad_norm": 443.9806823730469, "learning_rate": 0.00013216923799582936, "loss": 35.2099, "step": 12437 }, { "epoch": 32.85044569164741, "grad_norm": 1290.937744140625, "learning_rate": 0.00013213239441388914, "loss": 35.813, "step": 12438 }, { "epoch": 32.85308682733575, "grad_norm": 5478.00830078125, "learning_rate": 0.00013209555412333268, "loss": 17.538, "step": 12439 }, { "epoch": 32.8557279630241, "grad_norm": 6533.76806640625, "learning_rate": 0.00013205871712518858, "loss": 13.7243, "step": 12440 }, { "epoch": 32.858369098712444, "grad_norm": 3317.778076171875, "learning_rate": 0.00013202188342048554, "loss": 14.6249, "step": 12441 }, { "epoch": 32.861010234400794, "grad_norm": 3940.191650390625, "learning_rate": 0.00013198505301025213, "loss": 12.7761, "step": 12442 }, { "epoch": 32.86365137008914, "grad_norm": 750.7222900390625, "learning_rate": 0.00013194822589551676, "loss": 14.9798, "step": 12443 }, { "epoch": 32.86629250577749, "grad_norm": 872.3194580078125, "learning_rate": 0.00013191140207730783, "loss": 12.2346, "step": 12444 }, { "epoch": 32.86893364146583, "grad_norm": 559.5982666015625, "learning_rate": 0.00013187458155665356, "loss": 12.8758, "step": 12445 }, { "epoch": 32.87157477715417, "grad_norm": 3748.58740234375, "learning_rate": 0.00013183776433458227, "loss": 12.2989, "step": 12446 }, { "epoch": 32.87421591284252, "grad_norm": 113178.4296875, "learning_rate": 0.00013180095041212203, "loss": 10.0465, "step": 12447 }, { "epoch": 32.876857048530866, "grad_norm": 528.782958984375, "learning_rate": 0.0001317641397903008, "loss": 21.2214, "step": 12448 }, { "epoch": 32.879498184219216, "grad_norm": 1394.28662109375, "learning_rate": 0.00013172733247014651, "loss": 35.3545, "step": 12449 }, { "epoch": 32.88213931990756, "grad_norm": 578.5130615234375, "learning_rate": 0.000131690528452687, "loss": 34.6488, "step": 12450 }, { "epoch": 32.88478045559591, "grad_norm": 812.9246826171875, "learning_rate": 0.00013165372773894996, "loss": 34.241, "step": 12451 }, { "epoch": 32.88742159128425, "grad_norm": 537.5830078125, "learning_rate": 0.00013161693032996298, "loss": 37.6219, "step": 12452 }, { "epoch": 32.8900627269726, "grad_norm": 571.6407470703125, "learning_rate": 0.00013158013622675374, "loss": 35.8283, "step": 12453 }, { "epoch": 32.892703862660944, "grad_norm": 704.56298828125, "learning_rate": 0.00013154334543034969, "loss": 33.6473, "step": 12454 }, { "epoch": 32.89534499834929, "grad_norm": 701.3800048828125, "learning_rate": 0.000131506557941778, "loss": 35.7153, "step": 12455 }, { "epoch": 32.89798613403764, "grad_norm": 653.7135620117188, "learning_rate": 0.00013146977376206614, "loss": 36.0509, "step": 12456 }, { "epoch": 32.90062726972598, "grad_norm": 424.5854797363281, "learning_rate": 0.00013143299289224126, "loss": 33.8827, "step": 12457 }, { "epoch": 32.90326840541433, "grad_norm": 513.5452880859375, "learning_rate": 0.00013139621533333053, "loss": 34.2038, "step": 12458 }, { "epoch": 32.90590954110267, "grad_norm": 2972.1533203125, "learning_rate": 0.00013135944108636055, "loss": 35.1703, "step": 12459 }, { "epoch": 32.90855067679102, "grad_norm": 531.0247192382812, "learning_rate": 0.00013132267015235861, "loss": 34.9146, "step": 12460 }, { "epoch": 32.911191812479366, "grad_norm": 1166.2154541015625, "learning_rate": 0.0001312859025323514, "loss": 34.3869, "step": 12461 }, { "epoch": 32.91383294816771, "grad_norm": 832.1980590820312, "learning_rate": 0.0001312491382273655, "loss": 35.9603, "step": 12462 }, { "epoch": 32.91647408385606, "grad_norm": 613.184814453125, "learning_rate": 0.0001312123772384278, "loss": 35.1669, "step": 12463 }, { "epoch": 32.9191152195444, "grad_norm": 1499.9835205078125, "learning_rate": 0.00013117561956656464, "loss": 36.8224, "step": 12464 }, { "epoch": 32.92175635523275, "grad_norm": 4311.974609375, "learning_rate": 0.00013113886521280254, "loss": 36.8748, "step": 12465 }, { "epoch": 32.924397490921095, "grad_norm": 1782.4371337890625, "learning_rate": 0.0001311021141781678, "loss": 40.106, "step": 12466 }, { "epoch": 32.927038626609445, "grad_norm": 1130.078369140625, "learning_rate": 0.00013106536646368666, "loss": 40.5986, "step": 12467 }, { "epoch": 32.92967976229779, "grad_norm": 1012.3093872070312, "learning_rate": 0.00013102862207038532, "loss": 39.1147, "step": 12468 }, { "epoch": 32.93232089798613, "grad_norm": 629.9141845703125, "learning_rate": 0.00013099188099928972, "loss": 41.4667, "step": 12469 }, { "epoch": 32.93496203367448, "grad_norm": 368.373779296875, "learning_rate": 0.000130955143251426, "loss": 44.3549, "step": 12470 }, { "epoch": 32.93760316936282, "grad_norm": 773.7748413085938, "learning_rate": 0.00013091840882782, "loss": 40.5779, "step": 12471 }, { "epoch": 32.94024430505117, "grad_norm": 745.58544921875, "learning_rate": 0.0001308816777294975, "loss": 38.1422, "step": 12472 }, { "epoch": 32.942885440739516, "grad_norm": 531.7463989257812, "learning_rate": 0.00013084494995748404, "loss": 38.1608, "step": 12473 }, { "epoch": 32.945526576427866, "grad_norm": 1472.5306396484375, "learning_rate": 0.00013080822551280548, "loss": 36.9121, "step": 12474 }, { "epoch": 32.94816771211621, "grad_norm": 417.74578857421875, "learning_rate": 0.00013077150439648715, "loss": 35.947, "step": 12475 }, { "epoch": 32.95080884780456, "grad_norm": 274.5064392089844, "learning_rate": 0.00013073478660955456, "loss": 35.4097, "step": 12476 }, { "epoch": 32.9534499834929, "grad_norm": 1314.189453125, "learning_rate": 0.00013069807215303303, "loss": 34.4628, "step": 12477 }, { "epoch": 32.956091119181245, "grad_norm": 684.2517700195312, "learning_rate": 0.0001306613610279477, "loss": 36.7476, "step": 12478 }, { "epoch": 32.958732254869595, "grad_norm": 5849.66552734375, "learning_rate": 0.0001306246532353238, "loss": 33.7476, "step": 12479 }, { "epoch": 32.96137339055794, "grad_norm": 968.3806762695312, "learning_rate": 0.0001305879487761862, "loss": 11.6265, "step": 12480 }, { "epoch": 32.96401452624629, "grad_norm": 413936.71875, "learning_rate": 0.0001305512476515601, "loss": 16.1853, "step": 12481 }, { "epoch": 32.96665566193463, "grad_norm": 2596.104248046875, "learning_rate": 0.00013051454986247026, "loss": 11.7805, "step": 12482 }, { "epoch": 32.96929679762298, "grad_norm": 895.6355590820312, "learning_rate": 0.00013047785540994138, "loss": 13.6279, "step": 12483 }, { "epoch": 32.971937933311324, "grad_norm": 1330.6397705078125, "learning_rate": 0.00013044116429499823, "loss": 12.1446, "step": 12484 }, { "epoch": 32.97457906899967, "grad_norm": 910.7513427734375, "learning_rate": 0.00013040447651866527, "loss": 34.2605, "step": 12485 }, { "epoch": 32.97722020468802, "grad_norm": 549.4996337890625, "learning_rate": 0.00013036779208196712, "loss": 35.0069, "step": 12486 }, { "epoch": 32.97986134037636, "grad_norm": 539.7322387695312, "learning_rate": 0.00013033111098592796, "loss": 33.7959, "step": 12487 }, { "epoch": 32.98250247606471, "grad_norm": 702.833740234375, "learning_rate": 0.00013029443323157234, "loss": 35.7348, "step": 12488 }, { "epoch": 32.98514361175305, "grad_norm": 394.5403747558594, "learning_rate": 0.00013025775881992435, "loss": 34.3009, "step": 12489 }, { "epoch": 32.9877847474414, "grad_norm": 431.00286865234375, "learning_rate": 0.000130221087752008, "loss": 36.3594, "step": 12490 }, { "epoch": 32.990425883129745, "grad_norm": 2572.179443359375, "learning_rate": 0.00013018442002884752, "loss": 35.5443, "step": 12491 }, { "epoch": 32.99306701881809, "grad_norm": 556.91943359375, "learning_rate": 0.0001301477556514667, "loss": 35.8845, "step": 12492 }, { "epoch": 32.99570815450644, "grad_norm": 412.226318359375, "learning_rate": 0.00013011109462088955, "loss": 34.5847, "step": 12493 }, { "epoch": 32.99834929019478, "grad_norm": 1945.5592041015625, "learning_rate": 0.0001300744369381394, "loss": 36.3154, "step": 12494 }, { "epoch": 33.00099042588313, "grad_norm": 916.5772094726562, "learning_rate": 0.00013003778260424026, "loss": 41.9328, "step": 12495 }, { "epoch": 33.003631561571474, "grad_norm": 712.2174072265625, "learning_rate": 0.0001300011316202156, "loss": 39.7887, "step": 12496 }, { "epoch": 33.006272697259824, "grad_norm": 516.581787109375, "learning_rate": 0.00012996448398708876, "loss": 38.705, "step": 12497 }, { "epoch": 33.00891383294817, "grad_norm": 507.38970947265625, "learning_rate": 0.00012992783970588325, "loss": 38.1411, "step": 12498 }, { "epoch": 33.01155496863652, "grad_norm": 951.3145751953125, "learning_rate": 0.0001298911987776223, "loss": 40.8685, "step": 12499 }, { "epoch": 33.01419610432486, "grad_norm": 1624.6802978515625, "learning_rate": 0.00012985456120332905, "loss": 42.0416, "step": 12500 }, { "epoch": 33.0168372400132, "grad_norm": 811.9551391601562, "learning_rate": 0.00012981792698402663, "loss": 41.8892, "step": 12501 }, { "epoch": 33.01947837570155, "grad_norm": 748.1991577148438, "learning_rate": 0.00012978129612073802, "loss": 40.5552, "step": 12502 }, { "epoch": 33.022119511389896, "grad_norm": 485.20135498046875, "learning_rate": 0.00012974466861448607, "loss": 40.9417, "step": 12503 }, { "epoch": 33.024760647078246, "grad_norm": 724.7617797851562, "learning_rate": 0.00012970804446629347, "loss": 40.4514, "step": 12504 }, { "epoch": 33.02740178276659, "grad_norm": 799.8744506835938, "learning_rate": 0.00012967142367718322, "loss": 38.4122, "step": 12505 }, { "epoch": 33.03004291845494, "grad_norm": 556.97314453125, "learning_rate": 0.00012963480624817776, "loss": 35.5152, "step": 12506 }, { "epoch": 33.03268405414328, "grad_norm": 287.17510986328125, "learning_rate": 0.00012959819218029962, "loss": 37.53, "step": 12507 }, { "epoch": 33.035325189831624, "grad_norm": 532.4328002929688, "learning_rate": 0.00012956158147457115, "loss": 37.9521, "step": 12508 }, { "epoch": 33.037966325519974, "grad_norm": 633.3938598632812, "learning_rate": 0.00012952497413201485, "loss": 35.6082, "step": 12509 }, { "epoch": 33.04060746120832, "grad_norm": 588.4879760742188, "learning_rate": 0.00012948837015365288, "loss": 34.9926, "step": 12510 }, { "epoch": 33.04324859689667, "grad_norm": 673.113037109375, "learning_rate": 0.00012945176954050743, "loss": 36.2131, "step": 12511 }, { "epoch": 33.04588973258501, "grad_norm": 406.46685791015625, "learning_rate": 0.00012941517229360041, "loss": 34.9778, "step": 12512 }, { "epoch": 33.04853086827336, "grad_norm": 355.33709716796875, "learning_rate": 0.00012937857841395395, "loss": 34.6465, "step": 12513 }, { "epoch": 33.0511720039617, "grad_norm": 530.65283203125, "learning_rate": 0.00012934198790258978, "loss": 35.5293, "step": 12514 }, { "epoch": 33.053813139650046, "grad_norm": 472.10235595703125, "learning_rate": 0.00012930540076052966, "loss": 35.2074, "step": 12515 }, { "epoch": 33.056454275338396, "grad_norm": 1545.305419921875, "learning_rate": 0.0001292688169887954, "loss": 37.0413, "step": 12516 }, { "epoch": 33.05909541102674, "grad_norm": 2560.1806640625, "learning_rate": 0.0001292322365884085, "loss": 35.397, "step": 12517 }, { "epoch": 33.06173654671509, "grad_norm": 2474.14599609375, "learning_rate": 0.00012919565956039046, "loss": 15.01, "step": 12518 }, { "epoch": 33.06437768240343, "grad_norm": 3797.0693359375, "learning_rate": 0.00012915908590576265, "loss": 17.4973, "step": 12519 }, { "epoch": 33.06701881809178, "grad_norm": 713.8702392578125, "learning_rate": 0.0001291225156255464, "loss": 12.1891, "step": 12520 }, { "epoch": 33.069659953780125, "grad_norm": 1268.6778564453125, "learning_rate": 0.00012908594872076284, "loss": 16.8122, "step": 12521 }, { "epoch": 33.072301089468475, "grad_norm": 12204.595703125, "learning_rate": 0.00012904938519243308, "loss": 13.9372, "step": 12522 }, { "epoch": 33.07494222515682, "grad_norm": 1000.23095703125, "learning_rate": 0.0001290128250415783, "loss": 12.71, "step": 12523 }, { "epoch": 33.07758336084516, "grad_norm": 1184.017333984375, "learning_rate": 0.0001289762682692193, "loss": 14.6238, "step": 12524 }, { "epoch": 33.08022449653351, "grad_norm": 5916.017578125, "learning_rate": 0.00012893971487637682, "loss": 13.6626, "step": 12525 }, { "epoch": 33.08286563222185, "grad_norm": 1437.39697265625, "learning_rate": 0.00012890316486407179, "loss": 12.0546, "step": 12526 }, { "epoch": 33.0855067679102, "grad_norm": 610.7853393554688, "learning_rate": 0.00012886661823332472, "loss": 29.5925, "step": 12527 }, { "epoch": 33.088147903598546, "grad_norm": 384.1267395019531, "learning_rate": 0.0001288300749851563, "loss": 34.6204, "step": 12528 }, { "epoch": 33.090789039286896, "grad_norm": 535.7877807617188, "learning_rate": 0.0001287935351205866, "loss": 34.7729, "step": 12529 }, { "epoch": 33.09343017497524, "grad_norm": 1107.8895263671875, "learning_rate": 0.00012875699864063638, "loss": 35.8479, "step": 12530 }, { "epoch": 33.09607131066358, "grad_norm": 548.4794311523438, "learning_rate": 0.00012872046554632577, "loss": 34.0765, "step": 12531 }, { "epoch": 33.09871244635193, "grad_norm": 380.48626708984375, "learning_rate": 0.0001286839358386748, "loss": 35.7275, "step": 12532 }, { "epoch": 33.101353582040275, "grad_norm": 637.4548950195312, "learning_rate": 0.00012864740951870374, "loss": 35.068, "step": 12533 }, { "epoch": 33.103994717728625, "grad_norm": 624.420654296875, "learning_rate": 0.00012861088658743251, "loss": 34.9797, "step": 12534 }, { "epoch": 33.10663585341697, "grad_norm": 779.6350708007812, "learning_rate": 0.00012857436704588095, "loss": 34.6054, "step": 12535 }, { "epoch": 33.10927698910532, "grad_norm": 659.0732421875, "learning_rate": 0.00012853785089506886, "loss": 35.4051, "step": 12536 }, { "epoch": 33.11191812479366, "grad_norm": 630.1134643554688, "learning_rate": 0.00012850133813601595, "loss": 35.1701, "step": 12537 }, { "epoch": 33.114559260482004, "grad_norm": 413.4668273925781, "learning_rate": 0.00012846482876974175, "loss": 34.8117, "step": 12538 }, { "epoch": 33.117200396170354, "grad_norm": 985.0581665039062, "learning_rate": 0.00012842832279726573, "loss": 35.3146, "step": 12539 }, { "epoch": 33.1198415318587, "grad_norm": 663.3121337890625, "learning_rate": 0.0001283918202196075, "loss": 35.7258, "step": 12540 }, { "epoch": 33.12248266754705, "grad_norm": 803.52978515625, "learning_rate": 0.00012835532103778628, "loss": 34.449, "step": 12541 }, { "epoch": 33.12512380323539, "grad_norm": 564.6267700195312, "learning_rate": 0.00012831882525282123, "loss": 35.2389, "step": 12542 }, { "epoch": 33.12776493892374, "grad_norm": 1415.555908203125, "learning_rate": 0.00012828233286573141, "loss": 35.8476, "step": 12543 }, { "epoch": 33.13040607461208, "grad_norm": 479.19012451171875, "learning_rate": 0.00012824584387753605, "loss": 39.2998, "step": 12544 }, { "epoch": 33.13304721030043, "grad_norm": 878.2568969726562, "learning_rate": 0.00012820935828925396, "loss": 38.9352, "step": 12545 }, { "epoch": 33.135688345988775, "grad_norm": 462.0787048339844, "learning_rate": 0.00012817287610190401, "loss": 37.6752, "step": 12546 }, { "epoch": 33.13832948167712, "grad_norm": 749.777099609375, "learning_rate": 0.0001281363973165049, "loss": 39.484, "step": 12547 }, { "epoch": 33.14097061736547, "grad_norm": 560.2760009765625, "learning_rate": 0.00012809992193407534, "loss": 39.9526, "step": 12548 }, { "epoch": 33.14361175305381, "grad_norm": 2091.071533203125, "learning_rate": 0.00012806344995563385, "loss": 40.8928, "step": 12549 }, { "epoch": 33.14625288874216, "grad_norm": 439.20208740234375, "learning_rate": 0.0001280269813821988, "loss": 43.069, "step": 12550 }, { "epoch": 33.148894024430504, "grad_norm": 588.9216918945312, "learning_rate": 0.00012799051621478874, "loss": 43.2923, "step": 12551 }, { "epoch": 33.151535160118854, "grad_norm": 613.0855102539062, "learning_rate": 0.00012795405445442182, "loss": 41.8888, "step": 12552 }, { "epoch": 33.1541762958072, "grad_norm": 1103.1319580078125, "learning_rate": 0.00012791759610211625, "loss": 38.7456, "step": 12553 }, { "epoch": 33.15681743149554, "grad_norm": 440.5768127441406, "learning_rate": 0.0001278811411588901, "loss": 39.1968, "step": 12554 }, { "epoch": 33.15945856718389, "grad_norm": 411.4342346191406, "learning_rate": 0.00012784468962576134, "loss": 39.8198, "step": 12555 }, { "epoch": 33.16209970287223, "grad_norm": 501.9747009277344, "learning_rate": 0.00012780824150374786, "loss": 37.4577, "step": 12556 }, { "epoch": 33.16474083856058, "grad_norm": 545.70068359375, "learning_rate": 0.00012777179679386734, "loss": 37.271, "step": 12557 }, { "epoch": 33.167381974248926, "grad_norm": 400.4726257324219, "learning_rate": 0.00012773535549713772, "loss": 34.4247, "step": 12558 }, { "epoch": 33.170023109937276, "grad_norm": 984.2301635742188, "learning_rate": 0.00012769891761457648, "loss": 35.5562, "step": 12559 }, { "epoch": 33.17266424562562, "grad_norm": 380.841064453125, "learning_rate": 0.00012766248314720102, "loss": 36.1189, "step": 12560 }, { "epoch": 33.17530538131396, "grad_norm": 628.6178588867188, "learning_rate": 0.00012762605209602895, "loss": 35.0831, "step": 12561 }, { "epoch": 33.17794651700231, "grad_norm": 1171.57861328125, "learning_rate": 0.00012758962446207748, "loss": 35.9148, "step": 12562 }, { "epoch": 33.180587652690654, "grad_norm": 862.7238159179688, "learning_rate": 0.00012755320024636393, "loss": 34.8329, "step": 12563 }, { "epoch": 33.183228788379004, "grad_norm": 406.80328369140625, "learning_rate": 0.00012751677944990514, "loss": 34.0999, "step": 12564 }, { "epoch": 33.18586992406735, "grad_norm": 622.7584838867188, "learning_rate": 0.00012748036207371844, "loss": 36.2167, "step": 12565 }, { "epoch": 33.1885110597557, "grad_norm": 1103.73193359375, "learning_rate": 0.00012744394811882066, "loss": 35.36, "step": 12566 }, { "epoch": 33.19115219544404, "grad_norm": 1199.6114501953125, "learning_rate": 0.00012740753758622853, "loss": 39.1592, "step": 12567 }, { "epoch": 33.19379333113239, "grad_norm": 2839.214111328125, "learning_rate": 0.00012737113047695903, "loss": 11.8652, "step": 12568 }, { "epoch": 33.19643446682073, "grad_norm": 2157.27392578125, "learning_rate": 0.00012733472679202863, "loss": 12.1998, "step": 12569 }, { "epoch": 33.199075602509076, "grad_norm": 1964.6710205078125, "learning_rate": 0.00012729832653245393, "loss": 16.8788, "step": 12570 }, { "epoch": 33.201716738197426, "grad_norm": 2195.901123046875, "learning_rate": 0.0001272619296992514, "loss": 17.9201, "step": 12571 }, { "epoch": 33.20435787388577, "grad_norm": 720.572998046875, "learning_rate": 0.00012722553629343738, "loss": 12.2885, "step": 12572 }, { "epoch": 33.20699900957412, "grad_norm": 1878.1011962890625, "learning_rate": 0.00012718914631602813, "loss": 10.6917, "step": 12573 }, { "epoch": 33.20964014526246, "grad_norm": 431.1042175292969, "learning_rate": 0.00012715275976803975, "loss": 19.0973, "step": 12574 }, { "epoch": 33.21228128095081, "grad_norm": 1342.0302734375, "learning_rate": 0.00012711637665048848, "loss": 14.171, "step": 12575 }, { "epoch": 33.214922416639155, "grad_norm": 1337.6246337890625, "learning_rate": 0.00012707999696439021, "loss": 17.5714, "step": 12576 }, { "epoch": 33.2175635523275, "grad_norm": 780.9798583984375, "learning_rate": 0.0001270436207107607, "loss": 35.7388, "step": 12577 }, { "epoch": 33.22020468801585, "grad_norm": 716.7817993164062, "learning_rate": 0.00012700724789061596, "loss": 35.091, "step": 12578 }, { "epoch": 33.22284582370419, "grad_norm": 706.7227783203125, "learning_rate": 0.00012697087850497158, "loss": 35.3884, "step": 12579 }, { "epoch": 33.22548695939254, "grad_norm": 413.4629211425781, "learning_rate": 0.00012693451255484312, "loss": 34.4639, "step": 12580 }, { "epoch": 33.22812809508088, "grad_norm": 641.2777099609375, "learning_rate": 0.0001268981500412461, "loss": 34.451, "step": 12581 }, { "epoch": 33.23076923076923, "grad_norm": 959.3433227539062, "learning_rate": 0.00012686179096519596, "loss": 34.4382, "step": 12582 }, { "epoch": 33.233410366457576, "grad_norm": 350.9865417480469, "learning_rate": 0.00012682543532770797, "loss": 35.6711, "step": 12583 }, { "epoch": 33.23605150214592, "grad_norm": 293.02801513671875, "learning_rate": 0.00012678908312979722, "loss": 34.1825, "step": 12584 }, { "epoch": 33.23869263783427, "grad_norm": 338.8668518066406, "learning_rate": 0.00012675273437247908, "loss": 34.3504, "step": 12585 }, { "epoch": 33.24133377352261, "grad_norm": 604.3727416992188, "learning_rate": 0.0001267163890567684, "loss": 36.3072, "step": 12586 }, { "epoch": 33.24397490921096, "grad_norm": 426.1373291015625, "learning_rate": 0.00012668004718368014, "loss": 34.6874, "step": 12587 }, { "epoch": 33.246616044899305, "grad_norm": 660.76953125, "learning_rate": 0.00012664370875422915, "loss": 34.1283, "step": 12588 }, { "epoch": 33.249257180587655, "grad_norm": 3146.161865234375, "learning_rate": 0.0001266073737694301, "loss": 35.2776, "step": 12589 }, { "epoch": 33.251898316276, "grad_norm": 394.1177978515625, "learning_rate": 0.00012657104223029765, "loss": 34.2248, "step": 12590 }, { "epoch": 33.25453945196435, "grad_norm": 322.3332824707031, "learning_rate": 0.00012653471413784622, "loss": 33.5458, "step": 12591 }, { "epoch": 33.25718058765269, "grad_norm": 372.6321716308594, "learning_rate": 0.0001264983894930905, "loss": 33.7562, "step": 12592 }, { "epoch": 33.259821723341034, "grad_norm": 721.3158569335938, "learning_rate": 0.0001264620682970447, "loss": 34.9509, "step": 12593 }, { "epoch": 33.262462859029384, "grad_norm": 1520.7774658203125, "learning_rate": 0.00012642575055072308, "loss": 35.3691, "step": 12594 }, { "epoch": 33.26510399471773, "grad_norm": 2660.209716796875, "learning_rate": 0.0001263894362551397, "loss": 41.374, "step": 12595 }, { "epoch": 33.26774513040608, "grad_norm": 602.503662109375, "learning_rate": 0.00012635312541130878, "loss": 39.9409, "step": 12596 }, { "epoch": 33.27038626609442, "grad_norm": 447.392333984375, "learning_rate": 0.0001263168180202442, "loss": 38.9125, "step": 12597 }, { "epoch": 33.27302740178277, "grad_norm": 664.0201416015625, "learning_rate": 0.0001262805140829598, "loss": 39.9834, "step": 12598 }, { "epoch": 33.27566853747111, "grad_norm": 797.5061645507812, "learning_rate": 0.0001262442136004694, "loss": 43.3458, "step": 12599 }, { "epoch": 33.278309673159455, "grad_norm": 939.1535034179688, "learning_rate": 0.00012620791657378663, "loss": 42.4826, "step": 12600 }, { "epoch": 33.278309673159455, "eval_loss": 3.79601788520813, "eval_runtime": 2.2305, "eval_samples_per_second": 221.927, "eval_steps_per_second": 27.797, "step": 12600 }, { "epoch": 33.280950808847805, "grad_norm": 801.71533203125, "learning_rate": 0.00012617162300392507, "loss": 43.6585, "step": 12601 }, { "epoch": 33.28359194453615, "grad_norm": 329.6461181640625, "learning_rate": 0.0001261353328918981, "loss": 41.7813, "step": 12602 }, { "epoch": 33.2862330802245, "grad_norm": 464.6836242675781, "learning_rate": 0.0001260990462387193, "loss": 40.7026, "step": 12603 }, { "epoch": 33.28887421591284, "grad_norm": 707.2406005859375, "learning_rate": 0.00012606276304540182, "loss": 40.5161, "step": 12604 }, { "epoch": 33.29151535160119, "grad_norm": 484.3229064941406, "learning_rate": 0.0001260264833129589, "loss": 38.3253, "step": 12605 }, { "epoch": 33.294156487289534, "grad_norm": 350.3291015625, "learning_rate": 0.0001259902070424036, "loss": 37.3266, "step": 12606 }, { "epoch": 33.29679762297788, "grad_norm": 601.28564453125, "learning_rate": 0.00012595393423474893, "loss": 39.1485, "step": 12607 }, { "epoch": 33.29943875866623, "grad_norm": 870.3259887695312, "learning_rate": 0.00012591766489100775, "loss": 38.262, "step": 12608 }, { "epoch": 33.30207989435457, "grad_norm": 506.09490966796875, "learning_rate": 0.00012588139901219283, "loss": 35.9346, "step": 12609 }, { "epoch": 33.30472103004292, "grad_norm": 698.443115234375, "learning_rate": 0.000125845136599317, "loss": 35.7752, "step": 12610 }, { "epoch": 33.30736216573126, "grad_norm": 1149.8575439453125, "learning_rate": 0.00012580887765339282, "loss": 36.3453, "step": 12611 }, { "epoch": 33.31000330141961, "grad_norm": 520.1011352539062, "learning_rate": 0.00012577262217543266, "loss": 35.2746, "step": 12612 }, { "epoch": 33.312644437107956, "grad_norm": 1314.7257080078125, "learning_rate": 0.0001257363701664492, "loss": 34.2652, "step": 12613 }, { "epoch": 33.315285572796306, "grad_norm": 1181.32421875, "learning_rate": 0.00012570012162745455, "loss": 34.8919, "step": 12614 }, { "epoch": 33.31792670848465, "grad_norm": 722.8828125, "learning_rate": 0.00012566387655946098, "loss": 35.1957, "step": 12615 }, { "epoch": 33.32056784417299, "grad_norm": 314.155029296875, "learning_rate": 0.00012562763496348067, "loss": 35.6414, "step": 12616 }, { "epoch": 33.32320897986134, "grad_norm": 579.9517822265625, "learning_rate": 0.00012559139684052557, "loss": 36.6052, "step": 12617 }, { "epoch": 33.325850115549684, "grad_norm": 1162.866455078125, "learning_rate": 0.00012555516219160762, "loss": 20.1414, "step": 12618 }, { "epoch": 33.328491251238034, "grad_norm": 1062.619873046875, "learning_rate": 0.00012551893101773854, "loss": 15.9602, "step": 12619 }, { "epoch": 33.33113238692638, "grad_norm": 1123.4735107421875, "learning_rate": 0.00012548270331993035, "loss": 14.3576, "step": 12620 }, { "epoch": 33.33377352261473, "grad_norm": 794.5001831054688, "learning_rate": 0.00012544647909919449, "loss": 10.3094, "step": 12621 }, { "epoch": 33.33641465830307, "grad_norm": 1134.125732421875, "learning_rate": 0.00012541025835654252, "loss": 11.649, "step": 12622 }, { "epoch": 33.33905579399141, "grad_norm": 3736.70654296875, "learning_rate": 0.0001253740410929859, "loss": 14.6478, "step": 12623 }, { "epoch": 33.34169692967976, "grad_norm": 4684.72216796875, "learning_rate": 0.00012533782730953598, "loss": 14.285, "step": 12624 }, { "epoch": 33.344338065368106, "grad_norm": 852.5457763671875, "learning_rate": 0.00012530161700720398, "loss": 12.6854, "step": 12625 }, { "epoch": 33.346979201056456, "grad_norm": 1320.8280029296875, "learning_rate": 0.00012526541018700104, "loss": 10.7969, "step": 12626 }, { "epoch": 33.3496203367448, "grad_norm": 591.59765625, "learning_rate": 0.0001252292068499383, "loss": 28.4165, "step": 12627 }, { "epoch": 33.35226147243315, "grad_norm": 1284.72314453125, "learning_rate": 0.0001251930069970267, "loss": 34.3125, "step": 12628 }, { "epoch": 33.35490260812149, "grad_norm": 1551.6761474609375, "learning_rate": 0.000125156810629277, "loss": 33.9798, "step": 12629 }, { "epoch": 33.357543743809835, "grad_norm": 441.22113037109375, "learning_rate": 0.00012512061774769997, "loss": 35.3983, "step": 12630 }, { "epoch": 33.360184879498185, "grad_norm": 764.881591796875, "learning_rate": 0.0001250844283533064, "loss": 33.8499, "step": 12631 }, { "epoch": 33.36282601518653, "grad_norm": 719.605224609375, "learning_rate": 0.00012504824244710682, "loss": 34.6287, "step": 12632 }, { "epoch": 33.36546715087488, "grad_norm": 323.2344665527344, "learning_rate": 0.00012501206003011166, "loss": 34.8109, "step": 12633 }, { "epoch": 33.36810828656322, "grad_norm": 655.1342163085938, "learning_rate": 0.00012497588110333132, "loss": 34.0323, "step": 12634 }, { "epoch": 33.37074942225157, "grad_norm": 795.5455932617188, "learning_rate": 0.00012493970566777605, "loss": 34.416, "step": 12635 }, { "epoch": 33.37339055793991, "grad_norm": 802.5441284179688, "learning_rate": 0.000124903533724456, "loss": 34.507, "step": 12636 }, { "epoch": 33.37603169362826, "grad_norm": 613.1825561523438, "learning_rate": 0.0001248673652743812, "loss": 34.4232, "step": 12637 }, { "epoch": 33.378672829316606, "grad_norm": 1030.214111328125, "learning_rate": 0.00012483120031856184, "loss": 33.6638, "step": 12638 }, { "epoch": 33.38131396500495, "grad_norm": 439.1671142578125, "learning_rate": 0.00012479503885800769, "loss": 34.7315, "step": 12639 }, { "epoch": 33.3839551006933, "grad_norm": 464.05889892578125, "learning_rate": 0.0001247588808937285, "loss": 33.9416, "step": 12640 }, { "epoch": 33.38659623638164, "grad_norm": 1381.103515625, "learning_rate": 0.000124722726426734, "loss": 34.3652, "step": 12641 }, { "epoch": 33.38923737206999, "grad_norm": 617.5750122070312, "learning_rate": 0.00012468657545803378, "loss": 36.1242, "step": 12642 }, { "epoch": 33.391878507758335, "grad_norm": 779.926513671875, "learning_rate": 0.0001246504279886373, "loss": 36.7812, "step": 12643 }, { "epoch": 33.394519643446685, "grad_norm": 959.6658325195312, "learning_rate": 0.00012461428401955394, "loss": 38.8908, "step": 12644 }, { "epoch": 33.39716077913503, "grad_norm": 1640.37060546875, "learning_rate": 0.0001245781435517931, "loss": 42.1305, "step": 12645 }, { "epoch": 33.39980191482337, "grad_norm": 1041.1080322265625, "learning_rate": 0.00012454200658636392, "loss": 39.6157, "step": 12646 }, { "epoch": 33.40244305051172, "grad_norm": 933.5064697265625, "learning_rate": 0.00012450587312427545, "loss": 38.8401, "step": 12647 }, { "epoch": 33.405084186200064, "grad_norm": 442.66705322265625, "learning_rate": 0.00012446974316653682, "loss": 38.5631, "step": 12648 }, { "epoch": 33.407725321888414, "grad_norm": 556.580322265625, "learning_rate": 0.00012443361671415687, "loss": 40.9137, "step": 12649 }, { "epoch": 33.41036645757676, "grad_norm": 934.5072021484375, "learning_rate": 0.00012439749376814442, "loss": 41.6825, "step": 12650 }, { "epoch": 33.41300759326511, "grad_norm": 644.5703735351562, "learning_rate": 0.00012436137432950817, "loss": 43.7968, "step": 12651 }, { "epoch": 33.41564872895345, "grad_norm": 572.5237426757812, "learning_rate": 0.00012432525839925674, "loss": 39.2429, "step": 12652 }, { "epoch": 33.41828986464179, "grad_norm": 333.7579650878906, "learning_rate": 0.00012428914597839864, "loss": 41.5383, "step": 12653 }, { "epoch": 33.42093100033014, "grad_norm": 416.6922607421875, "learning_rate": 0.0001242530370679422, "loss": 41.0456, "step": 12654 }, { "epoch": 33.423572136018485, "grad_norm": 1118.847900390625, "learning_rate": 0.0001242169316688959, "loss": 38.96, "step": 12655 }, { "epoch": 33.426213271706835, "grad_norm": 2210.19091796875, "learning_rate": 0.00012418082978226793, "loss": 38.2889, "step": 12656 }, { "epoch": 33.42885440739518, "grad_norm": 953.962158203125, "learning_rate": 0.00012414473140906636, "loss": 38.2293, "step": 12657 }, { "epoch": 33.43149554308353, "grad_norm": 1013.9146728515625, "learning_rate": 0.00012410863655029929, "loss": 37.0382, "step": 12658 }, { "epoch": 33.43413667877187, "grad_norm": 752.2908935546875, "learning_rate": 0.00012407254520697454, "loss": 36.653, "step": 12659 }, { "epoch": 33.43677781446022, "grad_norm": 736.4697265625, "learning_rate": 0.00012403645738009997, "loss": 35.4442, "step": 12660 }, { "epoch": 33.439418950148564, "grad_norm": 1006.478271484375, "learning_rate": 0.00012400037307068327, "loss": 35.8424, "step": 12661 }, { "epoch": 33.44206008583691, "grad_norm": 576.5436401367188, "learning_rate": 0.00012396429227973222, "loss": 34.2813, "step": 12662 }, { "epoch": 33.44470122152526, "grad_norm": 621.8555908203125, "learning_rate": 0.00012392821500825427, "loss": 34.6352, "step": 12663 }, { "epoch": 33.4473423572136, "grad_norm": 774.7807006835938, "learning_rate": 0.00012389214125725688, "loss": 34.145, "step": 12664 }, { "epoch": 33.44998349290195, "grad_norm": 613.1556396484375, "learning_rate": 0.00012385607102774723, "loss": 34.4685, "step": 12665 }, { "epoch": 33.45262462859029, "grad_norm": 964.1602172851562, "learning_rate": 0.00012382000432073284, "loss": 35.567, "step": 12666 }, { "epoch": 33.45526576427864, "grad_norm": 1113.65478515625, "learning_rate": 0.00012378394113722065, "loss": 36.1711, "step": 12667 }, { "epoch": 33.457906899966986, "grad_norm": 493.79345703125, "learning_rate": 0.00012374788147821782, "loss": 28.4513, "step": 12668 }, { "epoch": 33.46054803565533, "grad_norm": 1324.76416015625, "learning_rate": 0.00012371182534473118, "loss": 13.7131, "step": 12669 }, { "epoch": 33.46318917134368, "grad_norm": 619.4127197265625, "learning_rate": 0.00012367577273776766, "loss": 14.1824, "step": 12670 }, { "epoch": 33.46583030703202, "grad_norm": 2244.501708984375, "learning_rate": 0.000123639723658334, "loss": 15.0149, "step": 12671 }, { "epoch": 33.46847144272037, "grad_norm": 2196.236083984375, "learning_rate": 0.0001236036781074367, "loss": 11.3931, "step": 12672 }, { "epoch": 33.471112578408714, "grad_norm": 33036.1953125, "learning_rate": 0.00012356763608608256, "loss": 13.7705, "step": 12673 }, { "epoch": 33.473753714097064, "grad_norm": 1062.4471435546875, "learning_rate": 0.00012353159759527792, "loss": 11.3609, "step": 12674 }, { "epoch": 33.47639484978541, "grad_norm": 559.7110595703125, "learning_rate": 0.00012349556263602907, "loss": 11.1852, "step": 12675 }, { "epoch": 33.47903598547375, "grad_norm": 2487.82275390625, "learning_rate": 0.00012345953120934235, "loss": 14.8661, "step": 12676 }, { "epoch": 33.4816771211621, "grad_norm": 1587.0411376953125, "learning_rate": 0.0001234235033162239, "loss": 25.2633, "step": 12677 }, { "epoch": 33.48431825685044, "grad_norm": 2914.160400390625, "learning_rate": 0.00012338747895767975, "loss": 38.2167, "step": 12678 }, { "epoch": 33.48695939253879, "grad_norm": 833.1068115234375, "learning_rate": 0.0001233514581347158, "loss": 34.2203, "step": 12679 }, { "epoch": 33.489600528227136, "grad_norm": 630.9000244140625, "learning_rate": 0.00012331544084833807, "loss": 34.4978, "step": 12680 }, { "epoch": 33.492241663915486, "grad_norm": 1745.5394287109375, "learning_rate": 0.00012327942709955221, "loss": 35.5326, "step": 12681 }, { "epoch": 33.49488279960383, "grad_norm": 1542.616455078125, "learning_rate": 0.00012324341688936386, "loss": 34.9, "step": 12682 }, { "epoch": 33.49752393529218, "grad_norm": 1091.2193603515625, "learning_rate": 0.0001232074102187787, "loss": 34.6892, "step": 12683 }, { "epoch": 33.50016507098052, "grad_norm": 713.98876953125, "learning_rate": 0.0001231714070888021, "loss": 35.0782, "step": 12684 }, { "epoch": 33.502806206668865, "grad_norm": 559.4713745117188, "learning_rate": 0.0001231354075004395, "loss": 34.3844, "step": 12685 }, { "epoch": 33.505447342357215, "grad_norm": 1386.5340576171875, "learning_rate": 0.0001230994114546961, "loss": 34.5117, "step": 12686 }, { "epoch": 33.50808847804556, "grad_norm": 3191.88232421875, "learning_rate": 0.0001230634189525771, "loss": 35.0454, "step": 12687 }, { "epoch": 33.51072961373391, "grad_norm": 1137.3466796875, "learning_rate": 0.00012302742999508758, "loss": 36.3122, "step": 12688 }, { "epoch": 33.51337074942225, "grad_norm": 5373.68212890625, "learning_rate": 0.00012299144458323232, "loss": 35.0507, "step": 12689 }, { "epoch": 33.5160118851106, "grad_norm": 808.3257446289062, "learning_rate": 0.00012295546271801652, "loss": 35.0941, "step": 12690 }, { "epoch": 33.51865302079894, "grad_norm": 1403.3026123046875, "learning_rate": 0.00012291948440044478, "loss": 34.2015, "step": 12691 }, { "epoch": 33.521294156487286, "grad_norm": 667.52734375, "learning_rate": 0.00012288350963152178, "loss": 35.485, "step": 12692 }, { "epoch": 33.523935292175636, "grad_norm": 1318.453125, "learning_rate": 0.0001228475384122521, "loss": 36.2052, "step": 12693 }, { "epoch": 33.52657642786398, "grad_norm": 2970.247314453125, "learning_rate": 0.00012281157074364024, "loss": 37.148, "step": 12694 }, { "epoch": 33.52921756355233, "grad_norm": 640.7852172851562, "learning_rate": 0.00012277560662669052, "loss": 39.5483, "step": 12695 }, { "epoch": 33.53185869924067, "grad_norm": 1702.70068359375, "learning_rate": 0.00012273964606240718, "loss": 39.563, "step": 12696 }, { "epoch": 33.53449983492902, "grad_norm": 738.0660400390625, "learning_rate": 0.00012270368905179454, "loss": 40.5099, "step": 12697 }, { "epoch": 33.537140970617365, "grad_norm": 503.2278137207031, "learning_rate": 0.00012266773559585663, "loss": 40.7538, "step": 12698 }, { "epoch": 33.53978210630571, "grad_norm": 861.084716796875, "learning_rate": 0.00012263178569559729, "loss": 42.9969, "step": 12699 }, { "epoch": 33.54242324199406, "grad_norm": 900.3402709960938, "learning_rate": 0.00012259583935202061, "loss": 42.9177, "step": 12700 }, { "epoch": 33.5450643776824, "grad_norm": 491.7000732421875, "learning_rate": 0.0001225598965661303, "loss": 41.3294, "step": 12701 }, { "epoch": 33.54770551337075, "grad_norm": 566.0220947265625, "learning_rate": 0.00012252395733892998, "loss": 42.0667, "step": 12702 }, { "epoch": 33.550346649059094, "grad_norm": 960.474609375, "learning_rate": 0.00012248802167142328, "loss": 39.8494, "step": 12703 }, { "epoch": 33.552987784747444, "grad_norm": 1488.504638671875, "learning_rate": 0.00012245208956461367, "loss": 39.8439, "step": 12704 }, { "epoch": 33.55562892043579, "grad_norm": 1208.33154296875, "learning_rate": 0.00012241616101950453, "loss": 39.0245, "step": 12705 }, { "epoch": 33.55827005612414, "grad_norm": 808.1565551757812, "learning_rate": 0.00012238023603709908, "loss": 39.1982, "step": 12706 }, { "epoch": 33.56091119181248, "grad_norm": 2877.7841796875, "learning_rate": 0.00012234431461840065, "loss": 37.3416, "step": 12707 }, { "epoch": 33.56355232750082, "grad_norm": 1114.0262451171875, "learning_rate": 0.00012230839676441221, "loss": 35.9727, "step": 12708 }, { "epoch": 33.56619346318917, "grad_norm": 1890.22509765625, "learning_rate": 0.00012227248247613685, "loss": 35.2913, "step": 12709 }, { "epoch": 33.568834598877515, "grad_norm": 1177.4932861328125, "learning_rate": 0.00012223657175457734, "loss": 35.6229, "step": 12710 }, { "epoch": 33.571475734565865, "grad_norm": 527.0404052734375, "learning_rate": 0.0001222006646007365, "loss": 36.2578, "step": 12711 }, { "epoch": 33.57411687025421, "grad_norm": 853.775634765625, "learning_rate": 0.00012216476101561705, "loss": 35.5901, "step": 12712 }, { "epoch": 33.57675800594256, "grad_norm": 688.628662109375, "learning_rate": 0.00012212886100022144, "loss": 34.7709, "step": 12713 }, { "epoch": 33.5793991416309, "grad_norm": 821.39501953125, "learning_rate": 0.0001220929645555524, "loss": 35.8867, "step": 12714 }, { "epoch": 33.582040277319244, "grad_norm": 746.9749145507812, "learning_rate": 0.00012205707168261216, "loss": 35.5982, "step": 12715 }, { "epoch": 33.584681413007594, "grad_norm": 1735.21044921875, "learning_rate": 0.00012202118238240303, "loss": 40.2402, "step": 12716 }, { "epoch": 33.58732254869594, "grad_norm": 1015.8449096679688, "learning_rate": 0.00012198529665592714, "loss": 21.8088, "step": 12717 }, { "epoch": 33.58996368438429, "grad_norm": 12143.73046875, "learning_rate": 0.0001219494145041867, "loss": 10.9194, "step": 12718 }, { "epoch": 33.59260482007263, "grad_norm": 2379.17041015625, "learning_rate": 0.00012191353592818366, "loss": 12.9813, "step": 12719 }, { "epoch": 33.59524595576098, "grad_norm": 1004.7540893554688, "learning_rate": 0.00012187766092891988, "loss": 10.1753, "step": 12720 }, { "epoch": 33.59788709144932, "grad_norm": 1474.698486328125, "learning_rate": 0.00012184178950739719, "loss": 12.33, "step": 12721 }, { "epoch": 33.600528227137666, "grad_norm": 5978.79443359375, "learning_rate": 0.0001218059216646172, "loss": 16.3458, "step": 12722 }, { "epoch": 33.603169362826016, "grad_norm": 478.7947998046875, "learning_rate": 0.00012177005740158156, "loss": 12.7175, "step": 12723 }, { "epoch": 33.60581049851436, "grad_norm": 487.4851379394531, "learning_rate": 0.00012173419671929165, "loss": 9.0483, "step": 12724 }, { "epoch": 33.60845163420271, "grad_norm": 6786.89501953125, "learning_rate": 0.00012169833961874902, "loss": 16.624, "step": 12725 }, { "epoch": 33.61109276989105, "grad_norm": 1037.5595703125, "learning_rate": 0.0001216624861009549, "loss": 11.2134, "step": 12726 }, { "epoch": 33.6137339055794, "grad_norm": 803.2916870117188, "learning_rate": 0.00012162663616691039, "loss": 32.5477, "step": 12727 }, { "epoch": 33.616375041267744, "grad_norm": 693.1842041015625, "learning_rate": 0.00012159078981761682, "loss": 35.6353, "step": 12728 }, { "epoch": 33.619016176956094, "grad_norm": 886.1411743164062, "learning_rate": 0.00012155494705407494, "loss": 34.5445, "step": 12729 }, { "epoch": 33.62165731264444, "grad_norm": 1562.918701171875, "learning_rate": 0.00012151910787728568, "loss": 34.7564, "step": 12730 }, { "epoch": 33.62429844833278, "grad_norm": 338.9871520996094, "learning_rate": 0.0001214832722882498, "loss": 34.3026, "step": 12731 }, { "epoch": 33.62693958402113, "grad_norm": 705.322265625, "learning_rate": 0.00012144744028796812, "loss": 34.5778, "step": 12732 }, { "epoch": 33.62958071970947, "grad_norm": 558.3930053710938, "learning_rate": 0.00012141161187744115, "loss": 35.3449, "step": 12733 }, { "epoch": 33.63222185539782, "grad_norm": 566.0293579101562, "learning_rate": 0.00012137578705766928, "loss": 34.1199, "step": 12734 }, { "epoch": 33.634862991086166, "grad_norm": 945.731689453125, "learning_rate": 0.00012133996582965312, "loss": 34.6636, "step": 12735 }, { "epoch": 33.637504126774516, "grad_norm": 603.1190185546875, "learning_rate": 0.00012130414819439281, "loss": 34.6033, "step": 12736 }, { "epoch": 33.64014526246286, "grad_norm": 756.6117553710938, "learning_rate": 0.00012126833415288856, "loss": 35.1336, "step": 12737 }, { "epoch": 33.6427863981512, "grad_norm": 1766.0494384765625, "learning_rate": 0.00012123252370614049, "loss": 35.1535, "step": 12738 }, { "epoch": 33.64542753383955, "grad_norm": 1147.1676025390625, "learning_rate": 0.00012119671685514852, "loss": 35.3566, "step": 12739 }, { "epoch": 33.648068669527895, "grad_norm": 1100.321533203125, "learning_rate": 0.00012116091360091261, "loss": 33.5927, "step": 12740 }, { "epoch": 33.650709805216245, "grad_norm": 811.0950927734375, "learning_rate": 0.00012112511394443237, "loss": 34.2693, "step": 12741 }, { "epoch": 33.65335094090459, "grad_norm": 534.4453125, "learning_rate": 0.00012108931788670771, "loss": 36.093, "step": 12742 }, { "epoch": 33.65599207659294, "grad_norm": 1746.0576171875, "learning_rate": 0.00012105352542873815, "loss": 35.3111, "step": 12743 }, { "epoch": 33.65863321228128, "grad_norm": 1004.9769287109375, "learning_rate": 0.00012101773657152312, "loss": 38.0649, "step": 12744 }, { "epoch": 33.66127434796962, "grad_norm": 778.9418334960938, "learning_rate": 0.00012098195131606204, "loss": 39.6062, "step": 12745 }, { "epoch": 33.66391548365797, "grad_norm": 1178.4049072265625, "learning_rate": 0.0001209461696633542, "loss": 38.6035, "step": 12746 }, { "epoch": 33.666556619346316, "grad_norm": 1033.4267578125, "learning_rate": 0.00012091039161439874, "loss": 39.4006, "step": 12747 }, { "epoch": 33.669197755034666, "grad_norm": 1833.007080078125, "learning_rate": 0.00012087461717019465, "loss": 40.6045, "step": 12748 }, { "epoch": 33.67183889072301, "grad_norm": 1150.8157958984375, "learning_rate": 0.00012083884633174114, "loss": 39.8693, "step": 12749 }, { "epoch": 33.67448002641136, "grad_norm": 1519.557373046875, "learning_rate": 0.00012080307910003699, "loss": 43.4711, "step": 12750 }, { "epoch": 33.6771211620997, "grad_norm": 345.22454833984375, "learning_rate": 0.00012076731547608095, "loss": 42.0627, "step": 12751 }, { "epoch": 33.67976229778805, "grad_norm": 600.2847290039062, "learning_rate": 0.00012073155546087159, "loss": 41.9695, "step": 12752 }, { "epoch": 33.682403433476395, "grad_norm": 1009.0667724609375, "learning_rate": 0.00012069579905540773, "loss": 40.8282, "step": 12753 }, { "epoch": 33.68504456916474, "grad_norm": 750.8301391601562, "learning_rate": 0.00012066004626068769, "loss": 38.0434, "step": 12754 }, { "epoch": 33.68768570485309, "grad_norm": 866.562744140625, "learning_rate": 0.00012062429707770991, "loss": 36.9196, "step": 12755 }, { "epoch": 33.69032684054143, "grad_norm": 661.04931640625, "learning_rate": 0.00012058855150747264, "loss": 39.2778, "step": 12756 }, { "epoch": 33.69296797622978, "grad_norm": 328.416748046875, "learning_rate": 0.00012055280955097403, "loss": 39.4341, "step": 12757 }, { "epoch": 33.695609111918124, "grad_norm": 366.3754577636719, "learning_rate": 0.00012051707120921218, "loss": 37.6072, "step": 12758 }, { "epoch": 33.698250247606474, "grad_norm": 946.0753784179688, "learning_rate": 0.00012048133648318496, "loss": 37.3458, "step": 12759 }, { "epoch": 33.70089138329482, "grad_norm": 610.4987182617188, "learning_rate": 0.00012044560537389044, "loss": 37.5552, "step": 12760 }, { "epoch": 33.70353251898316, "grad_norm": 1739.5167236328125, "learning_rate": 0.00012040987788232627, "loss": 37.3888, "step": 12761 }, { "epoch": 33.70617365467151, "grad_norm": 363.66583251953125, "learning_rate": 0.00012037415400949004, "loss": 35.6735, "step": 12762 }, { "epoch": 33.70881479035985, "grad_norm": 562.6676635742188, "learning_rate": 0.0001203384337563796, "loss": 35.7503, "step": 12763 }, { "epoch": 33.7114559260482, "grad_norm": 542.2778930664062, "learning_rate": 0.00012030271712399213, "loss": 35.6557, "step": 12764 }, { "epoch": 33.714097061736545, "grad_norm": 482.14202880859375, "learning_rate": 0.00012026700411332511, "loss": 35.5836, "step": 12765 }, { "epoch": 33.716738197424895, "grad_norm": 460.9536437988281, "learning_rate": 0.00012023129472537569, "loss": 35.3031, "step": 12766 }, { "epoch": 33.71937933311324, "grad_norm": 1029.8406982421875, "learning_rate": 0.00012019558896114125, "loss": 38.6705, "step": 12767 }, { "epoch": 33.72202046880158, "grad_norm": 1059.5189208984375, "learning_rate": 0.00012015988682161872, "loss": 32.2959, "step": 12768 }, { "epoch": 33.72466160448993, "grad_norm": 1894.779052734375, "learning_rate": 0.00012012418830780497, "loss": 10.5107, "step": 12769 }, { "epoch": 33.727302740178274, "grad_norm": 1555.1190185546875, "learning_rate": 0.00012008849342069708, "loss": 9.1401, "step": 12770 }, { "epoch": 33.729943875866624, "grad_norm": 1944.6656494140625, "learning_rate": 0.00012005280216129169, "loss": 13.4392, "step": 12771 }, { "epoch": 33.73258501155497, "grad_norm": 1025.166748046875, "learning_rate": 0.00012001711453058548, "loss": 11.8966, "step": 12772 }, { "epoch": 33.73522614724332, "grad_norm": 1587.70947265625, "learning_rate": 0.00011998143052957496, "loss": 14.5647, "step": 12773 }, { "epoch": 33.73786728293166, "grad_norm": 2592.66162109375, "learning_rate": 0.00011994575015925666, "loss": 9.9679, "step": 12774 }, { "epoch": 33.74050841862001, "grad_norm": 1543.2642822265625, "learning_rate": 0.00011991007342062685, "loss": 11.9918, "step": 12775 }, { "epoch": 33.74314955430835, "grad_norm": 6830.4921875, "learning_rate": 0.00011987440031468175, "loss": 15.137, "step": 12776 }, { "epoch": 33.745790689996696, "grad_norm": 626.1482543945312, "learning_rate": 0.00011983873084241764, "loss": 15.0508, "step": 12777 }, { "epoch": 33.748431825685046, "grad_norm": 520.3231201171875, "learning_rate": 0.00011980306500483054, "loss": 35.1507, "step": 12778 }, { "epoch": 33.75107296137339, "grad_norm": 429.4674987792969, "learning_rate": 0.00011976740280291637, "loss": 36.1593, "step": 12779 }, { "epoch": 33.75371409706174, "grad_norm": 721.8549194335938, "learning_rate": 0.00011973174423767097, "loss": 34.9021, "step": 12780 }, { "epoch": 33.75635523275008, "grad_norm": 608.5694580078125, "learning_rate": 0.00011969608931009007, "loss": 35.5053, "step": 12781 }, { "epoch": 33.75899636843843, "grad_norm": 573.83349609375, "learning_rate": 0.00011966043802116936, "loss": 35.1963, "step": 12782 }, { "epoch": 33.761637504126774, "grad_norm": 2045.114013671875, "learning_rate": 0.00011962479037190424, "loss": 33.8979, "step": 12783 }, { "epoch": 33.76427863981512, "grad_norm": 1705.33837890625, "learning_rate": 0.00011958914636329038, "loss": 34.757, "step": 12784 }, { "epoch": 33.76691977550347, "grad_norm": 620.249267578125, "learning_rate": 0.00011955350599632298, "loss": 35.3319, "step": 12785 }, { "epoch": 33.76956091119181, "grad_norm": 594.3553466796875, "learning_rate": 0.0001195178692719973, "loss": 33.5678, "step": 12786 }, { "epoch": 33.77220204688016, "grad_norm": 742.3646240234375, "learning_rate": 0.00011948223619130838, "loss": 34.8282, "step": 12787 }, { "epoch": 33.7748431825685, "grad_norm": 1013.1198120117188, "learning_rate": 0.00011944660675525142, "loss": 34.1418, "step": 12788 }, { "epoch": 33.77748431825685, "grad_norm": 917.885498046875, "learning_rate": 0.00011941098096482128, "loss": 36.4138, "step": 12789 }, { "epoch": 33.780125453945196, "grad_norm": 729.3921508789062, "learning_rate": 0.00011937535882101281, "loss": 35.0457, "step": 12790 }, { "epoch": 33.78276658963354, "grad_norm": 2326.93603515625, "learning_rate": 0.0001193397403248207, "loss": 35.2662, "step": 12791 }, { "epoch": 33.78540772532189, "grad_norm": 800.8577270507812, "learning_rate": 0.00011930412547723957, "loss": 34.1339, "step": 12792 }, { "epoch": 33.78804886101023, "grad_norm": 3409.1123046875, "learning_rate": 0.00011926851427926397, "loss": 34.7765, "step": 12793 }, { "epoch": 33.79068999669858, "grad_norm": 1337.291748046875, "learning_rate": 0.00011923290673188822, "loss": 38.6958, "step": 12794 }, { "epoch": 33.793331132386925, "grad_norm": 667.736083984375, "learning_rate": 0.00011919730283610684, "loss": 39.7855, "step": 12795 }, { "epoch": 33.795972268075275, "grad_norm": 537.4090576171875, "learning_rate": 0.00011916170259291392, "loss": 39.349, "step": 12796 }, { "epoch": 33.79861340376362, "grad_norm": 548.5026245117188, "learning_rate": 0.00011912610600330353, "loss": 37.9793, "step": 12797 }, { "epoch": 33.80125453945197, "grad_norm": 713.9087524414062, "learning_rate": 0.00011909051306826993, "loss": 40.6659, "step": 12798 }, { "epoch": 33.80389567514031, "grad_norm": 1353.434326171875, "learning_rate": 0.00011905492378880672, "loss": 40.685, "step": 12799 }, { "epoch": 33.80653681082865, "grad_norm": 624.8223876953125, "learning_rate": 0.00011901933816590788, "loss": 40.7814, "step": 12800 }, { "epoch": 33.80653681082865, "eval_loss": 3.8017094135284424, "eval_runtime": 2.1936, "eval_samples_per_second": 225.654, "eval_steps_per_second": 28.264, "step": 12800 }, { "epoch": 33.809177946517, "grad_norm": 642.2083740234375, "learning_rate": 0.00011898375620056698, "loss": 40.2341, "step": 12801 }, { "epoch": 33.811819082205346, "grad_norm": 996.353271484375, "learning_rate": 0.00011894817789377782, "loss": 41.4325, "step": 12802 }, { "epoch": 33.814460217893696, "grad_norm": 626.7393188476562, "learning_rate": 0.0001189126032465338, "loss": 41.5196, "step": 12803 }, { "epoch": 33.81710135358204, "grad_norm": 492.5699462890625, "learning_rate": 0.00011887703225982824, "loss": 41.9243, "step": 12804 }, { "epoch": 33.81974248927039, "grad_norm": 411.404296875, "learning_rate": 0.00011884146493465461, "loss": 39.963, "step": 12805 }, { "epoch": 33.82238362495873, "grad_norm": 516.8643188476562, "learning_rate": 0.00011880590127200605, "loss": 40.7186, "step": 12806 }, { "epoch": 33.825024760647075, "grad_norm": 381.90667724609375, "learning_rate": 0.00011877034127287562, "loss": 38.4356, "step": 12807 }, { "epoch": 33.827665896335425, "grad_norm": 1032.084228515625, "learning_rate": 0.00011873478493825631, "loss": 35.8096, "step": 12808 }, { "epoch": 33.83030703202377, "grad_norm": 449.1246643066406, "learning_rate": 0.00011869923226914104, "loss": 35.1908, "step": 12809 }, { "epoch": 33.83294816771212, "grad_norm": 449.566650390625, "learning_rate": 0.00011866368326652257, "loss": 35.4127, "step": 12810 }, { "epoch": 33.83558930340046, "grad_norm": 346.2059631347656, "learning_rate": 0.00011862813793139348, "loss": 35.7122, "step": 12811 }, { "epoch": 33.83823043908881, "grad_norm": 367.5126953125, "learning_rate": 0.00011859259626474658, "loss": 35.1439, "step": 12812 }, { "epoch": 33.840871574777154, "grad_norm": 375.3915710449219, "learning_rate": 0.00011855705826757423, "loss": 34.8416, "step": 12813 }, { "epoch": 33.8435127104655, "grad_norm": 480.6423034667969, "learning_rate": 0.00011852152394086885, "loss": 35.6599, "step": 12814 }, { "epoch": 33.84615384615385, "grad_norm": 2071.4375, "learning_rate": 0.00011848599328562262, "loss": 35.0482, "step": 12815 }, { "epoch": 33.84879498184219, "grad_norm": 9114.8515625, "learning_rate": 0.00011845046630282783, "loss": 38.8467, "step": 12816 }, { "epoch": 33.85143611753054, "grad_norm": 959.050537109375, "learning_rate": 0.00011841494299347643, "loss": 39.5065, "step": 12817 }, { "epoch": 33.85407725321888, "grad_norm": 1499.1453857421875, "learning_rate": 0.0001183794233585604, "loss": 16.5157, "step": 12818 }, { "epoch": 33.85671838890723, "grad_norm": 3111.519287109375, "learning_rate": 0.00011834390739907175, "loss": 11.3493, "step": 12819 }, { "epoch": 33.859359524595575, "grad_norm": 11399.1357421875, "learning_rate": 0.00011830839511600211, "loss": 12.6614, "step": 12820 }, { "epoch": 33.862000660283925, "grad_norm": 5482.40869140625, "learning_rate": 0.00011827288651034321, "loss": 15.0363, "step": 12821 }, { "epoch": 33.86464179597227, "grad_norm": 1936.573486328125, "learning_rate": 0.00011823738158308647, "loss": 14.1951, "step": 12822 }, { "epoch": 33.86728293166061, "grad_norm": 3195.3955078125, "learning_rate": 0.00011820188033522353, "loss": 14.9403, "step": 12823 }, { "epoch": 33.86992406734896, "grad_norm": 2523.25634765625, "learning_rate": 0.00011816638276774569, "loss": 14.162, "step": 12824 }, { "epoch": 33.872565203037304, "grad_norm": 1066.357421875, "learning_rate": 0.00011813088888164413, "loss": 11.5393, "step": 12825 }, { "epoch": 33.875206338725654, "grad_norm": 364.3730773925781, "learning_rate": 0.00011809539867791005, "loss": 9.0905, "step": 12826 }, { "epoch": 33.877847474414, "grad_norm": 676.17626953125, "learning_rate": 0.00011805991215753447, "loss": 26.8951, "step": 12827 }, { "epoch": 33.88048861010235, "grad_norm": 963.9036254882812, "learning_rate": 0.00011802442932150836, "loss": 36.3182, "step": 12828 }, { "epoch": 33.88312974579069, "grad_norm": 992.8408203125, "learning_rate": 0.00011798895017082243, "loss": 34.857, "step": 12829 }, { "epoch": 33.88577088147903, "grad_norm": 1849.2183837890625, "learning_rate": 0.00011795347470646762, "loss": 35.0258, "step": 12830 }, { "epoch": 33.88841201716738, "grad_norm": 346.6271057128906, "learning_rate": 0.00011791800292943444, "loss": 34.3265, "step": 12831 }, { "epoch": 33.891053152855726, "grad_norm": 625.0454711914062, "learning_rate": 0.00011788253484071335, "loss": 35.1193, "step": 12832 }, { "epoch": 33.893694288544076, "grad_norm": 573.9558715820312, "learning_rate": 0.00011784707044129506, "loss": 33.3617, "step": 12833 }, { "epoch": 33.89633542423242, "grad_norm": 1259.0367431640625, "learning_rate": 0.00011781160973216962, "loss": 34.1782, "step": 12834 }, { "epoch": 33.89897655992077, "grad_norm": 1003.4657592773438, "learning_rate": 0.0001177761527143272, "loss": 34.0577, "step": 12835 }, { "epoch": 33.90161769560911, "grad_norm": 549.7288818359375, "learning_rate": 0.00011774069938875814, "loss": 34.4488, "step": 12836 }, { "epoch": 33.904258831297454, "grad_norm": 587.7720336914062, "learning_rate": 0.00011770524975645239, "loss": 34.4413, "step": 12837 }, { "epoch": 33.906899966985804, "grad_norm": 745.460693359375, "learning_rate": 0.00011766980381839978, "loss": 35.0366, "step": 12838 }, { "epoch": 33.90954110267415, "grad_norm": 505.54315185546875, "learning_rate": 0.0001176343615755901, "loss": 34.2382, "step": 12839 }, { "epoch": 33.9121822383625, "grad_norm": 1412.8074951171875, "learning_rate": 0.0001175989230290132, "loss": 33.5961, "step": 12840 }, { "epoch": 33.91482337405084, "grad_norm": 483.53759765625, "learning_rate": 0.00011756348817965862, "loss": 34.8159, "step": 12841 }, { "epoch": 33.91746450973919, "grad_norm": 1327.3531494140625, "learning_rate": 0.0001175280570285158, "loss": 34.0427, "step": 12842 }, { "epoch": 33.92010564542753, "grad_norm": 943.9072875976562, "learning_rate": 0.00011749262957657419, "loss": 35.9309, "step": 12843 }, { "epoch": 33.92274678111588, "grad_norm": 750.2090454101562, "learning_rate": 0.00011745720582482306, "loss": 37.7427, "step": 12844 }, { "epoch": 33.925387916804226, "grad_norm": 1945.3106689453125, "learning_rate": 0.00011742178577425159, "loss": 42.3896, "step": 12845 }, { "epoch": 33.92802905249257, "grad_norm": 660.7682495117188, "learning_rate": 0.00011738636942584879, "loss": 39.9611, "step": 12846 }, { "epoch": 33.93067018818092, "grad_norm": 778.34228515625, "learning_rate": 0.00011735095678060378, "loss": 41.3229, "step": 12847 }, { "epoch": 33.93331132386926, "grad_norm": 351.7457275390625, "learning_rate": 0.00011731554783950543, "loss": 42.117, "step": 12848 }, { "epoch": 33.93595245955761, "grad_norm": 573.0509033203125, "learning_rate": 0.00011728014260354234, "loss": 42.1677, "step": 12849 }, { "epoch": 33.938593595245955, "grad_norm": 712.2510986328125, "learning_rate": 0.0001172447410737035, "loss": 40.3269, "step": 12850 }, { "epoch": 33.941234730934305, "grad_norm": 1079.3582763671875, "learning_rate": 0.00011720934325097718, "loss": 38.4547, "step": 12851 }, { "epoch": 33.94387586662265, "grad_norm": 654.138671875, "learning_rate": 0.00011717394913635196, "loss": 37.224, "step": 12852 }, { "epoch": 33.94651700231099, "grad_norm": 514.2325439453125, "learning_rate": 0.00011713855873081608, "loss": 36.8322, "step": 12853 }, { "epoch": 33.94915813799934, "grad_norm": 629.052734375, "learning_rate": 0.000117103172035358, "loss": 35.5869, "step": 12854 }, { "epoch": 33.95179927368768, "grad_norm": 578.89501953125, "learning_rate": 0.00011706778905096577, "loss": 35.1917, "step": 12855 }, { "epoch": 33.95444040937603, "grad_norm": 512.6728515625, "learning_rate": 0.00011703240977862733, "loss": 35.6578, "step": 12856 }, { "epoch": 33.957081545064376, "grad_norm": 1085.56640625, "learning_rate": 0.00011699703421933086, "loss": 25.6553, "step": 12857 }, { "epoch": 33.959722680752726, "grad_norm": 6987.72314453125, "learning_rate": 0.00011696166237406405, "loss": 13.7121, "step": 12858 }, { "epoch": 33.96236381644107, "grad_norm": 499.42138671875, "learning_rate": 0.00011692629424381468, "loss": 11.5184, "step": 12859 }, { "epoch": 33.96500495212941, "grad_norm": 2135.862548828125, "learning_rate": 0.0001168909298295704, "loss": 10.0469, "step": 12860 }, { "epoch": 33.96764608781776, "grad_norm": 675.4156494140625, "learning_rate": 0.00011685556913231868, "loss": 10.527, "step": 12861 }, { "epoch": 33.970287223506105, "grad_norm": 614.247314453125, "learning_rate": 0.00011682021215304697, "loss": 12.03, "step": 12862 }, { "epoch": 33.972928359194455, "grad_norm": 698.2581176757812, "learning_rate": 0.00011678485889274254, "loss": 31.9478, "step": 12863 }, { "epoch": 33.9755694948828, "grad_norm": 885.1618041992188, "learning_rate": 0.00011674950935239273, "loss": 33.6826, "step": 12864 }, { "epoch": 33.97821063057115, "grad_norm": 927.28271484375, "learning_rate": 0.0001167141635329846, "loss": 34.5318, "step": 12865 }, { "epoch": 33.98085176625949, "grad_norm": 1407.700927734375, "learning_rate": 0.00011667882143550521, "loss": 34.9483, "step": 12866 }, { "epoch": 33.98349290194784, "grad_norm": 524.3601684570312, "learning_rate": 0.00011664348306094128, "loss": 35.853, "step": 12867 }, { "epoch": 33.986134037636184, "grad_norm": 1014.2113037109375, "learning_rate": 0.00011660814841027992, "loss": 34.3119, "step": 12868 }, { "epoch": 33.98877517332453, "grad_norm": 727.721435546875, "learning_rate": 0.00011657281748450757, "loss": 36.7639, "step": 12869 }, { "epoch": 33.99141630901288, "grad_norm": 1193.58203125, "learning_rate": 0.00011653749028461083, "loss": 34.3874, "step": 12870 }, { "epoch": 33.99405744470122, "grad_norm": 1371.3199462890625, "learning_rate": 0.00011650216681157638, "loss": 34.2861, "step": 12871 }, { "epoch": 33.99669858038957, "grad_norm": 535.3816528320312, "learning_rate": 0.00011646684706639049, "loss": 36.2524, "step": 12872 }, { "epoch": 33.99933971607791, "grad_norm": 1024.8892822265625, "learning_rate": 0.00011643153105003946, "loss": 35.7168, "step": 12873 }, { "epoch": 34.00198085176626, "grad_norm": 627.8704833984375, "learning_rate": 0.00011639621876350934, "loss": 39.0294, "step": 12874 }, { "epoch": 34.004621987454605, "grad_norm": 484.26470947265625, "learning_rate": 0.00011636091020778644, "loss": 38.2214, "step": 12875 }, { "epoch": 34.00726312314295, "grad_norm": 671.8497314453125, "learning_rate": 0.00011632560538385662, "loss": 39.0343, "step": 12876 }, { "epoch": 34.0099042588313, "grad_norm": 1365.7406005859375, "learning_rate": 0.00011629030429270576, "loss": 39.1477, "step": 12877 }, { "epoch": 34.01254539451964, "grad_norm": 976.1868896484375, "learning_rate": 0.00011625500693531959, "loss": 42.4232, "step": 12878 }, { "epoch": 34.01518653020799, "grad_norm": 392.59368896484375, "learning_rate": 0.00011621971331268377, "loss": 41.8982, "step": 12879 }, { "epoch": 34.017827665896334, "grad_norm": 464.30126953125, "learning_rate": 0.0001161844234257839, "loss": 43.3334, "step": 12880 }, { "epoch": 34.020468801584684, "grad_norm": 383.5874328613281, "learning_rate": 0.00011614913727560527, "loss": 38.4791, "step": 12881 }, { "epoch": 34.02310993727303, "grad_norm": 498.8888854980469, "learning_rate": 0.0001161138548631335, "loss": 39.4081, "step": 12882 }, { "epoch": 34.02575107296137, "grad_norm": 982.1229858398438, "learning_rate": 0.00011607857618935363, "loss": 40.2914, "step": 12883 }, { "epoch": 34.02839220864972, "grad_norm": 668.8703002929688, "learning_rate": 0.00011604330125525078, "loss": 38.3673, "step": 12884 }, { "epoch": 34.03103334433806, "grad_norm": 869.6382446289062, "learning_rate": 0.00011600803006181024, "loss": 38.1044, "step": 12885 }, { "epoch": 34.03367448002641, "grad_norm": 1005.2489624023438, "learning_rate": 0.00011597276261001666, "loss": 37.8617, "step": 12886 }, { "epoch": 34.036315615714756, "grad_norm": 985.984619140625, "learning_rate": 0.00011593749890085495, "loss": 36.9279, "step": 12887 }, { "epoch": 34.038956751403106, "grad_norm": 439.5797119140625, "learning_rate": 0.00011590223893530975, "loss": 37.1987, "step": 12888 }, { "epoch": 34.04159788709145, "grad_norm": 1278.444091796875, "learning_rate": 0.00011586698271436582, "loss": 35.2706, "step": 12889 }, { "epoch": 34.0442390227798, "grad_norm": 470.570556640625, "learning_rate": 0.00011583173023900762, "loss": 35.3014, "step": 12890 }, { "epoch": 34.04688015846814, "grad_norm": 258.0578918457031, "learning_rate": 0.00011579648151021943, "loss": 33.6948, "step": 12891 }, { "epoch": 34.049521294156484, "grad_norm": 1037.92236328125, "learning_rate": 0.00011576123652898576, "loss": 34.7819, "step": 12892 }, { "epoch": 34.052162429844834, "grad_norm": 639.3243408203125, "learning_rate": 0.00011572599529629072, "loss": 34.6946, "step": 12893 }, { "epoch": 34.05480356553318, "grad_norm": 499.4789733886719, "learning_rate": 0.00011569075781311835, "loss": 34.9021, "step": 12894 }, { "epoch": 34.05744470122153, "grad_norm": 1109.786865234375, "learning_rate": 0.00011565552408045272, "loss": 44.9976, "step": 12895 }, { "epoch": 34.06008583690987, "grad_norm": 8616.576171875, "learning_rate": 0.0001156202940992776, "loss": 17.2189, "step": 12896 }, { "epoch": 34.06272697259822, "grad_norm": 1456.0623779296875, "learning_rate": 0.00011558506787057687, "loss": 14.0452, "step": 12897 }, { "epoch": 34.06536810828656, "grad_norm": 2433.974365234375, "learning_rate": 0.00011554984539533406, "loss": 12.635, "step": 12898 }, { "epoch": 34.068009243974906, "grad_norm": 20673.22265625, "learning_rate": 0.00011551462667453294, "loss": 18.1613, "step": 12899 }, { "epoch": 34.070650379663256, "grad_norm": 2955.111572265625, "learning_rate": 0.00011547941170915685, "loss": 12.2303, "step": 12900 }, { "epoch": 34.0732915153516, "grad_norm": 1634.0958251953125, "learning_rate": 0.00011544420050018917, "loss": 16.1356, "step": 12901 }, { "epoch": 34.07593265103995, "grad_norm": 821.5759887695312, "learning_rate": 0.00011540899304861304, "loss": 12.0997, "step": 12902 }, { "epoch": 34.07857378672829, "grad_norm": 595.9430541992188, "learning_rate": 0.00011537378935541193, "loss": 12.93, "step": 12903 }, { "epoch": 34.08121492241664, "grad_norm": 1727.4912109375, "learning_rate": 0.00011533858942156857, "loss": 11.3099, "step": 12904 }, { "epoch": 34.083856058104985, "grad_norm": 922.1354370117188, "learning_rate": 0.00011530339324806591, "loss": 15.1586, "step": 12905 }, { "epoch": 34.08649719379333, "grad_norm": 948.9271240234375, "learning_rate": 0.00011526820083588696, "loss": 35.8362, "step": 12906 }, { "epoch": 34.08913832948168, "grad_norm": 2625.19580078125, "learning_rate": 0.00011523301218601434, "loss": 34.9665, "step": 12907 }, { "epoch": 34.09177946517002, "grad_norm": 471.1881408691406, "learning_rate": 0.0001151978272994307, "loss": 35.0242, "step": 12908 }, { "epoch": 34.09442060085837, "grad_norm": 1217.409423828125, "learning_rate": 0.00011516264617711844, "loss": 35.706, "step": 12909 }, { "epoch": 34.09706173654671, "grad_norm": 1072.564453125, "learning_rate": 0.00011512746882006018, "loss": 34.8306, "step": 12910 }, { "epoch": 34.09970287223506, "grad_norm": 636.2339477539062, "learning_rate": 0.00011509229522923811, "loss": 36.9955, "step": 12911 }, { "epoch": 34.102344007923406, "grad_norm": 446.4933776855469, "learning_rate": 0.00011505712540563448, "loss": 35.6541, "step": 12912 }, { "epoch": 34.104985143611756, "grad_norm": 321.0964660644531, "learning_rate": 0.00011502195935023133, "loss": 34.5206, "step": 12913 }, { "epoch": 34.1076262793001, "grad_norm": 1431.668212890625, "learning_rate": 0.00011498679706401069, "loss": 34.4453, "step": 12914 }, { "epoch": 34.11026741498844, "grad_norm": 6623.30126953125, "learning_rate": 0.0001149516385479544, "loss": 34.116, "step": 12915 }, { "epoch": 34.11290855067679, "grad_norm": 598.5888671875, "learning_rate": 0.0001149164838030442, "loss": 35.5315, "step": 12916 }, { "epoch": 34.115549686365135, "grad_norm": 988.9390869140625, "learning_rate": 0.00011488133283026195, "loss": 33.2439, "step": 12917 }, { "epoch": 34.118190822053485, "grad_norm": 789.98486328125, "learning_rate": 0.00011484618563058907, "loss": 35.9048, "step": 12918 }, { "epoch": 34.12083195774183, "grad_norm": 616.5728149414062, "learning_rate": 0.00011481104220500698, "loss": 34.8521, "step": 12919 }, { "epoch": 34.12347309343018, "grad_norm": 1303.61181640625, "learning_rate": 0.00011477590255449732, "loss": 35.3079, "step": 12920 }, { "epoch": 34.12611422911852, "grad_norm": 1035.878173828125, "learning_rate": 0.00011474076668004104, "loss": 36.4948, "step": 12921 }, { "epoch": 34.128755364806864, "grad_norm": 555.4237060546875, "learning_rate": 0.0001147056345826194, "loss": 36.2893, "step": 12922 }, { "epoch": 34.131396500495214, "grad_norm": 3110.845947265625, "learning_rate": 0.00011467050626321334, "loss": 39.4224, "step": 12923 }, { "epoch": 34.13403763618356, "grad_norm": 567.085693359375, "learning_rate": 0.00011463538172280402, "loss": 40.9209, "step": 12924 }, { "epoch": 34.13667877187191, "grad_norm": 532.3855590820312, "learning_rate": 0.00011460026096237211, "loss": 39.1919, "step": 12925 }, { "epoch": 34.13931990756025, "grad_norm": 703.107177734375, "learning_rate": 0.0001145651439828983, "loss": 39.6443, "step": 12926 }, { "epoch": 34.1419610432486, "grad_norm": 594.151123046875, "learning_rate": 0.0001145300307853634, "loss": 41.4022, "step": 12927 }, { "epoch": 34.14460217893694, "grad_norm": 1505.80712890625, "learning_rate": 0.00011449492137074777, "loss": 41.202, "step": 12928 }, { "epoch": 34.147243314625285, "grad_norm": 532.6651000976562, "learning_rate": 0.00011445981574003186, "loss": 44.38, "step": 12929 }, { "epoch": 34.149884450313635, "grad_norm": 884.6322631835938, "learning_rate": 0.00011442471389419601, "loss": 41.1932, "step": 12930 }, { "epoch": 34.15252558600198, "grad_norm": 669.2030029296875, "learning_rate": 0.00011438961583422036, "loss": 44.8718, "step": 12931 }, { "epoch": 34.15516672169033, "grad_norm": 411.6371154785156, "learning_rate": 0.00011435452156108503, "loss": 40.568, "step": 12932 }, { "epoch": 34.15780785737867, "grad_norm": 3988.9228515625, "learning_rate": 0.00011431943107576992, "loss": 40.5471, "step": 12933 }, { "epoch": 34.16044899306702, "grad_norm": 581.9429321289062, "learning_rate": 0.00011428434437925506, "loss": 38.3125, "step": 12934 }, { "epoch": 34.163090128755364, "grad_norm": 1303.31640625, "learning_rate": 0.00011424926147252019, "loss": 38.1952, "step": 12935 }, { "epoch": 34.165731264443714, "grad_norm": 802.8572998046875, "learning_rate": 0.00011421418235654493, "loss": 37.5891, "step": 12936 }, { "epoch": 34.16837240013206, "grad_norm": 441.96478271484375, "learning_rate": 0.00011417910703230878, "loss": 35.6478, "step": 12937 }, { "epoch": 34.1710135358204, "grad_norm": 654.8563232421875, "learning_rate": 0.00011414403550079144, "loss": 36.7586, "step": 12938 }, { "epoch": 34.17365467150875, "grad_norm": 711.0220336914062, "learning_rate": 0.00011410896776297205, "loss": 35.6819, "step": 12939 }, { "epoch": 34.17629580719709, "grad_norm": 816.2174682617188, "learning_rate": 0.0001140739038198298, "loss": 34.932, "step": 12940 }, { "epoch": 34.17893694288544, "grad_norm": 1535.021728515625, "learning_rate": 0.00011403884367234402, "loss": 36.433, "step": 12941 }, { "epoch": 34.181578078573786, "grad_norm": 929.101806640625, "learning_rate": 0.00011400378732149366, "loss": 34.4516, "step": 12942 }, { "epoch": 34.184219214262136, "grad_norm": 429.1257019042969, "learning_rate": 0.00011396873476825768, "loss": 35.1168, "step": 12943 }, { "epoch": 34.18686034995048, "grad_norm": 994.0762329101562, "learning_rate": 0.00011393368601361475, "loss": 36.5734, "step": 12944 }, { "epoch": 34.18950148563882, "grad_norm": 1418.5762939453125, "learning_rate": 0.00011389864105854384, "loss": 36.2421, "step": 12945 }, { "epoch": 34.19214262132717, "grad_norm": 1397.60986328125, "learning_rate": 0.0001138635999040234, "loss": 31.7882, "step": 12946 }, { "epoch": 34.194783757015514, "grad_norm": 670.8582763671875, "learning_rate": 0.00011382856255103194, "loss": 9.8837, "step": 12947 }, { "epoch": 34.197424892703864, "grad_norm": 1802.5277099609375, "learning_rate": 0.0001137935290005479, "loss": 9.2692, "step": 12948 }, { "epoch": 34.20006602839221, "grad_norm": 1570.926025390625, "learning_rate": 0.00011375849925354956, "loss": 13.1015, "step": 12949 }, { "epoch": 34.20270716408056, "grad_norm": 2191.90234375, "learning_rate": 0.00011372347331101509, "loss": 9.4562, "step": 12950 }, { "epoch": 34.2053482997689, "grad_norm": 1180.99365234375, "learning_rate": 0.00011368845117392248, "loss": 13.0398, "step": 12951 }, { "epoch": 34.20798943545724, "grad_norm": 773.9447021484375, "learning_rate": 0.0001136534328432499, "loss": 15.0529, "step": 12952 }, { "epoch": 34.21063057114559, "grad_norm": 2931.80224609375, "learning_rate": 0.0001136184183199751, "loss": 11.7968, "step": 12953 }, { "epoch": 34.213271706833936, "grad_norm": 1193.555419921875, "learning_rate": 0.00011358340760507576, "loss": 10.2077, "step": 12954 }, { "epoch": 34.215912842522286, "grad_norm": 3512.248779296875, "learning_rate": 0.00011354840069952982, "loss": 13.566, "step": 12955 }, { "epoch": 34.21855397821063, "grad_norm": 980.5659790039062, "learning_rate": 0.00011351339760431454, "loss": 34.2944, "step": 12956 }, { "epoch": 34.22119511389898, "grad_norm": 450.9232482910156, "learning_rate": 0.00011347839832040746, "loss": 35.2528, "step": 12957 }, { "epoch": 34.22383624958732, "grad_norm": 339.1415710449219, "learning_rate": 0.00011344340284878579, "loss": 33.7899, "step": 12958 }, { "epoch": 34.22647738527567, "grad_norm": 454.652587890625, "learning_rate": 0.000113408411190427, "loss": 34.7745, "step": 12959 }, { "epoch": 34.229118520964015, "grad_norm": 2684.8994140625, "learning_rate": 0.0001133734233463081, "loss": 34.1462, "step": 12960 }, { "epoch": 34.23175965665236, "grad_norm": 579.93896484375, "learning_rate": 0.00011333843931740597, "loss": 35.5699, "step": 12961 }, { "epoch": 34.23440079234071, "grad_norm": 1285.09375, "learning_rate": 0.00011330345910469775, "loss": 35.5762, "step": 12962 }, { "epoch": 34.23704192802905, "grad_norm": 650.8252563476562, "learning_rate": 0.00011326848270916012, "loss": 34.7917, "step": 12963 }, { "epoch": 34.2396830637174, "grad_norm": 1521.834228515625, "learning_rate": 0.0001132335101317698, "loss": 34.3371, "step": 12964 }, { "epoch": 34.24232419940574, "grad_norm": 944.7412109375, "learning_rate": 0.00011319854137350338, "loss": 35.1906, "step": 12965 }, { "epoch": 34.24496533509409, "grad_norm": 736.7548217773438, "learning_rate": 0.00011316357643533732, "loss": 34.881, "step": 12966 }, { "epoch": 34.247606470782436, "grad_norm": 866.8917236328125, "learning_rate": 0.000113128615318248, "loss": 33.9223, "step": 12967 }, { "epoch": 34.25024760647078, "grad_norm": 767.2023315429688, "learning_rate": 0.00011309365802321161, "loss": 34.0194, "step": 12968 }, { "epoch": 34.25288874215913, "grad_norm": 764.100341796875, "learning_rate": 0.00011305870455120452, "loss": 34.2934, "step": 12969 }, { "epoch": 34.25552987784747, "grad_norm": 981.8917846679688, "learning_rate": 0.0001130237549032026, "loss": 34.4434, "step": 12970 }, { "epoch": 34.25817101353582, "grad_norm": 750.3289184570312, "learning_rate": 0.00011298880908018181, "loss": 35.3213, "step": 12971 }, { "epoch": 34.260812149224165, "grad_norm": 2555.807373046875, "learning_rate": 0.00011295386708311812, "loss": 36.2853, "step": 12972 }, { "epoch": 34.263453284912515, "grad_norm": 1087.5567626953125, "learning_rate": 0.00011291892891298727, "loss": 38.507, "step": 12973 }, { "epoch": 34.26609442060086, "grad_norm": 5659.96826171875, "learning_rate": 0.00011288399457076473, "loss": 40.3351, "step": 12974 }, { "epoch": 34.2687355562892, "grad_norm": 993.2999877929688, "learning_rate": 0.000112849064057426, "loss": 39.0627, "step": 12975 }, { "epoch": 34.27137669197755, "grad_norm": 829.3589477539062, "learning_rate": 0.00011281413737394669, "loss": 39.2371, "step": 12976 }, { "epoch": 34.274017827665894, "grad_norm": 1052.96630859375, "learning_rate": 0.000112779214521302, "loss": 41.0491, "step": 12977 }, { "epoch": 34.276658963354244, "grad_norm": 1136.2432861328125, "learning_rate": 0.00011274429550046702, "loss": 41.8179, "step": 12978 }, { "epoch": 34.27930009904259, "grad_norm": 1285.8641357421875, "learning_rate": 0.00011270938031241709, "loss": 42.2096, "step": 12979 }, { "epoch": 34.28194123473094, "grad_norm": 806.142578125, "learning_rate": 0.00011267446895812702, "loss": 42.2329, "step": 12980 }, { "epoch": 34.28458237041928, "grad_norm": 741.9119873046875, "learning_rate": 0.0001126395614385718, "loss": 41.1973, "step": 12981 }, { "epoch": 34.28722350610763, "grad_norm": 454.4678649902344, "learning_rate": 0.00011260465775472609, "loss": 41.4654, "step": 12982 }, { "epoch": 34.28986464179597, "grad_norm": 1463.4359130859375, "learning_rate": 0.0001125697579075646, "loss": 40.2906, "step": 12983 }, { "epoch": 34.292505777484315, "grad_norm": 520.4700317382812, "learning_rate": 0.00011253486189806189, "loss": 40.9333, "step": 12984 }, { "epoch": 34.295146913172665, "grad_norm": 974.35791015625, "learning_rate": 0.00011249996972719234, "loss": 38.3877, "step": 12985 }, { "epoch": 34.29778804886101, "grad_norm": 696.685546875, "learning_rate": 0.00011246508139593046, "loss": 38.4707, "step": 12986 }, { "epoch": 34.30042918454936, "grad_norm": 648.0191650390625, "learning_rate": 0.00011243019690525036, "loss": 37.5932, "step": 12987 }, { "epoch": 34.3030703202377, "grad_norm": 1740.66455078125, "learning_rate": 0.0001123953162561262, "loss": 34.373, "step": 12988 }, { "epoch": 34.30571145592605, "grad_norm": 752.5821533203125, "learning_rate": 0.00011236043944953191, "loss": 35.1807, "step": 12989 }, { "epoch": 34.308352591614394, "grad_norm": 494.8115234375, "learning_rate": 0.0001123255664864417, "loss": 34.9863, "step": 12990 }, { "epoch": 34.31099372730274, "grad_norm": 870.9237670898438, "learning_rate": 0.00011229069736782908, "loss": 35.7646, "step": 12991 }, { "epoch": 34.31363486299109, "grad_norm": 1101.4000244140625, "learning_rate": 0.00011225583209466772, "loss": 34.9018, "step": 12992 }, { "epoch": 34.31627599867943, "grad_norm": 741.7435302734375, "learning_rate": 0.00011222097066793147, "loss": 35.4868, "step": 12993 }, { "epoch": 34.31891713436778, "grad_norm": 1993.85693359375, "learning_rate": 0.00011218611308859364, "loss": 35.1365, "step": 12994 }, { "epoch": 34.32155827005612, "grad_norm": 1852.429443359375, "learning_rate": 0.00011215125935762765, "loss": 36.7777, "step": 12995 }, { "epoch": 34.32419940574447, "grad_norm": 1338.2518310546875, "learning_rate": 0.00011211640947600669, "loss": 24.1132, "step": 12996 }, { "epoch": 34.326840541432816, "grad_norm": 686.266845703125, "learning_rate": 0.00011208156344470407, "loss": 16.4151, "step": 12997 }, { "epoch": 34.32948167712116, "grad_norm": 445.96270751953125, "learning_rate": 0.00011204672126469279, "loss": 10.3004, "step": 12998 }, { "epoch": 34.33212281280951, "grad_norm": 1731.47412109375, "learning_rate": 0.00011201188293694577, "loss": 14.5748, "step": 12999 }, { "epoch": 34.33476394849785, "grad_norm": 2356.53955078125, "learning_rate": 0.00011197704846243586, "loss": 14.1216, "step": 13000 }, { "epoch": 34.33476394849785, "eval_loss": 3.785656452178955, "eval_runtime": 2.2691, "eval_samples_per_second": 218.146, "eval_steps_per_second": 27.323, "step": 13000 }, { "epoch": 34.3374050841862, "grad_norm": 1095.87890625, "learning_rate": 0.00011194221784213582, "loss": 11.5222, "step": 13001 }, { "epoch": 34.340046219874544, "grad_norm": 4752.513671875, "learning_rate": 0.00011190739107701822, "loss": 12.8278, "step": 13002 }, { "epoch": 34.342687355562894, "grad_norm": 695.555908203125, "learning_rate": 0.00011187256816805552, "loss": 9.3439, "step": 13003 }, { "epoch": 34.34532849125124, "grad_norm": 3240.71044921875, "learning_rate": 0.0001118377491162203, "loss": 11.3994, "step": 13004 }, { "epoch": 34.34796962693959, "grad_norm": 787.0066528320312, "learning_rate": 0.00011180293392248478, "loss": 16.3925, "step": 13005 }, { "epoch": 34.35061076262793, "grad_norm": 2192.521484375, "learning_rate": 0.00011176812258782104, "loss": 34.7951, "step": 13006 }, { "epoch": 34.35325189831627, "grad_norm": 618.2100830078125, "learning_rate": 0.0001117333151132014, "loss": 35.6055, "step": 13007 }, { "epoch": 34.35589303400462, "grad_norm": 2251.180908203125, "learning_rate": 0.00011169851149959778, "loss": 35.542, "step": 13008 }, { "epoch": 34.358534169692966, "grad_norm": 2540.67138671875, "learning_rate": 0.0001116637117479819, "loss": 35.3772, "step": 13009 }, { "epoch": 34.361175305381316, "grad_norm": 581.6477661132812, "learning_rate": 0.0001116289158593255, "loss": 33.8968, "step": 13010 }, { "epoch": 34.36381644106966, "grad_norm": 2767.517822265625, "learning_rate": 0.00011159412383460044, "loss": 32.9055, "step": 13011 }, { "epoch": 34.36645757675801, "grad_norm": 534.486572265625, "learning_rate": 0.0001115593356747782, "loss": 33.3183, "step": 13012 }, { "epoch": 34.36909871244635, "grad_norm": 925.2243041992188, "learning_rate": 0.00011152455138083003, "loss": 33.2309, "step": 13013 }, { "epoch": 34.371739848134695, "grad_norm": 948.2069702148438, "learning_rate": 0.00011148977095372756, "loss": 34.2894, "step": 13014 }, { "epoch": 34.374380983823045, "grad_norm": 1252.939697265625, "learning_rate": 0.00011145499439444188, "loss": 34.5822, "step": 13015 }, { "epoch": 34.37702211951139, "grad_norm": 1944.8590087890625, "learning_rate": 0.00011142022170394406, "loss": 36.3079, "step": 13016 }, { "epoch": 34.37966325519974, "grad_norm": 1213.408447265625, "learning_rate": 0.00011138545288320515, "loss": 37.4561, "step": 13017 }, { "epoch": 34.38230439088808, "grad_norm": 1540.173583984375, "learning_rate": 0.00011135068793319605, "loss": 33.9907, "step": 13018 }, { "epoch": 34.38494552657643, "grad_norm": 1681.4361572265625, "learning_rate": 0.00011131592685488756, "loss": 34.3818, "step": 13019 }, { "epoch": 34.38758666226477, "grad_norm": 962.8775024414062, "learning_rate": 0.00011128116964925022, "loss": 34.1364, "step": 13020 }, { "epoch": 34.390227797953116, "grad_norm": 2567.220703125, "learning_rate": 0.00011124641631725485, "loss": 34.1265, "step": 13021 }, { "epoch": 34.392868933641466, "grad_norm": 3081.71240234375, "learning_rate": 0.00011121166685987177, "loss": 35.2452, "step": 13022 }, { "epoch": 34.39551006932981, "grad_norm": 1328.6597900390625, "learning_rate": 0.00011117692127807139, "loss": 40.7972, "step": 13023 }, { "epoch": 34.39815120501816, "grad_norm": 586.0791015625, "learning_rate": 0.00011114217957282383, "loss": 39.7676, "step": 13024 }, { "epoch": 34.4007923407065, "grad_norm": 667.3784790039062, "learning_rate": 0.00011110744174509952, "loss": 38.7728, "step": 13025 }, { "epoch": 34.40343347639485, "grad_norm": 913.138916015625, "learning_rate": 0.0001110727077958682, "loss": 38.5104, "step": 13026 }, { "epoch": 34.406074612083195, "grad_norm": 492.7184753417969, "learning_rate": 0.00011103797772609983, "loss": 38.4476, "step": 13027 }, { "epoch": 34.408715747771545, "grad_norm": 543.527099609375, "learning_rate": 0.00011100325153676441, "loss": 40.0857, "step": 13028 }, { "epoch": 34.41135688345989, "grad_norm": 490.042236328125, "learning_rate": 0.00011096852922883152, "loss": 40.5024, "step": 13029 }, { "epoch": 34.41399801914823, "grad_norm": 672.0046997070312, "learning_rate": 0.00011093381080327079, "loss": 41.9409, "step": 13030 }, { "epoch": 34.41663915483658, "grad_norm": 1558.4727783203125, "learning_rate": 0.00011089909626105158, "loss": 40.7702, "step": 13031 }, { "epoch": 34.419280290524924, "grad_norm": 1569.2677001953125, "learning_rate": 0.00011086438560314352, "loss": 40.2941, "step": 13032 }, { "epoch": 34.421921426213274, "grad_norm": 464.26300048828125, "learning_rate": 0.00011082967883051573, "loss": 40.0942, "step": 13033 }, { "epoch": 34.42456256190162, "grad_norm": 1369.7130126953125, "learning_rate": 0.00011079497594413743, "loss": 39.8755, "step": 13034 }, { "epoch": 34.42720369758997, "grad_norm": 1693.7470703125, "learning_rate": 0.00011076027694497764, "loss": 37.8595, "step": 13035 }, { "epoch": 34.42984483327831, "grad_norm": 892.165283203125, "learning_rate": 0.00011072558183400533, "loss": 35.7425, "step": 13036 }, { "epoch": 34.43248596896665, "grad_norm": 556.4497680664062, "learning_rate": 0.00011069089061218934, "loss": 37.0105, "step": 13037 }, { "epoch": 34.435127104655, "grad_norm": 581.2847290039062, "learning_rate": 0.0001106562032804983, "loss": 36.3897, "step": 13038 }, { "epoch": 34.437768240343345, "grad_norm": 875.1748657226562, "learning_rate": 0.00011062151983990101, "loss": 35.3972, "step": 13039 }, { "epoch": 34.440409376031695, "grad_norm": 873.0851440429688, "learning_rate": 0.0001105868402913659, "loss": 34.3162, "step": 13040 }, { "epoch": 34.44305051172004, "grad_norm": 740.0084228515625, "learning_rate": 0.0001105521646358613, "loss": 34.9575, "step": 13041 }, { "epoch": 34.44569164740839, "grad_norm": 563.9918823242188, "learning_rate": 0.00011051749287435567, "loss": 35.5007, "step": 13042 }, { "epoch": 34.44833278309673, "grad_norm": 656.1404418945312, "learning_rate": 0.0001104828250078172, "loss": 34.8048, "step": 13043 }, { "epoch": 34.450973918785074, "grad_norm": 466.1817626953125, "learning_rate": 0.00011044816103721381, "loss": 34.7558, "step": 13044 }, { "epoch": 34.453615054473424, "grad_norm": 558.916748046875, "learning_rate": 0.00011041350096351346, "loss": 37.7354, "step": 13045 }, { "epoch": 34.45625619016177, "grad_norm": 2389.95263671875, "learning_rate": 0.00011037884478768418, "loss": 31.0788, "step": 13046 }, { "epoch": 34.45889732585012, "grad_norm": 1146.46484375, "learning_rate": 0.00011034419251069364, "loss": 15.4169, "step": 13047 }, { "epoch": 34.46153846153846, "grad_norm": 1180.920166015625, "learning_rate": 0.00011030954413350939, "loss": 12.7011, "step": 13048 }, { "epoch": 34.46417959722681, "grad_norm": 1103.552978515625, "learning_rate": 0.00011027489965709917, "loss": 12.275, "step": 13049 }, { "epoch": 34.46682073291515, "grad_norm": 1718.6226806640625, "learning_rate": 0.00011024025908243026, "loss": 11.2133, "step": 13050 }, { "epoch": 34.4694618686035, "grad_norm": 1565.9486083984375, "learning_rate": 0.00011020562241047005, "loss": 11.0668, "step": 13051 }, { "epoch": 34.472103004291846, "grad_norm": 1478.2376708984375, "learning_rate": 0.00011017098964218569, "loss": 9.6509, "step": 13052 }, { "epoch": 34.47474413998019, "grad_norm": 582.7564697265625, "learning_rate": 0.00011013636077854433, "loss": 10.3068, "step": 13053 }, { "epoch": 34.47738527566854, "grad_norm": 910.8904418945312, "learning_rate": 0.00011010173582051291, "loss": 8.6539, "step": 13054 }, { "epoch": 34.48002641135688, "grad_norm": 599.8759155273438, "learning_rate": 0.00011006711476905829, "loss": 9.5752, "step": 13055 }, { "epoch": 34.48266754704523, "grad_norm": 897.7676391601562, "learning_rate": 0.00011003249762514733, "loss": 31.4786, "step": 13056 }, { "epoch": 34.485308682733574, "grad_norm": 820.2510986328125, "learning_rate": 0.00010999788438974667, "loss": 35.7886, "step": 13057 }, { "epoch": 34.487949818421924, "grad_norm": 864.6314697265625, "learning_rate": 0.00010996327506382286, "loss": 35.7865, "step": 13058 }, { "epoch": 34.49059095411027, "grad_norm": 865.82177734375, "learning_rate": 0.00010992866964834223, "loss": 34.2802, "step": 13059 }, { "epoch": 34.49323208979861, "grad_norm": 509.14532470703125, "learning_rate": 0.0001098940681442713, "loss": 35.8964, "step": 13060 }, { "epoch": 34.49587322548696, "grad_norm": 804.40380859375, "learning_rate": 0.00010985947055257631, "loss": 34.622, "step": 13061 }, { "epoch": 34.4985143611753, "grad_norm": 655.8861694335938, "learning_rate": 0.0001098248768742231, "loss": 35.8984, "step": 13062 }, { "epoch": 34.50115549686365, "grad_norm": 803.6594848632812, "learning_rate": 0.00010979028711017796, "loss": 35.6275, "step": 13063 }, { "epoch": 34.503796632551996, "grad_norm": 476.8307189941406, "learning_rate": 0.00010975570126140669, "loss": 34.13, "step": 13064 }, { "epoch": 34.506437768240346, "grad_norm": 615.8994140625, "learning_rate": 0.00010972111932887507, "loss": 35.9067, "step": 13065 }, { "epoch": 34.50907890392869, "grad_norm": 1001.4652099609375, "learning_rate": 0.00010968654131354869, "loss": 36.296, "step": 13066 }, { "epoch": 34.51172003961703, "grad_norm": 814.657958984375, "learning_rate": 0.0001096519672163933, "loss": 34.2101, "step": 13067 }, { "epoch": 34.51436117530538, "grad_norm": 483.72003173828125, "learning_rate": 0.00010961739703837431, "loss": 34.6852, "step": 13068 }, { "epoch": 34.517002310993725, "grad_norm": 1065.2542724609375, "learning_rate": 0.00010958283078045705, "loss": 34.1493, "step": 13069 }, { "epoch": 34.519643446682075, "grad_norm": 3549.988525390625, "learning_rate": 0.00010954826844360674, "loss": 36.3698, "step": 13070 }, { "epoch": 34.52228458237042, "grad_norm": 1463.407470703125, "learning_rate": 0.00010951371002878851, "loss": 35.8209, "step": 13071 }, { "epoch": 34.52492571805877, "grad_norm": 934.5994873046875, "learning_rate": 0.0001094791555369674, "loss": 35.8323, "step": 13072 }, { "epoch": 34.52756685374711, "grad_norm": 1452.220703125, "learning_rate": 0.00010944460496910827, "loss": 39.1676, "step": 13073 }, { "epoch": 34.53020798943546, "grad_norm": 4837.71337890625, "learning_rate": 0.00010941005832617607, "loss": 39.1583, "step": 13074 }, { "epoch": 34.5328491251238, "grad_norm": 838.2659301757812, "learning_rate": 0.00010937551560913536, "loss": 40.5114, "step": 13075 }, { "epoch": 34.535490260812146, "grad_norm": 503.1991882324219, "learning_rate": 0.00010934097681895073, "loss": 38.8312, "step": 13076 }, { "epoch": 34.538131396500496, "grad_norm": 2214.60986328125, "learning_rate": 0.00010930644195658674, "loss": 40.2648, "step": 13077 }, { "epoch": 34.54077253218884, "grad_norm": 560.1991577148438, "learning_rate": 0.00010927191102300782, "loss": 43.0527, "step": 13078 }, { "epoch": 34.54341366787719, "grad_norm": 1434.1634521484375, "learning_rate": 0.00010923738401917804, "loss": 42.5105, "step": 13079 }, { "epoch": 34.54605480356553, "grad_norm": 538.0093994140625, "learning_rate": 0.00010920286094606152, "loss": 45.5901, "step": 13080 }, { "epoch": 34.54869593925388, "grad_norm": 644.8826293945312, "learning_rate": 0.00010916834180462248, "loss": 39.626, "step": 13081 }, { "epoch": 34.551337074942225, "grad_norm": 1889.8248291015625, "learning_rate": 0.00010913382659582477, "loss": 40.0206, "step": 13082 }, { "epoch": 34.55397821063057, "grad_norm": 490.47137451171875, "learning_rate": 0.0001090993153206321, "loss": 38.3915, "step": 13083 }, { "epoch": 34.55661934631892, "grad_norm": 1086.3729248046875, "learning_rate": 0.00010906480798000839, "loss": 40.1776, "step": 13084 }, { "epoch": 34.55926048200726, "grad_norm": 1276.44580078125, "learning_rate": 0.00010903030457491714, "loss": 38.8481, "step": 13085 }, { "epoch": 34.56190161769561, "grad_norm": 2308.517578125, "learning_rate": 0.00010899580510632182, "loss": 36.7322, "step": 13086 }, { "epoch": 34.564542753383954, "grad_norm": 1564.4906005859375, "learning_rate": 0.0001089613095751858, "loss": 36.2319, "step": 13087 }, { "epoch": 34.567183889072304, "grad_norm": 688.6301879882812, "learning_rate": 0.00010892681798247236, "loss": 35.7445, "step": 13088 }, { "epoch": 34.56982502476065, "grad_norm": 432.105224609375, "learning_rate": 0.00010889233032914467, "loss": 37.3512, "step": 13089 }, { "epoch": 34.572466160449, "grad_norm": 497.630859375, "learning_rate": 0.00010885784661616566, "loss": 35.6589, "step": 13090 }, { "epoch": 34.57510729613734, "grad_norm": 1029.543701171875, "learning_rate": 0.00010882336684449845, "loss": 34.573, "step": 13091 }, { "epoch": 34.57774843182568, "grad_norm": 476.2089538574219, "learning_rate": 0.0001087888910151058, "loss": 34.2077, "step": 13092 }, { "epoch": 34.58038956751403, "grad_norm": 386.5149230957031, "learning_rate": 0.00010875441912895035, "loss": 35.9523, "step": 13093 }, { "epoch": 34.583030703202375, "grad_norm": 847.7770385742188, "learning_rate": 0.00010871995118699482, "loss": 36.153, "step": 13094 }, { "epoch": 34.585671838890725, "grad_norm": 703.4468383789062, "learning_rate": 0.00010868548719020168, "loss": 37.9935, "step": 13095 }, { "epoch": 34.58831297457907, "grad_norm": 1822.3502197265625, "learning_rate": 0.00010865102713953343, "loss": 23.0495, "step": 13096 }, { "epoch": 34.59095411026742, "grad_norm": 2389.626708984375, "learning_rate": 0.000108616571035952, "loss": 13.8749, "step": 13097 }, { "epoch": 34.59359524595576, "grad_norm": 37209.8671875, "learning_rate": 0.00010858211888041983, "loss": 9.9474, "step": 13098 }, { "epoch": 34.596236381644104, "grad_norm": 9710.75, "learning_rate": 0.00010854767067389895, "loss": 10.0904, "step": 13099 }, { "epoch": 34.598877517332454, "grad_norm": 23091.185546875, "learning_rate": 0.00010851322641735117, "loss": 14.1043, "step": 13100 }, { "epoch": 34.6015186530208, "grad_norm": 2415.581298828125, "learning_rate": 0.00010847878611173853, "loss": 9.5786, "step": 13101 }, { "epoch": 34.60415978870915, "grad_norm": 1832.4151611328125, "learning_rate": 0.00010844434975802264, "loss": 17.4591, "step": 13102 }, { "epoch": 34.60680092439749, "grad_norm": 998.18017578125, "learning_rate": 0.00010840991735716515, "loss": 16.5681, "step": 13103 }, { "epoch": 34.60944206008584, "grad_norm": 3630.17236328125, "learning_rate": 0.00010837548891012752, "loss": 15.4479, "step": 13104 }, { "epoch": 34.61208319577418, "grad_norm": 413.7705383300781, "learning_rate": 0.0001083410644178712, "loss": 26.5148, "step": 13105 }, { "epoch": 34.614724331462526, "grad_norm": 750.543212890625, "learning_rate": 0.00010830664388135741, "loss": 35.4206, "step": 13106 }, { "epoch": 34.617365467150876, "grad_norm": 1375.1995849609375, "learning_rate": 0.00010827222730154729, "loss": 34.3958, "step": 13107 }, { "epoch": 34.62000660283922, "grad_norm": 538.2876586914062, "learning_rate": 0.00010823781467940202, "loss": 33.8318, "step": 13108 }, { "epoch": 34.62264773852757, "grad_norm": 1035.2642822265625, "learning_rate": 0.00010820340601588252, "loss": 34.0586, "step": 13109 }, { "epoch": 34.62528887421591, "grad_norm": 417.2077941894531, "learning_rate": 0.00010816900131194963, "loss": 34.512, "step": 13110 }, { "epoch": 34.62793000990426, "grad_norm": 498.0998229980469, "learning_rate": 0.00010813460056856395, "loss": 35.7778, "step": 13111 }, { "epoch": 34.630571145592604, "grad_norm": 551.3736572265625, "learning_rate": 0.00010810020378668631, "loss": 34.4128, "step": 13112 }, { "epoch": 34.63321228128095, "grad_norm": 657.1266479492188, "learning_rate": 0.00010806581096727722, "loss": 35.0178, "step": 13113 }, { "epoch": 34.6358534169693, "grad_norm": 4115.24462890625, "learning_rate": 0.00010803142211129677, "loss": 34.1086, "step": 13114 }, { "epoch": 34.63849455265764, "grad_norm": 804.865234375, "learning_rate": 0.00010799703721970558, "loss": 35.0598, "step": 13115 }, { "epoch": 34.64113568834599, "grad_norm": 405.7346496582031, "learning_rate": 0.00010796265629346369, "loss": 34.9979, "step": 13116 }, { "epoch": 34.64377682403433, "grad_norm": 444.9515075683594, "learning_rate": 0.00010792827933353121, "loss": 34.4781, "step": 13117 }, { "epoch": 34.64641795972268, "grad_norm": 858.63330078125, "learning_rate": 0.00010789390634086796, "loss": 34.6325, "step": 13118 }, { "epoch": 34.649059095411026, "grad_norm": 434.9767761230469, "learning_rate": 0.000107859537316434, "loss": 33.5302, "step": 13119 }, { "epoch": 34.651700231099376, "grad_norm": 680.0840454101562, "learning_rate": 0.00010782517226118896, "loss": 35.6492, "step": 13120 }, { "epoch": 34.65434136678772, "grad_norm": 548.8240966796875, "learning_rate": 0.00010779081117609247, "loss": 34.4478, "step": 13121 }, { "epoch": 34.65698250247606, "grad_norm": 1040.5439453125, "learning_rate": 0.00010775645406210405, "loss": 36.5864, "step": 13122 }, { "epoch": 34.65962363816441, "grad_norm": 1429.6727294921875, "learning_rate": 0.00010772210092018305, "loss": 38.3265, "step": 13123 }, { "epoch": 34.662264773852755, "grad_norm": 821.0809326171875, "learning_rate": 0.00010768775175128884, "loss": 39.6937, "step": 13124 }, { "epoch": 34.664905909541105, "grad_norm": 540.5706787109375, "learning_rate": 0.00010765340655638047, "loss": 41.6906, "step": 13125 }, { "epoch": 34.66754704522945, "grad_norm": 524.4441528320312, "learning_rate": 0.00010761906533641719, "loss": 38.8403, "step": 13126 }, { "epoch": 34.6701881809178, "grad_norm": 518.9348754882812, "learning_rate": 0.00010758472809235787, "loss": 39.1301, "step": 13127 }, { "epoch": 34.67282931660614, "grad_norm": 544.861328125, "learning_rate": 0.00010755039482516127, "loss": 43.0908, "step": 13128 }, { "epoch": 34.67547045229448, "grad_norm": 582.3239135742188, "learning_rate": 0.0001075160655357863, "loss": 41.1738, "step": 13129 }, { "epoch": 34.67811158798283, "grad_norm": 478.9313049316406, "learning_rate": 0.0001074817402251915, "loss": 42.1372, "step": 13130 }, { "epoch": 34.680752723671176, "grad_norm": 670.1356811523438, "learning_rate": 0.00010744741889433548, "loss": 40.117, "step": 13131 }, { "epoch": 34.683393859359526, "grad_norm": 476.8475341796875, "learning_rate": 0.00010741310154417638, "loss": 40.9383, "step": 13132 }, { "epoch": 34.68603499504787, "grad_norm": 560.28857421875, "learning_rate": 0.00010737878817567273, "loss": 37.6069, "step": 13133 }, { "epoch": 34.68867613073622, "grad_norm": 623.2772827148438, "learning_rate": 0.00010734447878978265, "loss": 37.8918, "step": 13134 }, { "epoch": 34.69131726642456, "grad_norm": 786.99853515625, "learning_rate": 0.0001073101733874641, "loss": 37.2545, "step": 13135 }, { "epoch": 34.69395840211291, "grad_norm": 535.7615356445312, "learning_rate": 0.00010727587196967522, "loss": 37.8209, "step": 13136 }, { "epoch": 34.696599537801255, "grad_norm": 626.1044921875, "learning_rate": 0.0001072415745373738, "loss": 36.4869, "step": 13137 }, { "epoch": 34.6992406734896, "grad_norm": 772.2133178710938, "learning_rate": 0.00010720728109151754, "loss": 34.7638, "step": 13138 }, { "epoch": 34.70188180917795, "grad_norm": 363.3681945800781, "learning_rate": 0.00010717299163306407, "loss": 35.8709, "step": 13139 }, { "epoch": 34.70452294486629, "grad_norm": 663.9353637695312, "learning_rate": 0.00010713870616297092, "loss": 35.5147, "step": 13140 }, { "epoch": 34.70716408055464, "grad_norm": 410.3374328613281, "learning_rate": 0.00010710442468219544, "loss": 35.2904, "step": 13141 }, { "epoch": 34.709805216242984, "grad_norm": 634.3136596679688, "learning_rate": 0.00010707014719169491, "loss": 34.8213, "step": 13142 }, { "epoch": 34.712446351931334, "grad_norm": 653.1753540039062, "learning_rate": 0.00010703587369242663, "loss": 35.373, "step": 13143 }, { "epoch": 34.71508748761968, "grad_norm": 484.88299560546875, "learning_rate": 0.0001070016041853476, "loss": 35.2843, "step": 13144 }, { "epoch": 34.71772862330802, "grad_norm": 2229.174072265625, "learning_rate": 0.00010696733867141479, "loss": 35.3298, "step": 13145 }, { "epoch": 34.72036975899637, "grad_norm": 5385.31787109375, "learning_rate": 0.0001069330771515849, "loss": 26.0564, "step": 13146 }, { "epoch": 34.72301089468471, "grad_norm": 2280.17919921875, "learning_rate": 0.00010689881962681491, "loss": 16.8231, "step": 13147 }, { "epoch": 34.72565203037306, "grad_norm": 2237.71923828125, "learning_rate": 0.00010686456609806138, "loss": 11.6448, "step": 13148 }, { "epoch": 34.728293166061405, "grad_norm": 4874.99951171875, "learning_rate": 0.00010683031656628061, "loss": 14.9729, "step": 13149 }, { "epoch": 34.730934301749755, "grad_norm": 1471.105224609375, "learning_rate": 0.00010679607103242922, "loss": 12.5833, "step": 13150 }, { "epoch": 34.7335754374381, "grad_norm": 1928.5068359375, "learning_rate": 0.00010676182949746344, "loss": 17.2007, "step": 13151 }, { "epoch": 34.73621657312644, "grad_norm": 1126.4979248046875, "learning_rate": 0.00010672759196233939, "loss": 13.2806, "step": 13152 }, { "epoch": 34.73885770881479, "grad_norm": 1578.767822265625, "learning_rate": 0.00010669335842801312, "loss": 12.5378, "step": 13153 }, { "epoch": 34.741498844503134, "grad_norm": 1903.880126953125, "learning_rate": 0.00010665912889544072, "loss": 12.858, "step": 13154 }, { "epoch": 34.744139980191484, "grad_norm": 2475.956298828125, "learning_rate": 0.00010662490336557793, "loss": 14.4385, "step": 13155 }, { "epoch": 34.74678111587983, "grad_norm": 582.084716796875, "learning_rate": 0.00010659068183938048, "loss": 35.6884, "step": 13156 }, { "epoch": 34.74942225156818, "grad_norm": 764.875, "learning_rate": 0.00010655646431780403, "loss": 34.6202, "step": 13157 }, { "epoch": 34.75206338725652, "grad_norm": 686.235107421875, "learning_rate": 0.00010652225080180403, "loss": 34.428, "step": 13158 }, { "epoch": 34.75470452294486, "grad_norm": 872.1807861328125, "learning_rate": 0.00010648804129233588, "loss": 36.5135, "step": 13159 }, { "epoch": 34.75734565863321, "grad_norm": 1868.62744140625, "learning_rate": 0.0001064538357903548, "loss": 34.8732, "step": 13160 }, { "epoch": 34.759986794321556, "grad_norm": 609.4488525390625, "learning_rate": 0.00010641963429681612, "loss": 33.4248, "step": 13161 }, { "epoch": 34.762627930009906, "grad_norm": 566.228759765625, "learning_rate": 0.0001063854368126748, "loss": 35.4864, "step": 13162 }, { "epoch": 34.76526906569825, "grad_norm": 591.0899047851562, "learning_rate": 0.00010635124333888572, "loss": 34.6276, "step": 13163 }, { "epoch": 34.7679102013866, "grad_norm": 979.3658447265625, "learning_rate": 0.00010631705387640386, "loss": 35.0849, "step": 13164 }, { "epoch": 34.77055133707494, "grad_norm": 1236.817138671875, "learning_rate": 0.00010628286842618387, "loss": 34.2476, "step": 13165 }, { "epoch": 34.77319247276329, "grad_norm": 671.98388671875, "learning_rate": 0.00010624868698918044, "loss": 34.2105, "step": 13166 }, { "epoch": 34.775833608451634, "grad_norm": 3011.448486328125, "learning_rate": 0.00010621450956634781, "loss": 34.9487, "step": 13167 }, { "epoch": 34.77847474413998, "grad_norm": 2495.0419921875, "learning_rate": 0.00010618033615864064, "loss": 36.096, "step": 13168 }, { "epoch": 34.78111587982833, "grad_norm": 499.9140930175781, "learning_rate": 0.00010614616676701308, "loss": 33.8889, "step": 13169 }, { "epoch": 34.78375701551667, "grad_norm": 2907.537353515625, "learning_rate": 0.00010611200139241922, "loss": 34.2715, "step": 13170 }, { "epoch": 34.78639815120502, "grad_norm": 467.80908203125, "learning_rate": 0.00010607784003581331, "loss": 35.6807, "step": 13171 }, { "epoch": 34.78903928689336, "grad_norm": 3494.85791015625, "learning_rate": 0.00010604368269814915, "loss": 36.1266, "step": 13172 }, { "epoch": 34.79168042258171, "grad_norm": 768.948974609375, "learning_rate": 0.00010600952938038058, "loss": 36.6076, "step": 13173 }, { "epoch": 34.794321558270056, "grad_norm": 2155.863037109375, "learning_rate": 0.00010597538008346133, "loss": 40.2947, "step": 13174 }, { "epoch": 34.7969626939584, "grad_norm": 514.9000244140625, "learning_rate": 0.00010594123480834498, "loss": 38.8938, "step": 13175 }, { "epoch": 34.79960382964675, "grad_norm": 649.52734375, "learning_rate": 0.00010590709355598502, "loss": 38.4088, "step": 13176 }, { "epoch": 34.80224496533509, "grad_norm": 942.5216064453125, "learning_rate": 0.00010587295632733474, "loss": 40.5937, "step": 13177 }, { "epoch": 34.80488610102344, "grad_norm": 407.4947204589844, "learning_rate": 0.00010583882312334758, "loss": 41.326, "step": 13178 }, { "epoch": 34.807527236711785, "grad_norm": 849.01708984375, "learning_rate": 0.0001058046939449766, "loss": 41.379, "step": 13179 }, { "epoch": 34.810168372400135, "grad_norm": 939.1093139648438, "learning_rate": 0.00010577056879317484, "loss": 41.3427, "step": 13180 }, { "epoch": 34.81280950808848, "grad_norm": 757.6052856445312, "learning_rate": 0.00010573644766889515, "loss": 41.4574, "step": 13181 }, { "epoch": 34.81545064377683, "grad_norm": 1016.1561889648438, "learning_rate": 0.00010570233057309048, "loss": 42.9435, "step": 13182 }, { "epoch": 34.81809177946517, "grad_norm": 444.8688049316406, "learning_rate": 0.00010566821750671357, "loss": 39.5325, "step": 13183 }, { "epoch": 34.82073291515351, "grad_norm": 437.7701721191406, "learning_rate": 0.0001056341084707167, "loss": 39.9431, "step": 13184 }, { "epoch": 34.82337405084186, "grad_norm": 432.09124755859375, "learning_rate": 0.00010560000346605265, "loss": 39.3967, "step": 13185 }, { "epoch": 34.826015186530206, "grad_norm": 1186.4608154296875, "learning_rate": 0.00010556590249367368, "loss": 38.741, "step": 13186 }, { "epoch": 34.828656322218556, "grad_norm": 600.1583251953125, "learning_rate": 0.00010553180555453202, "loss": 35.2421, "step": 13187 }, { "epoch": 34.8312974579069, "grad_norm": 414.65509033203125, "learning_rate": 0.00010549771264957975, "loss": 35.3032, "step": 13188 }, { "epoch": 34.83393859359525, "grad_norm": 431.9755554199219, "learning_rate": 0.00010546362377976907, "loss": 36.146, "step": 13189 }, { "epoch": 34.83657972928359, "grad_norm": 452.8738708496094, "learning_rate": 0.00010542953894605175, "loss": 37.09, "step": 13190 }, { "epoch": 34.839220864971935, "grad_norm": 360.18377685546875, "learning_rate": 0.00010539545814937967, "loss": 34.8813, "step": 13191 }, { "epoch": 34.841862000660285, "grad_norm": 455.70086669921875, "learning_rate": 0.00010536138139070444, "loss": 34.4238, "step": 13192 }, { "epoch": 34.84450313634863, "grad_norm": 726.1504516601562, "learning_rate": 0.0001053273086709777, "loss": 34.6859, "step": 13193 }, { "epoch": 34.84714427203698, "grad_norm": 972.110595703125, "learning_rate": 0.00010529323999115086, "loss": 35.1174, "step": 13194 }, { "epoch": 34.84978540772532, "grad_norm": 1388.723388671875, "learning_rate": 0.00010525917535217522, "loss": 37.4736, "step": 13195 }, { "epoch": 34.85242654341367, "grad_norm": 1165.1617431640625, "learning_rate": 0.00010522511475500218, "loss": 24.0005, "step": 13196 }, { "epoch": 34.855067679102014, "grad_norm": 815.379638671875, "learning_rate": 0.00010519105820058273, "loss": 9.9931, "step": 13197 }, { "epoch": 34.85770881479036, "grad_norm": 1186.4283447265625, "learning_rate": 0.00010515700568986786, "loss": 9.9643, "step": 13198 }, { "epoch": 34.86034995047871, "grad_norm": 1425.7349853515625, "learning_rate": 0.00010512295722380857, "loss": 11.1534, "step": 13199 }, { "epoch": 34.86299108616705, "grad_norm": 4196.9267578125, "learning_rate": 0.00010508891280335561, "loss": 11.2619, "step": 13200 }, { "epoch": 34.86299108616705, "eval_loss": 3.7640507221221924, "eval_runtime": 2.1657, "eval_samples_per_second": 228.568, "eval_steps_per_second": 28.629, "step": 13200 }, { "epoch": 34.8656322218554, "grad_norm": 587.6546020507812, "learning_rate": 0.00010505487242945974, "loss": 8.864, "step": 13201 }, { "epoch": 34.86827335754374, "grad_norm": 2986.245361328125, "learning_rate": 0.00010502083610307123, "loss": 9.8474, "step": 13202 }, { "epoch": 34.87091449323209, "grad_norm": 1125.7677001953125, "learning_rate": 0.00010498680382514078, "loss": 12.7232, "step": 13203 }, { "epoch": 34.873555628920435, "grad_norm": 6165.6748046875, "learning_rate": 0.00010495277559661868, "loss": 15.4692, "step": 13204 }, { "epoch": 34.87619676460878, "grad_norm": 911.1627807617188, "learning_rate": 0.000104918751418455, "loss": 15.4122, "step": 13205 }, { "epoch": 34.87883790029713, "grad_norm": 2129.222412109375, "learning_rate": 0.00010488473129160004, "loss": 34.8326, "step": 13206 }, { "epoch": 34.88147903598547, "grad_norm": 1425.3984375, "learning_rate": 0.00010485071521700373, "loss": 35.5388, "step": 13207 }, { "epoch": 34.88412017167382, "grad_norm": 2647.9189453125, "learning_rate": 0.00010481670319561592, "loss": 34.8378, "step": 13208 }, { "epoch": 34.886761307362164, "grad_norm": 457.349853515625, "learning_rate": 0.0001047826952283864, "loss": 35.4899, "step": 13209 }, { "epoch": 34.889402443050514, "grad_norm": 1234.3731689453125, "learning_rate": 0.00010474869131626482, "loss": 34.5135, "step": 13210 }, { "epoch": 34.89204357873886, "grad_norm": 647.5882568359375, "learning_rate": 0.00010471469146020069, "loss": 33.4902, "step": 13211 }, { "epoch": 34.89468471442721, "grad_norm": 1146.1934814453125, "learning_rate": 0.00010468069566114338, "loss": 33.4383, "step": 13212 }, { "epoch": 34.89732585011555, "grad_norm": 1057.1566162109375, "learning_rate": 0.00010464670392004236, "loss": 34.2121, "step": 13213 }, { "epoch": 34.89996698580389, "grad_norm": 1110.4656982421875, "learning_rate": 0.00010461271623784676, "loss": 34.0077, "step": 13214 }, { "epoch": 34.90260812149224, "grad_norm": 659.6460571289062, "learning_rate": 0.00010457873261550563, "loss": 35.5322, "step": 13215 }, { "epoch": 34.905249257180586, "grad_norm": 960.6448974609375, "learning_rate": 0.0001045447530539679, "loss": 35.9148, "step": 13216 }, { "epoch": 34.907890392868936, "grad_norm": 817.8479614257812, "learning_rate": 0.0001045107775541826, "loss": 34.7187, "step": 13217 }, { "epoch": 34.91053152855728, "grad_norm": 696.5391845703125, "learning_rate": 0.00010447680611709843, "loss": 34.0208, "step": 13218 }, { "epoch": 34.91317266424563, "grad_norm": 602.6220703125, "learning_rate": 0.00010444283874366378, "loss": 33.9223, "step": 13219 }, { "epoch": 34.91581379993397, "grad_norm": 983.7799682617188, "learning_rate": 0.00010440887543482746, "loss": 33.0823, "step": 13220 }, { "epoch": 34.918454935622314, "grad_norm": 1483.4918212890625, "learning_rate": 0.00010437491619153777, "loss": 35.0435, "step": 13221 }, { "epoch": 34.921096071310664, "grad_norm": 2116.022216796875, "learning_rate": 0.00010434096101474289, "loss": 37.3955, "step": 13222 }, { "epoch": 34.92373720699901, "grad_norm": 1592.4437255859375, "learning_rate": 0.00010430700990539122, "loss": 40.5285, "step": 13223 }, { "epoch": 34.92637834268736, "grad_norm": 924.6614379882812, "learning_rate": 0.0001042730628644307, "loss": 38.5724, "step": 13224 }, { "epoch": 34.9290194783757, "grad_norm": 1145.0821533203125, "learning_rate": 0.00010423911989280932, "loss": 41.5424, "step": 13225 }, { "epoch": 34.93166061406405, "grad_norm": 798.39990234375, "learning_rate": 0.00010420518099147486, "loss": 42.397, "step": 13226 }, { "epoch": 34.93430174975239, "grad_norm": 1083.147216796875, "learning_rate": 0.0001041712461613751, "loss": 42.7034, "step": 13227 }, { "epoch": 34.93694288544074, "grad_norm": 693.3704833984375, "learning_rate": 0.0001041373154034576, "loss": 39.1205, "step": 13228 }, { "epoch": 34.939584021129086, "grad_norm": 1116.029541015625, "learning_rate": 0.00010410338871866984, "loss": 38.3058, "step": 13229 }, { "epoch": 34.94222515681743, "grad_norm": 1126.208740234375, "learning_rate": 0.00010406946610795931, "loss": 37.4682, "step": 13230 }, { "epoch": 34.94486629250578, "grad_norm": 584.1022338867188, "learning_rate": 0.00010403554757227321, "loss": 37.21, "step": 13231 }, { "epoch": 34.94750742819412, "grad_norm": 1310.308349609375, "learning_rate": 0.0001040016331125587, "loss": 36.9018, "step": 13232 }, { "epoch": 34.95014856388247, "grad_norm": 707.9114379882812, "learning_rate": 0.0001039677227297627, "loss": 36.4183, "step": 13233 }, { "epoch": 34.952789699570815, "grad_norm": 407.1260070800781, "learning_rate": 0.00010393381642483241, "loss": 35.7028, "step": 13234 }, { "epoch": 34.955430835259165, "grad_norm": 926.2763671875, "learning_rate": 0.00010389991419871444, "loss": 36.1863, "step": 13235 }, { "epoch": 34.95807197094751, "grad_norm": 3669.45849609375, "learning_rate": 0.00010386601605235557, "loss": 35.9553, "step": 13236 }, { "epoch": 34.96071310663585, "grad_norm": 1624.256591796875, "learning_rate": 0.00010383212198670236, "loss": 23.9156, "step": 13237 }, { "epoch": 34.9633542423242, "grad_norm": 1307.5228271484375, "learning_rate": 0.00010379823200270128, "loss": 11.0921, "step": 13238 }, { "epoch": 34.96599537801254, "grad_norm": 3225.262451171875, "learning_rate": 0.00010376434610129867, "loss": 9.7979, "step": 13239 }, { "epoch": 34.96863651370089, "grad_norm": 3735.98828125, "learning_rate": 0.00010373046428344074, "loss": 14.5086, "step": 13240 }, { "epoch": 34.971277649389236, "grad_norm": 1153.9493408203125, "learning_rate": 0.00010369658655007372, "loss": 9.1466, "step": 13241 }, { "epoch": 34.973918785077586, "grad_norm": 796.8379516601562, "learning_rate": 0.0001036627129021436, "loss": 27.1413, "step": 13242 }, { "epoch": 34.97655992076593, "grad_norm": 595.7974853515625, "learning_rate": 0.00010362884334059624, "loss": 35.3888, "step": 13243 }, { "epoch": 34.97920105645427, "grad_norm": 1033.67333984375, "learning_rate": 0.00010359497786637748, "loss": 32.7106, "step": 13244 }, { "epoch": 34.98184219214262, "grad_norm": 1532.712158203125, "learning_rate": 0.00010356111648043293, "loss": 35.1765, "step": 13245 }, { "epoch": 34.984483327830965, "grad_norm": 716.00341796875, "learning_rate": 0.00010352725918370817, "loss": 34.645, "step": 13246 }, { "epoch": 34.987124463519315, "grad_norm": 515.14697265625, "learning_rate": 0.0001034934059771486, "loss": 34.6417, "step": 13247 }, { "epoch": 34.98976559920766, "grad_norm": 1119.8250732421875, "learning_rate": 0.00010345955686169967, "loss": 34.4978, "step": 13248 }, { "epoch": 34.99240673489601, "grad_norm": 920.0421752929688, "learning_rate": 0.00010342571183830654, "loss": 35.1345, "step": 13249 }, { "epoch": 34.99504787058435, "grad_norm": 458.37310791015625, "learning_rate": 0.00010339187090791421, "loss": 34.7294, "step": 13250 }, { "epoch": 34.997689006272694, "grad_norm": 1014.1883544921875, "learning_rate": 0.00010335803407146785, "loss": 36.0763, "step": 13251 }, { "epoch": 35.000330141961044, "grad_norm": 1432.201416015625, "learning_rate": 0.00010332420132991227, "loss": 41.0227, "step": 13252 }, { "epoch": 35.00297127764939, "grad_norm": 445.6134948730469, "learning_rate": 0.00010329037268419229, "loss": 38.8437, "step": 13253 }, { "epoch": 35.00561241333774, "grad_norm": 1258.056884765625, "learning_rate": 0.00010325654813525226, "loss": 38.3562, "step": 13254 }, { "epoch": 35.00825354902608, "grad_norm": 1574.05029296875, "learning_rate": 0.00010322272768403703, "loss": 38.2347, "step": 13255 }, { "epoch": 35.01089468471443, "grad_norm": 642.2098999023438, "learning_rate": 0.00010318891133149092, "loss": 39.9638, "step": 13256 }, { "epoch": 35.01353582040277, "grad_norm": 1082.5560302734375, "learning_rate": 0.00010315509907855811, "loss": 42.7579, "step": 13257 }, { "epoch": 35.01617695609112, "grad_norm": 1512.727294921875, "learning_rate": 0.000103121290926183, "loss": 43.2142, "step": 13258 }, { "epoch": 35.018818091779465, "grad_norm": 786.3167114257812, "learning_rate": 0.00010308748687530956, "loss": 39.5923, "step": 13259 }, { "epoch": 35.02145922746781, "grad_norm": 530.0614013671875, "learning_rate": 0.00010305368692688174, "loss": 39.6541, "step": 13260 }, { "epoch": 35.02410036315616, "grad_norm": 688.6528930664062, "learning_rate": 0.0001030198910818434, "loss": 37.4533, "step": 13261 }, { "epoch": 35.0267414988445, "grad_norm": 470.1859436035156, "learning_rate": 0.00010298609934113825, "loss": 37.5677, "step": 13262 }, { "epoch": 35.02938263453285, "grad_norm": 1150.5712890625, "learning_rate": 0.00010295231170570992, "loss": 37.8459, "step": 13263 }, { "epoch": 35.032023770221194, "grad_norm": 440.52508544921875, "learning_rate": 0.00010291852817650182, "loss": 36.9702, "step": 13264 }, { "epoch": 35.034664905909544, "grad_norm": 635.3128662109375, "learning_rate": 0.0001028847487544575, "loss": 35.1965, "step": 13265 }, { "epoch": 35.03730604159789, "grad_norm": 1189.2384033203125, "learning_rate": 0.00010285097344052016, "loss": 37.0271, "step": 13266 }, { "epoch": 35.03994717728623, "grad_norm": 910.3121337890625, "learning_rate": 0.00010281720223563296, "loss": 35.9306, "step": 13267 }, { "epoch": 35.04258831297458, "grad_norm": 589.1762084960938, "learning_rate": 0.00010278343514073881, "loss": 36.4546, "step": 13268 }, { "epoch": 35.04522944866292, "grad_norm": 587.2432250976562, "learning_rate": 0.00010274967215678085, "loss": 34.3492, "step": 13269 }, { "epoch": 35.04787058435127, "grad_norm": 245.67425537109375, "learning_rate": 0.0001027159132847018, "loss": 34.6723, "step": 13270 }, { "epoch": 35.050511720039616, "grad_norm": 659.904541015625, "learning_rate": 0.00010268215852544435, "loss": 34.6812, "step": 13271 }, { "epoch": 35.053152855727966, "grad_norm": 1093.211181640625, "learning_rate": 0.00010264840787995108, "loss": 35.7331, "step": 13272 }, { "epoch": 35.05579399141631, "grad_norm": 1357.4530029296875, "learning_rate": 0.00010261466134916447, "loss": 34.8622, "step": 13273 }, { "epoch": 35.05843512710465, "grad_norm": 7048.02099609375, "learning_rate": 0.00010258091893402688, "loss": 41.9602, "step": 13274 }, { "epoch": 35.061076262793, "grad_norm": 1421.7021484375, "learning_rate": 0.0001025471806354804, "loss": 23.4656, "step": 13275 }, { "epoch": 35.063717398481344, "grad_norm": 2765.6064453125, "learning_rate": 0.00010251344645446742, "loss": 15.6259, "step": 13276 }, { "epoch": 35.066358534169694, "grad_norm": 9709.15234375, "learning_rate": 0.00010247971639192977, "loss": 10.9963, "step": 13277 }, { "epoch": 35.06899966985804, "grad_norm": 922.6771240234375, "learning_rate": 0.00010244599044880939, "loss": 11.5189, "step": 13278 }, { "epoch": 35.07164080554639, "grad_norm": 2002.5166015625, "learning_rate": 0.00010241226862604805, "loss": 13.6966, "step": 13279 }, { "epoch": 35.07428194123473, "grad_norm": 4054.834716796875, "learning_rate": 0.00010237855092458742, "loss": 13.4936, "step": 13280 }, { "epoch": 35.07692307692308, "grad_norm": 1549.719970703125, "learning_rate": 0.00010234483734536904, "loss": 9.6357, "step": 13281 }, { "epoch": 35.07956421261142, "grad_norm": 2022.3758544921875, "learning_rate": 0.00010231112788933424, "loss": 15.6933, "step": 13282 }, { "epoch": 35.082205348299766, "grad_norm": 873.1365966796875, "learning_rate": 0.00010227742255742453, "loss": 7.8927, "step": 13283 }, { "epoch": 35.084846483988116, "grad_norm": 662.9503173828125, "learning_rate": 0.00010224372135058102, "loss": 16.4158, "step": 13284 }, { "epoch": 35.08748761967646, "grad_norm": 669.435546875, "learning_rate": 0.00010221002426974471, "loss": 35.8817, "step": 13285 }, { "epoch": 35.09012875536481, "grad_norm": 1397.6990966796875, "learning_rate": 0.00010217633131585671, "loss": 34.2277, "step": 13286 }, { "epoch": 35.09276989105315, "grad_norm": 1071.366943359375, "learning_rate": 0.00010214264248985785, "loss": 35.0586, "step": 13287 }, { "epoch": 35.0954110267415, "grad_norm": 581.6122436523438, "learning_rate": 0.00010210895779268894, "loss": 34.0864, "step": 13288 }, { "epoch": 35.098052162429845, "grad_norm": 1178.6573486328125, "learning_rate": 0.00010207527722529033, "loss": 34.643, "step": 13289 }, { "epoch": 35.10069329811819, "grad_norm": 1075.8719482421875, "learning_rate": 0.00010204160078860278, "loss": 35.1587, "step": 13290 }, { "epoch": 35.10333443380654, "grad_norm": 1779.8104248046875, "learning_rate": 0.00010200792848356663, "loss": 34.6551, "step": 13291 }, { "epoch": 35.10597556949488, "grad_norm": 739.0047607421875, "learning_rate": 0.00010197426031112203, "loss": 34.636, "step": 13292 }, { "epoch": 35.10861670518323, "grad_norm": 581.2999267578125, "learning_rate": 0.00010194059627220936, "loss": 34.8099, "step": 13293 }, { "epoch": 35.11125784087157, "grad_norm": 1438.0548095703125, "learning_rate": 0.00010190693636776857, "loss": 34.3969, "step": 13294 }, { "epoch": 35.11389897655992, "grad_norm": 323.10089111328125, "learning_rate": 0.00010187328059873958, "loss": 35.1259, "step": 13295 }, { "epoch": 35.116540112248266, "grad_norm": 861.696044921875, "learning_rate": 0.00010183962896606219, "loss": 35.2426, "step": 13296 }, { "epoch": 35.11918124793661, "grad_norm": 1028.66650390625, "learning_rate": 0.00010180598147067615, "loss": 34.154, "step": 13297 }, { "epoch": 35.12182238362496, "grad_norm": 896.09228515625, "learning_rate": 0.00010177233811352104, "loss": 35.0076, "step": 13298 }, { "epoch": 35.1244635193133, "grad_norm": 488.1494140625, "learning_rate": 0.00010173869889553622, "loss": 33.9568, "step": 13299 }, { "epoch": 35.12710465500165, "grad_norm": 1062.13134765625, "learning_rate": 0.0001017050638176612, "loss": 34.7768, "step": 13300 }, { "epoch": 35.129745790689995, "grad_norm": 470.0558776855469, "learning_rate": 0.00010167143288083519, "loss": 35.9665, "step": 13301 }, { "epoch": 35.132386926378345, "grad_norm": 846.3417358398438, "learning_rate": 0.00010163780608599727, "loss": 43.061, "step": 13302 }, { "epoch": 35.13502806206669, "grad_norm": 389.38067626953125, "learning_rate": 0.00010160418343408637, "loss": 38.5002, "step": 13303 }, { "epoch": 35.13766919775504, "grad_norm": 1046.7972412109375, "learning_rate": 0.00010157056492604158, "loss": 39.5321, "step": 13304 }, { "epoch": 35.14031033344338, "grad_norm": 1208.4117431640625, "learning_rate": 0.00010153695056280154, "loss": 38.9779, "step": 13305 }, { "epoch": 35.142951469131724, "grad_norm": 657.2825927734375, "learning_rate": 0.00010150334034530498, "loss": 41.1293, "step": 13306 }, { "epoch": 35.145592604820074, "grad_norm": 490.0907897949219, "learning_rate": 0.00010146973427449039, "loss": 42.148, "step": 13307 }, { "epoch": 35.14823374050842, "grad_norm": 415.4326477050781, "learning_rate": 0.00010143613235129623, "loss": 40.3161, "step": 13308 }, { "epoch": 35.15087487619677, "grad_norm": 528.3810424804688, "learning_rate": 0.0001014025345766608, "loss": 43.1857, "step": 13309 }, { "epoch": 35.15351601188511, "grad_norm": 736.416748046875, "learning_rate": 0.00010136894095152222, "loss": 40.5884, "step": 13310 }, { "epoch": 35.15615714757346, "grad_norm": 2400.04833984375, "learning_rate": 0.00010133535147681871, "loss": 38.6441, "step": 13311 }, { "epoch": 35.1587982832618, "grad_norm": 640.9338989257812, "learning_rate": 0.00010130176615348821, "loss": 41.5252, "step": 13312 }, { "epoch": 35.161439418950145, "grad_norm": 581.85205078125, "learning_rate": 0.00010126818498246853, "loss": 40.76, "step": 13313 }, { "epoch": 35.164080554638495, "grad_norm": 1302.3558349609375, "learning_rate": 0.00010123460796469741, "loss": 37.466, "step": 13314 }, { "epoch": 35.16672169032684, "grad_norm": 826.8015747070312, "learning_rate": 0.00010120103510111248, "loss": 37.9506, "step": 13315 }, { "epoch": 35.16936282601519, "grad_norm": 699.3489379882812, "learning_rate": 0.00010116746639265123, "loss": 34.7639, "step": 13316 }, { "epoch": 35.17200396170353, "grad_norm": 534.8659057617188, "learning_rate": 0.00010113390184025098, "loss": 35.4678, "step": 13317 }, { "epoch": 35.17464509739188, "grad_norm": 749.0558471679688, "learning_rate": 0.00010110034144484914, "loss": 34.995, "step": 13318 }, { "epoch": 35.177286233080224, "grad_norm": 1782.8582763671875, "learning_rate": 0.0001010667852073828, "loss": 34.5123, "step": 13319 }, { "epoch": 35.17992736876857, "grad_norm": 1245.3446044921875, "learning_rate": 0.00010103323312878893, "loss": 35.5681, "step": 13320 }, { "epoch": 35.18256850445692, "grad_norm": 1313.204345703125, "learning_rate": 0.00010099968521000458, "loss": 36.5149, "step": 13321 }, { "epoch": 35.18520964014526, "grad_norm": 420.1699523925781, "learning_rate": 0.00010096614145196651, "loss": 35.4523, "step": 13322 }, { "epoch": 35.18785077583361, "grad_norm": 1270.4063720703125, "learning_rate": 0.00010093260185561145, "loss": 36.6511, "step": 13323 }, { "epoch": 35.19049191152195, "grad_norm": 1343.6885986328125, "learning_rate": 0.00010089906642187577, "loss": 37.7147, "step": 13324 }, { "epoch": 35.1931330472103, "grad_norm": 1174.0255126953125, "learning_rate": 0.00010086553515169617, "loss": 10.5868, "step": 13325 }, { "epoch": 35.195774182898646, "grad_norm": 793.1741333007812, "learning_rate": 0.00010083200804600887, "loss": 16.4787, "step": 13326 }, { "epoch": 35.198415318586996, "grad_norm": 2391.3671875, "learning_rate": 0.00010079848510575004, "loss": 16.1834, "step": 13327 }, { "epoch": 35.20105645427534, "grad_norm": 2270.8359375, "learning_rate": 0.00010076496633185595, "loss": 14.2059, "step": 13328 }, { "epoch": 35.20369758996368, "grad_norm": 2104.55712890625, "learning_rate": 0.00010073145172526252, "loss": 9.3962, "step": 13329 }, { "epoch": 35.20633872565203, "grad_norm": 1409.692626953125, "learning_rate": 0.0001006979412869056, "loss": 7.6327, "step": 13330 }, { "epoch": 35.208979861340374, "grad_norm": 1832.9068603515625, "learning_rate": 0.00010066443501772097, "loss": 10.8405, "step": 13331 }, { "epoch": 35.211620997028724, "grad_norm": 4363.7646484375, "learning_rate": 0.00010063093291864428, "loss": 11.8467, "step": 13332 }, { "epoch": 35.21426213271707, "grad_norm": 1637.3084716796875, "learning_rate": 0.00010059743499061102, "loss": 12.8786, "step": 13333 }, { "epoch": 35.21690326840542, "grad_norm": 2564.8583984375, "learning_rate": 0.00010056394123455656, "loss": 27.0702, "step": 13334 }, { "epoch": 35.21954440409376, "grad_norm": 1879.4410400390625, "learning_rate": 0.00010053045165141633, "loss": 34.6649, "step": 13335 }, { "epoch": 35.2221855397821, "grad_norm": 728.94287109375, "learning_rate": 0.00010049696624212545, "loss": 34.6944, "step": 13336 }, { "epoch": 35.22482667547045, "grad_norm": 542.7716064453125, "learning_rate": 0.00010046348500761895, "loss": 36.3947, "step": 13337 }, { "epoch": 35.227467811158796, "grad_norm": 1292.4912109375, "learning_rate": 0.00010043000794883172, "loss": 32.9716, "step": 13338 }, { "epoch": 35.230108946847146, "grad_norm": 568.1554565429688, "learning_rate": 0.00010039653506669874, "loss": 35.4787, "step": 13339 }, { "epoch": 35.23275008253549, "grad_norm": 687.0631103515625, "learning_rate": 0.00010036306636215461, "loss": 34.0214, "step": 13340 }, { "epoch": 35.23539121822384, "grad_norm": 773.5691528320312, "learning_rate": 0.00010032960183613398, "loss": 34.6755, "step": 13341 }, { "epoch": 35.23803235391218, "grad_norm": 990.2635498046875, "learning_rate": 0.00010029614148957128, "loss": 35.894, "step": 13342 }, { "epoch": 35.240673489600525, "grad_norm": 1125.1368408203125, "learning_rate": 0.0001002626853234009, "loss": 34.2531, "step": 13343 }, { "epoch": 35.243314625288875, "grad_norm": 710.55419921875, "learning_rate": 0.00010022923333855707, "loss": 35.0049, "step": 13344 }, { "epoch": 35.24595576097722, "grad_norm": 2122.084228515625, "learning_rate": 0.00010019578553597383, "loss": 35.127, "step": 13345 }, { "epoch": 35.24859689666557, "grad_norm": 1011.8890380859375, "learning_rate": 0.00010016234191658536, "loss": 34.7338, "step": 13346 }, { "epoch": 35.25123803235391, "grad_norm": 2019.3106689453125, "learning_rate": 0.00010012890248132547, "loss": 33.983, "step": 13347 }, { "epoch": 35.25387916804226, "grad_norm": 2359.056640625, "learning_rate": 0.00010009546723112794, "loss": 35.1149, "step": 13348 }, { "epoch": 35.2565203037306, "grad_norm": 1063.5308837890625, "learning_rate": 0.00010006203616692645, "loss": 33.4615, "step": 13349 }, { "epoch": 35.25916143941895, "grad_norm": 1202.400634765625, "learning_rate": 0.00010002860928965452, "loss": 35.8565, "step": 13350 }, { "epoch": 35.261802575107296, "grad_norm": 2739.922119140625, "learning_rate": 9.999518660024553e-05, "loss": 37.2014, "step": 13351 }, { "epoch": 35.26444371079564, "grad_norm": 644.1912231445312, "learning_rate": 9.996176809963281e-05, "loss": 39.9455, "step": 13352 }, { "epoch": 35.26708484648399, "grad_norm": 500.8750915527344, "learning_rate": 9.992835378874962e-05, "loss": 38.5963, "step": 13353 }, { "epoch": 35.26972598217233, "grad_norm": 575.7830810546875, "learning_rate": 9.989494366852902e-05, "loss": 38.5261, "step": 13354 }, { "epoch": 35.27236711786068, "grad_norm": 808.1809692382812, "learning_rate": 9.986153773990386e-05, "loss": 38.1848, "step": 13355 }, { "epoch": 35.275008253549025, "grad_norm": 1186.90673828125, "learning_rate": 9.982813600380714e-05, "loss": 39.7214, "step": 13356 }, { "epoch": 35.277649389237375, "grad_norm": 1340.60693359375, "learning_rate": 9.979473846117149e-05, "loss": 40.2611, "step": 13357 }, { "epoch": 35.28029052492572, "grad_norm": 861.7920532226562, "learning_rate": 9.976134511292956e-05, "loss": 41.4101, "step": 13358 }, { "epoch": 35.28293166061406, "grad_norm": 1020.760009765625, "learning_rate": 9.97279559600138e-05, "loss": 40.1036, "step": 13359 }, { "epoch": 35.28557279630241, "grad_norm": 748.34912109375, "learning_rate": 9.969457100335657e-05, "loss": 42.2914, "step": 13360 }, { "epoch": 35.288213931990754, "grad_norm": 1510.1337890625, "learning_rate": 9.96611902438902e-05, "loss": 39.1026, "step": 13361 }, { "epoch": 35.290855067679104, "grad_norm": 649.71728515625, "learning_rate": 9.962781368254665e-05, "loss": 38.1825, "step": 13362 }, { "epoch": 35.29349620336745, "grad_norm": 492.8766784667969, "learning_rate": 9.959444132025819e-05, "loss": 37.8875, "step": 13363 }, { "epoch": 35.2961373390558, "grad_norm": 1051.8966064453125, "learning_rate": 9.956107315795657e-05, "loss": 37.6383, "step": 13364 }, { "epoch": 35.29877847474414, "grad_norm": 534.7699584960938, "learning_rate": 9.952770919657364e-05, "loss": 38.6834, "step": 13365 }, { "epoch": 35.30141961043249, "grad_norm": 752.9635009765625, "learning_rate": 9.949434943704102e-05, "loss": 37.2906, "step": 13366 }, { "epoch": 35.30406074612083, "grad_norm": 688.0452880859375, "learning_rate": 9.946099388029028e-05, "loss": 37.3013, "step": 13367 }, { "epoch": 35.306701881809175, "grad_norm": 487.4883117675781, "learning_rate": 9.942764252725286e-05, "loss": 35.8147, "step": 13368 }, { "epoch": 35.309343017497525, "grad_norm": 727.61376953125, "learning_rate": 9.939429537885999e-05, "loss": 34.4642, "step": 13369 }, { "epoch": 35.31198415318587, "grad_norm": 609.281005859375, "learning_rate": 9.936095243604301e-05, "loss": 34.6182, "step": 13370 }, { "epoch": 35.31462528887422, "grad_norm": 479.4808654785156, "learning_rate": 9.932761369973298e-05, "loss": 35.1573, "step": 13371 }, { "epoch": 35.31726642456256, "grad_norm": 668.9208984375, "learning_rate": 9.929427917086073e-05, "loss": 34.3092, "step": 13372 }, { "epoch": 35.31990756025091, "grad_norm": 641.64208984375, "learning_rate": 9.926094885035728e-05, "loss": 35.2854, "step": 13373 }, { "epoch": 35.322548695939254, "grad_norm": 844.676513671875, "learning_rate": 9.922762273915328e-05, "loss": 33.9869, "step": 13374 }, { "epoch": 35.3251898316276, "grad_norm": 880.5382080078125, "learning_rate": 9.919430083817937e-05, "loss": 10.1114, "step": 13375 }, { "epoch": 35.32783096731595, "grad_norm": 17249.724609375, "learning_rate": 9.9160983148366e-05, "loss": 13.7741, "step": 13376 }, { "epoch": 35.33047210300429, "grad_norm": 6022.8154296875, "learning_rate": 9.912766967064357e-05, "loss": 19.4225, "step": 13377 }, { "epoch": 35.33311323869264, "grad_norm": 719.08251953125, "learning_rate": 9.909436040594235e-05, "loss": 12.7043, "step": 13378 }, { "epoch": 35.33575437438098, "grad_norm": 633.5109252929688, "learning_rate": 9.906105535519239e-05, "loss": 8.4012, "step": 13379 }, { "epoch": 35.33839551006933, "grad_norm": 1020.7821044921875, "learning_rate": 9.902775451932386e-05, "loss": 12.0572, "step": 13380 }, { "epoch": 35.341036645757676, "grad_norm": 981.6080932617188, "learning_rate": 9.899445789926662e-05, "loss": 11.8658, "step": 13381 }, { "epoch": 35.34367778144602, "grad_norm": 4891.76171875, "learning_rate": 9.896116549595042e-05, "loss": 12.8192, "step": 13382 }, { "epoch": 35.34631891713437, "grad_norm": 1355.22705078125, "learning_rate": 9.892787731030495e-05, "loss": 19.9468, "step": 13383 }, { "epoch": 35.34896005282271, "grad_norm": 458.1482238769531, "learning_rate": 9.889459334325976e-05, "loss": 35.372, "step": 13384 }, { "epoch": 35.35160118851106, "grad_norm": 458.5079040527344, "learning_rate": 9.886131359574429e-05, "loss": 36.3013, "step": 13385 }, { "epoch": 35.354242324199404, "grad_norm": 1101.0712890625, "learning_rate": 9.882803806868777e-05, "loss": 34.4608, "step": 13386 }, { "epoch": 35.356883459887754, "grad_norm": 618.7556762695312, "learning_rate": 9.879476676301955e-05, "loss": 34.8623, "step": 13387 }, { "epoch": 35.3595245955761, "grad_norm": 1573.5196533203125, "learning_rate": 9.876149967966865e-05, "loss": 34.405, "step": 13388 }, { "epoch": 35.36216573126444, "grad_norm": 858.171142578125, "learning_rate": 9.872823681956405e-05, "loss": 34.6035, "step": 13389 }, { "epoch": 35.36480686695279, "grad_norm": 749.7352905273438, "learning_rate": 9.869497818363449e-05, "loss": 35.1083, "step": 13390 }, { "epoch": 35.36744800264113, "grad_norm": 682.3179321289062, "learning_rate": 9.866172377280885e-05, "loss": 33.6984, "step": 13391 }, { "epoch": 35.37008913832948, "grad_norm": 1019.3212280273438, "learning_rate": 9.862847358801569e-05, "loss": 33.9938, "step": 13392 }, { "epoch": 35.372730274017826, "grad_norm": 595.1904907226562, "learning_rate": 9.859522763018347e-05, "loss": 34.1803, "step": 13393 }, { "epoch": 35.375371409706176, "grad_norm": 733.2791748046875, "learning_rate": 9.856198590024059e-05, "loss": 35.6607, "step": 13394 }, { "epoch": 35.37801254539452, "grad_norm": 955.7676391601562, "learning_rate": 9.852874839911529e-05, "loss": 36.2759, "step": 13395 }, { "epoch": 35.38065368108287, "grad_norm": 650.7474365234375, "learning_rate": 9.849551512773571e-05, "loss": 34.4672, "step": 13396 }, { "epoch": 35.38329481677121, "grad_norm": 656.7113647460938, "learning_rate": 9.846228608702978e-05, "loss": 34.6856, "step": 13397 }, { "epoch": 35.385935952459555, "grad_norm": 1218.7652587890625, "learning_rate": 9.842906127792561e-05, "loss": 33.8046, "step": 13398 }, { "epoch": 35.388577088147905, "grad_norm": 558.53955078125, "learning_rate": 9.839584070135083e-05, "loss": 34.6924, "step": 13399 }, { "epoch": 35.39121822383625, "grad_norm": 1031.533447265625, "learning_rate": 9.836262435823316e-05, "loss": 37.2205, "step": 13400 }, { "epoch": 35.39121822383625, "eval_loss": 3.834505319595337, "eval_runtime": 2.2476, "eval_samples_per_second": 220.238, "eval_steps_per_second": 27.585, "step": 13400 }, { "epoch": 35.3938593595246, "grad_norm": 3197.629638671875, "learning_rate": 9.832941224950012e-05, "loss": 36.9658, "step": 13401 }, { "epoch": 35.39650049521294, "grad_norm": 1009.0632934570312, "learning_rate": 9.829620437607914e-05, "loss": 39.9458, "step": 13402 }, { "epoch": 35.39914163090129, "grad_norm": 450.6802062988281, "learning_rate": 9.826300073889754e-05, "loss": 38.5742, "step": 13403 }, { "epoch": 35.40178276658963, "grad_norm": 1562.3946533203125, "learning_rate": 9.822980133888245e-05, "loss": 39.1867, "step": 13404 }, { "epoch": 35.404423902277976, "grad_norm": 2182.99853515625, "learning_rate": 9.819660617696108e-05, "loss": 40.0077, "step": 13405 }, { "epoch": 35.407065037966326, "grad_norm": 1763.998046875, "learning_rate": 9.816341525406028e-05, "loss": 40.7468, "step": 13406 }, { "epoch": 35.40970617365467, "grad_norm": 971.7041015625, "learning_rate": 9.813022857110685e-05, "loss": 42.6138, "step": 13407 }, { "epoch": 35.41234730934302, "grad_norm": 1082.1693115234375, "learning_rate": 9.809704612902765e-05, "loss": 42.3989, "step": 13408 }, { "epoch": 35.41498844503136, "grad_norm": 426.9930114746094, "learning_rate": 9.806386792874922e-05, "loss": 38.6945, "step": 13409 }, { "epoch": 35.41762958071971, "grad_norm": 1320.5284423828125, "learning_rate": 9.803069397119802e-05, "loss": 41.7125, "step": 13410 }, { "epoch": 35.420270716408055, "grad_norm": 883.3333740234375, "learning_rate": 9.79975242573004e-05, "loss": 39.2727, "step": 13411 }, { "epoch": 35.422911852096405, "grad_norm": 739.9631958007812, "learning_rate": 9.796435878798265e-05, "loss": 38.8822, "step": 13412 }, { "epoch": 35.42555298778475, "grad_norm": 804.5013427734375, "learning_rate": 9.793119756417086e-05, "loss": 37.8082, "step": 13413 }, { "epoch": 35.42819412347309, "grad_norm": 696.9165649414062, "learning_rate": 9.789804058679094e-05, "loss": 37.9023, "step": 13414 }, { "epoch": 35.43083525916144, "grad_norm": 556.7669677734375, "learning_rate": 9.7864887856769e-05, "loss": 36.6145, "step": 13415 }, { "epoch": 35.433476394849784, "grad_norm": 568.1094970703125, "learning_rate": 9.78317393750307e-05, "loss": 35.5201, "step": 13416 }, { "epoch": 35.436117530538134, "grad_norm": 966.7631225585938, "learning_rate": 9.77985951425017e-05, "loss": 35.4908, "step": 13417 }, { "epoch": 35.43875866622648, "grad_norm": 424.8535461425781, "learning_rate": 9.776545516010749e-05, "loss": 35.1721, "step": 13418 }, { "epoch": 35.44139980191483, "grad_norm": 787.9824829101562, "learning_rate": 9.773231942877353e-05, "loss": 34.9251, "step": 13419 }, { "epoch": 35.44404093760317, "grad_norm": 642.171875, "learning_rate": 9.769918794942512e-05, "loss": 35.2892, "step": 13420 }, { "epoch": 35.44668207329151, "grad_norm": 1356.3291015625, "learning_rate": 9.76660607229873e-05, "loss": 33.5877, "step": 13421 }, { "epoch": 35.44932320897986, "grad_norm": 1019.9976806640625, "learning_rate": 9.763293775038538e-05, "loss": 34.662, "step": 13422 }, { "epoch": 35.451964344668205, "grad_norm": 1185.0692138671875, "learning_rate": 9.759981903254411e-05, "loss": 44.0773, "step": 13423 }, { "epoch": 35.454605480356555, "grad_norm": 1964.70361328125, "learning_rate": 9.756670457038841e-05, "loss": 12.4117, "step": 13424 }, { "epoch": 35.4572466160449, "grad_norm": 1377.1285400390625, "learning_rate": 9.753359436484285e-05, "loss": 10.7793, "step": 13425 }, { "epoch": 35.45988775173325, "grad_norm": 657.0579833984375, "learning_rate": 9.750048841683218e-05, "loss": 10.0587, "step": 13426 }, { "epoch": 35.46252888742159, "grad_norm": 668.0640869140625, "learning_rate": 9.746738672728078e-05, "loss": 10.7025, "step": 13427 }, { "epoch": 35.465170023109934, "grad_norm": 1435.303466796875, "learning_rate": 9.743428929711304e-05, "loss": 9.2513, "step": 13428 }, { "epoch": 35.467811158798284, "grad_norm": 3082.576416015625, "learning_rate": 9.740119612725312e-05, "loss": 9.4258, "step": 13429 }, { "epoch": 35.47045229448663, "grad_norm": 1634.778076171875, "learning_rate": 9.736810721862515e-05, "loss": 14.4698, "step": 13430 }, { "epoch": 35.47309343017498, "grad_norm": 510.395751953125, "learning_rate": 9.733502257215315e-05, "loss": 11.3112, "step": 13431 }, { "epoch": 35.47573456586332, "grad_norm": 3420.81689453125, "learning_rate": 9.730194218876087e-05, "loss": 12.7945, "step": 13432 }, { "epoch": 35.47837570155167, "grad_norm": 468.2934875488281, "learning_rate": 9.726886606937224e-05, "loss": 15.9158, "step": 13433 }, { "epoch": 35.48101683724001, "grad_norm": 811.0436401367188, "learning_rate": 9.72357942149108e-05, "loss": 35.122, "step": 13434 }, { "epoch": 35.483657972928356, "grad_norm": 1022.581787109375, "learning_rate": 9.720272662630006e-05, "loss": 36.7062, "step": 13435 }, { "epoch": 35.486299108616706, "grad_norm": 410.77935791015625, "learning_rate": 9.716966330446344e-05, "loss": 34.7627, "step": 13436 }, { "epoch": 35.48894024430505, "grad_norm": 1840.106201171875, "learning_rate": 9.713660425032417e-05, "loss": 36.1398, "step": 13437 }, { "epoch": 35.4915813799934, "grad_norm": 1489.8829345703125, "learning_rate": 9.710354946480545e-05, "loss": 33.9002, "step": 13438 }, { "epoch": 35.49422251568174, "grad_norm": 569.4260864257812, "learning_rate": 9.707049894883021e-05, "loss": 35.4251, "step": 13439 }, { "epoch": 35.49686365137009, "grad_norm": 1290.03369140625, "learning_rate": 9.703745270332154e-05, "loss": 34.4794, "step": 13440 }, { "epoch": 35.499504787058434, "grad_norm": 770.2176513671875, "learning_rate": 9.700441072920214e-05, "loss": 34.6183, "step": 13441 }, { "epoch": 35.502145922746784, "grad_norm": 942.8291625976562, "learning_rate": 9.69713730273946e-05, "loss": 34.884, "step": 13442 }, { "epoch": 35.50478705843513, "grad_norm": 2321.259765625, "learning_rate": 9.693833959882168e-05, "loss": 34.0738, "step": 13443 }, { "epoch": 35.50742819412347, "grad_norm": 1910.6085205078125, "learning_rate": 9.69053104444057e-05, "loss": 34.6734, "step": 13444 }, { "epoch": 35.51006932981182, "grad_norm": 828.817626953125, "learning_rate": 9.687228556506899e-05, "loss": 36.7805, "step": 13445 }, { "epoch": 35.51271046550016, "grad_norm": 883.20654296875, "learning_rate": 9.683926496173376e-05, "loss": 34.0812, "step": 13446 }, { "epoch": 35.51535160118851, "grad_norm": 1391.6539306640625, "learning_rate": 9.680624863532208e-05, "loss": 33.8606, "step": 13447 }, { "epoch": 35.517992736876856, "grad_norm": 760.802734375, "learning_rate": 9.677323658675594e-05, "loss": 34.0902, "step": 13448 }, { "epoch": 35.520633872565206, "grad_norm": 505.7845764160156, "learning_rate": 9.674022881695704e-05, "loss": 33.9906, "step": 13449 }, { "epoch": 35.52327500825355, "grad_norm": 1205.832275390625, "learning_rate": 9.67072253268473e-05, "loss": 35.9846, "step": 13450 }, { "epoch": 35.52591614394189, "grad_norm": 1167.583984375, "learning_rate": 9.667422611734828e-05, "loss": 37.0918, "step": 13451 }, { "epoch": 35.52855727963024, "grad_norm": 766.8516845703125, "learning_rate": 9.664123118938143e-05, "loss": 42.3686, "step": 13452 }, { "epoch": 35.531198415318585, "grad_norm": 521.78515625, "learning_rate": 9.660824054386808e-05, "loss": 38.3271, "step": 13453 }, { "epoch": 35.533839551006935, "grad_norm": 570.1787109375, "learning_rate": 9.657525418172953e-05, "loss": 39.5761, "step": 13454 }, { "epoch": 35.53648068669528, "grad_norm": 1000.5079956054688, "learning_rate": 9.654227210388685e-05, "loss": 39.0928, "step": 13455 }, { "epoch": 35.53912182238363, "grad_norm": 521.6411743164062, "learning_rate": 9.650929431126101e-05, "loss": 40.3727, "step": 13456 }, { "epoch": 35.54176295807197, "grad_norm": 899.7103271484375, "learning_rate": 9.647632080477303e-05, "loss": 40.0831, "step": 13457 }, { "epoch": 35.54440409376032, "grad_norm": 774.7179565429688, "learning_rate": 9.644335158534364e-05, "loss": 41.1001, "step": 13458 }, { "epoch": 35.54704522944866, "grad_norm": 1242.366943359375, "learning_rate": 9.641038665389343e-05, "loss": 42.2108, "step": 13459 }, { "epoch": 35.549686365137006, "grad_norm": 829.1432495117188, "learning_rate": 9.637742601134286e-05, "loss": 39.0151, "step": 13460 }, { "epoch": 35.552327500825356, "grad_norm": 786.607421875, "learning_rate": 9.634446965861252e-05, "loss": 40.1724, "step": 13461 }, { "epoch": 35.5549686365137, "grad_norm": 965.9329833984375, "learning_rate": 9.63115175966226e-05, "loss": 39.5324, "step": 13462 }, { "epoch": 35.55760977220205, "grad_norm": 1411.57568359375, "learning_rate": 9.627856982629329e-05, "loss": 37.471, "step": 13463 }, { "epoch": 35.56025090789039, "grad_norm": 3089.8154296875, "learning_rate": 9.624562634854459e-05, "loss": 37.8863, "step": 13464 }, { "epoch": 35.56289204357874, "grad_norm": 986.3995971679688, "learning_rate": 9.621268716429646e-05, "loss": 37.3417, "step": 13465 }, { "epoch": 35.565533179267085, "grad_norm": 950.383056640625, "learning_rate": 9.617975227446871e-05, "loss": 34.7267, "step": 13466 }, { "epoch": 35.56817431495543, "grad_norm": 948.5822143554688, "learning_rate": 9.614682167998093e-05, "loss": 35.8297, "step": 13467 }, { "epoch": 35.57081545064378, "grad_norm": 1161.866943359375, "learning_rate": 9.611389538175286e-05, "loss": 36.5159, "step": 13468 }, { "epoch": 35.57345658633212, "grad_norm": 694.2733764648438, "learning_rate": 9.608097338070384e-05, "loss": 34.8884, "step": 13469 }, { "epoch": 35.57609772202047, "grad_norm": 1538.6378173828125, "learning_rate": 9.604805567775326e-05, "loss": 36.2099, "step": 13470 }, { "epoch": 35.578738857708814, "grad_norm": 975.3953857421875, "learning_rate": 9.601514227382027e-05, "loss": 35.3025, "step": 13471 }, { "epoch": 35.581379993397164, "grad_norm": 894.3604125976562, "learning_rate": 9.598223316982399e-05, "loss": 36.6609, "step": 13472 }, { "epoch": 35.58402112908551, "grad_norm": 1860.69091796875, "learning_rate": 9.594932836668336e-05, "loss": 34.9155, "step": 13473 }, { "epoch": 35.58666226477385, "grad_norm": 1317.7412109375, "learning_rate": 9.591642786531715e-05, "loss": 40.3763, "step": 13474 }, { "epoch": 35.5893034004622, "grad_norm": 66300.9140625, "learning_rate": 9.588353166664427e-05, "loss": 17.6095, "step": 13475 }, { "epoch": 35.59194453615054, "grad_norm": 1075.678955078125, "learning_rate": 9.585063977158323e-05, "loss": 12.5895, "step": 13476 }, { "epoch": 35.59458567183889, "grad_norm": 1536.9742431640625, "learning_rate": 9.581775218105246e-05, "loss": 10.3582, "step": 13477 }, { "epoch": 35.597226807527235, "grad_norm": 810.9127807617188, "learning_rate": 9.578486889597046e-05, "loss": 9.4839, "step": 13478 }, { "epoch": 35.599867943215585, "grad_norm": 624.0133056640625, "learning_rate": 9.57519899172554e-05, "loss": 7.8804, "step": 13479 }, { "epoch": 35.60250907890393, "grad_norm": 1312.7894287109375, "learning_rate": 9.571911524582541e-05, "loss": 13.6638, "step": 13480 }, { "epoch": 35.60515021459227, "grad_norm": 2240.960205078125, "learning_rate": 9.56862448825985e-05, "loss": 12.5291, "step": 13481 }, { "epoch": 35.60779135028062, "grad_norm": 727.4493408203125, "learning_rate": 9.565337882849257e-05, "loss": 8.0059, "step": 13482 }, { "epoch": 35.610432485968964, "grad_norm": 2083.81201171875, "learning_rate": 9.562051708442535e-05, "loss": 12.5613, "step": 13483 }, { "epoch": 35.613073621657314, "grad_norm": 627.2017822265625, "learning_rate": 9.558765965131441e-05, "loss": 11.5657, "step": 13484 }, { "epoch": 35.61571475734566, "grad_norm": 580.2097778320312, "learning_rate": 9.555480653007748e-05, "loss": 35.4706, "step": 13485 }, { "epoch": 35.61835589303401, "grad_norm": 1196.9759521484375, "learning_rate": 9.552195772163183e-05, "loss": 34.9149, "step": 13486 }, { "epoch": 35.62099702872235, "grad_norm": 1405.2752685546875, "learning_rate": 9.548911322689469e-05, "loss": 34.837, "step": 13487 }, { "epoch": 35.6236381644107, "grad_norm": 2490.408935546875, "learning_rate": 9.545627304678345e-05, "loss": 36.9542, "step": 13488 }, { "epoch": 35.62627930009904, "grad_norm": 520.31640625, "learning_rate": 9.542343718221493e-05, "loss": 34.9244, "step": 13489 }, { "epoch": 35.628920435787386, "grad_norm": 1073.967041015625, "learning_rate": 9.539060563410615e-05, "loss": 35.509, "step": 13490 }, { "epoch": 35.631561571475736, "grad_norm": 1190.544189453125, "learning_rate": 9.535777840337376e-05, "loss": 34.2146, "step": 13491 }, { "epoch": 35.63420270716408, "grad_norm": 739.5152587890625, "learning_rate": 9.532495549093468e-05, "loss": 33.7343, "step": 13492 }, { "epoch": 35.63684384285243, "grad_norm": 864.9472045898438, "learning_rate": 9.529213689770532e-05, "loss": 34.3531, "step": 13493 }, { "epoch": 35.63948497854077, "grad_norm": 512.2130737304688, "learning_rate": 9.52593226246021e-05, "loss": 34.9303, "step": 13494 }, { "epoch": 35.64212611422912, "grad_norm": 1328.7669677734375, "learning_rate": 9.522651267254148e-05, "loss": 34.4855, "step": 13495 }, { "epoch": 35.644767249917464, "grad_norm": 1313.404296875, "learning_rate": 9.519370704243956e-05, "loss": 34.9912, "step": 13496 }, { "epoch": 35.64740838560581, "grad_norm": 550.6650390625, "learning_rate": 9.516090573521246e-05, "loss": 34.6535, "step": 13497 }, { "epoch": 35.65004952129416, "grad_norm": 658.1213989257812, "learning_rate": 9.512810875177607e-05, "loss": 34.7136, "step": 13498 }, { "epoch": 35.6526906569825, "grad_norm": 608.8890380859375, "learning_rate": 9.50953160930463e-05, "loss": 34.7814, "step": 13499 }, { "epoch": 35.65533179267085, "grad_norm": 2448.495361328125, "learning_rate": 9.506252775993881e-05, "loss": 36.0541, "step": 13500 }, { "epoch": 35.65797292835919, "grad_norm": 1068.6932373046875, "learning_rate": 9.502974375336917e-05, "loss": 35.7011, "step": 13501 }, { "epoch": 35.66061406404754, "grad_norm": 1838.967041015625, "learning_rate": 9.499696407425296e-05, "loss": 39.7702, "step": 13502 }, { "epoch": 35.663255199735886, "grad_norm": 744.1956176757812, "learning_rate": 9.496418872350546e-05, "loss": 38.9478, "step": 13503 }, { "epoch": 35.665896335424236, "grad_norm": 885.576904296875, "learning_rate": 9.493141770204194e-05, "loss": 39.5702, "step": 13504 }, { "epoch": 35.66853747111258, "grad_norm": 734.9451293945312, "learning_rate": 9.48986510107775e-05, "loss": 39.6684, "step": 13505 }, { "epoch": 35.67117860680092, "grad_norm": 1908.05615234375, "learning_rate": 9.486588865062709e-05, "loss": 39.1145, "step": 13506 }, { "epoch": 35.67381974248927, "grad_norm": 844.4484252929688, "learning_rate": 9.483313062250561e-05, "loss": 43.5307, "step": 13507 }, { "epoch": 35.676460878177615, "grad_norm": 723.1057739257812, "learning_rate": 9.480037692732774e-05, "loss": 42.7593, "step": 13508 }, { "epoch": 35.679102013865965, "grad_norm": 607.3244018554688, "learning_rate": 9.476762756600823e-05, "loss": 42.651, "step": 13509 }, { "epoch": 35.68174314955431, "grad_norm": 923.9342651367188, "learning_rate": 9.473488253946156e-05, "loss": 41.7551, "step": 13510 }, { "epoch": 35.68438428524266, "grad_norm": 1217.0736083984375, "learning_rate": 9.470214184860207e-05, "loss": 40.5015, "step": 13511 }, { "epoch": 35.687025420931, "grad_norm": 5893.9873046875, "learning_rate": 9.466940549434396e-05, "loss": 39.6387, "step": 13512 }, { "epoch": 35.68966655661934, "grad_norm": 878.3471069335938, "learning_rate": 9.463667347760155e-05, "loss": 39.7, "step": 13513 }, { "epoch": 35.69230769230769, "grad_norm": 1326.8612060546875, "learning_rate": 9.460394579928877e-05, "loss": 37.9596, "step": 13514 }, { "epoch": 35.694948827996036, "grad_norm": 555.3880004882812, "learning_rate": 9.457122246031952e-05, "loss": 38.7328, "step": 13515 }, { "epoch": 35.697589963684386, "grad_norm": 1285.7171630859375, "learning_rate": 9.453850346160758e-05, "loss": 36.3836, "step": 13516 }, { "epoch": 35.70023109937273, "grad_norm": 13229.08203125, "learning_rate": 9.45057888040666e-05, "loss": 36.6936, "step": 13517 }, { "epoch": 35.70287223506108, "grad_norm": 443.4270324707031, "learning_rate": 9.447307848861012e-05, "loss": 35.6014, "step": 13518 }, { "epoch": 35.70551337074942, "grad_norm": 1022.5878295898438, "learning_rate": 9.444037251615148e-05, "loss": 36.2598, "step": 13519 }, { "epoch": 35.708154506437765, "grad_norm": 1307.5438232421875, "learning_rate": 9.440767088760416e-05, "loss": 35.3067, "step": 13520 }, { "epoch": 35.710795642126115, "grad_norm": 524.9102172851562, "learning_rate": 9.437497360388119e-05, "loss": 35.1135, "step": 13521 }, { "epoch": 35.71343677781446, "grad_norm": 2059.61376953125, "learning_rate": 9.434228066589562e-05, "loss": 34.608, "step": 13522 }, { "epoch": 35.71607791350281, "grad_norm": 1395.0357666015625, "learning_rate": 9.430959207456058e-05, "loss": 35.2215, "step": 13523 }, { "epoch": 35.71871904919115, "grad_norm": 879.4143676757812, "learning_rate": 9.427690783078863e-05, "loss": 36.0979, "step": 13524 }, { "epoch": 35.7213601848795, "grad_norm": 2065.95068359375, "learning_rate": 9.424422793549256e-05, "loss": 43.629, "step": 13525 }, { "epoch": 35.724001320567844, "grad_norm": 4736.93310546875, "learning_rate": 9.421155238958485e-05, "loss": 13.3711, "step": 13526 }, { "epoch": 35.72664245625619, "grad_norm": 7278.244140625, "learning_rate": 9.41788811939781e-05, "loss": 10.047, "step": 13527 }, { "epoch": 35.72928359194454, "grad_norm": 8857.6240234375, "learning_rate": 9.414621434958457e-05, "loss": 14.4302, "step": 13528 }, { "epoch": 35.73192472763288, "grad_norm": 625.6866455078125, "learning_rate": 9.411355185731635e-05, "loss": 10.4804, "step": 13529 }, { "epoch": 35.73456586332123, "grad_norm": 3137.901123046875, "learning_rate": 9.40808937180857e-05, "loss": 11.9271, "step": 13530 }, { "epoch": 35.73720699900957, "grad_norm": 771.6651611328125, "learning_rate": 9.40482399328045e-05, "loss": 13.6901, "step": 13531 }, { "epoch": 35.73984813469792, "grad_norm": 1223.6651611328125, "learning_rate": 9.401559050238462e-05, "loss": 9.2523, "step": 13532 }, { "epoch": 35.742489270386265, "grad_norm": 624.316650390625, "learning_rate": 9.398294542773772e-05, "loss": 12.6211, "step": 13533 }, { "epoch": 35.745130406074615, "grad_norm": 618.6904907226562, "learning_rate": 9.395030470977544e-05, "loss": 13.073, "step": 13534 }, { "epoch": 35.74777154176296, "grad_norm": 2434.5986328125, "learning_rate": 9.391766834940921e-05, "loss": 31.4622, "step": 13535 }, { "epoch": 35.7504126774513, "grad_norm": 1191.635986328125, "learning_rate": 9.388503634755034e-05, "loss": 34.6561, "step": 13536 }, { "epoch": 35.75305381313965, "grad_norm": 974.6755981445312, "learning_rate": 9.38524087051102e-05, "loss": 32.9829, "step": 13537 }, { "epoch": 35.755694948827994, "grad_norm": 1013.4242553710938, "learning_rate": 9.38197854229998e-05, "loss": 34.9205, "step": 13538 }, { "epoch": 35.758336084516344, "grad_norm": 3297.102783203125, "learning_rate": 9.378716650213017e-05, "loss": 34.681, "step": 13539 }, { "epoch": 35.76097722020469, "grad_norm": 1895.2320556640625, "learning_rate": 9.375455194341214e-05, "loss": 33.4933, "step": 13540 }, { "epoch": 35.76361835589304, "grad_norm": 854.212646484375, "learning_rate": 9.372194174775647e-05, "loss": 37.3528, "step": 13541 }, { "epoch": 35.76625949158138, "grad_norm": 1068.3948974609375, "learning_rate": 9.368933591607378e-05, "loss": 34.8427, "step": 13542 }, { "epoch": 35.76890062726972, "grad_norm": 1158.702880859375, "learning_rate": 9.365673444927447e-05, "loss": 34.3786, "step": 13543 }, { "epoch": 35.77154176295807, "grad_norm": 2055.84814453125, "learning_rate": 9.362413734826908e-05, "loss": 33.9699, "step": 13544 }, { "epoch": 35.774182898646416, "grad_norm": 1163.40869140625, "learning_rate": 9.359154461396782e-05, "loss": 35.5679, "step": 13545 }, { "epoch": 35.776824034334766, "grad_norm": 866.1390380859375, "learning_rate": 9.355895624728078e-05, "loss": 34.722, "step": 13546 }, { "epoch": 35.77946517002311, "grad_norm": 1498.8792724609375, "learning_rate": 9.352637224911792e-05, "loss": 33.9797, "step": 13547 }, { "epoch": 35.78210630571146, "grad_norm": 1576.7862548828125, "learning_rate": 9.349379262038927e-05, "loss": 34.6973, "step": 13548 }, { "epoch": 35.7847474413998, "grad_norm": 1225.0020751953125, "learning_rate": 9.346121736200453e-05, "loss": 34.8903, "step": 13549 }, { "epoch": 35.78738857708815, "grad_norm": 646.5186767578125, "learning_rate": 9.342864647487334e-05, "loss": 36.1844, "step": 13550 }, { "epoch": 35.790029712776494, "grad_norm": 2131.8486328125, "learning_rate": 9.33960799599052e-05, "loss": 36.792, "step": 13551 }, { "epoch": 35.79267084846484, "grad_norm": 837.7431640625, "learning_rate": 9.336351781800956e-05, "loss": 39.8987, "step": 13552 }, { "epoch": 35.79531198415319, "grad_norm": 530.1489868164062, "learning_rate": 9.333096005009567e-05, "loss": 39.4286, "step": 13553 }, { "epoch": 35.79795311984153, "grad_norm": 1000.5003051757812, "learning_rate": 9.32984066570726e-05, "loss": 39.32, "step": 13554 }, { "epoch": 35.80059425552988, "grad_norm": 820.5979614257812, "learning_rate": 9.326585763984954e-05, "loss": 39.3553, "step": 13555 }, { "epoch": 35.80323539121822, "grad_norm": 1282.470947265625, "learning_rate": 9.323331299933535e-05, "loss": 40.6777, "step": 13556 }, { "epoch": 35.80587652690657, "grad_norm": 1388.911376953125, "learning_rate": 9.320077273643874e-05, "loss": 42.6656, "step": 13557 }, { "epoch": 35.808517662594916, "grad_norm": 772.4195556640625, "learning_rate": 9.316823685206857e-05, "loss": 43.5481, "step": 13558 }, { "epoch": 35.81115879828326, "grad_norm": 703.904052734375, "learning_rate": 9.31357053471332e-05, "loss": 42.4432, "step": 13559 }, { "epoch": 35.81379993397161, "grad_norm": 744.7845458984375, "learning_rate": 9.31031782225411e-05, "loss": 39.7576, "step": 13560 }, { "epoch": 35.81644106965995, "grad_norm": 1270.6890869140625, "learning_rate": 9.307065547920051e-05, "loss": 42.0612, "step": 13561 }, { "epoch": 35.8190822053483, "grad_norm": 2241.1826171875, "learning_rate": 9.303813711801975e-05, "loss": 41.2014, "step": 13562 }, { "epoch": 35.821723341036645, "grad_norm": 1572.333740234375, "learning_rate": 9.300562313990682e-05, "loss": 38.879, "step": 13563 }, { "epoch": 35.824364476724995, "grad_norm": 956.048095703125, "learning_rate": 9.297311354576954e-05, "loss": 37.024, "step": 13564 }, { "epoch": 35.82700561241334, "grad_norm": 648.209716796875, "learning_rate": 9.294060833651591e-05, "loss": 36.2799, "step": 13565 }, { "epoch": 35.82964674810168, "grad_norm": 512.4254150390625, "learning_rate": 9.290810751305354e-05, "loss": 36.2352, "step": 13566 }, { "epoch": 35.83228788379003, "grad_norm": 665.2338256835938, "learning_rate": 9.287561107629e-05, "loss": 35.2611, "step": 13567 }, { "epoch": 35.83492901947837, "grad_norm": 2027.444091796875, "learning_rate": 9.284311902713269e-05, "loss": 35.7304, "step": 13568 }, { "epoch": 35.83757015516672, "grad_norm": 1003.051513671875, "learning_rate": 9.281063136648898e-05, "loss": 35.2364, "step": 13569 }, { "epoch": 35.840211290855066, "grad_norm": 1056.018798828125, "learning_rate": 9.277814809526606e-05, "loss": 34.6677, "step": 13570 }, { "epoch": 35.842852426543416, "grad_norm": 1374.495849609375, "learning_rate": 9.274566921437088e-05, "loss": 35.2388, "step": 13571 }, { "epoch": 35.84549356223176, "grad_norm": 2594.005126953125, "learning_rate": 9.271319472471063e-05, "loss": 34.2691, "step": 13572 }, { "epoch": 35.8481346979201, "grad_norm": 1318.21240234375, "learning_rate": 9.268072462719202e-05, "loss": 34.4445, "step": 13573 }, { "epoch": 35.85077583360845, "grad_norm": 1560.6783447265625, "learning_rate": 9.264825892272172e-05, "loss": 33.8884, "step": 13574 }, { "epoch": 35.853416969296795, "grad_norm": 2559.66162109375, "learning_rate": 9.261579761220639e-05, "loss": 31.3373, "step": 13575 }, { "epoch": 35.856058104985145, "grad_norm": 1179.833251953125, "learning_rate": 9.258334069655248e-05, "loss": 12.9867, "step": 13576 }, { "epoch": 35.85869924067349, "grad_norm": 29944.521484375, "learning_rate": 9.255088817666626e-05, "loss": 11.9227, "step": 13577 }, { "epoch": 35.86134037636184, "grad_norm": 893.4591064453125, "learning_rate": 9.251844005345395e-05, "loss": 16.1463, "step": 13578 }, { "epoch": 35.86398151205018, "grad_norm": 351.01885986328125, "learning_rate": 9.248599632782177e-05, "loss": 9.7165, "step": 13579 }, { "epoch": 35.86662264773853, "grad_norm": 979.5623779296875, "learning_rate": 9.24535570006756e-05, "loss": 13.677, "step": 13580 }, { "epoch": 35.869263783426874, "grad_norm": 11307.115234375, "learning_rate": 9.24211220729213e-05, "loss": 12.8108, "step": 13581 }, { "epoch": 35.87190491911522, "grad_norm": 2025.48828125, "learning_rate": 9.23886915454645e-05, "loss": 8.5723, "step": 13582 }, { "epoch": 35.87454605480357, "grad_norm": 753.188232421875, "learning_rate": 9.235626541921102e-05, "loss": 15.7951, "step": 13583 }, { "epoch": 35.87718719049191, "grad_norm": 1141.171875, "learning_rate": 9.232384369506622e-05, "loss": 19.2215, "step": 13584 }, { "epoch": 35.87982832618026, "grad_norm": 665.630126953125, "learning_rate": 9.229142637393545e-05, "loss": 35.589, "step": 13585 }, { "epoch": 35.8824694618686, "grad_norm": 945.5473022460938, "learning_rate": 9.225901345672396e-05, "loss": 35.1009, "step": 13586 }, { "epoch": 35.88511059755695, "grad_norm": 775.5069580078125, "learning_rate": 9.222660494433687e-05, "loss": 36.0939, "step": 13587 }, { "epoch": 35.887751733245295, "grad_norm": 624.1502075195312, "learning_rate": 9.219420083767912e-05, "loss": 34.4857, "step": 13588 }, { "epoch": 35.89039286893364, "grad_norm": 534.0523681640625, "learning_rate": 9.216180113765556e-05, "loss": 34.0548, "step": 13589 }, { "epoch": 35.89303400462199, "grad_norm": 953.5933227539062, "learning_rate": 9.212940584517107e-05, "loss": 34.1842, "step": 13590 }, { "epoch": 35.89567514031033, "grad_norm": 416.4883728027344, "learning_rate": 9.209701496113018e-05, "loss": 33.8279, "step": 13591 }, { "epoch": 35.89831627599868, "grad_norm": 959.3870849609375, "learning_rate": 9.20646284864373e-05, "loss": 33.9916, "step": 13592 }, { "epoch": 35.900957411687024, "grad_norm": 1011.0264282226562, "learning_rate": 9.203224642199707e-05, "loss": 35.6725, "step": 13593 }, { "epoch": 35.903598547375374, "grad_norm": 722.5364990234375, "learning_rate": 9.199986876871343e-05, "loss": 35.4723, "step": 13594 }, { "epoch": 35.90623968306372, "grad_norm": 3511.913330078125, "learning_rate": 9.19674955274907e-05, "loss": 35.3902, "step": 13595 }, { "epoch": 35.90888081875207, "grad_norm": 2340.12109375, "learning_rate": 9.193512669923271e-05, "loss": 34.2553, "step": 13596 }, { "epoch": 35.91152195444041, "grad_norm": 792.3906860351562, "learning_rate": 9.190276228484357e-05, "loss": 34.9062, "step": 13597 }, { "epoch": 35.91416309012875, "grad_norm": 690.8374633789062, "learning_rate": 9.187040228522689e-05, "loss": 34.754, "step": 13598 }, { "epoch": 35.9168042258171, "grad_norm": 1186.4998779296875, "learning_rate": 9.183804670128625e-05, "loss": 34.9864, "step": 13599 }, { "epoch": 35.919445361505446, "grad_norm": 1537.3160400390625, "learning_rate": 9.180569553392534e-05, "loss": 35.3169, "step": 13600 }, { "epoch": 35.919445361505446, "eval_loss": 3.8081424236297607, "eval_runtime": 2.273, "eval_samples_per_second": 217.778, "eval_steps_per_second": 27.277, "step": 13600 }, { "epoch": 35.922086497193796, "grad_norm": 634.2328491210938, "learning_rate": 9.177334878404748e-05, "loss": 36.4428, "step": 13601 }, { "epoch": 35.92472763288214, "grad_norm": 1010.2969360351562, "learning_rate": 9.174100645255587e-05, "loss": 38.4169, "step": 13602 }, { "epoch": 35.92736876857049, "grad_norm": 679.2219848632812, "learning_rate": 9.17086685403537e-05, "loss": 38.5278, "step": 13603 }, { "epoch": 35.93000990425883, "grad_norm": 501.0888671875, "learning_rate": 9.167633504834396e-05, "loss": 39.2709, "step": 13604 }, { "epoch": 35.932651039947174, "grad_norm": 647.1530151367188, "learning_rate": 9.164400597742959e-05, "loss": 41.7862, "step": 13605 }, { "epoch": 35.935292175635524, "grad_norm": 656.753662109375, "learning_rate": 9.161168132851325e-05, "loss": 44.5845, "step": 13606 }, { "epoch": 35.93793331132387, "grad_norm": 1040.34716796875, "learning_rate": 9.157936110249773e-05, "loss": 39.0854, "step": 13607 }, { "epoch": 35.94057444701222, "grad_norm": 629.8601684570312, "learning_rate": 9.154704530028549e-05, "loss": 39.5212, "step": 13608 }, { "epoch": 35.94321558270056, "grad_norm": 382.13409423828125, "learning_rate": 9.151473392277892e-05, "loss": 37.9456, "step": 13609 }, { "epoch": 35.94585671838891, "grad_norm": 798.4747924804688, "learning_rate": 9.148242697088033e-05, "loss": 37.0684, "step": 13610 }, { "epoch": 35.94849785407725, "grad_norm": 425.8293151855469, "learning_rate": 9.145012444549183e-05, "loss": 35.9826, "step": 13611 }, { "epoch": 35.951138989765596, "grad_norm": 643.5346069335938, "learning_rate": 9.141782634751548e-05, "loss": 36.4163, "step": 13612 }, { "epoch": 35.953780125453946, "grad_norm": 2597.443603515625, "learning_rate": 9.13855326778531e-05, "loss": 33.5844, "step": 13613 }, { "epoch": 35.95642126114229, "grad_norm": 632.4301147460938, "learning_rate": 9.135324343740662e-05, "loss": 35.4893, "step": 13614 }, { "epoch": 35.95906239683064, "grad_norm": 2668.38134765625, "learning_rate": 9.13209586270776e-05, "loss": 26.713, "step": 13615 }, { "epoch": 35.96170353251898, "grad_norm": 434.0716247558594, "learning_rate": 9.128867824776752e-05, "loss": 9.6883, "step": 13616 }, { "epoch": 35.96434466820733, "grad_norm": 544.4835205078125, "learning_rate": 9.125640230037796e-05, "loss": 11.5568, "step": 13617 }, { "epoch": 35.966985803895675, "grad_norm": 1296.071533203125, "learning_rate": 9.122413078581013e-05, "loss": 13.979, "step": 13618 }, { "epoch": 35.96962693958402, "grad_norm": 498.5622253417969, "learning_rate": 9.119186370496516e-05, "loss": 10.2174, "step": 13619 }, { "epoch": 35.97226807527237, "grad_norm": 3737.309326171875, "learning_rate": 9.11596010587441e-05, "loss": 22.5963, "step": 13620 }, { "epoch": 35.97490921096071, "grad_norm": 1621.074462890625, "learning_rate": 9.112734284804788e-05, "loss": 35.0658, "step": 13621 }, { "epoch": 35.97755034664906, "grad_norm": 763.4306640625, "learning_rate": 9.10950890737773e-05, "loss": 33.3623, "step": 13622 }, { "epoch": 35.9801914823374, "grad_norm": 673.7594604492188, "learning_rate": 9.10628397368329e-05, "loss": 33.5504, "step": 13623 }, { "epoch": 35.98283261802575, "grad_norm": 1594.4632568359375, "learning_rate": 9.103059483811546e-05, "loss": 34.5581, "step": 13624 }, { "epoch": 35.985473753714096, "grad_norm": 1037.6212158203125, "learning_rate": 9.099835437852525e-05, "loss": 35.4613, "step": 13625 }, { "epoch": 35.988114889402446, "grad_norm": 1589.259521484375, "learning_rate": 9.09661183589626e-05, "loss": 35.0715, "step": 13626 }, { "epoch": 35.99075602509079, "grad_norm": 1488.767578125, "learning_rate": 9.093388678032757e-05, "loss": 34.2014, "step": 13627 }, { "epoch": 35.99339716077913, "grad_norm": 544.5764770507812, "learning_rate": 9.090165964352049e-05, "loss": 33.9368, "step": 13628 }, { "epoch": 35.99603829646748, "grad_norm": 1499.588134765625, "learning_rate": 9.086943694944103e-05, "loss": 34.7146, "step": 13629 }, { "epoch": 35.998679432155825, "grad_norm": 1359.370849609375, "learning_rate": 9.083721869898894e-05, "loss": 38.0254, "step": 13630 }, { "epoch": 36.001320567844175, "grad_norm": 880.0406494140625, "learning_rate": 9.080500489306415e-05, "loss": 39.5238, "step": 13631 }, { "epoch": 36.00396170353252, "grad_norm": 904.623046875, "learning_rate": 9.077279553256607e-05, "loss": 38.9068, "step": 13632 }, { "epoch": 36.00660283922087, "grad_norm": 1355.743408203125, "learning_rate": 9.074059061839416e-05, "loss": 38.5661, "step": 13633 }, { "epoch": 36.00924397490921, "grad_norm": 649.7552490234375, "learning_rate": 9.070839015144758e-05, "loss": 40.3785, "step": 13634 }, { "epoch": 36.011885110597554, "grad_norm": 571.5380249023438, "learning_rate": 9.067619413262573e-05, "loss": 41.0912, "step": 13635 }, { "epoch": 36.014526246285904, "grad_norm": 531.9892578125, "learning_rate": 9.064400256282756e-05, "loss": 43.7174, "step": 13636 }, { "epoch": 36.01716738197425, "grad_norm": 730.4701538085938, "learning_rate": 9.061181544295202e-05, "loss": 41.7518, "step": 13637 }, { "epoch": 36.0198085176626, "grad_norm": 887.8839111328125, "learning_rate": 9.057963277389791e-05, "loss": 40.8339, "step": 13638 }, { "epoch": 36.02244965335094, "grad_norm": 952.162841796875, "learning_rate": 9.054745455656391e-05, "loss": 39.9706, "step": 13639 }, { "epoch": 36.02509078903929, "grad_norm": 1307.909912109375, "learning_rate": 9.051528079184856e-05, "loss": 39.6157, "step": 13640 }, { "epoch": 36.02773192472763, "grad_norm": 1176.0601806640625, "learning_rate": 9.048311148065023e-05, "loss": 39.72, "step": 13641 }, { "epoch": 36.03037306041598, "grad_norm": 811.862548828125, "learning_rate": 9.045094662386738e-05, "loss": 36.6312, "step": 13642 }, { "epoch": 36.033014196104325, "grad_norm": 1114.6893310546875, "learning_rate": 9.041878622239813e-05, "loss": 38.1089, "step": 13643 }, { "epoch": 36.03565533179267, "grad_norm": 614.1411743164062, "learning_rate": 9.038663027714044e-05, "loss": 37.6874, "step": 13644 }, { "epoch": 36.03829646748102, "grad_norm": 521.9929809570312, "learning_rate": 9.035447878899253e-05, "loss": 35.6566, "step": 13645 }, { "epoch": 36.04093760316936, "grad_norm": 595.6957397460938, "learning_rate": 9.032233175885191e-05, "loss": 36.3373, "step": 13646 }, { "epoch": 36.04357873885771, "grad_norm": 942.3541870117188, "learning_rate": 9.029018918761636e-05, "loss": 35.7512, "step": 13647 }, { "epoch": 36.046219874546054, "grad_norm": 1845.422119140625, "learning_rate": 9.02580510761834e-05, "loss": 34.9275, "step": 13648 }, { "epoch": 36.048861010234404, "grad_norm": 596.0094604492188, "learning_rate": 9.022591742545062e-05, "loss": 34.428, "step": 13649 }, { "epoch": 36.05150214592275, "grad_norm": 522.0142211914062, "learning_rate": 9.019378823631522e-05, "loss": 34.5418, "step": 13650 }, { "epoch": 36.05414328161109, "grad_norm": 499.0349426269531, "learning_rate": 9.016166350967433e-05, "loss": 33.7526, "step": 13651 }, { "epoch": 36.05678441729944, "grad_norm": 5554.76025390625, "learning_rate": 9.012954324642517e-05, "loss": 36.412, "step": 13652 }, { "epoch": 36.05942555298778, "grad_norm": 4648.7158203125, "learning_rate": 9.009742744746463e-05, "loss": 17.8535, "step": 13653 }, { "epoch": 36.06206668867613, "grad_norm": 2166.795654296875, "learning_rate": 9.006531611368945e-05, "loss": 11.3148, "step": 13654 }, { "epoch": 36.064707824364476, "grad_norm": 1318.5394287109375, "learning_rate": 9.00332092459964e-05, "loss": 11.0665, "step": 13655 }, { "epoch": 36.067348960052826, "grad_norm": 1114.177978515625, "learning_rate": 9.0001106845282e-05, "loss": 12.3134, "step": 13656 }, { "epoch": 36.06999009574117, "grad_norm": 840.6783447265625, "learning_rate": 8.996900891244273e-05, "loss": 8.5605, "step": 13657 }, { "epoch": 36.07263123142951, "grad_norm": 394.7906188964844, "learning_rate": 8.993691544837477e-05, "loss": 9.4956, "step": 13658 }, { "epoch": 36.07527236711786, "grad_norm": 686.5553588867188, "learning_rate": 8.99048264539745e-05, "loss": 10.0072, "step": 13659 }, { "epoch": 36.077913502806204, "grad_norm": 6031.6318359375, "learning_rate": 8.987274193013791e-05, "loss": 14.7689, "step": 13660 }, { "epoch": 36.080554638494554, "grad_norm": 4216.56591796875, "learning_rate": 8.984066187776094e-05, "loss": 16.0109, "step": 13661 }, { "epoch": 36.0831957741829, "grad_norm": 1600.181396484375, "learning_rate": 8.980858629773933e-05, "loss": 14.4209, "step": 13662 }, { "epoch": 36.08583690987125, "grad_norm": 751.6810913085938, "learning_rate": 8.977651519096899e-05, "loss": 34.9506, "step": 13663 }, { "epoch": 36.08847804555959, "grad_norm": 812.1043090820312, "learning_rate": 8.974444855834527e-05, "loss": 35.3059, "step": 13664 }, { "epoch": 36.09111918124793, "grad_norm": 706.86279296875, "learning_rate": 8.971238640076362e-05, "loss": 34.1692, "step": 13665 }, { "epoch": 36.09376031693628, "grad_norm": 1316.6278076171875, "learning_rate": 8.968032871911949e-05, "loss": 35.0091, "step": 13666 }, { "epoch": 36.096401452624626, "grad_norm": 1899.9306640625, "learning_rate": 8.964827551430799e-05, "loss": 33.9868, "step": 13667 }, { "epoch": 36.099042588312976, "grad_norm": 1210.366943359375, "learning_rate": 8.961622678722423e-05, "loss": 33.4043, "step": 13668 }, { "epoch": 36.10168372400132, "grad_norm": 750.6251831054688, "learning_rate": 8.958418253876302e-05, "loss": 33.8565, "step": 13669 }, { "epoch": 36.10432485968967, "grad_norm": 1555.88525390625, "learning_rate": 8.955214276981935e-05, "loss": 35.7078, "step": 13670 }, { "epoch": 36.10696599537801, "grad_norm": 635.237548828125, "learning_rate": 8.952010748128786e-05, "loss": 34.858, "step": 13671 }, { "epoch": 36.10960713106636, "grad_norm": 949.9328002929688, "learning_rate": 8.948807667406312e-05, "loss": 34.6957, "step": 13672 }, { "epoch": 36.112248266754705, "grad_norm": 1588.62939453125, "learning_rate": 8.94560503490395e-05, "loss": 34.8559, "step": 13673 }, { "epoch": 36.11488940244305, "grad_norm": 642.9598999023438, "learning_rate": 8.94240285071114e-05, "loss": 34.01, "step": 13674 }, { "epoch": 36.1175305381314, "grad_norm": 3505.678466796875, "learning_rate": 8.939201114917295e-05, "loss": 35.2777, "step": 13675 }, { "epoch": 36.12017167381974, "grad_norm": 1090.789306640625, "learning_rate": 8.935999827611818e-05, "loss": 35.3615, "step": 13676 }, { "epoch": 36.12281280950809, "grad_norm": 839.4583740234375, "learning_rate": 8.932798988884116e-05, "loss": 34.1146, "step": 13677 }, { "epoch": 36.12545394519643, "grad_norm": 1452.54345703125, "learning_rate": 8.929598598823566e-05, "loss": 34.9414, "step": 13678 }, { "epoch": 36.12809508088478, "grad_norm": 1564.6175537109375, "learning_rate": 8.926398657519524e-05, "loss": 36.2347, "step": 13679 }, { "epoch": 36.130736216573126, "grad_norm": 1476.0201416015625, "learning_rate": 8.923199165061377e-05, "loss": 37.7183, "step": 13680 }, { "epoch": 36.13337735226147, "grad_norm": 851.2693481445312, "learning_rate": 8.92000012153844e-05, "loss": 40.474, "step": 13681 }, { "epoch": 36.13601848794982, "grad_norm": 953.8187255859375, "learning_rate": 8.916801527040053e-05, "loss": 38.3381, "step": 13682 }, { "epoch": 36.13865962363816, "grad_norm": 931.7254638671875, "learning_rate": 8.913603381655528e-05, "loss": 38.9493, "step": 13683 }, { "epoch": 36.14130075932651, "grad_norm": 1406.376953125, "learning_rate": 8.910405685474185e-05, "loss": 39.6128, "step": 13684 }, { "epoch": 36.143941895014855, "grad_norm": 1164.8023681640625, "learning_rate": 8.907208438585315e-05, "loss": 40.8803, "step": 13685 }, { "epoch": 36.146583030703205, "grad_norm": 436.7814025878906, "learning_rate": 8.904011641078185e-05, "loss": 42.9348, "step": 13686 }, { "epoch": 36.14922416639155, "grad_norm": 640.6046142578125, "learning_rate": 8.900815293042084e-05, "loss": 42.4249, "step": 13687 }, { "epoch": 36.1518653020799, "grad_norm": 1325.9542236328125, "learning_rate": 8.897619394566259e-05, "loss": 40.1925, "step": 13688 }, { "epoch": 36.15450643776824, "grad_norm": 330.6681823730469, "learning_rate": 8.894423945739951e-05, "loss": 40.4969, "step": 13689 }, { "epoch": 36.157147573456584, "grad_norm": 786.9987182617188, "learning_rate": 8.891228946652397e-05, "loss": 38.565, "step": 13690 }, { "epoch": 36.159788709144934, "grad_norm": 884.4855346679688, "learning_rate": 8.88803439739281e-05, "loss": 38.6453, "step": 13691 }, { "epoch": 36.16242984483328, "grad_norm": 910.4380493164062, "learning_rate": 8.884840298050397e-05, "loss": 39.1936, "step": 13692 }, { "epoch": 36.16507098052163, "grad_norm": 849.4227294921875, "learning_rate": 8.881646648714348e-05, "loss": 38.1247, "step": 13693 }, { "epoch": 36.16771211620997, "grad_norm": 1009.06982421875, "learning_rate": 8.878453449473853e-05, "loss": 36.6752, "step": 13694 }, { "epoch": 36.17035325189832, "grad_norm": 775.8865356445312, "learning_rate": 8.875260700418076e-05, "loss": 36.2986, "step": 13695 }, { "epoch": 36.17299438758666, "grad_norm": 1671.3067626953125, "learning_rate": 8.872068401636169e-05, "loss": 36.5136, "step": 13696 }, { "epoch": 36.175635523275005, "grad_norm": 1003.0162353515625, "learning_rate": 8.86887655321727e-05, "loss": 36.2241, "step": 13697 }, { "epoch": 36.178276658963355, "grad_norm": 1069.8116455078125, "learning_rate": 8.865685155250536e-05, "loss": 34.8598, "step": 13698 }, { "epoch": 36.1809177946517, "grad_norm": 613.6064453125, "learning_rate": 8.862494207825058e-05, "loss": 33.8905, "step": 13699 }, { "epoch": 36.18355893034005, "grad_norm": 1221.280029296875, "learning_rate": 8.85930371102994e-05, "loss": 34.0896, "step": 13700 }, { "epoch": 36.18620006602839, "grad_norm": 513.436767578125, "learning_rate": 8.856113664954294e-05, "loss": 36.0348, "step": 13701 }, { "epoch": 36.18884120171674, "grad_norm": 2114.96484375, "learning_rate": 8.85292406968719e-05, "loss": 36.8138, "step": 13702 }, { "epoch": 36.191482337405084, "grad_norm": 2009.236328125, "learning_rate": 8.849734925317696e-05, "loss": 17.904, "step": 13703 }, { "epoch": 36.19412347309343, "grad_norm": 418.4693603515625, "learning_rate": 8.846546231934858e-05, "loss": 12.0509, "step": 13704 }, { "epoch": 36.19676460878178, "grad_norm": 456.9958190917969, "learning_rate": 8.843357989627734e-05, "loss": 8.7488, "step": 13705 }, { "epoch": 36.19940574447012, "grad_norm": 14333.8359375, "learning_rate": 8.84017019848535e-05, "loss": 10.0234, "step": 13706 }, { "epoch": 36.20204688015847, "grad_norm": 2600.462646484375, "learning_rate": 8.836982858596721e-05, "loss": 18.7845, "step": 13707 }, { "epoch": 36.20468801584681, "grad_norm": 3360.210205078125, "learning_rate": 8.833795970050851e-05, "loss": 13.066, "step": 13708 }, { "epoch": 36.20732915153516, "grad_norm": 12758.9541015625, "learning_rate": 8.830609532936729e-05, "loss": 17.6305, "step": 13709 }, { "epoch": 36.209970287223506, "grad_norm": 2875.426513671875, "learning_rate": 8.827423547343338e-05, "loss": 11.1514, "step": 13710 }, { "epoch": 36.21261142291185, "grad_norm": 689.4794921875, "learning_rate": 8.82423801335964e-05, "loss": 14.9029, "step": 13711 }, { "epoch": 36.2152525586002, "grad_norm": 961.4501953125, "learning_rate": 8.821052931074597e-05, "loss": 17.1357, "step": 13712 }, { "epoch": 36.21789369428854, "grad_norm": 680.2169189453125, "learning_rate": 8.817868300577148e-05, "loss": 34.4417, "step": 13713 }, { "epoch": 36.22053482997689, "grad_norm": 522.490478515625, "learning_rate": 8.814684121956213e-05, "loss": 35.8373, "step": 13714 }, { "epoch": 36.223175965665234, "grad_norm": 1366.762451171875, "learning_rate": 8.811500395300733e-05, "loss": 34.8704, "step": 13715 }, { "epoch": 36.225817101353584, "grad_norm": 771.2030029296875, "learning_rate": 8.808317120699588e-05, "loss": 34.5671, "step": 13716 }, { "epoch": 36.22845823704193, "grad_norm": 1005.5267333984375, "learning_rate": 8.805134298241673e-05, "loss": 33.5687, "step": 13717 }, { "epoch": 36.23109937273028, "grad_norm": 1127.8291015625, "learning_rate": 8.801951928015861e-05, "loss": 34.8549, "step": 13718 }, { "epoch": 36.23374050841862, "grad_norm": 762.6132202148438, "learning_rate": 8.798770010111037e-05, "loss": 36.6371, "step": 13719 }, { "epoch": 36.23638164410696, "grad_norm": 1141.0758056640625, "learning_rate": 8.795588544616042e-05, "loss": 34.2173, "step": 13720 }, { "epoch": 36.23902277979531, "grad_norm": 791.162841796875, "learning_rate": 8.792407531619709e-05, "loss": 35.6119, "step": 13721 }, { "epoch": 36.241663915483656, "grad_norm": 867.0126953125, "learning_rate": 8.789226971210884e-05, "loss": 34.6205, "step": 13722 }, { "epoch": 36.244305051172006, "grad_norm": 1191.367431640625, "learning_rate": 8.786046863478372e-05, "loss": 35.3225, "step": 13723 }, { "epoch": 36.24694618686035, "grad_norm": 1223.985107421875, "learning_rate": 8.782867208510977e-05, "loss": 35.9596, "step": 13724 }, { "epoch": 36.2495873225487, "grad_norm": 792.2584838867188, "learning_rate": 8.77968800639749e-05, "loss": 35.2848, "step": 13725 }, { "epoch": 36.25222845823704, "grad_norm": 1241.1224365234375, "learning_rate": 8.776509257226689e-05, "loss": 34.1468, "step": 13726 }, { "epoch": 36.254869593925385, "grad_norm": 1265.9212646484375, "learning_rate": 8.773330961087334e-05, "loss": 33.6527, "step": 13727 }, { "epoch": 36.257510729613735, "grad_norm": 1109.7081298828125, "learning_rate": 8.770153118068172e-05, "loss": 36.852, "step": 13728 }, { "epoch": 36.26015186530208, "grad_norm": 4270.87744140625, "learning_rate": 8.766975728257956e-05, "loss": 35.9047, "step": 13729 }, { "epoch": 36.26279300099043, "grad_norm": 726.92333984375, "learning_rate": 8.763798791745412e-05, "loss": 36.2315, "step": 13730 }, { "epoch": 36.26543413667877, "grad_norm": 1450.071533203125, "learning_rate": 8.760622308619245e-05, "loss": 42.4143, "step": 13731 }, { "epoch": 36.26807527236712, "grad_norm": 558.1622314453125, "learning_rate": 8.757446278968153e-05, "loss": 39.9095, "step": 13732 }, { "epoch": 36.27071640805546, "grad_norm": 1621.0838623046875, "learning_rate": 8.754270702880852e-05, "loss": 39.0652, "step": 13733 }, { "epoch": 36.27335754374381, "grad_norm": 756.9959716796875, "learning_rate": 8.751095580445986e-05, "loss": 39.5654, "step": 13734 }, { "epoch": 36.275998679432156, "grad_norm": 1736.8668212890625, "learning_rate": 8.747920911752224e-05, "loss": 40.6533, "step": 13735 }, { "epoch": 36.2786398151205, "grad_norm": 779.1088256835938, "learning_rate": 8.744746696888231e-05, "loss": 41.6214, "step": 13736 }, { "epoch": 36.28128095080885, "grad_norm": 1021.8155517578125, "learning_rate": 8.74157293594264e-05, "loss": 45.1331, "step": 13737 }, { "epoch": 36.28392208649719, "grad_norm": 649.6214599609375, "learning_rate": 8.73839962900407e-05, "loss": 41.5966, "step": 13738 }, { "epoch": 36.28656322218554, "grad_norm": 600.3781127929688, "learning_rate": 8.735226776161135e-05, "loss": 40.0758, "step": 13739 }, { "epoch": 36.289204357873885, "grad_norm": 736.8853149414062, "learning_rate": 8.732054377502441e-05, "loss": 37.827, "step": 13740 }, { "epoch": 36.291845493562235, "grad_norm": 881.2184448242188, "learning_rate": 8.728882433116575e-05, "loss": 40.1357, "step": 13741 }, { "epoch": 36.29448662925058, "grad_norm": 855.4598388671875, "learning_rate": 8.725710943092108e-05, "loss": 39.5346, "step": 13742 }, { "epoch": 36.29712776493892, "grad_norm": 1398.2080078125, "learning_rate": 8.722539907517604e-05, "loss": 35.7412, "step": 13743 }, { "epoch": 36.29976890062727, "grad_norm": 888.2980346679688, "learning_rate": 8.719369326481611e-05, "loss": 36.4737, "step": 13744 }, { "epoch": 36.302410036315614, "grad_norm": 1232.5989990234375, "learning_rate": 8.716199200072656e-05, "loss": 35.0186, "step": 13745 }, { "epoch": 36.305051172003964, "grad_norm": 1008.20703125, "learning_rate": 8.713029528379282e-05, "loss": 35.855, "step": 13746 }, { "epoch": 36.30769230769231, "grad_norm": 563.5887451171875, "learning_rate": 8.709860311489993e-05, "loss": 34.4693, "step": 13747 }, { "epoch": 36.31033344338066, "grad_norm": 558.7401733398438, "learning_rate": 8.706691549493282e-05, "loss": 35.5354, "step": 13748 }, { "epoch": 36.312974579069, "grad_norm": 523.1915893554688, "learning_rate": 8.70352324247763e-05, "loss": 35.8039, "step": 13749 }, { "epoch": 36.31561571475734, "grad_norm": 839.2578735351562, "learning_rate": 8.700355390531537e-05, "loss": 36.3076, "step": 13750 }, { "epoch": 36.31825685044569, "grad_norm": 776.08154296875, "learning_rate": 8.697187993743435e-05, "loss": 36.2962, "step": 13751 }, { "epoch": 36.320897986134035, "grad_norm": 2663.443603515625, "learning_rate": 8.694021052201775e-05, "loss": 35.2364, "step": 13752 }, { "epoch": 36.323539121822385, "grad_norm": 3914.619873046875, "learning_rate": 8.690854565995005e-05, "loss": 48.4499, "step": 13753 }, { "epoch": 36.32618025751073, "grad_norm": 50332.23046875, "learning_rate": 8.68768853521154e-05, "loss": 17.8513, "step": 13754 }, { "epoch": 36.32882139319908, "grad_norm": 622.03173828125, "learning_rate": 8.684522959939792e-05, "loss": 13.4628, "step": 13755 }, { "epoch": 36.33146252888742, "grad_norm": 2009.181884765625, "learning_rate": 8.681357840268147e-05, "loss": 11.0621, "step": 13756 }, { "epoch": 36.334103664575764, "grad_norm": 8108.75, "learning_rate": 8.67819317628501e-05, "loss": 11.425, "step": 13757 }, { "epoch": 36.336744800264114, "grad_norm": 489.98370361328125, "learning_rate": 8.675028968078735e-05, "loss": 12.257, "step": 13758 }, { "epoch": 36.33938593595246, "grad_norm": 3595.2841796875, "learning_rate": 8.67186521573769e-05, "loss": 9.873, "step": 13759 }, { "epoch": 36.34202707164081, "grad_norm": 58497.27734375, "learning_rate": 8.668701919350217e-05, "loss": 11.95, "step": 13760 }, { "epoch": 36.34466820732915, "grad_norm": 1911.404296875, "learning_rate": 8.665539079004647e-05, "loss": 13.0283, "step": 13761 }, { "epoch": 36.3473093430175, "grad_norm": 4120.25634765625, "learning_rate": 8.662376694789307e-05, "loss": 9.6363, "step": 13762 }, { "epoch": 36.34995047870584, "grad_norm": 591.0538330078125, "learning_rate": 8.659214766792486e-05, "loss": 29.5603, "step": 13763 }, { "epoch": 36.35259161439419, "grad_norm": 970.798583984375, "learning_rate": 8.656053295102507e-05, "loss": 35.3864, "step": 13764 }, { "epoch": 36.355232750082536, "grad_norm": 611.0964965820312, "learning_rate": 8.652892279807637e-05, "loss": 33.4425, "step": 13765 }, { "epoch": 36.35787388577088, "grad_norm": 743.9717407226562, "learning_rate": 8.64973172099614e-05, "loss": 35.8586, "step": 13766 }, { "epoch": 36.36051502145923, "grad_norm": 1216.9539794921875, "learning_rate": 8.646571618756285e-05, "loss": 35.1403, "step": 13767 }, { "epoch": 36.36315615714757, "grad_norm": 647.2957153320312, "learning_rate": 8.643411973176323e-05, "loss": 33.9182, "step": 13768 }, { "epoch": 36.36579729283592, "grad_norm": 920.8021240234375, "learning_rate": 8.640252784344463e-05, "loss": 34.7373, "step": 13769 }, { "epoch": 36.368438428524264, "grad_norm": 889.6873168945312, "learning_rate": 8.637094052348926e-05, "loss": 35.2657, "step": 13770 }, { "epoch": 36.371079564212614, "grad_norm": 881.3602294921875, "learning_rate": 8.633935777277935e-05, "loss": 35.4527, "step": 13771 }, { "epoch": 36.37372069990096, "grad_norm": 664.219970703125, "learning_rate": 8.630777959219675e-05, "loss": 34.8844, "step": 13772 }, { "epoch": 36.3763618355893, "grad_norm": 483.4425048828125, "learning_rate": 8.627620598262311e-05, "loss": 34.3992, "step": 13773 }, { "epoch": 36.37900297127765, "grad_norm": 777.5111694335938, "learning_rate": 8.624463694494036e-05, "loss": 35.4886, "step": 13774 }, { "epoch": 36.38164410696599, "grad_norm": 2047.0177001953125, "learning_rate": 8.621307248002994e-05, "loss": 34.7604, "step": 13775 }, { "epoch": 36.38428524265434, "grad_norm": 1350.4007568359375, "learning_rate": 8.618151258877322e-05, "loss": 34.2886, "step": 13776 }, { "epoch": 36.386926378342686, "grad_norm": 1335.5953369140625, "learning_rate": 8.614995727205155e-05, "loss": 34.483, "step": 13777 }, { "epoch": 36.389567514031036, "grad_norm": 1094.6011962890625, "learning_rate": 8.611840653074607e-05, "loss": 33.9898, "step": 13778 }, { "epoch": 36.39220864971938, "grad_norm": 4142.73681640625, "learning_rate": 8.608686036573782e-05, "loss": 35.07, "step": 13779 }, { "epoch": 36.39484978540773, "grad_norm": 2481.44140625, "learning_rate": 8.605531877790762e-05, "loss": 39.2095, "step": 13780 }, { "epoch": 36.39749092109607, "grad_norm": 1056.597900390625, "learning_rate": 8.60237817681364e-05, "loss": 39.8955, "step": 13781 }, { "epoch": 36.400132056784415, "grad_norm": 644.488037109375, "learning_rate": 8.599224933730479e-05, "loss": 38.9041, "step": 13782 }, { "epoch": 36.402773192472765, "grad_norm": 1246.1177978515625, "learning_rate": 8.596072148629325e-05, "loss": 38.8375, "step": 13783 }, { "epoch": 36.40541432816111, "grad_norm": 669.4623413085938, "learning_rate": 8.592919821598213e-05, "loss": 38.022, "step": 13784 }, { "epoch": 36.40805546384946, "grad_norm": 614.1949462890625, "learning_rate": 8.589767952725191e-05, "loss": 40.3664, "step": 13785 }, { "epoch": 36.4106965995378, "grad_norm": 2297.473388671875, "learning_rate": 8.586616542098256e-05, "loss": 41.944, "step": 13786 }, { "epoch": 36.41333773522615, "grad_norm": 563.6024169921875, "learning_rate": 8.583465589805397e-05, "loss": 40.1916, "step": 13787 }, { "epoch": 36.41597887091449, "grad_norm": 1529.6031494140625, "learning_rate": 8.580315095934632e-05, "loss": 41.1972, "step": 13788 }, { "epoch": 36.418620006602836, "grad_norm": 4187.2431640625, "learning_rate": 8.57716506057392e-05, "loss": 38.8995, "step": 13789 }, { "epoch": 36.421261142291186, "grad_norm": 825.9549560546875, "learning_rate": 8.574015483811226e-05, "loss": 40.8878, "step": 13790 }, { "epoch": 36.42390227797953, "grad_norm": 746.0909423828125, "learning_rate": 8.570866365734493e-05, "loss": 37.7013, "step": 13791 }, { "epoch": 36.42654341366788, "grad_norm": 580.790283203125, "learning_rate": 8.567717706431674e-05, "loss": 38.1296, "step": 13792 }, { "epoch": 36.42918454935622, "grad_norm": 1465.984375, "learning_rate": 8.564569505990686e-05, "loss": 36.2872, "step": 13793 }, { "epoch": 36.43182568504457, "grad_norm": 898.0298461914062, "learning_rate": 8.561421764499439e-05, "loss": 36.1384, "step": 13794 }, { "epoch": 36.434466820732915, "grad_norm": 734.30810546875, "learning_rate": 8.55827448204583e-05, "loss": 35.0953, "step": 13795 }, { "epoch": 36.43710795642126, "grad_norm": 404.64837646484375, "learning_rate": 8.555127658717751e-05, "loss": 35.3477, "step": 13796 }, { "epoch": 36.43974909210961, "grad_norm": 895.5426635742188, "learning_rate": 8.55198129460307e-05, "loss": 35.6302, "step": 13797 }, { "epoch": 36.44239022779795, "grad_norm": 666.1434326171875, "learning_rate": 8.548835389789644e-05, "loss": 36.2951, "step": 13798 }, { "epoch": 36.4450313634863, "grad_norm": 535.483642578125, "learning_rate": 8.545689944365331e-05, "loss": 34.3337, "step": 13799 }, { "epoch": 36.447672499174644, "grad_norm": 1107.091796875, "learning_rate": 8.542544958417961e-05, "loss": 34.3745, "step": 13800 }, { "epoch": 36.447672499174644, "eval_loss": 3.703050136566162, "eval_runtime": 2.1406, "eval_samples_per_second": 231.246, "eval_steps_per_second": 28.964, "step": 13800 }, { "epoch": 36.450313634862994, "grad_norm": 1098.410400390625, "learning_rate": 8.539400432035349e-05, "loss": 35.1715, "step": 13801 }, { "epoch": 36.45295477055134, "grad_norm": 2108.77880859375, "learning_rate": 8.536256365305317e-05, "loss": 35.693, "step": 13802 }, { "epoch": 36.45559590623968, "grad_norm": 1394.2078857421875, "learning_rate": 8.533112758315665e-05, "loss": 34.7988, "step": 13803 }, { "epoch": 36.45823704192803, "grad_norm": 2861.7158203125, "learning_rate": 8.529969611154156e-05, "loss": 12.5539, "step": 13804 }, { "epoch": 36.46087817761637, "grad_norm": 1313.6220703125, "learning_rate": 8.526826923908563e-05, "loss": 13.352, "step": 13805 }, { "epoch": 36.46351931330472, "grad_norm": 3092.956787109375, "learning_rate": 8.523684696666658e-05, "loss": 10.9061, "step": 13806 }, { "epoch": 36.466160448993065, "grad_norm": 1052.39013671875, "learning_rate": 8.520542929516182e-05, "loss": 15.3639, "step": 13807 }, { "epoch": 36.468801584681415, "grad_norm": 4141.39111328125, "learning_rate": 8.517401622544851e-05, "loss": 15.4676, "step": 13808 }, { "epoch": 36.47144272036976, "grad_norm": 1173.11181640625, "learning_rate": 8.51426077584041e-05, "loss": 11.7525, "step": 13809 }, { "epoch": 36.47408385605811, "grad_norm": 281.2746276855469, "learning_rate": 8.511120389490551e-05, "loss": 15.6, "step": 13810 }, { "epoch": 36.47672499174645, "grad_norm": 539.2705078125, "learning_rate": 8.50798046358297e-05, "loss": 8.0432, "step": 13811 }, { "epoch": 36.479366127434794, "grad_norm": 473.6065673828125, "learning_rate": 8.504840998205346e-05, "loss": 13.4816, "step": 13812 }, { "epoch": 36.482007263123144, "grad_norm": 535.40576171875, "learning_rate": 8.501701993445346e-05, "loss": 29.4632, "step": 13813 }, { "epoch": 36.48464839881149, "grad_norm": 915.2354736328125, "learning_rate": 8.498563449390628e-05, "loss": 35.5329, "step": 13814 }, { "epoch": 36.48728953449984, "grad_norm": 628.357666015625, "learning_rate": 8.495425366128822e-05, "loss": 35.2075, "step": 13815 }, { "epoch": 36.48993067018818, "grad_norm": 1129.658203125, "learning_rate": 8.492287743747578e-05, "loss": 34.0521, "step": 13816 }, { "epoch": 36.49257180587653, "grad_norm": 1457.9097900390625, "learning_rate": 8.489150582334501e-05, "loss": 33.7226, "step": 13817 }, { "epoch": 36.49521294156487, "grad_norm": 1050.5423583984375, "learning_rate": 8.486013881977195e-05, "loss": 33.5286, "step": 13818 }, { "epoch": 36.497854077253216, "grad_norm": 1203.9271240234375, "learning_rate": 8.482877642763243e-05, "loss": 33.7858, "step": 13819 }, { "epoch": 36.500495212941566, "grad_norm": 2477.156005859375, "learning_rate": 8.479741864780236e-05, "loss": 35.1923, "step": 13820 }, { "epoch": 36.50313634862991, "grad_norm": 1045.891845703125, "learning_rate": 8.476606548115742e-05, "loss": 33.8504, "step": 13821 }, { "epoch": 36.50577748431826, "grad_norm": 658.1633911132812, "learning_rate": 8.473471692857287e-05, "loss": 33.3738, "step": 13822 }, { "epoch": 36.5084186200066, "grad_norm": 948.7498779296875, "learning_rate": 8.470337299092437e-05, "loss": 34.3768, "step": 13823 }, { "epoch": 36.51105975569495, "grad_norm": 412.1817626953125, "learning_rate": 8.467203366908707e-05, "loss": 35.8274, "step": 13824 }, { "epoch": 36.513700891383294, "grad_norm": 1858.1356201171875, "learning_rate": 8.464069896393612e-05, "loss": 34.7446, "step": 13825 }, { "epoch": 36.516342027071644, "grad_norm": 1097.101318359375, "learning_rate": 8.460936887634641e-05, "loss": 34.837, "step": 13826 }, { "epoch": 36.51898316275999, "grad_norm": 1381.119140625, "learning_rate": 8.457804340719299e-05, "loss": 34.8881, "step": 13827 }, { "epoch": 36.52162429844833, "grad_norm": 7623.728515625, "learning_rate": 8.454672255735058e-05, "loss": 35.0449, "step": 13828 }, { "epoch": 36.52426543413668, "grad_norm": 1140.0751953125, "learning_rate": 8.45154063276937e-05, "loss": 35.9451, "step": 13829 }, { "epoch": 36.52690656982502, "grad_norm": 709.4390869140625, "learning_rate": 8.44840947190969e-05, "loss": 38.6111, "step": 13830 }, { "epoch": 36.52954770551337, "grad_norm": 1353.5654296875, "learning_rate": 8.445278773243456e-05, "loss": 39.2984, "step": 13831 }, { "epoch": 36.532188841201716, "grad_norm": 742.3441772460938, "learning_rate": 8.442148536858085e-05, "loss": 37.2516, "step": 13832 }, { "epoch": 36.534829976890066, "grad_norm": 712.34375, "learning_rate": 8.439018762840981e-05, "loss": 39.2321, "step": 13833 }, { "epoch": 36.53747111257841, "grad_norm": 775.1405639648438, "learning_rate": 8.435889451279558e-05, "loss": 40.9891, "step": 13834 }, { "epoch": 36.54011224826675, "grad_norm": 1141.1766357421875, "learning_rate": 8.432760602261192e-05, "loss": 41.7241, "step": 13835 }, { "epoch": 36.5427533839551, "grad_norm": 702.557861328125, "learning_rate": 8.429632215873245e-05, "loss": 40.094, "step": 13836 }, { "epoch": 36.545394519643445, "grad_norm": 656.333984375, "learning_rate": 8.426504292203094e-05, "loss": 42.1716, "step": 13837 }, { "epoch": 36.548035655331795, "grad_norm": 1105.506103515625, "learning_rate": 8.423376831338084e-05, "loss": 41.5161, "step": 13838 }, { "epoch": 36.55067679102014, "grad_norm": 585.9474487304688, "learning_rate": 8.420249833365528e-05, "loss": 39.7099, "step": 13839 }, { "epoch": 36.55331792670849, "grad_norm": 896.4719848632812, "learning_rate": 8.417123298372748e-05, "loss": 40.3087, "step": 13840 }, { "epoch": 36.55595906239683, "grad_norm": 1059.451904296875, "learning_rate": 8.413997226447068e-05, "loss": 39.9231, "step": 13841 }, { "epoch": 36.55860019808517, "grad_norm": 852.7432250976562, "learning_rate": 8.41087161767577e-05, "loss": 39.2511, "step": 13842 }, { "epoch": 36.56124133377352, "grad_norm": 1081.594482421875, "learning_rate": 8.40774647214613e-05, "loss": 37.877, "step": 13843 }, { "epoch": 36.563882469461866, "grad_norm": 578.4306640625, "learning_rate": 8.404621789945433e-05, "loss": 37.106, "step": 13844 }, { "epoch": 36.566523605150216, "grad_norm": 1474.146240234375, "learning_rate": 8.40149757116092e-05, "loss": 34.9655, "step": 13845 }, { "epoch": 36.56916474083856, "grad_norm": 528.3135986328125, "learning_rate": 8.398373815879842e-05, "loss": 36.7382, "step": 13846 }, { "epoch": 36.57180587652691, "grad_norm": 561.1124267578125, "learning_rate": 8.395250524189418e-05, "loss": 34.1515, "step": 13847 }, { "epoch": 36.57444701221525, "grad_norm": 492.9225769042969, "learning_rate": 8.392127696176872e-05, "loss": 35.2816, "step": 13848 }, { "epoch": 36.577088147903595, "grad_norm": 950.3215942382812, "learning_rate": 8.389005331929403e-05, "loss": 34.8696, "step": 13849 }, { "epoch": 36.579729283591945, "grad_norm": 587.7005615234375, "learning_rate": 8.385883431534195e-05, "loss": 34.9038, "step": 13850 }, { "epoch": 36.58237041928029, "grad_norm": 1269.0155029296875, "learning_rate": 8.382761995078439e-05, "loss": 34.3785, "step": 13851 }, { "epoch": 36.58501155496864, "grad_norm": 1044.2144775390625, "learning_rate": 8.379641022649295e-05, "loss": 34.7886, "step": 13852 }, { "epoch": 36.58765269065698, "grad_norm": 5917.15966796875, "learning_rate": 8.37652051433391e-05, "loss": 34.8756, "step": 13853 }, { "epoch": 36.59029382634533, "grad_norm": 887.9225463867188, "learning_rate": 8.373400470219417e-05, "loss": 8.5194, "step": 13854 }, { "epoch": 36.592934962033674, "grad_norm": 1880.5223388671875, "learning_rate": 8.370280890392956e-05, "loss": 10.0882, "step": 13855 }, { "epoch": 36.595576097722024, "grad_norm": 16543.541015625, "learning_rate": 8.367161774941642e-05, "loss": 11.429, "step": 13856 }, { "epoch": 36.59821723341037, "grad_norm": 1351.3428955078125, "learning_rate": 8.36404312395255e-05, "loss": 9.7028, "step": 13857 }, { "epoch": 36.60085836909871, "grad_norm": 1212.04248046875, "learning_rate": 8.360924937512787e-05, "loss": 11.8463, "step": 13858 }, { "epoch": 36.60349950478706, "grad_norm": 2156.30029296875, "learning_rate": 8.35780721570942e-05, "loss": 11.6761, "step": 13859 }, { "epoch": 36.6061406404754, "grad_norm": 1198.6593017578125, "learning_rate": 8.354689958629513e-05, "loss": 16.7446, "step": 13860 }, { "epoch": 36.60878177616375, "grad_norm": 3173.23046875, "learning_rate": 8.351573166360102e-05, "loss": 10.1191, "step": 13861 }, { "epoch": 36.611422911852095, "grad_norm": 443.0749206542969, "learning_rate": 8.348456838988239e-05, "loss": 8.569, "step": 13862 }, { "epoch": 36.614064047540445, "grad_norm": 1360.8431396484375, "learning_rate": 8.345340976600935e-05, "loss": 35.457, "step": 13863 }, { "epoch": 36.61670518322879, "grad_norm": 641.7767944335938, "learning_rate": 8.3422255792852e-05, "loss": 35.1307, "step": 13864 }, { "epoch": 36.61934631891713, "grad_norm": 692.7201538085938, "learning_rate": 8.339110647128035e-05, "loss": 34.753, "step": 13865 }, { "epoch": 36.62198745460548, "grad_norm": 1203.3388671875, "learning_rate": 8.335996180216416e-05, "loss": 35.4678, "step": 13866 }, { "epoch": 36.624628590293824, "grad_norm": 1265.6751708984375, "learning_rate": 8.332882178637313e-05, "loss": 36.4717, "step": 13867 }, { "epoch": 36.627269725982174, "grad_norm": 1717.8798828125, "learning_rate": 8.329768642477679e-05, "loss": 34.7095, "step": 13868 }, { "epoch": 36.62991086167052, "grad_norm": 2030.953857421875, "learning_rate": 8.326655571824468e-05, "loss": 33.5932, "step": 13869 }, { "epoch": 36.63255199735887, "grad_norm": 735.9630126953125, "learning_rate": 8.323542966764608e-05, "loss": 34.2333, "step": 13870 }, { "epoch": 36.63519313304721, "grad_norm": 784.4578247070312, "learning_rate": 8.320430827385004e-05, "loss": 33.702, "step": 13871 }, { "epoch": 36.63783426873556, "grad_norm": 575.63232421875, "learning_rate": 8.317319153772582e-05, "loss": 34.7394, "step": 13872 }, { "epoch": 36.6404754044239, "grad_norm": 864.8531494140625, "learning_rate": 8.31420794601423e-05, "loss": 34.8175, "step": 13873 }, { "epoch": 36.643116540112246, "grad_norm": 991.3875122070312, "learning_rate": 8.311097204196802e-05, "loss": 34.0639, "step": 13874 }, { "epoch": 36.645757675800596, "grad_norm": 1321.97900390625, "learning_rate": 8.307986928407188e-05, "loss": 34.0126, "step": 13875 }, { "epoch": 36.64839881148894, "grad_norm": 512.8470458984375, "learning_rate": 8.304877118732237e-05, "loss": 33.8919, "step": 13876 }, { "epoch": 36.65103994717729, "grad_norm": 2312.088134765625, "learning_rate": 8.301767775258784e-05, "loss": 33.712, "step": 13877 }, { "epoch": 36.65368108286563, "grad_norm": 2653.648193359375, "learning_rate": 8.298658898073646e-05, "loss": 34.7993, "step": 13878 }, { "epoch": 36.65632221855398, "grad_norm": 1109.0477294921875, "learning_rate": 8.295550487263658e-05, "loss": 37.2094, "step": 13879 }, { "epoch": 36.658963354242324, "grad_norm": 734.2352905273438, "learning_rate": 8.29244254291561e-05, "loss": 37.0527, "step": 13880 }, { "epoch": 36.66160448993067, "grad_norm": 2513.680908203125, "learning_rate": 8.289335065116291e-05, "loss": 39.8797, "step": 13881 }, { "epoch": 36.66424562561902, "grad_norm": 532.29052734375, "learning_rate": 8.286228053952471e-05, "loss": 37.9283, "step": 13882 }, { "epoch": 36.66688676130736, "grad_norm": 797.4213256835938, "learning_rate": 8.283121509510916e-05, "loss": 38.4507, "step": 13883 }, { "epoch": 36.66952789699571, "grad_norm": 782.6586303710938, "learning_rate": 8.280015431878371e-05, "loss": 38.3325, "step": 13884 }, { "epoch": 36.67216903268405, "grad_norm": 618.1929931640625, "learning_rate": 8.276909821141565e-05, "loss": 40.8922, "step": 13885 }, { "epoch": 36.6748101683724, "grad_norm": 436.31915283203125, "learning_rate": 8.273804677387239e-05, "loss": 41.713, "step": 13886 }, { "epoch": 36.677451304060746, "grad_norm": 1364.0440673828125, "learning_rate": 8.270700000702092e-05, "loss": 42.8853, "step": 13887 }, { "epoch": 36.68009243974909, "grad_norm": 627.8226318359375, "learning_rate": 8.26759579117281e-05, "loss": 40.5447, "step": 13888 }, { "epoch": 36.68273357543744, "grad_norm": 477.91656494140625, "learning_rate": 8.264492048886096e-05, "loss": 40.6649, "step": 13889 }, { "epoch": 36.68537471112578, "grad_norm": 787.3984985351562, "learning_rate": 8.26138877392861e-05, "loss": 38.6352, "step": 13890 }, { "epoch": 36.68801584681413, "grad_norm": 2277.562744140625, "learning_rate": 8.258285966387016e-05, "loss": 37.4101, "step": 13891 }, { "epoch": 36.690656982502475, "grad_norm": 595.8128051757812, "learning_rate": 8.255183626347939e-05, "loss": 37.9306, "step": 13892 }, { "epoch": 36.693298118190825, "grad_norm": 591.1538696289062, "learning_rate": 8.252081753898027e-05, "loss": 36.961, "step": 13893 }, { "epoch": 36.69593925387917, "grad_norm": 665.4630737304688, "learning_rate": 8.248980349123897e-05, "loss": 36.4842, "step": 13894 }, { "epoch": 36.69858038956751, "grad_norm": 677.4937744140625, "learning_rate": 8.245879412112142e-05, "loss": 35.9633, "step": 13895 }, { "epoch": 36.70122152525586, "grad_norm": 605.5159301757812, "learning_rate": 8.242778942949369e-05, "loss": 34.9519, "step": 13896 }, { "epoch": 36.7038626609442, "grad_norm": 1000.7980346679688, "learning_rate": 8.239678941722154e-05, "loss": 34.5501, "step": 13897 }, { "epoch": 36.70650379663255, "grad_norm": 472.9693298339844, "learning_rate": 8.236579408517056e-05, "loss": 34.9514, "step": 13898 }, { "epoch": 36.709144932320896, "grad_norm": 957.1096801757812, "learning_rate": 8.233480343420629e-05, "loss": 33.8988, "step": 13899 }, { "epoch": 36.711786068009246, "grad_norm": 572.7088012695312, "learning_rate": 8.23038174651942e-05, "loss": 35.5182, "step": 13900 }, { "epoch": 36.71442720369759, "grad_norm": 828.7753295898438, "learning_rate": 8.227283617899944e-05, "loss": 35.6198, "step": 13901 }, { "epoch": 36.71706833938594, "grad_norm": 741.8048095703125, "learning_rate": 8.224185957648714e-05, "loss": 35.1304, "step": 13902 }, { "epoch": 36.71970947507428, "grad_norm": 2814.216552734375, "learning_rate": 8.221088765852244e-05, "loss": 36.3339, "step": 13903 }, { "epoch": 36.722350610762625, "grad_norm": 353079.5, "learning_rate": 8.217992042597017e-05, "loss": 32.0307, "step": 13904 }, { "epoch": 36.724991746450975, "grad_norm": 852.795654296875, "learning_rate": 8.214895787969501e-05, "loss": 9.2166, "step": 13905 }, { "epoch": 36.72763288213932, "grad_norm": 595.8095703125, "learning_rate": 8.211800002056152e-05, "loss": 8.7284, "step": 13906 }, { "epoch": 36.73027401782767, "grad_norm": 4687.6982421875, "learning_rate": 8.208704684943433e-05, "loss": 8.1161, "step": 13907 }, { "epoch": 36.73291515351601, "grad_norm": 2114.16943359375, "learning_rate": 8.205609836717781e-05, "loss": 12.8289, "step": 13908 }, { "epoch": 36.73555628920436, "grad_norm": 3988.363037109375, "learning_rate": 8.202515457465595e-05, "loss": 18.3616, "step": 13909 }, { "epoch": 36.738197424892704, "grad_norm": 2145.366943359375, "learning_rate": 8.199421547273303e-05, "loss": 8.5051, "step": 13910 }, { "epoch": 36.74083856058105, "grad_norm": 1510.7841796875, "learning_rate": 8.196328106227297e-05, "loss": 9.8752, "step": 13911 }, { "epoch": 36.7434796962694, "grad_norm": 2740.090576171875, "learning_rate": 8.193235134413953e-05, "loss": 11.4265, "step": 13912 }, { "epoch": 36.74612083195774, "grad_norm": 2764.304443359375, "learning_rate": 8.190142631919642e-05, "loss": 14.9262, "step": 13913 }, { "epoch": 36.74876196764609, "grad_norm": 756.3966064453125, "learning_rate": 8.187050598830731e-05, "loss": 33.7849, "step": 13914 }, { "epoch": 36.75140310333443, "grad_norm": 1531.3077392578125, "learning_rate": 8.183959035233554e-05, "loss": 35.6204, "step": 13915 }, { "epoch": 36.75404423902278, "grad_norm": 981.7586059570312, "learning_rate": 8.180867941214442e-05, "loss": 33.5123, "step": 13916 }, { "epoch": 36.756685374711125, "grad_norm": 1021.7100830078125, "learning_rate": 8.177777316859714e-05, "loss": 34.7208, "step": 13917 }, { "epoch": 36.759326510399475, "grad_norm": 701.9812622070312, "learning_rate": 8.174687162255672e-05, "loss": 35.1285, "step": 13918 }, { "epoch": 36.76196764608782, "grad_norm": 849.3599853515625, "learning_rate": 8.171597477488607e-05, "loss": 34.4743, "step": 13919 }, { "epoch": 36.76460878177616, "grad_norm": 1172.038330078125, "learning_rate": 8.168508262644788e-05, "loss": 33.7749, "step": 13920 }, { "epoch": 36.76724991746451, "grad_norm": 1122.5787353515625, "learning_rate": 8.165419517810499e-05, "loss": 33.9758, "step": 13921 }, { "epoch": 36.769891053152854, "grad_norm": 629.694091796875, "learning_rate": 8.162331243071979e-05, "loss": 33.7308, "step": 13922 }, { "epoch": 36.772532188841204, "grad_norm": 1066.405517578125, "learning_rate": 8.15924343851546e-05, "loss": 35.5379, "step": 13923 }, { "epoch": 36.77517332452955, "grad_norm": 542.97802734375, "learning_rate": 8.156156104227186e-05, "loss": 35.369, "step": 13924 }, { "epoch": 36.7778144602179, "grad_norm": 1448.304931640625, "learning_rate": 8.153069240293353e-05, "loss": 33.6862, "step": 13925 }, { "epoch": 36.78045559590624, "grad_norm": 3104.246337890625, "learning_rate": 8.149982846800178e-05, "loss": 33.5725, "step": 13926 }, { "epoch": 36.78309673159458, "grad_norm": 2026.77197265625, "learning_rate": 8.146896923833816e-05, "loss": 34.9278, "step": 13927 }, { "epoch": 36.78573786728293, "grad_norm": 809.0256958007812, "learning_rate": 8.143811471480463e-05, "loss": 34.6174, "step": 13928 }, { "epoch": 36.788379002971276, "grad_norm": 724.649169921875, "learning_rate": 8.140726489826275e-05, "loss": 36.178, "step": 13929 }, { "epoch": 36.791020138659626, "grad_norm": 953.966064453125, "learning_rate": 8.137641978957389e-05, "loss": 39.5878, "step": 13930 }, { "epoch": 36.79366127434797, "grad_norm": 2036.260498046875, "learning_rate": 8.134557938959952e-05, "loss": 42.1346, "step": 13931 }, { "epoch": 36.79630241003632, "grad_norm": 769.2549438476562, "learning_rate": 8.131474369920078e-05, "loss": 37.7748, "step": 13932 }, { "epoch": 36.79894354572466, "grad_norm": 681.8550415039062, "learning_rate": 8.128391271923872e-05, "loss": 38.3997, "step": 13933 }, { "epoch": 36.801584681413004, "grad_norm": 658.2531127929688, "learning_rate": 8.125308645057431e-05, "loss": 39.4176, "step": 13934 }, { "epoch": 36.804225817101354, "grad_norm": 1082.5010986328125, "learning_rate": 8.122226489406831e-05, "loss": 40.5199, "step": 13935 }, { "epoch": 36.8068669527897, "grad_norm": 2456.20068359375, "learning_rate": 8.119144805058143e-05, "loss": 41.7625, "step": 13936 }, { "epoch": 36.80950808847805, "grad_norm": 797.8673095703125, "learning_rate": 8.116063592097411e-05, "loss": 39.2705, "step": 13937 }, { "epoch": 36.81214922416639, "grad_norm": 4140.51123046875, "learning_rate": 8.112982850610693e-05, "loss": 40.3029, "step": 13938 }, { "epoch": 36.81479035985474, "grad_norm": 1029.6600341796875, "learning_rate": 8.10990258068401e-05, "loss": 41.7214, "step": 13939 }, { "epoch": 36.81743149554308, "grad_norm": 583.0009155273438, "learning_rate": 8.106822782403376e-05, "loss": 39.9091, "step": 13940 }, { "epoch": 36.820072631231426, "grad_norm": 722.4261474609375, "learning_rate": 8.103743455854781e-05, "loss": 39.7795, "step": 13941 }, { "epoch": 36.822713766919776, "grad_norm": 771.69482421875, "learning_rate": 8.100664601124236e-05, "loss": 37.1845, "step": 13942 }, { "epoch": 36.82535490260812, "grad_norm": 882.6727294921875, "learning_rate": 8.097586218297712e-05, "loss": 38.0087, "step": 13943 }, { "epoch": 36.82799603829647, "grad_norm": 536.830322265625, "learning_rate": 8.094508307461146e-05, "loss": 36.3842, "step": 13944 }, { "epoch": 36.83063717398481, "grad_norm": 1372.9688720703125, "learning_rate": 8.091430868700516e-05, "loss": 35.1501, "step": 13945 }, { "epoch": 36.83327830967316, "grad_norm": 860.5783081054688, "learning_rate": 8.088353902101744e-05, "loss": 36.4055, "step": 13946 }, { "epoch": 36.835919445361505, "grad_norm": 1040.7816162109375, "learning_rate": 8.085277407750754e-05, "loss": 34.9911, "step": 13947 }, { "epoch": 36.838560581049855, "grad_norm": 2799.5625, "learning_rate": 8.08220138573345e-05, "loss": 35.2247, "step": 13948 }, { "epoch": 36.8412017167382, "grad_norm": 727.5560302734375, "learning_rate": 8.079125836135742e-05, "loss": 34.8173, "step": 13949 }, { "epoch": 36.84384285242654, "grad_norm": 1409.1636962890625, "learning_rate": 8.076050759043505e-05, "loss": 35.6617, "step": 13950 }, { "epoch": 36.84648398811489, "grad_norm": 950.9471435546875, "learning_rate": 8.07297615454261e-05, "loss": 35.687, "step": 13951 }, { "epoch": 36.84912512380323, "grad_norm": 8881.033203125, "learning_rate": 8.069902022718911e-05, "loss": 43.5923, "step": 13952 }, { "epoch": 36.85176625949158, "grad_norm": 3310.705078125, "learning_rate": 8.066828363658254e-05, "loss": 14.3121, "step": 13953 }, { "epoch": 36.854407395179926, "grad_norm": 7445.60205078125, "learning_rate": 8.063755177446467e-05, "loss": 8.8217, "step": 13954 }, { "epoch": 36.857048530868276, "grad_norm": 2612.7685546875, "learning_rate": 8.060682464169359e-05, "loss": 13.8419, "step": 13955 }, { "epoch": 36.85968966655662, "grad_norm": 3267.4501953125, "learning_rate": 8.05761022391275e-05, "loss": 13.3254, "step": 13956 }, { "epoch": 36.86233080224496, "grad_norm": 1791.8050537109375, "learning_rate": 8.054538456762425e-05, "loss": 10.0813, "step": 13957 }, { "epoch": 36.86497193793331, "grad_norm": 1972.7449951171875, "learning_rate": 8.05146716280415e-05, "loss": 15.5304, "step": 13958 }, { "epoch": 36.867613073621655, "grad_norm": 3405.941162109375, "learning_rate": 8.048396342123707e-05, "loss": 8.9885, "step": 13959 }, { "epoch": 36.870254209310005, "grad_norm": 681.5394287109375, "learning_rate": 8.045325994806837e-05, "loss": 9.486, "step": 13960 }, { "epoch": 36.87289534499835, "grad_norm": 1916.5260009765625, "learning_rate": 8.042256120939289e-05, "loss": 13.6982, "step": 13961 }, { "epoch": 36.8755364806867, "grad_norm": 916.0855102539062, "learning_rate": 8.039186720606761e-05, "loss": 15.8364, "step": 13962 }, { "epoch": 36.87817761637504, "grad_norm": 780.9562377929688, "learning_rate": 8.036117793894987e-05, "loss": 36.423, "step": 13963 }, { "epoch": 36.88081875206339, "grad_norm": 714.3316650390625, "learning_rate": 8.033049340889658e-05, "loss": 34.831, "step": 13964 }, { "epoch": 36.883459887751734, "grad_norm": 962.0294799804688, "learning_rate": 8.029981361676455e-05, "loss": 35.1973, "step": 13965 }, { "epoch": 36.88610102344008, "grad_norm": 547.8292236328125, "learning_rate": 8.026913856341058e-05, "loss": 35.1449, "step": 13966 }, { "epoch": 36.88874215912843, "grad_norm": 953.0751342773438, "learning_rate": 8.023846824969122e-05, "loss": 34.7469, "step": 13967 }, { "epoch": 36.89138329481677, "grad_norm": 825.6815185546875, "learning_rate": 8.020780267646291e-05, "loss": 34.4157, "step": 13968 }, { "epoch": 36.89402443050512, "grad_norm": 768.1284790039062, "learning_rate": 8.0177141844582e-05, "loss": 35.663, "step": 13969 }, { "epoch": 36.89666556619346, "grad_norm": 1503.934326171875, "learning_rate": 8.01464857549046e-05, "loss": 34.7432, "step": 13970 }, { "epoch": 36.89930670188181, "grad_norm": 1058.337890625, "learning_rate": 8.011583440828682e-05, "loss": 34.8174, "step": 13971 }, { "epoch": 36.901947837570155, "grad_norm": 1358.7847900390625, "learning_rate": 8.00851878055845e-05, "loss": 35.8028, "step": 13972 }, { "epoch": 36.9045889732585, "grad_norm": 957.9942626953125, "learning_rate": 8.005454594765357e-05, "loss": 35.2416, "step": 13973 }, { "epoch": 36.90723010894685, "grad_norm": 877.500732421875, "learning_rate": 8.002390883534963e-05, "loss": 34.5397, "step": 13974 }, { "epoch": 36.90987124463519, "grad_norm": 675.1293334960938, "learning_rate": 7.999327646952817e-05, "loss": 34.8048, "step": 13975 }, { "epoch": 36.91251238032354, "grad_norm": 1378.364990234375, "learning_rate": 7.996264885104451e-05, "loss": 34.2346, "step": 13976 }, { "epoch": 36.915153516011884, "grad_norm": 3925.623046875, "learning_rate": 7.993202598075408e-05, "loss": 35.0736, "step": 13977 }, { "epoch": 36.917794651700234, "grad_norm": 1436.093017578125, "learning_rate": 7.9901407859512e-05, "loss": 34.9215, "step": 13978 }, { "epoch": 36.92043578738858, "grad_norm": 1000.9903564453125, "learning_rate": 7.987079448817302e-05, "loss": 35.166, "step": 13979 }, { "epoch": 36.92307692307692, "grad_norm": 2432.685791015625, "learning_rate": 7.984018586759225e-05, "loss": 40.7798, "step": 13980 }, { "epoch": 36.92571805876527, "grad_norm": 4201.57958984375, "learning_rate": 7.980958199862434e-05, "loss": 39.2525, "step": 13981 }, { "epoch": 36.92835919445361, "grad_norm": 1066.9266357421875, "learning_rate": 7.977898288212387e-05, "loss": 39.0718, "step": 13982 }, { "epoch": 36.93100033014196, "grad_norm": 952.6471557617188, "learning_rate": 7.974838851894526e-05, "loss": 40.0937, "step": 13983 }, { "epoch": 36.933641465830306, "grad_norm": 885.3033447265625, "learning_rate": 7.971779890994293e-05, "loss": 41.1425, "step": 13984 }, { "epoch": 36.936282601518656, "grad_norm": 529.1693115234375, "learning_rate": 7.968721405597104e-05, "loss": 40.2311, "step": 13985 }, { "epoch": 36.938923737207, "grad_norm": 786.0187377929688, "learning_rate": 7.965663395788366e-05, "loss": 39.3363, "step": 13986 }, { "epoch": 36.94156487289534, "grad_norm": 1007.5599975585938, "learning_rate": 7.962605861653471e-05, "loss": 38.5476, "step": 13987 }, { "epoch": 36.94420600858369, "grad_norm": 710.4396362304688, "learning_rate": 7.9595488032778e-05, "loss": 37.1391, "step": 13988 }, { "epoch": 36.946847144272034, "grad_norm": 817.77978515625, "learning_rate": 7.956492220746717e-05, "loss": 37.3226, "step": 13989 }, { "epoch": 36.949488279960384, "grad_norm": 1126.65673828125, "learning_rate": 7.95343611414557e-05, "loss": 35.5164, "step": 13990 }, { "epoch": 36.95212941564873, "grad_norm": 953.8087158203125, "learning_rate": 7.950380483559714e-05, "loss": 35.0027, "step": 13991 }, { "epoch": 36.95477055133708, "grad_norm": 1454.6654052734375, "learning_rate": 7.947325329074465e-05, "loss": 34.5399, "step": 13992 }, { "epoch": 36.95741168702542, "grad_norm": 2876.8271484375, "learning_rate": 7.944270650775134e-05, "loss": 39.084, "step": 13993 }, { "epoch": 36.96005282271377, "grad_norm": 1279.9029541015625, "learning_rate": 7.941216448747032e-05, "loss": 19.9946, "step": 13994 }, { "epoch": 36.96269395840211, "grad_norm": 6536.57275390625, "learning_rate": 7.93816272307544e-05, "loss": 8.9836, "step": 13995 }, { "epoch": 36.965335094090456, "grad_norm": 5834.15234375, "learning_rate": 7.93510947384564e-05, "loss": 14.3502, "step": 13996 }, { "epoch": 36.967976229778806, "grad_norm": 1583.475830078125, "learning_rate": 7.932056701142867e-05, "loss": 12.8191, "step": 13997 }, { "epoch": 36.97061736546715, "grad_norm": 5050.07470703125, "learning_rate": 7.929004405052392e-05, "loss": 12.4494, "step": 13998 }, { "epoch": 36.9732585011555, "grad_norm": 839.2551879882812, "learning_rate": 7.925952585659443e-05, "loss": 27.462, "step": 13999 }, { "epoch": 36.97589963684384, "grad_norm": 996.971923828125, "learning_rate": 7.92290124304923e-05, "loss": 34.5547, "step": 14000 }, { "epoch": 36.97589963684384, "eval_loss": 3.7369279861450195, "eval_runtime": 2.0696, "eval_samples_per_second": 239.182, "eval_steps_per_second": 29.958, "step": 14000 }, { "epoch": 36.97854077253219, "grad_norm": 2213.029541015625, "learning_rate": 7.919850377306972e-05, "loss": 34.8921, "step": 14001 }, { "epoch": 36.981181908220535, "grad_norm": 2075.300048828125, "learning_rate": 7.916799988517864e-05, "loss": 34.7814, "step": 14002 }, { "epoch": 36.98382304390888, "grad_norm": 1403.61669921875, "learning_rate": 7.913750076767079e-05, "loss": 33.5817, "step": 14003 }, { "epoch": 36.98646417959723, "grad_norm": 924.9443969726562, "learning_rate": 7.910700642139784e-05, "loss": 34.1334, "step": 14004 }, { "epoch": 36.98910531528557, "grad_norm": 929.16357421875, "learning_rate": 7.907651684721138e-05, "loss": 34.7121, "step": 14005 }, { "epoch": 36.99174645097392, "grad_norm": 747.062255859375, "learning_rate": 7.904603204596278e-05, "loss": 34.2678, "step": 14006 }, { "epoch": 36.99438758666226, "grad_norm": 789.4495239257812, "learning_rate": 7.901555201850322e-05, "loss": 33.5252, "step": 14007 }, { "epoch": 36.99702872235061, "grad_norm": 750.4456787109375, "learning_rate": 7.898507676568401e-05, "loss": 34.8461, "step": 14008 }, { "epoch": 36.999669858038956, "grad_norm": 2525.612060546875, "learning_rate": 7.895460628835604e-05, "loss": 37.6554, "step": 14009 }, { "epoch": 37.002310993727306, "grad_norm": 705.176513671875, "learning_rate": 7.892414058737019e-05, "loss": 39.3099, "step": 14010 }, { "epoch": 37.00495212941565, "grad_norm": 858.4520263671875, "learning_rate": 7.889367966357724e-05, "loss": 39.8895, "step": 14011 }, { "epoch": 37.00759326510399, "grad_norm": 2639.95556640625, "learning_rate": 7.886322351782782e-05, "loss": 39.6178, "step": 14012 }, { "epoch": 37.01023440079234, "grad_norm": 860.589599609375, "learning_rate": 7.883277215097243e-05, "loss": 37.7934, "step": 14013 }, { "epoch": 37.012875536480685, "grad_norm": 1170.708740234375, "learning_rate": 7.880232556386113e-05, "loss": 44.0961, "step": 14014 }, { "epoch": 37.015516672169035, "grad_norm": 2027.6981201171875, "learning_rate": 7.877188375734443e-05, "loss": 40.67, "step": 14015 }, { "epoch": 37.01815780785738, "grad_norm": 1702.635498046875, "learning_rate": 7.874144673227229e-05, "loss": 39.3829, "step": 14016 }, { "epoch": 37.02079894354573, "grad_norm": 1001.1494140625, "learning_rate": 7.871101448949455e-05, "loss": 40.7864, "step": 14017 }, { "epoch": 37.02344007923407, "grad_norm": 618.6563110351562, "learning_rate": 7.868058702986122e-05, "loss": 38.0381, "step": 14018 }, { "epoch": 37.026081214922414, "grad_norm": 670.6005249023438, "learning_rate": 7.865016435422184e-05, "loss": 40.032, "step": 14019 }, { "epoch": 37.028722350610764, "grad_norm": 678.8272094726562, "learning_rate": 7.861974646342596e-05, "loss": 38.0877, "step": 14020 }, { "epoch": 37.03136348629911, "grad_norm": 684.6522827148438, "learning_rate": 7.858933335832299e-05, "loss": 38.5566, "step": 14021 }, { "epoch": 37.03400462198746, "grad_norm": 953.6554565429688, "learning_rate": 7.85589250397622e-05, "loss": 37.3866, "step": 14022 }, { "epoch": 37.0366457576758, "grad_norm": 717.177490234375, "learning_rate": 7.852852150859272e-05, "loss": 35.6993, "step": 14023 }, { "epoch": 37.03928689336415, "grad_norm": 720.747314453125, "learning_rate": 7.849812276566346e-05, "loss": 35.7611, "step": 14024 }, { "epoch": 37.04192802905249, "grad_norm": 787.9772338867188, "learning_rate": 7.846772881182345e-05, "loss": 35.5639, "step": 14025 }, { "epoch": 37.044569164740835, "grad_norm": 1038.64111328125, "learning_rate": 7.843733964792135e-05, "loss": 34.7873, "step": 14026 }, { "epoch": 37.047210300429185, "grad_norm": 814.0156860351562, "learning_rate": 7.840695527480579e-05, "loss": 35.5996, "step": 14027 }, { "epoch": 37.04985143611753, "grad_norm": 848.6982421875, "learning_rate": 7.837657569332509e-05, "loss": 34.9117, "step": 14028 }, { "epoch": 37.05249257180588, "grad_norm": 646.4021606445312, "learning_rate": 7.834620090432779e-05, "loss": 35.5705, "step": 14029 }, { "epoch": 37.05513370749422, "grad_norm": 720.3003540039062, "learning_rate": 7.831583090866198e-05, "loss": 34.1715, "step": 14030 }, { "epoch": 37.05777484318257, "grad_norm": 821.0743408203125, "learning_rate": 7.828546570717576e-05, "loss": 35.5541, "step": 14031 }, { "epoch": 37.060415978870914, "grad_norm": 2098.9150390625, "learning_rate": 7.825510530071703e-05, "loss": 19.2725, "step": 14032 }, { "epoch": 37.063057114559264, "grad_norm": 966.1151733398438, "learning_rate": 7.82247496901336e-05, "loss": 13.4118, "step": 14033 }, { "epoch": 37.06569825024761, "grad_norm": 3695.8017578125, "learning_rate": 7.819439887627313e-05, "loss": 9.3307, "step": 14034 }, { "epoch": 37.06833938593595, "grad_norm": 8129.12646484375, "learning_rate": 7.816405285998307e-05, "loss": 15.5502, "step": 14035 }, { "epoch": 37.0709805216243, "grad_norm": 3399.505615234375, "learning_rate": 7.813371164211097e-05, "loss": 12.8537, "step": 14036 }, { "epoch": 37.07362165731264, "grad_norm": 1361.7191162109375, "learning_rate": 7.8103375223504e-05, "loss": 12.1398, "step": 14037 }, { "epoch": 37.07626279300099, "grad_norm": 1107.2886962890625, "learning_rate": 7.80730436050093e-05, "loss": 8.6972, "step": 14038 }, { "epoch": 37.078903928689336, "grad_norm": 1648.70703125, "learning_rate": 7.804271678747388e-05, "loss": 10.3288, "step": 14039 }, { "epoch": 37.081545064377686, "grad_norm": 3014.6015625, "learning_rate": 7.801239477174458e-05, "loss": 14.7135, "step": 14040 }, { "epoch": 37.08418620006603, "grad_norm": 1154.058837890625, "learning_rate": 7.798207755866812e-05, "loss": 35.5636, "step": 14041 }, { "epoch": 37.08682733575437, "grad_norm": 780.65673828125, "learning_rate": 7.795176514909099e-05, "loss": 35.1418, "step": 14042 }, { "epoch": 37.08946847144272, "grad_norm": 1104.7000732421875, "learning_rate": 7.792145754385987e-05, "loss": 32.6374, "step": 14043 }, { "epoch": 37.092109607131064, "grad_norm": 1512.5023193359375, "learning_rate": 7.789115474382094e-05, "loss": 35.8262, "step": 14044 }, { "epoch": 37.094750742819414, "grad_norm": 2008.286865234375, "learning_rate": 7.786085674982036e-05, "loss": 36.1864, "step": 14045 }, { "epoch": 37.09739187850776, "grad_norm": 1196.125732421875, "learning_rate": 7.783056356270427e-05, "loss": 34.4491, "step": 14046 }, { "epoch": 37.10003301419611, "grad_norm": 887.6007690429688, "learning_rate": 7.780027518331859e-05, "loss": 35.0406, "step": 14047 }, { "epoch": 37.10267414988445, "grad_norm": 876.4890747070312, "learning_rate": 7.776999161250914e-05, "loss": 34.6988, "step": 14048 }, { "epoch": 37.10531528557279, "grad_norm": 1282.1824951171875, "learning_rate": 7.773971285112133e-05, "loss": 33.9152, "step": 14049 }, { "epoch": 37.10795642126114, "grad_norm": 1067.9130859375, "learning_rate": 7.770943890000092e-05, "loss": 33.252, "step": 14050 }, { "epoch": 37.110597556949486, "grad_norm": 4682.6826171875, "learning_rate": 7.767916975999325e-05, "loss": 35.3456, "step": 14051 }, { "epoch": 37.113238692637836, "grad_norm": 2404.468017578125, "learning_rate": 7.764890543194345e-05, "loss": 35.1854, "step": 14052 }, { "epoch": 37.11587982832618, "grad_norm": 1181.994140625, "learning_rate": 7.761864591669678e-05, "loss": 34.736, "step": 14053 }, { "epoch": 37.11852096401453, "grad_norm": 1282.2266845703125, "learning_rate": 7.75883912150982e-05, "loss": 33.9778, "step": 14054 }, { "epoch": 37.12116209970287, "grad_norm": 1251.105712890625, "learning_rate": 7.755814132799249e-05, "loss": 33.1782, "step": 14055 }, { "epoch": 37.12380323539122, "grad_norm": 1873.412841796875, "learning_rate": 7.752789625622439e-05, "loss": 33.8668, "step": 14056 }, { "epoch": 37.126444371079565, "grad_norm": 847.3369140625, "learning_rate": 7.749765600063846e-05, "loss": 34.5293, "step": 14057 }, { "epoch": 37.12908550676791, "grad_norm": 2827.137939453125, "learning_rate": 7.746742056207915e-05, "loss": 35.6146, "step": 14058 }, { "epoch": 37.13172664245626, "grad_norm": 8989.345703125, "learning_rate": 7.743718994139071e-05, "loss": 40.1422, "step": 14059 }, { "epoch": 37.1343677781446, "grad_norm": 947.2800903320312, "learning_rate": 7.740696413941744e-05, "loss": 40.0063, "step": 14060 }, { "epoch": 37.13700891383295, "grad_norm": 2061.84130859375, "learning_rate": 7.73767431570033e-05, "loss": 38.3716, "step": 14061 }, { "epoch": 37.13965004952129, "grad_norm": 1516.275390625, "learning_rate": 7.734652699499223e-05, "loss": 38.1739, "step": 14062 }, { "epoch": 37.14229118520964, "grad_norm": 632.6306762695312, "learning_rate": 7.731631565422787e-05, "loss": 38.7566, "step": 14063 }, { "epoch": 37.144932320897986, "grad_norm": 1135.9561767578125, "learning_rate": 7.728610913555403e-05, "loss": 40.2384, "step": 14064 }, { "epoch": 37.14757345658633, "grad_norm": 1179.9559326171875, "learning_rate": 7.725590743981417e-05, "loss": 41.9543, "step": 14065 }, { "epoch": 37.15021459227468, "grad_norm": 643.754150390625, "learning_rate": 7.722571056785161e-05, "loss": 41.7563, "step": 14066 }, { "epoch": 37.15285572796302, "grad_norm": 1359.2686767578125, "learning_rate": 7.719551852050954e-05, "loss": 40.5387, "step": 14067 }, { "epoch": 37.15549686365137, "grad_norm": 977.1807250976562, "learning_rate": 7.716533129863116e-05, "loss": 39.8193, "step": 14068 }, { "epoch": 37.158137999339715, "grad_norm": 1563.7459716796875, "learning_rate": 7.713514890305937e-05, "loss": 37.7302, "step": 14069 }, { "epoch": 37.160779135028065, "grad_norm": 1604.111083984375, "learning_rate": 7.71049713346369e-05, "loss": 38.2591, "step": 14070 }, { "epoch": 37.16342027071641, "grad_norm": 1013.3499145507812, "learning_rate": 7.70747985942066e-05, "loss": 37.5789, "step": 14071 }, { "epoch": 37.16606140640475, "grad_norm": 967.4835815429688, "learning_rate": 7.7044630682611e-05, "loss": 37.0226, "step": 14072 }, { "epoch": 37.1687025420931, "grad_norm": 713.0668334960938, "learning_rate": 7.70144676006925e-05, "loss": 35.2701, "step": 14073 }, { "epoch": 37.171343677781444, "grad_norm": 824.071533203125, "learning_rate": 7.698430934929334e-05, "loss": 36.086, "step": 14074 }, { "epoch": 37.173984813469794, "grad_norm": 867.686279296875, "learning_rate": 7.695415592925572e-05, "loss": 36.0996, "step": 14075 }, { "epoch": 37.17662594915814, "grad_norm": 2312.7197265625, "learning_rate": 7.692400734142166e-05, "loss": 35.8777, "step": 14076 }, { "epoch": 37.17926708484649, "grad_norm": 1648.4940185546875, "learning_rate": 7.689386358663295e-05, "loss": 33.7111, "step": 14077 }, { "epoch": 37.18190822053483, "grad_norm": 688.9842529296875, "learning_rate": 7.686372466573146e-05, "loss": 34.8995, "step": 14078 }, { "epoch": 37.18454935622318, "grad_norm": 1261.515869140625, "learning_rate": 7.683359057955877e-05, "loss": 33.6177, "step": 14079 }, { "epoch": 37.18719049191152, "grad_norm": 1463.791748046875, "learning_rate": 7.680346132895624e-05, "loss": 35.1515, "step": 14080 }, { "epoch": 37.189831627599865, "grad_norm": 4436.07861328125, "learning_rate": 7.677333691476543e-05, "loss": 41.061, "step": 14081 }, { "epoch": 37.192472763288215, "grad_norm": 17029.53125, "learning_rate": 7.67432173378274e-05, "loss": 12.9612, "step": 14082 }, { "epoch": 37.19511389897656, "grad_norm": 14901.5419921875, "learning_rate": 7.671310259898334e-05, "loss": 13.2916, "step": 14083 }, { "epoch": 37.19775503466491, "grad_norm": 2668.54052734375, "learning_rate": 7.668299269907391e-05, "loss": 11.4851, "step": 14084 }, { "epoch": 37.20039617035325, "grad_norm": 2733.287353515625, "learning_rate": 7.665288763894021e-05, "loss": 12.5035, "step": 14085 }, { "epoch": 37.2030373060416, "grad_norm": 1288.6529541015625, "learning_rate": 7.662278741942277e-05, "loss": 15.7516, "step": 14086 }, { "epoch": 37.205678441729944, "grad_norm": 26944.208984375, "learning_rate": 7.65926920413621e-05, "loss": 12.0167, "step": 14087 }, { "epoch": 37.20831957741829, "grad_norm": 2341.59765625, "learning_rate": 7.656260150559868e-05, "loss": 15.5938, "step": 14088 }, { "epoch": 37.21096071310664, "grad_norm": 38705.45703125, "learning_rate": 7.653251581297274e-05, "loss": 10.8047, "step": 14089 }, { "epoch": 37.21360184879498, "grad_norm": 13394.296875, "learning_rate": 7.650243496432441e-05, "loss": 13.1117, "step": 14090 }, { "epoch": 37.21624298448333, "grad_norm": 1284.0643310546875, "learning_rate": 7.647235896049365e-05, "loss": 18.4264, "step": 14091 }, { "epoch": 37.21888412017167, "grad_norm": 1406.049560546875, "learning_rate": 7.644228780232034e-05, "loss": 34.5621, "step": 14092 }, { "epoch": 37.22152525586002, "grad_norm": 1372.2982177734375, "learning_rate": 7.64122214906442e-05, "loss": 35.2792, "step": 14093 }, { "epoch": 37.224166391548366, "grad_norm": 1453.92138671875, "learning_rate": 7.638216002630468e-05, "loss": 34.2374, "step": 14094 }, { "epoch": 37.22680752723671, "grad_norm": 5032.0830078125, "learning_rate": 7.635210341014148e-05, "loss": 34.0588, "step": 14095 }, { "epoch": 37.22944866292506, "grad_norm": 2277.594970703125, "learning_rate": 7.632205164299377e-05, "loss": 34.043, "step": 14096 }, { "epoch": 37.2320897986134, "grad_norm": 1687.801025390625, "learning_rate": 7.629200472570074e-05, "loss": 34.2752, "step": 14097 }, { "epoch": 37.23473093430175, "grad_norm": 897.3594360351562, "learning_rate": 7.626196265910138e-05, "loss": 36.9463, "step": 14098 }, { "epoch": 37.237372069990094, "grad_norm": 1845.904296875, "learning_rate": 7.62319254440347e-05, "loss": 35.3972, "step": 14099 }, { "epoch": 37.240013205678444, "grad_norm": 1420.0401611328125, "learning_rate": 7.620189308133943e-05, "loss": 34.5749, "step": 14100 }, { "epoch": 37.24265434136679, "grad_norm": 1651.689697265625, "learning_rate": 7.617186557185421e-05, "loss": 33.469, "step": 14101 }, { "epoch": 37.24529547705514, "grad_norm": 524.1572875976562, "learning_rate": 7.614184291641755e-05, "loss": 35.3746, "step": 14102 }, { "epoch": 37.24793661274348, "grad_norm": 1605.9893798828125, "learning_rate": 7.611182511586775e-05, "loss": 35.0756, "step": 14103 }, { "epoch": 37.25057774843182, "grad_norm": 1785.6373291015625, "learning_rate": 7.608181217104312e-05, "loss": 35.3255, "step": 14104 }, { "epoch": 37.25321888412017, "grad_norm": 1026.7177734375, "learning_rate": 7.605180408278165e-05, "loss": 34.1481, "step": 14105 }, { "epoch": 37.255860019808516, "grad_norm": 1285.9844970703125, "learning_rate": 7.602180085192142e-05, "loss": 33.8164, "step": 14106 }, { "epoch": 37.258501155496866, "grad_norm": 1628.3199462890625, "learning_rate": 7.599180247930021e-05, "loss": 35.7771, "step": 14107 }, { "epoch": 37.26114229118521, "grad_norm": 4657.51708984375, "learning_rate": 7.59618089657557e-05, "loss": 35.6822, "step": 14108 }, { "epoch": 37.26378342687356, "grad_norm": 1175.432861328125, "learning_rate": 7.593182031212545e-05, "loss": 37.6338, "step": 14109 }, { "epoch": 37.2664245625619, "grad_norm": 864.54443359375, "learning_rate": 7.590183651924685e-05, "loss": 37.762, "step": 14110 }, { "epoch": 37.269065698250245, "grad_norm": 869.0571899414062, "learning_rate": 7.58718575879572e-05, "loss": 38.2302, "step": 14111 }, { "epoch": 37.271706833938595, "grad_norm": 1239.978271484375, "learning_rate": 7.584188351909355e-05, "loss": 37.9914, "step": 14112 }, { "epoch": 37.27434796962694, "grad_norm": 901.9515991210938, "learning_rate": 7.58119143134931e-05, "loss": 39.4569, "step": 14113 }, { "epoch": 37.27698910531529, "grad_norm": 920.8353881835938, "learning_rate": 7.578194997199259e-05, "loss": 41.7444, "step": 14114 }, { "epoch": 37.27963024100363, "grad_norm": 1262.8443603515625, "learning_rate": 7.575199049542872e-05, "loss": 41.773, "step": 14115 }, { "epoch": 37.28227137669198, "grad_norm": 1071.636474609375, "learning_rate": 7.572203588463824e-05, "loss": 40.6411, "step": 14116 }, { "epoch": 37.28491251238032, "grad_norm": 1145.75537109375, "learning_rate": 7.569208614045753e-05, "loss": 40.2085, "step": 14117 }, { "epoch": 37.287553648068666, "grad_norm": 964.756591796875, "learning_rate": 7.5662141263723e-05, "loss": 40.1936, "step": 14118 }, { "epoch": 37.290194783757016, "grad_norm": 1015.2573852539062, "learning_rate": 7.56322012552706e-05, "loss": 40.1734, "step": 14119 }, { "epoch": 37.29283591944536, "grad_norm": 1315.468994140625, "learning_rate": 7.560226611593662e-05, "loss": 39.415, "step": 14120 }, { "epoch": 37.29547705513371, "grad_norm": 1393.19775390625, "learning_rate": 7.557233584655693e-05, "loss": 37.7416, "step": 14121 }, { "epoch": 37.29811819082205, "grad_norm": 1664.414794921875, "learning_rate": 7.554241044796723e-05, "loss": 36.645, "step": 14122 }, { "epoch": 37.3007593265104, "grad_norm": 1927.3265380859375, "learning_rate": 7.55124899210033e-05, "loss": 35.6094, "step": 14123 }, { "epoch": 37.303400462198745, "grad_norm": 1618.869873046875, "learning_rate": 7.548257426650057e-05, "loss": 35.5025, "step": 14124 }, { "epoch": 37.306041597887095, "grad_norm": 1562.514404296875, "learning_rate": 7.545266348529445e-05, "loss": 36.0315, "step": 14125 }, { "epoch": 37.30868273357544, "grad_norm": 1407.086181640625, "learning_rate": 7.542275757822018e-05, "loss": 34.8448, "step": 14126 }, { "epoch": 37.31132386926378, "grad_norm": 1052.751708984375, "learning_rate": 7.539285654611283e-05, "loss": 34.7564, "step": 14127 }, { "epoch": 37.31396500495213, "grad_norm": 1027.902099609375, "learning_rate": 7.536296038980742e-05, "loss": 34.5685, "step": 14128 }, { "epoch": 37.316606140640474, "grad_norm": 1580.622314453125, "learning_rate": 7.533306911013865e-05, "loss": 35.7166, "step": 14129 }, { "epoch": 37.319247276328824, "grad_norm": 2724.837646484375, "learning_rate": 7.53031827079414e-05, "loss": 39.6456, "step": 14130 }, { "epoch": 37.32188841201717, "grad_norm": 29852.130859375, "learning_rate": 7.527330118405015e-05, "loss": 26.8561, "step": 14131 }, { "epoch": 37.32452954770552, "grad_norm": 1171.745361328125, "learning_rate": 7.524342453929936e-05, "loss": 11.586, "step": 14132 }, { "epoch": 37.32717068339386, "grad_norm": 7468.90673828125, "learning_rate": 7.521355277452316e-05, "loss": 10.3545, "step": 14133 }, { "epoch": 37.3298118190822, "grad_norm": 1990.335205078125, "learning_rate": 7.518368589055594e-05, "loss": 19.6527, "step": 14134 }, { "epoch": 37.33245295477055, "grad_norm": 38434.203125, "learning_rate": 7.515382388823155e-05, "loss": 9.755, "step": 14135 }, { "epoch": 37.335094090458895, "grad_norm": 10645.6259765625, "learning_rate": 7.512396676838395e-05, "loss": 15.4503, "step": 14136 }, { "epoch": 37.337735226147245, "grad_norm": 4885.7705078125, "learning_rate": 7.509411453184687e-05, "loss": 11.3497, "step": 14137 }, { "epoch": 37.34037636183559, "grad_norm": 8463.2646484375, "learning_rate": 7.506426717945387e-05, "loss": 9.213, "step": 14138 }, { "epoch": 37.34301749752394, "grad_norm": 1691.0218505859375, "learning_rate": 7.503442471203837e-05, "loss": 13.1631, "step": 14139 }, { "epoch": 37.34565863321228, "grad_norm": 11715.23828125, "learning_rate": 7.500458713043385e-05, "loss": 12.97, "step": 14140 }, { "epoch": 37.348299768900624, "grad_norm": 858.4462890625, "learning_rate": 7.497475443547343e-05, "loss": 34.2838, "step": 14141 }, { "epoch": 37.350940904588974, "grad_norm": 1697.062255859375, "learning_rate": 7.49449266279902e-05, "loss": 34.0475, "step": 14142 }, { "epoch": 37.35358204027732, "grad_norm": 1358.3497314453125, "learning_rate": 7.491510370881704e-05, "loss": 36.2194, "step": 14143 }, { "epoch": 37.35622317596567, "grad_norm": 1423.8048095703125, "learning_rate": 7.488528567878678e-05, "loss": 33.7049, "step": 14144 }, { "epoch": 37.35886431165401, "grad_norm": 3635.572265625, "learning_rate": 7.485547253873203e-05, "loss": 34.2197, "step": 14145 }, { "epoch": 37.36150544734236, "grad_norm": 1479.1654052734375, "learning_rate": 7.482566428948525e-05, "loss": 35.7255, "step": 14146 }, { "epoch": 37.3641465830307, "grad_norm": 1469.2357177734375, "learning_rate": 7.479586093187899e-05, "loss": 34.4602, "step": 14147 }, { "epoch": 37.36678771871905, "grad_norm": 2374.066650390625, "learning_rate": 7.47660624667454e-05, "loss": 34.2485, "step": 14148 }, { "epoch": 37.369428854407396, "grad_norm": 1246.513671875, "learning_rate": 7.473626889491656e-05, "loss": 33.7772, "step": 14149 }, { "epoch": 37.37206999009574, "grad_norm": 1728.3914794921875, "learning_rate": 7.470648021722434e-05, "loss": 34.2428, "step": 14150 }, { "epoch": 37.37471112578409, "grad_norm": 1456.3873291015625, "learning_rate": 7.467669643450081e-05, "loss": 35.0649, "step": 14151 }, { "epoch": 37.37735226147243, "grad_norm": 1260.7125244140625, "learning_rate": 7.464691754757755e-05, "loss": 34.3666, "step": 14152 }, { "epoch": 37.37999339716078, "grad_norm": 1575.5753173828125, "learning_rate": 7.461714355728607e-05, "loss": 33.882, "step": 14153 }, { "epoch": 37.382634532849124, "grad_norm": 973.6550903320312, "learning_rate": 7.458737446445785e-05, "loss": 33.79, "step": 14154 }, { "epoch": 37.385275668537474, "grad_norm": 2463.748046875, "learning_rate": 7.455761026992417e-05, "loss": 33.791, "step": 14155 }, { "epoch": 37.38791680422582, "grad_norm": 14167.1162109375, "learning_rate": 7.452785097451614e-05, "loss": 34.8371, "step": 14156 }, { "epoch": 37.39055793991416, "grad_norm": 3488.39111328125, "learning_rate": 7.449809657906472e-05, "loss": 36.4758, "step": 14157 }, { "epoch": 37.39319907560251, "grad_norm": 3699.689697265625, "learning_rate": 7.446834708440095e-05, "loss": 37.1003, "step": 14158 }, { "epoch": 37.39584021129085, "grad_norm": 3831.498291015625, "learning_rate": 7.443860249135548e-05, "loss": 39.7994, "step": 14159 }, { "epoch": 37.3984813469792, "grad_norm": 4598.2041015625, "learning_rate": 7.440886280075887e-05, "loss": 40.5772, "step": 14160 }, { "epoch": 37.401122482667546, "grad_norm": 1001.9461669921875, "learning_rate": 7.437912801344166e-05, "loss": 39.1456, "step": 14161 }, { "epoch": 37.403763618355896, "grad_norm": 1555.28125, "learning_rate": 7.434939813023409e-05, "loss": 38.3039, "step": 14162 }, { "epoch": 37.40640475404424, "grad_norm": 2197.11181640625, "learning_rate": 7.431967315196641e-05, "loss": 38.5786, "step": 14163 }, { "epoch": 37.40904588973258, "grad_norm": 1323.77294921875, "learning_rate": 7.42899530794686e-05, "loss": 42.293, "step": 14164 }, { "epoch": 37.41168702542093, "grad_norm": 1457.496826171875, "learning_rate": 7.42602379135707e-05, "loss": 40.2753, "step": 14165 }, { "epoch": 37.414328161109275, "grad_norm": 1683.6201171875, "learning_rate": 7.42305276551024e-05, "loss": 41.2458, "step": 14166 }, { "epoch": 37.416969296797625, "grad_norm": 1575.1400146484375, "learning_rate": 7.420082230489333e-05, "loss": 39.5219, "step": 14167 }, { "epoch": 37.41961043248597, "grad_norm": 1919.25146484375, "learning_rate": 7.417112186377307e-05, "loss": 37.2742, "step": 14168 }, { "epoch": 37.42225156817432, "grad_norm": 2048.66943359375, "learning_rate": 7.414142633257095e-05, "loss": 37.7856, "step": 14169 }, { "epoch": 37.42489270386266, "grad_norm": 1369.6123046875, "learning_rate": 7.411173571211621e-05, "loss": 37.4206, "step": 14170 }, { "epoch": 37.42753383955101, "grad_norm": 931.61181640625, "learning_rate": 7.408205000323792e-05, "loss": 37.6878, "step": 14171 }, { "epoch": 37.43017497523935, "grad_norm": 1552.632568359375, "learning_rate": 7.405236920676503e-05, "loss": 36.5531, "step": 14172 }, { "epoch": 37.432816110927696, "grad_norm": 952.3462524414062, "learning_rate": 7.402269332352637e-05, "loss": 35.5391, "step": 14173 }, { "epoch": 37.435457246616046, "grad_norm": 1050.5045166015625, "learning_rate": 7.399302235435057e-05, "loss": 34.0828, "step": 14174 }, { "epoch": 37.43809838230439, "grad_norm": 1324.090576171875, "learning_rate": 7.396335630006628e-05, "loss": 35.8911, "step": 14175 }, { "epoch": 37.44073951799274, "grad_norm": 2438.11279296875, "learning_rate": 7.393369516150189e-05, "loss": 34.0581, "step": 14176 }, { "epoch": 37.44338065368108, "grad_norm": 1082.9385986328125, "learning_rate": 7.39040389394856e-05, "loss": 35.2615, "step": 14177 }, { "epoch": 37.44602178936943, "grad_norm": 2161.267333984375, "learning_rate": 7.387438763484561e-05, "loss": 36.3704, "step": 14178 }, { "epoch": 37.448662925057775, "grad_norm": 2267.988037109375, "learning_rate": 7.384474124840987e-05, "loss": 34.6865, "step": 14179 }, { "epoch": 37.45130406074612, "grad_norm": 2978.968017578125, "learning_rate": 7.381509978100626e-05, "loss": 34.8583, "step": 14180 }, { "epoch": 37.45394519643447, "grad_norm": 19340.654296875, "learning_rate": 7.378546323346238e-05, "loss": 39.9394, "step": 14181 }, { "epoch": 37.45658633212281, "grad_norm": 1316.4384765625, "learning_rate": 7.375583160660603e-05, "loss": 24.0328, "step": 14182 }, { "epoch": 37.45922746781116, "grad_norm": 1529.8470458984375, "learning_rate": 7.372620490126455e-05, "loss": 8.9858, "step": 14183 }, { "epoch": 37.461868603499504, "grad_norm": 2966.87548828125, "learning_rate": 7.369658311826524e-05, "loss": 12.9275, "step": 14184 }, { "epoch": 37.464509739187854, "grad_norm": 3256.978515625, "learning_rate": 7.366696625843523e-05, "loss": 12.7113, "step": 14185 }, { "epoch": 37.4671508748762, "grad_norm": 3666.703857421875, "learning_rate": 7.363735432260165e-05, "loss": 12.4043, "step": 14186 }, { "epoch": 37.46979201056454, "grad_norm": 2267.09423828125, "learning_rate": 7.360774731159137e-05, "loss": 13.1207, "step": 14187 }, { "epoch": 37.47243314625289, "grad_norm": 2650.30810546875, "learning_rate": 7.35781452262311e-05, "loss": 13.8625, "step": 14188 }, { "epoch": 37.47507428194123, "grad_norm": 1141.9903564453125, "learning_rate": 7.354854806734751e-05, "loss": 10.119, "step": 14189 }, { "epoch": 37.47771541762958, "grad_norm": 1756.3819580078125, "learning_rate": 7.351895583576706e-05, "loss": 10.2941, "step": 14190 }, { "epoch": 37.480356553317925, "grad_norm": 1692.926513671875, "learning_rate": 7.348936853231611e-05, "loss": 10.1915, "step": 14191 }, { "epoch": 37.482997689006275, "grad_norm": 1194.1451416015625, "learning_rate": 7.345978615782076e-05, "loss": 31.2121, "step": 14192 }, { "epoch": 37.48563882469462, "grad_norm": 1964.5390625, "learning_rate": 7.343020871310727e-05, "loss": 35.4489, "step": 14193 }, { "epoch": 37.48827996038297, "grad_norm": 1389.904052734375, "learning_rate": 7.340063619900148e-05, "loss": 35.2508, "step": 14194 }, { "epoch": 37.49092109607131, "grad_norm": 1736.1693115234375, "learning_rate": 7.337106861632917e-05, "loss": 33.1514, "step": 14195 }, { "epoch": 37.493562231759654, "grad_norm": 1310.043701171875, "learning_rate": 7.334150596591599e-05, "loss": 34.0161, "step": 14196 }, { "epoch": 37.496203367448004, "grad_norm": 2724.99072265625, "learning_rate": 7.33119482485875e-05, "loss": 34.3346, "step": 14197 }, { "epoch": 37.49884450313635, "grad_norm": 1302.1007080078125, "learning_rate": 7.328239546516907e-05, "loss": 34.6681, "step": 14198 }, { "epoch": 37.5014856388247, "grad_norm": 2144.34521484375, "learning_rate": 7.325284761648588e-05, "loss": 34.2735, "step": 14199 }, { "epoch": 37.50412677451304, "grad_norm": 1135.2862548828125, "learning_rate": 7.322330470336314e-05, "loss": 34.5162, "step": 14200 }, { "epoch": 37.50412677451304, "eval_loss": 3.699526309967041, "eval_runtime": 2.1973, "eval_samples_per_second": 225.272, "eval_steps_per_second": 28.216, "step": 14200 }, { "epoch": 37.50676791020139, "grad_norm": 2232.03173828125, "learning_rate": 7.319376672662578e-05, "loss": 35.8124, "step": 14201 }, { "epoch": 37.50940904588973, "grad_norm": 2626.400390625, "learning_rate": 7.316423368709854e-05, "loss": 35.0191, "step": 14202 }, { "epoch": 37.512050181578076, "grad_norm": 2557.93994140625, "learning_rate": 7.313470558560628e-05, "loss": 34.7925, "step": 14203 }, { "epoch": 37.514691317266426, "grad_norm": 2540.18310546875, "learning_rate": 7.310518242297348e-05, "loss": 36.1457, "step": 14204 }, { "epoch": 37.51733245295477, "grad_norm": 2542.1005859375, "learning_rate": 7.307566420002452e-05, "loss": 33.6829, "step": 14205 }, { "epoch": 37.51997358864312, "grad_norm": 1048.22314453125, "learning_rate": 7.304615091758373e-05, "loss": 34.1618, "step": 14206 }, { "epoch": 37.52261472433146, "grad_norm": 1892.854248046875, "learning_rate": 7.301664257647522e-05, "loss": 36.1369, "step": 14207 }, { "epoch": 37.52525586001981, "grad_norm": 1533.629638671875, "learning_rate": 7.2987139177523e-05, "loss": 35.5717, "step": 14208 }, { "epoch": 37.527896995708154, "grad_norm": 2411.619140625, "learning_rate": 7.295764072155083e-05, "loss": 38.3563, "step": 14209 }, { "epoch": 37.5305381313965, "grad_norm": 2726.40576171875, "learning_rate": 7.292814720938268e-05, "loss": 39.1426, "step": 14210 }, { "epoch": 37.53317926708485, "grad_norm": 1513.273193359375, "learning_rate": 7.289865864184195e-05, "loss": 39.0354, "step": 14211 }, { "epoch": 37.53582040277319, "grad_norm": 617.4544677734375, "learning_rate": 7.286917501975219e-05, "loss": 37.9772, "step": 14212 }, { "epoch": 37.53846153846154, "grad_norm": 1233.81689453125, "learning_rate": 7.283969634393661e-05, "loss": 39.4909, "step": 14213 }, { "epoch": 37.54110267414988, "grad_norm": 637.93017578125, "learning_rate": 7.281022261521847e-05, "loss": 42.6863, "step": 14214 }, { "epoch": 37.54374380983823, "grad_norm": 1920.826416015625, "learning_rate": 7.27807538344208e-05, "loss": 40.9179, "step": 14215 }, { "epoch": 37.546384945526576, "grad_norm": 1124.5804443359375, "learning_rate": 7.275129000236635e-05, "loss": 43.1404, "step": 14216 }, { "epoch": 37.549026081214926, "grad_norm": 1362.60205078125, "learning_rate": 7.272183111987809e-05, "loss": 40.5433, "step": 14217 }, { "epoch": 37.55166721690327, "grad_norm": 1578.0030517578125, "learning_rate": 7.269237718777857e-05, "loss": 40.1201, "step": 14218 }, { "epoch": 37.55430835259161, "grad_norm": 1010.3552856445312, "learning_rate": 7.266292820689027e-05, "loss": 39.4803, "step": 14219 }, { "epoch": 37.55694948827996, "grad_norm": 1682.6763916015625, "learning_rate": 7.263348417803544e-05, "loss": 37.8342, "step": 14220 }, { "epoch": 37.559590623968305, "grad_norm": 2987.505859375, "learning_rate": 7.260404510203647e-05, "loss": 36.5273, "step": 14221 }, { "epoch": 37.562231759656655, "grad_norm": 2735.641357421875, "learning_rate": 7.25746109797153e-05, "loss": 37.2918, "step": 14222 }, { "epoch": 37.564872895345, "grad_norm": 1968.51806640625, "learning_rate": 7.25451818118939e-05, "loss": 36.292, "step": 14223 }, { "epoch": 37.56751403103335, "grad_norm": 1322.1622314453125, "learning_rate": 7.251575759939407e-05, "loss": 35.4653, "step": 14224 }, { "epoch": 37.57015516672169, "grad_norm": 2937.744140625, "learning_rate": 7.248633834303745e-05, "loss": 35.2096, "step": 14225 }, { "epoch": 37.57279630241003, "grad_norm": 892.0851440429688, "learning_rate": 7.245692404364554e-05, "loss": 35.8105, "step": 14226 }, { "epoch": 37.57543743809838, "grad_norm": 4065.121826171875, "learning_rate": 7.242751470203965e-05, "loss": 34.654, "step": 14227 }, { "epoch": 37.578078573786726, "grad_norm": 1219.9869384765625, "learning_rate": 7.23981103190412e-05, "loss": 34.1409, "step": 14228 }, { "epoch": 37.580719709475076, "grad_norm": 802.9368896484375, "learning_rate": 7.236871089547117e-05, "loss": 34.6267, "step": 14229 }, { "epoch": 37.58336084516342, "grad_norm": 1651.718994140625, "learning_rate": 7.233931643215056e-05, "loss": 34.5672, "step": 14230 }, { "epoch": 37.58600198085177, "grad_norm": 1074.2928466796875, "learning_rate": 7.230992692990016e-05, "loss": 36.0894, "step": 14231 }, { "epoch": 37.58864311654011, "grad_norm": 48455.8046875, "learning_rate": 7.228054238954068e-05, "loss": 35.2146, "step": 14232 }, { "epoch": 37.591284252228455, "grad_norm": 2773.164306640625, "learning_rate": 7.225116281189264e-05, "loss": 10.8768, "step": 14233 }, { "epoch": 37.593925387916805, "grad_norm": 1140.915771484375, "learning_rate": 7.222178819777641e-05, "loss": 17.5853, "step": 14234 }, { "epoch": 37.59656652360515, "grad_norm": 1089.632568359375, "learning_rate": 7.219241854801237e-05, "loss": 11.7194, "step": 14235 }, { "epoch": 37.5992076592935, "grad_norm": 868.0955200195312, "learning_rate": 7.21630538634206e-05, "loss": 10.5743, "step": 14236 }, { "epoch": 37.60184879498184, "grad_norm": 2782.914306640625, "learning_rate": 7.213369414482102e-05, "loss": 10.9924, "step": 14237 }, { "epoch": 37.60448993067019, "grad_norm": 2827.054931640625, "learning_rate": 7.21043393930336e-05, "loss": 12.7701, "step": 14238 }, { "epoch": 37.607131066358534, "grad_norm": 5092.3349609375, "learning_rate": 7.207498960887801e-05, "loss": 9.7832, "step": 14239 }, { "epoch": 37.609772202046884, "grad_norm": 996.7136840820312, "learning_rate": 7.20456447931738e-05, "loss": 11.7094, "step": 14240 }, { "epoch": 37.61241333773523, "grad_norm": 11338.4443359375, "learning_rate": 7.201630494674044e-05, "loss": 23.5157, "step": 14241 }, { "epoch": 37.61505447342357, "grad_norm": 2045.4716796875, "learning_rate": 7.198697007039723e-05, "loss": 34.8789, "step": 14242 }, { "epoch": 37.61769560911192, "grad_norm": 1999.3958740234375, "learning_rate": 7.195764016496328e-05, "loss": 33.9963, "step": 14243 }, { "epoch": 37.62033674480026, "grad_norm": 1281.9449462890625, "learning_rate": 7.192831523125757e-05, "loss": 34.589, "step": 14244 }, { "epoch": 37.62297788048861, "grad_norm": 1488.1103515625, "learning_rate": 7.189899527009913e-05, "loss": 36.4284, "step": 14245 }, { "epoch": 37.625619016176955, "grad_norm": 1199.2734375, "learning_rate": 7.186968028230664e-05, "loss": 34.4926, "step": 14246 }, { "epoch": 37.628260151865305, "grad_norm": 1560.0513916015625, "learning_rate": 7.184037026869867e-05, "loss": 34.8972, "step": 14247 }, { "epoch": 37.63090128755365, "grad_norm": 1194.6513671875, "learning_rate": 7.18110652300937e-05, "loss": 33.8932, "step": 14248 }, { "epoch": 37.63354242324199, "grad_norm": 962.2896118164062, "learning_rate": 7.178176516731008e-05, "loss": 33.9637, "step": 14249 }, { "epoch": 37.63618355893034, "grad_norm": 1192.1239013671875, "learning_rate": 7.175247008116598e-05, "loss": 33.8698, "step": 14250 }, { "epoch": 37.638824694618684, "grad_norm": 2031.025146484375, "learning_rate": 7.172317997247935e-05, "loss": 33.6775, "step": 14251 }, { "epoch": 37.641465830307034, "grad_norm": 1341.421142578125, "learning_rate": 7.169389484206829e-05, "loss": 34.6061, "step": 14252 }, { "epoch": 37.64410696599538, "grad_norm": 1264.7149658203125, "learning_rate": 7.166461469075045e-05, "loss": 33.5339, "step": 14253 }, { "epoch": 37.64674810168373, "grad_norm": 2421.026611328125, "learning_rate": 7.163533951934348e-05, "loss": 34.5782, "step": 14254 }, { "epoch": 37.64938923737207, "grad_norm": 2442.10986328125, "learning_rate": 7.160606932866482e-05, "loss": 36.0413, "step": 14255 }, { "epoch": 37.65203037306041, "grad_norm": 1596.5001220703125, "learning_rate": 7.157680411953197e-05, "loss": 36.112, "step": 14256 }, { "epoch": 37.65467150874876, "grad_norm": 1614.504150390625, "learning_rate": 7.154754389276203e-05, "loss": 35.366, "step": 14257 }, { "epoch": 37.657312644437106, "grad_norm": 1044.068603515625, "learning_rate": 7.15182886491721e-05, "loss": 35.893, "step": 14258 }, { "epoch": 37.659953780125456, "grad_norm": 7142.34130859375, "learning_rate": 7.148903838957909e-05, "loss": 38.2323, "step": 14259 }, { "epoch": 37.6625949158138, "grad_norm": 1504.8927001953125, "learning_rate": 7.145979311479986e-05, "loss": 38.2975, "step": 14260 }, { "epoch": 37.66523605150215, "grad_norm": 1923.0948486328125, "learning_rate": 7.1430552825651e-05, "loss": 38.6176, "step": 14261 }, { "epoch": 37.66787718719049, "grad_norm": 2132.22900390625, "learning_rate": 7.140131752294902e-05, "loss": 38.1742, "step": 14262 }, { "epoch": 37.67051832287884, "grad_norm": 1182.038818359375, "learning_rate": 7.137208720751037e-05, "loss": 39.6108, "step": 14263 }, { "epoch": 37.673159458567184, "grad_norm": 2096.894775390625, "learning_rate": 7.134286188015127e-05, "loss": 40.4225, "step": 14264 }, { "epoch": 37.67580059425553, "grad_norm": 1319.5682373046875, "learning_rate": 7.131364154168779e-05, "loss": 46.0032, "step": 14265 }, { "epoch": 37.67844172994388, "grad_norm": 1918.8035888671875, "learning_rate": 7.12844261929359e-05, "loss": 42.3676, "step": 14266 }, { "epoch": 37.68108286563222, "grad_norm": 2352.696044921875, "learning_rate": 7.125521583471145e-05, "loss": 41.0668, "step": 14267 }, { "epoch": 37.68372400132057, "grad_norm": 2444.938232421875, "learning_rate": 7.122601046783001e-05, "loss": 39.7773, "step": 14268 }, { "epoch": 37.68636513700891, "grad_norm": 1988.5858154296875, "learning_rate": 7.119681009310727e-05, "loss": 40.0121, "step": 14269 }, { "epoch": 37.68900627269726, "grad_norm": 1933.552001953125, "learning_rate": 7.11676147113586e-05, "loss": 39.9492, "step": 14270 }, { "epoch": 37.691647408385606, "grad_norm": 1322.28076171875, "learning_rate": 7.113842432339923e-05, "loss": 39.9259, "step": 14271 }, { "epoch": 37.69428854407395, "grad_norm": 1677.8619384765625, "learning_rate": 7.11092389300442e-05, "loss": 36.4793, "step": 14272 }, { "epoch": 37.6969296797623, "grad_norm": 1445.755615234375, "learning_rate": 7.108005853210869e-05, "loss": 36.7477, "step": 14273 }, { "epoch": 37.69957081545064, "grad_norm": 1809.1656494140625, "learning_rate": 7.105088313040742e-05, "loss": 35.7187, "step": 14274 }, { "epoch": 37.70221195113899, "grad_norm": 938.1073608398438, "learning_rate": 7.10217127257551e-05, "loss": 34.8982, "step": 14275 }, { "epoch": 37.704853086827335, "grad_norm": 1001.1655883789062, "learning_rate": 7.099254731896637e-05, "loss": 36.1344, "step": 14276 }, { "epoch": 37.707494222515685, "grad_norm": 2848.59765625, "learning_rate": 7.096338691085557e-05, "loss": 35.5549, "step": 14277 }, { "epoch": 37.71013535820403, "grad_norm": 1260.1915283203125, "learning_rate": 7.093423150223701e-05, "loss": 35.0023, "step": 14278 }, { "epoch": 37.71277649389237, "grad_norm": 1306.0240478515625, "learning_rate": 7.09050810939248e-05, "loss": 34.9825, "step": 14279 }, { "epoch": 37.71541762958072, "grad_norm": 1260.1005859375, "learning_rate": 7.087593568673304e-05, "loss": 34.5812, "step": 14280 }, { "epoch": 37.71805876526906, "grad_norm": 1439.864013671875, "learning_rate": 7.084679528147555e-05, "loss": 36.422, "step": 14281 }, { "epoch": 37.72069990095741, "grad_norm": 12530.3212890625, "learning_rate": 7.081765987896599e-05, "loss": 37.7488, "step": 14282 }, { "epoch": 37.723341036645756, "grad_norm": 1652.7825927734375, "learning_rate": 7.078852948001818e-05, "loss": 12.5576, "step": 14283 }, { "epoch": 37.725982172334106, "grad_norm": 8208.0966796875, "learning_rate": 7.075940408544534e-05, "loss": 13.5479, "step": 14284 }, { "epoch": 37.72862330802245, "grad_norm": 12636.7138671875, "learning_rate": 7.073028369606086e-05, "loss": 10.4405, "step": 14285 }, { "epoch": 37.7312644437108, "grad_norm": 966.8709716796875, "learning_rate": 7.070116831267781e-05, "loss": 9.5805, "step": 14286 }, { "epoch": 37.73390557939914, "grad_norm": 10982.244140625, "learning_rate": 7.067205793610936e-05, "loss": 8.2016, "step": 14287 }, { "epoch": 37.736546715087485, "grad_norm": 1212.8309326171875, "learning_rate": 7.064295256716838e-05, "loss": 15.2007, "step": 14288 }, { "epoch": 37.739187850775835, "grad_norm": 2459.569091796875, "learning_rate": 7.06138522066675e-05, "loss": 7.0278, "step": 14289 }, { "epoch": 37.74182898646418, "grad_norm": 2529.775146484375, "learning_rate": 7.058475685541951e-05, "loss": 12.95, "step": 14290 }, { "epoch": 37.74447012215253, "grad_norm": 5489.96826171875, "learning_rate": 7.055566651423678e-05, "loss": 10.6518, "step": 14291 }, { "epoch": 37.74711125784087, "grad_norm": 1566.135009765625, "learning_rate": 7.052658118393165e-05, "loss": 27.8321, "step": 14292 }, { "epoch": 37.74975239352922, "grad_norm": 1984.455810546875, "learning_rate": 7.049750086531632e-05, "loss": 35.1685, "step": 14293 }, { "epoch": 37.752393529217564, "grad_norm": 1177.1318359375, "learning_rate": 7.046842555920283e-05, "loss": 34.7753, "step": 14294 }, { "epoch": 37.75503466490591, "grad_norm": 2016.2645263671875, "learning_rate": 7.043935526640308e-05, "loss": 35.2988, "step": 14295 }, { "epoch": 37.75767580059426, "grad_norm": 1711.622314453125, "learning_rate": 7.04102899877288e-05, "loss": 34.6276, "step": 14296 }, { "epoch": 37.7603169362826, "grad_norm": 1779.3614501953125, "learning_rate": 7.038122972399174e-05, "loss": 33.1318, "step": 14297 }, { "epoch": 37.76295807197095, "grad_norm": 1260.1361083984375, "learning_rate": 7.035217447600334e-05, "loss": 34.8183, "step": 14298 }, { "epoch": 37.76559920765929, "grad_norm": 1320.577880859375, "learning_rate": 7.03231242445749e-05, "loss": 35.7552, "step": 14299 }, { "epoch": 37.76824034334764, "grad_norm": 1224.6439208984375, "learning_rate": 7.029407903051771e-05, "loss": 35.1063, "step": 14300 }, { "epoch": 37.770881479035985, "grad_norm": 3013.482421875, "learning_rate": 7.026503883464277e-05, "loss": 34.5066, "step": 14301 }, { "epoch": 37.77352261472433, "grad_norm": 1923.8870849609375, "learning_rate": 7.023600365776106e-05, "loss": 34.7217, "step": 14302 }, { "epoch": 37.77616375041268, "grad_norm": 4030.6455078125, "learning_rate": 7.020697350068325e-05, "loss": 34.4706, "step": 14303 }, { "epoch": 37.77880488610102, "grad_norm": 1710.6046142578125, "learning_rate": 7.017794836422018e-05, "loss": 34.3177, "step": 14304 }, { "epoch": 37.78144602178937, "grad_norm": 660.9146728515625, "learning_rate": 7.014892824918227e-05, "loss": 34.8483, "step": 14305 }, { "epoch": 37.784087157477714, "grad_norm": 1188.7969970703125, "learning_rate": 7.01199131563799e-05, "loss": 35.5487, "step": 14306 }, { "epoch": 37.786728293166064, "grad_norm": 2414.89892578125, "learning_rate": 7.009090308662317e-05, "loss": 34.5888, "step": 14307 }, { "epoch": 37.78936942885441, "grad_norm": 2640.939453125, "learning_rate": 7.006189804072238e-05, "loss": 35.5636, "step": 14308 }, { "epoch": 37.79201056454276, "grad_norm": 13048.2431640625, "learning_rate": 7.00328980194874e-05, "loss": 41.9741, "step": 14309 }, { "epoch": 37.7946517002311, "grad_norm": 5862.09130859375, "learning_rate": 7.000390302372803e-05, "loss": 39.5961, "step": 14310 }, { "epoch": 37.79729283591944, "grad_norm": 1439.7042236328125, "learning_rate": 6.997491305425391e-05, "loss": 38.7346, "step": 14311 }, { "epoch": 37.79993397160779, "grad_norm": 2084.475830078125, "learning_rate": 6.994592811187461e-05, "loss": 39.0479, "step": 14312 }, { "epoch": 37.802575107296136, "grad_norm": 2052.3662109375, "learning_rate": 6.991694819739949e-05, "loss": 40.696, "step": 14313 }, { "epoch": 37.805216242984486, "grad_norm": 2350.16748046875, "learning_rate": 6.988797331163774e-05, "loss": 41.8747, "step": 14314 }, { "epoch": 37.80785737867283, "grad_norm": 1044.3172607421875, "learning_rate": 6.985900345539859e-05, "loss": 41.5692, "step": 14315 }, { "epoch": 37.81049851436118, "grad_norm": 1120.6903076171875, "learning_rate": 6.983003862949095e-05, "loss": 42.5942, "step": 14316 }, { "epoch": 37.81313965004952, "grad_norm": 1244.9541015625, "learning_rate": 6.980107883472356e-05, "loss": 43.3616, "step": 14317 }, { "epoch": 37.815780785737864, "grad_norm": 1307.60400390625, "learning_rate": 6.977212407190536e-05, "loss": 38.8569, "step": 14318 }, { "epoch": 37.818421921426214, "grad_norm": 1813.9490966796875, "learning_rate": 6.974317434184463e-05, "loss": 39.9832, "step": 14319 }, { "epoch": 37.82106305711456, "grad_norm": 1784.91943359375, "learning_rate": 6.97142296453499e-05, "loss": 38.2956, "step": 14320 }, { "epoch": 37.82370419280291, "grad_norm": 906.0755615234375, "learning_rate": 6.968528998322929e-05, "loss": 39.05, "step": 14321 }, { "epoch": 37.82634532849125, "grad_norm": 1956.16845703125, "learning_rate": 6.965635535629114e-05, "loss": 39.9102, "step": 14322 }, { "epoch": 37.8289864641796, "grad_norm": 1482.2470703125, "learning_rate": 6.962742576534331e-05, "loss": 35.8369, "step": 14323 }, { "epoch": 37.83162759986794, "grad_norm": 4587.27880859375, "learning_rate": 6.959850121119357e-05, "loss": 36.8701, "step": 14324 }, { "epoch": 37.834268735556286, "grad_norm": 2601.8076171875, "learning_rate": 6.95695816946498e-05, "loss": 36.1198, "step": 14325 }, { "epoch": 37.836909871244636, "grad_norm": 1255.1934814453125, "learning_rate": 6.954066721651948e-05, "loss": 35.5227, "step": 14326 }, { "epoch": 37.83955100693298, "grad_norm": 1268.6864013671875, "learning_rate": 6.951175777761001e-05, "loss": 35.0586, "step": 14327 }, { "epoch": 37.84219214262133, "grad_norm": 1269.8360595703125, "learning_rate": 6.948285337872867e-05, "loss": 34.6599, "step": 14328 }, { "epoch": 37.84483327830967, "grad_norm": 1859.4287109375, "learning_rate": 6.945395402068263e-05, "loss": 34.9799, "step": 14329 }, { "epoch": 37.84747441399802, "grad_norm": 990.3689575195312, "learning_rate": 6.942505970427887e-05, "loss": 36.2127, "step": 14330 }, { "epoch": 37.850115549686365, "grad_norm": 1920.94140625, "learning_rate": 6.939617043032415e-05, "loss": 35.4517, "step": 14331 }, { "epoch": 37.852756685374715, "grad_norm": 3213.2978515625, "learning_rate": 6.936728619962535e-05, "loss": 45.1107, "step": 14332 }, { "epoch": 37.85539782106306, "grad_norm": 19141.609375, "learning_rate": 6.933840701298896e-05, "loss": 10.4089, "step": 14333 }, { "epoch": 37.8580389567514, "grad_norm": 1698.0330810546875, "learning_rate": 6.930953287122147e-05, "loss": 9.2776, "step": 14334 }, { "epoch": 37.86068009243975, "grad_norm": 3829.72900390625, "learning_rate": 6.928066377512909e-05, "loss": 8.3464, "step": 14335 }, { "epoch": 37.86332122812809, "grad_norm": 1009.1595458984375, "learning_rate": 6.925179972551805e-05, "loss": 14.4538, "step": 14336 }, { "epoch": 37.86596236381644, "grad_norm": 2614.259521484375, "learning_rate": 6.92229407231943e-05, "loss": 12.6548, "step": 14337 }, { "epoch": 37.868603499504786, "grad_norm": 28233.58203125, "learning_rate": 6.919408676896367e-05, "loss": 14.2433, "step": 14338 }, { "epoch": 37.871244635193136, "grad_norm": 745.8790283203125, "learning_rate": 6.916523786363202e-05, "loss": 10.6207, "step": 14339 }, { "epoch": 37.87388577088148, "grad_norm": 2416.739990234375, "learning_rate": 6.913639400800489e-05, "loss": 11.5216, "step": 14340 }, { "epoch": 37.87652690656982, "grad_norm": 2077.33837890625, "learning_rate": 6.91075552028877e-05, "loss": 13.0108, "step": 14341 }, { "epoch": 37.87916804225817, "grad_norm": 1183.1748046875, "learning_rate": 6.90787214490857e-05, "loss": 34.9117, "step": 14342 }, { "epoch": 37.881809177946515, "grad_norm": 2852.7822265625, "learning_rate": 6.90498927474042e-05, "loss": 35.9369, "step": 14343 }, { "epoch": 37.884450313634865, "grad_norm": 1975.5487060546875, "learning_rate": 6.902106909864816e-05, "loss": 35.1149, "step": 14344 }, { "epoch": 37.88709144932321, "grad_norm": 9050.5849609375, "learning_rate": 6.899225050362242e-05, "loss": 35.542, "step": 14345 }, { "epoch": 37.88973258501156, "grad_norm": 1678.620849609375, "learning_rate": 6.896343696313179e-05, "loss": 34.7303, "step": 14346 }, { "epoch": 37.8923737206999, "grad_norm": 1751.457275390625, "learning_rate": 6.893462847798083e-05, "loss": 34.2451, "step": 14347 }, { "epoch": 37.895014856388244, "grad_norm": 1514.816162109375, "learning_rate": 6.8905825048974e-05, "loss": 34.8883, "step": 14348 }, { "epoch": 37.897655992076594, "grad_norm": 2056.220703125, "learning_rate": 6.887702667691553e-05, "loss": 33.1333, "step": 14349 }, { "epoch": 37.90029712776494, "grad_norm": 1667.7706298828125, "learning_rate": 6.884823336260981e-05, "loss": 33.9654, "step": 14350 }, { "epoch": 37.90293826345329, "grad_norm": 1510.1654052734375, "learning_rate": 6.881944510686072e-05, "loss": 35.7232, "step": 14351 }, { "epoch": 37.90557939914163, "grad_norm": 1806.0977783203125, "learning_rate": 6.879066191047215e-05, "loss": 34.6622, "step": 14352 }, { "epoch": 37.90822053482998, "grad_norm": 1288.7679443359375, "learning_rate": 6.876188377424803e-05, "loss": 36.7017, "step": 14353 }, { "epoch": 37.91086167051832, "grad_norm": 1433.9742431640625, "learning_rate": 6.873311069899177e-05, "loss": 34.5524, "step": 14354 }, { "epoch": 37.91350280620667, "grad_norm": 1497.10693359375, "learning_rate": 6.870434268550691e-05, "loss": 34.1637, "step": 14355 }, { "epoch": 37.916143941895015, "grad_norm": 2087.345703125, "learning_rate": 6.86755797345967e-05, "loss": 33.6987, "step": 14356 }, { "epoch": 37.91878507758336, "grad_norm": 3624.03515625, "learning_rate": 6.86468218470645e-05, "loss": 35.402, "step": 14357 }, { "epoch": 37.92142621327171, "grad_norm": 2717.76806640625, "learning_rate": 6.861806902371326e-05, "loss": 36.2997, "step": 14358 }, { "epoch": 37.92406734896005, "grad_norm": 2494.8779296875, "learning_rate": 6.85893212653458e-05, "loss": 39.2888, "step": 14359 }, { "epoch": 37.9267084846484, "grad_norm": 1008.789306640625, "learning_rate": 6.856057857276506e-05, "loss": 40.1298, "step": 14360 }, { "epoch": 37.929349620336744, "grad_norm": 2414.851806640625, "learning_rate": 6.853184094677361e-05, "loss": 39.2525, "step": 14361 }, { "epoch": 37.931990756025094, "grad_norm": 1176.412841796875, "learning_rate": 6.850310838817389e-05, "loss": 38.7998, "step": 14362 }, { "epoch": 37.93463189171344, "grad_norm": 3108.345458984375, "learning_rate": 6.847438089776822e-05, "loss": 40.6662, "step": 14363 }, { "epoch": 37.93727302740178, "grad_norm": 797.701171875, "learning_rate": 6.844565847635886e-05, "loss": 41.262, "step": 14364 }, { "epoch": 37.93991416309013, "grad_norm": 1374.6090087890625, "learning_rate": 6.841694112474784e-05, "loss": 39.3923, "step": 14365 }, { "epoch": 37.94255529877847, "grad_norm": 2006.0714111328125, "learning_rate": 6.838822884373697e-05, "loss": 37.0262, "step": 14366 }, { "epoch": 37.94519643446682, "grad_norm": 897.2962646484375, "learning_rate": 6.835952163412823e-05, "loss": 37.4262, "step": 14367 }, { "epoch": 37.947837570155166, "grad_norm": 1163.1072998046875, "learning_rate": 6.833081949672312e-05, "loss": 35.7815, "step": 14368 }, { "epoch": 37.950478705843516, "grad_norm": 1328.3583984375, "learning_rate": 6.830212243232317e-05, "loss": 34.2639, "step": 14369 }, { "epoch": 37.95311984153186, "grad_norm": 1021.148193359375, "learning_rate": 6.82734304417297e-05, "loss": 34.4483, "step": 14370 }, { "epoch": 37.9557609772202, "grad_norm": 1531.149169921875, "learning_rate": 6.824474352574398e-05, "loss": 35.8274, "step": 14371 }, { "epoch": 37.95840211290855, "grad_norm": 4257.52392578125, "learning_rate": 6.821606168516697e-05, "loss": 31.86, "step": 14372 }, { "epoch": 37.961043248596894, "grad_norm": 5932.58251953125, "learning_rate": 6.818738492079962e-05, "loss": 8.97, "step": 14373 }, { "epoch": 37.963684384285244, "grad_norm": 12811.5908203125, "learning_rate": 6.81587132334428e-05, "loss": 12.2838, "step": 14374 }, { "epoch": 37.96632551997359, "grad_norm": 7934.9658203125, "learning_rate": 6.81300466238971e-05, "loss": 11.4217, "step": 14375 }, { "epoch": 37.96896665566194, "grad_norm": 1205.4149169921875, "learning_rate": 6.810138509296299e-05, "loss": 12.0625, "step": 14376 }, { "epoch": 37.97160779135028, "grad_norm": 1473.2686767578125, "learning_rate": 6.807272864144078e-05, "loss": 15.328, "step": 14377 }, { "epoch": 37.97424892703863, "grad_norm": 1461.15576171875, "learning_rate": 6.804407727013085e-05, "loss": 33.4766, "step": 14378 }, { "epoch": 37.97689006272697, "grad_norm": 2045.7640380859375, "learning_rate": 6.801543097983318e-05, "loss": 34.4755, "step": 14379 }, { "epoch": 37.979531198415316, "grad_norm": 3247.454345703125, "learning_rate": 6.798678977134767e-05, "loss": 34.286, "step": 14380 }, { "epoch": 37.982172334103666, "grad_norm": 1820.426513671875, "learning_rate": 6.795815364547414e-05, "loss": 33.6363, "step": 14381 }, { "epoch": 37.98481346979201, "grad_norm": 2108.35693359375, "learning_rate": 6.792952260301225e-05, "loss": 35.888, "step": 14382 }, { "epoch": 37.98745460548036, "grad_norm": 2406.466796875, "learning_rate": 6.790089664476149e-05, "loss": 35.269, "step": 14383 }, { "epoch": 37.9900957411687, "grad_norm": 1266.4781494140625, "learning_rate": 6.787227577152111e-05, "loss": 34.9024, "step": 14384 }, { "epoch": 37.99273687685705, "grad_norm": 2991.23779296875, "learning_rate": 6.784365998409054e-05, "loss": 35.5908, "step": 14385 }, { "epoch": 37.995378012545395, "grad_norm": 1254.66357421875, "learning_rate": 6.781504928326876e-05, "loss": 33.3919, "step": 14386 }, { "epoch": 37.99801914823374, "grad_norm": 2054.4248046875, "learning_rate": 6.778644366985462e-05, "loss": 35.8812, "step": 14387 }, { "epoch": 38.00066028392209, "grad_norm": 3239.231201171875, "learning_rate": 6.775784314464717e-05, "loss": 38.4026, "step": 14388 }, { "epoch": 38.00330141961043, "grad_norm": 1144.0128173828125, "learning_rate": 6.77292477084448e-05, "loss": 38.4041, "step": 14389 }, { "epoch": 38.00594255529878, "grad_norm": 1319.1341552734375, "learning_rate": 6.770065736204614e-05, "loss": 39.2134, "step": 14390 }, { "epoch": 38.00858369098712, "grad_norm": 1695.495849609375, "learning_rate": 6.767207210624945e-05, "loss": 38.9016, "step": 14391 }, { "epoch": 38.01122482667547, "grad_norm": 1127.3035888671875, "learning_rate": 6.764349194185313e-05, "loss": 38.3194, "step": 14392 }, { "epoch": 38.013865962363816, "grad_norm": 1708.88134765625, "learning_rate": 6.761491686965516e-05, "loss": 42.8573, "step": 14393 }, { "epoch": 38.01650709805216, "grad_norm": 1115.0654296875, "learning_rate": 6.75863468904534e-05, "loss": 39.8828, "step": 14394 }, { "epoch": 38.01914823374051, "grad_norm": 1151.6563720703125, "learning_rate": 6.755778200504583e-05, "loss": 39.6639, "step": 14395 }, { "epoch": 38.02178936942885, "grad_norm": 844.8245239257812, "learning_rate": 6.752922221423005e-05, "loss": 41.6334, "step": 14396 }, { "epoch": 38.0244305051172, "grad_norm": 1651.3350830078125, "learning_rate": 6.750066751880354e-05, "loss": 39.0255, "step": 14397 }, { "epoch": 38.027071640805545, "grad_norm": 1868.0882568359375, "learning_rate": 6.74721179195637e-05, "loss": 40.216, "step": 14398 }, { "epoch": 38.029712776493895, "grad_norm": 1202.4134521484375, "learning_rate": 6.744357341730772e-05, "loss": 39.1215, "step": 14399 }, { "epoch": 38.03235391218224, "grad_norm": 1861.8861083984375, "learning_rate": 6.741503401283272e-05, "loss": 37.7118, "step": 14400 }, { "epoch": 38.03235391218224, "eval_loss": 3.720661163330078, "eval_runtime": 2.091, "eval_samples_per_second": 236.734, "eval_steps_per_second": 29.652, "step": 14400 }, { "epoch": 38.03499504787059, "grad_norm": 2660.341796875, "learning_rate": 6.738649970693559e-05, "loss": 36.5543, "step": 14401 }, { "epoch": 38.03763618355893, "grad_norm": 1011.1486206054688, "learning_rate": 6.735797050041323e-05, "loss": 37.5304, "step": 14402 }, { "epoch": 38.040277319247274, "grad_norm": 860.9473876953125, "learning_rate": 6.732944639406225e-05, "loss": 36.0227, "step": 14403 }, { "epoch": 38.042918454935624, "grad_norm": 1355.567138671875, "learning_rate": 6.73009273886791e-05, "loss": 35.4428, "step": 14404 }, { "epoch": 38.04555959062397, "grad_norm": 1400.637939453125, "learning_rate": 6.727241348506039e-05, "loss": 34.2793, "step": 14405 }, { "epoch": 38.04820072631232, "grad_norm": 1182.5640869140625, "learning_rate": 6.72439046840021e-05, "loss": 34.9126, "step": 14406 }, { "epoch": 38.05084186200066, "grad_norm": 1068.455810546875, "learning_rate": 6.721540098630044e-05, "loss": 35.0591, "step": 14407 }, { "epoch": 38.05348299768901, "grad_norm": 980.1810302734375, "learning_rate": 6.718690239275124e-05, "loss": 34.2807, "step": 14408 }, { "epoch": 38.05612413337735, "grad_norm": 1119.413818359375, "learning_rate": 6.715840890415049e-05, "loss": 34.8849, "step": 14409 }, { "epoch": 38.058765269065695, "grad_norm": 3160.292724609375, "learning_rate": 6.712992052129377e-05, "loss": 42.6598, "step": 14410 }, { "epoch": 38.061406404754045, "grad_norm": 3282.65771484375, "learning_rate": 6.710143724497649e-05, "loss": 11.2639, "step": 14411 }, { "epoch": 38.06404754044239, "grad_norm": 8304.4404296875, "learning_rate": 6.707295907599422e-05, "loss": 13.0564, "step": 14412 }, { "epoch": 38.06668867613074, "grad_norm": 19934.03125, "learning_rate": 6.704448601514213e-05, "loss": 12.292, "step": 14413 }, { "epoch": 38.06932981181908, "grad_norm": 6399.001953125, "learning_rate": 6.701601806321528e-05, "loss": 11.2388, "step": 14414 }, { "epoch": 38.07197094750743, "grad_norm": 1459.5914306640625, "learning_rate": 6.698755522100863e-05, "loss": 12.2682, "step": 14415 }, { "epoch": 38.074612083195774, "grad_norm": 2523.93212890625, "learning_rate": 6.6959097489317e-05, "loss": 15.7772, "step": 14416 }, { "epoch": 38.07725321888412, "grad_norm": 2664.705322265625, "learning_rate": 6.693064486893508e-05, "loss": 12.4702, "step": 14417 }, { "epoch": 38.07989435457247, "grad_norm": 8572.13671875, "learning_rate": 6.690219736065723e-05, "loss": 14.948, "step": 14418 }, { "epoch": 38.08253549026081, "grad_norm": 1585.265380859375, "learning_rate": 6.687375496527806e-05, "loss": 11.0565, "step": 14419 }, { "epoch": 38.08517662594916, "grad_norm": 11053.0166015625, "learning_rate": 6.684531768359173e-05, "loss": 30.5437, "step": 14420 }, { "epoch": 38.0878177616375, "grad_norm": 1493.9000244140625, "learning_rate": 6.681688551639229e-05, "loss": 36.0114, "step": 14421 }, { "epoch": 38.09045889732585, "grad_norm": 2491.438232421875, "learning_rate": 6.678845846447362e-05, "loss": 33.8318, "step": 14422 }, { "epoch": 38.093100033014196, "grad_norm": 2647.86083984375, "learning_rate": 6.67600365286298e-05, "loss": 34.5643, "step": 14423 }, { "epoch": 38.095741168702546, "grad_norm": 1125.16748046875, "learning_rate": 6.673161970965425e-05, "loss": 33.5193, "step": 14424 }, { "epoch": 38.09838230439089, "grad_norm": 3214.739013671875, "learning_rate": 6.670320800834048e-05, "loss": 34.3327, "step": 14425 }, { "epoch": 38.10102344007923, "grad_norm": 843.4976196289062, "learning_rate": 6.667480142548201e-05, "loss": 34.7192, "step": 14426 }, { "epoch": 38.10366457576758, "grad_norm": 1011.0674438476562, "learning_rate": 6.664639996187205e-05, "loss": 33.7724, "step": 14427 }, { "epoch": 38.106305711455924, "grad_norm": 931.3140869140625, "learning_rate": 6.661800361830365e-05, "loss": 35.2222, "step": 14428 }, { "epoch": 38.108946847144274, "grad_norm": 4523.48779296875, "learning_rate": 6.658961239556969e-05, "loss": 34.142, "step": 14429 }, { "epoch": 38.11158798283262, "grad_norm": 1680.9554443359375, "learning_rate": 6.656122629446315e-05, "loss": 35.2653, "step": 14430 }, { "epoch": 38.11422911852097, "grad_norm": 2351.547607421875, "learning_rate": 6.653284531577658e-05, "loss": 33.7033, "step": 14431 }, { "epoch": 38.11687025420931, "grad_norm": 1458.1871337890625, "learning_rate": 6.650446946030255e-05, "loss": 34.9105, "step": 14432 }, { "epoch": 38.11951138989765, "grad_norm": 1917.30908203125, "learning_rate": 6.647609872883342e-05, "loss": 35.0806, "step": 14433 }, { "epoch": 38.122152525586, "grad_norm": 2082.501220703125, "learning_rate": 6.644773312216143e-05, "loss": 34.1901, "step": 14434 }, { "epoch": 38.124793661274346, "grad_norm": 1107.558349609375, "learning_rate": 6.641937264107867e-05, "loss": 34.959, "step": 14435 }, { "epoch": 38.127434796962696, "grad_norm": 1960.398681640625, "learning_rate": 6.639101728637701e-05, "loss": 35.7813, "step": 14436 }, { "epoch": 38.13007593265104, "grad_norm": 1518.8123779296875, "learning_rate": 6.636266705884841e-05, "loss": 38.9794, "step": 14437 }, { "epoch": 38.13271706833939, "grad_norm": 7392.87841796875, "learning_rate": 6.633432195928446e-05, "loss": 40.9119, "step": 14438 }, { "epoch": 38.13535820402773, "grad_norm": 2023.408447265625, "learning_rate": 6.630598198847659e-05, "loss": 39.5583, "step": 14439 }, { "epoch": 38.137999339716075, "grad_norm": 1660.9251708984375, "learning_rate": 6.627764714721643e-05, "loss": 38.075, "step": 14440 }, { "epoch": 38.140640475404425, "grad_norm": 1745.3453369140625, "learning_rate": 6.624931743629495e-05, "loss": 37.1262, "step": 14441 }, { "epoch": 38.14328161109277, "grad_norm": 1383.2379150390625, "learning_rate": 6.622099285650338e-05, "loss": 40.9501, "step": 14442 }, { "epoch": 38.14592274678112, "grad_norm": 1326.5030517578125, "learning_rate": 6.619267340863249e-05, "loss": 43.0718, "step": 14443 }, { "epoch": 38.14856388246946, "grad_norm": 2206.399658203125, "learning_rate": 6.616435909347335e-05, "loss": 42.6268, "step": 14444 }, { "epoch": 38.15120501815781, "grad_norm": 877.8624877929688, "learning_rate": 6.613604991181649e-05, "loss": 42.5758, "step": 14445 }, { "epoch": 38.15384615384615, "grad_norm": 1274.2623291015625, "learning_rate": 6.610774586445234e-05, "loss": 39.7927, "step": 14446 }, { "epoch": 38.1564872895345, "grad_norm": 1155.0133056640625, "learning_rate": 6.607944695217144e-05, "loss": 39.037, "step": 14447 }, { "epoch": 38.159128425222846, "grad_norm": 1182.5279541015625, "learning_rate": 6.605115317576394e-05, "loss": 40.0186, "step": 14448 }, { "epoch": 38.16176956091119, "grad_norm": 1038.5262451171875, "learning_rate": 6.602286453601991e-05, "loss": 38.2886, "step": 14449 }, { "epoch": 38.16441069659954, "grad_norm": 1243.5897216796875, "learning_rate": 6.599458103372935e-05, "loss": 36.2277, "step": 14450 }, { "epoch": 38.16705183228788, "grad_norm": 1465.679443359375, "learning_rate": 6.596630266968201e-05, "loss": 37.2587, "step": 14451 }, { "epoch": 38.16969296797623, "grad_norm": 649.659912109375, "learning_rate": 6.593802944466753e-05, "loss": 36.8347, "step": 14452 }, { "epoch": 38.172334103664575, "grad_norm": 5187.54052734375, "learning_rate": 6.590976135947543e-05, "loss": 34.9095, "step": 14453 }, { "epoch": 38.174975239352925, "grad_norm": 943.2210083007812, "learning_rate": 6.588149841489516e-05, "loss": 36.9191, "step": 14454 }, { "epoch": 38.17761637504127, "grad_norm": 1627.3194580078125, "learning_rate": 6.585324061171588e-05, "loss": 34.5522, "step": 14455 }, { "epoch": 38.18025751072961, "grad_norm": 1159.6141357421875, "learning_rate": 6.582498795072669e-05, "loss": 35.1965, "step": 14456 }, { "epoch": 38.18289864641796, "grad_norm": 1540.2333984375, "learning_rate": 6.579674043271645e-05, "loss": 36.0065, "step": 14457 }, { "epoch": 38.185539782106304, "grad_norm": 1386.6358642578125, "learning_rate": 6.576849805847415e-05, "loss": 35.0456, "step": 14458 }, { "epoch": 38.188180917794654, "grad_norm": 1374.437744140625, "learning_rate": 6.574026082878829e-05, "loss": 36.112, "step": 14459 }, { "epoch": 38.190822053483, "grad_norm": 12260.564453125, "learning_rate": 6.571202874444729e-05, "loss": 37.491, "step": 14460 }, { "epoch": 38.19346318917135, "grad_norm": 10588.626953125, "learning_rate": 6.568380180623973e-05, "loss": 12.5475, "step": 14461 }, { "epoch": 38.19610432485969, "grad_norm": 14361.49609375, "learning_rate": 6.565558001495373e-05, "loss": 9.9102, "step": 14462 }, { "epoch": 38.19874546054803, "grad_norm": 1083.43310546875, "learning_rate": 6.562736337137734e-05, "loss": 15.4809, "step": 14463 }, { "epoch": 38.20138659623638, "grad_norm": 2469.278076171875, "learning_rate": 6.559915187629848e-05, "loss": 9.8338, "step": 14464 }, { "epoch": 38.204027731924725, "grad_norm": 1637.0413818359375, "learning_rate": 6.557094553050505e-05, "loss": 11.25, "step": 14465 }, { "epoch": 38.206668867613075, "grad_norm": 6573.51318359375, "learning_rate": 6.554274433478461e-05, "loss": 14.3718, "step": 14466 }, { "epoch": 38.20931000330142, "grad_norm": 17685.873046875, "learning_rate": 6.551454828992468e-05, "loss": 12.4897, "step": 14467 }, { "epoch": 38.21195113898977, "grad_norm": 806.291259765625, "learning_rate": 6.548635739671264e-05, "loss": 10.0463, "step": 14468 }, { "epoch": 38.21459227467811, "grad_norm": 1444.0989990234375, "learning_rate": 6.545817165593565e-05, "loss": 14.4855, "step": 14469 }, { "epoch": 38.21723341036646, "grad_norm": 2570.722900390625, "learning_rate": 6.542999106838082e-05, "loss": 35.3696, "step": 14470 }, { "epoch": 38.219874546054804, "grad_norm": 1200.758544921875, "learning_rate": 6.540181563483503e-05, "loss": 33.7204, "step": 14471 }, { "epoch": 38.22251568174315, "grad_norm": 901.1401977539062, "learning_rate": 6.537364535608511e-05, "loss": 35.3186, "step": 14472 }, { "epoch": 38.2251568174315, "grad_norm": 8665.2841796875, "learning_rate": 6.534548023291773e-05, "loss": 34.9416, "step": 14473 }, { "epoch": 38.22779795311984, "grad_norm": 2706.84765625, "learning_rate": 6.531732026611925e-05, "loss": 35.0416, "step": 14474 }, { "epoch": 38.23043908880819, "grad_norm": 965.4544067382812, "learning_rate": 6.528916545647628e-05, "loss": 35.7568, "step": 14475 }, { "epoch": 38.23308022449653, "grad_norm": 1066.0322265625, "learning_rate": 6.526101580477478e-05, "loss": 37.0368, "step": 14476 }, { "epoch": 38.23572136018488, "grad_norm": 808.1142578125, "learning_rate": 6.52328713118009e-05, "loss": 34.0253, "step": 14477 }, { "epoch": 38.238362495873226, "grad_norm": 2707.64306640625, "learning_rate": 6.520473197834046e-05, "loss": 36.1075, "step": 14478 }, { "epoch": 38.24100363156157, "grad_norm": 2208.990478515625, "learning_rate": 6.51765978051794e-05, "loss": 35.161, "step": 14479 }, { "epoch": 38.24364476724992, "grad_norm": 1668.4576416015625, "learning_rate": 6.514846879310326e-05, "loss": 34.3509, "step": 14480 }, { "epoch": 38.24628590293826, "grad_norm": 1465.1861572265625, "learning_rate": 6.512034494289751e-05, "loss": 36.2818, "step": 14481 }, { "epoch": 38.24892703862661, "grad_norm": 1882.082275390625, "learning_rate": 6.509222625534755e-05, "loss": 34.3081, "step": 14482 }, { "epoch": 38.251568174314954, "grad_norm": 2048.91650390625, "learning_rate": 6.506411273123858e-05, "loss": 34.3361, "step": 14483 }, { "epoch": 38.254209310003304, "grad_norm": 1660.4417724609375, "learning_rate": 6.503600437135562e-05, "loss": 34.2058, "step": 14484 }, { "epoch": 38.25685044569165, "grad_norm": 1308.560546875, "learning_rate": 6.500790117648358e-05, "loss": 34.9122, "step": 14485 }, { "epoch": 38.25949158137999, "grad_norm": 5126.6455078125, "learning_rate": 6.497980314740723e-05, "loss": 36.0982, "step": 14486 }, { "epoch": 38.26213271706834, "grad_norm": 1339.56591796875, "learning_rate": 6.49517102849112e-05, "loss": 37.9305, "step": 14487 }, { "epoch": 38.26477385275668, "grad_norm": 2019.4527587890625, "learning_rate": 6.492362258977988e-05, "loss": 39.4759, "step": 14488 }, { "epoch": 38.26741498844503, "grad_norm": 948.006103515625, "learning_rate": 6.489554006279774e-05, "loss": 38.2985, "step": 14489 }, { "epoch": 38.270056124133376, "grad_norm": 1161.7164306640625, "learning_rate": 6.486746270474895e-05, "loss": 38.825, "step": 14490 }, { "epoch": 38.272697259821726, "grad_norm": 2387.537841796875, "learning_rate": 6.483939051641747e-05, "loss": 39.6416, "step": 14491 }, { "epoch": 38.27533839551007, "grad_norm": 1420.2056884765625, "learning_rate": 6.481132349858714e-05, "loss": 39.6929, "step": 14492 }, { "epoch": 38.27797953119842, "grad_norm": 1412.2117919921875, "learning_rate": 6.478326165204201e-05, "loss": 43.536, "step": 14493 }, { "epoch": 38.28062066688676, "grad_norm": 1142.1778564453125, "learning_rate": 6.475520497756542e-05, "loss": 42.2129, "step": 14494 }, { "epoch": 38.283261802575105, "grad_norm": 1660.4095458984375, "learning_rate": 6.472715347594082e-05, "loss": 40.9157, "step": 14495 }, { "epoch": 38.285902938263455, "grad_norm": 2282.62255859375, "learning_rate": 6.469910714795171e-05, "loss": 40.2752, "step": 14496 }, { "epoch": 38.2885440739518, "grad_norm": 2779.4453125, "learning_rate": 6.467106599438119e-05, "loss": 39.7448, "step": 14497 }, { "epoch": 38.29118520964015, "grad_norm": 1573.3370361328125, "learning_rate": 6.464303001601227e-05, "loss": 41.5079, "step": 14498 }, { "epoch": 38.29382634532849, "grad_norm": 2113.006103515625, "learning_rate": 6.461499921362779e-05, "loss": 36.5072, "step": 14499 }, { "epoch": 38.29646748101684, "grad_norm": 1477.236572265625, "learning_rate": 6.458697358801061e-05, "loss": 37.7188, "step": 14500 }, { "epoch": 38.29910861670518, "grad_norm": 1668.375, "learning_rate": 6.455895313994329e-05, "loss": 38.0049, "step": 14501 }, { "epoch": 38.301749752393526, "grad_norm": 2302.307861328125, "learning_rate": 6.453093787020825e-05, "loss": 34.8405, "step": 14502 }, { "epoch": 38.304390888081876, "grad_norm": 4999.59130859375, "learning_rate": 6.450292777958786e-05, "loss": 36.0347, "step": 14503 }, { "epoch": 38.30703202377022, "grad_norm": 6480.01318359375, "learning_rate": 6.44749228688642e-05, "loss": 34.7241, "step": 14504 }, { "epoch": 38.30967315945857, "grad_norm": 2010.678466796875, "learning_rate": 6.444692313881936e-05, "loss": 35.081, "step": 14505 }, { "epoch": 38.31231429514691, "grad_norm": 1877.9105224609375, "learning_rate": 6.441892859023512e-05, "loss": 35.2718, "step": 14506 }, { "epoch": 38.31495543083526, "grad_norm": 2370.083984375, "learning_rate": 6.439093922389333e-05, "loss": 35.2671, "step": 14507 }, { "epoch": 38.317596566523605, "grad_norm": 2199.762451171875, "learning_rate": 6.436295504057557e-05, "loss": 34.7652, "step": 14508 }, { "epoch": 38.32023770221195, "grad_norm": 1425.3602294921875, "learning_rate": 6.433497604106314e-05, "loss": 36.8599, "step": 14509 }, { "epoch": 38.3228788379003, "grad_norm": 4345.10302734375, "learning_rate": 6.430700222613759e-05, "loss": 34.7499, "step": 14510 }, { "epoch": 38.32551997358864, "grad_norm": 1600.1517333984375, "learning_rate": 6.427903359657985e-05, "loss": 10.472, "step": 14511 }, { "epoch": 38.32816110927699, "grad_norm": 1332.1444091796875, "learning_rate": 6.4251070153171e-05, "loss": 9.1155, "step": 14512 }, { "epoch": 38.330802244965334, "grad_norm": 3494.656982421875, "learning_rate": 6.422311189669183e-05, "loss": 9.3956, "step": 14513 }, { "epoch": 38.333443380653684, "grad_norm": 2650.89111328125, "learning_rate": 6.41951588279232e-05, "loss": 9.3392, "step": 14514 }, { "epoch": 38.33608451634203, "grad_norm": 7834.67236328125, "learning_rate": 6.41672109476456e-05, "loss": 10.6763, "step": 14515 }, { "epoch": 38.33872565203038, "grad_norm": 1200.359375, "learning_rate": 6.41392682566394e-05, "loss": 12.0832, "step": 14516 }, { "epoch": 38.34136678771872, "grad_norm": 31466.18359375, "learning_rate": 6.411133075568504e-05, "loss": 11.6894, "step": 14517 }, { "epoch": 38.34400792340706, "grad_norm": 1079.68994140625, "learning_rate": 6.408339844556257e-05, "loss": 16.246, "step": 14518 }, { "epoch": 38.34664905909541, "grad_norm": 10170.4013671875, "learning_rate": 6.405547132705197e-05, "loss": 10.3656, "step": 14519 }, { "epoch": 38.349290194783755, "grad_norm": 844.4031982421875, "learning_rate": 6.402754940093314e-05, "loss": 21.6728, "step": 14520 }, { "epoch": 38.351931330472105, "grad_norm": 1899.196533203125, "learning_rate": 6.399963266798569e-05, "loss": 35.7808, "step": 14521 }, { "epoch": 38.35457246616045, "grad_norm": 1196.8057861328125, "learning_rate": 6.397172112898927e-05, "loss": 33.9094, "step": 14522 }, { "epoch": 38.3572136018488, "grad_norm": 2194.715087890625, "learning_rate": 6.394381478472319e-05, "loss": 36.0867, "step": 14523 }, { "epoch": 38.35985473753714, "grad_norm": 1359.2203369140625, "learning_rate": 6.391591363596685e-05, "loss": 35.0799, "step": 14524 }, { "epoch": 38.362495873225484, "grad_norm": 1103.7086181640625, "learning_rate": 6.388801768349933e-05, "loss": 34.3658, "step": 14525 }, { "epoch": 38.365137008913834, "grad_norm": 2905.501220703125, "learning_rate": 6.386012692809956e-05, "loss": 34.4716, "step": 14526 }, { "epoch": 38.36777814460218, "grad_norm": 2360.571533203125, "learning_rate": 6.383224137054633e-05, "loss": 35.8723, "step": 14527 }, { "epoch": 38.37041928029053, "grad_norm": 4264.58251953125, "learning_rate": 6.380436101161858e-05, "loss": 34.6577, "step": 14528 }, { "epoch": 38.37306041597887, "grad_norm": 2902.12451171875, "learning_rate": 6.377648585209455e-05, "loss": 34.1201, "step": 14529 }, { "epoch": 38.37570155166722, "grad_norm": 2162.798828125, "learning_rate": 6.374861589275272e-05, "loss": 34.3231, "step": 14530 }, { "epoch": 38.37834268735556, "grad_norm": 3660.873046875, "learning_rate": 6.372075113437142e-05, "loss": 36.401, "step": 14531 }, { "epoch": 38.380983823043906, "grad_norm": 1460.58203125, "learning_rate": 6.369289157772876e-05, "loss": 36.3984, "step": 14532 }, { "epoch": 38.383624958732256, "grad_norm": 2659.740478515625, "learning_rate": 6.366503722360256e-05, "loss": 34.9733, "step": 14533 }, { "epoch": 38.3862660944206, "grad_norm": 1311.7047119140625, "learning_rate": 6.363718807277083e-05, "loss": 35.1195, "step": 14534 }, { "epoch": 38.38890723010895, "grad_norm": 1289.7750244140625, "learning_rate": 6.360934412601113e-05, "loss": 35.5273, "step": 14535 }, { "epoch": 38.39154836579729, "grad_norm": 1855.5538330078125, "learning_rate": 6.358150538410101e-05, "loss": 36.7478, "step": 14536 }, { "epoch": 38.39418950148564, "grad_norm": 2305.853515625, "learning_rate": 6.355367184781782e-05, "loss": 37.5299, "step": 14537 }, { "epoch": 38.396830637173984, "grad_norm": 2361.1845703125, "learning_rate": 6.352584351793886e-05, "loss": 43.5, "step": 14538 }, { "epoch": 38.399471772862334, "grad_norm": 1741.0137939453125, "learning_rate": 6.349802039524117e-05, "loss": 37.8079, "step": 14539 }, { "epoch": 38.40211290855068, "grad_norm": 1117.907958984375, "learning_rate": 6.347020248050159e-05, "loss": 38.532, "step": 14540 }, { "epoch": 38.40475404423902, "grad_norm": 2140.8095703125, "learning_rate": 6.344238977449715e-05, "loss": 38.0524, "step": 14541 }, { "epoch": 38.40739517992737, "grad_norm": 994.450439453125, "learning_rate": 6.341458227800436e-05, "loss": 41.806, "step": 14542 }, { "epoch": 38.41003631561571, "grad_norm": 1469.922119140625, "learning_rate": 6.338677999179977e-05, "loss": 40.8275, "step": 14543 }, { "epoch": 38.41267745130406, "grad_norm": 1218.919677734375, "learning_rate": 6.335898291665964e-05, "loss": 42.195, "step": 14544 }, { "epoch": 38.415318586992406, "grad_norm": 1232.7354736328125, "learning_rate": 6.333119105336044e-05, "loss": 40.8103, "step": 14545 }, { "epoch": 38.417959722680756, "grad_norm": 1572.5245361328125, "learning_rate": 6.330340440267798e-05, "loss": 41.9731, "step": 14546 }, { "epoch": 38.4206008583691, "grad_norm": 963.7595825195312, "learning_rate": 6.327562296538823e-05, "loss": 40.3493, "step": 14547 }, { "epoch": 38.42324199405744, "grad_norm": 1379.2008056640625, "learning_rate": 6.324784674226708e-05, "loss": 39.0443, "step": 14548 }, { "epoch": 38.42588312974579, "grad_norm": 1301.8785400390625, "learning_rate": 6.322007573409011e-05, "loss": 37.4428, "step": 14549 }, { "epoch": 38.428524265434135, "grad_norm": 1723.939453125, "learning_rate": 6.319230994163277e-05, "loss": 37.5049, "step": 14550 }, { "epoch": 38.431165401122485, "grad_norm": 2295.671875, "learning_rate": 6.31645493656704e-05, "loss": 38.935, "step": 14551 }, { "epoch": 38.43380653681083, "grad_norm": 1118.3406982421875, "learning_rate": 6.31367940069783e-05, "loss": 36.8072, "step": 14552 }, { "epoch": 38.43644767249918, "grad_norm": 1743.297607421875, "learning_rate": 6.310904386633149e-05, "loss": 36.1162, "step": 14553 }, { "epoch": 38.43908880818752, "grad_norm": 1303.81640625, "learning_rate": 6.308129894450481e-05, "loss": 35.9258, "step": 14554 }, { "epoch": 38.44172994387586, "grad_norm": 990.5267333984375, "learning_rate": 6.305355924227305e-05, "loss": 34.8096, "step": 14555 }, { "epoch": 38.44437107956421, "grad_norm": 953.7818603515625, "learning_rate": 6.302582476041086e-05, "loss": 36.3156, "step": 14556 }, { "epoch": 38.447012215252556, "grad_norm": 2821.49462890625, "learning_rate": 6.299809549969265e-05, "loss": 34.7083, "step": 14557 }, { "epoch": 38.449653350940906, "grad_norm": 799.6235961914062, "learning_rate": 6.297037146089271e-05, "loss": 35.8044, "step": 14558 }, { "epoch": 38.45229448662925, "grad_norm": 1055.0599365234375, "learning_rate": 6.294265264478535e-05, "loss": 36.1111, "step": 14559 }, { "epoch": 38.4549356223176, "grad_norm": 2221.058837890625, "learning_rate": 6.291493905214454e-05, "loss": 37.2003, "step": 14560 }, { "epoch": 38.45757675800594, "grad_norm": 2497.293701171875, "learning_rate": 6.288723068374408e-05, "loss": 15.5024, "step": 14561 }, { "epoch": 38.46021789369429, "grad_norm": 4200.3046875, "learning_rate": 6.285952754035784e-05, "loss": 13.069, "step": 14562 }, { "epoch": 38.462859029382635, "grad_norm": 1070.1954345703125, "learning_rate": 6.283182962275946e-05, "loss": 11.1482, "step": 14563 }, { "epoch": 38.46550016507098, "grad_norm": 6088.61328125, "learning_rate": 6.280413693172222e-05, "loss": 11.3567, "step": 14564 }, { "epoch": 38.46814130075933, "grad_norm": 43642.62109375, "learning_rate": 6.277644946801939e-05, "loss": 11.9998, "step": 14565 }, { "epoch": 38.47078243644767, "grad_norm": 8306.775390625, "learning_rate": 6.274876723242431e-05, "loss": 10.966, "step": 14566 }, { "epoch": 38.47342357213602, "grad_norm": 3577.93212890625, "learning_rate": 6.272109022570993e-05, "loss": 16.2099, "step": 14567 }, { "epoch": 38.476064707824364, "grad_norm": 17048.361328125, "learning_rate": 6.269341844864901e-05, "loss": 14.2731, "step": 14568 }, { "epoch": 38.478705843512714, "grad_norm": 2690.651123046875, "learning_rate": 6.26657519020144e-05, "loss": 10.5536, "step": 14569 }, { "epoch": 38.48134697920106, "grad_norm": 4896.6337890625, "learning_rate": 6.263809058657865e-05, "loss": 12.0307, "step": 14570 }, { "epoch": 38.4839881148894, "grad_norm": 1978.7899169921875, "learning_rate": 6.261043450311415e-05, "loss": 35.0221, "step": 14571 }, { "epoch": 38.48662925057775, "grad_norm": 2216.12353515625, "learning_rate": 6.25827836523932e-05, "loss": 34.9885, "step": 14572 }, { "epoch": 38.48927038626609, "grad_norm": 1487.26220703125, "learning_rate": 6.25551380351879e-05, "loss": 34.1289, "step": 14573 }, { "epoch": 38.49191152195444, "grad_norm": 2882.60009765625, "learning_rate": 6.252749765227026e-05, "loss": 34.9217, "step": 14574 }, { "epoch": 38.494552657642785, "grad_norm": 1429.216796875, "learning_rate": 6.249986250441206e-05, "loss": 34.3804, "step": 14575 }, { "epoch": 38.497193793331135, "grad_norm": 2153.037109375, "learning_rate": 6.24722325923851e-05, "loss": 35.0303, "step": 14576 }, { "epoch": 38.49983492901948, "grad_norm": 3281.633056640625, "learning_rate": 6.244460791696094e-05, "loss": 34.9267, "step": 14577 }, { "epoch": 38.50247606470782, "grad_norm": 3982.777587890625, "learning_rate": 6.241698847891089e-05, "loss": 34.241, "step": 14578 }, { "epoch": 38.50511720039617, "grad_norm": 1220.246337890625, "learning_rate": 6.238937427900617e-05, "loss": 34.0699, "step": 14579 }, { "epoch": 38.507758336084514, "grad_norm": 3965.611328125, "learning_rate": 6.236176531801813e-05, "loss": 36.444, "step": 14580 }, { "epoch": 38.510399471772864, "grad_norm": 1876.587158203125, "learning_rate": 6.233416159671748e-05, "loss": 33.8175, "step": 14581 }, { "epoch": 38.51304060746121, "grad_norm": 2016.4857177734375, "learning_rate": 6.230656311587504e-05, "loss": 35.2508, "step": 14582 }, { "epoch": 38.51568174314956, "grad_norm": 3226.304931640625, "learning_rate": 6.227896987626166e-05, "loss": 33.9782, "step": 14583 }, { "epoch": 38.5183228788379, "grad_norm": 2800.62109375, "learning_rate": 6.225138187864779e-05, "loss": 34.404, "step": 14584 }, { "epoch": 38.52096401452625, "grad_norm": 2138.020751953125, "learning_rate": 6.222379912380377e-05, "loss": 34.589, "step": 14585 }, { "epoch": 38.52360515021459, "grad_norm": 2013.8736572265625, "learning_rate": 6.219622161249977e-05, "loss": 36.5477, "step": 14586 }, { "epoch": 38.526246285902936, "grad_norm": 2388.060302734375, "learning_rate": 6.216864934550607e-05, "loss": 38.0237, "step": 14587 }, { "epoch": 38.528887421591286, "grad_norm": 2914.222412109375, "learning_rate": 6.214108232359244e-05, "loss": 42.0804, "step": 14588 }, { "epoch": 38.53152855727963, "grad_norm": 2137.321044921875, "learning_rate": 6.211352054752878e-05, "loss": 37.2381, "step": 14589 }, { "epoch": 38.53416969296798, "grad_norm": 1843.0711669921875, "learning_rate": 6.208596401808469e-05, "loss": 39.3854, "step": 14590 }, { "epoch": 38.53681082865632, "grad_norm": 1525.955810546875, "learning_rate": 6.205841273602963e-05, "loss": 38.7681, "step": 14591 }, { "epoch": 38.53945196434467, "grad_norm": 1490.2779541015625, "learning_rate": 6.203086670213304e-05, "loss": 38.9096, "step": 14592 }, { "epoch": 38.542093100033014, "grad_norm": 1231.122802734375, "learning_rate": 6.200332591716398e-05, "loss": 40.9843, "step": 14593 }, { "epoch": 38.54473423572136, "grad_norm": 1786.6514892578125, "learning_rate": 6.197579038189166e-05, "loss": 40.8787, "step": 14594 }, { "epoch": 38.54737537140971, "grad_norm": 2372.88037109375, "learning_rate": 6.194826009708498e-05, "loss": 39.722, "step": 14595 }, { "epoch": 38.55001650709805, "grad_norm": 1489.2200927734375, "learning_rate": 6.192073506351258e-05, "loss": 39.0976, "step": 14596 }, { "epoch": 38.5526576427864, "grad_norm": 1025.465087890625, "learning_rate": 6.189321528194324e-05, "loss": 40.4489, "step": 14597 }, { "epoch": 38.55529877847474, "grad_norm": 1989.8726806640625, "learning_rate": 6.186570075314543e-05, "loss": 40.1485, "step": 14598 }, { "epoch": 38.55793991416309, "grad_norm": 1712.888427734375, "learning_rate": 6.183819147788735e-05, "loss": 38.0642, "step": 14599 }, { "epoch": 38.560581049851436, "grad_norm": 2316.474609375, "learning_rate": 6.181068745693715e-05, "loss": 39.0695, "step": 14600 }, { "epoch": 38.560581049851436, "eval_loss": 3.7334887981414795, "eval_runtime": 2.1241, "eval_samples_per_second": 233.036, "eval_steps_per_second": 29.188, "step": 14600 }, { "epoch": 38.56322218553978, "grad_norm": 2468.188720703125, "learning_rate": 6.178318869106306e-05, "loss": 34.9097, "step": 14601 }, { "epoch": 38.56586332122813, "grad_norm": 1806.87548828125, "learning_rate": 6.175569518103285e-05, "loss": 35.6104, "step": 14602 }, { "epoch": 38.56850445691647, "grad_norm": 2238.684814453125, "learning_rate": 6.172820692761419e-05, "loss": 34.8877, "step": 14603 }, { "epoch": 38.57114559260482, "grad_norm": 1123.0054931640625, "learning_rate": 6.170072393157484e-05, "loss": 35.0762, "step": 14604 }, { "epoch": 38.573786728293165, "grad_norm": 1384.926513671875, "learning_rate": 6.167324619368214e-05, "loss": 34.5791, "step": 14605 }, { "epoch": 38.576427863981515, "grad_norm": 698.33154296875, "learning_rate": 6.164577371470343e-05, "loss": 34.738, "step": 14606 }, { "epoch": 38.57906899966986, "grad_norm": 1492.8128662109375, "learning_rate": 6.161830649540582e-05, "loss": 35.5892, "step": 14607 }, { "epoch": 38.58171013535821, "grad_norm": 1134.4493408203125, "learning_rate": 6.159084453655637e-05, "loss": 35.9785, "step": 14608 }, { "epoch": 38.58435127104655, "grad_norm": 954.0172729492188, "learning_rate": 6.156338783892187e-05, "loss": 34.9619, "step": 14609 }, { "epoch": 38.58699240673489, "grad_norm": 6993.79931640625, "learning_rate": 6.153593640326904e-05, "loss": 35.8738, "step": 14610 }, { "epoch": 38.58963354242324, "grad_norm": 22236.734375, "learning_rate": 6.150849023036453e-05, "loss": 10.5557, "step": 14611 }, { "epoch": 38.592274678111586, "grad_norm": 2846.656005859375, "learning_rate": 6.14810493209747e-05, "loss": 14.4928, "step": 14612 }, { "epoch": 38.594915813799936, "grad_norm": 2407.43603515625, "learning_rate": 6.145361367586583e-05, "loss": 11.4567, "step": 14613 }, { "epoch": 38.59755694948828, "grad_norm": 1471.498046875, "learning_rate": 6.142618329580397e-05, "loss": 8.8845, "step": 14614 }, { "epoch": 38.60019808517663, "grad_norm": 6675.5810546875, "learning_rate": 6.139875818155524e-05, "loss": 17.798, "step": 14615 }, { "epoch": 38.60283922086497, "grad_norm": 12062.2412109375, "learning_rate": 6.137133833388548e-05, "loss": 14.1144, "step": 14616 }, { "epoch": 38.605480356553315, "grad_norm": 4354.5732421875, "learning_rate": 6.134392375356012e-05, "loss": 12.8408, "step": 14617 }, { "epoch": 38.608121492241665, "grad_norm": 1101.593017578125, "learning_rate": 6.131651444134494e-05, "loss": 10.9162, "step": 14618 }, { "epoch": 38.61076262793001, "grad_norm": 17726.115234375, "learning_rate": 6.128911039800527e-05, "loss": 11.9838, "step": 14619 }, { "epoch": 38.61340376361836, "grad_norm": 6929.31298828125, "learning_rate": 6.126171162430635e-05, "loss": 35.6532, "step": 14620 }, { "epoch": 38.6160448993067, "grad_norm": 1482.1212158203125, "learning_rate": 6.123431812101318e-05, "loss": 34.0964, "step": 14621 }, { "epoch": 38.61868603499505, "grad_norm": 1023.6336059570312, "learning_rate": 6.120692988889085e-05, "loss": 33.0312, "step": 14622 }, { "epoch": 38.621327170683394, "grad_norm": 2089.694091796875, "learning_rate": 6.117954692870411e-05, "loss": 36.3282, "step": 14623 }, { "epoch": 38.62396830637174, "grad_norm": 2359.217041015625, "learning_rate": 6.115216924121761e-05, "loss": 34.287, "step": 14624 }, { "epoch": 38.62660944206009, "grad_norm": 1367.376953125, "learning_rate": 6.112479682719583e-05, "loss": 34.7236, "step": 14625 }, { "epoch": 38.62925057774843, "grad_norm": 1679.8482666015625, "learning_rate": 6.109742968740315e-05, "loss": 34.056, "step": 14626 }, { "epoch": 38.63189171343678, "grad_norm": 2114.705810546875, "learning_rate": 6.10700678226038e-05, "loss": 32.6045, "step": 14627 }, { "epoch": 38.63453284912512, "grad_norm": 2555.017578125, "learning_rate": 6.104271123356173e-05, "loss": 34.4836, "step": 14628 }, { "epoch": 38.63717398481347, "grad_norm": 2070.113525390625, "learning_rate": 6.1015359921041014e-05, "loss": 35.5603, "step": 14629 }, { "epoch": 38.639815120501815, "grad_norm": 1106.453369140625, "learning_rate": 6.098801388580538e-05, "loss": 34.8476, "step": 14630 }, { "epoch": 38.642456256190165, "grad_norm": 1712.67431640625, "learning_rate": 6.0960673128618335e-05, "loss": 35.2566, "step": 14631 }, { "epoch": 38.64509739187851, "grad_norm": 1745.4010009765625, "learning_rate": 6.0933337650243534e-05, "loss": 34.2034, "step": 14632 }, { "epoch": 38.64773852756685, "grad_norm": 1191.0943603515625, "learning_rate": 6.090600745144428e-05, "loss": 35.0328, "step": 14633 }, { "epoch": 38.6503796632552, "grad_norm": 1952.3759765625, "learning_rate": 6.087868253298362e-05, "loss": 34.2856, "step": 14634 }, { "epoch": 38.653020798943544, "grad_norm": 1315.3885498046875, "learning_rate": 6.0851362895624565e-05, "loss": 33.3737, "step": 14635 }, { "epoch": 38.655661934631894, "grad_norm": 2508.049072265625, "learning_rate": 6.082404854013019e-05, "loss": 35.8123, "step": 14636 }, { "epoch": 38.65830307032024, "grad_norm": 13173.3662109375, "learning_rate": 6.079673946726311e-05, "loss": 36.8578, "step": 14637 }, { "epoch": 38.66094420600859, "grad_norm": 6298.8642578125, "learning_rate": 6.076943567778587e-05, "loss": 41.0095, "step": 14638 }, { "epoch": 38.66358534169693, "grad_norm": 1766.1092529296875, "learning_rate": 6.074213717246105e-05, "loss": 39.214, "step": 14639 }, { "epoch": 38.66622647738527, "grad_norm": 1828.560791015625, "learning_rate": 6.071484395205087e-05, "loss": 38.2411, "step": 14640 }, { "epoch": 38.66886761307362, "grad_norm": 1663.25927734375, "learning_rate": 6.0687556017317485e-05, "loss": 39.258, "step": 14641 }, { "epoch": 38.671508748761966, "grad_norm": 4026.165771484375, "learning_rate": 6.0660273369022886e-05, "loss": 42.298, "step": 14642 }, { "epoch": 38.674149884450316, "grad_norm": 1304.099365234375, "learning_rate": 6.063299600792893e-05, "loss": 42.3026, "step": 14643 }, { "epoch": 38.67679102013866, "grad_norm": 947.7264404296875, "learning_rate": 6.060572393479732e-05, "loss": 46.4053, "step": 14644 }, { "epoch": 38.67943215582701, "grad_norm": 2309.553466796875, "learning_rate": 6.057845715038954e-05, "loss": 40.3215, "step": 14645 }, { "epoch": 38.68207329151535, "grad_norm": 1841.72802734375, "learning_rate": 6.055119565546715e-05, "loss": 39.3352, "step": 14646 }, { "epoch": 38.684714427203694, "grad_norm": 1444.0455322265625, "learning_rate": 6.052393945079132e-05, "loss": 37.1385, "step": 14647 }, { "epoch": 38.687355562892044, "grad_norm": 1298.912109375, "learning_rate": 6.04966885371232e-05, "loss": 38.4548, "step": 14648 }, { "epoch": 38.68999669858039, "grad_norm": 1218.4498291015625, "learning_rate": 6.046944291522366e-05, "loss": 36.9344, "step": 14649 }, { "epoch": 38.69263783426874, "grad_norm": 970.095458984375, "learning_rate": 6.0442202585853646e-05, "loss": 39.4224, "step": 14650 }, { "epoch": 38.69527896995708, "grad_norm": 2048.044921875, "learning_rate": 6.041496754977385e-05, "loss": 36.3873, "step": 14651 }, { "epoch": 38.69792010564543, "grad_norm": 1148.441650390625, "learning_rate": 6.038773780774456e-05, "loss": 36.8487, "step": 14652 }, { "epoch": 38.70056124133377, "grad_norm": 1877.907470703125, "learning_rate": 6.0360513360526396e-05, "loss": 35.8269, "step": 14653 }, { "epoch": 38.70320237702212, "grad_norm": 1733.173583984375, "learning_rate": 6.033329420887948e-05, "loss": 35.9452, "step": 14654 }, { "epoch": 38.705843512710466, "grad_norm": 1519.2669677734375, "learning_rate": 6.030608035356391e-05, "loss": 35.9257, "step": 14655 }, { "epoch": 38.70848464839881, "grad_norm": 1640.931884765625, "learning_rate": 6.027887179533956e-05, "loss": 35.4978, "step": 14656 }, { "epoch": 38.71112578408716, "grad_norm": 1079.140625, "learning_rate": 6.025166853496633e-05, "loss": 33.347, "step": 14657 }, { "epoch": 38.7137669197755, "grad_norm": 1553.8568115234375, "learning_rate": 6.0224470573203794e-05, "loss": 34.4617, "step": 14658 }, { "epoch": 38.71640805546385, "grad_norm": 1219.4849853515625, "learning_rate": 6.019727791081142e-05, "loss": 35.1194, "step": 14659 }, { "epoch": 38.719049191152195, "grad_norm": 1219.4229736328125, "learning_rate": 6.017009054854858e-05, "loss": 35.4243, "step": 14660 }, { "epoch": 38.721690326840545, "grad_norm": 4088.210205078125, "learning_rate": 6.014290848717446e-05, "loss": 27.1198, "step": 14661 }, { "epoch": 38.72433146252889, "grad_norm": 2121.11572265625, "learning_rate": 6.0115731727448e-05, "loss": 11.8747, "step": 14662 }, { "epoch": 38.72697259821723, "grad_norm": 2925.38720703125, "learning_rate": 6.008856027012827e-05, "loss": 9.3214, "step": 14663 }, { "epoch": 38.72961373390558, "grad_norm": 1430.3721923828125, "learning_rate": 6.0061394115973946e-05, "loss": 10.3138, "step": 14664 }, { "epoch": 38.73225486959392, "grad_norm": 2119.56396484375, "learning_rate": 6.003423326574359e-05, "loss": 11.4752, "step": 14665 }, { "epoch": 38.73489600528227, "grad_norm": 4134.26513671875, "learning_rate": 6.000707772019562e-05, "loss": 15.7742, "step": 14666 }, { "epoch": 38.737537140970616, "grad_norm": 1251.3914794921875, "learning_rate": 5.997992748008846e-05, "loss": 9.4426, "step": 14667 }, { "epoch": 38.740178276658966, "grad_norm": 621.89697265625, "learning_rate": 5.995278254618028e-05, "loss": 17.3179, "step": 14668 }, { "epoch": 38.74281941234731, "grad_norm": 3224.283447265625, "learning_rate": 5.992564291922886e-05, "loss": 18.323, "step": 14669 }, { "epoch": 38.74546054803565, "grad_norm": 3630.193603515625, "learning_rate": 5.989850859999227e-05, "loss": 16.7176, "step": 14670 }, { "epoch": 38.748101683724, "grad_norm": 1626.3985595703125, "learning_rate": 5.987137958922817e-05, "loss": 34.7994, "step": 14671 }, { "epoch": 38.750742819412345, "grad_norm": 3352.2998046875, "learning_rate": 5.98442558876941e-05, "loss": 34.0632, "step": 14672 }, { "epoch": 38.753383955100695, "grad_norm": 3949.03955078125, "learning_rate": 5.981713749614742e-05, "loss": 33.9306, "step": 14673 }, { "epoch": 38.75602509078904, "grad_norm": 1020.3318481445312, "learning_rate": 5.9790024415345534e-05, "loss": 35.5449, "step": 14674 }, { "epoch": 38.75866622647739, "grad_norm": 1251.44921875, "learning_rate": 5.976291664604547e-05, "loss": 34.5579, "step": 14675 }, { "epoch": 38.76130736216573, "grad_norm": 882.1839599609375, "learning_rate": 5.97358141890042e-05, "loss": 34.4451, "step": 14676 }, { "epoch": 38.76394849785408, "grad_norm": 1628.789794921875, "learning_rate": 5.970871704497855e-05, "loss": 34.8558, "step": 14677 }, { "epoch": 38.766589633542424, "grad_norm": 5757.5673828125, "learning_rate": 5.96816252147252e-05, "loss": 35.5853, "step": 14678 }, { "epoch": 38.76923076923077, "grad_norm": 1291.5894775390625, "learning_rate": 5.965453869900067e-05, "loss": 35.5689, "step": 14679 }, { "epoch": 38.77187190491912, "grad_norm": 1965.88525390625, "learning_rate": 5.962745749856124e-05, "loss": 34.6643, "step": 14680 }, { "epoch": 38.77451304060746, "grad_norm": 1629.1285400390625, "learning_rate": 5.960038161416331e-05, "loss": 36.3302, "step": 14681 }, { "epoch": 38.77715417629581, "grad_norm": 2040.386962890625, "learning_rate": 5.957331104656291e-05, "loss": 34.0947, "step": 14682 }, { "epoch": 38.77979531198415, "grad_norm": 2815.33447265625, "learning_rate": 5.954624579651583e-05, "loss": 34.4084, "step": 14683 }, { "epoch": 38.7824364476725, "grad_norm": 1574.3243408203125, "learning_rate": 5.951918586477803e-05, "loss": 33.7664, "step": 14684 }, { "epoch": 38.785077583360845, "grad_norm": 2952.67431640625, "learning_rate": 5.949213125210509e-05, "loss": 33.9279, "step": 14685 }, { "epoch": 38.78771871904919, "grad_norm": 2533.20166015625, "learning_rate": 5.9465081959252516e-05, "loss": 34.3672, "step": 14686 }, { "epoch": 38.79035985473754, "grad_norm": 885.0406494140625, "learning_rate": 5.943803798697547e-05, "loss": 35.637, "step": 14687 }, { "epoch": 38.79300099042588, "grad_norm": 1863.1484375, "learning_rate": 5.941099933602936e-05, "loss": 39.9051, "step": 14688 }, { "epoch": 38.79564212611423, "grad_norm": 3384.7021484375, "learning_rate": 5.938396600716914e-05, "loss": 37.6714, "step": 14689 }, { "epoch": 38.798283261802574, "grad_norm": 4438.951171875, "learning_rate": 5.9356938001149605e-05, "loss": 39.1007, "step": 14690 }, { "epoch": 38.800924397490924, "grad_norm": 1497.9530029296875, "learning_rate": 5.932991531872564e-05, "loss": 38.4022, "step": 14691 }, { "epoch": 38.80356553317927, "grad_norm": 1492.6531982421875, "learning_rate": 5.930289796065183e-05, "loss": 40.0467, "step": 14692 }, { "epoch": 38.80620666886761, "grad_norm": 1186.5927734375, "learning_rate": 5.9275885927682544e-05, "loss": 39.1378, "step": 14693 }, { "epoch": 38.80884780455596, "grad_norm": 669.230712890625, "learning_rate": 5.924887922057209e-05, "loss": 42.1318, "step": 14694 }, { "epoch": 38.8114889402443, "grad_norm": 854.2947387695312, "learning_rate": 5.922187784007465e-05, "loss": 42.8436, "step": 14695 }, { "epoch": 38.81413007593265, "grad_norm": 1093.438232421875, "learning_rate": 5.919488178694421e-05, "loss": 40.3322, "step": 14696 }, { "epoch": 38.816771211620996, "grad_norm": 822.1630249023438, "learning_rate": 5.916789106193454e-05, "loss": 39.7499, "step": 14697 }, { "epoch": 38.819412347309346, "grad_norm": 3387.062744140625, "learning_rate": 5.914090566579947e-05, "loss": 37.1386, "step": 14698 }, { "epoch": 38.82205348299769, "grad_norm": 1084.6898193359375, "learning_rate": 5.911392559929249e-05, "loss": 38.8518, "step": 14699 }, { "epoch": 38.82469461868604, "grad_norm": 7999.142578125, "learning_rate": 5.908695086316701e-05, "loss": 38.9766, "step": 14700 }, { "epoch": 38.82733575437438, "grad_norm": 1045.587646484375, "learning_rate": 5.905998145817623e-05, "loss": 34.9878, "step": 14701 }, { "epoch": 38.829976890062724, "grad_norm": 1254.458984375, "learning_rate": 5.903301738507336e-05, "loss": 35.1396, "step": 14702 }, { "epoch": 38.832618025751074, "grad_norm": 2110.542724609375, "learning_rate": 5.900605864461137e-05, "loss": 36.5482, "step": 14703 }, { "epoch": 38.83525916143942, "grad_norm": 944.9866333007812, "learning_rate": 5.897910523754283e-05, "loss": 34.8637, "step": 14704 }, { "epoch": 38.83790029712777, "grad_norm": 1543.8040771484375, "learning_rate": 5.8952157164620654e-05, "loss": 35.3145, "step": 14705 }, { "epoch": 38.84054143281611, "grad_norm": 2437.295654296875, "learning_rate": 5.892521442659726e-05, "loss": 33.5218, "step": 14706 }, { "epoch": 38.84318256850446, "grad_norm": 1139.3338623046875, "learning_rate": 5.889827702422501e-05, "loss": 35.5183, "step": 14707 }, { "epoch": 38.8458237041928, "grad_norm": 2116.25341796875, "learning_rate": 5.887134495825605e-05, "loss": 34.7809, "step": 14708 }, { "epoch": 38.848464839881146, "grad_norm": 1600.5400390625, "learning_rate": 5.884441822944259e-05, "loss": 36.3187, "step": 14709 }, { "epoch": 38.851105975569496, "grad_norm": 2024.6435546875, "learning_rate": 5.8817496838536434e-05, "loss": 33.7871, "step": 14710 }, { "epoch": 38.85374711125784, "grad_norm": 1476.4039306640625, "learning_rate": 5.879058078628938e-05, "loss": 9.7021, "step": 14711 }, { "epoch": 38.85638824694619, "grad_norm": 596.2813720703125, "learning_rate": 5.876367007345304e-05, "loss": 9.5492, "step": 14712 }, { "epoch": 38.85902938263453, "grad_norm": 3239.010009765625, "learning_rate": 5.8736764700778877e-05, "loss": 12.1744, "step": 14713 }, { "epoch": 38.86167051832288, "grad_norm": 1496.6875, "learning_rate": 5.8709864669018206e-05, "loss": 12.2178, "step": 14714 }, { "epoch": 38.864311654011225, "grad_norm": 818.6159057617188, "learning_rate": 5.868296997892211e-05, "loss": 12.9181, "step": 14715 }, { "epoch": 38.86695278969957, "grad_norm": 2276.77197265625, "learning_rate": 5.8656080631241784e-05, "loss": 11.2778, "step": 14716 }, { "epoch": 38.86959392538792, "grad_norm": 15779.953125, "learning_rate": 5.8629196626728e-05, "loss": 11.6224, "step": 14717 }, { "epoch": 38.87223506107626, "grad_norm": 2085.284912109375, "learning_rate": 5.860231796613138e-05, "loss": 9.3917, "step": 14718 }, { "epoch": 38.87487619676461, "grad_norm": 809.4859008789062, "learning_rate": 5.8575444650202706e-05, "loss": 20.7657, "step": 14719 }, { "epoch": 38.87751733245295, "grad_norm": 1208.9300537109375, "learning_rate": 5.854857667969227e-05, "loss": 35.6518, "step": 14720 }, { "epoch": 38.8801584681413, "grad_norm": 1514.1583251953125, "learning_rate": 5.852171405535045e-05, "loss": 36.5195, "step": 14721 }, { "epoch": 38.882799603829646, "grad_norm": 952.3965454101562, "learning_rate": 5.849485677792712e-05, "loss": 35.0299, "step": 14722 }, { "epoch": 38.885440739517996, "grad_norm": 1380.4503173828125, "learning_rate": 5.8468004848172526e-05, "loss": 34.0897, "step": 14723 }, { "epoch": 38.88808187520634, "grad_norm": 1179.1019287109375, "learning_rate": 5.8441158266836355e-05, "loss": 35.0566, "step": 14724 }, { "epoch": 38.89072301089468, "grad_norm": 1695.8607177734375, "learning_rate": 5.841431703466826e-05, "loss": 33.96, "step": 14725 }, { "epoch": 38.89336414658303, "grad_norm": 2434.529296875, "learning_rate": 5.838748115241788e-05, "loss": 33.8534, "step": 14726 }, { "epoch": 38.896005282271375, "grad_norm": 1445.5216064453125, "learning_rate": 5.8360650620834535e-05, "loss": 33.0344, "step": 14727 }, { "epoch": 38.898646417959725, "grad_norm": 1434.6590576171875, "learning_rate": 5.833382544066743e-05, "loss": 33.9123, "step": 14728 }, { "epoch": 38.90128755364807, "grad_norm": 744.00244140625, "learning_rate": 5.830700561266566e-05, "loss": 33.915, "step": 14729 }, { "epoch": 38.90392868933642, "grad_norm": 1354.4224853515625, "learning_rate": 5.828019113757818e-05, "loss": 35.0748, "step": 14730 }, { "epoch": 38.90656982502476, "grad_norm": 1684.7840576171875, "learning_rate": 5.825338201615371e-05, "loss": 34.3112, "step": 14731 }, { "epoch": 38.909210960713104, "grad_norm": 1307.74462890625, "learning_rate": 5.822657824914085e-05, "loss": 35.5405, "step": 14732 }, { "epoch": 38.911852096401454, "grad_norm": 5046.3134765625, "learning_rate": 5.819977983728822e-05, "loss": 35.1196, "step": 14733 }, { "epoch": 38.9144932320898, "grad_norm": 1206.98095703125, "learning_rate": 5.817298678134405e-05, "loss": 34.4994, "step": 14734 }, { "epoch": 38.91713436777815, "grad_norm": 789.59375, "learning_rate": 5.814619908205657e-05, "loss": 34.6098, "step": 14735 }, { "epoch": 38.91977550346649, "grad_norm": 2882.898193359375, "learning_rate": 5.811941674017368e-05, "loss": 35.3425, "step": 14736 }, { "epoch": 38.92241663915484, "grad_norm": 2026.5068359375, "learning_rate": 5.809263975644344e-05, "loss": 36.9576, "step": 14737 }, { "epoch": 38.92505777484318, "grad_norm": 1891.0565185546875, "learning_rate": 5.8065868131613546e-05, "loss": 41.996, "step": 14738 }, { "epoch": 38.927698910531525, "grad_norm": 1173.1427001953125, "learning_rate": 5.803910186643144e-05, "loss": 38.2945, "step": 14739 }, { "epoch": 38.930340046219875, "grad_norm": 871.5755615234375, "learning_rate": 5.801234096164468e-05, "loss": 41.9727, "step": 14740 }, { "epoch": 38.93298118190822, "grad_norm": 1945.5457763671875, "learning_rate": 5.7985585418000556e-05, "loss": 40.9073, "step": 14741 }, { "epoch": 38.93562231759657, "grad_norm": 704.5970458984375, "learning_rate": 5.795883523624615e-05, "loss": 42.2693, "step": 14742 }, { "epoch": 38.93826345328491, "grad_norm": 2104.8779296875, "learning_rate": 5.793209041712838e-05, "loss": 40.7578, "step": 14743 }, { "epoch": 38.94090458897326, "grad_norm": 1379.920654296875, "learning_rate": 5.790535096139424e-05, "loss": 38.8713, "step": 14744 }, { "epoch": 38.943545724661604, "grad_norm": 1106.0498046875, "learning_rate": 5.787861686979032e-05, "loss": 36.2782, "step": 14745 }, { "epoch": 38.946186860349954, "grad_norm": 786.6151123046875, "learning_rate": 5.785188814306316e-05, "loss": 35.5928, "step": 14746 }, { "epoch": 38.9488279960383, "grad_norm": 1041.849853515625, "learning_rate": 5.782516478195918e-05, "loss": 36.0296, "step": 14747 }, { "epoch": 38.95146913172664, "grad_norm": 16484.345703125, "learning_rate": 5.7798446787224565e-05, "loss": 35.3032, "step": 14748 }, { "epoch": 38.95411026741499, "grad_norm": 2359.146240234375, "learning_rate": 5.77717341596054e-05, "loss": 34.6545, "step": 14749 }, { "epoch": 38.95675140310333, "grad_norm": 1017.6690063476562, "learning_rate": 5.7745026899847594e-05, "loss": 35.6542, "step": 14750 }, { "epoch": 38.95939253879168, "grad_norm": 1238.0477294921875, "learning_rate": 5.771832500869703e-05, "loss": 25.9904, "step": 14751 }, { "epoch": 38.962033674480026, "grad_norm": 856.5126953125, "learning_rate": 5.76916284868993e-05, "loss": 8.6018, "step": 14752 }, { "epoch": 38.964674810168376, "grad_norm": 953.209716796875, "learning_rate": 5.7664937335199816e-05, "loss": 8.3629, "step": 14753 }, { "epoch": 38.96731594585672, "grad_norm": 1349.41259765625, "learning_rate": 5.763825155434402e-05, "loss": 8.4083, "step": 14754 }, { "epoch": 38.96995708154506, "grad_norm": 640.6498413085938, "learning_rate": 5.761157114507706e-05, "loss": 17.4263, "step": 14755 }, { "epoch": 38.97259821723341, "grad_norm": 1791.0716552734375, "learning_rate": 5.758489610814402e-05, "loss": 12.1173, "step": 14756 }, { "epoch": 38.975239352921754, "grad_norm": 3993.6640625, "learning_rate": 5.7558226444289596e-05, "loss": 28.5249, "step": 14757 }, { "epoch": 38.977880488610104, "grad_norm": 1094.814208984375, "learning_rate": 5.7531562154258726e-05, "loss": 33.8722, "step": 14758 }, { "epoch": 38.98052162429845, "grad_norm": 1718.4288330078125, "learning_rate": 5.750490323879592e-05, "loss": 33.4806, "step": 14759 }, { "epoch": 38.9831627599868, "grad_norm": 2656.663818359375, "learning_rate": 5.7478249698645543e-05, "loss": 33.9613, "step": 14760 }, { "epoch": 38.98580389567514, "grad_norm": 497.5028381347656, "learning_rate": 5.7451601534552003e-05, "loss": 33.104, "step": 14761 }, { "epoch": 38.98844503136348, "grad_norm": 4253.28857421875, "learning_rate": 5.742495874725939e-05, "loss": 34.1722, "step": 14762 }, { "epoch": 38.99108616705183, "grad_norm": 834.8242797851562, "learning_rate": 5.739832133751169e-05, "loss": 33.4558, "step": 14763 }, { "epoch": 38.993727302740176, "grad_norm": 3178.8349609375, "learning_rate": 5.737168930605272e-05, "loss": 34.7659, "step": 14764 }, { "epoch": 38.996368438428526, "grad_norm": 710.784423828125, "learning_rate": 5.7345062653626184e-05, "loss": 34.6241, "step": 14765 }, { "epoch": 38.99900957411687, "grad_norm": 1926.997802734375, "learning_rate": 5.731844138097558e-05, "loss": 36.0133, "step": 14766 }, { "epoch": 39.00165070980522, "grad_norm": 5033.271484375, "learning_rate": 5.7291825488844236e-05, "loss": 39.7541, "step": 14767 }, { "epoch": 39.00429184549356, "grad_norm": 1691.376953125, "learning_rate": 5.726521497797555e-05, "loss": 40.9763, "step": 14768 }, { "epoch": 39.00693298118191, "grad_norm": 1765.4437255859375, "learning_rate": 5.7238609849112524e-05, "loss": 37.8821, "step": 14769 }, { "epoch": 39.009574116870255, "grad_norm": 1780.60498046875, "learning_rate": 5.721201010299806e-05, "loss": 39.1625, "step": 14770 }, { "epoch": 39.0122152525586, "grad_norm": 1870.78271484375, "learning_rate": 5.718541574037492e-05, "loss": 41.3134, "step": 14771 }, { "epoch": 39.01485638824695, "grad_norm": 2124.372802734375, "learning_rate": 5.7158826761985835e-05, "loss": 40.144, "step": 14772 }, { "epoch": 39.01749752393529, "grad_norm": 1852.91015625, "learning_rate": 5.71322431685733e-05, "loss": 41.0459, "step": 14773 }, { "epoch": 39.02013865962364, "grad_norm": 3492.799560546875, "learning_rate": 5.710566496087946e-05, "loss": 41.46, "step": 14774 }, { "epoch": 39.02277979531198, "grad_norm": 978.2021484375, "learning_rate": 5.7079092139646655e-05, "loss": 39.6026, "step": 14775 }, { "epoch": 39.02542093100033, "grad_norm": 1423.2315673828125, "learning_rate": 5.7052524705616896e-05, "loss": 37.926, "step": 14776 }, { "epoch": 39.028062066688676, "grad_norm": 948.085693359375, "learning_rate": 5.702596265953203e-05, "loss": 40.7419, "step": 14777 }, { "epoch": 39.03070320237702, "grad_norm": 1518.833740234375, "learning_rate": 5.6999406002133715e-05, "loss": 37.2656, "step": 14778 }, { "epoch": 39.03334433806537, "grad_norm": 2190.39111328125, "learning_rate": 5.697285473416369e-05, "loss": 38.0558, "step": 14779 }, { "epoch": 39.03598547375371, "grad_norm": 1324.6220703125, "learning_rate": 5.6946308856363316e-05, "loss": 36.9666, "step": 14780 }, { "epoch": 39.03862660944206, "grad_norm": 1241.6646728515625, "learning_rate": 5.691976836947388e-05, "loss": 34.8491, "step": 14781 }, { "epoch": 39.041267745130405, "grad_norm": 1199.0279541015625, "learning_rate": 5.6893233274236454e-05, "loss": 35.8052, "step": 14782 }, { "epoch": 39.043908880818755, "grad_norm": 1272.9715576171875, "learning_rate": 5.6866703571392065e-05, "loss": 33.8932, "step": 14783 }, { "epoch": 39.0465500165071, "grad_norm": 1436.1221923828125, "learning_rate": 5.6840179261681526e-05, "loss": 34.8368, "step": 14784 }, { "epoch": 39.04919115219544, "grad_norm": 1126.8404541015625, "learning_rate": 5.6813660345845455e-05, "loss": 35.3871, "step": 14785 }, { "epoch": 39.05183228788379, "grad_norm": 1885.0164794921875, "learning_rate": 5.678714682462449e-05, "loss": 34.5842, "step": 14786 }, { "epoch": 39.054473423572134, "grad_norm": 2252.646484375, "learning_rate": 5.6760638698758946e-05, "loss": 34.9517, "step": 14787 }, { "epoch": 39.057114559260484, "grad_norm": 10619.8203125, "learning_rate": 5.6734135968988993e-05, "loss": 38.2973, "step": 14788 }, { "epoch": 39.05975569494883, "grad_norm": 1546.447021484375, "learning_rate": 5.6707638636054824e-05, "loss": 16.5593, "step": 14789 }, { "epoch": 39.06239683063718, "grad_norm": 2389.331298828125, "learning_rate": 5.6681146700696324e-05, "loss": 13.3393, "step": 14790 }, { "epoch": 39.06503796632552, "grad_norm": 4836.38330078125, "learning_rate": 5.66546601636532e-05, "loss": 11.3028, "step": 14791 }, { "epoch": 39.06767910201387, "grad_norm": 7348.0732421875, "learning_rate": 5.6628179025665136e-05, "loss": 11.3051, "step": 14792 }, { "epoch": 39.07032023770221, "grad_norm": 1715.992919921875, "learning_rate": 5.6601703287471604e-05, "loss": 10.8686, "step": 14793 }, { "epoch": 39.072961373390555, "grad_norm": 2329.36962890625, "learning_rate": 5.657523294981187e-05, "loss": 13.9349, "step": 14794 }, { "epoch": 39.075602509078905, "grad_norm": 1322.14697265625, "learning_rate": 5.654876801342507e-05, "loss": 9.8191, "step": 14795 }, { "epoch": 39.07824364476725, "grad_norm": 3361.409423828125, "learning_rate": 5.652230847905035e-05, "loss": 10.1212, "step": 14796 }, { "epoch": 39.0808847804556, "grad_norm": 3912.093994140625, "learning_rate": 5.649585434742652e-05, "loss": 13.3713, "step": 14797 }, { "epoch": 39.08352591614394, "grad_norm": 4366.04052734375, "learning_rate": 5.6469405619292255e-05, "loss": 11.5226, "step": 14798 }, { "epoch": 39.08616705183229, "grad_norm": 4553.841796875, "learning_rate": 5.644296229538618e-05, "loss": 32.457, "step": 14799 }, { "epoch": 39.088808187520634, "grad_norm": 1733.816162109375, "learning_rate": 5.641652437644667e-05, "loss": 34.4469, "step": 14800 }, { "epoch": 39.088808187520634, "eval_loss": 3.748194932937622, "eval_runtime": 2.2188, "eval_samples_per_second": 223.096, "eval_steps_per_second": 27.943, "step": 14800 }, { "epoch": 39.09144932320898, "grad_norm": 1281.0418701171875, "learning_rate": 5.639009186321203e-05, "loss": 34.8131, "step": 14801 }, { "epoch": 39.09409045889733, "grad_norm": 1950.681396484375, "learning_rate": 5.636366475642024e-05, "loss": 33.8214, "step": 14802 }, { "epoch": 39.09673159458567, "grad_norm": 959.064208984375, "learning_rate": 5.6337243056809454e-05, "loss": 34.1479, "step": 14803 }, { "epoch": 39.09937273027402, "grad_norm": 1973.56298828125, "learning_rate": 5.631082676511737e-05, "loss": 34.626, "step": 14804 }, { "epoch": 39.10201386596236, "grad_norm": 2760.51318359375, "learning_rate": 5.6284415882081606e-05, "loss": 34.8945, "step": 14805 }, { "epoch": 39.10465500165071, "grad_norm": 1418.2076416015625, "learning_rate": 5.6258010408439805e-05, "loss": 35.1301, "step": 14806 }, { "epoch": 39.107296137339056, "grad_norm": 1936.782470703125, "learning_rate": 5.623161034492927e-05, "loss": 34.7162, "step": 14807 }, { "epoch": 39.1099372730274, "grad_norm": 1655.59228515625, "learning_rate": 5.6205215692287234e-05, "loss": 34.3155, "step": 14808 }, { "epoch": 39.11257840871575, "grad_norm": 2938.766845703125, "learning_rate": 5.617882645125058e-05, "loss": 35.2, "step": 14809 }, { "epoch": 39.11521954440409, "grad_norm": 1263.080078125, "learning_rate": 5.615244262255642e-05, "loss": 34.4431, "step": 14810 }, { "epoch": 39.11786068009244, "grad_norm": 1157.9139404296875, "learning_rate": 5.612606420694141e-05, "loss": 36.1428, "step": 14811 }, { "epoch": 39.120501815780784, "grad_norm": 2652.23046875, "learning_rate": 5.6099691205142106e-05, "loss": 33.8418, "step": 14812 }, { "epoch": 39.123142951469134, "grad_norm": 1125.084716796875, "learning_rate": 5.6073323617895106e-05, "loss": 33.8256, "step": 14813 }, { "epoch": 39.12578408715748, "grad_norm": 1805.0166015625, "learning_rate": 5.604696144593663e-05, "loss": 36.5836, "step": 14814 }, { "epoch": 39.12842522284583, "grad_norm": 4811.54638671875, "learning_rate": 5.602060469000278e-05, "loss": 36.3183, "step": 14815 }, { "epoch": 39.13106635853417, "grad_norm": 1101.2076416015625, "learning_rate": 5.599425335082964e-05, "loss": 38.0037, "step": 14816 }, { "epoch": 39.13370749422251, "grad_norm": 842.8501586914062, "learning_rate": 5.596790742915297e-05, "loss": 40.2507, "step": 14817 }, { "epoch": 39.13634862991086, "grad_norm": 1253.9501953125, "learning_rate": 5.594156692570854e-05, "loss": 38.497, "step": 14818 }, { "epoch": 39.138989765599206, "grad_norm": 804.4869384765625, "learning_rate": 5.591523184123176e-05, "loss": 38.2572, "step": 14819 }, { "epoch": 39.141630901287556, "grad_norm": 1220.04052734375, "learning_rate": 5.58889021764582e-05, "loss": 38.4128, "step": 14820 }, { "epoch": 39.1442720369759, "grad_norm": 2014.4451904296875, "learning_rate": 5.5862577932123045e-05, "loss": 41.5456, "step": 14821 }, { "epoch": 39.14691317266425, "grad_norm": 1009.2052612304688, "learning_rate": 5.583625910896131e-05, "loss": 42.2615, "step": 14822 }, { "epoch": 39.14955430835259, "grad_norm": 1345.3299560546875, "learning_rate": 5.5809945707707946e-05, "loss": 41.5239, "step": 14823 }, { "epoch": 39.152195444040935, "grad_norm": 1424.7193603515625, "learning_rate": 5.578363772909783e-05, "loss": 40.7903, "step": 14824 }, { "epoch": 39.154836579729285, "grad_norm": 2889.557373046875, "learning_rate": 5.575733517386555e-05, "loss": 41.0159, "step": 14825 }, { "epoch": 39.15747771541763, "grad_norm": 786.556884765625, "learning_rate": 5.573103804274557e-05, "loss": 39.8204, "step": 14826 }, { "epoch": 39.16011885110598, "grad_norm": 853.8578491210938, "learning_rate": 5.570474633647224e-05, "loss": 39.2966, "step": 14827 }, { "epoch": 39.16275998679432, "grad_norm": 1016.1546020507812, "learning_rate": 5.567846005577973e-05, "loss": 37.806, "step": 14828 }, { "epoch": 39.16540112248267, "grad_norm": 2285.242919921875, "learning_rate": 5.5652179201402085e-05, "loss": 38.0543, "step": 14829 }, { "epoch": 39.16804225817101, "grad_norm": 1575.657470703125, "learning_rate": 5.562590377407309e-05, "loss": 36.9056, "step": 14830 }, { "epoch": 39.170683393859356, "grad_norm": 1423.3485107421875, "learning_rate": 5.559963377452662e-05, "loss": 35.1639, "step": 14831 }, { "epoch": 39.173324529547706, "grad_norm": 737.248779296875, "learning_rate": 5.5573369203496177e-05, "loss": 34.5491, "step": 14832 }, { "epoch": 39.17596566523605, "grad_norm": 674.1100463867188, "learning_rate": 5.55471100617152e-05, "loss": 35.8199, "step": 14833 }, { "epoch": 39.1786068009244, "grad_norm": 1260.0333251953125, "learning_rate": 5.552085634991694e-05, "loss": 34.9326, "step": 14834 }, { "epoch": 39.18124793661274, "grad_norm": 1685.6416015625, "learning_rate": 5.549460806883452e-05, "loss": 34.5587, "step": 14835 }, { "epoch": 39.18388907230109, "grad_norm": 2182.74853515625, "learning_rate": 5.54683652192009e-05, "loss": 34.4197, "step": 14836 }, { "epoch": 39.186530207989435, "grad_norm": 1551.64599609375, "learning_rate": 5.544212780174884e-05, "loss": 35.6889, "step": 14837 }, { "epoch": 39.189171343677785, "grad_norm": 1854.5631103515625, "learning_rate": 5.541589581721115e-05, "loss": 35.6289, "step": 14838 }, { "epoch": 39.19181247936613, "grad_norm": 2899.630859375, "learning_rate": 5.5389669266320275e-05, "loss": 46.7853, "step": 14839 }, { "epoch": 39.19445361505447, "grad_norm": 2880.2265625, "learning_rate": 5.536344814980848e-05, "loss": 14.9794, "step": 14840 }, { "epoch": 39.19709475074282, "grad_norm": 1332.6561279296875, "learning_rate": 5.533723246840816e-05, "loss": 9.461, "step": 14841 }, { "epoch": 39.199735886431164, "grad_norm": 1250.087890625, "learning_rate": 5.5311022222851224e-05, "loss": 13.5208, "step": 14842 }, { "epoch": 39.202377022119514, "grad_norm": 9899.8330078125, "learning_rate": 5.5284817413869733e-05, "loss": 9.473, "step": 14843 }, { "epoch": 39.20501815780786, "grad_norm": 1490.3577880859375, "learning_rate": 5.525861804219517e-05, "loss": 14.4674, "step": 14844 }, { "epoch": 39.20765929349621, "grad_norm": 421.6707458496094, "learning_rate": 5.5232424108559384e-05, "loss": 10.5831, "step": 14845 }, { "epoch": 39.21030042918455, "grad_norm": 3518.21435546875, "learning_rate": 5.520623561369376e-05, "loss": 8.4857, "step": 14846 }, { "epoch": 39.21294156487289, "grad_norm": 6201.3154296875, "learning_rate": 5.518005255832948e-05, "loss": 13.5784, "step": 14847 }, { "epoch": 39.21558270056124, "grad_norm": 2342.73095703125, "learning_rate": 5.515387494319785e-05, "loss": 13.4922, "step": 14848 }, { "epoch": 39.218223836249585, "grad_norm": 1943.55078125, "learning_rate": 5.5127702769029815e-05, "loss": 35.6404, "step": 14849 }, { "epoch": 39.220864971937935, "grad_norm": 3488.836669921875, "learning_rate": 5.5101536036556213e-05, "loss": 34.35, "step": 14850 }, { "epoch": 39.22350610762628, "grad_norm": 1886.4677734375, "learning_rate": 5.507537474650773e-05, "loss": 35.2836, "step": 14851 }, { "epoch": 39.22614724331463, "grad_norm": 1334.00439453125, "learning_rate": 5.504921889961489e-05, "loss": 34.3814, "step": 14852 }, { "epoch": 39.22878837900297, "grad_norm": 1471.78662109375, "learning_rate": 5.502306849660812e-05, "loss": 33.4608, "step": 14853 }, { "epoch": 39.231429514691314, "grad_norm": 1094.5145263671875, "learning_rate": 5.499692353821753e-05, "loss": 34.2507, "step": 14854 }, { "epoch": 39.234070650379664, "grad_norm": 6509.63671875, "learning_rate": 5.497078402517339e-05, "loss": 34.0039, "step": 14855 }, { "epoch": 39.23671178606801, "grad_norm": 893.7327270507812, "learning_rate": 5.49446499582055e-05, "loss": 34.463, "step": 14856 }, { "epoch": 39.23935292175636, "grad_norm": 3883.1494140625, "learning_rate": 5.4918521338043704e-05, "loss": 35.005, "step": 14857 }, { "epoch": 39.2419940574447, "grad_norm": 876.3556518554688, "learning_rate": 5.489239816541755e-05, "loss": 34.3166, "step": 14858 }, { "epoch": 39.24463519313305, "grad_norm": 786.8693237304688, "learning_rate": 5.48662804410566e-05, "loss": 34.0544, "step": 14859 }, { "epoch": 39.24727632882139, "grad_norm": 1333.6502685546875, "learning_rate": 5.484016816569015e-05, "loss": 35.8268, "step": 14860 }, { "epoch": 39.24991746450974, "grad_norm": 839.7163696289062, "learning_rate": 5.4814061340047375e-05, "loss": 34.6319, "step": 14861 }, { "epoch": 39.252558600198086, "grad_norm": 10063.8955078125, "learning_rate": 5.4787959964857276e-05, "loss": 34.1086, "step": 14862 }, { "epoch": 39.25519973588643, "grad_norm": 4765.3818359375, "learning_rate": 5.4761864040848736e-05, "loss": 35.9393, "step": 14863 }, { "epoch": 39.25784087157478, "grad_norm": 2068.028076171875, "learning_rate": 5.473577356875042e-05, "loss": 34.6672, "step": 14864 }, { "epoch": 39.26048200726312, "grad_norm": 1705.2349853515625, "learning_rate": 5.470968854929087e-05, "loss": 35.1861, "step": 14865 }, { "epoch": 39.26312314295147, "grad_norm": 2476.02099609375, "learning_rate": 5.4683608983198604e-05, "loss": 38.9757, "step": 14866 }, { "epoch": 39.265764278639814, "grad_norm": 768.6463623046875, "learning_rate": 5.465753487120184e-05, "loss": 39.2219, "step": 14867 }, { "epoch": 39.268405414328164, "grad_norm": 829.1747436523438, "learning_rate": 5.463146621402867e-05, "loss": 39.6696, "step": 14868 }, { "epoch": 39.27104655001651, "grad_norm": 3569.09130859375, "learning_rate": 5.460540301240702e-05, "loss": 38.0579, "step": 14869 }, { "epoch": 39.27368768570485, "grad_norm": 1596.835693359375, "learning_rate": 5.457934526706471e-05, "loss": 39.4805, "step": 14870 }, { "epoch": 39.2763288213932, "grad_norm": 1813.5028076171875, "learning_rate": 5.455329297872941e-05, "loss": 39.6979, "step": 14871 }, { "epoch": 39.27896995708154, "grad_norm": 1854.52294921875, "learning_rate": 5.452724614812851e-05, "loss": 42.0326, "step": 14872 }, { "epoch": 39.28161109276989, "grad_norm": 960.72412109375, "learning_rate": 5.450120477598949e-05, "loss": 39.6732, "step": 14873 }, { "epoch": 39.284252228458236, "grad_norm": 647.6405639648438, "learning_rate": 5.4475168863039494e-05, "loss": 39.6077, "step": 14874 }, { "epoch": 39.286893364146586, "grad_norm": 721.5799560546875, "learning_rate": 5.444913841000548e-05, "loss": 41.1837, "step": 14875 }, { "epoch": 39.28953449983493, "grad_norm": 791.7599487304688, "learning_rate": 5.44231134176145e-05, "loss": 40.3843, "step": 14876 }, { "epoch": 39.29217563552327, "grad_norm": 1827.56982421875, "learning_rate": 5.439709388659314e-05, "loss": 38.9735, "step": 14877 }, { "epoch": 39.29481677121162, "grad_norm": 1600.9765625, "learning_rate": 5.437107981766812e-05, "loss": 37.9163, "step": 14878 }, { "epoch": 39.297457906899965, "grad_norm": 576.073974609375, "learning_rate": 5.434507121156565e-05, "loss": 36.7952, "step": 14879 }, { "epoch": 39.300099042588315, "grad_norm": 909.7897338867188, "learning_rate": 5.4319068069012184e-05, "loss": 36.8049, "step": 14880 }, { "epoch": 39.30274017827666, "grad_norm": 875.7329711914062, "learning_rate": 5.4293070390733806e-05, "loss": 37.012, "step": 14881 }, { "epoch": 39.30538131396501, "grad_norm": 4439.1943359375, "learning_rate": 5.4267078177456405e-05, "loss": 36.0758, "step": 14882 }, { "epoch": 39.30802244965335, "grad_norm": 746.5020141601562, "learning_rate": 5.4241091429905935e-05, "loss": 36.231, "step": 14883 }, { "epoch": 39.3106635853417, "grad_norm": 1611.2471923828125, "learning_rate": 5.421511014880798e-05, "loss": 35.5769, "step": 14884 }, { "epoch": 39.31330472103004, "grad_norm": 3049.175048828125, "learning_rate": 5.418913433488809e-05, "loss": 34.8002, "step": 14885 }, { "epoch": 39.315945856718386, "grad_norm": 820.6925659179688, "learning_rate": 5.4163163988871585e-05, "loss": 34.8774, "step": 14886 }, { "epoch": 39.318586992406736, "grad_norm": 1707.5850830078125, "learning_rate": 5.413719911148368e-05, "loss": 34.4807, "step": 14887 }, { "epoch": 39.32122812809508, "grad_norm": 1165.6092529296875, "learning_rate": 5.4111239703449415e-05, "loss": 34.725, "step": 14888 }, { "epoch": 39.32386926378343, "grad_norm": 7252.3330078125, "learning_rate": 5.408528576549368e-05, "loss": 30.8111, "step": 14889 }, { "epoch": 39.32651039947177, "grad_norm": 799.6859741210938, "learning_rate": 5.405933729834128e-05, "loss": 11.6022, "step": 14890 }, { "epoch": 39.32915153516012, "grad_norm": 1126.8712158203125, "learning_rate": 5.403339430271681e-05, "loss": 12.1129, "step": 14891 }, { "epoch": 39.331792670848465, "grad_norm": 2268.375732421875, "learning_rate": 5.400745677934468e-05, "loss": 10.1297, "step": 14892 }, { "epoch": 39.33443380653681, "grad_norm": 1741.3961181640625, "learning_rate": 5.3981524728949125e-05, "loss": 12.0428, "step": 14893 }, { "epoch": 39.33707494222516, "grad_norm": 6601.75439453125, "learning_rate": 5.3955598152254426e-05, "loss": 10.1179, "step": 14894 }, { "epoch": 39.3397160779135, "grad_norm": 1087.8160400390625, "learning_rate": 5.392967704998447e-05, "loss": 11.6733, "step": 14895 }, { "epoch": 39.34235721360185, "grad_norm": 3800.7197265625, "learning_rate": 5.39037614228631e-05, "loss": 9.5509, "step": 14896 }, { "epoch": 39.344998349290194, "grad_norm": 8098.3857421875, "learning_rate": 5.387785127161399e-05, "loss": 13.7994, "step": 14897 }, { "epoch": 39.347639484978544, "grad_norm": 1079.427490234375, "learning_rate": 5.385194659696069e-05, "loss": 19.1352, "step": 14898 }, { "epoch": 39.35028062066689, "grad_norm": 1512.7574462890625, "learning_rate": 5.382604739962657e-05, "loss": 35.4531, "step": 14899 }, { "epoch": 39.35292175635523, "grad_norm": 1037.6475830078125, "learning_rate": 5.3800153680334754e-05, "loss": 35.5363, "step": 14900 }, { "epoch": 39.35556289204358, "grad_norm": 2011.1737060546875, "learning_rate": 5.3774265439808456e-05, "loss": 34.0381, "step": 14901 }, { "epoch": 39.35820402773192, "grad_norm": 7430.0166015625, "learning_rate": 5.374838267877052e-05, "loss": 35.1424, "step": 14902 }, { "epoch": 39.36084516342027, "grad_norm": 906.2781372070312, "learning_rate": 5.3722505397943736e-05, "loss": 34.0217, "step": 14903 }, { "epoch": 39.363486299108615, "grad_norm": 936.6663208007812, "learning_rate": 5.369663359805069e-05, "loss": 34.9511, "step": 14904 }, { "epoch": 39.366127434796965, "grad_norm": 1102.4700927734375, "learning_rate": 5.367076727981382e-05, "loss": 32.309, "step": 14905 }, { "epoch": 39.36876857048531, "grad_norm": 4594.15576171875, "learning_rate": 5.364490644395545e-05, "loss": 34.3452, "step": 14906 }, { "epoch": 39.37140970617366, "grad_norm": 1484.42626953125, "learning_rate": 5.361905109119766e-05, "loss": 34.584, "step": 14907 }, { "epoch": 39.374050841862, "grad_norm": 3994.567626953125, "learning_rate": 5.359320122226255e-05, "loss": 34.9456, "step": 14908 }, { "epoch": 39.376691977550344, "grad_norm": 1153.6827392578125, "learning_rate": 5.356735683787195e-05, "loss": 35.4096, "step": 14909 }, { "epoch": 39.379333113238694, "grad_norm": 1653.7418212890625, "learning_rate": 5.354151793874743e-05, "loss": 36.0722, "step": 14910 }, { "epoch": 39.38197424892704, "grad_norm": 1694.957275390625, "learning_rate": 5.3515684525610706e-05, "loss": 34.7456, "step": 14911 }, { "epoch": 39.38461538461539, "grad_norm": 1680.138671875, "learning_rate": 5.348985659918307e-05, "loss": 33.8917, "step": 14912 }, { "epoch": 39.38725652030373, "grad_norm": 2205.492919921875, "learning_rate": 5.3464034160185804e-05, "loss": 34.8113, "step": 14913 }, { "epoch": 39.38989765599208, "grad_norm": 2119.48876953125, "learning_rate": 5.343821720933978e-05, "loss": 35.0328, "step": 14914 }, { "epoch": 39.39253879168042, "grad_norm": 1058.174560546875, "learning_rate": 5.341240574736617e-05, "loss": 35.0109, "step": 14915 }, { "epoch": 39.395179927368766, "grad_norm": 2816.906005859375, "learning_rate": 5.338659977498564e-05, "loss": 39.4651, "step": 14916 }, { "epoch": 39.397821063057116, "grad_norm": 4040.817138671875, "learning_rate": 5.336079929291876e-05, "loss": 40.2703, "step": 14917 }, { "epoch": 39.40046219874546, "grad_norm": 738.5347290039062, "learning_rate": 5.3335004301886094e-05, "loss": 37.8536, "step": 14918 }, { "epoch": 39.40310333443381, "grad_norm": 1693.913818359375, "learning_rate": 5.330921480260792e-05, "loss": 40.1621, "step": 14919 }, { "epoch": 39.40574447012215, "grad_norm": 892.4551391601562, "learning_rate": 5.328343079580436e-05, "loss": 38.8517, "step": 14920 }, { "epoch": 39.4083856058105, "grad_norm": 1031.525390625, "learning_rate": 5.325765228219545e-05, "loss": 42.5094, "step": 14921 }, { "epoch": 39.411026741498844, "grad_norm": 2572.09716796875, "learning_rate": 5.323187926250103e-05, "loss": 42.3705, "step": 14922 }, { "epoch": 39.41366787718719, "grad_norm": 7120.2548828125, "learning_rate": 5.3206111737440804e-05, "loss": 45.0093, "step": 14923 }, { "epoch": 39.41630901287554, "grad_norm": 1848.678955078125, "learning_rate": 5.318034970773422e-05, "loss": 43.2692, "step": 14924 }, { "epoch": 39.41895014856388, "grad_norm": 1097.79638671875, "learning_rate": 5.315459317410085e-05, "loss": 40.3722, "step": 14925 }, { "epoch": 39.42159128425223, "grad_norm": 1019.7534790039062, "learning_rate": 5.31288421372598e-05, "loss": 38.8565, "step": 14926 }, { "epoch": 39.42423241994057, "grad_norm": 736.2131958007812, "learning_rate": 5.310309659793011e-05, "loss": 39.3242, "step": 14927 }, { "epoch": 39.42687355562892, "grad_norm": 959.1550903320312, "learning_rate": 5.307735655683088e-05, "loss": 38.4366, "step": 14928 }, { "epoch": 39.429514691317266, "grad_norm": 3265.261474609375, "learning_rate": 5.305162201468078e-05, "loss": 37.4092, "step": 14929 }, { "epoch": 39.432155827005616, "grad_norm": 1225.991943359375, "learning_rate": 5.3025892972198425e-05, "loss": 35.0293, "step": 14930 }, { "epoch": 39.43479696269396, "grad_norm": 1951.90771484375, "learning_rate": 5.300016943010233e-05, "loss": 36.1424, "step": 14931 }, { "epoch": 39.4374380983823, "grad_norm": 1760.5513916015625, "learning_rate": 5.297445138911075e-05, "loss": 35.6469, "step": 14932 }, { "epoch": 39.44007923407065, "grad_norm": 2395.337890625, "learning_rate": 5.294873884994189e-05, "loss": 34.7584, "step": 14933 }, { "epoch": 39.442720369758995, "grad_norm": 2400.5400390625, "learning_rate": 5.2923031813313676e-05, "loss": 36.9424, "step": 14934 }, { "epoch": 39.445361505447345, "grad_norm": 1991.182861328125, "learning_rate": 5.289733027994409e-05, "loss": 33.876, "step": 14935 }, { "epoch": 39.44800264113569, "grad_norm": 818.633056640625, "learning_rate": 5.287163425055078e-05, "loss": 35.7265, "step": 14936 }, { "epoch": 39.45064377682404, "grad_norm": 842.444580078125, "learning_rate": 5.284594372585128e-05, "loss": 35.1288, "step": 14937 }, { "epoch": 39.45328491251238, "grad_norm": 3617.649658203125, "learning_rate": 5.282025870656299e-05, "loss": 34.3649, "step": 14938 }, { "epoch": 39.45592604820072, "grad_norm": 2175.8876953125, "learning_rate": 5.279457919340316e-05, "loss": 10.2742, "step": 14939 }, { "epoch": 39.45856718388907, "grad_norm": 1316.5335693359375, "learning_rate": 5.276890518708885e-05, "loss": 10.7487, "step": 14940 }, { "epoch": 39.461208319577416, "grad_norm": 3359.71875, "learning_rate": 5.274323668833692e-05, "loss": 12.4476, "step": 14941 }, { "epoch": 39.463849455265766, "grad_norm": 4370.7255859375, "learning_rate": 5.271757369786431e-05, "loss": 10.9304, "step": 14942 }, { "epoch": 39.46649059095411, "grad_norm": 2819.177001953125, "learning_rate": 5.2691916216387556e-05, "loss": 11.2899, "step": 14943 }, { "epoch": 39.46913172664246, "grad_norm": 7706.478515625, "learning_rate": 5.2666264244623145e-05, "loss": 12.6727, "step": 14944 }, { "epoch": 39.4717728623308, "grad_norm": 1261.796875, "learning_rate": 5.2640617783287327e-05, "loss": 13.5932, "step": 14945 }, { "epoch": 39.474413998019145, "grad_norm": 3745.76904296875, "learning_rate": 5.261497683309638e-05, "loss": 8.6245, "step": 14946 }, { "epoch": 39.477055133707495, "grad_norm": 3014.724609375, "learning_rate": 5.258934139476626e-05, "loss": 12.1424, "step": 14947 }, { "epoch": 39.47969626939584, "grad_norm": 1173.553955078125, "learning_rate": 5.2563711469012785e-05, "loss": 15.0443, "step": 14948 }, { "epoch": 39.48233740508419, "grad_norm": 1467.24072265625, "learning_rate": 5.253808705655172e-05, "loss": 36.1072, "step": 14949 }, { "epoch": 39.48497854077253, "grad_norm": 1424.8111572265625, "learning_rate": 5.251246815809857e-05, "loss": 35.4758, "step": 14950 }, { "epoch": 39.48761967646088, "grad_norm": 2057.101806640625, "learning_rate": 5.2486854774368737e-05, "loss": 33.9439, "step": 14951 }, { "epoch": 39.490260812149224, "grad_norm": 3729.604736328125, "learning_rate": 5.2461246906077396e-05, "loss": 35.97, "step": 14952 }, { "epoch": 39.492901947837574, "grad_norm": 2006.665771484375, "learning_rate": 5.243564455393976e-05, "loss": 33.5141, "step": 14953 }, { "epoch": 39.49554308352592, "grad_norm": 957.853515625, "learning_rate": 5.24100477186707e-05, "loss": 34.861, "step": 14954 }, { "epoch": 39.49818421921426, "grad_norm": 2686.620361328125, "learning_rate": 5.238445640098496e-05, "loss": 34.1872, "step": 14955 }, { "epoch": 39.50082535490261, "grad_norm": 1723.1356201171875, "learning_rate": 5.235887060159722e-05, "loss": 35.2567, "step": 14956 }, { "epoch": 39.50346649059095, "grad_norm": 1977.265625, "learning_rate": 5.233329032122191e-05, "loss": 33.8945, "step": 14957 }, { "epoch": 39.5061076262793, "grad_norm": 1282.3167724609375, "learning_rate": 5.2307715560573345e-05, "loss": 35.0677, "step": 14958 }, { "epoch": 39.508748761967645, "grad_norm": 1434.2437744140625, "learning_rate": 5.2282146320365626e-05, "loss": 34.3961, "step": 14959 }, { "epoch": 39.511389897655995, "grad_norm": 3430.320556640625, "learning_rate": 5.2256582601312885e-05, "loss": 34.19, "step": 14960 }, { "epoch": 39.51403103334434, "grad_norm": 1075.4364013671875, "learning_rate": 5.223102440412894e-05, "loss": 34.0887, "step": 14961 }, { "epoch": 39.51667216903268, "grad_norm": 1203.41357421875, "learning_rate": 5.220547172952736e-05, "loss": 35.7811, "step": 14962 }, { "epoch": 39.51931330472103, "grad_norm": 2124.69384765625, "learning_rate": 5.2179924578221875e-05, "loss": 34.863, "step": 14963 }, { "epoch": 39.521954440409374, "grad_norm": 4276.34814453125, "learning_rate": 5.215438295092581e-05, "loss": 33.9039, "step": 14964 }, { "epoch": 39.524595576097724, "grad_norm": 7333.20458984375, "learning_rate": 5.212884684835237e-05, "loss": 36.4363, "step": 14965 }, { "epoch": 39.52723671178607, "grad_norm": 1619.048583984375, "learning_rate": 5.210331627121462e-05, "loss": 38.6323, "step": 14966 }, { "epoch": 39.52987784747442, "grad_norm": 1827.988525390625, "learning_rate": 5.2077791220225544e-05, "loss": 41.0739, "step": 14967 }, { "epoch": 39.53251898316276, "grad_norm": 2215.961669921875, "learning_rate": 5.205227169609786e-05, "loss": 37.5615, "step": 14968 }, { "epoch": 39.5351601188511, "grad_norm": 872.847412109375, "learning_rate": 5.202675769954415e-05, "loss": 39.2799, "step": 14969 }, { "epoch": 39.53780125453945, "grad_norm": 1677.9622802734375, "learning_rate": 5.2001249231277e-05, "loss": 38.6343, "step": 14970 }, { "epoch": 39.540442390227795, "grad_norm": 4599.0849609375, "learning_rate": 5.1975746292008656e-05, "loss": 42.5882, "step": 14971 }, { "epoch": 39.543083525916146, "grad_norm": 1994.1314697265625, "learning_rate": 5.1950248882451285e-05, "loss": 43.8334, "step": 14972 }, { "epoch": 39.54572466160449, "grad_norm": 3051.13671875, "learning_rate": 5.192475700331686e-05, "loss": 42.1315, "step": 14973 }, { "epoch": 39.54836579729284, "grad_norm": 894.8685913085938, "learning_rate": 5.189927065531724e-05, "loss": 41.3786, "step": 14974 }, { "epoch": 39.55100693298118, "grad_norm": 1566.894775390625, "learning_rate": 5.1873789839164134e-05, "loss": 38.8138, "step": 14975 }, { "epoch": 39.55364806866953, "grad_norm": 1278.5838623046875, "learning_rate": 5.184831455556896e-05, "loss": 39.91, "step": 14976 }, { "epoch": 39.556289204357874, "grad_norm": 1070.7742919921875, "learning_rate": 5.18228448052433e-05, "loss": 40.1296, "step": 14977 }, { "epoch": 39.55893034004622, "grad_norm": 1025.1583251953125, "learning_rate": 5.1797380588898266e-05, "loss": 37.7421, "step": 14978 }, { "epoch": 39.56157147573457, "grad_norm": 1388.3277587890625, "learning_rate": 5.1771921907244944e-05, "loss": 40.0528, "step": 14979 }, { "epoch": 39.56421261142291, "grad_norm": 856.2947998046875, "learning_rate": 5.17464687609942e-05, "loss": 36.5632, "step": 14980 }, { "epoch": 39.56685374711126, "grad_norm": 618.526611328125, "learning_rate": 5.172102115085692e-05, "loss": 36.7178, "step": 14981 }, { "epoch": 39.5694948827996, "grad_norm": 970.9625854492188, "learning_rate": 5.169557907754366e-05, "loss": 36.1643, "step": 14982 }, { "epoch": 39.57213601848795, "grad_norm": 1320.431396484375, "learning_rate": 5.167014254176483e-05, "loss": 37.1219, "step": 14983 }, { "epoch": 39.574777154176296, "grad_norm": 2238.2861328125, "learning_rate": 5.164471154423081e-05, "loss": 35.6985, "step": 14984 }, { "epoch": 39.57741828986464, "grad_norm": 2338.01171875, "learning_rate": 5.161928608565167e-05, "loss": 36.3594, "step": 14985 }, { "epoch": 39.58005942555299, "grad_norm": 2095.536376953125, "learning_rate": 5.159386616673745e-05, "loss": 35.2948, "step": 14986 }, { "epoch": 39.58270056124133, "grad_norm": 2467.102294921875, "learning_rate": 5.156845178819788e-05, "loss": 35.0656, "step": 14987 }, { "epoch": 39.58534169692968, "grad_norm": 1270.5460205078125, "learning_rate": 5.15430429507428e-05, "loss": 36.0967, "step": 14988 }, { "epoch": 39.587982832618025, "grad_norm": 4116.77734375, "learning_rate": 5.151763965508166e-05, "loss": 41.4123, "step": 14989 }, { "epoch": 39.590623968306375, "grad_norm": 2274.48828125, "learning_rate": 5.149224190192387e-05, "loss": 14.4279, "step": 14990 }, { "epoch": 39.59326510399472, "grad_norm": 163597.21875, "learning_rate": 5.1466849691978566e-05, "loss": 13.0023, "step": 14991 }, { "epoch": 39.59590623968306, "grad_norm": 6714.16455078125, "learning_rate": 5.1441463025954896e-05, "loss": 7.6266, "step": 14992 }, { "epoch": 39.59854737537141, "grad_norm": 2555.804931640625, "learning_rate": 5.1416081904561736e-05, "loss": 14.8806, "step": 14993 }, { "epoch": 39.60118851105975, "grad_norm": 26536.806640625, "learning_rate": 5.139070632850773e-05, "loss": 9.6947, "step": 14994 }, { "epoch": 39.6038296467481, "grad_norm": 1488.102783203125, "learning_rate": 5.136533629850168e-05, "loss": 10.4168, "step": 14995 }, { "epoch": 39.606470782436446, "grad_norm": 46177.07421875, "learning_rate": 5.133997181525193e-05, "loss": 15.6868, "step": 14996 }, { "epoch": 39.609111918124796, "grad_norm": 1838.44775390625, "learning_rate": 5.131461287946668e-05, "loss": 12.7819, "step": 14997 }, { "epoch": 39.61175305381314, "grad_norm": 688.2957153320312, "learning_rate": 5.128925949185423e-05, "loss": 9.2986, "step": 14998 }, { "epoch": 39.61439418950149, "grad_norm": 991.876708984375, "learning_rate": 5.12639116531225e-05, "loss": 30.1291, "step": 14999 }, { "epoch": 39.61703532518983, "grad_norm": 1555.800537109375, "learning_rate": 5.123856936397925e-05, "loss": 34.4357, "step": 15000 }, { "epoch": 39.61703532518983, "eval_loss": 3.797048330307007, "eval_runtime": 2.1806, "eval_samples_per_second": 227.004, "eval_steps_per_second": 28.433, "step": 15000 }, { "epoch": 39.619676460878175, "grad_norm": 2596.972412109375, "learning_rate": 5.121323262513222e-05, "loss": 34.5106, "step": 15001 }, { "epoch": 39.622317596566525, "grad_norm": 1605.546142578125, "learning_rate": 5.118790143728888e-05, "loss": 35.1987, "step": 15002 }, { "epoch": 39.62495873225487, "grad_norm": 4256.685546875, "learning_rate": 5.116257580115663e-05, "loss": 33.8509, "step": 15003 }, { "epoch": 39.62759986794322, "grad_norm": 1738.750244140625, "learning_rate": 5.113725571744257e-05, "loss": 34.6462, "step": 15004 }, { "epoch": 39.63024100363156, "grad_norm": 1802.726806640625, "learning_rate": 5.111194118685386e-05, "loss": 36.7273, "step": 15005 }, { "epoch": 39.63288213931991, "grad_norm": 602.3189086914062, "learning_rate": 5.10866322100974e-05, "loss": 34.7584, "step": 15006 }, { "epoch": 39.635523275008254, "grad_norm": 2417.079345703125, "learning_rate": 5.106132878787986e-05, "loss": 33.812, "step": 15007 }, { "epoch": 39.6381644106966, "grad_norm": 1717.8018798828125, "learning_rate": 5.103603092090789e-05, "loss": 35.6113, "step": 15008 }, { "epoch": 39.64080554638495, "grad_norm": 1357.32373046875, "learning_rate": 5.101073860988784e-05, "loss": 33.9873, "step": 15009 }, { "epoch": 39.64344668207329, "grad_norm": 1240.58984375, "learning_rate": 5.0985451855526026e-05, "loss": 33.9317, "step": 15010 }, { "epoch": 39.64608781776164, "grad_norm": 973.8977661132812, "learning_rate": 5.096017065852851e-05, "loss": 35.1909, "step": 15011 }, { "epoch": 39.64872895344998, "grad_norm": 811.7449340820312, "learning_rate": 5.093489501960136e-05, "loss": 33.3343, "step": 15012 }, { "epoch": 39.65137008913833, "grad_norm": 1104.0208740234375, "learning_rate": 5.090962493945034e-05, "loss": 34.1448, "step": 15013 }, { "epoch": 39.654011224826675, "grad_norm": 1011.8849487304688, "learning_rate": 5.0884360418781086e-05, "loss": 36.1458, "step": 15014 }, { "epoch": 39.65665236051502, "grad_norm": 15042.0751953125, "learning_rate": 5.0859101458299026e-05, "loss": 38.2684, "step": 15015 }, { "epoch": 39.65929349620337, "grad_norm": 2765.16064453125, "learning_rate": 5.083384805870966e-05, "loss": 39.8069, "step": 15016 }, { "epoch": 39.66193463189171, "grad_norm": 10551.5966796875, "learning_rate": 5.080860022071809e-05, "loss": 39.9661, "step": 15017 }, { "epoch": 39.66457576758006, "grad_norm": 1189.468994140625, "learning_rate": 5.0783357945029335e-05, "loss": 40.4562, "step": 15018 }, { "epoch": 39.667216903268404, "grad_norm": 1490.860107421875, "learning_rate": 5.0758121232348295e-05, "loss": 38.4113, "step": 15019 }, { "epoch": 39.669858038956754, "grad_norm": 798.2118530273438, "learning_rate": 5.073289008337967e-05, "loss": 40.1162, "step": 15020 }, { "epoch": 39.6724991746451, "grad_norm": 782.8651123046875, "learning_rate": 5.070766449882802e-05, "loss": 40.9087, "step": 15021 }, { "epoch": 39.67514031033345, "grad_norm": 736.1976928710938, "learning_rate": 5.068244447939771e-05, "loss": 40.2597, "step": 15022 }, { "epoch": 39.67778144602179, "grad_norm": 2093.069091796875, "learning_rate": 5.0657230025793145e-05, "loss": 42.2783, "step": 15023 }, { "epoch": 39.68042258171013, "grad_norm": 857.519287109375, "learning_rate": 5.063202113871832e-05, "loss": 41.734, "step": 15024 }, { "epoch": 39.68306371739848, "grad_norm": 1029.739501953125, "learning_rate": 5.0606817818877157e-05, "loss": 40.4209, "step": 15025 }, { "epoch": 39.685704853086825, "grad_norm": 1068.6839599609375, "learning_rate": 5.0581620066973504e-05, "loss": 37.7675, "step": 15026 }, { "epoch": 39.688345988775175, "grad_norm": 1407.4190673828125, "learning_rate": 5.055642788371098e-05, "loss": 37.5133, "step": 15027 }, { "epoch": 39.69098712446352, "grad_norm": 1365.66162109375, "learning_rate": 5.0531241269793025e-05, "loss": 37.0852, "step": 15028 }, { "epoch": 39.69362826015187, "grad_norm": 985.9453125, "learning_rate": 5.050606022592291e-05, "loss": 36.7771, "step": 15029 }, { "epoch": 39.69626939584021, "grad_norm": 1337.52392578125, "learning_rate": 5.048088475280396e-05, "loss": 36.5378, "step": 15030 }, { "epoch": 39.698910531528554, "grad_norm": 1605.0032958984375, "learning_rate": 5.045571485113909e-05, "loss": 35.5221, "step": 15031 }, { "epoch": 39.701551667216904, "grad_norm": 1700.7572021484375, "learning_rate": 5.043055052163109e-05, "loss": 35.6777, "step": 15032 }, { "epoch": 39.70419280290525, "grad_norm": 1368.6619873046875, "learning_rate": 5.040539176498282e-05, "loss": 33.7469, "step": 15033 }, { "epoch": 39.7068339385936, "grad_norm": 697.2631225585938, "learning_rate": 5.038023858189672e-05, "loss": 35.4302, "step": 15034 }, { "epoch": 39.70947507428194, "grad_norm": 696.0581665039062, "learning_rate": 5.03550909730752e-05, "loss": 33.5763, "step": 15035 }, { "epoch": 39.71211620997029, "grad_norm": 1755.4959716796875, "learning_rate": 5.0329948939220495e-05, "loss": 34.3077, "step": 15036 }, { "epoch": 39.71475734565863, "grad_norm": 862.860595703125, "learning_rate": 5.030481248103466e-05, "loss": 34.6613, "step": 15037 }, { "epoch": 39.717398481346976, "grad_norm": 1017.0274658203125, "learning_rate": 5.027968159921964e-05, "loss": 36.4114, "step": 15038 }, { "epoch": 39.720039617035326, "grad_norm": 4110.7548828125, "learning_rate": 5.0254556294477114e-05, "loss": 36.8251, "step": 15039 }, { "epoch": 39.72268075272367, "grad_norm": 1498.5404052734375, "learning_rate": 5.022943656750884e-05, "loss": 20.5976, "step": 15040 }, { "epoch": 39.72532188841202, "grad_norm": 828.6658935546875, "learning_rate": 5.0204322419016204e-05, "loss": 8.6284, "step": 15041 }, { "epoch": 39.72796302410036, "grad_norm": 1834.6746826171875, "learning_rate": 5.0179213849700475e-05, "loss": 11.0363, "step": 15042 }, { "epoch": 39.73060415978871, "grad_norm": 2116.657958984375, "learning_rate": 5.015411086026284e-05, "loss": 13.1844, "step": 15043 }, { "epoch": 39.733245295477055, "grad_norm": 644.1640625, "learning_rate": 5.012901345140425e-05, "loss": 9.7093, "step": 15044 }, { "epoch": 39.735886431165405, "grad_norm": 4175.9814453125, "learning_rate": 5.010392162382557e-05, "loss": 10.6605, "step": 15045 }, { "epoch": 39.73852756685375, "grad_norm": 1171.991943359375, "learning_rate": 5.007883537822736e-05, "loss": 10.5481, "step": 15046 }, { "epoch": 39.74116870254209, "grad_norm": 439.8592529296875, "learning_rate": 5.005375471531032e-05, "loss": 10.5786, "step": 15047 }, { "epoch": 39.74380983823044, "grad_norm": 23669.5078125, "learning_rate": 5.0028679635774704e-05, "loss": 12.2185, "step": 15048 }, { "epoch": 39.74645097391878, "grad_norm": 727.4189453125, "learning_rate": 5.00036101403207e-05, "loss": 24.5009, "step": 15049 }, { "epoch": 39.74909210960713, "grad_norm": 758.1328125, "learning_rate": 4.9978546229648455e-05, "loss": 35.2938, "step": 15050 }, { "epoch": 39.751733245295476, "grad_norm": 1035.6439208984375, "learning_rate": 4.9953487904457776e-05, "loss": 34.0927, "step": 15051 }, { "epoch": 39.754374380983826, "grad_norm": 638.8212890625, "learning_rate": 4.992843516544848e-05, "loss": 34.725, "step": 15052 }, { "epoch": 39.75701551667217, "grad_norm": 953.097412109375, "learning_rate": 4.990338801332009e-05, "loss": 35.0404, "step": 15053 }, { "epoch": 39.75965665236051, "grad_norm": 840.08349609375, "learning_rate": 4.987834644877204e-05, "loss": 35.6936, "step": 15054 }, { "epoch": 39.76229778804886, "grad_norm": 1243.2109375, "learning_rate": 4.985331047250361e-05, "loss": 34.4164, "step": 15055 }, { "epoch": 39.764938923737205, "grad_norm": 1021.890869140625, "learning_rate": 4.982828008521384e-05, "loss": 34.4784, "step": 15056 }, { "epoch": 39.767580059425555, "grad_norm": 2241.55419921875, "learning_rate": 4.9803255287601826e-05, "loss": 34.2048, "step": 15057 }, { "epoch": 39.7702211951139, "grad_norm": 1765.350341796875, "learning_rate": 4.97782360803663e-05, "loss": 34.9949, "step": 15058 }, { "epoch": 39.77286233080225, "grad_norm": 1509.7271728515625, "learning_rate": 4.975322246420594e-05, "loss": 35.2998, "step": 15059 }, { "epoch": 39.77550346649059, "grad_norm": 2310.382568359375, "learning_rate": 4.97282144398192e-05, "loss": 35.52, "step": 15060 }, { "epoch": 39.778144602178934, "grad_norm": 900.8861694335938, "learning_rate": 4.9703212007904425e-05, "loss": 35.4728, "step": 15061 }, { "epoch": 39.780785737867284, "grad_norm": 1783.17919921875, "learning_rate": 4.967821516915977e-05, "loss": 34.9262, "step": 15062 }, { "epoch": 39.78342687355563, "grad_norm": 827.2960205078125, "learning_rate": 4.965322392428326e-05, "loss": 34.3449, "step": 15063 }, { "epoch": 39.78606800924398, "grad_norm": 3062.089599609375, "learning_rate": 4.962823827397281e-05, "loss": 36.0214, "step": 15064 }, { "epoch": 39.78870914493232, "grad_norm": 7239.87109375, "learning_rate": 4.960325821892611e-05, "loss": 35.8102, "step": 15065 }, { "epoch": 39.79135028062067, "grad_norm": 895.0198974609375, "learning_rate": 4.957828375984072e-05, "loss": 37.4269, "step": 15066 }, { "epoch": 39.79399141630901, "grad_norm": 1350.34716796875, "learning_rate": 4.955331489741396e-05, "loss": 38.6148, "step": 15067 }, { "epoch": 39.79663255199736, "grad_norm": 1140.7569580078125, "learning_rate": 4.9528351632343174e-05, "loss": 40.028, "step": 15068 }, { "epoch": 39.799273687685705, "grad_norm": 1016.20166015625, "learning_rate": 4.950339396532541e-05, "loss": 38.0444, "step": 15069 }, { "epoch": 39.80191482337405, "grad_norm": 941.1532592773438, "learning_rate": 4.9478441897057604e-05, "loss": 41.2844, "step": 15070 }, { "epoch": 39.8045559590624, "grad_norm": 2664.71923828125, "learning_rate": 4.945349542823652e-05, "loss": 39.9822, "step": 15071 }, { "epoch": 39.80719709475074, "grad_norm": 3489.58544921875, "learning_rate": 4.9428554559558744e-05, "loss": 41.1665, "step": 15072 }, { "epoch": 39.80983823043909, "grad_norm": 1294.0926513671875, "learning_rate": 4.94036192917208e-05, "loss": 41.9447, "step": 15073 }, { "epoch": 39.812479366127434, "grad_norm": 1093.18701171875, "learning_rate": 4.937868962541886e-05, "loss": 38.902, "step": 15074 }, { "epoch": 39.815120501815784, "grad_norm": 992.2801513671875, "learning_rate": 4.935376556134921e-05, "loss": 40.1552, "step": 15075 }, { "epoch": 39.81776163750413, "grad_norm": 1338.113037109375, "learning_rate": 4.9328847100207795e-05, "loss": 39.6259, "step": 15076 }, { "epoch": 39.82040277319247, "grad_norm": 983.3595581054688, "learning_rate": 4.930393424269042e-05, "loss": 38.4802, "step": 15077 }, { "epoch": 39.82304390888082, "grad_norm": 1095.2626953125, "learning_rate": 4.927902698949288e-05, "loss": 37.4731, "step": 15078 }, { "epoch": 39.82568504456916, "grad_norm": 1060.1151123046875, "learning_rate": 4.9254125341310545e-05, "loss": 38.7699, "step": 15079 }, { "epoch": 39.82832618025751, "grad_norm": 1414.46630859375, "learning_rate": 4.922922929883883e-05, "loss": 37.7641, "step": 15080 }, { "epoch": 39.830967315945855, "grad_norm": 1215.0404052734375, "learning_rate": 4.9204338862772896e-05, "loss": 36.3615, "step": 15081 }, { "epoch": 39.833608451634205, "grad_norm": 1152.3599853515625, "learning_rate": 4.917945403380788e-05, "loss": 35.0113, "step": 15082 }, { "epoch": 39.83624958732255, "grad_norm": 1605.6529541015625, "learning_rate": 4.915457481263866e-05, "loss": 35.3571, "step": 15083 }, { "epoch": 39.83889072301089, "grad_norm": 1341.41259765625, "learning_rate": 4.912970119995991e-05, "loss": 36.2196, "step": 15084 }, { "epoch": 39.84153185869924, "grad_norm": 3012.083740234375, "learning_rate": 4.9104833196466305e-05, "loss": 34.5283, "step": 15085 }, { "epoch": 39.844172994387584, "grad_norm": 1080.9078369140625, "learning_rate": 4.9079970802852206e-05, "loss": 34.4885, "step": 15086 }, { "epoch": 39.846814130075934, "grad_norm": 1185.9071044921875, "learning_rate": 4.905511401981191e-05, "loss": 35.1518, "step": 15087 }, { "epoch": 39.84945526576428, "grad_norm": 6132.69140625, "learning_rate": 4.903026284803952e-05, "loss": 43.3942, "step": 15088 }, { "epoch": 39.85209640145263, "grad_norm": 6816.04541015625, "learning_rate": 4.900541728822899e-05, "loss": 32.0983, "step": 15089 }, { "epoch": 39.85473753714097, "grad_norm": 1577.093505859375, "learning_rate": 4.8980577341074086e-05, "loss": 9.3054, "step": 15090 }, { "epoch": 39.85737867282932, "grad_norm": 904.5186767578125, "learning_rate": 4.895574300726843e-05, "loss": 9.891, "step": 15091 }, { "epoch": 39.86001980851766, "grad_norm": 4614.90625, "learning_rate": 4.8930914287505596e-05, "loss": 11.4279, "step": 15092 }, { "epoch": 39.862660944206006, "grad_norm": 17985.498046875, "learning_rate": 4.890609118247888e-05, "loss": 11.5325, "step": 15093 }, { "epoch": 39.865302079894356, "grad_norm": 1407.2288818359375, "learning_rate": 4.888127369288145e-05, "loss": 9.219, "step": 15094 }, { "epoch": 39.8679432155827, "grad_norm": 1277.1038818359375, "learning_rate": 4.8856461819406286e-05, "loss": 10.0185, "step": 15095 }, { "epoch": 39.87058435127105, "grad_norm": 4217.1328125, "learning_rate": 4.8831655562746294e-05, "loss": 9.6033, "step": 15096 }, { "epoch": 39.87322548695939, "grad_norm": 1323.032958984375, "learning_rate": 4.880685492359413e-05, "loss": 17.1335, "step": 15097 }, { "epoch": 39.87586662264774, "grad_norm": 1880.052001953125, "learning_rate": 4.878205990264228e-05, "loss": 26.6164, "step": 15098 }, { "epoch": 39.878507758336085, "grad_norm": 784.3045654296875, "learning_rate": 4.875727050058329e-05, "loss": 34.9431, "step": 15099 }, { "epoch": 39.88114889402443, "grad_norm": 1098.61376953125, "learning_rate": 4.8732486718109284e-05, "loss": 34.2234, "step": 15100 }, { "epoch": 39.88379002971278, "grad_norm": 927.699951171875, "learning_rate": 4.870770855591239e-05, "loss": 34.7272, "step": 15101 }, { "epoch": 39.88643116540112, "grad_norm": 1602.165283203125, "learning_rate": 4.868293601468437e-05, "loss": 34.47, "step": 15102 }, { "epoch": 39.88907230108947, "grad_norm": 977.16796875, "learning_rate": 4.865816909511722e-05, "loss": 34.8898, "step": 15103 }, { "epoch": 39.89171343677781, "grad_norm": 2706.106689453125, "learning_rate": 4.86334077979024e-05, "loss": 34.8819, "step": 15104 }, { "epoch": 39.89435457246616, "grad_norm": 1483.5533447265625, "learning_rate": 4.860865212373139e-05, "loss": 35.401, "step": 15105 }, { "epoch": 39.896995708154506, "grad_norm": 1546.60888671875, "learning_rate": 4.858390207329544e-05, "loss": 34.7937, "step": 15106 }, { "epoch": 39.89963684384285, "grad_norm": 1892.4864501953125, "learning_rate": 4.8559157647285715e-05, "loss": 34.6976, "step": 15107 }, { "epoch": 39.9022779795312, "grad_norm": 1195.3326416015625, "learning_rate": 4.8534418846393204e-05, "loss": 34.1318, "step": 15108 }, { "epoch": 39.90491911521954, "grad_norm": 952.8165283203125, "learning_rate": 4.8509685671308617e-05, "loss": 35.8383, "step": 15109 }, { "epoch": 39.90756025090789, "grad_norm": 1247.8984375, "learning_rate": 4.8484958122722774e-05, "loss": 33.8954, "step": 15110 }, { "epoch": 39.910201386596235, "grad_norm": 2009.2296142578125, "learning_rate": 4.846023620132608e-05, "loss": 34.135, "step": 15111 }, { "epoch": 39.912842522284585, "grad_norm": 1289.596923828125, "learning_rate": 4.843551990780884e-05, "loss": 33.635, "step": 15112 }, { "epoch": 39.91548365797293, "grad_norm": 1792.575439453125, "learning_rate": 4.841080924286143e-05, "loss": 35.4635, "step": 15113 }, { "epoch": 39.91812479366128, "grad_norm": 5182.47021484375, "learning_rate": 4.838610420717368e-05, "loss": 34.3716, "step": 15114 }, { "epoch": 39.92076592934962, "grad_norm": 2573.75244140625, "learning_rate": 4.8361404801435556e-05, "loss": 35.7328, "step": 15115 }, { "epoch": 39.92340706503796, "grad_norm": 2761.416748046875, "learning_rate": 4.833671102633669e-05, "loss": 39.516, "step": 15116 }, { "epoch": 39.926048200726314, "grad_norm": 1355.735595703125, "learning_rate": 4.831202288256675e-05, "loss": 39.3334, "step": 15117 }, { "epoch": 39.92868933641466, "grad_norm": 1377.1907958984375, "learning_rate": 4.8287340370815084e-05, "loss": 37.8457, "step": 15118 }, { "epoch": 39.93133047210301, "grad_norm": 818.125732421875, "learning_rate": 4.826266349177092e-05, "loss": 41.6026, "step": 15119 }, { "epoch": 39.93397160779135, "grad_norm": 695.0638427734375, "learning_rate": 4.8237992246123406e-05, "loss": 42.8246, "step": 15120 }, { "epoch": 39.9366127434797, "grad_norm": 626.3837890625, "learning_rate": 4.821332663456143e-05, "loss": 39.6466, "step": 15121 }, { "epoch": 39.93925387916804, "grad_norm": 1187.811279296875, "learning_rate": 4.81886666577738e-05, "loss": 38.2856, "step": 15122 }, { "epoch": 39.941895014856385, "grad_norm": 6696.9853515625, "learning_rate": 4.81640123164491e-05, "loss": 36.9502, "step": 15123 }, { "epoch": 39.944536150544735, "grad_norm": 995.9095458984375, "learning_rate": 4.8139363611275774e-05, "loss": 37.619, "step": 15124 }, { "epoch": 39.94717728623308, "grad_norm": 561.1224975585938, "learning_rate": 4.811472054294214e-05, "loss": 34.5882, "step": 15125 }, { "epoch": 39.94981842192143, "grad_norm": 845.6929321289062, "learning_rate": 4.809008311213628e-05, "loss": 35.643, "step": 15126 }, { "epoch": 39.95245955760977, "grad_norm": 524.87939453125, "learning_rate": 4.806545131954632e-05, "loss": 34.7387, "step": 15127 }, { "epoch": 39.95510069329812, "grad_norm": 1206.759521484375, "learning_rate": 4.8040825165860006e-05, "loss": 36.5263, "step": 15128 }, { "epoch": 39.957741828986464, "grad_norm": 2405.865478515625, "learning_rate": 4.8016204651764985e-05, "loss": 34.1999, "step": 15129 }, { "epoch": 39.96038296467481, "grad_norm": 13456.8310546875, "learning_rate": 4.799158977794882e-05, "loss": 28.0012, "step": 15130 }, { "epoch": 39.96302410036316, "grad_norm": 785.1386108398438, "learning_rate": 4.7966980545098846e-05, "loss": 10.9811, "step": 15131 }, { "epoch": 39.9656652360515, "grad_norm": 2390.748291015625, "learning_rate": 4.7942376953902245e-05, "loss": 7.8872, "step": 15132 }, { "epoch": 39.96830637173985, "grad_norm": 1759.0166015625, "learning_rate": 4.791777900504604e-05, "loss": 12.2008, "step": 15133 }, { "epoch": 39.97094750742819, "grad_norm": 27346.427734375, "learning_rate": 4.7893186699217186e-05, "loss": 9.7484, "step": 15134 }, { "epoch": 39.97358864311654, "grad_norm": 1683.4141845703125, "learning_rate": 4.786860003710236e-05, "loss": 20.9019, "step": 15135 }, { "epoch": 39.976229778804885, "grad_norm": 656.9573364257812, "learning_rate": 4.784401901938815e-05, "loss": 33.9304, "step": 15136 }, { "epoch": 39.978870914493235, "grad_norm": 3212.61767578125, "learning_rate": 4.7819443646760886e-05, "loss": 39.0725, "step": 15137 }, { "epoch": 39.98151205018158, "grad_norm": 765.5891723632812, "learning_rate": 4.779487391990697e-05, "loss": 34.9955, "step": 15138 }, { "epoch": 39.98415318586992, "grad_norm": 928.6543579101562, "learning_rate": 4.7770309839512416e-05, "loss": 35.0737, "step": 15139 }, { "epoch": 39.98679432155827, "grad_norm": 1673.0751953125, "learning_rate": 4.7745751406263163e-05, "loss": 34.6224, "step": 15140 }, { "epoch": 39.989435457246614, "grad_norm": 1059.062744140625, "learning_rate": 4.7721198620844977e-05, "loss": 34.5876, "step": 15141 }, { "epoch": 39.992076592934964, "grad_norm": 1772.10791015625, "learning_rate": 4.7696651483943505e-05, "loss": 34.6241, "step": 15142 }, { "epoch": 39.99471772862331, "grad_norm": 1414.863037109375, "learning_rate": 4.7672109996244216e-05, "loss": 33.5613, "step": 15143 }, { "epoch": 39.99735886431166, "grad_norm": 1657.76220703125, "learning_rate": 4.764757415843232e-05, "loss": 34.9564, "step": 15144 }, { "epoch": 40.0, "grad_norm": 5253.7314453125, "learning_rate": 4.76230439711931e-05, "loss": 38.0393, "step": 15145 }, { "epoch": 40.00264113568834, "grad_norm": 587.0114135742188, "learning_rate": 4.759851943521151e-05, "loss": 38.3398, "step": 15146 }, { "epoch": 40.00528227137669, "grad_norm": 1116.63720703125, "learning_rate": 4.757400055117231e-05, "loss": 38.06, "step": 15147 }, { "epoch": 40.007923407065036, "grad_norm": 699.8850708007812, "learning_rate": 4.754948731976036e-05, "loss": 36.7995, "step": 15148 }, { "epoch": 40.010564542753386, "grad_norm": 719.4987182617188, "learning_rate": 4.752497974165998e-05, "loss": 41.0624, "step": 15149 }, { "epoch": 40.01320567844173, "grad_norm": 692.3517456054688, "learning_rate": 4.750047781755559e-05, "loss": 42.394, "step": 15150 }, { "epoch": 40.01584681413008, "grad_norm": 2654.305908203125, "learning_rate": 4.747598154813135e-05, "loss": 41.3363, "step": 15151 }, { "epoch": 40.01848794981842, "grad_norm": 942.2615356445312, "learning_rate": 4.7451490934071414e-05, "loss": 43.5754, "step": 15152 }, { "epoch": 40.021129085506765, "grad_norm": 928.2283935546875, "learning_rate": 4.742700597605962e-05, "loss": 40.836, "step": 15153 }, { "epoch": 40.023770221195115, "grad_norm": 1097.8538818359375, "learning_rate": 4.740252667477959e-05, "loss": 39.1722, "step": 15154 }, { "epoch": 40.02641135688346, "grad_norm": 687.978759765625, "learning_rate": 4.737805303091511e-05, "loss": 42.0726, "step": 15155 }, { "epoch": 40.02905249257181, "grad_norm": 1112.7001953125, "learning_rate": 4.7353585045149444e-05, "loss": 38.8281, "step": 15156 }, { "epoch": 40.03169362826015, "grad_norm": 970.3552856445312, "learning_rate": 4.7329122718165864e-05, "loss": 37.6093, "step": 15157 }, { "epoch": 40.0343347639485, "grad_norm": 1415.4560546875, "learning_rate": 4.730466605064748e-05, "loss": 36.5557, "step": 15158 }, { "epoch": 40.03697589963684, "grad_norm": 986.3914184570312, "learning_rate": 4.7280215043277234e-05, "loss": 35.0796, "step": 15159 }, { "epoch": 40.03961703532519, "grad_norm": 954.927001953125, "learning_rate": 4.725576969673789e-05, "loss": 35.6629, "step": 15160 }, { "epoch": 40.042258171013536, "grad_norm": 2058.325439453125, "learning_rate": 4.723133001171204e-05, "loss": 34.8262, "step": 15161 }, { "epoch": 40.04489930670188, "grad_norm": 849.8889770507812, "learning_rate": 4.7206895988882224e-05, "loss": 35.4663, "step": 15162 }, { "epoch": 40.04754044239023, "grad_norm": 1087.6588134765625, "learning_rate": 4.718246762893072e-05, "loss": 33.4616, "step": 15163 }, { "epoch": 40.05018157807857, "grad_norm": 820.5493774414062, "learning_rate": 4.715804493253967e-05, "loss": 35.1242, "step": 15164 }, { "epoch": 40.05282271376692, "grad_norm": 814.1227416992188, "learning_rate": 4.713362790039105e-05, "loss": 34.291, "step": 15165 }, { "epoch": 40.055463849455265, "grad_norm": 1594.0843505859375, "learning_rate": 4.7109216533166715e-05, "loss": 36.1678, "step": 15166 }, { "epoch": 40.058104985143615, "grad_norm": 17285.36328125, "learning_rate": 4.708481083154834e-05, "loss": 31.4111, "step": 15167 }, { "epoch": 40.06074612083196, "grad_norm": 895.1237182617188, "learning_rate": 4.7060410796217315e-05, "loss": 12.1447, "step": 15168 }, { "epoch": 40.0633872565203, "grad_norm": 4757.55126953125, "learning_rate": 4.7036016427855185e-05, "loss": 14.4254, "step": 15169 }, { "epoch": 40.06602839220865, "grad_norm": 15581.0732421875, "learning_rate": 4.7011627727143067e-05, "loss": 13.0125, "step": 15170 }, { "epoch": 40.06866952789699, "grad_norm": 1312.40869140625, "learning_rate": 4.698724469476201e-05, "loss": 10.9226, "step": 15171 }, { "epoch": 40.07131066358534, "grad_norm": 4499.85205078125, "learning_rate": 4.696286733139285e-05, "loss": 11.6444, "step": 15172 }, { "epoch": 40.073951799273686, "grad_norm": 814.5867309570312, "learning_rate": 4.693849563771638e-05, "loss": 10.1096, "step": 15173 }, { "epoch": 40.07659293496204, "grad_norm": 7018.6279296875, "learning_rate": 4.6914129614413134e-05, "loss": 16.0523, "step": 15174 }, { "epoch": 40.07923407065038, "grad_norm": 1085.8656005859375, "learning_rate": 4.688976926216354e-05, "loss": 7.9624, "step": 15175 }, { "epoch": 40.08187520633872, "grad_norm": 1923.9122314453125, "learning_rate": 4.6865414581647806e-05, "loss": 8.9772, "step": 15176 }, { "epoch": 40.08451634202707, "grad_norm": 1025.1146240234375, "learning_rate": 4.684106557354606e-05, "loss": 31.589, "step": 15177 }, { "epoch": 40.087157477715415, "grad_norm": 902.375244140625, "learning_rate": 4.6816722238538195e-05, "loss": 34.4943, "step": 15178 }, { "epoch": 40.089798613403765, "grad_norm": 2264.390380859375, "learning_rate": 4.679238457730395e-05, "loss": 34.1171, "step": 15179 }, { "epoch": 40.09243974909211, "grad_norm": 1949.648193359375, "learning_rate": 4.676805259052305e-05, "loss": 33.7154, "step": 15180 }, { "epoch": 40.09508088478046, "grad_norm": 992.551025390625, "learning_rate": 4.674372627887491e-05, "loss": 33.3533, "step": 15181 }, { "epoch": 40.0977220204688, "grad_norm": 1782.2037353515625, "learning_rate": 4.6719405643038756e-05, "loss": 34.1634, "step": 15182 }, { "epoch": 40.10036315615715, "grad_norm": 1688.1968994140625, "learning_rate": 4.6695090683693904e-05, "loss": 36.0822, "step": 15183 }, { "epoch": 40.103004291845494, "grad_norm": 1175.509521484375, "learning_rate": 4.6670781401519145e-05, "loss": 33.7592, "step": 15184 }, { "epoch": 40.10564542753384, "grad_norm": 867.168212890625, "learning_rate": 4.664647779719333e-05, "loss": 35.8451, "step": 15185 }, { "epoch": 40.10828656322219, "grad_norm": 7298.86767578125, "learning_rate": 4.662217987139522e-05, "loss": 33.8005, "step": 15186 }, { "epoch": 40.11092769891053, "grad_norm": 1394.6077880859375, "learning_rate": 4.659788762480327e-05, "loss": 35.9586, "step": 15187 }, { "epoch": 40.11356883459888, "grad_norm": 801.0404663085938, "learning_rate": 4.657360105809583e-05, "loss": 34.6435, "step": 15188 }, { "epoch": 40.11620997028722, "grad_norm": 2256.4677734375, "learning_rate": 4.654932017195099e-05, "loss": 34.2925, "step": 15189 }, { "epoch": 40.11885110597557, "grad_norm": 1301.7213134765625, "learning_rate": 4.652504496704696e-05, "loss": 35.1523, "step": 15190 }, { "epoch": 40.121492241663915, "grad_norm": 1477.508056640625, "learning_rate": 4.650077544406151e-05, "loss": 34.2107, "step": 15191 }, { "epoch": 40.12413337735226, "grad_norm": 1117.8846435546875, "learning_rate": 4.647651160367236e-05, "loss": 34.9664, "step": 15192 }, { "epoch": 40.12677451304061, "grad_norm": 2429.40966796875, "learning_rate": 4.645225344655707e-05, "loss": 36.5293, "step": 15193 }, { "epoch": 40.12941564872895, "grad_norm": 1533.4783935546875, "learning_rate": 4.642800097339303e-05, "loss": 37.7624, "step": 15194 }, { "epoch": 40.1320567844173, "grad_norm": 2641.60546875, "learning_rate": 4.6403754184857485e-05, "loss": 40.6491, "step": 15195 }, { "epoch": 40.134697920105644, "grad_norm": 1411.9737548828125, "learning_rate": 4.637951308162744e-05, "loss": 37.5373, "step": 15196 }, { "epoch": 40.137339055793994, "grad_norm": 1106.804443359375, "learning_rate": 4.635527766437994e-05, "loss": 38.3719, "step": 15197 }, { "epoch": 40.13998019148234, "grad_norm": 965.8001098632812, "learning_rate": 4.6331047933791695e-05, "loss": 38.127, "step": 15198 }, { "epoch": 40.14262132717068, "grad_norm": 1097.576904296875, "learning_rate": 4.630682389053922e-05, "loss": 39.7173, "step": 15199 }, { "epoch": 40.14526246285903, "grad_norm": 650.2764892578125, "learning_rate": 4.628260553529917e-05, "loss": 43.1633, "step": 15200 }, { "epoch": 40.14526246285903, "eval_loss": 3.7944846153259277, "eval_runtime": 2.1736, "eval_samples_per_second": 227.733, "eval_steps_per_second": 28.524, "step": 15200 }, { "epoch": 40.14790359854737, "grad_norm": 1572.431640625, "learning_rate": 4.6258392868747615e-05, "loss": 41.0353, "step": 15201 }, { "epoch": 40.15054473423572, "grad_norm": 916.9337158203125, "learning_rate": 4.6234185891560755e-05, "loss": 43.0275, "step": 15202 }, { "epoch": 40.153185869924066, "grad_norm": 602.8663940429688, "learning_rate": 4.6209984604414504e-05, "loss": 40.0268, "step": 15203 }, { "epoch": 40.155827005612416, "grad_norm": 1923.4366455078125, "learning_rate": 4.618578900798479e-05, "loss": 40.6368, "step": 15204 }, { "epoch": 40.15846814130076, "grad_norm": 1934.569580078125, "learning_rate": 4.616159910294718e-05, "loss": 39.1318, "step": 15205 }, { "epoch": 40.16110927698911, "grad_norm": 1276.73828125, "learning_rate": 4.613741488997708e-05, "loss": 38.428, "step": 15206 }, { "epoch": 40.16375041267745, "grad_norm": 1665.0048828125, "learning_rate": 4.611323636975001e-05, "loss": 36.3453, "step": 15207 }, { "epoch": 40.166391548365795, "grad_norm": 904.52392578125, "learning_rate": 4.608906354294104e-05, "loss": 36.428, "step": 15208 }, { "epoch": 40.169032684054145, "grad_norm": 597.5055541992188, "learning_rate": 4.606489641022518e-05, "loss": 36.0503, "step": 15209 }, { "epoch": 40.17167381974249, "grad_norm": 772.0050659179688, "learning_rate": 4.604073497227731e-05, "loss": 35.7682, "step": 15210 }, { "epoch": 40.17431495543084, "grad_norm": 1569.0755615234375, "learning_rate": 4.601657922977209e-05, "loss": 35.6776, "step": 15211 }, { "epoch": 40.17695609111918, "grad_norm": 1124.3641357421875, "learning_rate": 4.599242918338406e-05, "loss": 35.174, "step": 15212 }, { "epoch": 40.17959722680753, "grad_norm": 1310.4432373046875, "learning_rate": 4.596828483378757e-05, "loss": 35.1955, "step": 15213 }, { "epoch": 40.18223836249587, "grad_norm": 850.009521484375, "learning_rate": 4.594414618165691e-05, "loss": 34.8868, "step": 15214 }, { "epoch": 40.184879498184216, "grad_norm": 1463.16455078125, "learning_rate": 4.592001322766609e-05, "loss": 34.3915, "step": 15215 }, { "epoch": 40.187520633872566, "grad_norm": 4045.95166015625, "learning_rate": 4.589588597248903e-05, "loss": 34.2764, "step": 15216 }, { "epoch": 40.19016176956091, "grad_norm": 3178.342529296875, "learning_rate": 4.587176441679941e-05, "loss": 41.833, "step": 15217 }, { "epoch": 40.19280290524926, "grad_norm": 2709.323974609375, "learning_rate": 4.584764856127097e-05, "loss": 30.0285, "step": 15218 }, { "epoch": 40.1954440409376, "grad_norm": 968.6331176757812, "learning_rate": 4.5823538406576936e-05, "loss": 8.0389, "step": 15219 }, { "epoch": 40.19808517662595, "grad_norm": 661.733154296875, "learning_rate": 4.5799433953390616e-05, "loss": 8.1824, "step": 15220 }, { "epoch": 40.200726312314295, "grad_norm": 861.2553100585938, "learning_rate": 4.577533520238519e-05, "loss": 8.8176, "step": 15221 }, { "epoch": 40.20336744800264, "grad_norm": 3326.939453125, "learning_rate": 4.57512421542336e-05, "loss": 10.6631, "step": 15222 }, { "epoch": 40.20600858369099, "grad_norm": 1586.228271484375, "learning_rate": 4.572715480960854e-05, "loss": 14.1179, "step": 15223 }, { "epoch": 40.20864971937933, "grad_norm": 1142.31884765625, "learning_rate": 4.570307316918265e-05, "loss": 9.854, "step": 15224 }, { "epoch": 40.21129085506768, "grad_norm": 2917.141845703125, "learning_rate": 4.5678997233628505e-05, "loss": 13.2707, "step": 15225 }, { "epoch": 40.21393199075602, "grad_norm": 875.4080200195312, "learning_rate": 4.565492700361831e-05, "loss": 13.32, "step": 15226 }, { "epoch": 40.21657312644437, "grad_norm": 869.2076416015625, "learning_rate": 4.5630862479824256e-05, "loss": 7.4242, "step": 15227 }, { "epoch": 40.219214262132716, "grad_norm": 647.7198486328125, "learning_rate": 4.560680366291831e-05, "loss": 35.4808, "step": 15228 }, { "epoch": 40.221855397821066, "grad_norm": 1394.866455078125, "learning_rate": 4.558275055357231e-05, "loss": 34.9739, "step": 15229 }, { "epoch": 40.22449653350941, "grad_norm": 948.906005859375, "learning_rate": 4.555870315245792e-05, "loss": 33.6762, "step": 15230 }, { "epoch": 40.22713766919775, "grad_norm": 1816.283935546875, "learning_rate": 4.553466146024657e-05, "loss": 35.1708, "step": 15231 }, { "epoch": 40.2297788048861, "grad_norm": 2226.45849609375, "learning_rate": 4.551062547760976e-05, "loss": 33.1883, "step": 15232 }, { "epoch": 40.232419940574445, "grad_norm": 2542.692138671875, "learning_rate": 4.548659520521861e-05, "loss": 34.4938, "step": 15233 }, { "epoch": 40.235061076262795, "grad_norm": 1338.8853759765625, "learning_rate": 4.54625706437441e-05, "loss": 34.8586, "step": 15234 }, { "epoch": 40.23770221195114, "grad_norm": 777.7747802734375, "learning_rate": 4.543855179385728e-05, "loss": 33.9683, "step": 15235 }, { "epoch": 40.24034334763949, "grad_norm": 3237.112060546875, "learning_rate": 4.541453865622866e-05, "loss": 34.4585, "step": 15236 }, { "epoch": 40.24298448332783, "grad_norm": 1301.6888427734375, "learning_rate": 4.539053123152886e-05, "loss": 34.7468, "step": 15237 }, { "epoch": 40.245625619016174, "grad_norm": 1380.686279296875, "learning_rate": 4.536652952042824e-05, "loss": 35.0589, "step": 15238 }, { "epoch": 40.248266754704524, "grad_norm": 4030.159423828125, "learning_rate": 4.534253352359713e-05, "loss": 35.2515, "step": 15239 }, { "epoch": 40.25090789039287, "grad_norm": 2011.3541259765625, "learning_rate": 4.5318543241705535e-05, "loss": 34.3556, "step": 15240 }, { "epoch": 40.25354902608122, "grad_norm": 981.8831176757812, "learning_rate": 4.5294558675423344e-05, "loss": 34.6408, "step": 15241 }, { "epoch": 40.25619016176956, "grad_norm": 2049.776123046875, "learning_rate": 4.52705798254204e-05, "loss": 34.1344, "step": 15242 }, { "epoch": 40.25883129745791, "grad_norm": 1123.0889892578125, "learning_rate": 4.524660669236624e-05, "loss": 36.3807, "step": 15243 }, { "epoch": 40.26147243314625, "grad_norm": 761.650146484375, "learning_rate": 4.522263927693035e-05, "loss": 34.879, "step": 15244 }, { "epoch": 40.264113568834595, "grad_norm": 9862.6572265625, "learning_rate": 4.519867757978194e-05, "loss": 39.3489, "step": 15245 }, { "epoch": 40.266754704522945, "grad_norm": 586.0521240234375, "learning_rate": 4.517472160159014e-05, "loss": 38.7792, "step": 15246 }, { "epoch": 40.26939584021129, "grad_norm": 1154.2376708984375, "learning_rate": 4.5150771343023924e-05, "loss": 39.1315, "step": 15247 }, { "epoch": 40.27203697589964, "grad_norm": 4888.09228515625, "learning_rate": 4.5126826804752025e-05, "loss": 39.8528, "step": 15248 }, { "epoch": 40.27467811158798, "grad_norm": 2303.525634765625, "learning_rate": 4.510288798744319e-05, "loss": 39.6248, "step": 15249 }, { "epoch": 40.27731924727633, "grad_norm": 957.658447265625, "learning_rate": 4.507895489176586e-05, "loss": 42.0052, "step": 15250 }, { "epoch": 40.279960382964674, "grad_norm": 1703.3555908203125, "learning_rate": 4.505502751838833e-05, "loss": 42.4902, "step": 15251 }, { "epoch": 40.282601518653024, "grad_norm": 2989.34814453125, "learning_rate": 4.503110586797871e-05, "loss": 40.542, "step": 15252 }, { "epoch": 40.28524265434137, "grad_norm": 905.5454711914062, "learning_rate": 4.500718994120517e-05, "loss": 39.3781, "step": 15253 }, { "epoch": 40.28788379002971, "grad_norm": 897.7832641601562, "learning_rate": 4.498327973873537e-05, "loss": 39.7144, "step": 15254 }, { "epoch": 40.29052492571806, "grad_norm": 846.068603515625, "learning_rate": 4.4959375261237e-05, "loss": 38.5771, "step": 15255 }, { "epoch": 40.2931660614064, "grad_norm": 816.8997192382812, "learning_rate": 4.493547650937768e-05, "loss": 37.7993, "step": 15256 }, { "epoch": 40.29580719709475, "grad_norm": 755.1742553710938, "learning_rate": 4.491158348382471e-05, "loss": 37.3324, "step": 15257 }, { "epoch": 40.298448332783096, "grad_norm": 1030.548095703125, "learning_rate": 4.488769618524532e-05, "loss": 36.9108, "step": 15258 }, { "epoch": 40.301089468471446, "grad_norm": 1589.7921142578125, "learning_rate": 4.4863814614306444e-05, "loss": 37.3026, "step": 15259 }, { "epoch": 40.30373060415979, "grad_norm": 1182.7822265625, "learning_rate": 4.483993877167511e-05, "loss": 34.9582, "step": 15260 }, { "epoch": 40.30637173984813, "grad_norm": 1608.307861328125, "learning_rate": 4.4816068658017964e-05, "loss": 35.5822, "step": 15261 }, { "epoch": 40.30901287553648, "grad_norm": 5455.7314453125, "learning_rate": 4.479220427400158e-05, "loss": 34.9542, "step": 15262 }, { "epoch": 40.311654011224825, "grad_norm": 733.4805297851562, "learning_rate": 4.476834562029233e-05, "loss": 34.9725, "step": 15263 }, { "epoch": 40.314295146913175, "grad_norm": 1944.4810791015625, "learning_rate": 4.474449269755646e-05, "loss": 35.5696, "step": 15264 }, { "epoch": 40.31693628260152, "grad_norm": 793.0634765625, "learning_rate": 4.472064550646007e-05, "loss": 35.4572, "step": 15265 }, { "epoch": 40.31957741828987, "grad_norm": 953.3244018554688, "learning_rate": 4.4696804047669015e-05, "loss": 36.2865, "step": 15266 }, { "epoch": 40.32221855397821, "grad_norm": 1559.474365234375, "learning_rate": 4.4672968321849165e-05, "loss": 36.5087, "step": 15267 }, { "epoch": 40.32485968966655, "grad_norm": 2562.882568359375, "learning_rate": 4.464913832966605e-05, "loss": 33.6589, "step": 15268 }, { "epoch": 40.3275008253549, "grad_norm": 7633.2685546875, "learning_rate": 4.462531407178505e-05, "loss": 12.4501, "step": 15269 }, { "epoch": 40.330141961043246, "grad_norm": 3183.19140625, "learning_rate": 4.460149554887166e-05, "loss": 12.7856, "step": 15270 }, { "epoch": 40.332783096731596, "grad_norm": 1749.1417236328125, "learning_rate": 4.457768276159077e-05, "loss": 11.5505, "step": 15271 }, { "epoch": 40.33542423241994, "grad_norm": 2977.053955078125, "learning_rate": 4.45538757106074e-05, "loss": 10.9091, "step": 15272 }, { "epoch": 40.33806536810829, "grad_norm": 489.0174560546875, "learning_rate": 4.4530074396586326e-05, "loss": 7.7587, "step": 15273 }, { "epoch": 40.34070650379663, "grad_norm": 1633.774658203125, "learning_rate": 4.450627882019229e-05, "loss": 12.0187, "step": 15274 }, { "epoch": 40.34334763948498, "grad_norm": 2437.563232421875, "learning_rate": 4.44824889820897e-05, "loss": 10.8448, "step": 15275 }, { "epoch": 40.345988775173325, "grad_norm": 3850.289306640625, "learning_rate": 4.445870488294282e-05, "loss": 9.7703, "step": 15276 }, { "epoch": 40.34862991086167, "grad_norm": 2528.851806640625, "learning_rate": 4.443492652341591e-05, "loss": 17.9798, "step": 15277 }, { "epoch": 40.35127104655002, "grad_norm": 3484.721435546875, "learning_rate": 4.441115390417294e-05, "loss": 36.8234, "step": 15278 }, { "epoch": 40.35391218223836, "grad_norm": 4993.81787109375, "learning_rate": 4.4387387025877716e-05, "loss": 34.241, "step": 15279 }, { "epoch": 40.35655331792671, "grad_norm": 919.0106811523438, "learning_rate": 4.436362588919393e-05, "loss": 35.557, "step": 15280 }, { "epoch": 40.35919445361505, "grad_norm": 969.0474853515625, "learning_rate": 4.433987049478508e-05, "loss": 33.3961, "step": 15281 }, { "epoch": 40.3618355893034, "grad_norm": 1041.56591796875, "learning_rate": 4.431612084331454e-05, "loss": 34.3715, "step": 15282 }, { "epoch": 40.364476724991746, "grad_norm": 2342.662841796875, "learning_rate": 4.4292376935445464e-05, "loss": 34.4989, "step": 15283 }, { "epoch": 40.36711786068009, "grad_norm": 880.9511108398438, "learning_rate": 4.4268638771840954e-05, "loss": 35.1118, "step": 15284 }, { "epoch": 40.36975899636844, "grad_norm": 1790.9395751953125, "learning_rate": 4.424490635316386e-05, "loss": 35.3269, "step": 15285 }, { "epoch": 40.37240013205678, "grad_norm": 983.7274780273438, "learning_rate": 4.422117968007691e-05, "loss": 33.6208, "step": 15286 }, { "epoch": 40.37504126774513, "grad_norm": 1013.4375, "learning_rate": 4.419745875324255e-05, "loss": 35.8095, "step": 15287 }, { "epoch": 40.377682403433475, "grad_norm": 1168.360107421875, "learning_rate": 4.41737435733234e-05, "loss": 34.39, "step": 15288 }, { "epoch": 40.380323539121825, "grad_norm": 3247.93115234375, "learning_rate": 4.415003414098148e-05, "loss": 34.495, "step": 15289 }, { "epoch": 40.38296467481017, "grad_norm": 1228.3365478515625, "learning_rate": 4.412633045687886e-05, "loss": 36.1698, "step": 15290 }, { "epoch": 40.38560581049851, "grad_norm": 3406.974365234375, "learning_rate": 4.410263252167762e-05, "loss": 34.1174, "step": 15291 }, { "epoch": 40.38824694618686, "grad_norm": 2124.00146484375, "learning_rate": 4.40789403360394e-05, "loss": 34.1894, "step": 15292 }, { "epoch": 40.390888081875204, "grad_norm": 2522.06787109375, "learning_rate": 4.405525390062579e-05, "loss": 36.4944, "step": 15293 }, { "epoch": 40.393529217563554, "grad_norm": 2284.107421875, "learning_rate": 4.403157321609819e-05, "loss": 36.3657, "step": 15294 }, { "epoch": 40.3961703532519, "grad_norm": 6704.01513671875, "learning_rate": 4.4007898283117995e-05, "loss": 41.3613, "step": 15295 }, { "epoch": 40.39881148894025, "grad_norm": 889.0579223632812, "learning_rate": 4.39842291023462e-05, "loss": 38.0583, "step": 15296 }, { "epoch": 40.40145262462859, "grad_norm": 1483.244873046875, "learning_rate": 4.3960565674443816e-05, "loss": 38.5688, "step": 15297 }, { "epoch": 40.40409376031694, "grad_norm": 1291.076904296875, "learning_rate": 4.393690800007158e-05, "loss": 38.3662, "step": 15298 }, { "epoch": 40.40673489600528, "grad_norm": 1392.7567138671875, "learning_rate": 4.391325607989013e-05, "loss": 39.7761, "step": 15299 }, { "epoch": 40.409376031693625, "grad_norm": 733.30712890625, "learning_rate": 4.388960991455998e-05, "loss": 40.2204, "step": 15300 }, { "epoch": 40.412017167381975, "grad_norm": 808.9638671875, "learning_rate": 4.386596950474131e-05, "loss": 41.1319, "step": 15301 }, { "epoch": 40.41465830307032, "grad_norm": 1030.528076171875, "learning_rate": 4.384233485109443e-05, "loss": 38.3629, "step": 15302 }, { "epoch": 40.41729943875867, "grad_norm": 889.3788452148438, "learning_rate": 4.3818705954279226e-05, "loss": 40.5904, "step": 15303 }, { "epoch": 40.41994057444701, "grad_norm": 1651.87841796875, "learning_rate": 4.379508281495548e-05, "loss": 40.3413, "step": 15304 }, { "epoch": 40.42258171013536, "grad_norm": 3033.1728515625, "learning_rate": 4.3771465433783045e-05, "loss": 37.4482, "step": 15305 }, { "epoch": 40.425222845823704, "grad_norm": 854.2684936523438, "learning_rate": 4.374785381142124e-05, "loss": 36.6701, "step": 15306 }, { "epoch": 40.42786398151205, "grad_norm": 694.8422241210938, "learning_rate": 4.372424794852944e-05, "loss": 37.9413, "step": 15307 }, { "epoch": 40.4305051172004, "grad_norm": 3580.375732421875, "learning_rate": 4.37006478457668e-05, "loss": 36.2534, "step": 15308 }, { "epoch": 40.43314625288874, "grad_norm": 482.47161865234375, "learning_rate": 4.367705350379245e-05, "loss": 36.4851, "step": 15309 }, { "epoch": 40.43578738857709, "grad_norm": 1218.5584716796875, "learning_rate": 4.365346492326516e-05, "loss": 35.4367, "step": 15310 }, { "epoch": 40.43842852426543, "grad_norm": 1176.6273193359375, "learning_rate": 4.362988210484359e-05, "loss": 35.6365, "step": 15311 }, { "epoch": 40.44106965995378, "grad_norm": 912.35888671875, "learning_rate": 4.360630504918639e-05, "loss": 34.5478, "step": 15312 }, { "epoch": 40.443710795642126, "grad_norm": 1075.7076416015625, "learning_rate": 4.3582733756951877e-05, "loss": 35.6295, "step": 15313 }, { "epoch": 40.44635193133047, "grad_norm": 4391.34033203125, "learning_rate": 4.35591682287983e-05, "loss": 34.8605, "step": 15314 }, { "epoch": 40.44899306701882, "grad_norm": 1966.6339111328125, "learning_rate": 4.353560846538365e-05, "loss": 35.4329, "step": 15315 }, { "epoch": 40.45163420270716, "grad_norm": 1199.3834228515625, "learning_rate": 4.3512054467365866e-05, "loss": 35.4446, "step": 15316 }, { "epoch": 40.45427533839551, "grad_norm": 2819.877685546875, "learning_rate": 4.348850623540265e-05, "loss": 38.8806, "step": 15317 }, { "epoch": 40.456916474083854, "grad_norm": 4031.22265625, "learning_rate": 4.3464963770151524e-05, "loss": 16.2949, "step": 15318 }, { "epoch": 40.459557609772205, "grad_norm": 1855.0914306640625, "learning_rate": 4.344142707227003e-05, "loss": 8.8685, "step": 15319 }, { "epoch": 40.46219874546055, "grad_norm": 2224.541748046875, "learning_rate": 4.341789614241534e-05, "loss": 13.895, "step": 15320 }, { "epoch": 40.4648398811489, "grad_norm": 1462.18505859375, "learning_rate": 4.339437098124449e-05, "loss": 10.8786, "step": 15321 }, { "epoch": 40.46748101683724, "grad_norm": 16425.732421875, "learning_rate": 4.3370851589414505e-05, "loss": 13.4261, "step": 15322 }, { "epoch": 40.47012215252558, "grad_norm": 971.2916870117188, "learning_rate": 4.3347337967582194e-05, "loss": 12.4098, "step": 15323 }, { "epoch": 40.47276328821393, "grad_norm": 2506.47509765625, "learning_rate": 4.3323830116404006e-05, "loss": 11.2589, "step": 15324 }, { "epoch": 40.475404423902276, "grad_norm": 1736.8997802734375, "learning_rate": 4.330032803653641e-05, "loss": 12.3831, "step": 15325 }, { "epoch": 40.478045559590626, "grad_norm": 5290.01318359375, "learning_rate": 4.3276831728635776e-05, "loss": 13.7799, "step": 15326 }, { "epoch": 40.48068669527897, "grad_norm": 65101.48046875, "learning_rate": 4.3253341193358195e-05, "loss": 10.871, "step": 15327 }, { "epoch": 40.48332783096732, "grad_norm": 1439.6168212890625, "learning_rate": 4.3229856431359515e-05, "loss": 34.5533, "step": 15328 }, { "epoch": 40.48596896665566, "grad_norm": 1019.7968139648438, "learning_rate": 4.320637744329573e-05, "loss": 34.4084, "step": 15329 }, { "epoch": 40.488610102344005, "grad_norm": 1515.965087890625, "learning_rate": 4.318290422982235e-05, "loss": 34.0277, "step": 15330 }, { "epoch": 40.491251238032355, "grad_norm": 1731.356201171875, "learning_rate": 4.3159436791594896e-05, "loss": 34.8873, "step": 15331 }, { "epoch": 40.4938923737207, "grad_norm": 1366.3135986328125, "learning_rate": 4.313597512926867e-05, "loss": 34.3933, "step": 15332 }, { "epoch": 40.49653350940905, "grad_norm": 1145.1212158203125, "learning_rate": 4.3112519243498794e-05, "loss": 33.7993, "step": 15333 }, { "epoch": 40.49917464509739, "grad_norm": 1289.076416015625, "learning_rate": 4.308906913494032e-05, "loss": 34.9706, "step": 15334 }, { "epoch": 40.50181578078574, "grad_norm": 1475.4669189453125, "learning_rate": 4.3065624804247946e-05, "loss": 33.7277, "step": 15335 }, { "epoch": 40.50445691647408, "grad_norm": 9705.845703125, "learning_rate": 4.304218625207654e-05, "loss": 34.8367, "step": 15336 }, { "epoch": 40.507098052162426, "grad_norm": 1286.7020263671875, "learning_rate": 4.3018753479080494e-05, "loss": 36.1279, "step": 15337 }, { "epoch": 40.509739187850776, "grad_norm": 1752.60595703125, "learning_rate": 4.299532648591414e-05, "loss": 34.5244, "step": 15338 }, { "epoch": 40.51238032353912, "grad_norm": 2977.433349609375, "learning_rate": 4.297190527323167e-05, "loss": 34.852, "step": 15339 }, { "epoch": 40.51502145922747, "grad_norm": 2435.466796875, "learning_rate": 4.294848984168723e-05, "loss": 34.0135, "step": 15340 }, { "epoch": 40.51766259491581, "grad_norm": 1212.72119140625, "learning_rate": 4.292508019193453e-05, "loss": 33.5488, "step": 15341 }, { "epoch": 40.52030373060416, "grad_norm": 2578.104736328125, "learning_rate": 4.290167632462727e-05, "loss": 34.5327, "step": 15342 }, { "epoch": 40.522944866292505, "grad_norm": 1432.343994140625, "learning_rate": 4.287827824041909e-05, "loss": 34.8718, "step": 15343 }, { "epoch": 40.525586001980855, "grad_norm": 1790.3856201171875, "learning_rate": 4.285488593996334e-05, "loss": 35.2811, "step": 15344 }, { "epoch": 40.5282271376692, "grad_norm": 1805.36181640625, "learning_rate": 4.2831499423913206e-05, "loss": 39.923, "step": 15345 }, { "epoch": 40.53086827335754, "grad_norm": 824.5446166992188, "learning_rate": 4.280811869292167e-05, "loss": 39.0149, "step": 15346 }, { "epoch": 40.53350940904589, "grad_norm": 1016.1780395507812, "learning_rate": 4.27847437476418e-05, "loss": 39.4787, "step": 15347 }, { "epoch": 40.536150544734234, "grad_norm": 1931.3095703125, "learning_rate": 4.276137458872623e-05, "loss": 38.8521, "step": 15348 }, { "epoch": 40.538791680422584, "grad_norm": 1131.521728515625, "learning_rate": 4.273801121682752e-05, "loss": 39.0194, "step": 15349 }, { "epoch": 40.54143281611093, "grad_norm": 1037.62353515625, "learning_rate": 4.2714653632598104e-05, "loss": 41.7673, "step": 15350 }, { "epoch": 40.54407395179928, "grad_norm": 1726.658447265625, "learning_rate": 4.269130183669023e-05, "loss": 39.0988, "step": 15351 }, { "epoch": 40.54671508748762, "grad_norm": 1360.0087890625, "learning_rate": 4.2667955829755964e-05, "loss": 40.4698, "step": 15352 }, { "epoch": 40.54935622317596, "grad_norm": 1667.17333984375, "learning_rate": 4.2644615612447175e-05, "loss": 40.7324, "step": 15353 }, { "epoch": 40.55199735886431, "grad_norm": 1245.56884765625, "learning_rate": 4.262128118541575e-05, "loss": 39.9982, "step": 15354 }, { "epoch": 40.554638494552655, "grad_norm": 1024.9232177734375, "learning_rate": 4.259795254931323e-05, "loss": 38.5556, "step": 15355 }, { "epoch": 40.557279630241005, "grad_norm": 1098.6407470703125, "learning_rate": 4.257462970479098e-05, "loss": 37.8312, "step": 15356 }, { "epoch": 40.55992076592935, "grad_norm": 2354.87451171875, "learning_rate": 4.2551312652500426e-05, "loss": 37.9888, "step": 15357 }, { "epoch": 40.5625619016177, "grad_norm": 1303.5994873046875, "learning_rate": 4.2528001393092656e-05, "loss": 37.5404, "step": 15358 }, { "epoch": 40.56520303730604, "grad_norm": 682.267333984375, "learning_rate": 4.2504695927218507e-05, "loss": 35.9929, "step": 15359 }, { "epoch": 40.567844172994384, "grad_norm": 1192.215576171875, "learning_rate": 4.248139625552877e-05, "loss": 35.3216, "step": 15360 }, { "epoch": 40.570485308682734, "grad_norm": 873.47802734375, "learning_rate": 4.24581023786742e-05, "loss": 35.2495, "step": 15361 }, { "epoch": 40.57312644437108, "grad_norm": 932.7284545898438, "learning_rate": 4.2434814297305224e-05, "loss": 35.1525, "step": 15362 }, { "epoch": 40.57576758005943, "grad_norm": 1039.5382080078125, "learning_rate": 4.241153201207204e-05, "loss": 34.946, "step": 15363 }, { "epoch": 40.57840871574777, "grad_norm": 1838.3912353515625, "learning_rate": 4.2388255523624966e-05, "loss": 34.7153, "step": 15364 }, { "epoch": 40.58104985143612, "grad_norm": 866.6227416992188, "learning_rate": 4.2364984832613886e-05, "loss": 35.8185, "step": 15365 }, { "epoch": 40.58369098712446, "grad_norm": 1981.0863037109375, "learning_rate": 4.2341719939688635e-05, "loss": 35.6627, "step": 15366 }, { "epoch": 40.58633212281281, "grad_norm": 1671.2418212890625, "learning_rate": 4.231846084549887e-05, "loss": 34.3208, "step": 15367 }, { "epoch": 40.588973258501156, "grad_norm": 3171.03271484375, "learning_rate": 4.229520755069407e-05, "loss": 35.6317, "step": 15368 }, { "epoch": 40.5916143941895, "grad_norm": 6576.2333984375, "learning_rate": 4.2271960055923624e-05, "loss": 12.0867, "step": 15369 }, { "epoch": 40.59425552987785, "grad_norm": 1245.93701171875, "learning_rate": 4.224871836183658e-05, "loss": 13.7848, "step": 15370 }, { "epoch": 40.59689666556619, "grad_norm": 1741.9930419921875, "learning_rate": 4.222548246908209e-05, "loss": 16.9053, "step": 15371 }, { "epoch": 40.59953780125454, "grad_norm": 4932.87548828125, "learning_rate": 4.2202252378308966e-05, "loss": 16.102, "step": 15372 }, { "epoch": 40.602178936942884, "grad_norm": 4397.49853515625, "learning_rate": 4.2179028090165885e-05, "loss": 10.3781, "step": 15373 }, { "epoch": 40.604820072631234, "grad_norm": 3137.043212890625, "learning_rate": 4.2155809605301296e-05, "loss": 10.0617, "step": 15374 }, { "epoch": 40.60746120831958, "grad_norm": 2147.206787109375, "learning_rate": 4.213259692436367e-05, "loss": 9.5275, "step": 15375 }, { "epoch": 40.61010234400792, "grad_norm": 4898.564453125, "learning_rate": 4.210939004800124e-05, "loss": 13.5532, "step": 15376 }, { "epoch": 40.61274347969627, "grad_norm": 7755.59228515625, "learning_rate": 4.2086188976861866e-05, "loss": 10.0191, "step": 15377 }, { "epoch": 40.61538461538461, "grad_norm": 971.00732421875, "learning_rate": 4.2062993711593586e-05, "loss": 33.8712, "step": 15378 }, { "epoch": 40.61802575107296, "grad_norm": 1576.2122802734375, "learning_rate": 4.203980425284404e-05, "loss": 34.4378, "step": 15379 }, { "epoch": 40.620666886761306, "grad_norm": 1034.3916015625, "learning_rate": 4.2016620601260796e-05, "loss": 34.4333, "step": 15380 }, { "epoch": 40.623308022449656, "grad_norm": 2186.348388671875, "learning_rate": 4.19934427574912e-05, "loss": 34.7595, "step": 15381 }, { "epoch": 40.625949158138, "grad_norm": 1732.5550537109375, "learning_rate": 4.197027072218259e-05, "loss": 34.3842, "step": 15382 }, { "epoch": 40.62859029382634, "grad_norm": 1099.713623046875, "learning_rate": 4.1947104495981966e-05, "loss": 34.9008, "step": 15383 }, { "epoch": 40.63123142951469, "grad_norm": 608.8836059570312, "learning_rate": 4.1923944079536205e-05, "loss": 36.6302, "step": 15384 }, { "epoch": 40.633872565203035, "grad_norm": 1036.0673828125, "learning_rate": 4.19007894734921e-05, "loss": 34.9744, "step": 15385 }, { "epoch": 40.636513700891385, "grad_norm": 1624.933349609375, "learning_rate": 4.1877640678496196e-05, "loss": 33.4226, "step": 15386 }, { "epoch": 40.63915483657973, "grad_norm": 2003.783935546875, "learning_rate": 4.185449769519492e-05, "loss": 34.8501, "step": 15387 }, { "epoch": 40.64179597226808, "grad_norm": 1469.6734619140625, "learning_rate": 4.1831360524234476e-05, "loss": 35.1689, "step": 15388 }, { "epoch": 40.64443710795642, "grad_norm": 2103.290283203125, "learning_rate": 4.1808229166261065e-05, "loss": 34.1931, "step": 15389 }, { "epoch": 40.64707824364477, "grad_norm": 4758.083984375, "learning_rate": 4.1785103621920534e-05, "loss": 35.1421, "step": 15390 }, { "epoch": 40.64971937933311, "grad_norm": 6525.36474609375, "learning_rate": 4.176198389185862e-05, "loss": 34.854, "step": 15391 }, { "epoch": 40.652360515021456, "grad_norm": 1815.071533203125, "learning_rate": 4.173886997672105e-05, "loss": 35.4891, "step": 15392 }, { "epoch": 40.655001650709806, "grad_norm": 7552.8740234375, "learning_rate": 4.171576187715326e-05, "loss": 36.4902, "step": 15393 }, { "epoch": 40.65764278639815, "grad_norm": 4344.001953125, "learning_rate": 4.169265959380039e-05, "loss": 37.4069, "step": 15394 }, { "epoch": 40.6602839220865, "grad_norm": 3821.215576171875, "learning_rate": 4.1669563127307606e-05, "loss": 40.1355, "step": 15395 }, { "epoch": 40.66292505777484, "grad_norm": 1230.5814208984375, "learning_rate": 4.1646472478319904e-05, "loss": 38.376, "step": 15396 }, { "epoch": 40.66556619346319, "grad_norm": 2049.59423828125, "learning_rate": 4.1623387647482105e-05, "loss": 39.7797, "step": 15397 }, { "epoch": 40.668207329151535, "grad_norm": 2047.02978515625, "learning_rate": 4.160030863543871e-05, "loss": 37.548, "step": 15398 }, { "epoch": 40.67084846483988, "grad_norm": 1000.5325927734375, "learning_rate": 4.157723544283434e-05, "loss": 38.6865, "step": 15399 }, { "epoch": 40.67348960052823, "grad_norm": 1067.2276611328125, "learning_rate": 4.155416807031326e-05, "loss": 42.9024, "step": 15400 }, { "epoch": 40.67348960052823, "eval_loss": 3.726862668991089, "eval_runtime": 2.1691, "eval_samples_per_second": 228.203, "eval_steps_per_second": 28.583, "step": 15400 }, { "epoch": 40.67613073621657, "grad_norm": 778.5697021484375, "learning_rate": 4.1531106518519576e-05, "loss": 41.3673, "step": 15401 }, { "epoch": 40.67877187190492, "grad_norm": 1186.3138427734375, "learning_rate": 4.1508050788097275e-05, "loss": 42.513, "step": 15402 }, { "epoch": 40.681413007593264, "grad_norm": 865.9370727539062, "learning_rate": 4.1485000879690174e-05, "loss": 42.5682, "step": 15403 }, { "epoch": 40.684054143281614, "grad_norm": 1137.2359619140625, "learning_rate": 4.146195679394196e-05, "loss": 39.6828, "step": 15404 }, { "epoch": 40.68669527896996, "grad_norm": 939.2041625976562, "learning_rate": 4.1438918531496034e-05, "loss": 37.3306, "step": 15405 }, { "epoch": 40.6893364146583, "grad_norm": 1181.9495849609375, "learning_rate": 4.141588609299585e-05, "loss": 38.3894, "step": 15406 }, { "epoch": 40.69197755034665, "grad_norm": 2023.1373291015625, "learning_rate": 4.13928594790845e-05, "loss": 37.762, "step": 15407 }, { "epoch": 40.69461868603499, "grad_norm": 2278.0302734375, "learning_rate": 4.136983869040503e-05, "loss": 39.4442, "step": 15408 }, { "epoch": 40.69725982172334, "grad_norm": 898.3765869140625, "learning_rate": 4.134682372760018e-05, "loss": 35.5768, "step": 15409 }, { "epoch": 40.699900957411685, "grad_norm": 1880.171142578125, "learning_rate": 4.132381459131279e-05, "loss": 35.3959, "step": 15410 }, { "epoch": 40.702542093100035, "grad_norm": 838.5953369140625, "learning_rate": 4.1300811282185326e-05, "loss": 35.5367, "step": 15411 }, { "epoch": 40.70518322878838, "grad_norm": 763.1802978515625, "learning_rate": 4.127781380086002e-05, "loss": 34.9223, "step": 15412 }, { "epoch": 40.70782436447673, "grad_norm": 10445.4619140625, "learning_rate": 4.125482214797918e-05, "loss": 34.2934, "step": 15413 }, { "epoch": 40.71046550016507, "grad_norm": 2050.570556640625, "learning_rate": 4.123183632418484e-05, "loss": 34.4986, "step": 15414 }, { "epoch": 40.713106635853414, "grad_norm": 1320.9268798828125, "learning_rate": 4.120885633011881e-05, "loss": 35.4218, "step": 15415 }, { "epoch": 40.715747771541764, "grad_norm": 3106.584228515625, "learning_rate": 4.118588216642277e-05, "loss": 36.3586, "step": 15416 }, { "epoch": 40.71838890723011, "grad_norm": 11621.3779296875, "learning_rate": 4.116291383373838e-05, "loss": 36.6821, "step": 15417 }, { "epoch": 40.72103004291846, "grad_norm": 10681.2060546875, "learning_rate": 4.113995133270693e-05, "loss": 11.8411, "step": 15418 }, { "epoch": 40.7236711786068, "grad_norm": 4068.52392578125, "learning_rate": 4.111699466396962e-05, "loss": 17.9837, "step": 15419 }, { "epoch": 40.72631231429515, "grad_norm": 596.333984375, "learning_rate": 4.109404382816756e-05, "loss": 12.9771, "step": 15420 }, { "epoch": 40.72895344998349, "grad_norm": 7053.70849609375, "learning_rate": 4.107109882594162e-05, "loss": 12.928, "step": 15421 }, { "epoch": 40.731594585671836, "grad_norm": 2466.833251953125, "learning_rate": 4.104815965793249e-05, "loss": 13.9519, "step": 15422 }, { "epoch": 40.734235721360186, "grad_norm": 3533.169189453125, "learning_rate": 4.10252263247807e-05, "loss": 10.5582, "step": 15423 }, { "epoch": 40.73687685704853, "grad_norm": 1810.39990234375, "learning_rate": 4.100229882712678e-05, "loss": 13.7639, "step": 15424 }, { "epoch": 40.73951799273688, "grad_norm": 2439.568359375, "learning_rate": 4.097937716561087e-05, "loss": 16.1959, "step": 15425 }, { "epoch": 40.74215912842522, "grad_norm": 8430.13671875, "learning_rate": 4.095646134087303e-05, "loss": 11.4306, "step": 15426 }, { "epoch": 40.74480026411357, "grad_norm": 1094.00634765625, "learning_rate": 4.0933551353553256e-05, "loss": 27.4831, "step": 15427 }, { "epoch": 40.747441399801914, "grad_norm": 1631.063720703125, "learning_rate": 4.09106472042913e-05, "loss": 34.6449, "step": 15428 }, { "epoch": 40.75008253549026, "grad_norm": 1613.47998046875, "learning_rate": 4.088774889372665e-05, "loss": 34.8067, "step": 15429 }, { "epoch": 40.75272367117861, "grad_norm": 1600.79248046875, "learning_rate": 4.086485642249871e-05, "loss": 34.9703, "step": 15430 }, { "epoch": 40.75536480686695, "grad_norm": 973.8427734375, "learning_rate": 4.084196979124685e-05, "loss": 33.667, "step": 15431 }, { "epoch": 40.7580059425553, "grad_norm": 3684.54931640625, "learning_rate": 4.0819089000610155e-05, "loss": 33.7877, "step": 15432 }, { "epoch": 40.76064707824364, "grad_norm": 2586.966552734375, "learning_rate": 4.0796214051227407e-05, "loss": 34.8804, "step": 15433 }, { "epoch": 40.76328821393199, "grad_norm": 1361.1904296875, "learning_rate": 4.077334494373758e-05, "loss": 34.1852, "step": 15434 }, { "epoch": 40.765929349620336, "grad_norm": 1277.5673828125, "learning_rate": 4.075048167877918e-05, "loss": 33.6157, "step": 15435 }, { "epoch": 40.768570485308686, "grad_norm": 1975.0416259765625, "learning_rate": 4.072762425699067e-05, "loss": 35.2659, "step": 15436 }, { "epoch": 40.77121162099703, "grad_norm": 725.5614013671875, "learning_rate": 4.070477267901032e-05, "loss": 33.5956, "step": 15437 }, { "epoch": 40.77385275668537, "grad_norm": 2802.76171875, "learning_rate": 4.068192694547623e-05, "loss": 36.5464, "step": 15438 }, { "epoch": 40.77649389237372, "grad_norm": 1401.7313232421875, "learning_rate": 4.0659087057026394e-05, "loss": 34.0851, "step": 15439 }, { "epoch": 40.779135028062065, "grad_norm": 4170.49609375, "learning_rate": 4.063625301429849e-05, "loss": 34.2773, "step": 15440 }, { "epoch": 40.781776163750415, "grad_norm": 2399.111328125, "learning_rate": 4.061342481793032e-05, "loss": 33.5626, "step": 15441 }, { "epoch": 40.78441729943876, "grad_norm": 1346.5023193359375, "learning_rate": 4.059060246855925e-05, "loss": 35.3406, "step": 15442 }, { "epoch": 40.78705843512711, "grad_norm": 1188.8779296875, "learning_rate": 4.0567785966822536e-05, "loss": 36.3299, "step": 15443 }, { "epoch": 40.78969957081545, "grad_norm": 1455.415771484375, "learning_rate": 4.054497531335741e-05, "loss": 39.1415, "step": 15444 }, { "epoch": 40.79234070650379, "grad_norm": 937.1697998046875, "learning_rate": 4.0522170508800834e-05, "loss": 40.8491, "step": 15445 }, { "epoch": 40.79498184219214, "grad_norm": 708.0310668945312, "learning_rate": 4.0499371553789634e-05, "loss": 39.4265, "step": 15446 }, { "epoch": 40.797622977880486, "grad_norm": 643.1510620117188, "learning_rate": 4.04765784489603e-05, "loss": 38.2341, "step": 15447 }, { "epoch": 40.800264113568836, "grad_norm": 1381.8330078125, "learning_rate": 4.0453791194949516e-05, "loss": 39.6048, "step": 15448 }, { "epoch": 40.80290524925718, "grad_norm": 799.1046752929688, "learning_rate": 4.0431009792393506e-05, "loss": 40.3551, "step": 15449 }, { "epoch": 40.80554638494553, "grad_norm": 910.5795288085938, "learning_rate": 4.0408234241928355e-05, "loss": 39.5649, "step": 15450 }, { "epoch": 40.80818752063387, "grad_norm": 1552.3414306640625, "learning_rate": 4.038546454419023e-05, "loss": 44.9549, "step": 15451 }, { "epoch": 40.810828656322215, "grad_norm": 1521.6605224609375, "learning_rate": 4.0362700699814846e-05, "loss": 41.8709, "step": 15452 }, { "epoch": 40.813469792010565, "grad_norm": 1391.0318603515625, "learning_rate": 4.0339942709437945e-05, "loss": 40.3035, "step": 15453 }, { "epoch": 40.81611092769891, "grad_norm": 1464.0115966796875, "learning_rate": 4.031719057369493e-05, "loss": 40.0907, "step": 15454 }, { "epoch": 40.81875206338726, "grad_norm": 736.9207153320312, "learning_rate": 4.0294444293221236e-05, "loss": 38.3078, "step": 15455 }, { "epoch": 40.8213931990756, "grad_norm": 3395.281494140625, "learning_rate": 4.027170386865198e-05, "loss": 39.3268, "step": 15456 }, { "epoch": 40.82403433476395, "grad_norm": 699.2495727539062, "learning_rate": 4.024896930062213e-05, "loss": 38.0028, "step": 15457 }, { "epoch": 40.826675470452294, "grad_norm": 1479.075927734375, "learning_rate": 4.0226240589766646e-05, "loss": 38.2826, "step": 15458 }, { "epoch": 40.829316606140644, "grad_norm": 1124.9859619140625, "learning_rate": 4.020351773672018e-05, "loss": 36.5599, "step": 15459 }, { "epoch": 40.83195774182899, "grad_norm": 798.1476440429688, "learning_rate": 4.0180800742117244e-05, "loss": 36.4182, "step": 15460 }, { "epoch": 40.83459887751733, "grad_norm": 1606.0355224609375, "learning_rate": 4.0158089606592116e-05, "loss": 35.9044, "step": 15461 }, { "epoch": 40.83724001320568, "grad_norm": 1141.9129638671875, "learning_rate": 4.013538433077915e-05, "loss": 34.8348, "step": 15462 }, { "epoch": 40.83988114889402, "grad_norm": 1222.42626953125, "learning_rate": 4.011268491531234e-05, "loss": 35.052, "step": 15463 }, { "epoch": 40.84252228458237, "grad_norm": 1310.83984375, "learning_rate": 4.008999136082539e-05, "loss": 34.3393, "step": 15464 }, { "epoch": 40.845163420270715, "grad_norm": 627.574462890625, "learning_rate": 4.0067303667952195e-05, "loss": 35.417, "step": 15465 }, { "epoch": 40.847804555959065, "grad_norm": 774.8554077148438, "learning_rate": 4.004462183732621e-05, "loss": 34.9419, "step": 15466 }, { "epoch": 40.85044569164741, "grad_norm": 2186.68310546875, "learning_rate": 4.0021945869580856e-05, "loss": 23.3619, "step": 15467 }, { "epoch": 40.85308682733575, "grad_norm": 3909.03271484375, "learning_rate": 3.999927576534923e-05, "loss": 14.4509, "step": 15468 }, { "epoch": 40.8557279630241, "grad_norm": 754.065673828125, "learning_rate": 3.997661152526452e-05, "loss": 10.9463, "step": 15469 }, { "epoch": 40.858369098712444, "grad_norm": 4853.47119140625, "learning_rate": 3.995395314995956e-05, "loss": 9.0166, "step": 15470 }, { "epoch": 40.861010234400794, "grad_norm": 9141.087890625, "learning_rate": 3.993130064006709e-05, "loss": 14.2532, "step": 15471 }, { "epoch": 40.86365137008914, "grad_norm": 782.437255859375, "learning_rate": 3.990865399621965e-05, "loss": 12.3992, "step": 15472 }, { "epoch": 40.86629250577749, "grad_norm": 24220.41015625, "learning_rate": 3.988601321904961e-05, "loss": 10.0794, "step": 15473 }, { "epoch": 40.86893364146583, "grad_norm": 2417.1259765625, "learning_rate": 3.986337830918923e-05, "loss": 13.5971, "step": 15474 }, { "epoch": 40.87157477715417, "grad_norm": 1929.0361328125, "learning_rate": 3.984074926727052e-05, "loss": 12.2571, "step": 15475 }, { "epoch": 40.87421591284252, "grad_norm": 15207.9208984375, "learning_rate": 3.9818126093925486e-05, "loss": 20.4733, "step": 15476 }, { "epoch": 40.876857048530866, "grad_norm": 819.0918579101562, "learning_rate": 3.97955087897858e-05, "loss": 34.9432, "step": 15477 }, { "epoch": 40.879498184219216, "grad_norm": 1158.1776123046875, "learning_rate": 3.977289735548301e-05, "loss": 33.9597, "step": 15478 }, { "epoch": 40.88213931990756, "grad_norm": 1454.3380126953125, "learning_rate": 3.975029179164863e-05, "loss": 35.0877, "step": 15479 }, { "epoch": 40.88478045559591, "grad_norm": 2820.49560546875, "learning_rate": 3.972769209891383e-05, "loss": 36.6057, "step": 15480 }, { "epoch": 40.88742159128425, "grad_norm": 787.4188232421875, "learning_rate": 3.970509827790978e-05, "loss": 35.805, "step": 15481 }, { "epoch": 40.8900627269726, "grad_norm": 1265.5548095703125, "learning_rate": 3.9682510329267205e-05, "loss": 33.8414, "step": 15482 }, { "epoch": 40.892703862660944, "grad_norm": 837.8946533203125, "learning_rate": 3.9659928253617026e-05, "loss": 35.1361, "step": 15483 }, { "epoch": 40.89534499834929, "grad_norm": 1350.5147705078125, "learning_rate": 3.9637352051589805e-05, "loss": 33.8561, "step": 15484 }, { "epoch": 40.89798613403764, "grad_norm": 773.5988159179688, "learning_rate": 3.9614781723815885e-05, "loss": 34.0104, "step": 15485 }, { "epoch": 40.90062726972598, "grad_norm": 1040.3070068359375, "learning_rate": 3.959221727092566e-05, "loss": 33.1356, "step": 15486 }, { "epoch": 40.90326840541433, "grad_norm": 1048.3651123046875, "learning_rate": 3.956965869354917e-05, "loss": 34.6397, "step": 15487 }, { "epoch": 40.90590954110267, "grad_norm": 2389.39208984375, "learning_rate": 3.954710599231634e-05, "loss": 34.8614, "step": 15488 }, { "epoch": 40.90855067679102, "grad_norm": 1183.7421875, "learning_rate": 3.9524559167856965e-05, "loss": 35.1178, "step": 15489 }, { "epoch": 40.911191812479366, "grad_norm": 1984.763671875, "learning_rate": 3.950201822080063e-05, "loss": 34.0561, "step": 15490 }, { "epoch": 40.91383294816771, "grad_norm": 1703.767578125, "learning_rate": 3.947948315177677e-05, "loss": 34.2446, "step": 15491 }, { "epoch": 40.91647408385606, "grad_norm": 2387.3271484375, "learning_rate": 3.945695396141463e-05, "loss": 33.536, "step": 15492 }, { "epoch": 40.9191152195444, "grad_norm": 832.1292724609375, "learning_rate": 3.943443065034344e-05, "loss": 34.6132, "step": 15493 }, { "epoch": 40.92175635523275, "grad_norm": 2631.54638671875, "learning_rate": 3.941191321919207e-05, "loss": 37.6267, "step": 15494 }, { "epoch": 40.924397490921095, "grad_norm": 3563.111572265625, "learning_rate": 3.938940166858931e-05, "loss": 39.5721, "step": 15495 }, { "epoch": 40.927038626609445, "grad_norm": 959.8031616210938, "learning_rate": 3.9366895999163734e-05, "loss": 37.2448, "step": 15496 }, { "epoch": 40.92967976229779, "grad_norm": 926.3888549804688, "learning_rate": 3.934439621154393e-05, "loss": 38.1944, "step": 15497 }, { "epoch": 40.93232089798613, "grad_norm": 976.1473999023438, "learning_rate": 3.9321902306358155e-05, "loss": 40.8782, "step": 15498 }, { "epoch": 40.93496203367448, "grad_norm": 807.3571166992188, "learning_rate": 3.9299414284234364e-05, "loss": 41.43, "step": 15499 }, { "epoch": 40.93760316936282, "grad_norm": 1056.16748046875, "learning_rate": 3.927693214580075e-05, "loss": 40.4984, "step": 15500 }, { "epoch": 40.94024430505117, "grad_norm": 1169.4342041015625, "learning_rate": 3.9254455891685e-05, "loss": 39.8696, "step": 15501 }, { "epoch": 40.942885440739516, "grad_norm": 844.1281127929688, "learning_rate": 3.9231985522514776e-05, "loss": 36.4219, "step": 15502 }, { "epoch": 40.945526576427866, "grad_norm": 2291.895263671875, "learning_rate": 3.920952103891751e-05, "loss": 36.6573, "step": 15503 }, { "epoch": 40.94816771211621, "grad_norm": 957.5836181640625, "learning_rate": 3.9187062441520564e-05, "loss": 35.2555, "step": 15504 }, { "epoch": 40.95080884780456, "grad_norm": 990.4598388671875, "learning_rate": 3.916460973095109e-05, "loss": 35.2931, "step": 15505 }, { "epoch": 40.9534499834929, "grad_norm": 810.459716796875, "learning_rate": 3.914216290783604e-05, "loss": 36.0527, "step": 15506 }, { "epoch": 40.956091119181245, "grad_norm": 1786.8311767578125, "learning_rate": 3.91197219728022e-05, "loss": 33.8432, "step": 15507 }, { "epoch": 40.958732254869595, "grad_norm": 1425.1595458984375, "learning_rate": 3.9097286926476267e-05, "loss": 35.2194, "step": 15508 }, { "epoch": 40.96137339055794, "grad_norm": 4046.083740234375, "learning_rate": 3.907485776948469e-05, "loss": 24.928, "step": 15509 }, { "epoch": 40.96401452624629, "grad_norm": 3326.385009765625, "learning_rate": 3.905243450245374e-05, "loss": 9.1103, "step": 15510 }, { "epoch": 40.96665566193463, "grad_norm": 1406.424560546875, "learning_rate": 3.9030017126009734e-05, "loss": 7.8569, "step": 15511 }, { "epoch": 40.96929679762298, "grad_norm": 545.4920043945312, "learning_rate": 3.9007605640778544e-05, "loss": 10.1572, "step": 15512 }, { "epoch": 40.971937933311324, "grad_norm": 1224.3519287109375, "learning_rate": 3.898520004738596e-05, "loss": 7.8329, "step": 15513 }, { "epoch": 40.97457906899967, "grad_norm": 6525.7197265625, "learning_rate": 3.896280034645777e-05, "loss": 24.0896, "step": 15514 }, { "epoch": 40.97722020468802, "grad_norm": 2368.675048828125, "learning_rate": 3.894040653861941e-05, "loss": 35.6166, "step": 15515 }, { "epoch": 40.97986134037636, "grad_norm": 1712.530517578125, "learning_rate": 3.891801862449629e-05, "loss": 35.8244, "step": 15516 }, { "epoch": 40.98250247606471, "grad_norm": 791.325927734375, "learning_rate": 3.8895636604713375e-05, "loss": 34.1598, "step": 15517 }, { "epoch": 40.98514361175305, "grad_norm": 2624.652587890625, "learning_rate": 3.8873260479895864e-05, "loss": 33.9113, "step": 15518 }, { "epoch": 40.9877847474414, "grad_norm": 1517.5670166015625, "learning_rate": 3.8850890250668515e-05, "loss": 33.9795, "step": 15519 }, { "epoch": 40.990425883129745, "grad_norm": 1246.4229736328125, "learning_rate": 3.8828525917655984e-05, "loss": 34.1731, "step": 15520 }, { "epoch": 40.99306701881809, "grad_norm": 2564.607666015625, "learning_rate": 3.880616748148286e-05, "loss": 33.8732, "step": 15521 }, { "epoch": 40.99570815450644, "grad_norm": 3476.918212890625, "learning_rate": 3.8783814942773445e-05, "loss": 33.4244, "step": 15522 }, { "epoch": 40.99834929019478, "grad_norm": 5988.26806640625, "learning_rate": 3.876146830215194e-05, "loss": 34.8524, "step": 15523 }, { "epoch": 41.00099042588313, "grad_norm": 920.0819091796875, "learning_rate": 3.873912756024231e-05, "loss": 40.3064, "step": 15524 }, { "epoch": 41.003631561571474, "grad_norm": 743.5697631835938, "learning_rate": 3.871679271766848e-05, "loss": 38.2014, "step": 15525 }, { "epoch": 41.006272697259824, "grad_norm": 1058.200439453125, "learning_rate": 3.8694463775054065e-05, "loss": 37.9049, "step": 15526 }, { "epoch": 41.00891383294817, "grad_norm": 695.2650756835938, "learning_rate": 3.8672140733022585e-05, "loss": 38.4996, "step": 15527 }, { "epoch": 41.01155496863652, "grad_norm": 978.7110595703125, "learning_rate": 3.864982359219749e-05, "loss": 39.6639, "step": 15528 }, { "epoch": 41.01419610432486, "grad_norm": 949.3389282226562, "learning_rate": 3.862751235320191e-05, "loss": 41.3299, "step": 15529 }, { "epoch": 41.0168372400132, "grad_norm": 924.9093627929688, "learning_rate": 3.86052070166589e-05, "loss": 41.4602, "step": 15530 }, { "epoch": 41.01947837570155, "grad_norm": 720.4977416992188, "learning_rate": 3.8582907583191216e-05, "loss": 41.7164, "step": 15531 }, { "epoch": 41.022119511389896, "grad_norm": 1432.0257568359375, "learning_rate": 3.8560614053421734e-05, "loss": 38.9084, "step": 15532 }, { "epoch": 41.024760647078246, "grad_norm": 2106.36376953125, "learning_rate": 3.8538326427972945e-05, "loss": 40.6146, "step": 15533 }, { "epoch": 41.02740178276659, "grad_norm": 1183.4752197265625, "learning_rate": 3.851604470746703e-05, "loss": 40.069, "step": 15534 }, { "epoch": 41.03004291845494, "grad_norm": 1324.7755126953125, "learning_rate": 3.849376889252643e-05, "loss": 38.0479, "step": 15535 }, { "epoch": 41.03268405414328, "grad_norm": 794.7160034179688, "learning_rate": 3.8471498983773054e-05, "loss": 40.3112, "step": 15536 }, { "epoch": 41.035325189831624, "grad_norm": 1866.0325927734375, "learning_rate": 3.844923498182884e-05, "loss": 36.7238, "step": 15537 }, { "epoch": 41.037966325519974, "grad_norm": 893.0897216796875, "learning_rate": 3.842697688731539e-05, "loss": 35.5197, "step": 15538 }, { "epoch": 41.04060746120832, "grad_norm": 1152.822509765625, "learning_rate": 3.840472470085441e-05, "loss": 34.2291, "step": 15539 }, { "epoch": 41.04324859689667, "grad_norm": 822.642333984375, "learning_rate": 3.838247842306716e-05, "loss": 34.5777, "step": 15540 }, { "epoch": 41.04588973258501, "grad_norm": 1272.21923828125, "learning_rate": 3.836023805457492e-05, "loss": 35.8345, "step": 15541 }, { "epoch": 41.04853086827336, "grad_norm": 2195.5087890625, "learning_rate": 3.833800359599868e-05, "loss": 33.9253, "step": 15542 }, { "epoch": 41.0511720039617, "grad_norm": 914.1217041015625, "learning_rate": 3.8315775047959387e-05, "loss": 34.3807, "step": 15543 }, { "epoch": 41.053813139650046, "grad_norm": 1357.809326171875, "learning_rate": 3.8293552411077684e-05, "loss": 35.3977, "step": 15544 }, { "epoch": 41.056454275338396, "grad_norm": 2318.43701171875, "learning_rate": 3.827133568597413e-05, "loss": 36.071, "step": 15545 }, { "epoch": 41.05909541102674, "grad_norm": 8455.4091796875, "learning_rate": 3.82491248732692e-05, "loss": 28.6031, "step": 15546 }, { "epoch": 41.06173654671509, "grad_norm": 7401.3076171875, "learning_rate": 3.82269199735831e-05, "loss": 9.7252, "step": 15547 }, { "epoch": 41.06437768240343, "grad_norm": 2540.754638671875, "learning_rate": 3.8204720987535776e-05, "loss": 11.8851, "step": 15548 }, { "epoch": 41.06701881809178, "grad_norm": 3174.79833984375, "learning_rate": 3.818252791574725e-05, "loss": 13.7093, "step": 15549 }, { "epoch": 41.069659953780125, "grad_norm": 1670.5001220703125, "learning_rate": 3.816034075883723e-05, "loss": 10.5519, "step": 15550 }, { "epoch": 41.072301089468475, "grad_norm": 11720.9013671875, "learning_rate": 3.8138159517425303e-05, "loss": 14.0536, "step": 15551 }, { "epoch": 41.07494222515682, "grad_norm": 10986.8798828125, "learning_rate": 3.811598419213069e-05, "loss": 16.8921, "step": 15552 }, { "epoch": 41.07758336084516, "grad_norm": 1178.41162109375, "learning_rate": 3.8093814783572794e-05, "loss": 10.2084, "step": 15553 }, { "epoch": 41.08022449653351, "grad_norm": 27248.6875, "learning_rate": 3.807165129237067e-05, "loss": 10.2959, "step": 15554 }, { "epoch": 41.08286563222185, "grad_norm": 861.486572265625, "learning_rate": 3.8049493719143125e-05, "loss": 9.305, "step": 15555 }, { "epoch": 41.0855067679102, "grad_norm": 1995.5455322265625, "learning_rate": 3.8027342064509e-05, "loss": 34.7858, "step": 15556 }, { "epoch": 41.088147903598546, "grad_norm": 865.5182495117188, "learning_rate": 3.800519632908686e-05, "loss": 35.5089, "step": 15557 }, { "epoch": 41.090789039286896, "grad_norm": 1799.8985595703125, "learning_rate": 3.798305651349507e-05, "loss": 33.6479, "step": 15558 }, { "epoch": 41.09343017497524, "grad_norm": 1035.0216064453125, "learning_rate": 3.796092261835188e-05, "loss": 34.0449, "step": 15559 }, { "epoch": 41.09607131066358, "grad_norm": 1456.54541015625, "learning_rate": 3.793879464427536e-05, "loss": 32.7358, "step": 15560 }, { "epoch": 41.09871244635193, "grad_norm": 870.8121948242188, "learning_rate": 3.7916672591883414e-05, "loss": 33.7536, "step": 15561 }, { "epoch": 41.101353582040275, "grad_norm": 845.991455078125, "learning_rate": 3.7894556461793765e-05, "loss": 35.9316, "step": 15562 }, { "epoch": 41.103994717728625, "grad_norm": 1394.182373046875, "learning_rate": 3.787244625462411e-05, "loss": 35.0031, "step": 15563 }, { "epoch": 41.10663585341697, "grad_norm": 1663.6156005859375, "learning_rate": 3.7850341970991745e-05, "loss": 34.5853, "step": 15564 }, { "epoch": 41.10927698910532, "grad_norm": 815.2273559570312, "learning_rate": 3.782824361151396e-05, "loss": 34.1646, "step": 15565 }, { "epoch": 41.11191812479366, "grad_norm": 1074.6983642578125, "learning_rate": 3.780615117680778e-05, "loss": 34.9766, "step": 15566 }, { "epoch": 41.114559260482004, "grad_norm": 1250.35791015625, "learning_rate": 3.778406466749024e-05, "loss": 33.2557, "step": 15567 }, { "epoch": 41.117200396170354, "grad_norm": 12670.3115234375, "learning_rate": 3.7761984084178105e-05, "loss": 34.3506, "step": 15568 }, { "epoch": 41.1198415318587, "grad_norm": 1561.8504638671875, "learning_rate": 3.7739909427487756e-05, "loss": 33.5809, "step": 15569 }, { "epoch": 41.12248266754705, "grad_norm": 626.8410034179688, "learning_rate": 3.771784069803583e-05, "loss": 33.7589, "step": 15570 }, { "epoch": 41.12512380323539, "grad_norm": 2906.554931640625, "learning_rate": 3.7695777896438464e-05, "loss": 33.7435, "step": 15571 }, { "epoch": 41.12776493892374, "grad_norm": 907.7128295898438, "learning_rate": 3.767372102331174e-05, "loss": 35.6613, "step": 15572 }, { "epoch": 41.13040607461208, "grad_norm": 1299.946044921875, "learning_rate": 3.76516700792717e-05, "loss": 38.2189, "step": 15573 }, { "epoch": 41.13304721030043, "grad_norm": 1234.385986328125, "learning_rate": 3.762962506493403e-05, "loss": 40.1127, "step": 15574 }, { "epoch": 41.135688345988775, "grad_norm": 665.300048828125, "learning_rate": 3.760758598091432e-05, "loss": 39.3034, "step": 15575 }, { "epoch": 41.13832948167712, "grad_norm": 1979.9305419921875, "learning_rate": 3.7585552827827997e-05, "loss": 38.3437, "step": 15576 }, { "epoch": 41.14097061736547, "grad_norm": 1195.9451904296875, "learning_rate": 3.756352560629036e-05, "loss": 37.8983, "step": 15577 }, { "epoch": 41.14361175305381, "grad_norm": 2356.623291015625, "learning_rate": 3.7541504316916466e-05, "loss": 40.3438, "step": 15578 }, { "epoch": 41.14625288874216, "grad_norm": 1274.477294921875, "learning_rate": 3.7519488960321204e-05, "loss": 41.4337, "step": 15579 }, { "epoch": 41.148894024430504, "grad_norm": 989.7097778320312, "learning_rate": 3.7497479537119435e-05, "loss": 40.7412, "step": 15580 }, { "epoch": 41.151535160118854, "grad_norm": 2669.506591796875, "learning_rate": 3.7475476047925725e-05, "loss": 39.1894, "step": 15581 }, { "epoch": 41.1541762958072, "grad_norm": 1104.965576171875, "learning_rate": 3.745347849335451e-05, "loss": 39.0099, "step": 15582 }, { "epoch": 41.15681743149554, "grad_norm": 1470.86865234375, "learning_rate": 3.7431486874019983e-05, "loss": 38.7348, "step": 15583 }, { "epoch": 41.15945856718389, "grad_norm": 802.5421142578125, "learning_rate": 3.740950119053641e-05, "loss": 40.209, "step": 15584 }, { "epoch": 41.16209970287223, "grad_norm": 1246.472412109375, "learning_rate": 3.738752144351759e-05, "loss": 37.0432, "step": 15585 }, { "epoch": 41.16474083856058, "grad_norm": 1527.3525390625, "learning_rate": 3.736554763357736e-05, "loss": 37.2628, "step": 15586 }, { "epoch": 41.167381974248926, "grad_norm": 1767.36572265625, "learning_rate": 3.7343579761329295e-05, "loss": 34.7256, "step": 15587 }, { "epoch": 41.170023109937276, "grad_norm": 2395.76171875, "learning_rate": 3.732161782738686e-05, "loss": 36.8457, "step": 15588 }, { "epoch": 41.17266424562562, "grad_norm": 1223.4739990234375, "learning_rate": 3.7299661832363276e-05, "loss": 36.581, "step": 15589 }, { "epoch": 41.17530538131396, "grad_norm": 1310.0477294921875, "learning_rate": 3.727771177687167e-05, "loss": 36.0313, "step": 15590 }, { "epoch": 41.17794651700231, "grad_norm": 815.2692260742188, "learning_rate": 3.7255767661525054e-05, "loss": 35.6956, "step": 15591 }, { "epoch": 41.180587652690654, "grad_norm": 3077.872314453125, "learning_rate": 3.723382948693613e-05, "loss": 34.7952, "step": 15592 }, { "epoch": 41.183228788379004, "grad_norm": 1344.3062744140625, "learning_rate": 3.721189725371754e-05, "loss": 34.8431, "step": 15593 }, { "epoch": 41.18586992406735, "grad_norm": 958.5266723632812, "learning_rate": 3.718997096248175e-05, "loss": 35.339, "step": 15594 }, { "epoch": 41.1885110597557, "grad_norm": 872.857666015625, "learning_rate": 3.7168050613840986e-05, "loss": 35.4765, "step": 15595 }, { "epoch": 41.19115219544404, "grad_norm": 1525.9735107421875, "learning_rate": 3.714613620840737e-05, "loss": 41.662, "step": 15596 }, { "epoch": 41.19379333113239, "grad_norm": 1153.0302734375, "learning_rate": 3.712422774679283e-05, "loss": 18.3503, "step": 15597 }, { "epoch": 41.19643446682073, "grad_norm": 13848.9970703125, "learning_rate": 3.710232522960921e-05, "loss": 13.3951, "step": 15598 }, { "epoch": 41.199075602509076, "grad_norm": 2981.750244140625, "learning_rate": 3.7080428657468134e-05, "loss": 12.6004, "step": 15599 }, { "epoch": 41.201716738197426, "grad_norm": 13778.2177734375, "learning_rate": 3.705853803098094e-05, "loss": 14.7952, "step": 15600 }, { "epoch": 41.201716738197426, "eval_loss": 3.8083479404449463, "eval_runtime": 2.2126, "eval_samples_per_second": 223.721, "eval_steps_per_second": 28.022, "step": 15600 }, { "epoch": 41.20435787388577, "grad_norm": 425.9028015136719, "learning_rate": 3.7036653350759036e-05, "loss": 12.7014, "step": 15601 }, { "epoch": 41.20699900957412, "grad_norm": 2612.974365234375, "learning_rate": 3.701477461741348e-05, "loss": 11.3918, "step": 15602 }, { "epoch": 41.20964014526246, "grad_norm": 3356.100830078125, "learning_rate": 3.699290183155529e-05, "loss": 14.3875, "step": 15603 }, { "epoch": 41.21228128095081, "grad_norm": 744.22265625, "learning_rate": 3.697103499379506e-05, "loss": 12.4173, "step": 15604 }, { "epoch": 41.214922416639155, "grad_norm": 1442.56640625, "learning_rate": 3.694917410474363e-05, "loss": 7.4784, "step": 15605 }, { "epoch": 41.2175635523275, "grad_norm": 1209.479248046875, "learning_rate": 3.692731916501132e-05, "loss": 9.4342, "step": 15606 }, { "epoch": 41.22020468801585, "grad_norm": 769.5504760742188, "learning_rate": 3.6905470175208425e-05, "loss": 36.6855, "step": 15607 }, { "epoch": 41.22284582370419, "grad_norm": 2506.052978515625, "learning_rate": 3.688362713594515e-05, "loss": 35.5492, "step": 15608 }, { "epoch": 41.22548695939254, "grad_norm": 897.8012084960938, "learning_rate": 3.6861790047831415e-05, "loss": 33.836, "step": 15609 }, { "epoch": 41.22812809508088, "grad_norm": 977.7022705078125, "learning_rate": 3.6839958911476953e-05, "loss": 34.8624, "step": 15610 }, { "epoch": 41.23076923076923, "grad_norm": 551.193603515625, "learning_rate": 3.6818133727491434e-05, "loss": 32.7685, "step": 15611 }, { "epoch": 41.233410366457576, "grad_norm": 1259.439208984375, "learning_rate": 3.679631449648432e-05, "loss": 34.2131, "step": 15612 }, { "epoch": 41.23605150214592, "grad_norm": 1757.1455078125, "learning_rate": 3.6774501219064866e-05, "loss": 35.8879, "step": 15613 }, { "epoch": 41.23869263783427, "grad_norm": 1101.9666748046875, "learning_rate": 3.6752693895842156e-05, "loss": 34.8661, "step": 15614 }, { "epoch": 41.24133377352261, "grad_norm": 775.9234619140625, "learning_rate": 3.673089252742523e-05, "loss": 33.8855, "step": 15615 }, { "epoch": 41.24397490921096, "grad_norm": 3126.60107421875, "learning_rate": 3.6709097114422885e-05, "loss": 35.6119, "step": 15616 }, { "epoch": 41.246616044899305, "grad_norm": 843.1624145507812, "learning_rate": 3.6687307657443686e-05, "loss": 34.3621, "step": 15617 }, { "epoch": 41.249257180587655, "grad_norm": 510.3697814941406, "learning_rate": 3.666552415709609e-05, "loss": 34.9954, "step": 15618 }, { "epoch": 41.251898316276, "grad_norm": 1671.0704345703125, "learning_rate": 3.664374661398845e-05, "loss": 36.0744, "step": 15619 }, { "epoch": 41.25453945196435, "grad_norm": 863.9876098632812, "learning_rate": 3.662197502872885e-05, "loss": 34.4861, "step": 15620 }, { "epoch": 41.25718058765269, "grad_norm": 680.076171875, "learning_rate": 3.660020940192524e-05, "loss": 34.872, "step": 15621 }, { "epoch": 41.259821723341034, "grad_norm": 2427.080810546875, "learning_rate": 3.657844973418545e-05, "loss": 37.4747, "step": 15622 }, { "epoch": 41.262462859029384, "grad_norm": 4442.04736328125, "learning_rate": 3.655669602611705e-05, "loss": 37.7025, "step": 15623 }, { "epoch": 41.26510399471773, "grad_norm": 1268.418212890625, "learning_rate": 3.6534948278327565e-05, "loss": 39.0392, "step": 15624 }, { "epoch": 41.26774513040608, "grad_norm": 944.2953491210938, "learning_rate": 3.651320649142417e-05, "loss": 38.6724, "step": 15625 }, { "epoch": 41.27038626609442, "grad_norm": 618.6415405273438, "learning_rate": 3.649147066601416e-05, "loss": 37.5489, "step": 15626 }, { "epoch": 41.27302740178277, "grad_norm": 658.1784057617188, "learning_rate": 3.6469740802704376e-05, "loss": 38.3743, "step": 15627 }, { "epoch": 41.27566853747111, "grad_norm": 2306.224365234375, "learning_rate": 3.6448016902101674e-05, "loss": 41.1364, "step": 15628 }, { "epoch": 41.278309673159455, "grad_norm": 1812.3651123046875, "learning_rate": 3.642629896481264e-05, "loss": 40.4263, "step": 15629 }, { "epoch": 41.280950808847805, "grad_norm": 1276.623779296875, "learning_rate": 3.640458699144375e-05, "loss": 42.7051, "step": 15630 }, { "epoch": 41.28359194453615, "grad_norm": 712.7718505859375, "learning_rate": 3.638288098260129e-05, "loss": 40.4667, "step": 15631 }, { "epoch": 41.2862330802245, "grad_norm": 2522.492919921875, "learning_rate": 3.6361180938891345e-05, "loss": 40.1286, "step": 15632 }, { "epoch": 41.28887421591284, "grad_norm": 1819.1929931640625, "learning_rate": 3.633948686091998e-05, "loss": 39.4932, "step": 15633 }, { "epoch": 41.29151535160119, "grad_norm": 565.8905029296875, "learning_rate": 3.6317798749292915e-05, "loss": 37.0732, "step": 15634 }, { "epoch": 41.294156487289534, "grad_norm": 683.5677490234375, "learning_rate": 3.629611660461577e-05, "loss": 37.3034, "step": 15635 }, { "epoch": 41.29679762297788, "grad_norm": 854.1527709960938, "learning_rate": 3.627444042749406e-05, "loss": 37.7096, "step": 15636 }, { "epoch": 41.29943875866623, "grad_norm": 1250.46630859375, "learning_rate": 3.625277021853307e-05, "loss": 35.9614, "step": 15637 }, { "epoch": 41.30207989435457, "grad_norm": 1017.3414916992188, "learning_rate": 3.6231105978337955e-05, "loss": 35.237, "step": 15638 }, { "epoch": 41.30472103004292, "grad_norm": 5483.2705078125, "learning_rate": 3.6209447707513524e-05, "loss": 35.613, "step": 15639 }, { "epoch": 41.30736216573126, "grad_norm": 853.8096923828125, "learning_rate": 3.6187795406664725e-05, "loss": 34.4677, "step": 15640 }, { "epoch": 41.31000330141961, "grad_norm": 775.6673583984375, "learning_rate": 3.616614907639615e-05, "loss": 34.3752, "step": 15641 }, { "epoch": 41.312644437107956, "grad_norm": 1015.9445190429688, "learning_rate": 3.6144508717312204e-05, "loss": 35.0968, "step": 15642 }, { "epoch": 41.315285572796306, "grad_norm": 1544.4588623046875, "learning_rate": 3.6122874330017265e-05, "loss": 35.0192, "step": 15643 }, { "epoch": 41.31792670848465, "grad_norm": 1280.8385009765625, "learning_rate": 3.610124591511543e-05, "loss": 35.5501, "step": 15644 }, { "epoch": 41.32056784417299, "grad_norm": 7827.38818359375, "learning_rate": 3.6079623473210645e-05, "loss": 44.189, "step": 15645 }, { "epoch": 41.32320897986134, "grad_norm": 1176.3692626953125, "learning_rate": 3.605800700490672e-05, "loss": 20.2175, "step": 15646 }, { "epoch": 41.325850115549684, "grad_norm": 838.5379638671875, "learning_rate": 3.603639651080726e-05, "loss": 10.2049, "step": 15647 }, { "epoch": 41.328491251238034, "grad_norm": 1253.4410400390625, "learning_rate": 3.601479199151575e-05, "loss": 13.012, "step": 15648 }, { "epoch": 41.33113238692638, "grad_norm": 938.2706298828125, "learning_rate": 3.5993193447635405e-05, "loss": 10.979, "step": 15649 }, { "epoch": 41.33377352261473, "grad_norm": 857.16552734375, "learning_rate": 3.59716008797695e-05, "loss": 10.1744, "step": 15650 }, { "epoch": 41.33641465830307, "grad_norm": 349.7117004394531, "learning_rate": 3.595001428852088e-05, "loss": 12.6063, "step": 15651 }, { "epoch": 41.33905579399141, "grad_norm": 889.4187622070312, "learning_rate": 3.592843367449239e-05, "loss": 15.7054, "step": 15652 }, { "epoch": 41.34169692967976, "grad_norm": 2483.92578125, "learning_rate": 3.590685903828658e-05, "loss": 15.7441, "step": 15653 }, { "epoch": 41.344338065368106, "grad_norm": 5358.974609375, "learning_rate": 3.588529038050603e-05, "loss": 14.3017, "step": 15654 }, { "epoch": 41.346979201056456, "grad_norm": 2274.649658203125, "learning_rate": 3.586372770175295e-05, "loss": 18.3843, "step": 15655 }, { "epoch": 41.3496203367448, "grad_norm": 1082.5501708984375, "learning_rate": 3.584217100262949e-05, "loss": 33.7261, "step": 15656 }, { "epoch": 41.35226147243315, "grad_norm": 1466.581787109375, "learning_rate": 3.5820620283737615e-05, "loss": 34.7899, "step": 15657 }, { "epoch": 41.35490260812149, "grad_norm": 1003.5037841796875, "learning_rate": 3.57990755456791e-05, "loss": 34.3146, "step": 15658 }, { "epoch": 41.357543743809835, "grad_norm": 916.930419921875, "learning_rate": 3.577753678905554e-05, "loss": 34.6156, "step": 15659 }, { "epoch": 41.360184879498185, "grad_norm": 1025.31396484375, "learning_rate": 3.5756004014468406e-05, "loss": 33.8729, "step": 15660 }, { "epoch": 41.36282601518653, "grad_norm": 1527.177978515625, "learning_rate": 3.5734477222519044e-05, "loss": 33.8419, "step": 15661 }, { "epoch": 41.36546715087488, "grad_norm": 662.1027221679688, "learning_rate": 3.5712956413808536e-05, "loss": 34.2933, "step": 15662 }, { "epoch": 41.36810828656322, "grad_norm": 784.1380004882812, "learning_rate": 3.569144158893786e-05, "loss": 34.2403, "step": 15663 }, { "epoch": 41.37074942225157, "grad_norm": 891.2294311523438, "learning_rate": 3.566993274850777e-05, "loss": 34.8101, "step": 15664 }, { "epoch": 41.37339055793991, "grad_norm": 1407.43701171875, "learning_rate": 3.5648429893118894e-05, "loss": 35.1414, "step": 15665 }, { "epoch": 41.37603169362826, "grad_norm": 5069.99462890625, "learning_rate": 3.562693302337172e-05, "loss": 35.8704, "step": 15666 }, { "epoch": 41.378672829316606, "grad_norm": 1128.183837890625, "learning_rate": 3.560544213986644e-05, "loss": 36.0339, "step": 15667 }, { "epoch": 41.38131396500495, "grad_norm": 1732.473876953125, "learning_rate": 3.558395724320332e-05, "loss": 34.5549, "step": 15668 }, { "epoch": 41.3839551006933, "grad_norm": 1207.511962890625, "learning_rate": 3.5562478333982215e-05, "loss": 34.3857, "step": 15669 }, { "epoch": 41.38659623638164, "grad_norm": 2136.205322265625, "learning_rate": 3.5541005412802894e-05, "loss": 34.3777, "step": 15670 }, { "epoch": 41.38923737206999, "grad_norm": 2203.4853515625, "learning_rate": 3.551953848026507e-05, "loss": 34.2272, "step": 15671 }, { "epoch": 41.391878507758335, "grad_norm": 1052.8890380859375, "learning_rate": 3.549807753696816e-05, "loss": 35.3792, "step": 15672 }, { "epoch": 41.394519643446685, "grad_norm": 1493.3216552734375, "learning_rate": 3.547662258351145e-05, "loss": 36.6583, "step": 15673 }, { "epoch": 41.39716077913503, "grad_norm": 1224.2584228515625, "learning_rate": 3.545517362049394e-05, "loss": 40.7239, "step": 15674 }, { "epoch": 41.39980191482337, "grad_norm": 698.3516235351562, "learning_rate": 3.543373064851474e-05, "loss": 38.1472, "step": 15675 }, { "epoch": 41.40244305051172, "grad_norm": 1954.3511962890625, "learning_rate": 3.5412293668172556e-05, "loss": 38.4824, "step": 15676 }, { "epoch": 41.405084186200064, "grad_norm": 2995.275390625, "learning_rate": 3.539086268006597e-05, "loss": 38.7219, "step": 15677 }, { "epoch": 41.407725321888414, "grad_norm": 1090.2918701171875, "learning_rate": 3.536943768479353e-05, "loss": 41.1299, "step": 15678 }, { "epoch": 41.41036645757676, "grad_norm": 879.1353149414062, "learning_rate": 3.534801868295345e-05, "loss": 42.6301, "step": 15679 }, { "epoch": 41.41300759326511, "grad_norm": 719.4048461914062, "learning_rate": 3.532660567514387e-05, "loss": 39.8699, "step": 15680 }, { "epoch": 41.41564872895345, "grad_norm": 3421.303466796875, "learning_rate": 3.5305198661962716e-05, "loss": 43.0178, "step": 15681 }, { "epoch": 41.41828986464179, "grad_norm": 888.6823120117188, "learning_rate": 3.528379764400777e-05, "loss": 40.2183, "step": 15682 }, { "epoch": 41.42093100033014, "grad_norm": 1742.181640625, "learning_rate": 3.5262402621876664e-05, "loss": 40.848, "step": 15683 }, { "epoch": 41.423572136018485, "grad_norm": 2154.02099609375, "learning_rate": 3.524101359616672e-05, "loss": 39.9386, "step": 15684 }, { "epoch": 41.426213271706835, "grad_norm": 1000.4546508789062, "learning_rate": 3.52196305674754e-05, "loss": 37.8945, "step": 15685 }, { "epoch": 41.42885440739518, "grad_norm": 1113.2156982421875, "learning_rate": 3.519825353639972e-05, "loss": 36.2408, "step": 15686 }, { "epoch": 41.43149554308353, "grad_norm": 1005.0104370117188, "learning_rate": 3.517688250353665e-05, "loss": 38.5801, "step": 15687 }, { "epoch": 41.43413667877187, "grad_norm": 2428.66455078125, "learning_rate": 3.515551746948287e-05, "loss": 36.4713, "step": 15688 }, { "epoch": 41.43677781446022, "grad_norm": 1308.3297119140625, "learning_rate": 3.513415843483511e-05, "loss": 34.3177, "step": 15689 }, { "epoch": 41.439418950148564, "grad_norm": 1334.069580078125, "learning_rate": 3.511280540018974e-05, "loss": 36.0068, "step": 15690 }, { "epoch": 41.44206008583691, "grad_norm": 1068.0994873046875, "learning_rate": 3.509145836614308e-05, "loss": 35.2319, "step": 15691 }, { "epoch": 41.44470122152526, "grad_norm": 624.7042236328125, "learning_rate": 3.507011733329118e-05, "loss": 35.104, "step": 15692 }, { "epoch": 41.4473423572136, "grad_norm": 945.2900390625, "learning_rate": 3.504878230223002e-05, "loss": 35.6235, "step": 15693 }, { "epoch": 41.44998349290195, "grad_norm": 993.1011962890625, "learning_rate": 3.50274532735553e-05, "loss": 34.8942, "step": 15694 }, { "epoch": 41.45262462859029, "grad_norm": 1417.68017578125, "learning_rate": 3.500613024786264e-05, "loss": 36.3956, "step": 15695 }, { "epoch": 41.45526576427864, "grad_norm": 832.1917114257812, "learning_rate": 3.498481322574756e-05, "loss": 26.5759, "step": 15696 }, { "epoch": 41.457906899966986, "grad_norm": 668.90625, "learning_rate": 3.496350220780525e-05, "loss": 7.6279, "step": 15697 }, { "epoch": 41.46054803565533, "grad_norm": 19626.333984375, "learning_rate": 3.494219719463085e-05, "loss": 10.2237, "step": 15698 }, { "epoch": 41.46318917134368, "grad_norm": 1060.892578125, "learning_rate": 3.4920898186819236e-05, "loss": 8.5072, "step": 15699 }, { "epoch": 41.46583030703202, "grad_norm": 1507.2525634765625, "learning_rate": 3.489960518496521e-05, "loss": 12.3143, "step": 15700 }, { "epoch": 41.46847144272037, "grad_norm": 3663.680908203125, "learning_rate": 3.487831818966333e-05, "loss": 14.3006, "step": 15701 }, { "epoch": 41.471112578408714, "grad_norm": 1851.689453125, "learning_rate": 3.485703720150801e-05, "loss": 14.6638, "step": 15702 }, { "epoch": 41.473753714097064, "grad_norm": 4111.54150390625, "learning_rate": 3.483576222109358e-05, "loss": 9.1247, "step": 15703 }, { "epoch": 41.47639484978541, "grad_norm": 2516.71435546875, "learning_rate": 3.481449324901412e-05, "loss": 10.683, "step": 15704 }, { "epoch": 41.47903598547375, "grad_norm": 637.6734619140625, "learning_rate": 3.4793230285863455e-05, "loss": 8.3747, "step": 15705 }, { "epoch": 41.4816771211621, "grad_norm": 1049.520751953125, "learning_rate": 3.477197333223547e-05, "loss": 24.1224, "step": 15706 }, { "epoch": 41.48431825685044, "grad_norm": 931.4406127929688, "learning_rate": 3.475072238872371e-05, "loss": 36.8846, "step": 15707 }, { "epoch": 41.48695939253879, "grad_norm": 2261.4111328125, "learning_rate": 3.4729477455921586e-05, "loss": 33.2562, "step": 15708 }, { "epoch": 41.489600528227136, "grad_norm": 1377.081787109375, "learning_rate": 3.4708238534422344e-05, "loss": 35.6135, "step": 15709 }, { "epoch": 41.492241663915486, "grad_norm": 1279.923583984375, "learning_rate": 3.468700562481905e-05, "loss": 35.3134, "step": 15710 }, { "epoch": 41.49488279960383, "grad_norm": 1473.0577392578125, "learning_rate": 3.466577872770468e-05, "loss": 33.3999, "step": 15711 }, { "epoch": 41.49752393529218, "grad_norm": 2159.156494140625, "learning_rate": 3.4644557843671845e-05, "loss": 34.134, "step": 15712 }, { "epoch": 41.50016507098052, "grad_norm": 2526.076904296875, "learning_rate": 3.462334297331332e-05, "loss": 33.7415, "step": 15713 }, { "epoch": 41.502806206668865, "grad_norm": 1684.27783203125, "learning_rate": 3.460213411722141e-05, "loss": 34.1187, "step": 15714 }, { "epoch": 41.505447342357215, "grad_norm": 956.0166625976562, "learning_rate": 3.4580931275988384e-05, "loss": 34.5329, "step": 15715 }, { "epoch": 41.50808847804556, "grad_norm": 3296.703857421875, "learning_rate": 3.455973445020627e-05, "loss": 34.8995, "step": 15716 }, { "epoch": 41.51072961373391, "grad_norm": 2757.093994140625, "learning_rate": 3.453854364046705e-05, "loss": 33.4482, "step": 15717 }, { "epoch": 41.51337074942225, "grad_norm": 1552.983154296875, "learning_rate": 3.451735884736243e-05, "loss": 34.2319, "step": 15718 }, { "epoch": 41.5160118851106, "grad_norm": 1410.72412109375, "learning_rate": 3.449618007148392e-05, "loss": 34.1157, "step": 15719 }, { "epoch": 41.51865302079894, "grad_norm": 638.6843872070312, "learning_rate": 3.447500731342304e-05, "loss": 34.2562, "step": 15720 }, { "epoch": 41.521294156487286, "grad_norm": 1193.315673828125, "learning_rate": 3.445384057377096e-05, "loss": 34.5537, "step": 15721 }, { "epoch": 41.523935292175636, "grad_norm": 1170.5054931640625, "learning_rate": 3.443267985311874e-05, "loss": 35.0394, "step": 15722 }, { "epoch": 41.52657642786398, "grad_norm": 6209.193359375, "learning_rate": 3.441152515205734e-05, "loss": 38.2813, "step": 15723 }, { "epoch": 41.52921756355233, "grad_norm": 14045.0009765625, "learning_rate": 3.439037647117746e-05, "loss": 41.2294, "step": 15724 }, { "epoch": 41.53185869924067, "grad_norm": 959.9786987304688, "learning_rate": 3.4369233811069656e-05, "loss": 37.7391, "step": 15725 }, { "epoch": 41.53449983492902, "grad_norm": 906.3759155273438, "learning_rate": 3.434809717232434e-05, "loss": 38.5409, "step": 15726 }, { "epoch": 41.537140970617365, "grad_norm": 688.5405883789062, "learning_rate": 3.432696655553172e-05, "loss": 39.9435, "step": 15727 }, { "epoch": 41.53978210630571, "grad_norm": 1043.707763671875, "learning_rate": 3.430584196128186e-05, "loss": 40.6941, "step": 15728 }, { "epoch": 41.54242324199406, "grad_norm": 783.1728515625, "learning_rate": 3.4284723390164616e-05, "loss": 41.5293, "step": 15729 }, { "epoch": 41.5450643776824, "grad_norm": 774.549072265625, "learning_rate": 3.42636108427698e-05, "loss": 42.1751, "step": 15730 }, { "epoch": 41.54770551337075, "grad_norm": 810.0003051757812, "learning_rate": 3.4242504319686925e-05, "loss": 41.3341, "step": 15731 }, { "epoch": 41.550346649059094, "grad_norm": 712.9656372070312, "learning_rate": 3.422140382150538e-05, "loss": 41.8793, "step": 15732 }, { "epoch": 41.552987784747444, "grad_norm": 910.2490844726562, "learning_rate": 3.420030934881438e-05, "loss": 40.5083, "step": 15733 }, { "epoch": 41.55562892043579, "grad_norm": 1197.869873046875, "learning_rate": 3.417922090220296e-05, "loss": 38.0811, "step": 15734 }, { "epoch": 41.55827005612414, "grad_norm": 747.0034790039062, "learning_rate": 3.4158138482260016e-05, "loss": 40.9508, "step": 15735 }, { "epoch": 41.56091119181248, "grad_norm": 1743.9859619140625, "learning_rate": 3.4137062089574234e-05, "loss": 37.8232, "step": 15736 }, { "epoch": 41.56355232750082, "grad_norm": 1583.759765625, "learning_rate": 3.41159917247342e-05, "loss": 37.4653, "step": 15737 }, { "epoch": 41.56619346318917, "grad_norm": 815.1566162109375, "learning_rate": 3.409492738832831e-05, "loss": 34.9678, "step": 15738 }, { "epoch": 41.568834598877515, "grad_norm": 857.5234985351562, "learning_rate": 3.407386908094473e-05, "loss": 34.9234, "step": 15739 }, { "epoch": 41.571475734565865, "grad_norm": 931.4866943359375, "learning_rate": 3.4052816803171485e-05, "loss": 36.9033, "step": 15740 }, { "epoch": 41.57411687025421, "grad_norm": 1374.978515625, "learning_rate": 3.40317705555965e-05, "loss": 35.1954, "step": 15741 }, { "epoch": 41.57675800594256, "grad_norm": 1138.8846435546875, "learning_rate": 3.401073033880747e-05, "loss": 34.3594, "step": 15742 }, { "epoch": 41.5793991416309, "grad_norm": 684.0786743164062, "learning_rate": 3.398969615339193e-05, "loss": 33.8725, "step": 15743 }, { "epoch": 41.582040277319244, "grad_norm": 1569.2972412109375, "learning_rate": 3.3968667999937196e-05, "loss": 34.725, "step": 15744 }, { "epoch": 41.584681413007594, "grad_norm": 666.532470703125, "learning_rate": 3.394764587903054e-05, "loss": 35.9375, "step": 15745 }, { "epoch": 41.58732254869594, "grad_norm": 8321.8349609375, "learning_rate": 3.392662979125893e-05, "loss": 50.5912, "step": 15746 }, { "epoch": 41.58996368438429, "grad_norm": 1189.942626953125, "learning_rate": 3.390561973720921e-05, "loss": 13.3221, "step": 15747 }, { "epoch": 41.59260482007263, "grad_norm": 6092.3203125, "learning_rate": 3.388461571746818e-05, "loss": 7.5955, "step": 15748 }, { "epoch": 41.59524595576098, "grad_norm": 1861.0013427734375, "learning_rate": 3.386361773262231e-05, "loss": 9.3057, "step": 15749 }, { "epoch": 41.59788709144932, "grad_norm": 2096.56884765625, "learning_rate": 3.384262578325795e-05, "loss": 9.0647, "step": 15750 }, { "epoch": 41.600528227137666, "grad_norm": 585.76025390625, "learning_rate": 3.3821639869961257e-05, "loss": 9.9054, "step": 15751 }, { "epoch": 41.603169362826016, "grad_norm": 1880.021728515625, "learning_rate": 3.38006599933183e-05, "loss": 15.0457, "step": 15752 }, { "epoch": 41.60581049851436, "grad_norm": 2963.088134765625, "learning_rate": 3.377968615391491e-05, "loss": 14.6745, "step": 15753 }, { "epoch": 41.60845163420271, "grad_norm": 1156.3785400390625, "learning_rate": 3.375871835233671e-05, "loss": 13.2706, "step": 15754 }, { "epoch": 41.61109276989105, "grad_norm": 936.1927490234375, "learning_rate": 3.373775658916933e-05, "loss": 13.3935, "step": 15755 }, { "epoch": 41.6137339055794, "grad_norm": 3359.977294921875, "learning_rate": 3.3716800864998064e-05, "loss": 17.5438, "step": 15756 }, { "epoch": 41.616375041267744, "grad_norm": 665.6461791992188, "learning_rate": 3.369585118040802e-05, "loss": 34.2818, "step": 15757 }, { "epoch": 41.619016176956094, "grad_norm": 1181.1064453125, "learning_rate": 3.367490753598435e-05, "loss": 33.7568, "step": 15758 }, { "epoch": 41.62165731264444, "grad_norm": 1296.4814453125, "learning_rate": 3.365396993231176e-05, "loss": 36.3587, "step": 15759 }, { "epoch": 41.62429844833278, "grad_norm": 1256.7926025390625, "learning_rate": 3.3633038369975e-05, "loss": 34.3242, "step": 15760 }, { "epoch": 41.62693958402113, "grad_norm": 2129.69921875, "learning_rate": 3.3612112849558565e-05, "loss": 35.7484, "step": 15761 }, { "epoch": 41.62958071970947, "grad_norm": 1107.484619140625, "learning_rate": 3.3591193371646736e-05, "loss": 34.3313, "step": 15762 }, { "epoch": 41.63222185539782, "grad_norm": 1249.2601318359375, "learning_rate": 3.3570279936823714e-05, "loss": 32.9928, "step": 15763 }, { "epoch": 41.634862991086166, "grad_norm": 1910.03662109375, "learning_rate": 3.3549372545673415e-05, "loss": 33.4397, "step": 15764 }, { "epoch": 41.637504126774516, "grad_norm": 905.6615600585938, "learning_rate": 3.3528471198779805e-05, "loss": 34.9026, "step": 15765 }, { "epoch": 41.64014526246286, "grad_norm": 1405.964599609375, "learning_rate": 3.350757589672648e-05, "loss": 35.0985, "step": 15766 }, { "epoch": 41.6427863981512, "grad_norm": 941.8632202148438, "learning_rate": 3.348668664009694e-05, "loss": 35.3366, "step": 15767 }, { "epoch": 41.64542753383955, "grad_norm": 933.4540405273438, "learning_rate": 3.3465803429474445e-05, "loss": 35.0264, "step": 15768 }, { "epoch": 41.648068669527895, "grad_norm": 3794.801025390625, "learning_rate": 3.3444926265442236e-05, "loss": 35.1445, "step": 15769 }, { "epoch": 41.650709805216245, "grad_norm": 1330.061279296875, "learning_rate": 3.342405514858321e-05, "loss": 33.9601, "step": 15770 }, { "epoch": 41.65335094090459, "grad_norm": 1614.3828125, "learning_rate": 3.34031900794802e-05, "loss": 35.0207, "step": 15771 }, { "epoch": 41.65599207659294, "grad_norm": 1231.1494140625, "learning_rate": 3.338233105871591e-05, "loss": 35.2021, "step": 15772 }, { "epoch": 41.65863321228128, "grad_norm": 8069.455078125, "learning_rate": 3.336147808687276e-05, "loss": 36.9948, "step": 15773 }, { "epoch": 41.66127434796962, "grad_norm": 3299.740234375, "learning_rate": 3.334063116453309e-05, "loss": 40.2996, "step": 15774 }, { "epoch": 41.66391548365797, "grad_norm": 1227.92822265625, "learning_rate": 3.3319790292278996e-05, "loss": 38.0299, "step": 15775 }, { "epoch": 41.666556619346316, "grad_norm": 1139.388427734375, "learning_rate": 3.329895547069248e-05, "loss": 37.9622, "step": 15776 }, { "epoch": 41.669197755034666, "grad_norm": 1946.0609130859375, "learning_rate": 3.3278126700355376e-05, "loss": 39.1389, "step": 15777 }, { "epoch": 41.67183889072301, "grad_norm": 1843.9827880859375, "learning_rate": 3.325730398184923e-05, "loss": 39.5728, "step": 15778 }, { "epoch": 41.67448002641136, "grad_norm": 1454.5968017578125, "learning_rate": 3.32364873157556e-05, "loss": 43.7727, "step": 15779 }, { "epoch": 41.6771211620997, "grad_norm": 779.2555541992188, "learning_rate": 3.321567670265568e-05, "loss": 43.2577, "step": 15780 }, { "epoch": 41.67976229778805, "grad_norm": 745.8431396484375, "learning_rate": 3.319487214313066e-05, "loss": 41.2384, "step": 15781 }, { "epoch": 41.682403433476395, "grad_norm": 653.2885131835938, "learning_rate": 3.317407363776143e-05, "loss": 39.9684, "step": 15782 }, { "epoch": 41.68504456916474, "grad_norm": 1045.9720458984375, "learning_rate": 3.315328118712888e-05, "loss": 39.4924, "step": 15783 }, { "epoch": 41.68768570485309, "grad_norm": 1014.9223022460938, "learning_rate": 3.3132494791813574e-05, "loss": 37.0711, "step": 15784 }, { "epoch": 41.69032684054143, "grad_norm": 1074.8623046875, "learning_rate": 3.311171445239594e-05, "loss": 37.8416, "step": 15785 }, { "epoch": 41.69296797622978, "grad_norm": 1391.927734375, "learning_rate": 3.309094016945627e-05, "loss": 36.9466, "step": 15786 }, { "epoch": 41.695609111918124, "grad_norm": 4548.46435546875, "learning_rate": 3.30701719435747e-05, "loss": 36.2679, "step": 15787 }, { "epoch": 41.698250247606474, "grad_norm": 1562.72412109375, "learning_rate": 3.3049409775331134e-05, "loss": 35.9651, "step": 15788 }, { "epoch": 41.70089138329482, "grad_norm": 950.1175537109375, "learning_rate": 3.3028653665305304e-05, "loss": 36.455, "step": 15789 }, { "epoch": 41.70353251898316, "grad_norm": 796.51171875, "learning_rate": 3.3007903614076906e-05, "loss": 36.6452, "step": 15790 }, { "epoch": 41.70617365467151, "grad_norm": 1488.802490234375, "learning_rate": 3.2987159622225344e-05, "loss": 34.7322, "step": 15791 }, { "epoch": 41.70881479035985, "grad_norm": 1062.6171875, "learning_rate": 3.296642169032982e-05, "loss": 33.5394, "step": 15792 }, { "epoch": 41.7114559260482, "grad_norm": 1211.538330078125, "learning_rate": 3.294568981896951e-05, "loss": 35.7922, "step": 15793 }, { "epoch": 41.714097061736545, "grad_norm": 1069.358642578125, "learning_rate": 3.292496400872333e-05, "loss": 35.601, "step": 15794 }, { "epoch": 41.716738197424895, "grad_norm": 1747.6785888671875, "learning_rate": 3.2904244260169994e-05, "loss": 34.4567, "step": 15795 }, { "epoch": 41.71937933311324, "grad_norm": 5991.80224609375, "learning_rate": 3.2883530573888125e-05, "loss": 42.0844, "step": 15796 }, { "epoch": 41.72202046880158, "grad_norm": 2183.27392578125, "learning_rate": 3.286282295045612e-05, "loss": 18.1196, "step": 15797 }, { "epoch": 41.72466160448993, "grad_norm": 432.39703369140625, "learning_rate": 3.284212139045223e-05, "loss": 8.9549, "step": 15798 }, { "epoch": 41.727302740178274, "grad_norm": 386.11431884765625, "learning_rate": 3.282142589445447e-05, "loss": 12.4575, "step": 15799 }, { "epoch": 41.729943875866624, "grad_norm": 2447.7373046875, "learning_rate": 3.280073646304088e-05, "loss": 10.8769, "step": 15800 }, { "epoch": 41.729943875866624, "eval_loss": 3.7763993740081787, "eval_runtime": 2.1749, "eval_samples_per_second": 227.601, "eval_steps_per_second": 28.508, "step": 15800 }, { "epoch": 41.73258501155497, "grad_norm": 5750.7451171875, "learning_rate": 3.278005309678913e-05, "loss": 11.7776, "step": 15801 }, { "epoch": 41.73522614724332, "grad_norm": 1887.3798828125, "learning_rate": 3.2759375796276804e-05, "loss": 7.5619, "step": 15802 }, { "epoch": 41.73786728293166, "grad_norm": 3649.803955078125, "learning_rate": 3.2738704562081275e-05, "loss": 9.9379, "step": 15803 }, { "epoch": 41.74050841862001, "grad_norm": 15182.02734375, "learning_rate": 3.2718039394779825e-05, "loss": 9.2278, "step": 15804 }, { "epoch": 41.74314955430835, "grad_norm": 2243.4033203125, "learning_rate": 3.269738029494945e-05, "loss": 11.6859, "step": 15805 }, { "epoch": 41.745790689996696, "grad_norm": 1364.57763671875, "learning_rate": 3.2676727263167064e-05, "loss": 35.4186, "step": 15806 }, { "epoch": 41.748431825685046, "grad_norm": 837.1678466796875, "learning_rate": 3.265608030000944e-05, "loss": 34.3851, "step": 15807 }, { "epoch": 41.75107296137339, "grad_norm": 2899.85791015625, "learning_rate": 3.2635439406053096e-05, "loss": 36.3884, "step": 15808 }, { "epoch": 41.75371409706174, "grad_norm": 801.3580932617188, "learning_rate": 3.2614804581874436e-05, "loss": 35.1752, "step": 15809 }, { "epoch": 41.75635523275008, "grad_norm": 2047.0828857421875, "learning_rate": 3.2594175828049576e-05, "loss": 36.6706, "step": 15810 }, { "epoch": 41.75899636843843, "grad_norm": 972.9080200195312, "learning_rate": 3.257355314515473e-05, "loss": 34.1612, "step": 15811 }, { "epoch": 41.761637504126774, "grad_norm": 1081.9735107421875, "learning_rate": 3.2552936533765675e-05, "loss": 35.4339, "step": 15812 }, { "epoch": 41.76427863981512, "grad_norm": 2080.543212890625, "learning_rate": 3.2532325994458126e-05, "loss": 34.9838, "step": 15813 }, { "epoch": 41.76691977550347, "grad_norm": 1572.942626953125, "learning_rate": 3.251172152780762e-05, "loss": 33.6539, "step": 15814 }, { "epoch": 41.76956091119181, "grad_norm": 1057.0428466796875, "learning_rate": 3.2491123134389564e-05, "loss": 33.3968, "step": 15815 }, { "epoch": 41.77220204688016, "grad_norm": 1123.822021484375, "learning_rate": 3.24705308147791e-05, "loss": 33.8541, "step": 15816 }, { "epoch": 41.7748431825685, "grad_norm": 938.0248413085938, "learning_rate": 3.244994456955122e-05, "loss": 34.723, "step": 15817 }, { "epoch": 41.77748431825685, "grad_norm": 1224.24072265625, "learning_rate": 3.24293643992809e-05, "loss": 35.8886, "step": 15818 }, { "epoch": 41.780125453945196, "grad_norm": 1109.5205078125, "learning_rate": 3.2408790304542774e-05, "loss": 33.9196, "step": 15819 }, { "epoch": 41.78276658963354, "grad_norm": 1910.908203125, "learning_rate": 3.238822228591137e-05, "loss": 34.4341, "step": 15820 }, { "epoch": 41.78540772532189, "grad_norm": 1574.2373046875, "learning_rate": 3.236766034396102e-05, "loss": 34.4596, "step": 15821 }, { "epoch": 41.78804886101023, "grad_norm": 955.6424560546875, "learning_rate": 3.234710447926589e-05, "loss": 36.6489, "step": 15822 }, { "epoch": 41.79068999669858, "grad_norm": 882.1766967773438, "learning_rate": 3.232655469240003e-05, "loss": 35.9811, "step": 15823 }, { "epoch": 41.793331132386925, "grad_norm": 9210.4736328125, "learning_rate": 3.230601098393721e-05, "loss": 42.6268, "step": 15824 }, { "epoch": 41.795972268075275, "grad_norm": 757.3482666015625, "learning_rate": 3.2285473354451203e-05, "loss": 39.2915, "step": 15825 }, { "epoch": 41.79861340376362, "grad_norm": 2647.4736328125, "learning_rate": 3.2264941804515473e-05, "loss": 39.7851, "step": 15826 }, { "epoch": 41.80125453945197, "grad_norm": 1502.478759765625, "learning_rate": 3.224441633470326e-05, "loss": 38.1602, "step": 15827 }, { "epoch": 41.80389567514031, "grad_norm": 909.852783203125, "learning_rate": 3.222389694558789e-05, "loss": 40.7737, "step": 15828 }, { "epoch": 41.80653681082865, "grad_norm": 931.0784301757812, "learning_rate": 3.220338363774222e-05, "loss": 42.1053, "step": 15829 }, { "epoch": 41.809177946517, "grad_norm": 704.5678100585938, "learning_rate": 3.218287641173917e-05, "loss": 41.2169, "step": 15830 }, { "epoch": 41.811819082205346, "grad_norm": 629.2551879882812, "learning_rate": 3.2162375268151326e-05, "loss": 40.4605, "step": 15831 }, { "epoch": 41.814460217893696, "grad_norm": 932.3148193359375, "learning_rate": 3.214188020755118e-05, "loss": 39.6814, "step": 15832 }, { "epoch": 41.81710135358204, "grad_norm": 1343.0726318359375, "learning_rate": 3.2121391230511054e-05, "loss": 37.9807, "step": 15833 }, { "epoch": 41.81974248927039, "grad_norm": 981.226318359375, "learning_rate": 3.210090833760304e-05, "loss": 38.0256, "step": 15834 }, { "epoch": 41.82238362495873, "grad_norm": 673.7756958007812, "learning_rate": 3.2080431529399214e-05, "loss": 37.133, "step": 15835 }, { "epoch": 41.825024760647075, "grad_norm": 1920.1346435546875, "learning_rate": 3.2059960806471337e-05, "loss": 37.8575, "step": 15836 }, { "epoch": 41.827665896335425, "grad_norm": 1568.9644775390625, "learning_rate": 3.203949616939095e-05, "loss": 36.5339, "step": 15837 }, { "epoch": 41.83030703202377, "grad_norm": 980.11767578125, "learning_rate": 3.201903761872973e-05, "loss": 35.63, "step": 15838 }, { "epoch": 41.83294816771212, "grad_norm": 730.6539916992188, "learning_rate": 3.199858515505877e-05, "loss": 35.859, "step": 15839 }, { "epoch": 41.83558930340046, "grad_norm": 1299.63818359375, "learning_rate": 3.1978138778949254e-05, "loss": 34.332, "step": 15840 }, { "epoch": 41.83823043908881, "grad_norm": 1600.8502197265625, "learning_rate": 3.195769849097208e-05, "loss": 34.7925, "step": 15841 }, { "epoch": 41.840871574777154, "grad_norm": 675.451171875, "learning_rate": 3.193726429169816e-05, "loss": 36.3418, "step": 15842 }, { "epoch": 41.8435127104655, "grad_norm": 1705.3734130859375, "learning_rate": 3.1916836181698024e-05, "loss": 34.4531, "step": 15843 }, { "epoch": 41.84615384615385, "grad_norm": 677.0645751953125, "learning_rate": 3.189641416154207e-05, "loss": 35.3492, "step": 15844 }, { "epoch": 41.84879498184219, "grad_norm": 1110.4986572265625, "learning_rate": 3.187599823180071e-05, "loss": 35.7628, "step": 15845 }, { "epoch": 41.85143611753054, "grad_norm": 825.8629760742188, "learning_rate": 3.185558839304395e-05, "loss": 29.8743, "step": 15846 }, { "epoch": 41.85407725321888, "grad_norm": 4838.6923828125, "learning_rate": 3.183518464584173e-05, "loss": 10.3732, "step": 15847 }, { "epoch": 41.85671838890723, "grad_norm": 2014.9696044921875, "learning_rate": 3.181478699076384e-05, "loss": 10.9133, "step": 15848 }, { "epoch": 41.859359524595575, "grad_norm": 1730.248046875, "learning_rate": 3.179439542837983e-05, "loss": 15.2378, "step": 15849 }, { "epoch": 41.862000660283925, "grad_norm": 514.4814453125, "learning_rate": 3.177400995925919e-05, "loss": 8.7195, "step": 15850 }, { "epoch": 41.86464179597227, "grad_norm": 3191.207275390625, "learning_rate": 3.1753630583971045e-05, "loss": 9.6196, "step": 15851 }, { "epoch": 41.86728293166061, "grad_norm": 803.224853515625, "learning_rate": 3.17332573030846e-05, "loss": 9.9315, "step": 15852 }, { "epoch": 41.86992406734896, "grad_norm": 964.6346435546875, "learning_rate": 3.171289011716877e-05, "loss": 11.1438, "step": 15853 }, { "epoch": 41.872565203037304, "grad_norm": 1832.634765625, "learning_rate": 3.1692529026792236e-05, "loss": 11.1505, "step": 15854 }, { "epoch": 41.875206338725654, "grad_norm": 9896.0576171875, "learning_rate": 3.1672174032523565e-05, "loss": 14.2847, "step": 15855 }, { "epoch": 41.877847474414, "grad_norm": 698.3348388671875, "learning_rate": 3.16518251349312e-05, "loss": 35.2641, "step": 15856 }, { "epoch": 41.88048861010235, "grad_norm": 2408.956787109375, "learning_rate": 3.163148233458335e-05, "loss": 35.1199, "step": 15857 }, { "epoch": 41.88312974579069, "grad_norm": 1421.821044921875, "learning_rate": 3.161114563204803e-05, "loss": 33.2028, "step": 15858 }, { "epoch": 41.88577088147903, "grad_norm": 1047.197509765625, "learning_rate": 3.159081502789324e-05, "loss": 35.5114, "step": 15859 }, { "epoch": 41.88841201716738, "grad_norm": 1009.5863037109375, "learning_rate": 3.157049052268662e-05, "loss": 35.3543, "step": 15860 }, { "epoch": 41.891053152855726, "grad_norm": 530.820556640625, "learning_rate": 3.155017211699576e-05, "loss": 33.5657, "step": 15861 }, { "epoch": 41.893694288544076, "grad_norm": 567.6041259765625, "learning_rate": 3.152985981138795e-05, "loss": 33.7996, "step": 15862 }, { "epoch": 41.89633542423242, "grad_norm": 1114.074951171875, "learning_rate": 3.150955360643054e-05, "loss": 34.7266, "step": 15863 }, { "epoch": 41.89897655992077, "grad_norm": 962.9176635742188, "learning_rate": 3.1489253502690495e-05, "loss": 34.5596, "step": 15864 }, { "epoch": 41.90161769560911, "grad_norm": 1362.48681640625, "learning_rate": 3.146895950073467e-05, "loss": 33.7836, "step": 15865 }, { "epoch": 41.904258831297454, "grad_norm": 1700.570068359375, "learning_rate": 3.14486716011298e-05, "loss": 35.1625, "step": 15866 }, { "epoch": 41.906899966985804, "grad_norm": 958.5574340820312, "learning_rate": 3.142838980444235e-05, "loss": 34.7136, "step": 15867 }, { "epoch": 41.90954110267415, "grad_norm": 1168.24462890625, "learning_rate": 3.140811411123876e-05, "loss": 34.024, "step": 15868 }, { "epoch": 41.9121822383625, "grad_norm": 1716.154541015625, "learning_rate": 3.138784452208509e-05, "loss": 33.8834, "step": 15869 }, { "epoch": 41.91482337405084, "grad_norm": 2613.480712890625, "learning_rate": 3.136758103754753e-05, "loss": 34.8469, "step": 15870 }, { "epoch": 41.91746450973919, "grad_norm": 904.0540771484375, "learning_rate": 3.134732365819182e-05, "loss": 34.5204, "step": 15871 }, { "epoch": 41.92010564542753, "grad_norm": 2608.210693359375, "learning_rate": 3.13270723845836e-05, "loss": 35.4327, "step": 15872 }, { "epoch": 41.92274678111588, "grad_norm": 3758.044677734375, "learning_rate": 3.130682721728853e-05, "loss": 37.8593, "step": 15873 }, { "epoch": 41.925387916804226, "grad_norm": 1399.0821533203125, "learning_rate": 3.1286588156871796e-05, "loss": 40.4392, "step": 15874 }, { "epoch": 41.92802905249257, "grad_norm": 1521.701904296875, "learning_rate": 3.126635520389862e-05, "loss": 40.0147, "step": 15875 }, { "epoch": 41.93067018818092, "grad_norm": 1127.750244140625, "learning_rate": 3.124612835893392e-05, "loss": 39.0265, "step": 15876 }, { "epoch": 41.93331132386926, "grad_norm": 1126.3419189453125, "learning_rate": 3.122590762254263e-05, "loss": 42.0588, "step": 15877 }, { "epoch": 41.93595245955761, "grad_norm": 914.6119384765625, "learning_rate": 3.120569299528936e-05, "loss": 40.446, "step": 15878 }, { "epoch": 41.938593595245955, "grad_norm": 693.853271484375, "learning_rate": 3.1185484477738526e-05, "loss": 39.3196, "step": 15879 }, { "epoch": 41.941234730934305, "grad_norm": 565.7421264648438, "learning_rate": 3.1165282070454576e-05, "loss": 38.7015, "step": 15880 }, { "epoch": 41.94387586662265, "grad_norm": 909.4725952148438, "learning_rate": 3.114508577400155e-05, "loss": 37.48, "step": 15881 }, { "epoch": 41.94651700231099, "grad_norm": 988.2545166015625, "learning_rate": 3.112489558894346e-05, "loss": 35.4006, "step": 15882 }, { "epoch": 41.94915813799934, "grad_norm": 1429.0159912109375, "learning_rate": 3.110471151584407e-05, "loss": 34.9055, "step": 15883 }, { "epoch": 41.95179927368768, "grad_norm": 1221.4639892578125, "learning_rate": 3.108453355526703e-05, "loss": 34.6806, "step": 15884 }, { "epoch": 41.95444040937603, "grad_norm": 2729.934326171875, "learning_rate": 3.10643617077758e-05, "loss": 35.5756, "step": 15885 }, { "epoch": 41.957081545064376, "grad_norm": 2282.075927734375, "learning_rate": 3.10441959739336e-05, "loss": 35.1643, "step": 15886 }, { "epoch": 41.959722680752726, "grad_norm": 4798.046875, "learning_rate": 3.102403635430365e-05, "loss": 23.0901, "step": 15887 }, { "epoch": 41.96236381644107, "grad_norm": 1342.96630859375, "learning_rate": 3.1003882849448855e-05, "loss": 8.7883, "step": 15888 }, { "epoch": 41.96500495212941, "grad_norm": 1210.732177734375, "learning_rate": 3.0983735459931994e-05, "loss": 17.7956, "step": 15889 }, { "epoch": 41.96764608781776, "grad_norm": 3617.563232421875, "learning_rate": 3.096359418631567e-05, "loss": 10.8915, "step": 15890 }, { "epoch": 41.970287223506105, "grad_norm": 769.898681640625, "learning_rate": 3.094345902916229e-05, "loss": 8.6844, "step": 15891 }, { "epoch": 41.972928359194455, "grad_norm": 3942.588134765625, "learning_rate": 3.092332998903416e-05, "loss": 23.0364, "step": 15892 }, { "epoch": 41.9755694948828, "grad_norm": 1707.7021484375, "learning_rate": 3.090320706649327e-05, "loss": 33.7514, "step": 15893 }, { "epoch": 41.97821063057115, "grad_norm": 1269.4613037109375, "learning_rate": 3.0883090262101676e-05, "loss": 34.8607, "step": 15894 }, { "epoch": 41.98085176625949, "grad_norm": 1413.7657470703125, "learning_rate": 3.086297957642109e-05, "loss": 34.1915, "step": 15895 }, { "epoch": 41.98349290194784, "grad_norm": 1549.2911376953125, "learning_rate": 3.084287501001304e-05, "loss": 33.9565, "step": 15896 }, { "epoch": 41.986134037636184, "grad_norm": 1220.7435302734375, "learning_rate": 3.082277656343893e-05, "loss": 34.3077, "step": 15897 }, { "epoch": 41.98877517332453, "grad_norm": 2202.86865234375, "learning_rate": 3.08026842372601e-05, "loss": 35.0606, "step": 15898 }, { "epoch": 41.99141630901288, "grad_norm": 693.582275390625, "learning_rate": 3.078259803203754e-05, "loss": 34.5042, "step": 15899 }, { "epoch": 41.99405744470122, "grad_norm": 1167.59521484375, "learning_rate": 3.076251794833213e-05, "loss": 34.2306, "step": 15900 }, { "epoch": 41.99669858038957, "grad_norm": 4746.943359375, "learning_rate": 3.0742443986704645e-05, "loss": 35.6282, "step": 15901 }, { "epoch": 41.99933971607791, "grad_norm": 2108.155029296875, "learning_rate": 3.072237614771561e-05, "loss": 37.9125, "step": 15902 }, { "epoch": 42.00198085176626, "grad_norm": 604.7827758789062, "learning_rate": 3.0702314431925394e-05, "loss": 39.675, "step": 15903 }, { "epoch": 42.004621987454605, "grad_norm": 2284.14306640625, "learning_rate": 3.0682258839894175e-05, "loss": 38.2174, "step": 15904 }, { "epoch": 42.00726312314295, "grad_norm": 1136.2174072265625, "learning_rate": 3.0662209372182114e-05, "loss": 38.2666, "step": 15905 }, { "epoch": 42.0099042588313, "grad_norm": 977.27783203125, "learning_rate": 3.0642166029349e-05, "loss": 38.4942, "step": 15906 }, { "epoch": 42.01254539451964, "grad_norm": 673.5353393554688, "learning_rate": 3.0622128811954504e-05, "loss": 41.5332, "step": 15907 }, { "epoch": 42.01518653020799, "grad_norm": 1518.2330322265625, "learning_rate": 3.060209772055828e-05, "loss": 40.6405, "step": 15908 }, { "epoch": 42.017827665896334, "grad_norm": 761.6021728515625, "learning_rate": 3.058207275571956e-05, "loss": 41.9641, "step": 15909 }, { "epoch": 42.020468801584684, "grad_norm": 1925.47509765625, "learning_rate": 3.0562053917997545e-05, "loss": 40.1414, "step": 15910 }, { "epoch": 42.02310993727303, "grad_norm": 846.26708984375, "learning_rate": 3.05420412079512e-05, "loss": 39.8825, "step": 15911 }, { "epoch": 42.02575107296137, "grad_norm": 1142.5269775390625, "learning_rate": 3.052203462613953e-05, "loss": 40.0734, "step": 15912 }, { "epoch": 42.02839220864972, "grad_norm": 977.3800048828125, "learning_rate": 3.0502034173121114e-05, "loss": 39.8919, "step": 15913 }, { "epoch": 42.03103334433806, "grad_norm": 792.1771240234375, "learning_rate": 3.0482039849454385e-05, "loss": 38.6312, "step": 15914 }, { "epoch": 42.03367448002641, "grad_norm": 856.7683715820312, "learning_rate": 3.0462051655697793e-05, "loss": 36.8142, "step": 15915 }, { "epoch": 42.036315615714756, "grad_norm": 577.3836669921875, "learning_rate": 3.0442069592409442e-05, "loss": 36.0357, "step": 15916 }, { "epoch": 42.038956751403106, "grad_norm": 713.4672241210938, "learning_rate": 3.0422093660147337e-05, "loss": 34.7636, "step": 15917 }, { "epoch": 42.04159788709145, "grad_norm": 1600.7149658203125, "learning_rate": 3.040212385946925e-05, "loss": 37.1298, "step": 15918 }, { "epoch": 42.0442390227798, "grad_norm": 1493.4493408203125, "learning_rate": 3.0382160190932874e-05, "loss": 34.6152, "step": 15919 }, { "epoch": 42.04688015846814, "grad_norm": 1099.5596923828125, "learning_rate": 3.036220265509568e-05, "loss": 35.2248, "step": 15920 }, { "epoch": 42.049521294156484, "grad_norm": 920.6664428710938, "learning_rate": 3.0342251252514886e-05, "loss": 34.7541, "step": 15921 }, { "epoch": 42.052162429844834, "grad_norm": 1257.3956298828125, "learning_rate": 3.0322305983747744e-05, "loss": 35.3328, "step": 15922 }, { "epoch": 42.05480356553318, "grad_norm": 599.1864013671875, "learning_rate": 3.030236684935117e-05, "loss": 36.0358, "step": 15923 }, { "epoch": 42.05744470122153, "grad_norm": 1228.556396484375, "learning_rate": 3.028243384988194e-05, "loss": 34.4486, "step": 15924 }, { "epoch": 42.06008583690987, "grad_norm": 2283.531982421875, "learning_rate": 3.0262506985896664e-05, "loss": 39.2769, "step": 15925 }, { "epoch": 42.06272697259822, "grad_norm": 1259.264404296875, "learning_rate": 3.0242586257951815e-05, "loss": 13.2718, "step": 15926 }, { "epoch": 42.06536810828656, "grad_norm": 1228.9696044921875, "learning_rate": 3.0222671666603642e-05, "loss": 12.553, "step": 15927 }, { "epoch": 42.068009243974906, "grad_norm": 1355.1619873046875, "learning_rate": 3.0202763212408223e-05, "loss": 17.7873, "step": 15928 }, { "epoch": 42.070650379663256, "grad_norm": 1612.0465087890625, "learning_rate": 3.0182860895921567e-05, "loss": 9.3833, "step": 15929 }, { "epoch": 42.0732915153516, "grad_norm": 3324.16552734375, "learning_rate": 3.0162964717699385e-05, "loss": 10.2685, "step": 15930 }, { "epoch": 42.07593265103995, "grad_norm": 2281.941650390625, "learning_rate": 3.014307467829727e-05, "loss": 8.6602, "step": 15931 }, { "epoch": 42.07857378672829, "grad_norm": 533.0982666015625, "learning_rate": 3.012319077827061e-05, "loss": 8.9848, "step": 15932 }, { "epoch": 42.08121492241664, "grad_norm": 3737.506591796875, "learning_rate": 3.0103313018174734e-05, "loss": 19.2449, "step": 15933 }, { "epoch": 42.083856058104985, "grad_norm": 1549.776123046875, "learning_rate": 3.008344139856467e-05, "loss": 12.6085, "step": 15934 }, { "epoch": 42.08649719379333, "grad_norm": 971.6968994140625, "learning_rate": 3.006357591999531e-05, "loss": 31.796, "step": 15935 }, { "epoch": 42.08913832948168, "grad_norm": 1511.486083984375, "learning_rate": 3.004371658302138e-05, "loss": 34.9694, "step": 15936 }, { "epoch": 42.09177946517002, "grad_norm": 651.22802734375, "learning_rate": 3.0023863388197488e-05, "loss": 33.7903, "step": 15937 }, { "epoch": 42.09442060085837, "grad_norm": 623.60009765625, "learning_rate": 3.0004016336077967e-05, "loss": 34.8645, "step": 15938 }, { "epoch": 42.09706173654671, "grad_norm": 3760.0439453125, "learning_rate": 2.9984175427217013e-05, "loss": 34.1329, "step": 15939 }, { "epoch": 42.09970287223506, "grad_norm": 524.5670776367188, "learning_rate": 2.9964340662168772e-05, "loss": 36.169, "step": 15940 }, { "epoch": 42.102344007923406, "grad_norm": 2262.724853515625, "learning_rate": 2.9944512041487044e-05, "loss": 35.7122, "step": 15941 }, { "epoch": 42.104985143611756, "grad_norm": 1266.3333740234375, "learning_rate": 2.9924689565725528e-05, "loss": 34.1407, "step": 15942 }, { "epoch": 42.1076262793001, "grad_norm": 1296.0888671875, "learning_rate": 2.990487323543786e-05, "loss": 34.2785, "step": 15943 }, { "epoch": 42.11026741498844, "grad_norm": 1234.018798828125, "learning_rate": 2.9885063051177275e-05, "loss": 34.3525, "step": 15944 }, { "epoch": 42.11290855067679, "grad_norm": 818.1495361328125, "learning_rate": 2.9865259013496987e-05, "loss": 35.1494, "step": 15945 }, { "epoch": 42.115549686365135, "grad_norm": 984.8394165039062, "learning_rate": 2.9845461122949974e-05, "loss": 35.7005, "step": 15946 }, { "epoch": 42.118190822053485, "grad_norm": 1569.280517578125, "learning_rate": 2.9825669380089182e-05, "loss": 34.1967, "step": 15947 }, { "epoch": 42.12083195774183, "grad_norm": 1244.0341796875, "learning_rate": 2.9805883785467254e-05, "loss": 35.0082, "step": 15948 }, { "epoch": 42.12347309343018, "grad_norm": 591.0576782226562, "learning_rate": 2.9786104339636634e-05, "loss": 34.4094, "step": 15949 }, { "epoch": 42.12611422911852, "grad_norm": 1500.4268798828125, "learning_rate": 2.9766331043149713e-05, "loss": 35.3635, "step": 15950 }, { "epoch": 42.128755364806864, "grad_norm": 976.126953125, "learning_rate": 2.9746563896558638e-05, "loss": 36.5056, "step": 15951 }, { "epoch": 42.131396500495214, "grad_norm": 2023.7576904296875, "learning_rate": 2.972680290041538e-05, "loss": 39.7087, "step": 15952 }, { "epoch": 42.13403763618356, "grad_norm": 2975.31298828125, "learning_rate": 2.9707048055271746e-05, "loss": 39.2311, "step": 15953 }, { "epoch": 42.13667877187191, "grad_norm": 702.4969482421875, "learning_rate": 2.9687299361679382e-05, "loss": 39.5343, "step": 15954 }, { "epoch": 42.13931990756025, "grad_norm": 601.4874267578125, "learning_rate": 2.966755682018979e-05, "loss": 37.9219, "step": 15955 }, { "epoch": 42.1419610432486, "grad_norm": 801.541259765625, "learning_rate": 2.9647820431354167e-05, "loss": 37.795, "step": 15956 }, { "epoch": 42.14460217893694, "grad_norm": 843.1380615234375, "learning_rate": 2.9628090195723796e-05, "loss": 41.0852, "step": 15957 }, { "epoch": 42.147243314625285, "grad_norm": 1039.8348388671875, "learning_rate": 2.960836611384954e-05, "loss": 41.0451, "step": 15958 }, { "epoch": 42.149884450313635, "grad_norm": 743.5657958984375, "learning_rate": 2.958864818628218e-05, "loss": 43.0089, "step": 15959 }, { "epoch": 42.15252558600198, "grad_norm": 1527.2418212890625, "learning_rate": 2.956893641357236e-05, "loss": 40.8332, "step": 15960 }, { "epoch": 42.15516672169033, "grad_norm": 995.1256713867188, "learning_rate": 2.9549230796270504e-05, "loss": 39.7547, "step": 15961 }, { "epoch": 42.15780785737867, "grad_norm": 955.5467529296875, "learning_rate": 2.9529531334926864e-05, "loss": 38.7837, "step": 15962 }, { "epoch": 42.16044899306702, "grad_norm": 1918.4189453125, "learning_rate": 2.9509838030091497e-05, "loss": 38.9757, "step": 15963 }, { "epoch": 42.163090128755364, "grad_norm": 1160.97900390625, "learning_rate": 2.949015088231444e-05, "loss": 36.8847, "step": 15964 }, { "epoch": 42.165731264443714, "grad_norm": 1704.7735595703125, "learning_rate": 2.9470469892145384e-05, "loss": 38.835, "step": 15965 }, { "epoch": 42.16837240013206, "grad_norm": 1235.8819580078125, "learning_rate": 2.9450795060133873e-05, "loss": 35.6621, "step": 15966 }, { "epoch": 42.1710135358204, "grad_norm": 627.275634765625, "learning_rate": 2.9431126386829404e-05, "loss": 35.4527, "step": 15967 }, { "epoch": 42.17365467150875, "grad_norm": 1857.71630859375, "learning_rate": 2.9411463872781126e-05, "loss": 35.9418, "step": 15968 }, { "epoch": 42.17629580719709, "grad_norm": 844.733154296875, "learning_rate": 2.9391807518538176e-05, "loss": 33.8301, "step": 15969 }, { "epoch": 42.17893694288544, "grad_norm": 650.947021484375, "learning_rate": 2.93721573246494e-05, "loss": 34.295, "step": 15970 }, { "epoch": 42.181578078573786, "grad_norm": 1807.8660888671875, "learning_rate": 2.9352513291663523e-05, "loss": 34.5223, "step": 15971 }, { "epoch": 42.184219214262136, "grad_norm": 834.3553466796875, "learning_rate": 2.933287542012908e-05, "loss": 34.9758, "step": 15972 }, { "epoch": 42.18686034995048, "grad_norm": 1242.5643310546875, "learning_rate": 2.9313243710594435e-05, "loss": 35.2421, "step": 15973 }, { "epoch": 42.18950148563882, "grad_norm": 684.5458374023438, "learning_rate": 2.9293618163607872e-05, "loss": 41.8332, "step": 15974 }, { "epoch": 42.19214262132717, "grad_norm": 1091.0689697265625, "learning_rate": 2.927399877971737e-05, "loss": 17.6613, "step": 15975 }, { "epoch": 42.194783757015514, "grad_norm": 22238.634765625, "learning_rate": 2.925438555947077e-05, "loss": 11.5397, "step": 15976 }, { "epoch": 42.197424892703864, "grad_norm": 1037.1573486328125, "learning_rate": 2.9234778503415743e-05, "loss": 13.0341, "step": 15977 }, { "epoch": 42.20006602839221, "grad_norm": 320.8069152832031, "learning_rate": 2.9215177612099965e-05, "loss": 9.383, "step": 15978 }, { "epoch": 42.20270716408056, "grad_norm": 1957.07666015625, "learning_rate": 2.9195582886070578e-05, "loss": 10.659, "step": 15979 }, { "epoch": 42.2053482997689, "grad_norm": 13203.9326171875, "learning_rate": 2.917599432587478e-05, "loss": 11.4517, "step": 15980 }, { "epoch": 42.20798943545724, "grad_norm": 7947.59423828125, "learning_rate": 2.91564119320597e-05, "loss": 10.9453, "step": 15981 }, { "epoch": 42.21063057114559, "grad_norm": 1465.3935546875, "learning_rate": 2.913683570517206e-05, "loss": 10.3573, "step": 15982 }, { "epoch": 42.213271706833936, "grad_norm": 2865.482421875, "learning_rate": 2.9117265645758533e-05, "loss": 13.7018, "step": 15983 }, { "epoch": 42.215912842522286, "grad_norm": 1067.3297119140625, "learning_rate": 2.9097701754365575e-05, "loss": 18.1954, "step": 15984 }, { "epoch": 42.21855397821063, "grad_norm": 1361.952392578125, "learning_rate": 2.9078144031539576e-05, "loss": 36.264, "step": 15985 }, { "epoch": 42.22119511389898, "grad_norm": 921.514892578125, "learning_rate": 2.9058592477826635e-05, "loss": 35.2694, "step": 15986 }, { "epoch": 42.22383624958732, "grad_norm": 644.9981689453125, "learning_rate": 2.9039047093772723e-05, "loss": 33.6149, "step": 15987 }, { "epoch": 42.22647738527567, "grad_norm": 1804.204345703125, "learning_rate": 2.9019507879923607e-05, "loss": 36.1868, "step": 15988 }, { "epoch": 42.229118520964015, "grad_norm": 860.2958984375, "learning_rate": 2.899997483682493e-05, "loss": 34.5746, "step": 15989 }, { "epoch": 42.23175965665236, "grad_norm": 709.672607421875, "learning_rate": 2.8980447965022112e-05, "loss": 34.3518, "step": 15990 }, { "epoch": 42.23440079234071, "grad_norm": 1092.0228271484375, "learning_rate": 2.8960927265060417e-05, "loss": 34.5084, "step": 15991 }, { "epoch": 42.23704192802905, "grad_norm": 1855.5531005859375, "learning_rate": 2.8941412737485017e-05, "loss": 33.8458, "step": 15992 }, { "epoch": 42.2396830637174, "grad_norm": 890.01416015625, "learning_rate": 2.8921904382840843e-05, "loss": 33.5209, "step": 15993 }, { "epoch": 42.24232419940574, "grad_norm": 1104.2142333984375, "learning_rate": 2.8902402201672533e-05, "loss": 36.3742, "step": 15994 }, { "epoch": 42.24496533509409, "grad_norm": 1198.16943359375, "learning_rate": 2.888290619452488e-05, "loss": 34.1813, "step": 15995 }, { "epoch": 42.247606470782436, "grad_norm": 2450.63330078125, "learning_rate": 2.8863416361942148e-05, "loss": 35.6736, "step": 15996 }, { "epoch": 42.25024760647078, "grad_norm": 2897.88525390625, "learning_rate": 2.8843932704468586e-05, "loss": 33.9623, "step": 15997 }, { "epoch": 42.25288874215913, "grad_norm": 2490.287353515625, "learning_rate": 2.8824455222648233e-05, "loss": 33.1014, "step": 15998 }, { "epoch": 42.25552987784747, "grad_norm": 1594.55224609375, "learning_rate": 2.880498391702513e-05, "loss": 34.3778, "step": 15999 }, { "epoch": 42.25817101353582, "grad_norm": 1021.4105224609375, "learning_rate": 2.8785518788142866e-05, "loss": 35.1502, "step": 16000 }, { "epoch": 42.25817101353582, "eval_loss": 3.8322572708129883, "eval_runtime": 2.1328, "eval_samples_per_second": 232.088, "eval_steps_per_second": 29.07, "step": 16000 }, { "epoch": 42.260812149224165, "grad_norm": 934.1201171875, "learning_rate": 2.8766059836545013e-05, "loss": 35.5244, "step": 16001 }, { "epoch": 42.263453284912515, "grad_norm": 2111.020751953125, "learning_rate": 2.8746607062775044e-05, "loss": 37.1023, "step": 16002 }, { "epoch": 42.26609442060086, "grad_norm": 1100.68115234375, "learning_rate": 2.8727160467376086e-05, "loss": 39.3459, "step": 16003 }, { "epoch": 42.2687355562892, "grad_norm": 423.2701110839844, "learning_rate": 2.87077200508912e-05, "loss": 38.7712, "step": 16004 }, { "epoch": 42.27137669197755, "grad_norm": 534.8998413085938, "learning_rate": 2.8688285813863235e-05, "loss": 37.9154, "step": 16005 }, { "epoch": 42.274017827665894, "grad_norm": 561.9126586914062, "learning_rate": 2.8668857756834867e-05, "loss": 40.5005, "step": 16006 }, { "epoch": 42.276658963354244, "grad_norm": 744.7254028320312, "learning_rate": 2.864943588034863e-05, "loss": 42.3561, "step": 16007 }, { "epoch": 42.27930009904259, "grad_norm": 792.5721435546875, "learning_rate": 2.8630020184946842e-05, "loss": 40.9461, "step": 16008 }, { "epoch": 42.28194123473094, "grad_norm": 1384.4422607421875, "learning_rate": 2.8610610671171737e-05, "loss": 40.5714, "step": 16009 }, { "epoch": 42.28458237041928, "grad_norm": 1299.25146484375, "learning_rate": 2.8591207339565274e-05, "loss": 39.8708, "step": 16010 }, { "epoch": 42.28722350610763, "grad_norm": 1074.118408203125, "learning_rate": 2.857181019066929e-05, "loss": 39.6105, "step": 16011 }, { "epoch": 42.28986464179597, "grad_norm": 544.7572631835938, "learning_rate": 2.8552419225025385e-05, "loss": 37.2455, "step": 16012 }, { "epoch": 42.292505777484315, "grad_norm": 2289.744384765625, "learning_rate": 2.8533034443175176e-05, "loss": 37.8041, "step": 16013 }, { "epoch": 42.295146913172665, "grad_norm": 828.7776489257812, "learning_rate": 2.851365584565982e-05, "loss": 38.1747, "step": 16014 }, { "epoch": 42.29778804886101, "grad_norm": 745.34814453125, "learning_rate": 2.849428343302049e-05, "loss": 37.0965, "step": 16015 }, { "epoch": 42.30042918454936, "grad_norm": 489.4957580566406, "learning_rate": 2.847491720579823e-05, "loss": 35.3396, "step": 16016 }, { "epoch": 42.3030703202377, "grad_norm": 1060.8577880859375, "learning_rate": 2.845555716453377e-05, "loss": 35.8469, "step": 16017 }, { "epoch": 42.30571145592605, "grad_norm": 1674.154052734375, "learning_rate": 2.8436203309767734e-05, "loss": 35.696, "step": 16018 }, { "epoch": 42.308352591614394, "grad_norm": 1130.2532958984375, "learning_rate": 2.8416855642040497e-05, "loss": 35.3761, "step": 16019 }, { "epoch": 42.31099372730274, "grad_norm": 1185.0640869140625, "learning_rate": 2.8397514161892484e-05, "loss": 34.7405, "step": 16020 }, { "epoch": 42.31363486299109, "grad_norm": 1416.5574951171875, "learning_rate": 2.8378178869863686e-05, "loss": 34.7318, "step": 16021 }, { "epoch": 42.31627599867943, "grad_norm": 604.9564819335938, "learning_rate": 2.8358849766494084e-05, "loss": 35.4092, "step": 16022 }, { "epoch": 42.31891713436778, "grad_norm": 531.0277099609375, "learning_rate": 2.8339526852323383e-05, "loss": 35.2905, "step": 16023 }, { "epoch": 42.32155827005612, "grad_norm": 1925.05029296875, "learning_rate": 2.832021012789118e-05, "loss": 35.2811, "step": 16024 }, { "epoch": 42.32419940574447, "grad_norm": 11817.0849609375, "learning_rate": 2.8300899593736885e-05, "loss": 27.1396, "step": 16025 }, { "epoch": 42.326840541432816, "grad_norm": 1255.1123046875, "learning_rate": 2.8281595250399694e-05, "loss": 9.5483, "step": 16026 }, { "epoch": 42.32948167712116, "grad_norm": 7104.24267578125, "learning_rate": 2.8262297098418764e-05, "loss": 10.0556, "step": 16027 }, { "epoch": 42.33212281280951, "grad_norm": 1351.251953125, "learning_rate": 2.8243005138332916e-05, "loss": 11.9439, "step": 16028 }, { "epoch": 42.33476394849785, "grad_norm": 676.3833618164062, "learning_rate": 2.8223719370680856e-05, "loss": 10.3294, "step": 16029 }, { "epoch": 42.3374050841862, "grad_norm": 1506.13916015625, "learning_rate": 2.8204439796001235e-05, "loss": 14.2568, "step": 16030 }, { "epoch": 42.340046219874544, "grad_norm": 1088.5494384765625, "learning_rate": 2.818516641483232e-05, "loss": 13.5747, "step": 16031 }, { "epoch": 42.342687355562894, "grad_norm": 602.4264526367188, "learning_rate": 2.8165899227712317e-05, "loss": 15.6998, "step": 16032 }, { "epoch": 42.34532849125124, "grad_norm": 785.3316040039062, "learning_rate": 2.8146638235179213e-05, "loss": 11.9627, "step": 16033 }, { "epoch": 42.34796962693959, "grad_norm": 7499.16357421875, "learning_rate": 2.8127383437770938e-05, "loss": 8.5715, "step": 16034 }, { "epoch": 42.35061076262793, "grad_norm": 956.0536499023438, "learning_rate": 2.8108134836025174e-05, "loss": 26.4734, "step": 16035 }, { "epoch": 42.35325189831627, "grad_norm": 700.001708984375, "learning_rate": 2.808889243047935e-05, "loss": 35.0694, "step": 16036 }, { "epoch": 42.35589303400462, "grad_norm": 602.2323608398438, "learning_rate": 2.8069656221670904e-05, "loss": 34.0358, "step": 16037 }, { "epoch": 42.358534169692966, "grad_norm": 723.2647094726562, "learning_rate": 2.8050426210136897e-05, "loss": 34.3306, "step": 16038 }, { "epoch": 42.361175305381316, "grad_norm": 994.0838012695312, "learning_rate": 2.8031202396414407e-05, "loss": 34.4074, "step": 16039 }, { "epoch": 42.36381644106966, "grad_norm": 938.9498291015625, "learning_rate": 2.8011984781040168e-05, "loss": 34.2913, "step": 16040 }, { "epoch": 42.36645757675801, "grad_norm": 1627.263671875, "learning_rate": 2.799277336455086e-05, "loss": 35.1747, "step": 16041 }, { "epoch": 42.36909871244635, "grad_norm": 868.9473266601562, "learning_rate": 2.7973568147482946e-05, "loss": 34.7019, "step": 16042 }, { "epoch": 42.371739848134695, "grad_norm": 1380.1409912109375, "learning_rate": 2.7954369130372665e-05, "loss": 33.9483, "step": 16043 }, { "epoch": 42.374380983823045, "grad_norm": 1477.89453125, "learning_rate": 2.793517631375625e-05, "loss": 34.4744, "step": 16044 }, { "epoch": 42.37702211951139, "grad_norm": 1350.4559326171875, "learning_rate": 2.791598969816955e-05, "loss": 33.7201, "step": 16045 }, { "epoch": 42.37966325519974, "grad_norm": 656.7791137695312, "learning_rate": 2.7896809284148422e-05, "loss": 35.1277, "step": 16046 }, { "epoch": 42.38230439088808, "grad_norm": 1477.2916259765625, "learning_rate": 2.7877635072228347e-05, "loss": 34.5949, "step": 16047 }, { "epoch": 42.38494552657643, "grad_norm": 1279.8006591796875, "learning_rate": 2.7858467062944954e-05, "loss": 34.0724, "step": 16048 }, { "epoch": 42.38758666226477, "grad_norm": 3295.090087890625, "learning_rate": 2.7839305256833315e-05, "loss": 35.0817, "step": 16049 }, { "epoch": 42.390227797953116, "grad_norm": 3030.2216796875, "learning_rate": 2.78201496544285e-05, "loss": 35.305, "step": 16050 }, { "epoch": 42.392868933641466, "grad_norm": 690.006103515625, "learning_rate": 2.7801000256265556e-05, "loss": 37.3129, "step": 16051 }, { "epoch": 42.39551006932981, "grad_norm": 1647.6834716796875, "learning_rate": 2.7781857062879167e-05, "loss": 37.7094, "step": 16052 }, { "epoch": 42.39815120501816, "grad_norm": 1901.9749755859375, "learning_rate": 2.7762720074803877e-05, "loss": 41.8183, "step": 16053 }, { "epoch": 42.4007923407065, "grad_norm": 579.271484375, "learning_rate": 2.7743589292574033e-05, "loss": 39.4659, "step": 16054 }, { "epoch": 42.40343347639485, "grad_norm": 1296.2115478515625, "learning_rate": 2.7724464716723935e-05, "loss": 39.7108, "step": 16055 }, { "epoch": 42.406074612083195, "grad_norm": 954.3048095703125, "learning_rate": 2.7705346347787598e-05, "loss": 39.0834, "step": 16056 }, { "epoch": 42.408715747771545, "grad_norm": 1390.8953857421875, "learning_rate": 2.7686234186298868e-05, "loss": 42.1205, "step": 16057 }, { "epoch": 42.41135688345989, "grad_norm": 552.527099609375, "learning_rate": 2.7667128232791466e-05, "loss": 40.6982, "step": 16058 }, { "epoch": 42.41399801914823, "grad_norm": 2237.12841796875, "learning_rate": 2.7648028487798875e-05, "loss": 41.9313, "step": 16059 }, { "epoch": 42.41663915483658, "grad_norm": 683.2776489257812, "learning_rate": 2.7628934951854506e-05, "loss": 39.7922, "step": 16060 }, { "epoch": 42.419280290524924, "grad_norm": 684.0894165039062, "learning_rate": 2.760984762549143e-05, "loss": 38.4356, "step": 16061 }, { "epoch": 42.421921426213274, "grad_norm": 1454.34326171875, "learning_rate": 2.7590766509242805e-05, "loss": 40.623, "step": 16062 }, { "epoch": 42.42456256190162, "grad_norm": 484.5628356933594, "learning_rate": 2.757169160364134e-05, "loss": 39.928, "step": 16063 }, { "epoch": 42.42720369758997, "grad_norm": 1300.5811767578125, "learning_rate": 2.755262290921967e-05, "loss": 39.0731, "step": 16064 }, { "epoch": 42.42984483327831, "grad_norm": 761.1798706054688, "learning_rate": 2.753356042651045e-05, "loss": 37.661, "step": 16065 }, { "epoch": 42.43248596896665, "grad_norm": 1464.781494140625, "learning_rate": 2.7514504156045837e-05, "loss": 37.1631, "step": 16066 }, { "epoch": 42.435127104655, "grad_norm": 660.23486328125, "learning_rate": 2.7495454098357963e-05, "loss": 35.0478, "step": 16067 }, { "epoch": 42.437768240343345, "grad_norm": 611.1641235351562, "learning_rate": 2.747641025397882e-05, "loss": 35.3277, "step": 16068 }, { "epoch": 42.440409376031695, "grad_norm": 877.8240356445312, "learning_rate": 2.7457372623440223e-05, "loss": 35.3178, "step": 16069 }, { "epoch": 42.44305051172004, "grad_norm": 904.6976318359375, "learning_rate": 2.743834120727376e-05, "loss": 35.0816, "step": 16070 }, { "epoch": 42.44569164740839, "grad_norm": 1333.3798828125, "learning_rate": 2.7419316006010858e-05, "loss": 36.0112, "step": 16071 }, { "epoch": 42.44833278309673, "grad_norm": 2071.600341796875, "learning_rate": 2.7400297020182817e-05, "loss": 35.5556, "step": 16072 }, { "epoch": 42.450973918785074, "grad_norm": 1421.302001953125, "learning_rate": 2.738128425032074e-05, "loss": 35.2631, "step": 16073 }, { "epoch": 42.453615054473424, "grad_norm": 552.97607421875, "learning_rate": 2.736227769695554e-05, "loss": 39.6182, "step": 16074 }, { "epoch": 42.45625619016177, "grad_norm": 6241.59716796875, "learning_rate": 2.734327736061795e-05, "loss": 28.4573, "step": 16075 }, { "epoch": 42.45889732585012, "grad_norm": 3397.124267578125, "learning_rate": 2.7324283241838527e-05, "loss": 11.9608, "step": 16076 }, { "epoch": 42.46153846153846, "grad_norm": 452.08062744140625, "learning_rate": 2.73052953411477e-05, "loss": 10.6996, "step": 16077 }, { "epoch": 42.46417959722681, "grad_norm": 831.1965942382812, "learning_rate": 2.7286313659075634e-05, "loss": 12.3581, "step": 16078 }, { "epoch": 42.46682073291515, "grad_norm": 7961.701171875, "learning_rate": 2.7267338196152457e-05, "loss": 12.7819, "step": 16079 }, { "epoch": 42.4694618686035, "grad_norm": 2057.50146484375, "learning_rate": 2.7248368952908055e-05, "loss": 10.3715, "step": 16080 }, { "epoch": 42.472103004291846, "grad_norm": 1240.4866943359375, "learning_rate": 2.722940592987208e-05, "loss": 12.375, "step": 16081 }, { "epoch": 42.47474413998019, "grad_norm": 554.7084350585938, "learning_rate": 2.721044912757403e-05, "loss": 9.0954, "step": 16082 }, { "epoch": 42.47738527566854, "grad_norm": 740.7442016601562, "learning_rate": 2.71914985465434e-05, "loss": 16.7869, "step": 16083 }, { "epoch": 42.48002641135688, "grad_norm": 2424.5634765625, "learning_rate": 2.7172554187309267e-05, "loss": 27.828, "step": 16084 }, { "epoch": 42.48266754704523, "grad_norm": 618.604248046875, "learning_rate": 2.7153616050400592e-05, "loss": 36.3048, "step": 16085 }, { "epoch": 42.485308682733574, "grad_norm": 1909.4517822265625, "learning_rate": 2.7134684136346344e-05, "loss": 35.7925, "step": 16086 }, { "epoch": 42.487949818421924, "grad_norm": 1999.91455078125, "learning_rate": 2.7115758445675127e-05, "loss": 37.3745, "step": 16087 }, { "epoch": 42.49059095411027, "grad_norm": 562.7091064453125, "learning_rate": 2.7096838978915433e-05, "loss": 34.612, "step": 16088 }, { "epoch": 42.49323208979861, "grad_norm": 1896.64990234375, "learning_rate": 2.7077925736595483e-05, "loss": 35.2179, "step": 16089 }, { "epoch": 42.49587322548696, "grad_norm": 1754.4810791015625, "learning_rate": 2.70590187192436e-05, "loss": 34.833, "step": 16090 }, { "epoch": 42.4985143611753, "grad_norm": 1275.0394287109375, "learning_rate": 2.7040117927387614e-05, "loss": 34.7082, "step": 16091 }, { "epoch": 42.50115549686365, "grad_norm": 916.8746948242188, "learning_rate": 2.702122336155538e-05, "loss": 35.0839, "step": 16092 }, { "epoch": 42.503796632551996, "grad_norm": 600.5023193359375, "learning_rate": 2.7002335022274503e-05, "loss": 33.3465, "step": 16093 }, { "epoch": 42.506437768240346, "grad_norm": 1729.4332275390625, "learning_rate": 2.6983452910072424e-05, "loss": 33.8964, "step": 16094 }, { "epoch": 42.50907890392869, "grad_norm": 1428.09130859375, "learning_rate": 2.696457702547639e-05, "loss": 33.6807, "step": 16095 }, { "epoch": 42.51172003961703, "grad_norm": 968.3765258789062, "learning_rate": 2.6945707369013526e-05, "loss": 34.5261, "step": 16096 }, { "epoch": 42.51436117530538, "grad_norm": 963.2916259765625, "learning_rate": 2.6926843941210782e-05, "loss": 33.4175, "step": 16097 }, { "epoch": 42.517002310993725, "grad_norm": 1233.781982421875, "learning_rate": 2.690798674259487e-05, "loss": 34.3923, "step": 16098 }, { "epoch": 42.519643446682075, "grad_norm": 1104.35107421875, "learning_rate": 2.688913577369234e-05, "loss": 34.5536, "step": 16099 }, { "epoch": 42.52228458237042, "grad_norm": 583.0357666015625, "learning_rate": 2.687029103502972e-05, "loss": 34.0629, "step": 16100 }, { "epoch": 42.52492571805877, "grad_norm": 2461.132568359375, "learning_rate": 2.6851452527133114e-05, "loss": 36.8282, "step": 16101 }, { "epoch": 42.52756685374711, "grad_norm": 3367.23681640625, "learning_rate": 2.6832620250528543e-05, "loss": 39.595, "step": 16102 }, { "epoch": 42.53020798943546, "grad_norm": 1180.19580078125, "learning_rate": 2.681379420574201e-05, "loss": 39.6884, "step": 16103 }, { "epoch": 42.5328491251238, "grad_norm": 576.51708984375, "learning_rate": 2.6794974393299172e-05, "loss": 38.4277, "step": 16104 }, { "epoch": 42.535490260812146, "grad_norm": 892.351318359375, "learning_rate": 2.6776160813725552e-05, "loss": 38.8031, "step": 16105 }, { "epoch": 42.538131396500496, "grad_norm": 1063.154541015625, "learning_rate": 2.6757353467546487e-05, "loss": 40.7275, "step": 16106 }, { "epoch": 42.54077253218884, "grad_norm": 801.2911987304688, "learning_rate": 2.6738552355287215e-05, "loss": 40.5066, "step": 16107 }, { "epoch": 42.54341366787719, "grad_norm": 1131.91259765625, "learning_rate": 2.671975747747271e-05, "loss": 40.9471, "step": 16108 }, { "epoch": 42.54605480356553, "grad_norm": 735.3983764648438, "learning_rate": 2.670096883462783e-05, "loss": 42.0623, "step": 16109 }, { "epoch": 42.54869593925388, "grad_norm": 1710.615966796875, "learning_rate": 2.668218642727721e-05, "loss": 39.9028, "step": 16110 }, { "epoch": 42.551337074942225, "grad_norm": 1075.779296875, "learning_rate": 2.6663410255945347e-05, "loss": 40.7072, "step": 16111 }, { "epoch": 42.55397821063057, "grad_norm": 633.3489379882812, "learning_rate": 2.6644640321156572e-05, "loss": 38.359, "step": 16112 }, { "epoch": 42.55661934631892, "grad_norm": 442.97845458984375, "learning_rate": 2.6625876623434965e-05, "loss": 38.666, "step": 16113 }, { "epoch": 42.55926048200726, "grad_norm": 489.4508972167969, "learning_rate": 2.6607119163304578e-05, "loss": 38.4377, "step": 16114 }, { "epoch": 42.56190161769561, "grad_norm": 1485.5145263671875, "learning_rate": 2.658836794128916e-05, "loss": 37.3732, "step": 16115 }, { "epoch": 42.564542753383954, "grad_norm": 388.8692321777344, "learning_rate": 2.6569622957912264e-05, "loss": 37.3925, "step": 16116 }, { "epoch": 42.567183889072304, "grad_norm": 626.218994140625, "learning_rate": 2.6550884213697474e-05, "loss": 34.438, "step": 16117 }, { "epoch": 42.56982502476065, "grad_norm": 924.1160278320312, "learning_rate": 2.6532151709168007e-05, "loss": 34.8783, "step": 16118 }, { "epoch": 42.572466160449, "grad_norm": 1630.5595703125, "learning_rate": 2.651342544484689e-05, "loss": 34.6068, "step": 16119 }, { "epoch": 42.57510729613734, "grad_norm": 1080.6727294921875, "learning_rate": 2.6494705421257036e-05, "loss": 35.8316, "step": 16120 }, { "epoch": 42.57774843182568, "grad_norm": 794.6130981445312, "learning_rate": 2.6475991638921283e-05, "loss": 34.9264, "step": 16121 }, { "epoch": 42.58038956751403, "grad_norm": 815.2269287109375, "learning_rate": 2.6457284098362177e-05, "loss": 34.2619, "step": 16122 }, { "epoch": 42.583030703202375, "grad_norm": 772.1775512695312, "learning_rate": 2.643858280010203e-05, "loss": 34.3832, "step": 16123 }, { "epoch": 42.585671838890725, "grad_norm": 2076.310546875, "learning_rate": 2.6419887744663166e-05, "loss": 39.4924, "step": 16124 }, { "epoch": 42.58831297457907, "grad_norm": 1598.0704345703125, "learning_rate": 2.640119893256762e-05, "loss": 25.6319, "step": 16125 }, { "epoch": 42.59095411026742, "grad_norm": 2656.8037109375, "learning_rate": 2.638251636433725e-05, "loss": 11.2845, "step": 16126 }, { "epoch": 42.59359524595576, "grad_norm": 38722.12890625, "learning_rate": 2.6363840040493748e-05, "loss": 14.0417, "step": 16127 }, { "epoch": 42.596236381644104, "grad_norm": 1323.081298828125, "learning_rate": 2.634516996155864e-05, "loss": 9.3531, "step": 16128 }, { "epoch": 42.598877517332454, "grad_norm": 4346.14990234375, "learning_rate": 2.632650612805326e-05, "loss": 9.7881, "step": 16129 }, { "epoch": 42.6015186530208, "grad_norm": 2863.268310546875, "learning_rate": 2.630784854049878e-05, "loss": 10.752, "step": 16130 }, { "epoch": 42.60415978870915, "grad_norm": 1469.3839111328125, "learning_rate": 2.628919719941625e-05, "loss": 16.792, "step": 16131 }, { "epoch": 42.60680092439749, "grad_norm": 1631.258056640625, "learning_rate": 2.62705521053265e-05, "loss": 10.0781, "step": 16132 }, { "epoch": 42.60944206008584, "grad_norm": 7286.15478515625, "learning_rate": 2.6251913258750148e-05, "loss": 13.404, "step": 16133 }, { "epoch": 42.61208319577418, "grad_norm": 1173.9659423828125, "learning_rate": 2.6233280660207637e-05, "loss": 14.6255, "step": 16134 }, { "epoch": 42.614724331462526, "grad_norm": 878.60205078125, "learning_rate": 2.6214654310219382e-05, "loss": 35.8545, "step": 16135 }, { "epoch": 42.617365467150876, "grad_norm": 1539.10986328125, "learning_rate": 2.619603420930544e-05, "loss": 33.7535, "step": 16136 }, { "epoch": 42.62000660283922, "grad_norm": 857.077392578125, "learning_rate": 2.6177420357985704e-05, "loss": 35.5002, "step": 16137 }, { "epoch": 42.62264773852757, "grad_norm": 602.2918701171875, "learning_rate": 2.615881275678006e-05, "loss": 35.3513, "step": 16138 }, { "epoch": 42.62528887421591, "grad_norm": 779.9974975585938, "learning_rate": 2.614021140620809e-05, "loss": 33.3267, "step": 16139 }, { "epoch": 42.62793000990426, "grad_norm": 1135.8226318359375, "learning_rate": 2.612161630678922e-05, "loss": 34.5, "step": 16140 }, { "epoch": 42.630571145592604, "grad_norm": 528.3114624023438, "learning_rate": 2.6103027459042666e-05, "loss": 33.6474, "step": 16141 }, { "epoch": 42.63321228128095, "grad_norm": 678.0059204101562, "learning_rate": 2.6084444863487573e-05, "loss": 33.7367, "step": 16142 }, { "epoch": 42.6358534169693, "grad_norm": 1175.470703125, "learning_rate": 2.6065868520642828e-05, "loss": 34.5701, "step": 16143 }, { "epoch": 42.63849455265764, "grad_norm": 491.4538879394531, "learning_rate": 2.6047298431027157e-05, "loss": 34.5775, "step": 16144 }, { "epoch": 42.64113568834599, "grad_norm": 2571.50927734375, "learning_rate": 2.6028734595159115e-05, "loss": 34.3825, "step": 16145 }, { "epoch": 42.64377682403433, "grad_norm": 1206.624267578125, "learning_rate": 2.6010177013557095e-05, "loss": 34.4165, "step": 16146 }, { "epoch": 42.64641795972268, "grad_norm": 791.8714599609375, "learning_rate": 2.5991625686739323e-05, "loss": 34.3564, "step": 16147 }, { "epoch": 42.649059095411026, "grad_norm": 2246.59765625, "learning_rate": 2.5973080615223742e-05, "loss": 33.4235, "step": 16148 }, { "epoch": 42.651700231099376, "grad_norm": 2472.560546875, "learning_rate": 2.595454179952833e-05, "loss": 33.2638, "step": 16149 }, { "epoch": 42.65434136678772, "grad_norm": 788.3953857421875, "learning_rate": 2.5936009240170725e-05, "loss": 36.2132, "step": 16150 }, { "epoch": 42.65698250247606, "grad_norm": 5147.91943359375, "learning_rate": 2.5917482937668408e-05, "loss": 34.8103, "step": 16151 }, { "epoch": 42.65962363816441, "grad_norm": 1239.751220703125, "learning_rate": 2.589896289253879e-05, "loss": 37.6174, "step": 16152 }, { "epoch": 42.662264773852755, "grad_norm": 622.468017578125, "learning_rate": 2.5880449105299048e-05, "loss": 41.2541, "step": 16153 }, { "epoch": 42.664905909541105, "grad_norm": 918.38134765625, "learning_rate": 2.5861941576466042e-05, "loss": 39.5967, "step": 16154 }, { "epoch": 42.66754704522945, "grad_norm": 1288.7757568359375, "learning_rate": 2.584344030655661e-05, "loss": 37.3721, "step": 16155 }, { "epoch": 42.6701881809178, "grad_norm": 999.4424438476562, "learning_rate": 2.5824945296087475e-05, "loss": 38.7035, "step": 16156 }, { "epoch": 42.67282931660614, "grad_norm": 1133.7977294921875, "learning_rate": 2.5806456545575064e-05, "loss": 41.4297, "step": 16157 }, { "epoch": 42.67547045229448, "grad_norm": 2589.371826171875, "learning_rate": 2.5787974055535624e-05, "loss": 39.6268, "step": 16158 }, { "epoch": 42.67811158798283, "grad_norm": 650.9584350585938, "learning_rate": 2.5769497826485327e-05, "loss": 44.3429, "step": 16159 }, { "epoch": 42.680752723671176, "grad_norm": 1347.228759765625, "learning_rate": 2.5751027858940067e-05, "loss": 40.4255, "step": 16160 }, { "epoch": 42.683393859359526, "grad_norm": 785.663818359375, "learning_rate": 2.5732564153415654e-05, "loss": 41.4092, "step": 16161 }, { "epoch": 42.68603499504787, "grad_norm": 1405.2833251953125, "learning_rate": 2.5714106710427622e-05, "loss": 39.43, "step": 16162 }, { "epoch": 42.68867613073622, "grad_norm": 832.1963500976562, "learning_rate": 2.5695655530491414e-05, "loss": 38.5446, "step": 16163 }, { "epoch": 42.69131726642456, "grad_norm": 599.8563842773438, "learning_rate": 2.567721061412226e-05, "loss": 37.9537, "step": 16164 }, { "epoch": 42.69395840211291, "grad_norm": 760.8855590820312, "learning_rate": 2.565877196183519e-05, "loss": 36.5546, "step": 16165 }, { "epoch": 42.696599537801255, "grad_norm": 882.7312622070312, "learning_rate": 2.5640339574145162e-05, "loss": 36.527, "step": 16166 }, { "epoch": 42.6992406734896, "grad_norm": 1030.2362060546875, "learning_rate": 2.5621913451566862e-05, "loss": 34.851, "step": 16167 }, { "epoch": 42.70188180917795, "grad_norm": 780.09521484375, "learning_rate": 2.5603493594614836e-05, "loss": 36.3016, "step": 16168 }, { "epoch": 42.70452294486629, "grad_norm": 699.352783203125, "learning_rate": 2.5585080003803356e-05, "loss": 34.5194, "step": 16169 }, { "epoch": 42.70716408055464, "grad_norm": 617.6963500976562, "learning_rate": 2.556667267964677e-05, "loss": 35.021, "step": 16170 }, { "epoch": 42.709805216242984, "grad_norm": 712.5615234375, "learning_rate": 2.554827162265902e-05, "loss": 34.3531, "step": 16171 }, { "epoch": 42.712446351931334, "grad_norm": 573.57666015625, "learning_rate": 2.5529876833353865e-05, "loss": 34.8301, "step": 16172 }, { "epoch": 42.71508748761968, "grad_norm": 1799.79736328125, "learning_rate": 2.551148831224509e-05, "loss": 34.1779, "step": 16173 }, { "epoch": 42.71772862330802, "grad_norm": 15406.2744140625, "learning_rate": 2.5493106059846115e-05, "loss": 41.4549, "step": 16174 }, { "epoch": 42.72036975899637, "grad_norm": 2075.182373046875, "learning_rate": 2.5474730076670282e-05, "loss": 19.521, "step": 16175 }, { "epoch": 42.72301089468471, "grad_norm": 1517.4815673828125, "learning_rate": 2.545636036323065e-05, "loss": 11.141, "step": 16176 }, { "epoch": 42.72565203037306, "grad_norm": 1437.509033203125, "learning_rate": 2.543799692004031e-05, "loss": 10.0147, "step": 16177 }, { "epoch": 42.728293166061405, "grad_norm": 1929.021728515625, "learning_rate": 2.541963974761202e-05, "loss": 12.3249, "step": 16178 }, { "epoch": 42.730934301749755, "grad_norm": 866.04345703125, "learning_rate": 2.540128884645834e-05, "loss": 10.4073, "step": 16179 }, { "epoch": 42.7335754374381, "grad_norm": 3320.251220703125, "learning_rate": 2.5382944217091723e-05, "loss": 10.812, "step": 16180 }, { "epoch": 42.73621657312644, "grad_norm": 8719.8564453125, "learning_rate": 2.5364605860024452e-05, "loss": 11.0725, "step": 16181 }, { "epoch": 42.73885770881479, "grad_norm": 4420.3203125, "learning_rate": 2.5346273775768617e-05, "loss": 10.8165, "step": 16182 }, { "epoch": 42.741498844503134, "grad_norm": 770.2809448242188, "learning_rate": 2.5327947964836034e-05, "loss": 17.0434, "step": 16183 }, { "epoch": 42.744139980191484, "grad_norm": 921.3418579101562, "learning_rate": 2.5309628427738596e-05, "loss": 16.0096, "step": 16184 }, { "epoch": 42.74678111587983, "grad_norm": 763.852294921875, "learning_rate": 2.529131516498781e-05, "loss": 34.5085, "step": 16185 }, { "epoch": 42.74942225156818, "grad_norm": 929.2619018554688, "learning_rate": 2.5273008177094964e-05, "loss": 34.4808, "step": 16186 }, { "epoch": 42.75206338725652, "grad_norm": 2686.068603515625, "learning_rate": 2.5254707464571425e-05, "loss": 34.4488, "step": 16187 }, { "epoch": 42.75470452294486, "grad_norm": 718.1748046875, "learning_rate": 2.52364130279282e-05, "loss": 34.7727, "step": 16188 }, { "epoch": 42.75734565863321, "grad_norm": 1268.18798828125, "learning_rate": 2.521812486767608e-05, "loss": 33.7566, "step": 16189 }, { "epoch": 42.759986794321556, "grad_norm": 1910.88720703125, "learning_rate": 2.5199842984325706e-05, "loss": 33.7667, "step": 16190 }, { "epoch": 42.762627930009906, "grad_norm": 2648.51025390625, "learning_rate": 2.51815673783877e-05, "loss": 34.4418, "step": 16191 }, { "epoch": 42.76526906569825, "grad_norm": 1073.1434326171875, "learning_rate": 2.5163298050372373e-05, "loss": 34.3442, "step": 16192 }, { "epoch": 42.7679102013866, "grad_norm": 971.9597778320312, "learning_rate": 2.5145035000789824e-05, "loss": 34.8754, "step": 16193 }, { "epoch": 42.77055133707494, "grad_norm": 1193.552001953125, "learning_rate": 2.512677823015014e-05, "loss": 34.7732, "step": 16194 }, { "epoch": 42.77319247276329, "grad_norm": 1505.268798828125, "learning_rate": 2.510852773896308e-05, "loss": 36.3783, "step": 16195 }, { "epoch": 42.775833608451634, "grad_norm": 1289.813720703125, "learning_rate": 2.509028352773826e-05, "loss": 34.3174, "step": 16196 }, { "epoch": 42.77847474413998, "grad_norm": 1387.0386962890625, "learning_rate": 2.5072045596985172e-05, "loss": 35.6982, "step": 16197 }, { "epoch": 42.78111587982833, "grad_norm": 1243.9796142578125, "learning_rate": 2.5053813947213068e-05, "loss": 33.4136, "step": 16198 }, { "epoch": 42.78375701551667, "grad_norm": 633.9259033203125, "learning_rate": 2.5035588578931067e-05, "loss": 34.6893, "step": 16199 }, { "epoch": 42.78639815120502, "grad_norm": 1660.4183349609375, "learning_rate": 2.5017369492648044e-05, "loss": 35.9114, "step": 16200 }, { "epoch": 42.78639815120502, "eval_loss": 3.710871934890747, "eval_runtime": 2.1346, "eval_samples_per_second": 231.892, "eval_steps_per_second": 29.045, "step": 16200 }, { "epoch": 42.78903928689336, "grad_norm": 2886.264892578125, "learning_rate": 2.499915668887287e-05, "loss": 36.0725, "step": 16201 }, { "epoch": 42.79168042258171, "grad_norm": 891.5411376953125, "learning_rate": 2.498095016811408e-05, "loss": 38.1159, "step": 16202 }, { "epoch": 42.794321558270056, "grad_norm": 1098.910888671875, "learning_rate": 2.4962749930880073e-05, "loss": 40.348, "step": 16203 }, { "epoch": 42.7969626939584, "grad_norm": 1072.707763671875, "learning_rate": 2.494455597767903e-05, "loss": 37.6926, "step": 16204 }, { "epoch": 42.79960382964675, "grad_norm": 1501.6343994140625, "learning_rate": 2.4926368309019122e-05, "loss": 39.6687, "step": 16205 }, { "epoch": 42.80224496533509, "grad_norm": 965.1445922851562, "learning_rate": 2.4908186925408193e-05, "loss": 39.4511, "step": 16206 }, { "epoch": 42.80488610102344, "grad_norm": 603.2761840820312, "learning_rate": 2.4890011827353814e-05, "loss": 40.222, "step": 16207 }, { "epoch": 42.807527236711785, "grad_norm": 1097.9178466796875, "learning_rate": 2.487184301536369e-05, "loss": 42.06, "step": 16208 }, { "epoch": 42.810168372400135, "grad_norm": 614.2516479492188, "learning_rate": 2.48536804899451e-05, "loss": 40.4336, "step": 16209 }, { "epoch": 42.81280950808848, "grad_norm": 1152.41015625, "learning_rate": 2.4835524251605206e-05, "loss": 41.4067, "step": 16210 }, { "epoch": 42.81545064377683, "grad_norm": 4533.669921875, "learning_rate": 2.4817374300850988e-05, "loss": 40.1348, "step": 16211 }, { "epoch": 42.81809177946517, "grad_norm": 1076.0308837890625, "learning_rate": 2.4799230638189368e-05, "loss": 38.6647, "step": 16212 }, { "epoch": 42.82073291515351, "grad_norm": 917.1864624023438, "learning_rate": 2.478109326412692e-05, "loss": 38.606, "step": 16213 }, { "epoch": 42.82337405084186, "grad_norm": 991.785400390625, "learning_rate": 2.4762962179170178e-05, "loss": 36.6321, "step": 16214 }, { "epoch": 42.826015186530206, "grad_norm": 1068.3663330078125, "learning_rate": 2.474483738382538e-05, "loss": 35.5132, "step": 16215 }, { "epoch": 42.828656322218556, "grad_norm": 876.1483764648438, "learning_rate": 2.4726718878598674e-05, "loss": 36.2584, "step": 16216 }, { "epoch": 42.8312974579069, "grad_norm": 1502.61767578125, "learning_rate": 2.4708606663996018e-05, "loss": 35.4338, "step": 16217 }, { "epoch": 42.83393859359525, "grad_norm": 1014.7556762695312, "learning_rate": 2.469050074052315e-05, "loss": 34.5077, "step": 16218 }, { "epoch": 42.83657972928359, "grad_norm": 1041.08447265625, "learning_rate": 2.4672401108685712e-05, "loss": 35.8373, "step": 16219 }, { "epoch": 42.839220864971935, "grad_norm": 707.4576416015625, "learning_rate": 2.4654307768989108e-05, "loss": 35.0635, "step": 16220 }, { "epoch": 42.841862000660285, "grad_norm": 848.5215454101562, "learning_rate": 2.4636220721938552e-05, "loss": 34.574, "step": 16221 }, { "epoch": 42.84450313634863, "grad_norm": 1305.7408447265625, "learning_rate": 2.4618139968039187e-05, "loss": 33.7756, "step": 16222 }, { "epoch": 42.84714427203698, "grad_norm": 1213.650146484375, "learning_rate": 2.4600065507795923e-05, "loss": 36.5218, "step": 16223 }, { "epoch": 42.84978540772532, "grad_norm": 3967.453125, "learning_rate": 2.458199734171332e-05, "loss": 43.1602, "step": 16224 }, { "epoch": 42.85242654341367, "grad_norm": 4137.85302734375, "learning_rate": 2.456393547029609e-05, "loss": 29.5077, "step": 16225 }, { "epoch": 42.855067679102014, "grad_norm": 4697.66943359375, "learning_rate": 2.4545879894048494e-05, "loss": 8.3497, "step": 16226 }, { "epoch": 42.85770881479036, "grad_norm": 6735.1328125, "learning_rate": 2.45278306134748e-05, "loss": 14.8899, "step": 16227 }, { "epoch": 42.86034995047871, "grad_norm": 5518.544921875, "learning_rate": 2.4509787629078934e-05, "loss": 13.3612, "step": 16228 }, { "epoch": 42.86299108616705, "grad_norm": 10081.45703125, "learning_rate": 2.4491750941364826e-05, "loss": 12.7808, "step": 16229 }, { "epoch": 42.8656322218554, "grad_norm": 29402.7890625, "learning_rate": 2.4473720550836104e-05, "loss": 10.2653, "step": 16230 }, { "epoch": 42.86827335754374, "grad_norm": 794.8963623046875, "learning_rate": 2.445569645799625e-05, "loss": 8.3551, "step": 16231 }, { "epoch": 42.87091449323209, "grad_norm": 1482.22900390625, "learning_rate": 2.443767866334859e-05, "loss": 8.2416, "step": 16232 }, { "epoch": 42.873555628920435, "grad_norm": 891.9694213867188, "learning_rate": 2.4419667167396244e-05, "loss": 10.7473, "step": 16233 }, { "epoch": 42.87619676460878, "grad_norm": 7161.81201171875, "learning_rate": 2.440166197064217e-05, "loss": 9.121, "step": 16234 }, { "epoch": 42.87883790029713, "grad_norm": 1758.5576171875, "learning_rate": 2.4383663073589135e-05, "loss": 34.6682, "step": 16235 }, { "epoch": 42.88147903598547, "grad_norm": 1212.3077392578125, "learning_rate": 2.4365670476739793e-05, "loss": 33.4285, "step": 16236 }, { "epoch": 42.88412017167382, "grad_norm": 1626.5528564453125, "learning_rate": 2.4347684180596576e-05, "loss": 33.8843, "step": 16237 }, { "epoch": 42.886761307362164, "grad_norm": 730.355712890625, "learning_rate": 2.432970418566169e-05, "loss": 34.4347, "step": 16238 }, { "epoch": 42.889402443050514, "grad_norm": 1160.219482421875, "learning_rate": 2.431173049243729e-05, "loss": 34.81, "step": 16239 }, { "epoch": 42.89204357873886, "grad_norm": 730.0005493164062, "learning_rate": 2.4293763101425226e-05, "loss": 33.9481, "step": 16240 }, { "epoch": 42.89468471442721, "grad_norm": 955.25927734375, "learning_rate": 2.427580201312729e-05, "loss": 33.6722, "step": 16241 }, { "epoch": 42.89732585011555, "grad_norm": 978.0078125, "learning_rate": 2.4257847228044887e-05, "loss": 35.1835, "step": 16242 }, { "epoch": 42.89996698580389, "grad_norm": 1216.757080078125, "learning_rate": 2.4239898746679563e-05, "loss": 33.4437, "step": 16243 }, { "epoch": 42.90260812149224, "grad_norm": 1375.1829833984375, "learning_rate": 2.422195656953241e-05, "loss": 34.8062, "step": 16244 }, { "epoch": 42.905249257180586, "grad_norm": 793.3680419921875, "learning_rate": 2.420402069710448e-05, "loss": 34.7655, "step": 16245 }, { "epoch": 42.907890392868936, "grad_norm": 5606.09521484375, "learning_rate": 2.418609112989667e-05, "loss": 34.9578, "step": 16246 }, { "epoch": 42.91053152855728, "grad_norm": 1478.80859375, "learning_rate": 2.4168167868409585e-05, "loss": 34.4349, "step": 16247 }, { "epoch": 42.91317266424563, "grad_norm": 1042.6258544921875, "learning_rate": 2.4150250913143796e-05, "loss": 35.1501, "step": 16248 }, { "epoch": 42.91581379993397, "grad_norm": 838.7106323242188, "learning_rate": 2.413234026459954e-05, "loss": 33.163, "step": 16249 }, { "epoch": 42.918454935622314, "grad_norm": 1176.220947265625, "learning_rate": 2.411443592327703e-05, "loss": 35.677, "step": 16250 }, { "epoch": 42.921096071310664, "grad_norm": 1324.200927734375, "learning_rate": 2.4096537889676194e-05, "loss": 33.7457, "step": 16251 }, { "epoch": 42.92373720699901, "grad_norm": 5606.2763671875, "learning_rate": 2.407864616429678e-05, "loss": 38.787, "step": 16252 }, { "epoch": 42.92637834268736, "grad_norm": 749.5049438476562, "learning_rate": 2.4060760747638522e-05, "loss": 37.8591, "step": 16253 }, { "epoch": 42.9290194783757, "grad_norm": 2341.722900390625, "learning_rate": 2.4042881640200798e-05, "loss": 37.0853, "step": 16254 }, { "epoch": 42.93166061406405, "grad_norm": 993.3876953125, "learning_rate": 2.402500884248285e-05, "loss": 41.7144, "step": 16255 }, { "epoch": 42.93430174975239, "grad_norm": 1092.7713623046875, "learning_rate": 2.4007142354983747e-05, "loss": 41.2161, "step": 16256 }, { "epoch": 42.93694288544074, "grad_norm": 795.3884887695312, "learning_rate": 2.3989282178202483e-05, "loss": 39.3496, "step": 16257 }, { "epoch": 42.939584021129086, "grad_norm": 594.094482421875, "learning_rate": 2.3971428312637793e-05, "loss": 37.4703, "step": 16258 }, { "epoch": 42.94222515681743, "grad_norm": 862.247802734375, "learning_rate": 2.395358075878809e-05, "loss": 37.3872, "step": 16259 }, { "epoch": 42.94486629250578, "grad_norm": 3140.726318359375, "learning_rate": 2.3935739517151916e-05, "loss": 38.0776, "step": 16260 }, { "epoch": 42.94750742819412, "grad_norm": 1918.59033203125, "learning_rate": 2.391790458822743e-05, "loss": 36.245, "step": 16261 }, { "epoch": 42.95014856388247, "grad_norm": 908.7012939453125, "learning_rate": 2.390007597251265e-05, "loss": 35.6607, "step": 16262 }, { "epoch": 42.952789699570815, "grad_norm": 672.3739624023438, "learning_rate": 2.3882253670505367e-05, "loss": 33.9661, "step": 16263 }, { "epoch": 42.955430835259165, "grad_norm": 1117.385498046875, "learning_rate": 2.386443768270338e-05, "loss": 35.7465, "step": 16264 }, { "epoch": 42.95807197094751, "grad_norm": 2071.64453125, "learning_rate": 2.3846628009604103e-05, "loss": 30.3376, "step": 16265 }, { "epoch": 42.96071310663585, "grad_norm": 1416.35302734375, "learning_rate": 2.3828824651704937e-05, "loss": 8.9667, "step": 16266 }, { "epoch": 42.9633542423242, "grad_norm": 7927.3642578125, "learning_rate": 2.3811027609502933e-05, "loss": 13.3165, "step": 16267 }, { "epoch": 42.96599537801254, "grad_norm": 1303.000732421875, "learning_rate": 2.379323688349516e-05, "loss": 10.2415, "step": 16268 }, { "epoch": 42.96863651370089, "grad_norm": 990.7438354492188, "learning_rate": 2.377545247417834e-05, "loss": 13.7147, "step": 16269 }, { "epoch": 42.971277649389236, "grad_norm": 1764.1993408203125, "learning_rate": 2.3757674382049064e-05, "loss": 9.7209, "step": 16270 }, { "epoch": 42.973918785077586, "grad_norm": 1782.125244140625, "learning_rate": 2.373990260760389e-05, "loss": 28.246, "step": 16271 }, { "epoch": 42.97655992076593, "grad_norm": 1629.7965087890625, "learning_rate": 2.3722137151339003e-05, "loss": 35.4075, "step": 16272 }, { "epoch": 42.97920105645427, "grad_norm": 746.3011474609375, "learning_rate": 2.37043780137505e-05, "loss": 33.4089, "step": 16273 }, { "epoch": 42.98184219214262, "grad_norm": 1545.833740234375, "learning_rate": 2.3686625195334322e-05, "loss": 33.5007, "step": 16274 }, { "epoch": 42.984483327830965, "grad_norm": 801.9505615234375, "learning_rate": 2.366887869658621e-05, "loss": 34.2225, "step": 16275 }, { "epoch": 42.987124463519315, "grad_norm": 2157.517578125, "learning_rate": 2.3651138518001768e-05, "loss": 34.7119, "step": 16276 }, { "epoch": 42.98976559920766, "grad_norm": 1703.4986572265625, "learning_rate": 2.3633404660076185e-05, "loss": 34.2117, "step": 16277 }, { "epoch": 42.99240673489601, "grad_norm": 1667.990966796875, "learning_rate": 2.361567712330487e-05, "loss": 34.5221, "step": 16278 }, { "epoch": 42.99504787058435, "grad_norm": 1327.0379638671875, "learning_rate": 2.3597955908182784e-05, "loss": 35.0314, "step": 16279 }, { "epoch": 42.997689006272694, "grad_norm": 3124.30126953125, "learning_rate": 2.3580241015204757e-05, "loss": 34.3638, "step": 16280 }, { "epoch": 43.000330141961044, "grad_norm": 2502.366943359375, "learning_rate": 2.3562532444865503e-05, "loss": 40.7649, "step": 16281 }, { "epoch": 43.00297127764939, "grad_norm": 1948.8992919921875, "learning_rate": 2.3544830197659544e-05, "loss": 37.7463, "step": 16282 }, { "epoch": 43.00561241333774, "grad_norm": 364.18304443359375, "learning_rate": 2.3527134274081153e-05, "loss": 37.3691, "step": 16283 }, { "epoch": 43.00825354902608, "grad_norm": 1768.8203125, "learning_rate": 2.3509444674624513e-05, "loss": 38.3502, "step": 16284 }, { "epoch": 43.01089468471443, "grad_norm": 658.8916015625, "learning_rate": 2.3491761399783594e-05, "loss": 41.0364, "step": 16285 }, { "epoch": 43.01353582040277, "grad_norm": 1034.8099365234375, "learning_rate": 2.347408445005217e-05, "loss": 42.5492, "step": 16286 }, { "epoch": 43.01617695609112, "grad_norm": 643.8961791992188, "learning_rate": 2.3456413825923812e-05, "loss": 40.9285, "step": 16287 }, { "epoch": 43.018818091779465, "grad_norm": 1269.8204345703125, "learning_rate": 2.3438749527892072e-05, "loss": 39.7454, "step": 16288 }, { "epoch": 43.02145922746781, "grad_norm": 827.4732055664062, "learning_rate": 2.342109155645017e-05, "loss": 40.4489, "step": 16289 }, { "epoch": 43.02410036315616, "grad_norm": 543.2915649414062, "learning_rate": 2.340343991209118e-05, "loss": 40.1546, "step": 16290 }, { "epoch": 43.0267414988445, "grad_norm": 1597.4556884765625, "learning_rate": 2.3385794595307958e-05, "loss": 36.9298, "step": 16291 }, { "epoch": 43.02938263453285, "grad_norm": 840.0899658203125, "learning_rate": 2.336815560659336e-05, "loss": 37.1967, "step": 16292 }, { "epoch": 43.032023770221194, "grad_norm": 1117.8079833984375, "learning_rate": 2.335052294643994e-05, "loss": 39.5156, "step": 16293 }, { "epoch": 43.034664905909544, "grad_norm": 817.0746459960938, "learning_rate": 2.333289661533994e-05, "loss": 37.1973, "step": 16294 }, { "epoch": 43.03730604159789, "grad_norm": 979.9299926757812, "learning_rate": 2.3315276613785686e-05, "loss": 36.7809, "step": 16295 }, { "epoch": 43.03994717728623, "grad_norm": 973.0825805664062, "learning_rate": 2.3297662942269176e-05, "loss": 34.237, "step": 16296 }, { "epoch": 43.04258831297458, "grad_norm": 568.1646728515625, "learning_rate": 2.328005560128224e-05, "loss": 36.1534, "step": 16297 }, { "epoch": 43.04522944866292, "grad_norm": 1245.7939453125, "learning_rate": 2.326245459131654e-05, "loss": 35.112, "step": 16298 }, { "epoch": 43.04787058435127, "grad_norm": 1007.8170166015625, "learning_rate": 2.3244859912863626e-05, "loss": 35.4944, "step": 16299 }, { "epoch": 43.050511720039616, "grad_norm": 884.40380859375, "learning_rate": 2.3227271566414826e-05, "loss": 35.1824, "step": 16300 }, { "epoch": 43.053152855727966, "grad_norm": 728.8618774414062, "learning_rate": 2.320968955246125e-05, "loss": 34.149, "step": 16301 }, { "epoch": 43.05579399141631, "grad_norm": 533.185302734375, "learning_rate": 2.3192113871493836e-05, "loss": 34.5297, "step": 16302 }, { "epoch": 43.05843512710465, "grad_norm": 1039.3988037109375, "learning_rate": 2.3174544524003445e-05, "loss": 35.8383, "step": 16303 }, { "epoch": 43.061076262793, "grad_norm": 31053.697265625, "learning_rate": 2.3156981510480625e-05, "loss": 24.9443, "step": 16304 }, { "epoch": 43.063717398481344, "grad_norm": 492.060791015625, "learning_rate": 2.3139424831415818e-05, "loss": 9.0297, "step": 16305 }, { "epoch": 43.066358534169694, "grad_norm": 1674.2672119140625, "learning_rate": 2.3121874487299354e-05, "loss": 13.0137, "step": 16306 }, { "epoch": 43.06899966985804, "grad_norm": 2196.04150390625, "learning_rate": 2.310433047862126e-05, "loss": 9.1229, "step": 16307 }, { "epoch": 43.07164080554639, "grad_norm": 3565.6962890625, "learning_rate": 2.3086792805871393e-05, "loss": 10.857, "step": 16308 }, { "epoch": 43.07428194123473, "grad_norm": 689.154296875, "learning_rate": 2.306926146953961e-05, "loss": 13.5298, "step": 16309 }, { "epoch": 43.07692307692308, "grad_norm": 2131.4013671875, "learning_rate": 2.305173647011538e-05, "loss": 10.3197, "step": 16310 }, { "epoch": 43.07956421261142, "grad_norm": 810.54736328125, "learning_rate": 2.3034217808088148e-05, "loss": 8.7032, "step": 16311 }, { "epoch": 43.082205348299766, "grad_norm": 1534.7982177734375, "learning_rate": 2.3016705483946965e-05, "loss": 14.5813, "step": 16312 }, { "epoch": 43.084846483988116, "grad_norm": 8105.66796875, "learning_rate": 2.2999199498180994e-05, "loss": 11.0162, "step": 16313 }, { "epoch": 43.08748761967646, "grad_norm": 1231.1358642578125, "learning_rate": 2.2981699851279013e-05, "loss": 35.1243, "step": 16314 }, { "epoch": 43.09012875536481, "grad_norm": 811.53125, "learning_rate": 2.296420654372966e-05, "loss": 34.0297, "step": 16315 }, { "epoch": 43.09276989105315, "grad_norm": 696.237548828125, "learning_rate": 2.294671957602154e-05, "loss": 33.9034, "step": 16316 }, { "epoch": 43.0954110267415, "grad_norm": 935.773681640625, "learning_rate": 2.292923894864288e-05, "loss": 34.7622, "step": 16317 }, { "epoch": 43.098052162429845, "grad_norm": 706.09814453125, "learning_rate": 2.2911764662081834e-05, "loss": 33.2563, "step": 16318 }, { "epoch": 43.10069329811819, "grad_norm": 1338.8416748046875, "learning_rate": 2.2894296716826356e-05, "loss": 36.2272, "step": 16319 }, { "epoch": 43.10333443380654, "grad_norm": 1227.853271484375, "learning_rate": 2.2876835113364247e-05, "loss": 35.2181, "step": 16320 }, { "epoch": 43.10597556949488, "grad_norm": 1452.914794921875, "learning_rate": 2.2859379852183086e-05, "loss": 35.1237, "step": 16321 }, { "epoch": 43.10861670518323, "grad_norm": 2366.330078125, "learning_rate": 2.2841930933770267e-05, "loss": 34.1639, "step": 16322 }, { "epoch": 43.11125784087157, "grad_norm": 752.524169921875, "learning_rate": 2.2824488358613145e-05, "loss": 34.5876, "step": 16323 }, { "epoch": 43.11389897655992, "grad_norm": 1864.9232177734375, "learning_rate": 2.280705212719872e-05, "loss": 34.9684, "step": 16324 }, { "epoch": 43.116540112248266, "grad_norm": 761.4306640625, "learning_rate": 2.2789622240013934e-05, "loss": 34.4418, "step": 16325 }, { "epoch": 43.11918124793661, "grad_norm": 989.1240844726562, "learning_rate": 2.2772198697545405e-05, "loss": 35.5315, "step": 16326 }, { "epoch": 43.12182238362496, "grad_norm": 2179.318603515625, "learning_rate": 2.2754781500279792e-05, "loss": 34.1672, "step": 16327 }, { "epoch": 43.1244635193133, "grad_norm": 1213.727783203125, "learning_rate": 2.273737064870346e-05, "loss": 35.3277, "step": 16328 }, { "epoch": 43.12710465500165, "grad_norm": 2124.794189453125, "learning_rate": 2.2719966143302463e-05, "loss": 34.8929, "step": 16329 }, { "epoch": 43.129745790689995, "grad_norm": 2430.078857421875, "learning_rate": 2.2702567984562937e-05, "loss": 35.7234, "step": 16330 }, { "epoch": 43.132386926378345, "grad_norm": 1741.3326416015625, "learning_rate": 2.2685176172970696e-05, "loss": 39.966, "step": 16331 }, { "epoch": 43.13502806206669, "grad_norm": 888.23095703125, "learning_rate": 2.266779070901137e-05, "loss": 38.2401, "step": 16332 }, { "epoch": 43.13766919775504, "grad_norm": 1158.6746826171875, "learning_rate": 2.2650411593170413e-05, "loss": 37.0715, "step": 16333 }, { "epoch": 43.14031033344338, "grad_norm": 2112.960693359375, "learning_rate": 2.2633038825933206e-05, "loss": 37.8911, "step": 16334 }, { "epoch": 43.142951469131724, "grad_norm": 5870.33447265625, "learning_rate": 2.261567240778481e-05, "loss": 38.1833, "step": 16335 }, { "epoch": 43.145592604820074, "grad_norm": 704.6458129882812, "learning_rate": 2.2598312339210196e-05, "loss": 43.7884, "step": 16336 }, { "epoch": 43.14823374050842, "grad_norm": 1409.584716796875, "learning_rate": 2.258095862069412e-05, "loss": 40.2566, "step": 16337 }, { "epoch": 43.15087487619677, "grad_norm": 1269.068115234375, "learning_rate": 2.256361125272119e-05, "loss": 41.5775, "step": 16338 }, { "epoch": 43.15351601188511, "grad_norm": 1159.802978515625, "learning_rate": 2.2546270235775794e-05, "loss": 38.3965, "step": 16339 }, { "epoch": 43.15615714757346, "grad_norm": 776.5504150390625, "learning_rate": 2.2528935570342164e-05, "loss": 38.8604, "step": 16340 }, { "epoch": 43.1587982832618, "grad_norm": 1645.8531494140625, "learning_rate": 2.251160725690443e-05, "loss": 40.1958, "step": 16341 }, { "epoch": 43.161439418950145, "grad_norm": 628.0682373046875, "learning_rate": 2.2494285295946408e-05, "loss": 37.7537, "step": 16342 }, { "epoch": 43.164080554638495, "grad_norm": 951.1746826171875, "learning_rate": 2.2476969687951822e-05, "loss": 36.4874, "step": 16343 }, { "epoch": 43.16672169032684, "grad_norm": 1103.726806640625, "learning_rate": 2.2459660433404223e-05, "loss": 36.5002, "step": 16344 }, { "epoch": 43.16936282601519, "grad_norm": 2717.859130859375, "learning_rate": 2.2442357532786922e-05, "loss": 35.7509, "step": 16345 }, { "epoch": 43.17200396170353, "grad_norm": 1064.8023681640625, "learning_rate": 2.2425060986583174e-05, "loss": 35.6731, "step": 16346 }, { "epoch": 43.17464509739188, "grad_norm": 1229.8690185546875, "learning_rate": 2.240777079527584e-05, "loss": 34.9489, "step": 16347 }, { "epoch": 43.177286233080224, "grad_norm": 891.1168823242188, "learning_rate": 2.239048695934784e-05, "loss": 34.7146, "step": 16348 }, { "epoch": 43.17992736876857, "grad_norm": 2071.515625, "learning_rate": 2.2373209479281788e-05, "loss": 33.8425, "step": 16349 }, { "epoch": 43.18256850445692, "grad_norm": 1360.3565673828125, "learning_rate": 2.2355938355560102e-05, "loss": 34.9189, "step": 16350 }, { "epoch": 43.18520964014526, "grad_norm": 921.63232421875, "learning_rate": 2.2338673588665147e-05, "loss": 34.0611, "step": 16351 }, { "epoch": 43.18785077583361, "grad_norm": 572.8201904296875, "learning_rate": 2.232141517907901e-05, "loss": 35.6864, "step": 16352 }, { "epoch": 43.19049191152195, "grad_norm": 1755.1759033203125, "learning_rate": 2.2304163127283615e-05, "loss": 39.2966, "step": 16353 }, { "epoch": 43.1931330472103, "grad_norm": 493.8397216796875, "learning_rate": 2.2286917433760685e-05, "loss": 23.285, "step": 16354 }, { "epoch": 43.195774182898646, "grad_norm": 1752.494873046875, "learning_rate": 2.226967809899183e-05, "loss": 11.5704, "step": 16355 }, { "epoch": 43.198415318586996, "grad_norm": 4054.53759765625, "learning_rate": 2.2252445123458447e-05, "loss": 10.9565, "step": 16356 }, { "epoch": 43.20105645427534, "grad_norm": 1253.218505859375, "learning_rate": 2.223521850764171e-05, "loss": 10.3113, "step": 16357 }, { "epoch": 43.20369758996368, "grad_norm": 715.99072265625, "learning_rate": 2.2217998252022725e-05, "loss": 12.3914, "step": 16358 }, { "epoch": 43.20633872565203, "grad_norm": 3469.158203125, "learning_rate": 2.2200784357082337e-05, "loss": 11.0363, "step": 16359 }, { "epoch": 43.208979861340374, "grad_norm": 2496.10302734375, "learning_rate": 2.2183576823301188e-05, "loss": 12.5055, "step": 16360 }, { "epoch": 43.211620997028724, "grad_norm": 1242.546630859375, "learning_rate": 2.2166375651159865e-05, "loss": 9.7692, "step": 16361 }, { "epoch": 43.21426213271707, "grad_norm": 682.06689453125, "learning_rate": 2.214918084113868e-05, "loss": 8.5618, "step": 16362 }, { "epoch": 43.21690326840542, "grad_norm": 1225.6068115234375, "learning_rate": 2.2131992393717826e-05, "loss": 13.4379, "step": 16363 }, { "epoch": 43.21954440409376, "grad_norm": 1243.4488525390625, "learning_rate": 2.211481030937712e-05, "loss": 35.1231, "step": 16364 }, { "epoch": 43.2221855397821, "grad_norm": 1062.2872314453125, "learning_rate": 2.209763458859651e-05, "loss": 35.2148, "step": 16365 }, { "epoch": 43.22482667547045, "grad_norm": 3108.849853515625, "learning_rate": 2.2080465231855585e-05, "loss": 34.4894, "step": 16366 }, { "epoch": 43.227467811158796, "grad_norm": 1033.718505859375, "learning_rate": 2.2063302239633737e-05, "loss": 35.6098, "step": 16367 }, { "epoch": 43.230108946847146, "grad_norm": 965.0396728515625, "learning_rate": 2.2046145612410308e-05, "loss": 33.5426, "step": 16368 }, { "epoch": 43.23275008253549, "grad_norm": 1992.3125, "learning_rate": 2.202899535066438e-05, "loss": 34.392, "step": 16369 }, { "epoch": 43.23539121822384, "grad_norm": 1400.052734375, "learning_rate": 2.201185145487483e-05, "loss": 34.2003, "step": 16370 }, { "epoch": 43.23803235391218, "grad_norm": 1160.7667236328125, "learning_rate": 2.1994713925520382e-05, "loss": 33.7654, "step": 16371 }, { "epoch": 43.240673489600525, "grad_norm": 1842.526611328125, "learning_rate": 2.197758276307965e-05, "loss": 33.4506, "step": 16372 }, { "epoch": 43.243314625288875, "grad_norm": 1619.61328125, "learning_rate": 2.1960457968030923e-05, "loss": 33.7111, "step": 16373 }, { "epoch": 43.24595576097722, "grad_norm": 2683.236328125, "learning_rate": 2.1943339540852452e-05, "loss": 34.9818, "step": 16374 }, { "epoch": 43.24859689666557, "grad_norm": 717.7245483398438, "learning_rate": 2.1926227482022277e-05, "loss": 34.1583, "step": 16375 }, { "epoch": 43.25123803235391, "grad_norm": 2195.27685546875, "learning_rate": 2.1909121792018206e-05, "loss": 34.6754, "step": 16376 }, { "epoch": 43.25387916804226, "grad_norm": 1986.2720947265625, "learning_rate": 2.189202247131794e-05, "loss": 33.5702, "step": 16377 }, { "epoch": 43.2565203037306, "grad_norm": 1214.7298583984375, "learning_rate": 2.1874929520398905e-05, "loss": 34.88, "step": 16378 }, { "epoch": 43.25916143941895, "grad_norm": 1764.841552734375, "learning_rate": 2.18578429397385e-05, "loss": 35.2523, "step": 16379 }, { "epoch": 43.261802575107296, "grad_norm": 692.3093872070312, "learning_rate": 2.1840762729813808e-05, "loss": 36.3598, "step": 16380 }, { "epoch": 43.26444371079564, "grad_norm": 5530.400390625, "learning_rate": 2.182368889110181e-05, "loss": 40.8838, "step": 16381 }, { "epoch": 43.26708484648399, "grad_norm": 951.4095458984375, "learning_rate": 2.1806621424079242e-05, "loss": 38.5049, "step": 16382 }, { "epoch": 43.26972598217233, "grad_norm": 640.404052734375, "learning_rate": 2.178956032922272e-05, "loss": 39.1671, "step": 16383 }, { "epoch": 43.27236711786068, "grad_norm": 1790.4984130859375, "learning_rate": 2.1772505607008664e-05, "loss": 38.3244, "step": 16384 }, { "epoch": 43.275008253549025, "grad_norm": 977.114990234375, "learning_rate": 2.175545725791328e-05, "loss": 37.6281, "step": 16385 }, { "epoch": 43.277649389237375, "grad_norm": 666.628662109375, "learning_rate": 2.1738415282412717e-05, "loss": 41.3073, "step": 16386 }, { "epoch": 43.28029052492572, "grad_norm": 3285.215087890625, "learning_rate": 2.172137968098284e-05, "loss": 39.1948, "step": 16387 }, { "epoch": 43.28293166061406, "grad_norm": 1853.474365234375, "learning_rate": 2.1704350454099325e-05, "loss": 43.8577, "step": 16388 }, { "epoch": 43.28557279630241, "grad_norm": 1364.428466796875, "learning_rate": 2.168732760223771e-05, "loss": 39.508, "step": 16389 }, { "epoch": 43.288213931990754, "grad_norm": 2232.1591796875, "learning_rate": 2.1670311125873367e-05, "loss": 39.0582, "step": 16390 }, { "epoch": 43.290855067679104, "grad_norm": 521.9419555664062, "learning_rate": 2.1653301025481435e-05, "loss": 39.1211, "step": 16391 }, { "epoch": 43.29349620336745, "grad_norm": 1078.3651123046875, "learning_rate": 2.1636297301536905e-05, "loss": 38.8018, "step": 16392 }, { "epoch": 43.2961373390558, "grad_norm": 720.8544921875, "learning_rate": 2.161929995451467e-05, "loss": 37.6621, "step": 16393 }, { "epoch": 43.29877847474414, "grad_norm": 728.5354614257812, "learning_rate": 2.1602308984889298e-05, "loss": 37.7949, "step": 16394 }, { "epoch": 43.30141961043249, "grad_norm": 893.2850341796875, "learning_rate": 2.1585324393135262e-05, "loss": 35.8609, "step": 16395 }, { "epoch": 43.30406074612083, "grad_norm": 620.9584350585938, "learning_rate": 2.1568346179726917e-05, "loss": 35.4559, "step": 16396 }, { "epoch": 43.306701881809175, "grad_norm": 563.039306640625, "learning_rate": 2.1551374345138293e-05, "loss": 37.8327, "step": 16397 }, { "epoch": 43.309343017497525, "grad_norm": 1262.0699462890625, "learning_rate": 2.15344088898434e-05, "loss": 35.5264, "step": 16398 }, { "epoch": 43.31198415318587, "grad_norm": 639.690673828125, "learning_rate": 2.1517449814315858e-05, "loss": 34.3559, "step": 16399 }, { "epoch": 43.31462528887422, "grad_norm": 1426.9342041015625, "learning_rate": 2.1500497119029322e-05, "loss": 34.8312, "step": 16400 }, { "epoch": 43.31462528887422, "eval_loss": 3.7495412826538086, "eval_runtime": 2.2639, "eval_samples_per_second": 218.646, "eval_steps_per_second": 27.386, "step": 16400 }, { "epoch": 43.31726642456256, "grad_norm": 854.1050415039062, "learning_rate": 2.1483550804457187e-05, "loss": 34.8696, "step": 16401 }, { "epoch": 43.31990756025091, "grad_norm": 1051.0579833984375, "learning_rate": 2.146661087107263e-05, "loss": 35.44, "step": 16402 }, { "epoch": 43.322548695939254, "grad_norm": 4662.02099609375, "learning_rate": 2.1449677319348744e-05, "loss": 42.155, "step": 16403 }, { "epoch": 43.3251898316276, "grad_norm": 951.0101318359375, "learning_rate": 2.1432750149758372e-05, "loss": 21.313, "step": 16404 }, { "epoch": 43.32783096731595, "grad_norm": 1704.465087890625, "learning_rate": 2.1415829362774198e-05, "loss": 12.3246, "step": 16405 }, { "epoch": 43.33047210300429, "grad_norm": 3164.624267578125, "learning_rate": 2.139891495886867e-05, "loss": 14.6558, "step": 16406 }, { "epoch": 43.33311323869264, "grad_norm": 618.36376953125, "learning_rate": 2.1382006938514192e-05, "loss": 13.6849, "step": 16407 }, { "epoch": 43.33575437438098, "grad_norm": 3067.503662109375, "learning_rate": 2.1365105302182856e-05, "loss": 12.7853, "step": 16408 }, { "epoch": 43.33839551006933, "grad_norm": 4081.280029296875, "learning_rate": 2.1348210050346596e-05, "loss": 14.8332, "step": 16409 }, { "epoch": 43.341036645757676, "grad_norm": 2471.7568359375, "learning_rate": 2.1331321183477302e-05, "loss": 10.4752, "step": 16410 }, { "epoch": 43.34367778144602, "grad_norm": 296.18170166015625, "learning_rate": 2.1314438702046553e-05, "loss": 11.218, "step": 16411 }, { "epoch": 43.34631891713437, "grad_norm": 28391.61328125, "learning_rate": 2.1297562606525765e-05, "loss": 10.5674, "step": 16412 }, { "epoch": 43.34896005282271, "grad_norm": 2331.90380859375, "learning_rate": 2.1280692897386128e-05, "loss": 28.1836, "step": 16413 }, { "epoch": 43.35160118851106, "grad_norm": 1766.1273193359375, "learning_rate": 2.126382957509884e-05, "loss": 35.966, "step": 16414 }, { "epoch": 43.354242324199404, "grad_norm": 1124.661865234375, "learning_rate": 2.1246972640134754e-05, "loss": 34.7306, "step": 16415 }, { "epoch": 43.356883459887754, "grad_norm": 549.232666015625, "learning_rate": 2.1230122092964572e-05, "loss": 35.3979, "step": 16416 }, { "epoch": 43.3595245955761, "grad_norm": 1020.6045532226562, "learning_rate": 2.1213277934058833e-05, "loss": 36.4784, "step": 16417 }, { "epoch": 43.36216573126444, "grad_norm": 1208.717041015625, "learning_rate": 2.1196440163887914e-05, "loss": 32.8697, "step": 16418 }, { "epoch": 43.36480686695279, "grad_norm": 735.9837036132812, "learning_rate": 2.117960878292202e-05, "loss": 33.0284, "step": 16419 }, { "epoch": 43.36744800264113, "grad_norm": 753.4673461914062, "learning_rate": 2.1162783791631057e-05, "loss": 34.2189, "step": 16420 }, { "epoch": 43.37008913832948, "grad_norm": 1439.875, "learning_rate": 2.1145965190485006e-05, "loss": 34.4221, "step": 16421 }, { "epoch": 43.372730274017826, "grad_norm": 1585.569091796875, "learning_rate": 2.112915297995341e-05, "loss": 33.6284, "step": 16422 }, { "epoch": 43.375371409706176, "grad_norm": 534.0453491210938, "learning_rate": 2.1112347160505808e-05, "loss": 35.4036, "step": 16423 }, { "epoch": 43.37801254539452, "grad_norm": 569.7202758789062, "learning_rate": 2.1095547732611437e-05, "loss": 34.2482, "step": 16424 }, { "epoch": 43.38065368108287, "grad_norm": 1219.3291015625, "learning_rate": 2.1078754696739422e-05, "loss": 34.2362, "step": 16425 }, { "epoch": 43.38329481677121, "grad_norm": 735.5545654296875, "learning_rate": 2.106196805335872e-05, "loss": 34.5054, "step": 16426 }, { "epoch": 43.385935952459555, "grad_norm": 1899.1427001953125, "learning_rate": 2.1045187802938015e-05, "loss": 34.4688, "step": 16427 }, { "epoch": 43.388577088147905, "grad_norm": 983.9288940429688, "learning_rate": 2.1028413945945984e-05, "loss": 34.9021, "step": 16428 }, { "epoch": 43.39121822383625, "grad_norm": 1405.3907470703125, "learning_rate": 2.1011646482851006e-05, "loss": 34.9063, "step": 16429 }, { "epoch": 43.3938593595246, "grad_norm": 1073.5330810546875, "learning_rate": 2.0994885414121255e-05, "loss": 36.2225, "step": 16430 }, { "epoch": 43.39650049521294, "grad_norm": 3973.390625, "learning_rate": 2.097813074022481e-05, "loss": 40.324, "step": 16431 }, { "epoch": 43.39914163090129, "grad_norm": 1227.1605224609375, "learning_rate": 2.0961382461629568e-05, "loss": 37.2531, "step": 16432 }, { "epoch": 43.40178276658963, "grad_norm": 978.3712768554688, "learning_rate": 2.0944640578803185e-05, "loss": 39.2695, "step": 16433 }, { "epoch": 43.404423902277976, "grad_norm": 734.2943725585938, "learning_rate": 2.0927905092213096e-05, "loss": 39.0065, "step": 16434 }, { "epoch": 43.407065037966326, "grad_norm": 1744.000244140625, "learning_rate": 2.0911176002326727e-05, "loss": 39.1828, "step": 16435 }, { "epoch": 43.40970617365467, "grad_norm": 1037.0225830078125, "learning_rate": 2.0894453309611206e-05, "loss": 39.0225, "step": 16436 }, { "epoch": 43.41234730934302, "grad_norm": 1219.3087158203125, "learning_rate": 2.087773701453344e-05, "loss": 43.1837, "step": 16437 }, { "epoch": 43.41498844503136, "grad_norm": 1115.3001708984375, "learning_rate": 2.08610271175603e-05, "loss": 41.353, "step": 16438 }, { "epoch": 43.41762958071971, "grad_norm": 483.6607666015625, "learning_rate": 2.084432361915839e-05, "loss": 42.1411, "step": 16439 }, { "epoch": 43.420270716408055, "grad_norm": 1525.97021484375, "learning_rate": 2.082762651979414e-05, "loss": 40.3281, "step": 16440 }, { "epoch": 43.422911852096405, "grad_norm": 2295.978759765625, "learning_rate": 2.081093581993379e-05, "loss": 38.2987, "step": 16441 }, { "epoch": 43.42555298778475, "grad_norm": 915.7620239257812, "learning_rate": 2.079425152004341e-05, "loss": 40.6377, "step": 16442 }, { "epoch": 43.42819412347309, "grad_norm": 2903.10205078125, "learning_rate": 2.0777573620588936e-05, "loss": 37.5775, "step": 16443 }, { "epoch": 43.43083525916144, "grad_norm": 901.805908203125, "learning_rate": 2.076090212203599e-05, "loss": 37.8094, "step": 16444 }, { "epoch": 43.433476394849784, "grad_norm": 4617.93701171875, "learning_rate": 2.074423702485026e-05, "loss": 36.7, "step": 16445 }, { "epoch": 43.436117530538134, "grad_norm": 1041.365478515625, "learning_rate": 2.0727578329497037e-05, "loss": 35.184, "step": 16446 }, { "epoch": 43.43875866622648, "grad_norm": 1301.0196533203125, "learning_rate": 2.071092603644151e-05, "loss": 33.985, "step": 16447 }, { "epoch": 43.44139980191483, "grad_norm": 1118.41943359375, "learning_rate": 2.069428014614863e-05, "loss": 33.7839, "step": 16448 }, { "epoch": 43.44404093760317, "grad_norm": 1211.07421875, "learning_rate": 2.0677640659083312e-05, "loss": 34.0053, "step": 16449 }, { "epoch": 43.44668207329151, "grad_norm": 964.638427734375, "learning_rate": 2.0661007575710185e-05, "loss": 34.8197, "step": 16450 }, { "epoch": 43.44932320897986, "grad_norm": 5678.70166015625, "learning_rate": 2.0644380896493702e-05, "loss": 35.0179, "step": 16451 }, { "epoch": 43.451964344668205, "grad_norm": 556.7559814453125, "learning_rate": 2.062776062189814e-05, "loss": 34.8273, "step": 16452 }, { "epoch": 43.454605480356555, "grad_norm": 18800.462890625, "learning_rate": 2.0611146752387623e-05, "loss": 38.0808, "step": 16453 }, { "epoch": 43.4572466160449, "grad_norm": 3221.437255859375, "learning_rate": 2.0594539288426083e-05, "loss": 14.352, "step": 16454 }, { "epoch": 43.45988775173325, "grad_norm": 970.7152709960938, "learning_rate": 2.057793823047721e-05, "loss": 9.7563, "step": 16455 }, { "epoch": 43.46252888742159, "grad_norm": 1785.2823486328125, "learning_rate": 2.0561343579004716e-05, "loss": 11.6047, "step": 16456 }, { "epoch": 43.465170023109934, "grad_norm": 1271.2589111328125, "learning_rate": 2.0544755334471922e-05, "loss": 11.162, "step": 16457 }, { "epoch": 43.467811158798284, "grad_norm": 12379.4326171875, "learning_rate": 2.052817349734204e-05, "loss": 11.0162, "step": 16458 }, { "epoch": 43.47045229448663, "grad_norm": 2445.54296875, "learning_rate": 2.051159806807809e-05, "loss": 10.1366, "step": 16459 }, { "epoch": 43.47309343017498, "grad_norm": 3595.84814453125, "learning_rate": 2.0495029047142983e-05, "loss": 11.4039, "step": 16460 }, { "epoch": 43.47573456586332, "grad_norm": 546.9393920898438, "learning_rate": 2.047846643499937e-05, "loss": 11.8046, "step": 16461 }, { "epoch": 43.47837570155167, "grad_norm": 888.0108642578125, "learning_rate": 2.0461910232109693e-05, "loss": 13.6193, "step": 16462 }, { "epoch": 43.48101683724001, "grad_norm": 539.3433227539062, "learning_rate": 2.0445360438936383e-05, "loss": 24.202, "step": 16463 }, { "epoch": 43.483657972928356, "grad_norm": 1548.348876953125, "learning_rate": 2.0428817055941545e-05, "loss": 36.3721, "step": 16464 }, { "epoch": 43.486299108616706, "grad_norm": 2259.184326171875, "learning_rate": 2.0412280083587088e-05, "loss": 34.4633, "step": 16465 }, { "epoch": 43.48894024430505, "grad_norm": 1338.0738525390625, "learning_rate": 2.0395749522334887e-05, "loss": 33.5983, "step": 16466 }, { "epoch": 43.4915813799934, "grad_norm": 2082.08251953125, "learning_rate": 2.03792253726465e-05, "loss": 33.8105, "step": 16467 }, { "epoch": 43.49422251568174, "grad_norm": 1358.7305908203125, "learning_rate": 2.036270763498341e-05, "loss": 35.0151, "step": 16468 }, { "epoch": 43.49686365137009, "grad_norm": 1089.7421875, "learning_rate": 2.034619630980672e-05, "loss": 34.8007, "step": 16469 }, { "epoch": 43.499504787058434, "grad_norm": 1616.697509765625, "learning_rate": 2.0329691397577623e-05, "loss": 33.5118, "step": 16470 }, { "epoch": 43.502145922746784, "grad_norm": 625.2450561523438, "learning_rate": 2.0313192898756993e-05, "loss": 34.1813, "step": 16471 }, { "epoch": 43.50478705843513, "grad_norm": 2200.744873046875, "learning_rate": 2.0296700813805496e-05, "loss": 34.7344, "step": 16472 }, { "epoch": 43.50742819412347, "grad_norm": 607.087646484375, "learning_rate": 2.0280215143183707e-05, "loss": 34.6628, "step": 16473 }, { "epoch": 43.51006932981182, "grad_norm": 735.4620361328125, "learning_rate": 2.0263735887351976e-05, "loss": 34.7122, "step": 16474 }, { "epoch": 43.51271046550016, "grad_norm": 749.7914428710938, "learning_rate": 2.0247263046770463e-05, "loss": 34.6959, "step": 16475 }, { "epoch": 43.51535160118851, "grad_norm": 837.29443359375, "learning_rate": 2.0230796621899195e-05, "loss": 34.5996, "step": 16476 }, { "epoch": 43.517992736876856, "grad_norm": 1597.2744140625, "learning_rate": 2.021433661319791e-05, "loss": 33.5125, "step": 16477 }, { "epoch": 43.520633872565206, "grad_norm": 875.0106201171875, "learning_rate": 2.0197883021126322e-05, "loss": 34.1422, "step": 16478 }, { "epoch": 43.52327500825355, "grad_norm": 1001.1399536132812, "learning_rate": 2.018143584614382e-05, "loss": 34.7068, "step": 16479 }, { "epoch": 43.52591614394189, "grad_norm": 1352.7589111328125, "learning_rate": 2.0164995088709754e-05, "loss": 37.802, "step": 16480 }, { "epoch": 43.52855727963024, "grad_norm": 2049.622314453125, "learning_rate": 2.0148560749283173e-05, "loss": 39.1389, "step": 16481 }, { "epoch": 43.531198415318585, "grad_norm": 783.2766723632812, "learning_rate": 2.013213282832302e-05, "loss": 39.3684, "step": 16482 }, { "epoch": 43.533839551006935, "grad_norm": 1566.0948486328125, "learning_rate": 2.011571132628798e-05, "loss": 38.5973, "step": 16483 }, { "epoch": 43.53648068669528, "grad_norm": 2325.889892578125, "learning_rate": 2.009929624363671e-05, "loss": 39.3754, "step": 16484 }, { "epoch": 43.53912182238363, "grad_norm": 927.6002197265625, "learning_rate": 2.0082887580827546e-05, "loss": 40.6747, "step": 16485 }, { "epoch": 43.54176295807197, "grad_norm": 1848.4610595703125, "learning_rate": 2.0066485338318675e-05, "loss": 43.4055, "step": 16486 }, { "epoch": 43.54440409376032, "grad_norm": 2845.93017578125, "learning_rate": 2.005008951656814e-05, "loss": 41.9746, "step": 16487 }, { "epoch": 43.54704522944866, "grad_norm": 448.1991882324219, "learning_rate": 2.0033700116033754e-05, "loss": 40.7696, "step": 16488 }, { "epoch": 43.549686365137006, "grad_norm": 1488.1934814453125, "learning_rate": 2.0017317137173197e-05, "loss": 42.3188, "step": 16489 }, { "epoch": 43.552327500825356, "grad_norm": 1000.3274536132812, "learning_rate": 2.0000940580443967e-05, "loss": 38.585, "step": 16490 }, { "epoch": 43.5549686365137, "grad_norm": 1088.019775390625, "learning_rate": 1.998457044630339e-05, "loss": 41.8452, "step": 16491 }, { "epoch": 43.55760977220205, "grad_norm": 1420.6048583984375, "learning_rate": 1.9968206735208548e-05, "loss": 38.1634, "step": 16492 }, { "epoch": 43.56025090789039, "grad_norm": 889.5377197265625, "learning_rate": 1.995184944761641e-05, "loss": 35.4965, "step": 16493 }, { "epoch": 43.56289204357874, "grad_norm": 973.7061157226562, "learning_rate": 1.9935498583983718e-05, "loss": 36.8443, "step": 16494 }, { "epoch": 43.565533179267085, "grad_norm": 1027.350830078125, "learning_rate": 1.991915414476711e-05, "loss": 35.3732, "step": 16495 }, { "epoch": 43.56817431495543, "grad_norm": 1417.85498046875, "learning_rate": 1.9902816130422914e-05, "loss": 35.7054, "step": 16496 }, { "epoch": 43.57081545064378, "grad_norm": 970.347900390625, "learning_rate": 1.9886484541407457e-05, "loss": 34.8392, "step": 16497 }, { "epoch": 43.57345658633212, "grad_norm": 1121.41552734375, "learning_rate": 1.9870159378176737e-05, "loss": 35.8128, "step": 16498 }, { "epoch": 43.57609772202047, "grad_norm": 1379.0111083984375, "learning_rate": 1.9853840641186644e-05, "loss": 35.6441, "step": 16499 }, { "epoch": 43.578738857708814, "grad_norm": 964.3658447265625, "learning_rate": 1.9837528330892778e-05, "loss": 34.5054, "step": 16500 }, { "epoch": 43.581379993397164, "grad_norm": 788.2714233398438, "learning_rate": 1.9821222447750807e-05, "loss": 35.9472, "step": 16501 }, { "epoch": 43.58402112908551, "grad_norm": 1795.927001953125, "learning_rate": 1.9804922992215978e-05, "loss": 35.8055, "step": 16502 }, { "epoch": 43.58666226477385, "grad_norm": 5210.841796875, "learning_rate": 1.9788629964743454e-05, "loss": 43.4093, "step": 16503 }, { "epoch": 43.5893034004622, "grad_norm": 2275.55810546875, "learning_rate": 1.9772343365788174e-05, "loss": 9.9999, "step": 16504 }, { "epoch": 43.59194453615054, "grad_norm": 4534.73388671875, "learning_rate": 1.9756063195805002e-05, "loss": 16.5098, "step": 16505 }, { "epoch": 43.59458567183889, "grad_norm": 11215.82421875, "learning_rate": 1.9739789455248486e-05, "loss": 9.9251, "step": 16506 }, { "epoch": 43.597226807527235, "grad_norm": 1283.0833740234375, "learning_rate": 1.972352214457307e-05, "loss": 8.9399, "step": 16507 }, { "epoch": 43.599867943215585, "grad_norm": 3833.233154296875, "learning_rate": 1.9707261264233056e-05, "loss": 9.4711, "step": 16508 }, { "epoch": 43.60250907890393, "grad_norm": 1368.550048828125, "learning_rate": 1.969100681468247e-05, "loss": 7.8973, "step": 16509 }, { "epoch": 43.60515021459227, "grad_norm": 3078.0478515625, "learning_rate": 1.967475879637526e-05, "loss": 10.3073, "step": 16510 }, { "epoch": 43.60779135028062, "grad_norm": 1913.6282958984375, "learning_rate": 1.965851720976508e-05, "loss": 13.5616, "step": 16511 }, { "epoch": 43.610432485968964, "grad_norm": 1225.8753662109375, "learning_rate": 1.9642282055305523e-05, "loss": 14.4241, "step": 16512 }, { "epoch": 43.613073621657314, "grad_norm": 4798.43701171875, "learning_rate": 1.9626053333449912e-05, "loss": 34.6591, "step": 16513 }, { "epoch": 43.61571475734566, "grad_norm": 2594.46826171875, "learning_rate": 1.9609831044651387e-05, "loss": 34.0211, "step": 16514 }, { "epoch": 43.61835589303401, "grad_norm": 1121.698974609375, "learning_rate": 1.9593615189363034e-05, "loss": 35.4775, "step": 16515 }, { "epoch": 43.62099702872235, "grad_norm": 1551.5977783203125, "learning_rate": 1.9577405768037625e-05, "loss": 34.9442, "step": 16516 }, { "epoch": 43.6236381644107, "grad_norm": 746.2571411132812, "learning_rate": 1.9561202781127774e-05, "loss": 33.618, "step": 16517 }, { "epoch": 43.62627930009904, "grad_norm": 595.5299682617188, "learning_rate": 1.9545006229086003e-05, "loss": 34.6544, "step": 16518 }, { "epoch": 43.628920435787386, "grad_norm": 843.2730712890625, "learning_rate": 1.952881611236454e-05, "loss": 35.5885, "step": 16519 }, { "epoch": 43.631561571475736, "grad_norm": 1326.025634765625, "learning_rate": 1.9512632431415518e-05, "loss": 34.0521, "step": 16520 }, { "epoch": 43.63420270716408, "grad_norm": 3784.373046875, "learning_rate": 1.9496455186690855e-05, "loss": 33.1094, "step": 16521 }, { "epoch": 43.63684384285243, "grad_norm": 1400.7164306640625, "learning_rate": 1.9480284378642244e-05, "loss": 34.424, "step": 16522 }, { "epoch": 43.63948497854077, "grad_norm": 3471.8564453125, "learning_rate": 1.94641200077213e-05, "loss": 35.0404, "step": 16523 }, { "epoch": 43.64212611422912, "grad_norm": 1525.742919921875, "learning_rate": 1.9447962074379322e-05, "loss": 36.6477, "step": 16524 }, { "epoch": 43.644767249917464, "grad_norm": 511.0165710449219, "learning_rate": 1.943181057906762e-05, "loss": 34.7718, "step": 16525 }, { "epoch": 43.64740838560581, "grad_norm": 1332.218505859375, "learning_rate": 1.941566552223717e-05, "loss": 35.003, "step": 16526 }, { "epoch": 43.65004952129416, "grad_norm": 763.9854736328125, "learning_rate": 1.93995269043388e-05, "loss": 35.3613, "step": 16527 }, { "epoch": 43.6526906569825, "grad_norm": 2247.988525390625, "learning_rate": 1.9383394725823178e-05, "loss": 33.5584, "step": 16528 }, { "epoch": 43.65533179267085, "grad_norm": 3641.546142578125, "learning_rate": 1.936726898714081e-05, "loss": 34.5541, "step": 16529 }, { "epoch": 43.65797292835919, "grad_norm": 2149.15234375, "learning_rate": 1.9351149688741936e-05, "loss": 35.3515, "step": 16530 }, { "epoch": 43.66061406404754, "grad_norm": 1559.165283203125, "learning_rate": 1.933503683107671e-05, "loss": 42.2059, "step": 16531 }, { "epoch": 43.663255199735886, "grad_norm": 1013.6947021484375, "learning_rate": 1.9318930414595093e-05, "loss": 38.4706, "step": 16532 }, { "epoch": 43.665896335424236, "grad_norm": 1286.8292236328125, "learning_rate": 1.9302830439746844e-05, "loss": 39.8744, "step": 16533 }, { "epoch": 43.66853747111258, "grad_norm": 687.2779541015625, "learning_rate": 1.9286736906981546e-05, "loss": 38.6061, "step": 16534 }, { "epoch": 43.67117860680092, "grad_norm": 1112.802734375, "learning_rate": 1.927064981674853e-05, "loss": 40.5064, "step": 16535 }, { "epoch": 43.67381974248927, "grad_norm": 3508.623291015625, "learning_rate": 1.925456916949714e-05, "loss": 41.0793, "step": 16536 }, { "epoch": 43.676460878177615, "grad_norm": 863.8665771484375, "learning_rate": 1.9238494965676367e-05, "loss": 41.6197, "step": 16537 }, { "epoch": 43.679102013865965, "grad_norm": 1017.0570678710938, "learning_rate": 1.9222427205735055e-05, "loss": 41.4054, "step": 16538 }, { "epoch": 43.68174314955431, "grad_norm": 1257.744873046875, "learning_rate": 1.920636589012187e-05, "loss": 40.1556, "step": 16539 }, { "epoch": 43.68438428524266, "grad_norm": 1071.1055908203125, "learning_rate": 1.9190311019285366e-05, "loss": 38.5795, "step": 16540 }, { "epoch": 43.687025420931, "grad_norm": 2624.572265625, "learning_rate": 1.917426259367386e-05, "loss": 38.1078, "step": 16541 }, { "epoch": 43.68966655661934, "grad_norm": 645.84423828125, "learning_rate": 1.9158220613735398e-05, "loss": 38.3177, "step": 16542 }, { "epoch": 43.69230769230769, "grad_norm": 818.8929443359375, "learning_rate": 1.9142185079918108e-05, "loss": 36.4738, "step": 16543 }, { "epoch": 43.694948827996036, "grad_norm": 780.9986572265625, "learning_rate": 1.9126155992669648e-05, "loss": 36.0349, "step": 16544 }, { "epoch": 43.697589963684386, "grad_norm": 1012.9498291015625, "learning_rate": 1.9110133352437665e-05, "loss": 35.9924, "step": 16545 }, { "epoch": 43.70023109937273, "grad_norm": 987.1205444335938, "learning_rate": 1.9094117159669604e-05, "loss": 35.7042, "step": 16546 }, { "epoch": 43.70287223506108, "grad_norm": 1169.1273193359375, "learning_rate": 1.907810741481264e-05, "loss": 35.8547, "step": 16547 }, { "epoch": 43.70551337074942, "grad_norm": 875.465576171875, "learning_rate": 1.906210411831391e-05, "loss": 35.1479, "step": 16548 }, { "epoch": 43.708154506437765, "grad_norm": 1193.337158203125, "learning_rate": 1.90461072706202e-05, "loss": 35.498, "step": 16549 }, { "epoch": 43.710795642126115, "grad_norm": 561.52490234375, "learning_rate": 1.9030116872178316e-05, "loss": 35.4, "step": 16550 }, { "epoch": 43.71343677781446, "grad_norm": 1643.0948486328125, "learning_rate": 1.9014132923434762e-05, "loss": 35.2984, "step": 16551 }, { "epoch": 43.71607791350281, "grad_norm": 1714.8460693359375, "learning_rate": 1.8998155424835796e-05, "loss": 36.3968, "step": 16552 }, { "epoch": 43.71871904919115, "grad_norm": 13298.32421875, "learning_rate": 1.8982184376827667e-05, "loss": 36.0135, "step": 16553 }, { "epoch": 43.7213601848795, "grad_norm": 425148.875, "learning_rate": 1.8966219779856353e-05, "loss": 11.1702, "step": 16554 }, { "epoch": 43.724001320567844, "grad_norm": 995.6235961914062, "learning_rate": 1.8950261634367637e-05, "loss": 10.1295, "step": 16555 }, { "epoch": 43.72664245625619, "grad_norm": 4895.94482421875, "learning_rate": 1.893430994080714e-05, "loss": 10.2975, "step": 16556 }, { "epoch": 43.72928359194454, "grad_norm": 3508.703369140625, "learning_rate": 1.89183646996203e-05, "loss": 11.9564, "step": 16557 }, { "epoch": 43.73192472763288, "grad_norm": 1013.960205078125, "learning_rate": 1.890242591125238e-05, "loss": 13.8684, "step": 16558 }, { "epoch": 43.73456586332123, "grad_norm": 3191.200927734375, "learning_rate": 1.8886493576148413e-05, "loss": 10.2216, "step": 16559 }, { "epoch": 43.73720699900957, "grad_norm": 2253.966796875, "learning_rate": 1.88705676947534e-05, "loss": 8.6407, "step": 16560 }, { "epoch": 43.73984813469792, "grad_norm": 795.2159423828125, "learning_rate": 1.8854648267512014e-05, "loss": 12.2978, "step": 16561 }, { "epoch": 43.742489270386265, "grad_norm": 28352.06640625, "learning_rate": 1.8838735294868793e-05, "loss": 12.8526, "step": 16562 }, { "epoch": 43.745130406074615, "grad_norm": 4228.95654296875, "learning_rate": 1.8822828777268097e-05, "loss": 32.542, "step": 16563 }, { "epoch": 43.74777154176296, "grad_norm": 1390.7340087890625, "learning_rate": 1.8806928715154103e-05, "loss": 34.759, "step": 16564 }, { "epoch": 43.7504126774513, "grad_norm": 4274.55908203125, "learning_rate": 1.8791035108970815e-05, "loss": 35.0128, "step": 16565 }, { "epoch": 43.75305381313965, "grad_norm": 760.8449096679688, "learning_rate": 1.877514795916199e-05, "loss": 34.087, "step": 16566 }, { "epoch": 43.755694948827994, "grad_norm": 1026.4757080078125, "learning_rate": 1.8759267266171415e-05, "loss": 34.144, "step": 16567 }, { "epoch": 43.758336084516344, "grad_norm": 922.16015625, "learning_rate": 1.8743393030442423e-05, "loss": 33.7703, "step": 16568 }, { "epoch": 43.76097722020469, "grad_norm": 1545.56787109375, "learning_rate": 1.8727525252418358e-05, "loss": 35.1061, "step": 16569 }, { "epoch": 43.76361835589304, "grad_norm": 1151.13525390625, "learning_rate": 1.8711663932542256e-05, "loss": 33.7711, "step": 16570 }, { "epoch": 43.76625949158138, "grad_norm": 3868.90966796875, "learning_rate": 1.869580907125709e-05, "loss": 34.8751, "step": 16571 }, { "epoch": 43.76890062726972, "grad_norm": 2220.75537109375, "learning_rate": 1.8679960669005597e-05, "loss": 34.0789, "step": 16572 }, { "epoch": 43.77154176295807, "grad_norm": 1344.7847900390625, "learning_rate": 1.8664118726230307e-05, "loss": 33.7942, "step": 16573 }, { "epoch": 43.774182898646416, "grad_norm": 1257.827880859375, "learning_rate": 1.8648283243373615e-05, "loss": 34.6419, "step": 16574 }, { "epoch": 43.776824034334766, "grad_norm": 1211.8822021484375, "learning_rate": 1.8632454220877698e-05, "loss": 35.0401, "step": 16575 }, { "epoch": 43.77946517002311, "grad_norm": 1007.7144775390625, "learning_rate": 1.861663165918459e-05, "loss": 34.226, "step": 16576 }, { "epoch": 43.78210630571146, "grad_norm": 605.3123168945312, "learning_rate": 1.8600815558736106e-05, "loss": 33.8834, "step": 16577 }, { "epoch": 43.7847474413998, "grad_norm": 590.4768676757812, "learning_rate": 1.858500591997392e-05, "loss": 37.2789, "step": 16578 }, { "epoch": 43.78738857708815, "grad_norm": 1095.6297607421875, "learning_rate": 1.8569202743339537e-05, "loss": 36.7388, "step": 16579 }, { "epoch": 43.790029712776494, "grad_norm": 2258.235595703125, "learning_rate": 1.8553406029274188e-05, "loss": 38.8035, "step": 16580 }, { "epoch": 43.79267084846484, "grad_norm": 10726.71875, "learning_rate": 1.8537615778219026e-05, "loss": 40.587, "step": 16581 }, { "epoch": 43.79531198415319, "grad_norm": 572.813232421875, "learning_rate": 1.8521831990614967e-05, "loss": 38.5845, "step": 16582 }, { "epoch": 43.79795311984153, "grad_norm": 869.0784301757812, "learning_rate": 1.8506054666902805e-05, "loss": 39.5357, "step": 16583 }, { "epoch": 43.80059425552988, "grad_norm": 783.5121459960938, "learning_rate": 1.849028380752299e-05, "loss": 39.6788, "step": 16584 }, { "epoch": 43.80323539121822, "grad_norm": 1416.765380859375, "learning_rate": 1.8474519412916086e-05, "loss": 41.706, "step": 16585 }, { "epoch": 43.80587652690657, "grad_norm": 1748.34619140625, "learning_rate": 1.8458761483522212e-05, "loss": 41.0159, "step": 16586 }, { "epoch": 43.808517662594916, "grad_norm": 1399.338134765625, "learning_rate": 1.8443010019781353e-05, "loss": 42.4832, "step": 16587 }, { "epoch": 43.81115879828326, "grad_norm": 1058.287353515625, "learning_rate": 1.8427265022133462e-05, "loss": 40.5938, "step": 16588 }, { "epoch": 43.81379993397161, "grad_norm": 1333.7232666015625, "learning_rate": 1.8411526491018155e-05, "loss": 38.544, "step": 16589 }, { "epoch": 43.81644106965995, "grad_norm": 578.40478515625, "learning_rate": 1.8395794426874947e-05, "loss": 39.6889, "step": 16590 }, { "epoch": 43.8190822053483, "grad_norm": 1525.2064208984375, "learning_rate": 1.838006883014312e-05, "loss": 36.842, "step": 16591 }, { "epoch": 43.821723341036645, "grad_norm": 863.0759887695312, "learning_rate": 1.83643497012618e-05, "loss": 38.1327, "step": 16592 }, { "epoch": 43.824364476724995, "grad_norm": 610.6430053710938, "learning_rate": 1.834863704066997e-05, "loss": 38.2574, "step": 16593 }, { "epoch": 43.82700561241334, "grad_norm": 1364.6240234375, "learning_rate": 1.83329308488063e-05, "loss": 37.2649, "step": 16594 }, { "epoch": 43.82964674810168, "grad_norm": 1060.14208984375, "learning_rate": 1.83172311261095e-05, "loss": 35.3416, "step": 16595 }, { "epoch": 43.83228788379003, "grad_norm": 582.5064086914062, "learning_rate": 1.830153787301794e-05, "loss": 35.8955, "step": 16596 }, { "epoch": 43.83492901947837, "grad_norm": 724.1063232421875, "learning_rate": 1.8285851089969803e-05, "loss": 35.2077, "step": 16597 }, { "epoch": 43.83757015516672, "grad_norm": 685.9267578125, "learning_rate": 1.827017077740317e-05, "loss": 35.3645, "step": 16598 }, { "epoch": 43.840211290855066, "grad_norm": 546.52490234375, "learning_rate": 1.82544969357559e-05, "loss": 35.1821, "step": 16599 }, { "epoch": 43.842852426543416, "grad_norm": 607.3004150390625, "learning_rate": 1.8238829565465658e-05, "loss": 34.5818, "step": 16600 }, { "epoch": 43.842852426543416, "eval_loss": 3.718729257583618, "eval_runtime": 2.1153, "eval_samples_per_second": 234.005, "eval_steps_per_second": 29.31, "step": 16600 }, { "epoch": 43.84549356223176, "grad_norm": 1369.3485107421875, "learning_rate": 1.822316866696991e-05, "loss": 36.1053, "step": 16601 }, { "epoch": 43.8481346979201, "grad_norm": 679.9463500976562, "learning_rate": 1.8207514240706075e-05, "loss": 36.6198, "step": 16602 }, { "epoch": 43.85077583360845, "grad_norm": 4239.15380859375, "learning_rate": 1.819186628711125e-05, "loss": 23.3735, "step": 16603 }, { "epoch": 43.853416969296795, "grad_norm": 895.9826049804688, "learning_rate": 1.817622480662237e-05, "loss": 11.2705, "step": 16604 }, { "epoch": 43.856058104985145, "grad_norm": 3139.19287109375, "learning_rate": 1.8160589799676193e-05, "loss": 13.494, "step": 16605 }, { "epoch": 43.85869924067349, "grad_norm": 1558.218994140625, "learning_rate": 1.814496126670942e-05, "loss": 10.981, "step": 16606 }, { "epoch": 43.86134037636184, "grad_norm": 1009.1492919921875, "learning_rate": 1.8129339208158373e-05, "loss": 9.2123, "step": 16607 }, { "epoch": 43.86398151205018, "grad_norm": 1516.873779296875, "learning_rate": 1.8113723624459345e-05, "loss": 10.8821, "step": 16608 }, { "epoch": 43.86662264773853, "grad_norm": 1351.6484375, "learning_rate": 1.8098114516048342e-05, "loss": 8.6945, "step": 16609 }, { "epoch": 43.869263783426874, "grad_norm": 1882.7979736328125, "learning_rate": 1.8082511883361267e-05, "loss": 8.7942, "step": 16610 }, { "epoch": 43.87190491911522, "grad_norm": 2663.004638671875, "learning_rate": 1.806691572683383e-05, "loss": 13.0426, "step": 16611 }, { "epoch": 43.87454605480357, "grad_norm": 1699.3856201171875, "learning_rate": 1.805132604690149e-05, "loss": 11.365, "step": 16612 }, { "epoch": 43.87718719049191, "grad_norm": 1207.4462890625, "learning_rate": 1.8035742843999643e-05, "loss": 31.2744, "step": 16613 }, { "epoch": 43.87982832618026, "grad_norm": 1535.1024169921875, "learning_rate": 1.8020166118563414e-05, "loss": 33.7252, "step": 16614 }, { "epoch": 43.8824694618686, "grad_norm": 1269.5369873046875, "learning_rate": 1.800459587102779e-05, "loss": 34.197, "step": 16615 }, { "epoch": 43.88511059755695, "grad_norm": 493.756591796875, "learning_rate": 1.7989032101827534e-05, "loss": 36.0369, "step": 16616 }, { "epoch": 43.887751733245295, "grad_norm": 1277.974609375, "learning_rate": 1.7973474811397243e-05, "loss": 34.5452, "step": 16617 }, { "epoch": 43.89039286893364, "grad_norm": 1138.423095703125, "learning_rate": 1.795792400017135e-05, "loss": 34.375, "step": 16618 }, { "epoch": 43.89303400462199, "grad_norm": 1434.9833984375, "learning_rate": 1.7942379668584168e-05, "loss": 34.8716, "step": 16619 }, { "epoch": 43.89567514031033, "grad_norm": 788.7632446289062, "learning_rate": 1.7926841817069717e-05, "loss": 36.385, "step": 16620 }, { "epoch": 43.89831627599868, "grad_norm": 867.1302490234375, "learning_rate": 1.7911310446061868e-05, "loss": 34.1817, "step": 16621 }, { "epoch": 43.900957411687024, "grad_norm": 2208.131103515625, "learning_rate": 1.7895785555994305e-05, "loss": 33.9304, "step": 16622 }, { "epoch": 43.903598547375374, "grad_norm": 1481.4432373046875, "learning_rate": 1.788026714730062e-05, "loss": 34.5787, "step": 16623 }, { "epoch": 43.90623968306372, "grad_norm": 1106.4405517578125, "learning_rate": 1.7864755220414113e-05, "loss": 35.0968, "step": 16624 }, { "epoch": 43.90888081875207, "grad_norm": 830.1920166015625, "learning_rate": 1.784924977576796e-05, "loss": 33.0259, "step": 16625 }, { "epoch": 43.91152195444041, "grad_norm": 1957.209716796875, "learning_rate": 1.7833750813795126e-05, "loss": 32.4917, "step": 16626 }, { "epoch": 43.91416309012875, "grad_norm": 1421.273193359375, "learning_rate": 1.781825833492845e-05, "loss": 34.7829, "step": 16627 }, { "epoch": 43.9168042258171, "grad_norm": 1423.9984130859375, "learning_rate": 1.780277233960048e-05, "loss": 34.5826, "step": 16628 }, { "epoch": 43.919445361505446, "grad_norm": 2899.793701171875, "learning_rate": 1.778729282824368e-05, "loss": 37.2059, "step": 16629 }, { "epoch": 43.922086497193796, "grad_norm": 9307.072265625, "learning_rate": 1.7771819801290336e-05, "loss": 36.7131, "step": 16630 }, { "epoch": 43.92472763288214, "grad_norm": 2139.754150390625, "learning_rate": 1.775635325917252e-05, "loss": 39.5548, "step": 16631 }, { "epoch": 43.92736876857049, "grad_norm": 661.3855590820312, "learning_rate": 1.7740893202322082e-05, "loss": 39.5053, "step": 16632 }, { "epoch": 43.93000990425883, "grad_norm": 1031.210205078125, "learning_rate": 1.772543963117082e-05, "loss": 37.1094, "step": 16633 }, { "epoch": 43.932651039947174, "grad_norm": 1134.432373046875, "learning_rate": 1.7709992546150188e-05, "loss": 38.8115, "step": 16634 }, { "epoch": 43.935292175635524, "grad_norm": 1111.7744140625, "learning_rate": 1.769455194769157e-05, "loss": 41.4278, "step": 16635 }, { "epoch": 43.93793331132387, "grad_norm": 490.5683898925781, "learning_rate": 1.7679117836226055e-05, "loss": 41.2479, "step": 16636 }, { "epoch": 43.94057444701222, "grad_norm": 1301.203369140625, "learning_rate": 1.766369021218478e-05, "loss": 37.7773, "step": 16637 }, { "epoch": 43.94321558270056, "grad_norm": 672.5255126953125, "learning_rate": 1.7648269075998453e-05, "loss": 37.5772, "step": 16638 }, { "epoch": 43.94585671838891, "grad_norm": 865.49462890625, "learning_rate": 1.763285442809767e-05, "loss": 36.5047, "step": 16639 }, { "epoch": 43.94849785407725, "grad_norm": 1352.7557373046875, "learning_rate": 1.7617446268912985e-05, "loss": 36.106, "step": 16640 }, { "epoch": 43.951138989765596, "grad_norm": 345.5325927734375, "learning_rate": 1.76020445988746e-05, "loss": 35.2162, "step": 16641 }, { "epoch": 43.953780125453946, "grad_norm": 547.3763427734375, "learning_rate": 1.758664941841259e-05, "loss": 34.1525, "step": 16642 }, { "epoch": 43.95642126114229, "grad_norm": 507.68096923828125, "learning_rate": 1.7571260727956865e-05, "loss": 34.9873, "step": 16643 }, { "epoch": 43.95906239683064, "grad_norm": 2870.2578125, "learning_rate": 1.7555878527937163e-05, "loss": 43.0434, "step": 16644 }, { "epoch": 43.96170353251898, "grad_norm": 1586.92724609375, "learning_rate": 1.7540502818783e-05, "loss": 20.238, "step": 16645 }, { "epoch": 43.96434466820733, "grad_norm": 1605.9173583984375, "learning_rate": 1.7525133600923726e-05, "loss": 10.2196, "step": 16646 }, { "epoch": 43.966985803895675, "grad_norm": 2503.33056640625, "learning_rate": 1.7509770874788554e-05, "loss": 16.5703, "step": 16647 }, { "epoch": 43.96962693958402, "grad_norm": 4292.90625, "learning_rate": 1.749441464080645e-05, "loss": 15.6954, "step": 16648 }, { "epoch": 43.97226807527237, "grad_norm": 2781.2607421875, "learning_rate": 1.7479064899406233e-05, "loss": 13.7911, "step": 16649 }, { "epoch": 43.97490921096071, "grad_norm": 894.0914306640625, "learning_rate": 1.746372165101656e-05, "loss": 20.5611, "step": 16650 }, { "epoch": 43.97755034664906, "grad_norm": 1968.9366455078125, "learning_rate": 1.7448384896065845e-05, "loss": 34.8503, "step": 16651 }, { "epoch": 43.9801914823374, "grad_norm": 728.3733520507812, "learning_rate": 1.7433054634982377e-05, "loss": 35.3426, "step": 16652 }, { "epoch": 43.98283261802575, "grad_norm": 770.0531616210938, "learning_rate": 1.7417730868194205e-05, "loss": 33.8882, "step": 16653 }, { "epoch": 43.985473753714096, "grad_norm": 1140.426025390625, "learning_rate": 1.7402413596129317e-05, "loss": 33.9896, "step": 16654 }, { "epoch": 43.988114889402446, "grad_norm": 920.3756713867188, "learning_rate": 1.738710281921538e-05, "loss": 34.9401, "step": 16655 }, { "epoch": 43.99075602509079, "grad_norm": 1310.3538818359375, "learning_rate": 1.737179853787996e-05, "loss": 35.7627, "step": 16656 }, { "epoch": 43.99339716077913, "grad_norm": 1948.8018798828125, "learning_rate": 1.735650075255038e-05, "loss": 35.0647, "step": 16657 }, { "epoch": 43.99603829646748, "grad_norm": 936.5068969726562, "learning_rate": 1.734120946365389e-05, "loss": 34.6764, "step": 16658 }, { "epoch": 43.998679432155825, "grad_norm": 1663.04638671875, "learning_rate": 1.732592467161745e-05, "loss": 34.8536, "step": 16659 }, { "epoch": 44.001320567844175, "grad_norm": 8846.3125, "learning_rate": 1.7310646376867885e-05, "loss": 38.3131, "step": 16660 }, { "epoch": 44.00396170353252, "grad_norm": 1098.3961181640625, "learning_rate": 1.7295374579831823e-05, "loss": 37.2869, "step": 16661 }, { "epoch": 44.00660283922087, "grad_norm": 853.5191040039062, "learning_rate": 1.7280109280935735e-05, "loss": 39.0413, "step": 16662 }, { "epoch": 44.00924397490921, "grad_norm": 936.4254760742188, "learning_rate": 1.7264850480605886e-05, "loss": 38.6739, "step": 16663 }, { "epoch": 44.011885110597554, "grad_norm": 845.1744384765625, "learning_rate": 1.724959817926833e-05, "loss": 39.3879, "step": 16664 }, { "epoch": 44.014526246285904, "grad_norm": 950.6748046875, "learning_rate": 1.7234352377349076e-05, "loss": 40.3899, "step": 16665 }, { "epoch": 44.01716738197425, "grad_norm": 1523.791015625, "learning_rate": 1.7219113075273766e-05, "loss": 41.2535, "step": 16666 }, { "epoch": 44.0198085176626, "grad_norm": 1132.2681884765625, "learning_rate": 1.7203880273467977e-05, "loss": 41.0452, "step": 16667 }, { "epoch": 44.02244965335094, "grad_norm": 2445.89111328125, "learning_rate": 1.718865397235714e-05, "loss": 40.4322, "step": 16668 }, { "epoch": 44.02509078903929, "grad_norm": 1710.9407958984375, "learning_rate": 1.717343417236633e-05, "loss": 37.6782, "step": 16669 }, { "epoch": 44.02773192472763, "grad_norm": 1748.498046875, "learning_rate": 1.7158220873920632e-05, "loss": 36.7788, "step": 16670 }, { "epoch": 44.03037306041598, "grad_norm": 807.9503784179688, "learning_rate": 1.7143014077444758e-05, "loss": 38.5112, "step": 16671 }, { "epoch": 44.033014196104325, "grad_norm": 762.3264770507812, "learning_rate": 1.7127813783363504e-05, "loss": 35.926, "step": 16672 }, { "epoch": 44.03565533179267, "grad_norm": 964.5203247070312, "learning_rate": 1.711261999210123e-05, "loss": 36.4949, "step": 16673 }, { "epoch": 44.03829646748102, "grad_norm": 5894.779296875, "learning_rate": 1.7097432704082206e-05, "loss": 35.8432, "step": 16674 }, { "epoch": 44.04093760316936, "grad_norm": 708.327880859375, "learning_rate": 1.708225191973059e-05, "loss": 35.2829, "step": 16675 }, { "epoch": 44.04357873885771, "grad_norm": 856.0765380859375, "learning_rate": 1.7067077639470297e-05, "loss": 35.3096, "step": 16676 }, { "epoch": 44.046219874546054, "grad_norm": 672.8444213867188, "learning_rate": 1.7051909863724985e-05, "loss": 35.226, "step": 16677 }, { "epoch": 44.048861010234404, "grad_norm": 1006.88232421875, "learning_rate": 1.703674859291829e-05, "loss": 34.5383, "step": 16678 }, { "epoch": 44.05150214592275, "grad_norm": 1412.85888671875, "learning_rate": 1.7021593827473507e-05, "loss": 35.1577, "step": 16679 }, { "epoch": 44.05414328161109, "grad_norm": 1424.78271484375, "learning_rate": 1.700644556781389e-05, "loss": 34.3376, "step": 16680 }, { "epoch": 44.05678441729944, "grad_norm": 796.2548828125, "learning_rate": 1.699130381436234e-05, "loss": 35.5188, "step": 16681 }, { "epoch": 44.05942555298778, "grad_norm": 892.5914916992188, "learning_rate": 1.6976168567541807e-05, "loss": 38.666, "step": 16682 }, { "epoch": 44.06206668867613, "grad_norm": 4208.8671875, "learning_rate": 1.6961039827774887e-05, "loss": 21.8287, "step": 16683 }, { "epoch": 44.064707824364476, "grad_norm": 4276.61962890625, "learning_rate": 1.694591759548403e-05, "loss": 11.5101, "step": 16684 }, { "epoch": 44.067348960052826, "grad_norm": 2330.813232421875, "learning_rate": 1.693080187109153e-05, "loss": 10.7464, "step": 16685 }, { "epoch": 44.06999009574117, "grad_norm": 4502.1708984375, "learning_rate": 1.6915692655019465e-05, "loss": 9.6141, "step": 16686 }, { "epoch": 44.07263123142951, "grad_norm": 3226.721435546875, "learning_rate": 1.6900589947689754e-05, "loss": 10.4861, "step": 16687 }, { "epoch": 44.07527236711786, "grad_norm": 6427.16650390625, "learning_rate": 1.688549374952411e-05, "loss": 15.6111, "step": 16688 }, { "epoch": 44.077913502806204, "grad_norm": 3295.49560546875, "learning_rate": 1.687040406094417e-05, "loss": 9.3621, "step": 16689 }, { "epoch": 44.080554638494554, "grad_norm": 2488.560302734375, "learning_rate": 1.6855320882371212e-05, "loss": 12.2395, "step": 16690 }, { "epoch": 44.0831957741829, "grad_norm": 4656.5859375, "learning_rate": 1.6840244214226503e-05, "loss": 10.5631, "step": 16691 }, { "epoch": 44.08583690987125, "grad_norm": 1878.3948974609375, "learning_rate": 1.682517405693093e-05, "loss": 25.844, "step": 16692 }, { "epoch": 44.08847804555959, "grad_norm": 1265.65234375, "learning_rate": 1.6810110410905465e-05, "loss": 35.6391, "step": 16693 }, { "epoch": 44.09111918124793, "grad_norm": 3277.09375, "learning_rate": 1.679505327657066e-05, "loss": 33.8667, "step": 16694 }, { "epoch": 44.09376031693628, "grad_norm": 1550.3089599609375, "learning_rate": 1.6780002654347037e-05, "loss": 33.857, "step": 16695 }, { "epoch": 44.096401452624626, "grad_norm": 1820.5242919921875, "learning_rate": 1.676495854465479e-05, "loss": 33.8827, "step": 16696 }, { "epoch": 44.099042588312976, "grad_norm": 2458.225830078125, "learning_rate": 1.674992094791411e-05, "loss": 33.4716, "step": 16697 }, { "epoch": 44.10168372400132, "grad_norm": 996.697998046875, "learning_rate": 1.6734889864544857e-05, "loss": 35.207, "step": 16698 }, { "epoch": 44.10432485968967, "grad_norm": 1086.760498046875, "learning_rate": 1.6719865294966718e-05, "loss": 34.1759, "step": 16699 }, { "epoch": 44.10696599537801, "grad_norm": 893.7155151367188, "learning_rate": 1.6704847239599364e-05, "loss": 34.8187, "step": 16700 }, { "epoch": 44.10960713106636, "grad_norm": 2498.740478515625, "learning_rate": 1.6689835698862094e-05, "loss": 34.6765, "step": 16701 }, { "epoch": 44.112248266754705, "grad_norm": 1931.1419677734375, "learning_rate": 1.6674830673174075e-05, "loss": 35.3165, "step": 16702 }, { "epoch": 44.11488940244305, "grad_norm": 1022.0126342773438, "learning_rate": 1.6659832162954414e-05, "loss": 35.7067, "step": 16703 }, { "epoch": 44.1175305381314, "grad_norm": 647.4356079101562, "learning_rate": 1.664484016862186e-05, "loss": 34.2666, "step": 16704 }, { "epoch": 44.12017167381974, "grad_norm": 1327.2601318359375, "learning_rate": 1.6629854690595024e-05, "loss": 34.3449, "step": 16705 }, { "epoch": 44.12281280950809, "grad_norm": 829.0723266601562, "learning_rate": 1.6614875729292407e-05, "loss": 33.5428, "step": 16706 }, { "epoch": 44.12545394519643, "grad_norm": 899.996826171875, "learning_rate": 1.6599903285132305e-05, "loss": 35.4322, "step": 16707 }, { "epoch": 44.12809508088478, "grad_norm": 1801.530517578125, "learning_rate": 1.6584937358532782e-05, "loss": 36.5626, "step": 16708 }, { "epoch": 44.130736216573126, "grad_norm": 828.921875, "learning_rate": 1.6569977949911746e-05, "loss": 39.8491, "step": 16709 }, { "epoch": 44.13337735226147, "grad_norm": 1668.55908203125, "learning_rate": 1.655502505968701e-05, "loss": 40.4464, "step": 16710 }, { "epoch": 44.13601848794982, "grad_norm": 641.7544555664062, "learning_rate": 1.6540078688276033e-05, "loss": 38.422, "step": 16711 }, { "epoch": 44.13865962363816, "grad_norm": 1493.2841796875, "learning_rate": 1.6525138836096244e-05, "loss": 38.5035, "step": 16712 }, { "epoch": 44.14130075932651, "grad_norm": 1575.3236083984375, "learning_rate": 1.6510205503564774e-05, "loss": 38.9339, "step": 16713 }, { "epoch": 44.143941895014855, "grad_norm": 727.3439331054688, "learning_rate": 1.6495278691098682e-05, "loss": 39.6393, "step": 16714 }, { "epoch": 44.146583030703205, "grad_norm": 531.9737548828125, "learning_rate": 1.6480358399114743e-05, "loss": 42.5096, "step": 16715 }, { "epoch": 44.14922416639155, "grad_norm": 661.5725708007812, "learning_rate": 1.6465444628029597e-05, "loss": 40.4549, "step": 16716 }, { "epoch": 44.1518653020799, "grad_norm": 1142.1336669921875, "learning_rate": 1.6450537378259746e-05, "loss": 38.4825, "step": 16717 }, { "epoch": 44.15450643776824, "grad_norm": 759.25830078125, "learning_rate": 1.6435636650221465e-05, "loss": 39.4154, "step": 16718 }, { "epoch": 44.157147573456584, "grad_norm": 808.026611328125, "learning_rate": 1.642074244433081e-05, "loss": 38.8553, "step": 16719 }, { "epoch": 44.159788709144934, "grad_norm": 755.4369506835938, "learning_rate": 1.640585476100373e-05, "loss": 39.6233, "step": 16720 }, { "epoch": 44.16242984483328, "grad_norm": 1146.6646728515625, "learning_rate": 1.6390973600655917e-05, "loss": 36.5473, "step": 16721 }, { "epoch": 44.16507098052163, "grad_norm": 1330.263916015625, "learning_rate": 1.637609896370293e-05, "loss": 35.4985, "step": 16722 }, { "epoch": 44.16771211620997, "grad_norm": 1891.028564453125, "learning_rate": 1.636123085056013e-05, "loss": 38.02, "step": 16723 }, { "epoch": 44.17035325189832, "grad_norm": 792.3671875, "learning_rate": 1.6346369261642736e-05, "loss": 35.9997, "step": 16724 }, { "epoch": 44.17299438758666, "grad_norm": 796.1390380859375, "learning_rate": 1.6331514197365727e-05, "loss": 35.9787, "step": 16725 }, { "epoch": 44.175635523275005, "grad_norm": 818.363525390625, "learning_rate": 1.631666565814391e-05, "loss": 36.1619, "step": 16726 }, { "epoch": 44.178276658963355, "grad_norm": 665.99462890625, "learning_rate": 1.6301823644391895e-05, "loss": 35.7697, "step": 16727 }, { "epoch": 44.1809177946517, "grad_norm": 1422.7874755859375, "learning_rate": 1.6286988156524213e-05, "loss": 34.8775, "step": 16728 }, { "epoch": 44.18355893034005, "grad_norm": 1299.538818359375, "learning_rate": 1.6272159194955117e-05, "loss": 34.4147, "step": 16729 }, { "epoch": 44.18620006602839, "grad_norm": 665.303466796875, "learning_rate": 1.6257336760098635e-05, "loss": 35.3161, "step": 16730 }, { "epoch": 44.18884120171674, "grad_norm": 1123.95947265625, "learning_rate": 1.624252085236874e-05, "loss": 37.0544, "step": 16731 }, { "epoch": 44.191482337405084, "grad_norm": 2046.802734375, "learning_rate": 1.6227711472179134e-05, "loss": 24.6259, "step": 16732 }, { "epoch": 44.19412347309343, "grad_norm": 2967.2080078125, "learning_rate": 1.6212908619943374e-05, "loss": 11.2079, "step": 16733 }, { "epoch": 44.19676460878178, "grad_norm": 833.5848388671875, "learning_rate": 1.6198112296074762e-05, "loss": 15.754, "step": 16734 }, { "epoch": 44.19940574447012, "grad_norm": 1320.249267578125, "learning_rate": 1.618332250098653e-05, "loss": 10.5538, "step": 16735 }, { "epoch": 44.20204688015847, "grad_norm": 1139.322998046875, "learning_rate": 1.6168539235091707e-05, "loss": 12.6097, "step": 16736 }, { "epoch": 44.20468801584681, "grad_norm": 2545.564208984375, "learning_rate": 1.6153762498803016e-05, "loss": 11.5706, "step": 16737 }, { "epoch": 44.20732915153516, "grad_norm": 931.117919921875, "learning_rate": 1.6138992292533183e-05, "loss": 16.6008, "step": 16738 }, { "epoch": 44.209970287223506, "grad_norm": 281.563232421875, "learning_rate": 1.6124228616694603e-05, "loss": 8.2952, "step": 16739 }, { "epoch": 44.21261142291185, "grad_norm": 1080.2532958984375, "learning_rate": 1.6109471471699556e-05, "loss": 10.4316, "step": 16740 }, { "epoch": 44.2152525586002, "grad_norm": 37631.30859375, "learning_rate": 1.609472085796007e-05, "loss": 8.5448, "step": 16741 }, { "epoch": 44.21789369428854, "grad_norm": 2140.31689453125, "learning_rate": 1.6079976775888156e-05, "loss": 27.5985, "step": 16742 }, { "epoch": 44.22053482997689, "grad_norm": 2135.93701171875, "learning_rate": 1.6065239225895478e-05, "loss": 34.7165, "step": 16743 }, { "epoch": 44.223175965665234, "grad_norm": 876.7875366210938, "learning_rate": 1.6050508208393517e-05, "loss": 34.4921, "step": 16744 }, { "epoch": 44.225817101353584, "grad_norm": 1833.1214599609375, "learning_rate": 1.6035783723793746e-05, "loss": 35.5492, "step": 16745 }, { "epoch": 44.22845823704193, "grad_norm": 1827.361328125, "learning_rate": 1.6021065772507254e-05, "loss": 33.6899, "step": 16746 }, { "epoch": 44.23109937273028, "grad_norm": 3180.514892578125, "learning_rate": 1.6006354354945074e-05, "loss": 34.1876, "step": 16747 }, { "epoch": 44.23374050841862, "grad_norm": 930.4688110351562, "learning_rate": 1.599164947151796e-05, "loss": 34.6246, "step": 16748 }, { "epoch": 44.23638164410696, "grad_norm": 922.0515747070312, "learning_rate": 1.5976951122636614e-05, "loss": 35.3682, "step": 16749 }, { "epoch": 44.23902277979531, "grad_norm": 3358.026611328125, "learning_rate": 1.5962259308711396e-05, "loss": 34.3529, "step": 16750 }, { "epoch": 44.241663915483656, "grad_norm": 1103.3515625, "learning_rate": 1.5947574030152596e-05, "loss": 35.0, "step": 16751 }, { "epoch": 44.244305051172006, "grad_norm": 1719.4544677734375, "learning_rate": 1.5932895287370324e-05, "loss": 34.1971, "step": 16752 }, { "epoch": 44.24694618686035, "grad_norm": 1077.2257080078125, "learning_rate": 1.5918223080774453e-05, "loss": 35.3232, "step": 16753 }, { "epoch": 44.2495873225487, "grad_norm": 1169.4288330078125, "learning_rate": 1.5903557410774676e-05, "loss": 35.2944, "step": 16754 }, { "epoch": 44.25222845823704, "grad_norm": 3857.534423828125, "learning_rate": 1.588889827778059e-05, "loss": 34.0708, "step": 16755 }, { "epoch": 44.254869593925385, "grad_norm": 957.7255859375, "learning_rate": 1.5874245682201472e-05, "loss": 34.6271, "step": 16756 }, { "epoch": 44.257510729613735, "grad_norm": 2205.4052734375, "learning_rate": 1.5859599624446526e-05, "loss": 34.9677, "step": 16757 }, { "epoch": 44.26015186530208, "grad_norm": 1831.0657958984375, "learning_rate": 1.584496010492467e-05, "loss": 35.8772, "step": 16758 }, { "epoch": 44.26279300099043, "grad_norm": 3403.920654296875, "learning_rate": 1.583032712404478e-05, "loss": 39.5124, "step": 16759 }, { "epoch": 44.26543413667877, "grad_norm": 1107.5321044921875, "learning_rate": 1.581570068221544e-05, "loss": 39.9637, "step": 16760 }, { "epoch": 44.26807527236712, "grad_norm": 727.8374633789062, "learning_rate": 1.5801080779845073e-05, "loss": 37.5801, "step": 16761 }, { "epoch": 44.27071640805546, "grad_norm": 558.85888671875, "learning_rate": 1.578646741734199e-05, "loss": 39.0215, "step": 16762 }, { "epoch": 44.27335754374381, "grad_norm": 779.6051635742188, "learning_rate": 1.5771860595114206e-05, "loss": 38.3918, "step": 16763 }, { "epoch": 44.275998679432156, "grad_norm": 1028.9251708984375, "learning_rate": 1.575726031356964e-05, "loss": 39.6374, "step": 16764 }, { "epoch": 44.2786398151205, "grad_norm": 723.7055053710938, "learning_rate": 1.574266657311596e-05, "loss": 41.9444, "step": 16765 }, { "epoch": 44.28128095080885, "grad_norm": 980.328857421875, "learning_rate": 1.572807937416071e-05, "loss": 40.8665, "step": 16766 }, { "epoch": 44.28392208649719, "grad_norm": 1607.8199462890625, "learning_rate": 1.5713498717111225e-05, "loss": 41.4485, "step": 16767 }, { "epoch": 44.28656322218554, "grad_norm": 684.9309692382812, "learning_rate": 1.5698924602374625e-05, "loss": 39.795, "step": 16768 }, { "epoch": 44.289204357873885, "grad_norm": 773.335205078125, "learning_rate": 1.568435703035795e-05, "loss": 38.1919, "step": 16769 }, { "epoch": 44.291845493562235, "grad_norm": 1133.8282470703125, "learning_rate": 1.566979600146795e-05, "loss": 38.7836, "step": 16770 }, { "epoch": 44.29448662925058, "grad_norm": 1064.1927490234375, "learning_rate": 1.565524151611128e-05, "loss": 38.5558, "step": 16771 }, { "epoch": 44.29712776493892, "grad_norm": 1138.5010986328125, "learning_rate": 1.5640693574694275e-05, "loss": 36.486, "step": 16772 }, { "epoch": 44.29976890062727, "grad_norm": 1373.94677734375, "learning_rate": 1.562615217762328e-05, "loss": 35.9832, "step": 16773 }, { "epoch": 44.302410036315614, "grad_norm": 1436.802490234375, "learning_rate": 1.5611617325304306e-05, "loss": 35.3552, "step": 16774 }, { "epoch": 44.305051172003964, "grad_norm": 4838.92529296875, "learning_rate": 1.559708901814319e-05, "loss": 35.9944, "step": 16775 }, { "epoch": 44.30769230769231, "grad_norm": 1444.14697265625, "learning_rate": 1.5582567256545692e-05, "loss": 33.2825, "step": 16776 }, { "epoch": 44.31033344338066, "grad_norm": 925.5545043945312, "learning_rate": 1.5568052040917295e-05, "loss": 34.6445, "step": 16777 }, { "epoch": 44.312974579069, "grad_norm": 1176.26806640625, "learning_rate": 1.5553543371663343e-05, "loss": 34.9819, "step": 16778 }, { "epoch": 44.31561571475734, "grad_norm": 1227.4124755859375, "learning_rate": 1.5539041249188925e-05, "loss": 35.4817, "step": 16779 }, { "epoch": 44.31825685044569, "grad_norm": 1538.53125, "learning_rate": 1.5524545673899104e-05, "loss": 34.8947, "step": 16780 }, { "epoch": 44.320897986134035, "grad_norm": 1799.910888671875, "learning_rate": 1.5510056646198588e-05, "loss": 45.5409, "step": 16781 }, { "epoch": 44.323539121822385, "grad_norm": 3390.707275390625, "learning_rate": 1.5495574166492e-05, "loss": 22.9296, "step": 16782 }, { "epoch": 44.32618025751073, "grad_norm": 2790.821533203125, "learning_rate": 1.5481098235183734e-05, "loss": 12.6976, "step": 16783 }, { "epoch": 44.32882139319908, "grad_norm": 1973.552978515625, "learning_rate": 1.546662885267805e-05, "loss": 11.0171, "step": 16784 }, { "epoch": 44.33146252888742, "grad_norm": 543.8848266601562, "learning_rate": 1.5452166019378987e-05, "loss": 8.2104, "step": 16785 }, { "epoch": 44.334103664575764, "grad_norm": 1134.0601806640625, "learning_rate": 1.5437709735690366e-05, "loss": 12.2089, "step": 16786 }, { "epoch": 44.336744800264114, "grad_norm": 2107.879638671875, "learning_rate": 1.5423260002015965e-05, "loss": 9.601, "step": 16787 }, { "epoch": 44.33938593595246, "grad_norm": 8497.810546875, "learning_rate": 1.540881681875922e-05, "loss": 13.502, "step": 16788 }, { "epoch": 44.34202707164081, "grad_norm": 1841.306640625, "learning_rate": 1.5394380186323414e-05, "loss": 10.2983, "step": 16789 }, { "epoch": 44.34466820732915, "grad_norm": 3155.2470703125, "learning_rate": 1.5379950105111783e-05, "loss": 9.7932, "step": 16790 }, { "epoch": 44.3473093430175, "grad_norm": 1129.7525634765625, "learning_rate": 1.53655265755272e-05, "loss": 15.3104, "step": 16791 }, { "epoch": 44.34995047870584, "grad_norm": 3217.025390625, "learning_rate": 1.5351109597972478e-05, "loss": 36.7816, "step": 16792 }, { "epoch": 44.35259161439419, "grad_norm": 786.9921264648438, "learning_rate": 1.5336699172850133e-05, "loss": 34.354, "step": 16793 }, { "epoch": 44.355232750082536, "grad_norm": 1965.237060546875, "learning_rate": 1.5322295300562645e-05, "loss": 34.0843, "step": 16794 }, { "epoch": 44.35787388577088, "grad_norm": 1378.11669921875, "learning_rate": 1.5307897981512192e-05, "loss": 34.6859, "step": 16795 }, { "epoch": 44.36051502145923, "grad_norm": 1118.90966796875, "learning_rate": 1.529350721610079e-05, "loss": 34.5231, "step": 16796 }, { "epoch": 44.36315615714757, "grad_norm": 2855.45458984375, "learning_rate": 1.527912300473036e-05, "loss": 33.4832, "step": 16797 }, { "epoch": 44.36579729283592, "grad_norm": 581.4411010742188, "learning_rate": 1.526474534780256e-05, "loss": 34.233, "step": 16798 }, { "epoch": 44.368438428524264, "grad_norm": 1605.2215576171875, "learning_rate": 1.5250374245718845e-05, "loss": 33.9585, "step": 16799 }, { "epoch": 44.371079564212614, "grad_norm": 2874.2822265625, "learning_rate": 1.5236009698880531e-05, "loss": 33.9313, "step": 16800 }, { "epoch": 44.371079564212614, "eval_loss": 3.766188144683838, "eval_runtime": 2.1314, "eval_samples_per_second": 232.244, "eval_steps_per_second": 29.089, "step": 16800 }, { "epoch": 44.37372069990096, "grad_norm": 588.4693603515625, "learning_rate": 1.5221651707688715e-05, "loss": 34.2367, "step": 16801 }, { "epoch": 44.3763618355893, "grad_norm": 1114.7769775390625, "learning_rate": 1.5207300272544383e-05, "loss": 35.6733, "step": 16802 }, { "epoch": 44.37900297127765, "grad_norm": 1785.8746337890625, "learning_rate": 1.519295539384824e-05, "loss": 34.2656, "step": 16803 }, { "epoch": 44.38164410696599, "grad_norm": 918.2731323242188, "learning_rate": 1.5178617072000883e-05, "loss": 34.494, "step": 16804 }, { "epoch": 44.38428524265434, "grad_norm": 1562.3984375, "learning_rate": 1.5164285307402741e-05, "loss": 34.9841, "step": 16805 }, { "epoch": 44.386926378342686, "grad_norm": 829.6256713867188, "learning_rate": 1.5149960100453968e-05, "loss": 34.2085, "step": 16806 }, { "epoch": 44.389567514031036, "grad_norm": 898.7567749023438, "learning_rate": 1.513564145155455e-05, "loss": 36.1123, "step": 16807 }, { "epoch": 44.39220864971938, "grad_norm": 879.407470703125, "learning_rate": 1.5121329361104468e-05, "loss": 34.9103, "step": 16808 }, { "epoch": 44.39484978540773, "grad_norm": 3063.07177734375, "learning_rate": 1.5107023829503242e-05, "loss": 36.4704, "step": 16809 }, { "epoch": 44.39749092109607, "grad_norm": 1216.4110107421875, "learning_rate": 1.5092724857150354e-05, "loss": 41.1622, "step": 16810 }, { "epoch": 44.400132056784415, "grad_norm": 1329.2744140625, "learning_rate": 1.5078432444445178e-05, "loss": 37.7943, "step": 16811 }, { "epoch": 44.402773192472765, "grad_norm": 863.322021484375, "learning_rate": 1.5064146591786786e-05, "loss": 37.7381, "step": 16812 }, { "epoch": 44.40541432816111, "grad_norm": 949.3975830078125, "learning_rate": 1.5049867299574082e-05, "loss": 39.2019, "step": 16813 }, { "epoch": 44.40805546384946, "grad_norm": 4110.62939453125, "learning_rate": 1.5035594568205773e-05, "loss": 41.5485, "step": 16814 }, { "epoch": 44.4106965995378, "grad_norm": 1657.67724609375, "learning_rate": 1.5021328398080485e-05, "loss": 42.0353, "step": 16815 }, { "epoch": 44.41333773522615, "grad_norm": 961.6349487304688, "learning_rate": 1.5007068789596595e-05, "loss": 41.1431, "step": 16816 }, { "epoch": 44.41597887091449, "grad_norm": 1318.7886962890625, "learning_rate": 1.4992815743152255e-05, "loss": 41.7338, "step": 16817 }, { "epoch": 44.418620006602836, "grad_norm": 1058.76416015625, "learning_rate": 1.4978569259145481e-05, "loss": 40.5967, "step": 16818 }, { "epoch": 44.421261142291186, "grad_norm": 1376.660400390625, "learning_rate": 1.4964329337974092e-05, "loss": 39.3067, "step": 16819 }, { "epoch": 44.42390227797953, "grad_norm": 755.4578857421875, "learning_rate": 1.4950095980035772e-05, "loss": 39.0675, "step": 16820 }, { "epoch": 44.42654341366788, "grad_norm": 460.30938720703125, "learning_rate": 1.4935869185727868e-05, "loss": 37.197, "step": 16821 }, { "epoch": 44.42918454935622, "grad_norm": 984.4067993164062, "learning_rate": 1.4921648955447787e-05, "loss": 37.5764, "step": 16822 }, { "epoch": 44.43182568504457, "grad_norm": 700.4166870117188, "learning_rate": 1.4907435289592568e-05, "loss": 36.8463, "step": 16823 }, { "epoch": 44.434466820732915, "grad_norm": 1086.257080078125, "learning_rate": 1.4893228188559094e-05, "loss": 35.1467, "step": 16824 }, { "epoch": 44.43710795642126, "grad_norm": 1726.1546630859375, "learning_rate": 1.4879027652744182e-05, "loss": 34.3507, "step": 16825 }, { "epoch": 44.43974909210961, "grad_norm": 902.330078125, "learning_rate": 1.4864833682544266e-05, "loss": 35.0039, "step": 16826 }, { "epoch": 44.44239022779795, "grad_norm": 902.8279418945312, "learning_rate": 1.4850646278355723e-05, "loss": 34.9301, "step": 16827 }, { "epoch": 44.4450313634863, "grad_norm": 2171.01123046875, "learning_rate": 1.4836465440574737e-05, "loss": 34.8583, "step": 16828 }, { "epoch": 44.447672499174644, "grad_norm": 980.0406494140625, "learning_rate": 1.4822291169597323e-05, "loss": 34.7251, "step": 16829 }, { "epoch": 44.450313634862994, "grad_norm": 785.17138671875, "learning_rate": 1.4808123465819306e-05, "loss": 35.2989, "step": 16830 }, { "epoch": 44.45295477055134, "grad_norm": 1640.552734375, "learning_rate": 1.479396232963623e-05, "loss": 35.3469, "step": 16831 }, { "epoch": 44.45559590623968, "grad_norm": 1680.8392333984375, "learning_rate": 1.4779807761443637e-05, "loss": 30.0542, "step": 16832 }, { "epoch": 44.45823704192803, "grad_norm": 23744.955078125, "learning_rate": 1.4765659761636713e-05, "loss": 7.9558, "step": 16833 }, { "epoch": 44.46087817761637, "grad_norm": 6417.21630859375, "learning_rate": 1.4751518330610587e-05, "loss": 10.7169, "step": 16834 }, { "epoch": 44.46351931330472, "grad_norm": 2442.617431640625, "learning_rate": 1.473738346876008e-05, "loss": 12.8591, "step": 16835 }, { "epoch": 44.466160448993065, "grad_norm": 2894.85595703125, "learning_rate": 1.472325517647996e-05, "loss": 11.0104, "step": 16836 }, { "epoch": 44.468801584681415, "grad_norm": 3537.260498046875, "learning_rate": 1.4709133454164748e-05, "loss": 13.1483, "step": 16837 }, { "epoch": 44.47144272036976, "grad_norm": 864.1000366210938, "learning_rate": 1.4695018302208708e-05, "loss": 15.0646, "step": 16838 }, { "epoch": 44.47408385605811, "grad_norm": 1108.980224609375, "learning_rate": 1.4680909721006108e-05, "loss": 11.5531, "step": 16839 }, { "epoch": 44.47672499174645, "grad_norm": 1677.389892578125, "learning_rate": 1.4666807710950857e-05, "loss": 12.6648, "step": 16840 }, { "epoch": 44.479366127434794, "grad_norm": 9218.71875, "learning_rate": 1.4652712272436752e-05, "loss": 11.6719, "step": 16841 }, { "epoch": 44.482007263123144, "grad_norm": 5751.3251953125, "learning_rate": 1.4638623405857394e-05, "loss": 32.6771, "step": 16842 }, { "epoch": 44.48464839881149, "grad_norm": 635.369384765625, "learning_rate": 1.4624541111606272e-05, "loss": 35.2464, "step": 16843 }, { "epoch": 44.48728953449984, "grad_norm": 698.6439208984375, "learning_rate": 1.4610465390076544e-05, "loss": 33.9499, "step": 16844 }, { "epoch": 44.48993067018818, "grad_norm": 2112.60888671875, "learning_rate": 1.4596396241661258e-05, "loss": 35.5322, "step": 16845 }, { "epoch": 44.49257180587653, "grad_norm": 695.7686767578125, "learning_rate": 1.458233366675335e-05, "loss": 33.6793, "step": 16846 }, { "epoch": 44.49521294156487, "grad_norm": 1767.4637451171875, "learning_rate": 1.4568277665745477e-05, "loss": 33.7175, "step": 16847 }, { "epoch": 44.497854077253216, "grad_norm": 1602.1812744140625, "learning_rate": 1.4554228239030187e-05, "loss": 34.1089, "step": 16848 }, { "epoch": 44.500495212941566, "grad_norm": 1013.63720703125, "learning_rate": 1.4540185386999693e-05, "loss": 33.8791, "step": 16849 }, { "epoch": 44.50313634862991, "grad_norm": 2153.95556640625, "learning_rate": 1.4526149110046267e-05, "loss": 35.2308, "step": 16850 }, { "epoch": 44.50577748431826, "grad_norm": 899.0536499023438, "learning_rate": 1.4512119408561787e-05, "loss": 33.6957, "step": 16851 }, { "epoch": 44.5084186200066, "grad_norm": 1224.5855712890625, "learning_rate": 1.4498096282938022e-05, "loss": 34.2323, "step": 16852 }, { "epoch": 44.51105975569495, "grad_norm": 1076.7286376953125, "learning_rate": 1.4484079733566607e-05, "loss": 35.0608, "step": 16853 }, { "epoch": 44.513700891383294, "grad_norm": 2297.10009765625, "learning_rate": 1.4470069760838894e-05, "loss": 34.2568, "step": 16854 }, { "epoch": 44.516342027071644, "grad_norm": 780.6688842773438, "learning_rate": 1.4456066365146153e-05, "loss": 34.8673, "step": 16855 }, { "epoch": 44.51898316275999, "grad_norm": 1333.4888916015625, "learning_rate": 1.4442069546879321e-05, "loss": 33.3259, "step": 16856 }, { "epoch": 44.52162429844833, "grad_norm": 1714.96630859375, "learning_rate": 1.4428079306429392e-05, "loss": 34.2672, "step": 16857 }, { "epoch": 44.52426543413668, "grad_norm": 2915.913330078125, "learning_rate": 1.4414095644186942e-05, "loss": 34.4009, "step": 16858 }, { "epoch": 44.52690656982502, "grad_norm": 38326.80859375, "learning_rate": 1.4400118560542463e-05, "loss": 36.9337, "step": 16859 }, { "epoch": 44.52954770551337, "grad_norm": 834.401123046875, "learning_rate": 1.4386148055886338e-05, "loss": 42.0656, "step": 16860 }, { "epoch": 44.532188841201716, "grad_norm": 1271.046630859375, "learning_rate": 1.437218413060859e-05, "loss": 38.1856, "step": 16861 }, { "epoch": 44.534829976890066, "grad_norm": 826.1083374023438, "learning_rate": 1.4358226785099183e-05, "loss": 37.7045, "step": 16862 }, { "epoch": 44.53747111257841, "grad_norm": 1455.4271240234375, "learning_rate": 1.4344276019747831e-05, "loss": 38.2191, "step": 16863 }, { "epoch": 44.54011224826675, "grad_norm": 779.2139282226562, "learning_rate": 1.4330331834944171e-05, "loss": 40.3194, "step": 16864 }, { "epoch": 44.5427533839551, "grad_norm": 688.236328125, "learning_rate": 1.4316394231077557e-05, "loss": 42.0056, "step": 16865 }, { "epoch": 44.545394519643445, "grad_norm": 513.4570922851562, "learning_rate": 1.4302463208537148e-05, "loss": 43.3952, "step": 16866 }, { "epoch": 44.548035655331795, "grad_norm": 1224.4716796875, "learning_rate": 1.428853876771205e-05, "loss": 41.7502, "step": 16867 }, { "epoch": 44.55067679102014, "grad_norm": 947.4041748046875, "learning_rate": 1.4274620908991009e-05, "loss": 38.5601, "step": 16868 }, { "epoch": 44.55331792670849, "grad_norm": 1628.6932373046875, "learning_rate": 1.4260709632762741e-05, "loss": 39.0512, "step": 16869 }, { "epoch": 44.55595906239683, "grad_norm": 979.3372802734375, "learning_rate": 1.4246804939415658e-05, "loss": 39.1521, "step": 16870 }, { "epoch": 44.55860019808517, "grad_norm": 563.3203735351562, "learning_rate": 1.423290682933806e-05, "loss": 39.4274, "step": 16871 }, { "epoch": 44.56124133377352, "grad_norm": 800.6558837890625, "learning_rate": 1.4219015302918054e-05, "loss": 38.9216, "step": 16872 }, { "epoch": 44.563882469461866, "grad_norm": 813.8302612304688, "learning_rate": 1.4205130360543495e-05, "loss": 36.8057, "step": 16873 }, { "epoch": 44.566523605150216, "grad_norm": 1671.71875, "learning_rate": 1.4191252002602185e-05, "loss": 35.3848, "step": 16874 }, { "epoch": 44.56916474083856, "grad_norm": 816.4888916015625, "learning_rate": 1.4177380229481645e-05, "loss": 36.1795, "step": 16875 }, { "epoch": 44.57180587652691, "grad_norm": 798.1346435546875, "learning_rate": 1.4163515041569236e-05, "loss": 35.4101, "step": 16876 }, { "epoch": 44.57444701221525, "grad_norm": 17262.486328125, "learning_rate": 1.4149656439252117e-05, "loss": 35.6793, "step": 16877 }, { "epoch": 44.577088147903595, "grad_norm": 880.1648559570312, "learning_rate": 1.4135804422917342e-05, "loss": 34.6072, "step": 16878 }, { "epoch": 44.579729283591945, "grad_norm": 960.4376831054688, "learning_rate": 1.4121958992951628e-05, "loss": 34.7558, "step": 16879 }, { "epoch": 44.58237041928029, "grad_norm": 945.842529296875, "learning_rate": 1.4108120149741638e-05, "loss": 34.1499, "step": 16880 }, { "epoch": 44.58501155496864, "grad_norm": 725.4118041992188, "learning_rate": 1.409428789367384e-05, "loss": 34.5127, "step": 16881 }, { "epoch": 44.58765269065698, "grad_norm": 18617.4296875, "learning_rate": 1.4080462225134482e-05, "loss": 35.5418, "step": 16882 }, { "epoch": 44.59029382634533, "grad_norm": 411.97967529296875, "learning_rate": 1.4066643144509589e-05, "loss": 8.2906, "step": 16883 }, { "epoch": 44.592934962033674, "grad_norm": 866.7672729492188, "learning_rate": 1.4052830652185128e-05, "loss": 13.2453, "step": 16884 }, { "epoch": 44.595576097722024, "grad_norm": 7178.478515625, "learning_rate": 1.4039024748546791e-05, "loss": 10.001, "step": 16885 }, { "epoch": 44.59821723341037, "grad_norm": 1484.4739990234375, "learning_rate": 1.4025225433980049e-05, "loss": 12.793, "step": 16886 }, { "epoch": 44.60085836909871, "grad_norm": 3947.64453125, "learning_rate": 1.4011432708870286e-05, "loss": 13.7171, "step": 16887 }, { "epoch": 44.60349950478706, "grad_norm": 4337.37548828125, "learning_rate": 1.3997646573602613e-05, "loss": 8.751, "step": 16888 }, { "epoch": 44.6061406404754, "grad_norm": 3290.95556640625, "learning_rate": 1.3983867028562052e-05, "loss": 13.142, "step": 16889 }, { "epoch": 44.60878177616375, "grad_norm": 1969.08935546875, "learning_rate": 1.39700940741333e-05, "loss": 9.9269, "step": 16890 }, { "epoch": 44.611422911852095, "grad_norm": 1242.7618408203125, "learning_rate": 1.3956327710701073e-05, "loss": 7.735, "step": 16891 }, { "epoch": 44.614064047540445, "grad_norm": 1781.519775390625, "learning_rate": 1.394256793864973e-05, "loss": 30.3067, "step": 16892 }, { "epoch": 44.61670518322879, "grad_norm": 714.00146484375, "learning_rate": 1.3928814758363495e-05, "loss": 35.2607, "step": 16893 }, { "epoch": 44.61934631891713, "grad_norm": 1518.5771484375, "learning_rate": 1.391506817022642e-05, "loss": 34.3575, "step": 16894 }, { "epoch": 44.62198745460548, "grad_norm": 1480.8670654296875, "learning_rate": 1.3901328174622446e-05, "loss": 34.7713, "step": 16895 }, { "epoch": 44.624628590293824, "grad_norm": 1514.96728515625, "learning_rate": 1.3887594771935158e-05, "loss": 33.9529, "step": 16896 }, { "epoch": 44.627269725982174, "grad_norm": 832.75927734375, "learning_rate": 1.3873867962548054e-05, "loss": 33.9152, "step": 16897 }, { "epoch": 44.62991086167052, "grad_norm": 1491.762939453125, "learning_rate": 1.3860147746844493e-05, "loss": 33.848, "step": 16898 }, { "epoch": 44.63255199735887, "grad_norm": 1156.1397705078125, "learning_rate": 1.3846434125207614e-05, "loss": 33.5919, "step": 16899 }, { "epoch": 44.63519313304721, "grad_norm": 2199.999755859375, "learning_rate": 1.3832727098020331e-05, "loss": 34.1029, "step": 16900 }, { "epoch": 44.63783426873556, "grad_norm": 1050.4580078125, "learning_rate": 1.3819026665665369e-05, "loss": 35.2545, "step": 16901 }, { "epoch": 44.6404754044239, "grad_norm": 1499.06494140625, "learning_rate": 1.3805332828525392e-05, "loss": 33.7894, "step": 16902 }, { "epoch": 44.643116540112246, "grad_norm": 1612.8885498046875, "learning_rate": 1.3791645586982731e-05, "loss": 34.5624, "step": 16903 }, { "epoch": 44.645757675800596, "grad_norm": 1649.6915283203125, "learning_rate": 1.3777964941419613e-05, "loss": 33.3113, "step": 16904 }, { "epoch": 44.64839881148894, "grad_norm": 1014.552978515625, "learning_rate": 1.376429089221809e-05, "loss": 33.4563, "step": 16905 }, { "epoch": 44.65103994717729, "grad_norm": 1056.3731689453125, "learning_rate": 1.3750623439759941e-05, "loss": 34.597, "step": 16906 }, { "epoch": 44.65368108286563, "grad_norm": 1315.4112548828125, "learning_rate": 1.373696258442686e-05, "loss": 35.1908, "step": 16907 }, { "epoch": 44.65632221855398, "grad_norm": 4217.7529296875, "learning_rate": 1.3723308326600265e-05, "loss": 36.8831, "step": 16908 }, { "epoch": 44.658963354242324, "grad_norm": 2389.79150390625, "learning_rate": 1.3709660666661544e-05, "loss": 38.8682, "step": 16909 }, { "epoch": 44.66160448993067, "grad_norm": 2198.473876953125, "learning_rate": 1.3696019604991727e-05, "loss": 39.6379, "step": 16910 }, { "epoch": 44.66424562561902, "grad_norm": 389.0918884277344, "learning_rate": 1.3682385141971731e-05, "loss": 38.1856, "step": 16911 }, { "epoch": 44.66688676130736, "grad_norm": 716.7162475585938, "learning_rate": 1.3668757277982335e-05, "loss": 39.159, "step": 16912 }, { "epoch": 44.66952789699571, "grad_norm": 2029.73095703125, "learning_rate": 1.3655136013404096e-05, "loss": 39.1692, "step": 16913 }, { "epoch": 44.67216903268405, "grad_norm": 1979.831298828125, "learning_rate": 1.364152134861732e-05, "loss": 41.3108, "step": 16914 }, { "epoch": 44.6748101683724, "grad_norm": 1008.4973754882812, "learning_rate": 1.3627913284002174e-05, "loss": 39.0018, "step": 16915 }, { "epoch": 44.677451304060746, "grad_norm": 609.0641479492188, "learning_rate": 1.3614311819938746e-05, "loss": 43.3022, "step": 16916 }, { "epoch": 44.68009243974909, "grad_norm": 658.051025390625, "learning_rate": 1.3600716956806786e-05, "loss": 41.6181, "step": 16917 }, { "epoch": 44.68273357543744, "grad_norm": 2803.54443359375, "learning_rate": 1.3587128694985906e-05, "loss": 39.6421, "step": 16918 }, { "epoch": 44.68537471112578, "grad_norm": 920.9937744140625, "learning_rate": 1.357354703485561e-05, "loss": 38.7279, "step": 16919 }, { "epoch": 44.68801584681413, "grad_norm": 563.5660400390625, "learning_rate": 1.3559971976795149e-05, "loss": 37.2335, "step": 16920 }, { "epoch": 44.690656982502475, "grad_norm": 702.2224731445312, "learning_rate": 1.3546403521183553e-05, "loss": 36.6802, "step": 16921 }, { "epoch": 44.693298118190825, "grad_norm": 849.1128540039062, "learning_rate": 1.3532841668399742e-05, "loss": 36.4926, "step": 16922 }, { "epoch": 44.69593925387917, "grad_norm": 813.2462768554688, "learning_rate": 1.351928641882244e-05, "loss": 36.004, "step": 16923 }, { "epoch": 44.69858038956751, "grad_norm": 1463.3658447265625, "learning_rate": 1.3505737772830123e-05, "loss": 34.7021, "step": 16924 }, { "epoch": 44.70122152525586, "grad_norm": 1045.7078857421875, "learning_rate": 1.3492195730801127e-05, "loss": 34.804, "step": 16925 }, { "epoch": 44.7038626609442, "grad_norm": 552.1766967773438, "learning_rate": 1.3478660293113675e-05, "loss": 35.7347, "step": 16926 }, { "epoch": 44.70650379663255, "grad_norm": 5938.69140625, "learning_rate": 1.346513146014569e-05, "loss": 35.0154, "step": 16927 }, { "epoch": 44.709144932320896, "grad_norm": 843.6589965820312, "learning_rate": 1.3451609232274953e-05, "loss": 34.664, "step": 16928 }, { "epoch": 44.711786068009246, "grad_norm": 889.6596069335938, "learning_rate": 1.3438093609879048e-05, "loss": 34.6223, "step": 16929 }, { "epoch": 44.71442720369759, "grad_norm": 2032.989501953125, "learning_rate": 1.3424584593335481e-05, "loss": 34.6701, "step": 16930 }, { "epoch": 44.71706833938594, "grad_norm": 4268.44873046875, "learning_rate": 1.3411082183021394e-05, "loss": 36.5756, "step": 16931 }, { "epoch": 44.71970947507428, "grad_norm": 7775.88525390625, "learning_rate": 1.3397586379313792e-05, "loss": 14.6591, "step": 16932 }, { "epoch": 44.722350610762625, "grad_norm": 986.9056396484375, "learning_rate": 1.3384097182589678e-05, "loss": 7.7422, "step": 16933 }, { "epoch": 44.724991746450975, "grad_norm": 1245.761474609375, "learning_rate": 1.3370614593225639e-05, "loss": 15.9527, "step": 16934 }, { "epoch": 44.72763288213932, "grad_norm": 1926.5003662109375, "learning_rate": 1.335713861159818e-05, "loss": 13.3458, "step": 16935 }, { "epoch": 44.73027401782767, "grad_norm": 6971.5810546875, "learning_rate": 1.3343669238083556e-05, "loss": 11.9884, "step": 16936 }, { "epoch": 44.73291515351601, "grad_norm": 1323.765625, "learning_rate": 1.3330206473058021e-05, "loss": 9.3346, "step": 16937 }, { "epoch": 44.73555628920436, "grad_norm": 3166.3681640625, "learning_rate": 1.3316750316897414e-05, "loss": 11.4659, "step": 16938 }, { "epoch": 44.738197424892704, "grad_norm": 944.3471069335938, "learning_rate": 1.3303300769977544e-05, "loss": 6.1373, "step": 16939 }, { "epoch": 44.74083856058105, "grad_norm": 4482.9443359375, "learning_rate": 1.3289857832673947e-05, "loss": 13.6773, "step": 16940 }, { "epoch": 44.7434796962694, "grad_norm": 2126.00634765625, "learning_rate": 1.3276421505362013e-05, "loss": 24.2797, "step": 16941 }, { "epoch": 44.74612083195774, "grad_norm": 1077.5574951171875, "learning_rate": 1.3262991788416945e-05, "loss": 35.7691, "step": 16942 }, { "epoch": 44.74876196764609, "grad_norm": 1638.284912109375, "learning_rate": 1.3249568682213748e-05, "loss": 34.945, "step": 16943 }, { "epoch": 44.75140310333443, "grad_norm": 576.9864501953125, "learning_rate": 1.3236152187127287e-05, "loss": 34.9645, "step": 16944 }, { "epoch": 44.75404423902278, "grad_norm": 1077.46337890625, "learning_rate": 1.322274230353221e-05, "loss": 35.137, "step": 16945 }, { "epoch": 44.756685374711125, "grad_norm": 1037.94140625, "learning_rate": 1.3209339031802908e-05, "loss": 35.4214, "step": 16946 }, { "epoch": 44.759326510399475, "grad_norm": 1403.464599609375, "learning_rate": 1.319594237231378e-05, "loss": 33.8576, "step": 16947 }, { "epoch": 44.76196764608782, "grad_norm": 5026.794921875, "learning_rate": 1.3182552325438857e-05, "loss": 34.3263, "step": 16948 }, { "epoch": 44.76460878177616, "grad_norm": 1048.5462646484375, "learning_rate": 1.3169168891552035e-05, "loss": 34.3906, "step": 16949 }, { "epoch": 44.76724991746451, "grad_norm": 1453.052001953125, "learning_rate": 1.3155792071027018e-05, "loss": 33.5189, "step": 16950 }, { "epoch": 44.769891053152854, "grad_norm": 2817.84619140625, "learning_rate": 1.3142421864237391e-05, "loss": 35.3103, "step": 16951 }, { "epoch": 44.772532188841204, "grad_norm": 1458.77880859375, "learning_rate": 1.3129058271556526e-05, "loss": 34.5634, "step": 16952 }, { "epoch": 44.77517332452955, "grad_norm": 1290.13037109375, "learning_rate": 1.3115701293357484e-05, "loss": 34.6911, "step": 16953 }, { "epoch": 44.7778144602179, "grad_norm": 1088.8414306640625, "learning_rate": 1.3102350930013413e-05, "loss": 35.4146, "step": 16954 }, { "epoch": 44.78045559590624, "grad_norm": 1734.64306640625, "learning_rate": 1.308900718189701e-05, "loss": 33.3558, "step": 16955 }, { "epoch": 44.78309673159458, "grad_norm": 761.8157348632812, "learning_rate": 1.30756700493809e-05, "loss": 35.2852, "step": 16956 }, { "epoch": 44.78573786728293, "grad_norm": 954.532470703125, "learning_rate": 1.3062339532837531e-05, "loss": 33.7942, "step": 16957 }, { "epoch": 44.788379002971276, "grad_norm": 1449.8072509765625, "learning_rate": 1.304901563263916e-05, "loss": 36.2095, "step": 16958 }, { "epoch": 44.791020138659626, "grad_norm": 5354.67529296875, "learning_rate": 1.3035698349157827e-05, "loss": 37.7813, "step": 16959 }, { "epoch": 44.79366127434797, "grad_norm": 1936.613525390625, "learning_rate": 1.3022387682765397e-05, "loss": 40.4137, "step": 16960 }, { "epoch": 44.79630241003632, "grad_norm": 914.9938354492188, "learning_rate": 1.3009083633833601e-05, "loss": 38.6634, "step": 16961 }, { "epoch": 44.79894354572466, "grad_norm": 666.81787109375, "learning_rate": 1.2995786202733923e-05, "loss": 38.288, "step": 16962 }, { "epoch": 44.801584681413004, "grad_norm": 1334.7596435546875, "learning_rate": 1.2982495389837701e-05, "loss": 38.6183, "step": 16963 }, { "epoch": 44.804225817101354, "grad_norm": 1407.379150390625, "learning_rate": 1.2969211195516029e-05, "loss": 41.4781, "step": 16964 }, { "epoch": 44.8068669527897, "grad_norm": 452.1187438964844, "learning_rate": 1.2955933620139915e-05, "loss": 39.9852, "step": 16965 }, { "epoch": 44.80950808847805, "grad_norm": 709.3836669921875, "learning_rate": 1.2942662664080173e-05, "loss": 43.4586, "step": 16966 }, { "epoch": 44.81214922416639, "grad_norm": 895.9000854492188, "learning_rate": 1.2929398327707232e-05, "loss": 40.4102, "step": 16967 }, { "epoch": 44.81479035985474, "grad_norm": 904.9974365234375, "learning_rate": 1.2916140611391598e-05, "loss": 40.4483, "step": 16968 }, { "epoch": 44.81743149554308, "grad_norm": 753.3121948242188, "learning_rate": 1.2902889515503503e-05, "loss": 38.3889, "step": 16969 }, { "epoch": 44.820072631231426, "grad_norm": 1149.9847412109375, "learning_rate": 1.2889645040412929e-05, "loss": 39.5328, "step": 16970 }, { "epoch": 44.822713766919776, "grad_norm": 893.2176513671875, "learning_rate": 1.2876407186489663e-05, "loss": 39.5891, "step": 16971 }, { "epoch": 44.82535490260812, "grad_norm": 1162.508056640625, "learning_rate": 1.2863175954103496e-05, "loss": 36.7486, "step": 16972 }, { "epoch": 44.82799603829647, "grad_norm": 1044.52734375, "learning_rate": 1.284995134362385e-05, "loss": 36.313, "step": 16973 }, { "epoch": 44.83063717398481, "grad_norm": 383.0960693359375, "learning_rate": 1.2836733355419989e-05, "loss": 36.7581, "step": 16974 }, { "epoch": 44.83327830967316, "grad_norm": 1025.5062255859375, "learning_rate": 1.2823521989861031e-05, "loss": 34.7928, "step": 16975 }, { "epoch": 44.835919445361505, "grad_norm": 710.752197265625, "learning_rate": 1.2810317247315906e-05, "loss": 36.2032, "step": 16976 }, { "epoch": 44.838560581049855, "grad_norm": 1170.359130859375, "learning_rate": 1.2797119128153317e-05, "loss": 34.835, "step": 16977 }, { "epoch": 44.8412017167382, "grad_norm": 1292.637451171875, "learning_rate": 1.2783927632741831e-05, "loss": 34.1008, "step": 16978 }, { "epoch": 44.84384285242654, "grad_norm": 1494.8714599609375, "learning_rate": 1.2770742761449822e-05, "loss": 35.0895, "step": 16979 }, { "epoch": 44.84648398811489, "grad_norm": 928.9723510742188, "learning_rate": 1.2757564514645492e-05, "loss": 34.0659, "step": 16980 }, { "epoch": 44.84912512380323, "grad_norm": 685.0453491210938, "learning_rate": 1.2744392892696743e-05, "loss": 35.5641, "step": 16981 }, { "epoch": 44.85176625949158, "grad_norm": 6023.064453125, "learning_rate": 1.2731227895971531e-05, "loss": 35.0876, "step": 16982 }, { "epoch": 44.854407395179926, "grad_norm": 1483.9166259765625, "learning_rate": 1.2718069524837394e-05, "loss": 36.3764, "step": 16983 }, { "epoch": 44.857048530868276, "grad_norm": 645.240478515625, "learning_rate": 1.2704917779661762e-05, "loss": 12.4842, "step": 16984 }, { "epoch": 44.85968966655662, "grad_norm": 1084.46484375, "learning_rate": 1.2691772660811867e-05, "loss": 10.8751, "step": 16985 }, { "epoch": 44.86233080224496, "grad_norm": 60628.75, "learning_rate": 1.2678634168654862e-05, "loss": 8.889, "step": 16986 }, { "epoch": 44.86497193793331, "grad_norm": 687.0867309570312, "learning_rate": 1.266550230355759e-05, "loss": 11.1497, "step": 16987 }, { "epoch": 44.867613073621655, "grad_norm": 7979.00439453125, "learning_rate": 1.2652377065886705e-05, "loss": 15.8038, "step": 16988 }, { "epoch": 44.870254209310005, "grad_norm": 803.4586791992188, "learning_rate": 1.2639258456008774e-05, "loss": 9.6575, "step": 16989 }, { "epoch": 44.87289534499835, "grad_norm": 906.0362548828125, "learning_rate": 1.2626146474290168e-05, "loss": 8.0567, "step": 16990 }, { "epoch": 44.8755364806867, "grad_norm": 671.5565185546875, "learning_rate": 1.2613041121096929e-05, "loss": 11.9178, "step": 16991 }, { "epoch": 44.87817761637504, "grad_norm": 942.5465087890625, "learning_rate": 1.2599942396795099e-05, "loss": 23.6423, "step": 16992 }, { "epoch": 44.88081875206339, "grad_norm": 1834.4967041015625, "learning_rate": 1.2586850301750413e-05, "loss": 34.9159, "step": 16993 }, { "epoch": 44.883459887751734, "grad_norm": 1772.32373046875, "learning_rate": 1.2573764836328493e-05, "loss": 33.5371, "step": 16994 }, { "epoch": 44.88610102344008, "grad_norm": 791.159423828125, "learning_rate": 1.2560686000894662e-05, "loss": 36.0839, "step": 16995 }, { "epoch": 44.88874215912843, "grad_norm": 816.5674438476562, "learning_rate": 1.2547613795814233e-05, "loss": 32.9722, "step": 16996 }, { "epoch": 44.89138329481677, "grad_norm": 2846.54052734375, "learning_rate": 1.2534548221452197e-05, "loss": 35.198, "step": 16997 }, { "epoch": 44.89402443050512, "grad_norm": 835.4585571289062, "learning_rate": 1.2521489278173398e-05, "loss": 34.8909, "step": 16998 }, { "epoch": 44.89666556619346, "grad_norm": 1038.1937255859375, "learning_rate": 1.250843696634249e-05, "loss": 33.3997, "step": 16999 }, { "epoch": 44.89930670188181, "grad_norm": 856.5112915039062, "learning_rate": 1.2495391286323986e-05, "loss": 34.665, "step": 17000 }, { "epoch": 44.89930670188181, "eval_loss": 3.8025693893432617, "eval_runtime": 2.1239, "eval_samples_per_second": 233.063, "eval_steps_per_second": 29.192, "step": 17000 }, { "epoch": 44.901947837570155, "grad_norm": 1538.806640625, "learning_rate": 1.2482352238482209e-05, "loss": 33.4583, "step": 17001 }, { "epoch": 44.9045889732585, "grad_norm": 9530.2373046875, "learning_rate": 1.2469319823181168e-05, "loss": 33.997, "step": 17002 }, { "epoch": 44.90723010894685, "grad_norm": 1274.0372314453125, "learning_rate": 1.2456294040784855e-05, "loss": 34.3827, "step": 17003 }, { "epoch": 44.90987124463519, "grad_norm": 618.954833984375, "learning_rate": 1.2443274891656974e-05, "loss": 34.3643, "step": 17004 }, { "epoch": 44.91251238032354, "grad_norm": 1913.316650390625, "learning_rate": 1.2430262376161128e-05, "loss": 34.7588, "step": 17005 }, { "epoch": 44.915153516011884, "grad_norm": 3386.43017578125, "learning_rate": 1.2417256494660606e-05, "loss": 34.1171, "step": 17006 }, { "epoch": 44.917794651700234, "grad_norm": 973.2444458007812, "learning_rate": 1.2404257247518646e-05, "loss": 34.7631, "step": 17007 }, { "epoch": 44.92043578738858, "grad_norm": 1578.90869140625, "learning_rate": 1.2391264635098237e-05, "loss": 34.2836, "step": 17008 }, { "epoch": 44.92307692307692, "grad_norm": 1480.4886474609375, "learning_rate": 1.2378278657762198e-05, "loss": 35.9889, "step": 17009 }, { "epoch": 44.92571805876527, "grad_norm": 1015.9917602539062, "learning_rate": 1.236529931587313e-05, "loss": 40.0242, "step": 17010 }, { "epoch": 44.92835919445361, "grad_norm": 553.3646850585938, "learning_rate": 1.2352326609793491e-05, "loss": 38.4663, "step": 17011 }, { "epoch": 44.93100033014196, "grad_norm": 816.3750610351562, "learning_rate": 1.2339360539885492e-05, "loss": 41.1997, "step": 17012 }, { "epoch": 44.933641465830306, "grad_norm": 889.3187255859375, "learning_rate": 1.2326401106511287e-05, "loss": 40.5593, "step": 17013 }, { "epoch": 44.936282601518656, "grad_norm": 872.5919189453125, "learning_rate": 1.23134483100327e-05, "loss": 39.5078, "step": 17014 }, { "epoch": 44.938923737207, "grad_norm": 765.8436279296875, "learning_rate": 1.2300502150811439e-05, "loss": 40.7698, "step": 17015 }, { "epoch": 44.94156487289534, "grad_norm": 538.8427124023438, "learning_rate": 1.2287562629208992e-05, "loss": 37.3464, "step": 17016 }, { "epoch": 44.94420600858369, "grad_norm": 2638.744873046875, "learning_rate": 1.2274629745586768e-05, "loss": 38.5017, "step": 17017 }, { "epoch": 44.946847144272034, "grad_norm": 932.5127563476562, "learning_rate": 1.2261703500305894e-05, "loss": 34.8746, "step": 17018 }, { "epoch": 44.949488279960384, "grad_norm": 1800.0709228515625, "learning_rate": 1.2248783893727216e-05, "loss": 34.7745, "step": 17019 }, { "epoch": 44.95212941564873, "grad_norm": 1114.499755859375, "learning_rate": 1.2235870926211617e-05, "loss": 34.3615, "step": 17020 }, { "epoch": 44.95477055133708, "grad_norm": 696.982421875, "learning_rate": 1.2222964598119667e-05, "loss": 35.241, "step": 17021 }, { "epoch": 44.95741168702542, "grad_norm": 849.6273803710938, "learning_rate": 1.221006490981172e-05, "loss": 34.9419, "step": 17022 }, { "epoch": 44.96005282271377, "grad_norm": 3724.50634765625, "learning_rate": 1.219717186164801e-05, "loss": 12.6284, "step": 17023 }, { "epoch": 44.96269395840211, "grad_norm": 923.3837280273438, "learning_rate": 1.2184285453988642e-05, "loss": 7.3948, "step": 17024 }, { "epoch": 44.965335094090456, "grad_norm": 2074.560546875, "learning_rate": 1.2171405687193382e-05, "loss": 7.7468, "step": 17025 }, { "epoch": 44.967976229778806, "grad_norm": 1288.3037109375, "learning_rate": 1.2158532561621889e-05, "loss": 8.703, "step": 17026 }, { "epoch": 44.97061736546715, "grad_norm": 436.7805480957031, "learning_rate": 1.2145666077633676e-05, "loss": 9.2046, "step": 17027 }, { "epoch": 44.9732585011555, "grad_norm": 707.0341186523438, "learning_rate": 1.2132806235588018e-05, "loss": 23.567, "step": 17028 }, { "epoch": 44.97589963684384, "grad_norm": 693.227783203125, "learning_rate": 1.211995303584401e-05, "loss": 36.357, "step": 17029 }, { "epoch": 44.97854077253219, "grad_norm": 1050.6904296875, "learning_rate": 1.2107106478760565e-05, "loss": 34.281, "step": 17030 }, { "epoch": 44.981181908220535, "grad_norm": 679.7249145507812, "learning_rate": 1.2094266564696421e-05, "loss": 35.5577, "step": 17031 }, { "epoch": 44.98382304390888, "grad_norm": 929.67041015625, "learning_rate": 1.2081433294010152e-05, "loss": 34.9534, "step": 17032 }, { "epoch": 44.98646417959723, "grad_norm": 885.327392578125, "learning_rate": 1.2068606667060055e-05, "loss": 34.1988, "step": 17033 }, { "epoch": 44.98910531528557, "grad_norm": 1654.8720703125, "learning_rate": 1.2055786684204401e-05, "loss": 34.4561, "step": 17034 }, { "epoch": 44.99174645097392, "grad_norm": 2832.61083984375, "learning_rate": 1.2042973345801122e-05, "loss": 34.6306, "step": 17035 }, { "epoch": 44.99438758666226, "grad_norm": 944.8486328125, "learning_rate": 1.2030166652208047e-05, "loss": 33.8424, "step": 17036 }, { "epoch": 44.99702872235061, "grad_norm": 1264.7130126953125, "learning_rate": 1.201736660378272e-05, "loss": 34.5357, "step": 17037 }, { "epoch": 44.999669858038956, "grad_norm": 976.76220703125, "learning_rate": 1.2004573200882662e-05, "loss": 38.1199, "step": 17038 }, { "epoch": 45.002310993727306, "grad_norm": 932.7182006835938, "learning_rate": 1.1991786443865088e-05, "loss": 38.4644, "step": 17039 }, { "epoch": 45.00495212941565, "grad_norm": 807.5425415039062, "learning_rate": 1.1979006333087044e-05, "loss": 39.818, "step": 17040 }, { "epoch": 45.00759326510399, "grad_norm": 1163.8243408203125, "learning_rate": 1.1966232868905442e-05, "loss": 38.6957, "step": 17041 }, { "epoch": 45.01023440079234, "grad_norm": 817.59326171875, "learning_rate": 1.1953466051676964e-05, "loss": 38.5336, "step": 17042 }, { "epoch": 45.012875536480685, "grad_norm": 1039.9525146484375, "learning_rate": 1.1940705881758108e-05, "loss": 42.7364, "step": 17043 }, { "epoch": 45.015516672169035, "grad_norm": 651.0551147460938, "learning_rate": 1.1927952359505223e-05, "loss": 39.3902, "step": 17044 }, { "epoch": 45.01815780785738, "grad_norm": 2932.861328125, "learning_rate": 1.1915205485274389e-05, "loss": 43.5653, "step": 17045 }, { "epoch": 45.02079894354573, "grad_norm": 617.466796875, "learning_rate": 1.1902465259421597e-05, "loss": 39.9514, "step": 17046 }, { "epoch": 45.02344007923407, "grad_norm": 627.9578857421875, "learning_rate": 1.1889731682302534e-05, "loss": 38.3927, "step": 17047 }, { "epoch": 45.026081214922414, "grad_norm": 882.1304931640625, "learning_rate": 1.1877004754272918e-05, "loss": 38.8713, "step": 17048 }, { "epoch": 45.028722350610764, "grad_norm": 1658.8780517578125, "learning_rate": 1.1864284475688047e-05, "loss": 39.6886, "step": 17049 }, { "epoch": 45.03136348629911, "grad_norm": 1135.7802734375, "learning_rate": 1.1851570846903138e-05, "loss": 38.2815, "step": 17050 }, { "epoch": 45.03400462198746, "grad_norm": 1272.7076416015625, "learning_rate": 1.1838863868273181e-05, "loss": 36.8213, "step": 17051 }, { "epoch": 45.0366457576758, "grad_norm": 1587.3988037109375, "learning_rate": 1.1826163540153062e-05, "loss": 35.9554, "step": 17052 }, { "epoch": 45.03928689336415, "grad_norm": 1295.435791015625, "learning_rate": 1.1813469862897496e-05, "loss": 35.0062, "step": 17053 }, { "epoch": 45.04192802905249, "grad_norm": 970.8302001953125, "learning_rate": 1.1800782836860757e-05, "loss": 35.0083, "step": 17054 }, { "epoch": 45.044569164740835, "grad_norm": 5624.0908203125, "learning_rate": 1.1788102462397282e-05, "loss": 34.797, "step": 17055 }, { "epoch": 45.047210300429185, "grad_norm": 1355.7794189453125, "learning_rate": 1.1775428739861122e-05, "loss": 36.0919, "step": 17056 }, { "epoch": 45.04985143611753, "grad_norm": 845.422607421875, "learning_rate": 1.1762761669606159e-05, "loss": 35.1347, "step": 17057 }, { "epoch": 45.05249257180588, "grad_norm": 661.1739501953125, "learning_rate": 1.1750101251986112e-05, "loss": 35.2525, "step": 17058 }, { "epoch": 45.05513370749422, "grad_norm": 795.6740112304688, "learning_rate": 1.173744748735453e-05, "loss": 34.912, "step": 17059 }, { "epoch": 45.05777484318257, "grad_norm": 825.9431762695312, "learning_rate": 1.1724800376064798e-05, "loss": 34.7527, "step": 17060 }, { "epoch": 45.060415978870914, "grad_norm": 2206.164306640625, "learning_rate": 1.1712159918470023e-05, "loss": 38.2354, "step": 17061 }, { "epoch": 45.063057114559264, "grad_norm": 3870.279296875, "learning_rate": 1.1699526114923226e-05, "loss": 9.7305, "step": 17062 }, { "epoch": 45.06569825024761, "grad_norm": 745.0078735351562, "learning_rate": 1.1686898965777154e-05, "loss": 11.566, "step": 17063 }, { "epoch": 45.06833938593595, "grad_norm": 1090.4814453125, "learning_rate": 1.167427847138447e-05, "loss": 14.0257, "step": 17064 }, { "epoch": 45.0709805216243, "grad_norm": 3130.263916015625, "learning_rate": 1.1661664632097502e-05, "loss": 9.0415, "step": 17065 }, { "epoch": 45.07362165731264, "grad_norm": 460.8376159667969, "learning_rate": 1.164905744826858e-05, "loss": 9.0891, "step": 17066 }, { "epoch": 45.07626279300099, "grad_norm": 698.5809936523438, "learning_rate": 1.16364569202497e-05, "loss": 10.2, "step": 17067 }, { "epoch": 45.078903928689336, "grad_norm": 10791.18359375, "learning_rate": 1.162386304839272e-05, "loss": 13.2446, "step": 17068 }, { "epoch": 45.081545064377686, "grad_norm": 2101.491943359375, "learning_rate": 1.1611275833049357e-05, "loss": 9.8128, "step": 17069 }, { "epoch": 45.08418620006603, "grad_norm": 1804.92578125, "learning_rate": 1.159869527457108e-05, "loss": 11.0651, "step": 17070 }, { "epoch": 45.08682733575437, "grad_norm": 1279.12841796875, "learning_rate": 1.158612137330925e-05, "loss": 35.5406, "step": 17071 }, { "epoch": 45.08946847144272, "grad_norm": 1956.0692138671875, "learning_rate": 1.1573554129614833e-05, "loss": 34.2891, "step": 17072 }, { "epoch": 45.092109607131064, "grad_norm": 1121.7501220703125, "learning_rate": 1.156099354383891e-05, "loss": 33.2434, "step": 17073 }, { "epoch": 45.094750742819414, "grad_norm": 1312.1783447265625, "learning_rate": 1.1548439616332173e-05, "loss": 34.9605, "step": 17074 }, { "epoch": 45.09739187850776, "grad_norm": 2375.581298828125, "learning_rate": 1.1535892347445148e-05, "loss": 35.1608, "step": 17075 }, { "epoch": 45.10003301419611, "grad_norm": 2696.684326171875, "learning_rate": 1.1523351737528275e-05, "loss": 34.2874, "step": 17076 }, { "epoch": 45.10267414988445, "grad_norm": 1191.3201904296875, "learning_rate": 1.1510817786931721e-05, "loss": 35.6361, "step": 17077 }, { "epoch": 45.10531528557279, "grad_norm": 4565.5283203125, "learning_rate": 1.1498290496005481e-05, "loss": 34.8673, "step": 17078 }, { "epoch": 45.10795642126114, "grad_norm": 884.9564208984375, "learning_rate": 1.1485769865099389e-05, "loss": 34.8389, "step": 17079 }, { "epoch": 45.110597556949486, "grad_norm": 1807.4820556640625, "learning_rate": 1.147325589456305e-05, "loss": 34.3573, "step": 17080 }, { "epoch": 45.113238692637836, "grad_norm": 1123.5916748046875, "learning_rate": 1.1460748584745912e-05, "loss": 35.3421, "step": 17081 }, { "epoch": 45.11587982832618, "grad_norm": 537.5679321289062, "learning_rate": 1.1448247935997246e-05, "loss": 35.1325, "step": 17082 }, { "epoch": 45.11852096401453, "grad_norm": 870.80126953125, "learning_rate": 1.1435753948666111e-05, "loss": 33.4364, "step": 17083 }, { "epoch": 45.12116209970287, "grad_norm": 1148.1541748046875, "learning_rate": 1.1423266623101448e-05, "loss": 34.4932, "step": 17084 }, { "epoch": 45.12380323539122, "grad_norm": 729.3212280273438, "learning_rate": 1.1410785959651893e-05, "loss": 33.5859, "step": 17085 }, { "epoch": 45.126444371079565, "grad_norm": 2607.5400390625, "learning_rate": 1.1398311958665975e-05, "loss": 35.7236, "step": 17086 }, { "epoch": 45.12908550676791, "grad_norm": 2275.207763671875, "learning_rate": 1.1385844620492024e-05, "loss": 36.5352, "step": 17087 }, { "epoch": 45.13172664245626, "grad_norm": 2464.86962890625, "learning_rate": 1.1373383945478266e-05, "loss": 39.0224, "step": 17088 }, { "epoch": 45.1343677781446, "grad_norm": 706.0526123046875, "learning_rate": 1.1360929933972503e-05, "loss": 38.5193, "step": 17089 }, { "epoch": 45.13700891383295, "grad_norm": 1505.67724609375, "learning_rate": 1.1348482586322622e-05, "loss": 39.473, "step": 17090 }, { "epoch": 45.13965004952129, "grad_norm": 506.2986755371094, "learning_rate": 1.1336041902876181e-05, "loss": 37.9503, "step": 17091 }, { "epoch": 45.14229118520964, "grad_norm": 804.3892211914062, "learning_rate": 1.1323607883980542e-05, "loss": 39.992, "step": 17092 }, { "epoch": 45.144932320897986, "grad_norm": 888.4639892578125, "learning_rate": 1.1311180529982951e-05, "loss": 42.1388, "step": 17093 }, { "epoch": 45.14757345658633, "grad_norm": 762.7203979492188, "learning_rate": 1.1298759841230438e-05, "loss": 40.9793, "step": 17094 }, { "epoch": 45.15021459227468, "grad_norm": 1676.20556640625, "learning_rate": 1.1286345818069837e-05, "loss": 40.4705, "step": 17095 }, { "epoch": 45.15285572796302, "grad_norm": 1529.149169921875, "learning_rate": 1.1273938460847816e-05, "loss": 38.8359, "step": 17096 }, { "epoch": 45.15549686365137, "grad_norm": 561.4640502929688, "learning_rate": 1.1261537769910818e-05, "loss": 41.6341, "step": 17097 }, { "epoch": 45.158137999339715, "grad_norm": 897.5321655273438, "learning_rate": 1.1249143745605151e-05, "loss": 38.6472, "step": 17098 }, { "epoch": 45.160779135028065, "grad_norm": 670.1100463867188, "learning_rate": 1.1236756388276869e-05, "loss": 38.1681, "step": 17099 }, { "epoch": 45.16342027071641, "grad_norm": 699.31982421875, "learning_rate": 1.1224375698271894e-05, "loss": 35.7679, "step": 17100 }, { "epoch": 45.16606140640475, "grad_norm": 682.8534545898438, "learning_rate": 1.1212001675936002e-05, "loss": 35.9493, "step": 17101 }, { "epoch": 45.1687025420931, "grad_norm": 1004.4952392578125, "learning_rate": 1.1199634321614666e-05, "loss": 37.0614, "step": 17102 }, { "epoch": 45.171343677781444, "grad_norm": 707.8761596679688, "learning_rate": 1.118727363565325e-05, "loss": 34.7819, "step": 17103 }, { "epoch": 45.173984813469794, "grad_norm": 850.8389282226562, "learning_rate": 1.1174919618396951e-05, "loss": 35.7075, "step": 17104 }, { "epoch": 45.17662594915814, "grad_norm": 1235.846923828125, "learning_rate": 1.1162572270190741e-05, "loss": 34.8319, "step": 17105 }, { "epoch": 45.17926708484649, "grad_norm": 1201.6063232421875, "learning_rate": 1.1150231591379429e-05, "loss": 33.6197, "step": 17106 }, { "epoch": 45.18190822053483, "grad_norm": 883.1559448242188, "learning_rate": 1.1137897582307515e-05, "loss": 35.5598, "step": 17107 }, { "epoch": 45.18454935622318, "grad_norm": 341.1605224609375, "learning_rate": 1.1125570243319533e-05, "loss": 35.2885, "step": 17108 }, { "epoch": 45.18719049191152, "grad_norm": 895.5253295898438, "learning_rate": 1.1113249574759703e-05, "loss": 35.7353, "step": 17109 }, { "epoch": 45.189831627599865, "grad_norm": 1649.4483642578125, "learning_rate": 1.1100935576971977e-05, "loss": 35.9558, "step": 17110 }, { "epoch": 45.192472763288215, "grad_norm": 1154.8988037109375, "learning_rate": 1.1088628250300353e-05, "loss": 27.7691, "step": 17111 }, { "epoch": 45.19511389897656, "grad_norm": 1375.280029296875, "learning_rate": 1.107632759508842e-05, "loss": 10.2076, "step": 17112 }, { "epoch": 45.19775503466491, "grad_norm": 2507.978759765625, "learning_rate": 1.1064033611679708e-05, "loss": 12.9546, "step": 17113 }, { "epoch": 45.20039617035325, "grad_norm": 540.6541748046875, "learning_rate": 1.105174630041747e-05, "loss": 9.379, "step": 17114 }, { "epoch": 45.2030373060416, "grad_norm": 26810.041015625, "learning_rate": 1.1039465661644876e-05, "loss": 12.1051, "step": 17115 }, { "epoch": 45.205678441729944, "grad_norm": 702.0369262695312, "learning_rate": 1.102719169570482e-05, "loss": 8.1561, "step": 17116 }, { "epoch": 45.20831957741829, "grad_norm": 1968.83837890625, "learning_rate": 1.1014924402940025e-05, "loss": 10.1042, "step": 17117 }, { "epoch": 45.21096071310664, "grad_norm": 2575.63623046875, "learning_rate": 1.1002663783693134e-05, "loss": 9.5848, "step": 17118 }, { "epoch": 45.21360184879498, "grad_norm": 10685.50390625, "learning_rate": 1.0990409838306431e-05, "loss": 9.9942, "step": 17119 }, { "epoch": 45.21624298448333, "grad_norm": 2379.475830078125, "learning_rate": 1.097816256712214e-05, "loss": 17.6587, "step": 17120 }, { "epoch": 45.21888412017167, "grad_norm": 1244.25537109375, "learning_rate": 1.0965921970482213e-05, "loss": 34.8786, "step": 17121 }, { "epoch": 45.22152525586002, "grad_norm": 424.6705017089844, "learning_rate": 1.0953688048728539e-05, "loss": 35.5104, "step": 17122 }, { "epoch": 45.224166391548366, "grad_norm": 891.8865356445312, "learning_rate": 1.0941460802202735e-05, "loss": 33.3581, "step": 17123 }, { "epoch": 45.22680752723671, "grad_norm": 1031.7261962890625, "learning_rate": 1.092924023124614e-05, "loss": 35.6281, "step": 17124 }, { "epoch": 45.22944866292506, "grad_norm": 777.4845581054688, "learning_rate": 1.0917026336200093e-05, "loss": 34.2021, "step": 17125 }, { "epoch": 45.2320897986134, "grad_norm": 786.5060424804688, "learning_rate": 1.090481911740568e-05, "loss": 34.5422, "step": 17126 }, { "epoch": 45.23473093430175, "grad_norm": 1013.4205322265625, "learning_rate": 1.089261857520371e-05, "loss": 33.661, "step": 17127 }, { "epoch": 45.237372069990094, "grad_norm": 715.8646850585938, "learning_rate": 1.0880424709934862e-05, "loss": 33.6663, "step": 17128 }, { "epoch": 45.240013205678444, "grad_norm": 446.5251770019531, "learning_rate": 1.0868237521939744e-05, "loss": 34.5511, "step": 17129 }, { "epoch": 45.24265434136679, "grad_norm": 1122.053466796875, "learning_rate": 1.0856057011558618e-05, "loss": 33.8583, "step": 17130 }, { "epoch": 45.24529547705514, "grad_norm": 1115.4205322265625, "learning_rate": 1.0843883179131625e-05, "loss": 34.2996, "step": 17131 }, { "epoch": 45.24793661274348, "grad_norm": 3195.748046875, "learning_rate": 1.0831716024998689e-05, "loss": 34.3303, "step": 17132 }, { "epoch": 45.25057774843182, "grad_norm": 1232.122802734375, "learning_rate": 1.0819555549499593e-05, "loss": 33.6022, "step": 17133 }, { "epoch": 45.25321888412017, "grad_norm": 812.8096923828125, "learning_rate": 1.08074017529739e-05, "loss": 35.0187, "step": 17134 }, { "epoch": 45.255860019808516, "grad_norm": 831.1178588867188, "learning_rate": 1.0795254635760976e-05, "loss": 35.4168, "step": 17135 }, { "epoch": 45.258501155496866, "grad_norm": 2314.728271484375, "learning_rate": 1.0783114198200106e-05, "loss": 36.0263, "step": 17136 }, { "epoch": 45.26114229118521, "grad_norm": 5985.5107421875, "learning_rate": 1.0770980440630213e-05, "loss": 35.8574, "step": 17137 }, { "epoch": 45.26378342687356, "grad_norm": 1295.046142578125, "learning_rate": 1.0758853363390136e-05, "loss": 42.0706, "step": 17138 }, { "epoch": 45.2664245625619, "grad_norm": 2690.88671875, "learning_rate": 1.0746732966818606e-05, "loss": 41.9055, "step": 17139 }, { "epoch": 45.269065698250245, "grad_norm": 912.140625, "learning_rate": 1.0734619251253963e-05, "loss": 38.0496, "step": 17140 }, { "epoch": 45.271706833938595, "grad_norm": 764.9498901367188, "learning_rate": 1.0722512217034575e-05, "loss": 37.8086, "step": 17141 }, { "epoch": 45.27434796962694, "grad_norm": 791.1719360351562, "learning_rate": 1.0710411864498448e-05, "loss": 38.896, "step": 17142 }, { "epoch": 45.27698910531529, "grad_norm": 1481.767333984375, "learning_rate": 1.069831819398348e-05, "loss": 41.2369, "step": 17143 }, { "epoch": 45.27963024100363, "grad_norm": 741.9652709960938, "learning_rate": 1.068623120582743e-05, "loss": 43.5128, "step": 17144 }, { "epoch": 45.28227137669198, "grad_norm": 1150.5906982421875, "learning_rate": 1.0674150900367774e-05, "loss": 41.3281, "step": 17145 }, { "epoch": 45.28491251238032, "grad_norm": 500.56304931640625, "learning_rate": 1.0662077277941884e-05, "loss": 40.9185, "step": 17146 }, { "epoch": 45.287553648068666, "grad_norm": 542.1292724609375, "learning_rate": 1.0650010338886878e-05, "loss": 41.082, "step": 17147 }, { "epoch": 45.290194783757016, "grad_norm": 2583.049560546875, "learning_rate": 1.0637950083539738e-05, "loss": 39.0244, "step": 17148 }, { "epoch": 45.29283591944536, "grad_norm": 1211.482666015625, "learning_rate": 1.0625896512237249e-05, "loss": 37.6423, "step": 17149 }, { "epoch": 45.29547705513371, "grad_norm": 916.5259399414062, "learning_rate": 1.0613849625315974e-05, "loss": 37.8936, "step": 17150 }, { "epoch": 45.29811819082205, "grad_norm": 859.2813110351562, "learning_rate": 1.0601809423112312e-05, "loss": 36.8794, "step": 17151 }, { "epoch": 45.3007593265104, "grad_norm": 610.4711303710938, "learning_rate": 1.0589775905962496e-05, "loss": 36.4693, "step": 17152 }, { "epoch": 45.303400462198745, "grad_norm": 1691.4632568359375, "learning_rate": 1.0577749074202559e-05, "loss": 33.8213, "step": 17153 }, { "epoch": 45.306041597887095, "grad_norm": 1262.6943359375, "learning_rate": 1.0565728928168317e-05, "loss": 35.509, "step": 17154 }, { "epoch": 45.30868273357544, "grad_norm": 586.5899047851562, "learning_rate": 1.0553715468195447e-05, "loss": 35.0078, "step": 17155 }, { "epoch": 45.31132386926378, "grad_norm": 2673.39208984375, "learning_rate": 1.0541708694619429e-05, "loss": 35.417, "step": 17156 }, { "epoch": 45.31396500495213, "grad_norm": 1320.38818359375, "learning_rate": 1.0529708607775524e-05, "loss": 34.7917, "step": 17157 }, { "epoch": 45.316606140640474, "grad_norm": 730.0694580078125, "learning_rate": 1.051771520799888e-05, "loss": 34.5363, "step": 17158 }, { "epoch": 45.319247276328824, "grad_norm": 2336.868896484375, "learning_rate": 1.0505728495624283e-05, "loss": 34.6415, "step": 17159 }, { "epoch": 45.32188841201717, "grad_norm": 987.2416381835938, "learning_rate": 1.0493748470986554e-05, "loss": 36.0667, "step": 17160 }, { "epoch": 45.32452954770552, "grad_norm": 2588.214111328125, "learning_rate": 1.0481775134420224e-05, "loss": 30.4725, "step": 17161 }, { "epoch": 45.32717068339386, "grad_norm": 1164.448486328125, "learning_rate": 1.0469808486259585e-05, "loss": 11.8566, "step": 17162 }, { "epoch": 45.3298118190822, "grad_norm": 808.0460815429688, "learning_rate": 1.045784852683887e-05, "loss": 9.8941, "step": 17163 }, { "epoch": 45.33245295477055, "grad_norm": 445.2684020996094, "learning_rate": 1.0445895256492033e-05, "loss": 10.5462, "step": 17164 }, { "epoch": 45.335094090458895, "grad_norm": 825.6309814453125, "learning_rate": 1.0433948675552835e-05, "loss": 8.2681, "step": 17165 }, { "epoch": 45.337735226147245, "grad_norm": 3459.30908203125, "learning_rate": 1.0422008784354869e-05, "loss": 7.6299, "step": 17166 }, { "epoch": 45.34037636183559, "grad_norm": 673.4483642578125, "learning_rate": 1.0410075583231594e-05, "loss": 10.1094, "step": 17167 }, { "epoch": 45.34301749752394, "grad_norm": 48321.57421875, "learning_rate": 1.039814907251624e-05, "loss": 9.2239, "step": 17168 }, { "epoch": 45.34565863321228, "grad_norm": 2520.347412109375, "learning_rate": 1.0386229252541762e-05, "loss": 9.6775, "step": 17169 }, { "epoch": 45.348299768900624, "grad_norm": 2313.9306640625, "learning_rate": 1.037431612364112e-05, "loss": 13.5236, "step": 17170 }, { "epoch": 45.350940904588974, "grad_norm": 895.4385986328125, "learning_rate": 1.0362409686146907e-05, "loss": 34.6445, "step": 17171 }, { "epoch": 45.35358204027732, "grad_norm": 1614.4442138671875, "learning_rate": 1.0350509940391662e-05, "loss": 35.2501, "step": 17172 }, { "epoch": 45.35622317596567, "grad_norm": 1182.9822998046875, "learning_rate": 1.033861688670762e-05, "loss": 35.7579, "step": 17173 }, { "epoch": 45.35886431165401, "grad_norm": 2092.95263671875, "learning_rate": 1.032673052542693e-05, "loss": 33.4293, "step": 17174 }, { "epoch": 45.36150544734236, "grad_norm": 1810.6490478515625, "learning_rate": 1.0314850856881525e-05, "loss": 33.8238, "step": 17175 }, { "epoch": 45.3641465830307, "grad_norm": 913.3386840820312, "learning_rate": 1.0302977881403081e-05, "loss": 35.478, "step": 17176 }, { "epoch": 45.36678771871905, "grad_norm": 1242.8603515625, "learning_rate": 1.0291111599323194e-05, "loss": 33.3589, "step": 17177 }, { "epoch": 45.369428854407396, "grad_norm": 1310.5064697265625, "learning_rate": 1.0279252010973183e-05, "loss": 33.6326, "step": 17178 }, { "epoch": 45.37206999009574, "grad_norm": 911.2095947265625, "learning_rate": 1.0267399116684257e-05, "loss": 33.7766, "step": 17179 }, { "epoch": 45.37471112578409, "grad_norm": 2289.833251953125, "learning_rate": 1.0255552916787341e-05, "loss": 33.9256, "step": 17180 }, { "epoch": 45.37735226147243, "grad_norm": 1839.802734375, "learning_rate": 1.0243713411613315e-05, "loss": 36.0187, "step": 17181 }, { "epoch": 45.37999339716078, "grad_norm": 677.0631103515625, "learning_rate": 1.0231880601492771e-05, "loss": 33.8104, "step": 17182 }, { "epoch": 45.382634532849124, "grad_norm": 905.5548095703125, "learning_rate": 1.0220054486756087e-05, "loss": 35.1164, "step": 17183 }, { "epoch": 45.385275668537474, "grad_norm": 2317.225830078125, "learning_rate": 1.0208235067733524e-05, "loss": 33.4539, "step": 17184 }, { "epoch": 45.38791680422582, "grad_norm": 1820.0831298828125, "learning_rate": 1.0196422344755152e-05, "loss": 33.4166, "step": 17185 }, { "epoch": 45.39055793991416, "grad_norm": 1169.013671875, "learning_rate": 1.018461631815082e-05, "loss": 35.578, "step": 17186 }, { "epoch": 45.39319907560251, "grad_norm": 3011.232177734375, "learning_rate": 1.017281698825015e-05, "loss": 35.5539, "step": 17187 }, { "epoch": 45.39584021129085, "grad_norm": 1039.3834228515625, "learning_rate": 1.0161024355382741e-05, "loss": 40.2184, "step": 17188 }, { "epoch": 45.3984813469792, "grad_norm": 1802.933837890625, "learning_rate": 1.0149238419877832e-05, "loss": 37.5719, "step": 17189 }, { "epoch": 45.401122482667546, "grad_norm": 1070.2574462890625, "learning_rate": 1.0137459182064491e-05, "loss": 38.6148, "step": 17190 }, { "epoch": 45.403763618355896, "grad_norm": 458.7352294921875, "learning_rate": 1.0125686642271759e-05, "loss": 37.5301, "step": 17191 }, { "epoch": 45.40640475404424, "grad_norm": 1171.944091796875, "learning_rate": 1.011392080082829e-05, "loss": 39.164, "step": 17192 }, { "epoch": 45.40904588973258, "grad_norm": 844.3007202148438, "learning_rate": 1.0102161658062714e-05, "loss": 43.5961, "step": 17193 }, { "epoch": 45.41168702542093, "grad_norm": 801.5813598632812, "learning_rate": 1.0090409214303292e-05, "loss": 39.536, "step": 17194 }, { "epoch": 45.414328161109275, "grad_norm": 475.4529724121094, "learning_rate": 1.0078663469878291e-05, "loss": 40.7114, "step": 17195 }, { "epoch": 45.416969296797625, "grad_norm": 1065.983642578125, "learning_rate": 1.006692442511567e-05, "loss": 41.583, "step": 17196 }, { "epoch": 45.41961043248597, "grad_norm": 1345.302490234375, "learning_rate": 1.0055192080343196e-05, "loss": 38.3546, "step": 17197 }, { "epoch": 45.42225156817432, "grad_norm": 1618.857421875, "learning_rate": 1.0043466435888576e-05, "loss": 37.5902, "step": 17198 }, { "epoch": 45.42489270386266, "grad_norm": 1090.55908203125, "learning_rate": 1.003174749207919e-05, "loss": 37.7056, "step": 17199 }, { "epoch": 45.42753383955101, "grad_norm": 705.5106811523438, "learning_rate": 1.0020035249242304e-05, "loss": 37.6367, "step": 17200 }, { "epoch": 45.42753383955101, "eval_loss": 3.6586904525756836, "eval_runtime": 2.2317, "eval_samples_per_second": 221.809, "eval_steps_per_second": 27.782, "step": 17200 }, { "epoch": 45.43017497523935, "grad_norm": 833.8543701171875, "learning_rate": 1.000832970770496e-05, "loss": 35.8792, "step": 17201 }, { "epoch": 45.432816110927696, "grad_norm": 1119.151123046875, "learning_rate": 9.996630867794037e-06, "loss": 35.9561, "step": 17202 }, { "epoch": 45.435457246616046, "grad_norm": 1208.6455078125, "learning_rate": 9.98493872983619e-06, "loss": 35.5892, "step": 17203 }, { "epoch": 45.43809838230439, "grad_norm": 530.5911865234375, "learning_rate": 9.973253294157908e-06, "loss": 35.2086, "step": 17204 }, { "epoch": 45.44073951799274, "grad_norm": 1051.7227783203125, "learning_rate": 9.961574561085568e-06, "loss": 35.0729, "step": 17205 }, { "epoch": 45.44338065368108, "grad_norm": 1215.922607421875, "learning_rate": 9.949902530945271e-06, "loss": 34.5263, "step": 17206 }, { "epoch": 45.44602178936943, "grad_norm": 1749.4940185546875, "learning_rate": 9.938237204062894e-06, "loss": 35.0319, "step": 17207 }, { "epoch": 45.448662925057775, "grad_norm": 1283.7530517578125, "learning_rate": 9.926578580764234e-06, "loss": 35.6503, "step": 17208 }, { "epoch": 45.45130406074612, "grad_norm": 1646.5509033203125, "learning_rate": 9.914926661374835e-06, "loss": 35.211, "step": 17209 }, { "epoch": 45.45394519643447, "grad_norm": 4024.69775390625, "learning_rate": 9.903281446220103e-06, "loss": 41.4661, "step": 17210 }, { "epoch": 45.45658633212281, "grad_norm": 7550.77099609375, "learning_rate": 9.891642935625167e-06, "loss": 10.8881, "step": 17211 }, { "epoch": 45.45922746781116, "grad_norm": 1428.6510009765625, "learning_rate": 9.880011129915073e-06, "loss": 10.1487, "step": 17212 }, { "epoch": 45.461868603499504, "grad_norm": 1225.8182373046875, "learning_rate": 9.868386029414617e-06, "loss": 14.0566, "step": 17213 }, { "epoch": 45.464509739187854, "grad_norm": 2845.885498046875, "learning_rate": 9.8567676344484e-06, "loss": 8.8604, "step": 17214 }, { "epoch": 45.4671508748762, "grad_norm": 1945.572998046875, "learning_rate": 9.845155945340884e-06, "loss": 10.5508, "step": 17215 }, { "epoch": 45.46979201056454, "grad_norm": 799.5757446289062, "learning_rate": 9.83355096241631e-06, "loss": 10.9366, "step": 17216 }, { "epoch": 45.47243314625289, "grad_norm": 929.955322265625, "learning_rate": 9.821952685998752e-06, "loss": 12.6368, "step": 17217 }, { "epoch": 45.47507428194123, "grad_norm": 5736.91064453125, "learning_rate": 9.810361116412092e-06, "loss": 9.7482, "step": 17218 }, { "epoch": 45.47771541762958, "grad_norm": 2461.949462890625, "learning_rate": 9.798776253979984e-06, "loss": 11.5828, "step": 17219 }, { "epoch": 45.480356553317925, "grad_norm": 865.3766479492188, "learning_rate": 9.78719809902598e-06, "loss": 26.6335, "step": 17220 }, { "epoch": 45.482997689006275, "grad_norm": 912.024658203125, "learning_rate": 9.775626651873315e-06, "loss": 35.7514, "step": 17221 }, { "epoch": 45.48563882469462, "grad_norm": 814.913330078125, "learning_rate": 9.764061912845151e-06, "loss": 34.0872, "step": 17222 }, { "epoch": 45.48827996038297, "grad_norm": 576.1611938476562, "learning_rate": 9.75250388226448e-06, "loss": 35.4581, "step": 17223 }, { "epoch": 45.49092109607131, "grad_norm": 761.3429565429688, "learning_rate": 9.740952560453985e-06, "loss": 33.8054, "step": 17224 }, { "epoch": 45.493562231759654, "grad_norm": 2602.3759765625, "learning_rate": 9.729407947736247e-06, "loss": 34.7851, "step": 17225 }, { "epoch": 45.496203367448004, "grad_norm": 800.5902709960938, "learning_rate": 9.717870044433668e-06, "loss": 34.9376, "step": 17226 }, { "epoch": 45.49884450313635, "grad_norm": 2184.183349609375, "learning_rate": 9.706338850868412e-06, "loss": 34.7142, "step": 17227 }, { "epoch": 45.5014856388247, "grad_norm": 1015.8903198242188, "learning_rate": 9.694814367362525e-06, "loss": 33.1693, "step": 17228 }, { "epoch": 45.50412677451304, "grad_norm": 1260.2314453125, "learning_rate": 9.683296594237723e-06, "loss": 34.1663, "step": 17229 }, { "epoch": 45.50676791020139, "grad_norm": 2486.016845703125, "learning_rate": 9.67178553181572e-06, "loss": 34.4563, "step": 17230 }, { "epoch": 45.50940904588973, "grad_norm": 1493.4036865234375, "learning_rate": 9.660281180417924e-06, "loss": 34.9267, "step": 17231 }, { "epoch": 45.512050181578076, "grad_norm": 1515.479248046875, "learning_rate": 9.64878354036558e-06, "loss": 33.7646, "step": 17232 }, { "epoch": 45.514691317266426, "grad_norm": 3105.3955078125, "learning_rate": 9.637292611979791e-06, "loss": 34.9868, "step": 17233 }, { "epoch": 45.51733245295477, "grad_norm": 1699.0927734375, "learning_rate": 9.625808395581414e-06, "loss": 34.3201, "step": 17234 }, { "epoch": 45.51997358864312, "grad_norm": 1096.487060546875, "learning_rate": 9.614330891491107e-06, "loss": 33.5188, "step": 17235 }, { "epoch": 45.52261472433146, "grad_norm": 1188.822021484375, "learning_rate": 9.60286010002942e-06, "loss": 35.4902, "step": 17236 }, { "epoch": 45.52525586001981, "grad_norm": 1266.6474609375, "learning_rate": 9.591396021516651e-06, "loss": 36.7527, "step": 17237 }, { "epoch": 45.527896995708154, "grad_norm": 4528.234375, "learning_rate": 9.579938656272935e-06, "loss": 40.0985, "step": 17238 }, { "epoch": 45.5305381313965, "grad_norm": 1489.7232666015625, "learning_rate": 9.568488004618153e-06, "loss": 38.5046, "step": 17239 }, { "epoch": 45.53317926708485, "grad_norm": 600.109619140625, "learning_rate": 9.557044066872134e-06, "loss": 37.7627, "step": 17240 }, { "epoch": 45.53582040277319, "grad_norm": 1508.9954833984375, "learning_rate": 9.545606843354426e-06, "loss": 38.3052, "step": 17241 }, { "epoch": 45.53846153846154, "grad_norm": 2074.751953125, "learning_rate": 9.534176334384386e-06, "loss": 39.5537, "step": 17242 }, { "epoch": 45.54110267414988, "grad_norm": 4948.33154296875, "learning_rate": 9.522752540281204e-06, "loss": 39.6227, "step": 17243 }, { "epoch": 45.54374380983823, "grad_norm": 653.7360229492188, "learning_rate": 9.511335461363902e-06, "loss": 41.8943, "step": 17244 }, { "epoch": 45.546384945526576, "grad_norm": 1375.6290283203125, "learning_rate": 9.499925097951278e-06, "loss": 38.971, "step": 17245 }, { "epoch": 45.549026081214926, "grad_norm": 1700.0687255859375, "learning_rate": 9.488521450361997e-06, "loss": 42.8399, "step": 17246 }, { "epoch": 45.55166721690327, "grad_norm": 623.2068481445312, "learning_rate": 9.47712451891447e-06, "loss": 41.6915, "step": 17247 }, { "epoch": 45.55430835259161, "grad_norm": 1379.741943359375, "learning_rate": 9.465734303926915e-06, "loss": 37.7558, "step": 17248 }, { "epoch": 45.55694948827996, "grad_norm": 1519.6259765625, "learning_rate": 9.454350805717466e-06, "loss": 39.599, "step": 17249 }, { "epoch": 45.559590623968305, "grad_norm": 516.8675537109375, "learning_rate": 9.442974024603923e-06, "loss": 39.381, "step": 17250 }, { "epoch": 45.562231759656655, "grad_norm": 1169.3388671875, "learning_rate": 9.431603960904062e-06, "loss": 36.5731, "step": 17251 }, { "epoch": 45.564872895345, "grad_norm": 1507.3846435546875, "learning_rate": 9.420240614935321e-06, "loss": 34.5639, "step": 17252 }, { "epoch": 45.56751403103335, "grad_norm": 1309.5247802734375, "learning_rate": 9.40888398701506e-06, "loss": 35.8285, "step": 17253 }, { "epoch": 45.57015516672169, "grad_norm": 1093.5159912109375, "learning_rate": 9.397534077460357e-06, "loss": 35.4317, "step": 17254 }, { "epoch": 45.57279630241003, "grad_norm": 1078.9520263671875, "learning_rate": 9.386190886588208e-06, "loss": 33.8378, "step": 17255 }, { "epoch": 45.57543743809838, "grad_norm": 1018.2072143554688, "learning_rate": 9.374854414715306e-06, "loss": 34.1218, "step": 17256 }, { "epoch": 45.578078573786726, "grad_norm": 592.4717407226562, "learning_rate": 9.36352466215823e-06, "loss": 35.229, "step": 17257 }, { "epoch": 45.580719709475076, "grad_norm": 1061.884521484375, "learning_rate": 9.352201629233393e-06, "loss": 34.6306, "step": 17258 }, { "epoch": 45.58336084516342, "grad_norm": 443.1015930175781, "learning_rate": 9.340885316256959e-06, "loss": 37.101, "step": 17259 }, { "epoch": 45.58600198085177, "grad_norm": 6737.0341796875, "learning_rate": 9.329575723544925e-06, "loss": 43.5947, "step": 17260 }, { "epoch": 45.58864311654011, "grad_norm": 18990.78125, "learning_rate": 9.318272851413123e-06, "loss": 10.0401, "step": 17261 }, { "epoch": 45.591284252228455, "grad_norm": 960.0808715820312, "learning_rate": 9.306976700177162e-06, "loss": 11.8814, "step": 17262 }, { "epoch": 45.593925387916805, "grad_norm": 743.5866088867188, "learning_rate": 9.295687270152508e-06, "loss": 15.0573, "step": 17263 }, { "epoch": 45.59656652360515, "grad_norm": 1510.9305419921875, "learning_rate": 9.284404561654359e-06, "loss": 10.2259, "step": 17264 }, { "epoch": 45.5992076592935, "grad_norm": 558.5340576171875, "learning_rate": 9.273128574997819e-06, "loss": 8.6823, "step": 17265 }, { "epoch": 45.60184879498184, "grad_norm": 2995.27392578125, "learning_rate": 9.26185931049775e-06, "loss": 12.0763, "step": 17266 }, { "epoch": 45.60448993067019, "grad_norm": 13092.65234375, "learning_rate": 9.250596768468844e-06, "loss": 14.6326, "step": 17267 }, { "epoch": 45.607131066358534, "grad_norm": 1527.68896484375, "learning_rate": 9.2393409492256e-06, "loss": 9.5473, "step": 17268 }, { "epoch": 45.609772202046884, "grad_norm": 758.2503051757812, "learning_rate": 9.22809185308232e-06, "loss": 9.2565, "step": 17269 }, { "epoch": 45.61241333773523, "grad_norm": 1542.4237060546875, "learning_rate": 9.216849480353174e-06, "loss": 24.3217, "step": 17270 }, { "epoch": 45.61505447342357, "grad_norm": 759.3069458007812, "learning_rate": 9.205613831352017e-06, "loss": 34.0089, "step": 17271 }, { "epoch": 45.61769560911192, "grad_norm": 826.1646118164062, "learning_rate": 9.194384906392683e-06, "loss": 34.6844, "step": 17272 }, { "epoch": 45.62033674480026, "grad_norm": 1055.0767822265625, "learning_rate": 9.183162705788672e-06, "loss": 35.3947, "step": 17273 }, { "epoch": 45.62297788048861, "grad_norm": 612.0464477539062, "learning_rate": 9.171947229853345e-06, "loss": 34.9943, "step": 17274 }, { "epoch": 45.625619016176955, "grad_norm": 631.3519897460938, "learning_rate": 9.160738478899978e-06, "loss": 33.5936, "step": 17275 }, { "epoch": 45.628260151865305, "grad_norm": 1206.638427734375, "learning_rate": 9.149536453241487e-06, "loss": 33.6806, "step": 17276 }, { "epoch": 45.63090128755365, "grad_norm": 626.0421142578125, "learning_rate": 9.138341153190705e-06, "loss": 34.5031, "step": 17277 }, { "epoch": 45.63354242324199, "grad_norm": 658.9154663085938, "learning_rate": 9.127152579060244e-06, "loss": 34.7561, "step": 17278 }, { "epoch": 45.63618355893034, "grad_norm": 670.3043823242188, "learning_rate": 9.115970731162576e-06, "loss": 33.8461, "step": 17279 }, { "epoch": 45.638824694618684, "grad_norm": 634.2061157226562, "learning_rate": 9.104795609809925e-06, "loss": 34.4524, "step": 17280 }, { "epoch": 45.641465830307034, "grad_norm": 1327.504638671875, "learning_rate": 9.093627215314343e-06, "loss": 35.8608, "step": 17281 }, { "epoch": 45.64410696599538, "grad_norm": 2972.85107421875, "learning_rate": 9.082465547987722e-06, "loss": 35.7319, "step": 17282 }, { "epoch": 45.64674810168373, "grad_norm": 1000.8757934570312, "learning_rate": 9.071310608141704e-06, "loss": 34.6469, "step": 17283 }, { "epoch": 45.64938923737207, "grad_norm": 638.485595703125, "learning_rate": 9.060162396087784e-06, "loss": 34.2915, "step": 17284 }, { "epoch": 45.65203037306041, "grad_norm": 1400.6173095703125, "learning_rate": 9.049020912137328e-06, "loss": 34.4567, "step": 17285 }, { "epoch": 45.65467150874876, "grad_norm": 3131.748779296875, "learning_rate": 9.037886156601422e-06, "loss": 35.0233, "step": 17286 }, { "epoch": 45.657312644437106, "grad_norm": 25433.04296875, "learning_rate": 9.02675812979098e-06, "loss": 35.9346, "step": 17287 }, { "epoch": 45.659953780125456, "grad_norm": 6283.8203125, "learning_rate": 9.015636832016782e-06, "loss": 37.7854, "step": 17288 }, { "epoch": 45.6625949158138, "grad_norm": 1111.3544921875, "learning_rate": 9.00452226358936e-06, "loss": 39.8124, "step": 17289 }, { "epoch": 45.66523605150215, "grad_norm": 2356.1435546875, "learning_rate": 8.993414424819074e-06, "loss": 39.3359, "step": 17290 }, { "epoch": 45.66787718719049, "grad_norm": 1118.6865234375, "learning_rate": 8.982313316016094e-06, "loss": 38.4501, "step": 17291 }, { "epoch": 45.67051832287884, "grad_norm": 975.4207153320312, "learning_rate": 8.97121893749045e-06, "loss": 37.7215, "step": 17292 }, { "epoch": 45.673159458567184, "grad_norm": 1070.82568359375, "learning_rate": 8.960131289551949e-06, "loss": 39.9119, "step": 17293 }, { "epoch": 45.67580059425553, "grad_norm": 570.3385009765625, "learning_rate": 8.949050372510176e-06, "loss": 39.7115, "step": 17294 }, { "epoch": 45.67844172994388, "grad_norm": 622.63330078125, "learning_rate": 8.937976186674551e-06, "loss": 43.2884, "step": 17295 }, { "epoch": 45.68108286563222, "grad_norm": 804.8334350585938, "learning_rate": 8.926908732354355e-06, "loss": 39.9452, "step": 17296 }, { "epoch": 45.68372400132057, "grad_norm": 386.9407958984375, "learning_rate": 8.915848009858618e-06, "loss": 39.1574, "step": 17297 }, { "epoch": 45.68636513700891, "grad_norm": 420.1756591796875, "learning_rate": 8.904794019496205e-06, "loss": 38.2807, "step": 17298 }, { "epoch": 45.68900627269726, "grad_norm": 962.2113647460938, "learning_rate": 8.893746761575783e-06, "loss": 40.8478, "step": 17299 }, { "epoch": 45.691647408385606, "grad_norm": 471.7637939453125, "learning_rate": 8.882706236405884e-06, "loss": 36.6145, "step": 17300 }, { "epoch": 45.69428854407395, "grad_norm": 832.91357421875, "learning_rate": 8.871672444294738e-06, "loss": 36.9336, "step": 17301 }, { "epoch": 45.6969296797623, "grad_norm": 886.7086181640625, "learning_rate": 8.860645385550481e-06, "loss": 37.6902, "step": 17302 }, { "epoch": 45.69957081545064, "grad_norm": 810.0604858398438, "learning_rate": 8.849625060481065e-06, "loss": 36.6512, "step": 17303 }, { "epoch": 45.70221195113899, "grad_norm": 1456.7296142578125, "learning_rate": 8.838611469394215e-06, "loss": 35.0331, "step": 17304 }, { "epoch": 45.704853086827335, "grad_norm": 505.4938659667969, "learning_rate": 8.827604612597463e-06, "loss": 34.9348, "step": 17305 }, { "epoch": 45.707494222515685, "grad_norm": 1034.9002685546875, "learning_rate": 8.8166044903982e-06, "loss": 36.1001, "step": 17306 }, { "epoch": 45.71013535820403, "grad_norm": 2546.8740234375, "learning_rate": 8.805611103103572e-06, "loss": 35.3008, "step": 17307 }, { "epoch": 45.71277649389237, "grad_norm": 744.129638671875, "learning_rate": 8.794624451020556e-06, "loss": 35.282, "step": 17308 }, { "epoch": 45.71541762958072, "grad_norm": 697.6663818359375, "learning_rate": 8.783644534455959e-06, "loss": 35.4364, "step": 17309 }, { "epoch": 45.71805876526906, "grad_norm": 1023.7959594726562, "learning_rate": 8.772671353716399e-06, "loss": 34.6716, "step": 17310 }, { "epoch": 45.72069990095741, "grad_norm": 4668.1923828125, "learning_rate": 8.761704909108298e-06, "loss": 37.1684, "step": 17311 }, { "epoch": 45.723341036645756, "grad_norm": 2451.062255859375, "learning_rate": 8.750745200937827e-06, "loss": 12.2175, "step": 17312 }, { "epoch": 45.725982172334106, "grad_norm": 584.1695556640625, "learning_rate": 8.73979222951113e-06, "loss": 13.7323, "step": 17313 }, { "epoch": 45.72862330802245, "grad_norm": 367.12939453125, "learning_rate": 8.728845995133988e-06, "loss": 20.4572, "step": 17314 }, { "epoch": 45.7312644437108, "grad_norm": 1945.5963134765625, "learning_rate": 8.717906498112105e-06, "loss": 10.7473, "step": 17315 }, { "epoch": 45.73390557939914, "grad_norm": 6933.7958984375, "learning_rate": 8.706973738750957e-06, "loss": 8.5904, "step": 17316 }, { "epoch": 45.736546715087485, "grad_norm": 509.255859375, "learning_rate": 8.6960477173558e-06, "loss": 10.0291, "step": 17317 }, { "epoch": 45.739187850775835, "grad_norm": 1143.90185546875, "learning_rate": 8.685128434231777e-06, "loss": 18.1527, "step": 17318 }, { "epoch": 45.74182898646418, "grad_norm": 1389.28515625, "learning_rate": 8.674215889683757e-06, "loss": 12.8042, "step": 17319 }, { "epoch": 45.74447012215253, "grad_norm": 818.0726928710938, "learning_rate": 8.663310084016523e-06, "loss": 7.9796, "step": 17320 }, { "epoch": 45.74711125784087, "grad_norm": 15982.6259765625, "learning_rate": 8.652411017534584e-06, "loss": 33.1715, "step": 17321 }, { "epoch": 45.74975239352922, "grad_norm": 1145.241943359375, "learning_rate": 8.641518690542276e-06, "loss": 34.339, "step": 17322 }, { "epoch": 45.752393529217564, "grad_norm": 1530.51220703125, "learning_rate": 8.630633103343805e-06, "loss": 35.9293, "step": 17323 }, { "epoch": 45.75503466490591, "grad_norm": 716.8244018554688, "learning_rate": 8.619754256243117e-06, "loss": 35.0323, "step": 17324 }, { "epoch": 45.75767580059426, "grad_norm": 811.6814575195312, "learning_rate": 8.608882149543972e-06, "loss": 33.4305, "step": 17325 }, { "epoch": 45.7603169362826, "grad_norm": 1480.8358154296875, "learning_rate": 8.59801678354999e-06, "loss": 33.2616, "step": 17326 }, { "epoch": 45.76295807197095, "grad_norm": 1429.61474609375, "learning_rate": 8.58715815856459e-06, "loss": 34.9892, "step": 17327 }, { "epoch": 45.76559920765929, "grad_norm": 643.3244018554688, "learning_rate": 8.576306274890976e-06, "loss": 35.0435, "step": 17328 }, { "epoch": 45.76824034334764, "grad_norm": 952.7316284179688, "learning_rate": 8.565461132832215e-06, "loss": 33.7954, "step": 17329 }, { "epoch": 45.770881479035985, "grad_norm": 1150.9046630859375, "learning_rate": 8.554622732691087e-06, "loss": 34.7254, "step": 17330 }, { "epoch": 45.77352261472433, "grad_norm": 678.6351928710938, "learning_rate": 8.543791074770324e-06, "loss": 35.0428, "step": 17331 }, { "epoch": 45.77616375041268, "grad_norm": 946.2312622070312, "learning_rate": 8.532966159372352e-06, "loss": 34.3285, "step": 17332 }, { "epoch": 45.77880488610102, "grad_norm": 1603.0980224609375, "learning_rate": 8.522147986799456e-06, "loss": 34.1078, "step": 17333 }, { "epoch": 45.78144602178937, "grad_norm": 2654.392578125, "learning_rate": 8.5113365573537e-06, "loss": 34.7319, "step": 17334 }, { "epoch": 45.784087157477714, "grad_norm": 643.853515625, "learning_rate": 8.500531871337036e-06, "loss": 33.2926, "step": 17335 }, { "epoch": 45.786728293166064, "grad_norm": 793.3998413085938, "learning_rate": 8.48973392905117e-06, "loss": 34.41, "step": 17336 }, { "epoch": 45.78936942885441, "grad_norm": 1095.03515625, "learning_rate": 8.478942730797551e-06, "loss": 36.2332, "step": 17337 }, { "epoch": 45.79201056454276, "grad_norm": 1678.4219970703125, "learning_rate": 8.468158276877635e-06, "loss": 38.5594, "step": 17338 }, { "epoch": 45.7946517002311, "grad_norm": 931.438232421875, "learning_rate": 8.457380567592487e-06, "loss": 39.2194, "step": 17339 }, { "epoch": 45.79729283591944, "grad_norm": 881.6495971679688, "learning_rate": 8.446609603243115e-06, "loss": 38.2976, "step": 17340 }, { "epoch": 45.79993397160779, "grad_norm": 1149.7686767578125, "learning_rate": 8.435845384130281e-06, "loss": 38.3997, "step": 17341 }, { "epoch": 45.802575107296136, "grad_norm": 796.2359008789062, "learning_rate": 8.425087910554546e-06, "loss": 38.809, "step": 17342 }, { "epoch": 45.805216242984486, "grad_norm": 1839.3509521484375, "learning_rate": 8.41433718281634e-06, "loss": 40.8063, "step": 17343 }, { "epoch": 45.80785737867283, "grad_norm": 1276.7601318359375, "learning_rate": 8.403593201215808e-06, "loss": 43.6902, "step": 17344 }, { "epoch": 45.81049851436118, "grad_norm": 1873.51708984375, "learning_rate": 8.392855966053047e-06, "loss": 39.8215, "step": 17345 }, { "epoch": 45.81313965004952, "grad_norm": 980.6915283203125, "learning_rate": 8.382125477627871e-06, "loss": 40.6432, "step": 17346 }, { "epoch": 45.815780785737864, "grad_norm": 867.4945068359375, "learning_rate": 8.371401736239847e-06, "loss": 38.5871, "step": 17347 }, { "epoch": 45.818421921426214, "grad_norm": 698.3887329101562, "learning_rate": 8.36068474218854e-06, "loss": 37.7291, "step": 17348 }, { "epoch": 45.82106305711456, "grad_norm": 628.9057006835938, "learning_rate": 8.349974495773182e-06, "loss": 38.165, "step": 17349 }, { "epoch": 45.82370419280291, "grad_norm": 1158.450439453125, "learning_rate": 8.339270997292814e-06, "loss": 37.9023, "step": 17350 }, { "epoch": 45.82634532849125, "grad_norm": 737.0029907226562, "learning_rate": 8.328574247046333e-06, "loss": 37.302, "step": 17351 }, { "epoch": 45.8289864641796, "grad_norm": 1094.945556640625, "learning_rate": 8.317884245332446e-06, "loss": 36.5877, "step": 17352 }, { "epoch": 45.83162759986794, "grad_norm": 963.0167236328125, "learning_rate": 8.307200992449692e-06, "loss": 36.1458, "step": 17353 }, { "epoch": 45.834268735556286, "grad_norm": 402.2606506347656, "learning_rate": 8.296524488696333e-06, "loss": 35.7792, "step": 17354 }, { "epoch": 45.836909871244636, "grad_norm": 979.0894775390625, "learning_rate": 8.285854734370574e-06, "loss": 34.1145, "step": 17355 }, { "epoch": 45.83955100693298, "grad_norm": 663.093505859375, "learning_rate": 8.275191729770343e-06, "loss": 34.1959, "step": 17356 }, { "epoch": 45.84219214262133, "grad_norm": 762.2584228515625, "learning_rate": 8.264535475193374e-06, "loss": 34.7243, "step": 17357 }, { "epoch": 45.84483327830967, "grad_norm": 3838.958740234375, "learning_rate": 8.253885970937236e-06, "loss": 34.0313, "step": 17358 }, { "epoch": 45.84747441399802, "grad_norm": 643.4646606445312, "learning_rate": 8.243243217299357e-06, "loss": 35.0537, "step": 17359 }, { "epoch": 45.850115549686365, "grad_norm": 8062.45361328125, "learning_rate": 8.23260721457686e-06, "loss": 38.1682, "step": 17360 }, { "epoch": 45.852756685374715, "grad_norm": 32591.943359375, "learning_rate": 8.221977963066785e-06, "loss": 14.6623, "step": 17361 }, { "epoch": 45.85539782106306, "grad_norm": 2557.554931640625, "learning_rate": 8.21135546306595e-06, "loss": 11.2171, "step": 17362 }, { "epoch": 45.8580389567514, "grad_norm": 1745.651611328125, "learning_rate": 8.200739714871009e-06, "loss": 12.1299, "step": 17363 }, { "epoch": 45.86068009243975, "grad_norm": 449.1130065917969, "learning_rate": 8.190130718778332e-06, "loss": 10.0034, "step": 17364 }, { "epoch": 45.86332122812809, "grad_norm": 991.5817260742188, "learning_rate": 8.179528475084214e-06, "loss": 10.8754, "step": 17365 }, { "epoch": 45.86596236381644, "grad_norm": 4094.07275390625, "learning_rate": 8.168932984084721e-06, "loss": 16.3638, "step": 17366 }, { "epoch": 45.868603499504786, "grad_norm": 620.9723510742188, "learning_rate": 8.158344246075727e-06, "loss": 8.7063, "step": 17367 }, { "epoch": 45.871244635193136, "grad_norm": 6627.9814453125, "learning_rate": 8.147762261352886e-06, "loss": 9.0926, "step": 17368 }, { "epoch": 45.87388577088148, "grad_norm": 2539.631591796875, "learning_rate": 8.13718703021174e-06, "loss": 10.8568, "step": 17369 }, { "epoch": 45.87652690656982, "grad_norm": 4544.07861328125, "learning_rate": 8.126618552947552e-06, "loss": 21.8474, "step": 17370 }, { "epoch": 45.87916804225817, "grad_norm": 1294.766845703125, "learning_rate": 8.116056829855444e-06, "loss": 34.5864, "step": 17371 }, { "epoch": 45.881809177946515, "grad_norm": 979.3794555664062, "learning_rate": 8.105501861230353e-06, "loss": 33.1214, "step": 17372 }, { "epoch": 45.884450313634865, "grad_norm": 913.6779174804688, "learning_rate": 8.094953647367037e-06, "loss": 34.5761, "step": 17373 }, { "epoch": 45.88709144932321, "grad_norm": 1626.391357421875, "learning_rate": 8.084412188560042e-06, "loss": 34.3091, "step": 17374 }, { "epoch": 45.88973258501156, "grad_norm": 1281.8702392578125, "learning_rate": 8.073877485103742e-06, "loss": 34.3987, "step": 17375 }, { "epoch": 45.8923737206999, "grad_norm": 1198.325927734375, "learning_rate": 8.063349537292264e-06, "loss": 35.8947, "step": 17376 }, { "epoch": 45.895014856388244, "grad_norm": 1993.5546875, "learning_rate": 8.05282834541965e-06, "loss": 34.0752, "step": 17377 }, { "epoch": 45.897655992076594, "grad_norm": 688.381591796875, "learning_rate": 8.042313909779691e-06, "loss": 33.8004, "step": 17378 }, { "epoch": 45.90029712776494, "grad_norm": 634.5468139648438, "learning_rate": 8.031806230665933e-06, "loss": 35.0975, "step": 17379 }, { "epoch": 45.90293826345329, "grad_norm": 618.3343505859375, "learning_rate": 8.02130530837189e-06, "loss": 34.7696, "step": 17380 }, { "epoch": 45.90557939914163, "grad_norm": 1195.8487548828125, "learning_rate": 8.010811143190743e-06, "loss": 34.6579, "step": 17381 }, { "epoch": 45.90822053482998, "grad_norm": 1161.115234375, "learning_rate": 8.00032373541551e-06, "loss": 33.303, "step": 17382 }, { "epoch": 45.91086167051832, "grad_norm": 1398.2698974609375, "learning_rate": 7.989843085339122e-06, "loss": 34.4833, "step": 17383 }, { "epoch": 45.91350280620667, "grad_norm": 1071.8643798828125, "learning_rate": 7.979369193254177e-06, "loss": 33.582, "step": 17384 }, { "epoch": 45.916143941895015, "grad_norm": 1576.6663818359375, "learning_rate": 7.968902059453193e-06, "loss": 34.9612, "step": 17385 }, { "epoch": 45.91878507758336, "grad_norm": 2167.40966796875, "learning_rate": 7.958441684228435e-06, "loss": 35.1299, "step": 17386 }, { "epoch": 45.92142621327171, "grad_norm": 2101.384521484375, "learning_rate": 7.947988067872031e-06, "loss": 37.7856, "step": 17387 }, { "epoch": 45.92406734896005, "grad_norm": 2039.041015625, "learning_rate": 7.937541210675858e-06, "loss": 39.5178, "step": 17388 }, { "epoch": 45.9267084846484, "grad_norm": 2565.794921875, "learning_rate": 7.927101112931628e-06, "loss": 40.0348, "step": 17389 }, { "epoch": 45.929349620336744, "grad_norm": 1614.318603515625, "learning_rate": 7.916667774930914e-06, "loss": 38.8433, "step": 17390 }, { "epoch": 45.931990756025094, "grad_norm": 847.7891235351562, "learning_rate": 7.906241196965036e-06, "loss": 39.6765, "step": 17391 }, { "epoch": 45.93463189171344, "grad_norm": 927.1287231445312, "learning_rate": 7.89582137932518e-06, "loss": 42.4998, "step": 17392 }, { "epoch": 45.93727302740178, "grad_norm": 1381.5970458984375, "learning_rate": 7.885408322302278e-06, "loss": 39.6944, "step": 17393 }, { "epoch": 45.93991416309013, "grad_norm": 1232.695068359375, "learning_rate": 7.875002026187129e-06, "loss": 39.824, "step": 17394 }, { "epoch": 45.94255529877847, "grad_norm": 1357.793701171875, "learning_rate": 7.864602491270329e-06, "loss": 36.4279, "step": 17395 }, { "epoch": 45.94519643446682, "grad_norm": 1161.370849609375, "learning_rate": 7.854209717842232e-06, "loss": 36.8243, "step": 17396 }, { "epoch": 45.947837570155166, "grad_norm": 785.0805053710938, "learning_rate": 7.843823706193104e-06, "loss": 35.283, "step": 17397 }, { "epoch": 45.950478705843516, "grad_norm": 625.3853759765625, "learning_rate": 7.833444456612936e-06, "loss": 34.5743, "step": 17398 }, { "epoch": 45.95311984153186, "grad_norm": 742.6533813476562, "learning_rate": 7.823071969391582e-06, "loss": 34.6708, "step": 17399 }, { "epoch": 45.9557609772202, "grad_norm": 1854.08154296875, "learning_rate": 7.812706244818668e-06, "loss": 38.0785, "step": 17400 }, { "epoch": 45.9557609772202, "eval_loss": 3.6726982593536377, "eval_runtime": 2.1566, "eval_samples_per_second": 229.531, "eval_steps_per_second": 28.749, "step": 17400 }, { "epoch": 45.95840211290855, "grad_norm": 5244.21337890625, "learning_rate": 7.802347283183685e-06, "loss": 17.5052, "step": 17401 }, { "epoch": 45.961043248596894, "grad_norm": 1699.042724609375, "learning_rate": 7.791995084775905e-06, "loss": 12.5609, "step": 17402 }, { "epoch": 45.963684384285244, "grad_norm": 1625.8846435546875, "learning_rate": 7.781649649884342e-06, "loss": 11.7557, "step": 17403 }, { "epoch": 45.96632551997359, "grad_norm": 1050.3883056640625, "learning_rate": 7.771310978797935e-06, "loss": 8.746, "step": 17404 }, { "epoch": 45.96896665566194, "grad_norm": 657.6646118164062, "learning_rate": 7.760979071805397e-06, "loss": 9.8262, "step": 17405 }, { "epoch": 45.97160779135028, "grad_norm": 2404.6015625, "learning_rate": 7.750653929195162e-06, "loss": 13.6633, "step": 17406 }, { "epoch": 45.97424892703863, "grad_norm": 788.4217529296875, "learning_rate": 7.740335551255668e-06, "loss": 35.9511, "step": 17407 }, { "epoch": 45.97689006272697, "grad_norm": 846.0668334960938, "learning_rate": 7.73002393827496e-06, "loss": 35.0344, "step": 17408 }, { "epoch": 45.979531198415316, "grad_norm": 3221.8759765625, "learning_rate": 7.719719090541033e-06, "loss": 32.7466, "step": 17409 }, { "epoch": 45.982172334103666, "grad_norm": 912.4186401367188, "learning_rate": 7.709421008341655e-06, "loss": 35.1264, "step": 17410 }, { "epoch": 45.98481346979201, "grad_norm": 678.662353515625, "learning_rate": 7.699129691964318e-06, "loss": 34.2516, "step": 17411 }, { "epoch": 45.98745460548036, "grad_norm": 972.3740234375, "learning_rate": 7.688845141696487e-06, "loss": 34.4421, "step": 17412 }, { "epoch": 45.9900957411687, "grad_norm": 3579.915283203125, "learning_rate": 7.678567357825266e-06, "loss": 32.9972, "step": 17413 }, { "epoch": 45.99273687685705, "grad_norm": 753.1476440429688, "learning_rate": 7.668296340637732e-06, "loss": 35.8726, "step": 17414 }, { "epoch": 45.995378012545395, "grad_norm": 1776.11328125, "learning_rate": 7.65803209042068e-06, "loss": 33.3071, "step": 17415 }, { "epoch": 45.99801914823374, "grad_norm": 562.1405029296875, "learning_rate": 7.647774607460717e-06, "loss": 35.051, "step": 17416 }, { "epoch": 46.00066028392209, "grad_norm": 915.6514282226562, "learning_rate": 7.637523892044252e-06, "loss": 37.8473, "step": 17417 }, { "epoch": 46.00330141961043, "grad_norm": 695.0635986328125, "learning_rate": 7.627279944457582e-06, "loss": 38.8453, "step": 17418 }, { "epoch": 46.00594255529878, "grad_norm": 1093.5645751953125, "learning_rate": 7.617042764986759e-06, "loss": 39.882, "step": 17419 }, { "epoch": 46.00858369098712, "grad_norm": 596.4004516601562, "learning_rate": 7.606812353917636e-06, "loss": 37.8485, "step": 17420 }, { "epoch": 46.01122482667547, "grad_norm": 1236.572021484375, "learning_rate": 7.596588711535873e-06, "loss": 39.8854, "step": 17421 }, { "epoch": 46.013865962363816, "grad_norm": 985.8015747070312, "learning_rate": 7.586371838126966e-06, "loss": 41.4575, "step": 17422 }, { "epoch": 46.01650709805216, "grad_norm": 930.140380859375, "learning_rate": 7.576161733976239e-06, "loss": 40.3433, "step": 17423 }, { "epoch": 46.01914823374051, "grad_norm": 749.6077880859375, "learning_rate": 7.565958399368772e-06, "loss": 42.1661, "step": 17424 }, { "epoch": 46.02178936942885, "grad_norm": 584.945556640625, "learning_rate": 7.555761834589503e-06, "loss": 39.2672, "step": 17425 }, { "epoch": 46.0244305051172, "grad_norm": 957.501220703125, "learning_rate": 7.545572039923176e-06, "loss": 40.2942, "step": 17426 }, { "epoch": 46.027071640805545, "grad_norm": 2062.295166015625, "learning_rate": 7.535389015654315e-06, "loss": 39.3388, "step": 17427 }, { "epoch": 46.029712776493895, "grad_norm": 1413.695556640625, "learning_rate": 7.52521276206733e-06, "loss": 38.3439, "step": 17428 }, { "epoch": 46.03235391218224, "grad_norm": 1210.0904541015625, "learning_rate": 7.515043279446298e-06, "loss": 37.7161, "step": 17429 }, { "epoch": 46.03499504787059, "grad_norm": 399.47613525390625, "learning_rate": 7.504880568075245e-06, "loss": 37.0208, "step": 17430 }, { "epoch": 46.03763618355893, "grad_norm": 782.96142578125, "learning_rate": 7.494724628237942e-06, "loss": 36.8884, "step": 17431 }, { "epoch": 46.040277319247274, "grad_norm": 799.498046875, "learning_rate": 7.4845754602180235e-06, "loss": 34.7402, "step": 17432 }, { "epoch": 46.042918454935624, "grad_norm": 1045.592529296875, "learning_rate": 7.474433064298846e-06, "loss": 35.6684, "step": 17433 }, { "epoch": 46.04555959062397, "grad_norm": 839.62060546875, "learning_rate": 7.464297440763657e-06, "loss": 35.7152, "step": 17434 }, { "epoch": 46.04820072631232, "grad_norm": 770.8656005859375, "learning_rate": 7.454168589895504e-06, "loss": 35.5141, "step": 17435 }, { "epoch": 46.05084186200066, "grad_norm": 797.2168579101562, "learning_rate": 7.44404651197722e-06, "loss": 34.8602, "step": 17436 }, { "epoch": 46.05348299768901, "grad_norm": 924.7506713867188, "learning_rate": 7.433931207291467e-06, "loss": 34.789, "step": 17437 }, { "epoch": 46.05612413337735, "grad_norm": 1661.683349609375, "learning_rate": 7.423822676120656e-06, "loss": 35.3836, "step": 17438 }, { "epoch": 46.058765269065695, "grad_norm": 2946.5234375, "learning_rate": 7.413720918747119e-06, "loss": 35.8192, "step": 17439 }, { "epoch": 46.061406404754045, "grad_norm": 2266.333984375, "learning_rate": 7.403625935452934e-06, "loss": 26.4059, "step": 17440 }, { "epoch": 46.06404754044239, "grad_norm": 6550.986328125, "learning_rate": 7.393537726519961e-06, "loss": 10.2757, "step": 17441 }, { "epoch": 46.06668867613074, "grad_norm": 1608.2059326171875, "learning_rate": 7.383456292229946e-06, "loss": 11.6211, "step": 17442 }, { "epoch": 46.06932981181908, "grad_norm": 5095.80419921875, "learning_rate": 7.373381632864384e-06, "loss": 16.2996, "step": 17443 }, { "epoch": 46.07197094750743, "grad_norm": 1915.7535400390625, "learning_rate": 7.363313748704636e-06, "loss": 6.65, "step": 17444 }, { "epoch": 46.074612083195774, "grad_norm": 1533.07958984375, "learning_rate": 7.35325264003181e-06, "loss": 12.9794, "step": 17445 }, { "epoch": 46.07725321888412, "grad_norm": 14816.1142578125, "learning_rate": 7.343198307126847e-06, "loss": 7.7102, "step": 17446 }, { "epoch": 46.07989435457247, "grad_norm": 1585.807373046875, "learning_rate": 7.333150750270551e-06, "loss": 12.9863, "step": 17447 }, { "epoch": 46.08253549026081, "grad_norm": 1229.4461669921875, "learning_rate": 7.323109969743447e-06, "loss": 12.6725, "step": 17448 }, { "epoch": 46.08517662594916, "grad_norm": 1297.0927734375, "learning_rate": 7.31307596582595e-06, "loss": 23.6311, "step": 17449 }, { "epoch": 46.0878177616375, "grad_norm": 1524.033203125, "learning_rate": 7.303048738798251e-06, "loss": 34.4089, "step": 17450 }, { "epoch": 46.09045889732585, "grad_norm": 805.6704711914062, "learning_rate": 7.293028288940351e-06, "loss": 33.6908, "step": 17451 }, { "epoch": 46.093100033014196, "grad_norm": 1362.6119384765625, "learning_rate": 7.283014616532024e-06, "loss": 35.4395, "step": 17452 }, { "epoch": 46.095741168702546, "grad_norm": 1493.9696044921875, "learning_rate": 7.273007721852965e-06, "loss": 33.2218, "step": 17453 }, { "epoch": 46.09838230439089, "grad_norm": 2695.0576171875, "learning_rate": 7.2630076051825865e-06, "loss": 35.2383, "step": 17454 }, { "epoch": 46.10102344007923, "grad_norm": 2043.401123046875, "learning_rate": 7.25301426680014e-06, "loss": 37.1692, "step": 17455 }, { "epoch": 46.10366457576758, "grad_norm": 1124.8385009765625, "learning_rate": 7.243027706984651e-06, "loss": 33.3121, "step": 17456 }, { "epoch": 46.106305711455924, "grad_norm": 1292.9697265625, "learning_rate": 7.233047926015035e-06, "loss": 34.5706, "step": 17457 }, { "epoch": 46.108946847144274, "grad_norm": 783.3118896484375, "learning_rate": 7.223074924169931e-06, "loss": 34.8141, "step": 17458 }, { "epoch": 46.11158798283262, "grad_norm": 961.5859985351562, "learning_rate": 7.213108701727811e-06, "loss": 36.1979, "step": 17459 }, { "epoch": 46.11422911852097, "grad_norm": 542.6450805664062, "learning_rate": 7.203149258967034e-06, "loss": 34.9182, "step": 17460 }, { "epoch": 46.11687025420931, "grad_norm": 1419.961669921875, "learning_rate": 7.1931965961657135e-06, "loss": 35.24, "step": 17461 }, { "epoch": 46.11951138989765, "grad_norm": 865.4012451171875, "learning_rate": 7.18325071360168e-06, "loss": 35.8317, "step": 17462 }, { "epoch": 46.122152525586, "grad_norm": 1836.087158203125, "learning_rate": 7.173311611552824e-06, "loss": 33.701, "step": 17463 }, { "epoch": 46.124793661274346, "grad_norm": 690.8026733398438, "learning_rate": 7.163379290296534e-06, "loss": 35.1917, "step": 17464 }, { "epoch": 46.127434796962696, "grad_norm": 1869.7655029296875, "learning_rate": 7.153453750110256e-06, "loss": 37.1127, "step": 17465 }, { "epoch": 46.13007593265104, "grad_norm": 2472.07666015625, "learning_rate": 7.1435349912711e-06, "loss": 36.8896, "step": 17466 }, { "epoch": 46.13271706833939, "grad_norm": 1768.79931640625, "learning_rate": 7.133623014056095e-06, "loss": 40.8539, "step": 17467 }, { "epoch": 46.13535820402773, "grad_norm": 1295.32958984375, "learning_rate": 7.1237178187419646e-06, "loss": 37.9493, "step": 17468 }, { "epoch": 46.137999339716075, "grad_norm": 712.8828735351562, "learning_rate": 7.1138194056053486e-06, "loss": 38.3861, "step": 17469 }, { "epoch": 46.140640475404425, "grad_norm": 952.5288696289062, "learning_rate": 7.103927774922664e-06, "loss": 38.3046, "step": 17470 }, { "epoch": 46.14328161109277, "grad_norm": 1086.32470703125, "learning_rate": 7.094042926970107e-06, "loss": 41.3517, "step": 17471 }, { "epoch": 46.14592274678112, "grad_norm": 547.7405395507812, "learning_rate": 7.084164862023734e-06, "loss": 42.31, "step": 17472 }, { "epoch": 46.14856388246946, "grad_norm": 1208.898681640625, "learning_rate": 7.074293580359326e-06, "loss": 40.5787, "step": 17473 }, { "epoch": 46.15120501815781, "grad_norm": 661.8211059570312, "learning_rate": 7.064429082252605e-06, "loss": 39.9772, "step": 17474 }, { "epoch": 46.15384615384615, "grad_norm": 907.4276733398438, "learning_rate": 7.054571367978963e-06, "loss": 41.4714, "step": 17475 }, { "epoch": 46.1564872895345, "grad_norm": 1266.702392578125, "learning_rate": 7.0447204378136785e-06, "loss": 38.3548, "step": 17476 }, { "epoch": 46.159128425222846, "grad_norm": 828.955810546875, "learning_rate": 7.034876292031894e-06, "loss": 38.4966, "step": 17477 }, { "epoch": 46.16176956091119, "grad_norm": 1485.9820556640625, "learning_rate": 7.0250389309084454e-06, "loss": 38.3144, "step": 17478 }, { "epoch": 46.16441069659954, "grad_norm": 1058.852294921875, "learning_rate": 7.015208354718083e-06, "loss": 36.23, "step": 17479 }, { "epoch": 46.16705183228788, "grad_norm": 816.21435546875, "learning_rate": 7.005384563735285e-06, "loss": 36.4287, "step": 17480 }, { "epoch": 46.16969296797623, "grad_norm": 685.7193603515625, "learning_rate": 6.995567558234356e-06, "loss": 35.293, "step": 17481 }, { "epoch": 46.172334103664575, "grad_norm": 967.8369750976562, "learning_rate": 6.985757338489468e-06, "loss": 36.2665, "step": 17482 }, { "epoch": 46.174975239352925, "grad_norm": 685.36474609375, "learning_rate": 6.975953904774513e-06, "loss": 34.7342, "step": 17483 }, { "epoch": 46.17761637504127, "grad_norm": 835.138671875, "learning_rate": 6.966157257363326e-06, "loss": 36.0318, "step": 17484 }, { "epoch": 46.18025751072961, "grad_norm": 632.0814208984375, "learning_rate": 6.9563673965294395e-06, "loss": 34.5751, "step": 17485 }, { "epoch": 46.18289864641796, "grad_norm": 1133.5980224609375, "learning_rate": 6.946584322546218e-06, "loss": 35.0638, "step": 17486 }, { "epoch": 46.185539782106304, "grad_norm": 1070.2403564453125, "learning_rate": 6.936808035686804e-06, "loss": 35.608, "step": 17487 }, { "epoch": 46.188180917794654, "grad_norm": 1984.9591064453125, "learning_rate": 6.927038536224284e-06, "loss": 34.3449, "step": 17488 }, { "epoch": 46.190822053483, "grad_norm": 4101.1298828125, "learning_rate": 6.917275824431441e-06, "loss": 27.3929, "step": 17489 }, { "epoch": 46.19346318917135, "grad_norm": 1687.711669921875, "learning_rate": 6.907519900580861e-06, "loss": 12.7659, "step": 17490 }, { "epoch": 46.19610432485969, "grad_norm": 7308.84716796875, "learning_rate": 6.897770764944994e-06, "loss": 9.382, "step": 17491 }, { "epoch": 46.19874546054803, "grad_norm": 2391.458984375, "learning_rate": 6.888028417796066e-06, "loss": 8.3725, "step": 17492 }, { "epoch": 46.20138659623638, "grad_norm": 767.7617797851562, "learning_rate": 6.87829285940611e-06, "loss": 13.4573, "step": 17493 }, { "epoch": 46.204027731924725, "grad_norm": 3620.6005859375, "learning_rate": 6.868564090047019e-06, "loss": 9.4028, "step": 17494 }, { "epoch": 46.206668867613075, "grad_norm": 6427.69677734375, "learning_rate": 6.858842109990465e-06, "loss": 7.8412, "step": 17495 }, { "epoch": 46.20931000330142, "grad_norm": 548.6278076171875, "learning_rate": 6.849126919507897e-06, "loss": 10.7902, "step": 17496 }, { "epoch": 46.21195113898977, "grad_norm": 1920.1348876953125, "learning_rate": 6.839418518870627e-06, "loss": 9.5327, "step": 17497 }, { "epoch": 46.21459227467811, "grad_norm": 442.3101501464844, "learning_rate": 6.8297169083497974e-06, "loss": 9.157, "step": 17498 }, { "epoch": 46.21723341036646, "grad_norm": 1431.662841796875, "learning_rate": 6.820022088216249e-06, "loss": 11.9247, "step": 17499 }, { "epoch": 46.219874546054804, "grad_norm": 775.4830932617188, "learning_rate": 6.810334058740736e-06, "loss": 34.8055, "step": 17500 }, { "epoch": 46.22251568174315, "grad_norm": 2947.34765625, "learning_rate": 6.800652820193764e-06, "loss": 35.3042, "step": 17501 }, { "epoch": 46.2251568174315, "grad_norm": 600.2713012695312, "learning_rate": 6.790978372845702e-06, "loss": 34.5689, "step": 17502 }, { "epoch": 46.22779795311984, "grad_norm": 833.1574096679688, "learning_rate": 6.781310716966721e-06, "loss": 36.263, "step": 17503 }, { "epoch": 46.23043908880819, "grad_norm": 1181.0091552734375, "learning_rate": 6.7716498528267445e-06, "loss": 33.6316, "step": 17504 }, { "epoch": 46.23308022449653, "grad_norm": 587.4283447265625, "learning_rate": 6.7619957806955845e-06, "loss": 35.0176, "step": 17505 }, { "epoch": 46.23572136018488, "grad_norm": 579.6844482421875, "learning_rate": 6.752348500842803e-06, "loss": 34.0629, "step": 17506 }, { "epoch": 46.238362495873226, "grad_norm": 1183.666259765625, "learning_rate": 6.7427080135377965e-06, "loss": 34.6539, "step": 17507 }, { "epoch": 46.24100363156157, "grad_norm": 763.8235473632812, "learning_rate": 6.7330743190497645e-06, "loss": 34.3893, "step": 17508 }, { "epoch": 46.24364476724992, "grad_norm": 1649.9306640625, "learning_rate": 6.723447417647743e-06, "loss": 34.4711, "step": 17509 }, { "epoch": 46.24628590293826, "grad_norm": 2103.089599609375, "learning_rate": 6.713827309600545e-06, "loss": 35.7145, "step": 17510 }, { "epoch": 46.24892703862661, "grad_norm": 1031.6160888671875, "learning_rate": 6.704213995176789e-06, "loss": 34.2229, "step": 17511 }, { "epoch": 46.251568174314954, "grad_norm": 686.8433837890625, "learning_rate": 6.694607474644954e-06, "loss": 34.4122, "step": 17512 }, { "epoch": 46.254209310003304, "grad_norm": 1130.4554443359375, "learning_rate": 6.68500774827327e-06, "loss": 33.8767, "step": 17513 }, { "epoch": 46.25685044569165, "grad_norm": 2179.89794921875, "learning_rate": 6.6754148163298e-06, "loss": 34.5649, "step": 17514 }, { "epoch": 46.25949158137999, "grad_norm": 653.8251342773438, "learning_rate": 6.66582867908247e-06, "loss": 35.2803, "step": 17515 }, { "epoch": 46.26213271706834, "grad_norm": 6426.1533203125, "learning_rate": 6.656249336798925e-06, "loss": 35.6459, "step": 17516 }, { "epoch": 46.26477385275668, "grad_norm": 756.5089111328125, "learning_rate": 6.6466767897466475e-06, "loss": 39.1022, "step": 17517 }, { "epoch": 46.26741498844503, "grad_norm": 888.8632202148438, "learning_rate": 6.63711103819295e-06, "loss": 37.7736, "step": 17518 }, { "epoch": 46.270056124133376, "grad_norm": 1371.202392578125, "learning_rate": 6.627552082405008e-06, "loss": 38.6179, "step": 17519 }, { "epoch": 46.272697259821726, "grad_norm": 1372.856689453125, "learning_rate": 6.617999922649692e-06, "loss": 37.8465, "step": 17520 }, { "epoch": 46.27533839551007, "grad_norm": 895.9163818359375, "learning_rate": 6.608454559193761e-06, "loss": 39.4492, "step": 17521 }, { "epoch": 46.27797953119842, "grad_norm": 1703.7969970703125, "learning_rate": 6.59891599230375e-06, "loss": 41.8713, "step": 17522 }, { "epoch": 46.28062066688676, "grad_norm": 1059.516357421875, "learning_rate": 6.5893842222460306e-06, "loss": 41.2317, "step": 17523 }, { "epoch": 46.283261802575105, "grad_norm": 808.3173217773438, "learning_rate": 6.579859249286779e-06, "loss": 41.8916, "step": 17524 }, { "epoch": 46.285902938263455, "grad_norm": 533.5791625976562, "learning_rate": 6.570341073691977e-06, "loss": 40.6541, "step": 17525 }, { "epoch": 46.2885440739518, "grad_norm": 1932.6617431640625, "learning_rate": 6.5608296957273825e-06, "loss": 38.9777, "step": 17526 }, { "epoch": 46.29118520964015, "grad_norm": 1439.05322265625, "learning_rate": 6.551325115658646e-06, "loss": 37.7876, "step": 17527 }, { "epoch": 46.29382634532849, "grad_norm": 1146.93310546875, "learning_rate": 6.54182733375111e-06, "loss": 37.8614, "step": 17528 }, { "epoch": 46.29646748101684, "grad_norm": 1305.48095703125, "learning_rate": 6.532336350270035e-06, "loss": 36.353, "step": 17529 }, { "epoch": 46.29910861670518, "grad_norm": 1165.76416015625, "learning_rate": 6.522852165480458e-06, "loss": 37.514, "step": 17530 }, { "epoch": 46.301749752393526, "grad_norm": 615.58544921875, "learning_rate": 6.513374779647196e-06, "loss": 36.4849, "step": 17531 }, { "epoch": 46.304390888081876, "grad_norm": 847.6339721679688, "learning_rate": 6.5039041930348986e-06, "loss": 36.0114, "step": 17532 }, { "epoch": 46.30703202377022, "grad_norm": 811.1950073242188, "learning_rate": 6.494440405908103e-06, "loss": 35.7931, "step": 17533 }, { "epoch": 46.30967315945857, "grad_norm": 1788.2845458984375, "learning_rate": 6.48498341853096e-06, "loss": 35.1957, "step": 17534 }, { "epoch": 46.31231429514691, "grad_norm": 634.545166015625, "learning_rate": 6.4755332311676185e-06, "loss": 35.0701, "step": 17535 }, { "epoch": 46.31495543083526, "grad_norm": 1007.7402954101562, "learning_rate": 6.466089844081952e-06, "loss": 35.0346, "step": 17536 }, { "epoch": 46.317596566523605, "grad_norm": 1269.8203125, "learning_rate": 6.456653257537665e-06, "loss": 34.0596, "step": 17537 }, { "epoch": 46.32023770221195, "grad_norm": 730.8482055664062, "learning_rate": 6.447223471798297e-06, "loss": 34.7293, "step": 17538 }, { "epoch": 46.3228788379003, "grad_norm": 22082.615234375, "learning_rate": 6.43780048712711e-06, "loss": 45.2178, "step": 17539 }, { "epoch": 46.32551997358864, "grad_norm": 4944.92529296875, "learning_rate": 6.428384303787282e-06, "loss": 18.8121, "step": 17540 }, { "epoch": 46.32816110927699, "grad_norm": 923.1083984375, "learning_rate": 6.418974922041743e-06, "loss": 12.5177, "step": 17541 }, { "epoch": 46.330802244965334, "grad_norm": 1004.3392333984375, "learning_rate": 6.409572342153252e-06, "loss": 13.081, "step": 17542 }, { "epoch": 46.333443380653684, "grad_norm": 2425.86181640625, "learning_rate": 6.4001765643843525e-06, "loss": 13.123, "step": 17543 }, { "epoch": 46.33608451634203, "grad_norm": 363.8482971191406, "learning_rate": 6.390787588997415e-06, "loss": 12.7311, "step": 17544 }, { "epoch": 46.33872565203038, "grad_norm": 974.0702514648438, "learning_rate": 6.381405416254648e-06, "loss": 13.0573, "step": 17545 }, { "epoch": 46.34136678771872, "grad_norm": 1173.095458984375, "learning_rate": 6.372030046417981e-06, "loss": 11.1445, "step": 17546 }, { "epoch": 46.34400792340706, "grad_norm": 1201.125, "learning_rate": 6.362661479749316e-06, "loss": 9.3241, "step": 17547 }, { "epoch": 46.34664905909541, "grad_norm": 4716.69189453125, "learning_rate": 6.353299716510192e-06, "loss": 14.6284, "step": 17548 }, { "epoch": 46.349290194783755, "grad_norm": 537.4781494140625, "learning_rate": 6.343944756962011e-06, "loss": 24.9329, "step": 17549 }, { "epoch": 46.351931330472105, "grad_norm": 903.3533325195312, "learning_rate": 6.334596601366094e-06, "loss": 35.958, "step": 17550 }, { "epoch": 46.35457246616045, "grad_norm": 1593.3165283203125, "learning_rate": 6.325255249983425e-06, "loss": 33.6995, "step": 17551 }, { "epoch": 46.3572136018488, "grad_norm": 1992.7691650390625, "learning_rate": 6.315920703074851e-06, "loss": 34.4659, "step": 17552 }, { "epoch": 46.35985473753714, "grad_norm": 2855.06201171875, "learning_rate": 6.306592960901025e-06, "loss": 33.7113, "step": 17553 }, { "epoch": 46.362495873225484, "grad_norm": 2731.48779296875, "learning_rate": 6.297272023722433e-06, "loss": 34.7158, "step": 17554 }, { "epoch": 46.365137008913834, "grad_norm": 680.6380004882812, "learning_rate": 6.287957891799395e-06, "loss": 34.5858, "step": 17555 }, { "epoch": 46.36777814460218, "grad_norm": 1093.8570556640625, "learning_rate": 6.2786505653919245e-06, "loss": 34.1116, "step": 17556 }, { "epoch": 46.37041928029053, "grad_norm": 1259.241943359375, "learning_rate": 6.269350044760008e-06, "loss": 33.5368, "step": 17557 }, { "epoch": 46.37306041597887, "grad_norm": 934.2600708007812, "learning_rate": 6.2600563301633e-06, "loss": 35.3242, "step": 17558 }, { "epoch": 46.37570155166722, "grad_norm": 1014.5399780273438, "learning_rate": 6.250769421861341e-06, "loss": 35.1537, "step": 17559 }, { "epoch": 46.37834268735556, "grad_norm": 883.926513671875, "learning_rate": 6.241489320113453e-06, "loss": 33.5537, "step": 17560 }, { "epoch": 46.380983823043906, "grad_norm": 2319.927978515625, "learning_rate": 6.2322160251788165e-06, "loss": 34.2195, "step": 17561 }, { "epoch": 46.383624958732256, "grad_norm": 952.8134765625, "learning_rate": 6.222949537316308e-06, "loss": 33.9706, "step": 17562 }, { "epoch": 46.3862660944206, "grad_norm": 2619.30078125, "learning_rate": 6.213689856784749e-06, "loss": 33.3406, "step": 17563 }, { "epoch": 46.38890723010895, "grad_norm": 2040.860595703125, "learning_rate": 6.204436983842682e-06, "loss": 32.614, "step": 17564 }, { "epoch": 46.39154836579729, "grad_norm": 1706.2305908203125, "learning_rate": 6.195190918748511e-06, "loss": 35.5941, "step": 17565 }, { "epoch": 46.39418950148564, "grad_norm": 2727.958740234375, "learning_rate": 6.185951661760419e-06, "loss": 37.9337, "step": 17566 }, { "epoch": 46.396830637173984, "grad_norm": 554.2254638671875, "learning_rate": 6.176719213136367e-06, "loss": 40.6363, "step": 17567 }, { "epoch": 46.399471772862334, "grad_norm": 755.4703369140625, "learning_rate": 6.167493573134258e-06, "loss": 37.8609, "step": 17568 }, { "epoch": 46.40211290855068, "grad_norm": 3528.307373046875, "learning_rate": 6.158274742011638e-06, "loss": 37.7173, "step": 17569 }, { "epoch": 46.40475404423902, "grad_norm": 953.4661254882812, "learning_rate": 6.149062720025938e-06, "loss": 39.015, "step": 17570 }, { "epoch": 46.40739517992737, "grad_norm": 1401.3714599609375, "learning_rate": 6.139857507434427e-06, "loss": 41.3622, "step": 17571 }, { "epoch": 46.41003631561571, "grad_norm": 514.60205078125, "learning_rate": 6.130659104494146e-06, "loss": 41.0881, "step": 17572 }, { "epoch": 46.41267745130406, "grad_norm": 986.1064453125, "learning_rate": 6.121467511461948e-06, "loss": 39.6065, "step": 17573 }, { "epoch": 46.415318586992406, "grad_norm": 679.223388671875, "learning_rate": 6.1122827285944875e-06, "loss": 41.9271, "step": 17574 }, { "epoch": 46.417959722680756, "grad_norm": 769.97705078125, "learning_rate": 6.103104756148281e-06, "loss": 40.7381, "step": 17575 }, { "epoch": 46.4206008583691, "grad_norm": 940.7098999023438, "learning_rate": 6.093933594379625e-06, "loss": 42.3305, "step": 17576 }, { "epoch": 46.42324199405744, "grad_norm": 642.2318115234375, "learning_rate": 6.0847692435445635e-06, "loss": 39.4821, "step": 17577 }, { "epoch": 46.42588312974579, "grad_norm": 1225.4869384765625, "learning_rate": 6.075611703899059e-06, "loss": 37.8441, "step": 17578 }, { "epoch": 46.428524265434135, "grad_norm": 898.7999877929688, "learning_rate": 6.066460975698795e-06, "loss": 36.3479, "step": 17579 }, { "epoch": 46.431165401122485, "grad_norm": 5388.1181640625, "learning_rate": 6.0573170591993185e-06, "loss": 36.508, "step": 17580 }, { "epoch": 46.43380653681083, "grad_norm": 682.2760009765625, "learning_rate": 6.048179954655952e-06, "loss": 35.0949, "step": 17581 }, { "epoch": 46.43644767249918, "grad_norm": 594.4054565429688, "learning_rate": 6.039049662323881e-06, "loss": 35.6159, "step": 17582 }, { "epoch": 46.43908880818752, "grad_norm": 777.7992553710938, "learning_rate": 6.029926182458012e-06, "loss": 34.9893, "step": 17583 }, { "epoch": 46.44172994387586, "grad_norm": 1043.4769287109375, "learning_rate": 6.020809515313141e-06, "loss": 34.2973, "step": 17584 }, { "epoch": 46.44437107956421, "grad_norm": 1146.505859375, "learning_rate": 6.011699661143871e-06, "loss": 33.6668, "step": 17585 }, { "epoch": 46.447012215252556, "grad_norm": 668.2792358398438, "learning_rate": 6.002596620204553e-06, "loss": 35.2965, "step": 17586 }, { "epoch": 46.449653350940906, "grad_norm": 748.508056640625, "learning_rate": 5.9935003927493735e-06, "loss": 34.1978, "step": 17587 }, { "epoch": 46.45229448662925, "grad_norm": 1120.03564453125, "learning_rate": 5.984410979032351e-06, "loss": 36.2558, "step": 17588 }, { "epoch": 46.4549356223176, "grad_norm": 3814.84375, "learning_rate": 5.975328379307337e-06, "loss": 34.9722, "step": 17589 }, { "epoch": 46.45757675800594, "grad_norm": 1295.54833984375, "learning_rate": 5.966252593827909e-06, "loss": 34.1964, "step": 17590 }, { "epoch": 46.46021789369429, "grad_norm": 3490.51318359375, "learning_rate": 5.957183622847529e-06, "loss": 16.0944, "step": 17591 }, { "epoch": 46.462859029382635, "grad_norm": 822.6759643554688, "learning_rate": 5.948121466619438e-06, "loss": 11.047, "step": 17592 }, { "epoch": 46.46550016507098, "grad_norm": 2528.219970703125, "learning_rate": 5.939066125396714e-06, "loss": 17.3538, "step": 17593 }, { "epoch": 46.46814130075933, "grad_norm": 1758.5067138671875, "learning_rate": 5.930017599432181e-06, "loss": 12.2119, "step": 17594 }, { "epoch": 46.47078243644767, "grad_norm": 1263.7113037109375, "learning_rate": 5.920975888978525e-06, "loss": 13.0629, "step": 17595 }, { "epoch": 46.47342357213602, "grad_norm": 1850.607666015625, "learning_rate": 5.91194099428824e-06, "loss": 13.4196, "step": 17596 }, { "epoch": 46.476064707824364, "grad_norm": 5018.57568359375, "learning_rate": 5.902912915613623e-06, "loss": 12.6918, "step": 17597 }, { "epoch": 46.478705843512714, "grad_norm": 4527.59912109375, "learning_rate": 5.893891653206751e-06, "loss": 9.2887, "step": 17598 }, { "epoch": 46.48134697920106, "grad_norm": 2356.808837890625, "learning_rate": 5.884877207319589e-06, "loss": 33.4451, "step": 17599 }, { "epoch": 46.4839881148894, "grad_norm": 1304.383544921875, "learning_rate": 5.875869578203824e-06, "loss": 33.453, "step": 17600 }, { "epoch": 46.4839881148894, "eval_loss": 3.771113395690918, "eval_runtime": 2.1984, "eval_samples_per_second": 225.163, "eval_steps_per_second": 28.202, "step": 17600 }, { "epoch": 46.48662925057775, "grad_norm": 1935.4638671875, "learning_rate": 5.8668687661110055e-06, "loss": 35.2141, "step": 17601 }, { "epoch": 46.48927038626609, "grad_norm": 713.8873901367188, "learning_rate": 5.857874771292432e-06, "loss": 34.5847, "step": 17602 }, { "epoch": 46.49191152195444, "grad_norm": 569.5056762695312, "learning_rate": 5.8488875939993195e-06, "loss": 36.2744, "step": 17603 }, { "epoch": 46.494552657642785, "grad_norm": 704.9664916992188, "learning_rate": 5.839907234482605e-06, "loss": 33.4174, "step": 17604 }, { "epoch": 46.497193793331135, "grad_norm": 1231.0985107421875, "learning_rate": 5.8309336929930066e-06, "loss": 34.0889, "step": 17605 }, { "epoch": 46.49983492901948, "grad_norm": 1265.168212890625, "learning_rate": 5.821966969781184e-06, "loss": 35.0379, "step": 17606 }, { "epoch": 46.50247606470782, "grad_norm": 796.540283203125, "learning_rate": 5.813007065097492e-06, "loss": 33.7837, "step": 17607 }, { "epoch": 46.50511720039617, "grad_norm": 962.903564453125, "learning_rate": 5.804053979192148e-06, "loss": 34.045, "step": 17608 }, { "epoch": 46.507758336084514, "grad_norm": 1953.2691650390625, "learning_rate": 5.795107712315117e-06, "loss": 33.4925, "step": 17609 }, { "epoch": 46.510399471772864, "grad_norm": 1276.922119140625, "learning_rate": 5.786168264716285e-06, "loss": 33.5695, "step": 17610 }, { "epoch": 46.51304060746121, "grad_norm": 1127.1007080078125, "learning_rate": 5.777235636645229e-06, "loss": 35.1382, "step": 17611 }, { "epoch": 46.51568174314956, "grad_norm": 1301.5260009765625, "learning_rate": 5.7683098283513865e-06, "loss": 33.4947, "step": 17612 }, { "epoch": 46.5183228788379, "grad_norm": 811.2383422851562, "learning_rate": 5.759390840084061e-06, "loss": 33.9159, "step": 17613 }, { "epoch": 46.52096401452625, "grad_norm": 1433.56298828125, "learning_rate": 5.750478672092246e-06, "loss": 35.7771, "step": 17614 }, { "epoch": 46.52360515021459, "grad_norm": 1131.021728515625, "learning_rate": 5.741573324624827e-06, "loss": 34.792, "step": 17615 }, { "epoch": 46.526246285902936, "grad_norm": 1089.76318359375, "learning_rate": 5.732674797930493e-06, "loss": 37.194, "step": 17616 }, { "epoch": 46.528887421591286, "grad_norm": 606.5958862304688, "learning_rate": 5.72378309225774e-06, "loss": 39.3811, "step": 17617 }, { "epoch": 46.53152855727963, "grad_norm": 844.4952392578125, "learning_rate": 5.714898207854841e-06, "loss": 38.1207, "step": 17618 }, { "epoch": 46.53416969296798, "grad_norm": 1596.4765625, "learning_rate": 5.706020144969876e-06, "loss": 37.0565, "step": 17619 }, { "epoch": 46.53681082865632, "grad_norm": 1760.17529296875, "learning_rate": 5.697148903850868e-06, "loss": 38.5461, "step": 17620 }, { "epoch": 46.53945196434467, "grad_norm": 1745.1739501953125, "learning_rate": 5.688284484745399e-06, "loss": 38.4399, "step": 17621 }, { "epoch": 46.542093100033014, "grad_norm": 1169.0186767578125, "learning_rate": 5.679426887901102e-06, "loss": 40.8503, "step": 17622 }, { "epoch": 46.54473423572136, "grad_norm": 621.4057006835938, "learning_rate": 5.670576113565251e-06, "loss": 43.4406, "step": 17623 }, { "epoch": 46.54737537140971, "grad_norm": 822.3005981445312, "learning_rate": 5.661732161985067e-06, "loss": 42.242, "step": 17624 }, { "epoch": 46.55001650709805, "grad_norm": 1727.8948974609375, "learning_rate": 5.65289503340749e-06, "loss": 40.5051, "step": 17625 }, { "epoch": 46.5526576427864, "grad_norm": 1526.859619140625, "learning_rate": 5.64406472807924e-06, "loss": 38.407, "step": 17626 }, { "epoch": 46.55529877847474, "grad_norm": 729.223388671875, "learning_rate": 5.63524124624698e-06, "loss": 39.1325, "step": 17627 }, { "epoch": 46.55793991416309, "grad_norm": 1253.1866455078125, "learning_rate": 5.626424588157042e-06, "loss": 39.0609, "step": 17628 }, { "epoch": 46.560581049851436, "grad_norm": 889.3016357421875, "learning_rate": 5.617614754055644e-06, "loss": 37.6627, "step": 17629 }, { "epoch": 46.56322218553978, "grad_norm": 1448.466796875, "learning_rate": 5.608811744188813e-06, "loss": 35.8251, "step": 17630 }, { "epoch": 46.56586332122813, "grad_norm": 704.1375122070312, "learning_rate": 5.600015558802352e-06, "loss": 34.6079, "step": 17631 }, { "epoch": 46.56850445691647, "grad_norm": 522.0245971679688, "learning_rate": 5.591226198141869e-06, "loss": 36.1841, "step": 17632 }, { "epoch": 46.57114559260482, "grad_norm": 734.102294921875, "learning_rate": 5.582443662452807e-06, "loss": 35.3733, "step": 17633 }, { "epoch": 46.573786728293165, "grad_norm": 797.8488159179688, "learning_rate": 5.573667951980443e-06, "loss": 35.528, "step": 17634 }, { "epoch": 46.576427863981515, "grad_norm": 816.7374877929688, "learning_rate": 5.56489906696983e-06, "loss": 34.7689, "step": 17635 }, { "epoch": 46.57906899966986, "grad_norm": 1411.0595703125, "learning_rate": 5.556137007665829e-06, "loss": 35.0179, "step": 17636 }, { "epoch": 46.58171013535821, "grad_norm": 983.4197998046875, "learning_rate": 5.547381774313076e-06, "loss": 34.8243, "step": 17637 }, { "epoch": 46.58435127104655, "grad_norm": 2921.10498046875, "learning_rate": 5.538633367156126e-06, "loss": 36.1939, "step": 17638 }, { "epoch": 46.58699240673489, "grad_norm": 1305.655517578125, "learning_rate": 5.529891786439201e-06, "loss": 33.9887, "step": 17639 }, { "epoch": 46.58963354242324, "grad_norm": 635.4917602539062, "learning_rate": 5.5211570324064375e-06, "loss": 9.814, "step": 17640 }, { "epoch": 46.592274678111586, "grad_norm": 3487.08740234375, "learning_rate": 5.512429105301781e-06, "loss": 10.1497, "step": 17641 }, { "epoch": 46.594915813799936, "grad_norm": 707.8228149414062, "learning_rate": 5.503708005368896e-06, "loss": 10.3321, "step": 17642 }, { "epoch": 46.59755694948828, "grad_norm": 1153.2264404296875, "learning_rate": 5.494993732851339e-06, "loss": 12.3317, "step": 17643 }, { "epoch": 46.60019808517663, "grad_norm": 746.0429077148438, "learning_rate": 5.4862862879924425e-06, "loss": 9.924, "step": 17644 }, { "epoch": 46.60283922086497, "grad_norm": 902.4255981445312, "learning_rate": 5.477585671035401e-06, "loss": 10.1405, "step": 17645 }, { "epoch": 46.605480356553315, "grad_norm": 1814.7535400390625, "learning_rate": 5.468891882223132e-06, "loss": 16.4808, "step": 17646 }, { "epoch": 46.608121492241665, "grad_norm": 2413.889404296875, "learning_rate": 5.460204921798384e-06, "loss": 14.8102, "step": 17647 }, { "epoch": 46.61076262793001, "grad_norm": 1129.9561767578125, "learning_rate": 5.451524790003798e-06, "loss": 8.2688, "step": 17648 }, { "epoch": 46.61340376361836, "grad_norm": 828.4638061523438, "learning_rate": 5.442851487081707e-06, "loss": 15.1056, "step": 17649 }, { "epoch": 46.6160448993067, "grad_norm": 2893.34130859375, "learning_rate": 5.434185013274335e-06, "loss": 35.7021, "step": 17650 }, { "epoch": 46.61868603499505, "grad_norm": 1242.5308837890625, "learning_rate": 5.425525368823653e-06, "loss": 33.8411, "step": 17651 }, { "epoch": 46.621327170683394, "grad_norm": 1226.68701171875, "learning_rate": 5.416872553971524e-06, "loss": 34.2926, "step": 17652 }, { "epoch": 46.62396830637174, "grad_norm": 1442.8507080078125, "learning_rate": 5.408226568959562e-06, "loss": 34.4115, "step": 17653 }, { "epoch": 46.62660944206009, "grad_norm": 1641.160400390625, "learning_rate": 5.399587414029183e-06, "loss": 33.9549, "step": 17654 }, { "epoch": 46.62925057774843, "grad_norm": 1917.2000732421875, "learning_rate": 5.390955089421667e-06, "loss": 33.7618, "step": 17655 }, { "epoch": 46.63189171343678, "grad_norm": 612.9322509765625, "learning_rate": 5.382329595378016e-06, "loss": 34.7086, "step": 17656 }, { "epoch": 46.63453284912512, "grad_norm": 1122.1978759765625, "learning_rate": 5.373710932139119e-06, "loss": 33.4439, "step": 17657 }, { "epoch": 46.63717398481347, "grad_norm": 1102.4713134765625, "learning_rate": 5.36509909994562e-06, "loss": 34.4183, "step": 17658 }, { "epoch": 46.639815120501815, "grad_norm": 855.5404052734375, "learning_rate": 5.356494099038045e-06, "loss": 35.0286, "step": 17659 }, { "epoch": 46.642456256190165, "grad_norm": 930.43408203125, "learning_rate": 5.347895929656649e-06, "loss": 35.651, "step": 17660 }, { "epoch": 46.64509739187851, "grad_norm": 1810.2384033203125, "learning_rate": 5.339304592041544e-06, "loss": 33.6005, "step": 17661 }, { "epoch": 46.64773852756685, "grad_norm": 1459.42724609375, "learning_rate": 5.33072008643265e-06, "loss": 34.4693, "step": 17662 }, { "epoch": 46.6503796632552, "grad_norm": 659.0938720703125, "learning_rate": 5.322142413069664e-06, "loss": 33.1097, "step": 17663 }, { "epoch": 46.653020798943544, "grad_norm": 2833.843505859375, "learning_rate": 5.3135715721921164e-06, "loss": 33.6481, "step": 17664 }, { "epoch": 46.655661934631894, "grad_norm": 2634.688232421875, "learning_rate": 5.305007564039344e-06, "loss": 34.8018, "step": 17665 }, { "epoch": 46.65830307032024, "grad_norm": 2090.116455078125, "learning_rate": 5.296450388850515e-06, "loss": 36.3701, "step": 17666 }, { "epoch": 46.66094420600859, "grad_norm": 2243.781005859375, "learning_rate": 5.287900046864552e-06, "loss": 41.1113, "step": 17667 }, { "epoch": 46.66358534169693, "grad_norm": 764.3488159179688, "learning_rate": 5.279356538320207e-06, "loss": 37.8798, "step": 17668 }, { "epoch": 46.66622647738527, "grad_norm": 583.60986328125, "learning_rate": 5.270819863456094e-06, "loss": 39.1081, "step": 17669 }, { "epoch": 46.66886761307362, "grad_norm": 6798.91015625, "learning_rate": 5.262290022510552e-06, "loss": 38.2903, "step": 17670 }, { "epoch": 46.671508748761966, "grad_norm": 681.5977783203125, "learning_rate": 5.253767015721806e-06, "loss": 42.2974, "step": 17671 }, { "epoch": 46.674149884450316, "grad_norm": 1092.1173095703125, "learning_rate": 5.245250843327859e-06, "loss": 41.9955, "step": 17672 }, { "epoch": 46.67679102013866, "grad_norm": 704.8975830078125, "learning_rate": 5.236741505566522e-06, "loss": 39.8313, "step": 17673 }, { "epoch": 46.67943215582701, "grad_norm": 602.9539794921875, "learning_rate": 5.228239002675383e-06, "loss": 41.2872, "step": 17674 }, { "epoch": 46.68207329151535, "grad_norm": 774.9223022460938, "learning_rate": 5.219743334891835e-06, "loss": 39.1142, "step": 17675 }, { "epoch": 46.684714427203694, "grad_norm": 700.0514526367188, "learning_rate": 5.211254502453217e-06, "loss": 39.1297, "step": 17676 }, { "epoch": 46.687355562892044, "grad_norm": 767.1354370117188, "learning_rate": 5.2027725055965036e-06, "loss": 39.0372, "step": 17677 }, { "epoch": 46.68999669858039, "grad_norm": 938.86767578125, "learning_rate": 5.194297344558535e-06, "loss": 38.2835, "step": 17678 }, { "epoch": 46.69263783426874, "grad_norm": 646.7610473632812, "learning_rate": 5.185829019576066e-06, "loss": 35.8623, "step": 17679 }, { "epoch": 46.69527896995708, "grad_norm": 2156.224365234375, "learning_rate": 5.177367530885463e-06, "loss": 37.778, "step": 17680 }, { "epoch": 46.69792010564543, "grad_norm": 696.2120361328125, "learning_rate": 5.168912878723092e-06, "loss": 36.781, "step": 17681 }, { "epoch": 46.70056124133377, "grad_norm": 1311.1505126953125, "learning_rate": 5.160465063324987e-06, "loss": 33.97, "step": 17682 }, { "epoch": 46.70320237702212, "grad_norm": 1077.6082763671875, "learning_rate": 5.1520240849270705e-06, "loss": 34.9177, "step": 17683 }, { "epoch": 46.705843512710466, "grad_norm": 848.7847290039062, "learning_rate": 5.1435899437650705e-06, "loss": 34.1689, "step": 17684 }, { "epoch": 46.70848464839881, "grad_norm": 610.3414306640625, "learning_rate": 5.135162640074437e-06, "loss": 35.3797, "step": 17685 }, { "epoch": 46.71112578408716, "grad_norm": 1081.4256591796875, "learning_rate": 5.126742174090565e-06, "loss": 34.6724, "step": 17686 }, { "epoch": 46.7137669197755, "grad_norm": 915.8339233398438, "learning_rate": 5.118328546048601e-06, "loss": 34.3046, "step": 17687 }, { "epoch": 46.71640805546385, "grad_norm": 1529.688232421875, "learning_rate": 5.109921756183439e-06, "loss": 34.9237, "step": 17688 }, { "epoch": 46.719049191152195, "grad_norm": 23539.544921875, "learning_rate": 5.101521804729836e-06, "loss": 42.4511, "step": 17689 }, { "epoch": 46.721690326840545, "grad_norm": 2076.32568359375, "learning_rate": 5.093128691922411e-06, "loss": 9.2542, "step": 17690 }, { "epoch": 46.72433146252889, "grad_norm": 1093.88818359375, "learning_rate": 5.084742417995502e-06, "loss": 8.7522, "step": 17691 }, { "epoch": 46.72697259821723, "grad_norm": 348.2218017578125, "learning_rate": 5.076362983183258e-06, "loss": 9.8182, "step": 17692 }, { "epoch": 46.72961373390558, "grad_norm": 2002.6097412109375, "learning_rate": 5.067990387719712e-06, "loss": 8.7987, "step": 17693 }, { "epoch": 46.73225486959392, "grad_norm": 6732.32958984375, "learning_rate": 5.05962463183865e-06, "loss": 17.051, "step": 17694 }, { "epoch": 46.73489600528227, "grad_norm": 1190.4263916015625, "learning_rate": 5.051265715773718e-06, "loss": 12.1359, "step": 17695 }, { "epoch": 46.737537140970616, "grad_norm": 1339.861328125, "learning_rate": 5.042913639758257e-06, "loss": 10.6915, "step": 17696 }, { "epoch": 46.740178276658966, "grad_norm": 4219.5439453125, "learning_rate": 5.034568404025553e-06, "loss": 9.0167, "step": 17697 }, { "epoch": 46.74281941234731, "grad_norm": 7886.34521484375, "learning_rate": 5.026230008808641e-06, "loss": 10.0926, "step": 17698 }, { "epoch": 46.74546054803565, "grad_norm": 1840.027587890625, "learning_rate": 5.017898454340336e-06, "loss": 32.2731, "step": 17699 }, { "epoch": 46.748101683724, "grad_norm": 917.9982299804688, "learning_rate": 5.009573740853312e-06, "loss": 34.354, "step": 17700 }, { "epoch": 46.750742819412345, "grad_norm": 957.2288208007812, "learning_rate": 5.001255868580024e-06, "loss": 34.9572, "step": 17701 }, { "epoch": 46.753383955100695, "grad_norm": 1505.805908203125, "learning_rate": 4.992944837752756e-06, "loss": 35.8743, "step": 17702 }, { "epoch": 46.75602509078904, "grad_norm": 1114.025634765625, "learning_rate": 4.984640648603545e-06, "loss": 33.3194, "step": 17703 }, { "epoch": 46.75866622647739, "grad_norm": 750.4212036132812, "learning_rate": 4.976343301364345e-06, "loss": 33.844, "step": 17704 }, { "epoch": 46.76130736216573, "grad_norm": 1317.407958984375, "learning_rate": 4.968052796266803e-06, "loss": 35.055, "step": 17705 }, { "epoch": 46.76394849785408, "grad_norm": 713.7617797851562, "learning_rate": 4.95976913354243e-06, "loss": 34.3873, "step": 17706 }, { "epoch": 46.766589633542424, "grad_norm": 1005.4776000976562, "learning_rate": 4.951492313422595e-06, "loss": 34.1188, "step": 17707 }, { "epoch": 46.76923076923077, "grad_norm": 1096.183837890625, "learning_rate": 4.94322233613842e-06, "loss": 33.4609, "step": 17708 }, { "epoch": 46.77187190491912, "grad_norm": 724.0404052734375, "learning_rate": 4.934959201920775e-06, "loss": 34.8005, "step": 17709 }, { "epoch": 46.77451304060746, "grad_norm": 1635.4786376953125, "learning_rate": 4.926702911000391e-06, "loss": 35.1305, "step": 17710 }, { "epoch": 46.77715417629581, "grad_norm": 1166.545166015625, "learning_rate": 4.91845346360792e-06, "loss": 34.284, "step": 17711 }, { "epoch": 46.77979531198415, "grad_norm": 1924.1456298828125, "learning_rate": 4.910210859973646e-06, "loss": 34.6755, "step": 17712 }, { "epoch": 46.7824364476725, "grad_norm": 1528.3251953125, "learning_rate": 4.901975100327749e-06, "loss": 34.0562, "step": 17713 }, { "epoch": 46.785077583360845, "grad_norm": 1553.346435546875, "learning_rate": 4.8937461849002386e-06, "loss": 34.3293, "step": 17714 }, { "epoch": 46.78771871904919, "grad_norm": 3289.877197265625, "learning_rate": 4.885524113920875e-06, "loss": 36.3227, "step": 17715 }, { "epoch": 46.79035985473754, "grad_norm": 3179.27734375, "learning_rate": 4.877308887619281e-06, "loss": 36.5783, "step": 17716 }, { "epoch": 46.79300099042588, "grad_norm": 1723.8963623046875, "learning_rate": 4.8691005062248276e-06, "loss": 40.5344, "step": 17717 }, { "epoch": 46.79564212611423, "grad_norm": 794.3160400390625, "learning_rate": 4.860898969966749e-06, "loss": 38.4174, "step": 17718 }, { "epoch": 46.798283261802574, "grad_norm": 1095.09765625, "learning_rate": 4.852704279074055e-06, "loss": 37.4555, "step": 17719 }, { "epoch": 46.800924397490924, "grad_norm": 541.3473510742188, "learning_rate": 4.844516433775592e-06, "loss": 38.6861, "step": 17720 }, { "epoch": 46.80356553317927, "grad_norm": 880.1412963867188, "learning_rate": 4.836335434299982e-06, "loss": 40.1684, "step": 17721 }, { "epoch": 46.80620666886761, "grad_norm": 1105.79248046875, "learning_rate": 4.828161280875709e-06, "loss": 39.4274, "step": 17722 }, { "epoch": 46.80884780455596, "grad_norm": 503.30731201171875, "learning_rate": 4.819993973731007e-06, "loss": 39.7779, "step": 17723 }, { "epoch": 46.8114889402443, "grad_norm": 902.24853515625, "learning_rate": 4.811833513093916e-06, "loss": 38.8444, "step": 17724 }, { "epoch": 46.81413007593265, "grad_norm": 734.6655883789062, "learning_rate": 4.803679899192393e-06, "loss": 39.8233, "step": 17725 }, { "epoch": 46.816771211620996, "grad_norm": 764.580078125, "learning_rate": 4.7955331322540595e-06, "loss": 37.3601, "step": 17726 }, { "epoch": 46.819412347309346, "grad_norm": 1757.524169921875, "learning_rate": 4.7873932125064025e-06, "loss": 38.8282, "step": 17727 }, { "epoch": 46.82205348299769, "grad_norm": 1729.778076171875, "learning_rate": 4.779260140176766e-06, "loss": 37.9042, "step": 17728 }, { "epoch": 46.82469461868604, "grad_norm": 792.88427734375, "learning_rate": 4.77113391549222e-06, "loss": 37.7002, "step": 17729 }, { "epoch": 46.82733575437438, "grad_norm": 2648.1181640625, "learning_rate": 4.763014538679722e-06, "loss": 35.0902, "step": 17730 }, { "epoch": 46.829976890062724, "grad_norm": 464.90118408203125, "learning_rate": 4.75490200996595e-06, "loss": 36.7237, "step": 17731 }, { "epoch": 46.832618025751074, "grad_norm": 981.8645629882812, "learning_rate": 4.746796329577502e-06, "loss": 35.0733, "step": 17732 }, { "epoch": 46.83525916143942, "grad_norm": 1551.41162109375, "learning_rate": 4.738697497740696e-06, "loss": 35.3299, "step": 17733 }, { "epoch": 46.83790029712777, "grad_norm": 5297.39404296875, "learning_rate": 4.730605514681685e-06, "loss": 34.3731, "step": 17734 }, { "epoch": 46.84054143281611, "grad_norm": 1090.8787841796875, "learning_rate": 4.722520380626427e-06, "loss": 34.0267, "step": 17735 }, { "epoch": 46.84318256850446, "grad_norm": 1189.6275634765625, "learning_rate": 4.714442095800714e-06, "loss": 35.1141, "step": 17736 }, { "epoch": 46.8458237041928, "grad_norm": 1178.7615966796875, "learning_rate": 4.706370660430142e-06, "loss": 34.4902, "step": 17737 }, { "epoch": 46.848464839881146, "grad_norm": 1257.5450439453125, "learning_rate": 4.698306074740033e-06, "loss": 42.6755, "step": 17738 }, { "epoch": 46.851105975569496, "grad_norm": 1787.6036376953125, "learning_rate": 4.690248338955649e-06, "loss": 15.5447, "step": 17739 }, { "epoch": 46.85374711125784, "grad_norm": 2013.9249267578125, "learning_rate": 4.682197453301951e-06, "loss": 9.9835, "step": 17740 }, { "epoch": 46.85638824694619, "grad_norm": 956.4601440429688, "learning_rate": 4.674153418003813e-06, "loss": 15.7137, "step": 17741 }, { "epoch": 46.85902938263453, "grad_norm": 791.5706176757812, "learning_rate": 4.666116233285805e-06, "loss": 13.4088, "step": 17742 }, { "epoch": 46.86167051832288, "grad_norm": 2454.4599609375, "learning_rate": 4.658085899372416e-06, "loss": 10.0324, "step": 17743 }, { "epoch": 46.864311654011225, "grad_norm": 453.2629699707031, "learning_rate": 4.650062416487854e-06, "loss": 10.1538, "step": 17744 }, { "epoch": 46.86695278969957, "grad_norm": 5351.50537109375, "learning_rate": 4.642045784856136e-06, "loss": 12.7766, "step": 17745 }, { "epoch": 46.86959392538792, "grad_norm": 9210.208984375, "learning_rate": 4.634036004701192e-06, "loss": 11.9494, "step": 17746 }, { "epoch": 46.87223506107626, "grad_norm": 2172.68994140625, "learning_rate": 4.62603307624665e-06, "loss": 10.8699, "step": 17747 }, { "epoch": 46.87487619676461, "grad_norm": 494.7276306152344, "learning_rate": 4.618036999715969e-06, "loss": 21.3501, "step": 17748 }, { "epoch": 46.87751733245295, "grad_norm": 1228.859619140625, "learning_rate": 4.610047775332471e-06, "loss": 35.6612, "step": 17749 }, { "epoch": 46.8801584681413, "grad_norm": 837.5335693359375, "learning_rate": 4.6020654033192555e-06, "loss": 34.4645, "step": 17750 }, { "epoch": 46.882799603829646, "grad_norm": 1187.0394287109375, "learning_rate": 4.5940898838992e-06, "loss": 33.9821, "step": 17751 }, { "epoch": 46.885440739517996, "grad_norm": 1234.9339599609375, "learning_rate": 4.586121217295042e-06, "loss": 32.574, "step": 17752 }, { "epoch": 46.88808187520634, "grad_norm": 863.0960693359375, "learning_rate": 4.578159403729271e-06, "loss": 34.6494, "step": 17753 }, { "epoch": 46.89072301089468, "grad_norm": 1152.7607421875, "learning_rate": 4.5702044434242365e-06, "loss": 33.3995, "step": 17754 }, { "epoch": 46.89336414658303, "grad_norm": 712.1937255859375, "learning_rate": 4.562256336602038e-06, "loss": 33.7398, "step": 17755 }, { "epoch": 46.896005282271375, "grad_norm": 4803.46484375, "learning_rate": 4.554315083484695e-06, "loss": 34.0056, "step": 17756 }, { "epoch": 46.898646417959725, "grad_norm": 1415.5792236328125, "learning_rate": 4.546380684293888e-06, "loss": 32.9196, "step": 17757 }, { "epoch": 46.90128755364807, "grad_norm": 964.7510986328125, "learning_rate": 4.538453139251247e-06, "loss": 34.4816, "step": 17758 }, { "epoch": 46.90392868933642, "grad_norm": 696.6563110351562, "learning_rate": 4.530532448578068e-06, "loss": 34.8248, "step": 17759 }, { "epoch": 46.90656982502476, "grad_norm": 1884.553955078125, "learning_rate": 4.522618612495616e-06, "loss": 34.1251, "step": 17760 }, { "epoch": 46.909210960713104, "grad_norm": 718.7238159179688, "learning_rate": 4.514711631224827e-06, "loss": 34.854, "step": 17761 }, { "epoch": 46.911852096401454, "grad_norm": 885.7345581054688, "learning_rate": 4.506811504986497e-06, "loss": 33.5012, "step": 17762 }, { "epoch": 46.9144932320898, "grad_norm": 1460.570556640625, "learning_rate": 4.498918234001254e-06, "loss": 34.8442, "step": 17763 }, { "epoch": 46.91713436777815, "grad_norm": 1586.835693359375, "learning_rate": 4.491031818489505e-06, "loss": 34.4259, "step": 17764 }, { "epoch": 46.91977550346649, "grad_norm": 2040.5374755859375, "learning_rate": 4.483152258671463e-06, "loss": 37.1597, "step": 17765 }, { "epoch": 46.92241663915484, "grad_norm": 2713.1875, "learning_rate": 4.475279554767148e-06, "loss": 37.6537, "step": 17766 }, { "epoch": 46.92505777484318, "grad_norm": 1140.1898193359375, "learning_rate": 4.467413706996465e-06, "loss": 40.1528, "step": 17767 }, { "epoch": 46.927698910531525, "grad_norm": 953.8179931640625, "learning_rate": 4.459554715579017e-06, "loss": 38.2888, "step": 17768 }, { "epoch": 46.930340046219875, "grad_norm": 1019.9520263671875, "learning_rate": 4.45170258073424e-06, "loss": 38.8576, "step": 17769 }, { "epoch": 46.93298118190822, "grad_norm": 661.14306640625, "learning_rate": 4.443857302681459e-06, "loss": 42.6058, "step": 17770 }, { "epoch": 46.93562231759657, "grad_norm": 853.5671997070312, "learning_rate": 4.436018881639692e-06, "loss": 39.9707, "step": 17771 }, { "epoch": 46.93826345328491, "grad_norm": 618.5018310546875, "learning_rate": 4.4281873178278475e-06, "loss": 39.2221, "step": 17772 }, { "epoch": 46.94090458897326, "grad_norm": 535.1079711914062, "learning_rate": 4.420362611464612e-06, "loss": 38.6594, "step": 17773 }, { "epoch": 46.943545724661604, "grad_norm": 2353.936767578125, "learning_rate": 4.4125447627684775e-06, "loss": 38.1968, "step": 17774 }, { "epoch": 46.946186860349954, "grad_norm": 897.6465454101562, "learning_rate": 4.404733771957797e-06, "loss": 35.3057, "step": 17775 }, { "epoch": 46.9488279960383, "grad_norm": 863.8644409179688, "learning_rate": 4.396929639250618e-06, "loss": 34.928, "step": 17776 }, { "epoch": 46.95146913172664, "grad_norm": 589.1157836914062, "learning_rate": 4.389132364864934e-06, "loss": 34.2976, "step": 17777 }, { "epoch": 46.95411026741499, "grad_norm": 778.1729125976562, "learning_rate": 4.381341949018458e-06, "loss": 34.9906, "step": 17778 }, { "epoch": 46.95675140310333, "grad_norm": 959.1165161132812, "learning_rate": 4.373558391928712e-06, "loss": 35.1469, "step": 17779 }, { "epoch": 46.95939253879168, "grad_norm": 1440.4124755859375, "learning_rate": 4.365781693813048e-06, "loss": 21.519, "step": 17780 }, { "epoch": 46.962033674480026, "grad_norm": 1343.2796630859375, "learning_rate": 4.358011854888627e-06, "loss": 15.0396, "step": 17781 }, { "epoch": 46.964674810168376, "grad_norm": 1798.4576416015625, "learning_rate": 4.350248875372442e-06, "loss": 10.1683, "step": 17782 }, { "epoch": 46.96731594585672, "grad_norm": 12199.8427734375, "learning_rate": 4.342492755481236e-06, "loss": 10.3145, "step": 17783 }, { "epoch": 46.96995708154506, "grad_norm": 970.0890502929688, "learning_rate": 4.334743495431642e-06, "loss": 10.2525, "step": 17784 }, { "epoch": 46.97259821723341, "grad_norm": 1338.5411376953125, "learning_rate": 4.327001095440014e-06, "loss": 17.0961, "step": 17785 }, { "epoch": 46.975239352921754, "grad_norm": 2539.028076171875, "learning_rate": 4.319265555722568e-06, "loss": 34.4331, "step": 17786 }, { "epoch": 46.977880488610104, "grad_norm": 1009.4705200195312, "learning_rate": 4.311536876495326e-06, "loss": 34.2132, "step": 17787 }, { "epoch": 46.98052162429845, "grad_norm": 2970.995849609375, "learning_rate": 4.303815057974086e-06, "loss": 34.4484, "step": 17788 }, { "epoch": 46.9831627599868, "grad_norm": 491.33001708984375, "learning_rate": 4.296100100374456e-06, "loss": 34.866, "step": 17789 }, { "epoch": 46.98580389567514, "grad_norm": 1326.8935546875, "learning_rate": 4.288392003911901e-06, "loss": 33.6662, "step": 17790 }, { "epoch": 46.98844503136348, "grad_norm": 1109.6165771484375, "learning_rate": 4.280690768801665e-06, "loss": 34.376, "step": 17791 }, { "epoch": 46.99108616705183, "grad_norm": 812.0618896484375, "learning_rate": 4.272996395258799e-06, "loss": 33.9732, "step": 17792 }, { "epoch": 46.993727302740176, "grad_norm": 3091.103271484375, "learning_rate": 4.265308883498131e-06, "loss": 34.119, "step": 17793 }, { "epoch": 46.996368438428526, "grad_norm": 2088.30078125, "learning_rate": 4.257628233734406e-06, "loss": 35.2444, "step": 17794 }, { "epoch": 46.99900957411687, "grad_norm": 5647.107421875, "learning_rate": 4.249954446182036e-06, "loss": 35.8775, "step": 17795 }, { "epoch": 47.00165070980522, "grad_norm": 519.2578125, "learning_rate": 4.242287521055349e-06, "loss": 37.9733, "step": 17796 }, { "epoch": 47.00429184549356, "grad_norm": 1232.717529296875, "learning_rate": 4.234627458568397e-06, "loss": 38.4344, "step": 17797 }, { "epoch": 47.00693298118191, "grad_norm": 951.757568359375, "learning_rate": 4.2269742589350904e-06, "loss": 37.6681, "step": 17798 }, { "epoch": 47.009574116870255, "grad_norm": 2863.688720703125, "learning_rate": 4.219327922369176e-06, "loss": 39.2601, "step": 17799 }, { "epoch": 47.0122152525586, "grad_norm": 1148.167236328125, "learning_rate": 4.211688449084122e-06, "loss": 40.0688, "step": 17800 }, { "epoch": 47.0122152525586, "eval_loss": 3.706843852996826, "eval_runtime": 2.1233, "eval_samples_per_second": 233.127, "eval_steps_per_second": 29.2, "step": 17800 }, { "epoch": 47.01485638824695, "grad_norm": 640.75830078125, "learning_rate": 4.204055839293314e-06, "loss": 42.7892, "step": 17801 }, { "epoch": 47.01749752393529, "grad_norm": 639.2543334960938, "learning_rate": 4.1964300932098575e-06, "loss": 39.7931, "step": 17802 }, { "epoch": 47.02013865962364, "grad_norm": 1720.1318359375, "learning_rate": 4.188811211046695e-06, "loss": 41.3984, "step": 17803 }, { "epoch": 47.02277979531198, "grad_norm": 689.1192626953125, "learning_rate": 4.181199193016572e-06, "loss": 38.5632, "step": 17804 }, { "epoch": 47.02542093100033, "grad_norm": 2078.33837890625, "learning_rate": 4.1735940393320685e-06, "loss": 41.5022, "step": 17805 }, { "epoch": 47.028062066688676, "grad_norm": 1410.60107421875, "learning_rate": 4.165995750205542e-06, "loss": 40.2691, "step": 17806 }, { "epoch": 47.03070320237702, "grad_norm": 486.23382568359375, "learning_rate": 4.15840432584913e-06, "loss": 37.3434, "step": 17807 }, { "epoch": 47.03334433806537, "grad_norm": 862.6912841796875, "learning_rate": 4.150819766474911e-06, "loss": 36.7962, "step": 17808 }, { "epoch": 47.03598547375371, "grad_norm": 682.5216674804688, "learning_rate": 4.143242072294634e-06, "loss": 36.4155, "step": 17809 }, { "epoch": 47.03862660944206, "grad_norm": 1262.9244384765625, "learning_rate": 4.135671243519878e-06, "loss": 35.9719, "step": 17810 }, { "epoch": 47.041267745130405, "grad_norm": 1202.8822021484375, "learning_rate": 4.128107280362087e-06, "loss": 35.4593, "step": 17811 }, { "epoch": 47.043908880818755, "grad_norm": 1102.31005859375, "learning_rate": 4.120550183032451e-06, "loss": 33.9165, "step": 17812 }, { "epoch": 47.0465500165071, "grad_norm": 1615.607666015625, "learning_rate": 4.112999951742052e-06, "loss": 33.8685, "step": 17813 }, { "epoch": 47.04919115219544, "grad_norm": 1007.1325073242188, "learning_rate": 4.1054565867016395e-06, "loss": 35.2317, "step": 17814 }, { "epoch": 47.05183228788379, "grad_norm": 1673.10009765625, "learning_rate": 4.0979200881219315e-06, "loss": 35.0932, "step": 17815 }, { "epoch": 47.054473423572134, "grad_norm": 2230.123291015625, "learning_rate": 4.090390456213345e-06, "loss": 34.4853, "step": 17816 }, { "epoch": 47.057114559260484, "grad_norm": 1375.50048828125, "learning_rate": 4.082867691186154e-06, "loss": 37.4606, "step": 17817 }, { "epoch": 47.05975569494883, "grad_norm": 5205.9462890625, "learning_rate": 4.075351793250415e-06, "loss": 12.0678, "step": 17818 }, { "epoch": 47.06239683063718, "grad_norm": 1776.897705078125, "learning_rate": 4.067842762616014e-06, "loss": 15.3241, "step": 17819 }, { "epoch": 47.06503796632552, "grad_norm": 1936.630615234375, "learning_rate": 4.060340599492646e-06, "loss": 11.3291, "step": 17820 }, { "epoch": 47.06767910201387, "grad_norm": 2484.8369140625, "learning_rate": 4.052845304089808e-06, "loss": 10.2926, "step": 17821 }, { "epoch": 47.07032023770221, "grad_norm": 1199.836181640625, "learning_rate": 4.045356876616779e-06, "loss": 7.7258, "step": 17822 }, { "epoch": 47.072961373390555, "grad_norm": 428.8656311035156, "learning_rate": 4.0378753172826685e-06, "loss": 10.9224, "step": 17823 }, { "epoch": 47.075602509078905, "grad_norm": 605.578369140625, "learning_rate": 4.030400626296393e-06, "loss": 18.0501, "step": 17824 }, { "epoch": 47.07824364476725, "grad_norm": 3558.21630859375, "learning_rate": 4.0229328038667025e-06, "loss": 10.6252, "step": 17825 }, { "epoch": 47.0808847804556, "grad_norm": 30584.86328125, "learning_rate": 4.015471850202123e-06, "loss": 9.6693, "step": 17826 }, { "epoch": 47.08352591614394, "grad_norm": 3748.276611328125, "learning_rate": 4.008017765510991e-06, "loss": 15.8972, "step": 17827 }, { "epoch": 47.08616705183229, "grad_norm": 678.75830078125, "learning_rate": 4.000570550001442e-06, "loss": 36.229, "step": 17828 }, { "epoch": 47.088808187520634, "grad_norm": 921.6018676757812, "learning_rate": 3.993130203881451e-06, "loss": 35.6425, "step": 17829 }, { "epoch": 47.09144932320898, "grad_norm": 959.1067504882812, "learning_rate": 3.985696727358823e-06, "loss": 34.5284, "step": 17830 }, { "epoch": 47.09409045889733, "grad_norm": 1892.5888671875, "learning_rate": 3.978270120641087e-06, "loss": 36.3801, "step": 17831 }, { "epoch": 47.09673159458567, "grad_norm": 807.8978271484375, "learning_rate": 3.9708503839356034e-06, "loss": 33.562, "step": 17832 }, { "epoch": 47.09937273027402, "grad_norm": 1127.7738037109375, "learning_rate": 3.963437517449625e-06, "loss": 34.6545, "step": 17833 }, { "epoch": 47.10201386596236, "grad_norm": 1904.99853515625, "learning_rate": 3.956031521390124e-06, "loss": 35.8115, "step": 17834 }, { "epoch": 47.10465500165071, "grad_norm": 818.04833984375, "learning_rate": 3.948632395963908e-06, "loss": 34.8661, "step": 17835 }, { "epoch": 47.107296137339056, "grad_norm": 925.4510498046875, "learning_rate": 3.941240141377589e-06, "loss": 34.1973, "step": 17836 }, { "epoch": 47.1099372730274, "grad_norm": 2312.2607421875, "learning_rate": 3.933854757837585e-06, "loss": 34.3694, "step": 17837 }, { "epoch": 47.11257840871575, "grad_norm": 17316.48828125, "learning_rate": 3.9264762455501756e-06, "loss": 33.7666, "step": 17838 }, { "epoch": 47.11521954440409, "grad_norm": 1342.9564208984375, "learning_rate": 3.9191046047213354e-06, "loss": 33.7107, "step": 17839 }, { "epoch": 47.11786068009244, "grad_norm": 818.1466674804688, "learning_rate": 3.911739835556955e-06, "loss": 34.2313, "step": 17840 }, { "epoch": 47.120501815780784, "grad_norm": 419.60772705078125, "learning_rate": 3.904381938262674e-06, "loss": 33.3381, "step": 17841 }, { "epoch": 47.123142951469134, "grad_norm": 1105.6712646484375, "learning_rate": 3.8970309130439415e-06, "loss": 33.1255, "step": 17842 }, { "epoch": 47.12578408715748, "grad_norm": 824.400146484375, "learning_rate": 3.889686760106092e-06, "loss": 34.158, "step": 17843 }, { "epoch": 47.12842522284583, "grad_norm": 2106.830322265625, "learning_rate": 3.882349479654157e-06, "loss": 36.5509, "step": 17844 }, { "epoch": 47.13106635853417, "grad_norm": 1725.7947998046875, "learning_rate": 3.875019071893026e-06, "loss": 39.277, "step": 17845 }, { "epoch": 47.13370749422251, "grad_norm": 34944.984375, "learning_rate": 3.8676955370273715e-06, "loss": 39.2373, "step": 17846 }, { "epoch": 47.13634862991086, "grad_norm": 610.3038330078125, "learning_rate": 3.860378875261777e-06, "loss": 38.1735, "step": 17847 }, { "epoch": 47.138989765599206, "grad_norm": 1003.557861328125, "learning_rate": 3.853069086800526e-06, "loss": 38.4768, "step": 17848 }, { "epoch": 47.141630901287556, "grad_norm": 761.0082397460938, "learning_rate": 3.845766171847704e-06, "loss": 37.6274, "step": 17849 }, { "epoch": 47.1442720369759, "grad_norm": 2111.213134765625, "learning_rate": 3.838470130607258e-06, "loss": 41.9932, "step": 17850 }, { "epoch": 47.14691317266425, "grad_norm": 874.4180908203125, "learning_rate": 3.831180963282943e-06, "loss": 42.4104, "step": 17851 }, { "epoch": 47.14955430835259, "grad_norm": 717.13671875, "learning_rate": 3.823898670078291e-06, "loss": 39.6713, "step": 17852 }, { "epoch": 47.152195444040935, "grad_norm": 1988.2977294921875, "learning_rate": 3.8166232511966385e-06, "loss": 40.7172, "step": 17853 }, { "epoch": 47.154836579729285, "grad_norm": 993.0811157226562, "learning_rate": 3.809354706841184e-06, "loss": 39.8916, "step": 17854 }, { "epoch": 47.15747771541763, "grad_norm": 635.5947875976562, "learning_rate": 3.8020930372148766e-06, "loss": 39.9413, "step": 17855 }, { "epoch": 47.16011885110598, "grad_norm": 970.91357421875, "learning_rate": 3.7948382425204985e-06, "loss": 39.6035, "step": 17856 }, { "epoch": 47.16275998679432, "grad_norm": 771.9959716796875, "learning_rate": 3.78759032296061e-06, "loss": 38.2156, "step": 17857 }, { "epoch": 47.16540112248267, "grad_norm": 1021.7356567382812, "learning_rate": 3.7803492787376605e-06, "loss": 37.0334, "step": 17858 }, { "epoch": 47.16804225817101, "grad_norm": 1111.6500244140625, "learning_rate": 3.7731151100538208e-06, "loss": 36.5358, "step": 17859 }, { "epoch": 47.170683393859356, "grad_norm": 644.80712890625, "learning_rate": 3.765887817111069e-06, "loss": 34.6138, "step": 17860 }, { "epoch": 47.173324529547706, "grad_norm": 801.737548828125, "learning_rate": 3.758667400111271e-06, "loss": 34.9194, "step": 17861 }, { "epoch": 47.17596566523605, "grad_norm": 1003.2276000976562, "learning_rate": 3.7514538592560433e-06, "loss": 33.4707, "step": 17862 }, { "epoch": 47.1786068009244, "grad_norm": 1163.6654052734375, "learning_rate": 3.7442471947467806e-06, "loss": 35.5031, "step": 17863 }, { "epoch": 47.18124793661274, "grad_norm": 537.6422729492188, "learning_rate": 3.7370474067847936e-06, "loss": 33.8048, "step": 17864 }, { "epoch": 47.18388907230109, "grad_norm": 843.4682006835938, "learning_rate": 3.729854495571089e-06, "loss": 34.4558, "step": 17865 }, { "epoch": 47.186530207989435, "grad_norm": 1031.028564453125, "learning_rate": 3.722668461306533e-06, "loss": 36.268, "step": 17866 }, { "epoch": 47.189171343677785, "grad_norm": 541.6668701171875, "learning_rate": 3.7154893041917716e-06, "loss": 36.3784, "step": 17867 }, { "epoch": 47.19181247936613, "grad_norm": 2471.944091796875, "learning_rate": 3.70831702442731e-06, "loss": 31.2452, "step": 17868 }, { "epoch": 47.19445361505447, "grad_norm": 2396.232666015625, "learning_rate": 3.7011516222134056e-06, "loss": 11.1118, "step": 17869 }, { "epoch": 47.19709475074282, "grad_norm": 1993.075439453125, "learning_rate": 3.693993097750148e-06, "loss": 11.2499, "step": 17870 }, { "epoch": 47.199735886431164, "grad_norm": 4928.318359375, "learning_rate": 3.6868414512374603e-06, "loss": 16.4177, "step": 17871 }, { "epoch": 47.202377022119514, "grad_norm": 1225.184814453125, "learning_rate": 3.6796966828750445e-06, "loss": 11.1912, "step": 17872 }, { "epoch": 47.20501815780786, "grad_norm": 14243.8330078125, "learning_rate": 3.6725587928624072e-06, "loss": 7.6309, "step": 17873 }, { "epoch": 47.20765929349621, "grad_norm": 4037.302001953125, "learning_rate": 3.6654277813988335e-06, "loss": 11.0332, "step": 17874 }, { "epoch": 47.21030042918455, "grad_norm": 1681.7447509765625, "learning_rate": 3.6583036486835254e-06, "loss": 9.3428, "step": 17875 }, { "epoch": 47.21294156487289, "grad_norm": 4035.576904296875, "learning_rate": 3.651186394915351e-06, "loss": 13.0016, "step": 17876 }, { "epoch": 47.21558270056124, "grad_norm": 8241.130859375, "learning_rate": 3.644076020293069e-06, "loss": 8.6158, "step": 17877 }, { "epoch": 47.218223836249585, "grad_norm": 1410.93408203125, "learning_rate": 3.6369725250152697e-06, "loss": 33.7297, "step": 17878 }, { "epoch": 47.220864971937935, "grad_norm": 546.9359741210938, "learning_rate": 3.6298759092802946e-06, "loss": 33.6419, "step": 17879 }, { "epoch": 47.22350610762628, "grad_norm": 549.3911743164062, "learning_rate": 3.622786173286291e-06, "loss": 33.1205, "step": 17880 }, { "epoch": 47.22614724331463, "grad_norm": 852.7583618164062, "learning_rate": 3.615703317231239e-06, "loss": 33.1838, "step": 17881 }, { "epoch": 47.22878837900297, "grad_norm": 1040.615478515625, "learning_rate": 3.6086273413129812e-06, "loss": 32.9072, "step": 17882 }, { "epoch": 47.231429514691314, "grad_norm": 512.0028686523438, "learning_rate": 3.6015582457290806e-06, "loss": 33.7205, "step": 17883 }, { "epoch": 47.234070650379664, "grad_norm": 634.0025024414062, "learning_rate": 3.5944960306768524e-06, "loss": 35.2848, "step": 17884 }, { "epoch": 47.23671178606801, "grad_norm": 576.202392578125, "learning_rate": 3.5874406963536388e-06, "loss": 33.4258, "step": 17885 }, { "epoch": 47.23935292175636, "grad_norm": 1041.875244140625, "learning_rate": 3.580392242956365e-06, "loss": 34.0241, "step": 17886 }, { "epoch": 47.2419940574447, "grad_norm": 1054.111083984375, "learning_rate": 3.573350670681874e-06, "loss": 34.4898, "step": 17887 }, { "epoch": 47.24463519313305, "grad_norm": 1756.4117431640625, "learning_rate": 3.566315979726814e-06, "loss": 34.81, "step": 17888 }, { "epoch": 47.24727632882139, "grad_norm": 8708.939453125, "learning_rate": 3.559288170287639e-06, "loss": 33.7715, "step": 17889 }, { "epoch": 47.24991746450974, "grad_norm": 3583.7978515625, "learning_rate": 3.552267242560553e-06, "loss": 35.4683, "step": 17890 }, { "epoch": 47.252558600198086, "grad_norm": 1662.98388671875, "learning_rate": 3.545253196741649e-06, "loss": 33.9854, "step": 17891 }, { "epoch": 47.25519973588643, "grad_norm": 1772.277099609375, "learning_rate": 3.538246033026743e-06, "loss": 33.7049, "step": 17892 }, { "epoch": 47.25784087157478, "grad_norm": 1631.9832763671875, "learning_rate": 3.531245751611567e-06, "loss": 35.6298, "step": 17893 }, { "epoch": 47.26048200726312, "grad_norm": 1641.866943359375, "learning_rate": 3.524252352691576e-06, "loss": 36.028, "step": 17894 }, { "epoch": 47.26312314295147, "grad_norm": 3121.241943359375, "learning_rate": 3.5172658364620026e-06, "loss": 36.9957, "step": 17895 }, { "epoch": 47.265764278639814, "grad_norm": 1373.4169921875, "learning_rate": 3.510286203118024e-06, "loss": 41.3354, "step": 17896 }, { "epoch": 47.268405414328164, "grad_norm": 711.5504150390625, "learning_rate": 3.503313452854512e-06, "loss": 37.2192, "step": 17897 }, { "epoch": 47.27104655001651, "grad_norm": 1999.9879150390625, "learning_rate": 3.496347585866144e-06, "loss": 38.8021, "step": 17898 }, { "epoch": 47.27368768570485, "grad_norm": 494.6622009277344, "learning_rate": 3.489388602347515e-06, "loss": 38.3157, "step": 17899 }, { "epoch": 47.2763288213932, "grad_norm": 1145.8232421875, "learning_rate": 3.4824365024928584e-06, "loss": 39.7026, "step": 17900 }, { "epoch": 47.27896995708154, "grad_norm": 627.40380859375, "learning_rate": 3.4754912864963795e-06, "loss": 42.2911, "step": 17901 }, { "epoch": 47.28161109276989, "grad_norm": 2184.252197265625, "learning_rate": 3.468552954551979e-06, "loss": 41.4338, "step": 17902 }, { "epoch": 47.284252228458236, "grad_norm": 1049.3809814453125, "learning_rate": 3.4616215068534186e-06, "loss": 41.622, "step": 17903 }, { "epoch": 47.286893364146586, "grad_norm": 972.7156372070312, "learning_rate": 3.454696943594238e-06, "loss": 39.8423, "step": 17904 }, { "epoch": 47.28953449983493, "grad_norm": 679.7606201171875, "learning_rate": 3.4477792649678097e-06, "loss": 38.77, "step": 17905 }, { "epoch": 47.29217563552327, "grad_norm": 779.1751098632812, "learning_rate": 3.440868471167341e-06, "loss": 39.4342, "step": 17906 }, { "epoch": 47.29481677121162, "grad_norm": 826.0564575195312, "learning_rate": 3.433964562385761e-06, "loss": 37.2395, "step": 17907 }, { "epoch": 47.297457906899965, "grad_norm": 565.093505859375, "learning_rate": 3.4270675388158867e-06, "loss": 38.1948, "step": 17908 }, { "epoch": 47.300099042588315, "grad_norm": 868.8029174804688, "learning_rate": 3.420177400650315e-06, "loss": 35.9709, "step": 17909 }, { "epoch": 47.30274017827666, "grad_norm": 1067.4156494140625, "learning_rate": 3.4132941480814472e-06, "loss": 34.2666, "step": 17910 }, { "epoch": 47.30538131396501, "grad_norm": 786.2214965820312, "learning_rate": 3.406417781301491e-06, "loss": 35.1246, "step": 17911 }, { "epoch": 47.30802244965335, "grad_norm": 519.1119995117188, "learning_rate": 3.399548300502431e-06, "loss": 35.7176, "step": 17912 }, { "epoch": 47.3106635853417, "grad_norm": 896.01953125, "learning_rate": 3.392685705876142e-06, "loss": 34.959, "step": 17913 }, { "epoch": 47.31330472103004, "grad_norm": 995.0502319335938, "learning_rate": 3.3858299976142206e-06, "loss": 35.4496, "step": 17914 }, { "epoch": 47.315945856718386, "grad_norm": 2248.292236328125, "learning_rate": 3.3789811759081525e-06, "loss": 35.5458, "step": 17915 }, { "epoch": 47.318586992406736, "grad_norm": 787.7293090820312, "learning_rate": 3.3721392409491456e-06, "loss": 34.4304, "step": 17916 }, { "epoch": 47.32122812809508, "grad_norm": 2035.1304931640625, "learning_rate": 3.3653041929282703e-06, "loss": 34.5318, "step": 17917 }, { "epoch": 47.32386926378343, "grad_norm": 5024.73681640625, "learning_rate": 3.3584760320364283e-06, "loss": 39.7121, "step": 17918 }, { "epoch": 47.32651039947177, "grad_norm": 588.6300659179688, "learning_rate": 3.3516547584642177e-06, "loss": 10.8254, "step": 17919 }, { "epoch": 47.32915153516012, "grad_norm": 809.1322021484375, "learning_rate": 3.34484037240218e-06, "loss": 12.3539, "step": 17920 }, { "epoch": 47.331792670848465, "grad_norm": 5609.5126953125, "learning_rate": 3.3380328740405806e-06, "loss": 11.3254, "step": 17921 }, { "epoch": 47.33443380653681, "grad_norm": 6629.2626953125, "learning_rate": 3.331232263569517e-06, "loss": 9.069, "step": 17922 }, { "epoch": 47.33707494222516, "grad_norm": 1011.3922729492188, "learning_rate": 3.3244385411788923e-06, "loss": 8.5389, "step": 17923 }, { "epoch": 47.3397160779135, "grad_norm": 2609.541748046875, "learning_rate": 3.3176517070584167e-06, "loss": 9.5728, "step": 17924 }, { "epoch": 47.34235721360185, "grad_norm": 15005.0947265625, "learning_rate": 3.3108717613976048e-06, "loss": 9.403, "step": 17925 }, { "epoch": 47.344998349290194, "grad_norm": 955.1204833984375, "learning_rate": 3.304098704385777e-06, "loss": 9.1095, "step": 17926 }, { "epoch": 47.347639484978544, "grad_norm": 916.9832763671875, "learning_rate": 3.2973325362120886e-06, "loss": 13.4983, "step": 17927 }, { "epoch": 47.35028062066689, "grad_norm": 970.98046875, "learning_rate": 3.290573257065471e-06, "loss": 34.2665, "step": 17928 }, { "epoch": 47.35292175635523, "grad_norm": 944.3224487304688, "learning_rate": 3.283820867134635e-06, "loss": 34.9603, "step": 17929 }, { "epoch": 47.35556289204358, "grad_norm": 2826.32421875, "learning_rate": 3.2770753666082066e-06, "loss": 34.1404, "step": 17930 }, { "epoch": 47.35820402773192, "grad_norm": 674.316650390625, "learning_rate": 3.270336755674508e-06, "loss": 33.7507, "step": 17931 }, { "epoch": 47.36084516342027, "grad_norm": 1055.635986328125, "learning_rate": 3.2636050345217217e-06, "loss": 35.0347, "step": 17932 }, { "epoch": 47.363486299108615, "grad_norm": 1142.8450927734375, "learning_rate": 3.2568802033378086e-06, "loss": 33.7283, "step": 17933 }, { "epoch": 47.366127434796965, "grad_norm": 1046.5821533203125, "learning_rate": 3.250162262310591e-06, "loss": 33.8381, "step": 17934 }, { "epoch": 47.36876857048531, "grad_norm": 815.4005737304688, "learning_rate": 3.2434512116276405e-06, "loss": 35.6246, "step": 17935 }, { "epoch": 47.37140970617366, "grad_norm": 1275.0467529296875, "learning_rate": 3.2367470514763632e-06, "loss": 34.0197, "step": 17936 }, { "epoch": 47.374050841862, "grad_norm": 1003.0054931640625, "learning_rate": 3.23004978204397e-06, "loss": 33.4706, "step": 17937 }, { "epoch": 47.376691977550344, "grad_norm": 895.7277221679688, "learning_rate": 3.2233594035174784e-06, "loss": 34.248, "step": 17938 }, { "epoch": 47.379333113238694, "grad_norm": 698.1702270507812, "learning_rate": 3.2166759160836834e-06, "loss": 33.7828, "step": 17939 }, { "epoch": 47.38197424892704, "grad_norm": 3235.60693359375, "learning_rate": 3.2099993199292688e-06, "loss": 34.5697, "step": 17940 }, { "epoch": 47.38461538461539, "grad_norm": 1032.3804931640625, "learning_rate": 3.203329615240641e-06, "loss": 33.3699, "step": 17941 }, { "epoch": 47.38725652030373, "grad_norm": 3469.868408203125, "learning_rate": 3.196666802204068e-06, "loss": 35.5452, "step": 17942 }, { "epoch": 47.38989765599208, "grad_norm": 1450.886474609375, "learning_rate": 3.190010881005595e-06, "loss": 35.6007, "step": 17943 }, { "epoch": 47.39253879168042, "grad_norm": 90488.9609375, "learning_rate": 3.1833618518310737e-06, "loss": 35.8777, "step": 17944 }, { "epoch": 47.395179927368766, "grad_norm": 1214.0867919921875, "learning_rate": 3.1767197148661888e-06, "loss": 38.9945, "step": 17945 }, { "epoch": 47.397821063057116, "grad_norm": 1425.64794921875, "learning_rate": 3.1700844702964303e-06, "loss": 41.5275, "step": 17946 }, { "epoch": 47.40046219874546, "grad_norm": 525.1972045898438, "learning_rate": 3.163456118307012e-06, "loss": 37.2469, "step": 17947 }, { "epoch": 47.40310333443381, "grad_norm": 460.61187744140625, "learning_rate": 3.1568346590831465e-06, "loss": 38.0554, "step": 17948 }, { "epoch": 47.40574447012215, "grad_norm": 1130.1573486328125, "learning_rate": 3.1502200928096305e-06, "loss": 38.6946, "step": 17949 }, { "epoch": 47.4083856058105, "grad_norm": 754.8953857421875, "learning_rate": 3.1436124196712047e-06, "loss": 42.053, "step": 17950 }, { "epoch": 47.411026741498844, "grad_norm": 773.0685424804688, "learning_rate": 3.137011639852416e-06, "loss": 42.9109, "step": 17951 }, { "epoch": 47.41366787718719, "grad_norm": 1642.50439453125, "learning_rate": 3.1304177535375623e-06, "loss": 37.6548, "step": 17952 }, { "epoch": 47.41630901287554, "grad_norm": 1622.682861328125, "learning_rate": 3.123830760910801e-06, "loss": 40.9848, "step": 17953 }, { "epoch": 47.41895014856388, "grad_norm": 2089.48193359375, "learning_rate": 3.1172506621559847e-06, "loss": 40.2698, "step": 17954 }, { "epoch": 47.42159128425223, "grad_norm": 979.7039184570312, "learning_rate": 3.1106774574569395e-06, "loss": 37.9327, "step": 17955 }, { "epoch": 47.42423241994057, "grad_norm": 633.671630859375, "learning_rate": 3.1041111469972128e-06, "loss": 38.0146, "step": 17956 }, { "epoch": 47.42687355562892, "grad_norm": 810.8627319335938, "learning_rate": 3.09755173096013e-06, "loss": 37.0564, "step": 17957 }, { "epoch": 47.429514691317266, "grad_norm": 929.19580078125, "learning_rate": 3.0909992095288773e-06, "loss": 37.7681, "step": 17958 }, { "epoch": 47.432155827005616, "grad_norm": 745.8366088867188, "learning_rate": 3.0844535828864207e-06, "loss": 36.0856, "step": 17959 }, { "epoch": 47.43479696269396, "grad_norm": 1077.4609375, "learning_rate": 3.077914851215585e-06, "loss": 34.2963, "step": 17960 }, { "epoch": 47.4374380983823, "grad_norm": 750.5681762695312, "learning_rate": 3.071383014698892e-06, "loss": 35.9313, "step": 17961 }, { "epoch": 47.44007923407065, "grad_norm": 907.0272216796875, "learning_rate": 3.064858073518806e-06, "loss": 35.619, "step": 17962 }, { "epoch": 47.442720369758995, "grad_norm": 619.240966796875, "learning_rate": 3.058340027857487e-06, "loss": 34.4343, "step": 17963 }, { "epoch": 47.445361505447345, "grad_norm": 812.4313354492188, "learning_rate": 3.0518288778969285e-06, "loss": 34.4792, "step": 17964 }, { "epoch": 47.44800264113569, "grad_norm": 959.7817993164062, "learning_rate": 3.0453246238190126e-06, "loss": 34.0508, "step": 17965 }, { "epoch": 47.45064377682404, "grad_norm": 1165.3363037109375, "learning_rate": 3.038827265805344e-06, "loss": 34.5694, "step": 17966 }, { "epoch": 47.45328491251238, "grad_norm": 1296.023193359375, "learning_rate": 3.032336804037361e-06, "loss": 38.4626, "step": 17967 }, { "epoch": 47.45592604820072, "grad_norm": 23933.333984375, "learning_rate": 3.0258532386962523e-06, "loss": 35.9948, "step": 17968 }, { "epoch": 47.45856718388907, "grad_norm": 910.15869140625, "learning_rate": 3.0193765699631504e-06, "loss": 10.0924, "step": 17969 }, { "epoch": 47.461208319577416, "grad_norm": 722.1980590820312, "learning_rate": 3.0129067980188552e-06, "loss": 16.4332, "step": 17970 }, { "epoch": 47.463849455265766, "grad_norm": 6489.89697265625, "learning_rate": 3.0064439230440554e-06, "loss": 10.423, "step": 17971 }, { "epoch": 47.46649059095411, "grad_norm": 1099.367919921875, "learning_rate": 2.999987945219218e-06, "loss": 9.9761, "step": 17972 }, { "epoch": 47.46913172664246, "grad_norm": 604.1163330078125, "learning_rate": 2.993538864724643e-06, "loss": 13.77, "step": 17973 }, { "epoch": 47.4717728623308, "grad_norm": 2186.559814453125, "learning_rate": 2.9870966817403533e-06, "loss": 10.1709, "step": 17974 }, { "epoch": 47.474413998019145, "grad_norm": 4401.703125, "learning_rate": 2.980661396446316e-06, "loss": 13.2507, "step": 17975 }, { "epoch": 47.477055133707495, "grad_norm": 1743.4639892578125, "learning_rate": 2.9742330090221927e-06, "loss": 11.1421, "step": 17976 }, { "epoch": 47.47969626939584, "grad_norm": 3595.390869140625, "learning_rate": 2.9678115196475065e-06, "loss": 13.2962, "step": 17977 }, { "epoch": 47.48233740508419, "grad_norm": 729.8426513671875, "learning_rate": 2.9613969285015587e-06, "loss": 34.049, "step": 17978 }, { "epoch": 47.48497854077253, "grad_norm": 670.5049438476562, "learning_rate": 2.9549892357634834e-06, "loss": 35.6319, "step": 17979 }, { "epoch": 47.48761967646088, "grad_norm": 924.7396850585938, "learning_rate": 2.948588441612221e-06, "loss": 34.8565, "step": 17980 }, { "epoch": 47.490260812149224, "grad_norm": 1073.790283203125, "learning_rate": 2.9421945462265177e-06, "loss": 35.86, "step": 17981 }, { "epoch": 47.492901947837574, "grad_norm": 1105.089111328125, "learning_rate": 2.9358075497848693e-06, "loss": 33.7299, "step": 17982 }, { "epoch": 47.49554308352592, "grad_norm": 562.231689453125, "learning_rate": 2.9294274524656605e-06, "loss": 33.5148, "step": 17983 }, { "epoch": 47.49818421921426, "grad_norm": 662.9395141601562, "learning_rate": 2.923054254447083e-06, "loss": 34.0186, "step": 17984 }, { "epoch": 47.50082535490261, "grad_norm": 2050.92431640625, "learning_rate": 2.9166879559070215e-06, "loss": 33.038, "step": 17985 }, { "epoch": 47.50346649059095, "grad_norm": 2368.548828125, "learning_rate": 2.9103285570233616e-06, "loss": 33.937, "step": 17986 }, { "epoch": 47.5061076262793, "grad_norm": 631.9532470703125, "learning_rate": 2.9039760579736008e-06, "loss": 34.9997, "step": 17987 }, { "epoch": 47.508748761967645, "grad_norm": 10877.1142578125, "learning_rate": 2.89763045893518e-06, "loss": 34.8115, "step": 17988 }, { "epoch": 47.511389897655995, "grad_norm": 1084.19140625, "learning_rate": 2.891291760085235e-06, "loss": 34.8469, "step": 17989 }, { "epoch": 47.51403103334434, "grad_norm": 776.2745361328125, "learning_rate": 2.884959961600847e-06, "loss": 33.6832, "step": 17990 }, { "epoch": 47.51667216903268, "grad_norm": 930.2211303710938, "learning_rate": 2.8786350636587634e-06, "loss": 34.5243, "step": 17991 }, { "epoch": 47.51931330472103, "grad_norm": 1522.8179931640625, "learning_rate": 2.87231706643562e-06, "loss": 33.7595, "step": 17992 }, { "epoch": 47.521954440409374, "grad_norm": 1018.2989501953125, "learning_rate": 2.866005970107888e-06, "loss": 35.4668, "step": 17993 }, { "epoch": 47.524595576097724, "grad_norm": 1236.918701171875, "learning_rate": 2.859701774851731e-06, "loss": 36.9001, "step": 17994 }, { "epoch": 47.52723671178607, "grad_norm": 3676.5283203125, "learning_rate": 2.853404480843258e-06, "loss": 39.9337, "step": 17995 }, { "epoch": 47.52987784747442, "grad_norm": 1501.11572265625, "learning_rate": 2.8471140882582456e-06, "loss": 40.4578, "step": 17996 }, { "epoch": 47.53251898316276, "grad_norm": 698.4735107421875, "learning_rate": 2.8408305972724146e-06, "loss": 39.0664, "step": 17997 }, { "epoch": 47.5351601188511, "grad_norm": 769.3230590820312, "learning_rate": 2.8345540080612074e-06, "loss": 37.3437, "step": 17998 }, { "epoch": 47.53780125453945, "grad_norm": 918.0499267578125, "learning_rate": 2.8282843207998456e-06, "loss": 39.2402, "step": 17999 }, { "epoch": 47.540442390227795, "grad_norm": 773.7362060546875, "learning_rate": 2.822021535663466e-06, "loss": 40.0877, "step": 18000 }, { "epoch": 47.540442390227795, "eval_loss": 3.838897943496704, "eval_runtime": 2.149, "eval_samples_per_second": 230.339, "eval_steps_per_second": 28.851, "step": 18000 }, { "epoch": 47.543083525916146, "grad_norm": 630.7689819335938, "learning_rate": 2.815765652826957e-06, "loss": 39.7411, "step": 18001 }, { "epoch": 47.54572466160449, "grad_norm": 903.9835205078125, "learning_rate": 2.8095166724649566e-06, "loss": 41.7299, "step": 18002 }, { "epoch": 47.54836579729284, "grad_norm": 3743.07421875, "learning_rate": 2.8032745947519923e-06, "loss": 38.8996, "step": 18003 }, { "epoch": 47.55100693298118, "grad_norm": 840.7726440429688, "learning_rate": 2.797039419862396e-06, "loss": 39.6218, "step": 18004 }, { "epoch": 47.55364806866953, "grad_norm": 2895.966796875, "learning_rate": 2.790811147970279e-06, "loss": 39.6564, "step": 18005 }, { "epoch": 47.556289204357874, "grad_norm": 561.0161743164062, "learning_rate": 2.7845897792495023e-06, "loss": 36.5512, "step": 18006 }, { "epoch": 47.55893034004622, "grad_norm": 807.8176879882812, "learning_rate": 2.778375313873871e-06, "loss": 37.0582, "step": 18007 }, { "epoch": 47.56157147573457, "grad_norm": 1180.3621826171875, "learning_rate": 2.7721677520168575e-06, "loss": 38.0639, "step": 18008 }, { "epoch": 47.56421261142291, "grad_norm": 781.0684814453125, "learning_rate": 2.765967093851823e-06, "loss": 36.0912, "step": 18009 }, { "epoch": 47.56685374711126, "grad_norm": 827.7734985351562, "learning_rate": 2.7597733395519353e-06, "loss": 34.9258, "step": 18010 }, { "epoch": 47.5694948827996, "grad_norm": 800.1716918945312, "learning_rate": 2.7535864892901386e-06, "loss": 36.4142, "step": 18011 }, { "epoch": 47.57213601848795, "grad_norm": 511.5994873046875, "learning_rate": 2.7474065432392113e-06, "loss": 36.3416, "step": 18012 }, { "epoch": 47.574777154176296, "grad_norm": 683.2205200195312, "learning_rate": 2.7412335015717104e-06, "loss": 35.4176, "step": 18013 }, { "epoch": 47.57741828986464, "grad_norm": 473.4149169921875, "learning_rate": 2.735067364459998e-06, "loss": 34.6589, "step": 18014 }, { "epoch": 47.58005942555299, "grad_norm": 910.0756225585938, "learning_rate": 2.7289081320762966e-06, "loss": 34.7406, "step": 18015 }, { "epoch": 47.58270056124133, "grad_norm": 920.92724609375, "learning_rate": 2.7227558045925806e-06, "loss": 34.2692, "step": 18016 }, { "epoch": 47.58534169692968, "grad_norm": 1152.830078125, "learning_rate": 2.716610382180601e-06, "loss": 35.3779, "step": 18017 }, { "epoch": 47.587982832618025, "grad_norm": 2024.9176025390625, "learning_rate": 2.7104718650120817e-06, "loss": 39.7527, "step": 18018 }, { "epoch": 47.590623968306375, "grad_norm": 3964.97314453125, "learning_rate": 2.7043402532583305e-06, "loss": 11.5538, "step": 18019 }, { "epoch": 47.59326510399472, "grad_norm": 1706.701171875, "learning_rate": 2.698215547090599e-06, "loss": 12.1476, "step": 18020 }, { "epoch": 47.59590623968306, "grad_norm": 2556.04931640625, "learning_rate": 2.6920977466799723e-06, "loss": 10.1584, "step": 18021 }, { "epoch": 47.59854737537141, "grad_norm": 2074.125244140625, "learning_rate": 2.685986852197203e-06, "loss": 11.238, "step": 18022 }, { "epoch": 47.60118851105975, "grad_norm": 413.3690490722656, "learning_rate": 2.679882863813016e-06, "loss": 11.7456, "step": 18023 }, { "epoch": 47.6038296467481, "grad_norm": 1048.3548583984375, "learning_rate": 2.673785781697746e-06, "loss": 11.249, "step": 18024 }, { "epoch": 47.606470782436446, "grad_norm": 6501.7666015625, "learning_rate": 2.6676956060217584e-06, "loss": 11.0319, "step": 18025 }, { "epoch": 47.609111918124796, "grad_norm": 413.9754943847656, "learning_rate": 2.6616123369550825e-06, "loss": 13.3884, "step": 18026 }, { "epoch": 47.61175305381314, "grad_norm": 1772.0491943359375, "learning_rate": 2.655535974667583e-06, "loss": 9.2425, "step": 18027 }, { "epoch": 47.61439418950149, "grad_norm": 489.201171875, "learning_rate": 2.649466519328958e-06, "loss": 27.3011, "step": 18028 }, { "epoch": 47.61703532518983, "grad_norm": 1122.376953125, "learning_rate": 2.6434039711086545e-06, "loss": 33.5016, "step": 18029 }, { "epoch": 47.619676460878175, "grad_norm": 390.45709228515625, "learning_rate": 2.637348330176009e-06, "loss": 33.2101, "step": 18030 }, { "epoch": 47.622317596566525, "grad_norm": 1015.042236328125, "learning_rate": 2.631299596700082e-06, "loss": 34.5055, "step": 18031 }, { "epoch": 47.62495873225487, "grad_norm": 663.8632202148438, "learning_rate": 2.6252577708498205e-06, "loss": 33.99, "step": 18032 }, { "epoch": 47.62759986794322, "grad_norm": 1222.1092529296875, "learning_rate": 2.6192228527939233e-06, "loss": 33.8634, "step": 18033 }, { "epoch": 47.63024100363156, "grad_norm": 1851.5537109375, "learning_rate": 2.6131948427008943e-06, "loss": 34.2945, "step": 18034 }, { "epoch": 47.63288213931991, "grad_norm": 1033.9544677734375, "learning_rate": 2.6071737407390716e-06, "loss": 33.2815, "step": 18035 }, { "epoch": 47.635523275008254, "grad_norm": 1667.09326171875, "learning_rate": 2.6011595470766256e-06, "loss": 33.5376, "step": 18036 }, { "epoch": 47.6381644106966, "grad_norm": 622.14404296875, "learning_rate": 2.5951522618814505e-06, "loss": 34.303, "step": 18037 }, { "epoch": 47.64080554638495, "grad_norm": 793.4998168945312, "learning_rate": 2.5891518853213003e-06, "loss": 36.0037, "step": 18038 }, { "epoch": 47.64344668207329, "grad_norm": 705.8638916015625, "learning_rate": 2.5831584175637635e-06, "loss": 33.9568, "step": 18039 }, { "epoch": 47.64608781776164, "grad_norm": 984.82177734375, "learning_rate": 2.5771718587761785e-06, "loss": 34.5021, "step": 18040 }, { "epoch": 47.64872895344998, "grad_norm": 1602.15625, "learning_rate": 2.5711922091257167e-06, "loss": 34.5675, "step": 18041 }, { "epoch": 47.65137008913833, "grad_norm": 1780.7213134765625, "learning_rate": 2.565219468779384e-06, "loss": 33.269, "step": 18042 }, { "epoch": 47.654011224826675, "grad_norm": 1558.4505615234375, "learning_rate": 2.5592536379039634e-06, "loss": 34.6726, "step": 18043 }, { "epoch": 47.65665236051502, "grad_norm": 7256.4970703125, "learning_rate": 2.5532947166660157e-06, "loss": 35.2836, "step": 18044 }, { "epoch": 47.65929349620337, "grad_norm": 655.556884765625, "learning_rate": 2.5473427052319087e-06, "loss": 38.4146, "step": 18045 }, { "epoch": 47.66193463189171, "grad_norm": 1477.928955078125, "learning_rate": 2.5413976037679533e-06, "loss": 42.5705, "step": 18046 }, { "epoch": 47.66457576758006, "grad_norm": 893.5469360351562, "learning_rate": 2.535459412440072e-06, "loss": 38.1633, "step": 18047 }, { "epoch": 47.667216903268404, "grad_norm": 911.4698486328125, "learning_rate": 2.5295281314141603e-06, "loss": 37.0953, "step": 18048 }, { "epoch": 47.669858038956754, "grad_norm": 513.1961669921875, "learning_rate": 2.5236037608557527e-06, "loss": 39.3887, "step": 18049 }, { "epoch": 47.6724991746451, "grad_norm": 836.9265747070312, "learning_rate": 2.517686300930383e-06, "loss": 40.8664, "step": 18050 }, { "epoch": 47.67514031033345, "grad_norm": 678.1201171875, "learning_rate": 2.511775751803197e-06, "loss": 41.0126, "step": 18051 }, { "epoch": 47.67778144602179, "grad_norm": 489.2272033691406, "learning_rate": 2.5058721136393125e-06, "loss": 42.9219, "step": 18052 }, { "epoch": 47.68042258171013, "grad_norm": 1645.7225341796875, "learning_rate": 2.4999753866035703e-06, "loss": 41.1459, "step": 18053 }, { "epoch": 47.68306371739848, "grad_norm": 840.958740234375, "learning_rate": 2.494085570860616e-06, "loss": 41.2607, "step": 18054 }, { "epoch": 47.685704853086825, "grad_norm": 773.4304809570312, "learning_rate": 2.488202666574929e-06, "loss": 38.2921, "step": 18055 }, { "epoch": 47.688345988775175, "grad_norm": 825.3190307617188, "learning_rate": 2.4823266739107675e-06, "loss": 39.9585, "step": 18056 }, { "epoch": 47.69098712446352, "grad_norm": 653.3385620117188, "learning_rate": 2.4764575930322776e-06, "loss": 37.6666, "step": 18057 }, { "epoch": 47.69362826015187, "grad_norm": 1716.3214111328125, "learning_rate": 2.4705954241032724e-06, "loss": 36.2967, "step": 18058 }, { "epoch": 47.69626939584021, "grad_norm": 1018.2378540039062, "learning_rate": 2.46474016728751e-06, "loss": 37.0525, "step": 18059 }, { "epoch": 47.698910531528554, "grad_norm": 1891.15869140625, "learning_rate": 2.4588918227484435e-06, "loss": 34.0466, "step": 18060 }, { "epoch": 47.701551667216904, "grad_norm": 602.9213256835938, "learning_rate": 2.453050390649442e-06, "loss": 34.9145, "step": 18061 }, { "epoch": 47.70419280290525, "grad_norm": 734.5121459960938, "learning_rate": 2.447215871153541e-06, "loss": 35.1184, "step": 18062 }, { "epoch": 47.7068339385936, "grad_norm": 1207.354736328125, "learning_rate": 2.4413882644237496e-06, "loss": 35.1702, "step": 18063 }, { "epoch": 47.70947507428194, "grad_norm": 999.3782348632812, "learning_rate": 2.4355675706227707e-06, "loss": 34.2624, "step": 18064 }, { "epoch": 47.71211620997029, "grad_norm": 1294.3067626953125, "learning_rate": 2.429753789913114e-06, "loss": 35.3192, "step": 18065 }, { "epoch": 47.71475734565863, "grad_norm": 1157.2305908203125, "learning_rate": 2.4239469224571762e-06, "loss": 34.5536, "step": 18066 }, { "epoch": 47.717398481346976, "grad_norm": 1258.2706298828125, "learning_rate": 2.418146968417079e-06, "loss": 36.2838, "step": 18067 }, { "epoch": 47.720039617035326, "grad_norm": 1357.633056640625, "learning_rate": 2.4123539279548025e-06, "loss": 29.2542, "step": 18068 }, { "epoch": 47.72268075272367, "grad_norm": 1139.40673828125, "learning_rate": 2.406567801232079e-06, "loss": 9.4175, "step": 18069 }, { "epoch": 47.72532188841202, "grad_norm": 1622.960205078125, "learning_rate": 2.4007885884105294e-06, "loss": 14.0221, "step": 18070 }, { "epoch": 47.72796302410036, "grad_norm": 608.1885986328125, "learning_rate": 2.395016289651497e-06, "loss": 10.8053, "step": 18071 }, { "epoch": 47.73060415978871, "grad_norm": 9338.513671875, "learning_rate": 2.389250905116158e-06, "loss": 12.7133, "step": 18072 }, { "epoch": 47.733245295477055, "grad_norm": 1216.6363525390625, "learning_rate": 2.3834924349655786e-06, "loss": 14.9321, "step": 18073 }, { "epoch": 47.735886431165405, "grad_norm": 1929.1591796875, "learning_rate": 2.3777408793604903e-06, "loss": 15.2222, "step": 18074 }, { "epoch": 47.73852756685375, "grad_norm": 1580.1905517578125, "learning_rate": 2.3719962384615436e-06, "loss": 11.3427, "step": 18075 }, { "epoch": 47.74116870254209, "grad_norm": 5556.27685546875, "learning_rate": 2.366258512429109e-06, "loss": 16.5326, "step": 18076 }, { "epoch": 47.74380983823044, "grad_norm": 4260.92236328125, "learning_rate": 2.360527701423476e-06, "loss": 11.2403, "step": 18077 }, { "epoch": 47.74645097391878, "grad_norm": 3200.038818359375, "learning_rate": 2.3548038056045994e-06, "loss": 36.0436, "step": 18078 }, { "epoch": 47.74909210960713, "grad_norm": 625.0850830078125, "learning_rate": 2.3490868251323515e-06, "loss": 35.6164, "step": 18079 }, { "epoch": 47.751733245295476, "grad_norm": 803.2149658203125, "learning_rate": 2.34337676016641e-06, "loss": 34.3702, "step": 18080 }, { "epoch": 47.754374380983826, "grad_norm": 981.0050048828125, "learning_rate": 2.3376736108661757e-06, "loss": 35.4096, "step": 18081 }, { "epoch": 47.75701551667217, "grad_norm": 736.9381713867188, "learning_rate": 2.3319773773909094e-06, "loss": 34.9697, "step": 18082 }, { "epoch": 47.75965665236051, "grad_norm": 1098.841064453125, "learning_rate": 2.326288059899706e-06, "loss": 35.1767, "step": 18083 }, { "epoch": 47.76229778804886, "grad_norm": 1078.706787109375, "learning_rate": 2.320605658551411e-06, "loss": 34.8063, "step": 18084 }, { "epoch": 47.764938923737205, "grad_norm": 1018.0624389648438, "learning_rate": 2.3149301735047035e-06, "loss": 35.0623, "step": 18085 }, { "epoch": 47.767580059425555, "grad_norm": 1287.7633056640625, "learning_rate": 2.309261604918067e-06, "loss": 34.3702, "step": 18086 }, { "epoch": 47.7702211951139, "grad_norm": 1039.46337890625, "learning_rate": 2.303599952949792e-06, "loss": 34.7459, "step": 18087 }, { "epoch": 47.77286233080225, "grad_norm": 631.0401000976562, "learning_rate": 2.297945217758002e-06, "loss": 34.8645, "step": 18088 }, { "epoch": 47.77550346649059, "grad_norm": 1332.13818359375, "learning_rate": 2.2922973995005982e-06, "loss": 33.1813, "step": 18089 }, { "epoch": 47.778144602178934, "grad_norm": 778.39208984375, "learning_rate": 2.2866564983352324e-06, "loss": 34.1056, "step": 18090 }, { "epoch": 47.780785737867284, "grad_norm": 493.998291015625, "learning_rate": 2.2810225144195007e-06, "loss": 35.9862, "step": 18091 }, { "epoch": 47.78342687355563, "grad_norm": 2156.07763671875, "learning_rate": 2.275395447910694e-06, "loss": 34.5396, "step": 18092 }, { "epoch": 47.78606800924398, "grad_norm": 1582.9263916015625, "learning_rate": 2.269775298965937e-06, "loss": 34.4771, "step": 18093 }, { "epoch": 47.78870914493232, "grad_norm": 1337.967041015625, "learning_rate": 2.2641620677422147e-06, "loss": 35.1032, "step": 18094 }, { "epoch": 47.79135028062067, "grad_norm": 2163.40966796875, "learning_rate": 2.2585557543962078e-06, "loss": 37.0725, "step": 18095 }, { "epoch": 47.79399141630901, "grad_norm": 962.0109252929688, "learning_rate": 2.2529563590845128e-06, "loss": 39.8613, "step": 18096 }, { "epoch": 47.79663255199736, "grad_norm": 1605.3648681640625, "learning_rate": 2.247363881963449e-06, "loss": 38.1584, "step": 18097 }, { "epoch": 47.799273687685705, "grad_norm": 786.7546997070312, "learning_rate": 2.241778323189225e-06, "loss": 39.4549, "step": 18098 }, { "epoch": 47.80191482337405, "grad_norm": 836.6746215820312, "learning_rate": 2.2361996829178277e-06, "loss": 38.2423, "step": 18099 }, { "epoch": 47.8045559590624, "grad_norm": 559.4232788085938, "learning_rate": 2.230627961304993e-06, "loss": 40.4127, "step": 18100 }, { "epoch": 47.80719709475074, "grad_norm": 709.4594116210938, "learning_rate": 2.2250631585063187e-06, "loss": 38.9325, "step": 18101 }, { "epoch": 47.80983823043909, "grad_norm": 693.2744140625, "learning_rate": 2.2195052746772083e-06, "loss": 42.249, "step": 18102 }, { "epoch": 47.812479366127434, "grad_norm": 693.0430908203125, "learning_rate": 2.2139543099728432e-06, "loss": 40.3458, "step": 18103 }, { "epoch": 47.815120501815784, "grad_norm": 605.192138671875, "learning_rate": 2.208410264548266e-06, "loss": 38.6654, "step": 18104 }, { "epoch": 47.81776163750413, "grad_norm": 2065.728759765625, "learning_rate": 2.20287313855827e-06, "loss": 36.6762, "step": 18105 }, { "epoch": 47.82040277319247, "grad_norm": 749.07421875, "learning_rate": 2.1973429321574812e-06, "loss": 38.6527, "step": 18106 }, { "epoch": 47.82304390888082, "grad_norm": 1448.152587890625, "learning_rate": 2.1918196455003037e-06, "loss": 36.7823, "step": 18107 }, { "epoch": 47.82568504456916, "grad_norm": 794.9075927734375, "learning_rate": 2.186303278741003e-06, "loss": 37.025, "step": 18108 }, { "epoch": 47.82832618025751, "grad_norm": 1721.6875, "learning_rate": 2.180793832033623e-06, "loss": 36.226, "step": 18109 }, { "epoch": 47.830967315945855, "grad_norm": 1358.6903076171875, "learning_rate": 2.1752913055319844e-06, "loss": 36.267, "step": 18110 }, { "epoch": 47.833608451634205, "grad_norm": 1057.93115234375, "learning_rate": 2.169795699389743e-06, "loss": 34.9158, "step": 18111 }, { "epoch": 47.83624958732255, "grad_norm": 727.7908935546875, "learning_rate": 2.1643070137604138e-06, "loss": 34.9948, "step": 18112 }, { "epoch": 47.83889072301089, "grad_norm": 891.5270385742188, "learning_rate": 2.1588252487971805e-06, "loss": 35.7876, "step": 18113 }, { "epoch": 47.84153185869924, "grad_norm": 1612.728759765625, "learning_rate": 2.1533504046531705e-06, "loss": 35.8902, "step": 18114 }, { "epoch": 47.844172994387584, "grad_norm": 752.7889404296875, "learning_rate": 2.1478824814812893e-06, "loss": 35.2081, "step": 18115 }, { "epoch": 47.846814130075934, "grad_norm": 711.0798950195312, "learning_rate": 2.1424214794341646e-06, "loss": 34.7241, "step": 18116 }, { "epoch": 47.84945526576428, "grad_norm": 1765.5496826171875, "learning_rate": 2.1369673986643412e-06, "loss": 33.3807, "step": 18117 }, { "epoch": 47.85209640145263, "grad_norm": 4800.70849609375, "learning_rate": 2.1315202393240863e-06, "loss": 12.0469, "step": 18118 }, { "epoch": 47.85473753714097, "grad_norm": 1768.3004150390625, "learning_rate": 2.126080001565528e-06, "loss": 10.9986, "step": 18119 }, { "epoch": 47.85737867282932, "grad_norm": 902.3251342773438, "learning_rate": 2.1206466855405725e-06, "loss": 9.8254, "step": 18120 }, { "epoch": 47.86001980851766, "grad_norm": 26406.478515625, "learning_rate": 2.115220291400932e-06, "loss": 10.326, "step": 18121 }, { "epoch": 47.862660944206006, "grad_norm": 3531.323486328125, "learning_rate": 2.10980081929818e-06, "loss": 12.4635, "step": 18122 }, { "epoch": 47.865302079894356, "grad_norm": 9535.666015625, "learning_rate": 2.1043882693836113e-06, "loss": 14.7388, "step": 18123 }, { "epoch": 47.8679432155827, "grad_norm": 949.677490234375, "learning_rate": 2.0989826418083834e-06, "loss": 10.7452, "step": 18124 }, { "epoch": 47.87058435127105, "grad_norm": 2005.2520751953125, "learning_rate": 2.0935839367234033e-06, "loss": 10.1301, "step": 18125 }, { "epoch": 47.87322548695939, "grad_norm": 920.2098388671875, "learning_rate": 2.0881921542794945e-06, "loss": 10.0183, "step": 18126 }, { "epoch": 47.87586662264774, "grad_norm": 834.8713989257812, "learning_rate": 2.0828072946272035e-06, "loss": 22.0662, "step": 18127 }, { "epoch": 47.878507758336085, "grad_norm": 1043.279296875, "learning_rate": 2.0774293579168546e-06, "loss": 34.5931, "step": 18128 }, { "epoch": 47.88114889402443, "grad_norm": 1255.8963623046875, "learning_rate": 2.0720583442986605e-06, "loss": 34.2199, "step": 18129 }, { "epoch": 47.88379002971278, "grad_norm": 800.3694458007812, "learning_rate": 2.066694253922613e-06, "loss": 34.3137, "step": 18130 }, { "epoch": 47.88643116540112, "grad_norm": 1116.5546875, "learning_rate": 2.0613370869384804e-06, "loss": 34.3493, "step": 18131 }, { "epoch": 47.88907230108947, "grad_norm": 598.0950927734375, "learning_rate": 2.0559868434958384e-06, "loss": 33.0823, "step": 18132 }, { "epoch": 47.89171343677781, "grad_norm": 968.5840454101562, "learning_rate": 2.050643523744122e-06, "loss": 34.3483, "step": 18133 }, { "epoch": 47.89435457246616, "grad_norm": 1270.1021728515625, "learning_rate": 2.0453071278325464e-06, "loss": 34.0604, "step": 18134 }, { "epoch": 47.896995708154506, "grad_norm": 2250.531005859375, "learning_rate": 2.0399776559101025e-06, "loss": 33.7818, "step": 18135 }, { "epoch": 47.89963684384285, "grad_norm": 1128.9453125, "learning_rate": 2.0346551081256438e-06, "loss": 34.9605, "step": 18136 }, { "epoch": 47.9022779795312, "grad_norm": 1988.5699462890625, "learning_rate": 2.0293394846277457e-06, "loss": 34.3676, "step": 18137 }, { "epoch": 47.90491911521954, "grad_norm": 795.5118408203125, "learning_rate": 2.024030785564901e-06, "loss": 35.4694, "step": 18138 }, { "epoch": 47.90756025090789, "grad_norm": 963.9420776367188, "learning_rate": 2.018729011085324e-06, "loss": 35.2057, "step": 18139 }, { "epoch": 47.910201386596235, "grad_norm": 838.0574340820312, "learning_rate": 2.0134341613370634e-06, "loss": 34.6133, "step": 18140 }, { "epoch": 47.912842522284585, "grad_norm": 1743.6007080078125, "learning_rate": 2.0081462364679725e-06, "loss": 34.2443, "step": 18141 }, { "epoch": 47.91548365797293, "grad_norm": 1233.678955078125, "learning_rate": 2.0028652366257117e-06, "loss": 34.3994, "step": 18142 }, { "epoch": 47.91812479366128, "grad_norm": 1312.4443359375, "learning_rate": 1.9975911619577738e-06, "loss": 35.3684, "step": 18143 }, { "epoch": 47.92076592934962, "grad_norm": 1476.531494140625, "learning_rate": 1.9923240126114018e-06, "loss": 35.4276, "step": 18144 }, { "epoch": 47.92340706503796, "grad_norm": 1385.271728515625, "learning_rate": 1.9870637887337282e-06, "loss": 37.6403, "step": 18145 }, { "epoch": 47.926048200726314, "grad_norm": 1003.9303588867188, "learning_rate": 1.9818104904715805e-06, "loss": 38.849, "step": 18146 }, { "epoch": 47.92868933641466, "grad_norm": 699.1722412109375, "learning_rate": 1.976564117971702e-06, "loss": 38.6578, "step": 18147 }, { "epoch": 47.93133047210301, "grad_norm": 1758.7218017578125, "learning_rate": 1.9713246713805587e-06, "loss": 40.7893, "step": 18148 }, { "epoch": 47.93397160779135, "grad_norm": 677.7974243164062, "learning_rate": 1.9660921508444506e-06, "loss": 42.2209, "step": 18149 }, { "epoch": 47.9366127434797, "grad_norm": 1291.9049072265625, "learning_rate": 1.9608665565095386e-06, "loss": 42.0268, "step": 18150 }, { "epoch": 47.93925387916804, "grad_norm": 956.5413818359375, "learning_rate": 1.9556478885217334e-06, "loss": 37.9482, "step": 18151 }, { "epoch": 47.941895014856385, "grad_norm": 647.3595581054688, "learning_rate": 1.9504361470267518e-06, "loss": 37.1282, "step": 18152 }, { "epoch": 47.944536150544735, "grad_norm": 807.302734375, "learning_rate": 1.9452313321701166e-06, "loss": 38.6634, "step": 18153 }, { "epoch": 47.94717728623308, "grad_norm": 596.1516723632812, "learning_rate": 1.9400334440971833e-06, "loss": 36.3026, "step": 18154 }, { "epoch": 47.94981842192143, "grad_norm": 653.9853515625, "learning_rate": 1.9348424829530864e-06, "loss": 35.0793, "step": 18155 }, { "epoch": 47.95245955760977, "grad_norm": 614.8616333007812, "learning_rate": 1.929658448882793e-06, "loss": 34.3699, "step": 18156 }, { "epoch": 47.95510069329812, "grad_norm": 906.5264892578125, "learning_rate": 1.9244813420310758e-06, "loss": 34.5576, "step": 18157 }, { "epoch": 47.957741828986464, "grad_norm": 1895.39111328125, "learning_rate": 1.9193111625424865e-06, "loss": 35.2084, "step": 18158 }, { "epoch": 47.96038296467481, "grad_norm": 8955.599609375, "learning_rate": 1.914147910561409e-06, "loss": 29.8557, "step": 18159 }, { "epoch": 47.96302410036316, "grad_norm": 1907.819580078125, "learning_rate": 1.908991586232006e-06, "loss": 11.5082, "step": 18160 }, { "epoch": 47.9656652360515, "grad_norm": 2838.987548828125, "learning_rate": 1.9038421896983015e-06, "loss": 12.3675, "step": 18161 }, { "epoch": 47.96830637173985, "grad_norm": 6775.8125, "learning_rate": 1.8986997211040413e-06, "loss": 10.241, "step": 18162 }, { "epoch": 47.97094750742819, "grad_norm": 2005.406982421875, "learning_rate": 1.8935641805928604e-06, "loss": 8.9331, "step": 18163 }, { "epoch": 47.97358864311654, "grad_norm": 459.2377624511719, "learning_rate": 1.8884355683081445e-06, "loss": 13.2005, "step": 18164 }, { "epoch": 47.976229778804885, "grad_norm": 682.9078979492188, "learning_rate": 1.8833138843931118e-06, "loss": 34.7379, "step": 18165 }, { "epoch": 47.978870914493235, "grad_norm": 791.9329833984375, "learning_rate": 1.8781991289907874e-06, "loss": 35.7436, "step": 18166 }, { "epoch": 47.98151205018158, "grad_norm": 801.924072265625, "learning_rate": 1.873091302244001e-06, "loss": 34.4812, "step": 18167 }, { "epoch": 47.98415318586992, "grad_norm": 916.9019775390625, "learning_rate": 1.8679904042953888e-06, "loss": 34.2005, "step": 18168 }, { "epoch": 47.98679432155827, "grad_norm": 630.6921997070312, "learning_rate": 1.8628964352873922e-06, "loss": 34.0617, "step": 18169 }, { "epoch": 47.989435457246614, "grad_norm": 1703.0595703125, "learning_rate": 1.857809395362231e-06, "loss": 33.9926, "step": 18170 }, { "epoch": 47.992076592934964, "grad_norm": 699.2515258789062, "learning_rate": 1.8527292846619859e-06, "loss": 36.6131, "step": 18171 }, { "epoch": 47.99471772862331, "grad_norm": 2553.591796875, "learning_rate": 1.8476561033284879e-06, "loss": 33.3555, "step": 18172 }, { "epoch": 47.99735886431166, "grad_norm": 1589.2626953125, "learning_rate": 1.8425898515034568e-06, "loss": 34.7557, "step": 18173 }, { "epoch": 48.0, "grad_norm": 955.7930297851562, "learning_rate": 1.8375305293282796e-06, "loss": 36.816, "step": 18174 }, { "epoch": 48.00264113568834, "grad_norm": 596.4268188476562, "learning_rate": 1.8324781369443154e-06, "loss": 37.1195, "step": 18175 }, { "epoch": 48.00528227137669, "grad_norm": 625.0328369140625, "learning_rate": 1.8274326744926185e-06, "loss": 38.1403, "step": 18176 }, { "epoch": 48.007923407065036, "grad_norm": 1306.809814453125, "learning_rate": 1.8223941421140755e-06, "loss": 38.5418, "step": 18177 }, { "epoch": 48.010564542753386, "grad_norm": 984.1712036132812, "learning_rate": 1.8173625399493798e-06, "loss": 39.6166, "step": 18178 }, { "epoch": 48.01320567844173, "grad_norm": 1098.37158203125, "learning_rate": 1.812337868139058e-06, "loss": 39.6653, "step": 18179 }, { "epoch": 48.01584681413008, "grad_norm": 1429.20458984375, "learning_rate": 1.807320126823414e-06, "loss": 41.1247, "step": 18180 }, { "epoch": 48.01848794981842, "grad_norm": 929.8070068359375, "learning_rate": 1.8023093161425309e-06, "loss": 38.1488, "step": 18181 }, { "epoch": 48.021129085506765, "grad_norm": 663.3582763671875, "learning_rate": 1.7973054362363795e-06, "loss": 40.1417, "step": 18182 }, { "epoch": 48.023770221195115, "grad_norm": 981.3143310546875, "learning_rate": 1.7923084872446815e-06, "loss": 39.7526, "step": 18183 }, { "epoch": 48.02641135688346, "grad_norm": 685.9434204101562, "learning_rate": 1.7873184693069366e-06, "loss": 37.5915, "step": 18184 }, { "epoch": 48.02905249257181, "grad_norm": 1061.3839111328125, "learning_rate": 1.7823353825625333e-06, "loss": 38.0152, "step": 18185 }, { "epoch": 48.03169362826015, "grad_norm": 940.9661254882812, "learning_rate": 1.77735922715061e-06, "loss": 36.8672, "step": 18186 }, { "epoch": 48.0343347639485, "grad_norm": 1152.9708251953125, "learning_rate": 1.7723900032100837e-06, "loss": 35.7884, "step": 18187 }, { "epoch": 48.03697589963684, "grad_norm": 471.3817443847656, "learning_rate": 1.7674277108797877e-06, "loss": 35.7734, "step": 18188 }, { "epoch": 48.03961703532519, "grad_norm": 828.1744384765625, "learning_rate": 1.7624723502982498e-06, "loss": 34.4249, "step": 18189 }, { "epoch": 48.042258171013536, "grad_norm": 811.539794921875, "learning_rate": 1.757523921603832e-06, "loss": 35.2199, "step": 18190 }, { "epoch": 48.04489930670188, "grad_norm": 496.80645751953125, "learning_rate": 1.752582424934729e-06, "loss": 34.4575, "step": 18191 }, { "epoch": 48.04754044239023, "grad_norm": 1681.7230224609375, "learning_rate": 1.7476478604289692e-06, "loss": 35.7145, "step": 18192 }, { "epoch": 48.05018157807857, "grad_norm": 800.3097534179688, "learning_rate": 1.742720228224276e-06, "loss": 35.2613, "step": 18193 }, { "epoch": 48.05282271376692, "grad_norm": 1545.675048828125, "learning_rate": 1.7377995284582892e-06, "loss": 34.9444, "step": 18194 }, { "epoch": 48.055463849455265, "grad_norm": 639.2427368164062, "learning_rate": 1.7328857612684267e-06, "loss": 34.5152, "step": 18195 }, { "epoch": 48.058104985143615, "grad_norm": 732.3209838867188, "learning_rate": 1.7279789267918843e-06, "loss": 34.5446, "step": 18196 }, { "epoch": 48.06074612083196, "grad_norm": 4505.48095703125, "learning_rate": 1.7230790251656914e-06, "loss": 27.5383, "step": 18197 }, { "epoch": 48.0633872565203, "grad_norm": 6045.12744140625, "learning_rate": 1.718186056526655e-06, "loss": 12.1474, "step": 18198 }, { "epoch": 48.06602839220865, "grad_norm": 14920.935546875, "learning_rate": 1.7133000210114436e-06, "loss": 9.5043, "step": 18199 }, { "epoch": 48.06866952789699, "grad_norm": 1466.5836181640625, "learning_rate": 1.708420918756476e-06, "loss": 10.4014, "step": 18200 }, { "epoch": 48.06866952789699, "eval_loss": 3.7921464443206787, "eval_runtime": 2.1345, "eval_samples_per_second": 231.903, "eval_steps_per_second": 29.046, "step": 18200 }, { "epoch": 48.07131066358534, "grad_norm": 418.94000244140625, "learning_rate": 1.7035487498979763e-06, "loss": 10.76, "step": 18201 }, { "epoch": 48.073951799273686, "grad_norm": 931.4216918945312, "learning_rate": 1.6986835145720582e-06, "loss": 8.7067, "step": 18202 }, { "epoch": 48.07659293496204, "grad_norm": 20132.71875, "learning_rate": 1.6938252129145016e-06, "loss": 12.7585, "step": 18203 }, { "epoch": 48.07923407065038, "grad_norm": 759.6986083984375, "learning_rate": 1.6889738450610592e-06, "loss": 10.4545, "step": 18204 }, { "epoch": 48.08187520633872, "grad_norm": 889.40771484375, "learning_rate": 1.6841294111471228e-06, "loss": 13.1808, "step": 18205 }, { "epoch": 48.08451634202707, "grad_norm": 1792.2752685546875, "learning_rate": 1.6792919113080008e-06, "loss": 8.8007, "step": 18206 }, { "epoch": 48.087157477715415, "grad_norm": 465.8105773925781, "learning_rate": 1.6744613456788072e-06, "loss": 24.7201, "step": 18207 }, { "epoch": 48.089798613403765, "grad_norm": 840.0628051757812, "learning_rate": 1.6696377143943786e-06, "loss": 34.6184, "step": 18208 }, { "epoch": 48.09243974909211, "grad_norm": 1393.989990234375, "learning_rate": 1.6648210175894685e-06, "loss": 33.4729, "step": 18209 }, { "epoch": 48.09508088478046, "grad_norm": 1751.86376953125, "learning_rate": 1.6600112553985246e-06, "loss": 33.0742, "step": 18210 }, { "epoch": 48.0977220204688, "grad_norm": 778.5565185546875, "learning_rate": 1.6552084279558843e-06, "loss": 35.7043, "step": 18211 }, { "epoch": 48.10036315615715, "grad_norm": 786.1278686523438, "learning_rate": 1.6504125353956622e-06, "loss": 33.9375, "step": 18212 }, { "epoch": 48.103004291845494, "grad_norm": 1906.3104248046875, "learning_rate": 1.6456235778518069e-06, "loss": 35.9539, "step": 18213 }, { "epoch": 48.10564542753384, "grad_norm": 1564.2542724609375, "learning_rate": 1.6408415554580169e-06, "loss": 33.8687, "step": 18214 }, { "epoch": 48.10828656322219, "grad_norm": 1442.2049560546875, "learning_rate": 1.6360664683478243e-06, "loss": 33.5128, "step": 18215 }, { "epoch": 48.11092769891053, "grad_norm": 575.8634643554688, "learning_rate": 1.631298316654567e-06, "loss": 34.9781, "step": 18216 }, { "epoch": 48.11356883459888, "grad_norm": 6308.64404296875, "learning_rate": 1.626537100511416e-06, "loss": 35.6085, "step": 18217 }, { "epoch": 48.11620997028722, "grad_norm": 660.5011596679688, "learning_rate": 1.6217828200513207e-06, "loss": 35.1148, "step": 18218 }, { "epoch": 48.11885110597557, "grad_norm": 1063.3519287109375, "learning_rate": 1.6170354754070082e-06, "loss": 34.7182, "step": 18219 }, { "epoch": 48.121492241663915, "grad_norm": 1197.498291015625, "learning_rate": 1.6122950667110946e-06, "loss": 33.7842, "step": 18220 }, { "epoch": 48.12413337735226, "grad_norm": 2373.030029296875, "learning_rate": 1.6075615940959465e-06, "loss": 33.4934, "step": 18221 }, { "epoch": 48.12677451304061, "grad_norm": 2555.130126953125, "learning_rate": 1.6028350576936801e-06, "loss": 35.4344, "step": 18222 }, { "epoch": 48.12941564872895, "grad_norm": 1324.5650634765625, "learning_rate": 1.5981154576363844e-06, "loss": 36.6879, "step": 18223 }, { "epoch": 48.1320567844173, "grad_norm": 3312.83935546875, "learning_rate": 1.5934027940557594e-06, "loss": 38.7621, "step": 18224 }, { "epoch": 48.134697920105644, "grad_norm": 914.3253173828125, "learning_rate": 1.5886970670834499e-06, "loss": 39.291, "step": 18225 }, { "epoch": 48.137339055793994, "grad_norm": 3919.779541015625, "learning_rate": 1.583998276850851e-06, "loss": 39.4783, "step": 18226 }, { "epoch": 48.13998019148234, "grad_norm": 531.2831420898438, "learning_rate": 1.579306423489163e-06, "loss": 38.9046, "step": 18227 }, { "epoch": 48.14262132717068, "grad_norm": 1050.12744140625, "learning_rate": 1.5746215071294202e-06, "loss": 37.7989, "step": 18228 }, { "epoch": 48.14526246285903, "grad_norm": 745.3256225585938, "learning_rate": 1.5699435279024344e-06, "loss": 42.0946, "step": 18229 }, { "epoch": 48.14790359854737, "grad_norm": 626.3561401367188, "learning_rate": 1.5652724859388235e-06, "loss": 40.553, "step": 18230 }, { "epoch": 48.15054473423572, "grad_norm": 1029.745849609375, "learning_rate": 1.5606083813690664e-06, "loss": 44.2065, "step": 18231 }, { "epoch": 48.153185869924066, "grad_norm": 670.0204467773438, "learning_rate": 1.5559512143233922e-06, "loss": 41.79, "step": 18232 }, { "epoch": 48.155827005612416, "grad_norm": 1156.402099609375, "learning_rate": 1.5513009849318083e-06, "loss": 38.5129, "step": 18233 }, { "epoch": 48.15846814130076, "grad_norm": 755.7456665039062, "learning_rate": 1.5466576933242104e-06, "loss": 38.3197, "step": 18234 }, { "epoch": 48.16110927698911, "grad_norm": 1879.521240234375, "learning_rate": 1.542021339630245e-06, "loss": 37.2215, "step": 18235 }, { "epoch": 48.16375041267745, "grad_norm": 797.5221557617188, "learning_rate": 1.5373919239793644e-06, "loss": 38.3466, "step": 18236 }, { "epoch": 48.166391548365795, "grad_norm": 1177.32958984375, "learning_rate": 1.5327694465008812e-06, "loss": 37.3238, "step": 18237 }, { "epoch": 48.169032684054145, "grad_norm": 1515.03857421875, "learning_rate": 1.5281539073238315e-06, "loss": 35.5248, "step": 18238 }, { "epoch": 48.17167381974249, "grad_norm": 682.8521118164062, "learning_rate": 1.5235453065771398e-06, "loss": 34.9322, "step": 18239 }, { "epoch": 48.17431495543084, "grad_norm": 760.796875, "learning_rate": 1.5189436443895088e-06, "loss": 35.8955, "step": 18240 }, { "epoch": 48.17695609111918, "grad_norm": 749.8568725585938, "learning_rate": 1.5143489208893635e-06, "loss": 35.2658, "step": 18241 }, { "epoch": 48.17959722680753, "grad_norm": 1276.084228515625, "learning_rate": 1.5097611362051012e-06, "loss": 34.0522, "step": 18242 }, { "epoch": 48.18223836249587, "grad_norm": 631.155517578125, "learning_rate": 1.5051802904647304e-06, "loss": 34.8507, "step": 18243 }, { "epoch": 48.184879498184216, "grad_norm": 1452.833740234375, "learning_rate": 1.5006063837962601e-06, "loss": 35.4088, "step": 18244 }, { "epoch": 48.187520633872566, "grad_norm": 1146.7724609375, "learning_rate": 1.496039416327394e-06, "loss": 35.7802, "step": 18245 }, { "epoch": 48.19016176956091, "grad_norm": 756.8984375, "learning_rate": 1.4914793881856404e-06, "loss": 39.4938, "step": 18246 }, { "epoch": 48.19280290524926, "grad_norm": 1585.3001708984375, "learning_rate": 1.4869262994983147e-06, "loss": 18.1911, "step": 18247 }, { "epoch": 48.1954440409376, "grad_norm": 679.3847045898438, "learning_rate": 1.4823801503926205e-06, "loss": 11.4091, "step": 18248 }, { "epoch": 48.19808517662595, "grad_norm": 4441.2158203125, "learning_rate": 1.4778409409954563e-06, "loss": 8.8221, "step": 18249 }, { "epoch": 48.200726312314295, "grad_norm": 2211.540771484375, "learning_rate": 1.4733086714336097e-06, "loss": 13.7311, "step": 18250 }, { "epoch": 48.20336744800264, "grad_norm": 1764.813720703125, "learning_rate": 1.4687833418336183e-06, "loss": 9.019, "step": 18251 }, { "epoch": 48.20600858369099, "grad_norm": 1224.439453125, "learning_rate": 1.4642649523218533e-06, "loss": 10.771, "step": 18252 }, { "epoch": 48.20864971937933, "grad_norm": 1392.664794921875, "learning_rate": 1.4597535030244913e-06, "loss": 10.0623, "step": 18253 }, { "epoch": 48.21129085506768, "grad_norm": 2193.267333984375, "learning_rate": 1.455248994067515e-06, "loss": 9.0253, "step": 18254 }, { "epoch": 48.21393199075602, "grad_norm": 1001.669921875, "learning_rate": 1.4507514255766853e-06, "loss": 8.9336, "step": 18255 }, { "epoch": 48.21657312644437, "grad_norm": 901.6137084960938, "learning_rate": 1.446260797677651e-06, "loss": 14.1396, "step": 18256 }, { "epoch": 48.219214262132716, "grad_norm": 897.82861328125, "learning_rate": 1.4417771104957566e-06, "loss": 35.2455, "step": 18257 }, { "epoch": 48.221855397821066, "grad_norm": 724.99169921875, "learning_rate": 1.4373003641562355e-06, "loss": 36.1983, "step": 18258 }, { "epoch": 48.22449653350941, "grad_norm": 8182.3544921875, "learning_rate": 1.4328305587840984e-06, "loss": 34.5715, "step": 18259 }, { "epoch": 48.22713766919775, "grad_norm": 2949.67138671875, "learning_rate": 1.4283676945041346e-06, "loss": 37.589, "step": 18260 }, { "epoch": 48.2297788048861, "grad_norm": 534.9635009765625, "learning_rate": 1.4239117714409667e-06, "loss": 32.7854, "step": 18261 }, { "epoch": 48.232419940574445, "grad_norm": 862.9527587890625, "learning_rate": 1.4194627897190504e-06, "loss": 35.1676, "step": 18262 }, { "epoch": 48.235061076262795, "grad_norm": 663.0031127929688, "learning_rate": 1.4150207494626477e-06, "loss": 34.6904, "step": 18263 }, { "epoch": 48.23770221195114, "grad_norm": 1210.389404296875, "learning_rate": 1.410585650795715e-06, "loss": 35.1353, "step": 18264 }, { "epoch": 48.24034334763949, "grad_norm": 2217.33056640625, "learning_rate": 1.4061574938421806e-06, "loss": 34.9216, "step": 18265 }, { "epoch": 48.24298448332783, "grad_norm": 2818.169677734375, "learning_rate": 1.4017362787256404e-06, "loss": 34.411, "step": 18266 }, { "epoch": 48.245625619016174, "grad_norm": 666.8219604492188, "learning_rate": 1.3973220055696068e-06, "loss": 33.7833, "step": 18267 }, { "epoch": 48.248266754704524, "grad_norm": 925.9810791015625, "learning_rate": 1.3929146744973143e-06, "loss": 34.7551, "step": 18268 }, { "epoch": 48.25090789039287, "grad_norm": 628.9298095703125, "learning_rate": 1.388514285631831e-06, "loss": 33.758, "step": 18269 }, { "epoch": 48.25354902608122, "grad_norm": 1396.4215087890625, "learning_rate": 1.3841208390960592e-06, "loss": 33.8754, "step": 18270 }, { "epoch": 48.25619016176956, "grad_norm": 1046.3802490234375, "learning_rate": 1.37973433501265e-06, "loss": 35.8159, "step": 18271 }, { "epoch": 48.25883129745791, "grad_norm": 922.9093017578125, "learning_rate": 1.375354773504117e-06, "loss": 36.2197, "step": 18272 }, { "epoch": 48.26147243314625, "grad_norm": 849.1924438476562, "learning_rate": 1.3709821546927516e-06, "loss": 37.9065, "step": 18273 }, { "epoch": 48.264113568834595, "grad_norm": 1988.2257080078125, "learning_rate": 1.3666164787006496e-06, "loss": 38.5912, "step": 18274 }, { "epoch": 48.266754704522945, "grad_norm": 4881.69677734375, "learning_rate": 1.362257745649742e-06, "loss": 38.8917, "step": 18275 }, { "epoch": 48.26939584021129, "grad_norm": 554.8306884765625, "learning_rate": 1.3579059556617367e-06, "loss": 38.2273, "step": 18276 }, { "epoch": 48.27203697589964, "grad_norm": 1151.885498046875, "learning_rate": 1.353561108858148e-06, "loss": 37.6678, "step": 18277 }, { "epoch": 48.27467811158798, "grad_norm": 642.67578125, "learning_rate": 1.3492232053602672e-06, "loss": 38.8882, "step": 18278 }, { "epoch": 48.27731924727633, "grad_norm": 1250.32421875, "learning_rate": 1.3448922452892753e-06, "loss": 41.9811, "step": 18279 }, { "epoch": 48.279960382964674, "grad_norm": 1925.8765869140625, "learning_rate": 1.3405682287661314e-06, "loss": 41.3122, "step": 18280 }, { "epoch": 48.282601518653024, "grad_norm": 1176.4735107421875, "learning_rate": 1.3362511559115165e-06, "loss": 38.4571, "step": 18281 }, { "epoch": 48.28524265434137, "grad_norm": 880.1795043945312, "learning_rate": 1.3319410268460286e-06, "loss": 41.3748, "step": 18282 }, { "epoch": 48.28788379002971, "grad_norm": 697.1157836914062, "learning_rate": 1.3276378416899881e-06, "loss": 41.9713, "step": 18283 }, { "epoch": 48.29052492571806, "grad_norm": 1049.2198486328125, "learning_rate": 1.3233416005636045e-06, "loss": 40.5922, "step": 18284 }, { "epoch": 48.2931660614064, "grad_norm": 459.7897033691406, "learning_rate": 1.3190523035868374e-06, "loss": 39.3888, "step": 18285 }, { "epoch": 48.29580719709475, "grad_norm": 1207.2642822265625, "learning_rate": 1.314769950879452e-06, "loss": 36.7504, "step": 18286 }, { "epoch": 48.298448332783096, "grad_norm": 689.048095703125, "learning_rate": 1.3104945425610192e-06, "loss": 36.2765, "step": 18287 }, { "epoch": 48.301089468471446, "grad_norm": 1475.871826171875, "learning_rate": 1.3062260787509161e-06, "loss": 34.9499, "step": 18288 }, { "epoch": 48.30373060415979, "grad_norm": 1653.86572265625, "learning_rate": 1.3019645595683804e-06, "loss": 35.6295, "step": 18289 }, { "epoch": 48.30637173984813, "grad_norm": 1201.79443359375, "learning_rate": 1.2977099851324003e-06, "loss": 34.5723, "step": 18290 }, { "epoch": 48.30901287553648, "grad_norm": 758.3944091796875, "learning_rate": 1.2934623555617697e-06, "loss": 34.1073, "step": 18291 }, { "epoch": 48.311654011224825, "grad_norm": 493.9418029785156, "learning_rate": 1.2892216709750882e-06, "loss": 34.0043, "step": 18292 }, { "epoch": 48.314295146913175, "grad_norm": 822.6787109375, "learning_rate": 1.2849879314908163e-06, "loss": 34.7689, "step": 18293 }, { "epoch": 48.31693628260152, "grad_norm": 697.52783203125, "learning_rate": 1.2807611372271376e-06, "loss": 34.3933, "step": 18294 }, { "epoch": 48.31957741828987, "grad_norm": 775.6818237304688, "learning_rate": 1.2765412883020967e-06, "loss": 36.6561, "step": 18295 }, { "epoch": 48.32221855397821, "grad_norm": 1349.0582275390625, "learning_rate": 1.2723283848335155e-06, "loss": 30.9355, "step": 18296 }, { "epoch": 48.32485968966655, "grad_norm": 979.1619262695312, "learning_rate": 1.268122426939078e-06, "loss": 11.4843, "step": 18297 }, { "epoch": 48.3275008253549, "grad_norm": 910.3896484375, "learning_rate": 1.2639234147362179e-06, "loss": 10.6004, "step": 18298 }, { "epoch": 48.330141961043246, "grad_norm": 1212.658935546875, "learning_rate": 1.2597313483421469e-06, "loss": 14.0513, "step": 18299 }, { "epoch": 48.332783096731596, "grad_norm": 1047.6390380859375, "learning_rate": 1.255546227873966e-06, "loss": 16.4179, "step": 18300 }, { "epoch": 48.33542423241994, "grad_norm": 4948.9052734375, "learning_rate": 1.251368053448554e-06, "loss": 13.1788, "step": 18301 }, { "epoch": 48.33806536810829, "grad_norm": 663.0606079101562, "learning_rate": 1.247196825182567e-06, "loss": 9.796, "step": 18302 }, { "epoch": 48.34070650379663, "grad_norm": 1005.6450805664062, "learning_rate": 1.243032543192496e-06, "loss": 10.7976, "step": 18303 }, { "epoch": 48.34334763948498, "grad_norm": 708.751953125, "learning_rate": 1.2388752075945808e-06, "loss": 12.6372, "step": 18304 }, { "epoch": 48.345988775173325, "grad_norm": 1267.19921875, "learning_rate": 1.2347248185049786e-06, "loss": 23.0073, "step": 18305 }, { "epoch": 48.34862991086167, "grad_norm": 1254.5408935546875, "learning_rate": 1.2305813760395136e-06, "loss": 33.8374, "step": 18306 }, { "epoch": 48.35127104655002, "grad_norm": 11216.095703125, "learning_rate": 1.2264448803139538e-06, "loss": 34.7197, "step": 18307 }, { "epoch": 48.35391218223836, "grad_norm": 1657.281005859375, "learning_rate": 1.2223153314437908e-06, "loss": 33.9101, "step": 18308 }, { "epoch": 48.35655331792671, "grad_norm": 645.9325561523438, "learning_rate": 1.2181927295443208e-06, "loss": 34.4454, "step": 18309 }, { "epoch": 48.35919445361505, "grad_norm": 667.7981567382812, "learning_rate": 1.214077074730674e-06, "loss": 34.3089, "step": 18310 }, { "epoch": 48.3618355893034, "grad_norm": 978.54833984375, "learning_rate": 1.2099683671177863e-06, "loss": 34.1204, "step": 18311 }, { "epoch": 48.364476724991746, "grad_norm": 2107.711669921875, "learning_rate": 1.2058666068203717e-06, "loss": 34.2546, "step": 18312 }, { "epoch": 48.36711786068009, "grad_norm": 2776.81005859375, "learning_rate": 1.2017717939530047e-06, "loss": 34.3854, "step": 18313 }, { "epoch": 48.36975899636844, "grad_norm": 757.0882568359375, "learning_rate": 1.1976839286299834e-06, "loss": 33.2451, "step": 18314 }, { "epoch": 48.37240013205678, "grad_norm": 1307.8369140625, "learning_rate": 1.193603010965494e-06, "loss": 33.4674, "step": 18315 }, { "epoch": 48.37504126774513, "grad_norm": 1665.633056640625, "learning_rate": 1.1895290410734727e-06, "loss": 34.2213, "step": 18316 }, { "epoch": 48.377682403433475, "grad_norm": 1172.6082763671875, "learning_rate": 1.1854620190677178e-06, "loss": 35.2642, "step": 18317 }, { "epoch": 48.380323539121825, "grad_norm": 1176.933349609375, "learning_rate": 1.1814019450617775e-06, "loss": 34.6454, "step": 18318 }, { "epoch": 48.38296467481017, "grad_norm": 2098.3046875, "learning_rate": 1.1773488191690052e-06, "loss": 34.4505, "step": 18319 }, { "epoch": 48.38560581049851, "grad_norm": 1219.1328125, "learning_rate": 1.173302641502616e-06, "loss": 34.5238, "step": 18320 }, { "epoch": 48.38824694618686, "grad_norm": 1106.3406982421875, "learning_rate": 1.1692634121755751e-06, "loss": 34.8633, "step": 18321 }, { "epoch": 48.390888081875204, "grad_norm": 3742.09912109375, "learning_rate": 1.1652311313006813e-06, "loss": 35.1794, "step": 18322 }, { "epoch": 48.393529217563554, "grad_norm": 1801.1163330078125, "learning_rate": 1.161205798990511e-06, "loss": 37.2415, "step": 18323 }, { "epoch": 48.3961703532519, "grad_norm": 2268.490966796875, "learning_rate": 1.1571874153575302e-06, "loss": 40.6214, "step": 18324 }, { "epoch": 48.39881148894025, "grad_norm": 558.237548828125, "learning_rate": 1.1531759805138986e-06, "loss": 37.74, "step": 18325 }, { "epoch": 48.40145262462859, "grad_norm": 582.78173828125, "learning_rate": 1.149171494571638e-06, "loss": 39.5161, "step": 18326 }, { "epoch": 48.40409376031694, "grad_norm": 1523.94482421875, "learning_rate": 1.1451739576425756e-06, "loss": 38.2224, "step": 18327 }, { "epoch": 48.40673489600528, "grad_norm": 1961.1319580078125, "learning_rate": 1.1411833698383721e-06, "loss": 38.1816, "step": 18328 }, { "epoch": 48.409376031693625, "grad_norm": 575.3073120117188, "learning_rate": 1.1371997312704107e-06, "loss": 39.7698, "step": 18329 }, { "epoch": 48.412017167381975, "grad_norm": 1535.6744384765625, "learning_rate": 1.1332230420499635e-06, "loss": 43.4789, "step": 18330 }, { "epoch": 48.41465830307032, "grad_norm": 644.5248413085938, "learning_rate": 1.1292533022880803e-06, "loss": 43.2225, "step": 18331 }, { "epoch": 48.41729943875867, "grad_norm": 583.3147583007812, "learning_rate": 1.125290512095589e-06, "loss": 39.4567, "step": 18332 }, { "epoch": 48.41994057444701, "grad_norm": 1181.0087890625, "learning_rate": 1.1213346715831795e-06, "loss": 37.9114, "step": 18333 }, { "epoch": 48.42258171013536, "grad_norm": 848.4822387695312, "learning_rate": 1.1173857808612908e-06, "loss": 38.7602, "step": 18334 }, { "epoch": 48.425222845823704, "grad_norm": 1356.49267578125, "learning_rate": 1.1134438400401957e-06, "loss": 37.2921, "step": 18335 }, { "epoch": 48.42786398151205, "grad_norm": 804.1538696289062, "learning_rate": 1.109508849230001e-06, "loss": 36.2737, "step": 18336 }, { "epoch": 48.4305051172004, "grad_norm": 1138.1871337890625, "learning_rate": 1.1055808085405628e-06, "loss": 37.7256, "step": 18337 }, { "epoch": 48.43314625288874, "grad_norm": 1682.2265625, "learning_rate": 1.1016597180815436e-06, "loss": 35.7265, "step": 18338 }, { "epoch": 48.43578738857709, "grad_norm": 1223.686767578125, "learning_rate": 1.0977455779624945e-06, "loss": 35.421, "step": 18339 }, { "epoch": 48.43842852426543, "grad_norm": 579.7277221679688, "learning_rate": 1.0938383882926617e-06, "loss": 36.4906, "step": 18340 }, { "epoch": 48.44106965995378, "grad_norm": 749.9002685546875, "learning_rate": 1.0899381491811799e-06, "loss": 35.4797, "step": 18341 }, { "epoch": 48.443710795642126, "grad_norm": 1888.5791015625, "learning_rate": 1.08604486073699e-06, "loss": 34.3296, "step": 18342 }, { "epoch": 48.44635193133047, "grad_norm": 452.3900451660156, "learning_rate": 1.0821585230687547e-06, "loss": 35.9107, "step": 18343 }, { "epoch": 48.44899306701882, "grad_norm": 743.423583984375, "learning_rate": 1.0782791362849987e-06, "loss": 34.6401, "step": 18344 }, { "epoch": 48.45163420270716, "grad_norm": 637.3538818359375, "learning_rate": 1.0744067004941071e-06, "loss": 34.3387, "step": 18345 }, { "epoch": 48.45427533839551, "grad_norm": 3948.716064453125, "learning_rate": 1.0705412158041882e-06, "loss": 45.3696, "step": 18346 }, { "epoch": 48.456916474083854, "grad_norm": 434.9161071777344, "learning_rate": 1.0666826823231557e-06, "loss": 15.3453, "step": 18347 }, { "epoch": 48.459557609772205, "grad_norm": 2842.712890625, "learning_rate": 1.0628311001587566e-06, "loss": 9.0906, "step": 18348 }, { "epoch": 48.46219874546055, "grad_norm": 776.5269775390625, "learning_rate": 1.0589864694185991e-06, "loss": 10.4099, "step": 18349 }, { "epoch": 48.4648398811489, "grad_norm": 1205.0577392578125, "learning_rate": 1.0551487902100142e-06, "loss": 17.2126, "step": 18350 }, { "epoch": 48.46748101683724, "grad_norm": 2882.67236328125, "learning_rate": 1.0513180626401387e-06, "loss": 15.2153, "step": 18351 }, { "epoch": 48.47012215252558, "grad_norm": 1074.113037109375, "learning_rate": 1.0474942868159699e-06, "loss": 8.8128, "step": 18352 }, { "epoch": 48.47276328821393, "grad_norm": 1292.9951171875, "learning_rate": 1.0436774628442835e-06, "loss": 9.2718, "step": 18353 }, { "epoch": 48.475404423902276, "grad_norm": 2015.486328125, "learning_rate": 1.0398675908316614e-06, "loss": 18.2924, "step": 18354 }, { "epoch": 48.478045559590626, "grad_norm": 5745.47314453125, "learning_rate": 1.03606467088449e-06, "loss": 16.0391, "step": 18355 }, { "epoch": 48.48068669527897, "grad_norm": 1441.21484375, "learning_rate": 1.0322687031089629e-06, "loss": 11.7586, "step": 18356 }, { "epoch": 48.48332783096732, "grad_norm": 2061.119384765625, "learning_rate": 1.028479687611078e-06, "loss": 34.6874, "step": 18357 }, { "epoch": 48.48596896665566, "grad_norm": 509.8160400390625, "learning_rate": 1.02469762449664e-06, "loss": 34.6055, "step": 18358 }, { "epoch": 48.488610102344005, "grad_norm": 2150.7783203125, "learning_rate": 1.0209225138712586e-06, "loss": 33.9363, "step": 18359 }, { "epoch": 48.491251238032355, "grad_norm": 1367.861328125, "learning_rate": 1.0171543558403773e-06, "loss": 35.0307, "step": 18360 }, { "epoch": 48.4938923737207, "grad_norm": 2296.896728515625, "learning_rate": 1.0133931505091899e-06, "loss": 34.5184, "step": 18361 }, { "epoch": 48.49653350940905, "grad_norm": 696.1845703125, "learning_rate": 1.0096388979827232e-06, "loss": 34.351, "step": 18362 }, { "epoch": 48.49917464509739, "grad_norm": 842.810546875, "learning_rate": 1.0058915983658657e-06, "loss": 33.9358, "step": 18363 }, { "epoch": 48.50181578078574, "grad_norm": 819.6289672851562, "learning_rate": 1.0021512517632004e-06, "loss": 34.0607, "step": 18364 }, { "epoch": 48.50445691647408, "grad_norm": 6186.904296875, "learning_rate": 9.984178582791715e-07, "loss": 33.3227, "step": 18365 }, { "epoch": 48.507098052162426, "grad_norm": 865.5881958007812, "learning_rate": 9.946914180180566e-07, "loss": 35.187, "step": 18366 }, { "epoch": 48.509739187850776, "grad_norm": 986.3164672851562, "learning_rate": 9.909719310839393e-07, "loss": 34.2796, "step": 18367 }, { "epoch": 48.51238032353912, "grad_norm": 4644.87353515625, "learning_rate": 9.872593975806255e-07, "loss": 33.4121, "step": 18368 }, { "epoch": 48.51502145922747, "grad_norm": 761.1536254882812, "learning_rate": 9.835538176118096e-07, "loss": 34.368, "step": 18369 }, { "epoch": 48.51766259491581, "grad_norm": 724.4569702148438, "learning_rate": 9.798551912809927e-07, "loss": 34.527, "step": 18370 }, { "epoch": 48.52030373060416, "grad_norm": 409.87420654296875, "learning_rate": 9.761635186914252e-07, "loss": 33.9725, "step": 18371 }, { "epoch": 48.522944866292505, "grad_norm": 2008.873291015625, "learning_rate": 9.724787999462193e-07, "loss": 34.062, "step": 18372 }, { "epoch": 48.525586001980855, "grad_norm": 1615.171142578125, "learning_rate": 9.688010351482646e-07, "loss": 35.6952, "step": 18373 }, { "epoch": 48.5282271376692, "grad_norm": 2171.325439453125, "learning_rate": 9.65130224400229e-07, "loss": 41.427, "step": 18374 }, { "epoch": 48.53086827335754, "grad_norm": 565.3984985351562, "learning_rate": 9.614663678046698e-07, "loss": 37.365, "step": 18375 }, { "epoch": 48.53350940904589, "grad_norm": 1561.6732177734375, "learning_rate": 9.57809465463838e-07, "loss": 38.8923, "step": 18376 }, { "epoch": 48.536150544734234, "grad_norm": 766.54736328125, "learning_rate": 9.541595174799023e-07, "loss": 38.9993, "step": 18377 }, { "epoch": 48.538791680422584, "grad_norm": 1070.4990234375, "learning_rate": 9.50516523954753e-07, "loss": 41.5122, "step": 18378 }, { "epoch": 48.54143281611093, "grad_norm": 1172.081298828125, "learning_rate": 9.468804849901147e-07, "loss": 41.8622, "step": 18379 }, { "epoch": 48.54407395179928, "grad_norm": 640.8319091796875, "learning_rate": 9.432514006875726e-07, "loss": 43.1126, "step": 18380 }, { "epoch": 48.54671508748762, "grad_norm": 1695.71435546875, "learning_rate": 9.396292711484067e-07, "loss": 40.9153, "step": 18381 }, { "epoch": 48.54935622317596, "grad_norm": 579.2877197265625, "learning_rate": 9.360140964738139e-07, "loss": 39.0458, "step": 18382 }, { "epoch": 48.55199735886431, "grad_norm": 776.3388061523438, "learning_rate": 9.324058767646859e-07, "loss": 39.7766, "step": 18383 }, { "epoch": 48.554638494552655, "grad_norm": 1804.92431640625, "learning_rate": 9.288046121218308e-07, "loss": 38.3267, "step": 18384 }, { "epoch": 48.557279630241005, "grad_norm": 1019.3013305664062, "learning_rate": 9.252103026457792e-07, "loss": 36.6794, "step": 18385 }, { "epoch": 48.55992076592935, "grad_norm": 642.7147216796875, "learning_rate": 9.216229484369232e-07, "loss": 36.7639, "step": 18386 }, { "epoch": 48.5625619016177, "grad_norm": 520.5737915039062, "learning_rate": 9.180425495954325e-07, "loss": 38.7655, "step": 18387 }, { "epoch": 48.56520303730604, "grad_norm": 999.0805053710938, "learning_rate": 9.144691062213106e-07, "loss": 35.5308, "step": 18388 }, { "epoch": 48.567844172994384, "grad_norm": 933.675537109375, "learning_rate": 9.109026184142832e-07, "loss": 35.346, "step": 18389 }, { "epoch": 48.570485308682734, "grad_norm": 602.994873046875, "learning_rate": 9.073430862739929e-07, "loss": 35.9871, "step": 18390 }, { "epoch": 48.57312644437108, "grad_norm": 1033.91552734375, "learning_rate": 9.037905098998322e-07, "loss": 34.5094, "step": 18391 }, { "epoch": 48.57576758005943, "grad_norm": 1053.4066162109375, "learning_rate": 9.002448893909721e-07, "loss": 36.0876, "step": 18392 }, { "epoch": 48.57840871574777, "grad_norm": 622.8946533203125, "learning_rate": 8.967062248464441e-07, "loss": 34.6199, "step": 18393 }, { "epoch": 48.58104985143612, "grad_norm": 737.5952758789062, "learning_rate": 8.931745163650862e-07, "loss": 35.5801, "step": 18394 }, { "epoch": 48.58369098712446, "grad_norm": 2253.212646484375, "learning_rate": 8.896497640454582e-07, "loss": 33.8142, "step": 18395 }, { "epoch": 48.58633212281281, "grad_norm": 1211.095947265625, "learning_rate": 8.861319679860646e-07, "loss": 38.5676, "step": 18396 }, { "epoch": 48.588973258501156, "grad_norm": 2805.43994140625, "learning_rate": 8.826211282850771e-07, "loss": 28.3283, "step": 18397 }, { "epoch": 48.5916143941895, "grad_norm": 10193.4267578125, "learning_rate": 8.791172450405837e-07, "loss": 9.0452, "step": 18398 }, { "epoch": 48.59425552987785, "grad_norm": 2791.802490234375, "learning_rate": 8.756203183503675e-07, "loss": 10.7587, "step": 18399 }, { "epoch": 48.59689666556619, "grad_norm": 6009.75830078125, "learning_rate": 8.721303483121001e-07, "loss": 10.9314, "step": 18400 }, { "epoch": 48.59689666556619, "eval_loss": 3.8023853302001953, "eval_runtime": 2.0894, "eval_samples_per_second": 236.911, "eval_steps_per_second": 29.674, "step": 18400 }, { "epoch": 48.59953780125454, "grad_norm": 2163.42578125, "learning_rate": 8.686473350232871e-07, "loss": 13.8889, "step": 18401 }, { "epoch": 48.602178936942884, "grad_norm": 1584.322509765625, "learning_rate": 8.651712785811283e-07, "loss": 8.1235, "step": 18402 }, { "epoch": 48.604820072631234, "grad_norm": 1324.433349609375, "learning_rate": 8.617021790827129e-07, "loss": 18.2948, "step": 18403 }, { "epoch": 48.60746120831958, "grad_norm": 1030.5906982421875, "learning_rate": 8.582400366249077e-07, "loss": 8.7393, "step": 18404 }, { "epoch": 48.61010234400792, "grad_norm": 9458.1953125, "learning_rate": 8.547848513043855e-07, "loss": 10.4284, "step": 18405 }, { "epoch": 48.61274347969627, "grad_norm": 694.549560546875, "learning_rate": 8.5133662321768e-07, "loss": 21.1324, "step": 18406 }, { "epoch": 48.61538461538461, "grad_norm": 909.1541137695312, "learning_rate": 8.4789535246102e-07, "loss": 36.5685, "step": 18407 }, { "epoch": 48.61802575107296, "grad_norm": 821.9458618164062, "learning_rate": 8.44461039130523e-07, "loss": 33.6622, "step": 18408 }, { "epoch": 48.620666886761306, "grad_norm": 1382.942138671875, "learning_rate": 8.410336833220844e-07, "loss": 34.0423, "step": 18409 }, { "epoch": 48.623308022449656, "grad_norm": 668.9431762695312, "learning_rate": 8.376132851314056e-07, "loss": 33.8215, "step": 18410 }, { "epoch": 48.625949158138, "grad_norm": 821.9414672851562, "learning_rate": 8.341998446540211e-07, "loss": 34.3088, "step": 18411 }, { "epoch": 48.62859029382634, "grad_norm": 825.1796875, "learning_rate": 8.307933619852436e-07, "loss": 34.2922, "step": 18412 }, { "epoch": 48.63123142951469, "grad_norm": 686.98681640625, "learning_rate": 8.273938372201917e-07, "loss": 34.3355, "step": 18413 }, { "epoch": 48.633872565203035, "grad_norm": 992.1302490234375, "learning_rate": 8.240012704537891e-07, "loss": 33.8211, "step": 18414 }, { "epoch": 48.636513700891385, "grad_norm": 824.12353515625, "learning_rate": 8.206156617807658e-07, "loss": 34.0693, "step": 18415 }, { "epoch": 48.63915483657973, "grad_norm": 1641.764892578125, "learning_rate": 8.172370112957128e-07, "loss": 34.4895, "step": 18416 }, { "epoch": 48.64179597226808, "grad_norm": 828.8064575195312, "learning_rate": 8.138653190929157e-07, "loss": 32.9132, "step": 18417 }, { "epoch": 48.64443710795642, "grad_norm": 901.1814575195312, "learning_rate": 8.105005852665215e-07, "loss": 34.5179, "step": 18418 }, { "epoch": 48.64707824364477, "grad_norm": 611.2674560546875, "learning_rate": 8.071428099105382e-07, "loss": 34.723, "step": 18419 }, { "epoch": 48.64971937933311, "grad_norm": 1505.8690185546875, "learning_rate": 8.037919931187243e-07, "loss": 33.5771, "step": 18420 }, { "epoch": 48.652360515021456, "grad_norm": 1327.35546875, "learning_rate": 8.00448134984616e-07, "loss": 33.8935, "step": 18421 }, { "epoch": 48.655001650709806, "grad_norm": 2277.1015625, "learning_rate": 7.971112356016108e-07, "loss": 35.0026, "step": 18422 }, { "epoch": 48.65764278639815, "grad_norm": 619.5135498046875, "learning_rate": 7.937812950628842e-07, "loss": 37.1103, "step": 18423 }, { "epoch": 48.6602839220865, "grad_norm": 5471.0185546875, "learning_rate": 7.904583134614174e-07, "loss": 40.346, "step": 18424 }, { "epoch": 48.66292505777484, "grad_norm": 656.9527587890625, "learning_rate": 7.87142290890025e-07, "loss": 38.8192, "step": 18425 }, { "epoch": 48.66556619346319, "grad_norm": 897.0670776367188, "learning_rate": 7.838332274412718e-07, "loss": 38.7869, "step": 18426 }, { "epoch": 48.668207329151535, "grad_norm": 682.7444458007812, "learning_rate": 7.80531123207584e-07, "loss": 37.9545, "step": 18427 }, { "epoch": 48.67084846483988, "grad_norm": 647.0674438476562, "learning_rate": 7.772359782811655e-07, "loss": 39.0784, "step": 18428 }, { "epoch": 48.67348960052823, "grad_norm": 908.313232421875, "learning_rate": 7.739477927540261e-07, "loss": 40.1273, "step": 18429 }, { "epoch": 48.67613073621657, "grad_norm": 1236.3909912109375, "learning_rate": 7.70666566718009e-07, "loss": 40.0112, "step": 18430 }, { "epoch": 48.67877187190492, "grad_norm": 589.5962524414062, "learning_rate": 7.673923002647076e-07, "loss": 42.5202, "step": 18431 }, { "epoch": 48.681413007593264, "grad_norm": 2610.132080078125, "learning_rate": 7.641249934855765e-07, "loss": 40.3946, "step": 18432 }, { "epoch": 48.684054143281614, "grad_norm": 892.3470458984375, "learning_rate": 7.608646464718482e-07, "loss": 38.9961, "step": 18433 }, { "epoch": 48.68669527896996, "grad_norm": 1187.005126953125, "learning_rate": 7.576112593145612e-07, "loss": 38.493, "step": 18434 }, { "epoch": 48.6893364146583, "grad_norm": 1878.519287109375, "learning_rate": 7.543648321045593e-07, "loss": 38.0031, "step": 18435 }, { "epoch": 48.69197755034665, "grad_norm": 937.717041015625, "learning_rate": 7.511253649324923e-07, "loss": 38.3227, "step": 18436 }, { "epoch": 48.69461868603499, "grad_norm": 677.7745361328125, "learning_rate": 7.478928578888433e-07, "loss": 37.3364, "step": 18437 }, { "epoch": 48.69725982172334, "grad_norm": 1033.227783203125, "learning_rate": 7.446673110638735e-07, "loss": 37.2228, "step": 18438 }, { "epoch": 48.699900957411685, "grad_norm": 464.65234375, "learning_rate": 7.414487245476498e-07, "loss": 34.8761, "step": 18439 }, { "epoch": 48.702542093100035, "grad_norm": 880.7210693359375, "learning_rate": 7.382370984300446e-07, "loss": 34.7979, "step": 18440 }, { "epoch": 48.70518322878838, "grad_norm": 713.3792724609375, "learning_rate": 7.350324328007641e-07, "loss": 33.9109, "step": 18441 }, { "epoch": 48.70782436447673, "grad_norm": 610.3822631835938, "learning_rate": 7.318347277492365e-07, "loss": 35.1145, "step": 18442 }, { "epoch": 48.71046550016507, "grad_norm": 1319.2193603515625, "learning_rate": 7.286439833648351e-07, "loss": 34.7179, "step": 18443 }, { "epoch": 48.713106635853414, "grad_norm": 546.2957763671875, "learning_rate": 7.254601997365995e-07, "loss": 35.2354, "step": 18444 }, { "epoch": 48.715747771541764, "grad_norm": 1625.0015869140625, "learning_rate": 7.222833769534587e-07, "loss": 34.8846, "step": 18445 }, { "epoch": 48.71838890723011, "grad_norm": 1551.665283203125, "learning_rate": 7.191135151041194e-07, "loss": 40.6297, "step": 18446 }, { "epoch": 48.72103004291846, "grad_norm": 7405.08203125, "learning_rate": 7.159506142770944e-07, "loss": 8.6302, "step": 18447 }, { "epoch": 48.7236711786068, "grad_norm": 6115.7607421875, "learning_rate": 7.127946745607295e-07, "loss": 11.0563, "step": 18448 }, { "epoch": 48.72631231429515, "grad_norm": 525.8458251953125, "learning_rate": 7.096456960431208e-07, "loss": 13.8616, "step": 18449 }, { "epoch": 48.72895344998349, "grad_norm": 2097.36328125, "learning_rate": 7.065036788122259e-07, "loss": 14.2687, "step": 18450 }, { "epoch": 48.731594585671836, "grad_norm": 6872.7998046875, "learning_rate": 7.033686229557523e-07, "loss": 8.5072, "step": 18451 }, { "epoch": 48.734235721360186, "grad_norm": 1436.2939453125, "learning_rate": 7.00240528561269e-07, "loss": 10.4535, "step": 18452 }, { "epoch": 48.73687685704853, "grad_norm": 356.40545654296875, "learning_rate": 6.971193957161504e-07, "loss": 10.4621, "step": 18453 }, { "epoch": 48.73951799273688, "grad_norm": 3131.2802734375, "learning_rate": 6.940052245074935e-07, "loss": 15.8664, "step": 18454 }, { "epoch": 48.74215912842522, "grad_norm": 1818.5711669921875, "learning_rate": 6.90898015022312e-07, "loss": 10.0784, "step": 18455 }, { "epoch": 48.74480026411357, "grad_norm": 1469.4310302734375, "learning_rate": 6.877977673473423e-07, "loss": 21.1812, "step": 18456 }, { "epoch": 48.747441399801914, "grad_norm": 1402.7686767578125, "learning_rate": 6.847044815691539e-07, "loss": 35.0716, "step": 18457 }, { "epoch": 48.75008253549026, "grad_norm": 2976.94580078125, "learning_rate": 6.816181577741499e-07, "loss": 35.1278, "step": 18458 }, { "epoch": 48.75272367117861, "grad_norm": 804.6087646484375, "learning_rate": 6.785387960485112e-07, "loss": 35.6075, "step": 18459 }, { "epoch": 48.75536480686695, "grad_norm": 2676.015869140625, "learning_rate": 6.754663964781971e-07, "loss": 34.8935, "step": 18460 }, { "epoch": 48.7580059425553, "grad_norm": 3034.936279296875, "learning_rate": 6.724009591490276e-07, "loss": 34.0865, "step": 18461 }, { "epoch": 48.76064707824364, "grad_norm": 974.6530151367188, "learning_rate": 6.693424841466011e-07, "loss": 33.6737, "step": 18462 }, { "epoch": 48.76328821393199, "grad_norm": 974.2628784179688, "learning_rate": 6.662909715563214e-07, "loss": 33.0506, "step": 18463 }, { "epoch": 48.765929349620336, "grad_norm": 769.9390258789062, "learning_rate": 6.632464214633982e-07, "loss": 34.3592, "step": 18464 }, { "epoch": 48.768570485308686, "grad_norm": 1038.5572509765625, "learning_rate": 6.602088339528745e-07, "loss": 34.7464, "step": 18465 }, { "epoch": 48.77121162099703, "grad_norm": 764.4484252929688, "learning_rate": 6.571782091095157e-07, "loss": 33.9863, "step": 18466 }, { "epoch": 48.77385275668537, "grad_norm": 690.8338012695312, "learning_rate": 6.541545470179766e-07, "loss": 34.7786, "step": 18467 }, { "epoch": 48.77649389237372, "grad_norm": 1014.2072143554688, "learning_rate": 6.51137847762745e-07, "loss": 35.8434, "step": 18468 }, { "epoch": 48.779135028062065, "grad_norm": 1085.089111328125, "learning_rate": 6.481281114279758e-07, "loss": 35.1141, "step": 18469 }, { "epoch": 48.781776163750415, "grad_norm": 1712.883544921875, "learning_rate": 6.451253380977684e-07, "loss": 33.5146, "step": 18470 }, { "epoch": 48.78441729943876, "grad_norm": 1251.7010498046875, "learning_rate": 6.421295278559447e-07, "loss": 32.9156, "step": 18471 }, { "epoch": 48.78705843512711, "grad_norm": 1599.341064453125, "learning_rate": 6.3914068078616e-07, "loss": 34.2342, "step": 18472 }, { "epoch": 48.78969957081545, "grad_norm": 1175.8253173828125, "learning_rate": 6.361587969719029e-07, "loss": 36.0436, "step": 18473 }, { "epoch": 48.79234070650379, "grad_norm": 1097.390625, "learning_rate": 6.331838764964404e-07, "loss": 40.4227, "step": 18474 }, { "epoch": 48.79498184219214, "grad_norm": 1383.822509765625, "learning_rate": 6.30215919442817e-07, "loss": 38.0684, "step": 18475 }, { "epoch": 48.797622977880486, "grad_norm": 1277.141357421875, "learning_rate": 6.272549258939386e-07, "loss": 37.8272, "step": 18476 }, { "epoch": 48.800264113568836, "grad_norm": 1084.544921875, "learning_rate": 6.243008959324892e-07, "loss": 37.1658, "step": 18477 }, { "epoch": 48.80290524925718, "grad_norm": 993.0777587890625, "learning_rate": 6.213538296409305e-07, "loss": 38.8686, "step": 18478 }, { "epoch": 48.80554638494553, "grad_norm": 662.8492431640625, "learning_rate": 6.184137271015578e-07, "loss": 41.6945, "step": 18479 }, { "epoch": 48.80818752063387, "grad_norm": 1796.37109375, "learning_rate": 6.154805883964998e-07, "loss": 40.808, "step": 18480 }, { "epoch": 48.810828656322215, "grad_norm": 8328.8818359375, "learning_rate": 6.125544136076355e-07, "loss": 40.1806, "step": 18481 }, { "epoch": 48.813469792010565, "grad_norm": 927.5701904296875, "learning_rate": 6.096352028167051e-07, "loss": 39.5233, "step": 18482 }, { "epoch": 48.81611092769891, "grad_norm": 691.4624633789062, "learning_rate": 6.067229561052268e-07, "loss": 40.0606, "step": 18483 }, { "epoch": 48.81875206338726, "grad_norm": 1163.811279296875, "learning_rate": 6.038176735544965e-07, "loss": 38.0621, "step": 18484 }, { "epoch": 48.8213931990756, "grad_norm": 948.9158325195312, "learning_rate": 6.009193552456437e-07, "loss": 39.4335, "step": 18485 }, { "epoch": 48.82403433476395, "grad_norm": 1288.4315185546875, "learning_rate": 5.980280012596318e-07, "loss": 38.1289, "step": 18486 }, { "epoch": 48.826675470452294, "grad_norm": 842.2913208007812, "learning_rate": 5.951436116771736e-07, "loss": 37.4502, "step": 18487 }, { "epoch": 48.829316606140644, "grad_norm": 803.3084106445312, "learning_rate": 5.922661865788159e-07, "loss": 36.2873, "step": 18488 }, { "epoch": 48.83195774182899, "grad_norm": 714.6492309570312, "learning_rate": 5.893957260449113e-07, "loss": 35.9217, "step": 18489 }, { "epoch": 48.83459887751733, "grad_norm": 1565.1878662109375, "learning_rate": 5.865322301556176e-07, "loss": 36.4811, "step": 18490 }, { "epoch": 48.83724001320568, "grad_norm": 571.4498901367188, "learning_rate": 5.836756989908987e-07, "loss": 34.1611, "step": 18491 }, { "epoch": 48.83988114889402, "grad_norm": 793.924560546875, "learning_rate": 5.808261326305519e-07, "loss": 34.581, "step": 18492 }, { "epoch": 48.84252228458237, "grad_norm": 958.0516967773438, "learning_rate": 5.779835311540971e-07, "loss": 35.1206, "step": 18493 }, { "epoch": 48.845163420270715, "grad_norm": 1373.231689453125, "learning_rate": 5.75147894640915e-07, "loss": 34.2466, "step": 18494 }, { "epoch": 48.847804555959065, "grad_norm": 2228.226806640625, "learning_rate": 5.723192231702201e-07, "loss": 36.3657, "step": 18495 }, { "epoch": 48.85044569164741, "grad_norm": 1500.18310546875, "learning_rate": 5.694975168210048e-07, "loss": 39.1459, "step": 18496 }, { "epoch": 48.85308682733575, "grad_norm": 1145.3812255859375, "learning_rate": 5.666827756720394e-07, "loss": 18.4318, "step": 18497 }, { "epoch": 48.8557279630241, "grad_norm": 1911.6168212890625, "learning_rate": 5.638749998019277e-07, "loss": 9.6017, "step": 18498 }, { "epoch": 48.858369098712444, "grad_norm": 3442.36865234375, "learning_rate": 5.610741892890792e-07, "loss": 10.2357, "step": 18499 }, { "epoch": 48.861010234400794, "grad_norm": 11495.9892578125, "learning_rate": 5.582803442117091e-07, "loss": 15.4753, "step": 18500 }, { "epoch": 48.86365137008914, "grad_norm": 5245.3837890625, "learning_rate": 5.554934646478382e-07, "loss": 9.8012, "step": 18501 }, { "epoch": 48.86629250577749, "grad_norm": 14968.400390625, "learning_rate": 5.527135506752656e-07, "loss": 8.4661, "step": 18502 }, { "epoch": 48.86893364146583, "grad_norm": 1886.55615234375, "learning_rate": 5.499406023716513e-07, "loss": 12.4471, "step": 18503 }, { "epoch": 48.87157477715417, "grad_norm": 1443.8248291015625, "learning_rate": 5.471746198144056e-07, "loss": 8.3805, "step": 18504 }, { "epoch": 48.87421591284252, "grad_norm": 5572.087890625, "learning_rate": 5.444156030807446e-07, "loss": 10.8336, "step": 18505 }, { "epoch": 48.876857048530866, "grad_norm": 2677.9755859375, "learning_rate": 5.416635522477731e-07, "loss": 24.1926, "step": 18506 }, { "epoch": 48.879498184219216, "grad_norm": 1433.5406494140625, "learning_rate": 5.389184673922909e-07, "loss": 35.9584, "step": 18507 }, { "epoch": 48.88213931990756, "grad_norm": 646.9115600585938, "learning_rate": 5.361803485909589e-07, "loss": 33.4979, "step": 18508 }, { "epoch": 48.88478045559591, "grad_norm": 743.7321166992188, "learning_rate": 5.334491959202714e-07, "loss": 34.3834, "step": 18509 }, { "epoch": 48.88742159128425, "grad_norm": 888.3682861328125, "learning_rate": 5.307250094564731e-07, "loss": 33.4921, "step": 18510 }, { "epoch": 48.8900627269726, "grad_norm": 663.0089721679688, "learning_rate": 5.280077892756141e-07, "loss": 32.9317, "step": 18511 }, { "epoch": 48.892703862660944, "grad_norm": 977.2494506835938, "learning_rate": 5.252975354536061e-07, "loss": 34.2566, "step": 18512 }, { "epoch": 48.89534499834929, "grad_norm": 1411.314697265625, "learning_rate": 5.225942480661106e-07, "loss": 34.9217, "step": 18513 }, { "epoch": 48.89798613403764, "grad_norm": 610.4389038085938, "learning_rate": 5.19897927188595e-07, "loss": 34.6814, "step": 18514 }, { "epoch": 48.90062726972598, "grad_norm": 1003.9006958007812, "learning_rate": 5.17208572896416e-07, "loss": 34.6174, "step": 18515 }, { "epoch": 48.90326840541433, "grad_norm": 4644.12841796875, "learning_rate": 5.145261852645966e-07, "loss": 34.4525, "step": 18516 }, { "epoch": 48.90590954110267, "grad_norm": 948.2931518554688, "learning_rate": 5.118507643681048e-07, "loss": 36.8015, "step": 18517 }, { "epoch": 48.90855067679102, "grad_norm": 1314.1319580078125, "learning_rate": 5.091823102816306e-07, "loss": 33.6063, "step": 18518 }, { "epoch": 48.911191812479366, "grad_norm": 747.1206665039062, "learning_rate": 5.065208230796426e-07, "loss": 33.8388, "step": 18519 }, { "epoch": 48.91383294816771, "grad_norm": 2759.375, "learning_rate": 5.038663028365253e-07, "loss": 34.0576, "step": 18520 }, { "epoch": 48.91647408385606, "grad_norm": 3237.031005859375, "learning_rate": 5.012187496263865e-07, "loss": 34.3608, "step": 18521 }, { "epoch": 48.9191152195444, "grad_norm": 1095.6942138671875, "learning_rate": 4.98578163523139e-07, "loss": 36.4114, "step": 18522 }, { "epoch": 48.92175635523275, "grad_norm": 3829.44384765625, "learning_rate": 4.959445446005295e-07, "loss": 36.3421, "step": 18523 }, { "epoch": 48.924397490921095, "grad_norm": 1808.708984375, "learning_rate": 4.933178929321103e-07, "loss": 39.7467, "step": 18524 }, { "epoch": 48.927038626609445, "grad_norm": 572.77392578125, "learning_rate": 4.906982085912115e-07, "loss": 36.7858, "step": 18525 }, { "epoch": 48.92967976229779, "grad_norm": 1016.255859375, "learning_rate": 4.880854916509969e-07, "loss": 38.8625, "step": 18526 }, { "epoch": 48.93232089798613, "grad_norm": 2990.181884765625, "learning_rate": 4.85479742184436e-07, "loss": 42.5643, "step": 18527 }, { "epoch": 48.93496203367448, "grad_norm": 860.7553100585938, "learning_rate": 4.828809602642759e-07, "loss": 41.5916, "step": 18528 }, { "epoch": 48.93760316936282, "grad_norm": 1062.1339111328125, "learning_rate": 4.802891459630976e-07, "loss": 39.2276, "step": 18529 }, { "epoch": 48.94024430505117, "grad_norm": 1122.0257568359375, "learning_rate": 4.777042993532599e-07, "loss": 39.2511, "step": 18530 }, { "epoch": 48.942885440739516, "grad_norm": 647.6951904296875, "learning_rate": 4.751264205069272e-07, "loss": 38.3224, "step": 18531 }, { "epoch": 48.945526576427866, "grad_norm": 745.1609497070312, "learning_rate": 4.725555094961531e-07, "loss": 35.154, "step": 18532 }, { "epoch": 48.94816771211621, "grad_norm": 523.59326171875, "learning_rate": 4.6999156639265796e-07, "loss": 35.7816, "step": 18533 }, { "epoch": 48.95080884780456, "grad_norm": 537.8704223632812, "learning_rate": 4.6743459126807886e-07, "loss": 34.7614, "step": 18534 }, { "epoch": 48.9534499834929, "grad_norm": 1636.24462890625, "learning_rate": 4.6488458419380317e-07, "loss": 34.3815, "step": 18535 }, { "epoch": 48.956091119181245, "grad_norm": 756.3084106445312, "learning_rate": 4.62341545241024e-07, "loss": 34.4477, "step": 18536 }, { "epoch": 48.958732254869595, "grad_norm": 922.3719482421875, "learning_rate": 4.598054744807956e-07, "loss": 38.8843, "step": 18537 }, { "epoch": 48.96137339055794, "grad_norm": 4925.92138671875, "learning_rate": 4.5727637198392236e-07, "loss": 10.8506, "step": 18538 }, { "epoch": 48.96401452624629, "grad_norm": 1437.27783203125, "learning_rate": 4.5475423782098677e-07, "loss": 11.3465, "step": 18539 }, { "epoch": 48.96665566193463, "grad_norm": 3041.906005859375, "learning_rate": 4.5223907206246026e-07, "loss": 8.8598, "step": 18540 }, { "epoch": 48.96929679762298, "grad_norm": 1382.787353515625, "learning_rate": 4.4973087477856445e-07, "loss": 14.4712, "step": 18541 }, { "epoch": 48.971937933311324, "grad_norm": 3182.06298828125, "learning_rate": 4.472296460393266e-07, "loss": 10.9551, "step": 18542 }, { "epoch": 48.97457906899967, "grad_norm": 796.1506958007812, "learning_rate": 4.447353859146075e-07, "loss": 34.0956, "step": 18543 }, { "epoch": 48.97722020468802, "grad_norm": 1259.0389404296875, "learning_rate": 4.4224809447407365e-07, "loss": 33.3985, "step": 18544 }, { "epoch": 48.97986134037636, "grad_norm": 1042.992431640625, "learning_rate": 4.397677717871695e-07, "loss": 35.4317, "step": 18545 }, { "epoch": 48.98250247606471, "grad_norm": 626.2212524414062, "learning_rate": 4.3729441792314527e-07, "loss": 34.5722, "step": 18546 }, { "epoch": 48.98514361175305, "grad_norm": 1238.9332275390625, "learning_rate": 4.348280329510568e-07, "loss": 33.154, "step": 18547 }, { "epoch": 48.9877847474414, "grad_norm": 929.450927734375, "learning_rate": 4.3236861693979333e-07, "loss": 34.1161, "step": 18548 }, { "epoch": 48.990425883129745, "grad_norm": 3967.9775390625, "learning_rate": 4.2991616995805006e-07, "loss": 34.0003, "step": 18549 }, { "epoch": 48.99306701881809, "grad_norm": 2917.4755859375, "learning_rate": 4.2747069207427216e-07, "loss": 35.8565, "step": 18550 }, { "epoch": 48.99570815450644, "grad_norm": 901.8189086914062, "learning_rate": 4.2503218335679385e-07, "loss": 34.2792, "step": 18551 }, { "epoch": 48.99834929019478, "grad_norm": 1387.9798583984375, "learning_rate": 4.226006438736718e-07, "loss": 36.3328, "step": 18552 }, { "epoch": 49.00099042588313, "grad_norm": 24101.62890625, "learning_rate": 4.2017607369282397e-07, "loss": 38.7248, "step": 18553 }, { "epoch": 49.003631561571474, "grad_norm": 1400.3531494140625, "learning_rate": 4.177584728819184e-07, "loss": 38.1101, "step": 18554 }, { "epoch": 49.006272697259824, "grad_norm": 962.381103515625, "learning_rate": 4.1534784150851213e-07, "loss": 38.4296, "step": 18555 }, { "epoch": 49.00891383294817, "grad_norm": 1277.5693359375, "learning_rate": 4.129441796399125e-07, "loss": 37.8677, "step": 18556 }, { "epoch": 49.01155496863652, "grad_norm": 673.411865234375, "learning_rate": 4.1054748734323244e-07, "loss": 39.233, "step": 18557 }, { "epoch": 49.01419610432486, "grad_norm": 1245.5626220703125, "learning_rate": 4.081577646853907e-07, "loss": 41.6078, "step": 18558 }, { "epoch": 49.0168372400132, "grad_norm": 778.6226806640625, "learning_rate": 4.0577501173313934e-07, "loss": 42.8104, "step": 18559 }, { "epoch": 49.01947837570155, "grad_norm": 495.551025390625, "learning_rate": 4.0339922855298083e-07, "loss": 40.2946, "step": 18560 }, { "epoch": 49.022119511389896, "grad_norm": 985.537109375, "learning_rate": 4.010304152112787e-07, "loss": 41.0478, "step": 18561 }, { "epoch": 49.024760647078246, "grad_norm": 1028.101318359375, "learning_rate": 3.9866857177417447e-07, "loss": 38.8035, "step": 18562 }, { "epoch": 49.02740178276659, "grad_norm": 495.1006774902344, "learning_rate": 3.963136983076432e-07, "loss": 40.4943, "step": 18563 }, { "epoch": 49.03004291845494, "grad_norm": 855.81884765625, "learning_rate": 3.9396579487741e-07, "loss": 39.5461, "step": 18564 }, { "epoch": 49.03268405414328, "grad_norm": 998.8055419921875, "learning_rate": 3.916248615490892e-07, "loss": 36.7186, "step": 18565 }, { "epoch": 49.035325189831624, "grad_norm": 911.9259033203125, "learning_rate": 3.8929089838798947e-07, "loss": 36.4368, "step": 18566 }, { "epoch": 49.037966325519974, "grad_norm": 979.3609619140625, "learning_rate": 3.869639054593088e-07, "loss": 35.5318, "step": 18567 }, { "epoch": 49.04060746120832, "grad_norm": 932.8025512695312, "learning_rate": 3.8464388282802297e-07, "loss": 36.1503, "step": 18568 }, { "epoch": 49.04324859689667, "grad_norm": 874.5147705078125, "learning_rate": 3.8233083055894127e-07, "loss": 35.8579, "step": 18569 }, { "epoch": 49.04588973258501, "grad_norm": 1849.9656982421875, "learning_rate": 3.800247487166508e-07, "loss": 35.6128, "step": 18570 }, { "epoch": 49.04853086827336, "grad_norm": 612.9039306640625, "learning_rate": 3.777256373655169e-07, "loss": 34.3124, "step": 18571 }, { "epoch": 49.0511720039617, "grad_norm": 479.78814697265625, "learning_rate": 3.7543349656976587e-07, "loss": 34.8015, "step": 18572 }, { "epoch": 49.053813139650046, "grad_norm": 729.021728515625, "learning_rate": 3.731483263934021e-07, "loss": 34.3765, "step": 18573 }, { "epoch": 49.056454275338396, "grad_norm": 796.6586303710938, "learning_rate": 3.7087012690020795e-07, "loss": 34.7511, "step": 18574 }, { "epoch": 49.05909541102674, "grad_norm": 1404.424072265625, "learning_rate": 3.685988981538546e-07, "loss": 36.7022, "step": 18575 }, { "epoch": 49.06173654671509, "grad_norm": 3230.44921875, "learning_rate": 3.6633464021770814e-07, "loss": 13.129, "step": 18576 }, { "epoch": 49.06437768240343, "grad_norm": 605.191650390625, "learning_rate": 3.640773531550512e-07, "loss": 12.112, "step": 18577 }, { "epoch": 49.06701881809178, "grad_norm": 3288.30224609375, "learning_rate": 3.618270370288612e-07, "loss": 12.1654, "step": 18578 }, { "epoch": 49.069659953780125, "grad_norm": 1252.12744140625, "learning_rate": 3.595836919020323e-07, "loss": 9.3531, "step": 18579 }, { "epoch": 49.072301089468475, "grad_norm": 1879.1490478515625, "learning_rate": 3.5734731783715333e-07, "loss": 9.475, "step": 18580 }, { "epoch": 49.07494222515682, "grad_norm": 2907.71435546875, "learning_rate": 3.551179148967298e-07, "loss": 11.2936, "step": 18581 }, { "epoch": 49.07758336084516, "grad_norm": 3255.13720703125, "learning_rate": 3.528954831429898e-07, "loss": 11.3945, "step": 18582 }, { "epoch": 49.08022449653351, "grad_norm": 903.0411987304688, "learning_rate": 3.5068002263796694e-07, "loss": 10.7448, "step": 18583 }, { "epoch": 49.08286563222185, "grad_norm": 1334.9345703125, "learning_rate": 3.4847153344358395e-07, "loss": 11.7183, "step": 18584 }, { "epoch": 49.0855067679102, "grad_norm": 602.8538818359375, "learning_rate": 3.462700156214582e-07, "loss": 29.8776, "step": 18585 }, { "epoch": 49.088147903598546, "grad_norm": 1432.6025390625, "learning_rate": 3.440754692330961e-07, "loss": 36.0308, "step": 18586 }, { "epoch": 49.090789039286896, "grad_norm": 796.0317993164062, "learning_rate": 3.418878943397541e-07, "loss": 35.031, "step": 18587 }, { "epoch": 49.09343017497524, "grad_norm": 357.42681884765625, "learning_rate": 3.3970729100255003e-07, "loss": 34.0078, "step": 18588 }, { "epoch": 49.09607131066358, "grad_norm": 831.8681030273438, "learning_rate": 3.3753365928235194e-07, "loss": 33.9717, "step": 18589 }, { "epoch": 49.09871244635193, "grad_norm": 904.05224609375, "learning_rate": 3.353669992398889e-07, "loss": 35.5725, "step": 18590 }, { "epoch": 49.101353582040275, "grad_norm": 1312.9827880859375, "learning_rate": 3.3320731093564036e-07, "loss": 34.2589, "step": 18591 }, { "epoch": 49.103994717728625, "grad_norm": 2334.537841796875, "learning_rate": 3.310545944298915e-07, "loss": 33.829, "step": 18592 }, { "epoch": 49.10663585341697, "grad_norm": 1459.8809814453125, "learning_rate": 3.289088497827886e-07, "loss": 34.561, "step": 18593 }, { "epoch": 49.10927698910532, "grad_norm": 726.7886352539062, "learning_rate": 3.2677007705425587e-07, "loss": 34.3126, "step": 18594 }, { "epoch": 49.11191812479366, "grad_norm": 1015.2454223632812, "learning_rate": 3.246382763039679e-07, "loss": 34.4418, "step": 18595 }, { "epoch": 49.114559260482004, "grad_norm": 836.442138671875, "learning_rate": 3.225134475915159e-07, "loss": 35.4667, "step": 18596 }, { "epoch": 49.117200396170354, "grad_norm": 1090.3477783203125, "learning_rate": 3.203955909762135e-07, "loss": 34.734, "step": 18597 }, { "epoch": 49.1198415318587, "grad_norm": 874.283203125, "learning_rate": 3.182847065171801e-07, "loss": 34.3639, "step": 18598 }, { "epoch": 49.12248266754705, "grad_norm": 892.4627685546875, "learning_rate": 3.161807942733963e-07, "loss": 34.5513, "step": 18599 }, { "epoch": 49.12512380323539, "grad_norm": 1430.36962890625, "learning_rate": 3.1408385430356514e-07, "loss": 33.8793, "step": 18600 }, { "epoch": 49.12512380323539, "eval_loss": 3.7170560359954834, "eval_runtime": 2.2188, "eval_samples_per_second": 223.091, "eval_steps_per_second": 27.943, "step": 18600 }, { "epoch": 49.12776493892374, "grad_norm": 1753.1502685546875, "learning_rate": 3.119938866662786e-07, "loss": 36.1757, "step": 18601 }, { "epoch": 49.13040607461208, "grad_norm": 9823.8173828125, "learning_rate": 3.0991089141987895e-07, "loss": 37.3952, "step": 18602 }, { "epoch": 49.13304721030043, "grad_norm": 5520.2666015625, "learning_rate": 3.078348686225696e-07, "loss": 40.838, "step": 18603 }, { "epoch": 49.135688345988775, "grad_norm": 502.91485595703125, "learning_rate": 3.0576581833227645e-07, "loss": 38.7548, "step": 18604 }, { "epoch": 49.13832948167712, "grad_norm": 718.4789428710938, "learning_rate": 3.037037406067866e-07, "loss": 38.0225, "step": 18605 }, { "epoch": 49.14097061736547, "grad_norm": 1086.66943359375, "learning_rate": 3.0164863550369294e-07, "loss": 38.7758, "step": 18606 }, { "epoch": 49.14361175305381, "grad_norm": 1010.87841796875, "learning_rate": 2.996005030803939e-07, "loss": 40.2538, "step": 18607 }, { "epoch": 49.14625288874216, "grad_norm": 782.4149169921875, "learning_rate": 2.97559343394066e-07, "loss": 42.182, "step": 18608 }, { "epoch": 49.148894024430504, "grad_norm": 1065.6976318359375, "learning_rate": 2.955251565016914e-07, "loss": 39.0735, "step": 18609 }, { "epoch": 49.151535160118854, "grad_norm": 1005.9532470703125, "learning_rate": 2.934979424601136e-07, "loss": 38.5208, "step": 18610 }, { "epoch": 49.1541762958072, "grad_norm": 1285.333984375, "learning_rate": 2.9147770132589845e-07, "loss": 41.5258, "step": 18611 }, { "epoch": 49.15681743149554, "grad_norm": 813.5408325195312, "learning_rate": 2.8946443315550073e-07, "loss": 36.8765, "step": 18612 }, { "epoch": 49.15945856718389, "grad_norm": 1075.8753662109375, "learning_rate": 2.8745813800512553e-07, "loss": 37.6305, "step": 18613 }, { "epoch": 49.16209970287223, "grad_norm": 1037.4837646484375, "learning_rate": 2.854588159307836e-07, "loss": 36.1419, "step": 18614 }, { "epoch": 49.16474083856058, "grad_norm": 1754.2720947265625, "learning_rate": 2.8346646698831913e-07, "loss": 36.9062, "step": 18615 }, { "epoch": 49.167381974248926, "grad_norm": 782.8102416992188, "learning_rate": 2.814810912333543e-07, "loss": 36.2681, "step": 18616 }, { "epoch": 49.170023109937276, "grad_norm": 1057.27880859375, "learning_rate": 2.79502688721317e-07, "loss": 34.2756, "step": 18617 }, { "epoch": 49.17266424562562, "grad_norm": 1640.52734375, "learning_rate": 2.7753125950752413e-07, "loss": 35.4741, "step": 18618 }, { "epoch": 49.17530538131396, "grad_norm": 865.1473388671875, "learning_rate": 2.7556680364693165e-07, "loss": 34.7888, "step": 18619 }, { "epoch": 49.17794651700231, "grad_norm": 671.6312255859375, "learning_rate": 2.736093211944679e-07, "loss": 36.463, "step": 18620 }, { "epoch": 49.180587652690654, "grad_norm": 1028.784423828125, "learning_rate": 2.7165881220475583e-07, "loss": 34.4343, "step": 18621 }, { "epoch": 49.183228788379004, "grad_norm": 801.0017700195312, "learning_rate": 2.6971527673225196e-07, "loss": 34.212, "step": 18622 }, { "epoch": 49.18586992406735, "grad_norm": 911.6104736328125, "learning_rate": 2.677787148312738e-07, "loss": 34.8827, "step": 18623 }, { "epoch": 49.1885110597557, "grad_norm": 1710.5611572265625, "learning_rate": 2.658491265558616e-07, "loss": 38.474, "step": 18624 }, { "epoch": 49.19115219544404, "grad_norm": 1567.602783203125, "learning_rate": 2.6392651195991656e-07, "loss": 39.6946, "step": 18625 }, { "epoch": 49.19379333113239, "grad_norm": 2153.39453125, "learning_rate": 2.620108710971458e-07, "loss": 11.9412, "step": 18626 }, { "epoch": 49.19643446682073, "grad_norm": 2194.87060546875, "learning_rate": 2.6010220402097884e-07, "loss": 10.609, "step": 18627 }, { "epoch": 49.199075602509076, "grad_norm": 3143.91552734375, "learning_rate": 2.5820051078478954e-07, "loss": 14.8788, "step": 18628 }, { "epoch": 49.201716738197426, "grad_norm": 752.637939453125, "learning_rate": 2.563057914416189e-07, "loss": 10.2957, "step": 18629 }, { "epoch": 49.20435787388577, "grad_norm": 557.4451904296875, "learning_rate": 2.544180460443968e-07, "loss": 11.5447, "step": 18630 }, { "epoch": 49.20699900957412, "grad_norm": 968.3526611328125, "learning_rate": 2.525372746458587e-07, "loss": 11.6987, "step": 18631 }, { "epoch": 49.20964014526246, "grad_norm": 4637.93359375, "learning_rate": 2.506634772984906e-07, "loss": 10.1933, "step": 18632 }, { "epoch": 49.21228128095081, "grad_norm": 1868.834228515625, "learning_rate": 2.487966540546671e-07, "loss": 10.9172, "step": 18633 }, { "epoch": 49.214922416639155, "grad_norm": 1684.407958984375, "learning_rate": 2.469368049664578e-07, "loss": 10.0618, "step": 18634 }, { "epoch": 49.2175635523275, "grad_norm": 1586.372314453125, "learning_rate": 2.45083930085821e-07, "loss": 28.3491, "step": 18635 }, { "epoch": 49.22020468801585, "grad_norm": 697.6124877929688, "learning_rate": 2.432380294645209e-07, "loss": 34.0272, "step": 18636 }, { "epoch": 49.22284582370419, "grad_norm": 4184.6728515625, "learning_rate": 2.4139910315407186e-07, "loss": 35.3365, "step": 18637 }, { "epoch": 49.22548695939254, "grad_norm": 1882.8131103515625, "learning_rate": 2.395671512058217e-07, "loss": 34.7301, "step": 18638 }, { "epoch": 49.22812809508088, "grad_norm": 844.7169189453125, "learning_rate": 2.377421736709795e-07, "loss": 34.6222, "step": 18639 }, { "epoch": 49.23076923076923, "grad_norm": 1481.781982421875, "learning_rate": 2.359241706004489e-07, "loss": 33.3091, "step": 18640 }, { "epoch": 49.233410366457576, "grad_norm": 1387.97412109375, "learning_rate": 2.341131420449949e-07, "loss": 34.1332, "step": 18641 }, { "epoch": 49.23605150214592, "grad_norm": 1291.931640625, "learning_rate": 2.3230908805524364e-07, "loss": 33.8915, "step": 18642 }, { "epoch": 49.23869263783427, "grad_norm": 2582.583740234375, "learning_rate": 2.3051200868154375e-07, "loss": 33.1256, "step": 18643 }, { "epoch": 49.24133377352261, "grad_norm": 732.63623046875, "learning_rate": 2.2872190397404958e-07, "loss": 34.0129, "step": 18644 }, { "epoch": 49.24397490921096, "grad_norm": 705.3766479492188, "learning_rate": 2.2693877398277664e-07, "loss": 35.2801, "step": 18645 }, { "epoch": 49.246616044899305, "grad_norm": 1257.620361328125, "learning_rate": 2.2516261875751843e-07, "loss": 34.0265, "step": 18646 }, { "epoch": 49.249257180587655, "grad_norm": 2291.916748046875, "learning_rate": 2.2339343834787418e-07, "loss": 35.1321, "step": 18647 }, { "epoch": 49.251898316276, "grad_norm": 1506.8896484375, "learning_rate": 2.2163123280322107e-07, "loss": 33.7844, "step": 18648 }, { "epoch": 49.25453945196435, "grad_norm": 3254.024658203125, "learning_rate": 2.1987600217279746e-07, "loss": 33.9716, "step": 18649 }, { "epoch": 49.25718058765269, "grad_norm": 1077.1058349609375, "learning_rate": 2.1812774650561973e-07, "loss": 34.3402, "step": 18650 }, { "epoch": 49.259821723341034, "grad_norm": 613.832275390625, "learning_rate": 2.1638646585048215e-07, "loss": 35.4051, "step": 18651 }, { "epoch": 49.262462859029384, "grad_norm": 1252.1527099609375, "learning_rate": 2.1465216025604028e-07, "loss": 37.4558, "step": 18652 }, { "epoch": 49.26510399471773, "grad_norm": 5028.111328125, "learning_rate": 2.1292482977069982e-07, "loss": 39.7998, "step": 18653 }, { "epoch": 49.26774513040608, "grad_norm": 1079.182861328125, "learning_rate": 2.1120447444267222e-07, "loss": 38.7318, "step": 18654 }, { "epoch": 49.27038626609442, "grad_norm": 4171.8623046875, "learning_rate": 2.0949109432005786e-07, "loss": 39.202, "step": 18655 }, { "epoch": 49.27302740178277, "grad_norm": 1743.5118408203125, "learning_rate": 2.0778468945065188e-07, "loss": 37.6749, "step": 18656 }, { "epoch": 49.27566853747111, "grad_norm": 843.4593505859375, "learning_rate": 2.0608525988213833e-07, "loss": 42.1473, "step": 18657 }, { "epoch": 49.278309673159455, "grad_norm": 861.03369140625, "learning_rate": 2.0439280566192375e-07, "loss": 41.188, "step": 18658 }, { "epoch": 49.280950808847805, "grad_norm": 935.9388427734375, "learning_rate": 2.0270732683733138e-07, "loss": 42.0739, "step": 18659 }, { "epoch": 49.28359194453615, "grad_norm": 595.401123046875, "learning_rate": 2.0102882345540696e-07, "loss": 38.7706, "step": 18660 }, { "epoch": 49.2862330802245, "grad_norm": 777.4688720703125, "learning_rate": 1.9935729556300185e-07, "loss": 39.7631, "step": 18661 }, { "epoch": 49.28887421591284, "grad_norm": 855.6248168945312, "learning_rate": 1.9769274320680097e-07, "loss": 40.0001, "step": 18662 }, { "epoch": 49.29151535160119, "grad_norm": 999.9473876953125, "learning_rate": 1.9603516643326713e-07, "loss": 38.4076, "step": 18663 }, { "epoch": 49.294156487289534, "grad_norm": 443.7738952636719, "learning_rate": 1.9438456528875214e-07, "loss": 36.2073, "step": 18664 }, { "epoch": 49.29679762297788, "grad_norm": 433.23907470703125, "learning_rate": 1.9274093981927476e-07, "loss": 39.2839, "step": 18665 }, { "epoch": 49.29943875866623, "grad_norm": 1728.5283203125, "learning_rate": 1.9110429007077046e-07, "loss": 35.3691, "step": 18666 }, { "epoch": 49.30207989435457, "grad_norm": 748.7033081054688, "learning_rate": 1.894746160889249e-07, "loss": 35.9336, "step": 18667 }, { "epoch": 49.30472103004292, "grad_norm": 922.5841064453125, "learning_rate": 1.8785191791925727e-07, "loss": 37.5198, "step": 18668 }, { "epoch": 49.30736216573126, "grad_norm": 845.7070922851562, "learning_rate": 1.8623619560709238e-07, "loss": 34.6961, "step": 18669 }, { "epoch": 49.31000330141961, "grad_norm": 677.9956665039062, "learning_rate": 1.8462744919750528e-07, "loss": 35.2741, "step": 18670 }, { "epoch": 49.312644437107956, "grad_norm": 461.83782958984375, "learning_rate": 1.8302567873546006e-07, "loss": 35.0209, "step": 18671 }, { "epoch": 49.315285572796306, "grad_norm": 1217.2205810546875, "learning_rate": 1.8143088426567089e-07, "loss": 34.2907, "step": 18672 }, { "epoch": 49.31792670848465, "grad_norm": 2066.934814453125, "learning_rate": 1.798430658326855e-07, "loss": 35.1195, "step": 18673 }, { "epoch": 49.32056784417299, "grad_norm": 982.3050537109375, "learning_rate": 1.7826222348082956e-07, "loss": 34.4816, "step": 18674 }, { "epoch": 49.32320897986134, "grad_norm": 6276.86328125, "learning_rate": 1.7668835725423437e-07, "loss": 24.2282, "step": 18675 }, { "epoch": 49.325850115549684, "grad_norm": 1250.989990234375, "learning_rate": 1.7512146719686483e-07, "loss": 12.1603, "step": 18676 }, { "epoch": 49.328491251238034, "grad_norm": 92201.046875, "learning_rate": 1.7356155335249146e-07, "loss": 10.6915, "step": 18677 }, { "epoch": 49.33113238692638, "grad_norm": 3985.151611328125, "learning_rate": 1.72008615764635e-07, "loss": 14.4647, "step": 18678 }, { "epoch": 49.33377352261473, "grad_norm": 764.5953369140625, "learning_rate": 1.7046265447667742e-07, "loss": 10.2489, "step": 18679 }, { "epoch": 49.33641465830307, "grad_norm": 2474.50390625, "learning_rate": 1.6892366953180638e-07, "loss": 10.7513, "step": 18680 }, { "epoch": 49.33905579399141, "grad_norm": 4499.3876953125, "learning_rate": 1.6739166097298752e-07, "loss": 14.2582, "step": 18681 }, { "epoch": 49.34169692967976, "grad_norm": 982.6395874023438, "learning_rate": 1.658666288429922e-07, "loss": 12.3555, "step": 18682 }, { "epoch": 49.344338065368106, "grad_norm": 924.7936401367188, "learning_rate": 1.6434857318439746e-07, "loss": 8.8025, "step": 18683 }, { "epoch": 49.346979201056456, "grad_norm": 865.5537719726562, "learning_rate": 1.6283749403961378e-07, "loss": 24.4585, "step": 18684 }, { "epoch": 49.3496203367448, "grad_norm": 1218.5489501953125, "learning_rate": 1.613333914508297e-07, "loss": 34.8706, "step": 18685 }, { "epoch": 49.35226147243315, "grad_norm": 720.201416015625, "learning_rate": 1.5983626546006712e-07, "loss": 35.3728, "step": 18686 }, { "epoch": 49.35490260812149, "grad_norm": 2409.991943359375, "learning_rate": 1.5834611610909822e-07, "loss": 32.5889, "step": 18687 }, { "epoch": 49.357543743809835, "grad_norm": 1174.3743896484375, "learning_rate": 1.5686294343952857e-07, "loss": 34.4758, "step": 18688 }, { "epoch": 49.360184879498185, "grad_norm": 1024.1956787109375, "learning_rate": 1.55386747492825e-07, "loss": 34.594, "step": 18689 }, { "epoch": 49.36282601518653, "grad_norm": 1867.9378662109375, "learning_rate": 1.539175283101768e-07, "loss": 33.8926, "step": 18690 }, { "epoch": 49.36546715087488, "grad_norm": 672.186279296875, "learning_rate": 1.5245528593260673e-07, "loss": 33.981, "step": 18691 }, { "epoch": 49.36810828656322, "grad_norm": 1671.85400390625, "learning_rate": 1.5100002040094319e-07, "loss": 34.2238, "step": 18692 }, { "epoch": 49.37074942225157, "grad_norm": 813.189697265625, "learning_rate": 1.4955173175584812e-07, "loss": 35.4729, "step": 18693 }, { "epoch": 49.37339055793991, "grad_norm": 9857.306640625, "learning_rate": 1.4811042003776143e-07, "loss": 35.4194, "step": 18694 }, { "epoch": 49.37603169362826, "grad_norm": 1158.1981201171875, "learning_rate": 1.466760852869009e-07, "loss": 35.8042, "step": 18695 }, { "epoch": 49.378672829316606, "grad_norm": 1652.43212890625, "learning_rate": 1.452487275433456e-07, "loss": 35.0496, "step": 18696 }, { "epoch": 49.38131396500495, "grad_norm": 1126.9371337890625, "learning_rate": 1.4382834684695255e-07, "loss": 34.2648, "step": 18697 }, { "epoch": 49.3839551006933, "grad_norm": 2344.557861328125, "learning_rate": 1.424149432373567e-07, "loss": 33.4364, "step": 18698 }, { "epoch": 49.38659623638164, "grad_norm": 667.9781494140625, "learning_rate": 1.41008516754082e-07, "loss": 32.6971, "step": 18699 }, { "epoch": 49.38923737206999, "grad_norm": 680.4779052734375, "learning_rate": 1.3960906743634706e-07, "loss": 35.4895, "step": 18700 }, { "epoch": 49.391878507758335, "grad_norm": 937.8045043945312, "learning_rate": 1.3821659532325947e-07, "loss": 36.844, "step": 18701 }, { "epoch": 49.394519643446685, "grad_norm": 903.4741821289062, "learning_rate": 1.3683110045370484e-07, "loss": 36.878, "step": 18702 }, { "epoch": 49.39716077913503, "grad_norm": 2453.75537109375, "learning_rate": 1.3545258286634666e-07, "loss": 39.2531, "step": 18703 }, { "epoch": 49.39980191482337, "grad_norm": 810.6607055664062, "learning_rate": 1.3408104259970966e-07, "loss": 37.9769, "step": 18704 }, { "epoch": 49.40244305051172, "grad_norm": 547.7814331054688, "learning_rate": 1.3271647969209654e-07, "loss": 38.2646, "step": 18705 }, { "epoch": 49.405084186200064, "grad_norm": 932.1572875976562, "learning_rate": 1.313588941815602e-07, "loss": 38.9284, "step": 18706 }, { "epoch": 49.407725321888414, "grad_norm": 914.15185546875, "learning_rate": 1.300082861060703e-07, "loss": 39.9498, "step": 18707 }, { "epoch": 49.41036645757676, "grad_norm": 483.6825866699219, "learning_rate": 1.2866465550329108e-07, "loss": 41.119, "step": 18708 }, { "epoch": 49.41300759326511, "grad_norm": 552.2621459960938, "learning_rate": 1.2732800241080367e-07, "loss": 41.2935, "step": 18709 }, { "epoch": 49.41564872895345, "grad_norm": 728.091552734375, "learning_rate": 1.2599832686588375e-07, "loss": 41.4129, "step": 18710 }, { "epoch": 49.41828986464179, "grad_norm": 1125.4437255859375, "learning_rate": 1.246756289056683e-07, "loss": 38.9578, "step": 18711 }, { "epoch": 49.42093100033014, "grad_norm": 833.8347778320312, "learning_rate": 1.233599085671e-07, "loss": 40.1691, "step": 18712 }, { "epoch": 49.423572136018485, "grad_norm": 1350.01953125, "learning_rate": 1.2205116588695496e-07, "loss": 39.3482, "step": 18713 }, { "epoch": 49.426213271706835, "grad_norm": 439.1014404296875, "learning_rate": 1.2074940090170407e-07, "loss": 37.9774, "step": 18714 }, { "epoch": 49.42885440739518, "grad_norm": 1044.0604248046875, "learning_rate": 1.1945461364776255e-07, "loss": 36.721, "step": 18715 }, { "epoch": 49.43149554308353, "grad_norm": 1076.7813720703125, "learning_rate": 1.1816680416124048e-07, "loss": 36.6694, "step": 18716 }, { "epoch": 49.43413667877187, "grad_norm": 2263.30859375, "learning_rate": 1.1688597247813682e-07, "loss": 35.8712, "step": 18717 }, { "epoch": 49.43677781446022, "grad_norm": 774.5147705078125, "learning_rate": 1.1561211863420073e-07, "loss": 35.6854, "step": 18718 }, { "epoch": 49.439418950148564, "grad_norm": 431.1009826660156, "learning_rate": 1.143452426650149e-07, "loss": 35.516, "step": 18719 }, { "epoch": 49.44206008583691, "grad_norm": 615.243896484375, "learning_rate": 1.1308534460593988e-07, "loss": 37.1697, "step": 18720 }, { "epoch": 49.44470122152526, "grad_norm": 949.8079833984375, "learning_rate": 1.118324244921698e-07, "loss": 34.47, "step": 18721 }, { "epoch": 49.4473423572136, "grad_norm": 592.4403686523438, "learning_rate": 1.1058648235867663e-07, "loss": 34.7989, "step": 18722 }, { "epoch": 49.44998349290195, "grad_norm": 1116.461669921875, "learning_rate": 1.0934751824026589e-07, "loss": 34.7612, "step": 18723 }, { "epoch": 49.45262462859029, "grad_norm": 684.4776611328125, "learning_rate": 1.0811553217154879e-07, "loss": 35.0268, "step": 18724 }, { "epoch": 49.45526576427864, "grad_norm": 1762.8665771484375, "learning_rate": 1.0689052418688672e-07, "loss": 15.0901, "step": 18725 }, { "epoch": 49.457906899966986, "grad_norm": 1700.4276123046875, "learning_rate": 1.0567249432053005e-07, "loss": 10.9446, "step": 18726 }, { "epoch": 49.46054803565533, "grad_norm": 424.3656311035156, "learning_rate": 1.0446144260645164e-07, "loss": 7.0379, "step": 18727 }, { "epoch": 49.46318917134368, "grad_norm": 843.7637939453125, "learning_rate": 1.0325736907851324e-07, "loss": 14.4325, "step": 18728 }, { "epoch": 49.46583030703202, "grad_norm": 5020.55029296875, "learning_rate": 1.0206027377029913e-07, "loss": 13.6138, "step": 18729 }, { "epoch": 49.46847144272037, "grad_norm": 4032.882080078125, "learning_rate": 1.0087015671528254e-07, "loss": 14.0224, "step": 18730 }, { "epoch": 49.471112578408714, "grad_norm": 561.978515625, "learning_rate": 9.968701794663138e-08, "loss": 11.028, "step": 18731 }, { "epoch": 49.473753714097064, "grad_norm": 3610.879638671875, "learning_rate": 9.851085749745803e-08, "loss": 11.5864, "step": 18732 }, { "epoch": 49.47639484978541, "grad_norm": 12500.2587890625, "learning_rate": 9.734167540054184e-08, "loss": 8.5623, "step": 18733 }, { "epoch": 49.47903598547375, "grad_norm": 662.7359008789062, "learning_rate": 9.617947168855113e-08, "loss": 21.6047, "step": 18734 }, { "epoch": 49.4816771211621, "grad_norm": 979.4824829101562, "learning_rate": 9.50242463939599e-08, "loss": 35.0861, "step": 18735 }, { "epoch": 49.48431825685044, "grad_norm": 1019.2941284179688, "learning_rate": 9.387599954902015e-08, "loss": 34.0156, "step": 18736 }, { "epoch": 49.48695939253879, "grad_norm": 1022.4595947265625, "learning_rate": 9.273473118578956e-08, "loss": 34.087, "step": 18737 }, { "epoch": 49.489600528227136, "grad_norm": 1431.5462646484375, "learning_rate": 9.160044133613155e-08, "loss": 33.5155, "step": 18738 }, { "epoch": 49.492241663915486, "grad_norm": 1126.647216796875, "learning_rate": 9.047313003174296e-08, "loss": 33.9429, "step": 18739 }, { "epoch": 49.49488279960383, "grad_norm": 1490.418701171875, "learning_rate": 8.935279730407086e-08, "loss": 34.2645, "step": 18740 }, { "epoch": 49.49752393529218, "grad_norm": 978.4599609375, "learning_rate": 8.823944318442356e-08, "loss": 33.6217, "step": 18741 }, { "epoch": 49.50016507098052, "grad_norm": 1336.7052001953125, "learning_rate": 8.713306770385953e-08, "loss": 33.7274, "step": 18742 }, { "epoch": 49.502806206668865, "grad_norm": 1794.14013671875, "learning_rate": 8.603367089332626e-08, "loss": 33.7911, "step": 18743 }, { "epoch": 49.505447342357215, "grad_norm": 1652.060546875, "learning_rate": 8.494125278349362e-08, "loss": 35.07, "step": 18744 }, { "epoch": 49.50808847804556, "grad_norm": 2043.781005859375, "learning_rate": 8.385581340486504e-08, "loss": 34.3972, "step": 18745 }, { "epoch": 49.51072961373391, "grad_norm": 688.0961303710938, "learning_rate": 8.277735278774956e-08, "loss": 34.1847, "step": 18746 }, { "epoch": 49.51337074942225, "grad_norm": 2141.064208984375, "learning_rate": 8.170587096226201e-08, "loss": 33.5246, "step": 18747 }, { "epoch": 49.5160118851106, "grad_norm": 1865.4815673828125, "learning_rate": 8.064136795835065e-08, "loss": 34.1978, "step": 18748 }, { "epoch": 49.51865302079894, "grad_norm": 886.9270629882812, "learning_rate": 7.958384380568618e-08, "loss": 35.1248, "step": 18749 }, { "epoch": 49.521294156487286, "grad_norm": 718.0796508789062, "learning_rate": 7.853329853385604e-08, "loss": 34.3018, "step": 18750 }, { "epoch": 49.523935292175636, "grad_norm": 885.7742309570312, "learning_rate": 7.748973217217015e-08, "loss": 35.4516, "step": 18751 }, { "epoch": 49.52657642786398, "grad_norm": 4383.9892578125, "learning_rate": 7.645314474974408e-08, "loss": 36.4724, "step": 18752 }, { "epoch": 49.52921756355233, "grad_norm": 2660.91796875, "learning_rate": 7.54235362955824e-08, "loss": 41.2825, "step": 18753 }, { "epoch": 49.53185869924067, "grad_norm": 731.4221801757812, "learning_rate": 7.44009068383844e-08, "loss": 38.4303, "step": 18754 }, { "epoch": 49.53449983492902, "grad_norm": 1216.79638671875, "learning_rate": 7.338525640673832e-08, "loss": 38.0776, "step": 18755 }, { "epoch": 49.537140970617365, "grad_norm": 3243.434814453125, "learning_rate": 7.237658502901034e-08, "loss": 38.777, "step": 18756 }, { "epoch": 49.53978210630571, "grad_norm": 1995.1502685546875, "learning_rate": 7.137489273331688e-08, "loss": 40.8015, "step": 18757 }, { "epoch": 49.54242324199406, "grad_norm": 6441.77001953125, "learning_rate": 7.038017954769105e-08, "loss": 40.2426, "step": 18758 }, { "epoch": 49.5450643776824, "grad_norm": 917.9876708984375, "learning_rate": 6.939244549986068e-08, "loss": 39.8854, "step": 18759 }, { "epoch": 49.54770551337075, "grad_norm": 778.2710571289062, "learning_rate": 6.841169061744257e-08, "loss": 41.8411, "step": 18760 }, { "epoch": 49.550346649059094, "grad_norm": 759.8295288085938, "learning_rate": 6.743791492780371e-08, "loss": 39.0365, "step": 18761 }, { "epoch": 49.552987784747444, "grad_norm": 822.1123046875, "learning_rate": 6.647111845814457e-08, "loss": 38.6434, "step": 18762 }, { "epoch": 49.55562892043579, "grad_norm": 861.2316284179688, "learning_rate": 6.551130123547134e-08, "loss": 39.4502, "step": 18763 }, { "epoch": 49.55827005612414, "grad_norm": 1124.6732177734375, "learning_rate": 6.455846328656812e-08, "loss": 39.0567, "step": 18764 }, { "epoch": 49.56091119181248, "grad_norm": 1170.3209228515625, "learning_rate": 6.361260463802476e-08, "loss": 37.959, "step": 18765 }, { "epoch": 49.56355232750082, "grad_norm": 638.6817626953125, "learning_rate": 6.267372531632009e-08, "loss": 36.5998, "step": 18766 }, { "epoch": 49.56619346318917, "grad_norm": 394.4034118652344, "learning_rate": 6.174182534759987e-08, "loss": 35.0149, "step": 18767 }, { "epoch": 49.568834598877515, "grad_norm": 1276.92333984375, "learning_rate": 6.081690475792657e-08, "loss": 34.7539, "step": 18768 }, { "epoch": 49.571475734565865, "grad_norm": 431.4994201660156, "learning_rate": 5.989896357314062e-08, "loss": 35.4435, "step": 18769 }, { "epoch": 49.57411687025421, "grad_norm": 496.62518310546875, "learning_rate": 5.898800181883268e-08, "loss": 34.9462, "step": 18770 }, { "epoch": 49.57675800594256, "grad_norm": 1936.6746826171875, "learning_rate": 5.808401952045461e-08, "loss": 35.0995, "step": 18771 }, { "epoch": 49.5793991416309, "grad_norm": 1257.4107666015625, "learning_rate": 5.718701670326398e-08, "loss": 34.6406, "step": 18772 }, { "epoch": 49.582040277319244, "grad_norm": 1113.4862060546875, "learning_rate": 5.629699339229633e-08, "loss": 34.497, "step": 18773 }, { "epoch": 49.584681413007594, "grad_norm": 488.90869140625, "learning_rate": 5.541394961242063e-08, "loss": 36.5658, "step": 18774 }, { "epoch": 49.58732254869594, "grad_norm": 4740.6953125, "learning_rate": 5.453788538828386e-08, "loss": 37.7827, "step": 18775 }, { "epoch": 49.58996368438429, "grad_norm": 623.5672607421875, "learning_rate": 5.366880074436642e-08, "loss": 11.1303, "step": 18776 }, { "epoch": 49.59260482007263, "grad_norm": 1121.445068359375, "learning_rate": 5.2806695704898936e-08, "loss": 12.6313, "step": 18777 }, { "epoch": 49.59524595576098, "grad_norm": 1741.7803955078125, "learning_rate": 5.1951570293973236e-08, "loss": 10.5553, "step": 18778 }, { "epoch": 49.59788709144932, "grad_norm": 576.5365600585938, "learning_rate": 5.1103424535486884e-08, "loss": 11.0662, "step": 18779 }, { "epoch": 49.600528227137666, "grad_norm": 3109.13818359375, "learning_rate": 5.026225845308763e-08, "loss": 7.3437, "step": 18780 }, { "epoch": 49.603169362826016, "grad_norm": 1656.306640625, "learning_rate": 4.942807207031219e-08, "loss": 11.1757, "step": 18781 }, { "epoch": 49.60581049851436, "grad_norm": 2057.439208984375, "learning_rate": 4.860086541041975e-08, "loss": 8.7256, "step": 18782 }, { "epoch": 49.60845163420271, "grad_norm": 874.270751953125, "learning_rate": 4.7780638496530694e-08, "loss": 10.6673, "step": 18783 }, { "epoch": 49.61109276989105, "grad_norm": 1539.79443359375, "learning_rate": 4.696739135151562e-08, "loss": 13.1183, "step": 18784 }, { "epoch": 49.6137339055794, "grad_norm": 646.902099609375, "learning_rate": 4.6161123998106344e-08, "loss": 21.911, "step": 18785 }, { "epoch": 49.616375041267744, "grad_norm": 1037.9320068359375, "learning_rate": 4.53618364588404e-08, "loss": 35.2551, "step": 18786 }, { "epoch": 49.619016176956094, "grad_norm": 3215.509033203125, "learning_rate": 4.456952875600551e-08, "loss": 33.0624, "step": 18787 }, { "epoch": 49.62165731264444, "grad_norm": 1019.3779296875, "learning_rate": 4.378420091172286e-08, "loss": 34.8086, "step": 18788 }, { "epoch": 49.62429844833278, "grad_norm": 1734.385009765625, "learning_rate": 4.300585294791936e-08, "loss": 34.3896, "step": 18789 }, { "epoch": 49.62693958402113, "grad_norm": 962.2691650390625, "learning_rate": 4.223448488638315e-08, "loss": 34.1329, "step": 18790 }, { "epoch": 49.62958071970947, "grad_norm": 890.2247314453125, "learning_rate": 4.147009674859703e-08, "loss": 34.7637, "step": 18791 }, { "epoch": 49.63222185539782, "grad_norm": 1075.4420166015625, "learning_rate": 4.07126885559328e-08, "loss": 34.7592, "step": 18792 }, { "epoch": 49.634862991086166, "grad_norm": 3411.531982421875, "learning_rate": 3.996226032951245e-08, "loss": 34.5126, "step": 18793 }, { "epoch": 49.637504126774516, "grad_norm": 1364.6939697265625, "learning_rate": 3.921881209031919e-08, "loss": 34.8321, "step": 18794 }, { "epoch": 49.64014526246286, "grad_norm": 575.0676879882812, "learning_rate": 3.84823438591142e-08, "loss": 33.9467, "step": 18795 }, { "epoch": 49.6427863981512, "grad_norm": 760.1478271484375, "learning_rate": 3.7752855656464355e-08, "loss": 36.0237, "step": 18796 }, { "epoch": 49.64542753383955, "grad_norm": 1882.7838134765625, "learning_rate": 3.70303475027145e-08, "loss": 33.6873, "step": 18797 }, { "epoch": 49.648068669527895, "grad_norm": 1150.13037109375, "learning_rate": 3.631481941804293e-08, "loss": 34.9491, "step": 18798 }, { "epoch": 49.650709805216245, "grad_norm": 911.4730224609375, "learning_rate": 3.560627142246142e-08, "loss": 34.6115, "step": 18799 }, { "epoch": 49.65335094090459, "grad_norm": 832.4959716796875, "learning_rate": 3.490470353573194e-08, "loss": 34.2436, "step": 18800 }, { "epoch": 49.65335094090459, "eval_loss": 3.8237195014953613, "eval_runtime": 2.208, "eval_samples_per_second": 224.185, "eval_steps_per_second": 28.08, "step": 18800 } ], "logging_steps": 1, "max_steps": 18900, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.669841698770125e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }