{ "best_metric": 2.4781594276428223, "best_model_checkpoint": "outputs/checkpoint-3125", "epoch": 0.025, "eval_steps": 3125, "global_step": 3125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8e-06, "grad_norm": 1.671875, "learning_rate": 4e-05, "loss": 2.2353, "step": 1 }, { "epoch": 1.6e-05, "grad_norm": 1.6171875, "learning_rate": 8e-05, "loss": 2.2466, "step": 2 }, { "epoch": 2.4e-05, "grad_norm": 1.8125, "learning_rate": 0.00012, "loss": 2.0724, "step": 3 }, { "epoch": 3.2e-05, "grad_norm": 1.40625, "learning_rate": 0.00016, "loss": 2.273, "step": 4 }, { "epoch": 4e-05, "grad_norm": 1.65625, "learning_rate": 0.0002, "loss": 2.1555, "step": 5 }, { "epoch": 4.8e-05, "grad_norm": 1.484375, "learning_rate": 0.00024, "loss": 2.0545, "step": 6 }, { "epoch": 5.6e-05, "grad_norm": 1.34375, "learning_rate": 0.00028000000000000003, "loss": 2.2388, "step": 7 }, { "epoch": 6.4e-05, "grad_norm": 1.578125, "learning_rate": 0.00032, "loss": 2.1948, "step": 8 }, { "epoch": 7.2e-05, "grad_norm": 2.25, "learning_rate": 0.00035999999999999997, "loss": 2.1208, "step": 9 }, { "epoch": 8e-05, "grad_norm": 1.46875, "learning_rate": 0.0004, "loss": 2.1646, "step": 10 }, { "epoch": 8.8e-05, "grad_norm": 1.53125, "learning_rate": 0.00044, "loss": 2.3298, "step": 11 }, { "epoch": 9.6e-05, "grad_norm": 1.5703125, "learning_rate": 0.00048, "loss": 2.2796, "step": 12 }, { "epoch": 0.000104, "grad_norm": 1.9296875, "learning_rate": 0.0005200000000000001, "loss": 2.1952, "step": 13 }, { "epoch": 0.000112, "grad_norm": 1.765625, "learning_rate": 0.0005600000000000001, "loss": 2.2962, "step": 14 }, { "epoch": 0.00012, "grad_norm": 1.484375, "learning_rate": 0.0006, "loss": 2.2255, "step": 15 }, { "epoch": 0.000128, "grad_norm": 1.3828125, "learning_rate": 0.00064, "loss": 2.2761, "step": 16 }, { "epoch": 0.000136, "grad_norm": 1.6171875, "learning_rate": 0.00068, "loss": 2.0533, "step": 17 }, { "epoch": 0.000144, "grad_norm": 1.3984375, "learning_rate": 0.0007199999999999999, "loss": 1.9831, "step": 18 }, { "epoch": 0.000152, "grad_norm": 1.4453125, "learning_rate": 0.00076, "loss": 2.03, "step": 19 }, { "epoch": 0.00016, "grad_norm": 1.4140625, "learning_rate": 0.0008, "loss": 2.2509, "step": 20 }, { "epoch": 0.000168, "grad_norm": 1.75, "learning_rate": 0.00084, "loss": 2.1374, "step": 21 }, { "epoch": 0.000176, "grad_norm": 1.5390625, "learning_rate": 0.00088, "loss": 2.1474, "step": 22 }, { "epoch": 0.000184, "grad_norm": 1.5625, "learning_rate": 0.00092, "loss": 2.317, "step": 23 }, { "epoch": 0.000192, "grad_norm": 1.65625, "learning_rate": 0.00096, "loss": 2.2492, "step": 24 }, { "epoch": 0.0002, "grad_norm": 1.546875, "learning_rate": 0.001, "loss": 1.9702, "step": 25 }, { "epoch": 0.000208, "grad_norm": 1.640625, "learning_rate": 0.0009999999841452775, "loss": 2.177, "step": 26 }, { "epoch": 0.000216, "grad_norm": 1.484375, "learning_rate": 0.0009999999365811114, "loss": 2.1614, "step": 27 }, { "epoch": 0.000224, "grad_norm": 1.4375, "learning_rate": 0.0009999998573075042, "loss": 2.3617, "step": 28 }, { "epoch": 0.000232, "grad_norm": 1.75, "learning_rate": 0.0009999997463244613, "loss": 1.9932, "step": 29 }, { "epoch": 0.00024, "grad_norm": 1.796875, "learning_rate": 0.0009999996036319896, "loss": 2.1507, "step": 30 }, { "epoch": 0.000248, "grad_norm": 1.546875, "learning_rate": 0.0009999994292300982, "loss": 2.057, "step": 31 }, { "epoch": 0.000256, "grad_norm": 1.8046875, "learning_rate": 0.000999999223118798, "loss": 2.1284, 
"step": 32 }, { "epoch": 0.000264, "grad_norm": 1.9375, "learning_rate": 0.0009999989852981024, "loss": 2.1191, "step": 33 }, { "epoch": 0.000272, "grad_norm": 1.9296875, "learning_rate": 0.0009999987157680262, "loss": 2.0103, "step": 34 }, { "epoch": 0.00028, "grad_norm": 1.640625, "learning_rate": 0.0009999984145285866, "loss": 1.7998, "step": 35 }, { "epoch": 0.000288, "grad_norm": 1.8125, "learning_rate": 0.0009999980815798028, "loss": 1.9909, "step": 36 }, { "epoch": 0.000296, "grad_norm": 1.6953125, "learning_rate": 0.0009999977169216957, "loss": 2.24, "step": 37 }, { "epoch": 0.000304, "grad_norm": 1.8359375, "learning_rate": 0.0009999973205542885, "loss": 1.9391, "step": 38 }, { "epoch": 0.000312, "grad_norm": 1.8671875, "learning_rate": 0.0009999968924776063, "loss": 1.8929, "step": 39 }, { "epoch": 0.00032, "grad_norm": 1.5859375, "learning_rate": 0.0009999964326916765, "loss": 2.0016, "step": 40 }, { "epoch": 0.000328, "grad_norm": 1.921875, "learning_rate": 0.000999995941196528, "loss": 2.1935, "step": 41 }, { "epoch": 0.000336, "grad_norm": 1.9140625, "learning_rate": 0.0009999954179921921, "loss": 2.1559, "step": 42 }, { "epoch": 0.000344, "grad_norm": 3.015625, "learning_rate": 0.000999994863078702, "loss": 1.9318, "step": 43 }, { "epoch": 0.000352, "grad_norm": 1.828125, "learning_rate": 0.0009999942764560925, "loss": 1.9298, "step": 44 }, { "epoch": 0.00036, "grad_norm": 1.4921875, "learning_rate": 0.0009999936581244013, "loss": 1.9234, "step": 45 }, { "epoch": 0.000368, "grad_norm": 28.5, "learning_rate": 0.0009999930080836674, "loss": 1.9601, "step": 46 }, { "epoch": 0.000376, "grad_norm": 2.015625, "learning_rate": 0.0009999923263339322, "loss": 2.0047, "step": 47 }, { "epoch": 0.000384, "grad_norm": 2.375, "learning_rate": 0.0009999916128752388, "loss": 2.0174, "step": 48 }, { "epoch": 0.000392, "grad_norm": 2.0, "learning_rate": 0.0009999908677076324, "loss": 1.7112, "step": 49 }, { "epoch": 0.0004, "grad_norm": 2.625, "learning_rate": 0.0009999900908311602, "loss": 2.126, "step": 50 }, { "epoch": 0.000408, "grad_norm": 2.125, "learning_rate": 0.0009999892822458716, "loss": 1.8007, "step": 51 }, { "epoch": 0.000416, "grad_norm": 2.359375, "learning_rate": 0.0009999884419518179, "loss": 1.7955, "step": 52 }, { "epoch": 0.000424, "grad_norm": 3.109375, "learning_rate": 0.0009999875699490523, "loss": 2.2763, "step": 53 }, { "epoch": 0.000432, "grad_norm": 1.90625, "learning_rate": 0.0009999866662376302, "loss": 1.7904, "step": 54 }, { "epoch": 0.00044, "grad_norm": 2.1875, "learning_rate": 0.0009999857308176088, "loss": 1.9445, "step": 55 }, { "epoch": 0.000448, "grad_norm": 2.015625, "learning_rate": 0.0009999847636890475, "loss": 1.5594, "step": 56 }, { "epoch": 0.000456, "grad_norm": 2.03125, "learning_rate": 0.0009999837648520075, "loss": 1.6301, "step": 57 }, { "epoch": 0.000464, "grad_norm": 2.34375, "learning_rate": 0.0009999827343065524, "loss": 2.0072, "step": 58 }, { "epoch": 0.000472, "grad_norm": 2.375, "learning_rate": 0.0009999816720527475, "loss": 1.8644, "step": 59 }, { "epoch": 0.00048, "grad_norm": 2.171875, "learning_rate": 0.00099998057809066, "loss": 1.7375, "step": 60 }, { "epoch": 0.000488, "grad_norm": 2.21875, "learning_rate": 0.0009999794524203592, "loss": 2.0267, "step": 61 }, { "epoch": 0.000496, "grad_norm": 2.140625, "learning_rate": 0.0009999782950419168, "loss": 2.1871, "step": 62 }, { "epoch": 0.000504, "grad_norm": 1.6953125, "learning_rate": 0.0009999771059554059, "loss": 1.6625, "step": 63 }, { "epoch": 0.000512, "grad_norm": 2.046875, 
"learning_rate": 0.0009999758851609022, "loss": 1.6938, "step": 64 }, { "epoch": 0.00052, "grad_norm": 1.984375, "learning_rate": 0.000999974632658483, "loss": 1.6608, "step": 65 }, { "epoch": 0.000528, "grad_norm": 1.65625, "learning_rate": 0.0009999733484482277, "loss": 1.5917, "step": 66 }, { "epoch": 0.000536, "grad_norm": 6.375, "learning_rate": 0.0009999720325302177, "loss": 3.0894, "step": 67 }, { "epoch": 0.000544, "grad_norm": 1.8515625, "learning_rate": 0.0009999706849045365, "loss": 1.7119, "step": 68 }, { "epoch": 0.000552, "grad_norm": 2.109375, "learning_rate": 0.0009999693055712695, "loss": 2.0265, "step": 69 }, { "epoch": 0.00056, "grad_norm": 1.8203125, "learning_rate": 0.0009999678945305044, "loss": 1.8266, "step": 70 }, { "epoch": 0.000568, "grad_norm": 2.25, "learning_rate": 0.0009999664517823304, "loss": 1.6983, "step": 71 }, { "epoch": 0.000576, "grad_norm": 2.1875, "learning_rate": 0.0009999649773268391, "loss": 2.1469, "step": 72 }, { "epoch": 0.000584, "grad_norm": 2.15625, "learning_rate": 0.0009999634711641242, "loss": 1.6582, "step": 73 }, { "epoch": 0.000592, "grad_norm": 2.4375, "learning_rate": 0.000999961933294281, "loss": 2.2505, "step": 74 }, { "epoch": 0.0006, "grad_norm": 2.375, "learning_rate": 0.0009999603637174071, "loss": 1.6958, "step": 75 }, { "epoch": 0.000608, "grad_norm": 1.8203125, "learning_rate": 0.0009999587624336022, "loss": 1.4917, "step": 76 }, { "epoch": 0.000616, "grad_norm": 2.171875, "learning_rate": 0.0009999571294429673, "loss": 2.3751, "step": 77 }, { "epoch": 0.000624, "grad_norm": 2.3125, "learning_rate": 0.0009999554647456067, "loss": 1.9417, "step": 78 }, { "epoch": 0.000632, "grad_norm": 2.078125, "learning_rate": 0.0009999537683416254, "loss": 1.9714, "step": 79 }, { "epoch": 0.00064, "grad_norm": 1.921875, "learning_rate": 0.0009999520402311313, "loss": 2.0728, "step": 80 }, { "epoch": 0.000648, "grad_norm": 2.1875, "learning_rate": 0.0009999502804142338, "loss": 2.105, "step": 81 }, { "epoch": 0.000656, "grad_norm": 2.546875, "learning_rate": 0.0009999484888910447, "loss": 1.8865, "step": 82 }, { "epoch": 0.000664, "grad_norm": 1.90625, "learning_rate": 0.0009999466656616775, "loss": 1.7919, "step": 83 }, { "epoch": 0.000672, "grad_norm": 2.109375, "learning_rate": 0.0009999448107262479, "loss": 1.9277, "step": 84 }, { "epoch": 0.00068, "grad_norm": 1.921875, "learning_rate": 0.0009999429240848733, "loss": 1.7681, "step": 85 }, { "epoch": 0.000688, "grad_norm": 2.203125, "learning_rate": 0.0009999410057376736, "loss": 1.9214, "step": 86 }, { "epoch": 0.000696, "grad_norm": 2.171875, "learning_rate": 0.0009999390556847704, "loss": 2.3393, "step": 87 }, { "epoch": 0.000704, "grad_norm": 2.03125, "learning_rate": 0.0009999370739262876, "loss": 2.0304, "step": 88 }, { "epoch": 0.000712, "grad_norm": 2.40625, "learning_rate": 0.0009999350604623504, "loss": 2.2334, "step": 89 }, { "epoch": 0.00072, "grad_norm": 2.046875, "learning_rate": 0.0009999330152930868, "loss": 1.7739, "step": 90 }, { "epoch": 0.000728, "grad_norm": 2.484375, "learning_rate": 0.0009999309384186264, "loss": 2.692, "step": 91 }, { "epoch": 0.000736, "grad_norm": 2.53125, "learning_rate": 0.0009999288298391008, "loss": 2.4189, "step": 92 }, { "epoch": 0.000744, "grad_norm": 2.34375, "learning_rate": 0.0009999266895546442, "loss": 2.9039, "step": 93 }, { "epoch": 0.000752, "grad_norm": 2.734375, "learning_rate": 0.000999924517565392, "loss": 2.078, "step": 94 }, { "epoch": 0.00076, "grad_norm": 2.234375, "learning_rate": 0.0009999223138714816, "loss": 2.3612, 
"step": 95 }, { "epoch": 0.000768, "grad_norm": 1.984375, "learning_rate": 0.0009999200784730534, "loss": 2.2161, "step": 96 }, { "epoch": 0.000776, "grad_norm": 2.09375, "learning_rate": 0.0009999178113702487, "loss": 1.9707, "step": 97 }, { "epoch": 0.000784, "grad_norm": 1.7890625, "learning_rate": 0.0009999155125632116, "loss": 2.1359, "step": 98 }, { "epoch": 0.000792, "grad_norm": 2.234375, "learning_rate": 0.0009999131820520877, "loss": 2.4184, "step": 99 }, { "epoch": 0.0008, "grad_norm": 2.28125, "learning_rate": 0.0009999108198370248, "loss": 2.6635, "step": 100 }, { "epoch": 0.000808, "grad_norm": 2.09375, "learning_rate": 0.0009999084259181728, "loss": 2.153, "step": 101 }, { "epoch": 0.000816, "grad_norm": 2.234375, "learning_rate": 0.0009999060002956835, "loss": 2.164, "step": 102 }, { "epoch": 0.000824, "grad_norm": 2.203125, "learning_rate": 0.0009999035429697105, "loss": 2.0433, "step": 103 }, { "epoch": 0.000832, "grad_norm": 1.90625, "learning_rate": 0.0009999010539404102, "loss": 2.3213, "step": 104 }, { "epoch": 0.00084, "grad_norm": 2.171875, "learning_rate": 0.0009998985332079398, "loss": 2.3252, "step": 105 }, { "epoch": 0.000848, "grad_norm": 2.359375, "learning_rate": 0.0009998959807724596, "loss": 2.6247, "step": 106 }, { "epoch": 0.000856, "grad_norm": 2.359375, "learning_rate": 0.0009998933966341313, "loss": 2.4031, "step": 107 }, { "epoch": 0.000864, "grad_norm": 2.46875, "learning_rate": 0.0009998907807931186, "loss": 2.4303, "step": 108 }, { "epoch": 0.000872, "grad_norm": 2.359375, "learning_rate": 0.0009998881332495878, "loss": 2.3953, "step": 109 }, { "epoch": 0.00088, "grad_norm": 1.828125, "learning_rate": 0.0009998854540037066, "loss": 2.0928, "step": 110 }, { "epoch": 0.000888, "grad_norm": 2.015625, "learning_rate": 0.000999882743055645, "loss": 2.4892, "step": 111 }, { "epoch": 0.000896, "grad_norm": 1.9296875, "learning_rate": 0.0009998800004055745, "loss": 2.3083, "step": 112 }, { "epoch": 0.000904, "grad_norm": 2.203125, "learning_rate": 0.0009998772260536696, "loss": 2.4337, "step": 113 }, { "epoch": 0.000912, "grad_norm": 2.203125, "learning_rate": 0.0009998744200001063, "loss": 2.0558, "step": 114 }, { "epoch": 0.00092, "grad_norm": 2.03125, "learning_rate": 0.000999871582245062, "loss": 2.0343, "step": 115 }, { "epoch": 0.000928, "grad_norm": 2.25, "learning_rate": 0.0009998687127887168, "loss": 2.2093, "step": 116 }, { "epoch": 0.000936, "grad_norm": 1.84375, "learning_rate": 0.000999865811631253, "loss": 2.3889, "step": 117 }, { "epoch": 0.000944, "grad_norm": 2.296875, "learning_rate": 0.0009998628787728546, "loss": 2.6206, "step": 118 }, { "epoch": 0.000952, "grad_norm": 2.171875, "learning_rate": 0.0009998599142137072, "loss": 2.3169, "step": 119 }, { "epoch": 0.00096, "grad_norm": 2.15625, "learning_rate": 0.0009998569179539992, "loss": 2.3796, "step": 120 }, { "epoch": 0.000968, "grad_norm": 2.546875, "learning_rate": 0.0009998538899939203, "loss": 2.3898, "step": 121 }, { "epoch": 0.000976, "grad_norm": 2.03125, "learning_rate": 0.0009998508303336625, "loss": 1.8769, "step": 122 }, { "epoch": 0.000984, "grad_norm": 2.125, "learning_rate": 0.0009998477389734203, "loss": 2.646, "step": 123 }, { "epoch": 0.000992, "grad_norm": 2.453125, "learning_rate": 0.0009998446159133897, "loss": 2.5595, "step": 124 }, { "epoch": 0.001, "grad_norm": 2.375, "learning_rate": 0.000999841461153768, "loss": 2.4175, "step": 125 }, { "epoch": 0.001008, "grad_norm": 2.0625, "learning_rate": 0.0009998382746947562, "loss": 2.0392, "step": 126 }, { "epoch": 
0.001016, "grad_norm": 2.125, "learning_rate": 0.0009998350565365557, "loss": 2.2891, "step": 127 }, { "epoch": 0.001024, "grad_norm": 2.234375, "learning_rate": 0.000999831806679371, "loss": 2.5708, "step": 128 }, { "epoch": 0.001032, "grad_norm": 2.078125, "learning_rate": 0.000999828525123408, "loss": 2.0987, "step": 129 }, { "epoch": 0.00104, "grad_norm": 2.296875, "learning_rate": 0.0009998252118688749, "loss": 2.3873, "step": 130 }, { "epoch": 0.001048, "grad_norm": 2.328125, "learning_rate": 0.0009998218669159818, "loss": 2.6298, "step": 131 }, { "epoch": 0.001056, "grad_norm": 2.171875, "learning_rate": 0.000999818490264941, "loss": 2.2247, "step": 132 }, { "epoch": 0.001064, "grad_norm": 1.984375, "learning_rate": 0.0009998150819159664, "loss": 2.1807, "step": 133 }, { "epoch": 0.001072, "grad_norm": 1.9453125, "learning_rate": 0.0009998116418692743, "loss": 2.1879, "step": 134 }, { "epoch": 0.00108, "grad_norm": 2.140625, "learning_rate": 0.0009998081701250827, "loss": 2.2223, "step": 135 }, { "epoch": 0.001088, "grad_norm": 1.9375, "learning_rate": 0.0009998046666836117, "loss": 2.8348, "step": 136 }, { "epoch": 0.001096, "grad_norm": 2.015625, "learning_rate": 0.000999801131545084, "loss": 2.3764, "step": 137 }, { "epoch": 0.001104, "grad_norm": 2.46875, "learning_rate": 0.0009997975647097232, "loss": 2.6021, "step": 138 }, { "epoch": 0.001112, "grad_norm": 2.484375, "learning_rate": 0.000999793966177756, "loss": 2.5467, "step": 139 }, { "epoch": 0.00112, "grad_norm": 2.03125, "learning_rate": 0.0009997903359494102, "loss": 2.397, "step": 140 }, { "epoch": 0.001128, "grad_norm": 3.390625, "learning_rate": 0.0009997866740249165, "loss": 1.8168, "step": 141 }, { "epoch": 0.001136, "grad_norm": 2.0, "learning_rate": 0.0009997829804045063, "loss": 2.2066, "step": 142 }, { "epoch": 0.001144, "grad_norm": 2.03125, "learning_rate": 0.000999779255088415, "loss": 2.0214, "step": 143 }, { "epoch": 0.001152, "grad_norm": 2.4375, "learning_rate": 0.0009997754980768778, "loss": 1.8775, "step": 144 }, { "epoch": 0.00116, "grad_norm": 2.46875, "learning_rate": 0.0009997717093701336, "loss": 2.1563, "step": 145 }, { "epoch": 0.001168, "grad_norm": 2.140625, "learning_rate": 0.0009997678889684222, "loss": 2.2833, "step": 146 }, { "epoch": 0.001176, "grad_norm": 2.203125, "learning_rate": 0.0009997640368719866, "loss": 2.5408, "step": 147 }, { "epoch": 0.001184, "grad_norm": 2.03125, "learning_rate": 0.0009997601530810705, "loss": 2.1147, "step": 148 }, { "epoch": 0.001192, "grad_norm": 2.5625, "learning_rate": 0.0009997562375959202, "loss": 2.1716, "step": 149 }, { "epoch": 0.0012, "grad_norm": 2.109375, "learning_rate": 0.0009997522904167844, "loss": 2.5352, "step": 150 }, { "epoch": 0.001208, "grad_norm": 2.46875, "learning_rate": 0.000999748311543913, "loss": 2.3243, "step": 151 }, { "epoch": 0.001216, "grad_norm": 3.28125, "learning_rate": 0.0009997443009775587, "loss": 2.997, "step": 152 }, { "epoch": 0.001224, "grad_norm": 2.4375, "learning_rate": 0.0009997402587179755, "loss": 2.0119, "step": 153 }, { "epoch": 0.001232, "grad_norm": 2.890625, "learning_rate": 0.0009997361847654202, "loss": 2.3479, "step": 154 }, { "epoch": 0.00124, "grad_norm": 2.171875, "learning_rate": 0.0009997320791201507, "loss": 1.9742, "step": 155 }, { "epoch": 0.001248, "grad_norm": 2.0625, "learning_rate": 0.0009997279417824278, "loss": 2.156, "step": 156 }, { "epoch": 0.001256, "grad_norm": 1.8671875, "learning_rate": 0.0009997237727525136, "loss": 2.1854, "step": 157 }, { "epoch": 0.001264, "grad_norm": 
2.1875, "learning_rate": 0.0009997195720306724, "loss": 1.9683, "step": 158 }, { "epoch": 0.001272, "grad_norm": 2.40625, "learning_rate": 0.000999715339617171, "loss": 2.6756, "step": 159 }, { "epoch": 0.00128, "grad_norm": 2.265625, "learning_rate": 0.0009997110755122773, "loss": 1.9821, "step": 160 }, { "epoch": 0.001288, "grad_norm": 2.703125, "learning_rate": 0.0009997067797162624, "loss": 2.911, "step": 161 }, { "epoch": 0.001296, "grad_norm": 1.9921875, "learning_rate": 0.0009997024522293981, "loss": 2.2004, "step": 162 }, { "epoch": 0.001304, "grad_norm": 2.3125, "learning_rate": 0.000999698093051959, "loss": 2.353, "step": 163 }, { "epoch": 0.001312, "grad_norm": 2.15625, "learning_rate": 0.0009996937021842219, "loss": 1.9116, "step": 164 }, { "epoch": 0.00132, "grad_norm": 3.375, "learning_rate": 0.0009996892796264648, "loss": 2.4662, "step": 165 }, { "epoch": 0.001328, "grad_norm": 2.328125, "learning_rate": 0.0009996848253789685, "loss": 2.2417, "step": 166 }, { "epoch": 0.001336, "grad_norm": 2.078125, "learning_rate": 0.0009996803394420153, "loss": 2.1415, "step": 167 }, { "epoch": 0.001344, "grad_norm": 2.296875, "learning_rate": 0.0009996758218158899, "loss": 2.3052, "step": 168 }, { "epoch": 0.001352, "grad_norm": 2.109375, "learning_rate": 0.0009996712725008786, "loss": 2.1729, "step": 169 }, { "epoch": 0.00136, "grad_norm": 2.640625, "learning_rate": 0.0009996666914972698, "loss": 2.0963, "step": 170 }, { "epoch": 0.001368, "grad_norm": 2.515625, "learning_rate": 0.0009996620788053543, "loss": 2.5435, "step": 171 }, { "epoch": 0.001376, "grad_norm": 2.78125, "learning_rate": 0.0009996574344254246, "loss": 2.5244, "step": 172 }, { "epoch": 0.001384, "grad_norm": 2.125, "learning_rate": 0.0009996527583577752, "loss": 1.8031, "step": 173 }, { "epoch": 0.001392, "grad_norm": 2.078125, "learning_rate": 0.0009996480506027025, "loss": 2.3148, "step": 174 }, { "epoch": 0.0014, "grad_norm": 2.03125, "learning_rate": 0.0009996433111605053, "loss": 2.4068, "step": 175 }, { "epoch": 0.001408, "grad_norm": 2.21875, "learning_rate": 0.000999638540031484, "loss": 2.5376, "step": 176 }, { "epoch": 0.001416, "grad_norm": 2.078125, "learning_rate": 0.0009996337372159413, "loss": 1.9859, "step": 177 }, { "epoch": 0.001424, "grad_norm": 2.171875, "learning_rate": 0.0009996289027141814, "loss": 2.5014, "step": 178 }, { "epoch": 0.001432, "grad_norm": 1.75, "learning_rate": 0.0009996240365265114, "loss": 2.3672, "step": 179 }, { "epoch": 0.00144, "grad_norm": 2.890625, "learning_rate": 0.00099961913865324, "loss": 2.8156, "step": 180 }, { "epoch": 0.001448, "grad_norm": 2.359375, "learning_rate": 0.0009996142090946771, "loss": 2.1967, "step": 181 }, { "epoch": 0.001456, "grad_norm": 1.734375, "learning_rate": 0.0009996092478511361, "loss": 1.7635, "step": 182 }, { "epoch": 0.001464, "grad_norm": 2.546875, "learning_rate": 0.0009996042549229311, "loss": 2.5308, "step": 183 }, { "epoch": 0.001472, "grad_norm": 2.53125, "learning_rate": 0.0009995992303103792, "loss": 2.8292, "step": 184 }, { "epoch": 0.00148, "grad_norm": 2.328125, "learning_rate": 0.0009995941740137985, "loss": 2.4503, "step": 185 }, { "epoch": 0.001488, "grad_norm": 2.46875, "learning_rate": 0.0009995890860335103, "loss": 2.2086, "step": 186 }, { "epoch": 0.001496, "grad_norm": 2.625, "learning_rate": 0.0009995839663698367, "loss": 2.4392, "step": 187 }, { "epoch": 0.001504, "grad_norm": 1.796875, "learning_rate": 0.000999578815023103, "loss": 1.655, "step": 188 }, { "epoch": 0.001512, "grad_norm": 2.03125, "learning_rate": 
0.000999573631993635, "loss": 2.6982, "step": 189 }, { "epoch": 0.00152, "grad_norm": 1.8984375, "learning_rate": 0.0009995684172817623, "loss": 1.8588, "step": 190 }, { "epoch": 0.001528, "grad_norm": 3.203125, "learning_rate": 0.0009995631708878152, "loss": 2.2669, "step": 191 }, { "epoch": 0.001536, "grad_norm": 2.015625, "learning_rate": 0.0009995578928121262, "loss": 2.0772, "step": 192 }, { "epoch": 0.001544, "grad_norm": 2.203125, "learning_rate": 0.0009995525830550306, "loss": 2.3521, "step": 193 }, { "epoch": 0.001552, "grad_norm": 2.265625, "learning_rate": 0.0009995472416168647, "loss": 2.3464, "step": 194 }, { "epoch": 0.00156, "grad_norm": 2.078125, "learning_rate": 0.0009995418684979674, "loss": 2.2214, "step": 195 }, { "epoch": 0.001568, "grad_norm": 2.828125, "learning_rate": 0.0009995364636986794, "loss": 2.4397, "step": 196 }, { "epoch": 0.001576, "grad_norm": 2.671875, "learning_rate": 0.0009995310272193434, "loss": 2.3908, "step": 197 }, { "epoch": 0.001584, "grad_norm": 2.484375, "learning_rate": 0.0009995255590603046, "loss": 2.3377, "step": 198 }, { "epoch": 0.001592, "grad_norm": 2.84375, "learning_rate": 0.0009995200592219091, "loss": 2.3603, "step": 199 }, { "epoch": 0.0016, "grad_norm": 2.953125, "learning_rate": 0.0009995145277045061, "loss": 2.6503, "step": 200 }, { "epoch": 0.001608, "grad_norm": 2.515625, "learning_rate": 0.0009995089645084465, "loss": 2.1431, "step": 201 }, { "epoch": 0.001616, "grad_norm": 1.9296875, "learning_rate": 0.000999503369634083, "loss": 1.8555, "step": 202 }, { "epoch": 0.001624, "grad_norm": 2.984375, "learning_rate": 0.0009994977430817702, "loss": 2.3319, "step": 203 }, { "epoch": 0.001632, "grad_norm": 2.25, "learning_rate": 0.0009994920848518652, "loss": 2.4978, "step": 204 }, { "epoch": 0.00164, "grad_norm": 2.25, "learning_rate": 0.0009994863949447268, "loss": 2.4384, "step": 205 }, { "epoch": 0.001648, "grad_norm": 1.8828125, "learning_rate": 0.0009994806733607158, "loss": 2.4791, "step": 206 }, { "epoch": 0.001656, "grad_norm": 2.203125, "learning_rate": 0.0009994749201001952, "loss": 2.2863, "step": 207 }, { "epoch": 0.001664, "grad_norm": 2.125, "learning_rate": 0.0009994691351635295, "loss": 2.4172, "step": 208 }, { "epoch": 0.001672, "grad_norm": 1.78125, "learning_rate": 0.0009994633185510858, "loss": 2.5061, "step": 209 }, { "epoch": 0.00168, "grad_norm": 2.203125, "learning_rate": 0.0009994574702632331, "loss": 2.8813, "step": 210 }, { "epoch": 0.001688, "grad_norm": 2.28125, "learning_rate": 0.000999451590300342, "loss": 2.2217, "step": 211 }, { "epoch": 0.001696, "grad_norm": 1.8828125, "learning_rate": 0.0009994456786627858, "loss": 2.3392, "step": 212 }, { "epoch": 0.001704, "grad_norm": 2.328125, "learning_rate": 0.000999439735350939, "loss": 2.237, "step": 213 }, { "epoch": 0.001712, "grad_norm": 2.921875, "learning_rate": 0.000999433760365179, "loss": 2.4355, "step": 214 }, { "epoch": 0.00172, "grad_norm": 2.65625, "learning_rate": 0.0009994277537058842, "loss": 1.9892, "step": 215 }, { "epoch": 0.001728, "grad_norm": 2.140625, "learning_rate": 0.0009994217153734357, "loss": 2.0212, "step": 216 }, { "epoch": 0.001736, "grad_norm": 2.09375, "learning_rate": 0.0009994156453682166, "loss": 2.2348, "step": 217 }, { "epoch": 0.001744, "grad_norm": 2.671875, "learning_rate": 0.0009994095436906118, "loss": 2.3287, "step": 218 }, { "epoch": 0.001752, "grad_norm": 2.90625, "learning_rate": 0.0009994034103410082, "loss": 2.3509, "step": 219 }, { "epoch": 0.00176, "grad_norm": 2.484375, "learning_rate": 
0.0009993972453197948, "loss": 2.4201, "step": 220 }, { "epoch": 0.001768, "grad_norm": 2.203125, "learning_rate": 0.0009993910486273624, "loss": 2.3065, "step": 221 }, { "epoch": 0.001776, "grad_norm": 2.5625, "learning_rate": 0.0009993848202641045, "loss": 2.0683, "step": 222 }, { "epoch": 0.001784, "grad_norm": 2.140625, "learning_rate": 0.0009993785602304154, "loss": 2.229, "step": 223 }, { "epoch": 0.001792, "grad_norm": 2.21875, "learning_rate": 0.0009993722685266927, "loss": 2.2344, "step": 224 }, { "epoch": 0.0018, "grad_norm": 2.046875, "learning_rate": 0.0009993659451533353, "loss": 2.3085, "step": 225 }, { "epoch": 0.001808, "grad_norm": 2.484375, "learning_rate": 0.0009993595901107437, "loss": 2.4873, "step": 226 }, { "epoch": 0.001816, "grad_norm": 2.390625, "learning_rate": 0.0009993532033993216, "loss": 2.0693, "step": 227 }, { "epoch": 0.001824, "grad_norm": 2.703125, "learning_rate": 0.0009993467850194738, "loss": 2.2549, "step": 228 }, { "epoch": 0.001832, "grad_norm": 2.140625, "learning_rate": 0.0009993403349716071, "loss": 2.0618, "step": 229 }, { "epoch": 0.00184, "grad_norm": 2.21875, "learning_rate": 0.000999333853256131, "loss": 1.9574, "step": 230 }, { "epoch": 0.001848, "grad_norm": 2.40625, "learning_rate": 0.0009993273398734561, "loss": 2.568, "step": 231 }, { "epoch": 0.001856, "grad_norm": 2.328125, "learning_rate": 0.0009993207948239958, "loss": 2.3608, "step": 232 }, { "epoch": 0.001864, "grad_norm": 2.625, "learning_rate": 0.000999314218108165, "loss": 2.5201, "step": 233 }, { "epoch": 0.001872, "grad_norm": 2.3125, "learning_rate": 0.0009993076097263812, "loss": 2.1417, "step": 234 }, { "epoch": 0.00188, "grad_norm": 2.34375, "learning_rate": 0.0009993009696790628, "loss": 2.3575, "step": 235 }, { "epoch": 0.001888, "grad_norm": 2.03125, "learning_rate": 0.0009992942979666314, "loss": 1.9917, "step": 236 }, { "epoch": 0.001896, "grad_norm": 2.265625, "learning_rate": 0.00099928759458951, "loss": 2.4808, "step": 237 }, { "epoch": 0.001904, "grad_norm": 2.09375, "learning_rate": 0.0009992808595481236, "loss": 2.2889, "step": 238 }, { "epoch": 0.001912, "grad_norm": 2.140625, "learning_rate": 0.0009992740928428997, "loss": 2.1614, "step": 239 }, { "epoch": 0.00192, "grad_norm": 11.75, "learning_rate": 0.0009992672944742669, "loss": 2.3334, "step": 240 }, { "epoch": 0.001928, "grad_norm": 3.890625, "learning_rate": 0.0009992604644426567, "loss": 2.3416, "step": 241 }, { "epoch": 0.001936, "grad_norm": 2.75, "learning_rate": 0.000999253602748502, "loss": 2.6133, "step": 242 }, { "epoch": 0.001944, "grad_norm": 2.5, "learning_rate": 0.0009992467093922384, "loss": 2.4849, "step": 243 }, { "epoch": 0.001952, "grad_norm": 3.03125, "learning_rate": 0.0009992397843743026, "loss": 2.4091, "step": 244 }, { "epoch": 0.00196, "grad_norm": 2.390625, "learning_rate": 0.000999232827695134, "loss": 1.8876, "step": 245 }, { "epoch": 0.001968, "grad_norm": 2.53125, "learning_rate": 0.0009992258393551738, "loss": 2.4843, "step": 246 }, { "epoch": 0.001976, "grad_norm": 2.625, "learning_rate": 0.0009992188193548653, "loss": 2.4377, "step": 247 }, { "epoch": 0.001984, "grad_norm": 2.421875, "learning_rate": 0.0009992117676946534, "loss": 1.8956, "step": 248 }, { "epoch": 0.001992, "grad_norm": 2.609375, "learning_rate": 0.0009992046843749853, "loss": 2.4234, "step": 249 }, { "epoch": 0.002, "grad_norm": 2.71875, "learning_rate": 0.0009991975693963108, "loss": 2.4962, "step": 250 }, { "epoch": 0.002008, "grad_norm": 3.03125, "learning_rate": 0.0009991904227590802, "loss": 2.6841, 
"step": 251 }, { "epoch": 0.002016, "grad_norm": 2.890625, "learning_rate": 0.0009991832444637475, "loss": 2.4064, "step": 252 }, { "epoch": 0.002024, "grad_norm": 2.359375, "learning_rate": 0.0009991760345107676, "loss": 2.5219, "step": 253 }, { "epoch": 0.002032, "grad_norm": 2.421875, "learning_rate": 0.000999168792900598, "loss": 2.5258, "step": 254 }, { "epoch": 0.00204, "grad_norm": 2.28125, "learning_rate": 0.0009991615196336975, "loss": 2.8346, "step": 255 }, { "epoch": 0.002048, "grad_norm": 2.671875, "learning_rate": 0.0009991542147105277, "loss": 2.685, "step": 256 }, { "epoch": 0.002056, "grad_norm": 2.40625, "learning_rate": 0.000999146878131552, "loss": 2.5313, "step": 257 }, { "epoch": 0.002064, "grad_norm": 2.484375, "learning_rate": 0.0009991395098972353, "loss": 2.9855, "step": 258 }, { "epoch": 0.002072, "grad_norm": 2.75, "learning_rate": 0.0009991321100080452, "loss": 2.438, "step": 259 }, { "epoch": 0.00208, "grad_norm": 2.34375, "learning_rate": 0.0009991246784644507, "loss": 2.2729, "step": 260 }, { "epoch": 0.002088, "grad_norm": 2.640625, "learning_rate": 0.0009991172152669235, "loss": 2.4812, "step": 261 }, { "epoch": 0.002096, "grad_norm": 2.515625, "learning_rate": 0.0009991097204159365, "loss": 2.5951, "step": 262 }, { "epoch": 0.002104, "grad_norm": 2.359375, "learning_rate": 0.000999102193911965, "loss": 2.3268, "step": 263 }, { "epoch": 0.002112, "grad_norm": 2.703125, "learning_rate": 0.000999094635755487, "loss": 2.5413, "step": 264 }, { "epoch": 0.00212, "grad_norm": 2.40625, "learning_rate": 0.0009990870459469809, "loss": 2.094, "step": 265 }, { "epoch": 0.002128, "grad_norm": 2.265625, "learning_rate": 0.0009990794244869286, "loss": 2.3004, "step": 266 }, { "epoch": 0.002136, "grad_norm": 2.34375, "learning_rate": 0.0009990717713758133, "loss": 2.2431, "step": 267 }, { "epoch": 0.002144, "grad_norm": 2.203125, "learning_rate": 0.0009990640866141204, "loss": 2.0509, "step": 268 }, { "epoch": 0.002152, "grad_norm": 2.453125, "learning_rate": 0.0009990563702023372, "loss": 2.2746, "step": 269 }, { "epoch": 0.00216, "grad_norm": 2.84375, "learning_rate": 0.0009990486221409531, "loss": 2.6621, "step": 270 }, { "epoch": 0.002168, "grad_norm": 2.5, "learning_rate": 0.0009990408424304595, "loss": 1.9638, "step": 271 }, { "epoch": 0.002176, "grad_norm": 2.203125, "learning_rate": 0.00099903303107135, "loss": 2.3514, "step": 272 }, { "epoch": 0.002184, "grad_norm": 36.25, "learning_rate": 0.0009990251880641194, "loss": 1.9819, "step": 273 }, { "epoch": 0.002192, "grad_norm": 2.921875, "learning_rate": 0.0009990173134092656, "loss": 2.5405, "step": 274 }, { "epoch": 0.0022, "grad_norm": 2.078125, "learning_rate": 0.0009990094071072877, "loss": 2.0224, "step": 275 }, { "epoch": 0.002208, "grad_norm": 2.09375, "learning_rate": 0.0009990014691586874, "loss": 2.2874, "step": 276 }, { "epoch": 0.002216, "grad_norm": 1.9609375, "learning_rate": 0.0009989934995639678, "loss": 1.9016, "step": 277 }, { "epoch": 0.002224, "grad_norm": 2.609375, "learning_rate": 0.0009989854983236345, "loss": 2.2984, "step": 278 }, { "epoch": 0.002232, "grad_norm": 2.5625, "learning_rate": 0.000998977465438195, "loss": 2.505, "step": 279 }, { "epoch": 0.00224, "grad_norm": 3.25, "learning_rate": 0.0009989694009081584, "loss": 2.3128, "step": 280 }, { "epoch": 0.002248, "grad_norm": 2.921875, "learning_rate": 0.0009989613047340366, "loss": 2.26, "step": 281 }, { "epoch": 0.002256, "grad_norm": 1.7265625, "learning_rate": 0.0009989531769163428, "loss": 2.1726, "step": 282 }, { "epoch": 
0.002264, "grad_norm": 2.328125, "learning_rate": 0.0009989450174555927, "loss": 2.462, "step": 283 }, { "epoch": 0.002272, "grad_norm": 2.03125, "learning_rate": 0.0009989368263523034, "loss": 1.7459, "step": 284 }, { "epoch": 0.00228, "grad_norm": 2.484375, "learning_rate": 0.0009989286036069945, "loss": 2.8299, "step": 285 }, { "epoch": 0.002288, "grad_norm": 2.484375, "learning_rate": 0.0009989203492201876, "loss": 2.2145, "step": 286 }, { "epoch": 0.002296, "grad_norm": 3.28125, "learning_rate": 0.000998912063192406, "loss": 2.3917, "step": 287 }, { "epoch": 0.002304, "grad_norm": 2.234375, "learning_rate": 0.0009989037455241754, "loss": 2.0512, "step": 288 }, { "epoch": 0.002312, "grad_norm": 2.296875, "learning_rate": 0.0009988953962160232, "loss": 2.2264, "step": 289 }, { "epoch": 0.00232, "grad_norm": 2.875, "learning_rate": 0.0009988870152684787, "loss": 2.1718, "step": 290 }, { "epoch": 0.002328, "grad_norm": 2.59375, "learning_rate": 0.0009988786026820737, "loss": 2.2672, "step": 291 }, { "epoch": 0.002336, "grad_norm": 2.390625, "learning_rate": 0.0009988701584573417, "loss": 2.0606, "step": 292 }, { "epoch": 0.002344, "grad_norm": 4.96875, "learning_rate": 0.0009988616825948179, "loss": 2.4, "step": 293 }, { "epoch": 0.002352, "grad_norm": 2.625, "learning_rate": 0.0009988531750950403, "loss": 2.4624, "step": 294 }, { "epoch": 0.00236, "grad_norm": 1.90625, "learning_rate": 0.0009988446359585482, "loss": 2.0189, "step": 295 }, { "epoch": 0.002368, "grad_norm": 2.046875, "learning_rate": 0.000998836065185883, "loss": 2.3329, "step": 296 }, { "epoch": 0.002376, "grad_norm": 3.0, "learning_rate": 0.0009988274627775885, "loss": 2.4574, "step": 297 }, { "epoch": 0.002384, "grad_norm": 2.15625, "learning_rate": 0.00099881882873421, "loss": 2.3093, "step": 298 }, { "epoch": 0.002392, "grad_norm": 2.265625, "learning_rate": 0.0009988101630562954, "loss": 2.2495, "step": 299 }, { "epoch": 0.0024, "grad_norm": 2.34375, "learning_rate": 0.000998801465744394, "loss": 2.0472, "step": 300 }, { "epoch": 0.002408, "grad_norm": 2.234375, "learning_rate": 0.0009987927367990574, "loss": 2.8981, "step": 301 }, { "epoch": 0.002416, "grad_norm": 3.046875, "learning_rate": 0.0009987839762208392, "loss": 2.6346, "step": 302 }, { "epoch": 0.002424, "grad_norm": 2.59375, "learning_rate": 0.0009987751840102952, "loss": 2.5962, "step": 303 }, { "epoch": 0.002432, "grad_norm": 2.40625, "learning_rate": 0.0009987663601679828, "loss": 2.0378, "step": 304 }, { "epoch": 0.00244, "grad_norm": 2.59375, "learning_rate": 0.0009987575046944614, "loss": 2.4527, "step": 305 }, { "epoch": 0.002448, "grad_norm": 3.375, "learning_rate": 0.000998748617590293, "loss": 2.3003, "step": 306 }, { "epoch": 0.002456, "grad_norm": 2.234375, "learning_rate": 0.000998739698856041, "loss": 2.4695, "step": 307 }, { "epoch": 0.002464, "grad_norm": 2.859375, "learning_rate": 0.000998730748492271, "loss": 2.6476, "step": 308 }, { "epoch": 0.002472, "grad_norm": 2.328125, "learning_rate": 0.0009987217664995506, "loss": 2.0852, "step": 309 }, { "epoch": 0.00248, "grad_norm": 2.40625, "learning_rate": 0.0009987127528784496, "loss": 2.4339, "step": 310 }, { "epoch": 0.002488, "grad_norm": 2.484375, "learning_rate": 0.0009987037076295395, "loss": 2.1669, "step": 311 }, { "epoch": 0.002496, "grad_norm": 1.828125, "learning_rate": 0.000998694630753394, "loss": 2.2129, "step": 312 }, { "epoch": 0.002504, "grad_norm": 2.4375, "learning_rate": 0.0009986855222505887, "loss": 2.6125, "step": 313 }, { "epoch": 0.002512, "grad_norm": 2.421875, 
"learning_rate": 0.0009986763821217012, "loss": 2.0916, "step": 314 }, { "epoch": 0.00252, "grad_norm": 1.9921875, "learning_rate": 0.0009986672103673112, "loss": 1.9549, "step": 315 }, { "epoch": 0.002528, "grad_norm": 3.234375, "learning_rate": 0.0009986580069880004, "loss": 2.5368, "step": 316 }, { "epoch": 0.002536, "grad_norm": 2.84375, "learning_rate": 0.0009986487719843525, "loss": 2.4054, "step": 317 }, { "epoch": 0.002544, "grad_norm": 2.453125, "learning_rate": 0.000998639505356953, "loss": 2.1829, "step": 318 }, { "epoch": 0.002552, "grad_norm": 2.578125, "learning_rate": 0.00099863020710639, "loss": 2.448, "step": 319 }, { "epoch": 0.00256, "grad_norm": 2.390625, "learning_rate": 0.0009986208772332526, "loss": 2.6432, "step": 320 }, { "epoch": 0.002568, "grad_norm": 2.5, "learning_rate": 0.000998611515738133, "loss": 2.2065, "step": 321 }, { "epoch": 0.002576, "grad_norm": 2.4375, "learning_rate": 0.0009986021226216244, "loss": 2.2516, "step": 322 }, { "epoch": 0.002584, "grad_norm": 2.3125, "learning_rate": 0.000998592697884323, "loss": 2.4531, "step": 323 }, { "epoch": 0.002592, "grad_norm": 2.421875, "learning_rate": 0.000998583241526826, "loss": 2.4651, "step": 324 }, { "epoch": 0.0026, "grad_norm": 2.578125, "learning_rate": 0.0009985737535497337, "loss": 1.9364, "step": 325 }, { "epoch": 0.002608, "grad_norm": 2.9375, "learning_rate": 0.0009985642339536474, "loss": 2.1175, "step": 326 }, { "epoch": 0.002616, "grad_norm": 2.78125, "learning_rate": 0.000998554682739171, "loss": 3.052, "step": 327 }, { "epoch": 0.002624, "grad_norm": 2.640625, "learning_rate": 0.00099854509990691, "loss": 2.5822, "step": 328 }, { "epoch": 0.002632, "grad_norm": 2.125, "learning_rate": 0.0009985354854574724, "loss": 2.2052, "step": 329 }, { "epoch": 0.00264, "grad_norm": 2.296875, "learning_rate": 0.000998525839391468, "loss": 2.0189, "step": 330 }, { "epoch": 0.002648, "grad_norm": 2.265625, "learning_rate": 0.000998516161709508, "loss": 2.2192, "step": 331 }, { "epoch": 0.002656, "grad_norm": 2.234375, "learning_rate": 0.0009985064524122066, "loss": 2.536, "step": 332 }, { "epoch": 0.002664, "grad_norm": 2.421875, "learning_rate": 0.0009984967115001795, "loss": 2.2632, "step": 333 }, { "epoch": 0.002672, "grad_norm": 2.34375, "learning_rate": 0.0009984869389740446, "loss": 2.5079, "step": 334 }, { "epoch": 0.00268, "grad_norm": 2.203125, "learning_rate": 0.0009984771348344212, "loss": 1.8112, "step": 335 }, { "epoch": 0.002688, "grad_norm": 2.578125, "learning_rate": 0.0009984672990819316, "loss": 2.3602, "step": 336 }, { "epoch": 0.002696, "grad_norm": 2.296875, "learning_rate": 0.0009984574317171992, "loss": 2.281, "step": 337 }, { "epoch": 0.002704, "grad_norm": 2.265625, "learning_rate": 0.00099844753274085, "loss": 2.2476, "step": 338 }, { "epoch": 0.002712, "grad_norm": 2.078125, "learning_rate": 0.0009984376021535118, "loss": 2.4334, "step": 339 }, { "epoch": 0.00272, "grad_norm": 2.453125, "learning_rate": 0.0009984276399558141, "loss": 2.6812, "step": 340 }, { "epoch": 0.002728, "grad_norm": 2.109375, "learning_rate": 0.0009984176461483888, "loss": 2.2421, "step": 341 }, { "epoch": 0.002736, "grad_norm": 2.5625, "learning_rate": 0.00099840762073187, "loss": 2.5509, "step": 342 }, { "epoch": 0.002744, "grad_norm": 2.140625, "learning_rate": 0.000998397563706893, "loss": 2.5669, "step": 343 }, { "epoch": 0.002752, "grad_norm": 2.28125, "learning_rate": 0.0009983874750740963, "loss": 1.9478, "step": 344 }, { "epoch": 0.00276, "grad_norm": 2.46875, "learning_rate": 
0.0009983773548341188, "loss": 2.7325, "step": 345 }, { "epoch": 0.002768, "grad_norm": 2.390625, "learning_rate": 0.000998367202987603, "loss": 2.4125, "step": 346 }, { "epoch": 0.002776, "grad_norm": 2.421875, "learning_rate": 0.0009983570195351925, "loss": 2.2774, "step": 347 }, { "epoch": 0.002784, "grad_norm": 2.59375, "learning_rate": 0.0009983468044775332, "loss": 2.6438, "step": 348 }, { "epoch": 0.002792, "grad_norm": 2.5625, "learning_rate": 0.000998336557815273, "loss": 2.801, "step": 349 }, { "epoch": 0.0028, "grad_norm": 2.296875, "learning_rate": 0.0009983262795490613, "loss": 2.4298, "step": 350 }, { "epoch": 0.002808, "grad_norm": 2.90625, "learning_rate": 0.0009983159696795503, "loss": 2.9898, "step": 351 }, { "epoch": 0.002816, "grad_norm": 2.328125, "learning_rate": 0.000998305628207394, "loss": 2.5354, "step": 352 }, { "epoch": 0.002824, "grad_norm": 2.203125, "learning_rate": 0.0009982952551332478, "loss": 2.5802, "step": 353 }, { "epoch": 0.002832, "grad_norm": 2.0, "learning_rate": 0.0009982848504577696, "loss": 1.9932, "step": 354 }, { "epoch": 0.00284, "grad_norm": 2.1875, "learning_rate": 0.0009982744141816197, "loss": 2.4366, "step": 355 }, { "epoch": 0.002848, "grad_norm": 2.421875, "learning_rate": 0.0009982639463054597, "loss": 2.34, "step": 356 }, { "epoch": 0.002856, "grad_norm": 3.3125, "learning_rate": 0.0009982534468299534, "loss": 2.6942, "step": 357 }, { "epoch": 0.002864, "grad_norm": 2.875, "learning_rate": 0.0009982429157557667, "loss": 2.2945, "step": 358 }, { "epoch": 0.002872, "grad_norm": 3.390625, "learning_rate": 0.0009982323530835673, "loss": 2.4701, "step": 359 }, { "epoch": 0.00288, "grad_norm": 2.265625, "learning_rate": 0.0009982217588140254, "loss": 2.1866, "step": 360 }, { "epoch": 0.002888, "grad_norm": 2.3125, "learning_rate": 0.0009982111329478126, "loss": 2.2042, "step": 361 }, { "epoch": 0.002896, "grad_norm": 1.96875, "learning_rate": 0.000998200475485603, "loss": 1.9329, "step": 362 }, { "epoch": 0.002904, "grad_norm": 2.296875, "learning_rate": 0.0009981897864280726, "loss": 1.9892, "step": 363 }, { "epoch": 0.002912, "grad_norm": 2.28125, "learning_rate": 0.000998179065775899, "loss": 2.2071, "step": 364 }, { "epoch": 0.00292, "grad_norm": 2.484375, "learning_rate": 0.0009981683135297618, "loss": 2.4648, "step": 365 }, { "epoch": 0.002928, "grad_norm": 2.296875, "learning_rate": 0.0009981575296903436, "loss": 2.3775, "step": 366 }, { "epoch": 0.002936, "grad_norm": 3.546875, "learning_rate": 0.0009981467142583277, "loss": 2.7459, "step": 367 }, { "epoch": 0.002944, "grad_norm": 2.796875, "learning_rate": 0.0009981358672344004, "loss": 2.6901, "step": 368 }, { "epoch": 0.002952, "grad_norm": 1.9609375, "learning_rate": 0.0009981249886192498, "loss": 2.4016, "step": 369 }, { "epoch": 0.00296, "grad_norm": 2.203125, "learning_rate": 0.0009981140784135652, "loss": 2.4385, "step": 370 }, { "epoch": 0.002968, "grad_norm": 2.046875, "learning_rate": 0.0009981031366180387, "loss": 2.2802, "step": 371 }, { "epoch": 0.002976, "grad_norm": 2.75, "learning_rate": 0.0009980921632333644, "loss": 2.5682, "step": 372 }, { "epoch": 0.002984, "grad_norm": 2.265625, "learning_rate": 0.000998081158260238, "loss": 2.4764, "step": 373 }, { "epoch": 0.002992, "grad_norm": 2.484375, "learning_rate": 0.0009980701216993578, "loss": 2.8371, "step": 374 }, { "epoch": 0.003, "grad_norm": 2.390625, "learning_rate": 0.0009980590535514234, "loss": 2.3063, "step": 375 }, { "epoch": 0.003008, "grad_norm": 2.03125, "learning_rate": 0.0009980479538171368, "loss": 
2.2352, "step": 376 }, { "epoch": 0.003016, "grad_norm": 2.375, "learning_rate": 0.0009980368224972017, "loss": 2.6703, "step": 377 }, { "epoch": 0.003024, "grad_norm": 2.625, "learning_rate": 0.0009980256595923248, "loss": 2.422, "step": 378 }, { "epoch": 0.003032, "grad_norm": 2.640625, "learning_rate": 0.000998014465103213, "loss": 2.331, "step": 379 }, { "epoch": 0.00304, "grad_norm": 2.34375, "learning_rate": 0.0009980032390305771, "loss": 2.4668, "step": 380 }, { "epoch": 0.003048, "grad_norm": 2.28125, "learning_rate": 0.0009979919813751287, "loss": 2.5207, "step": 381 }, { "epoch": 0.003056, "grad_norm": 1.8515625, "learning_rate": 0.0009979806921375815, "loss": 2.0195, "step": 382 }, { "epoch": 0.003064, "grad_norm": 2.109375, "learning_rate": 0.0009979693713186516, "loss": 2.4653, "step": 383 }, { "epoch": 0.003072, "grad_norm": 2.84375, "learning_rate": 0.0009979580189190575, "loss": 2.4737, "step": 384 }, { "epoch": 0.00308, "grad_norm": 2.5, "learning_rate": 0.0009979466349395185, "loss": 2.107, "step": 385 }, { "epoch": 0.003088, "grad_norm": 5.84375, "learning_rate": 0.0009979352193807568, "loss": 1.9972, "step": 386 }, { "epoch": 0.003096, "grad_norm": 2.828125, "learning_rate": 0.0009979237722434963, "loss": 2.4289, "step": 387 }, { "epoch": 0.003104, "grad_norm": 2.96875, "learning_rate": 0.000997912293528463, "loss": 2.7841, "step": 388 }, { "epoch": 0.003112, "grad_norm": 2.421875, "learning_rate": 0.000997900783236385, "loss": 2.2794, "step": 389 }, { "epoch": 0.00312, "grad_norm": 2.296875, "learning_rate": 0.000997889241367992, "loss": 2.2905, "step": 390 }, { "epoch": 0.003128, "grad_norm": 2.40625, "learning_rate": 0.000997877667924016, "loss": 2.5872, "step": 391 }, { "epoch": 0.003136, "grad_norm": 2.15625, "learning_rate": 0.0009978660629051913, "loss": 2.3872, "step": 392 }, { "epoch": 0.003144, "grad_norm": 2.515625, "learning_rate": 0.0009978544263122536, "loss": 2.66, "step": 393 }, { "epoch": 0.003152, "grad_norm": 2.5, "learning_rate": 0.0009978427581459409, "loss": 2.4671, "step": 394 }, { "epoch": 0.00316, "grad_norm": 2.28125, "learning_rate": 0.0009978310584069932, "loss": 2.3653, "step": 395 }, { "epoch": 0.003168, "grad_norm": 2.1875, "learning_rate": 0.0009978193270961527, "loss": 2.2136, "step": 396 }, { "epoch": 0.003176, "grad_norm": 2.09375, "learning_rate": 0.000997807564214163, "loss": 1.9739, "step": 397 }, { "epoch": 0.003184, "grad_norm": 2.9375, "learning_rate": 0.0009977957697617703, "loss": 2.2714, "step": 398 }, { "epoch": 0.003192, "grad_norm": 2.3125, "learning_rate": 0.0009977839437397228, "loss": 2.3418, "step": 399 }, { "epoch": 0.0032, "grad_norm": 2.09375, "learning_rate": 0.00099777208614877, "loss": 2.4377, "step": 400 }, { "epoch": 0.003208, "grad_norm": 1.953125, "learning_rate": 0.0009977601969896643, "loss": 2.4357, "step": 401 }, { "epoch": 0.003216, "grad_norm": 2.875, "learning_rate": 0.0009977482762631594, "loss": 2.0659, "step": 402 }, { "epoch": 0.003224, "grad_norm": 2.53125, "learning_rate": 0.0009977363239700118, "loss": 2.1965, "step": 403 }, { "epoch": 0.003232, "grad_norm": 2.421875, "learning_rate": 0.0009977243401109788, "loss": 2.1372, "step": 404 }, { "epoch": 0.00324, "grad_norm": 2.3125, "learning_rate": 0.000997712324686821, "loss": 2.1244, "step": 405 }, { "epoch": 0.003248, "grad_norm": 2.3125, "learning_rate": 0.0009977002776983, "loss": 2.7695, "step": 406 }, { "epoch": 0.003256, "grad_norm": 2.28125, "learning_rate": 0.00099768819914618, "loss": 2.1499, "step": 407 }, { "epoch": 0.003264, 
"grad_norm": 2.53125, "learning_rate": 0.0009976760890312271, "loss": 2.2522, "step": 408 }, { "epoch": 0.003272, "grad_norm": 2.3125, "learning_rate": 0.0009976639473542092, "loss": 1.9964, "step": 409 }, { "epoch": 0.00328, "grad_norm": 2.5625, "learning_rate": 0.0009976517741158962, "loss": 2.4306, "step": 410 }, { "epoch": 0.003288, "grad_norm": 2.09375, "learning_rate": 0.0009976395693170603, "loss": 2.1884, "step": 411 }, { "epoch": 0.003296, "grad_norm": 2.09375, "learning_rate": 0.0009976273329584753, "loss": 1.679, "step": 412 }, { "epoch": 0.003304, "grad_norm": 2.1875, "learning_rate": 0.0009976150650409174, "loss": 1.7586, "step": 413 }, { "epoch": 0.003312, "grad_norm": 2.53125, "learning_rate": 0.0009976027655651646, "loss": 2.3981, "step": 414 }, { "epoch": 0.00332, "grad_norm": 2.765625, "learning_rate": 0.000997590434531997, "loss": 2.499, "step": 415 }, { "epoch": 0.003328, "grad_norm": 4.125, "learning_rate": 0.0009975780719421964, "loss": 2.2521, "step": 416 }, { "epoch": 0.003336, "grad_norm": 3.46875, "learning_rate": 0.000997565677796547, "loss": 2.3066, "step": 417 }, { "epoch": 0.003344, "grad_norm": 4.4375, "learning_rate": 0.0009975532520958346, "loss": 2.208, "step": 418 }, { "epoch": 0.003352, "grad_norm": 2.46875, "learning_rate": 0.0009975407948408475, "loss": 2.041, "step": 419 }, { "epoch": 0.00336, "grad_norm": 2.65625, "learning_rate": 0.0009975283060323756, "loss": 1.9143, "step": 420 }, { "epoch": 0.003368, "grad_norm": 2.078125, "learning_rate": 0.0009975157856712108, "loss": 2.0154, "step": 421 }, { "epoch": 0.003376, "grad_norm": 2.609375, "learning_rate": 0.0009975032337581474, "loss": 2.601, "step": 422 }, { "epoch": 0.003384, "grad_norm": 2.65625, "learning_rate": 0.0009974906502939814, "loss": 2.5768, "step": 423 }, { "epoch": 0.003392, "grad_norm": 2.6875, "learning_rate": 0.0009974780352795104, "loss": 2.4689, "step": 424 }, { "epoch": 0.0034, "grad_norm": 2.921875, "learning_rate": 0.000997465388715535, "loss": 2.2722, "step": 425 }, { "epoch": 0.003408, "grad_norm": 2.5625, "learning_rate": 0.0009974527106028567, "loss": 2.8003, "step": 426 }, { "epoch": 0.003416, "grad_norm": 2.328125, "learning_rate": 0.0009974400009422802, "loss": 2.4932, "step": 427 }, { "epoch": 0.003424, "grad_norm": 2.390625, "learning_rate": 0.0009974272597346107, "loss": 2.493, "step": 428 }, { "epoch": 0.003432, "grad_norm": 3.203125, "learning_rate": 0.000997414486980657, "loss": 2.4744, "step": 429 }, { "epoch": 0.00344, "grad_norm": 2.421875, "learning_rate": 0.0009974016826812288, "loss": 2.3724, "step": 430 }, { "epoch": 0.003448, "grad_norm": 2.671875, "learning_rate": 0.000997388846837138, "loss": 2.9139, "step": 431 }, { "epoch": 0.003456, "grad_norm": 2.546875, "learning_rate": 0.0009973759794491988, "loss": 2.2189, "step": 432 }, { "epoch": 0.003464, "grad_norm": 2.5, "learning_rate": 0.0009973630805182273, "loss": 1.8075, "step": 433 }, { "epoch": 0.003472, "grad_norm": 2.28125, "learning_rate": 0.0009973501500450414, "loss": 2.6154, "step": 434 }, { "epoch": 0.00348, "grad_norm": 2.65625, "learning_rate": 0.000997337188030461, "loss": 2.2914, "step": 435 }, { "epoch": 0.003488, "grad_norm": 2.40625, "learning_rate": 0.0009973241944753086, "loss": 2.3142, "step": 436 }, { "epoch": 0.003496, "grad_norm": 3.03125, "learning_rate": 0.000997311169380408, "loss": 2.4818, "step": 437 }, { "epoch": 0.003504, "grad_norm": 2.875, "learning_rate": 0.0009972981127465848, "loss": 2.1067, "step": 438 }, { "epoch": 0.003512, "grad_norm": 2.671875, "learning_rate": 
0.000997285024574668, "loss": 2.1949, "step": 439 }, { "epoch": 0.00352, "grad_norm": 2.484375, "learning_rate": 0.0009972719048654867, "loss": 2.3398, "step": 440 }, { "epoch": 0.003528, "grad_norm": 2.265625, "learning_rate": 0.000997258753619873, "loss": 2.0794, "step": 441 }, { "epoch": 0.003536, "grad_norm": 4.84375, "learning_rate": 0.0009972455708386618, "loss": 2.4958, "step": 442 }, { "epoch": 0.003544, "grad_norm": 1.9765625, "learning_rate": 0.0009972323565226883, "loss": 1.6019, "step": 443 }, { "epoch": 0.003552, "grad_norm": 2.921875, "learning_rate": 0.000997219110672791, "loss": 2.1453, "step": 444 }, { "epoch": 0.00356, "grad_norm": 2.71875, "learning_rate": 0.0009972058332898095, "loss": 2.4853, "step": 445 }, { "epoch": 0.003568, "grad_norm": 2.5, "learning_rate": 0.0009971925243745863, "loss": 2.0103, "step": 446 }, { "epoch": 0.003576, "grad_norm": 3.453125, "learning_rate": 0.000997179183927965, "loss": 2.0358, "step": 447 }, { "epoch": 0.003584, "grad_norm": 3.6875, "learning_rate": 0.0009971658119507924, "loss": 2.811, "step": 448 }, { "epoch": 0.003592, "grad_norm": 2.796875, "learning_rate": 0.0009971524084439155, "loss": 2.0178, "step": 449 }, { "epoch": 0.0036, "grad_norm": 2.59375, "learning_rate": 0.0009971389734081848, "loss": 2.2803, "step": 450 }, { "epoch": 0.003608, "grad_norm": 2.734375, "learning_rate": 0.0009971255068444527, "loss": 1.9865, "step": 451 }, { "epoch": 0.003616, "grad_norm": 2.5, "learning_rate": 0.0009971120087535726, "loss": 2.4114, "step": 452 }, { "epoch": 0.003624, "grad_norm": 3.921875, "learning_rate": 0.000997098479136401, "loss": 2.1527, "step": 453 }, { "epoch": 0.003632, "grad_norm": 2.5, "learning_rate": 0.0009970849179937956, "loss": 1.8789, "step": 454 }, { "epoch": 0.00364, "grad_norm": 2.28125, "learning_rate": 0.000997071325326617, "loss": 2.3195, "step": 455 }, { "epoch": 0.003648, "grad_norm": 2.484375, "learning_rate": 0.0009970577011357265, "loss": 2.0212, "step": 456 }, { "epoch": 0.003656, "grad_norm": 3.109375, "learning_rate": 0.0009970440454219888, "loss": 2.1487, "step": 457 }, { "epoch": 0.003664, "grad_norm": 2.625, "learning_rate": 0.0009970303581862692, "loss": 2.4279, "step": 458 }, { "epoch": 0.003672, "grad_norm": 2.328125, "learning_rate": 0.0009970166394294364, "loss": 1.5254, "step": 459 }, { "epoch": 0.00368, "grad_norm": 2.53125, "learning_rate": 0.00099700288915236, "loss": 2.3411, "step": 460 }, { "epoch": 0.003688, "grad_norm": 2.90625, "learning_rate": 0.0009969891073559123, "loss": 2.4888, "step": 461 }, { "epoch": 0.003696, "grad_norm": 2.453125, "learning_rate": 0.0009969752940409671, "loss": 2.3925, "step": 462 }, { "epoch": 0.003704, "grad_norm": 2.75, "learning_rate": 0.0009969614492084005, "loss": 2.0726, "step": 463 }, { "epoch": 0.003712, "grad_norm": 2.96875, "learning_rate": 0.0009969475728590907, "loss": 3.1414, "step": 464 }, { "epoch": 0.00372, "grad_norm": 2.65625, "learning_rate": 0.0009969336649939175, "loss": 2.2196, "step": 465 }, { "epoch": 0.003728, "grad_norm": 2.390625, "learning_rate": 0.000996919725613763, "loss": 2.3339, "step": 466 }, { "epoch": 0.003736, "grad_norm": 2.609375, "learning_rate": 0.0009969057547195112, "loss": 2.2078, "step": 467 }, { "epoch": 0.003744, "grad_norm": 2.53125, "learning_rate": 0.0009968917523120483, "loss": 2.2207, "step": 468 }, { "epoch": 0.003752, "grad_norm": 2.4375, "learning_rate": 0.000996877718392262, "loss": 2.4416, "step": 469 }, { "epoch": 0.00376, "grad_norm": 2.28125, "learning_rate": 0.0009968636529610426, "loss": 1.9991, 
"step": 470 }, { "epoch": 0.003768, "grad_norm": 1.9453125, "learning_rate": 0.000996849556019282, "loss": 1.9126, "step": 471 }, { "epoch": 0.003776, "grad_norm": 2.578125, "learning_rate": 0.000996835427567874, "loss": 2.2536, "step": 472 }, { "epoch": 0.003784, "grad_norm": 2.578125, "learning_rate": 0.000996821267607715, "loss": 2.0077, "step": 473 }, { "epoch": 0.003792, "grad_norm": 3.125, "learning_rate": 0.0009968070761397028, "loss": 2.1747, "step": 474 }, { "epoch": 0.0038, "grad_norm": 3.25, "learning_rate": 0.0009967928531647372, "loss": 2.5177, "step": 475 }, { "epoch": 0.003808, "grad_norm": 2.9375, "learning_rate": 0.0009967785986837207, "loss": 2.4961, "step": 476 }, { "epoch": 0.003816, "grad_norm": 2.296875, "learning_rate": 0.000996764312697557, "loss": 1.9746, "step": 477 }, { "epoch": 0.003824, "grad_norm": 4.0, "learning_rate": 0.000996749995207152, "loss": 2.2243, "step": 478 }, { "epoch": 0.003832, "grad_norm": 2.1875, "learning_rate": 0.000996735646213414, "loss": 1.9488, "step": 479 }, { "epoch": 0.00384, "grad_norm": 2.734375, "learning_rate": 0.0009967212657172528, "loss": 2.1615, "step": 480 }, { "epoch": 0.003848, "grad_norm": 2.765625, "learning_rate": 0.0009967068537195805, "loss": 3.115, "step": 481 }, { "epoch": 0.003856, "grad_norm": 3.65625, "learning_rate": 0.0009966924102213107, "loss": 2.2303, "step": 482 }, { "epoch": 0.003864, "grad_norm": 2.40625, "learning_rate": 0.00099667793522336, "loss": 1.8144, "step": 483 }, { "epoch": 0.003872, "grad_norm": 2.46875, "learning_rate": 0.000996663428726646, "loss": 2.1236, "step": 484 }, { "epoch": 0.00388, "grad_norm": 3.171875, "learning_rate": 0.000996648890732089, "loss": 2.6643, "step": 485 }, { "epoch": 0.003888, "grad_norm": 2.921875, "learning_rate": 0.0009966343212406104, "loss": 2.3852, "step": 486 }, { "epoch": 0.003896, "grad_norm": 2.4375, "learning_rate": 0.0009966197202531347, "loss": 2.2391, "step": 487 }, { "epoch": 0.003904, "grad_norm": 2.25, "learning_rate": 0.000996605087770588, "loss": 2.1362, "step": 488 }, { "epoch": 0.003912, "grad_norm": 3.328125, "learning_rate": 0.0009965904237938977, "loss": 2.7153, "step": 489 }, { "epoch": 0.00392, "grad_norm": 2.640625, "learning_rate": 0.000996575728323994, "loss": 1.8512, "step": 490 }, { "epoch": 0.003928, "grad_norm": 2.671875, "learning_rate": 0.000996561001361809, "loss": 2.6052, "step": 491 }, { "epoch": 0.003936, "grad_norm": 2.703125, "learning_rate": 0.0009965462429082769, "loss": 2.6124, "step": 492 }, { "epoch": 0.003944, "grad_norm": 2.296875, "learning_rate": 0.0009965314529643333, "loss": 2.0278, "step": 493 }, { "epoch": 0.003952, "grad_norm": 2.828125, "learning_rate": 0.000996516631530916, "loss": 2.4138, "step": 494 }, { "epoch": 0.00396, "grad_norm": 2.5, "learning_rate": 0.0009965017786089654, "loss": 2.2343, "step": 495 }, { "epoch": 0.003968, "grad_norm": 3.921875, "learning_rate": 0.0009964868941994233, "loss": 2.0331, "step": 496 }, { "epoch": 0.003976, "grad_norm": 2.671875, "learning_rate": 0.0009964719783032334, "loss": 2.5999, "step": 497 }, { "epoch": 0.003984, "grad_norm": 2.3125, "learning_rate": 0.000996457030921342, "loss": 2.6078, "step": 498 }, { "epoch": 0.003992, "grad_norm": 3.078125, "learning_rate": 0.0009964420520546969, "loss": 2.2722, "step": 499 }, { "epoch": 0.004, "grad_norm": 2.28125, "learning_rate": 0.000996427041704248, "loss": 2.6602, "step": 500 }, { "epoch": 0.004008, "grad_norm": 2.171875, "learning_rate": 0.0009964119998709473, "loss": 2.2992, "step": 501 }, { "epoch": 0.004016, 
"grad_norm": 2.671875, "learning_rate": 0.0009963969265557487, "loss": 2.3706, "step": 502 }, { "epoch": 0.004024, "grad_norm": 2.203125, "learning_rate": 0.0009963818217596082, "loss": 2.6383, "step": 503 }, { "epoch": 0.004032, "grad_norm": 2.359375, "learning_rate": 0.0009963666854834838, "loss": 2.4167, "step": 504 }, { "epoch": 0.00404, "grad_norm": 2.703125, "learning_rate": 0.0009963515177283352, "loss": 2.2844, "step": 505 }, { "epoch": 0.004048, "grad_norm": 2.515625, "learning_rate": 0.0009963363184951244, "loss": 2.0252, "step": 506 }, { "epoch": 0.004056, "grad_norm": 2.171875, "learning_rate": 0.0009963210877848155, "loss": 2.4154, "step": 507 }, { "epoch": 0.004064, "grad_norm": 3.0, "learning_rate": 0.0009963058255983742, "loss": 2.3374, "step": 508 }, { "epoch": 0.004072, "grad_norm": 2.71875, "learning_rate": 0.0009962905319367682, "loss": 2.8243, "step": 509 }, { "epoch": 0.00408, "grad_norm": 2.453125, "learning_rate": 0.000996275206800968, "loss": 2.251, "step": 510 }, { "epoch": 0.004088, "grad_norm": 2.390625, "learning_rate": 0.0009962598501919453, "loss": 2.3201, "step": 511 }, { "epoch": 0.004096, "grad_norm": 3.125, "learning_rate": 0.0009962444621106736, "loss": 2.5702, "step": 512 }, { "epoch": 0.004104, "grad_norm": 2.703125, "learning_rate": 0.0009962290425581293, "loss": 2.6587, "step": 513 }, { "epoch": 0.004112, "grad_norm": 2.28125, "learning_rate": 0.00099621359153529, "loss": 2.0824, "step": 514 }, { "epoch": 0.00412, "grad_norm": 2.734375, "learning_rate": 0.0009961981090431356, "loss": 2.2835, "step": 515 }, { "epoch": 0.004128, "grad_norm": 2.90625, "learning_rate": 0.0009961825950826483, "loss": 2.1895, "step": 516 }, { "epoch": 0.004136, "grad_norm": 2.5625, "learning_rate": 0.0009961670496548114, "loss": 2.4409, "step": 517 }, { "epoch": 0.004144, "grad_norm": 2.3125, "learning_rate": 0.000996151472760611, "loss": 2.9021, "step": 518 }, { "epoch": 0.004152, "grad_norm": 4.96875, "learning_rate": 0.0009961358644010353, "loss": 2.2175, "step": 519 }, { "epoch": 0.00416, "grad_norm": 2.53125, "learning_rate": 0.000996120224577074, "loss": 2.4303, "step": 520 }, { "epoch": 0.004168, "grad_norm": 2.4375, "learning_rate": 0.0009961045532897188, "loss": 2.6017, "step": 521 }, { "epoch": 0.004176, "grad_norm": 2.75, "learning_rate": 0.0009960888505399638, "loss": 2.2876, "step": 522 }, { "epoch": 0.004184, "grad_norm": 2.4375, "learning_rate": 0.0009960731163288043, "loss": 2.3491, "step": 523 }, { "epoch": 0.004192, "grad_norm": 3.125, "learning_rate": 0.000996057350657239, "loss": 2.1683, "step": 524 }, { "epoch": 0.0042, "grad_norm": 2.203125, "learning_rate": 0.000996041553526267, "loss": 2.0598, "step": 525 }, { "epoch": 0.004208, "grad_norm": 2.6875, "learning_rate": 0.0009960257249368905, "loss": 2.0296, "step": 526 }, { "epoch": 0.004216, "grad_norm": 3.5, "learning_rate": 0.0009960098648901133, "loss": 2.6361, "step": 527 }, { "epoch": 0.004224, "grad_norm": 3.28125, "learning_rate": 0.000995993973386941, "loss": 2.6909, "step": 528 }, { "epoch": 0.004232, "grad_norm": 2.328125, "learning_rate": 0.0009959780504283818, "loss": 2.0399, "step": 529 }, { "epoch": 0.00424, "grad_norm": 2.515625, "learning_rate": 0.0009959620960154454, "loss": 2.5895, "step": 530 }, { "epoch": 0.004248, "grad_norm": 2.3125, "learning_rate": 0.0009959461101491432, "loss": 2.373, "step": 531 }, { "epoch": 0.004256, "grad_norm": 2.46875, "learning_rate": 0.0009959300928304895, "loss": 2.3215, "step": 532 }, { "epoch": 0.004264, "grad_norm": 2.203125, "learning_rate": 
0.0009959140440605001, "loss": 2.3436, "step": 533 }, { "epoch": 0.004272, "grad_norm": 2.671875, "learning_rate": 0.0009958979638401925, "loss": 2.0572, "step": 534 }, { "epoch": 0.00428, "grad_norm": 2.125, "learning_rate": 0.0009958818521705866, "loss": 2.1372, "step": 535 }, { "epoch": 0.004288, "grad_norm": 2.375, "learning_rate": 0.0009958657090527042, "loss": 2.4991, "step": 536 }, { "epoch": 0.004296, "grad_norm": 2.5, "learning_rate": 0.000995849534487569, "loss": 2.5641, "step": 537 }, { "epoch": 0.004304, "grad_norm": 2.34375, "learning_rate": 0.000995833328476207, "loss": 2.0741, "step": 538 }, { "epoch": 0.004312, "grad_norm": 2.25, "learning_rate": 0.0009958170910196458, "loss": 2.2448, "step": 539 }, { "epoch": 0.00432, "grad_norm": 2.4375, "learning_rate": 0.000995800822118915, "loss": 2.3521, "step": 540 }, { "epoch": 0.004328, "grad_norm": 2.625, "learning_rate": 0.0009957845217750468, "loss": 2.1974, "step": 541 }, { "epoch": 0.004336, "grad_norm": 2.875, "learning_rate": 0.0009957681899890746, "loss": 2.8252, "step": 542 }, { "epoch": 0.004344, "grad_norm": 2.3125, "learning_rate": 0.0009957518267620343, "loss": 2.6459, "step": 543 }, { "epoch": 0.004352, "grad_norm": 2.203125, "learning_rate": 0.0009957354320949635, "loss": 1.9224, "step": 544 }, { "epoch": 0.00436, "grad_norm": 1.9140625, "learning_rate": 0.0009957190059889018, "loss": 1.8312, "step": 545 }, { "epoch": 0.004368, "grad_norm": 2.546875, "learning_rate": 0.0009957025484448913, "loss": 2.3098, "step": 546 }, { "epoch": 0.004376, "grad_norm": 2.46875, "learning_rate": 0.0009956860594639758, "loss": 2.3105, "step": 547 }, { "epoch": 0.004384, "grad_norm": 3.046875, "learning_rate": 0.0009956695390472006, "loss": 2.217, "step": 548 }, { "epoch": 0.004392, "grad_norm": 2.234375, "learning_rate": 0.0009956529871956133, "loss": 2.0734, "step": 549 }, { "epoch": 0.0044, "grad_norm": 2.609375, "learning_rate": 0.000995636403910264, "loss": 2.5188, "step": 550 }, { "epoch": 0.004408, "grad_norm": 2.3125, "learning_rate": 0.0009956197891922045, "loss": 1.8601, "step": 551 }, { "epoch": 0.004416, "grad_norm": 2.84375, "learning_rate": 0.000995603143042488, "loss": 2.7357, "step": 552 }, { "epoch": 0.004424, "grad_norm": 2.5, "learning_rate": 0.0009955864654621707, "loss": 2.0502, "step": 553 }, { "epoch": 0.004432, "grad_norm": 2.546875, "learning_rate": 0.0009955697564523096, "loss": 1.8601, "step": 554 }, { "epoch": 0.00444, "grad_norm": 2.71875, "learning_rate": 0.000995553016013965, "loss": 2.4712, "step": 555 }, { "epoch": 0.004448, "grad_norm": 2.609375, "learning_rate": 0.0009955362441481984, "loss": 2.1726, "step": 556 }, { "epoch": 0.004456, "grad_norm": 2.859375, "learning_rate": 0.0009955194408560733, "loss": 2.7408, "step": 557 }, { "epoch": 0.004464, "grad_norm": 2.34375, "learning_rate": 0.0009955026061386554, "loss": 2.6763, "step": 558 }, { "epoch": 0.004472, "grad_norm": 2.359375, "learning_rate": 0.0009954857399970124, "loss": 2.3973, "step": 559 }, { "epoch": 0.00448, "grad_norm": 2.28125, "learning_rate": 0.0009954688424322138, "loss": 2.1744, "step": 560 }, { "epoch": 0.004488, "grad_norm": 2.234375, "learning_rate": 0.0009954519134453316, "loss": 2.2592, "step": 561 }, { "epoch": 0.004496, "grad_norm": 2.359375, "learning_rate": 0.0009954349530374388, "loss": 2.0077, "step": 562 }, { "epoch": 0.004504, "grad_norm": 2.515625, "learning_rate": 0.0009954179612096114, "loss": 2.263, "step": 563 }, { "epoch": 0.004512, "grad_norm": 3.671875, "learning_rate": 0.0009954009379629272, "loss": 2.0888, 
"step": 564 }, { "epoch": 0.00452, "grad_norm": 2.359375, "learning_rate": 0.0009953838832984655, "loss": 2.1235, "step": 565 }, { "epoch": 0.004528, "grad_norm": 3.6875, "learning_rate": 0.0009953667972173077, "loss": 2.4951, "step": 566 }, { "epoch": 0.004536, "grad_norm": 2.453125, "learning_rate": 0.0009953496797205376, "loss": 2.1148, "step": 567 }, { "epoch": 0.004544, "grad_norm": 2.59375, "learning_rate": 0.0009953325308092412, "loss": 2.5784, "step": 568 }, { "epoch": 0.004552, "grad_norm": 2.40625, "learning_rate": 0.0009953153504845054, "loss": 2.3503, "step": 569 }, { "epoch": 0.00456, "grad_norm": 2.296875, "learning_rate": 0.00099529813874742, "loss": 2.1618, "step": 570 }, { "epoch": 0.004568, "grad_norm": 2.40625, "learning_rate": 0.0009952808955990765, "loss": 1.9721, "step": 571 }, { "epoch": 0.004576, "grad_norm": 2.140625, "learning_rate": 0.0009952636210405685, "loss": 2.085, "step": 572 }, { "epoch": 0.004584, "grad_norm": 3.0, "learning_rate": 0.0009952463150729918, "loss": 2.3833, "step": 573 }, { "epoch": 0.004592, "grad_norm": 2.546875, "learning_rate": 0.0009952289776974435, "loss": 2.1663, "step": 574 }, { "epoch": 0.0046, "grad_norm": 3.0625, "learning_rate": 0.0009952116089150232, "loss": 2.682, "step": 575 }, { "epoch": 0.004608, "grad_norm": 2.46875, "learning_rate": 0.0009951942087268325, "loss": 2.4682, "step": 576 }, { "epoch": 0.004616, "grad_norm": 3.640625, "learning_rate": 0.000995176777133975, "loss": 3.0183, "step": 577 }, { "epoch": 0.004624, "grad_norm": 2.859375, "learning_rate": 0.000995159314137556, "loss": 2.0004, "step": 578 }, { "epoch": 0.004632, "grad_norm": 2.75, "learning_rate": 0.000995141819738683, "loss": 2.5587, "step": 579 }, { "epoch": 0.00464, "grad_norm": 2.5, "learning_rate": 0.0009951242939384658, "loss": 2.4154, "step": 580 }, { "epoch": 0.004648, "grad_norm": 2.625, "learning_rate": 0.0009951067367380154, "loss": 2.3805, "step": 581 }, { "epoch": 0.004656, "grad_norm": 2.15625, "learning_rate": 0.0009950891481384455, "loss": 2.3163, "step": 582 }, { "epoch": 0.004664, "grad_norm": 2.390625, "learning_rate": 0.0009950715281408715, "loss": 2.3556, "step": 583 }, { "epoch": 0.004672, "grad_norm": 2.8125, "learning_rate": 0.0009950538767464108, "loss": 2.1463, "step": 584 }, { "epoch": 0.00468, "grad_norm": 2.53125, "learning_rate": 0.0009950361939561829, "loss": 2.4619, "step": 585 }, { "epoch": 0.004688, "grad_norm": 2.890625, "learning_rate": 0.0009950184797713093, "loss": 3.4063, "step": 586 }, { "epoch": 0.004696, "grad_norm": 2.34375, "learning_rate": 0.0009950007341929133, "loss": 1.9453, "step": 587 }, { "epoch": 0.004704, "grad_norm": 2.328125, "learning_rate": 0.0009949829572221203, "loss": 2.14, "step": 588 }, { "epoch": 0.004712, "grad_norm": 2.640625, "learning_rate": 0.0009949651488600577, "loss": 2.8374, "step": 589 }, { "epoch": 0.00472, "grad_norm": 2.03125, "learning_rate": 0.0009949473091078551, "loss": 1.8569, "step": 590 }, { "epoch": 0.004728, "grad_norm": 2.78125, "learning_rate": 0.0009949294379666433, "loss": 2.5642, "step": 591 }, { "epoch": 0.004736, "grad_norm": 2.65625, "learning_rate": 0.0009949115354375565, "loss": 2.4275, "step": 592 }, { "epoch": 0.004744, "grad_norm": 2.375, "learning_rate": 0.0009948936015217293, "loss": 1.9503, "step": 593 }, { "epoch": 0.004752, "grad_norm": 2.953125, "learning_rate": 0.0009948756362202996, "loss": 2.5101, "step": 594 }, { "epoch": 0.00476, "grad_norm": 2.421875, "learning_rate": 0.0009948576395344062, "loss": 2.4073, "step": 595 }, { "epoch": 0.004768, 
"grad_norm": 2.84375, "learning_rate": 0.000994839611465191, "loss": 2.1728, "step": 596 }, { "epoch": 0.004776, "grad_norm": 2.8125, "learning_rate": 0.000994821552013797, "loss": 2.0661, "step": 597 }, { "epoch": 0.004784, "grad_norm": 2.25, "learning_rate": 0.0009948034611813694, "loss": 2.0198, "step": 598 }, { "epoch": 0.004792, "grad_norm": 2.078125, "learning_rate": 0.000994785338969056, "loss": 2.1655, "step": 599 }, { "epoch": 0.0048, "grad_norm": 2.140625, "learning_rate": 0.0009947671853780054, "loss": 2.0912, "step": 600 }, { "epoch": 0.004808, "grad_norm": 2.40625, "learning_rate": 0.0009947490004093694, "loss": 1.86, "step": 601 }, { "epoch": 0.004816, "grad_norm": 2.53125, "learning_rate": 0.0009947307840643012, "loss": 2.2482, "step": 602 }, { "epoch": 0.004824, "grad_norm": 2.9375, "learning_rate": 0.0009947125363439558, "loss": 2.8052, "step": 603 }, { "epoch": 0.004832, "grad_norm": 3.21875, "learning_rate": 0.0009946942572494905, "loss": 2.2962, "step": 604 }, { "epoch": 0.00484, "grad_norm": 2.03125, "learning_rate": 0.000994675946782065, "loss": 2.1352, "step": 605 }, { "epoch": 0.004848, "grad_norm": 2.84375, "learning_rate": 0.0009946576049428399, "loss": 1.8982, "step": 606 }, { "epoch": 0.004856, "grad_norm": 2.53125, "learning_rate": 0.0009946392317329788, "loss": 2.1391, "step": 607 }, { "epoch": 0.004864, "grad_norm": 2.28125, "learning_rate": 0.0009946208271536469, "loss": 2.183, "step": 608 }, { "epoch": 0.004872, "grad_norm": 2.484375, "learning_rate": 0.0009946023912060112, "loss": 2.7096, "step": 609 }, { "epoch": 0.00488, "grad_norm": 2.578125, "learning_rate": 0.0009945839238912411, "loss": 2.4638, "step": 610 }, { "epoch": 0.004888, "grad_norm": 2.96875, "learning_rate": 0.0009945654252105076, "loss": 2.2467, "step": 611 }, { "epoch": 0.004896, "grad_norm": 2.09375, "learning_rate": 0.0009945468951649838, "loss": 2.041, "step": 612 }, { "epoch": 0.004904, "grad_norm": 2.546875, "learning_rate": 0.0009945283337558452, "loss": 2.5474, "step": 613 }, { "epoch": 0.004912, "grad_norm": 2.25, "learning_rate": 0.0009945097409842687, "loss": 2.1981, "step": 614 }, { "epoch": 0.00492, "grad_norm": 2.5625, "learning_rate": 0.0009944911168514336, "loss": 2.069, "step": 615 }, { "epoch": 0.004928, "grad_norm": 2.578125, "learning_rate": 0.0009944724613585208, "loss": 2.4677, "step": 616 }, { "epoch": 0.004936, "grad_norm": 2.921875, "learning_rate": 0.0009944537745067133, "loss": 2.4839, "step": 617 }, { "epoch": 0.004944, "grad_norm": 2.390625, "learning_rate": 0.0009944350562971966, "loss": 2.5064, "step": 618 }, { "epoch": 0.004952, "grad_norm": 2.390625, "learning_rate": 0.0009944163067311575, "loss": 2.257, "step": 619 }, { "epoch": 0.00496, "grad_norm": 2.359375, "learning_rate": 0.0009943975258097852, "loss": 2.1787, "step": 620 }, { "epoch": 0.004968, "grad_norm": 3.109375, "learning_rate": 0.000994378713534271, "loss": 2.9127, "step": 621 }, { "epoch": 0.004976, "grad_norm": 2.09375, "learning_rate": 0.0009943598699058073, "loss": 1.9914, "step": 622 }, { "epoch": 0.004984, "grad_norm": 3.546875, "learning_rate": 0.0009943409949255896, "loss": 2.2802, "step": 623 }, { "epoch": 0.004992, "grad_norm": 2.984375, "learning_rate": 0.000994322088594815, "loss": 2.5863, "step": 624 }, { "epoch": 0.005, "grad_norm": 2.0, "learning_rate": 0.0009943031509146824, "loss": 2.0677, "step": 625 }, { "epoch": 0.005008, "grad_norm": 2.734375, "learning_rate": 0.0009942841818863927, "loss": 2.4079, "step": 626 }, { "epoch": 0.005016, "grad_norm": 3.46875, "learning_rate": 
0.000994265181511149, "loss": 2.0485, "step": 627 }, { "epoch": 0.005024, "grad_norm": 3.328125, "learning_rate": 0.0009942461497901563, "loss": 2.8185, "step": 628 }, { "epoch": 0.005032, "grad_norm": 2.015625, "learning_rate": 0.0009942270867246215, "loss": 2.1898, "step": 629 }, { "epoch": 0.00504, "grad_norm": 2.125, "learning_rate": 0.0009942079923157534, "loss": 2.1861, "step": 630 }, { "epoch": 0.005048, "grad_norm": 2.5, "learning_rate": 0.0009941888665647633, "loss": 1.9236, "step": 631 }, { "epoch": 0.005056, "grad_norm": 2.28125, "learning_rate": 0.0009941697094728642, "loss": 2.185, "step": 632 }, { "epoch": 0.005064, "grad_norm": 2.9375, "learning_rate": 0.0009941505210412705, "loss": 2.6772, "step": 633 }, { "epoch": 0.005072, "grad_norm": 2.359375, "learning_rate": 0.0009941313012711995, "loss": 2.1723, "step": 634 }, { "epoch": 0.00508, "grad_norm": 3.109375, "learning_rate": 0.00099411205016387, "loss": 2.828, "step": 635 }, { "epoch": 0.005088, "grad_norm": 2.609375, "learning_rate": 0.0009940927677205026, "loss": 2.5504, "step": 636 }, { "epoch": 0.005096, "grad_norm": 2.625, "learning_rate": 0.0009940734539423208, "loss": 2.3441, "step": 637 }, { "epoch": 0.005104, "grad_norm": 3.859375, "learning_rate": 0.0009940541088305492, "loss": 2.4745, "step": 638 }, { "epoch": 0.005112, "grad_norm": 2.609375, "learning_rate": 0.0009940347323864143, "loss": 2.4442, "step": 639 }, { "epoch": 0.00512, "grad_norm": 2.703125, "learning_rate": 0.0009940153246111453, "loss": 2.7647, "step": 640 }, { "epoch": 0.005128, "grad_norm": 2.1875, "learning_rate": 0.000993995885505973, "loss": 2.275, "step": 641 }, { "epoch": 0.005136, "grad_norm": 4.375, "learning_rate": 0.00099397641507213, "loss": 1.8479, "step": 642 }, { "epoch": 0.005144, "grad_norm": 2.546875, "learning_rate": 0.000993956913310851, "loss": 2.4127, "step": 643 }, { "epoch": 0.005152, "grad_norm": 2.796875, "learning_rate": 0.0009939373802233733, "loss": 2.1718, "step": 644 }, { "epoch": 0.00516, "grad_norm": 2.28125, "learning_rate": 0.0009939178158109353, "loss": 2.2501, "step": 645 }, { "epoch": 0.005168, "grad_norm": 2.515625, "learning_rate": 0.0009938982200747778, "loss": 2.7814, "step": 646 }, { "epoch": 0.005176, "grad_norm": 2.671875, "learning_rate": 0.0009938785930161437, "loss": 2.2868, "step": 647 }, { "epoch": 0.005184, "grad_norm": 2.328125, "learning_rate": 0.0009938589346362773, "loss": 2.4013, "step": 648 }, { "epoch": 0.005192, "grad_norm": 2.96875, "learning_rate": 0.0009938392449364256, "loss": 2.2646, "step": 649 }, { "epoch": 0.0052, "grad_norm": 2.890625, "learning_rate": 0.0009938195239178375, "loss": 2.7358, "step": 650 }, { "epoch": 0.005208, "grad_norm": 2.546875, "learning_rate": 0.0009937997715817632, "loss": 2.1794, "step": 651 }, { "epoch": 0.005216, "grad_norm": 2.390625, "learning_rate": 0.0009937799879294559, "loss": 2.3841, "step": 652 }, { "epoch": 0.005224, "grad_norm": 1.8359375, "learning_rate": 0.00099376017296217, "loss": 1.7848, "step": 653 }, { "epoch": 0.005232, "grad_norm": 2.515625, "learning_rate": 0.0009937403266811618, "loss": 2.3441, "step": 654 }, { "epoch": 0.00524, "grad_norm": 2.421875, "learning_rate": 0.0009937204490876904, "loss": 2.3864, "step": 655 }, { "epoch": 0.005248, "grad_norm": 2.515625, "learning_rate": 0.0009937005401830165, "loss": 1.9804, "step": 656 }, { "epoch": 0.005256, "grad_norm": 2.890625, "learning_rate": 0.0009936805999684025, "loss": 2.5116, "step": 657 }, { "epoch": 0.005264, "grad_norm": 3.125, "learning_rate": 0.000993660628445113, "loss": 
2.4067, "step": 658 }, { "epoch": 0.005272, "grad_norm": 2.734375, "learning_rate": 0.0009936406256144143, "loss": 1.773, "step": 659 }, { "epoch": 0.00528, "grad_norm": 2.5625, "learning_rate": 0.0009936205914775754, "loss": 1.8227, "step": 660 }, { "epoch": 0.005288, "grad_norm": 2.65625, "learning_rate": 0.0009936005260358664, "loss": 2.6003, "step": 661 }, { "epoch": 0.005296, "grad_norm": 2.640625, "learning_rate": 0.0009935804292905603, "loss": 1.5483, "step": 662 }, { "epoch": 0.005304, "grad_norm": 2.359375, "learning_rate": 0.0009935603012429313, "loss": 2.2882, "step": 663 }, { "epoch": 0.005312, "grad_norm": 3.15625, "learning_rate": 0.0009935401418942561, "loss": 2.7045, "step": 664 }, { "epoch": 0.00532, "grad_norm": 1.9609375, "learning_rate": 0.000993519951245813, "loss": 1.7143, "step": 665 }, { "epoch": 0.005328, "grad_norm": 2.578125, "learning_rate": 0.0009934997292988825, "loss": 2.2947, "step": 666 }, { "epoch": 0.005336, "grad_norm": 2.359375, "learning_rate": 0.000993479476054747, "loss": 2.1091, "step": 667 }, { "epoch": 0.005344, "grad_norm": 2.203125, "learning_rate": 0.0009934591915146912, "loss": 2.3429, "step": 668 }, { "epoch": 0.005352, "grad_norm": 2.453125, "learning_rate": 0.0009934388756800013, "loss": 1.9753, "step": 669 }, { "epoch": 0.00536, "grad_norm": 3.640625, "learning_rate": 0.000993418528551966, "loss": 2.479, "step": 670 }, { "epoch": 0.005368, "grad_norm": 2.46875, "learning_rate": 0.000993398150131875, "loss": 2.3421, "step": 671 }, { "epoch": 0.005376, "grad_norm": 2.75, "learning_rate": 0.0009933777404210213, "loss": 2.5982, "step": 672 }, { "epoch": 0.005384, "grad_norm": 2.484375, "learning_rate": 0.0009933572994206992, "loss": 2.3622, "step": 673 }, { "epoch": 0.005392, "grad_norm": 2.765625, "learning_rate": 0.000993336827132205, "loss": 2.5144, "step": 674 }, { "epoch": 0.0054, "grad_norm": 3.03125, "learning_rate": 0.0009933163235568369, "loss": 2.4361, "step": 675 }, { "epoch": 0.005408, "grad_norm": 3.125, "learning_rate": 0.000993295788695895, "loss": 2.3388, "step": 676 }, { "epoch": 0.005416, "grad_norm": 2.578125, "learning_rate": 0.0009932752225506822, "loss": 1.6465, "step": 677 }, { "epoch": 0.005424, "grad_norm": 2.640625, "learning_rate": 0.0009932546251225022, "loss": 2.2241, "step": 678 }, { "epoch": 0.005432, "grad_norm": 2.515625, "learning_rate": 0.0009932339964126618, "loss": 2.1922, "step": 679 }, { "epoch": 0.00544, "grad_norm": 3.328125, "learning_rate": 0.0009932133364224688, "loss": 2.9018, "step": 680 }, { "epoch": 0.005448, "grad_norm": 2.578125, "learning_rate": 0.0009931926451532337, "loss": 1.8611, "step": 681 }, { "epoch": 0.005456, "grad_norm": 3.609375, "learning_rate": 0.0009931719226062687, "loss": 3.0274, "step": 682 }, { "epoch": 0.005464, "grad_norm": 2.46875, "learning_rate": 0.000993151168782888, "loss": 2.7502, "step": 683 }, { "epoch": 0.005472, "grad_norm": 2.453125, "learning_rate": 0.0009931303836844074, "loss": 2.4806, "step": 684 }, { "epoch": 0.00548, "grad_norm": 2.390625, "learning_rate": 0.0009931095673121454, "loss": 2.423, "step": 685 }, { "epoch": 0.005488, "grad_norm": 2.796875, "learning_rate": 0.0009930887196674225, "loss": 2.7128, "step": 686 }, { "epoch": 0.005496, "grad_norm": 2.4375, "learning_rate": 0.00099306784075156, "loss": 2.4644, "step": 687 }, { "epoch": 0.005504, "grad_norm": 3.171875, "learning_rate": 0.0009930469305658828, "loss": 2.2745, "step": 688 }, { "epoch": 0.005512, "grad_norm": 3.125, "learning_rate": 0.0009930259891117166, "loss": 2.1802, "step": 689 }, { 
"epoch": 0.00552, "grad_norm": 2.578125, "learning_rate": 0.0009930050163903896, "loss": 2.3705, "step": 690 }, { "epoch": 0.005528, "grad_norm": 2.34375, "learning_rate": 0.0009929840124032318, "loss": 2.3073, "step": 691 }, { "epoch": 0.005536, "grad_norm": 2.390625, "learning_rate": 0.0009929629771515754, "loss": 2.7613, "step": 692 }, { "epoch": 0.005544, "grad_norm": 2.453125, "learning_rate": 0.0009929419106367542, "loss": 2.1751, "step": 693 }, { "epoch": 0.005552, "grad_norm": 2.6875, "learning_rate": 0.0009929208128601043, "loss": 2.5635, "step": 694 }, { "epoch": 0.00556, "grad_norm": 1.9609375, "learning_rate": 0.0009928996838229637, "loss": 1.9968, "step": 695 }, { "epoch": 0.005568, "grad_norm": 2.421875, "learning_rate": 0.0009928785235266725, "loss": 2.4433, "step": 696 }, { "epoch": 0.005576, "grad_norm": 2.21875, "learning_rate": 0.0009928573319725727, "loss": 2.2108, "step": 697 }, { "epoch": 0.005584, "grad_norm": 2.8125, "learning_rate": 0.000992836109162008, "loss": 2.6701, "step": 698 }, { "epoch": 0.005592, "grad_norm": 2.234375, "learning_rate": 0.0009928148550963243, "loss": 2.0521, "step": 699 }, { "epoch": 0.0056, "grad_norm": 3.265625, "learning_rate": 0.0009927935697768698, "loss": 2.3745, "step": 700 }, { "epoch": 0.005608, "grad_norm": 2.578125, "learning_rate": 0.0009927722532049941, "loss": 2.604, "step": 701 }, { "epoch": 0.005616, "grad_norm": 2.40625, "learning_rate": 0.0009927509053820494, "loss": 2.3476, "step": 702 }, { "epoch": 0.005624, "grad_norm": 2.84375, "learning_rate": 0.0009927295263093894, "loss": 2.2101, "step": 703 }, { "epoch": 0.005632, "grad_norm": 2.453125, "learning_rate": 0.0009927081159883696, "loss": 2.3007, "step": 704 }, { "epoch": 0.00564, "grad_norm": 2.953125, "learning_rate": 0.0009926866744203486, "loss": 2.4521, "step": 705 }, { "epoch": 0.005648, "grad_norm": 2.3125, "learning_rate": 0.0009926652016066853, "loss": 2.7074, "step": 706 }, { "epoch": 0.005656, "grad_norm": 2.78125, "learning_rate": 0.0009926436975487424, "loss": 2.6884, "step": 707 }, { "epoch": 0.005664, "grad_norm": 2.75, "learning_rate": 0.0009926221622478825, "loss": 2.6637, "step": 708 }, { "epoch": 0.005672, "grad_norm": 2.15625, "learning_rate": 0.0009926005957054725, "loss": 2.3447, "step": 709 }, { "epoch": 0.00568, "grad_norm": 3.0625, "learning_rate": 0.0009925789979228796, "loss": 2.0975, "step": 710 }, { "epoch": 0.005688, "grad_norm": 2.6875, "learning_rate": 0.0009925573689014734, "loss": 2.7824, "step": 711 }, { "epoch": 0.005696, "grad_norm": 2.625, "learning_rate": 0.0009925357086426258, "loss": 2.5509, "step": 712 }, { "epoch": 0.005704, "grad_norm": 2.5, "learning_rate": 0.0009925140171477104, "loss": 2.4388, "step": 713 }, { "epoch": 0.005712, "grad_norm": 2.65625, "learning_rate": 0.0009924922944181032, "loss": 2.5461, "step": 714 }, { "epoch": 0.00572, "grad_norm": 2.734375, "learning_rate": 0.0009924705404551811, "loss": 2.23, "step": 715 }, { "epoch": 0.005728, "grad_norm": 2.859375, "learning_rate": 0.0009924487552603243, "loss": 2.2301, "step": 716 }, { "epoch": 0.005736, "grad_norm": 3.09375, "learning_rate": 0.0009924269388349141, "loss": 2.4589, "step": 717 }, { "epoch": 0.005744, "grad_norm": 2.90625, "learning_rate": 0.0009924050911803341, "loss": 2.4995, "step": 718 }, { "epoch": 0.005752, "grad_norm": 2.53125, "learning_rate": 0.0009923832122979701, "loss": 2.1376, "step": 719 }, { "epoch": 0.00576, "grad_norm": 3.546875, "learning_rate": 0.0009923613021892092, "loss": 2.0861, "step": 720 }, { "epoch": 0.005768, "grad_norm": 
3.0625, "learning_rate": 0.0009923393608554415, "loss": 2.2836, "step": 721 }, { "epoch": 0.005776, "grad_norm": 2.390625, "learning_rate": 0.000992317388298058, "loss": 1.9761, "step": 722 }, { "epoch": 0.005784, "grad_norm": 3.09375, "learning_rate": 0.0009922953845184523, "loss": 2.3847, "step": 723 }, { "epoch": 0.005792, "grad_norm": 2.71875, "learning_rate": 0.0009922733495180199, "loss": 2.5171, "step": 724 }, { "epoch": 0.0058, "grad_norm": 2.3125, "learning_rate": 0.0009922512832981584, "loss": 2.0619, "step": 725 }, { "epoch": 0.005808, "grad_norm": 2.359375, "learning_rate": 0.000992229185860267, "loss": 2.1805, "step": 726 }, { "epoch": 0.005816, "grad_norm": 3.296875, "learning_rate": 0.000992207057205747, "loss": 2.9828, "step": 727 }, { "epoch": 0.005824, "grad_norm": 2.84375, "learning_rate": 0.000992184897336002, "loss": 1.9876, "step": 728 }, { "epoch": 0.005832, "grad_norm": 2.46875, "learning_rate": 0.0009921627062524374, "loss": 2.4258, "step": 729 }, { "epoch": 0.00584, "grad_norm": 3.203125, "learning_rate": 0.0009921404839564603, "loss": 2.4812, "step": 730 }, { "epoch": 0.005848, "grad_norm": 2.5, "learning_rate": 0.00099211823044948, "loss": 2.6338, "step": 731 }, { "epoch": 0.005856, "grad_norm": 3.109375, "learning_rate": 0.0009920959457329081, "loss": 2.4888, "step": 732 }, { "epoch": 0.005864, "grad_norm": 2.609375, "learning_rate": 0.0009920736298081577, "loss": 2.637, "step": 733 }, { "epoch": 0.005872, "grad_norm": 2.171875, "learning_rate": 0.000992051282676644, "loss": 2.5485, "step": 734 }, { "epoch": 0.00588, "grad_norm": 3.46875, "learning_rate": 0.000992028904339784, "loss": 2.1388, "step": 735 }, { "epoch": 0.005888, "grad_norm": 2.828125, "learning_rate": 0.0009920064947989975, "loss": 2.6359, "step": 736 }, { "epoch": 0.005896, "grad_norm": 2.3125, "learning_rate": 0.0009919840540557054, "loss": 2.197, "step": 737 }, { "epoch": 0.005904, "grad_norm": 2.8125, "learning_rate": 0.0009919615821113307, "loss": 2.4005, "step": 738 }, { "epoch": 0.005912, "grad_norm": 2.484375, "learning_rate": 0.0009919390789672987, "loss": 2.707, "step": 739 }, { "epoch": 0.00592, "grad_norm": 3.171875, "learning_rate": 0.0009919165446250365, "loss": 2.2453, "step": 740 }, { "epoch": 0.005928, "grad_norm": 2.546875, "learning_rate": 0.000991893979085973, "loss": 2.4185, "step": 741 }, { "epoch": 0.005936, "grad_norm": 2.96875, "learning_rate": 0.00099187138235154, "loss": 2.607, "step": 742 }, { "epoch": 0.005944, "grad_norm": 5.09375, "learning_rate": 0.0009918487544231696, "loss": 2.272, "step": 743 }, { "epoch": 0.005952, "grad_norm": 2.703125, "learning_rate": 0.0009918260953022974, "loss": 2.7594, "step": 744 }, { "epoch": 0.00596, "grad_norm": 2.59375, "learning_rate": 0.0009918034049903603, "loss": 2.2736, "step": 745 }, { "epoch": 0.005968, "grad_norm": 2.203125, "learning_rate": 0.0009917806834887975, "loss": 2.2974, "step": 746 }, { "epoch": 0.005976, "grad_norm": 3.078125, "learning_rate": 0.0009917579307990496, "loss": 2.1313, "step": 747 }, { "epoch": 0.005984, "grad_norm": 2.9375, "learning_rate": 0.0009917351469225597, "loss": 2.4566, "step": 748 }, { "epoch": 0.005992, "grad_norm": 2.640625, "learning_rate": 0.0009917123318607726, "loss": 2.3872, "step": 749 }, { "epoch": 0.006, "grad_norm": 3.15625, "learning_rate": 0.0009916894856151356, "loss": 1.8446, "step": 750 }, { "epoch": 0.006008, "grad_norm": 2.765625, "learning_rate": 0.0009916666081870972, "loss": 2.2886, "step": 751 }, { "epoch": 0.006016, "grad_norm": 2.1875, "learning_rate": 
0.0009916436995781085, "loss": 2.0598, "step": 752 }, { "epoch": 0.006024, "grad_norm": 2.796875, "learning_rate": 0.0009916207597896222, "loss": 2.2278, "step": 753 }, { "epoch": 0.006032, "grad_norm": 3.453125, "learning_rate": 0.000991597788823093, "loss": 2.7454, "step": 754 }, { "epoch": 0.00604, "grad_norm": 1.8515625, "learning_rate": 0.0009915747866799782, "loss": 1.76, "step": 755 }, { "epoch": 0.006048, "grad_norm": 2.734375, "learning_rate": 0.000991551753361736, "loss": 2.2332, "step": 756 }, { "epoch": 0.006056, "grad_norm": 2.71875, "learning_rate": 0.0009915286888698275, "loss": 1.7404, "step": 757 }, { "epoch": 0.006064, "grad_norm": 2.5, "learning_rate": 0.000991505593205715, "loss": 2.0035, "step": 758 }, { "epoch": 0.006072, "grad_norm": 3.078125, "learning_rate": 0.0009914824663708637, "loss": 2.7774, "step": 759 }, { "epoch": 0.00608, "grad_norm": 6.5625, "learning_rate": 0.00099145930836674, "loss": 2.4951, "step": 760 }, { "epoch": 0.006088, "grad_norm": 2.625, "learning_rate": 0.0009914361191948125, "loss": 2.3349, "step": 761 }, { "epoch": 0.006096, "grad_norm": 2.640625, "learning_rate": 0.0009914128988565522, "loss": 2.4641, "step": 762 }, { "epoch": 0.006104, "grad_norm": 2.671875, "learning_rate": 0.0009913896473534313, "loss": 2.1047, "step": 763 }, { "epoch": 0.006112, "grad_norm": 2.671875, "learning_rate": 0.0009913663646869246, "loss": 2.6953, "step": 764 }, { "epoch": 0.00612, "grad_norm": 2.328125, "learning_rate": 0.0009913430508585086, "loss": 2.0658, "step": 765 }, { "epoch": 0.006128, "grad_norm": 2.84375, "learning_rate": 0.0009913197058696618, "loss": 2.5393, "step": 766 }, { "epoch": 0.006136, "grad_norm": 2.671875, "learning_rate": 0.0009912963297218648, "loss": 2.6389, "step": 767 }, { "epoch": 0.006144, "grad_norm": 4.375, "learning_rate": 0.0009912729224166002, "loss": 2.2208, "step": 768 }, { "epoch": 0.006152, "grad_norm": 3.0, "learning_rate": 0.0009912494839553522, "loss": 2.653, "step": 769 }, { "epoch": 0.00616, "grad_norm": 2.640625, "learning_rate": 0.0009912260143396072, "loss": 2.2034, "step": 770 }, { "epoch": 0.006168, "grad_norm": 3.296875, "learning_rate": 0.000991202513570854, "loss": 2.1248, "step": 771 }, { "epoch": 0.006176, "grad_norm": 3.421875, "learning_rate": 0.0009911789816505825, "loss": 2.2735, "step": 772 }, { "epoch": 0.006184, "grad_norm": 3.09375, "learning_rate": 0.0009911554185802855, "loss": 2.6349, "step": 773 }, { "epoch": 0.006192, "grad_norm": 2.375, "learning_rate": 0.0009911318243614573, "loss": 2.204, "step": 774 }, { "epoch": 0.0062, "grad_norm": 2.203125, "learning_rate": 0.000991108198995594, "loss": 2.0909, "step": 775 }, { "epoch": 0.006208, "grad_norm": 2.515625, "learning_rate": 0.0009910845424841938, "loss": 2.1299, "step": 776 }, { "epoch": 0.006216, "grad_norm": 2.796875, "learning_rate": 0.0009910608548287575, "loss": 2.5299, "step": 777 }, { "epoch": 0.006224, "grad_norm": 2.90625, "learning_rate": 0.0009910371360307866, "loss": 2.648, "step": 778 }, { "epoch": 0.006232, "grad_norm": 3.1875, "learning_rate": 0.000991013386091786, "loss": 2.7741, "step": 779 }, { "epoch": 0.00624, "grad_norm": 3.0, "learning_rate": 0.0009909896050132616, "loss": 2.3313, "step": 780 }, { "epoch": 0.006248, "grad_norm": 2.671875, "learning_rate": 0.0009909657927967214, "loss": 2.4001, "step": 781 }, { "epoch": 0.006256, "grad_norm": 3.15625, "learning_rate": 0.000990941949443676, "loss": 2.2397, "step": 782 }, { "epoch": 0.006264, "grad_norm": 2.703125, "learning_rate": 0.000990918074955637, "loss": 2.3784, 
"step": 783 }, { "epoch": 0.006272, "grad_norm": 2.71875, "learning_rate": 0.000990894169334119, "loss": 2.5539, "step": 784 }, { "epoch": 0.00628, "grad_norm": 2.625, "learning_rate": 0.0009908702325806375, "loss": 2.1632, "step": 785 }, { "epoch": 0.006288, "grad_norm": 2.796875, "learning_rate": 0.000990846264696711, "loss": 2.0973, "step": 786 }, { "epoch": 0.006296, "grad_norm": 2.640625, "learning_rate": 0.0009908222656838596, "loss": 2.3819, "step": 787 }, { "epoch": 0.006304, "grad_norm": 2.265625, "learning_rate": 0.000990798235543605, "loss": 2.2243, "step": 788 }, { "epoch": 0.006312, "grad_norm": 3.265625, "learning_rate": 0.000990774174277471, "loss": 2.4756, "step": 789 }, { "epoch": 0.00632, "grad_norm": 2.3125, "learning_rate": 0.0009907500818869838, "loss": 2.1659, "step": 790 }, { "epoch": 0.006328, "grad_norm": 2.921875, "learning_rate": 0.0009907259583736715, "loss": 2.4194, "step": 791 }, { "epoch": 0.006336, "grad_norm": 2.390625, "learning_rate": 0.0009907018037390634, "loss": 2.0159, "step": 792 }, { "epoch": 0.006344, "grad_norm": 3.046875, "learning_rate": 0.000990677617984692, "loss": 2.9039, "step": 793 }, { "epoch": 0.006352, "grad_norm": 2.890625, "learning_rate": 0.0009906534011120908, "loss": 2.3807, "step": 794 }, { "epoch": 0.00636, "grad_norm": 2.3125, "learning_rate": 0.0009906291531227955, "loss": 1.994, "step": 795 }, { "epoch": 0.006368, "grad_norm": 1.984375, "learning_rate": 0.0009906048740183442, "loss": 1.9649, "step": 796 }, { "epoch": 0.006376, "grad_norm": 2.96875, "learning_rate": 0.0009905805638002767, "loss": 2.5107, "step": 797 }, { "epoch": 0.006384, "grad_norm": 2.40625, "learning_rate": 0.0009905562224701344, "loss": 2.3981, "step": 798 }, { "epoch": 0.006392, "grad_norm": 2.359375, "learning_rate": 0.0009905318500294609, "loss": 2.3173, "step": 799 }, { "epoch": 0.0064, "grad_norm": 4.375, "learning_rate": 0.0009905074464798022, "loss": 2.2309, "step": 800 }, { "epoch": 0.006408, "grad_norm": 3.140625, "learning_rate": 0.000990483011822706, "loss": 2.0817, "step": 801 }, { "epoch": 0.006416, "grad_norm": 2.3125, "learning_rate": 0.0009904585460597216, "loss": 2.0485, "step": 802 }, { "epoch": 0.006424, "grad_norm": 3.109375, "learning_rate": 0.000990434049192401, "loss": 2.5482, "step": 803 }, { "epoch": 0.006432, "grad_norm": 2.546875, "learning_rate": 0.0009904095212222971, "loss": 2.4535, "step": 804 }, { "epoch": 0.00644, "grad_norm": 2.9375, "learning_rate": 0.0009903849621509663, "loss": 2.8395, "step": 805 }, { "epoch": 0.006448, "grad_norm": 2.1875, "learning_rate": 0.0009903603719799655, "loss": 1.8746, "step": 806 }, { "epoch": 0.006456, "grad_norm": 11.0, "learning_rate": 0.0009903357507108543, "loss": 2.2944, "step": 807 }, { "epoch": 0.006464, "grad_norm": 3.125, "learning_rate": 0.0009903110983451942, "loss": 2.2825, "step": 808 }, { "epoch": 0.006472, "grad_norm": 2.453125, "learning_rate": 0.0009902864148845485, "loss": 2.3946, "step": 809 }, { "epoch": 0.00648, "grad_norm": 2.765625, "learning_rate": 0.000990261700330483, "loss": 2.6699, "step": 810 }, { "epoch": 0.006488, "grad_norm": 2.609375, "learning_rate": 0.0009902369546845647, "loss": 1.9501, "step": 811 }, { "epoch": 0.006496, "grad_norm": 3.0, "learning_rate": 0.000990212177948363, "loss": 2.7386, "step": 812 }, { "epoch": 0.006504, "grad_norm": 2.8125, "learning_rate": 0.0009901873701234493, "loss": 2.4213, "step": 813 }, { "epoch": 0.006512, "grad_norm": 2.609375, "learning_rate": 0.0009901625312113966, "loss": 2.0697, "step": 814 }, { "epoch": 0.00652, 
"grad_norm": 2.8125, "learning_rate": 0.0009901376612137807, "loss": 2.2036, "step": 815 }, { "epoch": 0.006528, "grad_norm": 3.125, "learning_rate": 0.0009901127601321784, "loss": 2.4044, "step": 816 }, { "epoch": 0.006536, "grad_norm": 3.328125, "learning_rate": 0.0009900878279681688, "loss": 2.9324, "step": 817 }, { "epoch": 0.006544, "grad_norm": 3.203125, "learning_rate": 0.0009900628647233336, "loss": 2.3495, "step": 818 }, { "epoch": 0.006552, "grad_norm": 2.546875, "learning_rate": 0.0009900378703992557, "loss": 2.1946, "step": 819 }, { "epoch": 0.00656, "grad_norm": 2.296875, "learning_rate": 0.0009900128449975198, "loss": 2.1172, "step": 820 }, { "epoch": 0.006568, "grad_norm": 2.484375, "learning_rate": 0.0009899877885197134, "loss": 2.2507, "step": 821 }, { "epoch": 0.006576, "grad_norm": 7.625, "learning_rate": 0.0009899627009674256, "loss": 2.2173, "step": 822 }, { "epoch": 0.006584, "grad_norm": 2.59375, "learning_rate": 0.0009899375823422473, "loss": 2.6946, "step": 823 }, { "epoch": 0.006592, "grad_norm": 2.578125, "learning_rate": 0.0009899124326457715, "loss": 2.5422, "step": 824 }, { "epoch": 0.0066, "grad_norm": 2.78125, "learning_rate": 0.0009898872518795932, "loss": 2.3561, "step": 825 }, { "epoch": 0.006608, "grad_norm": 2.6875, "learning_rate": 0.000989862040045309, "loss": 2.589, "step": 826 }, { "epoch": 0.006616, "grad_norm": 2.4375, "learning_rate": 0.0009898367971445186, "loss": 1.9889, "step": 827 }, { "epoch": 0.006624, "grad_norm": 3.15625, "learning_rate": 0.000989811523178822, "loss": 2.9384, "step": 828 }, { "epoch": 0.006632, "grad_norm": 2.9375, "learning_rate": 0.0009897862181498225, "loss": 2.5604, "step": 829 }, { "epoch": 0.00664, "grad_norm": 3.234375, "learning_rate": 0.0009897608820591249, "loss": 2.525, "step": 830 }, { "epoch": 0.006648, "grad_norm": 4.8125, "learning_rate": 0.000989735514908336, "loss": 2.7396, "step": 831 }, { "epoch": 0.006656, "grad_norm": 3.25, "learning_rate": 0.0009897101166990645, "loss": 2.2525, "step": 832 }, { "epoch": 0.006664, "grad_norm": 3.578125, "learning_rate": 0.000989684687432921, "loss": 2.2011, "step": 833 }, { "epoch": 0.006672, "grad_norm": 2.953125, "learning_rate": 0.0009896592271115183, "loss": 2.6506, "step": 834 }, { "epoch": 0.00668, "grad_norm": 4.5, "learning_rate": 0.000989633735736471, "loss": 2.3357, "step": 835 }, { "epoch": 0.006688, "grad_norm": 9.3125, "learning_rate": 0.000989608213309396, "loss": 2.696, "step": 836 }, { "epoch": 0.006696, "grad_norm": 3.34375, "learning_rate": 0.0009895826598319117, "loss": 2.1965, "step": 837 }, { "epoch": 0.006704, "grad_norm": 2.859375, "learning_rate": 0.0009895570753056387, "loss": 2.3956, "step": 838 }, { "epoch": 0.006712, "grad_norm": 4.28125, "learning_rate": 0.0009895314597321994, "loss": 2.0995, "step": 839 }, { "epoch": 0.00672, "grad_norm": 3.6875, "learning_rate": 0.0009895058131132186, "loss": 2.1329, "step": 840 }, { "epoch": 0.006728, "grad_norm": 3.953125, "learning_rate": 0.0009894801354503224, "loss": 2.1301, "step": 841 }, { "epoch": 0.006736, "grad_norm": 2.265625, "learning_rate": 0.0009894544267451397, "loss": 2.1504, "step": 842 }, { "epoch": 0.006744, "grad_norm": 2.734375, "learning_rate": 0.0009894286869993006, "loss": 2.8615, "step": 843 }, { "epoch": 0.006752, "grad_norm": 3.96875, "learning_rate": 0.0009894029162144376, "loss": 2.2885, "step": 844 }, { "epoch": 0.00676, "grad_norm": 3.203125, "learning_rate": 0.000989377114392185, "loss": 2.819, "step": 845 }, { "epoch": 0.006768, "grad_norm": 6.25, "learning_rate": 
0.000989351281534179, "loss": 2.2356, "step": 846 }, { "epoch": 0.006776, "grad_norm": 3.65625, "learning_rate": 0.0009893254176420584, "loss": 1.9986, "step": 847 }, { "epoch": 0.006784, "grad_norm": 8.3125, "learning_rate": 0.0009892995227174628, "loss": 2.5724, "step": 848 }, { "epoch": 0.006792, "grad_norm": 57.0, "learning_rate": 0.0009892735967620347, "loss": 2.5662, "step": 849 }, { "epoch": 0.0068, "grad_norm": 3.375, "learning_rate": 0.0009892476397774185, "loss": 2.1833, "step": 850 }, { "epoch": 0.006808, "grad_norm": 2.859375, "learning_rate": 0.0009892216517652602, "loss": 2.8256, "step": 851 }, { "epoch": 0.006816, "grad_norm": 3.453125, "learning_rate": 0.0009891956327272079, "loss": 2.5291, "step": 852 }, { "epoch": 0.006824, "grad_norm": 2.875, "learning_rate": 0.0009891695826649116, "loss": 2.7706, "step": 853 }, { "epoch": 0.006832, "grad_norm": 3.046875, "learning_rate": 0.0009891435015800237, "loss": 2.3584, "step": 854 }, { "epoch": 0.00684, "grad_norm": 3.84375, "learning_rate": 0.0009891173894741978, "loss": 2.554, "step": 855 }, { "epoch": 0.006848, "grad_norm": 5.875, "learning_rate": 0.00098909124634909, "loss": 2.249, "step": 856 }, { "epoch": 0.006856, "grad_norm": 3.296875, "learning_rate": 0.0009890650722063589, "loss": 2.6463, "step": 857 }, { "epoch": 0.006864, "grad_norm": 3.078125, "learning_rate": 0.0009890388670476635, "loss": 2.6148, "step": 858 }, { "epoch": 0.006872, "grad_norm": 2.765625, "learning_rate": 0.0009890126308746664, "loss": 2.073, "step": 859 }, { "epoch": 0.00688, "grad_norm": 3.171875, "learning_rate": 0.0009889863636890308, "loss": 2.6188, "step": 860 }, { "epoch": 0.006888, "grad_norm": 2.734375, "learning_rate": 0.0009889600654924232, "loss": 2.0904, "step": 861 }, { "epoch": 0.006896, "grad_norm": 5.78125, "learning_rate": 0.0009889337362865111, "loss": 2.8617, "step": 862 }, { "epoch": 0.006904, "grad_norm": 17.0, "learning_rate": 0.0009889073760729642, "loss": 2.3585, "step": 863 }, { "epoch": 0.006912, "grad_norm": 2.9375, "learning_rate": 0.0009888809848534546, "loss": 2.3237, "step": 864 }, { "epoch": 0.00692, "grad_norm": 3.53125, "learning_rate": 0.0009888545626296555, "loss": 2.143, "step": 865 }, { "epoch": 0.006928, "grad_norm": 3.296875, "learning_rate": 0.000988828109403243, "loss": 2.3857, "step": 866 }, { "epoch": 0.006936, "grad_norm": 3.046875, "learning_rate": 0.0009888016251758942, "loss": 2.1326, "step": 867 }, { "epoch": 0.006944, "grad_norm": 5.96875, "learning_rate": 0.0009887751099492894, "loss": 2.2711, "step": 868 }, { "epoch": 0.006952, "grad_norm": 3.65625, "learning_rate": 0.0009887485637251096, "loss": 2.386, "step": 869 }, { "epoch": 0.00696, "grad_norm": 3.0625, "learning_rate": 0.0009887219865050386, "loss": 2.1976, "step": 870 }, { "epoch": 0.006968, "grad_norm": 25.375, "learning_rate": 0.0009886953782907617, "loss": 1.8421, "step": 871 }, { "epoch": 0.006976, "grad_norm": 3.859375, "learning_rate": 0.0009886687390839666, "loss": 2.7707, "step": 872 }, { "epoch": 0.006984, "grad_norm": 3.625, "learning_rate": 0.0009886420688863427, "loss": 2.4722, "step": 873 }, { "epoch": 0.006992, "grad_norm": 4.78125, "learning_rate": 0.0009886153676995813, "loss": 2.7474, "step": 874 }, { "epoch": 0.007, "grad_norm": 3.171875, "learning_rate": 0.0009885886355253757, "loss": 1.9611, "step": 875 }, { "epoch": 0.007008, "grad_norm": 4.46875, "learning_rate": 0.0009885618723654214, "loss": 2.5036, "step": 876 }, { "epoch": 0.007016, "grad_norm": 3.375, "learning_rate": 0.0009885350782214155, "loss": 2.4786, "step": 
877 }, { "epoch": 0.007024, "grad_norm": 3.265625, "learning_rate": 0.0009885082530950576, "loss": 2.5318, "step": 878 }, { "epoch": 0.007032, "grad_norm": 15.75, "learning_rate": 0.0009884813969880484, "loss": 2.124, "step": 879 }, { "epoch": 0.00704, "grad_norm": 4.90625, "learning_rate": 0.0009884545099020914, "loss": 2.3292, "step": 880 }, { "epoch": 0.007048, "grad_norm": 2.828125, "learning_rate": 0.0009884275918388918, "loss": 2.3692, "step": 881 }, { "epoch": 0.007056, "grad_norm": 3.046875, "learning_rate": 0.0009884006428001566, "loss": 2.5219, "step": 882 }, { "epoch": 0.007064, "grad_norm": 3.296875, "learning_rate": 0.0009883736627875948, "loss": 2.6393, "step": 883 }, { "epoch": 0.007072, "grad_norm": 3.78125, "learning_rate": 0.0009883466518029176, "loss": 2.6673, "step": 884 }, { "epoch": 0.00708, "grad_norm": 4.8125, "learning_rate": 0.000988319609847838, "loss": 2.2877, "step": 885 }, { "epoch": 0.007088, "grad_norm": 3.671875, "learning_rate": 0.0009882925369240707, "loss": 2.2581, "step": 886 }, { "epoch": 0.007096, "grad_norm": 2.625, "learning_rate": 0.0009882654330333333, "loss": 2.2624, "step": 887 }, { "epoch": 0.007104, "grad_norm": 3.015625, "learning_rate": 0.0009882382981773437, "loss": 2.6791, "step": 888 }, { "epoch": 0.007112, "grad_norm": 3.109375, "learning_rate": 0.0009882111323578236, "loss": 2.6151, "step": 889 }, { "epoch": 0.00712, "grad_norm": 3.265625, "learning_rate": 0.0009881839355764955, "loss": 2.3607, "step": 890 }, { "epoch": 0.007128, "grad_norm": 3.203125, "learning_rate": 0.0009881567078350844, "loss": 1.8735, "step": 891 }, { "epoch": 0.007136, "grad_norm": 3.453125, "learning_rate": 0.0009881294491353165, "loss": 2.7525, "step": 892 }, { "epoch": 0.007144, "grad_norm": 2.9375, "learning_rate": 0.0009881021594789212, "loss": 1.9622, "step": 893 }, { "epoch": 0.007152, "grad_norm": 3.328125, "learning_rate": 0.0009880748388676288, "loss": 2.5409, "step": 894 }, { "epoch": 0.00716, "grad_norm": 4.3125, "learning_rate": 0.0009880474873031721, "loss": 2.1424, "step": 895 }, { "epoch": 0.007168, "grad_norm": 3.6875, "learning_rate": 0.0009880201047872854, "loss": 2.0962, "step": 896 }, { "epoch": 0.007176, "grad_norm": 2.75, "learning_rate": 0.0009879926913217059, "loss": 2.0898, "step": 897 }, { "epoch": 0.007184, "grad_norm": 4.875, "learning_rate": 0.0009879652469081711, "loss": 2.622, "step": 898 }, { "epoch": 0.007192, "grad_norm": 3.046875, "learning_rate": 0.0009879377715484227, "loss": 2.7861, "step": 899 }, { "epoch": 0.0072, "grad_norm": 5.34375, "learning_rate": 0.0009879102652442023, "loss": 2.4208, "step": 900 }, { "epoch": 0.007208, "grad_norm": 3.671875, "learning_rate": 0.0009878827279972547, "loss": 2.6055, "step": 901 }, { "epoch": 0.007216, "grad_norm": 3.5, "learning_rate": 0.000987855159809326, "loss": 2.6596, "step": 902 }, { "epoch": 0.007224, "grad_norm": 3.4375, "learning_rate": 0.0009878275606821651, "loss": 2.2369, "step": 903 }, { "epoch": 0.007232, "grad_norm": 3.5, "learning_rate": 0.0009877999306175216, "loss": 3.3349, "step": 904 }, { "epoch": 0.00724, "grad_norm": 4.59375, "learning_rate": 0.0009877722696171483, "loss": 2.139, "step": 905 }, { "epoch": 0.007248, "grad_norm": 2.734375, "learning_rate": 0.000987744577682799, "loss": 2.015, "step": 906 }, { "epoch": 0.007256, "grad_norm": 8.625, "learning_rate": 0.0009877168548162302, "loss": 2.4227, "step": 907 }, { "epoch": 0.007264, "grad_norm": 2.953125, "learning_rate": 0.0009876891010192, "loss": 2.8337, "step": 908 }, { "epoch": 0.007272, "grad_norm": 
3.859375, "learning_rate": 0.0009876613162934686, "loss": 2.8596, "step": 909 }, { "epoch": 0.00728, "grad_norm": 4.1875, "learning_rate": 0.0009876335006407978, "loss": 2.976, "step": 910 }, { "epoch": 0.007288, "grad_norm": 3.84375, "learning_rate": 0.000987605654062952, "loss": 2.627, "step": 911 }, { "epoch": 0.007296, "grad_norm": 3.34375, "learning_rate": 0.0009875777765616966, "loss": 2.153, "step": 912 }, { "epoch": 0.007304, "grad_norm": 2.78125, "learning_rate": 0.0009875498681388005, "loss": 2.6163, "step": 913 }, { "epoch": 0.007312, "grad_norm": 2.65625, "learning_rate": 0.0009875219287960326, "loss": 2.5867, "step": 914 }, { "epoch": 0.00732, "grad_norm": 2.75, "learning_rate": 0.0009874939585351654, "loss": 2.4676, "step": 915 }, { "epoch": 0.007328, "grad_norm": 3.09375, "learning_rate": 0.0009874659573579727, "loss": 2.7059, "step": 916 }, { "epoch": 0.007336, "grad_norm": 3.15625, "learning_rate": 0.0009874379252662303, "loss": 1.7653, "step": 917 }, { "epoch": 0.007344, "grad_norm": 2.765625, "learning_rate": 0.0009874098622617157, "loss": 2.6246, "step": 918 }, { "epoch": 0.007352, "grad_norm": 2.859375, "learning_rate": 0.0009873817683462088, "loss": 2.3481, "step": 919 }, { "epoch": 0.00736, "grad_norm": 3.640625, "learning_rate": 0.0009873536435214914, "loss": 2.3222, "step": 920 }, { "epoch": 0.007368, "grad_norm": 3.25, "learning_rate": 0.000987325487789347, "loss": 2.9621, "step": 921 }, { "epoch": 0.007376, "grad_norm": 3.40625, "learning_rate": 0.0009872973011515612, "loss": 2.2863, "step": 922 }, { "epoch": 0.007384, "grad_norm": 2.921875, "learning_rate": 0.0009872690836099215, "loss": 2.1246, "step": 923 }, { "epoch": 0.007392, "grad_norm": 30.125, "learning_rate": 0.0009872408351662176, "loss": 2.492, "step": 924 }, { "epoch": 0.0074, "grad_norm": 12.5625, "learning_rate": 0.000987212555822241, "loss": 2.5275, "step": 925 }, { "epoch": 0.007408, "grad_norm": 3.171875, "learning_rate": 0.0009871842455797851, "loss": 2.0341, "step": 926 }, { "epoch": 0.007416, "grad_norm": 5.59375, "learning_rate": 0.000987155904440645, "loss": 2.759, "step": 927 }, { "epoch": 0.007424, "grad_norm": 4.625, "learning_rate": 0.0009871275324066185, "loss": 2.2378, "step": 928 }, { "epoch": 0.007432, "grad_norm": 4.84375, "learning_rate": 0.0009870991294795046, "loss": 2.557, "step": 929 }, { "epoch": 0.00744, "grad_norm": 3.5, "learning_rate": 0.000987070695661105, "loss": 2.2237, "step": 930 }, { "epoch": 0.007448, "grad_norm": 4.5, "learning_rate": 0.0009870422309532223, "loss": 2.4358, "step": 931 }, { "epoch": 0.007456, "grad_norm": 3.328125, "learning_rate": 0.0009870137353576625, "loss": 2.4295, "step": 932 }, { "epoch": 0.007464, "grad_norm": 2.8125, "learning_rate": 0.000986985208876232, "loss": 2.2917, "step": 933 }, { "epoch": 0.007472, "grad_norm": 4.0625, "learning_rate": 0.0009869566515107403, "loss": 2.484, "step": 934 }, { "epoch": 0.00748, "grad_norm": 4.34375, "learning_rate": 0.0009869280632629983, "loss": 2.1957, "step": 935 }, { "epoch": 0.007488, "grad_norm": 2.859375, "learning_rate": 0.0009868994441348195, "loss": 2.3244, "step": 936 }, { "epoch": 0.007496, "grad_norm": 2.984375, "learning_rate": 0.0009868707941280185, "loss": 2.3941, "step": 937 }, { "epoch": 0.007504, "grad_norm": 3.4375, "learning_rate": 0.000986842113244412, "loss": 2.503, "step": 938 }, { "epoch": 0.007512, "grad_norm": 2.6875, "learning_rate": 0.0009868134014858194, "loss": 1.8781, "step": 939 }, { "epoch": 0.00752, "grad_norm": 3.5, "learning_rate": 0.0009867846588540613, "loss": 
2.0957, "step": 940 }, { "epoch": 0.007528, "grad_norm": 2.90625, "learning_rate": 0.0009867558853509607, "loss": 1.8741, "step": 941 }, { "epoch": 0.007536, "grad_norm": 3.171875, "learning_rate": 0.0009867270809783423, "loss": 2.2781, "step": 942 }, { "epoch": 0.007544, "grad_norm": 3.46875, "learning_rate": 0.0009866982457380326, "loss": 2.3588, "step": 943 }, { "epoch": 0.007552, "grad_norm": 2.90625, "learning_rate": 0.0009866693796318608, "loss": 2.4459, "step": 944 }, { "epoch": 0.00756, "grad_norm": 2.96875, "learning_rate": 0.0009866404826616574, "loss": 2.7947, "step": 945 }, { "epoch": 0.007568, "grad_norm": 3.546875, "learning_rate": 0.0009866115548292545, "loss": 2.1662, "step": 946 }, { "epoch": 0.007576, "grad_norm": 3.015625, "learning_rate": 0.0009865825961364874, "loss": 2.4436, "step": 947 }, { "epoch": 0.007584, "grad_norm": 2.5625, "learning_rate": 0.000986553606585192, "loss": 2.0949, "step": 948 }, { "epoch": 0.007592, "grad_norm": 3.109375, "learning_rate": 0.0009865245861772072, "loss": 2.4112, "step": 949 }, { "epoch": 0.0076, "grad_norm": 2.328125, "learning_rate": 0.0009864955349143734, "loss": 1.5918, "step": 950 }, { "epoch": 0.007608, "grad_norm": 2.453125, "learning_rate": 0.000986466452798533, "loss": 2.0212, "step": 951 }, { "epoch": 0.007616, "grad_norm": 2.953125, "learning_rate": 0.00098643733983153, "loss": 2.292, "step": 952 }, { "epoch": 0.007624, "grad_norm": 3.890625, "learning_rate": 0.0009864081960152113, "loss": 2.5087, "step": 953 }, { "epoch": 0.007632, "grad_norm": 3.234375, "learning_rate": 0.0009863790213514247, "loss": 2.0412, "step": 954 }, { "epoch": 0.00764, "grad_norm": 3.28125, "learning_rate": 0.0009863498158420205, "loss": 2.513, "step": 955 }, { "epoch": 0.007648, "grad_norm": 3.40625, "learning_rate": 0.0009863205794888512, "loss": 2.4871, "step": 956 }, { "epoch": 0.007656, "grad_norm": 2.984375, "learning_rate": 0.0009862913122937705, "loss": 2.5282, "step": 957 }, { "epoch": 0.007664, "grad_norm": 2.765625, "learning_rate": 0.0009862620142586348, "loss": 2.3203, "step": 958 }, { "epoch": 0.007672, "grad_norm": 2.328125, "learning_rate": 0.000986232685385302, "loss": 2.1069, "step": 959 }, { "epoch": 0.00768, "grad_norm": 2.625, "learning_rate": 0.0009862033256756322, "loss": 2.6188, "step": 960 }, { "epoch": 0.007688, "grad_norm": 2.4375, "learning_rate": 0.0009861739351314871, "loss": 1.9506, "step": 961 }, { "epoch": 0.007696, "grad_norm": 2.609375, "learning_rate": 0.000986144513754731, "loss": 2.6677, "step": 962 }, { "epoch": 0.007704, "grad_norm": 3.53125, "learning_rate": 0.0009861150615472296, "loss": 2.7367, "step": 963 }, { "epoch": 0.007712, "grad_norm": 2.625, "learning_rate": 0.0009860855785108508, "loss": 2.5445, "step": 964 }, { "epoch": 0.00772, "grad_norm": 2.78125, "learning_rate": 0.0009860560646474642, "loss": 2.1327, "step": 965 }, { "epoch": 0.007728, "grad_norm": 3.078125, "learning_rate": 0.0009860265199589417, "loss": 2.6643, "step": 966 }, { "epoch": 0.007736, "grad_norm": 3.015625, "learning_rate": 0.0009859969444471568, "loss": 2.1202, "step": 967 }, { "epoch": 0.007744, "grad_norm": 3.40625, "learning_rate": 0.0009859673381139855, "loss": 2.252, "step": 968 }, { "epoch": 0.007752, "grad_norm": 3.03125, "learning_rate": 0.000985937700961305, "loss": 2.1694, "step": 969 }, { "epoch": 0.00776, "grad_norm": 3.46875, "learning_rate": 0.000985908032990995, "loss": 2.596, "step": 970 }, { "epoch": 0.007768, "grad_norm": 2.78125, "learning_rate": 0.000985878334204937, "loss": 2.4959, "step": 971 }, { 
"epoch": 0.007776, "grad_norm": 2.75, "learning_rate": 0.0009858486046050148, "loss": 1.8638, "step": 972 }, { "epoch": 0.007784, "grad_norm": 2.328125, "learning_rate": 0.0009858188441931133, "loss": 2.1945, "step": 973 }, { "epoch": 0.007792, "grad_norm": 2.984375, "learning_rate": 0.0009857890529711203, "loss": 1.9514, "step": 974 }, { "epoch": 0.0078, "grad_norm": 2.84375, "learning_rate": 0.0009857592309409247, "loss": 2.4108, "step": 975 }, { "epoch": 0.007808, "grad_norm": 3.171875, "learning_rate": 0.000985729378104418, "loss": 2.3765, "step": 976 }, { "epoch": 0.007816, "grad_norm": 3.140625, "learning_rate": 0.0009856994944634936, "loss": 2.7232, "step": 977 }, { "epoch": 0.007824, "grad_norm": 2.4375, "learning_rate": 0.0009856695800200463, "loss": 2.0792, "step": 978 }, { "epoch": 0.007832, "grad_norm": 3.15625, "learning_rate": 0.0009856396347759737, "loss": 2.1162, "step": 979 }, { "epoch": 0.00784, "grad_norm": 2.578125, "learning_rate": 0.0009856096587331747, "loss": 2.4491, "step": 980 }, { "epoch": 0.007848, "grad_norm": 2.53125, "learning_rate": 0.00098557965189355, "loss": 2.4132, "step": 981 }, { "epoch": 0.007856, "grad_norm": 3.1875, "learning_rate": 0.000985549614259003, "loss": 2.3162, "step": 982 }, { "epoch": 0.007864, "grad_norm": 2.625, "learning_rate": 0.0009855195458314387, "loss": 1.9616, "step": 983 }, { "epoch": 0.007872, "grad_norm": 2.859375, "learning_rate": 0.000985489446612764, "loss": 2.4057, "step": 984 }, { "epoch": 0.00788, "grad_norm": 2.875, "learning_rate": 0.0009854593166048872, "loss": 2.29, "step": 985 }, { "epoch": 0.007888, "grad_norm": 3.0625, "learning_rate": 0.0009854291558097198, "loss": 2.5381, "step": 986 }, { "epoch": 0.007896, "grad_norm": 2.5625, "learning_rate": 0.0009853989642291743, "loss": 1.8944, "step": 987 }, { "epoch": 0.007904, "grad_norm": 2.5625, "learning_rate": 0.0009853687418651656, "loss": 2.2163, "step": 988 }, { "epoch": 0.007912, "grad_norm": 2.625, "learning_rate": 0.00098533848871961, "loss": 2.0262, "step": 989 }, { "epoch": 0.00792, "grad_norm": 2.1875, "learning_rate": 0.0009853082047944263, "loss": 1.9078, "step": 990 }, { "epoch": 0.007928, "grad_norm": 3.609375, "learning_rate": 0.0009852778900915349, "loss": 2.2277, "step": 991 }, { "epoch": 0.007936, "grad_norm": 16.625, "learning_rate": 0.0009852475446128588, "loss": 2.1486, "step": 992 }, { "epoch": 0.007944, "grad_norm": 3.015625, "learning_rate": 0.000985217168360322, "loss": 2.4675, "step": 993 }, { "epoch": 0.007952, "grad_norm": 2.90625, "learning_rate": 0.0009851867613358512, "loss": 2.3384, "step": 994 }, { "epoch": 0.00796, "grad_norm": 2.3125, "learning_rate": 0.0009851563235413747, "loss": 2.0297, "step": 995 }, { "epoch": 0.007968, "grad_norm": 2.765625, "learning_rate": 0.0009851258549788229, "loss": 2.691, "step": 996 }, { "epoch": 0.007976, "grad_norm": 2.640625, "learning_rate": 0.0009850953556501277, "loss": 2.3908, "step": 997 }, { "epoch": 0.007984, "grad_norm": 3.09375, "learning_rate": 0.0009850648255572238, "loss": 2.518, "step": 998 }, { "epoch": 0.007992, "grad_norm": 2.9375, "learning_rate": 0.0009850342647020475, "loss": 1.9422, "step": 999 }, { "epoch": 0.008, "grad_norm": 2.703125, "learning_rate": 0.0009850036730865363, "loss": 1.8628, "step": 1000 }, { "epoch": 0.008008, "grad_norm": 4.34375, "learning_rate": 0.0009849730507126306, "loss": 2.666, "step": 1001 }, { "epoch": 0.008016, "grad_norm": 3.625, "learning_rate": 0.0009849423975822728, "loss": 2.7322, "step": 1002 }, { "epoch": 0.008024, "grad_norm": 3.09375, 
"learning_rate": 0.0009849117136974065, "loss": 3.0342, "step": 1003 }, { "epoch": 0.008032, "grad_norm": 4.5625, "learning_rate": 0.0009848809990599776, "loss": 2.0444, "step": 1004 }, { "epoch": 0.00804, "grad_norm": 2.390625, "learning_rate": 0.000984850253671934, "loss": 1.9374, "step": 1005 }, { "epoch": 0.008048, "grad_norm": 3.453125, "learning_rate": 0.0009848194775352256, "loss": 2.7859, "step": 1006 }, { "epoch": 0.008056, "grad_norm": 3.375, "learning_rate": 0.0009847886706518046, "loss": 2.4402, "step": 1007 }, { "epoch": 0.008064, "grad_norm": 3.0, "learning_rate": 0.000984757833023624, "loss": 1.9096, "step": 1008 }, { "epoch": 0.008072, "grad_norm": 3.203125, "learning_rate": 0.00098472696465264, "loss": 2.5763, "step": 1009 }, { "epoch": 0.00808, "grad_norm": 3.546875, "learning_rate": 0.0009846960655408102, "loss": 2.4549, "step": 1010 }, { "epoch": 0.008088, "grad_norm": 4.8125, "learning_rate": 0.0009846651356900937, "loss": 2.7397, "step": 1011 }, { "epoch": 0.008096, "grad_norm": 3.84375, "learning_rate": 0.0009846341751024528, "loss": 2.7059, "step": 1012 }, { "epoch": 0.008104, "grad_norm": 2.703125, "learning_rate": 0.0009846031837798504, "loss": 2.5009, "step": 1013 }, { "epoch": 0.008112, "grad_norm": 2.8125, "learning_rate": 0.0009845721617242522, "loss": 2.4021, "step": 1014 }, { "epoch": 0.00812, "grad_norm": 2.765625, "learning_rate": 0.0009845411089376254, "loss": 2.3926, "step": 1015 }, { "epoch": 0.008128, "grad_norm": 3.34375, "learning_rate": 0.0009845100254219396, "loss": 2.559, "step": 1016 }, { "epoch": 0.008136, "grad_norm": 2.140625, "learning_rate": 0.000984478911179166, "loss": 1.9164, "step": 1017 }, { "epoch": 0.008144, "grad_norm": 3.625, "learning_rate": 0.0009844477662112774, "loss": 3.1538, "step": 1018 }, { "epoch": 0.008152, "grad_norm": 3.203125, "learning_rate": 0.0009844165905202498, "loss": 2.6124, "step": 1019 }, { "epoch": 0.00816, "grad_norm": 2.734375, "learning_rate": 0.0009843853841080595, "loss": 2.2735, "step": 1020 }, { "epoch": 0.008168, "grad_norm": 3.234375, "learning_rate": 0.000984354146976686, "loss": 2.3599, "step": 1021 }, { "epoch": 0.008176, "grad_norm": 3.828125, "learning_rate": 0.0009843228791281105, "loss": 2.5344, "step": 1022 }, { "epoch": 0.008184, "grad_norm": 2.859375, "learning_rate": 0.0009842915805643156, "loss": 2.8577, "step": 1023 }, { "epoch": 0.008192, "grad_norm": 2.796875, "learning_rate": 0.0009842602512872864, "loss": 2.1852, "step": 1024 }, { "epoch": 0.0082, "grad_norm": 3.9375, "learning_rate": 0.0009842288912990096, "loss": 2.283, "step": 1025 }, { "epoch": 0.008208, "grad_norm": 2.625, "learning_rate": 0.0009841975006014744, "loss": 2.307, "step": 1026 }, { "epoch": 0.008216, "grad_norm": 2.640625, "learning_rate": 0.000984166079196671, "loss": 2.3357, "step": 1027 }, { "epoch": 0.008224, "grad_norm": 2.859375, "learning_rate": 0.0009841346270865925, "loss": 2.0502, "step": 1028 }, { "epoch": 0.008232, "grad_norm": 4.40625, "learning_rate": 0.0009841031442732336, "loss": 2.5893, "step": 1029 }, { "epoch": 0.00824, "grad_norm": 2.9375, "learning_rate": 0.0009840716307585907, "loss": 1.9426, "step": 1030 }, { "epoch": 0.008248, "grad_norm": 3.5625, "learning_rate": 0.0009840400865446625, "loss": 2.2325, "step": 1031 }, { "epoch": 0.008256, "grad_norm": 3.359375, "learning_rate": 0.0009840085116334493, "loss": 2.4399, "step": 1032 }, { "epoch": 0.008264, "grad_norm": 2.84375, "learning_rate": 0.0009839769060269539, "loss": 2.4184, "step": 1033 }, { "epoch": 0.008272, "grad_norm": 3.84375, 
"learning_rate": 0.0009839452697271802, "loss": 3.0402, "step": 1034 }, { "epoch": 0.00828, "grad_norm": 3.140625, "learning_rate": 0.000983913602736135, "loss": 2.3744, "step": 1035 }, { "epoch": 0.008288, "grad_norm": 2.171875, "learning_rate": 0.0009838819050558263, "loss": 1.7321, "step": 1036 }, { "epoch": 0.008296, "grad_norm": 3.265625, "learning_rate": 0.0009838501766882646, "loss": 2.4289, "step": 1037 }, { "epoch": 0.008304, "grad_norm": 3.1875, "learning_rate": 0.0009838184176354616, "loss": 2.2972, "step": 1038 }, { "epoch": 0.008312, "grad_norm": 3.3125, "learning_rate": 0.000983786627899432, "loss": 2.4402, "step": 1039 }, { "epoch": 0.00832, "grad_norm": 2.765625, "learning_rate": 0.0009837548074821917, "loss": 2.4661, "step": 1040 }, { "epoch": 0.008328, "grad_norm": 3.3125, "learning_rate": 0.0009837229563857582, "loss": 2.7305, "step": 1041 }, { "epoch": 0.008336, "grad_norm": 5.15625, "learning_rate": 0.000983691074612152, "loss": 2.2036, "step": 1042 }, { "epoch": 0.008344, "grad_norm": 2.953125, "learning_rate": 0.0009836591621633951, "loss": 2.0279, "step": 1043 }, { "epoch": 0.008352, "grad_norm": 3.375, "learning_rate": 0.000983627219041511, "loss": 2.46, "step": 1044 }, { "epoch": 0.00836, "grad_norm": 2.4375, "learning_rate": 0.0009835952452485258, "loss": 1.5747, "step": 1045 }, { "epoch": 0.008368, "grad_norm": 3.015625, "learning_rate": 0.0009835632407864667, "loss": 2.6689, "step": 1046 }, { "epoch": 0.008376, "grad_norm": 2.703125, "learning_rate": 0.0009835312056573641, "loss": 2.4178, "step": 1047 }, { "epoch": 0.008384, "grad_norm": 2.84375, "learning_rate": 0.0009834991398632491, "loss": 2.3673, "step": 1048 }, { "epoch": 0.008392, "grad_norm": 3.203125, "learning_rate": 0.0009834670434061556, "loss": 3.0684, "step": 1049 }, { "epoch": 0.0084, "grad_norm": 2.453125, "learning_rate": 0.0009834349162881188, "loss": 2.1458, "step": 1050 }, { "epoch": 0.008408, "grad_norm": 2.5, "learning_rate": 0.0009834027585111766, "loss": 1.8609, "step": 1051 }, { "epoch": 0.008416, "grad_norm": 2.734375, "learning_rate": 0.000983370570077368, "loss": 2.0292, "step": 1052 }, { "epoch": 0.008424, "grad_norm": 3.515625, "learning_rate": 0.0009833383509887347, "loss": 2.4118, "step": 1053 }, { "epoch": 0.008432, "grad_norm": 2.71875, "learning_rate": 0.0009833061012473198, "loss": 2.2706, "step": 1054 }, { "epoch": 0.00844, "grad_norm": 2.25, "learning_rate": 0.0009832738208551683, "loss": 1.7236, "step": 1055 }, { "epoch": 0.008448, "grad_norm": 2.390625, "learning_rate": 0.0009832415098143278, "loss": 1.9321, "step": 1056 }, { "epoch": 0.008456, "grad_norm": 3.640625, "learning_rate": 0.0009832091681268474, "loss": 2.4944, "step": 1057 }, { "epoch": 0.008464, "grad_norm": 2.65625, "learning_rate": 0.0009831767957947778, "loss": 2.3575, "step": 1058 }, { "epoch": 0.008472, "grad_norm": 3.28125, "learning_rate": 0.0009831443928201727, "loss": 2.5896, "step": 1059 }, { "epoch": 0.00848, "grad_norm": 6.09375, "learning_rate": 0.0009831119592050863, "loss": 1.95, "step": 1060 }, { "epoch": 0.008488, "grad_norm": 3.046875, "learning_rate": 0.0009830794949515758, "loss": 2.4578, "step": 1061 }, { "epoch": 0.008496, "grad_norm": 2.8125, "learning_rate": 0.0009830470000617003, "loss": 2.2318, "step": 1062 }, { "epoch": 0.008504, "grad_norm": 2.546875, "learning_rate": 0.0009830144745375203, "loss": 2.041, "step": 1063 }, { "epoch": 0.008512, "grad_norm": 5.5, "learning_rate": 0.0009829819183810986, "loss": 1.8644, "step": 1064 }, { "epoch": 0.00852, "grad_norm": 3.609375, 
"learning_rate": 0.0009829493315944998, "loss": 2.6589, "step": 1065 }, { "epoch": 0.008528, "grad_norm": 3.375, "learning_rate": 0.0009829167141797907, "loss": 1.5465, "step": 1066 }, { "epoch": 0.008536, "grad_norm": 2.71875, "learning_rate": 0.0009828840661390396, "loss": 1.8352, "step": 1067 }, { "epoch": 0.008544, "grad_norm": 3.71875, "learning_rate": 0.0009828513874743175, "loss": 2.7023, "step": 1068 }, { "epoch": 0.008552, "grad_norm": 4.03125, "learning_rate": 0.0009828186781876962, "loss": 2.3296, "step": 1069 }, { "epoch": 0.00856, "grad_norm": 2.890625, "learning_rate": 0.0009827859382812506, "loss": 2.8014, "step": 1070 }, { "epoch": 0.008568, "grad_norm": 2.796875, "learning_rate": 0.0009827531677570568, "loss": 2.1331, "step": 1071 }, { "epoch": 0.008576, "grad_norm": 3.296875, "learning_rate": 0.0009827203666171928, "loss": 2.1787, "step": 1072 }, { "epoch": 0.008584, "grad_norm": 2.671875, "learning_rate": 0.0009826875348637393, "loss": 1.9714, "step": 1073 }, { "epoch": 0.008592, "grad_norm": 2.3125, "learning_rate": 0.0009826546724987784, "loss": 2.1727, "step": 1074 }, { "epoch": 0.0086, "grad_norm": 9.625, "learning_rate": 0.000982621779524394, "loss": 2.3304, "step": 1075 }, { "epoch": 0.008608, "grad_norm": 2.75, "learning_rate": 0.0009825888559426722, "loss": 2.0995, "step": 1076 }, { "epoch": 0.008616, "grad_norm": 3.125, "learning_rate": 0.0009825559017557007, "loss": 2.3621, "step": 1077 }, { "epoch": 0.008624, "grad_norm": 3.25, "learning_rate": 0.0009825229169655699, "loss": 2.5175, "step": 1078 }, { "epoch": 0.008632, "grad_norm": 5.25, "learning_rate": 0.0009824899015743714, "loss": 2.5876, "step": 1079 }, { "epoch": 0.00864, "grad_norm": 3.078125, "learning_rate": 0.000982456855584199, "loss": 2.6256, "step": 1080 }, { "epoch": 0.008648, "grad_norm": 2.546875, "learning_rate": 0.0009824237789971485, "loss": 2.2485, "step": 1081 }, { "epoch": 0.008656, "grad_norm": 2.625, "learning_rate": 0.0009823906718153177, "loss": 2.2748, "step": 1082 }, { "epoch": 0.008664, "grad_norm": 2.96875, "learning_rate": 0.000982357534040806, "loss": 2.1666, "step": 1083 }, { "epoch": 0.008672, "grad_norm": 2.75, "learning_rate": 0.000982324365675715, "loss": 2.3536, "step": 1084 }, { "epoch": 0.00868, "grad_norm": 4.65625, "learning_rate": 0.0009822911667221484, "loss": 1.9162, "step": 1085 }, { "epoch": 0.008688, "grad_norm": 4.1875, "learning_rate": 0.0009822579371822114, "loss": 2.2568, "step": 1086 }, { "epoch": 0.008696, "grad_norm": 3.234375, "learning_rate": 0.0009822246770580113, "loss": 2.4055, "step": 1087 }, { "epoch": 0.008704, "grad_norm": 6.1875, "learning_rate": 0.0009821913863516578, "loss": 2.5932, "step": 1088 }, { "epoch": 0.008712, "grad_norm": 3.609375, "learning_rate": 0.000982158065065262, "loss": 2.4764, "step": 1089 }, { "epoch": 0.00872, "grad_norm": 3.21875, "learning_rate": 0.0009821247132009367, "loss": 1.9833, "step": 1090 }, { "epoch": 0.008728, "grad_norm": 16.25, "learning_rate": 0.0009820913307607977, "loss": 2.1663, "step": 1091 }, { "epoch": 0.008736, "grad_norm": 3.28125, "learning_rate": 0.0009820579177469615, "loss": 3.1541, "step": 1092 }, { "epoch": 0.008744, "grad_norm": 3.171875, "learning_rate": 0.0009820244741615477, "loss": 2.456, "step": 1093 }, { "epoch": 0.008752, "grad_norm": 2.578125, "learning_rate": 0.0009819910000066767, "loss": 2.4852, "step": 1094 }, { "epoch": 0.00876, "grad_norm": 6.625, "learning_rate": 0.0009819574952844717, "loss": 1.9264, "step": 1095 }, { "epoch": 0.008768, "grad_norm": 2.65625, "learning_rate": 
0.0009819239599970573, "loss": 2.2882, "step": 1096 }, { "epoch": 0.008776, "grad_norm": 3.046875, "learning_rate": 0.0009818903941465607, "loss": 2.4658, "step": 1097 }, { "epoch": 0.008784, "grad_norm": 3.375, "learning_rate": 0.0009818567977351103, "loss": 2.1053, "step": 1098 }, { "epoch": 0.008792, "grad_norm": 3.203125, "learning_rate": 0.0009818231707648367, "loss": 2.5472, "step": 1099 }, { "epoch": 0.0088, "grad_norm": 3.828125, "learning_rate": 0.0009817895132378724, "loss": 2.774, "step": 1100 }, { "epoch": 0.008808, "grad_norm": 3.4375, "learning_rate": 0.0009817558251563525, "loss": 2.5896, "step": 1101 }, { "epoch": 0.008816, "grad_norm": 3.09375, "learning_rate": 0.0009817221065224126, "loss": 2.4196, "step": 1102 }, { "epoch": 0.008824, "grad_norm": 3.4375, "learning_rate": 0.000981688357338192, "loss": 2.2919, "step": 1103 }, { "epoch": 0.008832, "grad_norm": 2.9375, "learning_rate": 0.0009816545776058303, "loss": 2.5547, "step": 1104 }, { "epoch": 0.00884, "grad_norm": 3.234375, "learning_rate": 0.0009816207673274701, "loss": 2.509, "step": 1105 }, { "epoch": 0.008848, "grad_norm": 3.375, "learning_rate": 0.0009815869265052555, "loss": 2.7982, "step": 1106 }, { "epoch": 0.008856, "grad_norm": 3.078125, "learning_rate": 0.000981553055141333, "loss": 2.2748, "step": 1107 }, { "epoch": 0.008864, "grad_norm": 3.703125, "learning_rate": 0.00098151915323785, "loss": 2.4906, "step": 1108 }, { "epoch": 0.008872, "grad_norm": 3.0, "learning_rate": 0.0009814852207969571, "loss": 2.9099, "step": 1109 }, { "epoch": 0.00888, "grad_norm": 3.0625, "learning_rate": 0.0009814512578208063, "loss": 2.236, "step": 1110 }, { "epoch": 0.008888, "grad_norm": 3.328125, "learning_rate": 0.000981417264311551, "loss": 2.43, "step": 1111 }, { "epoch": 0.008896, "grad_norm": 2.796875, "learning_rate": 0.0009813832402713475, "loss": 2.3832, "step": 1112 }, { "epoch": 0.008904, "grad_norm": 3.921875, "learning_rate": 0.0009813491857023533, "loss": 2.4108, "step": 1113 }, { "epoch": 0.008912, "grad_norm": 3.71875, "learning_rate": 0.0009813151006067282, "loss": 2.6897, "step": 1114 }, { "epoch": 0.00892, "grad_norm": 4.125, "learning_rate": 0.0009812809849866337, "loss": 2.2368, "step": 1115 }, { "epoch": 0.008928, "grad_norm": 3.53125, "learning_rate": 0.0009812468388442335, "loss": 2.3624, "step": 1116 }, { "epoch": 0.008936, "grad_norm": 2.796875, "learning_rate": 0.0009812126621816933, "loss": 2.479, "step": 1117 }, { "epoch": 0.008944, "grad_norm": 3.21875, "learning_rate": 0.0009811784550011803, "loss": 2.3829, "step": 1118 }, { "epoch": 0.008952, "grad_norm": 22.75, "learning_rate": 0.0009811442173048638, "loss": 2.4674, "step": 1119 }, { "epoch": 0.00896, "grad_norm": 3.625, "learning_rate": 0.000981109949094915, "loss": 3.1102, "step": 1120 }, { "epoch": 0.008968, "grad_norm": 2.921875, "learning_rate": 0.000981075650373508, "loss": 2.306, "step": 1121 }, { "epoch": 0.008976, "grad_norm": 5.03125, "learning_rate": 0.000981041321142817, "loss": 2.4479, "step": 1122 }, { "epoch": 0.008984, "grad_norm": 3.796875, "learning_rate": 0.0009810069614050196, "loss": 2.273, "step": 1123 }, { "epoch": 0.008992, "grad_norm": 4.53125, "learning_rate": 0.0009809725711622946, "loss": 2.3488, "step": 1124 }, { "epoch": 0.009, "grad_norm": 70.5, "learning_rate": 0.0009809381504168233, "loss": 2.1604, "step": 1125 }, { "epoch": 0.009008, "grad_norm": 2.890625, "learning_rate": 0.0009809036991707885, "loss": 2.2615, "step": 1126 }, { "epoch": 0.009016, "grad_norm": 3.578125, "learning_rate": 0.000980869217426375, 
"loss": 2.6198, "step": 1127 }, { "epoch": 0.009024, "grad_norm": 4.875, "learning_rate": 0.0009808347051857697, "loss": 2.0359, "step": 1128 }, { "epoch": 0.009032, "grad_norm": 32.0, "learning_rate": 0.000980800162451161, "loss": 2.1418, "step": 1129 }, { "epoch": 0.00904, "grad_norm": 4.46875, "learning_rate": 0.0009807655892247402, "loss": 2.6599, "step": 1130 }, { "epoch": 0.009048, "grad_norm": 2.9375, "learning_rate": 0.0009807309855086994, "loss": 2.5127, "step": 1131 }, { "epoch": 0.009056, "grad_norm": 3.15625, "learning_rate": 0.0009806963513052333, "loss": 3.3395, "step": 1132 }, { "epoch": 0.009064, "grad_norm": 3.234375, "learning_rate": 0.000980661686616538, "loss": 2.9129, "step": 1133 }, { "epoch": 0.009072, "grad_norm": 3.0625, "learning_rate": 0.0009806269914448124, "loss": 2.4544, "step": 1134 }, { "epoch": 0.00908, "grad_norm": 3.1875, "learning_rate": 0.0009805922657922567, "loss": 2.5791, "step": 1135 }, { "epoch": 0.009088, "grad_norm": 2.375, "learning_rate": 0.0009805575096610728, "loss": 2.002, "step": 1136 }, { "epoch": 0.009096, "grad_norm": 3.34375, "learning_rate": 0.0009805227230534656, "loss": 3.0077, "step": 1137 }, { "epoch": 0.009104, "grad_norm": 2.375, "learning_rate": 0.0009804879059716405, "loss": 1.9527, "step": 1138 }, { "epoch": 0.009112, "grad_norm": 3.875, "learning_rate": 0.0009804530584178062, "loss": 2.329, "step": 1139 }, { "epoch": 0.00912, "grad_norm": 2.453125, "learning_rate": 0.000980418180394172, "loss": 2.2127, "step": 1140 }, { "epoch": 0.009128, "grad_norm": 3.453125, "learning_rate": 0.0009803832719029507, "loss": 2.6774, "step": 1141 }, { "epoch": 0.009136, "grad_norm": 2.796875, "learning_rate": 0.0009803483329463552, "loss": 2.5821, "step": 1142 }, { "epoch": 0.009144, "grad_norm": 40.75, "learning_rate": 0.0009803133635266019, "loss": 2.631, "step": 1143 }, { "epoch": 0.009152, "grad_norm": 2.328125, "learning_rate": 0.0009802783636459083, "loss": 1.8657, "step": 1144 }, { "epoch": 0.00916, "grad_norm": 3.171875, "learning_rate": 0.0009802433333064944, "loss": 2.5771, "step": 1145 }, { "epoch": 0.009168, "grad_norm": 2.609375, "learning_rate": 0.0009802082725105811, "loss": 2.3805, "step": 1146 }, { "epoch": 0.009176, "grad_norm": 2.9375, "learning_rate": 0.0009801731812603927, "loss": 2.475, "step": 1147 }, { "epoch": 0.009184, "grad_norm": 4.03125, "learning_rate": 0.000980138059558154, "loss": 2.2304, "step": 1148 }, { "epoch": 0.009192, "grad_norm": 3.421875, "learning_rate": 0.0009801029074060928, "loss": 2.2151, "step": 1149 }, { "epoch": 0.0092, "grad_norm": 3.328125, "learning_rate": 0.0009800677248064382, "loss": 2.3945, "step": 1150 }, { "epoch": 0.009208, "grad_norm": 3.515625, "learning_rate": 0.0009800325117614215, "loss": 2.3751, "step": 1151 }, { "epoch": 0.009216, "grad_norm": 2.953125, "learning_rate": 0.0009799972682732758, "loss": 2.4393, "step": 1152 }, { "epoch": 0.009224, "grad_norm": 2.796875, "learning_rate": 0.0009799619943442364, "loss": 2.4102, "step": 1153 }, { "epoch": 0.009232, "grad_norm": 4.40625, "learning_rate": 0.0009799266899765402, "loss": 2.4199, "step": 1154 }, { "epoch": 0.00924, "grad_norm": 3.46875, "learning_rate": 0.0009798913551724263, "loss": 2.2432, "step": 1155 }, { "epoch": 0.009248, "grad_norm": 3.59375, "learning_rate": 0.0009798559899341353, "loss": 2.8464, "step": 1156 }, { "epoch": 0.009256, "grad_norm": 3.5625, "learning_rate": 0.0009798205942639102, "loss": 1.9444, "step": 1157 }, { "epoch": 0.009264, "grad_norm": 3.390625, "learning_rate": 0.0009797851681639957, "loss": 
2.3227, "step": 1158 }, { "epoch": 0.009272, "grad_norm": 3.25, "learning_rate": 0.0009797497116366388, "loss": 2.255, "step": 1159 }, { "epoch": 0.00928, "grad_norm": 5.96875, "learning_rate": 0.0009797142246840875, "loss": 3.0305, "step": 1160 }, { "epoch": 0.009288, "grad_norm": 2.671875, "learning_rate": 0.0009796787073085929, "loss": 2.1952, "step": 1161 }, { "epoch": 0.009296, "grad_norm": 2.53125, "learning_rate": 0.000979643159512407, "loss": 2.4221, "step": 1162 }, { "epoch": 0.009304, "grad_norm": 31.0, "learning_rate": 0.0009796075812977848, "loss": 2.0702, "step": 1163 }, { "epoch": 0.009312, "grad_norm": 326.0, "learning_rate": 0.0009795719726669824, "loss": 2.4645, "step": 1164 }, { "epoch": 0.00932, "grad_norm": 2.921875, "learning_rate": 0.0009795363336222577, "loss": 2.3498, "step": 1165 }, { "epoch": 0.009328, "grad_norm": 3.53125, "learning_rate": 0.000979500664165871, "loss": 2.2098, "step": 1166 }, { "epoch": 0.009336, "grad_norm": 3.421875, "learning_rate": 0.0009794649643000846, "loss": 2.3361, "step": 1167 }, { "epoch": 0.009344, "grad_norm": 3.328125, "learning_rate": 0.0009794292340271626, "loss": 2.2592, "step": 1168 }, { "epoch": 0.009352, "grad_norm": 3.09375, "learning_rate": 0.000979393473349371, "loss": 2.4585, "step": 1169 }, { "epoch": 0.00936, "grad_norm": 3.640625, "learning_rate": 0.0009793576822689776, "loss": 2.2381, "step": 1170 }, { "epoch": 0.009368, "grad_norm": 3.203125, "learning_rate": 0.000979321860788252, "loss": 2.4181, "step": 1171 }, { "epoch": 0.009376, "grad_norm": 2.6875, "learning_rate": 0.0009792860089094665, "loss": 2.2664, "step": 1172 }, { "epoch": 0.009384, "grad_norm": 4.28125, "learning_rate": 0.000979250126634894, "loss": 2.2472, "step": 1173 }, { "epoch": 0.009392, "grad_norm": 2.453125, "learning_rate": 0.0009792142139668108, "loss": 1.8541, "step": 1174 }, { "epoch": 0.0094, "grad_norm": 4.1875, "learning_rate": 0.0009791782709074944, "loss": 2.5951, "step": 1175 }, { "epoch": 0.009408, "grad_norm": 2.5, "learning_rate": 0.0009791422974592238, "loss": 1.8411, "step": 1176 }, { "epoch": 0.009416, "grad_norm": 3.25, "learning_rate": 0.0009791062936242807, "loss": 2.7685, "step": 1177 }, { "epoch": 0.009424, "grad_norm": 2.734375, "learning_rate": 0.0009790702594049485, "loss": 2.0299, "step": 1178 }, { "epoch": 0.009432, "grad_norm": 2.875, "learning_rate": 0.0009790341948035125, "loss": 2.6598, "step": 1179 }, { "epoch": 0.00944, "grad_norm": 3.34375, "learning_rate": 0.0009789980998222594, "loss": 2.0269, "step": 1180 }, { "epoch": 0.009448, "grad_norm": 2.65625, "learning_rate": 0.0009789619744634789, "loss": 1.6833, "step": 1181 }, { "epoch": 0.009456, "grad_norm": 3.15625, "learning_rate": 0.0009789258187294617, "loss": 2.2923, "step": 1182 }, { "epoch": 0.009464, "grad_norm": 2.8125, "learning_rate": 0.0009788896326225008, "loss": 2.1057, "step": 1183 }, { "epoch": 0.009472, "grad_norm": 4.125, "learning_rate": 0.000978853416144891, "loss": 2.3777, "step": 1184 }, { "epoch": 0.00948, "grad_norm": 2.953125, "learning_rate": 0.0009788171692989294, "loss": 3.0205, "step": 1185 }, { "epoch": 0.009488, "grad_norm": 3.375, "learning_rate": 0.0009787808920869144, "loss": 2.5063, "step": 1186 }, { "epoch": 0.009496, "grad_norm": 2.515625, "learning_rate": 0.0009787445845111469, "loss": 2.7476, "step": 1187 }, { "epoch": 0.009504, "grad_norm": 2.9375, "learning_rate": 0.0009787082465739293, "loss": 2.0509, "step": 1188 }, { "epoch": 0.009512, "grad_norm": 3.828125, "learning_rate": 0.0009786718782775663, "loss": 2.532, "step": 
1189 }, { "epoch": 0.00952, "grad_norm": 3.078125, "learning_rate": 0.0009786354796243641, "loss": 2.4438, "step": 1190 }, { "epoch": 0.009528, "grad_norm": 3.59375, "learning_rate": 0.0009785990506166312, "loss": 2.7103, "step": 1191 }, { "epoch": 0.009536, "grad_norm": 2.734375, "learning_rate": 0.000978562591256678, "loss": 2.3712, "step": 1192 }, { "epoch": 0.009544, "grad_norm": 2.921875, "learning_rate": 0.0009785261015468167, "loss": 2.2404, "step": 1193 }, { "epoch": 0.009552, "grad_norm": 3.421875, "learning_rate": 0.0009784895814893611, "loss": 3.0869, "step": 1194 }, { "epoch": 0.00956, "grad_norm": 2.515625, "learning_rate": 0.0009784530310866279, "loss": 1.9674, "step": 1195 }, { "epoch": 0.009568, "grad_norm": 2.953125, "learning_rate": 0.0009784164503409342, "loss": 2.2707, "step": 1196 }, { "epoch": 0.009576, "grad_norm": 3.09375, "learning_rate": 0.0009783798392546006, "loss": 1.7093, "step": 1197 }, { "epoch": 0.009584, "grad_norm": 3.03125, "learning_rate": 0.0009783431978299488, "loss": 2.3292, "step": 1198 }, { "epoch": 0.009592, "grad_norm": 3.828125, "learning_rate": 0.0009783065260693025, "loss": 1.7621, "step": 1199 }, { "epoch": 0.0096, "grad_norm": 4.34375, "learning_rate": 0.0009782698239749871, "loss": 2.5254, "step": 1200 }, { "epoch": 0.009608, "grad_norm": 3.046875, "learning_rate": 0.0009782330915493306, "loss": 2.3107, "step": 1201 }, { "epoch": 0.009616, "grad_norm": 3.640625, "learning_rate": 0.0009781963287946627, "loss": 2.3161, "step": 1202 }, { "epoch": 0.009624, "grad_norm": 3.171875, "learning_rate": 0.0009781595357133144, "loss": 2.3591, "step": 1203 }, { "epoch": 0.009632, "grad_norm": 3.296875, "learning_rate": 0.000978122712307619, "loss": 1.8024, "step": 1204 }, { "epoch": 0.00964, "grad_norm": 3.21875, "learning_rate": 0.0009780858585799123, "loss": 1.6226, "step": 1205 }, { "epoch": 0.009648, "grad_norm": 4.0, "learning_rate": 0.000978048974532531, "loss": 2.1137, "step": 1206 }, { "epoch": 0.009656, "grad_norm": 3.375, "learning_rate": 0.0009780120601678148, "loss": 2.2401, "step": 1207 }, { "epoch": 0.009664, "grad_norm": 3.578125, "learning_rate": 0.0009779751154881044, "loss": 2.5201, "step": 1208 }, { "epoch": 0.009672, "grad_norm": 4.0625, "learning_rate": 0.0009779381404957429, "loss": 2.7197, "step": 1209 }, { "epoch": 0.00968, "grad_norm": 3.78125, "learning_rate": 0.000977901135193075, "loss": 2.5489, "step": 1210 }, { "epoch": 0.009688, "grad_norm": 2.390625, "learning_rate": 0.000977864099582448, "loss": 2.0973, "step": 1211 }, { "epoch": 0.009696, "grad_norm": 2.90625, "learning_rate": 0.0009778270336662101, "loss": 2.1735, "step": 1212 }, { "epoch": 0.009704, "grad_norm": 3.265625, "learning_rate": 0.0009777899374467122, "loss": 2.7415, "step": 1213 }, { "epoch": 0.009712, "grad_norm": 2.828125, "learning_rate": 0.0009777528109263071, "loss": 2.037, "step": 1214 }, { "epoch": 0.00972, "grad_norm": 2.796875, "learning_rate": 0.0009777156541073493, "loss": 2.0933, "step": 1215 }, { "epoch": 0.009728, "grad_norm": 2.8125, "learning_rate": 0.000977678466992195, "loss": 2.4089, "step": 1216 }, { "epoch": 0.009736, "grad_norm": 2.90625, "learning_rate": 0.0009776412495832026, "loss": 2.8237, "step": 1217 }, { "epoch": 0.009744, "grad_norm": 3.3125, "learning_rate": 0.0009776040018827327, "loss": 2.8598, "step": 1218 }, { "epoch": 0.009752, "grad_norm": 2.578125, "learning_rate": 0.0009775667238931468, "loss": 2.3332, "step": 1219 }, { "epoch": 0.00976, "grad_norm": 167.0, "learning_rate": 0.00097752941561681, "loss": 2.5717, "step": 
1220 }, { "epoch": 0.009768, "grad_norm": 2.953125, "learning_rate": 0.0009774920770560876, "loss": 2.3958, "step": 1221 }, { "epoch": 0.009776, "grad_norm": 4.3125, "learning_rate": 0.0009774547082133478, "loss": 2.3754, "step": 1222 }, { "epoch": 0.009784, "grad_norm": 2.90625, "learning_rate": 0.0009774173090909607, "loss": 2.4088, "step": 1223 }, { "epoch": 0.009792, "grad_norm": 3.46875, "learning_rate": 0.000977379879691298, "loss": 2.5859, "step": 1224 }, { "epoch": 0.0098, "grad_norm": 3.515625, "learning_rate": 0.000977342420016733, "loss": 2.4757, "step": 1225 }, { "epoch": 0.009808, "grad_norm": 3.46875, "learning_rate": 0.000977304930069642, "loss": 2.3152, "step": 1226 }, { "epoch": 0.009816, "grad_norm": 3.421875, "learning_rate": 0.000977267409852402, "loss": 2.0647, "step": 1227 }, { "epoch": 0.009824, "grad_norm": 3.78125, "learning_rate": 0.000977229859367393, "loss": 2.7476, "step": 1228 }, { "epoch": 0.009832, "grad_norm": 4.53125, "learning_rate": 0.0009771922786169962, "loss": 3.6881, "step": 1229 }, { "epoch": 0.00984, "grad_norm": 2.59375, "learning_rate": 0.0009771546676035946, "loss": 2.3479, "step": 1230 }, { "epoch": 0.009848, "grad_norm": 3.171875, "learning_rate": 0.0009771170263295742, "loss": 2.4004, "step": 1231 }, { "epoch": 0.009856, "grad_norm": 3.28125, "learning_rate": 0.0009770793547973214, "loss": 2.3488, "step": 1232 }, { "epoch": 0.009864, "grad_norm": 3.734375, "learning_rate": 0.0009770416530092258, "loss": 2.6904, "step": 1233 }, { "epoch": 0.009872, "grad_norm": 2.609375, "learning_rate": 0.0009770039209676781, "loss": 2.4115, "step": 1234 }, { "epoch": 0.00988, "grad_norm": 3.15625, "learning_rate": 0.0009769661586750712, "loss": 2.6776, "step": 1235 }, { "epoch": 0.009888, "grad_norm": 2.875, "learning_rate": 0.0009769283661338005, "loss": 2.1255, "step": 1236 }, { "epoch": 0.009896, "grad_norm": 2.640625, "learning_rate": 0.000976890543346262, "loss": 2.2408, "step": 1237 }, { "epoch": 0.009904, "grad_norm": 2.96875, "learning_rate": 0.0009768526903148548, "loss": 2.4357, "step": 1238 }, { "epoch": 0.009912, "grad_norm": 3.09375, "learning_rate": 0.0009768148070419796, "loss": 2.313, "step": 1239 }, { "epoch": 0.00992, "grad_norm": 3.3125, "learning_rate": 0.0009767768935300385, "loss": 2.1613, "step": 1240 }, { "epoch": 0.009928, "grad_norm": 3.25, "learning_rate": 0.0009767389497814362, "loss": 2.5997, "step": 1241 }, { "epoch": 0.009936, "grad_norm": 3.359375, "learning_rate": 0.0009767009757985791, "loss": 2.523, "step": 1242 }, { "epoch": 0.009944, "grad_norm": 3.546875, "learning_rate": 0.0009766629715838755, "loss": 2.6024, "step": 1243 }, { "epoch": 0.009952, "grad_norm": 2.515625, "learning_rate": 0.0009766249371397353, "loss": 2.0717, "step": 1244 }, { "epoch": 0.00996, "grad_norm": 2.953125, "learning_rate": 0.000976586872468571, "loss": 2.6215, "step": 1245 }, { "epoch": 0.009968, "grad_norm": 2.84375, "learning_rate": 0.000976548777572796, "loss": 2.7619, "step": 1246 }, { "epoch": 0.009976, "grad_norm": 2.75, "learning_rate": 0.0009765106524548268, "loss": 2.6658, "step": 1247 }, { "epoch": 0.009984, "grad_norm": 3.09375, "learning_rate": 0.0009764724971170812, "loss": 3.0304, "step": 1248 }, { "epoch": 0.009992, "grad_norm": 2.90625, "learning_rate": 0.0009764343115619787, "loss": 2.0588, "step": 1249 }, { "epoch": 0.01, "grad_norm": 2.34375, "learning_rate": 0.0009763960957919414, "loss": 2.0798, "step": 1250 }, { "epoch": 0.010008, "grad_norm": 2.546875, "learning_rate": 0.0009763578498093923, "loss": 1.9572, "step": 1251 }, 
{ "epoch": 0.010016, "grad_norm": 2.109375, "learning_rate": 0.0009763195736167575, "loss": 1.8039, "step": 1252 }, { "epoch": 0.010024, "grad_norm": 3.15625, "learning_rate": 0.0009762812672164641, "loss": 2.2611, "step": 1253 }, { "epoch": 0.010032, "grad_norm": 3.59375, "learning_rate": 0.0009762429306109415, "loss": 2.6415, "step": 1254 }, { "epoch": 0.01004, "grad_norm": 3.09375, "learning_rate": 0.0009762045638026212, "loss": 2.5496, "step": 1255 }, { "epoch": 0.010048, "grad_norm": 3.21875, "learning_rate": 0.0009761661667939359, "loss": 2.5062, "step": 1256 }, { "epoch": 0.010056, "grad_norm": 2.3125, "learning_rate": 0.0009761277395873211, "loss": 1.9854, "step": 1257 }, { "epoch": 0.010064, "grad_norm": 2.875, "learning_rate": 0.0009760892821852138, "loss": 2.1809, "step": 1258 }, { "epoch": 0.010072, "grad_norm": 2.875, "learning_rate": 0.0009760507945900527, "loss": 2.7308, "step": 1259 }, { "epoch": 0.01008, "grad_norm": 2.90625, "learning_rate": 0.0009760122768042788, "loss": 2.1488, "step": 1260 }, { "epoch": 0.010088, "grad_norm": 2.84375, "learning_rate": 0.0009759737288303347, "loss": 2.2404, "step": 1261 }, { "epoch": 0.010096, "grad_norm": 3.515625, "learning_rate": 0.0009759351506706653, "loss": 2.3416, "step": 1262 }, { "epoch": 0.010104, "grad_norm": 2.8125, "learning_rate": 0.0009758965423277171, "loss": 2.0936, "step": 1263 }, { "epoch": 0.010112, "grad_norm": 2.921875, "learning_rate": 0.0009758579038039386, "loss": 3.0488, "step": 1264 }, { "epoch": 0.01012, "grad_norm": 2.625, "learning_rate": 0.00097581923510178, "loss": 2.0543, "step": 1265 }, { "epoch": 0.010128, "grad_norm": 3.328125, "learning_rate": 0.000975780536223694, "loss": 2.3157, "step": 1266 }, { "epoch": 0.010136, "grad_norm": 2.796875, "learning_rate": 0.0009757418071721345, "loss": 2.7794, "step": 1267 }, { "epoch": 0.010144, "grad_norm": 2.796875, "learning_rate": 0.0009757030479495579, "loss": 1.9007, "step": 1268 }, { "epoch": 0.010152, "grad_norm": 3.078125, "learning_rate": 0.0009756642585584221, "loss": 2.449, "step": 1269 }, { "epoch": 0.01016, "grad_norm": 3.671875, "learning_rate": 0.0009756254390011871, "loss": 2.4513, "step": 1270 }, { "epoch": 0.010168, "grad_norm": 3.28125, "learning_rate": 0.0009755865892803148, "loss": 2.2878, "step": 1271 }, { "epoch": 0.010176, "grad_norm": 3.203125, "learning_rate": 0.0009755477093982692, "loss": 2.0278, "step": 1272 }, { "epoch": 0.010184, "grad_norm": 3.0, "learning_rate": 0.0009755087993575158, "loss": 2.4454, "step": 1273 }, { "epoch": 0.010192, "grad_norm": 3.53125, "learning_rate": 0.0009754698591605223, "loss": 3.1321, "step": 1274 }, { "epoch": 0.0102, "grad_norm": 3.734375, "learning_rate": 0.0009754308888097583, "loss": 2.6913, "step": 1275 }, { "epoch": 0.010208, "grad_norm": 3.21875, "learning_rate": 0.0009753918883076951, "loss": 2.3973, "step": 1276 }, { "epoch": 0.010216, "grad_norm": 2.734375, "learning_rate": 0.0009753528576568061, "loss": 2.1189, "step": 1277 }, { "epoch": 0.010224, "grad_norm": 2.765625, "learning_rate": 0.0009753137968595668, "loss": 2.0419, "step": 1278 }, { "epoch": 0.010232, "grad_norm": 2.84375, "learning_rate": 0.0009752747059184542, "loss": 1.7639, "step": 1279 }, { "epoch": 0.01024, "grad_norm": 3.65625, "learning_rate": 0.0009752355848359473, "loss": 2.4456, "step": 1280 }, { "epoch": 0.010248, "grad_norm": 3.25, "learning_rate": 0.0009751964336145273, "loss": 3.0387, "step": 1281 }, { "epoch": 0.010256, "grad_norm": 3.390625, "learning_rate": 0.0009751572522566773, "loss": 2.731, "step": 1282 }, { 
"epoch": 0.010264, "grad_norm": 3.5, "learning_rate": 0.0009751180407648818, "loss": 2.1537, "step": 1283 }, { "epoch": 0.010272, "grad_norm": 2.625, "learning_rate": 0.0009750787991416275, "loss": 2.2179, "step": 1284 }, { "epoch": 0.01028, "grad_norm": 3.828125, "learning_rate": 0.0009750395273894034, "loss": 2.7144, "step": 1285 }, { "epoch": 0.010288, "grad_norm": 3.59375, "learning_rate": 0.0009750002255106999, "loss": 3.1902, "step": 1286 }, { "epoch": 0.010296, "grad_norm": 3.234375, "learning_rate": 0.0009749608935080094, "loss": 2.7522, "step": 1287 }, { "epoch": 0.010304, "grad_norm": 3.421875, "learning_rate": 0.0009749215313838265, "loss": 2.2962, "step": 1288 }, { "epoch": 0.010312, "grad_norm": 3.59375, "learning_rate": 0.0009748821391406473, "loss": 2.6833, "step": 1289 }, { "epoch": 0.01032, "grad_norm": 2.8125, "learning_rate": 0.0009748427167809702, "loss": 2.3747, "step": 1290 }, { "epoch": 0.010328, "grad_norm": 3.484375, "learning_rate": 0.000974803264307295, "loss": 3.1765, "step": 1291 }, { "epoch": 0.010336, "grad_norm": 3.0625, "learning_rate": 0.000974763781722124, "loss": 2.2542, "step": 1292 }, { "epoch": 0.010344, "grad_norm": 3.21875, "learning_rate": 0.0009747242690279612, "loss": 2.5648, "step": 1293 }, { "epoch": 0.010352, "grad_norm": 2.6875, "learning_rate": 0.0009746847262273122, "loss": 1.8832, "step": 1294 }, { "epoch": 0.01036, "grad_norm": 3.828125, "learning_rate": 0.000974645153322685, "loss": 1.9625, "step": 1295 }, { "epoch": 0.010368, "grad_norm": 4.03125, "learning_rate": 0.0009746055503165892, "loss": 2.4567, "step": 1296 }, { "epoch": 0.010376, "grad_norm": 3.25, "learning_rate": 0.0009745659172115362, "loss": 2.6221, "step": 1297 }, { "epoch": 0.010384, "grad_norm": 3.390625, "learning_rate": 0.0009745262540100396, "loss": 1.7217, "step": 1298 }, { "epoch": 0.010392, "grad_norm": 3.109375, "learning_rate": 0.0009744865607146151, "loss": 2.215, "step": 1299 }, { "epoch": 0.0104, "grad_norm": 2.640625, "learning_rate": 0.0009744468373277796, "loss": 2.2563, "step": 1300 }, { "epoch": 0.010408, "grad_norm": 3.109375, "learning_rate": 0.0009744070838520524, "loss": 2.4379, "step": 1301 }, { "epoch": 0.010416, "grad_norm": 3.0625, "learning_rate": 0.0009743673002899546, "loss": 2.4221, "step": 1302 }, { "epoch": 0.010424, "grad_norm": 3.078125, "learning_rate": 0.0009743274866440095, "loss": 1.9019, "step": 1303 }, { "epoch": 0.010432, "grad_norm": 2.734375, "learning_rate": 0.0009742876429167418, "loss": 2.2665, "step": 1304 }, { "epoch": 0.01044, "grad_norm": 3.609375, "learning_rate": 0.0009742477691106783, "loss": 2.4283, "step": 1305 }, { "epoch": 0.010448, "grad_norm": 20.625, "learning_rate": 0.0009742078652283478, "loss": 1.7838, "step": 1306 }, { "epoch": 0.010456, "grad_norm": 3.015625, "learning_rate": 0.0009741679312722811, "loss": 2.345, "step": 1307 }, { "epoch": 0.010464, "grad_norm": 2.71875, "learning_rate": 0.0009741279672450105, "loss": 2.2458, "step": 1308 }, { "epoch": 0.010472, "grad_norm": 2.9375, "learning_rate": 0.0009740879731490708, "loss": 2.9518, "step": 1309 }, { "epoch": 0.01048, "grad_norm": 2.9375, "learning_rate": 0.000974047948986998, "loss": 2.2773, "step": 1310 }, { "epoch": 0.010488, "grad_norm": 2.875, "learning_rate": 0.0009740078947613309, "loss": 2.2202, "step": 1311 }, { "epoch": 0.010496, "grad_norm": 2.875, "learning_rate": 0.0009739678104746091, "loss": 2.2916, "step": 1312 }, { "epoch": 0.010504, "grad_norm": 3.046875, "learning_rate": 0.0009739276961293753, "loss": 2.0818, "step": 1313 }, { "epoch": 
0.010512, "grad_norm": 2.890625, "learning_rate": 0.0009738875517281729, "loss": 2.0408, "step": 1314 }, { "epoch": 0.01052, "grad_norm": 2.984375, "learning_rate": 0.0009738473772735484, "loss": 2.2331, "step": 1315 }, { "epoch": 0.010528, "grad_norm": 3.734375, "learning_rate": 0.000973807172768049, "loss": 2.3156, "step": 1316 }, { "epoch": 0.010536, "grad_norm": 3.859375, "learning_rate": 0.0009737669382142251, "loss": 2.915, "step": 1317 }, { "epoch": 0.010544, "grad_norm": 2.484375, "learning_rate": 0.0009737266736146279, "loss": 1.9253, "step": 1318 }, { "epoch": 0.010552, "grad_norm": 2.71875, "learning_rate": 0.0009736863789718109, "loss": 2.4065, "step": 1319 }, { "epoch": 0.01056, "grad_norm": 3.796875, "learning_rate": 0.0009736460542883299, "loss": 2.616, "step": 1320 }, { "epoch": 0.010568, "grad_norm": 3.421875, "learning_rate": 0.0009736056995667417, "loss": 2.9226, "step": 1321 }, { "epoch": 0.010576, "grad_norm": 2.625, "learning_rate": 0.0009735653148096062, "loss": 1.6124, "step": 1322 }, { "epoch": 0.010584, "grad_norm": 2.390625, "learning_rate": 0.000973524900019484, "loss": 1.9177, "step": 1323 }, { "epoch": 0.010592, "grad_norm": 3.5, "learning_rate": 0.0009734844551989385, "loss": 1.9827, "step": 1324 }, { "epoch": 0.0106, "grad_norm": 6.8125, "learning_rate": 0.0009734439803505345, "loss": 1.9332, "step": 1325 }, { "epoch": 0.010608, "grad_norm": 2.828125, "learning_rate": 0.000973403475476839, "loss": 2.6164, "step": 1326 }, { "epoch": 0.010616, "grad_norm": 4.40625, "learning_rate": 0.0009733629405804205, "loss": 3.4817, "step": 1327 }, { "epoch": 0.010624, "grad_norm": 3.25, "learning_rate": 0.0009733223756638501, "loss": 2.6044, "step": 1328 }, { "epoch": 0.010632, "grad_norm": 3.046875, "learning_rate": 0.0009732817807297001, "loss": 2.2979, "step": 1329 }, { "epoch": 0.01064, "grad_norm": 2.796875, "learning_rate": 0.0009732411557805449, "loss": 1.9315, "step": 1330 }, { "epoch": 0.010648, "grad_norm": 2.671875, "learning_rate": 0.0009732005008189613, "loss": 1.9644, "step": 1331 }, { "epoch": 0.010656, "grad_norm": 3.375, "learning_rate": 0.0009731598158475271, "loss": 2.1789, "step": 1332 }, { "epoch": 0.010664, "grad_norm": 2.390625, "learning_rate": 0.0009731191008688227, "loss": 2.1048, "step": 1333 }, { "epoch": 0.010672, "grad_norm": 4.28125, "learning_rate": 0.0009730783558854303, "loss": 2.1208, "step": 1334 }, { "epoch": 0.01068, "grad_norm": 3.296875, "learning_rate": 0.0009730375808999339, "loss": 2.0629, "step": 1335 }, { "epoch": 0.010688, "grad_norm": 5.28125, "learning_rate": 0.0009729967759149193, "loss": 2.6076, "step": 1336 }, { "epoch": 0.010696, "grad_norm": 3.125, "learning_rate": 0.0009729559409329743, "loss": 2.4443, "step": 1337 }, { "epoch": 0.010704, "grad_norm": 3.09375, "learning_rate": 0.0009729150759566886, "loss": 2.2592, "step": 1338 }, { "epoch": 0.010712, "grad_norm": 3.109375, "learning_rate": 0.0009728741809886538, "loss": 2.538, "step": 1339 }, { "epoch": 0.01072, "grad_norm": 2.6875, "learning_rate": 0.0009728332560314637, "loss": 1.7543, "step": 1340 }, { "epoch": 0.010728, "grad_norm": 3.34375, "learning_rate": 0.0009727923010877133, "loss": 2.6589, "step": 1341 }, { "epoch": 0.010736, "grad_norm": 2.609375, "learning_rate": 0.0009727513161600001, "loss": 1.8743, "step": 1342 }, { "epoch": 0.010744, "grad_norm": 3.5625, "learning_rate": 0.0009727103012509233, "loss": 2.3077, "step": 1343 }, { "epoch": 0.010752, "grad_norm": 3.84375, "learning_rate": 0.0009726692563630841, "loss": 2.082, "step": 1344 }, { "epoch": 
0.01076, "grad_norm": 3.9375, "learning_rate": 0.0009726281814990855, "loss": 2.5107, "step": 1345 }, { "epoch": 0.010768, "grad_norm": 3.484375, "learning_rate": 0.0009725870766615324, "loss": 2.7255, "step": 1346 }, { "epoch": 0.010776, "grad_norm": 3.5, "learning_rate": 0.0009725459418530313, "loss": 2.3053, "step": 1347 }, { "epoch": 0.010784, "grad_norm": 2.75, "learning_rate": 0.0009725047770761916, "loss": 2.4367, "step": 1348 }, { "epoch": 0.010792, "grad_norm": 3.125, "learning_rate": 0.0009724635823336234, "loss": 2.4892, "step": 1349 }, { "epoch": 0.0108, "grad_norm": 3.03125, "learning_rate": 0.0009724223576279394, "loss": 2.1743, "step": 1350 }, { "epoch": 0.010808, "grad_norm": 3.046875, "learning_rate": 0.000972381102961754, "loss": 2.345, "step": 1351 }, { "epoch": 0.010816, "grad_norm": 4.21875, "learning_rate": 0.0009723398183376834, "loss": 2.2578, "step": 1352 }, { "epoch": 0.010824, "grad_norm": 3.09375, "learning_rate": 0.000972298503758346, "loss": 2.1018, "step": 1353 }, { "epoch": 0.010832, "grad_norm": 3.40625, "learning_rate": 0.000972257159226362, "loss": 2.6635, "step": 1354 }, { "epoch": 0.01084, "grad_norm": 3.265625, "learning_rate": 0.0009722157847443534, "loss": 2.9847, "step": 1355 }, { "epoch": 0.010848, "grad_norm": 3.515625, "learning_rate": 0.0009721743803149437, "loss": 2.6603, "step": 1356 }, { "epoch": 0.010856, "grad_norm": 3.21875, "learning_rate": 0.0009721329459407593, "loss": 2.2404, "step": 1357 }, { "epoch": 0.010864, "grad_norm": 2.6875, "learning_rate": 0.0009720914816244275, "loss": 2.0596, "step": 1358 }, { "epoch": 0.010872, "grad_norm": 3.453125, "learning_rate": 0.0009720499873685782, "loss": 2.4521, "step": 1359 }, { "epoch": 0.01088, "grad_norm": 3.015625, "learning_rate": 0.0009720084631758427, "loss": 3.1029, "step": 1360 }, { "epoch": 0.010888, "grad_norm": 3.453125, "learning_rate": 0.0009719669090488547, "loss": 2.3492, "step": 1361 }, { "epoch": 0.010896, "grad_norm": 3.71875, "learning_rate": 0.0009719253249902493, "loss": 1.8842, "step": 1362 }, { "epoch": 0.010904, "grad_norm": 3.671875, "learning_rate": 0.0009718837110026637, "loss": 2.8783, "step": 1363 }, { "epoch": 0.010912, "grad_norm": 3.03125, "learning_rate": 0.0009718420670887371, "loss": 2.245, "step": 1364 }, { "epoch": 0.01092, "grad_norm": 3.015625, "learning_rate": 0.0009718003932511106, "loss": 2.969, "step": 1365 }, { "epoch": 0.010928, "grad_norm": 2.796875, "learning_rate": 0.0009717586894924269, "loss": 2.2403, "step": 1366 }, { "epoch": 0.010936, "grad_norm": 2.84375, "learning_rate": 0.0009717169558153307, "loss": 2.7348, "step": 1367 }, { "epoch": 0.010944, "grad_norm": 2.734375, "learning_rate": 0.0009716751922224691, "loss": 2.3191, "step": 1368 }, { "epoch": 0.010952, "grad_norm": 2.71875, "learning_rate": 0.0009716333987164904, "loss": 2.2927, "step": 1369 }, { "epoch": 0.01096, "grad_norm": 2.703125, "learning_rate": 0.0009715915753000454, "loss": 2.2298, "step": 1370 }, { "epoch": 0.010968, "grad_norm": 2.375, "learning_rate": 0.0009715497219757861, "loss": 2.0357, "step": 1371 }, { "epoch": 0.010976, "grad_norm": 3.0, "learning_rate": 0.0009715078387463672, "loss": 2.0499, "step": 1372 }, { "epoch": 0.010984, "grad_norm": 3.109375, "learning_rate": 0.0009714659256144445, "loss": 2.4872, "step": 1373 }, { "epoch": 0.010992, "grad_norm": 2.71875, "learning_rate": 0.0009714239825826762, "loss": 2.6449, "step": 1374 }, { "epoch": 0.011, "grad_norm": 3.359375, "learning_rate": 0.0009713820096537225, "loss": 2.655, "step": 1375 }, { "epoch": 0.011008, 
"grad_norm": 28.125, "learning_rate": 0.0009713400068302449, "loss": 2.7228, "step": 1376 }, { "epoch": 0.011016, "grad_norm": 3.484375, "learning_rate": 0.0009712979741149076, "loss": 2.1789, "step": 1377 }, { "epoch": 0.011024, "grad_norm": 3.890625, "learning_rate": 0.0009712559115103759, "loss": 2.5858, "step": 1378 }, { "epoch": 0.011032, "grad_norm": 3.671875, "learning_rate": 0.0009712138190193176, "loss": 2.5736, "step": 1379 }, { "epoch": 0.01104, "grad_norm": 6.34375, "learning_rate": 0.0009711716966444022, "loss": 2.1157, "step": 1380 }, { "epoch": 0.011048, "grad_norm": 2.90625, "learning_rate": 0.0009711295443883008, "loss": 2.4231, "step": 1381 }, { "epoch": 0.011056, "grad_norm": 3.09375, "learning_rate": 0.0009710873622536867, "loss": 2.661, "step": 1382 }, { "epoch": 0.011064, "grad_norm": 3.3125, "learning_rate": 0.0009710451502432354, "loss": 2.5302, "step": 1383 }, { "epoch": 0.011072, "grad_norm": 3.046875, "learning_rate": 0.0009710029083596234, "loss": 2.0646, "step": 1384 }, { "epoch": 0.01108, "grad_norm": 3.3125, "learning_rate": 0.0009709606366055299, "loss": 2.8672, "step": 1385 }, { "epoch": 0.011088, "grad_norm": 3.359375, "learning_rate": 0.0009709183349836359, "loss": 2.591, "step": 1386 }, { "epoch": 0.011096, "grad_norm": 2.875, "learning_rate": 0.0009708760034966239, "loss": 2.6294, "step": 1387 }, { "epoch": 0.011104, "grad_norm": 3.78125, "learning_rate": 0.0009708336421471784, "loss": 2.6453, "step": 1388 }, { "epoch": 0.011112, "grad_norm": 2.828125, "learning_rate": 0.0009707912509379862, "loss": 2.0457, "step": 1389 }, { "epoch": 0.01112, "grad_norm": 3.265625, "learning_rate": 0.0009707488298717356, "loss": 2.4065, "step": 1390 }, { "epoch": 0.011128, "grad_norm": 5.03125, "learning_rate": 0.0009707063789511169, "loss": 2.6241, "step": 1391 }, { "epoch": 0.011136, "grad_norm": 4.59375, "learning_rate": 0.0009706638981788223, "loss": 1.7711, "step": 1392 }, { "epoch": 0.011144, "grad_norm": 3.578125, "learning_rate": 0.0009706213875575457, "loss": 2.4802, "step": 1393 }, { "epoch": 0.011152, "grad_norm": 3.875, "learning_rate": 0.0009705788470899833, "loss": 2.4616, "step": 1394 }, { "epoch": 0.01116, "grad_norm": 3.40625, "learning_rate": 0.0009705362767788329, "loss": 2.3647, "step": 1395 }, { "epoch": 0.011168, "grad_norm": 4.40625, "learning_rate": 0.0009704936766267942, "loss": 2.2076, "step": 1396 }, { "epoch": 0.011176, "grad_norm": 3.34375, "learning_rate": 0.0009704510466365691, "loss": 2.6617, "step": 1397 }, { "epoch": 0.011184, "grad_norm": 3.453125, "learning_rate": 0.0009704083868108607, "loss": 2.2743, "step": 1398 }, { "epoch": 0.011192, "grad_norm": 3.078125, "learning_rate": 0.0009703656971523747, "loss": 2.8659, "step": 1399 }, { "epoch": 0.0112, "grad_norm": 3.046875, "learning_rate": 0.0009703229776638185, "loss": 2.3952, "step": 1400 }, { "epoch": 0.011208, "grad_norm": 3.125, "learning_rate": 0.0009702802283479013, "loss": 2.8623, "step": 1401 }, { "epoch": 0.011216, "grad_norm": 3.171875, "learning_rate": 0.0009702374492073343, "loss": 2.4936, "step": 1402 }, { "epoch": 0.011224, "grad_norm": 2.828125, "learning_rate": 0.0009701946402448301, "loss": 2.4413, "step": 1403 }, { "epoch": 0.011232, "grad_norm": 3.203125, "learning_rate": 0.0009701518014631039, "loss": 2.9878, "step": 1404 }, { "epoch": 0.01124, "grad_norm": 3.1875, "learning_rate": 0.0009701089328648726, "loss": 2.225, "step": 1405 }, { "epoch": 0.011248, "grad_norm": 3.1875, "learning_rate": 0.0009700660344528546, "loss": 2.1321, "step": 1406 }, { "epoch": 
0.011256, "grad_norm": 4.09375, "learning_rate": 0.0009700231062297706, "loss": 2.5387, "step": 1407 }, { "epoch": 0.011264, "grad_norm": 3.109375, "learning_rate": 0.0009699801481983432, "loss": 2.5182, "step": 1408 }, { "epoch": 0.011272, "grad_norm": 3.90625, "learning_rate": 0.0009699371603612965, "loss": 2.4518, "step": 1409 }, { "epoch": 0.01128, "grad_norm": 4.6875, "learning_rate": 0.0009698941427213569, "loss": 2.6494, "step": 1410 }, { "epoch": 0.011288, "grad_norm": 4.1875, "learning_rate": 0.0009698510952812525, "loss": 2.4309, "step": 1411 }, { "epoch": 0.011296, "grad_norm": 3.28125, "learning_rate": 0.0009698080180437132, "loss": 2.2569, "step": 1412 }, { "epoch": 0.011304, "grad_norm": 2.984375, "learning_rate": 0.0009697649110114711, "loss": 2.5427, "step": 1413 }, { "epoch": 0.011312, "grad_norm": 3.484375, "learning_rate": 0.00096972177418726, "loss": 2.7221, "step": 1414 }, { "epoch": 0.01132, "grad_norm": 5.625, "learning_rate": 0.0009696786075738153, "loss": 2.4023, "step": 1415 }, { "epoch": 0.011328, "grad_norm": 3.96875, "learning_rate": 0.0009696354111738749, "loss": 2.2853, "step": 1416 }, { "epoch": 0.011336, "grad_norm": 3.015625, "learning_rate": 0.0009695921849901783, "loss": 2.3587, "step": 1417 }, { "epoch": 0.011344, "grad_norm": 3.234375, "learning_rate": 0.0009695489290254665, "loss": 2.6046, "step": 1418 }, { "epoch": 0.011352, "grad_norm": 4768.0, "learning_rate": 0.0009695056432824829, "loss": 2.2041, "step": 1419 }, { "epoch": 0.01136, "grad_norm": 3.4375, "learning_rate": 0.0009694623277639729, "loss": 2.5062, "step": 1420 }, { "epoch": 0.011368, "grad_norm": 4.75, "learning_rate": 0.0009694189824726832, "loss": 2.3521, "step": 1421 }, { "epoch": 0.011376, "grad_norm": 164.0, "learning_rate": 0.0009693756074113628, "loss": 2.1865, "step": 1422 }, { "epoch": 0.011384, "grad_norm": 5.125, "learning_rate": 0.0009693322025827625, "loss": 2.3867, "step": 1423 }, { "epoch": 0.011392, "grad_norm": 151.0, "learning_rate": 0.0009692887679896351, "loss": 2.478, "step": 1424 }, { "epoch": 0.0114, "grad_norm": 3.71875, "learning_rate": 0.0009692453036347351, "loss": 2.8059, "step": 1425 }, { "epoch": 0.011408, "grad_norm": 3.546875, "learning_rate": 0.0009692018095208188, "loss": 2.5559, "step": 1426 }, { "epoch": 0.011416, "grad_norm": 23.875, "learning_rate": 0.0009691582856506447, "loss": 2.2487, "step": 1427 }, { "epoch": 0.011424, "grad_norm": 3.796875, "learning_rate": 0.0009691147320269731, "loss": 2.2725, "step": 1428 }, { "epoch": 0.011432, "grad_norm": 5.40625, "learning_rate": 0.0009690711486525661, "loss": 3.0237, "step": 1429 }, { "epoch": 0.01144, "grad_norm": 3.71875, "learning_rate": 0.0009690275355301874, "loss": 2.2087, "step": 1430 }, { "epoch": 0.011448, "grad_norm": 6.0625, "learning_rate": 0.0009689838926626033, "loss": 2.6657, "step": 1431 }, { "epoch": 0.011456, "grad_norm": 5.375, "learning_rate": 0.0009689402200525816, "loss": 2.4717, "step": 1432 }, { "epoch": 0.011464, "grad_norm": 4.40625, "learning_rate": 0.0009688965177028916, "loss": 2.1569, "step": 1433 }, { "epoch": 0.011472, "grad_norm": 3.9375, "learning_rate": 0.0009688527856163052, "loss": 2.0296, "step": 1434 }, { "epoch": 0.01148, "grad_norm": 4.4375, "learning_rate": 0.0009688090237955955, "loss": 3.027, "step": 1435 }, { "epoch": 0.011488, "grad_norm": 5.8125, "learning_rate": 0.0009687652322435382, "loss": 2.37, "step": 1436 }, { "epoch": 0.011496, "grad_norm": 4.71875, "learning_rate": 0.0009687214109629103, "loss": 2.8291, "step": 1437 }, { "epoch": 0.011504, 
"grad_norm": 5.28125, "learning_rate": 0.0009686775599564907, "loss": 2.6955, "step": 1438 }, { "epoch": 0.011512, "grad_norm": 3.453125, "learning_rate": 0.0009686336792270609, "loss": 2.3192, "step": 1439 }, { "epoch": 0.01152, "grad_norm": 4.15625, "learning_rate": 0.0009685897687774035, "loss": 2.5774, "step": 1440 }, { "epoch": 0.011528, "grad_norm": 3.546875, "learning_rate": 0.000968545828610303, "loss": 2.3518, "step": 1441 }, { "epoch": 0.011536, "grad_norm": 4.1875, "learning_rate": 0.0009685018587285464, "loss": 2.0937, "step": 1442 }, { "epoch": 0.011544, "grad_norm": 3.609375, "learning_rate": 0.000968457859134922, "loss": 2.8268, "step": 1443 }, { "epoch": 0.011552, "grad_norm": 3.078125, "learning_rate": 0.0009684138298322205, "loss": 2.1214, "step": 1444 }, { "epoch": 0.01156, "grad_norm": 3.5, "learning_rate": 0.0009683697708232338, "loss": 2.8602, "step": 1445 }, { "epoch": 0.011568, "grad_norm": 4.09375, "learning_rate": 0.0009683256821107564, "loss": 2.3292, "step": 1446 }, { "epoch": 0.011576, "grad_norm": 3.546875, "learning_rate": 0.000968281563697584, "loss": 3.2053, "step": 1447 }, { "epoch": 0.011584, "grad_norm": 3.703125, "learning_rate": 0.0009682374155865149, "loss": 2.4514, "step": 1448 }, { "epoch": 0.011592, "grad_norm": 4.125, "learning_rate": 0.0009681932377803488, "loss": 3.1021, "step": 1449 }, { "epoch": 0.0116, "grad_norm": 4.34375, "learning_rate": 0.0009681490302818874, "loss": 2.4014, "step": 1450 }, { "epoch": 0.011608, "grad_norm": 3.1875, "learning_rate": 0.0009681047930939342, "loss": 2.3098, "step": 1451 }, { "epoch": 0.011616, "grad_norm": 3.03125, "learning_rate": 0.0009680605262192948, "loss": 2.2594, "step": 1452 }, { "epoch": 0.011624, "grad_norm": 5.59375, "learning_rate": 0.0009680162296607764, "loss": 2.2659, "step": 1453 }, { "epoch": 0.011632, "grad_norm": 3.15625, "learning_rate": 0.0009679719034211884, "loss": 2.1848, "step": 1454 }, { "epoch": 0.01164, "grad_norm": 23.125, "learning_rate": 0.000967927547503342, "loss": 1.9713, "step": 1455 }, { "epoch": 0.011648, "grad_norm": 3.8125, "learning_rate": 0.0009678831619100499, "loss": 2.4353, "step": 1456 }, { "epoch": 0.011656, "grad_norm": 3.59375, "learning_rate": 0.0009678387466441272, "loss": 2.3124, "step": 1457 }, { "epoch": 0.011664, "grad_norm": 4.34375, "learning_rate": 0.0009677943017083907, "loss": 2.3855, "step": 1458 }, { "epoch": 0.011672, "grad_norm": 4.25, "learning_rate": 0.0009677498271056589, "loss": 2.5789, "step": 1459 }, { "epoch": 0.01168, "grad_norm": 3.0, "learning_rate": 0.0009677053228387525, "loss": 2.1612, "step": 1460 }, { "epoch": 0.011688, "grad_norm": 5.125, "learning_rate": 0.0009676607889104935, "loss": 2.7722, "step": 1461 }, { "epoch": 0.011696, "grad_norm": 3.375, "learning_rate": 0.0009676162253237069, "loss": 2.2508, "step": 1462 }, { "epoch": 0.011704, "grad_norm": 3.515625, "learning_rate": 0.0009675716320812183, "loss": 2.2007, "step": 1463 }, { "epoch": 0.011712, "grad_norm": 6.75, "learning_rate": 0.0009675270091858559, "loss": 2.2657, "step": 1464 }, { "epoch": 0.01172, "grad_norm": 3.03125, "learning_rate": 0.0009674823566404497, "loss": 2.4579, "step": 1465 }, { "epoch": 0.011728, "grad_norm": 4.8125, "learning_rate": 0.0009674376744478315, "loss": 2.4689, "step": 1466 }, { "epoch": 0.011736, "grad_norm": 3.046875, "learning_rate": 0.0009673929626108349, "loss": 2.4332, "step": 1467 }, { "epoch": 0.011744, "grad_norm": 3.296875, "learning_rate": 0.0009673482211322956, "loss": 2.7257, "step": 1468 }, { "epoch": 0.011752, "grad_norm": 
2.8125, "learning_rate": 0.000967303450015051, "loss": 1.9153, "step": 1469 }, { "epoch": 0.01176, "grad_norm": 3.890625, "learning_rate": 0.0009672586492619404, "loss": 2.6094, "step": 1470 }, { "epoch": 0.011768, "grad_norm": 2.859375, "learning_rate": 0.0009672138188758051, "loss": 2.0719, "step": 1471 }, { "epoch": 0.011776, "grad_norm": 4.5, "learning_rate": 0.0009671689588594882, "loss": 2.6258, "step": 1472 }, { "epoch": 0.011784, "grad_norm": 3.265625, "learning_rate": 0.0009671240692158345, "loss": 1.9683, "step": 1473 }, { "epoch": 0.011792, "grad_norm": 3.90625, "learning_rate": 0.0009670791499476912, "loss": 2.8312, "step": 1474 }, { "epoch": 0.0118, "grad_norm": 3.484375, "learning_rate": 0.0009670342010579065, "loss": 2.5007, "step": 1475 }, { "epoch": 0.011808, "grad_norm": 3.671875, "learning_rate": 0.0009669892225493315, "loss": 2.4419, "step": 1476 }, { "epoch": 0.011816, "grad_norm": 2.9375, "learning_rate": 0.0009669442144248184, "loss": 2.0885, "step": 1477 }, { "epoch": 0.011824, "grad_norm": 3.484375, "learning_rate": 0.0009668991766872217, "loss": 2.617, "step": 1478 }, { "epoch": 0.011832, "grad_norm": 3.875, "learning_rate": 0.0009668541093393976, "loss": 2.4154, "step": 1479 }, { "epoch": 0.01184, "grad_norm": 3.40625, "learning_rate": 0.0009668090123842043, "loss": 2.4021, "step": 1480 }, { "epoch": 0.011848, "grad_norm": 1.875, "learning_rate": 0.0009667638858245016, "loss": 1.3015, "step": 1481 }, { "epoch": 0.011856, "grad_norm": 2.859375, "learning_rate": 0.0009667187296631517, "loss": 2.5513, "step": 1482 }, { "epoch": 0.011864, "grad_norm": 4.625, "learning_rate": 0.0009666735439030178, "loss": 2.1646, "step": 1483 }, { "epoch": 0.011872, "grad_norm": 3.046875, "learning_rate": 0.0009666283285469663, "loss": 2.4925, "step": 1484 }, { "epoch": 0.01188, "grad_norm": 2.875, "learning_rate": 0.0009665830835978641, "loss": 1.9096, "step": 1485 }, { "epoch": 0.011888, "grad_norm": 3.421875, "learning_rate": 0.0009665378090585807, "loss": 2.7434, "step": 1486 }, { "epoch": 0.011896, "grad_norm": 3.65625, "learning_rate": 0.0009664925049319875, "loss": 2.9048, "step": 1487 }, { "epoch": 0.011904, "grad_norm": 4.78125, "learning_rate": 0.0009664471712209576, "loss": 2.5093, "step": 1488 }, { "epoch": 0.011912, "grad_norm": 3.84375, "learning_rate": 0.0009664018079283661, "loss": 3.0774, "step": 1489 }, { "epoch": 0.01192, "grad_norm": 3.3125, "learning_rate": 0.0009663564150570896, "loss": 2.4026, "step": 1490 }, { "epoch": 0.011928, "grad_norm": 4.03125, "learning_rate": 0.0009663109926100071, "loss": 2.2442, "step": 1491 }, { "epoch": 0.011936, "grad_norm": 3.28125, "learning_rate": 0.0009662655405899992, "loss": 2.4311, "step": 1492 }, { "epoch": 0.011944, "grad_norm": 3.796875, "learning_rate": 0.0009662200589999483, "loss": 2.6784, "step": 1493 }, { "epoch": 0.011952, "grad_norm": 3.625, "learning_rate": 0.000966174547842739, "loss": 2.944, "step": 1494 }, { "epoch": 0.01196, "grad_norm": 4.375, "learning_rate": 0.0009661290071212574, "loss": 2.32, "step": 1495 }, { "epoch": 0.011968, "grad_norm": 3.234375, "learning_rate": 0.0009660834368383918, "loss": 2.2396, "step": 1496 }, { "epoch": 0.011976, "grad_norm": 4.34375, "learning_rate": 0.0009660378369970321, "loss": 1.9949, "step": 1497 }, { "epoch": 0.011984, "grad_norm": 3.15625, "learning_rate": 0.0009659922076000702, "loss": 2.2384, "step": 1498 }, { "epoch": 0.011992, "grad_norm": 3.34375, "learning_rate": 0.0009659465486503998, "loss": 2.3727, "step": 1499 }, { "epoch": 0.012, "grad_norm": 3.390625, 
"learning_rate": 0.0009659008601509168, "loss": 2.2633, "step": 1500 }, { "epoch": 0.012008, "grad_norm": 3.125, "learning_rate": 0.0009658551421045183, "loss": 1.8743, "step": 1501 }, { "epoch": 0.012016, "grad_norm": 4.46875, "learning_rate": 0.000965809394514104, "loss": 2.533, "step": 1502 }, { "epoch": 0.012024, "grad_norm": 3.796875, "learning_rate": 0.0009657636173825752, "loss": 2.6252, "step": 1503 }, { "epoch": 0.012032, "grad_norm": 3.171875, "learning_rate": 0.0009657178107128348, "loss": 2.2181, "step": 1504 }, { "epoch": 0.01204, "grad_norm": 3.796875, "learning_rate": 0.000965671974507788, "loss": 2.5224, "step": 1505 }, { "epoch": 0.012048, "grad_norm": 3.15625, "learning_rate": 0.0009656261087703415, "loss": 2.2692, "step": 1506 }, { "epoch": 0.012056, "grad_norm": 3.859375, "learning_rate": 0.0009655802135034041, "loss": 2.9406, "step": 1507 }, { "epoch": 0.012064, "grad_norm": 4.78125, "learning_rate": 0.0009655342887098867, "loss": 2.3665, "step": 1508 }, { "epoch": 0.012072, "grad_norm": 2.609375, "learning_rate": 0.0009654883343927015, "loss": 2.428, "step": 1509 }, { "epoch": 0.01208, "grad_norm": 3.40625, "learning_rate": 0.0009654423505547629, "loss": 2.3257, "step": 1510 }, { "epoch": 0.012088, "grad_norm": 2.890625, "learning_rate": 0.0009653963371989874, "loss": 1.8533, "step": 1511 }, { "epoch": 0.012096, "grad_norm": 2.734375, "learning_rate": 0.0009653502943282926, "loss": 2.4212, "step": 1512 }, { "epoch": 0.012104, "grad_norm": 3.203125, "learning_rate": 0.000965304221945599, "loss": 2.5108, "step": 1513 }, { "epoch": 0.012112, "grad_norm": 2.375, "learning_rate": 0.0009652581200538282, "loss": 2.1769, "step": 1514 }, { "epoch": 0.01212, "grad_norm": 2.84375, "learning_rate": 0.0009652119886559041, "loss": 2.2921, "step": 1515 }, { "epoch": 0.012128, "grad_norm": 4.09375, "learning_rate": 0.000965165827754752, "loss": 2.4479, "step": 1516 }, { "epoch": 0.012136, "grad_norm": 3.125, "learning_rate": 0.0009651196373532998, "loss": 2.5655, "step": 1517 }, { "epoch": 0.012144, "grad_norm": 4.65625, "learning_rate": 0.0009650734174544764, "loss": 2.9544, "step": 1518 }, { "epoch": 0.012152, "grad_norm": 16.25, "learning_rate": 0.0009650271680612134, "loss": 2.0145, "step": 1519 }, { "epoch": 0.01216, "grad_norm": 8.1875, "learning_rate": 0.0009649808891764436, "loss": 1.4618, "step": 1520 }, { "epoch": 0.012168, "grad_norm": 3.0, "learning_rate": 0.000964934580803102, "loss": 2.1696, "step": 1521 }, { "epoch": 0.012176, "grad_norm": 3.765625, "learning_rate": 0.0009648882429441257, "loss": 2.308, "step": 1522 }, { "epoch": 0.012184, "grad_norm": 4.15625, "learning_rate": 0.0009648418756024531, "loss": 2.3938, "step": 1523 }, { "epoch": 0.012192, "grad_norm": 3.5, "learning_rate": 0.0009647954787810249, "loss": 1.7808, "step": 1524 }, { "epoch": 0.0122, "grad_norm": 2.59375, "learning_rate": 0.0009647490524827833, "loss": 2.0332, "step": 1525 }, { "epoch": 0.012208, "grad_norm": 4.03125, "learning_rate": 0.0009647025967106731, "loss": 2.0282, "step": 1526 }, { "epoch": 0.012216, "grad_norm": 4.90625, "learning_rate": 0.00096465611146764, "loss": 2.1542, "step": 1527 }, { "epoch": 0.012224, "grad_norm": 3.921875, "learning_rate": 0.0009646095967566321, "loss": 2.4009, "step": 1528 }, { "epoch": 0.012232, "grad_norm": 3.734375, "learning_rate": 0.0009645630525805996, "loss": 2.2622, "step": 1529 }, { "epoch": 0.01224, "grad_norm": 8.625, "learning_rate": 0.0009645164789424941, "loss": 2.4965, "step": 1530 }, { "epoch": 0.012248, "grad_norm": 4.84375, 
"learning_rate": 0.0009644698758452692, "loss": 2.6203, "step": 1531 }, { "epoch": 0.012256, "grad_norm": 3.46875, "learning_rate": 0.0009644232432918805, "loss": 2.4983, "step": 1532 }, { "epoch": 0.012264, "grad_norm": 4.71875, "learning_rate": 0.0009643765812852852, "loss": 2.5532, "step": 1533 }, { "epoch": 0.012272, "grad_norm": 3.078125, "learning_rate": 0.0009643298898284428, "loss": 2.1031, "step": 1534 }, { "epoch": 0.01228, "grad_norm": 3.171875, "learning_rate": 0.0009642831689243143, "loss": 2.4462, "step": 1535 }, { "epoch": 0.012288, "grad_norm": 2.875, "learning_rate": 0.0009642364185758627, "loss": 1.8899, "step": 1536 }, { "epoch": 0.012296, "grad_norm": 3.15625, "learning_rate": 0.0009641896387860531, "loss": 2.1491, "step": 1537 }, { "epoch": 0.012304, "grad_norm": 3.515625, "learning_rate": 0.0009641428295578519, "loss": 2.3769, "step": 1538 }, { "epoch": 0.012312, "grad_norm": 4.34375, "learning_rate": 0.0009640959908942276, "loss": 2.6275, "step": 1539 }, { "epoch": 0.01232, "grad_norm": 3.078125, "learning_rate": 0.0009640491227981509, "loss": 1.9994, "step": 1540 }, { "epoch": 0.012328, "grad_norm": 2.546875, "learning_rate": 0.0009640022252725939, "loss": 2.1096, "step": 1541 }, { "epoch": 0.012336, "grad_norm": 3.21875, "learning_rate": 0.0009639552983205313, "loss": 2.6443, "step": 1542 }, { "epoch": 0.012344, "grad_norm": 3.296875, "learning_rate": 0.0009639083419449386, "loss": 2.3, "step": 1543 }, { "epoch": 0.012352, "grad_norm": 3.6875, "learning_rate": 0.0009638613561487939, "loss": 2.9142, "step": 1544 }, { "epoch": 0.01236, "grad_norm": 3.359375, "learning_rate": 0.0009638143409350772, "loss": 2.4846, "step": 1545 }, { "epoch": 0.012368, "grad_norm": 2.859375, "learning_rate": 0.0009637672963067697, "loss": 2.0504, "step": 1546 }, { "epoch": 0.012376, "grad_norm": 2.421875, "learning_rate": 0.0009637202222668554, "loss": 2.1635, "step": 1547 }, { "epoch": 0.012384, "grad_norm": 3.0, "learning_rate": 0.0009636731188183194, "loss": 2.4472, "step": 1548 }, { "epoch": 0.012392, "grad_norm": 3.609375, "learning_rate": 0.0009636259859641489, "loss": 2.6912, "step": 1549 }, { "epoch": 0.0124, "grad_norm": 2.734375, "learning_rate": 0.0009635788237073333, "loss": 2.5965, "step": 1550 }, { "epoch": 0.012408, "grad_norm": 6.0625, "learning_rate": 0.0009635316320508634, "loss": 1.9129, "step": 1551 }, { "epoch": 0.012416, "grad_norm": 2.84375, "learning_rate": 0.0009634844109977319, "loss": 2.3093, "step": 1552 }, { "epoch": 0.012424, "grad_norm": 3.0, "learning_rate": 0.0009634371605509337, "loss": 2.238, "step": 1553 }, { "epoch": 0.012432, "grad_norm": 2.796875, "learning_rate": 0.0009633898807134653, "loss": 2.0332, "step": 1554 }, { "epoch": 0.01244, "grad_norm": 3.765625, "learning_rate": 0.0009633425714883253, "loss": 2.4543, "step": 1555 }, { "epoch": 0.012448, "grad_norm": 3.140625, "learning_rate": 0.0009632952328785136, "loss": 2.4122, "step": 1556 }, { "epoch": 0.012456, "grad_norm": 2.8125, "learning_rate": 0.0009632478648870328, "loss": 2.0024, "step": 1557 }, { "epoch": 0.012464, "grad_norm": 4.40625, "learning_rate": 0.0009632004675168869, "loss": 2.5701, "step": 1558 }, { "epoch": 0.012472, "grad_norm": 3.59375, "learning_rate": 0.0009631530407710813, "loss": 2.5814, "step": 1559 }, { "epoch": 0.01248, "grad_norm": 3.875, "learning_rate": 0.0009631055846526243, "loss": 2.7172, "step": 1560 }, { "epoch": 0.012488, "grad_norm": 3.0, "learning_rate": 0.0009630580991645253, "loss": 2.5776, "step": 1561 }, { "epoch": 0.012496, "grad_norm": 3.59375, 
"learning_rate": 0.0009630105843097957, "loss": 2.6966, "step": 1562 }, { "epoch": 0.012504, "grad_norm": 3.421875, "learning_rate": 0.0009629630400914489, "loss": 2.8895, "step": 1563 }, { "epoch": 0.012512, "grad_norm": 3.5, "learning_rate": 0.0009629154665125001, "loss": 2.3854, "step": 1564 }, { "epoch": 0.01252, "grad_norm": 60.75, "learning_rate": 0.0009628678635759664, "loss": 2.6387, "step": 1565 }, { "epoch": 0.012528, "grad_norm": 3.171875, "learning_rate": 0.0009628202312848669, "loss": 2.0645, "step": 1566 }, { "epoch": 0.012536, "grad_norm": 3.03125, "learning_rate": 0.000962772569642222, "loss": 2.5091, "step": 1567 }, { "epoch": 0.012544, "grad_norm": 2.890625, "learning_rate": 0.0009627248786510546, "loss": 2.292, "step": 1568 }, { "epoch": 0.012552, "grad_norm": 3.328125, "learning_rate": 0.0009626771583143892, "loss": 2.6159, "step": 1569 }, { "epoch": 0.01256, "grad_norm": 3.359375, "learning_rate": 0.0009626294086352519, "loss": 2.0226, "step": 1570 }, { "epoch": 0.012568, "grad_norm": 3.46875, "learning_rate": 0.0009625816296166714, "loss": 2.6413, "step": 1571 }, { "epoch": 0.012576, "grad_norm": 3.765625, "learning_rate": 0.0009625338212616775, "loss": 2.5841, "step": 1572 }, { "epoch": 0.012584, "grad_norm": 3.890625, "learning_rate": 0.0009624859835733021, "loss": 2.3383, "step": 1573 }, { "epoch": 0.012592, "grad_norm": 3.890625, "learning_rate": 0.0009624381165545793, "loss": 1.9732, "step": 1574 }, { "epoch": 0.0126, "grad_norm": 3.265625, "learning_rate": 0.0009623902202085444, "loss": 2.3935, "step": 1575 }, { "epoch": 0.012608, "grad_norm": 3.953125, "learning_rate": 0.0009623422945382351, "loss": 2.6001, "step": 1576 }, { "epoch": 0.012616, "grad_norm": 3.25, "learning_rate": 0.0009622943395466908, "loss": 2.4149, "step": 1577 }, { "epoch": 0.012624, "grad_norm": 3.84375, "learning_rate": 0.000962246355236953, "loss": 2.4343, "step": 1578 }, { "epoch": 0.012632, "grad_norm": 3.59375, "learning_rate": 0.0009621983416120643, "loss": 2.5685, "step": 1579 }, { "epoch": 0.01264, "grad_norm": 4.28125, "learning_rate": 0.0009621502986750701, "loss": 2.8761, "step": 1580 }, { "epoch": 0.012648, "grad_norm": 3.84375, "learning_rate": 0.0009621022264290171, "loss": 2.389, "step": 1581 }, { "epoch": 0.012656, "grad_norm": 4.0, "learning_rate": 0.0009620541248769539, "loss": 2.2099, "step": 1582 }, { "epoch": 0.012664, "grad_norm": 125.0, "learning_rate": 0.0009620059940219311, "loss": 2.7344, "step": 1583 }, { "epoch": 0.012672, "grad_norm": 66.0, "learning_rate": 0.0009619578338670012, "loss": 2.1077, "step": 1584 }, { "epoch": 0.01268, "grad_norm": 3.515625, "learning_rate": 0.0009619096444152182, "loss": 2.6258, "step": 1585 }, { "epoch": 0.012688, "grad_norm": 3.8125, "learning_rate": 0.0009618614256696385, "loss": 2.7372, "step": 1586 }, { "epoch": 0.012696, "grad_norm": 3.984375, "learning_rate": 0.0009618131776333201, "loss": 2.7063, "step": 1587 }, { "epoch": 0.012704, "grad_norm": 3.90625, "learning_rate": 0.0009617649003093226, "loss": 2.5683, "step": 1588 }, { "epoch": 0.012712, "grad_norm": 3.65625, "learning_rate": 0.0009617165937007079, "loss": 2.7038, "step": 1589 }, { "epoch": 0.01272, "grad_norm": 3.75, "learning_rate": 0.0009616682578105394, "loss": 2.0722, "step": 1590 }, { "epoch": 0.012728, "grad_norm": 3.6875, "learning_rate": 0.0009616198926418824, "loss": 2.3366, "step": 1591 }, { "epoch": 0.012736, "grad_norm": 4.4375, "learning_rate": 0.0009615714981978046, "loss": 2.9602, "step": 1592 }, { "epoch": 0.012744, "grad_norm": 7.40625, 
"learning_rate": 0.0009615230744813748, "loss": 2.5518, "step": 1593 }, { "epoch": 0.012752, "grad_norm": 11.1875, "learning_rate": 0.000961474621495664, "loss": 2.9394, "step": 1594 }, { "epoch": 0.01276, "grad_norm": 28.375, "learning_rate": 0.0009614261392437451, "loss": 2.4314, "step": 1595 }, { "epoch": 0.012768, "grad_norm": 9.3125, "learning_rate": 0.0009613776277286927, "loss": 2.5537, "step": 1596 }, { "epoch": 0.012776, "grad_norm": 15.75, "learning_rate": 0.0009613290869535834, "loss": 2.5965, "step": 1597 }, { "epoch": 0.012784, "grad_norm": 157.0, "learning_rate": 0.0009612805169214957, "loss": 2.5689, "step": 1598 }, { "epoch": 0.012792, "grad_norm": 66.0, "learning_rate": 0.0009612319176355098, "loss": 2.8868, "step": 1599 }, { "epoch": 0.0128, "grad_norm": 17536.0, "learning_rate": 0.0009611832890987075, "loss": 2.3734, "step": 1600 }, { "epoch": 0.012808, "grad_norm": 1000.0, "learning_rate": 0.0009611346313141733, "loss": 3.016, "step": 1601 }, { "epoch": 0.012816, "grad_norm": 6272.0, "learning_rate": 0.0009610859442849928, "loss": 3.0115, "step": 1602 }, { "epoch": 0.012824, "grad_norm": 300.0, "learning_rate": 0.0009610372280142536, "loss": 2.7349, "step": 1603 }, { "epoch": 0.012832, "grad_norm": 12.0, "learning_rate": 0.0009609884825050452, "loss": 3.1794, "step": 1604 }, { "epoch": 0.01284, "grad_norm": 7.1875, "learning_rate": 0.0009609397077604591, "loss": 3.1212, "step": 1605 }, { "epoch": 0.012848, "grad_norm": 4.5, "learning_rate": 0.0009608909037835887, "loss": 2.4597, "step": 1606 }, { "epoch": 0.012856, "grad_norm": 32.75, "learning_rate": 0.0009608420705775287, "loss": 2.4046, "step": 1607 }, { "epoch": 0.012864, "grad_norm": 6.65625, "learning_rate": 0.0009607932081453763, "loss": 2.415, "step": 1608 }, { "epoch": 0.012872, "grad_norm": 5.875, "learning_rate": 0.0009607443164902302, "loss": 2.8109, "step": 1609 }, { "epoch": 0.01288, "grad_norm": 19.375, "learning_rate": 0.0009606953956151911, "loss": 2.9357, "step": 1610 }, { "epoch": 0.012888, "grad_norm": 3.84375, "learning_rate": 0.0009606464455233617, "loss": 2.1549, "step": 1611 }, { "epoch": 0.012896, "grad_norm": 4.0625, "learning_rate": 0.0009605974662178459, "loss": 2.8744, "step": 1612 }, { "epoch": 0.012904, "grad_norm": 4.5625, "learning_rate": 0.0009605484577017504, "loss": 2.746, "step": 1613 }, { "epoch": 0.012912, "grad_norm": 5.4375, "learning_rate": 0.000960499419978183, "loss": 2.4468, "step": 1614 }, { "epoch": 0.01292, "grad_norm": 6.40625, "learning_rate": 0.0009604503530502535, "loss": 2.7514, "step": 1615 }, { "epoch": 0.012928, "grad_norm": 3.84375, "learning_rate": 0.0009604012569210741, "loss": 2.6343, "step": 1616 }, { "epoch": 0.012936, "grad_norm": 3.953125, "learning_rate": 0.000960352131593758, "loss": 2.6126, "step": 1617 }, { "epoch": 0.012944, "grad_norm": 4.65625, "learning_rate": 0.0009603029770714209, "loss": 2.3892, "step": 1618 }, { "epoch": 0.012952, "grad_norm": 3.5, "learning_rate": 0.00096025379335718, "loss": 2.4088, "step": 1619 }, { "epoch": 0.01296, "grad_norm": 4.09375, "learning_rate": 0.0009602045804541546, "loss": 2.6671, "step": 1620 }, { "epoch": 0.012968, "grad_norm": 5.40625, "learning_rate": 0.0009601553383654655, "loss": 2.5954, "step": 1621 }, { "epoch": 0.012976, "grad_norm": 6.5625, "learning_rate": 0.0009601060670942359, "loss": 2.7659, "step": 1622 }, { "epoch": 0.012984, "grad_norm": 180.0, "learning_rate": 0.0009600567666435904, "loss": 2.3535, "step": 1623 }, { "epoch": 0.012992, "grad_norm": 2640.0, "learning_rate": 0.0009600074370166556, 
"loss": 2.2217, "step": 1624 }, { "epoch": 0.013, "grad_norm": 91.0, "learning_rate": 0.0009599580782165598, "loss": 2.0964, "step": 1625 }, { "epoch": 0.013008, "grad_norm": 82.5, "learning_rate": 0.0009599086902464333, "loss": 2.2146, "step": 1626 }, { "epoch": 0.013016, "grad_norm": 9216.0, "learning_rate": 0.0009598592731094083, "loss": 2.2224, "step": 1627 }, { "epoch": 0.013024, "grad_norm": 33280.0, "learning_rate": 0.0009598098268086188, "loss": 2.3503, "step": 1628 }, { "epoch": 0.013032, "grad_norm": 117248.0, "learning_rate": 0.0009597603513472006, "loss": 2.1838, "step": 1629 }, { "epoch": 0.01304, "grad_norm": 11072.0, "learning_rate": 0.0009597108467282916, "loss": 2.2981, "step": 1630 }, { "epoch": 0.013048, "grad_norm": 1296.0, "learning_rate": 0.0009596613129550309, "loss": 2.8694, "step": 1631 }, { "epoch": 0.013056, "grad_norm": 8576.0, "learning_rate": 0.00095961175003056, "loss": 2.5409, "step": 1632 }, { "epoch": 0.013064, "grad_norm": 8.625, "learning_rate": 0.0009595621579580223, "loss": 2.8079, "step": 1633 }, { "epoch": 0.013072, "grad_norm": 21504.0, "learning_rate": 0.0009595125367405628, "loss": 1.9427, "step": 1634 }, { "epoch": 0.01308, "grad_norm": 335872.0, "learning_rate": 0.0009594628863813284, "loss": 2.1921, "step": 1635 }, { "epoch": 0.013088, "grad_norm": 47104.0, "learning_rate": 0.000959413206883468, "loss": 2.8641, "step": 1636 }, { "epoch": 0.013096, "grad_norm": 48896.0, "learning_rate": 0.000959363498250132, "loss": 2.9558, "step": 1637 }, { "epoch": 0.013104, "grad_norm": 5376.0, "learning_rate": 0.0009593137604844729, "loss": 3.3235, "step": 1638 }, { "epoch": 0.013112, "grad_norm": 129024.0, "learning_rate": 0.0009592639935896452, "loss": 3.1197, "step": 1639 }, { "epoch": 0.01312, "grad_norm": 25088.0, "learning_rate": 0.0009592141975688047, "loss": 3.3212, "step": 1640 }, { "epoch": 0.013128, "grad_norm": 30464.0, "learning_rate": 0.0009591643724251099, "loss": 3.2661, "step": 1641 }, { "epoch": 0.013136, "grad_norm": 14528.0, "learning_rate": 0.0009591145181617203, "loss": 3.2571, "step": 1642 }, { "epoch": 0.013144, "grad_norm": 5728.0, "learning_rate": 0.0009590646347817977, "loss": 4.1171, "step": 1643 }, { "epoch": 0.013152, "grad_norm": 137.0, "learning_rate": 0.0009590147222885057, "loss": 4.5435, "step": 1644 }, { "epoch": 0.01316, "grad_norm": 748.0, "learning_rate": 0.0009589647806850097, "loss": 4.9383, "step": 1645 }, { "epoch": 0.013168, "grad_norm": 175.0, "learning_rate": 0.0009589148099744769, "loss": 5.5403, "step": 1646 }, { "epoch": 0.013176, "grad_norm": 132.0, "learning_rate": 0.0009588648101600762, "loss": 4.5043, "step": 1647 }, { "epoch": 0.013184, "grad_norm": 25.125, "learning_rate": 0.0009588147812449788, "loss": 4.6111, "step": 1648 }, { "epoch": 0.013192, "grad_norm": 9.375, "learning_rate": 0.0009587647232323574, "loss": 4.0855, "step": 1649 }, { "epoch": 0.0132, "grad_norm": 9.0625, "learning_rate": 0.0009587146361253867, "loss": 3.891, "step": 1650 }, { "epoch": 0.013208, "grad_norm": 6.90625, "learning_rate": 0.000958664519927243, "loss": 3.5506, "step": 1651 }, { "epoch": 0.013216, "grad_norm": 6.9375, "learning_rate": 0.0009586143746411047, "loss": 3.6185, "step": 1652 }, { "epoch": 0.013224, "grad_norm": 10.125, "learning_rate": 0.0009585642002701518, "loss": 3.4196, "step": 1653 }, { "epoch": 0.013232, "grad_norm": 7.84375, "learning_rate": 0.0009585139968175667, "loss": 3.5538, "step": 1654 }, { "epoch": 0.01324, "grad_norm": 5.125, "learning_rate": 0.0009584637642865328, "loss": 3.2, "step": 1655 }, { 
"epoch": 0.013248, "grad_norm": 5.46875, "learning_rate": 0.0009584135026802361, "loss": 2.8272, "step": 1656 }, { "epoch": 0.013256, "grad_norm": 5.59375, "learning_rate": 0.000958363212001864, "loss": 3.1665, "step": 1657 }, { "epoch": 0.013264, "grad_norm": 5.21875, "learning_rate": 0.0009583128922546059, "loss": 2.9226, "step": 1658 }, { "epoch": 0.013272, "grad_norm": 4.375, "learning_rate": 0.0009582625434416532, "loss": 3.2731, "step": 1659 }, { "epoch": 0.01328, "grad_norm": 3.921875, "learning_rate": 0.0009582121655661986, "loss": 3.0717, "step": 1660 }, { "epoch": 0.013288, "grad_norm": 3.953125, "learning_rate": 0.0009581617586314373, "loss": 2.8663, "step": 1661 }, { "epoch": 0.013296, "grad_norm": 4.09375, "learning_rate": 0.0009581113226405658, "loss": 3.3486, "step": 1662 }, { "epoch": 0.013304, "grad_norm": 3.953125, "learning_rate": 0.000958060857596783, "loss": 2.6151, "step": 1663 }, { "epoch": 0.013312, "grad_norm": 5.3125, "learning_rate": 0.0009580103635032892, "loss": 2.2261, "step": 1664 }, { "epoch": 0.01332, "grad_norm": 9.25, "learning_rate": 0.0009579598403632865, "loss": 3.035, "step": 1665 }, { "epoch": 0.013328, "grad_norm": 4.5625, "learning_rate": 0.0009579092881799794, "loss": 3.2414, "step": 1666 }, { "epoch": 0.013336, "grad_norm": 3.515625, "learning_rate": 0.0009578587069565736, "loss": 3.5588, "step": 1667 }, { "epoch": 0.013344, "grad_norm": 4.21875, "learning_rate": 0.0009578080966962768, "loss": 2.8918, "step": 1668 }, { "epoch": 0.013352, "grad_norm": 31.25, "learning_rate": 0.000957757457402299, "loss": 2.4318, "step": 1669 }, { "epoch": 0.01336, "grad_norm": 14.5, "learning_rate": 0.0009577067890778515, "loss": 3.0043, "step": 1670 }, { "epoch": 0.013368, "grad_norm": 5.25, "learning_rate": 0.0009576560917261474, "loss": 2.7823, "step": 1671 }, { "epoch": 0.013376, "grad_norm": 7.34375, "learning_rate": 0.0009576053653504022, "loss": 3.8058, "step": 1672 }, { "epoch": 0.013384, "grad_norm": 4.84375, "learning_rate": 0.0009575546099538329, "loss": 2.8067, "step": 1673 }, { "epoch": 0.013392, "grad_norm": 4.78125, "learning_rate": 0.0009575038255396582, "loss": 3.2137, "step": 1674 }, { "epoch": 0.0134, "grad_norm": 5.25, "learning_rate": 0.0009574530121110989, "loss": 3.2006, "step": 1675 }, { "epoch": 0.013408, "grad_norm": 5.25, "learning_rate": 0.0009574021696713773, "loss": 2.7244, "step": 1676 }, { "epoch": 0.013416, "grad_norm": 6.96875, "learning_rate": 0.000957351298223718, "loss": 3.1275, "step": 1677 }, { "epoch": 0.013424, "grad_norm": 12.5, "learning_rate": 0.0009573003977713472, "loss": 3.3999, "step": 1678 }, { "epoch": 0.013432, "grad_norm": 8.875, "learning_rate": 0.000957249468317493, "loss": 2.846, "step": 1679 }, { "epoch": 0.01344, "grad_norm": 6.40625, "learning_rate": 0.0009571985098653851, "loss": 2.546, "step": 1680 }, { "epoch": 0.013448, "grad_norm": 6.09375, "learning_rate": 0.0009571475224182553, "loss": 2.8758, "step": 1681 }, { "epoch": 0.013456, "grad_norm": 374.0, "learning_rate": 0.0009570965059793373, "loss": 2.8854, "step": 1682 }, { "epoch": 0.013464, "grad_norm": 3.9375, "learning_rate": 0.0009570454605518663, "loss": 2.9951, "step": 1683 }, { "epoch": 0.013472, "grad_norm": 239.0, "learning_rate": 0.0009569943861390797, "loss": 2.2862, "step": 1684 }, { "epoch": 0.01348, "grad_norm": 4.03125, "learning_rate": 0.0009569432827442164, "loss": 2.5532, "step": 1685 }, { "epoch": 0.013488, "grad_norm": 3.40625, "learning_rate": 0.0009568921503705176, "loss": 2.8841, "step": 1686 }, { "epoch": 0.013496, "grad_norm": 
4.5, "learning_rate": 0.0009568409890212257, "loss": 3.2982, "step": 1687 }, { "epoch": 0.013504, "grad_norm": 3.578125, "learning_rate": 0.0009567897986995857, "loss": 2.9228, "step": 1688 }, { "epoch": 0.013512, "grad_norm": 3.578125, "learning_rate": 0.0009567385794088438, "loss": 2.3325, "step": 1689 }, { "epoch": 0.01352, "grad_norm": 4.5, "learning_rate": 0.0009566873311522482, "loss": 3.0614, "step": 1690 }, { "epoch": 0.013528, "grad_norm": 452.0, "learning_rate": 0.0009566360539330492, "loss": 2.5966, "step": 1691 }, { "epoch": 0.013536, "grad_norm": 4.75, "learning_rate": 0.0009565847477544986, "loss": 2.7207, "step": 1692 }, { "epoch": 0.013544, "grad_norm": 4.90625, "learning_rate": 0.0009565334126198504, "loss": 2.7129, "step": 1693 }, { "epoch": 0.013552, "grad_norm": 4.28125, "learning_rate": 0.0009564820485323599, "loss": 2.7854, "step": 1694 }, { "epoch": 0.01356, "grad_norm": 6.21875, "learning_rate": 0.0009564306554952847, "loss": 3.0738, "step": 1695 }, { "epoch": 0.013568, "grad_norm": 3.84375, "learning_rate": 0.0009563792335118841, "loss": 2.6817, "step": 1696 }, { "epoch": 0.013576, "grad_norm": 5.71875, "learning_rate": 0.0009563277825854193, "loss": 3.2775, "step": 1697 }, { "epoch": 0.013584, "grad_norm": 5.125, "learning_rate": 0.0009562763027191531, "loss": 2.6701, "step": 1698 }, { "epoch": 0.013592, "grad_norm": 3.59375, "learning_rate": 0.0009562247939163504, "loss": 2.4586, "step": 1699 }, { "epoch": 0.0136, "grad_norm": 5.5625, "learning_rate": 0.0009561732561802779, "loss": 3.0457, "step": 1700 }, { "epoch": 0.013608, "grad_norm": 152.0, "learning_rate": 0.0009561216895142038, "loss": 2.3603, "step": 1701 }, { "epoch": 0.013616, "grad_norm": 197.0, "learning_rate": 0.0009560700939213987, "loss": 2.5521, "step": 1702 }, { "epoch": 0.013624, "grad_norm": 10.75, "learning_rate": 0.0009560184694051345, "loss": 2.902, "step": 1703 }, { "epoch": 0.013632, "grad_norm": 2624.0, "learning_rate": 0.0009559668159686853, "loss": 2.7219, "step": 1704 }, { "epoch": 0.01364, "grad_norm": 568.0, "learning_rate": 0.0009559151336153271, "loss": 2.7122, "step": 1705 }, { "epoch": 0.013648, "grad_norm": 4352.0, "learning_rate": 0.0009558634223483371, "loss": 3.0346, "step": 1706 }, { "epoch": 0.013656, "grad_norm": 191.0, "learning_rate": 0.000955811682170995, "loss": 3.8762, "step": 1707 }, { "epoch": 0.013664, "grad_norm": 1104.0, "learning_rate": 0.0009557599130865822, "loss": 3.4818, "step": 1708 }, { "epoch": 0.013672, "grad_norm": 4800.0, "learning_rate": 0.0009557081150983818, "loss": 4.3088, "step": 1709 }, { "epoch": 0.01368, "grad_norm": 15360.0, "learning_rate": 0.0009556562882096784, "loss": 5.2712, "step": 1710 }, { "epoch": 0.013688, "grad_norm": 334.0, "learning_rate": 0.0009556044324237594, "loss": 6.1616, "step": 1711 }, { "epoch": 0.013696, "grad_norm": 144.0, "learning_rate": 0.0009555525477439131, "loss": 4.8233, "step": 1712 }, { "epoch": 0.013704, "grad_norm": 21.375, "learning_rate": 0.0009555006341734299, "loss": 4.2935, "step": 1713 }, { "epoch": 0.013712, "grad_norm": 12.0625, "learning_rate": 0.0009554486917156022, "loss": 4.2703, "step": 1714 }, { "epoch": 0.01372, "grad_norm": 11.375, "learning_rate": 0.0009553967203737243, "loss": 3.4716, "step": 1715 }, { "epoch": 0.013728, "grad_norm": 125.0, "learning_rate": 0.0009553447201510919, "loss": 2.4771, "step": 1716 }, { "epoch": 0.013736, "grad_norm": 68.5, "learning_rate": 0.0009552926910510029, "loss": 2.9879, "step": 1717 }, { "epoch": 0.013744, "grad_norm": 7.71875, "learning_rate": 
0.0009552406330767571, "loss": 3.2319, "step": 1718 }, { "epoch": 0.013752, "grad_norm": 7.78125, "learning_rate": 0.0009551885462316557, "loss": 2.8692, "step": 1719 }, { "epoch": 0.01376, "grad_norm": 6.96875, "learning_rate": 0.000955136430519002, "loss": 3.608, "step": 1720 }, { "epoch": 0.013768, "grad_norm": 5.78125, "learning_rate": 0.0009550842859421013, "loss": 2.7555, "step": 1721 }, { "epoch": 0.013776, "grad_norm": 5.65625, "learning_rate": 0.0009550321125042604, "loss": 2.9747, "step": 1722 }, { "epoch": 0.013784, "grad_norm": 6.03125, "learning_rate": 0.0009549799102087881, "loss": 3.2478, "step": 1723 }, { "epoch": 0.013792, "grad_norm": 4.96875, "learning_rate": 0.0009549276790589953, "loss": 2.627, "step": 1724 }, { "epoch": 0.0138, "grad_norm": 13.3125, "learning_rate": 0.0009548754190581938, "loss": 3.0301, "step": 1725 }, { "epoch": 0.013808, "grad_norm": 4.34375, "learning_rate": 0.0009548231302096985, "loss": 3.0802, "step": 1726 }, { "epoch": 0.013816, "grad_norm": 4.84375, "learning_rate": 0.0009547708125168253, "loss": 3.3961, "step": 1727 }, { "epoch": 0.013824, "grad_norm": 4.65625, "learning_rate": 0.000954718465982892, "loss": 2.964, "step": 1728 }, { "epoch": 0.013832, "grad_norm": 7.96875, "learning_rate": 0.0009546660906112186, "loss": 2.8581, "step": 1729 }, { "epoch": 0.01384, "grad_norm": 548.0, "learning_rate": 0.0009546136864051265, "loss": 2.7159, "step": 1730 }, { "epoch": 0.013848, "grad_norm": 15.1875, "learning_rate": 0.0009545612533679391, "loss": 2.8315, "step": 1731 }, { "epoch": 0.013856, "grad_norm": 12.125, "learning_rate": 0.0009545087915029816, "loss": 3.3987, "step": 1732 }, { "epoch": 0.013864, "grad_norm": 980.0, "learning_rate": 0.0009544563008135814, "loss": 2.8346, "step": 1733 }, { "epoch": 0.013872, "grad_norm": 15.375, "learning_rate": 0.0009544037813030669, "loss": 3.2528, "step": 1734 }, { "epoch": 0.01388, "grad_norm": 25.5, "learning_rate": 0.0009543512329747693, "loss": 3.391, "step": 1735 }, { "epoch": 0.013888, "grad_norm": 91.5, "learning_rate": 0.0009542986558320209, "loss": 4.3246, "step": 1736 }, { "epoch": 0.013896, "grad_norm": 11.8125, "learning_rate": 0.0009542460498781562, "loss": 3.719, "step": 1737 }, { "epoch": 0.013904, "grad_norm": 7.09375, "learning_rate": 0.0009541934151165113, "loss": 2.8787, "step": 1738 }, { "epoch": 0.013912, "grad_norm": 11.1875, "learning_rate": 0.0009541407515504243, "loss": 3.1767, "step": 1739 }, { "epoch": 0.01392, "grad_norm": 11.5625, "learning_rate": 0.0009540880591832352, "loss": 2.5473, "step": 1740 }, { "epoch": 0.013928, "grad_norm": 9.125, "learning_rate": 0.0009540353380182853, "loss": 2.4701, "step": 1741 }, { "epoch": 0.013936, "grad_norm": 12.5, "learning_rate": 0.0009539825880589185, "loss": 3.2072, "step": 1742 }, { "epoch": 0.013944, "grad_norm": 7.5625, "learning_rate": 0.0009539298093084801, "loss": 3.0691, "step": 1743 }, { "epoch": 0.013952, "grad_norm": 5.0625, "learning_rate": 0.0009538770017703171, "loss": 2.9854, "step": 1744 }, { "epoch": 0.01396, "grad_norm": 6.875, "learning_rate": 0.0009538241654477787, "loss": 3.0524, "step": 1745 }, { "epoch": 0.013968, "grad_norm": 6.75, "learning_rate": 0.0009537713003442155, "loss": 2.8033, "step": 1746 }, { "epoch": 0.013976, "grad_norm": 6.0625, "learning_rate": 0.0009537184064629801, "loss": 3.0128, "step": 1747 }, { "epoch": 0.013984, "grad_norm": 7.875, "learning_rate": 0.0009536654838074272, "loss": 2.5575, "step": 1748 }, { "epoch": 0.013992, "grad_norm": 6.9375, "learning_rate": 0.0009536125323809132, "loss": 
2.6684, "step": 1749 }, { "epoch": 0.014, "grad_norm": 10.0, "learning_rate": 0.0009535595521867959, "loss": 2.8853, "step": 1750 }, { "epoch": 0.014008, "grad_norm": 4.40625, "learning_rate": 0.0009535065432284353, "loss": 3.4151, "step": 1751 }, { "epoch": 0.014016, "grad_norm": 3.765625, "learning_rate": 0.0009534535055091934, "loss": 2.0483, "step": 1752 }, { "epoch": 0.014024, "grad_norm": 20.875, "learning_rate": 0.0009534004390324335, "loss": 2.6147, "step": 1753 }, { "epoch": 0.014032, "grad_norm": 4.5, "learning_rate": 0.0009533473438015213, "loss": 2.7359, "step": 1754 }, { "epoch": 0.01404, "grad_norm": 5.34375, "learning_rate": 0.0009532942198198237, "loss": 2.7111, "step": 1755 }, { "epoch": 0.014048, "grad_norm": 4.28125, "learning_rate": 0.0009532410670907102, "loss": 3.01, "step": 1756 }, { "epoch": 0.014056, "grad_norm": 4.59375, "learning_rate": 0.0009531878856175513, "loss": 3.3613, "step": 1757 }, { "epoch": 0.014064, "grad_norm": 4.25, "learning_rate": 0.00095313467540372, "loss": 3.2192, "step": 1758 }, { "epoch": 0.014072, "grad_norm": 4.25, "learning_rate": 0.0009530814364525906, "loss": 2.6988, "step": 1759 }, { "epoch": 0.01408, "grad_norm": 3.96875, "learning_rate": 0.0009530281687675395, "loss": 2.5759, "step": 1760 }, { "epoch": 0.014088, "grad_norm": 125.5, "learning_rate": 0.0009529748723519451, "loss": 2.5701, "step": 1761 }, { "epoch": 0.014096, "grad_norm": 3.578125, "learning_rate": 0.000952921547209187, "loss": 2.4377, "step": 1762 }, { "epoch": 0.014104, "grad_norm": 4.25, "learning_rate": 0.0009528681933426474, "loss": 2.9383, "step": 1763 }, { "epoch": 0.014112, "grad_norm": 4.5, "learning_rate": 0.0009528148107557097, "loss": 2.5384, "step": 1764 }, { "epoch": 0.01412, "grad_norm": 4.09375, "learning_rate": 0.0009527613994517595, "loss": 2.9215, "step": 1765 }, { "epoch": 0.014128, "grad_norm": 3.078125, "learning_rate": 0.0009527079594341841, "loss": 2.1204, "step": 1766 }, { "epoch": 0.014136, "grad_norm": 3.125, "learning_rate": 0.0009526544907063726, "loss": 2.2696, "step": 1767 }, { "epoch": 0.014144, "grad_norm": 6.375, "learning_rate": 0.0009526009932717157, "loss": 3.41, "step": 1768 }, { "epoch": 0.014152, "grad_norm": 3.796875, "learning_rate": 0.0009525474671336063, "loss": 2.3253, "step": 1769 }, { "epoch": 0.01416, "grad_norm": 4.96875, "learning_rate": 0.0009524939122954391, "loss": 2.945, "step": 1770 }, { "epoch": 0.014168, "grad_norm": 5.25, "learning_rate": 0.0009524403287606104, "loss": 3.0538, "step": 1771 }, { "epoch": 0.014176, "grad_norm": 3.546875, "learning_rate": 0.0009523867165325183, "loss": 3.1133, "step": 1772 }, { "epoch": 0.014184, "grad_norm": 3.734375, "learning_rate": 0.000952333075614563, "loss": 3.1749, "step": 1773 }, { "epoch": 0.014192, "grad_norm": 3.046875, "learning_rate": 0.0009522794060101462, "loss": 2.6868, "step": 1774 }, { "epoch": 0.0142, "grad_norm": 3.25, "learning_rate": 0.0009522257077226717, "loss": 3.1258, "step": 1775 }, { "epoch": 0.014208, "grad_norm": 3.28125, "learning_rate": 0.0009521719807555448, "loss": 3.0222, "step": 1776 }, { "epoch": 0.014216, "grad_norm": 3.671875, "learning_rate": 0.000952118225112173, "loss": 3.0513, "step": 1777 }, { "epoch": 0.014224, "grad_norm": 4.40625, "learning_rate": 0.0009520644407959653, "loss": 2.5004, "step": 1778 }, { "epoch": 0.014232, "grad_norm": 3.53125, "learning_rate": 0.0009520106278103327, "loss": 2.8099, "step": 1779 }, { "epoch": 0.01424, "grad_norm": 4.21875, "learning_rate": 0.000951956786158688, "loss": 2.6422, "step": 1780 }, { "epoch": 
0.014248, "grad_norm": 3.640625, "learning_rate": 0.0009519029158444457, "loss": 3.2086, "step": 1781 }, { "epoch": 0.014256, "grad_norm": 4.59375, "learning_rate": 0.0009518490168710221, "loss": 2.8074, "step": 1782 }, { "epoch": 0.014264, "grad_norm": 4.6875, "learning_rate": 0.0009517950892418357, "loss": 3.0088, "step": 1783 }, { "epoch": 0.014272, "grad_norm": 5.0625, "learning_rate": 0.0009517411329603063, "loss": 2.9066, "step": 1784 }, { "epoch": 0.01428, "grad_norm": 13.0, "learning_rate": 0.0009516871480298556, "loss": 3.0485, "step": 1785 }, { "epoch": 0.014288, "grad_norm": 7.3125, "learning_rate": 0.0009516331344539078, "loss": 2.0322, "step": 1786 }, { "epoch": 0.014296, "grad_norm": 4.40625, "learning_rate": 0.0009515790922358879, "loss": 2.5755, "step": 1787 }, { "epoch": 0.014304, "grad_norm": 4.59375, "learning_rate": 0.0009515250213792234, "loss": 2.5647, "step": 1788 }, { "epoch": 0.014312, "grad_norm": 4.6875, "learning_rate": 0.0009514709218873433, "loss": 2.9292, "step": 1789 }, { "epoch": 0.01432, "grad_norm": 5.75, "learning_rate": 0.0009514167937636787, "loss": 2.8275, "step": 1790 }, { "epoch": 0.014328, "grad_norm": 6.03125, "learning_rate": 0.0009513626370116621, "loss": 1.9728, "step": 1791 }, { "epoch": 0.014336, "grad_norm": 6.9375, "learning_rate": 0.0009513084516347283, "loss": 3.1421, "step": 1792 }, { "epoch": 0.014344, "grad_norm": 11.3125, "learning_rate": 0.0009512542376363136, "loss": 2.3259, "step": 1793 }, { "epoch": 0.014352, "grad_norm": 11.875, "learning_rate": 0.000951199995019856, "loss": 2.666, "step": 1794 }, { "epoch": 0.01436, "grad_norm": 4.71875, "learning_rate": 0.0009511457237887959, "loss": 2.7335, "step": 1795 }, { "epoch": 0.014368, "grad_norm": 3.875, "learning_rate": 0.0009510914239465746, "loss": 2.6984, "step": 1796 }, { "epoch": 0.014376, "grad_norm": 3.8125, "learning_rate": 0.0009510370954966363, "loss": 2.3031, "step": 1797 }, { "epoch": 0.014384, "grad_norm": 3.875, "learning_rate": 0.0009509827384424261, "loss": 3.1488, "step": 1798 }, { "epoch": 0.014392, "grad_norm": 3.71875, "learning_rate": 0.0009509283527873914, "loss": 3.1044, "step": 1799 }, { "epoch": 0.0144, "grad_norm": 4.15625, "learning_rate": 0.0009508739385349812, "loss": 2.1658, "step": 1800 }, { "epoch": 0.014408, "grad_norm": 3.71875, "learning_rate": 0.0009508194956886465, "loss": 3.1576, "step": 1801 }, { "epoch": 0.014416, "grad_norm": 3.578125, "learning_rate": 0.0009507650242518398, "loss": 2.7158, "step": 1802 }, { "epoch": 0.014424, "grad_norm": 2.828125, "learning_rate": 0.0009507105242280156, "loss": 2.0661, "step": 1803 }, { "epoch": 0.014432, "grad_norm": 3.609375, "learning_rate": 0.0009506559956206307, "loss": 2.4327, "step": 1804 }, { "epoch": 0.01444, "grad_norm": 4.09375, "learning_rate": 0.0009506014384331426, "loss": 2.8335, "step": 1805 }, { "epoch": 0.014448, "grad_norm": 3.65625, "learning_rate": 0.0009505468526690117, "loss": 2.0364, "step": 1806 }, { "epoch": 0.014456, "grad_norm": 20.375, "learning_rate": 0.0009504922383316996, "loss": 2.6127, "step": 1807 }, { "epoch": 0.014464, "grad_norm": 4.625, "learning_rate": 0.00095043759542467, "loss": 2.8881, "step": 1808 }, { "epoch": 0.014472, "grad_norm": 4.15625, "learning_rate": 0.0009503829239513881, "loss": 2.675, "step": 1809 }, { "epoch": 0.01448, "grad_norm": 3.71875, "learning_rate": 0.0009503282239153215, "loss": 2.9855, "step": 1810 }, { "epoch": 0.014488, "grad_norm": 4.4375, "learning_rate": 0.0009502734953199386, "loss": 1.93, "step": 1811 }, { "epoch": 0.014496, 
"grad_norm": 3.609375, "learning_rate": 0.0009502187381687106, "loss": 2.6029, "step": 1812 }, { "epoch": 0.014504, "grad_norm": 3.796875, "learning_rate": 0.0009501639524651102, "loss": 2.5635, "step": 1813 }, { "epoch": 0.014512, "grad_norm": 3.265625, "learning_rate": 0.0009501091382126118, "loss": 2.6514, "step": 1814 }, { "epoch": 0.01452, "grad_norm": 3.59375, "learning_rate": 0.0009500542954146916, "loss": 2.3883, "step": 1815 }, { "epoch": 0.014528, "grad_norm": 3.125, "learning_rate": 0.0009499994240748275, "loss": 2.8491, "step": 1816 }, { "epoch": 0.014536, "grad_norm": 4.59375, "learning_rate": 0.0009499445241964997, "loss": 2.9087, "step": 1817 }, { "epoch": 0.014544, "grad_norm": 27.75, "learning_rate": 0.0009498895957831897, "loss": 2.0625, "step": 1818 }, { "epoch": 0.014552, "grad_norm": 7.21875, "learning_rate": 0.0009498346388383809, "loss": 2.8301, "step": 1819 }, { "epoch": 0.01456, "grad_norm": 5.1875, "learning_rate": 0.0009497796533655589, "loss": 2.8156, "step": 1820 }, { "epoch": 0.014568, "grad_norm": 5.15625, "learning_rate": 0.0009497246393682107, "loss": 2.0179, "step": 1821 }, { "epoch": 0.014576, "grad_norm": 3.609375, "learning_rate": 0.0009496695968498252, "loss": 2.9041, "step": 1822 }, { "epoch": 0.014584, "grad_norm": 4.5, "learning_rate": 0.0009496145258138931, "loss": 2.6081, "step": 1823 }, { "epoch": 0.014592, "grad_norm": 4.3125, "learning_rate": 0.0009495594262639069, "loss": 2.853, "step": 1824 }, { "epoch": 0.0146, "grad_norm": 5.875, "learning_rate": 0.0009495042982033611, "loss": 2.4539, "step": 1825 }, { "epoch": 0.014608, "grad_norm": 5.0625, "learning_rate": 0.0009494491416357517, "loss": 2.9195, "step": 1826 }, { "epoch": 0.014616, "grad_norm": 6.78125, "learning_rate": 0.0009493939565645769, "loss": 2.3782, "step": 1827 }, { "epoch": 0.014624, "grad_norm": 5.5, "learning_rate": 0.0009493387429933362, "loss": 2.2934, "step": 1828 }, { "epoch": 0.014632, "grad_norm": 5.625, "learning_rate": 0.0009492835009255313, "loss": 2.9106, "step": 1829 }, { "epoch": 0.01464, "grad_norm": 4.75, "learning_rate": 0.0009492282303646657, "loss": 2.8992, "step": 1830 }, { "epoch": 0.014648, "grad_norm": 9.5, "learning_rate": 0.0009491729313142443, "loss": 2.7906, "step": 1831 }, { "epoch": 0.014656, "grad_norm": 4.90625, "learning_rate": 0.0009491176037777745, "loss": 2.7732, "step": 1832 }, { "epoch": 0.014664, "grad_norm": 4.0625, "learning_rate": 0.000949062247758765, "loss": 2.6595, "step": 1833 }, { "epoch": 0.014672, "grad_norm": 4.1875, "learning_rate": 0.0009490068632607261, "loss": 2.7472, "step": 1834 }, { "epoch": 0.01468, "grad_norm": 3.6875, "learning_rate": 0.0009489514502871706, "loss": 2.928, "step": 1835 }, { "epoch": 0.014688, "grad_norm": 3.328125, "learning_rate": 0.0009488960088416125, "loss": 2.9612, "step": 1836 }, { "epoch": 0.014696, "grad_norm": 4.0, "learning_rate": 0.0009488405389275681, "loss": 3.5982, "step": 1837 }, { "epoch": 0.014704, "grad_norm": 2.890625, "learning_rate": 0.0009487850405485547, "loss": 2.7505, "step": 1838 }, { "epoch": 0.014712, "grad_norm": 3.0625, "learning_rate": 0.0009487295137080927, "loss": 2.1785, "step": 1839 }, { "epoch": 0.01472, "grad_norm": 3.734375, "learning_rate": 0.0009486739584097031, "loss": 3.3794, "step": 1840 }, { "epoch": 0.014728, "grad_norm": 2.875, "learning_rate": 0.000948618374656909, "loss": 2.3741, "step": 1841 }, { "epoch": 0.014736, "grad_norm": 3.625, "learning_rate": 0.0009485627624532358, "loss": 2.9139, "step": 1842 }, { "epoch": 0.014744, "grad_norm": 4.125, 
"learning_rate": 0.0009485071218022102, "loss": 2.7489, "step": 1843 }, { "epoch": 0.014752, "grad_norm": 4.4375, "learning_rate": 0.0009484514527073609, "loss": 2.7216, "step": 1844 }, { "epoch": 0.01476, "grad_norm": 5.71875, "learning_rate": 0.0009483957551722185, "loss": 2.7165, "step": 1845 }, { "epoch": 0.014768, "grad_norm": 6.0625, "learning_rate": 0.000948340029200315, "loss": 2.7619, "step": 1846 }, { "epoch": 0.014776, "grad_norm": 5.03125, "learning_rate": 0.0009482842747951846, "loss": 2.9371, "step": 1847 }, { "epoch": 0.014784, "grad_norm": 19.75, "learning_rate": 0.0009482284919603635, "loss": 1.722, "step": 1848 }, { "epoch": 0.014792, "grad_norm": 6.6875, "learning_rate": 0.0009481726806993887, "loss": 3.0757, "step": 1849 }, { "epoch": 0.0148, "grad_norm": 6.09375, "learning_rate": 0.0009481168410158003, "loss": 2.7552, "step": 1850 }, { "epoch": 0.014808, "grad_norm": 12.875, "learning_rate": 0.0009480609729131394, "loss": 2.3559, "step": 1851 }, { "epoch": 0.014816, "grad_norm": 3.875, "learning_rate": 0.000948005076394949, "loss": 2.3399, "step": 1852 }, { "epoch": 0.014824, "grad_norm": 5.65625, "learning_rate": 0.0009479491514647742, "loss": 2.485, "step": 1853 }, { "epoch": 0.014832, "grad_norm": 4.53125, "learning_rate": 0.0009478931981261615, "loss": 2.9368, "step": 1854 }, { "epoch": 0.01484, "grad_norm": 109.5, "learning_rate": 0.0009478372163826594, "loss": 2.159, "step": 1855 }, { "epoch": 0.014848, "grad_norm": 3.875, "learning_rate": 0.0009477812062378184, "loss": 2.2016, "step": 1856 }, { "epoch": 0.014856, "grad_norm": 3.921875, "learning_rate": 0.0009477251676951904, "loss": 2.9366, "step": 1857 }, { "epoch": 0.014864, "grad_norm": 3.328125, "learning_rate": 0.0009476691007583292, "loss": 2.2999, "step": 1858 }, { "epoch": 0.014872, "grad_norm": 3.46875, "learning_rate": 0.0009476130054307907, "loss": 2.4486, "step": 1859 }, { "epoch": 0.01488, "grad_norm": 3.265625, "learning_rate": 0.0009475568817161326, "loss": 2.0589, "step": 1860 }, { "epoch": 0.014888, "grad_norm": 4.34375, "learning_rate": 0.0009475007296179139, "loss": 2.6214, "step": 1861 }, { "epoch": 0.014896, "grad_norm": 3.875, "learning_rate": 0.0009474445491396957, "loss": 2.5102, "step": 1862 }, { "epoch": 0.014904, "grad_norm": 5.03125, "learning_rate": 0.000947388340285041, "loss": 3.3666, "step": 1863 }, { "epoch": 0.014912, "grad_norm": 4.0, "learning_rate": 0.0009473321030575145, "loss": 2.5488, "step": 1864 }, { "epoch": 0.01492, "grad_norm": 3.78125, "learning_rate": 0.0009472758374606826, "loss": 2.0991, "step": 1865 }, { "epoch": 0.014928, "grad_norm": 2.703125, "learning_rate": 0.0009472195434981139, "loss": 2.0958, "step": 1866 }, { "epoch": 0.014936, "grad_norm": 3.265625, "learning_rate": 0.0009471632211733781, "loss": 2.6042, "step": 1867 }, { "epoch": 0.014944, "grad_norm": 3.796875, "learning_rate": 0.0009471068704900474, "loss": 2.5553, "step": 1868 }, { "epoch": 0.014952, "grad_norm": 3.25, "learning_rate": 0.0009470504914516953, "loss": 2.8116, "step": 1869 }, { "epoch": 0.01496, "grad_norm": 3.609375, "learning_rate": 0.0009469940840618976, "loss": 2.2402, "step": 1870 }, { "epoch": 0.014968, "grad_norm": 4.3125, "learning_rate": 0.0009469376483242311, "loss": 2.8208, "step": 1871 }, { "epoch": 0.014976, "grad_norm": 4.4375, "learning_rate": 0.0009468811842422753, "loss": 3.0299, "step": 1872 }, { "epoch": 0.014984, "grad_norm": 3.171875, "learning_rate": 0.000946824691819611, "loss": 2.1514, "step": 1873 }, { "epoch": 0.014992, "grad_norm": 4.1875, "learning_rate": 
0.0009467681710598208, "loss": 2.2212, "step": 1874 }, { "epoch": 0.015, "grad_norm": 4.6875, "learning_rate": 0.0009467116219664893, "loss": 2.4741, "step": 1875 }, { "epoch": 0.015008, "grad_norm": 3.390625, "learning_rate": 0.0009466550445432026, "loss": 2.4659, "step": 1876 }, { "epoch": 0.015016, "grad_norm": 2.9375, "learning_rate": 0.0009465984387935489, "loss": 1.9163, "step": 1877 }, { "epoch": 0.015024, "grad_norm": 4.9375, "learning_rate": 0.000946541804721118, "loss": 3.0359, "step": 1878 }, { "epoch": 0.015032, "grad_norm": 3.3125, "learning_rate": 0.0009464851423295018, "loss": 2.0166, "step": 1879 }, { "epoch": 0.01504, "grad_norm": 4.6875, "learning_rate": 0.0009464284516222936, "loss": 3.2692, "step": 1880 }, { "epoch": 0.015048, "grad_norm": 4.6875, "learning_rate": 0.0009463717326030885, "loss": 2.4765, "step": 1881 }, { "epoch": 0.015056, "grad_norm": 4.96875, "learning_rate": 0.0009463149852754838, "loss": 3.1633, "step": 1882 }, { "epoch": 0.015064, "grad_norm": 9.4375, "learning_rate": 0.0009462582096430783, "loss": 2.2447, "step": 1883 }, { "epoch": 0.015072, "grad_norm": 3.453125, "learning_rate": 0.0009462014057094726, "loss": 2.3417, "step": 1884 }, { "epoch": 0.01508, "grad_norm": 5.71875, "learning_rate": 0.0009461445734782692, "loss": 2.87, "step": 1885 }, { "epoch": 0.015088, "grad_norm": 3.515625, "learning_rate": 0.0009460877129530723, "loss": 2.294, "step": 1886 }, { "epoch": 0.015096, "grad_norm": 4.8125, "learning_rate": 0.0009460308241374878, "loss": 3.237, "step": 1887 }, { "epoch": 0.015104, "grad_norm": 3.453125, "learning_rate": 0.0009459739070351237, "loss": 1.9501, "step": 1888 }, { "epoch": 0.015112, "grad_norm": 4.40625, "learning_rate": 0.0009459169616495896, "loss": 2.4074, "step": 1889 }, { "epoch": 0.01512, "grad_norm": 3.578125, "learning_rate": 0.0009458599879844969, "loss": 2.6216, "step": 1890 }, { "epoch": 0.015128, "grad_norm": 5.0625, "learning_rate": 0.0009458029860434587, "loss": 2.3515, "step": 1891 }, { "epoch": 0.015136, "grad_norm": 3.828125, "learning_rate": 0.0009457459558300902, "loss": 2.659, "step": 1892 }, { "epoch": 0.015144, "grad_norm": 4.5625, "learning_rate": 0.0009456888973480082, "loss": 2.8854, "step": 1893 }, { "epoch": 0.015152, "grad_norm": 4.78125, "learning_rate": 0.0009456318106008309, "loss": 3.4461, "step": 1894 }, { "epoch": 0.01516, "grad_norm": 3.734375, "learning_rate": 0.0009455746955921791, "loss": 2.6252, "step": 1895 }, { "epoch": 0.015168, "grad_norm": 6.96875, "learning_rate": 0.0009455175523256748, "loss": 2.7538, "step": 1896 }, { "epoch": 0.015176, "grad_norm": 4.5625, "learning_rate": 0.000945460380804942, "loss": 3.2448, "step": 1897 }, { "epoch": 0.015184, "grad_norm": 3.640625, "learning_rate": 0.0009454031810336064, "loss": 2.4004, "step": 1898 }, { "epoch": 0.015192, "grad_norm": 8.5625, "learning_rate": 0.0009453459530152957, "loss": 2.3201, "step": 1899 }, { "epoch": 0.0152, "grad_norm": 3.0625, "learning_rate": 0.0009452886967536389, "loss": 2.2991, "step": 1900 }, { "epoch": 0.015208, "grad_norm": 3.40625, "learning_rate": 0.0009452314122522676, "loss": 2.6102, "step": 1901 }, { "epoch": 0.015216, "grad_norm": 5.09375, "learning_rate": 0.0009451740995148143, "loss": 2.4964, "step": 1902 }, { "epoch": 0.015224, "grad_norm": 4.09375, "learning_rate": 0.0009451167585449138, "loss": 3.0048, "step": 1903 }, { "epoch": 0.015232, "grad_norm": 5.21875, "learning_rate": 0.0009450593893462029, "loss": 2.6576, "step": 1904 }, { "epoch": 0.01524, "grad_norm": 6.6875, "learning_rate": 
0.0009450019919223196, "loss": 2.1955, "step": 1905 }, { "epoch": 0.015248, "grad_norm": 5.53125, "learning_rate": 0.0009449445662769039, "loss": 2.3768, "step": 1906 }, { "epoch": 0.015256, "grad_norm": 6.75, "learning_rate": 0.000944887112413598, "loss": 2.6027, "step": 1907 }, { "epoch": 0.015264, "grad_norm": 3.578125, "learning_rate": 0.0009448296303360453, "loss": 2.3413, "step": 1908 }, { "epoch": 0.015272, "grad_norm": 5.53125, "learning_rate": 0.0009447721200478914, "loss": 2.1708, "step": 1909 }, { "epoch": 0.01528, "grad_norm": 3.265625, "learning_rate": 0.0009447145815527833, "loss": 2.1096, "step": 1910 }, { "epoch": 0.015288, "grad_norm": 4.4375, "learning_rate": 0.0009446570148543703, "loss": 2.1626, "step": 1911 }, { "epoch": 0.015296, "grad_norm": 3.828125, "learning_rate": 0.0009445994199563033, "loss": 2.6211, "step": 1912 }, { "epoch": 0.015304, "grad_norm": 3.125, "learning_rate": 0.0009445417968622345, "loss": 2.2542, "step": 1913 }, { "epoch": 0.015312, "grad_norm": 3.734375, "learning_rate": 0.0009444841455758186, "loss": 2.3958, "step": 1914 }, { "epoch": 0.01532, "grad_norm": 4.125, "learning_rate": 0.0009444264661007116, "loss": 2.5735, "step": 1915 }, { "epoch": 0.015328, "grad_norm": 4.28125, "learning_rate": 0.0009443687584405715, "loss": 2.9886, "step": 1916 }, { "epoch": 0.015336, "grad_norm": 4.0, "learning_rate": 0.0009443110225990583, "loss": 2.9884, "step": 1917 }, { "epoch": 0.015344, "grad_norm": 4.0, "learning_rate": 0.0009442532585798334, "loss": 2.5562, "step": 1918 }, { "epoch": 0.015352, "grad_norm": 4.15625, "learning_rate": 0.0009441954663865599, "loss": 2.6029, "step": 1919 }, { "epoch": 0.01536, "grad_norm": 3.328125, "learning_rate": 0.0009441376460229032, "loss": 2.6729, "step": 1920 }, { "epoch": 0.015368, "grad_norm": 3.140625, "learning_rate": 0.0009440797974925301, "loss": 2.5426, "step": 1921 }, { "epoch": 0.015376, "grad_norm": 3.75, "learning_rate": 0.0009440219207991092, "loss": 2.4386, "step": 1922 }, { "epoch": 0.015384, "grad_norm": 4.34375, "learning_rate": 0.0009439640159463112, "loss": 3.1575, "step": 1923 }, { "epoch": 0.015392, "grad_norm": 5.4375, "learning_rate": 0.0009439060829378081, "loss": 2.6503, "step": 1924 }, { "epoch": 0.0154, "grad_norm": 3.75, "learning_rate": 0.0009438481217772743, "loss": 2.4362, "step": 1925 }, { "epoch": 0.015408, "grad_norm": 4.1875, "learning_rate": 0.0009437901324683853, "loss": 2.4339, "step": 1926 }, { "epoch": 0.015416, "grad_norm": 3.71875, "learning_rate": 0.0009437321150148188, "loss": 2.7774, "step": 1927 }, { "epoch": 0.015424, "grad_norm": 4.96875, "learning_rate": 0.0009436740694202542, "loss": 2.4869, "step": 1928 }, { "epoch": 0.015432, "grad_norm": 6.15625, "learning_rate": 0.0009436159956883728, "loss": 2.8365, "step": 1929 }, { "epoch": 0.01544, "grad_norm": 3.046875, "learning_rate": 0.0009435578938228576, "loss": 2.1284, "step": 1930 }, { "epoch": 0.015448, "grad_norm": 3.125, "learning_rate": 0.000943499763827393, "loss": 2.349, "step": 1931 }, { "epoch": 0.015456, "grad_norm": 3.890625, "learning_rate": 0.0009434416057056658, "loss": 2.7074, "step": 1932 }, { "epoch": 0.015464, "grad_norm": 55.5, "learning_rate": 0.0009433834194613644, "loss": 3.0703, "step": 1933 }, { "epoch": 0.015472, "grad_norm": 4.40625, "learning_rate": 0.0009433252050981788, "loss": 2.7756, "step": 1934 }, { "epoch": 0.01548, "grad_norm": 4.53125, "learning_rate": 0.000943266962619801, "loss": 2.1847, "step": 1935 }, { "epoch": 0.015488, "grad_norm": 4.0625, "learning_rate": 0.0009432086920299245, 
"loss": 2.7324, "step": 1936 }, { "epoch": 0.015496, "grad_norm": 4.9375, "learning_rate": 0.0009431503933322449, "loss": 2.3152, "step": 1937 }, { "epoch": 0.015504, "grad_norm": 4.375, "learning_rate": 0.0009430920665304594, "loss": 2.6246, "step": 1938 }, { "epoch": 0.015512, "grad_norm": 4.3125, "learning_rate": 0.0009430337116282669, "loss": 2.8588, "step": 1939 }, { "epoch": 0.01552, "grad_norm": 3.78125, "learning_rate": 0.0009429753286293683, "loss": 2.1823, "step": 1940 }, { "epoch": 0.015528, "grad_norm": 4.375, "learning_rate": 0.0009429169175374664, "loss": 2.5238, "step": 1941 }, { "epoch": 0.015536, "grad_norm": 2.984375, "learning_rate": 0.0009428584783562651, "loss": 2.3751, "step": 1942 }, { "epoch": 0.015544, "grad_norm": 2.65625, "learning_rate": 0.0009428000110894707, "loss": 2.0947, "step": 1943 }, { "epoch": 0.015552, "grad_norm": 2.734375, "learning_rate": 0.0009427415157407916, "loss": 2.404, "step": 1944 }, { "epoch": 0.01556, "grad_norm": 3.234375, "learning_rate": 0.000942682992313937, "loss": 2.2722, "step": 1945 }, { "epoch": 0.015568, "grad_norm": 3.890625, "learning_rate": 0.0009426244408126184, "loss": 2.6087, "step": 1946 }, { "epoch": 0.015576, "grad_norm": 4.8125, "learning_rate": 0.0009425658612405492, "loss": 2.4504, "step": 1947 }, { "epoch": 0.015584, "grad_norm": 3.4375, "learning_rate": 0.0009425072536014446, "loss": 2.6989, "step": 1948 }, { "epoch": 0.015592, "grad_norm": 3.203125, "learning_rate": 0.0009424486178990213, "loss": 2.3022, "step": 1949 }, { "epoch": 0.0156, "grad_norm": 3.578125, "learning_rate": 0.0009423899541369978, "loss": 2.7904, "step": 1950 }, { "epoch": 0.015608, "grad_norm": 4.5, "learning_rate": 0.0009423312623190946, "loss": 2.4093, "step": 1951 }, { "epoch": 0.015616, "grad_norm": 3.71875, "learning_rate": 0.0009422725424490339, "loss": 3.3801, "step": 1952 }, { "epoch": 0.015624, "grad_norm": 3.46875, "learning_rate": 0.0009422137945305396, "loss": 2.4903, "step": 1953 }, { "epoch": 0.015632, "grad_norm": 3.1875, "learning_rate": 0.0009421550185673373, "loss": 1.9227, "step": 1954 }, { "epoch": 0.01564, "grad_norm": 3.53125, "learning_rate": 0.0009420962145631549, "loss": 2.6215, "step": 1955 }, { "epoch": 0.015648, "grad_norm": 5.09375, "learning_rate": 0.0009420373825217212, "loss": 3.2512, "step": 1956 }, { "epoch": 0.015656, "grad_norm": 3.4375, "learning_rate": 0.0009419785224467675, "loss": 2.5304, "step": 1957 }, { "epoch": 0.015664, "grad_norm": 5.40625, "learning_rate": 0.0009419196343420267, "loss": 2.6813, "step": 1958 }, { "epoch": 0.015672, "grad_norm": 3.890625, "learning_rate": 0.0009418607182112333, "loss": 2.2973, "step": 1959 }, { "epoch": 0.01568, "grad_norm": 4.25, "learning_rate": 0.0009418017740581237, "loss": 2.4879, "step": 1960 }, { "epoch": 0.015688, "grad_norm": 6.34375, "learning_rate": 0.0009417428018864362, "loss": 2.7289, "step": 1961 }, { "epoch": 0.015696, "grad_norm": 5.8125, "learning_rate": 0.0009416838016999108, "loss": 2.4296, "step": 1962 }, { "epoch": 0.015704, "grad_norm": 4.28125, "learning_rate": 0.000941624773502289, "loss": 1.7515, "step": 1963 }, { "epoch": 0.015712, "grad_norm": 5.5625, "learning_rate": 0.0009415657172973142, "loss": 2.8563, "step": 1964 }, { "epoch": 0.01572, "grad_norm": 4.59375, "learning_rate": 0.0009415066330887321, "loss": 2.7107, "step": 1965 }, { "epoch": 0.015728, "grad_norm": 4.75, "learning_rate": 0.0009414475208802893, "loss": 2.7495, "step": 1966 }, { "epoch": 0.015736, "grad_norm": 3.9375, "learning_rate": 0.0009413883806757351, "loss": 3.8129, 
"step": 1967 }, { "epoch": 0.015744, "grad_norm": 4.75, "learning_rate": 0.0009413292124788198, "loss": 2.8651, "step": 1968 }, { "epoch": 0.015752, "grad_norm": 3.578125, "learning_rate": 0.0009412700162932957, "loss": 2.2376, "step": 1969 }, { "epoch": 0.01576, "grad_norm": 3.546875, "learning_rate": 0.0009412107921229171, "loss": 2.7573, "step": 1970 }, { "epoch": 0.015768, "grad_norm": 3.234375, "learning_rate": 0.00094115153997144, "loss": 2.3501, "step": 1971 }, { "epoch": 0.015776, "grad_norm": 2.9375, "learning_rate": 0.0009410922598426222, "loss": 2.5103, "step": 1972 }, { "epoch": 0.015784, "grad_norm": 3.59375, "learning_rate": 0.0009410329517402229, "loss": 2.7505, "step": 1973 }, { "epoch": 0.015792, "grad_norm": 3.875, "learning_rate": 0.0009409736156680035, "loss": 2.6499, "step": 1974 }, { "epoch": 0.0158, "grad_norm": 185.0, "learning_rate": 0.0009409142516297269, "loss": 2.3101, "step": 1975 }, { "epoch": 0.015808, "grad_norm": 3.90625, "learning_rate": 0.0009408548596291582, "loss": 2.3463, "step": 1976 }, { "epoch": 0.015816, "grad_norm": 4.0, "learning_rate": 0.0009407954396700635, "loss": 2.494, "step": 1977 }, { "epoch": 0.015824, "grad_norm": 3.765625, "learning_rate": 0.0009407359917562116, "loss": 3.3045, "step": 1978 }, { "epoch": 0.015832, "grad_norm": 3.34375, "learning_rate": 0.0009406765158913724, "loss": 2.7237, "step": 1979 }, { "epoch": 0.01584, "grad_norm": 4.0, "learning_rate": 0.0009406170120793178, "loss": 2.596, "step": 1980 }, { "epoch": 0.015848, "grad_norm": 5.65625, "learning_rate": 0.0009405574803238216, "loss": 2.4166, "step": 1981 }, { "epoch": 0.015856, "grad_norm": 3.765625, "learning_rate": 0.0009404979206286591, "loss": 2.8139, "step": 1982 }, { "epoch": 0.015864, "grad_norm": 4.75, "learning_rate": 0.0009404383329976076, "loss": 2.7056, "step": 1983 }, { "epoch": 0.015872, "grad_norm": 5.0625, "learning_rate": 0.0009403787174344459, "loss": 2.1955, "step": 1984 }, { "epoch": 0.01588, "grad_norm": 3.421875, "learning_rate": 0.0009403190739429551, "loss": 2.5193, "step": 1985 }, { "epoch": 0.015888, "grad_norm": 4.53125, "learning_rate": 0.0009402594025269173, "loss": 3.0063, "step": 1986 }, { "epoch": 0.015896, "grad_norm": 4.03125, "learning_rate": 0.0009401997031901171, "loss": 2.7363, "step": 1987 }, { "epoch": 0.015904, "grad_norm": 2.96875, "learning_rate": 0.0009401399759363404, "loss": 2.2394, "step": 1988 }, { "epoch": 0.015912, "grad_norm": 2.78125, "learning_rate": 0.0009400802207693752, "loss": 2.1539, "step": 1989 }, { "epoch": 0.01592, "grad_norm": 3.25, "learning_rate": 0.0009400204376930109, "loss": 2.4843, "step": 1990 }, { "epoch": 0.015928, "grad_norm": 3.625, "learning_rate": 0.000939960626711039, "loss": 2.5537, "step": 1991 }, { "epoch": 0.015936, "grad_norm": 3.5, "learning_rate": 0.0009399007878272528, "loss": 2.4474, "step": 1992 }, { "epoch": 0.015944, "grad_norm": 3.40625, "learning_rate": 0.0009398409210454468, "loss": 2.4076, "step": 1993 }, { "epoch": 0.015952, "grad_norm": 3.59375, "learning_rate": 0.0009397810263694183, "loss": 2.7296, "step": 1994 }, { "epoch": 0.01596, "grad_norm": 4.5625, "learning_rate": 0.000939721103802965, "loss": 3.219, "step": 1995 }, { "epoch": 0.015968, "grad_norm": 2.96875, "learning_rate": 0.0009396611533498877, "loss": 2.0824, "step": 1996 }, { "epoch": 0.015976, "grad_norm": 3.46875, "learning_rate": 0.0009396011750139881, "loss": 2.8568, "step": 1997 }, { "epoch": 0.015984, "grad_norm": 3.734375, "learning_rate": 0.0009395411687990702, "loss": 2.6632, "step": 1998 }, { "epoch": 
0.015992, "grad_norm": 4.5, "learning_rate": 0.0009394811347089394, "loss": 2.4228, "step": 1999 }, { "epoch": 0.016, "grad_norm": 3.453125, "learning_rate": 0.0009394210727474029, "loss": 2.2591, "step": 2000 }, { "epoch": 0.016008, "grad_norm": 3.03125, "learning_rate": 0.0009393609829182699, "loss": 2.4385, "step": 2001 }, { "epoch": 0.016016, "grad_norm": 3.390625, "learning_rate": 0.0009393008652253511, "loss": 2.314, "step": 2002 }, { "epoch": 0.016024, "grad_norm": 3.484375, "learning_rate": 0.0009392407196724593, "loss": 3.0159, "step": 2003 }, { "epoch": 0.016032, "grad_norm": 3.28125, "learning_rate": 0.0009391805462634086, "loss": 2.6126, "step": 2004 }, { "epoch": 0.01604, "grad_norm": 4.0, "learning_rate": 0.0009391203450020153, "loss": 2.8481, "step": 2005 }, { "epoch": 0.016048, "grad_norm": 4.5, "learning_rate": 0.0009390601158920973, "loss": 1.9919, "step": 2006 }, { "epoch": 0.016056, "grad_norm": 3.859375, "learning_rate": 0.0009389998589374742, "loss": 2.755, "step": 2007 }, { "epoch": 0.016064, "grad_norm": 3.546875, "learning_rate": 0.0009389395741419675, "loss": 2.3468, "step": 2008 }, { "epoch": 0.016072, "grad_norm": 3.4375, "learning_rate": 0.0009388792615094004, "loss": 2.9755, "step": 2009 }, { "epoch": 0.01608, "grad_norm": 3.125, "learning_rate": 0.0009388189210435977, "loss": 2.8788, "step": 2010 }, { "epoch": 0.016088, "grad_norm": 3.265625, "learning_rate": 0.0009387585527483864, "loss": 2.5872, "step": 2011 }, { "epoch": 0.016096, "grad_norm": 2.6875, "learning_rate": 0.0009386981566275948, "loss": 2.3506, "step": 2012 }, { "epoch": 0.016104, "grad_norm": 3.546875, "learning_rate": 0.000938637732685053, "loss": 2.3691, "step": 2013 }, { "epoch": 0.016112, "grad_norm": 21.0, "learning_rate": 0.0009385772809245934, "loss": 2.22, "step": 2014 }, { "epoch": 0.01612, "grad_norm": 2.640625, "learning_rate": 0.0009385168013500495, "loss": 1.8839, "step": 2015 }, { "epoch": 0.016128, "grad_norm": 3.625, "learning_rate": 0.0009384562939652569, "loss": 2.3058, "step": 2016 }, { "epoch": 0.016136, "grad_norm": 3.171875, "learning_rate": 0.000938395758774053, "loss": 2.3271, "step": 2017 }, { "epoch": 0.016144, "grad_norm": 3.90625, "learning_rate": 0.0009383351957802769, "loss": 2.3144, "step": 2018 }, { "epoch": 0.016152, "grad_norm": 3.703125, "learning_rate": 0.0009382746049877693, "loss": 3.0411, "step": 2019 }, { "epoch": 0.01616, "grad_norm": 4.90625, "learning_rate": 0.0009382139864003726, "loss": 3.1926, "step": 2020 }, { "epoch": 0.016168, "grad_norm": 4.0625, "learning_rate": 0.0009381533400219318, "loss": 2.7928, "step": 2021 }, { "epoch": 0.016176, "grad_norm": 4.84375, "learning_rate": 0.0009380926658562925, "loss": 2.625, "step": 2022 }, { "epoch": 0.016184, "grad_norm": 5.125, "learning_rate": 0.0009380319639073027, "loss": 3.3456, "step": 2023 }, { "epoch": 0.016192, "grad_norm": 3.578125, "learning_rate": 0.0009379712341788122, "loss": 2.1623, "step": 2024 }, { "epoch": 0.0162, "grad_norm": 3.21875, "learning_rate": 0.0009379104766746722, "loss": 2.4597, "step": 2025 }, { "epoch": 0.016208, "grad_norm": 3.453125, "learning_rate": 0.0009378496913987362, "loss": 2.5024, "step": 2026 }, { "epoch": 0.016216, "grad_norm": 3.5, "learning_rate": 0.0009377888783548586, "loss": 2.5561, "step": 2027 }, { "epoch": 0.016224, "grad_norm": 3.28125, "learning_rate": 0.0009377280375468967, "loss": 1.9854, "step": 2028 }, { "epoch": 0.016232, "grad_norm": 3.453125, "learning_rate": 0.0009376671689787084, "loss": 2.6797, "step": 2029 }, { "epoch": 0.01624, 
"grad_norm": 3.71875, "learning_rate": 0.0009376062726541543, "loss": 3.1037, "step": 2030 }, { "epoch": 0.016248, "grad_norm": 3.65625, "learning_rate": 0.0009375453485770961, "loss": 2.4399, "step": 2031 }, { "epoch": 0.016256, "grad_norm": 3.265625, "learning_rate": 0.0009374843967513978, "loss": 2.4585, "step": 2032 }, { "epoch": 0.016264, "grad_norm": 16.125, "learning_rate": 0.0009374234171809248, "loss": 2.795, "step": 2033 }, { "epoch": 0.016272, "grad_norm": 3.375, "learning_rate": 0.0009373624098695443, "loss": 2.4325, "step": 2034 }, { "epoch": 0.01628, "grad_norm": 3.65625, "learning_rate": 0.0009373013748211255, "loss": 2.3227, "step": 2035 }, { "epoch": 0.016288, "grad_norm": 3.390625, "learning_rate": 0.000937240312039539, "loss": 2.2662, "step": 2036 }, { "epoch": 0.016296, "grad_norm": 3.46875, "learning_rate": 0.0009371792215286574, "loss": 3.3621, "step": 2037 }, { "epoch": 0.016304, "grad_norm": 10.6875, "learning_rate": 0.0009371181032923548, "loss": 2.4241, "step": 2038 }, { "epoch": 0.016312, "grad_norm": 3.25, "learning_rate": 0.0009370569573345076, "loss": 2.3139, "step": 2039 }, { "epoch": 0.01632, "grad_norm": 3.96875, "learning_rate": 0.0009369957836589933, "loss": 2.658, "step": 2040 }, { "epoch": 0.016328, "grad_norm": 2.703125, "learning_rate": 0.0009369345822696915, "loss": 1.9932, "step": 2041 }, { "epoch": 0.016336, "grad_norm": 2.984375, "learning_rate": 0.0009368733531704838, "loss": 2.025, "step": 2042 }, { "epoch": 0.016344, "grad_norm": 4.0625, "learning_rate": 0.000936812096365253, "loss": 2.7226, "step": 2043 }, { "epoch": 0.016352, "grad_norm": 14.625, "learning_rate": 0.0009367508118578841, "loss": 2.6992, "step": 2044 }, { "epoch": 0.01636, "grad_norm": 4.125, "learning_rate": 0.0009366894996522636, "loss": 3.2481, "step": 2045 }, { "epoch": 0.016368, "grad_norm": 3.671875, "learning_rate": 0.0009366281597522799, "loss": 2.5633, "step": 2046 }, { "epoch": 0.016376, "grad_norm": 3.921875, "learning_rate": 0.000936566792161823, "loss": 2.5563, "step": 2047 }, { "epoch": 0.016384, "grad_norm": 3.703125, "learning_rate": 0.0009365053968847849, "loss": 2.4693, "step": 2048 }, { "epoch": 0.016392, "grad_norm": 4.0, "learning_rate": 0.0009364439739250592, "loss": 2.3194, "step": 2049 }, { "epoch": 0.0164, "grad_norm": 3.671875, "learning_rate": 0.0009363825232865413, "loss": 2.9378, "step": 2050 }, { "epoch": 0.016408, "grad_norm": 3.859375, "learning_rate": 0.0009363210449731281, "loss": 2.453, "step": 2051 }, { "epoch": 0.016416, "grad_norm": 4.6875, "learning_rate": 0.0009362595389887188, "loss": 3.031, "step": 2052 }, { "epoch": 0.016424, "grad_norm": 4.0625, "learning_rate": 0.0009361980053372139, "loss": 2.5704, "step": 2053 }, { "epoch": 0.016432, "grad_norm": 4.84375, "learning_rate": 0.0009361364440225159, "loss": 3.0753, "step": 2054 }, { "epoch": 0.01644, "grad_norm": 3.3125, "learning_rate": 0.0009360748550485288, "loss": 2.5644, "step": 2055 }, { "epoch": 0.016448, "grad_norm": 3.703125, "learning_rate": 0.0009360132384191584, "loss": 2.4295, "step": 2056 }, { "epoch": 0.016456, "grad_norm": 12.625, "learning_rate": 0.0009359515941383126, "loss": 2.4892, "step": 2057 }, { "epoch": 0.016464, "grad_norm": 3.84375, "learning_rate": 0.0009358899222099009, "loss": 1.9385, "step": 2058 }, { "epoch": 0.016472, "grad_norm": 3.375, "learning_rate": 0.000935828222637834, "loss": 2.1664, "step": 2059 }, { "epoch": 0.01648, "grad_norm": 4.09375, "learning_rate": 0.0009357664954260253, "loss": 2.3635, "step": 2060 }, { "epoch": 0.016488, "grad_norm": 
3.578125, "learning_rate": 0.0009357047405783893, "loss": 2.5212, "step": 2061 }, { "epoch": 0.016496, "grad_norm": 3.46875, "learning_rate": 0.0009356429580988423, "loss": 2.2528, "step": 2062 }, { "epoch": 0.016504, "grad_norm": 3.359375, "learning_rate": 0.0009355811479913028, "loss": 2.3958, "step": 2063 }, { "epoch": 0.016512, "grad_norm": 4.78125, "learning_rate": 0.0009355193102596903, "loss": 2.2866, "step": 2064 }, { "epoch": 0.01652, "grad_norm": 4.40625, "learning_rate": 0.0009354574449079267, "loss": 2.8604, "step": 2065 }, { "epoch": 0.016528, "grad_norm": 4.625, "learning_rate": 0.0009353955519399355, "loss": 2.4946, "step": 2066 }, { "epoch": 0.016536, "grad_norm": 4.03125, "learning_rate": 0.0009353336313596419, "loss": 2.1779, "step": 2067 }, { "epoch": 0.016544, "grad_norm": 3.59375, "learning_rate": 0.0009352716831709726, "loss": 2.2651, "step": 2068 }, { "epoch": 0.016552, "grad_norm": 4.03125, "learning_rate": 0.0009352097073778564, "loss": 2.3193, "step": 2069 }, { "epoch": 0.01656, "grad_norm": 4.03125, "learning_rate": 0.0009351477039842238, "loss": 2.5496, "step": 2070 }, { "epoch": 0.016568, "grad_norm": 5.15625, "learning_rate": 0.000935085672994007, "loss": 3.2691, "step": 2071 }, { "epoch": 0.016576, "grad_norm": 5.25, "learning_rate": 0.0009350236144111399, "loss": 2.8406, "step": 2072 }, { "epoch": 0.016584, "grad_norm": 4.34375, "learning_rate": 0.000934961528239558, "loss": 2.7381, "step": 2073 }, { "epoch": 0.016592, "grad_norm": 4.84375, "learning_rate": 0.0009348994144831992, "loss": 2.4841, "step": 2074 }, { "epoch": 0.0166, "grad_norm": 4.09375, "learning_rate": 0.0009348372731460022, "loss": 1.8333, "step": 2075 }, { "epoch": 0.016608, "grad_norm": 4.125, "learning_rate": 0.0009347751042319081, "loss": 2.6566, "step": 2076 }, { "epoch": 0.016616, "grad_norm": 4.3125, "learning_rate": 0.0009347129077448597, "loss": 3.2214, "step": 2077 }, { "epoch": 0.016624, "grad_norm": 10.9375, "learning_rate": 0.0009346506836888013, "loss": 2.6956, "step": 2078 }, { "epoch": 0.016632, "grad_norm": 3.484375, "learning_rate": 0.0009345884320676791, "loss": 2.8038, "step": 2079 }, { "epoch": 0.01664, "grad_norm": 3.53125, "learning_rate": 0.0009345261528854412, "loss": 2.38, "step": 2080 }, { "epoch": 0.016648, "grad_norm": 3.296875, "learning_rate": 0.0009344638461460371, "loss": 2.988, "step": 2081 }, { "epoch": 0.016656, "grad_norm": 3.5, "learning_rate": 0.0009344015118534181, "loss": 3.2494, "step": 2082 }, { "epoch": 0.016664, "grad_norm": 3.671875, "learning_rate": 0.0009343391500115377, "loss": 2.9925, "step": 2083 }, { "epoch": 0.016672, "grad_norm": 2.890625, "learning_rate": 0.0009342767606243506, "loss": 2.048, "step": 2084 }, { "epoch": 0.01668, "grad_norm": 3.171875, "learning_rate": 0.0009342143436958135, "loss": 2.3595, "step": 2085 }, { "epoch": 0.016688, "grad_norm": 3.09375, "learning_rate": 0.0009341518992298847, "loss": 2.7399, "step": 2086 }, { "epoch": 0.016696, "grad_norm": 3.140625, "learning_rate": 0.0009340894272305245, "loss": 2.9021, "step": 2087 }, { "epoch": 0.016704, "grad_norm": 3.265625, "learning_rate": 0.0009340269277016951, "loss": 2.2788, "step": 2088 }, { "epoch": 0.016712, "grad_norm": 3.90625, "learning_rate": 0.0009339644006473596, "loss": 2.41, "step": 2089 }, { "epoch": 0.01672, "grad_norm": 4.25, "learning_rate": 0.0009339018460714835, "loss": 3.3685, "step": 2090 }, { "epoch": 0.016728, "grad_norm": 3.40625, "learning_rate": 0.0009338392639780342, "loss": 2.0549, "step": 2091 }, { "epoch": 0.016736, "grad_norm": 2.65625, 
"learning_rate": 0.0009337766543709806, "loss": 2.131, "step": 2092 }, { "epoch": 0.016744, "grad_norm": 5.46875, "learning_rate": 0.000933714017254293, "loss": 3.6759, "step": 2093 }, { "epoch": 0.016752, "grad_norm": 3.65625, "learning_rate": 0.0009336513526319441, "loss": 2.4974, "step": 2094 }, { "epoch": 0.01676, "grad_norm": 3.015625, "learning_rate": 0.0009335886605079078, "loss": 2.2813, "step": 2095 }, { "epoch": 0.016768, "grad_norm": 4.09375, "learning_rate": 0.00093352594088616, "loss": 2.1836, "step": 2096 }, { "epoch": 0.016776, "grad_norm": 3.1875, "learning_rate": 0.0009334631937706786, "loss": 2.1425, "step": 2097 }, { "epoch": 0.016784, "grad_norm": 3.65625, "learning_rate": 0.0009334004191654426, "loss": 2.3551, "step": 2098 }, { "epoch": 0.016792, "grad_norm": 4.4375, "learning_rate": 0.0009333376170744332, "loss": 2.4509, "step": 2099 }, { "epoch": 0.0168, "grad_norm": 6.125, "learning_rate": 0.0009332747875016332, "loss": 2.8729, "step": 2100 }, { "epoch": 0.016808, "grad_norm": 9.1875, "learning_rate": 0.0009332119304510274, "loss": 2.4937, "step": 2101 }, { "epoch": 0.016816, "grad_norm": 9.375, "learning_rate": 0.0009331490459266018, "loss": 2.3177, "step": 2102 }, { "epoch": 0.016824, "grad_norm": 6.96875, "learning_rate": 0.0009330861339323447, "loss": 1.857, "step": 2103 }, { "epoch": 0.016832, "grad_norm": 4.03125, "learning_rate": 0.0009330231944722459, "loss": 3.2133, "step": 2104 }, { "epoch": 0.01684, "grad_norm": 7.0625, "learning_rate": 0.0009329602275502967, "loss": 2.6366, "step": 2105 }, { "epoch": 0.016848, "grad_norm": 16.5, "learning_rate": 0.0009328972331704905, "loss": 2.7524, "step": 2106 }, { "epoch": 0.016856, "grad_norm": 226.0, "learning_rate": 0.0009328342113368226, "loss": 2.3661, "step": 2107 }, { "epoch": 0.016864, "grad_norm": 246.0, "learning_rate": 0.0009327711620532897, "loss": 2.4512, "step": 2108 }, { "epoch": 0.016872, "grad_norm": 8256.0, "learning_rate": 0.0009327080853238901, "loss": 2.2009, "step": 2109 }, { "epoch": 0.01688, "grad_norm": 2992.0, "learning_rate": 0.000932644981152624, "loss": 2.8088, "step": 2110 }, { "epoch": 0.016888, "grad_norm": 9600.0, "learning_rate": 0.0009325818495434936, "loss": 2.5808, "step": 2111 }, { "epoch": 0.016896, "grad_norm": 2336.0, "learning_rate": 0.0009325186905005027, "loss": 3.0399, "step": 2112 }, { "epoch": 0.016904, "grad_norm": 71.5, "learning_rate": 0.0009324555040276567, "loss": 3.1409, "step": 2113 }, { "epoch": 0.016912, "grad_norm": 4672.0, "learning_rate": 0.0009323922901289627, "loss": 3.2622, "step": 2114 }, { "epoch": 0.01692, "grad_norm": 19.125, "learning_rate": 0.0009323290488084297, "loss": 4.2641, "step": 2115 }, { "epoch": 0.016928, "grad_norm": 10752.0, "learning_rate": 0.0009322657800700687, "loss": 3.3647, "step": 2116 }, { "epoch": 0.016936, "grad_norm": 15.75, "learning_rate": 0.0009322024839178916, "loss": 3.8015, "step": 2117 }, { "epoch": 0.016944, "grad_norm": 118.0, "learning_rate": 0.0009321391603559128, "loss": 2.028, "step": 2118 }, { "epoch": 0.016952, "grad_norm": 1688.0, "learning_rate": 0.0009320758093881483, "loss": 3.1056, "step": 2119 }, { "epoch": 0.01696, "grad_norm": 171.0, "learning_rate": 0.0009320124310186157, "loss": 3.0985, "step": 2120 }, { "epoch": 0.016968, "grad_norm": 2096.0, "learning_rate": 0.0009319490252513345, "loss": 3.2333, "step": 2121 }, { "epoch": 0.016976, "grad_norm": 904.0, "learning_rate": 0.0009318855920903257, "loss": 3.5808, "step": 2122 }, { "epoch": 0.016984, "grad_norm": 199.0, "learning_rate": 0.000931822131539612, 
"loss": 2.9975, "step": 2123 }, { "epoch": 0.016992, "grad_norm": 29.0, "learning_rate": 0.0009317586436032182, "loss": 3.1559, "step": 2124 }, { "epoch": 0.017, "grad_norm": 179.0, "learning_rate": 0.0009316951282851706, "loss": 3.1939, "step": 2125 }, { "epoch": 0.017008, "grad_norm": 7.0, "learning_rate": 0.0009316315855894975, "loss": 3.109, "step": 2126 }, { "epoch": 0.017016, "grad_norm": 40.25, "learning_rate": 0.0009315680155202281, "loss": 3.2259, "step": 2127 }, { "epoch": 0.017024, "grad_norm": 7.59375, "learning_rate": 0.0009315044180813946, "loss": 3.9242, "step": 2128 }, { "epoch": 0.017032, "grad_norm": 5.375, "learning_rate": 0.00093144079327703, "loss": 3.4375, "step": 2129 }, { "epoch": 0.01704, "grad_norm": 4.40625, "learning_rate": 0.0009313771411111691, "loss": 2.8263, "step": 2130 }, { "epoch": 0.017048, "grad_norm": 3.765625, "learning_rate": 0.000931313461587849, "loss": 2.468, "step": 2131 }, { "epoch": 0.017056, "grad_norm": 3.59375, "learning_rate": 0.000931249754711108, "loss": 2.2882, "step": 2132 }, { "epoch": 0.017064, "grad_norm": 16.875, "learning_rate": 0.0009311860204849865, "loss": 2.4905, "step": 2133 }, { "epoch": 0.017072, "grad_norm": 18.625, "learning_rate": 0.0009311222589135262, "loss": 2.5819, "step": 2134 }, { "epoch": 0.01708, "grad_norm": 5.03125, "learning_rate": 0.0009310584700007708, "loss": 2.3318, "step": 2135 }, { "epoch": 0.017088, "grad_norm": 7.1875, "learning_rate": 0.000930994653750766, "loss": 3.006, "step": 2136 }, { "epoch": 0.017096, "grad_norm": 10.5625, "learning_rate": 0.0009309308101675586, "loss": 2.8428, "step": 2137 }, { "epoch": 0.017104, "grad_norm": 5.03125, "learning_rate": 0.000930866939255198, "loss": 2.503, "step": 2138 }, { "epoch": 0.017112, "grad_norm": 22.875, "learning_rate": 0.0009308030410177341, "loss": 1.8549, "step": 2139 }, { "epoch": 0.01712, "grad_norm": 6.21875, "learning_rate": 0.00093073911545922, "loss": 3.0851, "step": 2140 }, { "epoch": 0.017128, "grad_norm": 5.1875, "learning_rate": 0.0009306751625837094, "loss": 3.2023, "step": 2141 }, { "epoch": 0.017136, "grad_norm": 5.96875, "learning_rate": 0.0009306111823952579, "loss": 2.5284, "step": 2142 }, { "epoch": 0.017144, "grad_norm": 5.8125, "learning_rate": 0.0009305471748979235, "loss": 2.128, "step": 2143 }, { "epoch": 0.017152, "grad_norm": 6.25, "learning_rate": 0.0009304831400957652, "loss": 2.203, "step": 2144 }, { "epoch": 0.01716, "grad_norm": 4.625, "learning_rate": 0.0009304190779928443, "loss": 2.815, "step": 2145 }, { "epoch": 0.017168, "grad_norm": 3.671875, "learning_rate": 0.0009303549885932232, "loss": 3.0454, "step": 2146 }, { "epoch": 0.017176, "grad_norm": 10.125, "learning_rate": 0.0009302908719009666, "loss": 3.0005, "step": 2147 }, { "epoch": 0.017184, "grad_norm": 3.1875, "learning_rate": 0.0009302267279201406, "loss": 3.1673, "step": 2148 }, { "epoch": 0.017192, "grad_norm": 5.28125, "learning_rate": 0.0009301625566548132, "loss": 2.68, "step": 2149 }, { "epoch": 0.0172, "grad_norm": 83.5, "learning_rate": 0.0009300983581090541, "loss": 2.8563, "step": 2150 }, { "epoch": 0.017208, "grad_norm": 73.0, "learning_rate": 0.0009300341322869346, "loss": 2.9367, "step": 2151 }, { "epoch": 0.017216, "grad_norm": 3.75, "learning_rate": 0.0009299698791925279, "loss": 2.3506, "step": 2152 }, { "epoch": 0.017224, "grad_norm": 43.0, "learning_rate": 0.000929905598829909, "loss": 2.4594, "step": 2153 }, { "epoch": 0.017232, "grad_norm": 4.46875, "learning_rate": 0.0009298412912031542, "loss": 3.2487, "step": 2154 }, { "epoch": 0.01724, 
"grad_norm": 12.875, "learning_rate": 0.000929776956316342, "loss": 2.5846, "step": 2155 }, { "epoch": 0.017248, "grad_norm": 3.921875, "learning_rate": 0.0009297125941735522, "loss": 2.3342, "step": 2156 }, { "epoch": 0.017256, "grad_norm": 18.625, "learning_rate": 0.000929648204778867, "loss": 2.3348, "step": 2157 }, { "epoch": 0.017264, "grad_norm": 4.03125, "learning_rate": 0.0009295837881363696, "loss": 2.5302, "step": 2158 }, { "epoch": 0.017272, "grad_norm": 3.890625, "learning_rate": 0.0009295193442501455, "loss": 2.1917, "step": 2159 }, { "epoch": 0.01728, "grad_norm": 3.828125, "learning_rate": 0.0009294548731242813, "loss": 3.1456, "step": 2160 }, { "epoch": 0.017288, "grad_norm": 4.09375, "learning_rate": 0.0009293903747628659, "loss": 2.2481, "step": 2161 }, { "epoch": 0.017296, "grad_norm": 4.0625, "learning_rate": 0.0009293258491699896, "loss": 2.4385, "step": 2162 }, { "epoch": 0.017304, "grad_norm": 3.4375, "learning_rate": 0.0009292612963497448, "loss": 3.0649, "step": 2163 }, { "epoch": 0.017312, "grad_norm": 4.21875, "learning_rate": 0.000929196716306225, "loss": 2.5103, "step": 2164 }, { "epoch": 0.01732, "grad_norm": 3.296875, "learning_rate": 0.0009291321090435261, "loss": 2.5761, "step": 2165 }, { "epoch": 0.017328, "grad_norm": 3.8125, "learning_rate": 0.0009290674745657452, "loss": 3.079, "step": 2166 }, { "epoch": 0.017336, "grad_norm": 3.640625, "learning_rate": 0.0009290028128769816, "loss": 2.2416, "step": 2167 }, { "epoch": 0.017344, "grad_norm": 2.953125, "learning_rate": 0.000928938123981336, "loss": 2.9053, "step": 2168 }, { "epoch": 0.017352, "grad_norm": 2.9375, "learning_rate": 0.0009288734078829105, "loss": 2.6221, "step": 2169 }, { "epoch": 0.01736, "grad_norm": 3.4375, "learning_rate": 0.0009288086645858098, "loss": 2.5606, "step": 2170 }, { "epoch": 0.017368, "grad_norm": 3.234375, "learning_rate": 0.0009287438940941398, "loss": 2.3412, "step": 2171 }, { "epoch": 0.017376, "grad_norm": 3.03125, "learning_rate": 0.0009286790964120079, "loss": 2.2654, "step": 2172 }, { "epoch": 0.017384, "grad_norm": 3.578125, "learning_rate": 0.0009286142715435237, "loss": 2.6602, "step": 2173 }, { "epoch": 0.017392, "grad_norm": 3.21875, "learning_rate": 0.0009285494194927983, "loss": 2.6552, "step": 2174 }, { "epoch": 0.0174, "grad_norm": 3.453125, "learning_rate": 0.0009284845402639446, "loss": 2.1998, "step": 2175 }, { "epoch": 0.017408, "grad_norm": 3.421875, "learning_rate": 0.0009284196338610769, "loss": 2.8249, "step": 2176 }, { "epoch": 0.017416, "grad_norm": 27.5, "learning_rate": 0.0009283547002883119, "loss": 2.8347, "step": 2177 }, { "epoch": 0.017424, "grad_norm": 2.90625, "learning_rate": 0.0009282897395497672, "loss": 2.2427, "step": 2178 }, { "epoch": 0.017432, "grad_norm": 4.4375, "learning_rate": 0.000928224751649563, "loss": 2.7544, "step": 2179 }, { "epoch": 0.01744, "grad_norm": 3.515625, "learning_rate": 0.0009281597365918202, "loss": 2.2296, "step": 2180 }, { "epoch": 0.017448, "grad_norm": 11.8125, "learning_rate": 0.0009280946943806625, "loss": 3.0469, "step": 2181 }, { "epoch": 0.017456, "grad_norm": 3.234375, "learning_rate": 0.0009280296250202144, "loss": 2.1292, "step": 2182 }, { "epoch": 0.017464, "grad_norm": 4.375, "learning_rate": 0.000927964528514603, "loss": 2.8419, "step": 2183 }, { "epoch": 0.017472, "grad_norm": 4.1875, "learning_rate": 0.0009278994048679563, "loss": 2.2281, "step": 2184 }, { "epoch": 0.01748, "grad_norm": 3.890625, "learning_rate": 0.0009278342540844042, "loss": 2.8262, "step": 2185 }, { "epoch": 0.017488, 
"grad_norm": 4.9375, "learning_rate": 0.0009277690761680788, "loss": 2.6041, "step": 2186 }, { "epoch": 0.017496, "grad_norm": 5.75, "learning_rate": 0.0009277038711231137, "loss": 2.9476, "step": 2187 }, { "epoch": 0.017504, "grad_norm": 4.625, "learning_rate": 0.0009276386389536438, "loss": 2.6236, "step": 2188 }, { "epoch": 0.017512, "grad_norm": 3.8125, "learning_rate": 0.0009275733796638064, "loss": 3.3677, "step": 2189 }, { "epoch": 0.01752, "grad_norm": 4.21875, "learning_rate": 0.0009275080932577398, "loss": 3.0498, "step": 2190 }, { "epoch": 0.017528, "grad_norm": 3.609375, "learning_rate": 0.0009274427797395846, "loss": 2.908, "step": 2191 }, { "epoch": 0.017536, "grad_norm": 3.359375, "learning_rate": 0.000927377439113483, "loss": 2.0476, "step": 2192 }, { "epoch": 0.017544, "grad_norm": 3.78125, "learning_rate": 0.0009273120713835786, "loss": 2.6328, "step": 2193 }, { "epoch": 0.017552, "grad_norm": 3.171875, "learning_rate": 0.0009272466765540172, "loss": 3.1633, "step": 2194 }, { "epoch": 0.01756, "grad_norm": 3.125, "learning_rate": 0.0009271812546289459, "loss": 2.7775, "step": 2195 }, { "epoch": 0.017568, "grad_norm": 3.015625, "learning_rate": 0.0009271158056125137, "loss": 2.6394, "step": 2196 }, { "epoch": 0.017576, "grad_norm": 3.390625, "learning_rate": 0.0009270503295088713, "loss": 3.4154, "step": 2197 }, { "epoch": 0.017584, "grad_norm": 3.34375, "learning_rate": 0.0009269848263221712, "loss": 2.6796, "step": 2198 }, { "epoch": 0.017592, "grad_norm": 3.34375, "learning_rate": 0.0009269192960565674, "loss": 2.4133, "step": 2199 }, { "epoch": 0.0176, "grad_norm": 2.90625, "learning_rate": 0.000926853738716216, "loss": 2.6611, "step": 2200 }, { "epoch": 0.017608, "grad_norm": 3.21875, "learning_rate": 0.0009267881543052743, "loss": 1.8857, "step": 2201 }, { "epoch": 0.017616, "grad_norm": 4.125, "learning_rate": 0.0009267225428279018, "loss": 2.8207, "step": 2202 }, { "epoch": 0.017624, "grad_norm": 3.265625, "learning_rate": 0.0009266569042882593, "loss": 2.5235, "step": 2203 }, { "epoch": 0.017632, "grad_norm": 3.78125, "learning_rate": 0.0009265912386905097, "loss": 3.9373, "step": 2204 }, { "epoch": 0.01764, "grad_norm": 3.765625, "learning_rate": 0.0009265255460388173, "loss": 2.4131, "step": 2205 }, { "epoch": 0.017648, "grad_norm": 3.09375, "learning_rate": 0.0009264598263373485, "loss": 2.4032, "step": 2206 }, { "epoch": 0.017656, "grad_norm": 4.59375, "learning_rate": 0.0009263940795902708, "loss": 2.7719, "step": 2207 }, { "epoch": 0.017664, "grad_norm": 3.859375, "learning_rate": 0.0009263283058017542, "loss": 2.7663, "step": 2208 }, { "epoch": 0.017672, "grad_norm": 3.375, "learning_rate": 0.0009262625049759696, "loss": 2.2653, "step": 2209 }, { "epoch": 0.01768, "grad_norm": 3.796875, "learning_rate": 0.0009261966771170905, "loss": 2.098, "step": 2210 }, { "epoch": 0.017688, "grad_norm": 3.71875, "learning_rate": 0.000926130822229291, "loss": 2.4269, "step": 2211 }, { "epoch": 0.017696, "grad_norm": 4.25, "learning_rate": 0.0009260649403167481, "loss": 3.5167, "step": 2212 }, { "epoch": 0.017704, "grad_norm": 2.9375, "learning_rate": 0.0009259990313836397, "loss": 2.3059, "step": 2213 }, { "epoch": 0.017712, "grad_norm": 3.859375, "learning_rate": 0.0009259330954341456, "loss": 2.6145, "step": 2214 }, { "epoch": 0.01772, "grad_norm": 4.6875, "learning_rate": 0.0009258671324724474, "loss": 3.0465, "step": 2215 }, { "epoch": 0.017728, "grad_norm": 3.515625, "learning_rate": 0.0009258011425027289, "loss": 2.3148, "step": 2216 }, { "epoch": 0.017736, 
"grad_norm": 3.359375, "learning_rate": 0.0009257351255291744, "loss": 2.6897, "step": 2217 }, { "epoch": 0.017744, "grad_norm": 3.234375, "learning_rate": 0.0009256690815559709, "loss": 2.5784, "step": 2218 }, { "epoch": 0.017752, "grad_norm": 3.515625, "learning_rate": 0.000925603010587307, "loss": 2.4038, "step": 2219 }, { "epoch": 0.01776, "grad_norm": 4.0625, "learning_rate": 0.0009255369126273726, "loss": 2.9252, "step": 2220 }, { "epoch": 0.017768, "grad_norm": 3.71875, "learning_rate": 0.0009254707876803597, "loss": 2.6204, "step": 2221 }, { "epoch": 0.017776, "grad_norm": 3.828125, "learning_rate": 0.0009254046357504619, "loss": 2.4755, "step": 2222 }, { "epoch": 0.017784, "grad_norm": 4.15625, "learning_rate": 0.0009253384568418744, "loss": 2.5767, "step": 2223 }, { "epoch": 0.017792, "grad_norm": 4.78125, "learning_rate": 0.0009252722509587941, "loss": 2.6855, "step": 2224 }, { "epoch": 0.0178, "grad_norm": 3.59375, "learning_rate": 0.00092520601810542, "loss": 2.7832, "step": 2225 }, { "epoch": 0.017808, "grad_norm": 3.640625, "learning_rate": 0.0009251397582859522, "loss": 2.7348, "step": 2226 }, { "epoch": 0.017816, "grad_norm": 3.0625, "learning_rate": 0.000925073471504593, "loss": 2.504, "step": 2227 }, { "epoch": 0.017824, "grad_norm": 6.0, "learning_rate": 0.0009250071577655461, "loss": 3.1314, "step": 2228 }, { "epoch": 0.017832, "grad_norm": 4.46875, "learning_rate": 0.0009249408170730174, "loss": 2.4998, "step": 2229 }, { "epoch": 0.01784, "grad_norm": 3.5, "learning_rate": 0.0009248744494312136, "loss": 2.4687, "step": 2230 }, { "epoch": 0.017848, "grad_norm": 2.90625, "learning_rate": 0.0009248080548443441, "loss": 2.8608, "step": 2231 }, { "epoch": 0.017856, "grad_norm": 3.984375, "learning_rate": 0.0009247416333166193, "loss": 3.0936, "step": 2232 }, { "epoch": 0.017864, "grad_norm": 4.28125, "learning_rate": 0.0009246751848522518, "loss": 2.8581, "step": 2233 }, { "epoch": 0.017872, "grad_norm": 3.9375, "learning_rate": 0.0009246087094554556, "loss": 2.7502, "step": 2234 }, { "epoch": 0.01788, "grad_norm": 3.03125, "learning_rate": 0.0009245422071304464, "loss": 2.353, "step": 2235 }, { "epoch": 0.017888, "grad_norm": 3.59375, "learning_rate": 0.0009244756778814419, "loss": 2.9249, "step": 2236 }, { "epoch": 0.017896, "grad_norm": 3.78125, "learning_rate": 0.000924409121712661, "loss": 2.3732, "step": 2237 }, { "epoch": 0.017904, "grad_norm": 5.28125, "learning_rate": 0.000924342538628325, "loss": 2.9259, "step": 2238 }, { "epoch": 0.017912, "grad_norm": 4.25, "learning_rate": 0.0009242759286326564, "loss": 2.6797, "step": 2239 }, { "epoch": 0.01792, "grad_norm": 4.25, "learning_rate": 0.0009242092917298792, "loss": 3.1398, "step": 2240 }, { "epoch": 0.017928, "grad_norm": 4.96875, "learning_rate": 0.0009241426279242199, "loss": 2.8393, "step": 2241 }, { "epoch": 0.017936, "grad_norm": 3.796875, "learning_rate": 0.0009240759372199059, "loss": 2.8723, "step": 2242 }, { "epoch": 0.017944, "grad_norm": 3.25, "learning_rate": 0.0009240092196211669, "loss": 2.4302, "step": 2243 }, { "epoch": 0.017952, "grad_norm": 3.875, "learning_rate": 0.0009239424751322341, "loss": 2.7938, "step": 2244 }, { "epoch": 0.01796, "grad_norm": 3.078125, "learning_rate": 0.0009238757037573401, "loss": 2.2891, "step": 2245 }, { "epoch": 0.017968, "grad_norm": 5.0, "learning_rate": 0.0009238089055007195, "loss": 2.5019, "step": 2246 }, { "epoch": 0.017976, "grad_norm": 3.734375, "learning_rate": 0.0009237420803666087, "loss": 2.6066, "step": 2247 }, { "epoch": 0.017984, "grad_norm": 4.15625, 
"learning_rate": 0.0009236752283592458, "loss": 3.5008, "step": 2248 }, { "epoch": 0.017992, "grad_norm": 3.515625, "learning_rate": 0.0009236083494828702, "loss": 1.9688, "step": 2249 }, { "epoch": 0.018, "grad_norm": 2.78125, "learning_rate": 0.0009235414437417234, "loss": 2.742, "step": 2250 }, { "epoch": 0.018008, "grad_norm": 3.359375, "learning_rate": 0.0009234745111400486, "loss": 2.3685, "step": 2251 }, { "epoch": 0.018016, "grad_norm": 3.03125, "learning_rate": 0.0009234075516820903, "loss": 2.7371, "step": 2252 }, { "epoch": 0.018024, "grad_norm": 4.25, "learning_rate": 0.0009233405653720953, "loss": 2.82, "step": 2253 }, { "epoch": 0.018032, "grad_norm": 3.40625, "learning_rate": 0.0009232735522143117, "loss": 2.6189, "step": 2254 }, { "epoch": 0.01804, "grad_norm": 3.359375, "learning_rate": 0.0009232065122129892, "loss": 2.4723, "step": 2255 }, { "epoch": 0.018048, "grad_norm": 3.0625, "learning_rate": 0.0009231394453723797, "loss": 3.1268, "step": 2256 }, { "epoch": 0.018056, "grad_norm": 3.03125, "learning_rate": 0.0009230723516967363, "loss": 2.4574, "step": 2257 }, { "epoch": 0.018064, "grad_norm": 4.03125, "learning_rate": 0.0009230052311903143, "loss": 2.7475, "step": 2258 }, { "epoch": 0.018072, "grad_norm": 3.515625, "learning_rate": 0.00092293808385737, "loss": 2.9969, "step": 2259 }, { "epoch": 0.01808, "grad_norm": 4.6875, "learning_rate": 0.0009228709097021619, "loss": 2.469, "step": 2260 }, { "epoch": 0.018088, "grad_norm": 3.4375, "learning_rate": 0.0009228037087289505, "loss": 1.973, "step": 2261 }, { "epoch": 0.018096, "grad_norm": 5.09375, "learning_rate": 0.0009227364809419971, "loss": 2.7187, "step": 2262 }, { "epoch": 0.018104, "grad_norm": 3.84375, "learning_rate": 0.0009226692263455656, "loss": 2.8224, "step": 2263 }, { "epoch": 0.018112, "grad_norm": 5.0, "learning_rate": 0.000922601944943921, "loss": 2.8971, "step": 2264 }, { "epoch": 0.01812, "grad_norm": 4.3125, "learning_rate": 0.00092253463674133, "loss": 2.2827, "step": 2265 }, { "epoch": 0.018128, "grad_norm": 4.125, "learning_rate": 0.0009224673017420617, "loss": 2.4297, "step": 2266 }, { "epoch": 0.018136, "grad_norm": 5.875, "learning_rate": 0.0009223999399503863, "loss": 2.6724, "step": 2267 }, { "epoch": 0.018144, "grad_norm": 3.34375, "learning_rate": 0.0009223325513705756, "loss": 2.8771, "step": 2268 }, { "epoch": 0.018152, "grad_norm": 3.78125, "learning_rate": 0.0009222651360069034, "loss": 2.1285, "step": 2269 }, { "epoch": 0.01816, "grad_norm": 3.515625, "learning_rate": 0.000922197693863645, "loss": 3.0716, "step": 2270 }, { "epoch": 0.018168, "grad_norm": 4.0625, "learning_rate": 0.0009221302249450777, "loss": 3.024, "step": 2271 }, { "epoch": 0.018176, "grad_norm": 2.859375, "learning_rate": 0.0009220627292554802, "loss": 2.7266, "step": 2272 }, { "epoch": 0.018184, "grad_norm": 3.40625, "learning_rate": 0.000921995206799133, "loss": 2.7752, "step": 2273 }, { "epoch": 0.018192, "grad_norm": 2.4375, "learning_rate": 0.0009219276575803183, "loss": 2.2556, "step": 2274 }, { "epoch": 0.0182, "grad_norm": 57.5, "learning_rate": 0.0009218600816033201, "loss": 2.0034, "step": 2275 }, { "epoch": 0.018208, "grad_norm": 3.046875, "learning_rate": 0.0009217924788724239, "loss": 2.2515, "step": 2276 }, { "epoch": 0.018216, "grad_norm": 3.15625, "learning_rate": 0.0009217248493919169, "loss": 2.2495, "step": 2277 }, { "epoch": 0.018224, "grad_norm": 9.25, "learning_rate": 0.0009216571931660884, "loss": 2.046, "step": 2278 }, { "epoch": 0.018232, "grad_norm": 4.03125, "learning_rate": 
0.0009215895101992287, "loss": 3.1049, "step": 2279 }, { "epoch": 0.01824, "grad_norm": 3.28125, "learning_rate": 0.0009215218004956303, "loss": 2.4327, "step": 2280 }, { "epoch": 0.018248, "grad_norm": 4.6875, "learning_rate": 0.0009214540640595874, "loss": 3.3104, "step": 2281 }, { "epoch": 0.018256, "grad_norm": 3.171875, "learning_rate": 0.0009213863008953957, "loss": 2.9406, "step": 2282 }, { "epoch": 0.018264, "grad_norm": 3.765625, "learning_rate": 0.0009213185110073526, "loss": 2.5438, "step": 2283 }, { "epoch": 0.018272, "grad_norm": 3.171875, "learning_rate": 0.0009212506943997574, "loss": 2.4274, "step": 2284 }, { "epoch": 0.01828, "grad_norm": 3.28125, "learning_rate": 0.0009211828510769108, "loss": 2.6615, "step": 2285 }, { "epoch": 0.018288, "grad_norm": 2.78125, "learning_rate": 0.0009211149810431155, "loss": 2.6903, "step": 2286 }, { "epoch": 0.018296, "grad_norm": 3.140625, "learning_rate": 0.0009210470843026754, "loss": 2.6647, "step": 2287 }, { "epoch": 0.018304, "grad_norm": 2.90625, "learning_rate": 0.000920979160859897, "loss": 3.274, "step": 2288 }, { "epoch": 0.018312, "grad_norm": 2.9375, "learning_rate": 0.0009209112107190875, "loss": 2.3362, "step": 2289 }, { "epoch": 0.01832, "grad_norm": 3.328125, "learning_rate": 0.0009208432338845564, "loss": 2.6059, "step": 2290 }, { "epoch": 0.018328, "grad_norm": 2.796875, "learning_rate": 0.0009207752303606146, "loss": 2.3004, "step": 2291 }, { "epoch": 0.018336, "grad_norm": 3.15625, "learning_rate": 0.000920707200151575, "loss": 2.4611, "step": 2292 }, { "epoch": 0.018344, "grad_norm": 3.59375, "learning_rate": 0.0009206391432617516, "loss": 2.9711, "step": 2293 }, { "epoch": 0.018352, "grad_norm": 3.125, "learning_rate": 0.000920571059695461, "loss": 2.2244, "step": 2294 }, { "epoch": 0.01836, "grad_norm": 4.34375, "learning_rate": 0.0009205029494570207, "loss": 2.4426, "step": 2295 }, { "epoch": 0.018368, "grad_norm": 4.75, "learning_rate": 0.0009204348125507502, "loss": 2.4134, "step": 2296 }, { "epoch": 0.018376, "grad_norm": 3.546875, "learning_rate": 0.0009203666489809708, "loss": 2.4262, "step": 2297 }, { "epoch": 0.018384, "grad_norm": 2.75, "learning_rate": 0.0009202984587520051, "loss": 1.8437, "step": 2298 }, { "epoch": 0.018392, "grad_norm": 4.0, "learning_rate": 0.0009202302418681778, "loss": 2.8536, "step": 2299 }, { "epoch": 0.0184, "grad_norm": 3.375, "learning_rate": 0.0009201619983338152, "loss": 2.3073, "step": 2300 }, { "epoch": 0.018408, "grad_norm": 3.921875, "learning_rate": 0.0009200937281532451, "loss": 2.7559, "step": 2301 }, { "epoch": 0.018416, "grad_norm": 2.921875, "learning_rate": 0.0009200254313307972, "loss": 2.5209, "step": 2302 }, { "epoch": 0.018424, "grad_norm": 15.6875, "learning_rate": 0.0009199571078708029, "loss": 2.6217, "step": 2303 }, { "epoch": 0.018432, "grad_norm": 3.3125, "learning_rate": 0.0009198887577775949, "loss": 2.4569, "step": 2304 }, { "epoch": 0.01844, "grad_norm": 3.3125, "learning_rate": 0.0009198203810555081, "loss": 2.2659, "step": 2305 }, { "epoch": 0.018448, "grad_norm": 3.25, "learning_rate": 0.000919751977708879, "loss": 2.7942, "step": 2306 }, { "epoch": 0.018456, "grad_norm": 3.5625, "learning_rate": 0.0009196835477420455, "loss": 2.9777, "step": 2307 }, { "epoch": 0.018464, "grad_norm": 3.984375, "learning_rate": 0.0009196150911593473, "loss": 2.3514, "step": 2308 }, { "epoch": 0.018472, "grad_norm": 3.265625, "learning_rate": 0.000919546607965126, "loss": 2.5987, "step": 2309 }, { "epoch": 0.01848, "grad_norm": 3.453125, "learning_rate": 
0.0009194780981637245, "loss": 2.8687, "step": 2310 }, { "epoch": 0.018488, "grad_norm": 3.015625, "learning_rate": 0.0009194095617594878, "loss": 2.7535, "step": 2311 }, { "epoch": 0.018496, "grad_norm": 3.625, "learning_rate": 0.0009193409987567625, "loss": 2.7528, "step": 2312 }, { "epoch": 0.018504, "grad_norm": 3.25, "learning_rate": 0.0009192724091598966, "loss": 2.8159, "step": 2313 }, { "epoch": 0.018512, "grad_norm": 4.625, "learning_rate": 0.0009192037929732398, "loss": 2.9354, "step": 2314 }, { "epoch": 0.01852, "grad_norm": 3.828125, "learning_rate": 0.000919135150201144, "loss": 2.838, "step": 2315 }, { "epoch": 0.018528, "grad_norm": 3.65625, "learning_rate": 0.0009190664808479624, "loss": 2.3953, "step": 2316 }, { "epoch": 0.018536, "grad_norm": 3.59375, "learning_rate": 0.0009189977849180499, "loss": 3.0693, "step": 2317 }, { "epoch": 0.018544, "grad_norm": 3.71875, "learning_rate": 0.000918929062415763, "loss": 2.7846, "step": 2318 }, { "epoch": 0.018552, "grad_norm": 2.703125, "learning_rate": 0.00091886031334546, "loss": 2.3004, "step": 2319 }, { "epoch": 0.01856, "grad_norm": 3.703125, "learning_rate": 0.0009187915377115013, "loss": 3.0836, "step": 2320 }, { "epoch": 0.018568, "grad_norm": 2.578125, "learning_rate": 0.0009187227355182479, "loss": 2.2797, "step": 2321 }, { "epoch": 0.018576, "grad_norm": 4.21875, "learning_rate": 0.0009186539067700636, "loss": 2.4035, "step": 2322 }, { "epoch": 0.018584, "grad_norm": 3.375, "learning_rate": 0.0009185850514713134, "loss": 2.5165, "step": 2323 }, { "epoch": 0.018592, "grad_norm": 3.765625, "learning_rate": 0.000918516169626364, "loss": 2.5383, "step": 2324 }, { "epoch": 0.0186, "grad_norm": 4.25, "learning_rate": 0.000918447261239584, "loss": 2.4932, "step": 2325 }, { "epoch": 0.018608, "grad_norm": 3.671875, "learning_rate": 0.000918378326315343, "loss": 2.2256, "step": 2326 }, { "epoch": 0.018616, "grad_norm": 4.71875, "learning_rate": 0.0009183093648580132, "loss": 2.7815, "step": 2327 }, { "epoch": 0.018624, "grad_norm": 4.0625, "learning_rate": 0.0009182403768719678, "loss": 2.6409, "step": 2328 }, { "epoch": 0.018632, "grad_norm": 4.6875, "learning_rate": 0.0009181713623615822, "loss": 2.7556, "step": 2329 }, { "epoch": 0.01864, "grad_norm": 4.1875, "learning_rate": 0.000918102321331233, "loss": 3.0644, "step": 2330 }, { "epoch": 0.018648, "grad_norm": 3.6875, "learning_rate": 0.0009180332537852988, "loss": 2.9007, "step": 2331 }, { "epoch": 0.018656, "grad_norm": 3.5625, "learning_rate": 0.0009179641597281598, "loss": 2.737, "step": 2332 }, { "epoch": 0.018664, "grad_norm": 3.171875, "learning_rate": 0.0009178950391641978, "loss": 2.3259, "step": 2333 }, { "epoch": 0.018672, "grad_norm": 2.90625, "learning_rate": 0.0009178258920977964, "loss": 2.6193, "step": 2334 }, { "epoch": 0.01868, "grad_norm": 3.03125, "learning_rate": 0.0009177567185333408, "loss": 2.8549, "step": 2335 }, { "epoch": 0.018688, "grad_norm": 3.375, "learning_rate": 0.0009176875184752181, "loss": 2.5857, "step": 2336 }, { "epoch": 0.018696, "grad_norm": 2.796875, "learning_rate": 0.0009176182919278167, "loss": 2.2455, "step": 2337 }, { "epoch": 0.018704, "grad_norm": 3.46875, "learning_rate": 0.0009175490388955269, "loss": 3.0094, "step": 2338 }, { "epoch": 0.018712, "grad_norm": 3.4375, "learning_rate": 0.0009174797593827405, "loss": 2.5632, "step": 2339 }, { "epoch": 0.01872, "grad_norm": 3.5, "learning_rate": 0.0009174104533938514, "loss": 2.5248, "step": 2340 }, { "epoch": 0.018728, "grad_norm": 3.765625, "learning_rate": 
0.0009173411209332547, "loss": 2.6971, "step": 2341 }, { "epoch": 0.018736, "grad_norm": 3.234375, "learning_rate": 0.0009172717620053477, "loss": 2.0706, "step": 2342 }, { "epoch": 0.018744, "grad_norm": 5.5, "learning_rate": 0.0009172023766145285, "loss": 2.6736, "step": 2343 }, { "epoch": 0.018752, "grad_norm": 3.296875, "learning_rate": 0.000917132964765198, "loss": 3.2301, "step": 2344 }, { "epoch": 0.01876, "grad_norm": 3.359375, "learning_rate": 0.0009170635264617581, "loss": 2.788, "step": 2345 }, { "epoch": 0.018768, "grad_norm": 8.625, "learning_rate": 0.0009169940617086124, "loss": 2.1928, "step": 2346 }, { "epoch": 0.018776, "grad_norm": 3.125, "learning_rate": 0.0009169245705101661, "loss": 2.2712, "step": 2347 }, { "epoch": 0.018784, "grad_norm": 3.515625, "learning_rate": 0.0009168550528708265, "loss": 3.0987, "step": 2348 }, { "epoch": 0.018792, "grad_norm": 3.015625, "learning_rate": 0.0009167855087950024, "loss": 2.6765, "step": 2349 }, { "epoch": 0.0188, "grad_norm": 3.03125, "learning_rate": 0.0009167159382871039, "loss": 2.7783, "step": 2350 }, { "epoch": 0.018808, "grad_norm": 3.9375, "learning_rate": 0.0009166463413515433, "loss": 2.8879, "step": 2351 }, { "epoch": 0.018816, "grad_norm": 2.921875, "learning_rate": 0.0009165767179927345, "loss": 2.6766, "step": 2352 }, { "epoch": 0.018824, "grad_norm": 3.0, "learning_rate": 0.0009165070682150924, "loss": 2.5336, "step": 2353 }, { "epoch": 0.018832, "grad_norm": 2.953125, "learning_rate": 0.0009164373920230349, "loss": 2.4318, "step": 2354 }, { "epoch": 0.01884, "grad_norm": 3.109375, "learning_rate": 0.0009163676894209801, "loss": 2.9485, "step": 2355 }, { "epoch": 0.018848, "grad_norm": 2.5625, "learning_rate": 0.0009162979604133488, "loss": 2.0691, "step": 2356 }, { "epoch": 0.018856, "grad_norm": 2.734375, "learning_rate": 0.0009162282050045631, "loss": 2.0089, "step": 2357 }, { "epoch": 0.018864, "grad_norm": 2.859375, "learning_rate": 0.0009161584231990467, "loss": 2.5931, "step": 2358 }, { "epoch": 0.018872, "grad_norm": 2.828125, "learning_rate": 0.0009160886150012252, "loss": 2.4036, "step": 2359 }, { "epoch": 0.01888, "grad_norm": 2.875, "learning_rate": 0.0009160187804155257, "loss": 2.5498, "step": 2360 }, { "epoch": 0.018888, "grad_norm": 3.21875, "learning_rate": 0.0009159489194463771, "loss": 3.1026, "step": 2361 }, { "epoch": 0.018896, "grad_norm": 2.875, "learning_rate": 0.0009158790320982098, "loss": 2.179, "step": 2362 }, { "epoch": 0.018904, "grad_norm": 3.234375, "learning_rate": 0.000915809118375456, "loss": 2.4734, "step": 2363 }, { "epoch": 0.018912, "grad_norm": 2.75, "learning_rate": 0.0009157391782825495, "loss": 2.2862, "step": 2364 }, { "epoch": 0.01892, "grad_norm": 3.78125, "learning_rate": 0.000915669211823926, "loss": 2.9255, "step": 2365 }, { "epoch": 0.018928, "grad_norm": 2.9375, "learning_rate": 0.0009155992190040225, "loss": 2.2619, "step": 2366 }, { "epoch": 0.018936, "grad_norm": 3.890625, "learning_rate": 0.0009155291998272781, "loss": 2.5723, "step": 2367 }, { "epoch": 0.018944, "grad_norm": 3.390625, "learning_rate": 0.0009154591542981332, "loss": 2.1764, "step": 2368 }, { "epoch": 0.018952, "grad_norm": 3.109375, "learning_rate": 0.00091538908242103, "loss": 2.2723, "step": 2369 }, { "epoch": 0.01896, "grad_norm": 3.21875, "learning_rate": 0.0009153189842004124, "loss": 2.1569, "step": 2370 }, { "epoch": 0.018968, "grad_norm": 4.1875, "learning_rate": 0.0009152488596407258, "loss": 2.3359, "step": 2371 }, { "epoch": 0.018976, "grad_norm": 3.1875, "learning_rate": 
0.0009151787087464178, "loss": 1.9837, "step": 2372 }, { "epoch": 0.018984, "grad_norm": 3.515625, "learning_rate": 0.000915108531521937, "loss": 2.7286, "step": 2373 }, { "epoch": 0.018992, "grad_norm": 3.171875, "learning_rate": 0.0009150383279717338, "loss": 2.2795, "step": 2374 }, { "epoch": 0.019, "grad_norm": 3.265625, "learning_rate": 0.0009149680981002608, "loss": 2.4059, "step": 2375 }, { "epoch": 0.019008, "grad_norm": 61.75, "learning_rate": 0.0009148978419119718, "loss": 2.9623, "step": 2376 }, { "epoch": 0.019016, "grad_norm": 3.28125, "learning_rate": 0.0009148275594113223, "loss": 2.9817, "step": 2377 }, { "epoch": 0.019024, "grad_norm": 2.96875, "learning_rate": 0.0009147572506027695, "loss": 2.7588, "step": 2378 }, { "epoch": 0.019032, "grad_norm": 2.890625, "learning_rate": 0.0009146869154907724, "loss": 2.3699, "step": 2379 }, { "epoch": 0.01904, "grad_norm": 2.703125, "learning_rate": 0.0009146165540797915, "loss": 2.6702, "step": 2380 }, { "epoch": 0.019048, "grad_norm": 3.5, "learning_rate": 0.0009145461663742892, "loss": 3.3261, "step": 2381 }, { "epoch": 0.019056, "grad_norm": 3.28125, "learning_rate": 0.0009144757523787292, "loss": 2.777, "step": 2382 }, { "epoch": 0.019064, "grad_norm": 3.328125, "learning_rate": 0.0009144053120975771, "loss": 2.9181, "step": 2383 }, { "epoch": 0.019072, "grad_norm": 2.953125, "learning_rate": 0.0009143348455353003, "loss": 2.2603, "step": 2384 }, { "epoch": 0.01908, "grad_norm": 3.421875, "learning_rate": 0.0009142643526963677, "loss": 2.3733, "step": 2385 }, { "epoch": 0.019088, "grad_norm": 2.890625, "learning_rate": 0.0009141938335852498, "loss": 2.7216, "step": 2386 }, { "epoch": 0.019096, "grad_norm": 3.046875, "learning_rate": 0.0009141232882064187, "loss": 2.0942, "step": 2387 }, { "epoch": 0.019104, "grad_norm": 3.4375, "learning_rate": 0.0009140527165643484, "loss": 2.1025, "step": 2388 }, { "epoch": 0.019112, "grad_norm": 2.5, "learning_rate": 0.0009139821186635147, "loss": 2.2907, "step": 2389 }, { "epoch": 0.01912, "grad_norm": 3.328125, "learning_rate": 0.0009139114945083946, "loss": 2.6083, "step": 2390 }, { "epoch": 0.019128, "grad_norm": 3.375, "learning_rate": 0.0009138408441034671, "loss": 2.0627, "step": 2391 }, { "epoch": 0.019136, "grad_norm": 3.3125, "learning_rate": 0.0009137701674532128, "loss": 2.4537, "step": 2392 }, { "epoch": 0.019144, "grad_norm": 3.3125, "learning_rate": 0.0009136994645621138, "loss": 2.5263, "step": 2393 }, { "epoch": 0.019152, "grad_norm": 3.109375, "learning_rate": 0.0009136287354346542, "loss": 2.35, "step": 2394 }, { "epoch": 0.01916, "grad_norm": 3.46875, "learning_rate": 0.0009135579800753194, "loss": 3.0208, "step": 2395 }, { "epoch": 0.019168, "grad_norm": 3.421875, "learning_rate": 0.0009134871984885966, "loss": 1.9791, "step": 2396 }, { "epoch": 0.019176, "grad_norm": 3.59375, "learning_rate": 0.0009134163906789748, "loss": 2.6874, "step": 2397 }, { "epoch": 0.019184, "grad_norm": 3.0, "learning_rate": 0.0009133455566509445, "loss": 2.4864, "step": 2398 }, { "epoch": 0.019192, "grad_norm": 3.65625, "learning_rate": 0.000913274696408998, "loss": 3.1304, "step": 2399 }, { "epoch": 0.0192, "grad_norm": 3.453125, "learning_rate": 0.000913203809957629, "loss": 2.8444, "step": 2400 }, { "epoch": 0.019208, "grad_norm": 3.1875, "learning_rate": 0.000913132897301333, "loss": 2.4354, "step": 2401 }, { "epoch": 0.019216, "grad_norm": 2.984375, "learning_rate": 0.0009130619584446076, "loss": 2.4562, "step": 2402 }, { "epoch": 0.019224, "grad_norm": 3.125, "learning_rate": 
0.0009129909933919514, "loss": 1.9645, "step": 2403 }, { "epoch": 0.019232, "grad_norm": 4.34375, "learning_rate": 0.0009129200021478649, "loss": 3.1137, "step": 2404 }, { "epoch": 0.01924, "grad_norm": 3.5, "learning_rate": 0.0009128489847168502, "loss": 2.4006, "step": 2405 }, { "epoch": 0.019248, "grad_norm": 3.375, "learning_rate": 0.0009127779411034114, "loss": 2.8204, "step": 2406 }, { "epoch": 0.019256, "grad_norm": 3.890625, "learning_rate": 0.0009127068713120539, "loss": 2.5569, "step": 2407 }, { "epoch": 0.019264, "grad_norm": 3.734375, "learning_rate": 0.0009126357753472847, "loss": 2.8012, "step": 2408 }, { "epoch": 0.019272, "grad_norm": 2.625, "learning_rate": 0.0009125646532136129, "loss": 2.0939, "step": 2409 }, { "epoch": 0.01928, "grad_norm": 3.125, "learning_rate": 0.0009124935049155488, "loss": 2.2295, "step": 2410 }, { "epoch": 0.019288, "grad_norm": 3.25, "learning_rate": 0.0009124223304576047, "loss": 3.007, "step": 2411 }, { "epoch": 0.019296, "grad_norm": 3.640625, "learning_rate": 0.0009123511298442941, "loss": 2.5513, "step": 2412 }, { "epoch": 0.019304, "grad_norm": 3.359375, "learning_rate": 0.0009122799030801329, "loss": 2.3395, "step": 2413 }, { "epoch": 0.019312, "grad_norm": 4.15625, "learning_rate": 0.0009122086501696378, "loss": 2.6891, "step": 2414 }, { "epoch": 0.01932, "grad_norm": 3.25, "learning_rate": 0.000912137371117328, "loss": 2.503, "step": 2415 }, { "epoch": 0.019328, "grad_norm": 3.078125, "learning_rate": 0.0009120660659277236, "loss": 2.6117, "step": 2416 }, { "epoch": 0.019336, "grad_norm": 2.953125, "learning_rate": 0.0009119947346053468, "loss": 2.8145, "step": 2417 }, { "epoch": 0.019344, "grad_norm": 2.84375, "learning_rate": 0.0009119233771547213, "loss": 2.8505, "step": 2418 }, { "epoch": 0.019352, "grad_norm": 2.96875, "learning_rate": 0.0009118519935803726, "loss": 2.4176, "step": 2419 }, { "epoch": 0.01936, "grad_norm": 4.09375, "learning_rate": 0.0009117805838868277, "loss": 2.1643, "step": 2420 }, { "epoch": 0.019368, "grad_norm": 2.71875, "learning_rate": 0.0009117091480786155, "loss": 1.9782, "step": 2421 }, { "epoch": 0.019376, "grad_norm": 3.25, "learning_rate": 0.0009116376861602662, "loss": 3.3748, "step": 2422 }, { "epoch": 0.019384, "grad_norm": 4.03125, "learning_rate": 0.0009115661981363118, "loss": 2.5912, "step": 2423 }, { "epoch": 0.019392, "grad_norm": 2.8125, "learning_rate": 0.0009114946840112861, "loss": 2.462, "step": 2424 }, { "epoch": 0.0194, "grad_norm": 3.28125, "learning_rate": 0.0009114231437897244, "loss": 2.8131, "step": 2425 }, { "epoch": 0.019408, "grad_norm": 3.171875, "learning_rate": 0.0009113515774761638, "loss": 2.7796, "step": 2426 }, { "epoch": 0.019416, "grad_norm": 3.5625, "learning_rate": 0.0009112799850751427, "loss": 2.7514, "step": 2427 }, { "epoch": 0.019424, "grad_norm": 3.109375, "learning_rate": 0.0009112083665912019, "loss": 2.0839, "step": 2428 }, { "epoch": 0.019432, "grad_norm": 3.6875, "learning_rate": 0.0009111367220288829, "loss": 3.2493, "step": 2429 }, { "epoch": 0.01944, "grad_norm": 5.84375, "learning_rate": 0.0009110650513927293, "loss": 2.1617, "step": 2430 }, { "epoch": 0.019448, "grad_norm": 3.90625, "learning_rate": 0.0009109933546872866, "loss": 2.291, "step": 2431 }, { "epoch": 0.019456, "grad_norm": 3.0625, "learning_rate": 0.0009109216319171017, "loss": 2.5659, "step": 2432 }, { "epoch": 0.019464, "grad_norm": 3.3125, "learning_rate": 0.0009108498830867232, "loss": 2.6353, "step": 2433 }, { "epoch": 0.019472, "grad_norm": 3.109375, "learning_rate": 
0.0009107781082007013, "loss": 2.3882, "step": 2434 }, { "epoch": 0.01948, "grad_norm": 2.875, "learning_rate": 0.0009107063072635878, "loss": 1.8294, "step": 2435 }, { "epoch": 0.019488, "grad_norm": 3.640625, "learning_rate": 0.0009106344802799364, "loss": 2.1865, "step": 2436 }, { "epoch": 0.019496, "grad_norm": 3.890625, "learning_rate": 0.000910562627254302, "loss": 2.8398, "step": 2437 }, { "epoch": 0.019504, "grad_norm": 3.609375, "learning_rate": 0.0009104907481912416, "loss": 3.1294, "step": 2438 }, { "epoch": 0.019512, "grad_norm": 4.0625, "learning_rate": 0.000910418843095314, "loss": 2.5365, "step": 2439 }, { "epoch": 0.01952, "grad_norm": 3.1875, "learning_rate": 0.0009103469119710788, "loss": 2.2013, "step": 2440 }, { "epoch": 0.019528, "grad_norm": 3.4375, "learning_rate": 0.0009102749548230981, "loss": 2.7818, "step": 2441 }, { "epoch": 0.019536, "grad_norm": 4.1875, "learning_rate": 0.0009102029716559352, "loss": 2.6291, "step": 2442 }, { "epoch": 0.019544, "grad_norm": 2.84375, "learning_rate": 0.0009101309624741554, "loss": 2.5438, "step": 2443 }, { "epoch": 0.019552, "grad_norm": 2.9375, "learning_rate": 0.0009100589272823253, "loss": 2.502, "step": 2444 }, { "epoch": 0.01956, "grad_norm": 3.03125, "learning_rate": 0.0009099868660850134, "loss": 2.8484, "step": 2445 }, { "epoch": 0.019568, "grad_norm": 3.25, "learning_rate": 0.0009099147788867894, "loss": 2.5302, "step": 2446 }, { "epoch": 0.019576, "grad_norm": 9.875, "learning_rate": 0.0009098426656922255, "loss": 2.2132, "step": 2447 }, { "epoch": 0.019584, "grad_norm": 78.0, "learning_rate": 0.0009097705265058946, "loss": 2.1222, "step": 2448 }, { "epoch": 0.019592, "grad_norm": 28.0, "learning_rate": 0.000909698361332372, "loss": 2.807, "step": 2449 }, { "epoch": 0.0196, "grad_norm": 119.0, "learning_rate": 0.0009096261701762343, "loss": 2.3598, "step": 2450 }, { "epoch": 0.019608, "grad_norm": 3.890625, "learning_rate": 0.0009095539530420595, "loss": 2.6326, "step": 2451 }, { "epoch": 0.019616, "grad_norm": 96.0, "learning_rate": 0.0009094817099344277, "loss": 1.5923, "step": 2452 }, { "epoch": 0.019624, "grad_norm": 4.03125, "learning_rate": 0.0009094094408579205, "loss": 2.8329, "step": 2453 }, { "epoch": 0.019632, "grad_norm": 4.5625, "learning_rate": 0.0009093371458171214, "loss": 3.2114, "step": 2454 }, { "epoch": 0.01964, "grad_norm": 3.6875, "learning_rate": 0.0009092648248166147, "loss": 2.3595, "step": 2455 }, { "epoch": 0.019648, "grad_norm": 3.59375, "learning_rate": 0.0009091924778609873, "loss": 2.8072, "step": 2456 }, { "epoch": 0.019656, "grad_norm": 2.203125, "learning_rate": 0.0009091201049548272, "loss": 1.7327, "step": 2457 }, { "epoch": 0.019664, "grad_norm": 2.765625, "learning_rate": 0.0009090477061027244, "loss": 1.9418, "step": 2458 }, { "epoch": 0.019672, "grad_norm": 3.6875, "learning_rate": 0.0009089752813092702, "loss": 2.3995, "step": 2459 }, { "epoch": 0.01968, "grad_norm": 3.078125, "learning_rate": 0.0009089028305790577, "loss": 2.0555, "step": 2460 }, { "epoch": 0.019688, "grad_norm": 8.375, "learning_rate": 0.0009088303539166818, "loss": 2.3978, "step": 2461 }, { "epoch": 0.019696, "grad_norm": 2.984375, "learning_rate": 0.0009087578513267388, "loss": 2.562, "step": 2462 }, { "epoch": 0.019704, "grad_norm": 3.875, "learning_rate": 0.0009086853228138263, "loss": 2.6942, "step": 2463 }, { "epoch": 0.019712, "grad_norm": 3.0, "learning_rate": 0.0009086127683825447, "loss": 2.4669, "step": 2464 }, { "epoch": 0.01972, "grad_norm": 3.5, "learning_rate": 0.0009085401880374951, "loss": 
3.2457, "step": 2465 }, { "epoch": 0.019728, "grad_norm": 4.03125, "learning_rate": 0.0009084675817832801, "loss": 2.8118, "step": 2466 }, { "epoch": 0.019736, "grad_norm": 3.796875, "learning_rate": 0.0009083949496245049, "loss": 2.4193, "step": 2467 }, { "epoch": 0.019744, "grad_norm": 3.40625, "learning_rate": 0.0009083222915657751, "loss": 2.6598, "step": 2468 }, { "epoch": 0.019752, "grad_norm": 4.21875, "learning_rate": 0.0009082496076116992, "loss": 2.8943, "step": 2469 }, { "epoch": 0.01976, "grad_norm": 3.8125, "learning_rate": 0.0009081768977668862, "loss": 2.5952, "step": 2470 }, { "epoch": 0.019768, "grad_norm": 3.984375, "learning_rate": 0.0009081041620359478, "loss": 2.6943, "step": 2471 }, { "epoch": 0.019776, "grad_norm": 13.9375, "learning_rate": 0.0009080314004234965, "loss": 2.703, "step": 2472 }, { "epoch": 0.019784, "grad_norm": 3.8125, "learning_rate": 0.0009079586129341466, "loss": 2.5667, "step": 2473 }, { "epoch": 0.019792, "grad_norm": 3.90625, "learning_rate": 0.0009078857995725146, "loss": 2.5246, "step": 2474 }, { "epoch": 0.0198, "grad_norm": 3.4375, "learning_rate": 0.0009078129603432181, "loss": 2.5303, "step": 2475 }, { "epoch": 0.019808, "grad_norm": 3.0625, "learning_rate": 0.0009077400952508763, "loss": 2.6667, "step": 2476 }, { "epoch": 0.019816, "grad_norm": 3.9375, "learning_rate": 0.0009076672043001104, "loss": 2.1974, "step": 2477 }, { "epoch": 0.019824, "grad_norm": 3.71875, "learning_rate": 0.0009075942874955431, "loss": 2.8986, "step": 2478 }, { "epoch": 0.019832, "grad_norm": 3.203125, "learning_rate": 0.0009075213448417985, "loss": 3.0384, "step": 2479 }, { "epoch": 0.01984, "grad_norm": 3.984375, "learning_rate": 0.0009074483763435028, "loss": 2.3786, "step": 2480 }, { "epoch": 0.019848, "grad_norm": 2.625, "learning_rate": 0.0009073753820052835, "loss": 1.6224, "step": 2481 }, { "epoch": 0.019856, "grad_norm": 2.953125, "learning_rate": 0.0009073023618317696, "loss": 2.6112, "step": 2482 }, { "epoch": 0.019864, "grad_norm": 2.59375, "learning_rate": 0.0009072293158275922, "loss": 2.0364, "step": 2483 }, { "epoch": 0.019872, "grad_norm": 2.875, "learning_rate": 0.0009071562439973838, "loss": 2.3109, "step": 2484 }, { "epoch": 0.01988, "grad_norm": 2.71875, "learning_rate": 0.0009070831463457785, "loss": 2.8376, "step": 2485 }, { "epoch": 0.019888, "grad_norm": 3.15625, "learning_rate": 0.0009070100228774119, "loss": 2.1396, "step": 2486 }, { "epoch": 0.019896, "grad_norm": 3.34375, "learning_rate": 0.0009069368735969217, "loss": 2.2146, "step": 2487 }, { "epoch": 0.019904, "grad_norm": 3.5625, "learning_rate": 0.0009068636985089467, "loss": 2.7157, "step": 2488 }, { "epoch": 0.019912, "grad_norm": 3.0, "learning_rate": 0.0009067904976181276, "loss": 2.4751, "step": 2489 }, { "epoch": 0.01992, "grad_norm": 3.640625, "learning_rate": 0.0009067172709291069, "loss": 1.9963, "step": 2490 }, { "epoch": 0.019928, "grad_norm": 3.328125, "learning_rate": 0.0009066440184465283, "loss": 2.1975, "step": 2491 }, { "epoch": 0.019936, "grad_norm": 3.09375, "learning_rate": 0.0009065707401750377, "loss": 2.4545, "step": 2492 }, { "epoch": 0.019944, "grad_norm": 3.59375, "learning_rate": 0.0009064974361192821, "loss": 2.1467, "step": 2493 }, { "epoch": 0.019952, "grad_norm": 2.984375, "learning_rate": 0.0009064241062839105, "loss": 2.5052, "step": 2494 }, { "epoch": 0.01996, "grad_norm": 2.859375, "learning_rate": 0.0009063507506735733, "loss": 2.3151, "step": 2495 }, { "epoch": 0.019968, "grad_norm": 3.046875, "learning_rate": 0.0009062773692929226, "loss": 
2.2871, "step": 2496 }, { "epoch": 0.019976, "grad_norm": 3.09375, "learning_rate": 0.0009062039621466122, "loss": 2.1649, "step": 2497 }, { "epoch": 0.019984, "grad_norm": 3.46875, "learning_rate": 0.0009061305292392976, "loss": 2.9647, "step": 2498 }, { "epoch": 0.019992, "grad_norm": 3.671875, "learning_rate": 0.0009060570705756357, "loss": 2.6765, "step": 2499 }, { "epoch": 0.02, "grad_norm": 3.453125, "learning_rate": 0.0009059835861602853, "loss": 2.274, "step": 2500 }, { "epoch": 0.020008, "grad_norm": 3.65625, "learning_rate": 0.0009059100759979066, "loss": 2.9982, "step": 2501 }, { "epoch": 0.020016, "grad_norm": 3.734375, "learning_rate": 0.0009058365400931615, "loss": 2.6198, "step": 2502 }, { "epoch": 0.020024, "grad_norm": 3.265625, "learning_rate": 0.0009057629784507137, "loss": 2.3434, "step": 2503 }, { "epoch": 0.020032, "grad_norm": 3.5625, "learning_rate": 0.0009056893910752282, "loss": 2.6835, "step": 2504 }, { "epoch": 0.02004, "grad_norm": 3.40625, "learning_rate": 0.000905615777971372, "loss": 2.6453, "step": 2505 }, { "epoch": 0.020048, "grad_norm": 4.03125, "learning_rate": 0.0009055421391438137, "loss": 2.8609, "step": 2506 }, { "epoch": 0.020056, "grad_norm": 4.15625, "learning_rate": 0.000905468474597223, "loss": 2.9827, "step": 2507 }, { "epoch": 0.020064, "grad_norm": 7.03125, "learning_rate": 0.0009053947843362717, "loss": 2.3582, "step": 2508 }, { "epoch": 0.020072, "grad_norm": 3.609375, "learning_rate": 0.0009053210683656335, "loss": 2.6425, "step": 2509 }, { "epoch": 0.02008, "grad_norm": 4.125, "learning_rate": 0.0009052473266899832, "loss": 2.3927, "step": 2510 }, { "epoch": 0.020088, "grad_norm": 4.40625, "learning_rate": 0.0009051735593139972, "loss": 2.3312, "step": 2511 }, { "epoch": 0.020096, "grad_norm": 4.625, "learning_rate": 0.0009050997662423541, "loss": 2.748, "step": 2512 }, { "epoch": 0.020104, "grad_norm": 5.375, "learning_rate": 0.0009050259474797335, "loss": 2.901, "step": 2513 }, { "epoch": 0.020112, "grad_norm": 5.96875, "learning_rate": 0.0009049521030308169, "loss": 2.782, "step": 2514 }, { "epoch": 0.02012, "grad_norm": 3.875, "learning_rate": 0.0009048782329002878, "loss": 2.5188, "step": 2515 }, { "epoch": 0.020128, "grad_norm": 3.984375, "learning_rate": 0.0009048043370928305, "loss": 2.5122, "step": 2516 }, { "epoch": 0.020136, "grad_norm": 4.625, "learning_rate": 0.0009047304156131317, "loss": 2.2245, "step": 2517 }, { "epoch": 0.020144, "grad_norm": 4.15625, "learning_rate": 0.0009046564684658791, "loss": 2.5169, "step": 2518 }, { "epoch": 0.020152, "grad_norm": 2.78125, "learning_rate": 0.0009045824956557628, "loss": 2.1314, "step": 2519 }, { "epoch": 0.02016, "grad_norm": 3.5, "learning_rate": 0.0009045084971874737, "loss": 2.7989, "step": 2520 }, { "epoch": 0.020168, "grad_norm": 4.5625, "learning_rate": 0.000904434473065705, "loss": 2.9613, "step": 2521 }, { "epoch": 0.020176, "grad_norm": 2.90625, "learning_rate": 0.0009043604232951508, "loss": 2.3781, "step": 2522 }, { "epoch": 0.020184, "grad_norm": 3.421875, "learning_rate": 0.0009042863478805076, "loss": 2.5135, "step": 2523 }, { "epoch": 0.020192, "grad_norm": 3.046875, "learning_rate": 0.000904212246826473, "loss": 2.5289, "step": 2524 }, { "epoch": 0.0202, "grad_norm": 3.125, "learning_rate": 0.0009041381201377467, "loss": 2.1626, "step": 2525 }, { "epoch": 0.020208, "grad_norm": 5.15625, "learning_rate": 0.0009040639678190294, "loss": 2.8493, "step": 2526 }, { "epoch": 0.020216, "grad_norm": 3.828125, "learning_rate": 0.000903989789875024, "loss": 2.3531, "step": 
2527 }, { "epoch": 0.020224, "grad_norm": 3.390625, "learning_rate": 0.0009039155863104347, "loss": 2.5852, "step": 2528 }, { "epoch": 0.020232, "grad_norm": 3.328125, "learning_rate": 0.0009038413571299672, "loss": 2.2532, "step": 2529 }, { "epoch": 0.02024, "grad_norm": 4.96875, "learning_rate": 0.0009037671023383293, "loss": 2.6153, "step": 2530 }, { "epoch": 0.020248, "grad_norm": 4.15625, "learning_rate": 0.0009036928219402301, "loss": 2.5298, "step": 2531 }, { "epoch": 0.020256, "grad_norm": 4.90625, "learning_rate": 0.0009036185159403803, "loss": 3.1042, "step": 2532 }, { "epoch": 0.020264, "grad_norm": 5.28125, "learning_rate": 0.0009035441843434922, "loss": 2.727, "step": 2533 }, { "epoch": 0.020272, "grad_norm": 3.390625, "learning_rate": 0.0009034698271542801, "loss": 2.6284, "step": 2534 }, { "epoch": 0.02028, "grad_norm": 3.390625, "learning_rate": 0.0009033954443774597, "loss": 2.7247, "step": 2535 }, { "epoch": 0.020288, "grad_norm": 5.1875, "learning_rate": 0.0009033210360177478, "loss": 2.2813, "step": 2536 }, { "epoch": 0.020296, "grad_norm": 3.171875, "learning_rate": 0.0009032466020798637, "loss": 2.3192, "step": 2537 }, { "epoch": 0.020304, "grad_norm": 3.125, "learning_rate": 0.000903172142568528, "loss": 2.1757, "step": 2538 }, { "epoch": 0.020312, "grad_norm": 2.9375, "learning_rate": 0.0009030976574884624, "loss": 2.064, "step": 2539 }, { "epoch": 0.02032, "grad_norm": 3.296875, "learning_rate": 0.0009030231468443911, "loss": 2.84, "step": 2540 }, { "epoch": 0.020328, "grad_norm": 3.140625, "learning_rate": 0.0009029486106410391, "loss": 2.0335, "step": 2541 }, { "epoch": 0.020336, "grad_norm": 4.0, "learning_rate": 0.0009028740488831339, "loss": 2.9501, "step": 2542 }, { "epoch": 0.020344, "grad_norm": 3.03125, "learning_rate": 0.0009027994615754036, "loss": 2.6019, "step": 2543 }, { "epoch": 0.020352, "grad_norm": 3.59375, "learning_rate": 0.0009027248487225788, "loss": 2.4497, "step": 2544 }, { "epoch": 0.02036, "grad_norm": 3.25, "learning_rate": 0.000902650210329391, "loss": 2.9991, "step": 2545 }, { "epoch": 0.020368, "grad_norm": 3.421875, "learning_rate": 0.0009025755464005742, "loss": 2.8695, "step": 2546 }, { "epoch": 0.020376, "grad_norm": 8.8125, "learning_rate": 0.0009025008569408631, "loss": 1.9543, "step": 2547 }, { "epoch": 0.020384, "grad_norm": 2.921875, "learning_rate": 0.0009024261419549944, "loss": 2.1235, "step": 2548 }, { "epoch": 0.020392, "grad_norm": 3.25, "learning_rate": 0.0009023514014477068, "loss": 2.1355, "step": 2549 }, { "epoch": 0.0204, "grad_norm": 4.21875, "learning_rate": 0.0009022766354237399, "loss": 2.6541, "step": 2550 }, { "epoch": 0.020408, "grad_norm": 3.578125, "learning_rate": 0.0009022018438878354, "loss": 3.2486, "step": 2551 }, { "epoch": 0.020416, "grad_norm": 2.953125, "learning_rate": 0.0009021270268447367, "loss": 1.9664, "step": 2552 }, { "epoch": 0.020424, "grad_norm": 3.703125, "learning_rate": 0.0009020521842991883, "loss": 2.5749, "step": 2553 }, { "epoch": 0.020432, "grad_norm": 3.125, "learning_rate": 0.0009019773162559368, "loss": 2.7795, "step": 2554 }, { "epoch": 0.02044, "grad_norm": 1120.0, "learning_rate": 0.0009019024227197303, "loss": 2.799, "step": 2555 }, { "epoch": 0.020448, "grad_norm": 5.875, "learning_rate": 0.0009018275036953183, "loss": 3.7429, "step": 2556 }, { "epoch": 0.020456, "grad_norm": 3.3125, "learning_rate": 0.000901752559187452, "loss": 2.4368, "step": 2557 }, { "epoch": 0.020464, "grad_norm": 3.3125, "learning_rate": 0.0009016775892008846, "loss": 2.1775, "step": 2558 }, { 
"epoch": 0.020472, "grad_norm": 3.59375, "learning_rate": 0.0009016025937403706, "loss": 2.9924, "step": 2559 }, { "epoch": 0.02048, "grad_norm": 3.15625, "learning_rate": 0.0009015275728106658, "loss": 2.3662, "step": 2560 }, { "epoch": 0.020488, "grad_norm": 3.84375, "learning_rate": 0.0009014525264165282, "loss": 2.8616, "step": 2561 }, { "epoch": 0.020496, "grad_norm": 3.765625, "learning_rate": 0.0009013774545627171, "loss": 2.6391, "step": 2562 }, { "epoch": 0.020504, "grad_norm": 3.21875, "learning_rate": 0.0009013023572539935, "loss": 2.2897, "step": 2563 }, { "epoch": 0.020512, "grad_norm": 3.234375, "learning_rate": 0.00090122723449512, "loss": 2.3268, "step": 2564 }, { "epoch": 0.02052, "grad_norm": 3.5, "learning_rate": 0.0009011520862908606, "loss": 2.1998, "step": 2565 }, { "epoch": 0.020528, "grad_norm": 3.25, "learning_rate": 0.0009010769126459815, "loss": 2.5281, "step": 2566 }, { "epoch": 0.020536, "grad_norm": 3.828125, "learning_rate": 0.0009010017135652498, "loss": 2.9825, "step": 2567 }, { "epoch": 0.020544, "grad_norm": 3.359375, "learning_rate": 0.0009009264890534346, "loss": 2.5809, "step": 2568 }, { "epoch": 0.020552, "grad_norm": 3.140625, "learning_rate": 0.0009008512391153067, "loss": 2.3823, "step": 2569 }, { "epoch": 0.02056, "grad_norm": 3.734375, "learning_rate": 0.0009007759637556382, "loss": 1.7502, "step": 2570 }, { "epoch": 0.020568, "grad_norm": 3.140625, "learning_rate": 0.000900700662979203, "loss": 2.667, "step": 2571 }, { "epoch": 0.020576, "grad_norm": 3.390625, "learning_rate": 0.0009006253367907769, "loss": 2.1561, "step": 2572 }, { "epoch": 0.020584, "grad_norm": 3.921875, "learning_rate": 0.0009005499851951365, "loss": 3.1898, "step": 2573 }, { "epoch": 0.020592, "grad_norm": 4.125, "learning_rate": 0.0009004746081970607, "loss": 1.9408, "step": 2574 }, { "epoch": 0.0206, "grad_norm": 3.546875, "learning_rate": 0.0009003992058013302, "loss": 2.3539, "step": 2575 }, { "epoch": 0.020608, "grad_norm": 3.765625, "learning_rate": 0.0009003237780127265, "loss": 2.888, "step": 2576 }, { "epoch": 0.020616, "grad_norm": 3.40625, "learning_rate": 0.0009002483248360332, "loss": 2.4032, "step": 2577 }, { "epoch": 0.020624, "grad_norm": 2.96875, "learning_rate": 0.0009001728462760354, "loss": 1.9875, "step": 2578 }, { "epoch": 0.020632, "grad_norm": 3.234375, "learning_rate": 0.0009000973423375203, "loss": 2.3386, "step": 2579 }, { "epoch": 0.02064, "grad_norm": 3.71875, "learning_rate": 0.0009000218130252758, "loss": 2.2545, "step": 2580 }, { "epoch": 0.020648, "grad_norm": 3.203125, "learning_rate": 0.0008999462583440921, "loss": 2.3824, "step": 2581 }, { "epoch": 0.020656, "grad_norm": 3.578125, "learning_rate": 0.0008998706782987607, "loss": 2.71, "step": 2582 }, { "epoch": 0.020664, "grad_norm": 5.0, "learning_rate": 0.0008997950728940748, "loss": 2.055, "step": 2583 }, { "epoch": 0.020672, "grad_norm": 3.46875, "learning_rate": 0.0008997194421348294, "loss": 2.8624, "step": 2584 }, { "epoch": 0.02068, "grad_norm": 3.703125, "learning_rate": 0.0008996437860258206, "loss": 2.4417, "step": 2585 }, { "epoch": 0.020688, "grad_norm": 2.96875, "learning_rate": 0.0008995681045718467, "loss": 2.1408, "step": 2586 }, { "epoch": 0.020696, "grad_norm": 14.125, "learning_rate": 0.0008994923977777074, "loss": 2.1482, "step": 2587 }, { "epoch": 0.020704, "grad_norm": 4.28125, "learning_rate": 0.0008994166656482035, "loss": 2.7117, "step": 2588 }, { "epoch": 0.020712, "grad_norm": 3.40625, "learning_rate": 0.0008993409081881384, "loss": 2.6751, "step": 2589 }, { 
"epoch": 0.02072, "grad_norm": 16.625, "learning_rate": 0.0008992651254023162, "loss": 2.5228, "step": 2590 }, { "epoch": 0.020728, "grad_norm": 3.046875, "learning_rate": 0.000899189317295543, "loss": 1.9078, "step": 2591 }, { "epoch": 0.020736, "grad_norm": 3.890625, "learning_rate": 0.0008991134838726265, "loss": 2.3015, "step": 2592 }, { "epoch": 0.020744, "grad_norm": 3.96875, "learning_rate": 0.0008990376251383761, "loss": 2.504, "step": 2593 }, { "epoch": 0.020752, "grad_norm": 2.65625, "learning_rate": 0.0008989617410976023, "loss": 2.1792, "step": 2594 }, { "epoch": 0.02076, "grad_norm": 3.671875, "learning_rate": 0.0008988858317551181, "loss": 2.1618, "step": 2595 }, { "epoch": 0.020768, "grad_norm": 3.734375, "learning_rate": 0.0008988098971157373, "loss": 1.9219, "step": 2596 }, { "epoch": 0.020776, "grad_norm": 3.921875, "learning_rate": 0.0008987339371842755, "loss": 2.2211, "step": 2597 }, { "epoch": 0.020784, "grad_norm": 5.21875, "learning_rate": 0.0008986579519655501, "loss": 2.9678, "step": 2598 }, { "epoch": 0.020792, "grad_norm": 4.28125, "learning_rate": 0.0008985819414643801, "loss": 1.8855, "step": 2599 }, { "epoch": 0.0208, "grad_norm": 4.3125, "learning_rate": 0.0008985059056855857, "loss": 2.4614, "step": 2600 }, { "epoch": 0.020808, "grad_norm": 4.71875, "learning_rate": 0.0008984298446339896, "loss": 3.2572, "step": 2601 }, { "epoch": 0.020816, "grad_norm": 4.53125, "learning_rate": 0.0008983537583144148, "loss": 3.0256, "step": 2602 }, { "epoch": 0.020824, "grad_norm": 4.1875, "learning_rate": 0.0008982776467316871, "loss": 2.5088, "step": 2603 }, { "epoch": 0.020832, "grad_norm": 3.875, "learning_rate": 0.0008982015098906332, "loss": 3.0255, "step": 2604 }, { "epoch": 0.02084, "grad_norm": 3.453125, "learning_rate": 0.0008981253477960816, "loss": 2.7386, "step": 2605 }, { "epoch": 0.020848, "grad_norm": 3.3125, "learning_rate": 0.0008980491604528625, "loss": 2.6299, "step": 2606 }, { "epoch": 0.020856, "grad_norm": 3.09375, "learning_rate": 0.0008979729478658078, "loss": 2.2976, "step": 2607 }, { "epoch": 0.020864, "grad_norm": 3.203125, "learning_rate": 0.0008978967100397503, "loss": 2.9939, "step": 2608 }, { "epoch": 0.020872, "grad_norm": 2.953125, "learning_rate": 0.0008978204469795254, "loss": 2.0225, "step": 2609 }, { "epoch": 0.02088, "grad_norm": 3.0, "learning_rate": 0.0008977441586899694, "loss": 2.1205, "step": 2610 }, { "epoch": 0.020888, "grad_norm": 2.984375, "learning_rate": 0.0008976678451759205, "loss": 2.7568, "step": 2611 }, { "epoch": 0.020896, "grad_norm": 3.90625, "learning_rate": 0.0008975915064422183, "loss": 2.2658, "step": 2612 }, { "epoch": 0.020904, "grad_norm": 2.953125, "learning_rate": 0.0008975151424937042, "loss": 2.3873, "step": 2613 }, { "epoch": 0.020912, "grad_norm": 3.578125, "learning_rate": 0.0008974387533352213, "loss": 2.7211, "step": 2614 }, { "epoch": 0.02092, "grad_norm": 3.109375, "learning_rate": 0.0008973623389716137, "loss": 2.8344, "step": 2615 }, { "epoch": 0.020928, "grad_norm": 3.578125, "learning_rate": 0.0008972858994077279, "loss": 2.3558, "step": 2616 }, { "epoch": 0.020936, "grad_norm": 2.828125, "learning_rate": 0.0008972094346484114, "loss": 2.1594, "step": 2617 }, { "epoch": 0.020944, "grad_norm": 3.796875, "learning_rate": 0.0008971329446985136, "loss": 2.4694, "step": 2618 }, { "epoch": 0.020952, "grad_norm": 2.578125, "learning_rate": 0.0008970564295628854, "loss": 2.1444, "step": 2619 }, { "epoch": 0.02096, "grad_norm": 3.328125, "learning_rate": 0.0008969798892463792, "loss": 2.5511, "step": 
2620 }, { "epoch": 0.020968, "grad_norm": 2.8125, "learning_rate": 0.0008969033237538494, "loss": 2.0183, "step": 2621 }, { "epoch": 0.020976, "grad_norm": 2.734375, "learning_rate": 0.0008968267330901512, "loss": 2.4551, "step": 2622 }, { "epoch": 0.020984, "grad_norm": 3.75, "learning_rate": 0.0008967501172601422, "loss": 2.9177, "step": 2623 }, { "epoch": 0.020992, "grad_norm": 3.984375, "learning_rate": 0.0008966734762686814, "loss": 2.6315, "step": 2624 }, { "epoch": 0.021, "grad_norm": 3.46875, "learning_rate": 0.0008965968101206292, "loss": 2.9285, "step": 2625 }, { "epoch": 0.021008, "grad_norm": 3.578125, "learning_rate": 0.0008965201188208474, "loss": 2.1791, "step": 2626 }, { "epoch": 0.021016, "grad_norm": 2.875, "learning_rate": 0.0008964434023742001, "loss": 2.4958, "step": 2627 }, { "epoch": 0.021024, "grad_norm": 3.375, "learning_rate": 0.0008963666607855523, "loss": 2.5476, "step": 2628 }, { "epoch": 0.021032, "grad_norm": 4.25, "learning_rate": 0.000896289894059771, "loss": 2.8554, "step": 2629 }, { "epoch": 0.02104, "grad_norm": 3.15625, "learning_rate": 0.0008962131022017246, "loss": 2.9635, "step": 2630 }, { "epoch": 0.021048, "grad_norm": 4.21875, "learning_rate": 0.0008961362852162832, "loss": 3.0229, "step": 2631 }, { "epoch": 0.021056, "grad_norm": 3.484375, "learning_rate": 0.0008960594431083184, "loss": 2.3788, "step": 2632 }, { "epoch": 0.021064, "grad_norm": 3.3125, "learning_rate": 0.0008959825758827035, "loss": 2.7151, "step": 2633 }, { "epoch": 0.021072, "grad_norm": 3.6875, "learning_rate": 0.0008959056835443133, "loss": 3.3296, "step": 2634 }, { "epoch": 0.02108, "grad_norm": 3.765625, "learning_rate": 0.000895828766098024, "loss": 2.8752, "step": 2635 }, { "epoch": 0.021088, "grad_norm": 3.046875, "learning_rate": 0.0008957518235487141, "loss": 2.3133, "step": 2636 }, { "epoch": 0.021096, "grad_norm": 4.21875, "learning_rate": 0.0008956748559012629, "loss": 2.3479, "step": 2637 }, { "epoch": 0.021104, "grad_norm": 3.125, "learning_rate": 0.0008955978631605514, "loss": 2.9216, "step": 2638 }, { "epoch": 0.021112, "grad_norm": 3.0625, "learning_rate": 0.0008955208453314629, "loss": 2.6983, "step": 2639 }, { "epoch": 0.02112, "grad_norm": 3.171875, "learning_rate": 0.0008954438024188814, "loss": 2.0678, "step": 2640 }, { "epoch": 0.021128, "grad_norm": 3.28125, "learning_rate": 0.0008953667344276931, "loss": 2.7622, "step": 2641 }, { "epoch": 0.021136, "grad_norm": 3.4375, "learning_rate": 0.0008952896413627855, "loss": 2.6003, "step": 2642 }, { "epoch": 0.021144, "grad_norm": 3.359375, "learning_rate": 0.0008952125232290477, "loss": 2.7186, "step": 2643 }, { "epoch": 0.021152, "grad_norm": 3.9375, "learning_rate": 0.0008951353800313704, "loss": 3.3478, "step": 2644 }, { "epoch": 0.02116, "grad_norm": 4.28125, "learning_rate": 0.0008950582117746461, "loss": 2.9125, "step": 2645 }, { "epoch": 0.021168, "grad_norm": 3.9375, "learning_rate": 0.0008949810184637686, "loss": 2.1657, "step": 2646 }, { "epoch": 0.021176, "grad_norm": 4.1875, "learning_rate": 0.0008949038001036334, "loss": 2.6946, "step": 2647 }, { "epoch": 0.021184, "grad_norm": 5.03125, "learning_rate": 0.0008948265566991377, "loss": 3.0319, "step": 2648 }, { "epoch": 0.021192, "grad_norm": 6.3125, "learning_rate": 0.0008947492882551802, "loss": 2.5085, "step": 2649 }, { "epoch": 0.0212, "grad_norm": 4.4375, "learning_rate": 0.0008946719947766611, "loss": 3.0574, "step": 2650 }, { "epoch": 0.021208, "grad_norm": 3.4375, "learning_rate": 0.0008945946762684823, "loss": 2.479, "step": 2651 }, { 
"epoch": 0.021216, "grad_norm": 3.1875, "learning_rate": 0.0008945173327355471, "loss": 2.4835, "step": 2652 }, { "epoch": 0.021224, "grad_norm": 3.34375, "learning_rate": 0.0008944399641827608, "loss": 1.9927, "step": 2653 }, { "epoch": 0.021232, "grad_norm": 5.0625, "learning_rate": 0.0008943625706150298, "loss": 2.4701, "step": 2654 }, { "epoch": 0.02124, "grad_norm": 3.5625, "learning_rate": 0.0008942851520372625, "loss": 2.5215, "step": 2655 }, { "epoch": 0.021248, "grad_norm": 3.453125, "learning_rate": 0.0008942077084543687, "loss": 2.1113, "step": 2656 }, { "epoch": 0.021256, "grad_norm": 3.015625, "learning_rate": 0.0008941302398712596, "loss": 2.6541, "step": 2657 }, { "epoch": 0.021264, "grad_norm": 3.09375, "learning_rate": 0.0008940527462928483, "loss": 2.0576, "step": 2658 }, { "epoch": 0.021272, "grad_norm": 3.90625, "learning_rate": 0.0008939752277240493, "loss": 2.5583, "step": 2659 }, { "epoch": 0.02128, "grad_norm": 3.421875, "learning_rate": 0.000893897684169779, "loss": 2.6924, "step": 2660 }, { "epoch": 0.021288, "grad_norm": 3.109375, "learning_rate": 0.0008938201156349548, "loss": 2.6125, "step": 2661 }, { "epoch": 0.021296, "grad_norm": 3.671875, "learning_rate": 0.000893742522124496, "loss": 2.0131, "step": 2662 }, { "epoch": 0.021304, "grad_norm": 16.875, "learning_rate": 0.0008936649036433238, "loss": 2.572, "step": 2663 }, { "epoch": 0.021312, "grad_norm": 3.65625, "learning_rate": 0.0008935872601963604, "loss": 2.8629, "step": 2664 }, { "epoch": 0.02132, "grad_norm": 3.046875, "learning_rate": 0.0008935095917885299, "loss": 2.3847, "step": 2665 }, { "epoch": 0.021328, "grad_norm": 3.6875, "learning_rate": 0.0008934318984247581, "loss": 2.8426, "step": 2666 }, { "epoch": 0.021336, "grad_norm": 3.671875, "learning_rate": 0.0008933541801099719, "loss": 2.8297, "step": 2667 }, { "epoch": 0.021344, "grad_norm": 3.515625, "learning_rate": 0.0008932764368491005, "loss": 3.0855, "step": 2668 }, { "epoch": 0.021352, "grad_norm": 3.015625, "learning_rate": 0.0008931986686470741, "loss": 2.8691, "step": 2669 }, { "epoch": 0.02136, "grad_norm": 3.15625, "learning_rate": 0.0008931208755088247, "loss": 2.4085, "step": 2670 }, { "epoch": 0.021368, "grad_norm": 3.484375, "learning_rate": 0.0008930430574392859, "loss": 2.7677, "step": 2671 }, { "epoch": 0.021376, "grad_norm": 3.578125, "learning_rate": 0.0008929652144433927, "loss": 2.3039, "step": 2672 }, { "epoch": 0.021384, "grad_norm": 3.359375, "learning_rate": 0.000892887346526082, "loss": 2.1761, "step": 2673 }, { "epoch": 0.021392, "grad_norm": 11.8125, "learning_rate": 0.0008928094536922919, "loss": 2.535, "step": 2674 }, { "epoch": 0.0214, "grad_norm": 3.59375, "learning_rate": 0.0008927315359469625, "loss": 2.5631, "step": 2675 }, { "epoch": 0.021408, "grad_norm": 3.5625, "learning_rate": 0.000892653593295035, "loss": 2.5955, "step": 2676 }, { "epoch": 0.021416, "grad_norm": 3.484375, "learning_rate": 0.0008925756257414526, "loss": 2.7084, "step": 2677 }, { "epoch": 0.021424, "grad_norm": 4.21875, "learning_rate": 0.0008924976332911601, "loss": 2.4632, "step": 2678 }, { "epoch": 0.021432, "grad_norm": 3.921875, "learning_rate": 0.0008924196159491031, "loss": 2.3484, "step": 2679 }, { "epoch": 0.02144, "grad_norm": 4.03125, "learning_rate": 0.00089234157372023, "loss": 2.7477, "step": 2680 }, { "epoch": 0.021448, "grad_norm": 3.484375, "learning_rate": 0.00089226350660949, "loss": 2.4548, "step": 2681 }, { "epoch": 0.021456, "grad_norm": 3.25, "learning_rate": 0.0008921854146218338, "loss": 2.5051, "step": 2682 }, { 
"epoch": 0.021464, "grad_norm": 4.0625, "learning_rate": 0.000892107297762214, "loss": 2.7906, "step": 2683 }, { "epoch": 0.021472, "grad_norm": 3.359375, "learning_rate": 0.0008920291560355846, "loss": 2.9013, "step": 2684 }, { "epoch": 0.02148, "grad_norm": 3.390625, "learning_rate": 0.0008919509894469016, "loss": 2.5616, "step": 2685 }, { "epoch": 0.021488, "grad_norm": 4.5625, "learning_rate": 0.0008918727980011219, "loss": 2.4202, "step": 2686 }, { "epoch": 0.021496, "grad_norm": 3.546875, "learning_rate": 0.0008917945817032047, "loss": 1.5655, "step": 2687 }, { "epoch": 0.021504, "grad_norm": 4.125, "learning_rate": 0.0008917163405581099, "loss": 3.0738, "step": 2688 }, { "epoch": 0.021512, "grad_norm": 2.78125, "learning_rate": 0.0008916380745707997, "loss": 3.3932, "step": 2689 }, { "epoch": 0.02152, "grad_norm": 2.390625, "learning_rate": 0.0008915597837462377, "loss": 2.3127, "step": 2690 }, { "epoch": 0.021528, "grad_norm": 4.03125, "learning_rate": 0.0008914814680893889, "loss": 2.5285, "step": 2691 }, { "epoch": 0.021536, "grad_norm": 3.015625, "learning_rate": 0.0008914031276052201, "loss": 2.4783, "step": 2692 }, { "epoch": 0.021544, "grad_norm": 2.484375, "learning_rate": 0.0008913247622986995, "loss": 2.6391, "step": 2693 }, { "epoch": 0.021552, "grad_norm": 2.296875, "learning_rate": 0.000891246372174797, "loss": 1.7518, "step": 2694 }, { "epoch": 0.02156, "grad_norm": 3.015625, "learning_rate": 0.0008911679572384839, "loss": 2.499, "step": 2695 }, { "epoch": 0.021568, "grad_norm": 3.203125, "learning_rate": 0.0008910895174947333, "loss": 2.5082, "step": 2696 }, { "epoch": 0.021576, "grad_norm": 3.265625, "learning_rate": 0.0008910110529485197, "loss": 3.1358, "step": 2697 }, { "epoch": 0.021584, "grad_norm": 2.625, "learning_rate": 0.0008909325636048193, "loss": 2.093, "step": 2698 }, { "epoch": 0.021592, "grad_norm": 3.734375, "learning_rate": 0.0008908540494686097, "loss": 2.8209, "step": 2699 }, { "epoch": 0.0216, "grad_norm": 3.1875, "learning_rate": 0.0008907755105448703, "loss": 2.5655, "step": 2700 }, { "epoch": 0.021608, "grad_norm": 3.578125, "learning_rate": 0.0008906969468385818, "loss": 2.7825, "step": 2701 }, { "epoch": 0.021616, "grad_norm": 3.03125, "learning_rate": 0.0008906183583547268, "loss": 2.4338, "step": 2702 }, { "epoch": 0.021624, "grad_norm": 2.96875, "learning_rate": 0.0008905397450982891, "loss": 2.8684, "step": 2703 }, { "epoch": 0.021632, "grad_norm": 3.484375, "learning_rate": 0.0008904611070742546, "loss": 2.7274, "step": 2704 }, { "epoch": 0.02164, "grad_norm": 2.859375, "learning_rate": 0.0008903824442876101, "loss": 1.9534, "step": 2705 }, { "epoch": 0.021648, "grad_norm": 2.890625, "learning_rate": 0.0008903037567433443, "loss": 2.0491, "step": 2706 }, { "epoch": 0.021656, "grad_norm": 2.875, "learning_rate": 0.0008902250444464477, "loss": 2.2807, "step": 2707 }, { "epoch": 0.021664, "grad_norm": 2.9375, "learning_rate": 0.0008901463074019121, "loss": 2.1918, "step": 2708 }, { "epoch": 0.021672, "grad_norm": 3.40625, "learning_rate": 0.0008900675456147307, "loss": 2.73, "step": 2709 }, { "epoch": 0.02168, "grad_norm": 3.1875, "learning_rate": 0.0008899887590898988, "loss": 2.5409, "step": 2710 }, { "epoch": 0.021688, "grad_norm": 3.34375, "learning_rate": 0.0008899099478324126, "loss": 3.2864, "step": 2711 }, { "epoch": 0.021696, "grad_norm": 3.3125, "learning_rate": 0.0008898311118472706, "loss": 2.4325, "step": 2712 }, { "epoch": 0.021704, "grad_norm": 3.671875, "learning_rate": 0.0008897522511394723, "loss": 2.6474, "step": 2713 }, 
{ "epoch": 0.021712, "grad_norm": 2.609375, "learning_rate": 0.000889673365714019, "loss": 2.2069, "step": 2714 }, { "epoch": 0.02172, "grad_norm": 3.3125, "learning_rate": 0.0008895944555759134, "loss": 2.8683, "step": 2715 }, { "epoch": 0.021728, "grad_norm": 2.515625, "learning_rate": 0.00088951552073016, "loss": 2.6151, "step": 2716 }, { "epoch": 0.021736, "grad_norm": 2.875, "learning_rate": 0.0008894365611817648, "loss": 2.3645, "step": 2717 }, { "epoch": 0.021744, "grad_norm": 2.8125, "learning_rate": 0.0008893575769357352, "loss": 2.3964, "step": 2718 }, { "epoch": 0.021752, "grad_norm": 3.25, "learning_rate": 0.0008892785679970804, "loss": 1.9139, "step": 2719 }, { "epoch": 0.02176, "grad_norm": 2.796875, "learning_rate": 0.0008891995343708111, "loss": 2.5362, "step": 2720 }, { "epoch": 0.021768, "grad_norm": 3.09375, "learning_rate": 0.0008891204760619392, "loss": 2.6862, "step": 2721 }, { "epoch": 0.021776, "grad_norm": 3.34375, "learning_rate": 0.0008890413930754791, "loss": 2.4314, "step": 2722 }, { "epoch": 0.021784, "grad_norm": 3.515625, "learning_rate": 0.0008889622854164454, "loss": 2.8433, "step": 2723 }, { "epoch": 0.021792, "grad_norm": 4.34375, "learning_rate": 0.0008888831530898558, "loss": 3.4026, "step": 2724 }, { "epoch": 0.0218, "grad_norm": 2.96875, "learning_rate": 0.0008888039961007281, "loss": 2.4028, "step": 2725 }, { "epoch": 0.021808, "grad_norm": 3.0625, "learning_rate": 0.000888724814454083, "loss": 2.6242, "step": 2726 }, { "epoch": 0.021816, "grad_norm": 3.6875, "learning_rate": 0.0008886456081549414, "loss": 2.8139, "step": 2727 }, { "epoch": 0.021824, "grad_norm": 3.28125, "learning_rate": 0.000888566377208327, "loss": 2.8317, "step": 2728 }, { "epoch": 0.021832, "grad_norm": 3.203125, "learning_rate": 0.0008884871216192643, "loss": 2.4725, "step": 2729 }, { "epoch": 0.02184, "grad_norm": 2.84375, "learning_rate": 0.0008884078413927799, "loss": 1.8723, "step": 2730 }, { "epoch": 0.021848, "grad_norm": 2.890625, "learning_rate": 0.0008883285365339013, "loss": 2.7564, "step": 2731 }, { "epoch": 0.021856, "grad_norm": 2.671875, "learning_rate": 0.000888249207047658, "loss": 2.455, "step": 2732 }, { "epoch": 0.021864, "grad_norm": 2.25, "learning_rate": 0.0008881698529390811, "loss": 1.7137, "step": 2733 }, { "epoch": 0.021872, "grad_norm": 3.125, "learning_rate": 0.000888090474213203, "loss": 2.4261, "step": 2734 }, { "epoch": 0.02188, "grad_norm": 3.28125, "learning_rate": 0.0008880110708750581, "loss": 2.7401, "step": 2735 }, { "epoch": 0.021888, "grad_norm": 2.671875, "learning_rate": 0.0008879316429296817, "loss": 2.0955, "step": 2736 }, { "epoch": 0.021896, "grad_norm": 3.03125, "learning_rate": 0.0008878521903821115, "loss": 2.4692, "step": 2737 }, { "epoch": 0.021904, "grad_norm": 3.140625, "learning_rate": 0.0008877727132373858, "loss": 2.3211, "step": 2738 }, { "epoch": 0.021912, "grad_norm": 4.0, "learning_rate": 0.0008876932115005452, "loss": 3.1478, "step": 2739 }, { "epoch": 0.02192, "grad_norm": 3.4375, "learning_rate": 0.0008876136851766316, "loss": 2.5793, "step": 2740 }, { "epoch": 0.021928, "grad_norm": 3.25, "learning_rate": 0.0008875341342706884, "loss": 2.8439, "step": 2741 }, { "epoch": 0.021936, "grad_norm": 3.546875, "learning_rate": 0.0008874545587877607, "loss": 2.9575, "step": 2742 }, { "epoch": 0.021944, "grad_norm": 2.859375, "learning_rate": 0.0008873749587328951, "loss": 2.421, "step": 2743 }, { "epoch": 0.021952, "grad_norm": 3.171875, "learning_rate": 0.0008872953341111397, "loss": 2.6495, "step": 2744 }, { "epoch": 
0.02196, "grad_norm": 3.46875, "learning_rate": 0.0008872156849275442, "loss": 2.2591, "step": 2745 }, { "epoch": 0.021968, "grad_norm": 3.046875, "learning_rate": 0.0008871360111871598, "loss": 2.775, "step": 2746 }, { "epoch": 0.021976, "grad_norm": 3.625, "learning_rate": 0.0008870563128950395, "loss": 2.8198, "step": 2747 }, { "epoch": 0.021984, "grad_norm": 3.09375, "learning_rate": 0.0008869765900562376, "loss": 2.9607, "step": 2748 }, { "epoch": 0.021992, "grad_norm": 2.953125, "learning_rate": 0.0008868968426758101, "loss": 2.5933, "step": 2749 }, { "epoch": 0.022, "grad_norm": 2.5625, "learning_rate": 0.0008868170707588142, "loss": 3.2335, "step": 2750 }, { "epoch": 0.022008, "grad_norm": 2.65625, "learning_rate": 0.0008867372743103092, "loss": 2.535, "step": 2751 }, { "epoch": 0.022016, "grad_norm": 2.796875, "learning_rate": 0.0008866574533353557, "loss": 3.044, "step": 2752 }, { "epoch": 0.022024, "grad_norm": 2.5625, "learning_rate": 0.0008865776078390157, "loss": 2.2158, "step": 2753 }, { "epoch": 0.022032, "grad_norm": 3.40625, "learning_rate": 0.0008864977378263533, "loss": 2.188, "step": 2754 }, { "epoch": 0.02204, "grad_norm": 3.0625, "learning_rate": 0.0008864178433024332, "loss": 2.7737, "step": 2755 }, { "epoch": 0.022048, "grad_norm": 2.859375, "learning_rate": 0.0008863379242723226, "loss": 2.3352, "step": 2756 }, { "epoch": 0.022056, "grad_norm": 2.671875, "learning_rate": 0.0008862579807410898, "loss": 2.5635, "step": 2757 }, { "epoch": 0.022064, "grad_norm": 3.953125, "learning_rate": 0.0008861780127138048, "loss": 2.6765, "step": 2758 }, { "epoch": 0.022072, "grad_norm": 2.984375, "learning_rate": 0.0008860980201955389, "loss": 1.8557, "step": 2759 }, { "epoch": 0.02208, "grad_norm": 3.78125, "learning_rate": 0.0008860180031913653, "loss": 1.9509, "step": 2760 }, { "epoch": 0.022088, "grad_norm": 2.84375, "learning_rate": 0.0008859379617063585, "loss": 1.9022, "step": 2761 }, { "epoch": 0.022096, "grad_norm": 2.921875, "learning_rate": 0.0008858578957455947, "loss": 2.6258, "step": 2762 }, { "epoch": 0.022104, "grad_norm": 3.03125, "learning_rate": 0.0008857778053141514, "loss": 2.5362, "step": 2763 }, { "epoch": 0.022112, "grad_norm": 3.578125, "learning_rate": 0.0008856976904171081, "loss": 2.2888, "step": 2764 }, { "epoch": 0.02212, "grad_norm": 3.3125, "learning_rate": 0.0008856175510595455, "loss": 2.3145, "step": 2765 }, { "epoch": 0.022128, "grad_norm": 3.234375, "learning_rate": 0.000885537387246546, "loss": 2.5289, "step": 2766 }, { "epoch": 0.022136, "grad_norm": 3.015625, "learning_rate": 0.0008854571989831935, "loss": 2.4304, "step": 2767 }, { "epoch": 0.022144, "grad_norm": 2.5625, "learning_rate": 0.0008853769862745733, "loss": 2.4834, "step": 2768 }, { "epoch": 0.022152, "grad_norm": 3.15625, "learning_rate": 0.0008852967491257726, "loss": 2.8067, "step": 2769 }, { "epoch": 0.02216, "grad_norm": 3.3125, "learning_rate": 0.0008852164875418798, "loss": 2.4413, "step": 2770 }, { "epoch": 0.022168, "grad_norm": 2.78125, "learning_rate": 0.0008851362015279852, "loss": 2.5524, "step": 2771 }, { "epoch": 0.022176, "grad_norm": 2.90625, "learning_rate": 0.0008850558910891801, "loss": 2.6897, "step": 2772 }, { "epoch": 0.022184, "grad_norm": 14.75, "learning_rate": 0.000884975556230558, "loss": 2.6999, "step": 2773 }, { "epoch": 0.022192, "grad_norm": 3.125, "learning_rate": 0.0008848951969572137, "loss": 2.1837, "step": 2774 }, { "epoch": 0.0222, "grad_norm": 3.046875, "learning_rate": 0.000884814813274243, "loss": 2.8914, "step": 2775 }, { "epoch": 
0.022208, "grad_norm": 2.6875, "learning_rate": 0.0008847344051867443, "loss": 2.2246, "step": 2776 }, { "epoch": 0.022216, "grad_norm": 2.515625, "learning_rate": 0.0008846539726998169, "loss": 2.52, "step": 2777 }, { "epoch": 0.022224, "grad_norm": 3.0, "learning_rate": 0.0008845735158185615, "loss": 2.7436, "step": 2778 }, { "epoch": 0.022232, "grad_norm": 3.109375, "learning_rate": 0.0008844930345480807, "loss": 2.7163, "step": 2779 }, { "epoch": 0.02224, "grad_norm": 3.9375, "learning_rate": 0.0008844125288934786, "loss": 2.7293, "step": 2780 }, { "epoch": 0.022248, "grad_norm": 3.09375, "learning_rate": 0.0008843319988598607, "loss": 2.5206, "step": 2781 }, { "epoch": 0.022256, "grad_norm": 2.859375, "learning_rate": 0.0008842514444523342, "loss": 2.0235, "step": 2782 }, { "epoch": 0.022264, "grad_norm": 2.703125, "learning_rate": 0.0008841708656760075, "loss": 1.846, "step": 2783 }, { "epoch": 0.022272, "grad_norm": 3.140625, "learning_rate": 0.0008840902625359912, "loss": 2.215, "step": 2784 }, { "epoch": 0.02228, "grad_norm": 4.46875, "learning_rate": 0.0008840096350373971, "loss": 2.5577, "step": 2785 }, { "epoch": 0.022288, "grad_norm": 3.484375, "learning_rate": 0.0008839289831853381, "loss": 2.711, "step": 2786 }, { "epoch": 0.022296, "grad_norm": 2.578125, "learning_rate": 0.0008838483069849292, "loss": 2.0991, "step": 2787 }, { "epoch": 0.022304, "grad_norm": 2.84375, "learning_rate": 0.000883767606441287, "loss": 2.4561, "step": 2788 }, { "epoch": 0.022312, "grad_norm": 3.09375, "learning_rate": 0.0008836868815595291, "loss": 2.5592, "step": 2789 }, { "epoch": 0.02232, "grad_norm": 2.984375, "learning_rate": 0.0008836061323447752, "loss": 2.9781, "step": 2790 }, { "epoch": 0.022328, "grad_norm": 2.640625, "learning_rate": 0.0008835253588021465, "loss": 2.5323, "step": 2791 }, { "epoch": 0.022336, "grad_norm": 3.34375, "learning_rate": 0.0008834445609367652, "loss": 2.1223, "step": 2792 }, { "epoch": 0.022344, "grad_norm": 2.796875, "learning_rate": 0.0008833637387537556, "loss": 2.587, "step": 2793 }, { "epoch": 0.022352, "grad_norm": 2.796875, "learning_rate": 0.0008832828922582434, "loss": 2.4655, "step": 2794 }, { "epoch": 0.02236, "grad_norm": 3.265625, "learning_rate": 0.0008832020214553558, "loss": 2.3633, "step": 2795 }, { "epoch": 0.022368, "grad_norm": 2.796875, "learning_rate": 0.0008831211263502213, "loss": 2.4294, "step": 2796 }, { "epoch": 0.022376, "grad_norm": 2.328125, "learning_rate": 0.0008830402069479703, "loss": 2.1127, "step": 2797 }, { "epoch": 0.022384, "grad_norm": 2.734375, "learning_rate": 0.0008829592632537349, "loss": 2.4517, "step": 2798 }, { "epoch": 0.022392, "grad_norm": 3.484375, "learning_rate": 0.0008828782952726479, "loss": 2.3421, "step": 2799 }, { "epoch": 0.0224, "grad_norm": 2.65625, "learning_rate": 0.0008827973030098447, "loss": 2.0168, "step": 2800 }, { "epoch": 0.022408, "grad_norm": 3.6875, "learning_rate": 0.0008827162864704616, "loss": 2.2745, "step": 2801 }, { "epoch": 0.022416, "grad_norm": 3.390625, "learning_rate": 0.0008826352456596365, "loss": 2.3537, "step": 2802 }, { "epoch": 0.022424, "grad_norm": 3.28125, "learning_rate": 0.0008825541805825088, "loss": 2.5425, "step": 2803 }, { "epoch": 0.022432, "grad_norm": 2.859375, "learning_rate": 0.0008824730912442198, "loss": 2.2649, "step": 2804 }, { "epoch": 0.02244, "grad_norm": 3.421875, "learning_rate": 0.000882391977649912, "loss": 2.762, "step": 2805 }, { "epoch": 0.022448, "grad_norm": 3.75, "learning_rate": 0.0008823108398047297, "loss": 2.5092, "step": 2806 }, { 
"epoch": 0.022456, "grad_norm": 3.984375, "learning_rate": 0.0008822296777138183, "loss": 3.276, "step": 2807 }, { "epoch": 0.022464, "grad_norm": 3.171875, "learning_rate": 0.0008821484913823252, "loss": 2.6396, "step": 2808 }, { "epoch": 0.022472, "grad_norm": 3.421875, "learning_rate": 0.0008820672808153989, "loss": 2.2788, "step": 2809 }, { "epoch": 0.02248, "grad_norm": 2.625, "learning_rate": 0.00088198604601819, "loss": 2.8166, "step": 2810 }, { "epoch": 0.022488, "grad_norm": 2.59375, "learning_rate": 0.0008819047869958501, "loss": 2.7837, "step": 2811 }, { "epoch": 0.022496, "grad_norm": 2.84375, "learning_rate": 0.0008818235037535327, "loss": 3.1881, "step": 2812 }, { "epoch": 0.022504, "grad_norm": 3.015625, "learning_rate": 0.0008817421962963925, "loss": 3.0649, "step": 2813 }, { "epoch": 0.022512, "grad_norm": 9.625, "learning_rate": 0.0008816608646295861, "loss": 2.7576, "step": 2814 }, { "epoch": 0.02252, "grad_norm": 3.09375, "learning_rate": 0.0008815795087582715, "loss": 2.3427, "step": 2815 }, { "epoch": 0.022528, "grad_norm": 2.71875, "learning_rate": 0.0008814981286876081, "loss": 2.1709, "step": 2816 }, { "epoch": 0.022536, "grad_norm": 2.609375, "learning_rate": 0.0008814167244227569, "loss": 2.2394, "step": 2817 }, { "epoch": 0.022544, "grad_norm": 2.296875, "learning_rate": 0.0008813352959688805, "loss": 1.9512, "step": 2818 }, { "epoch": 0.022552, "grad_norm": 3.1875, "learning_rate": 0.000881253843331143, "loss": 2.6879, "step": 2819 }, { "epoch": 0.02256, "grad_norm": 2.859375, "learning_rate": 0.0008811723665147101, "loss": 2.7899, "step": 2820 }, { "epoch": 0.022568, "grad_norm": 5.625, "learning_rate": 0.0008810908655247489, "loss": 2.3615, "step": 2821 }, { "epoch": 0.022576, "grad_norm": 2.78125, "learning_rate": 0.0008810093403664281, "loss": 2.3863, "step": 2822 }, { "epoch": 0.022584, "grad_norm": 4.3125, "learning_rate": 0.000880927791044918, "loss": 2.8119, "step": 2823 }, { "epoch": 0.022592, "grad_norm": 2.640625, "learning_rate": 0.0008808462175653904, "loss": 2.3276, "step": 2824 }, { "epoch": 0.0226, "grad_norm": 2.625, "learning_rate": 0.0008807646199330186, "loss": 2.5113, "step": 2825 }, { "epoch": 0.022608, "grad_norm": 2.65625, "learning_rate": 0.0008806829981529772, "loss": 2.2518, "step": 2826 }, { "epoch": 0.022616, "grad_norm": 2.734375, "learning_rate": 0.0008806013522304427, "loss": 3.0695, "step": 2827 }, { "epoch": 0.022624, "grad_norm": 2.703125, "learning_rate": 0.0008805196821705931, "loss": 2.6053, "step": 2828 }, { "epoch": 0.022632, "grad_norm": 2.40625, "learning_rate": 0.0008804379879786078, "loss": 2.4383, "step": 2829 }, { "epoch": 0.02264, "grad_norm": 2.890625, "learning_rate": 0.0008803562696596675, "loss": 2.8487, "step": 2830 }, { "epoch": 0.022648, "grad_norm": 2.734375, "learning_rate": 0.0008802745272189549, "loss": 2.8328, "step": 2831 }, { "epoch": 0.022656, "grad_norm": 2.71875, "learning_rate": 0.0008801927606616542, "loss": 3.1136, "step": 2832 }, { "epoch": 0.022664, "grad_norm": 2.484375, "learning_rate": 0.0008801109699929506, "loss": 2.4261, "step": 2833 }, { "epoch": 0.022672, "grad_norm": 2.765625, "learning_rate": 0.0008800291552180314, "loss": 2.5252, "step": 2834 }, { "epoch": 0.02268, "grad_norm": 3.34375, "learning_rate": 0.0008799473163420851, "loss": 2.5203, "step": 2835 }, { "epoch": 0.022688, "grad_norm": 3.421875, "learning_rate": 0.0008798654533703017, "loss": 2.5983, "step": 2836 }, { "epoch": 0.022696, "grad_norm": 2.640625, "learning_rate": 0.0008797835663078732, "loss": 2.285, "step": 2837 
}, { "epoch": 0.022704, "grad_norm": 2.9375, "learning_rate": 0.0008797016551599924, "loss": 2.721, "step": 2838 }, { "epoch": 0.022712, "grad_norm": 2.8125, "learning_rate": 0.0008796197199318544, "loss": 2.4805, "step": 2839 }, { "epoch": 0.02272, "grad_norm": 3.46875, "learning_rate": 0.0008795377606286552, "loss": 2.7407, "step": 2840 }, { "epoch": 0.022728, "grad_norm": 3.296875, "learning_rate": 0.0008794557772555925, "loss": 2.7285, "step": 2841 }, { "epoch": 0.022736, "grad_norm": 2.828125, "learning_rate": 0.0008793737698178658, "loss": 2.4165, "step": 2842 }, { "epoch": 0.022744, "grad_norm": 3.125, "learning_rate": 0.0008792917383206758, "loss": 2.4299, "step": 2843 }, { "epoch": 0.022752, "grad_norm": 2.828125, "learning_rate": 0.0008792096827692249, "loss": 2.212, "step": 2844 }, { "epoch": 0.02276, "grad_norm": 3.28125, "learning_rate": 0.000879127603168717, "loss": 2.2449, "step": 2845 }, { "epoch": 0.022768, "grad_norm": 3.375, "learning_rate": 0.0008790454995243573, "loss": 2.6798, "step": 2846 }, { "epoch": 0.022776, "grad_norm": 2.53125, "learning_rate": 0.000878963371841353, "loss": 2.1559, "step": 2847 }, { "epoch": 0.022784, "grad_norm": 3.28125, "learning_rate": 0.0008788812201249123, "loss": 2.6488, "step": 2848 }, { "epoch": 0.022792, "grad_norm": 3.140625, "learning_rate": 0.0008787990443802453, "loss": 2.6832, "step": 2849 }, { "epoch": 0.0228, "grad_norm": 2.875, "learning_rate": 0.0008787168446125638, "loss": 2.8295, "step": 2850 }, { "epoch": 0.022808, "grad_norm": 2.90625, "learning_rate": 0.0008786346208270802, "loss": 2.2445, "step": 2851 }, { "epoch": 0.022816, "grad_norm": 3.359375, "learning_rate": 0.0008785523730290093, "loss": 2.466, "step": 2852 }, { "epoch": 0.022824, "grad_norm": 3.0, "learning_rate": 0.0008784701012235672, "loss": 2.1399, "step": 2853 }, { "epoch": 0.022832, "grad_norm": 3.734375, "learning_rate": 0.0008783878054159717, "loss": 2.5488, "step": 2854 }, { "epoch": 0.02284, "grad_norm": 2.515625, "learning_rate": 0.0008783054856114414, "loss": 2.1396, "step": 2855 }, { "epoch": 0.022848, "grad_norm": 3.3125, "learning_rate": 0.0008782231418151975, "loss": 3.2186, "step": 2856 }, { "epoch": 0.022856, "grad_norm": 2.75, "learning_rate": 0.0008781407740324617, "loss": 2.4643, "step": 2857 }, { "epoch": 0.022864, "grad_norm": 3.28125, "learning_rate": 0.000878058382268458, "loss": 2.2218, "step": 2858 }, { "epoch": 0.022872, "grad_norm": 2.8125, "learning_rate": 0.0008779759665284114, "loss": 2.3279, "step": 2859 }, { "epoch": 0.02288, "grad_norm": 7.6875, "learning_rate": 0.0008778935268175485, "loss": 2.0179, "step": 2860 }, { "epoch": 0.022888, "grad_norm": 16.25, "learning_rate": 0.000877811063141098, "loss": 2.4557, "step": 2861 }, { "epoch": 0.022896, "grad_norm": 3.046875, "learning_rate": 0.0008777285755042891, "loss": 2.957, "step": 2862 }, { "epoch": 0.022904, "grad_norm": 3.25, "learning_rate": 0.0008776460639123533, "loss": 3.0638, "step": 2863 }, { "epoch": 0.022912, "grad_norm": 2.953125, "learning_rate": 0.0008775635283705237, "loss": 2.4461, "step": 2864 }, { "epoch": 0.02292, "grad_norm": 2.84375, "learning_rate": 0.0008774809688840341, "loss": 2.6661, "step": 2865 }, { "epoch": 0.022928, "grad_norm": 3.328125, "learning_rate": 0.0008773983854581207, "loss": 2.219, "step": 2866 }, { "epoch": 0.022936, "grad_norm": 2.78125, "learning_rate": 0.0008773157780980206, "loss": 2.4478, "step": 2867 }, { "epoch": 0.022944, "grad_norm": 2.375, "learning_rate": 0.0008772331468089729, "loss": 2.3522, "step": 2868 }, { "epoch": 
0.022952, "grad_norm": 2.9375, "learning_rate": 0.0008771504915962179, "loss": 2.4297, "step": 2869 }, { "epoch": 0.02296, "grad_norm": 3.3125, "learning_rate": 0.0008770678124649972, "loss": 2.7615, "step": 2870 }, { "epoch": 0.022968, "grad_norm": 3.484375, "learning_rate": 0.0008769851094205548, "loss": 3.221, "step": 2871 }, { "epoch": 0.022976, "grad_norm": 2.5625, "learning_rate": 0.000876902382468135, "loss": 2.6064, "step": 2872 }, { "epoch": 0.022984, "grad_norm": 3.328125, "learning_rate": 0.0008768196316129848, "loss": 2.4384, "step": 2873 }, { "epoch": 0.022992, "grad_norm": 2.5625, "learning_rate": 0.0008767368568603519, "loss": 2.0107, "step": 2874 }, { "epoch": 0.023, "grad_norm": 2.328125, "learning_rate": 0.0008766540582154859, "loss": 2.2702, "step": 2875 }, { "epoch": 0.023008, "grad_norm": 3.59375, "learning_rate": 0.0008765712356836375, "loss": 2.5891, "step": 2876 }, { "epoch": 0.023016, "grad_norm": 3.171875, "learning_rate": 0.0008764883892700596, "loss": 2.7782, "step": 2877 }, { "epoch": 0.023024, "grad_norm": 2.609375, "learning_rate": 0.000876405518980006, "loss": 2.3485, "step": 2878 }, { "epoch": 0.023032, "grad_norm": 2.84375, "learning_rate": 0.0008763226248187324, "loss": 2.4231, "step": 2879 }, { "epoch": 0.02304, "grad_norm": 2.8125, "learning_rate": 0.0008762397067914956, "loss": 2.6416, "step": 2880 }, { "epoch": 0.023048, "grad_norm": 3.671875, "learning_rate": 0.0008761567649035545, "loss": 1.9322, "step": 2881 }, { "epoch": 0.023056, "grad_norm": 2.78125, "learning_rate": 0.0008760737991601688, "loss": 2.4219, "step": 2882 }, { "epoch": 0.023064, "grad_norm": 2.90625, "learning_rate": 0.0008759908095666005, "loss": 2.3416, "step": 2883 }, { "epoch": 0.023072, "grad_norm": 3.265625, "learning_rate": 0.0008759077961281123, "loss": 2.8897, "step": 2884 }, { "epoch": 0.02308, "grad_norm": 3.6875, "learning_rate": 0.0008758247588499693, "loss": 2.3112, "step": 2885 }, { "epoch": 0.023088, "grad_norm": 4.15625, "learning_rate": 0.0008757416977374372, "loss": 3.238, "step": 2886 }, { "epoch": 0.023096, "grad_norm": 3.0625, "learning_rate": 0.0008756586127957838, "loss": 2.5813, "step": 2887 }, { "epoch": 0.023104, "grad_norm": 3.140625, "learning_rate": 0.0008755755040302784, "loss": 3.0471, "step": 2888 }, { "epoch": 0.023112, "grad_norm": 2.921875, "learning_rate": 0.0008754923714461915, "loss": 2.4548, "step": 2889 }, { "epoch": 0.02312, "grad_norm": 2.828125, "learning_rate": 0.0008754092150487951, "loss": 2.4525, "step": 2890 }, { "epoch": 0.023128, "grad_norm": 2.875, "learning_rate": 0.0008753260348433634, "loss": 2.4126, "step": 2891 }, { "epoch": 0.023136, "grad_norm": 2.984375, "learning_rate": 0.000875242830835171, "loss": 2.6144, "step": 2892 }, { "epoch": 0.023144, "grad_norm": 3.125, "learning_rate": 0.0008751596030294951, "loss": 2.7061, "step": 2893 }, { "epoch": 0.023152, "grad_norm": 2.953125, "learning_rate": 0.0008750763514316135, "loss": 2.1614, "step": 2894 }, { "epoch": 0.02316, "grad_norm": 3.046875, "learning_rate": 0.0008749930760468063, "loss": 2.3196, "step": 2895 }, { "epoch": 0.023168, "grad_norm": 3.078125, "learning_rate": 0.0008749097768803546, "loss": 2.5121, "step": 2896 }, { "epoch": 0.023176, "grad_norm": 3.234375, "learning_rate": 0.000874826453937541, "loss": 2.7553, "step": 2897 }, { "epoch": 0.023184, "grad_norm": 3.09375, "learning_rate": 0.0008747431072236499, "loss": 2.6713, "step": 2898 }, { "epoch": 0.023192, "grad_norm": 3.234375, "learning_rate": 0.000874659736743967, "loss": 2.6395, "step": 2899 }, { "epoch": 
0.0232, "grad_norm": 2.890625, "learning_rate": 0.0008745763425037796, "loss": 2.8244, "step": 2900 }, { "epoch": 0.023208, "grad_norm": 2.71875, "learning_rate": 0.0008744929245083764, "loss": 2.2366, "step": 2901 }, { "epoch": 0.023216, "grad_norm": 2.6875, "learning_rate": 0.0008744094827630478, "loss": 2.4064, "step": 2902 }, { "epoch": 0.023224, "grad_norm": 3.28125, "learning_rate": 0.0008743260172730853, "loss": 2.3792, "step": 2903 }, { "epoch": 0.023232, "grad_norm": 2.9375, "learning_rate": 0.0008742425280437826, "loss": 2.2661, "step": 2904 }, { "epoch": 0.02324, "grad_norm": 2.484375, "learning_rate": 0.0008741590150804343, "loss": 1.8845, "step": 2905 }, { "epoch": 0.023248, "grad_norm": 3.296875, "learning_rate": 0.0008740754783883365, "loss": 2.7587, "step": 2906 }, { "epoch": 0.023256, "grad_norm": 2.890625, "learning_rate": 0.0008739919179727874, "loss": 2.256, "step": 2907 }, { "epoch": 0.023264, "grad_norm": 3.34375, "learning_rate": 0.0008739083338390861, "loss": 2.8376, "step": 2908 }, { "epoch": 0.023272, "grad_norm": 3.171875, "learning_rate": 0.0008738247259925333, "loss": 2.7308, "step": 2909 }, { "epoch": 0.02328, "grad_norm": 3.453125, "learning_rate": 0.0008737410944384314, "loss": 2.9598, "step": 2910 }, { "epoch": 0.023288, "grad_norm": 3.0, "learning_rate": 0.0008736574391820842, "loss": 2.3382, "step": 2911 }, { "epoch": 0.023296, "grad_norm": 3.3125, "learning_rate": 0.0008735737602287972, "loss": 2.5425, "step": 2912 }, { "epoch": 0.023304, "grad_norm": 2.625, "learning_rate": 0.0008734900575838771, "loss": 2.1453, "step": 2913 }, { "epoch": 0.023312, "grad_norm": 2.8125, "learning_rate": 0.0008734063312526323, "loss": 2.6776, "step": 2914 }, { "epoch": 0.02332, "grad_norm": 2.375, "learning_rate": 0.0008733225812403723, "loss": 2.1398, "step": 2915 }, { "epoch": 0.023328, "grad_norm": 2.859375, "learning_rate": 0.0008732388075524089, "loss": 2.4419, "step": 2916 }, { "epoch": 0.023336, "grad_norm": 2.984375, "learning_rate": 0.0008731550101940546, "loss": 2.5157, "step": 2917 }, { "epoch": 0.023344, "grad_norm": 4.71875, "learning_rate": 0.000873071189170624, "loss": 2.119, "step": 2918 }, { "epoch": 0.023352, "grad_norm": 2.4375, "learning_rate": 0.0008729873444874328, "loss": 1.9582, "step": 2919 }, { "epoch": 0.02336, "grad_norm": 3.140625, "learning_rate": 0.0008729034761497983, "loss": 2.6796, "step": 2920 }, { "epoch": 0.023368, "grad_norm": 3.625, "learning_rate": 0.0008728195841630393, "loss": 2.8931, "step": 2921 }, { "epoch": 0.023376, "grad_norm": 3.21875, "learning_rate": 0.0008727356685324761, "loss": 2.4876, "step": 2922 }, { "epoch": 0.023384, "grad_norm": 2.8125, "learning_rate": 0.0008726517292634308, "loss": 2.2251, "step": 2923 }, { "epoch": 0.023392, "grad_norm": 3.234375, "learning_rate": 0.0008725677663612266, "loss": 2.7085, "step": 2924 }, { "epoch": 0.0234, "grad_norm": 3.265625, "learning_rate": 0.0008724837798311882, "loss": 2.6973, "step": 2925 }, { "epoch": 0.023408, "grad_norm": 3.203125, "learning_rate": 0.0008723997696786423, "loss": 3.3758, "step": 2926 }, { "epoch": 0.023416, "grad_norm": 2.359375, "learning_rate": 0.0008723157359089162, "loss": 2.4612, "step": 2927 }, { "epoch": 0.023424, "grad_norm": 2.515625, "learning_rate": 0.0008722316785273397, "loss": 2.1113, "step": 2928 }, { "epoch": 0.023432, "grad_norm": 2.71875, "learning_rate": 0.0008721475975392432, "loss": 2.251, "step": 2929 }, { "epoch": 0.02344, "grad_norm": 3.1875, "learning_rate": 0.0008720634929499595, "loss": 2.5482, "step": 2930 }, { "epoch": 
0.023448, "grad_norm": 2.515625, "learning_rate": 0.000871979364764822, "loss": 2.2212, "step": 2931 }, { "epoch": 0.023456, "grad_norm": 2.328125, "learning_rate": 0.0008718952129891662, "loss": 2.2323, "step": 2932 }, { "epoch": 0.023464, "grad_norm": 2.515625, "learning_rate": 0.0008718110376283291, "loss": 2.0643, "step": 2933 }, { "epoch": 0.023472, "grad_norm": 3.0625, "learning_rate": 0.0008717268386876486, "loss": 2.5391, "step": 2934 }, { "epoch": 0.02348, "grad_norm": 3.25, "learning_rate": 0.0008716426161724648, "loss": 2.6047, "step": 2935 }, { "epoch": 0.023488, "grad_norm": 2.90625, "learning_rate": 0.0008715583700881189, "loss": 1.9966, "step": 2936 }, { "epoch": 0.023496, "grad_norm": 3.421875, "learning_rate": 0.0008714741004399536, "loss": 2.1759, "step": 2937 }, { "epoch": 0.023504, "grad_norm": 2.78125, "learning_rate": 0.0008713898072333134, "loss": 2.5757, "step": 2938 }, { "epoch": 0.023512, "grad_norm": 3.140625, "learning_rate": 0.000871305490473544, "loss": 2.6564, "step": 2939 }, { "epoch": 0.02352, "grad_norm": 3.0625, "learning_rate": 0.0008712211501659925, "loss": 2.6173, "step": 2940 }, { "epoch": 0.023528, "grad_norm": 2.5, "learning_rate": 0.000871136786316008, "loss": 2.4641, "step": 2941 }, { "epoch": 0.023536, "grad_norm": 3.578125, "learning_rate": 0.0008710523989289405, "loss": 2.7972, "step": 2942 }, { "epoch": 0.023544, "grad_norm": 3.25, "learning_rate": 0.0008709679880101417, "loss": 2.1223, "step": 2943 }, { "epoch": 0.023552, "grad_norm": 2.84375, "learning_rate": 0.0008708835535649653, "loss": 2.2115, "step": 2944 }, { "epoch": 0.02356, "grad_norm": 4.09375, "learning_rate": 0.0008707990955987655, "loss": 2.1785, "step": 2945 }, { "epoch": 0.023568, "grad_norm": 2.890625, "learning_rate": 0.0008707146141168987, "loss": 2.1864, "step": 2946 }, { "epoch": 0.023576, "grad_norm": 3.0, "learning_rate": 0.0008706301091247229, "loss": 2.8291, "step": 2947 }, { "epoch": 0.023584, "grad_norm": 3.25, "learning_rate": 0.0008705455806275968, "loss": 2.6243, "step": 2948 }, { "epoch": 0.023592, "grad_norm": 3.234375, "learning_rate": 0.0008704610286308816, "loss": 2.6982, "step": 2949 }, { "epoch": 0.0236, "grad_norm": 2.5625, "learning_rate": 0.0008703764531399392, "loss": 2.8177, "step": 2950 }, { "epoch": 0.023608, "grad_norm": 3.046875, "learning_rate": 0.0008702918541601334, "loss": 2.4225, "step": 2951 }, { "epoch": 0.023616, "grad_norm": 2.8125, "learning_rate": 0.0008702072316968293, "loss": 2.59, "step": 2952 }, { "epoch": 0.023624, "grad_norm": 3.03125, "learning_rate": 0.0008701225857553936, "loss": 2.3422, "step": 2953 }, { "epoch": 0.023632, "grad_norm": 3.015625, "learning_rate": 0.0008700379163411945, "loss": 2.5393, "step": 2954 }, { "epoch": 0.02364, "grad_norm": 2.625, "learning_rate": 0.0008699532234596014, "loss": 2.3685, "step": 2955 }, { "epoch": 0.023648, "grad_norm": 2.453125, "learning_rate": 0.0008698685071159858, "loss": 2.064, "step": 2956 }, { "epoch": 0.023656, "grad_norm": 3.046875, "learning_rate": 0.0008697837673157201, "loss": 2.3594, "step": 2957 }, { "epoch": 0.023664, "grad_norm": 2.625, "learning_rate": 0.0008696990040641786, "loss": 2.6877, "step": 2958 }, { "epoch": 0.023672, "grad_norm": 2.296875, "learning_rate": 0.0008696142173667365, "loss": 2.1699, "step": 2959 }, { "epoch": 0.02368, "grad_norm": 2.96875, "learning_rate": 0.0008695294072287712, "loss": 2.4378, "step": 2960 }, { "epoch": 0.023688, "grad_norm": 2.78125, "learning_rate": 0.0008694445736556611, "loss": 2.2859, "step": 2961 }, { "epoch": 0.023696, 
"grad_norm": 3.109375, "learning_rate": 0.0008693597166527865, "loss": 2.8432, "step": 2962 }, { "epoch": 0.023704, "grad_norm": 2.734375, "learning_rate": 0.0008692748362255286, "loss": 2.652, "step": 2963 }, { "epoch": 0.023712, "grad_norm": 3.0625, "learning_rate": 0.0008691899323792707, "loss": 2.1921, "step": 2964 }, { "epoch": 0.02372, "grad_norm": 3.203125, "learning_rate": 0.0008691050051193972, "loss": 2.4141, "step": 2965 }, { "epoch": 0.023728, "grad_norm": 3.0625, "learning_rate": 0.0008690200544512939, "loss": 2.3574, "step": 2966 }, { "epoch": 0.023736, "grad_norm": 2.984375, "learning_rate": 0.0008689350803803486, "loss": 2.4513, "step": 2967 }, { "epoch": 0.023744, "grad_norm": 2.90625, "learning_rate": 0.0008688500829119501, "loss": 2.9565, "step": 2968 }, { "epoch": 0.023752, "grad_norm": 2.984375, "learning_rate": 0.000868765062051489, "loss": 3.0206, "step": 2969 }, { "epoch": 0.02376, "grad_norm": 2.8125, "learning_rate": 0.000868680017804357, "loss": 1.9162, "step": 2970 }, { "epoch": 0.023768, "grad_norm": 3.1875, "learning_rate": 0.0008685949501759476, "loss": 2.5477, "step": 2971 }, { "epoch": 0.023776, "grad_norm": 3.046875, "learning_rate": 0.0008685098591716558, "loss": 2.6591, "step": 2972 }, { "epoch": 0.023784, "grad_norm": 3.171875, "learning_rate": 0.0008684247447968776, "loss": 2.3986, "step": 2973 }, { "epoch": 0.023792, "grad_norm": 3.6875, "learning_rate": 0.0008683396070570113, "loss": 2.8011, "step": 2974 }, { "epoch": 0.0238, "grad_norm": 3.171875, "learning_rate": 0.0008682544459574561, "loss": 2.8189, "step": 2975 }, { "epoch": 0.023808, "grad_norm": 3.234375, "learning_rate": 0.0008681692615036129, "loss": 2.4223, "step": 2976 }, { "epoch": 0.023816, "grad_norm": 3.203125, "learning_rate": 0.0008680840537008839, "loss": 2.866, "step": 2977 }, { "epoch": 0.023824, "grad_norm": 2.671875, "learning_rate": 0.0008679988225546726, "loss": 2.4255, "step": 2978 }, { "epoch": 0.023832, "grad_norm": 2.765625, "learning_rate": 0.0008679135680703849, "loss": 2.4559, "step": 2979 }, { "epoch": 0.02384, "grad_norm": 2.953125, "learning_rate": 0.0008678282902534272, "loss": 2.7454, "step": 2980 }, { "epoch": 0.023848, "grad_norm": 2.90625, "learning_rate": 0.0008677429891092074, "loss": 2.6687, "step": 2981 }, { "epoch": 0.023856, "grad_norm": 2.765625, "learning_rate": 0.0008676576646431358, "loss": 2.6819, "step": 2982 }, { "epoch": 0.023864, "grad_norm": 2.78125, "learning_rate": 0.0008675723168606232, "loss": 2.0289, "step": 2983 }, { "epoch": 0.023872, "grad_norm": 2.875, "learning_rate": 0.0008674869457670824, "loss": 2.6784, "step": 2984 }, { "epoch": 0.02388, "grad_norm": 2.875, "learning_rate": 0.0008674015513679276, "loss": 2.6302, "step": 2985 }, { "epoch": 0.023888, "grad_norm": 4.09375, "learning_rate": 0.0008673161336685742, "loss": 2.2753, "step": 2986 }, { "epoch": 0.023896, "grad_norm": 2.953125, "learning_rate": 0.0008672306926744396, "loss": 2.2744, "step": 2987 }, { "epoch": 0.023904, "grad_norm": 2.9375, "learning_rate": 0.0008671452283909421, "loss": 2.3297, "step": 2988 }, { "epoch": 0.023912, "grad_norm": 2.390625, "learning_rate": 0.0008670597408235017, "loss": 2.038, "step": 2989 }, { "epoch": 0.02392, "grad_norm": 3.171875, "learning_rate": 0.0008669742299775402, "loss": 3.3445, "step": 2990 }, { "epoch": 0.023928, "grad_norm": 2.640625, "learning_rate": 0.0008668886958584805, "loss": 2.8434, "step": 2991 }, { "epoch": 0.023936, "grad_norm": 3.203125, "learning_rate": 0.0008668031384717471, "loss": 2.5344, "step": 2992 }, { "epoch": 
0.023944, "grad_norm": 2.984375, "learning_rate": 0.0008667175578227659, "loss": 2.6811, "step": 2993 }, { "epoch": 0.023952, "grad_norm": 2.453125, "learning_rate": 0.0008666319539169643, "loss": 2.1195, "step": 2994 }, { "epoch": 0.02396, "grad_norm": 3.453125, "learning_rate": 0.0008665463267597713, "loss": 3.0531, "step": 2995 }, { "epoch": 0.023968, "grad_norm": 2.765625, "learning_rate": 0.0008664606763566172, "loss": 2.2616, "step": 2996 }, { "epoch": 0.023976, "grad_norm": 3.078125, "learning_rate": 0.0008663750027129338, "loss": 2.5259, "step": 2997 }, { "epoch": 0.023984, "grad_norm": 3.078125, "learning_rate": 0.0008662893058341546, "loss": 2.7361, "step": 2998 }, { "epoch": 0.023992, "grad_norm": 2.921875, "learning_rate": 0.0008662035857257142, "loss": 2.7473, "step": 2999 }, { "epoch": 0.024, "grad_norm": 2.90625, "learning_rate": 0.0008661178423930491, "loss": 2.4512, "step": 3000 }, { "epoch": 0.024008, "grad_norm": 3.21875, "learning_rate": 0.000866032075841597, "loss": 2.5451, "step": 3001 }, { "epoch": 0.024016, "grad_norm": 3.15625, "learning_rate": 0.0008659462860767968, "loss": 3.1254, "step": 3002 }, { "epoch": 0.024024, "grad_norm": 2.453125, "learning_rate": 0.0008658604731040896, "loss": 1.8937, "step": 3003 }, { "epoch": 0.024032, "grad_norm": 2.859375, "learning_rate": 0.0008657746369289174, "loss": 2.5416, "step": 3004 }, { "epoch": 0.02404, "grad_norm": 2.859375, "learning_rate": 0.000865688777556724, "loss": 2.6178, "step": 3005 }, { "epoch": 0.024048, "grad_norm": 2.34375, "learning_rate": 0.0008656028949929541, "loss": 2.2434, "step": 3006 }, { "epoch": 0.024056, "grad_norm": 2.671875, "learning_rate": 0.0008655169892430545, "loss": 2.0667, "step": 3007 }, { "epoch": 0.024064, "grad_norm": 2.5625, "learning_rate": 0.0008654310603124734, "loss": 2.1541, "step": 3008 }, { "epoch": 0.024072, "grad_norm": 2.96875, "learning_rate": 0.0008653451082066601, "loss": 1.7822, "step": 3009 }, { "epoch": 0.02408, "grad_norm": 2.984375, "learning_rate": 0.0008652591329310657, "loss": 2.8261, "step": 3010 }, { "epoch": 0.024088, "grad_norm": 3.0625, "learning_rate": 0.0008651731344911427, "loss": 2.7768, "step": 3011 }, { "epoch": 0.024096, "grad_norm": 3.09375, "learning_rate": 0.000865087112892345, "loss": 2.8121, "step": 3012 }, { "epoch": 0.024104, "grad_norm": 3.515625, "learning_rate": 0.0008650010681401279, "loss": 3.125, "step": 3013 }, { "epoch": 0.024112, "grad_norm": 2.484375, "learning_rate": 0.0008649150002399482, "loss": 2.8669, "step": 3014 }, { "epoch": 0.02412, "grad_norm": 2.71875, "learning_rate": 0.0008648289091972646, "loss": 2.6622, "step": 3015 }, { "epoch": 0.024128, "grad_norm": 2.78125, "learning_rate": 0.0008647427950175363, "loss": 2.382, "step": 3016 }, { "epoch": 0.024136, "grad_norm": 2.4375, "learning_rate": 0.0008646566577062252, "loss": 2.2256, "step": 3017 }, { "epoch": 0.024144, "grad_norm": 3.1875, "learning_rate": 0.0008645704972687936, "loss": 3.1233, "step": 3018 }, { "epoch": 0.024152, "grad_norm": 2.84375, "learning_rate": 0.0008644843137107057, "loss": 2.6507, "step": 3019 }, { "epoch": 0.02416, "grad_norm": 2.765625, "learning_rate": 0.0008643981070374276, "loss": 3.0515, "step": 3020 }, { "epoch": 0.024168, "grad_norm": 3.078125, "learning_rate": 0.000864311877254426, "loss": 2.9681, "step": 3021 }, { "epoch": 0.024176, "grad_norm": 2.9375, "learning_rate": 0.0008642256243671696, "loss": 2.2872, "step": 3022 }, { "epoch": 0.024184, "grad_norm": 2.640625, "learning_rate": 0.0008641393483811285, "loss": 2.293, "step": 3023 }, { 
"epoch": 0.024192, "grad_norm": 3.109375, "learning_rate": 0.0008640530493017742, "loss": 1.9885, "step": 3024 }, { "epoch": 0.0242, "grad_norm": 3.5625, "learning_rate": 0.0008639667271345798, "loss": 2.4693, "step": 3025 }, { "epoch": 0.024208, "grad_norm": 3.40625, "learning_rate": 0.0008638803818850196, "loss": 2.7523, "step": 3026 }, { "epoch": 0.024216, "grad_norm": 2.796875, "learning_rate": 0.0008637940135585697, "loss": 2.4176, "step": 3027 }, { "epoch": 0.024224, "grad_norm": 2.71875, "learning_rate": 0.0008637076221607073, "loss": 2.5357, "step": 3028 }, { "epoch": 0.024232, "grad_norm": 2.9375, "learning_rate": 0.0008636212076969113, "loss": 2.6171, "step": 3029 }, { "epoch": 0.02424, "grad_norm": 2.828125, "learning_rate": 0.0008635347701726621, "loss": 3.0676, "step": 3030 }, { "epoch": 0.024248, "grad_norm": 3.15625, "learning_rate": 0.0008634483095934413, "loss": 2.9592, "step": 3031 }, { "epoch": 0.024256, "grad_norm": 2.734375, "learning_rate": 0.0008633618259647324, "loss": 2.5606, "step": 3032 }, { "epoch": 0.024264, "grad_norm": 2.953125, "learning_rate": 0.0008632753192920197, "loss": 2.4941, "step": 3033 }, { "epoch": 0.024272, "grad_norm": 161.0, "learning_rate": 0.0008631887895807898, "loss": 2.1177, "step": 3034 }, { "epoch": 0.02428, "grad_norm": 2.84375, "learning_rate": 0.00086310223683653, "loss": 2.6016, "step": 3035 }, { "epoch": 0.024288, "grad_norm": 2.484375, "learning_rate": 0.0008630156610647295, "loss": 1.5545, "step": 3036 }, { "epoch": 0.024296, "grad_norm": 3.5, "learning_rate": 0.0008629290622708789, "loss": 2.3814, "step": 3037 }, { "epoch": 0.024304, "grad_norm": 3.0, "learning_rate": 0.00086284244046047, "loss": 2.2921, "step": 3038 }, { "epoch": 0.024312, "grad_norm": 3.734375, "learning_rate": 0.0008627557956389964, "loss": 2.4122, "step": 3039 }, { "epoch": 0.02432, "grad_norm": 2.609375, "learning_rate": 0.0008626691278119531, "loss": 2.6033, "step": 3040 }, { "epoch": 0.024328, "grad_norm": 3.0, "learning_rate": 0.0008625824369848363, "loss": 2.2241, "step": 3041 }, { "epoch": 0.024336, "grad_norm": 3.046875, "learning_rate": 0.0008624957231631441, "loss": 2.6945, "step": 3042 }, { "epoch": 0.024344, "grad_norm": 2.796875, "learning_rate": 0.0008624089863523754, "loss": 2.4489, "step": 3043 }, { "epoch": 0.024352, "grad_norm": 2.953125, "learning_rate": 0.0008623222265580311, "loss": 2.8369, "step": 3044 }, { "epoch": 0.02436, "grad_norm": 2.890625, "learning_rate": 0.0008622354437856137, "loss": 2.996, "step": 3045 }, { "epoch": 0.024368, "grad_norm": 2.78125, "learning_rate": 0.0008621486380406265, "loss": 2.6307, "step": 3046 }, { "epoch": 0.024376, "grad_norm": 2.625, "learning_rate": 0.0008620618093285748, "loss": 2.5986, "step": 3047 }, { "epoch": 0.024384, "grad_norm": 2.546875, "learning_rate": 0.0008619749576549651, "loss": 2.5019, "step": 3048 }, { "epoch": 0.024392, "grad_norm": 2.921875, "learning_rate": 0.0008618880830253055, "loss": 2.2345, "step": 3049 }, { "epoch": 0.0244, "grad_norm": 2.703125, "learning_rate": 0.0008618011854451056, "loss": 2.5384, "step": 3050 }, { "epoch": 0.024408, "grad_norm": 2.40625, "learning_rate": 0.000861714264919876, "loss": 2.7357, "step": 3051 }, { "epoch": 0.024416, "grad_norm": 2.4375, "learning_rate": 0.0008616273214551293, "loss": 2.3187, "step": 3052 }, { "epoch": 0.024424, "grad_norm": 2.5625, "learning_rate": 0.0008615403550563796, "loss": 2.2372, "step": 3053 }, { "epoch": 0.024432, "grad_norm": 2.734375, "learning_rate": 0.0008614533657291419, "loss": 2.1544, "step": 3054 }, { 
"epoch": 0.02444, "grad_norm": 2.625, "learning_rate": 0.0008613663534789331, "loss": 2.7091, "step": 3055 }, { "epoch": 0.024448, "grad_norm": 2.625, "learning_rate": 0.0008612793183112715, "loss": 2.4594, "step": 3056 }, { "epoch": 0.024456, "grad_norm": 3.03125, "learning_rate": 0.0008611922602316764, "loss": 2.7221, "step": 3057 }, { "epoch": 0.024464, "grad_norm": 3.09375, "learning_rate": 0.0008611051792456692, "loss": 2.2097, "step": 3058 }, { "epoch": 0.024472, "grad_norm": 3.5625, "learning_rate": 0.0008610180753587727, "loss": 2.472, "step": 3059 }, { "epoch": 0.02448, "grad_norm": 3.09375, "learning_rate": 0.0008609309485765104, "loss": 2.5445, "step": 3060 }, { "epoch": 0.024488, "grad_norm": 2.875, "learning_rate": 0.0008608437989044083, "loss": 1.9301, "step": 3061 }, { "epoch": 0.024496, "grad_norm": 2.71875, "learning_rate": 0.000860756626347993, "loss": 2.5157, "step": 3062 }, { "epoch": 0.024504, "grad_norm": 2.90625, "learning_rate": 0.0008606694309127931, "loss": 2.3455, "step": 3063 }, { "epoch": 0.024512, "grad_norm": 2.734375, "learning_rate": 0.0008605822126043382, "loss": 2.5209, "step": 3064 }, { "epoch": 0.02452, "grad_norm": 3.15625, "learning_rate": 0.0008604949714281598, "loss": 2.6279, "step": 3065 }, { "epoch": 0.024528, "grad_norm": 3.375, "learning_rate": 0.0008604077073897905, "loss": 3.0494, "step": 3066 }, { "epoch": 0.024536, "grad_norm": 2.828125, "learning_rate": 0.0008603204204947647, "loss": 2.4673, "step": 3067 }, { "epoch": 0.024544, "grad_norm": 3.046875, "learning_rate": 0.0008602331107486179, "loss": 2.2317, "step": 3068 }, { "epoch": 0.024552, "grad_norm": 3.359375, "learning_rate": 0.000860145778156887, "loss": 2.8142, "step": 3069 }, { "epoch": 0.02456, "grad_norm": 2.703125, "learning_rate": 0.0008600584227251107, "loss": 2.3699, "step": 3070 }, { "epoch": 0.024568, "grad_norm": 4.03125, "learning_rate": 0.0008599710444588292, "loss": 2.1053, "step": 3071 }, { "epoch": 0.024576, "grad_norm": 4.0, "learning_rate": 0.0008598836433635835, "loss": 3.4209, "step": 3072 }, { "epoch": 0.024584, "grad_norm": 3.46875, "learning_rate": 0.0008597962194449169, "loss": 2.3035, "step": 3073 }, { "epoch": 0.024592, "grad_norm": 2.84375, "learning_rate": 0.0008597087727083735, "loss": 2.4491, "step": 3074 }, { "epoch": 0.0246, "grad_norm": 3.3125, "learning_rate": 0.000859621303159499, "loss": 2.8485, "step": 3075 }, { "epoch": 0.024608, "grad_norm": 3.234375, "learning_rate": 0.0008595338108038408, "loss": 2.7169, "step": 3076 }, { "epoch": 0.024616, "grad_norm": 2.984375, "learning_rate": 0.0008594462956469475, "loss": 2.6056, "step": 3077 }, { "epoch": 0.024624, "grad_norm": 3.53125, "learning_rate": 0.0008593587576943693, "loss": 2.4043, "step": 3078 }, { "epoch": 0.024632, "grad_norm": 3.640625, "learning_rate": 0.0008592711969516576, "loss": 2.7042, "step": 3079 }, { "epoch": 0.02464, "grad_norm": 3.984375, "learning_rate": 0.0008591836134243656, "loss": 2.3026, "step": 3080 }, { "epoch": 0.024648, "grad_norm": 3.109375, "learning_rate": 0.0008590960071180473, "loss": 2.326, "step": 3081 }, { "epoch": 0.024656, "grad_norm": 3.40625, "learning_rate": 0.0008590083780382594, "loss": 2.6963, "step": 3082 }, { "epoch": 0.024664, "grad_norm": 2.703125, "learning_rate": 0.0008589207261905584, "loss": 2.6535, "step": 3083 }, { "epoch": 0.024672, "grad_norm": 3.140625, "learning_rate": 0.0008588330515805036, "loss": 1.8577, "step": 3084 }, { "epoch": 0.02468, "grad_norm": 2.875, "learning_rate": 0.000858745354213655, "loss": 2.1847, "step": 3085 }, { 
"epoch": 0.024688, "grad_norm": 3.0625, "learning_rate": 0.0008586576340955745, "loss": 2.4716, "step": 3086 }, { "epoch": 0.024696, "grad_norm": 2.484375, "learning_rate": 0.000858569891231825, "loss": 2.4345, "step": 3087 }, { "epoch": 0.024704, "grad_norm": 3.078125, "learning_rate": 0.0008584821256279708, "loss": 2.4492, "step": 3088 }, { "epoch": 0.024712, "grad_norm": 2.734375, "learning_rate": 0.0008583943372895786, "loss": 2.6639, "step": 3089 }, { "epoch": 0.02472, "grad_norm": 2.71875, "learning_rate": 0.0008583065262222153, "loss": 2.9147, "step": 3090 }, { "epoch": 0.024728, "grad_norm": 3.828125, "learning_rate": 0.0008582186924314499, "loss": 2.7793, "step": 3091 }, { "epoch": 0.024736, "grad_norm": 2.65625, "learning_rate": 0.0008581308359228528, "loss": 1.8461, "step": 3092 }, { "epoch": 0.024744, "grad_norm": 2.984375, "learning_rate": 0.0008580429567019959, "loss": 2.4162, "step": 3093 }, { "epoch": 0.024752, "grad_norm": 2.8125, "learning_rate": 0.0008579550547744519, "loss": 2.999, "step": 3094 }, { "epoch": 0.02476, "grad_norm": 2.859375, "learning_rate": 0.0008578671301457958, "loss": 2.7828, "step": 3095 }, { "epoch": 0.024768, "grad_norm": 2.84375, "learning_rate": 0.0008577791828216037, "loss": 2.6608, "step": 3096 }, { "epoch": 0.024776, "grad_norm": 2.65625, "learning_rate": 0.0008576912128074532, "loss": 2.4075, "step": 3097 }, { "epoch": 0.024784, "grad_norm": 2.75, "learning_rate": 0.000857603220108923, "loss": 2.7511, "step": 3098 }, { "epoch": 0.024792, "grad_norm": 2.984375, "learning_rate": 0.0008575152047315936, "loss": 2.3954, "step": 3099 }, { "epoch": 0.0248, "grad_norm": 3.484375, "learning_rate": 0.0008574271666810469, "loss": 2.9334, "step": 3100 }, { "epoch": 0.024808, "grad_norm": 2.6875, "learning_rate": 0.0008573391059628661, "loss": 2.9789, "step": 3101 }, { "epoch": 0.024816, "grad_norm": 2.921875, "learning_rate": 0.000857251022582636, "loss": 3.0947, "step": 3102 }, { "epoch": 0.024824, "grad_norm": 3.140625, "learning_rate": 0.0008571629165459427, "loss": 2.3362, "step": 3103 }, { "epoch": 0.024832, "grad_norm": 2.421875, "learning_rate": 0.0008570747878583738, "loss": 2.1528, "step": 3104 }, { "epoch": 0.02484, "grad_norm": 2.796875, "learning_rate": 0.0008569866365255181, "loss": 2.3441, "step": 3105 }, { "epoch": 0.024848, "grad_norm": 3.265625, "learning_rate": 0.0008568984625529666, "loss": 2.8677, "step": 3106 }, { "epoch": 0.024856, "grad_norm": 3.015625, "learning_rate": 0.0008568102659463106, "loss": 2.7061, "step": 3107 }, { "epoch": 0.024864, "grad_norm": 2.828125, "learning_rate": 0.0008567220467111438, "loss": 2.4004, "step": 3108 }, { "epoch": 0.024872, "grad_norm": 3.296875, "learning_rate": 0.0008566338048530608, "loss": 3.3409, "step": 3109 }, { "epoch": 0.02488, "grad_norm": 2.921875, "learning_rate": 0.0008565455403776579, "loss": 2.5545, "step": 3110 }, { "epoch": 0.024888, "grad_norm": 8.25, "learning_rate": 0.0008564572532905328, "loss": 2.3574, "step": 3111 }, { "epoch": 0.024896, "grad_norm": 3.390625, "learning_rate": 0.0008563689435972843, "loss": 2.7647, "step": 3112 }, { "epoch": 0.024904, "grad_norm": 3.21875, "learning_rate": 0.0008562806113035131, "loss": 3.1895, "step": 3113 }, { "epoch": 0.024912, "grad_norm": 3.3125, "learning_rate": 0.0008561922564148212, "loss": 2.2934, "step": 3114 }, { "epoch": 0.02492, "grad_norm": 3.15625, "learning_rate": 0.0008561038789368118, "loss": 2.5585, "step": 3115 }, { "epoch": 0.024928, "grad_norm": 3.15625, "learning_rate": 0.0008560154788750898, "loss": 2.5898, "step": 
3116 }, { "epoch": 0.024936, "grad_norm": 2.71875, "learning_rate": 0.0008559270562352615, "loss": 2.668, "step": 3117 }, { "epoch": 0.024944, "grad_norm": 2.5, "learning_rate": 0.0008558386110229342, "loss": 2.1405, "step": 3118 }, { "epoch": 0.024952, "grad_norm": 2.5, "learning_rate": 0.0008557501432437175, "loss": 2.4516, "step": 3119 }, { "epoch": 0.02496, "grad_norm": 2.9375, "learning_rate": 0.0008556616529032215, "loss": 2.5183, "step": 3120 }, { "epoch": 0.024968, "grad_norm": 2.640625, "learning_rate": 0.0008555731400070586, "loss": 2.4015, "step": 3121 }, { "epoch": 0.024976, "grad_norm": 2.765625, "learning_rate": 0.0008554846045608418, "loss": 2.1858, "step": 3122 }, { "epoch": 0.024984, "grad_norm": 3.0, "learning_rate": 0.0008553960465701862, "loss": 2.8182, "step": 3123 }, { "epoch": 0.024992, "grad_norm": 2.90625, "learning_rate": 0.0008553074660407078, "loss": 2.8037, "step": 3124 }, { "epoch": 0.025, "grad_norm": 2.734375, "learning_rate": 0.0008552188629780245, "loss": 2.4007, "step": 3125 }, { "epoch": 0.025, "eval_loss": 2.4781594276428223, "eval_runtime": 2848.7828, "eval_samples_per_second": 35.103, "eval_steps_per_second": 4.388, "step": 3125 } ], "logging_steps": 1, "max_steps": 12500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 3125, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.836543770231603e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }