{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 8316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008417508417508417, "grad_norm": 5.777827739715576, "learning_rate": 1.2019230769230771e-08, "loss": 0.8113, "step": 1 }, { "epoch": 0.0016835016835016834, "grad_norm": 5.797756195068359, "learning_rate": 2.4038461538461542e-08, "loss": 0.8351, "step": 2 }, { "epoch": 0.0025252525252525255, "grad_norm": 5.98045015335083, "learning_rate": 3.6057692307692314e-08, "loss": 0.859, "step": 3 }, { "epoch": 0.003367003367003367, "grad_norm": 5.880300998687744, "learning_rate": 4.8076923076923085e-08, "loss": 0.8417, "step": 4 }, { "epoch": 0.004208754208754209, "grad_norm": 5.923773288726807, "learning_rate": 6.009615384615386e-08, "loss": 0.8537, "step": 5 }, { "epoch": 0.005050505050505051, "grad_norm": 6.180328369140625, "learning_rate": 7.211538461538463e-08, "loss": 0.9081, "step": 6 }, { "epoch": 0.005892255892255892, "grad_norm": 6.0655717849731445, "learning_rate": 8.413461538461539e-08, "loss": 0.9023, "step": 7 }, { "epoch": 0.006734006734006734, "grad_norm": 6.020984649658203, "learning_rate": 9.615384615384617e-08, "loss": 0.8704, "step": 8 }, { "epoch": 0.007575757575757576, "grad_norm": 6.081356525421143, "learning_rate": 1.0817307692307693e-07, "loss": 0.8621, "step": 9 }, { "epoch": 0.008417508417508417, "grad_norm": 5.947510242462158, "learning_rate": 1.201923076923077e-07, "loss": 0.8686, "step": 10 }, { "epoch": 0.009259259259259259, "grad_norm": 5.824235916137695, "learning_rate": 1.3221153846153847e-07, "loss": 0.8644, "step": 11 }, { "epoch": 0.010101010101010102, "grad_norm": 6.13997220993042, "learning_rate": 1.4423076923076925e-07, "loss": 0.8569, "step": 12 }, { "epoch": 0.010942760942760943, "grad_norm": 5.873048305511475, "learning_rate": 1.5625e-07, "loss": 0.8887, "step": 13 }, { "epoch": 0.011784511784511785, "grad_norm": 6.115290641784668, "learning_rate": 1.6826923076923077e-07, "loss": 0.8948, "step": 14 }, { "epoch": 0.012626262626262626, "grad_norm": 6.093770503997803, "learning_rate": 1.8028846153846153e-07, "loss": 0.9025, "step": 15 }, { "epoch": 0.013468013468013467, "grad_norm": 5.623298168182373, "learning_rate": 1.9230769230769234e-07, "loss": 0.8168, "step": 16 }, { "epoch": 0.01430976430976431, "grad_norm": 6.122828960418701, "learning_rate": 2.043269230769231e-07, "loss": 0.9119, "step": 17 }, { "epoch": 0.015151515151515152, "grad_norm": 5.664336204528809, "learning_rate": 2.1634615384615386e-07, "loss": 0.855, "step": 18 }, { "epoch": 0.015993265993265993, "grad_norm": 5.8011980056762695, "learning_rate": 2.2836538461538461e-07, "loss": 0.8877, "step": 19 }, { "epoch": 0.016835016835016835, "grad_norm": 5.866971492767334, "learning_rate": 2.403846153846154e-07, "loss": 0.8492, "step": 20 }, { "epoch": 0.017676767676767676, "grad_norm": 5.559382915496826, "learning_rate": 2.5240384615384616e-07, "loss": 0.8525, "step": 21 }, { "epoch": 0.018518518518518517, "grad_norm": 5.61030387878418, "learning_rate": 2.6442307692307694e-07, "loss": 0.8424, "step": 22 }, { "epoch": 0.01936026936026936, "grad_norm": 5.678184509277344, "learning_rate": 2.764423076923077e-07, "loss": 0.8551, "step": 23 }, { "epoch": 0.020202020202020204, "grad_norm": 5.56907320022583, "learning_rate": 2.884615384615385e-07, "loss": 0.8814, "step": 24 }, { "epoch": 0.021043771043771045, "grad_norm": 5.6745991706848145, "learning_rate": 3.0048076923076924e-07, "loss": 0.8844, "step": 25 }, { "epoch": 0.021885521885521887, "grad_norm": 5.691203594207764, "learning_rate": 3.125e-07, "loss": 0.9013, "step": 26 }, { "epoch": 0.022727272727272728, "grad_norm": 4.825611114501953, "learning_rate": 3.245192307692308e-07, "loss": 0.8386, "step": 27 }, { "epoch": 0.02356902356902357, "grad_norm": 4.8627543449401855, "learning_rate": 3.3653846153846154e-07, "loss": 0.8285, "step": 28 }, { "epoch": 0.02441077441077441, "grad_norm": 4.61903715133667, "learning_rate": 3.485576923076923e-07, "loss": 0.8023, "step": 29 }, { "epoch": 0.025252525252525252, "grad_norm": 4.752788066864014, "learning_rate": 3.6057692307692306e-07, "loss": 0.8445, "step": 30 }, { "epoch": 0.026094276094276093, "grad_norm": 4.592252254486084, "learning_rate": 3.725961538461539e-07, "loss": 0.8358, "step": 31 }, { "epoch": 0.026936026936026935, "grad_norm": 4.415046691894531, "learning_rate": 3.846153846153847e-07, "loss": 0.8192, "step": 32 }, { "epoch": 0.027777777777777776, "grad_norm": 4.153048038482666, "learning_rate": 3.966346153846154e-07, "loss": 0.8166, "step": 33 }, { "epoch": 0.02861952861952862, "grad_norm": 4.3181633949279785, "learning_rate": 4.086538461538462e-07, "loss": 0.7971, "step": 34 }, { "epoch": 0.029461279461279462, "grad_norm": 4.163095474243164, "learning_rate": 4.20673076923077e-07, "loss": 0.8181, "step": 35 }, { "epoch": 0.030303030303030304, "grad_norm": 3.9594931602478027, "learning_rate": 4.326923076923077e-07, "loss": 0.7972, "step": 36 }, { "epoch": 0.031144781144781145, "grad_norm": 3.5322697162628174, "learning_rate": 4.447115384615385e-07, "loss": 0.7695, "step": 37 }, { "epoch": 0.03198653198653199, "grad_norm": 2.7330048084259033, "learning_rate": 4.5673076923076923e-07, "loss": 0.7938, "step": 38 }, { "epoch": 0.03282828282828283, "grad_norm": 2.5289535522460938, "learning_rate": 4.6875000000000006e-07, "loss": 0.7823, "step": 39 }, { "epoch": 0.03367003367003367, "grad_norm": 2.341895580291748, "learning_rate": 4.807692307692308e-07, "loss": 0.7305, "step": 40 }, { "epoch": 0.034511784511784514, "grad_norm": 2.5023305416107178, "learning_rate": 4.927884615384615e-07, "loss": 0.7584, "step": 41 }, { "epoch": 0.03535353535353535, "grad_norm": 2.3906912803649902, "learning_rate": 5.048076923076923e-07, "loss": 0.7408, "step": 42 }, { "epoch": 0.0361952861952862, "grad_norm": 2.4439585208892822, "learning_rate": 5.168269230769231e-07, "loss": 0.7826, "step": 43 }, { "epoch": 0.037037037037037035, "grad_norm": 2.2566750049591064, "learning_rate": 5.288461538461539e-07, "loss": 0.7262, "step": 44 }, { "epoch": 0.03787878787878788, "grad_norm": 2.1312367916107178, "learning_rate": 5.408653846153847e-07, "loss": 0.765, "step": 45 }, { "epoch": 0.03872053872053872, "grad_norm": 2.0412192344665527, "learning_rate": 5.528846153846155e-07, "loss": 0.7228, "step": 46 }, { "epoch": 0.03956228956228956, "grad_norm": 2.200086832046509, "learning_rate": 5.649038461538462e-07, "loss": 0.805, "step": 47 }, { "epoch": 0.04040404040404041, "grad_norm": 1.9725497961044312, "learning_rate": 5.76923076923077e-07, "loss": 0.7251, "step": 48 }, { "epoch": 0.041245791245791245, "grad_norm": 1.7760865688323975, "learning_rate": 5.889423076923077e-07, "loss": 0.7468, "step": 49 }, { "epoch": 0.04208754208754209, "grad_norm": 1.6021575927734375, "learning_rate": 6.009615384615385e-07, "loss": 0.6935, "step": 50 }, { "epoch": 0.04292929292929293, "grad_norm": 1.381394386291504, "learning_rate": 6.129807692307693e-07, "loss": 0.7139, "step": 51 }, { "epoch": 0.04377104377104377, "grad_norm": 1.6592602729797363, "learning_rate": 6.25e-07, "loss": 0.7114, "step": 52 }, { "epoch": 0.04461279461279461, "grad_norm": 1.7348512411117554, "learning_rate": 6.370192307692307e-07, "loss": 0.7468, "step": 53 }, { "epoch": 0.045454545454545456, "grad_norm": 1.7562978267669678, "learning_rate": 6.490384615384616e-07, "loss": 0.7291, "step": 54 }, { "epoch": 0.046296296296296294, "grad_norm": 1.7896093130111694, "learning_rate": 6.610576923076924e-07, "loss": 0.7167, "step": 55 }, { "epoch": 0.04713804713804714, "grad_norm": 1.6752034425735474, "learning_rate": 6.730769230769231e-07, "loss": 0.6891, "step": 56 }, { "epoch": 0.047979797979797977, "grad_norm": 1.5821552276611328, "learning_rate": 6.85096153846154e-07, "loss": 0.7164, "step": 57 }, { "epoch": 0.04882154882154882, "grad_norm": 1.5930594205856323, "learning_rate": 6.971153846153847e-07, "loss": 0.6972, "step": 58 }, { "epoch": 0.049663299663299666, "grad_norm": 1.4549140930175781, "learning_rate": 7.091346153846154e-07, "loss": 0.6557, "step": 59 }, { "epoch": 0.050505050505050504, "grad_norm": 1.3543978929519653, "learning_rate": 7.211538461538461e-07, "loss": 0.698, "step": 60 }, { "epoch": 0.05134680134680135, "grad_norm": 1.439573049545288, "learning_rate": 7.33173076923077e-07, "loss": 0.6931, "step": 61 }, { "epoch": 0.05218855218855219, "grad_norm": 1.1542819738388062, "learning_rate": 7.451923076923078e-07, "loss": 0.6794, "step": 62 }, { "epoch": 0.05303030303030303, "grad_norm": 1.072357177734375, "learning_rate": 7.572115384615385e-07, "loss": 0.6877, "step": 63 }, { "epoch": 0.05387205387205387, "grad_norm": 1.0495240688323975, "learning_rate": 7.692307692307694e-07, "loss": 0.6597, "step": 64 }, { "epoch": 0.054713804713804715, "grad_norm": 0.953713059425354, "learning_rate": 7.8125e-07, "loss": 0.6647, "step": 65 }, { "epoch": 0.05555555555555555, "grad_norm": 1.0280705690383911, "learning_rate": 7.932692307692308e-07, "loss": 0.6761, "step": 66 }, { "epoch": 0.0563973063973064, "grad_norm": 0.9897440075874329, "learning_rate": 8.052884615384616e-07, "loss": 0.6578, "step": 67 }, { "epoch": 0.05723905723905724, "grad_norm": 0.9641563892364502, "learning_rate": 8.173076923076924e-07, "loss": 0.6462, "step": 68 }, { "epoch": 0.05808080808080808, "grad_norm": 0.8539642095565796, "learning_rate": 8.293269230769231e-07, "loss": 0.6661, "step": 69 }, { "epoch": 0.058922558922558925, "grad_norm": 0.8310503363609314, "learning_rate": 8.41346153846154e-07, "loss": 0.6414, "step": 70 }, { "epoch": 0.05976430976430976, "grad_norm": 0.819660484790802, "learning_rate": 8.533653846153847e-07, "loss": 0.6645, "step": 71 }, { "epoch": 0.06060606060606061, "grad_norm": 0.7659975290298462, "learning_rate": 8.653846153846154e-07, "loss": 0.6548, "step": 72 }, { "epoch": 0.061447811447811446, "grad_norm": 0.7457745671272278, "learning_rate": 8.774038461538462e-07, "loss": 0.6439, "step": 73 }, { "epoch": 0.06228956228956229, "grad_norm": 0.7984325885772705, "learning_rate": 8.89423076923077e-07, "loss": 0.6486, "step": 74 }, { "epoch": 0.06313131313131314, "grad_norm": 0.8394597172737122, "learning_rate": 9.014423076923078e-07, "loss": 0.6305, "step": 75 }, { "epoch": 0.06397306397306397, "grad_norm": 0.8208425045013428, "learning_rate": 9.134615384615385e-07, "loss": 0.5969, "step": 76 }, { "epoch": 0.06481481481481481, "grad_norm": 0.7468993663787842, "learning_rate": 9.254807692307693e-07, "loss": 0.6203, "step": 77 }, { "epoch": 0.06565656565656566, "grad_norm": 0.7684189081192017, "learning_rate": 9.375000000000001e-07, "loss": 0.6691, "step": 78 }, { "epoch": 0.0664983164983165, "grad_norm": 0.7114389538764954, "learning_rate": 9.495192307692308e-07, "loss": 0.6081, "step": 79 }, { "epoch": 0.06734006734006734, "grad_norm": 0.6317752003669739, "learning_rate": 9.615384615384617e-07, "loss": 0.6323, "step": 80 }, { "epoch": 0.06818181818181818, "grad_norm": 0.6621362566947937, "learning_rate": 9.735576923076925e-07, "loss": 0.6214, "step": 81 }, { "epoch": 0.06902356902356903, "grad_norm": 0.618841290473938, "learning_rate": 9.85576923076923e-07, "loss": 0.6341, "step": 82 }, { "epoch": 0.06986531986531987, "grad_norm": 0.5640202760696411, "learning_rate": 9.97596153846154e-07, "loss": 0.5953, "step": 83 }, { "epoch": 0.0707070707070707, "grad_norm": 0.6078553795814514, "learning_rate": 1.0096153846153846e-06, "loss": 0.6097, "step": 84 }, { "epoch": 0.07154882154882154, "grad_norm": 0.6540758013725281, "learning_rate": 1.0216346153846154e-06, "loss": 0.5997, "step": 85 }, { "epoch": 0.0723905723905724, "grad_norm": 0.644311249256134, "learning_rate": 1.0336538461538462e-06, "loss": 0.6374, "step": 86 }, { "epoch": 0.07323232323232323, "grad_norm": 0.6036583185195923, "learning_rate": 1.045673076923077e-06, "loss": 0.6162, "step": 87 }, { "epoch": 0.07407407407407407, "grad_norm": 0.5838223695755005, "learning_rate": 1.0576923076923078e-06, "loss": 0.6411, "step": 88 }, { "epoch": 0.07491582491582492, "grad_norm": 0.5993977785110474, "learning_rate": 1.0697115384615385e-06, "loss": 0.6234, "step": 89 }, { "epoch": 0.07575757575757576, "grad_norm": 0.5190287828445435, "learning_rate": 1.0817307692307693e-06, "loss": 0.6364, "step": 90 }, { "epoch": 0.0765993265993266, "grad_norm": 0.5231310129165649, "learning_rate": 1.0937500000000001e-06, "loss": 0.5998, "step": 91 }, { "epoch": 0.07744107744107744, "grad_norm": 0.5149753093719482, "learning_rate": 1.105769230769231e-06, "loss": 0.6029, "step": 92 }, { "epoch": 0.07828282828282829, "grad_norm": 0.5686424970626831, "learning_rate": 1.1177884615384617e-06, "loss": 0.6094, "step": 93 }, { "epoch": 0.07912457912457913, "grad_norm": 0.5134486556053162, "learning_rate": 1.1298076923076925e-06, "loss": 0.6196, "step": 94 }, { "epoch": 0.07996632996632996, "grad_norm": 0.5452644228935242, "learning_rate": 1.141826923076923e-06, "loss": 0.6012, "step": 95 }, { "epoch": 0.08080808080808081, "grad_norm": 0.4708240032196045, "learning_rate": 1.153846153846154e-06, "loss": 0.6057, "step": 96 }, { "epoch": 0.08164983164983165, "grad_norm": 0.5452204346656799, "learning_rate": 1.1658653846153848e-06, "loss": 0.6216, "step": 97 }, { "epoch": 0.08249158249158249, "grad_norm": 0.4951326549053192, "learning_rate": 1.1778846153846154e-06, "loss": 0.5721, "step": 98 }, { "epoch": 0.08333333333333333, "grad_norm": 0.4869389235973358, "learning_rate": 1.1899038461538462e-06, "loss": 0.5799, "step": 99 }, { "epoch": 0.08417508417508418, "grad_norm": 0.5061637163162231, "learning_rate": 1.201923076923077e-06, "loss": 0.6188, "step": 100 }, { "epoch": 0.08501683501683502, "grad_norm": 0.48143699765205383, "learning_rate": 1.2139423076923077e-06, "loss": 0.6042, "step": 101 }, { "epoch": 0.08585858585858586, "grad_norm": 0.4757838249206543, "learning_rate": 1.2259615384615385e-06, "loss": 0.6169, "step": 102 }, { "epoch": 0.0867003367003367, "grad_norm": 0.47205430269241333, "learning_rate": 1.2379807692307693e-06, "loss": 0.5825, "step": 103 }, { "epoch": 0.08754208754208755, "grad_norm": 0.4781132638454437, "learning_rate": 1.25e-06, "loss": 0.61, "step": 104 }, { "epoch": 0.08838383838383838, "grad_norm": 0.5117834806442261, "learning_rate": 1.2620192307692309e-06, "loss": 0.6066, "step": 105 }, { "epoch": 0.08922558922558922, "grad_norm": 0.4837837219238281, "learning_rate": 1.2740384615384615e-06, "loss": 0.5764, "step": 106 }, { "epoch": 0.09006734006734007, "grad_norm": 0.5003170371055603, "learning_rate": 1.2860576923076922e-06, "loss": 0.5982, "step": 107 }, { "epoch": 0.09090909090909091, "grad_norm": 0.47450533509254456, "learning_rate": 1.2980769230769232e-06, "loss": 0.586, "step": 108 }, { "epoch": 0.09175084175084175, "grad_norm": 0.45480048656463623, "learning_rate": 1.310096153846154e-06, "loss": 0.5901, "step": 109 }, { "epoch": 0.09259259259259259, "grad_norm": 0.5022609829902649, "learning_rate": 1.3221153846153848e-06, "loss": 0.5858, "step": 110 }, { "epoch": 0.09343434343434344, "grad_norm": 0.48186928033828735, "learning_rate": 1.3341346153846154e-06, "loss": 0.6024, "step": 111 }, { "epoch": 0.09427609427609428, "grad_norm": 0.46916529536247253, "learning_rate": 1.3461538461538462e-06, "loss": 0.5617, "step": 112 }, { "epoch": 0.09511784511784512, "grad_norm": 0.47236162424087524, "learning_rate": 1.358173076923077e-06, "loss": 0.5789, "step": 113 }, { "epoch": 0.09595959595959595, "grad_norm": 0.4461255967617035, "learning_rate": 1.370192307692308e-06, "loss": 0.5793, "step": 114 }, { "epoch": 0.0968013468013468, "grad_norm": 0.4767073392868042, "learning_rate": 1.3822115384615387e-06, "loss": 0.5704, "step": 115 }, { "epoch": 0.09764309764309764, "grad_norm": 0.49626994132995605, "learning_rate": 1.3942307692307693e-06, "loss": 0.5731, "step": 116 }, { "epoch": 0.09848484848484848, "grad_norm": 0.47274696826934814, "learning_rate": 1.40625e-06, "loss": 0.5836, "step": 117 }, { "epoch": 0.09932659932659933, "grad_norm": 0.48883482813835144, "learning_rate": 1.4182692307692309e-06, "loss": 0.5791, "step": 118 }, { "epoch": 0.10016835016835017, "grad_norm": 0.4566262364387512, "learning_rate": 1.4302884615384617e-06, "loss": 0.5762, "step": 119 }, { "epoch": 0.10101010101010101, "grad_norm": 0.47291192412376404, "learning_rate": 1.4423076923076922e-06, "loss": 0.5772, "step": 120 }, { "epoch": 0.10185185185185185, "grad_norm": 0.48082220554351807, "learning_rate": 1.4543269230769232e-06, "loss": 0.5989, "step": 121 }, { "epoch": 0.1026936026936027, "grad_norm": 0.4941522181034088, "learning_rate": 1.466346153846154e-06, "loss": 0.5621, "step": 122 }, { "epoch": 0.10353535353535354, "grad_norm": 0.4609183073043823, "learning_rate": 1.4783653846153848e-06, "loss": 0.5519, "step": 123 }, { "epoch": 0.10437710437710437, "grad_norm": 0.4777924716472626, "learning_rate": 1.4903846153846156e-06, "loss": 0.5506, "step": 124 }, { "epoch": 0.10521885521885523, "grad_norm": 0.42458581924438477, "learning_rate": 1.5024038461538462e-06, "loss": 0.5454, "step": 125 }, { "epoch": 0.10606060606060606, "grad_norm": 0.48060354590415955, "learning_rate": 1.514423076923077e-06, "loss": 0.6119, "step": 126 }, { "epoch": 0.1069023569023569, "grad_norm": 0.46815744042396545, "learning_rate": 1.526442307692308e-06, "loss": 0.5747, "step": 127 }, { "epoch": 0.10774410774410774, "grad_norm": 0.43621352314949036, "learning_rate": 1.5384615384615387e-06, "loss": 0.5478, "step": 128 }, { "epoch": 0.10858585858585859, "grad_norm": 0.5171759724617004, "learning_rate": 1.5504807692307695e-06, "loss": 0.5731, "step": 129 }, { "epoch": 0.10942760942760943, "grad_norm": 0.4371987581253052, "learning_rate": 1.5625e-06, "loss": 0.5807, "step": 130 }, { "epoch": 0.11026936026936027, "grad_norm": 0.48663952946662903, "learning_rate": 1.5745192307692309e-06, "loss": 0.5659, "step": 131 }, { "epoch": 0.1111111111111111, "grad_norm": 0.48768216371536255, "learning_rate": 1.5865384615384616e-06, "loss": 0.5674, "step": 132 }, { "epoch": 0.11195286195286196, "grad_norm": 0.45246532559394836, "learning_rate": 1.5985576923076922e-06, "loss": 0.5669, "step": 133 }, { "epoch": 0.1127946127946128, "grad_norm": 0.4395226240158081, "learning_rate": 1.6105769230769232e-06, "loss": 0.568, "step": 134 }, { "epoch": 0.11363636363636363, "grad_norm": 0.4480568766593933, "learning_rate": 1.622596153846154e-06, "loss": 0.5485, "step": 135 }, { "epoch": 0.11447811447811448, "grad_norm": 0.484637588262558, "learning_rate": 1.6346153846153848e-06, "loss": 0.5819, "step": 136 }, { "epoch": 0.11531986531986532, "grad_norm": 0.42918211221694946, "learning_rate": 1.6466346153846156e-06, "loss": 0.5279, "step": 137 }, { "epoch": 0.11616161616161616, "grad_norm": 0.4442211091518402, "learning_rate": 1.6586538461538461e-06, "loss": 0.568, "step": 138 }, { "epoch": 0.117003367003367, "grad_norm": 0.4365970194339752, "learning_rate": 1.670673076923077e-06, "loss": 0.5361, "step": 139 }, { "epoch": 0.11784511784511785, "grad_norm": 0.42874258756637573, "learning_rate": 1.682692307692308e-06, "loss": 0.5465, "step": 140 }, { "epoch": 0.11868686868686869, "grad_norm": 0.4427005648612976, "learning_rate": 1.6947115384615387e-06, "loss": 0.5681, "step": 141 }, { "epoch": 0.11952861952861953, "grad_norm": 0.4524102509021759, "learning_rate": 1.7067307692307695e-06, "loss": 0.5508, "step": 142 }, { "epoch": 0.12037037037037036, "grad_norm": 0.4298781454563141, "learning_rate": 1.71875e-06, "loss": 0.5407, "step": 143 }, { "epoch": 0.12121212121212122, "grad_norm": 0.5009552240371704, "learning_rate": 1.7307692307692308e-06, "loss": 0.5799, "step": 144 }, { "epoch": 0.12205387205387205, "grad_norm": 0.4657253324985504, "learning_rate": 1.7427884615384616e-06, "loss": 0.5748, "step": 145 }, { "epoch": 0.12289562289562289, "grad_norm": 0.4401909410953522, "learning_rate": 1.7548076923076924e-06, "loss": 0.5654, "step": 146 }, { "epoch": 0.12373737373737374, "grad_norm": 0.516408383846283, "learning_rate": 1.7668269230769234e-06, "loss": 0.5742, "step": 147 }, { "epoch": 0.12457912457912458, "grad_norm": 0.5233368277549744, "learning_rate": 1.778846153846154e-06, "loss": 0.5742, "step": 148 }, { "epoch": 0.12542087542087543, "grad_norm": 0.44173648953437805, "learning_rate": 1.7908653846153848e-06, "loss": 0.5561, "step": 149 }, { "epoch": 0.12626262626262627, "grad_norm": 0.4439685642719269, "learning_rate": 1.8028846153846156e-06, "loss": 0.5644, "step": 150 }, { "epoch": 0.1271043771043771, "grad_norm": 0.4500431716442108, "learning_rate": 1.8149038461538463e-06, "loss": 0.5191, "step": 151 }, { "epoch": 0.12794612794612795, "grad_norm": 0.47084730863571167, "learning_rate": 1.826923076923077e-06, "loss": 0.5395, "step": 152 }, { "epoch": 0.12878787878787878, "grad_norm": 0.4511297345161438, "learning_rate": 1.838942307692308e-06, "loss": 0.5527, "step": 153 }, { "epoch": 0.12962962962962962, "grad_norm": 0.47973600029945374, "learning_rate": 1.8509615384615387e-06, "loss": 0.549, "step": 154 }, { "epoch": 0.13047138047138046, "grad_norm": 0.4699326157569885, "learning_rate": 1.8629807692307695e-06, "loss": 0.5742, "step": 155 }, { "epoch": 0.13131313131313133, "grad_norm": 0.45684900879859924, "learning_rate": 1.8750000000000003e-06, "loss": 0.5535, "step": 156 }, { "epoch": 0.13215488215488216, "grad_norm": 0.4578453302383423, "learning_rate": 1.8870192307692308e-06, "loss": 0.5759, "step": 157 }, { "epoch": 0.132996632996633, "grad_norm": 0.44658374786376953, "learning_rate": 1.8990384615384616e-06, "loss": 0.5451, "step": 158 }, { "epoch": 0.13383838383838384, "grad_norm": 0.4715912938117981, "learning_rate": 1.9110576923076924e-06, "loss": 0.5625, "step": 159 }, { "epoch": 0.13468013468013468, "grad_norm": 0.5362568497657776, "learning_rate": 1.9230769230769234e-06, "loss": 0.5574, "step": 160 }, { "epoch": 0.13552188552188552, "grad_norm": 0.4901236891746521, "learning_rate": 1.935096153846154e-06, "loss": 0.5722, "step": 161 }, { "epoch": 0.13636363636363635, "grad_norm": 0.5289010405540466, "learning_rate": 1.947115384615385e-06, "loss": 0.5448, "step": 162 }, { "epoch": 0.13720538720538722, "grad_norm": 0.43858277797698975, "learning_rate": 1.9591346153846155e-06, "loss": 0.5374, "step": 163 }, { "epoch": 0.13804713804713806, "grad_norm": 0.47779902815818787, "learning_rate": 1.971153846153846e-06, "loss": 0.5676, "step": 164 }, { "epoch": 0.1388888888888889, "grad_norm": 0.5285927653312683, "learning_rate": 1.983173076923077e-06, "loss": 0.5368, "step": 165 }, { "epoch": 0.13973063973063973, "grad_norm": 0.44225022196769714, "learning_rate": 1.995192307692308e-06, "loss": 0.5712, "step": 166 }, { "epoch": 0.14057239057239057, "grad_norm": 0.392675518989563, "learning_rate": 2.0072115384615387e-06, "loss": 0.5436, "step": 167 }, { "epoch": 0.1414141414141414, "grad_norm": 0.5476700067520142, "learning_rate": 2.0192307692307692e-06, "loss": 0.5569, "step": 168 }, { "epoch": 0.14225589225589225, "grad_norm": 0.578117847442627, "learning_rate": 2.0312500000000002e-06, "loss": 0.5594, "step": 169 }, { "epoch": 0.14309764309764308, "grad_norm": 0.5215331315994263, "learning_rate": 2.043269230769231e-06, "loss": 0.5553, "step": 170 }, { "epoch": 0.14393939393939395, "grad_norm": 0.511169970035553, "learning_rate": 2.055288461538462e-06, "loss": 0.535, "step": 171 }, { "epoch": 0.1447811447811448, "grad_norm": 0.4973542392253876, "learning_rate": 2.0673076923076924e-06, "loss": 0.5624, "step": 172 }, { "epoch": 0.14562289562289563, "grad_norm": 0.4760303795337677, "learning_rate": 2.0793269230769234e-06, "loss": 0.5527, "step": 173 }, { "epoch": 0.14646464646464646, "grad_norm": 0.4163076877593994, "learning_rate": 2.091346153846154e-06, "loss": 0.5102, "step": 174 }, { "epoch": 0.1473063973063973, "grad_norm": 0.521267294883728, "learning_rate": 2.103365384615385e-06, "loss": 0.5687, "step": 175 }, { "epoch": 0.14814814814814814, "grad_norm": 0.4891953468322754, "learning_rate": 2.1153846153846155e-06, "loss": 0.5368, "step": 176 }, { "epoch": 0.14898989898989898, "grad_norm": 0.44443240761756897, "learning_rate": 2.127403846153846e-06, "loss": 0.5328, "step": 177 }, { "epoch": 0.14983164983164984, "grad_norm": 0.5029366612434387, "learning_rate": 2.139423076923077e-06, "loss": 0.5867, "step": 178 }, { "epoch": 0.15067340067340068, "grad_norm": 0.4226229786872864, "learning_rate": 2.151442307692308e-06, "loss": 0.5277, "step": 179 }, { "epoch": 0.15151515151515152, "grad_norm": 0.47919973731040955, "learning_rate": 2.1634615384615387e-06, "loss": 0.5521, "step": 180 }, { "epoch": 0.15235690235690236, "grad_norm": 0.46681806445121765, "learning_rate": 2.1754807692307697e-06, "loss": 0.5112, "step": 181 }, { "epoch": 0.1531986531986532, "grad_norm": 0.42873701453208923, "learning_rate": 2.1875000000000002e-06, "loss": 0.5486, "step": 182 }, { "epoch": 0.15404040404040403, "grad_norm": 0.4761063754558563, "learning_rate": 2.199519230769231e-06, "loss": 0.5523, "step": 183 }, { "epoch": 0.15488215488215487, "grad_norm": 0.5284635424613953, "learning_rate": 2.211538461538462e-06, "loss": 0.5736, "step": 184 }, { "epoch": 0.15572390572390574, "grad_norm": 0.4726139307022095, "learning_rate": 2.2235576923076924e-06, "loss": 0.5612, "step": 185 }, { "epoch": 0.15656565656565657, "grad_norm": 0.4360535740852356, "learning_rate": 2.2355769230769234e-06, "loss": 0.537, "step": 186 }, { "epoch": 0.1574074074074074, "grad_norm": 0.425367146730423, "learning_rate": 2.247596153846154e-06, "loss": 0.5091, "step": 187 }, { "epoch": 0.15824915824915825, "grad_norm": 0.5115880370140076, "learning_rate": 2.259615384615385e-06, "loss": 0.5274, "step": 188 }, { "epoch": 0.1590909090909091, "grad_norm": 0.5075984001159668, "learning_rate": 2.2716346153846155e-06, "loss": 0.5788, "step": 189 }, { "epoch": 0.15993265993265993, "grad_norm": 0.44567593932151794, "learning_rate": 2.283653846153846e-06, "loss": 0.5349, "step": 190 }, { "epoch": 0.16077441077441076, "grad_norm": 0.45935487747192383, "learning_rate": 2.295673076923077e-06, "loss": 0.5073, "step": 191 }, { "epoch": 0.16161616161616163, "grad_norm": 0.49045607447624207, "learning_rate": 2.307692307692308e-06, "loss": 0.5305, "step": 192 }, { "epoch": 0.16245791245791247, "grad_norm": 0.4650980234146118, "learning_rate": 2.3197115384615386e-06, "loss": 0.5251, "step": 193 }, { "epoch": 0.1632996632996633, "grad_norm": 0.47086551785469055, "learning_rate": 2.3317307692307696e-06, "loss": 0.5134, "step": 194 }, { "epoch": 0.16414141414141414, "grad_norm": 0.4601142704486847, "learning_rate": 2.3437500000000002e-06, "loss": 0.5358, "step": 195 }, { "epoch": 0.16498316498316498, "grad_norm": 0.43569356203079224, "learning_rate": 2.355769230769231e-06, "loss": 0.5161, "step": 196 }, { "epoch": 0.16582491582491582, "grad_norm": 0.4365907609462738, "learning_rate": 2.3677884615384618e-06, "loss": 0.5542, "step": 197 }, { "epoch": 0.16666666666666666, "grad_norm": 0.43038901686668396, "learning_rate": 2.3798076923076924e-06, "loss": 0.5495, "step": 198 }, { "epoch": 0.1675084175084175, "grad_norm": 0.4624800682067871, "learning_rate": 2.3918269230769234e-06, "loss": 0.5369, "step": 199 }, { "epoch": 0.16835016835016836, "grad_norm": 0.5059806108474731, "learning_rate": 2.403846153846154e-06, "loss": 0.549, "step": 200 }, { "epoch": 0.1691919191919192, "grad_norm": 0.4301636517047882, "learning_rate": 2.415865384615385e-06, "loss": 0.5368, "step": 201 }, { "epoch": 0.17003367003367004, "grad_norm": 0.4625979959964752, "learning_rate": 2.4278846153846155e-06, "loss": 0.5112, "step": 202 }, { "epoch": 0.17087542087542087, "grad_norm": 0.4419342577457428, "learning_rate": 2.4399038461538465e-06, "loss": 0.5343, "step": 203 }, { "epoch": 0.1717171717171717, "grad_norm": 0.44128337502479553, "learning_rate": 2.451923076923077e-06, "loss": 0.5617, "step": 204 }, { "epoch": 0.17255892255892255, "grad_norm": 0.4790131747722626, "learning_rate": 2.463942307692308e-06, "loss": 0.5251, "step": 205 }, { "epoch": 0.1734006734006734, "grad_norm": 0.4206755459308624, "learning_rate": 2.4759615384615386e-06, "loss": 0.5095, "step": 206 }, { "epoch": 0.17424242424242425, "grad_norm": 0.447403222322464, "learning_rate": 2.4879807692307696e-06, "loss": 0.5416, "step": 207 }, { "epoch": 0.1750841750841751, "grad_norm": 0.49249467253685, "learning_rate": 2.5e-06, "loss": 0.5511, "step": 208 }, { "epoch": 0.17592592592592593, "grad_norm": 0.49887552857398987, "learning_rate": 2.5120192307692308e-06, "loss": 0.5342, "step": 209 }, { "epoch": 0.17676767676767677, "grad_norm": 0.46410587430000305, "learning_rate": 2.5240384615384618e-06, "loss": 0.5273, "step": 210 }, { "epoch": 0.1776094276094276, "grad_norm": 0.45443838834762573, "learning_rate": 2.5360576923076923e-06, "loss": 0.528, "step": 211 }, { "epoch": 0.17845117845117844, "grad_norm": 0.45844778418540955, "learning_rate": 2.548076923076923e-06, "loss": 0.5255, "step": 212 }, { "epoch": 0.17929292929292928, "grad_norm": 0.4978421926498413, "learning_rate": 2.560096153846154e-06, "loss": 0.5367, "step": 213 }, { "epoch": 0.18013468013468015, "grad_norm": 0.4712221324443817, "learning_rate": 2.5721153846153845e-06, "loss": 0.5464, "step": 214 }, { "epoch": 0.18097643097643099, "grad_norm": 0.44030988216400146, "learning_rate": 2.584134615384616e-06, "loss": 0.5457, "step": 215 }, { "epoch": 0.18181818181818182, "grad_norm": 0.4463440775871277, "learning_rate": 2.5961538461538465e-06, "loss": 0.5406, "step": 216 }, { "epoch": 0.18265993265993266, "grad_norm": 0.44238996505737305, "learning_rate": 2.6081730769230775e-06, "loss": 0.5202, "step": 217 }, { "epoch": 0.1835016835016835, "grad_norm": 0.43906331062316895, "learning_rate": 2.620192307692308e-06, "loss": 0.5237, "step": 218 }, { "epoch": 0.18434343434343434, "grad_norm": 0.4551813006401062, "learning_rate": 2.6322115384615386e-06, "loss": 0.5474, "step": 219 }, { "epoch": 0.18518518518518517, "grad_norm": 0.45530328154563904, "learning_rate": 2.6442307692307696e-06, "loss": 0.5275, "step": 220 }, { "epoch": 0.18602693602693604, "grad_norm": 0.4788454473018646, "learning_rate": 2.65625e-06, "loss": 0.5316, "step": 221 }, { "epoch": 0.18686868686868688, "grad_norm": 0.44755950570106506, "learning_rate": 2.6682692307692308e-06, "loss": 0.5226, "step": 222 }, { "epoch": 0.18771043771043772, "grad_norm": 0.4112953543663025, "learning_rate": 2.6802884615384618e-06, "loss": 0.5279, "step": 223 }, { "epoch": 0.18855218855218855, "grad_norm": 0.44515717029571533, "learning_rate": 2.6923076923076923e-06, "loss": 0.5087, "step": 224 }, { "epoch": 0.1893939393939394, "grad_norm": 0.40898388624191284, "learning_rate": 2.7043269230769233e-06, "loss": 0.4983, "step": 225 }, { "epoch": 0.19023569023569023, "grad_norm": 0.49459806084632874, "learning_rate": 2.716346153846154e-06, "loss": 0.535, "step": 226 }, { "epoch": 0.19107744107744107, "grad_norm": 0.5039848685264587, "learning_rate": 2.7283653846153845e-06, "loss": 0.5362, "step": 227 }, { "epoch": 0.1919191919191919, "grad_norm": 0.47080209851264954, "learning_rate": 2.740384615384616e-06, "loss": 0.5179, "step": 228 }, { "epoch": 0.19276094276094277, "grad_norm": 0.5411335825920105, "learning_rate": 2.7524038461538465e-06, "loss": 0.5119, "step": 229 }, { "epoch": 0.1936026936026936, "grad_norm": 0.48013415932655334, "learning_rate": 2.7644230769230775e-06, "loss": 0.5361, "step": 230 }, { "epoch": 0.19444444444444445, "grad_norm": 0.46559032797813416, "learning_rate": 2.776442307692308e-06, "loss": 0.5369, "step": 231 }, { "epoch": 0.19528619528619529, "grad_norm": 0.4692891538143158, "learning_rate": 2.7884615384615386e-06, "loss": 0.5073, "step": 232 }, { "epoch": 0.19612794612794612, "grad_norm": 0.42630043625831604, "learning_rate": 2.8004807692307696e-06, "loss": 0.5446, "step": 233 }, { "epoch": 0.19696969696969696, "grad_norm": 0.4697445034980774, "learning_rate": 2.8125e-06, "loss": 0.5269, "step": 234 }, { "epoch": 0.1978114478114478, "grad_norm": 0.4293937385082245, "learning_rate": 2.8245192307692307e-06, "loss": 0.5106, "step": 235 }, { "epoch": 0.19865319865319866, "grad_norm": 0.46651557087898254, "learning_rate": 2.8365384615384617e-06, "loss": 0.5163, "step": 236 }, { "epoch": 0.1994949494949495, "grad_norm": 0.4193843901157379, "learning_rate": 2.8485576923076923e-06, "loss": 0.5246, "step": 237 }, { "epoch": 0.20033670033670034, "grad_norm": 0.4402278959751129, "learning_rate": 2.8605769230769233e-06, "loss": 0.5057, "step": 238 }, { "epoch": 0.20117845117845118, "grad_norm": 0.44189050793647766, "learning_rate": 2.872596153846154e-06, "loss": 0.5066, "step": 239 }, { "epoch": 0.20202020202020202, "grad_norm": 0.4825696051120758, "learning_rate": 2.8846153846153845e-06, "loss": 0.5251, "step": 240 }, { "epoch": 0.20286195286195285, "grad_norm": 0.47549331188201904, "learning_rate": 2.896634615384616e-06, "loss": 0.5174, "step": 241 }, { "epoch": 0.2037037037037037, "grad_norm": 0.4752037227153778, "learning_rate": 2.9086538461538465e-06, "loss": 0.5076, "step": 242 }, { "epoch": 0.20454545454545456, "grad_norm": 0.5012762546539307, "learning_rate": 2.9206730769230774e-06, "loss": 0.5729, "step": 243 }, { "epoch": 0.2053872053872054, "grad_norm": 0.5027062296867371, "learning_rate": 2.932692307692308e-06, "loss": 0.5234, "step": 244 }, { "epoch": 0.20622895622895623, "grad_norm": 0.46087342500686646, "learning_rate": 2.9447115384615386e-06, "loss": 0.531, "step": 245 }, { "epoch": 0.20707070707070707, "grad_norm": 0.475440114736557, "learning_rate": 2.9567307692307696e-06, "loss": 0.5179, "step": 246 }, { "epoch": 0.2079124579124579, "grad_norm": 0.46871599555015564, "learning_rate": 2.96875e-06, "loss": 0.5335, "step": 247 }, { "epoch": 0.20875420875420875, "grad_norm": 0.44016602635383606, "learning_rate": 2.980769230769231e-06, "loss": 0.5156, "step": 248 }, { "epoch": 0.20959595959595959, "grad_norm": 0.46655556559562683, "learning_rate": 2.9927884615384617e-06, "loss": 0.5173, "step": 249 }, { "epoch": 0.21043771043771045, "grad_norm": 0.46670207381248474, "learning_rate": 3.0048076923076923e-06, "loss": 0.4955, "step": 250 }, { "epoch": 0.2112794612794613, "grad_norm": 0.5706505179405212, "learning_rate": 3.0168269230769233e-06, "loss": 0.526, "step": 251 }, { "epoch": 0.21212121212121213, "grad_norm": 0.40070322155952454, "learning_rate": 3.028846153846154e-06, "loss": 0.514, "step": 252 }, { "epoch": 0.21296296296296297, "grad_norm": 0.49427083134651184, "learning_rate": 3.0408653846153844e-06, "loss": 0.5107, "step": 253 }, { "epoch": 0.2138047138047138, "grad_norm": 0.4875272214412689, "learning_rate": 3.052884615384616e-06, "loss": 0.542, "step": 254 }, { "epoch": 0.21464646464646464, "grad_norm": 0.4481489658355713, "learning_rate": 3.0649038461538464e-06, "loss": 0.5105, "step": 255 }, { "epoch": 0.21548821548821548, "grad_norm": 0.4969692826271057, "learning_rate": 3.0769230769230774e-06, "loss": 0.5211, "step": 256 }, { "epoch": 0.21632996632996632, "grad_norm": 0.4786125421524048, "learning_rate": 3.088942307692308e-06, "loss": 0.5149, "step": 257 }, { "epoch": 0.21717171717171718, "grad_norm": 0.4892539381980896, "learning_rate": 3.100961538461539e-06, "loss": 0.5105, "step": 258 }, { "epoch": 0.21801346801346802, "grad_norm": 0.47996985912323, "learning_rate": 3.1129807692307696e-06, "loss": 0.5227, "step": 259 }, { "epoch": 0.21885521885521886, "grad_norm": 0.5038319230079651, "learning_rate": 3.125e-06, "loss": 0.5075, "step": 260 }, { "epoch": 0.2196969696969697, "grad_norm": 0.49568822979927063, "learning_rate": 3.137019230769231e-06, "loss": 0.5027, "step": 261 }, { "epoch": 0.22053872053872053, "grad_norm": 0.5506752133369446, "learning_rate": 3.1490384615384617e-06, "loss": 0.5118, "step": 262 }, { "epoch": 0.22138047138047137, "grad_norm": 0.4901628792285919, "learning_rate": 3.1610576923076923e-06, "loss": 0.5594, "step": 263 }, { "epoch": 0.2222222222222222, "grad_norm": 0.48965638875961304, "learning_rate": 3.1730769230769233e-06, "loss": 0.5259, "step": 264 }, { "epoch": 0.22306397306397308, "grad_norm": 0.4557913541793823, "learning_rate": 3.185096153846154e-06, "loss": 0.5299, "step": 265 }, { "epoch": 0.2239057239057239, "grad_norm": 0.5544600486755371, "learning_rate": 3.1971153846153844e-06, "loss": 0.5264, "step": 266 }, { "epoch": 0.22474747474747475, "grad_norm": 0.482130229473114, "learning_rate": 3.209134615384616e-06, "loss": 0.5248, "step": 267 }, { "epoch": 0.2255892255892256, "grad_norm": 0.5098735094070435, "learning_rate": 3.2211538461538464e-06, "loss": 0.5023, "step": 268 }, { "epoch": 0.22643097643097643, "grad_norm": 0.4815903902053833, "learning_rate": 3.2331730769230774e-06, "loss": 0.4934, "step": 269 }, { "epoch": 0.22727272727272727, "grad_norm": 0.45405811071395874, "learning_rate": 3.245192307692308e-06, "loss": 0.5155, "step": 270 }, { "epoch": 0.2281144781144781, "grad_norm": 0.46062448620796204, "learning_rate": 3.257211538461539e-06, "loss": 0.5267, "step": 271 }, { "epoch": 0.22895622895622897, "grad_norm": 0.5014218688011169, "learning_rate": 3.2692307692307696e-06, "loss": 0.5103, "step": 272 }, { "epoch": 0.2297979797979798, "grad_norm": 0.5248931050300598, "learning_rate": 3.28125e-06, "loss": 0.5253, "step": 273 }, { "epoch": 0.23063973063973064, "grad_norm": 0.42319878935813904, "learning_rate": 3.293269230769231e-06, "loss": 0.4738, "step": 274 }, { "epoch": 0.23148148148148148, "grad_norm": 0.4478130340576172, "learning_rate": 3.3052884615384617e-06, "loss": 0.5103, "step": 275 }, { "epoch": 0.23232323232323232, "grad_norm": 0.43648120760917664, "learning_rate": 3.3173076923076923e-06, "loss": 0.5098, "step": 276 }, { "epoch": 0.23316498316498316, "grad_norm": 0.4900733530521393, "learning_rate": 3.3293269230769233e-06, "loss": 0.5266, "step": 277 }, { "epoch": 0.234006734006734, "grad_norm": 0.5633643269538879, "learning_rate": 3.341346153846154e-06, "loss": 0.5596, "step": 278 }, { "epoch": 0.23484848484848486, "grad_norm": 0.45052123069763184, "learning_rate": 3.353365384615385e-06, "loss": 0.5101, "step": 279 }, { "epoch": 0.2356902356902357, "grad_norm": 0.4551660120487213, "learning_rate": 3.365384615384616e-06, "loss": 0.4896, "step": 280 }, { "epoch": 0.23653198653198654, "grad_norm": 0.45628777146339417, "learning_rate": 3.377403846153847e-06, "loss": 0.5432, "step": 281 }, { "epoch": 0.23737373737373738, "grad_norm": 0.48830825090408325, "learning_rate": 3.3894230769230774e-06, "loss": 0.5077, "step": 282 }, { "epoch": 0.2382154882154882, "grad_norm": 0.4658557176589966, "learning_rate": 3.401442307692308e-06, "loss": 0.5055, "step": 283 }, { "epoch": 0.23905723905723905, "grad_norm": 0.454679399728775, "learning_rate": 3.413461538461539e-06, "loss": 0.5008, "step": 284 }, { "epoch": 0.2398989898989899, "grad_norm": 0.5154184103012085, "learning_rate": 3.4254807692307695e-06, "loss": 0.5436, "step": 285 }, { "epoch": 0.24074074074074073, "grad_norm": 0.4425804018974304, "learning_rate": 3.4375e-06, "loss": 0.4988, "step": 286 }, { "epoch": 0.2415824915824916, "grad_norm": 0.4620234966278076, "learning_rate": 3.449519230769231e-06, "loss": 0.522, "step": 287 }, { "epoch": 0.24242424242424243, "grad_norm": 0.5240399837493896, "learning_rate": 3.4615384615384617e-06, "loss": 0.5217, "step": 288 }, { "epoch": 0.24326599326599327, "grad_norm": 0.45701679587364197, "learning_rate": 3.4735576923076923e-06, "loss": 0.4795, "step": 289 }, { "epoch": 0.2441077441077441, "grad_norm": 0.39514803886413574, "learning_rate": 3.4855769230769233e-06, "loss": 0.4913, "step": 290 }, { "epoch": 0.24494949494949494, "grad_norm": 0.48972904682159424, "learning_rate": 3.497596153846154e-06, "loss": 0.5185, "step": 291 }, { "epoch": 0.24579124579124578, "grad_norm": 0.4827323257923126, "learning_rate": 3.509615384615385e-06, "loss": 0.4803, "step": 292 }, { "epoch": 0.24663299663299662, "grad_norm": 0.48440709710121155, "learning_rate": 3.521634615384616e-06, "loss": 0.5262, "step": 293 }, { "epoch": 0.2474747474747475, "grad_norm": 0.4719337522983551, "learning_rate": 3.533653846153847e-06, "loss": 0.5284, "step": 294 }, { "epoch": 0.24831649831649832, "grad_norm": 0.46947619318962097, "learning_rate": 3.5456730769230774e-06, "loss": 0.4921, "step": 295 }, { "epoch": 0.24915824915824916, "grad_norm": 0.4845277667045593, "learning_rate": 3.557692307692308e-06, "loss": 0.5247, "step": 296 }, { "epoch": 0.25, "grad_norm": 0.4730343222618103, "learning_rate": 3.569711538461539e-06, "loss": 0.5016, "step": 297 }, { "epoch": 0.25084175084175087, "grad_norm": 0.46266546845436096, "learning_rate": 3.5817307692307695e-06, "loss": 0.5008, "step": 298 }, { "epoch": 0.2516835016835017, "grad_norm": 0.49312064051628113, "learning_rate": 3.59375e-06, "loss": 0.5132, "step": 299 }, { "epoch": 0.25252525252525254, "grad_norm": 0.48184913396835327, "learning_rate": 3.605769230769231e-06, "loss": 0.5118, "step": 300 }, { "epoch": 0.25336700336700335, "grad_norm": 0.4500241279602051, "learning_rate": 3.6177884615384617e-06, "loss": 0.5034, "step": 301 }, { "epoch": 0.2542087542087542, "grad_norm": 0.46663862466812134, "learning_rate": 3.6298076923076927e-06, "loss": 0.4809, "step": 302 }, { "epoch": 0.255050505050505, "grad_norm": 0.5019620060920715, "learning_rate": 3.6418269230769232e-06, "loss": 0.5327, "step": 303 }, { "epoch": 0.2558922558922559, "grad_norm": 0.4911588132381439, "learning_rate": 3.653846153846154e-06, "loss": 0.5217, "step": 304 }, { "epoch": 0.25673400673400676, "grad_norm": 0.48595160245895386, "learning_rate": 3.665865384615385e-06, "loss": 0.5254, "step": 305 }, { "epoch": 0.25757575757575757, "grad_norm": 0.4880105257034302, "learning_rate": 3.677884615384616e-06, "loss": 0.4751, "step": 306 }, { "epoch": 0.25841750841750843, "grad_norm": 0.5379721522331238, "learning_rate": 3.689903846153847e-06, "loss": 0.5294, "step": 307 }, { "epoch": 0.25925925925925924, "grad_norm": 0.5002626776695251, "learning_rate": 3.7019230769230774e-06, "loss": 0.513, "step": 308 }, { "epoch": 0.2601010101010101, "grad_norm": 0.4721471667289734, "learning_rate": 3.713942307692308e-06, "loss": 0.491, "step": 309 }, { "epoch": 0.2609427609427609, "grad_norm": 0.44296175241470337, "learning_rate": 3.725961538461539e-06, "loss": 0.4981, "step": 310 }, { "epoch": 0.2617845117845118, "grad_norm": 0.48431479930877686, "learning_rate": 3.7379807692307695e-06, "loss": 0.5151, "step": 311 }, { "epoch": 0.26262626262626265, "grad_norm": 0.4783515930175781, "learning_rate": 3.7500000000000005e-06, "loss": 0.5146, "step": 312 }, { "epoch": 0.26346801346801346, "grad_norm": 0.398469477891922, "learning_rate": 3.762019230769231e-06, "loss": 0.5162, "step": 313 }, { "epoch": 0.26430976430976433, "grad_norm": 0.47943875193595886, "learning_rate": 3.7740384615384617e-06, "loss": 0.5439, "step": 314 }, { "epoch": 0.26515151515151514, "grad_norm": 0.44943636655807495, "learning_rate": 3.7860576923076927e-06, "loss": 0.4761, "step": 315 }, { "epoch": 0.265993265993266, "grad_norm": 0.4895787239074707, "learning_rate": 3.7980769230769232e-06, "loss": 0.509, "step": 316 }, { "epoch": 0.2668350168350168, "grad_norm": 0.4619561433792114, "learning_rate": 3.810096153846154e-06, "loss": 0.4883, "step": 317 }, { "epoch": 0.2676767676767677, "grad_norm": 0.4798828959465027, "learning_rate": 3.822115384615385e-06, "loss": 0.511, "step": 318 }, { "epoch": 0.26851851851851855, "grad_norm": 0.4836987257003784, "learning_rate": 3.834134615384616e-06, "loss": 0.5158, "step": 319 }, { "epoch": 0.26936026936026936, "grad_norm": 0.4563503861427307, "learning_rate": 3.846153846153847e-06, "loss": 0.4888, "step": 320 }, { "epoch": 0.2702020202020202, "grad_norm": 0.48587653040885925, "learning_rate": 3.858173076923078e-06, "loss": 0.5122, "step": 321 }, { "epoch": 0.27104377104377103, "grad_norm": 0.5056146383285522, "learning_rate": 3.870192307692308e-06, "loss": 0.513, "step": 322 }, { "epoch": 0.2718855218855219, "grad_norm": 0.51176518201828, "learning_rate": 3.882211538461539e-06, "loss": 0.4734, "step": 323 }, { "epoch": 0.2727272727272727, "grad_norm": 0.48665690422058105, "learning_rate": 3.89423076923077e-06, "loss": 0.4954, "step": 324 }, { "epoch": 0.2735690235690236, "grad_norm": 0.4394415020942688, "learning_rate": 3.90625e-06, "loss": 0.5039, "step": 325 }, { "epoch": 0.27441077441077444, "grad_norm": 0.45930662751197815, "learning_rate": 3.918269230769231e-06, "loss": 0.489, "step": 326 }, { "epoch": 0.27525252525252525, "grad_norm": 0.4550165832042694, "learning_rate": 3.930288461538462e-06, "loss": 0.4891, "step": 327 }, { "epoch": 0.2760942760942761, "grad_norm": 0.4698513150215149, "learning_rate": 3.942307692307692e-06, "loss": 0.493, "step": 328 }, { "epoch": 0.2769360269360269, "grad_norm": 0.4740423858165741, "learning_rate": 3.954326923076923e-06, "loss": 0.4946, "step": 329 }, { "epoch": 0.2777777777777778, "grad_norm": 0.4533052444458008, "learning_rate": 3.966346153846154e-06, "loss": 0.5204, "step": 330 }, { "epoch": 0.2786195286195286, "grad_norm": 0.45135074853897095, "learning_rate": 3.978365384615384e-06, "loss": 0.5148, "step": 331 }, { "epoch": 0.27946127946127947, "grad_norm": 0.5567011833190918, "learning_rate": 3.990384615384616e-06, "loss": 0.4988, "step": 332 }, { "epoch": 0.2803030303030303, "grad_norm": 0.45059993863105774, "learning_rate": 4.002403846153846e-06, "loss": 0.5307, "step": 333 }, { "epoch": 0.28114478114478114, "grad_norm": 0.5156258940696716, "learning_rate": 4.014423076923077e-06, "loss": 0.5274, "step": 334 }, { "epoch": 0.281986531986532, "grad_norm": 0.5223826169967651, "learning_rate": 4.026442307692308e-06, "loss": 0.52, "step": 335 }, { "epoch": 0.2828282828282828, "grad_norm": 0.4632948338985443, "learning_rate": 4.0384615384615385e-06, "loss": 0.5294, "step": 336 }, { "epoch": 0.2836700336700337, "grad_norm": 0.47697293758392334, "learning_rate": 4.0504807692307695e-06, "loss": 0.5261, "step": 337 }, { "epoch": 0.2845117845117845, "grad_norm": 0.4947918951511383, "learning_rate": 4.0625000000000005e-06, "loss": 0.4761, "step": 338 }, { "epoch": 0.28535353535353536, "grad_norm": 0.4598081111907959, "learning_rate": 4.074519230769231e-06, "loss": 0.4867, "step": 339 }, { "epoch": 0.28619528619528617, "grad_norm": 0.4164380729198456, "learning_rate": 4.086538461538462e-06, "loss": 0.5095, "step": 340 }, { "epoch": 0.28703703703703703, "grad_norm": 0.5049055218696594, "learning_rate": 4.098557692307693e-06, "loss": 0.504, "step": 341 }, { "epoch": 0.2878787878787879, "grad_norm": 0.5153409242630005, "learning_rate": 4.110576923076924e-06, "loss": 0.5003, "step": 342 }, { "epoch": 0.2887205387205387, "grad_norm": 0.5178847312927246, "learning_rate": 4.122596153846154e-06, "loss": 0.5314, "step": 343 }, { "epoch": 0.2895622895622896, "grad_norm": 0.508107602596283, "learning_rate": 4.134615384615385e-06, "loss": 0.5155, "step": 344 }, { "epoch": 0.2904040404040404, "grad_norm": 0.5109880566596985, "learning_rate": 4.146634615384616e-06, "loss": 0.507, "step": 345 }, { "epoch": 0.29124579124579125, "grad_norm": 0.49025750160217285, "learning_rate": 4.158653846153847e-06, "loss": 0.4661, "step": 346 }, { "epoch": 0.29208754208754206, "grad_norm": 0.42277082800865173, "learning_rate": 4.170673076923078e-06, "loss": 0.4847, "step": 347 }, { "epoch": 0.29292929292929293, "grad_norm": 0.5252418518066406, "learning_rate": 4.182692307692308e-06, "loss": 0.4877, "step": 348 }, { "epoch": 0.2937710437710438, "grad_norm": 0.536336362361908, "learning_rate": 4.194711538461539e-06, "loss": 0.5119, "step": 349 }, { "epoch": 0.2946127946127946, "grad_norm": 0.48380112648010254, "learning_rate": 4.20673076923077e-06, "loss": 0.5158, "step": 350 }, { "epoch": 0.29545454545454547, "grad_norm": 0.45153284072875977, "learning_rate": 4.21875e-06, "loss": 0.484, "step": 351 }, { "epoch": 0.2962962962962963, "grad_norm": 0.4614394009113312, "learning_rate": 4.230769230769231e-06, "loss": 0.4759, "step": 352 }, { "epoch": 0.29713804713804715, "grad_norm": 0.5119279026985168, "learning_rate": 4.242788461538462e-06, "loss": 0.5344, "step": 353 }, { "epoch": 0.29797979797979796, "grad_norm": 0.5352416634559631, "learning_rate": 4.254807692307692e-06, "loss": 0.5275, "step": 354 }, { "epoch": 0.2988215488215488, "grad_norm": 0.436328262090683, "learning_rate": 4.266826923076923e-06, "loss": 0.4953, "step": 355 }, { "epoch": 0.2996632996632997, "grad_norm": 0.434000164270401, "learning_rate": 4.278846153846154e-06, "loss": 0.4948, "step": 356 }, { "epoch": 0.3005050505050505, "grad_norm": 0.4645795226097107, "learning_rate": 4.290865384615384e-06, "loss": 0.5135, "step": 357 }, { "epoch": 0.30134680134680136, "grad_norm": 0.427421897649765, "learning_rate": 4.302884615384616e-06, "loss": 0.4968, "step": 358 }, { "epoch": 0.3021885521885522, "grad_norm": 0.4766145646572113, "learning_rate": 4.314903846153846e-06, "loss": 0.5139, "step": 359 }, { "epoch": 0.30303030303030304, "grad_norm": 0.584069013595581, "learning_rate": 4.326923076923077e-06, "loss": 0.4857, "step": 360 }, { "epoch": 0.30387205387205385, "grad_norm": 0.41942089796066284, "learning_rate": 4.338942307692308e-06, "loss": 0.498, "step": 361 }, { "epoch": 0.3047138047138047, "grad_norm": 0.47102612257003784, "learning_rate": 4.350961538461539e-06, "loss": 0.5038, "step": 362 }, { "epoch": 0.3055555555555556, "grad_norm": 0.5086708664894104, "learning_rate": 4.3629807692307695e-06, "loss": 0.5085, "step": 363 }, { "epoch": 0.3063973063973064, "grad_norm": 0.48408690094947815, "learning_rate": 4.3750000000000005e-06, "loss": 0.4895, "step": 364 }, { "epoch": 0.30723905723905726, "grad_norm": 0.45701146125793457, "learning_rate": 4.3870192307692315e-06, "loss": 0.4972, "step": 365 }, { "epoch": 0.30808080808080807, "grad_norm": 0.48086753487586975, "learning_rate": 4.399038461538462e-06, "loss": 0.4983, "step": 366 }, { "epoch": 0.30892255892255893, "grad_norm": 0.5152490735054016, "learning_rate": 4.411057692307693e-06, "loss": 0.4916, "step": 367 }, { "epoch": 0.30976430976430974, "grad_norm": 0.509262204170227, "learning_rate": 4.423076923076924e-06, "loss": 0.5097, "step": 368 }, { "epoch": 0.3106060606060606, "grad_norm": 0.4547169506549835, "learning_rate": 4.435096153846154e-06, "loss": 0.4828, "step": 369 }, { "epoch": 0.3114478114478115, "grad_norm": 0.4530722498893738, "learning_rate": 4.447115384615385e-06, "loss": 0.4957, "step": 370 }, { "epoch": 0.3122895622895623, "grad_norm": 0.48112958669662476, "learning_rate": 4.459134615384616e-06, "loss": 0.4719, "step": 371 }, { "epoch": 0.31313131313131315, "grad_norm": 0.5078083276748657, "learning_rate": 4.471153846153847e-06, "loss": 0.5084, "step": 372 }, { "epoch": 0.31397306397306396, "grad_norm": 0.48378244042396545, "learning_rate": 4.483173076923078e-06, "loss": 0.4937, "step": 373 }, { "epoch": 0.3148148148148148, "grad_norm": 0.5044835805892944, "learning_rate": 4.495192307692308e-06, "loss": 0.5003, "step": 374 }, { "epoch": 0.31565656565656564, "grad_norm": 0.43924498558044434, "learning_rate": 4.507211538461539e-06, "loss": 0.4633, "step": 375 }, { "epoch": 0.3164983164983165, "grad_norm": 0.5183614492416382, "learning_rate": 4.51923076923077e-06, "loss": 0.5156, "step": 376 }, { "epoch": 0.31734006734006737, "grad_norm": 0.469632089138031, "learning_rate": 4.53125e-06, "loss": 0.4988, "step": 377 }, { "epoch": 0.3181818181818182, "grad_norm": 0.4877038598060608, "learning_rate": 4.543269230769231e-06, "loss": 0.5144, "step": 378 }, { "epoch": 0.31902356902356904, "grad_norm": 0.48083415627479553, "learning_rate": 4.555288461538462e-06, "loss": 0.485, "step": 379 }, { "epoch": 0.31986531986531985, "grad_norm": 0.5440043807029724, "learning_rate": 4.567307692307692e-06, "loss": 0.5008, "step": 380 }, { "epoch": 0.3207070707070707, "grad_norm": 0.47986117005348206, "learning_rate": 4.579326923076923e-06, "loss": 0.5151, "step": 381 }, { "epoch": 0.32154882154882153, "grad_norm": 0.5295445322990417, "learning_rate": 4.591346153846154e-06, "loss": 0.5141, "step": 382 }, { "epoch": 0.3223905723905724, "grad_norm": 0.4840766489505768, "learning_rate": 4.603365384615385e-06, "loss": 0.5174, "step": 383 }, { "epoch": 0.32323232323232326, "grad_norm": 0.5351732969284058, "learning_rate": 4.615384615384616e-06, "loss": 0.5381, "step": 384 }, { "epoch": 0.32407407407407407, "grad_norm": 0.5545787215232849, "learning_rate": 4.627403846153847e-06, "loss": 0.4815, "step": 385 }, { "epoch": 0.32491582491582494, "grad_norm": 0.5383148789405823, "learning_rate": 4.639423076923077e-06, "loss": 0.5127, "step": 386 }, { "epoch": 0.32575757575757575, "grad_norm": 0.4678453207015991, "learning_rate": 4.651442307692308e-06, "loss": 0.5027, "step": 387 }, { "epoch": 0.3265993265993266, "grad_norm": 0.550381064414978, "learning_rate": 4.663461538461539e-06, "loss": 0.5071, "step": 388 }, { "epoch": 0.3274410774410774, "grad_norm": 0.5118295550346375, "learning_rate": 4.6754807692307694e-06, "loss": 0.4854, "step": 389 }, { "epoch": 0.3282828282828283, "grad_norm": 0.4496217668056488, "learning_rate": 4.6875000000000004e-06, "loss": 0.4654, "step": 390 }, { "epoch": 0.3291245791245791, "grad_norm": 0.4643478989601135, "learning_rate": 4.6995192307692314e-06, "loss": 0.4971, "step": 391 }, { "epoch": 0.32996632996632996, "grad_norm": 0.5618423223495483, "learning_rate": 4.711538461538462e-06, "loss": 0.5296, "step": 392 }, { "epoch": 0.33080808080808083, "grad_norm": 0.4464651942253113, "learning_rate": 4.723557692307693e-06, "loss": 0.5063, "step": 393 }, { "epoch": 0.33164983164983164, "grad_norm": 0.4400874972343445, "learning_rate": 4.7355769230769236e-06, "loss": 0.4972, "step": 394 }, { "epoch": 0.3324915824915825, "grad_norm": 0.6370819807052612, "learning_rate": 4.747596153846154e-06, "loss": 0.4909, "step": 395 }, { "epoch": 0.3333333333333333, "grad_norm": 0.4846592843532562, "learning_rate": 4.759615384615385e-06, "loss": 0.5008, "step": 396 }, { "epoch": 0.3341750841750842, "grad_norm": 0.4829165041446686, "learning_rate": 4.771634615384616e-06, "loss": 0.502, "step": 397 }, { "epoch": 0.335016835016835, "grad_norm": 0.48412299156188965, "learning_rate": 4.783653846153847e-06, "loss": 0.5288, "step": 398 }, { "epoch": 0.33585858585858586, "grad_norm": 0.475932776927948, "learning_rate": 4.795673076923078e-06, "loss": 0.5302, "step": 399 }, { "epoch": 0.3367003367003367, "grad_norm": 0.5231626629829407, "learning_rate": 4.807692307692308e-06, "loss": 0.5019, "step": 400 }, { "epoch": 0.33754208754208753, "grad_norm": 0.47957277297973633, "learning_rate": 4.819711538461539e-06, "loss": 0.456, "step": 401 }, { "epoch": 0.3383838383838384, "grad_norm": 0.48458266258239746, "learning_rate": 4.83173076923077e-06, "loss": 0.5272, "step": 402 }, { "epoch": 0.3392255892255892, "grad_norm": 0.511498749256134, "learning_rate": 4.84375e-06, "loss": 0.4995, "step": 403 }, { "epoch": 0.3400673400673401, "grad_norm": 0.5351393222808838, "learning_rate": 4.855769230769231e-06, "loss": 0.467, "step": 404 }, { "epoch": 0.3409090909090909, "grad_norm": 0.5056577920913696, "learning_rate": 4.867788461538462e-06, "loss": 0.4975, "step": 405 }, { "epoch": 0.34175084175084175, "grad_norm": 0.5164982080459595, "learning_rate": 4.879807692307693e-06, "loss": 0.5079, "step": 406 }, { "epoch": 0.3425925925925926, "grad_norm": 0.5027576088905334, "learning_rate": 4.891826923076923e-06, "loss": 0.4971, "step": 407 }, { "epoch": 0.3434343434343434, "grad_norm": 0.4617939889431, "learning_rate": 4.903846153846154e-06, "loss": 0.4831, "step": 408 }, { "epoch": 0.3442760942760943, "grad_norm": 0.59879469871521, "learning_rate": 4.915865384615385e-06, "loss": 0.5131, "step": 409 }, { "epoch": 0.3451178451178451, "grad_norm": 0.48065653443336487, "learning_rate": 4.927884615384616e-06, "loss": 0.4894, "step": 410 }, { "epoch": 0.34595959595959597, "grad_norm": 0.443237841129303, "learning_rate": 4.939903846153847e-06, "loss": 0.497, "step": 411 }, { "epoch": 0.3468013468013468, "grad_norm": 0.4972589313983917, "learning_rate": 4.951923076923077e-06, "loss": 0.503, "step": 412 }, { "epoch": 0.34764309764309764, "grad_norm": 0.5108572244644165, "learning_rate": 4.963942307692308e-06, "loss": 0.5011, "step": 413 }, { "epoch": 0.3484848484848485, "grad_norm": 0.553188681602478, "learning_rate": 4.975961538461539e-06, "loss": 0.4911, "step": 414 }, { "epoch": 0.3493265993265993, "grad_norm": 0.5023365616798401, "learning_rate": 4.987980769230769e-06, "loss": 0.4664, "step": 415 }, { "epoch": 0.3501683501683502, "grad_norm": 0.5594606995582581, "learning_rate": 5e-06, "loss": 0.5117, "step": 416 }, { "epoch": 0.351010101010101, "grad_norm": 0.5058784484863281, "learning_rate": 5.012019230769231e-06, "loss": 0.4831, "step": 417 }, { "epoch": 0.35185185185185186, "grad_norm": 0.4556933343410492, "learning_rate": 5.0240384615384616e-06, "loss": 0.5045, "step": 418 }, { "epoch": 0.35269360269360267, "grad_norm": 0.4678424298763275, "learning_rate": 5.036057692307693e-06, "loss": 0.5112, "step": 419 }, { "epoch": 0.35353535353535354, "grad_norm": 0.4804094433784485, "learning_rate": 5.0480769230769235e-06, "loss": 0.5019, "step": 420 }, { "epoch": 0.3543771043771044, "grad_norm": 0.5225346684455872, "learning_rate": 5.0600961538461545e-06, "loss": 0.5005, "step": 421 }, { "epoch": 0.3552188552188552, "grad_norm": 0.46472620964050293, "learning_rate": 5.072115384615385e-06, "loss": 0.4771, "step": 422 }, { "epoch": 0.3560606060606061, "grad_norm": 0.5639899969100952, "learning_rate": 5.084134615384616e-06, "loss": 0.5367, "step": 423 }, { "epoch": 0.3569023569023569, "grad_norm": 0.5572798252105713, "learning_rate": 5.096153846153846e-06, "loss": 0.5132, "step": 424 }, { "epoch": 0.35774410774410775, "grad_norm": 0.5395932793617249, "learning_rate": 5.108173076923078e-06, "loss": 0.4897, "step": 425 }, { "epoch": 0.35858585858585856, "grad_norm": 0.4979841709136963, "learning_rate": 5.120192307692308e-06, "loss": 0.4939, "step": 426 }, { "epoch": 0.35942760942760943, "grad_norm": 0.43538185954093933, "learning_rate": 5.132211538461539e-06, "loss": 0.4909, "step": 427 }, { "epoch": 0.3602693602693603, "grad_norm": 0.5317019820213318, "learning_rate": 5.144230769230769e-06, "loss": 0.5087, "step": 428 }, { "epoch": 0.3611111111111111, "grad_norm": 0.46812883019447327, "learning_rate": 5.156250000000001e-06, "loss": 0.4934, "step": 429 }, { "epoch": 0.36195286195286197, "grad_norm": 0.44829294085502625, "learning_rate": 5.168269230769232e-06, "loss": 0.4923, "step": 430 }, { "epoch": 0.3627946127946128, "grad_norm": 0.49959927797317505, "learning_rate": 5.180288461538462e-06, "loss": 0.4832, "step": 431 }, { "epoch": 0.36363636363636365, "grad_norm": 0.4876447021961212, "learning_rate": 5.192307692307693e-06, "loss": 0.4984, "step": 432 }, { "epoch": 0.36447811447811446, "grad_norm": 0.5445570349693298, "learning_rate": 5.204326923076923e-06, "loss": 0.5073, "step": 433 }, { "epoch": 0.3653198653198653, "grad_norm": 0.44871461391448975, "learning_rate": 5.216346153846155e-06, "loss": 0.5081, "step": 434 }, { "epoch": 0.3661616161616162, "grad_norm": 0.4719853401184082, "learning_rate": 5.228365384615385e-06, "loss": 0.484, "step": 435 }, { "epoch": 0.367003367003367, "grad_norm": 0.4707343280315399, "learning_rate": 5.240384615384616e-06, "loss": 0.5198, "step": 436 }, { "epoch": 0.36784511784511786, "grad_norm": 0.49534666538238525, "learning_rate": 5.252403846153846e-06, "loss": 0.53, "step": 437 }, { "epoch": 0.3686868686868687, "grad_norm": 0.5050320625305176, "learning_rate": 5.264423076923077e-06, "loss": 0.4823, "step": 438 }, { "epoch": 0.36952861952861954, "grad_norm": 0.450015127658844, "learning_rate": 5.276442307692307e-06, "loss": 0.4641, "step": 439 }, { "epoch": 0.37037037037037035, "grad_norm": 0.49869540333747864, "learning_rate": 5.288461538461539e-06, "loss": 0.5111, "step": 440 }, { "epoch": 0.3712121212121212, "grad_norm": 0.4853997826576233, "learning_rate": 5.300480769230769e-06, "loss": 0.4972, "step": 441 }, { "epoch": 0.3720538720538721, "grad_norm": 0.4555385410785675, "learning_rate": 5.3125e-06, "loss": 0.516, "step": 442 }, { "epoch": 0.3728956228956229, "grad_norm": 0.4817361533641815, "learning_rate": 5.324519230769231e-06, "loss": 0.5095, "step": 443 }, { "epoch": 0.37373737373737376, "grad_norm": 0.4375511705875397, "learning_rate": 5.3365384615384615e-06, "loss": 0.4903, "step": 444 }, { "epoch": 0.37457912457912457, "grad_norm": 0.44964277744293213, "learning_rate": 5.348557692307693e-06, "loss": 0.5039, "step": 445 }, { "epoch": 0.37542087542087543, "grad_norm": 0.461701899766922, "learning_rate": 5.3605769230769235e-06, "loss": 0.5042, "step": 446 }, { "epoch": 0.37626262626262624, "grad_norm": 0.4803580641746521, "learning_rate": 5.3725961538461545e-06, "loss": 0.48, "step": 447 }, { "epoch": 0.3771043771043771, "grad_norm": 0.46910640597343445, "learning_rate": 5.384615384615385e-06, "loss": 0.5133, "step": 448 }, { "epoch": 0.3779461279461279, "grad_norm": 0.48764732480049133, "learning_rate": 5.396634615384616e-06, "loss": 0.5006, "step": 449 }, { "epoch": 0.3787878787878788, "grad_norm": 0.5016416311264038, "learning_rate": 5.408653846153847e-06, "loss": 0.5102, "step": 450 }, { "epoch": 0.37962962962962965, "grad_norm": 0.5102939605712891, "learning_rate": 5.420673076923078e-06, "loss": 0.4798, "step": 451 }, { "epoch": 0.38047138047138046, "grad_norm": 0.5145123600959778, "learning_rate": 5.432692307692308e-06, "loss": 0.4872, "step": 452 }, { "epoch": 0.3813131313131313, "grad_norm": 0.4339072108268738, "learning_rate": 5.444711538461539e-06, "loss": 0.4809, "step": 453 }, { "epoch": 0.38215488215488214, "grad_norm": 0.521129310131073, "learning_rate": 5.456730769230769e-06, "loss": 0.4916, "step": 454 }, { "epoch": 0.382996632996633, "grad_norm": 0.6040670871734619, "learning_rate": 5.468750000000001e-06, "loss": 0.4867, "step": 455 }, { "epoch": 0.3838383838383838, "grad_norm": 0.469546914100647, "learning_rate": 5.480769230769232e-06, "loss": 0.4808, "step": 456 }, { "epoch": 0.3846801346801347, "grad_norm": 0.5152269005775452, "learning_rate": 5.492788461538462e-06, "loss": 0.4683, "step": 457 }, { "epoch": 0.38552188552188554, "grad_norm": 0.5161177515983582, "learning_rate": 5.504807692307693e-06, "loss": 0.5191, "step": 458 }, { "epoch": 0.38636363636363635, "grad_norm": 0.47038573026657104, "learning_rate": 5.516826923076923e-06, "loss": 0.4837, "step": 459 }, { "epoch": 0.3872053872053872, "grad_norm": 0.4765544831752777, "learning_rate": 5.528846153846155e-06, "loss": 0.521, "step": 460 }, { "epoch": 0.38804713804713803, "grad_norm": 0.5428287386894226, "learning_rate": 5.540865384615385e-06, "loss": 0.4914, "step": 461 }, { "epoch": 0.3888888888888889, "grad_norm": 0.4674924910068512, "learning_rate": 5.552884615384616e-06, "loss": 0.4688, "step": 462 }, { "epoch": 0.3897306397306397, "grad_norm": 0.517905056476593, "learning_rate": 5.564903846153846e-06, "loss": 0.4902, "step": 463 }, { "epoch": 0.39057239057239057, "grad_norm": 0.5230925679206848, "learning_rate": 5.576923076923077e-06, "loss": 0.4776, "step": 464 }, { "epoch": 0.39141414141414144, "grad_norm": 0.4778124988079071, "learning_rate": 5.588942307692307e-06, "loss": 0.4837, "step": 465 }, { "epoch": 0.39225589225589225, "grad_norm": 0.4963686168193817, "learning_rate": 5.600961538461539e-06, "loss": 0.4766, "step": 466 }, { "epoch": 0.3930976430976431, "grad_norm": 0.49569541215896606, "learning_rate": 5.612980769230769e-06, "loss": 0.4864, "step": 467 }, { "epoch": 0.3939393939393939, "grad_norm": 0.48670074343681335, "learning_rate": 5.625e-06, "loss": 0.4766, "step": 468 }, { "epoch": 0.3947811447811448, "grad_norm": 0.435845285654068, "learning_rate": 5.637019230769231e-06, "loss": 0.4949, "step": 469 }, { "epoch": 0.3956228956228956, "grad_norm": 0.46815937757492065, "learning_rate": 5.6490384615384615e-06, "loss": 0.4715, "step": 470 }, { "epoch": 0.39646464646464646, "grad_norm": 0.49429580569267273, "learning_rate": 5.661057692307693e-06, "loss": 0.5068, "step": 471 }, { "epoch": 0.39730639730639733, "grad_norm": 0.48835229873657227, "learning_rate": 5.6730769230769235e-06, "loss": 0.5004, "step": 472 }, { "epoch": 0.39814814814814814, "grad_norm": 0.4767301082611084, "learning_rate": 5.6850961538461545e-06, "loss": 0.501, "step": 473 }, { "epoch": 0.398989898989899, "grad_norm": 0.5044308304786682, "learning_rate": 5.697115384615385e-06, "loss": 0.4798, "step": 474 }, { "epoch": 0.3998316498316498, "grad_norm": 0.4946942627429962, "learning_rate": 5.7091346153846165e-06, "loss": 0.5028, "step": 475 }, { "epoch": 0.4006734006734007, "grad_norm": 0.46484243869781494, "learning_rate": 5.721153846153847e-06, "loss": 0.5285, "step": 476 }, { "epoch": 0.4015151515151515, "grad_norm": 0.4619486629962921, "learning_rate": 5.733173076923078e-06, "loss": 0.4821, "step": 477 }, { "epoch": 0.40235690235690236, "grad_norm": 0.4762738347053528, "learning_rate": 5.745192307692308e-06, "loss": 0.4748, "step": 478 }, { "epoch": 0.4031986531986532, "grad_norm": 0.4724401533603668, "learning_rate": 5.757211538461539e-06, "loss": 0.4884, "step": 479 }, { "epoch": 0.40404040404040403, "grad_norm": 0.4699448049068451, "learning_rate": 5.769230769230769e-06, "loss": 0.499, "step": 480 }, { "epoch": 0.4048821548821549, "grad_norm": 0.4347364902496338, "learning_rate": 5.781250000000001e-06, "loss": 0.4554, "step": 481 }, { "epoch": 0.4057239057239057, "grad_norm": 0.43620866537094116, "learning_rate": 5.793269230769232e-06, "loss": 0.4946, "step": 482 }, { "epoch": 0.4065656565656566, "grad_norm": 0.5166424512863159, "learning_rate": 5.805288461538462e-06, "loss": 0.4941, "step": 483 }, { "epoch": 0.4074074074074074, "grad_norm": 0.4856564998626709, "learning_rate": 5.817307692307693e-06, "loss": 0.4969, "step": 484 }, { "epoch": 0.40824915824915825, "grad_norm": 0.4564407765865326, "learning_rate": 5.829326923076923e-06, "loss": 0.501, "step": 485 }, { "epoch": 0.4090909090909091, "grad_norm": 0.4559411406517029, "learning_rate": 5.841346153846155e-06, "loss": 0.487, "step": 486 }, { "epoch": 0.4099326599326599, "grad_norm": 0.46640586853027344, "learning_rate": 5.853365384615385e-06, "loss": 0.5005, "step": 487 }, { "epoch": 0.4107744107744108, "grad_norm": 0.4679681956768036, "learning_rate": 5.865384615384616e-06, "loss": 0.488, "step": 488 }, { "epoch": 0.4116161616161616, "grad_norm": 0.48272666335105896, "learning_rate": 5.877403846153846e-06, "loss": 0.4766, "step": 489 }, { "epoch": 0.41245791245791247, "grad_norm": 0.44415998458862305, "learning_rate": 5.889423076923077e-06, "loss": 0.4877, "step": 490 }, { "epoch": 0.4132996632996633, "grad_norm": 0.4549729824066162, "learning_rate": 5.901442307692307e-06, "loss": 0.5024, "step": 491 }, { "epoch": 0.41414141414141414, "grad_norm": 0.4937102794647217, "learning_rate": 5.913461538461539e-06, "loss": 0.4717, "step": 492 }, { "epoch": 0.414983164983165, "grad_norm": 0.4803124964237213, "learning_rate": 5.925480769230769e-06, "loss": 0.4788, "step": 493 }, { "epoch": 0.4158249158249158, "grad_norm": 0.5310830473899841, "learning_rate": 5.9375e-06, "loss": 0.478, "step": 494 }, { "epoch": 0.4166666666666667, "grad_norm": 0.4707161784172058, "learning_rate": 5.949519230769232e-06, "loss": 0.4849, "step": 495 }, { "epoch": 0.4175084175084175, "grad_norm": 0.5432158708572388, "learning_rate": 5.961538461538462e-06, "loss": 0.5179, "step": 496 }, { "epoch": 0.41835016835016836, "grad_norm": 0.5814688801765442, "learning_rate": 5.973557692307693e-06, "loss": 0.5129, "step": 497 }, { "epoch": 0.41919191919191917, "grad_norm": 0.5257246494293213, "learning_rate": 5.9855769230769235e-06, "loss": 0.4836, "step": 498 }, { "epoch": 0.42003367003367004, "grad_norm": 0.4660387337207794, "learning_rate": 5.9975961538461545e-06, "loss": 0.5042, "step": 499 }, { "epoch": 0.4208754208754209, "grad_norm": 0.4647604823112488, "learning_rate": 6.009615384615385e-06, "loss": 0.4846, "step": 500 }, { "epoch": 0.4217171717171717, "grad_norm": 0.5020940899848938, "learning_rate": 6.0216346153846165e-06, "loss": 0.4881, "step": 501 }, { "epoch": 0.4225589225589226, "grad_norm": 0.4826745092868805, "learning_rate": 6.033653846153847e-06, "loss": 0.4897, "step": 502 }, { "epoch": 0.4234006734006734, "grad_norm": 0.5058521032333374, "learning_rate": 6.045673076923078e-06, "loss": 0.5039, "step": 503 }, { "epoch": 0.42424242424242425, "grad_norm": 0.5127459764480591, "learning_rate": 6.057692307692308e-06, "loss": 0.4887, "step": 504 }, { "epoch": 0.42508417508417506, "grad_norm": 0.48496514558792114, "learning_rate": 6.069711538461539e-06, "loss": 0.4972, "step": 505 }, { "epoch": 0.42592592592592593, "grad_norm": 0.45193547010421753, "learning_rate": 6.081730769230769e-06, "loss": 0.491, "step": 506 }, { "epoch": 0.42676767676767674, "grad_norm": 0.4819861948490143, "learning_rate": 6.093750000000001e-06, "loss": 0.501, "step": 507 }, { "epoch": 0.4276094276094276, "grad_norm": 0.49506768584251404, "learning_rate": 6.105769230769232e-06, "loss": 0.474, "step": 508 }, { "epoch": 0.42845117845117847, "grad_norm": 0.5094819664955139, "learning_rate": 6.117788461538462e-06, "loss": 0.4809, "step": 509 }, { "epoch": 0.4292929292929293, "grad_norm": 0.5152899622917175, "learning_rate": 6.129807692307693e-06, "loss": 0.4762, "step": 510 }, { "epoch": 0.43013468013468015, "grad_norm": 0.47224125266075134, "learning_rate": 6.141826923076923e-06, "loss": 0.464, "step": 511 }, { "epoch": 0.43097643097643096, "grad_norm": 0.4698803424835205, "learning_rate": 6.153846153846155e-06, "loss": 0.4816, "step": 512 }, { "epoch": 0.4318181818181818, "grad_norm": 0.45763683319091797, "learning_rate": 6.165865384615385e-06, "loss": 0.4759, "step": 513 }, { "epoch": 0.43265993265993263, "grad_norm": 0.44918256998062134, "learning_rate": 6.177884615384616e-06, "loss": 0.4883, "step": 514 }, { "epoch": 0.4335016835016835, "grad_norm": 0.4714778661727905, "learning_rate": 6.189903846153846e-06, "loss": 0.5052, "step": 515 }, { "epoch": 0.43434343434343436, "grad_norm": 0.4474315941333771, "learning_rate": 6.201923076923078e-06, "loss": 0.4916, "step": 516 }, { "epoch": 0.4351851851851852, "grad_norm": 0.49878793954849243, "learning_rate": 6.213942307692308e-06, "loss": 0.4964, "step": 517 }, { "epoch": 0.43602693602693604, "grad_norm": 0.5119079947471619, "learning_rate": 6.225961538461539e-06, "loss": 0.4727, "step": 518 }, { "epoch": 0.43686868686868685, "grad_norm": 0.4798416197299957, "learning_rate": 6.237980769230769e-06, "loss": 0.4917, "step": 519 }, { "epoch": 0.4377104377104377, "grad_norm": 0.45995789766311646, "learning_rate": 6.25e-06, "loss": 0.4626, "step": 520 }, { "epoch": 0.4385521885521885, "grad_norm": 0.5614783763885498, "learning_rate": 6.262019230769232e-06, "loss": 0.5073, "step": 521 }, { "epoch": 0.4393939393939394, "grad_norm": 0.4607319235801697, "learning_rate": 6.274038461538462e-06, "loss": 0.4935, "step": 522 }, { "epoch": 0.44023569023569026, "grad_norm": 0.48512810468673706, "learning_rate": 6.286057692307693e-06, "loss": 0.4818, "step": 523 }, { "epoch": 0.44107744107744107, "grad_norm": 0.4872691333293915, "learning_rate": 6.2980769230769234e-06, "loss": 0.4745, "step": 524 }, { "epoch": 0.44191919191919193, "grad_norm": 0.5046452283859253, "learning_rate": 6.3100961538461544e-06, "loss": 0.474, "step": 525 }, { "epoch": 0.44276094276094274, "grad_norm": 0.49153444170951843, "learning_rate": 6.322115384615385e-06, "loss": 0.4822, "step": 526 }, { "epoch": 0.4436026936026936, "grad_norm": 0.47855618596076965, "learning_rate": 6.3341346153846164e-06, "loss": 0.4833, "step": 527 }, { "epoch": 0.4444444444444444, "grad_norm": 0.4465228021144867, "learning_rate": 6.3461538461538466e-06, "loss": 0.4761, "step": 528 }, { "epoch": 0.4452861952861953, "grad_norm": 0.5102695822715759, "learning_rate": 6.3581730769230776e-06, "loss": 0.4759, "step": 529 }, { "epoch": 0.44612794612794615, "grad_norm": 0.4358554780483246, "learning_rate": 6.370192307692308e-06, "loss": 0.465, "step": 530 }, { "epoch": 0.44696969696969696, "grad_norm": 0.49788111448287964, "learning_rate": 6.382211538461539e-06, "loss": 0.4786, "step": 531 }, { "epoch": 0.4478114478114478, "grad_norm": 0.47267061471939087, "learning_rate": 6.394230769230769e-06, "loss": 0.4693, "step": 532 }, { "epoch": 0.44865319865319864, "grad_norm": 0.4479665756225586, "learning_rate": 6.406250000000001e-06, "loss": 0.4845, "step": 533 }, { "epoch": 0.4494949494949495, "grad_norm": 0.5452726483345032, "learning_rate": 6.418269230769232e-06, "loss": 0.4775, "step": 534 }, { "epoch": 0.4503367003367003, "grad_norm": 0.49981820583343506, "learning_rate": 6.430288461538462e-06, "loss": 0.4741, "step": 535 }, { "epoch": 0.4511784511784512, "grad_norm": 0.46802714467048645, "learning_rate": 6.442307692307693e-06, "loss": 0.4942, "step": 536 }, { "epoch": 0.45202020202020204, "grad_norm": 0.505921483039856, "learning_rate": 6.454326923076924e-06, "loss": 0.4949, "step": 537 }, { "epoch": 0.45286195286195285, "grad_norm": 0.49519670009613037, "learning_rate": 6.466346153846155e-06, "loss": 0.5024, "step": 538 }, { "epoch": 0.4537037037037037, "grad_norm": 0.5278582572937012, "learning_rate": 6.478365384615385e-06, "loss": 0.5115, "step": 539 }, { "epoch": 0.45454545454545453, "grad_norm": 0.5104811191558838, "learning_rate": 6.490384615384616e-06, "loss": 0.4962, "step": 540 }, { "epoch": 0.4553872053872054, "grad_norm": 0.4642155170440674, "learning_rate": 6.502403846153846e-06, "loss": 0.4926, "step": 541 }, { "epoch": 0.4562289562289562, "grad_norm": 0.4860260486602783, "learning_rate": 6.514423076923078e-06, "loss": 0.4701, "step": 542 }, { "epoch": 0.45707070707070707, "grad_norm": 0.47597676515579224, "learning_rate": 6.526442307692308e-06, "loss": 0.4728, "step": 543 }, { "epoch": 0.45791245791245794, "grad_norm": 0.546424388885498, "learning_rate": 6.538461538461539e-06, "loss": 0.4904, "step": 544 }, { "epoch": 0.45875420875420875, "grad_norm": 0.5197954177856445, "learning_rate": 6.550480769230769e-06, "loss": 0.4663, "step": 545 }, { "epoch": 0.4595959595959596, "grad_norm": 0.4767465889453888, "learning_rate": 6.5625e-06, "loss": 0.4764, "step": 546 }, { "epoch": 0.4604377104377104, "grad_norm": 0.48061972856521606, "learning_rate": 6.574519230769232e-06, "loss": 0.5052, "step": 547 }, { "epoch": 0.4612794612794613, "grad_norm": 0.4939226508140564, "learning_rate": 6.586538461538462e-06, "loss": 0.5099, "step": 548 }, { "epoch": 0.4621212121212121, "grad_norm": 0.5008599162101746, "learning_rate": 6.598557692307693e-06, "loss": 0.4837, "step": 549 }, { "epoch": 0.46296296296296297, "grad_norm": 0.5019283294677734, "learning_rate": 6.610576923076923e-06, "loss": 0.4829, "step": 550 }, { "epoch": 0.46380471380471383, "grad_norm": 0.4523107409477234, "learning_rate": 6.622596153846154e-06, "loss": 0.4629, "step": 551 }, { "epoch": 0.46464646464646464, "grad_norm": 0.5309014916419983, "learning_rate": 6.6346153846153846e-06, "loss": 0.4832, "step": 552 }, { "epoch": 0.4654882154882155, "grad_norm": 0.626876950263977, "learning_rate": 6.646634615384616e-06, "loss": 0.4609, "step": 553 }, { "epoch": 0.4663299663299663, "grad_norm": 0.5110170841217041, "learning_rate": 6.6586538461538465e-06, "loss": 0.4862, "step": 554 }, { "epoch": 0.4671717171717172, "grad_norm": 0.6093491315841675, "learning_rate": 6.6706730769230775e-06, "loss": 0.4758, "step": 555 }, { "epoch": 0.468013468013468, "grad_norm": 0.6044207811355591, "learning_rate": 6.682692307692308e-06, "loss": 0.4912, "step": 556 }, { "epoch": 0.46885521885521886, "grad_norm": 0.49996811151504517, "learning_rate": 6.694711538461539e-06, "loss": 0.456, "step": 557 }, { "epoch": 0.4696969696969697, "grad_norm": 0.5355966091156006, "learning_rate": 6.70673076923077e-06, "loss": 0.4709, "step": 558 }, { "epoch": 0.47053872053872053, "grad_norm": 0.5865458250045776, "learning_rate": 6.718750000000001e-06, "loss": 0.4658, "step": 559 }, { "epoch": 0.4713804713804714, "grad_norm": 0.5117978453636169, "learning_rate": 6.730769230769232e-06, "loss": 0.4658, "step": 560 }, { "epoch": 0.4722222222222222, "grad_norm": 0.5395684242248535, "learning_rate": 6.742788461538462e-06, "loss": 0.4764, "step": 561 }, { "epoch": 0.4730639730639731, "grad_norm": 0.6292040944099426, "learning_rate": 6.754807692307694e-06, "loss": 0.4944, "step": 562 }, { "epoch": 0.4739057239057239, "grad_norm": 0.4364086389541626, "learning_rate": 6.766826923076924e-06, "loss": 0.4807, "step": 563 }, { "epoch": 0.47474747474747475, "grad_norm": 0.5714690089225769, "learning_rate": 6.778846153846155e-06, "loss": 0.4821, "step": 564 }, { "epoch": 0.47558922558922556, "grad_norm": 0.5706982612609863, "learning_rate": 6.790865384615385e-06, "loss": 0.4621, "step": 565 }, { "epoch": 0.4764309764309764, "grad_norm": 0.46611884236335754, "learning_rate": 6.802884615384616e-06, "loss": 0.4833, "step": 566 }, { "epoch": 0.4772727272727273, "grad_norm": 0.5521095395088196, "learning_rate": 6.814903846153846e-06, "loss": 0.4709, "step": 567 }, { "epoch": 0.4781144781144781, "grad_norm": 0.6356738805770874, "learning_rate": 6.826923076923078e-06, "loss": 0.4914, "step": 568 }, { "epoch": 0.47895622895622897, "grad_norm": 0.5208408236503601, "learning_rate": 6.838942307692308e-06, "loss": 0.4671, "step": 569 }, { "epoch": 0.4797979797979798, "grad_norm": 0.5726565718650818, "learning_rate": 6.850961538461539e-06, "loss": 0.4639, "step": 570 }, { "epoch": 0.48063973063973064, "grad_norm": 0.5706713795661926, "learning_rate": 6.862980769230769e-06, "loss": 0.4893, "step": 571 }, { "epoch": 0.48148148148148145, "grad_norm": 0.5654163956642151, "learning_rate": 6.875e-06, "loss": 0.4673, "step": 572 }, { "epoch": 0.4823232323232323, "grad_norm": 0.4735598564147949, "learning_rate": 6.887019230769232e-06, "loss": 0.4983, "step": 573 }, { "epoch": 0.4831649831649832, "grad_norm": 0.5695908665657043, "learning_rate": 6.899038461538462e-06, "loss": 0.4983, "step": 574 }, { "epoch": 0.484006734006734, "grad_norm": 0.5011371374130249, "learning_rate": 6.911057692307693e-06, "loss": 0.4866, "step": 575 }, { "epoch": 0.48484848484848486, "grad_norm": 0.529087245464325, "learning_rate": 6.923076923076923e-06, "loss": 0.4644, "step": 576 }, { "epoch": 0.48569023569023567, "grad_norm": 0.5470275282859802, "learning_rate": 6.935096153846154e-06, "loss": 0.5156, "step": 577 }, { "epoch": 0.48653198653198654, "grad_norm": 0.47910067439079285, "learning_rate": 6.9471153846153845e-06, "loss": 0.4672, "step": 578 }, { "epoch": 0.48737373737373735, "grad_norm": 0.5154852867126465, "learning_rate": 6.959134615384616e-06, "loss": 0.4587, "step": 579 }, { "epoch": 0.4882154882154882, "grad_norm": 0.48536980152130127, "learning_rate": 6.9711538461538465e-06, "loss": 0.4896, "step": 580 }, { "epoch": 0.4890572390572391, "grad_norm": 0.5139884352684021, "learning_rate": 6.9831730769230775e-06, "loss": 0.4905, "step": 581 }, { "epoch": 0.4898989898989899, "grad_norm": 0.5450189709663391, "learning_rate": 6.995192307692308e-06, "loss": 0.479, "step": 582 }, { "epoch": 0.49074074074074076, "grad_norm": 0.5229934453964233, "learning_rate": 7.0072115384615395e-06, "loss": 0.4809, "step": 583 }, { "epoch": 0.49158249158249157, "grad_norm": 0.6015797257423401, "learning_rate": 7.01923076923077e-06, "loss": 0.488, "step": 584 }, { "epoch": 0.49242424242424243, "grad_norm": 0.6081055402755737, "learning_rate": 7.031250000000001e-06, "loss": 0.4591, "step": 585 }, { "epoch": 0.49326599326599324, "grad_norm": 0.5402100682258606, "learning_rate": 7.043269230769232e-06, "loss": 0.4867, "step": 586 }, { "epoch": 0.4941077441077441, "grad_norm": 0.5886497497558594, "learning_rate": 7.055288461538462e-06, "loss": 0.4775, "step": 587 }, { "epoch": 0.494949494949495, "grad_norm": 0.5507027506828308, "learning_rate": 7.067307692307694e-06, "loss": 0.4721, "step": 588 }, { "epoch": 0.4957912457912458, "grad_norm": 0.5298016667366028, "learning_rate": 7.079326923076924e-06, "loss": 0.4969, "step": 589 }, { "epoch": 0.49663299663299665, "grad_norm": 0.507517397403717, "learning_rate": 7.091346153846155e-06, "loss": 0.4937, "step": 590 }, { "epoch": 0.49747474747474746, "grad_norm": 0.5142073631286621, "learning_rate": 7.103365384615385e-06, "loss": 0.4722, "step": 591 }, { "epoch": 0.4983164983164983, "grad_norm": 0.5488661527633667, "learning_rate": 7.115384615384616e-06, "loss": 0.4747, "step": 592 }, { "epoch": 0.49915824915824913, "grad_norm": 0.49899905920028687, "learning_rate": 7.127403846153846e-06, "loss": 0.4955, "step": 593 }, { "epoch": 0.5, "grad_norm": 0.5035253763198853, "learning_rate": 7.139423076923078e-06, "loss": 0.4342, "step": 594 }, { "epoch": 0.5008417508417509, "grad_norm": 0.4821213185787201, "learning_rate": 7.151442307692308e-06, "loss": 0.4993, "step": 595 }, { "epoch": 0.5016835016835017, "grad_norm": 0.5364698171615601, "learning_rate": 7.163461538461539e-06, "loss": 0.489, "step": 596 }, { "epoch": 0.5025252525252525, "grad_norm": 0.49046310782432556, "learning_rate": 7.175480769230769e-06, "loss": 0.4695, "step": 597 }, { "epoch": 0.5033670033670034, "grad_norm": 0.47628283500671387, "learning_rate": 7.1875e-06, "loss": 0.4618, "step": 598 }, { "epoch": 0.5042087542087542, "grad_norm": 0.6613359451293945, "learning_rate": 7.199519230769232e-06, "loss": 0.4955, "step": 599 }, { "epoch": 0.5050505050505051, "grad_norm": 0.49906256794929504, "learning_rate": 7.211538461538462e-06, "loss": 0.484, "step": 600 }, { "epoch": 0.5058922558922558, "grad_norm": 0.5680059194564819, "learning_rate": 7.223557692307693e-06, "loss": 0.4947, "step": 601 }, { "epoch": 0.5067340067340067, "grad_norm": 0.5805497765541077, "learning_rate": 7.235576923076923e-06, "loss": 0.5018, "step": 602 }, { "epoch": 0.5075757575757576, "grad_norm": 0.550378680229187, "learning_rate": 7.247596153846155e-06, "loss": 0.4849, "step": 603 }, { "epoch": 0.5084175084175084, "grad_norm": 0.5820996761322021, "learning_rate": 7.259615384615385e-06, "loss": 0.4976, "step": 604 }, { "epoch": 0.5092592592592593, "grad_norm": 0.5624049305915833, "learning_rate": 7.271634615384616e-06, "loss": 0.4818, "step": 605 }, { "epoch": 0.51010101010101, "grad_norm": 0.5675477385520935, "learning_rate": 7.2836538461538465e-06, "loss": 0.4921, "step": 606 }, { "epoch": 0.5109427609427609, "grad_norm": 0.503031849861145, "learning_rate": 7.2956730769230775e-06, "loss": 0.4727, "step": 607 }, { "epoch": 0.5117845117845118, "grad_norm": 0.6441077589988708, "learning_rate": 7.307692307692308e-06, "loss": 0.474, "step": 608 }, { "epoch": 0.5126262626262627, "grad_norm": 0.5831144452095032, "learning_rate": 7.3197115384615395e-06, "loss": 0.4741, "step": 609 }, { "epoch": 0.5134680134680135, "grad_norm": 0.5234183669090271, "learning_rate": 7.33173076923077e-06, "loss": 0.4729, "step": 610 }, { "epoch": 0.5143097643097643, "grad_norm": 0.5143928527832031, "learning_rate": 7.343750000000001e-06, "loss": 0.4781, "step": 611 }, { "epoch": 0.5151515151515151, "grad_norm": 0.510441780090332, "learning_rate": 7.355769230769232e-06, "loss": 0.4659, "step": 612 }, { "epoch": 0.515993265993266, "grad_norm": 0.4805666208267212, "learning_rate": 7.367788461538462e-06, "loss": 0.4675, "step": 613 }, { "epoch": 0.5168350168350169, "grad_norm": 0.6595744490623474, "learning_rate": 7.379807692307694e-06, "loss": 0.5036, "step": 614 }, { "epoch": 0.5176767676767676, "grad_norm": 0.5530588626861572, "learning_rate": 7.391826923076924e-06, "loss": 0.4817, "step": 615 }, { "epoch": 0.5185185185185185, "grad_norm": 0.5593632459640503, "learning_rate": 7.403846153846155e-06, "loss": 0.4536, "step": 616 }, { "epoch": 0.5193602693602694, "grad_norm": 0.5759657025337219, "learning_rate": 7.415865384615385e-06, "loss": 0.4899, "step": 617 }, { "epoch": 0.5202020202020202, "grad_norm": 0.5678423047065735, "learning_rate": 7.427884615384616e-06, "loss": 0.4885, "step": 618 }, { "epoch": 0.5210437710437711, "grad_norm": 0.5930958390235901, "learning_rate": 7.439903846153846e-06, "loss": 0.4942, "step": 619 }, { "epoch": 0.5218855218855218, "grad_norm": 0.5203920602798462, "learning_rate": 7.451923076923078e-06, "loss": 0.4736, "step": 620 }, { "epoch": 0.5227272727272727, "grad_norm": 0.5334050059318542, "learning_rate": 7.463942307692308e-06, "loss": 0.4648, "step": 621 }, { "epoch": 0.5235690235690236, "grad_norm": 0.5758285522460938, "learning_rate": 7.475961538461539e-06, "loss": 0.4852, "step": 622 }, { "epoch": 0.5244107744107744, "grad_norm": 0.5613915324211121, "learning_rate": 7.487980769230769e-06, "loss": 0.4756, "step": 623 }, { "epoch": 0.5252525252525253, "grad_norm": 0.6641870737075806, "learning_rate": 7.500000000000001e-06, "loss": 0.4946, "step": 624 }, { "epoch": 0.5260942760942761, "grad_norm": 0.5807854533195496, "learning_rate": 7.512019230769232e-06, "loss": 0.4604, "step": 625 }, { "epoch": 0.5269360269360269, "grad_norm": 0.6811582446098328, "learning_rate": 7.524038461538462e-06, "loss": 0.4579, "step": 626 }, { "epoch": 0.5277777777777778, "grad_norm": 0.7274307012557983, "learning_rate": 7.536057692307693e-06, "loss": 0.5126, "step": 627 }, { "epoch": 0.5286195286195287, "grad_norm": 0.6930693984031677, "learning_rate": 7.548076923076923e-06, "loss": 0.495, "step": 628 }, { "epoch": 0.5294612794612794, "grad_norm": 0.7578830718994141, "learning_rate": 7.560096153846155e-06, "loss": 0.4858, "step": 629 }, { "epoch": 0.5303030303030303, "grad_norm": 0.6113349795341492, "learning_rate": 7.572115384615385e-06, "loss": 0.474, "step": 630 }, { "epoch": 0.5311447811447811, "grad_norm": 0.5895912647247314, "learning_rate": 7.584134615384616e-06, "loss": 0.4748, "step": 631 }, { "epoch": 0.531986531986532, "grad_norm": 0.5621899962425232, "learning_rate": 7.5961538461538465e-06, "loss": 0.488, "step": 632 }, { "epoch": 0.5328282828282829, "grad_norm": 0.5344527363777161, "learning_rate": 7.6081730769230775e-06, "loss": 0.4499, "step": 633 }, { "epoch": 0.5336700336700336, "grad_norm": 0.49869203567504883, "learning_rate": 7.620192307692308e-06, "loss": 0.4554, "step": 634 }, { "epoch": 0.5345117845117845, "grad_norm": 0.5366109013557434, "learning_rate": 7.63221153846154e-06, "loss": 0.4996, "step": 635 }, { "epoch": 0.5353535353535354, "grad_norm": 0.5738183259963989, "learning_rate": 7.64423076923077e-06, "loss": 0.4954, "step": 636 }, { "epoch": 0.5361952861952862, "grad_norm": 0.5121124982833862, "learning_rate": 7.656250000000001e-06, "loss": 0.4716, "step": 637 }, { "epoch": 0.5370370370370371, "grad_norm": 0.4951989948749542, "learning_rate": 7.668269230769232e-06, "loss": 0.4465, "step": 638 }, { "epoch": 0.5378787878787878, "grad_norm": 0.6070489883422852, "learning_rate": 7.680288461538462e-06, "loss": 0.4746, "step": 639 }, { "epoch": 0.5387205387205387, "grad_norm": 0.5442604422569275, "learning_rate": 7.692307692307694e-06, "loss": 0.5001, "step": 640 }, { "epoch": 0.5395622895622896, "grad_norm": 0.49886611104011536, "learning_rate": 7.704326923076924e-06, "loss": 0.4723, "step": 641 }, { "epoch": 0.5404040404040404, "grad_norm": 0.5302504897117615, "learning_rate": 7.716346153846156e-06, "loss": 0.4672, "step": 642 }, { "epoch": 0.5412457912457912, "grad_norm": 0.531509280204773, "learning_rate": 7.728365384615386e-06, "loss": 0.4931, "step": 643 }, { "epoch": 0.5420875420875421, "grad_norm": 0.47490960359573364, "learning_rate": 7.740384615384616e-06, "loss": 0.4681, "step": 644 }, { "epoch": 0.5429292929292929, "grad_norm": 0.5356503129005432, "learning_rate": 7.752403846153846e-06, "loss": 0.5075, "step": 645 }, { "epoch": 0.5437710437710438, "grad_norm": 0.5028493404388428, "learning_rate": 7.764423076923078e-06, "loss": 0.476, "step": 646 }, { "epoch": 0.5446127946127947, "grad_norm": 0.5357822179794312, "learning_rate": 7.776442307692308e-06, "loss": 0.4677, "step": 647 }, { "epoch": 0.5454545454545454, "grad_norm": 0.6258405447006226, "learning_rate": 7.78846153846154e-06, "loss": 0.4892, "step": 648 }, { "epoch": 0.5462962962962963, "grad_norm": 0.5021923184394836, "learning_rate": 7.80048076923077e-06, "loss": 0.4865, "step": 649 }, { "epoch": 0.5471380471380471, "grad_norm": 0.6114410758018494, "learning_rate": 7.8125e-06, "loss": 0.4629, "step": 650 }, { "epoch": 0.547979797979798, "grad_norm": 0.5158639550209045, "learning_rate": 7.824519230769232e-06, "loss": 0.4798, "step": 651 }, { "epoch": 0.5488215488215489, "grad_norm": 0.50199955701828, "learning_rate": 7.836538461538462e-06, "loss": 0.4728, "step": 652 }, { "epoch": 0.5496632996632996, "grad_norm": 0.49206605553627014, "learning_rate": 7.848557692307694e-06, "loss": 0.4618, "step": 653 }, { "epoch": 0.5505050505050505, "grad_norm": 0.47622647881507874, "learning_rate": 7.860576923076924e-06, "loss": 0.4428, "step": 654 }, { "epoch": 0.5513468013468014, "grad_norm": 0.5210945010185242, "learning_rate": 7.872596153846154e-06, "loss": 0.4905, "step": 655 }, { "epoch": 0.5521885521885522, "grad_norm": 0.5433646440505981, "learning_rate": 7.884615384615384e-06, "loss": 0.4751, "step": 656 }, { "epoch": 0.553030303030303, "grad_norm": 0.5094481110572815, "learning_rate": 7.896634615384616e-06, "loss": 0.4695, "step": 657 }, { "epoch": 0.5538720538720538, "grad_norm": 0.5298565626144409, "learning_rate": 7.908653846153846e-06, "loss": 0.4837, "step": 658 }, { "epoch": 0.5547138047138047, "grad_norm": 0.5288271903991699, "learning_rate": 7.920673076923078e-06, "loss": 0.4767, "step": 659 }, { "epoch": 0.5555555555555556, "grad_norm": 0.4994739890098572, "learning_rate": 7.932692307692308e-06, "loss": 0.4788, "step": 660 }, { "epoch": 0.5563973063973064, "grad_norm": 0.513253390789032, "learning_rate": 7.944711538461539e-06, "loss": 0.4994, "step": 661 }, { "epoch": 0.5572390572390572, "grad_norm": 0.493746280670166, "learning_rate": 7.956730769230769e-06, "loss": 0.4695, "step": 662 }, { "epoch": 0.5580808080808081, "grad_norm": 0.5455544590950012, "learning_rate": 7.96875e-06, "loss": 0.4441, "step": 663 }, { "epoch": 0.5589225589225589, "grad_norm": 0.5273863673210144, "learning_rate": 7.980769230769232e-06, "loss": 0.4804, "step": 664 }, { "epoch": 0.5597643097643098, "grad_norm": 0.6098944544792175, "learning_rate": 7.992788461538463e-06, "loss": 0.5228, "step": 665 }, { "epoch": 0.5606060606060606, "grad_norm": 0.5571783185005188, "learning_rate": 8.004807692307693e-06, "loss": 0.4574, "step": 666 }, { "epoch": 0.5614478114478114, "grad_norm": 0.5219266414642334, "learning_rate": 8.016826923076923e-06, "loss": 0.4829, "step": 667 }, { "epoch": 0.5622895622895623, "grad_norm": 0.5856225490570068, "learning_rate": 8.028846153846155e-06, "loss": 0.467, "step": 668 }, { "epoch": 0.5631313131313131, "grad_norm": 0.579103946685791, "learning_rate": 8.040865384615385e-06, "loss": 0.48, "step": 669 }, { "epoch": 0.563973063973064, "grad_norm": 0.5531448125839233, "learning_rate": 8.052884615384617e-06, "loss": 0.4998, "step": 670 }, { "epoch": 0.5648148148148148, "grad_norm": 0.6340730786323547, "learning_rate": 8.064903846153847e-06, "loss": 0.4424, "step": 671 }, { "epoch": 0.5656565656565656, "grad_norm": 0.56747967004776, "learning_rate": 8.076923076923077e-06, "loss": 0.4598, "step": 672 }, { "epoch": 0.5664983164983165, "grad_norm": 0.5403140783309937, "learning_rate": 8.088942307692307e-06, "loss": 0.4546, "step": 673 }, { "epoch": 0.5673400673400674, "grad_norm": 0.5548694729804993, "learning_rate": 8.100961538461539e-06, "loss": 0.4669, "step": 674 }, { "epoch": 0.5681818181818182, "grad_norm": 0.62501460313797, "learning_rate": 8.112980769230769e-06, "loss": 0.4822, "step": 675 }, { "epoch": 0.569023569023569, "grad_norm": 0.5604432821273804, "learning_rate": 8.125000000000001e-06, "loss": 0.4672, "step": 676 }, { "epoch": 0.5698653198653199, "grad_norm": 0.5901744365692139, "learning_rate": 8.137019230769231e-06, "loss": 0.4564, "step": 677 }, { "epoch": 0.5707070707070707, "grad_norm": 0.5077469944953918, "learning_rate": 8.149038461538461e-06, "loss": 0.5006, "step": 678 }, { "epoch": 0.5715488215488216, "grad_norm": 0.6039754152297974, "learning_rate": 8.161057692307693e-06, "loss": 0.4706, "step": 679 }, { "epoch": 0.5723905723905723, "grad_norm": 0.5326758623123169, "learning_rate": 8.173076923076923e-06, "loss": 0.473, "step": 680 }, { "epoch": 0.5732323232323232, "grad_norm": 0.5324435234069824, "learning_rate": 8.185096153846155e-06, "loss": 0.4858, "step": 681 }, { "epoch": 0.5740740740740741, "grad_norm": 0.5752372145652771, "learning_rate": 8.197115384615385e-06, "loss": 0.4937, "step": 682 }, { "epoch": 0.5749158249158249, "grad_norm": 0.5291317701339722, "learning_rate": 8.209134615384617e-06, "loss": 0.462, "step": 683 }, { "epoch": 0.5757575757575758, "grad_norm": 0.6811999082565308, "learning_rate": 8.221153846153847e-06, "loss": 0.4905, "step": 684 }, { "epoch": 0.5765993265993266, "grad_norm": 0.45471450686454773, "learning_rate": 8.233173076923077e-06, "loss": 0.48, "step": 685 }, { "epoch": 0.5774410774410774, "grad_norm": 0.618767499923706, "learning_rate": 8.245192307692308e-06, "loss": 0.4498, "step": 686 }, { "epoch": 0.5782828282828283, "grad_norm": 0.6020157933235168, "learning_rate": 8.25721153846154e-06, "loss": 0.4791, "step": 687 }, { "epoch": 0.5791245791245792, "grad_norm": 0.5376692414283752, "learning_rate": 8.26923076923077e-06, "loss": 0.4638, "step": 688 }, { "epoch": 0.57996632996633, "grad_norm": 0.6213158369064331, "learning_rate": 8.281250000000001e-06, "loss": 0.4755, "step": 689 }, { "epoch": 0.5808080808080808, "grad_norm": 0.5799819827079773, "learning_rate": 8.293269230769232e-06, "loss": 0.4777, "step": 690 }, { "epoch": 0.5816498316498316, "grad_norm": 0.500773012638092, "learning_rate": 8.305288461538462e-06, "loss": 0.4531, "step": 691 }, { "epoch": 0.5824915824915825, "grad_norm": 0.5459644198417664, "learning_rate": 8.317307692307694e-06, "loss": 0.4455, "step": 692 }, { "epoch": 0.5833333333333334, "grad_norm": 0.584384024143219, "learning_rate": 8.329326923076924e-06, "loss": 0.4557, "step": 693 }, { "epoch": 0.5841750841750841, "grad_norm": 0.49264082312583923, "learning_rate": 8.341346153846156e-06, "loss": 0.483, "step": 694 }, { "epoch": 0.585016835016835, "grad_norm": 0.56776362657547, "learning_rate": 8.353365384615386e-06, "loss": 0.5007, "step": 695 }, { "epoch": 0.5858585858585859, "grad_norm": 0.5343937277793884, "learning_rate": 8.365384615384616e-06, "loss": 0.4781, "step": 696 }, { "epoch": 0.5867003367003367, "grad_norm": 0.5687231421470642, "learning_rate": 8.377403846153846e-06, "loss": 0.4595, "step": 697 }, { "epoch": 0.5875420875420876, "grad_norm": 0.538329541683197, "learning_rate": 8.389423076923078e-06, "loss": 0.5022, "step": 698 }, { "epoch": 0.5883838383838383, "grad_norm": 0.5413886308670044, "learning_rate": 8.401442307692308e-06, "loss": 0.4602, "step": 699 }, { "epoch": 0.5892255892255892, "grad_norm": 0.666530966758728, "learning_rate": 8.41346153846154e-06, "loss": 0.4501, "step": 700 }, { "epoch": 0.5900673400673401, "grad_norm": 0.4921640455722809, "learning_rate": 8.42548076923077e-06, "loss": 0.4556, "step": 701 }, { "epoch": 0.5909090909090909, "grad_norm": 0.6813014149665833, "learning_rate": 8.4375e-06, "loss": 0.4722, "step": 702 }, { "epoch": 0.5917508417508418, "grad_norm": 0.5669876933097839, "learning_rate": 8.449519230769232e-06, "loss": 0.4473, "step": 703 }, { "epoch": 0.5925925925925926, "grad_norm": 0.5937355756759644, "learning_rate": 8.461538461538462e-06, "loss": 0.4607, "step": 704 }, { "epoch": 0.5934343434343434, "grad_norm": 0.6267182230949402, "learning_rate": 8.473557692307694e-06, "loss": 0.481, "step": 705 }, { "epoch": 0.5942760942760943, "grad_norm": 0.5806222558021545, "learning_rate": 8.485576923076924e-06, "loss": 0.4747, "step": 706 }, { "epoch": 0.5951178451178452, "grad_norm": 0.5684970021247864, "learning_rate": 8.497596153846154e-06, "loss": 0.4781, "step": 707 }, { "epoch": 0.5959595959595959, "grad_norm": 0.5098254084587097, "learning_rate": 8.509615384615384e-06, "loss": 0.4567, "step": 708 }, { "epoch": 0.5968013468013468, "grad_norm": 0.5780357122421265, "learning_rate": 8.521634615384616e-06, "loss": 0.457, "step": 709 }, { "epoch": 0.5976430976430976, "grad_norm": 0.5214495658874512, "learning_rate": 8.533653846153846e-06, "loss": 0.4914, "step": 710 }, { "epoch": 0.5984848484848485, "grad_norm": 0.6729046702384949, "learning_rate": 8.545673076923078e-06, "loss": 0.4878, "step": 711 }, { "epoch": 0.5993265993265994, "grad_norm": 0.5592548251152039, "learning_rate": 8.557692307692308e-06, "loss": 0.4968, "step": 712 }, { "epoch": 0.6001683501683501, "grad_norm": 0.7524355053901672, "learning_rate": 8.569711538461539e-06, "loss": 0.4518, "step": 713 }, { "epoch": 0.601010101010101, "grad_norm": 0.5896543860435486, "learning_rate": 8.581730769230769e-06, "loss": 0.4737, "step": 714 }, { "epoch": 0.6018518518518519, "grad_norm": 0.5865126252174377, "learning_rate": 8.59375e-06, "loss": 0.4811, "step": 715 }, { "epoch": 0.6026936026936027, "grad_norm": 0.5870494246482849, "learning_rate": 8.605769230769232e-06, "loss": 0.4912, "step": 716 }, { "epoch": 0.6035353535353535, "grad_norm": 0.6265791654586792, "learning_rate": 8.617788461538463e-06, "loss": 0.4678, "step": 717 }, { "epoch": 0.6043771043771043, "grad_norm": 0.5831058621406555, "learning_rate": 8.629807692307693e-06, "loss": 0.4856, "step": 718 }, { "epoch": 0.6052188552188552, "grad_norm": 0.5668807029724121, "learning_rate": 8.641826923076923e-06, "loss": 0.4735, "step": 719 }, { "epoch": 0.6060606060606061, "grad_norm": 0.5087153911590576, "learning_rate": 8.653846153846155e-06, "loss": 0.4418, "step": 720 }, { "epoch": 0.6069023569023569, "grad_norm": 0.5662202835083008, "learning_rate": 8.665865384615385e-06, "loss": 0.4773, "step": 721 }, { "epoch": 0.6077441077441077, "grad_norm": 0.5085929036140442, "learning_rate": 8.677884615384617e-06, "loss": 0.5079, "step": 722 }, { "epoch": 0.6085858585858586, "grad_norm": 0.5472040772438049, "learning_rate": 8.689903846153847e-06, "loss": 0.4747, "step": 723 }, { "epoch": 0.6094276094276094, "grad_norm": 0.5796393156051636, "learning_rate": 8.701923076923079e-06, "loss": 0.4632, "step": 724 }, { "epoch": 0.6102693602693603, "grad_norm": 0.5553291440010071, "learning_rate": 8.713942307692309e-06, "loss": 0.4862, "step": 725 }, { "epoch": 0.6111111111111112, "grad_norm": 0.5151901841163635, "learning_rate": 8.725961538461539e-06, "loss": 0.4418, "step": 726 }, { "epoch": 0.6119528619528619, "grad_norm": 0.5418024063110352, "learning_rate": 8.737980769230769e-06, "loss": 0.4607, "step": 727 }, { "epoch": 0.6127946127946128, "grad_norm": 0.5351435542106628, "learning_rate": 8.750000000000001e-06, "loss": 0.4799, "step": 728 }, { "epoch": 0.6136363636363636, "grad_norm": 0.547080397605896, "learning_rate": 8.762019230769233e-06, "loss": 0.4767, "step": 729 }, { "epoch": 0.6144781144781145, "grad_norm": 0.5164614319801331, "learning_rate": 8.774038461538463e-06, "loss": 0.4629, "step": 730 }, { "epoch": 0.6153198653198653, "grad_norm": 0.556559145450592, "learning_rate": 8.786057692307693e-06, "loss": 0.4933, "step": 731 }, { "epoch": 0.6161616161616161, "grad_norm": 0.4988883435726166, "learning_rate": 8.798076923076923e-06, "loss": 0.4751, "step": 732 }, { "epoch": 0.617003367003367, "grad_norm": 0.5318074822425842, "learning_rate": 8.810096153846155e-06, "loss": 0.4535, "step": 733 }, { "epoch": 0.6178451178451179, "grad_norm": 0.5376843810081482, "learning_rate": 8.822115384615385e-06, "loss": 0.4587, "step": 734 }, { "epoch": 0.6186868686868687, "grad_norm": 0.5631672739982605, "learning_rate": 8.834134615384617e-06, "loss": 0.473, "step": 735 }, { "epoch": 0.6195286195286195, "grad_norm": 0.607544481754303, "learning_rate": 8.846153846153847e-06, "loss": 0.4641, "step": 736 }, { "epoch": 0.6203703703703703, "grad_norm": 0.6117792725563049, "learning_rate": 8.858173076923077e-06, "loss": 0.4737, "step": 737 }, { "epoch": 0.6212121212121212, "grad_norm": 0.5736979246139526, "learning_rate": 8.870192307692308e-06, "loss": 0.4602, "step": 738 }, { "epoch": 0.6220538720538721, "grad_norm": 0.5896879434585571, "learning_rate": 8.88221153846154e-06, "loss": 0.5038, "step": 739 }, { "epoch": 0.622895622895623, "grad_norm": 0.5167244076728821, "learning_rate": 8.89423076923077e-06, "loss": 0.4732, "step": 740 }, { "epoch": 0.6237373737373737, "grad_norm": 0.5334715843200684, "learning_rate": 8.906250000000001e-06, "loss": 0.4548, "step": 741 }, { "epoch": 0.6245791245791246, "grad_norm": 0.5871593356132507, "learning_rate": 8.918269230769231e-06, "loss": 0.4552, "step": 742 }, { "epoch": 0.6254208754208754, "grad_norm": 0.5605810880661011, "learning_rate": 8.930288461538462e-06, "loss": 0.4647, "step": 743 }, { "epoch": 0.6262626262626263, "grad_norm": 0.5086384415626526, "learning_rate": 8.942307692307693e-06, "loss": 0.4833, "step": 744 }, { "epoch": 0.627104377104377, "grad_norm": 0.5565049648284912, "learning_rate": 8.954326923076924e-06, "loss": 0.4494, "step": 745 }, { "epoch": 0.6279461279461279, "grad_norm": 0.5247131586074829, "learning_rate": 8.966346153846155e-06, "loss": 0.469, "step": 746 }, { "epoch": 0.6287878787878788, "grad_norm": 0.7145178914070129, "learning_rate": 8.978365384615386e-06, "loss": 0.4581, "step": 747 }, { "epoch": 0.6296296296296297, "grad_norm": 0.5308711528778076, "learning_rate": 8.990384615384616e-06, "loss": 0.482, "step": 748 }, { "epoch": 0.6304713804713805, "grad_norm": 0.6755558848381042, "learning_rate": 9.002403846153846e-06, "loss": 0.4982, "step": 749 }, { "epoch": 0.6313131313131313, "grad_norm": 0.6583971381187439, "learning_rate": 9.014423076923078e-06, "loss": 0.4861, "step": 750 }, { "epoch": 0.6321548821548821, "grad_norm": 0.5595396161079407, "learning_rate": 9.026442307692308e-06, "loss": 0.4945, "step": 751 }, { "epoch": 0.632996632996633, "grad_norm": 0.6203212738037109, "learning_rate": 9.03846153846154e-06, "loss": 0.4452, "step": 752 }, { "epoch": 0.6338383838383839, "grad_norm": 0.5376958250999451, "learning_rate": 9.05048076923077e-06, "loss": 0.465, "step": 753 }, { "epoch": 0.6346801346801347, "grad_norm": 0.5286945104598999, "learning_rate": 9.0625e-06, "loss": 0.4645, "step": 754 }, { "epoch": 0.6355218855218855, "grad_norm": 0.686160683631897, "learning_rate": 9.074519230769232e-06, "loss": 0.4722, "step": 755 }, { "epoch": 0.6363636363636364, "grad_norm": 0.5844593048095703, "learning_rate": 9.086538461538462e-06, "loss": 0.4531, "step": 756 }, { "epoch": 0.6372053872053872, "grad_norm": 0.6571103930473328, "learning_rate": 9.098557692307694e-06, "loss": 0.4677, "step": 757 }, { "epoch": 0.6380471380471381, "grad_norm": 0.6030696630477905, "learning_rate": 9.110576923076924e-06, "loss": 0.4687, "step": 758 }, { "epoch": 0.6388888888888888, "grad_norm": 0.597265899181366, "learning_rate": 9.122596153846154e-06, "loss": 0.4673, "step": 759 }, { "epoch": 0.6397306397306397, "grad_norm": 0.5619155764579773, "learning_rate": 9.134615384615384e-06, "loss": 0.4646, "step": 760 }, { "epoch": 0.6405723905723906, "grad_norm": 0.6093075275421143, "learning_rate": 9.146634615384616e-06, "loss": 0.4389, "step": 761 }, { "epoch": 0.6414141414141414, "grad_norm": 0.5667924284934998, "learning_rate": 9.158653846153846e-06, "loss": 0.4632, "step": 762 }, { "epoch": 0.6422558922558923, "grad_norm": 0.5919568538665771, "learning_rate": 9.170673076923078e-06, "loss": 0.4767, "step": 763 }, { "epoch": 0.6430976430976431, "grad_norm": 0.4774402976036072, "learning_rate": 9.182692307692308e-06, "loss": 0.4386, "step": 764 }, { "epoch": 0.6439393939393939, "grad_norm": 0.549081563949585, "learning_rate": 9.194711538461538e-06, "loss": 0.4547, "step": 765 }, { "epoch": 0.6447811447811448, "grad_norm": 0.5224665403366089, "learning_rate": 9.20673076923077e-06, "loss": 0.4744, "step": 766 }, { "epoch": 0.6456228956228957, "grad_norm": 0.5786705017089844, "learning_rate": 9.21875e-06, "loss": 0.4828, "step": 767 }, { "epoch": 0.6464646464646465, "grad_norm": 0.5286431312561035, "learning_rate": 9.230769230769232e-06, "loss": 0.471, "step": 768 }, { "epoch": 0.6473063973063973, "grad_norm": 0.5456591844558716, "learning_rate": 9.242788461538462e-06, "loss": 0.4844, "step": 769 }, { "epoch": 0.6481481481481481, "grad_norm": 0.5166114568710327, "learning_rate": 9.254807692307694e-06, "loss": 0.4782, "step": 770 }, { "epoch": 0.648989898989899, "grad_norm": 0.6322537660598755, "learning_rate": 9.266826923076924e-06, "loss": 0.4823, "step": 771 }, { "epoch": 0.6498316498316499, "grad_norm": 0.5614060759544373, "learning_rate": 9.278846153846155e-06, "loss": 0.4836, "step": 772 }, { "epoch": 0.6506734006734006, "grad_norm": 0.5154160857200623, "learning_rate": 9.290865384615385e-06, "loss": 0.4725, "step": 773 }, { "epoch": 0.6515151515151515, "grad_norm": 0.5165377259254456, "learning_rate": 9.302884615384617e-06, "loss": 0.4567, "step": 774 }, { "epoch": 0.6523569023569024, "grad_norm": 0.5173574090003967, "learning_rate": 9.314903846153847e-06, "loss": 0.4546, "step": 775 }, { "epoch": 0.6531986531986532, "grad_norm": 0.6091572642326355, "learning_rate": 9.326923076923079e-06, "loss": 0.505, "step": 776 }, { "epoch": 0.6540404040404041, "grad_norm": 0.5100775957107544, "learning_rate": 9.338942307692309e-06, "loss": 0.4656, "step": 777 }, { "epoch": 0.6548821548821548, "grad_norm": 0.5154033303260803, "learning_rate": 9.350961538461539e-06, "loss": 0.4487, "step": 778 }, { "epoch": 0.6557239057239057, "grad_norm": 0.5674734115600586, "learning_rate": 9.362980769230769e-06, "loss": 0.4763, "step": 779 }, { "epoch": 0.6565656565656566, "grad_norm": 0.6164658069610596, "learning_rate": 9.375000000000001e-06, "loss": 0.4419, "step": 780 }, { "epoch": 0.6574074074074074, "grad_norm": 0.5208393335342407, "learning_rate": 9.387019230769233e-06, "loss": 0.4646, "step": 781 }, { "epoch": 0.6582491582491582, "grad_norm": 0.6296796798706055, "learning_rate": 9.399038461538463e-06, "loss": 0.4582, "step": 782 }, { "epoch": 0.6590909090909091, "grad_norm": 0.5322934985160828, "learning_rate": 9.411057692307693e-06, "loss": 0.4669, "step": 783 }, { "epoch": 0.6599326599326599, "grad_norm": 0.5300419926643372, "learning_rate": 9.423076923076923e-06, "loss": 0.4767, "step": 784 }, { "epoch": 0.6607744107744108, "grad_norm": 0.5256965756416321, "learning_rate": 9.435096153846155e-06, "loss": 0.4678, "step": 785 }, { "epoch": 0.6616161616161617, "grad_norm": 0.5663321018218994, "learning_rate": 9.447115384615385e-06, "loss": 0.4713, "step": 786 }, { "epoch": 0.6624579124579124, "grad_norm": 0.5133559703826904, "learning_rate": 9.459134615384617e-06, "loss": 0.4422, "step": 787 }, { "epoch": 0.6632996632996633, "grad_norm": 0.49950146675109863, "learning_rate": 9.471153846153847e-06, "loss": 0.49, "step": 788 }, { "epoch": 0.6641414141414141, "grad_norm": 0.5092547535896301, "learning_rate": 9.483173076923077e-06, "loss": 0.4793, "step": 789 }, { "epoch": 0.664983164983165, "grad_norm": 0.5462767481803894, "learning_rate": 9.495192307692307e-06, "loss": 0.4405, "step": 790 }, { "epoch": 0.6658249158249159, "grad_norm": 0.5611203908920288, "learning_rate": 9.50721153846154e-06, "loss": 0.4941, "step": 791 }, { "epoch": 0.6666666666666666, "grad_norm": 0.5567792057991028, "learning_rate": 9.51923076923077e-06, "loss": 0.4907, "step": 792 }, { "epoch": 0.6675084175084175, "grad_norm": 0.4765409827232361, "learning_rate": 9.531250000000001e-06, "loss": 0.4496, "step": 793 }, { "epoch": 0.6683501683501684, "grad_norm": 0.5768190026283264, "learning_rate": 9.543269230769231e-06, "loss": 0.4724, "step": 794 }, { "epoch": 0.6691919191919192, "grad_norm": 0.5483579635620117, "learning_rate": 9.555288461538462e-06, "loss": 0.4718, "step": 795 }, { "epoch": 0.67003367003367, "grad_norm": 0.4813135862350464, "learning_rate": 9.567307692307693e-06, "loss": 0.4554, "step": 796 }, { "epoch": 0.6708754208754208, "grad_norm": 0.5178253054618835, "learning_rate": 9.579326923076924e-06, "loss": 0.4606, "step": 797 }, { "epoch": 0.6717171717171717, "grad_norm": 0.5508886575698853, "learning_rate": 9.591346153846155e-06, "loss": 0.4746, "step": 798 }, { "epoch": 0.6725589225589226, "grad_norm": 0.5255088210105896, "learning_rate": 9.603365384615386e-06, "loss": 0.4714, "step": 799 }, { "epoch": 0.6734006734006734, "grad_norm": 0.4758123755455017, "learning_rate": 9.615384615384616e-06, "loss": 0.4505, "step": 800 }, { "epoch": 0.6742424242424242, "grad_norm": 0.5234363079071045, "learning_rate": 9.627403846153846e-06, "loss": 0.4794, "step": 801 }, { "epoch": 0.6750841750841751, "grad_norm": 0.5714362859725952, "learning_rate": 9.639423076923078e-06, "loss": 0.5084, "step": 802 }, { "epoch": 0.6759259259259259, "grad_norm": 0.6259543299674988, "learning_rate": 9.651442307692308e-06, "loss": 0.4909, "step": 803 }, { "epoch": 0.6767676767676768, "grad_norm": 0.5470781922340393, "learning_rate": 9.66346153846154e-06, "loss": 0.4836, "step": 804 }, { "epoch": 0.6776094276094277, "grad_norm": 0.5120142698287964, "learning_rate": 9.67548076923077e-06, "loss": 0.4573, "step": 805 }, { "epoch": 0.6784511784511784, "grad_norm": 0.5679871439933777, "learning_rate": 9.6875e-06, "loss": 0.4866, "step": 806 }, { "epoch": 0.6792929292929293, "grad_norm": 0.47754523158073425, "learning_rate": 9.699519230769232e-06, "loss": 0.4828, "step": 807 }, { "epoch": 0.6801346801346801, "grad_norm": 0.5987469553947449, "learning_rate": 9.711538461538462e-06, "loss": 0.4918, "step": 808 }, { "epoch": 0.680976430976431, "grad_norm": 0.54128497838974, "learning_rate": 9.723557692307694e-06, "loss": 0.4778, "step": 809 }, { "epoch": 0.6818181818181818, "grad_norm": 0.48948967456817627, "learning_rate": 9.735576923076924e-06, "loss": 0.4805, "step": 810 }, { "epoch": 0.6826599326599326, "grad_norm": 0.46790292859077454, "learning_rate": 9.747596153846156e-06, "loss": 0.4644, "step": 811 }, { "epoch": 0.6835016835016835, "grad_norm": 0.5583546161651611, "learning_rate": 9.759615384615386e-06, "loss": 0.474, "step": 812 }, { "epoch": 0.6843434343434344, "grad_norm": 0.5589898824691772, "learning_rate": 9.771634615384616e-06, "loss": 0.4734, "step": 813 }, { "epoch": 0.6851851851851852, "grad_norm": 0.53281170129776, "learning_rate": 9.783653846153846e-06, "loss": 0.4728, "step": 814 }, { "epoch": 0.686026936026936, "grad_norm": 0.614088773727417, "learning_rate": 9.795673076923078e-06, "loss": 0.4905, "step": 815 }, { "epoch": 0.6868686868686869, "grad_norm": 0.6235833764076233, "learning_rate": 9.807692307692308e-06, "loss": 0.4837, "step": 816 }, { "epoch": 0.6877104377104377, "grad_norm": 0.5030468702316284, "learning_rate": 9.81971153846154e-06, "loss": 0.4665, "step": 817 }, { "epoch": 0.6885521885521886, "grad_norm": 0.5937991738319397, "learning_rate": 9.83173076923077e-06, "loss": 0.473, "step": 818 }, { "epoch": 0.6893939393939394, "grad_norm": 0.4897083640098572, "learning_rate": 9.84375e-06, "loss": 0.4529, "step": 819 }, { "epoch": 0.6902356902356902, "grad_norm": 0.602712094783783, "learning_rate": 9.855769230769232e-06, "loss": 0.4919, "step": 820 }, { "epoch": 0.6910774410774411, "grad_norm": 0.48713091015815735, "learning_rate": 9.867788461538462e-06, "loss": 0.4859, "step": 821 }, { "epoch": 0.6919191919191919, "grad_norm": 0.52967369556427, "learning_rate": 9.879807692307694e-06, "loss": 0.4931, "step": 822 }, { "epoch": 0.6927609427609428, "grad_norm": 0.6116909980773926, "learning_rate": 9.891826923076924e-06, "loss": 0.4827, "step": 823 }, { "epoch": 0.6936026936026936, "grad_norm": 0.4570704996585846, "learning_rate": 9.903846153846155e-06, "loss": 0.4902, "step": 824 }, { "epoch": 0.6944444444444444, "grad_norm": 0.5384284257888794, "learning_rate": 9.915865384615385e-06, "loss": 0.4468, "step": 825 }, { "epoch": 0.6952861952861953, "grad_norm": 0.499580979347229, "learning_rate": 9.927884615384617e-06, "loss": 0.4428, "step": 826 }, { "epoch": 0.6961279461279462, "grad_norm": 0.5262911915779114, "learning_rate": 9.939903846153847e-06, "loss": 0.4782, "step": 827 }, { "epoch": 0.696969696969697, "grad_norm": 0.47358694672584534, "learning_rate": 9.951923076923079e-06, "loss": 0.4696, "step": 828 }, { "epoch": 0.6978114478114478, "grad_norm": 0.6106840968132019, "learning_rate": 9.963942307692309e-06, "loss": 0.4693, "step": 829 }, { "epoch": 0.6986531986531986, "grad_norm": 0.5421957969665527, "learning_rate": 9.975961538461539e-06, "loss": 0.4743, "step": 830 }, { "epoch": 0.6994949494949495, "grad_norm": 0.5049262046813965, "learning_rate": 9.987980769230769e-06, "loss": 0.4424, "step": 831 }, { "epoch": 0.7003367003367004, "grad_norm": 0.6546176671981812, "learning_rate": 1e-05, "loss": 0.4535, "step": 832 }, { "epoch": 0.7011784511784511, "grad_norm": 0.5066074132919312, "learning_rate": 9.999999559473347e-06, "loss": 0.4762, "step": 833 }, { "epoch": 0.702020202020202, "grad_norm": 0.5727912187576294, "learning_rate": 9.999998237893464e-06, "loss": 0.45, "step": 834 }, { "epoch": 0.7028619528619529, "grad_norm": 0.5320847630500793, "learning_rate": 9.999996035260584e-06, "loss": 0.4994, "step": 835 }, { "epoch": 0.7037037037037037, "grad_norm": 0.501000165939331, "learning_rate": 9.999992951575098e-06, "loss": 0.446, "step": 836 }, { "epoch": 0.7045454545454546, "grad_norm": 0.5546392798423767, "learning_rate": 9.999988986837544e-06, "loss": 0.4549, "step": 837 }, { "epoch": 0.7053872053872053, "grad_norm": 0.47442951798439026, "learning_rate": 9.999984141048625e-06, "loss": 0.4656, "step": 838 }, { "epoch": 0.7062289562289562, "grad_norm": 0.555999219417572, "learning_rate": 9.99997841420919e-06, "loss": 0.4696, "step": 839 }, { "epoch": 0.7070707070707071, "grad_norm": 0.559571385383606, "learning_rate": 9.999971806320256e-06, "loss": 0.4567, "step": 840 }, { "epoch": 0.7079124579124579, "grad_norm": 0.5354645252227783, "learning_rate": 9.99996431738298e-06, "loss": 0.4659, "step": 841 }, { "epoch": 0.7087542087542088, "grad_norm": 0.5663822293281555, "learning_rate": 9.999955947398687e-06, "loss": 0.4676, "step": 842 }, { "epoch": 0.7095959595959596, "grad_norm": 0.4805471897125244, "learning_rate": 9.999946696368847e-06, "loss": 0.4905, "step": 843 }, { "epoch": 0.7104377104377104, "grad_norm": 0.5676055550575256, "learning_rate": 9.999936564295093e-06, "loss": 0.4435, "step": 844 }, { "epoch": 0.7112794612794613, "grad_norm": 0.6202540993690491, "learning_rate": 9.999925551179213e-06, "loss": 0.4872, "step": 845 }, { "epoch": 0.7121212121212122, "grad_norm": 0.5717779397964478, "learning_rate": 9.999913657023141e-06, "loss": 0.4586, "step": 846 }, { "epoch": 0.7129629629629629, "grad_norm": 0.6225761771202087, "learning_rate": 9.999900881828978e-06, "loss": 0.4804, "step": 847 }, { "epoch": 0.7138047138047138, "grad_norm": 0.5385556817054749, "learning_rate": 9.999887225598974e-06, "loss": 0.4978, "step": 848 }, { "epoch": 0.7146464646464646, "grad_norm": 0.5566672086715698, "learning_rate": 9.999872688335532e-06, "loss": 0.4794, "step": 849 }, { "epoch": 0.7154882154882155, "grad_norm": 0.592505931854248, "learning_rate": 9.99985727004122e-06, "loss": 0.45, "step": 850 }, { "epoch": 0.7163299663299664, "grad_norm": 0.5444203615188599, "learning_rate": 9.999840970718751e-06, "loss": 0.4998, "step": 851 }, { "epoch": 0.7171717171717171, "grad_norm": 0.6432380080223083, "learning_rate": 9.999823790370995e-06, "loss": 0.4905, "step": 852 }, { "epoch": 0.718013468013468, "grad_norm": 0.6080014705657959, "learning_rate": 9.999805729000984e-06, "loss": 0.4701, "step": 853 }, { "epoch": 0.7188552188552189, "grad_norm": 0.5344563126564026, "learning_rate": 9.999786786611899e-06, "loss": 0.4374, "step": 854 }, { "epoch": 0.7196969696969697, "grad_norm": 0.5315552353858948, "learning_rate": 9.999766963207076e-06, "loss": 0.4682, "step": 855 }, { "epoch": 0.7205387205387206, "grad_norm": 0.6178186535835266, "learning_rate": 9.99974625879001e-06, "loss": 0.4772, "step": 856 }, { "epoch": 0.7213804713804713, "grad_norm": 0.53788822889328, "learning_rate": 9.999724673364348e-06, "loss": 0.446, "step": 857 }, { "epoch": 0.7222222222222222, "grad_norm": 0.5974572896957397, "learning_rate": 9.999702206933895e-06, "loss": 0.4663, "step": 858 }, { "epoch": 0.7230639730639731, "grad_norm": 0.5753597021102905, "learning_rate": 9.99967885950261e-06, "loss": 0.4725, "step": 859 }, { "epoch": 0.7239057239057239, "grad_norm": 0.5275941491127014, "learning_rate": 9.999654631074605e-06, "loss": 0.4679, "step": 860 }, { "epoch": 0.7247474747474747, "grad_norm": 0.5501117706298828, "learning_rate": 9.999629521654152e-06, "loss": 0.4552, "step": 861 }, { "epoch": 0.7255892255892256, "grad_norm": 0.5410375595092773, "learning_rate": 9.999603531245673e-06, "loss": 0.4764, "step": 862 }, { "epoch": 0.7264309764309764, "grad_norm": 0.5003824234008789, "learning_rate": 9.999576659853749e-06, "loss": 0.4727, "step": 863 }, { "epoch": 0.7272727272727273, "grad_norm": 0.5109595060348511, "learning_rate": 9.999548907483115e-06, "loss": 0.4548, "step": 864 }, { "epoch": 0.7281144781144782, "grad_norm": 0.5565609335899353, "learning_rate": 9.999520274138662e-06, "loss": 0.4336, "step": 865 }, { "epoch": 0.7289562289562289, "grad_norm": 0.46877625584602356, "learning_rate": 9.999490759825434e-06, "loss": 0.4657, "step": 866 }, { "epoch": 0.7297979797979798, "grad_norm": 0.4781875014305115, "learning_rate": 9.999460364548632e-06, "loss": 0.4371, "step": 867 }, { "epoch": 0.7306397306397306, "grad_norm": 0.6060004830360413, "learning_rate": 9.999429088313614e-06, "loss": 0.4668, "step": 868 }, { "epoch": 0.7314814814814815, "grad_norm": 0.5096649527549744, "learning_rate": 9.999396931125889e-06, "loss": 0.4543, "step": 869 }, { "epoch": 0.7323232323232324, "grad_norm": 0.6071654558181763, "learning_rate": 9.999363892991124e-06, "loss": 0.4833, "step": 870 }, { "epoch": 0.7331649831649831, "grad_norm": 0.6239425539970398, "learning_rate": 9.999329973915139e-06, "loss": 0.4642, "step": 871 }, { "epoch": 0.734006734006734, "grad_norm": 0.51524418592453, "learning_rate": 9.999295173903913e-06, "loss": 0.4522, "step": 872 }, { "epoch": 0.7348484848484849, "grad_norm": 0.7928244471549988, "learning_rate": 9.99925949296358e-06, "loss": 0.471, "step": 873 }, { "epoch": 0.7356902356902357, "grad_norm": 0.5531490445137024, "learning_rate": 9.999222931100423e-06, "loss": 0.447, "step": 874 }, { "epoch": 0.7365319865319865, "grad_norm": 0.6046125888824463, "learning_rate": 9.999185488320889e-06, "loss": 0.4755, "step": 875 }, { "epoch": 0.7373737373737373, "grad_norm": 0.6821202635765076, "learning_rate": 9.99914716463157e-06, "loss": 0.4571, "step": 876 }, { "epoch": 0.7382154882154882, "grad_norm": 0.7309443950653076, "learning_rate": 9.999107960039224e-06, "loss": 0.4788, "step": 877 }, { "epoch": 0.7390572390572391, "grad_norm": 0.5109738707542419, "learning_rate": 9.999067874550761e-06, "loss": 0.4589, "step": 878 }, { "epoch": 0.73989898989899, "grad_norm": 0.8120182752609253, "learning_rate": 9.999026908173237e-06, "loss": 0.4758, "step": 879 }, { "epoch": 0.7407407407407407, "grad_norm": 0.5823782086372375, "learning_rate": 9.998985060913877e-06, "loss": 0.4598, "step": 880 }, { "epoch": 0.7415824915824916, "grad_norm": 0.7194758653640747, "learning_rate": 9.998942332780055e-06, "loss": 0.4773, "step": 881 }, { "epoch": 0.7424242424242424, "grad_norm": 0.6104284524917603, "learning_rate": 9.998898723779294e-06, "loss": 0.4632, "step": 882 }, { "epoch": 0.7432659932659933, "grad_norm": 0.5531834363937378, "learning_rate": 9.998854233919284e-06, "loss": 0.452, "step": 883 }, { "epoch": 0.7441077441077442, "grad_norm": 0.5974542498588562, "learning_rate": 9.998808863207865e-06, "loss": 0.4539, "step": 884 }, { "epoch": 0.7449494949494949, "grad_norm": 0.6488112807273865, "learning_rate": 9.998762611653028e-06, "loss": 0.4986, "step": 885 }, { "epoch": 0.7457912457912458, "grad_norm": 0.7117404341697693, "learning_rate": 9.998715479262926e-06, "loss": 0.4738, "step": 886 }, { "epoch": 0.7466329966329966, "grad_norm": 0.5183034539222717, "learning_rate": 9.998667466045862e-06, "loss": 0.4778, "step": 887 }, { "epoch": 0.7474747474747475, "grad_norm": 0.6704620718955994, "learning_rate": 9.998618572010298e-06, "loss": 0.4737, "step": 888 }, { "epoch": 0.7483164983164983, "grad_norm": 0.6523065567016602, "learning_rate": 9.99856879716485e-06, "loss": 0.5097, "step": 889 }, { "epoch": 0.7491582491582491, "grad_norm": 0.5552307963371277, "learning_rate": 9.998518141518287e-06, "loss": 0.4468, "step": 890 }, { "epoch": 0.75, "grad_norm": 0.5721311569213867, "learning_rate": 9.998466605079537e-06, "loss": 0.4414, "step": 891 }, { "epoch": 0.7508417508417509, "grad_norm": 0.5872257947921753, "learning_rate": 9.998414187857678e-06, "loss": 0.4863, "step": 892 }, { "epoch": 0.7516835016835017, "grad_norm": 0.5167661905288696, "learning_rate": 9.998360889861951e-06, "loss": 0.4643, "step": 893 }, { "epoch": 0.7525252525252525, "grad_norm": 0.5516799688339233, "learning_rate": 9.998306711101746e-06, "loss": 0.4546, "step": 894 }, { "epoch": 0.7533670033670034, "grad_norm": 0.6352124214172363, "learning_rate": 9.99825165158661e-06, "loss": 0.4589, "step": 895 }, { "epoch": 0.7542087542087542, "grad_norm": 0.5473071336746216, "learning_rate": 9.998195711326245e-06, "loss": 0.4553, "step": 896 }, { "epoch": 0.7550505050505051, "grad_norm": 0.6246609687805176, "learning_rate": 9.998138890330504e-06, "loss": 0.4595, "step": 897 }, { "epoch": 0.7558922558922558, "grad_norm": 0.6000503897666931, "learning_rate": 9.998081188609405e-06, "loss": 0.4461, "step": 898 }, { "epoch": 0.7567340067340067, "grad_norm": 0.4873741865158081, "learning_rate": 9.998022606173115e-06, "loss": 0.4526, "step": 899 }, { "epoch": 0.7575757575757576, "grad_norm": 0.4987059533596039, "learning_rate": 9.997963143031956e-06, "loss": 0.4474, "step": 900 }, { "epoch": 0.7584175084175084, "grad_norm": 0.5353677272796631, "learning_rate": 9.997902799196404e-06, "loss": 0.4874, "step": 901 }, { "epoch": 0.7592592592592593, "grad_norm": 0.5527768135070801, "learning_rate": 9.997841574677096e-06, "loss": 0.4487, "step": 902 }, { "epoch": 0.76010101010101, "grad_norm": 0.5339164137840271, "learning_rate": 9.997779469484816e-06, "loss": 0.4746, "step": 903 }, { "epoch": 0.7609427609427609, "grad_norm": 0.4849138855934143, "learning_rate": 9.997716483630513e-06, "loss": 0.4804, "step": 904 }, { "epoch": 0.7617845117845118, "grad_norm": 0.6012535691261292, "learning_rate": 9.99765261712528e-06, "loss": 0.4652, "step": 905 }, { "epoch": 0.7626262626262627, "grad_norm": 0.48524826765060425, "learning_rate": 9.997587869980376e-06, "loss": 0.4541, "step": 906 }, { "epoch": 0.7634680134680135, "grad_norm": 0.5285351872444153, "learning_rate": 9.997522242207207e-06, "loss": 0.4683, "step": 907 }, { "epoch": 0.7643097643097643, "grad_norm": 0.5169563293457031, "learning_rate": 9.997455733817338e-06, "loss": 0.4815, "step": 908 }, { "epoch": 0.7651515151515151, "grad_norm": 0.5612872838973999, "learning_rate": 9.997388344822489e-06, "loss": 0.4648, "step": 909 }, { "epoch": 0.765993265993266, "grad_norm": 0.48703497648239136, "learning_rate": 9.997320075234535e-06, "loss": 0.4382, "step": 910 }, { "epoch": 0.7668350168350169, "grad_norm": 0.5571085810661316, "learning_rate": 9.997250925065505e-06, "loss": 0.4688, "step": 911 }, { "epoch": 0.7676767676767676, "grad_norm": 0.5621874928474426, "learning_rate": 9.997180894327582e-06, "loss": 0.4711, "step": 912 }, { "epoch": 0.7685185185185185, "grad_norm": 0.4844915568828583, "learning_rate": 9.99710998303311e-06, "loss": 0.4527, "step": 913 }, { "epoch": 0.7693602693602694, "grad_norm": 0.6080215573310852, "learning_rate": 9.997038191194584e-06, "loss": 0.4692, "step": 914 }, { "epoch": 0.7702020202020202, "grad_norm": 0.5499245524406433, "learning_rate": 9.996965518824652e-06, "loss": 0.4687, "step": 915 }, { "epoch": 0.7710437710437711, "grad_norm": 0.5146388411521912, "learning_rate": 9.99689196593612e-06, "loss": 0.4841, "step": 916 }, { "epoch": 0.7718855218855218, "grad_norm": 0.6070188879966736, "learning_rate": 9.996817532541952e-06, "loss": 0.4866, "step": 917 }, { "epoch": 0.7727272727272727, "grad_norm": 0.5029646158218384, "learning_rate": 9.99674221865526e-06, "loss": 0.444, "step": 918 }, { "epoch": 0.7735690235690236, "grad_norm": 0.49566370248794556, "learning_rate": 9.996666024289316e-06, "loss": 0.4411, "step": 919 }, { "epoch": 0.7744107744107744, "grad_norm": 0.5037920475006104, "learning_rate": 9.996588949457547e-06, "loss": 0.4411, "step": 920 }, { "epoch": 0.7752525252525253, "grad_norm": 0.6454029679298401, "learning_rate": 9.996510994173537e-06, "loss": 0.4473, "step": 921 }, { "epoch": 0.7760942760942761, "grad_norm": 0.6914611458778381, "learning_rate": 9.996432158451017e-06, "loss": 0.4683, "step": 922 }, { "epoch": 0.7769360269360269, "grad_norm": 0.5385676026344299, "learning_rate": 9.996352442303883e-06, "loss": 0.4826, "step": 923 }, { "epoch": 0.7777777777777778, "grad_norm": 0.6487097144126892, "learning_rate": 9.996271845746179e-06, "loss": 0.5109, "step": 924 }, { "epoch": 0.7786195286195287, "grad_norm": 0.5642906427383423, "learning_rate": 9.99619036879211e-06, "loss": 0.4285, "step": 925 }, { "epoch": 0.7794612794612794, "grad_norm": 0.5543748140335083, "learning_rate": 9.99610801145603e-06, "loss": 0.488, "step": 926 }, { "epoch": 0.7803030303030303, "grad_norm": 0.5695657134056091, "learning_rate": 9.996024773752454e-06, "loss": 0.4574, "step": 927 }, { "epoch": 0.7811447811447811, "grad_norm": 0.5696499347686768, "learning_rate": 9.995940655696048e-06, "loss": 0.4396, "step": 928 }, { "epoch": 0.781986531986532, "grad_norm": 0.5318474173545837, "learning_rate": 9.995855657301633e-06, "loss": 0.4657, "step": 929 }, { "epoch": 0.7828282828282829, "grad_norm": 0.49283796548843384, "learning_rate": 9.995769778584191e-06, "loss": 0.4538, "step": 930 }, { "epoch": 0.7836700336700336, "grad_norm": 0.5600171685218811, "learning_rate": 9.995683019558851e-06, "loss": 0.4952, "step": 931 }, { "epoch": 0.7845117845117845, "grad_norm": 0.5859991908073425, "learning_rate": 9.9955953802409e-06, "loss": 0.4806, "step": 932 }, { "epoch": 0.7853535353535354, "grad_norm": 0.5459975004196167, "learning_rate": 9.995506860645786e-06, "loss": 0.4543, "step": 933 }, { "epoch": 0.7861952861952862, "grad_norm": 0.4901520907878876, "learning_rate": 9.995417460789104e-06, "loss": 0.4534, "step": 934 }, { "epoch": 0.7870370370370371, "grad_norm": 0.6021410822868347, "learning_rate": 9.995327180686605e-06, "loss": 0.4881, "step": 935 }, { "epoch": 0.7878787878787878, "grad_norm": 0.57134610414505, "learning_rate": 9.9952360203542e-06, "loss": 0.4733, "step": 936 }, { "epoch": 0.7887205387205387, "grad_norm": 0.4590649902820587, "learning_rate": 9.995143979807951e-06, "loss": 0.4583, "step": 937 }, { "epoch": 0.7895622895622896, "grad_norm": 0.559934139251709, "learning_rate": 9.995051059064078e-06, "loss": 0.46, "step": 938 }, { "epoch": 0.7904040404040404, "grad_norm": 0.5781227946281433, "learning_rate": 9.994957258138955e-06, "loss": 0.4573, "step": 939 }, { "epoch": 0.7912457912457912, "grad_norm": 0.5273347496986389, "learning_rate": 9.99486257704911e-06, "loss": 0.4521, "step": 940 }, { "epoch": 0.7920875420875421, "grad_norm": 0.5664651393890381, "learning_rate": 9.994767015811225e-06, "loss": 0.4319, "step": 941 }, { "epoch": 0.7929292929292929, "grad_norm": 0.4783509373664856, "learning_rate": 9.994670574442141e-06, "loss": 0.4637, "step": 942 }, { "epoch": 0.7937710437710438, "grad_norm": 0.5753483176231384, "learning_rate": 9.994573252958853e-06, "loss": 0.4713, "step": 943 }, { "epoch": 0.7946127946127947, "grad_norm": 0.5746240019798279, "learning_rate": 9.994475051378507e-06, "loss": 0.4449, "step": 944 }, { "epoch": 0.7954545454545454, "grad_norm": 0.5092933773994446, "learning_rate": 9.99437596971841e-06, "loss": 0.4665, "step": 945 }, { "epoch": 0.7962962962962963, "grad_norm": 0.569720447063446, "learning_rate": 9.994276007996018e-06, "loss": 0.4849, "step": 946 }, { "epoch": 0.7971380471380471, "grad_norm": 0.5389716625213623, "learning_rate": 9.994175166228947e-06, "loss": 0.4498, "step": 947 }, { "epoch": 0.797979797979798, "grad_norm": 0.5067436695098877, "learning_rate": 9.99407344443497e-06, "loss": 0.452, "step": 948 }, { "epoch": 0.7988215488215489, "grad_norm": 0.4825668931007385, "learning_rate": 9.993970842632005e-06, "loss": 0.4436, "step": 949 }, { "epoch": 0.7996632996632996, "grad_norm": 0.563647449016571, "learning_rate": 9.993867360838137e-06, "loss": 0.4575, "step": 950 }, { "epoch": 0.8005050505050505, "grad_norm": 0.5021247267723083, "learning_rate": 9.993762999071597e-06, "loss": 0.4605, "step": 951 }, { "epoch": 0.8013468013468014, "grad_norm": 0.6064338088035583, "learning_rate": 9.993657757350777e-06, "loss": 0.4657, "step": 952 }, { "epoch": 0.8021885521885522, "grad_norm": 0.565477728843689, "learning_rate": 9.993551635694218e-06, "loss": 0.4443, "step": 953 }, { "epoch": 0.803030303030303, "grad_norm": 0.6067532896995544, "learning_rate": 9.993444634120627e-06, "loss": 0.4584, "step": 954 }, { "epoch": 0.8038720538720538, "grad_norm": 0.5611613392829895, "learning_rate": 9.99333675264885e-06, "loss": 0.4721, "step": 955 }, { "epoch": 0.8047138047138047, "grad_norm": 0.5915161967277527, "learning_rate": 9.993227991297903e-06, "loss": 0.4846, "step": 956 }, { "epoch": 0.8055555555555556, "grad_norm": 0.5038012862205505, "learning_rate": 9.993118350086947e-06, "loss": 0.4719, "step": 957 }, { "epoch": 0.8063973063973064, "grad_norm": 0.6122514605522156, "learning_rate": 9.993007829035306e-06, "loss": 0.4702, "step": 958 }, { "epoch": 0.8072390572390572, "grad_norm": 0.5634361505508423, "learning_rate": 9.992896428162452e-06, "loss": 0.4747, "step": 959 }, { "epoch": 0.8080808080808081, "grad_norm": 0.5262794494628906, "learning_rate": 9.992784147488018e-06, "loss": 0.4404, "step": 960 }, { "epoch": 0.8089225589225589, "grad_norm": 0.6034880876541138, "learning_rate": 9.992670987031786e-06, "loss": 0.4942, "step": 961 }, { "epoch": 0.8097643097643098, "grad_norm": 0.5170391201972961, "learning_rate": 9.992556946813696e-06, "loss": 0.4654, "step": 962 }, { "epoch": 0.8106060606060606, "grad_norm": 0.5069051384925842, "learning_rate": 9.992442026853846e-06, "loss": 0.4689, "step": 963 }, { "epoch": 0.8114478114478114, "grad_norm": 0.526301383972168, "learning_rate": 9.992326227172483e-06, "loss": 0.5059, "step": 964 }, { "epoch": 0.8122895622895623, "grad_norm": 0.49756789207458496, "learning_rate": 9.992209547790012e-06, "loss": 0.4462, "step": 965 }, { "epoch": 0.8131313131313131, "grad_norm": 0.5008155703544617, "learning_rate": 9.992091988726998e-06, "loss": 0.4426, "step": 966 }, { "epoch": 0.813973063973064, "grad_norm": 0.49166810512542725, "learning_rate": 9.99197355000415e-06, "loss": 0.4712, "step": 967 }, { "epoch": 0.8148148148148148, "grad_norm": 0.4973139762878418, "learning_rate": 9.991854231642345e-06, "loss": 0.4655, "step": 968 }, { "epoch": 0.8156565656565656, "grad_norm": 0.48771727085113525, "learning_rate": 9.9917340336626e-06, "loss": 0.4654, "step": 969 }, { "epoch": 0.8164983164983165, "grad_norm": 0.530437707901001, "learning_rate": 9.9916129560861e-06, "loss": 0.4662, "step": 970 }, { "epoch": 0.8173400673400674, "grad_norm": 0.5563759207725525, "learning_rate": 9.99149099893418e-06, "loss": 0.4934, "step": 971 }, { "epoch": 0.8181818181818182, "grad_norm": 0.493308424949646, "learning_rate": 9.99136816222833e-06, "loss": 0.4568, "step": 972 }, { "epoch": 0.819023569023569, "grad_norm": 0.49903517961502075, "learning_rate": 9.991244445990193e-06, "loss": 0.4728, "step": 973 }, { "epoch": 0.8198653198653199, "grad_norm": 0.5646932721138, "learning_rate": 9.991119850241573e-06, "loss": 0.48, "step": 974 }, { "epoch": 0.8207070707070707, "grad_norm": 0.45239782333374023, "learning_rate": 9.990994375004422e-06, "loss": 0.44, "step": 975 }, { "epoch": 0.8215488215488216, "grad_norm": 0.48145848512649536, "learning_rate": 9.990868020300853e-06, "loss": 0.4703, "step": 976 }, { "epoch": 0.8223905723905723, "grad_norm": 0.5419365763664246, "learning_rate": 9.990740786153125e-06, "loss": 0.4581, "step": 977 }, { "epoch": 0.8232323232323232, "grad_norm": 0.5141558051109314, "learning_rate": 9.990612672583666e-06, "loss": 0.4687, "step": 978 }, { "epoch": 0.8240740740740741, "grad_norm": 0.46558523178100586, "learning_rate": 9.990483679615044e-06, "loss": 0.4669, "step": 979 }, { "epoch": 0.8249158249158249, "grad_norm": 0.520839512348175, "learning_rate": 9.990353807269995e-06, "loss": 0.4713, "step": 980 }, { "epoch": 0.8257575757575758, "grad_norm": 0.6344213485717773, "learning_rate": 9.990223055571398e-06, "loss": 0.4426, "step": 981 }, { "epoch": 0.8265993265993266, "grad_norm": 0.5174628496170044, "learning_rate": 9.990091424542298e-06, "loss": 0.4687, "step": 982 }, { "epoch": 0.8274410774410774, "grad_norm": 0.5975804924964905, "learning_rate": 9.989958914205886e-06, "loss": 0.4741, "step": 983 }, { "epoch": 0.8282828282828283, "grad_norm": 0.4909435212612152, "learning_rate": 9.989825524585514e-06, "loss": 0.4467, "step": 984 }, { "epoch": 0.8291245791245792, "grad_norm": 0.49698901176452637, "learning_rate": 9.989691255704685e-06, "loss": 0.4531, "step": 985 }, { "epoch": 0.82996632996633, "grad_norm": 0.5153672099113464, "learning_rate": 9.98955610758706e-06, "loss": 0.4698, "step": 986 }, { "epoch": 0.8308080808080808, "grad_norm": 0.5274020433425903, "learning_rate": 9.989420080256454e-06, "loss": 0.4671, "step": 987 }, { "epoch": 0.8316498316498316, "grad_norm": 0.5154382586479187, "learning_rate": 9.989283173736835e-06, "loss": 0.4432, "step": 988 }, { "epoch": 0.8324915824915825, "grad_norm": 0.48255765438079834, "learning_rate": 9.989145388052328e-06, "loss": 0.4482, "step": 989 }, { "epoch": 0.8333333333333334, "grad_norm": 0.5773054957389832, "learning_rate": 9.989006723227212e-06, "loss": 0.4839, "step": 990 }, { "epoch": 0.8341750841750841, "grad_norm": 0.5060662031173706, "learning_rate": 9.988867179285921e-06, "loss": 0.4868, "step": 991 }, { "epoch": 0.835016835016835, "grad_norm": 0.48368775844573975, "learning_rate": 9.988726756253046e-06, "loss": 0.4512, "step": 992 }, { "epoch": 0.8358585858585859, "grad_norm": 0.4422658085823059, "learning_rate": 9.988585454153329e-06, "loss": 0.4539, "step": 993 }, { "epoch": 0.8367003367003367, "grad_norm": 0.5150693655014038, "learning_rate": 9.98844327301167e-06, "loss": 0.4844, "step": 994 }, { "epoch": 0.8375420875420876, "grad_norm": 0.4918389916419983, "learning_rate": 9.98830021285312e-06, "loss": 0.4756, "step": 995 }, { "epoch": 0.8383838383838383, "grad_norm": 0.585923433303833, "learning_rate": 9.988156273702893e-06, "loss": 0.46, "step": 996 }, { "epoch": 0.8392255892255892, "grad_norm": 0.5159416198730469, "learning_rate": 9.988011455586349e-06, "loss": 0.4411, "step": 997 }, { "epoch": 0.8400673400673401, "grad_norm": 0.6379266977310181, "learning_rate": 9.987865758529006e-06, "loss": 0.4389, "step": 998 }, { "epoch": 0.8409090909090909, "grad_norm": 0.6692814826965332, "learning_rate": 9.987719182556538e-06, "loss": 0.48, "step": 999 }, { "epoch": 0.8417508417508418, "grad_norm": 0.5532235503196716, "learning_rate": 9.987571727694777e-06, "loss": 0.4606, "step": 1000 }, { "epoch": 0.8425925925925926, "grad_norm": 0.5646963119506836, "learning_rate": 9.987423393969701e-06, "loss": 0.4634, "step": 1001 }, { "epoch": 0.8434343434343434, "grad_norm": 0.5861549973487854, "learning_rate": 9.987274181407451e-06, "loss": 0.4619, "step": 1002 }, { "epoch": 0.8442760942760943, "grad_norm": 0.5745865702629089, "learning_rate": 9.987124090034317e-06, "loss": 0.4549, "step": 1003 }, { "epoch": 0.8451178451178452, "grad_norm": 0.5599626302719116, "learning_rate": 9.98697311987675e-06, "loss": 0.4577, "step": 1004 }, { "epoch": 0.8459595959595959, "grad_norm": 0.5411409139633179, "learning_rate": 9.98682127096135e-06, "loss": 0.4631, "step": 1005 }, { "epoch": 0.8468013468013468, "grad_norm": 0.4867398142814636, "learning_rate": 9.986668543314877e-06, "loss": 0.4734, "step": 1006 }, { "epoch": 0.8476430976430976, "grad_norm": 0.5048694014549255, "learning_rate": 9.98651493696424e-06, "loss": 0.4472, "step": 1007 }, { "epoch": 0.8484848484848485, "grad_norm": 0.47543784976005554, "learning_rate": 9.986360451936508e-06, "loss": 0.4698, "step": 1008 }, { "epoch": 0.8493265993265994, "grad_norm": 0.555640459060669, "learning_rate": 9.986205088258905e-06, "loss": 0.4857, "step": 1009 }, { "epoch": 0.8501683501683501, "grad_norm": 0.5930616855621338, "learning_rate": 9.986048845958803e-06, "loss": 0.4838, "step": 1010 }, { "epoch": 0.851010101010101, "grad_norm": 0.5537025928497314, "learning_rate": 9.985891725063737e-06, "loss": 0.4932, "step": 1011 }, { "epoch": 0.8518518518518519, "grad_norm": 0.5929937362670898, "learning_rate": 9.985733725601391e-06, "loss": 0.4574, "step": 1012 }, { "epoch": 0.8526936026936027, "grad_norm": 0.5442627668380737, "learning_rate": 9.98557484759961e-06, "loss": 0.4551, "step": 1013 }, { "epoch": 0.8535353535353535, "grad_norm": 0.6641427278518677, "learning_rate": 9.985415091086385e-06, "loss": 0.4658, "step": 1014 }, { "epoch": 0.8543771043771043, "grad_norm": 0.5918787121772766, "learning_rate": 9.985254456089871e-06, "loss": 0.44, "step": 1015 }, { "epoch": 0.8552188552188552, "grad_norm": 0.5404515266418457, "learning_rate": 9.985092942638371e-06, "loss": 0.4536, "step": 1016 }, { "epoch": 0.8560606060606061, "grad_norm": 0.6941116452217102, "learning_rate": 9.984930550760345e-06, "loss": 0.4799, "step": 1017 }, { "epoch": 0.8569023569023569, "grad_norm": 0.5734308362007141, "learning_rate": 9.984767280484413e-06, "loss": 0.5047, "step": 1018 }, { "epoch": 0.8577441077441077, "grad_norm": 0.5958016514778137, "learning_rate": 9.984603131839339e-06, "loss": 0.4797, "step": 1019 }, { "epoch": 0.8585858585858586, "grad_norm": 0.55145263671875, "learning_rate": 9.98443810485405e-06, "loss": 0.4431, "step": 1020 }, { "epoch": 0.8594276094276094, "grad_norm": 0.6410260200500488, "learning_rate": 9.984272199557627e-06, "loss": 0.5027, "step": 1021 }, { "epoch": 0.8602693602693603, "grad_norm": 0.5028948783874512, "learning_rate": 9.984105415979302e-06, "loss": 0.4589, "step": 1022 }, { "epoch": 0.8611111111111112, "grad_norm": 0.6020529866218567, "learning_rate": 9.983937754148467e-06, "loss": 0.4424, "step": 1023 }, { "epoch": 0.8619528619528619, "grad_norm": 0.46482351422309875, "learning_rate": 9.983769214094661e-06, "loss": 0.477, "step": 1024 }, { "epoch": 0.8627946127946128, "grad_norm": 0.6272096633911133, "learning_rate": 9.983599795847588e-06, "loss": 0.4477, "step": 1025 }, { "epoch": 0.8636363636363636, "grad_norm": 0.6593365669250488, "learning_rate": 9.983429499437096e-06, "loss": 0.4706, "step": 1026 }, { "epoch": 0.8644781144781145, "grad_norm": 0.5216848254203796, "learning_rate": 9.983258324893199e-06, "loss": 0.4671, "step": 1027 }, { "epoch": 0.8653198653198653, "grad_norm": 0.5457995533943176, "learning_rate": 9.983086272246055e-06, "loss": 0.4612, "step": 1028 }, { "epoch": 0.8661616161616161, "grad_norm": 0.7819581031799316, "learning_rate": 9.982913341525981e-06, "loss": 0.4637, "step": 1029 }, { "epoch": 0.867003367003367, "grad_norm": 0.5016987323760986, "learning_rate": 9.982739532763454e-06, "loss": 0.4627, "step": 1030 }, { "epoch": 0.8678451178451179, "grad_norm": 0.5511379241943359, "learning_rate": 9.982564845989098e-06, "loss": 0.4723, "step": 1031 }, { "epoch": 0.8686868686868687, "grad_norm": 0.5430501699447632, "learning_rate": 9.982389281233693e-06, "loss": 0.4551, "step": 1032 }, { "epoch": 0.8695286195286195, "grad_norm": 0.5615834593772888, "learning_rate": 9.98221283852818e-06, "loss": 0.4888, "step": 1033 }, { "epoch": 0.8703703703703703, "grad_norm": 0.49130189418792725, "learning_rate": 9.982035517903645e-06, "loss": 0.4768, "step": 1034 }, { "epoch": 0.8712121212121212, "grad_norm": 0.5672492384910583, "learning_rate": 9.981857319391338e-06, "loss": 0.4525, "step": 1035 }, { "epoch": 0.8720538720538721, "grad_norm": 0.5694364309310913, "learning_rate": 9.981678243022656e-06, "loss": 0.4507, "step": 1036 }, { "epoch": 0.872895622895623, "grad_norm": 0.5430108308792114, "learning_rate": 9.981498288829157e-06, "loss": 0.4503, "step": 1037 }, { "epoch": 0.8737373737373737, "grad_norm": 0.5515562891960144, "learning_rate": 9.98131745684255e-06, "loss": 0.4453, "step": 1038 }, { "epoch": 0.8745791245791246, "grad_norm": 0.4862312972545624, "learning_rate": 9.981135747094698e-06, "loss": 0.4309, "step": 1039 }, { "epoch": 0.8754208754208754, "grad_norm": 0.6130978465080261, "learning_rate": 9.98095315961762e-06, "loss": 0.4492, "step": 1040 }, { "epoch": 0.8762626262626263, "grad_norm": 0.4930335581302643, "learning_rate": 9.980769694443495e-06, "loss": 0.4526, "step": 1041 }, { "epoch": 0.877104377104377, "grad_norm": 0.5040249228477478, "learning_rate": 9.980585351604647e-06, "loss": 0.4464, "step": 1042 }, { "epoch": 0.8779461279461279, "grad_norm": 0.5572879910469055, "learning_rate": 9.980400131133557e-06, "loss": 0.4559, "step": 1043 }, { "epoch": 0.8787878787878788, "grad_norm": 0.5916115045547485, "learning_rate": 9.980214033062867e-06, "loss": 0.4551, "step": 1044 }, { "epoch": 0.8796296296296297, "grad_norm": 0.5097640156745911, "learning_rate": 9.98002705742537e-06, "loss": 0.4415, "step": 1045 }, { "epoch": 0.8804713804713805, "grad_norm": 0.6020386815071106, "learning_rate": 9.97983920425401e-06, "loss": 0.4709, "step": 1046 }, { "epoch": 0.8813131313131313, "grad_norm": 0.5191218256950378, "learning_rate": 9.97965047358189e-06, "loss": 0.453, "step": 1047 }, { "epoch": 0.8821548821548821, "grad_norm": 0.5914981365203857, "learning_rate": 9.979460865442265e-06, "loss": 0.4477, "step": 1048 }, { "epoch": 0.882996632996633, "grad_norm": 0.44840848445892334, "learning_rate": 9.97927037986855e-06, "loss": 0.4649, "step": 1049 }, { "epoch": 0.8838383838383839, "grad_norm": 0.5157731175422668, "learning_rate": 9.979079016894306e-06, "loss": 0.4557, "step": 1050 }, { "epoch": 0.8846801346801347, "grad_norm": 0.5102500915527344, "learning_rate": 9.978886776553255e-06, "loss": 0.4622, "step": 1051 }, { "epoch": 0.8855218855218855, "grad_norm": 0.5278126001358032, "learning_rate": 9.978693658879275e-06, "loss": 0.4516, "step": 1052 }, { "epoch": 0.8863636363636364, "grad_norm": 0.5881726145744324, "learning_rate": 9.97849966390639e-06, "loss": 0.4643, "step": 1053 }, { "epoch": 0.8872053872053872, "grad_norm": 0.5147579908370972, "learning_rate": 9.978304791668787e-06, "loss": 0.5035, "step": 1054 }, { "epoch": 0.8880471380471381, "grad_norm": 0.6870089769363403, "learning_rate": 9.978109042200802e-06, "loss": 0.4717, "step": 1055 }, { "epoch": 0.8888888888888888, "grad_norm": 0.6343601942062378, "learning_rate": 9.977912415536933e-06, "loss": 0.4941, "step": 1056 }, { "epoch": 0.8897306397306397, "grad_norm": 0.5220882892608643, "learning_rate": 9.977714911711822e-06, "loss": 0.4385, "step": 1057 }, { "epoch": 0.8905723905723906, "grad_norm": 0.524225652217865, "learning_rate": 9.977516530760277e-06, "loss": 0.4536, "step": 1058 }, { "epoch": 0.8914141414141414, "grad_norm": 0.6802743077278137, "learning_rate": 9.977317272717248e-06, "loss": 0.47, "step": 1059 }, { "epoch": 0.8922558922558923, "grad_norm": 0.5907453298568726, "learning_rate": 9.977117137617852e-06, "loss": 0.4773, "step": 1060 }, { "epoch": 0.8930976430976431, "grad_norm": 0.5242749452590942, "learning_rate": 9.976916125497355e-06, "loss": 0.468, "step": 1061 }, { "epoch": 0.8939393939393939, "grad_norm": 0.5584849715232849, "learning_rate": 9.976714236391173e-06, "loss": 0.4601, "step": 1062 }, { "epoch": 0.8947811447811448, "grad_norm": 0.49232685565948486, "learning_rate": 9.976511470334887e-06, "loss": 0.4522, "step": 1063 }, { "epoch": 0.8956228956228957, "grad_norm": 0.5397582054138184, "learning_rate": 9.976307827364221e-06, "loss": 0.4423, "step": 1064 }, { "epoch": 0.8964646464646465, "grad_norm": 0.641350269317627, "learning_rate": 9.976103307515062e-06, "loss": 0.4638, "step": 1065 }, { "epoch": 0.8973063973063973, "grad_norm": 0.6371569037437439, "learning_rate": 9.975897910823446e-06, "loss": 0.4487, "step": 1066 }, { "epoch": 0.8981481481481481, "grad_norm": 0.45133692026138306, "learning_rate": 9.97569163732557e-06, "loss": 0.4328, "step": 1067 }, { "epoch": 0.898989898989899, "grad_norm": 0.7229287624359131, "learning_rate": 9.975484487057781e-06, "loss": 0.4509, "step": 1068 }, { "epoch": 0.8998316498316499, "grad_norm": 0.5302020311355591, "learning_rate": 9.975276460056577e-06, "loss": 0.4664, "step": 1069 }, { "epoch": 0.9006734006734006, "grad_norm": 0.5741941332817078, "learning_rate": 9.975067556358618e-06, "loss": 0.4722, "step": 1070 }, { "epoch": 0.9015151515151515, "grad_norm": 0.514556884765625, "learning_rate": 9.974857776000714e-06, "loss": 0.4454, "step": 1071 }, { "epoch": 0.9023569023569024, "grad_norm": 0.6114805340766907, "learning_rate": 9.97464711901983e-06, "loss": 0.4664, "step": 1072 }, { "epoch": 0.9031986531986532, "grad_norm": 0.5386016368865967, "learning_rate": 9.974435585453088e-06, "loss": 0.4638, "step": 1073 }, { "epoch": 0.9040404040404041, "grad_norm": 0.5559826493263245, "learning_rate": 9.974223175337761e-06, "loss": 0.4944, "step": 1074 }, { "epoch": 0.9048821548821548, "grad_norm": 0.5518078207969666, "learning_rate": 9.974009888711277e-06, "loss": 0.4637, "step": 1075 }, { "epoch": 0.9057239057239057, "grad_norm": 0.6028203964233398, "learning_rate": 9.973795725611224e-06, "loss": 0.455, "step": 1076 }, { "epoch": 0.9065656565656566, "grad_norm": 0.6005215048789978, "learning_rate": 9.973580686075334e-06, "loss": 0.4575, "step": 1077 }, { "epoch": 0.9074074074074074, "grad_norm": 0.5274342894554138, "learning_rate": 9.973364770141502e-06, "loss": 0.4583, "step": 1078 }, { "epoch": 0.9082491582491582, "grad_norm": 0.5973111987113953, "learning_rate": 9.973147977847775e-06, "loss": 0.4524, "step": 1079 }, { "epoch": 0.9090909090909091, "grad_norm": 0.6103183031082153, "learning_rate": 9.972930309232352e-06, "loss": 0.4766, "step": 1080 }, { "epoch": 0.9099326599326599, "grad_norm": 0.5308524370193481, "learning_rate": 9.97271176433359e-06, "loss": 0.4471, "step": 1081 }, { "epoch": 0.9107744107744108, "grad_norm": 0.5152952075004578, "learning_rate": 9.972492343190001e-06, "loss": 0.4193, "step": 1082 }, { "epoch": 0.9116161616161617, "grad_norm": 0.5764251947402954, "learning_rate": 9.972272045840245e-06, "loss": 0.4693, "step": 1083 }, { "epoch": 0.9124579124579124, "grad_norm": 0.6252139806747437, "learning_rate": 9.972050872323144e-06, "loss": 0.4591, "step": 1084 }, { "epoch": 0.9132996632996633, "grad_norm": 0.5684804320335388, "learning_rate": 9.971828822677672e-06, "loss": 0.4746, "step": 1085 }, { "epoch": 0.9141414141414141, "grad_norm": 0.570947527885437, "learning_rate": 9.971605896942952e-06, "loss": 0.4493, "step": 1086 }, { "epoch": 0.914983164983165, "grad_norm": 0.552958071231842, "learning_rate": 9.971382095158268e-06, "loss": 0.4391, "step": 1087 }, { "epoch": 0.9158249158249159, "grad_norm": 0.5873636603355408, "learning_rate": 9.97115741736306e-06, "loss": 0.4571, "step": 1088 }, { "epoch": 0.9166666666666666, "grad_norm": 0.516365647315979, "learning_rate": 9.970931863596913e-06, "loss": 0.4478, "step": 1089 }, { "epoch": 0.9175084175084175, "grad_norm": 0.5450437068939209, "learning_rate": 9.970705433899576e-06, "loss": 0.462, "step": 1090 }, { "epoch": 0.9183501683501684, "grad_norm": 0.5606904029846191, "learning_rate": 9.970478128310945e-06, "loss": 0.4636, "step": 1091 }, { "epoch": 0.9191919191919192, "grad_norm": 0.5247128009796143, "learning_rate": 9.970249946871078e-06, "loss": 0.4746, "step": 1092 }, { "epoch": 0.92003367003367, "grad_norm": 0.6075232625007629, "learning_rate": 9.970020889620177e-06, "loss": 0.488, "step": 1093 }, { "epoch": 0.9208754208754208, "grad_norm": 0.6107762455940247, "learning_rate": 9.96979095659861e-06, "loss": 0.4702, "step": 1094 }, { "epoch": 0.9217171717171717, "grad_norm": 0.5157359838485718, "learning_rate": 9.969560147846891e-06, "loss": 0.4694, "step": 1095 }, { "epoch": 0.9225589225589226, "grad_norm": 0.585603654384613, "learning_rate": 9.969328463405692e-06, "loss": 0.4759, "step": 1096 }, { "epoch": 0.9234006734006734, "grad_norm": 0.5437265634536743, "learning_rate": 9.969095903315837e-06, "loss": 0.4618, "step": 1097 }, { "epoch": 0.9242424242424242, "grad_norm": 0.5551795363426208, "learning_rate": 9.968862467618306e-06, "loss": 0.45, "step": 1098 }, { "epoch": 0.9250841750841751, "grad_norm": 0.48833027482032776, "learning_rate": 9.968628156354235e-06, "loss": 0.4652, "step": 1099 }, { "epoch": 0.9259259259259259, "grad_norm": 0.47221142053604126, "learning_rate": 9.968392969564907e-06, "loss": 0.4537, "step": 1100 }, { "epoch": 0.9267676767676768, "grad_norm": 0.5508369207382202, "learning_rate": 9.96815690729177e-06, "loss": 0.4479, "step": 1101 }, { "epoch": 0.9276094276094277, "grad_norm": 0.5142565965652466, "learning_rate": 9.967919969576416e-06, "loss": 0.4723, "step": 1102 }, { "epoch": 0.9284511784511784, "grad_norm": 0.4732082188129425, "learning_rate": 9.9676821564606e-06, "loss": 0.4274, "step": 1103 }, { "epoch": 0.9292929292929293, "grad_norm": 0.5427067875862122, "learning_rate": 9.967443467986225e-06, "loss": 0.4578, "step": 1104 }, { "epoch": 0.9301346801346801, "grad_norm": 0.470114141702652, "learning_rate": 9.967203904195352e-06, "loss": 0.4403, "step": 1105 }, { "epoch": 0.930976430976431, "grad_norm": 0.49383819103240967, "learning_rate": 9.966963465130193e-06, "loss": 0.4721, "step": 1106 }, { "epoch": 0.9318181818181818, "grad_norm": 0.576120138168335, "learning_rate": 9.966722150833116e-06, "loss": 0.4628, "step": 1107 }, { "epoch": 0.9326599326599326, "grad_norm": 0.4744495153427124, "learning_rate": 9.966479961346645e-06, "loss": 0.4628, "step": 1108 }, { "epoch": 0.9335016835016835, "grad_norm": 0.4723640978336334, "learning_rate": 9.966236896713452e-06, "loss": 0.4346, "step": 1109 }, { "epoch": 0.9343434343434344, "grad_norm": 0.6256804466247559, "learning_rate": 9.965992956976374e-06, "loss": 0.4783, "step": 1110 }, { "epoch": 0.9351851851851852, "grad_norm": 0.572952151298523, "learning_rate": 9.965748142178392e-06, "loss": 0.4913, "step": 1111 }, { "epoch": 0.936026936026936, "grad_norm": 0.4642125070095062, "learning_rate": 9.965502452362643e-06, "loss": 0.4526, "step": 1112 }, { "epoch": 0.9368686868686869, "grad_norm": 0.6617243885993958, "learning_rate": 9.965255887572425e-06, "loss": 0.4523, "step": 1113 }, { "epoch": 0.9377104377104377, "grad_norm": 0.5603154897689819, "learning_rate": 9.965008447851182e-06, "loss": 0.4402, "step": 1114 }, { "epoch": 0.9385521885521886, "grad_norm": 0.5379165410995483, "learning_rate": 9.964760133242517e-06, "loss": 0.4618, "step": 1115 }, { "epoch": 0.9393939393939394, "grad_norm": 0.7032405734062195, "learning_rate": 9.964510943790185e-06, "loss": 0.4804, "step": 1116 }, { "epoch": 0.9402356902356902, "grad_norm": 0.46072542667388916, "learning_rate": 9.964260879538095e-06, "loss": 0.485, "step": 1117 }, { "epoch": 0.9410774410774411, "grad_norm": 0.5280246138572693, "learning_rate": 9.964009940530312e-06, "loss": 0.4308, "step": 1118 }, { "epoch": 0.9419191919191919, "grad_norm": 0.5976089239120483, "learning_rate": 9.963758126811054e-06, "loss": 0.4528, "step": 1119 }, { "epoch": 0.9427609427609428, "grad_norm": 0.5982502698898315, "learning_rate": 9.963505438424695e-06, "loss": 0.4621, "step": 1120 }, { "epoch": 0.9436026936026936, "grad_norm": 0.5087417364120483, "learning_rate": 9.963251875415758e-06, "loss": 0.4545, "step": 1121 }, { "epoch": 0.9444444444444444, "grad_norm": 0.6353872418403625, "learning_rate": 9.962997437828926e-06, "loss": 0.4576, "step": 1122 }, { "epoch": 0.9452861952861953, "grad_norm": 0.5540047287940979, "learning_rate": 9.962742125709032e-06, "loss": 0.4737, "step": 1123 }, { "epoch": 0.9461279461279462, "grad_norm": 0.5864660739898682, "learning_rate": 9.962485939101067e-06, "loss": 0.4424, "step": 1124 }, { "epoch": 0.946969696969697, "grad_norm": 0.5705096125602722, "learning_rate": 9.962228878050171e-06, "loss": 0.4593, "step": 1125 }, { "epoch": 0.9478114478114478, "grad_norm": 0.5828489661216736, "learning_rate": 9.961970942601644e-06, "loss": 0.4686, "step": 1126 }, { "epoch": 0.9486531986531986, "grad_norm": 0.46782949566841125, "learning_rate": 9.961712132800933e-06, "loss": 0.457, "step": 1127 }, { "epoch": 0.9494949494949495, "grad_norm": 0.5465356111526489, "learning_rate": 9.961452448693647e-06, "loss": 0.4593, "step": 1128 }, { "epoch": 0.9503367003367004, "grad_norm": 0.4910604655742645, "learning_rate": 9.961191890325544e-06, "loss": 0.4604, "step": 1129 }, { "epoch": 0.9511784511784511, "grad_norm": 0.5168522596359253, "learning_rate": 9.960930457742534e-06, "loss": 0.4265, "step": 1130 }, { "epoch": 0.952020202020202, "grad_norm": 0.4900423586368561, "learning_rate": 9.960668150990689e-06, "loss": 0.4667, "step": 1131 }, { "epoch": 0.9528619528619529, "grad_norm": 0.5333201289176941, "learning_rate": 9.960404970116228e-06, "loss": 0.4747, "step": 1132 }, { "epoch": 0.9537037037037037, "grad_norm": 0.5217191576957703, "learning_rate": 9.960140915165526e-06, "loss": 0.422, "step": 1133 }, { "epoch": 0.9545454545454546, "grad_norm": 0.5069761872291565, "learning_rate": 9.959875986185111e-06, "loss": 0.4376, "step": 1134 }, { "epoch": 0.9553872053872053, "grad_norm": 0.5368914008140564, "learning_rate": 9.959610183221669e-06, "loss": 0.4408, "step": 1135 }, { "epoch": 0.9562289562289562, "grad_norm": 0.5074960589408875, "learning_rate": 9.959343506322037e-06, "loss": 0.4701, "step": 1136 }, { "epoch": 0.9570707070707071, "grad_norm": 0.6045913100242615, "learning_rate": 9.959075955533203e-06, "loss": 0.4677, "step": 1137 }, { "epoch": 0.9579124579124579, "grad_norm": 0.5101625919342041, "learning_rate": 9.958807530902319e-06, "loss": 0.4565, "step": 1138 }, { "epoch": 0.9587542087542088, "grad_norm": 0.5528519153594971, "learning_rate": 9.958538232476678e-06, "loss": 0.467, "step": 1139 }, { "epoch": 0.9595959595959596, "grad_norm": 0.5510973334312439, "learning_rate": 9.958268060303735e-06, "loss": 0.4918, "step": 1140 }, { "epoch": 0.9604377104377104, "grad_norm": 0.5565927028656006, "learning_rate": 9.957997014431097e-06, "loss": 0.4271, "step": 1141 }, { "epoch": 0.9612794612794613, "grad_norm": 0.4772287905216217, "learning_rate": 9.957725094906528e-06, "loss": 0.4388, "step": 1142 }, { "epoch": 0.9621212121212122, "grad_norm": 0.5017109513282776, "learning_rate": 9.957452301777938e-06, "loss": 0.4356, "step": 1143 }, { "epoch": 0.9629629629629629, "grad_norm": 0.4971100986003876, "learning_rate": 9.957178635093401e-06, "loss": 0.4646, "step": 1144 }, { "epoch": 0.9638047138047138, "grad_norm": 0.5111830830574036, "learning_rate": 9.956904094901136e-06, "loss": 0.4531, "step": 1145 }, { "epoch": 0.9646464646464646, "grad_norm": 0.5406001806259155, "learning_rate": 9.956628681249523e-06, "loss": 0.4348, "step": 1146 }, { "epoch": 0.9654882154882155, "grad_norm": 0.5654174089431763, "learning_rate": 9.956352394187093e-06, "loss": 0.442, "step": 1147 }, { "epoch": 0.9663299663299664, "grad_norm": 0.4787231981754303, "learning_rate": 9.956075233762528e-06, "loss": 0.4508, "step": 1148 }, { "epoch": 0.9671717171717171, "grad_norm": 0.46309447288513184, "learning_rate": 9.955797200024668e-06, "loss": 0.4642, "step": 1149 }, { "epoch": 0.968013468013468, "grad_norm": 0.46454259753227234, "learning_rate": 9.955518293022506e-06, "loss": 0.4529, "step": 1150 }, { "epoch": 0.9688552188552189, "grad_norm": 0.5763834714889526, "learning_rate": 9.955238512805186e-06, "loss": 0.4667, "step": 1151 }, { "epoch": 0.9696969696969697, "grad_norm": 0.5173787474632263, "learning_rate": 9.954957859422011e-06, "loss": 0.4224, "step": 1152 }, { "epoch": 0.9705387205387206, "grad_norm": 0.5354113578796387, "learning_rate": 9.954676332922437e-06, "loss": 0.4432, "step": 1153 }, { "epoch": 0.9713804713804713, "grad_norm": 0.5218011736869812, "learning_rate": 9.954393933356065e-06, "loss": 0.4484, "step": 1154 }, { "epoch": 0.9722222222222222, "grad_norm": 0.48890507221221924, "learning_rate": 9.954110660772665e-06, "loss": 0.4231, "step": 1155 }, { "epoch": 0.9730639730639731, "grad_norm": 0.5993843674659729, "learning_rate": 9.953826515222146e-06, "loss": 0.468, "step": 1156 }, { "epoch": 0.9739057239057239, "grad_norm": 0.4913887679576874, "learning_rate": 9.953541496754582e-06, "loss": 0.4734, "step": 1157 }, { "epoch": 0.9747474747474747, "grad_norm": 0.5866419672966003, "learning_rate": 9.953255605420194e-06, "loss": 0.4668, "step": 1158 }, { "epoch": 0.9755892255892256, "grad_norm": 0.5126329064369202, "learning_rate": 9.95296884126936e-06, "loss": 0.4658, "step": 1159 }, { "epoch": 0.9764309764309764, "grad_norm": 0.5449435114860535, "learning_rate": 9.952681204352608e-06, "loss": 0.464, "step": 1160 }, { "epoch": 0.9772727272727273, "grad_norm": 0.46687206625938416, "learning_rate": 9.952392694720627e-06, "loss": 0.4618, "step": 1161 }, { "epoch": 0.9781144781144782, "grad_norm": 0.5053809881210327, "learning_rate": 9.952103312424254e-06, "loss": 0.4565, "step": 1162 }, { "epoch": 0.9789562289562289, "grad_norm": 0.593429684638977, "learning_rate": 9.95181305751448e-06, "loss": 0.4802, "step": 1163 }, { "epoch": 0.9797979797979798, "grad_norm": 0.5483212471008301, "learning_rate": 9.951521930042453e-06, "loss": 0.4472, "step": 1164 }, { "epoch": 0.9806397306397306, "grad_norm": 0.5046491026878357, "learning_rate": 9.95122993005947e-06, "loss": 0.4453, "step": 1165 }, { "epoch": 0.9814814814814815, "grad_norm": 0.46300235390663147, "learning_rate": 9.950937057616988e-06, "loss": 0.4506, "step": 1166 }, { "epoch": 0.9823232323232324, "grad_norm": 0.5206074118614197, "learning_rate": 9.950643312766611e-06, "loss": 0.4571, "step": 1167 }, { "epoch": 0.9831649831649831, "grad_norm": 0.5327122211456299, "learning_rate": 9.950348695560102e-06, "loss": 0.4463, "step": 1168 }, { "epoch": 0.984006734006734, "grad_norm": 0.5258532166481018, "learning_rate": 9.950053206049375e-06, "loss": 0.4451, "step": 1169 }, { "epoch": 0.9848484848484849, "grad_norm": 0.6278533339500427, "learning_rate": 9.9497568442865e-06, "loss": 0.4597, "step": 1170 }, { "epoch": 0.9856902356902357, "grad_norm": 0.5809938311576843, "learning_rate": 9.949459610323694e-06, "loss": 0.4719, "step": 1171 }, { "epoch": 0.9865319865319865, "grad_norm": 0.653361976146698, "learning_rate": 9.94916150421334e-06, "loss": 0.4573, "step": 1172 }, { "epoch": 0.9873737373737373, "grad_norm": 0.5274221301078796, "learning_rate": 9.94886252600796e-06, "loss": 0.4643, "step": 1173 }, { "epoch": 0.9882154882154882, "grad_norm": 0.5604749917984009, "learning_rate": 9.948562675760245e-06, "loss": 0.4676, "step": 1174 }, { "epoch": 0.9890572390572391, "grad_norm": 0.5695879459381104, "learning_rate": 9.948261953523026e-06, "loss": 0.4458, "step": 1175 }, { "epoch": 0.98989898989899, "grad_norm": 0.4932173788547516, "learning_rate": 9.947960359349295e-06, "loss": 0.4475, "step": 1176 }, { "epoch": 0.9907407407407407, "grad_norm": 0.5727276802062988, "learning_rate": 9.947657893292197e-06, "loss": 0.4501, "step": 1177 }, { "epoch": 0.9915824915824916, "grad_norm": 0.4858740568161011, "learning_rate": 9.94735455540503e-06, "loss": 0.459, "step": 1178 }, { "epoch": 0.9924242424242424, "grad_norm": 0.5298016667366028, "learning_rate": 9.947050345741243e-06, "loss": 0.4617, "step": 1179 }, { "epoch": 0.9932659932659933, "grad_norm": 0.4865809381008148, "learning_rate": 9.946745264354443e-06, "loss": 0.4579, "step": 1180 }, { "epoch": 0.9941077441077442, "grad_norm": 0.48931026458740234, "learning_rate": 9.946439311298388e-06, "loss": 0.4449, "step": 1181 }, { "epoch": 0.9949494949494949, "grad_norm": 0.4801371991634369, "learning_rate": 9.94613248662699e-06, "loss": 0.4238, "step": 1182 }, { "epoch": 0.9957912457912458, "grad_norm": 0.5503662824630737, "learning_rate": 9.945824790394317e-06, "loss": 0.4696, "step": 1183 }, { "epoch": 0.9966329966329966, "grad_norm": 0.4781520664691925, "learning_rate": 9.945516222654582e-06, "loss": 0.47, "step": 1184 }, { "epoch": 0.9974747474747475, "grad_norm": 0.4490736126899719, "learning_rate": 9.945206783462166e-06, "loss": 0.4507, "step": 1185 }, { "epoch": 0.9983164983164983, "grad_norm": 0.5199840664863586, "learning_rate": 9.944896472871592e-06, "loss": 0.4584, "step": 1186 }, { "epoch": 0.9991582491582491, "grad_norm": 0.5248775482177734, "learning_rate": 9.944585290937539e-06, "loss": 0.4489, "step": 1187 }, { "epoch": 1.0, "grad_norm": 0.4194183945655823, "learning_rate": 9.944273237714842e-06, "loss": 0.412, "step": 1188 }, { "epoch": 1.0008417508417509, "grad_norm": 0.5709053874015808, "learning_rate": 9.943960313258485e-06, "loss": 0.451, "step": 1189 }, { "epoch": 1.0016835016835017, "grad_norm": 0.471882700920105, "learning_rate": 9.943646517623613e-06, "loss": 0.4058, "step": 1190 }, { "epoch": 1.0025252525252526, "grad_norm": 0.5485161542892456, "learning_rate": 9.943331850865519e-06, "loss": 0.4147, "step": 1191 }, { "epoch": 1.0033670033670035, "grad_norm": 0.481679767370224, "learning_rate": 9.943016313039648e-06, "loss": 0.4302, "step": 1192 }, { "epoch": 1.004208754208754, "grad_norm": 0.5298817157745361, "learning_rate": 9.942699904201604e-06, "loss": 0.4299, "step": 1193 }, { "epoch": 1.005050505050505, "grad_norm": 0.4842771291732788, "learning_rate": 9.942382624407141e-06, "loss": 0.416, "step": 1194 }, { "epoch": 1.0058922558922558, "grad_norm": 0.5785439014434814, "learning_rate": 9.942064473712167e-06, "loss": 0.4489, "step": 1195 }, { "epoch": 1.0067340067340067, "grad_norm": 0.5274850130081177, "learning_rate": 9.941745452172743e-06, "loss": 0.4151, "step": 1196 }, { "epoch": 1.0075757575757576, "grad_norm": 0.5226646661758423, "learning_rate": 9.941425559845084e-06, "loss": 0.4213, "step": 1197 }, { "epoch": 1.0084175084175084, "grad_norm": 0.5370646119117737, "learning_rate": 9.941104796785558e-06, "loss": 0.422, "step": 1198 }, { "epoch": 1.0092592592592593, "grad_norm": 0.4578465521335602, "learning_rate": 9.94078316305069e-06, "loss": 0.3966, "step": 1199 }, { "epoch": 1.0101010101010102, "grad_norm": 0.4733089506626129, "learning_rate": 9.940460658697151e-06, "loss": 0.4135, "step": 1200 }, { "epoch": 1.010942760942761, "grad_norm": 0.5385671854019165, "learning_rate": 9.940137283781773e-06, "loss": 0.4336, "step": 1201 }, { "epoch": 1.0117845117845117, "grad_norm": 0.4576013684272766, "learning_rate": 9.939813038361536e-06, "loss": 0.4345, "step": 1202 }, { "epoch": 1.0126262626262625, "grad_norm": 0.5044677257537842, "learning_rate": 9.939487922493576e-06, "loss": 0.4308, "step": 1203 }, { "epoch": 1.0134680134680134, "grad_norm": 0.5184712409973145, "learning_rate": 9.939161936235183e-06, "loss": 0.4285, "step": 1204 }, { "epoch": 1.0143097643097643, "grad_norm": 0.4753231406211853, "learning_rate": 9.938835079643795e-06, "loss": 0.4086, "step": 1205 }, { "epoch": 1.0151515151515151, "grad_norm": 0.46323075890541077, "learning_rate": 9.938507352777014e-06, "loss": 0.4034, "step": 1206 }, { "epoch": 1.015993265993266, "grad_norm": 0.49842920899391174, "learning_rate": 9.938178755692584e-06, "loss": 0.4051, "step": 1207 }, { "epoch": 1.0168350168350169, "grad_norm": 0.49115240573883057, "learning_rate": 9.93784928844841e-06, "loss": 0.4093, "step": 1208 }, { "epoch": 1.0176767676767677, "grad_norm": 0.4832595884799957, "learning_rate": 9.93751895110255e-06, "loss": 0.4181, "step": 1209 }, { "epoch": 1.0185185185185186, "grad_norm": 0.486695259809494, "learning_rate": 9.937187743713206e-06, "loss": 0.4045, "step": 1210 }, { "epoch": 1.0193602693602695, "grad_norm": 0.4662696421146393, "learning_rate": 9.936855666338746e-06, "loss": 0.4163, "step": 1211 }, { "epoch": 1.02020202020202, "grad_norm": 0.48934370279312134, "learning_rate": 9.936522719037684e-06, "loss": 0.4308, "step": 1212 }, { "epoch": 1.021043771043771, "grad_norm": 0.5252069234848022, "learning_rate": 9.936188901868687e-06, "loss": 0.4204, "step": 1213 }, { "epoch": 1.0218855218855218, "grad_norm": 0.515494704246521, "learning_rate": 9.935854214890578e-06, "loss": 0.4242, "step": 1214 }, { "epoch": 1.0227272727272727, "grad_norm": 0.5193724036216736, "learning_rate": 9.935518658162336e-06, "loss": 0.4435, "step": 1215 }, { "epoch": 1.0235690235690236, "grad_norm": 0.48461031913757324, "learning_rate": 9.935182231743085e-06, "loss": 0.4112, "step": 1216 }, { "epoch": 1.0244107744107744, "grad_norm": 0.4808846712112427, "learning_rate": 9.93484493569211e-06, "loss": 0.4359, "step": 1217 }, { "epoch": 1.0252525252525253, "grad_norm": 0.5503225326538086, "learning_rate": 9.934506770068843e-06, "loss": 0.424, "step": 1218 }, { "epoch": 1.0260942760942762, "grad_norm": 0.5129328370094299, "learning_rate": 9.934167734932875e-06, "loss": 0.416, "step": 1219 }, { "epoch": 1.026936026936027, "grad_norm": 0.5058778524398804, "learning_rate": 9.933827830343947e-06, "loss": 0.4263, "step": 1220 }, { "epoch": 1.0277777777777777, "grad_norm": 0.5316030979156494, "learning_rate": 9.933487056361954e-06, "loss": 0.4236, "step": 1221 }, { "epoch": 1.0286195286195285, "grad_norm": 0.5263699293136597, "learning_rate": 9.933145413046943e-06, "loss": 0.4257, "step": 1222 }, { "epoch": 1.0294612794612794, "grad_norm": 0.5296460390090942, "learning_rate": 9.932802900459115e-06, "loss": 0.4246, "step": 1223 }, { "epoch": 1.0303030303030303, "grad_norm": 0.5391560196876526, "learning_rate": 9.932459518658827e-06, "loss": 0.4258, "step": 1224 }, { "epoch": 1.0311447811447811, "grad_norm": 0.5729191303253174, "learning_rate": 9.932115267706583e-06, "loss": 0.4172, "step": 1225 }, { "epoch": 1.031986531986532, "grad_norm": 0.4728996753692627, "learning_rate": 9.931770147663049e-06, "loss": 0.4446, "step": 1226 }, { "epoch": 1.0328282828282829, "grad_norm": 0.5607907772064209, "learning_rate": 9.931424158589032e-06, "loss": 0.4144, "step": 1227 }, { "epoch": 1.0336700336700337, "grad_norm": 0.4904206693172455, "learning_rate": 9.931077300545504e-06, "loss": 0.4323, "step": 1228 }, { "epoch": 1.0345117845117846, "grad_norm": 0.4828205406665802, "learning_rate": 9.930729573593582e-06, "loss": 0.4077, "step": 1229 }, { "epoch": 1.0353535353535352, "grad_norm": 0.4627038538455963, "learning_rate": 9.93038097779454e-06, "loss": 0.4469, "step": 1230 }, { "epoch": 1.0361952861952861, "grad_norm": 0.5474883317947388, "learning_rate": 9.930031513209807e-06, "loss": 0.4123, "step": 1231 }, { "epoch": 1.037037037037037, "grad_norm": 0.4556296169757843, "learning_rate": 9.929681179900959e-06, "loss": 0.426, "step": 1232 }, { "epoch": 1.0378787878787878, "grad_norm": 0.48275136947631836, "learning_rate": 9.929329977929729e-06, "loss": 0.4328, "step": 1233 }, { "epoch": 1.0387205387205387, "grad_norm": 0.5282633900642395, "learning_rate": 9.928977907358005e-06, "loss": 0.4527, "step": 1234 }, { "epoch": 1.0395622895622896, "grad_norm": 0.4788952171802521, "learning_rate": 9.928624968247823e-06, "loss": 0.4213, "step": 1235 }, { "epoch": 1.0404040404040404, "grad_norm": 0.4718884527683258, "learning_rate": 9.928271160661376e-06, "loss": 0.4229, "step": 1236 }, { "epoch": 1.0412457912457913, "grad_norm": 0.5381907820701599, "learning_rate": 9.927916484661006e-06, "loss": 0.4287, "step": 1237 }, { "epoch": 1.0420875420875422, "grad_norm": 0.48357319831848145, "learning_rate": 9.927560940309215e-06, "loss": 0.4093, "step": 1238 }, { "epoch": 1.0429292929292928, "grad_norm": 0.4890466332435608, "learning_rate": 9.92720452766865e-06, "loss": 0.4191, "step": 1239 }, { "epoch": 1.0437710437710437, "grad_norm": 0.475445955991745, "learning_rate": 9.926847246802117e-06, "loss": 0.4305, "step": 1240 }, { "epoch": 1.0446127946127945, "grad_norm": 0.467986524105072, "learning_rate": 9.926489097772573e-06, "loss": 0.4305, "step": 1241 }, { "epoch": 1.0454545454545454, "grad_norm": 0.5641269087791443, "learning_rate": 9.926130080643125e-06, "loss": 0.4016, "step": 1242 }, { "epoch": 1.0462962962962963, "grad_norm": 0.4691565930843353, "learning_rate": 9.925770195477037e-06, "loss": 0.4072, "step": 1243 }, { "epoch": 1.0471380471380471, "grad_norm": 0.6080915331840515, "learning_rate": 9.925409442337726e-06, "loss": 0.4136, "step": 1244 }, { "epoch": 1.047979797979798, "grad_norm": 0.527574896812439, "learning_rate": 9.92504782128876e-06, "loss": 0.4322, "step": 1245 }, { "epoch": 1.0488215488215489, "grad_norm": 0.5466039776802063, "learning_rate": 9.924685332393859e-06, "loss": 0.4185, "step": 1246 }, { "epoch": 1.0496632996632997, "grad_norm": 0.49858346581459045, "learning_rate": 9.924321975716898e-06, "loss": 0.4355, "step": 1247 }, { "epoch": 1.0505050505050506, "grad_norm": 0.47747135162353516, "learning_rate": 9.923957751321905e-06, "loss": 0.4363, "step": 1248 }, { "epoch": 1.0513468013468013, "grad_norm": 0.516966700553894, "learning_rate": 9.923592659273059e-06, "loss": 0.4309, "step": 1249 }, { "epoch": 1.0521885521885521, "grad_norm": 0.4727741777896881, "learning_rate": 9.923226699634694e-06, "loss": 0.4153, "step": 1250 }, { "epoch": 1.053030303030303, "grad_norm": 0.5328046679496765, "learning_rate": 9.922859872471297e-06, "loss": 0.4283, "step": 1251 }, { "epoch": 1.0538720538720538, "grad_norm": 0.5798715949058533, "learning_rate": 9.922492177847505e-06, "loss": 0.4367, "step": 1252 }, { "epoch": 1.0547138047138047, "grad_norm": 0.5153037905693054, "learning_rate": 9.92212361582811e-06, "loss": 0.4046, "step": 1253 }, { "epoch": 1.0555555555555556, "grad_norm": 0.5006885528564453, "learning_rate": 9.921754186478059e-06, "loss": 0.4111, "step": 1254 }, { "epoch": 1.0563973063973064, "grad_norm": 0.5723491311073303, "learning_rate": 9.921383889862446e-06, "loss": 0.4216, "step": 1255 }, { "epoch": 1.0572390572390573, "grad_norm": 0.44418737292289734, "learning_rate": 9.921012726046523e-06, "loss": 0.4266, "step": 1256 }, { "epoch": 1.0580808080808082, "grad_norm": 0.49394527077674866, "learning_rate": 9.920640695095693e-06, "loss": 0.4324, "step": 1257 }, { "epoch": 1.0589225589225588, "grad_norm": 0.5721599459648132, "learning_rate": 9.920267797075512e-06, "loss": 0.4301, "step": 1258 }, { "epoch": 1.0597643097643097, "grad_norm": 0.5001194477081299, "learning_rate": 9.919894032051686e-06, "loss": 0.4364, "step": 1259 }, { "epoch": 1.0606060606060606, "grad_norm": 0.645035982131958, "learning_rate": 9.919519400090079e-06, "loss": 0.4338, "step": 1260 }, { "epoch": 1.0614478114478114, "grad_norm": 0.5458484292030334, "learning_rate": 9.919143901256706e-06, "loss": 0.4105, "step": 1261 }, { "epoch": 1.0622895622895623, "grad_norm": 0.6201835870742798, "learning_rate": 9.918767535617732e-06, "loss": 0.4273, "step": 1262 }, { "epoch": 1.0631313131313131, "grad_norm": 0.5730094313621521, "learning_rate": 9.918390303239476e-06, "loss": 0.4114, "step": 1263 }, { "epoch": 1.063973063973064, "grad_norm": 0.5363729596138, "learning_rate": 9.918012204188414e-06, "loss": 0.4073, "step": 1264 }, { "epoch": 1.0648148148148149, "grad_norm": 0.6115604043006897, "learning_rate": 9.917633238531166e-06, "loss": 0.4195, "step": 1265 }, { "epoch": 1.0656565656565657, "grad_norm": 0.5178766846656799, "learning_rate": 9.917253406334514e-06, "loss": 0.4175, "step": 1266 }, { "epoch": 1.0664983164983166, "grad_norm": 0.5876023173332214, "learning_rate": 9.916872707665385e-06, "loss": 0.4219, "step": 1267 }, { "epoch": 1.0673400673400673, "grad_norm": 0.49975138902664185, "learning_rate": 9.916491142590867e-06, "loss": 0.4269, "step": 1268 }, { "epoch": 1.0681818181818181, "grad_norm": 0.5075017213821411, "learning_rate": 9.916108711178192e-06, "loss": 0.4274, "step": 1269 }, { "epoch": 1.069023569023569, "grad_norm": 0.5560829639434814, "learning_rate": 9.915725413494748e-06, "loss": 0.4204, "step": 1270 }, { "epoch": 1.0698653198653199, "grad_norm": 0.46173474192619324, "learning_rate": 9.915341249608079e-06, "loss": 0.4205, "step": 1271 }, { "epoch": 1.0707070707070707, "grad_norm": 0.5282843708992004, "learning_rate": 9.914956219585875e-06, "loss": 0.4073, "step": 1272 }, { "epoch": 1.0715488215488216, "grad_norm": 0.5165653228759766, "learning_rate": 9.914570323495987e-06, "loss": 0.4231, "step": 1273 }, { "epoch": 1.0723905723905724, "grad_norm": 0.5966348648071289, "learning_rate": 9.914183561406411e-06, "loss": 0.4413, "step": 1274 }, { "epoch": 1.0732323232323233, "grad_norm": 0.49233749508857727, "learning_rate": 9.9137959333853e-06, "loss": 0.4421, "step": 1275 }, { "epoch": 1.074074074074074, "grad_norm": 0.5013700723648071, "learning_rate": 9.913407439500956e-06, "loss": 0.4024, "step": 1276 }, { "epoch": 1.0749158249158248, "grad_norm": 0.5443810224533081, "learning_rate": 9.91301807982184e-06, "loss": 0.4166, "step": 1277 }, { "epoch": 1.0757575757575757, "grad_norm": 0.48527753353118896, "learning_rate": 9.912627854416555e-06, "loss": 0.4452, "step": 1278 }, { "epoch": 1.0765993265993266, "grad_norm": 0.523747980594635, "learning_rate": 9.912236763353868e-06, "loss": 0.4157, "step": 1279 }, { "epoch": 1.0774410774410774, "grad_norm": 0.5057489275932312, "learning_rate": 9.911844806702692e-06, "loss": 0.4198, "step": 1280 }, { "epoch": 1.0782828282828283, "grad_norm": 0.5064775347709656, "learning_rate": 9.911451984532093e-06, "loss": 0.4245, "step": 1281 }, { "epoch": 1.0791245791245792, "grad_norm": 0.5552942156791687, "learning_rate": 9.911058296911293e-06, "loss": 0.4109, "step": 1282 }, { "epoch": 1.07996632996633, "grad_norm": 0.5050599575042725, "learning_rate": 9.910663743909661e-06, "loss": 0.4124, "step": 1283 }, { "epoch": 1.0808080808080809, "grad_norm": 0.5435260534286499, "learning_rate": 9.910268325596722e-06, "loss": 0.4393, "step": 1284 }, { "epoch": 1.0816498316498318, "grad_norm": 0.5659509897232056, "learning_rate": 9.909872042042155e-06, "loss": 0.4369, "step": 1285 }, { "epoch": 1.0824915824915824, "grad_norm": 0.43839895725250244, "learning_rate": 9.909474893315787e-06, "loss": 0.4087, "step": 1286 }, { "epoch": 1.0833333333333333, "grad_norm": 0.5038987398147583, "learning_rate": 9.909076879487603e-06, "loss": 0.4069, "step": 1287 }, { "epoch": 1.0841750841750841, "grad_norm": 0.5482878684997559, "learning_rate": 9.908678000627731e-06, "loss": 0.4563, "step": 1288 }, { "epoch": 1.085016835016835, "grad_norm": 0.44506052136421204, "learning_rate": 9.908278256806466e-06, "loss": 0.412, "step": 1289 }, { "epoch": 1.0858585858585859, "grad_norm": 0.5357989072799683, "learning_rate": 9.90787764809424e-06, "loss": 0.4352, "step": 1290 }, { "epoch": 1.0867003367003367, "grad_norm": 0.4458696246147156, "learning_rate": 9.90747617456165e-06, "loss": 0.4253, "step": 1291 }, { "epoch": 1.0875420875420876, "grad_norm": 0.533385157585144, "learning_rate": 9.907073836279435e-06, "loss": 0.4295, "step": 1292 }, { "epoch": 1.0883838383838385, "grad_norm": 0.4535026550292969, "learning_rate": 9.906670633318494e-06, "loss": 0.4326, "step": 1293 }, { "epoch": 1.0892255892255893, "grad_norm": 0.4526161253452301, "learning_rate": 9.906266565749876e-06, "loss": 0.4154, "step": 1294 }, { "epoch": 1.09006734006734, "grad_norm": 0.5237308740615845, "learning_rate": 9.905861633644781e-06, "loss": 0.4069, "step": 1295 }, { "epoch": 1.0909090909090908, "grad_norm": 0.5000157952308655, "learning_rate": 9.905455837074563e-06, "loss": 0.4131, "step": 1296 }, { "epoch": 1.0917508417508417, "grad_norm": 0.5527651906013489, "learning_rate": 9.905049176110728e-06, "loss": 0.4175, "step": 1297 }, { "epoch": 1.0925925925925926, "grad_norm": 0.5550265908241272, "learning_rate": 9.904641650824933e-06, "loss": 0.4209, "step": 1298 }, { "epoch": 1.0934343434343434, "grad_norm": 0.5089467167854309, "learning_rate": 9.904233261288988e-06, "loss": 0.4317, "step": 1299 }, { "epoch": 1.0942760942760943, "grad_norm": 0.5454887747764587, "learning_rate": 9.903824007574855e-06, "loss": 0.4265, "step": 1300 }, { "epoch": 1.0951178451178452, "grad_norm": 0.4855159819126129, "learning_rate": 9.903413889754654e-06, "loss": 0.4299, "step": 1301 }, { "epoch": 1.095959595959596, "grad_norm": 0.5834365487098694, "learning_rate": 9.903002907900645e-06, "loss": 0.4053, "step": 1302 }, { "epoch": 1.0968013468013469, "grad_norm": 0.4869956970214844, "learning_rate": 9.902591062085251e-06, "loss": 0.4191, "step": 1303 }, { "epoch": 1.0976430976430978, "grad_norm": 0.5156760215759277, "learning_rate": 9.902178352381044e-06, "loss": 0.411, "step": 1304 }, { "epoch": 1.0984848484848484, "grad_norm": 0.5426946878433228, "learning_rate": 9.901764778860748e-06, "loss": 0.4229, "step": 1305 }, { "epoch": 1.0993265993265993, "grad_norm": 0.516101598739624, "learning_rate": 9.901350341597237e-06, "loss": 0.4156, "step": 1306 }, { "epoch": 1.1001683501683501, "grad_norm": 0.5514787435531616, "learning_rate": 9.90093504066354e-06, "loss": 0.4343, "step": 1307 }, { "epoch": 1.101010101010101, "grad_norm": 0.4914611876010895, "learning_rate": 9.900518876132839e-06, "loss": 0.4151, "step": 1308 }, { "epoch": 1.1018518518518519, "grad_norm": 0.5503596067428589, "learning_rate": 9.900101848078465e-06, "loss": 0.4337, "step": 1309 }, { "epoch": 1.1026936026936027, "grad_norm": 0.5795862674713135, "learning_rate": 9.899683956573904e-06, "loss": 0.4601, "step": 1310 }, { "epoch": 1.1035353535353536, "grad_norm": 0.46864330768585205, "learning_rate": 9.899265201692791e-06, "loss": 0.4189, "step": 1311 }, { "epoch": 1.1043771043771045, "grad_norm": 0.5339556932449341, "learning_rate": 9.898845583508918e-06, "loss": 0.4108, "step": 1312 }, { "epoch": 1.1052188552188553, "grad_norm": 0.5048878192901611, "learning_rate": 9.898425102096224e-06, "loss": 0.4507, "step": 1313 }, { "epoch": 1.106060606060606, "grad_norm": 0.5182617902755737, "learning_rate": 9.898003757528801e-06, "loss": 0.4529, "step": 1314 }, { "epoch": 1.1069023569023568, "grad_norm": 0.5154816508293152, "learning_rate": 9.897581549880897e-06, "loss": 0.4326, "step": 1315 }, { "epoch": 1.1077441077441077, "grad_norm": 0.4493154287338257, "learning_rate": 9.897158479226909e-06, "loss": 0.3961, "step": 1316 }, { "epoch": 1.1085858585858586, "grad_norm": 0.4458385109901428, "learning_rate": 9.896734545641388e-06, "loss": 0.404, "step": 1317 }, { "epoch": 1.1094276094276094, "grad_norm": 0.45774132013320923, "learning_rate": 9.896309749199033e-06, "loss": 0.4056, "step": 1318 }, { "epoch": 1.1102693602693603, "grad_norm": 0.4885023832321167, "learning_rate": 9.895884089974697e-06, "loss": 0.4129, "step": 1319 }, { "epoch": 1.1111111111111112, "grad_norm": 0.4809379279613495, "learning_rate": 9.895457568043389e-06, "loss": 0.4198, "step": 1320 }, { "epoch": 1.111952861952862, "grad_norm": 0.5605499148368835, "learning_rate": 9.895030183480263e-06, "loss": 0.4123, "step": 1321 }, { "epoch": 1.112794612794613, "grad_norm": 0.5162538290023804, "learning_rate": 9.894601936360633e-06, "loss": 0.4239, "step": 1322 }, { "epoch": 1.1136363636363635, "grad_norm": 0.48748111724853516, "learning_rate": 9.894172826759958e-06, "loss": 0.422, "step": 1323 }, { "epoch": 1.1144781144781144, "grad_norm": 0.5135069489479065, "learning_rate": 9.89374285475385e-06, "loss": 0.4449, "step": 1324 }, { "epoch": 1.1153198653198653, "grad_norm": 0.5238416790962219, "learning_rate": 9.893312020418078e-06, "loss": 0.4304, "step": 1325 }, { "epoch": 1.1161616161616161, "grad_norm": 0.4704055190086365, "learning_rate": 9.89288032382856e-06, "loss": 0.429, "step": 1326 }, { "epoch": 1.117003367003367, "grad_norm": 0.47612106800079346, "learning_rate": 9.892447765061361e-06, "loss": 0.4091, "step": 1327 }, { "epoch": 1.1178451178451179, "grad_norm": 0.5191083550453186, "learning_rate": 9.892014344192707e-06, "loss": 0.4429, "step": 1328 }, { "epoch": 1.1186868686868687, "grad_norm": 0.5548783540725708, "learning_rate": 9.89158006129897e-06, "loss": 0.4218, "step": 1329 }, { "epoch": 1.1195286195286196, "grad_norm": 0.4822879433631897, "learning_rate": 9.891144916456674e-06, "loss": 0.4184, "step": 1330 }, { "epoch": 1.1203703703703705, "grad_norm": 0.45177459716796875, "learning_rate": 9.8907089097425e-06, "loss": 0.4266, "step": 1331 }, { "epoch": 1.121212121212121, "grad_norm": 0.5327690243721008, "learning_rate": 9.890272041233272e-06, "loss": 0.4235, "step": 1332 }, { "epoch": 1.122053872053872, "grad_norm": 0.5746333003044128, "learning_rate": 9.889834311005971e-06, "loss": 0.4267, "step": 1333 }, { "epoch": 1.1228956228956228, "grad_norm": 0.4509575664997101, "learning_rate": 9.889395719137733e-06, "loss": 0.4499, "step": 1334 }, { "epoch": 1.1237373737373737, "grad_norm": 0.5122528076171875, "learning_rate": 9.888956265705842e-06, "loss": 0.4334, "step": 1335 }, { "epoch": 1.1245791245791246, "grad_norm": 0.5264590978622437, "learning_rate": 9.888515950787735e-06, "loss": 0.4151, "step": 1336 }, { "epoch": 1.1254208754208754, "grad_norm": 0.5425626039505005, "learning_rate": 9.888074774460999e-06, "loss": 0.4281, "step": 1337 }, { "epoch": 1.1262626262626263, "grad_norm": 0.5245966911315918, "learning_rate": 9.887632736803372e-06, "loss": 0.426, "step": 1338 }, { "epoch": 1.1271043771043772, "grad_norm": 0.5560979247093201, "learning_rate": 9.887189837892748e-06, "loss": 0.4171, "step": 1339 }, { "epoch": 1.127946127946128, "grad_norm": 0.4916987717151642, "learning_rate": 9.88674607780717e-06, "loss": 0.4279, "step": 1340 }, { "epoch": 1.128787878787879, "grad_norm": 0.5198803544044495, "learning_rate": 9.886301456624833e-06, "loss": 0.4389, "step": 1341 }, { "epoch": 1.1296296296296295, "grad_norm": 0.5419111251831055, "learning_rate": 9.885855974424087e-06, "loss": 0.4421, "step": 1342 }, { "epoch": 1.1304713804713804, "grad_norm": 0.4650713801383972, "learning_rate": 9.885409631283427e-06, "loss": 0.4143, "step": 1343 }, { "epoch": 1.1313131313131313, "grad_norm": 0.506197452545166, "learning_rate": 9.884962427281503e-06, "loss": 0.4115, "step": 1344 }, { "epoch": 1.1321548821548821, "grad_norm": 0.43010032176971436, "learning_rate": 9.88451436249712e-06, "loss": 0.407, "step": 1345 }, { "epoch": 1.132996632996633, "grad_norm": 0.5606688261032104, "learning_rate": 9.88406543700923e-06, "loss": 0.4504, "step": 1346 }, { "epoch": 1.1338383838383839, "grad_norm": 0.4560110569000244, "learning_rate": 9.883615650896941e-06, "loss": 0.4362, "step": 1347 }, { "epoch": 1.1346801346801347, "grad_norm": 0.47593918442726135, "learning_rate": 9.883165004239505e-06, "loss": 0.4378, "step": 1348 }, { "epoch": 1.1355218855218856, "grad_norm": 0.5590736269950867, "learning_rate": 9.882713497116335e-06, "loss": 0.4348, "step": 1349 }, { "epoch": 1.1363636363636362, "grad_norm": 0.581150472164154, "learning_rate": 9.88226112960699e-06, "loss": 0.4255, "step": 1350 }, { "epoch": 1.137205387205387, "grad_norm": 0.5032079815864563, "learning_rate": 9.881807901791183e-06, "loss": 0.4458, "step": 1351 }, { "epoch": 1.138047138047138, "grad_norm": 0.4982477128505707, "learning_rate": 9.881353813748775e-06, "loss": 0.4209, "step": 1352 }, { "epoch": 1.1388888888888888, "grad_norm": 0.5615268349647522, "learning_rate": 9.880898865559784e-06, "loss": 0.4509, "step": 1353 }, { "epoch": 1.1397306397306397, "grad_norm": 0.45299121737480164, "learning_rate": 9.880443057304375e-06, "loss": 0.4147, "step": 1354 }, { "epoch": 1.1405723905723906, "grad_norm": 0.5068076848983765, "learning_rate": 9.879986389062868e-06, "loss": 0.4231, "step": 1355 }, { "epoch": 1.1414141414141414, "grad_norm": 0.52045077085495, "learning_rate": 9.87952886091573e-06, "loss": 0.4188, "step": 1356 }, { "epoch": 1.1422558922558923, "grad_norm": 0.46406736969947815, "learning_rate": 9.879070472943585e-06, "loss": 0.3896, "step": 1357 }, { "epoch": 1.1430976430976432, "grad_norm": 0.4978713095188141, "learning_rate": 9.878611225227205e-06, "loss": 0.4118, "step": 1358 }, { "epoch": 1.143939393939394, "grad_norm": 0.5039154291152954, "learning_rate": 9.878151117847515e-06, "loss": 0.4215, "step": 1359 }, { "epoch": 1.144781144781145, "grad_norm": 0.5054770708084106, "learning_rate": 9.877690150885589e-06, "loss": 0.4086, "step": 1360 }, { "epoch": 1.1456228956228955, "grad_norm": 0.5559184551239014, "learning_rate": 9.877228324422654e-06, "loss": 0.4257, "step": 1361 }, { "epoch": 1.1464646464646464, "grad_norm": 0.5354487299919128, "learning_rate": 9.876765638540091e-06, "loss": 0.445, "step": 1362 }, { "epoch": 1.1473063973063973, "grad_norm": 0.48374441266059875, "learning_rate": 9.87630209331943e-06, "loss": 0.4461, "step": 1363 }, { "epoch": 1.1481481481481481, "grad_norm": 0.5792035460472107, "learning_rate": 9.875837688842352e-06, "loss": 0.4039, "step": 1364 }, { "epoch": 1.148989898989899, "grad_norm": 0.4769614636898041, "learning_rate": 9.875372425190687e-06, "loss": 0.4066, "step": 1365 }, { "epoch": 1.1498316498316499, "grad_norm": 0.5083758234977722, "learning_rate": 9.874906302446425e-06, "loss": 0.4264, "step": 1366 }, { "epoch": 1.1506734006734007, "grad_norm": 0.4764128029346466, "learning_rate": 9.874439320691698e-06, "loss": 0.4189, "step": 1367 }, { "epoch": 1.1515151515151516, "grad_norm": 0.5543003678321838, "learning_rate": 9.873971480008795e-06, "loss": 0.4444, "step": 1368 }, { "epoch": 1.1523569023569022, "grad_norm": 0.5032731890678406, "learning_rate": 9.873502780480152e-06, "loss": 0.4221, "step": 1369 }, { "epoch": 1.1531986531986531, "grad_norm": 0.45568031072616577, "learning_rate": 9.873033222188364e-06, "loss": 0.4216, "step": 1370 }, { "epoch": 1.154040404040404, "grad_norm": 0.4789413809776306, "learning_rate": 9.872562805216168e-06, "loss": 0.4162, "step": 1371 }, { "epoch": 1.1548821548821548, "grad_norm": 0.5932811498641968, "learning_rate": 9.872091529646455e-06, "loss": 0.4368, "step": 1372 }, { "epoch": 1.1557239057239057, "grad_norm": 0.48737192153930664, "learning_rate": 9.871619395562273e-06, "loss": 0.4396, "step": 1373 }, { "epoch": 1.1565656565656566, "grad_norm": 0.4669978618621826, "learning_rate": 9.871146403046815e-06, "loss": 0.4211, "step": 1374 }, { "epoch": 1.1574074074074074, "grad_norm": 0.6121896505355835, "learning_rate": 9.870672552183425e-06, "loss": 0.4042, "step": 1375 }, { "epoch": 1.1582491582491583, "grad_norm": 0.5648590326309204, "learning_rate": 9.870197843055605e-06, "loss": 0.4083, "step": 1376 }, { "epoch": 1.1590909090909092, "grad_norm": 0.5580945611000061, "learning_rate": 9.869722275747002e-06, "loss": 0.4345, "step": 1377 }, { "epoch": 1.15993265993266, "grad_norm": 0.47706568241119385, "learning_rate": 9.869245850341416e-06, "loss": 0.4273, "step": 1378 }, { "epoch": 1.1607744107744107, "grad_norm": 0.4968038499355316, "learning_rate": 9.868768566922797e-06, "loss": 0.458, "step": 1379 }, { "epoch": 1.1616161616161615, "grad_norm": 0.46777284145355225, "learning_rate": 9.86829042557525e-06, "loss": 0.4473, "step": 1380 }, { "epoch": 1.1624579124579124, "grad_norm": 0.4833667278289795, "learning_rate": 9.867811426383025e-06, "loss": 0.3929, "step": 1381 }, { "epoch": 1.1632996632996633, "grad_norm": 0.5322316884994507, "learning_rate": 9.867331569430529e-06, "loss": 0.4506, "step": 1382 }, { "epoch": 1.1641414141414141, "grad_norm": 0.5176856517791748, "learning_rate": 9.866850854802318e-06, "loss": 0.4375, "step": 1383 }, { "epoch": 1.164983164983165, "grad_norm": 0.47567081451416016, "learning_rate": 9.866369282583101e-06, "loss": 0.4288, "step": 1384 }, { "epoch": 1.1658249158249159, "grad_norm": 0.5101653933525085, "learning_rate": 9.865886852857734e-06, "loss": 0.431, "step": 1385 }, { "epoch": 1.1666666666666667, "grad_norm": 0.4776579737663269, "learning_rate": 9.865403565711225e-06, "loss": 0.4147, "step": 1386 }, { "epoch": 1.1675084175084174, "grad_norm": 0.43861764669418335, "learning_rate": 9.864919421228734e-06, "loss": 0.4397, "step": 1387 }, { "epoch": 1.1683501683501682, "grad_norm": 0.5173589587211609, "learning_rate": 9.864434419495577e-06, "loss": 0.4029, "step": 1388 }, { "epoch": 1.1691919191919191, "grad_norm": 0.45766472816467285, "learning_rate": 9.863948560597211e-06, "loss": 0.4027, "step": 1389 }, { "epoch": 1.17003367003367, "grad_norm": 0.4629509747028351, "learning_rate": 9.863461844619254e-06, "loss": 0.3993, "step": 1390 }, { "epoch": 1.1708754208754208, "grad_norm": 0.5095140337944031, "learning_rate": 9.862974271647467e-06, "loss": 0.4118, "step": 1391 }, { "epoch": 1.1717171717171717, "grad_norm": 0.5212144255638123, "learning_rate": 9.862485841767768e-06, "loss": 0.4387, "step": 1392 }, { "epoch": 1.1725589225589226, "grad_norm": 0.5190088748931885, "learning_rate": 9.861996555066222e-06, "loss": 0.4072, "step": 1393 }, { "epoch": 1.1734006734006734, "grad_norm": 0.5718361735343933, "learning_rate": 9.861506411629047e-06, "loss": 0.4093, "step": 1394 }, { "epoch": 1.1742424242424243, "grad_norm": 0.5331986546516418, "learning_rate": 9.861015411542613e-06, "loss": 0.4323, "step": 1395 }, { "epoch": 1.1750841750841752, "grad_norm": 0.587783694267273, "learning_rate": 9.860523554893438e-06, "loss": 0.4482, "step": 1396 }, { "epoch": 1.175925925925926, "grad_norm": 0.5511756539344788, "learning_rate": 9.860030841768191e-06, "loss": 0.432, "step": 1397 }, { "epoch": 1.1767676767676767, "grad_norm": 0.4470411241054535, "learning_rate": 9.859537272253696e-06, "loss": 0.4276, "step": 1398 }, { "epoch": 1.1776094276094276, "grad_norm": 0.5563693642616272, "learning_rate": 9.859042846436924e-06, "loss": 0.4564, "step": 1399 }, { "epoch": 1.1784511784511784, "grad_norm": 0.4880155622959137, "learning_rate": 9.858547564405e-06, "loss": 0.4233, "step": 1400 }, { "epoch": 1.1792929292929293, "grad_norm": 0.5013789534568787, "learning_rate": 9.858051426245193e-06, "loss": 0.4478, "step": 1401 }, { "epoch": 1.1801346801346801, "grad_norm": 0.5396603941917419, "learning_rate": 9.857554432044931e-06, "loss": 0.4255, "step": 1402 }, { "epoch": 1.180976430976431, "grad_norm": 0.4293077290058136, "learning_rate": 9.857056581891791e-06, "loss": 0.4276, "step": 1403 }, { "epoch": 1.1818181818181819, "grad_norm": 0.4605470597743988, "learning_rate": 9.856557875873498e-06, "loss": 0.4306, "step": 1404 }, { "epoch": 1.1826599326599327, "grad_norm": 0.47763141989707947, "learning_rate": 9.85605831407793e-06, "loss": 0.443, "step": 1405 }, { "epoch": 1.1835016835016834, "grad_norm": 0.5086050033569336, "learning_rate": 9.855557896593113e-06, "loss": 0.4256, "step": 1406 }, { "epoch": 1.1843434343434343, "grad_norm": 0.4217749536037445, "learning_rate": 9.855056623507229e-06, "loss": 0.4315, "step": 1407 }, { "epoch": 1.1851851851851851, "grad_norm": 0.5056982040405273, "learning_rate": 9.854554494908606e-06, "loss": 0.4128, "step": 1408 }, { "epoch": 1.186026936026936, "grad_norm": 0.5344873070716858, "learning_rate": 9.854051510885722e-06, "loss": 0.4068, "step": 1409 }, { "epoch": 1.1868686868686869, "grad_norm": 0.47799092531204224, "learning_rate": 9.853547671527216e-06, "loss": 0.4417, "step": 1410 }, { "epoch": 1.1877104377104377, "grad_norm": 0.586819589138031, "learning_rate": 9.85304297692186e-06, "loss": 0.4284, "step": 1411 }, { "epoch": 1.1885521885521886, "grad_norm": 0.45386117696762085, "learning_rate": 9.852537427158592e-06, "loss": 0.4223, "step": 1412 }, { "epoch": 1.1893939393939394, "grad_norm": 0.5883371829986572, "learning_rate": 9.852031022326496e-06, "loss": 0.4132, "step": 1413 }, { "epoch": 1.1902356902356903, "grad_norm": 0.480936735868454, "learning_rate": 9.851523762514802e-06, "loss": 0.4388, "step": 1414 }, { "epoch": 1.1910774410774412, "grad_norm": 0.5386082530021667, "learning_rate": 9.851015647812897e-06, "loss": 0.4441, "step": 1415 }, { "epoch": 1.1919191919191918, "grad_norm": 0.48542481660842896, "learning_rate": 9.850506678310318e-06, "loss": 0.409, "step": 1416 }, { "epoch": 1.1927609427609427, "grad_norm": 0.5727831125259399, "learning_rate": 9.849996854096748e-06, "loss": 0.4443, "step": 1417 }, { "epoch": 1.1936026936026936, "grad_norm": 0.5099077224731445, "learning_rate": 9.849486175262024e-06, "loss": 0.4227, "step": 1418 }, { "epoch": 1.1944444444444444, "grad_norm": 0.49213942885398865, "learning_rate": 9.848974641896135e-06, "loss": 0.4162, "step": 1419 }, { "epoch": 1.1952861952861953, "grad_norm": 0.5589703321456909, "learning_rate": 9.848462254089217e-06, "loss": 0.4334, "step": 1420 }, { "epoch": 1.1961279461279462, "grad_norm": 0.5037125945091248, "learning_rate": 9.847949011931557e-06, "loss": 0.4281, "step": 1421 }, { "epoch": 1.196969696969697, "grad_norm": 0.5890716910362244, "learning_rate": 9.847434915513596e-06, "loss": 0.425, "step": 1422 }, { "epoch": 1.1978114478114479, "grad_norm": 0.47508275508880615, "learning_rate": 9.846919964925921e-06, "loss": 0.422, "step": 1423 }, { "epoch": 1.1986531986531987, "grad_norm": 0.4955739676952362, "learning_rate": 9.846404160259276e-06, "loss": 0.4091, "step": 1424 }, { "epoch": 1.1994949494949494, "grad_norm": 0.5114926099777222, "learning_rate": 9.845887501604546e-06, "loss": 0.4161, "step": 1425 }, { "epoch": 1.2003367003367003, "grad_norm": 0.4628690779209137, "learning_rate": 9.845369989052774e-06, "loss": 0.4174, "step": 1426 }, { "epoch": 1.2011784511784511, "grad_norm": 0.4978194534778595, "learning_rate": 9.844851622695154e-06, "loss": 0.4262, "step": 1427 }, { "epoch": 1.202020202020202, "grad_norm": 0.5211849808692932, "learning_rate": 9.844332402623021e-06, "loss": 0.4436, "step": 1428 }, { "epoch": 1.2028619528619529, "grad_norm": 0.49939727783203125, "learning_rate": 9.843812328927872e-06, "loss": 0.4311, "step": 1429 }, { "epoch": 1.2037037037037037, "grad_norm": 0.5199992656707764, "learning_rate": 9.843291401701352e-06, "loss": 0.3896, "step": 1430 }, { "epoch": 1.2045454545454546, "grad_norm": 0.5420658588409424, "learning_rate": 9.842769621035248e-06, "loss": 0.4122, "step": 1431 }, { "epoch": 1.2053872053872055, "grad_norm": 0.4438866972923279, "learning_rate": 9.842246987021506e-06, "loss": 0.4322, "step": 1432 }, { "epoch": 1.2062289562289563, "grad_norm": 0.5117930173873901, "learning_rate": 9.841723499752221e-06, "loss": 0.4293, "step": 1433 }, { "epoch": 1.2070707070707072, "grad_norm": 0.5348331332206726, "learning_rate": 9.841199159319635e-06, "loss": 0.4312, "step": 1434 }, { "epoch": 1.2079124579124578, "grad_norm": 0.5135655999183655, "learning_rate": 9.840673965816143e-06, "loss": 0.4223, "step": 1435 }, { "epoch": 1.2087542087542087, "grad_norm": 0.5633255243301392, "learning_rate": 9.840147919334292e-06, "loss": 0.4284, "step": 1436 }, { "epoch": 1.2095959595959596, "grad_norm": 0.4785836338996887, "learning_rate": 9.839621019966771e-06, "loss": 0.4123, "step": 1437 }, { "epoch": 1.2104377104377104, "grad_norm": 0.564140260219574, "learning_rate": 9.83909326780643e-06, "loss": 0.4299, "step": 1438 }, { "epoch": 1.2112794612794613, "grad_norm": 0.5763178467750549, "learning_rate": 9.838564662946265e-06, "loss": 0.4246, "step": 1439 }, { "epoch": 1.2121212121212122, "grad_norm": 0.5302059054374695, "learning_rate": 9.83803520547942e-06, "loss": 0.4128, "step": 1440 }, { "epoch": 1.212962962962963, "grad_norm": 0.5574486255645752, "learning_rate": 9.83750489549919e-06, "loss": 0.4347, "step": 1441 }, { "epoch": 1.2138047138047139, "grad_norm": 0.6102777719497681, "learning_rate": 9.836973733099025e-06, "loss": 0.4202, "step": 1442 }, { "epoch": 1.2146464646464645, "grad_norm": 0.5477110743522644, "learning_rate": 9.836441718372519e-06, "loss": 0.4115, "step": 1443 }, { "epoch": 1.2154882154882154, "grad_norm": 0.6068682074546814, "learning_rate": 9.835908851413416e-06, "loss": 0.4286, "step": 1444 }, { "epoch": 1.2163299663299663, "grad_norm": 0.5822208523750305, "learning_rate": 9.835375132315619e-06, "loss": 0.4209, "step": 1445 }, { "epoch": 1.2171717171717171, "grad_norm": 0.525497555732727, "learning_rate": 9.834840561173169e-06, "loss": 0.4407, "step": 1446 }, { "epoch": 1.218013468013468, "grad_norm": 0.5279465913772583, "learning_rate": 9.834305138080267e-06, "loss": 0.421, "step": 1447 }, { "epoch": 1.2188552188552189, "grad_norm": 0.4492776095867157, "learning_rate": 9.83376886313126e-06, "loss": 0.4071, "step": 1448 }, { "epoch": 1.2196969696969697, "grad_norm": 0.5015191435813904, "learning_rate": 9.833231736420643e-06, "loss": 0.4175, "step": 1449 }, { "epoch": 1.2205387205387206, "grad_norm": 0.5233821272850037, "learning_rate": 9.832693758043065e-06, "loss": 0.4367, "step": 1450 }, { "epoch": 1.2213804713804715, "grad_norm": 0.5672961473464966, "learning_rate": 9.832154928093324e-06, "loss": 0.4184, "step": 1451 }, { "epoch": 1.2222222222222223, "grad_norm": 0.4942997694015503, "learning_rate": 9.831615246666369e-06, "loss": 0.4301, "step": 1452 }, { "epoch": 1.2230639730639732, "grad_norm": 0.533369243144989, "learning_rate": 9.831074713857293e-06, "loss": 0.3991, "step": 1453 }, { "epoch": 1.2239057239057238, "grad_norm": 0.5636284947395325, "learning_rate": 9.830533329761346e-06, "loss": 0.4277, "step": 1454 }, { "epoch": 1.2247474747474747, "grad_norm": 0.45234227180480957, "learning_rate": 9.82999109447393e-06, "loss": 0.4227, "step": 1455 }, { "epoch": 1.2255892255892256, "grad_norm": 0.5630804300308228, "learning_rate": 9.829448008090583e-06, "loss": 0.4158, "step": 1456 }, { "epoch": 1.2264309764309764, "grad_norm": 0.5768043398857117, "learning_rate": 9.828904070707013e-06, "loss": 0.4124, "step": 1457 }, { "epoch": 1.2272727272727273, "grad_norm": 0.47553184628486633, "learning_rate": 9.82835928241906e-06, "loss": 0.4272, "step": 1458 }, { "epoch": 1.2281144781144782, "grad_norm": 0.5486543774604797, "learning_rate": 9.827813643322724e-06, "loss": 0.4318, "step": 1459 }, { "epoch": 1.228956228956229, "grad_norm": 0.5928820967674255, "learning_rate": 9.827267153514154e-06, "loss": 0.4211, "step": 1460 }, { "epoch": 1.22979797979798, "grad_norm": 0.48125261068344116, "learning_rate": 9.826719813089646e-06, "loss": 0.4134, "step": 1461 }, { "epoch": 1.2306397306397305, "grad_norm": 0.5811278820037842, "learning_rate": 9.826171622145645e-06, "loss": 0.4494, "step": 1462 }, { "epoch": 1.2314814814814814, "grad_norm": 0.5531315207481384, "learning_rate": 9.825622580778753e-06, "loss": 0.4274, "step": 1463 }, { "epoch": 1.2323232323232323, "grad_norm": 0.5192791819572449, "learning_rate": 9.825072689085714e-06, "loss": 0.4078, "step": 1464 }, { "epoch": 1.2331649831649831, "grad_norm": 0.5461998581886292, "learning_rate": 9.824521947163424e-06, "loss": 0.4083, "step": 1465 }, { "epoch": 1.234006734006734, "grad_norm": 0.531383216381073, "learning_rate": 9.823970355108931e-06, "loss": 0.4162, "step": 1466 }, { "epoch": 1.2348484848484849, "grad_norm": 0.5375086069107056, "learning_rate": 9.823417913019433e-06, "loss": 0.4414, "step": 1467 }, { "epoch": 1.2356902356902357, "grad_norm": 0.6531874537467957, "learning_rate": 9.822864620992272e-06, "loss": 0.4315, "step": 1468 }, { "epoch": 1.2365319865319866, "grad_norm": 0.5381276607513428, "learning_rate": 9.822310479124945e-06, "loss": 0.4328, "step": 1469 }, { "epoch": 1.2373737373737375, "grad_norm": 0.6014842987060547, "learning_rate": 9.821755487515102e-06, "loss": 0.445, "step": 1470 }, { "epoch": 1.2382154882154883, "grad_norm": 0.6169754266738892, "learning_rate": 9.821199646260534e-06, "loss": 0.4309, "step": 1471 }, { "epoch": 1.239057239057239, "grad_norm": 0.5498476028442383, "learning_rate": 9.820642955459186e-06, "loss": 0.4242, "step": 1472 }, { "epoch": 1.2398989898989898, "grad_norm": 0.5072385668754578, "learning_rate": 9.820085415209156e-06, "loss": 0.4065, "step": 1473 }, { "epoch": 1.2407407407407407, "grad_norm": 0.5507019758224487, "learning_rate": 9.819527025608686e-06, "loss": 0.4159, "step": 1474 }, { "epoch": 1.2415824915824916, "grad_norm": 0.5485184192657471, "learning_rate": 9.818967786756172e-06, "loss": 0.4261, "step": 1475 }, { "epoch": 1.2424242424242424, "grad_norm": 0.532622218132019, "learning_rate": 9.818407698750157e-06, "loss": 0.4093, "step": 1476 }, { "epoch": 1.2432659932659933, "grad_norm": 0.6920375227928162, "learning_rate": 9.817846761689333e-06, "loss": 0.4132, "step": 1477 }, { "epoch": 1.2441077441077442, "grad_norm": 0.5041726231575012, "learning_rate": 9.817284975672545e-06, "loss": 0.4348, "step": 1478 }, { "epoch": 1.244949494949495, "grad_norm": 0.5416246056556702, "learning_rate": 9.816722340798786e-06, "loss": 0.3971, "step": 1479 }, { "epoch": 1.2457912457912457, "grad_norm": 0.588259756565094, "learning_rate": 9.816158857167198e-06, "loss": 0.417, "step": 1480 }, { "epoch": 1.2466329966329965, "grad_norm": 0.5366615056991577, "learning_rate": 9.81559452487707e-06, "loss": 0.4475, "step": 1481 }, { "epoch": 1.2474747474747474, "grad_norm": 0.48820552229881287, "learning_rate": 9.815029344027847e-06, "loss": 0.4145, "step": 1482 }, { "epoch": 1.2483164983164983, "grad_norm": 0.5327175259590149, "learning_rate": 9.814463314719118e-06, "loss": 0.4159, "step": 1483 }, { "epoch": 1.2491582491582491, "grad_norm": 0.5000365972518921, "learning_rate": 9.813896437050625e-06, "loss": 0.4229, "step": 1484 }, { "epoch": 1.25, "grad_norm": 0.4892677962779999, "learning_rate": 9.813328711122257e-06, "loss": 0.4193, "step": 1485 }, { "epoch": 1.2508417508417509, "grad_norm": 0.586632251739502, "learning_rate": 9.812760137034052e-06, "loss": 0.4323, "step": 1486 }, { "epoch": 1.2516835016835017, "grad_norm": 0.4576683044433594, "learning_rate": 9.8121907148862e-06, "loss": 0.4293, "step": 1487 }, { "epoch": 1.2525252525252526, "grad_norm": 0.49621671438217163, "learning_rate": 9.81162044477904e-06, "loss": 0.4077, "step": 1488 }, { "epoch": 1.2533670033670035, "grad_norm": 0.5081853270530701, "learning_rate": 9.811049326813059e-06, "loss": 0.4072, "step": 1489 }, { "epoch": 1.2542087542087543, "grad_norm": 0.6150109767913818, "learning_rate": 9.810477361088893e-06, "loss": 0.4512, "step": 1490 }, { "epoch": 1.255050505050505, "grad_norm": 0.5181475877761841, "learning_rate": 9.80990454770733e-06, "loss": 0.4155, "step": 1491 }, { "epoch": 1.2558922558922558, "grad_norm": 0.4327806532382965, "learning_rate": 9.809330886769304e-06, "loss": 0.42, "step": 1492 }, { "epoch": 1.2567340067340067, "grad_norm": 0.5020470023155212, "learning_rate": 9.808756378375904e-06, "loss": 0.4072, "step": 1493 }, { "epoch": 1.2575757575757576, "grad_norm": 0.5007431507110596, "learning_rate": 9.80818102262836e-06, "loss": 0.4251, "step": 1494 }, { "epoch": 1.2584175084175084, "grad_norm": 0.46861907839775085, "learning_rate": 9.807604819628059e-06, "loss": 0.4232, "step": 1495 }, { "epoch": 1.2592592592592593, "grad_norm": 0.4780140519142151, "learning_rate": 9.807027769476532e-06, "loss": 0.4197, "step": 1496 }, { "epoch": 1.2601010101010102, "grad_norm": 0.4845750331878662, "learning_rate": 9.806449872275461e-06, "loss": 0.3984, "step": 1497 }, { "epoch": 1.2609427609427608, "grad_norm": 0.4977077543735504, "learning_rate": 9.80587112812668e-06, "loss": 0.43, "step": 1498 }, { "epoch": 1.2617845117845117, "grad_norm": 0.513006865978241, "learning_rate": 9.80529153713217e-06, "loss": 0.4193, "step": 1499 }, { "epoch": 1.2626262626262625, "grad_norm": 0.5411672592163086, "learning_rate": 9.80471109939406e-06, "loss": 0.4313, "step": 1500 }, { "epoch": 1.2634680134680134, "grad_norm": 0.47114577889442444, "learning_rate": 9.804129815014628e-06, "loss": 0.4264, "step": 1501 }, { "epoch": 1.2643097643097643, "grad_norm": 0.5346155166625977, "learning_rate": 9.803547684096304e-06, "loss": 0.4032, "step": 1502 }, { "epoch": 1.2651515151515151, "grad_norm": 0.47990643978118896, "learning_rate": 9.802964706741666e-06, "loss": 0.4282, "step": 1503 }, { "epoch": 1.265993265993266, "grad_norm": 0.5635126233100891, "learning_rate": 9.80238088305344e-06, "loss": 0.4427, "step": 1504 }, { "epoch": 1.2668350168350169, "grad_norm": 0.4411381483078003, "learning_rate": 9.801796213134502e-06, "loss": 0.4154, "step": 1505 }, { "epoch": 1.2676767676767677, "grad_norm": 0.6220641136169434, "learning_rate": 9.801210697087878e-06, "loss": 0.4169, "step": 1506 }, { "epoch": 1.2685185185185186, "grad_norm": 0.4728910028934479, "learning_rate": 9.80062433501674e-06, "loss": 0.4325, "step": 1507 }, { "epoch": 1.2693602693602695, "grad_norm": 0.519640326499939, "learning_rate": 9.800037127024414e-06, "loss": 0.4019, "step": 1508 }, { "epoch": 1.2702020202020203, "grad_norm": 0.5078396201133728, "learning_rate": 9.799449073214369e-06, "loss": 0.4333, "step": 1509 }, { "epoch": 1.271043771043771, "grad_norm": 0.501079261302948, "learning_rate": 9.79886017369023e-06, "loss": 0.4253, "step": 1510 }, { "epoch": 1.2718855218855218, "grad_norm": 0.5171712636947632, "learning_rate": 9.798270428555764e-06, "loss": 0.434, "step": 1511 }, { "epoch": 1.2727272727272727, "grad_norm": 0.5041862726211548, "learning_rate": 9.797679837914895e-06, "loss": 0.4294, "step": 1512 }, { "epoch": 1.2735690235690236, "grad_norm": 0.5093945860862732, "learning_rate": 9.797088401871685e-06, "loss": 0.4111, "step": 1513 }, { "epoch": 1.2744107744107744, "grad_norm": 0.4873342216014862, "learning_rate": 9.796496120530358e-06, "loss": 0.4507, "step": 1514 }, { "epoch": 1.2752525252525253, "grad_norm": 0.48936882615089417, "learning_rate": 9.795902993995277e-06, "loss": 0.438, "step": 1515 }, { "epoch": 1.2760942760942762, "grad_norm": 0.48853009939193726, "learning_rate": 9.795309022370955e-06, "loss": 0.4192, "step": 1516 }, { "epoch": 1.2769360269360268, "grad_norm": 0.47013041377067566, "learning_rate": 9.79471420576206e-06, "loss": 0.4087, "step": 1517 }, { "epoch": 1.2777777777777777, "grad_norm": 0.4418462812900543, "learning_rate": 9.794118544273403e-06, "loss": 0.4243, "step": 1518 }, { "epoch": 1.2786195286195285, "grad_norm": 0.5290106534957886, "learning_rate": 9.793522038009947e-06, "loss": 0.3976, "step": 1519 }, { "epoch": 1.2794612794612794, "grad_norm": 0.5512610077857971, "learning_rate": 9.7929246870768e-06, "loss": 0.4434, "step": 1520 }, { "epoch": 1.2803030303030303, "grad_norm": 0.4555549919605255, "learning_rate": 9.792326491579227e-06, "loss": 0.4297, "step": 1521 }, { "epoch": 1.2811447811447811, "grad_norm": 0.5451554656028748, "learning_rate": 9.791727451622633e-06, "loss": 0.4208, "step": 1522 }, { "epoch": 1.281986531986532, "grad_norm": 0.5037153959274292, "learning_rate": 9.791127567312572e-06, "loss": 0.447, "step": 1523 }, { "epoch": 1.2828282828282829, "grad_norm": 0.4887143671512604, "learning_rate": 9.790526838754755e-06, "loss": 0.4227, "step": 1524 }, { "epoch": 1.2836700336700337, "grad_norm": 0.48797714710235596, "learning_rate": 9.789925266055035e-06, "loss": 0.4174, "step": 1525 }, { "epoch": 1.2845117845117846, "grad_norm": 0.50099778175354, "learning_rate": 9.789322849319418e-06, "loss": 0.4256, "step": 1526 }, { "epoch": 1.2853535353535355, "grad_norm": 0.4968985617160797, "learning_rate": 9.788719588654051e-06, "loss": 0.4351, "step": 1527 }, { "epoch": 1.2861952861952861, "grad_norm": 0.5541825890541077, "learning_rate": 9.78811548416524e-06, "loss": 0.4243, "step": 1528 }, { "epoch": 1.287037037037037, "grad_norm": 0.47102710604667664, "learning_rate": 9.787510535959433e-06, "loss": 0.4132, "step": 1529 }, { "epoch": 1.2878787878787878, "grad_norm": 0.5087406039237976, "learning_rate": 9.786904744143227e-06, "loss": 0.4254, "step": 1530 }, { "epoch": 1.2887205387205387, "grad_norm": 0.5192109942436218, "learning_rate": 9.786298108823369e-06, "loss": 0.411, "step": 1531 }, { "epoch": 1.2895622895622896, "grad_norm": 0.46956878900527954, "learning_rate": 9.785690630106755e-06, "loss": 0.4257, "step": 1532 }, { "epoch": 1.2904040404040404, "grad_norm": 0.5205246210098267, "learning_rate": 9.785082308100431e-06, "loss": 0.4362, "step": 1533 }, { "epoch": 1.2912457912457913, "grad_norm": 0.5121812224388123, "learning_rate": 9.784473142911587e-06, "loss": 0.4164, "step": 1534 }, { "epoch": 1.292087542087542, "grad_norm": 0.5323144793510437, "learning_rate": 9.78386313464757e-06, "loss": 0.4318, "step": 1535 }, { "epoch": 1.2929292929292928, "grad_norm": 0.45130330324172974, "learning_rate": 9.78325228341586e-06, "loss": 0.4358, "step": 1536 }, { "epoch": 1.2937710437710437, "grad_norm": 0.49863511323928833, "learning_rate": 9.782640589324106e-06, "loss": 0.4035, "step": 1537 }, { "epoch": 1.2946127946127945, "grad_norm": 0.45682957768440247, "learning_rate": 9.782028052480088e-06, "loss": 0.4228, "step": 1538 }, { "epoch": 1.2954545454545454, "grad_norm": 0.5112433433532715, "learning_rate": 9.781414672991745e-06, "loss": 0.4153, "step": 1539 }, { "epoch": 1.2962962962962963, "grad_norm": 0.5122635960578918, "learning_rate": 9.780800450967161e-06, "loss": 0.4076, "step": 1540 }, { "epoch": 1.2971380471380471, "grad_norm": 0.47298723459243774, "learning_rate": 9.780185386514565e-06, "loss": 0.4055, "step": 1541 }, { "epoch": 1.297979797979798, "grad_norm": 0.4820300042629242, "learning_rate": 9.779569479742342e-06, "loss": 0.4427, "step": 1542 }, { "epoch": 1.2988215488215489, "grad_norm": 0.45890572667121887, "learning_rate": 9.77895273075902e-06, "loss": 0.4083, "step": 1543 }, { "epoch": 1.2996632996632997, "grad_norm": 0.5777893662452698, "learning_rate": 9.778335139673273e-06, "loss": 0.4226, "step": 1544 }, { "epoch": 1.3005050505050506, "grad_norm": 0.5272167325019836, "learning_rate": 9.777716706593934e-06, "loss": 0.4305, "step": 1545 }, { "epoch": 1.3013468013468015, "grad_norm": 0.4834214150905609, "learning_rate": 9.777097431629974e-06, "loss": 0.4111, "step": 1546 }, { "epoch": 1.3021885521885521, "grad_norm": 0.46777841448783875, "learning_rate": 9.776477314890515e-06, "loss": 0.4194, "step": 1547 }, { "epoch": 1.303030303030303, "grad_norm": 0.5525125861167908, "learning_rate": 9.77585635648483e-06, "loss": 0.452, "step": 1548 }, { "epoch": 1.3038720538720538, "grad_norm": 0.5256640315055847, "learning_rate": 9.775234556522334e-06, "loss": 0.4212, "step": 1549 }, { "epoch": 1.3047138047138047, "grad_norm": 0.45228368043899536, "learning_rate": 9.774611915112602e-06, "loss": 0.4271, "step": 1550 }, { "epoch": 1.3055555555555556, "grad_norm": 0.4778357744216919, "learning_rate": 9.773988432365346e-06, "loss": 0.4266, "step": 1551 }, { "epoch": 1.3063973063973064, "grad_norm": 0.45610103011131287, "learning_rate": 9.77336410839043e-06, "loss": 0.4168, "step": 1552 }, { "epoch": 1.3072390572390573, "grad_norm": 0.46948808431625366, "learning_rate": 9.772738943297868e-06, "loss": 0.4129, "step": 1553 }, { "epoch": 1.308080808080808, "grad_norm": 0.5117892622947693, "learning_rate": 9.77211293719782e-06, "loss": 0.4113, "step": 1554 }, { "epoch": 1.3089225589225588, "grad_norm": 0.5239236354827881, "learning_rate": 9.771486090200592e-06, "loss": 0.4202, "step": 1555 }, { "epoch": 1.3097643097643097, "grad_norm": 0.4486660957336426, "learning_rate": 9.770858402416647e-06, "loss": 0.434, "step": 1556 }, { "epoch": 1.3106060606060606, "grad_norm": 0.5243489146232605, "learning_rate": 9.770229873956587e-06, "loss": 0.4193, "step": 1557 }, { "epoch": 1.3114478114478114, "grad_norm": 0.41101372241973877, "learning_rate": 9.769600504931167e-06, "loss": 0.3964, "step": 1558 }, { "epoch": 1.3122895622895623, "grad_norm": 0.43051716685295105, "learning_rate": 9.768970295451286e-06, "loss": 0.4203, "step": 1559 }, { "epoch": 1.3131313131313131, "grad_norm": 0.5284525752067566, "learning_rate": 9.768339245627994e-06, "loss": 0.4148, "step": 1560 }, { "epoch": 1.313973063973064, "grad_norm": 0.47181710600852966, "learning_rate": 9.76770735557249e-06, "loss": 0.4329, "step": 1561 }, { "epoch": 1.3148148148148149, "grad_norm": 0.49150946736335754, "learning_rate": 9.76707462539612e-06, "loss": 0.3959, "step": 1562 }, { "epoch": 1.3156565656565657, "grad_norm": 0.5111181139945984, "learning_rate": 9.766441055210377e-06, "loss": 0.4367, "step": 1563 }, { "epoch": 1.3164983164983166, "grad_norm": 0.43077293038368225, "learning_rate": 9.765806645126903e-06, "loss": 0.4074, "step": 1564 }, { "epoch": 1.3173400673400675, "grad_norm": 0.4892590045928955, "learning_rate": 9.76517139525749e-06, "loss": 0.4218, "step": 1565 }, { "epoch": 1.3181818181818181, "grad_norm": 0.48818325996398926, "learning_rate": 9.764535305714069e-06, "loss": 0.4327, "step": 1566 }, { "epoch": 1.319023569023569, "grad_norm": 0.4401734471321106, "learning_rate": 9.763898376608734e-06, "loss": 0.4087, "step": 1567 }, { "epoch": 1.3198653198653199, "grad_norm": 0.44224029779434204, "learning_rate": 9.763260608053713e-06, "loss": 0.4018, "step": 1568 }, { "epoch": 1.3207070707070707, "grad_norm": 0.4745066463947296, "learning_rate": 9.762622000161392e-06, "loss": 0.4255, "step": 1569 }, { "epoch": 1.3215488215488216, "grad_norm": 0.4639229476451874, "learning_rate": 9.761982553044298e-06, "loss": 0.4469, "step": 1570 }, { "epoch": 1.3223905723905724, "grad_norm": 0.4757630228996277, "learning_rate": 9.761342266815106e-06, "loss": 0.4533, "step": 1571 }, { "epoch": 1.3232323232323233, "grad_norm": 0.5097732543945312, "learning_rate": 9.760701141586647e-06, "loss": 0.3982, "step": 1572 }, { "epoch": 1.324074074074074, "grad_norm": 0.5122289061546326, "learning_rate": 9.76005917747189e-06, "loss": 0.4287, "step": 1573 }, { "epoch": 1.3249158249158248, "grad_norm": 0.5474659204483032, "learning_rate": 9.759416374583957e-06, "loss": 0.4395, "step": 1574 }, { "epoch": 1.3257575757575757, "grad_norm": 0.5080158710479736, "learning_rate": 9.758772733036115e-06, "loss": 0.4376, "step": 1575 }, { "epoch": 1.3265993265993266, "grad_norm": 0.4568294286727905, "learning_rate": 9.758128252941784e-06, "loss": 0.4341, "step": 1576 }, { "epoch": 1.3274410774410774, "grad_norm": 0.5471330285072327, "learning_rate": 9.757482934414526e-06, "loss": 0.4476, "step": 1577 }, { "epoch": 1.3282828282828283, "grad_norm": 0.512522280216217, "learning_rate": 9.756836777568053e-06, "loss": 0.411, "step": 1578 }, { "epoch": 1.3291245791245792, "grad_norm": 0.5036681294441223, "learning_rate": 9.756189782516226e-06, "loss": 0.419, "step": 1579 }, { "epoch": 1.32996632996633, "grad_norm": 0.6227540969848633, "learning_rate": 9.75554194937305e-06, "loss": 0.4296, "step": 1580 }, { "epoch": 1.3308080808080809, "grad_norm": 0.478472501039505, "learning_rate": 9.754893278252683e-06, "loss": 0.4274, "step": 1581 }, { "epoch": 1.3316498316498318, "grad_norm": 0.6046531796455383, "learning_rate": 9.754243769269425e-06, "loss": 0.4082, "step": 1582 }, { "epoch": 1.3324915824915826, "grad_norm": 0.5023596882820129, "learning_rate": 9.75359342253773e-06, "loss": 0.4231, "step": 1583 }, { "epoch": 1.3333333333333333, "grad_norm": 0.5095615386962891, "learning_rate": 9.752942238172193e-06, "loss": 0.431, "step": 1584 }, { "epoch": 1.3341750841750841, "grad_norm": 0.49343782663345337, "learning_rate": 9.752290216287562e-06, "loss": 0.4281, "step": 1585 }, { "epoch": 1.335016835016835, "grad_norm": 0.45843878388404846, "learning_rate": 9.751637356998725e-06, "loss": 0.3957, "step": 1586 }, { "epoch": 1.3358585858585859, "grad_norm": 0.5187981128692627, "learning_rate": 9.75098366042073e-06, "loss": 0.4181, "step": 1587 }, { "epoch": 1.3367003367003367, "grad_norm": 0.4775395691394806, "learning_rate": 9.75032912666876e-06, "loss": 0.4283, "step": 1588 }, { "epoch": 1.3375420875420876, "grad_norm": 0.5516740679740906, "learning_rate": 9.749673755858152e-06, "loss": 0.4225, "step": 1589 }, { "epoch": 1.3383838383838385, "grad_norm": 0.4546196460723877, "learning_rate": 9.74901754810439e-06, "loss": 0.4281, "step": 1590 }, { "epoch": 1.339225589225589, "grad_norm": 0.5430958867073059, "learning_rate": 9.748360503523106e-06, "loss": 0.4193, "step": 1591 }, { "epoch": 1.34006734006734, "grad_norm": 0.5020763874053955, "learning_rate": 9.747702622230078e-06, "loss": 0.4076, "step": 1592 }, { "epoch": 1.3409090909090908, "grad_norm": 0.5334467887878418, "learning_rate": 9.747043904341229e-06, "loss": 0.424, "step": 1593 }, { "epoch": 1.3417508417508417, "grad_norm": 0.5554774403572083, "learning_rate": 9.746384349972635e-06, "loss": 0.4161, "step": 1594 }, { "epoch": 1.3425925925925926, "grad_norm": 0.517042875289917, "learning_rate": 9.745723959240513e-06, "loss": 0.4237, "step": 1595 }, { "epoch": 1.3434343434343434, "grad_norm": 0.5792546272277832, "learning_rate": 9.745062732261235e-06, "loss": 0.4221, "step": 1596 }, { "epoch": 1.3442760942760943, "grad_norm": 0.4634092152118683, "learning_rate": 9.744400669151315e-06, "loss": 0.4151, "step": 1597 }, { "epoch": 1.3451178451178452, "grad_norm": 0.5295396447181702, "learning_rate": 9.743737770027414e-06, "loss": 0.4396, "step": 1598 }, { "epoch": 1.345959595959596, "grad_norm": 0.4741693139076233, "learning_rate": 9.743074035006344e-06, "loss": 0.3991, "step": 1599 }, { "epoch": 1.3468013468013469, "grad_norm": 0.5660140514373779, "learning_rate": 9.74240946420506e-06, "loss": 0.4195, "step": 1600 }, { "epoch": 1.3476430976430978, "grad_norm": 0.4890682101249695, "learning_rate": 9.741744057740668e-06, "loss": 0.404, "step": 1601 }, { "epoch": 1.3484848484848486, "grad_norm": 0.5456443428993225, "learning_rate": 9.741077815730419e-06, "loss": 0.4307, "step": 1602 }, { "epoch": 1.3493265993265993, "grad_norm": 0.6475939154624939, "learning_rate": 9.740410738291713e-06, "loss": 0.4136, "step": 1603 }, { "epoch": 1.3501683501683501, "grad_norm": 0.5293523073196411, "learning_rate": 9.739742825542094e-06, "loss": 0.4401, "step": 1604 }, { "epoch": 1.351010101010101, "grad_norm": 0.4974009096622467, "learning_rate": 9.739074077599258e-06, "loss": 0.4464, "step": 1605 }, { "epoch": 1.3518518518518519, "grad_norm": 0.6226346492767334, "learning_rate": 9.738404494581044e-06, "loss": 0.4503, "step": 1606 }, { "epoch": 1.3526936026936027, "grad_norm": 0.4759966731071472, "learning_rate": 9.737734076605439e-06, "loss": 0.4142, "step": 1607 }, { "epoch": 1.3535353535353536, "grad_norm": 0.44600939750671387, "learning_rate": 9.73706282379058e-06, "loss": 0.4039, "step": 1608 }, { "epoch": 1.3543771043771045, "grad_norm": 0.5154616236686707, "learning_rate": 9.736390736254748e-06, "loss": 0.4192, "step": 1609 }, { "epoch": 1.355218855218855, "grad_norm": 0.5084032416343689, "learning_rate": 9.73571781411637e-06, "loss": 0.4162, "step": 1610 }, { "epoch": 1.356060606060606, "grad_norm": 0.4866029620170593, "learning_rate": 9.735044057494024e-06, "loss": 0.4097, "step": 1611 }, { "epoch": 1.3569023569023568, "grad_norm": 0.5268383622169495, "learning_rate": 9.734369466506433e-06, "loss": 0.4311, "step": 1612 }, { "epoch": 1.3577441077441077, "grad_norm": 0.4808078706264496, "learning_rate": 9.733694041272468e-06, "loss": 0.4239, "step": 1613 }, { "epoch": 1.3585858585858586, "grad_norm": 0.45232412219047546, "learning_rate": 9.733017781911143e-06, "loss": 0.4036, "step": 1614 }, { "epoch": 1.3594276094276094, "grad_norm": 0.4907793700695038, "learning_rate": 9.732340688541625e-06, "loss": 0.3925, "step": 1615 }, { "epoch": 1.3602693602693603, "grad_norm": 0.4832307696342468, "learning_rate": 9.731662761283224e-06, "loss": 0.4087, "step": 1616 }, { "epoch": 1.3611111111111112, "grad_norm": 0.521469235420227, "learning_rate": 9.730984000255399e-06, "loss": 0.4123, "step": 1617 }, { "epoch": 1.361952861952862, "grad_norm": 0.49984538555145264, "learning_rate": 9.730304405577754e-06, "loss": 0.4182, "step": 1618 }, { "epoch": 1.362794612794613, "grad_norm": 0.5780167579650879, "learning_rate": 9.72962397737004e-06, "loss": 0.4307, "step": 1619 }, { "epoch": 1.3636363636363638, "grad_norm": 0.4942317605018616, "learning_rate": 9.728942715752159e-06, "loss": 0.4342, "step": 1620 }, { "epoch": 1.3644781144781144, "grad_norm": 0.4700210392475128, "learning_rate": 9.728260620844153e-06, "loss": 0.4202, "step": 1621 }, { "epoch": 1.3653198653198653, "grad_norm": 0.4876211881637573, "learning_rate": 9.727577692766215e-06, "loss": 0.4075, "step": 1622 }, { "epoch": 1.3661616161616161, "grad_norm": 0.4791610538959503, "learning_rate": 9.726893931638683e-06, "loss": 0.4178, "step": 1623 }, { "epoch": 1.367003367003367, "grad_norm": 0.4482826888561249, "learning_rate": 9.726209337582048e-06, "loss": 0.4071, "step": 1624 }, { "epoch": 1.3678451178451179, "grad_norm": 0.45213156938552856, "learning_rate": 9.725523910716938e-06, "loss": 0.3963, "step": 1625 }, { "epoch": 1.3686868686868687, "grad_norm": 0.5159910321235657, "learning_rate": 9.724837651164135e-06, "loss": 0.4197, "step": 1626 }, { "epoch": 1.3695286195286196, "grad_norm": 0.4258015751838684, "learning_rate": 9.724150559044565e-06, "loss": 0.4037, "step": 1627 }, { "epoch": 1.3703703703703702, "grad_norm": 0.4886329770088196, "learning_rate": 9.7234626344793e-06, "loss": 0.4166, "step": 1628 }, { "epoch": 1.371212121212121, "grad_norm": 0.5095988512039185, "learning_rate": 9.722773877589558e-06, "loss": 0.4266, "step": 1629 }, { "epoch": 1.372053872053872, "grad_norm": 0.5086877942085266, "learning_rate": 9.722084288496709e-06, "loss": 0.4201, "step": 1630 }, { "epoch": 1.3728956228956228, "grad_norm": 0.4286437928676605, "learning_rate": 9.721393867322264e-06, "loss": 0.4023, "step": 1631 }, { "epoch": 1.3737373737373737, "grad_norm": 0.5206538438796997, "learning_rate": 9.720702614187883e-06, "loss": 0.4131, "step": 1632 }, { "epoch": 1.3745791245791246, "grad_norm": 0.5994852781295776, "learning_rate": 9.720010529215371e-06, "loss": 0.4318, "step": 1633 }, { "epoch": 1.3754208754208754, "grad_norm": 0.4529181718826294, "learning_rate": 9.719317612526682e-06, "loss": 0.4058, "step": 1634 }, { "epoch": 1.3762626262626263, "grad_norm": 0.49961018562316895, "learning_rate": 9.718623864243915e-06, "loss": 0.4131, "step": 1635 }, { "epoch": 1.3771043771043772, "grad_norm": 0.6071262955665588, "learning_rate": 9.717929284489317e-06, "loss": 0.3982, "step": 1636 }, { "epoch": 1.377946127946128, "grad_norm": 0.4595780372619629, "learning_rate": 9.717233873385278e-06, "loss": 0.4129, "step": 1637 }, { "epoch": 1.378787878787879, "grad_norm": 0.5829492807388306, "learning_rate": 9.716537631054339e-06, "loss": 0.4292, "step": 1638 }, { "epoch": 1.3796296296296298, "grad_norm": 0.47491058707237244, "learning_rate": 9.715840557619184e-06, "loss": 0.398, "step": 1639 }, { "epoch": 1.3804713804713804, "grad_norm": 0.46715933084487915, "learning_rate": 9.715142653202645e-06, "loss": 0.4162, "step": 1640 }, { "epoch": 1.3813131313131313, "grad_norm": 0.5555644035339355, "learning_rate": 9.7144439179277e-06, "loss": 0.4383, "step": 1641 }, { "epoch": 1.3821548821548821, "grad_norm": 0.45320722460746765, "learning_rate": 9.713744351917475e-06, "loss": 0.4261, "step": 1642 }, { "epoch": 1.382996632996633, "grad_norm": 0.5013389587402344, "learning_rate": 9.713043955295241e-06, "loss": 0.4177, "step": 1643 }, { "epoch": 1.3838383838383839, "grad_norm": 0.48484572768211365, "learning_rate": 9.712342728184411e-06, "loss": 0.3931, "step": 1644 }, { "epoch": 1.3846801346801347, "grad_norm": 0.44795355200767517, "learning_rate": 9.711640670708555e-06, "loss": 0.4279, "step": 1645 }, { "epoch": 1.3855218855218856, "grad_norm": 0.49898242950439453, "learning_rate": 9.710937782991378e-06, "loss": 0.4486, "step": 1646 }, { "epoch": 1.3863636363636362, "grad_norm": 0.5326272249221802, "learning_rate": 9.710234065156741e-06, "loss": 0.4354, "step": 1647 }, { "epoch": 1.387205387205387, "grad_norm": 0.4919356107711792, "learning_rate": 9.70952951732864e-06, "loss": 0.4119, "step": 1648 }, { "epoch": 1.388047138047138, "grad_norm": 0.43522119522094727, "learning_rate": 9.708824139631232e-06, "loss": 0.4094, "step": 1649 }, { "epoch": 1.3888888888888888, "grad_norm": 0.4710790514945984, "learning_rate": 9.708117932188806e-06, "loss": 0.4092, "step": 1650 }, { "epoch": 1.3897306397306397, "grad_norm": 0.46408388018608093, "learning_rate": 9.707410895125805e-06, "loss": 0.3858, "step": 1651 }, { "epoch": 1.3905723905723906, "grad_norm": 0.4448884129524231, "learning_rate": 9.706703028566816e-06, "loss": 0.4242, "step": 1652 }, { "epoch": 1.3914141414141414, "grad_norm": 0.5024473071098328, "learning_rate": 9.705994332636572e-06, "loss": 0.4213, "step": 1653 }, { "epoch": 1.3922558922558923, "grad_norm": 0.4982411563396454, "learning_rate": 9.705284807459956e-06, "loss": 0.422, "step": 1654 }, { "epoch": 1.3930976430976432, "grad_norm": 0.4958944320678711, "learning_rate": 9.704574453161992e-06, "loss": 0.4012, "step": 1655 }, { "epoch": 1.393939393939394, "grad_norm": 0.4890750050544739, "learning_rate": 9.703863269867852e-06, "loss": 0.4343, "step": 1656 }, { "epoch": 1.394781144781145, "grad_norm": 0.4408648610115051, "learning_rate": 9.703151257702854e-06, "loss": 0.41, "step": 1657 }, { "epoch": 1.3956228956228955, "grad_norm": 0.4905857741832733, "learning_rate": 9.702438416792461e-06, "loss": 0.4382, "step": 1658 }, { "epoch": 1.3964646464646464, "grad_norm": 0.48247459530830383, "learning_rate": 9.701724747262284e-06, "loss": 0.3946, "step": 1659 }, { "epoch": 1.3973063973063973, "grad_norm": 0.45618775486946106, "learning_rate": 9.70101024923808e-06, "loss": 0.4048, "step": 1660 }, { "epoch": 1.3981481481481481, "grad_norm": 0.5396762490272522, "learning_rate": 9.700294922845752e-06, "loss": 0.4229, "step": 1661 }, { "epoch": 1.398989898989899, "grad_norm": 0.44758111238479614, "learning_rate": 9.699578768211347e-06, "loss": 0.4145, "step": 1662 }, { "epoch": 1.3998316498316499, "grad_norm": 0.4849450886249542, "learning_rate": 9.698861785461056e-06, "loss": 0.424, "step": 1663 }, { "epoch": 1.4006734006734007, "grad_norm": 0.49467143416404724, "learning_rate": 9.698143974721223e-06, "loss": 0.4121, "step": 1664 }, { "epoch": 1.4015151515151514, "grad_norm": 0.5260468125343323, "learning_rate": 9.697425336118333e-06, "loss": 0.416, "step": 1665 }, { "epoch": 1.4023569023569022, "grad_norm": 0.45928168296813965, "learning_rate": 9.69670586977902e-06, "loss": 0.4147, "step": 1666 }, { "epoch": 1.4031986531986531, "grad_norm": 0.5353308916091919, "learning_rate": 9.695985575830056e-06, "loss": 0.4388, "step": 1667 }, { "epoch": 1.404040404040404, "grad_norm": 0.46767061948776245, "learning_rate": 9.695264454398368e-06, "loss": 0.4214, "step": 1668 }, { "epoch": 1.4048821548821548, "grad_norm": 0.507451057434082, "learning_rate": 9.694542505611027e-06, "loss": 0.4003, "step": 1669 }, { "epoch": 1.4057239057239057, "grad_norm": 0.5425453186035156, "learning_rate": 9.693819729595245e-06, "loss": 0.4278, "step": 1670 }, { "epoch": 1.4065656565656566, "grad_norm": 0.4749312698841095, "learning_rate": 9.693096126478381e-06, "loss": 0.4143, "step": 1671 }, { "epoch": 1.4074074074074074, "grad_norm": 0.4994824528694153, "learning_rate": 9.692371696387947e-06, "loss": 0.4226, "step": 1672 }, { "epoch": 1.4082491582491583, "grad_norm": 0.5086102485656738, "learning_rate": 9.691646439451594e-06, "loss": 0.427, "step": 1673 }, { "epoch": 1.4090909090909092, "grad_norm": 0.4970771074295044, "learning_rate": 9.690920355797117e-06, "loss": 0.4504, "step": 1674 }, { "epoch": 1.40993265993266, "grad_norm": 0.48751360177993774, "learning_rate": 9.690193445552461e-06, "loss": 0.4186, "step": 1675 }, { "epoch": 1.410774410774411, "grad_norm": 0.49383625388145447, "learning_rate": 9.689465708845717e-06, "loss": 0.4381, "step": 1676 }, { "epoch": 1.4116161616161615, "grad_norm": 0.4416236877441406, "learning_rate": 9.688737145805118e-06, "loss": 0.4331, "step": 1677 }, { "epoch": 1.4124579124579124, "grad_norm": 0.5505141019821167, "learning_rate": 9.688007756559046e-06, "loss": 0.4013, "step": 1678 }, { "epoch": 1.4132996632996633, "grad_norm": 0.482657253742218, "learning_rate": 9.687277541236027e-06, "loss": 0.4276, "step": 1679 }, { "epoch": 1.4141414141414141, "grad_norm": 0.5079308152198792, "learning_rate": 9.686546499964731e-06, "loss": 0.3959, "step": 1680 }, { "epoch": 1.414983164983165, "grad_norm": 0.46185022592544556, "learning_rate": 9.685814632873975e-06, "loss": 0.4129, "step": 1681 }, { "epoch": 1.4158249158249159, "grad_norm": 0.5420494079589844, "learning_rate": 9.685081940092726e-06, "loss": 0.416, "step": 1682 }, { "epoch": 1.4166666666666667, "grad_norm": 0.538648784160614, "learning_rate": 9.684348421750087e-06, "loss": 0.4154, "step": 1683 }, { "epoch": 1.4175084175084174, "grad_norm": 0.4859914481639862, "learning_rate": 9.683614077975317e-06, "loss": 0.3904, "step": 1684 }, { "epoch": 1.4183501683501682, "grad_norm": 0.554056704044342, "learning_rate": 9.682878908897811e-06, "loss": 0.4339, "step": 1685 }, { "epoch": 1.4191919191919191, "grad_norm": 0.5237082839012146, "learning_rate": 9.682142914647115e-06, "loss": 0.4215, "step": 1686 }, { "epoch": 1.42003367003367, "grad_norm": 0.4944259524345398, "learning_rate": 9.68140609535292e-06, "loss": 0.3919, "step": 1687 }, { "epoch": 1.4208754208754208, "grad_norm": 0.5345171689987183, "learning_rate": 9.68066845114506e-06, "loss": 0.4042, "step": 1688 }, { "epoch": 1.4217171717171717, "grad_norm": 0.5546846985816956, "learning_rate": 9.679929982153516e-06, "loss": 0.4349, "step": 1689 }, { "epoch": 1.4225589225589226, "grad_norm": 0.5220391750335693, "learning_rate": 9.679190688508415e-06, "loss": 0.4252, "step": 1690 }, { "epoch": 1.4234006734006734, "grad_norm": 0.6068044304847717, "learning_rate": 9.678450570340028e-06, "loss": 0.4361, "step": 1691 }, { "epoch": 1.4242424242424243, "grad_norm": 0.4931564927101135, "learning_rate": 9.67770962777877e-06, "loss": 0.4209, "step": 1692 }, { "epoch": 1.4250841750841752, "grad_norm": 0.5110986828804016, "learning_rate": 9.676967860955206e-06, "loss": 0.4238, "step": 1693 }, { "epoch": 1.425925925925926, "grad_norm": 0.5570716261863708, "learning_rate": 9.676225270000042e-06, "loss": 0.4422, "step": 1694 }, { "epoch": 1.4267676767676767, "grad_norm": 0.4933370053768158, "learning_rate": 9.67548185504413e-06, "loss": 0.3982, "step": 1695 }, { "epoch": 1.4276094276094276, "grad_norm": 0.5523805618286133, "learning_rate": 9.674737616218468e-06, "loss": 0.4163, "step": 1696 }, { "epoch": 1.4284511784511784, "grad_norm": 0.5389685034751892, "learning_rate": 9.673992553654196e-06, "loss": 0.4315, "step": 1697 }, { "epoch": 1.4292929292929293, "grad_norm": 0.47339755296707153, "learning_rate": 9.673246667482608e-06, "loss": 0.4172, "step": 1698 }, { "epoch": 1.4301346801346801, "grad_norm": 0.5191830992698669, "learning_rate": 9.672499957835133e-06, "loss": 0.4235, "step": 1699 }, { "epoch": 1.430976430976431, "grad_norm": 0.469277024269104, "learning_rate": 9.67175242484335e-06, "loss": 0.3817, "step": 1700 }, { "epoch": 1.4318181818181819, "grad_norm": 0.5190001726150513, "learning_rate": 9.671004068638982e-06, "loss": 0.4145, "step": 1701 }, { "epoch": 1.4326599326599325, "grad_norm": 0.46620285511016846, "learning_rate": 9.670254889353897e-06, "loss": 0.4304, "step": 1702 }, { "epoch": 1.4335016835016834, "grad_norm": 0.5989198684692383, "learning_rate": 9.669504887120108e-06, "loss": 0.4307, "step": 1703 }, { "epoch": 1.4343434343434343, "grad_norm": 0.46250438690185547, "learning_rate": 9.668754062069777e-06, "loss": 0.4554, "step": 1704 }, { "epoch": 1.4351851851851851, "grad_norm": 0.5330787301063538, "learning_rate": 9.668002414335202e-06, "loss": 0.4405, "step": 1705 }, { "epoch": 1.436026936026936, "grad_norm": 0.47413963079452515, "learning_rate": 9.667249944048837e-06, "loss": 0.4328, "step": 1706 }, { "epoch": 1.4368686868686869, "grad_norm": 0.41843682527542114, "learning_rate": 9.66649665134327e-06, "loss": 0.411, "step": 1707 }, { "epoch": 1.4377104377104377, "grad_norm": 0.46464595198631287, "learning_rate": 9.665742536351243e-06, "loss": 0.4255, "step": 1708 }, { "epoch": 1.4385521885521886, "grad_norm": 0.504386305809021, "learning_rate": 9.664987599205636e-06, "loss": 0.3919, "step": 1709 }, { "epoch": 1.4393939393939394, "grad_norm": 0.5311611294746399, "learning_rate": 9.66423184003948e-06, "loss": 0.4239, "step": 1710 }, { "epoch": 1.4402356902356903, "grad_norm": 0.48153701424598694, "learning_rate": 9.663475258985946e-06, "loss": 0.4163, "step": 1711 }, { "epoch": 1.4410774410774412, "grad_norm": 0.5258147716522217, "learning_rate": 9.662717856178351e-06, "loss": 0.4265, "step": 1712 }, { "epoch": 1.441919191919192, "grad_norm": 0.5344834923744202, "learning_rate": 9.66195963175016e-06, "loss": 0.4236, "step": 1713 }, { "epoch": 1.4427609427609427, "grad_norm": 0.44353675842285156, "learning_rate": 9.66120058583498e-06, "loss": 0.4139, "step": 1714 }, { "epoch": 1.4436026936026936, "grad_norm": 0.4928493797779083, "learning_rate": 9.66044071856656e-06, "loss": 0.4357, "step": 1715 }, { "epoch": 1.4444444444444444, "grad_norm": 0.49897995591163635, "learning_rate": 9.659680030078798e-06, "loss": 0.4116, "step": 1716 }, { "epoch": 1.4452861952861953, "grad_norm": 0.4964713454246521, "learning_rate": 9.658918520505735e-06, "loss": 0.4393, "step": 1717 }, { "epoch": 1.4461279461279462, "grad_norm": 0.5447719097137451, "learning_rate": 9.65815618998156e-06, "loss": 0.4052, "step": 1718 }, { "epoch": 1.446969696969697, "grad_norm": 0.4583527147769928, "learning_rate": 9.6573930386406e-06, "loss": 0.4079, "step": 1719 }, { "epoch": 1.4478114478114479, "grad_norm": 0.5008954405784607, "learning_rate": 9.656629066617336e-06, "loss": 0.4407, "step": 1720 }, { "epoch": 1.4486531986531985, "grad_norm": 0.4507978558540344, "learning_rate": 9.65586427404638e-06, "loss": 0.4265, "step": 1721 }, { "epoch": 1.4494949494949494, "grad_norm": 0.48604846000671387, "learning_rate": 9.655098661062503e-06, "loss": 0.4523, "step": 1722 }, { "epoch": 1.4503367003367003, "grad_norm": 0.5195416212081909, "learning_rate": 9.654332227800611e-06, "loss": 0.4332, "step": 1723 }, { "epoch": 1.4511784511784511, "grad_norm": 0.44571131467819214, "learning_rate": 9.653564974395759e-06, "loss": 0.4136, "step": 1724 }, { "epoch": 1.452020202020202, "grad_norm": 0.530840277671814, "learning_rate": 9.652796900983146e-06, "loss": 0.4238, "step": 1725 }, { "epoch": 1.4528619528619529, "grad_norm": 0.4596545696258545, "learning_rate": 9.652028007698112e-06, "loss": 0.4374, "step": 1726 }, { "epoch": 1.4537037037037037, "grad_norm": 0.43145114183425903, "learning_rate": 9.651258294676146e-06, "loss": 0.4288, "step": 1727 }, { "epoch": 1.4545454545454546, "grad_norm": 0.5737654566764832, "learning_rate": 9.650487762052879e-06, "loss": 0.4205, "step": 1728 }, { "epoch": 1.4553872053872055, "grad_norm": 0.5244180560112, "learning_rate": 9.64971640996409e-06, "loss": 0.4176, "step": 1729 }, { "epoch": 1.4562289562289563, "grad_norm": 0.49330517649650574, "learning_rate": 9.648944238545695e-06, "loss": 0.4443, "step": 1730 }, { "epoch": 1.4570707070707072, "grad_norm": 0.5575898289680481, "learning_rate": 9.64817124793376e-06, "loss": 0.4149, "step": 1731 }, { "epoch": 1.457912457912458, "grad_norm": 0.47839662432670593, "learning_rate": 9.647397438264497e-06, "loss": 0.417, "step": 1732 }, { "epoch": 1.4587542087542087, "grad_norm": 0.5303191542625427, "learning_rate": 9.646622809674256e-06, "loss": 0.4316, "step": 1733 }, { "epoch": 1.4595959595959596, "grad_norm": 0.5637019872665405, "learning_rate": 9.645847362299536e-06, "loss": 0.4216, "step": 1734 }, { "epoch": 1.4604377104377104, "grad_norm": 0.5316307544708252, "learning_rate": 9.645071096276981e-06, "loss": 0.4043, "step": 1735 }, { "epoch": 1.4612794612794613, "grad_norm": 0.46865177154541016, "learning_rate": 9.644294011743375e-06, "loss": 0.4218, "step": 1736 }, { "epoch": 1.4621212121212122, "grad_norm": 0.5869331955909729, "learning_rate": 9.643516108835648e-06, "loss": 0.4135, "step": 1737 }, { "epoch": 1.462962962962963, "grad_norm": 0.5091442465782166, "learning_rate": 9.64273738769088e-06, "loss": 0.4536, "step": 1738 }, { "epoch": 1.4638047138047139, "grad_norm": 0.5605529546737671, "learning_rate": 9.641957848446282e-06, "loss": 0.4195, "step": 1739 }, { "epoch": 1.4646464646464645, "grad_norm": 0.5798439979553223, "learning_rate": 9.641177491239223e-06, "loss": 0.4112, "step": 1740 }, { "epoch": 1.4654882154882154, "grad_norm": 0.42879557609558105, "learning_rate": 9.640396316207209e-06, "loss": 0.4263, "step": 1741 }, { "epoch": 1.4663299663299663, "grad_norm": 0.5489835143089294, "learning_rate": 9.63961432348789e-06, "loss": 0.3925, "step": 1742 }, { "epoch": 1.4671717171717171, "grad_norm": 0.5028402209281921, "learning_rate": 9.638831513219062e-06, "loss": 0.4362, "step": 1743 }, { "epoch": 1.468013468013468, "grad_norm": 0.5027564764022827, "learning_rate": 9.638047885538665e-06, "loss": 0.4046, "step": 1744 }, { "epoch": 1.4688552188552189, "grad_norm": 0.5048264861106873, "learning_rate": 9.637263440584783e-06, "loss": 0.4072, "step": 1745 }, { "epoch": 1.4696969696969697, "grad_norm": 0.46916401386260986, "learning_rate": 9.636478178495642e-06, "loss": 0.4099, "step": 1746 }, { "epoch": 1.4705387205387206, "grad_norm": 0.49126383662223816, "learning_rate": 9.635692099409618e-06, "loss": 0.42, "step": 1747 }, { "epoch": 1.4713804713804715, "grad_norm": 0.5158584713935852, "learning_rate": 9.634905203465219e-06, "loss": 0.4293, "step": 1748 }, { "epoch": 1.4722222222222223, "grad_norm": 0.4978633522987366, "learning_rate": 9.63411749080111e-06, "loss": 0.4394, "step": 1749 }, { "epoch": 1.4730639730639732, "grad_norm": 0.5006828904151917, "learning_rate": 9.633328961556092e-06, "loss": 0.4488, "step": 1750 }, { "epoch": 1.4739057239057238, "grad_norm": 0.4443666636943817, "learning_rate": 9.632539615869114e-06, "loss": 0.4301, "step": 1751 }, { "epoch": 1.4747474747474747, "grad_norm": 0.5240764021873474, "learning_rate": 9.631749453879266e-06, "loss": 0.4368, "step": 1752 }, { "epoch": 1.4755892255892256, "grad_norm": 0.46050259470939636, "learning_rate": 9.630958475725782e-06, "loss": 0.4235, "step": 1753 }, { "epoch": 1.4764309764309764, "grad_norm": 0.4491947591304779, "learning_rate": 9.630166681548042e-06, "loss": 0.4016, "step": 1754 }, { "epoch": 1.4772727272727273, "grad_norm": 0.4702610671520233, "learning_rate": 9.629374071485571e-06, "loss": 0.3955, "step": 1755 }, { "epoch": 1.4781144781144782, "grad_norm": 0.4630206823348999, "learning_rate": 9.628580645678032e-06, "loss": 0.4394, "step": 1756 }, { "epoch": 1.478956228956229, "grad_norm": 0.41625523567199707, "learning_rate": 9.627786404265235e-06, "loss": 0.4028, "step": 1757 }, { "epoch": 1.4797979797979797, "grad_norm": 0.47248247265815735, "learning_rate": 9.626991347387136e-06, "loss": 0.433, "step": 1758 }, { "epoch": 1.4806397306397305, "grad_norm": 0.5050010681152344, "learning_rate": 9.62619547518383e-06, "loss": 0.4133, "step": 1759 }, { "epoch": 1.4814814814814814, "grad_norm": 0.45248115062713623, "learning_rate": 9.62539878779556e-06, "loss": 0.3951, "step": 1760 }, { "epoch": 1.4823232323232323, "grad_norm": 0.445986807346344, "learning_rate": 9.62460128536271e-06, "loss": 0.425, "step": 1761 }, { "epoch": 1.4831649831649831, "grad_norm": 0.5708766579627991, "learning_rate": 9.62380296802581e-06, "loss": 0.4389, "step": 1762 }, { "epoch": 1.484006734006734, "grad_norm": 0.4612174928188324, "learning_rate": 9.623003835925529e-06, "loss": 0.4222, "step": 1763 }, { "epoch": 1.4848484848484849, "grad_norm": 0.46463543176651, "learning_rate": 9.622203889202687e-06, "loss": 0.4373, "step": 1764 }, { "epoch": 1.4856902356902357, "grad_norm": 0.5217742323875427, "learning_rate": 9.621403127998238e-06, "loss": 0.4254, "step": 1765 }, { "epoch": 1.4865319865319866, "grad_norm": 0.43294674158096313, "learning_rate": 9.620601552453286e-06, "loss": 0.4135, "step": 1766 }, { "epoch": 1.4873737373737375, "grad_norm": 0.42677101492881775, "learning_rate": 9.61979916270908e-06, "loss": 0.4079, "step": 1767 }, { "epoch": 1.4882154882154883, "grad_norm": 0.5054309964179993, "learning_rate": 9.618995958907008e-06, "loss": 0.424, "step": 1768 }, { "epoch": 1.4890572390572392, "grad_norm": 0.44805189967155457, "learning_rate": 9.618191941188602e-06, "loss": 0.431, "step": 1769 }, { "epoch": 1.4898989898989898, "grad_norm": 0.4632880389690399, "learning_rate": 9.61738710969554e-06, "loss": 0.3936, "step": 1770 }, { "epoch": 1.4907407407407407, "grad_norm": 0.50661301612854, "learning_rate": 9.616581464569642e-06, "loss": 0.4042, "step": 1771 }, { "epoch": 1.4915824915824916, "grad_norm": 0.4934137165546417, "learning_rate": 9.61577500595287e-06, "loss": 0.4227, "step": 1772 }, { "epoch": 1.4924242424242424, "grad_norm": 0.4944072365760803, "learning_rate": 9.614967733987332e-06, "loss": 0.3993, "step": 1773 }, { "epoch": 1.4932659932659933, "grad_norm": 0.4729905426502228, "learning_rate": 9.614159648815276e-06, "loss": 0.4161, "step": 1774 }, { "epoch": 1.4941077441077442, "grad_norm": 0.4871782660484314, "learning_rate": 9.613350750579098e-06, "loss": 0.4183, "step": 1775 }, { "epoch": 1.494949494949495, "grad_norm": 0.5116165280342102, "learning_rate": 9.61254103942133e-06, "loss": 0.4091, "step": 1776 }, { "epoch": 1.4957912457912457, "grad_norm": 0.48430344462394714, "learning_rate": 9.611730515484657e-06, "loss": 0.4194, "step": 1777 }, { "epoch": 1.4966329966329965, "grad_norm": 0.44713032245635986, "learning_rate": 9.6109191789119e-06, "loss": 0.4316, "step": 1778 }, { "epoch": 1.4974747474747474, "grad_norm": 0.46166300773620605, "learning_rate": 9.610107029846023e-06, "loss": 0.4278, "step": 1779 }, { "epoch": 1.4983164983164983, "grad_norm": 0.48499795794487, "learning_rate": 9.609294068430137e-06, "loss": 0.4201, "step": 1780 }, { "epoch": 1.4991582491582491, "grad_norm": 0.45792344212532043, "learning_rate": 9.608480294807496e-06, "loss": 0.4428, "step": 1781 }, { "epoch": 1.5, "grad_norm": 0.4293348789215088, "learning_rate": 9.607665709121495e-06, "loss": 0.4165, "step": 1782 }, { "epoch": 1.5008417508417509, "grad_norm": 0.513314962387085, "learning_rate": 9.606850311515668e-06, "loss": 0.4331, "step": 1783 }, { "epoch": 1.5016835016835017, "grad_norm": 0.5065396428108215, "learning_rate": 9.606034102133704e-06, "loss": 0.4281, "step": 1784 }, { "epoch": 1.5025252525252526, "grad_norm": 0.519411027431488, "learning_rate": 9.605217081119423e-06, "loss": 0.4216, "step": 1785 }, { "epoch": 1.5033670033670035, "grad_norm": 0.41935285925865173, "learning_rate": 9.604399248616794e-06, "loss": 0.3946, "step": 1786 }, { "epoch": 1.5042087542087543, "grad_norm": 0.5040301084518433, "learning_rate": 9.603580604769928e-06, "loss": 0.411, "step": 1787 }, { "epoch": 1.5050505050505052, "grad_norm": 0.45913165807724, "learning_rate": 9.60276114972308e-06, "loss": 0.4139, "step": 1788 }, { "epoch": 1.5058922558922558, "grad_norm": 0.5606755018234253, "learning_rate": 9.601940883620644e-06, "loss": 0.4351, "step": 1789 }, { "epoch": 1.5067340067340067, "grad_norm": 0.5750070214271545, "learning_rate": 9.60111980660716e-06, "loss": 0.437, "step": 1790 }, { "epoch": 1.5075757575757576, "grad_norm": 0.4791819155216217, "learning_rate": 9.600297918827313e-06, "loss": 0.4329, "step": 1791 }, { "epoch": 1.5084175084175084, "grad_norm": 0.5375425815582275, "learning_rate": 9.599475220425927e-06, "loss": 0.4105, "step": 1792 }, { "epoch": 1.5092592592592593, "grad_norm": 0.502767026424408, "learning_rate": 9.59865171154797e-06, "loss": 0.4364, "step": 1793 }, { "epoch": 1.51010101010101, "grad_norm": 0.493915855884552, "learning_rate": 9.597827392338553e-06, "loss": 0.424, "step": 1794 }, { "epoch": 1.5109427609427608, "grad_norm": 0.42471861839294434, "learning_rate": 9.597002262942932e-06, "loss": 0.4021, "step": 1795 }, { "epoch": 1.5117845117845117, "grad_norm": 0.4358486831188202, "learning_rate": 9.596176323506498e-06, "loss": 0.4109, "step": 1796 }, { "epoch": 1.5126262626262625, "grad_norm": 0.4256801903247833, "learning_rate": 9.595349574174797e-06, "loss": 0.4187, "step": 1797 }, { "epoch": 1.5134680134680134, "grad_norm": 0.43076473474502563, "learning_rate": 9.594522015093508e-06, "loss": 0.3986, "step": 1798 }, { "epoch": 1.5143097643097643, "grad_norm": 0.6186115741729736, "learning_rate": 9.593693646408456e-06, "loss": 0.4492, "step": 1799 }, { "epoch": 1.5151515151515151, "grad_norm": 0.4539199769496918, "learning_rate": 9.592864468265606e-06, "loss": 0.4269, "step": 1800 }, { "epoch": 1.515993265993266, "grad_norm": 0.5012392401695251, "learning_rate": 9.592034480811072e-06, "loss": 0.3937, "step": 1801 }, { "epoch": 1.5168350168350169, "grad_norm": 0.5291221141815186, "learning_rate": 9.591203684191104e-06, "loss": 0.4247, "step": 1802 }, { "epoch": 1.5176767676767677, "grad_norm": 0.4683563709259033, "learning_rate": 9.5903720785521e-06, "loss": 0.4019, "step": 1803 }, { "epoch": 1.5185185185185186, "grad_norm": 0.486794650554657, "learning_rate": 9.589539664040592e-06, "loss": 0.4429, "step": 1804 }, { "epoch": 1.5193602693602695, "grad_norm": 0.5870823860168457, "learning_rate": 9.588706440803267e-06, "loss": 0.4247, "step": 1805 }, { "epoch": 1.5202020202020203, "grad_norm": 0.4388269782066345, "learning_rate": 9.587872408986945e-06, "loss": 0.4147, "step": 1806 }, { "epoch": 1.5210437710437712, "grad_norm": 0.5506766438484192, "learning_rate": 9.587037568738591e-06, "loss": 0.4257, "step": 1807 }, { "epoch": 1.5218855218855218, "grad_norm": 0.5301663279533386, "learning_rate": 9.586201920205312e-06, "loss": 0.4129, "step": 1808 }, { "epoch": 1.5227272727272727, "grad_norm": 0.5313809514045715, "learning_rate": 9.585365463534361e-06, "loss": 0.4425, "step": 1809 }, { "epoch": 1.5235690235690236, "grad_norm": 0.49614956974983215, "learning_rate": 9.584528198873127e-06, "loss": 0.4094, "step": 1810 }, { "epoch": 1.5244107744107744, "grad_norm": 0.5376438498497009, "learning_rate": 9.583690126369147e-06, "loss": 0.4103, "step": 1811 }, { "epoch": 1.5252525252525253, "grad_norm": 0.47868862748146057, "learning_rate": 9.5828512461701e-06, "loss": 0.4391, "step": 1812 }, { "epoch": 1.526094276094276, "grad_norm": 0.5248849987983704, "learning_rate": 9.582011558423803e-06, "loss": 0.4211, "step": 1813 }, { "epoch": 1.5269360269360268, "grad_norm": 0.5950268507003784, "learning_rate": 9.581171063278218e-06, "loss": 0.4222, "step": 1814 }, { "epoch": 1.5277777777777777, "grad_norm": 0.4965553879737854, "learning_rate": 9.58032976088145e-06, "loss": 0.4186, "step": 1815 }, { "epoch": 1.5286195286195285, "grad_norm": 0.5820702910423279, "learning_rate": 9.579487651381745e-06, "loss": 0.4147, "step": 1816 }, { "epoch": 1.5294612794612794, "grad_norm": 0.5196915864944458, "learning_rate": 9.578644734927492e-06, "loss": 0.4337, "step": 1817 }, { "epoch": 1.5303030303030303, "grad_norm": 0.529447078704834, "learning_rate": 9.577801011667223e-06, "loss": 0.4451, "step": 1818 }, { "epoch": 1.5311447811447811, "grad_norm": 0.5111280679702759, "learning_rate": 9.576956481749609e-06, "loss": 0.4259, "step": 1819 }, { "epoch": 1.531986531986532, "grad_norm": 0.4693845212459564, "learning_rate": 9.576111145323466e-06, "loss": 0.4338, "step": 1820 }, { "epoch": 1.5328282828282829, "grad_norm": 0.46038779616355896, "learning_rate": 9.57526500253775e-06, "loss": 0.4115, "step": 1821 }, { "epoch": 1.5336700336700337, "grad_norm": 0.5052127242088318, "learning_rate": 9.574418053541563e-06, "loss": 0.424, "step": 1822 }, { "epoch": 1.5345117845117846, "grad_norm": 0.5136817097663879, "learning_rate": 9.573570298484145e-06, "loss": 0.4262, "step": 1823 }, { "epoch": 1.5353535353535355, "grad_norm": 0.5896912217140198, "learning_rate": 9.57272173751488e-06, "loss": 0.4365, "step": 1824 }, { "epoch": 1.5361952861952863, "grad_norm": 0.4691097140312195, "learning_rate": 9.571872370783293e-06, "loss": 0.4112, "step": 1825 }, { "epoch": 1.5370370370370372, "grad_norm": 0.5430097579956055, "learning_rate": 9.571022198439053e-06, "loss": 0.409, "step": 1826 }, { "epoch": 1.5378787878787878, "grad_norm": 0.539283275604248, "learning_rate": 9.570171220631965e-06, "loss": 0.4131, "step": 1827 }, { "epoch": 1.5387205387205387, "grad_norm": 0.5112338066101074, "learning_rate": 9.569319437511986e-06, "loss": 0.4182, "step": 1828 }, { "epoch": 1.5395622895622896, "grad_norm": 0.5127906799316406, "learning_rate": 9.568466849229206e-06, "loss": 0.4308, "step": 1829 }, { "epoch": 1.5404040404040404, "grad_norm": 0.5530157685279846, "learning_rate": 9.56761345593386e-06, "loss": 0.4171, "step": 1830 }, { "epoch": 1.541245791245791, "grad_norm": 0.5339685082435608, "learning_rate": 9.566759257776326e-06, "loss": 0.4215, "step": 1831 }, { "epoch": 1.542087542087542, "grad_norm": 0.5659245252609253, "learning_rate": 9.565904254907124e-06, "loss": 0.4631, "step": 1832 }, { "epoch": 1.5429292929292928, "grad_norm": 0.5513259768486023, "learning_rate": 9.565048447476912e-06, "loss": 0.4079, "step": 1833 }, { "epoch": 1.5437710437710437, "grad_norm": 0.5678008198738098, "learning_rate": 9.564191835636494e-06, "loss": 0.4227, "step": 1834 }, { "epoch": 1.5446127946127945, "grad_norm": 0.5455420017242432, "learning_rate": 9.563334419536813e-06, "loss": 0.4321, "step": 1835 }, { "epoch": 1.5454545454545454, "grad_norm": 0.4860096573829651, "learning_rate": 9.562476199328956e-06, "loss": 0.4187, "step": 1836 }, { "epoch": 1.5462962962962963, "grad_norm": 0.5470433235168457, "learning_rate": 9.56161717516415e-06, "loss": 0.4347, "step": 1837 }, { "epoch": 1.5471380471380471, "grad_norm": 0.47916463017463684, "learning_rate": 9.560757347193766e-06, "loss": 0.4444, "step": 1838 }, { "epoch": 1.547979797979798, "grad_norm": 0.4635276794433594, "learning_rate": 9.559896715569313e-06, "loss": 0.4215, "step": 1839 }, { "epoch": 1.5488215488215489, "grad_norm": 0.5030677914619446, "learning_rate": 9.559035280442443e-06, "loss": 0.4252, "step": 1840 }, { "epoch": 1.5496632996632997, "grad_norm": 0.55422443151474, "learning_rate": 9.55817304196495e-06, "loss": 0.4173, "step": 1841 }, { "epoch": 1.5505050505050506, "grad_norm": 0.5134925246238708, "learning_rate": 9.55731000028877e-06, "loss": 0.4149, "step": 1842 }, { "epoch": 1.5513468013468015, "grad_norm": 0.5461732745170593, "learning_rate": 9.556446155565983e-06, "loss": 0.4006, "step": 1843 }, { "epoch": 1.5521885521885523, "grad_norm": 0.48303961753845215, "learning_rate": 9.555581507948803e-06, "loss": 0.4431, "step": 1844 }, { "epoch": 1.553030303030303, "grad_norm": 0.4775664508342743, "learning_rate": 9.554716057589593e-06, "loss": 0.4425, "step": 1845 }, { "epoch": 1.5538720538720538, "grad_norm": 0.5358283519744873, "learning_rate": 9.553849804640856e-06, "loss": 0.3974, "step": 1846 }, { "epoch": 1.5547138047138047, "grad_norm": 0.444015771150589, "learning_rate": 9.55298274925523e-06, "loss": 0.4079, "step": 1847 }, { "epoch": 1.5555555555555556, "grad_norm": 0.4982069730758667, "learning_rate": 9.552114891585503e-06, "loss": 0.4182, "step": 1848 }, { "epoch": 1.5563973063973064, "grad_norm": 0.5626714825630188, "learning_rate": 9.551246231784601e-06, "loss": 0.4266, "step": 1849 }, { "epoch": 1.557239057239057, "grad_norm": 0.4772643446922302, "learning_rate": 9.550376770005589e-06, "loss": 0.404, "step": 1850 }, { "epoch": 1.558080808080808, "grad_norm": 0.5055938363075256, "learning_rate": 9.549506506401679e-06, "loss": 0.4281, "step": 1851 }, { "epoch": 1.5589225589225588, "grad_norm": 0.4820690453052521, "learning_rate": 9.548635441126215e-06, "loss": 0.407, "step": 1852 }, { "epoch": 1.5597643097643097, "grad_norm": 0.45646071434020996, "learning_rate": 9.547763574332693e-06, "loss": 0.4211, "step": 1853 }, { "epoch": 1.5606060606060606, "grad_norm": 0.5025748610496521, "learning_rate": 9.546890906174744e-06, "loss": 0.4224, "step": 1854 }, { "epoch": 1.5614478114478114, "grad_norm": 0.455039918422699, "learning_rate": 9.54601743680614e-06, "loss": 0.4187, "step": 1855 }, { "epoch": 1.5622895622895623, "grad_norm": 0.4563053846359253, "learning_rate": 9.545143166380797e-06, "loss": 0.4118, "step": 1856 }, { "epoch": 1.5631313131313131, "grad_norm": 0.45400890707969666, "learning_rate": 9.54426809505277e-06, "loss": 0.407, "step": 1857 }, { "epoch": 1.563973063973064, "grad_norm": 0.49796542525291443, "learning_rate": 9.543392222976257e-06, "loss": 0.4207, "step": 1858 }, { "epoch": 1.5648148148148149, "grad_norm": 0.4121943414211273, "learning_rate": 9.542515550305596e-06, "loss": 0.4036, "step": 1859 }, { "epoch": 1.5656565656565657, "grad_norm": 0.5896511077880859, "learning_rate": 9.541638077195265e-06, "loss": 0.4346, "step": 1860 }, { "epoch": 1.5664983164983166, "grad_norm": 0.48345696926116943, "learning_rate": 9.540759803799884e-06, "loss": 0.4376, "step": 1861 }, { "epoch": 1.5673400673400675, "grad_norm": 0.5390684008598328, "learning_rate": 9.539880730274214e-06, "loss": 0.4058, "step": 1862 }, { "epoch": 1.5681818181818183, "grad_norm": 0.4982520639896393, "learning_rate": 9.539000856773158e-06, "loss": 0.44, "step": 1863 }, { "epoch": 1.569023569023569, "grad_norm": 0.49521711468696594, "learning_rate": 9.53812018345176e-06, "loss": 0.3917, "step": 1864 }, { "epoch": 1.5698653198653199, "grad_norm": 0.5349518060684204, "learning_rate": 9.537238710465201e-06, "loss": 0.4438, "step": 1865 }, { "epoch": 1.5707070707070707, "grad_norm": 0.41282936930656433, "learning_rate": 9.53635643796881e-06, "loss": 0.4246, "step": 1866 }, { "epoch": 1.5715488215488216, "grad_norm": 0.4919711649417877, "learning_rate": 9.535473366118048e-06, "loss": 0.3836, "step": 1867 }, { "epoch": 1.5723905723905722, "grad_norm": 0.5366120338439941, "learning_rate": 9.534589495068527e-06, "loss": 0.4511, "step": 1868 }, { "epoch": 1.573232323232323, "grad_norm": 0.4937106668949127, "learning_rate": 9.533704824975992e-06, "loss": 0.4286, "step": 1869 }, { "epoch": 1.574074074074074, "grad_norm": 0.4753851890563965, "learning_rate": 9.532819355996328e-06, "loss": 0.4168, "step": 1870 }, { "epoch": 1.5749158249158248, "grad_norm": 0.4771401286125183, "learning_rate": 9.53193308828557e-06, "loss": 0.4197, "step": 1871 }, { "epoch": 1.5757575757575757, "grad_norm": 0.4827803373336792, "learning_rate": 9.531046021999883e-06, "loss": 0.4128, "step": 1872 }, { "epoch": 1.5765993265993266, "grad_norm": 0.42360761761665344, "learning_rate": 9.530158157295581e-06, "loss": 0.4201, "step": 1873 }, { "epoch": 1.5774410774410774, "grad_norm": 0.47254520654678345, "learning_rate": 9.529269494329111e-06, "loss": 0.423, "step": 1874 }, { "epoch": 1.5782828282828283, "grad_norm": 0.5026547312736511, "learning_rate": 9.52838003325707e-06, "loss": 0.4159, "step": 1875 }, { "epoch": 1.5791245791245792, "grad_norm": 0.43833643198013306, "learning_rate": 9.52748977423619e-06, "loss": 0.4165, "step": 1876 }, { "epoch": 1.57996632996633, "grad_norm": 0.48134922981262207, "learning_rate": 9.52659871742334e-06, "loss": 0.4015, "step": 1877 }, { "epoch": 1.5808080808080809, "grad_norm": 0.48203331232070923, "learning_rate": 9.525706862975536e-06, "loss": 0.4279, "step": 1878 }, { "epoch": 1.5816498316498318, "grad_norm": 0.46499916911125183, "learning_rate": 9.524814211049932e-06, "loss": 0.4082, "step": 1879 }, { "epoch": 1.5824915824915826, "grad_norm": 0.4865618646144867, "learning_rate": 9.523920761803825e-06, "loss": 0.4465, "step": 1880 }, { "epoch": 1.5833333333333335, "grad_norm": 0.48994654417037964, "learning_rate": 9.523026515394645e-06, "loss": 0.41, "step": 1881 }, { "epoch": 1.5841750841750841, "grad_norm": 0.4616394340991974, "learning_rate": 9.522131471979973e-06, "loss": 0.4122, "step": 1882 }, { "epoch": 1.585016835016835, "grad_norm": 0.4723338484764099, "learning_rate": 9.521235631717523e-06, "loss": 0.4103, "step": 1883 }, { "epoch": 1.5858585858585859, "grad_norm": 0.4753841459751129, "learning_rate": 9.520338994765153e-06, "loss": 0.3829, "step": 1884 }, { "epoch": 1.5867003367003367, "grad_norm": 0.4522921144962311, "learning_rate": 9.519441561280858e-06, "loss": 0.4417, "step": 1885 }, { "epoch": 1.5875420875420876, "grad_norm": 0.4485127627849579, "learning_rate": 9.518543331422775e-06, "loss": 0.4234, "step": 1886 }, { "epoch": 1.5883838383838382, "grad_norm": 0.49720606207847595, "learning_rate": 9.517644305349184e-06, "loss": 0.4177, "step": 1887 }, { "epoch": 1.589225589225589, "grad_norm": 0.4813624322414398, "learning_rate": 9.516744483218504e-06, "loss": 0.4342, "step": 1888 }, { "epoch": 1.59006734006734, "grad_norm": 0.5012444257736206, "learning_rate": 9.515843865189289e-06, "loss": 0.4157, "step": 1889 }, { "epoch": 1.5909090909090908, "grad_norm": 0.5221617221832275, "learning_rate": 9.514942451420239e-06, "loss": 0.424, "step": 1890 }, { "epoch": 1.5917508417508417, "grad_norm": 0.4625024199485779, "learning_rate": 9.514040242070195e-06, "loss": 0.434, "step": 1891 }, { "epoch": 1.5925925925925926, "grad_norm": 0.49085208773612976, "learning_rate": 9.513137237298133e-06, "loss": 0.4009, "step": 1892 }, { "epoch": 1.5934343434343434, "grad_norm": 0.5276171565055847, "learning_rate": 9.512233437263173e-06, "loss": 0.4182, "step": 1893 }, { "epoch": 1.5942760942760943, "grad_norm": 0.48924747109413147, "learning_rate": 9.511328842124576e-06, "loss": 0.4382, "step": 1894 }, { "epoch": 1.5951178451178452, "grad_norm": 0.5459319353103638, "learning_rate": 9.510423452041739e-06, "loss": 0.3884, "step": 1895 }, { "epoch": 1.595959595959596, "grad_norm": 0.4807513952255249, "learning_rate": 9.509517267174204e-06, "loss": 0.4088, "step": 1896 }, { "epoch": 1.5968013468013469, "grad_norm": 0.5918729901313782, "learning_rate": 9.508610287681647e-06, "loss": 0.4225, "step": 1897 }, { "epoch": 1.5976430976430978, "grad_norm": 0.47554099559783936, "learning_rate": 9.50770251372389e-06, "loss": 0.398, "step": 1898 }, { "epoch": 1.5984848484848486, "grad_norm": 0.47015267610549927, "learning_rate": 9.506793945460889e-06, "loss": 0.4298, "step": 1899 }, { "epoch": 1.5993265993265995, "grad_norm": 0.47670385241508484, "learning_rate": 9.505884583052749e-06, "loss": 0.4124, "step": 1900 }, { "epoch": 1.6001683501683501, "grad_norm": 0.43019625544548035, "learning_rate": 9.504974426659705e-06, "loss": 0.4104, "step": 1901 }, { "epoch": 1.601010101010101, "grad_norm": 0.48342883586883545, "learning_rate": 9.504063476442136e-06, "loss": 0.406, "step": 1902 }, { "epoch": 1.6018518518518519, "grad_norm": 0.5533046722412109, "learning_rate": 9.503151732560564e-06, "loss": 0.4154, "step": 1903 }, { "epoch": 1.6026936026936027, "grad_norm": 0.4667620062828064, "learning_rate": 9.50223919517565e-06, "loss": 0.4271, "step": 1904 }, { "epoch": 1.6035353535353534, "grad_norm": 0.4488353133201599, "learning_rate": 9.501325864448185e-06, "loss": 0.412, "step": 1905 }, { "epoch": 1.6043771043771042, "grad_norm": 0.5117861032485962, "learning_rate": 9.500411740539113e-06, "loss": 0.4387, "step": 1906 }, { "epoch": 1.605218855218855, "grad_norm": 0.5077653527259827, "learning_rate": 9.499496823609513e-06, "loss": 0.4267, "step": 1907 }, { "epoch": 1.606060606060606, "grad_norm": 0.43279191851615906, "learning_rate": 9.498581113820602e-06, "loss": 0.4028, "step": 1908 }, { "epoch": 1.6069023569023568, "grad_norm": 0.512673556804657, "learning_rate": 9.497664611333736e-06, "loss": 0.4185, "step": 1909 }, { "epoch": 1.6077441077441077, "grad_norm": 0.553719162940979, "learning_rate": 9.496747316310414e-06, "loss": 0.4552, "step": 1910 }, { "epoch": 1.6085858585858586, "grad_norm": 0.440403014421463, "learning_rate": 9.495829228912273e-06, "loss": 0.4272, "step": 1911 }, { "epoch": 1.6094276094276094, "grad_norm": 0.5962316989898682, "learning_rate": 9.494910349301092e-06, "loss": 0.4178, "step": 1912 }, { "epoch": 1.6102693602693603, "grad_norm": 0.5092113018035889, "learning_rate": 9.493990677638782e-06, "loss": 0.4227, "step": 1913 }, { "epoch": 1.6111111111111112, "grad_norm": 0.45346787571907043, "learning_rate": 9.493070214087405e-06, "loss": 0.4323, "step": 1914 }, { "epoch": 1.611952861952862, "grad_norm": 0.4929165542125702, "learning_rate": 9.492148958809153e-06, "loss": 0.41, "step": 1915 }, { "epoch": 1.612794612794613, "grad_norm": 0.5086117386817932, "learning_rate": 9.491226911966362e-06, "loss": 0.424, "step": 1916 }, { "epoch": 1.6136363636363638, "grad_norm": 0.395149827003479, "learning_rate": 9.490304073721506e-06, "loss": 0.412, "step": 1917 }, { "epoch": 1.6144781144781146, "grad_norm": 0.4670615494251251, "learning_rate": 9.489380444237198e-06, "loss": 0.3854, "step": 1918 }, { "epoch": 1.6153198653198653, "grad_norm": 0.4340841770172119, "learning_rate": 9.488456023676194e-06, "loss": 0.4355, "step": 1919 }, { "epoch": 1.6161616161616161, "grad_norm": 0.4398490786552429, "learning_rate": 9.487530812201384e-06, "loss": 0.395, "step": 1920 }, { "epoch": 1.617003367003367, "grad_norm": 0.5365794897079468, "learning_rate": 9.486604809975804e-06, "loss": 0.4244, "step": 1921 }, { "epoch": 1.6178451178451179, "grad_norm": 0.5281057953834534, "learning_rate": 9.48567801716262e-06, "loss": 0.4321, "step": 1922 }, { "epoch": 1.6186868686868687, "grad_norm": 0.4609415829181671, "learning_rate": 9.484750433925147e-06, "loss": 0.4165, "step": 1923 }, { "epoch": 1.6195286195286194, "grad_norm": 0.5007176399230957, "learning_rate": 9.483822060426833e-06, "loss": 0.4199, "step": 1924 }, { "epoch": 1.6203703703703702, "grad_norm": 0.5632774233818054, "learning_rate": 9.48289289683127e-06, "loss": 0.4067, "step": 1925 }, { "epoch": 1.621212121212121, "grad_norm": 0.4571710228919983, "learning_rate": 9.481962943302183e-06, "loss": 0.4073, "step": 1926 }, { "epoch": 1.622053872053872, "grad_norm": 0.47537803649902344, "learning_rate": 9.481032200003441e-06, "loss": 0.4344, "step": 1927 }, { "epoch": 1.6228956228956228, "grad_norm": 0.473777711391449, "learning_rate": 9.480100667099052e-06, "loss": 0.409, "step": 1928 }, { "epoch": 1.6237373737373737, "grad_norm": 0.4813259243965149, "learning_rate": 9.479168344753162e-06, "loss": 0.4018, "step": 1929 }, { "epoch": 1.6245791245791246, "grad_norm": 0.44854605197906494, "learning_rate": 9.478235233130053e-06, "loss": 0.4215, "step": 1930 }, { "epoch": 1.6254208754208754, "grad_norm": 0.43427762389183044, "learning_rate": 9.477301332394152e-06, "loss": 0.4345, "step": 1931 }, { "epoch": 1.6262626262626263, "grad_norm": 0.47301340103149414, "learning_rate": 9.476366642710025e-06, "loss": 0.3996, "step": 1932 }, { "epoch": 1.6271043771043772, "grad_norm": 0.5524217486381531, "learning_rate": 9.475431164242367e-06, "loss": 0.4426, "step": 1933 }, { "epoch": 1.627946127946128, "grad_norm": 0.43161439895629883, "learning_rate": 9.474494897156026e-06, "loss": 0.415, "step": 1934 }, { "epoch": 1.628787878787879, "grad_norm": 0.5349063277244568, "learning_rate": 9.47355784161598e-06, "loss": 0.3984, "step": 1935 }, { "epoch": 1.6296296296296298, "grad_norm": 0.5156688094139099, "learning_rate": 9.472619997787346e-06, "loss": 0.4244, "step": 1936 }, { "epoch": 1.6304713804713806, "grad_norm": 0.500935971736908, "learning_rate": 9.471681365835384e-06, "loss": 0.4064, "step": 1937 }, { "epoch": 1.6313131313131313, "grad_norm": 0.4195309281349182, "learning_rate": 9.470741945925492e-06, "loss": 0.3877, "step": 1938 }, { "epoch": 1.6321548821548821, "grad_norm": 0.4501732587814331, "learning_rate": 9.469801738223203e-06, "loss": 0.409, "step": 1939 }, { "epoch": 1.632996632996633, "grad_norm": 0.47685107588768005, "learning_rate": 9.468860742894194e-06, "loss": 0.4262, "step": 1940 }, { "epoch": 1.6338383838383839, "grad_norm": 0.4900820553302765, "learning_rate": 9.467918960104279e-06, "loss": 0.4057, "step": 1941 }, { "epoch": 1.6346801346801347, "grad_norm": 0.4419589936733246, "learning_rate": 9.466976390019407e-06, "loss": 0.4276, "step": 1942 }, { "epoch": 1.6355218855218854, "grad_norm": 0.4891602098941803, "learning_rate": 9.466033032805673e-06, "loss": 0.4353, "step": 1943 }, { "epoch": 1.6363636363636362, "grad_norm": 0.5087552070617676, "learning_rate": 9.465088888629303e-06, "loss": 0.4022, "step": 1944 }, { "epoch": 1.637205387205387, "grad_norm": 0.45793914794921875, "learning_rate": 9.464143957656666e-06, "loss": 0.4017, "step": 1945 }, { "epoch": 1.638047138047138, "grad_norm": 0.47448912262916565, "learning_rate": 9.46319824005427e-06, "loss": 0.4281, "step": 1946 }, { "epoch": 1.6388888888888888, "grad_norm": 0.44809389114379883, "learning_rate": 9.462251735988761e-06, "loss": 0.4156, "step": 1947 }, { "epoch": 1.6397306397306397, "grad_norm": 0.47928348183631897, "learning_rate": 9.461304445626922e-06, "loss": 0.4074, "step": 1948 }, { "epoch": 1.6405723905723906, "grad_norm": 0.5998957753181458, "learning_rate": 9.460356369135674e-06, "loss": 0.3952, "step": 1949 }, { "epoch": 1.6414141414141414, "grad_norm": 0.5013339519500732, "learning_rate": 9.45940750668208e-06, "loss": 0.4234, "step": 1950 }, { "epoch": 1.6422558922558923, "grad_norm": 0.48406341671943665, "learning_rate": 9.458457858433343e-06, "loss": 0.4243, "step": 1951 }, { "epoch": 1.6430976430976432, "grad_norm": 0.5672360062599182, "learning_rate": 9.457507424556796e-06, "loss": 0.4186, "step": 1952 }, { "epoch": 1.643939393939394, "grad_norm": 0.46521151065826416, "learning_rate": 9.456556205219917e-06, "loss": 0.4153, "step": 1953 }, { "epoch": 1.644781144781145, "grad_norm": 0.48197102546691895, "learning_rate": 9.455604200590322e-06, "loss": 0.4053, "step": 1954 }, { "epoch": 1.6456228956228958, "grad_norm": 0.49669814109802246, "learning_rate": 9.454651410835762e-06, "loss": 0.3877, "step": 1955 }, { "epoch": 1.6464646464646466, "grad_norm": 0.5050997138023376, "learning_rate": 9.453697836124134e-06, "loss": 0.3926, "step": 1956 }, { "epoch": 1.6473063973063973, "grad_norm": 0.4780939519405365, "learning_rate": 9.452743476623462e-06, "loss": 0.4158, "step": 1957 }, { "epoch": 1.6481481481481481, "grad_norm": 0.5927581787109375, "learning_rate": 9.451788332501916e-06, "loss": 0.4175, "step": 1958 }, { "epoch": 1.648989898989899, "grad_norm": 0.5257600545883179, "learning_rate": 9.450832403927805e-06, "loss": 0.424, "step": 1959 }, { "epoch": 1.6498316498316499, "grad_norm": 0.4617391526699066, "learning_rate": 9.449875691069573e-06, "loss": 0.4445, "step": 1960 }, { "epoch": 1.6506734006734005, "grad_norm": 0.5305960774421692, "learning_rate": 9.4489181940958e-06, "loss": 0.4199, "step": 1961 }, { "epoch": 1.6515151515151514, "grad_norm": 0.4762917160987854, "learning_rate": 9.44795991317521e-06, "loss": 0.407, "step": 1962 }, { "epoch": 1.6523569023569022, "grad_norm": 0.49372291564941406, "learning_rate": 9.447000848476664e-06, "loss": 0.4296, "step": 1963 }, { "epoch": 1.6531986531986531, "grad_norm": 0.44275957345962524, "learning_rate": 9.446041000169155e-06, "loss": 0.3852, "step": 1964 }, { "epoch": 1.654040404040404, "grad_norm": 0.5177695751190186, "learning_rate": 9.445080368421821e-06, "loss": 0.3927, "step": 1965 }, { "epoch": 1.6548821548821548, "grad_norm": 0.5600129961967468, "learning_rate": 9.444118953403935e-06, "loss": 0.4269, "step": 1966 }, { "epoch": 1.6557239057239057, "grad_norm": 0.4560687839984894, "learning_rate": 9.443156755284911e-06, "loss": 0.4212, "step": 1967 }, { "epoch": 1.6565656565656566, "grad_norm": 0.5578938722610474, "learning_rate": 9.442193774234294e-06, "loss": 0.4267, "step": 1968 }, { "epoch": 1.6574074074074074, "grad_norm": 0.5741994976997375, "learning_rate": 9.441230010421774e-06, "loss": 0.4141, "step": 1969 }, { "epoch": 1.6582491582491583, "grad_norm": 0.4463554322719574, "learning_rate": 9.440265464017177e-06, "loss": 0.4143, "step": 1970 }, { "epoch": 1.6590909090909092, "grad_norm": 0.5046603083610535, "learning_rate": 9.439300135190464e-06, "loss": 0.4329, "step": 1971 }, { "epoch": 1.65993265993266, "grad_norm": 0.4567124545574188, "learning_rate": 9.438334024111739e-06, "loss": 0.3849, "step": 1972 }, { "epoch": 1.660774410774411, "grad_norm": 0.4835706055164337, "learning_rate": 9.437367130951241e-06, "loss": 0.4139, "step": 1973 }, { "epoch": 1.6616161616161618, "grad_norm": 0.512239933013916, "learning_rate": 9.436399455879347e-06, "loss": 0.4167, "step": 1974 }, { "epoch": 1.6624579124579124, "grad_norm": 0.4556059241294861, "learning_rate": 9.435430999066567e-06, "loss": 0.4051, "step": 1975 }, { "epoch": 1.6632996632996633, "grad_norm": 0.5292256474494934, "learning_rate": 9.43446176068356e-06, "loss": 0.3916, "step": 1976 }, { "epoch": 1.6641414141414141, "grad_norm": 0.5345644950866699, "learning_rate": 9.43349174090111e-06, "loss": 0.4221, "step": 1977 }, { "epoch": 1.664983164983165, "grad_norm": 0.5833487510681152, "learning_rate": 9.43252093989015e-06, "loss": 0.4267, "step": 1978 }, { "epoch": 1.6658249158249159, "grad_norm": 0.5474728941917419, "learning_rate": 9.431549357821744e-06, "loss": 0.3999, "step": 1979 }, { "epoch": 1.6666666666666665, "grad_norm": 0.6197241544723511, "learning_rate": 9.430576994867093e-06, "loss": 0.3977, "step": 1980 }, { "epoch": 1.6675084175084174, "grad_norm": 0.5661614537239075, "learning_rate": 9.429603851197539e-06, "loss": 0.4284, "step": 1981 }, { "epoch": 1.6683501683501682, "grad_norm": 0.5536447763442993, "learning_rate": 9.428629926984562e-06, "loss": 0.4166, "step": 1982 }, { "epoch": 1.6691919191919191, "grad_norm": 0.5712242126464844, "learning_rate": 9.427655222399776e-06, "loss": 0.4185, "step": 1983 }, { "epoch": 1.67003367003367, "grad_norm": 0.5562563538551331, "learning_rate": 9.426679737614935e-06, "loss": 0.3997, "step": 1984 }, { "epoch": 1.6708754208754208, "grad_norm": 0.5062586665153503, "learning_rate": 9.42570347280193e-06, "loss": 0.4381, "step": 1985 }, { "epoch": 1.6717171717171717, "grad_norm": 0.5131022930145264, "learning_rate": 9.424726428132787e-06, "loss": 0.412, "step": 1986 }, { "epoch": 1.6725589225589226, "grad_norm": 0.49926257133483887, "learning_rate": 9.423748603779675e-06, "loss": 0.4152, "step": 1987 }, { "epoch": 1.6734006734006734, "grad_norm": 0.5887198448181152, "learning_rate": 9.422769999914895e-06, "loss": 0.4503, "step": 1988 }, { "epoch": 1.6742424242424243, "grad_norm": 0.4469551742076874, "learning_rate": 9.421790616710887e-06, "loss": 0.4137, "step": 1989 }, { "epoch": 1.6750841750841752, "grad_norm": 0.5595013499259949, "learning_rate": 9.420810454340232e-06, "loss": 0.4359, "step": 1990 }, { "epoch": 1.675925925925926, "grad_norm": 0.520087480545044, "learning_rate": 9.41982951297564e-06, "loss": 0.4191, "step": 1991 }, { "epoch": 1.676767676767677, "grad_norm": 0.4519229233264923, "learning_rate": 9.41884779278997e-06, "loss": 0.4063, "step": 1992 }, { "epoch": 1.6776094276094278, "grad_norm": 0.6022638082504272, "learning_rate": 9.417865293956204e-06, "loss": 0.4059, "step": 1993 }, { "epoch": 1.6784511784511784, "grad_norm": 0.48140543699264526, "learning_rate": 9.416882016647474e-06, "loss": 0.3916, "step": 1994 }, { "epoch": 1.6792929292929293, "grad_norm": 0.5249467492103577, "learning_rate": 9.415897961037042e-06, "loss": 0.4144, "step": 1995 }, { "epoch": 1.6801346801346801, "grad_norm": 0.48568254709243774, "learning_rate": 9.41491312729831e-06, "loss": 0.3987, "step": 1996 }, { "epoch": 1.680976430976431, "grad_norm": 0.4953926205635071, "learning_rate": 9.413927515604815e-06, "loss": 0.4173, "step": 1997 }, { "epoch": 1.6818181818181817, "grad_norm": 0.5008490085601807, "learning_rate": 9.412941126130234e-06, "loss": 0.4028, "step": 1998 }, { "epoch": 1.6826599326599325, "grad_norm": 0.5186607241630554, "learning_rate": 9.411953959048378e-06, "loss": 0.4055, "step": 1999 }, { "epoch": 1.6835016835016834, "grad_norm": 0.4528677463531494, "learning_rate": 9.410966014533195e-06, "loss": 0.3978, "step": 2000 }, { "epoch": 1.6843434343434343, "grad_norm": 0.4923630654811859, "learning_rate": 9.409977292758776e-06, "loss": 0.4173, "step": 2001 }, { "epoch": 1.6851851851851851, "grad_norm": 0.5106834769248962, "learning_rate": 9.40898779389934e-06, "loss": 0.4061, "step": 2002 }, { "epoch": 1.686026936026936, "grad_norm": 0.5495801568031311, "learning_rate": 9.407997518129248e-06, "loss": 0.4176, "step": 2003 }, { "epoch": 1.6868686868686869, "grad_norm": 0.5131496787071228, "learning_rate": 9.407006465623e-06, "loss": 0.4119, "step": 2004 }, { "epoch": 1.6877104377104377, "grad_norm": 0.5632554292678833, "learning_rate": 9.406014636555225e-06, "loss": 0.4425, "step": 2005 }, { "epoch": 1.6885521885521886, "grad_norm": 0.4654299020767212, "learning_rate": 9.4050220311007e-06, "loss": 0.4051, "step": 2006 }, { "epoch": 1.6893939393939394, "grad_norm": 0.47783562541007996, "learning_rate": 9.404028649434326e-06, "loss": 0.4287, "step": 2007 }, { "epoch": 1.6902356902356903, "grad_norm": 0.5467926859855652, "learning_rate": 9.403034491731152e-06, "loss": 0.3993, "step": 2008 }, { "epoch": 1.6910774410774412, "grad_norm": 0.4419778883457184, "learning_rate": 9.402039558166358e-06, "loss": 0.408, "step": 2009 }, { "epoch": 1.691919191919192, "grad_norm": 0.48887839913368225, "learning_rate": 9.40104384891526e-06, "loss": 0.4058, "step": 2010 }, { "epoch": 1.692760942760943, "grad_norm": 0.5507516264915466, "learning_rate": 9.400047364153318e-06, "loss": 0.419, "step": 2011 }, { "epoch": 1.6936026936026936, "grad_norm": 0.4678021967411041, "learning_rate": 9.399050104056118e-06, "loss": 0.4246, "step": 2012 }, { "epoch": 1.6944444444444444, "grad_norm": 0.5605798363685608, "learning_rate": 9.39805206879939e-06, "loss": 0.4273, "step": 2013 }, { "epoch": 1.6952861952861953, "grad_norm": 0.46799761056900024, "learning_rate": 9.397053258558998e-06, "loss": 0.4292, "step": 2014 }, { "epoch": 1.6961279461279462, "grad_norm": 0.495412141084671, "learning_rate": 9.396053673510941e-06, "loss": 0.4152, "step": 2015 }, { "epoch": 1.696969696969697, "grad_norm": 0.5316377878189087, "learning_rate": 9.39505331383136e-06, "loss": 0.4104, "step": 2016 }, { "epoch": 1.6978114478114477, "grad_norm": 0.4593218266963959, "learning_rate": 9.39405217969653e-06, "loss": 0.4142, "step": 2017 }, { "epoch": 1.6986531986531985, "grad_norm": 0.4888450801372528, "learning_rate": 9.393050271282855e-06, "loss": 0.4215, "step": 2018 }, { "epoch": 1.6994949494949494, "grad_norm": 0.45036259293556213, "learning_rate": 9.392047588766889e-06, "loss": 0.4029, "step": 2019 }, { "epoch": 1.7003367003367003, "grad_norm": 0.49101951718330383, "learning_rate": 9.39104413232531e-06, "loss": 0.4248, "step": 2020 }, { "epoch": 1.7011784511784511, "grad_norm": 0.5231309533119202, "learning_rate": 9.39003990213494e-06, "loss": 0.4259, "step": 2021 }, { "epoch": 1.702020202020202, "grad_norm": 0.48262229561805725, "learning_rate": 9.389034898372738e-06, "loss": 0.4068, "step": 2022 }, { "epoch": 1.7028619528619529, "grad_norm": 0.42656251788139343, "learning_rate": 9.388029121215792e-06, "loss": 0.384, "step": 2023 }, { "epoch": 1.7037037037037037, "grad_norm": 0.4968153238296509, "learning_rate": 9.387022570841331e-06, "loss": 0.4376, "step": 2024 }, { "epoch": 1.7045454545454546, "grad_norm": 0.48010462522506714, "learning_rate": 9.386015247426724e-06, "loss": 0.4323, "step": 2025 }, { "epoch": 1.7053872053872055, "grad_norm": 0.47275295853614807, "learning_rate": 9.385007151149467e-06, "loss": 0.4121, "step": 2026 }, { "epoch": 1.7062289562289563, "grad_norm": 0.4498578608036041, "learning_rate": 9.383998282187198e-06, "loss": 0.4154, "step": 2027 }, { "epoch": 1.7070707070707072, "grad_norm": 0.48190775513648987, "learning_rate": 9.382988640717697e-06, "loss": 0.3989, "step": 2028 }, { "epoch": 1.707912457912458, "grad_norm": 0.4929972290992737, "learning_rate": 9.381978226918864e-06, "loss": 0.3923, "step": 2029 }, { "epoch": 1.708754208754209, "grad_norm": 0.45879697799682617, "learning_rate": 9.380967040968751e-06, "loss": 0.412, "step": 2030 }, { "epoch": 1.7095959595959596, "grad_norm": 0.5207894444465637, "learning_rate": 9.379955083045538e-06, "loss": 0.4422, "step": 2031 }, { "epoch": 1.7104377104377104, "grad_norm": 0.49121662974357605, "learning_rate": 9.378942353327543e-06, "loss": 0.4017, "step": 2032 }, { "epoch": 1.7112794612794613, "grad_norm": 0.41051506996154785, "learning_rate": 9.37792885199322e-06, "loss": 0.4192, "step": 2033 }, { "epoch": 1.7121212121212122, "grad_norm": 0.5275959968566895, "learning_rate": 9.376914579221156e-06, "loss": 0.43, "step": 2034 }, { "epoch": 1.7129629629629628, "grad_norm": 0.46462321281433105, "learning_rate": 9.375899535190084e-06, "loss": 0.4283, "step": 2035 }, { "epoch": 1.7138047138047137, "grad_norm": 0.49480506777763367, "learning_rate": 9.374883720078855e-06, "loss": 0.4262, "step": 2036 }, { "epoch": 1.7146464646464645, "grad_norm": 0.46763932704925537, "learning_rate": 9.373867134066476e-06, "loss": 0.3932, "step": 2037 }, { "epoch": 1.7154882154882154, "grad_norm": 0.5202119946479797, "learning_rate": 9.372849777332073e-06, "loss": 0.4058, "step": 2038 }, { "epoch": 1.7163299663299663, "grad_norm": 0.6221701502799988, "learning_rate": 9.37183165005492e-06, "loss": 0.4314, "step": 2039 }, { "epoch": 1.7171717171717171, "grad_norm": 0.4646471440792084, "learning_rate": 9.370812752414422e-06, "loss": 0.4222, "step": 2040 }, { "epoch": 1.718013468013468, "grad_norm": 0.6379355788230896, "learning_rate": 9.369793084590118e-06, "loss": 0.421, "step": 2041 }, { "epoch": 1.7188552188552189, "grad_norm": 0.5719043016433716, "learning_rate": 9.368772646761681e-06, "loss": 0.4537, "step": 2042 }, { "epoch": 1.7196969696969697, "grad_norm": 0.49792370200157166, "learning_rate": 9.367751439108928e-06, "loss": 0.4271, "step": 2043 }, { "epoch": 1.7205387205387206, "grad_norm": 0.6139031648635864, "learning_rate": 9.366729461811806e-06, "loss": 0.4273, "step": 2044 }, { "epoch": 1.7213804713804715, "grad_norm": 0.45173248648643494, "learning_rate": 9.365706715050395e-06, "loss": 0.4393, "step": 2045 }, { "epoch": 1.7222222222222223, "grad_norm": 0.48039108514785767, "learning_rate": 9.364683199004918e-06, "loss": 0.4217, "step": 2046 }, { "epoch": 1.7230639730639732, "grad_norm": 0.5358500480651855, "learning_rate": 9.363658913855727e-06, "loss": 0.4284, "step": 2047 }, { "epoch": 1.723905723905724, "grad_norm": 0.44214770197868347, "learning_rate": 9.362633859783313e-06, "loss": 0.4253, "step": 2048 }, { "epoch": 1.7247474747474747, "grad_norm": 0.44579699635505676, "learning_rate": 9.361608036968301e-06, "loss": 0.3963, "step": 2049 }, { "epoch": 1.7255892255892256, "grad_norm": 0.4887399673461914, "learning_rate": 9.360581445591453e-06, "loss": 0.4171, "step": 2050 }, { "epoch": 1.7264309764309764, "grad_norm": 0.4154456555843353, "learning_rate": 9.359554085833662e-06, "loss": 0.407, "step": 2051 }, { "epoch": 1.7272727272727273, "grad_norm": 0.5028557181358337, "learning_rate": 9.358525957875964e-06, "loss": 0.4246, "step": 2052 }, { "epoch": 1.7281144781144782, "grad_norm": 0.4942321181297302, "learning_rate": 9.357497061899523e-06, "loss": 0.3835, "step": 2053 }, { "epoch": 1.7289562289562288, "grad_norm": 0.49582192301750183, "learning_rate": 9.356467398085642e-06, "loss": 0.4276, "step": 2054 }, { "epoch": 1.7297979797979797, "grad_norm": 0.4542258083820343, "learning_rate": 9.35543696661576e-06, "loss": 0.4195, "step": 2055 }, { "epoch": 1.7306397306397305, "grad_norm": 0.46224245429039, "learning_rate": 9.354405767671449e-06, "loss": 0.4288, "step": 2056 }, { "epoch": 1.7314814814814814, "grad_norm": 0.4600341022014618, "learning_rate": 9.353373801434418e-06, "loss": 0.4301, "step": 2057 }, { "epoch": 1.7323232323232323, "grad_norm": 0.46587133407592773, "learning_rate": 9.35234106808651e-06, "loss": 0.426, "step": 2058 }, { "epoch": 1.7331649831649831, "grad_norm": 0.4226820170879364, "learning_rate": 9.351307567809704e-06, "loss": 0.4089, "step": 2059 }, { "epoch": 1.734006734006734, "grad_norm": 0.45373234152793884, "learning_rate": 9.350273300786113e-06, "loss": 0.4513, "step": 2060 }, { "epoch": 1.7348484848484849, "grad_norm": 0.4547770023345947, "learning_rate": 9.349238267197986e-06, "loss": 0.4319, "step": 2061 }, { "epoch": 1.7356902356902357, "grad_norm": 0.4188601076602936, "learning_rate": 9.348202467227709e-06, "loss": 0.4228, "step": 2062 }, { "epoch": 1.7365319865319866, "grad_norm": 0.44111770391464233, "learning_rate": 9.347165901057798e-06, "loss": 0.4101, "step": 2063 }, { "epoch": 1.7373737373737375, "grad_norm": 0.5180520415306091, "learning_rate": 9.346128568870907e-06, "loss": 0.4005, "step": 2064 }, { "epoch": 1.7382154882154883, "grad_norm": 0.5100253820419312, "learning_rate": 9.345090470849827e-06, "loss": 0.4011, "step": 2065 }, { "epoch": 1.7390572390572392, "grad_norm": 0.4605889916419983, "learning_rate": 9.344051607177483e-06, "loss": 0.4261, "step": 2066 }, { "epoch": 1.73989898989899, "grad_norm": 0.45414865016937256, "learning_rate": 9.34301197803693e-06, "loss": 0.4122, "step": 2067 }, { "epoch": 1.7407407407407407, "grad_norm": 0.45866313576698303, "learning_rate": 9.341971583611364e-06, "loss": 0.4172, "step": 2068 }, { "epoch": 1.7415824915824916, "grad_norm": 0.4754038453102112, "learning_rate": 9.340930424084114e-06, "loss": 0.4461, "step": 2069 }, { "epoch": 1.7424242424242424, "grad_norm": 0.44079604744911194, "learning_rate": 9.339888499638643e-06, "loss": 0.3969, "step": 2070 }, { "epoch": 1.7432659932659933, "grad_norm": 0.4435809552669525, "learning_rate": 9.338845810458548e-06, "loss": 0.4063, "step": 2071 }, { "epoch": 1.7441077441077442, "grad_norm": 0.4560576379299164, "learning_rate": 9.337802356727563e-06, "loss": 0.4001, "step": 2072 }, { "epoch": 1.7449494949494948, "grad_norm": 0.42149680852890015, "learning_rate": 9.336758138629558e-06, "loss": 0.4244, "step": 2073 }, { "epoch": 1.7457912457912457, "grad_norm": 0.48382100462913513, "learning_rate": 9.33571315634853e-06, "loss": 0.4179, "step": 2074 }, { "epoch": 1.7466329966329965, "grad_norm": 0.49152910709381104, "learning_rate": 9.33466741006862e-06, "loss": 0.4448, "step": 2075 }, { "epoch": 1.7474747474747474, "grad_norm": 0.4386157691478729, "learning_rate": 9.333620899974098e-06, "loss": 0.4337, "step": 2076 }, { "epoch": 1.7483164983164983, "grad_norm": 0.43203070759773254, "learning_rate": 9.332573626249373e-06, "loss": 0.4193, "step": 2077 }, { "epoch": 1.7491582491582491, "grad_norm": 0.4363729953765869, "learning_rate": 9.331525589078982e-06, "loss": 0.3969, "step": 2078 }, { "epoch": 1.75, "grad_norm": 0.45445889234542847, "learning_rate": 9.330476788647602e-06, "loss": 0.402, "step": 2079 }, { "epoch": 1.7508417508417509, "grad_norm": 0.4190775752067566, "learning_rate": 9.329427225140042e-06, "loss": 0.414, "step": 2080 }, { "epoch": 1.7516835016835017, "grad_norm": 0.42270681262016296, "learning_rate": 9.328376898741249e-06, "loss": 0.3952, "step": 2081 }, { "epoch": 1.7525252525252526, "grad_norm": 0.515521228313446, "learning_rate": 9.327325809636298e-06, "loss": 0.451, "step": 2082 }, { "epoch": 1.7533670033670035, "grad_norm": 0.4288911819458008, "learning_rate": 9.326273958010406e-06, "loss": 0.3944, "step": 2083 }, { "epoch": 1.7542087542087543, "grad_norm": 0.465834379196167, "learning_rate": 9.325221344048916e-06, "loss": 0.4166, "step": 2084 }, { "epoch": 1.7550505050505052, "grad_norm": 0.44798704981803894, "learning_rate": 9.324167967937315e-06, "loss": 0.4448, "step": 2085 }, { "epoch": 1.7558922558922558, "grad_norm": 0.5069999098777771, "learning_rate": 9.323113829861214e-06, "loss": 0.4477, "step": 2086 }, { "epoch": 1.7567340067340067, "grad_norm": 0.4871479570865631, "learning_rate": 9.322058930006365e-06, "loss": 0.4231, "step": 2087 }, { "epoch": 1.7575757575757576, "grad_norm": 0.4294722080230713, "learning_rate": 9.321003268558654e-06, "loss": 0.4136, "step": 2088 }, { "epoch": 1.7584175084175084, "grad_norm": 0.49744436144828796, "learning_rate": 9.319946845704101e-06, "loss": 0.4158, "step": 2089 }, { "epoch": 1.7592592592592593, "grad_norm": 0.4677836298942566, "learning_rate": 9.318889661628856e-06, "loss": 0.3987, "step": 2090 }, { "epoch": 1.76010101010101, "grad_norm": 0.5044791102409363, "learning_rate": 9.317831716519206e-06, "loss": 0.409, "step": 2091 }, { "epoch": 1.7609427609427608, "grad_norm": 0.4754284918308258, "learning_rate": 9.316773010561574e-06, "loss": 0.4318, "step": 2092 }, { "epoch": 1.7617845117845117, "grad_norm": 0.5090442895889282, "learning_rate": 9.315713543942516e-06, "loss": 0.4296, "step": 2093 }, { "epoch": 1.7626262626262625, "grad_norm": 0.4514063000679016, "learning_rate": 9.314653316848716e-06, "loss": 0.3998, "step": 2094 }, { "epoch": 1.7634680134680134, "grad_norm": 0.4123072028160095, "learning_rate": 9.313592329467005e-06, "loss": 0.4179, "step": 2095 }, { "epoch": 1.7643097643097643, "grad_norm": 0.47004854679107666, "learning_rate": 9.312530581984335e-06, "loss": 0.4366, "step": 2096 }, { "epoch": 1.7651515151515151, "grad_norm": 0.4415125846862793, "learning_rate": 9.3114680745878e-06, "loss": 0.4307, "step": 2097 }, { "epoch": 1.765993265993266, "grad_norm": 0.523364782333374, "learning_rate": 9.310404807464622e-06, "loss": 0.4252, "step": 2098 }, { "epoch": 1.7668350168350169, "grad_norm": 0.4482495188713074, "learning_rate": 9.309340780802162e-06, "loss": 0.4078, "step": 2099 }, { "epoch": 1.7676767676767677, "grad_norm": 0.5005797743797302, "learning_rate": 9.308275994787915e-06, "loss": 0.3974, "step": 2100 }, { "epoch": 1.7685185185185186, "grad_norm": 0.4564092755317688, "learning_rate": 9.307210449609503e-06, "loss": 0.3868, "step": 2101 }, { "epoch": 1.7693602693602695, "grad_norm": 0.4775140881538391, "learning_rate": 9.306144145454689e-06, "loss": 0.3937, "step": 2102 }, { "epoch": 1.7702020202020203, "grad_norm": 0.5972254872322083, "learning_rate": 9.305077082511369e-06, "loss": 0.413, "step": 2103 }, { "epoch": 1.7710437710437712, "grad_norm": 0.5173206925392151, "learning_rate": 9.304009260967565e-06, "loss": 0.4028, "step": 2104 }, { "epoch": 1.7718855218855218, "grad_norm": 0.4578031599521637, "learning_rate": 9.302940681011447e-06, "loss": 0.3946, "step": 2105 }, { "epoch": 1.7727272727272727, "grad_norm": 0.4345819056034088, "learning_rate": 9.301871342831304e-06, "loss": 0.412, "step": 2106 }, { "epoch": 1.7735690235690236, "grad_norm": 0.46484488248825073, "learning_rate": 9.300801246615566e-06, "loss": 0.4191, "step": 2107 }, { "epoch": 1.7744107744107744, "grad_norm": 0.4650697708129883, "learning_rate": 9.299730392552793e-06, "loss": 0.3993, "step": 2108 }, { "epoch": 1.7752525252525253, "grad_norm": 0.4693724513053894, "learning_rate": 9.298658780831686e-06, "loss": 0.4237, "step": 2109 }, { "epoch": 1.776094276094276, "grad_norm": 0.4734954237937927, "learning_rate": 9.297586411641072e-06, "loss": 0.4044, "step": 2110 }, { "epoch": 1.7769360269360268, "grad_norm": 0.45823028683662415, "learning_rate": 9.296513285169915e-06, "loss": 0.4161, "step": 2111 }, { "epoch": 1.7777777777777777, "grad_norm": 0.5491986274719238, "learning_rate": 9.295439401607308e-06, "loss": 0.4191, "step": 2112 }, { "epoch": 1.7786195286195285, "grad_norm": 0.528437077999115, "learning_rate": 9.294364761142485e-06, "loss": 0.4067, "step": 2113 }, { "epoch": 1.7794612794612794, "grad_norm": 0.47081494331359863, "learning_rate": 9.293289363964805e-06, "loss": 0.4022, "step": 2114 }, { "epoch": 1.7803030303030303, "grad_norm": 0.5439700484275818, "learning_rate": 9.292213210263767e-06, "loss": 0.4144, "step": 2115 }, { "epoch": 1.7811447811447811, "grad_norm": 0.4736545979976654, "learning_rate": 9.291136300229002e-06, "loss": 0.3874, "step": 2116 }, { "epoch": 1.781986531986532, "grad_norm": 0.4628179967403412, "learning_rate": 9.29005863405027e-06, "loss": 0.4093, "step": 2117 }, { "epoch": 1.7828282828282829, "grad_norm": 0.579057514667511, "learning_rate": 9.28898021191747e-06, "loss": 0.4388, "step": 2118 }, { "epoch": 1.7836700336700337, "grad_norm": 0.42987802624702454, "learning_rate": 9.287901034020628e-06, "loss": 0.4015, "step": 2119 }, { "epoch": 1.7845117845117846, "grad_norm": 0.48043590784072876, "learning_rate": 9.286821100549907e-06, "loss": 0.4302, "step": 2120 }, { "epoch": 1.7853535353535355, "grad_norm": 0.50056391954422, "learning_rate": 9.285740411695608e-06, "loss": 0.4189, "step": 2121 }, { "epoch": 1.7861952861952863, "grad_norm": 0.5340574383735657, "learning_rate": 9.284658967648155e-06, "loss": 0.3889, "step": 2122 }, { "epoch": 1.7870370370370372, "grad_norm": 0.4134519100189209, "learning_rate": 9.283576768598111e-06, "loss": 0.4206, "step": 2123 }, { "epoch": 1.7878787878787878, "grad_norm": 0.4724647104740143, "learning_rate": 9.282493814736173e-06, "loss": 0.4076, "step": 2124 }, { "epoch": 1.7887205387205387, "grad_norm": 0.5739797949790955, "learning_rate": 9.281410106253166e-06, "loss": 0.4122, "step": 2125 }, { "epoch": 1.7895622895622896, "grad_norm": 0.43650585412979126, "learning_rate": 9.280325643340053e-06, "loss": 0.44, "step": 2126 }, { "epoch": 1.7904040404040404, "grad_norm": 0.5357071161270142, "learning_rate": 9.279240426187926e-06, "loss": 0.422, "step": 2127 }, { "epoch": 1.791245791245791, "grad_norm": 0.5699374675750732, "learning_rate": 9.278154454988014e-06, "loss": 0.3981, "step": 2128 }, { "epoch": 1.792087542087542, "grad_norm": 0.4702226519584656, "learning_rate": 9.277067729931677e-06, "loss": 0.4093, "step": 2129 }, { "epoch": 1.7929292929292928, "grad_norm": 0.4808195233345032, "learning_rate": 9.275980251210403e-06, "loss": 0.4116, "step": 2130 }, { "epoch": 1.7937710437710437, "grad_norm": 0.6422219276428223, "learning_rate": 9.274892019015825e-06, "loss": 0.4262, "step": 2131 }, { "epoch": 1.7946127946127945, "grad_norm": 0.4843781888484955, "learning_rate": 9.273803033539695e-06, "loss": 0.4292, "step": 2132 }, { "epoch": 1.7954545454545454, "grad_norm": 0.491632342338562, "learning_rate": 9.272713294973905e-06, "loss": 0.4268, "step": 2133 }, { "epoch": 1.7962962962962963, "grad_norm": 0.43286147713661194, "learning_rate": 9.27162280351048e-06, "loss": 0.398, "step": 2134 }, { "epoch": 1.7971380471380471, "grad_norm": 0.4756714999675751, "learning_rate": 9.270531559341573e-06, "loss": 0.4119, "step": 2135 }, { "epoch": 1.797979797979798, "grad_norm": 0.46841171383857727, "learning_rate": 9.269439562659479e-06, "loss": 0.4204, "step": 2136 }, { "epoch": 1.7988215488215489, "grad_norm": 0.47837719321250916, "learning_rate": 9.268346813656615e-06, "loss": 0.4306, "step": 2137 }, { "epoch": 1.7996632996632997, "grad_norm": 0.4741446077823639, "learning_rate": 9.267253312525533e-06, "loss": 0.4107, "step": 2138 }, { "epoch": 1.8005050505050506, "grad_norm": 0.42891356348991394, "learning_rate": 9.266159059458925e-06, "loss": 0.4256, "step": 2139 }, { "epoch": 1.8013468013468015, "grad_norm": 0.4920071065425873, "learning_rate": 9.265064054649606e-06, "loss": 0.4149, "step": 2140 }, { "epoch": 1.8021885521885523, "grad_norm": 0.49339017271995544, "learning_rate": 9.26396829829053e-06, "loss": 0.4088, "step": 2141 }, { "epoch": 1.803030303030303, "grad_norm": 0.4281088411808014, "learning_rate": 9.26287179057478e-06, "loss": 0.4353, "step": 2142 }, { "epoch": 1.8038720538720538, "grad_norm": 0.5419195294380188, "learning_rate": 9.261774531695572e-06, "loss": 0.417, "step": 2143 }, { "epoch": 1.8047138047138047, "grad_norm": 0.49070677161216736, "learning_rate": 9.260676521846255e-06, "loss": 0.3969, "step": 2144 }, { "epoch": 1.8055555555555556, "grad_norm": 0.5569455027580261, "learning_rate": 9.25957776122031e-06, "loss": 0.4303, "step": 2145 }, { "epoch": 1.8063973063973064, "grad_norm": 0.514457106590271, "learning_rate": 9.258478250011349e-06, "loss": 0.4201, "step": 2146 }, { "epoch": 1.807239057239057, "grad_norm": 0.48682448267936707, "learning_rate": 9.257377988413118e-06, "loss": 0.4001, "step": 2147 }, { "epoch": 1.808080808080808, "grad_norm": 0.5591537356376648, "learning_rate": 9.256276976619499e-06, "loss": 0.4281, "step": 2148 }, { "epoch": 1.8089225589225588, "grad_norm": 0.4856509864330292, "learning_rate": 9.255175214824498e-06, "loss": 0.412, "step": 2149 }, { "epoch": 1.8097643097643097, "grad_norm": 0.4449159801006317, "learning_rate": 9.254072703222257e-06, "loss": 0.4053, "step": 2150 }, { "epoch": 1.8106060606060606, "grad_norm": 0.5135862827301025, "learning_rate": 9.252969442007052e-06, "loss": 0.413, "step": 2151 }, { "epoch": 1.8114478114478114, "grad_norm": 0.5641700625419617, "learning_rate": 9.251865431373288e-06, "loss": 0.4478, "step": 2152 }, { "epoch": 1.8122895622895623, "grad_norm": 0.4975886642932892, "learning_rate": 9.250760671515505e-06, "loss": 0.4304, "step": 2153 }, { "epoch": 1.8131313131313131, "grad_norm": 0.61391282081604, "learning_rate": 9.24965516262837e-06, "loss": 0.4267, "step": 2154 }, { "epoch": 1.813973063973064, "grad_norm": 0.5287446975708008, "learning_rate": 9.24854890490669e-06, "loss": 0.4118, "step": 2155 }, { "epoch": 1.8148148148148149, "grad_norm": 0.49215781688690186, "learning_rate": 9.247441898545398e-06, "loss": 0.4159, "step": 2156 }, { "epoch": 1.8156565656565657, "grad_norm": 0.5632743835449219, "learning_rate": 9.246334143739558e-06, "loss": 0.4132, "step": 2157 }, { "epoch": 1.8164983164983166, "grad_norm": 0.4993419051170349, "learning_rate": 9.24522564068437e-06, "loss": 0.4108, "step": 2158 }, { "epoch": 1.8173400673400675, "grad_norm": 0.5005595088005066, "learning_rate": 9.244116389575166e-06, "loss": 0.4114, "step": 2159 }, { "epoch": 1.8181818181818183, "grad_norm": 0.5482595562934875, "learning_rate": 9.243006390607403e-06, "loss": 0.42, "step": 2160 }, { "epoch": 1.819023569023569, "grad_norm": 0.4556275010108948, "learning_rate": 9.24189564397668e-06, "loss": 0.4208, "step": 2161 }, { "epoch": 1.8198653198653199, "grad_norm": 0.5058465600013733, "learning_rate": 9.240784149878718e-06, "loss": 0.4021, "step": 2162 }, { "epoch": 1.8207070707070707, "grad_norm": 0.4833398461341858, "learning_rate": 9.239671908509378e-06, "loss": 0.4071, "step": 2163 }, { "epoch": 1.8215488215488216, "grad_norm": 0.4671793580055237, "learning_rate": 9.238558920064645e-06, "loss": 0.4097, "step": 2164 }, { "epoch": 1.8223905723905722, "grad_norm": 0.49806857109069824, "learning_rate": 9.237445184740643e-06, "loss": 0.4151, "step": 2165 }, { "epoch": 1.823232323232323, "grad_norm": 0.5938169956207275, "learning_rate": 9.236330702733621e-06, "loss": 0.4142, "step": 2166 }, { "epoch": 1.824074074074074, "grad_norm": 0.5032863616943359, "learning_rate": 9.235215474239963e-06, "loss": 0.4044, "step": 2167 }, { "epoch": 1.8249158249158248, "grad_norm": 0.5091928839683533, "learning_rate": 9.234099499456187e-06, "loss": 0.3958, "step": 2168 }, { "epoch": 1.8257575757575757, "grad_norm": 0.5959049463272095, "learning_rate": 9.232982778578936e-06, "loss": 0.412, "step": 2169 }, { "epoch": 1.8265993265993266, "grad_norm": 0.49238938093185425, "learning_rate": 9.231865311804991e-06, "loss": 0.4207, "step": 2170 }, { "epoch": 1.8274410774410774, "grad_norm": 0.5868483781814575, "learning_rate": 9.23074709933126e-06, "loss": 0.4268, "step": 2171 }, { "epoch": 1.8282828282828283, "grad_norm": 0.4670020043849945, "learning_rate": 9.229628141354783e-06, "loss": 0.3958, "step": 2172 }, { "epoch": 1.8291245791245792, "grad_norm": 0.4719027578830719, "learning_rate": 9.228508438072737e-06, "loss": 0.4265, "step": 2173 }, { "epoch": 1.82996632996633, "grad_norm": 0.5334663391113281, "learning_rate": 9.22738798968242e-06, "loss": 0.4268, "step": 2174 }, { "epoch": 1.8308080808080809, "grad_norm": 0.4944954812526703, "learning_rate": 9.226266796381268e-06, "loss": 0.3892, "step": 2175 }, { "epoch": 1.8316498316498318, "grad_norm": 0.4247192442417145, "learning_rate": 9.22514485836685e-06, "loss": 0.3902, "step": 2176 }, { "epoch": 1.8324915824915826, "grad_norm": 0.5430282354354858, "learning_rate": 9.224022175836862e-06, "loss": 0.4122, "step": 2177 }, { "epoch": 1.8333333333333335, "grad_norm": 0.4930962920188904, "learning_rate": 9.222898748989133e-06, "loss": 0.4068, "step": 2178 }, { "epoch": 1.8341750841750841, "grad_norm": 0.5023189783096313, "learning_rate": 9.22177457802162e-06, "loss": 0.4124, "step": 2179 }, { "epoch": 1.835016835016835, "grad_norm": 0.49180325865745544, "learning_rate": 9.22064966313242e-06, "loss": 0.4001, "step": 2180 }, { "epoch": 1.8358585858585859, "grad_norm": 0.44990330934524536, "learning_rate": 9.219524004519749e-06, "loss": 0.4222, "step": 2181 }, { "epoch": 1.8367003367003367, "grad_norm": 0.4702785313129425, "learning_rate": 9.218397602381961e-06, "loss": 0.4223, "step": 2182 }, { "epoch": 1.8375420875420876, "grad_norm": 0.5035222768783569, "learning_rate": 9.217270456917543e-06, "loss": 0.3966, "step": 2183 }, { "epoch": 1.8383838383838382, "grad_norm": 0.4754098355770111, "learning_rate": 9.216142568325109e-06, "loss": 0.414, "step": 2184 }, { "epoch": 1.839225589225589, "grad_norm": 0.5164647102355957, "learning_rate": 9.215013936803403e-06, "loss": 0.4551, "step": 2185 }, { "epoch": 1.84006734006734, "grad_norm": 0.4235841631889343, "learning_rate": 9.213884562551304e-06, "loss": 0.4033, "step": 2186 }, { "epoch": 1.8409090909090908, "grad_norm": 0.529967188835144, "learning_rate": 9.21275444576782e-06, "loss": 0.4492, "step": 2187 }, { "epoch": 1.8417508417508417, "grad_norm": 0.46820294857025146, "learning_rate": 9.211623586652086e-06, "loss": 0.4274, "step": 2188 }, { "epoch": 1.8425925925925926, "grad_norm": 0.5166777968406677, "learning_rate": 9.210491985403375e-06, "loss": 0.4091, "step": 2189 }, { "epoch": 1.8434343434343434, "grad_norm": 0.463363379240036, "learning_rate": 9.209359642221089e-06, "loss": 0.4166, "step": 2190 }, { "epoch": 1.8442760942760943, "grad_norm": 0.48729169368743896, "learning_rate": 9.208226557304754e-06, "loss": 0.4105, "step": 2191 }, { "epoch": 1.8451178451178452, "grad_norm": 0.47030913829803467, "learning_rate": 9.207092730854034e-06, "loss": 0.4149, "step": 2192 }, { "epoch": 1.845959595959596, "grad_norm": 0.47815990447998047, "learning_rate": 9.205958163068723e-06, "loss": 0.4375, "step": 2193 }, { "epoch": 1.8468013468013469, "grad_norm": 0.45777517557144165, "learning_rate": 9.204822854148738e-06, "loss": 0.3765, "step": 2194 }, { "epoch": 1.8476430976430978, "grad_norm": 0.4336608946323395, "learning_rate": 9.203686804294141e-06, "loss": 0.4248, "step": 2195 }, { "epoch": 1.8484848484848486, "grad_norm": 0.463859498500824, "learning_rate": 9.202550013705109e-06, "loss": 0.4222, "step": 2196 }, { "epoch": 1.8493265993265995, "grad_norm": 0.4822160601615906, "learning_rate": 9.201412482581962e-06, "loss": 0.4169, "step": 2197 }, { "epoch": 1.8501683501683501, "grad_norm": 0.47922903299331665, "learning_rate": 9.20027421112514e-06, "loss": 0.4114, "step": 2198 }, { "epoch": 1.851010101010101, "grad_norm": 0.48215171694755554, "learning_rate": 9.199135199535223e-06, "loss": 0.399, "step": 2199 }, { "epoch": 1.8518518518518519, "grad_norm": 0.4376847743988037, "learning_rate": 9.197995448012912e-06, "loss": 0.4031, "step": 2200 }, { "epoch": 1.8526936026936027, "grad_norm": 0.4564366042613983, "learning_rate": 9.196854956759051e-06, "loss": 0.4006, "step": 2201 }, { "epoch": 1.8535353535353534, "grad_norm": 0.42948344349861145, "learning_rate": 9.1957137259746e-06, "loss": 0.4091, "step": 2202 }, { "epoch": 1.8543771043771042, "grad_norm": 0.5097257494926453, "learning_rate": 9.194571755860658e-06, "loss": 0.4187, "step": 2203 }, { "epoch": 1.855218855218855, "grad_norm": 0.38655105233192444, "learning_rate": 9.193429046618452e-06, "loss": 0.4012, "step": 2204 }, { "epoch": 1.856060606060606, "grad_norm": 0.4467943012714386, "learning_rate": 9.192285598449342e-06, "loss": 0.4112, "step": 2205 }, { "epoch": 1.8569023569023568, "grad_norm": 0.4388805329799652, "learning_rate": 9.191141411554813e-06, "loss": 0.4084, "step": 2206 }, { "epoch": 1.8577441077441077, "grad_norm": 0.44611093401908875, "learning_rate": 9.189996486136484e-06, "loss": 0.4292, "step": 2207 }, { "epoch": 1.8585858585858586, "grad_norm": 0.46619725227355957, "learning_rate": 9.1888508223961e-06, "loss": 0.402, "step": 2208 }, { "epoch": 1.8594276094276094, "grad_norm": 0.44821661710739136, "learning_rate": 9.187704420535546e-06, "loss": 0.4136, "step": 2209 }, { "epoch": 1.8602693602693603, "grad_norm": 0.4947660267353058, "learning_rate": 9.186557280756823e-06, "loss": 0.4278, "step": 2210 }, { "epoch": 1.8611111111111112, "grad_norm": 0.45060476660728455, "learning_rate": 9.185409403262074e-06, "loss": 0.4228, "step": 2211 }, { "epoch": 1.861952861952862, "grad_norm": 0.4570328891277313, "learning_rate": 9.184260788253565e-06, "loss": 0.4237, "step": 2212 }, { "epoch": 1.862794612794613, "grad_norm": 0.47995758056640625, "learning_rate": 9.183111435933696e-06, "loss": 0.413, "step": 2213 }, { "epoch": 1.8636363636363638, "grad_norm": 0.4446435868740082, "learning_rate": 9.181961346504993e-06, "loss": 0.3976, "step": 2214 }, { "epoch": 1.8644781144781146, "grad_norm": 0.44256511330604553, "learning_rate": 9.180810520170117e-06, "loss": 0.4062, "step": 2215 }, { "epoch": 1.8653198653198653, "grad_norm": 0.4937680661678314, "learning_rate": 9.179658957131851e-06, "loss": 0.4352, "step": 2216 }, { "epoch": 1.8661616161616161, "grad_norm": 0.4791995882987976, "learning_rate": 9.178506657593119e-06, "loss": 0.4311, "step": 2217 }, { "epoch": 1.867003367003367, "grad_norm": 0.4189358651638031, "learning_rate": 9.177353621756963e-06, "loss": 0.4063, "step": 2218 }, { "epoch": 1.8678451178451179, "grad_norm": 0.44479140639305115, "learning_rate": 9.176199849826563e-06, "loss": 0.4235, "step": 2219 }, { "epoch": 1.8686868686868687, "grad_norm": 0.45939651131629944, "learning_rate": 9.175045342005225e-06, "loss": 0.4207, "step": 2220 }, { "epoch": 1.8695286195286194, "grad_norm": 0.43709033727645874, "learning_rate": 9.173890098496386e-06, "loss": 0.4221, "step": 2221 }, { "epoch": 1.8703703703703702, "grad_norm": 0.42740532755851746, "learning_rate": 9.172734119503612e-06, "loss": 0.3955, "step": 2222 }, { "epoch": 1.871212121212121, "grad_norm": 0.5072750449180603, "learning_rate": 9.171577405230598e-06, "loss": 0.4523, "step": 2223 }, { "epoch": 1.872053872053872, "grad_norm": 0.4535278081893921, "learning_rate": 9.170419955881172e-06, "loss": 0.4285, "step": 2224 }, { "epoch": 1.8728956228956228, "grad_norm": 0.4563673734664917, "learning_rate": 9.169261771659286e-06, "loss": 0.4081, "step": 2225 }, { "epoch": 1.8737373737373737, "grad_norm": 0.46163609623908997, "learning_rate": 9.168102852769026e-06, "loss": 0.3924, "step": 2226 }, { "epoch": 1.8745791245791246, "grad_norm": 0.46989428997039795, "learning_rate": 9.166943199414605e-06, "loss": 0.4389, "step": 2227 }, { "epoch": 1.8754208754208754, "grad_norm": 0.4567207098007202, "learning_rate": 9.165782811800365e-06, "loss": 0.388, "step": 2228 }, { "epoch": 1.8762626262626263, "grad_norm": 0.49187102913856506, "learning_rate": 9.164621690130784e-06, "loss": 0.4281, "step": 2229 }, { "epoch": 1.8771043771043772, "grad_norm": 0.4542689621448517, "learning_rate": 9.163459834610458e-06, "loss": 0.4118, "step": 2230 }, { "epoch": 1.877946127946128, "grad_norm": 0.4671179950237274, "learning_rate": 9.162297245444121e-06, "loss": 0.4024, "step": 2231 }, { "epoch": 1.878787878787879, "grad_norm": 0.5034841299057007, "learning_rate": 9.161133922836634e-06, "loss": 0.4039, "step": 2232 }, { "epoch": 1.8796296296296298, "grad_norm": 0.48552101850509644, "learning_rate": 9.159969866992986e-06, "loss": 0.4245, "step": 2233 }, { "epoch": 1.8804713804713806, "grad_norm": 0.5012979507446289, "learning_rate": 9.158805078118296e-06, "loss": 0.4199, "step": 2234 }, { "epoch": 1.8813131313131313, "grad_norm": 0.5548691749572754, "learning_rate": 9.157639556417814e-06, "loss": 0.4218, "step": 2235 }, { "epoch": 1.8821548821548821, "grad_norm": 0.4988200068473816, "learning_rate": 9.156473302096914e-06, "loss": 0.4149, "step": 2236 }, { "epoch": 1.882996632996633, "grad_norm": 0.5110421776771545, "learning_rate": 9.155306315361105e-06, "loss": 0.3966, "step": 2237 }, { "epoch": 1.8838383838383839, "grad_norm": 0.4907882809638977, "learning_rate": 9.15413859641602e-06, "loss": 0.3995, "step": 2238 }, { "epoch": 1.8846801346801347, "grad_norm": 0.4983086884021759, "learning_rate": 9.152970145467427e-06, "loss": 0.4153, "step": 2239 }, { "epoch": 1.8855218855218854, "grad_norm": 0.4434485137462616, "learning_rate": 9.151800962721218e-06, "loss": 0.4103, "step": 2240 }, { "epoch": 1.8863636363636362, "grad_norm": 0.4016505181789398, "learning_rate": 9.150631048383415e-06, "loss": 0.4262, "step": 2241 }, { "epoch": 1.887205387205387, "grad_norm": 0.5059490203857422, "learning_rate": 9.14946040266017e-06, "loss": 0.4233, "step": 2242 }, { "epoch": 1.888047138047138, "grad_norm": 0.4201311767101288, "learning_rate": 9.148289025757762e-06, "loss": 0.3986, "step": 2243 }, { "epoch": 1.8888888888888888, "grad_norm": 0.48924776911735535, "learning_rate": 9.147116917882601e-06, "loss": 0.4127, "step": 2244 }, { "epoch": 1.8897306397306397, "grad_norm": 0.479098379611969, "learning_rate": 9.145944079241225e-06, "loss": 0.4123, "step": 2245 }, { "epoch": 1.8905723905723906, "grad_norm": 0.48632684350013733, "learning_rate": 9.144770510040302e-06, "loss": 0.4282, "step": 2246 }, { "epoch": 1.8914141414141414, "grad_norm": 0.45241427421569824, "learning_rate": 9.143596210486624e-06, "loss": 0.4159, "step": 2247 }, { "epoch": 1.8922558922558923, "grad_norm": 0.5111252665519714, "learning_rate": 9.142421180787117e-06, "loss": 0.4099, "step": 2248 }, { "epoch": 1.8930976430976432, "grad_norm": 0.549390435218811, "learning_rate": 9.141245421148837e-06, "loss": 0.4236, "step": 2249 }, { "epoch": 1.893939393939394, "grad_norm": 0.4973377287387848, "learning_rate": 9.14006893177896e-06, "loss": 0.3967, "step": 2250 }, { "epoch": 1.894781144781145, "grad_norm": 0.5448245406150818, "learning_rate": 9.138891712884797e-06, "loss": 0.4376, "step": 2251 }, { "epoch": 1.8956228956228958, "grad_norm": 0.500593900680542, "learning_rate": 9.13771376467379e-06, "loss": 0.4127, "step": 2252 }, { "epoch": 1.8964646464646466, "grad_norm": 0.4738689363002777, "learning_rate": 9.136535087353502e-06, "loss": 0.4201, "step": 2253 }, { "epoch": 1.8973063973063973, "grad_norm": 0.42859819531440735, "learning_rate": 9.135355681131632e-06, "loss": 0.4157, "step": 2254 }, { "epoch": 1.8981481481481481, "grad_norm": 0.4717070162296295, "learning_rate": 9.134175546215999e-06, "loss": 0.4207, "step": 2255 }, { "epoch": 1.898989898989899, "grad_norm": 0.4088820815086365, "learning_rate": 9.132994682814562e-06, "loss": 0.4018, "step": 2256 }, { "epoch": 1.8998316498316499, "grad_norm": 0.5758814215660095, "learning_rate": 9.131813091135397e-06, "loss": 0.4215, "step": 2257 }, { "epoch": 1.9006734006734005, "grad_norm": 0.4627901315689087, "learning_rate": 9.130630771386716e-06, "loss": 0.406, "step": 2258 }, { "epoch": 1.9015151515151514, "grad_norm": 0.4836151897907257, "learning_rate": 9.129447723776853e-06, "loss": 0.4127, "step": 2259 }, { "epoch": 1.9023569023569022, "grad_norm": 0.4485929608345032, "learning_rate": 9.128263948514273e-06, "loss": 0.396, "step": 2260 }, { "epoch": 1.9031986531986531, "grad_norm": 0.4933216869831085, "learning_rate": 9.127079445807576e-06, "loss": 0.4146, "step": 2261 }, { "epoch": 1.904040404040404, "grad_norm": 0.4852791726589203, "learning_rate": 9.125894215865478e-06, "loss": 0.4126, "step": 2262 }, { "epoch": 1.9048821548821548, "grad_norm": 0.560177206993103, "learning_rate": 9.124708258896831e-06, "loss": 0.4266, "step": 2263 }, { "epoch": 1.9057239057239057, "grad_norm": 0.4936082661151886, "learning_rate": 9.123521575110615e-06, "loss": 0.4015, "step": 2264 }, { "epoch": 1.9065656565656566, "grad_norm": 0.42276954650878906, "learning_rate": 9.122334164715935e-06, "loss": 0.3995, "step": 2265 }, { "epoch": 1.9074074074074074, "grad_norm": 0.483418345451355, "learning_rate": 9.121146027922023e-06, "loss": 0.4275, "step": 2266 }, { "epoch": 1.9082491582491583, "grad_norm": 0.5623087882995605, "learning_rate": 9.119957164938247e-06, "loss": 0.4144, "step": 2267 }, { "epoch": 1.9090909090909092, "grad_norm": 0.5205636024475098, "learning_rate": 9.118767575974091e-06, "loss": 0.4015, "step": 2268 }, { "epoch": 1.90993265993266, "grad_norm": 0.5009389519691467, "learning_rate": 9.117577261239177e-06, "loss": 0.3984, "step": 2269 }, { "epoch": 1.910774410774411, "grad_norm": 0.5094627737998962, "learning_rate": 9.11638622094325e-06, "loss": 0.4244, "step": 2270 }, { "epoch": 1.9116161616161618, "grad_norm": 0.5754061341285706, "learning_rate": 9.115194455296187e-06, "loss": 0.4175, "step": 2271 }, { "epoch": 1.9124579124579124, "grad_norm": 0.4598313868045807, "learning_rate": 9.114001964507983e-06, "loss": 0.4093, "step": 2272 }, { "epoch": 1.9132996632996633, "grad_norm": 0.44699010252952576, "learning_rate": 9.112808748788776e-06, "loss": 0.3978, "step": 2273 }, { "epoch": 1.9141414141414141, "grad_norm": 0.5282257199287415, "learning_rate": 9.111614808348815e-06, "loss": 0.4048, "step": 2274 }, { "epoch": 1.914983164983165, "grad_norm": 0.5676331520080566, "learning_rate": 9.11042014339849e-06, "loss": 0.4345, "step": 2275 }, { "epoch": 1.9158249158249159, "grad_norm": 0.48413246870040894, "learning_rate": 9.109224754148315e-06, "loss": 0.4064, "step": 2276 }, { "epoch": 1.9166666666666665, "grad_norm": 0.47851401567459106, "learning_rate": 9.108028640808927e-06, "loss": 0.4112, "step": 2277 }, { "epoch": 1.9175084175084174, "grad_norm": 0.6386982798576355, "learning_rate": 9.106831803591094e-06, "loss": 0.4233, "step": 2278 }, { "epoch": 1.9183501683501682, "grad_norm": 0.5331704020500183, "learning_rate": 9.105634242705715e-06, "loss": 0.4105, "step": 2279 }, { "epoch": 1.9191919191919191, "grad_norm": 0.5036085247993469, "learning_rate": 9.104435958363808e-06, "loss": 0.4231, "step": 2280 }, { "epoch": 1.92003367003367, "grad_norm": 0.5522636771202087, "learning_rate": 9.103236950776528e-06, "loss": 0.4248, "step": 2281 }, { "epoch": 1.9208754208754208, "grad_norm": 0.4968482553958893, "learning_rate": 9.10203722015515e-06, "loss": 0.3974, "step": 2282 }, { "epoch": 1.9217171717171717, "grad_norm": 0.5288631916046143, "learning_rate": 9.100836766711079e-06, "loss": 0.4274, "step": 2283 }, { "epoch": 1.9225589225589226, "grad_norm": 0.5228056311607361, "learning_rate": 9.099635590655851e-06, "loss": 0.4164, "step": 2284 }, { "epoch": 1.9234006734006734, "grad_norm": 0.5266386270523071, "learning_rate": 9.098433692201123e-06, "loss": 0.4328, "step": 2285 }, { "epoch": 1.9242424242424243, "grad_norm": 0.5004182457923889, "learning_rate": 9.097231071558683e-06, "loss": 0.4, "step": 2286 }, { "epoch": 1.9250841750841752, "grad_norm": 0.44438859820365906, "learning_rate": 9.096027728940447e-06, "loss": 0.4168, "step": 2287 }, { "epoch": 1.925925925925926, "grad_norm": 0.48876306414604187, "learning_rate": 9.094823664558455e-06, "loss": 0.4141, "step": 2288 }, { "epoch": 1.926767676767677, "grad_norm": 0.46203288435935974, "learning_rate": 9.093618878624877e-06, "loss": 0.4178, "step": 2289 }, { "epoch": 1.9276094276094278, "grad_norm": 0.4540310204029083, "learning_rate": 9.092413371352009e-06, "loss": 0.4329, "step": 2290 }, { "epoch": 1.9284511784511784, "grad_norm": 0.41050752997398376, "learning_rate": 9.091207142952273e-06, "loss": 0.4131, "step": 2291 }, { "epoch": 1.9292929292929293, "grad_norm": 0.4764178693294525, "learning_rate": 9.090000193638221e-06, "loss": 0.41, "step": 2292 }, { "epoch": 1.9301346801346801, "grad_norm": 0.45785030722618103, "learning_rate": 9.088792523622529e-06, "loss": 0.409, "step": 2293 }, { "epoch": 1.930976430976431, "grad_norm": 0.4739355742931366, "learning_rate": 9.087584133118004e-06, "loss": 0.4086, "step": 2294 }, { "epoch": 1.9318181818181817, "grad_norm": 0.47115257382392883, "learning_rate": 9.086375022337572e-06, "loss": 0.405, "step": 2295 }, { "epoch": 1.9326599326599325, "grad_norm": 0.4706433415412903, "learning_rate": 9.085165191494297e-06, "loss": 0.424, "step": 2296 }, { "epoch": 1.9335016835016834, "grad_norm": 0.45058536529541016, "learning_rate": 9.08395464080136e-06, "loss": 0.4156, "step": 2297 }, { "epoch": 1.9343434343434343, "grad_norm": 0.44317424297332764, "learning_rate": 9.082743370472075e-06, "loss": 0.4095, "step": 2298 }, { "epoch": 1.9351851851851851, "grad_norm": 0.47244030237197876, "learning_rate": 9.081531380719882e-06, "loss": 0.4328, "step": 2299 }, { "epoch": 1.936026936026936, "grad_norm": 0.47624388337135315, "learning_rate": 9.080318671758343e-06, "loss": 0.4185, "step": 2300 }, { "epoch": 1.9368686868686869, "grad_norm": 0.48238301277160645, "learning_rate": 9.079105243801154e-06, "loss": 0.3938, "step": 2301 }, { "epoch": 1.9377104377104377, "grad_norm": 0.49177247285842896, "learning_rate": 9.077891097062131e-06, "loss": 0.4264, "step": 2302 }, { "epoch": 1.9385521885521886, "grad_norm": 0.4980221688747406, "learning_rate": 9.07667623175522e-06, "loss": 0.4101, "step": 2303 }, { "epoch": 1.9393939393939394, "grad_norm": 0.48890164494514465, "learning_rate": 9.075460648094492e-06, "loss": 0.426, "step": 2304 }, { "epoch": 1.9402356902356903, "grad_norm": 0.49409013986587524, "learning_rate": 9.074244346294149e-06, "loss": 0.4299, "step": 2305 }, { "epoch": 1.9410774410774412, "grad_norm": 0.49267488718032837, "learning_rate": 9.073027326568516e-06, "loss": 0.4048, "step": 2306 }, { "epoch": 1.941919191919192, "grad_norm": 0.49952319264411926, "learning_rate": 9.071809589132043e-06, "loss": 0.4254, "step": 2307 }, { "epoch": 1.942760942760943, "grad_norm": 0.4638674259185791, "learning_rate": 9.07059113419931e-06, "loss": 0.4136, "step": 2308 }, { "epoch": 1.9436026936026936, "grad_norm": 0.44378602504730225, "learning_rate": 9.06937196198502e-06, "loss": 0.3938, "step": 2309 }, { "epoch": 1.9444444444444444, "grad_norm": 0.47336140275001526, "learning_rate": 9.068152072704007e-06, "loss": 0.3982, "step": 2310 }, { "epoch": 1.9452861952861953, "grad_norm": 0.46364858746528625, "learning_rate": 9.066931466571222e-06, "loss": 0.3913, "step": 2311 }, { "epoch": 1.9461279461279462, "grad_norm": 0.491619735956192, "learning_rate": 9.065710143801759e-06, "loss": 0.4147, "step": 2312 }, { "epoch": 1.946969696969697, "grad_norm": 0.4725497364997864, "learning_rate": 9.064488104610819e-06, "loss": 0.4402, "step": 2313 }, { "epoch": 1.9478114478114477, "grad_norm": 0.43664082884788513, "learning_rate": 9.063265349213742e-06, "loss": 0.4116, "step": 2314 }, { "epoch": 1.9486531986531985, "grad_norm": 0.4683820605278015, "learning_rate": 9.06204187782599e-06, "loss": 0.4271, "step": 2315 }, { "epoch": 1.9494949494949494, "grad_norm": 0.4328610301017761, "learning_rate": 9.060817690663153e-06, "loss": 0.399, "step": 2316 }, { "epoch": 1.9503367003367003, "grad_norm": 0.4784926772117615, "learning_rate": 9.059592787940942e-06, "loss": 0.4181, "step": 2317 }, { "epoch": 1.9511784511784511, "grad_norm": 0.4187324047088623, "learning_rate": 9.058367169875205e-06, "loss": 0.409, "step": 2318 }, { "epoch": 1.952020202020202, "grad_norm": 0.4748072326183319, "learning_rate": 9.057140836681901e-06, "loss": 0.4151, "step": 2319 }, { "epoch": 1.9528619528619529, "grad_norm": 0.5079016089439392, "learning_rate": 9.055913788577128e-06, "loss": 0.4142, "step": 2320 }, { "epoch": 1.9537037037037037, "grad_norm": 0.43032604455947876, "learning_rate": 9.054686025777106e-06, "loss": 0.4022, "step": 2321 }, { "epoch": 1.9545454545454546, "grad_norm": 0.5004208087921143, "learning_rate": 9.053457548498175e-06, "loss": 0.4143, "step": 2322 }, { "epoch": 1.9553872053872055, "grad_norm": 0.44209179282188416, "learning_rate": 9.052228356956809e-06, "loss": 0.4092, "step": 2323 }, { "epoch": 1.9562289562289563, "grad_norm": 0.5251572728157043, "learning_rate": 9.050998451369603e-06, "loss": 0.4181, "step": 2324 }, { "epoch": 1.9570707070707072, "grad_norm": 0.4163760244846344, "learning_rate": 9.049767831953281e-06, "loss": 0.4008, "step": 2325 }, { "epoch": 1.957912457912458, "grad_norm": 0.4697512686252594, "learning_rate": 9.048536498924693e-06, "loss": 0.4426, "step": 2326 }, { "epoch": 1.958754208754209, "grad_norm": 0.4706752896308899, "learning_rate": 9.047304452500808e-06, "loss": 0.4102, "step": 2327 }, { "epoch": 1.9595959595959596, "grad_norm": 0.49708035588264465, "learning_rate": 9.04607169289873e-06, "loss": 0.4092, "step": 2328 }, { "epoch": 1.9604377104377104, "grad_norm": 0.4177589416503906, "learning_rate": 9.044838220335681e-06, "loss": 0.3981, "step": 2329 }, { "epoch": 1.9612794612794613, "grad_norm": 0.4991239905357361, "learning_rate": 9.043604035029016e-06, "loss": 0.4047, "step": 2330 }, { "epoch": 1.9621212121212122, "grad_norm": 0.48381227254867554, "learning_rate": 9.042369137196209e-06, "loss": 0.4196, "step": 2331 }, { "epoch": 1.9629629629629628, "grad_norm": 0.46358948945999146, "learning_rate": 9.041133527054861e-06, "loss": 0.4189, "step": 2332 }, { "epoch": 1.9638047138047137, "grad_norm": 0.4655840992927551, "learning_rate": 9.039897204822704e-06, "loss": 0.4348, "step": 2333 }, { "epoch": 1.9646464646464645, "grad_norm": 0.523273766040802, "learning_rate": 9.038660170717586e-06, "loss": 0.4025, "step": 2334 }, { "epoch": 1.9654882154882154, "grad_norm": 0.46653231978416443, "learning_rate": 9.03742242495749e-06, "loss": 0.4105, "step": 2335 }, { "epoch": 1.9663299663299663, "grad_norm": 0.46157893538475037, "learning_rate": 9.036183967760514e-06, "loss": 0.4271, "step": 2336 }, { "epoch": 1.9671717171717171, "grad_norm": 0.4902266561985016, "learning_rate": 9.034944799344895e-06, "loss": 0.4417, "step": 2337 }, { "epoch": 1.968013468013468, "grad_norm": 0.48366779088974, "learning_rate": 9.033704919928984e-06, "loss": 0.4037, "step": 2338 }, { "epoch": 1.9688552188552189, "grad_norm": 0.46482041478157043, "learning_rate": 9.032464329731261e-06, "loss": 0.4043, "step": 2339 }, { "epoch": 1.9696969696969697, "grad_norm": 0.5075863599777222, "learning_rate": 9.031223028970331e-06, "loss": 0.3966, "step": 2340 }, { "epoch": 1.9705387205387206, "grad_norm": 0.5154946446418762, "learning_rate": 9.029981017864924e-06, "loss": 0.3931, "step": 2341 }, { "epoch": 1.9713804713804715, "grad_norm": 0.4395560324192047, "learning_rate": 9.028738296633897e-06, "loss": 0.3932, "step": 2342 }, { "epoch": 1.9722222222222223, "grad_norm": 0.49620383977890015, "learning_rate": 9.02749486549623e-06, "loss": 0.4219, "step": 2343 }, { "epoch": 1.9730639730639732, "grad_norm": 0.5282461643218994, "learning_rate": 9.026250724671029e-06, "loss": 0.411, "step": 2344 }, { "epoch": 1.973905723905724, "grad_norm": 0.5355737805366516, "learning_rate": 9.025005874377525e-06, "loss": 0.4106, "step": 2345 }, { "epoch": 1.9747474747474747, "grad_norm": 0.41900357604026794, "learning_rate": 9.023760314835073e-06, "loss": 0.402, "step": 2346 }, { "epoch": 1.9755892255892256, "grad_norm": 0.4956151247024536, "learning_rate": 9.022514046263154e-06, "loss": 0.4243, "step": 2347 }, { "epoch": 1.9764309764309764, "grad_norm": 0.4954608380794525, "learning_rate": 9.021267068881376e-06, "loss": 0.4197, "step": 2348 }, { "epoch": 1.9772727272727273, "grad_norm": 0.47773227095603943, "learning_rate": 9.020019382909465e-06, "loss": 0.431, "step": 2349 }, { "epoch": 1.9781144781144782, "grad_norm": 0.5965384244918823, "learning_rate": 9.018770988567284e-06, "loss": 0.4294, "step": 2350 }, { "epoch": 1.9789562289562288, "grad_norm": 0.5048604011535645, "learning_rate": 9.017521886074807e-06, "loss": 0.3969, "step": 2351 }, { "epoch": 1.9797979797979797, "grad_norm": 0.5060854554176331, "learning_rate": 9.01627207565214e-06, "loss": 0.4265, "step": 2352 }, { "epoch": 1.9806397306397305, "grad_norm": 0.45323777198791504, "learning_rate": 9.015021557519515e-06, "loss": 0.4009, "step": 2353 }, { "epoch": 1.9814814814814814, "grad_norm": 0.5521825551986694, "learning_rate": 9.013770331897287e-06, "loss": 0.4028, "step": 2354 }, { "epoch": 1.9823232323232323, "grad_norm": 0.43151921033859253, "learning_rate": 9.012518399005932e-06, "loss": 0.4365, "step": 2355 }, { "epoch": 1.9831649831649831, "grad_norm": 0.5465832352638245, "learning_rate": 9.011265759066057e-06, "loss": 0.4176, "step": 2356 }, { "epoch": 1.984006734006734, "grad_norm": 0.5330916047096252, "learning_rate": 9.01001241229839e-06, "loss": 0.4228, "step": 2357 }, { "epoch": 1.9848484848484849, "grad_norm": 0.49160170555114746, "learning_rate": 9.008758358923782e-06, "loss": 0.4173, "step": 2358 }, { "epoch": 1.9856902356902357, "grad_norm": 0.542905330657959, "learning_rate": 9.007503599163212e-06, "loss": 0.4115, "step": 2359 }, { "epoch": 1.9865319865319866, "grad_norm": 0.5112441778182983, "learning_rate": 9.006248133237783e-06, "loss": 0.412, "step": 2360 }, { "epoch": 1.9873737373737375, "grad_norm": 0.4736434519290924, "learning_rate": 9.00499196136872e-06, "loss": 0.4051, "step": 2361 }, { "epoch": 1.9882154882154883, "grad_norm": 0.5125139355659485, "learning_rate": 9.003735083777377e-06, "loss": 0.4039, "step": 2362 }, { "epoch": 1.9890572390572392, "grad_norm": 0.40983709692955017, "learning_rate": 9.002477500685223e-06, "loss": 0.3966, "step": 2363 }, { "epoch": 1.98989898989899, "grad_norm": 0.5151861310005188, "learning_rate": 9.001219212313863e-06, "loss": 0.4471, "step": 2364 }, { "epoch": 1.9907407407407407, "grad_norm": 0.5829835534095764, "learning_rate": 8.99996021888502e-06, "loss": 0.4194, "step": 2365 }, { "epoch": 1.9915824915824916, "grad_norm": 0.47554346919059753, "learning_rate": 8.99870052062054e-06, "loss": 0.426, "step": 2366 }, { "epoch": 1.9924242424242424, "grad_norm": 0.4815850555896759, "learning_rate": 8.997440117742398e-06, "loss": 0.4032, "step": 2367 }, { "epoch": 1.9932659932659933, "grad_norm": 0.539364218711853, "learning_rate": 8.996179010472686e-06, "loss": 0.4112, "step": 2368 }, { "epoch": 1.9941077441077442, "grad_norm": 0.518194854259491, "learning_rate": 8.994917199033631e-06, "loss": 0.4249, "step": 2369 }, { "epoch": 1.9949494949494948, "grad_norm": 0.5194661617279053, "learning_rate": 8.99365468364757e-06, "loss": 0.3946, "step": 2370 }, { "epoch": 1.9957912457912457, "grad_norm": 0.4896297752857208, "learning_rate": 8.992391464536977e-06, "loss": 0.424, "step": 2371 }, { "epoch": 1.9966329966329965, "grad_norm": 0.5980393886566162, "learning_rate": 8.991127541924444e-06, "loss": 0.435, "step": 2372 }, { "epoch": 1.9974747474747474, "grad_norm": 0.45056089758872986, "learning_rate": 8.98986291603269e-06, "loss": 0.4059, "step": 2373 }, { "epoch": 1.9983164983164983, "grad_norm": 0.5362873673439026, "learning_rate": 8.988597587084549e-06, "loss": 0.4033, "step": 2374 }, { "epoch": 1.9991582491582491, "grad_norm": 0.5254315137863159, "learning_rate": 8.98733155530299e-06, "loss": 0.4385, "step": 2375 }, { "epoch": 2.0, "grad_norm": 0.5676225423812866, "learning_rate": 8.986064820911098e-06, "loss": 0.4004, "step": 2376 }, { "epoch": 2.000841750841751, "grad_norm": 0.5460618138313293, "learning_rate": 8.984797384132092e-06, "loss": 0.3585, "step": 2377 }, { "epoch": 2.0016835016835017, "grad_norm": 0.4449284076690674, "learning_rate": 8.9835292451893e-06, "loss": 0.3823, "step": 2378 }, { "epoch": 2.0025252525252526, "grad_norm": 0.48679208755493164, "learning_rate": 8.982260404306186e-06, "loss": 0.3604, "step": 2379 }, { "epoch": 2.0033670033670035, "grad_norm": 0.4428463578224182, "learning_rate": 8.980990861706331e-06, "loss": 0.3749, "step": 2380 }, { "epoch": 2.0042087542087543, "grad_norm": 0.43409883975982666, "learning_rate": 8.979720617613446e-06, "loss": 0.3646, "step": 2381 }, { "epoch": 2.005050505050505, "grad_norm": 0.4344434142112732, "learning_rate": 8.978449672251355e-06, "loss": 0.3537, "step": 2382 }, { "epoch": 2.005892255892256, "grad_norm": 0.4570139944553375, "learning_rate": 8.977178025844018e-06, "loss": 0.3954, "step": 2383 }, { "epoch": 2.006734006734007, "grad_norm": 0.44213536381721497, "learning_rate": 8.975905678615511e-06, "loss": 0.367, "step": 2384 }, { "epoch": 2.007575757575758, "grad_norm": 0.43715447187423706, "learning_rate": 8.974632630790031e-06, "loss": 0.3625, "step": 2385 }, { "epoch": 2.008417508417508, "grad_norm": 0.45935195684432983, "learning_rate": 8.973358882591912e-06, "loss": 0.3748, "step": 2386 }, { "epoch": 2.009259259259259, "grad_norm": 0.4643315374851227, "learning_rate": 8.972084434245592e-06, "loss": 0.3646, "step": 2387 }, { "epoch": 2.01010101010101, "grad_norm": 0.5032113194465637, "learning_rate": 8.970809285975648e-06, "loss": 0.3612, "step": 2388 }, { "epoch": 2.010942760942761, "grad_norm": 0.4555783271789551, "learning_rate": 8.969533438006773e-06, "loss": 0.3646, "step": 2389 }, { "epoch": 2.0117845117845117, "grad_norm": 0.5266281962394714, "learning_rate": 8.968256890563786e-06, "loss": 0.3943, "step": 2390 }, { "epoch": 2.0126262626262625, "grad_norm": 0.4905073344707489, "learning_rate": 8.966979643871627e-06, "loss": 0.3437, "step": 2391 }, { "epoch": 2.0134680134680134, "grad_norm": 0.42840972542762756, "learning_rate": 8.96570169815536e-06, "loss": 0.3531, "step": 2392 }, { "epoch": 2.0143097643097643, "grad_norm": 0.502066969871521, "learning_rate": 8.964423053640177e-06, "loss": 0.3835, "step": 2393 }, { "epoch": 2.015151515151515, "grad_norm": 0.4667770266532898, "learning_rate": 8.963143710551382e-06, "loss": 0.3625, "step": 2394 }, { "epoch": 2.015993265993266, "grad_norm": 0.4749871492385864, "learning_rate": 8.961863669114414e-06, "loss": 0.3707, "step": 2395 }, { "epoch": 2.016835016835017, "grad_norm": 0.452521413564682, "learning_rate": 8.960582929554828e-06, "loss": 0.3766, "step": 2396 }, { "epoch": 2.0176767676767677, "grad_norm": 0.46993470191955566, "learning_rate": 8.959301492098306e-06, "loss": 0.3781, "step": 2397 }, { "epoch": 2.0185185185185186, "grad_norm": 0.5057358145713806, "learning_rate": 8.958019356970648e-06, "loss": 0.3509, "step": 2398 }, { "epoch": 2.0193602693602695, "grad_norm": 0.4469110369682312, "learning_rate": 8.956736524397781e-06, "loss": 0.3665, "step": 2399 }, { "epoch": 2.0202020202020203, "grad_norm": 0.4950222969055176, "learning_rate": 8.955452994605753e-06, "loss": 0.3764, "step": 2400 }, { "epoch": 2.021043771043771, "grad_norm": 0.43698883056640625, "learning_rate": 8.954168767820739e-06, "loss": 0.3914, "step": 2401 }, { "epoch": 2.021885521885522, "grad_norm": 0.3921854496002197, "learning_rate": 8.95288384426903e-06, "loss": 0.3615, "step": 2402 }, { "epoch": 2.022727272727273, "grad_norm": 0.41093671321868896, "learning_rate": 8.951598224177045e-06, "loss": 0.3503, "step": 2403 }, { "epoch": 2.0235690235690234, "grad_norm": 0.40188801288604736, "learning_rate": 8.950311907771322e-06, "loss": 0.3615, "step": 2404 }, { "epoch": 2.024410774410774, "grad_norm": 0.4395439028739929, "learning_rate": 8.949024895278525e-06, "loss": 0.3571, "step": 2405 }, { "epoch": 2.025252525252525, "grad_norm": 0.3764999806880951, "learning_rate": 8.94773718692544e-06, "loss": 0.3368, "step": 2406 }, { "epoch": 2.026094276094276, "grad_norm": 0.4444005787372589, "learning_rate": 8.946448782938973e-06, "loss": 0.369, "step": 2407 }, { "epoch": 2.026936026936027, "grad_norm": 0.4838944673538208, "learning_rate": 8.945159683546156e-06, "loss": 0.3589, "step": 2408 }, { "epoch": 2.0277777777777777, "grad_norm": 0.4359593987464905, "learning_rate": 8.943869888974144e-06, "loss": 0.3771, "step": 2409 }, { "epoch": 2.0286195286195285, "grad_norm": 0.4261634349822998, "learning_rate": 8.942579399450208e-06, "loss": 0.4041, "step": 2410 }, { "epoch": 2.0294612794612794, "grad_norm": 0.4453122913837433, "learning_rate": 8.941288215201748e-06, "loss": 0.3689, "step": 2411 }, { "epoch": 2.0303030303030303, "grad_norm": 0.575972855091095, "learning_rate": 8.939996336456286e-06, "loss": 0.3668, "step": 2412 }, { "epoch": 2.031144781144781, "grad_norm": 0.43100062012672424, "learning_rate": 8.938703763441462e-06, "loss": 0.3583, "step": 2413 }, { "epoch": 2.031986531986532, "grad_norm": 0.5135225653648376, "learning_rate": 8.937410496385044e-06, "loss": 0.3821, "step": 2414 }, { "epoch": 2.032828282828283, "grad_norm": 0.4786876142024994, "learning_rate": 8.936116535514918e-06, "loss": 0.4039, "step": 2415 }, { "epoch": 2.0336700336700337, "grad_norm": 0.4325219690799713, "learning_rate": 8.934821881059095e-06, "loss": 0.3648, "step": 2416 }, { "epoch": 2.0345117845117846, "grad_norm": 0.4814983010292053, "learning_rate": 8.933526533245704e-06, "loss": 0.3745, "step": 2417 }, { "epoch": 2.0353535353535355, "grad_norm": 0.5063315629959106, "learning_rate": 8.932230492303e-06, "loss": 0.3737, "step": 2418 }, { "epoch": 2.0361952861952863, "grad_norm": 0.49959397315979004, "learning_rate": 8.930933758459362e-06, "loss": 0.3531, "step": 2419 }, { "epoch": 2.037037037037037, "grad_norm": 0.4844426214694977, "learning_rate": 8.929636331943286e-06, "loss": 0.3634, "step": 2420 }, { "epoch": 2.037878787878788, "grad_norm": 0.47807547450065613, "learning_rate": 8.928338212983393e-06, "loss": 0.3655, "step": 2421 }, { "epoch": 2.038720538720539, "grad_norm": 0.42105239629745483, "learning_rate": 8.927039401808426e-06, "loss": 0.3608, "step": 2422 }, { "epoch": 2.0395622895622894, "grad_norm": 0.40565961599349976, "learning_rate": 8.925739898647247e-06, "loss": 0.3839, "step": 2423 }, { "epoch": 2.04040404040404, "grad_norm": 0.46444806456565857, "learning_rate": 8.924439703728844e-06, "loss": 0.3872, "step": 2424 }, { "epoch": 2.041245791245791, "grad_norm": 0.4192213714122772, "learning_rate": 8.923138817282326e-06, "loss": 0.3473, "step": 2425 }, { "epoch": 2.042087542087542, "grad_norm": 0.41920626163482666, "learning_rate": 8.92183723953692e-06, "loss": 0.356, "step": 2426 }, { "epoch": 2.042929292929293, "grad_norm": 0.39470213651657104, "learning_rate": 8.920534970721984e-06, "loss": 0.3454, "step": 2427 }, { "epoch": 2.0437710437710437, "grad_norm": 0.45016536116600037, "learning_rate": 8.919232011066984e-06, "loss": 0.3631, "step": 2428 }, { "epoch": 2.0446127946127945, "grad_norm": 0.39988580346107483, "learning_rate": 8.91792836080152e-06, "loss": 0.359, "step": 2429 }, { "epoch": 2.0454545454545454, "grad_norm": 0.4388815462589264, "learning_rate": 8.91662402015531e-06, "loss": 0.3734, "step": 2430 }, { "epoch": 2.0462962962962963, "grad_norm": 0.37889108061790466, "learning_rate": 8.915318989358188e-06, "loss": 0.3823, "step": 2431 }, { "epoch": 2.047138047138047, "grad_norm": 0.4253448247909546, "learning_rate": 8.914013268640118e-06, "loss": 0.3862, "step": 2432 }, { "epoch": 2.047979797979798, "grad_norm": 0.47764918208122253, "learning_rate": 8.912706858231182e-06, "loss": 0.3705, "step": 2433 }, { "epoch": 2.048821548821549, "grad_norm": 0.4038982391357422, "learning_rate": 8.911399758361582e-06, "loss": 0.3495, "step": 2434 }, { "epoch": 2.0496632996632997, "grad_norm": 0.49673354625701904, "learning_rate": 8.910091969261645e-06, "loss": 0.3843, "step": 2435 }, { "epoch": 2.0505050505050506, "grad_norm": 0.4522888958454132, "learning_rate": 8.908783491161814e-06, "loss": 0.3758, "step": 2436 }, { "epoch": 2.0513468013468015, "grad_norm": 0.4293997883796692, "learning_rate": 8.90747432429266e-06, "loss": 0.3751, "step": 2437 }, { "epoch": 2.0521885521885523, "grad_norm": 0.4124661684036255, "learning_rate": 8.90616446888487e-06, "loss": 0.3765, "step": 2438 }, { "epoch": 2.053030303030303, "grad_norm": 0.3916114866733551, "learning_rate": 8.904853925169255e-06, "loss": 0.3642, "step": 2439 }, { "epoch": 2.053872053872054, "grad_norm": 0.43513157963752747, "learning_rate": 8.903542693376748e-06, "loss": 0.3706, "step": 2440 }, { "epoch": 2.0547138047138045, "grad_norm": 0.4118385910987854, "learning_rate": 8.9022307737384e-06, "loss": 0.3694, "step": 2441 }, { "epoch": 2.0555555555555554, "grad_norm": 0.395168274641037, "learning_rate": 8.900918166485389e-06, "loss": 0.3516, "step": 2442 }, { "epoch": 2.0563973063973062, "grad_norm": 0.4399130940437317, "learning_rate": 8.899604871849005e-06, "loss": 0.3723, "step": 2443 }, { "epoch": 2.057239057239057, "grad_norm": 0.4032900631427765, "learning_rate": 8.898290890060668e-06, "loss": 0.381, "step": 2444 }, { "epoch": 2.058080808080808, "grad_norm": 0.43829837441444397, "learning_rate": 8.896976221351914e-06, "loss": 0.3748, "step": 2445 }, { "epoch": 2.058922558922559, "grad_norm": 0.4477021396160126, "learning_rate": 8.895660865954404e-06, "loss": 0.3641, "step": 2446 }, { "epoch": 2.0597643097643097, "grad_norm": 0.4743240475654602, "learning_rate": 8.894344824099916e-06, "loss": 0.3529, "step": 2447 }, { "epoch": 2.0606060606060606, "grad_norm": 0.45205339789390564, "learning_rate": 8.893028096020349e-06, "loss": 0.3568, "step": 2448 }, { "epoch": 2.0614478114478114, "grad_norm": 0.4272143840789795, "learning_rate": 8.891710681947727e-06, "loss": 0.377, "step": 2449 }, { "epoch": 2.0622895622895623, "grad_norm": 0.49785473942756653, "learning_rate": 8.890392582114193e-06, "loss": 0.3862, "step": 2450 }, { "epoch": 2.063131313131313, "grad_norm": 0.44366776943206787, "learning_rate": 8.889073796752008e-06, "loss": 0.3824, "step": 2451 }, { "epoch": 2.063973063973064, "grad_norm": 0.5188264846801758, "learning_rate": 8.887754326093557e-06, "loss": 0.3679, "step": 2452 }, { "epoch": 2.064814814814815, "grad_norm": 0.43407532572746277, "learning_rate": 8.886434170371345e-06, "loss": 0.3714, "step": 2453 }, { "epoch": 2.0656565656565657, "grad_norm": 0.4291672706604004, "learning_rate": 8.885113329817997e-06, "loss": 0.3505, "step": 2454 }, { "epoch": 2.0664983164983166, "grad_norm": 0.47482091188430786, "learning_rate": 8.88379180466626e-06, "loss": 0.367, "step": 2455 }, { "epoch": 2.0673400673400675, "grad_norm": 0.48510268330574036, "learning_rate": 8.882469595149e-06, "loss": 0.3722, "step": 2456 }, { "epoch": 2.0681818181818183, "grad_norm": 0.45355820655822754, "learning_rate": 8.881146701499203e-06, "loss": 0.3448, "step": 2457 }, { "epoch": 2.069023569023569, "grad_norm": 0.4567302167415619, "learning_rate": 8.87982312394998e-06, "loss": 0.3631, "step": 2458 }, { "epoch": 2.06986531986532, "grad_norm": 0.3988644480705261, "learning_rate": 8.878498862734559e-06, "loss": 0.3492, "step": 2459 }, { "epoch": 2.0707070707070705, "grad_norm": 0.4639805555343628, "learning_rate": 8.877173918086289e-06, "loss": 0.3507, "step": 2460 }, { "epoch": 2.0715488215488214, "grad_norm": 0.42680275440216064, "learning_rate": 8.875848290238635e-06, "loss": 0.3762, "step": 2461 }, { "epoch": 2.0723905723905722, "grad_norm": 0.4740827679634094, "learning_rate": 8.874521979425192e-06, "loss": 0.3938, "step": 2462 }, { "epoch": 2.073232323232323, "grad_norm": 0.45929551124572754, "learning_rate": 8.873194985879669e-06, "loss": 0.3507, "step": 2463 }, { "epoch": 2.074074074074074, "grad_norm": 0.41652148962020874, "learning_rate": 8.871867309835894e-06, "loss": 0.3775, "step": 2464 }, { "epoch": 2.074915824915825, "grad_norm": 0.4708382785320282, "learning_rate": 8.87053895152782e-06, "loss": 0.3588, "step": 2465 }, { "epoch": 2.0757575757575757, "grad_norm": 0.48098188638687134, "learning_rate": 8.869209911189517e-06, "loss": 0.3686, "step": 2466 }, { "epoch": 2.0765993265993266, "grad_norm": 0.4231470823287964, "learning_rate": 8.867880189055178e-06, "loss": 0.3429, "step": 2467 }, { "epoch": 2.0774410774410774, "grad_norm": 0.48380789160728455, "learning_rate": 8.866549785359113e-06, "loss": 0.3368, "step": 2468 }, { "epoch": 2.0782828282828283, "grad_norm": 0.46176257729530334, "learning_rate": 8.865218700335752e-06, "loss": 0.3632, "step": 2469 }, { "epoch": 2.079124579124579, "grad_norm": 0.4072927236557007, "learning_rate": 8.863886934219646e-06, "loss": 0.3542, "step": 2470 }, { "epoch": 2.07996632996633, "grad_norm": 0.49426132440567017, "learning_rate": 8.862554487245467e-06, "loss": 0.3872, "step": 2471 }, { "epoch": 2.080808080808081, "grad_norm": 0.4409431219100952, "learning_rate": 8.861221359648009e-06, "loss": 0.3592, "step": 2472 }, { "epoch": 2.0816498316498318, "grad_norm": 0.4401162266731262, "learning_rate": 8.859887551662181e-06, "loss": 0.3752, "step": 2473 }, { "epoch": 2.0824915824915826, "grad_norm": 0.4075237512588501, "learning_rate": 8.858553063523014e-06, "loss": 0.3498, "step": 2474 }, { "epoch": 2.0833333333333335, "grad_norm": 0.47704917192459106, "learning_rate": 8.857217895465661e-06, "loss": 0.3595, "step": 2475 }, { "epoch": 2.0841750841750843, "grad_norm": 0.526064395904541, "learning_rate": 8.85588204772539e-06, "loss": 0.3787, "step": 2476 }, { "epoch": 2.085016835016835, "grad_norm": 0.4326465129852295, "learning_rate": 8.854545520537594e-06, "loss": 0.3441, "step": 2477 }, { "epoch": 2.0858585858585856, "grad_norm": 0.5130478143692017, "learning_rate": 8.853208314137781e-06, "loss": 0.3584, "step": 2478 }, { "epoch": 2.0867003367003365, "grad_norm": 0.4075468182563782, "learning_rate": 8.851870428761583e-06, "loss": 0.3706, "step": 2479 }, { "epoch": 2.0875420875420874, "grad_norm": 0.44458460807800293, "learning_rate": 8.850531864644749e-06, "loss": 0.3661, "step": 2480 }, { "epoch": 2.0883838383838382, "grad_norm": 0.47994959354400635, "learning_rate": 8.849192622023149e-06, "loss": 0.3569, "step": 2481 }, { "epoch": 2.089225589225589, "grad_norm": 0.45625171065330505, "learning_rate": 8.84785270113277e-06, "loss": 0.3623, "step": 2482 }, { "epoch": 2.09006734006734, "grad_norm": 0.5504258275032043, "learning_rate": 8.846512102209721e-06, "loss": 0.364, "step": 2483 }, { "epoch": 2.090909090909091, "grad_norm": 0.4196988344192505, "learning_rate": 8.845170825490232e-06, "loss": 0.3689, "step": 2484 }, { "epoch": 2.0917508417508417, "grad_norm": 0.47762778401374817, "learning_rate": 8.843828871210647e-06, "loss": 0.3494, "step": 2485 }, { "epoch": 2.0925925925925926, "grad_norm": 0.4564860463142395, "learning_rate": 8.842486239607435e-06, "loss": 0.3573, "step": 2486 }, { "epoch": 2.0934343434343434, "grad_norm": 0.5458654165267944, "learning_rate": 8.841142930917182e-06, "loss": 0.3572, "step": 2487 }, { "epoch": 2.0942760942760943, "grad_norm": 0.43664076924324036, "learning_rate": 8.839798945376592e-06, "loss": 0.3745, "step": 2488 }, { "epoch": 2.095117845117845, "grad_norm": 0.42504584789276123, "learning_rate": 8.83845428322249e-06, "loss": 0.355, "step": 2489 }, { "epoch": 2.095959595959596, "grad_norm": 0.49800702929496765, "learning_rate": 8.837108944691818e-06, "loss": 0.3525, "step": 2490 }, { "epoch": 2.096801346801347, "grad_norm": 0.43348634243011475, "learning_rate": 8.835762930021644e-06, "loss": 0.3574, "step": 2491 }, { "epoch": 2.0976430976430978, "grad_norm": 0.4261520802974701, "learning_rate": 8.834416239449147e-06, "loss": 0.386, "step": 2492 }, { "epoch": 2.0984848484848486, "grad_norm": 0.42873916029930115, "learning_rate": 8.833068873211625e-06, "loss": 0.3809, "step": 2493 }, { "epoch": 2.0993265993265995, "grad_norm": 0.44675201177597046, "learning_rate": 8.831720831546503e-06, "loss": 0.3599, "step": 2494 }, { "epoch": 2.1001683501683504, "grad_norm": 0.40433287620544434, "learning_rate": 8.830372114691322e-06, "loss": 0.3557, "step": 2495 }, { "epoch": 2.101010101010101, "grad_norm": 0.42703157663345337, "learning_rate": 8.829022722883733e-06, "loss": 0.3772, "step": 2496 }, { "epoch": 2.1018518518518516, "grad_norm": 0.40602293610572815, "learning_rate": 8.82767265636152e-06, "loss": 0.361, "step": 2497 }, { "epoch": 2.1026936026936025, "grad_norm": 0.4090743064880371, "learning_rate": 8.826321915362576e-06, "loss": 0.3547, "step": 2498 }, { "epoch": 2.1035353535353534, "grad_norm": 0.4423297345638275, "learning_rate": 8.824970500124916e-06, "loss": 0.3683, "step": 2499 }, { "epoch": 2.1043771043771042, "grad_norm": 0.44607728719711304, "learning_rate": 8.823618410886674e-06, "loss": 0.3886, "step": 2500 }, { "epoch": 2.105218855218855, "grad_norm": 0.4920103847980499, "learning_rate": 8.822265647886104e-06, "loss": 0.3801, "step": 2501 }, { "epoch": 2.106060606060606, "grad_norm": 0.48479416966438293, "learning_rate": 8.820912211361575e-06, "loss": 0.3564, "step": 2502 }, { "epoch": 2.106902356902357, "grad_norm": 0.41050073504447937, "learning_rate": 8.819558101551577e-06, "loss": 0.3554, "step": 2503 }, { "epoch": 2.1077441077441077, "grad_norm": 0.47582876682281494, "learning_rate": 8.818203318694721e-06, "loss": 0.3631, "step": 2504 }, { "epoch": 2.1085858585858586, "grad_norm": 0.5012246370315552, "learning_rate": 8.816847863029732e-06, "loss": 0.3651, "step": 2505 }, { "epoch": 2.1094276094276094, "grad_norm": 0.42936331033706665, "learning_rate": 8.815491734795458e-06, "loss": 0.3685, "step": 2506 }, { "epoch": 2.1102693602693603, "grad_norm": 0.5393801331520081, "learning_rate": 8.81413493423086e-06, "loss": 0.3613, "step": 2507 }, { "epoch": 2.111111111111111, "grad_norm": 0.4706445038318634, "learning_rate": 8.812777461575024e-06, "loss": 0.3752, "step": 2508 }, { "epoch": 2.111952861952862, "grad_norm": 0.428170382976532, "learning_rate": 8.81141931706715e-06, "loss": 0.3766, "step": 2509 }, { "epoch": 2.112794612794613, "grad_norm": 0.4846231937408447, "learning_rate": 8.810060500946555e-06, "loss": 0.3767, "step": 2510 }, { "epoch": 2.1136363636363638, "grad_norm": 0.4621906280517578, "learning_rate": 8.808701013452681e-06, "loss": 0.3916, "step": 2511 }, { "epoch": 2.1144781144781146, "grad_norm": 0.43486452102661133, "learning_rate": 8.807340854825082e-06, "loss": 0.3408, "step": 2512 }, { "epoch": 2.1153198653198655, "grad_norm": 0.4195125699043274, "learning_rate": 8.80598002530343e-06, "loss": 0.367, "step": 2513 }, { "epoch": 2.1161616161616164, "grad_norm": 0.46005553007125854, "learning_rate": 8.804618525127525e-06, "loss": 0.3826, "step": 2514 }, { "epoch": 2.1170033670033668, "grad_norm": 0.46254095435142517, "learning_rate": 8.80325635453727e-06, "loss": 0.3767, "step": 2515 }, { "epoch": 2.1178451178451176, "grad_norm": 0.4237406551837921, "learning_rate": 8.801893513772697e-06, "loss": 0.3759, "step": 2516 }, { "epoch": 2.1186868686868685, "grad_norm": 0.47296902537345886, "learning_rate": 8.800530003073956e-06, "loss": 0.3693, "step": 2517 }, { "epoch": 2.1195286195286194, "grad_norm": 0.4628741145133972, "learning_rate": 8.799165822681305e-06, "loss": 0.3594, "step": 2518 }, { "epoch": 2.1203703703703702, "grad_norm": 0.4296816289424896, "learning_rate": 8.797800972835135e-06, "loss": 0.3742, "step": 2519 }, { "epoch": 2.121212121212121, "grad_norm": 0.46850916743278503, "learning_rate": 8.796435453775943e-06, "loss": 0.3685, "step": 2520 }, { "epoch": 2.122053872053872, "grad_norm": 0.46377405524253845, "learning_rate": 8.79506926574435e-06, "loss": 0.3622, "step": 2521 }, { "epoch": 2.122895622895623, "grad_norm": 0.4542447626590729, "learning_rate": 8.793702408981089e-06, "loss": 0.3834, "step": 2522 }, { "epoch": 2.1237373737373737, "grad_norm": 0.5486211776733398, "learning_rate": 8.792334883727018e-06, "loss": 0.3713, "step": 2523 }, { "epoch": 2.1245791245791246, "grad_norm": 0.44716960191726685, "learning_rate": 8.790966690223108e-06, "loss": 0.369, "step": 2524 }, { "epoch": 2.1254208754208754, "grad_norm": 0.4818045198917389, "learning_rate": 8.789597828710452e-06, "loss": 0.3887, "step": 2525 }, { "epoch": 2.1262626262626263, "grad_norm": 0.48854491114616394, "learning_rate": 8.788228299430257e-06, "loss": 0.3756, "step": 2526 }, { "epoch": 2.127104377104377, "grad_norm": 0.4691668748855591, "learning_rate": 8.786858102623846e-06, "loss": 0.3386, "step": 2527 }, { "epoch": 2.127946127946128, "grad_norm": 0.5022256970405579, "learning_rate": 8.785487238532662e-06, "loss": 0.3556, "step": 2528 }, { "epoch": 2.128787878787879, "grad_norm": 0.4232371151447296, "learning_rate": 8.78411570739827e-06, "loss": 0.3648, "step": 2529 }, { "epoch": 2.1296296296296298, "grad_norm": 0.4381847381591797, "learning_rate": 8.782743509462348e-06, "loss": 0.3522, "step": 2530 }, { "epoch": 2.1304713804713806, "grad_norm": 0.46259254217147827, "learning_rate": 8.781370644966687e-06, "loss": 0.3814, "step": 2531 }, { "epoch": 2.1313131313131315, "grad_norm": 0.49868953227996826, "learning_rate": 8.779997114153205e-06, "loss": 0.3845, "step": 2532 }, { "epoch": 2.1321548821548824, "grad_norm": 0.4548985958099365, "learning_rate": 8.778622917263933e-06, "loss": 0.3633, "step": 2533 }, { "epoch": 2.1329966329966332, "grad_norm": 0.5184977650642395, "learning_rate": 8.777248054541015e-06, "loss": 0.3588, "step": 2534 }, { "epoch": 2.1338383838383836, "grad_norm": 0.5179730653762817, "learning_rate": 8.77587252622672e-06, "loss": 0.3698, "step": 2535 }, { "epoch": 2.1346801346801345, "grad_norm": 0.4723057448863983, "learning_rate": 8.774496332563429e-06, "loss": 0.3585, "step": 2536 }, { "epoch": 2.1355218855218854, "grad_norm": 0.4978412091732025, "learning_rate": 8.773119473793643e-06, "loss": 0.3578, "step": 2537 }, { "epoch": 2.1363636363636362, "grad_norm": 0.4433605968952179, "learning_rate": 8.77174195015998e-06, "loss": 0.3741, "step": 2538 }, { "epoch": 2.137205387205387, "grad_norm": 0.46858322620391846, "learning_rate": 8.770363761905173e-06, "loss": 0.3586, "step": 2539 }, { "epoch": 2.138047138047138, "grad_norm": 0.4278303384780884, "learning_rate": 8.76898490927207e-06, "loss": 0.356, "step": 2540 }, { "epoch": 2.138888888888889, "grad_norm": 0.4811704456806183, "learning_rate": 8.767605392503649e-06, "loss": 0.364, "step": 2541 }, { "epoch": 2.1397306397306397, "grad_norm": 0.4215065538883209, "learning_rate": 8.766225211842987e-06, "loss": 0.3513, "step": 2542 }, { "epoch": 2.1405723905723906, "grad_norm": 0.4537163972854614, "learning_rate": 8.76484436753329e-06, "loss": 0.3737, "step": 2543 }, { "epoch": 2.1414141414141414, "grad_norm": 0.4388437569141388, "learning_rate": 8.763462859817876e-06, "loss": 0.3755, "step": 2544 }, { "epoch": 2.1422558922558923, "grad_norm": 0.4352024495601654, "learning_rate": 8.762080688940183e-06, "loss": 0.3747, "step": 2545 }, { "epoch": 2.143097643097643, "grad_norm": 0.4366527795791626, "learning_rate": 8.760697855143763e-06, "loss": 0.3649, "step": 2546 }, { "epoch": 2.143939393939394, "grad_norm": 0.39546576142311096, "learning_rate": 8.759314358672286e-06, "loss": 0.3557, "step": 2547 }, { "epoch": 2.144781144781145, "grad_norm": 0.4597035348415375, "learning_rate": 8.75793019976954e-06, "loss": 0.3749, "step": 2548 }, { "epoch": 2.1456228956228958, "grad_norm": 0.5380921959877014, "learning_rate": 8.756545378679428e-06, "loss": 0.3617, "step": 2549 }, { "epoch": 2.1464646464646466, "grad_norm": 0.4685041904449463, "learning_rate": 8.75515989564597e-06, "loss": 0.4056, "step": 2550 }, { "epoch": 2.1473063973063975, "grad_norm": 0.49061119556427, "learning_rate": 8.753773750913302e-06, "loss": 0.3595, "step": 2551 }, { "epoch": 2.148148148148148, "grad_norm": 0.4842711389064789, "learning_rate": 8.75238694472568e-06, "loss": 0.3659, "step": 2552 }, { "epoch": 2.148989898989899, "grad_norm": 0.5162442922592163, "learning_rate": 8.750999477327473e-06, "loss": 0.367, "step": 2553 }, { "epoch": 2.1498316498316496, "grad_norm": 0.43488234281539917, "learning_rate": 8.749611348963165e-06, "loss": 0.3531, "step": 2554 }, { "epoch": 2.1506734006734005, "grad_norm": 0.45663952827453613, "learning_rate": 8.748222559877362e-06, "loss": 0.3616, "step": 2555 }, { "epoch": 2.1515151515151514, "grad_norm": 0.5102798342704773, "learning_rate": 8.746833110314784e-06, "loss": 0.3609, "step": 2556 }, { "epoch": 2.1523569023569022, "grad_norm": 0.451401948928833, "learning_rate": 8.745443000520263e-06, "loss": 0.3763, "step": 2557 }, { "epoch": 2.153198653198653, "grad_norm": 0.4769033193588257, "learning_rate": 8.744052230738756e-06, "loss": 0.3838, "step": 2558 }, { "epoch": 2.154040404040404, "grad_norm": 0.42175325751304626, "learning_rate": 8.742660801215328e-06, "loss": 0.3528, "step": 2559 }, { "epoch": 2.154882154882155, "grad_norm": 0.49492231011390686, "learning_rate": 8.741268712195166e-06, "loss": 0.3722, "step": 2560 }, { "epoch": 2.1557239057239057, "grad_norm": 0.4530574381351471, "learning_rate": 8.739875963923568e-06, "loss": 0.3683, "step": 2561 }, { "epoch": 2.1565656565656566, "grad_norm": 0.4336623549461365, "learning_rate": 8.738482556645953e-06, "loss": 0.3455, "step": 2562 }, { "epoch": 2.1574074074074074, "grad_norm": 0.4935213029384613, "learning_rate": 8.737088490607853e-06, "loss": 0.3847, "step": 2563 }, { "epoch": 2.1582491582491583, "grad_norm": 0.428528368473053, "learning_rate": 8.73569376605492e-06, "loss": 0.3582, "step": 2564 }, { "epoch": 2.159090909090909, "grad_norm": 0.4701298177242279, "learning_rate": 8.734298383232918e-06, "loss": 0.3738, "step": 2565 }, { "epoch": 2.15993265993266, "grad_norm": 0.4456746280193329, "learning_rate": 8.732902342387725e-06, "loss": 0.3617, "step": 2566 }, { "epoch": 2.160774410774411, "grad_norm": 0.4183025062084198, "learning_rate": 8.731505643765345e-06, "loss": 0.3715, "step": 2567 }, { "epoch": 2.1616161616161618, "grad_norm": 0.4859693944454193, "learning_rate": 8.730108287611885e-06, "loss": 0.3767, "step": 2568 }, { "epoch": 2.1624579124579126, "grad_norm": 0.4419477880001068, "learning_rate": 8.728710274173578e-06, "loss": 0.3631, "step": 2569 }, { "epoch": 2.1632996632996635, "grad_norm": 0.4619992971420288, "learning_rate": 8.727311603696765e-06, "loss": 0.3699, "step": 2570 }, { "epoch": 2.1641414141414144, "grad_norm": 0.4546523690223694, "learning_rate": 8.72591227642791e-06, "loss": 0.3729, "step": 2571 }, { "epoch": 2.164983164983165, "grad_norm": 0.4797595739364624, "learning_rate": 8.72451229261359e-06, "loss": 0.3593, "step": 2572 }, { "epoch": 2.1658249158249157, "grad_norm": 0.512609601020813, "learning_rate": 8.723111652500494e-06, "loss": 0.3531, "step": 2573 }, { "epoch": 2.1666666666666665, "grad_norm": 0.4725880026817322, "learning_rate": 8.721710356335432e-06, "loss": 0.3492, "step": 2574 }, { "epoch": 2.1675084175084174, "grad_norm": 0.4469078779220581, "learning_rate": 8.720308404365325e-06, "loss": 0.3648, "step": 2575 }, { "epoch": 2.1683501683501682, "grad_norm": 0.4311220347881317, "learning_rate": 8.718905796837213e-06, "loss": 0.3465, "step": 2576 }, { "epoch": 2.169191919191919, "grad_norm": 0.4725921154022217, "learning_rate": 8.717502533998253e-06, "loss": 0.3866, "step": 2577 }, { "epoch": 2.17003367003367, "grad_norm": 0.45946788787841797, "learning_rate": 8.716098616095712e-06, "loss": 0.3765, "step": 2578 }, { "epoch": 2.170875420875421, "grad_norm": 0.42008161544799805, "learning_rate": 8.714694043376974e-06, "loss": 0.3847, "step": 2579 }, { "epoch": 2.1717171717171717, "grad_norm": 0.4357036054134369, "learning_rate": 8.713288816089545e-06, "loss": 0.3717, "step": 2580 }, { "epoch": 2.1725589225589226, "grad_norm": 0.4246314465999603, "learning_rate": 8.711882934481035e-06, "loss": 0.3755, "step": 2581 }, { "epoch": 2.1734006734006734, "grad_norm": 0.42268598079681396, "learning_rate": 8.71047639879918e-06, "loss": 0.3647, "step": 2582 }, { "epoch": 2.1742424242424243, "grad_norm": 0.4241412580013275, "learning_rate": 8.709069209291822e-06, "loss": 0.3747, "step": 2583 }, { "epoch": 2.175084175084175, "grad_norm": 0.4800211787223816, "learning_rate": 8.707661366206927e-06, "loss": 0.3763, "step": 2584 }, { "epoch": 2.175925925925926, "grad_norm": 0.4605640470981598, "learning_rate": 8.70625286979257e-06, "loss": 0.3744, "step": 2585 }, { "epoch": 2.176767676767677, "grad_norm": 0.5006455779075623, "learning_rate": 8.704843720296945e-06, "loss": 0.3575, "step": 2586 }, { "epoch": 2.1776094276094278, "grad_norm": 0.48128584027290344, "learning_rate": 8.703433917968356e-06, "loss": 0.397, "step": 2587 }, { "epoch": 2.1784511784511786, "grad_norm": 0.3948119580745697, "learning_rate": 8.702023463055227e-06, "loss": 0.3824, "step": 2588 }, { "epoch": 2.179292929292929, "grad_norm": 0.4813423156738281, "learning_rate": 8.700612355806095e-06, "loss": 0.366, "step": 2589 }, { "epoch": 2.18013468013468, "grad_norm": 0.45031338930130005, "learning_rate": 8.699200596469612e-06, "loss": 0.3926, "step": 2590 }, { "epoch": 2.180976430976431, "grad_norm": 0.4011789560317993, "learning_rate": 8.697788185294544e-06, "loss": 0.3513, "step": 2591 }, { "epoch": 2.1818181818181817, "grad_norm": 0.43907827138900757, "learning_rate": 8.696375122529777e-06, "loss": 0.3618, "step": 2592 }, { "epoch": 2.1826599326599325, "grad_norm": 0.45102500915527344, "learning_rate": 8.694961408424302e-06, "loss": 0.3779, "step": 2593 }, { "epoch": 2.1835016835016834, "grad_norm": 0.40702614188194275, "learning_rate": 8.693547043227235e-06, "loss": 0.3561, "step": 2594 }, { "epoch": 2.1843434343434343, "grad_norm": 0.4211312234401703, "learning_rate": 8.6921320271878e-06, "loss": 0.3583, "step": 2595 }, { "epoch": 2.185185185185185, "grad_norm": 0.43251273036003113, "learning_rate": 8.690716360555337e-06, "loss": 0.3679, "step": 2596 }, { "epoch": 2.186026936026936, "grad_norm": 0.40223315358161926, "learning_rate": 8.689300043579305e-06, "loss": 0.3576, "step": 2597 }, { "epoch": 2.186868686868687, "grad_norm": 0.4257620573043823, "learning_rate": 8.687883076509271e-06, "loss": 0.3662, "step": 2598 }, { "epoch": 2.1877104377104377, "grad_norm": 0.48071208596229553, "learning_rate": 8.686465459594922e-06, "loss": 0.3701, "step": 2599 }, { "epoch": 2.1885521885521886, "grad_norm": 0.445544570684433, "learning_rate": 8.685047193086055e-06, "loss": 0.3425, "step": 2600 }, { "epoch": 2.1893939393939394, "grad_norm": 0.4810592532157898, "learning_rate": 8.683628277232585e-06, "loss": 0.3558, "step": 2601 }, { "epoch": 2.1902356902356903, "grad_norm": 0.46090176701545715, "learning_rate": 8.68220871228454e-06, "loss": 0.3814, "step": 2602 }, { "epoch": 2.191077441077441, "grad_norm": 0.4826107919216156, "learning_rate": 8.680788498492063e-06, "loss": 0.3558, "step": 2603 }, { "epoch": 2.191919191919192, "grad_norm": 0.5437074303627014, "learning_rate": 8.67936763610541e-06, "loss": 0.3642, "step": 2604 }, { "epoch": 2.192760942760943, "grad_norm": 0.433447927236557, "learning_rate": 8.67794612537495e-06, "loss": 0.3637, "step": 2605 }, { "epoch": 2.1936026936026938, "grad_norm": 0.5098264813423157, "learning_rate": 8.676523966551173e-06, "loss": 0.3821, "step": 2606 }, { "epoch": 2.1944444444444446, "grad_norm": 0.4432713985443115, "learning_rate": 8.675101159884676e-06, "loss": 0.3781, "step": 2607 }, { "epoch": 2.1952861952861955, "grad_norm": 0.4112246632575989, "learning_rate": 8.67367770562617e-06, "loss": 0.4035, "step": 2608 }, { "epoch": 2.196127946127946, "grad_norm": 0.4399263560771942, "learning_rate": 8.672253604026487e-06, "loss": 0.3796, "step": 2609 }, { "epoch": 2.196969696969697, "grad_norm": 0.5050444602966309, "learning_rate": 8.67082885533657e-06, "loss": 0.3631, "step": 2610 }, { "epoch": 2.1978114478114477, "grad_norm": 0.4414614140987396, "learning_rate": 8.66940345980747e-06, "loss": 0.3676, "step": 2611 }, { "epoch": 2.1986531986531985, "grad_norm": 0.4340655207633972, "learning_rate": 8.66797741769036e-06, "loss": 0.3524, "step": 2612 }, { "epoch": 2.1994949494949494, "grad_norm": 0.41591939330101013, "learning_rate": 8.666550729236525e-06, "loss": 0.3533, "step": 2613 }, { "epoch": 2.2003367003367003, "grad_norm": 0.49684056639671326, "learning_rate": 8.665123394697357e-06, "loss": 0.3767, "step": 2614 }, { "epoch": 2.201178451178451, "grad_norm": 0.4687952399253845, "learning_rate": 8.663695414324375e-06, "loss": 0.364, "step": 2615 }, { "epoch": 2.202020202020202, "grad_norm": 0.48025399446487427, "learning_rate": 8.662266788369197e-06, "loss": 0.3516, "step": 2616 }, { "epoch": 2.202861952861953, "grad_norm": 0.4604564905166626, "learning_rate": 8.66083751708357e-06, "loss": 0.3783, "step": 2617 }, { "epoch": 2.2037037037037037, "grad_norm": 0.48606160283088684, "learning_rate": 8.65940760071934e-06, "loss": 0.3467, "step": 2618 }, { "epoch": 2.2045454545454546, "grad_norm": 0.4587206244468689, "learning_rate": 8.657977039528478e-06, "loss": 0.3662, "step": 2619 }, { "epoch": 2.2053872053872055, "grad_norm": 0.477743536233902, "learning_rate": 8.656545833763062e-06, "loss": 0.3591, "step": 2620 }, { "epoch": 2.2062289562289563, "grad_norm": 0.5136227607727051, "learning_rate": 8.655113983675284e-06, "loss": 0.3658, "step": 2621 }, { "epoch": 2.207070707070707, "grad_norm": 0.4642745852470398, "learning_rate": 8.653681489517457e-06, "loss": 0.366, "step": 2622 }, { "epoch": 2.207912457912458, "grad_norm": 0.3920407295227051, "learning_rate": 8.652248351541997e-06, "loss": 0.3505, "step": 2623 }, { "epoch": 2.208754208754209, "grad_norm": 0.5135369300842285, "learning_rate": 8.650814570001439e-06, "loss": 0.3874, "step": 2624 }, { "epoch": 2.20959595959596, "grad_norm": 0.4914891719818115, "learning_rate": 8.649380145148429e-06, "loss": 0.3402, "step": 2625 }, { "epoch": 2.2104377104377106, "grad_norm": 0.47825294733047485, "learning_rate": 8.64794507723573e-06, "loss": 0.3805, "step": 2626 }, { "epoch": 2.211279461279461, "grad_norm": 0.48163533210754395, "learning_rate": 8.646509366516219e-06, "loss": 0.3637, "step": 2627 }, { "epoch": 2.212121212121212, "grad_norm": 0.5083436965942383, "learning_rate": 8.645073013242878e-06, "loss": 0.3639, "step": 2628 }, { "epoch": 2.212962962962963, "grad_norm": 0.45899882912635803, "learning_rate": 8.643636017668812e-06, "loss": 0.3573, "step": 2629 }, { "epoch": 2.2138047138047137, "grad_norm": 0.496992290019989, "learning_rate": 8.642198380047234e-06, "loss": 0.3873, "step": 2630 }, { "epoch": 2.2146464646464645, "grad_norm": 0.44238752126693726, "learning_rate": 8.640760100631467e-06, "loss": 0.3588, "step": 2631 }, { "epoch": 2.2154882154882154, "grad_norm": 0.4537624418735504, "learning_rate": 8.639321179674958e-06, "loss": 0.375, "step": 2632 }, { "epoch": 2.2163299663299663, "grad_norm": 0.5841053128242493, "learning_rate": 8.637881617431255e-06, "loss": 0.379, "step": 2633 }, { "epoch": 2.217171717171717, "grad_norm": 0.424151748418808, "learning_rate": 8.636441414154027e-06, "loss": 0.3671, "step": 2634 }, { "epoch": 2.218013468013468, "grad_norm": 0.5366584062576294, "learning_rate": 8.635000570097053e-06, "loss": 0.3532, "step": 2635 }, { "epoch": 2.218855218855219, "grad_norm": 0.5179762840270996, "learning_rate": 8.633559085514221e-06, "loss": 0.3719, "step": 2636 }, { "epoch": 2.2196969696969697, "grad_norm": 0.4592147469520569, "learning_rate": 8.632116960659543e-06, "loss": 0.3549, "step": 2637 }, { "epoch": 2.2205387205387206, "grad_norm": 0.43801894783973694, "learning_rate": 8.63067419578713e-06, "loss": 0.356, "step": 2638 }, { "epoch": 2.2213804713804715, "grad_norm": 0.4400918781757355, "learning_rate": 8.629230791151218e-06, "loss": 0.3985, "step": 2639 }, { "epoch": 2.2222222222222223, "grad_norm": 0.39936885237693787, "learning_rate": 8.627786747006146e-06, "loss": 0.3519, "step": 2640 }, { "epoch": 2.223063973063973, "grad_norm": 0.4332900941371918, "learning_rate": 8.626342063606373e-06, "loss": 0.3523, "step": 2641 }, { "epoch": 2.223905723905724, "grad_norm": 0.4542820453643799, "learning_rate": 8.624896741206465e-06, "loss": 0.3618, "step": 2642 }, { "epoch": 2.224747474747475, "grad_norm": 0.438487708568573, "learning_rate": 8.623450780061106e-06, "loss": 0.3862, "step": 2643 }, { "epoch": 2.225589225589226, "grad_norm": 0.48666369915008545, "learning_rate": 8.622004180425088e-06, "loss": 0.3728, "step": 2644 }, { "epoch": 2.2264309764309766, "grad_norm": 0.47688236832618713, "learning_rate": 8.620556942553318e-06, "loss": 0.3555, "step": 2645 }, { "epoch": 2.227272727272727, "grad_norm": 0.41083911061286926, "learning_rate": 8.619109066700812e-06, "loss": 0.3489, "step": 2646 }, { "epoch": 2.228114478114478, "grad_norm": 0.462228000164032, "learning_rate": 8.617660553122707e-06, "loss": 0.3701, "step": 2647 }, { "epoch": 2.228956228956229, "grad_norm": 0.4574538767337799, "learning_rate": 8.616211402074242e-06, "loss": 0.3726, "step": 2648 }, { "epoch": 2.2297979797979797, "grad_norm": 0.4406462609767914, "learning_rate": 8.614761613810775e-06, "loss": 0.367, "step": 2649 }, { "epoch": 2.2306397306397305, "grad_norm": 0.44900715351104736, "learning_rate": 8.61331118858777e-06, "loss": 0.348, "step": 2650 }, { "epoch": 2.2314814814814814, "grad_norm": 0.5120633244514465, "learning_rate": 8.611860126660815e-06, "loss": 0.3673, "step": 2651 }, { "epoch": 2.2323232323232323, "grad_norm": 0.43567731976509094, "learning_rate": 8.610408428285594e-06, "loss": 0.3472, "step": 2652 }, { "epoch": 2.233164983164983, "grad_norm": 0.46917980909347534, "learning_rate": 8.608956093717917e-06, "loss": 0.3598, "step": 2653 }, { "epoch": 2.234006734006734, "grad_norm": 0.48992466926574707, "learning_rate": 8.6075031232137e-06, "loss": 0.3668, "step": 2654 }, { "epoch": 2.234848484848485, "grad_norm": 0.4966944456100464, "learning_rate": 8.606049517028971e-06, "loss": 0.3964, "step": 2655 }, { "epoch": 2.2356902356902357, "grad_norm": 0.4968997836112976, "learning_rate": 8.604595275419873e-06, "loss": 0.3742, "step": 2656 }, { "epoch": 2.2365319865319866, "grad_norm": 0.46171438694000244, "learning_rate": 8.603140398642658e-06, "loss": 0.3753, "step": 2657 }, { "epoch": 2.2373737373737375, "grad_norm": 0.4976307451725006, "learning_rate": 8.601684886953687e-06, "loss": 0.3632, "step": 2658 }, { "epoch": 2.2382154882154883, "grad_norm": 0.4802835285663605, "learning_rate": 8.600228740609442e-06, "loss": 0.3566, "step": 2659 }, { "epoch": 2.239057239057239, "grad_norm": 0.4229671061038971, "learning_rate": 8.59877195986651e-06, "loss": 0.3615, "step": 2660 }, { "epoch": 2.23989898989899, "grad_norm": 0.4723897874355316, "learning_rate": 8.59731454498159e-06, "loss": 0.3735, "step": 2661 }, { "epoch": 2.240740740740741, "grad_norm": 0.4598061442375183, "learning_rate": 8.595856496211494e-06, "loss": 0.3698, "step": 2662 }, { "epoch": 2.241582491582492, "grad_norm": 0.43274664878845215, "learning_rate": 8.594397813813147e-06, "loss": 0.3836, "step": 2663 }, { "epoch": 2.242424242424242, "grad_norm": 0.4117281436920166, "learning_rate": 8.592938498043583e-06, "loss": 0.3921, "step": 2664 }, { "epoch": 2.243265993265993, "grad_norm": 0.4220670461654663, "learning_rate": 8.591478549159951e-06, "loss": 0.3431, "step": 2665 }, { "epoch": 2.244107744107744, "grad_norm": 0.4228247404098511, "learning_rate": 8.590017967419506e-06, "loss": 0.3335, "step": 2666 }, { "epoch": 2.244949494949495, "grad_norm": 0.42118459939956665, "learning_rate": 8.588556753079621e-06, "loss": 0.3748, "step": 2667 }, { "epoch": 2.2457912457912457, "grad_norm": 0.449691504240036, "learning_rate": 8.58709490639778e-06, "loss": 0.3765, "step": 2668 }, { "epoch": 2.2466329966329965, "grad_norm": 0.4514300525188446, "learning_rate": 8.585632427631572e-06, "loss": 0.3612, "step": 2669 }, { "epoch": 2.2474747474747474, "grad_norm": 0.433359295129776, "learning_rate": 8.5841693170387e-06, "loss": 0.3693, "step": 2670 }, { "epoch": 2.2483164983164983, "grad_norm": 0.3981274962425232, "learning_rate": 8.582705574876984e-06, "loss": 0.375, "step": 2671 }, { "epoch": 2.249158249158249, "grad_norm": 0.41570964455604553, "learning_rate": 8.581241201404347e-06, "loss": 0.3699, "step": 2672 }, { "epoch": 2.25, "grad_norm": 0.46865251660346985, "learning_rate": 8.579776196878832e-06, "loss": 0.3427, "step": 2673 }, { "epoch": 2.250841750841751, "grad_norm": 0.4187881648540497, "learning_rate": 8.578310561558585e-06, "loss": 0.3883, "step": 2674 }, { "epoch": 2.2516835016835017, "grad_norm": 0.41839104890823364, "learning_rate": 8.576844295701864e-06, "loss": 0.3733, "step": 2675 }, { "epoch": 2.2525252525252526, "grad_norm": 0.44272512197494507, "learning_rate": 8.575377399567047e-06, "loss": 0.3713, "step": 2676 }, { "epoch": 2.2533670033670035, "grad_norm": 0.4097166061401367, "learning_rate": 8.573909873412613e-06, "loss": 0.3579, "step": 2677 }, { "epoch": 2.2542087542087543, "grad_norm": 0.4155197739601135, "learning_rate": 8.572441717497156e-06, "loss": 0.3777, "step": 2678 }, { "epoch": 2.255050505050505, "grad_norm": 0.4347831606864929, "learning_rate": 8.570972932079382e-06, "loss": 0.3612, "step": 2679 }, { "epoch": 2.255892255892256, "grad_norm": 0.41619113087654114, "learning_rate": 8.569503517418105e-06, "loss": 0.3817, "step": 2680 }, { "epoch": 2.256734006734007, "grad_norm": 0.40353456139564514, "learning_rate": 8.568033473772253e-06, "loss": 0.3935, "step": 2681 }, { "epoch": 2.257575757575758, "grad_norm": 0.5146369338035583, "learning_rate": 8.566562801400861e-06, "loss": 0.4017, "step": 2682 }, { "epoch": 2.2584175084175087, "grad_norm": 0.4335944652557373, "learning_rate": 8.56509150056308e-06, "loss": 0.3557, "step": 2683 }, { "epoch": 2.259259259259259, "grad_norm": 0.48099225759506226, "learning_rate": 8.563619571518165e-06, "loss": 0.3716, "step": 2684 }, { "epoch": 2.26010101010101, "grad_norm": 0.5320502519607544, "learning_rate": 8.562147014525491e-06, "loss": 0.3673, "step": 2685 }, { "epoch": 2.260942760942761, "grad_norm": 0.43572765588760376, "learning_rate": 8.560673829844535e-06, "loss": 0.3748, "step": 2686 }, { "epoch": 2.2617845117845117, "grad_norm": 0.48375627398490906, "learning_rate": 8.559200017734889e-06, "loss": 0.3681, "step": 2687 }, { "epoch": 2.2626262626262625, "grad_norm": 0.4549145996570587, "learning_rate": 8.557725578456253e-06, "loss": 0.37, "step": 2688 }, { "epoch": 2.2634680134680134, "grad_norm": 0.43778035044670105, "learning_rate": 8.55625051226844e-06, "loss": 0.3586, "step": 2689 }, { "epoch": 2.2643097643097643, "grad_norm": 0.4487132728099823, "learning_rate": 8.554774819431371e-06, "loss": 0.361, "step": 2690 }, { "epoch": 2.265151515151515, "grad_norm": 0.47270819544792175, "learning_rate": 8.553298500205081e-06, "loss": 0.3882, "step": 2691 }, { "epoch": 2.265993265993266, "grad_norm": 0.435702383518219, "learning_rate": 8.55182155484971e-06, "loss": 0.3718, "step": 2692 }, { "epoch": 2.266835016835017, "grad_norm": 0.4084170162677765, "learning_rate": 8.550343983625517e-06, "loss": 0.3891, "step": 2693 }, { "epoch": 2.2676767676767677, "grad_norm": 0.43827226758003235, "learning_rate": 8.54886578679286e-06, "loss": 0.3543, "step": 2694 }, { "epoch": 2.2685185185185186, "grad_norm": 0.41249117255210876, "learning_rate": 8.547386964612217e-06, "loss": 0.364, "step": 2695 }, { "epoch": 2.2693602693602695, "grad_norm": 0.399693101644516, "learning_rate": 8.54590751734417e-06, "loss": 0.3732, "step": 2696 }, { "epoch": 2.2702020202020203, "grad_norm": 0.4367385804653168, "learning_rate": 8.544427445249412e-06, "loss": 0.3649, "step": 2697 }, { "epoch": 2.271043771043771, "grad_norm": 0.43308570981025696, "learning_rate": 8.542946748588752e-06, "loss": 0.3686, "step": 2698 }, { "epoch": 2.271885521885522, "grad_norm": 0.41239649057388306, "learning_rate": 8.541465427623102e-06, "loss": 0.3622, "step": 2699 }, { "epoch": 2.2727272727272725, "grad_norm": 0.4432728588581085, "learning_rate": 8.539983482613486e-06, "loss": 0.385, "step": 2700 }, { "epoch": 2.2735690235690234, "grad_norm": 0.44088566303253174, "learning_rate": 8.538500913821037e-06, "loss": 0.3521, "step": 2701 }, { "epoch": 2.274410774410774, "grad_norm": 0.5150613188743591, "learning_rate": 8.537017721507004e-06, "loss": 0.3745, "step": 2702 }, { "epoch": 2.275252525252525, "grad_norm": 0.42920464277267456, "learning_rate": 8.535533905932739e-06, "loss": 0.3712, "step": 2703 }, { "epoch": 2.276094276094276, "grad_norm": 0.476984441280365, "learning_rate": 8.534049467359706e-06, "loss": 0.3589, "step": 2704 }, { "epoch": 2.276936026936027, "grad_norm": 0.4296610653400421, "learning_rate": 8.532564406049477e-06, "loss": 0.3656, "step": 2705 }, { "epoch": 2.2777777777777777, "grad_norm": 0.4474858343601227, "learning_rate": 8.531078722263739e-06, "loss": 0.372, "step": 2706 }, { "epoch": 2.2786195286195285, "grad_norm": 0.4567042589187622, "learning_rate": 8.529592416264283e-06, "loss": 0.3961, "step": 2707 }, { "epoch": 2.2794612794612794, "grad_norm": 0.4596327841281891, "learning_rate": 8.528105488313013e-06, "loss": 0.3669, "step": 2708 }, { "epoch": 2.2803030303030303, "grad_norm": 0.42659616470336914, "learning_rate": 8.526617938671941e-06, "loss": 0.3605, "step": 2709 }, { "epoch": 2.281144781144781, "grad_norm": 0.421028196811676, "learning_rate": 8.52512976760319e-06, "loss": 0.3763, "step": 2710 }, { "epoch": 2.281986531986532, "grad_norm": 0.5101936459541321, "learning_rate": 8.523640975368991e-06, "loss": 0.386, "step": 2711 }, { "epoch": 2.282828282828283, "grad_norm": 0.40985047817230225, "learning_rate": 8.522151562231687e-06, "loss": 0.4006, "step": 2712 }, { "epoch": 2.2836700336700337, "grad_norm": 0.533717930316925, "learning_rate": 8.520661528453725e-06, "loss": 0.3703, "step": 2713 }, { "epoch": 2.2845117845117846, "grad_norm": 0.44273102283477783, "learning_rate": 8.519170874297665e-06, "loss": 0.3767, "step": 2714 }, { "epoch": 2.2853535353535355, "grad_norm": 0.45007961988449097, "learning_rate": 8.517679600026179e-06, "loss": 0.4132, "step": 2715 }, { "epoch": 2.2861952861952863, "grad_norm": 0.47788792848587036, "learning_rate": 8.516187705902043e-06, "loss": 0.3503, "step": 2716 }, { "epoch": 2.287037037037037, "grad_norm": 0.46674251556396484, "learning_rate": 8.514695192188147e-06, "loss": 0.3801, "step": 2717 }, { "epoch": 2.287878787878788, "grad_norm": 0.4608801007270813, "learning_rate": 8.513202059147485e-06, "loss": 0.3749, "step": 2718 }, { "epoch": 2.288720538720539, "grad_norm": 0.48542287945747375, "learning_rate": 8.511708307043166e-06, "loss": 0.3625, "step": 2719 }, { "epoch": 2.28956228956229, "grad_norm": 0.4253634512424469, "learning_rate": 8.510213936138403e-06, "loss": 0.383, "step": 2720 }, { "epoch": 2.29040404040404, "grad_norm": 0.43048369884490967, "learning_rate": 8.50871894669652e-06, "loss": 0.3622, "step": 2721 }, { "epoch": 2.291245791245791, "grad_norm": 0.43434301018714905, "learning_rate": 8.507223338980952e-06, "loss": 0.391, "step": 2722 }, { "epoch": 2.292087542087542, "grad_norm": 0.42083221673965454, "learning_rate": 8.505727113255238e-06, "loss": 0.3768, "step": 2723 }, { "epoch": 2.292929292929293, "grad_norm": 0.49777618050575256, "learning_rate": 8.504230269783032e-06, "loss": 0.3745, "step": 2724 }, { "epoch": 2.2937710437710437, "grad_norm": 0.4253290295600891, "learning_rate": 8.502732808828092e-06, "loss": 0.3619, "step": 2725 }, { "epoch": 2.2946127946127945, "grad_norm": 0.49169594049453735, "learning_rate": 8.501234730654287e-06, "loss": 0.4063, "step": 2726 }, { "epoch": 2.2954545454545454, "grad_norm": 0.43514108657836914, "learning_rate": 8.499736035525593e-06, "loss": 0.3639, "step": 2727 }, { "epoch": 2.2962962962962963, "grad_norm": 0.4125373661518097, "learning_rate": 8.4982367237061e-06, "loss": 0.3739, "step": 2728 }, { "epoch": 2.297138047138047, "grad_norm": 0.44994792342185974, "learning_rate": 8.496736795459997e-06, "loss": 0.3546, "step": 2729 }, { "epoch": 2.297979797979798, "grad_norm": 0.4378022849559784, "learning_rate": 8.495236251051594e-06, "loss": 0.3752, "step": 2730 }, { "epoch": 2.298821548821549, "grad_norm": 0.455334335565567, "learning_rate": 8.493735090745295e-06, "loss": 0.3696, "step": 2731 }, { "epoch": 2.2996632996632997, "grad_norm": 0.45003774762153625, "learning_rate": 8.492233314805628e-06, "loss": 0.379, "step": 2732 }, { "epoch": 2.3005050505050506, "grad_norm": 0.4100571870803833, "learning_rate": 8.490730923497219e-06, "loss": 0.3842, "step": 2733 }, { "epoch": 2.3013468013468015, "grad_norm": 0.44802021980285645, "learning_rate": 8.489227917084804e-06, "loss": 0.3614, "step": 2734 }, { "epoch": 2.3021885521885523, "grad_norm": 0.4223231077194214, "learning_rate": 8.487724295833229e-06, "loss": 0.3785, "step": 2735 }, { "epoch": 2.303030303030303, "grad_norm": 0.4955383837223053, "learning_rate": 8.486220060007451e-06, "loss": 0.3636, "step": 2736 }, { "epoch": 2.3038720538720536, "grad_norm": 0.4502008855342865, "learning_rate": 8.484715209872528e-06, "loss": 0.3712, "step": 2737 }, { "epoch": 2.3047138047138045, "grad_norm": 0.5721653699874878, "learning_rate": 8.483209745693634e-06, "loss": 0.3907, "step": 2738 }, { "epoch": 2.3055555555555554, "grad_norm": 0.4510665833950043, "learning_rate": 8.481703667736047e-06, "loss": 0.3599, "step": 2739 }, { "epoch": 2.3063973063973062, "grad_norm": 0.4556238055229187, "learning_rate": 8.480196976265155e-06, "loss": 0.372, "step": 2740 }, { "epoch": 2.307239057239057, "grad_norm": 0.4739348292350769, "learning_rate": 8.478689671546452e-06, "loss": 0.38, "step": 2741 }, { "epoch": 2.308080808080808, "grad_norm": 0.4744263291358948, "learning_rate": 8.477181753845539e-06, "loss": 0.3855, "step": 2742 }, { "epoch": 2.308922558922559, "grad_norm": 0.4669438302516937, "learning_rate": 8.47567322342813e-06, "loss": 0.3462, "step": 2743 }, { "epoch": 2.3097643097643097, "grad_norm": 0.46501031517982483, "learning_rate": 8.474164080560044e-06, "loss": 0.3807, "step": 2744 }, { "epoch": 2.3106060606060606, "grad_norm": 0.4176958501338959, "learning_rate": 8.472654325507206e-06, "loss": 0.3617, "step": 2745 }, { "epoch": 2.3114478114478114, "grad_norm": 0.48751410841941833, "learning_rate": 8.471143958535655e-06, "loss": 0.3567, "step": 2746 }, { "epoch": 2.3122895622895623, "grad_norm": 0.41177037358283997, "learning_rate": 8.469632979911529e-06, "loss": 0.3792, "step": 2747 }, { "epoch": 2.313131313131313, "grad_norm": 0.47616103291511536, "learning_rate": 8.468121389901082e-06, "loss": 0.3951, "step": 2748 }, { "epoch": 2.313973063973064, "grad_norm": 0.40859392285346985, "learning_rate": 8.46660918877067e-06, "loss": 0.3518, "step": 2749 }, { "epoch": 2.314814814814815, "grad_norm": 0.4390644133090973, "learning_rate": 8.46509637678676e-06, "loss": 0.3435, "step": 2750 }, { "epoch": 2.3156565656565657, "grad_norm": 0.4682598114013672, "learning_rate": 8.463582954215928e-06, "loss": 0.387, "step": 2751 }, { "epoch": 2.3164983164983166, "grad_norm": 0.423617422580719, "learning_rate": 8.46206892132485e-06, "loss": 0.3425, "step": 2752 }, { "epoch": 2.3173400673400675, "grad_norm": 0.43769192695617676, "learning_rate": 8.460554278380318e-06, "loss": 0.3562, "step": 2753 }, { "epoch": 2.3181818181818183, "grad_norm": 0.5029165744781494, "learning_rate": 8.459039025649228e-06, "loss": 0.3676, "step": 2754 }, { "epoch": 2.319023569023569, "grad_norm": 0.4670831561088562, "learning_rate": 8.457523163398583e-06, "loss": 0.3327, "step": 2755 }, { "epoch": 2.31986531986532, "grad_norm": 0.43827274441719055, "learning_rate": 8.456006691895495e-06, "loss": 0.3763, "step": 2756 }, { "epoch": 2.320707070707071, "grad_norm": 0.4235856831073761, "learning_rate": 8.454489611407182e-06, "loss": 0.3512, "step": 2757 }, { "epoch": 2.3215488215488214, "grad_norm": 0.44787102937698364, "learning_rate": 8.45297192220097e-06, "loss": 0.3777, "step": 2758 }, { "epoch": 2.3223905723905722, "grad_norm": 0.4188929498195648, "learning_rate": 8.45145362454429e-06, "loss": 0.3792, "step": 2759 }, { "epoch": 2.323232323232323, "grad_norm": 0.46617811918258667, "learning_rate": 8.449934718704686e-06, "loss": 0.3886, "step": 2760 }, { "epoch": 2.324074074074074, "grad_norm": 0.4610802233219147, "learning_rate": 8.448415204949803e-06, "loss": 0.3615, "step": 2761 }, { "epoch": 2.324915824915825, "grad_norm": 0.41926631331443787, "learning_rate": 8.446895083547395e-06, "loss": 0.3973, "step": 2762 }, { "epoch": 2.3257575757575757, "grad_norm": 0.4505683183670044, "learning_rate": 8.445374354765324e-06, "loss": 0.3972, "step": 2763 }, { "epoch": 2.3265993265993266, "grad_norm": 0.4605647027492523, "learning_rate": 8.44385301887156e-06, "loss": 0.3718, "step": 2764 }, { "epoch": 2.3274410774410774, "grad_norm": 0.47563737630844116, "learning_rate": 8.442331076134178e-06, "loss": 0.3608, "step": 2765 }, { "epoch": 2.3282828282828283, "grad_norm": 0.4204709529876709, "learning_rate": 8.44080852682136e-06, "loss": 0.3738, "step": 2766 }, { "epoch": 2.329124579124579, "grad_norm": 0.46845296025276184, "learning_rate": 8.439285371201396e-06, "loss": 0.3693, "step": 2767 }, { "epoch": 2.32996632996633, "grad_norm": 0.47830691933631897, "learning_rate": 8.437761609542682e-06, "loss": 0.3563, "step": 2768 }, { "epoch": 2.330808080808081, "grad_norm": 0.4349057674407959, "learning_rate": 8.436237242113721e-06, "loss": 0.364, "step": 2769 }, { "epoch": 2.3316498316498318, "grad_norm": 0.46624934673309326, "learning_rate": 8.43471226918312e-06, "loss": 0.3542, "step": 2770 }, { "epoch": 2.3324915824915826, "grad_norm": 0.48805028200149536, "learning_rate": 8.433186691019601e-06, "loss": 0.3556, "step": 2771 }, { "epoch": 2.3333333333333335, "grad_norm": 0.5049060583114624, "learning_rate": 8.431660507891985e-06, "loss": 0.3811, "step": 2772 }, { "epoch": 2.3341750841750843, "grad_norm": 0.4795832633972168, "learning_rate": 8.4301337200692e-06, "loss": 0.3512, "step": 2773 }, { "epoch": 2.3350168350168348, "grad_norm": 0.42766669392585754, "learning_rate": 8.428606327820283e-06, "loss": 0.3766, "step": 2774 }, { "epoch": 2.3358585858585856, "grad_norm": 0.45736730098724365, "learning_rate": 8.427078331414377e-06, "loss": 0.3629, "step": 2775 }, { "epoch": 2.3367003367003365, "grad_norm": 0.5139711499214172, "learning_rate": 8.425549731120734e-06, "loss": 0.3984, "step": 2776 }, { "epoch": 2.3375420875420874, "grad_norm": 0.44113942980766296, "learning_rate": 8.424020527208704e-06, "loss": 0.3775, "step": 2777 }, { "epoch": 2.3383838383838382, "grad_norm": 0.4391513764858246, "learning_rate": 8.422490719947753e-06, "loss": 0.3393, "step": 2778 }, { "epoch": 2.339225589225589, "grad_norm": 0.4640311300754547, "learning_rate": 8.420960309607449e-06, "loss": 0.3711, "step": 2779 }, { "epoch": 2.34006734006734, "grad_norm": 0.420686274766922, "learning_rate": 8.419429296457466e-06, "loss": 0.3669, "step": 2780 }, { "epoch": 2.340909090909091, "grad_norm": 0.443340003490448, "learning_rate": 8.417897680767586e-06, "loss": 0.3859, "step": 2781 }, { "epoch": 2.3417508417508417, "grad_norm": 0.4786008596420288, "learning_rate": 8.416365462807695e-06, "loss": 0.3822, "step": 2782 }, { "epoch": 2.3425925925925926, "grad_norm": 0.4365476369857788, "learning_rate": 8.414832642847785e-06, "loss": 0.3832, "step": 2783 }, { "epoch": 2.3434343434343434, "grad_norm": 0.5029516816139221, "learning_rate": 8.413299221157956e-06, "loss": 0.3984, "step": 2784 }, { "epoch": 2.3442760942760943, "grad_norm": 0.49432095885276794, "learning_rate": 8.411765198008414e-06, "loss": 0.3447, "step": 2785 }, { "epoch": 2.345117845117845, "grad_norm": 0.4307796359062195, "learning_rate": 8.41023057366947e-06, "loss": 0.377, "step": 2786 }, { "epoch": 2.345959595959596, "grad_norm": 0.5302203893661499, "learning_rate": 8.408695348411542e-06, "loss": 0.3865, "step": 2787 }, { "epoch": 2.346801346801347, "grad_norm": 0.4461498558521271, "learning_rate": 8.407159522505151e-06, "loss": 0.3414, "step": 2788 }, { "epoch": 2.3476430976430978, "grad_norm": 0.4796358048915863, "learning_rate": 8.405623096220928e-06, "loss": 0.3628, "step": 2789 }, { "epoch": 2.3484848484848486, "grad_norm": 0.43286406993865967, "learning_rate": 8.404086069829604e-06, "loss": 0.3889, "step": 2790 }, { "epoch": 2.3493265993265995, "grad_norm": 0.45573922991752625, "learning_rate": 8.402548443602026e-06, "loss": 0.3897, "step": 2791 }, { "epoch": 2.3501683501683504, "grad_norm": 0.47001177072525024, "learning_rate": 8.401010217809134e-06, "loss": 0.3639, "step": 2792 }, { "epoch": 2.351010101010101, "grad_norm": 0.42493200302124023, "learning_rate": 8.399471392721981e-06, "loss": 0.3655, "step": 2793 }, { "epoch": 2.351851851851852, "grad_norm": 0.4738049805164337, "learning_rate": 8.397931968611726e-06, "loss": 0.3835, "step": 2794 }, { "epoch": 2.3526936026936025, "grad_norm": 0.46020978689193726, "learning_rate": 8.396391945749631e-06, "loss": 0.3837, "step": 2795 }, { "epoch": 2.3535353535353534, "grad_norm": 0.5034357905387878, "learning_rate": 8.394851324407067e-06, "loss": 0.4099, "step": 2796 }, { "epoch": 2.3543771043771042, "grad_norm": 0.4679916203022003, "learning_rate": 8.393310104855505e-06, "loss": 0.3515, "step": 2797 }, { "epoch": 2.355218855218855, "grad_norm": 0.47853466868400574, "learning_rate": 8.391768287366524e-06, "loss": 0.3479, "step": 2798 }, { "epoch": 2.356060606060606, "grad_norm": 0.4434138834476471, "learning_rate": 8.39022587221181e-06, "loss": 0.3499, "step": 2799 }, { "epoch": 2.356902356902357, "grad_norm": 0.4552295506000519, "learning_rate": 8.388682859663153e-06, "loss": 0.379, "step": 2800 }, { "epoch": 2.3577441077441077, "grad_norm": 0.47132235765457153, "learning_rate": 8.387139249992448e-06, "loss": 0.3596, "step": 2801 }, { "epoch": 2.3585858585858586, "grad_norm": 0.4541162848472595, "learning_rate": 8.385595043471695e-06, "loss": 0.3875, "step": 2802 }, { "epoch": 2.3594276094276094, "grad_norm": 0.43694236874580383, "learning_rate": 8.384050240373e-06, "loss": 0.3796, "step": 2803 }, { "epoch": 2.3602693602693603, "grad_norm": 0.4768221080303192, "learning_rate": 8.382504840968575e-06, "loss": 0.3694, "step": 2804 }, { "epoch": 2.361111111111111, "grad_norm": 0.48954641819000244, "learning_rate": 8.380958845530733e-06, "loss": 0.3853, "step": 2805 }, { "epoch": 2.361952861952862, "grad_norm": 0.4364931881427765, "learning_rate": 8.379412254331898e-06, "loss": 0.363, "step": 2806 }, { "epoch": 2.362794612794613, "grad_norm": 0.45732566714286804, "learning_rate": 8.377865067644592e-06, "loss": 0.3608, "step": 2807 }, { "epoch": 2.3636363636363638, "grad_norm": 0.4313720166683197, "learning_rate": 8.376317285741452e-06, "loss": 0.3672, "step": 2808 }, { "epoch": 2.3644781144781146, "grad_norm": 0.4233641028404236, "learning_rate": 8.374768908895208e-06, "loss": 0.3706, "step": 2809 }, { "epoch": 2.3653198653198655, "grad_norm": 0.4445509612560272, "learning_rate": 8.373219937378703e-06, "loss": 0.386, "step": 2810 }, { "epoch": 2.3661616161616164, "grad_norm": 0.49122172594070435, "learning_rate": 8.371670371464879e-06, "loss": 0.3767, "step": 2811 }, { "epoch": 2.3670033670033668, "grad_norm": 0.44620662927627563, "learning_rate": 8.370120211426792e-06, "loss": 0.3713, "step": 2812 }, { "epoch": 2.3678451178451176, "grad_norm": 0.49821531772613525, "learning_rate": 8.368569457537593e-06, "loss": 0.3631, "step": 2813 }, { "epoch": 2.3686868686868685, "grad_norm": 0.44667282700538635, "learning_rate": 8.36701811007054e-06, "loss": 0.3443, "step": 2814 }, { "epoch": 2.3695286195286194, "grad_norm": 0.43990910053253174, "learning_rate": 8.365466169299e-06, "loss": 0.3783, "step": 2815 }, { "epoch": 2.3703703703703702, "grad_norm": 0.4429168701171875, "learning_rate": 8.363913635496442e-06, "loss": 0.3567, "step": 2816 }, { "epoch": 2.371212121212121, "grad_norm": 0.4994315505027771, "learning_rate": 8.362360508936433e-06, "loss": 0.385, "step": 2817 }, { "epoch": 2.372053872053872, "grad_norm": 0.5394458174705505, "learning_rate": 8.36080678989266e-06, "loss": 0.3796, "step": 2818 }, { "epoch": 2.372895622895623, "grad_norm": 0.390229195356369, "learning_rate": 8.359252478638894e-06, "loss": 0.3489, "step": 2819 }, { "epoch": 2.3737373737373737, "grad_norm": 0.5139751434326172, "learning_rate": 8.357697575449032e-06, "loss": 0.3484, "step": 2820 }, { "epoch": 2.3745791245791246, "grad_norm": 0.5222774147987366, "learning_rate": 8.356142080597058e-06, "loss": 0.3786, "step": 2821 }, { "epoch": 2.3754208754208754, "grad_norm": 0.4484851062297821, "learning_rate": 8.354585994357068e-06, "loss": 0.3778, "step": 2822 }, { "epoch": 2.3762626262626263, "grad_norm": 0.4827042520046234, "learning_rate": 8.35302931700326e-06, "loss": 0.355, "step": 2823 }, { "epoch": 2.377104377104377, "grad_norm": 0.4793928265571594, "learning_rate": 8.35147204880994e-06, "loss": 0.3758, "step": 2824 }, { "epoch": 2.377946127946128, "grad_norm": 0.4455835819244385, "learning_rate": 8.349914190051511e-06, "loss": 0.3783, "step": 2825 }, { "epoch": 2.378787878787879, "grad_norm": 0.4851279556751251, "learning_rate": 8.348355741002488e-06, "loss": 0.3587, "step": 2826 }, { "epoch": 2.3796296296296298, "grad_norm": 0.4284859001636505, "learning_rate": 8.346796701937486e-06, "loss": 0.3772, "step": 2827 }, { "epoch": 2.3804713804713806, "grad_norm": 0.4409014880657196, "learning_rate": 8.345237073131223e-06, "loss": 0.3713, "step": 2828 }, { "epoch": 2.3813131313131315, "grad_norm": 0.44839394092559814, "learning_rate": 8.343676854858522e-06, "loss": 0.3595, "step": 2829 }, { "epoch": 2.3821548821548824, "grad_norm": 0.42526134848594666, "learning_rate": 8.34211604739431e-06, "loss": 0.3756, "step": 2830 }, { "epoch": 2.3829966329966332, "grad_norm": 0.49746865034103394, "learning_rate": 8.340554651013621e-06, "loss": 0.3893, "step": 2831 }, { "epoch": 2.3838383838383836, "grad_norm": 0.4298902451992035, "learning_rate": 8.338992665991587e-06, "loss": 0.3593, "step": 2832 }, { "epoch": 2.3846801346801345, "grad_norm": 0.42933058738708496, "learning_rate": 8.337430092603447e-06, "loss": 0.3617, "step": 2833 }, { "epoch": 2.3855218855218854, "grad_norm": 0.43089598417282104, "learning_rate": 8.335866931124543e-06, "loss": 0.3579, "step": 2834 }, { "epoch": 2.3863636363636362, "grad_norm": 0.4418374300003052, "learning_rate": 8.334303181830319e-06, "loss": 0.359, "step": 2835 }, { "epoch": 2.387205387205387, "grad_norm": 0.43152740597724915, "learning_rate": 8.332738844996329e-06, "loss": 0.3723, "step": 2836 }, { "epoch": 2.388047138047138, "grad_norm": 0.40577518939971924, "learning_rate": 8.33117392089822e-06, "loss": 0.3957, "step": 2837 }, { "epoch": 2.388888888888889, "grad_norm": 0.4589220881462097, "learning_rate": 8.32960840981175e-06, "loss": 0.3871, "step": 2838 }, { "epoch": 2.3897306397306397, "grad_norm": 0.43566393852233887, "learning_rate": 8.328042312012782e-06, "loss": 0.3851, "step": 2839 }, { "epoch": 2.3905723905723906, "grad_norm": 0.4143589437007904, "learning_rate": 8.326475627777279e-06, "loss": 0.3844, "step": 2840 }, { "epoch": 2.3914141414141414, "grad_norm": 0.45368075370788574, "learning_rate": 8.324908357381302e-06, "loss": 0.36, "step": 2841 }, { "epoch": 2.3922558922558923, "grad_norm": 0.45294514298439026, "learning_rate": 8.323340501101027e-06, "loss": 0.3841, "step": 2842 }, { "epoch": 2.393097643097643, "grad_norm": 0.4428863823413849, "learning_rate": 8.321772059212721e-06, "loss": 0.365, "step": 2843 }, { "epoch": 2.393939393939394, "grad_norm": 0.43784964084625244, "learning_rate": 8.320203031992767e-06, "loss": 0.3784, "step": 2844 }, { "epoch": 2.394781144781145, "grad_norm": 0.44639381766319275, "learning_rate": 8.318633419717636e-06, "loss": 0.3575, "step": 2845 }, { "epoch": 2.3956228956228958, "grad_norm": 0.46495452523231506, "learning_rate": 8.317063222663919e-06, "loss": 0.4013, "step": 2846 }, { "epoch": 2.3964646464646466, "grad_norm": 0.47444775700569153, "learning_rate": 8.315492441108296e-06, "loss": 0.3583, "step": 2847 }, { "epoch": 2.3973063973063975, "grad_norm": 0.4205220639705658, "learning_rate": 8.313921075327557e-06, "loss": 0.3819, "step": 2848 }, { "epoch": 2.398148148148148, "grad_norm": 0.4739673137664795, "learning_rate": 8.312349125598593e-06, "loss": 0.3612, "step": 2849 }, { "epoch": 2.398989898989899, "grad_norm": 0.4994122385978699, "learning_rate": 8.3107765921984e-06, "loss": 0.3545, "step": 2850 }, { "epoch": 2.3998316498316496, "grad_norm": 0.4550198018550873, "learning_rate": 8.309203475404074e-06, "loss": 0.3637, "step": 2851 }, { "epoch": 2.4006734006734005, "grad_norm": 0.4627688229084015, "learning_rate": 8.307629775492813e-06, "loss": 0.3689, "step": 2852 }, { "epoch": 2.4015151515151514, "grad_norm": 0.4592593014240265, "learning_rate": 8.306055492741924e-06, "loss": 0.3709, "step": 2853 }, { "epoch": 2.4023569023569022, "grad_norm": 0.45587456226348877, "learning_rate": 8.304480627428807e-06, "loss": 0.3555, "step": 2854 }, { "epoch": 2.403198653198653, "grad_norm": 0.466807097196579, "learning_rate": 8.302905179830975e-06, "loss": 0.3774, "step": 2855 }, { "epoch": 2.404040404040404, "grad_norm": 0.42188185453414917, "learning_rate": 8.301329150226036e-06, "loss": 0.3798, "step": 2856 }, { "epoch": 2.404882154882155, "grad_norm": 0.48132556676864624, "learning_rate": 8.299752538891704e-06, "loss": 0.3756, "step": 2857 }, { "epoch": 2.4057239057239057, "grad_norm": 0.41933485865592957, "learning_rate": 8.298175346105795e-06, "loss": 0.3756, "step": 2858 }, { "epoch": 2.4065656565656566, "grad_norm": 0.4715639650821686, "learning_rate": 8.296597572146225e-06, "loss": 0.3523, "step": 2859 }, { "epoch": 2.4074074074074074, "grad_norm": 0.43858206272125244, "learning_rate": 8.29501921729102e-06, "loss": 0.3693, "step": 2860 }, { "epoch": 2.4082491582491583, "grad_norm": 0.399434894323349, "learning_rate": 8.293440281818297e-06, "loss": 0.3672, "step": 2861 }, { "epoch": 2.409090909090909, "grad_norm": 0.443985253572464, "learning_rate": 8.291860766006284e-06, "loss": 0.376, "step": 2862 }, { "epoch": 2.40993265993266, "grad_norm": 0.4311616122722626, "learning_rate": 8.290280670133306e-06, "loss": 0.3674, "step": 2863 }, { "epoch": 2.410774410774411, "grad_norm": 0.4235770106315613, "learning_rate": 8.288699994477798e-06, "loss": 0.4003, "step": 2864 }, { "epoch": 2.4116161616161618, "grad_norm": 0.42286762595176697, "learning_rate": 8.287118739318288e-06, "loss": 0.3572, "step": 2865 }, { "epoch": 2.4124579124579126, "grad_norm": 0.4014527499675751, "learning_rate": 8.28553690493341e-06, "loss": 0.3735, "step": 2866 }, { "epoch": 2.4132996632996635, "grad_norm": 0.3724038302898407, "learning_rate": 8.2839544916019e-06, "loss": 0.3453, "step": 2867 }, { "epoch": 2.4141414141414144, "grad_norm": 0.4067322909832001, "learning_rate": 8.282371499602597e-06, "loss": 0.3771, "step": 2868 }, { "epoch": 2.4149831649831652, "grad_norm": 0.44269928336143494, "learning_rate": 8.280787929214442e-06, "loss": 0.3883, "step": 2869 }, { "epoch": 2.4158249158249157, "grad_norm": 0.4675607681274414, "learning_rate": 8.279203780716476e-06, "loss": 0.3564, "step": 2870 }, { "epoch": 2.4166666666666665, "grad_norm": 0.4542387127876282, "learning_rate": 8.277619054387844e-06, "loss": 0.3836, "step": 2871 }, { "epoch": 2.4175084175084174, "grad_norm": 0.45024600625038147, "learning_rate": 8.276033750507788e-06, "loss": 0.3651, "step": 2872 }, { "epoch": 2.4183501683501682, "grad_norm": 0.4711262285709381, "learning_rate": 8.274447869355659e-06, "loss": 0.3777, "step": 2873 }, { "epoch": 2.419191919191919, "grad_norm": 0.41295936703681946, "learning_rate": 8.272861411210906e-06, "loss": 0.3791, "step": 2874 }, { "epoch": 2.42003367003367, "grad_norm": 0.47096699476242065, "learning_rate": 8.27127437635308e-06, "loss": 0.3679, "step": 2875 }, { "epoch": 2.420875420875421, "grad_norm": 0.5084740519523621, "learning_rate": 8.26968676506183e-06, "loss": 0.3483, "step": 2876 }, { "epoch": 2.4217171717171717, "grad_norm": 0.40911829471588135, "learning_rate": 8.268098577616914e-06, "loss": 0.3791, "step": 2877 }, { "epoch": 2.4225589225589226, "grad_norm": 0.5524013638496399, "learning_rate": 8.266509814298184e-06, "loss": 0.3402, "step": 2878 }, { "epoch": 2.4234006734006734, "grad_norm": 0.46703094244003296, "learning_rate": 8.264920475385603e-06, "loss": 0.3575, "step": 2879 }, { "epoch": 2.4242424242424243, "grad_norm": 0.4444088041782379, "learning_rate": 8.263330561159222e-06, "loss": 0.3842, "step": 2880 }, { "epoch": 2.425084175084175, "grad_norm": 0.4788113236427307, "learning_rate": 8.261740071899206e-06, "loss": 0.353, "step": 2881 }, { "epoch": 2.425925925925926, "grad_norm": 0.41994011402130127, "learning_rate": 8.260149007885814e-06, "loss": 0.3801, "step": 2882 }, { "epoch": 2.426767676767677, "grad_norm": 0.4592745900154114, "learning_rate": 8.25855736939941e-06, "loss": 0.3848, "step": 2883 }, { "epoch": 2.4276094276094278, "grad_norm": 0.4554484188556671, "learning_rate": 8.256965156720455e-06, "loss": 0.3543, "step": 2884 }, { "epoch": 2.4284511784511786, "grad_norm": 0.4193292558193207, "learning_rate": 8.255372370129516e-06, "loss": 0.3644, "step": 2885 }, { "epoch": 2.429292929292929, "grad_norm": 0.44576141238212585, "learning_rate": 8.253779009907258e-06, "loss": 0.3763, "step": 2886 }, { "epoch": 2.43013468013468, "grad_norm": 0.43041470646858215, "learning_rate": 8.25218507633445e-06, "loss": 0.3905, "step": 2887 }, { "epoch": 2.430976430976431, "grad_norm": 0.44574400782585144, "learning_rate": 8.25059056969196e-06, "loss": 0.3757, "step": 2888 }, { "epoch": 2.4318181818181817, "grad_norm": 0.41673749685287476, "learning_rate": 8.248995490260752e-06, "loss": 0.3672, "step": 2889 }, { "epoch": 2.4326599326599325, "grad_norm": 0.4801325500011444, "learning_rate": 8.247399838321902e-06, "loss": 0.3568, "step": 2890 }, { "epoch": 2.4335016835016834, "grad_norm": 0.4393468201160431, "learning_rate": 8.245803614156576e-06, "loss": 0.3436, "step": 2891 }, { "epoch": 2.4343434343434343, "grad_norm": 0.4239155650138855, "learning_rate": 8.24420681804605e-06, "loss": 0.369, "step": 2892 }, { "epoch": 2.435185185185185, "grad_norm": 0.5074912309646606, "learning_rate": 8.242609450271694e-06, "loss": 0.37, "step": 2893 }, { "epoch": 2.436026936026936, "grad_norm": 0.49302029609680176, "learning_rate": 8.241011511114984e-06, "loss": 0.3751, "step": 2894 }, { "epoch": 2.436868686868687, "grad_norm": 0.446073979139328, "learning_rate": 8.239413000857492e-06, "loss": 0.367, "step": 2895 }, { "epoch": 2.4377104377104377, "grad_norm": 0.4640878438949585, "learning_rate": 8.23781391978089e-06, "loss": 0.3958, "step": 2896 }, { "epoch": 2.4385521885521886, "grad_norm": 0.4295414984226227, "learning_rate": 8.236214268166958e-06, "loss": 0.3621, "step": 2897 }, { "epoch": 2.4393939393939394, "grad_norm": 0.4862079620361328, "learning_rate": 8.234614046297568e-06, "loss": 0.373, "step": 2898 }, { "epoch": 2.4402356902356903, "grad_norm": 0.4140170216560364, "learning_rate": 8.233013254454699e-06, "loss": 0.3782, "step": 2899 }, { "epoch": 2.441077441077441, "grad_norm": 0.3891260623931885, "learning_rate": 8.231411892920424e-06, "loss": 0.3415, "step": 2900 }, { "epoch": 2.441919191919192, "grad_norm": 0.43555697798728943, "learning_rate": 8.229809961976924e-06, "loss": 0.3734, "step": 2901 }, { "epoch": 2.442760942760943, "grad_norm": 0.4492727220058441, "learning_rate": 8.228207461906473e-06, "loss": 0.3672, "step": 2902 }, { "epoch": 2.4436026936026938, "grad_norm": 0.4199630916118622, "learning_rate": 8.22660439299145e-06, "loss": 0.3468, "step": 2903 }, { "epoch": 2.4444444444444446, "grad_norm": 0.5019910335540771, "learning_rate": 8.225000755514332e-06, "loss": 0.3396, "step": 2904 }, { "epoch": 2.4452861952861955, "grad_norm": 0.4933430850505829, "learning_rate": 8.223396549757699e-06, "loss": 0.3794, "step": 2905 }, { "epoch": 2.4461279461279464, "grad_norm": 0.5259891748428345, "learning_rate": 8.221791776004229e-06, "loss": 0.3671, "step": 2906 }, { "epoch": 2.446969696969697, "grad_norm": 0.4581696689128876, "learning_rate": 8.220186434536697e-06, "loss": 0.3439, "step": 2907 }, { "epoch": 2.4478114478114477, "grad_norm": 0.5507912039756775, "learning_rate": 8.218580525637984e-06, "loss": 0.3663, "step": 2908 }, { "epoch": 2.4486531986531985, "grad_norm": 0.508431613445282, "learning_rate": 8.216974049591069e-06, "loss": 0.3898, "step": 2909 }, { "epoch": 2.4494949494949494, "grad_norm": 0.47607240080833435, "learning_rate": 8.215367006679027e-06, "loss": 0.3472, "step": 2910 }, { "epoch": 2.4503367003367003, "grad_norm": 0.527305543422699, "learning_rate": 8.213759397185039e-06, "loss": 0.378, "step": 2911 }, { "epoch": 2.451178451178451, "grad_norm": 0.5522343516349792, "learning_rate": 8.212151221392383e-06, "loss": 0.3645, "step": 2912 }, { "epoch": 2.452020202020202, "grad_norm": 0.44636109471321106, "learning_rate": 8.210542479584434e-06, "loss": 0.3795, "step": 2913 }, { "epoch": 2.452861952861953, "grad_norm": 0.5222830176353455, "learning_rate": 8.208933172044672e-06, "loss": 0.3787, "step": 2914 }, { "epoch": 2.4537037037037037, "grad_norm": 0.5093931555747986, "learning_rate": 8.207323299056673e-06, "loss": 0.3674, "step": 2915 }, { "epoch": 2.4545454545454546, "grad_norm": 0.3965558111667633, "learning_rate": 8.205712860904114e-06, "loss": 0.3631, "step": 2916 }, { "epoch": 2.4553872053872055, "grad_norm": 0.49131932854652405, "learning_rate": 8.204101857870772e-06, "loss": 0.3632, "step": 2917 }, { "epoch": 2.4562289562289563, "grad_norm": 0.46274060010910034, "learning_rate": 8.202490290240521e-06, "loss": 0.3813, "step": 2918 }, { "epoch": 2.457070707070707, "grad_norm": 0.4568852186203003, "learning_rate": 8.20087815829734e-06, "loss": 0.3572, "step": 2919 }, { "epoch": 2.457912457912458, "grad_norm": 0.42516106367111206, "learning_rate": 8.199265462325301e-06, "loss": 0.3711, "step": 2920 }, { "epoch": 2.458754208754209, "grad_norm": 0.41925328969955444, "learning_rate": 8.197652202608577e-06, "loss": 0.3559, "step": 2921 }, { "epoch": 2.45959595959596, "grad_norm": 0.44218960404396057, "learning_rate": 8.196038379431446e-06, "loss": 0.371, "step": 2922 }, { "epoch": 2.46043771043771, "grad_norm": 0.44585686922073364, "learning_rate": 8.194423993078275e-06, "loss": 0.3835, "step": 2923 }, { "epoch": 2.461279461279461, "grad_norm": 0.4255709648132324, "learning_rate": 8.192809043833544e-06, "loss": 0.3996, "step": 2924 }, { "epoch": 2.462121212121212, "grad_norm": 0.4331108629703522, "learning_rate": 8.191193531981815e-06, "loss": 0.3768, "step": 2925 }, { "epoch": 2.462962962962963, "grad_norm": 0.4597373902797699, "learning_rate": 8.189577457807765e-06, "loss": 0.3921, "step": 2926 }, { "epoch": 2.4638047138047137, "grad_norm": 0.4470524489879608, "learning_rate": 8.18796082159616e-06, "loss": 0.3698, "step": 2927 }, { "epoch": 2.4646464646464645, "grad_norm": 0.46538591384887695, "learning_rate": 8.186343623631872e-06, "loss": 0.3583, "step": 2928 }, { "epoch": 2.4654882154882154, "grad_norm": 0.4230930209159851, "learning_rate": 8.184725864199865e-06, "loss": 0.3483, "step": 2929 }, { "epoch": 2.4663299663299663, "grad_norm": 0.450244277715683, "learning_rate": 8.183107543585209e-06, "loss": 0.3631, "step": 2930 }, { "epoch": 2.467171717171717, "grad_norm": 0.4359808564186096, "learning_rate": 8.181488662073068e-06, "loss": 0.3712, "step": 2931 }, { "epoch": 2.468013468013468, "grad_norm": 0.44469982385635376, "learning_rate": 8.179869219948701e-06, "loss": 0.3697, "step": 2932 }, { "epoch": 2.468855218855219, "grad_norm": 0.44240525364875793, "learning_rate": 8.178249217497481e-06, "loss": 0.384, "step": 2933 }, { "epoch": 2.4696969696969697, "grad_norm": 0.42862287163734436, "learning_rate": 8.17662865500486e-06, "loss": 0.3888, "step": 2934 }, { "epoch": 2.4705387205387206, "grad_norm": 0.45106080174446106, "learning_rate": 8.175007532756405e-06, "loss": 0.3477, "step": 2935 }, { "epoch": 2.4713804713804715, "grad_norm": 0.4235813319683075, "learning_rate": 8.173385851037771e-06, "loss": 0.3706, "step": 2936 }, { "epoch": 2.4722222222222223, "grad_norm": 0.43820127844810486, "learning_rate": 8.171763610134717e-06, "loss": 0.3815, "step": 2937 }, { "epoch": 2.473063973063973, "grad_norm": 0.40920689702033997, "learning_rate": 8.170140810333103e-06, "loss": 0.3891, "step": 2938 }, { "epoch": 2.473905723905724, "grad_norm": 0.4354347586631775, "learning_rate": 8.168517451918875e-06, "loss": 0.3812, "step": 2939 }, { "epoch": 2.474747474747475, "grad_norm": 0.42839792370796204, "learning_rate": 8.166893535178095e-06, "loss": 0.3971, "step": 2940 }, { "epoch": 2.475589225589226, "grad_norm": 0.4318980872631073, "learning_rate": 8.165269060396908e-06, "loss": 0.3509, "step": 2941 }, { "epoch": 2.4764309764309766, "grad_norm": 0.39547377824783325, "learning_rate": 8.163644027861568e-06, "loss": 0.3759, "step": 2942 }, { "epoch": 2.4772727272727275, "grad_norm": 0.415758341550827, "learning_rate": 8.16201843785842e-06, "loss": 0.4078, "step": 2943 }, { "epoch": 2.478114478114478, "grad_norm": 0.45722073316574097, "learning_rate": 8.160392290673913e-06, "loss": 0.3683, "step": 2944 }, { "epoch": 2.478956228956229, "grad_norm": 0.4085814952850342, "learning_rate": 8.158765586594588e-06, "loss": 0.3642, "step": 2945 }, { "epoch": 2.4797979797979797, "grad_norm": 0.4495580494403839, "learning_rate": 8.157138325907093e-06, "loss": 0.3908, "step": 2946 }, { "epoch": 2.4806397306397305, "grad_norm": 0.41854435205459595, "learning_rate": 8.155510508898163e-06, "loss": 0.3894, "step": 2947 }, { "epoch": 2.4814814814814814, "grad_norm": 0.4551297724246979, "learning_rate": 8.153882135854641e-06, "loss": 0.3539, "step": 2948 }, { "epoch": 2.4823232323232323, "grad_norm": 0.42625024914741516, "learning_rate": 8.152253207063458e-06, "loss": 0.3595, "step": 2949 }, { "epoch": 2.483164983164983, "grad_norm": 0.44179585576057434, "learning_rate": 8.150623722811656e-06, "loss": 0.3732, "step": 2950 }, { "epoch": 2.484006734006734, "grad_norm": 0.4420273005962372, "learning_rate": 8.148993683386363e-06, "loss": 0.3716, "step": 2951 }, { "epoch": 2.484848484848485, "grad_norm": 0.41699445247650146, "learning_rate": 8.14736308907481e-06, "loss": 0.365, "step": 2952 }, { "epoch": 2.4856902356902357, "grad_norm": 0.4258747398853302, "learning_rate": 8.145731940164325e-06, "loss": 0.359, "step": 2953 }, { "epoch": 2.4865319865319866, "grad_norm": 0.45052242279052734, "learning_rate": 8.144100236942334e-06, "loss": 0.3837, "step": 2954 }, { "epoch": 2.4873737373737375, "grad_norm": 0.40887463092803955, "learning_rate": 8.14246797969636e-06, "loss": 0.3432, "step": 2955 }, { "epoch": 2.4882154882154883, "grad_norm": 0.45227575302124023, "learning_rate": 8.140835168714027e-06, "loss": 0.3612, "step": 2956 }, { "epoch": 2.489057239057239, "grad_norm": 0.46806004643440247, "learning_rate": 8.139201804283051e-06, "loss": 0.3646, "step": 2957 }, { "epoch": 2.48989898989899, "grad_norm": 0.4381774067878723, "learning_rate": 8.137567886691248e-06, "loss": 0.3551, "step": 2958 }, { "epoch": 2.490740740740741, "grad_norm": 0.43013325333595276, "learning_rate": 8.135933416226533e-06, "loss": 0.3787, "step": 2959 }, { "epoch": 2.4915824915824913, "grad_norm": 0.46190452575683594, "learning_rate": 8.134298393176916e-06, "loss": 0.365, "step": 2960 }, { "epoch": 2.492424242424242, "grad_norm": 0.5419265031814575, "learning_rate": 8.132662817830506e-06, "loss": 0.3654, "step": 2961 }, { "epoch": 2.493265993265993, "grad_norm": 0.4122037887573242, "learning_rate": 8.13102669047551e-06, "loss": 0.3781, "step": 2962 }, { "epoch": 2.494107744107744, "grad_norm": 0.4667261838912964, "learning_rate": 8.129390011400228e-06, "loss": 0.3875, "step": 2963 }, { "epoch": 2.494949494949495, "grad_norm": 0.4759286940097809, "learning_rate": 8.127752780893063e-06, "loss": 0.3549, "step": 2964 }, { "epoch": 2.4957912457912457, "grad_norm": 0.4388701915740967, "learning_rate": 8.126114999242513e-06, "loss": 0.3755, "step": 2965 }, { "epoch": 2.4966329966329965, "grad_norm": 0.42160648107528687, "learning_rate": 8.124476666737169e-06, "loss": 0.3433, "step": 2966 }, { "epoch": 2.4974747474747474, "grad_norm": 0.4886862337589264, "learning_rate": 8.122837783665727e-06, "loss": 0.366, "step": 2967 }, { "epoch": 2.4983164983164983, "grad_norm": 0.43775278329849243, "learning_rate": 8.121198350316973e-06, "loss": 0.3616, "step": 2968 }, { "epoch": 2.499158249158249, "grad_norm": 0.43876489996910095, "learning_rate": 8.119558366979794e-06, "loss": 0.3509, "step": 2969 }, { "epoch": 2.5, "grad_norm": 0.4331294894218445, "learning_rate": 8.11791783394317e-06, "loss": 0.3622, "step": 2970 }, { "epoch": 2.500841750841751, "grad_norm": 0.4686462879180908, "learning_rate": 8.116276751496182e-06, "loss": 0.3798, "step": 2971 }, { "epoch": 2.5016835016835017, "grad_norm": 0.41695278882980347, "learning_rate": 8.11463511992801e-06, "loss": 0.3741, "step": 2972 }, { "epoch": 2.5025252525252526, "grad_norm": 0.42829352617263794, "learning_rate": 8.11299293952792e-06, "loss": 0.3975, "step": 2973 }, { "epoch": 2.5033670033670035, "grad_norm": 0.39887192845344543, "learning_rate": 8.111350210585284e-06, "loss": 0.3395, "step": 2974 }, { "epoch": 2.5042087542087543, "grad_norm": 0.4127504527568817, "learning_rate": 8.109706933389573e-06, "loss": 0.377, "step": 2975 }, { "epoch": 2.505050505050505, "grad_norm": 0.40520501136779785, "learning_rate": 8.108063108230346e-06, "loss": 0.3486, "step": 2976 }, { "epoch": 2.505892255892256, "grad_norm": 0.4276700019836426, "learning_rate": 8.106418735397261e-06, "loss": 0.3597, "step": 2977 }, { "epoch": 2.506734006734007, "grad_norm": 0.40231189131736755, "learning_rate": 8.104773815180079e-06, "loss": 0.3718, "step": 2978 }, { "epoch": 2.507575757575758, "grad_norm": 0.4326150715351105, "learning_rate": 8.103128347868646e-06, "loss": 0.3435, "step": 2979 }, { "epoch": 2.5084175084175087, "grad_norm": 0.41611412167549133, "learning_rate": 8.101482333752915e-06, "loss": 0.352, "step": 2980 }, { "epoch": 2.5092592592592595, "grad_norm": 0.3909856975078583, "learning_rate": 8.099835773122933e-06, "loss": 0.3777, "step": 2981 }, { "epoch": 2.51010101010101, "grad_norm": 0.39767345786094666, "learning_rate": 8.098188666268836e-06, "loss": 0.3615, "step": 2982 }, { "epoch": 2.510942760942761, "grad_norm": 0.45483967661857605, "learning_rate": 8.096541013480868e-06, "loss": 0.3684, "step": 2983 }, { "epoch": 2.5117845117845117, "grad_norm": 0.4205494523048401, "learning_rate": 8.094892815049358e-06, "loss": 0.3659, "step": 2984 }, { "epoch": 2.5126262626262625, "grad_norm": 0.44642215967178345, "learning_rate": 8.093244071264739e-06, "loss": 0.3848, "step": 2985 }, { "epoch": 2.5134680134680134, "grad_norm": 0.43247202038764954, "learning_rate": 8.091594782417536e-06, "loss": 0.3416, "step": 2986 }, { "epoch": 2.5143097643097643, "grad_norm": 0.3977932035923004, "learning_rate": 8.08994494879837e-06, "loss": 0.3778, "step": 2987 }, { "epoch": 2.515151515151515, "grad_norm": 0.391245573759079, "learning_rate": 8.088294570697962e-06, "loss": 0.3557, "step": 2988 }, { "epoch": 2.515993265993266, "grad_norm": 0.40673020482063293, "learning_rate": 8.086643648407125e-06, "loss": 0.374, "step": 2989 }, { "epoch": 2.516835016835017, "grad_norm": 0.40771085023880005, "learning_rate": 8.084992182216768e-06, "loss": 0.3693, "step": 2990 }, { "epoch": 2.5176767676767677, "grad_norm": 0.398529976606369, "learning_rate": 8.0833401724179e-06, "loss": 0.3705, "step": 2991 }, { "epoch": 2.5185185185185186, "grad_norm": 0.43515104055404663, "learning_rate": 8.081687619301621e-06, "loss": 0.3602, "step": 2992 }, { "epoch": 2.5193602693602695, "grad_norm": 0.40613800287246704, "learning_rate": 8.080034523159126e-06, "loss": 0.3536, "step": 2993 }, { "epoch": 2.5202020202020203, "grad_norm": 0.4341198205947876, "learning_rate": 8.078380884281713e-06, "loss": 0.3643, "step": 2994 }, { "epoch": 2.521043771043771, "grad_norm": 0.4314141273498535, "learning_rate": 8.076726702960767e-06, "loss": 0.372, "step": 2995 }, { "epoch": 2.5218855218855216, "grad_norm": 0.46190690994262695, "learning_rate": 8.075071979487774e-06, "loss": 0.3546, "step": 2996 }, { "epoch": 2.5227272727272725, "grad_norm": 0.46209272742271423, "learning_rate": 8.073416714154313e-06, "loss": 0.3603, "step": 2997 }, { "epoch": 2.5235690235690234, "grad_norm": 0.43599870800971985, "learning_rate": 8.07176090725206e-06, "loss": 0.3699, "step": 2998 }, { "epoch": 2.524410774410774, "grad_norm": 0.4495590329170227, "learning_rate": 8.070104559072787e-06, "loss": 0.3652, "step": 2999 }, { "epoch": 2.525252525252525, "grad_norm": 0.44547098875045776, "learning_rate": 8.068447669908357e-06, "loss": 0.3417, "step": 3000 }, { "epoch": 2.526094276094276, "grad_norm": 0.49054384231567383, "learning_rate": 8.066790240050734e-06, "loss": 0.3712, "step": 3001 }, { "epoch": 2.526936026936027, "grad_norm": 0.44132229685783386, "learning_rate": 8.065132269791975e-06, "loss": 0.3873, "step": 3002 }, { "epoch": 2.5277777777777777, "grad_norm": 0.5833751559257507, "learning_rate": 8.063473759424232e-06, "loss": 0.3629, "step": 3003 }, { "epoch": 2.5286195286195285, "grad_norm": 0.45209798216819763, "learning_rate": 8.061814709239753e-06, "loss": 0.3746, "step": 3004 }, { "epoch": 2.5294612794612794, "grad_norm": 0.438972145318985, "learning_rate": 8.060155119530875e-06, "loss": 0.3665, "step": 3005 }, { "epoch": 2.5303030303030303, "grad_norm": 0.4507683515548706, "learning_rate": 8.058494990590042e-06, "loss": 0.3582, "step": 3006 }, { "epoch": 2.531144781144781, "grad_norm": 0.44599848985671997, "learning_rate": 8.056834322709783e-06, "loss": 0.3651, "step": 3007 }, { "epoch": 2.531986531986532, "grad_norm": 0.522534191608429, "learning_rate": 8.055173116182727e-06, "loss": 0.3794, "step": 3008 }, { "epoch": 2.532828282828283, "grad_norm": 0.5059759020805359, "learning_rate": 8.053511371301594e-06, "loss": 0.3759, "step": 3009 }, { "epoch": 2.5336700336700337, "grad_norm": 0.4781307280063629, "learning_rate": 8.051849088359203e-06, "loss": 0.378, "step": 3010 }, { "epoch": 2.5345117845117846, "grad_norm": 0.5275559425354004, "learning_rate": 8.050186267648465e-06, "loss": 0.3744, "step": 3011 }, { "epoch": 2.5353535353535355, "grad_norm": 0.4405045807361603, "learning_rate": 8.048522909462389e-06, "loss": 0.3752, "step": 3012 }, { "epoch": 2.5361952861952863, "grad_norm": 0.47412705421447754, "learning_rate": 8.046859014094075e-06, "loss": 0.3738, "step": 3013 }, { "epoch": 2.537037037037037, "grad_norm": 0.4371356666088104, "learning_rate": 8.045194581836717e-06, "loss": 0.3896, "step": 3014 }, { "epoch": 2.537878787878788, "grad_norm": 0.484274297952652, "learning_rate": 8.043529612983608e-06, "loss": 0.3916, "step": 3015 }, { "epoch": 2.538720538720539, "grad_norm": 0.4273083508014679, "learning_rate": 8.041864107828134e-06, "loss": 0.3821, "step": 3016 }, { "epoch": 2.53956228956229, "grad_norm": 0.4825156331062317, "learning_rate": 8.04019806666377e-06, "loss": 0.3782, "step": 3017 }, { "epoch": 2.5404040404040407, "grad_norm": 0.4761039912700653, "learning_rate": 8.038531489784099e-06, "loss": 0.3766, "step": 3018 }, { "epoch": 2.541245791245791, "grad_norm": 0.46649160981178284, "learning_rate": 8.036864377482783e-06, "loss": 0.3694, "step": 3019 }, { "epoch": 2.542087542087542, "grad_norm": 0.4690137505531311, "learning_rate": 8.035196730053585e-06, "loss": 0.3674, "step": 3020 }, { "epoch": 2.542929292929293, "grad_norm": 0.4298070967197418, "learning_rate": 8.033528547790363e-06, "loss": 0.3804, "step": 3021 }, { "epoch": 2.5437710437710437, "grad_norm": 0.4663013517856598, "learning_rate": 8.031859830987072e-06, "loss": 0.3773, "step": 3022 }, { "epoch": 2.5446127946127945, "grad_norm": 0.4541909992694855, "learning_rate": 8.030190579937752e-06, "loss": 0.3767, "step": 3023 }, { "epoch": 2.5454545454545454, "grad_norm": 0.4074149429798126, "learning_rate": 8.028520794936548e-06, "loss": 0.3602, "step": 3024 }, { "epoch": 2.5462962962962963, "grad_norm": 0.42201125621795654, "learning_rate": 8.02685047627769e-06, "loss": 0.364, "step": 3025 }, { "epoch": 2.547138047138047, "grad_norm": 0.4039117097854614, "learning_rate": 8.025179624255508e-06, "loss": 0.3614, "step": 3026 }, { "epoch": 2.547979797979798, "grad_norm": 0.45107975602149963, "learning_rate": 8.023508239164422e-06, "loss": 0.3507, "step": 3027 }, { "epoch": 2.548821548821549, "grad_norm": 0.41030436754226685, "learning_rate": 8.021836321298952e-06, "loss": 0.3821, "step": 3028 }, { "epoch": 2.5496632996632997, "grad_norm": 0.40504971146583557, "learning_rate": 8.020163870953704e-06, "loss": 0.3854, "step": 3029 }, { "epoch": 2.5505050505050506, "grad_norm": 0.4412320852279663, "learning_rate": 8.018490888423382e-06, "loss": 0.3481, "step": 3030 }, { "epoch": 2.5513468013468015, "grad_norm": 0.461561918258667, "learning_rate": 8.016817374002784e-06, "loss": 0.3585, "step": 3031 }, { "epoch": 2.5521885521885523, "grad_norm": 0.4572669267654419, "learning_rate": 8.015143327986803e-06, "loss": 0.3493, "step": 3032 }, { "epoch": 2.5530303030303028, "grad_norm": 0.5219828486442566, "learning_rate": 8.01346875067042e-06, "loss": 0.3692, "step": 3033 }, { "epoch": 2.5538720538720536, "grad_norm": 0.4372483193874359, "learning_rate": 8.011793642348716e-06, "loss": 0.3586, "step": 3034 }, { "epoch": 2.5547138047138045, "grad_norm": 0.4658834934234619, "learning_rate": 8.010118003316861e-06, "loss": 0.389, "step": 3035 }, { "epoch": 2.5555555555555554, "grad_norm": 0.42695048451423645, "learning_rate": 8.008441833870125e-06, "loss": 0.3785, "step": 3036 }, { "epoch": 2.5563973063973062, "grad_norm": 0.4878685474395752, "learning_rate": 8.00676513430386e-06, "loss": 0.3742, "step": 3037 }, { "epoch": 2.557239057239057, "grad_norm": 0.42629361152648926, "learning_rate": 8.005087904913524e-06, "loss": 0.4025, "step": 3038 }, { "epoch": 2.558080808080808, "grad_norm": 0.4454309940338135, "learning_rate": 8.003410145994659e-06, "loss": 0.3965, "step": 3039 }, { "epoch": 2.558922558922559, "grad_norm": 0.479331374168396, "learning_rate": 8.001731857842907e-06, "loss": 0.3596, "step": 3040 }, { "epoch": 2.5597643097643097, "grad_norm": 0.42721933126449585, "learning_rate": 8.000053040753999e-06, "loss": 0.3678, "step": 3041 }, { "epoch": 2.5606060606060606, "grad_norm": 0.4365529417991638, "learning_rate": 7.99837369502376e-06, "loss": 0.3815, "step": 3042 }, { "epoch": 2.5614478114478114, "grad_norm": 0.45413750410079956, "learning_rate": 7.996693820948109e-06, "loss": 0.3807, "step": 3043 }, { "epoch": 2.5622895622895623, "grad_norm": 0.41567185521125793, "learning_rate": 7.995013418823058e-06, "loss": 0.3907, "step": 3044 }, { "epoch": 2.563131313131313, "grad_norm": 0.40571504831314087, "learning_rate": 7.993332488944711e-06, "loss": 0.3731, "step": 3045 }, { "epoch": 2.563973063973064, "grad_norm": 0.43381038308143616, "learning_rate": 7.991651031609267e-06, "loss": 0.369, "step": 3046 }, { "epoch": 2.564814814814815, "grad_norm": 0.42811718583106995, "learning_rate": 7.989969047113016e-06, "loss": 0.365, "step": 3047 }, { "epoch": 2.5656565656565657, "grad_norm": 0.4769359529018402, "learning_rate": 7.988286535752342e-06, "loss": 0.373, "step": 3048 }, { "epoch": 2.5664983164983166, "grad_norm": 0.4434069097042084, "learning_rate": 7.986603497823717e-06, "loss": 0.3758, "step": 3049 }, { "epoch": 2.5673400673400675, "grad_norm": 0.4472016394138336, "learning_rate": 7.984919933623718e-06, "loss": 0.3511, "step": 3050 }, { "epoch": 2.5681818181818183, "grad_norm": 0.5318796038627625, "learning_rate": 7.983235843449003e-06, "loss": 0.3627, "step": 3051 }, { "epoch": 2.569023569023569, "grad_norm": 0.422174334526062, "learning_rate": 7.981551227596327e-06, "loss": 0.3824, "step": 3052 }, { "epoch": 2.56986531986532, "grad_norm": 0.4751637578010559, "learning_rate": 7.979866086362536e-06, "loss": 0.3611, "step": 3053 }, { "epoch": 2.570707070707071, "grad_norm": 0.4373795986175537, "learning_rate": 7.978180420044572e-06, "loss": 0.3565, "step": 3054 }, { "epoch": 2.571548821548822, "grad_norm": 0.47710105776786804, "learning_rate": 7.976494228939466e-06, "loss": 0.3505, "step": 3055 }, { "epoch": 2.5723905723905722, "grad_norm": 0.47120025753974915, "learning_rate": 7.974807513344343e-06, "loss": 0.3825, "step": 3056 }, { "epoch": 2.573232323232323, "grad_norm": 0.4127935767173767, "learning_rate": 7.97312027355642e-06, "loss": 0.3675, "step": 3057 }, { "epoch": 2.574074074074074, "grad_norm": 0.44475916028022766, "learning_rate": 7.971432509873008e-06, "loss": 0.3574, "step": 3058 }, { "epoch": 2.574915824915825, "grad_norm": 0.4957478642463684, "learning_rate": 7.969744222591507e-06, "loss": 0.391, "step": 3059 }, { "epoch": 2.5757575757575757, "grad_norm": 0.4252842664718628, "learning_rate": 7.968055412009414e-06, "loss": 0.3442, "step": 3060 }, { "epoch": 2.5765993265993266, "grad_norm": 0.49052271246910095, "learning_rate": 7.966366078424313e-06, "loss": 0.363, "step": 3061 }, { "epoch": 2.5774410774410774, "grad_norm": 0.4323636293411255, "learning_rate": 7.964676222133882e-06, "loss": 0.3514, "step": 3062 }, { "epoch": 2.5782828282828283, "grad_norm": 0.40465039014816284, "learning_rate": 7.962985843435894e-06, "loss": 0.3638, "step": 3063 }, { "epoch": 2.579124579124579, "grad_norm": 0.426972895860672, "learning_rate": 7.961294942628211e-06, "loss": 0.4045, "step": 3064 }, { "epoch": 2.57996632996633, "grad_norm": 0.44296711683273315, "learning_rate": 7.959603520008785e-06, "loss": 0.3885, "step": 3065 }, { "epoch": 2.580808080808081, "grad_norm": 0.4312283396720886, "learning_rate": 7.95791157587567e-06, "loss": 0.397, "step": 3066 }, { "epoch": 2.5816498316498318, "grad_norm": 0.41058874130249023, "learning_rate": 7.956219110526995e-06, "loss": 0.3501, "step": 3067 }, { "epoch": 2.5824915824915826, "grad_norm": 0.4189448654651642, "learning_rate": 7.954526124260998e-06, "loss": 0.3689, "step": 3068 }, { "epoch": 2.5833333333333335, "grad_norm": 0.43404722213745117, "learning_rate": 7.952832617375998e-06, "loss": 0.3367, "step": 3069 }, { "epoch": 2.584175084175084, "grad_norm": 0.4569702744483948, "learning_rate": 7.951138590170409e-06, "loss": 0.3695, "step": 3070 }, { "epoch": 2.5850168350168348, "grad_norm": 0.40933284163475037, "learning_rate": 7.949444042942738e-06, "loss": 0.3603, "step": 3071 }, { "epoch": 2.5858585858585856, "grad_norm": 0.3948673605918884, "learning_rate": 7.94774897599158e-06, "loss": 0.3799, "step": 3072 }, { "epoch": 2.5867003367003365, "grad_norm": 0.43029460310935974, "learning_rate": 7.946053389615626e-06, "loss": 0.3439, "step": 3073 }, { "epoch": 2.5875420875420874, "grad_norm": 0.4754485487937927, "learning_rate": 7.944357284113657e-06, "loss": 0.3718, "step": 3074 }, { "epoch": 2.5883838383838382, "grad_norm": 0.3708105981349945, "learning_rate": 7.94266065978454e-06, "loss": 0.3612, "step": 3075 }, { "epoch": 2.589225589225589, "grad_norm": 0.4760894179344177, "learning_rate": 7.940963516927246e-06, "loss": 0.3495, "step": 3076 }, { "epoch": 2.59006734006734, "grad_norm": 0.42566046118736267, "learning_rate": 7.939265855840822e-06, "loss": 0.3854, "step": 3077 }, { "epoch": 2.590909090909091, "grad_norm": 0.417095810174942, "learning_rate": 7.93756767682442e-06, "loss": 0.361, "step": 3078 }, { "epoch": 2.5917508417508417, "grad_norm": 0.38692402839660645, "learning_rate": 7.935868980177274e-06, "loss": 0.366, "step": 3079 }, { "epoch": 2.5925925925925926, "grad_norm": 0.40878576040267944, "learning_rate": 7.934169766198712e-06, "loss": 0.3659, "step": 3080 }, { "epoch": 2.5934343434343434, "grad_norm": 0.4753744900226593, "learning_rate": 7.932470035188156e-06, "loss": 0.3758, "step": 3081 }, { "epoch": 2.5942760942760943, "grad_norm": 0.4133988320827484, "learning_rate": 7.930769787445113e-06, "loss": 0.3711, "step": 3082 }, { "epoch": 2.595117845117845, "grad_norm": 0.4134349822998047, "learning_rate": 7.92906902326919e-06, "loss": 0.3616, "step": 3083 }, { "epoch": 2.595959595959596, "grad_norm": 0.40976256132125854, "learning_rate": 7.927367742960077e-06, "loss": 0.3485, "step": 3084 }, { "epoch": 2.596801346801347, "grad_norm": 0.40525999665260315, "learning_rate": 7.925665946817557e-06, "loss": 0.3547, "step": 3085 }, { "epoch": 2.5976430976430978, "grad_norm": 0.46642136573791504, "learning_rate": 7.923963635141506e-06, "loss": 0.3821, "step": 3086 }, { "epoch": 2.5984848484848486, "grad_norm": 0.42089173197746277, "learning_rate": 7.922260808231887e-06, "loss": 0.3691, "step": 3087 }, { "epoch": 2.5993265993265995, "grad_norm": 0.43426400423049927, "learning_rate": 7.92055746638876e-06, "loss": 0.3655, "step": 3088 }, { "epoch": 2.6001683501683504, "grad_norm": 0.3852194547653198, "learning_rate": 7.91885360991227e-06, "loss": 0.3633, "step": 3089 }, { "epoch": 2.601010101010101, "grad_norm": 0.4119604527950287, "learning_rate": 7.917149239102656e-06, "loss": 0.3595, "step": 3090 }, { "epoch": 2.601851851851852, "grad_norm": 0.38589906692504883, "learning_rate": 7.915444354260242e-06, "loss": 0.3703, "step": 3091 }, { "epoch": 2.602693602693603, "grad_norm": 0.3880903124809265, "learning_rate": 7.91373895568545e-06, "loss": 0.3699, "step": 3092 }, { "epoch": 2.6035353535353534, "grad_norm": 0.37981754541397095, "learning_rate": 7.912033043678792e-06, "loss": 0.3584, "step": 3093 }, { "epoch": 2.6043771043771042, "grad_norm": 0.46131882071495056, "learning_rate": 7.910326618540864e-06, "loss": 0.3921, "step": 3094 }, { "epoch": 2.605218855218855, "grad_norm": 0.4081268012523651, "learning_rate": 7.908619680572358e-06, "loss": 0.3394, "step": 3095 }, { "epoch": 2.606060606060606, "grad_norm": 0.4726884961128235, "learning_rate": 7.906912230074055e-06, "loss": 0.3764, "step": 3096 }, { "epoch": 2.606902356902357, "grad_norm": 0.40560364723205566, "learning_rate": 7.905204267346825e-06, "loss": 0.3862, "step": 3097 }, { "epoch": 2.6077441077441077, "grad_norm": 0.415130078792572, "learning_rate": 7.90349579269163e-06, "loss": 0.3657, "step": 3098 }, { "epoch": 2.6085858585858586, "grad_norm": 0.42588821053504944, "learning_rate": 7.901786806409518e-06, "loss": 0.3751, "step": 3099 }, { "epoch": 2.6094276094276094, "grad_norm": 0.4331286549568176, "learning_rate": 7.900077308801634e-06, "loss": 0.381, "step": 3100 }, { "epoch": 2.6102693602693603, "grad_norm": 0.4527905285358429, "learning_rate": 7.898367300169212e-06, "loss": 0.3722, "step": 3101 }, { "epoch": 2.611111111111111, "grad_norm": 0.3775057792663574, "learning_rate": 7.896656780813568e-06, "loss": 0.38, "step": 3102 }, { "epoch": 2.611952861952862, "grad_norm": 0.46935132145881653, "learning_rate": 7.89494575103612e-06, "loss": 0.3642, "step": 3103 }, { "epoch": 2.612794612794613, "grad_norm": 0.42738476395606995, "learning_rate": 7.893234211138363e-06, "loss": 0.3547, "step": 3104 }, { "epoch": 2.6136363636363638, "grad_norm": 0.3904551863670349, "learning_rate": 7.891522161421894e-06, "loss": 0.3469, "step": 3105 }, { "epoch": 2.6144781144781146, "grad_norm": 0.42792946100234985, "learning_rate": 7.889809602188394e-06, "loss": 0.3682, "step": 3106 }, { "epoch": 2.615319865319865, "grad_norm": 0.4605967700481415, "learning_rate": 7.888096533739628e-06, "loss": 0.3757, "step": 3107 }, { "epoch": 2.616161616161616, "grad_norm": 0.40349388122558594, "learning_rate": 7.886382956377466e-06, "loss": 0.3878, "step": 3108 }, { "epoch": 2.6170033670033668, "grad_norm": 0.5088155269622803, "learning_rate": 7.884668870403853e-06, "loss": 0.3928, "step": 3109 }, { "epoch": 2.6178451178451176, "grad_norm": 0.4551922678947449, "learning_rate": 7.88295427612083e-06, "loss": 0.3658, "step": 3110 }, { "epoch": 2.6186868686868685, "grad_norm": 0.442027747631073, "learning_rate": 7.881239173830526e-06, "loss": 0.3673, "step": 3111 }, { "epoch": 2.6195286195286194, "grad_norm": 0.4612203538417816, "learning_rate": 7.879523563835163e-06, "loss": 0.359, "step": 3112 }, { "epoch": 2.6203703703703702, "grad_norm": 0.4896228313446045, "learning_rate": 7.877807446437046e-06, "loss": 0.3355, "step": 3113 }, { "epoch": 2.621212121212121, "grad_norm": 0.41985684633255005, "learning_rate": 7.876090821938578e-06, "loss": 0.3585, "step": 3114 }, { "epoch": 2.622053872053872, "grad_norm": 0.4592984616756439, "learning_rate": 7.874373690642244e-06, "loss": 0.3824, "step": 3115 }, { "epoch": 2.622895622895623, "grad_norm": 0.4769318699836731, "learning_rate": 7.872656052850619e-06, "loss": 0.3794, "step": 3116 }, { "epoch": 2.6237373737373737, "grad_norm": 0.42093324661254883, "learning_rate": 7.870937908866372e-06, "loss": 0.3771, "step": 3117 }, { "epoch": 2.6245791245791246, "grad_norm": 0.461921364068985, "learning_rate": 7.869219258992256e-06, "loss": 0.3866, "step": 3118 }, { "epoch": 2.6254208754208754, "grad_norm": 0.41307321190834045, "learning_rate": 7.867500103531118e-06, "loss": 0.3805, "step": 3119 }, { "epoch": 2.6262626262626263, "grad_norm": 0.47126299142837524, "learning_rate": 7.865780442785891e-06, "loss": 0.3788, "step": 3120 }, { "epoch": 2.627104377104377, "grad_norm": 0.4488561153411865, "learning_rate": 7.864060277059593e-06, "loss": 0.3748, "step": 3121 }, { "epoch": 2.627946127946128, "grad_norm": 0.4195275604724884, "learning_rate": 7.862339606655342e-06, "loss": 0.3698, "step": 3122 }, { "epoch": 2.628787878787879, "grad_norm": 0.4531475007534027, "learning_rate": 7.860618431876336e-06, "loss": 0.3763, "step": 3123 }, { "epoch": 2.6296296296296298, "grad_norm": 0.4500516951084137, "learning_rate": 7.858896753025862e-06, "loss": 0.3488, "step": 3124 }, { "epoch": 2.6304713804713806, "grad_norm": 0.4333934783935547, "learning_rate": 7.857174570407303e-06, "loss": 0.3687, "step": 3125 }, { "epoch": 2.6313131313131315, "grad_norm": 0.4207960367202759, "learning_rate": 7.85545188432412e-06, "loss": 0.3649, "step": 3126 }, { "epoch": 2.6321548821548824, "grad_norm": 0.4485623240470886, "learning_rate": 7.853728695079874e-06, "loss": 0.3776, "step": 3127 }, { "epoch": 2.6329966329966332, "grad_norm": 0.4801420271396637, "learning_rate": 7.852005002978208e-06, "loss": 0.3958, "step": 3128 }, { "epoch": 2.633838383838384, "grad_norm": 0.43177154660224915, "learning_rate": 7.850280808322852e-06, "loss": 0.3606, "step": 3129 }, { "epoch": 2.634680134680135, "grad_norm": 0.4587952792644501, "learning_rate": 7.84855611141763e-06, "loss": 0.3661, "step": 3130 }, { "epoch": 2.6355218855218854, "grad_norm": 0.4908248484134674, "learning_rate": 7.846830912566453e-06, "loss": 0.3669, "step": 3131 }, { "epoch": 2.6363636363636362, "grad_norm": 0.4196404814720154, "learning_rate": 7.845105212073317e-06, "loss": 0.3609, "step": 3132 }, { "epoch": 2.637205387205387, "grad_norm": 0.44231706857681274, "learning_rate": 7.84337901024231e-06, "loss": 0.3642, "step": 3133 }, { "epoch": 2.638047138047138, "grad_norm": 0.47726723551750183, "learning_rate": 7.841652307377604e-06, "loss": 0.3854, "step": 3134 }, { "epoch": 2.638888888888889, "grad_norm": 0.40780794620513916, "learning_rate": 7.839925103783468e-06, "loss": 0.3777, "step": 3135 }, { "epoch": 2.6397306397306397, "grad_norm": 0.47693267464637756, "learning_rate": 7.838197399764251e-06, "loss": 0.3586, "step": 3136 }, { "epoch": 2.6405723905723906, "grad_norm": 0.44193235039711, "learning_rate": 7.836469195624392e-06, "loss": 0.3732, "step": 3137 }, { "epoch": 2.6414141414141414, "grad_norm": 0.4108421206474304, "learning_rate": 7.834740491668422e-06, "loss": 0.3626, "step": 3138 }, { "epoch": 2.6422558922558923, "grad_norm": 0.45441997051239014, "learning_rate": 7.833011288200952e-06, "loss": 0.387, "step": 3139 }, { "epoch": 2.643097643097643, "grad_norm": 0.4902670085430145, "learning_rate": 7.83128158552669e-06, "loss": 0.3472, "step": 3140 }, { "epoch": 2.643939393939394, "grad_norm": 0.40926283597946167, "learning_rate": 7.82955138395043e-06, "loss": 0.3793, "step": 3141 }, { "epoch": 2.644781144781145, "grad_norm": 0.4472942352294922, "learning_rate": 7.827820683777045e-06, "loss": 0.3795, "step": 3142 }, { "epoch": 2.6456228956228958, "grad_norm": 0.5099055171012878, "learning_rate": 7.82608948531151e-06, "loss": 0.3869, "step": 3143 }, { "epoch": 2.6464646464646466, "grad_norm": 0.40656009316444397, "learning_rate": 7.824357788858877e-06, "loss": 0.3478, "step": 3144 }, { "epoch": 2.647306397306397, "grad_norm": 0.4444645643234253, "learning_rate": 7.822625594724289e-06, "loss": 0.39, "step": 3145 }, { "epoch": 2.648148148148148, "grad_norm": 0.41540440917015076, "learning_rate": 7.820892903212978e-06, "loss": 0.3526, "step": 3146 }, { "epoch": 2.648989898989899, "grad_norm": 0.44313865900039673, "learning_rate": 7.819159714630263e-06, "loss": 0.3803, "step": 3147 }, { "epoch": 2.6498316498316496, "grad_norm": 0.37512776255607605, "learning_rate": 7.817426029281551e-06, "loss": 0.3628, "step": 3148 }, { "epoch": 2.6506734006734005, "grad_norm": 0.42030951380729675, "learning_rate": 7.815691847472335e-06, "loss": 0.3785, "step": 3149 }, { "epoch": 2.6515151515151514, "grad_norm": 0.39490917325019836, "learning_rate": 7.813957169508194e-06, "loss": 0.3678, "step": 3150 }, { "epoch": 2.6523569023569022, "grad_norm": 0.4172931909561157, "learning_rate": 7.8122219956948e-06, "loss": 0.3723, "step": 3151 }, { "epoch": 2.653198653198653, "grad_norm": 0.4046728312969208, "learning_rate": 7.810486326337909e-06, "loss": 0.3872, "step": 3152 }, { "epoch": 2.654040404040404, "grad_norm": 0.41325506567955017, "learning_rate": 7.808750161743363e-06, "loss": 0.3961, "step": 3153 }, { "epoch": 2.654882154882155, "grad_norm": 0.3934098780155182, "learning_rate": 7.807013502217094e-06, "loss": 0.3857, "step": 3154 }, { "epoch": 2.6557239057239057, "grad_norm": 0.4003064036369324, "learning_rate": 7.805276348065115e-06, "loss": 0.3795, "step": 3155 }, { "epoch": 2.6565656565656566, "grad_norm": 0.3913799524307251, "learning_rate": 7.80353869959354e-06, "loss": 0.3604, "step": 3156 }, { "epoch": 2.6574074074074074, "grad_norm": 0.4120435118675232, "learning_rate": 7.801800557108556e-06, "loss": 0.3533, "step": 3157 }, { "epoch": 2.6582491582491583, "grad_norm": 0.4854486286640167, "learning_rate": 7.80006192091644e-06, "loss": 0.3762, "step": 3158 }, { "epoch": 2.659090909090909, "grad_norm": 0.4436338245868683, "learning_rate": 7.798322791323563e-06, "loss": 0.3784, "step": 3159 }, { "epoch": 2.65993265993266, "grad_norm": 0.47959885001182556, "learning_rate": 7.796583168636375e-06, "loss": 0.3596, "step": 3160 }, { "epoch": 2.660774410774411, "grad_norm": 0.46081531047821045, "learning_rate": 7.794843053161419e-06, "loss": 0.3704, "step": 3161 }, { "epoch": 2.6616161616161618, "grad_norm": 0.4334448277950287, "learning_rate": 7.793102445205319e-06, "loss": 0.3624, "step": 3162 }, { "epoch": 2.6624579124579126, "grad_norm": 0.5272902250289917, "learning_rate": 7.791361345074787e-06, "loss": 0.3882, "step": 3163 }, { "epoch": 2.6632996632996635, "grad_norm": 0.4704950451850891, "learning_rate": 7.78961975307663e-06, "loss": 0.3664, "step": 3164 }, { "epoch": 2.6641414141414144, "grad_norm": 0.46277743577957153, "learning_rate": 7.78787766951773e-06, "loss": 0.3602, "step": 3165 }, { "epoch": 2.6649831649831652, "grad_norm": 0.4722355902194977, "learning_rate": 7.78613509470506e-06, "loss": 0.363, "step": 3166 }, { "epoch": 2.665824915824916, "grad_norm": 0.5007103085517883, "learning_rate": 7.784392028945685e-06, "loss": 0.381, "step": 3167 }, { "epoch": 2.6666666666666665, "grad_norm": 0.501643717288971, "learning_rate": 7.782648472546747e-06, "loss": 0.3764, "step": 3168 }, { "epoch": 2.6675084175084174, "grad_norm": 0.4247448444366455, "learning_rate": 7.780904425815482e-06, "loss": 0.3825, "step": 3169 }, { "epoch": 2.6683501683501682, "grad_norm": 0.49460309743881226, "learning_rate": 7.77915988905921e-06, "loss": 0.3911, "step": 3170 }, { "epoch": 2.669191919191919, "grad_norm": 0.4482337236404419, "learning_rate": 7.777414862585332e-06, "loss": 0.3565, "step": 3171 }, { "epoch": 2.67003367003367, "grad_norm": 0.47112250328063965, "learning_rate": 7.775669346701347e-06, "loss": 0.3609, "step": 3172 }, { "epoch": 2.670875420875421, "grad_norm": 0.47779783606529236, "learning_rate": 7.773923341714829e-06, "loss": 0.3716, "step": 3173 }, { "epoch": 2.6717171717171717, "grad_norm": 0.44504258036613464, "learning_rate": 7.772176847933446e-06, "loss": 0.3763, "step": 3174 }, { "epoch": 2.6725589225589226, "grad_norm": 0.4619770348072052, "learning_rate": 7.770429865664945e-06, "loss": 0.3721, "step": 3175 }, { "epoch": 2.6734006734006734, "grad_norm": 0.4363296329975128, "learning_rate": 7.768682395217167e-06, "loss": 0.3755, "step": 3176 }, { "epoch": 2.6742424242424243, "grad_norm": 0.46462228894233704, "learning_rate": 7.766934436898032e-06, "loss": 0.3831, "step": 3177 }, { "epoch": 2.675084175084175, "grad_norm": 0.44191932678222656, "learning_rate": 7.765185991015548e-06, "loss": 0.3739, "step": 3178 }, { "epoch": 2.675925925925926, "grad_norm": 0.5333150625228882, "learning_rate": 7.763437057877814e-06, "loss": 0.3681, "step": 3179 }, { "epoch": 2.676767676767677, "grad_norm": 0.4276178181171417, "learning_rate": 7.761687637793007e-06, "loss": 0.3463, "step": 3180 }, { "epoch": 2.6776094276094278, "grad_norm": 0.4979904592037201, "learning_rate": 7.759937731069396e-06, "loss": 0.353, "step": 3181 }, { "epoch": 2.678451178451178, "grad_norm": 0.4581634998321533, "learning_rate": 7.75818733801533e-06, "loss": 0.3692, "step": 3182 }, { "epoch": 2.679292929292929, "grad_norm": 0.46685266494750977, "learning_rate": 7.756436458939252e-06, "loss": 0.3857, "step": 3183 }, { "epoch": 2.68013468013468, "grad_norm": 0.4538704752922058, "learning_rate": 7.75468509414968e-06, "loss": 0.3659, "step": 3184 }, { "epoch": 2.680976430976431, "grad_norm": 0.4190831482410431, "learning_rate": 7.752933243955225e-06, "loss": 0.3517, "step": 3185 }, { "epoch": 2.6818181818181817, "grad_norm": 0.4862916171550751, "learning_rate": 7.751180908664584e-06, "loss": 0.3701, "step": 3186 }, { "epoch": 2.6826599326599325, "grad_norm": 0.4631706178188324, "learning_rate": 7.749428088586535e-06, "loss": 0.361, "step": 3187 }, { "epoch": 2.6835016835016834, "grad_norm": 0.4017314910888672, "learning_rate": 7.747674784029943e-06, "loss": 0.3679, "step": 3188 }, { "epoch": 2.6843434343434343, "grad_norm": 0.40699052810668945, "learning_rate": 7.74592099530376e-06, "loss": 0.3732, "step": 3189 }, { "epoch": 2.685185185185185, "grad_norm": 0.4523361027240753, "learning_rate": 7.744166722717023e-06, "loss": 0.3648, "step": 3190 }, { "epoch": 2.686026936026936, "grad_norm": 0.42537274956703186, "learning_rate": 7.742411966578853e-06, "loss": 0.3563, "step": 3191 }, { "epoch": 2.686868686868687, "grad_norm": 0.3977847099304199, "learning_rate": 7.740656727198456e-06, "loss": 0.37, "step": 3192 }, { "epoch": 2.6877104377104377, "grad_norm": 0.4617396891117096, "learning_rate": 7.738901004885123e-06, "loss": 0.363, "step": 3193 }, { "epoch": 2.6885521885521886, "grad_norm": 0.43154099583625793, "learning_rate": 7.737144799948233e-06, "loss": 0.3547, "step": 3194 }, { "epoch": 2.6893939393939394, "grad_norm": 0.37243881821632385, "learning_rate": 7.735388112697248e-06, "loss": 0.3666, "step": 3195 }, { "epoch": 2.6902356902356903, "grad_norm": 0.4674469828605652, "learning_rate": 7.733630943441716e-06, "loss": 0.364, "step": 3196 }, { "epoch": 2.691077441077441, "grad_norm": 0.40426865220069885, "learning_rate": 7.731873292491265e-06, "loss": 0.3753, "step": 3197 }, { "epoch": 2.691919191919192, "grad_norm": 0.4116882383823395, "learning_rate": 7.730115160155616e-06, "loss": 0.3598, "step": 3198 }, { "epoch": 2.692760942760943, "grad_norm": 0.4453803598880768, "learning_rate": 7.728356546744567e-06, "loss": 0.3964, "step": 3199 }, { "epoch": 2.6936026936026938, "grad_norm": 0.43885505199432373, "learning_rate": 7.726597452568008e-06, "loss": 0.3423, "step": 3200 }, { "epoch": 2.6944444444444446, "grad_norm": 0.4372022747993469, "learning_rate": 7.724837877935906e-06, "loss": 0.38, "step": 3201 }, { "epoch": 2.6952861952861955, "grad_norm": 0.4246046841144562, "learning_rate": 7.723077823158322e-06, "loss": 0.3499, "step": 3202 }, { "epoch": 2.6961279461279464, "grad_norm": 0.43846026062965393, "learning_rate": 7.721317288545392e-06, "loss": 0.3941, "step": 3203 }, { "epoch": 2.6969696969696972, "grad_norm": 0.45697635412216187, "learning_rate": 7.719556274407344e-06, "loss": 0.3505, "step": 3204 }, { "epoch": 2.6978114478114477, "grad_norm": 0.4482317864894867, "learning_rate": 7.717794781054486e-06, "loss": 0.355, "step": 3205 }, { "epoch": 2.6986531986531985, "grad_norm": 0.41324710845947266, "learning_rate": 7.71603280879721e-06, "loss": 0.377, "step": 3206 }, { "epoch": 2.6994949494949494, "grad_norm": 0.45354902744293213, "learning_rate": 7.714270357946e-06, "loss": 0.3991, "step": 3207 }, { "epoch": 2.7003367003367003, "grad_norm": 0.3961485028266907, "learning_rate": 7.71250742881141e-06, "loss": 0.3463, "step": 3208 }, { "epoch": 2.701178451178451, "grad_norm": 0.4096146523952484, "learning_rate": 7.710744021704095e-06, "loss": 0.3595, "step": 3209 }, { "epoch": 2.702020202020202, "grad_norm": 0.49172767996788025, "learning_rate": 7.708980136934782e-06, "loss": 0.3811, "step": 3210 }, { "epoch": 2.702861952861953, "grad_norm": 0.47188594937324524, "learning_rate": 7.707215774814289e-06, "loss": 0.3709, "step": 3211 }, { "epoch": 2.7037037037037037, "grad_norm": 0.40727150440216064, "learning_rate": 7.705450935653511e-06, "loss": 0.3641, "step": 3212 }, { "epoch": 2.7045454545454546, "grad_norm": 0.43043461441993713, "learning_rate": 7.703685619763438e-06, "loss": 0.3556, "step": 3213 }, { "epoch": 2.7053872053872055, "grad_norm": 0.41868236660957336, "learning_rate": 7.70191982745513e-06, "loss": 0.3572, "step": 3214 }, { "epoch": 2.7062289562289563, "grad_norm": 0.4623730778694153, "learning_rate": 7.700153559039744e-06, "loss": 0.3697, "step": 3215 }, { "epoch": 2.707070707070707, "grad_norm": 0.4535217881202698, "learning_rate": 7.698386814828513e-06, "loss": 0.359, "step": 3216 }, { "epoch": 2.707912457912458, "grad_norm": 0.43468141555786133, "learning_rate": 7.696619595132756e-06, "loss": 0.3657, "step": 3217 }, { "epoch": 2.708754208754209, "grad_norm": 0.4858476221561432, "learning_rate": 7.694851900263878e-06, "loss": 0.3916, "step": 3218 }, { "epoch": 2.7095959595959593, "grad_norm": 0.3859041929244995, "learning_rate": 7.693083730533362e-06, "loss": 0.3541, "step": 3219 }, { "epoch": 2.71043771043771, "grad_norm": 0.4580496847629547, "learning_rate": 7.691315086252782e-06, "loss": 0.384, "step": 3220 }, { "epoch": 2.711279461279461, "grad_norm": 0.48037075996398926, "learning_rate": 7.68954596773379e-06, "loss": 0.3655, "step": 3221 }, { "epoch": 2.712121212121212, "grad_norm": 0.4949318766593933, "learning_rate": 7.687776375288124e-06, "loss": 0.3919, "step": 3222 }, { "epoch": 2.712962962962963, "grad_norm": 0.4242825508117676, "learning_rate": 7.686006309227604e-06, "loss": 0.3692, "step": 3223 }, { "epoch": 2.7138047138047137, "grad_norm": 0.49120843410491943, "learning_rate": 7.684235769864138e-06, "loss": 0.3817, "step": 3224 }, { "epoch": 2.7146464646464645, "grad_norm": 0.48487967252731323, "learning_rate": 7.682464757509711e-06, "loss": 0.3665, "step": 3225 }, { "epoch": 2.7154882154882154, "grad_norm": 0.5230485796928406, "learning_rate": 7.680693272476396e-06, "loss": 0.3572, "step": 3226 }, { "epoch": 2.7163299663299663, "grad_norm": 0.43026912212371826, "learning_rate": 7.678921315076344e-06, "loss": 0.3686, "step": 3227 }, { "epoch": 2.717171717171717, "grad_norm": 0.44128865003585815, "learning_rate": 7.677148885621797e-06, "loss": 0.3638, "step": 3228 }, { "epoch": 2.718013468013468, "grad_norm": 0.51166832447052, "learning_rate": 7.675375984425074e-06, "loss": 0.3744, "step": 3229 }, { "epoch": 2.718855218855219, "grad_norm": 0.41336655616760254, "learning_rate": 7.673602611798578e-06, "loss": 0.3772, "step": 3230 }, { "epoch": 2.7196969696969697, "grad_norm": 0.4274938106536865, "learning_rate": 7.671828768054799e-06, "loss": 0.3773, "step": 3231 }, { "epoch": 2.7205387205387206, "grad_norm": 0.44819819927215576, "learning_rate": 7.670054453506304e-06, "loss": 0.3701, "step": 3232 }, { "epoch": 2.7213804713804715, "grad_norm": 0.43579918146133423, "learning_rate": 7.668279668465747e-06, "loss": 0.3761, "step": 3233 }, { "epoch": 2.7222222222222223, "grad_norm": 0.42360469698905945, "learning_rate": 7.666504413245868e-06, "loss": 0.3706, "step": 3234 }, { "epoch": 2.723063973063973, "grad_norm": 0.4203515350818634, "learning_rate": 7.66472868815948e-06, "loss": 0.3549, "step": 3235 }, { "epoch": 2.723905723905724, "grad_norm": 0.417258620262146, "learning_rate": 7.662952493519488e-06, "loss": 0.355, "step": 3236 }, { "epoch": 2.724747474747475, "grad_norm": 0.4322642385959625, "learning_rate": 7.661175829638875e-06, "loss": 0.3476, "step": 3237 }, { "epoch": 2.725589225589226, "grad_norm": 0.4099026024341583, "learning_rate": 7.65939869683071e-06, "loss": 0.3673, "step": 3238 }, { "epoch": 2.7264309764309766, "grad_norm": 0.46699059009552, "learning_rate": 7.657621095408141e-06, "loss": 0.3616, "step": 3239 }, { "epoch": 2.7272727272727275, "grad_norm": 0.4395773410797119, "learning_rate": 7.655843025684402e-06, "loss": 0.3714, "step": 3240 }, { "epoch": 2.7281144781144784, "grad_norm": 0.4009642004966736, "learning_rate": 7.654064487972806e-06, "loss": 0.3729, "step": 3241 }, { "epoch": 2.728956228956229, "grad_norm": 0.5015642046928406, "learning_rate": 7.652285482586752e-06, "loss": 0.3675, "step": 3242 }, { "epoch": 2.7297979797979797, "grad_norm": 0.4065411686897278, "learning_rate": 7.650506009839718e-06, "loss": 0.3675, "step": 3243 }, { "epoch": 2.7306397306397305, "grad_norm": 0.42826032638549805, "learning_rate": 7.648726070045265e-06, "loss": 0.372, "step": 3244 }, { "epoch": 2.7314814814814814, "grad_norm": 0.44297561049461365, "learning_rate": 7.646945663517043e-06, "loss": 0.3697, "step": 3245 }, { "epoch": 2.7323232323232323, "grad_norm": 0.4450099766254425, "learning_rate": 7.645164790568773e-06, "loss": 0.3571, "step": 3246 }, { "epoch": 2.733164983164983, "grad_norm": 0.4014435410499573, "learning_rate": 7.643383451514266e-06, "loss": 0.359, "step": 3247 }, { "epoch": 2.734006734006734, "grad_norm": 0.47617849707603455, "learning_rate": 7.641601646667413e-06, "loss": 0.3771, "step": 3248 }, { "epoch": 2.734848484848485, "grad_norm": 0.38696715235710144, "learning_rate": 7.639819376342186e-06, "loss": 0.3682, "step": 3249 }, { "epoch": 2.7356902356902357, "grad_norm": 0.4213702976703644, "learning_rate": 7.638036640852641e-06, "loss": 0.3553, "step": 3250 }, { "epoch": 2.7365319865319866, "grad_norm": 0.4373057186603546, "learning_rate": 7.636253440512913e-06, "loss": 0.3785, "step": 3251 }, { "epoch": 2.7373737373737375, "grad_norm": 0.44141051173210144, "learning_rate": 7.634469775637226e-06, "loss": 0.366, "step": 3252 }, { "epoch": 2.7382154882154883, "grad_norm": 0.456469863653183, "learning_rate": 7.632685646539874e-06, "loss": 0.3578, "step": 3253 }, { "epoch": 2.739057239057239, "grad_norm": 0.49701955914497375, "learning_rate": 7.630901053535245e-06, "loss": 0.3618, "step": 3254 }, { "epoch": 2.73989898989899, "grad_norm": 0.4528011679649353, "learning_rate": 7.6291159969378e-06, "loss": 0.3691, "step": 3255 }, { "epoch": 2.7407407407407405, "grad_norm": 0.5031781792640686, "learning_rate": 7.627330477062087e-06, "loss": 0.3781, "step": 3256 }, { "epoch": 2.7415824915824913, "grad_norm": 0.5366607308387756, "learning_rate": 7.625544494222732e-06, "loss": 0.3789, "step": 3257 }, { "epoch": 2.742424242424242, "grad_norm": 0.5312321782112122, "learning_rate": 7.623758048734446e-06, "loss": 0.3751, "step": 3258 }, { "epoch": 2.743265993265993, "grad_norm": 0.4181349277496338, "learning_rate": 7.6219711409120164e-06, "loss": 0.3737, "step": 3259 }, { "epoch": 2.744107744107744, "grad_norm": 0.508043110370636, "learning_rate": 7.6201837710703215e-06, "loss": 0.368, "step": 3260 }, { "epoch": 2.744949494949495, "grad_norm": 0.43085652589797974, "learning_rate": 7.61839593952431e-06, "loss": 0.3514, "step": 3261 }, { "epoch": 2.7457912457912457, "grad_norm": 0.442551851272583, "learning_rate": 7.616607646589018e-06, "loss": 0.3412, "step": 3262 }, { "epoch": 2.7466329966329965, "grad_norm": 0.4940014183521271, "learning_rate": 7.614818892579561e-06, "loss": 0.3698, "step": 3263 }, { "epoch": 2.7474747474747474, "grad_norm": 0.4390539526939392, "learning_rate": 7.613029677811139e-06, "loss": 0.3644, "step": 3264 }, { "epoch": 2.7483164983164983, "grad_norm": 0.42621472477912903, "learning_rate": 7.611240002599028e-06, "loss": 0.3909, "step": 3265 }, { "epoch": 2.749158249158249, "grad_norm": 0.5246572494506836, "learning_rate": 7.609449867258589e-06, "loss": 0.3691, "step": 3266 }, { "epoch": 2.75, "grad_norm": 0.4337848126888275, "learning_rate": 7.607659272105263e-06, "loss": 0.3607, "step": 3267 }, { "epoch": 2.750841750841751, "grad_norm": 0.4104578495025635, "learning_rate": 7.605868217454573e-06, "loss": 0.3608, "step": 3268 }, { "epoch": 2.7516835016835017, "grad_norm": 0.49935027956962585, "learning_rate": 7.604076703622121e-06, "loss": 0.3696, "step": 3269 }, { "epoch": 2.7525252525252526, "grad_norm": 0.479187548160553, "learning_rate": 7.602284730923588e-06, "loss": 0.3799, "step": 3270 }, { "epoch": 2.7533670033670035, "grad_norm": 0.3985010087490082, "learning_rate": 7.6004922996747435e-06, "loss": 0.3588, "step": 3271 }, { "epoch": 2.7542087542087543, "grad_norm": 0.45951199531555176, "learning_rate": 7.59869941019143e-06, "loss": 0.3674, "step": 3272 }, { "epoch": 2.755050505050505, "grad_norm": 0.4626936614513397, "learning_rate": 7.5969060627895756e-06, "loss": 0.3503, "step": 3273 }, { "epoch": 2.755892255892256, "grad_norm": 0.4417159855365753, "learning_rate": 7.5951122577851845e-06, "loss": 0.3669, "step": 3274 }, { "epoch": 2.756734006734007, "grad_norm": 0.46139225363731384, "learning_rate": 7.593317995494347e-06, "loss": 0.3755, "step": 3275 }, { "epoch": 2.757575757575758, "grad_norm": 0.49060630798339844, "learning_rate": 7.59152327623323e-06, "loss": 0.3851, "step": 3276 }, { "epoch": 2.7584175084175087, "grad_norm": 0.42907604575157166, "learning_rate": 7.589728100318083e-06, "loss": 0.3649, "step": 3277 }, { "epoch": 2.7592592592592595, "grad_norm": 0.4173426926136017, "learning_rate": 7.587932468065234e-06, "loss": 0.3534, "step": 3278 }, { "epoch": 2.76010101010101, "grad_norm": 0.5045489072799683, "learning_rate": 7.586136379791093e-06, "loss": 0.384, "step": 3279 }, { "epoch": 2.760942760942761, "grad_norm": 0.4689265489578247, "learning_rate": 7.5843398358121514e-06, "loss": 0.3608, "step": 3280 }, { "epoch": 2.7617845117845117, "grad_norm": 0.4118746817111969, "learning_rate": 7.5825428364449775e-06, "loss": 0.3733, "step": 3281 }, { "epoch": 2.7626262626262625, "grad_norm": 0.46072787046432495, "learning_rate": 7.58074538200622e-06, "loss": 0.3545, "step": 3282 }, { "epoch": 2.7634680134680134, "grad_norm": 0.4756283760070801, "learning_rate": 7.578947472812614e-06, "loss": 0.3828, "step": 3283 }, { "epoch": 2.7643097643097643, "grad_norm": 0.38091933727264404, "learning_rate": 7.577149109180968e-06, "loss": 0.3642, "step": 3284 }, { "epoch": 2.765151515151515, "grad_norm": 0.5072594881057739, "learning_rate": 7.5753502914281715e-06, "loss": 0.3681, "step": 3285 }, { "epoch": 2.765993265993266, "grad_norm": 0.41671988368034363, "learning_rate": 7.573551019871198e-06, "loss": 0.3646, "step": 3286 }, { "epoch": 2.766835016835017, "grad_norm": 0.4310835897922516, "learning_rate": 7.571751294827097e-06, "loss": 0.3685, "step": 3287 }, { "epoch": 2.7676767676767677, "grad_norm": 0.4612216651439667, "learning_rate": 7.569951116612999e-06, "loss": 0.3649, "step": 3288 }, { "epoch": 2.7685185185185186, "grad_norm": 0.4473431408405304, "learning_rate": 7.568150485546115e-06, "loss": 0.3566, "step": 3289 }, { "epoch": 2.7693602693602695, "grad_norm": 0.4665190279483795, "learning_rate": 7.566349401943736e-06, "loss": 0.3567, "step": 3290 }, { "epoch": 2.7702020202020203, "grad_norm": 0.45248082280158997, "learning_rate": 7.564547866123229e-06, "loss": 0.3649, "step": 3291 }, { "epoch": 2.771043771043771, "grad_norm": 0.4459608197212219, "learning_rate": 7.5627458784020495e-06, "loss": 0.3823, "step": 3292 }, { "epoch": 2.7718855218855216, "grad_norm": 0.4696965217590332, "learning_rate": 7.560943439097721e-06, "loss": 0.3803, "step": 3293 }, { "epoch": 2.7727272727272725, "grad_norm": 0.41458022594451904, "learning_rate": 7.559140548527857e-06, "loss": 0.3779, "step": 3294 }, { "epoch": 2.7735690235690234, "grad_norm": 0.3943292796611786, "learning_rate": 7.557337207010143e-06, "loss": 0.3727, "step": 3295 }, { "epoch": 2.774410774410774, "grad_norm": 0.4472830295562744, "learning_rate": 7.555533414862351e-06, "loss": 0.3807, "step": 3296 }, { "epoch": 2.775252525252525, "grad_norm": 0.41130638122558594, "learning_rate": 7.553729172402322e-06, "loss": 0.388, "step": 3297 }, { "epoch": 2.776094276094276, "grad_norm": 0.39803892374038696, "learning_rate": 7.551924479947988e-06, "loss": 0.3736, "step": 3298 }, { "epoch": 2.776936026936027, "grad_norm": 0.45159029960632324, "learning_rate": 7.550119337817353e-06, "loss": 0.3752, "step": 3299 }, { "epoch": 2.7777777777777777, "grad_norm": 0.3792920708656311, "learning_rate": 7.548313746328504e-06, "loss": 0.3792, "step": 3300 }, { "epoch": 2.7786195286195285, "grad_norm": 0.4255044460296631, "learning_rate": 7.546507705799604e-06, "loss": 0.3511, "step": 3301 }, { "epoch": 2.7794612794612794, "grad_norm": 0.41190603375434875, "learning_rate": 7.544701216548895e-06, "loss": 0.3715, "step": 3302 }, { "epoch": 2.7803030303030303, "grad_norm": 0.4541373550891876, "learning_rate": 7.542894278894703e-06, "loss": 0.3833, "step": 3303 }, { "epoch": 2.781144781144781, "grad_norm": 0.4301801919937134, "learning_rate": 7.541086893155429e-06, "loss": 0.3488, "step": 3304 }, { "epoch": 2.781986531986532, "grad_norm": 0.40475180745124817, "learning_rate": 7.539279059649553e-06, "loss": 0.384, "step": 3305 }, { "epoch": 2.782828282828283, "grad_norm": 0.43699541687965393, "learning_rate": 7.5374707786956325e-06, "loss": 0.3647, "step": 3306 }, { "epoch": 2.7836700336700337, "grad_norm": 0.48588836193084717, "learning_rate": 7.535662050612309e-06, "loss": 0.3649, "step": 3307 }, { "epoch": 2.7845117845117846, "grad_norm": 0.4382491111755371, "learning_rate": 7.533852875718297e-06, "loss": 0.3646, "step": 3308 }, { "epoch": 2.7853535353535355, "grad_norm": 0.40612271428108215, "learning_rate": 7.5320432543323965e-06, "loss": 0.3613, "step": 3309 }, { "epoch": 2.7861952861952863, "grad_norm": 0.5080970525741577, "learning_rate": 7.530233186773477e-06, "loss": 0.3622, "step": 3310 }, { "epoch": 2.787037037037037, "grad_norm": 0.4270060956478119, "learning_rate": 7.528422673360495e-06, "loss": 0.3454, "step": 3311 }, { "epoch": 2.787878787878788, "grad_norm": 0.4667609632015228, "learning_rate": 7.5266117144124805e-06, "loss": 0.3583, "step": 3312 }, { "epoch": 2.788720538720539, "grad_norm": 0.4549032151699066, "learning_rate": 7.524800310248546e-06, "loss": 0.3402, "step": 3313 }, { "epoch": 2.78956228956229, "grad_norm": 0.42628931999206543, "learning_rate": 7.522988461187878e-06, "loss": 0.3681, "step": 3314 }, { "epoch": 2.7904040404040407, "grad_norm": 0.49974924325942993, "learning_rate": 7.5211761675497444e-06, "loss": 0.3982, "step": 3315 }, { "epoch": 2.791245791245791, "grad_norm": 0.4675558805465698, "learning_rate": 7.51936342965349e-06, "loss": 0.3517, "step": 3316 }, { "epoch": 2.792087542087542, "grad_norm": 0.4054194390773773, "learning_rate": 7.517550247818542e-06, "loss": 0.3835, "step": 3317 }, { "epoch": 2.792929292929293, "grad_norm": 0.5536786913871765, "learning_rate": 7.515736622364397e-06, "loss": 0.3798, "step": 3318 }, { "epoch": 2.7937710437710437, "grad_norm": 0.4427326023578644, "learning_rate": 7.51392255361064e-06, "loss": 0.3677, "step": 3319 }, { "epoch": 2.7946127946127945, "grad_norm": 0.39212119579315186, "learning_rate": 7.512108041876925e-06, "loss": 0.3586, "step": 3320 }, { "epoch": 2.7954545454545454, "grad_norm": 0.5039368271827698, "learning_rate": 7.510293087482992e-06, "loss": 0.365, "step": 3321 }, { "epoch": 2.7962962962962963, "grad_norm": 0.4757993221282959, "learning_rate": 7.508477690748652e-06, "loss": 0.3567, "step": 3322 }, { "epoch": 2.797138047138047, "grad_norm": 0.47795820236206055, "learning_rate": 7.5066618519937995e-06, "loss": 0.3596, "step": 3323 }, { "epoch": 2.797979797979798, "grad_norm": 0.4317070543766022, "learning_rate": 7.504845571538406e-06, "loss": 0.3789, "step": 3324 }, { "epoch": 2.798821548821549, "grad_norm": 0.46346041560173035, "learning_rate": 7.503028849702516e-06, "loss": 0.3519, "step": 3325 }, { "epoch": 2.7996632996632997, "grad_norm": 0.4668067395687103, "learning_rate": 7.5012116868062565e-06, "loss": 0.3736, "step": 3326 }, { "epoch": 2.8005050505050506, "grad_norm": 0.4740873873233795, "learning_rate": 7.499394083169832e-06, "loss": 0.3736, "step": 3327 }, { "epoch": 2.8013468013468015, "grad_norm": 0.409469872713089, "learning_rate": 7.497576039113522e-06, "loss": 0.3748, "step": 3328 }, { "epoch": 2.8021885521885523, "grad_norm": 0.47661685943603516, "learning_rate": 7.495757554957688e-06, "loss": 0.3612, "step": 3329 }, { "epoch": 2.8030303030303028, "grad_norm": 0.501055121421814, "learning_rate": 7.493938631022764e-06, "loss": 0.3751, "step": 3330 }, { "epoch": 2.8038720538720536, "grad_norm": 0.46686142683029175, "learning_rate": 7.492119267629262e-06, "loss": 0.3701, "step": 3331 }, { "epoch": 2.8047138047138045, "grad_norm": 0.5451623797416687, "learning_rate": 7.490299465097777e-06, "loss": 0.4041, "step": 3332 }, { "epoch": 2.8055555555555554, "grad_norm": 0.4210052788257599, "learning_rate": 7.488479223748977e-06, "loss": 0.3571, "step": 3333 }, { "epoch": 2.8063973063973062, "grad_norm": 0.4698493182659149, "learning_rate": 7.486658543903606e-06, "loss": 0.38, "step": 3334 }, { "epoch": 2.807239057239057, "grad_norm": 0.5166317820549011, "learning_rate": 7.4848374258824885e-06, "loss": 0.3421, "step": 3335 }, { "epoch": 2.808080808080808, "grad_norm": 0.414994478225708, "learning_rate": 7.483015870006525e-06, "loss": 0.3867, "step": 3336 }, { "epoch": 2.808922558922559, "grad_norm": 0.5553076863288879, "learning_rate": 7.481193876596693e-06, "loss": 0.3772, "step": 3337 }, { "epoch": 2.8097643097643097, "grad_norm": 0.4233987033367157, "learning_rate": 7.479371445974046e-06, "loss": 0.3725, "step": 3338 }, { "epoch": 2.8106060606060606, "grad_norm": 0.463186115026474, "learning_rate": 7.477548578459717e-06, "loss": 0.3862, "step": 3339 }, { "epoch": 2.8114478114478114, "grad_norm": 0.46857449412345886, "learning_rate": 7.475725274374914e-06, "loss": 0.353, "step": 3340 }, { "epoch": 2.8122895622895623, "grad_norm": 0.5237926244735718, "learning_rate": 7.473901534040924e-06, "loss": 0.35, "step": 3341 }, { "epoch": 2.813131313131313, "grad_norm": 0.45795294642448425, "learning_rate": 7.472077357779108e-06, "loss": 0.3482, "step": 3342 }, { "epoch": 2.813973063973064, "grad_norm": 0.47134125232696533, "learning_rate": 7.4702527459109054e-06, "loss": 0.3824, "step": 3343 }, { "epoch": 2.814814814814815, "grad_norm": 0.46175575256347656, "learning_rate": 7.468427698757831e-06, "loss": 0.3768, "step": 3344 }, { "epoch": 2.8156565656565657, "grad_norm": 0.45522820949554443, "learning_rate": 7.466602216641481e-06, "loss": 0.393, "step": 3345 }, { "epoch": 2.8164983164983166, "grad_norm": 0.45280811190605164, "learning_rate": 7.464776299883523e-06, "loss": 0.3768, "step": 3346 }, { "epoch": 2.8173400673400675, "grad_norm": 0.4569155275821686, "learning_rate": 7.4629499488057025e-06, "loss": 0.3449, "step": 3347 }, { "epoch": 2.8181818181818183, "grad_norm": 0.47541168332099915, "learning_rate": 7.461123163729842e-06, "loss": 0.3523, "step": 3348 }, { "epoch": 2.819023569023569, "grad_norm": 0.44842609763145447, "learning_rate": 7.459295944977842e-06, "loss": 0.3597, "step": 3349 }, { "epoch": 2.81986531986532, "grad_norm": 0.47499412298202515, "learning_rate": 7.457468292871675e-06, "loss": 0.3844, "step": 3350 }, { "epoch": 2.820707070707071, "grad_norm": 0.4260752201080322, "learning_rate": 7.455640207733397e-06, "loss": 0.3809, "step": 3351 }, { "epoch": 2.821548821548822, "grad_norm": 0.451578289270401, "learning_rate": 7.453811689885131e-06, "loss": 0.3906, "step": 3352 }, { "epoch": 2.8223905723905722, "grad_norm": 0.4944438636302948, "learning_rate": 7.451982739649087e-06, "loss": 0.364, "step": 3353 }, { "epoch": 2.823232323232323, "grad_norm": 0.456232488155365, "learning_rate": 7.450153357347541e-06, "loss": 0.3814, "step": 3354 }, { "epoch": 2.824074074074074, "grad_norm": 0.3908412456512451, "learning_rate": 7.448323543302852e-06, "loss": 0.3471, "step": 3355 }, { "epoch": 2.824915824915825, "grad_norm": 0.47612518072128296, "learning_rate": 7.446493297837452e-06, "loss": 0.3551, "step": 3356 }, { "epoch": 2.8257575757575757, "grad_norm": 0.46988847851753235, "learning_rate": 7.444662621273851e-06, "loss": 0.3682, "step": 3357 }, { "epoch": 2.8265993265993266, "grad_norm": 0.36950892210006714, "learning_rate": 7.442831513934631e-06, "loss": 0.3594, "step": 3358 }, { "epoch": 2.8274410774410774, "grad_norm": 0.44464626908302307, "learning_rate": 7.440999976142454e-06, "loss": 0.3721, "step": 3359 }, { "epoch": 2.8282828282828283, "grad_norm": 0.43218541145324707, "learning_rate": 7.4391680082200565e-06, "loss": 0.3777, "step": 3360 }, { "epoch": 2.829124579124579, "grad_norm": 0.4651750922203064, "learning_rate": 7.437335610490253e-06, "loss": 0.3564, "step": 3361 }, { "epoch": 2.82996632996633, "grad_norm": 0.46871066093444824, "learning_rate": 7.435502783275928e-06, "loss": 0.3648, "step": 3362 }, { "epoch": 2.830808080808081, "grad_norm": 0.4423588514328003, "learning_rate": 7.433669526900046e-06, "loss": 0.3606, "step": 3363 }, { "epoch": 2.8316498316498318, "grad_norm": 0.4596984088420868, "learning_rate": 7.431835841685648e-06, "loss": 0.3451, "step": 3364 }, { "epoch": 2.8324915824915826, "grad_norm": 0.5366805195808411, "learning_rate": 7.430001727955848e-06, "loss": 0.3886, "step": 3365 }, { "epoch": 2.8333333333333335, "grad_norm": 0.4248734712600708, "learning_rate": 7.428167186033835e-06, "loss": 0.382, "step": 3366 }, { "epoch": 2.834175084175084, "grad_norm": 0.4571065306663513, "learning_rate": 7.426332216242878e-06, "loss": 0.3742, "step": 3367 }, { "epoch": 2.8350168350168348, "grad_norm": 0.4790012836456299, "learning_rate": 7.424496818906315e-06, "loss": 0.379, "step": 3368 }, { "epoch": 2.8358585858585856, "grad_norm": 0.47879543900489807, "learning_rate": 7.422660994347564e-06, "loss": 0.3835, "step": 3369 }, { "epoch": 2.8367003367003365, "grad_norm": 0.42840227484703064, "learning_rate": 7.420824742890117e-06, "loss": 0.3792, "step": 3370 }, { "epoch": 2.8375420875420874, "grad_norm": 0.4202626049518585, "learning_rate": 7.41898806485754e-06, "loss": 0.3643, "step": 3371 }, { "epoch": 2.8383838383838382, "grad_norm": 0.44447019696235657, "learning_rate": 7.4171509605734785e-06, "loss": 0.3772, "step": 3372 }, { "epoch": 2.839225589225589, "grad_norm": 0.46548089385032654, "learning_rate": 7.4153134303616456e-06, "loss": 0.3634, "step": 3373 }, { "epoch": 2.84006734006734, "grad_norm": 0.4698775112628937, "learning_rate": 7.413475474545836e-06, "loss": 0.3824, "step": 3374 }, { "epoch": 2.840909090909091, "grad_norm": 0.4593964219093323, "learning_rate": 7.411637093449916e-06, "loss": 0.3807, "step": 3375 }, { "epoch": 2.8417508417508417, "grad_norm": 0.47799617052078247, "learning_rate": 7.409798287397829e-06, "loss": 0.3864, "step": 3376 }, { "epoch": 2.8425925925925926, "grad_norm": 0.38910549879074097, "learning_rate": 7.407959056713594e-06, "loss": 0.3523, "step": 3377 }, { "epoch": 2.8434343434343434, "grad_norm": 0.4490715563297272, "learning_rate": 7.406119401721299e-06, "loss": 0.3552, "step": 3378 }, { "epoch": 2.8442760942760943, "grad_norm": 0.5067480802536011, "learning_rate": 7.404279322745112e-06, "loss": 0.347, "step": 3379 }, { "epoch": 2.845117845117845, "grad_norm": 0.3880104422569275, "learning_rate": 7.402438820109277e-06, "loss": 0.3695, "step": 3380 }, { "epoch": 2.845959595959596, "grad_norm": 0.46624135971069336, "learning_rate": 7.400597894138107e-06, "loss": 0.3925, "step": 3381 }, { "epoch": 2.846801346801347, "grad_norm": 0.470758318901062, "learning_rate": 7.398756545155994e-06, "loss": 0.3918, "step": 3382 }, { "epoch": 2.8476430976430978, "grad_norm": 0.466831773519516, "learning_rate": 7.396914773487406e-06, "loss": 0.3693, "step": 3383 }, { "epoch": 2.8484848484848486, "grad_norm": 0.4648755192756653, "learning_rate": 7.395072579456878e-06, "loss": 0.3583, "step": 3384 }, { "epoch": 2.8493265993265995, "grad_norm": 0.4552430212497711, "learning_rate": 7.393229963389028e-06, "loss": 0.3801, "step": 3385 }, { "epoch": 2.8501683501683504, "grad_norm": 0.4345319867134094, "learning_rate": 7.391386925608543e-06, "loss": 0.3695, "step": 3386 }, { "epoch": 2.851010101010101, "grad_norm": 0.4438598155975342, "learning_rate": 7.389543466440185e-06, "loss": 0.3944, "step": 3387 }, { "epoch": 2.851851851851852, "grad_norm": 0.4335768520832062, "learning_rate": 7.387699586208794e-06, "loss": 0.3663, "step": 3388 }, { "epoch": 2.852693602693603, "grad_norm": 0.4196683168411255, "learning_rate": 7.385855285239279e-06, "loss": 0.3669, "step": 3389 }, { "epoch": 2.8535353535353534, "grad_norm": 0.4197171926498413, "learning_rate": 7.384010563856625e-06, "loss": 0.3614, "step": 3390 }, { "epoch": 2.8543771043771042, "grad_norm": 0.4630451202392578, "learning_rate": 7.382165422385894e-06, "loss": 0.3804, "step": 3391 }, { "epoch": 2.855218855218855, "grad_norm": 0.46646618843078613, "learning_rate": 7.380319861152219e-06, "loss": 0.3782, "step": 3392 }, { "epoch": 2.856060606060606, "grad_norm": 0.4365847706794739, "learning_rate": 7.378473880480805e-06, "loss": 0.3587, "step": 3393 }, { "epoch": 2.856902356902357, "grad_norm": 0.43402040004730225, "learning_rate": 7.3766274806969366e-06, "loss": 0.3537, "step": 3394 }, { "epoch": 2.8577441077441077, "grad_norm": 0.4374130964279175, "learning_rate": 7.374780662125967e-06, "loss": 0.3686, "step": 3395 }, { "epoch": 2.8585858585858586, "grad_norm": 0.4600830376148224, "learning_rate": 7.372933425093327e-06, "loss": 0.3787, "step": 3396 }, { "epoch": 2.8594276094276094, "grad_norm": 0.45783475041389465, "learning_rate": 7.371085769924519e-06, "loss": 0.3592, "step": 3397 }, { "epoch": 2.8602693602693603, "grad_norm": 0.4525892734527588, "learning_rate": 7.369237696945118e-06, "loss": 0.3431, "step": 3398 }, { "epoch": 2.861111111111111, "grad_norm": 0.4148608148097992, "learning_rate": 7.367389206480776e-06, "loss": 0.3796, "step": 3399 }, { "epoch": 2.861952861952862, "grad_norm": 0.4455849230289459, "learning_rate": 7.3655402988572154e-06, "loss": 0.3916, "step": 3400 }, { "epoch": 2.862794612794613, "grad_norm": 0.47573718428611755, "learning_rate": 7.363690974400235e-06, "loss": 0.3755, "step": 3401 }, { "epoch": 2.8636363636363638, "grad_norm": 0.4407208561897278, "learning_rate": 7.361841233435705e-06, "loss": 0.3695, "step": 3402 }, { "epoch": 2.8644781144781146, "grad_norm": 0.4417763650417328, "learning_rate": 7.359991076289568e-06, "loss": 0.3746, "step": 3403 }, { "epoch": 2.865319865319865, "grad_norm": 0.48923373222351074, "learning_rate": 7.358140503287843e-06, "loss": 0.3909, "step": 3404 }, { "epoch": 2.866161616161616, "grad_norm": 0.44245946407318115, "learning_rate": 7.356289514756619e-06, "loss": 0.371, "step": 3405 }, { "epoch": 2.8670033670033668, "grad_norm": 0.4105851352214813, "learning_rate": 7.354438111022062e-06, "loss": 0.3587, "step": 3406 }, { "epoch": 2.8678451178451176, "grad_norm": 0.4530768096446991, "learning_rate": 7.352586292410408e-06, "loss": 0.3571, "step": 3407 }, { "epoch": 2.8686868686868685, "grad_norm": 0.4326963722705841, "learning_rate": 7.350734059247968e-06, "loss": 0.3676, "step": 3408 }, { "epoch": 2.8695286195286194, "grad_norm": 0.37328821420669556, "learning_rate": 7.348881411861124e-06, "loss": 0.3561, "step": 3409 }, { "epoch": 2.8703703703703702, "grad_norm": 0.4212169349193573, "learning_rate": 7.3470283505763355e-06, "loss": 0.3716, "step": 3410 }, { "epoch": 2.871212121212121, "grad_norm": 0.42123427987098694, "learning_rate": 7.345174875720125e-06, "loss": 0.363, "step": 3411 }, { "epoch": 2.872053872053872, "grad_norm": 0.40477684140205383, "learning_rate": 7.3433209876191e-06, "loss": 0.3866, "step": 3412 }, { "epoch": 2.872895622895623, "grad_norm": 0.4571145176887512, "learning_rate": 7.341466686599934e-06, "loss": 0.369, "step": 3413 }, { "epoch": 2.8737373737373737, "grad_norm": 0.4171546697616577, "learning_rate": 7.339611972989374e-06, "loss": 0.3588, "step": 3414 }, { "epoch": 2.8745791245791246, "grad_norm": 0.4491715133190155, "learning_rate": 7.337756847114242e-06, "loss": 0.37, "step": 3415 }, { "epoch": 2.8754208754208754, "grad_norm": 0.49722713232040405, "learning_rate": 7.335901309301428e-06, "loss": 0.3577, "step": 3416 }, { "epoch": 2.8762626262626263, "grad_norm": 0.4015265107154846, "learning_rate": 7.334045359877902e-06, "loss": 0.3587, "step": 3417 }, { "epoch": 2.877104377104377, "grad_norm": 0.3660012483596802, "learning_rate": 7.332188999170696e-06, "loss": 0.3807, "step": 3418 }, { "epoch": 2.877946127946128, "grad_norm": 0.44266477227211, "learning_rate": 7.3303322275069256e-06, "loss": 0.3576, "step": 3419 }, { "epoch": 2.878787878787879, "grad_norm": 0.49396511912345886, "learning_rate": 7.328475045213772e-06, "loss": 0.3623, "step": 3420 }, { "epoch": 2.8796296296296298, "grad_norm": 0.39539316296577454, "learning_rate": 7.32661745261849e-06, "loss": 0.3671, "step": 3421 }, { "epoch": 2.8804713804713806, "grad_norm": 0.47065943479537964, "learning_rate": 7.324759450048408e-06, "loss": 0.357, "step": 3422 }, { "epoch": 2.8813131313131315, "grad_norm": 0.458012193441391, "learning_rate": 7.3229010378309275e-06, "loss": 0.3607, "step": 3423 }, { "epoch": 2.8821548821548824, "grad_norm": 0.479110449552536, "learning_rate": 7.321042216293514e-06, "loss": 0.3704, "step": 3424 }, { "epoch": 2.8829966329966332, "grad_norm": 0.4262150228023529, "learning_rate": 7.319182985763722e-06, "loss": 0.3839, "step": 3425 }, { "epoch": 2.883838383838384, "grad_norm": 0.42814627289772034, "learning_rate": 7.317323346569159e-06, "loss": 0.3591, "step": 3426 }, { "epoch": 2.884680134680135, "grad_norm": 0.4642055332660675, "learning_rate": 7.315463299037517e-06, "loss": 0.373, "step": 3427 }, { "epoch": 2.8855218855218854, "grad_norm": 0.4816725552082062, "learning_rate": 7.3136028434965544e-06, "loss": 0.3934, "step": 3428 }, { "epoch": 2.8863636363636362, "grad_norm": 0.41963520646095276, "learning_rate": 7.3117419802741075e-06, "loss": 0.3894, "step": 3429 }, { "epoch": 2.887205387205387, "grad_norm": 0.4455481767654419, "learning_rate": 7.309880709698075e-06, "loss": 0.3629, "step": 3430 }, { "epoch": 2.888047138047138, "grad_norm": 0.4315522611141205, "learning_rate": 7.308019032096435e-06, "loss": 0.3523, "step": 3431 }, { "epoch": 2.888888888888889, "grad_norm": 0.4470425546169281, "learning_rate": 7.306156947797235e-06, "loss": 0.3887, "step": 3432 }, { "epoch": 2.8897306397306397, "grad_norm": 0.43249186873435974, "learning_rate": 7.304294457128593e-06, "loss": 0.3889, "step": 3433 }, { "epoch": 2.8905723905723906, "grad_norm": 0.40002623200416565, "learning_rate": 7.302431560418703e-06, "loss": 0.3658, "step": 3434 }, { "epoch": 2.8914141414141414, "grad_norm": 0.5076291561126709, "learning_rate": 7.3005682579958215e-06, "loss": 0.3633, "step": 3435 }, { "epoch": 2.8922558922558923, "grad_norm": 0.4780314266681671, "learning_rate": 7.298704550188289e-06, "loss": 0.3781, "step": 3436 }, { "epoch": 2.893097643097643, "grad_norm": 0.4311736226081848, "learning_rate": 7.296840437324505e-06, "loss": 0.3576, "step": 3437 }, { "epoch": 2.893939393939394, "grad_norm": 0.4819246828556061, "learning_rate": 7.2949759197329495e-06, "loss": 0.3553, "step": 3438 }, { "epoch": 2.894781144781145, "grad_norm": 0.42791515588760376, "learning_rate": 7.293110997742167e-06, "loss": 0.3502, "step": 3439 }, { "epoch": 2.8956228956228958, "grad_norm": 0.40688204765319824, "learning_rate": 7.291245671680782e-06, "loss": 0.3793, "step": 3440 }, { "epoch": 2.8964646464646466, "grad_norm": 0.4261029362678528, "learning_rate": 7.28937994187748e-06, "loss": 0.3863, "step": 3441 }, { "epoch": 2.897306397306397, "grad_norm": 0.4433722496032715, "learning_rate": 7.287513808661026e-06, "loss": 0.3796, "step": 3442 }, { "epoch": 2.898148148148148, "grad_norm": 0.4581228792667389, "learning_rate": 7.285647272360248e-06, "loss": 0.3666, "step": 3443 }, { "epoch": 2.898989898989899, "grad_norm": 0.42080819606781006, "learning_rate": 7.2837803333040545e-06, "loss": 0.3551, "step": 3444 }, { "epoch": 2.8998316498316496, "grad_norm": 0.4227973520755768, "learning_rate": 7.281912991821417e-06, "loss": 0.3849, "step": 3445 }, { "epoch": 2.9006734006734005, "grad_norm": 0.422392338514328, "learning_rate": 7.280045248241382e-06, "loss": 0.3584, "step": 3446 }, { "epoch": 2.9015151515151514, "grad_norm": 0.49874988198280334, "learning_rate": 7.278177102893065e-06, "loss": 0.3712, "step": 3447 }, { "epoch": 2.9023569023569022, "grad_norm": 0.44879093766212463, "learning_rate": 7.276308556105655e-06, "loss": 0.3727, "step": 3448 }, { "epoch": 2.903198653198653, "grad_norm": 0.4766097366809845, "learning_rate": 7.274439608208409e-06, "loss": 0.3655, "step": 3449 }, { "epoch": 2.904040404040404, "grad_norm": 0.4177875220775604, "learning_rate": 7.2725702595306545e-06, "loss": 0.3662, "step": 3450 }, { "epoch": 2.904882154882155, "grad_norm": 0.5355978608131409, "learning_rate": 7.270700510401791e-06, "loss": 0.3651, "step": 3451 }, { "epoch": 2.9057239057239057, "grad_norm": 0.4646821618080139, "learning_rate": 7.26883036115129e-06, "loss": 0.3867, "step": 3452 }, { "epoch": 2.9065656565656566, "grad_norm": 0.41565459966659546, "learning_rate": 7.266959812108691e-06, "loss": 0.3381, "step": 3453 }, { "epoch": 2.9074074074074074, "grad_norm": 0.5759257078170776, "learning_rate": 7.265088863603601e-06, "loss": 0.3735, "step": 3454 }, { "epoch": 2.9082491582491583, "grad_norm": 0.4739326238632202, "learning_rate": 7.263217515965706e-06, "loss": 0.3709, "step": 3455 }, { "epoch": 2.909090909090909, "grad_norm": 0.5069561004638672, "learning_rate": 7.2613457695247544e-06, "loss": 0.3536, "step": 3456 }, { "epoch": 2.90993265993266, "grad_norm": 0.41775625944137573, "learning_rate": 7.2594736246105716e-06, "loss": 0.3652, "step": 3457 }, { "epoch": 2.910774410774411, "grad_norm": 0.5098841190338135, "learning_rate": 7.257601081553045e-06, "loss": 0.3819, "step": 3458 }, { "epoch": 2.9116161616161618, "grad_norm": 0.4865054786205292, "learning_rate": 7.255728140682139e-06, "loss": 0.3836, "step": 3459 }, { "epoch": 2.9124579124579126, "grad_norm": 0.45205357670783997, "learning_rate": 7.253854802327885e-06, "loss": 0.3714, "step": 3460 }, { "epoch": 2.9132996632996635, "grad_norm": 0.45857593417167664, "learning_rate": 7.251981066820387e-06, "loss": 0.3583, "step": 3461 }, { "epoch": 2.9141414141414144, "grad_norm": 0.43941572308540344, "learning_rate": 7.250106934489813e-06, "loss": 0.3538, "step": 3462 }, { "epoch": 2.9149831649831652, "grad_norm": 0.42718034982681274, "learning_rate": 7.248232405666411e-06, "loss": 0.3638, "step": 3463 }, { "epoch": 2.915824915824916, "grad_norm": 0.4079572260379791, "learning_rate": 7.246357480680488e-06, "loss": 0.364, "step": 3464 }, { "epoch": 2.9166666666666665, "grad_norm": 0.4027693271636963, "learning_rate": 7.244482159862428e-06, "loss": 0.3955, "step": 3465 }, { "epoch": 2.9175084175084174, "grad_norm": 0.4181070327758789, "learning_rate": 7.242606443542681e-06, "loss": 0.3804, "step": 3466 }, { "epoch": 2.9183501683501682, "grad_norm": 0.41708138585090637, "learning_rate": 7.240730332051771e-06, "loss": 0.3731, "step": 3467 }, { "epoch": 2.919191919191919, "grad_norm": 0.43894463777542114, "learning_rate": 7.238853825720287e-06, "loss": 0.369, "step": 3468 }, { "epoch": 2.92003367003367, "grad_norm": 0.4592682421207428, "learning_rate": 7.236976924878888e-06, "loss": 0.407, "step": 3469 }, { "epoch": 2.920875420875421, "grad_norm": 0.42428311705589294, "learning_rate": 7.235099629858307e-06, "loss": 0.3777, "step": 3470 }, { "epoch": 2.9217171717171717, "grad_norm": 0.4256041347980499, "learning_rate": 7.2332219409893404e-06, "loss": 0.3698, "step": 3471 }, { "epoch": 2.9225589225589226, "grad_norm": 0.4489157199859619, "learning_rate": 7.23134385860286e-06, "loss": 0.359, "step": 3472 }, { "epoch": 2.9234006734006734, "grad_norm": 0.4387713670730591, "learning_rate": 7.2294653830298e-06, "loss": 0.3683, "step": 3473 }, { "epoch": 2.9242424242424243, "grad_norm": 0.4374532103538513, "learning_rate": 7.227586514601172e-06, "loss": 0.3916, "step": 3474 }, { "epoch": 2.925084175084175, "grad_norm": 0.4763369858264923, "learning_rate": 7.2257072536480514e-06, "loss": 0.3653, "step": 3475 }, { "epoch": 2.925925925925926, "grad_norm": 0.4598913788795471, "learning_rate": 7.223827600501583e-06, "loss": 0.3451, "step": 3476 }, { "epoch": 2.926767676767677, "grad_norm": 0.39127376675605774, "learning_rate": 7.221947555492982e-06, "loss": 0.3656, "step": 3477 }, { "epoch": 2.9276094276094278, "grad_norm": 0.42243069410324097, "learning_rate": 7.220067118953533e-06, "loss": 0.369, "step": 3478 }, { "epoch": 2.928451178451178, "grad_norm": 0.5104761719703674, "learning_rate": 7.2181862912145874e-06, "loss": 0.3477, "step": 3479 }, { "epoch": 2.929292929292929, "grad_norm": 0.40013056993484497, "learning_rate": 7.216305072607569e-06, "loss": 0.3855, "step": 3480 }, { "epoch": 2.93013468013468, "grad_norm": 0.39363357424736023, "learning_rate": 7.214423463463969e-06, "loss": 0.3373, "step": 3481 }, { "epoch": 2.930976430976431, "grad_norm": 0.5041378736495972, "learning_rate": 7.212541464115344e-06, "loss": 0.3732, "step": 3482 }, { "epoch": 2.9318181818181817, "grad_norm": 0.41720226407051086, "learning_rate": 7.210659074893326e-06, "loss": 0.3612, "step": 3483 }, { "epoch": 2.9326599326599325, "grad_norm": 0.3918554484844208, "learning_rate": 7.2087762961296095e-06, "loss": 0.3685, "step": 3484 }, { "epoch": 2.9335016835016834, "grad_norm": 0.5137543082237244, "learning_rate": 7.206893128155959e-06, "loss": 0.3745, "step": 3485 }, { "epoch": 2.9343434343434343, "grad_norm": 0.4408521354198456, "learning_rate": 7.205009571304213e-06, "loss": 0.3822, "step": 3486 }, { "epoch": 2.935185185185185, "grad_norm": 0.4178577661514282, "learning_rate": 7.203125625906272e-06, "loss": 0.3384, "step": 3487 }, { "epoch": 2.936026936026936, "grad_norm": 0.47856175899505615, "learning_rate": 7.201241292294105e-06, "loss": 0.3654, "step": 3488 }, { "epoch": 2.936868686868687, "grad_norm": 0.4211041033267975, "learning_rate": 7.199356570799757e-06, "loss": 0.358, "step": 3489 }, { "epoch": 2.9377104377104377, "grad_norm": 0.4092562794685364, "learning_rate": 7.197471461755331e-06, "loss": 0.366, "step": 3490 }, { "epoch": 2.9385521885521886, "grad_norm": 0.41669726371765137, "learning_rate": 7.195585965493006e-06, "loss": 0.3794, "step": 3491 }, { "epoch": 2.9393939393939394, "grad_norm": 0.4395517408847809, "learning_rate": 7.193700082345026e-06, "loss": 0.3577, "step": 3492 }, { "epoch": 2.9402356902356903, "grad_norm": 0.4074990153312683, "learning_rate": 7.1918138126437036e-06, "loss": 0.361, "step": 3493 }, { "epoch": 2.941077441077441, "grad_norm": 0.3630622625350952, "learning_rate": 7.189927156721418e-06, "loss": 0.3506, "step": 3494 }, { "epoch": 2.941919191919192, "grad_norm": 0.4615698754787445, "learning_rate": 7.1880401149106215e-06, "loss": 0.3825, "step": 3495 }, { "epoch": 2.942760942760943, "grad_norm": 0.485809326171875, "learning_rate": 7.186152687543827e-06, "loss": 0.372, "step": 3496 }, { "epoch": 2.9436026936026938, "grad_norm": 0.3713712692260742, "learning_rate": 7.184264874953624e-06, "loss": 0.3423, "step": 3497 }, { "epoch": 2.9444444444444446, "grad_norm": 0.4673263132572174, "learning_rate": 7.182376677472661e-06, "loss": 0.3445, "step": 3498 }, { "epoch": 2.9452861952861955, "grad_norm": 0.46258920431137085, "learning_rate": 7.180488095433661e-06, "loss": 0.3614, "step": 3499 }, { "epoch": 2.9461279461279464, "grad_norm": 0.39236292243003845, "learning_rate": 7.17859912916941e-06, "loss": 0.3816, "step": 3500 }, { "epoch": 2.9469696969696972, "grad_norm": 0.41503241658210754, "learning_rate": 7.1767097790127684e-06, "loss": 0.3688, "step": 3501 }, { "epoch": 2.9478114478114477, "grad_norm": 0.38799723982810974, "learning_rate": 7.174820045296655e-06, "loss": 0.3771, "step": 3502 }, { "epoch": 2.9486531986531985, "grad_norm": 0.4115358889102936, "learning_rate": 7.172929928354062e-06, "loss": 0.4076, "step": 3503 }, { "epoch": 2.9494949494949494, "grad_norm": 0.37965628504753113, "learning_rate": 7.17103942851805e-06, "loss": 0.3617, "step": 3504 }, { "epoch": 2.9503367003367003, "grad_norm": 0.4216770529747009, "learning_rate": 7.169148546121745e-06, "loss": 0.3637, "step": 3505 }, { "epoch": 2.951178451178451, "grad_norm": 0.4180165231227875, "learning_rate": 7.167257281498341e-06, "loss": 0.3757, "step": 3506 }, { "epoch": 2.952020202020202, "grad_norm": 0.38398081064224243, "learning_rate": 7.165365634981094e-06, "loss": 0.3588, "step": 3507 }, { "epoch": 2.952861952861953, "grad_norm": 0.42953070998191833, "learning_rate": 7.16347360690334e-06, "loss": 0.3813, "step": 3508 }, { "epoch": 2.9537037037037037, "grad_norm": 0.41954487562179565, "learning_rate": 7.161581197598469e-06, "loss": 0.3481, "step": 3509 }, { "epoch": 2.9545454545454546, "grad_norm": 0.43090304732322693, "learning_rate": 7.159688407399946e-06, "loss": 0.367, "step": 3510 }, { "epoch": 2.9553872053872055, "grad_norm": 0.4266909956932068, "learning_rate": 7.157795236641301e-06, "loss": 0.3872, "step": 3511 }, { "epoch": 2.9562289562289563, "grad_norm": 0.43865931034088135, "learning_rate": 7.155901685656128e-06, "loss": 0.3888, "step": 3512 }, { "epoch": 2.957070707070707, "grad_norm": 0.4270515739917755, "learning_rate": 7.154007754778094e-06, "loss": 0.3452, "step": 3513 }, { "epoch": 2.957912457912458, "grad_norm": 0.40028148889541626, "learning_rate": 7.1521134443409305e-06, "loss": 0.3752, "step": 3514 }, { "epoch": 2.958754208754209, "grad_norm": 0.39236927032470703, "learning_rate": 7.150218754678431e-06, "loss": 0.3707, "step": 3515 }, { "epoch": 2.9595959595959593, "grad_norm": 0.4237641990184784, "learning_rate": 7.148323686124464e-06, "loss": 0.3749, "step": 3516 }, { "epoch": 2.96043771043771, "grad_norm": 0.4370881915092468, "learning_rate": 7.146428239012959e-06, "loss": 0.382, "step": 3517 }, { "epoch": 2.961279461279461, "grad_norm": 0.4167320430278778, "learning_rate": 7.144532413677915e-06, "loss": 0.3742, "step": 3518 }, { "epoch": 2.962121212121212, "grad_norm": 0.4076353907585144, "learning_rate": 7.1426362104533954e-06, "loss": 0.3424, "step": 3519 }, { "epoch": 2.962962962962963, "grad_norm": 0.42336151003837585, "learning_rate": 7.140739629673531e-06, "loss": 0.3838, "step": 3520 }, { "epoch": 2.9638047138047137, "grad_norm": 0.4860823452472687, "learning_rate": 7.138842671672523e-06, "loss": 0.3629, "step": 3521 }, { "epoch": 2.9646464646464645, "grad_norm": 0.4381859004497528, "learning_rate": 7.136945336784631e-06, "loss": 0.3822, "step": 3522 }, { "epoch": 2.9654882154882154, "grad_norm": 0.40435269474983215, "learning_rate": 7.135047625344188e-06, "loss": 0.3542, "step": 3523 }, { "epoch": 2.9663299663299663, "grad_norm": 0.4235987067222595, "learning_rate": 7.133149537685591e-06, "loss": 0.3731, "step": 3524 }, { "epoch": 2.967171717171717, "grad_norm": 0.45442190766334534, "learning_rate": 7.1312510741433036e-06, "loss": 0.3546, "step": 3525 }, { "epoch": 2.968013468013468, "grad_norm": 0.39585575461387634, "learning_rate": 7.129352235051853e-06, "loss": 0.3671, "step": 3526 }, { "epoch": 2.968855218855219, "grad_norm": 0.4407922923564911, "learning_rate": 7.127453020745839e-06, "loss": 0.3612, "step": 3527 }, { "epoch": 2.9696969696969697, "grad_norm": 0.4968118667602539, "learning_rate": 7.1255534315599194e-06, "loss": 0.3662, "step": 3528 }, { "epoch": 2.9705387205387206, "grad_norm": 0.4474887251853943, "learning_rate": 7.123653467828825e-06, "loss": 0.3761, "step": 3529 }, { "epoch": 2.9713804713804715, "grad_norm": 0.4624294340610504, "learning_rate": 7.121753129887347e-06, "loss": 0.3642, "step": 3530 }, { "epoch": 2.9722222222222223, "grad_norm": 0.43217527866363525, "learning_rate": 7.119852418070348e-06, "loss": 0.3901, "step": 3531 }, { "epoch": 2.973063973063973, "grad_norm": 0.43595701456069946, "learning_rate": 7.1179513327127515e-06, "loss": 0.3652, "step": 3532 }, { "epoch": 2.973905723905724, "grad_norm": 0.4151625335216522, "learning_rate": 7.116049874149551e-06, "loss": 0.3531, "step": 3533 }, { "epoch": 2.974747474747475, "grad_norm": 0.41905924677848816, "learning_rate": 7.114148042715799e-06, "loss": 0.3567, "step": 3534 }, { "epoch": 2.975589225589226, "grad_norm": 0.4332248866558075, "learning_rate": 7.112245838746626e-06, "loss": 0.3616, "step": 3535 }, { "epoch": 2.9764309764309766, "grad_norm": 0.42300117015838623, "learning_rate": 7.110343262577215e-06, "loss": 0.3521, "step": 3536 }, { "epoch": 2.9772727272727275, "grad_norm": 0.42008695006370544, "learning_rate": 7.108440314542823e-06, "loss": 0.3752, "step": 3537 }, { "epoch": 2.9781144781144784, "grad_norm": 0.5144386887550354, "learning_rate": 7.106536994978767e-06, "loss": 0.3915, "step": 3538 }, { "epoch": 2.978956228956229, "grad_norm": 0.4357967674732208, "learning_rate": 7.104633304220434e-06, "loss": 0.3912, "step": 3539 }, { "epoch": 2.9797979797979797, "grad_norm": 0.45614638924598694, "learning_rate": 7.102729242603275e-06, "loss": 0.3416, "step": 3540 }, { "epoch": 2.9806397306397305, "grad_norm": 0.40960294008255005, "learning_rate": 7.100824810462806e-06, "loss": 0.3457, "step": 3541 }, { "epoch": 2.9814814814814814, "grad_norm": 0.49771782755851746, "learning_rate": 7.098920008134607e-06, "loss": 0.3745, "step": 3542 }, { "epoch": 2.9823232323232323, "grad_norm": 0.4691956639289856, "learning_rate": 7.0970148359543255e-06, "loss": 0.372, "step": 3543 }, { "epoch": 2.983164983164983, "grad_norm": 0.47257155179977417, "learning_rate": 7.095109294257672e-06, "loss": 0.3876, "step": 3544 }, { "epoch": 2.984006734006734, "grad_norm": 0.407505601644516, "learning_rate": 7.093203383380424e-06, "loss": 0.372, "step": 3545 }, { "epoch": 2.984848484848485, "grad_norm": 0.48237332701683044, "learning_rate": 7.0912971036584245e-06, "loss": 0.3545, "step": 3546 }, { "epoch": 2.9856902356902357, "grad_norm": 0.47879064083099365, "learning_rate": 7.089390455427577e-06, "loss": 0.3617, "step": 3547 }, { "epoch": 2.9865319865319866, "grad_norm": 0.4213002622127533, "learning_rate": 7.087483439023857e-06, "loss": 0.3721, "step": 3548 }, { "epoch": 2.9873737373737375, "grad_norm": 0.5266488194465637, "learning_rate": 7.0855760547833e-06, "loss": 0.3596, "step": 3549 }, { "epoch": 2.9882154882154883, "grad_norm": 0.45717373490333557, "learning_rate": 7.083668303042005e-06, "loss": 0.3754, "step": 3550 }, { "epoch": 2.989057239057239, "grad_norm": 0.4722406268119812, "learning_rate": 7.081760184136142e-06, "loss": 0.3835, "step": 3551 }, { "epoch": 2.98989898989899, "grad_norm": 0.4832378029823303, "learning_rate": 7.079851698401938e-06, "loss": 0.3534, "step": 3552 }, { "epoch": 2.9907407407407405, "grad_norm": 0.4390367865562439, "learning_rate": 7.077942846175692e-06, "loss": 0.3706, "step": 3553 }, { "epoch": 2.9915824915824913, "grad_norm": 0.4160062074661255, "learning_rate": 7.076033627793764e-06, "loss": 0.3583, "step": 3554 }, { "epoch": 2.992424242424242, "grad_norm": 0.5187891125679016, "learning_rate": 7.074124043592575e-06, "loss": 0.3892, "step": 3555 }, { "epoch": 2.993265993265993, "grad_norm": 0.42714905738830566, "learning_rate": 7.072214093908617e-06, "loss": 0.3488, "step": 3556 }, { "epoch": 2.994107744107744, "grad_norm": 0.4267241656780243, "learning_rate": 7.070303779078444e-06, "loss": 0.3582, "step": 3557 }, { "epoch": 2.994949494949495, "grad_norm": 0.43499755859375, "learning_rate": 7.06839309943867e-06, "loss": 0.3598, "step": 3558 }, { "epoch": 2.9957912457912457, "grad_norm": 0.47556859254837036, "learning_rate": 7.066482055325983e-06, "loss": 0.3832, "step": 3559 }, { "epoch": 2.9966329966329965, "grad_norm": 0.47915560007095337, "learning_rate": 7.064570647077125e-06, "loss": 0.3613, "step": 3560 }, { "epoch": 2.9974747474747474, "grad_norm": 0.4304136037826538, "learning_rate": 7.062658875028907e-06, "loss": 0.3731, "step": 3561 }, { "epoch": 2.9983164983164983, "grad_norm": 0.4371441900730133, "learning_rate": 7.060746739518205e-06, "loss": 0.3654, "step": 3562 }, { "epoch": 2.999158249158249, "grad_norm": 0.511182963848114, "learning_rate": 7.058834240881958e-06, "loss": 0.3761, "step": 3563 }, { "epoch": 3.0, "grad_norm": 0.43785181641578674, "learning_rate": 7.056921379457166e-06, "loss": 0.3328, "step": 3564 }, { "epoch": 3.000841750841751, "grad_norm": 0.45101606845855713, "learning_rate": 7.055008155580899e-06, "loss": 0.3204, "step": 3565 }, { "epoch": 3.0016835016835017, "grad_norm": 0.43202993273735046, "learning_rate": 7.053094569590283e-06, "loss": 0.3358, "step": 3566 }, { "epoch": 3.0025252525252526, "grad_norm": 0.4196683168411255, "learning_rate": 7.0511806218225175e-06, "loss": 0.3265, "step": 3567 }, { "epoch": 3.0033670033670035, "grad_norm": 0.5621776580810547, "learning_rate": 7.0492663126148555e-06, "loss": 0.348, "step": 3568 }, { "epoch": 3.0042087542087543, "grad_norm": 0.4349118173122406, "learning_rate": 7.047351642304624e-06, "loss": 0.3419, "step": 3569 }, { "epoch": 3.005050505050505, "grad_norm": 0.4560631215572357, "learning_rate": 7.045436611229204e-06, "loss": 0.3101, "step": 3570 }, { "epoch": 3.005892255892256, "grad_norm": 0.45476827025413513, "learning_rate": 7.043521219726046e-06, "loss": 0.333, "step": 3571 }, { "epoch": 3.006734006734007, "grad_norm": 0.47594186663627625, "learning_rate": 7.041605468132662e-06, "loss": 0.3309, "step": 3572 }, { "epoch": 3.007575757575758, "grad_norm": 0.5108615756034851, "learning_rate": 7.039689356786629e-06, "loss": 0.3149, "step": 3573 }, { "epoch": 3.008417508417508, "grad_norm": 0.42906829714775085, "learning_rate": 7.0377728860255845e-06, "loss": 0.302, "step": 3574 }, { "epoch": 3.009259259259259, "grad_norm": 0.497420072555542, "learning_rate": 7.035856056187231e-06, "loss": 0.3296, "step": 3575 }, { "epoch": 3.01010101010101, "grad_norm": 0.47339126467704773, "learning_rate": 7.033938867609335e-06, "loss": 0.3037, "step": 3576 }, { "epoch": 3.010942760942761, "grad_norm": 0.44980746507644653, "learning_rate": 7.032021320629728e-06, "loss": 0.3139, "step": 3577 }, { "epoch": 3.0117845117845117, "grad_norm": 0.480300635099411, "learning_rate": 7.0301034155863e-06, "loss": 0.3092, "step": 3578 }, { "epoch": 3.0126262626262625, "grad_norm": 0.43153470754623413, "learning_rate": 7.028185152817004e-06, "loss": 0.3555, "step": 3579 }, { "epoch": 3.0134680134680134, "grad_norm": 0.4858878254890442, "learning_rate": 7.026266532659864e-06, "loss": 0.3253, "step": 3580 }, { "epoch": 3.0143097643097643, "grad_norm": 0.4659850001335144, "learning_rate": 7.0243475554529564e-06, "loss": 0.327, "step": 3581 }, { "epoch": 3.015151515151515, "grad_norm": 0.4263989329338074, "learning_rate": 7.022428221534427e-06, "loss": 0.3276, "step": 3582 }, { "epoch": 3.015993265993266, "grad_norm": 0.45722541213035583, "learning_rate": 7.020508531242482e-06, "loss": 0.3139, "step": 3583 }, { "epoch": 3.016835016835017, "grad_norm": 0.41472911834716797, "learning_rate": 7.018588484915394e-06, "loss": 0.3298, "step": 3584 }, { "epoch": 3.0176767676767677, "grad_norm": 0.4271794557571411, "learning_rate": 7.016668082891494e-06, "loss": 0.3375, "step": 3585 }, { "epoch": 3.0185185185185186, "grad_norm": 0.4208003282546997, "learning_rate": 7.014747325509177e-06, "loss": 0.3259, "step": 3586 }, { "epoch": 3.0193602693602695, "grad_norm": 0.40725481510162354, "learning_rate": 7.012826213106899e-06, "loss": 0.3001, "step": 3587 }, { "epoch": 3.0202020202020203, "grad_norm": 0.4291864037513733, "learning_rate": 7.010904746023185e-06, "loss": 0.3489, "step": 3588 }, { "epoch": 3.021043771043771, "grad_norm": 0.405221164226532, "learning_rate": 7.008982924596615e-06, "loss": 0.3275, "step": 3589 }, { "epoch": 3.021885521885522, "grad_norm": 0.5223220586776733, "learning_rate": 7.007060749165835e-06, "loss": 0.3186, "step": 3590 }, { "epoch": 3.022727272727273, "grad_norm": 0.44938308000564575, "learning_rate": 7.005138220069553e-06, "loss": 0.3188, "step": 3591 }, { "epoch": 3.0235690235690234, "grad_norm": 0.42507994174957275, "learning_rate": 7.003215337646539e-06, "loss": 0.3024, "step": 3592 }, { "epoch": 3.024410774410774, "grad_norm": 0.4302850663661957, "learning_rate": 7.0012921022356265e-06, "loss": 0.3435, "step": 3593 }, { "epoch": 3.025252525252525, "grad_norm": 0.4622645676136017, "learning_rate": 6.999368514175707e-06, "loss": 0.335, "step": 3594 }, { "epoch": 3.026094276094276, "grad_norm": 0.46592268347740173, "learning_rate": 6.997444573805739e-06, "loss": 0.3434, "step": 3595 }, { "epoch": 3.026936026936027, "grad_norm": 0.3758339583873749, "learning_rate": 6.995520281464743e-06, "loss": 0.3199, "step": 3596 }, { "epoch": 3.0277777777777777, "grad_norm": 0.4400799870491028, "learning_rate": 6.993595637491799e-06, "loss": 0.3032, "step": 3597 }, { "epoch": 3.0286195286195285, "grad_norm": 0.4321746826171875, "learning_rate": 6.991670642226047e-06, "loss": 0.3324, "step": 3598 }, { "epoch": 3.0294612794612794, "grad_norm": 0.4777515232563019, "learning_rate": 6.989745296006696e-06, "loss": 0.3258, "step": 3599 }, { "epoch": 3.0303030303030303, "grad_norm": 0.412071168422699, "learning_rate": 6.987819599173007e-06, "loss": 0.3108, "step": 3600 }, { "epoch": 3.031144781144781, "grad_norm": 0.43244192004203796, "learning_rate": 6.985893552064314e-06, "loss": 0.322, "step": 3601 }, { "epoch": 3.031986531986532, "grad_norm": 0.4499422013759613, "learning_rate": 6.983967155020005e-06, "loss": 0.323, "step": 3602 }, { "epoch": 3.032828282828283, "grad_norm": 0.4335784316062927, "learning_rate": 6.982040408379531e-06, "loss": 0.3131, "step": 3603 }, { "epoch": 3.0336700336700337, "grad_norm": 0.3974713683128357, "learning_rate": 6.980113312482405e-06, "loss": 0.3134, "step": 3604 }, { "epoch": 3.0345117845117846, "grad_norm": 0.4820351004600525, "learning_rate": 6.978185867668203e-06, "loss": 0.3315, "step": 3605 }, { "epoch": 3.0353535353535355, "grad_norm": 0.4674270451068878, "learning_rate": 6.97625807427656e-06, "loss": 0.3151, "step": 3606 }, { "epoch": 3.0361952861952863, "grad_norm": 0.39501121640205383, "learning_rate": 6.974329932647177e-06, "loss": 0.2984, "step": 3607 }, { "epoch": 3.037037037037037, "grad_norm": 0.4796160161495209, "learning_rate": 6.972401443119808e-06, "loss": 0.3329, "step": 3608 }, { "epoch": 3.037878787878788, "grad_norm": 0.4805756211280823, "learning_rate": 6.970472606034277e-06, "loss": 0.3452, "step": 3609 }, { "epoch": 3.038720538720539, "grad_norm": 0.41504549980163574, "learning_rate": 6.968543421730466e-06, "loss": 0.3244, "step": 3610 }, { "epoch": 3.0395622895622894, "grad_norm": 0.45388370752334595, "learning_rate": 6.966613890548315e-06, "loss": 0.3044, "step": 3611 }, { "epoch": 3.04040404040404, "grad_norm": 0.4376748502254486, "learning_rate": 6.964684012827831e-06, "loss": 0.3278, "step": 3612 }, { "epoch": 3.041245791245791, "grad_norm": 0.4358134865760803, "learning_rate": 6.9627537889090766e-06, "loss": 0.3355, "step": 3613 }, { "epoch": 3.042087542087542, "grad_norm": 0.42797112464904785, "learning_rate": 6.960823219132179e-06, "loss": 0.3139, "step": 3614 }, { "epoch": 3.042929292929293, "grad_norm": 0.4233553409576416, "learning_rate": 6.958892303837326e-06, "loss": 0.323, "step": 3615 }, { "epoch": 3.0437710437710437, "grad_norm": 0.4261596202850342, "learning_rate": 6.956961043364763e-06, "loss": 0.3531, "step": 3616 }, { "epoch": 3.0446127946127945, "grad_norm": 0.41554781794548035, "learning_rate": 6.9550294380548e-06, "loss": 0.337, "step": 3617 }, { "epoch": 3.0454545454545454, "grad_norm": 0.4409468472003937, "learning_rate": 6.9530974882478086e-06, "loss": 0.3064, "step": 3618 }, { "epoch": 3.0462962962962963, "grad_norm": 0.44640395045280457, "learning_rate": 6.951165194284214e-06, "loss": 0.3142, "step": 3619 }, { "epoch": 3.047138047138047, "grad_norm": 0.4055858254432678, "learning_rate": 6.9492325565045126e-06, "loss": 0.3117, "step": 3620 }, { "epoch": 3.047979797979798, "grad_norm": 0.41258320212364197, "learning_rate": 6.947299575249252e-06, "loss": 0.3248, "step": 3621 }, { "epoch": 3.048821548821549, "grad_norm": 0.41116228699684143, "learning_rate": 6.945366250859046e-06, "loss": 0.309, "step": 3622 }, { "epoch": 3.0496632996632997, "grad_norm": 0.4250611960887909, "learning_rate": 6.943432583674566e-06, "loss": 0.3209, "step": 3623 }, { "epoch": 3.0505050505050506, "grad_norm": 0.40102651715278625, "learning_rate": 6.941498574036544e-06, "loss": 0.3512, "step": 3624 }, { "epoch": 3.0513468013468015, "grad_norm": 0.3901109993457794, "learning_rate": 6.939564222285775e-06, "loss": 0.344, "step": 3625 }, { "epoch": 3.0521885521885523, "grad_norm": 0.390360563993454, "learning_rate": 6.937629528763112e-06, "loss": 0.3294, "step": 3626 }, { "epoch": 3.053030303030303, "grad_norm": 0.43782302737236023, "learning_rate": 6.935694493809468e-06, "loss": 0.3303, "step": 3627 }, { "epoch": 3.053872053872054, "grad_norm": 0.41069501638412476, "learning_rate": 6.933759117765816e-06, "loss": 0.3243, "step": 3628 }, { "epoch": 3.0547138047138045, "grad_norm": 0.3856247663497925, "learning_rate": 6.931823400973191e-06, "loss": 0.3363, "step": 3629 }, { "epoch": 3.0555555555555554, "grad_norm": 0.40515804290771484, "learning_rate": 6.929887343772686e-06, "loss": 0.3299, "step": 3630 }, { "epoch": 3.0563973063973062, "grad_norm": 0.4057391583919525, "learning_rate": 6.927950946505459e-06, "loss": 0.3319, "step": 3631 }, { "epoch": 3.057239057239057, "grad_norm": 0.3916190564632416, "learning_rate": 6.926014209512717e-06, "loss": 0.3428, "step": 3632 }, { "epoch": 3.058080808080808, "grad_norm": 0.43428337574005127, "learning_rate": 6.924077133135739e-06, "loss": 0.3335, "step": 3633 }, { "epoch": 3.058922558922559, "grad_norm": 0.41141435503959656, "learning_rate": 6.9221397177158565e-06, "loss": 0.3292, "step": 3634 }, { "epoch": 3.0597643097643097, "grad_norm": 0.4002685248851776, "learning_rate": 6.920201963594462e-06, "loss": 0.3042, "step": 3635 }, { "epoch": 3.0606060606060606, "grad_norm": 0.46771469712257385, "learning_rate": 6.9182638711130105e-06, "loss": 0.3028, "step": 3636 }, { "epoch": 3.0614478114478114, "grad_norm": 0.4088781774044037, "learning_rate": 6.916325440613014e-06, "loss": 0.3429, "step": 3637 }, { "epoch": 3.0622895622895623, "grad_norm": 0.39007139205932617, "learning_rate": 6.914386672436042e-06, "loss": 0.3043, "step": 3638 }, { "epoch": 3.063131313131313, "grad_norm": 0.4761551022529602, "learning_rate": 6.91244756692373e-06, "loss": 0.3267, "step": 3639 }, { "epoch": 3.063973063973064, "grad_norm": 0.42025867104530334, "learning_rate": 6.910508124417766e-06, "loss": 0.2918, "step": 3640 }, { "epoch": 3.064814814814815, "grad_norm": 0.40107372403144836, "learning_rate": 6.908568345259903e-06, "loss": 0.3547, "step": 3641 }, { "epoch": 3.0656565656565657, "grad_norm": 0.37067633867263794, "learning_rate": 6.9066282297919494e-06, "loss": 0.3359, "step": 3642 }, { "epoch": 3.0664983164983166, "grad_norm": 0.3967112600803375, "learning_rate": 6.904687778355774e-06, "loss": 0.3187, "step": 3643 }, { "epoch": 3.0673400673400675, "grad_norm": 0.36868816614151, "learning_rate": 6.902746991293305e-06, "loss": 0.3261, "step": 3644 }, { "epoch": 3.0681818181818183, "grad_norm": 0.3932925760746002, "learning_rate": 6.900805868946532e-06, "loss": 0.3083, "step": 3645 }, { "epoch": 3.069023569023569, "grad_norm": 0.40293484926223755, "learning_rate": 6.8988644116574965e-06, "loss": 0.3249, "step": 3646 }, { "epoch": 3.06986531986532, "grad_norm": 0.3842941224575043, "learning_rate": 6.896922619768309e-06, "loss": 0.3099, "step": 3647 }, { "epoch": 3.0707070707070705, "grad_norm": 0.3899823725223541, "learning_rate": 6.894980493621133e-06, "loss": 0.3191, "step": 3648 }, { "epoch": 3.0715488215488214, "grad_norm": 0.3901764452457428, "learning_rate": 6.893038033558188e-06, "loss": 0.3132, "step": 3649 }, { "epoch": 3.0723905723905722, "grad_norm": 0.3961602449417114, "learning_rate": 6.891095239921761e-06, "loss": 0.3227, "step": 3650 }, { "epoch": 3.073232323232323, "grad_norm": 0.37254154682159424, "learning_rate": 6.889152113054191e-06, "loss": 0.3173, "step": 3651 }, { "epoch": 3.074074074074074, "grad_norm": 0.41167116165161133, "learning_rate": 6.887208653297878e-06, "loss": 0.3303, "step": 3652 }, { "epoch": 3.074915824915825, "grad_norm": 0.38939952850341797, "learning_rate": 6.885264860995279e-06, "loss": 0.3171, "step": 3653 }, { "epoch": 3.0757575757575757, "grad_norm": 0.4114934206008911, "learning_rate": 6.883320736488911e-06, "loss": 0.3119, "step": 3654 }, { "epoch": 3.0765993265993266, "grad_norm": 0.36292150616645813, "learning_rate": 6.881376280121351e-06, "loss": 0.3197, "step": 3655 }, { "epoch": 3.0774410774410774, "grad_norm": 0.4002666473388672, "learning_rate": 6.879431492235234e-06, "loss": 0.3026, "step": 3656 }, { "epoch": 3.0782828282828283, "grad_norm": 0.3615115284919739, "learning_rate": 6.877486373173248e-06, "loss": 0.3162, "step": 3657 }, { "epoch": 3.079124579124579, "grad_norm": 0.4488748013973236, "learning_rate": 6.875540923278149e-06, "loss": 0.2888, "step": 3658 }, { "epoch": 3.07996632996633, "grad_norm": 0.42041945457458496, "learning_rate": 6.873595142892741e-06, "loss": 0.2923, "step": 3659 }, { "epoch": 3.080808080808081, "grad_norm": 0.39447924494743347, "learning_rate": 6.871649032359895e-06, "loss": 0.3191, "step": 3660 }, { "epoch": 3.0816498316498318, "grad_norm": 0.4190329611301422, "learning_rate": 6.869702592022535e-06, "loss": 0.34, "step": 3661 }, { "epoch": 3.0824915824915826, "grad_norm": 0.42345932126045227, "learning_rate": 6.867755822223644e-06, "loss": 0.3099, "step": 3662 }, { "epoch": 3.0833333333333335, "grad_norm": 0.40238943696022034, "learning_rate": 6.865808723306264e-06, "loss": 0.3236, "step": 3663 }, { "epoch": 3.0841750841750843, "grad_norm": 0.41384685039520264, "learning_rate": 6.863861295613495e-06, "loss": 0.3262, "step": 3664 }, { "epoch": 3.085016835016835, "grad_norm": 0.35952961444854736, "learning_rate": 6.861913539488495e-06, "loss": 0.3211, "step": 3665 }, { "epoch": 3.0858585858585856, "grad_norm": 0.36890891194343567, "learning_rate": 6.85996545527448e-06, "loss": 0.3067, "step": 3666 }, { "epoch": 3.0867003367003365, "grad_norm": 0.44436293840408325, "learning_rate": 6.85801704331472e-06, "loss": 0.3439, "step": 3667 }, { "epoch": 3.0875420875420874, "grad_norm": 0.4018145203590393, "learning_rate": 6.856068303952548e-06, "loss": 0.3417, "step": 3668 }, { "epoch": 3.0883838383838382, "grad_norm": 0.43370720744132996, "learning_rate": 6.854119237531352e-06, "loss": 0.3093, "step": 3669 }, { "epoch": 3.089225589225589, "grad_norm": 0.3793923258781433, "learning_rate": 6.85216984439458e-06, "loss": 0.3207, "step": 3670 }, { "epoch": 3.09006734006734, "grad_norm": 0.4368059039115906, "learning_rate": 6.850220124885734e-06, "loss": 0.323, "step": 3671 }, { "epoch": 3.090909090909091, "grad_norm": 0.4063248038291931, "learning_rate": 6.848270079348375e-06, "loss": 0.326, "step": 3672 }, { "epoch": 3.0917508417508417, "grad_norm": 0.40797334909439087, "learning_rate": 6.846319708126125e-06, "loss": 0.341, "step": 3673 }, { "epoch": 3.0925925925925926, "grad_norm": 0.4372561573982239, "learning_rate": 6.844369011562657e-06, "loss": 0.3338, "step": 3674 }, { "epoch": 3.0934343434343434, "grad_norm": 0.42385971546173096, "learning_rate": 6.842417990001705e-06, "loss": 0.3076, "step": 3675 }, { "epoch": 3.0942760942760943, "grad_norm": 0.4113486707210541, "learning_rate": 6.840466643787061e-06, "loss": 0.2937, "step": 3676 }, { "epoch": 3.095117845117845, "grad_norm": 0.40177854895591736, "learning_rate": 6.838514973262573e-06, "loss": 0.3239, "step": 3677 }, { "epoch": 3.095959595959596, "grad_norm": 0.422422856092453, "learning_rate": 6.8365629787721434e-06, "loss": 0.323, "step": 3678 }, { "epoch": 3.096801346801347, "grad_norm": 0.4133533239364624, "learning_rate": 6.8346106606597394e-06, "loss": 0.3397, "step": 3679 }, { "epoch": 3.0976430976430978, "grad_norm": 0.4034905731678009, "learning_rate": 6.832658019269374e-06, "loss": 0.3296, "step": 3680 }, { "epoch": 3.0984848484848486, "grad_norm": 0.3836168646812439, "learning_rate": 6.83070505494513e-06, "loss": 0.3266, "step": 3681 }, { "epoch": 3.0993265993265995, "grad_norm": 0.4269743859767914, "learning_rate": 6.828751768031136e-06, "loss": 0.3372, "step": 3682 }, { "epoch": 3.1001683501683504, "grad_norm": 0.4134860932826996, "learning_rate": 6.826798158871584e-06, "loss": 0.3143, "step": 3683 }, { "epoch": 3.101010101010101, "grad_norm": 0.4031934440135956, "learning_rate": 6.82484422781072e-06, "loss": 0.3454, "step": 3684 }, { "epoch": 3.1018518518518516, "grad_norm": 0.3951745927333832, "learning_rate": 6.822889975192848e-06, "loss": 0.3123, "step": 3685 }, { "epoch": 3.1026936026936025, "grad_norm": 0.4008622467517853, "learning_rate": 6.820935401362328e-06, "loss": 0.2985, "step": 3686 }, { "epoch": 3.1035353535353534, "grad_norm": 0.4197220504283905, "learning_rate": 6.8189805066635754e-06, "loss": 0.3097, "step": 3687 }, { "epoch": 3.1043771043771042, "grad_norm": 0.40623003244400024, "learning_rate": 6.817025291441064e-06, "loss": 0.335, "step": 3688 }, { "epoch": 3.105218855218855, "grad_norm": 0.37590348720550537, "learning_rate": 6.815069756039325e-06, "loss": 0.3315, "step": 3689 }, { "epoch": 3.106060606060606, "grad_norm": 0.40525949001312256, "learning_rate": 6.813113900802945e-06, "loss": 0.3341, "step": 3690 }, { "epoch": 3.106902356902357, "grad_norm": 0.3946521580219269, "learning_rate": 6.811157726076563e-06, "loss": 0.3322, "step": 3691 }, { "epoch": 3.1077441077441077, "grad_norm": 0.4071916341781616, "learning_rate": 6.809201232204881e-06, "loss": 0.3351, "step": 3692 }, { "epoch": 3.1085858585858586, "grad_norm": 0.42905306816101074, "learning_rate": 6.807244419532653e-06, "loss": 0.32, "step": 3693 }, { "epoch": 3.1094276094276094, "grad_norm": 0.41992273926734924, "learning_rate": 6.8052872884046915e-06, "loss": 0.3327, "step": 3694 }, { "epoch": 3.1102693602693603, "grad_norm": 0.4015096426010132, "learning_rate": 6.803329839165862e-06, "loss": 0.3103, "step": 3695 }, { "epoch": 3.111111111111111, "grad_norm": 0.4146229326725006, "learning_rate": 6.80137207216109e-06, "loss": 0.3131, "step": 3696 }, { "epoch": 3.111952861952862, "grad_norm": 0.38770943880081177, "learning_rate": 6.799413987735353e-06, "loss": 0.3169, "step": 3697 }, { "epoch": 3.112794612794613, "grad_norm": 0.4251973628997803, "learning_rate": 6.797455586233686e-06, "loss": 0.329, "step": 3698 }, { "epoch": 3.1136363636363638, "grad_norm": 0.4139443337917328, "learning_rate": 6.795496868001182e-06, "loss": 0.2913, "step": 3699 }, { "epoch": 3.1144781144781146, "grad_norm": 0.4228924512863159, "learning_rate": 6.793537833382988e-06, "loss": 0.3443, "step": 3700 }, { "epoch": 3.1153198653198655, "grad_norm": 0.40441206097602844, "learning_rate": 6.7915784827243055e-06, "loss": 0.334, "step": 3701 }, { "epoch": 3.1161616161616164, "grad_norm": 0.443661093711853, "learning_rate": 6.789618816370394e-06, "loss": 0.315, "step": 3702 }, { "epoch": 3.1170033670033668, "grad_norm": 0.39957815408706665, "learning_rate": 6.7876588346665664e-06, "loss": 0.3258, "step": 3703 }, { "epoch": 3.1178451178451176, "grad_norm": 0.43948420882225037, "learning_rate": 6.785698537958193e-06, "loss": 0.3123, "step": 3704 }, { "epoch": 3.1186868686868685, "grad_norm": 0.42612266540527344, "learning_rate": 6.783737926590701e-06, "loss": 0.3181, "step": 3705 }, { "epoch": 3.1195286195286194, "grad_norm": 0.4117053747177124, "learning_rate": 6.781777000909567e-06, "loss": 0.3391, "step": 3706 }, { "epoch": 3.1203703703703702, "grad_norm": 0.4208832085132599, "learning_rate": 6.779815761260331e-06, "loss": 0.3349, "step": 3707 }, { "epoch": 3.121212121212121, "grad_norm": 0.4047853648662567, "learning_rate": 6.777854207988581e-06, "loss": 0.3053, "step": 3708 }, { "epoch": 3.122053872053872, "grad_norm": 0.3927420377731323, "learning_rate": 6.7758923414399666e-06, "loss": 0.3143, "step": 3709 }, { "epoch": 3.122895622895623, "grad_norm": 0.41030269861221313, "learning_rate": 6.773930161960186e-06, "loss": 0.3338, "step": 3710 }, { "epoch": 3.1237373737373737, "grad_norm": 0.4046097695827484, "learning_rate": 6.771967669895e-06, "loss": 0.2943, "step": 3711 }, { "epoch": 3.1245791245791246, "grad_norm": 0.4248257875442505, "learning_rate": 6.770004865590217e-06, "loss": 0.3046, "step": 3712 }, { "epoch": 3.1254208754208754, "grad_norm": 0.40670663118362427, "learning_rate": 6.768041749391709e-06, "loss": 0.3231, "step": 3713 }, { "epoch": 3.1262626262626263, "grad_norm": 0.40430375933647156, "learning_rate": 6.766078321645392e-06, "loss": 0.3543, "step": 3714 }, { "epoch": 3.127104377104377, "grad_norm": 0.38928771018981934, "learning_rate": 6.764114582697247e-06, "loss": 0.3222, "step": 3715 }, { "epoch": 3.127946127946128, "grad_norm": 0.4225044548511505, "learning_rate": 6.762150532893304e-06, "loss": 0.3396, "step": 3716 }, { "epoch": 3.128787878787879, "grad_norm": 0.3843388259410858, "learning_rate": 6.760186172579651e-06, "loss": 0.3313, "step": 3717 }, { "epoch": 3.1296296296296298, "grad_norm": 0.4140692353248596, "learning_rate": 6.758221502102427e-06, "loss": 0.3277, "step": 3718 }, { "epoch": 3.1304713804713806, "grad_norm": 0.38219770789146423, "learning_rate": 6.7562565218078316e-06, "loss": 0.3422, "step": 3719 }, { "epoch": 3.1313131313131315, "grad_norm": 0.4198126792907715, "learning_rate": 6.7542912320421105e-06, "loss": 0.3188, "step": 3720 }, { "epoch": 3.1321548821548824, "grad_norm": 0.41778287291526794, "learning_rate": 6.752325633151572e-06, "loss": 0.3409, "step": 3721 }, { "epoch": 3.1329966329966332, "grad_norm": 0.4063754081726074, "learning_rate": 6.750359725482573e-06, "loss": 0.3256, "step": 3722 }, { "epoch": 3.1338383838383836, "grad_norm": 0.43272262811660767, "learning_rate": 6.748393509381531e-06, "loss": 0.3344, "step": 3723 }, { "epoch": 3.1346801346801345, "grad_norm": 0.41205355525016785, "learning_rate": 6.746426985194913e-06, "loss": 0.312, "step": 3724 }, { "epoch": 3.1355218855218854, "grad_norm": 0.4071003794670105, "learning_rate": 6.744460153269239e-06, "loss": 0.3175, "step": 3725 }, { "epoch": 3.1363636363636362, "grad_norm": 0.3993509113788605, "learning_rate": 6.742493013951088e-06, "loss": 0.3309, "step": 3726 }, { "epoch": 3.137205387205387, "grad_norm": 0.4333304762840271, "learning_rate": 6.740525567587091e-06, "loss": 0.3135, "step": 3727 }, { "epoch": 3.138047138047138, "grad_norm": 0.4063817262649536, "learning_rate": 6.738557814523932e-06, "loss": 0.3136, "step": 3728 }, { "epoch": 3.138888888888889, "grad_norm": 0.40350064635276794, "learning_rate": 6.73658975510835e-06, "loss": 0.3111, "step": 3729 }, { "epoch": 3.1397306397306397, "grad_norm": 0.4146251082420349, "learning_rate": 6.734621389687141e-06, "loss": 0.3171, "step": 3730 }, { "epoch": 3.1405723905723906, "grad_norm": 0.4022911787033081, "learning_rate": 6.732652718607147e-06, "loss": 0.3181, "step": 3731 }, { "epoch": 3.1414141414141414, "grad_norm": 0.3884579539299011, "learning_rate": 6.730683742215274e-06, "loss": 0.3264, "step": 3732 }, { "epoch": 3.1422558922558923, "grad_norm": 0.39588025212287903, "learning_rate": 6.728714460858472e-06, "loss": 0.33, "step": 3733 }, { "epoch": 3.143097643097643, "grad_norm": 0.3879547417163849, "learning_rate": 6.726744874883751e-06, "loss": 0.3268, "step": 3734 }, { "epoch": 3.143939393939394, "grad_norm": 0.4070504605770111, "learning_rate": 6.724774984638175e-06, "loss": 0.3302, "step": 3735 }, { "epoch": 3.144781144781145, "grad_norm": 0.39435359835624695, "learning_rate": 6.722804790468857e-06, "loss": 0.3146, "step": 3736 }, { "epoch": 3.1456228956228958, "grad_norm": 0.413806289434433, "learning_rate": 6.7208342927229684e-06, "loss": 0.3252, "step": 3737 }, { "epoch": 3.1464646464646466, "grad_norm": 0.4175029397010803, "learning_rate": 6.718863491747731e-06, "loss": 0.315, "step": 3738 }, { "epoch": 3.1473063973063975, "grad_norm": 0.4063631594181061, "learning_rate": 6.71689238789042e-06, "loss": 0.3574, "step": 3739 }, { "epoch": 3.148148148148148, "grad_norm": 0.4101138412952423, "learning_rate": 6.714920981498366e-06, "loss": 0.3467, "step": 3740 }, { "epoch": 3.148989898989899, "grad_norm": 0.43822622299194336, "learning_rate": 6.712949272918952e-06, "loss": 0.3197, "step": 3741 }, { "epoch": 3.1498316498316496, "grad_norm": 0.40743133425712585, "learning_rate": 6.710977262499613e-06, "loss": 0.316, "step": 3742 }, { "epoch": 3.1506734006734005, "grad_norm": 0.4299086630344391, "learning_rate": 6.70900495058784e-06, "loss": 0.3303, "step": 3743 }, { "epoch": 3.1515151515151514, "grad_norm": 0.43656250834465027, "learning_rate": 6.707032337531172e-06, "loss": 0.3367, "step": 3744 }, { "epoch": 3.1523569023569022, "grad_norm": 0.4115843176841736, "learning_rate": 6.705059423677209e-06, "loss": 0.3214, "step": 3745 }, { "epoch": 3.153198653198653, "grad_norm": 0.39549925923347473, "learning_rate": 6.703086209373596e-06, "loss": 0.3112, "step": 3746 }, { "epoch": 3.154040404040404, "grad_norm": 0.4358007609844208, "learning_rate": 6.7011126949680365e-06, "loss": 0.3139, "step": 3747 }, { "epoch": 3.154882154882155, "grad_norm": 0.418535053730011, "learning_rate": 6.699138880808282e-06, "loss": 0.3069, "step": 3748 }, { "epoch": 3.1557239057239057, "grad_norm": 0.4255852699279785, "learning_rate": 6.6971647672421445e-06, "loss": 0.3399, "step": 3749 }, { "epoch": 3.1565656565656566, "grad_norm": 0.43768027424812317, "learning_rate": 6.695190354617478e-06, "loss": 0.3319, "step": 3750 }, { "epoch": 3.1574074074074074, "grad_norm": 0.37536102533340454, "learning_rate": 6.693215643282201e-06, "loss": 0.3306, "step": 3751 }, { "epoch": 3.1582491582491583, "grad_norm": 0.44794490933418274, "learning_rate": 6.6912406335842705e-06, "loss": 0.3271, "step": 3752 }, { "epoch": 3.159090909090909, "grad_norm": 0.38367995619773865, "learning_rate": 6.689265325871715e-06, "loss": 0.3141, "step": 3753 }, { "epoch": 3.15993265993266, "grad_norm": 0.40388551354408264, "learning_rate": 6.687289720492596e-06, "loss": 0.298, "step": 3754 }, { "epoch": 3.160774410774411, "grad_norm": 0.3667890429496765, "learning_rate": 6.6853138177950385e-06, "loss": 0.3313, "step": 3755 }, { "epoch": 3.1616161616161618, "grad_norm": 0.384136438369751, "learning_rate": 6.68333761812722e-06, "loss": 0.3251, "step": 3756 }, { "epoch": 3.1624579124579126, "grad_norm": 0.4426936209201813, "learning_rate": 6.6813611218373674e-06, "loss": 0.3254, "step": 3757 }, { "epoch": 3.1632996632996635, "grad_norm": 0.41764265298843384, "learning_rate": 6.679384329273758e-06, "loss": 0.3308, "step": 3758 }, { "epoch": 3.1641414141414144, "grad_norm": 0.3990895450115204, "learning_rate": 6.677407240784726e-06, "loss": 0.3385, "step": 3759 }, { "epoch": 3.164983164983165, "grad_norm": 0.4222327470779419, "learning_rate": 6.675429856718653e-06, "loss": 0.3079, "step": 3760 }, { "epoch": 3.1658249158249157, "grad_norm": 0.4083492159843445, "learning_rate": 6.673452177423978e-06, "loss": 0.3231, "step": 3761 }, { "epoch": 3.1666666666666665, "grad_norm": 0.4051346182823181, "learning_rate": 6.671474203249189e-06, "loss": 0.3131, "step": 3762 }, { "epoch": 3.1675084175084174, "grad_norm": 0.3793027102947235, "learning_rate": 6.669495934542822e-06, "loss": 0.3042, "step": 3763 }, { "epoch": 3.1683501683501682, "grad_norm": 0.40679994225502014, "learning_rate": 6.667517371653475e-06, "loss": 0.3161, "step": 3764 }, { "epoch": 3.169191919191919, "grad_norm": 0.3949684500694275, "learning_rate": 6.665538514929788e-06, "loss": 0.3054, "step": 3765 }, { "epoch": 3.17003367003367, "grad_norm": 0.37623071670532227, "learning_rate": 6.663559364720458e-06, "loss": 0.3009, "step": 3766 }, { "epoch": 3.170875420875421, "grad_norm": 0.3924348056316376, "learning_rate": 6.66157992137423e-06, "loss": 0.3234, "step": 3767 }, { "epoch": 3.1717171717171717, "grad_norm": 0.4307842254638672, "learning_rate": 6.6596001852399076e-06, "loss": 0.3113, "step": 3768 }, { "epoch": 3.1725589225589226, "grad_norm": 0.3789771795272827, "learning_rate": 6.657620156666338e-06, "loss": 0.3179, "step": 3769 }, { "epoch": 3.1734006734006734, "grad_norm": 0.4118492305278778, "learning_rate": 6.655639836002425e-06, "loss": 0.312, "step": 3770 }, { "epoch": 3.1742424242424243, "grad_norm": 0.38582250475883484, "learning_rate": 6.65365922359712e-06, "loss": 0.3429, "step": 3771 }, { "epoch": 3.175084175084175, "grad_norm": 0.4130614697933197, "learning_rate": 6.6516783197994305e-06, "loss": 0.3188, "step": 3772 }, { "epoch": 3.175925925925926, "grad_norm": 0.3806181848049164, "learning_rate": 6.649697124958411e-06, "loss": 0.3273, "step": 3773 }, { "epoch": 3.176767676767677, "grad_norm": 0.3809477388858795, "learning_rate": 6.6477156394231704e-06, "loss": 0.3387, "step": 3774 }, { "epoch": 3.1776094276094278, "grad_norm": 0.38143402338027954, "learning_rate": 6.645733863542867e-06, "loss": 0.3078, "step": 3775 }, { "epoch": 3.1784511784511786, "grad_norm": 0.3774363398551941, "learning_rate": 6.643751797666711e-06, "loss": 0.3242, "step": 3776 }, { "epoch": 3.179292929292929, "grad_norm": 0.41996270418167114, "learning_rate": 6.641769442143964e-06, "loss": 0.3264, "step": 3777 }, { "epoch": 3.18013468013468, "grad_norm": 0.3579813539981842, "learning_rate": 6.639786797323937e-06, "loss": 0.3263, "step": 3778 }, { "epoch": 3.180976430976431, "grad_norm": 0.3946196138858795, "learning_rate": 6.637803863555994e-06, "loss": 0.3219, "step": 3779 }, { "epoch": 3.1818181818181817, "grad_norm": 0.44427600502967834, "learning_rate": 6.63582064118955e-06, "loss": 0.3239, "step": 3780 }, { "epoch": 3.1826599326599325, "grad_norm": 0.4259587526321411, "learning_rate": 6.633837130574068e-06, "loss": 0.3199, "step": 3781 }, { "epoch": 3.1835016835016834, "grad_norm": 0.42572563886642456, "learning_rate": 6.631853332059064e-06, "loss": 0.3399, "step": 3782 }, { "epoch": 3.1843434343434343, "grad_norm": 0.44035619497299194, "learning_rate": 6.629869245994106e-06, "loss": 0.3363, "step": 3783 }, { "epoch": 3.185185185185185, "grad_norm": 0.4270343780517578, "learning_rate": 6.627884872728808e-06, "loss": 0.3416, "step": 3784 }, { "epoch": 3.186026936026936, "grad_norm": 0.4277591407299042, "learning_rate": 6.625900212612843e-06, "loss": 0.3261, "step": 3785 }, { "epoch": 3.186868686868687, "grad_norm": 0.4189662039279938, "learning_rate": 6.6239152659959236e-06, "loss": 0.3186, "step": 3786 }, { "epoch": 3.1877104377104377, "grad_norm": 0.4157570004463196, "learning_rate": 6.621930033227822e-06, "loss": 0.3452, "step": 3787 }, { "epoch": 3.1885521885521886, "grad_norm": 0.41035953164100647, "learning_rate": 6.619944514658357e-06, "loss": 0.3416, "step": 3788 }, { "epoch": 3.1893939393939394, "grad_norm": 0.38632553815841675, "learning_rate": 6.617958710637398e-06, "loss": 0.3175, "step": 3789 }, { "epoch": 3.1902356902356903, "grad_norm": 0.4243316352367401, "learning_rate": 6.6159726215148626e-06, "loss": 0.3068, "step": 3790 }, { "epoch": 3.191077441077441, "grad_norm": 0.4241039454936981, "learning_rate": 6.613986247640725e-06, "loss": 0.3221, "step": 3791 }, { "epoch": 3.191919191919192, "grad_norm": 0.41415196657180786, "learning_rate": 6.611999589365001e-06, "loss": 0.3495, "step": 3792 }, { "epoch": 3.192760942760943, "grad_norm": 0.4453219771385193, "learning_rate": 6.610012647037764e-06, "loss": 0.3066, "step": 3793 }, { "epoch": 3.1936026936026938, "grad_norm": 0.47005775570869446, "learning_rate": 6.6080254210091334e-06, "loss": 0.3418, "step": 3794 }, { "epoch": 3.1944444444444446, "grad_norm": 0.42335352301597595, "learning_rate": 6.606037911629279e-06, "loss": 0.3189, "step": 3795 }, { "epoch": 3.1952861952861955, "grad_norm": 0.34478041529655457, "learning_rate": 6.6040501192484255e-06, "loss": 0.336, "step": 3796 }, { "epoch": 3.196127946127946, "grad_norm": 0.4110449254512787, "learning_rate": 6.602062044216836e-06, "loss": 0.3345, "step": 3797 }, { "epoch": 3.196969696969697, "grad_norm": 0.43212413787841797, "learning_rate": 6.6000736868848345e-06, "loss": 0.325, "step": 3798 }, { "epoch": 3.1978114478114477, "grad_norm": 0.39913079142570496, "learning_rate": 6.598085047602791e-06, "loss": 0.3167, "step": 3799 }, { "epoch": 3.1986531986531985, "grad_norm": 0.3904154598712921, "learning_rate": 6.596096126721124e-06, "loss": 0.3357, "step": 3800 }, { "epoch": 3.1994949494949494, "grad_norm": 0.4444361925125122, "learning_rate": 6.5941069245903034e-06, "loss": 0.3062, "step": 3801 }, { "epoch": 3.2003367003367003, "grad_norm": 0.4200494885444641, "learning_rate": 6.592117441560847e-06, "loss": 0.3482, "step": 3802 }, { "epoch": 3.201178451178451, "grad_norm": 0.3881874978542328, "learning_rate": 6.590127677983322e-06, "loss": 0.3494, "step": 3803 }, { "epoch": 3.202020202020202, "grad_norm": 0.4288330376148224, "learning_rate": 6.588137634208349e-06, "loss": 0.3202, "step": 3804 }, { "epoch": 3.202861952861953, "grad_norm": 0.3752167820930481, "learning_rate": 6.5861473105865926e-06, "loss": 0.3416, "step": 3805 }, { "epoch": 3.2037037037037037, "grad_norm": 0.4469863772392273, "learning_rate": 6.5841567074687685e-06, "loss": 0.3189, "step": 3806 }, { "epoch": 3.2045454545454546, "grad_norm": 0.4032677114009857, "learning_rate": 6.582165825205643e-06, "loss": 0.3054, "step": 3807 }, { "epoch": 3.2053872053872055, "grad_norm": 0.40204834938049316, "learning_rate": 6.580174664148031e-06, "loss": 0.332, "step": 3808 }, { "epoch": 3.2062289562289563, "grad_norm": 0.4269063472747803, "learning_rate": 6.578183224646797e-06, "loss": 0.3421, "step": 3809 }, { "epoch": 3.207070707070707, "grad_norm": 0.3998134434223175, "learning_rate": 6.576191507052854e-06, "loss": 0.3138, "step": 3810 }, { "epoch": 3.207912457912458, "grad_norm": 0.3832710087299347, "learning_rate": 6.574199511717162e-06, "loss": 0.3218, "step": 3811 }, { "epoch": 3.208754208754209, "grad_norm": 0.3784097731113434, "learning_rate": 6.5722072389907335e-06, "loss": 0.3179, "step": 3812 }, { "epoch": 3.20959595959596, "grad_norm": 0.3781067430973053, "learning_rate": 6.570214689224625e-06, "loss": 0.3238, "step": 3813 }, { "epoch": 3.2104377104377106, "grad_norm": 0.422549307346344, "learning_rate": 6.56822186276995e-06, "loss": 0.3213, "step": 3814 }, { "epoch": 3.211279461279461, "grad_norm": 0.38977304100990295, "learning_rate": 6.566228759977864e-06, "loss": 0.32, "step": 3815 }, { "epoch": 3.212121212121212, "grad_norm": 0.4009077250957489, "learning_rate": 6.5642353811995695e-06, "loss": 0.3149, "step": 3816 }, { "epoch": 3.212962962962963, "grad_norm": 0.4283040761947632, "learning_rate": 6.562241726786327e-06, "loss": 0.3117, "step": 3817 }, { "epoch": 3.2138047138047137, "grad_norm": 0.4072422683238983, "learning_rate": 6.560247797089434e-06, "loss": 0.3161, "step": 3818 }, { "epoch": 3.2146464646464645, "grad_norm": 0.4143606126308441, "learning_rate": 6.558253592460245e-06, "loss": 0.3179, "step": 3819 }, { "epoch": 3.2154882154882154, "grad_norm": 0.3980332911014557, "learning_rate": 6.556259113250161e-06, "loss": 0.3042, "step": 3820 }, { "epoch": 3.2163299663299663, "grad_norm": 0.3821530044078827, "learning_rate": 6.554264359810629e-06, "loss": 0.3361, "step": 3821 }, { "epoch": 3.217171717171717, "grad_norm": 0.44993144273757935, "learning_rate": 6.552269332493145e-06, "loss": 0.3072, "step": 3822 }, { "epoch": 3.218013468013468, "grad_norm": 0.4209173023700714, "learning_rate": 6.550274031649258e-06, "loss": 0.3344, "step": 3823 }, { "epoch": 3.218855218855219, "grad_norm": 0.3803650140762329, "learning_rate": 6.548278457630555e-06, "loss": 0.3032, "step": 3824 }, { "epoch": 3.2196969696969697, "grad_norm": 0.40061435103416443, "learning_rate": 6.546282610788683e-06, "loss": 0.3216, "step": 3825 }, { "epoch": 3.2205387205387206, "grad_norm": 0.4136238992214203, "learning_rate": 6.544286491475329e-06, "loss": 0.3211, "step": 3826 }, { "epoch": 3.2213804713804715, "grad_norm": 0.3808964788913727, "learning_rate": 6.54229010004223e-06, "loss": 0.3287, "step": 3827 }, { "epoch": 3.2222222222222223, "grad_norm": 0.4273664653301239, "learning_rate": 6.540293436841173e-06, "loss": 0.3162, "step": 3828 }, { "epoch": 3.223063973063973, "grad_norm": 0.41767722368240356, "learning_rate": 6.538296502223992e-06, "loss": 0.3321, "step": 3829 }, { "epoch": 3.223905723905724, "grad_norm": 0.4178411364555359, "learning_rate": 6.536299296542566e-06, "loss": 0.3315, "step": 3830 }, { "epoch": 3.224747474747475, "grad_norm": 0.40988659858703613, "learning_rate": 6.534301820148824e-06, "loss": 0.3099, "step": 3831 }, { "epoch": 3.225589225589226, "grad_norm": 0.4074786305427551, "learning_rate": 6.5323040733947444e-06, "loss": 0.3362, "step": 3832 }, { "epoch": 3.2264309764309766, "grad_norm": 0.3856716752052307, "learning_rate": 6.53030605663235e-06, "loss": 0.347, "step": 3833 }, { "epoch": 3.227272727272727, "grad_norm": 0.4260260760784149, "learning_rate": 6.528307770213713e-06, "loss": 0.3349, "step": 3834 }, { "epoch": 3.228114478114478, "grad_norm": 0.4431381821632385, "learning_rate": 6.5263092144909524e-06, "loss": 0.3262, "step": 3835 }, { "epoch": 3.228956228956229, "grad_norm": 0.3979245126247406, "learning_rate": 6.524310389816237e-06, "loss": 0.332, "step": 3836 }, { "epoch": 3.2297979797979797, "grad_norm": 0.4289344549179077, "learning_rate": 6.522311296541779e-06, "loss": 0.3149, "step": 3837 }, { "epoch": 3.2306397306397305, "grad_norm": 0.4002893567085266, "learning_rate": 6.520311935019841e-06, "loss": 0.315, "step": 3838 }, { "epoch": 3.2314814814814814, "grad_norm": 0.3934142589569092, "learning_rate": 6.518312305602731e-06, "loss": 0.331, "step": 3839 }, { "epoch": 3.2323232323232323, "grad_norm": 0.3964950740337372, "learning_rate": 6.516312408642804e-06, "loss": 0.3171, "step": 3840 }, { "epoch": 3.233164983164983, "grad_norm": 0.4059320390224457, "learning_rate": 6.514312244492467e-06, "loss": 0.3032, "step": 3841 }, { "epoch": 3.234006734006734, "grad_norm": 0.41101428866386414, "learning_rate": 6.512311813504167e-06, "loss": 0.3364, "step": 3842 }, { "epoch": 3.234848484848485, "grad_norm": 0.4033203423023224, "learning_rate": 6.510311116030401e-06, "loss": 0.3355, "step": 3843 }, { "epoch": 3.2356902356902357, "grad_norm": 0.38422518968582153, "learning_rate": 6.508310152423716e-06, "loss": 0.3507, "step": 3844 }, { "epoch": 3.2365319865319866, "grad_norm": 0.4126417934894562, "learning_rate": 6.5063089230367e-06, "loss": 0.3581, "step": 3845 }, { "epoch": 3.2373737373737375, "grad_norm": 0.4136028587818146, "learning_rate": 6.504307428221993e-06, "loss": 0.3371, "step": 3846 }, { "epoch": 3.2382154882154883, "grad_norm": 0.3841722011566162, "learning_rate": 6.5023056683322785e-06, "loss": 0.3204, "step": 3847 }, { "epoch": 3.239057239057239, "grad_norm": 0.39518532156944275, "learning_rate": 6.50030364372029e-06, "loss": 0.3327, "step": 3848 }, { "epoch": 3.23989898989899, "grad_norm": 0.40889817476272583, "learning_rate": 6.498301354738803e-06, "loss": 0.3198, "step": 3849 }, { "epoch": 3.240740740740741, "grad_norm": 0.41106104850769043, "learning_rate": 6.496298801740643e-06, "loss": 0.3113, "step": 3850 }, { "epoch": 3.241582491582492, "grad_norm": 0.43906643986701965, "learning_rate": 6.494295985078682e-06, "loss": 0.3523, "step": 3851 }, { "epoch": 3.242424242424242, "grad_norm": 0.4385296404361725, "learning_rate": 6.492292905105837e-06, "loss": 0.3352, "step": 3852 }, { "epoch": 3.243265993265993, "grad_norm": 0.40645068883895874, "learning_rate": 6.490289562175072e-06, "loss": 0.3418, "step": 3853 }, { "epoch": 3.244107744107744, "grad_norm": 0.4577059745788574, "learning_rate": 6.488285956639396e-06, "loss": 0.2986, "step": 3854 }, { "epoch": 3.244949494949495, "grad_norm": 0.42293599247932434, "learning_rate": 6.486282088851871e-06, "loss": 0.3181, "step": 3855 }, { "epoch": 3.2457912457912457, "grad_norm": 0.40734758973121643, "learning_rate": 6.4842779591655925e-06, "loss": 0.3277, "step": 3856 }, { "epoch": 3.2466329966329965, "grad_norm": 0.4533599317073822, "learning_rate": 6.482273567933714e-06, "loss": 0.3185, "step": 3857 }, { "epoch": 3.2474747474747474, "grad_norm": 0.40135565400123596, "learning_rate": 6.4802689155094315e-06, "loss": 0.3261, "step": 3858 }, { "epoch": 3.2483164983164983, "grad_norm": 0.3885362148284912, "learning_rate": 6.478264002245982e-06, "loss": 0.332, "step": 3859 }, { "epoch": 3.249158249158249, "grad_norm": 0.42211979627609253, "learning_rate": 6.4762588284966555e-06, "loss": 0.3315, "step": 3860 }, { "epoch": 3.25, "grad_norm": 0.4076663851737976, "learning_rate": 6.474253394614786e-06, "loss": 0.316, "step": 3861 }, { "epoch": 3.250841750841751, "grad_norm": 0.4006631076335907, "learning_rate": 6.472247700953748e-06, "loss": 0.3475, "step": 3862 }, { "epoch": 3.2516835016835017, "grad_norm": 0.40360012650489807, "learning_rate": 6.470241747866972e-06, "loss": 0.3077, "step": 3863 }, { "epoch": 3.2525252525252526, "grad_norm": 0.389297753572464, "learning_rate": 6.468235535707923e-06, "loss": 0.3195, "step": 3864 }, { "epoch": 3.2533670033670035, "grad_norm": 0.40614357590675354, "learning_rate": 6.46622906483012e-06, "loss": 0.3518, "step": 3865 }, { "epoch": 3.2542087542087543, "grad_norm": 0.4375155568122864, "learning_rate": 6.464222335587123e-06, "loss": 0.3061, "step": 3866 }, { "epoch": 3.255050505050505, "grad_norm": 0.4018388092517853, "learning_rate": 6.462215348332539e-06, "loss": 0.3221, "step": 3867 }, { "epoch": 3.255892255892256, "grad_norm": 0.4507244825363159, "learning_rate": 6.460208103420024e-06, "loss": 0.2985, "step": 3868 }, { "epoch": 3.256734006734007, "grad_norm": 0.43541085720062256, "learning_rate": 6.45820060120327e-06, "loss": 0.326, "step": 3869 }, { "epoch": 3.257575757575758, "grad_norm": 0.42252063751220703, "learning_rate": 6.456192842036026e-06, "loss": 0.3271, "step": 3870 }, { "epoch": 3.2584175084175087, "grad_norm": 0.5119532346725464, "learning_rate": 6.454184826272078e-06, "loss": 0.3282, "step": 3871 }, { "epoch": 3.259259259259259, "grad_norm": 0.3853996992111206, "learning_rate": 6.4521765542652595e-06, "loss": 0.3345, "step": 3872 }, { "epoch": 3.26010101010101, "grad_norm": 0.4203888773918152, "learning_rate": 6.450168026369449e-06, "loss": 0.3102, "step": 3873 }, { "epoch": 3.260942760942761, "grad_norm": 0.4275035858154297, "learning_rate": 6.448159242938573e-06, "loss": 0.3156, "step": 3874 }, { "epoch": 3.2617845117845117, "grad_norm": 0.3656291365623474, "learning_rate": 6.446150204326597e-06, "loss": 0.314, "step": 3875 }, { "epoch": 3.2626262626262625, "grad_norm": 0.4610520899295807, "learning_rate": 6.444140910887538e-06, "loss": 0.3431, "step": 3876 }, { "epoch": 3.2634680134680134, "grad_norm": 0.41965317726135254, "learning_rate": 6.442131362975455e-06, "loss": 0.3271, "step": 3877 }, { "epoch": 3.2643097643097643, "grad_norm": 0.3913000226020813, "learning_rate": 6.440121560944449e-06, "loss": 0.3249, "step": 3878 }, { "epoch": 3.265151515151515, "grad_norm": 0.4318212568759918, "learning_rate": 6.438111505148671e-06, "loss": 0.3245, "step": 3879 }, { "epoch": 3.265993265993266, "grad_norm": 0.44217780232429504, "learning_rate": 6.436101195942313e-06, "loss": 0.3238, "step": 3880 }, { "epoch": 3.266835016835017, "grad_norm": 0.410292387008667, "learning_rate": 6.434090633679613e-06, "loss": 0.3021, "step": 3881 }, { "epoch": 3.2676767676767677, "grad_norm": 0.3946225345134735, "learning_rate": 6.432079818714856e-06, "loss": 0.3048, "step": 3882 }, { "epoch": 3.2685185185185186, "grad_norm": 0.41921839118003845, "learning_rate": 6.4300687514023655e-06, "loss": 0.3282, "step": 3883 }, { "epoch": 3.2693602693602695, "grad_norm": 0.41344377398490906, "learning_rate": 6.4280574320965136e-06, "loss": 0.3438, "step": 3884 }, { "epoch": 3.2702020202020203, "grad_norm": 0.37680622935295105, "learning_rate": 6.426045861151718e-06, "loss": 0.3103, "step": 3885 }, { "epoch": 3.271043771043771, "grad_norm": 0.37603816390037537, "learning_rate": 6.424034038922437e-06, "loss": 0.3337, "step": 3886 }, { "epoch": 3.271885521885522, "grad_norm": 0.3724794387817383, "learning_rate": 6.422021965763179e-06, "loss": 0.3455, "step": 3887 }, { "epoch": 3.2727272727272725, "grad_norm": 0.3721892237663269, "learning_rate": 6.420009642028486e-06, "loss": 0.3152, "step": 3888 }, { "epoch": 3.2735690235690234, "grad_norm": 0.41276484727859497, "learning_rate": 6.4179970680729585e-06, "loss": 0.3312, "step": 3889 }, { "epoch": 3.274410774410774, "grad_norm": 0.38998252153396606, "learning_rate": 6.4159842442512275e-06, "loss": 0.3313, "step": 3890 }, { "epoch": 3.275252525252525, "grad_norm": 0.3847276568412781, "learning_rate": 6.413971170917976e-06, "loss": 0.3242, "step": 3891 }, { "epoch": 3.276094276094276, "grad_norm": 0.38818275928497314, "learning_rate": 6.41195784842793e-06, "loss": 0.3327, "step": 3892 }, { "epoch": 3.276936026936027, "grad_norm": 0.40705040097236633, "learning_rate": 6.409944277135859e-06, "loss": 0.3205, "step": 3893 }, { "epoch": 3.2777777777777777, "grad_norm": 0.3838440775871277, "learning_rate": 6.4079304573965714e-06, "loss": 0.3278, "step": 3894 }, { "epoch": 3.2786195286195285, "grad_norm": 0.40798914432525635, "learning_rate": 6.405916389564929e-06, "loss": 0.3396, "step": 3895 }, { "epoch": 3.2794612794612794, "grad_norm": 0.3566807806491852, "learning_rate": 6.403902073995827e-06, "loss": 0.339, "step": 3896 }, { "epoch": 3.2803030303030303, "grad_norm": 0.4178483188152313, "learning_rate": 6.401887511044216e-06, "loss": 0.3216, "step": 3897 }, { "epoch": 3.281144781144781, "grad_norm": 0.39273524284362793, "learning_rate": 6.399872701065076e-06, "loss": 0.3161, "step": 3898 }, { "epoch": 3.281986531986532, "grad_norm": 0.3874754309654236, "learning_rate": 6.397857644413443e-06, "loss": 0.3236, "step": 3899 }, { "epoch": 3.282828282828283, "grad_norm": 0.4200805425643921, "learning_rate": 6.395842341444389e-06, "loss": 0.3018, "step": 3900 }, { "epoch": 3.2836700336700337, "grad_norm": 0.361688494682312, "learning_rate": 6.393826792513035e-06, "loss": 0.3624, "step": 3901 }, { "epoch": 3.2845117845117846, "grad_norm": 0.37764716148376465, "learning_rate": 6.391810997974538e-06, "loss": 0.3289, "step": 3902 }, { "epoch": 3.2853535353535355, "grad_norm": 0.3931100368499756, "learning_rate": 6.389794958184104e-06, "loss": 0.3074, "step": 3903 }, { "epoch": 3.2861952861952863, "grad_norm": 0.42119869589805603, "learning_rate": 6.387778673496982e-06, "loss": 0.3304, "step": 3904 }, { "epoch": 3.287037037037037, "grad_norm": 0.3836376368999481, "learning_rate": 6.385762144268463e-06, "loss": 0.3321, "step": 3905 }, { "epoch": 3.287878787878788, "grad_norm": 0.40740978717803955, "learning_rate": 6.383745370853879e-06, "loss": 0.3357, "step": 3906 }, { "epoch": 3.288720538720539, "grad_norm": 0.37384480237960815, "learning_rate": 6.381728353608607e-06, "loss": 0.3111, "step": 3907 }, { "epoch": 3.28956228956229, "grad_norm": 0.4200274348258972, "learning_rate": 6.37971109288807e-06, "loss": 0.3007, "step": 3908 }, { "epoch": 3.29040404040404, "grad_norm": 0.4670436978340149, "learning_rate": 6.377693589047726e-06, "loss": 0.3135, "step": 3909 }, { "epoch": 3.291245791245791, "grad_norm": 0.42142099142074585, "learning_rate": 6.375675842443086e-06, "loss": 0.3117, "step": 3910 }, { "epoch": 3.292087542087542, "grad_norm": 0.42054978013038635, "learning_rate": 6.3736578534296955e-06, "loss": 0.3242, "step": 3911 }, { "epoch": 3.292929292929293, "grad_norm": 0.4000498652458191, "learning_rate": 6.371639622363145e-06, "loss": 0.3442, "step": 3912 }, { "epoch": 3.2937710437710437, "grad_norm": 0.4234660565853119, "learning_rate": 6.36962114959907e-06, "loss": 0.3095, "step": 3913 }, { "epoch": 3.2946127946127945, "grad_norm": 0.3953658938407898, "learning_rate": 6.367602435493146e-06, "loss": 0.3148, "step": 3914 }, { "epoch": 3.2954545454545454, "grad_norm": 0.4533858895301819, "learning_rate": 6.365583480401092e-06, "loss": 0.3218, "step": 3915 }, { "epoch": 3.2962962962962963, "grad_norm": 0.4637863039970398, "learning_rate": 6.3635642846786716e-06, "loss": 0.3312, "step": 3916 }, { "epoch": 3.297138047138047, "grad_norm": 0.45561274886131287, "learning_rate": 6.3615448486816845e-06, "loss": 0.3496, "step": 3917 }, { "epoch": 3.297979797979798, "grad_norm": 0.4768444299697876, "learning_rate": 6.3595251727659805e-06, "loss": 0.3083, "step": 3918 }, { "epoch": 3.298821548821549, "grad_norm": 0.46242856979370117, "learning_rate": 6.357505257287446e-06, "loss": 0.3325, "step": 3919 }, { "epoch": 3.2996632996632997, "grad_norm": 0.4446099102497101, "learning_rate": 6.355485102602012e-06, "loss": 0.2996, "step": 3920 }, { "epoch": 3.3005050505050506, "grad_norm": 0.41596442461013794, "learning_rate": 6.353464709065652e-06, "loss": 0.3506, "step": 3921 }, { "epoch": 3.3013468013468015, "grad_norm": 0.39087218046188354, "learning_rate": 6.351444077034381e-06, "loss": 0.3196, "step": 3922 }, { "epoch": 3.3021885521885523, "grad_norm": 0.44316768646240234, "learning_rate": 6.349423206864253e-06, "loss": 0.3027, "step": 3923 }, { "epoch": 3.303030303030303, "grad_norm": 0.45260781049728394, "learning_rate": 6.347402098911371e-06, "loss": 0.3145, "step": 3924 }, { "epoch": 3.3038720538720536, "grad_norm": 0.43701156973838806, "learning_rate": 6.345380753531874e-06, "loss": 0.3506, "step": 3925 }, { "epoch": 3.3047138047138045, "grad_norm": 0.39477741718292236, "learning_rate": 6.343359171081944e-06, "loss": 0.3385, "step": 3926 }, { "epoch": 3.3055555555555554, "grad_norm": 0.4465186893939972, "learning_rate": 6.341337351917807e-06, "loss": 0.3334, "step": 3927 }, { "epoch": 3.3063973063973062, "grad_norm": 0.39446568489074707, "learning_rate": 6.339315296395726e-06, "loss": 0.3292, "step": 3928 }, { "epoch": 3.307239057239057, "grad_norm": 0.36059075593948364, "learning_rate": 6.337293004872013e-06, "loss": 0.3186, "step": 3929 }, { "epoch": 3.308080808080808, "grad_norm": 0.3938598334789276, "learning_rate": 6.3352704777030125e-06, "loss": 0.2968, "step": 3930 }, { "epoch": 3.308922558922559, "grad_norm": 0.38493433594703674, "learning_rate": 6.33324771524512e-06, "loss": 0.3386, "step": 3931 }, { "epoch": 3.3097643097643097, "grad_norm": 0.41828620433807373, "learning_rate": 6.331224717854765e-06, "loss": 0.3338, "step": 3932 }, { "epoch": 3.3106060606060606, "grad_norm": 0.39827755093574524, "learning_rate": 6.329201485888423e-06, "loss": 0.3347, "step": 3933 }, { "epoch": 3.3114478114478114, "grad_norm": 0.3802882432937622, "learning_rate": 6.327178019702606e-06, "loss": 0.31, "step": 3934 }, { "epoch": 3.3122895622895623, "grad_norm": 0.4062994420528412, "learning_rate": 6.325154319653875e-06, "loss": 0.3298, "step": 3935 }, { "epoch": 3.313131313131313, "grad_norm": 0.4115019738674164, "learning_rate": 6.323130386098824e-06, "loss": 0.332, "step": 3936 }, { "epoch": 3.313973063973064, "grad_norm": 0.3898678719997406, "learning_rate": 6.321106219394091e-06, "loss": 0.3393, "step": 3937 }, { "epoch": 3.314814814814815, "grad_norm": 0.41906771063804626, "learning_rate": 6.319081819896358e-06, "loss": 0.3302, "step": 3938 }, { "epoch": 3.3156565656565657, "grad_norm": 0.4549757242202759, "learning_rate": 6.3170571879623445e-06, "loss": 0.3189, "step": 3939 }, { "epoch": 3.3164983164983166, "grad_norm": 0.4042644500732422, "learning_rate": 6.315032323948814e-06, "loss": 0.3204, "step": 3940 }, { "epoch": 3.3173400673400675, "grad_norm": 0.43256106972694397, "learning_rate": 6.313007228212568e-06, "loss": 0.3302, "step": 3941 }, { "epoch": 3.3181818181818183, "grad_norm": 0.4322051405906677, "learning_rate": 6.3109819011104485e-06, "loss": 0.3483, "step": 3942 }, { "epoch": 3.319023569023569, "grad_norm": 0.43817371129989624, "learning_rate": 6.308956342999342e-06, "loss": 0.3379, "step": 3943 }, { "epoch": 3.31986531986532, "grad_norm": 0.3937491178512573, "learning_rate": 6.3069305542361725e-06, "loss": 0.3259, "step": 3944 }, { "epoch": 3.320707070707071, "grad_norm": 0.4360649585723877, "learning_rate": 6.3049045351779045e-06, "loss": 0.3346, "step": 3945 }, { "epoch": 3.3215488215488214, "grad_norm": 0.39328789710998535, "learning_rate": 6.302878286181548e-06, "loss": 0.3281, "step": 3946 }, { "epoch": 3.3223905723905722, "grad_norm": 0.3896661400794983, "learning_rate": 6.300851807604144e-06, "loss": 0.334, "step": 3947 }, { "epoch": 3.323232323232323, "grad_norm": 0.4279959499835968, "learning_rate": 6.298825099802784e-06, "loss": 0.3304, "step": 3948 }, { "epoch": 3.324074074074074, "grad_norm": 0.3921409845352173, "learning_rate": 6.296798163134594e-06, "loss": 0.3298, "step": 3949 }, { "epoch": 3.324915824915825, "grad_norm": 0.41700732707977295, "learning_rate": 6.2947709979567415e-06, "loss": 0.3109, "step": 3950 }, { "epoch": 3.3257575757575757, "grad_norm": 0.41487810015678406, "learning_rate": 6.292743604626436e-06, "loss": 0.3418, "step": 3951 }, { "epoch": 3.3265993265993266, "grad_norm": 0.4130215048789978, "learning_rate": 6.290715983500923e-06, "loss": 0.3266, "step": 3952 }, { "epoch": 3.3274410774410774, "grad_norm": 0.44452545046806335, "learning_rate": 6.288688134937495e-06, "loss": 0.31, "step": 3953 }, { "epoch": 3.3282828282828283, "grad_norm": 0.40514981746673584, "learning_rate": 6.286660059293478e-06, "loss": 0.3329, "step": 3954 }, { "epoch": 3.329124579124579, "grad_norm": 0.3956911563873291, "learning_rate": 6.28463175692624e-06, "loss": 0.283, "step": 3955 }, { "epoch": 3.32996632996633, "grad_norm": 0.4202256202697754, "learning_rate": 6.2826032281931905e-06, "loss": 0.3176, "step": 3956 }, { "epoch": 3.330808080808081, "grad_norm": 0.4966040849685669, "learning_rate": 6.280574473451778e-06, "loss": 0.333, "step": 3957 }, { "epoch": 3.3316498316498318, "grad_norm": 0.3934146761894226, "learning_rate": 6.27854549305949e-06, "loss": 0.3167, "step": 3958 }, { "epoch": 3.3324915824915826, "grad_norm": 0.44415515661239624, "learning_rate": 6.276516287373857e-06, "loss": 0.3135, "step": 3959 }, { "epoch": 3.3333333333333335, "grad_norm": 0.48710450530052185, "learning_rate": 6.274486856752442e-06, "loss": 0.3547, "step": 3960 }, { "epoch": 3.3341750841750843, "grad_norm": 0.44116947054862976, "learning_rate": 6.272457201552857e-06, "loss": 0.3181, "step": 3961 }, { "epoch": 3.3350168350168348, "grad_norm": 0.4198330044746399, "learning_rate": 6.270427322132746e-06, "loss": 0.318, "step": 3962 }, { "epoch": 3.3358585858585856, "grad_norm": 0.3880566358566284, "learning_rate": 6.268397218849797e-06, "loss": 0.3281, "step": 3963 }, { "epoch": 3.3367003367003365, "grad_norm": 0.41018345952033997, "learning_rate": 6.266366892061735e-06, "loss": 0.3373, "step": 3964 }, { "epoch": 3.3375420875420874, "grad_norm": 0.43391990661621094, "learning_rate": 6.264336342126325e-06, "loss": 0.3318, "step": 3965 }, { "epoch": 3.3383838383838382, "grad_norm": 0.3894343078136444, "learning_rate": 6.262305569401371e-06, "loss": 0.3079, "step": 3966 }, { "epoch": 3.339225589225589, "grad_norm": 0.39079463481903076, "learning_rate": 6.260274574244719e-06, "loss": 0.3275, "step": 3967 }, { "epoch": 3.34006734006734, "grad_norm": 0.4951452612876892, "learning_rate": 6.25824335701425e-06, "loss": 0.3026, "step": 3968 }, { "epoch": 3.340909090909091, "grad_norm": 0.4190388023853302, "learning_rate": 6.256211918067887e-06, "loss": 0.3118, "step": 3969 }, { "epoch": 3.3417508417508417, "grad_norm": 0.4012814164161682, "learning_rate": 6.254180257763591e-06, "loss": 0.3349, "step": 3970 }, { "epoch": 3.3425925925925926, "grad_norm": 0.4058343768119812, "learning_rate": 6.252148376459363e-06, "loss": 0.3431, "step": 3971 }, { "epoch": 3.3434343434343434, "grad_norm": 0.3950294852256775, "learning_rate": 6.25011627451324e-06, "loss": 0.338, "step": 3972 }, { "epoch": 3.3442760942760943, "grad_norm": 0.40570881962776184, "learning_rate": 6.248083952283303e-06, "loss": 0.3359, "step": 3973 }, { "epoch": 3.345117845117845, "grad_norm": 0.3929937779903412, "learning_rate": 6.246051410127667e-06, "loss": 0.3283, "step": 3974 }, { "epoch": 3.345959595959596, "grad_norm": 0.3884713351726532, "learning_rate": 6.244018648404486e-06, "loss": 0.3368, "step": 3975 }, { "epoch": 3.346801346801347, "grad_norm": 0.4249723255634308, "learning_rate": 6.241985667471958e-06, "loss": 0.32, "step": 3976 }, { "epoch": 3.3476430976430978, "grad_norm": 0.3907220959663391, "learning_rate": 6.2399524676883125e-06, "loss": 0.3224, "step": 3977 }, { "epoch": 3.3484848484848486, "grad_norm": 0.38149699568748474, "learning_rate": 6.237919049411825e-06, "loss": 0.3515, "step": 3978 }, { "epoch": 3.3493265993265995, "grad_norm": 0.4166454076766968, "learning_rate": 6.235885413000801e-06, "loss": 0.3284, "step": 3979 }, { "epoch": 3.3501683501683504, "grad_norm": 0.39354780316352844, "learning_rate": 6.233851558813592e-06, "loss": 0.3327, "step": 3980 }, { "epoch": 3.351010101010101, "grad_norm": 0.39926111698150635, "learning_rate": 6.2318174872085825e-06, "loss": 0.3153, "step": 3981 }, { "epoch": 3.351851851851852, "grad_norm": 0.4114503264427185, "learning_rate": 6.2297831985442e-06, "loss": 0.329, "step": 3982 }, { "epoch": 3.3526936026936025, "grad_norm": 0.3956889808177948, "learning_rate": 6.227748693178905e-06, "loss": 0.3406, "step": 3983 }, { "epoch": 3.3535353535353534, "grad_norm": 0.3952290713787079, "learning_rate": 6.225713971471202e-06, "loss": 0.3281, "step": 3984 }, { "epoch": 3.3543771043771042, "grad_norm": 0.4320905804634094, "learning_rate": 6.22367903377963e-06, "loss": 0.3202, "step": 3985 }, { "epoch": 3.355218855218855, "grad_norm": 0.4236990809440613, "learning_rate": 6.221643880462767e-06, "loss": 0.3378, "step": 3986 }, { "epoch": 3.356060606060606, "grad_norm": 0.3749270439147949, "learning_rate": 6.219608511879223e-06, "loss": 0.345, "step": 3987 }, { "epoch": 3.356902356902357, "grad_norm": 0.42270368337631226, "learning_rate": 6.217572928387662e-06, "loss": 0.3151, "step": 3988 }, { "epoch": 3.3577441077441077, "grad_norm": 0.42701447010040283, "learning_rate": 6.2155371303467675e-06, "loss": 0.3329, "step": 3989 }, { "epoch": 3.3585858585858586, "grad_norm": 0.42350608110427856, "learning_rate": 6.21350111811527e-06, "loss": 0.3428, "step": 3990 }, { "epoch": 3.3594276094276094, "grad_norm": 0.4270547032356262, "learning_rate": 6.211464892051939e-06, "loss": 0.3473, "step": 3991 }, { "epoch": 3.3602693602693603, "grad_norm": 0.39794209599494934, "learning_rate": 6.209428452515579e-06, "loss": 0.3447, "step": 3992 }, { "epoch": 3.361111111111111, "grad_norm": 0.4369339346885681, "learning_rate": 6.207391799865031e-06, "loss": 0.324, "step": 3993 }, { "epoch": 3.361952861952862, "grad_norm": 0.44041162729263306, "learning_rate": 6.205354934459176e-06, "loss": 0.2885, "step": 3994 }, { "epoch": 3.362794612794613, "grad_norm": 0.4261743724346161, "learning_rate": 6.20331785665693e-06, "loss": 0.3095, "step": 3995 }, { "epoch": 3.3636363636363638, "grad_norm": 0.4285004734992981, "learning_rate": 6.201280566817247e-06, "loss": 0.3424, "step": 3996 }, { "epoch": 3.3644781144781146, "grad_norm": 0.4604799151420593, "learning_rate": 6.1992430652991235e-06, "loss": 0.3284, "step": 3997 }, { "epoch": 3.3653198653198655, "grad_norm": 0.384001225233078, "learning_rate": 6.1972053524615835e-06, "loss": 0.3241, "step": 3998 }, { "epoch": 3.3661616161616164, "grad_norm": 0.4152281880378723, "learning_rate": 6.195167428663699e-06, "loss": 0.2998, "step": 3999 }, { "epoch": 3.3670033670033668, "grad_norm": 0.4531628489494324, "learning_rate": 6.19312929426457e-06, "loss": 0.3295, "step": 4000 }, { "epoch": 3.3678451178451176, "grad_norm": 0.42662349343299866, "learning_rate": 6.191090949623339e-06, "loss": 0.3396, "step": 4001 }, { "epoch": 3.3686868686868685, "grad_norm": 0.36927008628845215, "learning_rate": 6.189052395099184e-06, "loss": 0.3155, "step": 4002 }, { "epoch": 3.3695286195286194, "grad_norm": 0.4486301839351654, "learning_rate": 6.187013631051321e-06, "loss": 0.3427, "step": 4003 }, { "epoch": 3.3703703703703702, "grad_norm": 0.43211355805397034, "learning_rate": 6.184974657838999e-06, "loss": 0.3433, "step": 4004 }, { "epoch": 3.371212121212121, "grad_norm": 0.3610932528972626, "learning_rate": 6.18293547582151e-06, "loss": 0.3263, "step": 4005 }, { "epoch": 3.372053872053872, "grad_norm": 0.4051627516746521, "learning_rate": 6.180896085358178e-06, "loss": 0.3203, "step": 4006 }, { "epoch": 3.372895622895623, "grad_norm": 0.42014527320861816, "learning_rate": 6.178856486808367e-06, "loss": 0.3281, "step": 4007 }, { "epoch": 3.3737373737373737, "grad_norm": 0.3769921064376831, "learning_rate": 6.176816680531472e-06, "loss": 0.3219, "step": 4008 }, { "epoch": 3.3745791245791246, "grad_norm": 0.4084587097167969, "learning_rate": 6.1747766668869346e-06, "loss": 0.3432, "step": 4009 }, { "epoch": 3.3754208754208754, "grad_norm": 0.3784762918949127, "learning_rate": 6.172736446234222e-06, "loss": 0.3155, "step": 4010 }, { "epoch": 3.3762626262626263, "grad_norm": 0.3990470767021179, "learning_rate": 6.170696018932845e-06, "loss": 0.3315, "step": 4011 }, { "epoch": 3.377104377104377, "grad_norm": 0.4227665960788727, "learning_rate": 6.168655385342348e-06, "loss": 0.3347, "step": 4012 }, { "epoch": 3.377946127946128, "grad_norm": 0.37575000524520874, "learning_rate": 6.1666145458223115e-06, "loss": 0.3253, "step": 4013 }, { "epoch": 3.378787878787879, "grad_norm": 0.4233551323413849, "learning_rate": 6.164573500732356e-06, "loss": 0.2967, "step": 4014 }, { "epoch": 3.3796296296296298, "grad_norm": 0.4118350148200989, "learning_rate": 6.162532250432133e-06, "loss": 0.336, "step": 4015 }, { "epoch": 3.3804713804713806, "grad_norm": 0.3961559534072876, "learning_rate": 6.160490795281333e-06, "loss": 0.3514, "step": 4016 }, { "epoch": 3.3813131313131315, "grad_norm": 0.38310083746910095, "learning_rate": 6.158449135639683e-06, "loss": 0.3226, "step": 4017 }, { "epoch": 3.3821548821548824, "grad_norm": 0.41891714930534363, "learning_rate": 6.156407271866944e-06, "loss": 0.3249, "step": 4018 }, { "epoch": 3.3829966329966332, "grad_norm": 0.4289630949497223, "learning_rate": 6.154365204322914e-06, "loss": 0.3305, "step": 4019 }, { "epoch": 3.3838383838383836, "grad_norm": 0.4015401303768158, "learning_rate": 6.15232293336743e-06, "loss": 0.3325, "step": 4020 }, { "epoch": 3.3846801346801345, "grad_norm": 0.3744424879550934, "learning_rate": 6.150280459360357e-06, "loss": 0.324, "step": 4021 }, { "epoch": 3.3855218855218854, "grad_norm": 0.3851487338542938, "learning_rate": 6.1482377826616055e-06, "loss": 0.3616, "step": 4022 }, { "epoch": 3.3863636363636362, "grad_norm": 0.4566548764705658, "learning_rate": 6.1461949036311145e-06, "loss": 0.3476, "step": 4023 }, { "epoch": 3.387205387205387, "grad_norm": 0.40144336223602295, "learning_rate": 6.144151822628861e-06, "loss": 0.3187, "step": 4024 }, { "epoch": 3.388047138047138, "grad_norm": 0.41791847348213196, "learning_rate": 6.142108540014857e-06, "loss": 0.31, "step": 4025 }, { "epoch": 3.388888888888889, "grad_norm": 0.40482205152511597, "learning_rate": 6.140065056149155e-06, "loss": 0.3058, "step": 4026 }, { "epoch": 3.3897306397306397, "grad_norm": 0.4876040518283844, "learning_rate": 6.1380213713918316e-06, "loss": 0.3313, "step": 4027 }, { "epoch": 3.3905723905723906, "grad_norm": 0.38310107588768005, "learning_rate": 6.135977486103012e-06, "loss": 0.3451, "step": 4028 }, { "epoch": 3.3914141414141414, "grad_norm": 0.46072298288345337, "learning_rate": 6.133933400642848e-06, "loss": 0.3302, "step": 4029 }, { "epoch": 3.3922558922558923, "grad_norm": 0.4451667368412018, "learning_rate": 6.131889115371529e-06, "loss": 0.3214, "step": 4030 }, { "epoch": 3.393097643097643, "grad_norm": 0.43177881836891174, "learning_rate": 6.12984463064928e-06, "loss": 0.3491, "step": 4031 }, { "epoch": 3.393939393939394, "grad_norm": 0.40444767475128174, "learning_rate": 6.127799946836362e-06, "loss": 0.3377, "step": 4032 }, { "epoch": 3.394781144781145, "grad_norm": 0.3905409574508667, "learning_rate": 6.1257550642930695e-06, "loss": 0.3031, "step": 4033 }, { "epoch": 3.3956228956228958, "grad_norm": 0.48918336629867554, "learning_rate": 6.123709983379732e-06, "loss": 0.3214, "step": 4034 }, { "epoch": 3.3964646464646466, "grad_norm": 0.3921576142311096, "learning_rate": 6.121664704456714e-06, "loss": 0.3153, "step": 4035 }, { "epoch": 3.3973063973063975, "grad_norm": 0.4191894233226776, "learning_rate": 6.119619227884418e-06, "loss": 0.323, "step": 4036 }, { "epoch": 3.398148148148148, "grad_norm": 0.41757702827453613, "learning_rate": 6.117573554023277e-06, "loss": 0.3174, "step": 4037 }, { "epoch": 3.398989898989899, "grad_norm": 0.40779486298561096, "learning_rate": 6.115527683233761e-06, "loss": 0.304, "step": 4038 }, { "epoch": 3.3998316498316496, "grad_norm": 0.4001549482345581, "learning_rate": 6.113481615876376e-06, "loss": 0.3274, "step": 4039 }, { "epoch": 3.4006734006734005, "grad_norm": 0.40959981083869934, "learning_rate": 6.111435352311654e-06, "loss": 0.3573, "step": 4040 }, { "epoch": 3.4015151515151514, "grad_norm": 0.44297319650650024, "learning_rate": 6.109388892900178e-06, "loss": 0.3106, "step": 4041 }, { "epoch": 3.4023569023569022, "grad_norm": 0.3976607620716095, "learning_rate": 6.10734223800255e-06, "loss": 0.3383, "step": 4042 }, { "epoch": 3.403198653198653, "grad_norm": 0.4202091693878174, "learning_rate": 6.105295387979413e-06, "loss": 0.3391, "step": 4043 }, { "epoch": 3.404040404040404, "grad_norm": 0.40938663482666016, "learning_rate": 6.1032483431914455e-06, "loss": 0.3157, "step": 4044 }, { "epoch": 3.404882154882155, "grad_norm": 0.4052133858203888, "learning_rate": 6.101201103999357e-06, "loss": 0.3572, "step": 4045 }, { "epoch": 3.4057239057239057, "grad_norm": 0.38700711727142334, "learning_rate": 6.099153670763894e-06, "loss": 0.3035, "step": 4046 }, { "epoch": 3.4065656565656566, "grad_norm": 0.39762845635414124, "learning_rate": 6.097106043845837e-06, "loss": 0.3205, "step": 4047 }, { "epoch": 3.4074074074074074, "grad_norm": 0.387698769569397, "learning_rate": 6.095058223605997e-06, "loss": 0.308, "step": 4048 }, { "epoch": 3.4082491582491583, "grad_norm": 0.40638792514801025, "learning_rate": 6.0930102104052234e-06, "loss": 0.3272, "step": 4049 }, { "epoch": 3.409090909090909, "grad_norm": 0.37617021799087524, "learning_rate": 6.090962004604397e-06, "loss": 0.3177, "step": 4050 }, { "epoch": 3.40993265993266, "grad_norm": 0.414263516664505, "learning_rate": 6.088913606564435e-06, "loss": 0.312, "step": 4051 }, { "epoch": 3.410774410774411, "grad_norm": 0.40298956632614136, "learning_rate": 6.086865016646286e-06, "loss": 0.3288, "step": 4052 }, { "epoch": 3.4116161616161618, "grad_norm": 0.40969914197921753, "learning_rate": 6.084816235210933e-06, "loss": 0.3233, "step": 4053 }, { "epoch": 3.4124579124579126, "grad_norm": 0.40068596601486206, "learning_rate": 6.082767262619394e-06, "loss": 0.3396, "step": 4054 }, { "epoch": 3.4132996632996635, "grad_norm": 0.3930071294307709, "learning_rate": 6.080718099232721e-06, "loss": 0.3182, "step": 4055 }, { "epoch": 3.4141414141414144, "grad_norm": 0.35890549421310425, "learning_rate": 6.0786687454119955e-06, "loss": 0.3209, "step": 4056 }, { "epoch": 3.4149831649831652, "grad_norm": 0.39085856080055237, "learning_rate": 6.076619201518336e-06, "loss": 0.3448, "step": 4057 }, { "epoch": 3.4158249158249157, "grad_norm": 0.3776542842388153, "learning_rate": 6.074569467912896e-06, "loss": 0.305, "step": 4058 }, { "epoch": 3.4166666666666665, "grad_norm": 0.4525560140609741, "learning_rate": 6.072519544956859e-06, "loss": 0.3412, "step": 4059 }, { "epoch": 3.4175084175084174, "grad_norm": 0.41355228424072266, "learning_rate": 6.070469433011444e-06, "loss": 0.3177, "step": 4060 }, { "epoch": 3.4183501683501682, "grad_norm": 0.39940106868743896, "learning_rate": 6.068419132437901e-06, "loss": 0.3217, "step": 4061 }, { "epoch": 3.419191919191919, "grad_norm": 0.4094993770122528, "learning_rate": 6.066368643597518e-06, "loss": 0.338, "step": 4062 }, { "epoch": 3.42003367003367, "grad_norm": 0.435034841299057, "learning_rate": 6.064317966851609e-06, "loss": 0.3379, "step": 4063 }, { "epoch": 3.420875420875421, "grad_norm": 0.3987088203430176, "learning_rate": 6.0622671025615285e-06, "loss": 0.3088, "step": 4064 }, { "epoch": 3.4217171717171717, "grad_norm": 0.3844773769378662, "learning_rate": 6.060216051088659e-06, "loss": 0.3121, "step": 4065 }, { "epoch": 3.4225589225589226, "grad_norm": 0.4348909258842468, "learning_rate": 6.058164812794418e-06, "loss": 0.3311, "step": 4066 }, { "epoch": 3.4234006734006734, "grad_norm": 0.39918169379234314, "learning_rate": 6.0561133880402554e-06, "loss": 0.332, "step": 4067 }, { "epoch": 3.4242424242424243, "grad_norm": 0.3891465663909912, "learning_rate": 6.054061777187653e-06, "loss": 0.3113, "step": 4068 }, { "epoch": 3.425084175084175, "grad_norm": 0.37697139382362366, "learning_rate": 6.052009980598129e-06, "loss": 0.3305, "step": 4069 }, { "epoch": 3.425925925925926, "grad_norm": 0.38497069478034973, "learning_rate": 6.04995799863323e-06, "loss": 0.3374, "step": 4070 }, { "epoch": 3.426767676767677, "grad_norm": 0.40426909923553467, "learning_rate": 6.047905831654538e-06, "loss": 0.3259, "step": 4071 }, { "epoch": 3.4276094276094278, "grad_norm": 0.4020056128501892, "learning_rate": 6.045853480023664e-06, "loss": 0.3314, "step": 4072 }, { "epoch": 3.4284511784511786, "grad_norm": 0.3976704478263855, "learning_rate": 6.04380094410226e-06, "loss": 0.3246, "step": 4073 }, { "epoch": 3.429292929292929, "grad_norm": 0.3719247877597809, "learning_rate": 6.041748224252e-06, "loss": 0.3173, "step": 4074 }, { "epoch": 3.43013468013468, "grad_norm": 0.4084589183330536, "learning_rate": 6.039695320834595e-06, "loss": 0.3051, "step": 4075 }, { "epoch": 3.430976430976431, "grad_norm": 0.4486449360847473, "learning_rate": 6.037642234211791e-06, "loss": 0.3156, "step": 4076 }, { "epoch": 3.4318181818181817, "grad_norm": 0.41201648116111755, "learning_rate": 6.035588964745362e-06, "loss": 0.304, "step": 4077 }, { "epoch": 3.4326599326599325, "grad_norm": 0.4108927845954895, "learning_rate": 6.033535512797117e-06, "loss": 0.3494, "step": 4078 }, { "epoch": 3.4335016835016834, "grad_norm": 0.4584639370441437, "learning_rate": 6.0314818787288976e-06, "loss": 0.3103, "step": 4079 }, { "epoch": 3.4343434343434343, "grad_norm": 0.4579455852508545, "learning_rate": 6.029428062902572e-06, "loss": 0.3367, "step": 4080 }, { "epoch": 3.435185185185185, "grad_norm": 0.3885105848312378, "learning_rate": 6.0273740656800475e-06, "loss": 0.3177, "step": 4081 }, { "epoch": 3.436026936026936, "grad_norm": 0.4670053720474243, "learning_rate": 6.025319887423258e-06, "loss": 0.3183, "step": 4082 }, { "epoch": 3.436868686868687, "grad_norm": 0.44715023040771484, "learning_rate": 6.023265528494173e-06, "loss": 0.327, "step": 4083 }, { "epoch": 3.4377104377104377, "grad_norm": 0.3931790888309479, "learning_rate": 6.0212109892547934e-06, "loss": 0.3384, "step": 4084 }, { "epoch": 3.4385521885521886, "grad_norm": 0.40842753648757935, "learning_rate": 6.019156270067149e-06, "loss": 0.3124, "step": 4085 }, { "epoch": 3.4393939393939394, "grad_norm": 0.4269856810569763, "learning_rate": 6.017101371293306e-06, "loss": 0.3207, "step": 4086 }, { "epoch": 3.4402356902356903, "grad_norm": 0.3801109194755554, "learning_rate": 6.0150462932953554e-06, "loss": 0.3333, "step": 4087 }, { "epoch": 3.441077441077441, "grad_norm": 0.40722718834877014, "learning_rate": 6.012991036435427e-06, "loss": 0.3135, "step": 4088 }, { "epoch": 3.441919191919192, "grad_norm": 0.4044586718082428, "learning_rate": 6.010935601075678e-06, "loss": 0.3554, "step": 4089 }, { "epoch": 3.442760942760943, "grad_norm": 0.38904476165771484, "learning_rate": 6.0088799875783e-06, "loss": 0.3218, "step": 4090 }, { "epoch": 3.4436026936026938, "grad_norm": 0.39870771765708923, "learning_rate": 6.006824196305509e-06, "loss": 0.3372, "step": 4091 }, { "epoch": 3.4444444444444446, "grad_norm": 0.36261171102523804, "learning_rate": 6.004768227619562e-06, "loss": 0.3305, "step": 4092 }, { "epoch": 3.4452861952861955, "grad_norm": 0.4076400697231293, "learning_rate": 6.00271208188274e-06, "loss": 0.3438, "step": 4093 }, { "epoch": 3.4461279461279464, "grad_norm": 0.39833563566207886, "learning_rate": 6.00065575945736e-06, "loss": 0.3171, "step": 4094 }, { "epoch": 3.446969696969697, "grad_norm": 0.40648016333580017, "learning_rate": 5.998599260705766e-06, "loss": 0.3135, "step": 4095 }, { "epoch": 3.4478114478114477, "grad_norm": 0.39379486441612244, "learning_rate": 5.9965425859903356e-06, "loss": 0.3271, "step": 4096 }, { "epoch": 3.4486531986531985, "grad_norm": 0.3911527395248413, "learning_rate": 5.994485735673479e-06, "loss": 0.3488, "step": 4097 }, { "epoch": 3.4494949494949494, "grad_norm": 0.39247438311576843, "learning_rate": 5.9924287101176315e-06, "loss": 0.312, "step": 4098 }, { "epoch": 3.4503367003367003, "grad_norm": 0.441690057516098, "learning_rate": 5.990371509685263e-06, "loss": 0.312, "step": 4099 }, { "epoch": 3.451178451178451, "grad_norm": 0.3784649968147278, "learning_rate": 5.988314134738879e-06, "loss": 0.3173, "step": 4100 }, { "epoch": 3.452020202020202, "grad_norm": 0.408271849155426, "learning_rate": 5.986256585641006e-06, "loss": 0.3328, "step": 4101 }, { "epoch": 3.452861952861953, "grad_norm": 0.43807587027549744, "learning_rate": 5.984198862754208e-06, "loss": 0.327, "step": 4102 }, { "epoch": 3.4537037037037037, "grad_norm": 0.3939760625362396, "learning_rate": 5.982140966441077e-06, "loss": 0.3493, "step": 4103 }, { "epoch": 3.4545454545454546, "grad_norm": 0.41076862812042236, "learning_rate": 5.980082897064236e-06, "loss": 0.3248, "step": 4104 }, { "epoch": 3.4553872053872055, "grad_norm": 0.4274197518825531, "learning_rate": 5.978024654986342e-06, "loss": 0.3313, "step": 4105 }, { "epoch": 3.4562289562289563, "grad_norm": 0.3844243288040161, "learning_rate": 5.975966240570075e-06, "loss": 0.3135, "step": 4106 }, { "epoch": 3.457070707070707, "grad_norm": 0.44884905219078064, "learning_rate": 5.97390765417815e-06, "loss": 0.3398, "step": 4107 }, { "epoch": 3.457912457912458, "grad_norm": 0.4050668478012085, "learning_rate": 5.9718488961733144e-06, "loss": 0.3357, "step": 4108 }, { "epoch": 3.458754208754209, "grad_norm": 0.39935699105262756, "learning_rate": 5.969789966918342e-06, "loss": 0.3495, "step": 4109 }, { "epoch": 3.45959595959596, "grad_norm": 0.3961719572544098, "learning_rate": 5.9677308667760385e-06, "loss": 0.3316, "step": 4110 }, { "epoch": 3.46043771043771, "grad_norm": 0.4222089946269989, "learning_rate": 5.9656715961092395e-06, "loss": 0.3387, "step": 4111 }, { "epoch": 3.461279461279461, "grad_norm": 0.38479891419410706, "learning_rate": 5.963612155280807e-06, "loss": 0.3475, "step": 4112 }, { "epoch": 3.462121212121212, "grad_norm": 0.3808903694152832, "learning_rate": 5.961552544653642e-06, "loss": 0.3353, "step": 4113 }, { "epoch": 3.462962962962963, "grad_norm": 0.4178912043571472, "learning_rate": 5.9594927645906664e-06, "loss": 0.3089, "step": 4114 }, { "epoch": 3.4638047138047137, "grad_norm": 0.43647512793540955, "learning_rate": 5.9574328154548356e-06, "loss": 0.3459, "step": 4115 }, { "epoch": 3.4646464646464645, "grad_norm": 0.4202926456928253, "learning_rate": 5.955372697609135e-06, "loss": 0.3216, "step": 4116 }, { "epoch": 3.4654882154882154, "grad_norm": 0.37902799248695374, "learning_rate": 5.95331241141658e-06, "loss": 0.3213, "step": 4117 }, { "epoch": 3.4663299663299663, "grad_norm": 0.3711166977882385, "learning_rate": 5.951251957240214e-06, "loss": 0.3274, "step": 4118 }, { "epoch": 3.467171717171717, "grad_norm": 0.3850290775299072, "learning_rate": 5.949191335443113e-06, "loss": 0.3075, "step": 4119 }, { "epoch": 3.468013468013468, "grad_norm": 0.39138442277908325, "learning_rate": 5.947130546388376e-06, "loss": 0.3086, "step": 4120 }, { "epoch": 3.468855218855219, "grad_norm": 0.426005482673645, "learning_rate": 5.945069590439142e-06, "loss": 0.2914, "step": 4121 }, { "epoch": 3.4696969696969697, "grad_norm": 0.39236173033714294, "learning_rate": 5.9430084679585686e-06, "loss": 0.3084, "step": 4122 }, { "epoch": 3.4705387205387206, "grad_norm": 0.3527890145778656, "learning_rate": 5.940947179309849e-06, "loss": 0.3387, "step": 4123 }, { "epoch": 3.4713804713804715, "grad_norm": 0.3895651698112488, "learning_rate": 5.9388857248562075e-06, "loss": 0.3338, "step": 4124 }, { "epoch": 3.4722222222222223, "grad_norm": 0.4151286482810974, "learning_rate": 5.9368241049608876e-06, "loss": 0.33, "step": 4125 }, { "epoch": 3.473063973063973, "grad_norm": 0.3994264602661133, "learning_rate": 5.934762319987175e-06, "loss": 0.3329, "step": 4126 }, { "epoch": 3.473905723905724, "grad_norm": 0.38810113072395325, "learning_rate": 5.932700370298374e-06, "loss": 0.3333, "step": 4127 }, { "epoch": 3.474747474747475, "grad_norm": 0.4249228239059448, "learning_rate": 5.930638256257826e-06, "loss": 0.3258, "step": 4128 }, { "epoch": 3.475589225589226, "grad_norm": 0.38140642642974854, "learning_rate": 5.928575978228893e-06, "loss": 0.3338, "step": 4129 }, { "epoch": 3.4764309764309766, "grad_norm": 0.42204323410987854, "learning_rate": 5.926513536574975e-06, "loss": 0.341, "step": 4130 }, { "epoch": 3.4772727272727275, "grad_norm": 0.4366730749607086, "learning_rate": 5.924450931659491e-06, "loss": 0.3256, "step": 4131 }, { "epoch": 3.478114478114478, "grad_norm": 0.4243471026420593, "learning_rate": 5.9223881638459e-06, "loss": 0.3281, "step": 4132 }, { "epoch": 3.478956228956229, "grad_norm": 0.4006428122520447, "learning_rate": 5.920325233497678e-06, "loss": 0.3249, "step": 4133 }, { "epoch": 3.4797979797979797, "grad_norm": 0.4100079834461212, "learning_rate": 5.918262140978339e-06, "loss": 0.3116, "step": 4134 }, { "epoch": 3.4806397306397305, "grad_norm": 0.441013365983963, "learning_rate": 5.916198886651421e-06, "loss": 0.2966, "step": 4135 }, { "epoch": 3.4814814814814814, "grad_norm": 0.42597687244415283, "learning_rate": 5.914135470880491e-06, "loss": 0.3258, "step": 4136 }, { "epoch": 3.4823232323232323, "grad_norm": 0.4087800681591034, "learning_rate": 5.912071894029144e-06, "loss": 0.3566, "step": 4137 }, { "epoch": 3.483164983164983, "grad_norm": 0.4050416350364685, "learning_rate": 5.9100081564610075e-06, "loss": 0.3238, "step": 4138 }, { "epoch": 3.484006734006734, "grad_norm": 0.4224300682544708, "learning_rate": 5.90794425853973e-06, "loss": 0.3407, "step": 4139 }, { "epoch": 3.484848484848485, "grad_norm": 0.4223337769508362, "learning_rate": 5.905880200628995e-06, "loss": 0.3352, "step": 4140 }, { "epoch": 3.4856902356902357, "grad_norm": 0.4288283586502075, "learning_rate": 5.9038159830925114e-06, "loss": 0.3246, "step": 4141 }, { "epoch": 3.4865319865319866, "grad_norm": 0.4049779176712036, "learning_rate": 5.901751606294014e-06, "loss": 0.3227, "step": 4142 }, { "epoch": 3.4873737373737375, "grad_norm": 0.4194290339946747, "learning_rate": 5.899687070597271e-06, "loss": 0.3395, "step": 4143 }, { "epoch": 3.4882154882154883, "grad_norm": 0.4187809228897095, "learning_rate": 5.897622376366072e-06, "loss": 0.3157, "step": 4144 }, { "epoch": 3.489057239057239, "grad_norm": 0.40165361762046814, "learning_rate": 5.895557523964244e-06, "loss": 0.3369, "step": 4145 }, { "epoch": 3.48989898989899, "grad_norm": 0.3862089216709137, "learning_rate": 5.8934925137556294e-06, "loss": 0.3413, "step": 4146 }, { "epoch": 3.490740740740741, "grad_norm": 0.4646933078765869, "learning_rate": 5.89142734610411e-06, "loss": 0.343, "step": 4147 }, { "epoch": 3.4915824915824913, "grad_norm": 0.40569761395454407, "learning_rate": 5.889362021373586e-06, "loss": 0.2983, "step": 4148 }, { "epoch": 3.492424242424242, "grad_norm": 0.38418856263160706, "learning_rate": 5.887296539927994e-06, "loss": 0.3344, "step": 4149 }, { "epoch": 3.493265993265993, "grad_norm": 0.3852887749671936, "learning_rate": 5.885230902131289e-06, "loss": 0.3533, "step": 4150 }, { "epoch": 3.494107744107744, "grad_norm": 0.40007883310317993, "learning_rate": 5.883165108347465e-06, "loss": 0.3157, "step": 4151 }, { "epoch": 3.494949494949495, "grad_norm": 0.42252734303474426, "learning_rate": 5.88109915894053e-06, "loss": 0.3246, "step": 4152 }, { "epoch": 3.4957912457912457, "grad_norm": 0.358029305934906, "learning_rate": 5.8790330542745325e-06, "loss": 0.3375, "step": 4153 }, { "epoch": 3.4966329966329965, "grad_norm": 0.3961159884929657, "learning_rate": 5.876966794713537e-06, "loss": 0.3417, "step": 4154 }, { "epoch": 3.4974747474747474, "grad_norm": 0.41269204020500183, "learning_rate": 5.874900380621642e-06, "loss": 0.3386, "step": 4155 }, { "epoch": 3.4983164983164983, "grad_norm": 0.43729567527770996, "learning_rate": 5.872833812362972e-06, "loss": 0.3105, "step": 4156 }, { "epoch": 3.499158249158249, "grad_norm": 0.391299307346344, "learning_rate": 5.870767090301681e-06, "loss": 0.3169, "step": 4157 }, { "epoch": 3.5, "grad_norm": 0.42899808287620544, "learning_rate": 5.868700214801945e-06, "loss": 0.3292, "step": 4158 }, { "epoch": 3.500841750841751, "grad_norm": 0.38328731060028076, "learning_rate": 5.866633186227968e-06, "loss": 0.3245, "step": 4159 }, { "epoch": 3.5016835016835017, "grad_norm": 0.41315630078315735, "learning_rate": 5.864566004943983e-06, "loss": 0.3241, "step": 4160 }, { "epoch": 3.5025252525252526, "grad_norm": 0.4556531608104706, "learning_rate": 5.862498671314253e-06, "loss": 0.3385, "step": 4161 }, { "epoch": 3.5033670033670035, "grad_norm": 0.4180457890033722, "learning_rate": 5.860431185703062e-06, "loss": 0.3202, "step": 4162 }, { "epoch": 3.5042087542087543, "grad_norm": 0.3725320100784302, "learning_rate": 5.85836354847472e-06, "loss": 0.3209, "step": 4163 }, { "epoch": 3.505050505050505, "grad_norm": 0.42270198464393616, "learning_rate": 5.856295759993571e-06, "loss": 0.3064, "step": 4164 }, { "epoch": 3.505892255892256, "grad_norm": 0.424487441778183, "learning_rate": 5.854227820623978e-06, "loss": 0.3043, "step": 4165 }, { "epoch": 3.506734006734007, "grad_norm": 0.42093127965927124, "learning_rate": 5.852159730730338e-06, "loss": 0.3617, "step": 4166 }, { "epoch": 3.507575757575758, "grad_norm": 0.3800513744354248, "learning_rate": 5.850091490677067e-06, "loss": 0.3333, "step": 4167 }, { "epoch": 3.5084175084175087, "grad_norm": 0.3892977833747864, "learning_rate": 5.848023100828612e-06, "loss": 0.3484, "step": 4168 }, { "epoch": 3.5092592592592595, "grad_norm": 0.41153544187545776, "learning_rate": 5.845954561549445e-06, "loss": 0.3093, "step": 4169 }, { "epoch": 3.51010101010101, "grad_norm": 0.4348304271697998, "learning_rate": 5.843885873204067e-06, "loss": 0.3409, "step": 4170 }, { "epoch": 3.510942760942761, "grad_norm": 0.423814594745636, "learning_rate": 5.841817036156999e-06, "loss": 0.3262, "step": 4171 }, { "epoch": 3.5117845117845117, "grad_norm": 0.41504138708114624, "learning_rate": 5.839748050772796e-06, "loss": 0.3118, "step": 4172 }, { "epoch": 3.5126262626262625, "grad_norm": 0.42785727977752686, "learning_rate": 5.837678917416034e-06, "loss": 0.3558, "step": 4173 }, { "epoch": 3.5134680134680134, "grad_norm": 0.3963812291622162, "learning_rate": 5.835609636451315e-06, "loss": 0.322, "step": 4174 }, { "epoch": 3.5143097643097643, "grad_norm": 0.406194806098938, "learning_rate": 5.833540208243269e-06, "loss": 0.3339, "step": 4175 }, { "epoch": 3.515151515151515, "grad_norm": 0.37990376353263855, "learning_rate": 5.831470633156551e-06, "loss": 0.3324, "step": 4176 }, { "epoch": 3.515993265993266, "grad_norm": 0.3651706874370575, "learning_rate": 5.829400911555845e-06, "loss": 0.3241, "step": 4177 }, { "epoch": 3.516835016835017, "grad_norm": 0.40997907519340515, "learning_rate": 5.827331043805854e-06, "loss": 0.301, "step": 4178 }, { "epoch": 3.5176767676767677, "grad_norm": 0.3884288966655731, "learning_rate": 5.825261030271313e-06, "loss": 0.32, "step": 4179 }, { "epoch": 3.5185185185185186, "grad_norm": 0.4080345034599304, "learning_rate": 5.823190871316979e-06, "loss": 0.3463, "step": 4180 }, { "epoch": 3.5193602693602695, "grad_norm": 0.36604785919189453, "learning_rate": 5.821120567307639e-06, "loss": 0.3339, "step": 4181 }, { "epoch": 3.5202020202020203, "grad_norm": 0.3994137942790985, "learning_rate": 5.819050118608098e-06, "loss": 0.3321, "step": 4182 }, { "epoch": 3.521043771043771, "grad_norm": 0.40680307149887085, "learning_rate": 5.816979525583197e-06, "loss": 0.3409, "step": 4183 }, { "epoch": 3.5218855218855216, "grad_norm": 0.39480987191200256, "learning_rate": 5.81490878859779e-06, "loss": 0.3336, "step": 4184 }, { "epoch": 3.5227272727272725, "grad_norm": 0.39492252469062805, "learning_rate": 5.812837908016771e-06, "loss": 0.301, "step": 4185 }, { "epoch": 3.5235690235690234, "grad_norm": 0.3765878677368164, "learning_rate": 5.810766884205044e-06, "loss": 0.2899, "step": 4186 }, { "epoch": 3.524410774410774, "grad_norm": 0.3933088183403015, "learning_rate": 5.808695717527547e-06, "loss": 0.3196, "step": 4187 }, { "epoch": 3.525252525252525, "grad_norm": 0.40088245272636414, "learning_rate": 5.8066244083492444e-06, "loss": 0.3084, "step": 4188 }, { "epoch": 3.526094276094276, "grad_norm": 0.38398391008377075, "learning_rate": 5.80455295703512e-06, "loss": 0.3239, "step": 4189 }, { "epoch": 3.526936026936027, "grad_norm": 0.3845941722393036, "learning_rate": 5.802481363950187e-06, "loss": 0.3456, "step": 4190 }, { "epoch": 3.5277777777777777, "grad_norm": 0.4194495677947998, "learning_rate": 5.800409629459484e-06, "loss": 0.3295, "step": 4191 }, { "epoch": 3.5286195286195285, "grad_norm": 0.41500237584114075, "learning_rate": 5.7983377539280695e-06, "loss": 0.3099, "step": 4192 }, { "epoch": 3.5294612794612794, "grad_norm": 0.39243075251579285, "learning_rate": 5.7962657377210304e-06, "loss": 0.3184, "step": 4193 }, { "epoch": 3.5303030303030303, "grad_norm": 0.422048956155777, "learning_rate": 5.79419358120348e-06, "loss": 0.316, "step": 4194 }, { "epoch": 3.531144781144781, "grad_norm": 0.40700894594192505, "learning_rate": 5.792121284740553e-06, "loss": 0.3044, "step": 4195 }, { "epoch": 3.531986531986532, "grad_norm": 0.40869376063346863, "learning_rate": 5.790048848697412e-06, "loss": 0.3289, "step": 4196 }, { "epoch": 3.532828282828283, "grad_norm": 0.4131866991519928, "learning_rate": 5.7879762734392375e-06, "loss": 0.3169, "step": 4197 }, { "epoch": 3.5336700336700337, "grad_norm": 0.4471578896045685, "learning_rate": 5.7859035593312465e-06, "loss": 0.3275, "step": 4198 }, { "epoch": 3.5345117845117846, "grad_norm": 0.4246121048927307, "learning_rate": 5.783830706738667e-06, "loss": 0.2974, "step": 4199 }, { "epoch": 3.5353535353535355, "grad_norm": 0.4249908924102783, "learning_rate": 5.781757716026761e-06, "loss": 0.3452, "step": 4200 }, { "epoch": 3.5361952861952863, "grad_norm": 0.403935968875885, "learning_rate": 5.7796845875608095e-06, "loss": 0.3178, "step": 4201 }, { "epoch": 3.537037037037037, "grad_norm": 0.40684136748313904, "learning_rate": 5.777611321706125e-06, "loss": 0.3397, "step": 4202 }, { "epoch": 3.537878787878788, "grad_norm": 0.43782293796539307, "learning_rate": 5.77553791882803e-06, "loss": 0.2931, "step": 4203 }, { "epoch": 3.538720538720539, "grad_norm": 0.40430375933647156, "learning_rate": 5.773464379291889e-06, "loss": 0.3153, "step": 4204 }, { "epoch": 3.53956228956229, "grad_norm": 0.4167107045650482, "learning_rate": 5.771390703463076e-06, "loss": 0.3297, "step": 4205 }, { "epoch": 3.5404040404040407, "grad_norm": 0.40553411841392517, "learning_rate": 5.7693168917069984e-06, "loss": 0.3376, "step": 4206 }, { "epoch": 3.541245791245791, "grad_norm": 0.45789825916290283, "learning_rate": 5.767242944389082e-06, "loss": 0.3014, "step": 4207 }, { "epoch": 3.542087542087542, "grad_norm": 0.3929498791694641, "learning_rate": 5.765168861874779e-06, "loss": 0.3016, "step": 4208 }, { "epoch": 3.542929292929293, "grad_norm": 0.39988040924072266, "learning_rate": 5.7630946445295646e-06, "loss": 0.3129, "step": 4209 }, { "epoch": 3.5437710437710437, "grad_norm": 0.4464922547340393, "learning_rate": 5.7610202927189394e-06, "loss": 0.3261, "step": 4210 }, { "epoch": 3.5446127946127945, "grad_norm": 0.38858747482299805, "learning_rate": 5.758945806808424e-06, "loss": 0.332, "step": 4211 }, { "epoch": 3.5454545454545454, "grad_norm": 0.392403244972229, "learning_rate": 5.756871187163565e-06, "loss": 0.3308, "step": 4212 }, { "epoch": 3.5462962962962963, "grad_norm": 0.4103482961654663, "learning_rate": 5.754796434149934e-06, "loss": 0.3301, "step": 4213 }, { "epoch": 3.547138047138047, "grad_norm": 0.3967682421207428, "learning_rate": 5.752721548133125e-06, "loss": 0.3253, "step": 4214 }, { "epoch": 3.547979797979798, "grad_norm": 0.37121155858039856, "learning_rate": 5.750646529478754e-06, "loss": 0.3282, "step": 4215 }, { "epoch": 3.548821548821549, "grad_norm": 0.39109915494918823, "learning_rate": 5.74857137855246e-06, "loss": 0.3344, "step": 4216 }, { "epoch": 3.5496632996632997, "grad_norm": 0.4260077178478241, "learning_rate": 5.74649609571991e-06, "loss": 0.3111, "step": 4217 }, { "epoch": 3.5505050505050506, "grad_norm": 0.37072718143463135, "learning_rate": 5.744420681346788e-06, "loss": 0.3128, "step": 4218 }, { "epoch": 3.5513468013468015, "grad_norm": 0.3785651624202728, "learning_rate": 5.742345135798807e-06, "loss": 0.3343, "step": 4219 }, { "epoch": 3.5521885521885523, "grad_norm": 0.3748045563697815, "learning_rate": 5.740269459441697e-06, "loss": 0.3525, "step": 4220 }, { "epoch": 3.5530303030303028, "grad_norm": 0.39653000235557556, "learning_rate": 5.7381936526412165e-06, "loss": 0.3357, "step": 4221 }, { "epoch": 3.5538720538720536, "grad_norm": 0.40738850831985474, "learning_rate": 5.7361177157631445e-06, "loss": 0.3667, "step": 4222 }, { "epoch": 3.5547138047138045, "grad_norm": 0.37655436992645264, "learning_rate": 5.734041649173283e-06, "loss": 0.3379, "step": 4223 }, { "epoch": 3.5555555555555554, "grad_norm": 0.39410942792892456, "learning_rate": 5.731965453237456e-06, "loss": 0.355, "step": 4224 }, { "epoch": 3.5563973063973062, "grad_norm": 0.3713516891002655, "learning_rate": 5.729889128321514e-06, "loss": 0.2993, "step": 4225 }, { "epoch": 3.557239057239057, "grad_norm": 0.39740481972694397, "learning_rate": 5.727812674791324e-06, "loss": 0.3385, "step": 4226 }, { "epoch": 3.558080808080808, "grad_norm": 0.41071921586990356, "learning_rate": 5.725736093012782e-06, "loss": 0.3384, "step": 4227 }, { "epoch": 3.558922558922559, "grad_norm": 0.4491153359413147, "learning_rate": 5.723659383351803e-06, "loss": 0.3567, "step": 4228 }, { "epoch": 3.5597643097643097, "grad_norm": 0.37240415811538696, "learning_rate": 5.721582546174326e-06, "loss": 0.3455, "step": 4229 }, { "epoch": 3.5606060606060606, "grad_norm": 0.4042244851589203, "learning_rate": 5.719505581846312e-06, "loss": 0.3198, "step": 4230 }, { "epoch": 3.5614478114478114, "grad_norm": 0.45436927676200867, "learning_rate": 5.717428490733742e-06, "loss": 0.3313, "step": 4231 }, { "epoch": 3.5622895622895623, "grad_norm": 0.39612945914268494, "learning_rate": 5.715351273202623e-06, "loss": 0.3277, "step": 4232 }, { "epoch": 3.563131313131313, "grad_norm": 0.4265810251235962, "learning_rate": 5.7132739296189835e-06, "loss": 0.3048, "step": 4233 }, { "epoch": 3.563973063973064, "grad_norm": 0.45272934436798096, "learning_rate": 5.711196460348873e-06, "loss": 0.3391, "step": 4234 }, { "epoch": 3.564814814814815, "grad_norm": 0.41331997513771057, "learning_rate": 5.709118865758364e-06, "loss": 0.3333, "step": 4235 }, { "epoch": 3.5656565656565657, "grad_norm": 0.3919621706008911, "learning_rate": 5.707041146213551e-06, "loss": 0.3249, "step": 4236 }, { "epoch": 3.5664983164983166, "grad_norm": 0.3702787458896637, "learning_rate": 5.704963302080548e-06, "loss": 0.3381, "step": 4237 }, { "epoch": 3.5673400673400675, "grad_norm": 0.37654611468315125, "learning_rate": 5.702885333725498e-06, "loss": 0.3262, "step": 4238 }, { "epoch": 3.5681818181818183, "grad_norm": 0.40419602394104004, "learning_rate": 5.700807241514557e-06, "loss": 0.3487, "step": 4239 }, { "epoch": 3.569023569023569, "grad_norm": 0.39463356137275696, "learning_rate": 5.698729025813908e-06, "loss": 0.3275, "step": 4240 }, { "epoch": 3.56986531986532, "grad_norm": 0.39771780371665955, "learning_rate": 5.696650686989756e-06, "loss": 0.311, "step": 4241 }, { "epoch": 3.570707070707071, "grad_norm": 0.4118492007255554, "learning_rate": 5.694572225408327e-06, "loss": 0.3414, "step": 4242 }, { "epoch": 3.571548821548822, "grad_norm": 0.49153485894203186, "learning_rate": 5.692493641435863e-06, "loss": 0.3304, "step": 4243 }, { "epoch": 3.5723905723905722, "grad_norm": 0.3876871168613434, "learning_rate": 5.6904149354386405e-06, "loss": 0.3462, "step": 4244 }, { "epoch": 3.573232323232323, "grad_norm": 0.3923969268798828, "learning_rate": 5.688336107782944e-06, "loss": 0.322, "step": 4245 }, { "epoch": 3.574074074074074, "grad_norm": 0.45791691541671753, "learning_rate": 5.6862571588350854e-06, "loss": 0.3128, "step": 4246 }, { "epoch": 3.574915824915825, "grad_norm": 0.4174768328666687, "learning_rate": 5.684178088961401e-06, "loss": 0.3148, "step": 4247 }, { "epoch": 3.5757575757575757, "grad_norm": 0.42788249254226685, "learning_rate": 5.682098898528241e-06, "loss": 0.3453, "step": 4248 }, { "epoch": 3.5765993265993266, "grad_norm": 0.37882721424102783, "learning_rate": 5.680019587901985e-06, "loss": 0.3483, "step": 4249 }, { "epoch": 3.5774410774410774, "grad_norm": 0.374235600233078, "learning_rate": 5.677940157449027e-06, "loss": 0.3252, "step": 4250 }, { "epoch": 3.5782828282828283, "grad_norm": 0.4337380826473236, "learning_rate": 5.675860607535785e-06, "loss": 0.3137, "step": 4251 }, { "epoch": 3.579124579124579, "grad_norm": 0.40273502469062805, "learning_rate": 5.6737809385286975e-06, "loss": 0.3355, "step": 4252 }, { "epoch": 3.57996632996633, "grad_norm": 0.37956365942955017, "learning_rate": 5.671701150794225e-06, "loss": 0.3291, "step": 4253 }, { "epoch": 3.580808080808081, "grad_norm": 0.3770226836204529, "learning_rate": 5.66962124469885e-06, "loss": 0.3121, "step": 4254 }, { "epoch": 3.5816498316498318, "grad_norm": 0.44081664085388184, "learning_rate": 5.667541220609073e-06, "loss": 0.3283, "step": 4255 }, { "epoch": 3.5824915824915826, "grad_norm": 0.40146979689598083, "learning_rate": 5.665461078891415e-06, "loss": 0.3306, "step": 4256 }, { "epoch": 3.5833333333333335, "grad_norm": 0.37491369247436523, "learning_rate": 5.6633808199124195e-06, "loss": 0.3266, "step": 4257 }, { "epoch": 3.584175084175084, "grad_norm": 0.43921002745628357, "learning_rate": 5.6613004440386525e-06, "loss": 0.3244, "step": 4258 }, { "epoch": 3.5850168350168348, "grad_norm": 0.4300937056541443, "learning_rate": 5.659219951636695e-06, "loss": 0.3158, "step": 4259 }, { "epoch": 3.5858585858585856, "grad_norm": 0.3914637267589569, "learning_rate": 5.6571393430731545e-06, "loss": 0.3241, "step": 4260 }, { "epoch": 3.5867003367003365, "grad_norm": 0.4254450798034668, "learning_rate": 5.655058618714656e-06, "loss": 0.3262, "step": 4261 }, { "epoch": 3.5875420875420874, "grad_norm": 0.41685688495635986, "learning_rate": 5.652977778927845e-06, "loss": 0.3128, "step": 4262 }, { "epoch": 3.5883838383838382, "grad_norm": 0.4315827190876007, "learning_rate": 5.650896824079388e-06, "loss": 0.3333, "step": 4263 }, { "epoch": 3.589225589225589, "grad_norm": 0.41744592785835266, "learning_rate": 5.648815754535971e-06, "loss": 0.3179, "step": 4264 }, { "epoch": 3.59006734006734, "grad_norm": 0.4113788604736328, "learning_rate": 5.6467345706643e-06, "loss": 0.3243, "step": 4265 }, { "epoch": 3.590909090909091, "grad_norm": 0.3999469578266144, "learning_rate": 5.644653272831103e-06, "loss": 0.347, "step": 4266 }, { "epoch": 3.5917508417508417, "grad_norm": 0.45094603300094604, "learning_rate": 5.642571861403127e-06, "loss": 0.3055, "step": 4267 }, { "epoch": 3.5925925925925926, "grad_norm": 0.40511757135391235, "learning_rate": 5.640490336747138e-06, "loss": 0.3767, "step": 4268 }, { "epoch": 3.5934343434343434, "grad_norm": 0.3806358575820923, "learning_rate": 5.638408699229922e-06, "loss": 0.3563, "step": 4269 }, { "epoch": 3.5942760942760943, "grad_norm": 0.37735220789909363, "learning_rate": 5.63632694921829e-06, "loss": 0.3397, "step": 4270 }, { "epoch": 3.595117845117845, "grad_norm": 0.4274913966655731, "learning_rate": 5.634245087079062e-06, "loss": 0.3173, "step": 4271 }, { "epoch": 3.595959595959596, "grad_norm": 0.43270108103752136, "learning_rate": 5.6321631131790886e-06, "loss": 0.3266, "step": 4272 }, { "epoch": 3.596801346801347, "grad_norm": 0.4186131954193115, "learning_rate": 5.6300810278852345e-06, "loss": 0.3235, "step": 4273 }, { "epoch": 3.5976430976430978, "grad_norm": 0.42028355598449707, "learning_rate": 5.627998831564387e-06, "loss": 0.3227, "step": 4274 }, { "epoch": 3.5984848484848486, "grad_norm": 0.43381404876708984, "learning_rate": 5.625916524583449e-06, "loss": 0.2969, "step": 4275 }, { "epoch": 3.5993265993265995, "grad_norm": 0.3960195481777191, "learning_rate": 5.623834107309347e-06, "loss": 0.3443, "step": 4276 }, { "epoch": 3.6001683501683504, "grad_norm": 0.41211310029029846, "learning_rate": 5.621751580109023e-06, "loss": 0.312, "step": 4277 }, { "epoch": 3.601010101010101, "grad_norm": 0.4284791648387909, "learning_rate": 5.619668943349444e-06, "loss": 0.3322, "step": 4278 }, { "epoch": 3.601851851851852, "grad_norm": 0.4053880572319031, "learning_rate": 5.617586197397589e-06, "loss": 0.3206, "step": 4279 }, { "epoch": 3.602693602693603, "grad_norm": 0.39853036403656006, "learning_rate": 5.615503342620462e-06, "loss": 0.3347, "step": 4280 }, { "epoch": 3.6035353535353534, "grad_norm": 0.35939133167266846, "learning_rate": 5.613420379385084e-06, "loss": 0.3346, "step": 4281 }, { "epoch": 3.6043771043771042, "grad_norm": 0.4100145697593689, "learning_rate": 5.6113373080584954e-06, "loss": 0.3403, "step": 4282 }, { "epoch": 3.605218855218855, "grad_norm": 0.39269503951072693, "learning_rate": 5.609254129007755e-06, "loss": 0.3242, "step": 4283 }, { "epoch": 3.606060606060606, "grad_norm": 0.40499478578567505, "learning_rate": 5.607170842599942e-06, "loss": 0.3052, "step": 4284 }, { "epoch": 3.606902356902357, "grad_norm": 0.41077813506126404, "learning_rate": 5.6050874492021525e-06, "loss": 0.3434, "step": 4285 }, { "epoch": 3.6077441077441077, "grad_norm": 0.39929908514022827, "learning_rate": 5.603003949181503e-06, "loss": 0.3266, "step": 4286 }, { "epoch": 3.6085858585858586, "grad_norm": 0.39706477522850037, "learning_rate": 5.600920342905132e-06, "loss": 0.3154, "step": 4287 }, { "epoch": 3.6094276094276094, "grad_norm": 0.40550869703292847, "learning_rate": 5.5988366307401854e-06, "loss": 0.3353, "step": 4288 }, { "epoch": 3.6102693602693603, "grad_norm": 0.4059295356273651, "learning_rate": 5.596752813053843e-06, "loss": 0.3222, "step": 4289 }, { "epoch": 3.611111111111111, "grad_norm": 0.3902037739753723, "learning_rate": 5.594668890213292e-06, "loss": 0.3387, "step": 4290 }, { "epoch": 3.611952861952862, "grad_norm": 0.3999636769294739, "learning_rate": 5.592584862585742e-06, "loss": 0.316, "step": 4291 }, { "epoch": 3.612794612794613, "grad_norm": 0.4184543490409851, "learning_rate": 5.5905007305384216e-06, "loss": 0.3403, "step": 4292 }, { "epoch": 3.6136363636363638, "grad_norm": 0.45032477378845215, "learning_rate": 5.588416494438576e-06, "loss": 0.3274, "step": 4293 }, { "epoch": 3.6144781144781146, "grad_norm": 0.3988777697086334, "learning_rate": 5.586332154653472e-06, "loss": 0.329, "step": 4294 }, { "epoch": 3.615319865319865, "grad_norm": 0.3932662904262543, "learning_rate": 5.5842477115503925e-06, "loss": 0.3323, "step": 4295 }, { "epoch": 3.616161616161616, "grad_norm": 0.42845645546913147, "learning_rate": 5.582163165496635e-06, "loss": 0.3287, "step": 4296 }, { "epoch": 3.6170033670033668, "grad_norm": 0.4209056496620178, "learning_rate": 5.580078516859521e-06, "loss": 0.3233, "step": 4297 }, { "epoch": 3.6178451178451176, "grad_norm": 0.42147159576416016, "learning_rate": 5.57799376600639e-06, "loss": 0.325, "step": 4298 }, { "epoch": 3.6186868686868685, "grad_norm": 0.3983517289161682, "learning_rate": 5.575908913304594e-06, "loss": 0.3218, "step": 4299 }, { "epoch": 3.6195286195286194, "grad_norm": 0.4013172388076782, "learning_rate": 5.573823959121507e-06, "loss": 0.3231, "step": 4300 }, { "epoch": 3.6203703703703702, "grad_norm": 0.43529024720191956, "learning_rate": 5.571738903824521e-06, "loss": 0.3351, "step": 4301 }, { "epoch": 3.621212121212121, "grad_norm": 0.42513900995254517, "learning_rate": 5.569653747781045e-06, "loss": 0.3354, "step": 4302 }, { "epoch": 3.622053872053872, "grad_norm": 0.4178805351257324, "learning_rate": 5.567568491358505e-06, "loss": 0.308, "step": 4303 }, { "epoch": 3.622895622895623, "grad_norm": 0.4217807948589325, "learning_rate": 5.565483134924345e-06, "loss": 0.341, "step": 4304 }, { "epoch": 3.6237373737373737, "grad_norm": 0.38220658898353577, "learning_rate": 5.563397678846029e-06, "loss": 0.3587, "step": 4305 }, { "epoch": 3.6245791245791246, "grad_norm": 0.42034733295440674, "learning_rate": 5.5613121234910365e-06, "loss": 0.3258, "step": 4306 }, { "epoch": 3.6254208754208754, "grad_norm": 0.4297569990158081, "learning_rate": 5.559226469226859e-06, "loss": 0.3103, "step": 4307 }, { "epoch": 3.6262626262626263, "grad_norm": 0.4065197706222534, "learning_rate": 5.55714071642102e-06, "loss": 0.3228, "step": 4308 }, { "epoch": 3.627104377104377, "grad_norm": 0.4051803946495056, "learning_rate": 5.555054865441044e-06, "loss": 0.3356, "step": 4309 }, { "epoch": 3.627946127946128, "grad_norm": 0.4119677245616913, "learning_rate": 5.552968916654484e-06, "loss": 0.3347, "step": 4310 }, { "epoch": 3.628787878787879, "grad_norm": 0.38536494970321655, "learning_rate": 5.5508828704289066e-06, "loss": 0.3131, "step": 4311 }, { "epoch": 3.6296296296296298, "grad_norm": 0.43744024634361267, "learning_rate": 5.548796727131892e-06, "loss": 0.3326, "step": 4312 }, { "epoch": 3.6304713804713806, "grad_norm": 0.42148128151893616, "learning_rate": 5.546710487131045e-06, "loss": 0.3219, "step": 4313 }, { "epoch": 3.6313131313131315, "grad_norm": 0.41357001662254333, "learning_rate": 5.5446241507939815e-06, "loss": 0.3301, "step": 4314 }, { "epoch": 3.6321548821548824, "grad_norm": 0.4379352331161499, "learning_rate": 5.5425377184883345e-06, "loss": 0.3308, "step": 4315 }, { "epoch": 3.6329966329966332, "grad_norm": 0.38944751024246216, "learning_rate": 5.540451190581759e-06, "loss": 0.3296, "step": 4316 }, { "epoch": 3.633838383838384, "grad_norm": 0.4253859519958496, "learning_rate": 5.538364567441921e-06, "loss": 0.3354, "step": 4317 }, { "epoch": 3.634680134680135, "grad_norm": 0.4457274377346039, "learning_rate": 5.536277849436507e-06, "loss": 0.3303, "step": 4318 }, { "epoch": 3.6355218855218854, "grad_norm": 0.3902823328971863, "learning_rate": 5.534191036933216e-06, "loss": 0.3226, "step": 4319 }, { "epoch": 3.6363636363636362, "grad_norm": 0.4117942154407501, "learning_rate": 5.532104130299771e-06, "loss": 0.3348, "step": 4320 }, { "epoch": 3.637205387205387, "grad_norm": 0.39369040727615356, "learning_rate": 5.530017129903906e-06, "loss": 0.3216, "step": 4321 }, { "epoch": 3.638047138047138, "grad_norm": 0.43550899624824524, "learning_rate": 5.527930036113371e-06, "loss": 0.3203, "step": 4322 }, { "epoch": 3.638888888888889, "grad_norm": 0.346449613571167, "learning_rate": 5.525842849295936e-06, "loss": 0.3403, "step": 4323 }, { "epoch": 3.6397306397306397, "grad_norm": 0.38947927951812744, "learning_rate": 5.523755569819384e-06, "loss": 0.3434, "step": 4324 }, { "epoch": 3.6405723905723906, "grad_norm": 0.38055694103240967, "learning_rate": 5.521668198051517e-06, "loss": 0.3224, "step": 4325 }, { "epoch": 3.6414141414141414, "grad_norm": 0.37555667757987976, "learning_rate": 5.51958073436015e-06, "loss": 0.316, "step": 4326 }, { "epoch": 3.6422558922558923, "grad_norm": 0.4099769592285156, "learning_rate": 5.517493179113121e-06, "loss": 0.3217, "step": 4327 }, { "epoch": 3.643097643097643, "grad_norm": 0.3825104236602783, "learning_rate": 5.515405532678274e-06, "loss": 0.2916, "step": 4328 }, { "epoch": 3.643939393939394, "grad_norm": 0.3925319015979767, "learning_rate": 5.51331779542348e-06, "loss": 0.3143, "step": 4329 }, { "epoch": 3.644781144781145, "grad_norm": 0.36923131346702576, "learning_rate": 5.511229967716616e-06, "loss": 0.3389, "step": 4330 }, { "epoch": 3.6456228956228958, "grad_norm": 0.4032166302204132, "learning_rate": 5.509142049925581e-06, "loss": 0.3361, "step": 4331 }, { "epoch": 3.6464646464646466, "grad_norm": 0.3993125259876251, "learning_rate": 5.507054042418289e-06, "loss": 0.3453, "step": 4332 }, { "epoch": 3.647306397306397, "grad_norm": 0.3773635923862457, "learning_rate": 5.504965945562668e-06, "loss": 0.3155, "step": 4333 }, { "epoch": 3.648148148148148, "grad_norm": 0.40735307335853577, "learning_rate": 5.502877759726665e-06, "loss": 0.3301, "step": 4334 }, { "epoch": 3.648989898989899, "grad_norm": 0.3708822429180145, "learning_rate": 5.50078948527824e-06, "loss": 0.3155, "step": 4335 }, { "epoch": 3.6498316498316496, "grad_norm": 0.38617101311683655, "learning_rate": 5.498701122585367e-06, "loss": 0.3129, "step": 4336 }, { "epoch": 3.6506734006734005, "grad_norm": 0.3976280093193054, "learning_rate": 5.49661267201604e-06, "loss": 0.3463, "step": 4337 }, { "epoch": 3.6515151515151514, "grad_norm": 0.38836973905563354, "learning_rate": 5.494524133938265e-06, "loss": 0.3288, "step": 4338 }, { "epoch": 3.6523569023569022, "grad_norm": 0.39616069197654724, "learning_rate": 5.492435508720065e-06, "loss": 0.3219, "step": 4339 }, { "epoch": 3.653198653198653, "grad_norm": 0.4003806412220001, "learning_rate": 5.490346796729481e-06, "loss": 0.3351, "step": 4340 }, { "epoch": 3.654040404040404, "grad_norm": 0.37995412945747375, "learning_rate": 5.488257998334559e-06, "loss": 0.3401, "step": 4341 }, { "epoch": 3.654882154882155, "grad_norm": 0.3780573606491089, "learning_rate": 5.486169113903377e-06, "loss": 0.3219, "step": 4342 }, { "epoch": 3.6557239057239057, "grad_norm": 0.409850150346756, "learning_rate": 5.4840801438040115e-06, "loss": 0.3108, "step": 4343 }, { "epoch": 3.6565656565656566, "grad_norm": 0.3886148929595947, "learning_rate": 5.481991088404563e-06, "loss": 0.3369, "step": 4344 }, { "epoch": 3.6574074074074074, "grad_norm": 0.3735779821872711, "learning_rate": 5.479901948073148e-06, "loss": 0.331, "step": 4345 }, { "epoch": 3.6582491582491583, "grad_norm": 0.4037837088108063, "learning_rate": 5.477812723177892e-06, "loss": 0.3347, "step": 4346 }, { "epoch": 3.659090909090909, "grad_norm": 0.4252997636795044, "learning_rate": 5.475723414086939e-06, "loss": 0.3451, "step": 4347 }, { "epoch": 3.65993265993266, "grad_norm": 0.3739151060581207, "learning_rate": 5.47363402116845e-06, "loss": 0.3228, "step": 4348 }, { "epoch": 3.660774410774411, "grad_norm": 0.4328724443912506, "learning_rate": 5.471544544790596e-06, "loss": 0.3044, "step": 4349 }, { "epoch": 3.6616161616161618, "grad_norm": 0.38872888684272766, "learning_rate": 5.469454985321567e-06, "loss": 0.3096, "step": 4350 }, { "epoch": 3.6624579124579126, "grad_norm": 0.39996904134750366, "learning_rate": 5.467365343129564e-06, "loss": 0.3359, "step": 4351 }, { "epoch": 3.6632996632996635, "grad_norm": 0.405546098947525, "learning_rate": 5.465275618582805e-06, "loss": 0.3275, "step": 4352 }, { "epoch": 3.6641414141414144, "grad_norm": 0.46989449858665466, "learning_rate": 5.46318581204952e-06, "loss": 0.3584, "step": 4353 }, { "epoch": 3.6649831649831652, "grad_norm": 0.3926140367984772, "learning_rate": 5.46109592389796e-06, "loss": 0.3369, "step": 4354 }, { "epoch": 3.665824915824916, "grad_norm": 0.3873187005519867, "learning_rate": 5.45900595449638e-06, "loss": 0.3389, "step": 4355 }, { "epoch": 3.6666666666666665, "grad_norm": 0.40739521384239197, "learning_rate": 5.456915904213057e-06, "loss": 0.3271, "step": 4356 }, { "epoch": 3.6675084175084174, "grad_norm": 0.4327694773674011, "learning_rate": 5.454825773416281e-06, "loss": 0.3439, "step": 4357 }, { "epoch": 3.6683501683501682, "grad_norm": 0.39496034383773804, "learning_rate": 5.452735562474354e-06, "loss": 0.3364, "step": 4358 }, { "epoch": 3.669191919191919, "grad_norm": 0.42558109760284424, "learning_rate": 5.450645271755596e-06, "loss": 0.3217, "step": 4359 }, { "epoch": 3.67003367003367, "grad_norm": 0.44530874490737915, "learning_rate": 5.448554901628334e-06, "loss": 0.3276, "step": 4360 }, { "epoch": 3.670875420875421, "grad_norm": 0.3968072831630707, "learning_rate": 5.446464452460917e-06, "loss": 0.306, "step": 4361 }, { "epoch": 3.6717171717171717, "grad_norm": 0.3815290629863739, "learning_rate": 5.444373924621703e-06, "loss": 0.3533, "step": 4362 }, { "epoch": 3.6725589225589226, "grad_norm": 0.4160464406013489, "learning_rate": 5.442283318479067e-06, "loss": 0.3283, "step": 4363 }, { "epoch": 3.6734006734006734, "grad_norm": 0.3947882354259491, "learning_rate": 5.440192634401395e-06, "loss": 0.3124, "step": 4364 }, { "epoch": 3.6742424242424243, "grad_norm": 0.36554086208343506, "learning_rate": 5.438101872757087e-06, "loss": 0.3384, "step": 4365 }, { "epoch": 3.675084175084175, "grad_norm": 0.41855865716934204, "learning_rate": 5.436011033914557e-06, "loss": 0.3103, "step": 4366 }, { "epoch": 3.675925925925926, "grad_norm": 0.3992498219013214, "learning_rate": 5.433920118242237e-06, "loss": 0.3143, "step": 4367 }, { "epoch": 3.676767676767677, "grad_norm": 0.38479676842689514, "learning_rate": 5.431829126108563e-06, "loss": 0.3229, "step": 4368 }, { "epoch": 3.6776094276094278, "grad_norm": 0.42273467779159546, "learning_rate": 5.429738057881996e-06, "loss": 0.313, "step": 4369 }, { "epoch": 3.678451178451178, "grad_norm": 0.40815624594688416, "learning_rate": 5.427646913931001e-06, "loss": 0.3446, "step": 4370 }, { "epoch": 3.679292929292929, "grad_norm": 0.39746686816215515, "learning_rate": 5.425555694624058e-06, "loss": 0.3468, "step": 4371 }, { "epoch": 3.68013468013468, "grad_norm": 0.44955411553382874, "learning_rate": 5.423464400329666e-06, "loss": 0.3252, "step": 4372 }, { "epoch": 3.680976430976431, "grad_norm": 0.4293909966945648, "learning_rate": 5.421373031416332e-06, "loss": 0.3283, "step": 4373 }, { "epoch": 3.6818181818181817, "grad_norm": 0.39264601469039917, "learning_rate": 5.41928158825258e-06, "loss": 0.3188, "step": 4374 }, { "epoch": 3.6826599326599325, "grad_norm": 0.43571630120277405, "learning_rate": 5.417190071206939e-06, "loss": 0.3131, "step": 4375 }, { "epoch": 3.6835016835016834, "grad_norm": 0.46533337235450745, "learning_rate": 5.41509848064796e-06, "loss": 0.3348, "step": 4376 }, { "epoch": 3.6843434343434343, "grad_norm": 0.41986191272735596, "learning_rate": 5.413006816944204e-06, "loss": 0.3348, "step": 4377 }, { "epoch": 3.685185185185185, "grad_norm": 0.4031005799770355, "learning_rate": 5.410915080464243e-06, "loss": 0.3275, "step": 4378 }, { "epoch": 3.686026936026936, "grad_norm": 0.40171313285827637, "learning_rate": 5.4088232715766634e-06, "loss": 0.3312, "step": 4379 }, { "epoch": 3.686868686868687, "grad_norm": 0.4242055416107178, "learning_rate": 5.4067313906500664e-06, "loss": 0.3239, "step": 4380 }, { "epoch": 3.6877104377104377, "grad_norm": 0.4245125651359558, "learning_rate": 5.40463943805306e-06, "loss": 0.3242, "step": 4381 }, { "epoch": 3.6885521885521886, "grad_norm": 0.41521310806274414, "learning_rate": 5.4025474141542724e-06, "loss": 0.3594, "step": 4382 }, { "epoch": 3.6893939393939394, "grad_norm": 0.39296963810920715, "learning_rate": 5.400455319322337e-06, "loss": 0.3401, "step": 4383 }, { "epoch": 3.6902356902356903, "grad_norm": 0.4164401888847351, "learning_rate": 5.398363153925905e-06, "loss": 0.32, "step": 4384 }, { "epoch": 3.691077441077441, "grad_norm": 0.4089939296245575, "learning_rate": 5.3962709183336396e-06, "loss": 0.3213, "step": 4385 }, { "epoch": 3.691919191919192, "grad_norm": 0.44276732206344604, "learning_rate": 5.3941786129142135e-06, "loss": 0.326, "step": 4386 }, { "epoch": 3.692760942760943, "grad_norm": 0.4038149118423462, "learning_rate": 5.392086238036311e-06, "loss": 0.3254, "step": 4387 }, { "epoch": 3.6936026936026938, "grad_norm": 0.4423997700214386, "learning_rate": 5.389993794068636e-06, "loss": 0.3214, "step": 4388 }, { "epoch": 3.6944444444444446, "grad_norm": 0.39982524514198303, "learning_rate": 5.387901281379894e-06, "loss": 0.2841, "step": 4389 }, { "epoch": 3.6952861952861955, "grad_norm": 0.40905487537384033, "learning_rate": 5.385808700338812e-06, "loss": 0.312, "step": 4390 }, { "epoch": 3.6961279461279464, "grad_norm": 0.4158104360103607, "learning_rate": 5.383716051314125e-06, "loss": 0.3297, "step": 4391 }, { "epoch": 3.6969696969696972, "grad_norm": 0.4283987879753113, "learning_rate": 5.3816233346745775e-06, "loss": 0.3502, "step": 4392 }, { "epoch": 3.6978114478114477, "grad_norm": 0.4002005457878113, "learning_rate": 5.37953055078893e-06, "loss": 0.335, "step": 4393 }, { "epoch": 3.6986531986531985, "grad_norm": 0.4118419289588928, "learning_rate": 5.3774377000259534e-06, "loss": 0.3327, "step": 4394 }, { "epoch": 3.6994949494949494, "grad_norm": 0.4804874062538147, "learning_rate": 5.375344782754429e-06, "loss": 0.3288, "step": 4395 }, { "epoch": 3.7003367003367003, "grad_norm": 0.4192589819431305, "learning_rate": 5.373251799343153e-06, "loss": 0.3281, "step": 4396 }, { "epoch": 3.701178451178451, "grad_norm": 0.411724328994751, "learning_rate": 5.3711587501609295e-06, "loss": 0.3222, "step": 4397 }, { "epoch": 3.702020202020202, "grad_norm": 0.4194585978984833, "learning_rate": 5.369065635576578e-06, "loss": 0.3356, "step": 4398 }, { "epoch": 3.702861952861953, "grad_norm": 0.4142405092716217, "learning_rate": 5.366972455958928e-06, "loss": 0.3348, "step": 4399 }, { "epoch": 3.7037037037037037, "grad_norm": 0.4446183741092682, "learning_rate": 5.364879211676817e-06, "loss": 0.3183, "step": 4400 }, { "epoch": 3.7045454545454546, "grad_norm": 0.39882805943489075, "learning_rate": 5.362785903099099e-06, "loss": 0.3094, "step": 4401 }, { "epoch": 3.7053872053872055, "grad_norm": 0.4384702146053314, "learning_rate": 5.3606925305946374e-06, "loss": 0.322, "step": 4402 }, { "epoch": 3.7062289562289563, "grad_norm": 0.4203278720378876, "learning_rate": 5.358599094532306e-06, "loss": 0.3234, "step": 4403 }, { "epoch": 3.707070707070707, "grad_norm": 0.4149571657180786, "learning_rate": 5.356505595280992e-06, "loss": 0.3318, "step": 4404 }, { "epoch": 3.707912457912458, "grad_norm": 0.41561809182167053, "learning_rate": 5.3544120332095905e-06, "loss": 0.3192, "step": 4405 }, { "epoch": 3.708754208754209, "grad_norm": 0.4518221616744995, "learning_rate": 5.352318408687009e-06, "loss": 0.3384, "step": 4406 }, { "epoch": 3.7095959595959593, "grad_norm": 0.3929821252822876, "learning_rate": 5.350224722082171e-06, "loss": 0.3543, "step": 4407 }, { "epoch": 3.71043771043771, "grad_norm": 0.4175344407558441, "learning_rate": 5.348130973764e-06, "loss": 0.3276, "step": 4408 }, { "epoch": 3.711279461279461, "grad_norm": 0.3921108841896057, "learning_rate": 5.346037164101442e-06, "loss": 0.3246, "step": 4409 }, { "epoch": 3.712121212121212, "grad_norm": 0.4243178367614746, "learning_rate": 5.343943293463445e-06, "loss": 0.3192, "step": 4410 }, { "epoch": 3.712962962962963, "grad_norm": 0.39457041025161743, "learning_rate": 5.341849362218973e-06, "loss": 0.3363, "step": 4411 }, { "epoch": 3.7138047138047137, "grad_norm": 0.3963141143321991, "learning_rate": 5.339755370736999e-06, "loss": 0.292, "step": 4412 }, { "epoch": 3.7146464646464645, "grad_norm": 0.39721816778182983, "learning_rate": 5.337661319386506e-06, "loss": 0.3038, "step": 4413 }, { "epoch": 3.7154882154882154, "grad_norm": 0.4405158758163452, "learning_rate": 5.335567208536489e-06, "loss": 0.3146, "step": 4414 }, { "epoch": 3.7163299663299663, "grad_norm": 0.3868062496185303, "learning_rate": 5.333473038555952e-06, "loss": 0.3152, "step": 4415 }, { "epoch": 3.717171717171717, "grad_norm": 0.3772996962070465, "learning_rate": 5.3313788098139106e-06, "loss": 0.3238, "step": 4416 }, { "epoch": 3.718013468013468, "grad_norm": 0.3893054723739624, "learning_rate": 5.32928452267939e-06, "loss": 0.3386, "step": 4417 }, { "epoch": 3.718855218855219, "grad_norm": 0.39512649178504944, "learning_rate": 5.3271901775214265e-06, "loss": 0.3444, "step": 4418 }, { "epoch": 3.7196969696969697, "grad_norm": 0.41435256600379944, "learning_rate": 5.3250957747090625e-06, "loss": 0.3349, "step": 4419 }, { "epoch": 3.7205387205387206, "grad_norm": 0.426089346408844, "learning_rate": 5.3230013146113604e-06, "loss": 0.3368, "step": 4420 }, { "epoch": 3.7213804713804715, "grad_norm": 0.41134002804756165, "learning_rate": 5.32090679759738e-06, "loss": 0.3079, "step": 4421 }, { "epoch": 3.7222222222222223, "grad_norm": 0.39541494846343994, "learning_rate": 5.318812224036204e-06, "loss": 0.36, "step": 4422 }, { "epoch": 3.723063973063973, "grad_norm": 0.37417101860046387, "learning_rate": 5.3167175942969125e-06, "loss": 0.308, "step": 4423 }, { "epoch": 3.723905723905724, "grad_norm": 0.3933764398097992, "learning_rate": 5.314622908748606e-06, "loss": 0.333, "step": 4424 }, { "epoch": 3.724747474747475, "grad_norm": 0.3846442997455597, "learning_rate": 5.312528167760387e-06, "loss": 0.3269, "step": 4425 }, { "epoch": 3.725589225589226, "grad_norm": 0.41678473353385925, "learning_rate": 5.310433371701375e-06, "loss": 0.3504, "step": 4426 }, { "epoch": 3.7264309764309766, "grad_norm": 0.41077226400375366, "learning_rate": 5.30833852094069e-06, "loss": 0.345, "step": 4427 }, { "epoch": 3.7272727272727275, "grad_norm": 0.4177596867084503, "learning_rate": 5.306243615847473e-06, "loss": 0.3254, "step": 4428 }, { "epoch": 3.7281144781144784, "grad_norm": 0.3667922914028168, "learning_rate": 5.304148656790864e-06, "loss": 0.3208, "step": 4429 }, { "epoch": 3.728956228956229, "grad_norm": 0.4270763695240021, "learning_rate": 5.302053644140021e-06, "loss": 0.3403, "step": 4430 }, { "epoch": 3.7297979797979797, "grad_norm": 0.40930482745170593, "learning_rate": 5.2999585782641036e-06, "loss": 0.3327, "step": 4431 }, { "epoch": 3.7306397306397305, "grad_norm": 0.3732016980648041, "learning_rate": 5.297863459532288e-06, "loss": 0.3425, "step": 4432 }, { "epoch": 3.7314814814814814, "grad_norm": 0.42892828583717346, "learning_rate": 5.295768288313755e-06, "loss": 0.3046, "step": 4433 }, { "epoch": 3.7323232323232323, "grad_norm": 0.4273284375667572, "learning_rate": 5.293673064977697e-06, "loss": 0.3192, "step": 4434 }, { "epoch": 3.733164983164983, "grad_norm": 0.38576963543891907, "learning_rate": 5.291577789893313e-06, "loss": 0.329, "step": 4435 }, { "epoch": 3.734006734006734, "grad_norm": 0.4375356137752533, "learning_rate": 5.289482463429814e-06, "loss": 0.3285, "step": 4436 }, { "epoch": 3.734848484848485, "grad_norm": 0.4086683392524719, "learning_rate": 5.287387085956418e-06, "loss": 0.3297, "step": 4437 }, { "epoch": 3.7356902356902357, "grad_norm": 0.3829796612262726, "learning_rate": 5.2852916578423545e-06, "loss": 0.3337, "step": 4438 }, { "epoch": 3.7365319865319866, "grad_norm": 0.4085445702075958, "learning_rate": 5.2831961794568585e-06, "loss": 0.3327, "step": 4439 }, { "epoch": 3.7373737373737375, "grad_norm": 0.4071895182132721, "learning_rate": 5.281100651169175e-06, "loss": 0.3072, "step": 4440 }, { "epoch": 3.7382154882154883, "grad_norm": 0.38960957527160645, "learning_rate": 5.279005073348563e-06, "loss": 0.3037, "step": 4441 }, { "epoch": 3.739057239057239, "grad_norm": 0.3986041247844696, "learning_rate": 5.276909446364281e-06, "loss": 0.3223, "step": 4442 }, { "epoch": 3.73989898989899, "grad_norm": 0.4438723623752594, "learning_rate": 5.274813770585603e-06, "loss": 0.3185, "step": 4443 }, { "epoch": 3.7407407407407405, "grad_norm": 0.36917850375175476, "learning_rate": 5.272718046381808e-06, "loss": 0.3348, "step": 4444 }, { "epoch": 3.7415824915824913, "grad_norm": 0.3921787738800049, "learning_rate": 5.270622274122185e-06, "loss": 0.3452, "step": 4445 }, { "epoch": 3.742424242424242, "grad_norm": 0.40124353766441345, "learning_rate": 5.268526454176033e-06, "loss": 0.3433, "step": 4446 }, { "epoch": 3.743265993265993, "grad_norm": 0.4058099091053009, "learning_rate": 5.2664305869126575e-06, "loss": 0.3401, "step": 4447 }, { "epoch": 3.744107744107744, "grad_norm": 0.40925952792167664, "learning_rate": 5.264334672701371e-06, "loss": 0.2939, "step": 4448 }, { "epoch": 3.744949494949495, "grad_norm": 0.3901326656341553, "learning_rate": 5.262238711911497e-06, "loss": 0.3281, "step": 4449 }, { "epoch": 3.7457912457912457, "grad_norm": 0.3855302929878235, "learning_rate": 5.260142704912367e-06, "loss": 0.3056, "step": 4450 }, { "epoch": 3.7466329966329965, "grad_norm": 0.4142834544181824, "learning_rate": 5.258046652073317e-06, "loss": 0.3421, "step": 4451 }, { "epoch": 3.7474747474747474, "grad_norm": 0.42451173067092896, "learning_rate": 5.255950553763699e-06, "loss": 0.3161, "step": 4452 }, { "epoch": 3.7483164983164983, "grad_norm": 0.4117124378681183, "learning_rate": 5.2538544103528614e-06, "loss": 0.3166, "step": 4453 }, { "epoch": 3.749158249158249, "grad_norm": 0.4113992154598236, "learning_rate": 5.251758222210172e-06, "loss": 0.3244, "step": 4454 }, { "epoch": 3.75, "grad_norm": 0.39880862832069397, "learning_rate": 5.249661989704999e-06, "loss": 0.335, "step": 4455 }, { "epoch": 3.750841750841751, "grad_norm": 0.40771687030792236, "learning_rate": 5.247565713206723e-06, "loss": 0.3232, "step": 4456 }, { "epoch": 3.7516835016835017, "grad_norm": 0.4211136996746063, "learning_rate": 5.2454693930847264e-06, "loss": 0.3207, "step": 4457 }, { "epoch": 3.7525252525252526, "grad_norm": 0.4293179214000702, "learning_rate": 5.243373029708408e-06, "loss": 0.287, "step": 4458 }, { "epoch": 3.7533670033670035, "grad_norm": 0.4198540449142456, "learning_rate": 5.241276623447165e-06, "loss": 0.2983, "step": 4459 }, { "epoch": 3.7542087542087543, "grad_norm": 0.4322296380996704, "learning_rate": 5.239180174670412e-06, "loss": 0.2985, "step": 4460 }, { "epoch": 3.755050505050505, "grad_norm": 0.43092337250709534, "learning_rate": 5.237083683747558e-06, "loss": 0.3179, "step": 4461 }, { "epoch": 3.755892255892256, "grad_norm": 0.4268755316734314, "learning_rate": 5.234987151048035e-06, "loss": 0.3387, "step": 4462 }, { "epoch": 3.756734006734007, "grad_norm": 0.38758227229118347, "learning_rate": 5.232890576941269e-06, "loss": 0.3354, "step": 4463 }, { "epoch": 3.757575757575758, "grad_norm": 0.39683207869529724, "learning_rate": 5.230793961796699e-06, "loss": 0.3364, "step": 4464 }, { "epoch": 3.7584175084175087, "grad_norm": 0.40501120686531067, "learning_rate": 5.228697305983774e-06, "loss": 0.3559, "step": 4465 }, { "epoch": 3.7592592592592595, "grad_norm": 0.41032829880714417, "learning_rate": 5.226600609871947e-06, "loss": 0.3055, "step": 4466 }, { "epoch": 3.76010101010101, "grad_norm": 0.39503979682922363, "learning_rate": 5.224503873830674e-06, "loss": 0.339, "step": 4467 }, { "epoch": 3.760942760942761, "grad_norm": 0.3778727054595947, "learning_rate": 5.222407098229425e-06, "loss": 0.321, "step": 4468 }, { "epoch": 3.7617845117845117, "grad_norm": 0.4051658809185028, "learning_rate": 5.220310283437675e-06, "loss": 0.3454, "step": 4469 }, { "epoch": 3.7626262626262625, "grad_norm": 0.36973875761032104, "learning_rate": 5.218213429824904e-06, "loss": 0.3111, "step": 4470 }, { "epoch": 3.7634680134680134, "grad_norm": 0.4234011471271515, "learning_rate": 5.216116537760602e-06, "loss": 0.3439, "step": 4471 }, { "epoch": 3.7643097643097643, "grad_norm": 0.39172297716140747, "learning_rate": 5.214019607614258e-06, "loss": 0.3374, "step": 4472 }, { "epoch": 3.765151515151515, "grad_norm": 0.4074859619140625, "learning_rate": 5.211922639755381e-06, "loss": 0.3407, "step": 4473 }, { "epoch": 3.765993265993266, "grad_norm": 0.41045400500297546, "learning_rate": 5.2098256345534735e-06, "loss": 0.3219, "step": 4474 }, { "epoch": 3.766835016835017, "grad_norm": 0.3921755850315094, "learning_rate": 5.207728592378053e-06, "loss": 0.3167, "step": 4475 }, { "epoch": 3.7676767676767677, "grad_norm": 0.40101370215415955, "learning_rate": 5.205631513598641e-06, "loss": 0.3101, "step": 4476 }, { "epoch": 3.7685185185185186, "grad_norm": 0.4394257664680481, "learning_rate": 5.203534398584761e-06, "loss": 0.3339, "step": 4477 }, { "epoch": 3.7693602693602695, "grad_norm": 0.4209798574447632, "learning_rate": 5.201437247705952e-06, "loss": 0.2954, "step": 4478 }, { "epoch": 3.7702020202020203, "grad_norm": 0.4000129699707031, "learning_rate": 5.199340061331752e-06, "loss": 0.3441, "step": 4479 }, { "epoch": 3.771043771043771, "grad_norm": 0.39409583806991577, "learning_rate": 5.197242839831707e-06, "loss": 0.3116, "step": 4480 }, { "epoch": 3.7718855218855216, "grad_norm": 0.4042092263698578, "learning_rate": 5.195145583575373e-06, "loss": 0.3134, "step": 4481 }, { "epoch": 3.7727272727272725, "grad_norm": 0.4536341428756714, "learning_rate": 5.193048292932304e-06, "loss": 0.3233, "step": 4482 }, { "epoch": 3.7735690235690234, "grad_norm": 0.4025624692440033, "learning_rate": 5.1909509682720695e-06, "loss": 0.3519, "step": 4483 }, { "epoch": 3.774410774410774, "grad_norm": 0.40416452288627625, "learning_rate": 5.188853609964237e-06, "loss": 0.3393, "step": 4484 }, { "epoch": 3.775252525252525, "grad_norm": 0.4136558771133423, "learning_rate": 5.186756218378386e-06, "loss": 0.3625, "step": 4485 }, { "epoch": 3.776094276094276, "grad_norm": 0.39632681012153625, "learning_rate": 5.1846587938840995e-06, "loss": 0.314, "step": 4486 }, { "epoch": 3.776936026936027, "grad_norm": 0.39123404026031494, "learning_rate": 5.182561336850963e-06, "loss": 0.3221, "step": 4487 }, { "epoch": 3.7777777777777777, "grad_norm": 0.4245922863483429, "learning_rate": 5.180463847648574e-06, "loss": 0.312, "step": 4488 }, { "epoch": 3.7786195286195285, "grad_norm": 0.4260081350803375, "learning_rate": 5.178366326646529e-06, "loss": 0.3365, "step": 4489 }, { "epoch": 3.7794612794612794, "grad_norm": 0.42116427421569824, "learning_rate": 5.176268774214438e-06, "loss": 0.2991, "step": 4490 }, { "epoch": 3.7803030303030303, "grad_norm": 0.43942007422447205, "learning_rate": 5.174171190721907e-06, "loss": 0.332, "step": 4491 }, { "epoch": 3.781144781144781, "grad_norm": 0.41366076469421387, "learning_rate": 5.172073576538558e-06, "loss": 0.3294, "step": 4492 }, { "epoch": 3.781986531986532, "grad_norm": 0.3944370150566101, "learning_rate": 5.169975932034007e-06, "loss": 0.3287, "step": 4493 }, { "epoch": 3.782828282828283, "grad_norm": 0.4241943657398224, "learning_rate": 5.167878257577888e-06, "loss": 0.3277, "step": 4494 }, { "epoch": 3.7836700336700337, "grad_norm": 0.41009020805358887, "learning_rate": 5.165780553539828e-06, "loss": 0.3074, "step": 4495 }, { "epoch": 3.7845117845117846, "grad_norm": 0.4186362326145172, "learning_rate": 5.163682820289469e-06, "loss": 0.331, "step": 4496 }, { "epoch": 3.7853535353535355, "grad_norm": 0.42262765765190125, "learning_rate": 5.16158505819645e-06, "loss": 0.3216, "step": 4497 }, { "epoch": 3.7861952861952863, "grad_norm": 0.43473052978515625, "learning_rate": 5.159487267630422e-06, "loss": 0.3405, "step": 4498 }, { "epoch": 3.787037037037037, "grad_norm": 0.40983346104621887, "learning_rate": 5.157389448961036e-06, "loss": 0.319, "step": 4499 }, { "epoch": 3.787878787878788, "grad_norm": 0.42738401889801025, "learning_rate": 5.155291602557953e-06, "loss": 0.3388, "step": 4500 }, { "epoch": 3.788720538720539, "grad_norm": 0.40212902426719666, "learning_rate": 5.153193728790832e-06, "loss": 0.327, "step": 4501 }, { "epoch": 3.78956228956229, "grad_norm": 0.42046648263931274, "learning_rate": 5.151095828029343e-06, "loss": 0.3074, "step": 4502 }, { "epoch": 3.7904040404040407, "grad_norm": 0.43680644035339355, "learning_rate": 5.148997900643159e-06, "loss": 0.3352, "step": 4503 }, { "epoch": 3.791245791245791, "grad_norm": 0.4037262797355652, "learning_rate": 5.146899947001956e-06, "loss": 0.3121, "step": 4504 }, { "epoch": 3.792087542087542, "grad_norm": 0.445867657661438, "learning_rate": 5.144801967475418e-06, "loss": 0.3339, "step": 4505 }, { "epoch": 3.792929292929293, "grad_norm": 0.41129130125045776, "learning_rate": 5.142703962433227e-06, "loss": 0.319, "step": 4506 }, { "epoch": 3.7937710437710437, "grad_norm": 0.43130066990852356, "learning_rate": 5.1406059322450774e-06, "loss": 0.3247, "step": 4507 }, { "epoch": 3.7946127946127945, "grad_norm": 0.4546698033809662, "learning_rate": 5.138507877280664e-06, "loss": 0.3231, "step": 4508 }, { "epoch": 3.7954545454545454, "grad_norm": 0.46108534932136536, "learning_rate": 5.136409797909685e-06, "loss": 0.3253, "step": 4509 }, { "epoch": 3.7962962962962963, "grad_norm": 0.44246456027030945, "learning_rate": 5.134311694501845e-06, "loss": 0.349, "step": 4510 }, { "epoch": 3.797138047138047, "grad_norm": 0.43419256806373596, "learning_rate": 5.132213567426855e-06, "loss": 0.3317, "step": 4511 }, { "epoch": 3.797979797979798, "grad_norm": 0.4256855547428131, "learning_rate": 5.130115417054421e-06, "loss": 0.3071, "step": 4512 }, { "epoch": 3.798821548821549, "grad_norm": 0.41354092955589294, "learning_rate": 5.128017243754267e-06, "loss": 0.3437, "step": 4513 }, { "epoch": 3.7996632996632997, "grad_norm": 0.3964836597442627, "learning_rate": 5.125919047896107e-06, "loss": 0.3299, "step": 4514 }, { "epoch": 3.8005050505050506, "grad_norm": 0.4113349914550781, "learning_rate": 5.12382082984967e-06, "loss": 0.3206, "step": 4515 }, { "epoch": 3.8013468013468015, "grad_norm": 0.42604905366897583, "learning_rate": 5.121722589984681e-06, "loss": 0.3371, "step": 4516 }, { "epoch": 3.8021885521885523, "grad_norm": 0.40875810384750366, "learning_rate": 5.119624328670875e-06, "loss": 0.3123, "step": 4517 }, { "epoch": 3.8030303030303028, "grad_norm": 0.38571682572364807, "learning_rate": 5.1175260462779864e-06, "loss": 0.3268, "step": 4518 }, { "epoch": 3.8038720538720536, "grad_norm": 0.4236220717430115, "learning_rate": 5.115427743175757e-06, "loss": 0.3371, "step": 4519 }, { "epoch": 3.8047138047138045, "grad_norm": 0.39086097478866577, "learning_rate": 5.113329419733928e-06, "loss": 0.3301, "step": 4520 }, { "epoch": 3.8055555555555554, "grad_norm": 0.41148385405540466, "learning_rate": 5.111231076322246e-06, "loss": 0.3169, "step": 4521 }, { "epoch": 3.8063973063973062, "grad_norm": 0.3766268789768219, "learning_rate": 5.109132713310463e-06, "loss": 0.3406, "step": 4522 }, { "epoch": 3.807239057239057, "grad_norm": 0.40441209077835083, "learning_rate": 5.1070343310683315e-06, "loss": 0.3171, "step": 4523 }, { "epoch": 3.808080808080808, "grad_norm": 0.3848738968372345, "learning_rate": 5.104935929965612e-06, "loss": 0.3419, "step": 4524 }, { "epoch": 3.808922558922559, "grad_norm": 0.42702484130859375, "learning_rate": 5.10283751037206e-06, "loss": 0.3368, "step": 4525 }, { "epoch": 3.8097643097643097, "grad_norm": 0.4181165099143982, "learning_rate": 5.1007390726574445e-06, "loss": 0.329, "step": 4526 }, { "epoch": 3.8106060606060606, "grad_norm": 0.42703935503959656, "learning_rate": 5.0986406171915294e-06, "loss": 0.3626, "step": 4527 }, { "epoch": 3.8114478114478114, "grad_norm": 0.4003002345561981, "learning_rate": 5.096542144344087e-06, "loss": 0.327, "step": 4528 }, { "epoch": 3.8122895622895623, "grad_norm": 0.4130403697490692, "learning_rate": 5.0944436544848874e-06, "loss": 0.3486, "step": 4529 }, { "epoch": 3.813131313131313, "grad_norm": 0.40487977862358093, "learning_rate": 5.092345147983712e-06, "loss": 0.338, "step": 4530 }, { "epoch": 3.813973063973064, "grad_norm": 0.39672163128852844, "learning_rate": 5.0902466252103336e-06, "loss": 0.3264, "step": 4531 }, { "epoch": 3.814814814814815, "grad_norm": 0.3832261860370636, "learning_rate": 5.08814808653454e-06, "loss": 0.338, "step": 4532 }, { "epoch": 3.8156565656565657, "grad_norm": 0.3918994665145874, "learning_rate": 5.086049532326112e-06, "loss": 0.339, "step": 4533 }, { "epoch": 3.8164983164983166, "grad_norm": 0.400662362575531, "learning_rate": 5.083950962954841e-06, "loss": 0.3276, "step": 4534 }, { "epoch": 3.8173400673400675, "grad_norm": 0.40899738669395447, "learning_rate": 5.081852378790514e-06, "loss": 0.3254, "step": 4535 }, { "epoch": 3.8181818181818183, "grad_norm": 0.3656417429447174, "learning_rate": 5.079753780202925e-06, "loss": 0.334, "step": 4536 }, { "epoch": 3.819023569023569, "grad_norm": 0.43388864398002625, "learning_rate": 5.07765516756187e-06, "loss": 0.3397, "step": 4537 }, { "epoch": 3.81986531986532, "grad_norm": 0.41351422667503357, "learning_rate": 5.075556541237148e-06, "loss": 0.3152, "step": 4538 }, { "epoch": 3.820707070707071, "grad_norm": 0.370904803276062, "learning_rate": 5.073457901598556e-06, "loss": 0.3494, "step": 4539 }, { "epoch": 3.821548821548822, "grad_norm": 0.4142703711986542, "learning_rate": 5.071359249015899e-06, "loss": 0.3292, "step": 4540 }, { "epoch": 3.8223905723905722, "grad_norm": 0.4001491367816925, "learning_rate": 5.069260583858982e-06, "loss": 0.3438, "step": 4541 }, { "epoch": 3.823232323232323, "grad_norm": 0.3882243037223816, "learning_rate": 5.067161906497611e-06, "loss": 0.3267, "step": 4542 }, { "epoch": 3.824074074074074, "grad_norm": 0.3812130093574524, "learning_rate": 5.065063217301598e-06, "loss": 0.3031, "step": 4543 }, { "epoch": 3.824915824915825, "grad_norm": 0.4319598972797394, "learning_rate": 5.062964516640749e-06, "loss": 0.3332, "step": 4544 }, { "epoch": 3.8257575757575757, "grad_norm": 0.40510618686676025, "learning_rate": 5.060865804884884e-06, "loss": 0.3214, "step": 4545 }, { "epoch": 3.8265993265993266, "grad_norm": 0.4133566617965698, "learning_rate": 5.058767082403814e-06, "loss": 0.3239, "step": 4546 }, { "epoch": 3.8274410774410774, "grad_norm": 0.39371752738952637, "learning_rate": 5.0566683495673576e-06, "loss": 0.3141, "step": 4547 }, { "epoch": 3.8282828282828283, "grad_norm": 0.3682677745819092, "learning_rate": 5.054569606745333e-06, "loss": 0.3226, "step": 4548 }, { "epoch": 3.829124579124579, "grad_norm": 0.3767492473125458, "learning_rate": 5.052470854307563e-06, "loss": 0.3288, "step": 4549 }, { "epoch": 3.82996632996633, "grad_norm": 0.39477601647377014, "learning_rate": 5.0503720926238685e-06, "loss": 0.3201, "step": 4550 }, { "epoch": 3.830808080808081, "grad_norm": 0.424442857503891, "learning_rate": 5.0482733220640755e-06, "loss": 0.3002, "step": 4551 }, { "epoch": 3.8316498316498318, "grad_norm": 0.42115843296051025, "learning_rate": 5.046174542998006e-06, "loss": 0.3214, "step": 4552 }, { "epoch": 3.8324915824915826, "grad_norm": 0.3967714011669159, "learning_rate": 5.044075755795493e-06, "loss": 0.3157, "step": 4553 }, { "epoch": 3.8333333333333335, "grad_norm": 0.4235072135925293, "learning_rate": 5.041976960826359e-06, "loss": 0.3311, "step": 4554 }, { "epoch": 3.834175084175084, "grad_norm": 0.40981996059417725, "learning_rate": 5.039878158460438e-06, "loss": 0.3227, "step": 4555 }, { "epoch": 3.8350168350168348, "grad_norm": 0.40974345803260803, "learning_rate": 5.037779349067561e-06, "loss": 0.2909, "step": 4556 }, { "epoch": 3.8358585858585856, "grad_norm": 0.4286514222621918, "learning_rate": 5.035680533017559e-06, "loss": 0.3323, "step": 4557 }, { "epoch": 3.8367003367003365, "grad_norm": 0.4032084345817566, "learning_rate": 5.033581710680268e-06, "loss": 0.3326, "step": 4558 }, { "epoch": 3.8375420875420874, "grad_norm": 0.4142775535583496, "learning_rate": 5.03148288242552e-06, "loss": 0.3518, "step": 4559 }, { "epoch": 3.8383838383838382, "grad_norm": 0.410625696182251, "learning_rate": 5.029384048623154e-06, "loss": 0.3132, "step": 4560 }, { "epoch": 3.839225589225589, "grad_norm": 0.39393526315689087, "learning_rate": 5.027285209643003e-06, "loss": 0.3201, "step": 4561 }, { "epoch": 3.84006734006734, "grad_norm": 0.4266917407512665, "learning_rate": 5.0251863658549105e-06, "loss": 0.3264, "step": 4562 }, { "epoch": 3.840909090909091, "grad_norm": 0.4539196789264679, "learning_rate": 5.023087517628708e-06, "loss": 0.3246, "step": 4563 }, { "epoch": 3.8417508417508417, "grad_norm": 0.3936625123023987, "learning_rate": 5.020988665334242e-06, "loss": 0.3165, "step": 4564 }, { "epoch": 3.8425925925925926, "grad_norm": 0.36853092908859253, "learning_rate": 5.018889809341346e-06, "loss": 0.3153, "step": 4565 }, { "epoch": 3.8434343434343434, "grad_norm": 0.4054800868034363, "learning_rate": 5.0167909500198676e-06, "loss": 0.3409, "step": 4566 }, { "epoch": 3.8442760942760943, "grad_norm": 0.43240243196487427, "learning_rate": 5.014692087739643e-06, "loss": 0.3025, "step": 4567 }, { "epoch": 3.845117845117845, "grad_norm": 0.41116851568222046, "learning_rate": 5.012593222870516e-06, "loss": 0.3177, "step": 4568 }, { "epoch": 3.845959595959596, "grad_norm": 0.4006378948688507, "learning_rate": 5.0104943557823295e-06, "loss": 0.3224, "step": 4569 }, { "epoch": 3.846801346801347, "grad_norm": 0.4309214949607849, "learning_rate": 5.008395486844927e-06, "loss": 0.3005, "step": 4570 }, { "epoch": 3.8476430976430978, "grad_norm": 0.441713809967041, "learning_rate": 5.006296616428147e-06, "loss": 0.327, "step": 4571 }, { "epoch": 3.8484848484848486, "grad_norm": 0.4186597168445587, "learning_rate": 5.004197744901838e-06, "loss": 0.3395, "step": 4572 }, { "epoch": 3.8493265993265995, "grad_norm": 0.44134172797203064, "learning_rate": 5.0020988726358415e-06, "loss": 0.3513, "step": 4573 }, { "epoch": 3.8501683501683504, "grad_norm": 0.4240123927593231, "learning_rate": 5e-06, "loss": 0.3261, "step": 4574 }, { "epoch": 3.851010101010101, "grad_norm": 0.40847983956336975, "learning_rate": 4.997901127364159e-06, "loss": 0.332, "step": 4575 }, { "epoch": 3.851851851851852, "grad_norm": 0.44679057598114014, "learning_rate": 4.995802255098164e-06, "loss": 0.3182, "step": 4576 }, { "epoch": 3.852693602693603, "grad_norm": 0.41153615713119507, "learning_rate": 4.993703383571853e-06, "loss": 0.3335, "step": 4577 }, { "epoch": 3.8535353535353534, "grad_norm": 0.3960033059120178, "learning_rate": 4.991604513155075e-06, "loss": 0.3383, "step": 4578 }, { "epoch": 3.8543771043771042, "grad_norm": 0.4026262164115906, "learning_rate": 4.989505644217671e-06, "loss": 0.3369, "step": 4579 }, { "epoch": 3.855218855218855, "grad_norm": 0.44139164686203003, "learning_rate": 4.9874067771294845e-06, "loss": 0.3406, "step": 4580 }, { "epoch": 3.856060606060606, "grad_norm": 0.4028429687023163, "learning_rate": 4.9853079122603596e-06, "loss": 0.3313, "step": 4581 }, { "epoch": 3.856902356902357, "grad_norm": 0.4164786636829376, "learning_rate": 4.983209049980132e-06, "loss": 0.3455, "step": 4582 }, { "epoch": 3.8577441077441077, "grad_norm": 0.44676944613456726, "learning_rate": 4.981110190658653e-06, "loss": 0.3352, "step": 4583 }, { "epoch": 3.8585858585858586, "grad_norm": 0.4257305860519409, "learning_rate": 4.97901133466576e-06, "loss": 0.3389, "step": 4584 }, { "epoch": 3.8594276094276094, "grad_norm": 0.3858935534954071, "learning_rate": 4.976912482371293e-06, "loss": 0.3361, "step": 4585 }, { "epoch": 3.8602693602693603, "grad_norm": 0.3843333423137665, "learning_rate": 4.974813634145093e-06, "loss": 0.3231, "step": 4586 }, { "epoch": 3.861111111111111, "grad_norm": 0.4868392050266266, "learning_rate": 4.9727147903569985e-06, "loss": 0.3445, "step": 4587 }, { "epoch": 3.861952861952862, "grad_norm": 0.39985957741737366, "learning_rate": 4.970615951376848e-06, "loss": 0.3088, "step": 4588 }, { "epoch": 3.862794612794613, "grad_norm": 0.3732599914073944, "learning_rate": 4.9685171175744816e-06, "loss": 0.3496, "step": 4589 }, { "epoch": 3.8636363636363638, "grad_norm": 0.3821815550327301, "learning_rate": 4.966418289319733e-06, "loss": 0.3273, "step": 4590 }, { "epoch": 3.8644781144781146, "grad_norm": 0.3701443076133728, "learning_rate": 4.964319466982442e-06, "loss": 0.3139, "step": 4591 }, { "epoch": 3.865319865319865, "grad_norm": 0.40107977390289307, "learning_rate": 4.962220650932441e-06, "loss": 0.3299, "step": 4592 }, { "epoch": 3.866161616161616, "grad_norm": 0.4224853813648224, "learning_rate": 4.960121841539562e-06, "loss": 0.3415, "step": 4593 }, { "epoch": 3.8670033670033668, "grad_norm": 0.3846529722213745, "learning_rate": 4.958023039173642e-06, "loss": 0.3141, "step": 4594 }, { "epoch": 3.8678451178451176, "grad_norm": 0.40402939915657043, "learning_rate": 4.955924244204509e-06, "loss": 0.3403, "step": 4595 }, { "epoch": 3.8686868686868685, "grad_norm": 0.40474265813827515, "learning_rate": 4.953825457001995e-06, "loss": 0.3443, "step": 4596 }, { "epoch": 3.8695286195286194, "grad_norm": 0.39807257056236267, "learning_rate": 4.951726677935927e-06, "loss": 0.3415, "step": 4597 }, { "epoch": 3.8703703703703702, "grad_norm": 0.41195017099380493, "learning_rate": 4.9496279073761315e-06, "loss": 0.3388, "step": 4598 }, { "epoch": 3.871212121212121, "grad_norm": 0.4027397930622101, "learning_rate": 4.947529145692437e-06, "loss": 0.3243, "step": 4599 }, { "epoch": 3.872053872053872, "grad_norm": 0.45097124576568604, "learning_rate": 4.945430393254668e-06, "loss": 0.3383, "step": 4600 }, { "epoch": 3.872895622895623, "grad_norm": 0.41231003403663635, "learning_rate": 4.943331650432644e-06, "loss": 0.3294, "step": 4601 }, { "epoch": 3.8737373737373737, "grad_norm": 0.40045464038848877, "learning_rate": 4.9412329175961885e-06, "loss": 0.3341, "step": 4602 }, { "epoch": 3.8745791245791246, "grad_norm": 0.40987586975097656, "learning_rate": 4.939134195115118e-06, "loss": 0.3514, "step": 4603 }, { "epoch": 3.8754208754208754, "grad_norm": 0.39006516337394714, "learning_rate": 4.937035483359251e-06, "loss": 0.331, "step": 4604 }, { "epoch": 3.8762626262626263, "grad_norm": 0.3898180425167084, "learning_rate": 4.934936782698404e-06, "loss": 0.3329, "step": 4605 }, { "epoch": 3.877104377104377, "grad_norm": 0.3828921914100647, "learning_rate": 4.93283809350239e-06, "loss": 0.3382, "step": 4606 }, { "epoch": 3.877946127946128, "grad_norm": 0.40633752942085266, "learning_rate": 4.930739416141019e-06, "loss": 0.3121, "step": 4607 }, { "epoch": 3.878787878787879, "grad_norm": 0.43344128131866455, "learning_rate": 4.9286407509841036e-06, "loss": 0.3313, "step": 4608 }, { "epoch": 3.8796296296296298, "grad_norm": 0.3772662878036499, "learning_rate": 4.926542098401445e-06, "loss": 0.3178, "step": 4609 }, { "epoch": 3.8804713804713806, "grad_norm": 0.37889501452445984, "learning_rate": 4.924443458762854e-06, "loss": 0.3254, "step": 4610 }, { "epoch": 3.8813131313131315, "grad_norm": 0.3810010254383087, "learning_rate": 4.9223448324381315e-06, "loss": 0.3062, "step": 4611 }, { "epoch": 3.8821548821548824, "grad_norm": 0.3882027268409729, "learning_rate": 4.920246219797077e-06, "loss": 0.3631, "step": 4612 }, { "epoch": 3.8829966329966332, "grad_norm": 0.3665992021560669, "learning_rate": 4.918147621209488e-06, "loss": 0.3448, "step": 4613 }, { "epoch": 3.883838383838384, "grad_norm": 0.41205379366874695, "learning_rate": 4.9160490370451595e-06, "loss": 0.341, "step": 4614 }, { "epoch": 3.884680134680135, "grad_norm": 0.38892579078674316, "learning_rate": 4.913950467673888e-06, "loss": 0.3308, "step": 4615 }, { "epoch": 3.8855218855218854, "grad_norm": 0.37444373965263367, "learning_rate": 4.9118519134654614e-06, "loss": 0.3168, "step": 4616 }, { "epoch": 3.8863636363636362, "grad_norm": 0.3748886287212372, "learning_rate": 4.909753374789668e-06, "loss": 0.3362, "step": 4617 }, { "epoch": 3.887205387205387, "grad_norm": 0.3794686496257782, "learning_rate": 4.9076548520162916e-06, "loss": 0.3354, "step": 4618 }, { "epoch": 3.888047138047138, "grad_norm": 0.36799100041389465, "learning_rate": 4.905556345515115e-06, "loss": 0.3141, "step": 4619 }, { "epoch": 3.888888888888889, "grad_norm": 0.3890550434589386, "learning_rate": 4.903457855655914e-06, "loss": 0.3285, "step": 4620 }, { "epoch": 3.8897306397306397, "grad_norm": 0.41789716482162476, "learning_rate": 4.901359382808471e-06, "loss": 0.3279, "step": 4621 }, { "epoch": 3.8905723905723906, "grad_norm": 0.4248289167881012, "learning_rate": 4.899260927342557e-06, "loss": 0.3337, "step": 4622 }, { "epoch": 3.8914141414141414, "grad_norm": 0.4209742844104767, "learning_rate": 4.897162489627941e-06, "loss": 0.3182, "step": 4623 }, { "epoch": 3.8922558922558923, "grad_norm": 0.41835281252861023, "learning_rate": 4.895064070034391e-06, "loss": 0.2978, "step": 4624 }, { "epoch": 3.893097643097643, "grad_norm": 0.44803473353385925, "learning_rate": 4.8929656689316684e-06, "loss": 0.3206, "step": 4625 }, { "epoch": 3.893939393939394, "grad_norm": 0.42540547251701355, "learning_rate": 4.890867286689539e-06, "loss": 0.3311, "step": 4626 }, { "epoch": 3.894781144781145, "grad_norm": 0.4100591540336609, "learning_rate": 4.888768923677756e-06, "loss": 0.333, "step": 4627 }, { "epoch": 3.8956228956228958, "grad_norm": 0.3992982506752014, "learning_rate": 4.886670580266074e-06, "loss": 0.3188, "step": 4628 }, { "epoch": 3.8964646464646466, "grad_norm": 0.42439034581184387, "learning_rate": 4.884572256824245e-06, "loss": 0.3291, "step": 4629 }, { "epoch": 3.897306397306397, "grad_norm": 0.3827192485332489, "learning_rate": 4.8824739537220135e-06, "loss": 0.345, "step": 4630 }, { "epoch": 3.898148148148148, "grad_norm": 0.40588241815567017, "learning_rate": 4.880375671329126e-06, "loss": 0.3184, "step": 4631 }, { "epoch": 3.898989898989899, "grad_norm": 0.3851429522037506, "learning_rate": 4.87827741001532e-06, "loss": 0.3226, "step": 4632 }, { "epoch": 3.8998316498316496, "grad_norm": 0.3841548562049866, "learning_rate": 4.876179170150332e-06, "loss": 0.323, "step": 4633 }, { "epoch": 3.9006734006734005, "grad_norm": 0.40602174401283264, "learning_rate": 4.874080952103895e-06, "loss": 0.3005, "step": 4634 }, { "epoch": 3.9015151515151514, "grad_norm": 0.3891107738018036, "learning_rate": 4.871982756245736e-06, "loss": 0.3242, "step": 4635 }, { "epoch": 3.9023569023569022, "grad_norm": 0.3608280420303345, "learning_rate": 4.869884582945579e-06, "loss": 0.3585, "step": 4636 }, { "epoch": 3.903198653198653, "grad_norm": 0.3983980715274811, "learning_rate": 4.8677864325731475e-06, "loss": 0.3203, "step": 4637 }, { "epoch": 3.904040404040404, "grad_norm": 0.4103427529335022, "learning_rate": 4.865688305498156e-06, "loss": 0.3258, "step": 4638 }, { "epoch": 3.904882154882155, "grad_norm": 0.38685518503189087, "learning_rate": 4.863590202090317e-06, "loss": 0.3254, "step": 4639 }, { "epoch": 3.9057239057239057, "grad_norm": 0.3757613003253937, "learning_rate": 4.861492122719339e-06, "loss": 0.3163, "step": 4640 }, { "epoch": 3.9065656565656566, "grad_norm": 0.4217376112937927, "learning_rate": 4.859394067754923e-06, "loss": 0.3219, "step": 4641 }, { "epoch": 3.9074074074074074, "grad_norm": 0.39808088541030884, "learning_rate": 4.8572960375667734e-06, "loss": 0.3406, "step": 4642 }, { "epoch": 3.9082491582491583, "grad_norm": 0.40218600630760193, "learning_rate": 4.855198032524584e-06, "loss": 0.3274, "step": 4643 }, { "epoch": 3.909090909090909, "grad_norm": 0.39557352662086487, "learning_rate": 4.8531000529980445e-06, "loss": 0.3056, "step": 4644 }, { "epoch": 3.90993265993266, "grad_norm": 0.3745580017566681, "learning_rate": 4.851002099356842e-06, "loss": 0.3325, "step": 4645 }, { "epoch": 3.910774410774411, "grad_norm": 0.39527279138565063, "learning_rate": 4.848904171970657e-06, "loss": 0.3043, "step": 4646 }, { "epoch": 3.9116161616161618, "grad_norm": 0.40143951773643494, "learning_rate": 4.846806271209169e-06, "loss": 0.3112, "step": 4647 }, { "epoch": 3.9124579124579126, "grad_norm": 0.4287961423397064, "learning_rate": 4.844708397442049e-06, "loss": 0.3273, "step": 4648 }, { "epoch": 3.9132996632996635, "grad_norm": 0.4041237533092499, "learning_rate": 4.842610551038966e-06, "loss": 0.3083, "step": 4649 }, { "epoch": 3.9141414141414144, "grad_norm": 0.42748546600341797, "learning_rate": 4.8405127323695805e-06, "loss": 0.3313, "step": 4650 }, { "epoch": 3.9149831649831652, "grad_norm": 0.42938101291656494, "learning_rate": 4.838414941803552e-06, "loss": 0.3394, "step": 4651 }, { "epoch": 3.915824915824916, "grad_norm": 0.396415650844574, "learning_rate": 4.836317179710532e-06, "loss": 0.3404, "step": 4652 }, { "epoch": 3.9166666666666665, "grad_norm": 0.4226960837841034, "learning_rate": 4.834219446460173e-06, "loss": 0.3223, "step": 4653 }, { "epoch": 3.9175084175084174, "grad_norm": 0.44564399123191833, "learning_rate": 4.832121742422113e-06, "loss": 0.2979, "step": 4654 }, { "epoch": 3.9183501683501682, "grad_norm": 0.43728718161582947, "learning_rate": 4.830024067965994e-06, "loss": 0.3533, "step": 4655 }, { "epoch": 3.919191919191919, "grad_norm": 0.4051240086555481, "learning_rate": 4.827926423461445e-06, "loss": 0.3176, "step": 4656 }, { "epoch": 3.92003367003367, "grad_norm": 0.4093012511730194, "learning_rate": 4.825828809278094e-06, "loss": 0.3241, "step": 4657 }, { "epoch": 3.920875420875421, "grad_norm": 0.3782062232494354, "learning_rate": 4.823731225785563e-06, "loss": 0.3238, "step": 4658 }, { "epoch": 3.9217171717171717, "grad_norm": 0.42732301354408264, "learning_rate": 4.821633673353472e-06, "loss": 0.3144, "step": 4659 }, { "epoch": 3.9225589225589226, "grad_norm": 0.41061460971832275, "learning_rate": 4.819536152351428e-06, "loss": 0.3122, "step": 4660 }, { "epoch": 3.9234006734006734, "grad_norm": 0.3620077073574066, "learning_rate": 4.81743866314904e-06, "loss": 0.3365, "step": 4661 }, { "epoch": 3.9242424242424243, "grad_norm": 0.4227347671985626, "learning_rate": 4.815341206115901e-06, "loss": 0.342, "step": 4662 }, { "epoch": 3.925084175084175, "grad_norm": 0.4380090832710266, "learning_rate": 4.813243781621614e-06, "loss": 0.3335, "step": 4663 }, { "epoch": 3.925925925925926, "grad_norm": 0.40087369084358215, "learning_rate": 4.811146390035764e-06, "loss": 0.338, "step": 4664 }, { "epoch": 3.926767676767677, "grad_norm": 0.4089820086956024, "learning_rate": 4.809049031727932e-06, "loss": 0.3493, "step": 4665 }, { "epoch": 3.9276094276094278, "grad_norm": 0.39813047647476196, "learning_rate": 4.806951707067697e-06, "loss": 0.3036, "step": 4666 }, { "epoch": 3.928451178451178, "grad_norm": 0.44174525141716003, "learning_rate": 4.8048544164246305e-06, "loss": 0.3101, "step": 4667 }, { "epoch": 3.929292929292929, "grad_norm": 0.41457298398017883, "learning_rate": 4.802757160168293e-06, "loss": 0.3463, "step": 4668 }, { "epoch": 3.93013468013468, "grad_norm": 0.41951555013656616, "learning_rate": 4.800659938668249e-06, "loss": 0.3195, "step": 4669 }, { "epoch": 3.930976430976431, "grad_norm": 0.36764541268348694, "learning_rate": 4.79856275229405e-06, "loss": 0.34, "step": 4670 }, { "epoch": 3.9318181818181817, "grad_norm": 0.3770449161529541, "learning_rate": 4.796465601415241e-06, "loss": 0.3065, "step": 4671 }, { "epoch": 3.9326599326599325, "grad_norm": 0.37998151779174805, "learning_rate": 4.794368486401363e-06, "loss": 0.3309, "step": 4672 }, { "epoch": 3.9335016835016834, "grad_norm": 0.41338035464286804, "learning_rate": 4.7922714076219475e-06, "loss": 0.3145, "step": 4673 }, { "epoch": 3.9343434343434343, "grad_norm": 0.40250256657600403, "learning_rate": 4.790174365446527e-06, "loss": 0.3166, "step": 4674 }, { "epoch": 3.935185185185185, "grad_norm": 0.39283913373947144, "learning_rate": 4.78807736024462e-06, "loss": 0.3192, "step": 4675 }, { "epoch": 3.936026936026936, "grad_norm": 0.41169148683547974, "learning_rate": 4.785980392385743e-06, "loss": 0.325, "step": 4676 }, { "epoch": 3.936868686868687, "grad_norm": 0.39879947900772095, "learning_rate": 4.783883462239401e-06, "loss": 0.3109, "step": 4677 }, { "epoch": 3.9377104377104377, "grad_norm": 0.3781038522720337, "learning_rate": 4.781786570175096e-06, "loss": 0.2866, "step": 4678 }, { "epoch": 3.9385521885521886, "grad_norm": 0.3965287506580353, "learning_rate": 4.779689716562326e-06, "loss": 0.3569, "step": 4679 }, { "epoch": 3.9393939393939394, "grad_norm": 0.37546342611312866, "learning_rate": 4.777592901770576e-06, "loss": 0.31, "step": 4680 }, { "epoch": 3.9402356902356903, "grad_norm": 0.4405345618724823, "learning_rate": 4.775496126169327e-06, "loss": 0.3482, "step": 4681 }, { "epoch": 3.941077441077441, "grad_norm": 0.3765977621078491, "learning_rate": 4.773399390128057e-06, "loss": 0.3249, "step": 4682 }, { "epoch": 3.941919191919192, "grad_norm": 0.4291931688785553, "learning_rate": 4.771302694016227e-06, "loss": 0.3326, "step": 4683 }, { "epoch": 3.942760942760943, "grad_norm": 0.4362500011920929, "learning_rate": 4.769206038203301e-06, "loss": 0.3428, "step": 4684 }, { "epoch": 3.9436026936026938, "grad_norm": 0.38792771100997925, "learning_rate": 4.767109423058734e-06, "loss": 0.3102, "step": 4685 }, { "epoch": 3.9444444444444446, "grad_norm": 0.368428498506546, "learning_rate": 4.765012848951967e-06, "loss": 0.3396, "step": 4686 }, { "epoch": 3.9452861952861955, "grad_norm": 0.39106884598731995, "learning_rate": 4.762916316252443e-06, "loss": 0.3282, "step": 4687 }, { "epoch": 3.9461279461279464, "grad_norm": 0.41518330574035645, "learning_rate": 4.760819825329591e-06, "loss": 0.3404, "step": 4688 }, { "epoch": 3.9469696969696972, "grad_norm": 0.45018449425697327, "learning_rate": 4.758723376552835e-06, "loss": 0.3208, "step": 4689 }, { "epoch": 3.9478114478114477, "grad_norm": 0.3821294903755188, "learning_rate": 4.756626970291593e-06, "loss": 0.334, "step": 4690 }, { "epoch": 3.9486531986531985, "grad_norm": 0.3990931510925293, "learning_rate": 4.754530606915274e-06, "loss": 0.3254, "step": 4691 }, { "epoch": 3.9494949494949494, "grad_norm": 0.45427417755126953, "learning_rate": 4.752434286793279e-06, "loss": 0.3324, "step": 4692 }, { "epoch": 3.9503367003367003, "grad_norm": 0.4064030945301056, "learning_rate": 4.7503380102950035e-06, "loss": 0.3125, "step": 4693 }, { "epoch": 3.951178451178451, "grad_norm": 0.4093334674835205, "learning_rate": 4.748241777789829e-06, "loss": 0.3486, "step": 4694 }, { "epoch": 3.952020202020202, "grad_norm": 0.42919671535491943, "learning_rate": 4.746145589647139e-06, "loss": 0.35, "step": 4695 }, { "epoch": 3.952861952861953, "grad_norm": 0.36975592374801636, "learning_rate": 4.7440494462363025e-06, "loss": 0.3266, "step": 4696 }, { "epoch": 3.9537037037037037, "grad_norm": 0.41065967082977295, "learning_rate": 4.741953347926684e-06, "loss": 0.3104, "step": 4697 }, { "epoch": 3.9545454545454546, "grad_norm": 0.40059182047843933, "learning_rate": 4.739857295087635e-06, "loss": 0.3229, "step": 4698 }, { "epoch": 3.9553872053872055, "grad_norm": 0.437602698802948, "learning_rate": 4.7377612880885056e-06, "loss": 0.3155, "step": 4699 }, { "epoch": 3.9562289562289563, "grad_norm": 0.3930286765098572, "learning_rate": 4.73566532729863e-06, "loss": 0.345, "step": 4700 }, { "epoch": 3.957070707070707, "grad_norm": 0.46598055958747864, "learning_rate": 4.733569413087344e-06, "loss": 0.313, "step": 4701 }, { "epoch": 3.957912457912458, "grad_norm": 0.40829524397850037, "learning_rate": 4.731473545823969e-06, "loss": 0.3271, "step": 4702 }, { "epoch": 3.958754208754209, "grad_norm": 0.4243229329586029, "learning_rate": 4.729377725877816e-06, "loss": 0.3538, "step": 4703 }, { "epoch": 3.9595959595959593, "grad_norm": 0.4408712089061737, "learning_rate": 4.727281953618194e-06, "loss": 0.319, "step": 4704 }, { "epoch": 3.96043771043771, "grad_norm": 0.41731011867523193, "learning_rate": 4.725186229414398e-06, "loss": 0.3193, "step": 4705 }, { "epoch": 3.961279461279461, "grad_norm": 0.4100348949432373, "learning_rate": 4.72309055363572e-06, "loss": 0.3223, "step": 4706 }, { "epoch": 3.962121212121212, "grad_norm": 0.4153570532798767, "learning_rate": 4.7209949266514384e-06, "loss": 0.3265, "step": 4707 }, { "epoch": 3.962962962962963, "grad_norm": 0.38030585646629333, "learning_rate": 4.718899348830825e-06, "loss": 0.3361, "step": 4708 }, { "epoch": 3.9638047138047137, "grad_norm": 0.4334588646888733, "learning_rate": 4.716803820543144e-06, "loss": 0.3336, "step": 4709 }, { "epoch": 3.9646464646464645, "grad_norm": 0.39251986145973206, "learning_rate": 4.714708342157646e-06, "loss": 0.3569, "step": 4710 }, { "epoch": 3.9654882154882154, "grad_norm": 0.3521044850349426, "learning_rate": 4.712612914043583e-06, "loss": 0.3182, "step": 4711 }, { "epoch": 3.9663299663299663, "grad_norm": 0.40553444623947144, "learning_rate": 4.710517536570188e-06, "loss": 0.3082, "step": 4712 }, { "epoch": 3.967171717171717, "grad_norm": 0.4378120005130768, "learning_rate": 4.708422210106689e-06, "loss": 0.3587, "step": 4713 }, { "epoch": 3.968013468013468, "grad_norm": 0.3795664310455322, "learning_rate": 4.706326935022306e-06, "loss": 0.3204, "step": 4714 }, { "epoch": 3.968855218855219, "grad_norm": 0.4055972695350647, "learning_rate": 4.704231711686247e-06, "loss": 0.3382, "step": 4715 }, { "epoch": 3.9696969696969697, "grad_norm": 0.4231908321380615, "learning_rate": 4.702136540467712e-06, "loss": 0.3325, "step": 4716 }, { "epoch": 3.9705387205387206, "grad_norm": 0.39152196049690247, "learning_rate": 4.700041421735897e-06, "loss": 0.3301, "step": 4717 }, { "epoch": 3.9713804713804715, "grad_norm": 0.4257064759731293, "learning_rate": 4.697946355859982e-06, "loss": 0.3452, "step": 4718 }, { "epoch": 3.9722222222222223, "grad_norm": 0.4267294406890869, "learning_rate": 4.695851343209137e-06, "loss": 0.312, "step": 4719 }, { "epoch": 3.973063973063973, "grad_norm": 0.451336532831192, "learning_rate": 4.6937563841525295e-06, "loss": 0.3371, "step": 4720 }, { "epoch": 3.973905723905724, "grad_norm": 0.4117458462715149, "learning_rate": 4.691661479059311e-06, "loss": 0.3136, "step": 4721 }, { "epoch": 3.974747474747475, "grad_norm": 0.4764581322669983, "learning_rate": 4.689566628298627e-06, "loss": 0.3604, "step": 4722 }, { "epoch": 3.975589225589226, "grad_norm": 0.4007113575935364, "learning_rate": 4.6874718322396145e-06, "loss": 0.3141, "step": 4723 }, { "epoch": 3.9764309764309766, "grad_norm": 0.4625222682952881, "learning_rate": 4.685377091251396e-06, "loss": 0.3321, "step": 4724 }, { "epoch": 3.9772727272727275, "grad_norm": 0.39231008291244507, "learning_rate": 4.68328240570309e-06, "loss": 0.3088, "step": 4725 }, { "epoch": 3.9781144781144784, "grad_norm": 0.3934188783168793, "learning_rate": 4.681187775963797e-06, "loss": 0.3435, "step": 4726 }, { "epoch": 3.978956228956229, "grad_norm": 0.43073442578315735, "learning_rate": 4.67909320240262e-06, "loss": 0.3318, "step": 4727 }, { "epoch": 3.9797979797979797, "grad_norm": 0.40180012583732605, "learning_rate": 4.676998685388641e-06, "loss": 0.3539, "step": 4728 }, { "epoch": 3.9806397306397305, "grad_norm": 0.38105830550193787, "learning_rate": 4.674904225290938e-06, "loss": 0.3234, "step": 4729 }, { "epoch": 3.9814814814814814, "grad_norm": 0.41326847672462463, "learning_rate": 4.672809822478576e-06, "loss": 0.3337, "step": 4730 }, { "epoch": 3.9823232323232323, "grad_norm": 0.37924444675445557, "learning_rate": 4.670715477320613e-06, "loss": 0.3426, "step": 4731 }, { "epoch": 3.983164983164983, "grad_norm": 0.3684144616127014, "learning_rate": 4.66862119018609e-06, "loss": 0.3413, "step": 4732 }, { "epoch": 3.984006734006734, "grad_norm": 0.4465126395225525, "learning_rate": 4.6665269614440485e-06, "loss": 0.3185, "step": 4733 }, { "epoch": 3.984848484848485, "grad_norm": 0.38720324635505676, "learning_rate": 4.6644327914635115e-06, "loss": 0.3307, "step": 4734 }, { "epoch": 3.9856902356902357, "grad_norm": 0.3869250416755676, "learning_rate": 4.662338680613496e-06, "loss": 0.3428, "step": 4735 }, { "epoch": 3.9865319865319866, "grad_norm": 0.44505009055137634, "learning_rate": 4.660244629263004e-06, "loss": 0.307, "step": 4736 }, { "epoch": 3.9873737373737375, "grad_norm": 0.4023107886314392, "learning_rate": 4.658150637781028e-06, "loss": 0.3178, "step": 4737 }, { "epoch": 3.9882154882154883, "grad_norm": 0.38831189274787903, "learning_rate": 4.656056706536557e-06, "loss": 0.3097, "step": 4738 }, { "epoch": 3.989057239057239, "grad_norm": 0.39900222420692444, "learning_rate": 4.65396283589856e-06, "loss": 0.3148, "step": 4739 }, { "epoch": 3.98989898989899, "grad_norm": 0.40791627764701843, "learning_rate": 4.6518690262360016e-06, "loss": 0.3593, "step": 4740 }, { "epoch": 3.9907407407407405, "grad_norm": 0.4158409833908081, "learning_rate": 4.649775277917832e-06, "loss": 0.3245, "step": 4741 }, { "epoch": 3.9915824915824913, "grad_norm": 0.42738857865333557, "learning_rate": 4.64768159131299e-06, "loss": 0.3182, "step": 4742 }, { "epoch": 3.992424242424242, "grad_norm": 0.37796443700790405, "learning_rate": 4.64558796679041e-06, "loss": 0.3391, "step": 4743 }, { "epoch": 3.993265993265993, "grad_norm": 0.4130061864852905, "learning_rate": 4.643494404719009e-06, "loss": 0.3387, "step": 4744 }, { "epoch": 3.994107744107744, "grad_norm": 0.38417553901672363, "learning_rate": 4.641400905467695e-06, "loss": 0.3261, "step": 4745 }, { "epoch": 3.994949494949495, "grad_norm": 0.3780628442764282, "learning_rate": 4.639307469405365e-06, "loss": 0.3387, "step": 4746 }, { "epoch": 3.9957912457912457, "grad_norm": 0.3886467516422272, "learning_rate": 4.637214096900903e-06, "loss": 0.3411, "step": 4747 }, { "epoch": 3.9966329966329965, "grad_norm": 0.3844238221645355, "learning_rate": 4.635120788323184e-06, "loss": 0.329, "step": 4748 }, { "epoch": 3.9974747474747474, "grad_norm": 0.4154476225376129, "learning_rate": 4.633027544041073e-06, "loss": 0.3491, "step": 4749 }, { "epoch": 3.9983164983164983, "grad_norm": 0.41044819355010986, "learning_rate": 4.630934364423423e-06, "loss": 0.3088, "step": 4750 }, { "epoch": 3.999158249158249, "grad_norm": 0.44278284907341003, "learning_rate": 4.628841249839071e-06, "loss": 0.3484, "step": 4751 }, { "epoch": 4.0, "grad_norm": 0.4137051999568939, "learning_rate": 4.62674820065685e-06, "loss": 0.2957, "step": 4752 }, { "epoch": 4.000841750841751, "grad_norm": 0.457671195268631, "learning_rate": 4.624655217245572e-06, "loss": 0.2847, "step": 4753 }, { "epoch": 4.001683501683502, "grad_norm": 0.3758162260055542, "learning_rate": 4.622562299974048e-06, "loss": 0.2904, "step": 4754 }, { "epoch": 4.002525252525253, "grad_norm": 0.44727909564971924, "learning_rate": 4.620469449211072e-06, "loss": 0.2766, "step": 4755 }, { "epoch": 4.0033670033670035, "grad_norm": 0.4755506217479706, "learning_rate": 4.618376665325424e-06, "loss": 0.2771, "step": 4756 }, { "epoch": 4.004208754208754, "grad_norm": 0.4200167655944824, "learning_rate": 4.616283948685876e-06, "loss": 0.2836, "step": 4757 }, { "epoch": 4.005050505050505, "grad_norm": 0.43848612904548645, "learning_rate": 4.614191299661187e-06, "loss": 0.2913, "step": 4758 }, { "epoch": 4.005892255892256, "grad_norm": 0.45278966426849365, "learning_rate": 4.612098718620106e-06, "loss": 0.2855, "step": 4759 }, { "epoch": 4.006734006734007, "grad_norm": 0.4592246413230896, "learning_rate": 4.6100062059313655e-06, "loss": 0.2784, "step": 4760 }, { "epoch": 4.007575757575758, "grad_norm": 0.42560091614723206, "learning_rate": 4.60791376196369e-06, "loss": 0.2881, "step": 4761 }, { "epoch": 4.008417508417509, "grad_norm": 0.43037864565849304, "learning_rate": 4.605821387085789e-06, "loss": 0.2937, "step": 4762 }, { "epoch": 4.0092592592592595, "grad_norm": 0.4190219044685364, "learning_rate": 4.603729081666363e-06, "loss": 0.274, "step": 4763 }, { "epoch": 4.01010101010101, "grad_norm": 0.43062275648117065, "learning_rate": 4.601636846074094e-06, "loss": 0.2855, "step": 4764 }, { "epoch": 4.010942760942761, "grad_norm": 0.4696842432022095, "learning_rate": 4.5995446806776645e-06, "loss": 0.2627, "step": 4765 }, { "epoch": 4.011784511784512, "grad_norm": 0.45170024037361145, "learning_rate": 4.59745258584573e-06, "loss": 0.2862, "step": 4766 }, { "epoch": 4.012626262626263, "grad_norm": 0.36563464999198914, "learning_rate": 4.595360561946942e-06, "loss": 0.2798, "step": 4767 }, { "epoch": 4.013468013468014, "grad_norm": 0.41150912642478943, "learning_rate": 4.593268609349936e-06, "loss": 0.306, "step": 4768 }, { "epoch": 4.014309764309765, "grad_norm": 0.4403797388076782, "learning_rate": 4.591176728423337e-06, "loss": 0.3013, "step": 4769 }, { "epoch": 4.015151515151516, "grad_norm": 0.378713458776474, "learning_rate": 4.5890849195357575e-06, "loss": 0.2989, "step": 4770 }, { "epoch": 4.015993265993266, "grad_norm": 0.37785398960113525, "learning_rate": 4.586993183055798e-06, "loss": 0.3006, "step": 4771 }, { "epoch": 4.016835016835016, "grad_norm": 0.4569219648838043, "learning_rate": 4.584901519352041e-06, "loss": 0.262, "step": 4772 }, { "epoch": 4.017676767676767, "grad_norm": 0.40951773524284363, "learning_rate": 4.582809928793063e-06, "loss": 0.296, "step": 4773 }, { "epoch": 4.018518518518518, "grad_norm": 0.3997742831707001, "learning_rate": 4.580718411747422e-06, "loss": 0.2782, "step": 4774 }, { "epoch": 4.019360269360269, "grad_norm": 0.4030362069606781, "learning_rate": 4.578626968583668e-06, "loss": 0.264, "step": 4775 }, { "epoch": 4.02020202020202, "grad_norm": 0.44087645411491394, "learning_rate": 4.5765355996703345e-06, "loss": 0.3132, "step": 4776 }, { "epoch": 4.021043771043771, "grad_norm": 0.43315863609313965, "learning_rate": 4.574444305375943e-06, "loss": 0.2814, "step": 4777 }, { "epoch": 4.021885521885522, "grad_norm": 0.4111197292804718, "learning_rate": 4.572353086069001e-06, "loss": 0.3125, "step": 4778 }, { "epoch": 4.0227272727272725, "grad_norm": 0.4101158678531647, "learning_rate": 4.570261942118006e-06, "loss": 0.2856, "step": 4779 }, { "epoch": 4.023569023569023, "grad_norm": 0.4232115149497986, "learning_rate": 4.568170873891437e-06, "loss": 0.3018, "step": 4780 }, { "epoch": 4.024410774410774, "grad_norm": 0.39176055788993835, "learning_rate": 4.5660798817577645e-06, "loss": 0.2847, "step": 4781 }, { "epoch": 4.025252525252525, "grad_norm": 0.3752654194831848, "learning_rate": 4.563988966085444e-06, "loss": 0.2707, "step": 4782 }, { "epoch": 4.026094276094276, "grad_norm": 0.37286731600761414, "learning_rate": 4.5618981272429155e-06, "loss": 0.2935, "step": 4783 }, { "epoch": 4.026936026936027, "grad_norm": 0.38943544030189514, "learning_rate": 4.559807365598609e-06, "loss": 0.2939, "step": 4784 }, { "epoch": 4.027777777777778, "grad_norm": 0.3762945234775543, "learning_rate": 4.557716681520933e-06, "loss": 0.3142, "step": 4785 }, { "epoch": 4.0286195286195285, "grad_norm": 0.39601582288742065, "learning_rate": 4.5556260753782975e-06, "loss": 0.2786, "step": 4786 }, { "epoch": 4.029461279461279, "grad_norm": 0.39140722155570984, "learning_rate": 4.553535547539084e-06, "loss": 0.2953, "step": 4787 }, { "epoch": 4.03030303030303, "grad_norm": 0.404552161693573, "learning_rate": 4.551445098371669e-06, "loss": 0.3085, "step": 4788 }, { "epoch": 4.031144781144781, "grad_norm": 0.37949928641319275, "learning_rate": 4.549354728244407e-06, "loss": 0.3055, "step": 4789 }, { "epoch": 4.031986531986532, "grad_norm": 0.38762617111206055, "learning_rate": 4.547264437525647e-06, "loss": 0.2922, "step": 4790 }, { "epoch": 4.032828282828283, "grad_norm": 0.41108399629592896, "learning_rate": 4.5451742265837205e-06, "loss": 0.3067, "step": 4791 }, { "epoch": 4.033670033670034, "grad_norm": 0.39301469922065735, "learning_rate": 4.543084095786945e-06, "loss": 0.2801, "step": 4792 }, { "epoch": 4.034511784511785, "grad_norm": 0.3760354816913605, "learning_rate": 4.540994045503622e-06, "loss": 0.302, "step": 4793 }, { "epoch": 4.0353535353535355, "grad_norm": 0.3985457122325897, "learning_rate": 4.538904076102043e-06, "loss": 0.2829, "step": 4794 }, { "epoch": 4.036195286195286, "grad_norm": 0.4153105914592743, "learning_rate": 4.5368141879504805e-06, "loss": 0.2681, "step": 4795 }, { "epoch": 4.037037037037037, "grad_norm": 0.36573687195777893, "learning_rate": 4.534724381417195e-06, "loss": 0.2926, "step": 4796 }, { "epoch": 4.037878787878788, "grad_norm": 0.3808704614639282, "learning_rate": 4.532634656870437e-06, "loss": 0.2939, "step": 4797 }, { "epoch": 4.038720538720539, "grad_norm": 0.37710875272750854, "learning_rate": 4.5305450146784345e-06, "loss": 0.2948, "step": 4798 }, { "epoch": 4.03956228956229, "grad_norm": 0.370419442653656, "learning_rate": 4.528455455209405e-06, "loss": 0.3065, "step": 4799 }, { "epoch": 4.040404040404041, "grad_norm": 0.352943480014801, "learning_rate": 4.526365978831551e-06, "loss": 0.296, "step": 4800 }, { "epoch": 4.0412457912457915, "grad_norm": 0.3840220868587494, "learning_rate": 4.524276585913061e-06, "loss": 0.2706, "step": 4801 }, { "epoch": 4.042087542087542, "grad_norm": 0.3920368254184723, "learning_rate": 4.5221872768221095e-06, "loss": 0.2981, "step": 4802 }, { "epoch": 4.042929292929293, "grad_norm": 0.38512519001960754, "learning_rate": 4.520098051926855e-06, "loss": 0.2989, "step": 4803 }, { "epoch": 4.043771043771044, "grad_norm": 0.3798656165599823, "learning_rate": 4.5180089115954375e-06, "loss": 0.3009, "step": 4804 }, { "epoch": 4.044612794612795, "grad_norm": 0.3858184814453125, "learning_rate": 4.515919856195992e-06, "loss": 0.2927, "step": 4805 }, { "epoch": 4.045454545454546, "grad_norm": 0.3746955990791321, "learning_rate": 4.513830886096625e-06, "loss": 0.2767, "step": 4806 }, { "epoch": 4.046296296296297, "grad_norm": 0.39184677600860596, "learning_rate": 4.511742001665441e-06, "loss": 0.2586, "step": 4807 }, { "epoch": 4.047138047138047, "grad_norm": 0.39487311244010925, "learning_rate": 4.509653203270521e-06, "loss": 0.2728, "step": 4808 }, { "epoch": 4.047979797979798, "grad_norm": 0.38909485936164856, "learning_rate": 4.507564491279936e-06, "loss": 0.2767, "step": 4809 }, { "epoch": 4.048821548821548, "grad_norm": 0.38534167408943176, "learning_rate": 4.505475866061737e-06, "loss": 0.291, "step": 4810 }, { "epoch": 4.049663299663299, "grad_norm": 0.38473302125930786, "learning_rate": 4.503387327983963e-06, "loss": 0.2912, "step": 4811 }, { "epoch": 4.05050505050505, "grad_norm": 0.39978864789009094, "learning_rate": 4.501298877414634e-06, "loss": 0.2802, "step": 4812 }, { "epoch": 4.051346801346801, "grad_norm": 0.3800874948501587, "learning_rate": 4.499210514721762e-06, "loss": 0.2837, "step": 4813 }, { "epoch": 4.052188552188552, "grad_norm": 0.43621551990509033, "learning_rate": 4.497122240273337e-06, "loss": 0.2863, "step": 4814 }, { "epoch": 4.053030303030303, "grad_norm": 0.41980746388435364, "learning_rate": 4.495034054437333e-06, "loss": 0.2773, "step": 4815 }, { "epoch": 4.053872053872054, "grad_norm": 0.41091421246528625, "learning_rate": 4.492945957581712e-06, "loss": 0.283, "step": 4816 }, { "epoch": 4.0547138047138045, "grad_norm": 0.412264347076416, "learning_rate": 4.490857950074419e-06, "loss": 0.2987, "step": 4817 }, { "epoch": 4.055555555555555, "grad_norm": 0.40202024579048157, "learning_rate": 4.488770032283386e-06, "loss": 0.295, "step": 4818 }, { "epoch": 4.056397306397306, "grad_norm": 0.40516263246536255, "learning_rate": 4.486682204576522e-06, "loss": 0.2972, "step": 4819 }, { "epoch": 4.057239057239057, "grad_norm": 0.435289591550827, "learning_rate": 4.484594467321727e-06, "loss": 0.278, "step": 4820 }, { "epoch": 4.058080808080808, "grad_norm": 0.3936084806919098, "learning_rate": 4.482506820886881e-06, "loss": 0.2791, "step": 4821 }, { "epoch": 4.058922558922559, "grad_norm": 0.40584877133369446, "learning_rate": 4.480419265639849e-06, "loss": 0.2893, "step": 4822 }, { "epoch": 4.05976430976431, "grad_norm": 0.4289934039115906, "learning_rate": 4.478331801948484e-06, "loss": 0.3011, "step": 4823 }, { "epoch": 4.0606060606060606, "grad_norm": 0.4096466898918152, "learning_rate": 4.476244430180617e-06, "loss": 0.285, "step": 4824 }, { "epoch": 4.061447811447811, "grad_norm": 0.43845808506011963, "learning_rate": 4.474157150704066e-06, "loss": 0.298, "step": 4825 }, { "epoch": 4.062289562289562, "grad_norm": 0.396622896194458, "learning_rate": 4.47206996388663e-06, "loss": 0.274, "step": 4826 }, { "epoch": 4.063131313131313, "grad_norm": 0.42610520124435425, "learning_rate": 4.469982870096095e-06, "loss": 0.2702, "step": 4827 }, { "epoch": 4.063973063973064, "grad_norm": 0.40324005484580994, "learning_rate": 4.467895869700229e-06, "loss": 0.2945, "step": 4828 }, { "epoch": 4.064814814814815, "grad_norm": 0.3966841995716095, "learning_rate": 4.465808963066784e-06, "loss": 0.3011, "step": 4829 }, { "epoch": 4.065656565656566, "grad_norm": 0.3887748718261719, "learning_rate": 4.463722150563496e-06, "loss": 0.2966, "step": 4830 }, { "epoch": 4.066498316498317, "grad_norm": 0.41583251953125, "learning_rate": 4.461635432558081e-06, "loss": 0.2731, "step": 4831 }, { "epoch": 4.0673400673400675, "grad_norm": 0.3808067739009857, "learning_rate": 4.4595488094182434e-06, "loss": 0.2821, "step": 4832 }, { "epoch": 4.068181818181818, "grad_norm": 0.37683022022247314, "learning_rate": 4.457462281511666e-06, "loss": 0.2884, "step": 4833 }, { "epoch": 4.069023569023569, "grad_norm": 0.3788350224494934, "learning_rate": 4.45537584920602e-06, "loss": 0.2763, "step": 4834 }, { "epoch": 4.06986531986532, "grad_norm": 0.39231809973716736, "learning_rate": 4.453289512868957e-06, "loss": 0.3052, "step": 4835 }, { "epoch": 4.070707070707071, "grad_norm": 0.3810238838195801, "learning_rate": 4.451203272868109e-06, "loss": 0.2818, "step": 4836 }, { "epoch": 4.071548821548822, "grad_norm": 0.4420488476753235, "learning_rate": 4.449117129571097e-06, "loss": 0.3251, "step": 4837 }, { "epoch": 4.072390572390573, "grad_norm": 0.3542480766773224, "learning_rate": 4.447031083345516e-06, "loss": 0.3195, "step": 4838 }, { "epoch": 4.0732323232323235, "grad_norm": 0.41823211312294006, "learning_rate": 4.444945134558957e-06, "loss": 0.2915, "step": 4839 }, { "epoch": 4.074074074074074, "grad_norm": 0.4038480818271637, "learning_rate": 4.442859283578982e-06, "loss": 0.2895, "step": 4840 }, { "epoch": 4.074915824915825, "grad_norm": 0.3644173741340637, "learning_rate": 4.440773530773142e-06, "loss": 0.2738, "step": 4841 }, { "epoch": 4.075757575757576, "grad_norm": 0.39574378728866577, "learning_rate": 4.438687876508967e-06, "loss": 0.2795, "step": 4842 }, { "epoch": 4.076599326599327, "grad_norm": 0.3925746977329254, "learning_rate": 4.436602321153974e-06, "loss": 0.314, "step": 4843 }, { "epoch": 4.077441077441078, "grad_norm": 0.37906691431999207, "learning_rate": 4.434516865075656e-06, "loss": 0.2956, "step": 4844 }, { "epoch": 4.078282828282829, "grad_norm": 0.3850056529045105, "learning_rate": 4.432431508641497e-06, "loss": 0.3048, "step": 4845 }, { "epoch": 4.079124579124579, "grad_norm": 0.4145906865596771, "learning_rate": 4.430346252218956e-06, "loss": 0.3032, "step": 4846 }, { "epoch": 4.07996632996633, "grad_norm": 0.3559698462486267, "learning_rate": 4.428261096175481e-06, "loss": 0.2958, "step": 4847 }, { "epoch": 4.08080808080808, "grad_norm": 0.4023480713367462, "learning_rate": 4.426176040878495e-06, "loss": 0.3178, "step": 4848 }, { "epoch": 4.081649831649831, "grad_norm": 0.3769375681877136, "learning_rate": 4.424091086695407e-06, "loss": 0.2836, "step": 4849 }, { "epoch": 4.082491582491582, "grad_norm": 0.38761037588119507, "learning_rate": 4.422006233993611e-06, "loss": 0.2942, "step": 4850 }, { "epoch": 4.083333333333333, "grad_norm": 0.40076908469200134, "learning_rate": 4.4199214831404795e-06, "loss": 0.2808, "step": 4851 }, { "epoch": 4.084175084175084, "grad_norm": 0.3738369047641754, "learning_rate": 4.417836834503367e-06, "loss": 0.2892, "step": 4852 }, { "epoch": 4.085016835016835, "grad_norm": 0.37480735778808594, "learning_rate": 4.41575228844961e-06, "loss": 0.2962, "step": 4853 }, { "epoch": 4.085858585858586, "grad_norm": 0.3916614055633545, "learning_rate": 4.413667845346528e-06, "loss": 0.2766, "step": 4854 }, { "epoch": 4.0867003367003365, "grad_norm": 0.40676572918891907, "learning_rate": 4.411583505561424e-06, "loss": 0.2949, "step": 4855 }, { "epoch": 4.087542087542087, "grad_norm": 0.41584673523902893, "learning_rate": 4.40949926946158e-06, "loss": 0.3096, "step": 4856 }, { "epoch": 4.088383838383838, "grad_norm": 0.3999894857406616, "learning_rate": 4.407415137414259e-06, "loss": 0.2787, "step": 4857 }, { "epoch": 4.089225589225589, "grad_norm": 0.37968024611473083, "learning_rate": 4.405331109786711e-06, "loss": 0.272, "step": 4858 }, { "epoch": 4.09006734006734, "grad_norm": 0.42452409863471985, "learning_rate": 4.40324718694616e-06, "loss": 0.2876, "step": 4859 }, { "epoch": 4.090909090909091, "grad_norm": 0.44313153624534607, "learning_rate": 4.4011633692598145e-06, "loss": 0.2794, "step": 4860 }, { "epoch": 4.091750841750842, "grad_norm": 0.39427706599235535, "learning_rate": 4.39907965709487e-06, "loss": 0.3022, "step": 4861 }, { "epoch": 4.092592592592593, "grad_norm": 0.37300679087638855, "learning_rate": 4.3969960508184975e-06, "loss": 0.3069, "step": 4862 }, { "epoch": 4.093434343434343, "grad_norm": 0.3995746076107025, "learning_rate": 4.394912550797849e-06, "loss": 0.2782, "step": 4863 }, { "epoch": 4.094276094276094, "grad_norm": 0.38217610120773315, "learning_rate": 4.392829157400061e-06, "loss": 0.3104, "step": 4864 }, { "epoch": 4.095117845117845, "grad_norm": 0.41323840618133545, "learning_rate": 4.3907458709922465e-06, "loss": 0.2891, "step": 4865 }, { "epoch": 4.095959595959596, "grad_norm": 0.37817034125328064, "learning_rate": 4.388662691941506e-06, "loss": 0.2975, "step": 4866 }, { "epoch": 4.096801346801347, "grad_norm": 0.4132569432258606, "learning_rate": 4.386579620614918e-06, "loss": 0.3061, "step": 4867 }, { "epoch": 4.097643097643098, "grad_norm": 0.4207150638103485, "learning_rate": 4.38449665737954e-06, "loss": 0.297, "step": 4868 }, { "epoch": 4.098484848484849, "grad_norm": 0.3734053373336792, "learning_rate": 4.382413802602413e-06, "loss": 0.3043, "step": 4869 }, { "epoch": 4.0993265993265995, "grad_norm": 0.3990747034549713, "learning_rate": 4.380331056650557e-06, "loss": 0.3082, "step": 4870 }, { "epoch": 4.10016835016835, "grad_norm": 0.384875625371933, "learning_rate": 4.378248419890977e-06, "loss": 0.3115, "step": 4871 }, { "epoch": 4.101010101010101, "grad_norm": 0.42099177837371826, "learning_rate": 4.376165892690654e-06, "loss": 0.2982, "step": 4872 }, { "epoch": 4.101851851851852, "grad_norm": 0.37107041478157043, "learning_rate": 4.374083475416553e-06, "loss": 0.2836, "step": 4873 }, { "epoch": 4.102693602693603, "grad_norm": 0.3938240110874176, "learning_rate": 4.372001168435615e-06, "loss": 0.3238, "step": 4874 }, { "epoch": 4.103535353535354, "grad_norm": 0.44492200016975403, "learning_rate": 4.369918972114768e-06, "loss": 0.2997, "step": 4875 }, { "epoch": 4.104377104377105, "grad_norm": 0.4014338254928589, "learning_rate": 4.367836886820912e-06, "loss": 0.3196, "step": 4876 }, { "epoch": 4.1052188552188555, "grad_norm": 0.3727763295173645, "learning_rate": 4.36575491292094e-06, "loss": 0.2684, "step": 4877 }, { "epoch": 4.106060606060606, "grad_norm": 0.39445531368255615, "learning_rate": 4.363673050781713e-06, "loss": 0.2631, "step": 4878 }, { "epoch": 4.106902356902357, "grad_norm": 0.3890388607978821, "learning_rate": 4.361591300770079e-06, "loss": 0.3041, "step": 4879 }, { "epoch": 4.107744107744108, "grad_norm": 0.4111047685146332, "learning_rate": 4.359509663252864e-06, "loss": 0.2907, "step": 4880 }, { "epoch": 4.108585858585859, "grad_norm": 0.3710715174674988, "learning_rate": 4.357428138596874e-06, "loss": 0.3091, "step": 4881 }, { "epoch": 4.109427609427609, "grad_norm": 0.36921486258506775, "learning_rate": 4.355346727168898e-06, "loss": 0.2718, "step": 4882 }, { "epoch": 4.11026936026936, "grad_norm": 0.37893420457839966, "learning_rate": 4.353265429335701e-06, "loss": 0.315, "step": 4883 }, { "epoch": 4.111111111111111, "grad_norm": 0.38290655612945557, "learning_rate": 4.35118424546403e-06, "loss": 0.2905, "step": 4884 }, { "epoch": 4.111952861952862, "grad_norm": 0.3942928612232208, "learning_rate": 4.3491031759206145e-06, "loss": 0.3138, "step": 4885 }, { "epoch": 4.1127946127946124, "grad_norm": 0.4192934036254883, "learning_rate": 4.3470222210721554e-06, "loss": 0.264, "step": 4886 }, { "epoch": 4.113636363636363, "grad_norm": 0.38224613666534424, "learning_rate": 4.344941381285344e-06, "loss": 0.3015, "step": 4887 }, { "epoch": 4.114478114478114, "grad_norm": 0.4000622630119324, "learning_rate": 4.342860656926846e-06, "loss": 0.3044, "step": 4888 }, { "epoch": 4.115319865319865, "grad_norm": 0.4023624658584595, "learning_rate": 4.3407800483633065e-06, "loss": 0.2892, "step": 4889 }, { "epoch": 4.116161616161616, "grad_norm": 0.41277220845222473, "learning_rate": 4.338699555961351e-06, "loss": 0.2792, "step": 4890 }, { "epoch": 4.117003367003367, "grad_norm": 0.3803873658180237, "learning_rate": 4.336619180087582e-06, "loss": 0.3089, "step": 4891 }, { "epoch": 4.117845117845118, "grad_norm": 0.44211170077323914, "learning_rate": 4.334538921108586e-06, "loss": 0.2624, "step": 4892 }, { "epoch": 4.1186868686868685, "grad_norm": 0.41121163964271545, "learning_rate": 4.332458779390929e-06, "loss": 0.2756, "step": 4893 }, { "epoch": 4.119528619528619, "grad_norm": 0.38900598883628845, "learning_rate": 4.330378755301151e-06, "loss": 0.2924, "step": 4894 }, { "epoch": 4.12037037037037, "grad_norm": 0.3581814169883728, "learning_rate": 4.328298849205776e-06, "loss": 0.2949, "step": 4895 }, { "epoch": 4.121212121212121, "grad_norm": 0.4126325845718384, "learning_rate": 4.326219061471305e-06, "loss": 0.2881, "step": 4896 }, { "epoch": 4.122053872053872, "grad_norm": 0.46003782749176025, "learning_rate": 4.324139392464217e-06, "loss": 0.3146, "step": 4897 }, { "epoch": 4.122895622895623, "grad_norm": 0.3988182246685028, "learning_rate": 4.322059842550975e-06, "loss": 0.3109, "step": 4898 }, { "epoch": 4.123737373737374, "grad_norm": 0.3483794927597046, "learning_rate": 4.319980412098016e-06, "loss": 0.3312, "step": 4899 }, { "epoch": 4.124579124579125, "grad_norm": 0.38944998383522034, "learning_rate": 4.31790110147176e-06, "loss": 0.2744, "step": 4900 }, { "epoch": 4.125420875420875, "grad_norm": 0.44811776280403137, "learning_rate": 4.315821911038602e-06, "loss": 0.2823, "step": 4901 }, { "epoch": 4.126262626262626, "grad_norm": 0.408237487077713, "learning_rate": 4.313742841164914e-06, "loss": 0.2922, "step": 4902 }, { "epoch": 4.127104377104377, "grad_norm": 0.4361792504787445, "learning_rate": 4.311663892217058e-06, "loss": 0.3048, "step": 4903 }, { "epoch": 4.127946127946128, "grad_norm": 0.4586776793003082, "learning_rate": 4.30958506456136e-06, "loss": 0.2834, "step": 4904 }, { "epoch": 4.128787878787879, "grad_norm": 0.43048012256622314, "learning_rate": 4.307506358564138e-06, "loss": 0.2796, "step": 4905 }, { "epoch": 4.12962962962963, "grad_norm": 0.38747450709342957, "learning_rate": 4.305427774591676e-06, "loss": 0.2901, "step": 4906 }, { "epoch": 4.130471380471381, "grad_norm": 0.5715799331665039, "learning_rate": 4.303349313010245e-06, "loss": 0.2884, "step": 4907 }, { "epoch": 4.1313131313131315, "grad_norm": 0.42111822962760925, "learning_rate": 4.301270974186091e-06, "loss": 0.2624, "step": 4908 }, { "epoch": 4.132154882154882, "grad_norm": 0.3726063370704651, "learning_rate": 4.299192758485444e-06, "loss": 0.3005, "step": 4909 }, { "epoch": 4.132996632996633, "grad_norm": 0.4369526207447052, "learning_rate": 4.2971146662745034e-06, "loss": 0.2866, "step": 4910 }, { "epoch": 4.133838383838384, "grad_norm": 0.41945916414260864, "learning_rate": 4.295036697919453e-06, "loss": 0.272, "step": 4911 }, { "epoch": 4.134680134680135, "grad_norm": 0.3727133572101593, "learning_rate": 4.292958853786451e-06, "loss": 0.2823, "step": 4912 }, { "epoch": 4.135521885521886, "grad_norm": 0.39445358514785767, "learning_rate": 4.290881134241637e-06, "loss": 0.306, "step": 4913 }, { "epoch": 4.136363636363637, "grad_norm": 0.38664817810058594, "learning_rate": 4.2888035396511275e-06, "loss": 0.2921, "step": 4914 }, { "epoch": 4.1372053872053876, "grad_norm": 0.39391329884529114, "learning_rate": 4.286726070381017e-06, "loss": 0.2916, "step": 4915 }, { "epoch": 4.138047138047138, "grad_norm": 0.37473025918006897, "learning_rate": 4.284648726797378e-06, "loss": 0.2825, "step": 4916 }, { "epoch": 4.138888888888889, "grad_norm": 0.38150128722190857, "learning_rate": 4.282571509266261e-06, "loss": 0.3087, "step": 4917 }, { "epoch": 4.13973063973064, "grad_norm": 0.40943488478660583, "learning_rate": 4.28049441815369e-06, "loss": 0.3085, "step": 4918 }, { "epoch": 4.140572390572391, "grad_norm": 0.3905225694179535, "learning_rate": 4.278417453825674e-06, "loss": 0.2696, "step": 4919 }, { "epoch": 4.141414141414141, "grad_norm": 0.3643358647823334, "learning_rate": 4.276340616648198e-06, "loss": 0.2948, "step": 4920 }, { "epoch": 4.142255892255892, "grad_norm": 0.35407915711402893, "learning_rate": 4.274263906987219e-06, "loss": 0.3025, "step": 4921 }, { "epoch": 4.143097643097643, "grad_norm": 0.3886636197566986, "learning_rate": 4.272187325208677e-06, "loss": 0.2776, "step": 4922 }, { "epoch": 4.143939393939394, "grad_norm": 0.4103652834892273, "learning_rate": 4.270110871678489e-06, "loss": 0.2788, "step": 4923 }, { "epoch": 4.1447811447811445, "grad_norm": 0.43444928526878357, "learning_rate": 4.268034546762544e-06, "loss": 0.2891, "step": 4924 }, { "epoch": 4.145622895622895, "grad_norm": 0.4077366292476654, "learning_rate": 4.265958350826718e-06, "loss": 0.2819, "step": 4925 }, { "epoch": 4.146464646464646, "grad_norm": 0.4034850299358368, "learning_rate": 4.263882284236857e-06, "loss": 0.2989, "step": 4926 }, { "epoch": 4.147306397306397, "grad_norm": 0.39693814516067505, "learning_rate": 4.261806347358784e-06, "loss": 0.2914, "step": 4927 }, { "epoch": 4.148148148148148, "grad_norm": 0.3834112584590912, "learning_rate": 4.259730540558306e-06, "loss": 0.304, "step": 4928 }, { "epoch": 4.148989898989899, "grad_norm": 0.41144198179244995, "learning_rate": 4.257654864201194e-06, "loss": 0.2808, "step": 4929 }, { "epoch": 4.14983164983165, "grad_norm": 0.3934287428855896, "learning_rate": 4.2555793186532124e-06, "loss": 0.2877, "step": 4930 }, { "epoch": 4.1506734006734005, "grad_norm": 0.3994236886501312, "learning_rate": 4.253503904280091e-06, "loss": 0.2701, "step": 4931 }, { "epoch": 4.151515151515151, "grad_norm": 0.41412681341171265, "learning_rate": 4.251428621447542e-06, "loss": 0.2984, "step": 4932 }, { "epoch": 4.152356902356902, "grad_norm": 0.3824456036090851, "learning_rate": 4.249353470521248e-06, "loss": 0.2729, "step": 4933 }, { "epoch": 4.153198653198653, "grad_norm": 0.387469619512558, "learning_rate": 4.247278451866876e-06, "loss": 0.324, "step": 4934 }, { "epoch": 4.154040404040404, "grad_norm": 0.41035693883895874, "learning_rate": 4.245203565850067e-06, "loss": 0.2933, "step": 4935 }, { "epoch": 4.154882154882155, "grad_norm": 0.37154191732406616, "learning_rate": 4.2431288128364366e-06, "loss": 0.297, "step": 4936 }, { "epoch": 4.155723905723906, "grad_norm": 0.37819960713386536, "learning_rate": 4.241054193191579e-06, "loss": 0.2939, "step": 4937 }, { "epoch": 4.156565656565657, "grad_norm": 0.4256078898906708, "learning_rate": 4.238979707281063e-06, "loss": 0.3106, "step": 4938 }, { "epoch": 4.157407407407407, "grad_norm": 0.3942992687225342, "learning_rate": 4.236905355470437e-06, "loss": 0.302, "step": 4939 }, { "epoch": 4.158249158249158, "grad_norm": 0.3879372179508209, "learning_rate": 4.234831138125222e-06, "loss": 0.2913, "step": 4940 }, { "epoch": 4.159090909090909, "grad_norm": 0.35316941142082214, "learning_rate": 4.232757055610919e-06, "loss": 0.3036, "step": 4941 }, { "epoch": 4.15993265993266, "grad_norm": 0.4207078814506531, "learning_rate": 4.230683108293002e-06, "loss": 0.2881, "step": 4942 }, { "epoch": 4.160774410774411, "grad_norm": 0.3884431719779968, "learning_rate": 4.228609296536925e-06, "loss": 0.2516, "step": 4943 }, { "epoch": 4.161616161616162, "grad_norm": 0.37146973609924316, "learning_rate": 4.226535620708113e-06, "loss": 0.3042, "step": 4944 }, { "epoch": 4.162457912457913, "grad_norm": 0.4116241931915283, "learning_rate": 4.22446208117197e-06, "loss": 0.2881, "step": 4945 }, { "epoch": 4.1632996632996635, "grad_norm": 0.394206702709198, "learning_rate": 4.2223886782938775e-06, "loss": 0.2868, "step": 4946 }, { "epoch": 4.164141414141414, "grad_norm": 0.40595975518226624, "learning_rate": 4.220315412439191e-06, "loss": 0.2958, "step": 4947 }, { "epoch": 4.164983164983165, "grad_norm": 0.4150819182395935, "learning_rate": 4.218242283973241e-06, "loss": 0.3054, "step": 4948 }, { "epoch": 4.165824915824916, "grad_norm": 0.40776216983795166, "learning_rate": 4.216169293261335e-06, "loss": 0.2803, "step": 4949 }, { "epoch": 4.166666666666667, "grad_norm": 0.4075741767883301, "learning_rate": 4.214096440668755e-06, "loss": 0.2912, "step": 4950 }, { "epoch": 4.167508417508418, "grad_norm": 0.39652228355407715, "learning_rate": 4.2120237265607624e-06, "loss": 0.2968, "step": 4951 }, { "epoch": 4.168350168350169, "grad_norm": 0.440480500459671, "learning_rate": 4.20995115130259e-06, "loss": 0.2968, "step": 4952 }, { "epoch": 4.16919191919192, "grad_norm": 0.39293375611305237, "learning_rate": 4.207878715259448e-06, "loss": 0.3117, "step": 4953 }, { "epoch": 4.17003367003367, "grad_norm": 0.4190816879272461, "learning_rate": 4.205806418796522e-06, "loss": 0.2806, "step": 4954 }, { "epoch": 4.170875420875421, "grad_norm": 0.39743080735206604, "learning_rate": 4.203734262278972e-06, "loss": 0.3042, "step": 4955 }, { "epoch": 4.171717171717171, "grad_norm": 0.3822762668132782, "learning_rate": 4.201662246071932e-06, "loss": 0.3031, "step": 4956 }, { "epoch": 4.172558922558922, "grad_norm": 0.3652905821800232, "learning_rate": 4.199590370540518e-06, "loss": 0.2703, "step": 4957 }, { "epoch": 4.173400673400673, "grad_norm": 0.38586828112602234, "learning_rate": 4.197518636049814e-06, "loss": 0.2912, "step": 4958 }, { "epoch": 4.174242424242424, "grad_norm": 0.3974781632423401, "learning_rate": 4.195447042964881e-06, "loss": 0.2906, "step": 4959 }, { "epoch": 4.175084175084175, "grad_norm": 0.3814254701137543, "learning_rate": 4.193375591650758e-06, "loss": 0.2737, "step": 4960 }, { "epoch": 4.175925925925926, "grad_norm": 0.3877207040786743, "learning_rate": 4.191304282472453e-06, "loss": 0.2854, "step": 4961 }, { "epoch": 4.1767676767676765, "grad_norm": 0.39355507493019104, "learning_rate": 4.189233115794959e-06, "loss": 0.3083, "step": 4962 }, { "epoch": 4.177609427609427, "grad_norm": 0.38206109404563904, "learning_rate": 4.187162091983231e-06, "loss": 0.2798, "step": 4963 }, { "epoch": 4.178451178451178, "grad_norm": 0.4102775454521179, "learning_rate": 4.185091211402211e-06, "loss": 0.2957, "step": 4964 }, { "epoch": 4.179292929292929, "grad_norm": 0.37904876470565796, "learning_rate": 4.183020474416805e-06, "loss": 0.2899, "step": 4965 }, { "epoch": 4.18013468013468, "grad_norm": 0.4114772379398346, "learning_rate": 4.180949881391901e-06, "loss": 0.2807, "step": 4966 }, { "epoch": 4.180976430976431, "grad_norm": 0.41866427659988403, "learning_rate": 4.178879432692362e-06, "loss": 0.2837, "step": 4967 }, { "epoch": 4.181818181818182, "grad_norm": 0.3976939618587494, "learning_rate": 4.1768091286830224e-06, "loss": 0.2971, "step": 4968 }, { "epoch": 4.1826599326599325, "grad_norm": 0.3888792097568512, "learning_rate": 4.174738969728689e-06, "loss": 0.2888, "step": 4969 }, { "epoch": 4.183501683501683, "grad_norm": 0.40224602818489075, "learning_rate": 4.172668956194149e-06, "loss": 0.2904, "step": 4970 }, { "epoch": 4.184343434343434, "grad_norm": 0.3705733120441437, "learning_rate": 4.170599088444157e-06, "loss": 0.3215, "step": 4971 }, { "epoch": 4.185185185185185, "grad_norm": 0.3981429934501648, "learning_rate": 4.168529366843449e-06, "loss": 0.3031, "step": 4972 }, { "epoch": 4.186026936026936, "grad_norm": 0.3929820656776428, "learning_rate": 4.166459791756733e-06, "loss": 0.3022, "step": 4973 }, { "epoch": 4.186868686868687, "grad_norm": 0.3879941999912262, "learning_rate": 4.164390363548687e-06, "loss": 0.2717, "step": 4974 }, { "epoch": 4.187710437710438, "grad_norm": 0.3902603089809418, "learning_rate": 4.162321082583967e-06, "loss": 0.2747, "step": 4975 }, { "epoch": 4.188552188552189, "grad_norm": 0.4159656763076782, "learning_rate": 4.1602519492272054e-06, "loss": 0.2664, "step": 4976 }, { "epoch": 4.1893939393939394, "grad_norm": 0.39042460918426514, "learning_rate": 4.158182963843001e-06, "loss": 0.3008, "step": 4977 }, { "epoch": 4.19023569023569, "grad_norm": 0.4183378219604492, "learning_rate": 4.156114126795934e-06, "loss": 0.2726, "step": 4978 }, { "epoch": 4.191077441077441, "grad_norm": 0.3819662928581238, "learning_rate": 4.154045438450556e-06, "loss": 0.3119, "step": 4979 }, { "epoch": 4.191919191919192, "grad_norm": 0.3711578845977783, "learning_rate": 4.151976899171389e-06, "loss": 0.2897, "step": 4980 }, { "epoch": 4.192760942760943, "grad_norm": 0.36313214898109436, "learning_rate": 4.149908509322936e-06, "loss": 0.2856, "step": 4981 }, { "epoch": 4.193602693602694, "grad_norm": 0.44957777857780457, "learning_rate": 4.147840269269663e-06, "loss": 0.2697, "step": 4982 }, { "epoch": 4.194444444444445, "grad_norm": 0.40622478723526, "learning_rate": 4.145772179376023e-06, "loss": 0.2918, "step": 4983 }, { "epoch": 4.1952861952861955, "grad_norm": 0.3739442825317383, "learning_rate": 4.143704240006431e-06, "loss": 0.2951, "step": 4984 }, { "epoch": 4.196127946127946, "grad_norm": 0.4152749180793762, "learning_rate": 4.141636451525283e-06, "loss": 0.2599, "step": 4985 }, { "epoch": 4.196969696969697, "grad_norm": 0.3844965994358063, "learning_rate": 4.139568814296942e-06, "loss": 0.3038, "step": 4986 }, { "epoch": 4.197811447811448, "grad_norm": 0.38955050706863403, "learning_rate": 4.13750132868575e-06, "loss": 0.3196, "step": 4987 }, { "epoch": 4.198653198653199, "grad_norm": 0.3971916437149048, "learning_rate": 4.135433995056018e-06, "loss": 0.3184, "step": 4988 }, { "epoch": 4.19949494949495, "grad_norm": 0.4157247245311737, "learning_rate": 4.133366813772034e-06, "loss": 0.264, "step": 4989 }, { "epoch": 4.200336700336701, "grad_norm": 0.3976524770259857, "learning_rate": 4.131299785198058e-06, "loss": 0.3083, "step": 4990 }, { "epoch": 4.201178451178452, "grad_norm": 0.38295525312423706, "learning_rate": 4.129232909698321e-06, "loss": 0.2849, "step": 4991 }, { "epoch": 4.202020202020202, "grad_norm": 0.3833342492580414, "learning_rate": 4.127166187637028e-06, "loss": 0.308, "step": 4992 }, { "epoch": 4.202861952861953, "grad_norm": 0.3898562788963318, "learning_rate": 4.125099619378358e-06, "loss": 0.2959, "step": 4993 }, { "epoch": 4.203703703703703, "grad_norm": 0.3919023275375366, "learning_rate": 4.123033205286465e-06, "loss": 0.2879, "step": 4994 }, { "epoch": 4.204545454545454, "grad_norm": 0.4299560785293579, "learning_rate": 4.12096694572547e-06, "loss": 0.2547, "step": 4995 }, { "epoch": 4.205387205387205, "grad_norm": 0.40446728467941284, "learning_rate": 4.1189008410594715e-06, "loss": 0.2789, "step": 4996 }, { "epoch": 4.206228956228956, "grad_norm": 0.4023892879486084, "learning_rate": 4.116834891652538e-06, "loss": 0.2695, "step": 4997 }, { "epoch": 4.207070707070707, "grad_norm": 0.4045135974884033, "learning_rate": 4.11476909786871e-06, "loss": 0.3134, "step": 4998 }, { "epoch": 4.207912457912458, "grad_norm": 0.40832042694091797, "learning_rate": 4.112703460072007e-06, "loss": 0.31, "step": 4999 }, { "epoch": 4.2087542087542085, "grad_norm": 0.38857245445251465, "learning_rate": 4.110637978626415e-06, "loss": 0.2975, "step": 5000 }, { "epoch": 4.209595959595959, "grad_norm": 0.383675217628479, "learning_rate": 4.108572653895893e-06, "loss": 0.2835, "step": 5001 }, { "epoch": 4.21043771043771, "grad_norm": 0.41540515422821045, "learning_rate": 4.106507486244373e-06, "loss": 0.3114, "step": 5002 }, { "epoch": 4.211279461279461, "grad_norm": 0.3738672435283661, "learning_rate": 4.1044424760357596e-06, "loss": 0.2903, "step": 5003 }, { "epoch": 4.212121212121212, "grad_norm": 0.3661859333515167, "learning_rate": 4.102377623633928e-06, "loss": 0.286, "step": 5004 }, { "epoch": 4.212962962962963, "grad_norm": 0.3538668155670166, "learning_rate": 4.10031292940273e-06, "loss": 0.3095, "step": 5005 }, { "epoch": 4.213804713804714, "grad_norm": 0.3877775967121124, "learning_rate": 4.098248393705987e-06, "loss": 0.28, "step": 5006 }, { "epoch": 4.2146464646464645, "grad_norm": 0.510622501373291, "learning_rate": 4.096184016907491e-06, "loss": 0.3087, "step": 5007 }, { "epoch": 4.215488215488215, "grad_norm": 0.37807050347328186, "learning_rate": 4.094119799371007e-06, "loss": 0.2745, "step": 5008 }, { "epoch": 4.216329966329966, "grad_norm": 0.4175305664539337, "learning_rate": 4.0920557414602705e-06, "loss": 0.274, "step": 5009 }, { "epoch": 4.217171717171717, "grad_norm": 0.39245378971099854, "learning_rate": 4.089991843538993e-06, "loss": 0.3055, "step": 5010 }, { "epoch": 4.218013468013468, "grad_norm": 0.3937339186668396, "learning_rate": 4.0879281059708565e-06, "loss": 0.2805, "step": 5011 }, { "epoch": 4.218855218855219, "grad_norm": 0.4053780734539032, "learning_rate": 4.08586452911951e-06, "loss": 0.2907, "step": 5012 }, { "epoch": 4.21969696969697, "grad_norm": 0.417531281709671, "learning_rate": 4.08380111334858e-06, "loss": 0.2771, "step": 5013 }, { "epoch": 4.220538720538721, "grad_norm": 0.4108983874320984, "learning_rate": 4.081737859021661e-06, "loss": 0.3009, "step": 5014 }, { "epoch": 4.2213804713804715, "grad_norm": 0.3712334930896759, "learning_rate": 4.079674766502323e-06, "loss": 0.3052, "step": 5015 }, { "epoch": 4.222222222222222, "grad_norm": 0.41876131296157837, "learning_rate": 4.0776118361541014e-06, "loss": 0.289, "step": 5016 }, { "epoch": 4.223063973063973, "grad_norm": 0.4006138741970062, "learning_rate": 4.0755490683405105e-06, "loss": 0.2825, "step": 5017 }, { "epoch": 4.223905723905724, "grad_norm": 0.39684590697288513, "learning_rate": 4.073486463425027e-06, "loss": 0.2638, "step": 5018 }, { "epoch": 4.224747474747475, "grad_norm": 0.392235666513443, "learning_rate": 4.0714240217711095e-06, "loss": 0.2876, "step": 5019 }, { "epoch": 4.225589225589226, "grad_norm": 0.3703255355358124, "learning_rate": 4.069361743742175e-06, "loss": 0.2795, "step": 5020 }, { "epoch": 4.226430976430977, "grad_norm": 0.3754202425479889, "learning_rate": 4.067299629701627e-06, "loss": 0.2948, "step": 5021 }, { "epoch": 4.2272727272727275, "grad_norm": 0.3954675793647766, "learning_rate": 4.0652376800128266e-06, "loss": 0.2779, "step": 5022 }, { "epoch": 4.228114478114478, "grad_norm": 0.39151838421821594, "learning_rate": 4.063175895039114e-06, "loss": 0.2711, "step": 5023 }, { "epoch": 4.228956228956229, "grad_norm": 0.4045582115650177, "learning_rate": 4.061114275143796e-06, "loss": 0.2602, "step": 5024 }, { "epoch": 4.22979797979798, "grad_norm": 0.381033331155777, "learning_rate": 4.059052820690151e-06, "loss": 0.2998, "step": 5025 }, { "epoch": 4.230639730639731, "grad_norm": 0.39791300892829895, "learning_rate": 4.056991532041433e-06, "loss": 0.3061, "step": 5026 }, { "epoch": 4.231481481481482, "grad_norm": 0.3961678445339203, "learning_rate": 4.05493040956086e-06, "loss": 0.3046, "step": 5027 }, { "epoch": 4.232323232323233, "grad_norm": 0.3972451686859131, "learning_rate": 4.0528694536116245e-06, "loss": 0.302, "step": 5028 }, { "epoch": 4.233164983164984, "grad_norm": 0.38233157992362976, "learning_rate": 4.05080866455689e-06, "loss": 0.2924, "step": 5029 }, { "epoch": 4.2340067340067336, "grad_norm": 0.3699395954608917, "learning_rate": 4.048748042759786e-06, "loss": 0.2801, "step": 5030 }, { "epoch": 4.234848484848484, "grad_norm": 0.407847136259079, "learning_rate": 4.04668758858342e-06, "loss": 0.3104, "step": 5031 }, { "epoch": 4.235690235690235, "grad_norm": 0.4145480990409851, "learning_rate": 4.044627302390866e-06, "loss": 0.3039, "step": 5032 }, { "epoch": 4.236531986531986, "grad_norm": 0.38868650794029236, "learning_rate": 4.042567184545166e-06, "loss": 0.29, "step": 5033 }, { "epoch": 4.237373737373737, "grad_norm": 0.3899260461330414, "learning_rate": 4.040507235409337e-06, "loss": 0.3086, "step": 5034 }, { "epoch": 4.238215488215488, "grad_norm": 0.39527931809425354, "learning_rate": 4.03844745534636e-06, "loss": 0.2777, "step": 5035 }, { "epoch": 4.239057239057239, "grad_norm": 0.41597622632980347, "learning_rate": 4.036387844719193e-06, "loss": 0.2964, "step": 5036 }, { "epoch": 4.23989898989899, "grad_norm": 0.38072705268859863, "learning_rate": 4.034328403890762e-06, "loss": 0.2716, "step": 5037 }, { "epoch": 4.2407407407407405, "grad_norm": 0.39815637469291687, "learning_rate": 4.032269133223963e-06, "loss": 0.2869, "step": 5038 }, { "epoch": 4.241582491582491, "grad_norm": 0.41650131344795227, "learning_rate": 4.030210033081659e-06, "loss": 0.285, "step": 5039 }, { "epoch": 4.242424242424242, "grad_norm": 0.40685227513313293, "learning_rate": 4.028151103826687e-06, "loss": 0.2867, "step": 5040 }, { "epoch": 4.243265993265993, "grad_norm": 0.39043697714805603, "learning_rate": 4.026092345821851e-06, "loss": 0.2861, "step": 5041 }, { "epoch": 4.244107744107744, "grad_norm": 0.4330759346485138, "learning_rate": 4.024033759429927e-06, "loss": 0.3055, "step": 5042 }, { "epoch": 4.244949494949495, "grad_norm": 0.38438594341278076, "learning_rate": 4.02197534501366e-06, "loss": 0.3203, "step": 5043 }, { "epoch": 4.245791245791246, "grad_norm": 0.4053633213043213, "learning_rate": 4.0199171029357646e-06, "loss": 0.3014, "step": 5044 }, { "epoch": 4.2466329966329965, "grad_norm": 0.42170557379722595, "learning_rate": 4.017859033558925e-06, "loss": 0.3134, "step": 5045 }, { "epoch": 4.247474747474747, "grad_norm": 0.396676242351532, "learning_rate": 4.015801137245793e-06, "loss": 0.3106, "step": 5046 }, { "epoch": 4.248316498316498, "grad_norm": 0.382599413394928, "learning_rate": 4.013743414358996e-06, "loss": 0.2943, "step": 5047 }, { "epoch": 4.249158249158249, "grad_norm": 0.4096616804599762, "learning_rate": 4.011685865261122e-06, "loss": 0.2662, "step": 5048 }, { "epoch": 4.25, "grad_norm": 0.3644796907901764, "learning_rate": 4.0096284903147375e-06, "loss": 0.2778, "step": 5049 }, { "epoch": 4.250841750841751, "grad_norm": 0.39572814106941223, "learning_rate": 4.00757128988237e-06, "loss": 0.2786, "step": 5050 }, { "epoch": 4.251683501683502, "grad_norm": 0.4262208044528961, "learning_rate": 4.005514264326523e-06, "loss": 0.3086, "step": 5051 }, { "epoch": 4.252525252525253, "grad_norm": 0.4099382162094116, "learning_rate": 4.0034574140096636e-06, "loss": 0.304, "step": 5052 }, { "epoch": 4.2533670033670035, "grad_norm": 0.38543033599853516, "learning_rate": 4.001400739294235e-06, "loss": 0.3042, "step": 5053 }, { "epoch": 4.254208754208754, "grad_norm": 0.43121957778930664, "learning_rate": 3.999344240542641e-06, "loss": 0.2959, "step": 5054 }, { "epoch": 4.255050505050505, "grad_norm": 0.4326308071613312, "learning_rate": 3.997287918117262e-06, "loss": 0.2897, "step": 5055 }, { "epoch": 4.255892255892256, "grad_norm": 0.42239657044410706, "learning_rate": 3.9952317723804406e-06, "loss": 0.2695, "step": 5056 }, { "epoch": 4.256734006734007, "grad_norm": 0.38155397772789, "learning_rate": 3.993175803694491e-06, "loss": 0.2828, "step": 5057 }, { "epoch": 4.257575757575758, "grad_norm": 0.3857765793800354, "learning_rate": 3.991120012421702e-06, "loss": 0.3131, "step": 5058 }, { "epoch": 4.258417508417509, "grad_norm": 0.40919676423072815, "learning_rate": 3.9890643989243225e-06, "loss": 0.3067, "step": 5059 }, { "epoch": 4.2592592592592595, "grad_norm": 0.394138902425766, "learning_rate": 3.987008963564573e-06, "loss": 0.2956, "step": 5060 }, { "epoch": 4.26010101010101, "grad_norm": 0.3879416584968567, "learning_rate": 3.984953706704647e-06, "loss": 0.2895, "step": 5061 }, { "epoch": 4.260942760942761, "grad_norm": 0.4001677930355072, "learning_rate": 3.982898628706696e-06, "loss": 0.2945, "step": 5062 }, { "epoch": 4.261784511784512, "grad_norm": 0.39351627230644226, "learning_rate": 3.9808437299328516e-06, "loss": 0.2889, "step": 5063 }, { "epoch": 4.262626262626263, "grad_norm": 0.41048935055732727, "learning_rate": 3.978789010745208e-06, "loss": 0.2774, "step": 5064 }, { "epoch": 4.263468013468014, "grad_norm": 0.4109102189540863, "learning_rate": 3.976734471505828e-06, "loss": 0.2978, "step": 5065 }, { "epoch": 4.264309764309765, "grad_norm": 0.41158705949783325, "learning_rate": 3.974680112576744e-06, "loss": 0.2978, "step": 5066 }, { "epoch": 4.265151515151516, "grad_norm": 0.3668223023414612, "learning_rate": 3.972625934319956e-06, "loss": 0.2773, "step": 5067 }, { "epoch": 4.2659932659932664, "grad_norm": 0.4613923132419586, "learning_rate": 3.970571937097429e-06, "loss": 0.2814, "step": 5068 }, { "epoch": 4.266835016835016, "grad_norm": 0.4552437663078308, "learning_rate": 3.968518121271104e-06, "loss": 0.3116, "step": 5069 }, { "epoch": 4.267676767676767, "grad_norm": 0.4035019874572754, "learning_rate": 3.9664644872028835e-06, "loss": 0.2924, "step": 5070 }, { "epoch": 4.268518518518518, "grad_norm": 0.39018282294273376, "learning_rate": 3.9644110352546384e-06, "loss": 0.3014, "step": 5071 }, { "epoch": 4.269360269360269, "grad_norm": 0.42447224259376526, "learning_rate": 3.96235776578821e-06, "loss": 0.2858, "step": 5072 }, { "epoch": 4.27020202020202, "grad_norm": 0.4183237552642822, "learning_rate": 3.960304679165404e-06, "loss": 0.2976, "step": 5073 }, { "epoch": 4.271043771043771, "grad_norm": 0.40104418992996216, "learning_rate": 3.958251775748001e-06, "loss": 0.32, "step": 5074 }, { "epoch": 4.271885521885522, "grad_norm": 0.40387651324272156, "learning_rate": 3.9561990558977406e-06, "loss": 0.2946, "step": 5075 }, { "epoch": 4.2727272727272725, "grad_norm": 0.4061674475669861, "learning_rate": 3.954146519976337e-06, "loss": 0.295, "step": 5076 }, { "epoch": 4.273569023569023, "grad_norm": 0.3975258767604828, "learning_rate": 3.952094168345464e-06, "loss": 0.2661, "step": 5077 }, { "epoch": 4.274410774410774, "grad_norm": 0.380655437707901, "learning_rate": 3.950042001366771e-06, "loss": 0.2908, "step": 5078 }, { "epoch": 4.275252525252525, "grad_norm": 0.3895055055618286, "learning_rate": 3.947990019401873e-06, "loss": 0.3072, "step": 5079 }, { "epoch": 4.276094276094276, "grad_norm": 0.45697832107543945, "learning_rate": 3.9459382228123475e-06, "loss": 0.2795, "step": 5080 }, { "epoch": 4.276936026936027, "grad_norm": 0.4239952564239502, "learning_rate": 3.943886611959746e-06, "loss": 0.285, "step": 5081 }, { "epoch": 4.277777777777778, "grad_norm": 0.3881807327270508, "learning_rate": 3.941835187205584e-06, "loss": 0.2785, "step": 5082 }, { "epoch": 4.2786195286195285, "grad_norm": 0.4140184819698334, "learning_rate": 3.939783948911343e-06, "loss": 0.3088, "step": 5083 }, { "epoch": 4.279461279461279, "grad_norm": 0.38822633028030396, "learning_rate": 3.937732897438472e-06, "loss": 0.3039, "step": 5084 }, { "epoch": 4.28030303030303, "grad_norm": 0.3904826045036316, "learning_rate": 3.9356820331483915e-06, "loss": 0.2602, "step": 5085 }, { "epoch": 4.281144781144781, "grad_norm": 0.38393792510032654, "learning_rate": 3.9336313564024835e-06, "loss": 0.2986, "step": 5086 }, { "epoch": 4.281986531986532, "grad_norm": 0.3969052731990814, "learning_rate": 3.9315808675621e-06, "loss": 0.2947, "step": 5087 }, { "epoch": 4.282828282828283, "grad_norm": 0.38609206676483154, "learning_rate": 3.929530566988558e-06, "loss": 0.2936, "step": 5088 }, { "epoch": 4.283670033670034, "grad_norm": 0.36668577790260315, "learning_rate": 3.927480455043142e-06, "loss": 0.3229, "step": 5089 }, { "epoch": 4.284511784511785, "grad_norm": 0.41055697202682495, "learning_rate": 3.9254305320871046e-06, "loss": 0.2885, "step": 5090 }, { "epoch": 4.2853535353535355, "grad_norm": 0.41756191849708557, "learning_rate": 3.923380798481665e-06, "loss": 0.3142, "step": 5091 }, { "epoch": 4.286195286195286, "grad_norm": 0.412464439868927, "learning_rate": 3.921331254588007e-06, "loss": 0.2922, "step": 5092 }, { "epoch": 4.287037037037037, "grad_norm": 0.37696373462677, "learning_rate": 3.919281900767282e-06, "loss": 0.2992, "step": 5093 }, { "epoch": 4.287878787878788, "grad_norm": 0.40404900908470154, "learning_rate": 3.917232737380607e-06, "loss": 0.2715, "step": 5094 }, { "epoch": 4.288720538720539, "grad_norm": 0.42308565974235535, "learning_rate": 3.915183764789068e-06, "loss": 0.3134, "step": 5095 }, { "epoch": 4.28956228956229, "grad_norm": 0.4032427668571472, "learning_rate": 3.913134983353714e-06, "loss": 0.2756, "step": 5096 }, { "epoch": 4.290404040404041, "grad_norm": 0.37677428126335144, "learning_rate": 3.911086393435567e-06, "loss": 0.305, "step": 5097 }, { "epoch": 4.2912457912457915, "grad_norm": 0.39705830812454224, "learning_rate": 3.909037995395604e-06, "loss": 0.2736, "step": 5098 }, { "epoch": 4.292087542087542, "grad_norm": 0.4003625512123108, "learning_rate": 3.906989789594779e-06, "loss": 0.2789, "step": 5099 }, { "epoch": 4.292929292929293, "grad_norm": 0.41180428862571716, "learning_rate": 3.904941776394005e-06, "loss": 0.2804, "step": 5100 }, { "epoch": 4.293771043771044, "grad_norm": 0.4135023355484009, "learning_rate": 3.902893956154164e-06, "loss": 0.2887, "step": 5101 }, { "epoch": 4.294612794612795, "grad_norm": 0.4061472415924072, "learning_rate": 3.900846329236107e-06, "loss": 0.2899, "step": 5102 }, { "epoch": 4.295454545454546, "grad_norm": 0.38670581579208374, "learning_rate": 3.898798896000644e-06, "loss": 0.2942, "step": 5103 }, { "epoch": 4.296296296296296, "grad_norm": 0.3947773575782776, "learning_rate": 3.896751656808556e-06, "loss": 0.2879, "step": 5104 }, { "epoch": 4.297138047138047, "grad_norm": 0.41527682542800903, "learning_rate": 3.8947046120205875e-06, "loss": 0.2745, "step": 5105 }, { "epoch": 4.297979797979798, "grad_norm": 0.4062865674495697, "learning_rate": 3.892657761997451e-06, "loss": 0.2837, "step": 5106 }, { "epoch": 4.298821548821548, "grad_norm": 0.39239194989204407, "learning_rate": 3.890611107099823e-06, "loss": 0.2895, "step": 5107 }, { "epoch": 4.299663299663299, "grad_norm": 0.4320763051509857, "learning_rate": 3.888564647688347e-06, "loss": 0.2832, "step": 5108 }, { "epoch": 4.30050505050505, "grad_norm": 0.4084832966327667, "learning_rate": 3.886518384123628e-06, "loss": 0.2906, "step": 5109 }, { "epoch": 4.301346801346801, "grad_norm": 0.44686466455459595, "learning_rate": 3.884472316766239e-06, "loss": 0.2757, "step": 5110 }, { "epoch": 4.302188552188552, "grad_norm": 0.39817336201667786, "learning_rate": 3.882426445976723e-06, "loss": 0.2905, "step": 5111 }, { "epoch": 4.303030303030303, "grad_norm": 0.3793126344680786, "learning_rate": 3.880380772115583e-06, "loss": 0.2991, "step": 5112 }, { "epoch": 4.303872053872054, "grad_norm": 0.41483423113822937, "learning_rate": 3.8783352955432865e-06, "loss": 0.2746, "step": 5113 }, { "epoch": 4.3047138047138045, "grad_norm": 0.43181121349334717, "learning_rate": 3.876290016620271e-06, "loss": 0.3012, "step": 5114 }, { "epoch": 4.305555555555555, "grad_norm": 0.4272926449775696, "learning_rate": 3.874244935706933e-06, "loss": 0.2977, "step": 5115 }, { "epoch": 4.306397306397306, "grad_norm": 0.40822911262512207, "learning_rate": 3.872200053163639e-06, "loss": 0.2864, "step": 5116 }, { "epoch": 4.307239057239057, "grad_norm": 0.40509024262428284, "learning_rate": 3.8701553693507225e-06, "loss": 0.2904, "step": 5117 }, { "epoch": 4.308080808080808, "grad_norm": 0.4040939509868622, "learning_rate": 3.868110884628473e-06, "loss": 0.3189, "step": 5118 }, { "epoch": 4.308922558922559, "grad_norm": 0.4014407694339752, "learning_rate": 3.866066599357154e-06, "loss": 0.2589, "step": 5119 }, { "epoch": 4.30976430976431, "grad_norm": 0.38993218541145325, "learning_rate": 3.86402251389699e-06, "loss": 0.306, "step": 5120 }, { "epoch": 4.3106060606060606, "grad_norm": 0.39449983835220337, "learning_rate": 3.861978628608169e-06, "loss": 0.2934, "step": 5121 }, { "epoch": 4.311447811447811, "grad_norm": 0.412103533744812, "learning_rate": 3.8599349438508475e-06, "loss": 0.2907, "step": 5122 }, { "epoch": 4.312289562289562, "grad_norm": 0.4066225588321686, "learning_rate": 3.857891459985144e-06, "loss": 0.2922, "step": 5123 }, { "epoch": 4.313131313131313, "grad_norm": 0.3880312740802765, "learning_rate": 3.855848177371141e-06, "loss": 0.2871, "step": 5124 }, { "epoch": 4.313973063973064, "grad_norm": 0.4238545894622803, "learning_rate": 3.853805096368888e-06, "loss": 0.3069, "step": 5125 }, { "epoch": 4.314814814814815, "grad_norm": 0.3988577127456665, "learning_rate": 3.851762217338395e-06, "loss": 0.2885, "step": 5126 }, { "epoch": 4.315656565656566, "grad_norm": 0.39502277970314026, "learning_rate": 3.849719540639644e-06, "loss": 0.3143, "step": 5127 }, { "epoch": 4.316498316498317, "grad_norm": 0.3894442915916443, "learning_rate": 3.847677066632572e-06, "loss": 0.296, "step": 5128 }, { "epoch": 4.3173400673400675, "grad_norm": 0.416208952665329, "learning_rate": 3.845634795677087e-06, "loss": 0.2848, "step": 5129 }, { "epoch": 4.318181818181818, "grad_norm": 0.39227741956710815, "learning_rate": 3.8435927281330585e-06, "loss": 0.3066, "step": 5130 }, { "epoch": 4.319023569023569, "grad_norm": 0.40007728338241577, "learning_rate": 3.84155086436032e-06, "loss": 0.2784, "step": 5131 }, { "epoch": 4.31986531986532, "grad_norm": 0.4089445471763611, "learning_rate": 3.839509204718667e-06, "loss": 0.2853, "step": 5132 }, { "epoch": 4.320707070707071, "grad_norm": 0.3907436430454254, "learning_rate": 3.8374677495678686e-06, "loss": 0.3103, "step": 5133 }, { "epoch": 4.321548821548822, "grad_norm": 0.4007527232170105, "learning_rate": 3.835426499267645e-06, "loss": 0.2943, "step": 5134 }, { "epoch": 4.322390572390573, "grad_norm": 0.3779112696647644, "learning_rate": 3.83338545417769e-06, "loss": 0.3064, "step": 5135 }, { "epoch": 4.3232323232323235, "grad_norm": 0.3867422342300415, "learning_rate": 3.831344614657654e-06, "loss": 0.3057, "step": 5136 }, { "epoch": 4.324074074074074, "grad_norm": 0.3815416395664215, "learning_rate": 3.829303981067156e-06, "loss": 0.3015, "step": 5137 }, { "epoch": 4.324915824915825, "grad_norm": 0.37700191140174866, "learning_rate": 3.82726355376578e-06, "loss": 0.2723, "step": 5138 }, { "epoch": 4.325757575757576, "grad_norm": 0.38842329382896423, "learning_rate": 3.825223333113067e-06, "loss": 0.3169, "step": 5139 }, { "epoch": 4.326599326599327, "grad_norm": 0.39135491847991943, "learning_rate": 3.823183319468529e-06, "loss": 0.2742, "step": 5140 }, { "epoch": 4.327441077441078, "grad_norm": 0.39452531933784485, "learning_rate": 3.821143513191636e-06, "loss": 0.2606, "step": 5141 }, { "epoch": 4.328282828282829, "grad_norm": 0.4055882692337036, "learning_rate": 3.819103914641822e-06, "loss": 0.3016, "step": 5142 }, { "epoch": 4.329124579124579, "grad_norm": 0.416216105222702, "learning_rate": 3.81706452417849e-06, "loss": 0.3021, "step": 5143 }, { "epoch": 4.32996632996633, "grad_norm": 0.38448646664619446, "learning_rate": 3.815025342161002e-06, "loss": 0.282, "step": 5144 }, { "epoch": 4.33080808080808, "grad_norm": 0.4138631224632263, "learning_rate": 3.8129863689486814e-06, "loss": 0.3004, "step": 5145 }, { "epoch": 4.331649831649831, "grad_norm": 0.43105557560920715, "learning_rate": 3.810947604900818e-06, "loss": 0.295, "step": 5146 }, { "epoch": 4.332491582491582, "grad_norm": 0.3983442485332489, "learning_rate": 3.8089090503766633e-06, "loss": 0.299, "step": 5147 }, { "epoch": 4.333333333333333, "grad_norm": 0.39064839482307434, "learning_rate": 3.806870705735431e-06, "loss": 0.295, "step": 5148 }, { "epoch": 4.334175084175084, "grad_norm": 0.39353981614112854, "learning_rate": 3.8048325713363022e-06, "loss": 0.2921, "step": 5149 }, { "epoch": 4.335016835016835, "grad_norm": 0.3895052671432495, "learning_rate": 3.8027946475384173e-06, "loss": 0.2958, "step": 5150 }, { "epoch": 4.335858585858586, "grad_norm": 0.3852505683898926, "learning_rate": 3.8007569347008786e-06, "loss": 0.2954, "step": 5151 }, { "epoch": 4.3367003367003365, "grad_norm": 0.41661936044692993, "learning_rate": 3.798719433182755e-06, "loss": 0.3153, "step": 5152 }, { "epoch": 4.337542087542087, "grad_norm": 0.4138605296611786, "learning_rate": 3.796682143343072e-06, "loss": 0.2845, "step": 5153 }, { "epoch": 4.338383838383838, "grad_norm": 0.37109899520874023, "learning_rate": 3.794645065540826e-06, "loss": 0.3051, "step": 5154 }, { "epoch": 4.339225589225589, "grad_norm": 0.46713054180145264, "learning_rate": 3.7926082001349696e-06, "loss": 0.303, "step": 5155 }, { "epoch": 4.34006734006734, "grad_norm": 0.398103266954422, "learning_rate": 3.7905715474844224e-06, "loss": 0.2834, "step": 5156 }, { "epoch": 4.340909090909091, "grad_norm": 0.4092443585395813, "learning_rate": 3.7885351079480613e-06, "loss": 0.3077, "step": 5157 }, { "epoch": 4.341750841750842, "grad_norm": 0.41345760226249695, "learning_rate": 3.7864988818847295e-06, "loss": 0.2831, "step": 5158 }, { "epoch": 4.342592592592593, "grad_norm": 0.39417487382888794, "learning_rate": 3.784462869653234e-06, "loss": 0.2512, "step": 5159 }, { "epoch": 4.343434343434343, "grad_norm": 0.4142443537712097, "learning_rate": 3.7824270716123396e-06, "loss": 0.2943, "step": 5160 }, { "epoch": 4.344276094276094, "grad_norm": 0.3824528455734253, "learning_rate": 3.780391488120777e-06, "loss": 0.2959, "step": 5161 }, { "epoch": 4.345117845117845, "grad_norm": 0.3736780881881714, "learning_rate": 3.7783561195372364e-06, "loss": 0.3054, "step": 5162 }, { "epoch": 4.345959595959596, "grad_norm": 0.3846305310726166, "learning_rate": 3.7763209662203715e-06, "loss": 0.2838, "step": 5163 }, { "epoch": 4.346801346801347, "grad_norm": 0.40043866634368896, "learning_rate": 3.7742860285287973e-06, "loss": 0.3165, "step": 5164 }, { "epoch": 4.347643097643098, "grad_norm": 0.4095556437969208, "learning_rate": 3.772251306821095e-06, "loss": 0.3066, "step": 5165 }, { "epoch": 4.348484848484849, "grad_norm": 0.4031692147254944, "learning_rate": 3.7702168014558018e-06, "loss": 0.2658, "step": 5166 }, { "epoch": 4.3493265993265995, "grad_norm": 0.4107736647129059, "learning_rate": 3.7681825127914196e-06, "loss": 0.2927, "step": 5167 }, { "epoch": 4.35016835016835, "grad_norm": 0.4364258050918579, "learning_rate": 3.76614844118641e-06, "loss": 0.2611, "step": 5168 }, { "epoch": 4.351010101010101, "grad_norm": 0.4463154375553131, "learning_rate": 3.7641145869991997e-06, "loss": 0.3044, "step": 5169 }, { "epoch": 4.351851851851852, "grad_norm": 0.39782193303108215, "learning_rate": 3.762080950588176e-06, "loss": 0.3008, "step": 5170 }, { "epoch": 4.352693602693603, "grad_norm": 0.42053377628326416, "learning_rate": 3.7600475323116883e-06, "loss": 0.3234, "step": 5171 }, { "epoch": 4.353535353535354, "grad_norm": 0.434622585773468, "learning_rate": 3.7580143325280435e-06, "loss": 0.3053, "step": 5172 }, { "epoch": 4.354377104377105, "grad_norm": 0.4135710299015045, "learning_rate": 3.7559813515955157e-06, "loss": 0.2948, "step": 5173 }, { "epoch": 4.3552188552188555, "grad_norm": 0.38556891679763794, "learning_rate": 3.753948589872335e-06, "loss": 0.2879, "step": 5174 }, { "epoch": 4.356060606060606, "grad_norm": 0.389202356338501, "learning_rate": 3.751916047716698e-06, "loss": 0.2992, "step": 5175 }, { "epoch": 4.356902356902357, "grad_norm": 0.4229392111301422, "learning_rate": 3.7498837254867615e-06, "loss": 0.3051, "step": 5176 }, { "epoch": 4.357744107744108, "grad_norm": 0.41890546679496765, "learning_rate": 3.7478516235406388e-06, "loss": 0.2715, "step": 5177 }, { "epoch": 4.358585858585858, "grad_norm": 0.4120687246322632, "learning_rate": 3.7458197422364095e-06, "loss": 0.2688, "step": 5178 }, { "epoch": 4.359427609427609, "grad_norm": 0.3878566026687622, "learning_rate": 3.7437880819321148e-06, "loss": 0.259, "step": 5179 }, { "epoch": 4.36026936026936, "grad_norm": 0.406497597694397, "learning_rate": 3.741756642985751e-06, "loss": 0.2753, "step": 5180 }, { "epoch": 4.361111111111111, "grad_norm": 0.392829567193985, "learning_rate": 3.7397254257552817e-06, "loss": 0.3149, "step": 5181 }, { "epoch": 4.361952861952862, "grad_norm": 0.3826240599155426, "learning_rate": 3.7376944305986305e-06, "loss": 0.2775, "step": 5182 }, { "epoch": 4.3627946127946124, "grad_norm": 0.40186524391174316, "learning_rate": 3.735663657873677e-06, "loss": 0.316, "step": 5183 }, { "epoch": 4.363636363636363, "grad_norm": 0.4106946289539337, "learning_rate": 3.733633107938268e-06, "loss": 0.3057, "step": 5184 }, { "epoch": 4.364478114478114, "grad_norm": 0.40169620513916016, "learning_rate": 3.7316027811502033e-06, "loss": 0.2896, "step": 5185 }, { "epoch": 4.365319865319865, "grad_norm": 0.4132227897644043, "learning_rate": 3.7295726778672547e-06, "loss": 0.3093, "step": 5186 }, { "epoch": 4.366161616161616, "grad_norm": 0.4103592336177826, "learning_rate": 3.7275427984471435e-06, "loss": 0.2945, "step": 5187 }, { "epoch": 4.367003367003367, "grad_norm": 0.39913758635520935, "learning_rate": 3.725513143247559e-06, "loss": 0.2818, "step": 5188 }, { "epoch": 4.367845117845118, "grad_norm": 0.45788607001304626, "learning_rate": 3.723483712626145e-06, "loss": 0.3112, "step": 5189 }, { "epoch": 4.3686868686868685, "grad_norm": 0.39047548174858093, "learning_rate": 3.7214545069405096e-06, "loss": 0.3141, "step": 5190 }, { "epoch": 4.369528619528619, "grad_norm": 0.40437644720077515, "learning_rate": 3.719425526548223e-06, "loss": 0.2782, "step": 5191 }, { "epoch": 4.37037037037037, "grad_norm": 0.4250360131263733, "learning_rate": 3.7173967718068103e-06, "loss": 0.3063, "step": 5192 }, { "epoch": 4.371212121212121, "grad_norm": 0.40046051144599915, "learning_rate": 3.7153682430737615e-06, "loss": 0.3071, "step": 5193 }, { "epoch": 4.372053872053872, "grad_norm": 0.3654178977012634, "learning_rate": 3.7133399407065247e-06, "loss": 0.3007, "step": 5194 }, { "epoch": 4.372895622895623, "grad_norm": 0.4111759066581726, "learning_rate": 3.711311865062507e-06, "loss": 0.2882, "step": 5195 }, { "epoch": 4.373737373737374, "grad_norm": 0.42173469066619873, "learning_rate": 3.7092840164990767e-06, "loss": 0.2905, "step": 5196 }, { "epoch": 4.374579124579125, "grad_norm": 0.3604118824005127, "learning_rate": 3.7072563953735665e-06, "loss": 0.3105, "step": 5197 }, { "epoch": 4.375420875420875, "grad_norm": 0.43652665615081787, "learning_rate": 3.7052290020432597e-06, "loss": 0.2809, "step": 5198 }, { "epoch": 4.376262626262626, "grad_norm": 0.4094158709049225, "learning_rate": 3.7032018368654083e-06, "loss": 0.3003, "step": 5199 }, { "epoch": 4.377104377104377, "grad_norm": 0.418210506439209, "learning_rate": 3.701174900197218e-06, "loss": 0.324, "step": 5200 }, { "epoch": 4.377946127946128, "grad_norm": 0.3986347019672394, "learning_rate": 3.699148192395857e-06, "loss": 0.3106, "step": 5201 }, { "epoch": 4.378787878787879, "grad_norm": 0.40048736333847046, "learning_rate": 3.6971217138184532e-06, "loss": 0.3013, "step": 5202 }, { "epoch": 4.37962962962963, "grad_norm": 0.3990994095802307, "learning_rate": 3.695095464822096e-06, "loss": 0.3033, "step": 5203 }, { "epoch": 4.380471380471381, "grad_norm": 0.4170070290565491, "learning_rate": 3.6930694457638296e-06, "loss": 0.3007, "step": 5204 }, { "epoch": 4.3813131313131315, "grad_norm": 0.4286414682865143, "learning_rate": 3.6910436570006604e-06, "loss": 0.3143, "step": 5205 }, { "epoch": 4.382154882154882, "grad_norm": 0.4214476943016052, "learning_rate": 3.6890180988895523e-06, "loss": 0.2826, "step": 5206 }, { "epoch": 4.382996632996633, "grad_norm": 0.4041339159011841, "learning_rate": 3.6869927717874333e-06, "loss": 0.3081, "step": 5207 }, { "epoch": 4.383838383838384, "grad_norm": 0.3973979651927948, "learning_rate": 3.6849676760511866e-06, "loss": 0.3075, "step": 5208 }, { "epoch": 4.384680134680135, "grad_norm": 0.4099964499473572, "learning_rate": 3.6829428120376563e-06, "loss": 0.2688, "step": 5209 }, { "epoch": 4.385521885521886, "grad_norm": 0.4441604018211365, "learning_rate": 3.6809181801036435e-06, "loss": 0.2966, "step": 5210 }, { "epoch": 4.386363636363637, "grad_norm": 0.40816599130630493, "learning_rate": 3.67889378060591e-06, "loss": 0.3258, "step": 5211 }, { "epoch": 4.3872053872053876, "grad_norm": 0.41998162865638733, "learning_rate": 3.6768696139011784e-06, "loss": 0.3018, "step": 5212 }, { "epoch": 4.388047138047138, "grad_norm": 0.41211214661598206, "learning_rate": 3.6748456803461265e-06, "loss": 0.2832, "step": 5213 }, { "epoch": 4.388888888888889, "grad_norm": 0.39449894428253174, "learning_rate": 3.6728219802973953e-06, "loss": 0.2904, "step": 5214 }, { "epoch": 4.38973063973064, "grad_norm": 0.4013324975967407, "learning_rate": 3.6707985141115788e-06, "loss": 0.29, "step": 5215 }, { "epoch": 4.390572390572391, "grad_norm": 0.38133928179740906, "learning_rate": 3.668775282145236e-06, "loss": 0.3119, "step": 5216 }, { "epoch": 4.391414141414142, "grad_norm": 0.3821057975292206, "learning_rate": 3.66675228475488e-06, "loss": 0.2809, "step": 5217 }, { "epoch": 4.392255892255892, "grad_norm": 0.39782413840293884, "learning_rate": 3.664729522296988e-06, "loss": 0.2942, "step": 5218 }, { "epoch": 4.393097643097643, "grad_norm": 0.41336947679519653, "learning_rate": 3.6627069951279894e-06, "loss": 0.2982, "step": 5219 }, { "epoch": 4.393939393939394, "grad_norm": 0.3820466101169586, "learning_rate": 3.660684703604276e-06, "loss": 0.2852, "step": 5220 }, { "epoch": 4.3947811447811445, "grad_norm": 0.40217387676239014, "learning_rate": 3.658662648082196e-06, "loss": 0.2663, "step": 5221 }, { "epoch": 4.395622895622895, "grad_norm": 0.3607964515686035, "learning_rate": 3.656640828918057e-06, "loss": 0.3069, "step": 5222 }, { "epoch": 4.396464646464646, "grad_norm": 0.4242912828922272, "learning_rate": 3.654619246468126e-06, "loss": 0.2911, "step": 5223 }, { "epoch": 4.397306397306397, "grad_norm": 0.38991907238960266, "learning_rate": 3.6525979010886304e-06, "loss": 0.2928, "step": 5224 }, { "epoch": 4.398148148148148, "grad_norm": 0.38672444224357605, "learning_rate": 3.650576793135748e-06, "loss": 0.2855, "step": 5225 }, { "epoch": 4.398989898989899, "grad_norm": 0.40701064467430115, "learning_rate": 3.648555922965622e-06, "loss": 0.2939, "step": 5226 }, { "epoch": 4.39983164983165, "grad_norm": 0.4098721444606781, "learning_rate": 3.646535290934349e-06, "loss": 0.2935, "step": 5227 }, { "epoch": 4.4006734006734005, "grad_norm": 0.39396968483924866, "learning_rate": 3.6445148973979883e-06, "loss": 0.306, "step": 5228 }, { "epoch": 4.401515151515151, "grad_norm": 0.3967791199684143, "learning_rate": 3.6424947427125555e-06, "loss": 0.2884, "step": 5229 }, { "epoch": 4.402356902356902, "grad_norm": 0.45010387897491455, "learning_rate": 3.640474827234021e-06, "loss": 0.2739, "step": 5230 }, { "epoch": 4.403198653198653, "grad_norm": 0.4303922653198242, "learning_rate": 3.6384551513183163e-06, "loss": 0.2795, "step": 5231 }, { "epoch": 4.404040404040404, "grad_norm": 0.3827405273914337, "learning_rate": 3.6364357153213314e-06, "loss": 0.3026, "step": 5232 }, { "epoch": 4.404882154882155, "grad_norm": 0.400723934173584, "learning_rate": 3.634416519598908e-06, "loss": 0.3039, "step": 5233 }, { "epoch": 4.405723905723906, "grad_norm": 0.4042334258556366, "learning_rate": 3.632397564506854e-06, "loss": 0.2937, "step": 5234 }, { "epoch": 4.406565656565657, "grad_norm": 0.4056072533130646, "learning_rate": 3.6303788504009318e-06, "loss": 0.2947, "step": 5235 }, { "epoch": 4.407407407407407, "grad_norm": 0.40976235270500183, "learning_rate": 3.628360377636857e-06, "loss": 0.2913, "step": 5236 }, { "epoch": 4.408249158249158, "grad_norm": 0.4149715006351471, "learning_rate": 3.6263421465703074e-06, "loss": 0.2807, "step": 5237 }, { "epoch": 4.409090909090909, "grad_norm": 0.4162931442260742, "learning_rate": 3.6243241575569145e-06, "loss": 0.266, "step": 5238 }, { "epoch": 4.40993265993266, "grad_norm": 0.3738608956336975, "learning_rate": 3.6223064109522745e-06, "loss": 0.2985, "step": 5239 }, { "epoch": 4.410774410774411, "grad_norm": 0.4336477220058441, "learning_rate": 3.620288907111932e-06, "loss": 0.3007, "step": 5240 }, { "epoch": 4.411616161616162, "grad_norm": 0.434946209192276, "learning_rate": 3.618271646391395e-06, "loss": 0.2776, "step": 5241 }, { "epoch": 4.412457912457913, "grad_norm": 0.4305168390274048, "learning_rate": 3.616254629146123e-06, "loss": 0.2871, "step": 5242 }, { "epoch": 4.4132996632996635, "grad_norm": 0.39555370807647705, "learning_rate": 3.614237855731538e-06, "loss": 0.2957, "step": 5243 }, { "epoch": 4.414141414141414, "grad_norm": 0.40937507152557373, "learning_rate": 3.6122213265030194e-06, "loss": 0.3022, "step": 5244 }, { "epoch": 4.414983164983165, "grad_norm": 0.42019015550613403, "learning_rate": 3.610205041815897e-06, "loss": 0.2685, "step": 5245 }, { "epoch": 4.415824915824916, "grad_norm": 0.4086863696575165, "learning_rate": 3.608189002025464e-06, "loss": 0.3176, "step": 5246 }, { "epoch": 4.416666666666667, "grad_norm": 0.38762161135673523, "learning_rate": 3.6061732074869683e-06, "loss": 0.292, "step": 5247 }, { "epoch": 4.417508417508418, "grad_norm": 0.3924488425254822, "learning_rate": 3.604157658555612e-06, "loss": 0.2908, "step": 5248 }, { "epoch": 4.418350168350169, "grad_norm": 0.4225291311740875, "learning_rate": 3.6021423555865574e-06, "loss": 0.3187, "step": 5249 }, { "epoch": 4.41919191919192, "grad_norm": 0.40227559208869934, "learning_rate": 3.600127298934925e-06, "loss": 0.3087, "step": 5250 }, { "epoch": 4.42003367003367, "grad_norm": 0.4301646053791046, "learning_rate": 3.598112488955786e-06, "loss": 0.2848, "step": 5251 }, { "epoch": 4.420875420875421, "grad_norm": 0.421230673789978, "learning_rate": 3.596097926004174e-06, "loss": 0.2929, "step": 5252 }, { "epoch": 4.421717171717171, "grad_norm": 0.38149330019950867, "learning_rate": 3.5940836104350735e-06, "loss": 0.3128, "step": 5253 }, { "epoch": 4.422558922558922, "grad_norm": 0.37676140666007996, "learning_rate": 3.592069542603429e-06, "loss": 0.3069, "step": 5254 }, { "epoch": 4.423400673400673, "grad_norm": 0.4222966134548187, "learning_rate": 3.590055722864143e-06, "loss": 0.285, "step": 5255 }, { "epoch": 4.424242424242424, "grad_norm": 0.3956942856311798, "learning_rate": 3.588042151572071e-06, "loss": 0.2929, "step": 5256 }, { "epoch": 4.425084175084175, "grad_norm": 0.3844815790653229, "learning_rate": 3.586028829082025e-06, "loss": 0.318, "step": 5257 }, { "epoch": 4.425925925925926, "grad_norm": 0.3841371536254883, "learning_rate": 3.5840157557487754e-06, "loss": 0.2886, "step": 5258 }, { "epoch": 4.4267676767676765, "grad_norm": 0.3860139548778534, "learning_rate": 3.582002931927043e-06, "loss": 0.2783, "step": 5259 }, { "epoch": 4.427609427609427, "grad_norm": 0.3750838041305542, "learning_rate": 3.5799903579715134e-06, "loss": 0.287, "step": 5260 }, { "epoch": 4.428451178451178, "grad_norm": 0.3897874057292938, "learning_rate": 3.5779780342368224e-06, "loss": 0.2771, "step": 5261 }, { "epoch": 4.429292929292929, "grad_norm": 0.4198605716228485, "learning_rate": 3.575965961077563e-06, "loss": 0.2868, "step": 5262 }, { "epoch": 4.43013468013468, "grad_norm": 0.42250317335128784, "learning_rate": 3.5739541388482835e-06, "loss": 0.2885, "step": 5263 }, { "epoch": 4.430976430976431, "grad_norm": 0.41698139905929565, "learning_rate": 3.571942567903488e-06, "loss": 0.2764, "step": 5264 }, { "epoch": 4.431818181818182, "grad_norm": 0.3988996148109436, "learning_rate": 3.569931248597636e-06, "loss": 0.2738, "step": 5265 }, { "epoch": 4.4326599326599325, "grad_norm": 0.3875957429409027, "learning_rate": 3.5679201812851456e-06, "loss": 0.3102, "step": 5266 }, { "epoch": 4.433501683501683, "grad_norm": 0.4474102556705475, "learning_rate": 3.565909366320388e-06, "loss": 0.3221, "step": 5267 }, { "epoch": 4.434343434343434, "grad_norm": 0.3835414946079254, "learning_rate": 3.5638988040576882e-06, "loss": 0.274, "step": 5268 }, { "epoch": 4.435185185185185, "grad_norm": 0.4059366285800934, "learning_rate": 3.561888494851331e-06, "loss": 0.3, "step": 5269 }, { "epoch": 4.436026936026936, "grad_norm": 0.3831067383289337, "learning_rate": 3.559878439055552e-06, "loss": 0.2761, "step": 5270 }, { "epoch": 4.436868686868687, "grad_norm": 0.41231873631477356, "learning_rate": 3.5578686370245473e-06, "loss": 0.2821, "step": 5271 }, { "epoch": 4.437710437710438, "grad_norm": 0.40243127942085266, "learning_rate": 3.555859089112463e-06, "loss": 0.301, "step": 5272 }, { "epoch": 4.438552188552189, "grad_norm": 0.4156123697757721, "learning_rate": 3.553849795673405e-06, "loss": 0.2853, "step": 5273 }, { "epoch": 4.4393939393939394, "grad_norm": 0.3877217173576355, "learning_rate": 3.5518407570614298e-06, "loss": 0.3129, "step": 5274 }, { "epoch": 4.44023569023569, "grad_norm": 0.3721005618572235, "learning_rate": 3.5498319736305518e-06, "loss": 0.2989, "step": 5275 }, { "epoch": 4.441077441077441, "grad_norm": 0.3869043290615082, "learning_rate": 3.5478234457347418e-06, "loss": 0.2819, "step": 5276 }, { "epoch": 4.441919191919192, "grad_norm": 0.399927020072937, "learning_rate": 3.545815173727924e-06, "loss": 0.2836, "step": 5277 }, { "epoch": 4.442760942760943, "grad_norm": 0.40345555543899536, "learning_rate": 3.5438071579639755e-06, "loss": 0.2965, "step": 5278 }, { "epoch": 4.443602693602694, "grad_norm": 0.40767309069633484, "learning_rate": 3.5417993987967313e-06, "loss": 0.3211, "step": 5279 }, { "epoch": 4.444444444444445, "grad_norm": 0.39928802847862244, "learning_rate": 3.539791896579979e-06, "loss": 0.2905, "step": 5280 }, { "epoch": 4.4452861952861955, "grad_norm": 0.37288400530815125, "learning_rate": 3.537784651667461e-06, "loss": 0.3115, "step": 5281 }, { "epoch": 4.446127946127946, "grad_norm": 0.4060059189796448, "learning_rate": 3.5357776644128794e-06, "loss": 0.2913, "step": 5282 }, { "epoch": 4.446969696969697, "grad_norm": 0.40115347504615784, "learning_rate": 3.5337709351698823e-06, "loss": 0.3019, "step": 5283 }, { "epoch": 4.447811447811448, "grad_norm": 0.39069071412086487, "learning_rate": 3.531764464292079e-06, "loss": 0.2947, "step": 5284 }, { "epoch": 4.448653198653199, "grad_norm": 0.4086557924747467, "learning_rate": 3.529758252133031e-06, "loss": 0.2818, "step": 5285 }, { "epoch": 4.44949494949495, "grad_norm": 0.38753822445869446, "learning_rate": 3.527752299046252e-06, "loss": 0.316, "step": 5286 }, { "epoch": 4.450336700336701, "grad_norm": 0.41594621539115906, "learning_rate": 3.5257466053852153e-06, "loss": 0.2894, "step": 5287 }, { "epoch": 4.451178451178452, "grad_norm": 0.3955937325954437, "learning_rate": 3.5237411715033453e-06, "loss": 0.3088, "step": 5288 }, { "epoch": 4.452020202020202, "grad_norm": 0.38782835006713867, "learning_rate": 3.5217359977540195e-06, "loss": 0.2933, "step": 5289 }, { "epoch": 4.452861952861953, "grad_norm": 0.3818192481994629, "learning_rate": 3.519731084490572e-06, "loss": 0.2991, "step": 5290 }, { "epoch": 4.453703703703704, "grad_norm": 0.37851646542549133, "learning_rate": 3.5177264320662854e-06, "loss": 0.3011, "step": 5291 }, { "epoch": 4.454545454545454, "grad_norm": 0.39173734188079834, "learning_rate": 3.5157220408344083e-06, "loss": 0.2836, "step": 5292 }, { "epoch": 4.455387205387205, "grad_norm": 0.4456416368484497, "learning_rate": 3.513717911148131e-06, "loss": 0.2971, "step": 5293 }, { "epoch": 4.456228956228956, "grad_norm": 0.4157058596611023, "learning_rate": 3.5117140433606044e-06, "loss": 0.2848, "step": 5294 }, { "epoch": 4.457070707070707, "grad_norm": 0.3950101137161255, "learning_rate": 3.50971043782493e-06, "loss": 0.2838, "step": 5295 }, { "epoch": 4.457912457912458, "grad_norm": 0.4197632372379303, "learning_rate": 3.507707094894166e-06, "loss": 0.3138, "step": 5296 }, { "epoch": 4.4587542087542085, "grad_norm": 0.4271342158317566, "learning_rate": 3.5057040149213195e-06, "loss": 0.3106, "step": 5297 }, { "epoch": 4.459595959595959, "grad_norm": 0.39269986748695374, "learning_rate": 3.5037011982593584e-06, "loss": 0.2955, "step": 5298 }, { "epoch": 4.46043771043771, "grad_norm": 0.4053461253643036, "learning_rate": 3.501698645261199e-06, "loss": 0.2933, "step": 5299 }, { "epoch": 4.461279461279461, "grad_norm": 0.41481608152389526, "learning_rate": 3.4996963562797128e-06, "loss": 0.2795, "step": 5300 }, { "epoch": 4.462121212121212, "grad_norm": 0.4078722298145294, "learning_rate": 3.4976943316677227e-06, "loss": 0.3105, "step": 5301 }, { "epoch": 4.462962962962963, "grad_norm": 0.3894154727458954, "learning_rate": 3.4956925717780075e-06, "loss": 0.2744, "step": 5302 }, { "epoch": 4.463804713804714, "grad_norm": 0.4094095826148987, "learning_rate": 3.4936910769633013e-06, "loss": 0.2975, "step": 5303 }, { "epoch": 4.4646464646464645, "grad_norm": 0.38323813676834106, "learning_rate": 3.4916898475762857e-06, "loss": 0.2952, "step": 5304 }, { "epoch": 4.465488215488215, "grad_norm": 0.40488266944885254, "learning_rate": 3.489688883969601e-06, "loss": 0.2681, "step": 5305 }, { "epoch": 4.466329966329966, "grad_norm": 0.38848167657852173, "learning_rate": 3.487688186495835e-06, "loss": 0.2918, "step": 5306 }, { "epoch": 4.467171717171717, "grad_norm": 0.41301393508911133, "learning_rate": 3.4856877555075338e-06, "loss": 0.299, "step": 5307 }, { "epoch": 4.468013468013468, "grad_norm": 0.40556731820106506, "learning_rate": 3.4836875913571957e-06, "loss": 0.2826, "step": 5308 }, { "epoch": 4.468855218855219, "grad_norm": 0.42330625653266907, "learning_rate": 3.4816876943972713e-06, "loss": 0.3077, "step": 5309 }, { "epoch": 4.46969696969697, "grad_norm": 0.3748111426830292, "learning_rate": 3.47968806498016e-06, "loss": 0.3093, "step": 5310 }, { "epoch": 4.470538720538721, "grad_norm": 0.39587312936782837, "learning_rate": 3.4776887034582228e-06, "loss": 0.2842, "step": 5311 }, { "epoch": 4.4713804713804715, "grad_norm": 0.3968220055103302, "learning_rate": 3.4756896101837645e-06, "loss": 0.307, "step": 5312 }, { "epoch": 4.472222222222222, "grad_norm": 0.41084083914756775, "learning_rate": 3.4736907855090475e-06, "loss": 0.3124, "step": 5313 }, { "epoch": 4.473063973063973, "grad_norm": 0.37727853655815125, "learning_rate": 3.4716922297862878e-06, "loss": 0.3158, "step": 5314 }, { "epoch": 4.473905723905724, "grad_norm": 0.39812222123146057, "learning_rate": 3.4696939433676523e-06, "loss": 0.2876, "step": 5315 }, { "epoch": 4.474747474747475, "grad_norm": 0.41432705521583557, "learning_rate": 3.467695926605258e-06, "loss": 0.2655, "step": 5316 }, { "epoch": 4.475589225589226, "grad_norm": 0.3890663981437683, "learning_rate": 3.4656981798511783e-06, "loss": 0.2823, "step": 5317 }, { "epoch": 4.476430976430977, "grad_norm": 0.4189978241920471, "learning_rate": 3.463700703457436e-06, "loss": 0.2826, "step": 5318 }, { "epoch": 4.4772727272727275, "grad_norm": 0.4017978310585022, "learning_rate": 3.4617034977760094e-06, "loss": 0.3266, "step": 5319 }, { "epoch": 4.478114478114478, "grad_norm": 0.3963242471218109, "learning_rate": 3.4597065631588283e-06, "loss": 0.3011, "step": 5320 }, { "epoch": 4.478956228956229, "grad_norm": 0.42764878273010254, "learning_rate": 3.4577098999577715e-06, "loss": 0.3002, "step": 5321 }, { "epoch": 4.47979797979798, "grad_norm": 0.40956911444664, "learning_rate": 3.455713508524673e-06, "loss": 0.2822, "step": 5322 }, { "epoch": 4.480639730639731, "grad_norm": 0.4379592537879944, "learning_rate": 3.4537173892113173e-06, "loss": 0.2986, "step": 5323 }, { "epoch": 4.481481481481482, "grad_norm": 0.3903700113296509, "learning_rate": 3.451721542369446e-06, "loss": 0.3083, "step": 5324 }, { "epoch": 4.482323232323233, "grad_norm": 0.4098140001296997, "learning_rate": 3.4497259683507444e-06, "loss": 0.2922, "step": 5325 }, { "epoch": 4.483164983164984, "grad_norm": 0.4377700388431549, "learning_rate": 3.447730667506856e-06, "loss": 0.3023, "step": 5326 }, { "epoch": 4.4840067340067336, "grad_norm": 0.42720866203308105, "learning_rate": 3.445735640189373e-06, "loss": 0.3007, "step": 5327 }, { "epoch": 4.484848484848484, "grad_norm": 0.3952806293964386, "learning_rate": 3.443740886749841e-06, "loss": 0.2951, "step": 5328 }, { "epoch": 4.485690235690235, "grad_norm": 0.40204504132270813, "learning_rate": 3.4417464075397543e-06, "loss": 0.2949, "step": 5329 }, { "epoch": 4.486531986531986, "grad_norm": 0.4150794446468353, "learning_rate": 3.439752202910567e-06, "loss": 0.3282, "step": 5330 }, { "epoch": 4.487373737373737, "grad_norm": 0.4110192656517029, "learning_rate": 3.437758273213675e-06, "loss": 0.2771, "step": 5331 }, { "epoch": 4.488215488215488, "grad_norm": 0.4191926121711731, "learning_rate": 3.4357646188004317e-06, "loss": 0.3101, "step": 5332 }, { "epoch": 4.489057239057239, "grad_norm": 0.37853652238845825, "learning_rate": 3.4337712400221384e-06, "loss": 0.298, "step": 5333 }, { "epoch": 4.48989898989899, "grad_norm": 0.4161873757839203, "learning_rate": 3.43177813723005e-06, "loss": 0.2902, "step": 5334 }, { "epoch": 4.4907407407407405, "grad_norm": 0.39270836114883423, "learning_rate": 3.4297853107753753e-06, "loss": 0.3056, "step": 5335 }, { "epoch": 4.491582491582491, "grad_norm": 0.3754652738571167, "learning_rate": 3.4277927610092686e-06, "loss": 0.2843, "step": 5336 }, { "epoch": 4.492424242424242, "grad_norm": 0.37484920024871826, "learning_rate": 3.4258004882828395e-06, "loss": 0.293, "step": 5337 }, { "epoch": 4.493265993265993, "grad_norm": 0.37018463015556335, "learning_rate": 3.423808492947148e-06, "loss": 0.3222, "step": 5338 }, { "epoch": 4.494107744107744, "grad_norm": 0.39027634263038635, "learning_rate": 3.421816775353203e-06, "loss": 0.2813, "step": 5339 }, { "epoch": 4.494949494949495, "grad_norm": 0.44310545921325684, "learning_rate": 3.4198253358519686e-06, "loss": 0.3069, "step": 5340 }, { "epoch": 4.495791245791246, "grad_norm": 0.3656753599643707, "learning_rate": 3.4178341747943583e-06, "loss": 0.3097, "step": 5341 }, { "epoch": 4.4966329966329965, "grad_norm": 0.38262006640434265, "learning_rate": 3.4158432925312335e-06, "loss": 0.2767, "step": 5342 }, { "epoch": 4.497474747474747, "grad_norm": 0.4189014136791229, "learning_rate": 3.4138526894134104e-06, "loss": 0.2891, "step": 5343 }, { "epoch": 4.498316498316498, "grad_norm": 0.41399359703063965, "learning_rate": 3.411862365791654e-06, "loss": 0.2892, "step": 5344 }, { "epoch": 4.499158249158249, "grad_norm": 0.3801332712173462, "learning_rate": 3.4098723220166784e-06, "loss": 0.3011, "step": 5345 }, { "epoch": 4.5, "grad_norm": 0.3659728467464447, "learning_rate": 3.407882558439154e-06, "loss": 0.287, "step": 5346 }, { "epoch": 4.500841750841751, "grad_norm": 0.40155860781669617, "learning_rate": 3.4058930754096987e-06, "loss": 0.3195, "step": 5347 }, { "epoch": 4.501683501683502, "grad_norm": 0.3662221133708954, "learning_rate": 3.4039038732788772e-06, "loss": 0.272, "step": 5348 }, { "epoch": 4.502525252525253, "grad_norm": 0.4056859016418457, "learning_rate": 3.401914952397211e-06, "loss": 0.3065, "step": 5349 }, { "epoch": 4.5033670033670035, "grad_norm": 0.41111546754837036, "learning_rate": 3.399926313115166e-06, "loss": 0.3301, "step": 5350 }, { "epoch": 4.504208754208754, "grad_norm": 0.42367565631866455, "learning_rate": 3.397937955783165e-06, "loss": 0.3124, "step": 5351 }, { "epoch": 4.505050505050505, "grad_norm": 0.3961285352706909, "learning_rate": 3.3959498807515766e-06, "loss": 0.3386, "step": 5352 }, { "epoch": 4.505892255892256, "grad_norm": 0.4342414140701294, "learning_rate": 3.3939620883707215e-06, "loss": 0.3025, "step": 5353 }, { "epoch": 4.506734006734007, "grad_norm": 0.39393338561058044, "learning_rate": 3.3919745789908674e-06, "loss": 0.301, "step": 5354 }, { "epoch": 4.507575757575758, "grad_norm": 0.40747329592704773, "learning_rate": 3.3899873529622362e-06, "loss": 0.2657, "step": 5355 }, { "epoch": 4.508417508417509, "grad_norm": 0.4500364065170288, "learning_rate": 3.3880004106350007e-06, "loss": 0.3001, "step": 5356 }, { "epoch": 4.5092592592592595, "grad_norm": 0.4213866591453552, "learning_rate": 3.3860137523592773e-06, "loss": 0.2802, "step": 5357 }, { "epoch": 4.51010101010101, "grad_norm": 0.43852731585502625, "learning_rate": 3.384027378485139e-06, "loss": 0.2741, "step": 5358 }, { "epoch": 4.510942760942761, "grad_norm": 0.4125405251979828, "learning_rate": 3.3820412893626044e-06, "loss": 0.3142, "step": 5359 }, { "epoch": 4.511784511784512, "grad_norm": 0.3831739127635956, "learning_rate": 3.3800554853416446e-06, "loss": 0.3043, "step": 5360 }, { "epoch": 4.512626262626263, "grad_norm": 0.41770148277282715, "learning_rate": 3.3780699667721777e-06, "loss": 0.286, "step": 5361 }, { "epoch": 4.513468013468014, "grad_norm": 0.3881751000881195, "learning_rate": 3.376084734004077e-06, "loss": 0.3059, "step": 5362 }, { "epoch": 4.514309764309765, "grad_norm": 0.41243213415145874, "learning_rate": 3.374099787387159e-06, "loss": 0.2861, "step": 5363 }, { "epoch": 4.515151515151516, "grad_norm": 0.4167512357234955, "learning_rate": 3.3721151272711935e-06, "loss": 0.2808, "step": 5364 }, { "epoch": 4.5159932659932664, "grad_norm": 0.39380884170532227, "learning_rate": 3.370130754005897e-06, "loss": 0.2955, "step": 5365 }, { "epoch": 4.516835016835017, "grad_norm": 0.3901533782482147, "learning_rate": 3.368146667940937e-06, "loss": 0.3037, "step": 5366 }, { "epoch": 4.517676767676767, "grad_norm": 0.4078977406024933, "learning_rate": 3.366162869425933e-06, "loss": 0.2763, "step": 5367 }, { "epoch": 4.518518518518518, "grad_norm": 0.43091949820518494, "learning_rate": 3.364179358810452e-06, "loss": 0.2725, "step": 5368 }, { "epoch": 4.519360269360269, "grad_norm": 0.4055044949054718, "learning_rate": 3.362196136444007e-06, "loss": 0.2842, "step": 5369 }, { "epoch": 4.52020202020202, "grad_norm": 0.4557965099811554, "learning_rate": 3.3602132026760647e-06, "loss": 0.3152, "step": 5370 }, { "epoch": 4.521043771043771, "grad_norm": 0.4074572026729584, "learning_rate": 3.358230557856037e-06, "loss": 0.2844, "step": 5371 }, { "epoch": 4.521885521885522, "grad_norm": 0.4148077368736267, "learning_rate": 3.35624820233329e-06, "loss": 0.2479, "step": 5372 }, { "epoch": 4.5227272727272725, "grad_norm": 0.43837183713912964, "learning_rate": 3.354266136457135e-06, "loss": 0.3058, "step": 5373 }, { "epoch": 4.523569023569023, "grad_norm": 0.4392109811306, "learning_rate": 3.3522843605768312e-06, "loss": 0.2817, "step": 5374 }, { "epoch": 4.524410774410774, "grad_norm": 0.38888224959373474, "learning_rate": 3.3503028750415905e-06, "loss": 0.2932, "step": 5375 }, { "epoch": 4.525252525252525, "grad_norm": 0.43384650349617004, "learning_rate": 3.3483216802005724e-06, "loss": 0.3082, "step": 5376 }, { "epoch": 4.526094276094276, "grad_norm": 0.3905610740184784, "learning_rate": 3.346340776402881e-06, "loss": 0.2931, "step": 5377 }, { "epoch": 4.526936026936027, "grad_norm": 0.38546210527420044, "learning_rate": 3.3443601639975767e-06, "loss": 0.3109, "step": 5378 }, { "epoch": 4.527777777777778, "grad_norm": 0.44008582830429077, "learning_rate": 3.342379843333664e-06, "loss": 0.2859, "step": 5379 }, { "epoch": 4.5286195286195285, "grad_norm": 0.41421422362327576, "learning_rate": 3.3403998147600937e-06, "loss": 0.3258, "step": 5380 }, { "epoch": 4.529461279461279, "grad_norm": 0.39612847566604614, "learning_rate": 3.3384200786257713e-06, "loss": 0.2975, "step": 5381 }, { "epoch": 4.53030303030303, "grad_norm": 0.3822912573814392, "learning_rate": 3.336440635279543e-06, "loss": 0.3445, "step": 5382 }, { "epoch": 4.531144781144781, "grad_norm": 0.42951831221580505, "learning_rate": 3.334461485070214e-06, "loss": 0.2921, "step": 5383 }, { "epoch": 4.531986531986532, "grad_norm": 0.37611472606658936, "learning_rate": 3.332482628346526e-06, "loss": 0.2849, "step": 5384 }, { "epoch": 4.532828282828283, "grad_norm": 0.39046549797058105, "learning_rate": 3.3305040654571796e-06, "loss": 0.2951, "step": 5385 }, { "epoch": 4.533670033670034, "grad_norm": 0.3827885389328003, "learning_rate": 3.3285257967508146e-06, "loss": 0.3313, "step": 5386 }, { "epoch": 4.534511784511785, "grad_norm": 0.4047410786151886, "learning_rate": 3.326547822576023e-06, "loss": 0.2766, "step": 5387 }, { "epoch": 4.5353535353535355, "grad_norm": 0.3853099048137665, "learning_rate": 3.324570143281348e-06, "loss": 0.2816, "step": 5388 }, { "epoch": 4.536195286195286, "grad_norm": 0.3956538140773773, "learning_rate": 3.3225927592152762e-06, "loss": 0.296, "step": 5389 }, { "epoch": 4.537037037037037, "grad_norm": 0.36833879351615906, "learning_rate": 3.320615670726244e-06, "loss": 0.3143, "step": 5390 }, { "epoch": 4.537878787878788, "grad_norm": 0.3943750560283661, "learning_rate": 3.3186388781626355e-06, "loss": 0.2795, "step": 5391 }, { "epoch": 4.538720538720539, "grad_norm": 0.39168643951416016, "learning_rate": 3.3166623818727816e-06, "loss": 0.3056, "step": 5392 }, { "epoch": 4.53956228956229, "grad_norm": 0.3992222845554352, "learning_rate": 3.3146861822049615e-06, "loss": 0.3297, "step": 5393 }, { "epoch": 4.540404040404041, "grad_norm": 0.4001285433769226, "learning_rate": 3.3127102795074064e-06, "loss": 0.3042, "step": 5394 }, { "epoch": 4.5412457912457915, "grad_norm": 0.40093275904655457, "learning_rate": 3.3107346741282875e-06, "loss": 0.3088, "step": 5395 }, { "epoch": 4.542087542087542, "grad_norm": 0.4173440933227539, "learning_rate": 3.3087593664157303e-06, "loss": 0.2779, "step": 5396 }, { "epoch": 4.542929292929293, "grad_norm": 0.41081807017326355, "learning_rate": 3.3067843567178027e-06, "loss": 0.3097, "step": 5397 }, { "epoch": 4.543771043771044, "grad_norm": 0.40753746032714844, "learning_rate": 3.3048096453825225e-06, "loss": 0.2915, "step": 5398 }, { "epoch": 4.544612794612795, "grad_norm": 0.42002326250076294, "learning_rate": 3.302835232757857e-06, "loss": 0.3133, "step": 5399 }, { "epoch": 4.545454545454545, "grad_norm": 0.43067678809165955, "learning_rate": 3.3008611191917184e-06, "loss": 0.2723, "step": 5400 }, { "epoch": 4.546296296296296, "grad_norm": 0.40145343542099, "learning_rate": 3.298887305031965e-06, "loss": 0.295, "step": 5401 }, { "epoch": 4.547138047138047, "grad_norm": 0.4231933653354645, "learning_rate": 3.296913790626406e-06, "loss": 0.271, "step": 5402 }, { "epoch": 4.547979797979798, "grad_norm": 0.4172057807445526, "learning_rate": 3.2949405763227926e-06, "loss": 0.2868, "step": 5403 }, { "epoch": 4.548821548821548, "grad_norm": 0.39356863498687744, "learning_rate": 3.2929676624688284e-06, "loss": 0.2955, "step": 5404 }, { "epoch": 4.549663299663299, "grad_norm": 0.40801313519477844, "learning_rate": 3.290995049412161e-06, "loss": 0.2896, "step": 5405 }, { "epoch": 4.55050505050505, "grad_norm": 0.410441130399704, "learning_rate": 3.2890227375003885e-06, "loss": 0.2952, "step": 5406 }, { "epoch": 4.551346801346801, "grad_norm": 0.40226125717163086, "learning_rate": 3.28705072708105e-06, "loss": 0.2908, "step": 5407 }, { "epoch": 4.552188552188552, "grad_norm": 0.4212096333503723, "learning_rate": 3.2850790185016362e-06, "loss": 0.3171, "step": 5408 }, { "epoch": 4.553030303030303, "grad_norm": 0.38317301869392395, "learning_rate": 3.283107612109581e-06, "loss": 0.301, "step": 5409 }, { "epoch": 4.553872053872054, "grad_norm": 0.40282052755355835, "learning_rate": 3.2811365082522707e-06, "loss": 0.3015, "step": 5410 }, { "epoch": 4.5547138047138045, "grad_norm": 0.4108688235282898, "learning_rate": 3.2791657072770332e-06, "loss": 0.2952, "step": 5411 }, { "epoch": 4.555555555555555, "grad_norm": 0.4469974637031555, "learning_rate": 3.2771952095311443e-06, "loss": 0.3119, "step": 5412 }, { "epoch": 4.556397306397306, "grad_norm": 0.4222090542316437, "learning_rate": 3.2752250153618264e-06, "loss": 0.2907, "step": 5413 }, { "epoch": 4.557239057239057, "grad_norm": 0.3956276774406433, "learning_rate": 3.273255125116249e-06, "loss": 0.2908, "step": 5414 }, { "epoch": 4.558080808080808, "grad_norm": 0.3814711570739746, "learning_rate": 3.27128553914153e-06, "loss": 0.2952, "step": 5415 }, { "epoch": 4.558922558922559, "grad_norm": 0.3892897665500641, "learning_rate": 3.2693162577847284e-06, "loss": 0.3115, "step": 5416 }, { "epoch": 4.55976430976431, "grad_norm": 0.43810218572616577, "learning_rate": 3.2673472813928542e-06, "loss": 0.2827, "step": 5417 }, { "epoch": 4.5606060606060606, "grad_norm": 0.40614861249923706, "learning_rate": 3.2653786103128616e-06, "loss": 0.2925, "step": 5418 }, { "epoch": 4.561447811447811, "grad_norm": 0.39234504103660583, "learning_rate": 3.2634102448916494e-06, "loss": 0.3127, "step": 5419 }, { "epoch": 4.562289562289562, "grad_norm": 0.397197425365448, "learning_rate": 3.261442185476068e-06, "loss": 0.3038, "step": 5420 }, { "epoch": 4.563131313131313, "grad_norm": 0.41193515062332153, "learning_rate": 3.2594744324129102e-06, "loss": 0.3186, "step": 5421 }, { "epoch": 4.563973063973064, "grad_norm": 0.3969498574733734, "learning_rate": 3.2575069860489126e-06, "loss": 0.29, "step": 5422 }, { "epoch": 4.564814814814815, "grad_norm": 0.3936668634414673, "learning_rate": 3.255539846730763e-06, "loss": 0.2669, "step": 5423 }, { "epoch": 4.565656565656566, "grad_norm": 0.3654908537864685, "learning_rate": 3.2535730148050894e-06, "loss": 0.3225, "step": 5424 }, { "epoch": 4.566498316498317, "grad_norm": 0.37696924805641174, "learning_rate": 3.251606490618469e-06, "loss": 0.3061, "step": 5425 }, { "epoch": 4.5673400673400675, "grad_norm": 0.4143631160259247, "learning_rate": 3.249640274517428e-06, "loss": 0.2919, "step": 5426 }, { "epoch": 4.568181818181818, "grad_norm": 0.4044351875782013, "learning_rate": 3.24767436684843e-06, "loss": 0.2833, "step": 5427 }, { "epoch": 4.569023569023569, "grad_norm": 0.4004770517349243, "learning_rate": 3.245708767957891e-06, "loss": 0.2843, "step": 5428 }, { "epoch": 4.56986531986532, "grad_norm": 0.39851975440979004, "learning_rate": 3.243743478192172e-06, "loss": 0.3056, "step": 5429 }, { "epoch": 4.570707070707071, "grad_norm": 0.4026558995246887, "learning_rate": 3.2417784978975736e-06, "loss": 0.2947, "step": 5430 }, { "epoch": 4.571548821548822, "grad_norm": 0.42095038294792175, "learning_rate": 3.23981382742035e-06, "loss": 0.3005, "step": 5431 }, { "epoch": 4.572390572390573, "grad_norm": 0.4103265702724457, "learning_rate": 3.2378494671066975e-06, "loss": 0.2979, "step": 5432 }, { "epoch": 4.5732323232323235, "grad_norm": 0.40404075384140015, "learning_rate": 3.2358854173027547e-06, "loss": 0.3115, "step": 5433 }, { "epoch": 4.574074074074074, "grad_norm": 0.4134666621685028, "learning_rate": 3.2339216783546105e-06, "loss": 0.3217, "step": 5434 }, { "epoch": 4.574915824915825, "grad_norm": 0.39085912704467773, "learning_rate": 3.2319582506082926e-06, "loss": 0.2885, "step": 5435 }, { "epoch": 4.575757575757576, "grad_norm": 0.3892069160938263, "learning_rate": 3.2299951344097834e-06, "loss": 0.2786, "step": 5436 }, { "epoch": 4.576599326599327, "grad_norm": 0.42004790902137756, "learning_rate": 3.2280323301050016e-06, "loss": 0.2978, "step": 5437 }, { "epoch": 4.577441077441078, "grad_norm": 0.4055291414260864, "learning_rate": 3.226069838039816e-06, "loss": 0.281, "step": 5438 }, { "epoch": 4.578282828282829, "grad_norm": 0.40011340379714966, "learning_rate": 3.224107658560036e-06, "loss": 0.3098, "step": 5439 }, { "epoch": 4.57912457912458, "grad_norm": 0.41209015250205994, "learning_rate": 3.222145792011422e-06, "loss": 0.2666, "step": 5440 }, { "epoch": 4.5799663299663305, "grad_norm": 0.39538586139678955, "learning_rate": 3.220184238739671e-06, "loss": 0.2996, "step": 5441 }, { "epoch": 4.58080808080808, "grad_norm": 0.38736042380332947, "learning_rate": 3.2182229990904335e-06, "loss": 0.2975, "step": 5442 }, { "epoch": 4.581649831649831, "grad_norm": 0.4169107675552368, "learning_rate": 3.2162620734093003e-06, "loss": 0.2961, "step": 5443 }, { "epoch": 4.582491582491582, "grad_norm": 0.38558581471443176, "learning_rate": 3.2143014620418077e-06, "loss": 0.3236, "step": 5444 }, { "epoch": 4.583333333333333, "grad_norm": 0.3939802944660187, "learning_rate": 3.212341165333435e-06, "loss": 0.3126, "step": 5445 }, { "epoch": 4.584175084175084, "grad_norm": 0.38951966166496277, "learning_rate": 3.2103811836296066e-06, "loss": 0.3103, "step": 5446 }, { "epoch": 4.585016835016835, "grad_norm": 0.37806469202041626, "learning_rate": 3.208421517275696e-06, "loss": 0.3056, "step": 5447 }, { "epoch": 4.585858585858586, "grad_norm": 0.3959447741508484, "learning_rate": 3.206462166617013e-06, "loss": 0.3143, "step": 5448 }, { "epoch": 4.5867003367003365, "grad_norm": 0.36925968527793884, "learning_rate": 3.2045031319988196e-06, "loss": 0.3024, "step": 5449 }, { "epoch": 4.587542087542087, "grad_norm": 0.4130898714065552, "learning_rate": 3.2025444137663153e-06, "loss": 0.2844, "step": 5450 }, { "epoch": 4.588383838383838, "grad_norm": 0.4152967035770416, "learning_rate": 3.2005860122646482e-06, "loss": 0.2677, "step": 5451 }, { "epoch": 4.589225589225589, "grad_norm": 0.40061312913894653, "learning_rate": 3.198627927838911e-06, "loss": 0.2827, "step": 5452 }, { "epoch": 4.59006734006734, "grad_norm": 0.39725568890571594, "learning_rate": 3.196670160834139e-06, "loss": 0.3133, "step": 5453 }, { "epoch": 4.590909090909091, "grad_norm": 0.40323108434677124, "learning_rate": 3.1947127115953097e-06, "loss": 0.2995, "step": 5454 }, { "epoch": 4.591750841750842, "grad_norm": 0.4050002098083496, "learning_rate": 3.1927555804673483e-06, "loss": 0.2806, "step": 5455 }, { "epoch": 4.592592592592593, "grad_norm": 0.4051240086555481, "learning_rate": 3.1907987677951207e-06, "loss": 0.2681, "step": 5456 }, { "epoch": 4.593434343434343, "grad_norm": 0.4009389281272888, "learning_rate": 3.188842273923438e-06, "loss": 0.2978, "step": 5457 }, { "epoch": 4.594276094276094, "grad_norm": 0.41790321469306946, "learning_rate": 3.1868860991970576e-06, "loss": 0.2776, "step": 5458 }, { "epoch": 4.595117845117845, "grad_norm": 0.4165343940258026, "learning_rate": 3.1849302439606767e-06, "loss": 0.2981, "step": 5459 }, { "epoch": 4.595959595959596, "grad_norm": 0.42786598205566406, "learning_rate": 3.182974708558938e-06, "loss": 0.289, "step": 5460 }, { "epoch": 4.596801346801347, "grad_norm": 0.3851867616176605, "learning_rate": 3.181019493336428e-06, "loss": 0.2913, "step": 5461 }, { "epoch": 4.597643097643098, "grad_norm": 0.38674694299697876, "learning_rate": 3.1790645986376747e-06, "loss": 0.2945, "step": 5462 }, { "epoch": 4.598484848484849, "grad_norm": 0.4030170440673828, "learning_rate": 3.1771100248071533e-06, "loss": 0.3133, "step": 5463 }, { "epoch": 4.5993265993265995, "grad_norm": 0.4159967005252838, "learning_rate": 3.1751557721892824e-06, "loss": 0.3117, "step": 5464 }, { "epoch": 4.60016835016835, "grad_norm": 0.38000813126564026, "learning_rate": 3.173201841128417e-06, "loss": 0.2884, "step": 5465 }, { "epoch": 4.601010101010101, "grad_norm": 0.3819810450077057, "learning_rate": 3.1712482319688655e-06, "loss": 0.2908, "step": 5466 }, { "epoch": 4.601851851851852, "grad_norm": 0.36494192481040955, "learning_rate": 3.1692949450548704e-06, "loss": 0.292, "step": 5467 }, { "epoch": 4.602693602693603, "grad_norm": 0.3759672939777374, "learning_rate": 3.167341980730626e-06, "loss": 0.3034, "step": 5468 }, { "epoch": 4.603535353535354, "grad_norm": 0.40258607268333435, "learning_rate": 3.1653893393402622e-06, "loss": 0.2758, "step": 5469 }, { "epoch": 4.604377104377105, "grad_norm": 0.37858620285987854, "learning_rate": 3.1634370212278574e-06, "loss": 0.2988, "step": 5470 }, { "epoch": 4.6052188552188555, "grad_norm": 0.4133528172969818, "learning_rate": 3.1614850267374297e-06, "loss": 0.2876, "step": 5471 }, { "epoch": 4.606060606060606, "grad_norm": 0.4080013930797577, "learning_rate": 3.1595333562129414e-06, "loss": 0.3193, "step": 5472 }, { "epoch": 4.606902356902357, "grad_norm": 0.40786880254745483, "learning_rate": 3.157582009998295e-06, "loss": 0.306, "step": 5473 }, { "epoch": 4.607744107744107, "grad_norm": 0.38389647006988525, "learning_rate": 3.155630988437345e-06, "loss": 0.3083, "step": 5474 }, { "epoch": 4.608585858585858, "grad_norm": 0.4426363408565521, "learning_rate": 3.1536802918738764e-06, "loss": 0.3141, "step": 5475 }, { "epoch": 4.609427609427609, "grad_norm": 0.4301900565624237, "learning_rate": 3.151729920651626e-06, "loss": 0.2783, "step": 5476 }, { "epoch": 4.61026936026936, "grad_norm": 0.39071542024612427, "learning_rate": 3.1497798751142684e-06, "loss": 0.2744, "step": 5477 }, { "epoch": 4.611111111111111, "grad_norm": 0.4219914972782135, "learning_rate": 3.1478301556054214e-06, "loss": 0.2911, "step": 5478 }, { "epoch": 4.611952861952862, "grad_norm": 0.41143012046813965, "learning_rate": 3.1458807624686495e-06, "loss": 0.2929, "step": 5479 }, { "epoch": 4.6127946127946124, "grad_norm": 0.43535155057907104, "learning_rate": 3.143931696047454e-06, "loss": 0.3129, "step": 5480 }, { "epoch": 4.613636363636363, "grad_norm": 0.38031935691833496, "learning_rate": 3.141982956685282e-06, "loss": 0.3124, "step": 5481 }, { "epoch": 4.614478114478114, "grad_norm": 0.41197770833969116, "learning_rate": 3.140034544725523e-06, "loss": 0.3019, "step": 5482 }, { "epoch": 4.615319865319865, "grad_norm": 0.4558964669704437, "learning_rate": 3.138086460511505e-06, "loss": 0.3139, "step": 5483 }, { "epoch": 4.616161616161616, "grad_norm": 0.37612271308898926, "learning_rate": 3.1361387043865042e-06, "loss": 0.2983, "step": 5484 }, { "epoch": 4.617003367003367, "grad_norm": 0.37950628995895386, "learning_rate": 3.1341912766937366e-06, "loss": 0.2901, "step": 5485 }, { "epoch": 4.617845117845118, "grad_norm": 0.44569122791290283, "learning_rate": 3.1322441777763573e-06, "loss": 0.3058, "step": 5486 }, { "epoch": 4.6186868686868685, "grad_norm": 0.4333460032939911, "learning_rate": 3.1302974079774675e-06, "loss": 0.2804, "step": 5487 }, { "epoch": 4.619528619528619, "grad_norm": 0.3807896077632904, "learning_rate": 3.128350967640107e-06, "loss": 0.3118, "step": 5488 }, { "epoch": 4.62037037037037, "grad_norm": 0.38181546330451965, "learning_rate": 3.1264048571072598e-06, "loss": 0.3019, "step": 5489 }, { "epoch": 4.621212121212121, "grad_norm": 0.4651874601840973, "learning_rate": 3.1244590767218526e-06, "loss": 0.3011, "step": 5490 }, { "epoch": 4.622053872053872, "grad_norm": 0.4422103464603424, "learning_rate": 3.122513626826753e-06, "loss": 0.3115, "step": 5491 }, { "epoch": 4.622895622895623, "grad_norm": 0.3806515634059906, "learning_rate": 3.1205685077647683e-06, "loss": 0.3087, "step": 5492 }, { "epoch": 4.623737373737374, "grad_norm": 0.4004344344139099, "learning_rate": 3.118623719878651e-06, "loss": 0.2843, "step": 5493 }, { "epoch": 4.624579124579125, "grad_norm": 0.4564538598060608, "learning_rate": 3.1166792635110887e-06, "loss": 0.3049, "step": 5494 }, { "epoch": 4.625420875420875, "grad_norm": 0.43624308705329895, "learning_rate": 3.114735139004723e-06, "loss": 0.2934, "step": 5495 }, { "epoch": 4.626262626262626, "grad_norm": 0.42929404973983765, "learning_rate": 3.112791346702124e-06, "loss": 0.3105, "step": 5496 }, { "epoch": 4.627104377104377, "grad_norm": 0.376168429851532, "learning_rate": 3.1108478869458107e-06, "loss": 0.3155, "step": 5497 }, { "epoch": 4.627946127946128, "grad_norm": 0.4228682518005371, "learning_rate": 3.1089047600782408e-06, "loss": 0.301, "step": 5498 }, { "epoch": 4.628787878787879, "grad_norm": 0.41477441787719727, "learning_rate": 3.1069619664418115e-06, "loss": 0.2879, "step": 5499 }, { "epoch": 4.62962962962963, "grad_norm": 0.4594900608062744, "learning_rate": 3.105019506378869e-06, "loss": 0.2491, "step": 5500 }, { "epoch": 4.630471380471381, "grad_norm": 0.43338143825531006, "learning_rate": 3.1030773802316916e-06, "loss": 0.3162, "step": 5501 }, { "epoch": 4.6313131313131315, "grad_norm": 0.4144834280014038, "learning_rate": 3.1011355883425047e-06, "loss": 0.2993, "step": 5502 }, { "epoch": 4.632154882154882, "grad_norm": 0.4447397291660309, "learning_rate": 3.0991941310534714e-06, "loss": 0.2794, "step": 5503 }, { "epoch": 4.632996632996633, "grad_norm": 0.471088171005249, "learning_rate": 3.0972530087066967e-06, "loss": 0.3347, "step": 5504 }, { "epoch": 4.633838383838384, "grad_norm": 0.491947203874588, "learning_rate": 3.0953122216442266e-06, "loss": 0.3072, "step": 5505 }, { "epoch": 4.634680134680135, "grad_norm": 0.39270147681236267, "learning_rate": 3.0933717702080522e-06, "loss": 0.2921, "step": 5506 }, { "epoch": 4.635521885521886, "grad_norm": 0.43611839413642883, "learning_rate": 3.091431654740098e-06, "loss": 0.2903, "step": 5507 }, { "epoch": 4.636363636363637, "grad_norm": 0.40289580821990967, "learning_rate": 3.0894918755822357e-06, "loss": 0.3024, "step": 5508 }, { "epoch": 4.6372053872053876, "grad_norm": 0.44806161522865295, "learning_rate": 3.087552433076272e-06, "loss": 0.2932, "step": 5509 }, { "epoch": 4.638047138047138, "grad_norm": 0.4277414381504059, "learning_rate": 3.0856133275639582e-06, "loss": 0.2954, "step": 5510 }, { "epoch": 4.638888888888889, "grad_norm": 0.4353707730770111, "learning_rate": 3.0836745593869877e-06, "loss": 0.2602, "step": 5511 }, { "epoch": 4.63973063973064, "grad_norm": 0.3986283242702484, "learning_rate": 3.0817361288869907e-06, "loss": 0.3012, "step": 5512 }, { "epoch": 4.640572390572391, "grad_norm": 0.42043060064315796, "learning_rate": 3.0797980364055393e-06, "loss": 0.285, "step": 5513 }, { "epoch": 4.641414141414142, "grad_norm": 0.44859468936920166, "learning_rate": 3.0778602822841464e-06, "loss": 0.2647, "step": 5514 }, { "epoch": 4.642255892255893, "grad_norm": 0.39683717489242554, "learning_rate": 3.0759228668642627e-06, "loss": 0.2862, "step": 5515 }, { "epoch": 4.643097643097643, "grad_norm": 0.42844313383102417, "learning_rate": 3.0739857904872837e-06, "loss": 0.3123, "step": 5516 }, { "epoch": 4.643939393939394, "grad_norm": 0.42665600776672363, "learning_rate": 3.072049053494543e-06, "loss": 0.3115, "step": 5517 }, { "epoch": 4.6447811447811445, "grad_norm": 0.3783291280269623, "learning_rate": 3.0701126562273142e-06, "loss": 0.2859, "step": 5518 }, { "epoch": 4.645622895622895, "grad_norm": 0.39883020520210266, "learning_rate": 3.0681765990268103e-06, "loss": 0.3196, "step": 5519 }, { "epoch": 4.646464646464646, "grad_norm": 0.41349413990974426, "learning_rate": 3.0662408822341866e-06, "loss": 0.3048, "step": 5520 }, { "epoch": 4.647306397306397, "grad_norm": 0.41336148977279663, "learning_rate": 3.0643055061905336e-06, "loss": 0.3121, "step": 5521 }, { "epoch": 4.648148148148148, "grad_norm": 0.3900788724422455, "learning_rate": 3.0623704712368895e-06, "loss": 0.2955, "step": 5522 }, { "epoch": 4.648989898989899, "grad_norm": 0.3742169737815857, "learning_rate": 3.0604357777142267e-06, "loss": 0.3015, "step": 5523 }, { "epoch": 4.64983164983165, "grad_norm": 0.37274619936943054, "learning_rate": 3.058501425963457e-06, "loss": 0.3096, "step": 5524 }, { "epoch": 4.6506734006734005, "grad_norm": 0.3860580325126648, "learning_rate": 3.0565674163254365e-06, "loss": 0.3259, "step": 5525 }, { "epoch": 4.651515151515151, "grad_norm": 0.3663026690483093, "learning_rate": 3.0546337491409546e-06, "loss": 0.3015, "step": 5526 }, { "epoch": 4.652356902356902, "grad_norm": 0.35756248235702515, "learning_rate": 3.0527004247507485e-06, "loss": 0.3034, "step": 5527 }, { "epoch": 4.653198653198653, "grad_norm": 0.385314017534256, "learning_rate": 3.0507674434954883e-06, "loss": 0.2627, "step": 5528 }, { "epoch": 4.654040404040404, "grad_norm": 0.3796789348125458, "learning_rate": 3.0488348057157868e-06, "loss": 0.2992, "step": 5529 }, { "epoch": 4.654882154882155, "grad_norm": 0.3884929120540619, "learning_rate": 3.0469025117521944e-06, "loss": 0.2817, "step": 5530 }, { "epoch": 4.655723905723906, "grad_norm": 0.4184952974319458, "learning_rate": 3.0449705619452e-06, "loss": 0.2693, "step": 5531 }, { "epoch": 4.656565656565657, "grad_norm": 0.3865834176540375, "learning_rate": 3.043038956635238e-06, "loss": 0.2909, "step": 5532 }, { "epoch": 4.657407407407407, "grad_norm": 0.4101921319961548, "learning_rate": 3.041107696162676e-06, "loss": 0.2963, "step": 5533 }, { "epoch": 4.658249158249158, "grad_norm": 0.391818106174469, "learning_rate": 3.0391767808678218e-06, "loss": 0.3213, "step": 5534 }, { "epoch": 4.659090909090909, "grad_norm": 0.43603241443634033, "learning_rate": 3.0372462110909255e-06, "loss": 0.3107, "step": 5535 }, { "epoch": 4.65993265993266, "grad_norm": 0.3949803113937378, "learning_rate": 3.0353159871721718e-06, "loss": 0.294, "step": 5536 }, { "epoch": 4.660774410774411, "grad_norm": 0.42379212379455566, "learning_rate": 3.0333861094516853e-06, "loss": 0.2661, "step": 5537 }, { "epoch": 4.661616161616162, "grad_norm": 0.42633765935897827, "learning_rate": 3.031456578269536e-06, "loss": 0.3367, "step": 5538 }, { "epoch": 4.662457912457913, "grad_norm": 0.4335867762565613, "learning_rate": 3.029527393965723e-06, "loss": 0.2936, "step": 5539 }, { "epoch": 4.6632996632996635, "grad_norm": 0.5075645446777344, "learning_rate": 3.0275985568801925e-06, "loss": 0.324, "step": 5540 }, { "epoch": 4.664141414141414, "grad_norm": 0.3801816701889038, "learning_rate": 3.0256700673528257e-06, "loss": 0.2933, "step": 5541 }, { "epoch": 4.664983164983165, "grad_norm": 0.4517483711242676, "learning_rate": 3.02374192572344e-06, "loss": 0.2985, "step": 5542 }, { "epoch": 4.665824915824916, "grad_norm": 0.42100563645362854, "learning_rate": 3.0218141323317972e-06, "loss": 0.2949, "step": 5543 }, { "epoch": 4.666666666666667, "grad_norm": 0.39376717805862427, "learning_rate": 3.019886687517596e-06, "loss": 0.2883, "step": 5544 }, { "epoch": 4.667508417508418, "grad_norm": 0.42457693815231323, "learning_rate": 3.0179595916204705e-06, "loss": 0.2627, "step": 5545 }, { "epoch": 4.668350168350169, "grad_norm": 0.4354398548603058, "learning_rate": 3.016032844979997e-06, "loss": 0.3262, "step": 5546 }, { "epoch": 4.66919191919192, "grad_norm": 0.39960893988609314, "learning_rate": 3.0141064479356854e-06, "loss": 0.2968, "step": 5547 }, { "epoch": 4.6700336700336695, "grad_norm": 0.4436018764972687, "learning_rate": 3.0121804008269928e-06, "loss": 0.3005, "step": 5548 }, { "epoch": 4.67087542087542, "grad_norm": 0.43717700242996216, "learning_rate": 3.010254703993306e-06, "loss": 0.2794, "step": 5549 }, { "epoch": 4.671717171717171, "grad_norm": 0.39573776721954346, "learning_rate": 3.008329357773955e-06, "loss": 0.2963, "step": 5550 }, { "epoch": 4.672558922558922, "grad_norm": 0.4243094027042389, "learning_rate": 3.0064043625082033e-06, "loss": 0.3125, "step": 5551 }, { "epoch": 4.673400673400673, "grad_norm": 0.409467488527298, "learning_rate": 3.0044797185352593e-06, "loss": 0.3099, "step": 5552 }, { "epoch": 4.674242424242424, "grad_norm": 0.4130086302757263, "learning_rate": 3.0025554261942612e-06, "loss": 0.2891, "step": 5553 }, { "epoch": 4.675084175084175, "grad_norm": 0.39026695489883423, "learning_rate": 3.0006314858242946e-06, "loss": 0.2855, "step": 5554 }, { "epoch": 4.675925925925926, "grad_norm": 0.39604994654655457, "learning_rate": 2.9987078977643756e-06, "loss": 0.2966, "step": 5555 }, { "epoch": 4.6767676767676765, "grad_norm": 0.4345289468765259, "learning_rate": 2.9967846623534625e-06, "loss": 0.3051, "step": 5556 }, { "epoch": 4.677609427609427, "grad_norm": 0.37629544734954834, "learning_rate": 2.9948617799304486e-06, "loss": 0.2931, "step": 5557 }, { "epoch": 4.678451178451178, "grad_norm": 0.3769298493862152, "learning_rate": 2.9929392508341653e-06, "loss": 0.2908, "step": 5558 }, { "epoch": 4.679292929292929, "grad_norm": 0.3862557113170624, "learning_rate": 2.9910170754033863e-06, "loss": 0.287, "step": 5559 }, { "epoch": 4.68013468013468, "grad_norm": 0.4032231867313385, "learning_rate": 2.9890952539768165e-06, "loss": 0.3041, "step": 5560 }, { "epoch": 4.680976430976431, "grad_norm": 0.3942810297012329, "learning_rate": 2.9871737868931026e-06, "loss": 0.2972, "step": 5561 }, { "epoch": 4.681818181818182, "grad_norm": 0.38990750908851624, "learning_rate": 2.985252674490826e-06, "loss": 0.2913, "step": 5562 }, { "epoch": 4.6826599326599325, "grad_norm": 0.3958323299884796, "learning_rate": 2.9833319171085074e-06, "loss": 0.2916, "step": 5563 }, { "epoch": 4.683501683501683, "grad_norm": 0.3902466595172882, "learning_rate": 2.9814115150846066e-06, "loss": 0.3009, "step": 5564 }, { "epoch": 4.684343434343434, "grad_norm": 0.4244628846645355, "learning_rate": 2.979491468757519e-06, "loss": 0.2801, "step": 5565 }, { "epoch": 4.685185185185185, "grad_norm": 0.381279855966568, "learning_rate": 2.9775717784655746e-06, "loss": 0.2928, "step": 5566 }, { "epoch": 4.686026936026936, "grad_norm": 0.3998388350009918, "learning_rate": 2.9756524445470465e-06, "loss": 0.286, "step": 5567 }, { "epoch": 4.686868686868687, "grad_norm": 0.38659751415252686, "learning_rate": 2.9737334673401387e-06, "loss": 0.3238, "step": 5568 }, { "epoch": 4.687710437710438, "grad_norm": 0.39830389618873596, "learning_rate": 2.9718148471829954e-06, "loss": 0.3386, "step": 5569 }, { "epoch": 4.688552188552189, "grad_norm": 0.3942379057407379, "learning_rate": 2.969896584413701e-06, "loss": 0.3094, "step": 5570 }, { "epoch": 4.6893939393939394, "grad_norm": 0.3660367727279663, "learning_rate": 2.967978679370273e-06, "loss": 0.3131, "step": 5571 }, { "epoch": 4.69023569023569, "grad_norm": 0.38957929611206055, "learning_rate": 2.966061132390665e-06, "loss": 0.3077, "step": 5572 }, { "epoch": 4.691077441077441, "grad_norm": 0.3767688274383545, "learning_rate": 2.9641439438127713e-06, "loss": 0.3058, "step": 5573 }, { "epoch": 4.691919191919192, "grad_norm": 0.40511566400527954, "learning_rate": 2.9622271139744176e-06, "loss": 0.3006, "step": 5574 }, { "epoch": 4.692760942760943, "grad_norm": 0.41454586386680603, "learning_rate": 2.960310643213373e-06, "loss": 0.2978, "step": 5575 }, { "epoch": 4.693602693602694, "grad_norm": 0.38342276215553284, "learning_rate": 2.9583945318673402e-06, "loss": 0.3112, "step": 5576 }, { "epoch": 4.694444444444445, "grad_norm": 0.4061432182788849, "learning_rate": 2.956478780273956e-06, "loss": 0.3032, "step": 5577 }, { "epoch": 4.6952861952861955, "grad_norm": 0.4087730646133423, "learning_rate": 2.954563388770798e-06, "loss": 0.3151, "step": 5578 }, { "epoch": 4.696127946127946, "grad_norm": 0.36630067229270935, "learning_rate": 2.9526483576953767e-06, "loss": 0.2846, "step": 5579 }, { "epoch": 4.696969696969697, "grad_norm": 0.38967666029930115, "learning_rate": 2.9507336873851445e-06, "loss": 0.3147, "step": 5580 }, { "epoch": 4.697811447811448, "grad_norm": 0.39448532462120056, "learning_rate": 2.9488193781774837e-06, "loss": 0.3079, "step": 5581 }, { "epoch": 4.698653198653199, "grad_norm": 0.3930644392967224, "learning_rate": 2.9469054304097176e-06, "loss": 0.3037, "step": 5582 }, { "epoch": 4.69949494949495, "grad_norm": 0.40035316348075867, "learning_rate": 2.944991844419104e-06, "loss": 0.2966, "step": 5583 }, { "epoch": 4.700336700336701, "grad_norm": 0.45260411500930786, "learning_rate": 2.9430786205428365e-06, "loss": 0.3166, "step": 5584 }, { "epoch": 4.701178451178452, "grad_norm": 0.3979441225528717, "learning_rate": 2.9411657591180432e-06, "loss": 0.3052, "step": 5585 }, { "epoch": 4.702020202020202, "grad_norm": 0.4079403877258301, "learning_rate": 2.939253260481796e-06, "loss": 0.2823, "step": 5586 }, { "epoch": 4.702861952861953, "grad_norm": 0.38268399238586426, "learning_rate": 2.9373411249710935e-06, "loss": 0.2931, "step": 5587 }, { "epoch": 4.703703703703704, "grad_norm": 0.39461591839790344, "learning_rate": 2.935429352922877e-06, "loss": 0.3044, "step": 5588 }, { "epoch": 4.704545454545455, "grad_norm": 0.37013620138168335, "learning_rate": 2.933517944674019e-06, "loss": 0.2838, "step": 5589 }, { "epoch": 4.705387205387205, "grad_norm": 0.40355250239372253, "learning_rate": 2.9316069005613296e-06, "loss": 0.312, "step": 5590 }, { "epoch": 4.706228956228956, "grad_norm": 0.3768024146556854, "learning_rate": 2.9296962209215586e-06, "loss": 0.3084, "step": 5591 }, { "epoch": 4.707070707070707, "grad_norm": 0.385932981967926, "learning_rate": 2.9277859060913843e-06, "loss": 0.2912, "step": 5592 }, { "epoch": 4.707912457912458, "grad_norm": 0.443380743265152, "learning_rate": 2.9258759564074265e-06, "loss": 0.2938, "step": 5593 }, { "epoch": 4.7087542087542085, "grad_norm": 0.40282365679740906, "learning_rate": 2.923966372206239e-06, "loss": 0.2932, "step": 5594 }, { "epoch": 4.709595959595959, "grad_norm": 0.40883907675743103, "learning_rate": 2.922057153824308e-06, "loss": 0.2984, "step": 5595 }, { "epoch": 4.71043771043771, "grad_norm": 0.40979239344596863, "learning_rate": 2.920148301598062e-06, "loss": 0.2981, "step": 5596 }, { "epoch": 4.711279461279461, "grad_norm": 0.4131472408771515, "learning_rate": 2.9182398158638592e-06, "loss": 0.2835, "step": 5597 }, { "epoch": 4.712121212121212, "grad_norm": 0.3922131061553955, "learning_rate": 2.9163316969579957e-06, "loss": 0.2829, "step": 5598 }, { "epoch": 4.712962962962963, "grad_norm": 0.41412270069122314, "learning_rate": 2.914423945216703e-06, "loss": 0.2903, "step": 5599 }, { "epoch": 4.713804713804714, "grad_norm": 0.38452696800231934, "learning_rate": 2.912516560976146e-06, "loss": 0.3197, "step": 5600 }, { "epoch": 4.7146464646464645, "grad_norm": 0.3605614900588989, "learning_rate": 2.9106095445724224e-06, "loss": 0.2973, "step": 5601 }, { "epoch": 4.715488215488215, "grad_norm": 0.3928007483482361, "learning_rate": 2.9087028963415776e-06, "loss": 0.2849, "step": 5602 }, { "epoch": 4.716329966329966, "grad_norm": 0.388764351606369, "learning_rate": 2.9067966166195778e-06, "loss": 0.3165, "step": 5603 }, { "epoch": 4.717171717171717, "grad_norm": 0.3714967370033264, "learning_rate": 2.9048907057423308e-06, "loss": 0.279, "step": 5604 }, { "epoch": 4.718013468013468, "grad_norm": 0.3748752772808075, "learning_rate": 2.902985164045676e-06, "loss": 0.2905, "step": 5605 }, { "epoch": 4.718855218855219, "grad_norm": 0.3913353979587555, "learning_rate": 2.9010799918653932e-06, "loss": 0.2921, "step": 5606 }, { "epoch": 4.71969696969697, "grad_norm": 0.384721577167511, "learning_rate": 2.899175189537195e-06, "loss": 0.3003, "step": 5607 }, { "epoch": 4.720538720538721, "grad_norm": 0.40530091524124146, "learning_rate": 2.8972707573967264e-06, "loss": 0.3004, "step": 5608 }, { "epoch": 4.7213804713804715, "grad_norm": 0.39530375599861145, "learning_rate": 2.895366695779566e-06, "loss": 0.2775, "step": 5609 }, { "epoch": 4.722222222222222, "grad_norm": 0.4048554003238678, "learning_rate": 2.893463005021234e-06, "loss": 0.2773, "step": 5610 }, { "epoch": 4.723063973063973, "grad_norm": 0.3987857401371002, "learning_rate": 2.891559685457178e-06, "loss": 0.3079, "step": 5611 }, { "epoch": 4.723905723905724, "grad_norm": 0.3605796992778778, "learning_rate": 2.8896567374227856e-06, "loss": 0.2963, "step": 5612 }, { "epoch": 4.724747474747475, "grad_norm": 0.39120161533355713, "learning_rate": 2.8877541612533765e-06, "loss": 0.3193, "step": 5613 }, { "epoch": 4.725589225589226, "grad_norm": 0.40278613567352295, "learning_rate": 2.8858519572842e-06, "loss": 0.2973, "step": 5614 }, { "epoch": 4.726430976430977, "grad_norm": 0.39217710494995117, "learning_rate": 2.883950125850452e-06, "loss": 0.2949, "step": 5615 }, { "epoch": 4.7272727272727275, "grad_norm": 0.3967953622341156, "learning_rate": 2.8820486672872515e-06, "loss": 0.3064, "step": 5616 }, { "epoch": 4.728114478114478, "grad_norm": 0.42894336581230164, "learning_rate": 2.880147581929654e-06, "loss": 0.3079, "step": 5617 }, { "epoch": 4.728956228956229, "grad_norm": 0.4029891788959503, "learning_rate": 2.878246870112653e-06, "loss": 0.2892, "step": 5618 }, { "epoch": 4.72979797979798, "grad_norm": 0.40415915846824646, "learning_rate": 2.876346532171177e-06, "loss": 0.2842, "step": 5619 }, { "epoch": 4.730639730639731, "grad_norm": 0.3942324221134186, "learning_rate": 2.8744465684400822e-06, "loss": 0.2688, "step": 5620 }, { "epoch": 4.731481481481482, "grad_norm": 0.39161863923072815, "learning_rate": 2.8725469792541644e-06, "loss": 0.2803, "step": 5621 }, { "epoch": 4.732323232323233, "grad_norm": 0.4068886637687683, "learning_rate": 2.870647764948148e-06, "loss": 0.2841, "step": 5622 }, { "epoch": 4.733164983164983, "grad_norm": 0.37939703464508057, "learning_rate": 2.868748925856698e-06, "loss": 0.3086, "step": 5623 }, { "epoch": 4.7340067340067336, "grad_norm": 0.39177119731903076, "learning_rate": 2.8668504623144107e-06, "loss": 0.2815, "step": 5624 }, { "epoch": 4.734848484848484, "grad_norm": 0.4055367708206177, "learning_rate": 2.864952374655815e-06, "loss": 0.2855, "step": 5625 }, { "epoch": 4.735690235690235, "grad_norm": 0.4633174240589142, "learning_rate": 2.863054663215371e-06, "loss": 0.3206, "step": 5626 }, { "epoch": 4.736531986531986, "grad_norm": 0.396324098110199, "learning_rate": 2.8611573283274786e-06, "loss": 0.2824, "step": 5627 }, { "epoch": 4.737373737373737, "grad_norm": 0.41229137778282166, "learning_rate": 2.8592603703264693e-06, "loss": 0.2925, "step": 5628 }, { "epoch": 4.738215488215488, "grad_norm": 0.41042810678482056, "learning_rate": 2.8573637895466067e-06, "loss": 0.3086, "step": 5629 }, { "epoch": 4.739057239057239, "grad_norm": 0.4251020848751068, "learning_rate": 2.8554675863220876e-06, "loss": 0.309, "step": 5630 }, { "epoch": 4.73989898989899, "grad_norm": 0.40586069226264954, "learning_rate": 2.8535717609870417e-06, "loss": 0.3176, "step": 5631 }, { "epoch": 4.7407407407407405, "grad_norm": 0.41135019063949585, "learning_rate": 2.851676313875538e-06, "loss": 0.2784, "step": 5632 }, { "epoch": 4.741582491582491, "grad_norm": 0.4276634454727173, "learning_rate": 2.849781245321569e-06, "loss": 0.3225, "step": 5633 }, { "epoch": 4.742424242424242, "grad_norm": 0.43404310941696167, "learning_rate": 2.8478865556590716e-06, "loss": 0.2921, "step": 5634 }, { "epoch": 4.743265993265993, "grad_norm": 0.4191960394382477, "learning_rate": 2.8459922452219058e-06, "loss": 0.3019, "step": 5635 }, { "epoch": 4.744107744107744, "grad_norm": 0.44935497641563416, "learning_rate": 2.8440983143438725e-06, "loss": 0.2757, "step": 5636 }, { "epoch": 4.744949494949495, "grad_norm": 0.42225393652915955, "learning_rate": 2.8422047633587025e-06, "loss": 0.2979, "step": 5637 }, { "epoch": 4.745791245791246, "grad_norm": 0.41137152910232544, "learning_rate": 2.8403115926000554e-06, "loss": 0.3118, "step": 5638 }, { "epoch": 4.7466329966329965, "grad_norm": 0.4222579598426819, "learning_rate": 2.8384188024015313e-06, "loss": 0.3306, "step": 5639 }, { "epoch": 4.747474747474747, "grad_norm": 0.43283605575561523, "learning_rate": 2.8365263930966615e-06, "loss": 0.3058, "step": 5640 }, { "epoch": 4.748316498316498, "grad_norm": 0.40030962228775024, "learning_rate": 2.8346343650189066e-06, "loss": 0.301, "step": 5641 }, { "epoch": 4.749158249158249, "grad_norm": 0.4333449900150299, "learning_rate": 2.832742718501663e-06, "loss": 0.336, "step": 5642 }, { "epoch": 4.75, "grad_norm": 0.4156118929386139, "learning_rate": 2.830851453878256e-06, "loss": 0.2847, "step": 5643 }, { "epoch": 4.750841750841751, "grad_norm": 0.46803322434425354, "learning_rate": 2.82896057148195e-06, "loss": 0.3167, "step": 5644 }, { "epoch": 4.751683501683502, "grad_norm": 0.41671016812324524, "learning_rate": 2.827070071645939e-06, "loss": 0.2932, "step": 5645 }, { "epoch": 4.752525252525253, "grad_norm": 0.40381142497062683, "learning_rate": 2.8251799547033478e-06, "loss": 0.2933, "step": 5646 }, { "epoch": 4.7533670033670035, "grad_norm": 0.40906232595443726, "learning_rate": 2.8232902209872337e-06, "loss": 0.3006, "step": 5647 }, { "epoch": 4.754208754208754, "grad_norm": 0.41153693199157715, "learning_rate": 2.8214008708305905e-06, "loss": 0.2747, "step": 5648 }, { "epoch": 4.755050505050505, "grad_norm": 0.4112420678138733, "learning_rate": 2.81951190456634e-06, "loss": 0.2811, "step": 5649 }, { "epoch": 4.755892255892256, "grad_norm": 0.36665648221969604, "learning_rate": 2.8176233225273403e-06, "loss": 0.2822, "step": 5650 }, { "epoch": 4.756734006734007, "grad_norm": 0.39381200075149536, "learning_rate": 2.8157351250463787e-06, "loss": 0.2933, "step": 5651 }, { "epoch": 4.757575757575758, "grad_norm": 0.4013933837413788, "learning_rate": 2.813847312456174e-06, "loss": 0.2987, "step": 5652 }, { "epoch": 4.758417508417509, "grad_norm": 0.4192175269126892, "learning_rate": 2.8119598850893815e-06, "loss": 0.2868, "step": 5653 }, { "epoch": 4.7592592592592595, "grad_norm": 0.40771812200546265, "learning_rate": 2.810072843278583e-06, "loss": 0.3054, "step": 5654 }, { "epoch": 4.76010101010101, "grad_norm": 0.4177559018135071, "learning_rate": 2.808186187356299e-06, "loss": 0.3121, "step": 5655 }, { "epoch": 4.760942760942761, "grad_norm": 0.4143209457397461, "learning_rate": 2.8062999176549747e-06, "loss": 0.2704, "step": 5656 }, { "epoch": 4.761784511784512, "grad_norm": 0.4180520176887512, "learning_rate": 2.8044140345069955e-06, "loss": 0.2848, "step": 5657 }, { "epoch": 4.762626262626263, "grad_norm": 0.4040321409702301, "learning_rate": 2.802528538244671e-06, "loss": 0.2868, "step": 5658 }, { "epoch": 4.763468013468014, "grad_norm": 0.37690114974975586, "learning_rate": 2.8006434292002443e-06, "loss": 0.2725, "step": 5659 }, { "epoch": 4.764309764309765, "grad_norm": 0.4076095521450043, "learning_rate": 2.7987587077058957e-06, "loss": 0.305, "step": 5660 }, { "epoch": 4.765151515151516, "grad_norm": 0.4352533519268036, "learning_rate": 2.796874374093729e-06, "loss": 0.2989, "step": 5661 }, { "epoch": 4.7659932659932664, "grad_norm": 0.3836723566055298, "learning_rate": 2.7949904286957884e-06, "loss": 0.3109, "step": 5662 }, { "epoch": 4.766835016835017, "grad_norm": 0.40600109100341797, "learning_rate": 2.7931068718440425e-06, "loss": 0.3159, "step": 5663 }, { "epoch": 4.767676767676767, "grad_norm": 0.44366997480392456, "learning_rate": 2.7912237038703926e-06, "loss": 0.3288, "step": 5664 }, { "epoch": 4.768518518518518, "grad_norm": 0.4066266417503357, "learning_rate": 2.789340925106675e-06, "loss": 0.3101, "step": 5665 }, { "epoch": 4.769360269360269, "grad_norm": 0.4355826675891876, "learning_rate": 2.787458535884657e-06, "loss": 0.3037, "step": 5666 }, { "epoch": 4.77020202020202, "grad_norm": 0.42125779390335083, "learning_rate": 2.785576536536033e-06, "loss": 0.2598, "step": 5667 }, { "epoch": 4.771043771043771, "grad_norm": 0.40756165981292725, "learning_rate": 2.783694927392433e-06, "loss": 0.3174, "step": 5668 }, { "epoch": 4.771885521885522, "grad_norm": 0.44382497668266296, "learning_rate": 2.7818137087854134e-06, "loss": 0.2967, "step": 5669 }, { "epoch": 4.7727272727272725, "grad_norm": 0.41608870029449463, "learning_rate": 2.779932881046468e-06, "loss": 0.2892, "step": 5670 }, { "epoch": 4.773569023569023, "grad_norm": 0.3886108100414276, "learning_rate": 2.7780524445070195e-06, "loss": 0.2964, "step": 5671 }, { "epoch": 4.774410774410774, "grad_norm": 0.3753642737865448, "learning_rate": 2.7761723994984193e-06, "loss": 0.3008, "step": 5672 }, { "epoch": 4.775252525252525, "grad_norm": 0.4014696478843689, "learning_rate": 2.77429274635195e-06, "loss": 0.2981, "step": 5673 }, { "epoch": 4.776094276094276, "grad_norm": 0.39145538210868835, "learning_rate": 2.7724134853988293e-06, "loss": 0.2777, "step": 5674 }, { "epoch": 4.776936026936027, "grad_norm": 0.39892464876174927, "learning_rate": 2.7705346169701998e-06, "loss": 0.2927, "step": 5675 }, { "epoch": 4.777777777777778, "grad_norm": 0.40056800842285156, "learning_rate": 2.7686561413971423e-06, "loss": 0.2926, "step": 5676 }, { "epoch": 4.7786195286195285, "grad_norm": 0.41343095898628235, "learning_rate": 2.7667780590106604e-06, "loss": 0.2953, "step": 5677 }, { "epoch": 4.779461279461279, "grad_norm": 0.3929968476295471, "learning_rate": 2.7649003701416953e-06, "loss": 0.3123, "step": 5678 }, { "epoch": 4.78030303030303, "grad_norm": 0.3996606171131134, "learning_rate": 2.763023075121114e-06, "loss": 0.301, "step": 5679 }, { "epoch": 4.781144781144781, "grad_norm": 0.3983176648616791, "learning_rate": 2.761146174279717e-06, "loss": 0.2871, "step": 5680 }, { "epoch": 4.781986531986532, "grad_norm": 0.42452195286750793, "learning_rate": 2.759269667948231e-06, "loss": 0.256, "step": 5681 }, { "epoch": 4.782828282828283, "grad_norm": 0.3918997645378113, "learning_rate": 2.757393556457319e-06, "loss": 0.31, "step": 5682 }, { "epoch": 4.783670033670034, "grad_norm": 0.4423258304595947, "learning_rate": 2.755517840137574e-06, "loss": 0.2738, "step": 5683 }, { "epoch": 4.784511784511785, "grad_norm": 0.4105573892593384, "learning_rate": 2.7536425193195143e-06, "loss": 0.3125, "step": 5684 }, { "epoch": 4.7853535353535355, "grad_norm": 0.3948405086994171, "learning_rate": 2.7517675943335904e-06, "loss": 0.3041, "step": 5685 }, { "epoch": 4.786195286195286, "grad_norm": 0.4056791067123413, "learning_rate": 2.749893065510185e-06, "loss": 0.3136, "step": 5686 }, { "epoch": 4.787037037037037, "grad_norm": 0.43097659945487976, "learning_rate": 2.7480189331796143e-06, "loss": 0.2696, "step": 5687 }, { "epoch": 4.787878787878788, "grad_norm": 0.4217633605003357, "learning_rate": 2.7461451976721158e-06, "loss": 0.3121, "step": 5688 }, { "epoch": 4.788720538720539, "grad_norm": 0.4043844938278198, "learning_rate": 2.744271859317863e-06, "loss": 0.3109, "step": 5689 }, { "epoch": 4.78956228956229, "grad_norm": 0.3859269917011261, "learning_rate": 2.7423989184469563e-06, "loss": 0.2868, "step": 5690 }, { "epoch": 4.790404040404041, "grad_norm": 0.3960281014442444, "learning_rate": 2.740526375389429e-06, "loss": 0.2978, "step": 5691 }, { "epoch": 4.7912457912457915, "grad_norm": 0.4210144579410553, "learning_rate": 2.7386542304752455e-06, "loss": 0.2947, "step": 5692 }, { "epoch": 4.792087542087542, "grad_norm": 0.4055316150188446, "learning_rate": 2.736782484034296e-06, "loss": 0.287, "step": 5693 }, { "epoch": 4.792929292929293, "grad_norm": 0.3924580514431, "learning_rate": 2.7349111363964e-06, "loss": 0.282, "step": 5694 }, { "epoch": 4.793771043771044, "grad_norm": 0.41697564721107483, "learning_rate": 2.733040187891312e-06, "loss": 0.3152, "step": 5695 }, { "epoch": 4.794612794612795, "grad_norm": 0.37513428926467896, "learning_rate": 2.731169638848713e-06, "loss": 0.325, "step": 5696 }, { "epoch": 4.795454545454545, "grad_norm": 0.3793131709098816, "learning_rate": 2.7292994895982094e-06, "loss": 0.3123, "step": 5697 }, { "epoch": 4.796296296296296, "grad_norm": 0.39427152276039124, "learning_rate": 2.7274297404693463e-06, "loss": 0.2999, "step": 5698 }, { "epoch": 4.797138047138047, "grad_norm": 0.40922218561172485, "learning_rate": 2.7255603917915923e-06, "loss": 0.2962, "step": 5699 }, { "epoch": 4.797979797979798, "grad_norm": 0.40478232502937317, "learning_rate": 2.7236914438943464e-06, "loss": 0.2726, "step": 5700 }, { "epoch": 4.798821548821548, "grad_norm": 0.41623979806900024, "learning_rate": 2.721822897106937e-06, "loss": 0.2959, "step": 5701 }, { "epoch": 4.799663299663299, "grad_norm": 0.4191470146179199, "learning_rate": 2.71995475175862e-06, "loss": 0.2759, "step": 5702 }, { "epoch": 4.80050505050505, "grad_norm": 0.39419084787368774, "learning_rate": 2.718087008178584e-06, "loss": 0.2765, "step": 5703 }, { "epoch": 4.801346801346801, "grad_norm": 0.4112904667854309, "learning_rate": 2.716219666695947e-06, "loss": 0.2799, "step": 5704 }, { "epoch": 4.802188552188552, "grad_norm": 0.38366463780403137, "learning_rate": 2.7143527276397533e-06, "loss": 0.2963, "step": 5705 }, { "epoch": 4.803030303030303, "grad_norm": 0.41137221455574036, "learning_rate": 2.7124861913389778e-06, "loss": 0.2728, "step": 5706 }, { "epoch": 4.803872053872054, "grad_norm": 0.39851120114326477, "learning_rate": 2.7106200581225196e-06, "loss": 0.3014, "step": 5707 }, { "epoch": 4.8047138047138045, "grad_norm": 0.3935084044933319, "learning_rate": 2.7087543283192185e-06, "loss": 0.283, "step": 5708 }, { "epoch": 4.805555555555555, "grad_norm": 0.4100017547607422, "learning_rate": 2.706889002257833e-06, "loss": 0.3137, "step": 5709 }, { "epoch": 4.806397306397306, "grad_norm": 0.39778852462768555, "learning_rate": 2.7050240802670534e-06, "loss": 0.2952, "step": 5710 }, { "epoch": 4.807239057239057, "grad_norm": 0.41711682081222534, "learning_rate": 2.703159562675496e-06, "loss": 0.3078, "step": 5711 }, { "epoch": 4.808080808080808, "grad_norm": 0.4312971234321594, "learning_rate": 2.701295449811714e-06, "loss": 0.2946, "step": 5712 }, { "epoch": 4.808922558922559, "grad_norm": 0.3944981098175049, "learning_rate": 2.699431742004178e-06, "loss": 0.3006, "step": 5713 }, { "epoch": 4.80976430976431, "grad_norm": 0.457108736038208, "learning_rate": 2.6975684395812994e-06, "loss": 0.2872, "step": 5714 }, { "epoch": 4.8106060606060606, "grad_norm": 0.4169090986251831, "learning_rate": 2.695705542871407e-06, "loss": 0.2929, "step": 5715 }, { "epoch": 4.811447811447811, "grad_norm": 0.4104226529598236, "learning_rate": 2.693843052202767e-06, "loss": 0.2979, "step": 5716 }, { "epoch": 4.812289562289562, "grad_norm": 0.41991642117500305, "learning_rate": 2.6919809679035676e-06, "loss": 0.3078, "step": 5717 }, { "epoch": 4.813131313131313, "grad_norm": 0.3978438079357147, "learning_rate": 2.6901192903019268e-06, "loss": 0.2911, "step": 5718 }, { "epoch": 4.813973063973064, "grad_norm": 0.43643614649772644, "learning_rate": 2.688258019725895e-06, "loss": 0.3116, "step": 5719 }, { "epoch": 4.814814814814815, "grad_norm": 0.4144088625907898, "learning_rate": 2.686397156503445e-06, "loss": 0.3092, "step": 5720 }, { "epoch": 4.815656565656566, "grad_norm": 0.41137751936912537, "learning_rate": 2.6845367009624844e-06, "loss": 0.3103, "step": 5721 }, { "epoch": 4.816498316498317, "grad_norm": 0.4058806598186493, "learning_rate": 2.6826766534308434e-06, "loss": 0.2825, "step": 5722 }, { "epoch": 4.8173400673400675, "grad_norm": 0.43045851588249207, "learning_rate": 2.6808170142362797e-06, "loss": 0.2916, "step": 5723 }, { "epoch": 4.818181818181818, "grad_norm": 0.40016719698905945, "learning_rate": 2.678957783706484e-06, "loss": 0.2832, "step": 5724 }, { "epoch": 4.819023569023569, "grad_norm": 0.40830278396606445, "learning_rate": 2.6770989621690746e-06, "loss": 0.2944, "step": 5725 }, { "epoch": 4.81986531986532, "grad_norm": 0.4224494993686676, "learning_rate": 2.675240549951593e-06, "loss": 0.2907, "step": 5726 }, { "epoch": 4.820707070707071, "grad_norm": 0.4007267653942108, "learning_rate": 2.673382547381512e-06, "loss": 0.2971, "step": 5727 }, { "epoch": 4.821548821548822, "grad_norm": 0.4080795645713806, "learning_rate": 2.67152495478623e-06, "loss": 0.2908, "step": 5728 }, { "epoch": 4.822390572390573, "grad_norm": 0.39021748304367065, "learning_rate": 2.669667772493075e-06, "loss": 0.271, "step": 5729 }, { "epoch": 4.8232323232323235, "grad_norm": 0.42977821826934814, "learning_rate": 2.667811000829305e-06, "loss": 0.3026, "step": 5730 }, { "epoch": 4.824074074074074, "grad_norm": 0.412463903427124, "learning_rate": 2.6659546401221017e-06, "loss": 0.3113, "step": 5731 }, { "epoch": 4.824915824915825, "grad_norm": 0.41399192810058594, "learning_rate": 2.6640986906985724e-06, "loss": 0.285, "step": 5732 }, { "epoch": 4.825757575757576, "grad_norm": 0.3833169937133789, "learning_rate": 2.66224315288576e-06, "loss": 0.2992, "step": 5733 }, { "epoch": 4.826599326599327, "grad_norm": 0.4152488708496094, "learning_rate": 2.6603880270106263e-06, "loss": 0.279, "step": 5734 }, { "epoch": 4.827441077441078, "grad_norm": 0.4184335172176361, "learning_rate": 2.6585333134000673e-06, "loss": 0.3057, "step": 5735 }, { "epoch": 4.828282828282829, "grad_norm": 0.3964718282222748, "learning_rate": 2.6566790123809e-06, "loss": 0.3284, "step": 5736 }, { "epoch": 4.82912457912458, "grad_norm": 0.43030714988708496, "learning_rate": 2.6548251242798763e-06, "loss": 0.2836, "step": 5737 }, { "epoch": 4.8299663299663305, "grad_norm": 0.40716883540153503, "learning_rate": 2.652971649423668e-06, "loss": 0.2845, "step": 5738 }, { "epoch": 4.83080808080808, "grad_norm": 0.3812611401081085, "learning_rate": 2.6511185881388757e-06, "loss": 0.3083, "step": 5739 }, { "epoch": 4.831649831649831, "grad_norm": 0.42559677362442017, "learning_rate": 2.6492659407520326e-06, "loss": 0.3179, "step": 5740 }, { "epoch": 4.832491582491582, "grad_norm": 0.4722898006439209, "learning_rate": 2.647413707589591e-06, "loss": 0.3176, "step": 5741 }, { "epoch": 4.833333333333333, "grad_norm": 0.38053038716316223, "learning_rate": 2.645561888977939e-06, "loss": 0.2835, "step": 5742 }, { "epoch": 4.834175084175084, "grad_norm": 0.40432512760162354, "learning_rate": 2.6437104852433826e-06, "loss": 0.2821, "step": 5743 }, { "epoch": 4.835016835016835, "grad_norm": 0.38781410455703735, "learning_rate": 2.6418594967121587e-06, "loss": 0.2917, "step": 5744 }, { "epoch": 4.835858585858586, "grad_norm": 0.46449539065361023, "learning_rate": 2.640008923710432e-06, "loss": 0.2871, "step": 5745 }, { "epoch": 4.8367003367003365, "grad_norm": 0.39076897501945496, "learning_rate": 2.6381587665642965e-06, "loss": 0.3262, "step": 5746 }, { "epoch": 4.837542087542087, "grad_norm": 0.41292470693588257, "learning_rate": 2.636309025599767e-06, "loss": 0.291, "step": 5747 }, { "epoch": 4.838383838383838, "grad_norm": 0.3720254600048065, "learning_rate": 2.6344597011427862e-06, "loss": 0.309, "step": 5748 }, { "epoch": 4.839225589225589, "grad_norm": 0.41109922528266907, "learning_rate": 2.6326107935192258e-06, "loss": 0.2993, "step": 5749 }, { "epoch": 4.84006734006734, "grad_norm": 0.3996157944202423, "learning_rate": 2.6307623030548823e-06, "loss": 0.3025, "step": 5750 }, { "epoch": 4.840909090909091, "grad_norm": 0.4112224578857422, "learning_rate": 2.6289142300754826e-06, "loss": 0.3138, "step": 5751 }, { "epoch": 4.841750841750842, "grad_norm": 0.41363540291786194, "learning_rate": 2.627066574906675e-06, "loss": 0.2863, "step": 5752 }, { "epoch": 4.842592592592593, "grad_norm": 0.41548675298690796, "learning_rate": 2.625219337874033e-06, "loss": 0.3499, "step": 5753 }, { "epoch": 4.843434343434343, "grad_norm": 0.37049803137779236, "learning_rate": 2.623372519303065e-06, "loss": 0.2948, "step": 5754 }, { "epoch": 4.844276094276094, "grad_norm": 0.44134125113487244, "learning_rate": 2.6215261195191954e-06, "loss": 0.2802, "step": 5755 }, { "epoch": 4.845117845117845, "grad_norm": 0.4092375636100769, "learning_rate": 2.619680138847783e-06, "loss": 0.2738, "step": 5756 }, { "epoch": 4.845959595959596, "grad_norm": 0.40128275752067566, "learning_rate": 2.617834577614108e-06, "loss": 0.2897, "step": 5757 }, { "epoch": 4.846801346801347, "grad_norm": 0.41924259066581726, "learning_rate": 2.6159894361433754e-06, "loss": 0.3115, "step": 5758 }, { "epoch": 4.847643097643098, "grad_norm": 0.38044679164886475, "learning_rate": 2.614144714760723e-06, "loss": 0.3136, "step": 5759 }, { "epoch": 4.848484848484849, "grad_norm": 0.4219626486301422, "learning_rate": 2.612300413791209e-06, "loss": 0.2902, "step": 5760 }, { "epoch": 4.8493265993265995, "grad_norm": 0.40177202224731445, "learning_rate": 2.6104565335598155e-06, "loss": 0.2778, "step": 5761 }, { "epoch": 4.85016835016835, "grad_norm": 0.42997413873672485, "learning_rate": 2.6086130743914572e-06, "loss": 0.3109, "step": 5762 }, { "epoch": 4.851010101010101, "grad_norm": 0.416345477104187, "learning_rate": 2.606770036610973e-06, "loss": 0.3015, "step": 5763 }, { "epoch": 4.851851851851852, "grad_norm": 0.3707782030105591, "learning_rate": 2.6049274205431236e-06, "loss": 0.3002, "step": 5764 }, { "epoch": 4.852693602693603, "grad_norm": 0.4427900016307831, "learning_rate": 2.6030852265125973e-06, "loss": 0.2975, "step": 5765 }, { "epoch": 4.853535353535354, "grad_norm": 0.3939822018146515, "learning_rate": 2.6012434548440044e-06, "loss": 0.2815, "step": 5766 }, { "epoch": 4.854377104377105, "grad_norm": 0.3951409161090851, "learning_rate": 2.5994021058618932e-06, "loss": 0.2927, "step": 5767 }, { "epoch": 4.8552188552188555, "grad_norm": 0.4084109365940094, "learning_rate": 2.5975611798907253e-06, "loss": 0.2786, "step": 5768 }, { "epoch": 4.856060606060606, "grad_norm": 0.3994598686695099, "learning_rate": 2.5957206772548903e-06, "loss": 0.2733, "step": 5769 }, { "epoch": 4.856902356902357, "grad_norm": 0.38305479288101196, "learning_rate": 2.593880598278703e-06, "loss": 0.3357, "step": 5770 }, { "epoch": 4.857744107744107, "grad_norm": 0.4031047224998474, "learning_rate": 2.5920409432864067e-06, "loss": 0.284, "step": 5771 }, { "epoch": 4.858585858585858, "grad_norm": 0.3927496671676636, "learning_rate": 2.590201712602171e-06, "loss": 0.308, "step": 5772 }, { "epoch": 4.859427609427609, "grad_norm": 0.37989023327827454, "learning_rate": 2.588362906550085e-06, "loss": 0.3081, "step": 5773 }, { "epoch": 4.86026936026936, "grad_norm": 0.3744845390319824, "learning_rate": 2.5865245254541647e-06, "loss": 0.3035, "step": 5774 }, { "epoch": 4.861111111111111, "grad_norm": 0.37405383586883545, "learning_rate": 2.584686569638356e-06, "loss": 0.2857, "step": 5775 }, { "epoch": 4.861952861952862, "grad_norm": 0.3949374854564667, "learning_rate": 2.5828490394265245e-06, "loss": 0.3096, "step": 5776 }, { "epoch": 4.8627946127946124, "grad_norm": 0.38873445987701416, "learning_rate": 2.58101193514246e-06, "loss": 0.3169, "step": 5777 }, { "epoch": 4.863636363636363, "grad_norm": 0.41120943427085876, "learning_rate": 2.579175257109885e-06, "loss": 0.2984, "step": 5778 }, { "epoch": 4.864478114478114, "grad_norm": 0.397567480802536, "learning_rate": 2.577339005652437e-06, "loss": 0.2924, "step": 5779 }, { "epoch": 4.865319865319865, "grad_norm": 0.43756797909736633, "learning_rate": 2.575503181093687e-06, "loss": 0.275, "step": 5780 }, { "epoch": 4.866161616161616, "grad_norm": 0.4099750220775604, "learning_rate": 2.5736677837571252e-06, "loss": 0.2941, "step": 5781 }, { "epoch": 4.867003367003367, "grad_norm": 0.4172776937484741, "learning_rate": 2.5718328139661656e-06, "loss": 0.3126, "step": 5782 }, { "epoch": 4.867845117845118, "grad_norm": 0.4182609021663666, "learning_rate": 2.569998272044153e-06, "loss": 0.3116, "step": 5783 }, { "epoch": 4.8686868686868685, "grad_norm": 0.4478791654109955, "learning_rate": 2.568164158314353e-06, "loss": 0.2832, "step": 5784 }, { "epoch": 4.869528619528619, "grad_norm": 0.42539501190185547, "learning_rate": 2.5663304730999556e-06, "loss": 0.2782, "step": 5785 }, { "epoch": 4.87037037037037, "grad_norm": 0.40467992424964905, "learning_rate": 2.564497216724075e-06, "loss": 0.3157, "step": 5786 }, { "epoch": 4.871212121212121, "grad_norm": 0.42102405428886414, "learning_rate": 2.562664389509749e-06, "loss": 0.339, "step": 5787 }, { "epoch": 4.872053872053872, "grad_norm": 0.4519784450531006, "learning_rate": 2.560831991779943e-06, "loss": 0.3024, "step": 5788 }, { "epoch": 4.872895622895623, "grad_norm": 0.4097096920013428, "learning_rate": 2.5590000238575473e-06, "loss": 0.2984, "step": 5789 }, { "epoch": 4.873737373737374, "grad_norm": 0.41157111525535583, "learning_rate": 2.557168486065371e-06, "loss": 0.3024, "step": 5790 }, { "epoch": 4.874579124579125, "grad_norm": 0.3891858458518982, "learning_rate": 2.5553373787261505e-06, "loss": 0.2944, "step": 5791 }, { "epoch": 4.875420875420875, "grad_norm": 0.4122762680053711, "learning_rate": 2.553506702162549e-06, "loss": 0.2936, "step": 5792 }, { "epoch": 4.876262626262626, "grad_norm": 0.4550033509731293, "learning_rate": 2.551676456697148e-06, "loss": 0.3105, "step": 5793 }, { "epoch": 4.877104377104377, "grad_norm": 0.3881373107433319, "learning_rate": 2.5498466426524594e-06, "loss": 0.276, "step": 5794 }, { "epoch": 4.877946127946128, "grad_norm": 0.3975101113319397, "learning_rate": 2.548017260350915e-06, "loss": 0.3173, "step": 5795 }, { "epoch": 4.878787878787879, "grad_norm": 0.41085284948349, "learning_rate": 2.5461883101148687e-06, "loss": 0.299, "step": 5796 }, { "epoch": 4.87962962962963, "grad_norm": 0.42151185870170593, "learning_rate": 2.5443597922666053e-06, "loss": 0.2996, "step": 5797 }, { "epoch": 4.880471380471381, "grad_norm": 0.4073418974876404, "learning_rate": 2.5425317071283252e-06, "loss": 0.3001, "step": 5798 }, { "epoch": 4.8813131313131315, "grad_norm": 0.39744213223457336, "learning_rate": 2.5407040550221606e-06, "loss": 0.2688, "step": 5799 }, { "epoch": 4.882154882154882, "grad_norm": 0.39714863896369934, "learning_rate": 2.538876836270159e-06, "loss": 0.2809, "step": 5800 }, { "epoch": 4.882996632996633, "grad_norm": 0.4253028333187103, "learning_rate": 2.5370500511942996e-06, "loss": 0.3051, "step": 5801 }, { "epoch": 4.883838383838384, "grad_norm": 0.40171295404434204, "learning_rate": 2.535223700116479e-06, "loss": 0.2878, "step": 5802 }, { "epoch": 4.884680134680135, "grad_norm": 0.39228716492652893, "learning_rate": 2.53339778335852e-06, "loss": 0.3175, "step": 5803 }, { "epoch": 4.885521885521886, "grad_norm": 0.4312647581100464, "learning_rate": 2.5315723012421682e-06, "loss": 0.2885, "step": 5804 }, { "epoch": 4.886363636363637, "grad_norm": 0.43778133392333984, "learning_rate": 2.5297472540890966e-06, "loss": 0.296, "step": 5805 }, { "epoch": 4.8872053872053876, "grad_norm": 0.40619736909866333, "learning_rate": 2.5279226422208945e-06, "loss": 0.2967, "step": 5806 }, { "epoch": 4.888047138047138, "grad_norm": 0.4511032998561859, "learning_rate": 2.5260984659590786e-06, "loss": 0.289, "step": 5807 }, { "epoch": 4.888888888888889, "grad_norm": 0.41092365980148315, "learning_rate": 2.5242747256250864e-06, "loss": 0.3002, "step": 5808 }, { "epoch": 4.88973063973064, "grad_norm": 0.41294771432876587, "learning_rate": 2.5224514215402828e-06, "loss": 0.3234, "step": 5809 }, { "epoch": 4.890572390572391, "grad_norm": 0.39252138137817383, "learning_rate": 2.5206285540259544e-06, "loss": 0.2979, "step": 5810 }, { "epoch": 4.891414141414142, "grad_norm": 0.3661346733570099, "learning_rate": 2.518806123403309e-06, "loss": 0.2901, "step": 5811 }, { "epoch": 4.892255892255893, "grad_norm": 0.3997708857059479, "learning_rate": 2.5169841299934753e-06, "loss": 0.2818, "step": 5812 }, { "epoch": 4.893097643097643, "grad_norm": 0.39567476511001587, "learning_rate": 2.5151625741175123e-06, "loss": 0.3108, "step": 5813 }, { "epoch": 4.893939393939394, "grad_norm": 0.43727511167526245, "learning_rate": 2.5133414560963942e-06, "loss": 0.3133, "step": 5814 }, { "epoch": 4.8947811447811445, "grad_norm": 0.4349600076675415, "learning_rate": 2.511520776251024e-06, "loss": 0.3004, "step": 5815 }, { "epoch": 4.895622895622895, "grad_norm": 0.37654468417167664, "learning_rate": 2.5097005349022242e-06, "loss": 0.3109, "step": 5816 }, { "epoch": 4.896464646464646, "grad_norm": 0.43174317479133606, "learning_rate": 2.507880732370739e-06, "loss": 0.2939, "step": 5817 }, { "epoch": 4.897306397306397, "grad_norm": 0.3941152095794678, "learning_rate": 2.506061368977239e-06, "loss": 0.3057, "step": 5818 }, { "epoch": 4.898148148148148, "grad_norm": 0.4140738844871521, "learning_rate": 2.504242445042313e-06, "loss": 0.3118, "step": 5819 }, { "epoch": 4.898989898989899, "grad_norm": 0.39650124311447144, "learning_rate": 2.5024239608864785e-06, "loss": 0.2959, "step": 5820 }, { "epoch": 4.89983164983165, "grad_norm": 0.43234318494796753, "learning_rate": 2.500605916830169e-06, "loss": 0.3092, "step": 5821 }, { "epoch": 4.9006734006734005, "grad_norm": 0.39858847856521606, "learning_rate": 2.4987883131937448e-06, "loss": 0.3017, "step": 5822 }, { "epoch": 4.901515151515151, "grad_norm": 0.4063092768192291, "learning_rate": 2.4969711502974866e-06, "loss": 0.3201, "step": 5823 }, { "epoch": 4.902356902356902, "grad_norm": 0.40307021141052246, "learning_rate": 2.4951544284615954e-06, "loss": 0.2843, "step": 5824 }, { "epoch": 4.903198653198653, "grad_norm": 0.37773704528808594, "learning_rate": 2.4933381480062014e-06, "loss": 0.3242, "step": 5825 }, { "epoch": 4.904040404040404, "grad_norm": 0.3998887538909912, "learning_rate": 2.491522309251348e-06, "loss": 0.2828, "step": 5826 }, { "epoch": 4.904882154882155, "grad_norm": 0.3816906809806824, "learning_rate": 2.48970691251701e-06, "loss": 0.3041, "step": 5827 }, { "epoch": 4.905723905723906, "grad_norm": 0.4126293361186981, "learning_rate": 2.487891958123077e-06, "loss": 0.281, "step": 5828 }, { "epoch": 4.906565656565657, "grad_norm": 0.4128814935684204, "learning_rate": 2.4860774463893615e-06, "loss": 0.2636, "step": 5829 }, { "epoch": 4.907407407407407, "grad_norm": 0.475105881690979, "learning_rate": 2.484263377635603e-06, "loss": 0.2896, "step": 5830 }, { "epoch": 4.908249158249158, "grad_norm": 0.3930932581424713, "learning_rate": 2.4824497521814597e-06, "loss": 0.3071, "step": 5831 }, { "epoch": 4.909090909090909, "grad_norm": 0.391812264919281, "learning_rate": 2.480636570346511e-06, "loss": 0.2762, "step": 5832 }, { "epoch": 4.90993265993266, "grad_norm": 0.41047918796539307, "learning_rate": 2.4788238324502585e-06, "loss": 0.3032, "step": 5833 }, { "epoch": 4.910774410774411, "grad_norm": 0.40217846632003784, "learning_rate": 2.4770115388121236e-06, "loss": 0.298, "step": 5834 }, { "epoch": 4.911616161616162, "grad_norm": 0.4004841446876526, "learning_rate": 2.4751996897514547e-06, "loss": 0.2947, "step": 5835 }, { "epoch": 4.912457912457913, "grad_norm": 0.4042259156703949, "learning_rate": 2.4733882855875208e-06, "loss": 0.2867, "step": 5836 }, { "epoch": 4.9132996632996635, "grad_norm": 0.4094258248806, "learning_rate": 2.4715773266395076e-06, "loss": 0.2811, "step": 5837 }, { "epoch": 4.914141414141414, "grad_norm": 0.37756505608558655, "learning_rate": 2.4697668132265244e-06, "loss": 0.3337, "step": 5838 }, { "epoch": 4.914983164983165, "grad_norm": 0.371572881937027, "learning_rate": 2.4679567456676064e-06, "loss": 0.3163, "step": 5839 }, { "epoch": 4.915824915824916, "grad_norm": 0.39034605026245117, "learning_rate": 2.466147124281703e-06, "loss": 0.2967, "step": 5840 }, { "epoch": 4.916666666666667, "grad_norm": 0.4127872586250305, "learning_rate": 2.464337949387693e-06, "loss": 0.3037, "step": 5841 }, { "epoch": 4.917508417508418, "grad_norm": 0.4187646508216858, "learning_rate": 2.462529221304368e-06, "loss": 0.3066, "step": 5842 }, { "epoch": 4.918350168350169, "grad_norm": 0.42653223872184753, "learning_rate": 2.460720940350449e-06, "loss": 0.265, "step": 5843 }, { "epoch": 4.91919191919192, "grad_norm": 0.41418927907943726, "learning_rate": 2.4589131068445725e-06, "loss": 0.28, "step": 5844 }, { "epoch": 4.9200336700336695, "grad_norm": 0.378953754901886, "learning_rate": 2.4571057211052985e-06, "loss": 0.3057, "step": 5845 }, { "epoch": 4.92087542087542, "grad_norm": 0.39986440539360046, "learning_rate": 2.4552987834511056e-06, "loss": 0.2988, "step": 5846 }, { "epoch": 4.921717171717171, "grad_norm": 0.43130701780319214, "learning_rate": 2.453492294200397e-06, "loss": 0.2845, "step": 5847 }, { "epoch": 4.922558922558922, "grad_norm": 0.37645527720451355, "learning_rate": 2.4516862536714974e-06, "loss": 0.3077, "step": 5848 }, { "epoch": 4.923400673400673, "grad_norm": 0.4048224985599518, "learning_rate": 2.4498806621826482e-06, "loss": 0.3059, "step": 5849 }, { "epoch": 4.924242424242424, "grad_norm": 0.3999386429786682, "learning_rate": 2.448075520052013e-06, "loss": 0.3052, "step": 5850 }, { "epoch": 4.925084175084175, "grad_norm": 0.4255780577659607, "learning_rate": 2.446270827597678e-06, "loss": 0.2726, "step": 5851 }, { "epoch": 4.925925925925926, "grad_norm": 0.444161057472229, "learning_rate": 2.4444665851376516e-06, "loss": 0.2729, "step": 5852 }, { "epoch": 4.9267676767676765, "grad_norm": 0.42131921648979187, "learning_rate": 2.4426627929898584e-06, "loss": 0.3061, "step": 5853 }, { "epoch": 4.927609427609427, "grad_norm": 0.3907037079334259, "learning_rate": 2.4408594514721457e-06, "loss": 0.2944, "step": 5854 }, { "epoch": 4.928451178451178, "grad_norm": 0.42459607124328613, "learning_rate": 2.43905656090228e-06, "loss": 0.2829, "step": 5855 }, { "epoch": 4.929292929292929, "grad_norm": 0.4183717668056488, "learning_rate": 2.4372541215979513e-06, "loss": 0.3043, "step": 5856 }, { "epoch": 4.93013468013468, "grad_norm": 0.3895753026008606, "learning_rate": 2.435452133876771e-06, "loss": 0.3043, "step": 5857 }, { "epoch": 4.930976430976431, "grad_norm": 0.4155850112438202, "learning_rate": 2.433650598056267e-06, "loss": 0.3143, "step": 5858 }, { "epoch": 4.931818181818182, "grad_norm": 0.45281893014907837, "learning_rate": 2.4318495144538856e-06, "loss": 0.3051, "step": 5859 }, { "epoch": 4.9326599326599325, "grad_norm": 0.3937387764453888, "learning_rate": 2.430048883387003e-06, "loss": 0.2993, "step": 5860 }, { "epoch": 4.933501683501683, "grad_norm": 0.4213716685771942, "learning_rate": 2.4282487051729055e-06, "loss": 0.3026, "step": 5861 }, { "epoch": 4.934343434343434, "grad_norm": 0.376581609249115, "learning_rate": 2.426448980128803e-06, "loss": 0.311, "step": 5862 }, { "epoch": 4.935185185185185, "grad_norm": 0.3943955898284912, "learning_rate": 2.42464970857183e-06, "loss": 0.2787, "step": 5863 }, { "epoch": 4.936026936026936, "grad_norm": 0.3740600049495697, "learning_rate": 2.422850890819033e-06, "loss": 0.3021, "step": 5864 }, { "epoch": 4.936868686868687, "grad_norm": 0.41254255175590515, "learning_rate": 2.4210525271873877e-06, "loss": 0.3113, "step": 5865 }, { "epoch": 4.937710437710438, "grad_norm": 0.3592768907546997, "learning_rate": 2.4192546179937813e-06, "loss": 0.3046, "step": 5866 }, { "epoch": 4.938552188552189, "grad_norm": 0.4421049952507019, "learning_rate": 2.417457163555025e-06, "loss": 0.3, "step": 5867 }, { "epoch": 4.9393939393939394, "grad_norm": 0.3888101875782013, "learning_rate": 2.415660164187849e-06, "loss": 0.2969, "step": 5868 }, { "epoch": 4.94023569023569, "grad_norm": 0.3834875226020813, "learning_rate": 2.413863620208907e-06, "loss": 0.321, "step": 5869 }, { "epoch": 4.941077441077441, "grad_norm": 0.3965831696987152, "learning_rate": 2.4120675319347676e-06, "loss": 0.3205, "step": 5870 }, { "epoch": 4.941919191919192, "grad_norm": 0.39080360531806946, "learning_rate": 2.41027189968192e-06, "loss": 0.2913, "step": 5871 }, { "epoch": 4.942760942760943, "grad_norm": 0.3988180160522461, "learning_rate": 2.408476723766769e-06, "loss": 0.2675, "step": 5872 }, { "epoch": 4.943602693602694, "grad_norm": 0.4153614640235901, "learning_rate": 2.4066820045056534e-06, "loss": 0.2972, "step": 5873 }, { "epoch": 4.944444444444445, "grad_norm": 0.4142568111419678, "learning_rate": 2.4048877422148163e-06, "loss": 0.2889, "step": 5874 }, { "epoch": 4.9452861952861955, "grad_norm": 0.4243411421775818, "learning_rate": 2.4030939372104274e-06, "loss": 0.2848, "step": 5875 }, { "epoch": 4.946127946127946, "grad_norm": 0.4401875138282776, "learning_rate": 2.401300589808571e-06, "loss": 0.279, "step": 5876 }, { "epoch": 4.946969696969697, "grad_norm": 0.38341715931892395, "learning_rate": 2.3995077003252586e-06, "loss": 0.3199, "step": 5877 }, { "epoch": 4.947811447811448, "grad_norm": 0.3949824273586273, "learning_rate": 2.3977152690764127e-06, "loss": 0.3161, "step": 5878 }, { "epoch": 4.948653198653199, "grad_norm": 0.4526062309741974, "learning_rate": 2.395923296377882e-06, "loss": 0.3143, "step": 5879 }, { "epoch": 4.94949494949495, "grad_norm": 0.405387282371521, "learning_rate": 2.394131782545428e-06, "loss": 0.2741, "step": 5880 }, { "epoch": 4.950336700336701, "grad_norm": 0.3908955454826355, "learning_rate": 2.392340727894738e-06, "loss": 0.3152, "step": 5881 }, { "epoch": 4.951178451178452, "grad_norm": 0.411672443151474, "learning_rate": 2.3905501327414134e-06, "loss": 0.2945, "step": 5882 }, { "epoch": 4.952020202020202, "grad_norm": 0.4091075360774994, "learning_rate": 2.3887599974009735e-06, "loss": 0.3067, "step": 5883 }, { "epoch": 4.952861952861953, "grad_norm": 0.4192813038825989, "learning_rate": 2.3869703221888637e-06, "loss": 0.2999, "step": 5884 }, { "epoch": 4.953703703703704, "grad_norm": 0.37369590997695923, "learning_rate": 2.3851811074204394e-06, "loss": 0.3138, "step": 5885 }, { "epoch": 4.954545454545455, "grad_norm": 0.411734014749527, "learning_rate": 2.383392353410984e-06, "loss": 0.3017, "step": 5886 }, { "epoch": 4.955387205387205, "grad_norm": 0.4233885109424591, "learning_rate": 2.3816040604756925e-06, "loss": 0.3316, "step": 5887 }, { "epoch": 4.956228956228956, "grad_norm": 0.41221925616264343, "learning_rate": 2.3798162289296793e-06, "loss": 0.29, "step": 5888 }, { "epoch": 4.957070707070707, "grad_norm": 0.3939487338066101, "learning_rate": 2.378028859087982e-06, "loss": 0.3062, "step": 5889 }, { "epoch": 4.957912457912458, "grad_norm": 0.4244626760482788, "learning_rate": 2.3762419512655555e-06, "loss": 0.3019, "step": 5890 }, { "epoch": 4.9587542087542085, "grad_norm": 0.40818309783935547, "learning_rate": 2.3744555057772693e-06, "loss": 0.2845, "step": 5891 }, { "epoch": 4.959595959595959, "grad_norm": 0.41206786036491394, "learning_rate": 2.372669522937916e-06, "loss": 0.2984, "step": 5892 }, { "epoch": 4.96043771043771, "grad_norm": 0.41284260153770447, "learning_rate": 2.3708840030622016e-06, "loss": 0.3018, "step": 5893 }, { "epoch": 4.961279461279461, "grad_norm": 0.41156601905822754, "learning_rate": 2.369098946464756e-06, "loss": 0.3032, "step": 5894 }, { "epoch": 4.962121212121212, "grad_norm": 0.4139055013656616, "learning_rate": 2.3673143534601274e-06, "loss": 0.3075, "step": 5895 }, { "epoch": 4.962962962962963, "grad_norm": 0.40665990114212036, "learning_rate": 2.365530224362777e-06, "loss": 0.2799, "step": 5896 }, { "epoch": 4.963804713804714, "grad_norm": 0.430160790681839, "learning_rate": 2.3637465594870875e-06, "loss": 0.2749, "step": 5897 }, { "epoch": 4.9646464646464645, "grad_norm": 0.45353949069976807, "learning_rate": 2.361963359147361e-06, "loss": 0.2928, "step": 5898 }, { "epoch": 4.965488215488215, "grad_norm": 0.38627785444259644, "learning_rate": 2.360180623657815e-06, "loss": 0.315, "step": 5899 }, { "epoch": 4.966329966329966, "grad_norm": 0.4051484763622284, "learning_rate": 2.3583983533325888e-06, "loss": 0.2906, "step": 5900 }, { "epoch": 4.967171717171717, "grad_norm": 0.4183894991874695, "learning_rate": 2.356616548485736e-06, "loss": 0.3003, "step": 5901 }, { "epoch": 4.968013468013468, "grad_norm": 0.4007236063480377, "learning_rate": 2.3548352094312283e-06, "loss": 0.2985, "step": 5902 }, { "epoch": 4.968855218855219, "grad_norm": 0.39620906114578247, "learning_rate": 2.353054336482959e-06, "loss": 0.3237, "step": 5903 }, { "epoch": 4.96969696969697, "grad_norm": 0.4122638702392578, "learning_rate": 2.3512739299547345e-06, "loss": 0.2753, "step": 5904 }, { "epoch": 4.970538720538721, "grad_norm": 0.4471045434474945, "learning_rate": 2.349493990160284e-06, "loss": 0.328, "step": 5905 }, { "epoch": 4.9713804713804715, "grad_norm": 0.3814716935157776, "learning_rate": 2.347714517413249e-06, "loss": 0.2729, "step": 5906 }, { "epoch": 4.972222222222222, "grad_norm": 0.41382142901420593, "learning_rate": 2.345935512027195e-06, "loss": 0.2675, "step": 5907 }, { "epoch": 4.973063973063973, "grad_norm": 0.39837390184402466, "learning_rate": 2.3441569743156e-06, "loss": 0.3025, "step": 5908 }, { "epoch": 4.973905723905724, "grad_norm": 0.4574776589870453, "learning_rate": 2.342378904591861e-06, "loss": 0.2899, "step": 5909 }, { "epoch": 4.974747474747475, "grad_norm": 0.3775649070739746, "learning_rate": 2.34060130316929e-06, "loss": 0.319, "step": 5910 }, { "epoch": 4.975589225589226, "grad_norm": 0.4088735282421112, "learning_rate": 2.3388241703611255e-06, "loss": 0.2825, "step": 5911 }, { "epoch": 4.976430976430977, "grad_norm": 0.36413446068763733, "learning_rate": 2.3370475064805137e-06, "loss": 0.3178, "step": 5912 }, { "epoch": 4.9772727272727275, "grad_norm": 0.3629331588745117, "learning_rate": 2.3352713118405228e-06, "loss": 0.3041, "step": 5913 }, { "epoch": 4.978114478114478, "grad_norm": 0.374537855386734, "learning_rate": 2.3334955867541342e-06, "loss": 0.3315, "step": 5914 }, { "epoch": 4.978956228956229, "grad_norm": 0.3801814317703247, "learning_rate": 2.3317203315342523e-06, "loss": 0.2753, "step": 5915 }, { "epoch": 4.97979797979798, "grad_norm": 0.3623477816581726, "learning_rate": 2.3299455464936976e-06, "loss": 0.2977, "step": 5916 }, { "epoch": 4.980639730639731, "grad_norm": 0.3749188780784607, "learning_rate": 2.3281712319452036e-06, "loss": 0.2724, "step": 5917 }, { "epoch": 4.981481481481482, "grad_norm": 0.38506609201431274, "learning_rate": 2.326397388201423e-06, "loss": 0.2747, "step": 5918 }, { "epoch": 4.982323232323233, "grad_norm": 0.3704449534416199, "learning_rate": 2.3246240155749284e-06, "loss": 0.281, "step": 5919 }, { "epoch": 4.983164983164983, "grad_norm": 0.4005393385887146, "learning_rate": 2.3228511143782033e-06, "loss": 0.3268, "step": 5920 }, { "epoch": 4.9840067340067336, "grad_norm": 0.3886265456676483, "learning_rate": 2.3210786849236567e-06, "loss": 0.2915, "step": 5921 }, { "epoch": 4.984848484848484, "grad_norm": 0.4376007616519928, "learning_rate": 2.3193067275236065e-06, "loss": 0.3123, "step": 5922 }, { "epoch": 4.985690235690235, "grad_norm": 0.45195943117141724, "learning_rate": 2.3175352424902885e-06, "loss": 0.2915, "step": 5923 }, { "epoch": 4.986531986531986, "grad_norm": 0.3823513686656952, "learning_rate": 2.3157642301358624e-06, "loss": 0.3265, "step": 5924 }, { "epoch": 4.987373737373737, "grad_norm": 0.443708211183548, "learning_rate": 2.3139936907723966e-06, "loss": 0.3049, "step": 5925 }, { "epoch": 4.988215488215488, "grad_norm": 0.40494856238365173, "learning_rate": 2.3122236247118768e-06, "loss": 0.3072, "step": 5926 }, { "epoch": 4.989057239057239, "grad_norm": 0.4187083840370178, "learning_rate": 2.3104540322662104e-06, "loss": 0.3021, "step": 5927 }, { "epoch": 4.98989898989899, "grad_norm": 0.3474942445755005, "learning_rate": 2.3086849137472197e-06, "loss": 0.3279, "step": 5928 }, { "epoch": 4.9907407407407405, "grad_norm": 0.41929346323013306, "learning_rate": 2.30691626946664e-06, "loss": 0.3046, "step": 5929 }, { "epoch": 4.991582491582491, "grad_norm": 0.42835360765457153, "learning_rate": 2.3051480997361257e-06, "loss": 0.3019, "step": 5930 }, { "epoch": 4.992424242424242, "grad_norm": 0.39177027344703674, "learning_rate": 2.303380404867246e-06, "loss": 0.309, "step": 5931 }, { "epoch": 4.993265993265993, "grad_norm": 0.3721383810043335, "learning_rate": 2.3016131851714886e-06, "loss": 0.3091, "step": 5932 }, { "epoch": 4.994107744107744, "grad_norm": 0.3800475001335144, "learning_rate": 2.299846440960258e-06, "loss": 0.3089, "step": 5933 }, { "epoch": 4.994949494949495, "grad_norm": 0.3607935309410095, "learning_rate": 2.298080172544872e-06, "loss": 0.2977, "step": 5934 }, { "epoch": 4.995791245791246, "grad_norm": 0.366188645362854, "learning_rate": 2.2963143802365645e-06, "loss": 0.3162, "step": 5935 }, { "epoch": 4.9966329966329965, "grad_norm": 0.40053167939186096, "learning_rate": 2.294549064346488e-06, "loss": 0.2968, "step": 5936 }, { "epoch": 4.997474747474747, "grad_norm": 0.3908006250858307, "learning_rate": 2.2927842251857126e-06, "loss": 0.2945, "step": 5937 }, { "epoch": 4.998316498316498, "grad_norm": 0.382779598236084, "learning_rate": 2.2910198630652185e-06, "loss": 0.2872, "step": 5938 }, { "epoch": 4.999158249158249, "grad_norm": 0.38359639048576355, "learning_rate": 2.289255978295907e-06, "loss": 0.2975, "step": 5939 }, { "epoch": 5.0, "grad_norm": 0.41389232873916626, "learning_rate": 2.28749257118859e-06, "loss": 0.2683, "step": 5940 }, { "epoch": 5.000841750841751, "grad_norm": 0.4411732852458954, "learning_rate": 2.2857296420540036e-06, "loss": 0.2726, "step": 5941 }, { "epoch": 5.001683501683502, "grad_norm": 0.39064785838127136, "learning_rate": 2.2839671912027904e-06, "loss": 0.2551, "step": 5942 }, { "epoch": 5.002525252525253, "grad_norm": 0.6583713293075562, "learning_rate": 2.2822052189455166e-06, "loss": 0.2776, "step": 5943 }, { "epoch": 5.0033670033670035, "grad_norm": 0.44271358847618103, "learning_rate": 2.280443725592657e-06, "loss": 0.2859, "step": 5944 }, { "epoch": 5.004208754208754, "grad_norm": 0.4687883257865906, "learning_rate": 2.278682711454609e-06, "loss": 0.2598, "step": 5945 }, { "epoch": 5.005050505050505, "grad_norm": 0.49129343032836914, "learning_rate": 2.2769221768416807e-06, "loss": 0.2832, "step": 5946 }, { "epoch": 5.005892255892256, "grad_norm": 0.43634477257728577, "learning_rate": 2.275162122064094e-06, "loss": 0.2701, "step": 5947 }, { "epoch": 5.006734006734007, "grad_norm": 0.405460923910141, "learning_rate": 2.2734025474319925e-06, "loss": 0.2865, "step": 5948 }, { "epoch": 5.007575757575758, "grad_norm": 0.4252626299858093, "learning_rate": 2.2716434532554338e-06, "loss": 0.2533, "step": 5949 }, { "epoch": 5.008417508417509, "grad_norm": 0.4175722897052765, "learning_rate": 2.269884839844386e-06, "loss": 0.2568, "step": 5950 }, { "epoch": 5.0092592592592595, "grad_norm": 0.46242555975914, "learning_rate": 2.2681267075087372e-06, "loss": 0.2406, "step": 5951 }, { "epoch": 5.01010101010101, "grad_norm": 0.4006630778312683, "learning_rate": 2.2663690565582858e-06, "loss": 0.2825, "step": 5952 }, { "epoch": 5.010942760942761, "grad_norm": 0.42669522762298584, "learning_rate": 2.264611887302751e-06, "loss": 0.2474, "step": 5953 }, { "epoch": 5.011784511784512, "grad_norm": 0.4186941683292389, "learning_rate": 2.262855200051767e-06, "loss": 0.2574, "step": 5954 }, { "epoch": 5.012626262626263, "grad_norm": 0.40509793162345886, "learning_rate": 2.261098995114878e-06, "loss": 0.2993, "step": 5955 }, { "epoch": 5.013468013468014, "grad_norm": 0.42559608817100525, "learning_rate": 2.2593432728015455e-06, "loss": 0.2532, "step": 5956 }, { "epoch": 5.014309764309765, "grad_norm": 0.43612492084503174, "learning_rate": 2.2575880334211493e-06, "loss": 0.2716, "step": 5957 }, { "epoch": 5.015151515151516, "grad_norm": 0.4033622145652771, "learning_rate": 2.2558332772829776e-06, "loss": 0.2359, "step": 5958 }, { "epoch": 5.015993265993266, "grad_norm": 0.4045875072479248, "learning_rate": 2.2540790046962406e-06, "loss": 0.2474, "step": 5959 }, { "epoch": 5.016835016835016, "grad_norm": 0.42509281635284424, "learning_rate": 2.252325215970059e-06, "loss": 0.2783, "step": 5960 }, { "epoch": 5.017676767676767, "grad_norm": 0.41479817032814026, "learning_rate": 2.250571911413467e-06, "loss": 0.2682, "step": 5961 }, { "epoch": 5.018518518518518, "grad_norm": 0.421549528837204, "learning_rate": 2.248819091335418e-06, "loss": 0.2919, "step": 5962 }, { "epoch": 5.019360269360269, "grad_norm": 0.40141671895980835, "learning_rate": 2.247066756044775e-06, "loss": 0.2278, "step": 5963 }, { "epoch": 5.02020202020202, "grad_norm": 0.39838045835494995, "learning_rate": 2.245314905850322e-06, "loss": 0.2747, "step": 5964 }, { "epoch": 5.021043771043771, "grad_norm": 0.4000127911567688, "learning_rate": 2.2435635410607494e-06, "loss": 0.2799, "step": 5965 }, { "epoch": 5.021885521885522, "grad_norm": 0.4019315540790558, "learning_rate": 2.24181266198467e-06, "loss": 0.262, "step": 5966 }, { "epoch": 5.0227272727272725, "grad_norm": 0.4295208752155304, "learning_rate": 2.2400622689306057e-06, "loss": 0.2792, "step": 5967 }, { "epoch": 5.023569023569023, "grad_norm": 0.3959695100784302, "learning_rate": 2.238312362206993e-06, "loss": 0.2456, "step": 5968 }, { "epoch": 5.024410774410774, "grad_norm": 0.39150843024253845, "learning_rate": 2.2365629421221874e-06, "loss": 0.2401, "step": 5969 }, { "epoch": 5.025252525252525, "grad_norm": 0.45152267813682556, "learning_rate": 2.2348140089844515e-06, "loss": 0.2469, "step": 5970 }, { "epoch": 5.026094276094276, "grad_norm": 0.4001217484474182, "learning_rate": 2.23306556310197e-06, "loss": 0.2582, "step": 5971 }, { "epoch": 5.026936026936027, "grad_norm": 0.4168945252895355, "learning_rate": 2.2313176047828355e-06, "loss": 0.258, "step": 5972 }, { "epoch": 5.027777777777778, "grad_norm": 0.392500102519989, "learning_rate": 2.229570134335056e-06, "loss": 0.2797, "step": 5973 }, { "epoch": 5.0286195286195285, "grad_norm": 0.3865589499473572, "learning_rate": 2.2278231520665546e-06, "loss": 0.2664, "step": 5974 }, { "epoch": 5.029461279461279, "grad_norm": 0.41025862097740173, "learning_rate": 2.226076658285172e-06, "loss": 0.2329, "step": 5975 }, { "epoch": 5.03030303030303, "grad_norm": 0.39388182759284973, "learning_rate": 2.224330653298655e-06, "loss": 0.2779, "step": 5976 }, { "epoch": 5.031144781144781, "grad_norm": 0.4355925917625427, "learning_rate": 2.22258513741467e-06, "loss": 0.2499, "step": 5977 }, { "epoch": 5.031986531986532, "grad_norm": 0.4160754084587097, "learning_rate": 2.2208401109407936e-06, "loss": 0.2769, "step": 5978 }, { "epoch": 5.032828282828283, "grad_norm": 0.40377524495124817, "learning_rate": 2.219095574184519e-06, "loss": 0.2646, "step": 5979 }, { "epoch": 5.033670033670034, "grad_norm": 0.3962368965148926, "learning_rate": 2.2173515274532543e-06, "loss": 0.2725, "step": 5980 }, { "epoch": 5.034511784511785, "grad_norm": 0.39846253395080566, "learning_rate": 2.2156079710543175e-06, "loss": 0.273, "step": 5981 }, { "epoch": 5.0353535353535355, "grad_norm": 0.42549848556518555, "learning_rate": 2.213864905294941e-06, "loss": 0.2579, "step": 5982 }, { "epoch": 5.036195286195286, "grad_norm": 0.40379810333251953, "learning_rate": 2.2121223304822725e-06, "loss": 0.2568, "step": 5983 }, { "epoch": 5.037037037037037, "grad_norm": 0.4137783348560333, "learning_rate": 2.2103802469233715e-06, "loss": 0.264, "step": 5984 }, { "epoch": 5.037878787878788, "grad_norm": 0.45181629061698914, "learning_rate": 2.208638654925214e-06, "loss": 0.2664, "step": 5985 }, { "epoch": 5.038720538720539, "grad_norm": 0.41941317915916443, "learning_rate": 2.206897554794683e-06, "loss": 0.2617, "step": 5986 }, { "epoch": 5.03956228956229, "grad_norm": 0.39530083537101746, "learning_rate": 2.205156946838583e-06, "loss": 0.2665, "step": 5987 }, { "epoch": 5.040404040404041, "grad_norm": 0.3949591815471649, "learning_rate": 2.2034168313636267e-06, "loss": 0.2769, "step": 5988 }, { "epoch": 5.0412457912457915, "grad_norm": 0.3899107575416565, "learning_rate": 2.20167720867644e-06, "loss": 0.2946, "step": 5989 }, { "epoch": 5.042087542087542, "grad_norm": 0.4126301407814026, "learning_rate": 2.199938079083561e-06, "loss": 0.248, "step": 5990 }, { "epoch": 5.042929292929293, "grad_norm": 0.4174189269542694, "learning_rate": 2.1981994428914453e-06, "loss": 0.263, "step": 5991 }, { "epoch": 5.043771043771044, "grad_norm": 0.39632487297058105, "learning_rate": 2.196461300406461e-06, "loss": 0.2622, "step": 5992 }, { "epoch": 5.044612794612795, "grad_norm": 0.3676290214061737, "learning_rate": 2.1947236519348854e-06, "loss": 0.2789, "step": 5993 }, { "epoch": 5.045454545454546, "grad_norm": 0.4088633954524994, "learning_rate": 2.1929864977829084e-06, "loss": 0.2574, "step": 5994 }, { "epoch": 5.046296296296297, "grad_norm": 0.410428524017334, "learning_rate": 2.191249838256638e-06, "loss": 0.2637, "step": 5995 }, { "epoch": 5.047138047138047, "grad_norm": 0.38667941093444824, "learning_rate": 2.189513673662092e-06, "loss": 0.279, "step": 5996 }, { "epoch": 5.047979797979798, "grad_norm": 0.3957095146179199, "learning_rate": 2.187778004305201e-06, "loss": 0.2692, "step": 5997 }, { "epoch": 5.048821548821548, "grad_norm": 0.40082499384880066, "learning_rate": 2.186042830491808e-06, "loss": 0.2711, "step": 5998 }, { "epoch": 5.049663299663299, "grad_norm": 0.3969668447971344, "learning_rate": 2.184308152527667e-06, "loss": 0.265, "step": 5999 }, { "epoch": 5.05050505050505, "grad_norm": 0.3574749231338501, "learning_rate": 2.1825739707184494e-06, "loss": 0.2871, "step": 6000 }, { "epoch": 5.051346801346801, "grad_norm": 0.39359939098358154, "learning_rate": 2.1808402853697374e-06, "loss": 0.2424, "step": 6001 }, { "epoch": 5.052188552188552, "grad_norm": 0.40651363134384155, "learning_rate": 2.179107096787023e-06, "loss": 0.236, "step": 6002 }, { "epoch": 5.053030303030303, "grad_norm": 0.39821329712867737, "learning_rate": 2.1773744052757117e-06, "loss": 0.2753, "step": 6003 }, { "epoch": 5.053872053872054, "grad_norm": 0.4146057665348053, "learning_rate": 2.1756422111411245e-06, "loss": 0.2683, "step": 6004 }, { "epoch": 5.0547138047138045, "grad_norm": 0.40100690722465515, "learning_rate": 2.173910514688492e-06, "loss": 0.2515, "step": 6005 }, { "epoch": 5.055555555555555, "grad_norm": 0.409116268157959, "learning_rate": 2.172179316222955e-06, "loss": 0.2605, "step": 6006 }, { "epoch": 5.056397306397306, "grad_norm": 0.4118867516517639, "learning_rate": 2.1704486160495724e-06, "loss": 0.2527, "step": 6007 }, { "epoch": 5.057239057239057, "grad_norm": 0.40742406249046326, "learning_rate": 2.168718414473309e-06, "loss": 0.2528, "step": 6008 }, { "epoch": 5.058080808080808, "grad_norm": 0.40339821577072144, "learning_rate": 2.166988711799049e-06, "loss": 0.2825, "step": 6009 }, { "epoch": 5.058922558922559, "grad_norm": 0.4243990182876587, "learning_rate": 2.1652595083315815e-06, "loss": 0.2482, "step": 6010 }, { "epoch": 5.05976430976431, "grad_norm": 0.37848740816116333, "learning_rate": 2.163530804375609e-06, "loss": 0.2854, "step": 6011 }, { "epoch": 5.0606060606060606, "grad_norm": 0.3893365263938904, "learning_rate": 2.1618026002357495e-06, "loss": 0.2931, "step": 6012 }, { "epoch": 5.061447811447811, "grad_norm": 0.40246516466140747, "learning_rate": 2.160074896216533e-06, "loss": 0.2707, "step": 6013 }, { "epoch": 5.062289562289562, "grad_norm": 0.4059683084487915, "learning_rate": 2.158347692622398e-06, "loss": 0.2493, "step": 6014 }, { "epoch": 5.063131313131313, "grad_norm": 0.407571941614151, "learning_rate": 2.1566209897576947e-06, "loss": 0.2472, "step": 6015 }, { "epoch": 5.063973063973064, "grad_norm": 0.3854617178440094, "learning_rate": 2.154894787926684e-06, "loss": 0.2526, "step": 6016 }, { "epoch": 5.064814814814815, "grad_norm": 0.4221910536289215, "learning_rate": 2.1531690874335485e-06, "loss": 0.2815, "step": 6017 }, { "epoch": 5.065656565656566, "grad_norm": 0.3997596204280853, "learning_rate": 2.1514438885823714e-06, "loss": 0.2493, "step": 6018 }, { "epoch": 5.066498316498317, "grad_norm": 0.41092997789382935, "learning_rate": 2.1497191916771497e-06, "loss": 0.2825, "step": 6019 }, { "epoch": 5.0673400673400675, "grad_norm": 0.41194313764572144, "learning_rate": 2.1479949970217933e-06, "loss": 0.2608, "step": 6020 }, { "epoch": 5.068181818181818, "grad_norm": 0.4161786139011383, "learning_rate": 2.1462713049201266e-06, "loss": 0.2571, "step": 6021 }, { "epoch": 5.069023569023569, "grad_norm": 0.40598395466804504, "learning_rate": 2.1445481156758795e-06, "loss": 0.2956, "step": 6022 }, { "epoch": 5.06986531986532, "grad_norm": 0.39183667302131653, "learning_rate": 2.142825429592699e-06, "loss": 0.2604, "step": 6023 }, { "epoch": 5.070707070707071, "grad_norm": 0.4360606074333191, "learning_rate": 2.1411032469741373e-06, "loss": 0.2607, "step": 6024 }, { "epoch": 5.071548821548822, "grad_norm": 0.3930286765098572, "learning_rate": 2.139381568123665e-06, "loss": 0.2565, "step": 6025 }, { "epoch": 5.072390572390573, "grad_norm": 0.4107861816883087, "learning_rate": 2.1376603933446595e-06, "loss": 0.2495, "step": 6026 }, { "epoch": 5.0732323232323235, "grad_norm": 0.3806445896625519, "learning_rate": 2.1359397229404065e-06, "loss": 0.2855, "step": 6027 }, { "epoch": 5.074074074074074, "grad_norm": 0.4014904797077179, "learning_rate": 2.1342195572141117e-06, "loss": 0.2456, "step": 6028 }, { "epoch": 5.074915824915825, "grad_norm": 0.39301908016204834, "learning_rate": 2.1324998964688827e-06, "loss": 0.3041, "step": 6029 }, { "epoch": 5.075757575757576, "grad_norm": 0.38672539591789246, "learning_rate": 2.1307807410077448e-06, "loss": 0.2794, "step": 6030 }, { "epoch": 5.076599326599327, "grad_norm": 0.40350937843322754, "learning_rate": 2.1290620911336308e-06, "loss": 0.264, "step": 6031 }, { "epoch": 5.077441077441078, "grad_norm": 0.4139862656593323, "learning_rate": 2.127343947149382e-06, "loss": 0.2468, "step": 6032 }, { "epoch": 5.078282828282829, "grad_norm": 0.4303136169910431, "learning_rate": 2.1256263093577567e-06, "loss": 0.2751, "step": 6033 }, { "epoch": 5.079124579124579, "grad_norm": 0.3674314618110657, "learning_rate": 2.1239091780614228e-06, "loss": 0.2671, "step": 6034 }, { "epoch": 5.07996632996633, "grad_norm": 0.4030522108078003, "learning_rate": 2.1221925535629544e-06, "loss": 0.2831, "step": 6035 }, { "epoch": 5.08080808080808, "grad_norm": 0.40567487478256226, "learning_rate": 2.12047643616484e-06, "loss": 0.2621, "step": 6036 }, { "epoch": 5.081649831649831, "grad_norm": 0.40236419439315796, "learning_rate": 2.118760826169475e-06, "loss": 0.25, "step": 6037 }, { "epoch": 5.082491582491582, "grad_norm": 0.39013996720314026, "learning_rate": 2.1170457238791712e-06, "loss": 0.2587, "step": 6038 }, { "epoch": 5.083333333333333, "grad_norm": 0.4244282841682434, "learning_rate": 2.1153311295961486e-06, "loss": 0.2791, "step": 6039 }, { "epoch": 5.084175084175084, "grad_norm": 0.38855066895484924, "learning_rate": 2.1136170436225363e-06, "loss": 0.2548, "step": 6040 }, { "epoch": 5.085016835016835, "grad_norm": 0.38989225029945374, "learning_rate": 2.111903466260371e-06, "loss": 0.2456, "step": 6041 }, { "epoch": 5.085858585858586, "grad_norm": 0.4031529724597931, "learning_rate": 2.110190397811609e-06, "loss": 0.2529, "step": 6042 }, { "epoch": 5.0867003367003365, "grad_norm": 0.4071689546108246, "learning_rate": 2.1084778385781056e-06, "loss": 0.2724, "step": 6043 }, { "epoch": 5.087542087542087, "grad_norm": 0.404655784368515, "learning_rate": 2.1067657888616374e-06, "loss": 0.296, "step": 6044 }, { "epoch": 5.088383838383838, "grad_norm": 0.4243928790092468, "learning_rate": 2.1050542489638833e-06, "loss": 0.266, "step": 6045 }, { "epoch": 5.089225589225589, "grad_norm": 0.39093753695487976, "learning_rate": 2.1033432191864324e-06, "loss": 0.283, "step": 6046 }, { "epoch": 5.09006734006734, "grad_norm": 0.4160464107990265, "learning_rate": 2.1016326998307907e-06, "loss": 0.2516, "step": 6047 }, { "epoch": 5.090909090909091, "grad_norm": 0.4021521806716919, "learning_rate": 2.099922691198366e-06, "loss": 0.2674, "step": 6048 }, { "epoch": 5.091750841750842, "grad_norm": 0.38166093826293945, "learning_rate": 2.0982131935904844e-06, "loss": 0.2674, "step": 6049 }, { "epoch": 5.092592592592593, "grad_norm": 0.4381592869758606, "learning_rate": 2.0965042073083724e-06, "loss": 0.2822, "step": 6050 }, { "epoch": 5.093434343434343, "grad_norm": 0.3728099465370178, "learning_rate": 2.0947957326531776e-06, "loss": 0.2941, "step": 6051 }, { "epoch": 5.094276094276094, "grad_norm": 0.4240298569202423, "learning_rate": 2.093087769925947e-06, "loss": 0.278, "step": 6052 }, { "epoch": 5.095117845117845, "grad_norm": 0.3850318193435669, "learning_rate": 2.0913803194276444e-06, "loss": 0.2886, "step": 6053 }, { "epoch": 5.095959595959596, "grad_norm": 0.3978691101074219, "learning_rate": 2.089673381459135e-06, "loss": 0.2774, "step": 6054 }, { "epoch": 5.096801346801347, "grad_norm": 0.4034176170825958, "learning_rate": 2.0879669563212086e-06, "loss": 0.2864, "step": 6055 }, { "epoch": 5.097643097643098, "grad_norm": 0.42322394251823425, "learning_rate": 2.08626104431455e-06, "loss": 0.2391, "step": 6056 }, { "epoch": 5.098484848484849, "grad_norm": 0.41590961813926697, "learning_rate": 2.0845556457397603e-06, "loss": 0.2503, "step": 6057 }, { "epoch": 5.0993265993265995, "grad_norm": 0.4057426452636719, "learning_rate": 2.082850760897347e-06, "loss": 0.2379, "step": 6058 }, { "epoch": 5.10016835016835, "grad_norm": 0.4024038016796112, "learning_rate": 2.0811463900877304e-06, "loss": 0.2454, "step": 6059 }, { "epoch": 5.101010101010101, "grad_norm": 0.38583502173423767, "learning_rate": 2.0794425336112405e-06, "loss": 0.2828, "step": 6060 }, { "epoch": 5.101851851851852, "grad_norm": 0.40383821725845337, "learning_rate": 2.077739191768114e-06, "loss": 0.265, "step": 6061 }, { "epoch": 5.102693602693603, "grad_norm": 0.4502067267894745, "learning_rate": 2.0760363648584953e-06, "loss": 0.2678, "step": 6062 }, { "epoch": 5.103535353535354, "grad_norm": 0.40853220224380493, "learning_rate": 2.0743340531824445e-06, "loss": 0.2463, "step": 6063 }, { "epoch": 5.104377104377105, "grad_norm": 0.37399405241012573, "learning_rate": 2.072632257039924e-06, "loss": 0.2735, "step": 6064 }, { "epoch": 5.1052188552188555, "grad_norm": 0.40939345955848694, "learning_rate": 2.0709309767308104e-06, "loss": 0.2667, "step": 6065 }, { "epoch": 5.106060606060606, "grad_norm": 0.4319714307785034, "learning_rate": 2.0692302125548874e-06, "loss": 0.2546, "step": 6066 }, { "epoch": 5.106902356902357, "grad_norm": 0.391215056180954, "learning_rate": 2.067529964811846e-06, "loss": 0.243, "step": 6067 }, { "epoch": 5.107744107744108, "grad_norm": 0.38910242915153503, "learning_rate": 2.06583023380129e-06, "loss": 0.2636, "step": 6068 }, { "epoch": 5.108585858585859, "grad_norm": 0.44947561621665955, "learning_rate": 2.0641310198227293e-06, "loss": 0.2675, "step": 6069 }, { "epoch": 5.109427609427609, "grad_norm": 0.40170979499816895, "learning_rate": 2.0624323231755814e-06, "loss": 0.2772, "step": 6070 }, { "epoch": 5.11026936026936, "grad_norm": 0.4083676338195801, "learning_rate": 2.0607341441591776e-06, "loss": 0.2639, "step": 6071 }, { "epoch": 5.111111111111111, "grad_norm": 0.4258100390434265, "learning_rate": 2.0590364830727555e-06, "loss": 0.2554, "step": 6072 }, { "epoch": 5.111952861952862, "grad_norm": 0.4143998324871063, "learning_rate": 2.05733934021546e-06, "loss": 0.2882, "step": 6073 }, { "epoch": 5.1127946127946124, "grad_norm": 0.4323666989803314, "learning_rate": 2.055642715886346e-06, "loss": 0.2631, "step": 6074 }, { "epoch": 5.113636363636363, "grad_norm": 0.41801387071609497, "learning_rate": 2.053946610384373e-06, "loss": 0.2544, "step": 6075 }, { "epoch": 5.114478114478114, "grad_norm": 0.37560346722602844, "learning_rate": 2.05225102400842e-06, "loss": 0.2805, "step": 6076 }, { "epoch": 5.115319865319865, "grad_norm": 0.3875944912433624, "learning_rate": 2.050555957057263e-06, "loss": 0.2614, "step": 6077 }, { "epoch": 5.116161616161616, "grad_norm": 0.41093870997428894, "learning_rate": 2.048861409829592e-06, "loss": 0.2556, "step": 6078 }, { "epoch": 5.117003367003367, "grad_norm": 0.42641153931617737, "learning_rate": 2.0471673826240025e-06, "loss": 0.2839, "step": 6079 }, { "epoch": 5.117845117845118, "grad_norm": 0.40279051661491394, "learning_rate": 2.0454738757390015e-06, "loss": 0.2967, "step": 6080 }, { "epoch": 5.1186868686868685, "grad_norm": 0.41018733382225037, "learning_rate": 2.043780889473005e-06, "loss": 0.2837, "step": 6081 }, { "epoch": 5.119528619528619, "grad_norm": 0.39903056621551514, "learning_rate": 2.042088424124333e-06, "loss": 0.2712, "step": 6082 }, { "epoch": 5.12037037037037, "grad_norm": 0.38253116607666016, "learning_rate": 2.040396479991214e-06, "loss": 0.2595, "step": 6083 }, { "epoch": 5.121212121212121, "grad_norm": 0.41305720806121826, "learning_rate": 2.038705057371791e-06, "loss": 0.2746, "step": 6084 }, { "epoch": 5.122053872053872, "grad_norm": 0.4208167791366577, "learning_rate": 2.037014156564108e-06, "loss": 0.2787, "step": 6085 }, { "epoch": 5.122895622895623, "grad_norm": 0.3906072676181793, "learning_rate": 2.0353237778661195e-06, "loss": 0.2613, "step": 6086 }, { "epoch": 5.123737373737374, "grad_norm": 0.4117913842201233, "learning_rate": 2.03363392157569e-06, "loss": 0.2693, "step": 6087 }, { "epoch": 5.124579124579125, "grad_norm": 0.4238797426223755, "learning_rate": 2.0319445879905873e-06, "loss": 0.2723, "step": 6088 }, { "epoch": 5.125420875420875, "grad_norm": 0.3962254226207733, "learning_rate": 2.0302557774084946e-06, "loss": 0.2443, "step": 6089 }, { "epoch": 5.126262626262626, "grad_norm": 0.41281858086586, "learning_rate": 2.0285674901269948e-06, "loss": 0.2824, "step": 6090 }, { "epoch": 5.127104377104377, "grad_norm": 0.3759649395942688, "learning_rate": 2.0268797264435814e-06, "loss": 0.2588, "step": 6091 }, { "epoch": 5.127946127946128, "grad_norm": 0.407733678817749, "learning_rate": 2.025192486655658e-06, "loss": 0.286, "step": 6092 }, { "epoch": 5.128787878787879, "grad_norm": 0.435346782207489, "learning_rate": 2.0235057710605354e-06, "loss": 0.2806, "step": 6093 }, { "epoch": 5.12962962962963, "grad_norm": 0.42437583208084106, "learning_rate": 2.02181957995543e-06, "loss": 0.269, "step": 6094 }, { "epoch": 5.130471380471381, "grad_norm": 0.429605633020401, "learning_rate": 2.0201339136374664e-06, "loss": 0.2387, "step": 6095 }, { "epoch": 5.1313131313131315, "grad_norm": 0.38885197043418884, "learning_rate": 2.018448772403675e-06, "loss": 0.2832, "step": 6096 }, { "epoch": 5.132154882154882, "grad_norm": 0.44333457946777344, "learning_rate": 2.016764156550997e-06, "loss": 0.2482, "step": 6097 }, { "epoch": 5.132996632996633, "grad_norm": 0.43105724453926086, "learning_rate": 2.0150800663762824e-06, "loss": 0.2899, "step": 6098 }, { "epoch": 5.133838383838384, "grad_norm": 0.4219614863395691, "learning_rate": 2.0133965021762835e-06, "loss": 0.3033, "step": 6099 }, { "epoch": 5.134680134680135, "grad_norm": 0.40957537293434143, "learning_rate": 2.011713464247661e-06, "loss": 0.2758, "step": 6100 }, { "epoch": 5.135521885521886, "grad_norm": 0.4232679009437561, "learning_rate": 2.0100309528869865e-06, "loss": 0.2598, "step": 6101 }, { "epoch": 5.136363636363637, "grad_norm": 0.4296596050262451, "learning_rate": 2.0083489683907343e-06, "loss": 0.2774, "step": 6102 }, { "epoch": 5.1372053872053876, "grad_norm": 0.38574451208114624, "learning_rate": 2.0066675110552903e-06, "loss": 0.2766, "step": 6103 }, { "epoch": 5.138047138047138, "grad_norm": 0.40975892543792725, "learning_rate": 2.004986581176944e-06, "loss": 0.2752, "step": 6104 }, { "epoch": 5.138888888888889, "grad_norm": 0.41701406240463257, "learning_rate": 2.0033061790518922e-06, "loss": 0.25, "step": 6105 }, { "epoch": 5.13973063973064, "grad_norm": 0.37990501523017883, "learning_rate": 2.001626304976242e-06, "loss": 0.2528, "step": 6106 }, { "epoch": 5.140572390572391, "grad_norm": 0.4308101236820221, "learning_rate": 1.999946959246002e-06, "loss": 0.2706, "step": 6107 }, { "epoch": 5.141414141414141, "grad_norm": 0.42399632930755615, "learning_rate": 1.9982681421570944e-06, "loss": 0.2769, "step": 6108 }, { "epoch": 5.142255892255892, "grad_norm": 0.3754323124885559, "learning_rate": 1.996589854005341e-06, "loss": 0.2703, "step": 6109 }, { "epoch": 5.143097643097643, "grad_norm": 0.45855361223220825, "learning_rate": 1.9949120950864775e-06, "loss": 0.2329, "step": 6110 }, { "epoch": 5.143939393939394, "grad_norm": 0.4367814064025879, "learning_rate": 1.993234865696142e-06, "loss": 0.2824, "step": 6111 }, { "epoch": 5.1447811447811445, "grad_norm": 0.3829388916492462, "learning_rate": 1.991558166129877e-06, "loss": 0.2527, "step": 6112 }, { "epoch": 5.145622895622895, "grad_norm": 0.3745931386947632, "learning_rate": 1.9898819966831377e-06, "loss": 0.2524, "step": 6113 }, { "epoch": 5.146464646464646, "grad_norm": 0.3968631327152252, "learning_rate": 1.988206357651285e-06, "loss": 0.2527, "step": 6114 }, { "epoch": 5.147306397306397, "grad_norm": 0.41343945264816284, "learning_rate": 1.986531249329582e-06, "loss": 0.2814, "step": 6115 }, { "epoch": 5.148148148148148, "grad_norm": 0.38404589891433716, "learning_rate": 1.9848566720132e-06, "loss": 0.2807, "step": 6116 }, { "epoch": 5.148989898989899, "grad_norm": 0.4662491977214813, "learning_rate": 1.983182625997217e-06, "loss": 0.2807, "step": 6117 }, { "epoch": 5.14983164983165, "grad_norm": 0.40720778703689575, "learning_rate": 1.9815091115766182e-06, "loss": 0.2803, "step": 6118 }, { "epoch": 5.1506734006734005, "grad_norm": 0.3680381178855896, "learning_rate": 1.979836129046298e-06, "loss": 0.2618, "step": 6119 }, { "epoch": 5.151515151515151, "grad_norm": 0.4071272313594818, "learning_rate": 1.9781636787010503e-06, "loss": 0.2355, "step": 6120 }, { "epoch": 5.152356902356902, "grad_norm": 0.4447405934333801, "learning_rate": 1.9764917608355783e-06, "loss": 0.2523, "step": 6121 }, { "epoch": 5.153198653198653, "grad_norm": 0.4132116138935089, "learning_rate": 1.9748203757444946e-06, "loss": 0.249, "step": 6122 }, { "epoch": 5.154040404040404, "grad_norm": 0.4155798852443695, "learning_rate": 1.9731495237223116e-06, "loss": 0.2458, "step": 6123 }, { "epoch": 5.154882154882155, "grad_norm": 0.4285241961479187, "learning_rate": 1.9714792050634546e-06, "loss": 0.2642, "step": 6124 }, { "epoch": 5.155723905723906, "grad_norm": 0.4250887334346771, "learning_rate": 1.9698094200622498e-06, "loss": 0.2605, "step": 6125 }, { "epoch": 5.156565656565657, "grad_norm": 0.4138072729110718, "learning_rate": 1.9681401690129306e-06, "loss": 0.2424, "step": 6126 }, { "epoch": 5.157407407407407, "grad_norm": 0.4328722655773163, "learning_rate": 1.966471452209638e-06, "loss": 0.2473, "step": 6127 }, { "epoch": 5.158249158249158, "grad_norm": 0.4009510576725006, "learning_rate": 1.9648032699464163e-06, "loss": 0.265, "step": 6128 }, { "epoch": 5.159090909090909, "grad_norm": 0.3909916579723358, "learning_rate": 1.9631356225172192e-06, "loss": 0.2816, "step": 6129 }, { "epoch": 5.15993265993266, "grad_norm": 0.4154093861579895, "learning_rate": 1.9614685102159016e-06, "loss": 0.2835, "step": 6130 }, { "epoch": 5.160774410774411, "grad_norm": 0.4006600081920624, "learning_rate": 1.959801933336229e-06, "loss": 0.2658, "step": 6131 }, { "epoch": 5.161616161616162, "grad_norm": 0.39302927255630493, "learning_rate": 1.9581358921718686e-06, "loss": 0.2889, "step": 6132 }, { "epoch": 5.162457912457913, "grad_norm": 0.39411461353302, "learning_rate": 1.9564703870163944e-06, "loss": 0.2702, "step": 6133 }, { "epoch": 5.1632996632996635, "grad_norm": 0.39636823534965515, "learning_rate": 1.9548054181632845e-06, "loss": 0.2662, "step": 6134 }, { "epoch": 5.164141414141414, "grad_norm": 0.4118683934211731, "learning_rate": 1.953140985905926e-06, "loss": 0.2699, "step": 6135 }, { "epoch": 5.164983164983165, "grad_norm": 0.3837244212627411, "learning_rate": 1.9514770905376122e-06, "loss": 0.2431, "step": 6136 }, { "epoch": 5.165824915824916, "grad_norm": 0.3962317705154419, "learning_rate": 1.949813732351536e-06, "loss": 0.2848, "step": 6137 }, { "epoch": 5.166666666666667, "grad_norm": 0.39385727047920227, "learning_rate": 1.9481509116407975e-06, "loss": 0.261, "step": 6138 }, { "epoch": 5.167508417508418, "grad_norm": 0.37867242097854614, "learning_rate": 1.9464886286984064e-06, "loss": 0.2562, "step": 6139 }, { "epoch": 5.168350168350169, "grad_norm": 0.4229494631290436, "learning_rate": 1.9448268838172748e-06, "loss": 0.2883, "step": 6140 }, { "epoch": 5.16919191919192, "grad_norm": 0.40960970520973206, "learning_rate": 1.9431656772902187e-06, "loss": 0.2552, "step": 6141 }, { "epoch": 5.17003367003367, "grad_norm": 0.4129706621170044, "learning_rate": 1.9415050094099604e-06, "loss": 0.2699, "step": 6142 }, { "epoch": 5.170875420875421, "grad_norm": 0.38389649987220764, "learning_rate": 1.9398448804691256e-06, "loss": 0.2592, "step": 6143 }, { "epoch": 5.171717171717171, "grad_norm": 0.4373009502887726, "learning_rate": 1.938185290760249e-06, "loss": 0.2633, "step": 6144 }, { "epoch": 5.172558922558922, "grad_norm": 0.4111494719982147, "learning_rate": 1.9365262405757684e-06, "loss": 0.2549, "step": 6145 }, { "epoch": 5.173400673400673, "grad_norm": 0.43868327140808105, "learning_rate": 1.9348677302080253e-06, "loss": 0.2446, "step": 6146 }, { "epoch": 5.174242424242424, "grad_norm": 0.4029381275177002, "learning_rate": 1.9332097599492655e-06, "loss": 0.2494, "step": 6147 }, { "epoch": 5.175084175084175, "grad_norm": 0.40897804498672485, "learning_rate": 1.931552330091644e-06, "loss": 0.2716, "step": 6148 }, { "epoch": 5.175925925925926, "grad_norm": 0.3973604738712311, "learning_rate": 1.929895440927216e-06, "loss": 0.2411, "step": 6149 }, { "epoch": 5.1767676767676765, "grad_norm": 0.37623634934425354, "learning_rate": 1.928239092747941e-06, "loss": 0.2839, "step": 6150 }, { "epoch": 5.177609427609427, "grad_norm": 0.39903688430786133, "learning_rate": 1.926583285845687e-06, "loss": 0.291, "step": 6151 }, { "epoch": 5.178451178451178, "grad_norm": 0.37303176522254944, "learning_rate": 1.924928020512227e-06, "loss": 0.2573, "step": 6152 }, { "epoch": 5.179292929292929, "grad_norm": 0.43052855134010315, "learning_rate": 1.923273297039235e-06, "loss": 0.2841, "step": 6153 }, { "epoch": 5.18013468013468, "grad_norm": 0.4149332046508789, "learning_rate": 1.9216191157182902e-06, "loss": 0.267, "step": 6154 }, { "epoch": 5.180976430976431, "grad_norm": 0.39798805117607117, "learning_rate": 1.919965476840875e-06, "loss": 0.2684, "step": 6155 }, { "epoch": 5.181818181818182, "grad_norm": 0.40595129132270813, "learning_rate": 1.91831238069838e-06, "loss": 0.2863, "step": 6156 }, { "epoch": 5.1826599326599325, "grad_norm": 0.4026780128479004, "learning_rate": 1.916659827582101e-06, "loss": 0.2579, "step": 6157 }, { "epoch": 5.183501683501683, "grad_norm": 0.43887439370155334, "learning_rate": 1.9150078177832333e-06, "loss": 0.2652, "step": 6158 }, { "epoch": 5.184343434343434, "grad_norm": 0.4048791527748108, "learning_rate": 1.913356351592876e-06, "loss": 0.2739, "step": 6159 }, { "epoch": 5.185185185185185, "grad_norm": 0.39385396242141724, "learning_rate": 1.9117054293020383e-06, "loss": 0.2709, "step": 6160 }, { "epoch": 5.186026936026936, "grad_norm": 0.38887715339660645, "learning_rate": 1.910055051201631e-06, "loss": 0.2796, "step": 6161 }, { "epoch": 5.186868686868687, "grad_norm": 0.40929773449897766, "learning_rate": 1.908405217582467e-06, "loss": 0.253, "step": 6162 }, { "epoch": 5.187710437710438, "grad_norm": 0.42886781692504883, "learning_rate": 1.9067559287352639e-06, "loss": 0.2454, "step": 6163 }, { "epoch": 5.188552188552189, "grad_norm": 0.39592456817626953, "learning_rate": 1.9051071849506437e-06, "loss": 0.2777, "step": 6164 }, { "epoch": 5.1893939393939394, "grad_norm": 0.4064641296863556, "learning_rate": 1.9034589865191344e-06, "loss": 0.2513, "step": 6165 }, { "epoch": 5.19023569023569, "grad_norm": 0.4247896075248718, "learning_rate": 1.9018113337311638e-06, "loss": 0.2649, "step": 6166 }, { "epoch": 5.191077441077441, "grad_norm": 0.40681979060173035, "learning_rate": 1.9001642268770692e-06, "loss": 0.2391, "step": 6167 }, { "epoch": 5.191919191919192, "grad_norm": 0.39386188983917236, "learning_rate": 1.8985176662470844e-06, "loss": 0.2699, "step": 6168 }, { "epoch": 5.192760942760943, "grad_norm": 0.4211398661136627, "learning_rate": 1.8968716521313552e-06, "loss": 0.2375, "step": 6169 }, { "epoch": 5.193602693602694, "grad_norm": 0.3941161036491394, "learning_rate": 1.8952261848199243e-06, "loss": 0.2744, "step": 6170 }, { "epoch": 5.194444444444445, "grad_norm": 0.39844852685928345, "learning_rate": 1.8935812646027395e-06, "loss": 0.2797, "step": 6171 }, { "epoch": 5.1952861952861955, "grad_norm": 0.4209328591823578, "learning_rate": 1.8919368917696562e-06, "loss": 0.2572, "step": 6172 }, { "epoch": 5.196127946127946, "grad_norm": 0.4004970192909241, "learning_rate": 1.8902930666104274e-06, "loss": 0.2856, "step": 6173 }, { "epoch": 5.196969696969697, "grad_norm": 0.3977859318256378, "learning_rate": 1.8886497894147155e-06, "loss": 0.2521, "step": 6174 }, { "epoch": 5.197811447811448, "grad_norm": 0.39834704995155334, "learning_rate": 1.8870070604720825e-06, "loss": 0.2804, "step": 6175 }, { "epoch": 5.198653198653199, "grad_norm": 0.3779483437538147, "learning_rate": 1.8853648800719926e-06, "loss": 0.2659, "step": 6176 }, { "epoch": 5.19949494949495, "grad_norm": 0.3979488015174866, "learning_rate": 1.883723248503817e-06, "loss": 0.2587, "step": 6177 }, { "epoch": 5.200336700336701, "grad_norm": 0.39440181851387024, "learning_rate": 1.882082166056831e-06, "loss": 0.2808, "step": 6178 }, { "epoch": 5.201178451178452, "grad_norm": 0.4050585925579071, "learning_rate": 1.8804416330202085e-06, "loss": 0.2456, "step": 6179 }, { "epoch": 5.202020202020202, "grad_norm": 0.44095444679260254, "learning_rate": 1.8788016496830296e-06, "loss": 0.2321, "step": 6180 }, { "epoch": 5.202861952861953, "grad_norm": 0.4212122857570648, "learning_rate": 1.8771622163342745e-06, "loss": 0.2524, "step": 6181 }, { "epoch": 5.203703703703703, "grad_norm": 0.39908367395401, "learning_rate": 1.8755233332628309e-06, "loss": 0.2494, "step": 6182 }, { "epoch": 5.204545454545454, "grad_norm": 0.39821597933769226, "learning_rate": 1.8738850007574888e-06, "loss": 0.273, "step": 6183 }, { "epoch": 5.205387205387205, "grad_norm": 0.38355764746665955, "learning_rate": 1.8722472191069385e-06, "loss": 0.246, "step": 6184 }, { "epoch": 5.206228956228956, "grad_norm": 0.41385209560394287, "learning_rate": 1.8706099885997725e-06, "loss": 0.2575, "step": 6185 }, { "epoch": 5.207070707070707, "grad_norm": 0.41061532497406006, "learning_rate": 1.8689733095244928e-06, "loss": 0.2656, "step": 6186 }, { "epoch": 5.207912457912458, "grad_norm": 0.45647868514060974, "learning_rate": 1.8673371821694946e-06, "loss": 0.2888, "step": 6187 }, { "epoch": 5.2087542087542085, "grad_norm": 0.45991143584251404, "learning_rate": 1.8657016068230855e-06, "loss": 0.282, "step": 6188 }, { "epoch": 5.209595959595959, "grad_norm": 0.41993436217308044, "learning_rate": 1.8640665837734678e-06, "loss": 0.2822, "step": 6189 }, { "epoch": 5.21043771043771, "grad_norm": 0.42579588294029236, "learning_rate": 1.8624321133087531e-06, "loss": 0.2722, "step": 6190 }, { "epoch": 5.211279461279461, "grad_norm": 0.44557616114616394, "learning_rate": 1.8607981957169507e-06, "loss": 0.2681, "step": 6191 }, { "epoch": 5.212121212121212, "grad_norm": 0.4274488091468811, "learning_rate": 1.8591648312859733e-06, "loss": 0.2696, "step": 6192 }, { "epoch": 5.212962962962963, "grad_norm": 0.4235064387321472, "learning_rate": 1.85753202030364e-06, "loss": 0.2735, "step": 6193 }, { "epoch": 5.213804713804714, "grad_norm": 0.4315325915813446, "learning_rate": 1.8558997630576664e-06, "loss": 0.272, "step": 6194 }, { "epoch": 5.2146464646464645, "grad_norm": 0.41186147928237915, "learning_rate": 1.8542680598356766e-06, "loss": 0.3093, "step": 6195 }, { "epoch": 5.215488215488215, "grad_norm": 0.4269794523715973, "learning_rate": 1.8526369109251924e-06, "loss": 0.2724, "step": 6196 }, { "epoch": 5.216329966329966, "grad_norm": 0.4109227657318115, "learning_rate": 1.8510063166136383e-06, "loss": 0.2863, "step": 6197 }, { "epoch": 5.217171717171717, "grad_norm": 0.3915387988090515, "learning_rate": 1.8493762771883444e-06, "loss": 0.2757, "step": 6198 }, { "epoch": 5.218013468013468, "grad_norm": 0.41621777415275574, "learning_rate": 1.8477467929365417e-06, "loss": 0.2699, "step": 6199 }, { "epoch": 5.218855218855219, "grad_norm": 0.4113415479660034, "learning_rate": 1.846117864145362e-06, "loss": 0.2382, "step": 6200 }, { "epoch": 5.21969696969697, "grad_norm": 0.3916022479534149, "learning_rate": 1.844489491101839e-06, "loss": 0.2744, "step": 6201 }, { "epoch": 5.220538720538721, "grad_norm": 0.41200289130210876, "learning_rate": 1.842861674092909e-06, "loss": 0.2869, "step": 6202 }, { "epoch": 5.2213804713804715, "grad_norm": 0.4152713119983673, "learning_rate": 1.8412344134054112e-06, "loss": 0.3057, "step": 6203 }, { "epoch": 5.222222222222222, "grad_norm": 0.38620680570602417, "learning_rate": 1.8396077093260883e-06, "loss": 0.2951, "step": 6204 }, { "epoch": 5.223063973063973, "grad_norm": 0.441158264875412, "learning_rate": 1.8379815621415808e-06, "loss": 0.2601, "step": 6205 }, { "epoch": 5.223905723905724, "grad_norm": 0.3840394616127014, "learning_rate": 1.8363559721384328e-06, "loss": 0.2741, "step": 6206 }, { "epoch": 5.224747474747475, "grad_norm": 0.37804293632507324, "learning_rate": 1.8347309396030926e-06, "loss": 0.3039, "step": 6207 }, { "epoch": 5.225589225589226, "grad_norm": 0.37297195196151733, "learning_rate": 1.833106464821906e-06, "loss": 0.2419, "step": 6208 }, { "epoch": 5.226430976430977, "grad_norm": 0.4442484378814697, "learning_rate": 1.831482548081125e-06, "loss": 0.2618, "step": 6209 }, { "epoch": 5.2272727272727275, "grad_norm": 0.39140188694000244, "learning_rate": 1.8298591896669005e-06, "loss": 0.2894, "step": 6210 }, { "epoch": 5.228114478114478, "grad_norm": 0.41603055596351624, "learning_rate": 1.8282363898652828e-06, "loss": 0.2919, "step": 6211 }, { "epoch": 5.228956228956229, "grad_norm": 0.38869160413742065, "learning_rate": 1.8266141489622308e-06, "loss": 0.2608, "step": 6212 }, { "epoch": 5.22979797979798, "grad_norm": 0.4040675461292267, "learning_rate": 1.8249924672435982e-06, "loss": 0.2994, "step": 6213 }, { "epoch": 5.230639730639731, "grad_norm": 0.3993898034095764, "learning_rate": 1.8233713449951413e-06, "loss": 0.265, "step": 6214 }, { "epoch": 5.231481481481482, "grad_norm": 0.44737741351127625, "learning_rate": 1.821750782502521e-06, "loss": 0.2561, "step": 6215 }, { "epoch": 5.232323232323233, "grad_norm": 0.3762952983379364, "learning_rate": 1.820130780051299e-06, "loss": 0.2748, "step": 6216 }, { "epoch": 5.233164983164984, "grad_norm": 0.4200701117515564, "learning_rate": 1.8185113379269354e-06, "loss": 0.257, "step": 6217 }, { "epoch": 5.2340067340067336, "grad_norm": 0.3908381760120392, "learning_rate": 1.8168924564147934e-06, "loss": 0.2793, "step": 6218 }, { "epoch": 5.234848484848484, "grad_norm": 0.41064947843551636, "learning_rate": 1.8152741358001335e-06, "loss": 0.2386, "step": 6219 }, { "epoch": 5.235690235690235, "grad_norm": 0.4387349784374237, "learning_rate": 1.8136563763681282e-06, "loss": 0.2789, "step": 6220 }, { "epoch": 5.236531986531986, "grad_norm": 0.4156099259853363, "learning_rate": 1.8120391784038405e-06, "loss": 0.2414, "step": 6221 }, { "epoch": 5.237373737373737, "grad_norm": 0.4112551510334015, "learning_rate": 1.8104225421922372e-06, "loss": 0.272, "step": 6222 }, { "epoch": 5.238215488215488, "grad_norm": 0.40240880846977234, "learning_rate": 1.8088064680181861e-06, "loss": 0.2645, "step": 6223 }, { "epoch": 5.239057239057239, "grad_norm": 0.41504594683647156, "learning_rate": 1.807190956166458e-06, "loss": 0.2812, "step": 6224 }, { "epoch": 5.23989898989899, "grad_norm": 0.39923450350761414, "learning_rate": 1.8055760069217249e-06, "loss": 0.2528, "step": 6225 }, { "epoch": 5.2407407407407405, "grad_norm": 0.44445472955703735, "learning_rate": 1.8039616205685562e-06, "loss": 0.3016, "step": 6226 }, { "epoch": 5.241582491582491, "grad_norm": 0.4092691242694855, "learning_rate": 1.8023477973914228e-06, "loss": 0.2381, "step": 6227 }, { "epoch": 5.242424242424242, "grad_norm": 0.3965267241001129, "learning_rate": 1.800734537674701e-06, "loss": 0.2982, "step": 6228 }, { "epoch": 5.243265993265993, "grad_norm": 0.3962632417678833, "learning_rate": 1.7991218417026619e-06, "loss": 0.2756, "step": 6229 }, { "epoch": 5.244107744107744, "grad_norm": 0.3984866738319397, "learning_rate": 1.797509709759479e-06, "loss": 0.2705, "step": 6230 }, { "epoch": 5.244949494949495, "grad_norm": 0.3946498930454254, "learning_rate": 1.7958981421292298e-06, "loss": 0.2674, "step": 6231 }, { "epoch": 5.245791245791246, "grad_norm": 0.40333229303359985, "learning_rate": 1.7942871390958866e-06, "loss": 0.2968, "step": 6232 }, { "epoch": 5.2466329966329965, "grad_norm": 0.4137178957462311, "learning_rate": 1.7926767009433283e-06, "loss": 0.264, "step": 6233 }, { "epoch": 5.247474747474747, "grad_norm": 0.417004257440567, "learning_rate": 1.79106682795533e-06, "loss": 0.2612, "step": 6234 }, { "epoch": 5.248316498316498, "grad_norm": 0.3912789523601532, "learning_rate": 1.7894575204155673e-06, "loss": 0.2643, "step": 6235 }, { "epoch": 5.249158249158249, "grad_norm": 0.39015740156173706, "learning_rate": 1.7878487786076182e-06, "loss": 0.258, "step": 6236 }, { "epoch": 5.25, "grad_norm": 0.4227549135684967, "learning_rate": 1.7862406028149616e-06, "loss": 0.2767, "step": 6237 }, { "epoch": 5.250841750841751, "grad_norm": 0.40251651406288147, "learning_rate": 1.7846329933209744e-06, "loss": 0.2669, "step": 6238 }, { "epoch": 5.251683501683502, "grad_norm": 0.3937962055206299, "learning_rate": 1.7830259504089342e-06, "loss": 0.2773, "step": 6239 }, { "epoch": 5.252525252525253, "grad_norm": 0.42498520016670227, "learning_rate": 1.7814194743620171e-06, "loss": 0.2889, "step": 6240 }, { "epoch": 5.2533670033670035, "grad_norm": 0.4476029872894287, "learning_rate": 1.7798135654633036e-06, "loss": 0.2796, "step": 6241 }, { "epoch": 5.254208754208754, "grad_norm": 0.3731241524219513, "learning_rate": 1.7782082239957733e-06, "loss": 0.2881, "step": 6242 }, { "epoch": 5.255050505050505, "grad_norm": 0.4352182149887085, "learning_rate": 1.776603450242303e-06, "loss": 0.2424, "step": 6243 }, { "epoch": 5.255892255892256, "grad_norm": 0.45182672142982483, "learning_rate": 1.7749992444856683e-06, "loss": 0.2736, "step": 6244 }, { "epoch": 5.256734006734007, "grad_norm": 0.4173124134540558, "learning_rate": 1.7733956070085524e-06, "loss": 0.2828, "step": 6245 }, { "epoch": 5.257575757575758, "grad_norm": 0.44349315762519836, "learning_rate": 1.7717925380935285e-06, "loss": 0.2531, "step": 6246 }, { "epoch": 5.258417508417509, "grad_norm": 0.42676419019699097, "learning_rate": 1.7701900380230786e-06, "loss": 0.2438, "step": 6247 }, { "epoch": 5.2592592592592595, "grad_norm": 0.41270822286605835, "learning_rate": 1.768588107079578e-06, "loss": 0.2663, "step": 6248 }, { "epoch": 5.26010101010101, "grad_norm": 0.398573637008667, "learning_rate": 1.7669867455453027e-06, "loss": 0.271, "step": 6249 }, { "epoch": 5.260942760942761, "grad_norm": 0.42154332995414734, "learning_rate": 1.7653859537024337e-06, "loss": 0.2543, "step": 6250 }, { "epoch": 5.261784511784512, "grad_norm": 0.41897377371788025, "learning_rate": 1.763785731833043e-06, "loss": 0.2618, "step": 6251 }, { "epoch": 5.262626262626263, "grad_norm": 0.40908896923065186, "learning_rate": 1.762186080219111e-06, "loss": 0.2646, "step": 6252 }, { "epoch": 5.263468013468014, "grad_norm": 0.3809334933757782, "learning_rate": 1.7605869991425096e-06, "loss": 0.2734, "step": 6253 }, { "epoch": 5.264309764309765, "grad_norm": 0.38693767786026, "learning_rate": 1.7589884888850166e-06, "loss": 0.2703, "step": 6254 }, { "epoch": 5.265151515151516, "grad_norm": 0.3890720307826996, "learning_rate": 1.7573905497283067e-06, "loss": 0.2468, "step": 6255 }, { "epoch": 5.2659932659932664, "grad_norm": 0.38732126355171204, "learning_rate": 1.7557931819539509e-06, "loss": 0.2417, "step": 6256 }, { "epoch": 5.266835016835016, "grad_norm": 0.3957911729812622, "learning_rate": 1.7541963858434242e-06, "loss": 0.2785, "step": 6257 }, { "epoch": 5.267676767676767, "grad_norm": 0.3687361180782318, "learning_rate": 1.7526001616781003e-06, "loss": 0.293, "step": 6258 }, { "epoch": 5.268518518518518, "grad_norm": 0.3748111128807068, "learning_rate": 1.7510045097392503e-06, "loss": 0.2716, "step": 6259 }, { "epoch": 5.269360269360269, "grad_norm": 0.38538017868995667, "learning_rate": 1.749409430308045e-06, "loss": 0.2561, "step": 6260 }, { "epoch": 5.27020202020202, "grad_norm": 0.3810572028160095, "learning_rate": 1.7478149236655517e-06, "loss": 0.2515, "step": 6261 }, { "epoch": 5.271043771043771, "grad_norm": 0.42593124508857727, "learning_rate": 1.7462209900927419e-06, "loss": 0.2646, "step": 6262 }, { "epoch": 5.271885521885522, "grad_norm": 0.4092859923839569, "learning_rate": 1.7446276298704851e-06, "loss": 0.2848, "step": 6263 }, { "epoch": 5.2727272727272725, "grad_norm": 0.3854082524776459, "learning_rate": 1.7430348432795475e-06, "loss": 0.2605, "step": 6264 }, { "epoch": 5.273569023569023, "grad_norm": 0.4282331168651581, "learning_rate": 1.7414426306005921e-06, "loss": 0.2866, "step": 6265 }, { "epoch": 5.274410774410774, "grad_norm": 0.41066065430641174, "learning_rate": 1.7398509921141876e-06, "loss": 0.2635, "step": 6266 }, { "epoch": 5.275252525252525, "grad_norm": 0.40551936626434326, "learning_rate": 1.738259928100795e-06, "loss": 0.2776, "step": 6267 }, { "epoch": 5.276094276094276, "grad_norm": 0.38476893305778503, "learning_rate": 1.736669438840779e-06, "loss": 0.2945, "step": 6268 }, { "epoch": 5.276936026936027, "grad_norm": 0.4176275134086609, "learning_rate": 1.7350795246143998e-06, "loss": 0.3037, "step": 6269 }, { "epoch": 5.277777777777778, "grad_norm": 0.40581610798835754, "learning_rate": 1.7334901857018156e-06, "loss": 0.2683, "step": 6270 }, { "epoch": 5.2786195286195285, "grad_norm": 0.3995945453643799, "learning_rate": 1.7319014223830878e-06, "loss": 0.2692, "step": 6271 }, { "epoch": 5.279461279461279, "grad_norm": 0.41398826241493225, "learning_rate": 1.7303132349381702e-06, "loss": 0.2862, "step": 6272 }, { "epoch": 5.28030303030303, "grad_norm": 0.37564560770988464, "learning_rate": 1.7287256236469218e-06, "loss": 0.275, "step": 6273 }, { "epoch": 5.281144781144781, "grad_norm": 0.40875113010406494, "learning_rate": 1.7271385887890935e-06, "loss": 0.2476, "step": 6274 }, { "epoch": 5.281986531986532, "grad_norm": 0.4107505977153778, "learning_rate": 1.7255521306443408e-06, "loss": 0.2947, "step": 6275 }, { "epoch": 5.282828282828283, "grad_norm": 0.39057859778404236, "learning_rate": 1.7239662494922127e-06, "loss": 0.2908, "step": 6276 }, { "epoch": 5.283670033670034, "grad_norm": 0.379854291677475, "learning_rate": 1.7223809456121588e-06, "loss": 0.2786, "step": 6277 }, { "epoch": 5.284511784511785, "grad_norm": 0.4056224822998047, "learning_rate": 1.7207962192835248e-06, "loss": 0.2628, "step": 6278 }, { "epoch": 5.2853535353535355, "grad_norm": 0.38389483094215393, "learning_rate": 1.719212070785558e-06, "loss": 0.2405, "step": 6279 }, { "epoch": 5.286195286195286, "grad_norm": 0.3767940402030945, "learning_rate": 1.7176285003974036e-06, "loss": 0.2636, "step": 6280 }, { "epoch": 5.287037037037037, "grad_norm": 0.48272189497947693, "learning_rate": 1.7160455083981026e-06, "loss": 0.2533, "step": 6281 }, { "epoch": 5.287878787878788, "grad_norm": 0.41035789251327515, "learning_rate": 1.7144630950665924e-06, "loss": 0.2843, "step": 6282 }, { "epoch": 5.288720538720539, "grad_norm": 0.4142405688762665, "learning_rate": 1.7128812606817136e-06, "loss": 0.2379, "step": 6283 }, { "epoch": 5.28956228956229, "grad_norm": 0.4727784991264343, "learning_rate": 1.7113000055222034e-06, "loss": 0.3016, "step": 6284 }, { "epoch": 5.290404040404041, "grad_norm": 0.38178351521492004, "learning_rate": 1.7097193298666947e-06, "loss": 0.2835, "step": 6285 }, { "epoch": 5.2912457912457915, "grad_norm": 0.39417341351509094, "learning_rate": 1.7081392339937197e-06, "loss": 0.2954, "step": 6286 }, { "epoch": 5.292087542087542, "grad_norm": 0.4515724778175354, "learning_rate": 1.7065597181817052e-06, "loss": 0.2683, "step": 6287 }, { "epoch": 5.292929292929293, "grad_norm": 0.4421881139278412, "learning_rate": 1.7049807827089815e-06, "loss": 0.2656, "step": 6288 }, { "epoch": 5.293771043771044, "grad_norm": 0.3916500210762024, "learning_rate": 1.7034024278537748e-06, "loss": 0.2696, "step": 6289 }, { "epoch": 5.294612794612795, "grad_norm": 0.40959885716438293, "learning_rate": 1.7018246538942069e-06, "loss": 0.2531, "step": 6290 }, { "epoch": 5.295454545454546, "grad_norm": 0.387376993894577, "learning_rate": 1.7002474611082965e-06, "loss": 0.241, "step": 6291 }, { "epoch": 5.296296296296296, "grad_norm": 0.4168015122413635, "learning_rate": 1.698670849773965e-06, "loss": 0.2533, "step": 6292 }, { "epoch": 5.297138047138047, "grad_norm": 0.42167848348617554, "learning_rate": 1.6970948201690268e-06, "loss": 0.2534, "step": 6293 }, { "epoch": 5.297979797979798, "grad_norm": 0.4070766270160675, "learning_rate": 1.6955193725711938e-06, "loss": 0.2519, "step": 6294 }, { "epoch": 5.298821548821548, "grad_norm": 0.37866705656051636, "learning_rate": 1.6939445072580774e-06, "loss": 0.2676, "step": 6295 }, { "epoch": 5.299663299663299, "grad_norm": 0.3892401456832886, "learning_rate": 1.6923702245071877e-06, "loss": 0.2575, "step": 6296 }, { "epoch": 5.30050505050505, "grad_norm": 0.41046130657196045, "learning_rate": 1.6907965245959284e-06, "loss": 0.2847, "step": 6297 }, { "epoch": 5.301346801346801, "grad_norm": 0.397828072309494, "learning_rate": 1.6892234078016023e-06, "loss": 0.2599, "step": 6298 }, { "epoch": 5.302188552188552, "grad_norm": 0.4317542612552643, "learning_rate": 1.6876508744014075e-06, "loss": 0.2507, "step": 6299 }, { "epoch": 5.303030303030303, "grad_norm": 0.39094114303588867, "learning_rate": 1.6860789246724434e-06, "loss": 0.2635, "step": 6300 }, { "epoch": 5.303872053872054, "grad_norm": 0.40362513065338135, "learning_rate": 1.6845075588917053e-06, "loss": 0.2836, "step": 6301 }, { "epoch": 5.3047138047138045, "grad_norm": 0.38967058062553406, "learning_rate": 1.6829367773360833e-06, "loss": 0.266, "step": 6302 }, { "epoch": 5.305555555555555, "grad_norm": 0.405093252658844, "learning_rate": 1.6813665802823637e-06, "loss": 0.288, "step": 6303 }, { "epoch": 5.306397306397306, "grad_norm": 0.3998425602912903, "learning_rate": 1.6797969680072345e-06, "loss": 0.2776, "step": 6304 }, { "epoch": 5.307239057239057, "grad_norm": 0.4495145380496979, "learning_rate": 1.6782279407872792e-06, "loss": 0.2911, "step": 6305 }, { "epoch": 5.308080808080808, "grad_norm": 0.391579806804657, "learning_rate": 1.6766594988989753e-06, "loss": 0.2753, "step": 6306 }, { "epoch": 5.308922558922559, "grad_norm": 0.39523687958717346, "learning_rate": 1.6750916426186991e-06, "loss": 0.2814, "step": 6307 }, { "epoch": 5.30976430976431, "grad_norm": 0.40420860052108765, "learning_rate": 1.6735243722227223e-06, "loss": 0.2795, "step": 6308 }, { "epoch": 5.3106060606060606, "grad_norm": 0.383884459733963, "learning_rate": 1.671957687987218e-06, "loss": 0.2665, "step": 6309 }, { "epoch": 5.311447811447811, "grad_norm": 0.3892398476600647, "learning_rate": 1.670391590188249e-06, "loss": 0.251, "step": 6310 }, { "epoch": 5.312289562289562, "grad_norm": 0.4222714602947235, "learning_rate": 1.6688260791017819e-06, "loss": 0.2543, "step": 6311 }, { "epoch": 5.313131313131313, "grad_norm": 0.408005028963089, "learning_rate": 1.6672611550036727e-06, "loss": 0.2748, "step": 6312 }, { "epoch": 5.313973063973064, "grad_norm": 0.42762020230293274, "learning_rate": 1.6656968181696813e-06, "loss": 0.2654, "step": 6313 }, { "epoch": 5.314814814814815, "grad_norm": 0.4223037362098694, "learning_rate": 1.6641330688754591e-06, "loss": 0.2595, "step": 6314 }, { "epoch": 5.315656565656566, "grad_norm": 0.42235708236694336, "learning_rate": 1.662569907396554e-06, "loss": 0.2782, "step": 6315 }, { "epoch": 5.316498316498317, "grad_norm": 0.4058544337749481, "learning_rate": 1.6610073340084142e-06, "loss": 0.2598, "step": 6316 }, { "epoch": 5.3173400673400675, "grad_norm": 0.4245688319206238, "learning_rate": 1.6594453489863787e-06, "loss": 0.2603, "step": 6317 }, { "epoch": 5.318181818181818, "grad_norm": 0.42332860827445984, "learning_rate": 1.65788395260569e-06, "loss": 0.2529, "step": 6318 }, { "epoch": 5.319023569023569, "grad_norm": 0.3937399387359619, "learning_rate": 1.6563231451414803e-06, "loss": 0.2917, "step": 6319 }, { "epoch": 5.31986531986532, "grad_norm": 0.39737001061439514, "learning_rate": 1.6547629268687786e-06, "loss": 0.2607, "step": 6320 }, { "epoch": 5.320707070707071, "grad_norm": 0.41246771812438965, "learning_rate": 1.653203298062515e-06, "loss": 0.2917, "step": 6321 }, { "epoch": 5.321548821548822, "grad_norm": 0.43575453758239746, "learning_rate": 1.6516442589975128e-06, "loss": 0.2736, "step": 6322 }, { "epoch": 5.322390572390573, "grad_norm": 0.38913053274154663, "learning_rate": 1.6500858099484912e-06, "loss": 0.2699, "step": 6323 }, { "epoch": 5.3232323232323235, "grad_norm": 0.4239421784877777, "learning_rate": 1.6485279511900643e-06, "loss": 0.2597, "step": 6324 }, { "epoch": 5.324074074074074, "grad_norm": 0.4346782863140106, "learning_rate": 1.6469706829967418e-06, "loss": 0.2808, "step": 6325 }, { "epoch": 5.324915824915825, "grad_norm": 0.3958697021007538, "learning_rate": 1.645414005642934e-06, "loss": 0.2658, "step": 6326 }, { "epoch": 5.325757575757576, "grad_norm": 0.43267619609832764, "learning_rate": 1.6438579194029441e-06, "loss": 0.2814, "step": 6327 }, { "epoch": 5.326599326599327, "grad_norm": 0.44614067673683167, "learning_rate": 1.64230242455097e-06, "loss": 0.2786, "step": 6328 }, { "epoch": 5.327441077441078, "grad_norm": 0.41281619668006897, "learning_rate": 1.6407475213611051e-06, "loss": 0.2442, "step": 6329 }, { "epoch": 5.328282828282829, "grad_norm": 0.4137236177921295, "learning_rate": 1.6391932101073426e-06, "loss": 0.2754, "step": 6330 }, { "epoch": 5.329124579124579, "grad_norm": 0.468251496553421, "learning_rate": 1.6376394910635662e-06, "loss": 0.2811, "step": 6331 }, { "epoch": 5.32996632996633, "grad_norm": 0.43031129240989685, "learning_rate": 1.6360863645035606e-06, "loss": 0.2653, "step": 6332 }, { "epoch": 5.33080808080808, "grad_norm": 0.4166792631149292, "learning_rate": 1.6345338307010001e-06, "loss": 0.2586, "step": 6333 }, { "epoch": 5.331649831649831, "grad_norm": 0.42096903920173645, "learning_rate": 1.6329818899294608e-06, "loss": 0.2599, "step": 6334 }, { "epoch": 5.332491582491582, "grad_norm": 0.4157000482082367, "learning_rate": 1.6314305424624095e-06, "loss": 0.2761, "step": 6335 }, { "epoch": 5.333333333333333, "grad_norm": 0.4066374897956848, "learning_rate": 1.6298797885732092e-06, "loss": 0.2562, "step": 6336 }, { "epoch": 5.334175084175084, "grad_norm": 0.4279656410217285, "learning_rate": 1.6283296285351214e-06, "loss": 0.2623, "step": 6337 }, { "epoch": 5.335016835016835, "grad_norm": 0.41376519203186035, "learning_rate": 1.6267800626212988e-06, "loss": 0.2141, "step": 6338 }, { "epoch": 5.335858585858586, "grad_norm": 0.40628373622894287, "learning_rate": 1.6252310911047942e-06, "loss": 0.2437, "step": 6339 }, { "epoch": 5.3367003367003365, "grad_norm": 0.38174688816070557, "learning_rate": 1.6236827142585504e-06, "loss": 0.2716, "step": 6340 }, { "epoch": 5.337542087542087, "grad_norm": 0.38857385516166687, "learning_rate": 1.6221349323554076e-06, "loss": 0.2808, "step": 6341 }, { "epoch": 5.338383838383838, "grad_norm": 0.3895750045776367, "learning_rate": 1.6205877456681029e-06, "loss": 0.2962, "step": 6342 }, { "epoch": 5.339225589225589, "grad_norm": 0.4115534722805023, "learning_rate": 1.619041154469268e-06, "loss": 0.3117, "step": 6343 }, { "epoch": 5.34006734006734, "grad_norm": 0.41414037346839905, "learning_rate": 1.6174951590314276e-06, "loss": 0.2642, "step": 6344 }, { "epoch": 5.340909090909091, "grad_norm": 0.4050171673297882, "learning_rate": 1.6159497596270024e-06, "loss": 0.2482, "step": 6345 }, { "epoch": 5.341750841750842, "grad_norm": 0.42400506138801575, "learning_rate": 1.6144049565283064e-06, "loss": 0.2611, "step": 6346 }, { "epoch": 5.342592592592593, "grad_norm": 0.44368255138397217, "learning_rate": 1.6128607500075528e-06, "loss": 0.2749, "step": 6347 }, { "epoch": 5.343434343434343, "grad_norm": 0.41220641136169434, "learning_rate": 1.6113171403368483e-06, "loss": 0.276, "step": 6348 }, { "epoch": 5.344276094276094, "grad_norm": 0.40530526638031006, "learning_rate": 1.6097741277881918e-06, "loss": 0.2619, "step": 6349 }, { "epoch": 5.345117845117845, "grad_norm": 0.41158148646354675, "learning_rate": 1.6082317126334768e-06, "loss": 0.2602, "step": 6350 }, { "epoch": 5.345959595959596, "grad_norm": 0.4242410957813263, "learning_rate": 1.606689895144497e-06, "loss": 0.2614, "step": 6351 }, { "epoch": 5.346801346801347, "grad_norm": 0.4063771367073059, "learning_rate": 1.6051486755929336e-06, "loss": 0.2868, "step": 6352 }, { "epoch": 5.347643097643098, "grad_norm": 0.4076620638370514, "learning_rate": 1.603608054250369e-06, "loss": 0.2652, "step": 6353 }, { "epoch": 5.348484848484849, "grad_norm": 0.42411988973617554, "learning_rate": 1.6020680313882753e-06, "loss": 0.2712, "step": 6354 }, { "epoch": 5.3493265993265995, "grad_norm": 0.4192156195640564, "learning_rate": 1.6005286072780202e-06, "loss": 0.2831, "step": 6355 }, { "epoch": 5.35016835016835, "grad_norm": 0.4033704400062561, "learning_rate": 1.598989782190869e-06, "loss": 0.2844, "step": 6356 }, { "epoch": 5.351010101010101, "grad_norm": 0.40804702043533325, "learning_rate": 1.5974515563979776e-06, "loss": 0.261, "step": 6357 }, { "epoch": 5.351851851851852, "grad_norm": 0.4270396828651428, "learning_rate": 1.595913930170397e-06, "loss": 0.282, "step": 6358 }, { "epoch": 5.352693602693603, "grad_norm": 0.3685462772846222, "learning_rate": 1.5943769037790735e-06, "loss": 0.2956, "step": 6359 }, { "epoch": 5.353535353535354, "grad_norm": 0.39383944869041443, "learning_rate": 1.5928404774948502e-06, "loss": 0.2693, "step": 6360 }, { "epoch": 5.354377104377105, "grad_norm": 0.3977528512477875, "learning_rate": 1.59130465158846e-06, "loss": 0.2657, "step": 6361 }, { "epoch": 5.3552188552188555, "grad_norm": 0.39543160796165466, "learning_rate": 1.5897694263305318e-06, "loss": 0.2686, "step": 6362 }, { "epoch": 5.356060606060606, "grad_norm": 0.3763255178928375, "learning_rate": 1.5882348019915855e-06, "loss": 0.2509, "step": 6363 }, { "epoch": 5.356902356902357, "grad_norm": 0.37833425402641296, "learning_rate": 1.5867007788420446e-06, "loss": 0.2829, "step": 6364 }, { "epoch": 5.357744107744108, "grad_norm": 0.4053158164024353, "learning_rate": 1.5851673571522168e-06, "loss": 0.267, "step": 6365 }, { "epoch": 5.358585858585858, "grad_norm": 0.4110909700393677, "learning_rate": 1.5836345371923078e-06, "loss": 0.2761, "step": 6366 }, { "epoch": 5.359427609427609, "grad_norm": 0.42880481481552124, "learning_rate": 1.5821023192324154e-06, "loss": 0.2735, "step": 6367 }, { "epoch": 5.36026936026936, "grad_norm": 0.4296332895755768, "learning_rate": 1.5805707035425334e-06, "loss": 0.2604, "step": 6368 }, { "epoch": 5.361111111111111, "grad_norm": 0.4311567544937134, "learning_rate": 1.5790396903925509e-06, "loss": 0.2762, "step": 6369 }, { "epoch": 5.361952861952862, "grad_norm": 0.41716286540031433, "learning_rate": 1.5775092800522479e-06, "loss": 0.2876, "step": 6370 }, { "epoch": 5.3627946127946124, "grad_norm": 0.41784918308258057, "learning_rate": 1.5759794727912964e-06, "loss": 0.2675, "step": 6371 }, { "epoch": 5.363636363636363, "grad_norm": 0.4280494749546051, "learning_rate": 1.5744502688792684e-06, "loss": 0.2555, "step": 6372 }, { "epoch": 5.364478114478114, "grad_norm": 0.424883633852005, "learning_rate": 1.572921668585624e-06, "loss": 0.2826, "step": 6373 }, { "epoch": 5.365319865319865, "grad_norm": 0.4179278314113617, "learning_rate": 1.571393672179718e-06, "loss": 0.2772, "step": 6374 }, { "epoch": 5.366161616161616, "grad_norm": 0.40590378642082214, "learning_rate": 1.5698662799308024e-06, "loss": 0.2459, "step": 6375 }, { "epoch": 5.367003367003367, "grad_norm": 0.4244216978549957, "learning_rate": 1.5683394921080164e-06, "loss": 0.253, "step": 6376 }, { "epoch": 5.367845117845118, "grad_norm": 0.41210639476776123, "learning_rate": 1.5668133089804e-06, "loss": 0.2689, "step": 6377 }, { "epoch": 5.3686868686868685, "grad_norm": 0.387204647064209, "learning_rate": 1.5652877308168811e-06, "loss": 0.2334, "step": 6378 }, { "epoch": 5.369528619528619, "grad_norm": 0.4001101851463318, "learning_rate": 1.5637627578862813e-06, "loss": 0.2848, "step": 6379 }, { "epoch": 5.37037037037037, "grad_norm": 0.40513306856155396, "learning_rate": 1.562238390457319e-06, "loss": 0.2525, "step": 6380 }, { "epoch": 5.371212121212121, "grad_norm": 0.37662163376808167, "learning_rate": 1.5607146287986052e-06, "loss": 0.2836, "step": 6381 }, { "epoch": 5.372053872053872, "grad_norm": 0.39754125475883484, "learning_rate": 1.559191473178641e-06, "loss": 0.2542, "step": 6382 }, { "epoch": 5.372895622895623, "grad_norm": 0.3782583475112915, "learning_rate": 1.557668923865824e-06, "loss": 0.2555, "step": 6383 }, { "epoch": 5.373737373737374, "grad_norm": 0.40513840317726135, "learning_rate": 1.5561469811284408e-06, "loss": 0.2732, "step": 6384 }, { "epoch": 5.374579124579125, "grad_norm": 0.39840057492256165, "learning_rate": 1.5546256452346758e-06, "loss": 0.2625, "step": 6385 }, { "epoch": 5.375420875420875, "grad_norm": 0.40221306681632996, "learning_rate": 1.5531049164526064e-06, "loss": 0.2679, "step": 6386 }, { "epoch": 5.376262626262626, "grad_norm": 0.4062540531158447, "learning_rate": 1.5515847950501995e-06, "loss": 0.2941, "step": 6387 }, { "epoch": 5.377104377104377, "grad_norm": 0.4087107181549072, "learning_rate": 1.5500652812953154e-06, "loss": 0.2921, "step": 6388 }, { "epoch": 5.377946127946128, "grad_norm": 0.38702160120010376, "learning_rate": 1.5485463754557112e-06, "loss": 0.2687, "step": 6389 }, { "epoch": 5.378787878787879, "grad_norm": 0.39309611916542053, "learning_rate": 1.5470280777990314e-06, "loss": 0.2674, "step": 6390 }, { "epoch": 5.37962962962963, "grad_norm": 0.4110589623451233, "learning_rate": 1.5455103885928196e-06, "loss": 0.2738, "step": 6391 }, { "epoch": 5.380471380471381, "grad_norm": 0.42769068479537964, "learning_rate": 1.5439933081045072e-06, "loss": 0.2256, "step": 6392 }, { "epoch": 5.3813131313131315, "grad_norm": 0.42049890756607056, "learning_rate": 1.542476836601418e-06, "loss": 0.2528, "step": 6393 }, { "epoch": 5.382154882154882, "grad_norm": 0.39665448665618896, "learning_rate": 1.5409609743507742e-06, "loss": 0.2433, "step": 6394 }, { "epoch": 5.382996632996633, "grad_norm": 0.3966784179210663, "learning_rate": 1.539445721619683e-06, "loss": 0.2847, "step": 6395 }, { "epoch": 5.383838383838384, "grad_norm": 0.4139573574066162, "learning_rate": 1.5379310786751522e-06, "loss": 0.2858, "step": 6396 }, { "epoch": 5.384680134680135, "grad_norm": 0.4252675771713257, "learning_rate": 1.5364170457840737e-06, "loss": 0.2726, "step": 6397 }, { "epoch": 5.385521885521886, "grad_norm": 0.41019007563591003, "learning_rate": 1.5349036232132404e-06, "loss": 0.2721, "step": 6398 }, { "epoch": 5.386363636363637, "grad_norm": 0.39492130279541016, "learning_rate": 1.5333908112293317e-06, "loss": 0.2514, "step": 6399 }, { "epoch": 5.3872053872053876, "grad_norm": 0.43324100971221924, "learning_rate": 1.531878610098919e-06, "loss": 0.2534, "step": 6400 }, { "epoch": 5.388047138047138, "grad_norm": 0.415993332862854, "learning_rate": 1.5303670200884707e-06, "loss": 0.2645, "step": 6401 }, { "epoch": 5.388888888888889, "grad_norm": 0.4213036298751831, "learning_rate": 1.528856041464346e-06, "loss": 0.2562, "step": 6402 }, { "epoch": 5.38973063973064, "grad_norm": 0.43916451930999756, "learning_rate": 1.5273456744927945e-06, "loss": 0.2532, "step": 6403 }, { "epoch": 5.390572390572391, "grad_norm": 0.42160236835479736, "learning_rate": 1.5258359194399585e-06, "loss": 0.2581, "step": 6404 }, { "epoch": 5.391414141414142, "grad_norm": 0.3803161084651947, "learning_rate": 1.524326776571871e-06, "loss": 0.2805, "step": 6405 }, { "epoch": 5.392255892255892, "grad_norm": 0.398004412651062, "learning_rate": 1.5228182461544617e-06, "loss": 0.2687, "step": 6406 }, { "epoch": 5.393097643097643, "grad_norm": 0.4249744415283203, "learning_rate": 1.5213103284535503e-06, "loss": 0.2456, "step": 6407 }, { "epoch": 5.393939393939394, "grad_norm": 0.4087044298648834, "learning_rate": 1.5198030237348471e-06, "loss": 0.2672, "step": 6408 }, { "epoch": 5.3947811447811445, "grad_norm": 0.425024151802063, "learning_rate": 1.5182963322639532e-06, "loss": 0.2562, "step": 6409 }, { "epoch": 5.395622895622895, "grad_norm": 0.42093685269355774, "learning_rate": 1.5167902543063668e-06, "loss": 0.2707, "step": 6410 }, { "epoch": 5.396464646464646, "grad_norm": 0.4152735769748688, "learning_rate": 1.5152847901274725e-06, "loss": 0.2663, "step": 6411 }, { "epoch": 5.397306397306397, "grad_norm": 0.4069223403930664, "learning_rate": 1.5137799399925513e-06, "loss": 0.2657, "step": 6412 }, { "epoch": 5.398148148148148, "grad_norm": 0.42781829833984375, "learning_rate": 1.512275704166773e-06, "loss": 0.257, "step": 6413 }, { "epoch": 5.398989898989899, "grad_norm": 0.3847272992134094, "learning_rate": 1.5107720829151979e-06, "loss": 0.2821, "step": 6414 }, { "epoch": 5.39983164983165, "grad_norm": 0.4275425374507904, "learning_rate": 1.5092690765027839e-06, "loss": 0.2685, "step": 6415 }, { "epoch": 5.4006734006734005, "grad_norm": 0.3922717571258545, "learning_rate": 1.5077666851943728e-06, "loss": 0.27, "step": 6416 }, { "epoch": 5.401515151515151, "grad_norm": 0.39072340726852417, "learning_rate": 1.5062649092547055e-06, "loss": 0.2696, "step": 6417 }, { "epoch": 5.402356902356902, "grad_norm": 0.40928247570991516, "learning_rate": 1.5047637489484086e-06, "loss": 0.2852, "step": 6418 }, { "epoch": 5.403198653198653, "grad_norm": 0.40568360686302185, "learning_rate": 1.5032632045400043e-06, "loss": 0.2769, "step": 6419 }, { "epoch": 5.404040404040404, "grad_norm": 0.4165388345718384, "learning_rate": 1.5017632762939032e-06, "loss": 0.2593, "step": 6420 }, { "epoch": 5.404882154882155, "grad_norm": 0.42376628518104553, "learning_rate": 1.5002639644744094e-06, "loss": 0.2633, "step": 6421 }, { "epoch": 5.405723905723906, "grad_norm": 0.444497674703598, "learning_rate": 1.4987652693457155e-06, "loss": 0.2489, "step": 6422 }, { "epoch": 5.406565656565657, "grad_norm": 0.4150242507457733, "learning_rate": 1.4972671911719095e-06, "loss": 0.2669, "step": 6423 }, { "epoch": 5.407407407407407, "grad_norm": 0.4160633981227875, "learning_rate": 1.4957697302169699e-06, "loss": 0.2704, "step": 6424 }, { "epoch": 5.408249158249158, "grad_norm": 0.478825181722641, "learning_rate": 1.4942728867447636e-06, "loss": 0.2744, "step": 6425 }, { "epoch": 5.409090909090909, "grad_norm": 0.4384937286376953, "learning_rate": 1.4927766610190496e-06, "loss": 0.2588, "step": 6426 }, { "epoch": 5.40993265993266, "grad_norm": 0.4634856879711151, "learning_rate": 1.4912810533034798e-06, "loss": 0.2564, "step": 6427 }, { "epoch": 5.410774410774411, "grad_norm": 0.409141480922699, "learning_rate": 1.489786063861598e-06, "loss": 0.2771, "step": 6428 }, { "epoch": 5.411616161616162, "grad_norm": 0.453246146440506, "learning_rate": 1.4882916929568359e-06, "loss": 0.2523, "step": 6429 }, { "epoch": 5.412457912457913, "grad_norm": 0.4361608326435089, "learning_rate": 1.486797940852517e-06, "loss": 0.2856, "step": 6430 }, { "epoch": 5.4132996632996635, "grad_norm": 0.4179026484489441, "learning_rate": 1.4853048078118548e-06, "loss": 0.2598, "step": 6431 }, { "epoch": 5.414141414141414, "grad_norm": 0.4228212833404541, "learning_rate": 1.483812294097957e-06, "loss": 0.2927, "step": 6432 }, { "epoch": 5.414983164983165, "grad_norm": 0.45616552233695984, "learning_rate": 1.4823203999738227e-06, "loss": 0.2677, "step": 6433 }, { "epoch": 5.415824915824916, "grad_norm": 0.3780338168144226, "learning_rate": 1.4808291257023372e-06, "loss": 0.285, "step": 6434 }, { "epoch": 5.416666666666667, "grad_norm": 0.42804884910583496, "learning_rate": 1.4793384715462772e-06, "loss": 0.2714, "step": 6435 }, { "epoch": 5.417508417508418, "grad_norm": 0.4437561631202698, "learning_rate": 1.4778484377683155e-06, "loss": 0.2511, "step": 6436 }, { "epoch": 5.418350168350169, "grad_norm": 0.4199783504009247, "learning_rate": 1.4763590246310105e-06, "loss": 0.2602, "step": 6437 }, { "epoch": 5.41919191919192, "grad_norm": 0.3957620859146118, "learning_rate": 1.4748702323968107e-06, "loss": 0.2598, "step": 6438 }, { "epoch": 5.42003367003367, "grad_norm": 0.4230395257472992, "learning_rate": 1.4733820613280586e-06, "loss": 0.2727, "step": 6439 }, { "epoch": 5.420875420875421, "grad_norm": 0.382983922958374, "learning_rate": 1.4718945116869881e-06, "loss": 0.2698, "step": 6440 }, { "epoch": 5.421717171717171, "grad_norm": 0.4099201560020447, "learning_rate": 1.470407583735719e-06, "loss": 0.2317, "step": 6441 }, { "epoch": 5.422558922558922, "grad_norm": 0.42981213331222534, "learning_rate": 1.4689212777362637e-06, "loss": 0.2498, "step": 6442 }, { "epoch": 5.423400673400673, "grad_norm": 0.4344232976436615, "learning_rate": 1.4674355939505247e-06, "loss": 0.2744, "step": 6443 }, { "epoch": 5.424242424242424, "grad_norm": 0.4501221477985382, "learning_rate": 1.4659505326402956e-06, "loss": 0.2379, "step": 6444 }, { "epoch": 5.425084175084175, "grad_norm": 0.3975953459739685, "learning_rate": 1.4644660940672628e-06, "loss": 0.2746, "step": 6445 }, { "epoch": 5.425925925925926, "grad_norm": 0.4183977246284485, "learning_rate": 1.4629822784929976e-06, "loss": 0.2634, "step": 6446 }, { "epoch": 5.4267676767676765, "grad_norm": 0.3820497691631317, "learning_rate": 1.4614990861789634e-06, "loss": 0.2728, "step": 6447 }, { "epoch": 5.427609427609427, "grad_norm": 0.4017888009548187, "learning_rate": 1.4600165173865154e-06, "loss": 0.2772, "step": 6448 }, { "epoch": 5.428451178451178, "grad_norm": 0.4217763841152191, "learning_rate": 1.4585345723769002e-06, "loss": 0.2741, "step": 6449 }, { "epoch": 5.429292929292929, "grad_norm": 0.4118961691856384, "learning_rate": 1.4570532514112495e-06, "loss": 0.271, "step": 6450 }, { "epoch": 5.43013468013468, "grad_norm": 0.38691383600234985, "learning_rate": 1.4555725547505894e-06, "loss": 0.2529, "step": 6451 }, { "epoch": 5.430976430976431, "grad_norm": 0.41502144932746887, "learning_rate": 1.4540924826558322e-06, "loss": 0.2701, "step": 6452 }, { "epoch": 5.431818181818182, "grad_norm": 0.4017292559146881, "learning_rate": 1.4526130353877842e-06, "loss": 0.2774, "step": 6453 }, { "epoch": 5.4326599326599325, "grad_norm": 0.4144552946090698, "learning_rate": 1.451134213207141e-06, "loss": 0.2589, "step": 6454 }, { "epoch": 5.433501683501683, "grad_norm": 0.41986557841300964, "learning_rate": 1.4496560163744849e-06, "loss": 0.2433, "step": 6455 }, { "epoch": 5.434343434343434, "grad_norm": 0.40354058146476746, "learning_rate": 1.4481784451502895e-06, "loss": 0.2853, "step": 6456 }, { "epoch": 5.435185185185185, "grad_norm": 0.4668846130371094, "learning_rate": 1.4467014997949208e-06, "loss": 0.2483, "step": 6457 }, { "epoch": 5.436026936026936, "grad_norm": 0.38943803310394287, "learning_rate": 1.4452251805686306e-06, "loss": 0.2536, "step": 6458 }, { "epoch": 5.436868686868687, "grad_norm": 0.4141045808792114, "learning_rate": 1.4437494877315617e-06, "loss": 0.2857, "step": 6459 }, { "epoch": 5.437710437710438, "grad_norm": 0.41050514578819275, "learning_rate": 1.4422744215437473e-06, "loss": 0.2548, "step": 6460 }, { "epoch": 5.438552188552189, "grad_norm": 0.43081116676330566, "learning_rate": 1.440799982265112e-06, "loss": 0.2673, "step": 6461 }, { "epoch": 5.4393939393939394, "grad_norm": 0.38514551520347595, "learning_rate": 1.4393261701554661e-06, "loss": 0.2914, "step": 6462 }, { "epoch": 5.44023569023569, "grad_norm": 0.3809138238430023, "learning_rate": 1.4378529854745105e-06, "loss": 0.2563, "step": 6463 }, { "epoch": 5.441077441077441, "grad_norm": 0.4153038263320923, "learning_rate": 1.436380428481835e-06, "loss": 0.2502, "step": 6464 }, { "epoch": 5.441919191919192, "grad_norm": 0.40550288558006287, "learning_rate": 1.4349084994369217e-06, "loss": 0.2632, "step": 6465 }, { "epoch": 5.442760942760943, "grad_norm": 0.4223509430885315, "learning_rate": 1.4334371985991409e-06, "loss": 0.2503, "step": 6466 }, { "epoch": 5.443602693602694, "grad_norm": 0.4042579233646393, "learning_rate": 1.4319665262277504e-06, "loss": 0.2679, "step": 6467 }, { "epoch": 5.444444444444445, "grad_norm": 0.4168646037578583, "learning_rate": 1.4304964825818962e-06, "loss": 0.2637, "step": 6468 }, { "epoch": 5.4452861952861955, "grad_norm": 0.4518718719482422, "learning_rate": 1.4290270679206185e-06, "loss": 0.2709, "step": 6469 }, { "epoch": 5.446127946127946, "grad_norm": 0.43800657987594604, "learning_rate": 1.4275582825028445e-06, "loss": 0.2882, "step": 6470 }, { "epoch": 5.446969696969697, "grad_norm": 0.40232229232788086, "learning_rate": 1.4260901265873883e-06, "loss": 0.2632, "step": 6471 }, { "epoch": 5.447811447811448, "grad_norm": 0.45213836431503296, "learning_rate": 1.4246226004329544e-06, "loss": 0.2525, "step": 6472 }, { "epoch": 5.448653198653199, "grad_norm": 0.44541049003601074, "learning_rate": 1.423155704298136e-06, "loss": 0.3015, "step": 6473 }, { "epoch": 5.44949494949495, "grad_norm": 0.4340059161186218, "learning_rate": 1.4216894384414182e-06, "loss": 0.2556, "step": 6474 }, { "epoch": 5.450336700336701, "grad_norm": 0.4223484694957733, "learning_rate": 1.420223803121169e-06, "loss": 0.2608, "step": 6475 }, { "epoch": 5.451178451178452, "grad_norm": 0.4545195996761322, "learning_rate": 1.4187587985956536e-06, "loss": 0.2892, "step": 6476 }, { "epoch": 5.452020202020202, "grad_norm": 0.42242667078971863, "learning_rate": 1.4172944251230169e-06, "loss": 0.295, "step": 6477 }, { "epoch": 5.452861952861953, "grad_norm": 0.41659092903137207, "learning_rate": 1.415830682961301e-06, "loss": 0.2636, "step": 6478 }, { "epoch": 5.453703703703704, "grad_norm": 0.4404568672180176, "learning_rate": 1.4143675723684308e-06, "loss": 0.26, "step": 6479 }, { "epoch": 5.454545454545454, "grad_norm": 0.4589591920375824, "learning_rate": 1.4129050936022216e-06, "loss": 0.2959, "step": 6480 }, { "epoch": 5.455387205387205, "grad_norm": 0.42859601974487305, "learning_rate": 1.4114432469203793e-06, "loss": 0.2566, "step": 6481 }, { "epoch": 5.456228956228956, "grad_norm": 0.4256117343902588, "learning_rate": 1.4099820325804942e-06, "loss": 0.2711, "step": 6482 }, { "epoch": 5.457070707070707, "grad_norm": 0.4366103410720825, "learning_rate": 1.408521450840052e-06, "loss": 0.259, "step": 6483 }, { "epoch": 5.457912457912458, "grad_norm": 0.4039357304573059, "learning_rate": 1.4070615019564199e-06, "loss": 0.2619, "step": 6484 }, { "epoch": 5.4587542087542085, "grad_norm": 0.45667392015457153, "learning_rate": 1.405602186186855e-06, "loss": 0.2636, "step": 6485 }, { "epoch": 5.459595959595959, "grad_norm": 0.4602417051792145, "learning_rate": 1.404143503788507e-06, "loss": 0.2855, "step": 6486 }, { "epoch": 5.46043771043771, "grad_norm": 0.38898926973342896, "learning_rate": 1.4026854550184122e-06, "loss": 0.2686, "step": 6487 }, { "epoch": 5.461279461279461, "grad_norm": 0.431723415851593, "learning_rate": 1.401228040133492e-06, "loss": 0.2924, "step": 6488 }, { "epoch": 5.462121212121212, "grad_norm": 0.4470956325531006, "learning_rate": 1.3997712593905599e-06, "loss": 0.2567, "step": 6489 }, { "epoch": 5.462962962962963, "grad_norm": 0.3907085359096527, "learning_rate": 1.3983151130463135e-06, "loss": 0.2885, "step": 6490 }, { "epoch": 5.463804713804714, "grad_norm": 0.4346863329410553, "learning_rate": 1.3968596013573433e-06, "loss": 0.2604, "step": 6491 }, { "epoch": 5.4646464646464645, "grad_norm": 0.4278485178947449, "learning_rate": 1.3954047245801278e-06, "loss": 0.2806, "step": 6492 }, { "epoch": 5.465488215488215, "grad_norm": 0.4052940011024475, "learning_rate": 1.3939504829710293e-06, "loss": 0.2721, "step": 6493 }, { "epoch": 5.466329966329966, "grad_norm": 0.4654768407344818, "learning_rate": 1.3924968767863006e-06, "loss": 0.2752, "step": 6494 }, { "epoch": 5.467171717171717, "grad_norm": 0.422122061252594, "learning_rate": 1.3910439062820841e-06, "loss": 0.2505, "step": 6495 }, { "epoch": 5.468013468013468, "grad_norm": 0.41243815422058105, "learning_rate": 1.389591571714407e-06, "loss": 0.278, "step": 6496 }, { "epoch": 5.468855218855219, "grad_norm": 0.40594300627708435, "learning_rate": 1.3881398733391882e-06, "loss": 0.2649, "step": 6497 }, { "epoch": 5.46969696969697, "grad_norm": 0.4083988070487976, "learning_rate": 1.3866888114122295e-06, "loss": 0.2825, "step": 6498 }, { "epoch": 5.470538720538721, "grad_norm": 0.41069626808166504, "learning_rate": 1.3852383861892276e-06, "loss": 0.2725, "step": 6499 }, { "epoch": 5.4713804713804715, "grad_norm": 0.418923556804657, "learning_rate": 1.3837885979257598e-06, "loss": 0.2607, "step": 6500 }, { "epoch": 5.472222222222222, "grad_norm": 0.4556306004524231, "learning_rate": 1.3823394468772939e-06, "loss": 0.2764, "step": 6501 }, { "epoch": 5.473063973063973, "grad_norm": 0.43582960963249207, "learning_rate": 1.3808909332991882e-06, "loss": 0.273, "step": 6502 }, { "epoch": 5.473905723905724, "grad_norm": 0.38529208302497864, "learning_rate": 1.3794430574466833e-06, "loss": 0.2597, "step": 6503 }, { "epoch": 5.474747474747475, "grad_norm": 0.39279791712760925, "learning_rate": 1.3779958195749138e-06, "loss": 0.2701, "step": 6504 }, { "epoch": 5.475589225589226, "grad_norm": 0.3862250745296478, "learning_rate": 1.3765492199388964e-06, "loss": 0.2797, "step": 6505 }, { "epoch": 5.476430976430977, "grad_norm": 0.36529475450515747, "learning_rate": 1.375103258793536e-06, "loss": 0.2792, "step": 6506 }, { "epoch": 5.4772727272727275, "grad_norm": 0.40845590829849243, "learning_rate": 1.373657936393628e-06, "loss": 0.2789, "step": 6507 }, { "epoch": 5.478114478114478, "grad_norm": 0.4281468987464905, "learning_rate": 1.3722132529938553e-06, "loss": 0.2599, "step": 6508 }, { "epoch": 5.478956228956229, "grad_norm": 0.42418959736824036, "learning_rate": 1.3707692088487846e-06, "loss": 0.2664, "step": 6509 }, { "epoch": 5.47979797979798, "grad_norm": 0.3974786698818207, "learning_rate": 1.369325804212872e-06, "loss": 0.2761, "step": 6510 }, { "epoch": 5.480639730639731, "grad_norm": 0.42365655303001404, "learning_rate": 1.3678830393404597e-06, "loss": 0.2586, "step": 6511 }, { "epoch": 5.481481481481482, "grad_norm": 0.43558090925216675, "learning_rate": 1.3664409144857788e-06, "loss": 0.2719, "step": 6512 }, { "epoch": 5.482323232323233, "grad_norm": 0.4342392683029175, "learning_rate": 1.3649994299029496e-06, "loss": 0.2969, "step": 6513 }, { "epoch": 5.483164983164984, "grad_norm": 0.3993336260318756, "learning_rate": 1.363558585845975e-06, "loss": 0.2673, "step": 6514 }, { "epoch": 5.4840067340067336, "grad_norm": 0.4168018698692322, "learning_rate": 1.3621183825687456e-06, "loss": 0.2787, "step": 6515 }, { "epoch": 5.484848484848484, "grad_norm": 0.43924686312675476, "learning_rate": 1.3606788203250438e-06, "loss": 0.253, "step": 6516 }, { "epoch": 5.485690235690235, "grad_norm": 0.4528057277202606, "learning_rate": 1.3592398993685329e-06, "loss": 0.2607, "step": 6517 }, { "epoch": 5.486531986531986, "grad_norm": 0.4354998469352722, "learning_rate": 1.357801619952769e-06, "loss": 0.2685, "step": 6518 }, { "epoch": 5.487373737373737, "grad_norm": 0.444806307554245, "learning_rate": 1.3563639823311897e-06, "loss": 0.2814, "step": 6519 }, { "epoch": 5.488215488215488, "grad_norm": 0.41318026185035706, "learning_rate": 1.3549269867571225e-06, "loss": 0.2492, "step": 6520 }, { "epoch": 5.489057239057239, "grad_norm": 0.4179784655570984, "learning_rate": 1.3534906334837833e-06, "loss": 0.2851, "step": 6521 }, { "epoch": 5.48989898989899, "grad_norm": 0.44744250178337097, "learning_rate": 1.352054922764271e-06, "loss": 0.267, "step": 6522 }, { "epoch": 5.4907407407407405, "grad_norm": 0.38322654366493225, "learning_rate": 1.3506198548515725e-06, "loss": 0.2828, "step": 6523 }, { "epoch": 5.491582491582491, "grad_norm": 0.4420012831687927, "learning_rate": 1.349185429998563e-06, "loss": 0.2621, "step": 6524 }, { "epoch": 5.492424242424242, "grad_norm": 0.4120419919490814, "learning_rate": 1.347751648458005e-06, "loss": 0.2725, "step": 6525 }, { "epoch": 5.493265993265993, "grad_norm": 0.406418114900589, "learning_rate": 1.3463185104825449e-06, "loss": 0.2731, "step": 6526 }, { "epoch": 5.494107744107744, "grad_norm": 0.41023382544517517, "learning_rate": 1.3448860163247168e-06, "loss": 0.2708, "step": 6527 }, { "epoch": 5.494949494949495, "grad_norm": 0.43593573570251465, "learning_rate": 1.3434541662369383e-06, "loss": 0.2793, "step": 6528 }, { "epoch": 5.495791245791246, "grad_norm": 0.4053407609462738, "learning_rate": 1.3420229604715224e-06, "loss": 0.2501, "step": 6529 }, { "epoch": 5.4966329966329965, "grad_norm": 0.42702049016952515, "learning_rate": 1.3405923992806601e-06, "loss": 0.2748, "step": 6530 }, { "epoch": 5.497474747474747, "grad_norm": 0.4661630392074585, "learning_rate": 1.3391624829164317e-06, "loss": 0.2633, "step": 6531 }, { "epoch": 5.498316498316498, "grad_norm": 0.4049837589263916, "learning_rate": 1.3377332116308024e-06, "loss": 0.2659, "step": 6532 }, { "epoch": 5.499158249158249, "grad_norm": 0.444153368473053, "learning_rate": 1.3363045856756257e-06, "loss": 0.2482, "step": 6533 }, { "epoch": 5.5, "grad_norm": 0.43824565410614014, "learning_rate": 1.334876605302643e-06, "loss": 0.2824, "step": 6534 }, { "epoch": 5.500841750841751, "grad_norm": 0.4138372540473938, "learning_rate": 1.3334492707634772e-06, "loss": 0.2609, "step": 6535 }, { "epoch": 5.501683501683502, "grad_norm": 0.4463995695114136, "learning_rate": 1.33202258230964e-06, "loss": 0.269, "step": 6536 }, { "epoch": 5.502525252525253, "grad_norm": 0.4133257269859314, "learning_rate": 1.3305965401925303e-06, "loss": 0.2821, "step": 6537 }, { "epoch": 5.5033670033670035, "grad_norm": 0.4015238583087921, "learning_rate": 1.329171144663432e-06, "loss": 0.2717, "step": 6538 }, { "epoch": 5.504208754208754, "grad_norm": 0.3838058412075043, "learning_rate": 1.327746395973512e-06, "loss": 0.2551, "step": 6539 }, { "epoch": 5.505050505050505, "grad_norm": 0.41470223665237427, "learning_rate": 1.3263222943738307e-06, "loss": 0.2835, "step": 6540 }, { "epoch": 5.505892255892256, "grad_norm": 0.41732078790664673, "learning_rate": 1.3248988401153257e-06, "loss": 0.2731, "step": 6541 }, { "epoch": 5.506734006734007, "grad_norm": 0.37629392743110657, "learning_rate": 1.3234760334488289e-06, "loss": 0.2822, "step": 6542 }, { "epoch": 5.507575757575758, "grad_norm": 0.4250863194465637, "learning_rate": 1.3220538746250516e-06, "loss": 0.2747, "step": 6543 }, { "epoch": 5.508417508417509, "grad_norm": 0.42182034254074097, "learning_rate": 1.3206323638945922e-06, "loss": 0.2833, "step": 6544 }, { "epoch": 5.5092592592592595, "grad_norm": 0.4343380033969879, "learning_rate": 1.3192115015079376e-06, "loss": 0.2536, "step": 6545 }, { "epoch": 5.51010101010101, "grad_norm": 0.42549535632133484, "learning_rate": 1.3177912877154607e-06, "loss": 0.2964, "step": 6546 }, { "epoch": 5.510942760942761, "grad_norm": 0.4339703917503357, "learning_rate": 1.316371722767416e-06, "loss": 0.2642, "step": 6547 }, { "epoch": 5.511784511784512, "grad_norm": 0.4286399483680725, "learning_rate": 1.3149528069139473e-06, "loss": 0.2637, "step": 6548 }, { "epoch": 5.512626262626263, "grad_norm": 0.4395776093006134, "learning_rate": 1.3135345404050797e-06, "loss": 0.2682, "step": 6549 }, { "epoch": 5.513468013468014, "grad_norm": 0.4302883744239807, "learning_rate": 1.3121169234907287e-06, "loss": 0.2908, "step": 6550 }, { "epoch": 5.514309764309765, "grad_norm": 0.4128498136997223, "learning_rate": 1.3106999564206963e-06, "loss": 0.2748, "step": 6551 }, { "epoch": 5.515151515151516, "grad_norm": 0.40495821833610535, "learning_rate": 1.3092836394446639e-06, "loss": 0.2838, "step": 6552 }, { "epoch": 5.5159932659932664, "grad_norm": 0.42909783124923706, "learning_rate": 1.3078679728122013e-06, "loss": 0.29, "step": 6553 }, { "epoch": 5.516835016835017, "grad_norm": 0.4211830496788025, "learning_rate": 1.306452956772767e-06, "loss": 0.2613, "step": 6554 }, { "epoch": 5.517676767676767, "grad_norm": 0.41489359736442566, "learning_rate": 1.3050385915756986e-06, "loss": 0.2713, "step": 6555 }, { "epoch": 5.518518518518518, "grad_norm": 0.41983330249786377, "learning_rate": 1.3036248774702253e-06, "loss": 0.2519, "step": 6556 }, { "epoch": 5.519360269360269, "grad_norm": 0.42347627878189087, "learning_rate": 1.3022118147054574e-06, "loss": 0.2429, "step": 6557 }, { "epoch": 5.52020202020202, "grad_norm": 0.39409589767456055, "learning_rate": 1.3007994035303895e-06, "loss": 0.2665, "step": 6558 }, { "epoch": 5.521043771043771, "grad_norm": 0.4149973690509796, "learning_rate": 1.2993876441939073e-06, "loss": 0.2574, "step": 6559 }, { "epoch": 5.521885521885522, "grad_norm": 0.41941577196121216, "learning_rate": 1.2979765369447745e-06, "loss": 0.2951, "step": 6560 }, { "epoch": 5.5227272727272725, "grad_norm": 0.42701977491378784, "learning_rate": 1.2965660820316462e-06, "loss": 0.2839, "step": 6561 }, { "epoch": 5.523569023569023, "grad_norm": 0.42933082580566406, "learning_rate": 1.2951562797030564e-06, "loss": 0.2754, "step": 6562 }, { "epoch": 5.524410774410774, "grad_norm": 0.44677695631980896, "learning_rate": 1.2937471302074306e-06, "loss": 0.2595, "step": 6563 }, { "epoch": 5.525252525252525, "grad_norm": 0.41831374168395996, "learning_rate": 1.2923386337930744e-06, "loss": 0.2719, "step": 6564 }, { "epoch": 5.526094276094276, "grad_norm": 0.43783679604530334, "learning_rate": 1.290930790708178e-06, "loss": 0.2786, "step": 6565 }, { "epoch": 5.526936026936027, "grad_norm": 0.4333716034889221, "learning_rate": 1.2895236012008212e-06, "loss": 0.2643, "step": 6566 }, { "epoch": 5.527777777777778, "grad_norm": 0.38731059432029724, "learning_rate": 1.2881170655189652e-06, "loss": 0.256, "step": 6567 }, { "epoch": 5.5286195286195285, "grad_norm": 0.3914065361022949, "learning_rate": 1.2867111839104568e-06, "loss": 0.2933, "step": 6568 }, { "epoch": 5.529461279461279, "grad_norm": 0.407340407371521, "learning_rate": 1.2853059566230268e-06, "loss": 0.284, "step": 6569 }, { "epoch": 5.53030303030303, "grad_norm": 0.43095099925994873, "learning_rate": 1.2839013839042896e-06, "loss": 0.2648, "step": 6570 }, { "epoch": 5.531144781144781, "grad_norm": 0.43078914284706116, "learning_rate": 1.2824974660017476e-06, "loss": 0.2684, "step": 6571 }, { "epoch": 5.531986531986532, "grad_norm": 0.42720943689346313, "learning_rate": 1.2810942031627872e-06, "loss": 0.2458, "step": 6572 }, { "epoch": 5.532828282828283, "grad_norm": 0.39869794249534607, "learning_rate": 1.2796915956346766e-06, "loss": 0.2887, "step": 6573 }, { "epoch": 5.533670033670034, "grad_norm": 0.4102044105529785, "learning_rate": 1.2782896436645697e-06, "loss": 0.2596, "step": 6574 }, { "epoch": 5.534511784511785, "grad_norm": 0.43214067816734314, "learning_rate": 1.2768883474995076e-06, "loss": 0.247, "step": 6575 }, { "epoch": 5.5353535353535355, "grad_norm": 0.3911362588405609, "learning_rate": 1.2754877073864114e-06, "loss": 0.2553, "step": 6576 }, { "epoch": 5.536195286195286, "grad_norm": 0.4162939786911011, "learning_rate": 1.2740877235720905e-06, "loss": 0.2778, "step": 6577 }, { "epoch": 5.537037037037037, "grad_norm": 0.40933671593666077, "learning_rate": 1.2726883963032367e-06, "loss": 0.2773, "step": 6578 }, { "epoch": 5.537878787878788, "grad_norm": 0.4152149558067322, "learning_rate": 1.271289725826424e-06, "loss": 0.2553, "step": 6579 }, { "epoch": 5.538720538720539, "grad_norm": 0.401248574256897, "learning_rate": 1.2698917123881165e-06, "loss": 0.2684, "step": 6580 }, { "epoch": 5.53956228956229, "grad_norm": 0.4083375632762909, "learning_rate": 1.2684943562346564e-06, "loss": 0.2525, "step": 6581 }, { "epoch": 5.540404040404041, "grad_norm": 0.40703392028808594, "learning_rate": 1.267097657612275e-06, "loss": 0.2821, "step": 6582 }, { "epoch": 5.5412457912457915, "grad_norm": 0.4042256474494934, "learning_rate": 1.2657016167670834e-06, "loss": 0.2529, "step": 6583 }, { "epoch": 5.542087542087542, "grad_norm": 0.4132785201072693, "learning_rate": 1.2643062339450812e-06, "loss": 0.2699, "step": 6584 }, { "epoch": 5.542929292929293, "grad_norm": 0.4085453152656555, "learning_rate": 1.2629115093921478e-06, "loss": 0.2695, "step": 6585 }, { "epoch": 5.543771043771044, "grad_norm": 0.4282416105270386, "learning_rate": 1.26151744335405e-06, "loss": 0.2709, "step": 6586 }, { "epoch": 5.544612794612795, "grad_norm": 0.3848167657852173, "learning_rate": 1.2601240360764339e-06, "loss": 0.271, "step": 6587 }, { "epoch": 5.545454545454545, "grad_norm": 0.43157142400741577, "learning_rate": 1.2587312878048357e-06, "loss": 0.2535, "step": 6588 }, { "epoch": 5.546296296296296, "grad_norm": 0.4015180468559265, "learning_rate": 1.2573391987846728e-06, "loss": 0.2938, "step": 6589 }, { "epoch": 5.547138047138047, "grad_norm": 0.4216180443763733, "learning_rate": 1.2559477692612454e-06, "loss": 0.2904, "step": 6590 }, { "epoch": 5.547979797979798, "grad_norm": 0.43131738901138306, "learning_rate": 1.2545569994797364e-06, "loss": 0.275, "step": 6591 }, { "epoch": 5.548821548821548, "grad_norm": 0.41111066937446594, "learning_rate": 1.2531668896852162e-06, "loss": 0.2665, "step": 6592 }, { "epoch": 5.549663299663299, "grad_norm": 0.43923327326774597, "learning_rate": 1.2517774401226378e-06, "loss": 0.2498, "step": 6593 }, { "epoch": 5.55050505050505, "grad_norm": 0.4002353549003601, "learning_rate": 1.2503886510368357e-06, "loss": 0.2617, "step": 6594 }, { "epoch": 5.551346801346801, "grad_norm": 0.4353891909122467, "learning_rate": 1.24900052267253e-06, "loss": 0.2981, "step": 6595 }, { "epoch": 5.552188552188552, "grad_norm": 0.39318859577178955, "learning_rate": 1.2476130552743209e-06, "loss": 0.3089, "step": 6596 }, { "epoch": 5.553030303030303, "grad_norm": 0.4423496425151825, "learning_rate": 1.2462262490866977e-06, "loss": 0.2784, "step": 6597 }, { "epoch": 5.553872053872054, "grad_norm": 0.4241623282432556, "learning_rate": 1.2448401043540315e-06, "loss": 0.2757, "step": 6598 }, { "epoch": 5.5547138047138045, "grad_norm": 0.42744094133377075, "learning_rate": 1.2434546213205739e-06, "loss": 0.2723, "step": 6599 }, { "epoch": 5.555555555555555, "grad_norm": 0.40930160880088806, "learning_rate": 1.242069800230461e-06, "loss": 0.2897, "step": 6600 }, { "epoch": 5.556397306397306, "grad_norm": 0.4504169821739197, "learning_rate": 1.2406856413277157e-06, "loss": 0.2854, "step": 6601 }, { "epoch": 5.557239057239057, "grad_norm": 0.43007558584213257, "learning_rate": 1.2393021448562403e-06, "loss": 0.2412, "step": 6602 }, { "epoch": 5.558080808080808, "grad_norm": 0.427250474691391, "learning_rate": 1.2379193110598193e-06, "loss": 0.2702, "step": 6603 }, { "epoch": 5.558922558922559, "grad_norm": 0.40660595893859863, "learning_rate": 1.2365371401821252e-06, "loss": 0.2879, "step": 6604 }, { "epoch": 5.55976430976431, "grad_norm": 0.4366808533668518, "learning_rate": 1.2351556324667118e-06, "loss": 0.2757, "step": 6605 }, { "epoch": 5.5606060606060606, "grad_norm": 0.4392968416213989, "learning_rate": 1.233774788157015e-06, "loss": 0.2703, "step": 6606 }, { "epoch": 5.561447811447811, "grad_norm": 0.4241284132003784, "learning_rate": 1.232394607496354e-06, "loss": 0.2718, "step": 6607 }, { "epoch": 5.562289562289562, "grad_norm": 0.41251733899116516, "learning_rate": 1.2310150907279294e-06, "loss": 0.2644, "step": 6608 }, { "epoch": 5.563131313131313, "grad_norm": 0.438205748796463, "learning_rate": 1.2296362380948285e-06, "loss": 0.2571, "step": 6609 }, { "epoch": 5.563973063973064, "grad_norm": 0.43122875690460205, "learning_rate": 1.2282580498400216e-06, "loss": 0.2775, "step": 6610 }, { "epoch": 5.564814814814815, "grad_norm": 0.4191920757293701, "learning_rate": 1.2268805262063581e-06, "loss": 0.2755, "step": 6611 }, { "epoch": 5.565656565656566, "grad_norm": 0.45515385270118713, "learning_rate": 1.2255036674365716e-06, "loss": 0.2816, "step": 6612 }, { "epoch": 5.566498316498317, "grad_norm": 0.435273677110672, "learning_rate": 1.2241274737732806e-06, "loss": 0.2831, "step": 6613 }, { "epoch": 5.5673400673400675, "grad_norm": 0.45602595806121826, "learning_rate": 1.2227519454589858e-06, "loss": 0.2732, "step": 6614 }, { "epoch": 5.568181818181818, "grad_norm": 0.41511115431785583, "learning_rate": 1.2213770827360695e-06, "loss": 0.2469, "step": 6615 }, { "epoch": 5.569023569023569, "grad_norm": 0.48395952582359314, "learning_rate": 1.2200028858467966e-06, "loss": 0.2759, "step": 6616 }, { "epoch": 5.56986531986532, "grad_norm": 0.3949632942676544, "learning_rate": 1.2186293550333133e-06, "loss": 0.2778, "step": 6617 }, { "epoch": 5.570707070707071, "grad_norm": 0.3918943703174591, "learning_rate": 1.2172564905376543e-06, "loss": 0.286, "step": 6618 }, { "epoch": 5.571548821548822, "grad_norm": 0.43214261531829834, "learning_rate": 1.21588429260173e-06, "loss": 0.2534, "step": 6619 }, { "epoch": 5.572390572390573, "grad_norm": 0.4290667176246643, "learning_rate": 1.2145127614673386e-06, "loss": 0.2891, "step": 6620 }, { "epoch": 5.5732323232323235, "grad_norm": 0.4133506417274475, "learning_rate": 1.213141897376156e-06, "loss": 0.2463, "step": 6621 }, { "epoch": 5.574074074074074, "grad_norm": 0.3993637263774872, "learning_rate": 1.2117717005697455e-06, "loss": 0.2851, "step": 6622 }, { "epoch": 5.574915824915825, "grad_norm": 0.3940587043762207, "learning_rate": 1.2104021712895491e-06, "loss": 0.2841, "step": 6623 }, { "epoch": 5.575757575757576, "grad_norm": 0.39273959398269653, "learning_rate": 1.2090333097768918e-06, "loss": 0.299, "step": 6624 }, { "epoch": 5.576599326599327, "grad_norm": 0.4094696342945099, "learning_rate": 1.2076651162729835e-06, "loss": 0.2729, "step": 6625 }, { "epoch": 5.577441077441078, "grad_norm": 0.4026786684989929, "learning_rate": 1.2062975910189117e-06, "loss": 0.2362, "step": 6626 }, { "epoch": 5.578282828282829, "grad_norm": 0.3914858102798462, "learning_rate": 1.2049307342556527e-06, "loss": 0.2695, "step": 6627 }, { "epoch": 5.57912457912458, "grad_norm": 0.4275614619255066, "learning_rate": 1.2035645462240586e-06, "loss": 0.2301, "step": 6628 }, { "epoch": 5.5799663299663305, "grad_norm": 0.45567676424980164, "learning_rate": 1.202199027164866e-06, "loss": 0.266, "step": 6629 }, { "epoch": 5.58080808080808, "grad_norm": 0.433627724647522, "learning_rate": 1.2008341773186942e-06, "loss": 0.2919, "step": 6630 }, { "epoch": 5.581649831649831, "grad_norm": 0.4089590609073639, "learning_rate": 1.1994699969260464e-06, "loss": 0.2579, "step": 6631 }, { "epoch": 5.582491582491582, "grad_norm": 0.39354968070983887, "learning_rate": 1.1981064862273045e-06, "loss": 0.2686, "step": 6632 }, { "epoch": 5.583333333333333, "grad_norm": 0.4230596125125885, "learning_rate": 1.196743645462733e-06, "loss": 0.2684, "step": 6633 }, { "epoch": 5.584175084175084, "grad_norm": 0.4229223430156708, "learning_rate": 1.195381474872478e-06, "loss": 0.2667, "step": 6634 }, { "epoch": 5.585016835016835, "grad_norm": 0.3810349404811859, "learning_rate": 1.1940199746965698e-06, "loss": 0.2944, "step": 6635 }, { "epoch": 5.585858585858586, "grad_norm": 0.4337749481201172, "learning_rate": 1.1926591451749203e-06, "loss": 0.2461, "step": 6636 }, { "epoch": 5.5867003367003365, "grad_norm": 0.433533638715744, "learning_rate": 1.1912989865473212e-06, "loss": 0.2473, "step": 6637 }, { "epoch": 5.587542087542087, "grad_norm": 0.424051970243454, "learning_rate": 1.1899394990534457e-06, "loss": 0.2442, "step": 6638 }, { "epoch": 5.588383838383838, "grad_norm": 0.3901095986366272, "learning_rate": 1.1885806829328528e-06, "loss": 0.2617, "step": 6639 }, { "epoch": 5.589225589225589, "grad_norm": 0.398193895816803, "learning_rate": 1.1872225384249768e-06, "loss": 0.285, "step": 6640 }, { "epoch": 5.59006734006734, "grad_norm": 0.41726788878440857, "learning_rate": 1.1858650657691406e-06, "loss": 0.2399, "step": 6641 }, { "epoch": 5.590909090909091, "grad_norm": 0.4083051383495331, "learning_rate": 1.1845082652045426e-06, "loss": 0.2456, "step": 6642 }, { "epoch": 5.591750841750842, "grad_norm": 0.4285636842250824, "learning_rate": 1.1831521369702682e-06, "loss": 0.2842, "step": 6643 }, { "epoch": 5.592592592592593, "grad_norm": 0.41095438599586487, "learning_rate": 1.1817966813052806e-06, "loss": 0.2853, "step": 6644 }, { "epoch": 5.593434343434343, "grad_norm": 0.4128413796424866, "learning_rate": 1.1804418984484234e-06, "loss": 0.2482, "step": 6645 }, { "epoch": 5.594276094276094, "grad_norm": 0.3981788754463196, "learning_rate": 1.1790877886384272e-06, "loss": 0.3003, "step": 6646 }, { "epoch": 5.595117845117845, "grad_norm": 0.38918566703796387, "learning_rate": 1.1777343521138974e-06, "loss": 0.2828, "step": 6647 }, { "epoch": 5.595959595959596, "grad_norm": 0.404602587223053, "learning_rate": 1.1763815891133274e-06, "loss": 0.2673, "step": 6648 }, { "epoch": 5.596801346801347, "grad_norm": 0.4441598355770111, "learning_rate": 1.175029499875086e-06, "loss": 0.2738, "step": 6649 }, { "epoch": 5.597643097643098, "grad_norm": 0.4127553403377533, "learning_rate": 1.1736780846374257e-06, "loss": 0.2679, "step": 6650 }, { "epoch": 5.598484848484849, "grad_norm": 0.3991018235683441, "learning_rate": 1.1723273436384803e-06, "loss": 0.2532, "step": 6651 }, { "epoch": 5.5993265993265995, "grad_norm": 0.4008062779903412, "learning_rate": 1.170977277116267e-06, "loss": 0.2714, "step": 6652 }, { "epoch": 5.60016835016835, "grad_norm": 0.4064158797264099, "learning_rate": 1.16962788530868e-06, "loss": 0.2769, "step": 6653 }, { "epoch": 5.601010101010101, "grad_norm": 0.41852089762687683, "learning_rate": 1.1682791684534971e-06, "loss": 0.2613, "step": 6654 }, { "epoch": 5.601851851851852, "grad_norm": 0.40008431673049927, "learning_rate": 1.166931126788375e-06, "loss": 0.29, "step": 6655 }, { "epoch": 5.602693602693603, "grad_norm": 0.38952529430389404, "learning_rate": 1.1655837605508542e-06, "loss": 0.2535, "step": 6656 }, { "epoch": 5.603535353535354, "grad_norm": 0.4143137037754059, "learning_rate": 1.1642370699783566e-06, "loss": 0.2745, "step": 6657 }, { "epoch": 5.604377104377105, "grad_norm": 0.41162219643592834, "learning_rate": 1.162891055308182e-06, "loss": 0.2552, "step": 6658 }, { "epoch": 5.6052188552188555, "grad_norm": 0.40346428751945496, "learning_rate": 1.1615457167775106e-06, "loss": 0.2721, "step": 6659 }, { "epoch": 5.606060606060606, "grad_norm": 0.4057501256465912, "learning_rate": 1.1602010546234094e-06, "loss": 0.2561, "step": 6660 }, { "epoch": 5.606902356902357, "grad_norm": 0.4165043830871582, "learning_rate": 1.1588570690828183e-06, "loss": 0.2623, "step": 6661 }, { "epoch": 5.607744107744107, "grad_norm": 0.43151208758354187, "learning_rate": 1.1575137603925651e-06, "loss": 0.2801, "step": 6662 }, { "epoch": 5.608585858585858, "grad_norm": 0.4030390679836273, "learning_rate": 1.156171128789354e-06, "loss": 0.2736, "step": 6663 }, { "epoch": 5.609427609427609, "grad_norm": 0.4385327994823456, "learning_rate": 1.1548291745097689e-06, "loss": 0.2639, "step": 6664 }, { "epoch": 5.61026936026936, "grad_norm": 0.4181050658226013, "learning_rate": 1.1534878977902797e-06, "loss": 0.2522, "step": 6665 }, { "epoch": 5.611111111111111, "grad_norm": 0.40497565269470215, "learning_rate": 1.1521472988672321e-06, "loss": 0.2646, "step": 6666 }, { "epoch": 5.611952861952862, "grad_norm": 0.4104555547237396, "learning_rate": 1.150807377976853e-06, "loss": 0.2598, "step": 6667 }, { "epoch": 5.6127946127946124, "grad_norm": 0.38738906383514404, "learning_rate": 1.1494681353552516e-06, "loss": 0.283, "step": 6668 }, { "epoch": 5.613636363636363, "grad_norm": 0.4357268214225769, "learning_rate": 1.1481295712384183e-06, "loss": 0.2475, "step": 6669 }, { "epoch": 5.614478114478114, "grad_norm": 0.3798251152038574, "learning_rate": 1.1467916858622209e-06, "loss": 0.2641, "step": 6670 }, { "epoch": 5.615319865319865, "grad_norm": 0.4124727249145508, "learning_rate": 1.1454544794624089e-06, "loss": 0.2758, "step": 6671 }, { "epoch": 5.616161616161616, "grad_norm": 0.41438427567481995, "learning_rate": 1.1441179522746099e-06, "loss": 0.2515, "step": 6672 }, { "epoch": 5.617003367003367, "grad_norm": 0.38842204213142395, "learning_rate": 1.1427821045343401e-06, "loss": 0.2397, "step": 6673 }, { "epoch": 5.617845117845118, "grad_norm": 0.4434807002544403, "learning_rate": 1.1414469364769865e-06, "loss": 0.2505, "step": 6674 }, { "epoch": 5.6186868686868685, "grad_norm": 0.43132129311561584, "learning_rate": 1.1401124483378206e-06, "loss": 0.2823, "step": 6675 }, { "epoch": 5.619528619528619, "grad_norm": 0.39488962292671204, "learning_rate": 1.1387786403519913e-06, "loss": 0.281, "step": 6676 }, { "epoch": 5.62037037037037, "grad_norm": 0.3878309726715088, "learning_rate": 1.137445512754532e-06, "loss": 0.2208, "step": 6677 }, { "epoch": 5.621212121212121, "grad_norm": 0.4294576644897461, "learning_rate": 1.1361130657803554e-06, "loss": 0.2702, "step": 6678 }, { "epoch": 5.622053872053872, "grad_norm": 0.45349910855293274, "learning_rate": 1.1347812996642504e-06, "loss": 0.2747, "step": 6679 }, { "epoch": 5.622895622895623, "grad_norm": 0.4484333097934723, "learning_rate": 1.1334502146408883e-06, "loss": 0.2434, "step": 6680 }, { "epoch": 5.623737373737374, "grad_norm": 0.4118272066116333, "learning_rate": 1.132119810944823e-06, "loss": 0.2786, "step": 6681 }, { "epoch": 5.624579124579125, "grad_norm": 0.39011213183403015, "learning_rate": 1.1307900888104833e-06, "loss": 0.299, "step": 6682 }, { "epoch": 5.625420875420875, "grad_norm": 0.44555017352104187, "learning_rate": 1.12946104847218e-06, "loss": 0.2425, "step": 6683 }, { "epoch": 5.626262626262626, "grad_norm": 0.4298226237297058, "learning_rate": 1.1281326901641072e-06, "loss": 0.27, "step": 6684 }, { "epoch": 5.627104377104377, "grad_norm": 0.4366805851459503, "learning_rate": 1.1268050141203324e-06, "loss": 0.2637, "step": 6685 }, { "epoch": 5.627946127946128, "grad_norm": 0.43155133724212646, "learning_rate": 1.1254780205748094e-06, "loss": 0.2599, "step": 6686 }, { "epoch": 5.628787878787879, "grad_norm": 0.4240611791610718, "learning_rate": 1.1241517097613668e-06, "loss": 0.2756, "step": 6687 }, { "epoch": 5.62962962962963, "grad_norm": 0.40487486124038696, "learning_rate": 1.1228260819137138e-06, "loss": 0.2844, "step": 6688 }, { "epoch": 5.630471380471381, "grad_norm": 0.4357791543006897, "learning_rate": 1.121501137265441e-06, "loss": 0.2557, "step": 6689 }, { "epoch": 5.6313131313131315, "grad_norm": 0.42160525918006897, "learning_rate": 1.1201768760500203e-06, "loss": 0.2699, "step": 6690 }, { "epoch": 5.632154882154882, "grad_norm": 0.4129209816455841, "learning_rate": 1.1188532985007976e-06, "loss": 0.2552, "step": 6691 }, { "epoch": 5.632996632996633, "grad_norm": 0.44385698437690735, "learning_rate": 1.1175304048510026e-06, "loss": 0.2893, "step": 6692 }, { "epoch": 5.633838383838384, "grad_norm": 0.4347127676010132, "learning_rate": 1.1162081953337418e-06, "loss": 0.254, "step": 6693 }, { "epoch": 5.634680134680135, "grad_norm": 0.44533905386924744, "learning_rate": 1.1148866701820037e-06, "loss": 0.2403, "step": 6694 }, { "epoch": 5.635521885521886, "grad_norm": 0.4088422656059265, "learning_rate": 1.113565829628656e-06, "loss": 0.2952, "step": 6695 }, { "epoch": 5.636363636363637, "grad_norm": 0.4239930212497711, "learning_rate": 1.1122456739064446e-06, "loss": 0.2316, "step": 6696 }, { "epoch": 5.6372053872053876, "grad_norm": 0.4310958981513977, "learning_rate": 1.110926203247993e-06, "loss": 0.2477, "step": 6697 }, { "epoch": 5.638047138047138, "grad_norm": 0.40731459856033325, "learning_rate": 1.1096074178858085e-06, "loss": 0.2691, "step": 6698 }, { "epoch": 5.638888888888889, "grad_norm": 0.4237225353717804, "learning_rate": 1.108289318052273e-06, "loss": 0.2822, "step": 6699 }, { "epoch": 5.63973063973064, "grad_norm": 0.3980828821659088, "learning_rate": 1.1069719039796516e-06, "loss": 0.2878, "step": 6700 }, { "epoch": 5.640572390572391, "grad_norm": 0.41830378770828247, "learning_rate": 1.1056551759000866e-06, "loss": 0.2556, "step": 6701 }, { "epoch": 5.641414141414142, "grad_norm": 0.3967030942440033, "learning_rate": 1.1043391340455973e-06, "loss": 0.3065, "step": 6702 }, { "epoch": 5.642255892255893, "grad_norm": 0.3890264928340912, "learning_rate": 1.1030237786480875e-06, "loss": 0.2943, "step": 6703 }, { "epoch": 5.643097643097643, "grad_norm": 0.4095706343650818, "learning_rate": 1.1017091099393334e-06, "loss": 0.2885, "step": 6704 }, { "epoch": 5.643939393939394, "grad_norm": 0.36027735471725464, "learning_rate": 1.1003951281509967e-06, "loss": 0.2486, "step": 6705 }, { "epoch": 5.6447811447811445, "grad_norm": 0.3948330879211426, "learning_rate": 1.099081833514613e-06, "loss": 0.2624, "step": 6706 }, { "epoch": 5.645622895622895, "grad_norm": 0.4525747299194336, "learning_rate": 1.0977692262616007e-06, "loss": 0.259, "step": 6707 }, { "epoch": 5.646464646464646, "grad_norm": 0.40866971015930176, "learning_rate": 1.0964573066232538e-06, "loss": 0.2838, "step": 6708 }, { "epoch": 5.647306397306397, "grad_norm": 0.41037625074386597, "learning_rate": 1.0951460748307457e-06, "loss": 0.2589, "step": 6709 }, { "epoch": 5.648148148148148, "grad_norm": 0.44065791368484497, "learning_rate": 1.09383553111513e-06, "loss": 0.2705, "step": 6710 }, { "epoch": 5.648989898989899, "grad_norm": 0.4170142412185669, "learning_rate": 1.092525675707341e-06, "loss": 0.2866, "step": 6711 }, { "epoch": 5.64983164983165, "grad_norm": 0.4044676125049591, "learning_rate": 1.0912165088381872e-06, "loss": 0.2934, "step": 6712 }, { "epoch": 5.6506734006734005, "grad_norm": 0.4000993072986603, "learning_rate": 1.089908030738357e-06, "loss": 0.2389, "step": 6713 }, { "epoch": 5.651515151515151, "grad_norm": 0.4557304382324219, "learning_rate": 1.0886002416384184e-06, "loss": 0.2339, "step": 6714 }, { "epoch": 5.652356902356902, "grad_norm": 0.4213719069957733, "learning_rate": 1.0872931417688177e-06, "loss": 0.2587, "step": 6715 }, { "epoch": 5.653198653198653, "grad_norm": 0.3914572298526764, "learning_rate": 1.0859867313598822e-06, "loss": 0.2663, "step": 6716 }, { "epoch": 5.654040404040404, "grad_norm": 0.4028793275356293, "learning_rate": 1.0846810106418133e-06, "loss": 0.2619, "step": 6717 }, { "epoch": 5.654882154882155, "grad_norm": 0.4692913591861725, "learning_rate": 1.0833759798446918e-06, "loss": 0.2422, "step": 6718 }, { "epoch": 5.655723905723906, "grad_norm": 0.42485156655311584, "learning_rate": 1.082071639198481e-06, "loss": 0.2502, "step": 6719 }, { "epoch": 5.656565656565657, "grad_norm": 0.43535664677619934, "learning_rate": 1.0807679889330163e-06, "loss": 0.266, "step": 6720 }, { "epoch": 5.657407407407407, "grad_norm": 0.41168150305747986, "learning_rate": 1.0794650292780185e-06, "loss": 0.285, "step": 6721 }, { "epoch": 5.658249158249158, "grad_norm": 0.40936407446861267, "learning_rate": 1.0781627604630807e-06, "loss": 0.2338, "step": 6722 }, { "epoch": 5.659090909090909, "grad_norm": 0.4190830886363983, "learning_rate": 1.0768611827176756e-06, "loss": 0.2744, "step": 6723 }, { "epoch": 5.65993265993266, "grad_norm": 0.4191575050354004, "learning_rate": 1.0755602962711576e-06, "loss": 0.2662, "step": 6724 }, { "epoch": 5.660774410774411, "grad_norm": 0.4308607578277588, "learning_rate": 1.0742601013527543e-06, "loss": 0.2807, "step": 6725 }, { "epoch": 5.661616161616162, "grad_norm": 0.4184069335460663, "learning_rate": 1.0729605981915763e-06, "loss": 0.2529, "step": 6726 }, { "epoch": 5.662457912457913, "grad_norm": 0.42429855465888977, "learning_rate": 1.0716617870166074e-06, "loss": 0.2549, "step": 6727 }, { "epoch": 5.6632996632996635, "grad_norm": 0.3992094397544861, "learning_rate": 1.0703636680567149e-06, "loss": 0.2683, "step": 6728 }, { "epoch": 5.664141414141414, "grad_norm": 0.4109998643398285, "learning_rate": 1.0690662415406388e-06, "loss": 0.2541, "step": 6729 }, { "epoch": 5.664983164983165, "grad_norm": 0.43465933203697205, "learning_rate": 1.0677695076970012e-06, "loss": 0.252, "step": 6730 }, { "epoch": 5.665824915824916, "grad_norm": 0.4086041748523712, "learning_rate": 1.0664734667542976e-06, "loss": 0.2854, "step": 6731 }, { "epoch": 5.666666666666667, "grad_norm": 0.39028915762901306, "learning_rate": 1.0651781189409067e-06, "loss": 0.2582, "step": 6732 }, { "epoch": 5.667508417508418, "grad_norm": 0.430903822183609, "learning_rate": 1.0638834644850827e-06, "loss": 0.277, "step": 6733 }, { "epoch": 5.668350168350169, "grad_norm": 0.401242733001709, "learning_rate": 1.0625895036149575e-06, "loss": 0.2605, "step": 6734 }, { "epoch": 5.66919191919192, "grad_norm": 0.4114583432674408, "learning_rate": 1.0612962365585384e-06, "loss": 0.2637, "step": 6735 }, { "epoch": 5.6700336700336695, "grad_norm": 0.4235115349292755, "learning_rate": 1.0600036635437143e-06, "loss": 0.2831, "step": 6736 }, { "epoch": 5.67087542087542, "grad_norm": 0.384551078081131, "learning_rate": 1.0587117847982526e-06, "loss": 0.2748, "step": 6737 }, { "epoch": 5.671717171717171, "grad_norm": 0.41089022159576416, "learning_rate": 1.0574206005497938e-06, "loss": 0.2854, "step": 6738 }, { "epoch": 5.672558922558922, "grad_norm": 0.429255872964859, "learning_rate": 1.0561301110258593e-06, "loss": 0.2715, "step": 6739 }, { "epoch": 5.673400673400673, "grad_norm": 0.4021652042865753, "learning_rate": 1.0548403164538445e-06, "loss": 0.2901, "step": 6740 }, { "epoch": 5.674242424242424, "grad_norm": 0.42006629705429077, "learning_rate": 1.0535512170610274e-06, "loss": 0.2997, "step": 6741 }, { "epoch": 5.675084175084175, "grad_norm": 0.39119184017181396, "learning_rate": 1.0522628130745615e-06, "loss": 0.2544, "step": 6742 }, { "epoch": 5.675925925925926, "grad_norm": 0.41972288489341736, "learning_rate": 1.0509751047214767e-06, "loss": 0.2683, "step": 6743 }, { "epoch": 5.6767676767676765, "grad_norm": 0.44817888736724854, "learning_rate": 1.0496880922286789e-06, "loss": 0.2571, "step": 6744 }, { "epoch": 5.677609427609427, "grad_norm": 0.4041992723941803, "learning_rate": 1.048401775822957e-06, "loss": 0.2453, "step": 6745 }, { "epoch": 5.678451178451178, "grad_norm": 0.40211570262908936, "learning_rate": 1.0471161557309716e-06, "loss": 0.2718, "step": 6746 }, { "epoch": 5.679292929292929, "grad_norm": 0.4238039255142212, "learning_rate": 1.0458312321792618e-06, "loss": 0.2879, "step": 6747 }, { "epoch": 5.68013468013468, "grad_norm": 0.3738325238227844, "learning_rate": 1.0445470053942458e-06, "loss": 0.2733, "step": 6748 }, { "epoch": 5.680976430976431, "grad_norm": 0.4046252965927124, "learning_rate": 1.0432634756022197e-06, "loss": 0.2743, "step": 6749 }, { "epoch": 5.681818181818182, "grad_norm": 0.3798696994781494, "learning_rate": 1.0419806430293538e-06, "loss": 0.2526, "step": 6750 }, { "epoch": 5.6826599326599325, "grad_norm": 0.38921359181404114, "learning_rate": 1.0406985079016963e-06, "loss": 0.2675, "step": 6751 }, { "epoch": 5.683501683501683, "grad_norm": 0.4488805830478668, "learning_rate": 1.0394170704451722e-06, "loss": 0.2636, "step": 6752 }, { "epoch": 5.684343434343434, "grad_norm": 0.42311975359916687, "learning_rate": 1.0381363308855858e-06, "loss": 0.2688, "step": 6753 }, { "epoch": 5.685185185185185, "grad_norm": 0.3754011392593384, "learning_rate": 1.0368562894486184e-06, "loss": 0.2836, "step": 6754 }, { "epoch": 5.686026936026936, "grad_norm": 0.4401177167892456, "learning_rate": 1.0355769463598258e-06, "loss": 0.2513, "step": 6755 }, { "epoch": 5.686868686868687, "grad_norm": 0.44487258791923523, "learning_rate": 1.0342983018446402e-06, "loss": 0.2695, "step": 6756 }, { "epoch": 5.687710437710438, "grad_norm": 0.4120875895023346, "learning_rate": 1.0330203561283736e-06, "loss": 0.2642, "step": 6757 }, { "epoch": 5.688552188552189, "grad_norm": 0.46216723322868347, "learning_rate": 1.0317431094362152e-06, "loss": 0.2726, "step": 6758 }, { "epoch": 5.6893939393939394, "grad_norm": 0.4080899655818939, "learning_rate": 1.030466561993228e-06, "loss": 0.2915, "step": 6759 }, { "epoch": 5.69023569023569, "grad_norm": 0.4448041319847107, "learning_rate": 1.029190714024354e-06, "loss": 0.2367, "step": 6760 }, { "epoch": 5.691077441077441, "grad_norm": 0.4522228240966797, "learning_rate": 1.0279155657544088e-06, "loss": 0.2616, "step": 6761 }, { "epoch": 5.691919191919192, "grad_norm": 0.39218127727508545, "learning_rate": 1.026641117408091e-06, "loss": 0.2741, "step": 6762 }, { "epoch": 5.692760942760943, "grad_norm": 0.3945407271385193, "learning_rate": 1.025367369209968e-06, "loss": 0.2717, "step": 6763 }, { "epoch": 5.693602693602694, "grad_norm": 0.40662240982055664, "learning_rate": 1.0240943213844912e-06, "loss": 0.2776, "step": 6764 }, { "epoch": 5.694444444444445, "grad_norm": 0.42833977937698364, "learning_rate": 1.0228219741559825e-06, "loss": 0.2627, "step": 6765 }, { "epoch": 5.6952861952861955, "grad_norm": 0.41243883967399597, "learning_rate": 1.0215503277486454e-06, "loss": 0.2622, "step": 6766 }, { "epoch": 5.696127946127946, "grad_norm": 0.40543895959854126, "learning_rate": 1.020279382386557e-06, "loss": 0.2617, "step": 6767 }, { "epoch": 5.696969696969697, "grad_norm": 0.41414621472358704, "learning_rate": 1.0190091382936695e-06, "loss": 0.2656, "step": 6768 }, { "epoch": 5.697811447811448, "grad_norm": 0.4076645076274872, "learning_rate": 1.0177395956938157e-06, "loss": 0.2507, "step": 6769 }, { "epoch": 5.698653198653199, "grad_norm": 0.37685537338256836, "learning_rate": 1.016470754810701e-06, "loss": 0.2611, "step": 6770 }, { "epoch": 5.69949494949495, "grad_norm": 0.3953886926174164, "learning_rate": 1.0152026158679107e-06, "loss": 0.2795, "step": 6771 }, { "epoch": 5.700336700336701, "grad_norm": 0.4280475080013275, "learning_rate": 1.0139351790889024e-06, "loss": 0.3005, "step": 6772 }, { "epoch": 5.701178451178452, "grad_norm": 0.43095096945762634, "learning_rate": 1.012668444697012e-06, "loss": 0.2507, "step": 6773 }, { "epoch": 5.702020202020202, "grad_norm": 0.42400193214416504, "learning_rate": 1.011402412915452e-06, "loss": 0.2838, "step": 6774 }, { "epoch": 5.702861952861953, "grad_norm": 0.41234979033470154, "learning_rate": 1.0101370839673125e-06, "loss": 0.2616, "step": 6775 }, { "epoch": 5.703703703703704, "grad_norm": 0.3898994028568268, "learning_rate": 1.008872458075556e-06, "loss": 0.2947, "step": 6776 }, { "epoch": 5.704545454545455, "grad_norm": 0.4264952838420868, "learning_rate": 1.0076085354630238e-06, "loss": 0.2671, "step": 6777 }, { "epoch": 5.705387205387205, "grad_norm": 0.4148279130458832, "learning_rate": 1.0063453163524306e-06, "loss": 0.3048, "step": 6778 }, { "epoch": 5.706228956228956, "grad_norm": 0.3838178813457489, "learning_rate": 1.005082800966371e-06, "loss": 0.3027, "step": 6779 }, { "epoch": 5.707070707070707, "grad_norm": 0.4331328868865967, "learning_rate": 1.003820989527315e-06, "loss": 0.2913, "step": 6780 }, { "epoch": 5.707912457912458, "grad_norm": 0.39515188336372375, "learning_rate": 1.0025598822576048e-06, "loss": 0.2738, "step": 6781 }, { "epoch": 5.7087542087542085, "grad_norm": 0.4198285937309265, "learning_rate": 1.0012994793794606e-06, "loss": 0.2787, "step": 6782 }, { "epoch": 5.709595959595959, "grad_norm": 0.43336835503578186, "learning_rate": 1.0000397811149815e-06, "loss": 0.2771, "step": 6783 }, { "epoch": 5.71043771043771, "grad_norm": 0.4108847975730896, "learning_rate": 9.98780787686137e-07, "loss": 0.2747, "step": 6784 }, { "epoch": 5.711279461279461, "grad_norm": 0.4007914960384369, "learning_rate": 9.975224993147775e-07, "loss": 0.2529, "step": 6785 }, { "epoch": 5.712121212121212, "grad_norm": 0.3888951241970062, "learning_rate": 9.962649162226245e-07, "loss": 0.2629, "step": 6786 }, { "epoch": 5.712962962962963, "grad_norm": 0.3973996043205261, "learning_rate": 9.950080386312805e-07, "loss": 0.2685, "step": 6787 }, { "epoch": 5.713804713804714, "grad_norm": 0.38864994049072266, "learning_rate": 9.937518667622187e-07, "loss": 0.2636, "step": 6788 }, { "epoch": 5.7146464646464645, "grad_norm": 0.39200282096862793, "learning_rate": 9.924964008367888e-07, "loss": 0.2607, "step": 6789 }, { "epoch": 5.715488215488215, "grad_norm": 0.38741227984428406, "learning_rate": 9.9124164107622e-07, "loss": 0.2896, "step": 6790 }, { "epoch": 5.716329966329966, "grad_norm": 0.4456157684326172, "learning_rate": 9.899875877016118e-07, "loss": 0.2292, "step": 6791 }, { "epoch": 5.717171717171717, "grad_norm": 0.42334553599357605, "learning_rate": 9.887342409339445e-07, "loss": 0.243, "step": 6792 }, { "epoch": 5.718013468013468, "grad_norm": 0.3839649558067322, "learning_rate": 9.874816009940703e-07, "loss": 0.2764, "step": 6793 }, { "epoch": 5.718855218855219, "grad_norm": 0.39961591362953186, "learning_rate": 9.86229668102715e-07, "loss": 0.2958, "step": 6794 }, { "epoch": 5.71969696969697, "grad_norm": 0.4585433900356293, "learning_rate": 9.849784424804848e-07, "loss": 0.282, "step": 6795 }, { "epoch": 5.720538720538721, "grad_norm": 0.41340014338493347, "learning_rate": 9.837279243478604e-07, "loss": 0.3056, "step": 6796 }, { "epoch": 5.7213804713804715, "grad_norm": 0.41821014881134033, "learning_rate": 9.824781139251948e-07, "loss": 0.2604, "step": 6797 }, { "epoch": 5.722222222222222, "grad_norm": 0.40184906125068665, "learning_rate": 9.812290114327183e-07, "loss": 0.2483, "step": 6798 }, { "epoch": 5.723063973063973, "grad_norm": 0.42209160327911377, "learning_rate": 9.799806170905345e-07, "loss": 0.2462, "step": 6799 }, { "epoch": 5.723905723905724, "grad_norm": 0.4388419985771179, "learning_rate": 9.78732931118625e-07, "loss": 0.2676, "step": 6800 }, { "epoch": 5.724747474747475, "grad_norm": 0.40751129388809204, "learning_rate": 9.774859537368464e-07, "loss": 0.2905, "step": 6801 }, { "epoch": 5.725589225589226, "grad_norm": 0.40902307629585266, "learning_rate": 9.762396851649287e-07, "loss": 0.2798, "step": 6802 }, { "epoch": 5.726430976430977, "grad_norm": 0.40654096007347107, "learning_rate": 9.74994125622476e-07, "loss": 0.2633, "step": 6803 }, { "epoch": 5.7272727272727275, "grad_norm": 0.44129130244255066, "learning_rate": 9.737492753289724e-07, "loss": 0.3108, "step": 6804 }, { "epoch": 5.728114478114478, "grad_norm": 0.42957714200019836, "learning_rate": 9.725051345037705e-07, "loss": 0.2848, "step": 6805 }, { "epoch": 5.728956228956229, "grad_norm": 0.4059366285800934, "learning_rate": 9.712617033661042e-07, "loss": 0.2902, "step": 6806 }, { "epoch": 5.72979797979798, "grad_norm": 0.4328688383102417, "learning_rate": 9.700189821350775e-07, "loss": 0.2793, "step": 6807 }, { "epoch": 5.730639730639731, "grad_norm": 0.4310471713542938, "learning_rate": 9.687769710296702e-07, "loss": 0.2605, "step": 6808 }, { "epoch": 5.731481481481482, "grad_norm": 0.4183773994445801, "learning_rate": 9.675356702687405e-07, "loss": 0.2578, "step": 6809 }, { "epoch": 5.732323232323233, "grad_norm": 0.42513710260391235, "learning_rate": 9.662950800710181e-07, "loss": 0.2551, "step": 6810 }, { "epoch": 5.733164983164983, "grad_norm": 0.4004693031311035, "learning_rate": 9.650552006551057e-07, "loss": 0.2899, "step": 6811 }, { "epoch": 5.7340067340067336, "grad_norm": 0.4006615877151489, "learning_rate": 9.638160322394851e-07, "loss": 0.2519, "step": 6812 }, { "epoch": 5.734848484848484, "grad_norm": 0.3737018406391144, "learning_rate": 9.625775750425126e-07, "loss": 0.2806, "step": 6813 }, { "epoch": 5.735690235690235, "grad_norm": 0.4020669460296631, "learning_rate": 9.61339829282416e-07, "loss": 0.29, "step": 6814 }, { "epoch": 5.736531986531986, "grad_norm": 0.42166295647621155, "learning_rate": 9.601027951772995e-07, "loss": 0.2538, "step": 6815 }, { "epoch": 5.737373737373737, "grad_norm": 0.4255080223083496, "learning_rate": 9.588664729451386e-07, "loss": 0.2512, "step": 6816 }, { "epoch": 5.738215488215488, "grad_norm": 0.4335262179374695, "learning_rate": 9.576308628037922e-07, "loss": 0.2734, "step": 6817 }, { "epoch": 5.739057239057239, "grad_norm": 0.406749963760376, "learning_rate": 9.563959649709852e-07, "loss": 0.2866, "step": 6818 }, { "epoch": 5.73989898989899, "grad_norm": 0.39817124605178833, "learning_rate": 9.551617796643198e-07, "loss": 0.269, "step": 6819 }, { "epoch": 5.7407407407407405, "grad_norm": 0.3925028145313263, "learning_rate": 9.539283071012712e-07, "loss": 0.2967, "step": 6820 }, { "epoch": 5.741582491582491, "grad_norm": 0.39247703552246094, "learning_rate": 9.526955474991917e-07, "loss": 0.2835, "step": 6821 }, { "epoch": 5.742424242424242, "grad_norm": 0.4384421110153198, "learning_rate": 9.514635010753082e-07, "loss": 0.2555, "step": 6822 }, { "epoch": 5.743265993265993, "grad_norm": 0.40296897292137146, "learning_rate": 9.502321680467191e-07, "loss": 0.2718, "step": 6823 }, { "epoch": 5.744107744107744, "grad_norm": 0.4311542809009552, "learning_rate": 9.490015486303966e-07, "loss": 0.3044, "step": 6824 }, { "epoch": 5.744949494949495, "grad_norm": 0.4210510849952698, "learning_rate": 9.477716430431921e-07, "loss": 0.2748, "step": 6825 }, { "epoch": 5.745791245791246, "grad_norm": 0.4067878723144531, "learning_rate": 9.465424515018268e-07, "loss": 0.2732, "step": 6826 }, { "epoch": 5.7466329966329965, "grad_norm": 0.42461708188056946, "learning_rate": 9.453139742228956e-07, "loss": 0.2802, "step": 6827 }, { "epoch": 5.747474747474747, "grad_norm": 0.420041561126709, "learning_rate": 9.440862114228721e-07, "loss": 0.2382, "step": 6828 }, { "epoch": 5.748316498316498, "grad_norm": 0.4551105201244354, "learning_rate": 9.428591633180989e-07, "loss": 0.2737, "step": 6829 }, { "epoch": 5.749158249158249, "grad_norm": 0.40940558910369873, "learning_rate": 9.416328301247967e-07, "loss": 0.2478, "step": 6830 }, { "epoch": 5.75, "grad_norm": 0.41476473212242126, "learning_rate": 9.404072120590585e-07, "loss": 0.287, "step": 6831 }, { "epoch": 5.750841750841751, "grad_norm": 0.40237322449684143, "learning_rate": 9.39182309336849e-07, "loss": 0.285, "step": 6832 }, { "epoch": 5.751683501683502, "grad_norm": 0.4353773295879364, "learning_rate": 9.379581221740108e-07, "loss": 0.283, "step": 6833 }, { "epoch": 5.752525252525253, "grad_norm": 0.406566858291626, "learning_rate": 9.367346507862601e-07, "loss": 0.2809, "step": 6834 }, { "epoch": 5.7533670033670035, "grad_norm": 0.4155520796775818, "learning_rate": 9.355118953891834e-07, "loss": 0.2488, "step": 6835 }, { "epoch": 5.754208754208754, "grad_norm": 0.44175833463668823, "learning_rate": 9.342898561982444e-07, "loss": 0.2744, "step": 6836 }, { "epoch": 5.755050505050505, "grad_norm": 0.44931215047836304, "learning_rate": 9.330685334287764e-07, "loss": 0.2675, "step": 6837 }, { "epoch": 5.755892255892256, "grad_norm": 0.4338991343975067, "learning_rate": 9.318479272959952e-07, "loss": 0.2658, "step": 6838 }, { "epoch": 5.756734006734007, "grad_norm": 0.40005406737327576, "learning_rate": 9.306280380149807e-07, "loss": 0.2985, "step": 6839 }, { "epoch": 5.757575757575758, "grad_norm": 0.4079888164997101, "learning_rate": 9.294088658006917e-07, "loss": 0.2625, "step": 6840 }, { "epoch": 5.758417508417509, "grad_norm": 0.4526895582675934, "learning_rate": 9.281904108679573e-07, "loss": 0.2951, "step": 6841 }, { "epoch": 5.7592592592592595, "grad_norm": 0.43609675765037537, "learning_rate": 9.269726734314849e-07, "loss": 0.2725, "step": 6842 }, { "epoch": 5.76010101010101, "grad_norm": 0.40683600306510925, "learning_rate": 9.257556537058504e-07, "loss": 0.2877, "step": 6843 }, { "epoch": 5.760942760942761, "grad_norm": 0.41565585136413574, "learning_rate": 9.245393519055085e-07, "loss": 0.2727, "step": 6844 }, { "epoch": 5.761784511784512, "grad_norm": 0.44423040747642517, "learning_rate": 9.233237682447816e-07, "loss": 0.2597, "step": 6845 }, { "epoch": 5.762626262626263, "grad_norm": 0.407599538564682, "learning_rate": 9.221089029378715e-07, "loss": 0.2716, "step": 6846 }, { "epoch": 5.763468013468014, "grad_norm": 0.41799136996269226, "learning_rate": 9.208947561988485e-07, "loss": 0.2795, "step": 6847 }, { "epoch": 5.764309764309765, "grad_norm": 0.44757288694381714, "learning_rate": 9.196813282416572e-07, "loss": 0.3063, "step": 6848 }, { "epoch": 5.765151515151516, "grad_norm": 0.4237487018108368, "learning_rate": 9.184686192801196e-07, "loss": 0.2801, "step": 6849 }, { "epoch": 5.7659932659932664, "grad_norm": 0.38944077491760254, "learning_rate": 9.172566295279245e-07, "loss": 0.255, "step": 6850 }, { "epoch": 5.766835016835017, "grad_norm": 0.4121483564376831, "learning_rate": 9.160453591986401e-07, "loss": 0.2802, "step": 6851 }, { "epoch": 5.767676767676767, "grad_norm": 0.4298042953014374, "learning_rate": 9.148348085057046e-07, "loss": 0.2764, "step": 6852 }, { "epoch": 5.768518518518518, "grad_norm": 0.4199468493461609, "learning_rate": 9.136249776624284e-07, "loss": 0.2615, "step": 6853 }, { "epoch": 5.769360269360269, "grad_norm": 0.3846260607242584, "learning_rate": 9.124158668819971e-07, "loss": 0.2637, "step": 6854 }, { "epoch": 5.77020202020202, "grad_norm": 0.43946897983551025, "learning_rate": 9.112074763774714e-07, "loss": 0.2856, "step": 6855 }, { "epoch": 5.771043771043771, "grad_norm": 0.4405135214328766, "learning_rate": 9.099998063617804e-07, "loss": 0.2623, "step": 6856 }, { "epoch": 5.771885521885522, "grad_norm": 0.39915016293525696, "learning_rate": 9.087928570477289e-07, "loss": 0.2872, "step": 6857 }, { "epoch": 5.7727272727272725, "grad_norm": 0.4162595570087433, "learning_rate": 9.075866286479929e-07, "loss": 0.274, "step": 6858 }, { "epoch": 5.773569023569023, "grad_norm": 0.39560186862945557, "learning_rate": 9.063811213751234e-07, "loss": 0.3008, "step": 6859 }, { "epoch": 5.774410774410774, "grad_norm": 0.43319761753082275, "learning_rate": 9.05176335441546e-07, "loss": 0.2747, "step": 6860 }, { "epoch": 5.775252525252525, "grad_norm": 0.44394856691360474, "learning_rate": 9.039722710595544e-07, "loss": 0.2723, "step": 6861 }, { "epoch": 5.776094276094276, "grad_norm": 0.3946833908557892, "learning_rate": 9.027689284413171e-07, "loss": 0.277, "step": 6862 }, { "epoch": 5.776936026936027, "grad_norm": 0.3998156785964966, "learning_rate": 9.015663077988779e-07, "loss": 0.2895, "step": 6863 }, { "epoch": 5.777777777777778, "grad_norm": 0.41540032625198364, "learning_rate": 9.003644093441493e-07, "loss": 0.248, "step": 6864 }, { "epoch": 5.7786195286195285, "grad_norm": 0.41625794768333435, "learning_rate": 8.991632332889211e-07, "loss": 0.2841, "step": 6865 }, { "epoch": 5.779461279461279, "grad_norm": 0.4033646583557129, "learning_rate": 8.97962779844852e-07, "loss": 0.258, "step": 6866 }, { "epoch": 5.78030303030303, "grad_norm": 0.3876248300075531, "learning_rate": 8.967630492234736e-07, "loss": 0.2528, "step": 6867 }, { "epoch": 5.781144781144781, "grad_norm": 0.3924716114997864, "learning_rate": 8.955640416361933e-07, "loss": 0.2609, "step": 6868 }, { "epoch": 5.781986531986532, "grad_norm": 0.4099951982498169, "learning_rate": 8.943657572942865e-07, "loss": 0.2662, "step": 6869 }, { "epoch": 5.782828282828283, "grad_norm": 0.4250403344631195, "learning_rate": 8.931681964089061e-07, "loss": 0.2815, "step": 6870 }, { "epoch": 5.783670033670034, "grad_norm": 0.40786483883857727, "learning_rate": 8.919713591910739e-07, "loss": 0.2523, "step": 6871 }, { "epoch": 5.784511784511785, "grad_norm": 0.4109657406806946, "learning_rate": 8.90775245851686e-07, "loss": 0.2796, "step": 6872 }, { "epoch": 5.7853535353535355, "grad_norm": 0.405692994594574, "learning_rate": 8.895798566015102e-07, "loss": 0.2687, "step": 6873 }, { "epoch": 5.786195286195286, "grad_norm": 0.43033379316329956, "learning_rate": 8.883851916511871e-07, "loss": 0.2718, "step": 6874 }, { "epoch": 5.787037037037037, "grad_norm": 0.42264342308044434, "learning_rate": 8.871912512112257e-07, "loss": 0.258, "step": 6875 }, { "epoch": 5.787878787878788, "grad_norm": 0.40421274304389954, "learning_rate": 8.859980354920172e-07, "loss": 0.2859, "step": 6876 }, { "epoch": 5.788720538720539, "grad_norm": 0.4216535985469818, "learning_rate": 8.848055447038156e-07, "loss": 0.2945, "step": 6877 }, { "epoch": 5.78956228956229, "grad_norm": 0.41232192516326904, "learning_rate": 8.836137790567512e-07, "loss": 0.2949, "step": 6878 }, { "epoch": 5.790404040404041, "grad_norm": 0.43671146035194397, "learning_rate": 8.824227387608236e-07, "loss": 0.262, "step": 6879 }, { "epoch": 5.7912457912457915, "grad_norm": 0.43657785654067993, "learning_rate": 8.812324240259096e-07, "loss": 0.2618, "step": 6880 }, { "epoch": 5.792087542087542, "grad_norm": 0.4806455075740814, "learning_rate": 8.800428350617552e-07, "loss": 0.2482, "step": 6881 }, { "epoch": 5.792929292929293, "grad_norm": 0.3960490822792053, "learning_rate": 8.788539720779776e-07, "loss": 0.272, "step": 6882 }, { "epoch": 5.793771043771044, "grad_norm": 0.3943134844303131, "learning_rate": 8.776658352840661e-07, "loss": 0.2807, "step": 6883 }, { "epoch": 5.794612794612795, "grad_norm": 0.4267183840274811, "learning_rate": 8.764784248893859e-07, "loss": 0.267, "step": 6884 }, { "epoch": 5.795454545454545, "grad_norm": 0.40945979952812195, "learning_rate": 8.752917411031686e-07, "loss": 0.2882, "step": 6885 }, { "epoch": 5.796296296296296, "grad_norm": 0.39739248156547546, "learning_rate": 8.74105784134523e-07, "loss": 0.268, "step": 6886 }, { "epoch": 5.797138047138047, "grad_norm": 0.4070688784122467, "learning_rate": 8.729205541924263e-07, "loss": 0.2735, "step": 6887 }, { "epoch": 5.797979797979798, "grad_norm": 0.4176960587501526, "learning_rate": 8.717360514857271e-07, "loss": 0.2876, "step": 6888 }, { "epoch": 5.798821548821548, "grad_norm": 0.39368849992752075, "learning_rate": 8.7055227622315e-07, "loss": 0.2503, "step": 6889 }, { "epoch": 5.799663299663299, "grad_norm": 0.4050050973892212, "learning_rate": 8.693692286132871e-07, "loss": 0.2765, "step": 6890 }, { "epoch": 5.80050505050505, "grad_norm": 0.3743281960487366, "learning_rate": 8.681869088646039e-07, "loss": 0.2764, "step": 6891 }, { "epoch": 5.801346801346801, "grad_norm": 0.3909154534339905, "learning_rate": 8.670053171854381e-07, "loss": 0.3145, "step": 6892 }, { "epoch": 5.802188552188552, "grad_norm": 0.40544161200523376, "learning_rate": 8.658244537840005e-07, "loss": 0.2709, "step": 6893 }, { "epoch": 5.803030303030303, "grad_norm": 0.41173189878463745, "learning_rate": 8.646443188683701e-07, "loss": 0.2761, "step": 6894 }, { "epoch": 5.803872053872054, "grad_norm": 0.4165666401386261, "learning_rate": 8.634649126465e-07, "loss": 0.2684, "step": 6895 }, { "epoch": 5.8047138047138045, "grad_norm": 0.40542706847190857, "learning_rate": 8.622862353262118e-07, "loss": 0.2784, "step": 6896 }, { "epoch": 5.805555555555555, "grad_norm": 0.4103652536869049, "learning_rate": 8.611082871152032e-07, "loss": 0.2762, "step": 6897 }, { "epoch": 5.806397306397306, "grad_norm": 0.42590346932411194, "learning_rate": 8.599310682210416e-07, "loss": 0.2862, "step": 6898 }, { "epoch": 5.807239057239057, "grad_norm": 0.4109344780445099, "learning_rate": 8.587545788511653e-07, "loss": 0.2978, "step": 6899 }, { "epoch": 5.808080808080808, "grad_norm": 0.41468048095703125, "learning_rate": 8.575788192128826e-07, "loss": 0.2736, "step": 6900 }, { "epoch": 5.808922558922559, "grad_norm": 0.41475701332092285, "learning_rate": 8.564037895133759e-07, "loss": 0.2898, "step": 6901 }, { "epoch": 5.80976430976431, "grad_norm": 0.41126367449760437, "learning_rate": 8.552294899596991e-07, "loss": 0.2861, "step": 6902 }, { "epoch": 5.8106060606060606, "grad_norm": 0.4131387770175934, "learning_rate": 8.540559207587756e-07, "loss": 0.2776, "step": 6903 }, { "epoch": 5.811447811447811, "grad_norm": 0.4331498146057129, "learning_rate": 8.528830821174005e-07, "loss": 0.2745, "step": 6904 }, { "epoch": 5.812289562289562, "grad_norm": 0.43188512325286865, "learning_rate": 8.517109742422392e-07, "loss": 0.2758, "step": 6905 }, { "epoch": 5.813131313131313, "grad_norm": 0.42901432514190674, "learning_rate": 8.505395973398317e-07, "loss": 0.2938, "step": 6906 }, { "epoch": 5.813973063973064, "grad_norm": 0.4553045630455017, "learning_rate": 8.49368951616586e-07, "loss": 0.2749, "step": 6907 }, { "epoch": 5.814814814814815, "grad_norm": 0.40263432264328003, "learning_rate": 8.48199037278783e-07, "loss": 0.2729, "step": 6908 }, { "epoch": 5.815656565656566, "grad_norm": 0.38195595145225525, "learning_rate": 8.470298545325728e-07, "loss": 0.2658, "step": 6909 }, { "epoch": 5.816498316498317, "grad_norm": 0.40696752071380615, "learning_rate": 8.458614035839802e-07, "loss": 0.3028, "step": 6910 }, { "epoch": 5.8173400673400675, "grad_norm": 0.4710146486759186, "learning_rate": 8.446936846388976e-07, "loss": 0.2798, "step": 6911 }, { "epoch": 5.818181818181818, "grad_norm": 0.42588576674461365, "learning_rate": 8.435266979030876e-07, "loss": 0.2587, "step": 6912 }, { "epoch": 5.819023569023569, "grad_norm": 0.4060693085193634, "learning_rate": 8.42360443582187e-07, "loss": 0.2751, "step": 6913 }, { "epoch": 5.81986531986532, "grad_norm": 0.38842299580574036, "learning_rate": 8.411949218817045e-07, "loss": 0.2757, "step": 6914 }, { "epoch": 5.820707070707071, "grad_norm": 0.40385210514068604, "learning_rate": 8.400301330070149e-07, "loss": 0.2682, "step": 6915 }, { "epoch": 5.821548821548822, "grad_norm": 0.4008438289165497, "learning_rate": 8.388660771633678e-07, "loss": 0.2615, "step": 6916 }, { "epoch": 5.822390572390573, "grad_norm": 0.39346230030059814, "learning_rate": 8.377027545558796e-07, "loss": 0.2828, "step": 6917 }, { "epoch": 5.8232323232323235, "grad_norm": 0.43078580498695374, "learning_rate": 8.365401653895422e-07, "loss": 0.2668, "step": 6918 }, { "epoch": 5.824074074074074, "grad_norm": 0.40605875849723816, "learning_rate": 8.353783098692175e-07, "loss": 0.2868, "step": 6919 }, { "epoch": 5.824915824915825, "grad_norm": 0.41204211115837097, "learning_rate": 8.342171881996353e-07, "loss": 0.278, "step": 6920 }, { "epoch": 5.825757575757576, "grad_norm": 0.3989242613315582, "learning_rate": 8.330568005853968e-07, "loss": 0.2943, "step": 6921 }, { "epoch": 5.826599326599327, "grad_norm": 0.4242182672023773, "learning_rate": 8.318971472309761e-07, "loss": 0.2585, "step": 6922 }, { "epoch": 5.827441077441078, "grad_norm": 0.3926164209842682, "learning_rate": 8.30738228340715e-07, "loss": 0.2929, "step": 6923 }, { "epoch": 5.828282828282829, "grad_norm": 0.3920044004917145, "learning_rate": 8.2958004411883e-07, "loss": 0.2844, "step": 6924 }, { "epoch": 5.82912457912458, "grad_norm": 0.42778852581977844, "learning_rate": 8.284225947694035e-07, "loss": 0.2709, "step": 6925 }, { "epoch": 5.8299663299663305, "grad_norm": 0.4198792278766632, "learning_rate": 8.272658804963896e-07, "loss": 0.2561, "step": 6926 }, { "epoch": 5.83080808080808, "grad_norm": 0.41691580414772034, "learning_rate": 8.261099015036156e-07, "loss": 0.306, "step": 6927 }, { "epoch": 5.831649831649831, "grad_norm": 0.45649781823158264, "learning_rate": 8.249546579947759e-07, "loss": 0.2484, "step": 6928 }, { "epoch": 5.832491582491582, "grad_norm": 0.42662107944488525, "learning_rate": 8.238001501734383e-07, "loss": 0.2445, "step": 6929 }, { "epoch": 5.833333333333333, "grad_norm": 0.43199077248573303, "learning_rate": 8.226463782430372e-07, "loss": 0.2561, "step": 6930 }, { "epoch": 5.834175084175084, "grad_norm": 0.45528796315193176, "learning_rate": 8.214933424068822e-07, "loss": 0.2743, "step": 6931 }, { "epoch": 5.835016835016835, "grad_norm": 0.4290461242198944, "learning_rate": 8.203410428681491e-07, "loss": 0.2642, "step": 6932 }, { "epoch": 5.835858585858586, "grad_norm": 0.3971801996231079, "learning_rate": 8.191894798298844e-07, "loss": 0.2972, "step": 6933 }, { "epoch": 5.8367003367003365, "grad_norm": 0.40925344824790955, "learning_rate": 8.180386534950074e-07, "loss": 0.2939, "step": 6934 }, { "epoch": 5.837542087542087, "grad_norm": 0.41800805926322937, "learning_rate": 8.168885640663043e-07, "loss": 0.2668, "step": 6935 }, { "epoch": 5.838383838383838, "grad_norm": 0.41377899050712585, "learning_rate": 8.157392117464352e-07, "loss": 0.3017, "step": 6936 }, { "epoch": 5.839225589225589, "grad_norm": 0.40612882375717163, "learning_rate": 8.145905967379275e-07, "loss": 0.2836, "step": 6937 }, { "epoch": 5.84006734006734, "grad_norm": 0.39380404353141785, "learning_rate": 8.134427192431776e-07, "loss": 0.2747, "step": 6938 }, { "epoch": 5.840909090909091, "grad_norm": 0.43348127603530884, "learning_rate": 8.122955794644555e-07, "loss": 0.2971, "step": 6939 }, { "epoch": 5.841750841750842, "grad_norm": 0.41318565607070923, "learning_rate": 8.111491776039004e-07, "loss": 0.2777, "step": 6940 }, { "epoch": 5.842592592592593, "grad_norm": 0.4114929437637329, "learning_rate": 8.100035138635188e-07, "loss": 0.2451, "step": 6941 }, { "epoch": 5.843434343434343, "grad_norm": 0.4064035415649414, "learning_rate": 8.088585884451899e-07, "loss": 0.289, "step": 6942 }, { "epoch": 5.844276094276094, "grad_norm": 0.4249264895915985, "learning_rate": 8.0771440155066e-07, "loss": 0.2673, "step": 6943 }, { "epoch": 5.845117845117845, "grad_norm": 0.3997933864593506, "learning_rate": 8.06570953381548e-07, "loss": 0.2434, "step": 6944 }, { "epoch": 5.845959595959596, "grad_norm": 0.42677921056747437, "learning_rate": 8.05428244139343e-07, "loss": 0.2389, "step": 6945 }, { "epoch": 5.846801346801347, "grad_norm": 0.4172118604183197, "learning_rate": 8.042862740254015e-07, "loss": 0.2655, "step": 6946 }, { "epoch": 5.847643097643098, "grad_norm": 0.4133656322956085, "learning_rate": 8.031450432409499e-07, "loss": 0.275, "step": 6947 }, { "epoch": 5.848484848484849, "grad_norm": 0.4267559349536896, "learning_rate": 8.020045519870873e-07, "loss": 0.2645, "step": 6948 }, { "epoch": 5.8493265993265995, "grad_norm": 0.38962262868881226, "learning_rate": 8.008648004647779e-07, "loss": 0.2333, "step": 6949 }, { "epoch": 5.85016835016835, "grad_norm": 0.42186179757118225, "learning_rate": 7.99725788874861e-07, "loss": 0.2595, "step": 6950 }, { "epoch": 5.851010101010101, "grad_norm": 0.4226785898208618, "learning_rate": 7.985875174180391e-07, "loss": 0.2673, "step": 6951 }, { "epoch": 5.851851851851852, "grad_norm": 0.4231368601322174, "learning_rate": 7.974499862948915e-07, "loss": 0.2683, "step": 6952 }, { "epoch": 5.852693602693603, "grad_norm": 0.4108998775482178, "learning_rate": 7.963131957058612e-07, "loss": 0.2792, "step": 6953 }, { "epoch": 5.853535353535354, "grad_norm": 0.4391774833202362, "learning_rate": 7.951771458512631e-07, "loss": 0.2296, "step": 6954 }, { "epoch": 5.854377104377105, "grad_norm": 0.413052499294281, "learning_rate": 7.940418369312802e-07, "loss": 0.2539, "step": 6955 }, { "epoch": 5.8552188552188555, "grad_norm": 0.45027971267700195, "learning_rate": 7.929072691459666e-07, "loss": 0.2835, "step": 6956 }, { "epoch": 5.856060606060606, "grad_norm": 0.4289625883102417, "learning_rate": 7.917734426952478e-07, "loss": 0.2834, "step": 6957 }, { "epoch": 5.856902356902357, "grad_norm": 0.404479444026947, "learning_rate": 7.906403577789135e-07, "loss": 0.2681, "step": 6958 }, { "epoch": 5.857744107744107, "grad_norm": 0.4095902442932129, "learning_rate": 7.89508014596625e-07, "loss": 0.2446, "step": 6959 }, { "epoch": 5.858585858585858, "grad_norm": 0.46374860405921936, "learning_rate": 7.883764133479138e-07, "loss": 0.2633, "step": 6960 }, { "epoch": 5.859427609427609, "grad_norm": 0.4644778370857239, "learning_rate": 7.872455542321821e-07, "loss": 0.2721, "step": 6961 }, { "epoch": 5.86026936026936, "grad_norm": 0.4080750048160553, "learning_rate": 7.861154374486974e-07, "loss": 0.2714, "step": 6962 }, { "epoch": 5.861111111111111, "grad_norm": 0.46246930956840515, "learning_rate": 7.849860631965988e-07, "loss": 0.2504, "step": 6963 }, { "epoch": 5.861952861952862, "grad_norm": 0.4076506495475769, "learning_rate": 7.838574316748925e-07, "loss": 0.2776, "step": 6964 }, { "epoch": 5.8627946127946124, "grad_norm": 0.4199800193309784, "learning_rate": 7.827295430824572e-07, "loss": 0.265, "step": 6965 }, { "epoch": 5.863636363636363, "grad_norm": 0.43715888261795044, "learning_rate": 7.816023976180393e-07, "loss": 0.2519, "step": 6966 }, { "epoch": 5.864478114478114, "grad_norm": 0.400739848613739, "learning_rate": 7.804759954802532e-07, "loss": 0.2665, "step": 6967 }, { "epoch": 5.865319865319865, "grad_norm": 0.42034944891929626, "learning_rate": 7.793503368675814e-07, "loss": 0.2475, "step": 6968 }, { "epoch": 5.866161616161616, "grad_norm": 0.4286196529865265, "learning_rate": 7.782254219783797e-07, "loss": 0.2494, "step": 6969 }, { "epoch": 5.867003367003367, "grad_norm": 0.42115435004234314, "learning_rate": 7.771012510108688e-07, "loss": 0.2669, "step": 6970 }, { "epoch": 5.867845117845118, "grad_norm": 0.39632704854011536, "learning_rate": 7.759778241631388e-07, "loss": 0.2757, "step": 6971 }, { "epoch": 5.8686868686868685, "grad_norm": 0.42515793442726135, "learning_rate": 7.748551416331512e-07, "loss": 0.255, "step": 6972 }, { "epoch": 5.869528619528619, "grad_norm": 0.4006560146808624, "learning_rate": 7.737332036187323e-07, "loss": 0.261, "step": 6973 }, { "epoch": 5.87037037037037, "grad_norm": 0.41324499249458313, "learning_rate": 7.726120103175822e-07, "loss": 0.2494, "step": 6974 }, { "epoch": 5.871212121212121, "grad_norm": 0.42150062322616577, "learning_rate": 7.714915619272656e-07, "loss": 0.2685, "step": 6975 }, { "epoch": 5.872053872053872, "grad_norm": 0.44717031717300415, "learning_rate": 7.703718586452169e-07, "loss": 0.2922, "step": 6976 }, { "epoch": 5.872895622895623, "grad_norm": 0.40827488899230957, "learning_rate": 7.692529006687405e-07, "loss": 0.2664, "step": 6977 }, { "epoch": 5.873737373737374, "grad_norm": 0.4194709360599518, "learning_rate": 7.681346881950097e-07, "loss": 0.2701, "step": 6978 }, { "epoch": 5.874579124579125, "grad_norm": 0.4369869530200958, "learning_rate": 7.670172214210653e-07, "loss": 0.2895, "step": 6979 }, { "epoch": 5.875420875420875, "grad_norm": 0.40056470036506653, "learning_rate": 7.659005005438152e-07, "loss": 0.2759, "step": 6980 }, { "epoch": 5.876262626262626, "grad_norm": 0.41758671402931213, "learning_rate": 7.647845257600367e-07, "loss": 0.2679, "step": 6981 }, { "epoch": 5.877104377104377, "grad_norm": 0.39520519971847534, "learning_rate": 7.636692972663801e-07, "loss": 0.2639, "step": 6982 }, { "epoch": 5.877946127946128, "grad_norm": 0.4229337275028229, "learning_rate": 7.625548152593587e-07, "loss": 0.2728, "step": 6983 }, { "epoch": 5.878787878787879, "grad_norm": 0.40313848853111267, "learning_rate": 7.614410799353561e-07, "loss": 0.2663, "step": 6984 }, { "epoch": 5.87962962962963, "grad_norm": 0.4054330289363861, "learning_rate": 7.603280914906231e-07, "loss": 0.2689, "step": 6985 }, { "epoch": 5.880471380471381, "grad_norm": 0.4161555767059326, "learning_rate": 7.592158501212826e-07, "loss": 0.2825, "step": 6986 }, { "epoch": 5.8813131313131315, "grad_norm": 0.3822818398475647, "learning_rate": 7.581043560233208e-07, "loss": 0.2646, "step": 6987 }, { "epoch": 5.882154882154882, "grad_norm": 0.40750619769096375, "learning_rate": 7.569936093925972e-07, "loss": 0.3102, "step": 6988 }, { "epoch": 5.882996632996633, "grad_norm": 0.4557656943798065, "learning_rate": 7.558836104248352e-07, "loss": 0.2729, "step": 6989 }, { "epoch": 5.883838383838384, "grad_norm": 0.4101003110408783, "learning_rate": 7.547743593156303e-07, "loss": 0.2837, "step": 6990 }, { "epoch": 5.884680134680135, "grad_norm": 0.4779362380504608, "learning_rate": 7.536658562604438e-07, "loss": 0.2787, "step": 6991 }, { "epoch": 5.885521885521886, "grad_norm": 0.41989198327064514, "learning_rate": 7.525581014546035e-07, "loss": 0.265, "step": 6992 }, { "epoch": 5.886363636363637, "grad_norm": 0.4438842535018921, "learning_rate": 7.514510950933112e-07, "loss": 0.2494, "step": 6993 }, { "epoch": 5.8872053872053876, "grad_norm": 0.4081358313560486, "learning_rate": 7.503448373716305e-07, "loss": 0.2893, "step": 6994 }, { "epoch": 5.888047138047138, "grad_norm": 0.46739882230758667, "learning_rate": 7.492393284844973e-07, "loss": 0.2537, "step": 6995 }, { "epoch": 5.888888888888889, "grad_norm": 0.3973633646965027, "learning_rate": 7.481345686267144e-07, "loss": 0.2726, "step": 6996 }, { "epoch": 5.88973063973064, "grad_norm": 0.39723652601242065, "learning_rate": 7.470305579929498e-07, "loss": 0.2983, "step": 6997 }, { "epoch": 5.890572390572391, "grad_norm": 0.3982998728752136, "learning_rate": 7.459272967777437e-07, "loss": 0.2999, "step": 6998 }, { "epoch": 5.891414141414142, "grad_norm": 0.39019426703453064, "learning_rate": 7.448247851755031e-07, "loss": 0.2717, "step": 6999 }, { "epoch": 5.892255892255893, "grad_norm": 0.41276413202285767, "learning_rate": 7.437230233805021e-07, "loss": 0.2892, "step": 7000 }, { "epoch": 5.893097643097643, "grad_norm": 0.44260531663894653, "learning_rate": 7.42622011586882e-07, "loss": 0.2373, "step": 7001 }, { "epoch": 5.893939393939394, "grad_norm": 0.3949020504951477, "learning_rate": 7.415217499886523e-07, "loss": 0.2477, "step": 7002 }, { "epoch": 5.8947811447811445, "grad_norm": 0.4070816934108734, "learning_rate": 7.404222387796916e-07, "loss": 0.2521, "step": 7003 }, { "epoch": 5.895622895622895, "grad_norm": 0.4292494058609009, "learning_rate": 7.393234781537467e-07, "loss": 0.2656, "step": 7004 }, { "epoch": 5.896464646464646, "grad_norm": 0.44723907113075256, "learning_rate": 7.382254683044298e-07, "loss": 0.2874, "step": 7005 }, { "epoch": 5.897306397306397, "grad_norm": 0.4612472355365753, "learning_rate": 7.371282094252214e-07, "loss": 0.2523, "step": 7006 }, { "epoch": 5.898148148148148, "grad_norm": 0.3989807367324829, "learning_rate": 7.360317017094709e-07, "loss": 0.2709, "step": 7007 }, { "epoch": 5.898989898989899, "grad_norm": 0.40764984488487244, "learning_rate": 7.34935945350394e-07, "loss": 0.2828, "step": 7008 }, { "epoch": 5.89983164983165, "grad_norm": 0.4412793815135956, "learning_rate": 7.338409405410762e-07, "loss": 0.2527, "step": 7009 }, { "epoch": 5.9006734006734005, "grad_norm": 0.41914480924606323, "learning_rate": 7.327466874744682e-07, "loss": 0.3017, "step": 7010 }, { "epoch": 5.901515151515151, "grad_norm": 0.4102308750152588, "learning_rate": 7.316531863433873e-07, "loss": 0.2659, "step": 7011 }, { "epoch": 5.902356902356902, "grad_norm": 0.4466513395309448, "learning_rate": 7.305604373405228e-07, "loss": 0.281, "step": 7012 }, { "epoch": 5.903198653198653, "grad_norm": 0.42429083585739136, "learning_rate": 7.294684406584268e-07, "loss": 0.2874, "step": 7013 }, { "epoch": 5.904040404040404, "grad_norm": 0.41244617104530334, "learning_rate": 7.28377196489522e-07, "loss": 0.281, "step": 7014 }, { "epoch": 5.904882154882155, "grad_norm": 0.4354402422904968, "learning_rate": 7.272867050260962e-07, "loss": 0.2634, "step": 7015 }, { "epoch": 5.905723905723906, "grad_norm": 0.40783849358558655, "learning_rate": 7.261969664603069e-07, "loss": 0.2692, "step": 7016 }, { "epoch": 5.906565656565657, "grad_norm": 0.4588857889175415, "learning_rate": 7.251079809841771e-07, "loss": 0.2574, "step": 7017 }, { "epoch": 5.907407407407407, "grad_norm": 0.4102467894554138, "learning_rate": 7.240197487895978e-07, "loss": 0.2745, "step": 7018 }, { "epoch": 5.908249158249158, "grad_norm": 0.44467636942863464, "learning_rate": 7.229322700683239e-07, "loss": 0.2802, "step": 7019 }, { "epoch": 5.909090909090909, "grad_norm": 0.4338917136192322, "learning_rate": 7.218455450119865e-07, "loss": 0.2729, "step": 7020 }, { "epoch": 5.90993265993266, "grad_norm": 0.4199344515800476, "learning_rate": 7.207595738120749e-07, "loss": 0.2879, "step": 7021 }, { "epoch": 5.910774410774411, "grad_norm": 0.40493035316467285, "learning_rate": 7.19674356659949e-07, "loss": 0.2819, "step": 7022 }, { "epoch": 5.911616161616162, "grad_norm": 0.3927299380302429, "learning_rate": 7.185898937468349e-07, "loss": 0.2781, "step": 7023 }, { "epoch": 5.912457912457913, "grad_norm": 0.4083110988140106, "learning_rate": 7.175061852638277e-07, "loss": 0.2601, "step": 7024 }, { "epoch": 5.9132996632996635, "grad_norm": 0.3934904932975769, "learning_rate": 7.164232314018893e-07, "loss": 0.2746, "step": 7025 }, { "epoch": 5.914141414141414, "grad_norm": 0.4076911509037018, "learning_rate": 7.153410323518462e-07, "loss": 0.2977, "step": 7026 }, { "epoch": 5.914983164983165, "grad_norm": 0.44147372245788574, "learning_rate": 7.142595883043929e-07, "loss": 0.2531, "step": 7027 }, { "epoch": 5.915824915824916, "grad_norm": 0.4068685472011566, "learning_rate": 7.131788994500932e-07, "loss": 0.2625, "step": 7028 }, { "epoch": 5.916666666666667, "grad_norm": 0.4535170793533325, "learning_rate": 7.120989659793737e-07, "loss": 0.2466, "step": 7029 }, { "epoch": 5.917508417508418, "grad_norm": 0.44381970167160034, "learning_rate": 7.110197880825326e-07, "loss": 0.2674, "step": 7030 }, { "epoch": 5.918350168350169, "grad_norm": 0.40334269404411316, "learning_rate": 7.099413659497318e-07, "loss": 0.259, "step": 7031 }, { "epoch": 5.91919191919192, "grad_norm": 0.4225897789001465, "learning_rate": 7.088636997709991e-07, "loss": 0.2772, "step": 7032 }, { "epoch": 5.9200336700336695, "grad_norm": 0.4006548821926117, "learning_rate": 7.077867897362334e-07, "loss": 0.2596, "step": 7033 }, { "epoch": 5.92087542087542, "grad_norm": 0.39922574162483215, "learning_rate": 7.067106360351966e-07, "loss": 0.2772, "step": 7034 }, { "epoch": 5.921717171717171, "grad_norm": 0.42317917943000793, "learning_rate": 7.056352388575166e-07, "loss": 0.2745, "step": 7035 }, { "epoch": 5.922558922558922, "grad_norm": 0.4009222686290741, "learning_rate": 7.045605983926923e-07, "loss": 0.2666, "step": 7036 }, { "epoch": 5.923400673400673, "grad_norm": 0.39857548475265503, "learning_rate": 7.034867148300867e-07, "loss": 0.292, "step": 7037 }, { "epoch": 5.924242424242424, "grad_norm": 0.4011605978012085, "learning_rate": 7.024135883589289e-07, "loss": 0.2817, "step": 7038 }, { "epoch": 5.925084175084175, "grad_norm": 0.42929893732070923, "learning_rate": 7.013412191683156e-07, "loss": 0.2686, "step": 7039 }, { "epoch": 5.925925925925926, "grad_norm": 0.40138334035873413, "learning_rate": 7.002696074472076e-07, "loss": 0.2876, "step": 7040 }, { "epoch": 5.9267676767676765, "grad_norm": 0.4020006060600281, "learning_rate": 6.991987533844363e-07, "loss": 0.3002, "step": 7041 }, { "epoch": 5.927609427609427, "grad_norm": 0.3909180164337158, "learning_rate": 6.981286571686985e-07, "loss": 0.2671, "step": 7042 }, { "epoch": 5.928451178451178, "grad_norm": 0.38925930857658386, "learning_rate": 6.97059318988555e-07, "loss": 0.3008, "step": 7043 }, { "epoch": 5.929292929292929, "grad_norm": 0.4082745611667633, "learning_rate": 6.959907390324344e-07, "loss": 0.2239, "step": 7044 }, { "epoch": 5.93013468013468, "grad_norm": 0.42688316106796265, "learning_rate": 6.949229174886318e-07, "loss": 0.2789, "step": 7045 }, { "epoch": 5.930976430976431, "grad_norm": 0.4120953679084778, "learning_rate": 6.938558545453111e-07, "loss": 0.2867, "step": 7046 }, { "epoch": 5.931818181818182, "grad_norm": 0.39908766746520996, "learning_rate": 6.92789550390498e-07, "loss": 0.2665, "step": 7047 }, { "epoch": 5.9326599326599325, "grad_norm": 0.4093203842639923, "learning_rate": 6.917240052120878e-07, "loss": 0.2876, "step": 7048 }, { "epoch": 5.933501683501683, "grad_norm": 0.4379862844944, "learning_rate": 6.906592191978384e-07, "loss": 0.2589, "step": 7049 }, { "epoch": 5.934343434343434, "grad_norm": 0.418682336807251, "learning_rate": 6.895951925353794e-07, "loss": 0.2758, "step": 7050 }, { "epoch": 5.935185185185185, "grad_norm": 0.41738399863243103, "learning_rate": 6.885319254122019e-07, "loss": 0.2712, "step": 7051 }, { "epoch": 5.936026936026936, "grad_norm": 0.38996773958206177, "learning_rate": 6.87469418015666e-07, "loss": 0.2919, "step": 7052 }, { "epoch": 5.936868686868687, "grad_norm": 0.39549314975738525, "learning_rate": 6.864076705329959e-07, "loss": 0.2739, "step": 7053 }, { "epoch": 5.937710437710438, "grad_norm": 0.4225512146949768, "learning_rate": 6.85346683151284e-07, "loss": 0.2686, "step": 7054 }, { "epoch": 5.938552188552189, "grad_norm": 0.41432327032089233, "learning_rate": 6.842864560574869e-07, "loss": 0.2877, "step": 7055 }, { "epoch": 5.9393939393939394, "grad_norm": 0.4213102459907532, "learning_rate": 6.83226989438427e-07, "loss": 0.2712, "step": 7056 }, { "epoch": 5.94023569023569, "grad_norm": 0.3725331127643585, "learning_rate": 6.821682834807942e-07, "loss": 0.2987, "step": 7057 }, { "epoch": 5.941077441077441, "grad_norm": 0.4002169072628021, "learning_rate": 6.811103383711453e-07, "loss": 0.2666, "step": 7058 }, { "epoch": 5.941919191919192, "grad_norm": 0.40972742438316345, "learning_rate": 6.800531542959004e-07, "loss": 0.2827, "step": 7059 }, { "epoch": 5.942760942760943, "grad_norm": 0.3958880305290222, "learning_rate": 6.789967314413465e-07, "loss": 0.2654, "step": 7060 }, { "epoch": 5.943602693602694, "grad_norm": 0.38478225469589233, "learning_rate": 6.779410699936351e-07, "loss": 0.2949, "step": 7061 }, { "epoch": 5.944444444444445, "grad_norm": 0.4375174939632416, "learning_rate": 6.768861701387869e-07, "loss": 0.2596, "step": 7062 }, { "epoch": 5.9452861952861955, "grad_norm": 0.4083232581615448, "learning_rate": 6.758320320626871e-07, "loss": 0.2298, "step": 7063 }, { "epoch": 5.946127946127946, "grad_norm": 0.4011740982532501, "learning_rate": 6.74778655951085e-07, "loss": 0.2776, "step": 7064 }, { "epoch": 5.946969696969697, "grad_norm": 0.40346935391426086, "learning_rate": 6.737260419895953e-07, "loss": 0.2588, "step": 7065 }, { "epoch": 5.947811447811448, "grad_norm": 0.41041797399520874, "learning_rate": 6.726741903637024e-07, "loss": 0.2815, "step": 7066 }, { "epoch": 5.948653198653199, "grad_norm": 0.37255412340164185, "learning_rate": 6.716231012587515e-07, "loss": 0.2855, "step": 7067 }, { "epoch": 5.94949494949495, "grad_norm": 0.3891542851924896, "learning_rate": 6.705727748599584e-07, "loss": 0.2717, "step": 7068 }, { "epoch": 5.950336700336701, "grad_norm": 0.4396461248397827, "learning_rate": 6.695232113523997e-07, "loss": 0.2601, "step": 7069 }, { "epoch": 5.951178451178452, "grad_norm": 0.402810662984848, "learning_rate": 6.684744109210195e-07, "loss": 0.2772, "step": 7070 }, { "epoch": 5.952020202020202, "grad_norm": 0.42887255549430847, "learning_rate": 6.674263737506293e-07, "loss": 0.2725, "step": 7071 }, { "epoch": 5.952861952861953, "grad_norm": 0.40979263186454773, "learning_rate": 6.663791000259018e-07, "loss": 0.2888, "step": 7072 }, { "epoch": 5.953703703703704, "grad_norm": 0.421650230884552, "learning_rate": 6.653325899313812e-07, "loss": 0.2693, "step": 7073 }, { "epoch": 5.954545454545455, "grad_norm": 0.39915549755096436, "learning_rate": 6.642868436514705e-07, "loss": 0.2638, "step": 7074 }, { "epoch": 5.955387205387205, "grad_norm": 0.441167414188385, "learning_rate": 6.632418613704444e-07, "loss": 0.2949, "step": 7075 }, { "epoch": 5.956228956228956, "grad_norm": 0.40190380811691284, "learning_rate": 6.621976432724381e-07, "loss": 0.2666, "step": 7076 }, { "epoch": 5.957070707070707, "grad_norm": 0.41226255893707275, "learning_rate": 6.611541895414525e-07, "loss": 0.2818, "step": 7077 }, { "epoch": 5.957912457912458, "grad_norm": 0.43168285489082336, "learning_rate": 6.601115003613589e-07, "loss": 0.2821, "step": 7078 }, { "epoch": 5.9587542087542085, "grad_norm": 0.42067885398864746, "learning_rate": 6.590695759158866e-07, "loss": 0.2593, "step": 7079 }, { "epoch": 5.959595959595959, "grad_norm": 0.43072476983070374, "learning_rate": 6.580284163886369e-07, "loss": 0.2696, "step": 7080 }, { "epoch": 5.96043771043771, "grad_norm": 0.42443564534187317, "learning_rate": 6.56988021963072e-07, "loss": 0.2516, "step": 7081 }, { "epoch": 5.961279461279461, "grad_norm": 0.4144950211048126, "learning_rate": 6.559483928225191e-07, "loss": 0.2536, "step": 7082 }, { "epoch": 5.962121212121212, "grad_norm": 0.391508549451828, "learning_rate": 6.549095291501728e-07, "loss": 0.2677, "step": 7083 }, { "epoch": 5.962962962962963, "grad_norm": 0.42363953590393066, "learning_rate": 6.538714311290939e-07, "loss": 0.2545, "step": 7084 }, { "epoch": 5.963804713804714, "grad_norm": 0.4331723749637604, "learning_rate": 6.528340989422045e-07, "loss": 0.2698, "step": 7085 }, { "epoch": 5.9646464646464645, "grad_norm": 0.40517890453338623, "learning_rate": 6.517975327722936e-07, "loss": 0.2699, "step": 7086 }, { "epoch": 5.965488215488215, "grad_norm": 0.4287364184856415, "learning_rate": 6.507617328020149e-07, "loss": 0.2915, "step": 7087 }, { "epoch": 5.966329966329966, "grad_norm": 0.4033283591270447, "learning_rate": 6.497266992138879e-07, "loss": 0.2624, "step": 7088 }, { "epoch": 5.967171717171717, "grad_norm": 0.40149930119514465, "learning_rate": 6.486924321902971e-07, "loss": 0.2352, "step": 7089 }, { "epoch": 5.968013468013468, "grad_norm": 0.42741817235946655, "learning_rate": 6.476589319134913e-07, "loss": 0.2834, "step": 7090 }, { "epoch": 5.968855218855219, "grad_norm": 0.4352015256881714, "learning_rate": 6.466261985655825e-07, "loss": 0.2594, "step": 7091 }, { "epoch": 5.96969696969697, "grad_norm": 0.40200304985046387, "learning_rate": 6.455942323285518e-07, "loss": 0.2944, "step": 7092 }, { "epoch": 5.970538720538721, "grad_norm": 0.44436636567115784, "learning_rate": 6.445630333842406e-07, "loss": 0.2742, "step": 7093 }, { "epoch": 5.9713804713804715, "grad_norm": 0.40229156613349915, "learning_rate": 6.435326019143595e-07, "loss": 0.2824, "step": 7094 }, { "epoch": 5.972222222222222, "grad_norm": 0.3998977541923523, "learning_rate": 6.425029381004783e-07, "loss": 0.2667, "step": 7095 }, { "epoch": 5.973063973063973, "grad_norm": 0.42994657158851624, "learning_rate": 6.414740421240384e-07, "loss": 0.2689, "step": 7096 }, { "epoch": 5.973905723905724, "grad_norm": 0.42131561040878296, "learning_rate": 6.404459141663394e-07, "loss": 0.2718, "step": 7097 }, { "epoch": 5.974747474747475, "grad_norm": 0.41067537665367126, "learning_rate": 6.39418554408549e-07, "loss": 0.269, "step": 7098 }, { "epoch": 5.975589225589226, "grad_norm": 0.3795097768306732, "learning_rate": 6.383919630317004e-07, "loss": 0.2717, "step": 7099 }, { "epoch": 5.976430976430977, "grad_norm": 0.4166150689125061, "learning_rate": 6.373661402166876e-07, "loss": 0.2736, "step": 7100 }, { "epoch": 5.9772727272727275, "grad_norm": 0.4100440442562103, "learning_rate": 6.363410861442737e-07, "loss": 0.2775, "step": 7101 }, { "epoch": 5.978114478114478, "grad_norm": 0.427889347076416, "learning_rate": 6.353168009950833e-07, "loss": 0.2467, "step": 7102 }, { "epoch": 5.978956228956229, "grad_norm": 0.4362967014312744, "learning_rate": 6.342932849496047e-07, "loss": 0.2838, "step": 7103 }, { "epoch": 5.97979797979798, "grad_norm": 0.41887375712394714, "learning_rate": 6.332705381881948e-07, "loss": 0.2639, "step": 7104 }, { "epoch": 5.980639730639731, "grad_norm": 0.36246541142463684, "learning_rate": 6.322485608910722e-07, "loss": 0.2675, "step": 7105 }, { "epoch": 5.981481481481482, "grad_norm": 0.39203646779060364, "learning_rate": 6.3122735323832e-07, "loss": 0.313, "step": 7106 }, { "epoch": 5.982323232323233, "grad_norm": 0.41802191734313965, "learning_rate": 6.30206915409885e-07, "loss": 0.2365, "step": 7107 }, { "epoch": 5.983164983164983, "grad_norm": 0.3827756941318512, "learning_rate": 6.291872475855792e-07, "loss": 0.2755, "step": 7108 }, { "epoch": 5.9840067340067336, "grad_norm": 0.3898351192474365, "learning_rate": 6.281683499450792e-07, "loss": 0.2522, "step": 7109 }, { "epoch": 5.984848484848484, "grad_norm": 0.4120517671108246, "learning_rate": 6.271502226679266e-07, "loss": 0.2987, "step": 7110 }, { "epoch": 5.985690235690235, "grad_norm": 0.3966161012649536, "learning_rate": 6.261328659335258e-07, "loss": 0.2734, "step": 7111 }, { "epoch": 5.986531986531986, "grad_norm": 0.3843019902706146, "learning_rate": 6.251162799211447e-07, "loss": 0.29, "step": 7112 }, { "epoch": 5.987373737373737, "grad_norm": 0.4211755394935608, "learning_rate": 6.241004648099186e-07, "loss": 0.2388, "step": 7113 }, { "epoch": 5.988215488215488, "grad_norm": 0.39992639422416687, "learning_rate": 6.230854207788428e-07, "loss": 0.2728, "step": 7114 }, { "epoch": 5.989057239057239, "grad_norm": 0.4226056635379791, "learning_rate": 6.220711480067809e-07, "loss": 0.2516, "step": 7115 }, { "epoch": 5.98989898989899, "grad_norm": 0.3859221339225769, "learning_rate": 6.210576466724583e-07, "loss": 0.2617, "step": 7116 }, { "epoch": 5.9907407407407405, "grad_norm": 0.38352641463279724, "learning_rate": 6.200449169544625e-07, "loss": 0.2698, "step": 7117 }, { "epoch": 5.991582491582491, "grad_norm": 0.39528533816337585, "learning_rate": 6.1903295903125e-07, "loss": 0.2734, "step": 7118 }, { "epoch": 5.992424242424242, "grad_norm": 0.41073864698410034, "learning_rate": 6.180217730811372e-07, "loss": 0.2836, "step": 7119 }, { "epoch": 5.993265993265993, "grad_norm": 0.39146333932876587, "learning_rate": 6.170113592823057e-07, "loss": 0.3102, "step": 7120 }, { "epoch": 5.994107744107744, "grad_norm": 0.4090140759944916, "learning_rate": 6.160017178128008e-07, "loss": 0.2688, "step": 7121 }, { "epoch": 5.994949494949495, "grad_norm": 0.42989590764045715, "learning_rate": 6.149928488505347e-07, "loss": 0.2687, "step": 7122 }, { "epoch": 5.995791245791246, "grad_norm": 0.41797855496406555, "learning_rate": 6.139847525732784e-07, "loss": 0.2627, "step": 7123 }, { "epoch": 5.9966329966329965, "grad_norm": 0.41749173402786255, "learning_rate": 6.129774291586704e-07, "loss": 0.25, "step": 7124 }, { "epoch": 5.997474747474747, "grad_norm": 0.41564568877220154, "learning_rate": 6.119708787842088e-07, "loss": 0.2538, "step": 7125 }, { "epoch": 5.998316498316498, "grad_norm": 0.4176606237888336, "learning_rate": 6.109651016272632e-07, "loss": 0.2736, "step": 7126 }, { "epoch": 5.999158249158249, "grad_norm": 0.4327438771724701, "learning_rate": 6.099600978650599e-07, "loss": 0.2732, "step": 7127 }, { "epoch": 6.0, "grad_norm": 0.4238227903842926, "learning_rate": 6.08955867674692e-07, "loss": 0.2551, "step": 7128 }, { "epoch": 6.000841750841751, "grad_norm": 0.47833511233329773, "learning_rate": 6.079524112331136e-07, "loss": 0.2447, "step": 7129 }, { "epoch": 6.001683501683502, "grad_norm": 0.4526824355125427, "learning_rate": 6.069497287171456e-07, "loss": 0.2417, "step": 7130 }, { "epoch": 6.002525252525253, "grad_norm": 0.5180487632751465, "learning_rate": 6.059478203034725e-07, "loss": 0.2645, "step": 7131 }, { "epoch": 6.0033670033670035, "grad_norm": 0.4812469184398651, "learning_rate": 6.04946686168641e-07, "loss": 0.2303, "step": 7132 }, { "epoch": 6.004208754208754, "grad_norm": 0.467264860868454, "learning_rate": 6.039463264890594e-07, "loss": 0.2463, "step": 7133 }, { "epoch": 6.005050505050505, "grad_norm": 0.5343770980834961, "learning_rate": 6.029467414410039e-07, "loss": 0.2313, "step": 7134 }, { "epoch": 6.005892255892256, "grad_norm": 0.463469535112381, "learning_rate": 6.019479312006122e-07, "loss": 0.2286, "step": 7135 }, { "epoch": 6.006734006734007, "grad_norm": 0.4084986746311188, "learning_rate": 6.009498959438831e-07, "loss": 0.2372, "step": 7136 }, { "epoch": 6.007575757575758, "grad_norm": 0.470529168844223, "learning_rate": 5.999526358466834e-07, "loss": 0.2597, "step": 7137 }, { "epoch": 6.008417508417509, "grad_norm": 0.4702877402305603, "learning_rate": 5.989561510847392e-07, "loss": 0.2412, "step": 7138 }, { "epoch": 6.0092592592592595, "grad_norm": 0.42366278171539307, "learning_rate": 5.979604418336433e-07, "loss": 0.2282, "step": 7139 }, { "epoch": 6.01010101010101, "grad_norm": 0.41657912731170654, "learning_rate": 5.969655082688497e-07, "loss": 0.2156, "step": 7140 }, { "epoch": 6.010942760942761, "grad_norm": 0.44923460483551025, "learning_rate": 5.95971350565675e-07, "loss": 0.2178, "step": 7141 }, { "epoch": 6.011784511784512, "grad_norm": 0.4343542456626892, "learning_rate": 5.949779688993018e-07, "loss": 0.2221, "step": 7142 }, { "epoch": 6.012626262626263, "grad_norm": 0.4207797348499298, "learning_rate": 5.939853634447751e-07, "loss": 0.2488, "step": 7143 }, { "epoch": 6.013468013468014, "grad_norm": 0.46587663888931274, "learning_rate": 5.929935343770021e-07, "loss": 0.2447, "step": 7144 }, { "epoch": 6.014309764309765, "grad_norm": 0.42477142810821533, "learning_rate": 5.920024818707526e-07, "loss": 0.2622, "step": 7145 }, { "epoch": 6.015151515151516, "grad_norm": 0.4100874960422516, "learning_rate": 5.910122061006607e-07, "loss": 0.2514, "step": 7146 }, { "epoch": 6.015993265993266, "grad_norm": 0.3833913207054138, "learning_rate": 5.900227072412246e-07, "loss": 0.2185, "step": 7147 }, { "epoch": 6.016835016835016, "grad_norm": 0.39591801166534424, "learning_rate": 5.890339854668042e-07, "loss": 0.263, "step": 7148 }, { "epoch": 6.017676767676767, "grad_norm": 0.42419150471687317, "learning_rate": 5.880460409516237e-07, "loss": 0.2414, "step": 7149 }, { "epoch": 6.018518518518518, "grad_norm": 0.4826347827911377, "learning_rate": 5.870588738697669e-07, "loss": 0.2495, "step": 7150 }, { "epoch": 6.019360269360269, "grad_norm": 0.4017678499221802, "learning_rate": 5.860724843951859e-07, "loss": 0.2333, "step": 7151 }, { "epoch": 6.02020202020202, "grad_norm": 0.43384209275245667, "learning_rate": 5.850868727016906e-07, "loss": 0.2327, "step": 7152 }, { "epoch": 6.021043771043771, "grad_norm": 0.42787495255470276, "learning_rate": 5.841020389629593e-07, "loss": 0.2321, "step": 7153 }, { "epoch": 6.021885521885522, "grad_norm": 0.41407206654548645, "learning_rate": 5.83117983352528e-07, "loss": 0.2379, "step": 7154 }, { "epoch": 6.0227272727272725, "grad_norm": 0.43042317032814026, "learning_rate": 5.821347060437971e-07, "loss": 0.2431, "step": 7155 }, { "epoch": 6.023569023569023, "grad_norm": 0.42455238103866577, "learning_rate": 5.811522072100328e-07, "loss": 0.2564, "step": 7156 }, { "epoch": 6.024410774410774, "grad_norm": 0.3751172721385956, "learning_rate": 5.801704870243602e-07, "loss": 0.2613, "step": 7157 }, { "epoch": 6.025252525252525, "grad_norm": 0.3952430188655853, "learning_rate": 5.791895456597701e-07, "loss": 0.2859, "step": 7158 }, { "epoch": 6.026094276094276, "grad_norm": 0.4088071882724762, "learning_rate": 5.782093832891134e-07, "loss": 0.26, "step": 7159 }, { "epoch": 6.026936026936027, "grad_norm": 0.40810441970825195, "learning_rate": 5.772300000851066e-07, "loss": 0.2679, "step": 7160 }, { "epoch": 6.027777777777778, "grad_norm": 0.4046202600002289, "learning_rate": 5.76251396220327e-07, "loss": 0.2475, "step": 7161 }, { "epoch": 6.0286195286195285, "grad_norm": 0.45017752051353455, "learning_rate": 5.752735718672137e-07, "loss": 0.2329, "step": 7162 }, { "epoch": 6.029461279461279, "grad_norm": 0.44180041551589966, "learning_rate": 5.742965271980711e-07, "loss": 0.2309, "step": 7163 }, { "epoch": 6.03030303030303, "grad_norm": 0.4137255847454071, "learning_rate": 5.733202623850653e-07, "loss": 0.2313, "step": 7164 }, { "epoch": 6.031144781144781, "grad_norm": 0.39220505952835083, "learning_rate": 5.723447776002244e-07, "loss": 0.2126, "step": 7165 }, { "epoch": 6.031986531986532, "grad_norm": 0.399101585149765, "learning_rate": 5.713700730154387e-07, "loss": 0.2368, "step": 7166 }, { "epoch": 6.032828282828283, "grad_norm": 0.40961048007011414, "learning_rate": 5.703961488024607e-07, "loss": 0.2409, "step": 7167 }, { "epoch": 6.033670033670034, "grad_norm": 0.3752419054508209, "learning_rate": 5.694230051329069e-07, "loss": 0.233, "step": 7168 }, { "epoch": 6.034511784511785, "grad_norm": 0.3838401436805725, "learning_rate": 5.684506421782571e-07, "loss": 0.2655, "step": 7169 }, { "epoch": 6.0353535353535355, "grad_norm": 0.40501388907432556, "learning_rate": 5.674790601098507e-07, "loss": 0.2367, "step": 7170 }, { "epoch": 6.036195286195286, "grad_norm": 0.3698711395263672, "learning_rate": 5.665082590988901e-07, "loss": 0.2332, "step": 7171 }, { "epoch": 6.037037037037037, "grad_norm": 0.40782880783081055, "learning_rate": 5.655382393164421e-07, "loss": 0.2352, "step": 7172 }, { "epoch": 6.037878787878788, "grad_norm": 0.4067365229129791, "learning_rate": 5.645690009334331e-07, "loss": 0.2187, "step": 7173 }, { "epoch": 6.038720538720539, "grad_norm": 0.42370349168777466, "learning_rate": 5.636005441206555e-07, "loss": 0.2439, "step": 7174 }, { "epoch": 6.03956228956229, "grad_norm": 0.40007638931274414, "learning_rate": 5.626328690487603e-07, "loss": 0.2394, "step": 7175 }, { "epoch": 6.040404040404041, "grad_norm": 0.41095688939094543, "learning_rate": 5.616659758882609e-07, "loss": 0.2385, "step": 7176 }, { "epoch": 6.0412457912457915, "grad_norm": 0.4107759892940521, "learning_rate": 5.606998648095368e-07, "loss": 0.2225, "step": 7177 }, { "epoch": 6.042087542087542, "grad_norm": 0.4452713131904602, "learning_rate": 5.597345359828243e-07, "loss": 0.2463, "step": 7178 }, { "epoch": 6.042929292929293, "grad_norm": 0.4053027629852295, "learning_rate": 5.587699895782278e-07, "loss": 0.2483, "step": 7179 }, { "epoch": 6.043771043771044, "grad_norm": 0.412474125623703, "learning_rate": 5.578062257657074e-07, "loss": 0.2322, "step": 7180 }, { "epoch": 6.044612794612795, "grad_norm": 0.425698846578598, "learning_rate": 5.568432447150912e-07, "loss": 0.2392, "step": 7181 }, { "epoch": 6.045454545454546, "grad_norm": 0.41794589161872864, "learning_rate": 5.558810465960657e-07, "loss": 0.1988, "step": 7182 }, { "epoch": 6.046296296296297, "grad_norm": 0.4527900516986847, "learning_rate": 5.549196315781807e-07, "loss": 0.2149, "step": 7183 }, { "epoch": 6.047138047138047, "grad_norm": 0.42557916045188904, "learning_rate": 5.539589998308453e-07, "loss": 0.2259, "step": 7184 }, { "epoch": 6.047979797979798, "grad_norm": 0.4112907648086548, "learning_rate": 5.529991515233368e-07, "loss": 0.237, "step": 7185 }, { "epoch": 6.048821548821548, "grad_norm": 0.42145130038261414, "learning_rate": 5.5204008682479e-07, "loss": 0.2456, "step": 7186 }, { "epoch": 6.049663299663299, "grad_norm": 0.3819314241409302, "learning_rate": 5.510818059042005e-07, "loss": 0.2539, "step": 7187 }, { "epoch": 6.05050505050505, "grad_norm": 0.4101177752017975, "learning_rate": 5.501243089304287e-07, "loss": 0.2714, "step": 7188 }, { "epoch": 6.051346801346801, "grad_norm": 0.4176977574825287, "learning_rate": 5.491675960721948e-07, "loss": 0.2504, "step": 7189 }, { "epoch": 6.052188552188552, "grad_norm": 0.4027070105075836, "learning_rate": 5.482116674980841e-07, "loss": 0.2611, "step": 7190 }, { "epoch": 6.053030303030303, "grad_norm": 0.408318430185318, "learning_rate": 5.472565233765398e-07, "loss": 0.2546, "step": 7191 }, { "epoch": 6.053872053872054, "grad_norm": 0.3925786316394806, "learning_rate": 5.463021638758687e-07, "loss": 0.2342, "step": 7192 }, { "epoch": 6.0547138047138045, "grad_norm": 0.40214210748672485, "learning_rate": 5.45348589164238e-07, "loss": 0.2383, "step": 7193 }, { "epoch": 6.055555555555555, "grad_norm": 0.4056279957294464, "learning_rate": 5.443957994096787e-07, "loss": 0.2486, "step": 7194 }, { "epoch": 6.056397306397306, "grad_norm": 0.4351408779621124, "learning_rate": 5.434437947800841e-07, "loss": 0.2504, "step": 7195 }, { "epoch": 6.057239057239057, "grad_norm": 0.4248307943344116, "learning_rate": 5.424925754432059e-07, "loss": 0.2347, "step": 7196 }, { "epoch": 6.058080808080808, "grad_norm": 0.41833072900772095, "learning_rate": 5.415421415666577e-07, "loss": 0.2541, "step": 7197 }, { "epoch": 6.058922558922559, "grad_norm": 0.4012230634689331, "learning_rate": 5.405924933179197e-07, "loss": 0.2557, "step": 7198 }, { "epoch": 6.05976430976431, "grad_norm": 0.44609683752059937, "learning_rate": 5.396436308643271e-07, "loss": 0.2397, "step": 7199 }, { "epoch": 6.0606060606060606, "grad_norm": 0.4170631468296051, "learning_rate": 5.386955543730799e-07, "loss": 0.2479, "step": 7200 }, { "epoch": 6.061447811447811, "grad_norm": 0.3976655602455139, "learning_rate": 5.377482640112397e-07, "loss": 0.2149, "step": 7201 }, { "epoch": 6.062289562289562, "grad_norm": 0.4179539978504181, "learning_rate": 5.368017599457304e-07, "loss": 0.2591, "step": 7202 }, { "epoch": 6.063131313131313, "grad_norm": 0.42081862688064575, "learning_rate": 5.358560423433351e-07, "loss": 0.2449, "step": 7203 }, { "epoch": 6.063973063973064, "grad_norm": 0.40088126063346863, "learning_rate": 5.349111113706989e-07, "loss": 0.232, "step": 7204 }, { "epoch": 6.064814814814815, "grad_norm": 0.3805214762687683, "learning_rate": 5.339669671943288e-07, "loss": 0.2214, "step": 7205 }, { "epoch": 6.065656565656566, "grad_norm": 0.4010111093521118, "learning_rate": 5.33023609980593e-07, "loss": 0.2489, "step": 7206 }, { "epoch": 6.066498316498317, "grad_norm": 0.3892325460910797, "learning_rate": 5.320810398957221e-07, "loss": 0.2573, "step": 7207 }, { "epoch": 6.0673400673400675, "grad_norm": 0.42840811610221863, "learning_rate": 5.311392571058066e-07, "loss": 0.2474, "step": 7208 }, { "epoch": 6.068181818181818, "grad_norm": 0.4280456304550171, "learning_rate": 5.301982617767976e-07, "loss": 0.2295, "step": 7209 }, { "epoch": 6.069023569023569, "grad_norm": 0.4377499520778656, "learning_rate": 5.29258054074509e-07, "loss": 0.2408, "step": 7210 }, { "epoch": 6.06986531986532, "grad_norm": 0.3983360826969147, "learning_rate": 5.283186341646169e-07, "loss": 0.246, "step": 7211 }, { "epoch": 6.070707070707071, "grad_norm": 0.4316233694553375, "learning_rate": 5.273800022126557e-07, "loss": 0.2266, "step": 7212 }, { "epoch": 6.071548821548822, "grad_norm": 0.41848519444465637, "learning_rate": 5.264421583840229e-07, "loss": 0.2046, "step": 7213 }, { "epoch": 6.072390572390573, "grad_norm": 0.4208781123161316, "learning_rate": 5.25505102843975e-07, "loss": 0.26, "step": 7214 }, { "epoch": 6.0732323232323235, "grad_norm": 0.4201929569244385, "learning_rate": 5.24568835757634e-07, "loss": 0.2311, "step": 7215 }, { "epoch": 6.074074074074074, "grad_norm": 0.4107663333415985, "learning_rate": 5.236333572899771e-07, "loss": 0.2386, "step": 7216 }, { "epoch": 6.074915824915825, "grad_norm": 0.39625295996665955, "learning_rate": 5.226986676058482e-07, "loss": 0.2152, "step": 7217 }, { "epoch": 6.075757575757576, "grad_norm": 0.4432482421398163, "learning_rate": 5.217647668699471e-07, "loss": 0.24, "step": 7218 }, { "epoch": 6.076599326599327, "grad_norm": 0.4278867840766907, "learning_rate": 5.208316552468401e-07, "loss": 0.234, "step": 7219 }, { "epoch": 6.077441077441078, "grad_norm": 0.39436501264572144, "learning_rate": 5.198993329009495e-07, "loss": 0.2489, "step": 7220 }, { "epoch": 6.078282828282829, "grad_norm": 0.41042086482048035, "learning_rate": 5.189677999965598e-07, "loss": 0.2277, "step": 7221 }, { "epoch": 6.079124579124579, "grad_norm": 0.4012400507926941, "learning_rate": 5.180370566978176e-07, "loss": 0.2488, "step": 7222 }, { "epoch": 6.07996632996633, "grad_norm": 0.4389212727546692, "learning_rate": 5.171071031687313e-07, "loss": 0.2285, "step": 7223 }, { "epoch": 6.08080808080808, "grad_norm": 0.3954675495624542, "learning_rate": 5.16177939573167e-07, "loss": 0.2372, "step": 7224 }, { "epoch": 6.081649831649831, "grad_norm": 0.40544256567955017, "learning_rate": 5.152495660748541e-07, "loss": 0.2574, "step": 7225 }, { "epoch": 6.082491582491582, "grad_norm": 0.436315655708313, "learning_rate": 5.143219828373808e-07, "loss": 0.232, "step": 7226 }, { "epoch": 6.083333333333333, "grad_norm": 0.4498485028743744, "learning_rate": 5.133951900241973e-07, "loss": 0.2646, "step": 7227 }, { "epoch": 6.084175084175084, "grad_norm": 0.4266524612903595, "learning_rate": 5.12469187798616e-07, "loss": 0.2328, "step": 7228 }, { "epoch": 6.085016835016835, "grad_norm": 0.4272833466529846, "learning_rate": 5.115439763238072e-07, "loss": 0.2529, "step": 7229 }, { "epoch": 6.085858585858586, "grad_norm": 0.45750710368156433, "learning_rate": 5.106195557628024e-07, "loss": 0.2698, "step": 7230 }, { "epoch": 6.0867003367003365, "grad_norm": 0.4075126349925995, "learning_rate": 5.096959262784962e-07, "loss": 0.2586, "step": 7231 }, { "epoch": 6.087542087542087, "grad_norm": 0.4170761704444885, "learning_rate": 5.087730880336395e-07, "loss": 0.2265, "step": 7232 }, { "epoch": 6.088383838383838, "grad_norm": 0.4170016646385193, "learning_rate": 5.078510411908483e-07, "loss": 0.2704, "step": 7233 }, { "epoch": 6.089225589225589, "grad_norm": 0.4686491787433624, "learning_rate": 5.069297859125966e-07, "loss": 0.2456, "step": 7234 }, { "epoch": 6.09006734006734, "grad_norm": 0.46001726388931274, "learning_rate": 5.060093223612178e-07, "loss": 0.2389, "step": 7235 }, { "epoch": 6.090909090909091, "grad_norm": 0.41899991035461426, "learning_rate": 5.050896506989106e-07, "loss": 0.2464, "step": 7236 }, { "epoch": 6.091750841750842, "grad_norm": 0.46039849519729614, "learning_rate": 5.041707710877275e-07, "loss": 0.2452, "step": 7237 }, { "epoch": 6.092592592592593, "grad_norm": 0.4026279151439667, "learning_rate": 5.032526836895873e-07, "loss": 0.2587, "step": 7238 }, { "epoch": 6.093434343434343, "grad_norm": 0.4543323218822479, "learning_rate": 5.023353886662652e-07, "loss": 0.2224, "step": 7239 }, { "epoch": 6.094276094276094, "grad_norm": 0.4565984308719635, "learning_rate": 5.014188861794001e-07, "loss": 0.2059, "step": 7240 }, { "epoch": 6.095117845117845, "grad_norm": 0.4500363767147064, "learning_rate": 5.005031763904883e-07, "loss": 0.2401, "step": 7241 }, { "epoch": 6.095959595959596, "grad_norm": 0.4613000452518463, "learning_rate": 4.99588259460887e-07, "loss": 0.2369, "step": 7242 }, { "epoch": 6.096801346801347, "grad_norm": 0.4065949618816376, "learning_rate": 4.986741355518165e-07, "loss": 0.263, "step": 7243 }, { "epoch": 6.097643097643098, "grad_norm": 0.4298391342163086, "learning_rate": 4.977608048243526e-07, "loss": 0.2369, "step": 7244 }, { "epoch": 6.098484848484849, "grad_norm": 0.41760849952697754, "learning_rate": 4.96848267439436e-07, "loss": 0.252, "step": 7245 }, { "epoch": 6.0993265993265995, "grad_norm": 0.42789116501808167, "learning_rate": 4.959365235578645e-07, "loss": 0.2332, "step": 7246 }, { "epoch": 6.10016835016835, "grad_norm": 0.42778587341308594, "learning_rate": 4.95025573340297e-07, "loss": 0.273, "step": 7247 }, { "epoch": 6.101010101010101, "grad_norm": 0.43415504693984985, "learning_rate": 4.941154169472523e-07, "loss": 0.2493, "step": 7248 }, { "epoch": 6.101851851851852, "grad_norm": 0.42061832547187805, "learning_rate": 4.932060545391115e-07, "loss": 0.2209, "step": 7249 }, { "epoch": 6.102693602693603, "grad_norm": 0.40613386034965515, "learning_rate": 4.922974862761126e-07, "loss": 0.2059, "step": 7250 }, { "epoch": 6.103535353535354, "grad_norm": 0.4550744891166687, "learning_rate": 4.913897123183553e-07, "loss": 0.2352, "step": 7251 }, { "epoch": 6.104377104377105, "grad_norm": 0.4003440737724304, "learning_rate": 4.904827328257978e-07, "loss": 0.2798, "step": 7252 }, { "epoch": 6.1052188552188555, "grad_norm": 0.4689430594444275, "learning_rate": 4.89576547958261e-07, "loss": 0.216, "step": 7253 }, { "epoch": 6.106060606060606, "grad_norm": 0.4087640345096588, "learning_rate": 4.886711578754244e-07, "loss": 0.2546, "step": 7254 }, { "epoch": 6.106902356902357, "grad_norm": 0.40160027146339417, "learning_rate": 4.877665627368272e-07, "loss": 0.2703, "step": 7255 }, { "epoch": 6.107744107744108, "grad_norm": 0.3843595087528229, "learning_rate": 4.868627627018673e-07, "loss": 0.2496, "step": 7256 }, { "epoch": 6.108585858585859, "grad_norm": 0.38632407784461975, "learning_rate": 4.859597579298064e-07, "loss": 0.2373, "step": 7257 }, { "epoch": 6.109427609427609, "grad_norm": 0.41235294938087463, "learning_rate": 4.85057548579761e-07, "loss": 0.2497, "step": 7258 }, { "epoch": 6.11026936026936, "grad_norm": 0.4419651925563812, "learning_rate": 4.841561348107121e-07, "loss": 0.2444, "step": 7259 }, { "epoch": 6.111111111111111, "grad_norm": 0.43520936369895935, "learning_rate": 4.832555167814967e-07, "loss": 0.2199, "step": 7260 }, { "epoch": 6.111952861952862, "grad_norm": 0.38874930143356323, "learning_rate": 4.823556946508152e-07, "loss": 0.241, "step": 7261 }, { "epoch": 6.1127946127946124, "grad_norm": 0.4456747770309448, "learning_rate": 4.814566685772248e-07, "loss": 0.2592, "step": 7262 }, { "epoch": 6.113636363636363, "grad_norm": 0.4092562198638916, "learning_rate": 4.805584387191437e-07, "loss": 0.2227, "step": 7263 }, { "epoch": 6.114478114478114, "grad_norm": 0.40410473942756653, "learning_rate": 4.796610052348483e-07, "loss": 0.254, "step": 7264 }, { "epoch": 6.115319865319865, "grad_norm": 0.4257497191429138, "learning_rate": 4.787643682824772e-07, "loss": 0.254, "step": 7265 }, { "epoch": 6.116161616161616, "grad_norm": 0.40155383944511414, "learning_rate": 4.778685280200273e-07, "loss": 0.2627, "step": 7266 }, { "epoch": 6.117003367003367, "grad_norm": 0.4059487283229828, "learning_rate": 4.769734846053558e-07, "loss": 0.2307, "step": 7267 }, { "epoch": 6.117845117845118, "grad_norm": 0.37906306982040405, "learning_rate": 4.7607923819617765e-07, "loss": 0.2561, "step": 7268 }, { "epoch": 6.1186868686868685, "grad_norm": 0.39699527621269226, "learning_rate": 4.751857889500683e-07, "loss": 0.232, "step": 7269 }, { "epoch": 6.119528619528619, "grad_norm": 0.40166524052619934, "learning_rate": 4.742931370244652e-07, "loss": 0.243, "step": 7270 }, { "epoch": 6.12037037037037, "grad_norm": 0.4253845512866974, "learning_rate": 4.734012825766621e-07, "loss": 0.2476, "step": 7271 }, { "epoch": 6.121212121212121, "grad_norm": 0.42506271600723267, "learning_rate": 4.7251022576381255e-07, "loss": 0.2378, "step": 7272 }, { "epoch": 6.122053872053872, "grad_norm": 0.38228893280029297, "learning_rate": 4.7161996674292997e-07, "loss": 0.2487, "step": 7273 }, { "epoch": 6.122895622895623, "grad_norm": 0.432345449924469, "learning_rate": 4.707305056708883e-07, "loss": 0.2183, "step": 7274 }, { "epoch": 6.123737373737374, "grad_norm": 0.40017619729042053, "learning_rate": 4.69841842704421e-07, "loss": 0.2341, "step": 7275 }, { "epoch": 6.124579124579125, "grad_norm": 0.436685711145401, "learning_rate": 4.6895397800011843e-07, "loss": 0.228, "step": 7276 }, { "epoch": 6.125420875420875, "grad_norm": 0.4576675295829773, "learning_rate": 4.6806691171443206e-07, "loss": 0.2277, "step": 7277 }, { "epoch": 6.126262626262626, "grad_norm": 0.44273409247398376, "learning_rate": 4.6718064400367304e-07, "loss": 0.2581, "step": 7278 }, { "epoch": 6.127104377104377, "grad_norm": 0.4356876313686371, "learning_rate": 4.662951750240113e-07, "loss": 0.2383, "step": 7279 }, { "epoch": 6.127946127946128, "grad_norm": 0.4118693172931671, "learning_rate": 4.654105049314744e-07, "loss": 0.2417, "step": 7280 }, { "epoch": 6.128787878787879, "grad_norm": 0.4245947003364563, "learning_rate": 4.6452663388195263e-07, "loss": 0.2568, "step": 7281 }, { "epoch": 6.12962962962963, "grad_norm": 0.4289191663265228, "learning_rate": 4.6364356203119134e-07, "loss": 0.2531, "step": 7282 }, { "epoch": 6.130471380471381, "grad_norm": 0.4089781641960144, "learning_rate": 4.6276128953479936e-07, "loss": 0.2227, "step": 7283 }, { "epoch": 6.1313131313131315, "grad_norm": 0.39958274364471436, "learning_rate": 4.618798165482419e-07, "loss": 0.2396, "step": 7284 }, { "epoch": 6.132154882154882, "grad_norm": 0.41689640283584595, "learning_rate": 4.609991432268429e-07, "loss": 0.2038, "step": 7285 }, { "epoch": 6.132996632996633, "grad_norm": 0.41188856959342957, "learning_rate": 4.601192697257867e-07, "loss": 0.2373, "step": 7286 }, { "epoch": 6.133838383838384, "grad_norm": 0.4057807922363281, "learning_rate": 4.592401962001175e-07, "loss": 0.2524, "step": 7287 }, { "epoch": 6.134680134680135, "grad_norm": 0.4025174379348755, "learning_rate": 4.58361922804737e-07, "loss": 0.2356, "step": 7288 }, { "epoch": 6.135521885521886, "grad_norm": 0.4134717583656311, "learning_rate": 4.5748444969440585e-07, "loss": 0.2367, "step": 7289 }, { "epoch": 6.136363636363637, "grad_norm": 0.396721750497818, "learning_rate": 4.566077770237426e-07, "loss": 0.2721, "step": 7290 }, { "epoch": 6.1372053872053876, "grad_norm": 0.3935607969760895, "learning_rate": 4.5573190494722974e-07, "loss": 0.244, "step": 7291 }, { "epoch": 6.138047138047138, "grad_norm": 0.4591834843158722, "learning_rate": 4.5485683361920383e-07, "loss": 0.2442, "step": 7292 }, { "epoch": 6.138888888888889, "grad_norm": 0.4284266233444214, "learning_rate": 4.5398256319386104e-07, "loss": 0.2344, "step": 7293 }, { "epoch": 6.13973063973064, "grad_norm": 0.4137808680534363, "learning_rate": 4.531090938252575e-07, "loss": 0.2613, "step": 7294 }, { "epoch": 6.140572390572391, "grad_norm": 0.42010971903800964, "learning_rate": 4.522364256673084e-07, "loss": 0.2331, "step": 7295 }, { "epoch": 6.141414141414141, "grad_norm": 0.5047738552093506, "learning_rate": 4.5136455887378585e-07, "loss": 0.2403, "step": 7296 }, { "epoch": 6.142255892255892, "grad_norm": 0.4323597848415375, "learning_rate": 4.504934935983235e-07, "loss": 0.2633, "step": 7297 }, { "epoch": 6.143097643097643, "grad_norm": 0.3944590389728546, "learning_rate": 4.496232299944114e-07, "loss": 0.2465, "step": 7298 }, { "epoch": 6.143939393939394, "grad_norm": 0.4135618805885315, "learning_rate": 4.4875376821540027e-07, "loss": 0.2021, "step": 7299 }, { "epoch": 6.1447811447811445, "grad_norm": 0.4311356544494629, "learning_rate": 4.4788510841449807e-07, "loss": 0.2636, "step": 7300 }, { "epoch": 6.145622895622895, "grad_norm": 0.39583107829093933, "learning_rate": 4.470172507447706e-07, "loss": 0.2284, "step": 7301 }, { "epoch": 6.146464646464646, "grad_norm": 0.4400799870491028, "learning_rate": 4.461501953591463e-07, "loss": 0.247, "step": 7302 }, { "epoch": 6.147306397306397, "grad_norm": 0.3780968189239502, "learning_rate": 4.452839424104066e-07, "loss": 0.2501, "step": 7303 }, { "epoch": 6.148148148148148, "grad_norm": 0.4126974940299988, "learning_rate": 4.4441849205119736e-07, "loss": 0.2361, "step": 7304 }, { "epoch": 6.148989898989899, "grad_norm": 0.3763626217842102, "learning_rate": 4.435538444340187e-07, "loss": 0.2402, "step": 7305 }, { "epoch": 6.14983164983165, "grad_norm": 0.40503990650177, "learning_rate": 4.4268999971122995e-07, "loss": 0.2625, "step": 7306 }, { "epoch": 6.1506734006734005, "grad_norm": 0.42657482624053955, "learning_rate": 4.418269580350504e-07, "loss": 0.2456, "step": 7307 }, { "epoch": 6.151515151515151, "grad_norm": 0.4113866984844208, "learning_rate": 4.409647195575584e-07, "loss": 0.2393, "step": 7308 }, { "epoch": 6.152356902356902, "grad_norm": 0.41922637820243835, "learning_rate": 4.401032844306891e-07, "loss": 0.2289, "step": 7309 }, { "epoch": 6.153198653198653, "grad_norm": 0.4294556975364685, "learning_rate": 4.3924265280623544e-07, "loss": 0.2539, "step": 7310 }, { "epoch": 6.154040404040404, "grad_norm": 0.436189204454422, "learning_rate": 4.3838282483585017e-07, "loss": 0.2323, "step": 7311 }, { "epoch": 6.154882154882155, "grad_norm": 0.4581568241119385, "learning_rate": 4.3752380067104415e-07, "loss": 0.2608, "step": 7312 }, { "epoch": 6.155723905723906, "grad_norm": 0.412991464138031, "learning_rate": 4.3666558046318754e-07, "loss": 0.2291, "step": 7313 }, { "epoch": 6.156565656565657, "grad_norm": 0.4331604242324829, "learning_rate": 4.3580816436350757e-07, "loss": 0.2391, "step": 7314 }, { "epoch": 6.157407407407407, "grad_norm": 0.4110325872898102, "learning_rate": 4.349515525230891e-07, "loss": 0.2342, "step": 7315 }, { "epoch": 6.158249158249158, "grad_norm": 0.4221743047237396, "learning_rate": 4.340957450928779e-07, "loss": 0.2767, "step": 7316 }, { "epoch": 6.159090909090909, "grad_norm": 0.39695701003074646, "learning_rate": 4.3324074222367395e-07, "loss": 0.2129, "step": 7317 }, { "epoch": 6.15993265993266, "grad_norm": 0.4081915020942688, "learning_rate": 4.3238654406614065e-07, "loss": 0.2362, "step": 7318 }, { "epoch": 6.160774410774411, "grad_norm": 0.4212816655635834, "learning_rate": 4.3153315077079595e-07, "loss": 0.2546, "step": 7319 }, { "epoch": 6.161616161616162, "grad_norm": 0.4197993576526642, "learning_rate": 4.3068056248801503e-07, "loss": 0.2064, "step": 7320 }, { "epoch": 6.162457912457913, "grad_norm": 0.4314093291759491, "learning_rate": 4.298287793680356e-07, "loss": 0.2361, "step": 7321 }, { "epoch": 6.1632996632996635, "grad_norm": 0.39758414030075073, "learning_rate": 4.289778015609486e-07, "loss": 0.2388, "step": 7322 }, { "epoch": 6.164141414141414, "grad_norm": 0.40899673104286194, "learning_rate": 4.281276292167075e-07, "loss": 0.2593, "step": 7323 }, { "epoch": 6.164983164983165, "grad_norm": 0.4296766221523285, "learning_rate": 4.2727826248512017e-07, "loss": 0.2429, "step": 7324 }, { "epoch": 6.165824915824916, "grad_norm": 0.40667998790740967, "learning_rate": 4.2642970151585526e-07, "loss": 0.223, "step": 7325 }, { "epoch": 6.166666666666667, "grad_norm": 0.41413912177085876, "learning_rate": 4.255819464584382e-07, "loss": 0.2637, "step": 7326 }, { "epoch": 6.167508417508418, "grad_norm": 0.44248512387275696, "learning_rate": 4.2473499746225165e-07, "loss": 0.2201, "step": 7327 }, { "epoch": 6.168350168350169, "grad_norm": 0.3997850716114044, "learning_rate": 4.2388885467653516e-07, "loss": 0.2195, "step": 7328 }, { "epoch": 6.16919191919192, "grad_norm": 0.37775465846061707, "learning_rate": 4.230435182503928e-07, "loss": 0.2608, "step": 7329 }, { "epoch": 6.17003367003367, "grad_norm": 0.3977787494659424, "learning_rate": 4.221989883327793e-07, "loss": 0.2402, "step": 7330 }, { "epoch": 6.170875420875421, "grad_norm": 0.4003085494041443, "learning_rate": 4.2135526507250945e-07, "loss": 0.2337, "step": 7331 }, { "epoch": 6.171717171717171, "grad_norm": 0.3923133313655853, "learning_rate": 4.2051234861825665e-07, "loss": 0.232, "step": 7332 }, { "epoch": 6.172558922558922, "grad_norm": 0.4098883867263794, "learning_rate": 4.1967023911855086e-07, "loss": 0.2471, "step": 7333 }, { "epoch": 6.173400673400673, "grad_norm": 0.41116243600845337, "learning_rate": 4.1882893672178346e-07, "loss": 0.2583, "step": 7334 }, { "epoch": 6.174242424242424, "grad_norm": 0.3913109600543976, "learning_rate": 4.1798844157619867e-07, "loss": 0.2564, "step": 7335 }, { "epoch": 6.175084175084175, "grad_norm": 0.3890984058380127, "learning_rate": 4.1714875382990073e-07, "loss": 0.2376, "step": 7336 }, { "epoch": 6.175925925925926, "grad_norm": 0.43117576837539673, "learning_rate": 4.1630987363085295e-07, "loss": 0.2636, "step": 7337 }, { "epoch": 6.1767676767676765, "grad_norm": 0.4231961667537689, "learning_rate": 4.154718011268732e-07, "loss": 0.2356, "step": 7338 }, { "epoch": 6.177609427609427, "grad_norm": 0.413638710975647, "learning_rate": 4.146345364656401e-07, "loss": 0.2458, "step": 7339 }, { "epoch": 6.178451178451178, "grad_norm": 0.407362699508667, "learning_rate": 4.137980797946889e-07, "loss": 0.2549, "step": 7340 }, { "epoch": 6.179292929292929, "grad_norm": 0.4005243480205536, "learning_rate": 4.1296243126141e-07, "loss": 0.2706, "step": 7341 }, { "epoch": 6.18013468013468, "grad_norm": 0.3956618010997772, "learning_rate": 4.1212759101305624e-07, "loss": 0.2574, "step": 7342 }, { "epoch": 6.180976430976431, "grad_norm": 0.41719090938568115, "learning_rate": 4.112935591967343e-07, "loss": 0.2639, "step": 7343 }, { "epoch": 6.181818181818182, "grad_norm": 0.41497746109962463, "learning_rate": 4.1046033595940837e-07, "loss": 0.2551, "step": 7344 }, { "epoch": 6.1826599326599325, "grad_norm": 0.4250362515449524, "learning_rate": 4.096279214479021e-07, "loss": 0.2503, "step": 7345 }, { "epoch": 6.183501683501683, "grad_norm": 0.422063410282135, "learning_rate": 4.08796315808897e-07, "loss": 0.2341, "step": 7346 }, { "epoch": 6.184343434343434, "grad_norm": 0.431585431098938, "learning_rate": 4.079655191889298e-07, "loss": 0.2193, "step": 7347 }, { "epoch": 6.185185185185185, "grad_norm": 0.3930812478065491, "learning_rate": 4.071355317343956e-07, "loss": 0.2658, "step": 7348 }, { "epoch": 6.186026936026936, "grad_norm": 0.4151102900505066, "learning_rate": 4.0630635359154626e-07, "loss": 0.2497, "step": 7349 }, { "epoch": 6.186868686868687, "grad_norm": 0.4297424256801605, "learning_rate": 4.054779849064927e-07, "loss": 0.2197, "step": 7350 }, { "epoch": 6.187710437710438, "grad_norm": 0.4053298830986023, "learning_rate": 4.0465042582520376e-07, "loss": 0.2462, "step": 7351 }, { "epoch": 6.188552188552189, "grad_norm": 0.4231981635093689, "learning_rate": 4.0382367649350217e-07, "loss": 0.2338, "step": 7352 }, { "epoch": 6.1893939393939394, "grad_norm": 0.3883534371852875, "learning_rate": 4.029977370570698e-07, "loss": 0.2673, "step": 7353 }, { "epoch": 6.19023569023569, "grad_norm": 0.40712955594062805, "learning_rate": 4.0217260766144685e-07, "loss": 0.2483, "step": 7354 }, { "epoch": 6.191077441077441, "grad_norm": 0.4060788154602051, "learning_rate": 4.013482884520309e-07, "loss": 0.2436, "step": 7355 }, { "epoch": 6.191919191919192, "grad_norm": 0.38212597370147705, "learning_rate": 4.005247795740741e-07, "loss": 0.2754, "step": 7356 }, { "epoch": 6.192760942760943, "grad_norm": 0.4343512952327728, "learning_rate": 3.9970208117268873e-07, "loss": 0.2281, "step": 7357 }, { "epoch": 6.193602693602694, "grad_norm": 0.4860857427120209, "learning_rate": 3.98880193392841e-07, "loss": 0.2312, "step": 7358 }, { "epoch": 6.194444444444445, "grad_norm": 0.40637966990470886, "learning_rate": 3.9805911637935845e-07, "loss": 0.2405, "step": 7359 }, { "epoch": 6.1952861952861955, "grad_norm": 0.382013201713562, "learning_rate": 3.9723885027692253e-07, "loss": 0.2486, "step": 7360 }, { "epoch": 6.196127946127946, "grad_norm": 0.4646984934806824, "learning_rate": 3.9641939523007376e-07, "loss": 0.2363, "step": 7361 }, { "epoch": 6.196969696969697, "grad_norm": 0.4131178855895996, "learning_rate": 3.956007513832072e-07, "loss": 0.2518, "step": 7362 }, { "epoch": 6.197811447811448, "grad_norm": 0.4504210650920868, "learning_rate": 3.947829188805785e-07, "loss": 0.2469, "step": 7363 }, { "epoch": 6.198653198653199, "grad_norm": 0.4516119062900543, "learning_rate": 3.9396589786629804e-07, "loss": 0.2529, "step": 7364 }, { "epoch": 6.19949494949495, "grad_norm": 0.4443976879119873, "learning_rate": 3.931496884843322e-07, "loss": 0.2413, "step": 7365 }, { "epoch": 6.200336700336701, "grad_norm": 0.4280223548412323, "learning_rate": 3.923342908785072e-07, "loss": 0.2414, "step": 7366 }, { "epoch": 6.201178451178452, "grad_norm": 0.4408978521823883, "learning_rate": 3.915197051925046e-07, "loss": 0.2238, "step": 7367 }, { "epoch": 6.202020202020202, "grad_norm": 0.39024895429611206, "learning_rate": 3.907059315698636e-07, "loss": 0.2312, "step": 7368 }, { "epoch": 6.202861952861953, "grad_norm": 0.4023580551147461, "learning_rate": 3.898929701539783e-07, "loss": 0.2177, "step": 7369 }, { "epoch": 6.203703703703703, "grad_norm": 0.45884910225868225, "learning_rate": 3.8908082108810143e-07, "loss": 0.2502, "step": 7370 }, { "epoch": 6.204545454545454, "grad_norm": 0.4182683527469635, "learning_rate": 3.882694845153434e-07, "loss": 0.2634, "step": 7371 }, { "epoch": 6.205387205387205, "grad_norm": 0.3971586525440216, "learning_rate": 3.874589605786699e-07, "loss": 0.2471, "step": 7372 }, { "epoch": 6.206228956228956, "grad_norm": 0.39386290311813354, "learning_rate": 3.8664924942090397e-07, "loss": 0.2624, "step": 7373 }, { "epoch": 6.207070707070707, "grad_norm": 0.3913112282752991, "learning_rate": 3.858403511847247e-07, "loss": 0.2409, "step": 7374 }, { "epoch": 6.207912457912458, "grad_norm": 0.39184194803237915, "learning_rate": 3.850322660126693e-07, "loss": 0.2771, "step": 7375 }, { "epoch": 6.2087542087542085, "grad_norm": 0.40981432795524597, "learning_rate": 3.8422499404713044e-07, "loss": 0.2464, "step": 7376 }, { "epoch": 6.209595959595959, "grad_norm": 0.3656063377857208, "learning_rate": 3.8341853543035876e-07, "loss": 0.2735, "step": 7377 }, { "epoch": 6.21043771043771, "grad_norm": 0.4310298562049866, "learning_rate": 3.826128903044607e-07, "loss": 0.2425, "step": 7378 }, { "epoch": 6.211279461279461, "grad_norm": 0.3882256746292114, "learning_rate": 3.818080588113987e-07, "loss": 0.2732, "step": 7379 }, { "epoch": 6.212121212121212, "grad_norm": 0.3897815942764282, "learning_rate": 3.810040410929938e-07, "loss": 0.2508, "step": 7380 }, { "epoch": 6.212962962962963, "grad_norm": 0.4144647419452667, "learning_rate": 3.8020083729092103e-07, "loss": 0.2307, "step": 7381 }, { "epoch": 6.213804713804714, "grad_norm": 0.42384764552116394, "learning_rate": 3.7939844754671493e-07, "loss": 0.2425, "step": 7382 }, { "epoch": 6.2146464646464645, "grad_norm": 0.40488091111183167, "learning_rate": 3.785968720017641e-07, "loss": 0.2623, "step": 7383 }, { "epoch": 6.215488215488215, "grad_norm": 0.4250723123550415, "learning_rate": 3.7779611079731606e-07, "loss": 0.2285, "step": 7384 }, { "epoch": 6.216329966329966, "grad_norm": 0.42876124382019043, "learning_rate": 3.7699616407447236e-07, "loss": 0.2533, "step": 7385 }, { "epoch": 6.217171717171717, "grad_norm": 0.4334549307823181, "learning_rate": 3.7619703197419146e-07, "loss": 0.2316, "step": 7386 }, { "epoch": 6.218013468013468, "grad_norm": 0.4385633170604706, "learning_rate": 3.7539871463729063e-07, "loss": 0.253, "step": 7387 }, { "epoch": 6.218855218855219, "grad_norm": 0.4058700501918793, "learning_rate": 3.746012122044407e-07, "loss": 0.2356, "step": 7388 }, { "epoch": 6.21969696969697, "grad_norm": 0.4074119031429291, "learning_rate": 3.738045248161709e-07, "loss": 0.2368, "step": 7389 }, { "epoch": 6.220538720538721, "grad_norm": 0.4096810221672058, "learning_rate": 3.7300865261286567e-07, "loss": 0.2318, "step": 7390 }, { "epoch": 6.2213804713804715, "grad_norm": 0.41167932748794556, "learning_rate": 3.7221359573476555e-07, "loss": 0.248, "step": 7391 }, { "epoch": 6.222222222222222, "grad_norm": 0.3939381539821625, "learning_rate": 3.714193543219685e-07, "loss": 0.2511, "step": 7392 }, { "epoch": 6.223063973063973, "grad_norm": 0.4412488341331482, "learning_rate": 3.7062592851442924e-07, "loss": 0.1979, "step": 7393 }, { "epoch": 6.223905723905724, "grad_norm": 0.4562610685825348, "learning_rate": 3.6983331845195755e-07, "loss": 0.263, "step": 7394 }, { "epoch": 6.224747474747475, "grad_norm": 0.43435677886009216, "learning_rate": 3.69041524274219e-07, "loss": 0.2565, "step": 7395 }, { "epoch": 6.225589225589226, "grad_norm": 0.3999270498752594, "learning_rate": 3.682505461207353e-07, "loss": 0.2192, "step": 7396 }, { "epoch": 6.226430976430977, "grad_norm": 0.44780343770980835, "learning_rate": 3.674603841308866e-07, "loss": 0.2797, "step": 7397 }, { "epoch": 6.2272727272727275, "grad_norm": 0.3820416331291199, "learning_rate": 3.6667103844390873e-07, "loss": 0.2637, "step": 7398 }, { "epoch": 6.228114478114478, "grad_norm": 0.4298854470252991, "learning_rate": 3.658825091988916e-07, "loss": 0.239, "step": 7399 }, { "epoch": 6.228956228956229, "grad_norm": 0.4543650150299072, "learning_rate": 3.6509479653478174e-07, "loss": 0.2518, "step": 7400 }, { "epoch": 6.22979797979798, "grad_norm": 0.3810153007507324, "learning_rate": 3.6430790059038436e-07, "loss": 0.2483, "step": 7401 }, { "epoch": 6.230639730639731, "grad_norm": 0.4106610119342804, "learning_rate": 3.6352182150435735e-07, "loss": 0.2335, "step": 7402 }, { "epoch": 6.231481481481482, "grad_norm": 0.43429258465766907, "learning_rate": 3.6273655941521766e-07, "loss": 0.2367, "step": 7403 }, { "epoch": 6.232323232323233, "grad_norm": 0.4801909923553467, "learning_rate": 3.619521144613347e-07, "loss": 0.2419, "step": 7404 }, { "epoch": 6.233164983164984, "grad_norm": 0.40179577469825745, "learning_rate": 3.611684867809384e-07, "loss": 0.2316, "step": 7405 }, { "epoch": 6.2340067340067336, "grad_norm": 0.4579816460609436, "learning_rate": 3.6038567651211167e-07, "loss": 0.211, "step": 7406 }, { "epoch": 6.234848484848484, "grad_norm": 0.43273165822029114, "learning_rate": 3.5960368379279296e-07, "loss": 0.2427, "step": 7407 }, { "epoch": 6.235690235690235, "grad_norm": 0.40001407265663147, "learning_rate": 3.5882250876077775e-07, "loss": 0.2426, "step": 7408 }, { "epoch": 6.236531986531986, "grad_norm": 0.44592034816741943, "learning_rate": 3.580421515537186e-07, "loss": 0.2531, "step": 7409 }, { "epoch": 6.237373737373737, "grad_norm": 0.4257757067680359, "learning_rate": 3.572626123091222e-07, "loss": 0.248, "step": 7410 }, { "epoch": 6.238215488215488, "grad_norm": 0.3912956416606903, "learning_rate": 3.5648389116435256e-07, "loss": 0.2197, "step": 7411 }, { "epoch": 6.239057239057239, "grad_norm": 0.3995293080806732, "learning_rate": 3.5570598825662605e-07, "loss": 0.2247, "step": 7412 }, { "epoch": 6.23989898989899, "grad_norm": 0.4155826270580292, "learning_rate": 3.549289037230197e-07, "loss": 0.2649, "step": 7413 }, { "epoch": 6.2407407407407405, "grad_norm": 0.40122857689857483, "learning_rate": 3.541526377004645e-07, "loss": 0.2483, "step": 7414 }, { "epoch": 6.241582491582491, "grad_norm": 0.3928305208683014, "learning_rate": 3.5337719032574545e-07, "loss": 0.266, "step": 7415 }, { "epoch": 6.242424242424242, "grad_norm": 0.42716121673583984, "learning_rate": 3.5260256173550554e-07, "loss": 0.2355, "step": 7416 }, { "epoch": 6.243265993265993, "grad_norm": 0.4587249159812927, "learning_rate": 3.518287520662411e-07, "loss": 0.1906, "step": 7417 }, { "epoch": 6.244107744107744, "grad_norm": 0.4418339431285858, "learning_rate": 3.5105576145430644e-07, "loss": 0.2528, "step": 7418 }, { "epoch": 6.244949494949495, "grad_norm": 0.4279372990131378, "learning_rate": 3.5028359003591196e-07, "loss": 0.284, "step": 7419 }, { "epoch": 6.245791245791246, "grad_norm": 0.43314385414123535, "learning_rate": 3.495122379471216e-07, "loss": 0.2318, "step": 7420 }, { "epoch": 6.2466329966329965, "grad_norm": 0.3890286386013031, "learning_rate": 3.487417053238551e-07, "loss": 0.259, "step": 7421 }, { "epoch": 6.247474747474747, "grad_norm": 0.3852865397930145, "learning_rate": 3.4797199230188983e-07, "loss": 0.2506, "step": 7422 }, { "epoch": 6.248316498316498, "grad_norm": 0.41006889939308167, "learning_rate": 3.472030990168562e-07, "loss": 0.2541, "step": 7423 }, { "epoch": 6.249158249158249, "grad_norm": 0.437465637922287, "learning_rate": 3.4643502560424205e-07, "loss": 0.2409, "step": 7424 }, { "epoch": 6.25, "grad_norm": 0.4140300452709198, "learning_rate": 3.456677721993901e-07, "loss": 0.2565, "step": 7425 }, { "epoch": 6.250841750841751, "grad_norm": 0.4673723876476288, "learning_rate": 3.4490133893749777e-07, "loss": 0.2461, "step": 7426 }, { "epoch": 6.251683501683502, "grad_norm": 0.4016309976577759, "learning_rate": 3.441357259536204e-07, "loss": 0.2364, "step": 7427 }, { "epoch": 6.252525252525253, "grad_norm": 0.43185245990753174, "learning_rate": 3.433709333826662e-07, "loss": 0.2294, "step": 7428 }, { "epoch": 6.2533670033670035, "grad_norm": 0.39267683029174805, "learning_rate": 3.4260696135939964e-07, "loss": 0.2781, "step": 7429 }, { "epoch": 6.254208754208754, "grad_norm": 0.40890541672706604, "learning_rate": 3.418438100184401e-07, "loss": 0.2476, "step": 7430 }, { "epoch": 6.255050505050505, "grad_norm": 0.4273088276386261, "learning_rate": 3.4108147949426516e-07, "loss": 0.248, "step": 7431 }, { "epoch": 6.255892255892256, "grad_norm": 0.4481416642665863, "learning_rate": 3.40319969921204e-07, "loss": 0.2453, "step": 7432 }, { "epoch": 6.256734006734007, "grad_norm": 0.42885884642601013, "learning_rate": 3.395592814334425e-07, "loss": 0.2613, "step": 7433 }, { "epoch": 6.257575757575758, "grad_norm": 0.43874627351760864, "learning_rate": 3.387994141650214e-07, "loss": 0.2393, "step": 7434 }, { "epoch": 6.258417508417509, "grad_norm": 0.4418168067932129, "learning_rate": 3.380403682498401e-07, "loss": 0.2545, "step": 7435 }, { "epoch": 6.2592592592592595, "grad_norm": 0.40920987725257874, "learning_rate": 3.372821438216489e-07, "loss": 0.2351, "step": 7436 }, { "epoch": 6.26010101010101, "grad_norm": 0.4273855686187744, "learning_rate": 3.365247410140554e-07, "loss": 0.2465, "step": 7437 }, { "epoch": 6.260942760942761, "grad_norm": 0.4402583837509155, "learning_rate": 3.3576815996052116e-07, "loss": 0.2381, "step": 7438 }, { "epoch": 6.261784511784512, "grad_norm": 0.42150506377220154, "learning_rate": 3.35012400794365e-07, "loss": 0.2471, "step": 7439 }, { "epoch": 6.262626262626263, "grad_norm": 0.4189971387386322, "learning_rate": 3.3425746364875835e-07, "loss": 0.2572, "step": 7440 }, { "epoch": 6.263468013468014, "grad_norm": 0.43873509764671326, "learning_rate": 3.3350334865673073e-07, "loss": 0.234, "step": 7441 }, { "epoch": 6.264309764309765, "grad_norm": 0.4277859032154083, "learning_rate": 3.3275005595116427e-07, "loss": 0.2631, "step": 7442 }, { "epoch": 6.265151515151516, "grad_norm": 0.4241875112056732, "learning_rate": 3.3199758566479776e-07, "loss": 0.2294, "step": 7443 }, { "epoch": 6.2659932659932664, "grad_norm": 0.4199080169200897, "learning_rate": 3.3124593793022464e-07, "loss": 0.2661, "step": 7444 }, { "epoch": 6.266835016835016, "grad_norm": 0.4389156103134155, "learning_rate": 3.304951128798917e-07, "loss": 0.2423, "step": 7445 }, { "epoch": 6.267676767676767, "grad_norm": 0.43130412697792053, "learning_rate": 3.2974511064610413e-07, "loss": 0.2599, "step": 7446 }, { "epoch": 6.268518518518518, "grad_norm": 0.41796407103538513, "learning_rate": 3.2899593136101916e-07, "loss": 0.2493, "step": 7447 }, { "epoch": 6.269360269360269, "grad_norm": 0.4132273197174072, "learning_rate": 3.2824757515665163e-07, "loss": 0.2182, "step": 7448 }, { "epoch": 6.27020202020202, "grad_norm": 0.41590821743011475, "learning_rate": 3.2750004216486833e-07, "loss": 0.2786, "step": 7449 }, { "epoch": 6.271043771043771, "grad_norm": 0.3531455993652344, "learning_rate": 3.267533325173927e-07, "loss": 0.2594, "step": 7450 }, { "epoch": 6.271885521885522, "grad_norm": 0.41846761107444763, "learning_rate": 3.2600744634580294e-07, "loss": 0.2531, "step": 7451 }, { "epoch": 6.2727272727272725, "grad_norm": 0.46468445658683777, "learning_rate": 3.252623837815333e-07, "loss": 0.232, "step": 7452 }, { "epoch": 6.273569023569023, "grad_norm": 0.41264817118644714, "learning_rate": 3.2451814495587153e-07, "loss": 0.2332, "step": 7453 }, { "epoch": 6.274410774410774, "grad_norm": 0.3766915202140808, "learning_rate": 3.237747299999594e-07, "loss": 0.244, "step": 7454 }, { "epoch": 6.275252525252525, "grad_norm": 0.4368147850036621, "learning_rate": 3.2303213904479436e-07, "loss": 0.2507, "step": 7455 }, { "epoch": 6.276094276094276, "grad_norm": 0.4328116178512573, "learning_rate": 3.2229037222123006e-07, "loss": 0.2548, "step": 7456 }, { "epoch": 6.276936026936027, "grad_norm": 0.4316273033618927, "learning_rate": 3.215494296599736e-07, "loss": 0.2404, "step": 7457 }, { "epoch": 6.277777777777778, "grad_norm": 0.43777531385421753, "learning_rate": 3.208093114915861e-07, "loss": 0.2492, "step": 7458 }, { "epoch": 6.2786195286195285, "grad_norm": 0.378193736076355, "learning_rate": 3.200700178464844e-07, "loss": 0.247, "step": 7459 }, { "epoch": 6.279461279461279, "grad_norm": 0.41553032398223877, "learning_rate": 3.193315488549414e-07, "loss": 0.2322, "step": 7460 }, { "epoch": 6.28030303030303, "grad_norm": 0.43888261914253235, "learning_rate": 3.1859390464708094e-07, "loss": 0.2377, "step": 7461 }, { "epoch": 6.281144781144781, "grad_norm": 0.4302274286746979, "learning_rate": 3.178570853528856e-07, "loss": 0.2261, "step": 7462 }, { "epoch": 6.281986531986532, "grad_norm": 0.4265628457069397, "learning_rate": 3.1712109110219046e-07, "loss": 0.2387, "step": 7463 }, { "epoch": 6.282828282828283, "grad_norm": 0.426277756690979, "learning_rate": 3.16385922024684e-07, "loss": 0.2323, "step": 7464 }, { "epoch": 6.283670033670034, "grad_norm": 0.4149208068847656, "learning_rate": 3.156515782499131e-07, "loss": 0.2511, "step": 7465 }, { "epoch": 6.284511784511785, "grad_norm": 0.43285417556762695, "learning_rate": 3.1491805990727485e-07, "loss": 0.2626, "step": 7466 }, { "epoch": 6.2853535353535355, "grad_norm": 0.42978936433792114, "learning_rate": 3.1418536712602533e-07, "loss": 0.2565, "step": 7467 }, { "epoch": 6.286195286195286, "grad_norm": 0.43151816725730896, "learning_rate": 3.1345350003527065e-07, "loss": 0.2525, "step": 7468 }, { "epoch": 6.287037037037037, "grad_norm": 0.417115718126297, "learning_rate": 3.127224587639749e-07, "loss": 0.2558, "step": 7469 }, { "epoch": 6.287878787878788, "grad_norm": 0.42937734723091125, "learning_rate": 3.1199224344095557e-07, "loss": 0.2587, "step": 7470 }, { "epoch": 6.288720538720539, "grad_norm": 0.44475287199020386, "learning_rate": 3.112628541948831e-07, "loss": 0.2799, "step": 7471 }, { "epoch": 6.28956228956229, "grad_norm": 0.4173991084098816, "learning_rate": 3.105342911542836e-07, "loss": 0.2174, "step": 7472 }, { "epoch": 6.290404040404041, "grad_norm": 0.4001966118812561, "learning_rate": 3.0980655444753927e-07, "loss": 0.2398, "step": 7473 }, { "epoch": 6.2912457912457915, "grad_norm": 0.38484418392181396, "learning_rate": 3.090796442028843e-07, "loss": 0.241, "step": 7474 }, { "epoch": 6.292087542087542, "grad_norm": 0.4559750556945801, "learning_rate": 3.083535605484078e-07, "loss": 0.236, "step": 7475 }, { "epoch": 6.292929292929293, "grad_norm": 0.42674529552459717, "learning_rate": 3.076283036120531e-07, "loss": 0.2273, "step": 7476 }, { "epoch": 6.293771043771044, "grad_norm": 0.418211430311203, "learning_rate": 3.0690387352161845e-07, "loss": 0.2406, "step": 7477 }, { "epoch": 6.294612794612795, "grad_norm": 0.41188934445381165, "learning_rate": 3.0618027040475727e-07, "loss": 0.2424, "step": 7478 }, { "epoch": 6.295454545454546, "grad_norm": 0.4209052622318268, "learning_rate": 3.054574943889754e-07, "loss": 0.25, "step": 7479 }, { "epoch": 6.296296296296296, "grad_norm": 0.4144991636276245, "learning_rate": 3.047355456016321e-07, "loss": 0.2419, "step": 7480 }, { "epoch": 6.297138047138047, "grad_norm": 0.4008244276046753, "learning_rate": 3.0401442416994497e-07, "loss": 0.2404, "step": 7481 }, { "epoch": 6.297979797979798, "grad_norm": 0.4011487662792206, "learning_rate": 3.0329413022098185e-07, "loss": 0.2262, "step": 7482 }, { "epoch": 6.298821548821548, "grad_norm": 0.3968098759651184, "learning_rate": 3.0257466388166676e-07, "loss": 0.2446, "step": 7483 }, { "epoch": 6.299663299663299, "grad_norm": 0.4346718192100525, "learning_rate": 3.0185602527877724e-07, "loss": 0.2258, "step": 7484 }, { "epoch": 6.30050505050505, "grad_norm": 0.4106857478618622, "learning_rate": 3.0113821453894466e-07, "loss": 0.2714, "step": 7485 }, { "epoch": 6.301346801346801, "grad_norm": 0.4488633871078491, "learning_rate": 3.004212317886551e-07, "loss": 0.255, "step": 7486 }, { "epoch": 6.302188552188552, "grad_norm": 0.39925163984298706, "learning_rate": 2.997050771542492e-07, "loss": 0.2466, "step": 7487 }, { "epoch": 6.303030303030303, "grad_norm": 0.4289817214012146, "learning_rate": 2.9898975076191995e-07, "loss": 0.2614, "step": 7488 }, { "epoch": 6.303872053872054, "grad_norm": 0.40494149923324585, "learning_rate": 2.9827525273771586e-07, "loss": 0.2284, "step": 7489 }, { "epoch": 6.3047138047138045, "grad_norm": 0.40008023381233215, "learning_rate": 2.975615832075401e-07, "loss": 0.2559, "step": 7490 }, { "epoch": 6.305555555555555, "grad_norm": 0.4484536051750183, "learning_rate": 2.968487422971478e-07, "loss": 0.2033, "step": 7491 }, { "epoch": 6.306397306397306, "grad_norm": 0.4399261176586151, "learning_rate": 2.9613673013214995e-07, "loss": 0.253, "step": 7492 }, { "epoch": 6.307239057239057, "grad_norm": 0.42425522208213806, "learning_rate": 2.9542554683800906e-07, "loss": 0.2514, "step": 7493 }, { "epoch": 6.308080808080808, "grad_norm": 0.401481956243515, "learning_rate": 2.9471519254004434e-07, "loss": 0.2219, "step": 7494 }, { "epoch": 6.308922558922559, "grad_norm": 0.46702125668525696, "learning_rate": 2.940056673634284e-07, "loss": 0.2334, "step": 7495 }, { "epoch": 6.30976430976431, "grad_norm": 0.4396088719367981, "learning_rate": 2.9329697143318624e-07, "loss": 0.2083, "step": 7496 }, { "epoch": 6.3106060606060606, "grad_norm": 0.4237958788871765, "learning_rate": 2.9258910487419743e-07, "loss": 0.2309, "step": 7497 }, { "epoch": 6.311447811447811, "grad_norm": 0.43338245153427124, "learning_rate": 2.9188206781119554e-07, "loss": 0.2321, "step": 7498 }, { "epoch": 6.312289562289562, "grad_norm": 0.47202008962631226, "learning_rate": 2.911758603687698e-07, "loss": 0.2557, "step": 7499 }, { "epoch": 6.313131313131313, "grad_norm": 0.4067334234714508, "learning_rate": 2.9047048267136004e-07, "loss": 0.2896, "step": 7500 }, { "epoch": 6.313973063973064, "grad_norm": 0.4190205931663513, "learning_rate": 2.8976593484326185e-07, "loss": 0.2535, "step": 7501 }, { "epoch": 6.314814814814815, "grad_norm": 0.43260854482650757, "learning_rate": 2.890622170086227e-07, "loss": 0.2447, "step": 7502 }, { "epoch": 6.315656565656566, "grad_norm": 0.46300891041755676, "learning_rate": 2.883593292914466e-07, "loss": 0.2641, "step": 7503 }, { "epoch": 6.316498316498317, "grad_norm": 0.42247509956359863, "learning_rate": 2.87657271815589e-07, "loss": 0.2517, "step": 7504 }, { "epoch": 6.3173400673400675, "grad_norm": 0.41783803701400757, "learning_rate": 2.8695604470476145e-07, "loss": 0.2195, "step": 7505 }, { "epoch": 6.318181818181818, "grad_norm": 0.4249761402606964, "learning_rate": 2.862556480825257e-07, "loss": 0.2286, "step": 7506 }, { "epoch": 6.319023569023569, "grad_norm": 0.40713515877723694, "learning_rate": 2.8555608207230024e-07, "loss": 0.2478, "step": 7507 }, { "epoch": 6.31986531986532, "grad_norm": 0.44965508580207825, "learning_rate": 2.8485734679735646e-07, "loss": 0.2234, "step": 7508 }, { "epoch": 6.320707070707071, "grad_norm": 0.4261947572231293, "learning_rate": 2.841594423808169e-07, "loss": 0.2507, "step": 7509 }, { "epoch": 6.321548821548822, "grad_norm": 0.3976067900657654, "learning_rate": 2.834623689456617e-07, "loss": 0.245, "step": 7510 }, { "epoch": 6.322390572390573, "grad_norm": 0.44173553586006165, "learning_rate": 2.827661266147225e-07, "loss": 0.2707, "step": 7511 }, { "epoch": 6.3232323232323235, "grad_norm": 0.3837811350822449, "learning_rate": 2.8207071551068446e-07, "loss": 0.2716, "step": 7512 }, { "epoch": 6.324074074074074, "grad_norm": 0.4033052921295166, "learning_rate": 2.8137613575608623e-07, "loss": 0.2719, "step": 7513 }, { "epoch": 6.324915824915825, "grad_norm": 0.4338458180427551, "learning_rate": 2.8068238747331886e-07, "loss": 0.2123, "step": 7514 }, { "epoch": 6.325757575757576, "grad_norm": 0.4372604191303253, "learning_rate": 2.799894707846301e-07, "loss": 0.2428, "step": 7515 }, { "epoch": 6.326599326599327, "grad_norm": 0.4364807903766632, "learning_rate": 2.7929738581211895e-07, "loss": 0.2697, "step": 7516 }, { "epoch": 6.327441077441078, "grad_norm": 0.40694668889045715, "learning_rate": 2.786061326777378e-07, "loss": 0.2539, "step": 7517 }, { "epoch": 6.328282828282829, "grad_norm": 0.4467611312866211, "learning_rate": 2.779157115032921e-07, "loss": 0.2303, "step": 7518 }, { "epoch": 6.329124579124579, "grad_norm": 0.4303204417228699, "learning_rate": 2.7722612241044333e-07, "loss": 0.2484, "step": 7519 }, { "epoch": 6.32996632996633, "grad_norm": 0.4107045829296112, "learning_rate": 2.765373655207021e-07, "loss": 0.2606, "step": 7520 }, { "epoch": 6.33080808080808, "grad_norm": 0.4492426812648773, "learning_rate": 2.7584944095543686e-07, "loss": 0.2312, "step": 7521 }, { "epoch": 6.331649831649831, "grad_norm": 0.45174703001976013, "learning_rate": 2.751623488358662e-07, "loss": 0.2357, "step": 7522 }, { "epoch": 6.332491582491582, "grad_norm": 0.4304652810096741, "learning_rate": 2.744760892830622e-07, "loss": 0.2329, "step": 7523 }, { "epoch": 6.333333333333333, "grad_norm": 0.42639297246932983, "learning_rate": 2.7379066241795315e-07, "loss": 0.2458, "step": 7524 }, { "epoch": 6.334175084175084, "grad_norm": 0.4130456745624542, "learning_rate": 2.731060683613168e-07, "loss": 0.2348, "step": 7525 }, { "epoch": 6.335016835016835, "grad_norm": 0.4354073703289032, "learning_rate": 2.7242230723378724e-07, "loss": 0.2478, "step": 7526 }, { "epoch": 6.335858585858586, "grad_norm": 0.4371592700481415, "learning_rate": 2.717393791558487e-07, "loss": 0.2311, "step": 7527 }, { "epoch": 6.3367003367003365, "grad_norm": 0.4071105718612671, "learning_rate": 2.710572842478426e-07, "loss": 0.2671, "step": 7528 }, { "epoch": 6.337542087542087, "grad_norm": 0.415008008480072, "learning_rate": 2.703760226299607e-07, "loss": 0.2585, "step": 7529 }, { "epoch": 6.338383838383838, "grad_norm": 0.41860339045524597, "learning_rate": 2.696955944222468e-07, "loss": 0.2795, "step": 7530 }, { "epoch": 6.339225589225589, "grad_norm": 0.42390623688697815, "learning_rate": 2.690159997446018e-07, "loss": 0.2481, "step": 7531 }, { "epoch": 6.34006734006734, "grad_norm": 0.4083349108695984, "learning_rate": 2.68337238716776e-07, "loss": 0.2502, "step": 7532 }, { "epoch": 6.340909090909091, "grad_norm": 0.42782920598983765, "learning_rate": 2.676593114583759e-07, "loss": 0.2593, "step": 7533 }, { "epoch": 6.341750841750842, "grad_norm": 0.41008928418159485, "learning_rate": 2.669822180888582e-07, "loss": 0.2475, "step": 7534 }, { "epoch": 6.342592592592593, "grad_norm": 0.3994171619415283, "learning_rate": 2.663059587275341e-07, "loss": 0.2542, "step": 7535 }, { "epoch": 6.343434343434343, "grad_norm": 0.4728219211101532, "learning_rate": 2.656305334935677e-07, "loss": 0.2212, "step": 7536 }, { "epoch": 6.344276094276094, "grad_norm": 0.43605828285217285, "learning_rate": 2.649559425059767e-07, "loss": 0.2347, "step": 7537 }, { "epoch": 6.345117845117845, "grad_norm": 0.4114137589931488, "learning_rate": 2.642821858836314e-07, "loss": 0.249, "step": 7538 }, { "epoch": 6.345959595959596, "grad_norm": 0.42262399196624756, "learning_rate": 2.6360926374525473e-07, "loss": 0.2798, "step": 7539 }, { "epoch": 6.346801346801347, "grad_norm": 0.41430217027664185, "learning_rate": 2.629371762094213e-07, "loss": 0.2529, "step": 7540 }, { "epoch": 6.347643097643098, "grad_norm": 0.3973495364189148, "learning_rate": 2.6226592339456125e-07, "loss": 0.2578, "step": 7541 }, { "epoch": 6.348484848484849, "grad_norm": 0.3917682468891144, "learning_rate": 2.615955054189573e-07, "loss": 0.2664, "step": 7542 }, { "epoch": 6.3493265993265995, "grad_norm": 0.4062120020389557, "learning_rate": 2.6092592240074375e-07, "loss": 0.2521, "step": 7543 }, { "epoch": 6.35016835016835, "grad_norm": 0.3815457224845886, "learning_rate": 2.602571744579069e-07, "loss": 0.2561, "step": 7544 }, { "epoch": 6.351010101010101, "grad_norm": 0.3919947147369385, "learning_rate": 2.5958926170828903e-07, "loss": 0.2561, "step": 7545 }, { "epoch": 6.351851851851852, "grad_norm": 0.4083101451396942, "learning_rate": 2.5892218426958213e-07, "loss": 0.2818, "step": 7546 }, { "epoch": 6.352693602693603, "grad_norm": 0.41970887780189514, "learning_rate": 2.5825594225933336e-07, "loss": 0.2241, "step": 7547 }, { "epoch": 6.353535353535354, "grad_norm": 0.42558279633522034, "learning_rate": 2.5759053579494095e-07, "loss": 0.2673, "step": 7548 }, { "epoch": 6.354377104377105, "grad_norm": 0.4175984561443329, "learning_rate": 2.569259649936573e-07, "loss": 0.2462, "step": 7549 }, { "epoch": 6.3552188552188555, "grad_norm": 0.43533971905708313, "learning_rate": 2.56262229972587e-07, "loss": 0.2256, "step": 7550 }, { "epoch": 6.356060606060606, "grad_norm": 0.44237324595451355, "learning_rate": 2.555993308486865e-07, "loss": 0.2626, "step": 7551 }, { "epoch": 6.356902356902357, "grad_norm": 0.37379616498947144, "learning_rate": 2.5493726773876516e-07, "loss": 0.2584, "step": 7552 }, { "epoch": 6.357744107744108, "grad_norm": 0.4123789966106415, "learning_rate": 2.5427604075948686e-07, "loss": 0.2522, "step": 7553 }, { "epoch": 6.358585858585858, "grad_norm": 0.431776762008667, "learning_rate": 2.5361565002736675e-07, "loss": 0.2714, "step": 7554 }, { "epoch": 6.359427609427609, "grad_norm": 0.41824617981910706, "learning_rate": 2.5295609565877234e-07, "loss": 0.2337, "step": 7555 }, { "epoch": 6.36026936026936, "grad_norm": 0.4049185812473297, "learning_rate": 2.5229737776992337e-07, "loss": 0.2063, "step": 7556 }, { "epoch": 6.361111111111111, "grad_norm": 0.4102713465690613, "learning_rate": 2.5163949647689365e-07, "loss": 0.2333, "step": 7557 }, { "epoch": 6.361952861952862, "grad_norm": 0.4135322868824005, "learning_rate": 2.509824518956094e-07, "loss": 0.2636, "step": 7558 }, { "epoch": 6.3627946127946124, "grad_norm": 0.4466683864593506, "learning_rate": 2.503262441418486e-07, "loss": 0.2384, "step": 7559 }, { "epoch": 6.363636363636363, "grad_norm": 0.4338516592979431, "learning_rate": 2.4967087333124195e-07, "loss": 0.2537, "step": 7560 }, { "epoch": 6.364478114478114, "grad_norm": 0.398327112197876, "learning_rate": 2.4901633957927217e-07, "loss": 0.2725, "step": 7561 }, { "epoch": 6.365319865319865, "grad_norm": 0.40728476643562317, "learning_rate": 2.483626430012753e-07, "loss": 0.2552, "step": 7562 }, { "epoch": 6.366161616161616, "grad_norm": 0.4350526034832001, "learning_rate": 2.4770978371244036e-07, "loss": 0.2458, "step": 7563 }, { "epoch": 6.367003367003367, "grad_norm": 0.4000720977783203, "learning_rate": 2.4705776182780807e-07, "loss": 0.25, "step": 7564 }, { "epoch": 6.367845117845118, "grad_norm": 0.44189056754112244, "learning_rate": 2.4640657746227095e-07, "loss": 0.2366, "step": 7565 }, { "epoch": 6.3686868686868685, "grad_norm": 0.42224183678627014, "learning_rate": 2.4575623073057495e-07, "loss": 0.2519, "step": 7566 }, { "epoch": 6.369528619528619, "grad_norm": 0.408439964056015, "learning_rate": 2.4510672174731855e-07, "loss": 0.2393, "step": 7567 }, { "epoch": 6.37037037037037, "grad_norm": 0.4039676785469055, "learning_rate": 2.4445805062695063e-07, "loss": 0.2606, "step": 7568 }, { "epoch": 6.371212121212121, "grad_norm": 0.4115229547023773, "learning_rate": 2.438102174837753e-07, "loss": 0.2287, "step": 7569 }, { "epoch": 6.372053872053872, "grad_norm": 0.4463532865047455, "learning_rate": 2.43163222431948e-07, "loss": 0.258, "step": 7570 }, { "epoch": 6.372895622895623, "grad_norm": 0.40193724632263184, "learning_rate": 2.425170655854758e-07, "loss": 0.2497, "step": 7571 }, { "epoch": 6.373737373737374, "grad_norm": 0.4105502963066101, "learning_rate": 2.418717470582177e-07, "loss": 0.2208, "step": 7572 }, { "epoch": 6.374579124579125, "grad_norm": 0.40513166785240173, "learning_rate": 2.41227266963886e-07, "loss": 0.2572, "step": 7573 }, { "epoch": 6.375420875420875, "grad_norm": 0.4456568658351898, "learning_rate": 2.4058362541604487e-07, "loss": 0.2736, "step": 7574 }, { "epoch": 6.376262626262626, "grad_norm": 0.41309067606925964, "learning_rate": 2.399408225281119e-07, "loss": 0.236, "step": 7575 }, { "epoch": 6.377104377104377, "grad_norm": 0.4286624491214752, "learning_rate": 2.3929885841335485e-07, "loss": 0.2357, "step": 7576 }, { "epoch": 6.377946127946128, "grad_norm": 0.44209975004196167, "learning_rate": 2.3865773318489493e-07, "loss": 0.2244, "step": 7577 }, { "epoch": 6.378787878787879, "grad_norm": 0.4499285817146301, "learning_rate": 2.380174469557034e-07, "loss": 0.261, "step": 7578 }, { "epoch": 6.37962962962963, "grad_norm": 0.37966188788414, "learning_rate": 2.3737799983860888e-07, "loss": 0.2449, "step": 7579 }, { "epoch": 6.380471380471381, "grad_norm": 0.4205692410469055, "learning_rate": 2.3673939194628681e-07, "loss": 0.2511, "step": 7580 }, { "epoch": 6.3813131313131315, "grad_norm": 0.4368452727794647, "learning_rate": 2.361016233912672e-07, "loss": 0.2589, "step": 7581 }, { "epoch": 6.382154882154882, "grad_norm": 0.4007389545440674, "learning_rate": 2.3546469428593122e-07, "loss": 0.2488, "step": 7582 }, { "epoch": 6.382996632996633, "grad_norm": 0.40958672761917114, "learning_rate": 2.34828604742513e-07, "loss": 0.2336, "step": 7583 }, { "epoch": 6.383838383838384, "grad_norm": 0.4026840329170227, "learning_rate": 2.3419335487309737e-07, "loss": 0.2309, "step": 7584 }, { "epoch": 6.384680134680135, "grad_norm": 0.448314368724823, "learning_rate": 2.3355894478962414e-07, "loss": 0.208, "step": 7585 }, { "epoch": 6.385521885521886, "grad_norm": 0.4197838008403778, "learning_rate": 2.3292537460388066e-07, "loss": 0.2468, "step": 7586 }, { "epoch": 6.386363636363637, "grad_norm": 0.40598368644714355, "learning_rate": 2.3229264442751088e-07, "loss": 0.2275, "step": 7587 }, { "epoch": 6.3872053872053876, "grad_norm": 0.4383719861507416, "learning_rate": 2.3166075437200785e-07, "loss": 0.2473, "step": 7588 }, { "epoch": 6.388047138047138, "grad_norm": 0.41804951429367065, "learning_rate": 2.3102970454871587e-07, "loss": 0.2514, "step": 7589 }, { "epoch": 6.388888888888889, "grad_norm": 0.4608636498451233, "learning_rate": 2.3039949506883542e-07, "loss": 0.2549, "step": 7590 }, { "epoch": 6.38973063973064, "grad_norm": 0.4299733638763428, "learning_rate": 2.297701260434132e-07, "loss": 0.253, "step": 7591 }, { "epoch": 6.390572390572391, "grad_norm": 0.43627363443374634, "learning_rate": 2.291415975833533e-07, "loss": 0.2309, "step": 7592 }, { "epoch": 6.391414141414142, "grad_norm": 0.42021888494491577, "learning_rate": 2.2851390979940825e-07, "loss": 0.2424, "step": 7593 }, { "epoch": 6.392255892255892, "grad_norm": 0.4249880611896515, "learning_rate": 2.278870628021823e-07, "loss": 0.2359, "step": 7594 }, { "epoch": 6.393097643097643, "grad_norm": 0.42597416043281555, "learning_rate": 2.2726105670213327e-07, "loss": 0.2486, "step": 7595 }, { "epoch": 6.393939393939394, "grad_norm": 0.40035927295684814, "learning_rate": 2.2663589160957067e-07, "loss": 0.2466, "step": 7596 }, { "epoch": 6.3947811447811445, "grad_norm": 0.42242518067359924, "learning_rate": 2.2601156763465525e-07, "loss": 0.2242, "step": 7597 }, { "epoch": 6.395622895622895, "grad_norm": 0.4366401731967926, "learning_rate": 2.2538808488739906e-07, "loss": 0.2372, "step": 7598 }, { "epoch": 6.396464646464646, "grad_norm": 0.41982993483543396, "learning_rate": 2.2476544347766581e-07, "loss": 0.2302, "step": 7599 }, { "epoch": 6.397306397306397, "grad_norm": 0.4400941729545593, "learning_rate": 2.241436435151717e-07, "loss": 0.2241, "step": 7600 }, { "epoch": 6.398148148148148, "grad_norm": 0.47041770815849304, "learning_rate": 2.2352268510948627e-07, "loss": 0.2287, "step": 7601 }, { "epoch": 6.398989898989899, "grad_norm": 0.4073878526687622, "learning_rate": 2.2290256837002755e-07, "loss": 0.2359, "step": 7602 }, { "epoch": 6.39983164983165, "grad_norm": 0.46510255336761475, "learning_rate": 2.2228329340606647e-07, "loss": 0.2698, "step": 7603 }, { "epoch": 6.4006734006734005, "grad_norm": 0.4311455190181732, "learning_rate": 2.2166486032672685e-07, "loss": 0.2847, "step": 7604 }, { "epoch": 6.401515151515151, "grad_norm": 0.41456085443496704, "learning_rate": 2.2104726924098207e-07, "loss": 0.252, "step": 7605 }, { "epoch": 6.402356902356902, "grad_norm": 0.4444170594215393, "learning_rate": 2.2043052025765954e-07, "loss": 0.2434, "step": 7606 }, { "epoch": 6.403198653198653, "grad_norm": 0.43949925899505615, "learning_rate": 2.198146134854351e-07, "loss": 0.237, "step": 7607 }, { "epoch": 6.404040404040404, "grad_norm": 0.42982184886932373, "learning_rate": 2.191995490328408e-07, "loss": 0.2398, "step": 7608 }, { "epoch": 6.404882154882155, "grad_norm": 0.44986721873283386, "learning_rate": 2.1858532700825552e-07, "loss": 0.263, "step": 7609 }, { "epoch": 6.405723905723906, "grad_norm": 0.44311782717704773, "learning_rate": 2.1797194751991214e-07, "loss": 0.2342, "step": 7610 }, { "epoch": 6.406565656565657, "grad_norm": 0.4696143567562103, "learning_rate": 2.1735941067589527e-07, "loss": 0.2368, "step": 7611 }, { "epoch": 6.407407407407407, "grad_norm": 0.4366375803947449, "learning_rate": 2.1674771658413918e-07, "loss": 0.2464, "step": 7612 }, { "epoch": 6.408249158249158, "grad_norm": 0.4468158185482025, "learning_rate": 2.161368653524326e-07, "loss": 0.2247, "step": 7613 }, { "epoch": 6.409090909090909, "grad_norm": 0.4277539551258087, "learning_rate": 2.1552685708841336e-07, "loss": 0.2099, "step": 7614 }, { "epoch": 6.40993265993266, "grad_norm": 0.44235673546791077, "learning_rate": 2.149176918995699e-07, "loss": 0.2434, "step": 7615 }, { "epoch": 6.410774410774411, "grad_norm": 0.43901577591896057, "learning_rate": 2.1430936989324524e-07, "loss": 0.2505, "step": 7616 }, { "epoch": 6.411616161616162, "grad_norm": 0.42384517192840576, "learning_rate": 2.1370189117663254e-07, "loss": 0.2432, "step": 7617 }, { "epoch": 6.412457912457913, "grad_norm": 0.4278242290019989, "learning_rate": 2.130952558567756e-07, "loss": 0.2286, "step": 7618 }, { "epoch": 6.4132996632996635, "grad_norm": 0.4081309735774994, "learning_rate": 2.1248946404056947e-07, "loss": 0.2597, "step": 7619 }, { "epoch": 6.414141414141414, "grad_norm": 0.4434566795825958, "learning_rate": 2.1188451583476043e-07, "loss": 0.243, "step": 7620 }, { "epoch": 6.414983164983165, "grad_norm": 0.4157750606536865, "learning_rate": 2.1128041134594825e-07, "loss": 0.2559, "step": 7621 }, { "epoch": 6.415824915824916, "grad_norm": 0.4535045325756073, "learning_rate": 2.106771506805827e-07, "loss": 0.2604, "step": 7622 }, { "epoch": 6.416666666666667, "grad_norm": 0.42027735710144043, "learning_rate": 2.1007473394496435e-07, "loss": 0.2509, "step": 7623 }, { "epoch": 6.417508417508418, "grad_norm": 0.4033581614494324, "learning_rate": 2.0947316124524498e-07, "loss": 0.2653, "step": 7624 }, { "epoch": 6.418350168350169, "grad_norm": 0.41627049446105957, "learning_rate": 2.0887243268742862e-07, "loss": 0.2439, "step": 7625 }, { "epoch": 6.41919191919192, "grad_norm": 0.417153924703598, "learning_rate": 2.0827254837736898e-07, "loss": 0.2583, "step": 7626 }, { "epoch": 6.42003367003367, "grad_norm": 0.4672709107398987, "learning_rate": 2.0767350842077373e-07, "loss": 0.2517, "step": 7627 }, { "epoch": 6.420875420875421, "grad_norm": 0.4058259129524231, "learning_rate": 2.0707531292319949e-07, "loss": 0.2537, "step": 7628 }, { "epoch": 6.421717171717171, "grad_norm": 0.4689287841320038, "learning_rate": 2.0647796199005366e-07, "loss": 0.2032, "step": 7629 }, { "epoch": 6.422558922558922, "grad_norm": 0.3967859745025635, "learning_rate": 2.0588145572659757e-07, "loss": 0.2465, "step": 7630 }, { "epoch": 6.423400673400673, "grad_norm": 0.42120760679244995, "learning_rate": 2.0528579423794103e-07, "loss": 0.233, "step": 7631 }, { "epoch": 6.424242424242424, "grad_norm": 0.40904635190963745, "learning_rate": 2.046909776290451e-07, "loss": 0.2483, "step": 7632 }, { "epoch": 6.425084175084175, "grad_norm": 0.42674294114112854, "learning_rate": 2.0409700600472427e-07, "loss": 0.2288, "step": 7633 }, { "epoch": 6.425925925925926, "grad_norm": 0.40321314334869385, "learning_rate": 2.0350387946964256e-07, "loss": 0.2514, "step": 7634 }, { "epoch": 6.4267676767676765, "grad_norm": 0.391603946685791, "learning_rate": 2.0291159812831473e-07, "loss": 0.267, "step": 7635 }, { "epoch": 6.427609427609427, "grad_norm": 0.43431001901626587, "learning_rate": 2.0232016208510675e-07, "loss": 0.238, "step": 7636 }, { "epoch": 6.428451178451178, "grad_norm": 0.4296516478061676, "learning_rate": 2.0172957144423522e-07, "loss": 0.2281, "step": 7637 }, { "epoch": 6.429292929292929, "grad_norm": 0.44292977452278137, "learning_rate": 2.011398263097708e-07, "loss": 0.2293, "step": 7638 }, { "epoch": 6.43013468013468, "grad_norm": 0.40136733651161194, "learning_rate": 2.0055092678563204e-07, "loss": 0.2598, "step": 7639 }, { "epoch": 6.430976430976431, "grad_norm": 0.41876548528671265, "learning_rate": 1.999628729755887e-07, "loss": 0.2419, "step": 7640 }, { "epoch": 6.431818181818182, "grad_norm": 0.38291212916374207, "learning_rate": 1.9937566498326178e-07, "loss": 0.2569, "step": 7641 }, { "epoch": 6.4326599326599325, "grad_norm": 0.4402828514575958, "learning_rate": 1.9878930291212407e-07, "loss": 0.2498, "step": 7642 }, { "epoch": 6.433501683501683, "grad_norm": 0.4306812882423401, "learning_rate": 1.9820378686549957e-07, "loss": 0.2269, "step": 7643 }, { "epoch": 6.434343434343434, "grad_norm": 0.44726210832595825, "learning_rate": 1.9761911694656187e-07, "loss": 0.2173, "step": 7644 }, { "epoch": 6.435185185185185, "grad_norm": 0.45195305347442627, "learning_rate": 1.9703529325833525e-07, "loss": 0.2419, "step": 7645 }, { "epoch": 6.436026936026936, "grad_norm": 0.39761292934417725, "learning_rate": 1.9645231590369685e-07, "loss": 0.2422, "step": 7646 }, { "epoch": 6.436868686868687, "grad_norm": 0.41568028926849365, "learning_rate": 1.9587018498537336e-07, "loss": 0.2666, "step": 7647 }, { "epoch": 6.437710437710438, "grad_norm": 0.4363856017589569, "learning_rate": 1.952889006059411e-07, "loss": 0.2408, "step": 7648 }, { "epoch": 6.438552188552189, "grad_norm": 0.42029857635498047, "learning_rate": 1.9470846286783084e-07, "loss": 0.2517, "step": 7649 }, { "epoch": 6.4393939393939394, "grad_norm": 0.3895813822746277, "learning_rate": 1.9412887187331975e-07, "loss": 0.2448, "step": 7650 }, { "epoch": 6.44023569023569, "grad_norm": 0.44493576884269714, "learning_rate": 1.935501277245394e-07, "loss": 0.2562, "step": 7651 }, { "epoch": 6.441077441077441, "grad_norm": 0.40912094712257385, "learning_rate": 1.929722305234699e-07, "loss": 0.2422, "step": 7652 }, { "epoch": 6.441919191919192, "grad_norm": 0.3911171555519104, "learning_rate": 1.9239518037194317e-07, "loss": 0.2635, "step": 7653 }, { "epoch": 6.442760942760943, "grad_norm": 0.48098599910736084, "learning_rate": 1.9181897737164113e-07, "loss": 0.236, "step": 7654 }, { "epoch": 6.443602693602694, "grad_norm": 0.4477013945579529, "learning_rate": 1.912436216240976e-07, "loss": 0.2285, "step": 7655 }, { "epoch": 6.444444444444445, "grad_norm": 0.40850114822387695, "learning_rate": 1.9066911323069648e-07, "loss": 0.2303, "step": 7656 }, { "epoch": 6.4452861952861955, "grad_norm": 0.4518074095249176, "learning_rate": 1.9009545229267234e-07, "loss": 0.2398, "step": 7657 }, { "epoch": 6.446127946127946, "grad_norm": 0.4347514808177948, "learning_rate": 1.8952263891110877e-07, "loss": 0.2222, "step": 7658 }, { "epoch": 6.446969696969697, "grad_norm": 0.44073158502578735, "learning_rate": 1.8895067318694283e-07, "loss": 0.2431, "step": 7659 }, { "epoch": 6.447811447811448, "grad_norm": 0.4468860924243927, "learning_rate": 1.8837955522096162e-07, "loss": 0.2134, "step": 7660 }, { "epoch": 6.448653198653199, "grad_norm": 0.4533787667751312, "learning_rate": 1.8780928511380138e-07, "loss": 0.2371, "step": 7661 }, { "epoch": 6.44949494949495, "grad_norm": 0.4072078764438629, "learning_rate": 1.8723986296594941e-07, "loss": 0.2428, "step": 7662 }, { "epoch": 6.450336700336701, "grad_norm": 0.4329415261745453, "learning_rate": 1.86671288877745e-07, "loss": 0.2127, "step": 7663 }, { "epoch": 6.451178451178452, "grad_norm": 0.4606415033340454, "learning_rate": 1.861035629493757e-07, "loss": 0.2267, "step": 7664 }, { "epoch": 6.452020202020202, "grad_norm": 0.43526485562324524, "learning_rate": 1.8553668528088208e-07, "loss": 0.2641, "step": 7665 }, { "epoch": 6.452861952861953, "grad_norm": 0.47079527378082275, "learning_rate": 1.8497065597215423e-07, "loss": 0.228, "step": 7666 }, { "epoch": 6.453703703703704, "grad_norm": 0.4743163287639618, "learning_rate": 1.8440547512293073e-07, "loss": 0.2356, "step": 7667 }, { "epoch": 6.454545454545454, "grad_norm": 0.4553942084312439, "learning_rate": 1.838411428328041e-07, "loss": 0.2528, "step": 7668 }, { "epoch": 6.455387205387205, "grad_norm": 0.41381770372390747, "learning_rate": 1.8327765920121476e-07, "loss": 0.2686, "step": 7669 }, { "epoch": 6.456228956228956, "grad_norm": 0.4518068730831146, "learning_rate": 1.827150243274556e-07, "loss": 0.2271, "step": 7670 }, { "epoch": 6.457070707070707, "grad_norm": 0.46521180868148804, "learning_rate": 1.8215323831066778e-07, "loss": 0.2116, "step": 7671 }, { "epoch": 6.457912457912458, "grad_norm": 0.4450235366821289, "learning_rate": 1.8159230124984495e-07, "loss": 0.2295, "step": 7672 }, { "epoch": 6.4587542087542085, "grad_norm": 0.42912524938583374, "learning_rate": 1.810322132438297e-07, "loss": 0.2322, "step": 7673 }, { "epoch": 6.459595959595959, "grad_norm": 0.4669003486633301, "learning_rate": 1.8047297439131473e-07, "loss": 0.2231, "step": 7674 }, { "epoch": 6.46043771043771, "grad_norm": 0.4153645932674408, "learning_rate": 1.799145847908451e-07, "loss": 0.2324, "step": 7675 }, { "epoch": 6.461279461279461, "grad_norm": 0.44447576999664307, "learning_rate": 1.7935704454081493e-07, "loss": 0.2259, "step": 7676 }, { "epoch": 6.462121212121212, "grad_norm": 0.4140705168247223, "learning_rate": 1.7880035373946836e-07, "loss": 0.2395, "step": 7677 }, { "epoch": 6.462962962962963, "grad_norm": 0.4428408741950989, "learning_rate": 1.7824451248490026e-07, "loss": 0.2632, "step": 7678 }, { "epoch": 6.463804713804714, "grad_norm": 0.4207862317562103, "learning_rate": 1.7768952087505508e-07, "loss": 0.2648, "step": 7679 }, { "epoch": 6.4646464646464645, "grad_norm": 0.41288313269615173, "learning_rate": 1.771353790077296e-07, "loss": 0.2835, "step": 7680 }, { "epoch": 6.465488215488215, "grad_norm": 0.43601107597351074, "learning_rate": 1.76582086980569e-07, "loss": 0.2526, "step": 7681 }, { "epoch": 6.466329966329966, "grad_norm": 0.4382724463939667, "learning_rate": 1.7602964489106977e-07, "loss": 0.2342, "step": 7682 }, { "epoch": 6.467171717171717, "grad_norm": 0.43643391132354736, "learning_rate": 1.7547805283657625e-07, "loss": 0.2223, "step": 7683 }, { "epoch": 6.468013468013468, "grad_norm": 0.504719078540802, "learning_rate": 1.7492731091428684e-07, "loss": 0.2322, "step": 7684 }, { "epoch": 6.468855218855219, "grad_norm": 0.44187548756599426, "learning_rate": 1.743774192212472e-07, "loss": 0.259, "step": 7685 }, { "epoch": 6.46969696969697, "grad_norm": 0.4048267602920532, "learning_rate": 1.7382837785435425e-07, "loss": 0.2259, "step": 7686 }, { "epoch": 6.470538720538721, "grad_norm": 0.42428573966026306, "learning_rate": 1.7328018691035564e-07, "loss": 0.2372, "step": 7687 }, { "epoch": 6.4713804713804715, "grad_norm": 0.4021638333797455, "learning_rate": 1.7273284648584686e-07, "loss": 0.2529, "step": 7688 }, { "epoch": 6.472222222222222, "grad_norm": 0.4180724024772644, "learning_rate": 1.7218635667727635e-07, "loss": 0.276, "step": 7689 }, { "epoch": 6.473063973063973, "grad_norm": 0.4024483859539032, "learning_rate": 1.716407175809409e-07, "loss": 0.2603, "step": 7690 }, { "epoch": 6.473905723905724, "grad_norm": 0.42993399500846863, "learning_rate": 1.710959292929887e-07, "loss": 0.2282, "step": 7691 }, { "epoch": 6.474747474747475, "grad_norm": 0.44724568724632263, "learning_rate": 1.7055199190941618e-07, "loss": 0.2294, "step": 7692 }, { "epoch": 6.475589225589226, "grad_norm": 0.4228421747684479, "learning_rate": 1.700089055260723e-07, "loss": 0.2198, "step": 7693 }, { "epoch": 6.476430976430977, "grad_norm": 0.3868064284324646, "learning_rate": 1.6946667023865327e-07, "loss": 0.2593, "step": 7694 }, { "epoch": 6.4772727272727275, "grad_norm": 0.41794031858444214, "learning_rate": 1.689252861427071e-07, "loss": 0.2199, "step": 7695 }, { "epoch": 6.478114478114478, "grad_norm": 0.41060131788253784, "learning_rate": 1.6838475333363247e-07, "loss": 0.2535, "step": 7696 }, { "epoch": 6.478956228956229, "grad_norm": 0.4659389555454254, "learning_rate": 1.6784507190667542e-07, "loss": 0.2494, "step": 7697 }, { "epoch": 6.47979797979798, "grad_norm": 0.40724632143974304, "learning_rate": 1.6730624195693434e-07, "loss": 0.2416, "step": 7698 }, { "epoch": 6.480639730639731, "grad_norm": 0.4343944191932678, "learning_rate": 1.6676826357935773e-07, "loss": 0.2789, "step": 7699 }, { "epoch": 6.481481481481482, "grad_norm": 0.44551435112953186, "learning_rate": 1.662311368687408e-07, "loss": 0.2396, "step": 7700 }, { "epoch": 6.482323232323233, "grad_norm": 0.4377487599849701, "learning_rate": 1.6569486191973294e-07, "loss": 0.2427, "step": 7701 }, { "epoch": 6.483164983164984, "grad_norm": 0.4241126775741577, "learning_rate": 1.6515943882683127e-07, "loss": 0.265, "step": 7702 }, { "epoch": 6.4840067340067336, "grad_norm": 0.4183051586151123, "learning_rate": 1.6462486768438314e-07, "loss": 0.2795, "step": 7703 }, { "epoch": 6.484848484848484, "grad_norm": 0.45959317684173584, "learning_rate": 1.640911485865848e-07, "loss": 0.2407, "step": 7704 }, { "epoch": 6.485690235690235, "grad_norm": 0.3980998694896698, "learning_rate": 1.6355828162748333e-07, "loss": 0.2258, "step": 7705 }, { "epoch": 6.486531986531986, "grad_norm": 0.4320000112056732, "learning_rate": 1.6302626690097633e-07, "loss": 0.2331, "step": 7706 }, { "epoch": 6.487373737373737, "grad_norm": 0.4157402813434601, "learning_rate": 1.6249510450081051e-07, "loss": 0.2453, "step": 7707 }, { "epoch": 6.488215488215488, "grad_norm": 0.4211595356464386, "learning_rate": 1.6196479452058155e-07, "loss": 0.2457, "step": 7708 }, { "epoch": 6.489057239057239, "grad_norm": 0.42507264018058777, "learning_rate": 1.6143533705373582e-07, "loss": 0.2199, "step": 7709 }, { "epoch": 6.48989898989899, "grad_norm": 0.42606863379478455, "learning_rate": 1.6090673219357033e-07, "loss": 0.2147, "step": 7710 }, { "epoch": 6.4907407407407405, "grad_norm": 0.41840246319770813, "learning_rate": 1.603789800332295e-07, "loss": 0.2453, "step": 7711 }, { "epoch": 6.491582491582491, "grad_norm": 0.42477619647979736, "learning_rate": 1.5985208066571056e-07, "loss": 0.2403, "step": 7712 }, { "epoch": 6.492424242424242, "grad_norm": 0.4214938282966614, "learning_rate": 1.5932603418385705e-07, "loss": 0.254, "step": 7713 }, { "epoch": 6.493265993265993, "grad_norm": 0.40029817819595337, "learning_rate": 1.588008406803654e-07, "loss": 0.2673, "step": 7714 }, { "epoch": 6.494107744107744, "grad_norm": 0.383249968290329, "learning_rate": 1.5827650024777984e-07, "loss": 0.2625, "step": 7715 }, { "epoch": 6.494949494949495, "grad_norm": 0.44548335671424866, "learning_rate": 1.5775301297849433e-07, "loss": 0.257, "step": 7716 }, { "epoch": 6.495791245791246, "grad_norm": 0.4018527865409851, "learning_rate": 1.572303789647528e-07, "loss": 0.245, "step": 7717 }, { "epoch": 6.4966329966329965, "grad_norm": 0.4152697026729584, "learning_rate": 1.567085982986488e-07, "loss": 0.2538, "step": 7718 }, { "epoch": 6.497474747474747, "grad_norm": 0.44004589319229126, "learning_rate": 1.5618767107212718e-07, "loss": 0.261, "step": 7719 }, { "epoch": 6.498316498316498, "grad_norm": 0.417663037776947, "learning_rate": 1.5566759737698002e-07, "loss": 0.2503, "step": 7720 }, { "epoch": 6.499158249158249, "grad_norm": 0.3883981704711914, "learning_rate": 1.5514837730484845e-07, "loss": 0.2552, "step": 7721 }, { "epoch": 6.5, "grad_norm": 0.44686487317085266, "learning_rate": 1.5463001094722653e-07, "loss": 0.2765, "step": 7722 }, { "epoch": 6.500841750841751, "grad_norm": 0.4116784930229187, "learning_rate": 1.541124983954556e-07, "loss": 0.2399, "step": 7723 }, { "epoch": 6.501683501683502, "grad_norm": 0.4304162263870239, "learning_rate": 1.5359583974072612e-07, "loss": 0.294, "step": 7724 }, { "epoch": 6.502525252525253, "grad_norm": 0.4183177649974823, "learning_rate": 1.5308003507407963e-07, "loss": 0.2458, "step": 7725 }, { "epoch": 6.5033670033670035, "grad_norm": 0.45919713377952576, "learning_rate": 1.5256508448640518e-07, "loss": 0.2512, "step": 7726 }, { "epoch": 6.504208754208754, "grad_norm": 0.40858370065689087, "learning_rate": 1.5205098806844344e-07, "loss": 0.2448, "step": 7727 }, { "epoch": 6.505050505050505, "grad_norm": 0.4197198450565338, "learning_rate": 1.5153774591078419e-07, "loss": 0.2789, "step": 7728 }, { "epoch": 6.505892255892256, "grad_norm": 0.4270916283130646, "learning_rate": 1.5102535810386566e-07, "loss": 0.2406, "step": 7729 }, { "epoch": 6.506734006734007, "grad_norm": 0.4049086570739746, "learning_rate": 1.505138247379756e-07, "loss": 0.2703, "step": 7730 }, { "epoch": 6.507575757575758, "grad_norm": 0.39468494057655334, "learning_rate": 1.500031459032525e-07, "loss": 0.2167, "step": 7731 }, { "epoch": 6.508417508417509, "grad_norm": 0.4182526469230652, "learning_rate": 1.4949332168968324e-07, "loss": 0.2523, "step": 7732 }, { "epoch": 6.5092592592592595, "grad_norm": 0.4183179438114166, "learning_rate": 1.489843521871026e-07, "loss": 0.2895, "step": 7733 }, { "epoch": 6.51010101010101, "grad_norm": 0.4671708643436432, "learning_rate": 1.484762374851989e-07, "loss": 0.2239, "step": 7734 }, { "epoch": 6.510942760942761, "grad_norm": 0.4113391935825348, "learning_rate": 1.4796897767350603e-07, "loss": 0.2305, "step": 7735 }, { "epoch": 6.511784511784512, "grad_norm": 0.4334380030632019, "learning_rate": 1.474625728414092e-07, "loss": 0.2284, "step": 7736 }, { "epoch": 6.512626262626263, "grad_norm": 0.42983072996139526, "learning_rate": 1.4695702307814198e-07, "loss": 0.2212, "step": 7737 }, { "epoch": 6.513468013468014, "grad_norm": 0.4082311987876892, "learning_rate": 1.46452328472787e-07, "loss": 0.2411, "step": 7738 }, { "epoch": 6.514309764309765, "grad_norm": 0.4328223764896393, "learning_rate": 1.4594848911427706e-07, "loss": 0.2343, "step": 7739 }, { "epoch": 6.515151515151516, "grad_norm": 0.384546160697937, "learning_rate": 1.4544550509139554e-07, "loss": 0.268, "step": 7740 }, { "epoch": 6.5159932659932664, "grad_norm": 0.4182617664337158, "learning_rate": 1.4494337649277213e-07, "loss": 0.2538, "step": 7741 }, { "epoch": 6.516835016835017, "grad_norm": 0.42303839325904846, "learning_rate": 1.4444210340688826e-07, "loss": 0.2443, "step": 7742 }, { "epoch": 6.517676767676767, "grad_norm": 0.4098053574562073, "learning_rate": 1.4394168592207102e-07, "loss": 0.2372, "step": 7743 }, { "epoch": 6.518518518518518, "grad_norm": 0.4033154249191284, "learning_rate": 1.4344212412650272e-07, "loss": 0.2601, "step": 7744 }, { "epoch": 6.519360269360269, "grad_norm": 0.41378071904182434, "learning_rate": 1.4294341810820956e-07, "loss": 0.2391, "step": 7745 }, { "epoch": 6.52020202020202, "grad_norm": 0.4294326603412628, "learning_rate": 1.4244556795506959e-07, "loss": 0.2606, "step": 7746 }, { "epoch": 6.521043771043771, "grad_norm": 0.39769840240478516, "learning_rate": 1.4194857375480875e-07, "loss": 0.2724, "step": 7747 }, { "epoch": 6.521885521885522, "grad_norm": 0.4216866195201874, "learning_rate": 1.414524355950031e-07, "loss": 0.2422, "step": 7748 }, { "epoch": 6.5227272727272725, "grad_norm": 0.4176965653896332, "learning_rate": 1.409571535630766e-07, "loss": 0.2601, "step": 7749 }, { "epoch": 6.523569023569023, "grad_norm": 0.41026318073272705, "learning_rate": 1.4046272774630497e-07, "loss": 0.2677, "step": 7750 }, { "epoch": 6.524410774410774, "grad_norm": 0.4103512167930603, "learning_rate": 1.399691582318091e-07, "loss": 0.262, "step": 7751 }, { "epoch": 6.525252525252525, "grad_norm": 0.417612224817276, "learning_rate": 1.3947644510656333e-07, "loss": 0.2683, "step": 7752 }, { "epoch": 6.526094276094276, "grad_norm": 0.44057828187942505, "learning_rate": 1.3898458845738817e-07, "loss": 0.2247, "step": 7753 }, { "epoch": 6.526936026936027, "grad_norm": 0.40811365842819214, "learning_rate": 1.3849358837095317e-07, "loss": 0.2661, "step": 7754 }, { "epoch": 6.527777777777778, "grad_norm": 0.41804975271224976, "learning_rate": 1.3800344493377914e-07, "loss": 0.2304, "step": 7755 }, { "epoch": 6.5286195286195285, "grad_norm": 0.40991050004959106, "learning_rate": 1.3751415823223313e-07, "loss": 0.2706, "step": 7756 }, { "epoch": 6.529461279461279, "grad_norm": 0.38034215569496155, "learning_rate": 1.370257283525339e-07, "loss": 0.2566, "step": 7757 }, { "epoch": 6.53030303030303, "grad_norm": 0.40682026743888855, "learning_rate": 1.3653815538074823e-07, "loss": 0.2391, "step": 7758 }, { "epoch": 6.531144781144781, "grad_norm": 0.4160152077674866, "learning_rate": 1.3605143940278953e-07, "loss": 0.221, "step": 7759 }, { "epoch": 6.531986531986532, "grad_norm": 0.406209260225296, "learning_rate": 1.3556558050442426e-07, "loss": 0.2464, "step": 7760 }, { "epoch": 6.532828282828283, "grad_norm": 0.3974800109863281, "learning_rate": 1.3508057877126614e-07, "loss": 0.2614, "step": 7761 }, { "epoch": 6.533670033670034, "grad_norm": 0.3968677818775177, "learning_rate": 1.3459643428877678e-07, "loss": 0.2313, "step": 7762 }, { "epoch": 6.534511784511785, "grad_norm": 0.39506015181541443, "learning_rate": 1.3411314714226798e-07, "loss": 0.2308, "step": 7763 }, { "epoch": 6.5353535353535355, "grad_norm": 0.43584850430488586, "learning_rate": 1.3363071741689937e-07, "loss": 0.2595, "step": 7764 }, { "epoch": 6.536195286195286, "grad_norm": 0.4223094880580902, "learning_rate": 1.3314914519768073e-07, "loss": 0.2469, "step": 7765 }, { "epoch": 6.537037037037037, "grad_norm": 0.39219796657562256, "learning_rate": 1.3266843056947087e-07, "loss": 0.2571, "step": 7766 }, { "epoch": 6.537878787878788, "grad_norm": 0.3961121737957001, "learning_rate": 1.3218857361697645e-07, "loss": 0.255, "step": 7767 }, { "epoch": 6.538720538720539, "grad_norm": 0.42203617095947266, "learning_rate": 1.3170957442475208e-07, "loss": 0.2371, "step": 7768 }, { "epoch": 6.53956228956229, "grad_norm": 0.4027787446975708, "learning_rate": 1.312314330772041e-07, "loss": 0.2516, "step": 7769 }, { "epoch": 6.540404040404041, "grad_norm": 0.40656253695487976, "learning_rate": 1.3075414965858568e-07, "loss": 0.2688, "step": 7770 }, { "epoch": 6.5412457912457915, "grad_norm": 0.3936285376548767, "learning_rate": 1.30277724252999e-07, "loss": 0.2358, "step": 7771 }, { "epoch": 6.542087542087542, "grad_norm": 0.38693657517433167, "learning_rate": 1.2980215694439636e-07, "loss": 0.2451, "step": 7772 }, { "epoch": 6.542929292929293, "grad_norm": 0.43062329292297363, "learning_rate": 1.293274478165757e-07, "loss": 0.2443, "step": 7773 }, { "epoch": 6.543771043771044, "grad_norm": 0.4312251806259155, "learning_rate": 1.2885359695318734e-07, "loss": 0.2399, "step": 7774 }, { "epoch": 6.544612794612795, "grad_norm": 0.38979431986808777, "learning_rate": 1.2838060443772837e-07, "loss": 0.2698, "step": 7775 }, { "epoch": 6.545454545454545, "grad_norm": 0.45070797204971313, "learning_rate": 1.2790847035354604e-07, "loss": 0.2239, "step": 7776 }, { "epoch": 6.546296296296296, "grad_norm": 0.46040838956832886, "learning_rate": 1.2743719478383377e-07, "loss": 0.2421, "step": 7777 }, { "epoch": 6.547138047138047, "grad_norm": 0.40883564949035645, "learning_rate": 1.269667778116368e-07, "loss": 0.2461, "step": 7778 }, { "epoch": 6.547979797979798, "grad_norm": 0.3995952904224396, "learning_rate": 1.2649721951984717e-07, "loss": 0.2454, "step": 7779 }, { "epoch": 6.548821548821548, "grad_norm": 0.4130491614341736, "learning_rate": 1.2602851999120592e-07, "loss": 0.247, "step": 7780 }, { "epoch": 6.549663299663299, "grad_norm": 0.4725889563560486, "learning_rate": 1.2556067930830196e-07, "loss": 0.2108, "step": 7781 }, { "epoch": 6.55050505050505, "grad_norm": 0.4693812131881714, "learning_rate": 1.2509369755357548e-07, "loss": 0.2526, "step": 7782 }, { "epoch": 6.551346801346801, "grad_norm": 0.42513132095336914, "learning_rate": 1.2462757480931342e-07, "loss": 0.2668, "step": 7783 }, { "epoch": 6.552188552188552, "grad_norm": 0.4435824453830719, "learning_rate": 1.2416231115765066e-07, "loss": 0.2289, "step": 7784 }, { "epoch": 6.553030303030303, "grad_norm": 0.4208647310733795, "learning_rate": 1.2369790668057157e-07, "loss": 0.2537, "step": 7785 }, { "epoch": 6.553872053872054, "grad_norm": 0.40599173307418823, "learning_rate": 1.2323436145990963e-07, "loss": 0.2515, "step": 7786 }, { "epoch": 6.5547138047138045, "grad_norm": 0.4325297474861145, "learning_rate": 1.2277167557734726e-07, "loss": 0.2635, "step": 7787 }, { "epoch": 6.555555555555555, "grad_norm": 0.4404585659503937, "learning_rate": 1.2230984911441369e-07, "loss": 0.2326, "step": 7788 }, { "epoch": 6.556397306397306, "grad_norm": 0.4003777801990509, "learning_rate": 1.2184888215248712e-07, "loss": 0.2374, "step": 7789 }, { "epoch": 6.557239057239057, "grad_norm": 0.43584683537483215, "learning_rate": 1.21388774772796e-07, "loss": 0.2406, "step": 7790 }, { "epoch": 6.558080808080808, "grad_norm": 0.4174977242946625, "learning_rate": 1.2092952705641538e-07, "loss": 0.2481, "step": 7791 }, { "epoch": 6.558922558922559, "grad_norm": 0.43031173944473267, "learning_rate": 1.2047113908427056e-07, "loss": 0.2446, "step": 7792 }, { "epoch": 6.55976430976431, "grad_norm": 0.4080667495727539, "learning_rate": 1.200136109371336e-07, "loss": 0.2727, "step": 7793 }, { "epoch": 6.5606060606060606, "grad_norm": 0.40513506531715393, "learning_rate": 1.1955694269562556e-07, "loss": 0.2397, "step": 7794 }, { "epoch": 6.561447811447811, "grad_norm": 0.4554062485694885, "learning_rate": 1.1910113444021709e-07, "loss": 0.2429, "step": 7795 }, { "epoch": 6.562289562289562, "grad_norm": 0.45296162366867065, "learning_rate": 1.1864618625122615e-07, "loss": 0.2542, "step": 7796 }, { "epoch": 6.563131313131313, "grad_norm": 0.4361836612224579, "learning_rate": 1.1819209820881861e-07, "loss": 0.2419, "step": 7797 }, { "epoch": 6.563973063973064, "grad_norm": 0.47644996643066406, "learning_rate": 1.1773887039301046e-07, "loss": 0.235, "step": 7798 }, { "epoch": 6.564814814814815, "grad_norm": 0.41302353143692017, "learning_rate": 1.1728650288366616e-07, "loss": 0.2401, "step": 7799 }, { "epoch": 6.565656565656566, "grad_norm": 0.404610276222229, "learning_rate": 1.1683499576049583e-07, "loss": 0.2668, "step": 7800 }, { "epoch": 6.566498316498317, "grad_norm": 0.43761223554611206, "learning_rate": 1.163843491030614e-07, "loss": 0.2206, "step": 7801 }, { "epoch": 6.5673400673400675, "grad_norm": 0.39359235763549805, "learning_rate": 1.1593456299077044e-07, "loss": 0.2483, "step": 7802 }, { "epoch": 6.568181818181818, "grad_norm": 0.4260510504245758, "learning_rate": 1.1548563750288011e-07, "loss": 0.264, "step": 7803 }, { "epoch": 6.569023569023569, "grad_norm": 0.433437705039978, "learning_rate": 1.1503757271849714e-07, "loss": 0.2246, "step": 7804 }, { "epoch": 6.56986531986532, "grad_norm": 0.4278266727924347, "learning_rate": 1.1459036871657448e-07, "loss": 0.2511, "step": 7805 }, { "epoch": 6.570707070707071, "grad_norm": 0.4097703695297241, "learning_rate": 1.1414402557591409e-07, "loss": 0.2561, "step": 7806 }, { "epoch": 6.571548821548822, "grad_norm": 0.43761444091796875, "learning_rate": 1.1369854337516639e-07, "loss": 0.2568, "step": 7807 }, { "epoch": 6.572390572390573, "grad_norm": 0.40628519654273987, "learning_rate": 1.1325392219283026e-07, "loss": 0.2333, "step": 7808 }, { "epoch": 6.5732323232323235, "grad_norm": 0.47240325808525085, "learning_rate": 1.1281016210725304e-07, "loss": 0.2721, "step": 7809 }, { "epoch": 6.574074074074074, "grad_norm": 0.4110891819000244, "learning_rate": 1.1236726319662994e-07, "loss": 0.2665, "step": 7810 }, { "epoch": 6.574915824915825, "grad_norm": 0.4259273409843445, "learning_rate": 1.1192522553900298e-07, "loss": 0.2212, "step": 7811 }, { "epoch": 6.575757575757576, "grad_norm": 0.4451664984226227, "learning_rate": 1.1148404921226597e-07, "loss": 0.2502, "step": 7812 }, { "epoch": 6.576599326599327, "grad_norm": 0.4445432722568512, "learning_rate": 1.1104373429415783e-07, "loss": 0.2316, "step": 7813 }, { "epoch": 6.577441077441078, "grad_norm": 0.4359075129032135, "learning_rate": 1.1060428086226705e-07, "loss": 0.2246, "step": 7814 }, { "epoch": 6.578282828282829, "grad_norm": 0.4817937910556793, "learning_rate": 1.1016568899402947e-07, "loss": 0.2455, "step": 7815 }, { "epoch": 6.57912457912458, "grad_norm": 0.44227591156959534, "learning_rate": 1.0972795876673103e-07, "loss": 0.2671, "step": 7816 }, { "epoch": 6.5799663299663305, "grad_norm": 0.4653630256652832, "learning_rate": 1.0929109025750284e-07, "loss": 0.2345, "step": 7817 }, { "epoch": 6.58080808080808, "grad_norm": 0.41055572032928467, "learning_rate": 1.0885508354332608e-07, "loss": 0.2583, "step": 7818 }, { "epoch": 6.581649831649831, "grad_norm": 0.405543714761734, "learning_rate": 1.0841993870103041e-07, "loss": 0.2323, "step": 7819 }, { "epoch": 6.582491582491582, "grad_norm": 0.42307591438293457, "learning_rate": 1.0798565580729337e-07, "loss": 0.2784, "step": 7820 }, { "epoch": 6.583333333333333, "grad_norm": 0.46412959694862366, "learning_rate": 1.0755223493863931e-07, "loss": 0.2395, "step": 7821 }, { "epoch": 6.584175084175084, "grad_norm": 0.4308834969997406, "learning_rate": 1.0711967617144158e-07, "loss": 0.2233, "step": 7822 }, { "epoch": 6.585016835016835, "grad_norm": 0.4238247275352478, "learning_rate": 1.0668797958192201e-07, "loss": 0.2564, "step": 7823 }, { "epoch": 6.585858585858586, "grad_norm": 0.4408906400203705, "learning_rate": 1.0625714524615027e-07, "loss": 0.2564, "step": 7824 }, { "epoch": 6.5867003367003365, "grad_norm": 0.4118526875972748, "learning_rate": 1.0582717324004399e-07, "loss": 0.2226, "step": 7825 }, { "epoch": 6.587542087542087, "grad_norm": 0.43712159991264343, "learning_rate": 1.0539806363936811e-07, "loss": 0.2563, "step": 7826 }, { "epoch": 6.588383838383838, "grad_norm": 0.42260634899139404, "learning_rate": 1.0496981651973714e-07, "loss": 0.2816, "step": 7827 }, { "epoch": 6.589225589225589, "grad_norm": 0.42719361186027527, "learning_rate": 1.045424319566124e-07, "loss": 0.2317, "step": 7828 }, { "epoch": 6.59006734006734, "grad_norm": 0.510625958442688, "learning_rate": 1.0411591002530364e-07, "loss": 0.2288, "step": 7829 }, { "epoch": 6.590909090909091, "grad_norm": 0.4541179835796356, "learning_rate": 1.0369025080096906e-07, "loss": 0.2481, "step": 7830 }, { "epoch": 6.591750841750842, "grad_norm": 0.3922891616821289, "learning_rate": 1.0326545435861368e-07, "loss": 0.2653, "step": 7831 }, { "epoch": 6.592592592592593, "grad_norm": 0.40550389885902405, "learning_rate": 1.0284152077309095e-07, "loss": 0.2377, "step": 7832 }, { "epoch": 6.593434343434343, "grad_norm": 0.42928335070610046, "learning_rate": 1.0241845011910334e-07, "loss": 0.2241, "step": 7833 }, { "epoch": 6.594276094276094, "grad_norm": 0.3789941668510437, "learning_rate": 1.0199624247119954e-07, "loss": 0.2363, "step": 7834 }, { "epoch": 6.595117845117845, "grad_norm": 0.4418914020061493, "learning_rate": 1.0157489790377839e-07, "loss": 0.2494, "step": 7835 }, { "epoch": 6.595959595959596, "grad_norm": 0.4131902754306793, "learning_rate": 1.0115441649108382e-07, "loss": 0.2401, "step": 7836 }, { "epoch": 6.596801346801347, "grad_norm": 0.43655046820640564, "learning_rate": 1.007347983072099e-07, "loss": 0.2382, "step": 7837 }, { "epoch": 6.597643097643098, "grad_norm": 0.43032926321029663, "learning_rate": 1.0031604342609747e-07, "loss": 0.2484, "step": 7838 }, { "epoch": 6.598484848484849, "grad_norm": 0.4369594156742096, "learning_rate": 9.989815192153585e-08, "loss": 0.2285, "step": 7839 }, { "epoch": 6.5993265993265995, "grad_norm": 0.4197549521923065, "learning_rate": 9.948112386716169e-08, "loss": 0.2646, "step": 7840 }, { "epoch": 6.60016835016835, "grad_norm": 0.42242902517318726, "learning_rate": 9.90649593364601e-08, "loss": 0.2336, "step": 7841 }, { "epoch": 6.601010101010101, "grad_norm": 0.42150938510894775, "learning_rate": 9.864965840276408e-08, "loss": 0.2416, "step": 7842 }, { "epoch": 6.601851851851852, "grad_norm": 0.39537596702575684, "learning_rate": 9.823522113925343e-08, "loss": 0.2743, "step": 7843 }, { "epoch": 6.602693602693603, "grad_norm": 0.41264092922210693, "learning_rate": 9.78216476189564e-08, "loss": 0.2754, "step": 7844 }, { "epoch": 6.603535353535354, "grad_norm": 0.42260709404945374, "learning_rate": 9.740893791474915e-08, "loss": 0.2277, "step": 7845 }, { "epoch": 6.604377104377105, "grad_norm": 0.4232630133628845, "learning_rate": 9.69970920993557e-08, "loss": 0.2287, "step": 7846 }, { "epoch": 6.6052188552188555, "grad_norm": 0.40502697229385376, "learning_rate": 9.658611024534803e-08, "loss": 0.2169, "step": 7847 }, { "epoch": 6.606060606060606, "grad_norm": 0.4088004231452942, "learning_rate": 9.617599242514487e-08, "loss": 0.2455, "step": 7848 }, { "epoch": 6.606902356902357, "grad_norm": 0.41299498081207275, "learning_rate": 9.576673871101338e-08, "loss": 0.2213, "step": 7849 }, { "epoch": 6.607744107744107, "grad_norm": 0.415278822183609, "learning_rate": 9.535834917506814e-08, "loss": 0.2481, "step": 7850 }, { "epoch": 6.608585858585858, "grad_norm": 0.4245213270187378, "learning_rate": 9.495082388927323e-08, "loss": 0.2409, "step": 7851 }, { "epoch": 6.609427609427609, "grad_norm": 0.4416203200817108, "learning_rate": 9.454416292543733e-08, "loss": 0.2773, "step": 7852 }, { "epoch": 6.61026936026936, "grad_norm": 0.40803462266921997, "learning_rate": 9.413836635521922e-08, "loss": 0.2143, "step": 7853 }, { "epoch": 6.611111111111111, "grad_norm": 0.4058579206466675, "learning_rate": 9.373343425012448e-08, "loss": 0.2377, "step": 7854 }, { "epoch": 6.611952861952862, "grad_norm": 0.45612990856170654, "learning_rate": 9.332936668150605e-08, "loss": 0.2461, "step": 7855 }, { "epoch": 6.6127946127946124, "grad_norm": 0.41306376457214355, "learning_rate": 9.292616372056585e-08, "loss": 0.2574, "step": 7856 }, { "epoch": 6.613636363636363, "grad_norm": 0.4344332814216614, "learning_rate": 9.252382543835148e-08, "loss": 0.2326, "step": 7857 }, { "epoch": 6.614478114478114, "grad_norm": 0.448345810174942, "learning_rate": 9.212235190576069e-08, "loss": 0.2465, "step": 7858 }, { "epoch": 6.615319865319865, "grad_norm": 0.45094311237335205, "learning_rate": 9.172174319353578e-08, "loss": 0.2191, "step": 7859 }, { "epoch": 6.616161616161616, "grad_norm": 0.41641733050346375, "learning_rate": 9.132199937226915e-08, "loss": 0.2196, "step": 7860 }, { "epoch": 6.617003367003367, "grad_norm": 0.4761151969432831, "learning_rate": 9.092312051239893e-08, "loss": 0.236, "step": 7861 }, { "epoch": 6.617845117845118, "grad_norm": 0.4333548843860626, "learning_rate": 9.052510668421332e-08, "loss": 0.2099, "step": 7862 }, { "epoch": 6.6186868686868685, "grad_norm": 0.4090255796909332, "learning_rate": 9.012795795784568e-08, "loss": 0.2674, "step": 7863 }, { "epoch": 6.619528619528619, "grad_norm": 0.4635721445083618, "learning_rate": 8.973167440327835e-08, "loss": 0.2388, "step": 7864 }, { "epoch": 6.62037037037037, "grad_norm": 0.459511399269104, "learning_rate": 8.933625609033992e-08, "loss": 0.2801, "step": 7865 }, { "epoch": 6.621212121212121, "grad_norm": 0.4362848401069641, "learning_rate": 8.894170308870797e-08, "loss": 0.2602, "step": 7866 }, { "epoch": 6.622053872053872, "grad_norm": 0.4264500141143799, "learning_rate": 8.854801546790692e-08, "loss": 0.2376, "step": 7867 }, { "epoch": 6.622895622895623, "grad_norm": 0.44621482491493225, "learning_rate": 8.815519329730904e-08, "loss": 0.2533, "step": 7868 }, { "epoch": 6.623737373737374, "grad_norm": 0.41415882110595703, "learning_rate": 8.77632366461334e-08, "loss": 0.2754, "step": 7869 }, { "epoch": 6.624579124579125, "grad_norm": 0.4550504684448242, "learning_rate": 8.737214558344587e-08, "loss": 0.2515, "step": 7870 }, { "epoch": 6.625420875420875, "grad_norm": 0.43127337098121643, "learning_rate": 8.698192017816243e-08, "loss": 0.2455, "step": 7871 }, { "epoch": 6.626262626262626, "grad_norm": 0.41892799735069275, "learning_rate": 8.659256049904474e-08, "loss": 0.2456, "step": 7872 }, { "epoch": 6.627104377104377, "grad_norm": 0.4116719961166382, "learning_rate": 8.620406661470127e-08, "loss": 0.2731, "step": 7873 }, { "epoch": 6.627946127946128, "grad_norm": 0.4188373386859894, "learning_rate": 8.581643859358946e-08, "loss": 0.2637, "step": 7874 }, { "epoch": 6.628787878787879, "grad_norm": 0.4269236922264099, "learning_rate": 8.542967650401357e-08, "loss": 0.2623, "step": 7875 }, { "epoch": 6.62962962962963, "grad_norm": 0.7932599782943726, "learning_rate": 8.50437804141252e-08, "loss": 0.2611, "step": 7876 }, { "epoch": 6.630471380471381, "grad_norm": 0.468515008687973, "learning_rate": 8.465875039192272e-08, "loss": 0.2415, "step": 7877 }, { "epoch": 6.6313131313131315, "grad_norm": 0.4275892972946167, "learning_rate": 8.4274586505253e-08, "loss": 0.2504, "step": 7878 }, { "epoch": 6.632154882154882, "grad_norm": 0.4463840126991272, "learning_rate": 8.389128882180964e-08, "loss": 0.2342, "step": 7879 }, { "epoch": 6.632996632996633, "grad_norm": 0.4384472668170929, "learning_rate": 8.350885740913417e-08, "loss": 0.2477, "step": 7880 }, { "epoch": 6.633838383838384, "grad_norm": 0.42471715807914734, "learning_rate": 8.312729233461491e-08, "loss": 0.2348, "step": 7881 }, { "epoch": 6.634680134680135, "grad_norm": 0.4719696044921875, "learning_rate": 8.274659366548699e-08, "loss": 0.2357, "step": 7882 }, { "epoch": 6.635521885521886, "grad_norm": 0.4646303355693817, "learning_rate": 8.236676146883449e-08, "loss": 0.2102, "step": 7883 }, { "epoch": 6.636363636363637, "grad_norm": 0.5716280937194824, "learning_rate": 8.198779581158777e-08, "loss": 0.2535, "step": 7884 }, { "epoch": 6.6372053872053876, "grad_norm": 0.4459933638572693, "learning_rate": 8.160969676052454e-08, "loss": 0.2296, "step": 7885 }, { "epoch": 6.638047138047138, "grad_norm": 0.4348316788673401, "learning_rate": 8.123246438226983e-08, "loss": 0.2458, "step": 7886 }, { "epoch": 6.638888888888889, "grad_norm": 0.4792332351207733, "learning_rate": 8.085609874329492e-08, "loss": 0.2572, "step": 7887 }, { "epoch": 6.63973063973064, "grad_norm": 0.7650473117828369, "learning_rate": 8.048059990992119e-08, "loss": 0.2518, "step": 7888 }, { "epoch": 6.640572390572391, "grad_norm": 0.4472450315952301, "learning_rate": 8.010596794831515e-08, "loss": 0.2341, "step": 7889 }, { "epoch": 6.641414141414142, "grad_norm": 0.48758038878440857, "learning_rate": 7.973220292449069e-08, "loss": 0.2661, "step": 7890 }, { "epoch": 6.642255892255893, "grad_norm": 0.572820246219635, "learning_rate": 7.935930490430844e-08, "loss": 0.2852, "step": 7891 }, { "epoch": 6.643097643097643, "grad_norm": 0.5144976377487183, "learning_rate": 7.898727395347804e-08, "loss": 0.2672, "step": 7892 }, { "epoch": 6.643939393939394, "grad_norm": 2.3472113609313965, "learning_rate": 7.861611013755488e-08, "loss": 0.2699, "step": 7893 }, { "epoch": 6.6447811447811445, "grad_norm": 0.516178548336029, "learning_rate": 7.824581352194216e-08, "loss": 0.2589, "step": 7894 }, { "epoch": 6.645622895622895, "grad_norm": 0.4505818486213684, "learning_rate": 7.787638417188992e-08, "loss": 0.249, "step": 7895 }, { "epoch": 6.646464646464646, "grad_norm": 0.45387861132621765, "learning_rate": 7.750782215249552e-08, "loss": 0.2516, "step": 7896 }, { "epoch": 6.647306397306397, "grad_norm": 0.4973078966140747, "learning_rate": 7.714012752870425e-08, "loss": 0.2525, "step": 7897 }, { "epoch": 6.648148148148148, "grad_norm": 0.7253732085227966, "learning_rate": 7.67733003653065e-08, "loss": 0.2478, "step": 7898 }, { "epoch": 6.648989898989899, "grad_norm": 0.4546608328819275, "learning_rate": 7.640734072694222e-08, "loss": 0.2548, "step": 7899 }, { "epoch": 6.64983164983165, "grad_norm": 0.4525928795337677, "learning_rate": 7.604224867809651e-08, "loss": 0.2266, "step": 7900 }, { "epoch": 6.6506734006734005, "grad_norm": 0.480866938829422, "learning_rate": 7.567802428310344e-08, "loss": 0.2496, "step": 7901 }, { "epoch": 6.651515151515151, "grad_norm": 0.4546586573123932, "learning_rate": 7.53146676061428e-08, "loss": 0.2335, "step": 7902 }, { "epoch": 6.652356902356902, "grad_norm": 0.45740360021591187, "learning_rate": 7.495217871124116e-08, "loss": 0.2551, "step": 7903 }, { "epoch": 6.653198653198653, "grad_norm": 0.43095874786376953, "learning_rate": 7.459055766227407e-08, "loss": 0.2522, "step": 7904 }, { "epoch": 6.654040404040404, "grad_norm": 0.446344256401062, "learning_rate": 7.42298045229628e-08, "loss": 0.2648, "step": 7905 }, { "epoch": 6.654882154882155, "grad_norm": 0.43020644783973694, "learning_rate": 7.386991935687593e-08, "loss": 0.2644, "step": 7906 }, { "epoch": 6.655723905723906, "grad_norm": 0.43564945459365845, "learning_rate": 7.35109022274283e-08, "loss": 0.2547, "step": 7907 }, { "epoch": 6.656565656565657, "grad_norm": 0.3985383212566376, "learning_rate": 7.315275319788317e-08, "loss": 0.2592, "step": 7908 }, { "epoch": 6.657407407407407, "grad_norm": 1.3167930841445923, "learning_rate": 7.279547233135009e-08, "loss": 0.2556, "step": 7909 }, { "epoch": 6.658249158249158, "grad_norm": 0.4726661443710327, "learning_rate": 7.243905969078591e-08, "loss": 0.2423, "step": 7910 }, { "epoch": 6.659090909090909, "grad_norm": 0.45442232489585876, "learning_rate": 7.208351533899427e-08, "loss": 0.2515, "step": 7911 }, { "epoch": 6.65993265993266, "grad_norm": 0.4229814410209656, "learning_rate": 7.172883933862562e-08, "loss": 0.2331, "step": 7912 }, { "epoch": 6.660774410774411, "grad_norm": 0.47652801871299744, "learning_rate": 7.13750317521783e-08, "loss": 0.2242, "step": 7913 }, { "epoch": 6.661616161616162, "grad_norm": 0.46761634945869446, "learning_rate": 7.102209264199577e-08, "loss": 0.2719, "step": 7914 }, { "epoch": 6.662457912457913, "grad_norm": 0.4608408808708191, "learning_rate": 7.067002207027107e-08, "loss": 0.2699, "step": 7915 }, { "epoch": 6.6632996632996635, "grad_norm": 0.4475562274456024, "learning_rate": 7.031882009904234e-08, "loss": 0.2765, "step": 7916 }, { "epoch": 6.664141414141414, "grad_norm": 0.5038113594055176, "learning_rate": 6.996848679019396e-08, "loss": 0.248, "step": 7917 }, { "epoch": 6.664983164983165, "grad_norm": 0.4691965878009796, "learning_rate": 6.961902220546046e-08, "loss": 0.2381, "step": 7918 }, { "epoch": 6.665824915824916, "grad_norm": 0.43881040811538696, "learning_rate": 6.927042640641923e-08, "loss": 0.2895, "step": 7919 }, { "epoch": 6.666666666666667, "grad_norm": 0.43294575810432434, "learning_rate": 6.892269945449781e-08, "loss": 0.2534, "step": 7920 }, { "epoch": 6.667508417508418, "grad_norm": 0.45935097336769104, "learning_rate": 6.857584141096884e-08, "loss": 0.2583, "step": 7921 }, { "epoch": 6.668350168350169, "grad_norm": 0.45869147777557373, "learning_rate": 6.82298523369529e-08, "loss": 0.2342, "step": 7922 }, { "epoch": 6.66919191919192, "grad_norm": 0.43914246559143066, "learning_rate": 6.788473229341675e-08, "loss": 0.264, "step": 7923 }, { "epoch": 6.6700336700336695, "grad_norm": 0.4126952886581421, "learning_rate": 6.754048134117397e-08, "loss": 0.2573, "step": 7924 }, { "epoch": 6.67087542087542, "grad_norm": 0.48317551612854004, "learning_rate": 6.719709954088494e-08, "loss": 0.2525, "step": 7925 }, { "epoch": 6.671717171717171, "grad_norm": 0.6077154278755188, "learning_rate": 6.685458695305846e-08, "loss": 0.2426, "step": 7926 }, { "epoch": 6.672558922558922, "grad_norm": 0.44338762760162354, "learning_rate": 6.651294363804795e-08, "loss": 0.2514, "step": 7927 }, { "epoch": 6.673400673400673, "grad_norm": 0.422849178314209, "learning_rate": 6.617216965605466e-08, "loss": 0.2558, "step": 7928 }, { "epoch": 6.674242424242424, "grad_norm": 0.5612330436706543, "learning_rate": 6.583226506712614e-08, "loss": 0.2478, "step": 7929 }, { "epoch": 6.675084175084175, "grad_norm": 0.48107799887657166, "learning_rate": 6.549322993115837e-08, "loss": 0.2535, "step": 7930 }, { "epoch": 6.675925925925926, "grad_norm": 0.5182533860206604, "learning_rate": 6.515506430789187e-08, "loss": 0.2358, "step": 7931 }, { "epoch": 6.6767676767676765, "grad_norm": 0.4320874810218811, "learning_rate": 6.481776825691621e-08, "loss": 0.231, "step": 7932 }, { "epoch": 6.677609427609427, "grad_norm": 0.4409750699996948, "learning_rate": 6.448134183766497e-08, "loss": 0.2358, "step": 7933 }, { "epoch": 6.678451178451178, "grad_norm": 0.4216385781764984, "learning_rate": 6.414578510942182e-08, "loss": 0.2538, "step": 7934 }, { "epoch": 6.679292929292929, "grad_norm": 0.4320267140865326, "learning_rate": 6.381109813131392e-08, "loss": 0.2361, "step": 7935 }, { "epoch": 6.68013468013468, "grad_norm": 0.4650171101093292, "learning_rate": 6.347728096231797e-08, "loss": 0.2297, "step": 7936 }, { "epoch": 6.680976430976431, "grad_norm": 0.4626995325088501, "learning_rate": 6.314433366125472e-08, "loss": 0.2517, "step": 7937 }, { "epoch": 6.681818181818182, "grad_norm": 0.4166358411312103, "learning_rate": 6.281225628679443e-08, "loss": 0.2431, "step": 7938 }, { "epoch": 6.6826599326599325, "grad_norm": 0.3906804323196411, "learning_rate": 6.248104889745144e-08, "loss": 0.2454, "step": 7939 }, { "epoch": 6.683501683501683, "grad_norm": 0.4131794571876526, "learning_rate": 6.215071155158903e-08, "loss": 0.2587, "step": 7940 }, { "epoch": 6.684343434343434, "grad_norm": 0.4399358630180359, "learning_rate": 6.182124430741565e-08, "loss": 0.2581, "step": 7941 }, { "epoch": 6.685185185185185, "grad_norm": 0.41218388080596924, "learning_rate": 6.149264722298653e-08, "loss": 0.2469, "step": 7942 }, { "epoch": 6.686026936026936, "grad_norm": 0.4838857650756836, "learning_rate": 6.116492035620537e-08, "loss": 0.2502, "step": 7943 }, { "epoch": 6.686868686868687, "grad_norm": 0.4879959225654602, "learning_rate": 6.083806376481982e-08, "loss": 0.2408, "step": 7944 }, { "epoch": 6.687710437710438, "grad_norm": 0.4673900902271271, "learning_rate": 6.051207750642606e-08, "loss": 0.2428, "step": 7945 }, { "epoch": 6.688552188552189, "grad_norm": 0.4340228736400604, "learning_rate": 6.018696163846538e-08, "loss": 0.2302, "step": 7946 }, { "epoch": 6.6893939393939394, "grad_norm": 0.44881290197372437, "learning_rate": 5.986271621822859e-08, "loss": 0.235, "step": 7947 }, { "epoch": 6.69023569023569, "grad_norm": 0.4502499997615814, "learning_rate": 5.953934130284944e-08, "loss": 0.2455, "step": 7948 }, { "epoch": 6.691077441077441, "grad_norm": 0.41264891624450684, "learning_rate": 5.9216836949311243e-08, "loss": 0.2403, "step": 7949 }, { "epoch": 6.691919191919192, "grad_norm": 0.44085463881492615, "learning_rate": 5.889520321444131e-08, "loss": 0.2407, "step": 7950 }, { "epoch": 6.692760942760943, "grad_norm": 0.40595123171806335, "learning_rate": 5.857444015491598e-08, "loss": 0.232, "step": 7951 }, { "epoch": 6.693602693602694, "grad_norm": 0.40886637568473816, "learning_rate": 5.825454782725781e-08, "loss": 0.2584, "step": 7952 }, { "epoch": 6.694444444444445, "grad_norm": 0.42604565620422363, "learning_rate": 5.793552628783394e-08, "loss": 0.2724, "step": 7953 }, { "epoch": 6.6952861952861955, "grad_norm": 0.4384261667728424, "learning_rate": 5.761737559285885e-08, "loss": 0.2407, "step": 7954 }, { "epoch": 6.696127946127946, "grad_norm": 0.42575159668922424, "learning_rate": 5.7300095798396015e-08, "loss": 0.2653, "step": 7955 }, { "epoch": 6.696969696969697, "grad_norm": 0.424037903547287, "learning_rate": 5.698368696035239e-08, "loss": 0.2368, "step": 7956 }, { "epoch": 6.697811447811448, "grad_norm": 0.4181269705295563, "learning_rate": 5.666814913448226e-08, "loss": 0.2478, "step": 7957 }, { "epoch": 6.698653198653199, "grad_norm": 0.45440566539764404, "learning_rate": 5.635348237638782e-08, "loss": 0.2315, "step": 7958 }, { "epoch": 6.69949494949495, "grad_norm": 0.41462236642837524, "learning_rate": 5.603968674151528e-08, "loss": 0.2658, "step": 7959 }, { "epoch": 6.700336700336701, "grad_norm": 0.4417324364185333, "learning_rate": 5.572676228516039e-08, "loss": 0.2559, "step": 7960 }, { "epoch": 6.701178451178452, "grad_norm": 0.398490309715271, "learning_rate": 5.5414709062462936e-08, "loss": 0.2655, "step": 7961 }, { "epoch": 6.702020202020202, "grad_norm": 0.4085494875907898, "learning_rate": 5.510352712840949e-08, "loss": 0.2529, "step": 7962 }, { "epoch": 6.702861952861953, "grad_norm": 0.39864856004714966, "learning_rate": 5.479321653783398e-08, "loss": 0.2523, "step": 7963 }, { "epoch": 6.703703703703704, "grad_norm": 0.40677589178085327, "learning_rate": 5.448377734541765e-08, "loss": 0.2186, "step": 7964 }, { "epoch": 6.704545454545455, "grad_norm": 0.4413403272628784, "learning_rate": 5.417520960568523e-08, "loss": 0.2394, "step": 7965 }, { "epoch": 6.705387205387205, "grad_norm": 0.5354292392730713, "learning_rate": 5.386751337301099e-08, "loss": 0.2443, "step": 7966 }, { "epoch": 6.706228956228956, "grad_norm": 0.41562604904174805, "learning_rate": 5.356068870161324e-08, "loss": 0.2562, "step": 7967 }, { "epoch": 6.707070707070707, "grad_norm": 0.4000895321369171, "learning_rate": 5.325473564555761e-08, "loss": 0.2524, "step": 7968 }, { "epoch": 6.707912457912458, "grad_norm": 0.47506165504455566, "learning_rate": 5.29496542587582e-08, "loss": 0.2521, "step": 7969 }, { "epoch": 6.7087542087542085, "grad_norm": 0.40362510085105896, "learning_rate": 5.264544459497145e-08, "loss": 0.2685, "step": 7970 }, { "epoch": 6.709595959595959, "grad_norm": 0.41988077759742737, "learning_rate": 5.2342106707803376e-08, "loss": 0.2376, "step": 7971 }, { "epoch": 6.71043771043771, "grad_norm": 0.4483652114868164, "learning_rate": 5.20396406507051e-08, "loss": 0.2149, "step": 7972 }, { "epoch": 6.711279461279461, "grad_norm": 0.4344198405742645, "learning_rate": 5.1738046476974554e-08, "loss": 0.2507, "step": 7973 }, { "epoch": 6.712121212121212, "grad_norm": 0.4064551591873169, "learning_rate": 5.1437324239755895e-08, "loss": 0.2703, "step": 7974 }, { "epoch": 6.712962962962963, "grad_norm": 0.7684438228607178, "learning_rate": 5.11374739920395e-08, "loss": 0.2072, "step": 7975 }, { "epoch": 6.713804713804714, "grad_norm": 0.4365333020687103, "learning_rate": 5.0838495786661446e-08, "loss": 0.2315, "step": 7976 }, { "epoch": 6.7146464646464645, "grad_norm": 0.43874165415763855, "learning_rate": 5.054038967630625e-08, "loss": 0.2469, "step": 7977 }, { "epoch": 6.715488215488215, "grad_norm": 0.47672873735427856, "learning_rate": 5.024315571350191e-08, "loss": 0.239, "step": 7978 }, { "epoch": 6.716329966329966, "grad_norm": 0.4358423352241516, "learning_rate": 4.994679395062596e-08, "loss": 0.2755, "step": 7979 }, { "epoch": 6.717171717171717, "grad_norm": 0.4169863164424896, "learning_rate": 4.965130443989885e-08, "loss": 0.2699, "step": 7980 }, { "epoch": 6.718013468013468, "grad_norm": 0.42336949706077576, "learning_rate": 4.9356687233389486e-08, "loss": 0.2297, "step": 7981 }, { "epoch": 6.718855218855219, "grad_norm": 0.5112847685813904, "learning_rate": 4.906294238301357e-08, "loss": 0.2219, "step": 7982 }, { "epoch": 6.71969696969697, "grad_norm": 0.45648688077926636, "learning_rate": 4.877006994053024e-08, "loss": 0.2612, "step": 7983 }, { "epoch": 6.720538720538721, "grad_norm": 0.4718899726867676, "learning_rate": 4.8478069957548226e-08, "loss": 0.2498, "step": 7984 }, { "epoch": 6.7213804713804715, "grad_norm": 0.4433843791484833, "learning_rate": 4.81869424855208e-08, "loss": 0.2324, "step": 7985 }, { "epoch": 6.722222222222222, "grad_norm": 0.48078206181526184, "learning_rate": 4.789668757574695e-08, "loss": 0.2748, "step": 7986 }, { "epoch": 6.723063973063973, "grad_norm": 0.44792771339416504, "learning_rate": 4.7607305279373514e-08, "loss": 0.2518, "step": 7987 }, { "epoch": 6.723905723905724, "grad_norm": 0.4709182679653168, "learning_rate": 4.73187956473925e-08, "loss": 0.2521, "step": 7988 }, { "epoch": 6.724747474747475, "grad_norm": 0.4400613009929657, "learning_rate": 4.703115873064212e-08, "loss": 0.2575, "step": 7989 }, { "epoch": 6.725589225589226, "grad_norm": 0.4029322862625122, "learning_rate": 4.67443945798074e-08, "loss": 0.2521, "step": 7990 }, { "epoch": 6.726430976430977, "grad_norm": 0.672539472579956, "learning_rate": 4.6458503245419005e-08, "loss": 0.2441, "step": 7991 }, { "epoch": 6.7272727272727275, "grad_norm": 0.4535450339317322, "learning_rate": 4.617348477785388e-08, "loss": 0.2444, "step": 7992 }, { "epoch": 6.728114478114478, "grad_norm": 0.44552767276763916, "learning_rate": 4.588933922733629e-08, "loss": 0.265, "step": 7993 }, { "epoch": 6.728956228956229, "grad_norm": 0.5979976058006287, "learning_rate": 4.560606664393397e-08, "loss": 0.2345, "step": 7994 }, { "epoch": 6.72979797979798, "grad_norm": 0.4547877311706543, "learning_rate": 4.5323667077564745e-08, "loss": 0.284, "step": 7995 }, { "epoch": 6.730639730639731, "grad_norm": 0.4991534948348999, "learning_rate": 4.504214057798883e-08, "loss": 0.2585, "step": 7996 }, { "epoch": 6.731481481481482, "grad_norm": 0.4690903127193451, "learning_rate": 4.476148719481432e-08, "loss": 0.232, "step": 7997 }, { "epoch": 6.732323232323233, "grad_norm": 0.4249463379383087, "learning_rate": 4.448170697749554e-08, "loss": 0.2475, "step": 7998 }, { "epoch": 6.733164983164983, "grad_norm": 0.46491575241088867, "learning_rate": 4.4202799975333055e-08, "loss": 0.2624, "step": 7999 }, { "epoch": 6.7340067340067336, "grad_norm": 0.40995943546295166, "learning_rate": 4.392476623747366e-08, "loss": 0.2538, "step": 8000 }, { "epoch": 6.734848484848484, "grad_norm": 0.4721425771713257, "learning_rate": 4.3647605812908165e-08, "loss": 0.2296, "step": 8001 }, { "epoch": 6.735690235690235, "grad_norm": 0.43182018399238586, "learning_rate": 4.337131875047751e-08, "loss": 0.2465, "step": 8002 }, { "epoch": 6.736531986531986, "grad_norm": 0.4003061056137085, "learning_rate": 4.309590509886441e-08, "loss": 0.2572, "step": 8003 }, { "epoch": 6.737373737373737, "grad_norm": 0.40933090448379517, "learning_rate": 4.2821364906600606e-08, "loss": 0.2576, "step": 8004 }, { "epoch": 6.738215488215488, "grad_norm": 0.4304264783859253, "learning_rate": 4.254769822206295e-08, "loss": 0.2606, "step": 8005 }, { "epoch": 6.739057239057239, "grad_norm": 0.43371865153312683, "learning_rate": 4.227490509347398e-08, "loss": 0.2379, "step": 8006 }, { "epoch": 6.73989898989899, "grad_norm": 0.42457690834999084, "learning_rate": 4.2002985568903585e-08, "loss": 0.2796, "step": 8007 }, { "epoch": 6.7407407407407405, "grad_norm": 0.47085320949554443, "learning_rate": 4.1731939696266764e-08, "loss": 0.248, "step": 8008 }, { "epoch": 6.741582491582491, "grad_norm": 0.447751522064209, "learning_rate": 4.146176752332365e-08, "loss": 0.2372, "step": 8009 }, { "epoch": 6.742424242424242, "grad_norm": 0.46753737330436707, "learning_rate": 4.1192469097682265e-08, "loss": 0.2668, "step": 8010 }, { "epoch": 6.743265993265993, "grad_norm": 0.4249885678291321, "learning_rate": 4.092404446679632e-08, "loss": 0.2458, "step": 8011 }, { "epoch": 6.744107744107744, "grad_norm": 0.44669926166534424, "learning_rate": 4.065649367796409e-08, "loss": 0.2667, "step": 8012 }, { "epoch": 6.744949494949495, "grad_norm": 0.4191698729991913, "learning_rate": 4.038981677833176e-08, "loss": 0.2486, "step": 8013 }, { "epoch": 6.745791245791246, "grad_norm": 0.4479946494102478, "learning_rate": 4.0124013814889506e-08, "loss": 0.223, "step": 8014 }, { "epoch": 6.7466329966329965, "grad_norm": 0.4341362416744232, "learning_rate": 3.9859084834475426e-08, "loss": 0.2306, "step": 8015 }, { "epoch": 6.747474747474747, "grad_norm": 0.43498924374580383, "learning_rate": 3.9595029883773283e-08, "loss": 0.2775, "step": 8016 }, { "epoch": 6.748316498316498, "grad_norm": 0.45255836844444275, "learning_rate": 3.933184900931142e-08, "loss": 0.2415, "step": 8017 }, { "epoch": 6.749158249158249, "grad_norm": 0.42001038789749146, "learning_rate": 3.906954225746551e-08, "loss": 0.2398, "step": 8018 }, { "epoch": 6.75, "grad_norm": 0.4406849443912506, "learning_rate": 3.880810967445747e-08, "loss": 0.2719, "step": 8019 }, { "epoch": 6.750841750841751, "grad_norm": 0.4706053137779236, "learning_rate": 3.8547551306353237e-08, "loss": 0.2443, "step": 8020 }, { "epoch": 6.751683501683502, "grad_norm": 0.3788938522338867, "learning_rate": 3.828786719906663e-08, "loss": 0.2681, "step": 8021 }, { "epoch": 6.752525252525253, "grad_norm": 0.4148952066898346, "learning_rate": 3.802905739835661e-08, "loss": 0.2748, "step": 8022 }, { "epoch": 6.7533670033670035, "grad_norm": 0.43103334307670593, "learning_rate": 3.77711219498289e-08, "loss": 0.2461, "step": 8023 }, { "epoch": 6.754208754208754, "grad_norm": 0.4226977229118347, "learning_rate": 3.751406089893383e-08, "loss": 0.2348, "step": 8024 }, { "epoch": 6.755050505050505, "grad_norm": 0.4383535087108612, "learning_rate": 3.725787429096794e-08, "loss": 0.2292, "step": 8025 }, { "epoch": 6.755892255892256, "grad_norm": 0.4131900668144226, "learning_rate": 3.7002562171074566e-08, "loss": 0.2485, "step": 8026 }, { "epoch": 6.756734006734007, "grad_norm": 0.5007175803184509, "learning_rate": 3.674812458424271e-08, "loss": 0.2485, "step": 8027 }, { "epoch": 6.757575757575758, "grad_norm": 0.42478975653648376, "learning_rate": 3.649456157530651e-08, "loss": 0.2547, "step": 8028 }, { "epoch": 6.758417508417509, "grad_norm": 0.412039577960968, "learning_rate": 3.624187318894634e-08, "loss": 0.2423, "step": 8029 }, { "epoch": 6.7592592592592595, "grad_norm": 0.46180886030197144, "learning_rate": 3.599005946968881e-08, "loss": 0.2582, "step": 8030 }, { "epoch": 6.76010101010101, "grad_norm": 0.41610756516456604, "learning_rate": 3.5739120461906196e-08, "loss": 0.2129, "step": 8031 }, { "epoch": 6.760942760942761, "grad_norm": 0.5137214064598083, "learning_rate": 3.548905620981702e-08, "loss": 0.234, "step": 8032 }, { "epoch": 6.761784511784512, "grad_norm": 0.3923676908016205, "learning_rate": 3.523986675748492e-08, "loss": 0.2458, "step": 8033 }, { "epoch": 6.762626262626263, "grad_norm": 0.4197588264942169, "learning_rate": 3.499155214881922e-08, "loss": 0.2481, "step": 8034 }, { "epoch": 6.763468013468014, "grad_norm": 0.401614248752594, "learning_rate": 3.474411242757658e-08, "loss": 0.2808, "step": 8035 }, { "epoch": 6.764309764309765, "grad_norm": 0.4286206364631653, "learning_rate": 3.4497547637357684e-08, "loss": 0.2423, "step": 8036 }, { "epoch": 6.765151515151516, "grad_norm": 0.43552160263061523, "learning_rate": 3.425185782161e-08, "loss": 0.2506, "step": 8037 }, { "epoch": 6.7659932659932664, "grad_norm": 0.42223963141441345, "learning_rate": 3.400704302362723e-08, "loss": 0.2486, "step": 8038 }, { "epoch": 6.766835016835017, "grad_norm": 0.44677823781967163, "learning_rate": 3.3763103286548194e-08, "loss": 0.2724, "step": 8039 }, { "epoch": 6.767676767676767, "grad_norm": 0.4364713132381439, "learning_rate": 3.35200386533574e-08, "loss": 0.2345, "step": 8040 }, { "epoch": 6.768518518518518, "grad_norm": 0.5067551136016846, "learning_rate": 3.3277849166885035e-08, "loss": 0.2546, "step": 8041 }, { "epoch": 6.769360269360269, "grad_norm": 0.4411182105541229, "learning_rate": 3.303653486980862e-08, "loss": 0.2393, "step": 8042 }, { "epoch": 6.77020202020202, "grad_norm": 0.42105695605278015, "learning_rate": 3.279609580464915e-08, "loss": 0.2516, "step": 8043 }, { "epoch": 6.771043771043771, "grad_norm": 0.396797239780426, "learning_rate": 3.255653201377551e-08, "loss": 0.2365, "step": 8044 }, { "epoch": 6.771885521885522, "grad_norm": 0.41330716013908386, "learning_rate": 3.23178435394006e-08, "loss": 0.2517, "step": 8045 }, { "epoch": 6.7727272727272725, "grad_norm": 0.47340020537376404, "learning_rate": 3.208003042358465e-08, "loss": 0.2272, "step": 8046 }, { "epoch": 6.773569023569023, "grad_norm": 0.44475528597831726, "learning_rate": 3.184309270823194e-08, "loss": 0.2348, "step": 8047 }, { "epoch": 6.774410774410774, "grad_norm": 0.44681501388549805, "learning_rate": 3.1607030435094053e-08, "loss": 0.2746, "step": 8048 }, { "epoch": 6.775252525252525, "grad_norm": 0.4099847078323364, "learning_rate": 3.1371843645767176e-08, "loss": 0.26, "step": 8049 }, { "epoch": 6.776094276094276, "grad_norm": 0.4382650554180145, "learning_rate": 3.113753238169481e-08, "loss": 0.2651, "step": 8050 }, { "epoch": 6.776936026936027, "grad_norm": 0.47684118151664734, "learning_rate": 3.0904096684163944e-08, "loss": 0.2602, "step": 8051 }, { "epoch": 6.777777777777778, "grad_norm": 0.429573655128479, "learning_rate": 3.0671536594308884e-08, "loss": 0.2661, "step": 8052 }, { "epoch": 6.7786195286195285, "grad_norm": 0.40992608666419983, "learning_rate": 3.043985215310907e-08, "loss": 0.2383, "step": 8053 }, { "epoch": 6.779461279461279, "grad_norm": 0.4073900878429413, "learning_rate": 3.020904340139019e-08, "loss": 0.2544, "step": 8054 }, { "epoch": 6.78030303030303, "grad_norm": 0.4505603611469269, "learning_rate": 2.997911037982304e-08, "loss": 0.2139, "step": 8055 }, { "epoch": 6.781144781144781, "grad_norm": 0.4311800003051758, "learning_rate": 2.975005312892354e-08, "loss": 0.2345, "step": 8056 }, { "epoch": 6.781986531986532, "grad_norm": 0.440023809671402, "learning_rate": 2.9521871689055515e-08, "loss": 0.2388, "step": 8057 }, { "epoch": 6.782828282828283, "grad_norm": 0.43937280774116516, "learning_rate": 2.9294566100425135e-08, "loss": 0.2645, "step": 8058 }, { "epoch": 6.783670033670034, "grad_norm": 0.4259999990463257, "learning_rate": 2.9068136403087587e-08, "loss": 0.2621, "step": 8059 }, { "epoch": 6.784511784511785, "grad_norm": 0.4442819356918335, "learning_rate": 2.884258263694095e-08, "loss": 0.2539, "step": 8060 }, { "epoch": 6.7853535353535355, "grad_norm": 0.4627664387226105, "learning_rate": 2.861790484173177e-08, "loss": 0.2427, "step": 8061 }, { "epoch": 6.786195286195286, "grad_norm": 0.40286239981651306, "learning_rate": 2.839410305704893e-08, "loss": 0.2372, "step": 8062 }, { "epoch": 6.787037037037037, "grad_norm": 0.43738609552383423, "learning_rate": 2.8171177322329767e-08, "loss": 0.218, "step": 8063 }, { "epoch": 6.787878787878788, "grad_norm": 0.444576233625412, "learning_rate": 2.79491276768562e-08, "loss": 0.2153, "step": 8064 }, { "epoch": 6.788720538720539, "grad_norm": 0.42781564593315125, "learning_rate": 2.7727954159755265e-08, "loss": 0.22, "step": 8065 }, { "epoch": 6.78956228956229, "grad_norm": 0.4388132095336914, "learning_rate": 2.7507656810000227e-08, "loss": 0.2385, "step": 8066 }, { "epoch": 6.790404040404041, "grad_norm": 0.42631518840789795, "learning_rate": 2.7288235666410035e-08, "loss": 0.2742, "step": 8067 }, { "epoch": 6.7912457912457915, "grad_norm": 0.4216606616973877, "learning_rate": 2.706969076764876e-08, "loss": 0.2458, "step": 8068 }, { "epoch": 6.792087542087542, "grad_norm": 0.3967618942260742, "learning_rate": 2.6852022152226708e-08, "loss": 0.2227, "step": 8069 }, { "epoch": 6.792929292929293, "grad_norm": 0.4351973831653595, "learning_rate": 2.6635229858499312e-08, "loss": 0.2489, "step": 8070 }, { "epoch": 6.793771043771044, "grad_norm": 0.411205917596817, "learning_rate": 2.6419313924667124e-08, "loss": 0.2538, "step": 8071 }, { "epoch": 6.794612794612795, "grad_norm": 0.423886775970459, "learning_rate": 2.6204274388777484e-08, "loss": 0.2361, "step": 8072 }, { "epoch": 6.795454545454545, "grad_norm": 0.38439005613327026, "learning_rate": 2.599011128872231e-08, "loss": 0.2504, "step": 8073 }, { "epoch": 6.796296296296296, "grad_norm": 0.45315882563591003, "learning_rate": 2.5776824662239187e-08, "loss": 0.2251, "step": 8074 }, { "epoch": 6.797138047138047, "grad_norm": 0.40241584181785583, "learning_rate": 2.5564414546912498e-08, "loss": 0.2593, "step": 8075 }, { "epoch": 6.797979797979798, "grad_norm": 0.4192306101322174, "learning_rate": 2.5352880980170635e-08, "loss": 0.233, "step": 8076 }, { "epoch": 6.798821548821548, "grad_norm": 0.9835978746414185, "learning_rate": 2.514222399928712e-08, "loss": 0.2542, "step": 8077 }, { "epoch": 6.799663299663299, "grad_norm": 0.4296455681324005, "learning_rate": 2.4932443641383363e-08, "loss": 0.2522, "step": 8078 }, { "epoch": 6.80050505050505, "grad_norm": 0.4061324894428253, "learning_rate": 2.4723539943424247e-08, "loss": 0.2422, "step": 8079 }, { "epoch": 6.801346801346801, "grad_norm": 0.40144333243370056, "learning_rate": 2.4515512942220875e-08, "loss": 0.2526, "step": 8080 }, { "epoch": 6.802188552188552, "grad_norm": 0.4107275903224945, "learning_rate": 2.430836267443004e-08, "loss": 0.2325, "step": 8081 }, { "epoch": 6.803030303030303, "grad_norm": 0.4097050428390503, "learning_rate": 2.4102089176553657e-08, "loss": 0.234, "step": 8082 }, { "epoch": 6.803872053872054, "grad_norm": 0.39891791343688965, "learning_rate": 2.389669248493931e-08, "loss": 0.2525, "step": 8083 }, { "epoch": 6.8047138047138045, "grad_norm": 0.4652697443962097, "learning_rate": 2.369217263577972e-08, "loss": 0.2509, "step": 8084 }, { "epoch": 6.805555555555555, "grad_norm": 0.43413662910461426, "learning_rate": 2.3488529665113836e-08, "loss": 0.2689, "step": 8085 }, { "epoch": 6.806397306397306, "grad_norm": 0.4273231029510498, "learning_rate": 2.3285763608826283e-08, "loss": 0.2381, "step": 8086 }, { "epoch": 6.807239057239057, "grad_norm": 0.42917588353157043, "learning_rate": 2.308387450264571e-08, "loss": 0.2489, "step": 8087 }, { "epoch": 6.808080808080808, "grad_norm": 0.4423801302909851, "learning_rate": 2.2882862382147543e-08, "loss": 0.2659, "step": 8088 }, { "epoch": 6.808922558922559, "grad_norm": 0.42742136120796204, "learning_rate": 2.268272728275178e-08, "loss": 0.2301, "step": 8089 }, { "epoch": 6.80976430976431, "grad_norm": 0.420261025428772, "learning_rate": 2.2483469239724665e-08, "loss": 0.2555, "step": 8090 }, { "epoch": 6.8106060606060606, "grad_norm": 0.4613786041736603, "learning_rate": 2.2285088288178104e-08, "loss": 0.2696, "step": 8091 }, { "epoch": 6.811447811447811, "grad_norm": 0.4666854739189148, "learning_rate": 2.208758446306858e-08, "loss": 0.2467, "step": 8092 }, { "epoch": 6.812289562289562, "grad_norm": 0.402306467294693, "learning_rate": 2.1890957799198254e-08, "loss": 0.2591, "step": 8093 }, { "epoch": 6.813131313131313, "grad_norm": 0.4554370641708374, "learning_rate": 2.1695208331214413e-08, "loss": 0.2311, "step": 8094 }, { "epoch": 6.813973063973064, "grad_norm": 0.46236273646354675, "learning_rate": 2.150033609361113e-08, "loss": 0.2617, "step": 8095 }, { "epoch": 6.814814814814815, "grad_norm": 0.4232053756713867, "learning_rate": 2.1306341120726493e-08, "loss": 0.2414, "step": 8096 }, { "epoch": 6.815656565656566, "grad_norm": 0.42152121663093567, "learning_rate": 2.1113223446744268e-08, "loss": 0.2316, "step": 8097 }, { "epoch": 6.816498316498317, "grad_norm": 0.41937559843063354, "learning_rate": 2.092098310569446e-08, "loss": 0.2356, "step": 8098 }, { "epoch": 6.8173400673400675, "grad_norm": 0.41178855299949646, "learning_rate": 2.072962013145108e-08, "loss": 0.234, "step": 8099 }, { "epoch": 6.818181818181818, "grad_norm": 0.4141520857810974, "learning_rate": 2.053913455773493e-08, "loss": 0.263, "step": 8100 }, { "epoch": 6.819023569023569, "grad_norm": 0.44345977902412415, "learning_rate": 2.0349526418111386e-08, "loss": 0.2356, "step": 8101 }, { "epoch": 6.81986531986532, "grad_norm": 0.40438416600227356, "learning_rate": 2.01607957459915e-08, "loss": 0.2726, "step": 8102 }, { "epoch": 6.820707070707071, "grad_norm": 0.42855334281921387, "learning_rate": 1.9972942574631447e-08, "loss": 0.263, "step": 8103 }, { "epoch": 6.821548821548822, "grad_norm": 0.4135681390762329, "learning_rate": 1.9785966937133084e-08, "loss": 0.2589, "step": 8104 }, { "epoch": 6.822390572390573, "grad_norm": 0.4133782684803009, "learning_rate": 1.959986886644394e-08, "loss": 0.2477, "step": 8105 }, { "epoch": 6.8232323232323235, "grad_norm": 0.48663774132728577, "learning_rate": 1.9414648395355562e-08, "loss": 0.2166, "step": 8106 }, { "epoch": 6.824074074074074, "grad_norm": 0.43916186690330505, "learning_rate": 1.923030555650629e-08, "loss": 0.2751, "step": 8107 }, { "epoch": 6.824915824915825, "grad_norm": 0.4100300669670105, "learning_rate": 1.904684038237903e-08, "loss": 0.2547, "step": 8108 }, { "epoch": 6.825757575757576, "grad_norm": 0.4187939763069153, "learning_rate": 1.886425290530347e-08, "loss": 0.225, "step": 8109 }, { "epoch": 6.826599326599327, "grad_norm": 0.412821501493454, "learning_rate": 1.868254315745166e-08, "loss": 0.2153, "step": 8110 }, { "epoch": 6.827441077441078, "grad_norm": 0.45925602316856384, "learning_rate": 1.8501711170844093e-08, "loss": 0.2402, "step": 8111 }, { "epoch": 6.828282828282829, "grad_norm": 0.3937370777130127, "learning_rate": 1.8321756977344174e-08, "loss": 0.2513, "step": 8112 }, { "epoch": 6.82912457912458, "grad_norm": 0.4104149341583252, "learning_rate": 1.8142680608663198e-08, "loss": 0.2593, "step": 8113 }, { "epoch": 6.8299663299663305, "grad_norm": 0.4480176866054535, "learning_rate": 1.7964482096355374e-08, "loss": 0.2561, "step": 8114 }, { "epoch": 6.83080808080808, "grad_norm": 0.39610880613327026, "learning_rate": 1.778716147182169e-08, "loss": 0.2727, "step": 8115 }, { "epoch": 6.831649831649831, "grad_norm": 0.4294256567955017, "learning_rate": 1.7610718766307155e-08, "loss": 0.245, "step": 8116 }, { "epoch": 6.832491582491582, "grad_norm": 0.43752700090408325, "learning_rate": 1.7435154010903567e-08, "loss": 0.2703, "step": 8117 }, { "epoch": 6.833333333333333, "grad_norm": 0.40223225951194763, "learning_rate": 1.726046723654673e-08, "loss": 0.2623, "step": 8118 }, { "epoch": 6.834175084175084, "grad_norm": 0.4323994219303131, "learning_rate": 1.7086658474019248e-08, "loss": 0.2394, "step": 8119 }, { "epoch": 6.835016835016835, "grad_norm": 0.4640810489654541, "learning_rate": 1.6913727753947174e-08, "loss": 0.2426, "step": 8120 }, { "epoch": 6.835858585858586, "grad_norm": 0.40309226512908936, "learning_rate": 1.6741675106802802e-08, "loss": 0.2493, "step": 8121 }, { "epoch": 6.8367003367003365, "grad_norm": 0.4103192090988159, "learning_rate": 1.6570500562904102e-08, "loss": 0.2547, "step": 8122 }, { "epoch": 6.837542087542087, "grad_norm": 0.46845728158950806, "learning_rate": 1.6400204152413613e-08, "loss": 0.2523, "step": 8123 }, { "epoch": 6.838383838383838, "grad_norm": 0.4342949688434601, "learning_rate": 1.623078590533955e-08, "loss": 0.2258, "step": 8124 }, { "epoch": 6.839225589225589, "grad_norm": 0.40765073895454407, "learning_rate": 1.6062245851535262e-08, "loss": 0.2561, "step": 8125 }, { "epoch": 6.84006734006734, "grad_norm": 0.44572514295578003, "learning_rate": 1.5894584020698657e-08, "loss": 0.2483, "step": 8126 }, { "epoch": 6.840909090909091, "grad_norm": 0.42083731293678284, "learning_rate": 1.5727800442373875e-08, "loss": 0.2568, "step": 8127 }, { "epoch": 6.841750841750842, "grad_norm": 0.4336468577384949, "learning_rate": 1.5561895145950745e-08, "loss": 0.2418, "step": 8128 }, { "epoch": 6.842592592592593, "grad_norm": 0.4257296919822693, "learning_rate": 1.539686816066255e-08, "loss": 0.2253, "step": 8129 }, { "epoch": 6.843434343434343, "grad_norm": 0.4752040505409241, "learning_rate": 1.52327195155888e-08, "loss": 0.2549, "step": 8130 }, { "epoch": 6.844276094276094, "grad_norm": 0.4271463453769684, "learning_rate": 1.5069449239655255e-08, "loss": 0.2444, "step": 8131 }, { "epoch": 6.845117845117845, "grad_norm": 0.42700451612472534, "learning_rate": 1.490705736163056e-08, "loss": 0.2448, "step": 8132 }, { "epoch": 6.845959595959596, "grad_norm": 0.43424174189567566, "learning_rate": 1.4745543910130721e-08, "loss": 0.2505, "step": 8133 }, { "epoch": 6.846801346801347, "grad_norm": 0.4350326359272003, "learning_rate": 1.4584908913616303e-08, "loss": 0.2771, "step": 8134 }, { "epoch": 6.847643097643098, "grad_norm": 0.45186036825180054, "learning_rate": 1.442515240039244e-08, "loss": 0.2512, "step": 8135 }, { "epoch": 6.848484848484849, "grad_norm": 0.4150189161300659, "learning_rate": 1.4266274398609948e-08, "loss": 0.2662, "step": 8136 }, { "epoch": 6.8493265993265995, "grad_norm": 0.42637670040130615, "learning_rate": 1.4108274936264766e-08, "loss": 0.2454, "step": 8137 }, { "epoch": 6.85016835016835, "grad_norm": 0.42179080843925476, "learning_rate": 1.3951154041197957e-08, "loss": 0.259, "step": 8138 }, { "epoch": 6.851010101010101, "grad_norm": 0.41920098662376404, "learning_rate": 1.3794911741096816e-08, "loss": 0.274, "step": 8139 }, { "epoch": 6.851851851851852, "grad_norm": 0.4467150568962097, "learning_rate": 1.3639548063492102e-08, "loss": 0.2862, "step": 8140 }, { "epoch": 6.852693602693603, "grad_norm": 0.41385436058044434, "learning_rate": 1.3485063035760249e-08, "loss": 0.2487, "step": 8141 }, { "epoch": 6.853535353535354, "grad_norm": 0.4254080653190613, "learning_rate": 1.3331456685124477e-08, "loss": 0.2565, "step": 8142 }, { "epoch": 6.854377104377105, "grad_norm": 0.4175633192062378, "learning_rate": 1.3178729038650361e-08, "loss": 0.2531, "step": 8143 }, { "epoch": 6.8552188552188555, "grad_norm": 0.41350317001342773, "learning_rate": 1.3026880123250817e-08, "loss": 0.2519, "step": 8144 }, { "epoch": 6.856060606060606, "grad_norm": 0.4553215801715851, "learning_rate": 1.287590996568333e-08, "loss": 0.2538, "step": 8145 }, { "epoch": 6.856902356902357, "grad_norm": 0.42358046770095825, "learning_rate": 1.272581859255051e-08, "loss": 0.23, "step": 8146 }, { "epoch": 6.857744107744107, "grad_norm": 0.41988128423690796, "learning_rate": 1.2576606030299532e-08, "loss": 0.2595, "step": 8147 }, { "epoch": 6.858585858585858, "grad_norm": 0.4408142566680908, "learning_rate": 1.2428272305224365e-08, "loss": 0.2452, "step": 8148 }, { "epoch": 6.859427609427609, "grad_norm": 0.43746355175971985, "learning_rate": 1.2280817443461324e-08, "loss": 0.2296, "step": 8149 }, { "epoch": 6.86026936026936, "grad_norm": 0.4056510031223297, "learning_rate": 1.2134241470994623e-08, "loss": 0.2836, "step": 8150 }, { "epoch": 6.861111111111111, "grad_norm": 0.41724660992622375, "learning_rate": 1.1988544413652491e-08, "loss": 0.2774, "step": 8151 }, { "epoch": 6.861952861952862, "grad_norm": 0.4296113848686218, "learning_rate": 1.1843726297108283e-08, "loss": 0.2678, "step": 8152 }, { "epoch": 6.8627946127946124, "grad_norm": 0.41863951086997986, "learning_rate": 1.1699787146879916e-08, "loss": 0.2381, "step": 8153 }, { "epoch": 6.863636363636363, "grad_norm": 0.47336775064468384, "learning_rate": 1.1556726988331547e-08, "loss": 0.2568, "step": 8154 }, { "epoch": 6.864478114478114, "grad_norm": 0.4469755291938782, "learning_rate": 1.1414545846671898e-08, "loss": 0.231, "step": 8155 }, { "epoch": 6.865319865319865, "grad_norm": 0.4341851472854614, "learning_rate": 1.127324374695482e-08, "loss": 0.2514, "step": 8156 }, { "epoch": 6.866161616161616, "grad_norm": 0.4004786014556885, "learning_rate": 1.1132820714079284e-08, "loss": 0.2533, "step": 8157 }, { "epoch": 6.867003367003367, "grad_norm": 0.42310473322868347, "learning_rate": 1.099327677278883e-08, "loss": 0.2255, "step": 8158 }, { "epoch": 6.867845117845118, "grad_norm": 0.43854472041130066, "learning_rate": 1.0854611947672678e-08, "loss": 0.2461, "step": 8159 }, { "epoch": 6.8686868686868685, "grad_norm": 0.4491567015647888, "learning_rate": 1.0716826263165725e-08, "loss": 0.2657, "step": 8160 }, { "epoch": 6.869528619528619, "grad_norm": 0.44098690152168274, "learning_rate": 1.0579919743546884e-08, "loss": 0.261, "step": 8161 }, { "epoch": 6.87037037037037, "grad_norm": 0.43246757984161377, "learning_rate": 1.0443892412940193e-08, "loss": 0.235, "step": 8162 }, { "epoch": 6.871212121212121, "grad_norm": 0.42546913027763367, "learning_rate": 1.0308744295315365e-08, "loss": 0.2672, "step": 8163 }, { "epoch": 6.872053872053872, "grad_norm": 0.43727797269821167, "learning_rate": 1.017447541448724e-08, "loss": 0.2674, "step": 8164 }, { "epoch": 6.872895622895623, "grad_norm": 0.40557676553726196, "learning_rate": 1.0041085794114669e-08, "loss": 0.2453, "step": 8165 }, { "epoch": 6.873737373737374, "grad_norm": 0.4222603738307953, "learning_rate": 9.908575457703296e-09, "loss": 0.2552, "step": 8166 }, { "epoch": 6.874579124579125, "grad_norm": 0.4332832098007202, "learning_rate": 9.776944428602219e-09, "loss": 0.2581, "step": 8167 }, { "epoch": 6.875420875420875, "grad_norm": 0.4021824300289154, "learning_rate": 9.646192730006775e-09, "loss": 0.2398, "step": 8168 }, { "epoch": 6.876262626262626, "grad_norm": 0.4177200198173523, "learning_rate": 9.516320384956313e-09, "loss": 0.2766, "step": 8169 }, { "epoch": 6.877104377104377, "grad_norm": 0.3729129433631897, "learning_rate": 9.387327416335856e-09, "loss": 0.2709, "step": 8170 }, { "epoch": 6.877946127946128, "grad_norm": 0.4002228081226349, "learning_rate": 9.259213846875004e-09, "loss": 0.2503, "step": 8171 }, { "epoch": 6.878787878787879, "grad_norm": 0.40851321816444397, "learning_rate": 9.13197969914903e-09, "loss": 0.2519, "step": 8172 }, { "epoch": 6.87962962962963, "grad_norm": 0.4494236409664154, "learning_rate": 9.005624995578332e-09, "loss": 0.2171, "step": 8173 }, { "epoch": 6.880471380471381, "grad_norm": 0.4299907088279724, "learning_rate": 8.880149758427326e-09, "loss": 0.2623, "step": 8174 }, { "epoch": 6.8813131313131315, "grad_norm": 0.41347068548202515, "learning_rate": 8.755554009806656e-09, "loss": 0.245, "step": 8175 }, { "epoch": 6.882154882154882, "grad_norm": 0.410225510597229, "learning_rate": 8.631837771670982e-09, "loss": 0.2669, "step": 8176 }, { "epoch": 6.882996632996633, "grad_norm": 0.4381839632987976, "learning_rate": 8.509001065820643e-09, "loss": 0.232, "step": 8177 }, { "epoch": 6.883838383838384, "grad_norm": 0.4421037435531616, "learning_rate": 8.38704391390055e-09, "loss": 0.2527, "step": 8178 }, { "epoch": 6.884680134680135, "grad_norm": 0.47844433784484863, "learning_rate": 8.265966337401288e-09, "loss": 0.2599, "step": 8179 }, { "epoch": 6.885521885521886, "grad_norm": 0.44024214148521423, "learning_rate": 8.145768357657458e-09, "loss": 0.2302, "step": 8180 }, { "epoch": 6.886363636363637, "grad_norm": 0.39440134167671204, "learning_rate": 8.026449995849894e-09, "loss": 0.2178, "step": 8181 }, { "epoch": 6.8872053872053876, "grad_norm": 0.41567036509513855, "learning_rate": 7.908011273002892e-09, "loss": 0.2439, "step": 8182 }, { "epoch": 6.888047138047138, "grad_norm": 0.4507554769515991, "learning_rate": 7.790452209987532e-09, "loss": 0.2304, "step": 8183 }, { "epoch": 6.888888888888889, "grad_norm": 0.45138999819755554, "learning_rate": 7.673772827518355e-09, "loss": 0.2363, "step": 8184 }, { "epoch": 6.88973063973064, "grad_norm": 0.45969343185424805, "learning_rate": 7.557973146155583e-09, "loss": 0.221, "step": 8185 }, { "epoch": 6.890572390572391, "grad_norm": 0.40162089467048645, "learning_rate": 7.4430531863045605e-09, "loss": 0.2832, "step": 8186 }, { "epoch": 6.891414141414142, "grad_norm": 0.4252229332923889, "learning_rate": 7.329012968215199e-09, "loss": 0.2439, "step": 8187 }, { "epoch": 6.892255892255893, "grad_norm": 0.4185779392719269, "learning_rate": 7.215852511982535e-09, "loss": 0.2338, "step": 8188 }, { "epoch": 6.893097643097643, "grad_norm": 0.4223805069923401, "learning_rate": 7.103571837547285e-09, "loss": 0.2412, "step": 8189 }, { "epoch": 6.893939393939394, "grad_norm": 0.4035782814025879, "learning_rate": 6.9921709646941784e-09, "loss": 0.2342, "step": 8190 }, { "epoch": 6.8947811447811445, "grad_norm": 0.398272305727005, "learning_rate": 6.881649913052513e-09, "loss": 0.236, "step": 8191 }, { "epoch": 6.895622895622895, "grad_norm": 0.41867244243621826, "learning_rate": 6.772008702098376e-09, "loss": 0.2459, "step": 8192 }, { "epoch": 6.896464646464646, "grad_norm": 0.41654136776924133, "learning_rate": 6.663247351151314e-09, "loss": 0.2546, "step": 8193 }, { "epoch": 6.897306397306397, "grad_norm": 0.4455567002296448, "learning_rate": 6.5553658793754415e-09, "loss": 0.2531, "step": 8194 }, { "epoch": 6.898148148148148, "grad_norm": 0.4406099021434784, "learning_rate": 6.448364305782218e-09, "loss": 0.2273, "step": 8195 }, { "epoch": 6.898989898989899, "grad_norm": 0.43124136328697205, "learning_rate": 6.342242649224895e-09, "loss": 0.2429, "step": 8196 }, { "epoch": 6.89983164983165, "grad_norm": 0.47065258026123047, "learning_rate": 6.237000928404069e-09, "loss": 0.253, "step": 8197 }, { "epoch": 6.9006734006734005, "grad_norm": 0.4333563446998596, "learning_rate": 6.132639161864351e-09, "loss": 0.2527, "step": 8198 }, { "epoch": 6.901515151515151, "grad_norm": 0.44411343336105347, "learning_rate": 6.029157367995475e-09, "loss": 0.2495, "step": 8199 }, { "epoch": 6.902356902356902, "grad_norm": 0.4261072874069214, "learning_rate": 5.926555565031744e-09, "loss": 0.2589, "step": 8200 }, { "epoch": 6.903198653198653, "grad_norm": 0.40049779415130615, "learning_rate": 5.8248337710525845e-09, "loss": 0.2428, "step": 8201 }, { "epoch": 6.904040404040404, "grad_norm": 0.41499167680740356, "learning_rate": 5.723992003983103e-09, "loss": 0.2413, "step": 8202 }, { "epoch": 6.904882154882155, "grad_norm": 0.4011606276035309, "learning_rate": 5.624030281591863e-09, "loss": 0.2841, "step": 8203 }, { "epoch": 6.905723905723906, "grad_norm": 0.4230775535106659, "learning_rate": 5.5249486214942196e-09, "loss": 0.2402, "step": 8204 }, { "epoch": 6.906565656565657, "grad_norm": 0.453885942697525, "learning_rate": 5.426747041148428e-09, "loss": 0.2432, "step": 8205 }, { "epoch": 6.907407407407407, "grad_norm": 0.4056825041770935, "learning_rate": 5.329425557859535e-09, "loss": 0.251, "step": 8206 }, { "epoch": 6.908249158249158, "grad_norm": 0.42982611060142517, "learning_rate": 5.232984188775492e-09, "loss": 0.2621, "step": 8207 }, { "epoch": 6.909090909090909, "grad_norm": 0.44741198420524597, "learning_rate": 5.137422950891591e-09, "loss": 0.2496, "step": 8208 }, { "epoch": 6.90993265993266, "grad_norm": 0.3867114186286926, "learning_rate": 5.042741861045475e-09, "loss": 0.2777, "step": 8209 }, { "epoch": 6.910774410774411, "grad_norm": 0.439006507396698, "learning_rate": 4.948940935922131e-09, "loss": 0.2346, "step": 8210 }, { "epoch": 6.911616161616162, "grad_norm": 0.4639546871185303, "learning_rate": 4.856020192049449e-09, "loss": 0.2721, "step": 8211 }, { "epoch": 6.912457912457913, "grad_norm": 0.4214153289794922, "learning_rate": 4.763979645801553e-09, "loss": 0.2505, "step": 8212 }, { "epoch": 6.9132996632996635, "grad_norm": 0.42842257022857666, "learning_rate": 4.6728193133965815e-09, "loss": 0.2732, "step": 8213 }, { "epoch": 6.914141414141414, "grad_norm": 0.45094722509384155, "learning_rate": 4.582539210897796e-09, "loss": 0.2362, "step": 8214 }, { "epoch": 6.914983164983165, "grad_norm": 0.5048632025718689, "learning_rate": 4.493139354214693e-09, "loss": 0.2777, "step": 8215 }, { "epoch": 6.915824915824916, "grad_norm": 0.41760414838790894, "learning_rate": 4.404619759099116e-09, "loss": 0.2638, "step": 8216 }, { "epoch": 6.916666666666667, "grad_norm": 0.45524269342422485, "learning_rate": 4.316980441149699e-09, "loss": 0.2535, "step": 8217 }, { "epoch": 6.917508417508418, "grad_norm": 0.4047381579875946, "learning_rate": 4.230221415809643e-09, "loss": 0.2654, "step": 8218 }, { "epoch": 6.918350168350169, "grad_norm": 0.3840022385120392, "learning_rate": 4.144342698366166e-09, "loss": 0.2734, "step": 8219 }, { "epoch": 6.91919191919192, "grad_norm": 0.42794138193130493, "learning_rate": 4.059344303952717e-09, "loss": 0.2631, "step": 8220 }, { "epoch": 6.9200336700336695, "grad_norm": 0.4421829581260681, "learning_rate": 3.9752262475467595e-09, "loss": 0.2589, "step": 8221 }, { "epoch": 6.92087542087542, "grad_norm": 0.41528332233428955, "learning_rate": 3.891988543970326e-09, "loss": 0.2397, "step": 8222 }, { "epoch": 6.921717171717171, "grad_norm": 0.40400269627571106, "learning_rate": 3.809631207891129e-09, "loss": 0.232, "step": 8223 }, { "epoch": 6.922558922558922, "grad_norm": 0.4305869936943054, "learning_rate": 3.728154253821448e-09, "loss": 0.2106, "step": 8224 }, { "epoch": 6.923400673400673, "grad_norm": 0.43500691652297974, "learning_rate": 3.6475576961186864e-09, "loss": 0.2493, "step": 8225 }, { "epoch": 6.924242424242424, "grad_norm": 0.4022281765937805, "learning_rate": 3.567841548984263e-09, "loss": 0.2986, "step": 8226 }, { "epoch": 6.925084175084175, "grad_norm": 0.39213061332702637, "learning_rate": 3.489005826465275e-09, "loss": 0.2349, "step": 8227 }, { "epoch": 6.925925925925926, "grad_norm": 0.39034304022789, "learning_rate": 3.4110505424533867e-09, "loss": 0.2725, "step": 8228 }, { "epoch": 6.9267676767676765, "grad_norm": 0.424581378698349, "learning_rate": 3.333975710684834e-09, "loss": 0.2644, "step": 8229 }, { "epoch": 6.927609427609427, "grad_norm": 0.4053855836391449, "learning_rate": 3.2577813447415284e-09, "loss": 0.2366, "step": 8230 }, { "epoch": 6.928451178451178, "grad_norm": 0.4325176775455475, "learning_rate": 3.182467458049954e-09, "loss": 0.2414, "step": 8231 }, { "epoch": 6.929292929292929, "grad_norm": 0.46805912256240845, "learning_rate": 3.10803406388005e-09, "loss": 0.2309, "step": 8232 }, { "epoch": 6.93013468013468, "grad_norm": 0.413346529006958, "learning_rate": 3.034481175349102e-09, "loss": 0.2546, "step": 8233 }, { "epoch": 6.930976430976431, "grad_norm": 0.4039704203605652, "learning_rate": 2.9618088054167436e-09, "loss": 0.2376, "step": 8234 }, { "epoch": 6.931818181818182, "grad_norm": 0.43652185797691345, "learning_rate": 2.8900169668899525e-09, "loss": 0.2682, "step": 8235 }, { "epoch": 6.9326599326599325, "grad_norm": 0.433542937040329, "learning_rate": 2.8191056724180542e-09, "loss": 0.2484, "step": 8236 }, { "epoch": 6.933501683501683, "grad_norm": 0.4168585538864136, "learning_rate": 2.7490749344966094e-09, "loss": 0.2322, "step": 8237 }, { "epoch": 6.934343434343434, "grad_norm": 0.4271705448627472, "learning_rate": 2.679924765466302e-09, "loss": 0.239, "step": 8238 }, { "epoch": 6.935185185185185, "grad_norm": 0.4209720492362976, "learning_rate": 2.6116551775118292e-09, "loss": 0.2692, "step": 8239 }, { "epoch": 6.936026936026936, "grad_norm": 0.424909770488739, "learning_rate": 2.544266182662458e-09, "loss": 0.2479, "step": 8240 }, { "epoch": 6.936868686868687, "grad_norm": 0.4195607006549835, "learning_rate": 2.477757792793689e-09, "loss": 0.2617, "step": 8241 }, { "epoch": 6.937710437710438, "grad_norm": 0.4191210865974426, "learning_rate": 2.4121300196250362e-09, "loss": 0.2511, "step": 8242 }, { "epoch": 6.938552188552189, "grad_norm": 0.422431081533432, "learning_rate": 2.347382874720028e-09, "loss": 0.2384, "step": 8243 }, { "epoch": 6.9393939393939394, "grad_norm": 0.42578309774398804, "learning_rate": 2.2835163694884253e-09, "loss": 0.2784, "step": 8244 }, { "epoch": 6.94023569023569, "grad_norm": 0.4015375077724457, "learning_rate": 2.220530515184005e-09, "loss": 0.2528, "step": 8245 }, { "epoch": 6.941077441077441, "grad_norm": 0.41567128896713257, "learning_rate": 2.158425322905111e-09, "loss": 0.2224, "step": 8246 }, { "epoch": 6.941919191919192, "grad_norm": 0.3818052113056183, "learning_rate": 2.097200803596322e-09, "loss": 0.2553, "step": 8247 }, { "epoch": 6.942760942760943, "grad_norm": 0.4379405975341797, "learning_rate": 2.0368569680451202e-09, "loss": 0.2735, "step": 8248 }, { "epoch": 6.943602693602694, "grad_norm": 0.4650488495826721, "learning_rate": 1.977393826885221e-09, "loss": 0.2383, "step": 8249 }, { "epoch": 6.944444444444445, "grad_norm": 0.4157780110836029, "learning_rate": 1.91881139059491e-09, "loss": 0.2528, "step": 8250 }, { "epoch": 6.9452861952861955, "grad_norm": 0.41417333483695984, "learning_rate": 1.8611096694964858e-09, "loss": 0.2476, "step": 8251 }, { "epoch": 6.946127946127946, "grad_norm": 0.4275321066379547, "learning_rate": 1.8042886737573706e-09, "loss": 0.2243, "step": 8252 }, { "epoch": 6.946969696969697, "grad_norm": 0.424264132976532, "learning_rate": 1.748348413391221e-09, "loss": 0.256, "step": 8253 }, { "epoch": 6.947811447811448, "grad_norm": 0.43374109268188477, "learning_rate": 1.6932888982540419e-09, "loss": 0.2162, "step": 8254 }, { "epoch": 6.948653198653199, "grad_norm": 0.45475199818611145, "learning_rate": 1.6391101380486273e-09, "loss": 0.2349, "step": 8255 }, { "epoch": 6.94949494949495, "grad_norm": 0.42048773169517517, "learning_rate": 1.5858121423212303e-09, "loss": 0.2687, "step": 8256 }, { "epoch": 6.950336700336701, "grad_norm": 0.4030672311782837, "learning_rate": 1.5333949204643372e-09, "loss": 0.2599, "step": 8257 }, { "epoch": 6.951178451178452, "grad_norm": 0.42015567421913147, "learning_rate": 1.4818584817138938e-09, "loss": 0.2581, "step": 8258 }, { "epoch": 6.952020202020202, "grad_norm": 0.45850077271461487, "learning_rate": 1.4312028351515239e-09, "loss": 0.2298, "step": 8259 }, { "epoch": 6.952861952861953, "grad_norm": 0.4258354604244232, "learning_rate": 1.381427989702866e-09, "loss": 0.2597, "step": 8260 }, { "epoch": 6.953703703703704, "grad_norm": 0.42977407574653625, "learning_rate": 1.3325339541386817e-09, "loss": 0.2527, "step": 8261 }, { "epoch": 6.954545454545455, "grad_norm": 0.44011637568473816, "learning_rate": 1.284520737075412e-09, "loss": 0.2387, "step": 8262 }, { "epoch": 6.955387205387205, "grad_norm": 0.4859594404697418, "learning_rate": 1.2373883469724014e-09, "loss": 0.2194, "step": 8263 }, { "epoch": 6.956228956228956, "grad_norm": 0.423588365316391, "learning_rate": 1.1911367921363381e-09, "loss": 0.2587, "step": 8264 }, { "epoch": 6.957070707070707, "grad_norm": 0.41436058282852173, "learning_rate": 1.1457660807157046e-09, "loss": 0.2547, "step": 8265 }, { "epoch": 6.957912457912458, "grad_norm": 0.4316777288913727, "learning_rate": 1.1012762207063265e-09, "loss": 0.245, "step": 8266 }, { "epoch": 6.9587542087542085, "grad_norm": 0.4347928464412689, "learning_rate": 1.057667219946934e-09, "loss": 0.2552, "step": 8267 }, { "epoch": 6.959595959595959, "grad_norm": 0.42596668004989624, "learning_rate": 1.0149390861230457e-09, "loss": 0.2474, "step": 8268 }, { "epoch": 6.96043771043771, "grad_norm": 0.41447076201438904, "learning_rate": 9.73091826763084e-10, "loss": 0.2544, "step": 8269 }, { "epoch": 6.961279461279461, "grad_norm": 0.4275788962841034, "learning_rate": 9.32125449240595e-10, "loss": 0.2477, "step": 8270 }, { "epoch": 6.962121212121212, "grad_norm": 0.40020421147346497, "learning_rate": 8.920399607753594e-10, "loss": 0.2567, "step": 8271 }, { "epoch": 6.962962962962963, "grad_norm": 0.42275625467300415, "learning_rate": 8.528353684300605e-10, "loss": 0.249, "step": 8272 }, { "epoch": 6.963804713804714, "grad_norm": 0.4352809190750122, "learning_rate": 8.145116791130614e-10, "loss": 0.2599, "step": 8273 }, { "epoch": 6.9646464646464645, "grad_norm": 0.41433003544807434, "learning_rate": 7.770688995772935e-10, "loss": 0.2441, "step": 8274 }, { "epoch": 6.965488215488215, "grad_norm": 0.40866029262542725, "learning_rate": 7.405070364213674e-10, "loss": 0.2693, "step": 8275 }, { "epoch": 6.966329966329966, "grad_norm": 0.43814870715141296, "learning_rate": 7.048260960867969e-10, "loss": 0.2196, "step": 8276 }, { "epoch": 6.967171717171717, "grad_norm": 0.42411595582962036, "learning_rate": 6.700260848618856e-10, "loss": 0.2458, "step": 8277 }, { "epoch": 6.968013468013468, "grad_norm": 0.4139501452445984, "learning_rate": 6.361070088783949e-10, "loss": 0.26, "step": 8278 }, { "epoch": 6.968855218855219, "grad_norm": 0.40473997592926025, "learning_rate": 6.030688741126556e-10, "loss": 0.2475, "step": 8279 }, { "epoch": 6.96969696969697, "grad_norm": 0.42664608359336853, "learning_rate": 5.709116863872321e-10, "loss": 0.2453, "step": 8280 }, { "epoch": 6.970538720538721, "grad_norm": 0.4251115918159485, "learning_rate": 5.396354513681478e-10, "loss": 0.2422, "step": 8281 }, { "epoch": 6.9713804713804715, "grad_norm": 0.40810638666152954, "learning_rate": 5.09240174567105e-10, "loss": 0.2379, "step": 8282 }, { "epoch": 6.972222222222222, "grad_norm": 0.43449631333351135, "learning_rate": 4.79725861338709e-10, "loss": 0.2261, "step": 8283 }, { "epoch": 6.973063973063973, "grad_norm": 0.418042927980423, "learning_rate": 4.510925168854652e-10, "loss": 0.2315, "step": 8284 }, { "epoch": 6.973905723905724, "grad_norm": 0.41624879837036133, "learning_rate": 4.2334014625167175e-10, "loss": 0.2556, "step": 8285 }, { "epoch": 6.974747474747475, "grad_norm": 0.43870681524276733, "learning_rate": 3.964687543278611e-10, "loss": 0.2581, "step": 8286 }, { "epoch": 6.975589225589226, "grad_norm": 0.4191230535507202, "learning_rate": 3.7047834584913457e-10, "loss": 0.2668, "step": 8287 }, { "epoch": 6.976430976430977, "grad_norm": 0.39500692486763, "learning_rate": 3.45368925395162e-10, "loss": 0.2257, "step": 8288 }, { "epoch": 6.9772727272727275, "grad_norm": 0.6327937245368958, "learning_rate": 3.211404973907373e-10, "loss": 0.2525, "step": 8289 }, { "epoch": 6.978114478114478, "grad_norm": 0.4053162634372711, "learning_rate": 2.9779306610522307e-10, "loss": 0.2434, "step": 8290 }, { "epoch": 6.978956228956229, "grad_norm": 0.4703013002872467, "learning_rate": 2.7532663565199567e-10, "loss": 0.233, "step": 8291 }, { "epoch": 6.97979797979798, "grad_norm": 0.40991467237472534, "learning_rate": 2.537412099912207e-10, "loss": 0.254, "step": 8292 }, { "epoch": 6.980639730639731, "grad_norm": 0.3997485637664795, "learning_rate": 2.330367929248567e-10, "loss": 0.2592, "step": 8293 }, { "epoch": 6.981481481481482, "grad_norm": 0.3835304081439972, "learning_rate": 2.1321338810220692e-10, "loss": 0.264, "step": 8294 }, { "epoch": 6.982323232323233, "grad_norm": 0.470580518245697, "learning_rate": 1.9427099901658808e-10, "loss": 0.2342, "step": 8295 }, { "epoch": 6.983164983164983, "grad_norm": 0.4531618058681488, "learning_rate": 1.7620962900533058e-10, "loss": 0.2549, "step": 8296 }, { "epoch": 6.9840067340067336, "grad_norm": 0.42648249864578247, "learning_rate": 1.5902928125088867e-10, "loss": 0.2556, "step": 8297 }, { "epoch": 6.984848484848484, "grad_norm": 0.45011958479881287, "learning_rate": 1.427299587808406e-10, "loss": 0.2357, "step": 8298 }, { "epoch": 6.985690235690235, "grad_norm": 0.4406871497631073, "learning_rate": 1.2731166446788846e-10, "loss": 0.2478, "step": 8299 }, { "epoch": 6.986531986531986, "grad_norm": 0.4080612063407898, "learning_rate": 1.1277440102763771e-10, "loss": 0.2389, "step": 8300 }, { "epoch": 6.987373737373737, "grad_norm": 0.44067856669425964, "learning_rate": 9.911817102303822e-11, "loss": 0.2444, "step": 8301 }, { "epoch": 6.988215488215488, "grad_norm": 0.49260151386260986, "learning_rate": 8.63429768599433e-11, "loss": 0.2446, "step": 8302 }, { "epoch": 6.989057239057239, "grad_norm": 0.4396147131919861, "learning_rate": 7.444882078877502e-11, "loss": 0.2348, "step": 8303 }, { "epoch": 6.98989898989899, "grad_norm": 0.4583425521850586, "learning_rate": 6.343570490674466e-11, "loss": 0.2389, "step": 8304 }, { "epoch": 6.9907407407407405, "grad_norm": 0.4229132831096649, "learning_rate": 5.330363115341186e-11, "loss": 0.2283, "step": 8305 }, { "epoch": 6.991582491582491, "grad_norm": 0.4375646412372589, "learning_rate": 4.4052601314570345e-11, "loss": 0.2147, "step": 8306 }, { "epoch": 6.992424242424242, "grad_norm": 0.44654542207717896, "learning_rate": 3.568261702002751e-11, "loss": 0.2453, "step": 8307 }, { "epoch": 6.993265993265993, "grad_norm": 0.44349879026412964, "learning_rate": 2.8193679745269763e-11, "loss": 0.2684, "step": 8308 }, { "epoch": 6.994107744107744, "grad_norm": 0.43197259306907654, "learning_rate": 2.158579080979717e-11, "loss": 0.2294, "step": 8309 }, { "epoch": 6.994949494949495, "grad_norm": 0.4419395327568054, "learning_rate": 1.5858951377123454e-11, "loss": 0.2349, "step": 8310 }, { "epoch": 6.995791245791246, "grad_norm": 0.4491584002971649, "learning_rate": 1.1013162457551573e-11, "loss": 0.2411, "step": 8311 }, { "epoch": 6.9966329966329965, "grad_norm": 0.44478219747543335, "learning_rate": 7.0484249042879205e-12, "loss": 0.2529, "step": 8312 }, { "epoch": 6.997474747474747, "grad_norm": 0.42163243889808655, "learning_rate": 3.964739416217889e-12, "loss": 0.2238, "step": 8313 }, { "epoch": 6.998316498316498, "grad_norm": 0.4159494936466217, "learning_rate": 1.7621065367956491e-12, "loss": 0.243, "step": 8314 }, { "epoch": 6.999158249158249, "grad_norm": 0.40114626288414, "learning_rate": 4.4052665348903735e-13, "loss": 0.2396, "step": 8315 }, { "epoch": 7.0, "grad_norm": 0.43578237295150757, "learning_rate": 0.0, "loss": 0.2259, "step": 8316 }, { "epoch": 7.0, "step": 8316, "total_flos": 1.0787518180687872e+16, "train_loss": 0.14147361681159154, "train_runtime": 56048.8788, "train_samples_per_second": 14.232, "train_steps_per_second": 0.148 } ], "logging_steps": 1, "max_steps": 8316, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0787518180687872e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }