{ "best_metric": 0.6554008152173914, "best_model_checkpoint": "demo_LID_ntu-spml_distilhubert/checkpoint-6930", "epoch": 9.99891891891892, "eval_steps": 500, "global_step": 6930, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014414414414414415, "grad_norm": 2.169387102127075, "learning_rate": 4.329004329004329e-06, "loss": 15.2197, "step": 10 }, { "epoch": 0.02882882882882883, "grad_norm": 2.4440665245056152, "learning_rate": 8.658008658008657e-06, "loss": 15.2046, "step": 20 }, { "epoch": 0.043243243243243246, "grad_norm": 1.9768311977386475, "learning_rate": 1.2987012987012986e-05, "loss": 15.2027, "step": 30 }, { "epoch": 0.05765765765765766, "grad_norm": 2.598134994506836, "learning_rate": 1.7316017316017315e-05, "loss": 15.1842, "step": 40 }, { "epoch": 0.07207207207207207, "grad_norm": 2.2137622833251953, "learning_rate": 2.164502164502164e-05, "loss": 15.1876, "step": 50 }, { "epoch": 0.08648648648648649, "grad_norm": 2.252912759780884, "learning_rate": 2.5974025974025972e-05, "loss": 15.172, "step": 60 }, { "epoch": 0.1009009009009009, "grad_norm": 2.699625015258789, "learning_rate": 3.03030303030303e-05, "loss": 15.1004, "step": 70 }, { "epoch": 0.11531531531531532, "grad_norm": 2.774757146835327, "learning_rate": 3.463203463203463e-05, "loss": 15.0877, "step": 80 }, { "epoch": 0.12972972972972974, "grad_norm": 2.9454381465911865, "learning_rate": 3.896103896103895e-05, "loss": 15.0704, "step": 90 }, { "epoch": 0.14414414414414414, "grad_norm": 3.3984997272491455, "learning_rate": 4.329004329004328e-05, "loss": 15.0211, "step": 100 }, { "epoch": 0.15855855855855855, "grad_norm": 3.1876633167266846, "learning_rate": 4.7619047619047614e-05, "loss": 14.9973, "step": 110 }, { "epoch": 0.17297297297297298, "grad_norm": 3.87903094291687, "learning_rate": 5.1948051948051944e-05, "loss": 14.9534, "step": 120 }, { "epoch": 0.1873873873873874, "grad_norm": 4.1114983558654785, "learning_rate": 5.627705627705627e-05, "loss": 14.8464, "step": 130 }, { "epoch": 0.2018018018018018, "grad_norm": 3.837207078933716, "learning_rate": 6.06060606060606e-05, "loss": 14.8935, "step": 140 }, { "epoch": 0.21621621621621623, "grad_norm": 3.978295087814331, "learning_rate": 6.493506493506494e-05, "loss": 14.8274, "step": 150 }, { "epoch": 0.23063063063063063, "grad_norm": 5.2168145179748535, "learning_rate": 6.926406926406926e-05, "loss": 14.7065, "step": 160 }, { "epoch": 0.24504504504504504, "grad_norm": 5.752880096435547, "learning_rate": 7.359307359307358e-05, "loss": 14.6178, "step": 170 }, { "epoch": 0.2594594594594595, "grad_norm": 6.018016338348389, "learning_rate": 7.79220779220779e-05, "loss": 14.4008, "step": 180 }, { "epoch": 0.27387387387387385, "grad_norm": 5.537229537963867, "learning_rate": 8.225108225108224e-05, "loss": 14.3105, "step": 190 }, { "epoch": 0.2882882882882883, "grad_norm": 6.358255863189697, "learning_rate": 8.658008658008657e-05, "loss": 14.1688, "step": 200 }, { "epoch": 0.3027027027027027, "grad_norm": 6.9536356925964355, "learning_rate": 9.09090909090909e-05, "loss": 14.1205, "step": 210 }, { "epoch": 0.3171171171171171, "grad_norm": 8.093494415283203, "learning_rate": 9.523809523809523e-05, "loss": 14.1292, "step": 220 }, { "epoch": 0.33153153153153153, "grad_norm": 6.803300380706787, "learning_rate": 9.956709956709956e-05, "loss": 13.9276, "step": 230 }, { "epoch": 0.34594594594594597, "grad_norm": 6.665808200836182, "learning_rate": 0.00010389610389610389, "loss": 13.9136, "step": 240 }, { "epoch": 0.36036036036036034, "grad_norm": 10.191052436828613, "learning_rate": 0.00010822510822510823, "loss": 13.708, "step": 250 }, { "epoch": 0.3747747747747748, "grad_norm": 7.783840656280518, "learning_rate": 0.00011255411255411254, "loss": 13.6658, "step": 260 }, { "epoch": 0.3891891891891892, "grad_norm": 11.964157104492188, "learning_rate": 0.00011688311688311687, "loss": 13.6014, "step": 270 }, { "epoch": 0.4036036036036036, "grad_norm": 7.828129291534424, "learning_rate": 0.0001212121212121212, "loss": 13.3956, "step": 280 }, { "epoch": 0.418018018018018, "grad_norm": 8.642557144165039, "learning_rate": 0.00012554112554112555, "loss": 13.4701, "step": 290 }, { "epoch": 0.43243243243243246, "grad_norm": 8.499011993408203, "learning_rate": 0.00012987012987012987, "loss": 13.2608, "step": 300 }, { "epoch": 0.44684684684684683, "grad_norm": 9.103832244873047, "learning_rate": 0.0001341991341991342, "loss": 12.8141, "step": 310 }, { "epoch": 0.46126126126126127, "grad_norm": 8.243462562561035, "learning_rate": 0.00013852813852813852, "loss": 12.8678, "step": 320 }, { "epoch": 0.4756756756756757, "grad_norm": 12.445680618286133, "learning_rate": 0.00014285714285714284, "loss": 12.9204, "step": 330 }, { "epoch": 0.4900900900900901, "grad_norm": 10.037951469421387, "learning_rate": 0.00014718614718614716, "loss": 12.9456, "step": 340 }, { "epoch": 0.5045045045045045, "grad_norm": 14.364166259765625, "learning_rate": 0.00015151515151515152, "loss": 12.5126, "step": 350 }, { "epoch": 0.518918918918919, "grad_norm": 10.338336944580078, "learning_rate": 0.0001558441558441558, "loss": 13.2546, "step": 360 }, { "epoch": 0.5333333333333333, "grad_norm": 9.899740219116211, "learning_rate": 0.00016017316017316016, "loss": 12.3445, "step": 370 }, { "epoch": 0.5477477477477477, "grad_norm": 11.309089660644531, "learning_rate": 0.00016450216450216449, "loss": 12.2799, "step": 380 }, { "epoch": 0.5621621621621622, "grad_norm": 11.268434524536133, "learning_rate": 0.00016883116883116884, "loss": 12.1578, "step": 390 }, { "epoch": 0.5765765765765766, "grad_norm": 9.793964385986328, "learning_rate": 0.00017316017316017313, "loss": 11.9812, "step": 400 }, { "epoch": 0.590990990990991, "grad_norm": 11.267273902893066, "learning_rate": 0.00017748917748917746, "loss": 12.1401, "step": 410 }, { "epoch": 0.6054054054054054, "grad_norm": 10.383160591125488, "learning_rate": 0.0001818181818181818, "loss": 12.0603, "step": 420 }, { "epoch": 0.6198198198198198, "grad_norm": 14.343868255615234, "learning_rate": 0.00018614718614718616, "loss": 11.2182, "step": 430 }, { "epoch": 0.6342342342342342, "grad_norm": 13.931622505187988, "learning_rate": 0.00019047619047619045, "loss": 11.6929, "step": 440 }, { "epoch": 0.6486486486486487, "grad_norm": 12.756230354309082, "learning_rate": 0.00019480519480519478, "loss": 11.9651, "step": 450 }, { "epoch": 0.6630630630630631, "grad_norm": 13.018777847290039, "learning_rate": 0.00019913419913419913, "loss": 11.6416, "step": 460 }, { "epoch": 0.6774774774774774, "grad_norm": 13.232623100280762, "learning_rate": 0.00020346320346320345, "loss": 11.4997, "step": 470 }, { "epoch": 0.6918918918918919, "grad_norm": 12.543861389160156, "learning_rate": 0.00020779220779220778, "loss": 11.5597, "step": 480 }, { "epoch": 0.7063063063063063, "grad_norm": 12.517231941223145, "learning_rate": 0.0002121212121212121, "loss": 11.1162, "step": 490 }, { "epoch": 0.7207207207207207, "grad_norm": 14.61859130859375, "learning_rate": 0.00021645021645021645, "loss": 11.2086, "step": 500 }, { "epoch": 0.7351351351351352, "grad_norm": 14.246715545654297, "learning_rate": 0.00022077922077922075, "loss": 11.2519, "step": 510 }, { "epoch": 0.7495495495495496, "grad_norm": 13.88980484008789, "learning_rate": 0.00022510822510822507, "loss": 10.9391, "step": 520 }, { "epoch": 0.7639639639639639, "grad_norm": 14.310384750366211, "learning_rate": 0.00022943722943722942, "loss": 10.7129, "step": 530 }, { "epoch": 0.7783783783783784, "grad_norm": 13.765666007995605, "learning_rate": 0.00023376623376623374, "loss": 11.218, "step": 540 }, { "epoch": 0.7927927927927928, "grad_norm": 17.789613723754883, "learning_rate": 0.00023809523809523807, "loss": 10.2992, "step": 550 }, { "epoch": 0.8072072072072072, "grad_norm": 17.212533950805664, "learning_rate": 0.0002424242424242424, "loss": 11.1959, "step": 560 }, { "epoch": 0.8216216216216217, "grad_norm": 14.872720718383789, "learning_rate": 0.00024675324675324674, "loss": 9.933, "step": 570 }, { "epoch": 0.836036036036036, "grad_norm": 14.751778602600098, "learning_rate": 0.0002510822510822511, "loss": 10.2721, "step": 580 }, { "epoch": 0.8504504504504504, "grad_norm": 13.110413551330566, "learning_rate": 0.0002554112554112554, "loss": 10.0697, "step": 590 }, { "epoch": 0.8648648648648649, "grad_norm": 14.484004020690918, "learning_rate": 0.00025974025974025974, "loss": 10.6599, "step": 600 }, { "epoch": 0.8792792792792793, "grad_norm": 15.150849342346191, "learning_rate": 0.00026406926406926404, "loss": 10.3077, "step": 610 }, { "epoch": 0.8936936936936937, "grad_norm": 19.270540237426758, "learning_rate": 0.0002683982683982684, "loss": 10.2954, "step": 620 }, { "epoch": 0.9081081081081082, "grad_norm": 17.365564346313477, "learning_rate": 0.0002727272727272727, "loss": 10.2966, "step": 630 }, { "epoch": 0.9225225225225225, "grad_norm": 23.610044479370117, "learning_rate": 0.00027705627705627703, "loss": 9.4401, "step": 640 }, { "epoch": 0.9369369369369369, "grad_norm": 16.38220977783203, "learning_rate": 0.0002813852813852814, "loss": 9.8423, "step": 650 }, { "epoch": 0.9513513513513514, "grad_norm": 18.670101165771484, "learning_rate": 0.0002857142857142857, "loss": 10.2396, "step": 660 }, { "epoch": 0.9657657657657658, "grad_norm": 20.733997344970703, "learning_rate": 0.00029004329004329003, "loss": 9.3347, "step": 670 }, { "epoch": 0.9801801801801802, "grad_norm": 18.066375732421875, "learning_rate": 0.00029437229437229433, "loss": 10.4626, "step": 680 }, { "epoch": 0.9945945945945946, "grad_norm": 18.0963191986084, "learning_rate": 0.0002987012987012987, "loss": 9.6557, "step": 690 }, { "epoch": 0.9989189189189189, "eval_accuracy": 0.26137907608695654, "eval_loss": 2.65486216545105, "eval_runtime": 541.7254, "eval_samples_per_second": 10.869, "eval_steps_per_second": 10.869, "step": 693 }, { "epoch": 1.01009009009009, "grad_norm": 15.17456340789795, "learning_rate": 0.00029966329966329963, "loss": 10.1474, "step": 700 }, { "epoch": 1.0245045045045045, "grad_norm": 19.106407165527344, "learning_rate": 0.00029918229918229916, "loss": 8.6672, "step": 710 }, { "epoch": 1.038918918918919, "grad_norm": 16.296113967895508, "learning_rate": 0.0002987012987012987, "loss": 8.7251, "step": 720 }, { "epoch": 1.0533333333333332, "grad_norm": 22.187761306762695, "learning_rate": 0.00029826839826839827, "loss": 9.2252, "step": 730 }, { "epoch": 1.0677477477477477, "grad_norm": 17.774612426757812, "learning_rate": 0.00029778739778739773, "loss": 8.3988, "step": 740 }, { "epoch": 1.0821621621621622, "grad_norm": 22.759864807128906, "learning_rate": 0.0002973063973063973, "loss": 8.4637, "step": 750 }, { "epoch": 1.0965765765765765, "grad_norm": 22.068397521972656, "learning_rate": 0.0002968253968253968, "loss": 9.4532, "step": 760 }, { "epoch": 1.110990990990991, "grad_norm": 22.11869239807129, "learning_rate": 0.0002963443963443963, "loss": 8.5823, "step": 770 }, { "epoch": 1.1254054054054055, "grad_norm": 20.577394485473633, "learning_rate": 0.0002958633958633958, "loss": 8.8257, "step": 780 }, { "epoch": 1.1398198198198197, "grad_norm": 19.24051856994629, "learning_rate": 0.00029538239538239535, "loss": 8.4165, "step": 790 }, { "epoch": 1.1542342342342342, "grad_norm": 18.745025634765625, "learning_rate": 0.00029490139490139487, "loss": 8.4419, "step": 800 }, { "epoch": 1.1686486486486487, "grad_norm": 16.836870193481445, "learning_rate": 0.0002944203944203944, "loss": 8.2076, "step": 810 }, { "epoch": 1.183063063063063, "grad_norm": 23.824594497680664, "learning_rate": 0.0002939393939393939, "loss": 7.8032, "step": 820 }, { "epoch": 1.1974774774774775, "grad_norm": 17.577869415283203, "learning_rate": 0.00029345839345839344, "loss": 8.3441, "step": 830 }, { "epoch": 1.211891891891892, "grad_norm": 17.508779525756836, "learning_rate": 0.00029297739297739296, "loss": 8.1213, "step": 840 }, { "epoch": 1.2263063063063062, "grad_norm": 16.90478515625, "learning_rate": 0.0002924963924963925, "loss": 7.6077, "step": 850 }, { "epoch": 1.2407207207207207, "grad_norm": 20.760663986206055, "learning_rate": 0.000292015392015392, "loss": 7.8654, "step": 860 }, { "epoch": 1.2551351351351352, "grad_norm": 20.966073989868164, "learning_rate": 0.00029153439153439153, "loss": 7.7627, "step": 870 }, { "epoch": 1.2695495495495495, "grad_norm": 18.766395568847656, "learning_rate": 0.000291053391053391, "loss": 7.0404, "step": 880 }, { "epoch": 1.283963963963964, "grad_norm": 20.34043312072754, "learning_rate": 0.0002905723905723906, "loss": 8.2117, "step": 890 }, { "epoch": 1.2983783783783784, "grad_norm": 22.05991554260254, "learning_rate": 0.00029009139009139004, "loss": 7.5249, "step": 900 }, { "epoch": 1.3127927927927927, "grad_norm": 18.58563232421875, "learning_rate": 0.00028961038961038956, "loss": 7.9662, "step": 910 }, { "epoch": 1.3272072072072072, "grad_norm": 18.942352294921875, "learning_rate": 0.0002891293891293891, "loss": 7.7609, "step": 920 }, { "epoch": 1.3416216216216217, "grad_norm": 23.675949096679688, "learning_rate": 0.0002886483886483886, "loss": 7.4968, "step": 930 }, { "epoch": 1.356036036036036, "grad_norm": 22.53910255432129, "learning_rate": 0.00028816738816738813, "loss": 7.9113, "step": 940 }, { "epoch": 1.3704504504504504, "grad_norm": 21.479690551757812, "learning_rate": 0.00028768638768638766, "loss": 6.8956, "step": 950 }, { "epoch": 1.384864864864865, "grad_norm": 20.469209671020508, "learning_rate": 0.0002872053872053872, "loss": 7.2737, "step": 960 }, { "epoch": 1.3992792792792792, "grad_norm": 17.538774490356445, "learning_rate": 0.0002867243867243867, "loss": 7.2458, "step": 970 }, { "epoch": 1.4136936936936937, "grad_norm": 22.793577194213867, "learning_rate": 0.0002862433862433862, "loss": 7.2339, "step": 980 }, { "epoch": 1.4281081081081082, "grad_norm": 18.235897064208984, "learning_rate": 0.00028576238576238575, "loss": 7.6416, "step": 990 }, { "epoch": 1.4425225225225224, "grad_norm": 24.108549118041992, "learning_rate": 0.00028528138528138527, "loss": 7.5449, "step": 1000 }, { "epoch": 1.456936936936937, "grad_norm": 23.248693466186523, "learning_rate": 0.0002848003848003848, "loss": 7.0878, "step": 1010 }, { "epoch": 1.4713513513513514, "grad_norm": 20.034454345703125, "learning_rate": 0.00028431938431938426, "loss": 7.426, "step": 1020 }, { "epoch": 1.4857657657657657, "grad_norm": 22.129047393798828, "learning_rate": 0.00028383838383838384, "loss": 6.9635, "step": 1030 }, { "epoch": 1.5001801801801802, "grad_norm": 20.906335830688477, "learning_rate": 0.0002833573833573833, "loss": 7.1704, "step": 1040 }, { "epoch": 1.5145945945945947, "grad_norm": 22.88907814025879, "learning_rate": 0.0002828763828763829, "loss": 7.1875, "step": 1050 }, { "epoch": 1.529009009009009, "grad_norm": 23.162479400634766, "learning_rate": 0.00028239538239538235, "loss": 7.665, "step": 1060 }, { "epoch": 1.5434234234234234, "grad_norm": 22.069990158081055, "learning_rate": 0.00028191438191438187, "loss": 7.0347, "step": 1070 }, { "epoch": 1.557837837837838, "grad_norm": 21.646320343017578, "learning_rate": 0.0002814333814333814, "loss": 7.4735, "step": 1080 }, { "epoch": 1.5722522522522522, "grad_norm": 22.21576499938965, "learning_rate": 0.0002809523809523809, "loss": 7.3836, "step": 1090 }, { "epoch": 1.5866666666666667, "grad_norm": 17.76190757751465, "learning_rate": 0.00028047138047138044, "loss": 7.2981, "step": 1100 }, { "epoch": 1.6010810810810812, "grad_norm": 15.208210945129395, "learning_rate": 0.00027999037999037996, "loss": 6.1374, "step": 1110 }, { "epoch": 1.6154954954954954, "grad_norm": 24.096397399902344, "learning_rate": 0.0002795093795093795, "loss": 6.3449, "step": 1120 }, { "epoch": 1.62990990990991, "grad_norm": 23.264659881591797, "learning_rate": 0.000279028379028379, "loss": 6.9955, "step": 1130 }, { "epoch": 1.6443243243243244, "grad_norm": 23.365312576293945, "learning_rate": 0.00027854737854737853, "loss": 6.7135, "step": 1140 }, { "epoch": 1.6587387387387387, "grad_norm": 18.671892166137695, "learning_rate": 0.00027806637806637805, "loss": 6.3113, "step": 1150 }, { "epoch": 1.6731531531531532, "grad_norm": 22.89389991760254, "learning_rate": 0.0002775853775853776, "loss": 6.6979, "step": 1160 }, { "epoch": 1.6875675675675677, "grad_norm": 22.493839263916016, "learning_rate": 0.0002771043771043771, "loss": 5.7641, "step": 1170 }, { "epoch": 1.701981981981982, "grad_norm": 24.027435302734375, "learning_rate": 0.00027662337662337657, "loss": 7.2983, "step": 1180 }, { "epoch": 1.7163963963963964, "grad_norm": 19.027225494384766, "learning_rate": 0.00027614237614237614, "loss": 6.2111, "step": 1190 }, { "epoch": 1.730810810810811, "grad_norm": 27.56620979309082, "learning_rate": 0.0002756613756613756, "loss": 6.7366, "step": 1200 }, { "epoch": 1.7452252252252252, "grad_norm": 16.027616500854492, "learning_rate": 0.00027518037518037513, "loss": 6.1943, "step": 1210 }, { "epoch": 1.7596396396396397, "grad_norm": 20.16025161743164, "learning_rate": 0.0002746993746993747, "loss": 6.3816, "step": 1220 }, { "epoch": 1.7740540540540541, "grad_norm": 13.574505805969238, "learning_rate": 0.0002742183742183742, "loss": 5.9191, "step": 1230 }, { "epoch": 1.7884684684684684, "grad_norm": 19.855785369873047, "learning_rate": 0.0002737373737373737, "loss": 6.3663, "step": 1240 }, { "epoch": 1.802882882882883, "grad_norm": 20.211448669433594, "learning_rate": 0.0002732563732563732, "loss": 6.4382, "step": 1250 }, { "epoch": 1.8172972972972974, "grad_norm": 21.60570526123047, "learning_rate": 0.00027277537277537275, "loss": 6.7056, "step": 1260 }, { "epoch": 1.8317117117117117, "grad_norm": 20.5418758392334, "learning_rate": 0.00027229437229437227, "loss": 5.5842, "step": 1270 }, { "epoch": 1.8461261261261261, "grad_norm": 27.491355895996094, "learning_rate": 0.0002718133718133718, "loss": 5.9011, "step": 1280 }, { "epoch": 1.8605405405405406, "grad_norm": 23.979827880859375, "learning_rate": 0.0002713323713323713, "loss": 6.084, "step": 1290 }, { "epoch": 1.874954954954955, "grad_norm": 18.55582618713379, "learning_rate": 0.00027085137085137084, "loss": 6.0097, "step": 1300 }, { "epoch": 1.8893693693693694, "grad_norm": 19.917762756347656, "learning_rate": 0.00027037037037037036, "loss": 5.7525, "step": 1310 }, { "epoch": 1.9037837837837839, "grad_norm": 17.546810150146484, "learning_rate": 0.00026988936988936983, "loss": 6.3093, "step": 1320 }, { "epoch": 1.9181981981981981, "grad_norm": 26.043676376342773, "learning_rate": 0.0002694083694083694, "loss": 5.9062, "step": 1330 }, { "epoch": 1.9326126126126126, "grad_norm": 22.03000831604004, "learning_rate": 0.00026892736892736893, "loss": 6.4594, "step": 1340 }, { "epoch": 1.9470270270270271, "grad_norm": 23.965402603149414, "learning_rate": 0.0002684463684463684, "loss": 6.3053, "step": 1350 }, { "epoch": 1.9614414414414414, "grad_norm": 21.040790557861328, "learning_rate": 0.000267965367965368, "loss": 5.4142, "step": 1360 }, { "epoch": 1.9758558558558559, "grad_norm": 22.65288543701172, "learning_rate": 0.00026748436748436744, "loss": 6.5429, "step": 1370 }, { "epoch": 1.9902702702702704, "grad_norm": 19.748960494995117, "learning_rate": 0.00026700336700336696, "loss": 6.1707, "step": 1380 }, { "epoch": 1.998918918918919, "eval_accuracy": 0.468070652173913, "eval_loss": 1.8478443622589111, "eval_runtime": 536.954, "eval_samples_per_second": 10.966, "eval_steps_per_second": 10.966, "step": 1386 }, { "epoch": 2.0057657657657657, "grad_norm": 20.129833221435547, "learning_rate": 0.0002665223665223665, "loss": 5.5637, "step": 1390 }, { "epoch": 2.02018018018018, "grad_norm": 18.542203903198242, "learning_rate": 0.000266041366041366, "loss": 4.8547, "step": 1400 }, { "epoch": 2.0345945945945947, "grad_norm": 16.80269432067871, "learning_rate": 0.00026556036556036553, "loss": 4.9395, "step": 1410 }, { "epoch": 2.049009009009009, "grad_norm": 24.43153953552246, "learning_rate": 0.00026507936507936506, "loss": 4.8408, "step": 1420 }, { "epoch": 2.063423423423423, "grad_norm": 20.406522750854492, "learning_rate": 0.0002645983645983646, "loss": 4.3663, "step": 1430 }, { "epoch": 2.077837837837838, "grad_norm": 17.540870666503906, "learning_rate": 0.0002641173641173641, "loss": 3.6172, "step": 1440 }, { "epoch": 2.092252252252252, "grad_norm": 22.39369773864746, "learning_rate": 0.0002636363636363636, "loss": 4.5143, "step": 1450 }, { "epoch": 2.1066666666666665, "grad_norm": 24.582853317260742, "learning_rate": 0.00026315536315536315, "loss": 4.4835, "step": 1460 }, { "epoch": 2.121081081081081, "grad_norm": 22.656949996948242, "learning_rate": 0.00026267436267436267, "loss": 4.4713, "step": 1470 }, { "epoch": 2.1354954954954954, "grad_norm": 22.375396728515625, "learning_rate": 0.0002621933621933622, "loss": 4.4695, "step": 1480 }, { "epoch": 2.1499099099099097, "grad_norm": 17.02708625793457, "learning_rate": 0.00026171236171236166, "loss": 3.8927, "step": 1490 }, { "epoch": 2.1643243243243244, "grad_norm": 19.711584091186523, "learning_rate": 0.00026123136123136124, "loss": 3.9472, "step": 1500 }, { "epoch": 2.1787387387387387, "grad_norm": 18.87154197692871, "learning_rate": 0.0002607503607503607, "loss": 4.8518, "step": 1510 }, { "epoch": 2.193153153153153, "grad_norm": 25.693981170654297, "learning_rate": 0.0002602693602693603, "loss": 4.6599, "step": 1520 }, { "epoch": 2.2075675675675677, "grad_norm": 15.880191802978516, "learning_rate": 0.00025978835978835975, "loss": 4.1435, "step": 1530 }, { "epoch": 2.221981981981982, "grad_norm": 20.515146255493164, "learning_rate": 0.00025930735930735927, "loss": 4.1378, "step": 1540 }, { "epoch": 2.236396396396396, "grad_norm": 23.654556274414062, "learning_rate": 0.0002588263588263588, "loss": 4.4749, "step": 1550 }, { "epoch": 2.250810810810811, "grad_norm": 25.85966682434082, "learning_rate": 0.0002583453583453583, "loss": 4.2029, "step": 1560 }, { "epoch": 2.265225225225225, "grad_norm": 21.542530059814453, "learning_rate": 0.00025786435786435784, "loss": 4.6039, "step": 1570 }, { "epoch": 2.2796396396396394, "grad_norm": 19.57372283935547, "learning_rate": 0.00025738335738335736, "loss": 4.0779, "step": 1580 }, { "epoch": 2.294054054054054, "grad_norm": 20.794376373291016, "learning_rate": 0.0002569023569023569, "loss": 4.7794, "step": 1590 }, { "epoch": 2.3084684684684684, "grad_norm": 23.753938674926758, "learning_rate": 0.0002564213564213564, "loss": 4.8506, "step": 1600 }, { "epoch": 2.3228828828828827, "grad_norm": 19.38469123840332, "learning_rate": 0.00025594035594035593, "loss": 3.4325, "step": 1610 }, { "epoch": 2.3372972972972974, "grad_norm": 21.55483627319336, "learning_rate": 0.00025545935545935545, "loss": 4.151, "step": 1620 }, { "epoch": 2.3517117117117117, "grad_norm": 24.347623825073242, "learning_rate": 0.000254978354978355, "loss": 4.3691, "step": 1630 }, { "epoch": 2.366126126126126, "grad_norm": 22.3781795501709, "learning_rate": 0.0002544973544973545, "loss": 4.5897, "step": 1640 }, { "epoch": 2.3805405405405407, "grad_norm": 23.88686180114746, "learning_rate": 0.00025401635401635397, "loss": 4.1445, "step": 1650 }, { "epoch": 2.394954954954955, "grad_norm": 22.73502540588379, "learning_rate": 0.00025353535353535354, "loss": 4.7023, "step": 1660 }, { "epoch": 2.409369369369369, "grad_norm": 28.19312286376953, "learning_rate": 0.000253054353054353, "loss": 4.1145, "step": 1670 }, { "epoch": 2.423783783783784, "grad_norm": 18.269119262695312, "learning_rate": 0.00025257335257335253, "loss": 4.2782, "step": 1680 }, { "epoch": 2.438198198198198, "grad_norm": 23.031797409057617, "learning_rate": 0.00025209235209235206, "loss": 4.1351, "step": 1690 }, { "epoch": 2.4526126126126124, "grad_norm": 29.572736740112305, "learning_rate": 0.0002516113516113516, "loss": 3.9022, "step": 1700 }, { "epoch": 2.467027027027027, "grad_norm": 27.48060417175293, "learning_rate": 0.0002511303511303511, "loss": 4.2383, "step": 1710 }, { "epoch": 2.4814414414414414, "grad_norm": 20.07984733581543, "learning_rate": 0.0002506493506493506, "loss": 4.8254, "step": 1720 }, { "epoch": 2.4958558558558557, "grad_norm": 15.536605834960938, "learning_rate": 0.00025016835016835015, "loss": 4.4781, "step": 1730 }, { "epoch": 2.5102702702702704, "grad_norm": 24.318782806396484, "learning_rate": 0.00024968734968734967, "loss": 3.9879, "step": 1740 }, { "epoch": 2.5246846846846847, "grad_norm": 16.27837562561035, "learning_rate": 0.0002492063492063492, "loss": 3.9869, "step": 1750 }, { "epoch": 2.539099099099099, "grad_norm": 17.794788360595703, "learning_rate": 0.0002487253487253487, "loss": 3.9309, "step": 1760 }, { "epoch": 2.5535135135135136, "grad_norm": 21.39970588684082, "learning_rate": 0.00024824434824434824, "loss": 4.3936, "step": 1770 }, { "epoch": 2.567927927927928, "grad_norm": 22.3472957611084, "learning_rate": 0.00024776334776334776, "loss": 4.5431, "step": 1780 }, { "epoch": 2.5823423423423426, "grad_norm": 22.283802032470703, "learning_rate": 0.00024728234728234723, "loss": 3.7322, "step": 1790 }, { "epoch": 2.596756756756757, "grad_norm": 20.59347152709961, "learning_rate": 0.0002468013468013468, "loss": 4.7168, "step": 1800 }, { "epoch": 2.611171171171171, "grad_norm": 21.301950454711914, "learning_rate": 0.0002463203463203463, "loss": 4.2457, "step": 1810 }, { "epoch": 2.6255855855855854, "grad_norm": 24.100994110107422, "learning_rate": 0.0002458393458393458, "loss": 4.0849, "step": 1820 }, { "epoch": 2.64, "grad_norm": 20.029577255249023, "learning_rate": 0.0002453583453583453, "loss": 3.956, "step": 1830 }, { "epoch": 2.6544144144144144, "grad_norm": 18.682430267333984, "learning_rate": 0.00024487734487734484, "loss": 4.0165, "step": 1840 }, { "epoch": 2.668828828828829, "grad_norm": 24.04487419128418, "learning_rate": 0.00024439634439634437, "loss": 4.0105, "step": 1850 }, { "epoch": 2.6832432432432434, "grad_norm": 21.22220802307129, "learning_rate": 0.0002439153439153439, "loss": 3.997, "step": 1860 }, { "epoch": 2.6976576576576576, "grad_norm": 19.668106079101562, "learning_rate": 0.0002434343434343434, "loss": 4.0831, "step": 1870 }, { "epoch": 2.712072072072072, "grad_norm": 30.692045211791992, "learning_rate": 0.00024295334295334293, "loss": 4.0591, "step": 1880 }, { "epoch": 2.7264864864864866, "grad_norm": 22.906898498535156, "learning_rate": 0.00024247234247234246, "loss": 4.5457, "step": 1890 }, { "epoch": 2.740900900900901, "grad_norm": 22.690523147583008, "learning_rate": 0.00024199134199134195, "loss": 3.8756, "step": 1900 }, { "epoch": 2.755315315315315, "grad_norm": 21.029132843017578, "learning_rate": 0.0002415103415103415, "loss": 4.011, "step": 1910 }, { "epoch": 2.76972972972973, "grad_norm": 21.587825775146484, "learning_rate": 0.000241029341029341, "loss": 3.7924, "step": 1920 }, { "epoch": 2.784144144144144, "grad_norm": 22.353364944458008, "learning_rate": 0.00024054834054834052, "loss": 4.3143, "step": 1930 }, { "epoch": 2.7985585585585584, "grad_norm": 21.176376342773438, "learning_rate": 0.00024006734006734004, "loss": 4.6675, "step": 1940 }, { "epoch": 2.812972972972973, "grad_norm": 18.859739303588867, "learning_rate": 0.00023958633958633956, "loss": 4.0779, "step": 1950 }, { "epoch": 2.8273873873873874, "grad_norm": 18.34664535522461, "learning_rate": 0.0002391053391053391, "loss": 4.2849, "step": 1960 }, { "epoch": 2.8418018018018016, "grad_norm": 22.619640350341797, "learning_rate": 0.0002386243386243386, "loss": 3.9383, "step": 1970 }, { "epoch": 2.8562162162162164, "grad_norm": 22.183664321899414, "learning_rate": 0.0002381433381433381, "loss": 3.7888, "step": 1980 }, { "epoch": 2.8706306306306306, "grad_norm": 26.002941131591797, "learning_rate": 0.00023766233766233765, "loss": 3.912, "step": 1990 }, { "epoch": 2.885045045045045, "grad_norm": 27.130271911621094, "learning_rate": 0.00023718133718133715, "loss": 3.9044, "step": 2000 }, { "epoch": 2.8994594594594596, "grad_norm": 21.608003616333008, "learning_rate": 0.00023670033670033667, "loss": 4.2128, "step": 2010 }, { "epoch": 2.913873873873874, "grad_norm": 19.621829986572266, "learning_rate": 0.0002362193362193362, "loss": 3.8509, "step": 2020 }, { "epoch": 2.928288288288288, "grad_norm": 23.38471031188965, "learning_rate": 0.00023573833573833572, "loss": 4.067, "step": 2030 }, { "epoch": 2.942702702702703, "grad_norm": 13.28516674041748, "learning_rate": 0.0002352573352573352, "loss": 4.186, "step": 2040 }, { "epoch": 2.957117117117117, "grad_norm": 18.91407585144043, "learning_rate": 0.00023477633477633476, "loss": 3.7117, "step": 2050 }, { "epoch": 2.9715315315315314, "grad_norm": 18.93157196044922, "learning_rate": 0.00023429533429533426, "loss": 3.8855, "step": 2060 }, { "epoch": 2.985945945945946, "grad_norm": 20.980789184570312, "learning_rate": 0.0002338143338143338, "loss": 3.7871, "step": 2070 }, { "epoch": 2.998918918918919, "eval_accuracy": 0.5473845108695652, "eval_loss": 1.6941322088241577, "eval_runtime": 536.9387, "eval_samples_per_second": 10.966, "eval_steps_per_second": 10.966, "step": 2079 }, { "epoch": 3.0014414414414414, "grad_norm": 28.662826538085938, "learning_rate": 0.0002333333333333333, "loss": 4.0376, "step": 2080 }, { "epoch": 3.0158558558558557, "grad_norm": 13.298629760742188, "learning_rate": 0.00023285233285233283, "loss": 2.4392, "step": 2090 }, { "epoch": 3.0302702702702704, "grad_norm": 20.722625732421875, "learning_rate": 0.00023237133237133238, "loss": 2.5711, "step": 2100 }, { "epoch": 3.0446846846846847, "grad_norm": 18.076677322387695, "learning_rate": 0.00023189033189033187, "loss": 2.4815, "step": 2110 }, { "epoch": 3.059099099099099, "grad_norm": 23.47679328918457, "learning_rate": 0.00023140933140933137, "loss": 2.4175, "step": 2120 }, { "epoch": 3.0735135135135137, "grad_norm": 25.233163833618164, "learning_rate": 0.00023092833092833092, "loss": 2.6018, "step": 2130 }, { "epoch": 3.087927927927928, "grad_norm": 23.916234970092773, "learning_rate": 0.0002304473304473304, "loss": 2.9529, "step": 2140 }, { "epoch": 3.102342342342342, "grad_norm": 20.37197494506836, "learning_rate": 0.00022996632996632994, "loss": 2.2146, "step": 2150 }, { "epoch": 3.116756756756757, "grad_norm": 20.04782485961914, "learning_rate": 0.00022948532948532948, "loss": 2.1764, "step": 2160 }, { "epoch": 3.131171171171171, "grad_norm": 24.065858840942383, "learning_rate": 0.00022900432900432898, "loss": 2.7395, "step": 2170 }, { "epoch": 3.1455855855855854, "grad_norm": 20.15619468688965, "learning_rate": 0.00022852332852332853, "loss": 2.6955, "step": 2180 }, { "epoch": 3.16, "grad_norm": 15.333986282348633, "learning_rate": 0.00022804232804232803, "loss": 2.378, "step": 2190 }, { "epoch": 3.1744144144144144, "grad_norm": 17.780742645263672, "learning_rate": 0.00022756132756132752, "loss": 2.4017, "step": 2200 }, { "epoch": 3.1888288288288287, "grad_norm": 22.119949340820312, "learning_rate": 0.00022708032708032707, "loss": 2.3123, "step": 2210 }, { "epoch": 3.2032432432432434, "grad_norm": 22.979034423828125, "learning_rate": 0.0002265993265993266, "loss": 1.877, "step": 2220 }, { "epoch": 3.2176576576576577, "grad_norm": 21.25425910949707, "learning_rate": 0.0002261183261183261, "loss": 2.3021, "step": 2230 }, { "epoch": 3.232072072072072, "grad_norm": 20.077585220336914, "learning_rate": 0.00022563732563732564, "loss": 2.5026, "step": 2240 }, { "epoch": 3.2464864864864866, "grad_norm": 21.955101013183594, "learning_rate": 0.00022515632515632513, "loss": 2.4518, "step": 2250 }, { "epoch": 3.260900900900901, "grad_norm": 23.3514347076416, "learning_rate": 0.00022467532467532463, "loss": 2.4694, "step": 2260 }, { "epoch": 3.275315315315315, "grad_norm": 11.233248710632324, "learning_rate": 0.00022419432419432418, "loss": 2.2057, "step": 2270 }, { "epoch": 3.28972972972973, "grad_norm": 20.17824363708496, "learning_rate": 0.0002237133237133237, "loss": 2.3982, "step": 2280 }, { "epoch": 3.304144144144144, "grad_norm": 20.694353103637695, "learning_rate": 0.00022323232323232322, "loss": 3.0053, "step": 2290 }, { "epoch": 3.3185585585585584, "grad_norm": 24.36587142944336, "learning_rate": 0.00022275132275132275, "loss": 2.3132, "step": 2300 }, { "epoch": 3.332972972972973, "grad_norm": 18.3751277923584, "learning_rate": 0.00022227032227032224, "loss": 2.2867, "step": 2310 }, { "epoch": 3.3473873873873874, "grad_norm": 19.790868759155273, "learning_rate": 0.0002217893217893218, "loss": 2.7789, "step": 2320 }, { "epoch": 3.3618018018018017, "grad_norm": 24.86772346496582, "learning_rate": 0.0002213083213083213, "loss": 3.0161, "step": 2330 }, { "epoch": 3.3762162162162164, "grad_norm": 21.827804565429688, "learning_rate": 0.0002208273208273208, "loss": 2.546, "step": 2340 }, { "epoch": 3.3906306306306306, "grad_norm": 19.654054641723633, "learning_rate": 0.00022034632034632033, "loss": 2.6371, "step": 2350 }, { "epoch": 3.405045045045045, "grad_norm": 21.734804153442383, "learning_rate": 0.00021986531986531986, "loss": 2.4253, "step": 2360 }, { "epoch": 3.4194594594594596, "grad_norm": 27.88010597229004, "learning_rate": 0.00021938431938431935, "loss": 2.2937, "step": 2370 }, { "epoch": 3.433873873873874, "grad_norm": 22.679140090942383, "learning_rate": 0.0002189033189033189, "loss": 2.6596, "step": 2380 }, { "epoch": 3.448288288288288, "grad_norm": 21.52387809753418, "learning_rate": 0.0002184223184223184, "loss": 2.0818, "step": 2390 }, { "epoch": 3.462702702702703, "grad_norm": 20.006406784057617, "learning_rate": 0.00021794131794131792, "loss": 2.8108, "step": 2400 }, { "epoch": 3.477117117117117, "grad_norm": 19.29098892211914, "learning_rate": 0.00021746031746031744, "loss": 2.3845, "step": 2410 }, { "epoch": 3.4915315315315314, "grad_norm": 16.946989059448242, "learning_rate": 0.00021697931697931696, "loss": 2.5469, "step": 2420 }, { "epoch": 3.505945945945946, "grad_norm": 25.288267135620117, "learning_rate": 0.0002164983164983165, "loss": 2.5397, "step": 2430 }, { "epoch": 3.5203603603603604, "grad_norm": 25.8332462310791, "learning_rate": 0.000216017316017316, "loss": 2.1714, "step": 2440 }, { "epoch": 3.5347747747747746, "grad_norm": 19.762386322021484, "learning_rate": 0.0002155363155363155, "loss": 3.3805, "step": 2450 }, { "epoch": 3.5491891891891894, "grad_norm": 20.7349796295166, "learning_rate": 0.00021505531505531505, "loss": 2.7777, "step": 2460 }, { "epoch": 3.5636036036036036, "grad_norm": 22.35674285888672, "learning_rate": 0.00021457431457431455, "loss": 2.1907, "step": 2470 }, { "epoch": 3.578018018018018, "grad_norm": 21.76331901550293, "learning_rate": 0.00021409331409331407, "loss": 2.7713, "step": 2480 }, { "epoch": 3.5924324324324326, "grad_norm": 20.995986938476562, "learning_rate": 0.0002136123136123136, "loss": 2.6262, "step": 2490 }, { "epoch": 3.606846846846847, "grad_norm": 23.074106216430664, "learning_rate": 0.00021313131313131312, "loss": 2.0651, "step": 2500 }, { "epoch": 3.621261261261261, "grad_norm": 23.654848098754883, "learning_rate": 0.00021265031265031261, "loss": 2.718, "step": 2510 }, { "epoch": 3.6356756756756754, "grad_norm": 25.261152267456055, "learning_rate": 0.00021216931216931216, "loss": 2.6679, "step": 2520 }, { "epoch": 3.65009009009009, "grad_norm": 21.01721954345703, "learning_rate": 0.00021168831168831166, "loss": 2.8435, "step": 2530 }, { "epoch": 3.6645045045045044, "grad_norm": 22.361772537231445, "learning_rate": 0.0002112073112073112, "loss": 2.7907, "step": 2540 }, { "epoch": 3.678918918918919, "grad_norm": 25.23889923095703, "learning_rate": 0.0002107263107263107, "loss": 2.8608, "step": 2550 }, { "epoch": 3.6933333333333334, "grad_norm": 21.43499183654785, "learning_rate": 0.00021024531024531023, "loss": 2.3714, "step": 2560 }, { "epoch": 3.7077477477477476, "grad_norm": 20.24538230895996, "learning_rate": 0.00020976430976430975, "loss": 2.4759, "step": 2570 }, { "epoch": 3.722162162162162, "grad_norm": 22.164335250854492, "learning_rate": 0.00020928330928330927, "loss": 2.8105, "step": 2580 }, { "epoch": 3.7365765765765766, "grad_norm": 25.067033767700195, "learning_rate": 0.00020880230880230877, "loss": 2.3837, "step": 2590 }, { "epoch": 3.750990990990991, "grad_norm": 27.547651290893555, "learning_rate": 0.00020832130832130832, "loss": 2.4441, "step": 2600 }, { "epoch": 3.7654054054054056, "grad_norm": 19.971914291381836, "learning_rate": 0.0002078403078403078, "loss": 2.4194, "step": 2610 }, { "epoch": 3.77981981981982, "grad_norm": 17.411178588867188, "learning_rate": 0.00020735930735930734, "loss": 2.3971, "step": 2620 }, { "epoch": 3.794234234234234, "grad_norm": 31.035659790039062, "learning_rate": 0.00020687830687830686, "loss": 2.6306, "step": 2630 }, { "epoch": 3.8086486486486484, "grad_norm": 26.793031692504883, "learning_rate": 0.00020639730639730638, "loss": 3.0321, "step": 2640 }, { "epoch": 3.823063063063063, "grad_norm": 27.277006149291992, "learning_rate": 0.0002059163059163059, "loss": 2.1434, "step": 2650 }, { "epoch": 3.8374774774774774, "grad_norm": 29.178829193115234, "learning_rate": 0.00020543530543530543, "loss": 2.7848, "step": 2660 }, { "epoch": 3.851891891891892, "grad_norm": 17.34369659423828, "learning_rate": 0.00020495430495430492, "loss": 2.5354, "step": 2670 }, { "epoch": 3.8663063063063063, "grad_norm": 24.41458511352539, "learning_rate": 0.00020447330447330447, "loss": 2.4852, "step": 2680 }, { "epoch": 3.8807207207207206, "grad_norm": 27.604721069335938, "learning_rate": 0.00020399230399230397, "loss": 2.6835, "step": 2690 }, { "epoch": 3.895135135135135, "grad_norm": 19.998043060302734, "learning_rate": 0.0002035113035113035, "loss": 2.2523, "step": 2700 }, { "epoch": 3.9095495495495496, "grad_norm": 26.73026466369629, "learning_rate": 0.000203030303030303, "loss": 3.4174, "step": 2710 }, { "epoch": 3.923963963963964, "grad_norm": 27.696605682373047, "learning_rate": 0.00020254930254930253, "loss": 2.5488, "step": 2720 }, { "epoch": 3.9383783783783786, "grad_norm": 25.43397331237793, "learning_rate": 0.00020206830206830203, "loss": 2.1643, "step": 2730 }, { "epoch": 3.952792792792793, "grad_norm": 18.155502319335938, "learning_rate": 0.00020158730158730158, "loss": 2.2196, "step": 2740 }, { "epoch": 3.967207207207207, "grad_norm": 27.430566787719727, "learning_rate": 0.00020110630110630108, "loss": 2.2681, "step": 2750 }, { "epoch": 3.9816216216216214, "grad_norm": 17.62324333190918, "learning_rate": 0.00020062530062530062, "loss": 2.3872, "step": 2760 }, { "epoch": 3.996036036036036, "grad_norm": 22.322702407836914, "learning_rate": 0.00020014430014430012, "loss": 2.7966, "step": 2770 }, { "epoch": 3.998918918918919, "eval_accuracy": 0.5579144021739131, "eval_loss": 1.8579920530319214, "eval_runtime": 536.9866, "eval_samples_per_second": 10.965, "eval_steps_per_second": 10.965, "step": 2772 }, { "epoch": 4.011531531531531, "grad_norm": 17.038963317871094, "learning_rate": 0.00019971139971139968, "loss": 1.7853, "step": 2780 }, { "epoch": 4.025945945945946, "grad_norm": 21.912731170654297, "learning_rate": 0.00019923039923039923, "loss": 1.4446, "step": 2790 }, { "epoch": 4.04036036036036, "grad_norm": 8.3090238571167, "learning_rate": 0.00019874939874939873, "loss": 1.1382, "step": 2800 }, { "epoch": 4.054774774774775, "grad_norm": 10.985939979553223, "learning_rate": 0.00019826839826839825, "loss": 1.4296, "step": 2810 }, { "epoch": 4.069189189189189, "grad_norm": 14.48794174194336, "learning_rate": 0.00019778739778739777, "loss": 1.3267, "step": 2820 }, { "epoch": 4.083603603603604, "grad_norm": 7.6786789894104, "learning_rate": 0.0001973063973063973, "loss": 1.3823, "step": 2830 }, { "epoch": 4.098018018018018, "grad_norm": 21.3938045501709, "learning_rate": 0.0001968253968253968, "loss": 1.636, "step": 2840 }, { "epoch": 4.112432432432432, "grad_norm": 16.059181213378906, "learning_rate": 0.00019634439634439634, "loss": 1.4253, "step": 2850 }, { "epoch": 4.126846846846846, "grad_norm": 31.663381576538086, "learning_rate": 0.00019586339586339583, "loss": 1.6679, "step": 2860 }, { "epoch": 4.141261261261262, "grad_norm": 28.778202056884766, "learning_rate": 0.00019538239538239536, "loss": 1.7084, "step": 2870 }, { "epoch": 4.155675675675676, "grad_norm": 24.17688751220703, "learning_rate": 0.00019490139490139488, "loss": 1.503, "step": 2880 }, { "epoch": 4.17009009009009, "grad_norm": 18.74388313293457, "learning_rate": 0.0001944203944203944, "loss": 1.4459, "step": 2890 }, { "epoch": 4.184504504504504, "grad_norm": 25.333425521850586, "learning_rate": 0.00019393939393939395, "loss": 1.5935, "step": 2900 }, { "epoch": 4.198918918918919, "grad_norm": 19.402793884277344, "learning_rate": 0.00019345839345839345, "loss": 1.3032, "step": 2910 }, { "epoch": 4.213333333333333, "grad_norm": 11.908445358276367, "learning_rate": 0.00019297739297739294, "loss": 1.4052, "step": 2920 }, { "epoch": 4.227747747747748, "grad_norm": 10.511947631835938, "learning_rate": 0.0001924963924963925, "loss": 1.3532, "step": 2930 }, { "epoch": 4.242162162162162, "grad_norm": 18.962549209594727, "learning_rate": 0.000192015392015392, "loss": 1.4759, "step": 2940 }, { "epoch": 4.256576576576577, "grad_norm": 29.238679885864258, "learning_rate": 0.0001915343915343915, "loss": 1.6444, "step": 2950 }, { "epoch": 4.270990990990991, "grad_norm": 13.944114685058594, "learning_rate": 0.00019105339105339106, "loss": 1.5509, "step": 2960 }, { "epoch": 4.285405405405405, "grad_norm": 17.7829532623291, "learning_rate": 0.00019057239057239056, "loss": 1.4536, "step": 2970 }, { "epoch": 4.299819819819819, "grad_norm": 13.711050033569336, "learning_rate": 0.00019009139009139005, "loss": 1.299, "step": 2980 }, { "epoch": 4.314234234234235, "grad_norm": 24.686168670654297, "learning_rate": 0.0001896103896103896, "loss": 1.3826, "step": 2990 }, { "epoch": 4.328648648648649, "grad_norm": 21.13921546936035, "learning_rate": 0.0001891293891293891, "loss": 1.7036, "step": 3000 }, { "epoch": 4.343063063063063, "grad_norm": 14.596439361572266, "learning_rate": 0.00018864838864838862, "loss": 1.5839, "step": 3010 }, { "epoch": 4.357477477477477, "grad_norm": 22.715736389160156, "learning_rate": 0.00018816738816738817, "loss": 1.5686, "step": 3020 }, { "epoch": 4.371891891891892, "grad_norm": 17.39431381225586, "learning_rate": 0.00018768638768638766, "loss": 1.5422, "step": 3030 }, { "epoch": 4.386306306306306, "grad_norm": 24.868406295776367, "learning_rate": 0.0001872053872053872, "loss": 1.7397, "step": 3040 }, { "epoch": 4.400720720720721, "grad_norm": 26.22691535949707, "learning_rate": 0.0001867243867243867, "loss": 1.4283, "step": 3050 }, { "epoch": 4.415135135135135, "grad_norm": 15.568745613098145, "learning_rate": 0.0001862433862433862, "loss": 1.2897, "step": 3060 }, { "epoch": 4.42954954954955, "grad_norm": 19.749555587768555, "learning_rate": 0.00018576238576238575, "loss": 1.4769, "step": 3070 }, { "epoch": 4.443963963963964, "grad_norm": 29.223718643188477, "learning_rate": 0.00018528138528138528, "loss": 1.3324, "step": 3080 }, { "epoch": 4.458378378378378, "grad_norm": 19.438663482666016, "learning_rate": 0.00018480038480038477, "loss": 1.568, "step": 3090 }, { "epoch": 4.472792792792792, "grad_norm": 10.73144245147705, "learning_rate": 0.00018431938431938432, "loss": 1.1532, "step": 3100 }, { "epoch": 4.487207207207208, "grad_norm": 16.664306640625, "learning_rate": 0.00018383838383838382, "loss": 1.4775, "step": 3110 }, { "epoch": 4.501621621621622, "grad_norm": 25.43704605102539, "learning_rate": 0.0001833573833573833, "loss": 1.3084, "step": 3120 }, { "epoch": 4.516036036036036, "grad_norm": 22.560327529907227, "learning_rate": 0.00018287638287638286, "loss": 1.4541, "step": 3130 }, { "epoch": 4.53045045045045, "grad_norm": 22.581119537353516, "learning_rate": 0.00018239538239538239, "loss": 1.4581, "step": 3140 }, { "epoch": 4.544864864864865, "grad_norm": 19.075603485107422, "learning_rate": 0.0001819143819143819, "loss": 1.3255, "step": 3150 }, { "epoch": 4.559279279279279, "grad_norm": 15.375678062438965, "learning_rate": 0.00018143338143338143, "loss": 1.035, "step": 3160 }, { "epoch": 4.573693693693694, "grad_norm": 30.394746780395508, "learning_rate": 0.00018095238095238093, "loss": 1.7147, "step": 3170 }, { "epoch": 4.588108108108108, "grad_norm": 29.191686630249023, "learning_rate": 0.00018047138047138048, "loss": 1.3125, "step": 3180 }, { "epoch": 4.602522522522523, "grad_norm": 21.012161254882812, "learning_rate": 0.00017999037999037997, "loss": 1.5039, "step": 3190 }, { "epoch": 4.616936936936937, "grad_norm": 17.093364715576172, "learning_rate": 0.0001795093795093795, "loss": 1.4667, "step": 3200 }, { "epoch": 4.631351351351351, "grad_norm": 14.385228157043457, "learning_rate": 0.00017902837902837902, "loss": 1.2575, "step": 3210 }, { "epoch": 4.645765765765765, "grad_norm": 16.330244064331055, "learning_rate": 0.00017854737854737854, "loss": 1.2436, "step": 3220 }, { "epoch": 4.6601801801801805, "grad_norm": 17.112266540527344, "learning_rate": 0.00017806637806637803, "loss": 1.5148, "step": 3230 }, { "epoch": 4.674594594594595, "grad_norm": 25.027666091918945, "learning_rate": 0.00017758537758537758, "loss": 1.6239, "step": 3240 }, { "epoch": 4.689009009009009, "grad_norm": 11.63669490814209, "learning_rate": 0.00017710437710437708, "loss": 1.4982, "step": 3250 }, { "epoch": 4.703423423423423, "grad_norm": 18.43046760559082, "learning_rate": 0.00017662337662337663, "loss": 1.4225, "step": 3260 }, { "epoch": 4.717837837837838, "grad_norm": 17.656518936157227, "learning_rate": 0.00017614237614237613, "loss": 1.4843, "step": 3270 }, { "epoch": 4.732252252252252, "grad_norm": 17.17339324951172, "learning_rate": 0.00017566137566137565, "loss": 1.5321, "step": 3280 }, { "epoch": 4.746666666666667, "grad_norm": 18.681303024291992, "learning_rate": 0.00017518037518037517, "loss": 1.6286, "step": 3290 }, { "epoch": 4.761081081081081, "grad_norm": 22.697771072387695, "learning_rate": 0.0001746993746993747, "loss": 1.4057, "step": 3300 }, { "epoch": 4.775495495495496, "grad_norm": 16.85506248474121, "learning_rate": 0.0001742183742183742, "loss": 1.6464, "step": 3310 }, { "epoch": 4.78990990990991, "grad_norm": 23.760793685913086, "learning_rate": 0.00017373737373737374, "loss": 1.4451, "step": 3320 }, { "epoch": 4.804324324324324, "grad_norm": 19.93245506286621, "learning_rate": 0.00017325637325637323, "loss": 1.821, "step": 3330 }, { "epoch": 4.818738738738738, "grad_norm": 15.235669136047363, "learning_rate": 0.00017277537277537276, "loss": 1.3603, "step": 3340 }, { "epoch": 4.8331531531531535, "grad_norm": 18.125097274780273, "learning_rate": 0.00017229437229437228, "loss": 1.2805, "step": 3350 }, { "epoch": 4.847567567567568, "grad_norm": 19.607587814331055, "learning_rate": 0.0001718133718133718, "loss": 1.7882, "step": 3360 }, { "epoch": 4.861981981981982, "grad_norm": 30.157733917236328, "learning_rate": 0.00017133237133237132, "loss": 1.5676, "step": 3370 }, { "epoch": 4.876396396396396, "grad_norm": 14.961874961853027, "learning_rate": 0.00017085137085137085, "loss": 1.2282, "step": 3380 }, { "epoch": 4.890810810810811, "grad_norm": 29.467988967895508, "learning_rate": 0.00017037037037037034, "loss": 1.6735, "step": 3390 }, { "epoch": 4.905225225225225, "grad_norm": 22.682449340820312, "learning_rate": 0.0001698893698893699, "loss": 1.4523, "step": 3400 }, { "epoch": 4.91963963963964, "grad_norm": 17.40091323852539, "learning_rate": 0.0001694083694083694, "loss": 1.1466, "step": 3410 }, { "epoch": 4.934054054054054, "grad_norm": 24.69778823852539, "learning_rate": 0.0001689273689273689, "loss": 1.2446, "step": 3420 }, { "epoch": 4.9484684684684686, "grad_norm": 14.909017562866211, "learning_rate": 0.00016844636844636843, "loss": 1.5575, "step": 3430 }, { "epoch": 4.962882882882883, "grad_norm": 13.104373931884766, "learning_rate": 0.00016796536796536796, "loss": 1.5514, "step": 3440 }, { "epoch": 4.977297297297297, "grad_norm": 24.999370574951172, "learning_rate": 0.00016748436748436745, "loss": 1.4959, "step": 3450 }, { "epoch": 4.991711711711711, "grad_norm": 29.072294235229492, "learning_rate": 0.000167003367003367, "loss": 1.5871, "step": 3460 }, { "epoch": 4.998918918918919, "eval_accuracy": 0.6139605978260869, "eval_loss": 1.6662975549697876, "eval_runtime": 540.9629, "eval_samples_per_second": 10.884, "eval_steps_per_second": 10.884, "step": 3465 }, { "epoch": 5.007207207207207, "grad_norm": 12.2052640914917, "learning_rate": 0.0001665223665223665, "loss": 0.9848, "step": 3470 }, { "epoch": 5.021621621621621, "grad_norm": 11.040346145629883, "learning_rate": 0.00016604136604136605, "loss": 0.7229, "step": 3480 }, { "epoch": 5.036036036036036, "grad_norm": 11.913896560668945, "learning_rate": 0.00016556036556036554, "loss": 0.5134, "step": 3490 }, { "epoch": 5.050450450450451, "grad_norm": 16.600475311279297, "learning_rate": 0.00016507936507936506, "loss": 0.5581, "step": 3500 }, { "epoch": 5.064864864864865, "grad_norm": 9.584583282470703, "learning_rate": 0.0001645983645983646, "loss": 0.7335, "step": 3510 }, { "epoch": 5.079279279279279, "grad_norm": 15.97603702545166, "learning_rate": 0.0001641173641173641, "loss": 0.9761, "step": 3520 }, { "epoch": 5.093693693693694, "grad_norm": 21.01009178161621, "learning_rate": 0.0001636363636363636, "loss": 0.6637, "step": 3530 }, { "epoch": 5.108108108108108, "grad_norm": 18.944791793823242, "learning_rate": 0.00016315536315536315, "loss": 0.8514, "step": 3540 }, { "epoch": 5.122522522522522, "grad_norm": 15.107224464416504, "learning_rate": 0.00016267436267436265, "loss": 0.7069, "step": 3550 }, { "epoch": 5.136936936936937, "grad_norm": 20.789289474487305, "learning_rate": 0.00016219336219336217, "loss": 0.7369, "step": 3560 }, { "epoch": 5.151351351351352, "grad_norm": 25.02975845336914, "learning_rate": 0.0001617123617123617, "loss": 0.85, "step": 3570 }, { "epoch": 5.165765765765766, "grad_norm": 14.045705795288086, "learning_rate": 0.00016123136123136122, "loss": 1.0056, "step": 3580 }, { "epoch": 5.18018018018018, "grad_norm": 19.27486801147461, "learning_rate": 0.00016075036075036074, "loss": 0.8829, "step": 3590 }, { "epoch": 5.194594594594594, "grad_norm": 16.740869522094727, "learning_rate": 0.00016026936026936026, "loss": 0.7436, "step": 3600 }, { "epoch": 5.209009009009009, "grad_norm": 22.02817153930664, "learning_rate": 0.00015978835978835976, "loss": 0.8404, "step": 3610 }, { "epoch": 5.223423423423424, "grad_norm": 18.062744140625, "learning_rate": 0.0001593073593073593, "loss": 0.9403, "step": 3620 }, { "epoch": 5.237837837837838, "grad_norm": 11.673712730407715, "learning_rate": 0.0001588263588263588, "loss": 0.8351, "step": 3630 }, { "epoch": 5.252252252252252, "grad_norm": 13.337545394897461, "learning_rate": 0.00015834535834535833, "loss": 0.6274, "step": 3640 }, { "epoch": 5.266666666666667, "grad_norm": 19.310646057128906, "learning_rate": 0.00015786435786435785, "loss": 0.969, "step": 3650 }, { "epoch": 5.281081081081081, "grad_norm": 19.875566482543945, "learning_rate": 0.00015738335738335737, "loss": 0.6036, "step": 3660 }, { "epoch": 5.295495495495495, "grad_norm": 15.952252388000488, "learning_rate": 0.00015690235690235687, "loss": 0.6879, "step": 3670 }, { "epoch": 5.30990990990991, "grad_norm": 17.611326217651367, "learning_rate": 0.00015642135642135642, "loss": 0.5589, "step": 3680 }, { "epoch": 5.324324324324325, "grad_norm": 19.946884155273438, "learning_rate": 0.0001559403559403559, "loss": 0.7953, "step": 3690 }, { "epoch": 5.338738738738739, "grad_norm": 11.897385597229004, "learning_rate": 0.00015545935545935546, "loss": 0.5896, "step": 3700 }, { "epoch": 5.353153153153153, "grad_norm": 15.592938423156738, "learning_rate": 0.00015497835497835496, "loss": 1.1955, "step": 3710 }, { "epoch": 5.367567567567567, "grad_norm": 15.585307121276855, "learning_rate": 0.00015449735449735448, "loss": 1.0289, "step": 3720 }, { "epoch": 5.381981981981982, "grad_norm": 14.25250244140625, "learning_rate": 0.000154016354016354, "loss": 0.5986, "step": 3730 }, { "epoch": 5.396396396396397, "grad_norm": 23.96398162841797, "learning_rate": 0.00015353535353535353, "loss": 0.7085, "step": 3740 }, { "epoch": 5.410810810810811, "grad_norm": 23.628772735595703, "learning_rate": 0.00015305435305435302, "loss": 0.826, "step": 3750 }, { "epoch": 5.425225225225225, "grad_norm": 17.359643936157227, "learning_rate": 0.00015257335257335257, "loss": 0.7858, "step": 3760 }, { "epoch": 5.43963963963964, "grad_norm": 22.010915756225586, "learning_rate": 0.00015209235209235207, "loss": 0.7688, "step": 3770 }, { "epoch": 5.454054054054054, "grad_norm": 28.990123748779297, "learning_rate": 0.0001516113516113516, "loss": 0.7106, "step": 3780 }, { "epoch": 5.468468468468468, "grad_norm": 11.545175552368164, "learning_rate": 0.0001511303511303511, "loss": 0.9866, "step": 3790 }, { "epoch": 5.482882882882883, "grad_norm": 25.446990966796875, "learning_rate": 0.00015064935064935063, "loss": 0.9894, "step": 3800 }, { "epoch": 5.4972972972972975, "grad_norm": 28.915557861328125, "learning_rate": 0.00015016835016835018, "loss": 0.8584, "step": 3810 }, { "epoch": 5.511711711711712, "grad_norm": 19.692970275878906, "learning_rate": 0.00014968734968734968, "loss": 0.6045, "step": 3820 }, { "epoch": 5.526126126126126, "grad_norm": 25.059045791625977, "learning_rate": 0.00014920634920634917, "loss": 1.1067, "step": 3830 }, { "epoch": 5.54054054054054, "grad_norm": 13.645286560058594, "learning_rate": 0.0001487253487253487, "loss": 0.7451, "step": 3840 }, { "epoch": 5.554954954954955, "grad_norm": 22.43482780456543, "learning_rate": 0.00014824434824434822, "loss": 0.8842, "step": 3850 }, { "epoch": 5.569369369369369, "grad_norm": 11.246109008789062, "learning_rate": 0.00014776334776334774, "loss": 0.629, "step": 3860 }, { "epoch": 5.583783783783784, "grad_norm": 21.903657913208008, "learning_rate": 0.00014728234728234727, "loss": 0.9014, "step": 3870 }, { "epoch": 5.598198198198198, "grad_norm": 9.34262752532959, "learning_rate": 0.0001468013468013468, "loss": 0.8017, "step": 3880 }, { "epoch": 5.612612612612613, "grad_norm": 28.314603805541992, "learning_rate": 0.0001463203463203463, "loss": 0.8316, "step": 3890 }, { "epoch": 5.627027027027027, "grad_norm": 23.812631607055664, "learning_rate": 0.00014583934583934583, "loss": 1.1573, "step": 3900 }, { "epoch": 5.641441441441441, "grad_norm": 19.350114822387695, "learning_rate": 0.00014535834535834533, "loss": 0.6841, "step": 3910 }, { "epoch": 5.655855855855856, "grad_norm": 36.78022766113281, "learning_rate": 0.00014487734487734485, "loss": 0.8235, "step": 3920 }, { "epoch": 5.6702702702702705, "grad_norm": 14.95051097869873, "learning_rate": 0.0001443963443963444, "loss": 0.6835, "step": 3930 }, { "epoch": 5.684684684684685, "grad_norm": 11.998274803161621, "learning_rate": 0.0001439153439153439, "loss": 0.9942, "step": 3940 }, { "epoch": 5.699099099099099, "grad_norm": 19.465404510498047, "learning_rate": 0.00014343434343434342, "loss": 0.9386, "step": 3950 }, { "epoch": 5.713513513513513, "grad_norm": 15.735244750976562, "learning_rate": 0.00014295334295334294, "loss": 0.8174, "step": 3960 }, { "epoch": 5.727927927927928, "grad_norm": 24.03779411315918, "learning_rate": 0.00014247234247234246, "loss": 1.0849, "step": 3970 }, { "epoch": 5.742342342342342, "grad_norm": 12.98159408569336, "learning_rate": 0.00014199134199134196, "loss": 0.6748, "step": 3980 }, { "epoch": 5.756756756756757, "grad_norm": 13.99123477935791, "learning_rate": 0.0001415103415103415, "loss": 0.6744, "step": 3990 }, { "epoch": 5.771171171171171, "grad_norm": 24.469266891479492, "learning_rate": 0.00014102934102934103, "loss": 0.6449, "step": 4000 }, { "epoch": 5.7855855855855856, "grad_norm": 28.23906898498535, "learning_rate": 0.00014054834054834055, "loss": 0.757, "step": 4010 }, { "epoch": 5.8, "grad_norm": 18.971261978149414, "learning_rate": 0.00014006734006734005, "loss": 0.7486, "step": 4020 }, { "epoch": 5.814414414414414, "grad_norm": 19.77442169189453, "learning_rate": 0.00013958633958633957, "loss": 0.8439, "step": 4030 }, { "epoch": 5.828828828828829, "grad_norm": 19.546371459960938, "learning_rate": 0.0001391053391053391, "loss": 0.8859, "step": 4040 }, { "epoch": 5.8432432432432435, "grad_norm": 12.447526931762695, "learning_rate": 0.0001386243386243386, "loss": 0.6841, "step": 4050 }, { "epoch": 5.857657657657658, "grad_norm": 18.02086639404297, "learning_rate": 0.00013814333814333814, "loss": 0.8155, "step": 4060 }, { "epoch": 5.872072072072072, "grad_norm": 23.19020652770996, "learning_rate": 0.00013766233766233766, "loss": 0.8727, "step": 4070 }, { "epoch": 5.886486486486486, "grad_norm": 9.812922477722168, "learning_rate": 0.00013718133718133719, "loss": 0.8107, "step": 4080 }, { "epoch": 5.900900900900901, "grad_norm": 18.993051528930664, "learning_rate": 0.00013670033670033668, "loss": 0.6686, "step": 4090 }, { "epoch": 5.915315315315315, "grad_norm": 24.841590881347656, "learning_rate": 0.0001362193362193362, "loss": 0.8777, "step": 4100 }, { "epoch": 5.92972972972973, "grad_norm": 12.165318489074707, "learning_rate": 0.00013573833573833573, "loss": 0.7149, "step": 4110 }, { "epoch": 5.944144144144144, "grad_norm": 25.776872634887695, "learning_rate": 0.00013525733525733525, "loss": 0.9527, "step": 4120 }, { "epoch": 5.9585585585585585, "grad_norm": 15.240096092224121, "learning_rate": 0.00013477633477633477, "loss": 0.7363, "step": 4130 }, { "epoch": 5.972972972972973, "grad_norm": 18.949817657470703, "learning_rate": 0.0001342953342953343, "loss": 0.8795, "step": 4140 }, { "epoch": 5.987387387387387, "grad_norm": 23.45053482055664, "learning_rate": 0.00013381433381433382, "loss": 0.7355, "step": 4150 }, { "epoch": 5.998918918918919, "eval_accuracy": 0.6154891304347826, "eval_loss": 1.9490801095962524, "eval_runtime": 540.4624, "eval_samples_per_second": 10.894, "eval_steps_per_second": 10.894, "step": 4158 }, { "epoch": 6.002882882882883, "grad_norm": 19.96414566040039, "learning_rate": 0.0001333333333333333, "loss": 0.7705, "step": 4160 }, { "epoch": 6.017297297297297, "grad_norm": 12.935175895690918, "learning_rate": 0.00013285233285233284, "loss": 0.4507, "step": 4170 }, { "epoch": 6.031711711711711, "grad_norm": 18.57610511779785, "learning_rate": 0.00013237133237133236, "loss": 0.4772, "step": 4180 }, { "epoch": 6.0461261261261265, "grad_norm": 18.15093231201172, "learning_rate": 0.00013189033189033188, "loss": 0.4697, "step": 4190 }, { "epoch": 6.060540540540541, "grad_norm": 9.7061128616333, "learning_rate": 0.0001314093314093314, "loss": 0.3953, "step": 4200 }, { "epoch": 6.074954954954955, "grad_norm": 14.228235244750977, "learning_rate": 0.00013092833092833093, "loss": 0.4857, "step": 4210 }, { "epoch": 6.089369369369369, "grad_norm": 12.73335075378418, "learning_rate": 0.00013044733044733045, "loss": 0.2774, "step": 4220 }, { "epoch": 6.103783783783784, "grad_norm": 26.926279067993164, "learning_rate": 0.00012996632996632997, "loss": 0.4033, "step": 4230 }, { "epoch": 6.118198198198198, "grad_norm": 5.05507755279541, "learning_rate": 0.00012948532948532947, "loss": 0.379, "step": 4240 }, { "epoch": 6.132612612612613, "grad_norm": 13.0632905960083, "learning_rate": 0.000129004329004329, "loss": 0.5064, "step": 4250 }, { "epoch": 6.147027027027027, "grad_norm": 9.610346794128418, "learning_rate": 0.0001285233285233285, "loss": 0.5576, "step": 4260 }, { "epoch": 6.161441441441442, "grad_norm": 9.474533081054688, "learning_rate": 0.00012804232804232803, "loss": 0.4405, "step": 4270 }, { "epoch": 6.175855855855856, "grad_norm": 6.424566745758057, "learning_rate": 0.00012756132756132756, "loss": 0.4283, "step": 4280 }, { "epoch": 6.19027027027027, "grad_norm": 22.856693267822266, "learning_rate": 0.00012708032708032708, "loss": 0.5386, "step": 4290 }, { "epoch": 6.204684684684684, "grad_norm": 14.695728302001953, "learning_rate": 0.0001265993265993266, "loss": 0.4684, "step": 4300 }, { "epoch": 6.2190990990990995, "grad_norm": 12.434320449829102, "learning_rate": 0.0001261183261183261, "loss": 0.3499, "step": 4310 }, { "epoch": 6.233513513513514, "grad_norm": 3.9371864795684814, "learning_rate": 0.00012563732563732562, "loss": 0.4161, "step": 4320 }, { "epoch": 6.247927927927928, "grad_norm": 11.733071327209473, "learning_rate": 0.00012515632515632514, "loss": 0.4829, "step": 4330 }, { "epoch": 6.262342342342342, "grad_norm": 5.837855815887451, "learning_rate": 0.00012467532467532467, "loss": 0.5473, "step": 4340 }, { "epoch": 6.276756756756757, "grad_norm": 10.520476341247559, "learning_rate": 0.0001241943241943242, "loss": 0.432, "step": 4350 }, { "epoch": 6.291171171171171, "grad_norm": 14.354527473449707, "learning_rate": 0.0001237133237133237, "loss": 0.3837, "step": 4360 }, { "epoch": 6.305585585585586, "grad_norm": 24.440963745117188, "learning_rate": 0.00012323232323232323, "loss": 0.6812, "step": 4370 }, { "epoch": 6.32, "grad_norm": 21.688756942749023, "learning_rate": 0.00012275132275132273, "loss": 0.6889, "step": 4380 }, { "epoch": 6.3344144144144146, "grad_norm": 4.70493221282959, "learning_rate": 0.00012227032227032225, "loss": 0.4692, "step": 4390 }, { "epoch": 6.348828828828829, "grad_norm": 10.504195213317871, "learning_rate": 0.00012178932178932179, "loss": 0.3945, "step": 4400 }, { "epoch": 6.363243243243243, "grad_norm": 12.554998397827148, "learning_rate": 0.00012130832130832131, "loss": 0.4145, "step": 4410 }, { "epoch": 6.377657657657657, "grad_norm": 5.851123809814453, "learning_rate": 0.0001208273208273208, "loss": 0.3595, "step": 4420 }, { "epoch": 6.392072072072072, "grad_norm": 33.16427993774414, "learning_rate": 0.00012034632034632034, "loss": 0.5448, "step": 4430 }, { "epoch": 6.406486486486487, "grad_norm": 17.474634170532227, "learning_rate": 0.00011986531986531986, "loss": 0.4775, "step": 4440 }, { "epoch": 6.420900900900901, "grad_norm": 21.54201889038086, "learning_rate": 0.00011938431938431936, "loss": 0.4061, "step": 4450 }, { "epoch": 6.435315315315315, "grad_norm": 27.28333854675293, "learning_rate": 0.00011890331890331888, "loss": 0.41, "step": 4460 }, { "epoch": 6.44972972972973, "grad_norm": 31.519390106201172, "learning_rate": 0.00011842231842231842, "loss": 0.4323, "step": 4470 }, { "epoch": 6.464144144144144, "grad_norm": 18.609390258789062, "learning_rate": 0.00011794131794131794, "loss": 0.323, "step": 4480 }, { "epoch": 6.478558558558559, "grad_norm": 16.234210968017578, "learning_rate": 0.00011746031746031744, "loss": 0.3677, "step": 4490 }, { "epoch": 6.492972972972973, "grad_norm": 18.266056060791016, "learning_rate": 0.00011697931697931697, "loss": 0.4261, "step": 4500 }, { "epoch": 6.5073873873873875, "grad_norm": 13.765610694885254, "learning_rate": 0.0001164983164983165, "loss": 0.2749, "step": 4510 }, { "epoch": 6.521801801801802, "grad_norm": 19.466411590576172, "learning_rate": 0.00011601731601731602, "loss": 0.5191, "step": 4520 }, { "epoch": 6.536216216216216, "grad_norm": 5.606191635131836, "learning_rate": 0.00011553631553631553, "loss": 0.2674, "step": 4530 }, { "epoch": 6.55063063063063, "grad_norm": 21.999649047851562, "learning_rate": 0.00011505531505531505, "loss": 0.3778, "step": 4540 }, { "epoch": 6.565045045045045, "grad_norm": 5.735301494598389, "learning_rate": 0.00011457431457431457, "loss": 0.5567, "step": 4550 }, { "epoch": 6.57945945945946, "grad_norm": 10.661727905273438, "learning_rate": 0.00011409331409331408, "loss": 0.319, "step": 4560 }, { "epoch": 6.593873873873874, "grad_norm": 23.01692771911621, "learning_rate": 0.0001136123136123136, "loss": 0.4116, "step": 4570 }, { "epoch": 6.608288288288288, "grad_norm": 11.15292739868164, "learning_rate": 0.00011313131313131313, "loss": 0.395, "step": 4580 }, { "epoch": 6.622702702702703, "grad_norm": 15.197105407714844, "learning_rate": 0.00011265031265031265, "loss": 0.5435, "step": 4590 }, { "epoch": 6.637117117117117, "grad_norm": 23.04345703125, "learning_rate": 0.00011216931216931216, "loss": 0.4702, "step": 4600 }, { "epoch": 6.651531531531532, "grad_norm": 8.85188102722168, "learning_rate": 0.00011168831168831168, "loss": 0.3533, "step": 4610 }, { "epoch": 6.665945945945946, "grad_norm": 9.123584747314453, "learning_rate": 0.0001112073112073112, "loss": 0.4277, "step": 4620 }, { "epoch": 6.6803603603603605, "grad_norm": 8.331842422485352, "learning_rate": 0.00011072631072631073, "loss": 0.5292, "step": 4630 }, { "epoch": 6.694774774774775, "grad_norm": 12.688973426818848, "learning_rate": 0.00011024531024531024, "loss": 0.3495, "step": 4640 }, { "epoch": 6.709189189189189, "grad_norm": 22.717866897583008, "learning_rate": 0.00010976430976430976, "loss": 0.4317, "step": 4650 }, { "epoch": 6.723603603603603, "grad_norm": 22.28693962097168, "learning_rate": 0.00010928330928330928, "loss": 0.5334, "step": 4660 }, { "epoch": 6.738018018018018, "grad_norm": 18.496274948120117, "learning_rate": 0.00010880230880230879, "loss": 0.4481, "step": 4670 }, { "epoch": 6.752432432432433, "grad_norm": 22.91065216064453, "learning_rate": 0.00010832130832130831, "loss": 0.3546, "step": 4680 }, { "epoch": 6.766846846846847, "grad_norm": 24.638437271118164, "learning_rate": 0.00010784030784030783, "loss": 0.6028, "step": 4690 }, { "epoch": 6.781261261261261, "grad_norm": 12.158951759338379, "learning_rate": 0.00010735930735930736, "loss": 0.3595, "step": 4700 }, { "epoch": 6.7956756756756755, "grad_norm": 3.462782144546509, "learning_rate": 0.00010687830687830687, "loss": 0.3434, "step": 4710 }, { "epoch": 6.81009009009009, "grad_norm": 14.709941864013672, "learning_rate": 0.00010639730639730639, "loss": 0.3708, "step": 4720 }, { "epoch": 6.824504504504505, "grad_norm": 2.6258020401000977, "learning_rate": 0.00010591630591630591, "loss": 0.3561, "step": 4730 }, { "epoch": 6.838918918918919, "grad_norm": 4.584090709686279, "learning_rate": 0.00010543530543530543, "loss": 0.4685, "step": 4740 }, { "epoch": 6.8533333333333335, "grad_norm": 27.684444427490234, "learning_rate": 0.00010495430495430494, "loss": 0.2848, "step": 4750 }, { "epoch": 6.867747747747748, "grad_norm": 5.796729564666748, "learning_rate": 0.00010447330447330447, "loss": 0.3553, "step": 4760 }, { "epoch": 6.882162162162162, "grad_norm": 4.9681396484375, "learning_rate": 0.00010399230399230399, "loss": 0.3048, "step": 4770 }, { "epoch": 6.896576576576576, "grad_norm": 22.89188575744629, "learning_rate": 0.0001035113035113035, "loss": 0.6352, "step": 4780 }, { "epoch": 6.910990990990991, "grad_norm": 2.380059003829956, "learning_rate": 0.00010303030303030302, "loss": 0.4462, "step": 4790 }, { "epoch": 6.925405405405406, "grad_norm": 13.61782455444336, "learning_rate": 0.00010254930254930254, "loss": 0.4329, "step": 4800 }, { "epoch": 6.93981981981982, "grad_norm": 6.834221839904785, "learning_rate": 0.00010206830206830207, "loss": 0.2754, "step": 4810 }, { "epoch": 6.954234234234234, "grad_norm": 1.0478729009628296, "learning_rate": 0.00010158730158730157, "loss": 0.221, "step": 4820 }, { "epoch": 6.9686486486486485, "grad_norm": 8.622994422912598, "learning_rate": 0.0001011063011063011, "loss": 0.2593, "step": 4830 }, { "epoch": 6.983063063063063, "grad_norm": 22.14352035522461, "learning_rate": 0.00010062530062530062, "loss": 0.3164, "step": 4840 }, { "epoch": 6.997477477477478, "grad_norm": 8.023240089416504, "learning_rate": 0.00010014430014430014, "loss": 0.4492, "step": 4850 }, { "epoch": 6.998918918918919, "eval_accuracy": 0.6379076086956522, "eval_loss": 2.059363842010498, "eval_runtime": 537.7178, "eval_samples_per_second": 10.95, "eval_steps_per_second": 10.95, "step": 4851 }, { "epoch": 7.012972972972973, "grad_norm": 14.681108474731445, "learning_rate": 9.966329966329965e-05, "loss": 0.2425, "step": 4860 }, { "epoch": 7.027387387387387, "grad_norm": 25.905927658081055, "learning_rate": 9.918229918229917e-05, "loss": 0.2949, "step": 4870 }, { "epoch": 7.041801801801801, "grad_norm": 2.836951971054077, "learning_rate": 9.87012987012987e-05, "loss": 0.1989, "step": 4880 }, { "epoch": 7.0562162162162165, "grad_norm": 1.04839026927948, "learning_rate": 9.82202982202982e-05, "loss": 0.1024, "step": 4890 }, { "epoch": 7.070630630630631, "grad_norm": 10.27518367767334, "learning_rate": 9.773929773929773e-05, "loss": 0.1522, "step": 4900 }, { "epoch": 7.085045045045045, "grad_norm": 15.933104515075684, "learning_rate": 9.725829725829725e-05, "loss": 0.145, "step": 4910 }, { "epoch": 7.099459459459459, "grad_norm": 18.11174201965332, "learning_rate": 9.677729677729677e-05, "loss": 0.1838, "step": 4920 }, { "epoch": 7.113873873873874, "grad_norm": 1.1443898677825928, "learning_rate": 9.629629629629628e-05, "loss": 0.1418, "step": 4930 }, { "epoch": 7.128288288288288, "grad_norm": 15.602287292480469, "learning_rate": 9.58152958152958e-05, "loss": 0.3214, "step": 4940 }, { "epoch": 7.142702702702703, "grad_norm": 16.450904846191406, "learning_rate": 9.533429533429533e-05, "loss": 0.1656, "step": 4950 }, { "epoch": 7.157117117117117, "grad_norm": 14.295945167541504, "learning_rate": 9.485329485329484e-05, "loss": 0.3092, "step": 4960 }, { "epoch": 7.1715315315315316, "grad_norm": 3.2762200832366943, "learning_rate": 9.437229437229436e-05, "loss": 0.0993, "step": 4970 }, { "epoch": 7.185945945945946, "grad_norm": 1.229925274848938, "learning_rate": 9.389129389129388e-05, "loss": 0.1636, "step": 4980 }, { "epoch": 7.20036036036036, "grad_norm": 8.866992950439453, "learning_rate": 9.34102934102934e-05, "loss": 0.1434, "step": 4990 }, { "epoch": 7.214774774774774, "grad_norm": 6.15886116027832, "learning_rate": 9.292929292929291e-05, "loss": 0.1759, "step": 5000 }, { "epoch": 7.2291891891891895, "grad_norm": 6.583317279815674, "learning_rate": 9.244829244829244e-05, "loss": 0.1752, "step": 5010 }, { "epoch": 7.243603603603604, "grad_norm": 13.805874824523926, "learning_rate": 9.196729196729196e-05, "loss": 0.1778, "step": 5020 }, { "epoch": 7.258018018018018, "grad_norm": 4.149932861328125, "learning_rate": 9.148629148629148e-05, "loss": 0.3115, "step": 5030 }, { "epoch": 7.272432432432432, "grad_norm": 13.87183666229248, "learning_rate": 9.100529100529099e-05, "loss": 0.1509, "step": 5040 }, { "epoch": 7.286846846846847, "grad_norm": 8.47652530670166, "learning_rate": 9.052429052429051e-05, "loss": 0.3549, "step": 5050 }, { "epoch": 7.301261261261261, "grad_norm": 9.171941757202148, "learning_rate": 9.004329004329004e-05, "loss": 0.1054, "step": 5060 }, { "epoch": 7.315675675675676, "grad_norm": 9.501484870910645, "learning_rate": 8.956228956228955e-05, "loss": 0.1728, "step": 5070 }, { "epoch": 7.33009009009009, "grad_norm": 0.5740847587585449, "learning_rate": 8.908128908128907e-05, "loss": 0.116, "step": 5080 }, { "epoch": 7.3445045045045045, "grad_norm": 2.0156924724578857, "learning_rate": 8.860028860028859e-05, "loss": 0.1889, "step": 5090 }, { "epoch": 7.358918918918919, "grad_norm": 4.784016132354736, "learning_rate": 8.811928811928811e-05, "loss": 0.2124, "step": 5100 }, { "epoch": 7.373333333333333, "grad_norm": 2.135333299636841, "learning_rate": 8.763828763828762e-05, "loss": 0.1885, "step": 5110 }, { "epoch": 7.387747747747747, "grad_norm": 13.758618354797363, "learning_rate": 8.715728715728714e-05, "loss": 0.2869, "step": 5120 }, { "epoch": 7.4021621621621625, "grad_norm": 10.508682250976562, "learning_rate": 8.667628667628667e-05, "loss": 0.09, "step": 5130 }, { "epoch": 7.416576576576577, "grad_norm": 8.677715301513672, "learning_rate": 8.619528619528619e-05, "loss": 0.1022, "step": 5140 }, { "epoch": 7.430990990990991, "grad_norm": 7.379012584686279, "learning_rate": 8.57142857142857e-05, "loss": 0.2095, "step": 5150 }, { "epoch": 7.445405405405405, "grad_norm": 16.449451446533203, "learning_rate": 8.523328523328522e-05, "loss": 0.1052, "step": 5160 }, { "epoch": 7.45981981981982, "grad_norm": 14.736000061035156, "learning_rate": 8.475228475228474e-05, "loss": 0.2009, "step": 5170 }, { "epoch": 7.474234234234234, "grad_norm": 3.677145004272461, "learning_rate": 8.427128427128425e-05, "loss": 0.1472, "step": 5180 }, { "epoch": 7.488648648648649, "grad_norm": 0.6532973051071167, "learning_rate": 8.379028379028378e-05, "loss": 0.1467, "step": 5190 }, { "epoch": 7.503063063063063, "grad_norm": 14.072589874267578, "learning_rate": 8.33092833092833e-05, "loss": 0.1727, "step": 5200 }, { "epoch": 7.5174774774774775, "grad_norm": 15.414175033569336, "learning_rate": 8.282828282828282e-05, "loss": 0.1885, "step": 5210 }, { "epoch": 7.531891891891892, "grad_norm": 2.108407735824585, "learning_rate": 8.234728234728233e-05, "loss": 0.1228, "step": 5220 }, { "epoch": 7.546306306306306, "grad_norm": 13.167756080627441, "learning_rate": 8.186628186628185e-05, "loss": 0.1511, "step": 5230 }, { "epoch": 7.56072072072072, "grad_norm": 12.300124168395996, "learning_rate": 8.138528138528138e-05, "loss": 0.1712, "step": 5240 }, { "epoch": 7.5751351351351355, "grad_norm": 4.797776222229004, "learning_rate": 8.09042809042809e-05, "loss": 0.1385, "step": 5250 }, { "epoch": 7.58954954954955, "grad_norm": 9.989211082458496, "learning_rate": 8.042328042328041e-05, "loss": 0.2256, "step": 5260 }, { "epoch": 7.603963963963964, "grad_norm": 21.55989646911621, "learning_rate": 7.994227994227993e-05, "loss": 0.2175, "step": 5270 }, { "epoch": 7.618378378378378, "grad_norm": 12.825868606567383, "learning_rate": 7.946127946127945e-05, "loss": 0.1561, "step": 5280 }, { "epoch": 7.6327927927927925, "grad_norm": 5.119826793670654, "learning_rate": 7.902837902837901e-05, "loss": 0.1237, "step": 5290 }, { "epoch": 7.647207207207208, "grad_norm": 8.325628280639648, "learning_rate": 7.854737854737855e-05, "loss": 0.3462, "step": 5300 }, { "epoch": 7.661621621621622, "grad_norm": 8.451800346374512, "learning_rate": 7.806637806637807e-05, "loss": 0.2437, "step": 5310 }, { "epoch": 7.676036036036036, "grad_norm": 9.6069974899292, "learning_rate": 7.758537758537757e-05, "loss": 0.1846, "step": 5320 }, { "epoch": 7.6904504504504505, "grad_norm": 14.663230895996094, "learning_rate": 7.71043771043771e-05, "loss": 0.2186, "step": 5330 }, { "epoch": 7.704864864864865, "grad_norm": 16.57319450378418, "learning_rate": 7.662337662337662e-05, "loss": 0.1133, "step": 5340 }, { "epoch": 7.719279279279279, "grad_norm": 10.028879165649414, "learning_rate": 7.614237614237615e-05, "loss": 0.1361, "step": 5350 }, { "epoch": 7.733693693693693, "grad_norm": 17.944252014160156, "learning_rate": 7.566137566137566e-05, "loss": 0.2533, "step": 5360 }, { "epoch": 7.7481081081081085, "grad_norm": 4.871366500854492, "learning_rate": 7.518037518037518e-05, "loss": 0.1396, "step": 5370 }, { "epoch": 7.762522522522523, "grad_norm": 5.787502765655518, "learning_rate": 7.469937469937469e-05, "loss": 0.3421, "step": 5380 }, { "epoch": 7.776936936936937, "grad_norm": 20.75065040588379, "learning_rate": 7.421837421837421e-05, "loss": 0.1679, "step": 5390 }, { "epoch": 7.791351351351351, "grad_norm": 16.226171493530273, "learning_rate": 7.373737373737373e-05, "loss": 0.2005, "step": 5400 }, { "epoch": 7.8057657657657655, "grad_norm": 1.3808518648147583, "learning_rate": 7.325637325637326e-05, "loss": 0.2236, "step": 5410 }, { "epoch": 7.82018018018018, "grad_norm": 5.49656343460083, "learning_rate": 7.277537277537277e-05, "loss": 0.2159, "step": 5420 }, { "epoch": 7.834594594594595, "grad_norm": 4.51519250869751, "learning_rate": 7.229437229437229e-05, "loss": 0.1601, "step": 5430 }, { "epoch": 7.849009009009009, "grad_norm": 3.9731264114379883, "learning_rate": 7.181337181337181e-05, "loss": 0.2402, "step": 5440 }, { "epoch": 7.8634234234234235, "grad_norm": 1.414002776145935, "learning_rate": 7.133237133237133e-05, "loss": 0.1709, "step": 5450 }, { "epoch": 7.877837837837838, "grad_norm": 3.847299575805664, "learning_rate": 7.085137085137084e-05, "loss": 0.2866, "step": 5460 }, { "epoch": 7.892252252252252, "grad_norm": 16.216571807861328, "learning_rate": 7.037037037037036e-05, "loss": 0.1026, "step": 5470 }, { "epoch": 7.906666666666666, "grad_norm": 1.87873113155365, "learning_rate": 6.988936988936989e-05, "loss": 0.1027, "step": 5480 }, { "epoch": 7.921081081081081, "grad_norm": 11.856677055358887, "learning_rate": 6.94083694083694e-05, "loss": 0.0807, "step": 5490 }, { "epoch": 7.935495495495496, "grad_norm": 1.2753289937973022, "learning_rate": 6.892736892736892e-05, "loss": 0.1885, "step": 5500 }, { "epoch": 7.94990990990991, "grad_norm": 5.382585048675537, "learning_rate": 6.844636844636844e-05, "loss": 0.1034, "step": 5510 }, { "epoch": 7.964324324324324, "grad_norm": 4.376471996307373, "learning_rate": 6.796536796536796e-05, "loss": 0.1051, "step": 5520 }, { "epoch": 7.9787387387387385, "grad_norm": 6.501208782196045, "learning_rate": 6.748436748436747e-05, "loss": 0.1589, "step": 5530 }, { "epoch": 7.993153153153153, "grad_norm": 7.671748161315918, "learning_rate": 6.7003367003367e-05, "loss": 0.1528, "step": 5540 }, { "epoch": 7.998918918918919, "eval_accuracy": 0.6402853260869565, "eval_loss": 2.1739323139190674, "eval_runtime": 537.1422, "eval_samples_per_second": 10.962, "eval_steps_per_second": 10.962, "step": 5544 }, { "epoch": 8.008648648648649, "grad_norm": 0.7333820462226868, "learning_rate": 6.652236652236652e-05, "loss": 0.1737, "step": 5550 }, { "epoch": 8.023063063063063, "grad_norm": 1.1993273496627808, "learning_rate": 6.604136604136604e-05, "loss": 0.0923, "step": 5560 }, { "epoch": 8.037477477477477, "grad_norm": 18.680021286010742, "learning_rate": 6.556036556036555e-05, "loss": 0.1005, "step": 5570 }, { "epoch": 8.051891891891891, "grad_norm": 19.182872772216797, "learning_rate": 6.507936507936507e-05, "loss": 0.1297, "step": 5580 }, { "epoch": 8.066306306306306, "grad_norm": 2.575910806655884, "learning_rate": 6.45983645983646e-05, "loss": 0.049, "step": 5590 }, { "epoch": 8.08072072072072, "grad_norm": 1.0843993425369263, "learning_rate": 6.41173641173641e-05, "loss": 0.0646, "step": 5600 }, { "epoch": 8.095135135135136, "grad_norm": 0.35826346278190613, "learning_rate": 6.363636363636363e-05, "loss": 0.0356, "step": 5610 }, { "epoch": 8.10954954954955, "grad_norm": 1.4210469722747803, "learning_rate": 6.315536315536315e-05, "loss": 0.0329, "step": 5620 }, { "epoch": 8.123963963963964, "grad_norm": 8.666502952575684, "learning_rate": 6.267436267436267e-05, "loss": 0.0496, "step": 5630 }, { "epoch": 8.138378378378379, "grad_norm": 0.4810231328010559, "learning_rate": 6.219336219336218e-05, "loss": 0.0276, "step": 5640 }, { "epoch": 8.152792792792793, "grad_norm": 5.4928789138793945, "learning_rate": 6.17123617123617e-05, "loss": 0.0692, "step": 5650 }, { "epoch": 8.167207207207207, "grad_norm": 5.067449569702148, "learning_rate": 6.123136123136123e-05, "loss": 0.058, "step": 5660 }, { "epoch": 8.181621621621622, "grad_norm": 25.670732498168945, "learning_rate": 6.075036075036074e-05, "loss": 0.1061, "step": 5670 }, { "epoch": 8.196036036036036, "grad_norm": 6.106614589691162, "learning_rate": 6.0269360269360265e-05, "loss": 0.0554, "step": 5680 }, { "epoch": 8.21045045045045, "grad_norm": 7.492941379547119, "learning_rate": 5.978835978835978e-05, "loss": 0.0667, "step": 5690 }, { "epoch": 8.224864864864864, "grad_norm": 1.3118231296539307, "learning_rate": 5.9307359307359304e-05, "loss": 0.0388, "step": 5700 }, { "epoch": 8.239279279279279, "grad_norm": 4.273688316345215, "learning_rate": 5.882635882635882e-05, "loss": 0.047, "step": 5710 }, { "epoch": 8.253693693693693, "grad_norm": 2.6258041858673096, "learning_rate": 5.834535834535834e-05, "loss": 0.0652, "step": 5720 }, { "epoch": 8.268108108108109, "grad_norm": 5.456060886383057, "learning_rate": 5.786435786435786e-05, "loss": 0.1954, "step": 5730 }, { "epoch": 8.282522522522523, "grad_norm": 3.158957004547119, "learning_rate": 5.738335738335738e-05, "loss": 0.0662, "step": 5740 }, { "epoch": 8.296936936936937, "grad_norm": 3.201091766357422, "learning_rate": 5.6902356902356896e-05, "loss": 0.199, "step": 5750 }, { "epoch": 8.311351351351352, "grad_norm": 1.514101505279541, "learning_rate": 5.642135642135642e-05, "loss": 0.1082, "step": 5760 }, { "epoch": 8.325765765765766, "grad_norm": 0.24764111638069153, "learning_rate": 5.5940355940355935e-05, "loss": 0.0607, "step": 5770 }, { "epoch": 8.34018018018018, "grad_norm": 1.5579568147659302, "learning_rate": 5.545935545935545e-05, "loss": 0.0205, "step": 5780 }, { "epoch": 8.354594594594595, "grad_norm": 9.406379699707031, "learning_rate": 5.497835497835497e-05, "loss": 0.0614, "step": 5790 }, { "epoch": 8.369009009009009, "grad_norm": 3.4456870555877686, "learning_rate": 5.449735449735449e-05, "loss": 0.0169, "step": 5800 }, { "epoch": 8.383423423423423, "grad_norm": 0.3121024966239929, "learning_rate": 5.401635401635401e-05, "loss": 0.078, "step": 5810 }, { "epoch": 8.397837837837837, "grad_norm": 7.2323832511901855, "learning_rate": 5.353535353535353e-05, "loss": 0.0794, "step": 5820 }, { "epoch": 8.412252252252252, "grad_norm": 0.42312678694725037, "learning_rate": 5.305435305435305e-05, "loss": 0.0229, "step": 5830 }, { "epoch": 8.426666666666666, "grad_norm": 1.5303746461868286, "learning_rate": 5.2573352573352566e-05, "loss": 0.0555, "step": 5840 }, { "epoch": 8.441081081081082, "grad_norm": 0.5218743681907654, "learning_rate": 5.209235209235209e-05, "loss": 0.097, "step": 5850 }, { "epoch": 8.455495495495496, "grad_norm": 3.4224956035614014, "learning_rate": 5.1611351611351604e-05, "loss": 0.0415, "step": 5860 }, { "epoch": 8.46990990990991, "grad_norm": 0.56160569190979, "learning_rate": 5.113035113035113e-05, "loss": 0.0476, "step": 5870 }, { "epoch": 8.484324324324325, "grad_norm": 2.77597975730896, "learning_rate": 5.064935064935064e-05, "loss": 0.0231, "step": 5880 }, { "epoch": 8.498738738738739, "grad_norm": 2.240520477294922, "learning_rate": 5.016835016835016e-05, "loss": 0.051, "step": 5890 }, { "epoch": 8.513153153153153, "grad_norm": 1.585841178894043, "learning_rate": 4.968734968734968e-05, "loss": 0.0575, "step": 5900 }, { "epoch": 8.527567567567568, "grad_norm": 12.269892692565918, "learning_rate": 4.92063492063492e-05, "loss": 0.0419, "step": 5910 }, { "epoch": 8.541981981981982, "grad_norm": 4.764209747314453, "learning_rate": 4.872534872534872e-05, "loss": 0.1574, "step": 5920 }, { "epoch": 8.556396396396396, "grad_norm": 6.484140396118164, "learning_rate": 4.8244348244348236e-05, "loss": 0.0667, "step": 5930 }, { "epoch": 8.57081081081081, "grad_norm": 8.274352073669434, "learning_rate": 4.7763347763347765e-05, "loss": 0.1035, "step": 5940 }, { "epoch": 8.585225225225225, "grad_norm": 18.833515167236328, "learning_rate": 4.7282347282347274e-05, "loss": 0.0372, "step": 5950 }, { "epoch": 8.599639639639639, "grad_norm": 4.068152904510498, "learning_rate": 4.68013468013468e-05, "loss": 0.0689, "step": 5960 }, { "epoch": 8.614054054054055, "grad_norm": 4.497600078582764, "learning_rate": 4.632034632034632e-05, "loss": 0.0501, "step": 5970 }, { "epoch": 8.62846846846847, "grad_norm": 1.556960940361023, "learning_rate": 4.583934583934583e-05, "loss": 0.0988, "step": 5980 }, { "epoch": 8.642882882882883, "grad_norm": 14.646133422851562, "learning_rate": 4.535834535834536e-05, "loss": 0.055, "step": 5990 }, { "epoch": 8.657297297297298, "grad_norm": 0.7149348258972168, "learning_rate": 4.4877344877344874e-05, "loss": 0.0471, "step": 6000 }, { "epoch": 8.671711711711712, "grad_norm": 0.4112788438796997, "learning_rate": 4.4396344396344396e-05, "loss": 0.0755, "step": 6010 }, { "epoch": 8.686126126126126, "grad_norm": 0.7935078740119934, "learning_rate": 4.391534391534391e-05, "loss": 0.0194, "step": 6020 }, { "epoch": 8.70054054054054, "grad_norm": 2.739198684692383, "learning_rate": 4.3434343434343435e-05, "loss": 0.0313, "step": 6030 }, { "epoch": 8.714954954954955, "grad_norm": 1.197202444076538, "learning_rate": 4.295334295334295e-05, "loss": 0.0473, "step": 6040 }, { "epoch": 8.729369369369369, "grad_norm": 2.7497189044952393, "learning_rate": 4.247234247234247e-05, "loss": 0.0168, "step": 6050 }, { "epoch": 8.743783783783783, "grad_norm": 22.05868911743164, "learning_rate": 4.199134199134199e-05, "loss": 0.0741, "step": 6060 }, { "epoch": 8.758198198198198, "grad_norm": 2.2377078533172607, "learning_rate": 4.151034151034151e-05, "loss": 0.0413, "step": 6070 }, { "epoch": 8.772612612612612, "grad_norm": 1.0943878889083862, "learning_rate": 4.102934102934103e-05, "loss": 0.0475, "step": 6080 }, { "epoch": 8.787027027027026, "grad_norm": 1.7506133317947388, "learning_rate": 4.054834054834054e-05, "loss": 0.0188, "step": 6090 }, { "epoch": 8.801441441441442, "grad_norm": 2.1582717895507812, "learning_rate": 4.0067340067340066e-05, "loss": 0.0407, "step": 6100 }, { "epoch": 8.815855855855856, "grad_norm": 13.355046272277832, "learning_rate": 3.958633958633958e-05, "loss": 0.1049, "step": 6110 }, { "epoch": 8.83027027027027, "grad_norm": 3.4152133464813232, "learning_rate": 3.9105339105339104e-05, "loss": 0.0346, "step": 6120 }, { "epoch": 8.844684684684685, "grad_norm": 0.4933088421821594, "learning_rate": 3.862433862433862e-05, "loss": 0.1112, "step": 6130 }, { "epoch": 8.8590990990991, "grad_norm": 12.00542163848877, "learning_rate": 3.814333814333814e-05, "loss": 0.0318, "step": 6140 }, { "epoch": 8.873513513513513, "grad_norm": 9.061931610107422, "learning_rate": 3.766233766233766e-05, "loss": 0.0962, "step": 6150 }, { "epoch": 8.887927927927928, "grad_norm": 0.15183605253696442, "learning_rate": 3.7181337181337174e-05, "loss": 0.093, "step": 6160 }, { "epoch": 8.902342342342342, "grad_norm": 5.919425010681152, "learning_rate": 3.67003367003367e-05, "loss": 0.0287, "step": 6170 }, { "epoch": 8.916756756756756, "grad_norm": 6.494754791259766, "learning_rate": 3.621933621933621e-05, "loss": 0.0287, "step": 6180 }, { "epoch": 8.93117117117117, "grad_norm": 3.5904083251953125, "learning_rate": 3.5738335738335735e-05, "loss": 0.0247, "step": 6190 }, { "epoch": 8.945585585585585, "grad_norm": 5.52282190322876, "learning_rate": 3.525733525733526e-05, "loss": 0.0644, "step": 6200 }, { "epoch": 8.96, "grad_norm": 3.505472183227539, "learning_rate": 3.4776334776334774e-05, "loss": 0.0133, "step": 6210 }, { "epoch": 8.974414414414415, "grad_norm": 0.13238631188869476, "learning_rate": 3.4295334295334296e-05, "loss": 0.0294, "step": 6220 }, { "epoch": 8.98882882882883, "grad_norm": 1.1236836910247803, "learning_rate": 3.381433381433381e-05, "loss": 0.0468, "step": 6230 }, { "epoch": 8.99891891891892, "eval_accuracy": 0.6504755434782609, "eval_loss": 2.3125061988830566, "eval_runtime": 539.1351, "eval_samples_per_second": 10.921, "eval_steps_per_second": 10.921, "step": 6237 }, { "epoch": 9.004324324324324, "grad_norm": 1.5750885009765625, "learning_rate": 3.333333333333333e-05, "loss": 0.0234, "step": 6240 }, { "epoch": 9.018738738738739, "grad_norm": 0.3882788121700287, "learning_rate": 3.285233285233285e-05, "loss": 0.0151, "step": 6250 }, { "epoch": 9.033153153153153, "grad_norm": 0.2824605107307434, "learning_rate": 3.2371332371332367e-05, "loss": 0.0045, "step": 6260 }, { "epoch": 9.047567567567567, "grad_norm": 0.8951876759529114, "learning_rate": 3.189033189033189e-05, "loss": 0.0058, "step": 6270 }, { "epoch": 9.061981981981981, "grad_norm": 0.6100791096687317, "learning_rate": 3.1409331409331405e-05, "loss": 0.0148, "step": 6280 }, { "epoch": 9.076396396396396, "grad_norm": 8.918787002563477, "learning_rate": 3.092833092833093e-05, "loss": 0.0175, "step": 6290 }, { "epoch": 9.090810810810812, "grad_norm": 0.46548986434936523, "learning_rate": 3.0447330447330447e-05, "loss": 0.006, "step": 6300 }, { "epoch": 9.105225225225226, "grad_norm": 2.6482155323028564, "learning_rate": 2.9966329966329966e-05, "loss": 0.0089, "step": 6310 }, { "epoch": 9.11963963963964, "grad_norm": 0.44524553418159485, "learning_rate": 2.9485329485329485e-05, "loss": 0.0063, "step": 6320 }, { "epoch": 9.134054054054054, "grad_norm": 1.2146574258804321, "learning_rate": 2.9004329004329005e-05, "loss": 0.0065, "step": 6330 }, { "epoch": 9.148468468468469, "grad_norm": 5.5731201171875, "learning_rate": 2.852332852332852e-05, "loss": 0.017, "step": 6340 }, { "epoch": 9.162882882882883, "grad_norm": 1.0001026391983032, "learning_rate": 2.804232804232804e-05, "loss": 0.0095, "step": 6350 }, { "epoch": 9.177297297297297, "grad_norm": 0.22491152584552765, "learning_rate": 2.756132756132756e-05, "loss": 0.0301, "step": 6360 }, { "epoch": 9.191711711711712, "grad_norm": 0.5325976610183716, "learning_rate": 2.7080327080327078e-05, "loss": 0.0296, "step": 6370 }, { "epoch": 9.206126126126126, "grad_norm": 0.44546425342559814, "learning_rate": 2.6599326599326597e-05, "loss": 0.0056, "step": 6380 }, { "epoch": 9.22054054054054, "grad_norm": 3.602013349533081, "learning_rate": 2.6118326118326117e-05, "loss": 0.014, "step": 6390 }, { "epoch": 9.234954954954954, "grad_norm": 0.4638885259628296, "learning_rate": 2.5637325637325636e-05, "loss": 0.01, "step": 6400 }, { "epoch": 9.249369369369369, "grad_norm": 0.21774759888648987, "learning_rate": 2.5156325156325155e-05, "loss": 0.0543, "step": 6410 }, { "epoch": 9.263783783783785, "grad_norm": 0.2262602001428604, "learning_rate": 2.4675324675324674e-05, "loss": 0.0086, "step": 6420 }, { "epoch": 9.278198198198199, "grad_norm": 1.7811743021011353, "learning_rate": 2.4194324194324193e-05, "loss": 0.0109, "step": 6430 }, { "epoch": 9.292612612612613, "grad_norm": 1.6832902431488037, "learning_rate": 2.371332371332371e-05, "loss": 0.0076, "step": 6440 }, { "epoch": 9.307027027027027, "grad_norm": 0.11599577963352203, "learning_rate": 2.323232323232323e-05, "loss": 0.0065, "step": 6450 }, { "epoch": 9.321441441441442, "grad_norm": 0.049297433346509933, "learning_rate": 2.2751322751322748e-05, "loss": 0.0094, "step": 6460 }, { "epoch": 9.335855855855856, "grad_norm": 0.6120862364768982, "learning_rate": 2.2270322270322267e-05, "loss": 0.0065, "step": 6470 }, { "epoch": 9.35027027027027, "grad_norm": 0.24179236590862274, "learning_rate": 2.1789321789321786e-05, "loss": 0.0156, "step": 6480 }, { "epoch": 9.364684684684685, "grad_norm": 1.3065845966339111, "learning_rate": 2.1308321308321305e-05, "loss": 0.0114, "step": 6490 }, { "epoch": 9.379099099099099, "grad_norm": 1.4051166772842407, "learning_rate": 2.0827320827320825e-05, "loss": 0.005, "step": 6500 }, { "epoch": 9.393513513513513, "grad_norm": 1.3191016912460327, "learning_rate": 2.0346320346320344e-05, "loss": 0.0079, "step": 6510 }, { "epoch": 9.407927927927927, "grad_norm": 0.15781471133232117, "learning_rate": 1.9865319865319863e-05, "loss": 0.0144, "step": 6520 }, { "epoch": 9.422342342342342, "grad_norm": 0.2565706968307495, "learning_rate": 1.9384319384319386e-05, "loss": 0.0338, "step": 6530 }, { "epoch": 9.436756756756758, "grad_norm": 0.3341190814971924, "learning_rate": 1.8903318903318905e-05, "loss": 0.0105, "step": 6540 }, { "epoch": 9.451171171171172, "grad_norm": 0.5033118724822998, "learning_rate": 1.842231842231842e-05, "loss": 0.0568, "step": 6550 }, { "epoch": 9.465585585585586, "grad_norm": 1.653732419013977, "learning_rate": 1.794131794131794e-05, "loss": 0.0084, "step": 6560 }, { "epoch": 9.48, "grad_norm": 11.09926700592041, "learning_rate": 1.746031746031746e-05, "loss": 0.0144, "step": 6570 }, { "epoch": 9.494414414414415, "grad_norm": 0.14694152772426605, "learning_rate": 1.697931697931698e-05, "loss": 0.0047, "step": 6580 }, { "epoch": 9.508828828828829, "grad_norm": 0.05755695700645447, "learning_rate": 1.6498316498316498e-05, "loss": 0.0096, "step": 6590 }, { "epoch": 9.523243243243243, "grad_norm": 0.30771782994270325, "learning_rate": 1.6017316017316017e-05, "loss": 0.0143, "step": 6600 }, { "epoch": 9.537657657657657, "grad_norm": 0.2555331885814667, "learning_rate": 1.5536315536315536e-05, "loss": 0.0152, "step": 6610 }, { "epoch": 9.552072072072072, "grad_norm": 0.45528095960617065, "learning_rate": 1.5055315055315054e-05, "loss": 0.0055, "step": 6620 }, { "epoch": 9.566486486486486, "grad_norm": 1.118922472000122, "learning_rate": 1.4574314574314573e-05, "loss": 0.019, "step": 6630 }, { "epoch": 9.5809009009009, "grad_norm": 0.5122382044792175, "learning_rate": 1.4093314093314092e-05, "loss": 0.0534, "step": 6640 }, { "epoch": 9.595315315315315, "grad_norm": 0.18795226514339447, "learning_rate": 1.3612313612313611e-05, "loss": 0.0247, "step": 6650 }, { "epoch": 9.609729729729729, "grad_norm": 1.0938136577606201, "learning_rate": 1.313131313131313e-05, "loss": 0.0062, "step": 6660 }, { "epoch": 9.624144144144145, "grad_norm": 0.13021990656852722, "learning_rate": 1.265031265031265e-05, "loss": 0.0052, "step": 6670 }, { "epoch": 9.63855855855856, "grad_norm": 1.0237598419189453, "learning_rate": 1.2169312169312167e-05, "loss": 0.0106, "step": 6680 }, { "epoch": 9.652972972972973, "grad_norm": 0.8002647161483765, "learning_rate": 1.1688311688311687e-05, "loss": 0.0051, "step": 6690 }, { "epoch": 9.667387387387388, "grad_norm": 0.5976181030273438, "learning_rate": 1.1207311207311206e-05, "loss": 0.0026, "step": 6700 }, { "epoch": 9.681801801801802, "grad_norm": 0.4594089388847351, "learning_rate": 1.0726310726310727e-05, "loss": 0.0045, "step": 6710 }, { "epoch": 9.696216216216216, "grad_norm": 0.6820192933082581, "learning_rate": 1.0245310245310246e-05, "loss": 0.005, "step": 6720 }, { "epoch": 9.71063063063063, "grad_norm": 0.21790215373039246, "learning_rate": 9.764309764309763e-06, "loss": 0.0093, "step": 6730 }, { "epoch": 9.725045045045045, "grad_norm": 3.2225234508514404, "learning_rate": 9.283309283309283e-06, "loss": 0.008, "step": 6740 }, { "epoch": 9.739459459459459, "grad_norm": 2.9584898948669434, "learning_rate": 8.802308802308802e-06, "loss": 0.032, "step": 6750 }, { "epoch": 9.753873873873873, "grad_norm": 0.250264972448349, "learning_rate": 8.321308321308321e-06, "loss": 0.0075, "step": 6760 }, { "epoch": 9.768288288288288, "grad_norm": 14.774813652038574, "learning_rate": 7.840307840307839e-06, "loss": 0.0137, "step": 6770 }, { "epoch": 9.782702702702704, "grad_norm": 16.798877716064453, "learning_rate": 7.359307359307359e-06, "loss": 0.0144, "step": 6780 }, { "epoch": 9.797117117117118, "grad_norm": 0.39727962017059326, "learning_rate": 6.878306878306877e-06, "loss": 0.0068, "step": 6790 }, { "epoch": 9.811531531531532, "grad_norm": 0.6047233939170837, "learning_rate": 6.397306397306397e-06, "loss": 0.0046, "step": 6800 }, { "epoch": 9.825945945945946, "grad_norm": 0.6603574752807617, "learning_rate": 5.916305916305916e-06, "loss": 0.0058, "step": 6810 }, { "epoch": 9.84036036036036, "grad_norm": 0.07351452112197876, "learning_rate": 5.435305435305435e-06, "loss": 0.007, "step": 6820 }, { "epoch": 9.854774774774775, "grad_norm": 0.48447152972221375, "learning_rate": 4.954304954304954e-06, "loss": 0.0059, "step": 6830 }, { "epoch": 9.86918918918919, "grad_norm": 0.12311412394046783, "learning_rate": 4.473304473304473e-06, "loss": 0.0151, "step": 6840 }, { "epoch": 9.883603603603603, "grad_norm": 0.08983255177736282, "learning_rate": 3.992303992303992e-06, "loss": 0.0072, "step": 6850 }, { "epoch": 9.898018018018018, "grad_norm": 0.78732830286026, "learning_rate": 3.511303511303511e-06, "loss": 0.0613, "step": 6860 }, { "epoch": 9.912432432432432, "grad_norm": 0.09099213033914566, "learning_rate": 3.0303030303030305e-06, "loss": 0.0064, "step": 6870 }, { "epoch": 9.926846846846846, "grad_norm": 0.3043908476829529, "learning_rate": 2.5493025493025493e-06, "loss": 0.0062, "step": 6880 }, { "epoch": 9.94126126126126, "grad_norm": 0.16236887872219086, "learning_rate": 2.068302068302068e-06, "loss": 0.007, "step": 6890 }, { "epoch": 9.955675675675675, "grad_norm": 1.6547272205352783, "learning_rate": 1.587301587301587e-06, "loss": 0.0141, "step": 6900 }, { "epoch": 9.97009009009009, "grad_norm": 14.645796775817871, "learning_rate": 1.1063011063011063e-06, "loss": 0.0272, "step": 6910 }, { "epoch": 9.984504504504505, "grad_norm": 0.29277849197387695, "learning_rate": 6.253006253006252e-07, "loss": 0.0082, "step": 6920 }, { "epoch": 9.99891891891892, "grad_norm": 0.7076464891433716, "learning_rate": 1.4430014430014428e-07, "loss": 0.0045, "step": 6930 }, { "epoch": 9.99891891891892, "eval_accuracy": 0.6554008152173914, "eval_loss": 2.2544686794281006, "eval_runtime": 539.1275, "eval_samples_per_second": 10.921, "eval_steps_per_second": 10.921, "step": 6930 }, { "epoch": 9.99891891891892, "step": 6930, "total_flos": 3.884969846408101e+18, "train_loss": 2.9567832476562925, "train_runtime": 60026.293, "train_samples_per_second": 3.697, "train_steps_per_second": 0.115 } ], "logging_steps": 10, "max_steps": 6930, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.884969846408101e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }