{ "best_metric": 0.9884455399784317, "best_model_checkpoint": "swin-large-patch4-window7-224-in22k-finetuned-lora-medmnistv2/checkpoint-4324", "epoch": 9.990749306197966, "eval_steps": 500, "global_step": 5400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 2.677260637283325, "learning_rate": 0.004990740740740741, "loss": 1.6626, "step": 10 }, { "epoch": 0.04, "grad_norm": 2.661841630935669, "learning_rate": 0.004981481481481482, "loss": 1.0711, "step": 20 }, { "epoch": 0.06, "grad_norm": 1.4014948606491089, "learning_rate": 0.0049722222222222225, "loss": 0.9878, "step": 30 }, { "epoch": 0.07, "grad_norm": 1.5774717330932617, "learning_rate": 0.004962962962962963, "loss": 0.832, "step": 40 }, { "epoch": 0.09, "grad_norm": 1.944169521331787, "learning_rate": 0.004953703703703703, "loss": 0.8072, "step": 50 }, { "epoch": 0.11, "grad_norm": 0.9590211510658264, "learning_rate": 0.004944444444444445, "loss": 0.7049, "step": 60 }, { "epoch": 0.13, "grad_norm": 1.8535215854644775, "learning_rate": 0.004935185185185186, "loss": 0.6882, "step": 70 }, { "epoch": 0.15, "grad_norm": 2.145578145980835, "learning_rate": 0.004925925925925926, "loss": 0.8774, "step": 80 }, { "epoch": 0.17, "grad_norm": 1.580334186553955, "learning_rate": 0.004916666666666666, "loss": 0.7384, "step": 90 }, { "epoch": 0.19, "grad_norm": 1.8743101358413696, "learning_rate": 0.004907407407407408, "loss": 0.7732, "step": 100 }, { "epoch": 0.2, "grad_norm": 1.0922596454620361, "learning_rate": 0.004898148148148148, "loss": 0.8463, "step": 110 }, { "epoch": 0.22, "grad_norm": 1.5585048198699951, "learning_rate": 0.004888888888888889, "loss": 0.8079, "step": 120 }, { "epoch": 0.24, "grad_norm": 1.7118746042251587, "learning_rate": 0.00487962962962963, "loss": 0.7567, "step": 130 }, { "epoch": 0.26, "grad_norm": 1.6968052387237549, "learning_rate": 0.00487037037037037, "loss": 0.7144, "step": 140 }, { "epoch": 0.28, "grad_norm": 1.3183552026748657, "learning_rate": 0.004861111111111111, "loss": 0.7303, "step": 150 }, { "epoch": 0.3, "grad_norm": 1.2560759782791138, "learning_rate": 0.004851851851851852, "loss": 0.7694, "step": 160 }, { "epoch": 0.31, "grad_norm": 1.185666561126709, "learning_rate": 0.004842592592592593, "loss": 0.6305, "step": 170 }, { "epoch": 0.33, "grad_norm": 1.3163808584213257, "learning_rate": 0.004833333333333334, "loss": 0.6565, "step": 180 }, { "epoch": 0.35, "grad_norm": 1.3728744983673096, "learning_rate": 0.004824074074074074, "loss": 0.663, "step": 190 }, { "epoch": 0.37, "grad_norm": 1.2197016477584839, "learning_rate": 0.004814814814814814, "loss": 0.7628, "step": 200 }, { "epoch": 0.39, "grad_norm": 2.1777913570404053, "learning_rate": 0.004805555555555556, "loss": 0.5866, "step": 210 }, { "epoch": 0.41, "grad_norm": 2.0859122276306152, "learning_rate": 0.004796296296296297, "loss": 0.6867, "step": 220 }, { "epoch": 0.43, "grad_norm": 1.4262019395828247, "learning_rate": 0.004787962962962963, "loss": 0.6883, "step": 230 }, { "epoch": 0.44, "grad_norm": 1.2657884359359741, "learning_rate": 0.004778703703703704, "loss": 0.651, "step": 240 }, { "epoch": 0.46, "grad_norm": 2.6731324195861816, "learning_rate": 0.004769444444444444, "loss": 0.6012, "step": 250 }, { "epoch": 0.48, "grad_norm": 1.3679182529449463, "learning_rate": 0.004760185185185185, "loss": 0.6211, "step": 260 }, { "epoch": 0.5, "grad_norm": 1.6990712881088257, "learning_rate": 0.004750925925925926, "loss": 0.6239, "step": 270 }, { "epoch": 0.52, "grad_norm": 2.043832302093506, "learning_rate": 0.004741666666666667, "loss": 0.7126, "step": 280 }, { "epoch": 0.54, "grad_norm": 1.4476696252822876, "learning_rate": 0.004732407407407407, "loss": 0.6775, "step": 290 }, { "epoch": 0.56, "grad_norm": 1.9128739833831787, "learning_rate": 0.004723148148148148, "loss": 0.6141, "step": 300 }, { "epoch": 0.57, "grad_norm": 1.5211902856826782, "learning_rate": 0.004713888888888889, "loss": 0.6442, "step": 310 }, { "epoch": 0.59, "grad_norm": 3.0712132453918457, "learning_rate": 0.00470462962962963, "loss": 0.663, "step": 320 }, { "epoch": 0.61, "grad_norm": 1.7837520837783813, "learning_rate": 0.004695370370370371, "loss": 0.674, "step": 330 }, { "epoch": 0.63, "grad_norm": 5.605283260345459, "learning_rate": 0.004686111111111111, "loss": 0.593, "step": 340 }, { "epoch": 0.65, "grad_norm": 2.6977906227111816, "learning_rate": 0.004676851851851852, "loss": 0.6011, "step": 350 }, { "epoch": 0.67, "grad_norm": 1.653228998184204, "learning_rate": 0.004667592592592593, "loss": 0.615, "step": 360 }, { "epoch": 0.68, "grad_norm": 1.4604382514953613, "learning_rate": 0.004658333333333333, "loss": 0.6079, "step": 370 }, { "epoch": 0.7, "grad_norm": 1.8553276062011719, "learning_rate": 0.004649074074074074, "loss": 0.6564, "step": 380 }, { "epoch": 0.72, "grad_norm": 2.2343759536743164, "learning_rate": 0.004639814814814815, "loss": 0.6295, "step": 390 }, { "epoch": 0.74, "grad_norm": 1.534677505493164, "learning_rate": 0.004630555555555555, "loss": 0.6339, "step": 400 }, { "epoch": 0.76, "grad_norm": 2.4698429107666016, "learning_rate": 0.004621296296296296, "loss": 0.7063, "step": 410 }, { "epoch": 0.78, "grad_norm": 2.9529638290405273, "learning_rate": 0.004612037037037038, "loss": 0.6948, "step": 420 }, { "epoch": 0.8, "grad_norm": 1.716919183731079, "learning_rate": 0.004603703703703704, "loss": 0.6224, "step": 430 }, { "epoch": 0.81, "grad_norm": 2.205298662185669, "learning_rate": 0.004594444444444444, "loss": 0.6794, "step": 440 }, { "epoch": 0.83, "grad_norm": 1.8521095514297485, "learning_rate": 0.004585185185185185, "loss": 0.6854, "step": 450 }, { "epoch": 0.85, "grad_norm": 1.3433890342712402, "learning_rate": 0.004575925925925926, "loss": 0.6622, "step": 460 }, { "epoch": 0.87, "grad_norm": 1.289505124092102, "learning_rate": 0.004566666666666667, "loss": 0.5805, "step": 470 }, { "epoch": 0.89, "grad_norm": 2.4270637035369873, "learning_rate": 0.004557407407407408, "loss": 0.5984, "step": 480 }, { "epoch": 0.91, "grad_norm": 2.5372653007507324, "learning_rate": 0.004548148148148148, "loss": 0.6616, "step": 490 }, { "epoch": 0.93, "grad_norm": 2.072876453399658, "learning_rate": 0.004538888888888889, "loss": 0.7577, "step": 500 }, { "epoch": 0.94, "grad_norm": 1.5720523595809937, "learning_rate": 0.00452962962962963, "loss": 0.5948, "step": 510 }, { "epoch": 0.96, "grad_norm": 1.810542106628418, "learning_rate": 0.00452037037037037, "loss": 0.6173, "step": 520 }, { "epoch": 0.98, "grad_norm": 2.1614763736724854, "learning_rate": 0.004511111111111112, "loss": 0.6523, "step": 530 }, { "epoch": 1.0, "grad_norm": 1.757303237915039, "learning_rate": 0.004501851851851852, "loss": 0.6172, "step": 540 }, { "epoch": 1.0, "eval_accuracy": 0.9372977969496226, "eval_f1": 0.9422208225547629, "eval_loss": 0.191307932138443, "eval_precision": 0.9480855535269659, "eval_recall": 0.9426859850591417, "eval_runtime": 105.8196, "eval_samples_per_second": 61.34, "eval_steps_per_second": 3.837, "step": 540 }, { "epoch": 1.02, "grad_norm": 1.6954269409179688, "learning_rate": 0.004492592592592592, "loss": 0.5184, "step": 550 }, { "epoch": 1.04, "grad_norm": 2.1348953247070312, "learning_rate": 0.004483333333333333, "loss": 0.638, "step": 560 }, { "epoch": 1.05, "grad_norm": 2.005446195602417, "learning_rate": 0.004474074074074074, "loss": 0.646, "step": 570 }, { "epoch": 1.07, "grad_norm": 2.4814555644989014, "learning_rate": 0.004464814814814815, "loss": 0.6302, "step": 580 }, { "epoch": 1.09, "grad_norm": 2.1637706756591797, "learning_rate": 0.0044555555555555555, "loss": 0.5987, "step": 590 }, { "epoch": 1.11, "grad_norm": 1.1565651893615723, "learning_rate": 0.004446296296296296, "loss": 0.5838, "step": 600 }, { "epoch": 1.13, "grad_norm": 1.41851007938385, "learning_rate": 0.004437037037037037, "loss": 0.6723, "step": 610 }, { "epoch": 1.15, "grad_norm": 1.4741177558898926, "learning_rate": 0.004427777777777778, "loss": 0.6424, "step": 620 }, { "epoch": 1.17, "grad_norm": 1.8886082172393799, "learning_rate": 0.004418518518518519, "loss": 0.5423, "step": 630 }, { "epoch": 1.18, "grad_norm": 3.1668787002563477, "learning_rate": 0.0044092592592592595, "loss": 0.6426, "step": 640 }, { "epoch": 1.2, "grad_norm": 2.8490853309631348, "learning_rate": 0.0044, "loss": 0.6451, "step": 650 }, { "epoch": 1.22, "grad_norm": 1.922230839729309, "learning_rate": 0.004390740740740741, "loss": 0.646, "step": 660 }, { "epoch": 1.24, "grad_norm": 1.6063907146453857, "learning_rate": 0.004381481481481482, "loss": 0.631, "step": 670 }, { "epoch": 1.26, "grad_norm": 2.440619707107544, "learning_rate": 0.004372222222222223, "loss": 0.6694, "step": 680 }, { "epoch": 1.28, "grad_norm": 2.543225049972534, "learning_rate": 0.0043629629629629635, "loss": 0.6064, "step": 690 }, { "epoch": 1.3, "grad_norm": 2.2633233070373535, "learning_rate": 0.004353703703703703, "loss": 0.7172, "step": 700 }, { "epoch": 1.31, "grad_norm": 2.6559178829193115, "learning_rate": 0.004344444444444445, "loss": 0.7276, "step": 710 }, { "epoch": 1.33, "grad_norm": 2.212648391723633, "learning_rate": 0.004335185185185185, "loss": 0.6017, "step": 720 }, { "epoch": 1.35, "grad_norm": 1.5736876726150513, "learning_rate": 0.004325925925925926, "loss": 0.6352, "step": 730 }, { "epoch": 1.37, "grad_norm": 1.8303929567337036, "learning_rate": 0.004316666666666667, "loss": 0.5693, "step": 740 }, { "epoch": 1.39, "grad_norm": 1.7651495933532715, "learning_rate": 0.004307407407407407, "loss": 0.5643, "step": 750 }, { "epoch": 1.41, "grad_norm": 2.3067328929901123, "learning_rate": 0.004298148148148148, "loss": 0.6661, "step": 760 }, { "epoch": 1.42, "grad_norm": 2.121184825897217, "learning_rate": 0.004288888888888889, "loss": 0.5675, "step": 770 }, { "epoch": 1.44, "grad_norm": 1.3322380781173706, "learning_rate": 0.00427962962962963, "loss": 0.6631, "step": 780 }, { "epoch": 1.46, "grad_norm": 1.6407716274261475, "learning_rate": 0.0042703703703703706, "loss": 0.6184, "step": 790 }, { "epoch": 1.48, "grad_norm": 2.4977192878723145, "learning_rate": 0.004261111111111111, "loss": 0.5324, "step": 800 }, { "epoch": 1.5, "grad_norm": 1.2957355976104736, "learning_rate": 0.004251851851851852, "loss": 0.6039, "step": 810 }, { "epoch": 1.52, "grad_norm": 1.6499762535095215, "learning_rate": 0.004242592592592593, "loss": 0.6452, "step": 820 }, { "epoch": 1.54, "grad_norm": 2.024014949798584, "learning_rate": 0.004233333333333334, "loss": 0.673, "step": 830 }, { "epoch": 1.55, "grad_norm": 1.1818047761917114, "learning_rate": 0.004224074074074074, "loss": 0.6826, "step": 840 }, { "epoch": 1.57, "grad_norm": 1.610490083694458, "learning_rate": 0.0042148148148148145, "loss": 0.6284, "step": 850 }, { "epoch": 1.59, "grad_norm": 4.2171311378479, "learning_rate": 0.004205555555555556, "loss": 0.583, "step": 860 }, { "epoch": 1.61, "grad_norm": 1.984252691268921, "learning_rate": 0.004196296296296296, "loss": 0.6036, "step": 870 }, { "epoch": 1.63, "grad_norm": 1.9518765211105347, "learning_rate": 0.004187037037037037, "loss": 0.5909, "step": 880 }, { "epoch": 1.65, "grad_norm": 2.7360193729400635, "learning_rate": 0.0041777777777777785, "loss": 0.6071, "step": 890 }, { "epoch": 1.67, "grad_norm": 3.141552209854126, "learning_rate": 0.0041685185185185184, "loss": 0.5846, "step": 900 }, { "epoch": 1.68, "grad_norm": 3.816471815109253, "learning_rate": 0.004159259259259259, "loss": 0.6058, "step": 910 }, { "epoch": 1.7, "grad_norm": 1.957501769065857, "learning_rate": 0.00415, "loss": 0.6491, "step": 920 }, { "epoch": 1.72, "grad_norm": 7.639843463897705, "learning_rate": 0.004140740740740741, "loss": 0.6144, "step": 930 }, { "epoch": 1.74, "grad_norm": 2.967031955718994, "learning_rate": 0.004131481481481482, "loss": 0.7173, "step": 940 }, { "epoch": 1.76, "grad_norm": 2.3182485103607178, "learning_rate": 0.004122222222222222, "loss": 0.6002, "step": 950 }, { "epoch": 1.78, "grad_norm": 2.3655409812927246, "learning_rate": 0.004112962962962962, "loss": 0.5993, "step": 960 }, { "epoch": 1.79, "grad_norm": 2.82918643951416, "learning_rate": 0.004103703703703704, "loss": 0.5827, "step": 970 }, { "epoch": 1.81, "grad_norm": 3.4129443168640137, "learning_rate": 0.004094444444444445, "loss": 0.6606, "step": 980 }, { "epoch": 1.83, "grad_norm": 2.3152365684509277, "learning_rate": 0.004085185185185185, "loss": 0.6762, "step": 990 }, { "epoch": 1.85, "grad_norm": 1.8874484300613403, "learning_rate": 0.004075925925925926, "loss": 0.7619, "step": 1000 }, { "epoch": 1.87, "grad_norm": 1.565685510635376, "learning_rate": 0.004066666666666667, "loss": 0.7038, "step": 1010 }, { "epoch": 1.89, "grad_norm": 3.39355206489563, "learning_rate": 0.004057407407407407, "loss": 0.6543, "step": 1020 }, { "epoch": 1.91, "grad_norm": 1.670486330986023, "learning_rate": 0.004048148148148148, "loss": 0.7117, "step": 1030 }, { "epoch": 1.92, "grad_norm": 2.0435540676116943, "learning_rate": 0.004038888888888889, "loss": 0.7173, "step": 1040 }, { "epoch": 1.94, "grad_norm": 2.260481119155884, "learning_rate": 0.0040296296296296295, "loss": 0.6442, "step": 1050 }, { "epoch": 1.96, "grad_norm": 4.03023099899292, "learning_rate": 0.00402037037037037, "loss": 0.5518, "step": 1060 }, { "epoch": 1.98, "grad_norm": 2.807311773300171, "learning_rate": 0.004011111111111111, "loss": 0.5961, "step": 1070 }, { "epoch": 2.0, "grad_norm": 3.02470326423645, "learning_rate": 0.004001851851851852, "loss": 0.6346, "step": 1080 }, { "epoch": 2.0, "eval_accuracy": 0.9759667231551379, "eval_f1": 0.9769796000181948, "eval_loss": 0.0756431594491005, "eval_precision": 0.9799222297554231, "eval_recall": 0.9751785999265923, "eval_runtime": 105.7688, "eval_samples_per_second": 61.37, "eval_steps_per_second": 3.839, "step": 1081 }, { "epoch": 2.02, "grad_norm": 2.4306721687316895, "learning_rate": 0.003992592592592593, "loss": 0.5623, "step": 1090 }, { "epoch": 2.04, "grad_norm": 3.5204544067382812, "learning_rate": 0.0039833333333333335, "loss": 0.6119, "step": 1100 }, { "epoch": 2.05, "grad_norm": 1.6892510652542114, "learning_rate": 0.003974074074074074, "loss": 0.6725, "step": 1110 }, { "epoch": 2.07, "grad_norm": 1.9901163578033447, "learning_rate": 0.003964814814814815, "loss": 0.6753, "step": 1120 }, { "epoch": 2.09, "grad_norm": 2.7705817222595215, "learning_rate": 0.003955555555555556, "loss": 0.6667, "step": 1130 }, { "epoch": 2.11, "grad_norm": 2.376143217086792, "learning_rate": 0.003946296296296296, "loss": 0.6805, "step": 1140 }, { "epoch": 2.13, "grad_norm": 2.719224214553833, "learning_rate": 0.0039370370370370375, "loss": 0.66, "step": 1150 }, { "epoch": 2.15, "grad_norm": 2.0327024459838867, "learning_rate": 0.003927777777777778, "loss": 0.6336, "step": 1160 }, { "epoch": 2.16, "grad_norm": 1.984775424003601, "learning_rate": 0.003918518518518518, "loss": 0.5996, "step": 1170 }, { "epoch": 2.18, "grad_norm": 2.203216075897217, "learning_rate": 0.003909259259259259, "loss": 0.6567, "step": 1180 }, { "epoch": 2.2, "grad_norm": 1.6019185781478882, "learning_rate": 0.0039000000000000003, "loss": 0.6497, "step": 1190 }, { "epoch": 2.22, "grad_norm": 1.9226101636886597, "learning_rate": 0.0038907407407407406, "loss": 0.6658, "step": 1200 }, { "epoch": 2.24, "grad_norm": 3.1916518211364746, "learning_rate": 0.0038814814814814814, "loss": 0.7171, "step": 1210 }, { "epoch": 2.26, "grad_norm": 1.9380624294281006, "learning_rate": 0.0038722222222222226, "loss": 0.7219, "step": 1220 }, { "epoch": 2.28, "grad_norm": 2.232342481613159, "learning_rate": 0.003862962962962963, "loss": 0.7147, "step": 1230 }, { "epoch": 2.29, "grad_norm": 2.556619644165039, "learning_rate": 0.003853703703703704, "loss": 0.6061, "step": 1240 }, { "epoch": 2.31, "grad_norm": 4.261257648468018, "learning_rate": 0.003844444444444444, "loss": 0.6232, "step": 1250 }, { "epoch": 2.33, "grad_norm": 5.815138339996338, "learning_rate": 0.0038351851851851854, "loss": 0.6059, "step": 1260 }, { "epoch": 2.35, "grad_norm": 2.543003797531128, "learning_rate": 0.003825925925925926, "loss": 0.7635, "step": 1270 }, { "epoch": 2.37, "grad_norm": 3.6232597827911377, "learning_rate": 0.0038166666666666666, "loss": 0.6533, "step": 1280 }, { "epoch": 2.39, "grad_norm": 3.13364839553833, "learning_rate": 0.003807407407407408, "loss": 0.6824, "step": 1290 }, { "epoch": 2.41, "grad_norm": 1.5059545040130615, "learning_rate": 0.003798148148148148, "loss": 0.6682, "step": 1300 }, { "epoch": 2.42, "grad_norm": 2.015592575073242, "learning_rate": 0.003788888888888889, "loss": 0.6536, "step": 1310 }, { "epoch": 2.44, "grad_norm": 1.8305333852767944, "learning_rate": 0.0037796296296296297, "loss": 0.6503, "step": 1320 }, { "epoch": 2.46, "grad_norm": 2.674013376235962, "learning_rate": 0.0037703703703703705, "loss": 0.6235, "step": 1330 }, { "epoch": 2.48, "grad_norm": 4.233829021453857, "learning_rate": 0.0037611111111111113, "loss": 0.7144, "step": 1340 }, { "epoch": 2.5, "grad_norm": 2.49525785446167, "learning_rate": 0.0037518518518518517, "loss": 0.5574, "step": 1350 }, { "epoch": 2.52, "grad_norm": 2.353803873062134, "learning_rate": 0.0037425925925925925, "loss": 0.687, "step": 1360 }, { "epoch": 2.53, "grad_norm": 2.101120710372925, "learning_rate": 0.0037333333333333337, "loss": 0.5579, "step": 1370 }, { "epoch": 2.55, "grad_norm": 1.9760828018188477, "learning_rate": 0.003724074074074074, "loss": 0.7199, "step": 1380 }, { "epoch": 2.57, "grad_norm": 3.785053014755249, "learning_rate": 0.003714814814814815, "loss": 0.6915, "step": 1390 }, { "epoch": 2.59, "grad_norm": 3.4531219005584717, "learning_rate": 0.0037055555555555557, "loss": 0.6379, "step": 1400 }, { "epoch": 2.61, "grad_norm": 2.45957088470459, "learning_rate": 0.0036962962962962965, "loss": 0.6632, "step": 1410 }, { "epoch": 2.63, "grad_norm": 2.7392988204956055, "learning_rate": 0.0036870370370370373, "loss": 0.6151, "step": 1420 }, { "epoch": 2.65, "grad_norm": 1.5700833797454834, "learning_rate": 0.0036777777777777776, "loss": 0.6362, "step": 1430 }, { "epoch": 2.66, "grad_norm": 2.2544972896575928, "learning_rate": 0.003668518518518519, "loss": 0.6567, "step": 1440 }, { "epoch": 2.68, "grad_norm": 2.2333929538726807, "learning_rate": 0.0036592592592592592, "loss": 0.6005, "step": 1450 }, { "epoch": 2.7, "grad_norm": 1.6665147542953491, "learning_rate": 0.00365, "loss": 0.5697, "step": 1460 }, { "epoch": 2.72, "grad_norm": 2.4265899658203125, "learning_rate": 0.0036407407407407404, "loss": 0.5576, "step": 1470 }, { "epoch": 2.74, "grad_norm": 2.1805577278137207, "learning_rate": 0.0036314814814814816, "loss": 0.5729, "step": 1480 }, { "epoch": 2.76, "grad_norm": 1.8916809558868408, "learning_rate": 0.0036222222222222224, "loss": 0.6201, "step": 1490 }, { "epoch": 2.78, "grad_norm": 1.9663447141647339, "learning_rate": 0.003612962962962963, "loss": 0.5541, "step": 1500 }, { "epoch": 2.79, "grad_norm": 1.565356731414795, "learning_rate": 0.003603703703703704, "loss": 0.6453, "step": 1510 }, { "epoch": 2.81, "grad_norm": 1.5807263851165771, "learning_rate": 0.003594444444444445, "loss": 0.5988, "step": 1520 }, { "epoch": 2.83, "grad_norm": 2.013056993484497, "learning_rate": 0.003585185185185185, "loss": 0.5613, "step": 1530 }, { "epoch": 2.85, "grad_norm": 2.1835033893585205, "learning_rate": 0.003575925925925926, "loss": 0.68, "step": 1540 }, { "epoch": 2.87, "grad_norm": 3.3731331825256348, "learning_rate": 0.0035666666666666668, "loss": 0.6693, "step": 1550 }, { "epoch": 2.89, "grad_norm": 2.4271302223205566, "learning_rate": 0.0035574074074074076, "loss": 0.5841, "step": 1560 }, { "epoch": 2.9, "grad_norm": 3.0260560512542725, "learning_rate": 0.003548148148148148, "loss": 0.5775, "step": 1570 }, { "epoch": 2.92, "grad_norm": 2.215425968170166, "learning_rate": 0.0035388888888888887, "loss": 0.6203, "step": 1580 }, { "epoch": 2.94, "grad_norm": 2.4513442516326904, "learning_rate": 0.00352962962962963, "loss": 0.6032, "step": 1590 }, { "epoch": 2.96, "grad_norm": 3.0825629234313965, "learning_rate": 0.0035212962962962962, "loss": 0.5534, "step": 1600 }, { "epoch": 2.98, "grad_norm": 3.933262825012207, "learning_rate": 0.003512037037037037, "loss": 0.5943, "step": 1610 }, { "epoch": 3.0, "grad_norm": 3.9379372596740723, "learning_rate": 0.0035027777777777783, "loss": 0.6405, "step": 1620 }, { "epoch": 3.0, "eval_accuracy": 0.9553227545832691, "eval_f1": 0.9537075013208948, "eval_loss": 0.13096250593662262, "eval_precision": 0.959969032218872, "eval_recall": 0.9515227670577108, "eval_runtime": 105.8838, "eval_samples_per_second": 61.303, "eval_steps_per_second": 3.834, "step": 1621 }, { "epoch": 3.02, "grad_norm": 4.636434078216553, "learning_rate": 0.0034935185185185186, "loss": 0.5521, "step": 1630 }, { "epoch": 3.03, "grad_norm": 2.875552177429199, "learning_rate": 0.0034842592592592594, "loss": 0.584, "step": 1640 }, { "epoch": 3.05, "grad_norm": 2.166994333267212, "learning_rate": 0.003475, "loss": 0.6998, "step": 1650 }, { "epoch": 3.07, "grad_norm": 2.1271350383758545, "learning_rate": 0.003465740740740741, "loss": 0.5648, "step": 1660 }, { "epoch": 3.09, "grad_norm": 2.3423948287963867, "learning_rate": 0.0034564814814814814, "loss": 0.6761, "step": 1670 }, { "epoch": 3.11, "grad_norm": 4.546567440032959, "learning_rate": 0.003447222222222222, "loss": 0.6555, "step": 1680 }, { "epoch": 3.13, "grad_norm": 2.9972333908081055, "learning_rate": 0.0034379629629629634, "loss": 0.6147, "step": 1690 }, { "epoch": 3.15, "grad_norm": 1.9627290964126587, "learning_rate": 0.0034287037037037038, "loss": 0.6231, "step": 1700 }, { "epoch": 3.16, "grad_norm": 3.252190589904785, "learning_rate": 0.0034194444444444446, "loss": 0.6926, "step": 1710 }, { "epoch": 3.18, "grad_norm": 3.4038755893707275, "learning_rate": 0.003410185185185185, "loss": 0.6724, "step": 1720 }, { "epoch": 3.2, "grad_norm": 2.501878261566162, "learning_rate": 0.003400925925925926, "loss": 0.6141, "step": 1730 }, { "epoch": 3.22, "grad_norm": 2.0606675148010254, "learning_rate": 0.003391666666666667, "loss": 0.5758, "step": 1740 }, { "epoch": 3.24, "grad_norm": 2.631194829940796, "learning_rate": 0.0033824074074074073, "loss": 0.6344, "step": 1750 }, { "epoch": 3.26, "grad_norm": 2.717015027999878, "learning_rate": 0.003373148148148148, "loss": 0.6729, "step": 1760 }, { "epoch": 3.27, "grad_norm": 2.4547502994537354, "learning_rate": 0.003363888888888889, "loss": 0.6162, "step": 1770 }, { "epoch": 3.29, "grad_norm": 4.384427070617676, "learning_rate": 0.0033546296296296297, "loss": 0.5629, "step": 1780 }, { "epoch": 3.31, "grad_norm": 3.193366765975952, "learning_rate": 0.00334537037037037, "loss": 0.6054, "step": 1790 }, { "epoch": 3.33, "grad_norm": 2.8141987323760986, "learning_rate": 0.0033361111111111113, "loss": 0.6117, "step": 1800 }, { "epoch": 3.35, "grad_norm": 1.7347623109817505, "learning_rate": 0.003326851851851852, "loss": 0.6268, "step": 1810 }, { "epoch": 3.37, "grad_norm": 5.643610000610352, "learning_rate": 0.0033175925925925925, "loss": 0.5513, "step": 1820 }, { "epoch": 3.39, "grad_norm": 3.0566015243530273, "learning_rate": 0.0033083333333333333, "loss": 0.7019, "step": 1830 }, { "epoch": 3.4, "grad_norm": 2.0485594272613525, "learning_rate": 0.0032990740740740745, "loss": 0.5568, "step": 1840 }, { "epoch": 3.42, "grad_norm": 2.5610201358795166, "learning_rate": 0.003289814814814815, "loss": 0.5935, "step": 1850 }, { "epoch": 3.44, "grad_norm": 14.397619247436523, "learning_rate": 0.0032805555555555557, "loss": 0.5506, "step": 1860 }, { "epoch": 3.46, "grad_norm": 2.7251882553100586, "learning_rate": 0.003271296296296296, "loss": 0.6064, "step": 1870 }, { "epoch": 3.48, "grad_norm": 2.093822479248047, "learning_rate": 0.0032620370370370372, "loss": 0.7011, "step": 1880 }, { "epoch": 3.5, "grad_norm": 2.208249807357788, "learning_rate": 0.003252777777777778, "loss": 0.6524, "step": 1890 }, { "epoch": 3.52, "grad_norm": 11.702254295349121, "learning_rate": 0.0032435185185185184, "loss": 0.6089, "step": 1900 }, { "epoch": 3.53, "grad_norm": 3.271688938140869, "learning_rate": 0.0032342592592592596, "loss": 0.7139, "step": 1910 }, { "epoch": 3.55, "grad_norm": 5.287911415100098, "learning_rate": 0.003225, "loss": 0.575, "step": 1920 }, { "epoch": 3.57, "grad_norm": 3.2775847911834717, "learning_rate": 0.003215740740740741, "loss": 0.6546, "step": 1930 }, { "epoch": 3.59, "grad_norm": 2.3923332691192627, "learning_rate": 0.003206481481481481, "loss": 0.5964, "step": 1940 }, { "epoch": 3.61, "grad_norm": 2.3694956302642822, "learning_rate": 0.0031972222222222224, "loss": 0.6172, "step": 1950 }, { "epoch": 3.63, "grad_norm": 3.5944793224334717, "learning_rate": 0.003187962962962963, "loss": 0.574, "step": 1960 }, { "epoch": 3.64, "grad_norm": 1.8562227487564087, "learning_rate": 0.0031787037037037036, "loss": 0.6079, "step": 1970 }, { "epoch": 3.66, "grad_norm": 3.2188656330108643, "learning_rate": 0.0031694444444444443, "loss": 0.5484, "step": 1980 }, { "epoch": 3.68, "grad_norm": 3.9696381092071533, "learning_rate": 0.0031601851851851856, "loss": 0.6438, "step": 1990 }, { "epoch": 3.7, "grad_norm": 1.935719609260559, "learning_rate": 0.003150925925925926, "loss": 0.6175, "step": 2000 }, { "epoch": 3.72, "grad_norm": 2.686809778213501, "learning_rate": 0.0031416666666666667, "loss": 0.5267, "step": 2010 }, { "epoch": 3.74, "grad_norm": 1.6655242443084717, "learning_rate": 0.0031324074074074075, "loss": 0.6053, "step": 2020 }, { "epoch": 3.76, "grad_norm": 2.1189332008361816, "learning_rate": 0.0031231481481481483, "loss": 0.5282, "step": 2030 }, { "epoch": 3.77, "grad_norm": 2.889960289001465, "learning_rate": 0.0031138888888888887, "loss": 0.61, "step": 2040 }, { "epoch": 3.79, "grad_norm": 3.7036290168762207, "learning_rate": 0.0031046296296296295, "loss": 0.6091, "step": 2050 }, { "epoch": 3.81, "grad_norm": 4.608720302581787, "learning_rate": 0.0030953703703703707, "loss": 0.6725, "step": 2060 }, { "epoch": 3.83, "grad_norm": 3.6915183067321777, "learning_rate": 0.003086111111111111, "loss": 0.6051, "step": 2070 }, { "epoch": 3.85, "grad_norm": 5.5720133781433105, "learning_rate": 0.003076851851851852, "loss": 0.6552, "step": 2080 }, { "epoch": 3.87, "grad_norm": 3.984224319458008, "learning_rate": 0.0030675925925925922, "loss": 0.6326, "step": 2090 }, { "epoch": 3.89, "grad_norm": 4.13102388381958, "learning_rate": 0.0030583333333333335, "loss": 0.6286, "step": 2100 }, { "epoch": 3.9, "grad_norm": 5.23762845993042, "learning_rate": 0.0030490740740740743, "loss": 0.6575, "step": 2110 }, { "epoch": 3.92, "grad_norm": 3.083714008331299, "learning_rate": 0.0030398148148148146, "loss": 0.6214, "step": 2120 }, { "epoch": 3.94, "grad_norm": 2.6413919925689697, "learning_rate": 0.003030555555555556, "loss": 0.6019, "step": 2130 }, { "epoch": 3.96, "grad_norm": 1.659439206123352, "learning_rate": 0.0030212962962962962, "loss": 0.6417, "step": 2140 }, { "epoch": 3.98, "grad_norm": 3.3802456855773926, "learning_rate": 0.003012037037037037, "loss": 0.5898, "step": 2150 }, { "epoch": 4.0, "grad_norm": 3.054283618927002, "learning_rate": 0.003002777777777778, "loss": 0.5005, "step": 2160 }, { "epoch": 4.0, "eval_accuracy": 0.9662609767370205, "eval_f1": 0.972867802534218, "eval_loss": 0.1138468086719513, "eval_precision": 0.9756603079675337, "eval_recall": 0.971821943372282, "eval_runtime": 105.7692, "eval_samples_per_second": 61.369, "eval_steps_per_second": 3.839, "step": 2162 }, { "epoch": 4.01, "grad_norm": 3.2501673698425293, "learning_rate": 0.0029935185185185186, "loss": 0.6619, "step": 2170 }, { "epoch": 4.03, "grad_norm": 3.0431628227233887, "learning_rate": 0.0029842592592592594, "loss": 0.6635, "step": 2180 }, { "epoch": 4.05, "grad_norm": 2.6278398036956787, "learning_rate": 0.0029749999999999998, "loss": 0.6468, "step": 2190 }, { "epoch": 4.07, "grad_norm": 1.7868436574935913, "learning_rate": 0.002965740740740741, "loss": 0.6325, "step": 2200 }, { "epoch": 4.09, "grad_norm": 4.32276725769043, "learning_rate": 0.002956481481481482, "loss": 0.5745, "step": 2210 }, { "epoch": 4.11, "grad_norm": 1.9450454711914062, "learning_rate": 0.002947222222222222, "loss": 0.5264, "step": 2220 }, { "epoch": 4.13, "grad_norm": 8.342818260192871, "learning_rate": 0.002937962962962963, "loss": 0.5547, "step": 2230 }, { "epoch": 4.14, "grad_norm": 4.070902347564697, "learning_rate": 0.002928703703703704, "loss": 0.5232, "step": 2240 }, { "epoch": 4.16, "grad_norm": 2.327827215194702, "learning_rate": 0.0029194444444444446, "loss": 0.5651, "step": 2250 }, { "epoch": 4.18, "grad_norm": 2.1814072132110596, "learning_rate": 0.0029101851851851854, "loss": 0.6027, "step": 2260 }, { "epoch": 4.2, "grad_norm": 3.3634531497955322, "learning_rate": 0.0029009259259259257, "loss": 0.579, "step": 2270 }, { "epoch": 4.22, "grad_norm": 2.795017957687378, "learning_rate": 0.002891666666666667, "loss": 0.6487, "step": 2280 }, { "epoch": 4.24, "grad_norm": 2.5090396404266357, "learning_rate": 0.0028824074074074073, "loss": 0.6351, "step": 2290 }, { "epoch": 4.26, "grad_norm": 4.025092601776123, "learning_rate": 0.002873148148148148, "loss": 0.6395, "step": 2300 }, { "epoch": 4.27, "grad_norm": 2.139432668685913, "learning_rate": 0.0028638888888888893, "loss": 0.5465, "step": 2310 }, { "epoch": 4.29, "grad_norm": 2.7012085914611816, "learning_rate": 0.0028546296296296297, "loss": 0.5872, "step": 2320 }, { "epoch": 4.31, "grad_norm": 3.428055763244629, "learning_rate": 0.0028453703703703705, "loss": 0.5588, "step": 2330 }, { "epoch": 4.33, "grad_norm": 3.3620471954345703, "learning_rate": 0.002836111111111111, "loss": 0.5958, "step": 2340 }, { "epoch": 4.35, "grad_norm": 1.7665202617645264, "learning_rate": 0.002826851851851852, "loss": 0.4828, "step": 2350 }, { "epoch": 4.37, "grad_norm": 2.4530186653137207, "learning_rate": 0.002817592592592593, "loss": 0.6112, "step": 2360 }, { "epoch": 4.38, "grad_norm": 1.5522198677062988, "learning_rate": 0.0028083333333333333, "loss": 0.6276, "step": 2370 }, { "epoch": 4.4, "grad_norm": 2.6151957511901855, "learning_rate": 0.002799074074074074, "loss": 0.5665, "step": 2380 }, { "epoch": 4.42, "grad_norm": 2.1417715549468994, "learning_rate": 0.002789814814814815, "loss": 0.5213, "step": 2390 }, { "epoch": 4.44, "grad_norm": 3.485398530960083, "learning_rate": 0.0027805555555555556, "loss": 0.6137, "step": 2400 }, { "epoch": 4.46, "grad_norm": 7.003100395202637, "learning_rate": 0.002771296296296296, "loss": 0.5872, "step": 2410 }, { "epoch": 4.48, "grad_norm": 3.0954577922821045, "learning_rate": 0.0027620370370370372, "loss": 0.5891, "step": 2420 }, { "epoch": 4.5, "grad_norm": 6.439086437225342, "learning_rate": 0.002752777777777778, "loss": 0.6085, "step": 2430 }, { "epoch": 4.51, "grad_norm": 3.5923094749450684, "learning_rate": 0.0027435185185185184, "loss": 0.6787, "step": 2440 }, { "epoch": 4.53, "grad_norm": 2.043170690536499, "learning_rate": 0.002734259259259259, "loss": 0.5666, "step": 2450 }, { "epoch": 4.55, "grad_norm": 2.8164470195770264, "learning_rate": 0.0027250000000000004, "loss": 0.6586, "step": 2460 }, { "epoch": 4.57, "grad_norm": 2.2632648944854736, "learning_rate": 0.002715740740740741, "loss": 0.6106, "step": 2470 }, { "epoch": 4.59, "grad_norm": 2.98234224319458, "learning_rate": 0.0027064814814814816, "loss": 0.5545, "step": 2480 }, { "epoch": 4.61, "grad_norm": 2.2962846755981445, "learning_rate": 0.002697222222222222, "loss": 0.5755, "step": 2490 }, { "epoch": 4.63, "grad_norm": 4.67765474319458, "learning_rate": 0.002687962962962963, "loss": 0.4411, "step": 2500 }, { "epoch": 4.64, "grad_norm": 3.6627418994903564, "learning_rate": 0.0026787037037037035, "loss": 0.63, "step": 2510 }, { "epoch": 4.66, "grad_norm": 3.7983245849609375, "learning_rate": 0.0026694444444444443, "loss": 0.5024, "step": 2520 }, { "epoch": 4.68, "grad_norm": 2.797543525695801, "learning_rate": 0.0026601851851851856, "loss": 0.5812, "step": 2530 }, { "epoch": 4.7, "grad_norm": 2.688131332397461, "learning_rate": 0.002650925925925926, "loss": 0.6246, "step": 2540 }, { "epoch": 4.72, "grad_norm": 3.1840107440948486, "learning_rate": 0.0026416666666666667, "loss": 0.5858, "step": 2550 }, { "epoch": 4.74, "grad_norm": 1.386940598487854, "learning_rate": 0.002632407407407407, "loss": 0.5242, "step": 2560 }, { "epoch": 4.75, "grad_norm": 2.7501676082611084, "learning_rate": 0.0026231481481481483, "loss": 0.5272, "step": 2570 }, { "epoch": 4.77, "grad_norm": 2.656128406524658, "learning_rate": 0.002613888888888889, "loss": 0.5537, "step": 2580 }, { "epoch": 4.79, "grad_norm": 3.3853468894958496, "learning_rate": 0.0026046296296296295, "loss": 0.6347, "step": 2590 }, { "epoch": 4.81, "grad_norm": 3.0706825256347656, "learning_rate": 0.0025953703703703703, "loss": 0.5902, "step": 2600 }, { "epoch": 4.83, "grad_norm": 2.470815658569336, "learning_rate": 0.0025861111111111115, "loss": 0.5707, "step": 2610 }, { "epoch": 4.85, "grad_norm": 2.8364624977111816, "learning_rate": 0.002576851851851852, "loss": 0.6205, "step": 2620 }, { "epoch": 4.87, "grad_norm": 2.347895860671997, "learning_rate": 0.0025675925925925927, "loss": 0.6068, "step": 2630 }, { "epoch": 4.88, "grad_norm": 2.5347235202789307, "learning_rate": 0.0025583333333333335, "loss": 0.5555, "step": 2640 }, { "epoch": 4.9, "grad_norm": 1.7792493104934692, "learning_rate": 0.0025490740740740743, "loss": 0.6124, "step": 2650 }, { "epoch": 4.92, "grad_norm": 1.5881773233413696, "learning_rate": 0.0025398148148148146, "loss": 0.5311, "step": 2660 }, { "epoch": 4.94, "grad_norm": 2.866182327270508, "learning_rate": 0.0025305555555555554, "loss": 0.5833, "step": 2670 }, { "epoch": 4.96, "grad_norm": 2.6761910915374756, "learning_rate": 0.0025212962962962967, "loss": 0.5367, "step": 2680 }, { "epoch": 4.98, "grad_norm": 3.0073177814483643, "learning_rate": 0.002512037037037037, "loss": 0.5316, "step": 2690 }, { "epoch": 5.0, "grad_norm": 2.769132375717163, "learning_rate": 0.002502777777777778, "loss": 0.5669, "step": 2700 }, { "epoch": 5.0, "eval_accuracy": 0.9602526575258049, "eval_f1": 0.9665087531565827, "eval_loss": 0.11420344561338425, "eval_precision": 0.9704223501308576, "eval_recall": 0.9647162237423828, "eval_runtime": 105.6456, "eval_samples_per_second": 61.441, "eval_steps_per_second": 3.843, "step": 2702 }, { "epoch": 5.01, "grad_norm": 2.4607672691345215, "learning_rate": 0.0024935185185185186, "loss": 0.5564, "step": 2710 }, { "epoch": 5.03, "grad_norm": 1.7590909004211426, "learning_rate": 0.0024842592592592594, "loss": 0.5319, "step": 2720 }, { "epoch": 5.05, "grad_norm": 6.092741966247559, "learning_rate": 0.002475, "loss": 0.5901, "step": 2730 }, { "epoch": 5.07, "grad_norm": 2.170964479446411, "learning_rate": 0.002465740740740741, "loss": 0.5454, "step": 2740 }, { "epoch": 5.09, "grad_norm": 1.7933937311172485, "learning_rate": 0.0024564814814814814, "loss": 0.5448, "step": 2750 }, { "epoch": 5.11, "grad_norm": 1.6023855209350586, "learning_rate": 0.002447222222222222, "loss": 0.5337, "step": 2760 }, { "epoch": 5.12, "grad_norm": 2.6876323223114014, "learning_rate": 0.002437962962962963, "loss": 0.621, "step": 2770 }, { "epoch": 5.14, "grad_norm": 3.007115125656128, "learning_rate": 0.0024287037037037038, "loss": 0.6147, "step": 2780 }, { "epoch": 5.16, "grad_norm": 1.462850570678711, "learning_rate": 0.0024194444444444446, "loss": 0.5601, "step": 2790 }, { "epoch": 5.18, "grad_norm": 2.3480947017669678, "learning_rate": 0.0024101851851851853, "loss": 0.5242, "step": 2800 }, { "epoch": 5.2, "grad_norm": 2.632812023162842, "learning_rate": 0.0024009259259259257, "loss": 0.5104, "step": 2810 }, { "epoch": 5.22, "grad_norm": 4.110454559326172, "learning_rate": 0.0023916666666666665, "loss": 0.5169, "step": 2820 }, { "epoch": 5.24, "grad_norm": 1.5732221603393555, "learning_rate": 0.0023824074074074077, "loss": 0.5126, "step": 2830 }, { "epoch": 5.25, "grad_norm": 2.7537384033203125, "learning_rate": 0.002373148148148148, "loss": 0.5494, "step": 2840 }, { "epoch": 5.27, "grad_norm": 1.9949324131011963, "learning_rate": 0.002363888888888889, "loss": 0.6381, "step": 2850 }, { "epoch": 5.29, "grad_norm": 2.0324654579162598, "learning_rate": 0.0023546296296296297, "loss": 0.5944, "step": 2860 }, { "epoch": 5.31, "grad_norm": 2.691060781478882, "learning_rate": 0.0023453703703703705, "loss": 0.5745, "step": 2870 }, { "epoch": 5.33, "grad_norm": 3.5940804481506348, "learning_rate": 0.0023361111111111113, "loss": 0.5423, "step": 2880 }, { "epoch": 5.35, "grad_norm": 2.7014036178588867, "learning_rate": 0.002326851851851852, "loss": 0.5708, "step": 2890 }, { "epoch": 5.37, "grad_norm": 4.630244255065918, "learning_rate": 0.0023175925925925924, "loss": 0.5235, "step": 2900 }, { "epoch": 5.38, "grad_norm": 4.1839752197265625, "learning_rate": 0.0023083333333333332, "loss": 0.5385, "step": 2910 }, { "epoch": 5.4, "grad_norm": 2.4296653270721436, "learning_rate": 0.002299074074074074, "loss": 0.4755, "step": 2920 }, { "epoch": 5.42, "grad_norm": 3.140748977661133, "learning_rate": 0.002289814814814815, "loss": 0.5625, "step": 2930 }, { "epoch": 5.44, "grad_norm": 3.9935102462768555, "learning_rate": 0.0022805555555555556, "loss": 0.5864, "step": 2940 }, { "epoch": 5.46, "grad_norm": 3.6175553798675537, "learning_rate": 0.0022712962962962964, "loss": 0.4942, "step": 2950 }, { "epoch": 5.48, "grad_norm": 2.6416540145874023, "learning_rate": 0.0022620370370370372, "loss": 0.5522, "step": 2960 }, { "epoch": 5.49, "grad_norm": 2.512601137161255, "learning_rate": 0.0022527777777777776, "loss": 0.5134, "step": 2970 }, { "epoch": 5.51, "grad_norm": 2.6757636070251465, "learning_rate": 0.002243518518518519, "loss": 0.4753, "step": 2980 }, { "epoch": 5.53, "grad_norm": 5.020870208740234, "learning_rate": 0.002234259259259259, "loss": 0.5596, "step": 2990 }, { "epoch": 5.55, "grad_norm": 1.6189966201782227, "learning_rate": 0.002225, "loss": 0.4691, "step": 3000 }, { "epoch": 5.57, "grad_norm": 1.8074935674667358, "learning_rate": 0.0022157407407407408, "loss": 0.5388, "step": 3010 }, { "epoch": 5.59, "grad_norm": 2.1580889225006104, "learning_rate": 0.0022064814814814816, "loss": 0.5539, "step": 3020 }, { "epoch": 5.61, "grad_norm": 3.8424339294433594, "learning_rate": 0.0021972222222222224, "loss": 0.5362, "step": 3030 }, { "epoch": 5.62, "grad_norm": 2.5609259605407715, "learning_rate": 0.002187962962962963, "loss": 0.4883, "step": 3040 }, { "epoch": 5.64, "grad_norm": 3.077549934387207, "learning_rate": 0.002178703703703704, "loss": 0.5324, "step": 3050 }, { "epoch": 5.66, "grad_norm": 3.0118846893310547, "learning_rate": 0.0021694444444444443, "loss": 0.483, "step": 3060 }, { "epoch": 5.68, "grad_norm": 2.3558125495910645, "learning_rate": 0.002160185185185185, "loss": 0.5453, "step": 3070 }, { "epoch": 5.7, "grad_norm": 3.1456894874572754, "learning_rate": 0.002150925925925926, "loss": 0.5333, "step": 3080 }, { "epoch": 5.72, "grad_norm": 2.671800374984741, "learning_rate": 0.0021416666666666667, "loss": 0.5446, "step": 3090 }, { "epoch": 5.74, "grad_norm": 4.476646423339844, "learning_rate": 0.0021324074074074075, "loss": 0.5464, "step": 3100 }, { "epoch": 5.75, "grad_norm": 2.332273483276367, "learning_rate": 0.0021231481481481483, "loss": 0.4712, "step": 3110 }, { "epoch": 5.77, "grad_norm": 2.072690725326538, "learning_rate": 0.0021138888888888887, "loss": 0.4919, "step": 3120 }, { "epoch": 5.79, "grad_norm": 2.1038475036621094, "learning_rate": 0.0021046296296296295, "loss": 0.4628, "step": 3130 }, { "epoch": 5.81, "grad_norm": 3.188100576400757, "learning_rate": 0.0020953703703703707, "loss": 0.5378, "step": 3140 }, { "epoch": 5.83, "grad_norm": 1.139237403869629, "learning_rate": 0.002086111111111111, "loss": 0.4754, "step": 3150 }, { "epoch": 5.85, "grad_norm": 1.6128747463226318, "learning_rate": 0.002076851851851852, "loss": 0.4934, "step": 3160 }, { "epoch": 5.86, "grad_norm": 2.8483083248138428, "learning_rate": 0.0020675925925925927, "loss": 0.5424, "step": 3170 }, { "epoch": 5.88, "grad_norm": 2.0242919921875, "learning_rate": 0.0020583333333333335, "loss": 0.5385, "step": 3180 }, { "epoch": 5.9, "grad_norm": 2.0240182876586914, "learning_rate": 0.002049074074074074, "loss": 0.5485, "step": 3190 }, { "epoch": 5.92, "grad_norm": 4.104608535766602, "learning_rate": 0.002039814814814815, "loss": 0.5597, "step": 3200 }, { "epoch": 5.94, "grad_norm": 2.405686616897583, "learning_rate": 0.0020305555555555554, "loss": 0.6002, "step": 3210 }, { "epoch": 5.96, "grad_norm": 2.7426254749298096, "learning_rate": 0.002021296296296296, "loss": 0.544, "step": 3220 }, { "epoch": 5.98, "grad_norm": 2.5380473136901855, "learning_rate": 0.002012037037037037, "loss": 0.52, "step": 3230 }, { "epoch": 5.99, "grad_norm": 2.7862181663513184, "learning_rate": 0.002002777777777778, "loss": 0.5548, "step": 3240 }, { "epoch": 6.0, "eval_accuracy": 0.9771991988907719, "eval_f1": 0.9795134539181521, "eval_loss": 0.056862443685531616, "eval_precision": 0.9812055832949096, "eval_recall": 0.9784771483290412, "eval_runtime": 105.3717, "eval_samples_per_second": 61.601, "eval_steps_per_second": 3.853, "step": 3243 }, { "epoch": 6.01, "grad_norm": 3.968397378921509, "learning_rate": 0.0019935185185185186, "loss": 0.5039, "step": 3250 }, { "epoch": 6.03, "grad_norm": 1.8520976305007935, "learning_rate": 0.0019842592592592594, "loss": 0.498, "step": 3260 }, { "epoch": 6.05, "grad_norm": 2.38435959815979, "learning_rate": 0.001975, "loss": 0.4984, "step": 3270 }, { "epoch": 6.07, "grad_norm": 2.9417970180511475, "learning_rate": 0.0019657407407407406, "loss": 0.489, "step": 3280 }, { "epoch": 6.09, "grad_norm": 2.5652129650115967, "learning_rate": 0.001956481481481482, "loss": 0.5028, "step": 3290 }, { "epoch": 6.11, "grad_norm": 3.151473045349121, "learning_rate": 0.0019472222222222221, "loss": 0.4651, "step": 3300 }, { "epoch": 6.12, "grad_norm": 2.743474006652832, "learning_rate": 0.001937962962962963, "loss": 0.4489, "step": 3310 }, { "epoch": 6.14, "grad_norm": 2.15732741355896, "learning_rate": 0.0019287037037037035, "loss": 0.4736, "step": 3320 }, { "epoch": 6.16, "grad_norm": 3.1959879398345947, "learning_rate": 0.0019194444444444445, "loss": 0.4998, "step": 3330 }, { "epoch": 6.18, "grad_norm": 3.215951919555664, "learning_rate": 0.0019101851851851853, "loss": 0.4991, "step": 3340 }, { "epoch": 6.2, "grad_norm": 2.889775276184082, "learning_rate": 0.001900925925925926, "loss": 0.4817, "step": 3350 }, { "epoch": 6.22, "grad_norm": 3.2150237560272217, "learning_rate": 0.0018916666666666667, "loss": 0.5287, "step": 3360 }, { "epoch": 6.23, "grad_norm": 1.6671727895736694, "learning_rate": 0.0018824074074074075, "loss": 0.4492, "step": 3370 }, { "epoch": 6.25, "grad_norm": 2.812654733657837, "learning_rate": 0.0018731481481481483, "loss": 0.5098, "step": 3380 }, { "epoch": 6.27, "grad_norm": 2.0343258380889893, "learning_rate": 0.0018638888888888889, "loss": 0.4513, "step": 3390 }, { "epoch": 6.29, "grad_norm": 2.830615997314453, "learning_rate": 0.0018546296296296297, "loss": 0.4621, "step": 3400 }, { "epoch": 6.31, "grad_norm": 1.4622855186462402, "learning_rate": 0.0018453703703703703, "loss": 0.4741, "step": 3410 }, { "epoch": 6.33, "grad_norm": 2.521087884902954, "learning_rate": 0.0018361111111111113, "loss": 0.4932, "step": 3420 }, { "epoch": 6.35, "grad_norm": 2.6355080604553223, "learning_rate": 0.0018268518518518519, "loss": 0.4676, "step": 3430 }, { "epoch": 6.36, "grad_norm": 6.4964189529418945, "learning_rate": 0.0018175925925925927, "loss": 0.4437, "step": 3440 }, { "epoch": 6.38, "grad_norm": 2.58648943901062, "learning_rate": 0.0018083333333333335, "loss": 0.4583, "step": 3450 }, { "epoch": 6.4, "grad_norm": 2.710174560546875, "learning_rate": 0.001799074074074074, "loss": 0.5327, "step": 3460 }, { "epoch": 6.42, "grad_norm": 4.029747486114502, "learning_rate": 0.001789814814814815, "loss": 0.5417, "step": 3470 }, { "epoch": 6.44, "grad_norm": 1.4080957174301147, "learning_rate": 0.0017805555555555556, "loss": 0.527, "step": 3480 }, { "epoch": 6.46, "grad_norm": 2.5939226150512695, "learning_rate": 0.0017712962962962964, "loss": 0.4533, "step": 3490 }, { "epoch": 6.48, "grad_norm": 1.7863126993179321, "learning_rate": 0.001762037037037037, "loss": 0.4796, "step": 3500 }, { "epoch": 6.49, "grad_norm": 2.5097310543060303, "learning_rate": 0.0017527777777777778, "loss": 0.5026, "step": 3510 }, { "epoch": 6.51, "grad_norm": 3.5115163326263428, "learning_rate": 0.0017435185185185184, "loss": 0.5033, "step": 3520 }, { "epoch": 6.53, "grad_norm": 3.9437549114227295, "learning_rate": 0.0017342592592592594, "loss": 0.5327, "step": 3530 }, { "epoch": 6.55, "grad_norm": 2.61135196685791, "learning_rate": 0.001725, "loss": 0.4928, "step": 3540 }, { "epoch": 6.57, "grad_norm": 2.595803737640381, "learning_rate": 0.0017157407407407408, "loss": 0.4475, "step": 3550 }, { "epoch": 6.59, "grad_norm": 2.401987075805664, "learning_rate": 0.0017064814814814816, "loss": 0.4093, "step": 3560 }, { "epoch": 6.6, "grad_norm": 2.670671224594116, "learning_rate": 0.0016972222222222221, "loss": 0.4736, "step": 3570 }, { "epoch": 6.62, "grad_norm": 2.6677041053771973, "learning_rate": 0.0016879629629629632, "loss": 0.3864, "step": 3580 }, { "epoch": 6.64, "grad_norm": 2.3009049892425537, "learning_rate": 0.0016787037037037037, "loss": 0.486, "step": 3590 }, { "epoch": 6.66, "grad_norm": 3.2118513584136963, "learning_rate": 0.0016694444444444445, "loss": 0.4634, "step": 3600 }, { "epoch": 6.68, "grad_norm": 2.185634136199951, "learning_rate": 0.0016601851851851851, "loss": 0.4865, "step": 3610 }, { "epoch": 6.7, "grad_norm": 4.046822547912598, "learning_rate": 0.001650925925925926, "loss": 0.4624, "step": 3620 }, { "epoch": 6.72, "grad_norm": 2.3376824855804443, "learning_rate": 0.0016416666666666665, "loss": 0.4652, "step": 3630 }, { "epoch": 6.73, "grad_norm": 3.318936347961426, "learning_rate": 0.0016324074074074075, "loss": 0.4595, "step": 3640 }, { "epoch": 6.75, "grad_norm": 2.0213863849639893, "learning_rate": 0.0016231481481481483, "loss": 0.4768, "step": 3650 }, { "epoch": 6.77, "grad_norm": 1.718332052230835, "learning_rate": 0.0016138888888888889, "loss": 0.5239, "step": 3660 }, { "epoch": 6.79, "grad_norm": 1.7439289093017578, "learning_rate": 0.0016046296296296297, "loss": 0.525, "step": 3670 }, { "epoch": 6.81, "grad_norm": 2.79032826423645, "learning_rate": 0.0015953703703703703, "loss": 0.4299, "step": 3680 }, { "epoch": 6.83, "grad_norm": 1.8835906982421875, "learning_rate": 0.0015861111111111113, "loss": 0.4833, "step": 3690 }, { "epoch": 6.85, "grad_norm": 2.179826021194458, "learning_rate": 0.0015768518518518519, "loss": 0.4499, "step": 3700 }, { "epoch": 6.86, "grad_norm": 2.882157325744629, "learning_rate": 0.0015675925925925926, "loss": 0.4296, "step": 3710 }, { "epoch": 6.88, "grad_norm": 2.04009747505188, "learning_rate": 0.0015583333333333332, "loss": 0.4246, "step": 3720 }, { "epoch": 6.9, "grad_norm": 1.883562684059143, "learning_rate": 0.0015490740740740742, "loss": 0.4877, "step": 3730 }, { "epoch": 6.92, "grad_norm": 2.195427179336548, "learning_rate": 0.0015398148148148148, "loss": 0.4985, "step": 3740 }, { "epoch": 6.94, "grad_norm": 1.643837332725525, "learning_rate": 0.0015305555555555556, "loss": 0.4668, "step": 3750 }, { "epoch": 6.96, "grad_norm": 1.886772871017456, "learning_rate": 0.0015212962962962964, "loss": 0.4143, "step": 3760 }, { "epoch": 6.98, "grad_norm": 1.8772404193878174, "learning_rate": 0.001512037037037037, "loss": 0.4765, "step": 3770 }, { "epoch": 6.99, "grad_norm": 2.053422689437866, "learning_rate": 0.001502777777777778, "loss": 0.4298, "step": 3780 }, { "epoch": 7.0, "eval_accuracy": 0.9662609767370205, "eval_f1": 0.9735569034773577, "eval_loss": 0.09894031286239624, "eval_precision": 0.9770102531109687, "eval_recall": 0.9722826041431603, "eval_runtime": 105.603, "eval_samples_per_second": 61.466, "eval_steps_per_second": 3.845, "step": 3783 }, { "epoch": 7.01, "grad_norm": 2.0501725673675537, "learning_rate": 0.0014935185185185186, "loss": 0.4187, "step": 3790 }, { "epoch": 7.03, "grad_norm": 1.9505852460861206, "learning_rate": 0.0014842592592592594, "loss": 0.5045, "step": 3800 }, { "epoch": 7.05, "grad_norm": 2.5469818115234375, "learning_rate": 0.001475, "loss": 0.4567, "step": 3810 }, { "epoch": 7.07, "grad_norm": 3.28265643119812, "learning_rate": 0.0014657407407407408, "loss": 0.4738, "step": 3820 }, { "epoch": 7.09, "grad_norm": 0.9891900420188904, "learning_rate": 0.0014564814814814813, "loss": 0.4332, "step": 3830 }, { "epoch": 7.1, "grad_norm": 2.7753162384033203, "learning_rate": 0.0014472222222222224, "loss": 0.4334, "step": 3840 }, { "epoch": 7.12, "grad_norm": 2.795895576477051, "learning_rate": 0.001437962962962963, "loss": 0.5147, "step": 3850 }, { "epoch": 7.14, "grad_norm": 1.353401780128479, "learning_rate": 0.0014287037037037037, "loss": 0.4358, "step": 3860 }, { "epoch": 7.16, "grad_norm": 2.1106724739074707, "learning_rate": 0.0014194444444444445, "loss": 0.3712, "step": 3870 }, { "epoch": 7.18, "grad_norm": 3.6988472938537598, "learning_rate": 0.0014101851851851851, "loss": 0.4664, "step": 3880 }, { "epoch": 7.2, "grad_norm": 2.6754424571990967, "learning_rate": 0.0014009259259259261, "loss": 0.4231, "step": 3890 }, { "epoch": 7.22, "grad_norm": 2.6841065883636475, "learning_rate": 0.0013916666666666667, "loss": 0.4378, "step": 3900 }, { "epoch": 7.23, "grad_norm": 2.5743584632873535, "learning_rate": 0.0013824074074074075, "loss": 0.3719, "step": 3910 }, { "epoch": 7.25, "grad_norm": 1.586349606513977, "learning_rate": 0.001373148148148148, "loss": 0.3748, "step": 3920 }, { "epoch": 7.27, "grad_norm": 1.5181403160095215, "learning_rate": 0.0013638888888888889, "loss": 0.4544, "step": 3930 }, { "epoch": 7.29, "grad_norm": 1.539527416229248, "learning_rate": 0.0013546296296296295, "loss": 0.3758, "step": 3940 }, { "epoch": 7.31, "grad_norm": 1.6864418983459473, "learning_rate": 0.0013453703703703705, "loss": 0.3795, "step": 3950 }, { "epoch": 7.33, "grad_norm": 2.111827850341797, "learning_rate": 0.0013361111111111113, "loss": 0.4379, "step": 3960 }, { "epoch": 7.35, "grad_norm": 1.9402803182601929, "learning_rate": 0.0013268518518518518, "loss": 0.446, "step": 3970 }, { "epoch": 7.36, "grad_norm": 1.413064956665039, "learning_rate": 0.0013175925925925926, "loss": 0.509, "step": 3980 }, { "epoch": 7.38, "grad_norm": 1.7911455631256104, "learning_rate": 0.0013083333333333332, "loss": 0.4938, "step": 3990 }, { "epoch": 7.4, "grad_norm": 2.256588935852051, "learning_rate": 0.0012990740740740742, "loss": 0.4844, "step": 4000 }, { "epoch": 7.42, "grad_norm": 1.993911623954773, "learning_rate": 0.0012898148148148148, "loss": 0.419, "step": 4010 }, { "epoch": 7.44, "grad_norm": 1.3340710401535034, "learning_rate": 0.0012805555555555556, "loss": 0.4464, "step": 4020 }, { "epoch": 7.46, "grad_norm": 2.902743339538574, "learning_rate": 0.0012712962962962962, "loss": 0.4538, "step": 4030 }, { "epoch": 7.47, "grad_norm": 2.30949068069458, "learning_rate": 0.001262037037037037, "loss": 0.4209, "step": 4040 }, { "epoch": 7.49, "grad_norm": 2.7385501861572266, "learning_rate": 0.0012527777777777778, "loss": 0.3801, "step": 4050 }, { "epoch": 7.51, "grad_norm": 1.1858464479446411, "learning_rate": 0.0012435185185185186, "loss": 0.3956, "step": 4060 }, { "epoch": 7.53, "grad_norm": 1.694409728050232, "learning_rate": 0.0012342592592592594, "loss": 0.3786, "step": 4070 }, { "epoch": 7.55, "grad_norm": 2.7329421043395996, "learning_rate": 0.001225, "loss": 0.4497, "step": 4080 }, { "epoch": 7.57, "grad_norm": 2.352189540863037, "learning_rate": 0.0012157407407407408, "loss": 0.3877, "step": 4090 }, { "epoch": 7.59, "grad_norm": 4.443812847137451, "learning_rate": 0.0012064814814814816, "loss": 0.367, "step": 4100 }, { "epoch": 7.6, "grad_norm": 2.3426809310913086, "learning_rate": 0.0011972222222222221, "loss": 0.4192, "step": 4110 }, { "epoch": 7.62, "grad_norm": 2.791630268096924, "learning_rate": 0.0011879629629629631, "loss": 0.407, "step": 4120 }, { "epoch": 7.64, "grad_norm": 2.74001145362854, "learning_rate": 0.0011787037037037037, "loss": 0.3679, "step": 4130 }, { "epoch": 7.66, "grad_norm": 3.0963590145111084, "learning_rate": 0.0011694444444444445, "loss": 0.3808, "step": 4140 }, { "epoch": 7.68, "grad_norm": 1.7934086322784424, "learning_rate": 0.0011601851851851853, "loss": 0.427, "step": 4150 }, { "epoch": 7.7, "grad_norm": 2.269930601119995, "learning_rate": 0.001150925925925926, "loss": 0.3847, "step": 4160 }, { "epoch": 7.72, "grad_norm": 1.5360560417175293, "learning_rate": 0.0011416666666666667, "loss": 0.4534, "step": 4170 }, { "epoch": 7.73, "grad_norm": 1.9300364255905151, "learning_rate": 0.0011324074074074075, "loss": 0.4014, "step": 4180 }, { "epoch": 7.75, "grad_norm": 1.613265037536621, "learning_rate": 0.001123148148148148, "loss": 0.4852, "step": 4190 }, { "epoch": 7.77, "grad_norm": 1.625320553779602, "learning_rate": 0.0011138888888888889, "loss": 0.4753, "step": 4200 }, { "epoch": 7.79, "grad_norm": 2.176623821258545, "learning_rate": 0.0011046296296296297, "loss": 0.4611, "step": 4210 }, { "epoch": 7.81, "grad_norm": 2.2461256980895996, "learning_rate": 0.0010953703703703705, "loss": 0.4326, "step": 4220 }, { "epoch": 7.83, "grad_norm": 2.6430141925811768, "learning_rate": 0.0010861111111111113, "loss": 0.3267, "step": 4230 }, { "epoch": 7.84, "grad_norm": 2.5161099433898926, "learning_rate": 0.0010768518518518518, "loss": 0.4114, "step": 4240 }, { "epoch": 7.86, "grad_norm": 2.0569732189178467, "learning_rate": 0.0010675925925925926, "loss": 0.4323, "step": 4250 }, { "epoch": 7.88, "grad_norm": 1.884279489517212, "learning_rate": 0.0010583333333333334, "loss": 0.4652, "step": 4260 }, { "epoch": 7.9, "grad_norm": 1.4818202257156372, "learning_rate": 0.001049074074074074, "loss": 0.3994, "step": 4270 }, { "epoch": 7.92, "grad_norm": 2.8348388671875, "learning_rate": 0.0010398148148148148, "loss": 0.3829, "step": 4280 }, { "epoch": 7.94, "grad_norm": 1.3215655088424683, "learning_rate": 0.0010305555555555556, "loss": 0.4266, "step": 4290 }, { "epoch": 7.96, "grad_norm": 2.0123190879821777, "learning_rate": 0.0010212962962962962, "loss": 0.3671, "step": 4300 }, { "epoch": 7.97, "grad_norm": 2.014122486114502, "learning_rate": 0.001012037037037037, "loss": 0.4353, "step": 4310 }, { "epoch": 7.99, "grad_norm": 5.686545372009277, "learning_rate": 0.0010027777777777778, "loss": 0.3932, "step": 4320 }, { "epoch": 8.0, "eval_accuracy": 0.9884455399784317, "eval_f1": 0.989380892500223, "eval_loss": 0.03349991887807846, "eval_precision": 0.9903161849271732, "eval_recall": 0.9887487397554633, "eval_runtime": 105.0862, "eval_samples_per_second": 61.768, "eval_steps_per_second": 3.863, "step": 4324 }, { "epoch": 8.01, "grad_norm": 1.598591685295105, "learning_rate": 0.0009935185185185186, "loss": 0.4489, "step": 4330 }, { "epoch": 8.03, "grad_norm": 1.8468151092529297, "learning_rate": 0.0009842592592592594, "loss": 0.3364, "step": 4340 }, { "epoch": 8.05, "grad_norm": 2.6677463054656982, "learning_rate": 0.0009750000000000001, "loss": 0.4332, "step": 4350 }, { "epoch": 8.07, "grad_norm": 2.6927859783172607, "learning_rate": 0.0009657407407407408, "loss": 0.4052, "step": 4360 }, { "epoch": 8.09, "grad_norm": 1.6345399618148804, "learning_rate": 0.0009564814814814815, "loss": 0.4157, "step": 4370 }, { "epoch": 8.1, "grad_norm": 2.7183444499969482, "learning_rate": 0.0009472222222222222, "loss": 0.4309, "step": 4380 }, { "epoch": 8.12, "grad_norm": 1.2181589603424072, "learning_rate": 0.0009379629629629629, "loss": 0.34, "step": 4390 }, { "epoch": 8.14, "grad_norm": 1.3929060697555542, "learning_rate": 0.0009287037037037037, "loss": 0.3394, "step": 4400 }, { "epoch": 8.16, "grad_norm": 2.487053632736206, "learning_rate": 0.0009194444444444444, "loss": 0.3793, "step": 4410 }, { "epoch": 8.18, "grad_norm": 4.430356979370117, "learning_rate": 0.0009101851851851851, "loss": 0.3657, "step": 4420 }, { "epoch": 8.2, "grad_norm": 3.2201452255249023, "learning_rate": 0.000900925925925926, "loss": 0.4235, "step": 4430 }, { "epoch": 8.21, "grad_norm": 1.5028598308563232, "learning_rate": 0.0008916666666666667, "loss": 0.4478, "step": 4440 }, { "epoch": 8.23, "grad_norm": 1.374112844467163, "learning_rate": 0.0008824074074074075, "loss": 0.4129, "step": 4450 }, { "epoch": 8.25, "grad_norm": 1.8899506330490112, "learning_rate": 0.0008731481481481482, "loss": 0.3859, "step": 4460 }, { "epoch": 8.27, "grad_norm": 2.019902229309082, "learning_rate": 0.0008638888888888889, "loss": 0.4101, "step": 4470 }, { "epoch": 8.29, "grad_norm": 1.705667495727539, "learning_rate": 0.0008546296296296297, "loss": 0.3493, "step": 4480 }, { "epoch": 8.31, "grad_norm": 2.031428098678589, "learning_rate": 0.0008453703703703704, "loss": 0.4074, "step": 4490 }, { "epoch": 8.33, "grad_norm": 2.7189221382141113, "learning_rate": 0.0008361111111111111, "loss": 0.3786, "step": 4500 }, { "epoch": 8.34, "grad_norm": 2.454854726791382, "learning_rate": 0.0008268518518518518, "loss": 0.4436, "step": 4510 }, { "epoch": 8.36, "grad_norm": 1.298943281173706, "learning_rate": 0.0008175925925925925, "loss": 0.4313, "step": 4520 }, { "epoch": 8.38, "grad_norm": 2.463287591934204, "learning_rate": 0.0008083333333333333, "loss": 0.4414, "step": 4530 }, { "epoch": 8.4, "grad_norm": 2.80354642868042, "learning_rate": 0.0007990740740740741, "loss": 0.4188, "step": 4540 }, { "epoch": 8.42, "grad_norm": 3.166337251663208, "learning_rate": 0.0007898148148148149, "loss": 0.4059, "step": 4550 }, { "epoch": 8.44, "grad_norm": 1.655117392539978, "learning_rate": 0.0007805555555555556, "loss": 0.3877, "step": 4560 }, { "epoch": 8.46, "grad_norm": 2.249906539916992, "learning_rate": 0.0007712962962962963, "loss": 0.355, "step": 4570 }, { "epoch": 8.47, "grad_norm": 1.6171294450759888, "learning_rate": 0.0007620370370370371, "loss": 0.3233, "step": 4580 }, { "epoch": 8.49, "grad_norm": 2.709031581878662, "learning_rate": 0.0007527777777777778, "loss": 0.3407, "step": 4590 }, { "epoch": 8.51, "grad_norm": 2.6156270503997803, "learning_rate": 0.0007435185185185185, "loss": 0.3779, "step": 4600 }, { "epoch": 8.53, "grad_norm": 1.640161156654358, "learning_rate": 0.0007342592592592593, "loss": 0.3557, "step": 4610 }, { "epoch": 8.55, "grad_norm": 1.8425498008728027, "learning_rate": 0.000725, "loss": 0.3987, "step": 4620 }, { "epoch": 8.57, "grad_norm": 2.946497917175293, "learning_rate": 0.0007157407407407406, "loss": 0.4044, "step": 4630 }, { "epoch": 8.58, "grad_norm": 2.3729591369628906, "learning_rate": 0.0007064814814814815, "loss": 0.358, "step": 4640 }, { "epoch": 8.6, "grad_norm": 1.634698510169983, "learning_rate": 0.0006972222222222222, "loss": 0.3837, "step": 4650 }, { "epoch": 8.62, "grad_norm": 2.881890296936035, "learning_rate": 0.000687962962962963, "loss": 0.3643, "step": 4660 }, { "epoch": 8.64, "grad_norm": 1.3335785865783691, "learning_rate": 0.0006787037037037037, "loss": 0.3887, "step": 4670 }, { "epoch": 8.66, "grad_norm": 2.1087048053741455, "learning_rate": 0.0006694444444444445, "loss": 0.3354, "step": 4680 }, { "epoch": 8.68, "grad_norm": 3.421175956726074, "learning_rate": 0.0006601851851851852, "loss": 0.4143, "step": 4690 }, { "epoch": 8.7, "grad_norm": 1.5602165460586548, "learning_rate": 0.0006509259259259259, "loss": 0.3928, "step": 4700 }, { "epoch": 8.71, "grad_norm": 3.690290689468384, "learning_rate": 0.0006416666666666667, "loss": 0.3157, "step": 4710 }, { "epoch": 8.73, "grad_norm": 2.0590598583221436, "learning_rate": 0.0006324074074074074, "loss": 0.3986, "step": 4720 }, { "epoch": 8.75, "grad_norm": 1.1233676671981812, "learning_rate": 0.0006231481481481482, "loss": 0.3456, "step": 4730 }, { "epoch": 8.77, "grad_norm": 1.8680920600891113, "learning_rate": 0.0006138888888888889, "loss": 0.4048, "step": 4740 }, { "epoch": 8.79, "grad_norm": 1.1571403741836548, "learning_rate": 0.0006046296296296297, "loss": 0.3535, "step": 4750 }, { "epoch": 8.81, "grad_norm": 1.505861520767212, "learning_rate": 0.0005953703703703703, "loss": 0.3203, "step": 4760 }, { "epoch": 8.83, "grad_norm": 1.7797300815582275, "learning_rate": 0.0005861111111111111, "loss": 0.3308, "step": 4770 }, { "epoch": 8.84, "grad_norm": 1.0974406003952026, "learning_rate": 0.0005768518518518518, "loss": 0.3759, "step": 4780 }, { "epoch": 8.86, "grad_norm": 1.5202006101608276, "learning_rate": 0.0005675925925925926, "loss": 0.3725, "step": 4790 }, { "epoch": 8.88, "grad_norm": 1.2904443740844727, "learning_rate": 0.0005583333333333333, "loss": 0.3535, "step": 4800 }, { "epoch": 8.9, "grad_norm": 1.4673963785171509, "learning_rate": 0.0005490740740740741, "loss": 0.3937, "step": 4810 }, { "epoch": 8.92, "grad_norm": 1.0130095481872559, "learning_rate": 0.0005398148148148149, "loss": 0.3306, "step": 4820 }, { "epoch": 8.94, "grad_norm": 1.4989911317825317, "learning_rate": 0.0005305555555555556, "loss": 0.4111, "step": 4830 }, { "epoch": 8.95, "grad_norm": 1.36995267868042, "learning_rate": 0.0005212962962962963, "loss": 0.3791, "step": 4840 }, { "epoch": 8.97, "grad_norm": 2.134408950805664, "learning_rate": 0.0005120370370370371, "loss": 0.3691, "step": 4850 }, { "epoch": 8.99, "grad_norm": 2.304680347442627, "learning_rate": 0.0005027777777777778, "loss": 0.3409, "step": 4860 }, { "epoch": 9.0, "eval_accuracy": 0.9878293021106147, "eval_f1": 0.9886590982069928, "eval_loss": 0.03709910064935684, "eval_precision": 0.989964784159849, "eval_recall": 0.9876911349748236, "eval_runtime": 105.3983, "eval_samples_per_second": 61.585, "eval_steps_per_second": 3.852, "step": 4864 }, { "epoch": 9.01, "grad_norm": 1.6748008728027344, "learning_rate": 0.0004935185185185186, "loss": 0.3296, "step": 4870 }, { "epoch": 9.03, "grad_norm": 0.8384755849838257, "learning_rate": 0.0004842592592592593, "loss": 0.3316, "step": 4880 }, { "epoch": 9.05, "grad_norm": 2.137038230895996, "learning_rate": 0.000475, "loss": 0.3698, "step": 4890 }, { "epoch": 9.07, "grad_norm": 1.4509310722351074, "learning_rate": 0.00046574074074074074, "loss": 0.3456, "step": 4900 }, { "epoch": 9.08, "grad_norm": 2.2383487224578857, "learning_rate": 0.0004564814814814815, "loss": 0.3705, "step": 4910 }, { "epoch": 9.1, "grad_norm": 1.851224660873413, "learning_rate": 0.00044722222222222217, "loss": 0.3645, "step": 4920 }, { "epoch": 9.12, "grad_norm": 1.9950298070907593, "learning_rate": 0.00043796296296296297, "loss": 0.3502, "step": 4930 }, { "epoch": 9.14, "grad_norm": 2.0750277042388916, "learning_rate": 0.0004287037037037037, "loss": 0.2884, "step": 4940 }, { "epoch": 9.16, "grad_norm": 2.44145131111145, "learning_rate": 0.00041944444444444445, "loss": 0.3615, "step": 4950 }, { "epoch": 9.18, "grad_norm": 1.4347975254058838, "learning_rate": 0.0004101851851851852, "loss": 0.3275, "step": 4960 }, { "epoch": 9.2, "grad_norm": 1.7355468273162842, "learning_rate": 0.0004009259259259259, "loss": 0.3158, "step": 4970 }, { "epoch": 9.21, "grad_norm": 1.5040051937103271, "learning_rate": 0.0003916666666666667, "loss": 0.3265, "step": 4980 }, { "epoch": 9.23, "grad_norm": 2.3825201988220215, "learning_rate": 0.0003824074074074074, "loss": 0.3635, "step": 4990 }, { "epoch": 9.25, "grad_norm": 1.7622706890106201, "learning_rate": 0.00037314814814814817, "loss": 0.3064, "step": 5000 }, { "epoch": 9.27, "grad_norm": 1.3905962705612183, "learning_rate": 0.00036388888888888886, "loss": 0.3192, "step": 5010 }, { "epoch": 9.29, "grad_norm": 1.1275396347045898, "learning_rate": 0.0003546296296296296, "loss": 0.3793, "step": 5020 }, { "epoch": 9.31, "grad_norm": 1.043797254562378, "learning_rate": 0.0003453703703703704, "loss": 0.3532, "step": 5030 }, { "epoch": 9.32, "grad_norm": 2.222782611846924, "learning_rate": 0.00033611111111111114, "loss": 0.3444, "step": 5040 }, { "epoch": 9.34, "grad_norm": 1.1857503652572632, "learning_rate": 0.0003268518518518519, "loss": 0.2676, "step": 5050 }, { "epoch": 9.36, "grad_norm": 1.542576789855957, "learning_rate": 0.00031759259259259257, "loss": 0.3942, "step": 5060 }, { "epoch": 9.38, "grad_norm": 1.9008642435073853, "learning_rate": 0.00030833333333333337, "loss": 0.3854, "step": 5070 }, { "epoch": 9.4, "grad_norm": 1.819470763206482, "learning_rate": 0.00029907407407407405, "loss": 0.3426, "step": 5080 }, { "epoch": 9.42, "grad_norm": 1.4119468927383423, "learning_rate": 0.00028981481481481485, "loss": 0.3093, "step": 5090 }, { "epoch": 9.44, "grad_norm": 2.1372573375701904, "learning_rate": 0.00028055555555555554, "loss": 0.3178, "step": 5100 }, { "epoch": 9.45, "grad_norm": 1.6984546184539795, "learning_rate": 0.0002712962962962963, "loss": 0.3057, "step": 5110 }, { "epoch": 9.47, "grad_norm": 1.6501168012619019, "learning_rate": 0.0002620370370370371, "loss": 0.3501, "step": 5120 }, { "epoch": 9.49, "grad_norm": 1.3018875122070312, "learning_rate": 0.00025277777777777777, "loss": 0.3547, "step": 5130 }, { "epoch": 9.51, "grad_norm": 1.2195825576782227, "learning_rate": 0.0002435185185185185, "loss": 0.3597, "step": 5140 }, { "epoch": 9.53, "grad_norm": 1.5462696552276611, "learning_rate": 0.00023425925925925928, "loss": 0.378, "step": 5150 }, { "epoch": 9.55, "grad_norm": 1.2776215076446533, "learning_rate": 0.000225, "loss": 0.2943, "step": 5160 }, { "epoch": 9.57, "grad_norm": 1.9827752113342285, "learning_rate": 0.00021574074074074076, "loss": 0.3208, "step": 5170 }, { "epoch": 9.58, "grad_norm": 0.7241113185882568, "learning_rate": 0.00020648148148148148, "loss": 0.3298, "step": 5180 }, { "epoch": 9.6, "grad_norm": 1.118054986000061, "learning_rate": 0.00019722222222222222, "loss": 0.3145, "step": 5190 }, { "epoch": 9.62, "grad_norm": 1.2864770889282227, "learning_rate": 0.00018796296296296296, "loss": 0.3694, "step": 5200 }, { "epoch": 9.64, "grad_norm": 2.4528350830078125, "learning_rate": 0.0001787037037037037, "loss": 0.36, "step": 5210 }, { "epoch": 9.66, "grad_norm": 1.4478658437728882, "learning_rate": 0.00016944444444444448, "loss": 0.3421, "step": 5220 }, { "epoch": 9.68, "grad_norm": 1.7859172821044922, "learning_rate": 0.0001601851851851852, "loss": 0.3438, "step": 5230 }, { "epoch": 9.69, "grad_norm": 1.2724491357803345, "learning_rate": 0.00015092592592592593, "loss": 0.3298, "step": 5240 }, { "epoch": 9.71, "grad_norm": 1.7209391593933105, "learning_rate": 0.00014166666666666665, "loss": 0.3227, "step": 5250 }, { "epoch": 9.73, "grad_norm": 1.1616936922073364, "learning_rate": 0.00013240740740740742, "loss": 0.2874, "step": 5260 }, { "epoch": 9.75, "grad_norm": 1.5946358442306519, "learning_rate": 0.00012314814814814816, "loss": 0.3292, "step": 5270 }, { "epoch": 9.77, "grad_norm": 1.7766077518463135, "learning_rate": 0.00011388888888888889, "loss": 0.3105, "step": 5280 }, { "epoch": 9.79, "grad_norm": 1.2184169292449951, "learning_rate": 0.00010462962962962962, "loss": 0.3077, "step": 5290 }, { "epoch": 9.81, "grad_norm": 1.8188039064407349, "learning_rate": 9.537037037037036e-05, "loss": 0.3829, "step": 5300 }, { "epoch": 9.82, "grad_norm": 1.7841200828552246, "learning_rate": 8.611111111111112e-05, "loss": 0.4086, "step": 5310 }, { "epoch": 9.84, "grad_norm": 2.1823301315307617, "learning_rate": 7.685185185185186e-05, "loss": 0.3443, "step": 5320 }, { "epoch": 9.86, "grad_norm": 0.7260684370994568, "learning_rate": 6.759259259259259e-05, "loss": 0.2493, "step": 5330 }, { "epoch": 9.88, "grad_norm": 1.395943522453308, "learning_rate": 5.833333333333334e-05, "loss": 0.3521, "step": 5340 }, { "epoch": 9.9, "grad_norm": 1.731939435005188, "learning_rate": 4.9074074074074075e-05, "loss": 0.3337, "step": 5350 }, { "epoch": 9.92, "grad_norm": 1.1158477067947388, "learning_rate": 3.981481481481482e-05, "loss": 0.2676, "step": 5360 }, { "epoch": 9.94, "grad_norm": 1.2401944398880005, "learning_rate": 3.055555555555556e-05, "loss": 0.3386, "step": 5370 }, { "epoch": 9.95, "grad_norm": 1.0820297002792358, "learning_rate": 2.12962962962963e-05, "loss": 0.2561, "step": 5380 }, { "epoch": 9.97, "grad_norm": 1.5358567237854004, "learning_rate": 1.2037037037037039e-05, "loss": 0.2979, "step": 5390 }, { "epoch": 9.99, "grad_norm": 1.8409550189971924, "learning_rate": 2.777777777777778e-06, "loss": 0.3111, "step": 5400 }, { "epoch": 9.99, "eval_accuracy": 0.9845940533045756, "eval_f1": 0.9874036641651607, "eval_loss": 0.04327414557337761, "eval_precision": 0.9887972691294948, "eval_recall": 0.9864475392019028, "eval_runtime": 105.3994, "eval_samples_per_second": 61.585, "eval_steps_per_second": 3.852, "step": 5400 }, { "epoch": 9.99, "step": 5400, "total_flos": 6.121843696765878e+19, "train_loss": 0.5330523451169332, "train_runtime": 12454.7917, "train_samples_per_second": 27.765, "train_steps_per_second": 0.434 } ], "logging_steps": 10, "max_steps": 5400, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 6.121843696765878e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }