{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.8361344537815127, "eval_steps": 50, "global_step": 2700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01050420168067227, "grad_norm": 2.5582146644592285, "learning_rate": 4.98249299719888e-05, "loss": 1.6787, "step": 10 }, { "epoch": 0.02100840336134454, "grad_norm": 0.9345070719718933, "learning_rate": 4.96498599439776e-05, "loss": 0.518, "step": 20 }, { "epoch": 0.031512605042016806, "grad_norm": 1.6358414888381958, "learning_rate": 4.947478991596639e-05, "loss": 0.4604, "step": 30 }, { "epoch": 0.04201680672268908, "grad_norm": 0.7778844237327576, "learning_rate": 4.9299719887955186e-05, "loss": 0.3771, "step": 40 }, { "epoch": 0.052521008403361345, "grad_norm": 0.7006077766418457, "learning_rate": 4.912464985994398e-05, "loss": 0.3842, "step": 50 }, { "epoch": 0.052521008403361345, "eval_loss": 0.42603224515914917, "eval_runtime": 13.5673, "eval_samples_per_second": 35.379, "eval_steps_per_second": 2.211, "step": 50 }, { "epoch": 0.06302521008403361, "grad_norm": 0.6415153741836548, "learning_rate": 4.8949579831932775e-05, "loss": 0.3399, "step": 60 }, { "epoch": 0.07352941176470588, "grad_norm": 0.6030780076980591, "learning_rate": 4.877450980392157e-05, "loss": 0.3447, "step": 70 }, { "epoch": 0.08403361344537816, "grad_norm": 0.688852071762085, "learning_rate": 4.859943977591036e-05, "loss": 0.3219, "step": 80 }, { "epoch": 0.09453781512605042, "grad_norm": 0.6371557712554932, "learning_rate": 4.8424369747899164e-05, "loss": 0.3379, "step": 90 }, { "epoch": 0.10504201680672269, "grad_norm": 0.7739270329475403, "learning_rate": 4.824929971988796e-05, "loss": 0.3177, "step": 100 }, { "epoch": 0.10504201680672269, "eval_loss": 0.3801896274089813, "eval_runtime": 13.6107, "eval_samples_per_second": 35.266, "eval_steps_per_second": 2.204, "step": 100 }, { "epoch": 0.11554621848739496, "grad_norm": 0.649507462978363, "learning_rate": 4.807422969187675e-05, "loss": 0.3415, "step": 110 }, { "epoch": 0.12605042016806722, "grad_norm": 0.594717264175415, "learning_rate": 4.7899159663865554e-05, "loss": 0.3325, "step": 120 }, { "epoch": 0.13655462184873948, "grad_norm": 0.627918541431427, "learning_rate": 4.772408963585435e-05, "loss": 0.3222, "step": 130 }, { "epoch": 0.14705882352941177, "grad_norm": 0.5384674668312073, "learning_rate": 4.7549019607843135e-05, "loss": 0.3426, "step": 140 }, { "epoch": 0.15756302521008403, "grad_norm": 0.5673420429229736, "learning_rate": 4.7373949579831936e-05, "loss": 0.3061, "step": 150 }, { "epoch": 0.15756302521008403, "eval_loss": 0.3653399348258972, "eval_runtime": 13.5947, "eval_samples_per_second": 35.308, "eval_steps_per_second": 2.207, "step": 150 }, { "epoch": 0.16806722689075632, "grad_norm": 0.6111018657684326, "learning_rate": 4.719887955182073e-05, "loss": 0.3271, "step": 160 }, { "epoch": 0.17857142857142858, "grad_norm": 0.7422594428062439, "learning_rate": 4.7023809523809525e-05, "loss": 0.315, "step": 170 }, { "epoch": 0.18907563025210083, "grad_norm": 0.7226534485816956, "learning_rate": 4.684873949579832e-05, "loss": 0.3031, "step": 180 }, { "epoch": 0.19957983193277312, "grad_norm": 0.6302976012229919, "learning_rate": 4.667366946778712e-05, "loss": 0.3161, "step": 190 }, { "epoch": 0.21008403361344538, "grad_norm": 0.6225076913833618, "learning_rate": 4.6498599439775914e-05, "loss": 0.3038, "step": 200 }, { "epoch": 0.21008403361344538, "eval_loss": 0.35061606764793396, "eval_runtime": 13.5616, "eval_samples_per_second": 35.394, "eval_steps_per_second": 2.212, "step": 200 }, { "epoch": 0.22058823529411764, "grad_norm": 0.6001319885253906, "learning_rate": 4.632352941176471e-05, "loss": 0.3129, "step": 210 }, { "epoch": 0.23109243697478993, "grad_norm": 0.5385990142822266, "learning_rate": 4.61484593837535e-05, "loss": 0.2991, "step": 220 }, { "epoch": 0.2415966386554622, "grad_norm": 0.4513624906539917, "learning_rate": 4.59733893557423e-05, "loss": 0.2896, "step": 230 }, { "epoch": 0.25210084033613445, "grad_norm": 0.6142160892486572, "learning_rate": 4.579831932773109e-05, "loss": 0.3059, "step": 240 }, { "epoch": 0.26260504201680673, "grad_norm": 0.6714802384376526, "learning_rate": 4.562324929971989e-05, "loss": 0.2897, "step": 250 }, { "epoch": 0.26260504201680673, "eval_loss": 0.3456435203552246, "eval_runtime": 13.5552, "eval_samples_per_second": 35.411, "eval_steps_per_second": 2.213, "step": 250 }, { "epoch": 0.27310924369747897, "grad_norm": 0.6518235206604004, "learning_rate": 4.5448179271708687e-05, "loss": 0.312, "step": 260 }, { "epoch": 0.28361344537815125, "grad_norm": 0.6250632405281067, "learning_rate": 4.527310924369748e-05, "loss": 0.2959, "step": 270 }, { "epoch": 0.29411764705882354, "grad_norm": 0.5683826804161072, "learning_rate": 4.5098039215686275e-05, "loss": 0.3027, "step": 280 }, { "epoch": 0.30462184873949577, "grad_norm": 0.560312807559967, "learning_rate": 4.4922969187675076e-05, "loss": 0.3002, "step": 290 }, { "epoch": 0.31512605042016806, "grad_norm": 0.66291743516922, "learning_rate": 4.474789915966387e-05, "loss": 0.2925, "step": 300 }, { "epoch": 0.31512605042016806, "eval_loss": 0.3431606888771057, "eval_runtime": 13.5629, "eval_samples_per_second": 35.391, "eval_steps_per_second": 2.212, "step": 300 }, { "epoch": 0.32563025210084034, "grad_norm": 0.6478439569473267, "learning_rate": 4.4572829131652665e-05, "loss": 0.2893, "step": 310 }, { "epoch": 0.33613445378151263, "grad_norm": 0.5832348465919495, "learning_rate": 4.439775910364146e-05, "loss": 0.2842, "step": 320 }, { "epoch": 0.34663865546218486, "grad_norm": 0.525932252407074, "learning_rate": 4.422268907563025e-05, "loss": 0.2837, "step": 330 }, { "epoch": 0.35714285714285715, "grad_norm": 0.5487508177757263, "learning_rate": 4.404761904761905e-05, "loss": 0.2706, "step": 340 }, { "epoch": 0.36764705882352944, "grad_norm": 0.5392388701438904, "learning_rate": 4.387254901960784e-05, "loss": 0.2835, "step": 350 }, { "epoch": 0.36764705882352944, "eval_loss": 0.33528536558151245, "eval_runtime": 13.5508, "eval_samples_per_second": 35.422, "eval_steps_per_second": 2.214, "step": 350 }, { "epoch": 0.37815126050420167, "grad_norm": 0.6706260442733765, "learning_rate": 4.369747899159664e-05, "loss": 0.2844, "step": 360 }, { "epoch": 0.38865546218487396, "grad_norm": 0.6042625904083252, "learning_rate": 4.352240896358544e-05, "loss": 0.2758, "step": 370 }, { "epoch": 0.39915966386554624, "grad_norm": 0.534008264541626, "learning_rate": 4.334733893557423e-05, "loss": 0.2918, "step": 380 }, { "epoch": 0.4096638655462185, "grad_norm": 0.48162588477134705, "learning_rate": 4.317226890756303e-05, "loss": 0.273, "step": 390 }, { "epoch": 0.42016806722689076, "grad_norm": 0.5669644474983215, "learning_rate": 4.2997198879551826e-05, "loss": 0.285, "step": 400 }, { "epoch": 0.42016806722689076, "eval_loss": 0.3348632752895355, "eval_runtime": 13.5507, "eval_samples_per_second": 35.423, "eval_steps_per_second": 2.214, "step": 400 }, { "epoch": 0.43067226890756305, "grad_norm": 0.6257824897766113, "learning_rate": 4.2822128851540614e-05, "loss": 0.299, "step": 410 }, { "epoch": 0.4411764705882353, "grad_norm": 0.5430576205253601, "learning_rate": 4.2647058823529415e-05, "loss": 0.2868, "step": 420 }, { "epoch": 0.45168067226890757, "grad_norm": 0.5633955597877502, "learning_rate": 4.247198879551821e-05, "loss": 0.2589, "step": 430 }, { "epoch": 0.46218487394957986, "grad_norm": 0.5294789671897888, "learning_rate": 4.2296918767507e-05, "loss": 0.2777, "step": 440 }, { "epoch": 0.4726890756302521, "grad_norm": 0.5480856895446777, "learning_rate": 4.21218487394958e-05, "loss": 0.2704, "step": 450 }, { "epoch": 0.4726890756302521, "eval_loss": 0.329515278339386, "eval_runtime": 13.5423, "eval_samples_per_second": 35.445, "eval_steps_per_second": 2.215, "step": 450 }, { "epoch": 0.4831932773109244, "grad_norm": 0.5051332116127014, "learning_rate": 4.19467787114846e-05, "loss": 0.2438, "step": 460 }, { "epoch": 0.49369747899159666, "grad_norm": 0.6251511573791504, "learning_rate": 4.177170868347339e-05, "loss": 0.2748, "step": 470 }, { "epoch": 0.5042016806722689, "grad_norm": 0.4729413092136383, "learning_rate": 4.159663865546219e-05, "loss": 0.2689, "step": 480 }, { "epoch": 0.5147058823529411, "grad_norm": 0.5220003724098206, "learning_rate": 4.142156862745099e-05, "loss": 0.2899, "step": 490 }, { "epoch": 0.5252100840336135, "grad_norm": 0.54283207654953, "learning_rate": 4.1246498599439776e-05, "loss": 0.272, "step": 500 }, { "epoch": 0.5252100840336135, "eval_loss": 0.32714489102363586, "eval_runtime": 13.5497, "eval_samples_per_second": 35.425, "eval_steps_per_second": 2.214, "step": 500 }, { "epoch": 0.5357142857142857, "grad_norm": 0.5851682424545288, "learning_rate": 4.107142857142857e-05, "loss": 0.2691, "step": 510 }, { "epoch": 0.5462184873949579, "grad_norm": 0.6026607751846313, "learning_rate": 4.089635854341737e-05, "loss": 0.2716, "step": 520 }, { "epoch": 0.5567226890756303, "grad_norm": 0.522422730922699, "learning_rate": 4.0721288515406165e-05, "loss": 0.2774, "step": 530 }, { "epoch": 0.5672268907563025, "grad_norm": 0.516901433467865, "learning_rate": 4.054621848739496e-05, "loss": 0.2726, "step": 540 }, { "epoch": 0.5777310924369747, "grad_norm": 0.667030394077301, "learning_rate": 4.0371148459383754e-05, "loss": 0.2622, "step": 550 }, { "epoch": 0.5777310924369747, "eval_loss": 0.3262839615345001, "eval_runtime": 13.5448, "eval_samples_per_second": 35.438, "eval_steps_per_second": 2.215, "step": 550 }, { "epoch": 0.5882352941176471, "grad_norm": 0.542658269405365, "learning_rate": 4.0196078431372555e-05, "loss": 0.2572, "step": 560 }, { "epoch": 0.5987394957983193, "grad_norm": 0.5408573746681213, "learning_rate": 4.002100840336135e-05, "loss": 0.2636, "step": 570 }, { "epoch": 0.6092436974789915, "grad_norm": 0.5691037774085999, "learning_rate": 3.984593837535014e-05, "loss": 0.268, "step": 580 }, { "epoch": 0.6197478991596639, "grad_norm": 0.5530794858932495, "learning_rate": 3.967086834733894e-05, "loss": 0.2583, "step": 590 }, { "epoch": 0.6302521008403361, "grad_norm": 0.546229362487793, "learning_rate": 3.949579831932773e-05, "loss": 0.2622, "step": 600 }, { "epoch": 0.6302521008403361, "eval_loss": 0.3219989836215973, "eval_runtime": 13.5524, "eval_samples_per_second": 35.418, "eval_steps_per_second": 2.214, "step": 600 }, { "epoch": 0.6407563025210085, "grad_norm": 0.5098925232887268, "learning_rate": 3.9320728291316526e-05, "loss": 0.2553, "step": 610 }, { "epoch": 0.6512605042016807, "grad_norm": 0.5201871991157532, "learning_rate": 3.914565826330533e-05, "loss": 0.2584, "step": 620 }, { "epoch": 0.6617647058823529, "grad_norm": 0.47408100962638855, "learning_rate": 3.897058823529412e-05, "loss": 0.2686, "step": 630 }, { "epoch": 0.6722689075630253, "grad_norm": 0.5591098666191101, "learning_rate": 3.8795518207282915e-05, "loss": 0.2772, "step": 640 }, { "epoch": 0.6827731092436975, "grad_norm": 0.5344163179397583, "learning_rate": 3.862044817927171e-05, "loss": 0.263, "step": 650 }, { "epoch": 0.6827731092436975, "eval_loss": 0.31990015506744385, "eval_runtime": 13.5501, "eval_samples_per_second": 35.424, "eval_steps_per_second": 2.214, "step": 650 }, { "epoch": 0.6932773109243697, "grad_norm": 0.6538853049278259, "learning_rate": 3.844537815126051e-05, "loss": 0.2605, "step": 660 }, { "epoch": 0.7037815126050421, "grad_norm": 0.43679994344711304, "learning_rate": 3.82703081232493e-05, "loss": 0.2486, "step": 670 }, { "epoch": 0.7142857142857143, "grad_norm": 0.5602915287017822, "learning_rate": 3.809523809523809e-05, "loss": 0.2359, "step": 680 }, { "epoch": 0.7247899159663865, "grad_norm": 0.5354353189468384, "learning_rate": 3.792016806722689e-05, "loss": 0.2541, "step": 690 }, { "epoch": 0.7352941176470589, "grad_norm": 0.5954485535621643, "learning_rate": 3.774509803921569e-05, "loss": 0.2649, "step": 700 }, { "epoch": 0.7352941176470589, "eval_loss": 0.3204093873500824, "eval_runtime": 13.5523, "eval_samples_per_second": 35.418, "eval_steps_per_second": 2.214, "step": 700 }, { "epoch": 0.7457983193277311, "grad_norm": 0.6093761324882507, "learning_rate": 3.757002801120448e-05, "loss": 0.2456, "step": 710 }, { "epoch": 0.7563025210084033, "grad_norm": 0.4796586334705353, "learning_rate": 3.739495798319328e-05, "loss": 0.253, "step": 720 }, { "epoch": 0.7668067226890757, "grad_norm": 0.5846813917160034, "learning_rate": 3.721988795518208e-05, "loss": 0.2442, "step": 730 }, { "epoch": 0.7773109243697479, "grad_norm": 0.4811939597129822, "learning_rate": 3.704481792717087e-05, "loss": 0.2522, "step": 740 }, { "epoch": 0.7878151260504201, "grad_norm": 0.5718042850494385, "learning_rate": 3.6869747899159665e-05, "loss": 0.2562, "step": 750 }, { "epoch": 0.7878151260504201, "eval_loss": 0.32099905610084534, "eval_runtime": 13.5408, "eval_samples_per_second": 35.448, "eval_steps_per_second": 2.216, "step": 750 }, { "epoch": 0.7983193277310925, "grad_norm": 0.5630698204040527, "learning_rate": 3.669467787114846e-05, "loss": 0.2476, "step": 760 }, { "epoch": 0.8088235294117647, "grad_norm": 0.6513442397117615, "learning_rate": 3.6519607843137254e-05, "loss": 0.2544, "step": 770 }, { "epoch": 0.819327731092437, "grad_norm": 0.6139647960662842, "learning_rate": 3.634453781512605e-05, "loss": 0.258, "step": 780 }, { "epoch": 0.8298319327731093, "grad_norm": 0.5916554927825928, "learning_rate": 3.616946778711485e-05, "loss": 0.2415, "step": 790 }, { "epoch": 0.8403361344537815, "grad_norm": 0.5163634419441223, "learning_rate": 3.5994397759103643e-05, "loss": 0.252, "step": 800 }, { "epoch": 0.8403361344537815, "eval_loss": 0.3215568959712982, "eval_runtime": 13.5293, "eval_samples_per_second": 35.479, "eval_steps_per_second": 2.217, "step": 800 }, { "epoch": 0.8508403361344538, "grad_norm": 0.5768859386444092, "learning_rate": 3.581932773109244e-05, "loss": 0.2421, "step": 810 }, { "epoch": 0.8613445378151261, "grad_norm": 0.6197952032089233, "learning_rate": 3.564425770308123e-05, "loss": 0.2672, "step": 820 }, { "epoch": 0.8718487394957983, "grad_norm": 0.5396980047225952, "learning_rate": 3.546918767507003e-05, "loss": 0.2393, "step": 830 }, { "epoch": 0.8823529411764706, "grad_norm": 0.5783377885818481, "learning_rate": 3.529411764705883e-05, "loss": 0.2374, "step": 840 }, { "epoch": 0.8928571428571429, "grad_norm": 0.5808666944503784, "learning_rate": 3.511904761904762e-05, "loss": 0.2405, "step": 850 }, { "epoch": 0.8928571428571429, "eval_loss": 0.3207303583621979, "eval_runtime": 13.5299, "eval_samples_per_second": 35.477, "eval_steps_per_second": 2.217, "step": 850 }, { "epoch": 0.9033613445378151, "grad_norm": 0.4931146204471588, "learning_rate": 3.4943977591036416e-05, "loss": 0.2382, "step": 860 }, { "epoch": 0.9138655462184874, "grad_norm": 0.6456460952758789, "learning_rate": 3.476890756302521e-05, "loss": 0.2416, "step": 870 }, { "epoch": 0.9243697478991597, "grad_norm": 0.5459381937980652, "learning_rate": 3.4593837535014004e-05, "loss": 0.2611, "step": 880 }, { "epoch": 0.9348739495798319, "grad_norm": 0.5317162275314331, "learning_rate": 3.4418767507002805e-05, "loss": 0.2383, "step": 890 }, { "epoch": 0.9453781512605042, "grad_norm": 0.5790566205978394, "learning_rate": 3.42436974789916e-05, "loss": 0.2455, "step": 900 }, { "epoch": 0.9453781512605042, "eval_loss": 0.31993839144706726, "eval_runtime": 13.5323, "eval_samples_per_second": 35.471, "eval_steps_per_second": 2.217, "step": 900 }, { "epoch": 0.9558823529411765, "grad_norm": 0.5805277228355408, "learning_rate": 3.4068627450980394e-05, "loss": 0.2393, "step": 910 }, { "epoch": 0.9663865546218487, "grad_norm": 0.6085871458053589, "learning_rate": 3.389355742296919e-05, "loss": 0.2505, "step": 920 }, { "epoch": 0.976890756302521, "grad_norm": 0.6251375079154968, "learning_rate": 3.371848739495799e-05, "loss": 0.2378, "step": 930 }, { "epoch": 0.9873949579831933, "grad_norm": 0.6169071197509766, "learning_rate": 3.3543417366946776e-05, "loss": 0.239, "step": 940 }, { "epoch": 0.9978991596638656, "grad_norm": 0.5361204147338867, "learning_rate": 3.336834733893557e-05, "loss": 0.2484, "step": 950 }, { "epoch": 0.9978991596638656, "eval_loss": 0.31845951080322266, "eval_runtime": 13.5483, "eval_samples_per_second": 35.429, "eval_steps_per_second": 2.214, "step": 950 }, { "epoch": 1.0084033613445378, "grad_norm": 0.47690704464912415, "learning_rate": 3.319327731092437e-05, "loss": 0.2006, "step": 960 }, { "epoch": 1.01890756302521, "grad_norm": 0.7460448741912842, "learning_rate": 3.3018207282913166e-05, "loss": 0.1983, "step": 970 }, { "epoch": 1.0294117647058822, "grad_norm": 0.5729458332061768, "learning_rate": 3.284313725490196e-05, "loss": 0.1989, "step": 980 }, { "epoch": 1.0399159663865547, "grad_norm": 0.5524929761886597, "learning_rate": 3.266806722689076e-05, "loss": 0.2039, "step": 990 }, { "epoch": 1.050420168067227, "grad_norm": 0.6426274180412292, "learning_rate": 3.2492997198879555e-05, "loss": 0.2184, "step": 1000 }, { "epoch": 1.050420168067227, "eval_loss": 0.3305407762527466, "eval_runtime": 13.5592, "eval_samples_per_second": 35.4, "eval_steps_per_second": 2.213, "step": 1000 }, { "epoch": 1.0609243697478992, "grad_norm": 0.4944634437561035, "learning_rate": 3.231792717086835e-05, "loss": 0.2042, "step": 1010 }, { "epoch": 1.0714285714285714, "grad_norm": 0.5576530694961548, "learning_rate": 3.2142857142857144e-05, "loss": 0.2029, "step": 1020 }, { "epoch": 1.0819327731092436, "grad_norm": 0.674849271774292, "learning_rate": 3.196778711484594e-05, "loss": 0.2026, "step": 1030 }, { "epoch": 1.092436974789916, "grad_norm": 0.5424471497535706, "learning_rate": 3.179271708683473e-05, "loss": 0.2027, "step": 1040 }, { "epoch": 1.1029411764705883, "grad_norm": 0.6491550207138062, "learning_rate": 3.161764705882353e-05, "loss": 0.2029, "step": 1050 }, { "epoch": 1.1029411764705883, "eval_loss": 0.33292290568351746, "eval_runtime": 13.5611, "eval_samples_per_second": 35.395, "eval_steps_per_second": 2.212, "step": 1050 }, { "epoch": 1.1134453781512605, "grad_norm": 0.6113711595535278, "learning_rate": 3.144257703081233e-05, "loss": 0.2, "step": 1060 }, { "epoch": 1.1239495798319328, "grad_norm": 0.5068053603172302, "learning_rate": 3.126750700280112e-05, "loss": 0.1903, "step": 1070 }, { "epoch": 1.134453781512605, "grad_norm": 0.6518192291259766, "learning_rate": 3.1092436974789916e-05, "loss": 0.2006, "step": 1080 }, { "epoch": 1.1449579831932772, "grad_norm": 0.6932762861251831, "learning_rate": 3.091736694677872e-05, "loss": 0.2051, "step": 1090 }, { "epoch": 1.1554621848739495, "grad_norm": 0.5372537970542908, "learning_rate": 3.074229691876751e-05, "loss": 0.2037, "step": 1100 }, { "epoch": 1.1554621848739495, "eval_loss": 0.33518460392951965, "eval_runtime": 13.5628, "eval_samples_per_second": 35.391, "eval_steps_per_second": 2.212, "step": 1100 }, { "epoch": 1.165966386554622, "grad_norm": 0.6258675456047058, "learning_rate": 3.0567226890756306e-05, "loss": 0.1953, "step": 1110 }, { "epoch": 1.1764705882352942, "grad_norm": 0.5910756587982178, "learning_rate": 3.0392156862745097e-05, "loss": 0.2098, "step": 1120 }, { "epoch": 1.1869747899159664, "grad_norm": 0.6931313276290894, "learning_rate": 3.0217086834733894e-05, "loss": 0.2022, "step": 1130 }, { "epoch": 1.1974789915966386, "grad_norm": 0.6102430820465088, "learning_rate": 3.004201680672269e-05, "loss": 0.1946, "step": 1140 }, { "epoch": 1.2079831932773109, "grad_norm": 0.6068236827850342, "learning_rate": 2.9866946778711486e-05, "loss": 0.2089, "step": 1150 }, { "epoch": 1.2079831932773109, "eval_loss": 0.3338312804698944, "eval_runtime": 13.5402, "eval_samples_per_second": 35.45, "eval_steps_per_second": 2.216, "step": 1150 }, { "epoch": 1.2184873949579833, "grad_norm": 0.6477882862091064, "learning_rate": 2.969187675070028e-05, "loss": 0.2065, "step": 1160 }, { "epoch": 1.2289915966386555, "grad_norm": 0.6269820928573608, "learning_rate": 2.9516806722689078e-05, "loss": 0.1926, "step": 1170 }, { "epoch": 1.2394957983193278, "grad_norm": 0.6708040833473206, "learning_rate": 2.9341736694677872e-05, "loss": 0.1983, "step": 1180 }, { "epoch": 1.25, "grad_norm": 0.6625474095344543, "learning_rate": 2.916666666666667e-05, "loss": 0.1967, "step": 1190 }, { "epoch": 1.2605042016806722, "grad_norm": 0.5640371441841125, "learning_rate": 2.8991596638655467e-05, "loss": 0.205, "step": 1200 }, { "epoch": 1.2605042016806722, "eval_loss": 0.33764752745628357, "eval_runtime": 13.5241, "eval_samples_per_second": 35.492, "eval_steps_per_second": 2.218, "step": 1200 }, { "epoch": 1.2710084033613445, "grad_norm": 0.590160608291626, "learning_rate": 2.8816526610644258e-05, "loss": 0.2008, "step": 1210 }, { "epoch": 1.2815126050420167, "grad_norm": 0.6975618004798889, "learning_rate": 2.8641456582633052e-05, "loss": 0.1956, "step": 1220 }, { "epoch": 1.2920168067226891, "grad_norm": 0.6742042899131775, "learning_rate": 2.846638655462185e-05, "loss": 0.2089, "step": 1230 }, { "epoch": 1.3025210084033614, "grad_norm": 0.5806481838226318, "learning_rate": 2.8291316526610644e-05, "loss": 0.1937, "step": 1240 }, { "epoch": 1.3130252100840336, "grad_norm": 0.9894171953201294, "learning_rate": 2.8116246498599442e-05, "loss": 0.2098, "step": 1250 }, { "epoch": 1.3130252100840336, "eval_loss": 0.3376815617084503, "eval_runtime": 13.503, "eval_samples_per_second": 35.548, "eval_steps_per_second": 2.222, "step": 1250 }, { "epoch": 1.3235294117647058, "grad_norm": 0.8022045493125916, "learning_rate": 2.7941176470588236e-05, "loss": 0.2043, "step": 1260 }, { "epoch": 1.334033613445378, "grad_norm": 0.687003493309021, "learning_rate": 2.7766106442577034e-05, "loss": 0.2023, "step": 1270 }, { "epoch": 1.3445378151260505, "grad_norm": 0.7270589470863342, "learning_rate": 2.7591036414565828e-05, "loss": 0.1994, "step": 1280 }, { "epoch": 1.3550420168067228, "grad_norm": 0.5873326063156128, "learning_rate": 2.7415966386554626e-05, "loss": 0.1904, "step": 1290 }, { "epoch": 1.365546218487395, "grad_norm": 0.6190339922904968, "learning_rate": 2.7240896358543417e-05, "loss": 0.198, "step": 1300 }, { "epoch": 1.365546218487395, "eval_loss": 0.3378330171108246, "eval_runtime": 13.4926, "eval_samples_per_second": 35.575, "eval_steps_per_second": 2.223, "step": 1300 }, { "epoch": 1.3760504201680672, "grad_norm": 0.6693372130393982, "learning_rate": 2.706582633053221e-05, "loss": 0.1953, "step": 1310 }, { "epoch": 1.3865546218487395, "grad_norm": 0.5310758352279663, "learning_rate": 2.689075630252101e-05, "loss": 0.2013, "step": 1320 }, { "epoch": 1.3970588235294117, "grad_norm": 0.7299119830131531, "learning_rate": 2.6715686274509806e-05, "loss": 0.1967, "step": 1330 }, { "epoch": 1.407563025210084, "grad_norm": 0.7364310026168823, "learning_rate": 2.65406162464986e-05, "loss": 0.1996, "step": 1340 }, { "epoch": 1.4180672268907564, "grad_norm": 0.6624464988708496, "learning_rate": 2.6365546218487398e-05, "loss": 0.1889, "step": 1350 }, { "epoch": 1.4180672268907564, "eval_loss": 0.33763712644577026, "eval_runtime": 13.5125, "eval_samples_per_second": 35.523, "eval_steps_per_second": 2.22, "step": 1350 }, { "epoch": 1.4285714285714286, "grad_norm": 0.6339524984359741, "learning_rate": 2.6190476190476192e-05, "loss": 0.1898, "step": 1360 }, { "epoch": 1.4390756302521008, "grad_norm": 0.7600531578063965, "learning_rate": 2.601540616246499e-05, "loss": 0.1976, "step": 1370 }, { "epoch": 1.449579831932773, "grad_norm": 0.7129259705543518, "learning_rate": 2.5840336134453784e-05, "loss": 0.1978, "step": 1380 }, { "epoch": 1.4600840336134453, "grad_norm": 0.6824067234992981, "learning_rate": 2.5665266106442575e-05, "loss": 0.1917, "step": 1390 }, { "epoch": 1.4705882352941178, "grad_norm": 0.8596220016479492, "learning_rate": 2.5490196078431373e-05, "loss": 0.1821, "step": 1400 }, { "epoch": 1.4705882352941178, "eval_loss": 0.3433762192726135, "eval_runtime": 13.5224, "eval_samples_per_second": 35.497, "eval_steps_per_second": 2.219, "step": 1400 }, { "epoch": 1.48109243697479, "grad_norm": 0.5555725693702698, "learning_rate": 2.5315126050420167e-05, "loss": 0.1952, "step": 1410 }, { "epoch": 1.4915966386554622, "grad_norm": 0.7239671349525452, "learning_rate": 2.5140056022408964e-05, "loss": 0.1989, "step": 1420 }, { "epoch": 1.5021008403361344, "grad_norm": 0.70365971326828, "learning_rate": 2.4964985994397762e-05, "loss": 0.1944, "step": 1430 }, { "epoch": 1.5126050420168067, "grad_norm": 0.8063983917236328, "learning_rate": 2.4789915966386556e-05, "loss": 0.1964, "step": 1440 }, { "epoch": 1.523109243697479, "grad_norm": 0.7536810636520386, "learning_rate": 2.4614845938375354e-05, "loss": 0.2004, "step": 1450 }, { "epoch": 1.523109243697479, "eval_loss": 0.3417983055114746, "eval_runtime": 13.5372, "eval_samples_per_second": 35.458, "eval_steps_per_second": 2.216, "step": 1450 }, { "epoch": 1.5336134453781511, "grad_norm": 0.608958899974823, "learning_rate": 2.4439775910364145e-05, "loss": 0.1907, "step": 1460 }, { "epoch": 1.5441176470588234, "grad_norm": 0.6138647794723511, "learning_rate": 2.4264705882352942e-05, "loss": 0.193, "step": 1470 }, { "epoch": 1.5546218487394958, "grad_norm": 0.8067657947540283, "learning_rate": 2.4089635854341737e-05, "loss": 0.188, "step": 1480 }, { "epoch": 1.565126050420168, "grad_norm": 0.7985292077064514, "learning_rate": 2.3914565826330534e-05, "loss": 0.197, "step": 1490 }, { "epoch": 1.5756302521008403, "grad_norm": 0.697371244430542, "learning_rate": 2.373949579831933e-05, "loss": 0.203, "step": 1500 }, { "epoch": 1.5756302521008403, "eval_loss": 0.34146031737327576, "eval_runtime": 13.5585, "eval_samples_per_second": 35.402, "eval_steps_per_second": 2.213, "step": 1500 }, { "epoch": 1.5861344537815127, "grad_norm": 0.6977027058601379, "learning_rate": 2.3564425770308123e-05, "loss": 0.197, "step": 1510 }, { "epoch": 1.596638655462185, "grad_norm": 0.80245441198349, "learning_rate": 2.338935574229692e-05, "loss": 0.1876, "step": 1520 }, { "epoch": 1.6071428571428572, "grad_norm": 0.7472719550132751, "learning_rate": 2.3214285714285715e-05, "loss": 0.1936, "step": 1530 }, { "epoch": 1.6176470588235294, "grad_norm": 0.7296733260154724, "learning_rate": 2.303921568627451e-05, "loss": 0.1762, "step": 1540 }, { "epoch": 1.6281512605042017, "grad_norm": 0.7784711718559265, "learning_rate": 2.2864145658263307e-05, "loss": 0.1828, "step": 1550 }, { "epoch": 1.6281512605042017, "eval_loss": 0.34527555108070374, "eval_runtime": 13.5577, "eval_samples_per_second": 35.404, "eval_steps_per_second": 2.213, "step": 1550 }, { "epoch": 1.638655462184874, "grad_norm": 0.8349794149398804, "learning_rate": 2.26890756302521e-05, "loss": 0.1904, "step": 1560 }, { "epoch": 1.6491596638655461, "grad_norm": 0.7088022828102112, "learning_rate": 2.25140056022409e-05, "loss": 0.1863, "step": 1570 }, { "epoch": 1.6596638655462184, "grad_norm": 0.6738799810409546, "learning_rate": 2.2338935574229693e-05, "loss": 0.1847, "step": 1580 }, { "epoch": 1.6701680672268906, "grad_norm": 0.7299010157585144, "learning_rate": 2.2163865546218487e-05, "loss": 0.1892, "step": 1590 }, { "epoch": 1.680672268907563, "grad_norm": 0.9265356063842773, "learning_rate": 2.1988795518207285e-05, "loss": 0.1916, "step": 1600 }, { "epoch": 1.680672268907563, "eval_loss": 0.34289926290512085, "eval_runtime": 13.5614, "eval_samples_per_second": 35.394, "eval_steps_per_second": 2.212, "step": 1600 }, { "epoch": 1.6911764705882353, "grad_norm": 0.6903817653656006, "learning_rate": 2.181372549019608e-05, "loss": 0.1849, "step": 1610 }, { "epoch": 1.7016806722689075, "grad_norm": 0.7027392983436584, "learning_rate": 2.1638655462184876e-05, "loss": 0.1782, "step": 1620 }, { "epoch": 1.71218487394958, "grad_norm": 0.6882468461990356, "learning_rate": 2.146358543417367e-05, "loss": 0.1849, "step": 1630 }, { "epoch": 1.7226890756302522, "grad_norm": 0.7120645642280579, "learning_rate": 2.1288515406162465e-05, "loss": 0.1916, "step": 1640 }, { "epoch": 1.7331932773109244, "grad_norm": 0.7592134475708008, "learning_rate": 2.1113445378151263e-05, "loss": 0.1748, "step": 1650 }, { "epoch": 1.7331932773109244, "eval_loss": 0.34829968214035034, "eval_runtime": 13.5535, "eval_samples_per_second": 35.415, "eval_steps_per_second": 2.213, "step": 1650 }, { "epoch": 1.7436974789915967, "grad_norm": 0.6609721183776855, "learning_rate": 2.0938375350140057e-05, "loss": 0.1877, "step": 1660 }, { "epoch": 1.754201680672269, "grad_norm": 0.8183338046073914, "learning_rate": 2.0763305322128854e-05, "loss": 0.1795, "step": 1670 }, { "epoch": 1.7647058823529411, "grad_norm": 0.6892154812812805, "learning_rate": 2.058823529411765e-05, "loss": 0.1812, "step": 1680 }, { "epoch": 1.7752100840336134, "grad_norm": 0.618888258934021, "learning_rate": 2.0413165266106443e-05, "loss": 0.1852, "step": 1690 }, { "epoch": 1.7857142857142856, "grad_norm": 0.6212893128395081, "learning_rate": 2.023809523809524e-05, "loss": 0.1935, "step": 1700 }, { "epoch": 1.7857142857142856, "eval_loss": 0.34380197525024414, "eval_runtime": 13.526, "eval_samples_per_second": 35.487, "eval_steps_per_second": 2.218, "step": 1700 }, { "epoch": 1.7962184873949578, "grad_norm": 0.6889869570732117, "learning_rate": 2.0063025210084035e-05, "loss": 0.182, "step": 1710 }, { "epoch": 1.8067226890756303, "grad_norm": 0.6283136606216431, "learning_rate": 1.988795518207283e-05, "loss": 0.1837, "step": 1720 }, { "epoch": 1.8172268907563025, "grad_norm": 0.6092699766159058, "learning_rate": 1.9712885154061627e-05, "loss": 0.178, "step": 1730 }, { "epoch": 1.8277310924369747, "grad_norm": 0.665622889995575, "learning_rate": 1.953781512605042e-05, "loss": 0.1785, "step": 1740 }, { "epoch": 1.8382352941176472, "grad_norm": 0.8595823049545288, "learning_rate": 1.936274509803922e-05, "loss": 0.184, "step": 1750 }, { "epoch": 1.8382352941176472, "eval_loss": 0.35087138414382935, "eval_runtime": 13.5028, "eval_samples_per_second": 35.548, "eval_steps_per_second": 2.222, "step": 1750 }, { "epoch": 1.8487394957983194, "grad_norm": 0.8481118679046631, "learning_rate": 1.9187675070028013e-05, "loss": 0.1841, "step": 1760 }, { "epoch": 1.8592436974789917, "grad_norm": 0.6994073987007141, "learning_rate": 1.9012605042016807e-05, "loss": 0.1765, "step": 1770 }, { "epoch": 1.8697478991596639, "grad_norm": 0.6001220941543579, "learning_rate": 1.88375350140056e-05, "loss": 0.1807, "step": 1780 }, { "epoch": 1.8802521008403361, "grad_norm": 0.7995849847793579, "learning_rate": 1.86624649859944e-05, "loss": 0.1731, "step": 1790 }, { "epoch": 1.8907563025210083, "grad_norm": 0.6178213953971863, "learning_rate": 1.8487394957983196e-05, "loss": 0.1723, "step": 1800 }, { "epoch": 1.8907563025210083, "eval_loss": 0.35629966855049133, "eval_runtime": 13.4945, "eval_samples_per_second": 35.57, "eval_steps_per_second": 2.223, "step": 1800 }, { "epoch": 1.9012605042016806, "grad_norm": 0.6287909746170044, "learning_rate": 1.8312324929971987e-05, "loss": 0.1756, "step": 1810 }, { "epoch": 1.9117647058823528, "grad_norm": 0.7970178723335266, "learning_rate": 1.8137254901960785e-05, "loss": 0.1815, "step": 1820 }, { "epoch": 1.9222689075630253, "grad_norm": 0.7358176708221436, "learning_rate": 1.796218487394958e-05, "loss": 0.1816, "step": 1830 }, { "epoch": 1.9327731092436975, "grad_norm": 0.7053601741790771, "learning_rate": 1.7787114845938377e-05, "loss": 0.1711, "step": 1840 }, { "epoch": 1.9432773109243697, "grad_norm": 0.7120839953422546, "learning_rate": 1.7612044817927174e-05, "loss": 0.1737, "step": 1850 }, { "epoch": 1.9432773109243697, "eval_loss": 0.35368964076042175, "eval_runtime": 13.499, "eval_samples_per_second": 35.558, "eval_steps_per_second": 2.222, "step": 1850 }, { "epoch": 1.9537815126050422, "grad_norm": 0.6667950749397278, "learning_rate": 1.7436974789915965e-05, "loss": 0.1729, "step": 1860 }, { "epoch": 1.9642857142857144, "grad_norm": 0.6258378028869629, "learning_rate": 1.7261904761904763e-05, "loss": 0.1811, "step": 1870 }, { "epoch": 1.9747899159663866, "grad_norm": 0.7531097531318665, "learning_rate": 1.7086834733893557e-05, "loss": 0.1742, "step": 1880 }, { "epoch": 1.9852941176470589, "grad_norm": 0.7142496109008789, "learning_rate": 1.6911764705882355e-05, "loss": 0.1738, "step": 1890 }, { "epoch": 1.995798319327731, "grad_norm": 0.7428011298179626, "learning_rate": 1.673669467787115e-05, "loss": 0.1694, "step": 1900 }, { "epoch": 1.995798319327731, "eval_loss": 0.3620971739292145, "eval_runtime": 13.5186, "eval_samples_per_second": 35.507, "eval_steps_per_second": 2.219, "step": 1900 }, { "epoch": 2.0063025210084033, "grad_norm": 0.5604604482650757, "learning_rate": 1.6561624649859943e-05, "loss": 0.1524, "step": 1910 }, { "epoch": 2.0168067226890756, "grad_norm": 0.8657930493354797, "learning_rate": 1.638655462184874e-05, "loss": 0.1496, "step": 1920 }, { "epoch": 2.027310924369748, "grad_norm": 0.642977237701416, "learning_rate": 1.6211484593837535e-05, "loss": 0.1452, "step": 1930 }, { "epoch": 2.03781512605042, "grad_norm": 0.7766443490982056, "learning_rate": 1.6036414565826333e-05, "loss": 0.1474, "step": 1940 }, { "epoch": 2.0483193277310923, "grad_norm": 0.6917587518692017, "learning_rate": 1.5861344537815127e-05, "loss": 0.149, "step": 1950 }, { "epoch": 2.0483193277310923, "eval_loss": 0.3917512893676758, "eval_runtime": 13.5475, "eval_samples_per_second": 35.431, "eval_steps_per_second": 2.214, "step": 1950 }, { "epoch": 2.0588235294117645, "grad_norm": 0.7516221404075623, "learning_rate": 1.568627450980392e-05, "loss": 0.1507, "step": 1960 }, { "epoch": 2.069327731092437, "grad_norm": 0.8124748468399048, "learning_rate": 1.551120448179272e-05, "loss": 0.1478, "step": 1970 }, { "epoch": 2.0798319327731094, "grad_norm": 0.664069652557373, "learning_rate": 1.5336134453781513e-05, "loss": 0.1505, "step": 1980 }, { "epoch": 2.0903361344537816, "grad_norm": 0.6899878978729248, "learning_rate": 1.5161064425770307e-05, "loss": 0.1472, "step": 1990 }, { "epoch": 2.100840336134454, "grad_norm": 0.680801510810852, "learning_rate": 1.4985994397759103e-05, "loss": 0.1527, "step": 2000 }, { "epoch": 2.100840336134454, "eval_loss": 0.3939915895462036, "eval_runtime": 13.5573, "eval_samples_per_second": 35.405, "eval_steps_per_second": 2.213, "step": 2000 }, { "epoch": 2.111344537815126, "grad_norm": 0.5556862950325012, "learning_rate": 1.48109243697479e-05, "loss": 0.1476, "step": 2010 }, { "epoch": 2.1218487394957983, "grad_norm": 0.6164736151695251, "learning_rate": 1.4635854341736697e-05, "loss": 0.1471, "step": 2020 }, { "epoch": 2.1323529411764706, "grad_norm": 0.7886316180229187, "learning_rate": 1.4460784313725493e-05, "loss": 0.1447, "step": 2030 }, { "epoch": 2.142857142857143, "grad_norm": 0.780383825302124, "learning_rate": 1.4285714285714285e-05, "loss": 0.1456, "step": 2040 }, { "epoch": 2.153361344537815, "grad_norm": 0.8698550462722778, "learning_rate": 1.4110644257703081e-05, "loss": 0.1529, "step": 2050 }, { "epoch": 2.153361344537815, "eval_loss": 0.4003276228904724, "eval_runtime": 13.5575, "eval_samples_per_second": 35.405, "eval_steps_per_second": 2.213, "step": 2050 }, { "epoch": 2.1638655462184873, "grad_norm": 0.709811806678772, "learning_rate": 1.3935574229691877e-05, "loss": 0.1434, "step": 2060 }, { "epoch": 2.1743697478991595, "grad_norm": 0.6641806364059448, "learning_rate": 1.3760504201680673e-05, "loss": 0.1479, "step": 2070 }, { "epoch": 2.184873949579832, "grad_norm": 0.6883408427238464, "learning_rate": 1.3585434173669467e-05, "loss": 0.152, "step": 2080 }, { "epoch": 2.1953781512605044, "grad_norm": 0.754300057888031, "learning_rate": 1.3410364145658263e-05, "loss": 0.1451, "step": 2090 }, { "epoch": 2.2058823529411766, "grad_norm": 0.7174961566925049, "learning_rate": 1.323529411764706e-05, "loss": 0.1469, "step": 2100 }, { "epoch": 2.2058823529411766, "eval_loss": 0.40417909622192383, "eval_runtime": 13.5536, "eval_samples_per_second": 35.415, "eval_steps_per_second": 2.213, "step": 2100 }, { "epoch": 2.216386554621849, "grad_norm": 0.8382938504219055, "learning_rate": 1.3060224089635855e-05, "loss": 0.145, "step": 2110 }, { "epoch": 2.226890756302521, "grad_norm": 0.6055848002433777, "learning_rate": 1.288515406162465e-05, "loss": 0.1415, "step": 2120 }, { "epoch": 2.2373949579831933, "grad_norm": 0.6858454942703247, "learning_rate": 1.2710084033613445e-05, "loss": 0.1477, "step": 2130 }, { "epoch": 2.2478991596638656, "grad_norm": 0.825764000415802, "learning_rate": 1.2535014005602241e-05, "loss": 0.1454, "step": 2140 }, { "epoch": 2.258403361344538, "grad_norm": 0.7165321707725525, "learning_rate": 1.2359943977591037e-05, "loss": 0.1583, "step": 2150 }, { "epoch": 2.258403361344538, "eval_loss": 0.4027842581272125, "eval_runtime": 13.5575, "eval_samples_per_second": 35.405, "eval_steps_per_second": 2.213, "step": 2150 }, { "epoch": 2.26890756302521, "grad_norm": 0.8113967776298523, "learning_rate": 1.2184873949579832e-05, "loss": 0.146, "step": 2160 }, { "epoch": 2.2794117647058822, "grad_norm": 0.5904905796051025, "learning_rate": 1.200980392156863e-05, "loss": 0.1445, "step": 2170 }, { "epoch": 2.2899159663865545, "grad_norm": 0.6510922908782959, "learning_rate": 1.1834733893557423e-05, "loss": 0.1417, "step": 2180 }, { "epoch": 2.3004201680672267, "grad_norm": 1.0300558805465698, "learning_rate": 1.165966386554622e-05, "loss": 0.1445, "step": 2190 }, { "epoch": 2.310924369747899, "grad_norm": 0.8051159977912903, "learning_rate": 1.1484593837535014e-05, "loss": 0.1441, "step": 2200 }, { "epoch": 2.310924369747899, "eval_loss": 0.4135133922100067, "eval_runtime": 13.5402, "eval_samples_per_second": 35.45, "eval_steps_per_second": 2.216, "step": 2200 }, { "epoch": 2.3214285714285716, "grad_norm": 0.7365185022354126, "learning_rate": 1.130952380952381e-05, "loss": 0.1445, "step": 2210 }, { "epoch": 2.331932773109244, "grad_norm": 0.6721594929695129, "learning_rate": 1.1134453781512606e-05, "loss": 0.1417, "step": 2220 }, { "epoch": 2.342436974789916, "grad_norm": 0.9220572113990784, "learning_rate": 1.0959383753501401e-05, "loss": 0.1455, "step": 2230 }, { "epoch": 2.3529411764705883, "grad_norm": 0.7778609395027161, "learning_rate": 1.0784313725490197e-05, "loss": 0.1451, "step": 2240 }, { "epoch": 2.3634453781512605, "grad_norm": 1.1159327030181885, "learning_rate": 1.0609243697478992e-05, "loss": 0.1458, "step": 2250 }, { "epoch": 2.3634453781512605, "eval_loss": 0.40705686807632446, "eval_runtime": 13.5021, "eval_samples_per_second": 35.55, "eval_steps_per_second": 2.222, "step": 2250 }, { "epoch": 2.3739495798319328, "grad_norm": 0.6801664233207703, "learning_rate": 1.0434173669467788e-05, "loss": 0.1428, "step": 2260 }, { "epoch": 2.384453781512605, "grad_norm": 0.8475140333175659, "learning_rate": 1.0259103641456584e-05, "loss": 0.1401, "step": 2270 }, { "epoch": 2.3949579831932772, "grad_norm": 1.0753763914108276, "learning_rate": 1.008403361344538e-05, "loss": 0.1485, "step": 2280 }, { "epoch": 2.4054621848739495, "grad_norm": 0.7163957953453064, "learning_rate": 9.908963585434174e-06, "loss": 0.148, "step": 2290 }, { "epoch": 2.4159663865546217, "grad_norm": 0.73530113697052, "learning_rate": 9.73389355742297e-06, "loss": 0.1462, "step": 2300 }, { "epoch": 2.4159663865546217, "eval_loss": 0.4083961546421051, "eval_runtime": 13.5005, "eval_samples_per_second": 35.554, "eval_steps_per_second": 2.222, "step": 2300 }, { "epoch": 2.426470588235294, "grad_norm": 0.8182764649391174, "learning_rate": 9.558823529411764e-06, "loss": 0.145, "step": 2310 }, { "epoch": 2.4369747899159666, "grad_norm": 0.8590428829193115, "learning_rate": 9.38375350140056e-06, "loss": 0.146, "step": 2320 }, { "epoch": 2.447478991596639, "grad_norm": 0.83616042137146, "learning_rate": 9.208683473389357e-06, "loss": 0.1455, "step": 2330 }, { "epoch": 2.457983193277311, "grad_norm": 0.7634994387626648, "learning_rate": 9.033613445378152e-06, "loss": 0.1363, "step": 2340 }, { "epoch": 2.4684873949579833, "grad_norm": 0.7826119661331177, "learning_rate": 8.858543417366948e-06, "loss": 0.1447, "step": 2350 }, { "epoch": 2.4684873949579833, "eval_loss": 0.4166198670864105, "eval_runtime": 13.5099, "eval_samples_per_second": 35.529, "eval_steps_per_second": 2.221, "step": 2350 }, { "epoch": 2.4789915966386555, "grad_norm": 0.7558380961418152, "learning_rate": 8.683473389355742e-06, "loss": 0.1449, "step": 2360 }, { "epoch": 2.4894957983193278, "grad_norm": 0.6801384091377258, "learning_rate": 8.508403361344538e-06, "loss": 0.1382, "step": 2370 }, { "epoch": 2.5, "grad_norm": 0.9655725359916687, "learning_rate": 8.333333333333334e-06, "loss": 0.1425, "step": 2380 }, { "epoch": 2.5105042016806722, "grad_norm": 0.9613827466964722, "learning_rate": 8.15826330532213e-06, "loss": 0.1374, "step": 2390 }, { "epoch": 2.5210084033613445, "grad_norm": 0.8105395436286926, "learning_rate": 7.983193277310924e-06, "loss": 0.1431, "step": 2400 }, { "epoch": 2.5210084033613445, "eval_loss": 0.4192233979701996, "eval_runtime": 13.5325, "eval_samples_per_second": 35.47, "eval_steps_per_second": 2.217, "step": 2400 }, { "epoch": 2.5315126050420167, "grad_norm": 0.7973752617835999, "learning_rate": 7.80812324929972e-06, "loss": 0.1436, "step": 2410 }, { "epoch": 2.542016806722689, "grad_norm": 0.6894790530204773, "learning_rate": 7.633053221288516e-06, "loss": 0.1425, "step": 2420 }, { "epoch": 2.552521008403361, "grad_norm": 0.6263079047203064, "learning_rate": 7.457983193277311e-06, "loss": 0.1473, "step": 2430 }, { "epoch": 2.5630252100840334, "grad_norm": 0.7887470722198486, "learning_rate": 7.282913165266108e-06, "loss": 0.1405, "step": 2440 }, { "epoch": 2.5735294117647056, "grad_norm": 0.6866456270217896, "learning_rate": 7.107843137254902e-06, "loss": 0.1409, "step": 2450 }, { "epoch": 2.5735294117647056, "eval_loss": 0.4140944480895996, "eval_runtime": 13.5509, "eval_samples_per_second": 35.422, "eval_steps_per_second": 2.214, "step": 2450 }, { "epoch": 2.5840336134453783, "grad_norm": 0.7917041778564453, "learning_rate": 6.932773109243698e-06, "loss": 0.1406, "step": 2460 }, { "epoch": 2.5945378151260505, "grad_norm": 0.6841098666191101, "learning_rate": 6.757703081232493e-06, "loss": 0.1415, "step": 2470 }, { "epoch": 2.6050420168067228, "grad_norm": 0.7307237982749939, "learning_rate": 6.582633053221289e-06, "loss": 0.1386, "step": 2480 }, { "epoch": 2.615546218487395, "grad_norm": 0.8012540340423584, "learning_rate": 6.407563025210084e-06, "loss": 0.1404, "step": 2490 }, { "epoch": 2.6260504201680672, "grad_norm": 0.7140881419181824, "learning_rate": 6.23249299719888e-06, "loss": 0.14, "step": 2500 }, { "epoch": 2.6260504201680672, "eval_loss": 0.42485204339027405, "eval_runtime": 13.5543, "eval_samples_per_second": 35.413, "eval_steps_per_second": 2.213, "step": 2500 }, { "epoch": 2.6365546218487395, "grad_norm": 0.8732954263687134, "learning_rate": 6.057422969187675e-06, "loss": 0.1459, "step": 2510 }, { "epoch": 2.6470588235294117, "grad_norm": 0.7600938677787781, "learning_rate": 5.882352941176471e-06, "loss": 0.1429, "step": 2520 }, { "epoch": 2.657563025210084, "grad_norm": 0.922292947769165, "learning_rate": 5.707282913165266e-06, "loss": 0.144, "step": 2530 }, { "epoch": 2.668067226890756, "grad_norm": 0.5363701581954956, "learning_rate": 5.532212885154062e-06, "loss": 0.1448, "step": 2540 }, { "epoch": 2.678571428571429, "grad_norm": 0.8106054663658142, "learning_rate": 5.357142857142857e-06, "loss": 0.1385, "step": 2550 }, { "epoch": 2.678571428571429, "eval_loss": 0.4227524399757385, "eval_runtime": 13.566, "eval_samples_per_second": 35.383, "eval_steps_per_second": 2.211, "step": 2550 }, { "epoch": 2.689075630252101, "grad_norm": 0.8006922006607056, "learning_rate": 5.182072829131653e-06, "loss": 0.1433, "step": 2560 }, { "epoch": 2.6995798319327733, "grad_norm": 0.6566533446311951, "learning_rate": 5.007002801120449e-06, "loss": 0.1374, "step": 2570 }, { "epoch": 2.7100840336134455, "grad_norm": 0.6680784225463867, "learning_rate": 4.831932773109244e-06, "loss": 0.1426, "step": 2580 }, { "epoch": 2.7205882352941178, "grad_norm": 0.7459174394607544, "learning_rate": 4.65686274509804e-06, "loss": 0.1412, "step": 2590 }, { "epoch": 2.73109243697479, "grad_norm": 0.6131850481033325, "learning_rate": 4.481792717086835e-06, "loss": 0.1417, "step": 2600 }, { "epoch": 2.73109243697479, "eval_loss": 0.42155376076698303, "eval_runtime": 13.5469, "eval_samples_per_second": 35.432, "eval_steps_per_second": 2.215, "step": 2600 }, { "epoch": 2.741596638655462, "grad_norm": 0.8135959506034851, "learning_rate": 4.30672268907563e-06, "loss": 0.137, "step": 2610 }, { "epoch": 2.7521008403361344, "grad_norm": 0.7350234985351562, "learning_rate": 4.131652661064426e-06, "loss": 0.1444, "step": 2620 }, { "epoch": 2.7626050420168067, "grad_norm": 2.5002171993255615, "learning_rate": 3.956582633053221e-06, "loss": 0.1448, "step": 2630 }, { "epoch": 2.773109243697479, "grad_norm": 0.9199714660644531, "learning_rate": 3.7815126050420167e-06, "loss": 0.1385, "step": 2640 }, { "epoch": 2.783613445378151, "grad_norm": 0.6327308416366577, "learning_rate": 3.606442577030812e-06, "loss": 0.1388, "step": 2650 }, { "epoch": 2.783613445378151, "eval_loss": 0.42358073592185974, "eval_runtime": 13.5314, "eval_samples_per_second": 35.473, "eval_steps_per_second": 2.217, "step": 2650 }, { "epoch": 2.7941176470588234, "grad_norm": 0.7996506690979004, "learning_rate": 3.431372549019608e-06, "loss": 0.1402, "step": 2660 }, { "epoch": 2.8046218487394956, "grad_norm": 0.7575409412384033, "learning_rate": 3.2563025210084036e-06, "loss": 0.1523, "step": 2670 }, { "epoch": 2.815126050420168, "grad_norm": 0.7814265489578247, "learning_rate": 3.081232492997199e-06, "loss": 0.1351, "step": 2680 }, { "epoch": 2.82563025210084, "grad_norm": 0.8243302702903748, "learning_rate": 2.9061624649859946e-06, "loss": 0.1362, "step": 2690 }, { "epoch": 2.8361344537815127, "grad_norm": 0.628792941570282, "learning_rate": 2.73109243697479e-06, "loss": 0.1423, "step": 2700 }, { "epoch": 2.8361344537815127, "eval_loss": 0.4288436770439148, "eval_runtime": 13.5168, "eval_samples_per_second": 35.511, "eval_steps_per_second": 2.219, "step": 2700 } ], "logging_steps": 10, "max_steps": 2856, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.2323898534854656e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }