|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9591836734693877, |
|
"eval_steps": 2, |
|
"global_step": 120, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0, |
|
"eval_loss": 1.2313029766082764, |
|
"eval_runtime": 18.2352, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 0 |
|
}, |
|
{ |
|
"epoch": 0.0163265306122449, |
|
"grad_norm": 0.37113556265830994, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.4085, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0326530612244898, |
|
"grad_norm": 0.35803329944610596, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3876, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0326530612244898, |
|
"eval_loss": 1.2231345176696777, |
|
"eval_runtime": 18.2243, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.04897959183673469, |
|
"grad_norm": 0.3112759590148926, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.3946, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0653061224489796, |
|
"grad_norm": 0.2448713332414627, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4363, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0653061224489796, |
|
"eval_loss": 1.1564743518829346, |
|
"eval_runtime": 18.253, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.08163265306122448, |
|
"grad_norm": 0.2955642342567444, |
|
"learning_rate": 0.000125, |
|
"loss": 0.4394, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.09795918367346938, |
|
"grad_norm": 0.41399946808815, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.4902, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.09795918367346938, |
|
"eval_loss": 1.1353044509887695, |
|
"eval_runtime": 18.2664, |
|
"eval_samples_per_second": 0.876, |
|
"eval_steps_per_second": 0.876, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 0.2643347978591919, |
|
"learning_rate": 0.000175, |
|
"loss": 0.3528, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.1306122448979592, |
|
"grad_norm": 0.21472330391407013, |
|
"learning_rate": 0.0002, |
|
"loss": 0.357, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.1306122448979592, |
|
"eval_loss": 1.1218546628952026, |
|
"eval_runtime": 18.2139, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.1469387755102041, |
|
"grad_norm": 0.23261462152004242, |
|
"learning_rate": 0.00022500000000000002, |
|
"loss": 0.3924, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.16326530612244897, |
|
"grad_norm": 0.18365171551704407, |
|
"learning_rate": 0.00025, |
|
"loss": 0.283, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.16326530612244897, |
|
"eval_loss": 1.1094393730163574, |
|
"eval_runtime": 18.249, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.17959183673469387, |
|
"grad_norm": 0.20431634783744812, |
|
"learning_rate": 0.000275, |
|
"loss": 0.3178, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.19591836734693877, |
|
"grad_norm": 0.2033773809671402, |
|
"learning_rate": 0.00030000000000000003, |
|
"loss": 0.3366, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.19591836734693877, |
|
"eval_loss": 1.1021316051483154, |
|
"eval_runtime": 18.2562, |
|
"eval_samples_per_second": 0.876, |
|
"eval_steps_per_second": 0.876, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.21224489795918366, |
|
"grad_norm": 0.27416911721229553, |
|
"learning_rate": 0.00032500000000000004, |
|
"loss": 0.3367, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.32283130288124084, |
|
"learning_rate": 0.00035, |
|
"loss": 0.3364, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.22857142857142856, |
|
"eval_loss": 1.0523244142532349, |
|
"eval_runtime": 18.2347, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.24489795918367346, |
|
"grad_norm": 0.21469368040561676, |
|
"learning_rate": 0.000375, |
|
"loss": 0.3496, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.2612244897959184, |
|
"grad_norm": 0.19361759722232819, |
|
"learning_rate": 0.0004, |
|
"loss": 0.2961, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.2612244897959184, |
|
"eval_loss": 1.0401124954223633, |
|
"eval_runtime": 18.2288, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.27755102040816326, |
|
"grad_norm": 0.1797463297843933, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3016, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.2938775510204082, |
|
"grad_norm": 0.28122591972351074, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3656, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.2938775510204082, |
|
"eval_loss": 1.0228931903839111, |
|
"eval_runtime": 18.2429, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.31020408163265306, |
|
"grad_norm": 0.227556511759758, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3246, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.32653061224489793, |
|
"grad_norm": 0.211012601852417, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3453, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.32653061224489793, |
|
"eval_loss": 1.017521858215332, |
|
"eval_runtime": 18.2213, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.34285714285714286, |
|
"grad_norm": 0.20422972738742828, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3258, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.35918367346938773, |
|
"grad_norm": 0.206649512052536, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3248, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.35918367346938773, |
|
"eval_loss": 0.9992413520812988, |
|
"eval_runtime": 18.249, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.37551020408163266, |
|
"grad_norm": 0.19837401807308197, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3181, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.39183673469387753, |
|
"grad_norm": 0.20325997471809387, |
|
"learning_rate": 0.0004, |
|
"loss": 0.2826, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.39183673469387753, |
|
"eval_loss": 0.9943413734436035, |
|
"eval_runtime": 18.2482, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.40816326530612246, |
|
"grad_norm": 0.3371317386627197, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3197, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.42448979591836733, |
|
"grad_norm": 0.21709182858467102, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3272, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.42448979591836733, |
|
"eval_loss": 0.9733779430389404, |
|
"eval_runtime": 18.2232, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.44081632653061226, |
|
"grad_norm": 0.2420857548713684, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3293, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.45714285714285713, |
|
"grad_norm": 0.30486994981765747, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3958, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.45714285714285713, |
|
"eval_loss": 0.9588731527328491, |
|
"eval_runtime": 18.2458, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.47346938775510206, |
|
"grad_norm": 0.24714964628219604, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3305, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.4897959183673469, |
|
"grad_norm": 0.21984225511550903, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3395, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4897959183673469, |
|
"eval_loss": 0.9427903890609741, |
|
"eval_runtime": 18.2414, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5061224489795918, |
|
"grad_norm": 0.19778016209602356, |
|
"learning_rate": 0.0004, |
|
"loss": 0.2918, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.5224489795918368, |
|
"grad_norm": 0.21754617989063263, |
|
"learning_rate": 0.0004, |
|
"loss": 0.2855, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.5224489795918368, |
|
"eval_loss": 0.9335330128669739, |
|
"eval_runtime": 18.275, |
|
"eval_samples_per_second": 0.876, |
|
"eval_steps_per_second": 0.876, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.5387755102040817, |
|
"grad_norm": 0.2221430391073227, |
|
"learning_rate": 0.0004, |
|
"loss": 0.2946, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.5551020408163265, |
|
"grad_norm": 0.2112974375486374, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3149, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.5551020408163265, |
|
"eval_loss": 0.9311869144439697, |
|
"eval_runtime": 18.2431, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.19651219248771667, |
|
"learning_rate": 0.0004, |
|
"loss": 0.2606, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.5877551020408164, |
|
"grad_norm": 0.22742077708244324, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3245, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5877551020408164, |
|
"eval_loss": 0.9281033873558044, |
|
"eval_runtime": 18.2248, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.6040816326530613, |
|
"grad_norm": 0.2320890575647354, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3532, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.6204081632653061, |
|
"grad_norm": 0.21191200613975525, |
|
"learning_rate": 0.0004, |
|
"loss": 0.2973, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.6204081632653061, |
|
"eval_loss": 0.9274996519088745, |
|
"eval_runtime": 18.2309, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.636734693877551, |
|
"grad_norm": 0.24098484218120575, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3194, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.6530612244897959, |
|
"grad_norm": 0.3358725607395172, |
|
"learning_rate": 0.0004, |
|
"loss": 0.2833, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6530612244897959, |
|
"eval_loss": 0.9254183769226074, |
|
"eval_runtime": 18.2685, |
|
"eval_samples_per_second": 0.876, |
|
"eval_steps_per_second": 0.876, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6693877551020408, |
|
"grad_norm": 0.2399401068687439, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3381, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.6857142857142857, |
|
"grad_norm": 0.23229075968265533, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3501, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.6857142857142857, |
|
"eval_loss": 0.9213573932647705, |
|
"eval_runtime": 18.2788, |
|
"eval_samples_per_second": 0.875, |
|
"eval_steps_per_second": 0.875, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.7020408163265306, |
|
"grad_norm": 0.2341497391462326, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3033, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.7183673469387755, |
|
"grad_norm": 0.22997914254665375, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3675, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.7183673469387755, |
|
"eval_loss": 0.9130539298057556, |
|
"eval_runtime": 18.2601, |
|
"eval_samples_per_second": 0.876, |
|
"eval_steps_per_second": 0.876, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.7346938775510204, |
|
"grad_norm": 0.23445634543895721, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3113, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.7510204081632653, |
|
"grad_norm": 0.22852188348770142, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3593, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.7510204081632653, |
|
"eval_loss": 0.9100953936576843, |
|
"eval_runtime": 18.2446, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.7673469387755102, |
|
"grad_norm": 0.2316325306892395, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3121, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.7836734693877551, |
|
"grad_norm": 0.2397606372833252, |
|
"learning_rate": 0.0004, |
|
"loss": 0.2979, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.7836734693877551, |
|
"eval_loss": 0.9087210297584534, |
|
"eval_runtime": 18.2833, |
|
"eval_samples_per_second": 0.875, |
|
"eval_steps_per_second": 0.875, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.22637659311294556, |
|
"learning_rate": 0.0004, |
|
"loss": 0.2919, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.8163265306122449, |
|
"grad_norm": 0.255599707365036, |
|
"learning_rate": 0.0004, |
|
"loss": 0.3741, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8163265306122449, |
|
"eval_loss": 0.9020435810089111, |
|
"eval_runtime": 18.2931, |
|
"eval_samples_per_second": 0.875, |
|
"eval_steps_per_second": 0.875, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8163265306122449, |
|
"eval_loss": 0.9020435810089111, |
|
"eval_runtime": 18.1276, |
|
"eval_samples_per_second": 0.883, |
|
"eval_steps_per_second": 0.883, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8326530612244898, |
|
"grad_norm": 0.2062978744506836, |
|
"learning_rate": 0.0003877467715307749, |
|
"loss": 0.3303, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.8489795918367347, |
|
"grad_norm": 0.20149795711040497, |
|
"learning_rate": 0.0003870443502801494, |
|
"loss": 0.3212, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.8489795918367347, |
|
"eval_loss": 0.8994156718254089, |
|
"eval_runtime": 18.1853, |
|
"eval_samples_per_second": 0.88, |
|
"eval_steps_per_second": 0.88, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.8653061224489796, |
|
"grad_norm": 0.17913399636745453, |
|
"learning_rate": 0.0003863230255984052, |
|
"loss": 0.2637, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.8816326530612245, |
|
"grad_norm": 0.19331133365631104, |
|
"learning_rate": 0.00038558287038542615, |
|
"loss": 0.3564, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.8816326530612245, |
|
"eval_loss": 0.8963940143585205, |
|
"eval_runtime": 18.223, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.8979591836734694, |
|
"grad_norm": 0.20378044247627258, |
|
"learning_rate": 0.00038482395944418313, |
|
"loss": 0.3253, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.9142857142857143, |
|
"grad_norm": 0.19742076098918915, |
|
"learning_rate": 0.0003840463694731741, |
|
"loss": 0.3715, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.9142857142857143, |
|
"eval_loss": 0.8902382850646973, |
|
"eval_runtime": 18.2093, |
|
"eval_samples_per_second": 0.879, |
|
"eval_steps_per_second": 0.879, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.9306122448979591, |
|
"grad_norm": 0.1908055692911148, |
|
"learning_rate": 0.0003832501790586724, |
|
"loss": 0.3305, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.9469387755102041, |
|
"grad_norm": 0.21527761220932007, |
|
"learning_rate": 0.0003824354686667848, |
|
"loss": 0.3523, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.9469387755102041, |
|
"eval_loss": 0.8786917328834534, |
|
"eval_runtime": 18.2124, |
|
"eval_samples_per_second": 0.879, |
|
"eval_steps_per_second": 0.879, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.963265306122449, |
|
"grad_norm": 0.1817687749862671, |
|
"learning_rate": 0.00038160232063531917, |
|
"loss": 0.2616, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.9795918367346939, |
|
"grad_norm": 0.17659063637256622, |
|
"learning_rate": 0.0003807508191654631, |
|
"loss": 0.251, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9795918367346939, |
|
"eval_loss": 0.8799586296081543, |
|
"eval_runtime": 18.2543, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9959183673469387, |
|
"grad_norm": 0.22647982835769653, |
|
"learning_rate": 0.0003798810503132742, |
|
"loss": 0.2881, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.0122448979591836, |
|
"grad_norm": 0.45377442240715027, |
|
"learning_rate": 0.00037899310198098295, |
|
"loss": 0.5515, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.0122448979591836, |
|
"eval_loss": 0.8895297050476074, |
|
"eval_runtime": 18.2212, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.0285714285714285, |
|
"grad_norm": 0.3527546226978302, |
|
"learning_rate": 0.000378087063908109, |
|
"loss": 0.2427, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.0448979591836736, |
|
"grad_norm": 0.2781178951263428, |
|
"learning_rate": 0.0003771630276623915, |
|
"loss": 0.2215, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.0448979591836736, |
|
"eval_loss": 0.9294220209121704, |
|
"eval_runtime": 18.226, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.0612244897959184, |
|
"grad_norm": 0.2543565034866333, |
|
"learning_rate": 0.00037622108663053536, |
|
"loss": 0.2761, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.0775510204081633, |
|
"grad_norm": 0.26259011030197144, |
|
"learning_rate": 0.0003752613360087727, |
|
"loss": 0.2425, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.0775510204081633, |
|
"eval_loss": 0.9212721586227417, |
|
"eval_runtime": 18.2309, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.0938775510204082, |
|
"grad_norm": 0.36850446462631226, |
|
"learning_rate": 0.00037428387279324257, |
|
"loss": 0.225, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 1.110204081632653, |
|
"grad_norm": 0.23748713731765747, |
|
"learning_rate": 0.0003732887957701874, |
|
"loss": 0.2363, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.110204081632653, |
|
"eval_loss": 0.9431418776512146, |
|
"eval_runtime": 18.2471, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.126530612244898, |
|
"grad_norm": 0.23217403888702393, |
|
"learning_rate": 0.00037227620550597, |
|
"loss": 0.2364, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.1428571428571428, |
|
"grad_norm": 0.28124868869781494, |
|
"learning_rate": 0.0003712462043369093, |
|
"loss": 0.2197, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.1428571428571428, |
|
"eval_loss": 0.9205393195152283, |
|
"eval_runtime": 18.2662, |
|
"eval_samples_per_second": 0.876, |
|
"eval_steps_per_second": 0.876, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.1591836734693877, |
|
"grad_norm": 0.2142041176557541, |
|
"learning_rate": 0.00037019889635893843, |
|
"loss": 0.1958, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.1755102040816325, |
|
"grad_norm": 0.34761127829551697, |
|
"learning_rate": 0.0003691343874170838, |
|
"loss": 0.2183, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.1755102040816325, |
|
"eval_loss": 0.9254322648048401, |
|
"eval_runtime": 18.2303, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.1918367346938776, |
|
"grad_norm": 0.2560065686702728, |
|
"learning_rate": 0.00036805278509476844, |
|
"loss": 0.248, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.2081632653061225, |
|
"grad_norm": 0.7435296773910522, |
|
"learning_rate": 0.00036695419870293915, |
|
"loss": 0.2356, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.2081632653061225, |
|
"eval_loss": 0.9271378517150879, |
|
"eval_runtime": 18.2503, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.2244897959183674, |
|
"grad_norm": 0.8685758113861084, |
|
"learning_rate": 0.00036583873926901867, |
|
"loss": 0.2129, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.2408163265306122, |
|
"grad_norm": 1.2740998268127441, |
|
"learning_rate": 0.0003647065195256855, |
|
"loss": 0.2469, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.2408163265306122, |
|
"eval_loss": 0.9236885905265808, |
|
"eval_runtime": 18.2226, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.2571428571428571, |
|
"grad_norm": 0.28524693846702576, |
|
"learning_rate": 0.0003635576538994801, |
|
"loss": 0.2278, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.273469387755102, |
|
"grad_norm": 1.0116885900497437, |
|
"learning_rate": 0.0003623922584992409, |
|
"loss": 0.2488, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.273469387755102, |
|
"eval_loss": 0.923646867275238, |
|
"eval_runtime": 18.2519, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.2897959183673469, |
|
"grad_norm": 0.9336573481559753, |
|
"learning_rate": 0.0003612104511043694, |
|
"loss": 0.3095, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.306122448979592, |
|
"grad_norm": 0.43459734320640564, |
|
"learning_rate": 0.0003600123511529278, |
|
"loss": 0.2215, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.306122448979592, |
|
"eval_loss": 0.941472589969635, |
|
"eval_runtime": 18.251, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.3224489795918366, |
|
"grad_norm": 0.39941656589508057, |
|
"learning_rate": 0.0003587980797295671, |
|
"loss": 0.251, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.3387755102040817, |
|
"grad_norm": 0.2876632511615753, |
|
"learning_rate": 0.0003575677595532904, |
|
"loss": 0.208, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.3387755102040817, |
|
"eval_loss": 0.932316780090332, |
|
"eval_runtime": 18.2326, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.3551020408163266, |
|
"grad_norm": 0.30556565523147583, |
|
"learning_rate": 0.0003563215149650505, |
|
"loss": 0.2821, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.3714285714285714, |
|
"grad_norm": 0.3039940595626831, |
|
"learning_rate": 0.00035505947191518316, |
|
"loss": 0.2177, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.3714285714285714, |
|
"eval_loss": 0.9171479344367981, |
|
"eval_runtime": 18.2397, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.3877551020408163, |
|
"grad_norm": 0.32876476645469666, |
|
"learning_rate": 0.0003537817579506783, |
|
"loss": 0.2142, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.4040816326530612, |
|
"grad_norm": 0.26468542218208313, |
|
"learning_rate": 0.0003524885022022896, |
|
"loss": 0.2293, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.4040816326530612, |
|
"eval_loss": 0.9176874756813049, |
|
"eval_runtime": 18.2553, |
|
"eval_samples_per_second": 0.876, |
|
"eval_steps_per_second": 0.876, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.420408163265306, |
|
"grad_norm": 0.9715031981468201, |
|
"learning_rate": 0.000351179835371484, |
|
"loss": 0.235, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.436734693877551, |
|
"grad_norm": 0.8392952680587769, |
|
"learning_rate": 0.00034985588971723233, |
|
"loss": 0.2307, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.436734693877551, |
|
"eval_loss": 0.9272938370704651, |
|
"eval_runtime": 18.2194, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.453061224489796, |
|
"grad_norm": 0.28850337862968445, |
|
"learning_rate": 0.00034851679904264314, |
|
"loss": 0.2403, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.469387755102041, |
|
"grad_norm": 0.46389341354370117, |
|
"learning_rate": 0.00034716269868143956, |
|
"loss": 0.2464, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.469387755102041, |
|
"eval_loss": 0.9267984628677368, |
|
"eval_runtime": 18.2514, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.4857142857142858, |
|
"grad_norm": 0.29597121477127075, |
|
"learning_rate": 0.00034579372548428235, |
|
"loss": 0.2307, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 1.5020408163265306, |
|
"grad_norm": 0.3392711877822876, |
|
"learning_rate": 0.00034441001780493886, |
|
"loss": 0.231, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.5020408163265306, |
|
"eval_loss": 0.9267009496688843, |
|
"eval_runtime": 18.2179, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.5183673469387755, |
|
"grad_norm": 0.2309531569480896, |
|
"learning_rate": 0.00034301171548630063, |
|
"loss": 0.2307, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.5346938775510204, |
|
"grad_norm": 0.28941744565963745, |
|
"learning_rate": 0.0003415989598462506, |
|
"loss": 0.2384, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.5346938775510204, |
|
"eval_loss": 0.9336121082305908, |
|
"eval_runtime": 18.2184, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.5510204081632653, |
|
"grad_norm": 0.22579389810562134, |
|
"learning_rate": 0.00034017189366338034, |
|
"loss": 0.1949, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.5673469387755103, |
|
"grad_norm": 0.2894729971885681, |
|
"learning_rate": 0.000338730661162561, |
|
"loss": 0.2348, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.5673469387755103, |
|
"eval_loss": 0.9432627558708191, |
|
"eval_runtime": 18.2158, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.583673469387755, |
|
"grad_norm": 0.24218647181987762, |
|
"learning_rate": 0.00033727540800036683, |
|
"loss": 0.2113, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.2695946991443634, |
|
"learning_rate": 0.0003358062812503548, |
|
"loss": 0.2209, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.9510765075683594, |
|
"eval_runtime": 18.2181, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.6163265306122447, |
|
"grad_norm": 0.27196425199508667, |
|
"learning_rate": 0.00033432342938820086, |
|
"loss": 0.2366, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.6326530612244898, |
|
"grad_norm": 0.2302471101284027, |
|
"learning_rate": 0.0003328270022766941, |
|
"loss": 0.2099, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.6326530612244898, |
|
"eval_loss": 0.9363111853599548, |
|
"eval_runtime": 18.2175, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.6489795918367347, |
|
"grad_norm": 0.24810455739498138, |
|
"learning_rate": 0.0003313171511505913, |
|
"loss": 0.2196, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.6653061224489796, |
|
"grad_norm": 0.25561368465423584, |
|
"learning_rate": 0.0003297940286013326, |
|
"loss": 0.253, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.6653061224489796, |
|
"eval_loss": 0.942488431930542, |
|
"eval_runtime": 18.2068, |
|
"eval_samples_per_second": 0.879, |
|
"eval_steps_per_second": 0.879, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.6816326530612244, |
|
"grad_norm": 0.22677375376224518, |
|
"learning_rate": 0.0003282577885616198, |
|
"loss": 0.1789, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.6979591836734693, |
|
"grad_norm": 0.26009589433670044, |
|
"learning_rate": 0.0003267085862898594, |
|
"loss": 0.2541, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.6979591836734693, |
|
"eval_loss": 0.9335595369338989, |
|
"eval_runtime": 18.2507, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.7142857142857144, |
|
"grad_norm": 0.20782141387462616, |
|
"learning_rate": 0.0003251465783544716, |
|
"loss": 0.2308, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.730612244897959, |
|
"grad_norm": 0.22768278419971466, |
|
"learning_rate": 0.0003235719226180669, |
|
"loss": 0.2354, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.730612244897959, |
|
"eval_loss": 0.9376662373542786, |
|
"eval_runtime": 18.2284, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.7469387755102042, |
|
"grad_norm": 0.23742002248764038, |
|
"learning_rate": 0.00032198477822149185, |
|
"loss": 0.2306, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.763265306122449, |
|
"grad_norm": 0.25688832998275757, |
|
"learning_rate": 0.0003203853055677457, |
|
"loss": 0.2816, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.763265306122449, |
|
"eval_loss": 0.9434496164321899, |
|
"eval_runtime": 18.2263, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.779591836734694, |
|
"grad_norm": 0.2690117359161377, |
|
"learning_rate": 0.00031877366630576946, |
|
"loss": 0.2013, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.7959183673469388, |
|
"grad_norm": 0.2317412942647934, |
|
"learning_rate": 0.0003171500233141089, |
|
"loss": 0.2398, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.7959183673469388, |
|
"eval_loss": 0.9329774975776672, |
|
"eval_runtime": 18.2112, |
|
"eval_samples_per_second": 0.879, |
|
"eval_steps_per_second": 0.879, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.8122448979591836, |
|
"grad_norm": 0.2262117713689804, |
|
"learning_rate": 0.0003155145406844535, |
|
"loss": 0.207, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.8285714285714287, |
|
"grad_norm": 0.22458741068840027, |
|
"learning_rate": 0.00031386738370505293, |
|
"loss": 0.2093, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.8285714285714287, |
|
"eval_loss": 0.9222925901412964, |
|
"eval_runtime": 18.2333, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.8448979591836734, |
|
"grad_norm": 0.24273625016212463, |
|
"learning_rate": 0.0003122087188440118, |
|
"loss": 0.2434, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.8612244897959185, |
|
"grad_norm": 0.2206314653158188, |
|
"learning_rate": 0.0003105387137324663, |
|
"loss": 0.2056, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.8612244897959185, |
|
"eval_loss": 0.9319095015525818, |
|
"eval_runtime": 18.2327, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.8775510204081631, |
|
"grad_norm": 0.2606954574584961, |
|
"learning_rate": 0.0003088575371476426, |
|
"loss": 0.2198, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.8938775510204082, |
|
"grad_norm": 0.23156337440013885, |
|
"learning_rate": 0.0003071653589957993, |
|
"loss": 0.1957, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.8938775510204082, |
|
"eval_loss": 0.9314719438552856, |
|
"eval_runtime": 18.2411, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.910204081632653, |
|
"grad_norm": 0.2253030389547348, |
|
"learning_rate": 0.0003054623502950565, |
|
"loss": 0.266, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.926530612244898, |
|
"grad_norm": 0.24510890245437622, |
|
"learning_rate": 0.0003037486831581115, |
|
"loss": 0.2493, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.926530612244898, |
|
"eval_loss": 0.926245927810669, |
|
"eval_runtime": 18.2428, |
|
"eval_samples_per_second": 0.877, |
|
"eval_steps_per_second": 0.877, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.9428571428571428, |
|
"grad_norm": 0.22151368856430054, |
|
"learning_rate": 0.00030202453077484494, |
|
"loss": 0.2666, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.9591836734693877, |
|
"grad_norm": 0.21714863181114197, |
|
"learning_rate": 0.0003002900673948173, |
|
"loss": 0.253, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.9591836734693877, |
|
"eval_loss": 0.9263309240341187, |
|
"eval_runtime": 18.2196, |
|
"eval_samples_per_second": 0.878, |
|
"eval_steps_per_second": 0.878, |
|
"step": 120 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 305, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 30, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.4518993974018048e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|