{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9795918367346939,
  "eval_steps": 2,
  "global_step": 60,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0,
      "eval_loss": 1.2313029766082764,
      "eval_runtime": 18.2352,
      "eval_samples_per_second": 0.877,
      "eval_steps_per_second": 0.877,
      "step": 0
    },
    {
      "epoch": 0.0163265306122449,
      "grad_norm": 0.37113556265830994,
      "learning_rate": 2.5e-05,
      "loss": 0.4085,
      "step": 1
    },
    {
      "epoch": 0.0326530612244898,
      "grad_norm": 0.35803329944610596,
      "learning_rate": 5e-05,
      "loss": 0.3876,
      "step": 2
    },
    {
      "epoch": 0.0326530612244898,
      "eval_loss": 1.2231345176696777,
      "eval_runtime": 18.2243,
      "eval_samples_per_second": 0.878,
      "eval_steps_per_second": 0.878,
      "step": 2
    },
    {
      "epoch": 0.04897959183673469,
      "grad_norm": 0.3112759590148926,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.3946,
      "step": 3
    },
    {
      "epoch": 0.0653061224489796,
      "grad_norm": 0.2448713332414627,
      "learning_rate": 0.0001,
      "loss": 0.4363,
      "step": 4
    },
    {
      "epoch": 0.0653061224489796,
      "eval_loss": 1.1564743518829346,
      "eval_runtime": 18.253,
      "eval_samples_per_second": 0.877,
      "eval_steps_per_second": 0.877,
      "step": 4
    },
    {
      "epoch": 0.08163265306122448,
      "grad_norm": 0.2955642342567444,
      "learning_rate": 0.000125,
      "loss": 0.4394,
      "step": 5
    },
    {
      "epoch": 0.09795918367346938,
      "grad_norm": 0.41399946808815,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.4902,
      "step": 6
    },
    {
      "epoch": 0.09795918367346938,
      "eval_loss": 1.1353044509887695,
      "eval_runtime": 18.2664,
      "eval_samples_per_second": 0.876,
      "eval_steps_per_second": 0.876,
      "step": 6
    },
    {
      "epoch": 0.11428571428571428,
      "grad_norm": 0.2643347978591919,
      "learning_rate": 0.000175,
      "loss": 0.3528,
      "step": 7
    },
    {
      "epoch": 0.1306122448979592,
      "grad_norm": 0.21472330391407013,
      "learning_rate": 0.0002,
      "loss": 0.357,
      "step": 8
    },
    {
      "epoch": 0.1306122448979592,
      "eval_loss": 1.1218546628952026,
      "eval_runtime": 18.2139,
      "eval_samples_per_second": 0.878,
      "eval_steps_per_second": 0.878,
      "step": 8
    },
    {
      "epoch": 0.1469387755102041,
      "grad_norm": 0.23261462152004242,
      "learning_rate": 0.00022500000000000002,
      "loss": 0.3924,
      "step": 9
    },
    {
      "epoch": 0.16326530612244897,
      "grad_norm": 0.18365171551704407,
      "learning_rate": 0.00025,
      "loss": 0.283,
      "step": 10
    },
    {
      "epoch": 0.16326530612244897,
      "eval_loss": 1.1094393730163574,
      "eval_runtime": 18.249,
      "eval_samples_per_second": 0.877,
      "eval_steps_per_second": 0.877,
      "step": 10
    },
    {
      "epoch": 0.17959183673469387,
      "grad_norm": 0.20431634783744812,
      "learning_rate": 0.000275,
      "loss": 0.3178,
      "step": 11
    },
    {
      "epoch": 0.19591836734693877,
      "grad_norm": 0.2033773809671402,
      "learning_rate": 0.00030000000000000003,
      "loss": 0.3366,
      "step": 12
    },
    {
      "epoch": 0.19591836734693877,
      "eval_loss": 1.1021316051483154,
      "eval_runtime": 18.2562,
      "eval_samples_per_second": 0.876,
      "eval_steps_per_second": 0.876,
      "step": 12
    },
    {
      "epoch": 0.21224489795918366,
      "grad_norm": 0.27416911721229553,
      "learning_rate": 0.00032500000000000004,
      "loss": 0.3367,
      "step": 13
    },
    {
      "epoch": 0.22857142857142856,
      "grad_norm": 0.32283130288124084,
      "learning_rate": 0.00035,
      "loss": 0.3364,
      "step": 14
    },
    {
      "epoch": 0.22857142857142856,
      "eval_loss": 1.0523244142532349,
      "eval_runtime": 18.2347,
      "eval_samples_per_second": 0.877,
      "eval_steps_per_second": 0.877,
      "step": 14
    },
    {
      "epoch": 0.24489795918367346,
      "grad_norm": 0.21469368040561676,
      "learning_rate": 0.000375,
      "loss": 0.3496,
      "step": 15
    },
    {
      "epoch": 0.2612244897959184,
      "grad_norm": 0.19361759722232819,
      "learning_rate": 0.0004,
      "loss": 0.2961,
      "step": 16
    },
    {
      "epoch": 0.2612244897959184,
      "eval_loss": 1.0401124954223633,
      "eval_runtime": 18.2288,
      "eval_samples_per_second": 0.878,
      "eval_steps_per_second": 0.878,
      "step": 16
    },
    {
      "epoch": 0.27755102040816326,
      "grad_norm": 0.1797463297843933,
      "learning_rate": 0.0004,
      "loss": 0.3016,
      "step": 17
    },
    {
      "epoch": 0.2938775510204082,
      "grad_norm": 0.28122591972351074,
      "learning_rate": 0.0004,
      "loss": 0.3656,
      "step": 18
    },
    {
      "epoch": 0.2938775510204082,
      "eval_loss": 1.0228931903839111,
      "eval_runtime": 18.2429,
      "eval_samples_per_second": 0.877,
      "eval_steps_per_second": 0.877,
      "step": 18
    },
    {
      "epoch": 0.31020408163265306,
      "grad_norm": 0.227556511759758,
      "learning_rate": 0.0004,
      "loss": 0.3246,
      "step": 19
    },
    {
      "epoch": 0.32653061224489793,
      "grad_norm": 0.211012601852417,
      "learning_rate": 0.0004,
      "loss": 0.3453,
      "step": 20
    },
    {
      "epoch": 0.32653061224489793,
      "eval_loss": 1.017521858215332,
      "eval_runtime": 18.2213,
      "eval_samples_per_second": 0.878,
      "eval_steps_per_second": 0.878,
      "step": 20
    },
    {
      "epoch": 0.34285714285714286,
      "grad_norm": 0.20422972738742828,
      "learning_rate": 0.0004,
      "loss": 0.3258,
      "step": 21
    },
    {
      "epoch": 0.35918367346938773,
      "grad_norm": 0.206649512052536,
      "learning_rate": 0.0004,
      "loss": 0.3248,
      "step": 22
    },
    {
      "epoch": 0.35918367346938773,
      "eval_loss": 0.9992413520812988,
      "eval_runtime": 18.249,
      "eval_samples_per_second": 0.877,
      "eval_steps_per_second": 0.877,
      "step": 22
    },
    {
      "epoch": 0.37551020408163266,
      "grad_norm": 0.19837401807308197,
      "learning_rate": 0.0004,
      "loss": 0.3181,
      "step": 23
    },
    {
      "epoch": 0.39183673469387753,
      "grad_norm": 0.20325997471809387,
      "learning_rate": 0.0004,
      "loss": 0.2826,
      "step": 24
    },
    {
      "epoch": 0.39183673469387753,
      "eval_loss": 0.9943413734436035,
      "eval_runtime": 18.2482,
      "eval_samples_per_second": 0.877,
      "eval_steps_per_second": 0.877,
      "step": 24
    },
    {
      "epoch": 0.40816326530612246,
      "grad_norm": 0.3371317386627197,
      "learning_rate": 0.0004,
      "loss": 0.3197,
      "step": 25
    },
    {
      "epoch": 0.42448979591836733,
      "grad_norm": 0.21709182858467102,
      "learning_rate": 0.0004,
      "loss": 0.3272,
      "step": 26
    },
    {
      "epoch": 0.42448979591836733,
      "eval_loss": 0.9733779430389404,
      "eval_runtime": 18.2232,
      "eval_samples_per_second": 0.878,
      "eval_steps_per_second": 0.878,
      "step": 26
    },
    {
      "epoch": 0.44081632653061226,
      "grad_norm": 0.2420857548713684,
      "learning_rate": 0.0004,
      "loss": 0.3293,
      "step": 27
    },
    {
      "epoch": 0.45714285714285713,
      "grad_norm": 0.30486994981765747,
      "learning_rate": 0.0004,
      "loss": 0.3958,
      "step": 28
    },
    {
      "epoch": 0.45714285714285713,
      "eval_loss": 0.9588731527328491,
      "eval_runtime": 18.2458,
      "eval_samples_per_second": 0.877,
      "eval_steps_per_second": 0.877,
      "step": 28
    },
    {
      "epoch": 0.47346938775510206,
      "grad_norm": 0.24714964628219604,
      "learning_rate": 0.0004,
      "loss": 0.3305,
      "step": 29
    },
    {
      "epoch": 0.4897959183673469,
      "grad_norm": 0.21984225511550903,
      "learning_rate": 0.0004,
      "loss": 0.3395,
      "step": 30
    },
    {
      "epoch": 0.4897959183673469,
      "eval_loss": 0.9427903890609741,
      "eval_runtime": 18.2414,
      "eval_samples_per_second": 0.877,
      "eval_steps_per_second": 0.877,
      "step": 30
    },
    {
      "epoch": 0.5061224489795918,
      "grad_norm": 0.19778016209602356,
      "learning_rate": 0.0004,
      "loss": 0.2918,
      "step": 31
    },
    {
      "epoch": 0.5224489795918368,
      "grad_norm": 0.21754617989063263,
      "learning_rate": 0.0004,
      "loss": 0.2855,
      "step": 32
    },
    {
      "epoch": 0.5224489795918368,
      "eval_loss": 0.9335330128669739,
      "eval_runtime": 18.275,
      "eval_samples_per_second": 0.876,
      "eval_steps_per_second": 0.876,
      "step": 32
    },
    {
      "epoch": 0.5387755102040817,
      "grad_norm": 0.2221430391073227,
      "learning_rate": 0.0004,
      "loss": 0.2946,
      "step": 33
    },
    {
      "epoch": 0.5551020408163265,
      "grad_norm": 0.2112974375486374,
      "learning_rate": 0.0004,
      "loss": 0.3149,
      "step": 34
    },
    {
      "epoch": 0.5551020408163265,
      "eval_loss": 0.9311869144439697,
      "eval_runtime": 18.2431,
      "eval_samples_per_second": 0.877,
      "eval_steps_per_second": 0.877,
      "step": 34
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.19651219248771667,
      "learning_rate": 0.0004,
      "loss": 0.2606,
      "step": 35
    },
    {
      "epoch": 0.5877551020408164,
      "grad_norm": 0.22742077708244324,
      "learning_rate": 0.0004,
      "loss": 0.3245,
      "step": 36
    },
    {
      "epoch": 0.5877551020408164,
      "eval_loss": 0.9281033873558044,
      "eval_runtime": 18.2248,
      "eval_samples_per_second": 0.878,
      "eval_steps_per_second": 0.878,
      "step": 36
    },
    {
      "epoch": 0.6040816326530613,
      "grad_norm": 0.2320890575647354,
      "learning_rate": 0.0004,
      "loss": 0.3532,
      "step": 37
    },
    {
      "epoch": 0.6204081632653061,
      "grad_norm": 0.21191200613975525,
      "learning_rate": 0.0004,
      "loss": 0.2973,
      "step": 38
    },
    {
      "epoch": 0.6204081632653061,
      "eval_loss": 0.9274996519088745,
      "eval_runtime": 18.2309,
      "eval_samples_per_second": 0.878,
      "eval_steps_per_second": 0.878,
      "step": 38
    },
    {
      "epoch": 0.636734693877551,
      "grad_norm": 0.24098484218120575,
      "learning_rate": 0.0004,
      "loss": 0.3194,
      "step": 39
    },
    {
      "epoch": 0.6530612244897959,
      "grad_norm": 0.3358725607395172,
      "learning_rate": 0.0004,
      "loss": 0.2833,
      "step": 40
    },
    {
      "epoch": 0.6530612244897959,
      "eval_loss": 0.9254183769226074,
      "eval_runtime": 18.2685,
      "eval_samples_per_second": 0.876,
      "eval_steps_per_second": 0.876,
      "step": 40
    },
    {
      "epoch": 0.6693877551020408,
      "grad_norm": 0.2399401068687439,
      "learning_rate": 0.0004,
      "loss": 0.3381,
      "step": 41
    },
    {
      "epoch": 0.6857142857142857,
      "grad_norm": 0.23229075968265533,
      "learning_rate": 0.0004,
      "loss": 0.3501,
      "step": 42
    },
    {
      "epoch": 0.6857142857142857,
      "eval_loss": 0.9213573932647705,
      "eval_runtime": 18.2788,
      "eval_samples_per_second": 0.875,
      "eval_steps_per_second": 0.875,
      "step": 42
    },
    {
      "epoch": 0.7020408163265306,
      "grad_norm": 0.2341497391462326,
      "learning_rate": 0.0004,
      "loss": 0.3033,
      "step": 43
    },
    {
      "epoch": 0.7183673469387755,
      "grad_norm": 0.22997914254665375,
      "learning_rate": 0.0004,
      "loss": 0.3675,
      "step": 44
    },
    {
      "epoch": 0.7183673469387755,
      "eval_loss": 0.9130539298057556,
      "eval_runtime": 18.2601,
      "eval_samples_per_second": 0.876,
      "eval_steps_per_second": 0.876,
      "step": 44
    },
    {
      "epoch": 0.7346938775510204,
      "grad_norm": 0.23445634543895721,
      "learning_rate": 0.0004,
      "loss": 0.3113,
      "step": 45
    },
    {
      "epoch": 0.7510204081632653,
      "grad_norm": 0.22852188348770142,
      "learning_rate": 0.0004,
      "loss": 0.3593,
      "step": 46
    },
    {
      "epoch": 0.7510204081632653,
      "eval_loss": 0.9100953936576843,
      "eval_runtime": 18.2446,
      "eval_samples_per_second": 0.877,
      "eval_steps_per_second": 0.877,
      "step": 46
    },
    {
      "epoch": 0.7673469387755102,
      "grad_norm": 0.2316325306892395,
      "learning_rate": 0.0004,
      "loss": 0.3121,
      "step": 47
    },
    {
      "epoch": 0.7836734693877551,
      "grad_norm": 0.2397606372833252,
      "learning_rate": 0.0004,
      "loss": 0.2979,
      "step": 48
    },
    {
      "epoch": 0.7836734693877551,
      "eval_loss": 0.9087210297584534,
      "eval_runtime": 18.2833,
      "eval_samples_per_second": 0.875,
      "eval_steps_per_second": 0.875,
      "step": 48
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.22637659311294556,
      "learning_rate": 0.0004,
      "loss": 0.2919,
      "step": 49
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 0.255599707365036,
      "learning_rate": 0.0004,
      "loss": 0.3741,
      "step": 50
    },
    {
      "epoch": 0.8163265306122449,
      "eval_loss": 0.9020435810089111,
      "eval_runtime": 18.2931,
      "eval_samples_per_second": 0.875,
      "eval_steps_per_second": 0.875,
      "step": 50
    },
    {
      "epoch": 0.8163265306122449,
      "eval_loss": 0.9020435810089111,
      "eval_runtime": 18.1276,
      "eval_samples_per_second": 0.883,
      "eval_steps_per_second": 0.883,
      "step": 50
    },
    {
      "epoch": 0.8326530612244898,
      "grad_norm": 0.2062978744506836,
      "learning_rate": 0.0003877467715307749,
      "loss": 0.3303,
      "step": 51
    },
    {
      "epoch": 0.8489795918367347,
      "grad_norm": 0.20149795711040497,
      "learning_rate": 0.0003870443502801494,
      "loss": 0.3212,
      "step": 52
    },
    {
      "epoch": 0.8489795918367347,
      "eval_loss": 0.8994156718254089,
      "eval_runtime": 18.1853,
      "eval_samples_per_second": 0.88,
      "eval_steps_per_second": 0.88,
      "step": 52
    },
    {
      "epoch": 0.8653061224489796,
      "grad_norm": 0.17913399636745453,
      "learning_rate": 0.0003863230255984052,
      "loss": 0.2637,
      "step": 53
    },
    {
      "epoch": 0.8816326530612245,
      "grad_norm": 0.19331133365631104,
      "learning_rate": 0.00038558287038542615,
      "loss": 0.3564,
      "step": 54
    },
    {
      "epoch": 0.8816326530612245,
      "eval_loss": 0.8963940143585205,
      "eval_runtime": 18.223,
      "eval_samples_per_second": 0.878,
      "eval_steps_per_second": 0.878,
      "step": 54
    },
    {
      "epoch": 0.8979591836734694,
      "grad_norm": 0.20378044247627258,
      "learning_rate": 0.00038482395944418313,
      "loss": 0.3253,
      "step": 55
    },
    {
      "epoch": 0.9142857142857143,
      "grad_norm": 0.19742076098918915,
      "learning_rate": 0.0003840463694731741,
      "loss": 0.3715,
      "step": 56
    },
    {
      "epoch": 0.9142857142857143,
      "eval_loss": 0.8902382850646973,
      "eval_runtime": 18.2093,
      "eval_samples_per_second": 0.879,
      "eval_steps_per_second": 0.879,
      "step": 56
    },
    {
      "epoch": 0.9306122448979591,
      "grad_norm": 0.1908055692911148,
      "learning_rate": 0.0003832501790586724,
      "loss": 0.3305,
      "step": 57
    },
    {
      "epoch": 0.9469387755102041,
      "grad_norm": 0.21527761220932007,
      "learning_rate": 0.0003824354686667848,
      "loss": 0.3523,
      "step": 58
    },
    {
      "epoch": 0.9469387755102041,
      "eval_loss": 0.8786917328834534,
      "eval_runtime": 18.2124,
      "eval_samples_per_second": 0.879,
      "eval_steps_per_second": 0.879,
      "step": 58
    },
    {
      "epoch": 0.963265306122449,
      "grad_norm": 0.1817687749862671,
      "learning_rate": 0.00038160232063531917,
      "loss": 0.2616,
      "step": 59
    },
    {
      "epoch": 0.9795918367346939,
      "grad_norm": 0.17659063637256622,
      "learning_rate": 0.0003807508191654631,
      "loss": 0.251,
      "step": 60
    },
    {
      "epoch": 0.9795918367346939,
      "eval_loss": 0.8799586296081543,
      "eval_runtime": 18.2543,
      "eval_samples_per_second": 0.877,
      "eval_steps_per_second": 0.877,
      "step": 60
    }
  ],
  "logging_steps": 1,
  "max_steps": 305,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 30,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.260605121060864e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}