|
{ |
|
"best_metric": 1.2006784677505493, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-550", |
|
"epoch": 0.3917727717923604, |
|
"eval_steps": 50, |
|
"global_step": 600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.000652954619653934, |
|
"eval_loss": 1.7593097686767578, |
|
"eval_runtime": 42.0818, |
|
"eval_samples_per_second": 61.285, |
|
"eval_steps_per_second": 15.327, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00652954619653934, |
|
"grad_norm": 0.1703435182571411, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3545, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01305909239307868, |
|
"grad_norm": 0.10457056760787964, |
|
"learning_rate": 0.0001998582695676762, |
|
"loss": 1.3658, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.019588638589618023, |
|
"grad_norm": 0.10769687592983246, |
|
"learning_rate": 0.00019943348002101371, |
|
"loss": 1.409, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02611818478615736, |
|
"grad_norm": 0.23312892019748688, |
|
"learning_rate": 0.00019872683547213446, |
|
"loss": 1.5118, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0326477309826967, |
|
"grad_norm": 0.8387290239334106, |
|
"learning_rate": 0.00019774033898178667, |
|
"loss": 1.6039, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0326477309826967, |
|
"eval_loss": 1.4602386951446533, |
|
"eval_runtime": 42.1985, |
|
"eval_samples_per_second": 61.116, |
|
"eval_steps_per_second": 15.285, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.039177277179236046, |
|
"grad_norm": 0.09031832218170166, |
|
"learning_rate": 0.0001964767868814516, |
|
"loss": 1.2569, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.045706823375775384, |
|
"grad_norm": 0.09168185293674469, |
|
"learning_rate": 0.00019493976084683813, |
|
"loss": 1.2857, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.05223636957231472, |
|
"grad_norm": 0.1381785273551941, |
|
"learning_rate": 0.00019313361774523385, |
|
"loss": 1.3339, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.058765915768854066, |
|
"grad_norm": 0.3087773323059082, |
|
"learning_rate": 0.00019106347728549135, |
|
"loss": 1.3948, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0652954619653934, |
|
"grad_norm": 0.6337625980377197, |
|
"learning_rate": 0.00018873520750565718, |
|
"loss": 1.3688, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0652954619653934, |
|
"eval_loss": 1.3073418140411377, |
|
"eval_runtime": 42.0855, |
|
"eval_samples_per_second": 61.28, |
|
"eval_steps_per_second": 15.326, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07182500816193274, |
|
"grad_norm": 0.15941736102104187, |
|
"learning_rate": 0.0001861554081393806, |
|
"loss": 1.2118, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.07835455435847209, |
|
"grad_norm": 0.07974281907081604, |
|
"learning_rate": 0.0001833313919082515, |
|
"loss": 1.2545, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.08488410055501143, |
|
"grad_norm": 0.14130432903766632, |
|
"learning_rate": 0.00018027116379309638, |
|
"loss": 1.3471, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.09141364675155077, |
|
"grad_norm": 0.31553900241851807, |
|
"learning_rate": 0.00017698339834299061, |
|
"loss": 1.3692, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0979431929480901, |
|
"grad_norm": 0.9388893246650696, |
|
"learning_rate": 0.00017347741508630672, |
|
"loss": 1.4355, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0979431929480901, |
|
"eval_loss": 1.267966389656067, |
|
"eval_runtime": 42.2941, |
|
"eval_samples_per_second": 60.978, |
|
"eval_steps_per_second": 15.25, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.10447273914462944, |
|
"grad_norm": 0.07010999321937561, |
|
"learning_rate": 0.0001697631521134985, |
|
"loss": 1.1417, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1110022853411688, |
|
"grad_norm": 0.10216495394706726, |
|
"learning_rate": 0.00016585113790650388, |
|
"loss": 1.238, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.11753183153770813, |
|
"grad_norm": 0.12675678730010986, |
|
"learning_rate": 0.0001617524614946192, |
|
"loss": 1.3154, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.12406137773424747, |
|
"grad_norm": 0.31328052282333374, |
|
"learning_rate": 0.0001574787410214407, |
|
"loss": 1.3155, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1305909239307868, |
|
"grad_norm": 1.081733226776123, |
|
"learning_rate": 0.00015304209081197425, |
|
"loss": 1.3807, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1305909239307868, |
|
"eval_loss": 1.2385743856430054, |
|
"eval_runtime": 42.1021, |
|
"eval_samples_per_second": 61.256, |
|
"eval_steps_per_second": 15.32, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.13712047012732614, |
|
"grad_norm": 0.06139560416340828, |
|
"learning_rate": 0.00014845508703326504, |
|
"loss": 1.1561, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.14365001632386548, |
|
"grad_norm": 0.07948515564203262, |
|
"learning_rate": 0.00014373073204588556, |
|
"loss": 1.2563, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.15017956252040482, |
|
"grad_norm": 0.14629638195037842, |
|
"learning_rate": 0.00013888241754733208, |
|
"loss": 1.3074, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.15670910871694418, |
|
"grad_norm": 0.3273250162601471, |
|
"learning_rate": 0.00013392388661180303, |
|
"loss": 1.288, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.16323865491348352, |
|
"grad_norm": 1.0025297403335571, |
|
"learning_rate": 0.0001288691947339621, |
|
"loss": 1.242, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.16323865491348352, |
|
"eval_loss": 1.2301549911499023, |
|
"eval_runtime": 41.9764, |
|
"eval_samples_per_second": 61.439, |
|
"eval_steps_per_second": 15.366, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.16976820111002286, |
|
"grad_norm": 0.05828835442662239, |
|
"learning_rate": 0.0001237326699871115, |
|
"loss": 1.1586, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.1762977473065622, |
|
"grad_norm": 0.09446071833372116, |
|
"learning_rate": 0.00011852887240871145, |
|
"loss": 1.2516, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.18282729350310153, |
|
"grad_norm": 0.15607990324497223, |
|
"learning_rate": 0.00011327255272837221, |
|
"loss": 1.2933, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.18935683969964087, |
|
"grad_norm": 0.35059285163879395, |
|
"learning_rate": 0.00010797861055530831, |
|
"loss": 1.2938, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1958863858961802, |
|
"grad_norm": 1.048234462738037, |
|
"learning_rate": 0.00010266205214377748, |
|
"loss": 1.1819, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1958863858961802, |
|
"eval_loss": 1.2161424160003662, |
|
"eval_runtime": 42.3336, |
|
"eval_samples_per_second": 60.921, |
|
"eval_steps_per_second": 15.236, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.20241593209271955, |
|
"grad_norm": 0.058099668473005295, |
|
"learning_rate": 9.733794785622253e-05, |
|
"loss": 1.1525, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.20894547828925888, |
|
"grad_norm": 0.09674125164747238, |
|
"learning_rate": 9.202138944469168e-05, |
|
"loss": 1.2296, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.21547502448579825, |
|
"grad_norm": 0.15291635692119598, |
|
"learning_rate": 8.672744727162781e-05, |
|
"loss": 1.2739, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.2220045706823376, |
|
"grad_norm": 0.2810160219669342, |
|
"learning_rate": 8.147112759128859e-05, |
|
"loss": 1.3062, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.22853411687887693, |
|
"grad_norm": 0.978479266166687, |
|
"learning_rate": 7.626733001288851e-05, |
|
"loss": 1.3263, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.22853411687887693, |
|
"eval_loss": 1.2151703834533691, |
|
"eval_runtime": 41.9719, |
|
"eval_samples_per_second": 61.446, |
|
"eval_steps_per_second": 15.367, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.23506366307541626, |
|
"grad_norm": 0.054964158684015274, |
|
"learning_rate": 7.113080526603792e-05, |
|
"loss": 1.1284, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2415932092719556, |
|
"grad_norm": 0.07777497917413712, |
|
"learning_rate": 6.607611338819697e-05, |
|
"loss": 1.2268, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.24812275546849494, |
|
"grad_norm": 0.1765056848526001, |
|
"learning_rate": 6.111758245266794e-05, |
|
"loss": 1.2698, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.2546523016650343, |
|
"grad_norm": 0.33249780535697937, |
|
"learning_rate": 5.626926795411447e-05, |
|
"loss": 1.2465, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2611818478615736, |
|
"grad_norm": 0.7363329529762268, |
|
"learning_rate": 5.1544912966734994e-05, |
|
"loss": 1.1405, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2611818478615736, |
|
"eval_loss": 1.208735466003418, |
|
"eval_runtime": 41.7762, |
|
"eval_samples_per_second": 61.734, |
|
"eval_steps_per_second": 15.439, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.26771139405811295, |
|
"grad_norm": 0.06299945712089539, |
|
"learning_rate": 4.695790918802576e-05, |
|
"loss": 1.1717, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.2742409402546523, |
|
"grad_norm": 0.08601243048906326, |
|
"learning_rate": 4.252125897855932e-05, |
|
"loss": 1.2408, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.2807704864511916, |
|
"grad_norm": 0.14494818449020386, |
|
"learning_rate": 3.824753850538082e-05, |
|
"loss": 1.2919, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.28730003264773096, |
|
"grad_norm": 0.31678304076194763, |
|
"learning_rate": 3.414886209349615e-05, |
|
"loss": 1.239, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.2938295788442703, |
|
"grad_norm": 1.0417225360870361, |
|
"learning_rate": 3.0236847886501542e-05, |
|
"loss": 1.1049, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2938295788442703, |
|
"eval_loss": 1.2068712711334229, |
|
"eval_runtime": 42.7188, |
|
"eval_samples_per_second": 60.372, |
|
"eval_steps_per_second": 15.099, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.30035912504080964, |
|
"grad_norm": 0.054399143904447556, |
|
"learning_rate": 2.6522584913693294e-05, |
|
"loss": 1.1521, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.30688867123734903, |
|
"grad_norm": 0.08509223908185959, |
|
"learning_rate": 2.301660165700936e-05, |
|
"loss": 1.2187, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.31341821743388837, |
|
"grad_norm": 0.1509586125612259, |
|
"learning_rate": 1.9728836206903656e-05, |
|
"loss": 1.3379, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.3199477636304277, |
|
"grad_norm": 0.3751097619533539, |
|
"learning_rate": 1.6668608091748495e-05, |
|
"loss": 1.3362, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.32647730982696704, |
|
"grad_norm": 0.8351140022277832, |
|
"learning_rate": 1.3844591860619383e-05, |
|
"loss": 1.1598, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.32647730982696704, |
|
"eval_loss": 1.2039023637771606, |
|
"eval_runtime": 42.6078, |
|
"eval_samples_per_second": 60.529, |
|
"eval_steps_per_second": 15.138, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3330068560235064, |
|
"grad_norm": 0.05650899186730385, |
|
"learning_rate": 1.1264792494342857e-05, |
|
"loss": 1.1749, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.3395364022200457, |
|
"grad_norm": 0.0797816812992096, |
|
"learning_rate": 8.936522714508678e-06, |
|
"loss": 1.2085, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.34606594841658506, |
|
"grad_norm": 0.14462202787399292, |
|
"learning_rate": 6.866382254766157e-06, |
|
"loss": 1.2441, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.3525954946131244, |
|
"grad_norm": 0.30583634972572327, |
|
"learning_rate": 5.060239153161872e-06, |
|
"loss": 1.2586, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.35912504080966373, |
|
"grad_norm": 0.7339666485786438, |
|
"learning_rate": 3.5232131185484076e-06, |
|
"loss": 1.1969, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.35912504080966373, |
|
"eval_loss": 1.2006784677505493, |
|
"eval_runtime": 41.9627, |
|
"eval_samples_per_second": 61.459, |
|
"eval_steps_per_second": 15.371, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.36565458700620307, |
|
"grad_norm": 0.05688609927892685, |
|
"learning_rate": 2.259661018213333e-06, |
|
"loss": 1.1429, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.3721841332027424, |
|
"grad_norm": 0.08461929857730865, |
|
"learning_rate": 1.2731645278655445e-06, |
|
"loss": 1.2267, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.37871367939928174, |
|
"grad_norm": 0.17034894227981567, |
|
"learning_rate": 5.665199789862907e-07, |
|
"loss": 1.2762, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.3852432255958211, |
|
"grad_norm": 0.2856343984603882, |
|
"learning_rate": 1.4173043232380557e-07, |
|
"loss": 1.2923, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.3917727717923604, |
|
"grad_norm": 0.926727831363678, |
|
"learning_rate": 0.0, |
|
"loss": 1.2127, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3917727717923604, |
|
"eval_loss": 1.2015607357025146, |
|
"eval_runtime": 42.3181, |
|
"eval_samples_per_second": 60.943, |
|
"eval_steps_per_second": 15.242, |
|
"step": 600 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 1 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.759171124867891e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|