{ "best_metric": 1.2006784677505493, "best_model_checkpoint": "miner_id_24/checkpoint-550", "epoch": 0.3917727717923604, "eval_steps": 50, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000652954619653934, "eval_loss": 1.7593097686767578, "eval_runtime": 42.0818, "eval_samples_per_second": 61.285, "eval_steps_per_second": 15.327, "step": 1 }, { "epoch": 0.00652954619653934, "grad_norm": 0.1703435182571411, "learning_rate": 0.0002, "loss": 1.3545, "step": 10 }, { "epoch": 0.01305909239307868, "grad_norm": 0.10457056760787964, "learning_rate": 0.0001998582695676762, "loss": 1.3658, "step": 20 }, { "epoch": 0.019588638589618023, "grad_norm": 0.10769687592983246, "learning_rate": 0.00019943348002101371, "loss": 1.409, "step": 30 }, { "epoch": 0.02611818478615736, "grad_norm": 0.23312892019748688, "learning_rate": 0.00019872683547213446, "loss": 1.5118, "step": 40 }, { "epoch": 0.0326477309826967, "grad_norm": 0.8387290239334106, "learning_rate": 0.00019774033898178667, "loss": 1.6039, "step": 50 }, { "epoch": 0.0326477309826967, "eval_loss": 1.4602386951446533, "eval_runtime": 42.1985, "eval_samples_per_second": 61.116, "eval_steps_per_second": 15.285, "step": 50 }, { "epoch": 0.039177277179236046, "grad_norm": 0.09031832218170166, "learning_rate": 0.0001964767868814516, "loss": 1.2569, "step": 60 }, { "epoch": 0.045706823375775384, "grad_norm": 0.09168185293674469, "learning_rate": 0.00019493976084683813, "loss": 1.2857, "step": 70 }, { "epoch": 0.05223636957231472, "grad_norm": 0.1381785273551941, "learning_rate": 0.00019313361774523385, "loss": 1.3339, "step": 80 }, { "epoch": 0.058765915768854066, "grad_norm": 0.3087773323059082, "learning_rate": 0.00019106347728549135, "loss": 1.3948, "step": 90 }, { "epoch": 0.0652954619653934, "grad_norm": 0.6337625980377197, "learning_rate": 0.00018873520750565718, "loss": 1.3688, "step": 100 }, { "epoch": 0.0652954619653934, "eval_loss": 1.3073418140411377, "eval_runtime": 42.0855, "eval_samples_per_second": 61.28, "eval_steps_per_second": 15.326, "step": 100 }, { "epoch": 0.07182500816193274, "grad_norm": 0.15941736102104187, "learning_rate": 0.0001861554081393806, "loss": 1.2118, "step": 110 }, { "epoch": 0.07835455435847209, "grad_norm": 0.07974281907081604, "learning_rate": 0.0001833313919082515, "loss": 1.2545, "step": 120 }, { "epoch": 0.08488410055501143, "grad_norm": 0.14130432903766632, "learning_rate": 0.00018027116379309638, "loss": 1.3471, "step": 130 }, { "epoch": 0.09141364675155077, "grad_norm": 0.31553900241851807, "learning_rate": 0.00017698339834299061, "loss": 1.3692, "step": 140 }, { "epoch": 0.0979431929480901, "grad_norm": 0.9388893246650696, "learning_rate": 0.00017347741508630672, "loss": 1.4355, "step": 150 }, { "epoch": 0.0979431929480901, "eval_loss": 1.267966389656067, "eval_runtime": 42.2941, "eval_samples_per_second": 60.978, "eval_steps_per_second": 15.25, "step": 150 }, { "epoch": 0.10447273914462944, "grad_norm": 0.07010999321937561, "learning_rate": 0.0001697631521134985, "loss": 1.1417, "step": 160 }, { "epoch": 0.1110022853411688, "grad_norm": 0.10216495394706726, "learning_rate": 0.00016585113790650388, "loss": 1.238, "step": 170 }, { "epoch": 0.11753183153770813, "grad_norm": 0.12675678730010986, "learning_rate": 0.0001617524614946192, "loss": 1.3154, "step": 180 }, { "epoch": 0.12406137773424747, "grad_norm": 0.31328052282333374, "learning_rate": 0.0001574787410214407, "loss": 1.3155, "step": 190 }, { "epoch": 0.1305909239307868, "grad_norm": 1.081733226776123, "learning_rate": 0.00015304209081197425, "loss": 1.3807, "step": 200 }, { "epoch": 0.1305909239307868, "eval_loss": 1.2385743856430054, "eval_runtime": 42.1021, "eval_samples_per_second": 61.256, "eval_steps_per_second": 15.32, "step": 200 }, { "epoch": 0.13712047012732614, "grad_norm": 0.06139560416340828, "learning_rate": 0.00014845508703326504, "loss": 1.1561, "step": 210 }, { "epoch": 0.14365001632386548, "grad_norm": 0.07948515564203262, "learning_rate": 0.00014373073204588556, "loss": 1.2563, "step": 220 }, { "epoch": 0.15017956252040482, "grad_norm": 0.14629638195037842, "learning_rate": 0.00013888241754733208, "loss": 1.3074, "step": 230 }, { "epoch": 0.15670910871694418, "grad_norm": 0.3273250162601471, "learning_rate": 0.00013392388661180303, "loss": 1.288, "step": 240 }, { "epoch": 0.16323865491348352, "grad_norm": 1.0025297403335571, "learning_rate": 0.0001288691947339621, "loss": 1.242, "step": 250 }, { "epoch": 0.16323865491348352, "eval_loss": 1.2301549911499023, "eval_runtime": 41.9764, "eval_samples_per_second": 61.439, "eval_steps_per_second": 15.366, "step": 250 }, { "epoch": 0.16976820111002286, "grad_norm": 0.05828835442662239, "learning_rate": 0.0001237326699871115, "loss": 1.1586, "step": 260 }, { "epoch": 0.1762977473065622, "grad_norm": 0.09446071833372116, "learning_rate": 0.00011852887240871145, "loss": 1.2516, "step": 270 }, { "epoch": 0.18282729350310153, "grad_norm": 0.15607990324497223, "learning_rate": 0.00011327255272837221, "loss": 1.2933, "step": 280 }, { "epoch": 0.18935683969964087, "grad_norm": 0.35059285163879395, "learning_rate": 0.00010797861055530831, "loss": 1.2938, "step": 290 }, { "epoch": 0.1958863858961802, "grad_norm": 1.048234462738037, "learning_rate": 0.00010266205214377748, "loss": 1.1819, "step": 300 }, { "epoch": 0.1958863858961802, "eval_loss": 1.2161424160003662, "eval_runtime": 42.3336, "eval_samples_per_second": 60.921, "eval_steps_per_second": 15.236, "step": 300 }, { "epoch": 0.20241593209271955, "grad_norm": 0.058099668473005295, "learning_rate": 9.733794785622253e-05, "loss": 1.1525, "step": 310 }, { "epoch": 0.20894547828925888, "grad_norm": 0.09674125164747238, "learning_rate": 9.202138944469168e-05, "loss": 1.2296, "step": 320 }, { "epoch": 0.21547502448579825, "grad_norm": 0.15291635692119598, "learning_rate": 8.672744727162781e-05, "loss": 1.2739, "step": 330 }, { "epoch": 0.2220045706823376, "grad_norm": 0.2810160219669342, "learning_rate": 8.147112759128859e-05, "loss": 1.3062, "step": 340 }, { "epoch": 0.22853411687887693, "grad_norm": 0.978479266166687, "learning_rate": 7.626733001288851e-05, "loss": 1.3263, "step": 350 }, { "epoch": 0.22853411687887693, "eval_loss": 1.2151703834533691, "eval_runtime": 41.9719, "eval_samples_per_second": 61.446, "eval_steps_per_second": 15.367, "step": 350 }, { "epoch": 0.23506366307541626, "grad_norm": 0.054964158684015274, "learning_rate": 7.113080526603792e-05, "loss": 1.1284, "step": 360 }, { "epoch": 0.2415932092719556, "grad_norm": 0.07777497917413712, "learning_rate": 6.607611338819697e-05, "loss": 1.2268, "step": 370 }, { "epoch": 0.24812275546849494, "grad_norm": 0.1765056848526001, "learning_rate": 6.111758245266794e-05, "loss": 1.2698, "step": 380 }, { "epoch": 0.2546523016650343, "grad_norm": 0.33249780535697937, "learning_rate": 5.626926795411447e-05, "loss": 1.2465, "step": 390 }, { "epoch": 0.2611818478615736, "grad_norm": 0.7363329529762268, "learning_rate": 5.1544912966734994e-05, "loss": 1.1405, "step": 400 }, { "epoch": 0.2611818478615736, "eval_loss": 1.208735466003418, "eval_runtime": 41.7762, "eval_samples_per_second": 61.734, "eval_steps_per_second": 15.439, "step": 400 }, { "epoch": 0.26771139405811295, "grad_norm": 0.06299945712089539, "learning_rate": 4.695790918802576e-05, "loss": 1.1717, "step": 410 }, { "epoch": 0.2742409402546523, "grad_norm": 0.08601243048906326, "learning_rate": 4.252125897855932e-05, "loss": 1.2408, "step": 420 }, { "epoch": 0.2807704864511916, "grad_norm": 0.14494818449020386, "learning_rate": 3.824753850538082e-05, "loss": 1.2919, "step": 430 }, { "epoch": 0.28730003264773096, "grad_norm": 0.31678304076194763, "learning_rate": 3.414886209349615e-05, "loss": 1.239, "step": 440 }, { "epoch": 0.2938295788442703, "grad_norm": 1.0417225360870361, "learning_rate": 3.0236847886501542e-05, "loss": 1.1049, "step": 450 }, { "epoch": 0.2938295788442703, "eval_loss": 1.2068712711334229, "eval_runtime": 42.7188, "eval_samples_per_second": 60.372, "eval_steps_per_second": 15.099, "step": 450 }, { "epoch": 0.30035912504080964, "grad_norm": 0.054399143904447556, "learning_rate": 2.6522584913693294e-05, "loss": 1.1521, "step": 460 }, { "epoch": 0.30688867123734903, "grad_norm": 0.08509223908185959, "learning_rate": 2.301660165700936e-05, "loss": 1.2187, "step": 470 }, { "epoch": 0.31341821743388837, "grad_norm": 0.1509586125612259, "learning_rate": 1.9728836206903656e-05, "loss": 1.3379, "step": 480 }, { "epoch": 0.3199477636304277, "grad_norm": 0.3751097619533539, "learning_rate": 1.6668608091748495e-05, "loss": 1.3362, "step": 490 }, { "epoch": 0.32647730982696704, "grad_norm": 0.8351140022277832, "learning_rate": 1.3844591860619383e-05, "loss": 1.1598, "step": 500 }, { "epoch": 0.32647730982696704, "eval_loss": 1.2039023637771606, "eval_runtime": 42.6078, "eval_samples_per_second": 60.529, "eval_steps_per_second": 15.138, "step": 500 }, { "epoch": 0.3330068560235064, "grad_norm": 0.05650899186730385, "learning_rate": 1.1264792494342857e-05, "loss": 1.1749, "step": 510 }, { "epoch": 0.3395364022200457, "grad_norm": 0.0797816812992096, "learning_rate": 8.936522714508678e-06, "loss": 1.2085, "step": 520 }, { "epoch": 0.34606594841658506, "grad_norm": 0.14462202787399292, "learning_rate": 6.866382254766157e-06, "loss": 1.2441, "step": 530 }, { "epoch": 0.3525954946131244, "grad_norm": 0.30583634972572327, "learning_rate": 5.060239153161872e-06, "loss": 1.2586, "step": 540 }, { "epoch": 0.35912504080966373, "grad_norm": 0.7339666485786438, "learning_rate": 3.5232131185484076e-06, "loss": 1.1969, "step": 550 }, { "epoch": 0.35912504080966373, "eval_loss": 1.2006784677505493, "eval_runtime": 41.9627, "eval_samples_per_second": 61.459, "eval_steps_per_second": 15.371, "step": 550 }, { "epoch": 0.36565458700620307, "grad_norm": 0.05688609927892685, "learning_rate": 2.259661018213333e-06, "loss": 1.1429, "step": 560 }, { "epoch": 0.3721841332027424, "grad_norm": 0.08461929857730865, "learning_rate": 1.2731645278655445e-06, "loss": 1.2267, "step": 570 }, { "epoch": 0.37871367939928174, "grad_norm": 0.17034894227981567, "learning_rate": 5.665199789862907e-07, "loss": 1.2762, "step": 580 }, { "epoch": 0.3852432255958211, "grad_norm": 0.2856343984603882, "learning_rate": 1.4173043232380557e-07, "loss": 1.2923, "step": 590 }, { "epoch": 0.3917727717923604, "grad_norm": 0.926727831363678, "learning_rate": 0.0, "loss": 1.2127, "step": 600 }, { "epoch": 0.3917727717923604, "eval_loss": 1.2015607357025146, "eval_runtime": 42.3181, "eval_samples_per_second": 60.943, "eval_steps_per_second": 15.242, "step": 600 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.759171124867891e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }