{ "best_metric": 1.4702097177505493, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.0844451950684006, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00016889039013680122, "eval_loss": 2.846999168395996, "eval_runtime": 224.2391, "eval_samples_per_second": 11.122, "eval_steps_per_second": 2.783, "step": 1 }, { "epoch": 0.0016889039013680122, "grad_norm": 3.6970326900482178, "learning_rate": 4.2800000000000004e-05, "loss": 4.3636, "step": 10 }, { "epoch": 0.0033778078027360244, "grad_norm": 6.2170329093933105, "learning_rate": 8.560000000000001e-05, "loss": 3.9342, "step": 20 }, { "epoch": 0.005066711704104036, "grad_norm": 5.741078853607178, "learning_rate": 0.0001284, "loss": 3.5268, "step": 30 }, { "epoch": 0.006755615605472049, "grad_norm": 5.3039350509643555, "learning_rate": 0.00017120000000000001, "loss": 3.5435, "step": 40 }, { "epoch": 0.00844451950684006, "grad_norm": 11.378582000732422, "learning_rate": 0.000214, "loss": 3.773, "step": 50 }, { "epoch": 0.00844451950684006, "eval_loss": 1.7026833295822144, "eval_runtime": 225.0764, "eval_samples_per_second": 11.081, "eval_steps_per_second": 2.772, "step": 50 }, { "epoch": 0.010133423408208072, "grad_norm": 5.438906192779541, "learning_rate": 0.00021373935337780118, "loss": 2.9566, "step": 60 }, { "epoch": 0.011822327309576086, "grad_norm": 3.3323323726654053, "learning_rate": 0.00021295868335534802, "loss": 3.2056, "step": 70 }, { "epoch": 0.013511231210944098, "grad_norm": 4.04654598236084, "learning_rate": 0.0002116617932785172, "loss": 3.5019, "step": 80 }, { "epoch": 0.01520013511231211, "grad_norm": 10.413492202758789, "learning_rate": 0.00020985500146540012, "loss": 3.6202, "step": 90 }, { "epoch": 0.01688903901368012, "grad_norm": 8.809098243713379, "learning_rate": 0.0002075471104240922, "loss": 4.0946, "step": 100 }, { "epoch": 0.01688903901368012, "eval_loss": 1.7498723268508911, "eval_runtime": 223.5899, "eval_samples_per_second": 11.154, "eval_steps_per_second": 2.791, "step": 100 }, { "epoch": 0.018577942915048135, "grad_norm": 3.0546703338623047, "learning_rate": 0.00020474936396775828, "loss": 2.9975, "step": 110 }, { "epoch": 0.020266846816416145, "grad_norm": 3.504708766937256, "learning_rate": 0.00020147539243590517, "loss": 3.1181, "step": 120 }, { "epoch": 0.021955750717784158, "grad_norm": 3.6388931274414062, "learning_rate": 0.00019774114628873756, "loss": 3.3875, "step": 130 }, { "epoch": 0.02364465461915217, "grad_norm": 5.671792030334473, "learning_rate": 0.00019356481839811937, "loss": 3.8488, "step": 140 }, { "epoch": 0.02533355852052018, "grad_norm": 6.272680759429932, "learning_rate": 0.00018896675541373064, "loss": 3.8281, "step": 150 }, { "epoch": 0.02533355852052018, "eval_loss": 1.678876519203186, "eval_runtime": 224.1842, "eval_samples_per_second": 11.125, "eval_steps_per_second": 2.783, "step": 150 }, { "epoch": 0.027022462421888195, "grad_norm": 2.720691204071045, "learning_rate": 0.00018396935863623567, "loss": 2.7892, "step": 160 }, { "epoch": 0.028711366323256205, "grad_norm": 2.612640619277954, "learning_rate": 0.00017859697488039784, "loss": 3.295, "step": 170 }, { "epoch": 0.03040027022462422, "grad_norm": 4.609539985656738, "learning_rate": 0.00017287577785984542, "loss": 3.5508, "step": 180 }, { "epoch": 0.03208917412599223, "grad_norm": 5.039632320404053, "learning_rate": 0.0001668336406713699, "loss": 3.4738, "step": 190 }, { "epoch": 0.03377807802736024, "grad_norm": 7.778958797454834, "learning_rate": 0.0001605, "loss": 3.7922, "step": 200 }, { "epoch": 0.03377807802736024, "eval_loss": 1.6119084358215332, "eval_runtime": 224.0622, "eval_samples_per_second": 11.131, "eval_steps_per_second": 2.785, "step": 200 }, { "epoch": 0.035466981928728256, "grad_norm": 2.715378761291504, "learning_rate": 0.00015390571270643128, "loss": 2.6685, "step": 210 }, { "epoch": 0.03715588583009627, "grad_norm": 3.9339587688446045, "learning_rate": 0.0001470829054955026, "loss": 3.3036, "step": 220 }, { "epoch": 0.03884478973146428, "grad_norm": 4.110659122467041, "learning_rate": 0.00014006481839811937, "loss": 3.3478, "step": 230 }, { "epoch": 0.04053369363283229, "grad_norm": 4.37723970413208, "learning_rate": 0.00013288564282916442, "loss": 3.2481, "step": 240 }, { "epoch": 0.0422225975342003, "grad_norm": 5.84151554107666, "learning_rate": 0.00012558035501036158, "loss": 3.5592, "step": 250 }, { "epoch": 0.0422225975342003, "eval_loss": 1.601406216621399, "eval_runtime": 226.5276, "eval_samples_per_second": 11.01, "eval_steps_per_second": 2.755, "step": 250 }, { "epoch": 0.043911501435568316, "grad_norm": 3.127474546432495, "learning_rate": 0.00011818454556963892, "loss": 2.9031, "step": 260 }, { "epoch": 0.04560040533693633, "grad_norm": 3.238449811935425, "learning_rate": 0.00011073424614716762, "loss": 3.1569, "step": 270 }, { "epoch": 0.04728930923830434, "grad_norm": 4.280186653137207, "learning_rate": 0.00010326575385283242, "loss": 3.2566, "step": 280 }, { "epoch": 0.04897821313967235, "grad_norm": 3.685248613357544, "learning_rate": 9.58154544303611e-05, "loss": 3.2076, "step": 290 }, { "epoch": 0.05066711704104036, "grad_norm": 7.450679302215576, "learning_rate": 8.841964498963846e-05, "loss": 3.3199, "step": 300 }, { "epoch": 0.05066711704104036, "eval_loss": 1.5676641464233398, "eval_runtime": 224.0674, "eval_samples_per_second": 11.131, "eval_steps_per_second": 2.785, "step": 300 }, { "epoch": 0.05235602094240838, "grad_norm": 2.852640390396118, "learning_rate": 8.111435717083556e-05, "loss": 2.8621, "step": 310 }, { "epoch": 0.05404492484377639, "grad_norm": 2.660027027130127, "learning_rate": 7.393518160188063e-05, "loss": 3.0032, "step": 320 }, { "epoch": 0.055733828745144404, "grad_norm": 4.215891361236572, "learning_rate": 6.69170945044974e-05, "loss": 2.9492, "step": 330 }, { "epoch": 0.05742273264651241, "grad_norm": 3.443974256515503, "learning_rate": 6.009428729356871e-05, "loss": 3.2442, "step": 340 }, { "epoch": 0.059111636547880424, "grad_norm": 4.448852062225342, "learning_rate": 5.3500000000000026e-05, "loss": 3.1454, "step": 350 }, { "epoch": 0.059111636547880424, "eval_loss": 1.511659026145935, "eval_runtime": 224.0431, "eval_samples_per_second": 11.132, "eval_steps_per_second": 2.785, "step": 350 }, { "epoch": 0.06080054044924844, "grad_norm": 2.7422585487365723, "learning_rate": 4.7166359328630106e-05, "loss": 2.7311, "step": 360 }, { "epoch": 0.06248944435061645, "grad_norm": 3.939330816268921, "learning_rate": 4.112422214015456e-05, "loss": 2.9315, "step": 370 }, { "epoch": 0.06417834825198446, "grad_norm": 3.160043954849243, "learning_rate": 3.5403025119602206e-05, "loss": 3.0538, "step": 380 }, { "epoch": 0.06586725215335247, "grad_norm": 3.95194411277771, "learning_rate": 3.0030641363764346e-05, "loss": 3.1343, "step": 390 }, { "epoch": 0.06755615605472048, "grad_norm": 4.495863914489746, "learning_rate": 2.5033244586269365e-05, "loss": 3.1658, "step": 400 }, { "epoch": 0.06755615605472048, "eval_loss": 1.480776071548462, "eval_runtime": 224.2238, "eval_samples_per_second": 11.123, "eval_steps_per_second": 2.783, "step": 400 }, { "epoch": 0.0692450599560885, "grad_norm": 2.1745190620422363, "learning_rate": 2.0435181601880635e-05, "loss": 2.6099, "step": 410 }, { "epoch": 0.07093396385745651, "grad_norm": 2.9737164974212646, "learning_rate": 1.625885371126242e-05, "loss": 2.8922, "step": 420 }, { "epoch": 0.07262286775882452, "grad_norm": 2.6742162704467773, "learning_rate": 1.2524607564094813e-05, "loss": 2.8869, "step": 430 }, { "epoch": 0.07431177166019254, "grad_norm": 2.8387765884399414, "learning_rate": 9.250636032241695e-06, "loss": 3.2082, "step": 440 }, { "epoch": 0.07600067556156055, "grad_norm": 5.102243900299072, "learning_rate": 6.45288957590781e-06, "loss": 3.0588, "step": 450 }, { "epoch": 0.07600067556156055, "eval_loss": 1.4717568159103394, "eval_runtime": 225.0055, "eval_samples_per_second": 11.084, "eval_steps_per_second": 2.773, "step": 450 }, { "epoch": 0.07768957946292857, "grad_norm": 2.2478957176208496, "learning_rate": 4.144998534599878e-06, "loss": 2.7015, "step": 460 }, { "epoch": 0.07937848336429656, "grad_norm": 2.600090265274048, "learning_rate": 2.3382067214827915e-06, "loss": 3.0485, "step": 470 }, { "epoch": 0.08106738726566458, "grad_norm": 2.7953197956085205, "learning_rate": 1.0413166446519713e-06, "loss": 2.9653, "step": 480 }, { "epoch": 0.08275629116703259, "grad_norm": 3.734795570373535, "learning_rate": 2.6064662219881083e-07, "loss": 3.1589, "step": 490 }, { "epoch": 0.0844451950684006, "grad_norm": 5.725466728210449, "learning_rate": 0.0, "loss": 3.0986, "step": 500 }, { "epoch": 0.0844451950684006, "eval_loss": 1.4702097177505493, "eval_runtime": 223.9171, "eval_samples_per_second": 11.138, "eval_steps_per_second": 2.787, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6100295180288e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }