{ "best_metric": 0.4729672074317932, "best_model_checkpoint": "miner_id_24/checkpoint-600", "epoch": 0.05658242172764994, "eval_steps": 150, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.43040362127499e-05, "eval_loss": 0.9546859264373779, "eval_runtime": 92.7204, "eval_samples_per_second": 24.083, "eval_steps_per_second": 6.029, "step": 1 }, { "epoch": 0.000943040362127499, "grad_norm": 0.9397637248039246, "learning_rate": 7.333333333333333e-06, "loss": 1.0657, "step": 10 }, { "epoch": 0.001886080724254998, "grad_norm": 0.9389505982398987, "learning_rate": 1.4666666666666666e-05, "loss": 0.9028, "step": 20 }, { "epoch": 0.002829121086382497, "grad_norm": 1.1130671501159668, "learning_rate": 2.2e-05, "loss": 0.8773, "step": 30 }, { "epoch": 0.003772161448509996, "grad_norm": 1.175963282585144, "learning_rate": 2.9333333333333333e-05, "loss": 0.8419, "step": 40 }, { "epoch": 0.004715201810637495, "grad_norm": 1.3391618728637695, "learning_rate": 3.6666666666666666e-05, "loss": 0.722, "step": 50 }, { "epoch": 0.005658242172764994, "grad_norm": 0.8508893251419067, "learning_rate": 4.4e-05, "loss": 0.6673, "step": 60 }, { "epoch": 0.006601282534892493, "grad_norm": 0.9721490144729614, "learning_rate": 4.39627794819679e-05, "loss": 0.6406, "step": 70 }, { "epoch": 0.007544322897019992, "grad_norm": 1.2315011024475098, "learning_rate": 4.3851243870322744e-05, "loss": 0.6032, "step": 80 }, { "epoch": 0.008487363259147491, "grad_norm": 1.0548521280288696, "learning_rate": 4.366577056626858e-05, "loss": 0.6413, "step": 90 }, { "epoch": 0.00943040362127499, "grad_norm": 0.9409500956535339, "learning_rate": 4.340698715275612e-05, "loss": 0.6007, "step": 100 }, { "epoch": 0.01037344398340249, "grad_norm": 1.5821508169174194, "learning_rate": 4.3075769270940754e-05, "loss": 0.5532, "step": 110 }, { "epoch": 0.011316484345529988, "grad_norm": 1.399810552597046, "learning_rate": 4.267323765728998e-05, "loss": 0.6105, "step": 120 }, { "epoch": 0.012259524707657487, "grad_norm": 1.087052583694458, "learning_rate": 4.220075435136603e-05, "loss": 0.6174, "step": 130 }, { "epoch": 0.013202565069784986, "grad_norm": 1.5819586515426636, "learning_rate": 4.165991808711507e-05, "loss": 0.5532, "step": 140 }, { "epoch": 0.014145605431912485, "grad_norm": 0.8601478338241577, "learning_rate": 4.105255888325765e-05, "loss": 0.5277, "step": 150 }, { "epoch": 0.014145605431912485, "eval_loss": 0.5393942594528198, "eval_runtime": 93.5873, "eval_samples_per_second": 23.86, "eval_steps_per_second": 5.973, "step": 150 }, { "epoch": 0.015088645794039984, "grad_norm": 1.0490013360977173, "learning_rate": 4.03807318510846e-05, "loss": 0.527, "step": 160 }, { "epoch": 0.016031686156167485, "grad_norm": 1.1231379508972168, "learning_rate": 3.9646710240610966e-05, "loss": 0.5501, "step": 170 }, { "epoch": 0.016974726518294982, "grad_norm": 1.4164365530014038, "learning_rate": 3.885297774861751e-05, "loss": 0.5653, "step": 180 }, { "epoch": 0.017917766880422483, "grad_norm": 1.2814961671829224, "learning_rate": 3.800222011460707e-05, "loss": 0.5362, "step": 190 }, { "epoch": 0.01886080724254998, "grad_norm": 1.0821915864944458, "learning_rate": 3.709731603311214e-05, "loss": 0.4797, "step": 200 }, { "epoch": 0.01980384760467748, "grad_norm": 2.1310160160064697, "learning_rate": 3.614132741310386e-05, "loss": 0.4864, "step": 210 }, { "epoch": 0.02074688796680498, "grad_norm": 1.0708684921264648, "learning_rate": 3.5137489017461296e-05, "loss": 0.5207, "step": 220 }, { "epoch": 0.02168992832893248, "grad_norm": 1.3040435314178467, "learning_rate": 3.4089197517557735e-05, "loss": 0.5583, "step": 230 }, { "epoch": 0.022632968691059976, "grad_norm": 1.337860107421875, "learning_rate": 3.3e-05, "loss": 0.5339, "step": 240 }, { "epoch": 0.023576009053187477, "grad_norm": 1.037545919418335, "learning_rate": 3.187358196441017e-05, "loss": 0.5466, "step": 250 }, { "epoch": 0.024519049415314974, "grad_norm": 0.8419134616851807, "learning_rate": 3.071375485286145e-05, "loss": 0.4922, "step": 260 }, { "epoch": 0.025462089777442475, "grad_norm": 1.230167269706726, "learning_rate": 2.9524443153164715e-05, "loss": 0.5098, "step": 270 }, { "epoch": 0.026405130139569973, "grad_norm": 1.1250079870224, "learning_rate": 2.8309671119643985e-05, "loss": 0.5077, "step": 280 }, { "epoch": 0.027348170501697473, "grad_norm": 0.918197512626648, "learning_rate": 2.7073549156333684e-05, "loss": 0.4052, "step": 290 }, { "epoch": 0.02829121086382497, "grad_norm": 1.6295733451843262, "learning_rate": 2.5820259908672472e-05, "loss": 0.5295, "step": 300 }, { "epoch": 0.02829121086382497, "eval_loss": 0.49462634325027466, "eval_runtime": 93.6546, "eval_samples_per_second": 23.843, "eval_steps_per_second": 5.969, "step": 300 }, { "epoch": 0.02923425122595247, "grad_norm": 1.1752290725708008, "learning_rate": 2.4554044110755066e-05, "loss": 0.5023, "step": 310 }, { "epoch": 0.03017729158807997, "grad_norm": 1.613908290863037, "learning_rate": 2.3279186236030468e-05, "loss": 0.452, "step": 320 }, { "epoch": 0.03112033195020747, "grad_norm": 1.15360426902771, "learning_rate": 2.2e-05, "loss": 0.4926, "step": 330 }, { "epoch": 0.03206337231233497, "grad_norm": 0.985500693321228, "learning_rate": 2.072081376396953e-05, "loss": 0.5447, "step": 340 }, { "epoch": 0.033006412674462464, "grad_norm": 0.9483439922332764, "learning_rate": 1.9445955889244933e-05, "loss": 0.5562, "step": 350 }, { "epoch": 0.033949453036589965, "grad_norm": 1.3792706727981567, "learning_rate": 1.8179740091327534e-05, "loss": 0.5402, "step": 360 }, { "epoch": 0.034892493398717465, "grad_norm": 0.8846226334571838, "learning_rate": 1.6926450843666314e-05, "loss": 0.5073, "step": 370 }, { "epoch": 0.035835533760844966, "grad_norm": 1.1274725198745728, "learning_rate": 1.569032888035602e-05, "loss": 0.4089, "step": 380 }, { "epoch": 0.03677857412297246, "grad_norm": 1.4087094068527222, "learning_rate": 1.447555684683529e-05, "loss": 0.5137, "step": 390 }, { "epoch": 0.03772161448509996, "grad_norm": 1.4227954149246216, "learning_rate": 1.3286245147138549e-05, "loss": 0.4764, "step": 400 }, { "epoch": 0.03866465484722746, "grad_norm": 1.189937710762024, "learning_rate": 1.2126418035589831e-05, "loss": 0.4483, "step": 410 }, { "epoch": 0.03960769520935496, "grad_norm": 1.5125129222869873, "learning_rate": 1.1000000000000005e-05, "loss": 0.3932, "step": 420 }, { "epoch": 0.040550735571482456, "grad_norm": 1.0411937236785889, "learning_rate": 9.910802482442268e-06, "loss": 0.47, "step": 430 }, { "epoch": 0.04149377593360996, "grad_norm": 2.2022852897644043, "learning_rate": 8.86251098253871e-06, "loss": 0.5126, "step": 440 }, { "epoch": 0.04243681629573746, "grad_norm": 1.1059151887893677, "learning_rate": 7.858672586896134e-06, "loss": 0.4136, "step": 450 }, { "epoch": 0.04243681629573746, "eval_loss": 0.4766067862510681, "eval_runtime": 93.545, "eval_samples_per_second": 23.871, "eval_steps_per_second": 5.976, "step": 450 }, { "epoch": 0.04337985665786496, "grad_norm": 1.4336448907852173, "learning_rate": 6.902683966887863e-06, "loss": 0.4743, "step": 460 }, { "epoch": 0.04432289701999246, "grad_norm": 1.0714585781097412, "learning_rate": 5.997779885392928e-06, "loss": 0.4757, "step": 470 }, { "epoch": 0.04526593738211995, "grad_norm": 1.258355975151062, "learning_rate": 5.147022251382486e-06, "loss": 0.4376, "step": 480 }, { "epoch": 0.046208977744247454, "grad_norm": 0.9438008666038513, "learning_rate": 4.3532897593890356e-06, "loss": 0.483, "step": 490 }, { "epoch": 0.047152018106374954, "grad_norm": 1.3534599542617798, "learning_rate": 3.619268148915402e-06, "loss": 0.4502, "step": 500 }, { "epoch": 0.048095058468502455, "grad_norm": 1.6557931900024414, "learning_rate": 2.947441116742348e-06, "loss": 0.5207, "step": 510 }, { "epoch": 0.04903809883062995, "grad_norm": 1.548148512840271, "learning_rate": 2.3400819128849325e-06, "loss": 0.4162, "step": 520 }, { "epoch": 0.04998113919275745, "grad_norm": 1.4258723258972168, "learning_rate": 1.7992456486339744e-06, "loss": 0.4655, "step": 530 }, { "epoch": 0.05092417955488495, "grad_norm": 1.1291389465332031, "learning_rate": 1.326762342710017e-06, "loss": 0.4796, "step": 540 }, { "epoch": 0.05186721991701245, "grad_norm": 1.4239860773086548, "learning_rate": 9.242307290592442e-07, "loss": 0.4622, "step": 550 }, { "epoch": 0.052810260279139945, "grad_norm": 1.2388654947280884, "learning_rate": 5.930128472438762e-07, "loss": 0.5011, "step": 560 }, { "epoch": 0.053753300641267446, "grad_norm": 1.3684238195419312, "learning_rate": 3.3422943373142354e-07, "loss": 0.446, "step": 570 }, { "epoch": 0.05469634100339495, "grad_norm": 1.4726179838180542, "learning_rate": 1.4875612967725348e-07, "loss": 0.4642, "step": 580 }, { "epoch": 0.05563938136552245, "grad_norm": 1.8181157112121582, "learning_rate": 3.722051803210014e-08, "loss": 0.4721, "step": 590 }, { "epoch": 0.05658242172764994, "grad_norm": 1.503574252128601, "learning_rate": 0.0, "loss": 0.4171, "step": 600 }, { "epoch": 0.05658242172764994, "eval_loss": 0.4729672074317932, "eval_runtime": 93.8663, "eval_samples_per_second": 23.789, "eval_steps_per_second": 5.955, "step": 600 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.12347940585472e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }