|
{ |
|
"best_metric": 0.4729672074317932, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-600", |
|
"epoch": 0.05658242172764994, |
|
"eval_steps": 150, |
|
"global_step": 600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 9.43040362127499e-05, |
|
"eval_loss": 0.9546859264373779, |
|
"eval_runtime": 92.7204, |
|
"eval_samples_per_second": 24.083, |
|
"eval_steps_per_second": 6.029, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.000943040362127499, |
|
"grad_norm": 0.9397637248039246, |
|
"learning_rate": 7.333333333333333e-06, |
|
"loss": 1.0657, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.001886080724254998, |
|
"grad_norm": 0.9389505982398987, |
|
"learning_rate": 1.4666666666666666e-05, |
|
"loss": 0.9028, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.002829121086382497, |
|
"grad_norm": 1.1130671501159668, |
|
"learning_rate": 2.2e-05, |
|
"loss": 0.8773, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.003772161448509996, |
|
"grad_norm": 1.175963282585144, |
|
"learning_rate": 2.9333333333333333e-05, |
|
"loss": 0.8419, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.004715201810637495, |
|
"grad_norm": 1.3391618728637695, |
|
"learning_rate": 3.6666666666666666e-05, |
|
"loss": 0.722, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.005658242172764994, |
|
"grad_norm": 0.8508893251419067, |
|
"learning_rate": 4.4e-05, |
|
"loss": 0.6673, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.006601282534892493, |
|
"grad_norm": 0.9721490144729614, |
|
"learning_rate": 4.39627794819679e-05, |
|
"loss": 0.6406, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.007544322897019992, |
|
"grad_norm": 1.2315011024475098, |
|
"learning_rate": 4.3851243870322744e-05, |
|
"loss": 0.6032, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.008487363259147491, |
|
"grad_norm": 1.0548521280288696, |
|
"learning_rate": 4.366577056626858e-05, |
|
"loss": 0.6413, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.00943040362127499, |
|
"grad_norm": 0.9409500956535339, |
|
"learning_rate": 4.340698715275612e-05, |
|
"loss": 0.6007, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01037344398340249, |
|
"grad_norm": 1.5821508169174194, |
|
"learning_rate": 4.3075769270940754e-05, |
|
"loss": 0.5532, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.011316484345529988, |
|
"grad_norm": 1.399810552597046, |
|
"learning_rate": 4.267323765728998e-05, |
|
"loss": 0.6105, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.012259524707657487, |
|
"grad_norm": 1.087052583694458, |
|
"learning_rate": 4.220075435136603e-05, |
|
"loss": 0.6174, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.013202565069784986, |
|
"grad_norm": 1.5819586515426636, |
|
"learning_rate": 4.165991808711507e-05, |
|
"loss": 0.5532, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.014145605431912485, |
|
"grad_norm": 0.8601478338241577, |
|
"learning_rate": 4.105255888325765e-05, |
|
"loss": 0.5277, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.014145605431912485, |
|
"eval_loss": 0.5393942594528198, |
|
"eval_runtime": 93.5873, |
|
"eval_samples_per_second": 23.86, |
|
"eval_steps_per_second": 5.973, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.015088645794039984, |
|
"grad_norm": 1.0490013360977173, |
|
"learning_rate": 4.03807318510846e-05, |
|
"loss": 0.527, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.016031686156167485, |
|
"grad_norm": 1.1231379508972168, |
|
"learning_rate": 3.9646710240610966e-05, |
|
"loss": 0.5501, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.016974726518294982, |
|
"grad_norm": 1.4164365530014038, |
|
"learning_rate": 3.885297774861751e-05, |
|
"loss": 0.5653, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.017917766880422483, |
|
"grad_norm": 1.2814961671829224, |
|
"learning_rate": 3.800222011460707e-05, |
|
"loss": 0.5362, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.01886080724254998, |
|
"grad_norm": 1.0821915864944458, |
|
"learning_rate": 3.709731603311214e-05, |
|
"loss": 0.4797, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01980384760467748, |
|
"grad_norm": 2.1310160160064697, |
|
"learning_rate": 3.614132741310386e-05, |
|
"loss": 0.4864, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.02074688796680498, |
|
"grad_norm": 1.0708684921264648, |
|
"learning_rate": 3.5137489017461296e-05, |
|
"loss": 0.5207, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.02168992832893248, |
|
"grad_norm": 1.3040435314178467, |
|
"learning_rate": 3.4089197517557735e-05, |
|
"loss": 0.5583, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.022632968691059976, |
|
"grad_norm": 1.337860107421875, |
|
"learning_rate": 3.3e-05, |
|
"loss": 0.5339, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.023576009053187477, |
|
"grad_norm": 1.037545919418335, |
|
"learning_rate": 3.187358196441017e-05, |
|
"loss": 0.5466, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.024519049415314974, |
|
"grad_norm": 0.8419134616851807, |
|
"learning_rate": 3.071375485286145e-05, |
|
"loss": 0.4922, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.025462089777442475, |
|
"grad_norm": 1.230167269706726, |
|
"learning_rate": 2.9524443153164715e-05, |
|
"loss": 0.5098, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.026405130139569973, |
|
"grad_norm": 1.1250079870224, |
|
"learning_rate": 2.8309671119643985e-05, |
|
"loss": 0.5077, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.027348170501697473, |
|
"grad_norm": 0.918197512626648, |
|
"learning_rate": 2.7073549156333684e-05, |
|
"loss": 0.4052, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.02829121086382497, |
|
"grad_norm": 1.6295733451843262, |
|
"learning_rate": 2.5820259908672472e-05, |
|
"loss": 0.5295, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.02829121086382497, |
|
"eval_loss": 0.49462634325027466, |
|
"eval_runtime": 93.6546, |
|
"eval_samples_per_second": 23.843, |
|
"eval_steps_per_second": 5.969, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.02923425122595247, |
|
"grad_norm": 1.1752290725708008, |
|
"learning_rate": 2.4554044110755066e-05, |
|
"loss": 0.5023, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.03017729158807997, |
|
"grad_norm": 1.613908290863037, |
|
"learning_rate": 2.3279186236030468e-05, |
|
"loss": 0.452, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.03112033195020747, |
|
"grad_norm": 1.15360426902771, |
|
"learning_rate": 2.2e-05, |
|
"loss": 0.4926, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.03206337231233497, |
|
"grad_norm": 0.985500693321228, |
|
"learning_rate": 2.072081376396953e-05, |
|
"loss": 0.5447, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.033006412674462464, |
|
"grad_norm": 0.9483439922332764, |
|
"learning_rate": 1.9445955889244933e-05, |
|
"loss": 0.5562, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.033949453036589965, |
|
"grad_norm": 1.3792706727981567, |
|
"learning_rate": 1.8179740091327534e-05, |
|
"loss": 0.5402, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.034892493398717465, |
|
"grad_norm": 0.8846226334571838, |
|
"learning_rate": 1.6926450843666314e-05, |
|
"loss": 0.5073, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.035835533760844966, |
|
"grad_norm": 1.1274725198745728, |
|
"learning_rate": 1.569032888035602e-05, |
|
"loss": 0.4089, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.03677857412297246, |
|
"grad_norm": 1.4087094068527222, |
|
"learning_rate": 1.447555684683529e-05, |
|
"loss": 0.5137, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.03772161448509996, |
|
"grad_norm": 1.4227954149246216, |
|
"learning_rate": 1.3286245147138549e-05, |
|
"loss": 0.4764, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03866465484722746, |
|
"grad_norm": 1.189937710762024, |
|
"learning_rate": 1.2126418035589831e-05, |
|
"loss": 0.4483, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.03960769520935496, |
|
"grad_norm": 1.5125129222869873, |
|
"learning_rate": 1.1000000000000005e-05, |
|
"loss": 0.3932, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.040550735571482456, |
|
"grad_norm": 1.0411937236785889, |
|
"learning_rate": 9.910802482442268e-06, |
|
"loss": 0.47, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.04149377593360996, |
|
"grad_norm": 2.2022852897644043, |
|
"learning_rate": 8.86251098253871e-06, |
|
"loss": 0.5126, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.04243681629573746, |
|
"grad_norm": 1.1059151887893677, |
|
"learning_rate": 7.858672586896134e-06, |
|
"loss": 0.4136, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.04243681629573746, |
|
"eval_loss": 0.4766067862510681, |
|
"eval_runtime": 93.545, |
|
"eval_samples_per_second": 23.871, |
|
"eval_steps_per_second": 5.976, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.04337985665786496, |
|
"grad_norm": 1.4336448907852173, |
|
"learning_rate": 6.902683966887863e-06, |
|
"loss": 0.4743, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.04432289701999246, |
|
"grad_norm": 1.0714585781097412, |
|
"learning_rate": 5.997779885392928e-06, |
|
"loss": 0.4757, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.04526593738211995, |
|
"grad_norm": 1.258355975151062, |
|
"learning_rate": 5.147022251382486e-06, |
|
"loss": 0.4376, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.046208977744247454, |
|
"grad_norm": 0.9438008666038513, |
|
"learning_rate": 4.3532897593890356e-06, |
|
"loss": 0.483, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.047152018106374954, |
|
"grad_norm": 1.3534599542617798, |
|
"learning_rate": 3.619268148915402e-06, |
|
"loss": 0.4502, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.048095058468502455, |
|
"grad_norm": 1.6557931900024414, |
|
"learning_rate": 2.947441116742348e-06, |
|
"loss": 0.5207, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.04903809883062995, |
|
"grad_norm": 1.548148512840271, |
|
"learning_rate": 2.3400819128849325e-06, |
|
"loss": 0.4162, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.04998113919275745, |
|
"grad_norm": 1.4258723258972168, |
|
"learning_rate": 1.7992456486339744e-06, |
|
"loss": 0.4655, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.05092417955488495, |
|
"grad_norm": 1.1291389465332031, |
|
"learning_rate": 1.326762342710017e-06, |
|
"loss": 0.4796, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.05186721991701245, |
|
"grad_norm": 1.4239860773086548, |
|
"learning_rate": 9.242307290592442e-07, |
|
"loss": 0.4622, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.052810260279139945, |
|
"grad_norm": 1.2388654947280884, |
|
"learning_rate": 5.930128472438762e-07, |
|
"loss": 0.5011, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.053753300641267446, |
|
"grad_norm": 1.3684238195419312, |
|
"learning_rate": 3.3422943373142354e-07, |
|
"loss": 0.446, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.05469634100339495, |
|
"grad_norm": 1.4726179838180542, |
|
"learning_rate": 1.4875612967725348e-07, |
|
"loss": 0.4642, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.05563938136552245, |
|
"grad_norm": 1.8181157112121582, |
|
"learning_rate": 3.722051803210014e-08, |
|
"loss": 0.4721, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.05658242172764994, |
|
"grad_norm": 1.503574252128601, |
|
"learning_rate": 0.0, |
|
"loss": 0.4171, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.05658242172764994, |
|
"eval_loss": 0.4729672074317932, |
|
"eval_runtime": 93.8663, |
|
"eval_samples_per_second": 23.789, |
|
"eval_steps_per_second": 5.955, |
|
"step": 600 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.12347940585472e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|