{
"best_metric": 0.4729672074317932,
"best_model_checkpoint": "miner_id_24/checkpoint-600",
"epoch": 0.05658242172764994,
"eval_steps": 150,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 9.43040362127499e-05,
"eval_loss": 0.9546859264373779,
"eval_runtime": 92.7204,
"eval_samples_per_second": 24.083,
"eval_steps_per_second": 6.029,
"step": 1
},
{
"epoch": 0.000943040362127499,
"grad_norm": 0.9397637248039246,
"learning_rate": 7.333333333333333e-06,
"loss": 1.0657,
"step": 10
},
{
"epoch": 0.001886080724254998,
"grad_norm": 0.9389505982398987,
"learning_rate": 1.4666666666666666e-05,
"loss": 0.9028,
"step": 20
},
{
"epoch": 0.002829121086382497,
"grad_norm": 1.1130671501159668,
"learning_rate": 2.2e-05,
"loss": 0.8773,
"step": 30
},
{
"epoch": 0.003772161448509996,
"grad_norm": 1.175963282585144,
"learning_rate": 2.9333333333333333e-05,
"loss": 0.8419,
"step": 40
},
{
"epoch": 0.004715201810637495,
"grad_norm": 1.3391618728637695,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.722,
"step": 50
},
{
"epoch": 0.005658242172764994,
"grad_norm": 0.8508893251419067,
"learning_rate": 4.4e-05,
"loss": 0.6673,
"step": 60
},
{
"epoch": 0.006601282534892493,
"grad_norm": 0.9721490144729614,
"learning_rate": 4.39627794819679e-05,
"loss": 0.6406,
"step": 70
},
{
"epoch": 0.007544322897019992,
"grad_norm": 1.2315011024475098,
"learning_rate": 4.3851243870322744e-05,
"loss": 0.6032,
"step": 80
},
{
"epoch": 0.008487363259147491,
"grad_norm": 1.0548521280288696,
"learning_rate": 4.366577056626858e-05,
"loss": 0.6413,
"step": 90
},
{
"epoch": 0.00943040362127499,
"grad_norm": 0.9409500956535339,
"learning_rate": 4.340698715275612e-05,
"loss": 0.6007,
"step": 100
},
{
"epoch": 0.01037344398340249,
"grad_norm": 1.5821508169174194,
"learning_rate": 4.3075769270940754e-05,
"loss": 0.5532,
"step": 110
},
{
"epoch": 0.011316484345529988,
"grad_norm": 1.399810552597046,
"learning_rate": 4.267323765728998e-05,
"loss": 0.6105,
"step": 120
},
{
"epoch": 0.012259524707657487,
"grad_norm": 1.087052583694458,
"learning_rate": 4.220075435136603e-05,
"loss": 0.6174,
"step": 130
},
{
"epoch": 0.013202565069784986,
"grad_norm": 1.5819586515426636,
"learning_rate": 4.165991808711507e-05,
"loss": 0.5532,
"step": 140
},
{
"epoch": 0.014145605431912485,
"grad_norm": 0.8601478338241577,
"learning_rate": 4.105255888325765e-05,
"loss": 0.5277,
"step": 150
},
{
"epoch": 0.014145605431912485,
"eval_loss": 0.5393942594528198,
"eval_runtime": 93.5873,
"eval_samples_per_second": 23.86,
"eval_steps_per_second": 5.973,
"step": 150
},
{
"epoch": 0.015088645794039984,
"grad_norm": 1.0490013360977173,
"learning_rate": 4.03807318510846e-05,
"loss": 0.527,
"step": 160
},
{
"epoch": 0.016031686156167485,
"grad_norm": 1.1231379508972168,
"learning_rate": 3.9646710240610966e-05,
"loss": 0.5501,
"step": 170
},
{
"epoch": 0.016974726518294982,
"grad_norm": 1.4164365530014038,
"learning_rate": 3.885297774861751e-05,
"loss": 0.5653,
"step": 180
},
{
"epoch": 0.017917766880422483,
"grad_norm": 1.2814961671829224,
"learning_rate": 3.800222011460707e-05,
"loss": 0.5362,
"step": 190
},
{
"epoch": 0.01886080724254998,
"grad_norm": 1.0821915864944458,
"learning_rate": 3.709731603311214e-05,
"loss": 0.4797,
"step": 200
},
{
"epoch": 0.01980384760467748,
"grad_norm": 2.1310160160064697,
"learning_rate": 3.614132741310386e-05,
"loss": 0.4864,
"step": 210
},
{
"epoch": 0.02074688796680498,
"grad_norm": 1.0708684921264648,
"learning_rate": 3.5137489017461296e-05,
"loss": 0.5207,
"step": 220
},
{
"epoch": 0.02168992832893248,
"grad_norm": 1.3040435314178467,
"learning_rate": 3.4089197517557735e-05,
"loss": 0.5583,
"step": 230
},
{
"epoch": 0.022632968691059976,
"grad_norm": 1.337860107421875,
"learning_rate": 3.3e-05,
"loss": 0.5339,
"step": 240
},
{
"epoch": 0.023576009053187477,
"grad_norm": 1.037545919418335,
"learning_rate": 3.187358196441017e-05,
"loss": 0.5466,
"step": 250
},
{
"epoch": 0.024519049415314974,
"grad_norm": 0.8419134616851807,
"learning_rate": 3.071375485286145e-05,
"loss": 0.4922,
"step": 260
},
{
"epoch": 0.025462089777442475,
"grad_norm": 1.230167269706726,
"learning_rate": 2.9524443153164715e-05,
"loss": 0.5098,
"step": 270
},
{
"epoch": 0.026405130139569973,
"grad_norm": 1.1250079870224,
"learning_rate": 2.8309671119643985e-05,
"loss": 0.5077,
"step": 280
},
{
"epoch": 0.027348170501697473,
"grad_norm": 0.918197512626648,
"learning_rate": 2.7073549156333684e-05,
"loss": 0.4052,
"step": 290
},
{
"epoch": 0.02829121086382497,
"grad_norm": 1.6295733451843262,
"learning_rate": 2.5820259908672472e-05,
"loss": 0.5295,
"step": 300
},
{
"epoch": 0.02829121086382497,
"eval_loss": 0.49462634325027466,
"eval_runtime": 93.6546,
"eval_samples_per_second": 23.843,
"eval_steps_per_second": 5.969,
"step": 300
},
{
"epoch": 0.02923425122595247,
"grad_norm": 1.1752290725708008,
"learning_rate": 2.4554044110755066e-05,
"loss": 0.5023,
"step": 310
},
{
"epoch": 0.03017729158807997,
"grad_norm": 1.613908290863037,
"learning_rate": 2.3279186236030468e-05,
"loss": 0.452,
"step": 320
},
{
"epoch": 0.03112033195020747,
"grad_norm": 1.15360426902771,
"learning_rate": 2.2e-05,
"loss": 0.4926,
"step": 330
},
{
"epoch": 0.03206337231233497,
"grad_norm": 0.985500693321228,
"learning_rate": 2.072081376396953e-05,
"loss": 0.5447,
"step": 340
},
{
"epoch": 0.033006412674462464,
"grad_norm": 0.9483439922332764,
"learning_rate": 1.9445955889244933e-05,
"loss": 0.5562,
"step": 350
},
{
"epoch": 0.033949453036589965,
"grad_norm": 1.3792706727981567,
"learning_rate": 1.8179740091327534e-05,
"loss": 0.5402,
"step": 360
},
{
"epoch": 0.034892493398717465,
"grad_norm": 0.8846226334571838,
"learning_rate": 1.6926450843666314e-05,
"loss": 0.5073,
"step": 370
},
{
"epoch": 0.035835533760844966,
"grad_norm": 1.1274725198745728,
"learning_rate": 1.569032888035602e-05,
"loss": 0.4089,
"step": 380
},
{
"epoch": 0.03677857412297246,
"grad_norm": 1.4087094068527222,
"learning_rate": 1.447555684683529e-05,
"loss": 0.5137,
"step": 390
},
{
"epoch": 0.03772161448509996,
"grad_norm": 1.4227954149246216,
"learning_rate": 1.3286245147138549e-05,
"loss": 0.4764,
"step": 400
},
{
"epoch": 0.03866465484722746,
"grad_norm": 1.189937710762024,
"learning_rate": 1.2126418035589831e-05,
"loss": 0.4483,
"step": 410
},
{
"epoch": 0.03960769520935496,
"grad_norm": 1.5125129222869873,
"learning_rate": 1.1000000000000005e-05,
"loss": 0.3932,
"step": 420
},
{
"epoch": 0.040550735571482456,
"grad_norm": 1.0411937236785889,
"learning_rate": 9.910802482442268e-06,
"loss": 0.47,
"step": 430
},
{
"epoch": 0.04149377593360996,
"grad_norm": 2.2022852897644043,
"learning_rate": 8.86251098253871e-06,
"loss": 0.5126,
"step": 440
},
{
"epoch": 0.04243681629573746,
"grad_norm": 1.1059151887893677,
"learning_rate": 7.858672586896134e-06,
"loss": 0.4136,
"step": 450
},
{
"epoch": 0.04243681629573746,
"eval_loss": 0.4766067862510681,
"eval_runtime": 93.545,
"eval_samples_per_second": 23.871,
"eval_steps_per_second": 5.976,
"step": 450
},
{
"epoch": 0.04337985665786496,
"grad_norm": 1.4336448907852173,
"learning_rate": 6.902683966887863e-06,
"loss": 0.4743,
"step": 460
},
{
"epoch": 0.04432289701999246,
"grad_norm": 1.0714585781097412,
"learning_rate": 5.997779885392928e-06,
"loss": 0.4757,
"step": 470
},
{
"epoch": 0.04526593738211995,
"grad_norm": 1.258355975151062,
"learning_rate": 5.147022251382486e-06,
"loss": 0.4376,
"step": 480
},
{
"epoch": 0.046208977744247454,
"grad_norm": 0.9438008666038513,
"learning_rate": 4.3532897593890356e-06,
"loss": 0.483,
"step": 490
},
{
"epoch": 0.047152018106374954,
"grad_norm": 1.3534599542617798,
"learning_rate": 3.619268148915402e-06,
"loss": 0.4502,
"step": 500
},
{
"epoch": 0.048095058468502455,
"grad_norm": 1.6557931900024414,
"learning_rate": 2.947441116742348e-06,
"loss": 0.5207,
"step": 510
},
{
"epoch": 0.04903809883062995,
"grad_norm": 1.548148512840271,
"learning_rate": 2.3400819128849325e-06,
"loss": 0.4162,
"step": 520
},
{
"epoch": 0.04998113919275745,
"grad_norm": 1.4258723258972168,
"learning_rate": 1.7992456486339744e-06,
"loss": 0.4655,
"step": 530
},
{
"epoch": 0.05092417955488495,
"grad_norm": 1.1291389465332031,
"learning_rate": 1.326762342710017e-06,
"loss": 0.4796,
"step": 540
},
{
"epoch": 0.05186721991701245,
"grad_norm": 1.4239860773086548,
"learning_rate": 9.242307290592442e-07,
"loss": 0.4622,
"step": 550
},
{
"epoch": 0.052810260279139945,
"grad_norm": 1.2388654947280884,
"learning_rate": 5.930128472438762e-07,
"loss": 0.5011,
"step": 560
},
{
"epoch": 0.053753300641267446,
"grad_norm": 1.3684238195419312,
"learning_rate": 3.3422943373142354e-07,
"loss": 0.446,
"step": 570
},
{
"epoch": 0.05469634100339495,
"grad_norm": 1.4726179838180542,
"learning_rate": 1.4875612967725348e-07,
"loss": 0.4642,
"step": 580
},
{
"epoch": 0.05563938136552245,
"grad_norm": 1.8181157112121582,
"learning_rate": 3.722051803210014e-08,
"loss": 0.4721,
"step": 590
},
{
"epoch": 0.05658242172764994,
"grad_norm": 1.503574252128601,
"learning_rate": 0.0,
"loss": 0.4171,
"step": 600
},
{
"epoch": 0.05658242172764994,
"eval_loss": 0.4729672074317932,
"eval_runtime": 93.8663,
"eval_samples_per_second": 23.789,
"eval_steps_per_second": 5.955,
"step": 600
}
],
"logging_steps": 10,
"max_steps": 600,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.12347940585472e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
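
A minimal sketch of how this checkpoint state can be inspected, assuming the JSON above is saved locally as trainer_state.json (the path is an assumption, not part of the checkpoint); it only reads keys that appear in the file.

import json

# Load the trainer state saved alongside the checkpoint.
# The filename "trainer_state.json" is assumed here.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training entries (with "loss") and evaluation
# entries (with "eval_loss"); split them for inspection.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best eval_loss: {state['best_metric']:.4f} "
      f"at {state['best_model_checkpoint']}")

# Evaluation loss at each eval step (150, 300, 450, 600).
for e in eval_logs:
    print(f"step {e['step']:>4}: eval_loss={e['eval_loss']:.4f}")

# Training loss and learning rate at the last logged training step.
last = train_logs[-1]
print(f"final step {last['step']}: loss={last['loss']:.4f}, "
      f"lr={last['learning_rate']}")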