lesso17's picture
Training in progress, step 500, checkpoint
74891c4 verified
{
"best_metric": 0.6367893815040588,
"best_model_checkpoint": "miner_id_24/checkpoint-500",
"epoch": 0.018298261665141813,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 3.659652333028362e-05,
"eval_loss": 3.3232226371765137,
"eval_runtime": 654.1201,
"eval_samples_per_second": 17.59,
"eval_steps_per_second": 4.398,
"step": 1
},
{
"epoch": 0.00036596523330283625,
"grad_norm": 3.221409559249878,
"learning_rate": 4.34e-05,
"loss": 2.2773,
"step": 10
},
{
"epoch": 0.0007319304666056725,
"grad_norm": 2.2033233642578125,
"learning_rate": 8.68e-05,
"loss": 1.6412,
"step": 20
},
{
"epoch": 0.0010978956999085087,
"grad_norm": 2.0530292987823486,
"learning_rate": 0.0001302,
"loss": 1.2863,
"step": 30
},
{
"epoch": 0.001463860933211345,
"grad_norm": 2.043330430984497,
"learning_rate": 0.0001736,
"loss": 1.1969,
"step": 40
},
{
"epoch": 0.0018298261665141812,
"grad_norm": 5.248676300048828,
"learning_rate": 0.000217,
"loss": 1.1612,
"step": 50
},
{
"epoch": 0.0018298261665141812,
"eval_loss": 1.7948706150054932,
"eval_runtime": 654.0329,
"eval_samples_per_second": 17.592,
"eval_steps_per_second": 4.399,
"step": 50
},
{
"epoch": 0.0021957913998170175,
"grad_norm": 1.6893078088760376,
"learning_rate": 0.00021673569945319091,
"loss": 1.2685,
"step": 60
},
{
"epoch": 0.0025617566331198535,
"grad_norm": 1.2931175231933594,
"learning_rate": 0.00021594408545846038,
"loss": 1.0429,
"step": 70
},
{
"epoch": 0.00292772186642269,
"grad_norm": 1.124497652053833,
"learning_rate": 0.0002146290146796179,
"loss": 1.0055,
"step": 80
},
{
"epoch": 0.003293687099725526,
"grad_norm": 2.295246124267578,
"learning_rate": 0.0002127968940093076,
"loss": 1.0593,
"step": 90
},
{
"epoch": 0.0036596523330283625,
"grad_norm": 2.5044991970062256,
"learning_rate": 0.00021045664935527106,
"loss": 1.0974,
"step": 100
},
{
"epoch": 0.0036596523330283625,
"eval_loss": 1.6511973142623901,
"eval_runtime": 653.846,
"eval_samples_per_second": 17.597,
"eval_steps_per_second": 4.4,
"step": 100
},
{
"epoch": 0.0040256175663311985,
"grad_norm": 1.1868317127227783,
"learning_rate": 0.00020761968215422217,
"loss": 1.1934,
"step": 110
},
{
"epoch": 0.004391582799634035,
"grad_norm": 1.1751034259796143,
"learning_rate": 0.00020429981382519356,
"loss": 0.8967,
"step": 120
},
{
"epoch": 0.004757548032936871,
"grad_norm": 1.8134667873382568,
"learning_rate": 0.00020051321843297219,
"loss": 0.959,
"step": 130
},
{
"epoch": 0.005123513266239707,
"grad_norm": 1.202273964881897,
"learning_rate": 0.0001962783438896818,
"loss": 1.0112,
"step": 140
},
{
"epoch": 0.0054894784995425435,
"grad_norm": 2.1259782314300537,
"learning_rate": 0.0001916158220784091,
"loss": 0.8748,
"step": 150
},
{
"epoch": 0.0054894784995425435,
"eval_loss": 1.6277738809585571,
"eval_runtime": 654.0264,
"eval_samples_per_second": 17.593,
"eval_steps_per_second": 4.399,
"step": 150
},
{
"epoch": 0.00585544373284538,
"grad_norm": 0.8180252313613892,
"learning_rate": 0.00018654836833674362,
"loss": 1.1652,
"step": 160
},
{
"epoch": 0.006221408966148216,
"grad_norm": 1.229128360748291,
"learning_rate": 0.0001811006707899361,
"loss": 0.9004,
"step": 170
},
{
"epoch": 0.006587374199451052,
"grad_norm": 2.126133680343628,
"learning_rate": 0.0001752992700728339,
"loss": 0.8799,
"step": 180
},
{
"epoch": 0.0069533394327538885,
"grad_norm": 1.0177737474441528,
"learning_rate": 0.00016917243002657602,
"loss": 0.8698,
"step": 190
},
{
"epoch": 0.007319304666056725,
"grad_norm": 2.6272943019866943,
"learning_rate": 0.00016275,
"loss": 0.8727,
"step": 200
},
{
"epoch": 0.007319304666056725,
"eval_loss": 1.3153941631317139,
"eval_runtime": 653.5309,
"eval_samples_per_second": 17.606,
"eval_steps_per_second": 4.402,
"step": 200
},
{
"epoch": 0.0076852698993595606,
"grad_norm": 0.8264740109443665,
"learning_rate": 0.0001560632694266149,
"loss": 0.9661,
"step": 210
},
{
"epoch": 0.008051235132662397,
"grad_norm": 0.7382209897041321,
"learning_rate": 0.00014914481538562646,
"loss": 0.8098,
"step": 220
},
{
"epoch": 0.008417200365965233,
"grad_norm": 0.9931471943855286,
"learning_rate": 0.0001420283438896818,
"loss": 0.7825,
"step": 230
},
{
"epoch": 0.00878316559926807,
"grad_norm": 0.7864680886268616,
"learning_rate": 0.00013474852567256393,
"loss": 0.8832,
"step": 240
},
{
"epoch": 0.009149130832570906,
"grad_norm": 2.8572616577148438,
"learning_rate": 0.00012734082727686196,
"loss": 0.8983,
"step": 250
},
{
"epoch": 0.009149130832570906,
"eval_loss": 1.1016149520874023,
"eval_runtime": 654.6016,
"eval_samples_per_second": 17.577,
"eval_steps_per_second": 4.395,
"step": 250
},
{
"epoch": 0.009515096065873741,
"grad_norm": 0.7957597970962524,
"learning_rate": 0.0001198413382645404,
"loss": 0.9094,
"step": 260
},
{
"epoch": 0.009881061299176578,
"grad_norm": 0.9204810261726379,
"learning_rate": 0.00011228659539222137,
"loss": 0.8334,
"step": 270
},
{
"epoch": 0.010247026532479414,
"grad_norm": 1.0104973316192627,
"learning_rate": 0.00010471340460777866,
"loss": 0.7847,
"step": 280
},
{
"epoch": 0.01061299176578225,
"grad_norm": 0.8759304285049438,
"learning_rate": 9.715866173545961e-05,
"loss": 0.7903,
"step": 290
},
{
"epoch": 0.010978956999085087,
"grad_norm": 3.211150646209717,
"learning_rate": 8.965917272313806e-05,
"loss": 0.7951,
"step": 300
},
{
"epoch": 0.010978956999085087,
"eval_loss": 0.9889414310455322,
"eval_runtime": 653.2449,
"eval_samples_per_second": 17.614,
"eval_steps_per_second": 4.404,
"step": 300
},
{
"epoch": 0.011344922232387923,
"grad_norm": 0.8451036810874939,
"learning_rate": 8.225147432743606e-05,
"loss": 0.7332,
"step": 310
},
{
"epoch": 0.01171088746569076,
"grad_norm": 0.6947381496429443,
"learning_rate": 7.497165611031821e-05,
"loss": 0.7797,
"step": 320
},
{
"epoch": 0.012076852698993596,
"grad_norm": 0.7813195586204529,
"learning_rate": 6.785518461437353e-05,
"loss": 0.7427,
"step": 330
},
{
"epoch": 0.012442817932296431,
"grad_norm": 1.1668095588684082,
"learning_rate": 6.093673057338509e-05,
"loss": 0.6847,
"step": 340
},
{
"epoch": 0.012808783165599268,
"grad_norm": 1.675475001335144,
"learning_rate": 5.4250000000000024e-05,
"loss": 0.7452,
"step": 350
},
{
"epoch": 0.012808783165599268,
"eval_loss": 0.799678385257721,
"eval_runtime": 653.6373,
"eval_samples_per_second": 17.603,
"eval_steps_per_second": 4.402,
"step": 350
},
{
"epoch": 0.013174748398902104,
"grad_norm": 0.8130257725715637,
"learning_rate": 4.782756997342398e-05,
"loss": 0.6847,
"step": 360
},
{
"epoch": 0.01354071363220494,
"grad_norm": 0.8550499081611633,
"learning_rate": 4.170072992716607e-05,
"loss": 0.6897,
"step": 370
},
{
"epoch": 0.013906678865507777,
"grad_norm": 0.7728462815284729,
"learning_rate": 3.5899329210063916e-05,
"loss": 0.7088,
"step": 380
},
{
"epoch": 0.014272644098810613,
"grad_norm": 1.0523146390914917,
"learning_rate": 3.045163166325637e-05,
"loss": 0.6873,
"step": 390
},
{
"epoch": 0.01463860933211345,
"grad_norm": 2.0533359050750732,
"learning_rate": 2.5384177921590895e-05,
"loss": 0.6468,
"step": 400
},
{
"epoch": 0.01463860933211345,
"eval_loss": 0.7176544666290283,
"eval_runtime": 654.5529,
"eval_samples_per_second": 17.578,
"eval_steps_per_second": 4.395,
"step": 400
},
{
"epoch": 0.015004574565416285,
"grad_norm": 0.7370195984840393,
"learning_rate": 2.0721656110318213e-05,
"loss": 0.6352,
"step": 410
},
{
"epoch": 0.015370539798719121,
"grad_norm": 0.7632651329040527,
"learning_rate": 1.6486781567027783e-05,
"loss": 0.6466,
"step": 420
},
{
"epoch": 0.01573650503202196,
"grad_norm": 0.9370796084403992,
"learning_rate": 1.2700186174806422e-05,
"loss": 0.6818,
"step": 430
},
{
"epoch": 0.016102470265324794,
"grad_norm": 0.8790647387504578,
"learning_rate": 9.380317845777794e-06,
"loss": 0.7192,
"step": 440
},
{
"epoch": 0.01646843549862763,
"grad_norm": 2.4070212841033936,
"learning_rate": 6.543350644728947e-06,
"loss": 0.6286,
"step": 450
},
{
"epoch": 0.01646843549862763,
"eval_loss": 0.6477181315422058,
"eval_runtime": 654.5484,
"eval_samples_per_second": 17.579,
"eval_steps_per_second": 4.395,
"step": 450
},
{
"epoch": 0.016834400731930467,
"grad_norm": 0.7986826300621033,
"learning_rate": 4.2031059906924e-06,
"loss": 0.5613,
"step": 460
},
{
"epoch": 0.0172003659652333,
"grad_norm": 0.9289596080780029,
"learning_rate": 2.3709853203820825e-06,
"loss": 0.6961,
"step": 470
},
{
"epoch": 0.01756633119853614,
"grad_norm": 0.9395463466644287,
"learning_rate": 1.0559145415396157e-06,
"loss": 0.6394,
"step": 480
},
{
"epoch": 0.017932296431838975,
"grad_norm": 0.949378252029419,
"learning_rate": 2.643005468090745e-07,
"loss": 0.6338,
"step": 490
},
{
"epoch": 0.018298261665141813,
"grad_norm": 1.6931450366973877,
"learning_rate": 0.0,
"loss": 0.6262,
"step": 500
},
{
"epoch": 0.018298261665141813,
"eval_loss": 0.6367893815040588,
"eval_runtime": 654.042,
"eval_samples_per_second": 17.592,
"eval_steps_per_second": 4.399,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.3877540397056e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}