leixa's picture
Training in progress, step 150, checkpoint
647a71e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.046667185190946564,
"eval_steps": 13,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00031111456793964375,
"eval_loss": 3.795616626739502,
"eval_runtime": 387.5587,
"eval_samples_per_second": 13.969,
"eval_steps_per_second": 1.747,
"step": 1
},
{
"epoch": 0.0009333437038189313,
"grad_norm": 1.8201688528060913,
"learning_rate": 1.5e-05,
"loss": 3.6704,
"step": 3
},
{
"epoch": 0.0018666874076378626,
"grad_norm": 1.9469057321548462,
"learning_rate": 3e-05,
"loss": 3.829,
"step": 6
},
{
"epoch": 0.002800031111456794,
"grad_norm": 2.4936416149139404,
"learning_rate": 4.5e-05,
"loss": 3.6616,
"step": 9
},
{
"epoch": 0.003733374815275725,
"grad_norm": 3.112192392349243,
"learning_rate": 4.997482666353287e-05,
"loss": 3.1786,
"step": 12
},
{
"epoch": 0.004044489383215369,
"eval_loss": 2.5751025676727295,
"eval_runtime": 389.7804,
"eval_samples_per_second": 13.89,
"eval_steps_per_second": 1.737,
"step": 13
},
{
"epoch": 0.004666718519094656,
"grad_norm": 2.7909445762634277,
"learning_rate": 4.984280524733107e-05,
"loss": 2.6525,
"step": 15
},
{
"epoch": 0.005600062222913588,
"grad_norm": 2.6461470127105713,
"learning_rate": 4.959823971496574e-05,
"loss": 2.2006,
"step": 18
},
{
"epoch": 0.00653340592673252,
"grad_norm": 2.1072444915771484,
"learning_rate": 4.9242238009417175e-05,
"loss": 1.9303,
"step": 21
},
{
"epoch": 0.00746674963055145,
"grad_norm": 1.8727749586105347,
"learning_rate": 4.877641290737884e-05,
"loss": 1.8151,
"step": 24
},
{
"epoch": 0.008088978766430738,
"eval_loss": 1.7247991561889648,
"eval_runtime": 389.9247,
"eval_samples_per_second": 13.885,
"eval_steps_per_second": 1.736,
"step": 26
},
{
"epoch": 0.008400093334370381,
"grad_norm": 1.5817272663116455,
"learning_rate": 4.820287471297598e-05,
"loss": 1.6959,
"step": 27
},
{
"epoch": 0.009333437038189313,
"grad_norm": 1.7775505781173706,
"learning_rate": 4.752422169756048e-05,
"loss": 1.7599,
"step": 30
},
{
"epoch": 0.010266780742008244,
"grad_norm": 1.512369155883789,
"learning_rate": 4.674352832889239e-05,
"loss": 1.6668,
"step": 33
},
{
"epoch": 0.011200124445827176,
"grad_norm": 1.427876353263855,
"learning_rate": 4.586433134303257e-05,
"loss": 1.6359,
"step": 36
},
{
"epoch": 0.012133468149646108,
"grad_norm": 1.557162880897522,
"learning_rate": 4.489061372204453e-05,
"loss": 1.5442,
"step": 39
},
{
"epoch": 0.012133468149646108,
"eval_loss": 1.5903956890106201,
"eval_runtime": 389.6183,
"eval_samples_per_second": 13.896,
"eval_steps_per_second": 1.738,
"step": 39
},
{
"epoch": 0.01306681185346504,
"grad_norm": 1.3834956884384155,
"learning_rate": 4.382678665009028e-05,
"loss": 1.5914,
"step": 42
},
{
"epoch": 0.01400015555728397,
"grad_norm": 1.2795754671096802,
"learning_rate": 4.267766952966369e-05,
"loss": 1.512,
"step": 45
},
{
"epoch": 0.0149334992611029,
"grad_norm": 1.383254885673523,
"learning_rate": 4.144846814849282e-05,
"loss": 1.5425,
"step": 48
},
{
"epoch": 0.015866842964921832,
"grad_norm": 1.4243319034576416,
"learning_rate": 4.01447510960205e-05,
"loss": 1.5153,
"step": 51
},
{
"epoch": 0.016177957532861477,
"eval_loss": 1.5446308851242065,
"eval_runtime": 390.0683,
"eval_samples_per_second": 13.88,
"eval_steps_per_second": 1.736,
"step": 52
},
{
"epoch": 0.016800186668740762,
"grad_norm": 1.3810615539550781,
"learning_rate": 3.8772424536302564e-05,
"loss": 1.5528,
"step": 54
},
{
"epoch": 0.017733530372559696,
"grad_norm": 1.4023598432540894,
"learning_rate": 3.7337705451608674e-05,
"loss": 1.5652,
"step": 57
},
{
"epoch": 0.018666874076378626,
"grad_norm": 1.2947098016738892,
"learning_rate": 3.5847093477938956e-05,
"loss": 1.5906,
"step": 60
},
{
"epoch": 0.01960021778019756,
"grad_norm": 1.40787935256958,
"learning_rate": 3.4307341460048633e-05,
"loss": 1.4431,
"step": 63
},
{
"epoch": 0.020222446916076844,
"eval_loss": 1.5237524509429932,
"eval_runtime": 389.7942,
"eval_samples_per_second": 13.889,
"eval_steps_per_second": 1.737,
"step": 65
},
{
"epoch": 0.02053356148401649,
"grad_norm": 1.2801828384399414,
"learning_rate": 3.272542485937369e-05,
"loss": 1.5227,
"step": 66
},
{
"epoch": 0.02146690518783542,
"grad_norm": 1.3553149700164795,
"learning_rate": 3.110851015344735e-05,
"loss": 1.5033,
"step": 69
},
{
"epoch": 0.022400248891654352,
"grad_norm": 1.339362621307373,
"learning_rate": 2.9463922369965917e-05,
"loss": 1.517,
"step": 72
},
{
"epoch": 0.023333592595473282,
"grad_norm": 1.307166337966919,
"learning_rate": 2.7799111902582696e-05,
"loss": 1.4759,
"step": 75
},
{
"epoch": 0.024266936299292215,
"grad_norm": 1.2301127910614014,
"learning_rate": 2.6121620758762877e-05,
"loss": 1.4794,
"step": 78
},
{
"epoch": 0.024266936299292215,
"eval_loss": 1.506866216659546,
"eval_runtime": 389.7553,
"eval_samples_per_second": 13.891,
"eval_steps_per_second": 1.737,
"step": 78
},
{
"epoch": 0.025200280003111145,
"grad_norm": 1.2243834733963013,
"learning_rate": 2.443904839260488e-05,
"loss": 1.4497,
"step": 81
},
{
"epoch": 0.02613362370693008,
"grad_norm": 1.1867769956588745,
"learning_rate": 2.2759017277414166e-05,
"loss": 1.4718,
"step": 84
},
{
"epoch": 0.02706696741074901,
"grad_norm": 1.1967134475708008,
"learning_rate": 2.1089138373994223e-05,
"loss": 1.5324,
"step": 87
},
{
"epoch": 0.02800031111456794,
"grad_norm": 1.2109935283660889,
"learning_rate": 1.9436976651092144e-05,
"loss": 1.5148,
"step": 90
},
{
"epoch": 0.028311425682507583,
"eval_loss": 1.4927889108657837,
"eval_runtime": 389.8573,
"eval_samples_per_second": 13.887,
"eval_steps_per_second": 1.737,
"step": 91
},
{
"epoch": 0.028933654818386872,
"grad_norm": 1.2042676210403442,
"learning_rate": 1.781001681419957e-05,
"loss": 1.5146,
"step": 93
},
{
"epoch": 0.0298669985222058,
"grad_norm": 1.2717711925506592,
"learning_rate": 1.621562939796643e-05,
"loss": 1.4681,
"step": 96
},
{
"epoch": 0.030800342226024735,
"grad_norm": 1.1750036478042603,
"learning_rate": 1.466103737583699e-05,
"loss": 1.5074,
"step": 99
},
{
"epoch": 0.031733685929843665,
"grad_norm": 1.3590319156646729,
"learning_rate": 1.3153283438175034e-05,
"loss": 1.5168,
"step": 102
},
{
"epoch": 0.032355915065722954,
"eval_loss": 1.4859765768051147,
"eval_runtime": 389.9351,
"eval_samples_per_second": 13.884,
"eval_steps_per_second": 1.736,
"step": 104
},
{
"epoch": 0.032667029633662595,
"grad_norm": 1.15152907371521,
"learning_rate": 1.1699198087116589e-05,
"loss": 1.4838,
"step": 105
},
{
"epoch": 0.033600373337481525,
"grad_norm": 1.2240478992462158,
"learning_rate": 1.0305368692688174e-05,
"loss": 1.5484,
"step": 108
},
{
"epoch": 0.03453371704130046,
"grad_norm": 1.2042181491851807,
"learning_rate": 8.978109650374397e-06,
"loss": 1.5173,
"step": 111
},
{
"epoch": 0.03546706074511939,
"grad_norm": 1.205437183380127,
"learning_rate": 7.723433775328384e-06,
"loss": 1.472,
"step": 114
},
{
"epoch": 0.03640040444893832,
"grad_norm": 1.2650120258331299,
"learning_rate": 6.547025062816486e-06,
"loss": 1.5346,
"step": 117
},
{
"epoch": 0.03640040444893832,
"eval_loss": 1.4806684255599976,
"eval_runtime": 390.1832,
"eval_samples_per_second": 13.876,
"eval_steps_per_second": 1.735,
"step": 117
},
{
"epoch": 0.03733374815275725,
"grad_norm": 1.1355212926864624,
"learning_rate": 5.454212938299255e-06,
"loss": 1.4929,
"step": 120
},
{
"epoch": 0.03826709185657618,
"grad_norm": 1.2554372549057007,
"learning_rate": 4.4499481138022544e-06,
"loss": 1.4357,
"step": 123
},
{
"epoch": 0.03920043556039512,
"grad_norm": 1.289537787437439,
"learning_rate": 3.5387801599533475e-06,
"loss": 1.4257,
"step": 126
},
{
"epoch": 0.04013377926421405,
"grad_norm": 1.439562201499939,
"learning_rate": 2.7248368952908053e-06,
"loss": 1.4607,
"step": 129
},
{
"epoch": 0.04044489383215369,
"eval_loss": 1.4785248041152954,
"eval_runtime": 390.0687,
"eval_samples_per_second": 13.88,
"eval_steps_per_second": 1.736,
"step": 130
},
{
"epoch": 0.04106712296803298,
"grad_norm": 1.2261285781860352,
"learning_rate": 2.0118056862137357e-06,
"loss": 1.5202,
"step": 132
},
{
"epoch": 0.04200046667185191,
"grad_norm": 1.3027156591415405,
"learning_rate": 1.4029167422908107e-06,
"loss": 1.5151,
"step": 135
},
{
"epoch": 0.04293381037567084,
"grad_norm": 1.1372510194778442,
"learning_rate": 9.009284826036691e-07,
"loss": 1.5352,
"step": 138
},
{
"epoch": 0.043867154079489774,
"grad_norm": 1.248124122619629,
"learning_rate": 5.08115039419113e-07,
"loss": 1.5216,
"step": 141
},
{
"epoch": 0.044489383215369056,
"eval_loss": 1.4776010513305664,
"eval_runtime": 389.8727,
"eval_samples_per_second": 13.887,
"eval_steps_per_second": 1.736,
"step": 143
},
{
"epoch": 0.044800497783308704,
"grad_norm": 1.2946685552597046,
"learning_rate": 2.262559558016325e-07,
"loss": 1.5154,
"step": 144
},
{
"epoch": 0.045733841487127634,
"grad_norm": 1.1807682514190674,
"learning_rate": 5.662812383859795e-08,
"loss": 1.4998,
"step": 147
},
{
"epoch": 0.046667185190946564,
"grad_norm": 1.2081470489501953,
"learning_rate": 0.0,
"loss": 1.5239,
"step": 150
}
],
"logging_steps": 3,
"max_steps": 150,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 13,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.225661850681344e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}