{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.7027027027027026,
"eval_steps": 17,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013513513513513514,
"eval_loss": 1.4550755023956299,
"eval_runtime": 16.9459,
"eval_samples_per_second": 7.376,
"eval_steps_per_second": 0.944,
"step": 1
},
{
"epoch": 0.04054054054054054,
"grad_norm": 0.2170189917087555,
"learning_rate": 3e-05,
"loss": 1.4534,
"step": 3
},
{
"epoch": 0.08108108108108109,
"grad_norm": 0.25585711002349854,
"learning_rate": 6e-05,
"loss": 1.463,
"step": 6
},
{
"epoch": 0.12162162162162163,
"grad_norm": 0.24171331524848938,
"learning_rate": 9e-05,
"loss": 1.4157,
"step": 9
},
{
"epoch": 0.16216216216216217,
"grad_norm": 0.2852655351161957,
"learning_rate": 9.997266286704631e-05,
"loss": 1.3769,
"step": 12
},
{
"epoch": 0.20270270270270271,
"grad_norm": 0.3118453323841095,
"learning_rate": 9.98292246503335e-05,
"loss": 1.3167,
"step": 15
},
{
"epoch": 0.22972972972972974,
"eval_loss": 1.230002760887146,
"eval_runtime": 17.2352,
"eval_samples_per_second": 7.253,
"eval_steps_per_second": 0.928,
"step": 17
},
{
"epoch": 0.24324324324324326,
"grad_norm": 0.3110101521015167,
"learning_rate": 9.956320346634876e-05,
"loss": 1.2565,
"step": 18
},
{
"epoch": 0.28378378378378377,
"grad_norm": 0.2778882086277008,
"learning_rate": 9.917525374361912e-05,
"loss": 1.216,
"step": 21
},
{
"epoch": 0.32432432432432434,
"grad_norm": 0.27365806698799133,
"learning_rate": 9.86663298624003e-05,
"loss": 1.2426,
"step": 24
},
{
"epoch": 0.36486486486486486,
"grad_norm": 0.27578839659690857,
"learning_rate": 9.803768380684242e-05,
"loss": 1.1636,
"step": 27
},
{
"epoch": 0.40540540540540543,
"grad_norm": 0.3146299123764038,
"learning_rate": 9.729086208503174e-05,
"loss": 1.1645,
"step": 30
},
{
"epoch": 0.44594594594594594,
"grad_norm": 0.3067275583744049,
"learning_rate": 9.642770192448536e-05,
"loss": 1.1556,
"step": 33
},
{
"epoch": 0.4594594594594595,
"eval_loss": 1.0813400745391846,
"eval_runtime": 17.2384,
"eval_samples_per_second": 7.251,
"eval_steps_per_second": 0.928,
"step": 34
},
{
"epoch": 0.4864864864864865,
"grad_norm": 0.30459922552108765,
"learning_rate": 9.545032675245813e-05,
"loss": 1.1033,
"step": 36
},
{
"epoch": 0.527027027027027,
"grad_norm": 0.31739258766174316,
"learning_rate": 9.43611409721806e-05,
"loss": 1.0876,
"step": 39
},
{
"epoch": 0.5675675675675675,
"grad_norm": 0.3213581144809723,
"learning_rate": 9.316282404787871e-05,
"loss": 1.0733,
"step": 42
},
{
"epoch": 0.6081081081081081,
"grad_norm": 0.3462018370628357,
"learning_rate": 9.185832391312644e-05,
"loss": 1.1107,
"step": 45
},
{
"epoch": 0.6486486486486487,
"grad_norm": 0.3432579040527344,
"learning_rate": 9.045084971874738e-05,
"loss": 1.0265,
"step": 48
},
{
"epoch": 0.6891891891891891,
"grad_norm": 0.3420480191707611,
"learning_rate": 8.894386393810563e-05,
"loss": 1.053,
"step": 51
},
{
"epoch": 0.6891891891891891,
"eval_loss": 1.0123796463012695,
"eval_runtime": 17.2325,
"eval_samples_per_second": 7.254,
"eval_steps_per_second": 0.928,
"step": 51
},
{
"epoch": 0.7297297297297297,
"grad_norm": 0.35601529479026794,
"learning_rate": 8.73410738492077e-05,
"loss": 1.0257,
"step": 54
},
{
"epoch": 0.7702702702702703,
"grad_norm": 0.3662780821323395,
"learning_rate": 8.564642241456986e-05,
"loss": 1.0528,
"step": 57
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.39324405789375305,
"learning_rate": 8.386407858128706e-05,
"loss": 1.0185,
"step": 60
},
{
"epoch": 0.8513513513513513,
"grad_norm": 0.39999616146087646,
"learning_rate": 8.199842702516583e-05,
"loss": 1.0132,
"step": 63
},
{
"epoch": 0.8918918918918919,
"grad_norm": 0.4472688138484955,
"learning_rate": 8.005405736415126e-05,
"loss": 1.0243,
"step": 66
},
{
"epoch": 0.918918918918919,
"eval_loss": 0.9729012250900269,
"eval_runtime": 17.2325,
"eval_samples_per_second": 7.254,
"eval_steps_per_second": 0.928,
"step": 68
},
{
"epoch": 0.9324324324324325,
"grad_norm": 0.3893688917160034,
"learning_rate": 7.803575286758364e-05,
"loss": 0.9998,
"step": 69
},
{
"epoch": 0.972972972972973,
"grad_norm": 0.38515031337738037,
"learning_rate": 7.594847868906076e-05,
"loss": 0.9943,
"step": 72
},
{
"epoch": 1.0135135135135136,
"grad_norm": 0.4097621738910675,
"learning_rate": 7.379736965185368e-05,
"loss": 0.9972,
"step": 75
},
{
"epoch": 1.054054054054054,
"grad_norm": 0.4480155408382416,
"learning_rate": 7.158771761692464e-05,
"loss": 0.9534,
"step": 78
},
{
"epoch": 1.0945945945945945,
"grad_norm": 0.42299342155456543,
"learning_rate": 6.932495846462261e-05,
"loss": 0.9408,
"step": 81
},
{
"epoch": 1.135135135135135,
"grad_norm": 0.41073334217071533,
"learning_rate": 6.701465872208216e-05,
"loss": 0.9503,
"step": 84
},
{
"epoch": 1.1486486486486487,
"eval_loss": 0.9501034021377563,
"eval_runtime": 17.2388,
"eval_samples_per_second": 7.251,
"eval_steps_per_second": 0.928,
"step": 85
},
{
"epoch": 1.1756756756756757,
"grad_norm": 0.49050870537757874,
"learning_rate": 6.466250186922325e-05,
"loss": 0.9146,
"step": 87
},
{
"epoch": 1.2162162162162162,
"grad_norm": 0.48001334071159363,
"learning_rate": 6.227427435703997e-05,
"loss": 0.9215,
"step": 90
},
{
"epoch": 1.2567567567567568,
"grad_norm": 0.47052550315856934,
"learning_rate": 5.985585137257401e-05,
"loss": 0.8942,
"step": 93
},
{
"epoch": 1.2972972972972974,
"grad_norm": 0.46515583992004395,
"learning_rate": 5.74131823855921e-05,
"loss": 0.9304,
"step": 96
},
{
"epoch": 1.3378378378378377,
"grad_norm": 0.5048130750656128,
"learning_rate": 5.495227651252315e-05,
"loss": 0.95,
"step": 99
},
{
"epoch": 1.3783783783783785,
"grad_norm": 0.5172950029373169,
"learning_rate": 5.247918773366112e-05,
"loss": 0.9288,
"step": 102
},
{
"epoch": 1.3783783783783785,
"eval_loss": 0.9384378790855408,
"eval_runtime": 17.2324,
"eval_samples_per_second": 7.254,
"eval_steps_per_second": 0.928,
"step": 102
},
{
"epoch": 1.4189189189189189,
"grad_norm": 0.5409041047096252,
"learning_rate": 5e-05,
"loss": 0.9294,
"step": 105
},
{
"epoch": 1.4594594594594594,
"grad_norm": 0.49371689558029175,
"learning_rate": 4.7520812266338885e-05,
"loss": 0.9109,
"step": 108
},
{
"epoch": 1.5,
"grad_norm": 0.4969201982021332,
"learning_rate": 4.504772348747687e-05,
"loss": 0.94,
"step": 111
},
{
"epoch": 1.5405405405405406,
"grad_norm": 0.5152857899665833,
"learning_rate": 4.2586817614407895e-05,
"loss": 0.9083,
"step": 114
},
{
"epoch": 1.5810810810810811,
"grad_norm": 0.5423870086669922,
"learning_rate": 4.0144148627425993e-05,
"loss": 0.8869,
"step": 117
},
{
"epoch": 1.6081081081081081,
"eval_loss": 0.924616813659668,
"eval_runtime": 17.244,
"eval_samples_per_second": 7.249,
"eval_steps_per_second": 0.928,
"step": 119
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.49147549271583557,
"learning_rate": 3.772572564296005e-05,
"loss": 0.8855,
"step": 120
},
{
"epoch": 1.6621621621621623,
"grad_norm": 0.4987981915473938,
"learning_rate": 3.533749813077677e-05,
"loss": 0.8915,
"step": 123
},
{
"epoch": 1.7027027027027026,
"grad_norm": 0.4984021484851837,
"learning_rate": 3.298534127791785e-05,
"loss": 0.9181,
"step": 126
},
{
"epoch": 1.7432432432432432,
"grad_norm": 0.5324372053146362,
"learning_rate": 3.0675041535377405e-05,
"loss": 0.8911,
"step": 129
},
{
"epoch": 1.7837837837837838,
"grad_norm": 0.521675169467926,
"learning_rate": 2.8412282383075363e-05,
"loss": 0.8905,
"step": 132
},
{
"epoch": 1.8243243243243243,
"grad_norm": 0.5264241099357605,
"learning_rate": 2.6202630348146324e-05,
"loss": 0.8928,
"step": 135
},
{
"epoch": 1.8378378378378377,
"eval_loss": 0.9162411689758301,
"eval_runtime": 17.2393,
"eval_samples_per_second": 7.251,
"eval_steps_per_second": 0.928,
"step": 136
},
{
"epoch": 1.864864864864865,
"grad_norm": 0.5409694910049438,
"learning_rate": 2.405152131093926e-05,
"loss": 0.9095,
"step": 138
},
{
"epoch": 1.9054054054054053,
"grad_norm": 0.5272465348243713,
"learning_rate": 2.196424713241637e-05,
"loss": 0.8844,
"step": 141
},
{
"epoch": 1.945945945945946,
"grad_norm": 0.5436295866966248,
"learning_rate": 1.9945942635848748e-05,
"loss": 0.8699,
"step": 144
},
{
"epoch": 1.9864864864864864,
"grad_norm": 0.523435115814209,
"learning_rate": 1.800157297483417e-05,
"loss": 0.9164,
"step": 147
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.5417100787162781,
"learning_rate": 1.6135921418712956e-05,
"loss": 0.8899,
"step": 150
},
{
"epoch": 2.0675675675675675,
"grad_norm": 0.569843053817749,
"learning_rate": 1.435357758543015e-05,
"loss": 0.8476,
"step": 153
},
{
"epoch": 2.0675675675675675,
"eval_loss": 0.9106208682060242,
"eval_runtime": 17.255,
"eval_samples_per_second": 7.244,
"eval_steps_per_second": 0.927,
"step": 153
},
{
"epoch": 2.108108108108108,
"grad_norm": 0.575161337852478,
"learning_rate": 1.2658926150792322e-05,
"loss": 0.8375,
"step": 156
},
{
"epoch": 2.1486486486486487,
"grad_norm": 0.5137052536010742,
"learning_rate": 1.1056136061894384e-05,
"loss": 0.823,
"step": 159
},
{
"epoch": 2.189189189189189,
"grad_norm": 0.532635509967804,
"learning_rate": 9.549150281252633e-06,
"loss": 0.8504,
"step": 162
},
{
"epoch": 2.22972972972973,
"grad_norm": 0.5798172950744629,
"learning_rate": 8.141676086873572e-06,
"loss": 0.8315,
"step": 165
},
{
"epoch": 2.27027027027027,
"grad_norm": 0.5635726451873779,
"learning_rate": 6.837175952121306e-06,
"loss": 0.8626,
"step": 168
},
{
"epoch": 2.2972972972972974,
"eval_loss": 0.9093061089515686,
"eval_runtime": 17.2439,
"eval_samples_per_second": 7.249,
"eval_steps_per_second": 0.928,
"step": 170
},
{
"epoch": 2.310810810810811,
"grad_norm": 0.5467652082443237,
"learning_rate": 5.6388590278194096e-06,
"loss": 0.8334,
"step": 171
},
{
"epoch": 2.3513513513513513,
"grad_norm": 0.534943699836731,
"learning_rate": 4.549673247541875e-06,
"loss": 0.8773,
"step": 174
},
{
"epoch": 2.391891891891892,
"grad_norm": 0.5133833885192871,
"learning_rate": 3.5722980755146517e-06,
"loss": 0.8081,
"step": 177
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.5191574096679688,
"learning_rate": 2.7091379149682685e-06,
"loss": 0.8595,
"step": 180
},
{
"epoch": 2.472972972972973,
"grad_norm": 0.5596672892570496,
"learning_rate": 1.962316193157593e-06,
"loss": 0.8525,
"step": 183
},
{
"epoch": 2.5135135135135136,
"grad_norm": 0.5567631125450134,
"learning_rate": 1.333670137599713e-06,
"loss": 0.8269,
"step": 186
},
{
"epoch": 2.527027027027027,
"eval_loss": 0.9086253643035889,
"eval_runtime": 17.2348,
"eval_samples_per_second": 7.253,
"eval_steps_per_second": 0.928,
"step": 187
},
{
"epoch": 2.554054054054054,
"grad_norm": 0.5464707016944885,
"learning_rate": 8.247462563808817e-07,
"loss": 0.8524,
"step": 189
},
{
"epoch": 2.5945945945945947,
"grad_norm": 0.6008809804916382,
"learning_rate": 4.367965336512403e-07,
"loss": 0.8633,
"step": 192
},
{
"epoch": 2.635135135135135,
"grad_norm": 0.5401892066001892,
"learning_rate": 1.7077534966650766e-07,
"loss": 0.8186,
"step": 195
},
{
"epoch": 2.6756756756756754,
"grad_norm": 0.5296628475189209,
"learning_rate": 2.7337132953697554e-08,
"loss": 0.8183,
"step": 198
}
],
"logging_steps": 3,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 17,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.262308567427318e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
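
A minimal sketch of how the trainer state above can be inspected offline, assuming the standard Hugging Face Transformers checkpoint layout (the "checkpoint-200/trainer_state.json" path is an assumption, not part of this repository's documented usage):

import json

# Load the trainer state emitted by transformers.Trainer
# (path is an assumption; adjust to wherever this file lives).
with open("checkpoint-200/trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training-loss entries and evaluation entries;
# split them by which key each record carries.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"logged train steps: {len(train_logs)}, eval points: {len(eval_logs)}")
for e in eval_logs:
    print(f"step {e['step']:>3}  epoch {e['epoch']:.2f}  eval_loss {e['eval_loss']:.4f}")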