{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.016036655211912942,
"eval_steps": 34,
"global_step": 238,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 6.738090425173506e-05,
"eval_loss": 1.0482484102249146,
"eval_runtime": 1771.3022,
"eval_samples_per_second": 14.111,
"eval_steps_per_second": 1.764,
"step": 1
},
{
"epoch": 0.00020214271275520516,
"grad_norm": 2.1362860202789307,
"learning_rate": 1.5e-05,
"loss": 1.0166,
"step": 3
},
{
"epoch": 0.0004042854255104103,
"grad_norm": 1.772956371307373,
"learning_rate": 3e-05,
"loss": 1.0004,
"step": 6
},
{
"epoch": 0.0006064281382656155,
"grad_norm": 1.3759489059448242,
"learning_rate": 4.5e-05,
"loss": 0.9564,
"step": 9
},
{
"epoch": 0.0008085708510208206,
"grad_norm": 1.2244105339050293,
"learning_rate": 4.999675562428437e-05,
"loss": 0.891,
"step": 12
},
{
"epoch": 0.001010713563776026,
"grad_norm": 1.1622825860977173,
"learning_rate": 4.9979724954289244e-05,
"loss": 0.8578,
"step": 15
},
{
"epoch": 0.001212856276531231,
"grad_norm": 1.089287281036377,
"learning_rate": 4.994810682835951e-05,
"loss": 0.832,
"step": 18
},
{
"epoch": 0.0014149989892864362,
"grad_norm": 1.685937762260437,
"learning_rate": 4.990191971059033e-05,
"loss": 0.8445,
"step": 21
},
{
"epoch": 0.0016171417020416413,
"grad_norm": 0.9353536367416382,
"learning_rate": 4.984119057295783e-05,
"loss": 0.8481,
"step": 24
},
{
"epoch": 0.0018192844147968466,
"grad_norm": 0.9442921876907349,
"learning_rate": 4.976595487956823e-05,
"loss": 0.8389,
"step": 27
},
{
"epoch": 0.002021427127552052,
"grad_norm": 1.0226161479949951,
"learning_rate": 4.967625656594782e-05,
"loss": 0.8224,
"step": 30
},
{
"epoch": 0.002223569840307257,
"grad_norm": 0.8120137453079224,
"learning_rate": 4.957214801338581e-05,
"loss": 0.849,
"step": 33
},
{
"epoch": 0.002290950744558992,
"eval_loss": 0.8384992480278015,
"eval_runtime": 1781.1941,
"eval_samples_per_second": 14.033,
"eval_steps_per_second": 1.754,
"step": 34
},
{
"epoch": 0.002425712553062462,
"grad_norm": 0.8147669434547424,
"learning_rate": 4.9453690018345144e-05,
"loss": 0.8287,
"step": 36
},
{
"epoch": 0.0026278552658176675,
"grad_norm": 0.9256265163421631,
"learning_rate": 4.932095175695911e-05,
"loss": 0.8112,
"step": 39
},
{
"epoch": 0.0028299979785728724,
"grad_norm": 0.9979642033576965,
"learning_rate": 4.917401074463441e-05,
"loss": 0.8657,
"step": 42
},
{
"epoch": 0.0030321406913280777,
"grad_norm": 0.7800766825675964,
"learning_rate": 4.901295279078431e-05,
"loss": 0.7922,
"step": 45
},
{
"epoch": 0.0032342834040832826,
"grad_norm": 0.7466667294502258,
"learning_rate": 4.883787194871841e-05,
"loss": 0.8455,
"step": 48
},
{
"epoch": 0.003436426116838488,
"grad_norm": 0.7748194932937622,
"learning_rate": 4.864887046071813e-05,
"loss": 0.8259,
"step": 51
},
{
"epoch": 0.0036385688295936932,
"grad_norm": 1.103950023651123,
"learning_rate": 4.8446058698330115e-05,
"loss": 0.8387,
"step": 54
},
{
"epoch": 0.003840711542348898,
"grad_norm": 0.7845460176467896,
"learning_rate": 4.822955509791233e-05,
"loss": 0.8067,
"step": 57
},
{
"epoch": 0.004042854255104104,
"grad_norm": 0.7884831428527832,
"learning_rate": 4.799948609147061e-05,
"loss": 0.8231,
"step": 60
},
{
"epoch": 0.004244996967859308,
"grad_norm": 0.9247403144836426,
"learning_rate": 4.7755986032825864e-05,
"loss": 0.8159,
"step": 63
},
{
"epoch": 0.004447139680614514,
"grad_norm": 0.7747366428375244,
"learning_rate": 4.74991971191553e-05,
"loss": 0.8132,
"step": 66
},
{
"epoch": 0.004581901489117984,
"eval_loss": 0.8286266922950745,
"eval_runtime": 1781.5654,
"eval_samples_per_second": 14.03,
"eval_steps_per_second": 1.754,
"step": 68
},
{
"epoch": 0.004649282393369719,
"grad_norm": 0.7278714776039124,
"learning_rate": 4.7229269307953235e-05,
"loss": 0.741,
"step": 69
},
{
"epoch": 0.004851425106124924,
"grad_norm": 0.9090484380722046,
"learning_rate": 4.694636022946012e-05,
"loss": 0.8075,
"step": 72
},
{
"epoch": 0.00505356781888013,
"grad_norm": 0.8673194050788879,
"learning_rate": 4.665063509461097e-05,
"loss": 0.8395,
"step": 75
},
{
"epoch": 0.005255710531635335,
"grad_norm": 0.7307804822921753,
"learning_rate": 4.6342266598556814e-05,
"loss": 0.7995,
"step": 78
},
{
"epoch": 0.005457853244390539,
"grad_norm": 0.9184255003929138,
"learning_rate": 4.6021434819815555e-05,
"loss": 0.8318,
"step": 81
},
{
"epoch": 0.005659995957145745,
"grad_norm": 0.7924419045448303,
"learning_rate": 4.568832711511125e-05,
"loss": 0.8095,
"step": 84
},
{
"epoch": 0.00586213866990095,
"grad_norm": 0.7330142259597778,
"learning_rate": 4.534313800996299e-05,
"loss": 0.7674,
"step": 87
},
{
"epoch": 0.006064281382656155,
"grad_norm": 0.7591224908828735,
"learning_rate": 4.498606908508754e-05,
"loss": 0.8409,
"step": 90
},
{
"epoch": 0.006266424095411361,
"grad_norm": 0.7741730213165283,
"learning_rate": 4.46173288586818e-05,
"loss": 0.8541,
"step": 93
},
{
"epoch": 0.006468566808166565,
"grad_norm": 0.7202086448669434,
"learning_rate": 4.4237132664654154e-05,
"loss": 0.85,
"step": 96
},
{
"epoch": 0.0066707095209217705,
"grad_norm": 0.7738878726959229,
"learning_rate": 4.384570252687542e-05,
"loss": 0.8571,
"step": 99
},
{
"epoch": 0.006872852233676976,
"grad_norm": 0.7431773543357849,
"learning_rate": 4.344326702952326e-05,
"loss": 0.8264,
"step": 102
},
{
"epoch": 0.006872852233676976,
"eval_loss": 0.8239989280700684,
"eval_runtime": 1781.4353,
"eval_samples_per_second": 14.031,
"eval_steps_per_second": 1.754,
"step": 102
},
{
"epoch": 0.007074994946432181,
"grad_norm": 0.7040189504623413,
"learning_rate": 4.303006118359537e-05,
"loss": 0.8247,
"step": 105
},
{
"epoch": 0.0072771376591873865,
"grad_norm": 0.7915964722633362,
"learning_rate": 4.260632628966974e-05,
"loss": 0.8551,
"step": 108
},
{
"epoch": 0.007479280371942592,
"grad_norm": 0.7852084040641785,
"learning_rate": 4.217230979699188e-05,
"loss": 0.8425,
"step": 111
},
{
"epoch": 0.007681423084697796,
"grad_norm": 0.6728894114494324,
"learning_rate": 4.172826515897146e-05,
"loss": 0.8141,
"step": 114
},
{
"epoch": 0.007883565797453002,
"grad_norm": 0.7391681671142578,
"learning_rate": 4.12744516851726e-05,
"loss": 0.7987,
"step": 117
},
{
"epoch": 0.008085708510208208,
"grad_norm": 0.7469043135643005,
"learning_rate": 4.0811134389884433e-05,
"loss": 0.7909,
"step": 120
},
{
"epoch": 0.008287851222963412,
"grad_norm": 0.7632879614830017,
"learning_rate": 4.0338583837360225e-05,
"loss": 0.8031,
"step": 123
},
{
"epoch": 0.008489993935718617,
"grad_norm": 0.7656901478767395,
"learning_rate": 3.985707598381544e-05,
"loss": 0.843,
"step": 126
},
{
"epoch": 0.008692136648473823,
"grad_norm": 0.8024786114692688,
"learning_rate": 3.9366892016277096e-05,
"loss": 0.8403,
"step": 129
},
{
"epoch": 0.008894279361229027,
"grad_norm": 0.6944208145141602,
"learning_rate": 3.886831818837847e-05,
"loss": 0.7908,
"step": 132
},
{
"epoch": 0.009096422073984234,
"grad_norm": 0.719901442527771,
"learning_rate": 3.8361645653195026e-05,
"loss": 0.8151,
"step": 135
},
{
"epoch": 0.009163802978235968,
"eval_loss": 0.8194563388824463,
"eval_runtime": 1782.0949,
"eval_samples_per_second": 14.026,
"eval_steps_per_second": 1.754,
"step": 136
},
{
"epoch": 0.009298564786739438,
"grad_norm": 0.6918753981590271,
"learning_rate": 3.784717029321922e-05,
"loss": 0.8194,
"step": 138
},
{
"epoch": 0.009500707499494642,
"grad_norm": 0.7483247518539429,
"learning_rate": 3.732519254757344e-05,
"loss": 0.8422,
"step": 141
},
{
"epoch": 0.009702850212249849,
"grad_norm": 0.7642280459403992,
"learning_rate": 3.679601723656205e-05,
"loss": 0.8222,
"step": 144
},
{
"epoch": 0.009904992925005053,
"grad_norm": 0.7145370244979858,
"learning_rate": 3.625995338366492e-05,
"loss": 0.8073,
"step": 147
},
{
"epoch": 0.01010713563776026,
"grad_norm": 0.732183039188385,
"learning_rate": 3.5717314035076355e-05,
"loss": 0.8163,
"step": 150
},
{
"epoch": 0.010309278350515464,
"grad_norm": 0.6954637765884399,
"learning_rate": 3.516841607689501e-05,
"loss": 0.7573,
"step": 153
},
{
"epoch": 0.01051142106327067,
"grad_norm": 0.7373840808868408,
"learning_rate": 3.461358005007128e-05,
"loss": 0.7868,
"step": 156
},
{
"epoch": 0.010713563776025874,
"grad_norm": 0.7047626376152039,
"learning_rate": 3.405312996322042e-05,
"loss": 0.821,
"step": 159
},
{
"epoch": 0.010915706488781079,
"grad_norm": 0.7702988982200623,
"learning_rate": 3.348739310341068e-05,
"loss": 0.8194,
"step": 162
},
{
"epoch": 0.011117849201536285,
"grad_norm": 0.7867685556411743,
"learning_rate": 3.2916699845036816e-05,
"loss": 0.7898,
"step": 165
},
{
"epoch": 0.01131999191429149,
"grad_norm": 0.7021005153656006,
"learning_rate": 3.234138345689077e-05,
"loss": 0.7621,
"step": 168
},
{
"epoch": 0.011454753722794959,
"eval_loss": 0.8163909316062927,
"eval_runtime": 1780.9274,
"eval_samples_per_second": 14.035,
"eval_steps_per_second": 1.755,
"step": 170
},
{
"epoch": 0.011522134627046696,
"grad_norm": 0.7096220850944519,
"learning_rate": 3.17617799075421e-05,
"loss": 0.7807,
"step": 171
},
{
"epoch": 0.0117242773398019,
"grad_norm": 0.7657400369644165,
"learning_rate": 3.1178227669141744e-05,
"loss": 0.7858,
"step": 174
},
{
"epoch": 0.011926420052557105,
"grad_norm": 0.8024412393569946,
"learning_rate": 3.0591067519763895e-05,
"loss": 0.8122,
"step": 177
},
{
"epoch": 0.01212856276531231,
"grad_norm": 0.6976025700569153,
"learning_rate": 3.0000642344401113e-05,
"loss": 0.8288,
"step": 180
},
{
"epoch": 0.012330705478067515,
"grad_norm": 0.6966779828071594,
"learning_rate": 2.9407296934729227e-05,
"loss": 0.793,
"step": 183
},
{
"epoch": 0.012532848190822721,
"grad_norm": 0.7219818830490112,
"learning_rate": 2.8811377787758636e-05,
"loss": 0.7883,
"step": 186
},
{
"epoch": 0.012734990903577926,
"grad_norm": 0.8189945816993713,
"learning_rate": 2.8213232903489865e-05,
"loss": 0.885,
"step": 189
},
{
"epoch": 0.01293713361633313,
"grad_norm": 0.902603805065155,
"learning_rate": 2.761321158169134e-05,
"loss": 0.8383,
"step": 192
},
{
"epoch": 0.013139276329088337,
"grad_norm": 0.8128630518913269,
"learning_rate": 2.7011664217918154e-05,
"loss": 0.852,
"step": 195
},
{
"epoch": 0.013341419041843541,
"grad_norm": 0.7031587958335876,
"learning_rate": 2.6408942098890936e-05,
"loss": 0.8622,
"step": 198
},
{
"epoch": 0.013543561754598747,
"grad_norm": 0.7614731788635254,
"learning_rate": 2.580539719735433e-05,
"loss": 0.8162,
"step": 201
},
{
"epoch": 0.013745704467353952,
"grad_norm": 0.6810929179191589,
"learning_rate": 2.5201381966534748e-05,
"loss": 0.8271,
"step": 204
},
{
"epoch": 0.013745704467353952,
"eval_loss": 0.8147265315055847,
"eval_runtime": 1782.1355,
"eval_samples_per_second": 14.025,
"eval_steps_per_second": 1.754,
"step": 204
},
{
"epoch": 0.013947847180109158,
"grad_norm": 0.7248020768165588,
"learning_rate": 2.459724913431772e-05,
"loss": 0.814,
"step": 207
},
{
"epoch": 0.014149989892864362,
"grad_norm": 0.7375376224517822,
"learning_rate": 2.399335149726463e-05,
"loss": 0.8381,
"step": 210
},
{
"epoch": 0.014352132605619567,
"grad_norm": 0.75850510597229,
"learning_rate": 2.3390041714589514e-05,
"loss": 0.7851,
"step": 213
},
{
"epoch": 0.014554275318374773,
"grad_norm": 0.711068332195282,
"learning_rate": 2.2787672102216042e-05,
"loss": 0.7992,
"step": 216
},
{
"epoch": 0.014756418031129977,
"grad_norm": 0.7301695346832275,
"learning_rate": 2.2186594427034864e-05,
"loss": 0.8506,
"step": 219
},
{
"epoch": 0.014958560743885184,
"grad_norm": 0.720683753490448,
"learning_rate": 2.1587159701481716e-05,
"loss": 0.8061,
"step": 222
},
{
"epoch": 0.015160703456640388,
"grad_norm": 0.665138304233551,
"learning_rate": 2.098971797855599e-05,
"loss": 0.7454,
"step": 225
},
{
"epoch": 0.015362846169395592,
"grad_norm": 0.6854159235954285,
"learning_rate": 2.0394618147399713e-05,
"loss": 0.828,
"step": 228
},
{
"epoch": 0.015564988882150799,
"grad_norm": 0.7191194891929626,
"learning_rate": 1.980220772955602e-05,
"loss": 0.7885,
"step": 231
},
{
"epoch": 0.015767131594906003,
"grad_norm": 0.7301977276802063,
"learning_rate": 1.921283267602643e-05,
"loss": 0.81,
"step": 234
},
{
"epoch": 0.015969274307661208,
"grad_norm": 0.7239346504211426,
"learning_rate": 1.8626837165245165e-05,
"loss": 0.787,
"step": 237
},
{
"epoch": 0.016036655211912942,
"eval_loss": 0.8127490878105164,
"eval_runtime": 1781.9808,
"eval_samples_per_second": 14.027,
"eval_steps_per_second": 1.754,
"step": 238
}
],
"logging_steps": 3,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 34,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.346898330352353e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}