{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.022909507445589918,
"eval_steps": 34,
"global_step": 340,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 6.738090425173506e-05,
"eval_loss": 1.0482484102249146,
"eval_runtime": 1771.3022,
"eval_samples_per_second": 14.111,
"eval_steps_per_second": 1.764,
"step": 1
},
{
"epoch": 0.00020214271275520516,
"grad_norm": 2.1362860202789307,
"learning_rate": 1.5e-05,
"loss": 1.0166,
"step": 3
},
{
"epoch": 0.0004042854255104103,
"grad_norm": 1.772956371307373,
"learning_rate": 3e-05,
"loss": 1.0004,
"step": 6
},
{
"epoch": 0.0006064281382656155,
"grad_norm": 1.3759489059448242,
"learning_rate": 4.5e-05,
"loss": 0.9564,
"step": 9
},
{
"epoch": 0.0008085708510208206,
"grad_norm": 1.2244105339050293,
"learning_rate": 4.999675562428437e-05,
"loss": 0.891,
"step": 12
},
{
"epoch": 0.001010713563776026,
"grad_norm": 1.1622825860977173,
"learning_rate": 4.9979724954289244e-05,
"loss": 0.8578,
"step": 15
},
{
"epoch": 0.001212856276531231,
"grad_norm": 1.089287281036377,
"learning_rate": 4.994810682835951e-05,
"loss": 0.832,
"step": 18
},
{
"epoch": 0.0014149989892864362,
"grad_norm": 1.685937762260437,
"learning_rate": 4.990191971059033e-05,
"loss": 0.8445,
"step": 21
},
{
"epoch": 0.0016171417020416413,
"grad_norm": 0.9353536367416382,
"learning_rate": 4.984119057295783e-05,
"loss": 0.8481,
"step": 24
},
{
"epoch": 0.0018192844147968466,
"grad_norm": 0.9442921876907349,
"learning_rate": 4.976595487956823e-05,
"loss": 0.8389,
"step": 27
},
{
"epoch": 0.002021427127552052,
"grad_norm": 1.0226161479949951,
"learning_rate": 4.967625656594782e-05,
"loss": 0.8224,
"step": 30
},
{
"epoch": 0.002223569840307257,
"grad_norm": 0.8120137453079224,
"learning_rate": 4.957214801338581e-05,
"loss": 0.849,
"step": 33
},
{
"epoch": 0.002290950744558992,
"eval_loss": 0.8384992480278015,
"eval_runtime": 1781.1941,
"eval_samples_per_second": 14.033,
"eval_steps_per_second": 1.754,
"step": 34
},
{
"epoch": 0.002425712553062462,
"grad_norm": 0.8147669434547424,
"learning_rate": 4.9453690018345144e-05,
"loss": 0.8287,
"step": 36
},
{
"epoch": 0.0026278552658176675,
"grad_norm": 0.9256265163421631,
"learning_rate": 4.932095175695911e-05,
"loss": 0.8112,
"step": 39
},
{
"epoch": 0.0028299979785728724,
"grad_norm": 0.9979642033576965,
"learning_rate": 4.917401074463441e-05,
"loss": 0.8657,
"step": 42
},
{
"epoch": 0.0030321406913280777,
"grad_norm": 0.7800766825675964,
"learning_rate": 4.901295279078431e-05,
"loss": 0.7922,
"step": 45
},
{
"epoch": 0.0032342834040832826,
"grad_norm": 0.7466667294502258,
"learning_rate": 4.883787194871841e-05,
"loss": 0.8455,
"step": 48
},
{
"epoch": 0.003436426116838488,
"grad_norm": 0.7748194932937622,
"learning_rate": 4.864887046071813e-05,
"loss": 0.8259,
"step": 51
},
{
"epoch": 0.0036385688295936932,
"grad_norm": 1.103950023651123,
"learning_rate": 4.8446058698330115e-05,
"loss": 0.8387,
"step": 54
},
{
"epoch": 0.003840711542348898,
"grad_norm": 0.7845460176467896,
"learning_rate": 4.822955509791233e-05,
"loss": 0.8067,
"step": 57
},
{
"epoch": 0.004042854255104104,
"grad_norm": 0.7884831428527832,
"learning_rate": 4.799948609147061e-05,
"loss": 0.8231,
"step": 60
},
{
"epoch": 0.004244996967859308,
"grad_norm": 0.9247403144836426,
"learning_rate": 4.7755986032825864e-05,
"loss": 0.8159,
"step": 63
},
{
"epoch": 0.004447139680614514,
"grad_norm": 0.7747366428375244,
"learning_rate": 4.74991971191553e-05,
"loss": 0.8132,
"step": 66
},
{
"epoch": 0.004581901489117984,
"eval_loss": 0.8286266922950745,
"eval_runtime": 1781.5654,
"eval_samples_per_second": 14.03,
"eval_steps_per_second": 1.754,
"step": 68
},
{
"epoch": 0.004649282393369719,
"grad_norm": 0.7278714776039124,
"learning_rate": 4.7229269307953235e-05,
"loss": 0.741,
"step": 69
},
{
"epoch": 0.004851425106124924,
"grad_norm": 0.9090484380722046,
"learning_rate": 4.694636022946012e-05,
"loss": 0.8075,
"step": 72
},
{
"epoch": 0.00505356781888013,
"grad_norm": 0.8673194050788879,
"learning_rate": 4.665063509461097e-05,
"loss": 0.8395,
"step": 75
},
{
"epoch": 0.005255710531635335,
"grad_norm": 0.7307804822921753,
"learning_rate": 4.6342266598556814e-05,
"loss": 0.7995,
"step": 78
},
{
"epoch": 0.005457853244390539,
"grad_norm": 0.9184255003929138,
"learning_rate": 4.6021434819815555e-05,
"loss": 0.8318,
"step": 81
},
{
"epoch": 0.005659995957145745,
"grad_norm": 0.7924419045448303,
"learning_rate": 4.568832711511125e-05,
"loss": 0.8095,
"step": 84
},
{
"epoch": 0.00586213866990095,
"grad_norm": 0.7330142259597778,
"learning_rate": 4.534313800996299e-05,
"loss": 0.7674,
"step": 87
},
{
"epoch": 0.006064281382656155,
"grad_norm": 0.7591224908828735,
"learning_rate": 4.498606908508754e-05,
"loss": 0.8409,
"step": 90
},
{
"epoch": 0.006266424095411361,
"grad_norm": 0.7741730213165283,
"learning_rate": 4.46173288586818e-05,
"loss": 0.8541,
"step": 93
},
{
"epoch": 0.006468566808166565,
"grad_norm": 0.7202086448669434,
"learning_rate": 4.4237132664654154e-05,
"loss": 0.85,
"step": 96
},
{
"epoch": 0.0066707095209217705,
"grad_norm": 0.7738878726959229,
"learning_rate": 4.384570252687542e-05,
"loss": 0.8571,
"step": 99
},
{
"epoch": 0.006872852233676976,
"grad_norm": 0.7431773543357849,
"learning_rate": 4.344326702952326e-05,
"loss": 0.8264,
"step": 102
},
{
"epoch": 0.006872852233676976,
"eval_loss": 0.8239989280700684,
"eval_runtime": 1781.4353,
"eval_samples_per_second": 14.031,
"eval_steps_per_second": 1.754,
"step": 102
},
{
"epoch": 0.007074994946432181,
"grad_norm": 0.7040189504623413,
"learning_rate": 4.303006118359537e-05,
"loss": 0.8247,
"step": 105
},
{
"epoch": 0.0072771376591873865,
"grad_norm": 0.7915964722633362,
"learning_rate": 4.260632628966974e-05,
"loss": 0.8551,
"step": 108
},
{
"epoch": 0.007479280371942592,
"grad_norm": 0.7852084040641785,
"learning_rate": 4.217230979699188e-05,
"loss": 0.8425,
"step": 111
},
{
"epoch": 0.007681423084697796,
"grad_norm": 0.6728894114494324,
"learning_rate": 4.172826515897146e-05,
"loss": 0.8141,
"step": 114
},
{
"epoch": 0.007883565797453002,
"grad_norm": 0.7391681671142578,
"learning_rate": 4.12744516851726e-05,
"loss": 0.7987,
"step": 117
},
{
"epoch": 0.008085708510208208,
"grad_norm": 0.7469043135643005,
"learning_rate": 4.0811134389884433e-05,
"loss": 0.7909,
"step": 120
},
{
"epoch": 0.008287851222963412,
"grad_norm": 0.7632879614830017,
"learning_rate": 4.0338583837360225e-05,
"loss": 0.8031,
"step": 123
},
{
"epoch": 0.008489993935718617,
"grad_norm": 0.7656901478767395,
"learning_rate": 3.985707598381544e-05,
"loss": 0.843,
"step": 126
},
{
"epoch": 0.008692136648473823,
"grad_norm": 0.8024786114692688,
"learning_rate": 3.9366892016277096e-05,
"loss": 0.8403,
"step": 129
},
{
"epoch": 0.008894279361229027,
"grad_norm": 0.6944208145141602,
"learning_rate": 3.886831818837847e-05,
"loss": 0.7908,
"step": 132
},
{
"epoch": 0.009096422073984234,
"grad_norm": 0.719901442527771,
"learning_rate": 3.8361645653195026e-05,
"loss": 0.8151,
"step": 135
},
{
"epoch": 0.009163802978235968,
"eval_loss": 0.8194563388824463,
"eval_runtime": 1782.0949,
"eval_samples_per_second": 14.026,
"eval_steps_per_second": 1.754,
"step": 136
},
{
"epoch": 0.009298564786739438,
"grad_norm": 0.6918753981590271,
"learning_rate": 3.784717029321922e-05,
"loss": 0.8194,
"step": 138
},
{
"epoch": 0.009500707499494642,
"grad_norm": 0.7483247518539429,
"learning_rate": 3.732519254757344e-05,
"loss": 0.8422,
"step": 141
},
{
"epoch": 0.009702850212249849,
"grad_norm": 0.7642280459403992,
"learning_rate": 3.679601723656205e-05,
"loss": 0.8222,
"step": 144
},
{
"epoch": 0.009904992925005053,
"grad_norm": 0.7145370244979858,
"learning_rate": 3.625995338366492e-05,
"loss": 0.8073,
"step": 147
},
{
"epoch": 0.01010713563776026,
"grad_norm": 0.732183039188385,
"learning_rate": 3.5717314035076355e-05,
"loss": 0.8163,
"step": 150
},
{
"epoch": 0.010309278350515464,
"grad_norm": 0.6954637765884399,
"learning_rate": 3.516841607689501e-05,
"loss": 0.7573,
"step": 153
},
{
"epoch": 0.01051142106327067,
"grad_norm": 0.7373840808868408,
"learning_rate": 3.461358005007128e-05,
"loss": 0.7868,
"step": 156
},
{
"epoch": 0.010713563776025874,
"grad_norm": 0.7047626376152039,
"learning_rate": 3.405312996322042e-05,
"loss": 0.821,
"step": 159
},
{
"epoch": 0.010915706488781079,
"grad_norm": 0.7702988982200623,
"learning_rate": 3.348739310341068e-05,
"loss": 0.8194,
"step": 162
},
{
"epoch": 0.011117849201536285,
"grad_norm": 0.7867685556411743,
"learning_rate": 3.2916699845036816e-05,
"loss": 0.7898,
"step": 165
},
{
"epoch": 0.01131999191429149,
"grad_norm": 0.7021005153656006,
"learning_rate": 3.234138345689077e-05,
"loss": 0.7621,
"step": 168
},
{
"epoch": 0.011454753722794959,
"eval_loss": 0.8163909316062927,
"eval_runtime": 1780.9274,
"eval_samples_per_second": 14.035,
"eval_steps_per_second": 1.755,
"step": 170
},
{
"epoch": 0.011522134627046696,
"grad_norm": 0.7096220850944519,
"learning_rate": 3.17617799075421e-05,
"loss": 0.7807,
"step": 171
},
{
"epoch": 0.0117242773398019,
"grad_norm": 0.7657400369644165,
"learning_rate": 3.1178227669141744e-05,
"loss": 0.7858,
"step": 174
},
{
"epoch": 0.011926420052557105,
"grad_norm": 0.8024412393569946,
"learning_rate": 3.0591067519763895e-05,
"loss": 0.8122,
"step": 177
},
{
"epoch": 0.01212856276531231,
"grad_norm": 0.6976025700569153,
"learning_rate": 3.0000642344401113e-05,
"loss": 0.8288,
"step": 180
},
{
"epoch": 0.012330705478067515,
"grad_norm": 0.6966779828071594,
"learning_rate": 2.9407296934729227e-05,
"loss": 0.793,
"step": 183
},
{
"epoch": 0.012532848190822721,
"grad_norm": 0.7219818830490112,
"learning_rate": 2.8811377787758636e-05,
"loss": 0.7883,
"step": 186
},
{
"epoch": 0.012734990903577926,
"grad_norm": 0.8189945816993713,
"learning_rate": 2.8213232903489865e-05,
"loss": 0.885,
"step": 189
},
{
"epoch": 0.01293713361633313,
"grad_norm": 0.902603805065155,
"learning_rate": 2.761321158169134e-05,
"loss": 0.8383,
"step": 192
},
{
"epoch": 0.013139276329088337,
"grad_norm": 0.8128630518913269,
"learning_rate": 2.7011664217918154e-05,
"loss": 0.852,
"step": 195
},
{
"epoch": 0.013341419041843541,
"grad_norm": 0.7031587958335876,
"learning_rate": 2.6408942098890936e-05,
"loss": 0.8622,
"step": 198
},
{
"epoch": 0.013543561754598747,
"grad_norm": 0.7614731788635254,
"learning_rate": 2.580539719735433e-05,
"loss": 0.8162,
"step": 201
},
{
"epoch": 0.013745704467353952,
"grad_norm": 0.6810929179191589,
"learning_rate": 2.5201381966534748e-05,
"loss": 0.8271,
"step": 204
},
{
"epoch": 0.013745704467353952,
"eval_loss": 0.8147265315055847,
"eval_runtime": 1782.1355,
"eval_samples_per_second": 14.025,
"eval_steps_per_second": 1.754,
"step": 204
},
{
"epoch": 0.013947847180109158,
"grad_norm": 0.7248020768165588,
"learning_rate": 2.459724913431772e-05,
"loss": 0.814,
"step": 207
},
{
"epoch": 0.014149989892864362,
"grad_norm": 0.7375376224517822,
"learning_rate": 2.399335149726463e-05,
"loss": 0.8381,
"step": 210
},
{
"epoch": 0.014352132605619567,
"grad_norm": 0.75850510597229,
"learning_rate": 2.3390041714589514e-05,
"loss": 0.7851,
"step": 213
},
{
"epoch": 0.014554275318374773,
"grad_norm": 0.711068332195282,
"learning_rate": 2.2787672102216042e-05,
"loss": 0.7992,
"step": 216
},
{
"epoch": 0.014756418031129977,
"grad_norm": 0.7301695346832275,
"learning_rate": 2.2186594427034864e-05,
"loss": 0.8506,
"step": 219
},
{
"epoch": 0.014958560743885184,
"grad_norm": 0.720683753490448,
"learning_rate": 2.1587159701481716e-05,
"loss": 0.8061,
"step": 222
},
{
"epoch": 0.015160703456640388,
"grad_norm": 0.665138304233551,
"learning_rate": 2.098971797855599e-05,
"loss": 0.7454,
"step": 225
},
{
"epoch": 0.015362846169395592,
"grad_norm": 0.6854159235954285,
"learning_rate": 2.0394618147399713e-05,
"loss": 0.828,
"step": 228
},
{
"epoch": 0.015564988882150799,
"grad_norm": 0.7191194891929626,
"learning_rate": 1.980220772955602e-05,
"loss": 0.7885,
"step": 231
},
{
"epoch": 0.015767131594906003,
"grad_norm": 0.7301977276802063,
"learning_rate": 1.921283267602643e-05,
"loss": 0.81,
"step": 234
},
{
"epoch": 0.015969274307661208,
"grad_norm": 0.7239346504211426,
"learning_rate": 1.8626837165245165e-05,
"loss": 0.787,
"step": 237
},
{
"epoch": 0.016036655211912942,
"eval_loss": 0.8127490878105164,
"eval_runtime": 1781.9808,
"eval_samples_per_second": 14.027,
"eval_steps_per_second": 1.754,
"step": 238
},
{
"epoch": 0.016171417020416416,
"grad_norm": 0.7089824676513672,
"learning_rate": 1.8044563402088684e-05,
"loss": 0.8143,
"step": 240
},
{
"epoch": 0.01637355973317162,
"grad_norm": 0.6729727983474731,
"learning_rate": 1.746635141803761e-05,
"loss": 0.7973,
"step": 243
},
{
"epoch": 0.016575702445926824,
"grad_norm": 0.7322119474411011,
"learning_rate": 1.6892538872607937e-05,
"loss": 0.8065,
"step": 246
},
{
"epoch": 0.01677784515868203,
"grad_norm": 0.7230767607688904,
"learning_rate": 1.6323460856167426e-05,
"loss": 0.8034,
"step": 249
},
{
"epoch": 0.016979987871437233,
"grad_norm": 0.6473975777626038,
"learning_rate": 1.5759449694252226e-05,
"loss": 0.7781,
"step": 252
},
{
"epoch": 0.01718213058419244,
"grad_norm": 0.7108025550842285,
"learning_rate": 1.5200834753498128e-05,
"loss": 0.8175,
"step": 255
},
{
"epoch": 0.017384273296947646,
"grad_norm": 0.672478199005127,
"learning_rate": 1.4647942249299707e-05,
"loss": 0.8328,
"step": 258
},
{
"epoch": 0.01758641600970285,
"grad_norm": 0.7066530585289001,
"learning_rate": 1.4101095055309746e-05,
"loss": 0.7698,
"step": 261
},
{
"epoch": 0.017788558722458055,
"grad_norm": 0.7493249773979187,
"learning_rate": 1.356061251489012e-05,
"loss": 0.8237,
"step": 264
},
{
"epoch": 0.01799070143521326,
"grad_norm": 0.6934426426887512,
"learning_rate": 1.302681025462424e-05,
"loss": 0.82,
"step": 267
},
{
"epoch": 0.018192844147968467,
"grad_norm": 0.6936736106872559,
"learning_rate": 1.2500000000000006e-05,
"loss": 0.8079,
"step": 270
},
{
"epoch": 0.018327605956471937,
"eval_loss": 0.8106825351715088,
"eval_runtime": 1782.0227,
"eval_samples_per_second": 14.026,
"eval_steps_per_second": 1.754,
"step": 272
},
{
"epoch": 0.01839498686072367,
"grad_norm": 0.6460986733436584,
"learning_rate": 1.1980489393370938e-05,
"loss": 0.8341,
"step": 273
},
{
"epoch": 0.018597129573478876,
"grad_norm": 0.6542893052101135,
"learning_rate": 1.1468581814301717e-05,
"loss": 0.7814,
"step": 276
},
{
"epoch": 0.01879927228623408,
"grad_norm": 0.6104385852813721,
"learning_rate": 1.096457620240298e-05,
"loss": 0.8269,
"step": 279
},
{
"epoch": 0.019001414998989285,
"grad_norm": 0.822834849357605,
"learning_rate": 1.0468766882759094e-05,
"loss": 0.8001,
"step": 282
},
{
"epoch": 0.019203557711744493,
"grad_norm": 0.6357617378234863,
"learning_rate": 9.981443394050525e-06,
"loss": 0.8261,
"step": 285
},
{
"epoch": 0.019405700424499697,
"grad_norm": 0.6451523900032043,
"learning_rate": 9.502890319471491e-06,
"loss": 0.827,
"step": 288
},
{
"epoch": 0.0196078431372549,
"grad_norm": 0.6993770003318787,
"learning_rate": 9.033387120541306e-06,
"loss": 0.7993,
"step": 291
},
{
"epoch": 0.019809985850010106,
"grad_norm": 0.7399018406867981,
"learning_rate": 8.573207973906735e-06,
"loss": 0.8537,
"step": 294
},
{
"epoch": 0.02001212856276531,
"grad_norm": 0.6726659536361694,
"learning_rate": 8.1226216112306e-06,
"loss": 0.7875,
"step": 297
},
{
"epoch": 0.02021427127552052,
"grad_norm": 0.6281954646110535,
"learning_rate": 7.681891162260015e-06,
"loss": 0.7966,
"step": 300
},
{
"epoch": 0.020416413988275723,
"grad_norm": 0.7878900170326233,
"learning_rate": 7.251274001166044e-06,
"loss": 0.8103,
"step": 303
},
{
"epoch": 0.020618556701030927,
"grad_norm": 0.6884163022041321,
"learning_rate": 6.831021596244424e-06,
"loss": 0.7842,
"step": 306
},
{
"epoch": 0.020618556701030927,
"eval_loss": 0.8096863031387329,
"eval_runtime": 1780.6626,
"eval_samples_per_second": 14.037,
"eval_steps_per_second": 1.755,
"step": 306
},
{
"epoch": 0.020820699413786132,
"grad_norm": 0.8132256269454956,
"learning_rate": 6.421379363065142e-06,
"loss": 0.8069,
"step": 309
},
{
"epoch": 0.02102284212654134,
"grad_norm": 0.7123071551322937,
"learning_rate": 6.022586521156715e-06,
"loss": 0.7624,
"step": 312
},
{
"epoch": 0.021224984839296544,
"grad_norm": 0.6497386693954468,
"learning_rate": 5.634875954308638e-06,
"loss": 0.7902,
"step": 315
},
{
"epoch": 0.02142712755205175,
"grad_norm": 0.6508458256721497,
"learning_rate": 5.258474074573877e-06,
"loss": 0.8201,
"step": 318
},
{
"epoch": 0.021629270264806953,
"grad_norm": 0.9117996096611023,
"learning_rate": 4.893600690050579e-06,
"loss": 0.8328,
"step": 321
},
{
"epoch": 0.021831412977562158,
"grad_norm": 0.693020761013031,
"learning_rate": 4.540468876520323e-06,
"loss": 0.7926,
"step": 324
},
{
"epoch": 0.022033555690317366,
"grad_norm": 0.6869902014732361,
"learning_rate": 4.199284853017896e-06,
"loss": 0.805,
"step": 327
},
{
"epoch": 0.02223569840307257,
"grad_norm": 0.7282816171646118,
"learning_rate": 3.8702478614051355e-06,
"loss": 0.8067,
"step": 330
},
{
"epoch": 0.022437841115827774,
"grad_norm": 0.6699129343032837,
"learning_rate": 3.5535500500193357e-06,
"loss": 0.8041,
"step": 333
},
{
"epoch": 0.02263998382858298,
"grad_norm": 0.6829515695571899,
"learning_rate": 3.249376361464021e-06,
"loss": 0.8149,
"step": 336
},
{
"epoch": 0.022842126541338183,
"grad_norm": 0.7807720303535461,
"learning_rate": 2.957904424607652e-06,
"loss": 0.825,
"step": 339
},
{
"epoch": 0.022909507445589918,
"eval_loss": 0.8090208768844604,
"eval_runtime": 1781.7524,
"eval_samples_per_second": 14.028,
"eval_steps_per_second": 1.754,
"step": 340
}
],
"logging_steps": 3,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 34,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.7812833290747904e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
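
A minimal sketch for consuming the state above, assuming it is saved as trainer_state.json inside a checkpoint directory, as the Hugging Face Trainer normally writes it (the filename is an assumption; the field names used below are taken directly from the JSON). It loads the file and summarizes the logged training and evaluation losses.

import json

# Assumed location: the Trainer writes this state as "trainer_state.json"
# inside each checkpoint-* directory.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training entries (keyed by "loss", "grad_norm", "learning_rate")
# and evaluation entries (keyed by "eval_loss", "eval_runtime", ...).
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"global_step={state['global_step']}, epoch={state['epoch']:.6f}")
print(f"last train loss: {train_logs[-1]['loss']:.4f} (step {train_logs[-1]['step']})")
print(f"last eval loss:  {eval_logs[-1]['eval_loss']:.4f} (step {eval_logs[-1]['step']})")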