{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9992414664981036,
"eval_steps": 500,
"global_step": 988,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010113780025284451,
"grad_norm": 0.7041019201278687,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.9035,
"step": 10
},
{
"epoch": 0.020227560050568902,
"grad_norm": 0.48630091547966003,
"learning_rate": 9.595959595959595e-06,
"loss": 1.821,
"step": 20
},
{
"epoch": 0.03034134007585335,
"grad_norm": 0.26346465945243835,
"learning_rate": 1.4141414141414141e-05,
"loss": 1.794,
"step": 30
},
{
"epoch": 0.040455120101137804,
"grad_norm": 0.21580003201961517,
"learning_rate": 1.919191919191919e-05,
"loss": 1.7533,
"step": 40
},
{
"epoch": 0.05056890012642225,
"grad_norm": 0.5718111991882324,
"learning_rate": 2.4242424242424244e-05,
"loss": 1.7247,
"step": 50
},
{
"epoch": 0.0606826801517067,
"grad_norm": 0.28544941544532776,
"learning_rate": 2.9292929292929294e-05,
"loss": 1.6648,
"step": 60
},
{
"epoch": 0.07079646017699115,
"grad_norm": 0.2751332223415375,
"learning_rate": 3.434343434343435e-05,
"loss": 1.6736,
"step": 70
},
{
"epoch": 0.08091024020227561,
"grad_norm": 0.21601912379264832,
"learning_rate": 3.939393939393939e-05,
"loss": 1.6558,
"step": 80
},
{
"epoch": 0.09102402022756005,
"grad_norm": 0.7239705324172974,
"learning_rate": 4.4444444444444447e-05,
"loss": 1.6553,
"step": 90
},
{
"epoch": 0.1011378002528445,
"grad_norm": 0.4910496473312378,
"learning_rate": 4.94949494949495e-05,
"loss": 1.6474,
"step": 100
},
{
"epoch": 0.11125158027812895,
"grad_norm": 0.2504318952560425,
"learning_rate": 4.9987356868753015e-05,
"loss": 1.6752,
"step": 110
},
{
"epoch": 0.1213653603034134,
"grad_norm": 0.21965070068836212,
"learning_rate": 4.994366863501053e-05,
"loss": 1.6347,
"step": 120
},
{
"epoch": 0.13147914032869784,
"grad_norm": 0.17810115218162537,
"learning_rate": 4.9868833750344404e-05,
"loss": 1.6873,
"step": 130
},
{
"epoch": 0.1415929203539823,
"grad_norm": 0.1999112367630005,
"learning_rate": 4.976294565955074e-05,
"loss": 1.6358,
"step": 140
},
{
"epoch": 0.15170670037926676,
"grad_norm": 0.3722081482410431,
"learning_rate": 4.962613658293158e-05,
"loss": 1.6192,
"step": 150
},
{
"epoch": 0.16182048040455121,
"grad_norm": 0.24635489284992218,
"learning_rate": 4.94767113681815e-05,
"loss": 1.6016,
"step": 160
},
{
"epoch": 0.17193426042983564,
"grad_norm": 0.18981686234474182,
"learning_rate": 4.928165479747021e-05,
"loss": 1.5955,
"step": 170
},
{
"epoch": 0.1820480404551201,
"grad_norm": 0.20855137705802917,
"learning_rate": 4.9056278219029025e-05,
"loss": 1.6105,
"step": 180
},
{
"epoch": 0.19216182048040456,
"grad_norm": 0.5716936588287354,
"learning_rate": 4.880086305600057e-05,
"loss": 1.5953,
"step": 190
},
{
"epoch": 0.202275600505689,
"grad_norm": 0.22516073286533356,
"learning_rate": 4.854556924777239e-05,
"loss": 1.604,
"step": 200
},
{
"epoch": 0.21238938053097345,
"grad_norm": 1.1213630437850952,
"learning_rate": 4.823399011155354e-05,
"loss": 1.5866,
"step": 210
},
{
"epoch": 0.2225031605562579,
"grad_norm": 0.8772207498550415,
"learning_rate": 4.789339916515726e-05,
"loss": 1.6409,
"step": 220
},
{
"epoch": 0.23261694058154236,
"grad_norm": 0.20904238522052765,
"learning_rate": 4.752422169756048e-05,
"loss": 1.6172,
"step": 230
},
{
"epoch": 0.2427307206068268,
"grad_norm": 0.5943711996078491,
"learning_rate": 4.712691869314688e-05,
"loss": 1.6009,
"step": 240
},
{
"epoch": 0.2528445006321112,
"grad_norm": 0.2711758017539978,
"learning_rate": 4.6701986256085046e-05,
"loss": 1.6021,
"step": 250
},
{
"epoch": 0.2629582806573957,
"grad_norm": 0.29488369822502136,
"learning_rate": 4.6249954990853274e-05,
"loss": 1.6121,
"step": 260
},
{
"epoch": 0.27307206068268014,
"grad_norm": 0.5323128700256348,
"learning_rate": 4.577138933968461e-05,
"loss": 1.5909,
"step": 270
},
{
"epoch": 0.2831858407079646,
"grad_norm": 0.21803885698318481,
"learning_rate": 4.526688687775934e-05,
"loss": 1.5544,
"step": 280
},
{
"epoch": 0.29329962073324906,
"grad_norm": 0.24969695508480072,
"learning_rate": 4.473707756702496e-05,
"loss": 1.5741,
"step": 290
},
{
"epoch": 0.3034134007585335,
"grad_norm": 0.19897674024105072,
"learning_rate": 4.418262296957563e-05,
"loss": 1.5675,
"step": 300
},
{
"epoch": 0.31352718078381797,
"grad_norm": 0.3320370316505432,
"learning_rate": 4.360421542157295e-05,
"loss": 1.5626,
"step": 310
},
{
"epoch": 0.32364096080910243,
"grad_norm": 0.2747906744480133,
"learning_rate": 4.300257716874001e-05,
"loss": 1.5884,
"step": 320
},
{
"epoch": 0.33375474083438683,
"grad_norm": 0.40590158104896545,
"learning_rate": 4.237845946450779e-05,
"loss": 1.5671,
"step": 330
},
{
"epoch": 0.3438685208596713,
"grad_norm": 0.2609237730503082,
"learning_rate": 4.1732641631940314e-05,
"loss": 1.5584,
"step": 340
},
{
"epoch": 0.35398230088495575,
"grad_norm": 0.21924744546413422,
"learning_rate": 4.1065930090609864e-05,
"loss": 1.5669,
"step": 350
},
{
"epoch": 0.3640960809102402,
"grad_norm": 0.18368321657180786,
"learning_rate": 4.0379157349637207e-05,
"loss": 1.5447,
"step": 360
},
{
"epoch": 0.37420986093552466,
"grad_norm": 0.33152899146080017,
"learning_rate": 3.967318096815449e-05,
"loss": 1.5676,
"step": 370
},
{
"epoch": 0.3843236409608091,
"grad_norm": 0.2354184240102768,
"learning_rate": 3.894888248448857e-05,
"loss": 1.571,
"step": 380
},
{
"epoch": 0.3944374209860936,
"grad_norm": 0.2143399715423584,
"learning_rate": 3.820716631540209e-05,
"loss": 1.5445,
"step": 390
},
{
"epoch": 0.404551201011378,
"grad_norm": 0.2204110026359558,
"learning_rate": 3.74489586267667e-05,
"loss": 1.5592,
"step": 400
},
{
"epoch": 0.41466498103666244,
"grad_norm": 0.34792593121528625,
"learning_rate": 3.6675206177078527e-05,
"loss": 1.5533,
"step": 410
},
{
"epoch": 0.4247787610619469,
"grad_norm": 0.32608234882354736,
"learning_rate": 3.5966336358580976e-05,
"loss": 1.5161,
"step": 420
},
{
"epoch": 0.43489254108723135,
"grad_norm": 0.19128628075122833,
"learning_rate": 3.5165725729171826e-05,
"loss": 1.5477,
"step": 430
},
{
"epoch": 0.4450063211125158,
"grad_norm": 0.22888797521591187,
"learning_rate": 3.435242136511984e-05,
"loss": 1.5358,
"step": 440
},
{
"epoch": 0.45512010113780027,
"grad_norm": 0.4585898518562317,
"learning_rate": 3.3527438823017426e-05,
"loss": 1.4987,
"step": 450
},
{
"epoch": 0.46523388116308473,
"grad_norm": 0.2261190563440323,
"learning_rate": 3.269180824176009e-05,
"loss": 1.5341,
"step": 460
},
{
"epoch": 0.47534766118836913,
"grad_norm": 0.3994990289211273,
"learning_rate": 3.184657305623289e-05,
"loss": 1.5591,
"step": 470
},
{
"epoch": 0.4854614412136536,
"grad_norm": 0.9193454384803772,
"learning_rate": 3.099278869439462e-05,
"loss": 1.5145,
"step": 480
},
{
"epoch": 0.49557522123893805,
"grad_norm": 0.22547656297683716,
"learning_rate": 3.013152125938638e-05,
"loss": 1.5155,
"step": 490
},
{
"epoch": 0.5056890012642224,
"grad_norm": 0.24402552843093872,
"learning_rate": 2.9263846198310286e-05,
"loss": 1.4918,
"step": 500
},
{
"epoch": 0.515802781289507,
"grad_norm": 0.254681259393692,
"learning_rate": 2.8390846959340638e-05,
"loss": 1.5119,
"step": 510
},
{
"epoch": 0.5259165613147914,
"grad_norm": 0.4979060888290405,
"learning_rate": 2.7513613638844195e-05,
"loss": 1.4987,
"step": 520
},
{
"epoch": 0.5360303413400759,
"grad_norm": 0.8083274960517883,
"learning_rate": 2.6633241620199072e-05,
"loss": 1.5273,
"step": 530
},
{
"epoch": 0.5461441213653603,
"grad_norm": 0.2711585462093353,
"learning_rate": 2.575083020601183e-05,
"loss": 1.5294,
"step": 540
},
{
"epoch": 0.5562579013906448,
"grad_norm": 0.21405388414859772,
"learning_rate": 2.4867481245440705e-05,
"loss": 1.5185,
"step": 550
},
{
"epoch": 0.5663716814159292,
"grad_norm": 0.3120310306549072,
"learning_rate": 2.3984297758338998e-05,
"loss": 1.5372,
"step": 560
},
{
"epoch": 0.5764854614412137,
"grad_norm": 0.469420462846756,
"learning_rate": 2.3102382557936657e-05,
"loss": 1.5139,
"step": 570
},
{
"epoch": 0.5865992414664981,
"grad_norm": 0.28421998023986816,
"learning_rate": 2.2222836873779888e-05,
"loss": 1.5201,
"step": 580
},
{
"epoch": 0.5967130214917825,
"grad_norm": 0.6928972601890564,
"learning_rate": 2.134675897664819e-05,
"loss": 1.5131,
"step": 590
},
{
"epoch": 0.606826801517067,
"grad_norm": 0.24055446684360504,
"learning_rate": 2.047524280716608e-05,
"loss": 1.5204,
"step": 600
},
{
"epoch": 0.6169405815423514,
"grad_norm": 0.23597495257854462,
"learning_rate": 1.9609376609821648e-05,
"loss": 1.5112,
"step": 610
},
{
"epoch": 0.6270543615676359,
"grad_norm": 0.23814287781715393,
"learning_rate": 1.875024157409789e-05,
"loss": 1.4903,
"step": 620
},
{
"epoch": 0.6371681415929203,
"grad_norm": 0.35423246026039124,
"learning_rate": 1.789891048441338e-05,
"loss": 1.5151,
"step": 630
},
{
"epoch": 0.6472819216182049,
"grad_norm": 0.21488147974014282,
"learning_rate": 1.7056446380558257e-05,
"loss": 1.5058,
"step": 640
},
{
"epoch": 0.6573957016434893,
"grad_norm": 0.2943096458911896,
"learning_rate": 1.6223901230298062e-05,
"loss": 1.4911,
"step": 650
},
{
"epoch": 0.6675094816687737,
"grad_norm": 0.2584967613220215,
"learning_rate": 1.540231461580303e-05,
"loss": 1.5227,
"step": 660
},
{
"epoch": 0.6776232616940582,
"grad_norm": 0.6467604637145996,
"learning_rate": 1.459271243554303e-05,
"loss": 1.4993,
"step": 670
},
{
"epoch": 0.6877370417193426,
"grad_norm": 0.26516446471214294,
"learning_rate": 1.3796105623268996e-05,
"loss": 1.4785,
"step": 680
},
{
"epoch": 0.6978508217446271,
"grad_norm": 0.3446539640426636,
"learning_rate": 1.3013488885680591e-05,
"loss": 1.4793,
"step": 690
},
{
"epoch": 0.7079646017699115,
"grad_norm": 0.29439467191696167,
"learning_rate": 1.224583946035619e-05,
"loss": 1.5058,
"step": 700
},
{
"epoch": 0.718078381795196,
"grad_norm": 0.20196016132831573,
"learning_rate": 1.1494115895496224e-05,
"loss": 1.4927,
"step": 710
},
{
"epoch": 0.7281921618204804,
"grad_norm": 0.23624677956104279,
"learning_rate": 1.0759256853003578e-05,
"loss": 1.5192,
"step": 720
},
{
"epoch": 0.7383059418457648,
"grad_norm": 0.23491314053535461,
"learning_rate": 1.0042179936395573e-05,
"loss": 1.4667,
"step": 730
},
{
"epoch": 0.7484197218710493,
"grad_norm": 0.20944607257843018,
"learning_rate": 9.34378054501118e-06,
"loss": 1.4674,
"step": 740
},
{
"epoch": 0.7585335018963337,
"grad_norm": 0.27759242057800293,
"learning_rate": 8.664930755944062e-06,
"loss": 1.5156,
"step": 750
},
{
"epoch": 0.7686472819216182,
"grad_norm": 0.24669459462165833,
"learning_rate": 8.006478235097706e-06,
"loss": 1.4981,
"step": 760
},
{
"epoch": 0.7787610619469026,
"grad_norm": 0.24962100386619568,
"learning_rate": 7.369245178722253e-06,
"loss": 1.4705,
"step": 770
},
{
"epoch": 0.7888748419721872,
"grad_norm": 1.8606995344161987,
"learning_rate": 6.754027286754802e-06,
"loss": 1.4913,
"step": 780
},
{
"epoch": 0.7989886219974716,
"grad_norm": 0.23127563297748566,
"learning_rate": 6.161592769245114e-06,
"loss": 1.4892,
"step": 790
},
{
"epoch": 0.809102402022756,
"grad_norm": 0.5031234622001648,
"learning_rate": 5.5926813871073455e-06,
"loss": 1.483,
"step": 800
},
{
"epoch": 0.8192161820480405,
"grad_norm": 0.23935504257678986,
"learning_rate": 5.048003528395687e-06,
"loss": 1.4815,
"step": 810
},
{
"epoch": 0.8293299620733249,
"grad_norm": 0.2615523934364319,
"learning_rate": 4.528239321257255e-06,
"loss": 1.4645,
"step": 820
},
{
"epoch": 0.8394437420986094,
"grad_norm": 0.25076770782470703,
"learning_rate": 4.034037784669942e-06,
"loss": 1.5147,
"step": 830
},
{
"epoch": 0.8495575221238938,
"grad_norm": 0.23863032460212708,
"learning_rate": 3.5660160180256254e-06,
"loss": 1.5028,
"step": 840
},
{
"epoch": 0.8596713021491783,
"grad_norm": 0.2394942194223404,
"learning_rate": 3.1247584305707565e-06,
"loss": 1.4779,
"step": 850
},
{
"epoch": 0.8697850821744627,
"grad_norm": 0.2323542833328247,
"learning_rate": 2.7108160116663893e-06,
"loss": 1.487,
"step": 860
},
{
"epoch": 0.8798988621997471,
"grad_norm": 0.23858946561813354,
"learning_rate": 2.3247056427790347e-06,
"loss": 1.4714,
"step": 870
},
{
"epoch": 0.8900126422250316,
"grad_norm": 0.6770968437194824,
"learning_rate": 1.96690945206128e-06,
"loss": 1.46,
"step": 880
},
{
"epoch": 0.900126422250316,
"grad_norm": 0.20593154430389404,
"learning_rate": 1.637874212328186e-06,
"loss": 1.4839,
"step": 890
},
{
"epoch": 0.9102402022756005,
"grad_norm": 0.340822696685791,
"learning_rate": 1.3380107831811816e-06,
"loss": 1.457,
"step": 900
},
{
"epoch": 0.9203539823008849,
"grad_norm": 0.5303720235824585,
"learning_rate": 1.0676935979760466e-06,
"loss": 1.482,
"step": 910
},
{
"epoch": 0.9304677623261695,
"grad_norm": 0.401429146528244,
"learning_rate": 8.272601962756005e-07,
"loss": 1.4632,
"step": 920
},
{
"epoch": 0.9405815423514539,
"grad_norm": 0.22615137696266174,
"learning_rate": 6.170108023709348e-07,
"loss": 1.4632,
"step": 930
},
{
"epoch": 0.9506953223767383,
"grad_norm": 0.7189086079597473,
"learning_rate": 4.37207950397453e-07,
"loss": 1.4675,
"step": 940
},
{
"epoch": 0.9608091024020228,
"grad_norm": 0.2771843671798706,
"learning_rate": 2.880761565138418e-07,
"loss": 1.4791,
"step": 950
},
{
"epoch": 0.9709228824273072,
"grad_norm": 0.27663376927375793,
"learning_rate": 1.6980163855331854e-07,
"loss": 1.4927,
"step": 960
},
{
"epoch": 0.9810366624525917,
"grad_norm": 0.3906765878200531,
"learning_rate": 8.253208349721653e-08,
"loss": 1.4982,
"step": 970
},
{
"epoch": 0.9911504424778761,
"grad_norm": 0.7076007127761841,
"learning_rate": 2.6376463061256185e-08,
"loss": 1.467,
"step": 980
},
{
"epoch": 0.9992414664981036,
"step": 988,
"total_flos": 1.352098799838403e+19,
"train_loss": 1.5513278781643762,
"train_runtime": 82050.3389,
"train_samples_per_second": 0.386,
"train_steps_per_second": 0.012
}
],
"logging_steps": 10,
"max_steps": 988,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.352098799838403e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}