{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992414664981036, "eval_steps": 500, "global_step": 988, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010113780025284451, "grad_norm": 0.7041019201278687, "learning_rate": 4.5454545454545455e-06, "loss": 1.9035, "step": 10 }, { "epoch": 0.020227560050568902, "grad_norm": 0.48630091547966003, "learning_rate": 9.595959595959595e-06, "loss": 1.821, "step": 20 }, { "epoch": 0.03034134007585335, "grad_norm": 0.26346465945243835, "learning_rate": 1.4141414141414141e-05, "loss": 1.794, "step": 30 }, { "epoch": 0.040455120101137804, "grad_norm": 0.21580003201961517, "learning_rate": 1.919191919191919e-05, "loss": 1.7533, "step": 40 }, { "epoch": 0.05056890012642225, "grad_norm": 0.5718111991882324, "learning_rate": 2.4242424242424244e-05, "loss": 1.7247, "step": 50 }, { "epoch": 0.0606826801517067, "grad_norm": 0.28544941544532776, "learning_rate": 2.9292929292929294e-05, "loss": 1.6648, "step": 60 }, { "epoch": 0.07079646017699115, "grad_norm": 0.2751332223415375, "learning_rate": 3.434343434343435e-05, "loss": 1.6736, "step": 70 }, { "epoch": 0.08091024020227561, "grad_norm": 0.21601912379264832, "learning_rate": 3.939393939393939e-05, "loss": 1.6558, "step": 80 }, { "epoch": 0.09102402022756005, "grad_norm": 0.7239705324172974, "learning_rate": 4.4444444444444447e-05, "loss": 1.6553, "step": 90 }, { "epoch": 0.1011378002528445, "grad_norm": 0.4910496473312378, "learning_rate": 4.94949494949495e-05, "loss": 1.6474, "step": 100 }, { "epoch": 0.11125158027812895, "grad_norm": 0.2504318952560425, "learning_rate": 4.9987356868753015e-05, "loss": 1.6752, "step": 110 }, { "epoch": 0.1213653603034134, "grad_norm": 0.21965070068836212, "learning_rate": 4.994366863501053e-05, "loss": 1.6347, "step": 120 }, { "epoch": 0.13147914032869784, "grad_norm": 0.17810115218162537, "learning_rate": 4.9868833750344404e-05, "loss": 1.6873, "step": 130 }, { "epoch": 0.1415929203539823, "grad_norm": 0.1999112367630005, "learning_rate": 4.976294565955074e-05, "loss": 1.6358, "step": 140 }, { "epoch": 0.15170670037926676, "grad_norm": 0.3722081482410431, "learning_rate": 4.962613658293158e-05, "loss": 1.6192, "step": 150 }, { "epoch": 0.16182048040455121, "grad_norm": 0.24635489284992218, "learning_rate": 4.94767113681815e-05, "loss": 1.6016, "step": 160 }, { "epoch": 0.17193426042983564, "grad_norm": 0.18981686234474182, "learning_rate": 4.928165479747021e-05, "loss": 1.5955, "step": 170 }, { "epoch": 0.1820480404551201, "grad_norm": 0.20855137705802917, "learning_rate": 4.9056278219029025e-05, "loss": 1.6105, "step": 180 }, { "epoch": 0.19216182048040456, "grad_norm": 0.5716936588287354, "learning_rate": 4.880086305600057e-05, "loss": 1.5953, "step": 190 }, { "epoch": 0.202275600505689, "grad_norm": 0.22516073286533356, "learning_rate": 4.854556924777239e-05, "loss": 1.604, "step": 200 }, { "epoch": 0.21238938053097345, "grad_norm": 1.1213630437850952, "learning_rate": 4.823399011155354e-05, "loss": 1.5866, "step": 210 }, { "epoch": 0.2225031605562579, "grad_norm": 0.8772207498550415, "learning_rate": 4.789339916515726e-05, "loss": 1.6409, "step": 220 }, { "epoch": 0.23261694058154236, "grad_norm": 0.20904238522052765, "learning_rate": 4.752422169756048e-05, "loss": 1.6172, "step": 230 }, { "epoch": 0.2427307206068268, "grad_norm": 0.5943711996078491, "learning_rate": 4.712691869314688e-05, "loss": 1.6009, "step": 240 }, { "epoch": 0.2528445006321112, "grad_norm": 0.2711758017539978, "learning_rate": 4.6701986256085046e-05, "loss": 1.6021, "step": 250 }, { "epoch": 0.2629582806573957, "grad_norm": 0.29488369822502136, "learning_rate": 4.6249954990853274e-05, "loss": 1.6121, "step": 260 }, { "epoch": 0.27307206068268014, "grad_norm": 0.5323128700256348, "learning_rate": 4.577138933968461e-05, "loss": 1.5909, "step": 270 }, { "epoch": 0.2831858407079646, "grad_norm": 0.21803885698318481, "learning_rate": 4.526688687775934e-05, "loss": 1.5544, "step": 280 }, { "epoch": 0.29329962073324906, "grad_norm": 0.24969695508480072, "learning_rate": 4.473707756702496e-05, "loss": 1.5741, "step": 290 }, { "epoch": 0.3034134007585335, "grad_norm": 0.19897674024105072, "learning_rate": 4.418262296957563e-05, "loss": 1.5675, "step": 300 }, { "epoch": 0.31352718078381797, "grad_norm": 0.3320370316505432, "learning_rate": 4.360421542157295e-05, "loss": 1.5626, "step": 310 }, { "epoch": 0.32364096080910243, "grad_norm": 0.2747906744480133, "learning_rate": 4.300257716874001e-05, "loss": 1.5884, "step": 320 }, { "epoch": 0.33375474083438683, "grad_norm": 0.40590158104896545, "learning_rate": 4.237845946450779e-05, "loss": 1.5671, "step": 330 }, { "epoch": 0.3438685208596713, "grad_norm": 0.2609237730503082, "learning_rate": 4.1732641631940314e-05, "loss": 1.5584, "step": 340 }, { "epoch": 0.35398230088495575, "grad_norm": 0.21924744546413422, "learning_rate": 4.1065930090609864e-05, "loss": 1.5669, "step": 350 }, { "epoch": 0.3640960809102402, "grad_norm": 0.18368321657180786, "learning_rate": 4.0379157349637207e-05, "loss": 1.5447, "step": 360 }, { "epoch": 0.37420986093552466, "grad_norm": 0.33152899146080017, "learning_rate": 3.967318096815449e-05, "loss": 1.5676, "step": 370 }, { "epoch": 0.3843236409608091, "grad_norm": 0.2354184240102768, "learning_rate": 3.894888248448857e-05, "loss": 1.571, "step": 380 }, { "epoch": 0.3944374209860936, "grad_norm": 0.2143399715423584, "learning_rate": 3.820716631540209e-05, "loss": 1.5445, "step": 390 }, { "epoch": 0.404551201011378, "grad_norm": 0.2204110026359558, "learning_rate": 3.74489586267667e-05, "loss": 1.5592, "step": 400 }, { "epoch": 0.41466498103666244, "grad_norm": 0.34792593121528625, "learning_rate": 3.6675206177078527e-05, "loss": 1.5533, "step": 410 }, { "epoch": 0.4247787610619469, "grad_norm": 0.32608234882354736, "learning_rate": 3.5966336358580976e-05, "loss": 1.5161, "step": 420 }, { "epoch": 0.43489254108723135, "grad_norm": 0.19128628075122833, "learning_rate": 3.5165725729171826e-05, "loss": 1.5477, "step": 430 }, { "epoch": 0.4450063211125158, "grad_norm": 0.22888797521591187, "learning_rate": 3.435242136511984e-05, "loss": 1.5358, "step": 440 }, { "epoch": 0.45512010113780027, "grad_norm": 0.4585898518562317, "learning_rate": 3.3527438823017426e-05, "loss": 1.4987, "step": 450 }, { "epoch": 0.46523388116308473, "grad_norm": 0.2261190563440323, "learning_rate": 3.269180824176009e-05, "loss": 1.5341, "step": 460 }, { "epoch": 0.47534766118836913, "grad_norm": 0.3994990289211273, "learning_rate": 3.184657305623289e-05, "loss": 1.5591, "step": 470 }, { "epoch": 0.4854614412136536, "grad_norm": 0.9193454384803772, "learning_rate": 3.099278869439462e-05, "loss": 1.5145, "step": 480 }, { "epoch": 0.49557522123893805, "grad_norm": 0.22547656297683716, "learning_rate": 3.013152125938638e-05, "loss": 1.5155, "step": 490 }, { "epoch": 0.5056890012642224, "grad_norm": 0.24402552843093872, "learning_rate": 2.9263846198310286e-05, "loss": 1.4918, "step": 500 }, { "epoch": 0.515802781289507, "grad_norm": 0.254681259393692, "learning_rate": 2.8390846959340638e-05, "loss": 1.5119, "step": 510 }, { "epoch": 0.5259165613147914, "grad_norm": 0.4979060888290405, "learning_rate": 2.7513613638844195e-05, "loss": 1.4987, "step": 520 }, { "epoch": 0.5360303413400759, "grad_norm": 0.8083274960517883, "learning_rate": 2.6633241620199072e-05, "loss": 1.5273, "step": 530 }, { "epoch": 0.5461441213653603, "grad_norm": 0.2711585462093353, "learning_rate": 2.575083020601183e-05, "loss": 1.5294, "step": 540 }, { "epoch": 0.5562579013906448, "grad_norm": 0.21405388414859772, "learning_rate": 2.4867481245440705e-05, "loss": 1.5185, "step": 550 }, { "epoch": 0.5663716814159292, "grad_norm": 0.3120310306549072, "learning_rate": 2.3984297758338998e-05, "loss": 1.5372, "step": 560 }, { "epoch": 0.5764854614412137, "grad_norm": 0.469420462846756, "learning_rate": 2.3102382557936657e-05, "loss": 1.5139, "step": 570 }, { "epoch": 0.5865992414664981, "grad_norm": 0.28421998023986816, "learning_rate": 2.2222836873779888e-05, "loss": 1.5201, "step": 580 }, { "epoch": 0.5967130214917825, "grad_norm": 0.6928972601890564, "learning_rate": 2.134675897664819e-05, "loss": 1.5131, "step": 590 }, { "epoch": 0.606826801517067, "grad_norm": 0.24055446684360504, "learning_rate": 2.047524280716608e-05, "loss": 1.5204, "step": 600 }, { "epoch": 0.6169405815423514, "grad_norm": 0.23597495257854462, "learning_rate": 1.9609376609821648e-05, "loss": 1.5112, "step": 610 }, { "epoch": 0.6270543615676359, "grad_norm": 0.23814287781715393, "learning_rate": 1.875024157409789e-05, "loss": 1.4903, "step": 620 }, { "epoch": 0.6371681415929203, "grad_norm": 0.35423246026039124, "learning_rate": 1.789891048441338e-05, "loss": 1.5151, "step": 630 }, { "epoch": 0.6472819216182049, "grad_norm": 0.21488147974014282, "learning_rate": 1.7056446380558257e-05, "loss": 1.5058, "step": 640 }, { "epoch": 0.6573957016434893, "grad_norm": 0.2943096458911896, "learning_rate": 1.6223901230298062e-05, "loss": 1.4911, "step": 650 }, { "epoch": 0.6675094816687737, "grad_norm": 0.2584967613220215, "learning_rate": 1.540231461580303e-05, "loss": 1.5227, "step": 660 }, { "epoch": 0.6776232616940582, "grad_norm": 0.6467604637145996, "learning_rate": 1.459271243554303e-05, "loss": 1.4993, "step": 670 }, { "epoch": 0.6877370417193426, "grad_norm": 0.26516446471214294, "learning_rate": 1.3796105623268996e-05, "loss": 1.4785, "step": 680 }, { "epoch": 0.6978508217446271, "grad_norm": 0.3446539640426636, "learning_rate": 1.3013488885680591e-05, "loss": 1.4793, "step": 690 }, { "epoch": 0.7079646017699115, "grad_norm": 0.29439467191696167, "learning_rate": 1.224583946035619e-05, "loss": 1.5058, "step": 700 }, { "epoch": 0.718078381795196, "grad_norm": 0.20196016132831573, "learning_rate": 1.1494115895496224e-05, "loss": 1.4927, "step": 710 }, { "epoch": 0.7281921618204804, "grad_norm": 0.23624677956104279, "learning_rate": 1.0759256853003578e-05, "loss": 1.5192, "step": 720 }, { "epoch": 0.7383059418457648, "grad_norm": 0.23491314053535461, "learning_rate": 1.0042179936395573e-05, "loss": 1.4667, "step": 730 }, { "epoch": 0.7484197218710493, "grad_norm": 0.20944607257843018, "learning_rate": 9.34378054501118e-06, "loss": 1.4674, "step": 740 }, { "epoch": 0.7585335018963337, "grad_norm": 0.27759242057800293, "learning_rate": 8.664930755944062e-06, "loss": 1.5156, "step": 750 }, { "epoch": 0.7686472819216182, "grad_norm": 0.24669459462165833, "learning_rate": 8.006478235097706e-06, "loss": 1.4981, "step": 760 }, { "epoch": 0.7787610619469026, "grad_norm": 0.24962100386619568, "learning_rate": 7.369245178722253e-06, "loss": 1.4705, "step": 770 }, { "epoch": 0.7888748419721872, "grad_norm": 1.8606995344161987, "learning_rate": 6.754027286754802e-06, "loss": 1.4913, "step": 780 }, { "epoch": 0.7989886219974716, "grad_norm": 0.23127563297748566, "learning_rate": 6.161592769245114e-06, "loss": 1.4892, "step": 790 }, { "epoch": 0.809102402022756, "grad_norm": 0.5031234622001648, "learning_rate": 5.5926813871073455e-06, "loss": 1.483, "step": 800 }, { "epoch": 0.8192161820480405, "grad_norm": 0.23935504257678986, "learning_rate": 5.048003528395687e-06, "loss": 1.4815, "step": 810 }, { "epoch": 0.8293299620733249, "grad_norm": 0.2615523934364319, "learning_rate": 4.528239321257255e-06, "loss": 1.4645, "step": 820 }, { "epoch": 0.8394437420986094, "grad_norm": 0.25076770782470703, "learning_rate": 4.034037784669942e-06, "loss": 1.5147, "step": 830 }, { "epoch": 0.8495575221238938, "grad_norm": 0.23863032460212708, "learning_rate": 3.5660160180256254e-06, "loss": 1.5028, "step": 840 }, { "epoch": 0.8596713021491783, "grad_norm": 0.2394942194223404, "learning_rate": 3.1247584305707565e-06, "loss": 1.4779, "step": 850 }, { "epoch": 0.8697850821744627, "grad_norm": 0.2323542833328247, "learning_rate": 2.7108160116663893e-06, "loss": 1.487, "step": 860 }, { "epoch": 0.8798988621997471, "grad_norm": 0.23858946561813354, "learning_rate": 2.3247056427790347e-06, "loss": 1.4714, "step": 870 }, { "epoch": 0.8900126422250316, "grad_norm": 0.6770968437194824, "learning_rate": 1.96690945206128e-06, "loss": 1.46, "step": 880 }, { "epoch": 0.900126422250316, "grad_norm": 0.20593154430389404, "learning_rate": 1.637874212328186e-06, "loss": 1.4839, "step": 890 }, { "epoch": 0.9102402022756005, "grad_norm": 0.340822696685791, "learning_rate": 1.3380107831811816e-06, "loss": 1.457, "step": 900 }, { "epoch": 0.9203539823008849, "grad_norm": 0.5303720235824585, "learning_rate": 1.0676935979760466e-06, "loss": 1.482, "step": 910 }, { "epoch": 0.9304677623261695, "grad_norm": 0.401429146528244, "learning_rate": 8.272601962756005e-07, "loss": 1.4632, "step": 920 }, { "epoch": 0.9405815423514539, "grad_norm": 0.22615137696266174, "learning_rate": 6.170108023709348e-07, "loss": 1.4632, "step": 930 }, { "epoch": 0.9506953223767383, "grad_norm": 0.7189086079597473, "learning_rate": 4.37207950397453e-07, "loss": 1.4675, "step": 940 }, { "epoch": 0.9608091024020228, "grad_norm": 0.2771843671798706, "learning_rate": 2.880761565138418e-07, "loss": 1.4791, "step": 950 }, { "epoch": 0.9709228824273072, "grad_norm": 0.27663376927375793, "learning_rate": 1.6980163855331854e-07, "loss": 1.4927, "step": 960 }, { "epoch": 0.9810366624525917, "grad_norm": 0.3906765878200531, "learning_rate": 8.253208349721653e-08, "loss": 1.4982, "step": 970 }, { "epoch": 0.9911504424778761, "grad_norm": 0.7076007127761841, "learning_rate": 2.6376463061256185e-08, "loss": 1.467, "step": 980 }, { "epoch": 0.9992414664981036, "step": 988, "total_flos": 1.352098799838403e+19, "train_loss": 1.5513278781643762, "train_runtime": 82050.3389, "train_samples_per_second": 0.386, "train_steps_per_second": 0.012 } ], "logging_steps": 10, "max_steps": 988, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.352098799838403e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }