{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06603111716396352, "eval_steps": 20, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006603111716396351, "grad_norm": 300608.875, "learning_rate": 2.9999999999999997e-05, "loss": 118.0917, "step": 1 }, { "epoch": 0.0006603111716396351, "eval_loss": 15.74350357055664, "eval_runtime": 6.9487, "eval_samples_per_second": 71.236, "eval_steps_per_second": 71.236, "step": 1 }, { "epoch": 0.0013206223432792703, "grad_norm": 277566.0, "learning_rate": 5.9999999999999995e-05, "loss": 120.2507, "step": 2 }, { "epoch": 0.0019809335149189055, "grad_norm": 418405.1875, "learning_rate": 8.999999999999999e-05, "loss": 143.5985, "step": 3 }, { "epoch": 0.0026412446865585405, "grad_norm": 222641.09375, "learning_rate": 0.00011999999999999999, "loss": 113.3078, "step": 4 }, { "epoch": 0.003301555858198176, "grad_norm": 80659.1640625, "learning_rate": 0.00015, "loss": 89.0113, "step": 5 }, { "epoch": 0.003961867029837811, "grad_norm": 174515.8125, "learning_rate": 0.00017999999999999998, "loss": 92.8388, "step": 6 }, { "epoch": 0.004622178201477446, "grad_norm": 68650.6640625, "learning_rate": 0.00020999999999999998, "loss": 88.8969, "step": 7 }, { "epoch": 0.005282489373117081, "grad_norm": 52668.09765625, "learning_rate": 0.00023999999999999998, "loss": 83.2267, "step": 8 }, { "epoch": 0.005942800544756717, "grad_norm": 61407.41796875, "learning_rate": 0.00027, "loss": 92.7579, "step": 9 }, { "epoch": 0.006603111716396352, "grad_norm": 37122.61328125, "learning_rate": 0.0003, "loss": 84.5321, "step": 10 }, { "epoch": 0.007263422888035987, "grad_norm": 58673.69921875, "learning_rate": 0.0002999911984174669, "loss": 93.7724, "step": 11 }, { "epoch": 0.007923734059675622, "grad_norm": 121362.359375, "learning_rate": 0.0002999647947027726, "loss": 108.6176, "step": 12 }, { "epoch": 0.008584045231315257, "grad_norm": 233485.53125, "learning_rate": 0.0002999207919545099, "loss": 105.0768, "step": 13 }, { "epoch": 0.009244356402954892, "grad_norm": 318686.03125, "learning_rate": 0.0002998591953365965, "loss": 99.2428, "step": 14 }, { "epoch": 0.009904667574594527, "grad_norm": 101194.0625, "learning_rate": 0.00029978001207766854, "loss": 95.0131, "step": 15 }, { "epoch": 0.010564978746234162, "grad_norm": 83839.953125, "learning_rate": 0.00029968325147023263, "loss": 86.1607, "step": 16 }, { "epoch": 0.011225289917873797, "grad_norm": 68342.7109375, "learning_rate": 0.000299568924869575, "loss": 85.744, "step": 17 }, { "epoch": 0.011885601089513434, "grad_norm": 232218.078125, "learning_rate": 0.00029943704569242917, "loss": 89.0831, "step": 18 }, { "epoch": 0.012545912261153069, "grad_norm": 142226.359375, "learning_rate": 0.0002992876294154013, "loss": 87.8575, "step": 19 }, { "epoch": 0.013206223432792704, "grad_norm": 99313.8125, "learning_rate": 0.00029912069357315393, "loss": 91.0717, "step": 20 }, { "epoch": 0.013206223432792704, "eval_loss": 15.472484588623047, "eval_runtime": 6.5506, "eval_samples_per_second": 75.565, "eval_steps_per_second": 75.565, "step": 20 }, { "epoch": 0.013866534604432339, "grad_norm": 361644.25, "learning_rate": 0.00029893625775634835, "loss": 97.0436, "step": 21 }, { "epoch": 0.014526845776071974, "grad_norm": 1564964.125, "learning_rate": 0.0002987343436093454, "loss": 279.3873, "step": 22 }, { "epoch": 0.015187156947711609, "grad_norm": 1539204.875, "learning_rate": 0.00029851497482766547, "loss": 668.731, "step": 23 }, { "epoch": 0.015847468119351244, "grad_norm": 2246419.0, "learning_rate": 0.00029827817715520773, "loss": 758.2188, "step": 24 }, { "epoch": 0.01650777929099088, "grad_norm": 2776799.5, "learning_rate": 0.0002980239783812289, "loss": 546.4688, "step": 25 }, { "epoch": 0.017168090462630514, "grad_norm": 1617533.25, "learning_rate": 0.0002977524083370822, "loss": 422.7344, "step": 26 }, { "epoch": 0.01782840163427015, "grad_norm": 450934.75, "learning_rate": 0.00029746349889271645, "loss": 339.3945, "step": 27 }, { "epoch": 0.018488712805909784, "grad_norm": 2286499.5, "learning_rate": 0.0002971572839529358, "loss": 236.2812, "step": 28 }, { "epoch": 0.01914902397754942, "grad_norm": 3999992.25, "learning_rate": 0.00029683379945342125, "loss": 159.8123, "step": 29 }, { "epoch": 0.019809335149189054, "grad_norm": 308621.125, "learning_rate": 0.000296493083356513, "loss": 124.3508, "step": 30 }, { "epoch": 0.02046964632082869, "grad_norm": 497120.59375, "learning_rate": 0.00029613517564675565, "loss": 138.8941, "step": 31 }, { "epoch": 0.021129957492468324, "grad_norm": 99928.7890625, "learning_rate": 0.0002957601183262058, "loss": 151.8223, "step": 32 }, { "epoch": 0.02179026866410796, "grad_norm": 405539.0625, "learning_rate": 0.000295367955409503, "loss": 1203.2074, "step": 33 }, { "epoch": 0.022450579835747594, "grad_norm": 463994.21875, "learning_rate": 0.00029495873291870436, "loss": 1256.5367, "step": 34 }, { "epoch": 0.02311089100738723, "grad_norm": 953879.5625, "learning_rate": 0.0002945324988778834, "loss": 938.6675, "step": 35 }, { "epoch": 0.023771202179026868, "grad_norm": 178221.28125, "learning_rate": 0.00029408930330749477, "loss": 356.9532, "step": 36 }, { "epoch": 0.0244315133506665, "grad_norm": 146182.625, "learning_rate": 0.0002936291982185036, "loss": 238.3703, "step": 37 }, { "epoch": 0.025091824522306138, "grad_norm": 116065.7421875, "learning_rate": 0.00029315223760628217, "loss": 212.9555, "step": 38 }, { "epoch": 0.02575213569394577, "grad_norm": 129193.03125, "learning_rate": 0.00029265847744427303, "loss": 227.929, "step": 39 }, { "epoch": 0.026412446865585408, "grad_norm": 152996.75, "learning_rate": 0.00029214797567742035, "loss": 220.4361, "step": 40 }, { "epoch": 0.026412446865585408, "eval_loss": 16.9798526763916, "eval_runtime": 6.5479, "eval_samples_per_second": 75.597, "eval_steps_per_second": 75.597, "step": 40 }, { "epoch": 0.02707275803722504, "grad_norm": 166313.5, "learning_rate": 0.00029162079221537, "loss": 178.7949, "step": 41 }, { "epoch": 0.027733069208864678, "grad_norm": 135891.28125, "learning_rate": 0.0002910769889254386, "loss": 201.0785, "step": 42 }, { "epoch": 0.02839338038050431, "grad_norm": 64060.7890625, "learning_rate": 0.0002905166296253533, "loss": 163.5094, "step": 43 }, { "epoch": 0.029053691552143948, "grad_norm": 96362.4921875, "learning_rate": 0.0002899397800757626, "loss": 140.6384, "step": 44 }, { "epoch": 0.02971400272378358, "grad_norm": 166254.203125, "learning_rate": 0.0002893465079725187, "loss": 139.1684, "step": 45 }, { "epoch": 0.030374313895423218, "grad_norm": 161925.5625, "learning_rate": 0.0002887368829387333, "loss": 140.9152, "step": 46 }, { "epoch": 0.031034625067062855, "grad_norm": 637966.3125, "learning_rate": 0.0002881109765166071, "loss": 131.3419, "step": 47 }, { "epoch": 0.03169493623870249, "grad_norm": 367775.03125, "learning_rate": 0.00028746886215903387, "loss": 155.0525, "step": 48 }, { "epoch": 0.032355247410342125, "grad_norm": 307120.84375, "learning_rate": 0.00028681061522098047, "loss": 148.0313, "step": 49 }, { "epoch": 0.03301555858198176, "grad_norm": 164428.203125, "learning_rate": 0.0002861363129506435, "loss": 139.0605, "step": 50 }, { "epoch": 0.03367586975362139, "grad_norm": 56655.140625, "learning_rate": 0.0002854460344803842, "loss": 105.2498, "step": 51 }, { "epoch": 0.03433618092526103, "grad_norm": 110095.71875, "learning_rate": 0.00028473986081744163, "loss": 107.1039, "step": 52 }, { "epoch": 0.034996492096900665, "grad_norm": 84727.8125, "learning_rate": 0.000284017874834426, "loss": 114.7597, "step": 53 }, { "epoch": 0.0356568032685403, "grad_norm": 108558.9140625, "learning_rate": 0.0002832801612595937, "loss": 131.0451, "step": 54 }, { "epoch": 0.03631711444017993, "grad_norm": 35595.47265625, "learning_rate": 0.0002825268066669034, "loss": 135.1516, "step": 55 }, { "epoch": 0.03697742561181957, "grad_norm": 54421.08203125, "learning_rate": 0.00028175789946585693, "loss": 116.2731, "step": 56 }, { "epoch": 0.037637736783459205, "grad_norm": 72844.515625, "learning_rate": 0.0002809735298911234, "loss": 101.419, "step": 57 }, { "epoch": 0.03829804795509884, "grad_norm": 58473.41015625, "learning_rate": 0.00028017378999195015, "loss": 101.8432, "step": 58 }, { "epoch": 0.03895835912673848, "grad_norm": 41094.68359375, "learning_rate": 0.0002793587736213603, "loss": 114.5148, "step": 59 }, { "epoch": 0.03961867029837811, "grad_norm": 75345.5234375, "learning_rate": 0.00027852857642513836, "loss": 119.3659, "step": 60 }, { "epoch": 0.03961867029837811, "eval_loss": 16.175268173217773, "eval_runtime": 6.5722, "eval_samples_per_second": 75.317, "eval_steps_per_second": 75.317, "step": 60 }, { "epoch": 0.040278981470017745, "grad_norm": 70272.1640625, "learning_rate": 0.00027768329583060635, "loss": 110.4526, "step": 61 }, { "epoch": 0.04093929264165738, "grad_norm": 34992.48828125, "learning_rate": 0.00027682303103518976, "loss": 116.5263, "step": 62 }, { "epoch": 0.04159960381329702, "grad_norm": 56546.65625, "learning_rate": 0.00027594788299477655, "loss": 107.9915, "step": 63 }, { "epoch": 0.04225991498493665, "grad_norm": 23483.95703125, "learning_rate": 0.0002750579544118695, "loss": 105.6012, "step": 64 }, { "epoch": 0.042920226156576285, "grad_norm": 56980.02734375, "learning_rate": 0.00027415334972353357, "loss": 115.8085, "step": 65 }, { "epoch": 0.04358053732821592, "grad_norm": 32112.759765625, "learning_rate": 0.0002732341750891397, "loss": 100.6383, "step": 66 }, { "epoch": 0.04424084849985556, "grad_norm": 26189.25, "learning_rate": 0.00027230053837790666, "loss": 95.9158, "step": 67 }, { "epoch": 0.04490115967149519, "grad_norm": 27246.33203125, "learning_rate": 0.0002713525491562421, "loss": 107.4022, "step": 68 }, { "epoch": 0.045561470843134826, "grad_norm": 55204.88671875, "learning_rate": 0.0002703903186748843, "loss": 104.1617, "step": 69 }, { "epoch": 0.04622178201477446, "grad_norm": 47799.21875, "learning_rate": 0.00026941395985584653, "loss": 107.3508, "step": 70 }, { "epoch": 0.0468820931864141, "grad_norm": 24187.337890625, "learning_rate": 0.00026842358727916524, "loss": 98.8449, "step": 71 }, { "epoch": 0.047542404358053736, "grad_norm": 99253.2421875, "learning_rate": 0.0002674193171694533, "loss": 226.1011, "step": 72 }, { "epoch": 0.048202715529693366, "grad_norm": 293996.9375, "learning_rate": 0.0002664012673822609, "loss": 507.2474, "step": 73 }, { "epoch": 0.048863026701333, "grad_norm": 565162.4375, "learning_rate": 0.0002653695573902443, "loss": 619.3906, "step": 74 }, { "epoch": 0.04952333787297264, "grad_norm": 569260.125, "learning_rate": 0.0002643243082691454, "loss": 489.5234, "step": 75 }, { "epoch": 0.050183649044612276, "grad_norm": 564170.875, "learning_rate": 0.0002632656426835831, "loss": 570.9297, "step": 76 }, { "epoch": 0.050843960216251906, "grad_norm": 505015.0625, "learning_rate": 0.00026219368487265753, "loss": 504.5522, "step": 77 }, { "epoch": 0.05150427138789154, "grad_norm": 1804335.375, "learning_rate": 0.00026110856063537083, "loss": 404.3008, "step": 78 }, { "epoch": 0.05216458255953118, "grad_norm": 255148.265625, "learning_rate": 0.00026001039731586334, "loss": 266.7525, "step": 79 }, { "epoch": 0.052824893731170816, "grad_norm": 410041.71875, "learning_rate": 0.0002588993237884696, "loss": 194.2475, "step": 80 }, { "epoch": 0.052824893731170816, "eval_loss": 21.787084579467773, "eval_runtime": 6.6223, "eval_samples_per_second": 74.747, "eval_steps_per_second": 74.747, "step": 80 }, { "epoch": 0.05348520490281045, "grad_norm": 259394.6875, "learning_rate": 0.00025777547044259435, "loss": 200.2073, "step": 81 }, { "epoch": 0.05414551607445008, "grad_norm": 1378326.625, "learning_rate": 0.0002566389691674106, "loss": 100.5475, "step": 82 }, { "epoch": 0.05480582724608972, "grad_norm": 63462.6875, "learning_rate": 0.00025548995333638197, "loss": 189.407, "step": 83 }, { "epoch": 0.055466138417729356, "grad_norm": 149989.21875, "learning_rate": 0.00025432855779161076, "loss": 655.0445, "step": 84 }, { "epoch": 0.05612644958936899, "grad_norm": 161908.9375, "learning_rate": 0.00025315491882801347, "loss": 542.2335, "step": 85 }, { "epoch": 0.05678676076100862, "grad_norm": 140391.09375, "learning_rate": 0.00025196917417732615, "loss": 178.0071, "step": 86 }, { "epoch": 0.05744707193264826, "grad_norm": 45774.61328125, "learning_rate": 0.0002507714629919409, "loss": 145.9398, "step": 87 }, { "epoch": 0.058107383104287896, "grad_norm": 74355.359375, "learning_rate": 0.0002495619258285757, "loss": 162.5158, "step": 88 }, { "epoch": 0.05876769427592753, "grad_norm": 112329.7265625, "learning_rate": 0.0002483407046317794, "loss": 223.498, "step": 89 }, { "epoch": 0.05942800544756716, "grad_norm": 488449.875, "learning_rate": 0.00024710794271727413, "loss": 223.1561, "step": 90 }, { "epoch": 0.0600883166192068, "grad_norm": 146916.296875, "learning_rate": 0.0002458637847551364, "loss": 252.3947, "step": 91 }, { "epoch": 0.060748627790846436, "grad_norm": 115853.0703125, "learning_rate": 0.00024460837675281926, "loss": 265.611, "step": 92 }, { "epoch": 0.06140893896248607, "grad_norm": 95760.921875, "learning_rate": 0.00024334186603801807, "loss": 195.9439, "step": 93 }, { "epoch": 0.06206925013412571, "grad_norm": 58220.4609375, "learning_rate": 0.00024206440124138062, "loss": 173.6973, "step": 94 }, { "epoch": 0.06272956130576535, "grad_norm": 44573.25390625, "learning_rate": 0.0002407761322790648, "loss": 130.0355, "step": 95 }, { "epoch": 0.06338987247740498, "grad_norm": 48302.27734375, "learning_rate": 0.00023947721033514512, "loss": 110.2012, "step": 96 }, { "epoch": 0.0640501836490446, "grad_norm": 18446.73046875, "learning_rate": 0.00023816778784387094, "loss": 118.2505, "step": 97 }, { "epoch": 0.06471049482068425, "grad_norm": 35311.09375, "learning_rate": 0.0002368480184717773, "loss": 133.5809, "step": 98 }, { "epoch": 0.06537080599232388, "grad_norm": 38145.79296875, "learning_rate": 0.00023551805709965147, "loss": 129.8271, "step": 99 }, { "epoch": 0.06603111716396352, "grad_norm": 32865.98046875, "learning_rate": 0.00023417805980435736, "loss": 116.0362, "step": 100 }, { "epoch": 0.06603111716396352, "eval_loss": 9.961955070495605, "eval_runtime": 6.5733, "eval_samples_per_second": 75.305, "eval_steps_per_second": 75.305, "step": 100 } ], "logging_steps": 1, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 65231696953344.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }