{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.09244356402954892,
  "eval_steps": 20,
  "global_step": 140,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0006603111716396351,
      "grad_norm": 300608.875,
      "learning_rate": 2.9999999999999997e-05,
      "loss": 118.0917,
      "step": 1
    },
    {
      "epoch": 0.0006603111716396351,
      "eval_loss": 15.74350357055664,
      "eval_runtime": 6.9487,
      "eval_samples_per_second": 71.236,
      "eval_steps_per_second": 71.236,
      "step": 1
    },
    {
      "epoch": 0.0013206223432792703,
      "grad_norm": 277566.0,
      "learning_rate": 5.9999999999999995e-05,
      "loss": 120.2507,
      "step": 2
    },
    {
      "epoch": 0.0019809335149189055,
      "grad_norm": 418405.1875,
      "learning_rate": 8.999999999999999e-05,
      "loss": 143.5985,
      "step": 3
    },
    {
      "epoch": 0.0026412446865585405,
      "grad_norm": 222641.09375,
      "learning_rate": 0.00011999999999999999,
      "loss": 113.3078,
      "step": 4
    },
    {
      "epoch": 0.003301555858198176,
      "grad_norm": 80659.1640625,
      "learning_rate": 0.00015,
      "loss": 89.0113,
      "step": 5
    },
    {
      "epoch": 0.003961867029837811,
      "grad_norm": 174515.8125,
      "learning_rate": 0.00017999999999999998,
      "loss": 92.8388,
      "step": 6
    },
    {
      "epoch": 0.004622178201477446,
      "grad_norm": 68650.6640625,
      "learning_rate": 0.00020999999999999998,
      "loss": 88.8969,
      "step": 7
    },
    {
      "epoch": 0.005282489373117081,
      "grad_norm": 52668.09765625,
      "learning_rate": 0.00023999999999999998,
      "loss": 83.2267,
      "step": 8
    },
    {
      "epoch": 0.005942800544756717,
      "grad_norm": 61407.41796875,
      "learning_rate": 0.00027,
      "loss": 92.7579,
      "step": 9
    },
    {
      "epoch": 0.006603111716396352,
      "grad_norm": 37122.61328125,
      "learning_rate": 0.0003,
      "loss": 84.5321,
      "step": 10
    },
    {
      "epoch": 0.007263422888035987,
      "grad_norm": 58673.69921875,
      "learning_rate": 0.0002999911984174669,
      "loss": 93.7724,
      "step": 11
    },
    {
      "epoch": 0.007923734059675622,
      "grad_norm": 121362.359375,
      "learning_rate": 0.0002999647947027726,
      "loss": 108.6176,
      "step": 12
    },
    {
      "epoch": 0.008584045231315257,
      "grad_norm": 233485.53125,
      "learning_rate": 0.0002999207919545099,
      "loss": 105.0768,
      "step": 13
    },
    {
      "epoch": 0.009244356402954892,
      "grad_norm": 318686.03125,
      "learning_rate": 0.0002998591953365965,
      "loss": 99.2428,
      "step": 14
    },
    {
      "epoch": 0.009904667574594527,
      "grad_norm": 101194.0625,
      "learning_rate": 0.00029978001207766854,
      "loss": 95.0131,
      "step": 15
    },
    {
      "epoch": 0.010564978746234162,
      "grad_norm": 83839.953125,
      "learning_rate": 0.00029968325147023263,
      "loss": 86.1607,
      "step": 16
    },
    {
      "epoch": 0.011225289917873797,
      "grad_norm": 68342.7109375,
      "learning_rate": 0.000299568924869575,
      "loss": 85.744,
      "step": 17
    },
    {
      "epoch": 0.011885601089513434,
      "grad_norm": 232218.078125,
      "learning_rate": 0.00029943704569242917,
      "loss": 89.0831,
      "step": 18
    },
    {
      "epoch": 0.012545912261153069,
      "grad_norm": 142226.359375,
      "learning_rate": 0.0002992876294154013,
      "loss": 87.8575,
      "step": 19
    },
    {
      "epoch": 0.013206223432792704,
      "grad_norm": 99313.8125,
      "learning_rate": 0.00029912069357315393,
      "loss": 91.0717,
      "step": 20
    },
    {
      "epoch": 0.013206223432792704,
      "eval_loss": 15.472484588623047,
      "eval_runtime": 6.5506,
      "eval_samples_per_second": 75.565,
      "eval_steps_per_second": 75.565,
      "step": 20
    },
    {
      "epoch": 0.013866534604432339,
      "grad_norm": 361644.25,
      "learning_rate": 0.00029893625775634835,
      "loss": 97.0436,
      "step": 21
    },
    {
      "epoch": 0.014526845776071974,
      "grad_norm": 1564964.125,
      "learning_rate": 0.0002987343436093454,
      "loss": 279.3873,
      "step": 22
    },
    {
      "epoch": 0.015187156947711609,
      "grad_norm": 1539204.875,
      "learning_rate": 0.00029851497482766547,
      "loss": 668.731,
      "step": 23
    },
    {
      "epoch": 0.015847468119351244,
      "grad_norm": 2246419.0,
      "learning_rate": 0.00029827817715520773,
      "loss": 758.2188,
      "step": 24
    },
    {
      "epoch": 0.01650777929099088,
      "grad_norm": 2776799.5,
      "learning_rate": 0.0002980239783812289,
      "loss": 546.4688,
      "step": 25
    },
    {
      "epoch": 0.017168090462630514,
      "grad_norm": 1617533.25,
      "learning_rate": 0.0002977524083370822,
      "loss": 422.7344,
      "step": 26
    },
    {
      "epoch": 0.01782840163427015,
      "grad_norm": 450934.75,
      "learning_rate": 0.00029746349889271645,
      "loss": 339.3945,
      "step": 27
    },
    {
      "epoch": 0.018488712805909784,
      "grad_norm": 2286499.5,
      "learning_rate": 0.0002971572839529358,
      "loss": 236.2812,
      "step": 28
    },
    {
      "epoch": 0.01914902397754942,
      "grad_norm": 3999992.25,
      "learning_rate": 0.00029683379945342125,
      "loss": 159.8123,
      "step": 29
    },
    {
      "epoch": 0.019809335149189054,
      "grad_norm": 308621.125,
      "learning_rate": 0.000296493083356513,
      "loss": 124.3508,
      "step": 30
    },
    {
      "epoch": 0.02046964632082869,
      "grad_norm": 497120.59375,
      "learning_rate": 0.00029613517564675565,
      "loss": 138.8941,
      "step": 31
    },
    {
      "epoch": 0.021129957492468324,
      "grad_norm": 99928.7890625,
      "learning_rate": 0.0002957601183262058,
      "loss": 151.8223,
      "step": 32
    },
    {
      "epoch": 0.02179026866410796,
      "grad_norm": 405539.0625,
      "learning_rate": 0.000295367955409503,
      "loss": 1203.2074,
      "step": 33
    },
    {
      "epoch": 0.022450579835747594,
      "grad_norm": 463994.21875,
      "learning_rate": 0.00029495873291870436,
      "loss": 1256.5367,
      "step": 34
    },
    {
      "epoch": 0.02311089100738723,
      "grad_norm": 953879.5625,
      "learning_rate": 0.0002945324988778834,
      "loss": 938.6675,
      "step": 35
    },
    {
      "epoch": 0.023771202179026868,
      "grad_norm": 178221.28125,
      "learning_rate": 0.00029408930330749477,
      "loss": 356.9532,
      "step": 36
    },
    {
      "epoch": 0.0244315133506665,
      "grad_norm": 146182.625,
      "learning_rate": 0.0002936291982185036,
      "loss": 238.3703,
      "step": 37
    },
    {
      "epoch": 0.025091824522306138,
      "grad_norm": 116065.7421875,
      "learning_rate": 0.00029315223760628217,
      "loss": 212.9555,
      "step": 38
    },
    {
      "epoch": 0.02575213569394577,
      "grad_norm": 129193.03125,
      "learning_rate": 0.00029265847744427303,
      "loss": 227.929,
      "step": 39
    },
    {
      "epoch": 0.026412446865585408,
      "grad_norm": 152996.75,
      "learning_rate": 0.00029214797567742035,
      "loss": 220.4361,
      "step": 40
    },
    {
      "epoch": 0.026412446865585408,
      "eval_loss": 16.9798526763916,
      "eval_runtime": 6.5479,
      "eval_samples_per_second": 75.597,
      "eval_steps_per_second": 75.597,
      "step": 40
    },
    {
      "epoch": 0.02707275803722504,
      "grad_norm": 166313.5,
      "learning_rate": 0.00029162079221537,
      "loss": 178.7949,
      "step": 41
    },
    {
      "epoch": 0.027733069208864678,
      "grad_norm": 135891.28125,
      "learning_rate": 0.0002910769889254386,
      "loss": 201.0785,
      "step": 42
    },
    {
      "epoch": 0.02839338038050431,
      "grad_norm": 64060.7890625,
      "learning_rate": 0.0002905166296253533,
      "loss": 163.5094,
      "step": 43
    },
    {
      "epoch": 0.029053691552143948,
      "grad_norm": 96362.4921875,
      "learning_rate": 0.0002899397800757626,
      "loss": 140.6384,
      "step": 44
    },
    {
      "epoch": 0.02971400272378358,
      "grad_norm": 166254.203125,
      "learning_rate": 0.0002893465079725187,
      "loss": 139.1684,
      "step": 45
    },
    {
      "epoch": 0.030374313895423218,
      "grad_norm": 161925.5625,
      "learning_rate": 0.0002887368829387333,
      "loss": 140.9152,
      "step": 46
    },
    {
      "epoch": 0.031034625067062855,
      "grad_norm": 637966.3125,
      "learning_rate": 0.0002881109765166071,
      "loss": 131.3419,
      "step": 47
    },
    {
      "epoch": 0.03169493623870249,
      "grad_norm": 367775.03125,
      "learning_rate": 0.00028746886215903387,
      "loss": 155.0525,
      "step": 48
    },
    {
      "epoch": 0.032355247410342125,
      "grad_norm": 307120.84375,
      "learning_rate": 0.00028681061522098047,
      "loss": 148.0313,
      "step": 49
    },
    {
      "epoch": 0.03301555858198176,
      "grad_norm": 164428.203125,
      "learning_rate": 0.0002861363129506435,
      "loss": 139.0605,
      "step": 50
    },
    {
      "epoch": 0.03367586975362139,
      "grad_norm": 56655.140625,
      "learning_rate": 0.0002854460344803842,
      "loss": 105.2498,
      "step": 51
    },
    {
      "epoch": 0.03433618092526103,
      "grad_norm": 110095.71875,
      "learning_rate": 0.00028473986081744163,
      "loss": 107.1039,
      "step": 52
    },
    {
      "epoch": 0.034996492096900665,
      "grad_norm": 84727.8125,
      "learning_rate": 0.000284017874834426,
      "loss": 114.7597,
      "step": 53
    },
    {
      "epoch": 0.0356568032685403,
      "grad_norm": 108558.9140625,
      "learning_rate": 0.0002832801612595937,
      "loss": 131.0451,
      "step": 54
    },
    {
      "epoch": 0.03631711444017993,
      "grad_norm": 35595.47265625,
      "learning_rate": 0.0002825268066669034,
      "loss": 135.1516,
      "step": 55
    },
    {
      "epoch": 0.03697742561181957,
      "grad_norm": 54421.08203125,
      "learning_rate": 0.00028175789946585693,
      "loss": 116.2731,
      "step": 56
    },
    {
      "epoch": 0.037637736783459205,
      "grad_norm": 72844.515625,
      "learning_rate": 0.0002809735298911234,
      "loss": 101.419,
      "step": 57
    },
    {
      "epoch": 0.03829804795509884,
      "grad_norm": 58473.41015625,
      "learning_rate": 0.00028017378999195015,
      "loss": 101.8432,
      "step": 58
    },
    {
      "epoch": 0.03895835912673848,
      "grad_norm": 41094.68359375,
      "learning_rate": 0.0002793587736213603,
      "loss": 114.5148,
      "step": 59
    },
    {
      "epoch": 0.03961867029837811,
      "grad_norm": 75345.5234375,
      "learning_rate": 0.00027852857642513836,
      "loss": 119.3659,
      "step": 60
    },
    {
      "epoch": 0.03961867029837811,
      "eval_loss": 16.175268173217773,
      "eval_runtime": 6.5722,
      "eval_samples_per_second": 75.317,
      "eval_steps_per_second": 75.317,
      "step": 60
    },
    {
      "epoch": 0.040278981470017745,
      "grad_norm": 70272.1640625,
      "learning_rate": 0.00027768329583060635,
      "loss": 110.4526,
      "step": 61
    },
    {
      "epoch": 0.04093929264165738,
      "grad_norm": 34992.48828125,
      "learning_rate": 0.00027682303103518976,
      "loss": 116.5263,
      "step": 62
    },
    {
      "epoch": 0.04159960381329702,
      "grad_norm": 56546.65625,
      "learning_rate": 0.00027594788299477655,
      "loss": 107.9915,
      "step": 63
    },
    {
      "epoch": 0.04225991498493665,
      "grad_norm": 23483.95703125,
      "learning_rate": 0.0002750579544118695,
      "loss": 105.6012,
      "step": 64
    },
    {
      "epoch": 0.042920226156576285,
      "grad_norm": 56980.02734375,
      "learning_rate": 0.00027415334972353357,
      "loss": 115.8085,
      "step": 65
    },
    {
      "epoch": 0.04358053732821592,
      "grad_norm": 32112.759765625,
      "learning_rate": 0.0002732341750891397,
      "loss": 100.6383,
      "step": 66
    },
    {
      "epoch": 0.04424084849985556,
      "grad_norm": 26189.25,
      "learning_rate": 0.00027230053837790666,
      "loss": 95.9158,
      "step": 67
    },
    {
      "epoch": 0.04490115967149519,
      "grad_norm": 27246.33203125,
      "learning_rate": 0.0002713525491562421,
      "loss": 107.4022,
      "step": 68
    },
    {
      "epoch": 0.045561470843134826,
      "grad_norm": 55204.88671875,
      "learning_rate": 0.0002703903186748843,
      "loss": 104.1617,
      "step": 69
    },
    {
      "epoch": 0.04622178201477446,
      "grad_norm": 47799.21875,
      "learning_rate": 0.00026941395985584653,
      "loss": 107.3508,
      "step": 70
    },
    {
      "epoch": 0.0468820931864141,
      "grad_norm": 24187.337890625,
      "learning_rate": 0.00026842358727916524,
      "loss": 98.8449,
      "step": 71
    },
    {
      "epoch": 0.047542404358053736,
      "grad_norm": 99253.2421875,
      "learning_rate": 0.0002674193171694533,
      "loss": 226.1011,
      "step": 72
    },
    {
      "epoch": 0.048202715529693366,
      "grad_norm": 293996.9375,
      "learning_rate": 0.0002664012673822609,
      "loss": 507.2474,
      "step": 73
    },
    {
      "epoch": 0.048863026701333,
      "grad_norm": 565162.4375,
      "learning_rate": 0.0002653695573902443,
      "loss": 619.3906,
      "step": 74
    },
    {
      "epoch": 0.04952333787297264,
      "grad_norm": 569260.125,
      "learning_rate": 0.0002643243082691454,
      "loss": 489.5234,
      "step": 75
    },
    {
      "epoch": 0.050183649044612276,
      "grad_norm": 564170.875,
      "learning_rate": 0.0002632656426835831,
      "loss": 570.9297,
      "step": 76
    },
    {
      "epoch": 0.050843960216251906,
      "grad_norm": 505015.0625,
      "learning_rate": 0.00026219368487265753,
      "loss": 504.5522,
      "step": 77
    },
    {
      "epoch": 0.05150427138789154,
      "grad_norm": 1804335.375,
      "learning_rate": 0.00026110856063537083,
      "loss": 404.3008,
      "step": 78
    },
    {
      "epoch": 0.05216458255953118,
      "grad_norm": 255148.265625,
      "learning_rate": 0.00026001039731586334,
      "loss": 266.7525,
      "step": 79
    },
    {
      "epoch": 0.052824893731170816,
      "grad_norm": 410041.71875,
      "learning_rate": 0.0002588993237884696,
      "loss": 194.2475,
      "step": 80
    },
    {
      "epoch": 0.052824893731170816,
      "eval_loss": 21.787084579467773,
      "eval_runtime": 6.6223,
      "eval_samples_per_second": 74.747,
      "eval_steps_per_second": 74.747,
      "step": 80
    },
    {
      "epoch": 0.05348520490281045,
      "grad_norm": 259394.6875,
      "learning_rate": 0.00025777547044259435,
      "loss": 200.2073,
      "step": 81
    },
    {
      "epoch": 0.05414551607445008,
      "grad_norm": 1378326.625,
      "learning_rate": 0.0002566389691674106,
      "loss": 100.5475,
      "step": 82
    },
    {
      "epoch": 0.05480582724608972,
      "grad_norm": 63462.6875,
      "learning_rate": 0.00025548995333638197,
      "loss": 189.407,
      "step": 83
    },
    {
      "epoch": 0.055466138417729356,
      "grad_norm": 149989.21875,
      "learning_rate": 0.00025432855779161076,
      "loss": 655.0445,
      "step": 84
    },
    {
      "epoch": 0.05612644958936899,
      "grad_norm": 161908.9375,
      "learning_rate": 0.00025315491882801347,
      "loss": 542.2335,
      "step": 85
    },
    {
      "epoch": 0.05678676076100862,
      "grad_norm": 140391.09375,
      "learning_rate": 0.00025196917417732615,
      "loss": 178.0071,
      "step": 86
    },
    {
      "epoch": 0.05744707193264826,
      "grad_norm": 45774.61328125,
      "learning_rate": 0.0002507714629919409,
      "loss": 145.9398,
      "step": 87
    },
    {
      "epoch": 0.058107383104287896,
      "grad_norm": 74355.359375,
      "learning_rate": 0.0002495619258285757,
      "loss": 162.5158,
      "step": 88
    },
    {
      "epoch": 0.05876769427592753,
      "grad_norm": 112329.7265625,
      "learning_rate": 0.0002483407046317794,
      "loss": 223.498,
      "step": 89
    },
    {
      "epoch": 0.05942800544756716,
      "grad_norm": 488449.875,
      "learning_rate": 0.00024710794271727413,
      "loss": 223.1561,
      "step": 90
    },
    {
      "epoch": 0.0600883166192068,
      "grad_norm": 146916.296875,
      "learning_rate": 0.0002458637847551364,
      "loss": 252.3947,
      "step": 91
    },
    {
      "epoch": 0.060748627790846436,
      "grad_norm": 115853.0703125,
      "learning_rate": 0.00024460837675281926,
      "loss": 265.611,
      "step": 92
    },
    {
      "epoch": 0.06140893896248607,
      "grad_norm": 95760.921875,
      "learning_rate": 0.00024334186603801807,
      "loss": 195.9439,
      "step": 93
    },
    {
      "epoch": 0.06206925013412571,
      "grad_norm": 58220.4609375,
      "learning_rate": 0.00024206440124138062,
      "loss": 173.6973,
      "step": 94
    },
    {
      "epoch": 0.06272956130576535,
      "grad_norm": 44573.25390625,
      "learning_rate": 0.0002407761322790648,
      "loss": 130.0355,
      "step": 95
    },
    {
      "epoch": 0.06338987247740498,
      "grad_norm": 48302.27734375,
      "learning_rate": 0.00023947721033514512,
      "loss": 110.2012,
      "step": 96
    },
    {
      "epoch": 0.0640501836490446,
      "grad_norm": 18446.73046875,
      "learning_rate": 0.00023816778784387094,
      "loss": 118.2505,
      "step": 97
    },
    {
      "epoch": 0.06471049482068425,
      "grad_norm": 35311.09375,
      "learning_rate": 0.0002368480184717773,
      "loss": 133.5809,
      "step": 98
    },
    {
      "epoch": 0.06537080599232388,
      "grad_norm": 38145.79296875,
      "learning_rate": 0.00023551805709965147,
      "loss": 129.8271,
      "step": 99
    },
    {
      "epoch": 0.06603111716396352,
      "grad_norm": 32865.98046875,
      "learning_rate": 0.00023417805980435736,
      "loss": 116.0362,
      "step": 100
    },
    {
      "epoch": 0.06603111716396352,
      "eval_loss": 9.961955070495605,
      "eval_runtime": 6.5733,
      "eval_samples_per_second": 75.305,
      "eval_steps_per_second": 75.305,
      "step": 100
    },
    {
      "epoch": 0.06669142833560315,
      "grad_norm": 18817.16796875,
      "learning_rate": 0.00023282818384051866,
      "loss": 113.1262,
      "step": 101
    },
    {
      "epoch": 0.06735173950724278,
      "grad_norm": 24054.869140625,
      "learning_rate": 0.00023146858762206489,
      "loss": 108.1982,
      "step": 102
    },
    {
      "epoch": 0.06801205067888243,
      "grad_norm": 34655.8671875,
      "learning_rate": 0.00023009943070364044,
      "loss": 108.4203,
      "step": 103
    },
    {
      "epoch": 0.06867236185052206,
      "grad_norm": 29425.787109375,
      "learning_rate": 0.0002287208737618801,
      "loss": 107.4073,
      "step": 104
    },
    {
      "epoch": 0.0693326730221617,
      "grad_norm": 13952.1171875,
      "learning_rate": 0.00022733307857655325,
      "loss": 105.1398,
      "step": 105
    },
    {
      "epoch": 0.06999298419380133,
      "grad_norm": 20227.431640625,
      "learning_rate": 0.00022593620801157808,
      "loss": 115.2134,
      "step": 106
    },
    {
      "epoch": 0.07065329536544096,
      "grad_norm": 19999.79296875,
      "learning_rate": 0.00022453042599590882,
      "loss": 113.6159,
      "step": 107
    },
    {
      "epoch": 0.0713136065370806,
      "grad_norm": 18226.33203125,
      "learning_rate": 0.00022311589750429787,
      "loss": 110.2182,
      "step": 108
    },
    {
      "epoch": 0.07197391770872023,
      "grad_norm": 15471.123046875,
      "learning_rate": 0.00022169278853793545,
      "loss": 98.862,
      "step": 109
    },
    {
      "epoch": 0.07263422888035986,
      "grad_norm": 9518.90625,
      "learning_rate": 0.00022026126610496852,
      "loss": 100.519,
      "step": 110
    },
    {
      "epoch": 0.07329454005199951,
      "grad_norm": 12838.0771484375,
      "learning_rate": 0.0002188214982009016,
      "loss": 99.1184,
      "step": 111
    },
    {
      "epoch": 0.07395485122363914,
      "grad_norm": 13236.9697265625,
      "learning_rate": 0.00021737365378888187,
      "loss": 108.3643,
      "step": 112
    },
    {
      "epoch": 0.07461516239527878,
      "grad_norm": 21540.712890625,
      "learning_rate": 0.00021591790277987043,
      "loss": 106.4385,
      "step": 113
    },
    {
      "epoch": 0.07527547356691841,
      "grad_norm": 13282.7333984375,
      "learning_rate": 0.00021445441601270276,
      "loss": 111.6325,
      "step": 114
    },
    {
      "epoch": 0.07593578473855804,
      "grad_norm": 32402.203125,
      "learning_rate": 0.00021298336523403968,
      "loss": 102.4856,
      "step": 115
    },
    {
      "epoch": 0.07659609591019768,
      "grad_norm": 23308.939453125,
      "learning_rate": 0.0002115049230782124,
      "loss": 99.6906,
      "step": 116
    },
    {
      "epoch": 0.07725640708183731,
      "grad_norm": 21524.953125,
      "learning_rate": 0.00021001926304696296,
      "loss": 90.451,
      "step": 117
    },
    {
      "epoch": 0.07791671825347696,
      "grad_norm": 13045.5537109375,
      "learning_rate": 0.00020852655948908316,
      "loss": 93.52,
      "step": 118
    },
    {
      "epoch": 0.07857702942511659,
      "grad_norm": 18377.09375,
      "learning_rate": 0.0002070269875799538,
      "loss": 85.6482,
      "step": 119
    },
    {
      "epoch": 0.07923734059675622,
      "grad_norm": 12025.564453125,
      "learning_rate": 0.00020552072330098716,
      "loss": 89.6598,
      "step": 120
    },
    {
      "epoch": 0.07923734059675622,
      "eval_loss": 12.788580894470215,
      "eval_runtime": 6.5873,
      "eval_samples_per_second": 75.145,
      "eval_steps_per_second": 75.145,
      "step": 120
    },
    {
      "epoch": 0.07989765176839586,
      "grad_norm": 10677.837890625,
      "learning_rate": 0.0002040079434189748,
      "loss": 90.3255,
      "step": 121
    },
    {
      "epoch": 0.08055796294003549,
      "grad_norm": 13001.4951171875,
      "learning_rate": 0.00020248882546534326,
      "loss": 94.9763,
      "step": 122
    },
    {
      "epoch": 0.08121827411167512,
      "grad_norm": 88364.6875,
      "learning_rate": 0.00020096354771531976,
      "loss": 210.5573,
      "step": 123
    },
    {
      "epoch": 0.08187858528331476,
      "grad_norm": 314990.40625,
      "learning_rate": 0.00019943228916701104,
      "loss": 602.6875,
      "step": 124
    },
    {
      "epoch": 0.0825388964549544,
      "grad_norm": 264931.5,
      "learning_rate": 0.00019789522952039695,
      "loss": 513.6562,
      "step": 125
    },
    {
      "epoch": 0.08319920762659404,
      "grad_norm": 157227.640625,
      "learning_rate": 0.0001963525491562421,
      "loss": 511.8125,
      "step": 126
    },
    {
      "epoch": 0.08385951879823367,
      "grad_norm": 537004.0625,
      "learning_rate": 0.00019480442911492702,
      "loss": 268.8125,
      "step": 127
    },
    {
      "epoch": 0.0845198299698733,
      "grad_norm": 1529112.375,
      "learning_rate": 0.00019325105107520263,
      "loss": 257.3359,
      "step": 128
    },
    {
      "epoch": 0.08518014114151294,
      "grad_norm": 231625.046875,
      "learning_rate": 0.00019169259733286913,
      "loss": 202.3438,
      "step": 129
    },
    {
      "epoch": 0.08584045231315257,
      "grad_norm": 306631.5625,
      "learning_rate": 0.00019012925077938314,
      "loss": 181.8047,
      "step": 130
    },
    {
      "epoch": 0.08650076348479221,
      "grad_norm": 118856.3203125,
      "learning_rate": 0.0001885611948803941,
      "loss": 179.7969,
      "step": 131
    },
    {
      "epoch": 0.08716107465643184,
      "grad_norm": 98292.1796875,
      "learning_rate": 0.0001869886136542143,
      "loss": 392.7283,
      "step": 132
    },
    {
      "epoch": 0.08782138582807147,
      "grad_norm": 89386.7734375,
      "learning_rate": 0.00018541169165022298,
      "loss": 318.7403,
      "step": 133
    },
    {
      "epoch": 0.08848169699971112,
      "grad_norm": 137117.296875,
      "learning_rate": 0.00018383061392720913,
      "loss": 245.4405,
      "step": 134
    },
    {
      "epoch": 0.08914200817135075,
      "grad_norm": 150909.71875,
      "learning_rate": 0.0001822455660316536,
      "loss": 156.249,
      "step": 135
    },
    {
      "epoch": 0.08980231934299038,
      "grad_norm": 60323.1875,
      "learning_rate": 0.00018065673397595473,
      "loss": 113.9602,
      "step": 136
    },
    {
      "epoch": 0.09046263051463002,
      "grad_norm": 42032.9921875,
      "learning_rate": 0.00017906430421659876,
      "loss": 107.9544,
      "step": 137
    },
    {
      "epoch": 0.09112294168626965,
      "grad_norm": 33694.890625,
      "learning_rate": 0.00017746846363227842,
      "loss": 111.8,
      "step": 138
    },
    {
      "epoch": 0.0917832528579093,
      "grad_norm": 34851.86328125,
      "learning_rate": 0.00017586939950196186,
      "loss": 103.5465,
      "step": 139
    },
    {
      "epoch": 0.09244356402954892,
      "grad_norm": 24885.861328125,
      "learning_rate": 0.00017426729948291474,
      "loss": 101.8808,
      "step": 140
    },
    {
      "epoch": 0.09244356402954892,
      "eval_loss": 10.751233100891113,
      "eval_runtime": 6.5882,
      "eval_samples_per_second": 75.135,
      "eval_steps_per_second": 75.135,
      "step": 140
    }
  ],
  "logging_steps": 1,
  "max_steps": 300,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 20,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 93374726012928.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}