|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.03961867029837811, |
|
"eval_steps": 20, |
|
"global_step": 60, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0006603111716396351, |
|
"grad_norm": 300608.875, |
|
"learning_rate": 2.9999999999999997e-05, |
|
"loss": 118.0917, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0006603111716396351, |
|
"eval_loss": 15.74350357055664, |
|
"eval_runtime": 6.9487, |
|
"eval_samples_per_second": 71.236, |
|
"eval_steps_per_second": 71.236, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0013206223432792703, |
|
"grad_norm": 277566.0, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 120.2507, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0019809335149189055, |
|
"grad_norm": 418405.1875, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 143.5985, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0026412446865585405, |
|
"grad_norm": 222641.09375, |
|
"learning_rate": 0.00011999999999999999, |
|
"loss": 113.3078, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.003301555858198176, |
|
"grad_norm": 80659.1640625, |
|
"learning_rate": 0.00015, |
|
"loss": 89.0113, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.003961867029837811, |
|
"grad_norm": 174515.8125, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 92.8388, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.004622178201477446, |
|
"grad_norm": 68650.6640625, |
|
"learning_rate": 0.00020999999999999998, |
|
"loss": 88.8969, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.005282489373117081, |
|
"grad_norm": 52668.09765625, |
|
"learning_rate": 0.00023999999999999998, |
|
"loss": 83.2267, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.005942800544756717, |
|
"grad_norm": 61407.41796875, |
|
"learning_rate": 0.00027, |
|
"loss": 92.7579, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.006603111716396352, |
|
"grad_norm": 37122.61328125, |
|
"learning_rate": 0.0003, |
|
"loss": 84.5321, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.007263422888035987, |
|
"grad_norm": 58673.69921875, |
|
"learning_rate": 0.0002999911984174669, |
|
"loss": 93.7724, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.007923734059675622, |
|
"grad_norm": 121362.359375, |
|
"learning_rate": 0.0002999647947027726, |
|
"loss": 108.6176, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.008584045231315257, |
|
"grad_norm": 233485.53125, |
|
"learning_rate": 0.0002999207919545099, |
|
"loss": 105.0768, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.009244356402954892, |
|
"grad_norm": 318686.03125, |
|
"learning_rate": 0.0002998591953365965, |
|
"loss": 99.2428, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.009904667574594527, |
|
"grad_norm": 101194.0625, |
|
"learning_rate": 0.00029978001207766854, |
|
"loss": 95.0131, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.010564978746234162, |
|
"grad_norm": 83839.953125, |
|
"learning_rate": 0.00029968325147023263, |
|
"loss": 86.1607, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.011225289917873797, |
|
"grad_norm": 68342.7109375, |
|
"learning_rate": 0.000299568924869575, |
|
"loss": 85.744, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.011885601089513434, |
|
"grad_norm": 232218.078125, |
|
"learning_rate": 0.00029943704569242917, |
|
"loss": 89.0831, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.012545912261153069, |
|
"grad_norm": 142226.359375, |
|
"learning_rate": 0.0002992876294154013, |
|
"loss": 87.8575, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.013206223432792704, |
|
"grad_norm": 99313.8125, |
|
"learning_rate": 0.00029912069357315393, |
|
"loss": 91.0717, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.013206223432792704, |
|
"eval_loss": 15.472484588623047, |
|
"eval_runtime": 6.5506, |
|
"eval_samples_per_second": 75.565, |
|
"eval_steps_per_second": 75.565, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.013866534604432339, |
|
"grad_norm": 361644.25, |
|
"learning_rate": 0.00029893625775634835, |
|
"loss": 97.0436, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.014526845776071974, |
|
"grad_norm": 1564964.125, |
|
"learning_rate": 0.0002987343436093454, |
|
"loss": 279.3873, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.015187156947711609, |
|
"grad_norm": 1539204.875, |
|
"learning_rate": 0.00029851497482766547, |
|
"loss": 668.731, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.015847468119351244, |
|
"grad_norm": 2246419.0, |
|
"learning_rate": 0.00029827817715520773, |
|
"loss": 758.2188, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.01650777929099088, |
|
"grad_norm": 2776799.5, |
|
"learning_rate": 0.0002980239783812289, |
|
"loss": 546.4688, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.017168090462630514, |
|
"grad_norm": 1617533.25, |
|
"learning_rate": 0.0002977524083370822, |
|
"loss": 422.7344, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.01782840163427015, |
|
"grad_norm": 450934.75, |
|
"learning_rate": 0.00029746349889271645, |
|
"loss": 339.3945, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.018488712805909784, |
|
"grad_norm": 2286499.5, |
|
"learning_rate": 0.0002971572839529358, |
|
"loss": 236.2812, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.01914902397754942, |
|
"grad_norm": 3999992.25, |
|
"learning_rate": 0.00029683379945342125, |
|
"loss": 159.8123, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.019809335149189054, |
|
"grad_norm": 308621.125, |
|
"learning_rate": 0.000296493083356513, |
|
"loss": 124.3508, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02046964632082869, |
|
"grad_norm": 497120.59375, |
|
"learning_rate": 0.00029613517564675565, |
|
"loss": 138.8941, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.021129957492468324, |
|
"grad_norm": 99928.7890625, |
|
"learning_rate": 0.0002957601183262058, |
|
"loss": 151.8223, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.02179026866410796, |
|
"grad_norm": 405539.0625, |
|
"learning_rate": 0.000295367955409503, |
|
"loss": 1203.2074, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.022450579835747594, |
|
"grad_norm": 463994.21875, |
|
"learning_rate": 0.00029495873291870436, |
|
"loss": 1256.5367, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.02311089100738723, |
|
"grad_norm": 953879.5625, |
|
"learning_rate": 0.0002945324988778834, |
|
"loss": 938.6675, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.023771202179026868, |
|
"grad_norm": 178221.28125, |
|
"learning_rate": 0.00029408930330749477, |
|
"loss": 356.9532, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0244315133506665, |
|
"grad_norm": 146182.625, |
|
"learning_rate": 0.0002936291982185036, |
|
"loss": 238.3703, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.025091824522306138, |
|
"grad_norm": 116065.7421875, |
|
"learning_rate": 0.00029315223760628217, |
|
"loss": 212.9555, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.02575213569394577, |
|
"grad_norm": 129193.03125, |
|
"learning_rate": 0.00029265847744427303, |
|
"loss": 227.929, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.026412446865585408, |
|
"grad_norm": 152996.75, |
|
"learning_rate": 0.00029214797567742035, |
|
"loss": 220.4361, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.026412446865585408, |
|
"eval_loss": 16.9798526763916, |
|
"eval_runtime": 6.5479, |
|
"eval_samples_per_second": 75.597, |
|
"eval_steps_per_second": 75.597, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02707275803722504, |
|
"grad_norm": 166313.5, |
|
"learning_rate": 0.00029162079221537, |
|
"loss": 178.7949, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.027733069208864678, |
|
"grad_norm": 135891.28125, |
|
"learning_rate": 0.0002910769889254386, |
|
"loss": 201.0785, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.02839338038050431, |
|
"grad_norm": 64060.7890625, |
|
"learning_rate": 0.0002905166296253533, |
|
"loss": 163.5094, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.029053691552143948, |
|
"grad_norm": 96362.4921875, |
|
"learning_rate": 0.0002899397800757626, |
|
"loss": 140.6384, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.02971400272378358, |
|
"grad_norm": 166254.203125, |
|
"learning_rate": 0.0002893465079725187, |
|
"loss": 139.1684, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.030374313895423218, |
|
"grad_norm": 161925.5625, |
|
"learning_rate": 0.0002887368829387333, |
|
"loss": 140.9152, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.031034625067062855, |
|
"grad_norm": 637966.3125, |
|
"learning_rate": 0.0002881109765166071, |
|
"loss": 131.3419, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.03169493623870249, |
|
"grad_norm": 367775.03125, |
|
"learning_rate": 0.00028746886215903387, |
|
"loss": 155.0525, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.032355247410342125, |
|
"grad_norm": 307120.84375, |
|
"learning_rate": 0.00028681061522098047, |
|
"loss": 148.0313, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.03301555858198176, |
|
"grad_norm": 164428.203125, |
|
"learning_rate": 0.0002861363129506435, |
|
"loss": 139.0605, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03367586975362139, |
|
"grad_norm": 56655.140625, |
|
"learning_rate": 0.0002854460344803842, |
|
"loss": 105.2498, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.03433618092526103, |
|
"grad_norm": 110095.71875, |
|
"learning_rate": 0.00028473986081744163, |
|
"loss": 107.1039, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.034996492096900665, |
|
"grad_norm": 84727.8125, |
|
"learning_rate": 0.000284017874834426, |
|
"loss": 114.7597, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.0356568032685403, |
|
"grad_norm": 108558.9140625, |
|
"learning_rate": 0.0002832801612595937, |
|
"loss": 131.0451, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.03631711444017993, |
|
"grad_norm": 35595.47265625, |
|
"learning_rate": 0.0002825268066669034, |
|
"loss": 135.1516, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.03697742561181957, |
|
"grad_norm": 54421.08203125, |
|
"learning_rate": 0.00028175789946585693, |
|
"loss": 116.2731, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.037637736783459205, |
|
"grad_norm": 72844.515625, |
|
"learning_rate": 0.0002809735298911234, |
|
"loss": 101.419, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.03829804795509884, |
|
"grad_norm": 58473.41015625, |
|
"learning_rate": 0.00028017378999195015, |
|
"loss": 101.8432, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.03895835912673848, |
|
"grad_norm": 41094.68359375, |
|
"learning_rate": 0.0002793587736213603, |
|
"loss": 114.5148, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.03961867029837811, |
|
"grad_norm": 75345.5234375, |
|
"learning_rate": 0.00027852857642513836, |
|
"loss": 119.3659, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03961867029837811, |
|
"eval_loss": 16.175268173217773, |
|
"eval_runtime": 6.5722, |
|
"eval_samples_per_second": 75.317, |
|
"eval_steps_per_second": 75.317, |
|
"step": 60 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 300, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 40390914736128.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|