{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03961867029837811, "eval_steps": 20, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006603111716396351, "grad_norm": 300608.875, "learning_rate": 2.9999999999999997e-05, "loss": 118.0917, "step": 1 }, { "epoch": 0.0006603111716396351, "eval_loss": 15.74350357055664, "eval_runtime": 6.9487, "eval_samples_per_second": 71.236, "eval_steps_per_second": 71.236, "step": 1 }, { "epoch": 0.0013206223432792703, "grad_norm": 277566.0, "learning_rate": 5.9999999999999995e-05, "loss": 120.2507, "step": 2 }, { "epoch": 0.0019809335149189055, "grad_norm": 418405.1875, "learning_rate": 8.999999999999999e-05, "loss": 143.5985, "step": 3 }, { "epoch": 0.0026412446865585405, "grad_norm": 222641.09375, "learning_rate": 0.00011999999999999999, "loss": 113.3078, "step": 4 }, { "epoch": 0.003301555858198176, "grad_norm": 80659.1640625, "learning_rate": 0.00015, "loss": 89.0113, "step": 5 }, { "epoch": 0.003961867029837811, "grad_norm": 174515.8125, "learning_rate": 0.00017999999999999998, "loss": 92.8388, "step": 6 }, { "epoch": 0.004622178201477446, "grad_norm": 68650.6640625, "learning_rate": 0.00020999999999999998, "loss": 88.8969, "step": 7 }, { "epoch": 0.005282489373117081, "grad_norm": 52668.09765625, "learning_rate": 0.00023999999999999998, "loss": 83.2267, "step": 8 }, { "epoch": 0.005942800544756717, "grad_norm": 61407.41796875, "learning_rate": 0.00027, "loss": 92.7579, "step": 9 }, { "epoch": 0.006603111716396352, "grad_norm": 37122.61328125, "learning_rate": 0.0003, "loss": 84.5321, "step": 10 }, { "epoch": 0.007263422888035987, "grad_norm": 58673.69921875, "learning_rate": 0.0002999911984174669, "loss": 93.7724, "step": 11 }, { "epoch": 0.007923734059675622, "grad_norm": 121362.359375, "learning_rate": 0.0002999647947027726, "loss": 108.6176, "step": 12 }, { "epoch": 0.008584045231315257, "grad_norm": 233485.53125, "learning_rate": 0.0002999207919545099, "loss": 105.0768, "step": 13 }, { "epoch": 0.009244356402954892, "grad_norm": 318686.03125, "learning_rate": 0.0002998591953365965, "loss": 99.2428, "step": 14 }, { "epoch": 0.009904667574594527, "grad_norm": 101194.0625, "learning_rate": 0.00029978001207766854, "loss": 95.0131, "step": 15 }, { "epoch": 0.010564978746234162, "grad_norm": 83839.953125, "learning_rate": 0.00029968325147023263, "loss": 86.1607, "step": 16 }, { "epoch": 0.011225289917873797, "grad_norm": 68342.7109375, "learning_rate": 0.000299568924869575, "loss": 85.744, "step": 17 }, { "epoch": 0.011885601089513434, "grad_norm": 232218.078125, "learning_rate": 0.00029943704569242917, "loss": 89.0831, "step": 18 }, { "epoch": 0.012545912261153069, "grad_norm": 142226.359375, "learning_rate": 0.0002992876294154013, "loss": 87.8575, "step": 19 }, { "epoch": 0.013206223432792704, "grad_norm": 99313.8125, "learning_rate": 0.00029912069357315393, "loss": 91.0717, "step": 20 }, { "epoch": 0.013206223432792704, "eval_loss": 15.472484588623047, "eval_runtime": 6.5506, "eval_samples_per_second": 75.565, "eval_steps_per_second": 75.565, "step": 20 }, { "epoch": 0.013866534604432339, "grad_norm": 361644.25, "learning_rate": 0.00029893625775634835, "loss": 97.0436, "step": 21 }, { "epoch": 0.014526845776071974, "grad_norm": 1564964.125, "learning_rate": 0.0002987343436093454, "loss": 279.3873, "step": 22 }, { "epoch": 0.015187156947711609, "grad_norm": 1539204.875, "learning_rate": 0.00029851497482766547, "loss": 668.731, "step": 23 }, 
{ "epoch": 0.015847468119351244, "grad_norm": 2246419.0, "learning_rate": 0.00029827817715520773, "loss": 758.2188, "step": 24 }, { "epoch": 0.01650777929099088, "grad_norm": 2776799.5, "learning_rate": 0.0002980239783812289, "loss": 546.4688, "step": 25 }, { "epoch": 0.017168090462630514, "grad_norm": 1617533.25, "learning_rate": 0.0002977524083370822, "loss": 422.7344, "step": 26 }, { "epoch": 0.01782840163427015, "grad_norm": 450934.75, "learning_rate": 0.00029746349889271645, "loss": 339.3945, "step": 27 }, { "epoch": 0.018488712805909784, "grad_norm": 2286499.5, "learning_rate": 0.0002971572839529358, "loss": 236.2812, "step": 28 }, { "epoch": 0.01914902397754942, "grad_norm": 3999992.25, "learning_rate": 0.00029683379945342125, "loss": 159.8123, "step": 29 }, { "epoch": 0.019809335149189054, "grad_norm": 308621.125, "learning_rate": 0.000296493083356513, "loss": 124.3508, "step": 30 }, { "epoch": 0.02046964632082869, "grad_norm": 497120.59375, "learning_rate": 0.00029613517564675565, "loss": 138.8941, "step": 31 }, { "epoch": 0.021129957492468324, "grad_norm": 99928.7890625, "learning_rate": 0.0002957601183262058, "loss": 151.8223, "step": 32 }, { "epoch": 0.02179026866410796, "grad_norm": 405539.0625, "learning_rate": 0.000295367955409503, "loss": 1203.2074, "step": 33 }, { "epoch": 0.022450579835747594, "grad_norm": 463994.21875, "learning_rate": 0.00029495873291870436, "loss": 1256.5367, "step": 34 }, { "epoch": 0.02311089100738723, "grad_norm": 953879.5625, "learning_rate": 0.0002945324988778834, "loss": 938.6675, "step": 35 }, { "epoch": 0.023771202179026868, "grad_norm": 178221.28125, "learning_rate": 0.00029408930330749477, "loss": 356.9532, "step": 36 }, { "epoch": 0.0244315133506665, "grad_norm": 146182.625, "learning_rate": 0.0002936291982185036, "loss": 238.3703, "step": 37 }, { "epoch": 0.025091824522306138, "grad_norm": 116065.7421875, "learning_rate": 0.00029315223760628217, "loss": 212.9555, "step": 38 }, { "epoch": 0.02575213569394577, "grad_norm": 129193.03125, "learning_rate": 0.00029265847744427303, "loss": 227.929, "step": 39 }, { "epoch": 0.026412446865585408, "grad_norm": 152996.75, "learning_rate": 0.00029214797567742035, "loss": 220.4361, "step": 40 }, { "epoch": 0.026412446865585408, "eval_loss": 16.9798526763916, "eval_runtime": 6.5479, "eval_samples_per_second": 75.597, "eval_steps_per_second": 75.597, "step": 40 }, { "epoch": 0.02707275803722504, "grad_norm": 166313.5, "learning_rate": 0.00029162079221537, "loss": 178.7949, "step": 41 }, { "epoch": 0.027733069208864678, "grad_norm": 135891.28125, "learning_rate": 0.0002910769889254386, "loss": 201.0785, "step": 42 }, { "epoch": 0.02839338038050431, "grad_norm": 64060.7890625, "learning_rate": 0.0002905166296253533, "loss": 163.5094, "step": 43 }, { "epoch": 0.029053691552143948, "grad_norm": 96362.4921875, "learning_rate": 0.0002899397800757626, "loss": 140.6384, "step": 44 }, { "epoch": 0.02971400272378358, "grad_norm": 166254.203125, "learning_rate": 0.0002893465079725187, "loss": 139.1684, "step": 45 }, { "epoch": 0.030374313895423218, "grad_norm": 161925.5625, "learning_rate": 0.0002887368829387333, "loss": 140.9152, "step": 46 }, { "epoch": 0.031034625067062855, "grad_norm": 637966.3125, "learning_rate": 0.0002881109765166071, "loss": 131.3419, "step": 47 }, { "epoch": 0.03169493623870249, "grad_norm": 367775.03125, "learning_rate": 0.00028746886215903387, "loss": 155.0525, "step": 48 }, { "epoch": 0.032355247410342125, "grad_norm": 307120.84375, "learning_rate": 0.00028681061522098047, "loss": 
148.0313, "step": 49 }, { "epoch": 0.03301555858198176, "grad_norm": 164428.203125, "learning_rate": 0.0002861363129506435, "loss": 139.0605, "step": 50 }, { "epoch": 0.03367586975362139, "grad_norm": 56655.140625, "learning_rate": 0.0002854460344803842, "loss": 105.2498, "step": 51 }, { "epoch": 0.03433618092526103, "grad_norm": 110095.71875, "learning_rate": 0.00028473986081744163, "loss": 107.1039, "step": 52 }, { "epoch": 0.034996492096900665, "grad_norm": 84727.8125, "learning_rate": 0.000284017874834426, "loss": 114.7597, "step": 53 }, { "epoch": 0.0356568032685403, "grad_norm": 108558.9140625, "learning_rate": 0.0002832801612595937, "loss": 131.0451, "step": 54 }, { "epoch": 0.03631711444017993, "grad_norm": 35595.47265625, "learning_rate": 0.0002825268066669034, "loss": 135.1516, "step": 55 }, { "epoch": 0.03697742561181957, "grad_norm": 54421.08203125, "learning_rate": 0.00028175789946585693, "loss": 116.2731, "step": 56 }, { "epoch": 0.037637736783459205, "grad_norm": 72844.515625, "learning_rate": 0.0002809735298911234, "loss": 101.419, "step": 57 }, { "epoch": 0.03829804795509884, "grad_norm": 58473.41015625, "learning_rate": 0.00028017378999195015, "loss": 101.8432, "step": 58 }, { "epoch": 0.03895835912673848, "grad_norm": 41094.68359375, "learning_rate": 0.0002793587736213603, "loss": 114.5148, "step": 59 }, { "epoch": 0.03961867029837811, "grad_norm": 75345.5234375, "learning_rate": 0.00027852857642513836, "loss": 119.3659, "step": 60 }, { "epoch": 0.03961867029837811, "eval_loss": 16.175268173217773, "eval_runtime": 6.5722, "eval_samples_per_second": 75.317, "eval_steps_per_second": 75.317, "step": 60 } ], "logging_steps": 1, "max_steps": 300, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 40390914736128.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }