{ "best_metric": 1.862091302871704, "best_model_checkpoint": "miner_id_24/checkpoint-550", "epoch": 0.15018089972011742, "eval_steps": 50, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002730561813093044, "eval_loss": 4.23640251159668, "eval_runtime": 242.769, "eval_samples_per_second": 25.407, "eval_steps_per_second": 6.352, "step": 1 }, { "epoch": 0.0027305618130930437, "grad_norm": 17.406291961669922, "learning_rate": 0.0002, "loss": 10.2808, "step": 10 }, { "epoch": 0.005461123626186087, "grad_norm": 11.717205047607422, "learning_rate": 0.0001998582695676762, "loss": 8.3282, "step": 20 }, { "epoch": 0.008191685439279131, "grad_norm": 12.451804161071777, "learning_rate": 0.00019943348002101371, "loss": 8.6122, "step": 30 }, { "epoch": 0.010922247252372175, "grad_norm": 14.363651275634766, "learning_rate": 0.00019872683547213446, "loss": 8.3687, "step": 40 }, { "epoch": 0.01365280906546522, "grad_norm": 34.83021545410156, "learning_rate": 0.00019774033898178667, "loss": 8.9732, "step": 50 }, { "epoch": 0.01365280906546522, "eval_loss": 2.319276809692383, "eval_runtime": 244.7603, "eval_samples_per_second": 25.2, "eval_steps_per_second": 6.3, "step": 50 }, { "epoch": 0.016383370878558262, "grad_norm": 10.075106620788574, "learning_rate": 0.0001964767868814516, "loss": 8.2567, "step": 60 }, { "epoch": 0.019113932691651306, "grad_norm": 12.682146072387695, "learning_rate": 0.00019493976084683813, "loss": 8.2734, "step": 70 }, { "epoch": 0.02184449450474435, "grad_norm": 13.281365394592285, "learning_rate": 0.00019313361774523385, "loss": 8.0909, "step": 80 }, { "epoch": 0.024575056317837397, "grad_norm": 18.51453971862793, "learning_rate": 0.00019106347728549135, "loss": 8.4039, "step": 90 }, { "epoch": 0.02730561813093044, "grad_norm": 36.62446212768555, "learning_rate": 0.00018873520750565718, "loss": 8.0267, "step": 100 }, { "epoch": 0.02730561813093044, "eval_loss": 2.1572701930999756, "eval_runtime": 245.0524, "eval_samples_per_second": 25.17, "eval_steps_per_second": 6.293, "step": 100 }, { "epoch": 0.030036179944023484, "grad_norm": 10.612593650817871, "learning_rate": 0.0001861554081393806, "loss": 7.8833, "step": 110 }, { "epoch": 0.032766741757116524, "grad_norm": 10.446393966674805, "learning_rate": 0.0001833313919082515, "loss": 8.3278, "step": 120 }, { "epoch": 0.03549730357020957, "grad_norm": 14.34956169128418, "learning_rate": 0.00018027116379309638, "loss": 8.2577, "step": 130 }, { "epoch": 0.03822786538330261, "grad_norm": 14.526724815368652, "learning_rate": 0.00017698339834299061, "loss": 7.8844, "step": 140 }, { "epoch": 0.040958427196395655, "grad_norm": 37.110965728759766, "learning_rate": 0.00017347741508630672, "loss": 8.4204, "step": 150 }, { "epoch": 0.040958427196395655, "eval_loss": 2.1126084327697754, "eval_runtime": 244.2124, "eval_samples_per_second": 25.257, "eval_steps_per_second": 6.314, "step": 150 }, { "epoch": 0.0436889890094887, "grad_norm": 12.598857879638672, "learning_rate": 0.0001697631521134985, "loss": 7.9538, "step": 160 }, { "epoch": 0.04641955082258175, "grad_norm": 11.983827590942383, "learning_rate": 0.00016585113790650388, "loss": 8.1731, "step": 170 }, { "epoch": 0.04915011263567479, "grad_norm": 13.680892944335938, "learning_rate": 0.0001617524614946192, "loss": 7.9786, "step": 180 }, { "epoch": 0.05188067444876784, "grad_norm": 16.39923858642578, "learning_rate": 0.0001574787410214407, "loss": 7.8267, "step": 190 }, { "epoch": 0.05461123626186088, "grad_norm": 35.790550231933594, "learning_rate": 0.00015304209081197425, "loss": 7.9481, "step": 200 }, { "epoch": 0.05461123626186088, "eval_loss": 2.1126017570495605, "eval_runtime": 244.847, "eval_samples_per_second": 25.191, "eval_steps_per_second": 6.298, "step": 200 }, { "epoch": 0.057341798074953924, "grad_norm": 11.671783447265625, "learning_rate": 0.00014845508703326504, "loss": 8.1613, "step": 210 }, { "epoch": 0.06007235988804697, "grad_norm": 11.406132698059082, "learning_rate": 0.00014373073204588556, "loss": 7.8322, "step": 220 }, { "epoch": 0.06280292170114, "grad_norm": 11.966191291809082, "learning_rate": 0.00013888241754733208, "loss": 7.5859, "step": 230 }, { "epoch": 0.06553348351423305, "grad_norm": 14.312826156616211, "learning_rate": 0.00013392388661180303, "loss": 8.059, "step": 240 }, { "epoch": 0.06826404532732609, "grad_norm": 26.518217086791992, "learning_rate": 0.0001288691947339621, "loss": 7.8758, "step": 250 }, { "epoch": 0.06826404532732609, "eval_loss": 2.0528900623321533, "eval_runtime": 245.0717, "eval_samples_per_second": 25.168, "eval_steps_per_second": 6.292, "step": 250 }, { "epoch": 0.07099460714041914, "grad_norm": 11.75592041015625, "learning_rate": 0.0001237326699871115, "loss": 7.8156, "step": 260 }, { "epoch": 0.07372516895351218, "grad_norm": 10.208357810974121, "learning_rate": 0.00011852887240871145, "loss": 7.9382, "step": 270 }, { "epoch": 0.07645573076660522, "grad_norm": 12.978534698486328, "learning_rate": 0.00011327255272837221, "loss": 7.6911, "step": 280 }, { "epoch": 0.07918629257969827, "grad_norm": 15.648476600646973, "learning_rate": 0.00010797861055530831, "loss": 7.5699, "step": 290 }, { "epoch": 0.08191685439279131, "grad_norm": 31.975311279296875, "learning_rate": 0.00010266205214377748, "loss": 8.2882, "step": 300 }, { "epoch": 0.08191685439279131, "eval_loss": 2.002148151397705, "eval_runtime": 245.0084, "eval_samples_per_second": 25.175, "eval_steps_per_second": 6.294, "step": 300 }, { "epoch": 0.08464741620588435, "grad_norm": 10.377389907836914, "learning_rate": 9.733794785622253e-05, "loss": 7.7774, "step": 310 }, { "epoch": 0.0873779780189774, "grad_norm": 10.131440162658691, "learning_rate": 9.202138944469168e-05, "loss": 7.7762, "step": 320 }, { "epoch": 0.09010853983207044, "grad_norm": 12.15221118927002, "learning_rate": 8.672744727162781e-05, "loss": 7.8511, "step": 330 }, { "epoch": 0.0928391016451635, "grad_norm": 13.989408493041992, "learning_rate": 8.147112759128859e-05, "loss": 7.7617, "step": 340 }, { "epoch": 0.09556966345825654, "grad_norm": 24.143465042114258, "learning_rate": 7.626733001288851e-05, "loss": 8.1177, "step": 350 }, { "epoch": 0.09556966345825654, "eval_loss": 1.9585822820663452, "eval_runtime": 245.0532, "eval_samples_per_second": 25.17, "eval_steps_per_second": 6.293, "step": 350 }, { "epoch": 0.09830022527134959, "grad_norm": 10.882676124572754, "learning_rate": 7.113080526603792e-05, "loss": 7.953, "step": 360 }, { "epoch": 0.10103078708444263, "grad_norm": 10.572400093078613, "learning_rate": 6.607611338819697e-05, "loss": 7.8085, "step": 370 }, { "epoch": 0.10376134889753567, "grad_norm": 12.119694709777832, "learning_rate": 6.111758245266794e-05, "loss": 7.6887, "step": 380 }, { "epoch": 0.10649191071062872, "grad_norm": 15.668352127075195, "learning_rate": 5.626926795411447e-05, "loss": 7.7462, "step": 390 }, { "epoch": 0.10922247252372176, "grad_norm": 22.603986740112305, "learning_rate": 5.1544912966734994e-05, "loss": 7.8605, "step": 400 }, { "epoch": 0.10922247252372176, "eval_loss": 1.8981908559799194, "eval_runtime": 245.2102, "eval_samples_per_second": 25.154, "eval_steps_per_second": 6.288, "step": 400 }, { "epoch": 0.1119530343368148, "grad_norm": 11.885726928710938, "learning_rate": 4.695790918802576e-05, "loss": 7.2024, "step": 410 }, { "epoch": 0.11468359614990785, "grad_norm": 11.538261413574219, "learning_rate": 4.252125897855932e-05, "loss": 7.3657, "step": 420 }, { "epoch": 0.11741415796300089, "grad_norm": 12.95737075805664, "learning_rate": 3.824753850538082e-05, "loss": 7.6156, "step": 430 }, { "epoch": 0.12014471977609394, "grad_norm": 13.987143516540527, "learning_rate": 3.414886209349615e-05, "loss": 7.5926, "step": 440 }, { "epoch": 0.12287528158918698, "grad_norm": 17.603551864624023, "learning_rate": 3.0236847886501542e-05, "loss": 7.9609, "step": 450 }, { "epoch": 0.12287528158918698, "eval_loss": 1.8793390989303589, "eval_runtime": 244.4883, "eval_samples_per_second": 25.228, "eval_steps_per_second": 6.307, "step": 450 }, { "epoch": 0.12560584340228, "grad_norm": 11.999287605285645, "learning_rate": 2.6522584913693294e-05, "loss": 7.4036, "step": 460 }, { "epoch": 0.12833640521537307, "grad_norm": 12.017765998840332, "learning_rate": 2.301660165700936e-05, "loss": 7.5689, "step": 470 }, { "epoch": 0.1310669670284661, "grad_norm": 12.495355606079102, "learning_rate": 1.9728836206903656e-05, "loss": 7.765, "step": 480 }, { "epoch": 0.13379752884155915, "grad_norm": 13.036554336547852, "learning_rate": 1.6668608091748495e-05, "loss": 7.1911, "step": 490 }, { "epoch": 0.13652809065465218, "grad_norm": 23.29970359802246, "learning_rate": 1.3844591860619383e-05, "loss": 7.5728, "step": 500 }, { "epoch": 0.13652809065465218, "eval_loss": 1.8650081157684326, "eval_runtime": 245.1119, "eval_samples_per_second": 25.164, "eval_steps_per_second": 6.291, "step": 500 }, { "epoch": 0.13925865246774524, "grad_norm": 10.133707046508789, "learning_rate": 1.1264792494342857e-05, "loss": 7.0925, "step": 510 }, { "epoch": 0.14198921428083827, "grad_norm": 10.4485445022583, "learning_rate": 8.936522714508678e-06, "loss": 7.8088, "step": 520 }, { "epoch": 0.14471977609393133, "grad_norm": 13.265645980834961, "learning_rate": 6.866382254766157e-06, "loss": 7.6878, "step": 530 }, { "epoch": 0.14745033790702436, "grad_norm": 12.639242172241211, "learning_rate": 5.060239153161872e-06, "loss": 7.4659, "step": 540 }, { "epoch": 0.15018089972011742, "grad_norm": 26.739566802978516, "learning_rate": 3.5232131185484076e-06, "loss": 7.7885, "step": 550 }, { "epoch": 0.15018089972011742, "eval_loss": 1.862091302871704, "eval_runtime": 245.1426, "eval_samples_per_second": 25.161, "eval_steps_per_second": 6.29, "step": 550 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.867212242747392e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }