{ "best_metric": 1.0833667516708374, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.32663726931242854, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006532745386248571, "eval_loss": 1.8997453451156616, "eval_runtime": 135.0082, "eval_samples_per_second": 19.095, "eval_steps_per_second": 4.777, "step": 1 }, { "epoch": 0.006532745386248571, "grad_norm": 0.5372188687324524, "learning_rate": 0.0002, "loss": 1.0612, "step": 10 }, { "epoch": 0.013065490772497142, "grad_norm": 0.3328971266746521, "learning_rate": 0.0001998582695676762, "loss": 1.022, "step": 20 }, { "epoch": 0.019598236158745713, "grad_norm": 0.5935003161430359, "learning_rate": 0.00019943348002101371, "loss": 1.0128, "step": 30 }, { "epoch": 0.026130981544994283, "grad_norm": 2.405089855194092, "learning_rate": 0.00019872683547213446, "loss": 1.2233, "step": 40 }, { "epoch": 0.03266372693124286, "grad_norm": 3.8762905597686768, "learning_rate": 0.00019774033898178667, "loss": 1.6253, "step": 50 }, { "epoch": 0.03266372693124286, "eval_loss": 1.367977261543274, "eval_runtime": 137.0715, "eval_samples_per_second": 18.808, "eval_steps_per_second": 4.706, "step": 50 }, { "epoch": 0.039196472317491425, "grad_norm": 0.32745811343193054, "learning_rate": 0.0001964767868814516, "loss": 0.9681, "step": 60 }, { "epoch": 0.04572921770374, "grad_norm": 0.3707106113433838, "learning_rate": 0.00019493976084683813, "loss": 0.9395, "step": 70 }, { "epoch": 0.05226196308998857, "grad_norm": 0.5433753728866577, "learning_rate": 0.00019313361774523385, "loss": 1.0418, "step": 80 }, { "epoch": 0.05879470847623714, "grad_norm": 0.9471728205680847, "learning_rate": 0.00019106347728549135, "loss": 1.1623, "step": 90 }, { "epoch": 0.06532745386248572, "grad_norm": 2.322746753692627, "learning_rate": 0.00018873520750565718, "loss": 1.533, "step": 100 }, { "epoch": 0.06532745386248572, "eval_loss": 1.3632649183273315, "eval_runtime": 136.6492, "eval_samples_per_second": 18.866, "eval_steps_per_second": 4.72, "step": 100 }, { "epoch": 0.07186019924873428, "grad_norm": 0.3458673059940338, "learning_rate": 0.0001861554081393806, "loss": 0.9667, "step": 110 }, { "epoch": 0.07839294463498285, "grad_norm": 0.34533071517944336, "learning_rate": 0.0001833313919082515, "loss": 0.9634, "step": 120 }, { "epoch": 0.08492569002123142, "grad_norm": 0.5500729084014893, "learning_rate": 0.00018027116379309638, "loss": 1.0725, "step": 130 }, { "epoch": 0.09145843540748, "grad_norm": 0.853624701499939, "learning_rate": 0.00017698339834299061, "loss": 1.152, "step": 140 }, { "epoch": 0.09799118079372857, "grad_norm": 2.3200571537017822, "learning_rate": 0.00017347741508630672, "loss": 1.5809, "step": 150 }, { "epoch": 0.09799118079372857, "eval_loss": 1.2978923320770264, "eval_runtime": 136.7406, "eval_samples_per_second": 18.853, "eval_steps_per_second": 4.717, "step": 150 }, { "epoch": 0.10452392617997713, "grad_norm": 0.3230119049549103, "learning_rate": 0.0001697631521134985, "loss": 0.9422, "step": 160 }, { "epoch": 0.1110566715662257, "grad_norm": 0.34993496537208557, "learning_rate": 0.00016585113790650388, "loss": 0.9937, "step": 170 }, { "epoch": 0.11758941695247428, "grad_norm": 0.6289933323860168, "learning_rate": 0.0001617524614946192, "loss": 1.042, "step": 180 }, { "epoch": 0.12412216233872285, "grad_norm": 1.0999337434768677, "learning_rate": 0.0001574787410214407, "loss": 1.1156, "step": 190 }, { "epoch": 0.13065490772497143, "grad_norm": 2.4530375003814697, "learning_rate": 0.00015304209081197425, "loss": 1.503, "step": 200 }, { "epoch": 0.13065490772497143, "eval_loss": 1.2213399410247803, "eval_runtime": 136.9165, "eval_samples_per_second": 18.829, "eval_steps_per_second": 4.711, "step": 200 }, { "epoch": 0.13718765311122, "grad_norm": 0.23753122985363007, "learning_rate": 0.00014845508703326504, "loss": 0.9201, "step": 210 }, { "epoch": 0.14372039849746857, "grad_norm": 0.32499635219573975, "learning_rate": 0.00014373073204588556, "loss": 0.9755, "step": 220 }, { "epoch": 0.15025314388371713, "grad_norm": 0.38442739844322205, "learning_rate": 0.00013888241754733208, "loss": 1.0136, "step": 230 }, { "epoch": 0.1567858892699657, "grad_norm": 0.8352793455123901, "learning_rate": 0.00013392388661180303, "loss": 1.1551, "step": 240 }, { "epoch": 0.16331863465621427, "grad_norm": 2.5753586292266846, "learning_rate": 0.0001288691947339621, "loss": 1.5156, "step": 250 }, { "epoch": 0.16331863465621427, "eval_loss": 1.155815839767456, "eval_runtime": 137.0554, "eval_samples_per_second": 18.81, "eval_steps_per_second": 4.706, "step": 250 }, { "epoch": 0.16985138004246284, "grad_norm": 0.2267913520336151, "learning_rate": 0.0001237326699871115, "loss": 0.8876, "step": 260 }, { "epoch": 0.1763841254287114, "grad_norm": 0.3093341886997223, "learning_rate": 0.00011852887240871145, "loss": 0.9403, "step": 270 }, { "epoch": 0.18291687081496, "grad_norm": 0.43315809965133667, "learning_rate": 0.00011327255272837221, "loss": 0.9982, "step": 280 }, { "epoch": 0.18944961620120856, "grad_norm": 0.8610518574714661, "learning_rate": 0.00010797861055530831, "loss": 1.1578, "step": 290 }, { "epoch": 0.19598236158745713, "grad_norm": 3.0702617168426514, "learning_rate": 0.00010266205214377748, "loss": 1.3948, "step": 300 }, { "epoch": 0.19598236158745713, "eval_loss": 1.1323291063308716, "eval_runtime": 136.8684, "eval_samples_per_second": 18.836, "eval_steps_per_second": 4.713, "step": 300 }, { "epoch": 0.2025151069737057, "grad_norm": 0.35798534750938416, "learning_rate": 9.733794785622253e-05, "loss": 0.8783, "step": 310 }, { "epoch": 0.20904785235995427, "grad_norm": 0.31939324736595154, "learning_rate": 9.202138944469168e-05, "loss": 0.9702, "step": 320 }, { "epoch": 0.21558059774620283, "grad_norm": 0.5139957666397095, "learning_rate": 8.672744727162781e-05, "loss": 1.0535, "step": 330 }, { "epoch": 0.2221133431324514, "grad_norm": 0.8899235725402832, "learning_rate": 8.147112759128859e-05, "loss": 1.1356, "step": 340 }, { "epoch": 0.2286460885187, "grad_norm": 2.6317663192749023, "learning_rate": 7.626733001288851e-05, "loss": 1.5253, "step": 350 }, { "epoch": 0.2286460885187, "eval_loss": 1.107937216758728, "eval_runtime": 136.976, "eval_samples_per_second": 18.821, "eval_steps_per_second": 4.709, "step": 350 }, { "epoch": 0.23517883390494856, "grad_norm": 0.21436120569705963, "learning_rate": 7.113080526603792e-05, "loss": 0.8752, "step": 360 }, { "epoch": 0.24171157929119713, "grad_norm": 0.33155834674835205, "learning_rate": 6.607611338819697e-05, "loss": 0.9239, "step": 370 }, { "epoch": 0.2482443246774457, "grad_norm": 0.44394493103027344, "learning_rate": 6.111758245266794e-05, "loss": 0.9837, "step": 380 }, { "epoch": 0.25477707006369427, "grad_norm": 0.7826946377754211, "learning_rate": 5.626926795411447e-05, "loss": 1.1915, "step": 390 }, { "epoch": 0.26130981544994286, "grad_norm": 2.0713536739349365, "learning_rate": 5.1544912966734994e-05, "loss": 1.431, "step": 400 }, { "epoch": 0.26130981544994286, "eval_loss": 1.0921169519424438, "eval_runtime": 136.7603, "eval_samples_per_second": 18.85, "eval_steps_per_second": 4.716, "step": 400 }, { "epoch": 0.2678425608361914, "grad_norm": 0.2013624608516693, "learning_rate": 4.695790918802576e-05, "loss": 0.898, "step": 410 }, { "epoch": 0.27437530622244, "grad_norm": 0.35071200132369995, "learning_rate": 4.252125897855932e-05, "loss": 0.9476, "step": 420 }, { "epoch": 0.28090805160868854, "grad_norm": 0.5039893388748169, "learning_rate": 3.824753850538082e-05, "loss": 1.0456, "step": 430 }, { "epoch": 0.28744079699493713, "grad_norm": 0.9260233640670776, "learning_rate": 3.414886209349615e-05, "loss": 1.1276, "step": 440 }, { "epoch": 0.29397354238118567, "grad_norm": 3.32053279876709, "learning_rate": 3.0236847886501542e-05, "loss": 1.501, "step": 450 }, { "epoch": 0.29397354238118567, "eval_loss": 1.085567831993103, "eval_runtime": 136.7334, "eval_samples_per_second": 18.854, "eval_steps_per_second": 4.717, "step": 450 }, { "epoch": 0.30050628776743427, "grad_norm": 0.19558390974998474, "learning_rate": 2.6522584913693294e-05, "loss": 0.89, "step": 460 }, { "epoch": 0.30703903315368286, "grad_norm": 0.3003573715686798, "learning_rate": 2.301660165700936e-05, "loss": 0.9225, "step": 470 }, { "epoch": 0.3135717785399314, "grad_norm": 0.584018886089325, "learning_rate": 1.9728836206903656e-05, "loss": 1.0287, "step": 480 }, { "epoch": 0.32010452392618, "grad_norm": 0.8156113624572754, "learning_rate": 1.6668608091748495e-05, "loss": 1.0906, "step": 490 }, { "epoch": 0.32663726931242854, "grad_norm": 2.664822816848755, "learning_rate": 1.3844591860619383e-05, "loss": 1.5452, "step": 500 }, { "epoch": 0.32663726931242854, "eval_loss": 1.0833667516708374, "eval_runtime": 136.7852, "eval_samples_per_second": 18.847, "eval_steps_per_second": 4.715, "step": 500 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.356406596473979e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }