{ "best_metric": 0.7403023838996887, "best_model_checkpoint": "miner_id_24/checkpoint-550", "epoch": 0.6472491909385113, "eval_steps": 50, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011768167107972932, "eval_loss": 1.4336028099060059, "eval_runtime": 63.7044, "eval_samples_per_second": 22.479, "eval_steps_per_second": 5.62, "step": 1 }, { "epoch": 0.011768167107972934, "grad_norm": 1.2304390668869019, "learning_rate": 0.0002, "loss": 7.4146, "step": 10 }, { "epoch": 0.023536334215945868, "grad_norm": 0.6472563743591309, "learning_rate": 0.0001998582695676762, "loss": 6.0483, "step": 20 }, { "epoch": 0.0353045013239188, "grad_norm": 0.7562745213508606, "learning_rate": 0.00019943348002101371, "loss": 3.9027, "step": 30 }, { "epoch": 0.047072668431891736, "grad_norm": 0.8033158779144287, "learning_rate": 0.00019872683547213446, "loss": 1.8585, "step": 40 }, { "epoch": 0.05884083553986467, "grad_norm": 1.7948169708251953, "learning_rate": 0.00019774033898178667, "loss": 1.7989, "step": 50 }, { "epoch": 0.05884083553986467, "eval_loss": 0.9343328475952148, "eval_runtime": 64.9646, "eval_samples_per_second": 22.043, "eval_steps_per_second": 5.511, "step": 50 }, { "epoch": 0.0706090026478376, "grad_norm": 0.8677065968513489, "learning_rate": 0.0001964767868814516, "loss": 6.2807, "step": 60 }, { "epoch": 0.08237716975581054, "grad_norm": 0.7287414073944092, "learning_rate": 0.00019493976084683813, "loss": 4.9082, "step": 70 }, { "epoch": 0.09414533686378347, "grad_norm": 1.209147572517395, "learning_rate": 0.00019313361774523385, "loss": 3.0766, "step": 80 }, { "epoch": 0.1059135039717564, "grad_norm": 0.9230589866638184, "learning_rate": 0.00019106347728549135, "loss": 1.5117, "step": 90 }, { "epoch": 0.11768167107972934, "grad_norm": 2.2932982444763184, "learning_rate": 0.00018873520750565718, "loss": 1.5895, "step": 100 }, { "epoch": 0.11768167107972934, "eval_loss": 0.8446639180183411, "eval_runtime": 64.7346, "eval_samples_per_second": 22.121, "eval_steps_per_second": 5.53, "step": 100 }, { "epoch": 0.12944983818770225, "grad_norm": 0.867832362651825, "learning_rate": 0.0001861554081393806, "loss": 6.0238, "step": 110 }, { "epoch": 0.1412180052956752, "grad_norm": 0.8156207203865051, "learning_rate": 0.0001833313919082515, "loss": 4.7902, "step": 120 }, { "epoch": 0.15298617240364812, "grad_norm": 0.802191436290741, "learning_rate": 0.00018027116379309638, "loss": 2.8416, "step": 130 }, { "epoch": 0.16475433951162108, "grad_norm": 1.274532675743103, "learning_rate": 0.00017698339834299061, "loss": 1.3511, "step": 140 }, { "epoch": 0.176522506619594, "grad_norm": 1.8214877843856812, "learning_rate": 0.00017347741508630672, "loss": 1.4454, "step": 150 }, { "epoch": 0.176522506619594, "eval_loss": 0.8176446557044983, "eval_runtime": 64.9157, "eval_samples_per_second": 22.059, "eval_steps_per_second": 5.515, "step": 150 }, { "epoch": 0.18829067372756694, "grad_norm": 0.7749770283699036, "learning_rate": 0.0001697631521134985, "loss": 5.9457, "step": 160 }, { "epoch": 0.20005884083553988, "grad_norm": 0.843367338180542, "learning_rate": 0.00016585113790650388, "loss": 4.737, "step": 170 }, { "epoch": 0.2118270079435128, "grad_norm": 0.7175513505935669, "learning_rate": 0.0001617524614946192, "loss": 2.8268, "step": 180 }, { "epoch": 0.22359517505148574, "grad_norm": 0.8358870148658752, "learning_rate": 0.0001574787410214407, "loss": 1.3233, "step": 190 }, { "epoch": 0.23536334215945867, "grad_norm": 2.047827959060669, "learning_rate": 0.00015304209081197425, "loss": 1.4239, "step": 200 }, { "epoch": 0.23536334215945867, "eval_loss": 0.7979318499565125, "eval_runtime": 64.8822, "eval_samples_per_second": 22.071, "eval_steps_per_second": 5.518, "step": 200 }, { "epoch": 0.2471315092674316, "grad_norm": 0.7978292107582092, "learning_rate": 0.00014845508703326504, "loss": 5.7924, "step": 210 }, { "epoch": 0.2588996763754045, "grad_norm": 0.9069154858589172, "learning_rate": 0.00014373073204588556, "loss": 4.7768, "step": 220 }, { "epoch": 0.27066784348337747, "grad_norm": 0.7871260643005371, "learning_rate": 0.00013888241754733208, "loss": 2.6733, "step": 230 }, { "epoch": 0.2824360105913504, "grad_norm": 0.7734578847885132, "learning_rate": 0.00013392388661180303, "loss": 1.2857, "step": 240 }, { "epoch": 0.29420417769932333, "grad_norm": 1.4109017848968506, "learning_rate": 0.0001288691947339621, "loss": 1.3427, "step": 250 }, { "epoch": 0.29420417769932333, "eval_loss": 0.7788119912147522, "eval_runtime": 65.0714, "eval_samples_per_second": 22.007, "eval_steps_per_second": 5.502, "step": 250 }, { "epoch": 0.30597234480729624, "grad_norm": 0.7562018632888794, "learning_rate": 0.0001237326699871115, "loss": 5.6205, "step": 260 }, { "epoch": 0.3177405119152692, "grad_norm": 0.7738041281700134, "learning_rate": 0.00011852887240871145, "loss": 4.6291, "step": 270 }, { "epoch": 0.32950867902324216, "grad_norm": 1.0495797395706177, "learning_rate": 0.00011327255272837221, "loss": 2.5666, "step": 280 }, { "epoch": 0.34127684613121506, "grad_norm": 0.8168486952781677, "learning_rate": 0.00010797861055530831, "loss": 1.2404, "step": 290 }, { "epoch": 0.353045013239188, "grad_norm": 1.7189139127731323, "learning_rate": 0.00010266205214377748, "loss": 1.3585, "step": 300 }, { "epoch": 0.353045013239188, "eval_loss": 0.7667000889778137, "eval_runtime": 64.8942, "eval_samples_per_second": 22.067, "eval_steps_per_second": 5.517, "step": 300 }, { "epoch": 0.3648131803471609, "grad_norm": 0.7242693305015564, "learning_rate": 9.733794785622253e-05, "loss": 5.6994, "step": 310 }, { "epoch": 0.3765813474551339, "grad_norm": 0.810172438621521, "learning_rate": 9.202138944469168e-05, "loss": 4.7286, "step": 320 }, { "epoch": 0.3883495145631068, "grad_norm": 0.7765442728996277, "learning_rate": 8.672744727162781e-05, "loss": 2.8564, "step": 330 }, { "epoch": 0.40011768167107975, "grad_norm": 0.7657132744789124, "learning_rate": 8.147112759128859e-05, "loss": 1.3036, "step": 340 }, { "epoch": 0.41188584877905265, "grad_norm": 1.4623005390167236, "learning_rate": 7.626733001288851e-05, "loss": 1.3171, "step": 350 }, { "epoch": 0.41188584877905265, "eval_loss": 0.7572553157806396, "eval_runtime": 64.9777, "eval_samples_per_second": 22.038, "eval_steps_per_second": 5.51, "step": 350 }, { "epoch": 0.4236540158870256, "grad_norm": 0.696776270866394, "learning_rate": 7.113080526603792e-05, "loss": 5.7138, "step": 360 }, { "epoch": 0.4354221829949985, "grad_norm": 0.9223313331604004, "learning_rate": 6.607611338819697e-05, "loss": 4.5998, "step": 370 }, { "epoch": 0.4471903501029715, "grad_norm": 0.7238697409629822, "learning_rate": 6.111758245266794e-05, "loss": 2.5621, "step": 380 }, { "epoch": 0.4589585172109444, "grad_norm": 0.9471986889839172, "learning_rate": 5.626926795411447e-05, "loss": 1.2488, "step": 390 }, { "epoch": 0.47072668431891734, "grad_norm": 1.4697206020355225, "learning_rate": 5.1544912966734994e-05, "loss": 1.3135, "step": 400 }, { "epoch": 0.47072668431891734, "eval_loss": 0.7500874400138855, "eval_runtime": 64.8413, "eval_samples_per_second": 22.085, "eval_steps_per_second": 5.521, "step": 400 }, { "epoch": 0.48249485142689025, "grad_norm": 0.8447908759117126, "learning_rate": 4.695790918802576e-05, "loss": 5.6785, "step": 410 }, { "epoch": 0.4942630185348632, "grad_norm": 0.8665351867675781, "learning_rate": 4.252125897855932e-05, "loss": 4.643, "step": 420 }, { "epoch": 0.5060311856428361, "grad_norm": 0.7426034808158875, "learning_rate": 3.824753850538082e-05, "loss": 2.7279, "step": 430 }, { "epoch": 0.517799352750809, "grad_norm": 0.9415715932846069, "learning_rate": 3.414886209349615e-05, "loss": 1.2065, "step": 440 }, { "epoch": 0.529567519858782, "grad_norm": 1.705180048942566, "learning_rate": 3.0236847886501542e-05, "loss": 1.3062, "step": 450 }, { "epoch": 0.529567519858782, "eval_loss": 0.7452362179756165, "eval_runtime": 64.9352, "eval_samples_per_second": 22.053, "eval_steps_per_second": 5.513, "step": 450 }, { "epoch": 0.5413356869667549, "grad_norm": 0.7531328201293945, "learning_rate": 2.6522584913693294e-05, "loss": 5.4884, "step": 460 }, { "epoch": 0.5531038540747278, "grad_norm": 0.7240074872970581, "learning_rate": 2.301660165700936e-05, "loss": 4.6025, "step": 470 }, { "epoch": 0.5648720211827007, "grad_norm": 1.1110539436340332, "learning_rate": 1.9728836206903656e-05, "loss": 2.8274, "step": 480 }, { "epoch": 0.5766401882906738, "grad_norm": 1.0983327627182007, "learning_rate": 1.6668608091748495e-05, "loss": 1.2374, "step": 490 }, { "epoch": 0.5884083553986467, "grad_norm": 1.2942715883255005, "learning_rate": 1.3844591860619383e-05, "loss": 1.221, "step": 500 }, { "epoch": 0.5884083553986467, "eval_loss": 0.7421861290931702, "eval_runtime": 64.9523, "eval_samples_per_second": 22.047, "eval_steps_per_second": 5.512, "step": 500 }, { "epoch": 0.6001765225066196, "grad_norm": 0.7555685043334961, "learning_rate": 1.1264792494342857e-05, "loss": 5.5477, "step": 510 }, { "epoch": 0.6119446896145925, "grad_norm": 0.7698599696159363, "learning_rate": 8.936522714508678e-06, "loss": 4.6595, "step": 520 }, { "epoch": 0.6237128567225655, "grad_norm": 0.8012061715126038, "learning_rate": 6.866382254766157e-06, "loss": 2.9295, "step": 530 }, { "epoch": 0.6354810238305384, "grad_norm": 0.8370329737663269, "learning_rate": 5.060239153161872e-06, "loss": 1.2254, "step": 540 }, { "epoch": 0.6472491909385113, "grad_norm": 1.4551666975021362, "learning_rate": 3.5232131185484076e-06, "loss": 1.2469, "step": 550 }, { "epoch": 0.6472491909385113, "eval_loss": 0.7403023838996887, "eval_runtime": 65.1388, "eval_samples_per_second": 21.984, "eval_steps_per_second": 5.496, "step": 550 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.706288114835128e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }