{ "best_metric": 0.401653915643692, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.1199184554502938, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002398369109005876, "eval_loss": 2.359610080718994, "eval_runtime": 175.1852, "eval_samples_per_second": 10.024, "eval_steps_per_second": 2.506, "step": 1 }, { "epoch": 0.002398369109005876, "grad_norm": 3.903085231781006, "learning_rate": 4.12e-05, "loss": 3.5638, "step": 10 }, { "epoch": 0.004796738218011752, "grad_norm": 4.722385883331299, "learning_rate": 8.24e-05, "loss": 1.9339, "step": 20 }, { "epoch": 0.007195107327017628, "grad_norm": 5.696664810180664, "learning_rate": 0.0001236, "loss": 1.4953, "step": 30 }, { "epoch": 0.009593476436023504, "grad_norm": 3.0536234378814697, "learning_rate": 0.0001648, "loss": 1.4307, "step": 40 }, { "epoch": 0.01199184554502938, "grad_norm": 3.5544991493225098, "learning_rate": 0.000206, "loss": 2.5797, "step": 50 }, { "epoch": 0.01199184554502938, "eval_loss": 0.8046712875366211, "eval_runtime": 175.2533, "eval_samples_per_second": 10.02, "eval_steps_per_second": 2.505, "step": 50 }, { "epoch": 0.014390214654035257, "grad_norm": 3.480740547180176, "learning_rate": 0.0002057490971767619, "loss": 1.2293, "step": 60 }, { "epoch": 0.016788583763041133, "grad_norm": 1.953612208366394, "learning_rate": 0.00020499761108038175, "loss": 0.7409, "step": 70 }, { "epoch": 0.019186952872047008, "grad_norm": 3.514718532562256, "learning_rate": 0.00020374920287558198, "loss": 0.9618, "step": 80 }, { "epoch": 0.021585321981052882, "grad_norm": 1.9172500371932983, "learning_rate": 0.00020200995468164684, "loss": 1.3345, "step": 90 }, { "epoch": 0.02398369109005876, "grad_norm": 4.029332160949707, "learning_rate": 0.00019978833994094855, "loss": 2.4339, "step": 100 }, { "epoch": 0.02398369109005876, "eval_loss": 0.678347110748291, "eval_runtime": 175.235, "eval_samples_per_second": 10.021, "eval_steps_per_second": 2.505, "step": 100 }, { "epoch": 0.026382060199064635, "grad_norm": 1.8706626892089844, "learning_rate": 0.00019709518213718787, "loss": 1.0316, "step": 110 }, { "epoch": 0.028780429308070513, "grad_norm": 2.0400331020355225, "learning_rate": 0.00019394360206446948, "loss": 0.723, "step": 120 }, { "epoch": 0.031178798417076388, "grad_norm": 2.933223009109497, "learning_rate": 0.00019034895390411186, "loss": 0.731, "step": 130 }, { "epoch": 0.033577167526082266, "grad_norm": 2.936216354370117, "learning_rate": 0.0001863287504206196, "loss": 1.3271, "step": 140 }, { "epoch": 0.03597553663508814, "grad_norm": 3.1987624168395996, "learning_rate": 0.00018190257764125471, "loss": 1.8452, "step": 150 }, { "epoch": 0.03597553663508814, "eval_loss": 0.6230275630950928, "eval_runtime": 175.1317, "eval_samples_per_second": 10.027, "eval_steps_per_second": 2.507, "step": 150 }, { "epoch": 0.038373905744094015, "grad_norm": 1.3045629262924194, "learning_rate": 0.00017709199943488106, "loss": 0.8008, "step": 160 }, { "epoch": 0.040772274853099894, "grad_norm": 0.9459063410758972, "learning_rate": 0.00017192045245496238, "loss": 0.6722, "step": 170 }, { "epoch": 0.043170643962105765, "grad_norm": 2.614128589630127, "learning_rate": 0.00016641313195854277, "loss": 0.7101, "step": 180 }, { "epoch": 0.04556901307111164, "grad_norm": 1.9415876865386963, "learning_rate": 0.0001605968690574869, "loss": 1.0997, "step": 190 }, { "epoch": 0.04796738218011752, "grad_norm": 3.9025213718414307, "learning_rate": 0.0001545, "loss": 1.6012, "step": 200 }, { "epoch": 0.04796738218011752, "eval_loss": 0.5664369463920593, "eval_runtime": 175.1329, "eval_samples_per_second": 10.027, "eval_steps_per_second": 2.507, "step": 200 }, { "epoch": 0.0503657512891234, "grad_norm": 1.9600988626480103, "learning_rate": 0.00014815222811927496, "loss": 0.7902, "step": 210 }, { "epoch": 0.05276412039812927, "grad_norm": 2.0275771617889404, "learning_rate": 0.00014158447912183896, "loss": 0.5071, "step": 220 }, { "epoch": 0.05516248950713515, "grad_norm": 2.4012374877929688, "learning_rate": 0.00013482875042061958, "loss": 0.7438, "step": 230 }, { "epoch": 0.05756085861614103, "grad_norm": 2.113513231277466, "learning_rate": 0.00012791795524676576, "loss": 1.2536, "step": 240 }, { "epoch": 0.0599592277251469, "grad_norm": 2.7452316284179688, "learning_rate": 0.00012088576229969385, "loss": 1.489, "step": 250 }, { "epoch": 0.0599592277251469, "eval_loss": 0.5060864686965942, "eval_runtime": 175.4272, "eval_samples_per_second": 10.01, "eval_steps_per_second": 2.502, "step": 250 }, { "epoch": 0.062357596834152776, "grad_norm": 1.0596331357955933, "learning_rate": 0.0001137664317165683, "loss": 0.5781, "step": 260 }, { "epoch": 0.06475596594315865, "grad_norm": 2.4672203063964844, "learning_rate": 0.00010659464816035761, "loss": 0.3945, "step": 270 }, { "epoch": 0.06715433505216453, "grad_norm": 1.586638331413269, "learning_rate": 9.940535183964242e-05, "loss": 0.6164, "step": 280 }, { "epoch": 0.0695527041611704, "grad_norm": 1.856692910194397, "learning_rate": 9.22335682834317e-05, "loss": 1.3025, "step": 290 }, { "epoch": 0.07195107327017627, "grad_norm": 3.173532247543335, "learning_rate": 8.511423770030617e-05, "loss": 1.3479, "step": 300 }, { "epoch": 0.07195107327017627, "eval_loss": 0.4862947463989258, "eval_runtime": 175.2171, "eval_samples_per_second": 10.022, "eval_steps_per_second": 2.505, "step": 300 }, { "epoch": 0.07434944237918216, "grad_norm": 0.8681183457374573, "learning_rate": 7.808204475323423e-05, "loss": 0.5307, "step": 310 }, { "epoch": 0.07674781148818803, "grad_norm": 0.9014914631843567, "learning_rate": 7.117124957938042e-05, "loss": 0.5389, "step": 320 }, { "epoch": 0.0791461805971939, "grad_norm": 0.8682736158370972, "learning_rate": 6.441552087816105e-05, "loss": 0.753, "step": 330 }, { "epoch": 0.08154454970619979, "grad_norm": 2.1298677921295166, "learning_rate": 5.784777188072502e-05, "loss": 0.679, "step": 340 }, { "epoch": 0.08394291881520566, "grad_norm": 1.1377344131469727, "learning_rate": 5.150000000000002e-05, "loss": 1.4152, "step": 350 }, { "epoch": 0.08394291881520566, "eval_loss": 0.4434095621109009, "eval_runtime": 175.2648, "eval_samples_per_second": 10.019, "eval_steps_per_second": 2.505, "step": 350 }, { "epoch": 0.08634128792421153, "grad_norm": 0.7453836798667908, "learning_rate": 4.540313094251309e-05, "loss": 0.4925, "step": 360 }, { "epoch": 0.08873965703321741, "grad_norm": 3.931169033050537, "learning_rate": 3.958686804145719e-05, "loss": 0.476, "step": 370 }, { "epoch": 0.09113802614222329, "grad_norm": 1.9024658203125, "learning_rate": 3.4079547545037634e-05, "loss": 0.6328, "step": 380 }, { "epoch": 0.09353639525122917, "grad_norm": 1.585979700088501, "learning_rate": 2.8908000565118947e-05, "loss": 1.3884, "step": 390 }, { "epoch": 0.09593476436023504, "grad_norm": 3.1228091716766357, "learning_rate": 2.4097422358745275e-05, "loss": 1.8286, "step": 400 }, { "epoch": 0.09593476436023504, "eval_loss": 0.4184797704219818, "eval_runtime": 175.4625, "eval_samples_per_second": 10.008, "eval_steps_per_second": 2.502, "step": 400 }, { "epoch": 0.09833313346924091, "grad_norm": 0.6214273571968079, "learning_rate": 1.9671249579380422e-05, "loss": 0.6116, "step": 410 }, { "epoch": 0.1007315025782468, "grad_norm": 1.4425666332244873, "learning_rate": 1.5651046095888127e-05, "loss": 0.3289, "step": 420 }, { "epoch": 0.10312987168725267, "grad_norm": 1.5458945035934448, "learning_rate": 1.205639793553052e-05, "loss": 0.5484, "step": 430 }, { "epoch": 0.10552824079625854, "grad_norm": 2.1187777519226074, "learning_rate": 8.904817862812098e-06, "loss": 0.9235, "step": 440 }, { "epoch": 0.10792660990526443, "grad_norm": 6.571332931518555, "learning_rate": 6.211660059051443e-06, "loss": 1.2354, "step": 450 }, { "epoch": 0.10792660990526443, "eval_loss": 0.4052506685256958, "eval_runtime": 175.1413, "eval_samples_per_second": 10.026, "eval_steps_per_second": 2.507, "step": 450 }, { "epoch": 0.1103249790142703, "grad_norm": 0.9613426923751831, "learning_rate": 3.990045318353154e-06, "loss": 0.4621, "step": 460 }, { "epoch": 0.11272334812327617, "grad_norm": 1.016109585762024, "learning_rate": 2.250797124418014e-06, "loss": 0.4734, "step": 470 }, { "epoch": 0.11512171723228205, "grad_norm": 1.2553995847702026, "learning_rate": 1.0023889196182526e-06, "loss": 0.3936, "step": 480 }, { "epoch": 0.11752008634128792, "grad_norm": 2.9403693675994873, "learning_rate": 2.5090282323810766e-07, "loss": 1.2895, "step": 490 }, { "epoch": 0.1199184554502938, "grad_norm": 1.9742144346237183, "learning_rate": 0.0, "loss": 0.9806, "step": 500 }, { "epoch": 0.1199184554502938, "eval_loss": 0.401653915643692, "eval_runtime": 175.283, "eval_samples_per_second": 10.018, "eval_steps_per_second": 2.505, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6100295180288e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }