{ "best_metric": 0.800472617149353, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.18412815319462345, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003682563063892469, "eval_loss": 2.228027582168579, "eval_runtime": 104.8652, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.727, "step": 1 }, { "epoch": 0.0036825630638924692, "grad_norm": 3.6284472942352295, "learning_rate": 4.2000000000000004e-05, "loss": 4.2939, "step": 10 }, { "epoch": 0.0073651261277849385, "grad_norm": 1.8257993459701538, "learning_rate": 8.400000000000001e-05, "loss": 3.6458, "step": 20 }, { "epoch": 0.011047689191677408, "grad_norm": 2.0177488327026367, "learning_rate": 0.000126, "loss": 3.1733, "step": 30 }, { "epoch": 0.014730252255569877, "grad_norm": 1.6397349834442139, "learning_rate": 0.00016800000000000002, "loss": 2.822, "step": 40 }, { "epoch": 0.018412815319462345, "grad_norm": 1.8872606754302979, "learning_rate": 0.00021, "loss": 2.5885, "step": 50 }, { "epoch": 0.018412815319462345, "eval_loss": 1.3068158626556396, "eval_runtime": 104.7025, "eval_samples_per_second": 10.926, "eval_steps_per_second": 2.732, "step": 50 }, { "epoch": 0.022095378383354815, "grad_norm": 1.3547375202178955, "learning_rate": 0.00020974422527728155, "loss": 2.5193, "step": 60 }, { "epoch": 0.025777941447247283, "grad_norm": 1.4830182790756226, "learning_rate": 0.0002089781472178649, "loss": 2.4821, "step": 70 }, { "epoch": 0.029460504511139754, "grad_norm": 1.312664270401001, "learning_rate": 0.0002077054980770496, "loss": 2.3063, "step": 80 }, { "epoch": 0.03314306757503222, "grad_norm": 1.2727351188659668, "learning_rate": 0.00020593247807352348, "loss": 2.3387, "step": 90 }, { "epoch": 0.03682563063892469, "grad_norm": 1.5231223106384277, "learning_rate": 0.00020366772518252038, "loss": 2.0461, "step": 100 }, { "epoch": 0.03682563063892469, "eval_loss": 1.1203691959381104, "eval_runtime": 104.8463, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.728, "step": 100 }, { "epoch": 0.040508193702817163, "grad_norm": 1.1611711978912354, "learning_rate": 0.0002009222730524731, "loss": 2.1793, "step": 110 }, { "epoch": 0.04419075676670963, "grad_norm": 1.2379854917526245, "learning_rate": 0.00019770949725018733, "loss": 2.153, "step": 120 }, { "epoch": 0.0478733198306021, "grad_norm": 1.262009859085083, "learning_rate": 0.00019404505009642473, "loss": 2.1378, "step": 130 }, { "epoch": 0.051555882894494566, "grad_norm": 1.2100757360458374, "learning_rate": 0.0001899467844093695, "loss": 2.0749, "step": 140 }, { "epoch": 0.05523844595838704, "grad_norm": 1.489582896232605, "learning_rate": 0.00018543466652749268, "loss": 1.9353, "step": 150 }, { "epoch": 0.05523844595838704, "eval_loss": 1.013873815536499, "eval_runtime": 104.9915, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.724, "step": 150 }, { "epoch": 0.05892100902227951, "grad_norm": 1.1306333541870117, "learning_rate": 0.00018053067903555837, "loss": 2.099, "step": 160 }, { "epoch": 0.06260357208617198, "grad_norm": 1.1093506813049316, "learning_rate": 0.00017525871366768012, "loss": 2.0291, "step": 170 }, { "epoch": 0.06628613515006444, "grad_norm": 1.1496107578277588, "learning_rate": 0.00016964445490919413, "loss": 2.0047, "step": 180 }, { "epoch": 0.06996869821395692, "grad_norm": 1.0870957374572754, "learning_rate": 0.00016371525486442843, "loss": 1.969, "step": 190 }, { "epoch": 0.07365126127784938, "grad_norm": 1.6815812587738037, "learning_rate": 0.0001575, "loss": 1.8075, "step": 200 }, { "epoch": 0.07365126127784938, "eval_loss": 0.967093825340271, "eval_runtime": 104.7287, "eval_samples_per_second": 10.923, "eval_steps_per_second": 2.731, "step": 200 }, { "epoch": 0.07733382434174185, "grad_norm": 1.2532967329025269, "learning_rate": 0.00015102897041285315, "loss": 1.9389, "step": 210 }, { "epoch": 0.08101638740563433, "grad_norm": 1.169727087020874, "learning_rate": 0.00014433369230867077, "loss": 1.9091, "step": 220 }, { "epoch": 0.08469895046952679, "grad_norm": 1.0527361631393433, "learning_rate": 0.0001374467844093695, "loss": 1.902, "step": 230 }, { "epoch": 0.08838151353341926, "grad_norm": 1.1085879802703857, "learning_rate": 0.0001304017990379651, "loss": 1.8043, "step": 240 }, { "epoch": 0.09206407659731172, "grad_norm": 1.3278361558914185, "learning_rate": 0.0001232330586550277, "loss": 1.5737, "step": 250 }, { "epoch": 0.09206407659731172, "eval_loss": 0.9121530055999756, "eval_runtime": 104.8025, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.729, "step": 250 }, { "epoch": 0.0957466396612042, "grad_norm": 1.1355035305023193, "learning_rate": 0.00011597548864310363, "loss": 1.8759, "step": 260 }, { "epoch": 0.09942920272509667, "grad_norm": 1.1549241542816162, "learning_rate": 0.00010866444715376263, "loss": 1.8859, "step": 270 }, { "epoch": 0.10311176578898913, "grad_norm": 1.074413537979126, "learning_rate": 0.00010133555284623744, "loss": 1.8034, "step": 280 }, { "epoch": 0.1067943288528816, "grad_norm": 1.0826478004455566, "learning_rate": 9.402451135689641e-05, "loss": 1.8102, "step": 290 }, { "epoch": 0.11047689191677408, "grad_norm": 1.3882075548171997, "learning_rate": 8.676694134497232e-05, "loss": 1.6126, "step": 300 }, { "epoch": 0.11047689191677408, "eval_loss": 0.8696406483650208, "eval_runtime": 104.6164, "eval_samples_per_second": 10.935, "eval_steps_per_second": 2.734, "step": 300 }, { "epoch": 0.11415945498066654, "grad_norm": 0.9779600501060486, "learning_rate": 7.95982009620349e-05, "loss": 1.7683, "step": 310 }, { "epoch": 0.11784201804455902, "grad_norm": 1.0219613313674927, "learning_rate": 7.255321559063053e-05, "loss": 1.6802, "step": 320 }, { "epoch": 0.12152458110845148, "grad_norm": 1.0174745321273804, "learning_rate": 6.566630769132923e-05, "loss": 1.7034, "step": 330 }, { "epoch": 0.12520714417234396, "grad_norm": 1.0329629182815552, "learning_rate": 5.897102958714686e-05, "loss": 1.6781, "step": 340 }, { "epoch": 0.1288897072362364, "grad_norm": 1.218892216682434, "learning_rate": 5.250000000000002e-05, "loss": 1.4372, "step": 350 }, { "epoch": 0.1288897072362364, "eval_loss": 0.8357728123664856, "eval_runtime": 104.8025, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.729, "step": 350 }, { "epoch": 0.13257227030012889, "grad_norm": 1.0614081621170044, "learning_rate": 4.62847451355716e-05, "loss": 1.7161, "step": 360 }, { "epoch": 0.13625483336402136, "grad_norm": 1.2188026905059814, "learning_rate": 4.035554509080588e-05, "loss": 1.6848, "step": 370 }, { "epoch": 0.13993739642791383, "grad_norm": 1.0060714483261108, "learning_rate": 3.474128633231992e-05, "loss": 1.6229, "step": 380 }, { "epoch": 0.1436199594918063, "grad_norm": 1.1352717876434326, "learning_rate": 2.946932096444165e-05, "loss": 1.6946, "step": 390 }, { "epoch": 0.14730252255569876, "grad_norm": 1.3413875102996826, "learning_rate": 2.456533347250732e-05, "loss": 1.5453, "step": 400 }, { "epoch": 0.14730252255569876, "eval_loss": 0.8128843307495117, "eval_runtime": 104.9077, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.726, "step": 400 }, { "epoch": 0.15098508561959123, "grad_norm": 0.9193152189254761, "learning_rate": 2.005321559063053e-05, "loss": 1.6275, "step": 410 }, { "epoch": 0.1546676486834837, "grad_norm": 1.0370545387268066, "learning_rate": 1.5954949903575276e-05, "loss": 1.6423, "step": 420 }, { "epoch": 0.15835021174737618, "grad_norm": 0.9968223571777344, "learning_rate": 1.2290502749812666e-05, "loss": 1.5805, "step": 430 }, { "epoch": 0.16203277481126865, "grad_norm": 0.9633427858352661, "learning_rate": 9.077726947526898e-06, "loss": 1.6058, "step": 440 }, { "epoch": 0.1657153378751611, "grad_norm": 1.313849687576294, "learning_rate": 6.332274817479627e-06, "loss": 1.4868, "step": 450 }, { "epoch": 0.1657153378751611, "eval_loss": 0.8024091124534607, "eval_runtime": 104.8859, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.727, "step": 450 }, { "epoch": 0.16939790093905358, "grad_norm": 1.1269716024398804, "learning_rate": 4.067521926476516e-06, "loss": 1.6888, "step": 460 }, { "epoch": 0.17308046400294605, "grad_norm": 1.138323426246643, "learning_rate": 2.294501922950403e-06, "loss": 1.6791, "step": 470 }, { "epoch": 0.17676302706683852, "grad_norm": 1.0652822256088257, "learning_rate": 1.021852782135112e-06, "loss": 1.6347, "step": 480 }, { "epoch": 0.180445590130731, "grad_norm": 0.9608269333839417, "learning_rate": 2.5577472271845927e-07, "loss": 1.7125, "step": 490 }, { "epoch": 0.18412815319462345, "grad_norm": 1.3455790281295776, "learning_rate": 0.0, "loss": 1.5007, "step": 500 }, { "epoch": 0.18412815319462345, "eval_loss": 0.800472617149353, "eval_runtime": 104.603, "eval_samples_per_second": 10.937, "eval_steps_per_second": 2.734, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5092811439276032e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }