{
  "best_metric": 0.800472617149353,
  "best_model_checkpoint": "miner_id_24/checkpoint-500",
  "epoch": 0.18412815319462345,
  "eval_steps": 50,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0003682563063892469,
      "eval_loss": 2.228027582168579,
      "eval_runtime": 104.8652,
      "eval_samples_per_second": 10.909,
      "eval_steps_per_second": 2.727,
      "step": 1
    },
    {
      "epoch": 0.0036825630638924692,
      "grad_norm": 3.6284472942352295,
      "learning_rate": 4.2000000000000004e-05,
      "loss": 4.2939,
      "step": 10
    },
    {
      "epoch": 0.0073651261277849385,
      "grad_norm": 1.8257993459701538,
      "learning_rate": 8.400000000000001e-05,
      "loss": 3.6458,
      "step": 20
    },
    {
      "epoch": 0.011047689191677408,
      "grad_norm": 2.0177488327026367,
      "learning_rate": 0.000126,
      "loss": 3.1733,
      "step": 30
    },
    {
      "epoch": 0.014730252255569877,
      "grad_norm": 1.6397349834442139,
      "learning_rate": 0.00016800000000000002,
      "loss": 2.822,
      "step": 40
    },
    {
      "epoch": 0.018412815319462345,
      "grad_norm": 1.8872606754302979,
      "learning_rate": 0.00021,
      "loss": 2.5885,
      "step": 50
    },
    {
      "epoch": 0.018412815319462345,
      "eval_loss": 1.3068158626556396,
      "eval_runtime": 104.7025,
      "eval_samples_per_second": 10.926,
      "eval_steps_per_second": 2.732,
      "step": 50
    },
    {
      "epoch": 0.022095378383354815,
      "grad_norm": 1.3547375202178955,
      "learning_rate": 0.00020974422527728155,
      "loss": 2.5193,
      "step": 60
    },
    {
      "epoch": 0.025777941447247283,
      "grad_norm": 1.4830182790756226,
      "learning_rate": 0.0002089781472178649,
      "loss": 2.4821,
      "step": 70
    },
    {
      "epoch": 0.029460504511139754,
      "grad_norm": 1.312664270401001,
      "learning_rate": 0.0002077054980770496,
      "loss": 2.3063,
      "step": 80
    },
    {
      "epoch": 0.03314306757503222,
      "grad_norm": 1.2727351188659668,
      "learning_rate": 0.00020593247807352348,
      "loss": 2.3387,
      "step": 90
    },
    {
      "epoch": 0.03682563063892469,
      "grad_norm": 1.5231223106384277,
      "learning_rate": 0.00020366772518252038,
      "loss": 2.0461,
      "step": 100
    },
    {
      "epoch": 0.03682563063892469,
      "eval_loss": 1.1203691959381104,
      "eval_runtime": 104.8463,
      "eval_samples_per_second": 10.911,
      "eval_steps_per_second": 2.728,
      "step": 100
    },
    {
      "epoch": 0.040508193702817163,
      "grad_norm": 1.1611711978912354,
      "learning_rate": 0.0002009222730524731,
      "loss": 2.1793,
      "step": 110
    },
    {
      "epoch": 0.04419075676670963,
      "grad_norm": 1.2379854917526245,
      "learning_rate": 0.00019770949725018733,
      "loss": 2.153,
      "step": 120
    },
    {
      "epoch": 0.0478733198306021,
      "grad_norm": 1.262009859085083,
      "learning_rate": 0.00019404505009642473,
      "loss": 2.1378,
      "step": 130
    },
    {
      "epoch": 0.051555882894494566,
      "grad_norm": 1.2100757360458374,
      "learning_rate": 0.0001899467844093695,
      "loss": 2.0749,
      "step": 140
    },
    {
      "epoch": 0.05523844595838704,
      "grad_norm": 1.489582896232605,
      "learning_rate": 0.00018543466652749268,
      "loss": 1.9353,
      "step": 150
    },
    {
      "epoch": 0.05523844595838704,
      "eval_loss": 1.013873815536499,
      "eval_runtime": 104.9915,
      "eval_samples_per_second": 10.896,
      "eval_steps_per_second": 2.724,
      "step": 150
    },
    {
      "epoch": 0.05892100902227951,
      "grad_norm": 1.1306333541870117,
      "learning_rate": 0.00018053067903555837,
      "loss": 2.099,
      "step": 160
    },
    {
      "epoch": 0.06260357208617198,
      "grad_norm": 1.1093506813049316,
      "learning_rate": 0.00017525871366768012,
      "loss": 2.0291,
      "step": 170
    },
    {
      "epoch": 0.06628613515006444,
      "grad_norm": 1.1496107578277588,
      "learning_rate": 0.00016964445490919413,
      "loss": 2.0047,
      "step": 180
    },
    {
      "epoch": 0.06996869821395692,
      "grad_norm": 1.0870957374572754,
      "learning_rate": 0.00016371525486442843,
      "loss": 1.969,
      "step": 190
    },
    {
      "epoch": 0.07365126127784938,
      "grad_norm": 1.6815812587738037,
      "learning_rate": 0.0001575,
      "loss": 1.8075,
      "step": 200
    },
    {
      "epoch": 0.07365126127784938,
      "eval_loss": 0.967093825340271,
      "eval_runtime": 104.7287,
      "eval_samples_per_second": 10.923,
      "eval_steps_per_second": 2.731,
      "step": 200
    },
    {
      "epoch": 0.07733382434174185,
      "grad_norm": 1.2532967329025269,
      "learning_rate": 0.00015102897041285315,
      "loss": 1.9389,
      "step": 210
    },
    {
      "epoch": 0.08101638740563433,
      "grad_norm": 1.169727087020874,
      "learning_rate": 0.00014433369230867077,
      "loss": 1.9091,
      "step": 220
    },
    {
      "epoch": 0.08469895046952679,
      "grad_norm": 1.0527361631393433,
      "learning_rate": 0.0001374467844093695,
      "loss": 1.902,
      "step": 230
    },
    {
      "epoch": 0.08838151353341926,
      "grad_norm": 1.1085879802703857,
      "learning_rate": 0.0001304017990379651,
      "loss": 1.8043,
      "step": 240
    },
    {
      "epoch": 0.09206407659731172,
      "grad_norm": 1.3278361558914185,
      "learning_rate": 0.0001232330586550277,
      "loss": 1.5737,
      "step": 250
    },
    {
      "epoch": 0.09206407659731172,
      "eval_loss": 0.9121530055999756,
      "eval_runtime": 104.8025,
      "eval_samples_per_second": 10.916,
      "eval_steps_per_second": 2.729,
      "step": 250
    },
    {
      "epoch": 0.0957466396612042,
      "grad_norm": 1.1355035305023193,
      "learning_rate": 0.00011597548864310363,
      "loss": 1.8759,
      "step": 260
    },
    {
      "epoch": 0.09942920272509667,
      "grad_norm": 1.1549241542816162,
      "learning_rate": 0.00010866444715376263,
      "loss": 1.8859,
      "step": 270
    },
    {
      "epoch": 0.10311176578898913,
      "grad_norm": 1.074413537979126,
      "learning_rate": 0.00010133555284623744,
      "loss": 1.8034,
      "step": 280
    },
    {
      "epoch": 0.1067943288528816,
      "grad_norm": 1.0826478004455566,
      "learning_rate": 9.402451135689641e-05,
      "loss": 1.8102,
      "step": 290
    },
    {
      "epoch": 0.11047689191677408,
      "grad_norm": 1.3882075548171997,
      "learning_rate": 8.676694134497232e-05,
      "loss": 1.6126,
      "step": 300
    },
    {
      "epoch": 0.11047689191677408,
      "eval_loss": 0.8696406483650208,
      "eval_runtime": 104.6164,
      "eval_samples_per_second": 10.935,
      "eval_steps_per_second": 2.734,
      "step": 300
    },
    {
      "epoch": 0.11415945498066654,
      "grad_norm": 0.9779600501060486,
      "learning_rate": 7.95982009620349e-05,
      "loss": 1.7683,
      "step": 310
    },
    {
      "epoch": 0.11784201804455902,
      "grad_norm": 1.0219613313674927,
      "learning_rate": 7.255321559063053e-05,
      "loss": 1.6802,
      "step": 320
    },
    {
      "epoch": 0.12152458110845148,
      "grad_norm": 1.0174745321273804,
      "learning_rate": 6.566630769132923e-05,
      "loss": 1.7034,
      "step": 330
    },
    {
      "epoch": 0.12520714417234396,
      "grad_norm": 1.0329629182815552,
      "learning_rate": 5.897102958714686e-05,
      "loss": 1.6781,
      "step": 340
    },
    {
      "epoch": 0.1288897072362364,
      "grad_norm": 1.218892216682434,
      "learning_rate": 5.250000000000002e-05,
      "loss": 1.4372,
      "step": 350
    },
    {
      "epoch": 0.1288897072362364,
      "eval_loss": 0.8357728123664856,
      "eval_runtime": 104.8025,
      "eval_samples_per_second": 10.916,
      "eval_steps_per_second": 2.729,
      "step": 350
    },
    {
      "epoch": 0.13257227030012889,
      "grad_norm": 1.0614081621170044,
      "learning_rate": 4.62847451355716e-05,
      "loss": 1.7161,
      "step": 360
    },
    {
      "epoch": 0.13625483336402136,
      "grad_norm": 1.2188026905059814,
      "learning_rate": 4.035554509080588e-05,
      "loss": 1.6848,
      "step": 370
    },
    {
      "epoch": 0.13993739642791383,
      "grad_norm": 1.0060714483261108,
      "learning_rate": 3.474128633231992e-05,
      "loss": 1.6229,
      "step": 380
    },
    {
      "epoch": 0.1436199594918063,
      "grad_norm": 1.1352717876434326,
      "learning_rate": 2.946932096444165e-05,
      "loss": 1.6946,
      "step": 390
    },
    {
      "epoch": 0.14730252255569876,
      "grad_norm": 1.3413875102996826,
      "learning_rate": 2.456533347250732e-05,
      "loss": 1.5453,
      "step": 400
    },
    {
      "epoch": 0.14730252255569876,
      "eval_loss": 0.8128843307495117,
      "eval_runtime": 104.9077,
      "eval_samples_per_second": 10.905,
      "eval_steps_per_second": 2.726,
      "step": 400
    },
    {
      "epoch": 0.15098508561959123,
      "grad_norm": 0.9193152189254761,
      "learning_rate": 2.005321559063053e-05,
      "loss": 1.6275,
      "step": 410
    },
    {
      "epoch": 0.1546676486834837,
      "grad_norm": 1.0370545387268066,
      "learning_rate": 1.5954949903575276e-05,
      "loss": 1.6423,
      "step": 420
    },
    {
      "epoch": 0.15835021174737618,
      "grad_norm": 0.9968223571777344,
      "learning_rate": 1.2290502749812666e-05,
      "loss": 1.5805,
      "step": 430
    },
    {
      "epoch": 0.16203277481126865,
      "grad_norm": 0.9633427858352661,
      "learning_rate": 9.077726947526898e-06,
      "loss": 1.6058,
      "step": 440
    },
    {
      "epoch": 0.1657153378751611,
      "grad_norm": 1.313849687576294,
      "learning_rate": 6.332274817479627e-06,
      "loss": 1.4868,
      "step": 450
    },
    {
      "epoch": 0.1657153378751611,
      "eval_loss": 0.8024091124534607,
      "eval_runtime": 104.8859,
      "eval_samples_per_second": 10.907,
      "eval_steps_per_second": 2.727,
      "step": 450
    },
    {
      "epoch": 0.16939790093905358,
      "grad_norm": 1.1269716024398804,
      "learning_rate": 4.067521926476516e-06,
      "loss": 1.6888,
      "step": 460
    },
    {
      "epoch": 0.17308046400294605,
      "grad_norm": 1.138323426246643,
      "learning_rate": 2.294501922950403e-06,
      "loss": 1.6791,
      "step": 470
    },
    {
      "epoch": 0.17676302706683852,
      "grad_norm": 1.0652822256088257,
      "learning_rate": 1.021852782135112e-06,
      "loss": 1.6347,
      "step": 480
    },
    {
      "epoch": 0.180445590130731,
      "grad_norm": 0.9608269333839417,
      "learning_rate": 2.5577472271845927e-07,
      "loss": 1.7125,
      "step": 490
    },
    {
      "epoch": 0.18412815319462345,
      "grad_norm": 1.3455790281295776,
      "learning_rate": 0.0,
      "loss": 1.5007,
      "step": 500
    },
    {
      "epoch": 0.18412815319462345,
      "eval_loss": 0.800472617149353,
      "eval_runtime": 104.603,
      "eval_samples_per_second": 10.937,
      "eval_steps_per_second": 2.734,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.5092811439276032e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}