|
{ |
|
"best_metric": 1.860011100769043, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-600", |
|
"epoch": 0.16383370878558262, |
|
"eval_steps": 50, |
|
"global_step": 600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0002730561813093044, |
|
"eval_loss": 4.23640251159668, |
|
"eval_runtime": 242.769, |
|
"eval_samples_per_second": 25.407, |
|
"eval_steps_per_second": 6.352, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0027305618130930437, |
|
"grad_norm": 17.406291961669922, |
|
"learning_rate": 0.0002, |
|
"loss": 10.2808, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005461123626186087, |
|
"grad_norm": 11.717205047607422, |
|
"learning_rate": 0.0001998582695676762, |
|
"loss": 8.3282, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.008191685439279131, |
|
"grad_norm": 12.451804161071777, |
|
"learning_rate": 0.00019943348002101371, |
|
"loss": 8.6122, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.010922247252372175, |
|
"grad_norm": 14.363651275634766, |
|
"learning_rate": 0.00019872683547213446, |
|
"loss": 8.3687, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01365280906546522, |
|
"grad_norm": 34.83021545410156, |
|
"learning_rate": 0.00019774033898178667, |
|
"loss": 8.9732, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01365280906546522, |
|
"eval_loss": 2.319276809692383, |
|
"eval_runtime": 244.7603, |
|
"eval_samples_per_second": 25.2, |
|
"eval_steps_per_second": 6.3, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.016383370878558262, |
|
"grad_norm": 10.075106620788574, |
|
"learning_rate": 0.0001964767868814516, |
|
"loss": 8.2567, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.019113932691651306, |
|
"grad_norm": 12.682146072387695, |
|
"learning_rate": 0.00019493976084683813, |
|
"loss": 8.2734, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02184449450474435, |
|
"grad_norm": 13.281365394592285, |
|
"learning_rate": 0.00019313361774523385, |
|
"loss": 8.0909, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.024575056317837397, |
|
"grad_norm": 18.51453971862793, |
|
"learning_rate": 0.00019106347728549135, |
|
"loss": 8.4039, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.02730561813093044, |
|
"grad_norm": 36.62446212768555, |
|
"learning_rate": 0.00018873520750565718, |
|
"loss": 8.0267, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02730561813093044, |
|
"eval_loss": 2.1572701930999756, |
|
"eval_runtime": 245.0524, |
|
"eval_samples_per_second": 25.17, |
|
"eval_steps_per_second": 6.293, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.030036179944023484, |
|
"grad_norm": 10.612593650817871, |
|
"learning_rate": 0.0001861554081393806, |
|
"loss": 7.8833, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.032766741757116524, |
|
"grad_norm": 10.446393966674805, |
|
"learning_rate": 0.0001833313919082515, |
|
"loss": 8.3278, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03549730357020957, |
|
"grad_norm": 14.34956169128418, |
|
"learning_rate": 0.00018027116379309638, |
|
"loss": 8.2577, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03822786538330261, |
|
"grad_norm": 14.526724815368652, |
|
"learning_rate": 0.00017698339834299061, |
|
"loss": 7.8844, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.040958427196395655, |
|
"grad_norm": 37.110965728759766, |
|
"learning_rate": 0.00017347741508630672, |
|
"loss": 8.4204, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.040958427196395655, |
|
"eval_loss": 2.1126084327697754, |
|
"eval_runtime": 244.2124, |
|
"eval_samples_per_second": 25.257, |
|
"eval_steps_per_second": 6.314, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0436889890094887, |
|
"grad_norm": 12.598857879638672, |
|
"learning_rate": 0.0001697631521134985, |
|
"loss": 7.9538, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04641955082258175, |
|
"grad_norm": 11.983827590942383, |
|
"learning_rate": 0.00016585113790650388, |
|
"loss": 8.1731, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.04915011263567479, |
|
"grad_norm": 13.680892944335938, |
|
"learning_rate": 0.0001617524614946192, |
|
"loss": 7.9786, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.05188067444876784, |
|
"grad_norm": 16.39923858642578, |
|
"learning_rate": 0.0001574787410214407, |
|
"loss": 7.8267, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05461123626186088, |
|
"grad_norm": 35.790550231933594, |
|
"learning_rate": 0.00015304209081197425, |
|
"loss": 7.9481, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05461123626186088, |
|
"eval_loss": 2.1126017570495605, |
|
"eval_runtime": 244.847, |
|
"eval_samples_per_second": 25.191, |
|
"eval_steps_per_second": 6.298, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.057341798074953924, |
|
"grad_norm": 11.671783447265625, |
|
"learning_rate": 0.00014845508703326504, |
|
"loss": 8.1613, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.06007235988804697, |
|
"grad_norm": 11.406132698059082, |
|
"learning_rate": 0.00014373073204588556, |
|
"loss": 7.8322, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.06280292170114, |
|
"grad_norm": 11.966191291809082, |
|
"learning_rate": 0.00013888241754733208, |
|
"loss": 7.5859, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.06553348351423305, |
|
"grad_norm": 14.312826156616211, |
|
"learning_rate": 0.00013392388661180303, |
|
"loss": 8.059, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.06826404532732609, |
|
"grad_norm": 26.518217086791992, |
|
"learning_rate": 0.0001288691947339621, |
|
"loss": 7.8758, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06826404532732609, |
|
"eval_loss": 2.0528900623321533, |
|
"eval_runtime": 245.0717, |
|
"eval_samples_per_second": 25.168, |
|
"eval_steps_per_second": 6.292, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.07099460714041914, |
|
"grad_norm": 11.75592041015625, |
|
"learning_rate": 0.0001237326699871115, |
|
"loss": 7.8156, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.07372516895351218, |
|
"grad_norm": 10.208357810974121, |
|
"learning_rate": 0.00011852887240871145, |
|
"loss": 7.9382, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.07645573076660522, |
|
"grad_norm": 12.978534698486328, |
|
"learning_rate": 0.00011327255272837221, |
|
"loss": 7.6911, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.07918629257969827, |
|
"grad_norm": 15.648476600646973, |
|
"learning_rate": 0.00010797861055530831, |
|
"loss": 7.5699, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.08191685439279131, |
|
"grad_norm": 31.975311279296875, |
|
"learning_rate": 0.00010266205214377748, |
|
"loss": 8.2882, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08191685439279131, |
|
"eval_loss": 2.002148151397705, |
|
"eval_runtime": 245.0084, |
|
"eval_samples_per_second": 25.175, |
|
"eval_steps_per_second": 6.294, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08464741620588435, |
|
"grad_norm": 10.377389907836914, |
|
"learning_rate": 9.733794785622253e-05, |
|
"loss": 7.7774, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.0873779780189774, |
|
"grad_norm": 10.131440162658691, |
|
"learning_rate": 9.202138944469168e-05, |
|
"loss": 7.7762, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.09010853983207044, |
|
"grad_norm": 12.15221118927002, |
|
"learning_rate": 8.672744727162781e-05, |
|
"loss": 7.8511, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.0928391016451635, |
|
"grad_norm": 13.989408493041992, |
|
"learning_rate": 8.147112759128859e-05, |
|
"loss": 7.7617, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.09556966345825654, |
|
"grad_norm": 24.143465042114258, |
|
"learning_rate": 7.626733001288851e-05, |
|
"loss": 8.1177, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09556966345825654, |
|
"eval_loss": 1.9585822820663452, |
|
"eval_runtime": 245.0532, |
|
"eval_samples_per_second": 25.17, |
|
"eval_steps_per_second": 6.293, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.09830022527134959, |
|
"grad_norm": 10.882676124572754, |
|
"learning_rate": 7.113080526603792e-05, |
|
"loss": 7.953, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.10103078708444263, |
|
"grad_norm": 10.572400093078613, |
|
"learning_rate": 6.607611338819697e-05, |
|
"loss": 7.8085, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.10376134889753567, |
|
"grad_norm": 12.119694709777832, |
|
"learning_rate": 6.111758245266794e-05, |
|
"loss": 7.6887, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.10649191071062872, |
|
"grad_norm": 15.668352127075195, |
|
"learning_rate": 5.626926795411447e-05, |
|
"loss": 7.7462, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.10922247252372176, |
|
"grad_norm": 22.603986740112305, |
|
"learning_rate": 5.1544912966734994e-05, |
|
"loss": 7.8605, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.10922247252372176, |
|
"eval_loss": 1.8981908559799194, |
|
"eval_runtime": 245.2102, |
|
"eval_samples_per_second": 25.154, |
|
"eval_steps_per_second": 6.288, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1119530343368148, |
|
"grad_norm": 11.885726928710938, |
|
"learning_rate": 4.695790918802576e-05, |
|
"loss": 7.2024, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.11468359614990785, |
|
"grad_norm": 11.538261413574219, |
|
"learning_rate": 4.252125897855932e-05, |
|
"loss": 7.3657, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.11741415796300089, |
|
"grad_norm": 12.95737075805664, |
|
"learning_rate": 3.824753850538082e-05, |
|
"loss": 7.6156, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.12014471977609394, |
|
"grad_norm": 13.987143516540527, |
|
"learning_rate": 3.414886209349615e-05, |
|
"loss": 7.5926, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.12287528158918698, |
|
"grad_norm": 17.603551864624023, |
|
"learning_rate": 3.0236847886501542e-05, |
|
"loss": 7.9609, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.12287528158918698, |
|
"eval_loss": 1.8793390989303589, |
|
"eval_runtime": 244.4883, |
|
"eval_samples_per_second": 25.228, |
|
"eval_steps_per_second": 6.307, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.12560584340228, |
|
"grad_norm": 11.999287605285645, |
|
"learning_rate": 2.6522584913693294e-05, |
|
"loss": 7.4036, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.12833640521537307, |
|
"grad_norm": 12.017765998840332, |
|
"learning_rate": 2.301660165700936e-05, |
|
"loss": 7.5689, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.1310669670284661, |
|
"grad_norm": 12.495355606079102, |
|
"learning_rate": 1.9728836206903656e-05, |
|
"loss": 7.765, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.13379752884155915, |
|
"grad_norm": 13.036554336547852, |
|
"learning_rate": 1.6668608091748495e-05, |
|
"loss": 7.1911, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.13652809065465218, |
|
"grad_norm": 23.29970359802246, |
|
"learning_rate": 1.3844591860619383e-05, |
|
"loss": 7.5728, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.13652809065465218, |
|
"eval_loss": 1.8650081157684326, |
|
"eval_runtime": 245.1119, |
|
"eval_samples_per_second": 25.164, |
|
"eval_steps_per_second": 6.291, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.13925865246774524, |
|
"grad_norm": 10.133707046508789, |
|
"learning_rate": 1.1264792494342857e-05, |
|
"loss": 7.0925, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.14198921428083827, |
|
"grad_norm": 10.4485445022583, |
|
"learning_rate": 8.936522714508678e-06, |
|
"loss": 7.8088, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.14471977609393133, |
|
"grad_norm": 13.265645980834961, |
|
"learning_rate": 6.866382254766157e-06, |
|
"loss": 7.6878, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.14745033790702436, |
|
"grad_norm": 12.639242172241211, |
|
"learning_rate": 5.060239153161872e-06, |
|
"loss": 7.4659, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.15018089972011742, |
|
"grad_norm": 26.739566802978516, |
|
"learning_rate": 3.5232131185484076e-06, |
|
"loss": 7.7885, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.15018089972011742, |
|
"eval_loss": 1.862091302871704, |
|
"eval_runtime": 245.1426, |
|
"eval_samples_per_second": 25.161, |
|
"eval_steps_per_second": 6.29, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.15291146153321045, |
|
"grad_norm": 11.77377986907959, |
|
"learning_rate": 2.259661018213333e-06, |
|
"loss": 7.2398, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.1556420233463035, |
|
"grad_norm": 10.591424942016602, |
|
"learning_rate": 1.2731645278655445e-06, |
|
"loss": 7.6634, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.15837258515939653, |
|
"grad_norm": 12.146462440490723, |
|
"learning_rate": 5.665199789862907e-07, |
|
"loss": 7.112, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1611031469724896, |
|
"grad_norm": 15.12509822845459, |
|
"learning_rate": 1.4173043232380557e-07, |
|
"loss": 7.505, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.16383370878558262, |
|
"grad_norm": 28.584569931030273, |
|
"learning_rate": 0.0, |
|
"loss": 7.9429, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.16383370878558262, |
|
"eval_loss": 1.860011100769043, |
|
"eval_runtime": 244.4477, |
|
"eval_samples_per_second": 25.232, |
|
"eval_steps_per_second": 6.308, |
|
"step": 600 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.218776992088064e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|