{
  "best_metric": 1.4701261520385742,
  "best_model_checkpoint": "miner_id_24/checkpoint-450",
  "epoch": 0.07600067556156055,
  "eval_steps": 50,
  "global_step": 450,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00016889039013680122,
      "eval_loss": 2.846999168395996,
      "eval_runtime": 220.8139,
      "eval_samples_per_second": 11.295,
      "eval_steps_per_second": 2.826,
      "step": 1
    },
    {
      "epoch": 0.0016889039013680122,
      "grad_norm": 3.695821523666382,
      "learning_rate": 4.0400000000000006e-05,
      "loss": 4.3694,
      "step": 10
    },
    {
      "epoch": 0.0033778078027360244,
      "grad_norm": 6.247311592102051,
      "learning_rate": 8.080000000000001e-05,
      "loss": 3.9724,
      "step": 20
    },
    {
      "epoch": 0.005066711704104036,
      "grad_norm": 5.978872299194336,
      "learning_rate": 0.00012119999999999999,
      "loss": 3.5361,
      "step": 30
    },
    {
      "epoch": 0.006755615605472049,
      "grad_norm": 5.326415538787842,
      "learning_rate": 0.00016160000000000002,
      "loss": 3.546,
      "step": 40
    },
    {
      "epoch": 0.00844451950684006,
      "grad_norm": 9.239958763122559,
      "learning_rate": 0.000202,
      "loss": 3.7854,
      "step": 50
    },
    {
      "epoch": 0.00844451950684006,
      "eval_loss": 1.693381667137146,
      "eval_runtime": 220.7239,
      "eval_samples_per_second": 11.299,
      "eval_steps_per_second": 2.827,
      "step": 50
    },
    {
      "epoch": 0.010133423408208072,
      "grad_norm": 7.550833225250244,
      "learning_rate": 0.00020175396907624226,
      "loss": 2.9515,
      "step": 60
    },
    {
      "epoch": 0.011822327309576086,
      "grad_norm": 3.227522134780884,
      "learning_rate": 0.0002010170749428986,
      "loss": 3.2175,
      "step": 70
    },
    {
      "epoch": 0.013511231210944098,
      "grad_norm": 4.046976566314697,
      "learning_rate": 0.00019979290767411438,
      "loss": 3.5135,
      "step": 80
    },
    {
      "epoch": 0.01520013511231211,
      "grad_norm": 8.03056812286377,
      "learning_rate": 0.0001980874312897702,
      "loss": 3.6262,
      "step": 90
    },
    {
      "epoch": 0.01688903901368012,
      "grad_norm": 8.536865234375,
      "learning_rate": 0.00019590895469937675,
      "loss": 4.0803,
      "step": 100
    },
    {
      "epoch": 0.01688903901368012,
      "eval_loss": 1.7287895679473877,
      "eval_runtime": 221.2619,
      "eval_samples_per_second": 11.272,
      "eval_steps_per_second": 2.82,
      "step": 100
    },
    {
      "epoch": 0.018577942915048135,
      "grad_norm": 3.2818431854248047,
      "learning_rate": 0.0001932680912219027,
      "loss": 2.962,
      "step": 110
    },
    {
      "epoch": 0.020266846816416145,
      "grad_norm": 3.375999689102173,
      "learning_rate": 0.00019017770687875164,
      "loss": 3.0989,
      "step": 120
    },
    {
      "epoch": 0.021955750717784158,
      "grad_norm": 3.638335943222046,
      "learning_rate": 0.000186652857711799,
      "loss": 3.3325,
      "step": 130
    },
    {
      "epoch": 0.02364465461915217,
      "grad_norm": 6.7433762550354,
      "learning_rate": 0.00018271071643186968,
      "loss": 3.8392,
      "step": 140
    },
    {
      "epoch": 0.02533355852052018,
      "grad_norm": 6.1501593589782715,
      "learning_rate": 0.00017837048875501678,
      "loss": 3.8591,
      "step": 150
    },
    {
      "epoch": 0.02533355852052018,
      "eval_loss": 1.6560685634613037,
      "eval_runtime": 221.9305,
      "eval_samples_per_second": 11.238,
      "eval_steps_per_second": 2.812,
      "step": 150
    },
    {
      "epoch": 0.027022462421888195,
      "grad_norm": 2.800673246383667,
      "learning_rate": 0.00017365331983420376,
      "loss": 2.7496,
      "step": 160
    },
    {
      "epoch": 0.028711366323256205,
      "grad_norm": 2.549720525741577,
      "learning_rate": 0.0001685821912422447,
      "loss": 3.2596,
      "step": 170
    },
    {
      "epoch": 0.03040027022462422,
      "grad_norm": 4.524278163909912,
      "learning_rate": 0.00016318180900789148,
      "loss": 3.5261,
      "step": 180
    },
    {
      "epoch": 0.03208917412599223,
      "grad_norm": 5.028242588043213,
      "learning_rate": 0.00015747848325054544,
      "loss": 3.4352,
      "step": 190
    },
    {
      "epoch": 0.03377807802736024,
      "grad_norm": 6.687849044799805,
      "learning_rate": 0.0001515,
      "loss": 3.7396,
      "step": 200
    },
    {
      "epoch": 0.03377807802736024,
      "eval_loss": 1.6160459518432617,
      "eval_runtime": 221.3916,
      "eval_samples_per_second": 11.265,
      "eval_steps_per_second": 2.819,
      "step": 200
    },
    {
      "epoch": 0.035466981928728256,
      "grad_norm": 2.702817440032959,
      "learning_rate": 0.00014527548582569683,
      "loss": 2.6468,
      "step": 210
    },
    {
      "epoch": 0.03715588583009627,
      "grad_norm": 4.0655436515808105,
      "learning_rate": 0.00013883526593500714,
      "loss": 3.2802,
      "step": 220
    },
    {
      "epoch": 0.03884478973146428,
      "grad_norm": 3.6759231090545654,
      "learning_rate": 0.0001322107164318697,
      "loss": 3.3334,
      "step": 230
    },
    {
      "epoch": 0.04053369363283229,
      "grad_norm": 4.863379001617432,
      "learning_rate": 0.00012543411145556643,
      "loss": 3.2032,
      "step": 240
    },
    {
      "epoch": 0.0422225975342003,
      "grad_norm": 6.288718223571777,
      "learning_rate": 0.00011853846594435998,
      "loss": 3.5356,
      "step": 250
    },
    {
      "epoch": 0.0422225975342003,
      "eval_loss": 1.5846458673477173,
      "eval_runtime": 221.1464,
      "eval_samples_per_second": 11.278,
      "eval_steps_per_second": 2.822,
      "step": 250
    },
    {
      "epoch": 0.043911501435568316,
      "grad_norm": 3.2017464637756348,
      "learning_rate": 0.00011155737479003301,
      "loss": 2.8565,
      "step": 260
    },
    {
      "epoch": 0.04560040533693633,
      "grad_norm": 3.160783529281616,
      "learning_rate": 0.00010452484916695262,
      "loss": 3.1377,
      "step": 270
    },
    {
      "epoch": 0.04728930923830434,
      "grad_norm": 4.057458877563477,
      "learning_rate": 9.747515083304742e-05,
      "loss": 3.2321,
      "step": 280
    },
    {
      "epoch": 0.04897821313967235,
      "grad_norm": 3.2311322689056396,
      "learning_rate": 9.044262520996702e-05,
      "loss": 3.1823,
      "step": 290
    },
    {
      "epoch": 0.05066711704104036,
      "grad_norm": 6.725953578948975,
      "learning_rate": 8.346153405564004e-05,
      "loss": 3.3379,
      "step": 300
    },
    {
      "epoch": 0.05066711704104036,
      "eval_loss": 1.5483067035675049,
      "eval_runtime": 221.1074,
      "eval_samples_per_second": 11.28,
      "eval_steps_per_second": 2.822,
      "step": 300
    },
    {
      "epoch": 0.05235602094240838,
      "grad_norm": 2.5730559825897217,
      "learning_rate": 7.656588854443357e-05,
      "loss": 2.8366,
      "step": 310
    },
    {
      "epoch": 0.05404492484377639,
      "grad_norm": 2.557634115219116,
      "learning_rate": 6.978928356813031e-05,
      "loss": 2.963,
      "step": 320
    },
    {
      "epoch": 0.055733828745144404,
      "grad_norm": 3.629833221435547,
      "learning_rate": 6.316473406499288e-05,
      "loss": 2.9596,
      "step": 330
    },
    {
      "epoch": 0.05742273264651241,
      "grad_norm": 3.6670989990234375,
      "learning_rate": 5.672451417430317e-05,
      "loss": 3.2497,
      "step": 340
    },
    {
      "epoch": 0.059111636547880424,
      "grad_norm": 4.485354900360107,
      "learning_rate": 5.050000000000002e-05,
      "loss": 3.1528,
      "step": 350
    },
    {
      "epoch": 0.059111636547880424,
      "eval_loss": 1.5068892240524292,
      "eval_runtime": 221.0414,
      "eval_samples_per_second": 11.283,
      "eval_steps_per_second": 2.823,
      "step": 350
    },
    {
      "epoch": 0.06080054044924844,
      "grad_norm": 2.8697142601013184,
      "learning_rate": 4.452151674945458e-05,
      "loss": 2.7241,
      "step": 360
    },
    {
      "epoch": 0.06248944435061645,
      "grad_norm": 4.0349016189575195,
      "learning_rate": 3.8818190992108515e-05,
      "loss": 2.9007,
      "step": 370
    },
    {
      "epoch": 0.06417834825198446,
      "grad_norm": 3.124457359313965,
      "learning_rate": 3.3417808757755355e-05,
      "loss": 3.0556,
      "step": 380
    },
    {
      "epoch": 0.06586725215335247,
      "grad_norm": 3.4859092235565186,
      "learning_rate": 2.8346680165796253e-05,
      "loss": 3.115,
      "step": 390
    },
    {
      "epoch": 0.06755615605472048,
      "grad_norm": 4.689525127410889,
      "learning_rate": 2.362951124498323e-05,
      "loss": 3.1429,
      "step": 400
    },
    {
      "epoch": 0.06755615605472048,
      "eval_loss": 1.4789899587631226,
      "eval_runtime": 225.1952,
      "eval_samples_per_second": 11.075,
      "eval_steps_per_second": 2.771,
      "step": 400
    },
    {
      "epoch": 0.0692450599560885,
      "grad_norm": 2.2446751594543457,
      "learning_rate": 1.928928356813032e-05,
      "loss": 2.6135,
      "step": 410
    },
    {
      "epoch": 0.07093396385745651,
      "grad_norm": 2.9616000652313232,
      "learning_rate": 1.5347142288200977e-05,
      "loss": 2.8901,
      "step": 420
    },
    {
      "epoch": 0.07262286775882452,
      "grad_norm": 2.9250693321228027,
      "learning_rate": 1.1822293121248375e-05,
      "loss": 2.8757,
      "step": 430
    },
    {
      "epoch": 0.07431177166019254,
      "grad_norm": 2.9157168865203857,
      "learning_rate": 8.731908778097302e-06,
      "loss": 3.2046,
      "step": 440
    },
    {
      "epoch": 0.07600067556156055,
      "grad_norm": 5.299656867980957,
      "learning_rate": 6.09104530062326e-06,
      "loss": 3.061,
      "step": 450
    },
    {
      "epoch": 0.07600067556156055,
      "eval_loss": 1.4701261520385742,
      "eval_runtime": 221.003,
      "eval_samples_per_second": 11.285,
      "eval_steps_per_second": 2.823,
      "step": 450
    }
  ],
  "logging_steps": 10,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.44902656622592e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}