{ "best_metric": 1.4685018062591553, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.0844451950684006, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00016889039013680122, "eval_loss": 2.846999168395996, "eval_runtime": 220.8139, "eval_samples_per_second": 11.295, "eval_steps_per_second": 2.826, "step": 1 }, { "epoch": 0.0016889039013680122, "grad_norm": 3.695821523666382, "learning_rate": 4.0400000000000006e-05, "loss": 4.3694, "step": 10 }, { "epoch": 0.0033778078027360244, "grad_norm": 6.247311592102051, "learning_rate": 8.080000000000001e-05, "loss": 3.9724, "step": 20 }, { "epoch": 0.005066711704104036, "grad_norm": 5.978872299194336, "learning_rate": 0.00012119999999999999, "loss": 3.5361, "step": 30 }, { "epoch": 0.006755615605472049, "grad_norm": 5.326415538787842, "learning_rate": 0.00016160000000000002, "loss": 3.546, "step": 40 }, { "epoch": 0.00844451950684006, "grad_norm": 9.239958763122559, "learning_rate": 0.000202, "loss": 3.7854, "step": 50 }, { "epoch": 0.00844451950684006, "eval_loss": 1.693381667137146, "eval_runtime": 220.7239, "eval_samples_per_second": 11.299, "eval_steps_per_second": 2.827, "step": 50 }, { "epoch": 0.010133423408208072, "grad_norm": 7.550833225250244, "learning_rate": 0.00020175396907624226, "loss": 2.9515, "step": 60 }, { "epoch": 0.011822327309576086, "grad_norm": 3.227522134780884, "learning_rate": 0.0002010170749428986, "loss": 3.2175, "step": 70 }, { "epoch": 0.013511231210944098, "grad_norm": 4.046976566314697, "learning_rate": 0.00019979290767411438, "loss": 3.5135, "step": 80 }, { "epoch": 0.01520013511231211, "grad_norm": 8.03056812286377, "learning_rate": 0.0001980874312897702, "loss": 3.6262, "step": 90 }, { "epoch": 0.01688903901368012, "grad_norm": 8.536865234375, "learning_rate": 0.00019590895469937675, "loss": 4.0803, "step": 100 }, { "epoch": 0.01688903901368012, "eval_loss": 1.7287895679473877, "eval_runtime": 221.2619, "eval_samples_per_second": 11.272, "eval_steps_per_second": 2.82, "step": 100 }, { "epoch": 0.018577942915048135, "grad_norm": 3.2818431854248047, "learning_rate": 0.0001932680912219027, "loss": 2.962, "step": 110 }, { "epoch": 0.020266846816416145, "grad_norm": 3.375999689102173, "learning_rate": 0.00019017770687875164, "loss": 3.0989, "step": 120 }, { "epoch": 0.021955750717784158, "grad_norm": 3.638335943222046, "learning_rate": 0.000186652857711799, "loss": 3.3325, "step": 130 }, { "epoch": 0.02364465461915217, "grad_norm": 6.7433762550354, "learning_rate": 0.00018271071643186968, "loss": 3.8392, "step": 140 }, { "epoch": 0.02533355852052018, "grad_norm": 6.1501593589782715, "learning_rate": 0.00017837048875501678, "loss": 3.8591, "step": 150 }, { "epoch": 0.02533355852052018, "eval_loss": 1.6560685634613037, "eval_runtime": 221.9305, "eval_samples_per_second": 11.238, "eval_steps_per_second": 2.812, "step": 150 }, { "epoch": 0.027022462421888195, "grad_norm": 2.800673246383667, "learning_rate": 0.00017365331983420376, "loss": 2.7496, "step": 160 }, { "epoch": 0.028711366323256205, "grad_norm": 2.549720525741577, "learning_rate": 0.0001685821912422447, "loss": 3.2596, "step": 170 }, { "epoch": 0.03040027022462422, "grad_norm": 4.524278163909912, "learning_rate": 0.00016318180900789148, "loss": 3.5261, "step": 180 }, { "epoch": 0.03208917412599223, "grad_norm": 5.028242588043213, "learning_rate": 0.00015747848325054544, "loss": 3.4352, "step": 190 }, { "epoch": 0.03377807802736024, "grad_norm": 6.687849044799805, "learning_rate": 0.0001515, "loss": 3.7396, "step": 200 }, { "epoch": 0.03377807802736024, "eval_loss": 1.6160459518432617, "eval_runtime": 221.3916, "eval_samples_per_second": 11.265, "eval_steps_per_second": 2.819, "step": 200 }, { "epoch": 0.035466981928728256, "grad_norm": 2.702817440032959, "learning_rate": 0.00014527548582569683, "loss": 2.6468, "step": 210 }, { "epoch": 0.03715588583009627, "grad_norm": 4.0655436515808105, "learning_rate": 0.00013883526593500714, "loss": 3.2802, "step": 220 }, { "epoch": 0.03884478973146428, "grad_norm": 3.6759231090545654, "learning_rate": 0.0001322107164318697, "loss": 3.3334, "step": 230 }, { "epoch": 0.04053369363283229, "grad_norm": 4.863379001617432, "learning_rate": 0.00012543411145556643, "loss": 3.2032, "step": 240 }, { "epoch": 0.0422225975342003, "grad_norm": 6.288718223571777, "learning_rate": 0.00011853846594435998, "loss": 3.5356, "step": 250 }, { "epoch": 0.0422225975342003, "eval_loss": 1.5846458673477173, "eval_runtime": 221.1464, "eval_samples_per_second": 11.278, "eval_steps_per_second": 2.822, "step": 250 }, { "epoch": 0.043911501435568316, "grad_norm": 3.2017464637756348, "learning_rate": 0.00011155737479003301, "loss": 2.8565, "step": 260 }, { "epoch": 0.04560040533693633, "grad_norm": 3.160783529281616, "learning_rate": 0.00010452484916695262, "loss": 3.1377, "step": 270 }, { "epoch": 0.04728930923830434, "grad_norm": 4.057458877563477, "learning_rate": 9.747515083304742e-05, "loss": 3.2321, "step": 280 }, { "epoch": 0.04897821313967235, "grad_norm": 3.2311322689056396, "learning_rate": 9.044262520996702e-05, "loss": 3.1823, "step": 290 }, { "epoch": 0.05066711704104036, "grad_norm": 6.725953578948975, "learning_rate": 8.346153405564004e-05, "loss": 3.3379, "step": 300 }, { "epoch": 0.05066711704104036, "eval_loss": 1.5483067035675049, "eval_runtime": 221.1074, "eval_samples_per_second": 11.28, "eval_steps_per_second": 2.822, "step": 300 }, { "epoch": 0.05235602094240838, "grad_norm": 2.5730559825897217, "learning_rate": 7.656588854443357e-05, "loss": 2.8366, "step": 310 }, { "epoch": 0.05404492484377639, "grad_norm": 2.557634115219116, "learning_rate": 6.978928356813031e-05, "loss": 2.963, "step": 320 }, { "epoch": 0.055733828745144404, "grad_norm": 3.629833221435547, "learning_rate": 6.316473406499288e-05, "loss": 2.9596, "step": 330 }, { "epoch": 0.05742273264651241, "grad_norm": 3.6670989990234375, "learning_rate": 5.672451417430317e-05, "loss": 3.2497, "step": 340 }, { "epoch": 0.059111636547880424, "grad_norm": 4.485354900360107, "learning_rate": 5.050000000000002e-05, "loss": 3.1528, "step": 350 }, { "epoch": 0.059111636547880424, "eval_loss": 1.5068892240524292, "eval_runtime": 221.0414, "eval_samples_per_second": 11.283, "eval_steps_per_second": 2.823, "step": 350 }, { "epoch": 0.06080054044924844, "grad_norm": 2.8697142601013184, "learning_rate": 4.452151674945458e-05, "loss": 2.7241, "step": 360 }, { "epoch": 0.06248944435061645, "grad_norm": 4.0349016189575195, "learning_rate": 3.8818190992108515e-05, "loss": 2.9007, "step": 370 }, { "epoch": 0.06417834825198446, "grad_norm": 3.124457359313965, "learning_rate": 3.3417808757755355e-05, "loss": 3.0556, "step": 380 }, { "epoch": 0.06586725215335247, "grad_norm": 3.4859092235565186, "learning_rate": 2.8346680165796253e-05, "loss": 3.115, "step": 390 }, { "epoch": 0.06755615605472048, "grad_norm": 4.689525127410889, "learning_rate": 2.362951124498323e-05, "loss": 3.1429, "step": 400 }, { "epoch": 0.06755615605472048, "eval_loss": 1.4789899587631226, "eval_runtime": 225.1952, "eval_samples_per_second": 11.075, "eval_steps_per_second": 2.771, "step": 400 }, { "epoch": 0.0692450599560885, "grad_norm": 2.2446751594543457, "learning_rate": 1.928928356813032e-05, "loss": 2.6135, "step": 410 }, { "epoch": 0.07093396385745651, "grad_norm": 2.9616000652313232, "learning_rate": 1.5347142288200977e-05, "loss": 2.8901, "step": 420 }, { "epoch": 0.07262286775882452, "grad_norm": 2.9250693321228027, "learning_rate": 1.1822293121248375e-05, "loss": 2.8757, "step": 430 }, { "epoch": 0.07431177166019254, "grad_norm": 2.9157168865203857, "learning_rate": 8.731908778097302e-06, "loss": 3.2046, "step": 440 }, { "epoch": 0.07600067556156055, "grad_norm": 5.299656867980957, "learning_rate": 6.09104530062326e-06, "loss": 3.061, "step": 450 }, { "epoch": 0.07600067556156055, "eval_loss": 1.4701261520385742, "eval_runtime": 221.003, "eval_samples_per_second": 11.285, "eval_steps_per_second": 2.823, "step": 450 }, { "epoch": 0.07768957946292857, "grad_norm": 1.9850503206253052, "learning_rate": 3.912568710229791e-06, "loss": 2.6809, "step": 460 }, { "epoch": 0.07937848336429656, "grad_norm": 2.561471700668335, "learning_rate": 2.2070923258856255e-06, "loss": 3.0437, "step": 470 }, { "epoch": 0.08106738726566458, "grad_norm": 2.748377799987793, "learning_rate": 9.829250571013935e-07, "loss": 2.9728, "step": 480 }, { "epoch": 0.08275629116703259, "grad_norm": 3.935912609100342, "learning_rate": 2.4603092375775605e-07, "loss": 3.1649, "step": 490 }, { "epoch": 0.0844451950684006, "grad_norm": 5.447554588317871, "learning_rate": 0.0, "loss": 3.0799, "step": 500 }, { "epoch": 0.0844451950684006, "eval_loss": 1.4685018062591553, "eval_runtime": 221.3238, "eval_samples_per_second": 11.269, "eval_steps_per_second": 2.819, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6100295180288e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }