{ "best_metric": 1.9572868347167969, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.18014772113132768, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003602954422626554, "eval_loss": 2.904447555541992, "eval_runtime": 63.3703, "eval_samples_per_second": 18.447, "eval_steps_per_second": 4.624, "step": 1 }, { "epoch": 0.0036029544226265538, "grad_norm": 1.3350719213485718, "learning_rate": 4.02e-05, "loss": 2.9148, "step": 10 }, { "epoch": 0.0072059088452531075, "grad_norm": 0.9005210399627686, "learning_rate": 8.04e-05, "loss": 2.5671, "step": 20 }, { "epoch": 0.010808863267879661, "grad_norm": 0.7564655542373657, "learning_rate": 0.0001206, "loss": 2.4647, "step": 30 }, { "epoch": 0.014411817690506215, "grad_norm": 0.7415841817855835, "learning_rate": 0.0001608, "loss": 2.0857, "step": 40 }, { "epoch": 0.01801477211313277, "grad_norm": 1.4382355213165283, "learning_rate": 0.000201, "loss": 2.1977, "step": 50 }, { "epoch": 0.01801477211313277, "eval_loss": 2.32846999168396, "eval_runtime": 63.5256, "eval_samples_per_second": 18.402, "eval_steps_per_second": 4.612, "step": 50 }, { "epoch": 0.021617726535759323, "grad_norm": 0.6720779538154602, "learning_rate": 0.00020075518705111234, "loss": 2.4082, "step": 60 }, { "epoch": 0.025220680958385876, "grad_norm": 0.6647239327430725, "learning_rate": 0.00020002194090852784, "loss": 2.2645, "step": 70 }, { "epoch": 0.02882363538101243, "grad_norm": 0.6222731471061707, "learning_rate": 0.00019880383387374748, "loss": 2.2457, "step": 80 }, { "epoch": 0.03242658980363899, "grad_norm": 0.6799861192703247, "learning_rate": 0.00019710680044180106, "loss": 2.0081, "step": 90 }, { "epoch": 0.03602954422626554, "grad_norm": 1.6060737371444702, "learning_rate": 0.0001949391083889838, "loss": 2.1461, "step": 100 }, { "epoch": 0.03602954422626554, "eval_loss": 2.2165262699127197, "eval_runtime": 63.3419, "eval_samples_per_second": 18.455, "eval_steps_per_second": 4.626, "step": 100 }, { "epoch": 0.039632498648892095, "grad_norm": 0.6089875102043152, "learning_rate": 0.00019231131849308138, "loss": 2.2921, "step": 110 }, { "epoch": 0.043235453071518645, "grad_norm": 0.6623978614807129, "learning_rate": 0.00018923623308232218, "loss": 2.1825, "step": 120 }, { "epoch": 0.0468384074941452, "grad_norm": 0.6278699040412903, "learning_rate": 0.00018572883366372081, "loss": 2.1697, "step": 130 }, { "epoch": 0.05044136191677175, "grad_norm": 0.6262252330780029, "learning_rate": 0.00018180620793468224, "loss": 1.9721, "step": 140 }, { "epoch": 0.05404431633939831, "grad_norm": 1.050173282623291, "learning_rate": 0.00017748746653345728, "loss": 2.068, "step": 150 }, { "epoch": 0.05404431633939831, "eval_loss": 2.16098952293396, "eval_runtime": 63.2967, "eval_samples_per_second": 18.469, "eval_steps_per_second": 4.629, "step": 150 }, { "epoch": 0.05764727076202486, "grad_norm": 0.6275399327278137, "learning_rate": 0.00017279364993403443, "loss": 2.2454, "step": 160 }, { "epoch": 0.06125022518465142, "grad_norm": 0.6378840208053589, "learning_rate": 0.00016774762593906525, "loss": 2.1606, "step": 170 }, { "epoch": 0.06485317960727797, "grad_norm": 0.6081838011741638, "learning_rate": 0.00016237397827022866, "loss": 2.0971, "step": 180 }, { "epoch": 0.06845613402990453, "grad_norm": 0.5905754566192627, "learning_rate": 0.00015669888679881007, "loss": 1.9485, "step": 190 }, { "epoch": 0.07205908845253108, "grad_norm": 1.0750936269760132, "learning_rate": 0.00015075, "loss": 2.0445, "step": 200 }, { "epoch": 0.07205908845253108, "eval_loss": 2.1002931594848633, "eval_runtime": 63.4836, "eval_samples_per_second": 18.414, "eval_steps_per_second": 4.615, "step": 200 }, { "epoch": 0.07566204287515763, "grad_norm": 0.6942563652992249, "learning_rate": 0.00014455630025230227, "loss": 2.1977, "step": 210 }, { "epoch": 0.07926499729778419, "grad_norm": 0.6279932856559753, "learning_rate": 0.00013814796263829918, "loss": 2.1514, "step": 220 }, { "epoch": 0.08286795172041074, "grad_norm": 0.5558415055274963, "learning_rate": 0.00013155620793468223, "loss": 2.0617, "step": 230 }, { "epoch": 0.08647090614303729, "grad_norm": 0.532662034034729, "learning_rate": 0.0001248131505077666, "loss": 1.904, "step": 240 }, { "epoch": 0.09007386056566384, "grad_norm": 1.1002955436706543, "learning_rate": 0.00011795164185552652, "loss": 1.9553, "step": 250 }, { "epoch": 0.09007386056566384, "eval_loss": 2.065007448196411, "eval_runtime": 63.3385, "eval_samples_per_second": 18.456, "eval_steps_per_second": 4.626, "step": 250 }, { "epoch": 0.0936768149882904, "grad_norm": 0.5946807861328125, "learning_rate": 0.00011100511055839919, "loss": 2.193, "step": 260 }, { "epoch": 0.09727976941091696, "grad_norm": 0.5511180758476257, "learning_rate": 0.00010400739941860137, "loss": 2.0602, "step": 270 }, { "epoch": 0.1008827238335435, "grad_norm": 0.5188409090042114, "learning_rate": 9.699260058139868e-05, "loss": 2.0481, "step": 280 }, { "epoch": 0.10448567825617006, "grad_norm": 0.5795491337776184, "learning_rate": 8.999488944160085e-05, "loss": 1.8138, "step": 290 }, { "epoch": 0.10808863267879662, "grad_norm": 0.9792818427085876, "learning_rate": 8.30483581444735e-05, "loss": 1.8413, "step": 300 }, { "epoch": 0.10808863267879662, "eval_loss": 2.025852680206299, "eval_runtime": 63.3985, "eval_samples_per_second": 18.439, "eval_steps_per_second": 4.622, "step": 300 }, { "epoch": 0.11169158710142317, "grad_norm": 0.5762483477592468, "learning_rate": 7.618684949223341e-05, "loss": 2.1275, "step": 310 }, { "epoch": 0.11529454152404972, "grad_norm": 0.5712476968765259, "learning_rate": 6.94437920653178e-05, "loss": 2.0378, "step": 320 }, { "epoch": 0.11889749594667627, "grad_norm": 0.5272972583770752, "learning_rate": 6.285203736170084e-05, "loss": 1.9894, "step": 330 }, { "epoch": 0.12250045036930284, "grad_norm": 0.6240211129188538, "learning_rate": 5.6443699747697714e-05, "loss": 1.8432, "step": 340 }, { "epoch": 0.12610340479192939, "grad_norm": 0.9896277785301208, "learning_rate": 5.025000000000002e-05, "loss": 1.8631, "step": 350 }, { "epoch": 0.12610340479192939, "eval_loss": 1.9985251426696777, "eval_runtime": 63.5342, "eval_samples_per_second": 18.4, "eval_steps_per_second": 4.612, "step": 350 }, { "epoch": 0.12970635921455595, "grad_norm": 0.5958810448646545, "learning_rate": 4.430111320118996e-05, "loss": 2.1226, "step": 360 }, { "epoch": 0.13330931363718249, "grad_norm": 0.5658702254295349, "learning_rate": 3.862602172977134e-05, "loss": 2.008, "step": 370 }, { "epoch": 0.13691226805980905, "grad_norm": 0.5407539010047913, "learning_rate": 3.325237406093478e-05, "loss": 1.9236, "step": 380 }, { "epoch": 0.1405152224824356, "grad_norm": 0.521379292011261, "learning_rate": 2.820635006596558e-05, "loss": 1.7738, "step": 390 }, { "epoch": 0.14411817690506215, "grad_norm": 1.012351393699646, "learning_rate": 2.351253346654272e-05, "loss": 1.8504, "step": 400 }, { "epoch": 0.14411817690506215, "eval_loss": 1.971751093864441, "eval_runtime": 63.2383, "eval_samples_per_second": 18.486, "eval_steps_per_second": 4.633, "step": 400 }, { "epoch": 0.14772113132768872, "grad_norm": 0.5679916143417358, "learning_rate": 1.9193792065317794e-05, "loss": 2.0539, "step": 410 }, { "epoch": 0.15132408575031525, "grad_norm": 0.5590336918830872, "learning_rate": 1.5271166336279193e-05, "loss": 2.0377, "step": 420 }, { "epoch": 0.15492704017294182, "grad_norm": 0.5411215424537659, "learning_rate": 1.1763766917677837e-05, "loss": 1.9581, "step": 430 }, { "epoch": 0.15852999459556838, "grad_norm": 0.5810568928718567, "learning_rate": 8.688681506918602e-06, "loss": 1.8081, "step": 440 }, { "epoch": 0.16213294901819492, "grad_norm": 1.0498058795928955, "learning_rate": 6.060891611016215e-06, "loss": 1.8513, "step": 450 }, { "epoch": 0.16213294901819492, "eval_loss": 1.9625978469848633, "eval_runtime": 63.3636, "eval_samples_per_second": 18.449, "eval_steps_per_second": 4.624, "step": 450 }, { "epoch": 0.16573590344082148, "grad_norm": 0.5630381107330322, "learning_rate": 3.893199558198952e-06, "loss": 2.0962, "step": 460 }, { "epoch": 0.16933885786344802, "grad_norm": 0.5653451681137085, "learning_rate": 2.1961661262525285e-06, "loss": 1.948, "step": 470 }, { "epoch": 0.17294181228607458, "grad_norm": 0.528786838054657, "learning_rate": 9.780590914721787e-07, "loss": 1.9981, "step": 480 }, { "epoch": 0.17654476670870115, "grad_norm": 0.5580498576164246, "learning_rate": 2.4481294888766817e-07, "loss": 1.8072, "step": 490 }, { "epoch": 0.18014772113132768, "grad_norm": 1.0382150411605835, "learning_rate": 0.0, "loss": 1.8913, "step": 500 }, { "epoch": 0.18014772113132768, "eval_loss": 1.9572868347167969, "eval_runtime": 63.3164, "eval_samples_per_second": 18.463, "eval_steps_per_second": 4.628, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.000556687327232e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }