|
{ |
|
"best_metric": 1.9572868347167969, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.18014772113132768, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0003602954422626554, |
|
"eval_loss": 2.904447555541992, |
|
"eval_runtime": 63.3703, |
|
"eval_samples_per_second": 18.447, |
|
"eval_steps_per_second": 4.624, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0036029544226265538, |
|
"grad_norm": 1.3350719213485718, |
|
"learning_rate": 4.02e-05, |
|
"loss": 2.9148, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0072059088452531075, |
|
"grad_norm": 0.9005210399627686, |
|
"learning_rate": 8.04e-05, |
|
"loss": 2.5671, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.010808863267879661, |
|
"grad_norm": 0.7564655542373657, |
|
"learning_rate": 0.0001206, |
|
"loss": 2.4647, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.014411817690506215, |
|
"grad_norm": 0.7415841817855835, |
|
"learning_rate": 0.0001608, |
|
"loss": 2.0857, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01801477211313277, |
|
"grad_norm": 1.4382355213165283, |
|
"learning_rate": 0.000201, |
|
"loss": 2.1977, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01801477211313277, |
|
"eval_loss": 2.32846999168396, |
|
"eval_runtime": 63.5256, |
|
"eval_samples_per_second": 18.402, |
|
"eval_steps_per_second": 4.612, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.021617726535759323, |
|
"grad_norm": 0.6720779538154602, |
|
"learning_rate": 0.00020075518705111234, |
|
"loss": 2.4082, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.025220680958385876, |
|
"grad_norm": 0.6647239327430725, |
|
"learning_rate": 0.00020002194090852784, |
|
"loss": 2.2645, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02882363538101243, |
|
"grad_norm": 0.6222731471061707, |
|
"learning_rate": 0.00019880383387374748, |
|
"loss": 2.2457, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.03242658980363899, |
|
"grad_norm": 0.6799861192703247, |
|
"learning_rate": 0.00019710680044180106, |
|
"loss": 2.0081, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.03602954422626554, |
|
"grad_norm": 1.6060737371444702, |
|
"learning_rate": 0.0001949391083889838, |
|
"loss": 2.1461, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03602954422626554, |
|
"eval_loss": 2.2165262699127197, |
|
"eval_runtime": 63.3419, |
|
"eval_samples_per_second": 18.455, |
|
"eval_steps_per_second": 4.626, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.039632498648892095, |
|
"grad_norm": 0.6089875102043152, |
|
"learning_rate": 0.00019231131849308138, |
|
"loss": 2.2921, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.043235453071518645, |
|
"grad_norm": 0.6623978614807129, |
|
"learning_rate": 0.00018923623308232218, |
|
"loss": 2.1825, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0468384074941452, |
|
"grad_norm": 0.6278699040412903, |
|
"learning_rate": 0.00018572883366372081, |
|
"loss": 2.1697, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.05044136191677175, |
|
"grad_norm": 0.6262252330780029, |
|
"learning_rate": 0.00018180620793468224, |
|
"loss": 1.9721, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05404431633939831, |
|
"grad_norm": 1.050173282623291, |
|
"learning_rate": 0.00017748746653345728, |
|
"loss": 2.068, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05404431633939831, |
|
"eval_loss": 2.16098952293396, |
|
"eval_runtime": 63.2967, |
|
"eval_samples_per_second": 18.469, |
|
"eval_steps_per_second": 4.629, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05764727076202486, |
|
"grad_norm": 0.6275399327278137, |
|
"learning_rate": 0.00017279364993403443, |
|
"loss": 2.2454, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.06125022518465142, |
|
"grad_norm": 0.6378840208053589, |
|
"learning_rate": 0.00016774762593906525, |
|
"loss": 2.1606, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.06485317960727797, |
|
"grad_norm": 0.6081838011741638, |
|
"learning_rate": 0.00016237397827022866, |
|
"loss": 2.0971, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.06845613402990453, |
|
"grad_norm": 0.5905754566192627, |
|
"learning_rate": 0.00015669888679881007, |
|
"loss": 1.9485, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.07205908845253108, |
|
"grad_norm": 1.0750936269760132, |
|
"learning_rate": 0.00015075, |
|
"loss": 2.0445, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07205908845253108, |
|
"eval_loss": 2.1002931594848633, |
|
"eval_runtime": 63.4836, |
|
"eval_samples_per_second": 18.414, |
|
"eval_steps_per_second": 4.615, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07566204287515763, |
|
"grad_norm": 0.6942563652992249, |
|
"learning_rate": 0.00014455630025230227, |
|
"loss": 2.1977, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.07926499729778419, |
|
"grad_norm": 0.6279932856559753, |
|
"learning_rate": 0.00013814796263829918, |
|
"loss": 2.1514, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.08286795172041074, |
|
"grad_norm": 0.5558415055274963, |
|
"learning_rate": 0.00013155620793468223, |
|
"loss": 2.0617, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.08647090614303729, |
|
"grad_norm": 0.532662034034729, |
|
"learning_rate": 0.0001248131505077666, |
|
"loss": 1.904, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.09007386056566384, |
|
"grad_norm": 1.1002955436706543, |
|
"learning_rate": 0.00011795164185552652, |
|
"loss": 1.9553, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.09007386056566384, |
|
"eval_loss": 2.065007448196411, |
|
"eval_runtime": 63.3385, |
|
"eval_samples_per_second": 18.456, |
|
"eval_steps_per_second": 4.626, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0936768149882904, |
|
"grad_norm": 0.5946807861328125, |
|
"learning_rate": 0.00011100511055839919, |
|
"loss": 2.193, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.09727976941091696, |
|
"grad_norm": 0.5511180758476257, |
|
"learning_rate": 0.00010400739941860137, |
|
"loss": 2.0602, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.1008827238335435, |
|
"grad_norm": 0.5188409090042114, |
|
"learning_rate": 9.699260058139868e-05, |
|
"loss": 2.0481, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.10448567825617006, |
|
"grad_norm": 0.5795491337776184, |
|
"learning_rate": 8.999488944160085e-05, |
|
"loss": 1.8138, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.10808863267879662, |
|
"grad_norm": 0.9792818427085876, |
|
"learning_rate": 8.30483581444735e-05, |
|
"loss": 1.8413, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.10808863267879662, |
|
"eval_loss": 2.025852680206299, |
|
"eval_runtime": 63.3985, |
|
"eval_samples_per_second": 18.439, |
|
"eval_steps_per_second": 4.622, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11169158710142317, |
|
"grad_norm": 0.5762483477592468, |
|
"learning_rate": 7.618684949223341e-05, |
|
"loss": 2.1275, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.11529454152404972, |
|
"grad_norm": 0.5712476968765259, |
|
"learning_rate": 6.94437920653178e-05, |
|
"loss": 2.0378, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.11889749594667627, |
|
"grad_norm": 0.5272972583770752, |
|
"learning_rate": 6.285203736170084e-05, |
|
"loss": 1.9894, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.12250045036930284, |
|
"grad_norm": 0.6240211129188538, |
|
"learning_rate": 5.6443699747697714e-05, |
|
"loss": 1.8432, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.12610340479192939, |
|
"grad_norm": 0.9896277785301208, |
|
"learning_rate": 5.025000000000002e-05, |
|
"loss": 1.8631, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.12610340479192939, |
|
"eval_loss": 1.9985251426696777, |
|
"eval_runtime": 63.5342, |
|
"eval_samples_per_second": 18.4, |
|
"eval_steps_per_second": 4.612, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.12970635921455595, |
|
"grad_norm": 0.5958810448646545, |
|
"learning_rate": 4.430111320118996e-05, |
|
"loss": 2.1226, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.13330931363718249, |
|
"grad_norm": 0.5658702254295349, |
|
"learning_rate": 3.862602172977134e-05, |
|
"loss": 2.008, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.13691226805980905, |
|
"grad_norm": 0.5407539010047913, |
|
"learning_rate": 3.325237406093478e-05, |
|
"loss": 1.9236, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1405152224824356, |
|
"grad_norm": 0.521379292011261, |
|
"learning_rate": 2.820635006596558e-05, |
|
"loss": 1.7738, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.14411817690506215, |
|
"grad_norm": 1.012351393699646, |
|
"learning_rate": 2.351253346654272e-05, |
|
"loss": 1.8504, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14411817690506215, |
|
"eval_loss": 1.971751093864441, |
|
"eval_runtime": 63.2383, |
|
"eval_samples_per_second": 18.486, |
|
"eval_steps_per_second": 4.633, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.14772113132768872, |
|
"grad_norm": 0.5679916143417358, |
|
"learning_rate": 1.9193792065317794e-05, |
|
"loss": 2.0539, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.15132408575031525, |
|
"grad_norm": 0.5590336918830872, |
|
"learning_rate": 1.5271166336279193e-05, |
|
"loss": 2.0377, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.15492704017294182, |
|
"grad_norm": 0.5411215424537659, |
|
"learning_rate": 1.1763766917677837e-05, |
|
"loss": 1.9581, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.15852999459556838, |
|
"grad_norm": 0.5810568928718567, |
|
"learning_rate": 8.688681506918602e-06, |
|
"loss": 1.8081, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.16213294901819492, |
|
"grad_norm": 1.0498058795928955, |
|
"learning_rate": 6.060891611016215e-06, |
|
"loss": 1.8513, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.16213294901819492, |
|
"eval_loss": 1.9625978469848633, |
|
"eval_runtime": 63.3636, |
|
"eval_samples_per_second": 18.449, |
|
"eval_steps_per_second": 4.624, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.16573590344082148, |
|
"grad_norm": 0.5630381107330322, |
|
"learning_rate": 3.893199558198952e-06, |
|
"loss": 2.0962, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.16933885786344802, |
|
"grad_norm": 0.5653451681137085, |
|
"learning_rate": 2.1961661262525285e-06, |
|
"loss": 1.948, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.17294181228607458, |
|
"grad_norm": 0.528786838054657, |
|
"learning_rate": 9.780590914721787e-07, |
|
"loss": 1.9981, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.17654476670870115, |
|
"grad_norm": 0.5580498576164246, |
|
"learning_rate": 2.4481294888766817e-07, |
|
"loss": 1.8072, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.18014772113132768, |
|
"grad_norm": 1.0382150411605835, |
|
"learning_rate": 0.0, |
|
"loss": 1.8913, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.18014772113132768, |
|
"eval_loss": 1.9572868347167969, |
|
"eval_runtime": 63.3164, |
|
"eval_samples_per_second": 18.463, |
|
"eval_steps_per_second": 4.628, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.000556687327232e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|