{
    "best_metric": 0.5226322412490845,
    "best_model_checkpoint": "miner_id_24/checkpoint-500",
    "epoch": 0.09402914903620123,
    "eval_steps": 50,
    "global_step": 500,
    "is_hyper_param_search": false,
    "is_local_process_zero": true,
    "is_world_process_zero": true,
    "log_history": [
        {
            "epoch": 0.00018805829807240243,
            "eval_loss": 1.0244735479354858,
            "eval_runtime": 110.7584,
            "eval_samples_per_second": 20.215,
            "eval_steps_per_second": 5.056,
            "step": 1
        },
        {
            "epoch": 0.0018805829807240243,
            "grad_norm": 1.312527060508728,
            "learning_rate": 4.24e-05,
            "loss": 1.1622,
            "step": 10
        },
        {
            "epoch": 0.0037611659614480487,
            "grad_norm": 1.2194122076034546,
            "learning_rate": 8.48e-05,
            "loss": 0.7124,
            "step": 20
        },
        {
            "epoch": 0.005641748942172073,
            "grad_norm": 0.9045761823654175,
            "learning_rate": 0.0001272,
            "loss": 0.558,
            "step": 30
        },
        {
            "epoch": 0.007522331922896097,
            "grad_norm": 0.840013325214386,
            "learning_rate": 0.0001696,
            "loss": 0.5548,
            "step": 40
        },
        {
            "epoch": 0.009402914903620122,
            "grad_norm": 1.0572164058685303,
            "learning_rate": 0.000212,
            "loss": 0.6082,
            "step": 50
        },
        {
            "epoch": 0.009402914903620122,
            "eval_loss": 0.6099416613578796,
            "eval_runtime": 110.8667,
            "eval_samples_per_second": 20.195,
            "eval_steps_per_second": 5.051,
            "step": 50
        },
        {
            "epoch": 0.011283497884344146,
            "grad_norm": 0.8899786472320557,
            "learning_rate": 0.00021174178932754136,
            "loss": 0.8371,
            "step": 60
        },
        {
            "epoch": 0.013164080865068171,
            "grad_norm": 0.8299662470817566,
            "learning_rate": 0.00021096841528660647,
            "loss": 0.5882,
            "step": 70
        },
        {
            "epoch": 0.015044663845792195,
            "grad_norm": 0.7784881591796875,
            "learning_rate": 0.0002096836456777834,
            "loss": 0.5139,
            "step": 80
        },
        {
            "epoch": 0.01692524682651622,
            "grad_norm": 1.0369268655776978,
            "learning_rate": 0.00020789373976946182,
            "loss": 0.5615,
            "step": 90
        },
        {
            "epoch": 0.018805829807240243,
            "grad_norm": 0.8029696941375732,
            "learning_rate": 0.0002056074178033063,
            "loss": 0.477,
            "step": 100
        },
        {
            "epoch": 0.018805829807240243,
            "eval_loss": 0.5923790335655212,
            "eval_runtime": 110.746,
            "eval_samples_per_second": 20.217,
            "eval_steps_per_second": 5.057,
            "step": 100
        },
        {
            "epoch": 0.020686412787964268,
            "grad_norm": 0.7454051971435547,
            "learning_rate": 0.00020283581851011567,
            "loss": 0.7692,
            "step": 110
        },
        {
            "epoch": 0.022566995768688293,
            "grad_norm": 0.7566020488739014,
            "learning_rate": 0.00019959244484304625,
            "loss": 0.6041,
            "step": 120
        },
        {
            "epoch": 0.024447578749412318,
            "grad_norm": 0.8764553070068359,
            "learning_rate": 0.00019589309819258114,
            "loss": 0.4961,
            "step": 130
        },
        {
            "epoch": 0.026328161730136343,
            "grad_norm": 0.8373245596885681,
            "learning_rate": 0.00019175580140374444,
            "loss": 0.547,
            "step": 140
        },
        {
            "epoch": 0.028208744710860368,
            "grad_norm": 0.7634709477424622,
            "learning_rate": 0.00018720071097061167,
            "loss": 0.4938,
            "step": 150
        },
        {
            "epoch": 0.028208744710860368,
            "eval_loss": 0.5824873447418213,
            "eval_runtime": 110.6291,
            "eval_samples_per_second": 20.239,
            "eval_steps_per_second": 5.062,
            "step": 150
        },
        {
            "epoch": 0.03008932769158439,
            "grad_norm": 0.7595392465591431,
            "learning_rate": 0.00018225001883589702,
            "loss": 0.6898,
            "step": 160
        },
        {
            "epoch": 0.031969910672308414,
            "grad_norm": 0.759728193283081,
            "learning_rate": 0.00017692784427403898,
            "loss": 0.597,
            "step": 170
        },
        {
            "epoch": 0.03385049365303244,
            "grad_norm": 0.7240564823150635,
            "learning_rate": 0.00017126011638451976,
            "loss": 0.5055,
            "step": 180
        },
        {
            "epoch": 0.035731076633756464,
            "grad_norm": 0.8146303296089172,
            "learning_rate": 0.00016527444776789915,
            "loss": 0.5259,
            "step": 190
        },
        {
            "epoch": 0.037611659614480486,
            "grad_norm": 0.9344813823699951,
            "learning_rate": 0.00015900000000000002,
            "loss": 0.4835,
            "step": 200
        },
        {
            "epoch": 0.037611659614480486,
            "eval_loss": 0.5645079612731934,
            "eval_runtime": 110.3239,
            "eval_samples_per_second": 20.295,
            "eval_steps_per_second": 5.076,
            "step": 200
        },
        {
            "epoch": 0.039492242595204514,
            "grad_norm": 0.7228832244873047,
            "learning_rate": 0.0001524673415596422,
            "loss": 0.7051,
            "step": 210
        },
        {
            "epoch": 0.041372825575928536,
            "grad_norm": 0.7438748478889465,
            "learning_rate": 0.00014570829890208668,
            "loss": 0.5814,
            "step": 220
        },
        {
            "epoch": 0.043253408556652564,
            "grad_norm": 0.8777850866317749,
            "learning_rate": 0.00013875580140374443,
            "loss": 0.5303,
            "step": 230
        },
        {
            "epoch": 0.045133991537376586,
            "grad_norm": 0.7555531859397888,
            "learning_rate": 0.00013164372093356477,
            "loss": 0.4952,
            "step": 240
        },
        {
            "epoch": 0.047014574518100614,
            "grad_norm": 0.8109617233276367,
            "learning_rate": 0.00012440670683269464,
            "loss": 0.4875,
            "step": 250
        },
        {
            "epoch": 0.047014574518100614,
            "eval_loss": 0.5527585744857788,
            "eval_runtime": 110.855,
            "eval_samples_per_second": 20.198,
            "eval_steps_per_second": 5.052,
            "step": 250
        },
        {
            "epoch": 0.048895157498824636,
            "grad_norm": 0.7539621591567993,
            "learning_rate": 0.00011708001710637128,
            "loss": 0.6915,
            "step": 260
        },
        {
            "epoch": 0.05077574047954866,
            "grad_norm": 0.7228267788887024,
            "learning_rate": 0.00010969934665046512,
            "loss": 0.5286,
            "step": 270
        },
        {
            "epoch": 0.052656323460272686,
            "grad_norm": 0.774669885635376,
            "learning_rate": 0.00010230065334953492,
            "loss": 0.5363,
            "step": 280
        },
        {
            "epoch": 0.05453690644099671,
            "grad_norm": 0.7888806462287903,
            "learning_rate": 9.491998289362875e-05,
            "loss": 0.5055,
            "step": 290
        },
        {
            "epoch": 0.056417489421720736,
            "grad_norm": 1.0556362867355347,
            "learning_rate": 8.759329316730539e-05,
            "loss": 0.499,
            "step": 300
        },
        {
            "epoch": 0.056417489421720736,
            "eval_loss": 0.5459651947021484,
            "eval_runtime": 110.2865,
            "eval_samples_per_second": 20.302,
            "eval_steps_per_second": 5.078,
            "step": 300
        },
        {
            "epoch": 0.05829807240244476,
            "grad_norm": 0.7479456067085266,
            "learning_rate": 8.035627906643523e-05,
            "loss": 0.7484,
            "step": 310
        },
        {
            "epoch": 0.06017865538316878,
            "grad_norm": 0.852773129940033,
            "learning_rate": 7.324419859625559e-05,
            "loss": 0.588,
            "step": 320
        },
        {
            "epoch": 0.06205923836389281,
            "grad_norm": 0.653623640537262,
            "learning_rate": 6.629170109791332e-05,
            "loss": 0.5215,
            "step": 330
        },
        {
            "epoch": 0.06393982134461683,
            "grad_norm": 0.9154057502746582,
            "learning_rate": 5.9532658440357784e-05,
            "loss": 0.4859,
            "step": 340
        },
        {
            "epoch": 0.06582040432534085,
            "grad_norm": 0.7963242530822754,
            "learning_rate": 5.300000000000002e-05,
            "loss": 0.4577,
            "step": 350
        },
        {
            "epoch": 0.06582040432534085,
            "eval_loss": 0.5348652005195618,
            "eval_runtime": 110.893,
            "eval_samples_per_second": 20.191,
            "eval_steps_per_second": 5.05,
            "step": 350
        },
        {
            "epoch": 0.06770098730606489,
            "grad_norm": 0.721904456615448,
            "learning_rate": 4.672555223210085e-05,
            "loss": 0.6492,
            "step": 360
        },
        {
            "epoch": 0.06958157028678891,
            "grad_norm": 0.5901376605033875,
            "learning_rate": 4.073988361548022e-05,
            "loss": 0.5297,
            "step": 370
        },
        {
            "epoch": 0.07146215326751293,
            "grad_norm": 0.7072864770889282,
            "learning_rate": 3.507215572596106e-05,
            "loss": 0.516,
            "step": 380
        },
        {
            "epoch": 0.07334273624823695,
            "grad_norm": 0.7061730623245239,
            "learning_rate": 2.9749981164102997e-05,
            "loss": 0.507,
            "step": 390
        },
        {
            "epoch": 0.07522331922896097,
            "grad_norm": 0.810987114906311,
            "learning_rate": 2.479928902938834e-05,
            "loss": 0.4757,
            "step": 400
        },
        {
            "epoch": 0.07522331922896097,
            "eval_loss": 0.528462827205658,
            "eval_runtime": 110.7354,
            "eval_samples_per_second": 20.219,
            "eval_steps_per_second": 5.057,
            "step": 400
        },
        {
            "epoch": 0.07710390220968501,
            "grad_norm": 1.0959731340408325,
            "learning_rate": 2.024419859625558e-05,
            "loss": 0.7066,
            "step": 410
        },
        {
            "epoch": 0.07898448519040903,
            "grad_norm": 0.6210688948631287,
            "learning_rate": 1.610690180741885e-05,
            "loss": 0.5475,
            "step": 420
        },
        {
            "epoch": 0.08086506817113305,
            "grad_norm": 0.6310533881187439,
            "learning_rate": 1.240755515695374e-05,
            "loss": 0.4826,
            "step": 430
        },
        {
            "epoch": 0.08274565115185707,
            "grad_norm": 0.6733012199401855,
            "learning_rate": 9.164181489884296e-06,
            "loss": 0.4465,
            "step": 440
        },
        {
            "epoch": 0.0846262341325811,
            "grad_norm": 0.8576263785362244,
            "learning_rate": 6.392582196693718e-06,
            "loss": 0.47,
            "step": 450
        },
        {
            "epoch": 0.0846262341325811,
            "eval_loss": 0.524067223072052,
            "eval_runtime": 110.922,
            "eval_samples_per_second": 20.185,
            "eval_steps_per_second": 5.049,
            "step": 450
        },
        {
            "epoch": 0.08650681711330513,
            "grad_norm": 0.9640802145004272,
            "learning_rate": 4.106260230538197e-06,
            "loss": 0.6474,
            "step": 460
        },
        {
            "epoch": 0.08838740009402915,
            "grad_norm": 0.6373594403266907,
            "learning_rate": 2.316354322216597e-06,
            "loss": 0.5344,
            "step": 470
        },
        {
            "epoch": 0.09026798307475317,
            "grad_norm": 0.6497265696525574,
            "learning_rate": 1.0315847133935416e-06,
            "loss": 0.5107,
            "step": 480
        },
        {
            "epoch": 0.0921485660554772,
            "grad_norm": 0.7859962582588196,
            "learning_rate": 2.582106724586351e-07,
            "loss": 0.4527,
            "step": 490
        },
        {
            "epoch": 0.09402914903620123,
            "grad_norm": 0.8895713090896606,
            "learning_rate": 0.0,
            "loss": 0.4403,
            "step": 500
        },
        {
            "epoch": 0.09402914903620123,
            "eval_loss": 0.5226322412490845,
            "eval_runtime": 110.6062,
            "eval_samples_per_second": 20.243,
            "eval_steps_per_second": 5.063,
            "step": 500
        }
    ],
    "logging_steps": 10,
    "max_steps": 500,
    "num_input_tokens_seen": 0,
    "num_train_epochs": 1,
    "save_steps": 50,
    "stateful_callbacks": {
        "EarlyStoppingCallback": {
            "args": {
                "early_stopping_patience": 3,
                "early_stopping_threshold": 0.0
            },
            "attributes": {
                "early_stopping_patience_counter": 0
            }
        },
        "TrainerControl": {
            "args": {
                "should_epoch_stop": false,
                "should_evaluate": false,
                "should_log": false,
                "should_save": true,
                "should_training_stop": true
            },
            "attributes": {}
        }
    },
    "total_flos": 5.442856891396915e+16,
    "train_batch_size": 4,
    "trial_name": null,
    "trial_params": null
}