|
{ |
|
"best_metric": 1.9433879852294922, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.7256894049346879, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001451378809869376, |
|
"eval_loss": 2.528571605682373, |
|
"eval_runtime": 24.4881, |
|
"eval_samples_per_second": 11.843, |
|
"eval_steps_per_second": 2.981, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01451378809869376, |
|
"grad_norm": 1.4596052169799805, |
|
"learning_rate": 4.1400000000000003e-05, |
|
"loss": 2.3431, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02902757619738752, |
|
"grad_norm": 1.2653985023498535, |
|
"learning_rate": 8.280000000000001e-05, |
|
"loss": 2.1844, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04354136429608128, |
|
"grad_norm": 1.5904498100280762, |
|
"learning_rate": 0.00012419999999999998, |
|
"loss": 2.2177, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05805515239477504, |
|
"grad_norm": 1.749571442604065, |
|
"learning_rate": 0.00016560000000000001, |
|
"loss": 2.2032, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07256894049346879, |
|
"grad_norm": 5.132608890533447, |
|
"learning_rate": 0.000207, |
|
"loss": 2.2296, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07256894049346879, |
|
"eval_loss": 2.383007764816284, |
|
"eval_runtime": 24.5963, |
|
"eval_samples_per_second": 11.79, |
|
"eval_steps_per_second": 2.968, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08708272859216255, |
|
"grad_norm": 1.2386906147003174, |
|
"learning_rate": 0.00020674787920189178, |
|
"loss": 2.2268, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.10159651669085631, |
|
"grad_norm": 1.256988525390625, |
|
"learning_rate": 0.00020599274511475253, |
|
"loss": 2.1518, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.11611030478955008, |
|
"grad_norm": 1.7277191877365112, |
|
"learning_rate": 0.00020473827667594888, |
|
"loss": 2.1788, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.13062409288824384, |
|
"grad_norm": 1.7233295440673828, |
|
"learning_rate": 0.00020299058552961598, |
|
"loss": 2.1713, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.14513788098693758, |
|
"grad_norm": 3.663735866546631, |
|
"learning_rate": 0.00020075818625134152, |
|
"loss": 2.1251, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.14513788098693758, |
|
"eval_loss": 2.3279521465301514, |
|
"eval_runtime": 24.3674, |
|
"eval_samples_per_second": 11.901, |
|
"eval_steps_per_second": 2.996, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.15965166908563136, |
|
"grad_norm": 1.2206093072891235, |
|
"learning_rate": 0.00019805195486600916, |
|
"loss": 2.1955, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1741654571843251, |
|
"grad_norm": 1.1765705347061157, |
|
"learning_rate": 0.00019488507586089894, |
|
"loss": 2.1238, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.18867924528301888, |
|
"grad_norm": 1.411720871925354, |
|
"learning_rate": 0.00019127297795219008, |
|
"loss": 2.1084, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.20319303338171263, |
|
"grad_norm": 1.5477889776229858, |
|
"learning_rate": 0.00018723325891780706, |
|
"loss": 2.1483, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.21770682148040638, |
|
"grad_norm": 3.5689947605133057, |
|
"learning_rate": 0.0001827855998628142, |
|
"loss": 2.0485, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.21770682148040638, |
|
"eval_loss": 2.181504249572754, |
|
"eval_runtime": 24.6073, |
|
"eval_samples_per_second": 11.785, |
|
"eval_steps_per_second": 2.967, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.23222060957910015, |
|
"grad_norm": 1.0636928081512451, |
|
"learning_rate": 0.0001779516693350504, |
|
"loss": 2.1444, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2467343976777939, |
|
"grad_norm": 1.1345711946487427, |
|
"learning_rate": 0.00017275501775814182, |
|
"loss": 2.0798, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2612481857764877, |
|
"grad_norm": 1.450459361076355, |
|
"learning_rate": 0.00016722096269620562, |
|
"loss": 2.1037, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2757619738751814, |
|
"grad_norm": 1.63530695438385, |
|
"learning_rate": 0.00016137646550922228, |
|
"loss": 2.1009, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.29027576197387517, |
|
"grad_norm": 4.483078479766846, |
|
"learning_rate": 0.00015525, |
|
"loss": 2.0577, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.29027576197387517, |
|
"eval_loss": 2.118581533432007, |
|
"eval_runtime": 24.8624, |
|
"eval_samples_per_second": 11.664, |
|
"eval_steps_per_second": 2.936, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3047895500725689, |
|
"grad_norm": 1.1066410541534424, |
|
"learning_rate": 0.0001488714136926695, |
|
"loss": 2.1385, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3193033381712627, |
|
"grad_norm": 1.1471424102783203, |
|
"learning_rate": 0.0001422717824185469, |
|
"loss": 2.0349, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.33381712626995647, |
|
"grad_norm": 1.2833964824676514, |
|
"learning_rate": 0.00013548325891780705, |
|
"loss": 2.0561, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.3483309143686502, |
|
"grad_norm": 1.6310219764709473, |
|
"learning_rate": 0.0001285389161945656, |
|
"loss": 2.0573, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.36284470246734396, |
|
"grad_norm": 2.8112406730651855, |
|
"learning_rate": 0.0001214725863885273, |
|
"loss": 2.0216, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.36284470246734396, |
|
"eval_loss": 2.0795581340789795, |
|
"eval_runtime": 24.4537, |
|
"eval_samples_per_second": 11.859, |
|
"eval_steps_per_second": 2.985, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.37735849056603776, |
|
"grad_norm": 1.0176899433135986, |
|
"learning_rate": 0.00011431869594820213, |
|
"loss": 2.0896, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3918722786647315, |
|
"grad_norm": 1.0335183143615723, |
|
"learning_rate": 0.00010711209790870886, |
|
"loss": 1.9858, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.40638606676342526, |
|
"grad_norm": 1.2859773635864258, |
|
"learning_rate": 9.988790209129117e-05, |
|
"loss": 2.0391, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.420899854862119, |
|
"grad_norm": 1.4198274612426758, |
|
"learning_rate": 9.268130405179787e-05, |
|
"loss": 1.9753, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.43541364296081275, |
|
"grad_norm": 3.4623429775238037, |
|
"learning_rate": 8.55274136114727e-05, |
|
"loss": 1.9868, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.43541364296081275, |
|
"eval_loss": 2.034973382949829, |
|
"eval_runtime": 24.5613, |
|
"eval_samples_per_second": 11.807, |
|
"eval_steps_per_second": 2.972, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.44992743105950656, |
|
"grad_norm": 0.9694491624832153, |
|
"learning_rate": 7.84610838054344e-05, |
|
"loss": 2.0345, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.4644412191582003, |
|
"grad_norm": 1.099524736404419, |
|
"learning_rate": 7.151674108219295e-05, |
|
"loss": 1.991, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.47895500725689405, |
|
"grad_norm": 1.2441976070404053, |
|
"learning_rate": 6.472821758145309e-05, |
|
"loss": 2.0073, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.4934687953555878, |
|
"grad_norm": 1.3850934505462646, |
|
"learning_rate": 5.8128586307330475e-05, |
|
"loss": 2.0145, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5079825834542816, |
|
"grad_norm": 3.0692977905273438, |
|
"learning_rate": 5.175000000000002e-05, |
|
"loss": 1.9566, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5079825834542816, |
|
"eval_loss": 1.9906885623931885, |
|
"eval_runtime": 24.3553, |
|
"eval_samples_per_second": 11.907, |
|
"eval_steps_per_second": 2.997, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5224963715529753, |
|
"grad_norm": 0.9822912812232971, |
|
"learning_rate": 4.5623534490777714e-05, |
|
"loss": 2.0033, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5370101596516691, |
|
"grad_norm": 0.9240527153015137, |
|
"learning_rate": 3.9779037303794365e-05, |
|
"loss": 1.933, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5515239477503628, |
|
"grad_norm": 1.1260921955108643, |
|
"learning_rate": 3.42449822418582e-05, |
|
"loss": 1.9529, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5660377358490566, |
|
"grad_norm": 1.302894949913025, |
|
"learning_rate": 2.9048330664949622e-05, |
|
"loss": 1.9263, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5805515239477503, |
|
"grad_norm": 2.6783480644226074, |
|
"learning_rate": 2.4214400137185785e-05, |
|
"loss": 1.8083, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5805515239477503, |
|
"eval_loss": 1.9605308771133423, |
|
"eval_runtime": 24.3765, |
|
"eval_samples_per_second": 11.897, |
|
"eval_steps_per_second": 2.995, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5950653120464441, |
|
"grad_norm": 0.927768886089325, |
|
"learning_rate": 1.976674108219295e-05, |
|
"loss": 1.9795, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6095791001451378, |
|
"grad_norm": 0.8885520696640015, |
|
"learning_rate": 1.572702204780991e-05, |
|
"loss": 1.9164, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6240928882438317, |
|
"grad_norm": 1.0483330488204956, |
|
"learning_rate": 1.2114924139101056e-05, |
|
"loss": 1.9517, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6386066763425254, |
|
"grad_norm": 1.1401947736740112, |
|
"learning_rate": 8.948045133990798e-06, |
|
"loss": 1.9481, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6531204644412192, |
|
"grad_norm": 3.351959705352783, |
|
"learning_rate": 6.241813748658489e-06, |
|
"loss": 1.8876, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6531204644412192, |
|
"eval_loss": 1.946034550666809, |
|
"eval_runtime": 24.5405, |
|
"eval_samples_per_second": 11.817, |
|
"eval_steps_per_second": 2.975, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6676342525399129, |
|
"grad_norm": 0.895086407661438, |
|
"learning_rate": 4.009414470383994e-06, |
|
"loss": 1.9791, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.6821480406386067, |
|
"grad_norm": 0.8754801750183105, |
|
"learning_rate": 2.261723324051111e-06, |
|
"loss": 1.9452, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.6966618287373004, |
|
"grad_norm": 1.139264702796936, |
|
"learning_rate": 1.0072548852474675e-06, |
|
"loss": 1.9159, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7111756168359942, |
|
"grad_norm": 1.3206005096435547, |
|
"learning_rate": 2.5212079810819554e-07, |
|
"loss": 1.9687, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7256894049346879, |
|
"grad_norm": 2.6603949069976807, |
|
"learning_rate": 0.0, |
|
"loss": 1.8801, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7256894049346879, |
|
"eval_loss": 1.9433879852294922, |
|
"eval_runtime": 24.5489, |
|
"eval_samples_per_second": 11.813, |
|
"eval_steps_per_second": 2.974, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2944932147612877e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|