{ "best_metric": 1.9433879852294922, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.7256894049346879, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001451378809869376, "eval_loss": 2.528571605682373, "eval_runtime": 24.4881, "eval_samples_per_second": 11.843, "eval_steps_per_second": 2.981, "step": 1 }, { "epoch": 0.01451378809869376, "grad_norm": 1.4596052169799805, "learning_rate": 4.1400000000000003e-05, "loss": 2.3431, "step": 10 }, { "epoch": 0.02902757619738752, "grad_norm": 1.2653985023498535, "learning_rate": 8.280000000000001e-05, "loss": 2.1844, "step": 20 }, { "epoch": 0.04354136429608128, "grad_norm": 1.5904498100280762, "learning_rate": 0.00012419999999999998, "loss": 2.2177, "step": 30 }, { "epoch": 0.05805515239477504, "grad_norm": 1.749571442604065, "learning_rate": 0.00016560000000000001, "loss": 2.2032, "step": 40 }, { "epoch": 0.07256894049346879, "grad_norm": 5.132608890533447, "learning_rate": 0.000207, "loss": 2.2296, "step": 50 }, { "epoch": 0.07256894049346879, "eval_loss": 2.383007764816284, "eval_runtime": 24.5963, "eval_samples_per_second": 11.79, "eval_steps_per_second": 2.968, "step": 50 }, { "epoch": 0.08708272859216255, "grad_norm": 1.2386906147003174, "learning_rate": 0.00020674787920189178, "loss": 2.2268, "step": 60 }, { "epoch": 0.10159651669085631, "grad_norm": 1.256988525390625, "learning_rate": 0.00020599274511475253, "loss": 2.1518, "step": 70 }, { "epoch": 0.11611030478955008, "grad_norm": 1.7277191877365112, "learning_rate": 0.00020473827667594888, "loss": 2.1788, "step": 80 }, { "epoch": 0.13062409288824384, "grad_norm": 1.7233295440673828, "learning_rate": 0.00020299058552961598, "loss": 2.1713, "step": 90 }, { "epoch": 0.14513788098693758, "grad_norm": 3.663735866546631, "learning_rate": 0.00020075818625134152, "loss": 2.1251, "step": 100 }, { "epoch": 0.14513788098693758, "eval_loss": 2.3279521465301514, "eval_runtime": 24.3674, "eval_samples_per_second": 11.901, "eval_steps_per_second": 2.996, "step": 100 }, { "epoch": 0.15965166908563136, "grad_norm": 1.2206093072891235, "learning_rate": 0.00019805195486600916, "loss": 2.1955, "step": 110 }, { "epoch": 0.1741654571843251, "grad_norm": 1.1765705347061157, "learning_rate": 0.00019488507586089894, "loss": 2.1238, "step": 120 }, { "epoch": 0.18867924528301888, "grad_norm": 1.411720871925354, "learning_rate": 0.00019127297795219008, "loss": 2.1084, "step": 130 }, { "epoch": 0.20319303338171263, "grad_norm": 1.5477889776229858, "learning_rate": 0.00018723325891780706, "loss": 2.1483, "step": 140 }, { "epoch": 0.21770682148040638, "grad_norm": 3.5689947605133057, "learning_rate": 0.0001827855998628142, "loss": 2.0485, "step": 150 }, { "epoch": 0.21770682148040638, "eval_loss": 2.181504249572754, "eval_runtime": 24.6073, "eval_samples_per_second": 11.785, "eval_steps_per_second": 2.967, "step": 150 }, { "epoch": 0.23222060957910015, "grad_norm": 1.0636928081512451, "learning_rate": 0.0001779516693350504, "loss": 2.1444, "step": 160 }, { "epoch": 0.2467343976777939, "grad_norm": 1.1345711946487427, "learning_rate": 0.00017275501775814182, "loss": 2.0798, "step": 170 }, { "epoch": 0.2612481857764877, "grad_norm": 1.450459361076355, "learning_rate": 0.00016722096269620562, "loss": 2.1037, "step": 180 }, { "epoch": 0.2757619738751814, "grad_norm": 1.63530695438385, "learning_rate": 0.00016137646550922228, "loss": 2.1009, "step": 190 }, { "epoch": 0.29027576197387517, "grad_norm": 4.483078479766846, "learning_rate": 0.00015525, "loss": 2.0577, "step": 200 }, { "epoch": 0.29027576197387517, "eval_loss": 2.118581533432007, "eval_runtime": 24.8624, "eval_samples_per_second": 11.664, "eval_steps_per_second": 2.936, "step": 200 }, { "epoch": 0.3047895500725689, "grad_norm": 1.1066410541534424, "learning_rate": 0.0001488714136926695, "loss": 2.1385, "step": 210 }, { "epoch": 0.3193033381712627, "grad_norm": 1.1471424102783203, "learning_rate": 0.0001422717824185469, "loss": 2.0349, "step": 220 }, { "epoch": 0.33381712626995647, "grad_norm": 1.2833964824676514, "learning_rate": 0.00013548325891780705, "loss": 2.0561, "step": 230 }, { "epoch": 0.3483309143686502, "grad_norm": 1.6310219764709473, "learning_rate": 0.0001285389161945656, "loss": 2.0573, "step": 240 }, { "epoch": 0.36284470246734396, "grad_norm": 2.8112406730651855, "learning_rate": 0.0001214725863885273, "loss": 2.0216, "step": 250 }, { "epoch": 0.36284470246734396, "eval_loss": 2.0795581340789795, "eval_runtime": 24.4537, "eval_samples_per_second": 11.859, "eval_steps_per_second": 2.985, "step": 250 }, { "epoch": 0.37735849056603776, "grad_norm": 1.0176899433135986, "learning_rate": 0.00011431869594820213, "loss": 2.0896, "step": 260 }, { "epoch": 0.3918722786647315, "grad_norm": 1.0335183143615723, "learning_rate": 0.00010711209790870886, "loss": 1.9858, "step": 270 }, { "epoch": 0.40638606676342526, "grad_norm": 1.2859773635864258, "learning_rate": 9.988790209129117e-05, "loss": 2.0391, "step": 280 }, { "epoch": 0.420899854862119, "grad_norm": 1.4198274612426758, "learning_rate": 9.268130405179787e-05, "loss": 1.9753, "step": 290 }, { "epoch": 0.43541364296081275, "grad_norm": 3.4623429775238037, "learning_rate": 8.55274136114727e-05, "loss": 1.9868, "step": 300 }, { "epoch": 0.43541364296081275, "eval_loss": 2.034973382949829, "eval_runtime": 24.5613, "eval_samples_per_second": 11.807, "eval_steps_per_second": 2.972, "step": 300 }, { "epoch": 0.44992743105950656, "grad_norm": 0.9694491624832153, "learning_rate": 7.84610838054344e-05, "loss": 2.0345, "step": 310 }, { "epoch": 0.4644412191582003, "grad_norm": 1.099524736404419, "learning_rate": 7.151674108219295e-05, "loss": 1.991, "step": 320 }, { "epoch": 0.47895500725689405, "grad_norm": 1.2441976070404053, "learning_rate": 6.472821758145309e-05, "loss": 2.0073, "step": 330 }, { "epoch": 0.4934687953555878, "grad_norm": 1.3850934505462646, "learning_rate": 5.8128586307330475e-05, "loss": 2.0145, "step": 340 }, { "epoch": 0.5079825834542816, "grad_norm": 3.0692977905273438, "learning_rate": 5.175000000000002e-05, "loss": 1.9566, "step": 350 }, { "epoch": 0.5079825834542816, "eval_loss": 1.9906885623931885, "eval_runtime": 24.3553, "eval_samples_per_second": 11.907, "eval_steps_per_second": 2.997, "step": 350 }, { "epoch": 0.5224963715529753, "grad_norm": 0.9822912812232971, "learning_rate": 4.5623534490777714e-05, "loss": 2.0033, "step": 360 }, { "epoch": 0.5370101596516691, "grad_norm": 0.9240527153015137, "learning_rate": 3.9779037303794365e-05, "loss": 1.933, "step": 370 }, { "epoch": 0.5515239477503628, "grad_norm": 1.1260921955108643, "learning_rate": 3.42449822418582e-05, "loss": 1.9529, "step": 380 }, { "epoch": 0.5660377358490566, "grad_norm": 1.302894949913025, "learning_rate": 2.9048330664949622e-05, "loss": 1.9263, "step": 390 }, { "epoch": 0.5805515239477503, "grad_norm": 2.6783480644226074, "learning_rate": 2.4214400137185785e-05, "loss": 1.8083, "step": 400 }, { "epoch": 0.5805515239477503, "eval_loss": 1.9605308771133423, "eval_runtime": 24.3765, "eval_samples_per_second": 11.897, "eval_steps_per_second": 2.995, "step": 400 }, { "epoch": 0.5950653120464441, "grad_norm": 0.927768886089325, "learning_rate": 1.976674108219295e-05, "loss": 1.9795, "step": 410 }, { "epoch": 0.6095791001451378, "grad_norm": 0.8885520696640015, "learning_rate": 1.572702204780991e-05, "loss": 1.9164, "step": 420 }, { "epoch": 0.6240928882438317, "grad_norm": 1.0483330488204956, "learning_rate": 1.2114924139101056e-05, "loss": 1.9517, "step": 430 }, { "epoch": 0.6386066763425254, "grad_norm": 1.1401947736740112, "learning_rate": 8.948045133990798e-06, "loss": 1.9481, "step": 440 }, { "epoch": 0.6531204644412192, "grad_norm": 3.351959705352783, "learning_rate": 6.241813748658489e-06, "loss": 1.8876, "step": 450 }, { "epoch": 0.6531204644412192, "eval_loss": 1.946034550666809, "eval_runtime": 24.5405, "eval_samples_per_second": 11.817, "eval_steps_per_second": 2.975, "step": 450 }, { "epoch": 0.6676342525399129, "grad_norm": 0.895086407661438, "learning_rate": 4.009414470383994e-06, "loss": 1.9791, "step": 460 }, { "epoch": 0.6821480406386067, "grad_norm": 0.8754801750183105, "learning_rate": 2.261723324051111e-06, "loss": 1.9452, "step": 470 }, { "epoch": 0.6966618287373004, "grad_norm": 1.139264702796936, "learning_rate": 1.0072548852474675e-06, "loss": 1.9159, "step": 480 }, { "epoch": 0.7111756168359942, "grad_norm": 1.3206005096435547, "learning_rate": 2.5212079810819554e-07, "loss": 1.9687, "step": 490 }, { "epoch": 0.7256894049346879, "grad_norm": 2.6603949069976807, "learning_rate": 0.0, "loss": 1.8801, "step": 500 }, { "epoch": 0.7256894049346879, "eval_loss": 1.9433879852294922, "eval_runtime": 24.5489, "eval_samples_per_second": 11.813, "eval_steps_per_second": 2.974, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2944932147612877e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }