{ "best_metric": 0.5222252011299133, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.09402914903620123, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00018805829807240243, "eval_loss": 1.0244735479354858, "eval_runtime": 109.8257, "eval_samples_per_second": 20.387, "eval_steps_per_second": 5.099, "step": 1 }, { "epoch": 0.0018805829807240243, "grad_norm": 1.325929880142212, "learning_rate": 4.36e-05, "loss": 1.1597, "step": 10 }, { "epoch": 0.0037611659614480487, "grad_norm": 1.204135775566101, "learning_rate": 8.72e-05, "loss": 0.7094, "step": 20 }, { "epoch": 0.005641748942172073, "grad_norm": 0.9052909016609192, "learning_rate": 0.0001308, "loss": 0.5569, "step": 30 }, { "epoch": 0.007522331922896097, "grad_norm": 0.8271329998970032, "learning_rate": 0.0001744, "loss": 0.5544, "step": 40 }, { "epoch": 0.009402914903620122, "grad_norm": 1.0857207775115967, "learning_rate": 0.000218, "loss": 0.6082, "step": 50 }, { "epoch": 0.009402914903620122, "eval_loss": 0.6114992499351501, "eval_runtime": 109.3954, "eval_samples_per_second": 20.467, "eval_steps_per_second": 5.119, "step": 50 }, { "epoch": 0.011283497884344146, "grad_norm": 0.8605771064758301, "learning_rate": 0.00021773448147832086, "loss": 0.8378, "step": 60 }, { "epoch": 0.013164080865068171, "grad_norm": 0.8328661322593689, "learning_rate": 0.0002169392194928312, "loss": 0.5875, "step": 70 }, { "epoch": 0.015044663845792195, "grad_norm": 0.7275546193122864, "learning_rate": 0.00021561808847998484, "loss": 0.5119, "step": 80 }, { "epoch": 0.01692524682651622, "grad_norm": 1.0191245079040527, "learning_rate": 0.00021377752485727676, "loss": 0.5626, "step": 90 }, { "epoch": 0.018805829807240243, "grad_norm": 0.8722258806228638, "learning_rate": 0.00021142649566566402, "loss": 0.4782, "step": 100 }, { "epoch": 0.018805829807240243, "eval_loss": 0.5893458724021912, "eval_runtime": 109.6174, "eval_samples_per_second": 20.426, "eval_steps_per_second": 5.109, "step": 100 }, { "epoch": 0.020686412787964268, "grad_norm": 0.8007014989852905, "learning_rate": 0.0002085764548830435, "loss": 0.7676, "step": 110 }, { "epoch": 0.022566995768688293, "grad_norm": 0.765451192855835, "learning_rate": 0.00020524128762162305, "loss": 0.6089, "step": 120 }, { "epoch": 0.024447578749412318, "grad_norm": 0.775343656539917, "learning_rate": 0.00020143724248105043, "loss": 0.495, "step": 130 }, { "epoch": 0.026328161730136343, "grad_norm": 0.856227695941925, "learning_rate": 0.0001971828523868693, "loss": 0.5466, "step": 140 }, { "epoch": 0.028208744710860368, "grad_norm": 0.7619214653968811, "learning_rate": 0.0001924988442999686, "loss": 0.4931, "step": 150 }, { "epoch": 0.028208744710860368, "eval_loss": 0.582227885723114, "eval_runtime": 109.3257, "eval_samples_per_second": 20.48, "eval_steps_per_second": 5.122, "step": 150 }, { "epoch": 0.03008932769158439, "grad_norm": 0.756696343421936, "learning_rate": 0.00018740803823691298, "loss": 0.6907, "step": 160 }, { "epoch": 0.031969910672308414, "grad_norm": 0.7834084033966064, "learning_rate": 0.00018193523609311556, "loss": 0.5946, "step": 170 }, { "epoch": 0.03385049365303244, "grad_norm": 0.7365180253982544, "learning_rate": 0.00017610710081049675, "loss": 0.507, "step": 180 }, { "epoch": 0.035731076633756464, "grad_norm": 0.8306977152824402, "learning_rate": 0.00016995202647831142, "loss": 0.5288, "step": 190 }, { "epoch": 0.037611659614480486, "grad_norm": 
0.9597771763801575, "learning_rate": 0.00016350000000000002, "loss": 0.4864, "step": 200 }, { "epoch": 0.037611659614480486, "eval_loss": 0.5659375786781311, "eval_runtime": 109.741, "eval_samples_per_second": 20.403, "eval_steps_per_second": 5.103, "step": 200 }, { "epoch": 0.039492242595204514, "grad_norm": 0.737483024597168, "learning_rate": 0.00015678245500000943, "loss": 0.709, "step": 210 }, { "epoch": 0.041372825575928536, "grad_norm": 0.7262437343597412, "learning_rate": 0.00014983211868233444, "loss": 0.5847, "step": 220 }, { "epoch": 0.043253408556652564, "grad_norm": 0.8214493989944458, "learning_rate": 0.00014268285238686927, "loss": 0.5337, "step": 230 }, { "epoch": 0.045133991537376586, "grad_norm": 0.7294580936431885, "learning_rate": 0.00013536948662036378, "loss": 0.4972, "step": 240 }, { "epoch": 0.047014574518100614, "grad_norm": 0.8281709551811218, "learning_rate": 0.00012792765136569544, "loss": 0.4884, "step": 250 }, { "epoch": 0.047014574518100614, "eval_loss": 0.5536655783653259, "eval_runtime": 109.8147, "eval_samples_per_second": 20.389, "eval_steps_per_second": 5.1, "step": 250 }, { "epoch": 0.048895157498824636, "grad_norm": 0.7920747995376587, "learning_rate": 0.00012039360249617425, "loss": 0.6937, "step": 260 }, { "epoch": 0.05077574047954866, "grad_norm": 0.7113749980926514, "learning_rate": 0.00011280404514057264, "loss": 0.5313, "step": 270 }, { "epoch": 0.052656323460272686, "grad_norm": 0.7820623517036438, "learning_rate": 0.00010519595485942743, "loss": 0.5391, "step": 280 }, { "epoch": 0.05453690644099671, "grad_norm": 0.762988805770874, "learning_rate": 9.76063975038258e-05, "loss": 0.5062, "step": 290 }, { "epoch": 0.056417489421720736, "grad_norm": 1.0487847328186035, "learning_rate": 9.00723486343046e-05, "loss": 0.4998, "step": 300 }, { "epoch": 0.056417489421720736, "eval_loss": 0.5473635196685791, "eval_runtime": 109.2505, "eval_samples_per_second": 20.494, "eval_steps_per_second": 5.126, "step": 300 }, { "epoch": 0.05829807240244476, "grad_norm": 0.7474403977394104, "learning_rate": 8.263051337963623e-05, "loss": 0.7482, "step": 310 }, { "epoch": 0.06017865538316878, "grad_norm": 0.824717104434967, "learning_rate": 7.531714761313074e-05, "loss": 0.5858, "step": 320 }, { "epoch": 0.06205923836389281, "grad_norm": 0.6236701011657715, "learning_rate": 6.816788131766559e-05, "loss": 0.5217, "step": 330 }, { "epoch": 0.06393982134461683, "grad_norm": 0.8206383585929871, "learning_rate": 6.121754499999055e-05, "loss": 0.4875, "step": 340 }, { "epoch": 0.06582040432534085, "grad_norm": 0.7857638001441956, "learning_rate": 5.450000000000003e-05, "loss": 0.4591, "step": 350 }, { "epoch": 0.06582040432534085, "eval_loss": 0.5349125862121582, "eval_runtime": 110.1428, "eval_samples_per_second": 20.328, "eval_steps_per_second": 5.084, "step": 350 }, { "epoch": 0.06770098730606489, "grad_norm": 0.721478283405304, "learning_rate": 4.804797352168861e-05, "loss": 0.6473, "step": 360 }, { "epoch": 0.06958157028678891, "grad_norm": 0.6118484735488892, "learning_rate": 4.189289918950325e-05, "loss": 0.5278, "step": 370 }, { "epoch": 0.07146215326751293, "grad_norm": 0.6939198970794678, "learning_rate": 3.606476390688449e-05, "loss": 0.516, "step": 380 }, { "epoch": 0.07334273624823695, "grad_norm": 0.6780189275741577, "learning_rate": 3.0591961763087043e-05, "loss": 0.5085, "step": 390 }, { "epoch": 0.07522331922896097, "grad_norm": 0.786106288433075, "learning_rate": 2.550115570003141e-05, "loss": 0.4749, "step": 400 }, { "epoch": 0.07522331922896097, 
"eval_loss": 0.5289891362190247, "eval_runtime": 109.4848, "eval_samples_per_second": 20.45, "eval_steps_per_second": 5.115, "step": 400 }, { "epoch": 0.07710390220968501, "grad_norm": 0.6698129773139954, "learning_rate": 2.081714761313074e-05, "loss": 0.7042, "step": 410 }, { "epoch": 0.07898448519040903, "grad_norm": 0.6338317394256592, "learning_rate": 1.656275751894957e-05, "loss": 0.5512, "step": 420 }, { "epoch": 0.08086506817113305, "grad_norm": 0.6592607498168945, "learning_rate": 1.275871237837696e-05, "loss": 0.4842, "step": 430 }, { "epoch": 0.08274565115185707, "grad_norm": 0.6793785095214844, "learning_rate": 9.423545116956494e-06, "loss": 0.4469, "step": 440 }, { "epoch": 0.0846262341325811, "grad_norm": 0.8718426823616028, "learning_rate": 6.573504334335994e-06, "loss": 0.4685, "step": 450 }, { "epoch": 0.0846262341325811, "eval_loss": 0.5236806869506836, "eval_runtime": 109.5695, "eval_samples_per_second": 20.435, "eval_steps_per_second": 5.111, "step": 450 }, { "epoch": 0.08650681711330513, "grad_norm": 1.0036578178405762, "learning_rate": 4.22247514272324e-06, "loss": 0.6469, "step": 460 }, { "epoch": 0.08838740009402915, "grad_norm": 0.6492578983306885, "learning_rate": 2.38191152001518e-06, "loss": 0.535, "step": 470 }, { "epoch": 0.09026798307475317, "grad_norm": 0.6323084235191345, "learning_rate": 1.0607805071688306e-06, "loss": 0.51, "step": 480 }, { "epoch": 0.0921485660554772, "grad_norm": 0.7388293743133545, "learning_rate": 2.655185216791625e-07, "loss": 0.4515, "step": 490 }, { "epoch": 0.09402914903620123, "grad_norm": 0.8955107927322388, "learning_rate": 0.0, "loss": 0.4406, "step": 500 }, { "epoch": 0.09402914903620123, "eval_loss": 0.5222252011299133, "eval_runtime": 109.295, "eval_samples_per_second": 20.486, "eval_steps_per_second": 5.124, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.442856891396915e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }