{ "best_metric": 0.864070475101471, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.04722327162825841, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.444654325651681e-05, "eval_loss": 1.392892837524414, "eval_runtime": 256.0791, "eval_samples_per_second": 17.409, "eval_steps_per_second": 4.354, "step": 1 }, { "epoch": 0.0009444654325651682, "grad_norm": 0.7860835790634155, "learning_rate": 4.36e-05, "loss": 1.2044, "step": 10 }, { "epoch": 0.0018889308651303363, "grad_norm": 1.0544121265411377, "learning_rate": 8.72e-05, "loss": 1.1564, "step": 20 }, { "epoch": 0.0028333962976955043, "grad_norm": 1.270194411277771, "learning_rate": 0.0001308, "loss": 1.0678, "step": 30 }, { "epoch": 0.0037778617302606727, "grad_norm": 1.6109638214111328, "learning_rate": 0.0001744, "loss": 0.9977, "step": 40 }, { "epoch": 0.004722327162825841, "grad_norm": 2.586062431335449, "learning_rate": 0.000218, "loss": 0.9803, "step": 50 }, { "epoch": 0.004722327162825841, "eval_loss": 1.0977070331573486, "eval_runtime": 256.1997, "eval_samples_per_second": 17.4, "eval_steps_per_second": 4.352, "step": 50 }, { "epoch": 0.0056667925953910086, "grad_norm": 0.8779242634773254, "learning_rate": 0.00021773448147832086, "loss": 1.0668, "step": 60 }, { "epoch": 0.0066112580279561765, "grad_norm": 1.3890700340270996, "learning_rate": 0.0002169392194928312, "loss": 1.0106, "step": 70 }, { "epoch": 0.007555723460521345, "grad_norm": 0.963046669960022, "learning_rate": 0.00021561808847998484, "loss": 1.0095, "step": 80 }, { "epoch": 0.008500188893086513, "grad_norm": 1.1754647493362427, "learning_rate": 0.00021377752485727676, "loss": 0.9492, "step": 90 }, { "epoch": 0.009444654325651681, "grad_norm": 1.9377000331878662, "learning_rate": 0.00021142649566566402, "loss": 0.7452, "step": 100 }, { "epoch": 0.009444654325651681, "eval_loss": 1.0563247203826904, "eval_runtime": 256.3063, "eval_samples_per_second": 17.393, "eval_steps_per_second": 4.35, "step": 100 }, { "epoch": 0.01038911975821685, "grad_norm": 0.8509911894798279, "learning_rate": 0.0002085764548830435, "loss": 1.0203, "step": 110 }, { "epoch": 0.011333585190782017, "grad_norm": 0.8470553755760193, "learning_rate": 0.00020524128762162305, "loss": 0.9954, "step": 120 }, { "epoch": 0.012278050623347185, "grad_norm": 0.9998864531517029, "learning_rate": 0.00020143724248105043, "loss": 0.9616, "step": 130 }, { "epoch": 0.013222516055912353, "grad_norm": 1.3236018419265747, "learning_rate": 0.0001971828523868693, "loss": 1.0109, "step": 140 }, { "epoch": 0.014166981488477521, "grad_norm": 1.5146818161010742, "learning_rate": 0.0001924988442999686, "loss": 0.8894, "step": 150 }, { "epoch": 0.014166981488477521, "eval_loss": 1.04743492603302, "eval_runtime": 256.4077, "eval_samples_per_second": 17.386, "eval_steps_per_second": 4.349, "step": 150 }, { "epoch": 0.01511144692104269, "grad_norm": 0.7381725907325745, "learning_rate": 0.00018740803823691298, "loss": 1.0541, "step": 160 }, { "epoch": 0.016055912353607857, "grad_norm": 0.8298458456993103, "learning_rate": 0.00018193523609311556, "loss": 0.9556, "step": 170 }, { "epoch": 0.017000377786173027, "grad_norm": 1.0353724956512451, "learning_rate": 0.00017610710081049675, "loss": 0.877, "step": 180 }, { "epoch": 0.017944843218738193, "grad_norm": 1.3693569898605347, "learning_rate": 0.00016995202647831142, "loss": 0.9031, "step": 190 }, { "epoch": 0.018889308651303362, "grad_norm": 1.9539626836776733, "learning_rate": 0.00016350000000000002, "loss": 0.878, "step": 200 }, { "epoch": 0.018889308651303362, "eval_loss": 1.0043987035751343, "eval_runtime": 256.9248, "eval_samples_per_second": 17.351, "eval_steps_per_second": 4.34, "step": 200 }, { "epoch": 0.019833774083868532, "grad_norm": 0.7500446438789368, "learning_rate": 0.00015678245500000943, "loss": 1.0322, "step": 210 }, { "epoch": 0.0207782395164337, "grad_norm": 0.8743635416030884, "learning_rate": 0.00014983211868233444, "loss": 0.9676, "step": 220 }, { "epoch": 0.021722704948998868, "grad_norm": 0.9597961902618408, "learning_rate": 0.00014268285238686927, "loss": 1.0018, "step": 230 }, { "epoch": 0.022667170381564034, "grad_norm": 1.6248764991760254, "learning_rate": 0.00013536948662036378, "loss": 0.8842, "step": 240 }, { "epoch": 0.023611635814129204, "grad_norm": 2.130323886871338, "learning_rate": 0.00012792765136569544, "loss": 0.8014, "step": 250 }, { "epoch": 0.023611635814129204, "eval_loss": 0.9553887248039246, "eval_runtime": 256.8747, "eval_samples_per_second": 17.355, "eval_steps_per_second": 4.341, "step": 250 }, { "epoch": 0.02455610124669437, "grad_norm": 0.6607593894004822, "learning_rate": 0.00012039360249617425, "loss": 0.9588, "step": 260 }, { "epoch": 0.02550056667925954, "grad_norm": 1.0234694480895996, "learning_rate": 0.00011280404514057264, "loss": 0.9644, "step": 270 }, { "epoch": 0.026445032111824706, "grad_norm": 1.1011518239974976, "learning_rate": 0.00010519595485942743, "loss": 0.8414, "step": 280 }, { "epoch": 0.027389497544389876, "grad_norm": 1.0725549459457397, "learning_rate": 9.76063975038258e-05, "loss": 0.9655, "step": 290 }, { "epoch": 0.028333962976955042, "grad_norm": 2.3623476028442383, "learning_rate": 9.00723486343046e-05, "loss": 0.7658, "step": 300 }, { "epoch": 0.028333962976955042, "eval_loss": 0.9112545847892761, "eval_runtime": 256.7605, "eval_samples_per_second": 17.362, "eval_steps_per_second": 4.343, "step": 300 }, { "epoch": 0.02927842840952021, "grad_norm": 0.5989169478416443, "learning_rate": 8.263051337963623e-05, "loss": 1.007, "step": 310 }, { "epoch": 0.03022289384208538, "grad_norm": 0.7678573131561279, "learning_rate": 7.531714761313074e-05, "loss": 0.9324, "step": 320 }, { "epoch": 0.031167359274650547, "grad_norm": 0.8592800498008728, "learning_rate": 6.816788131766559e-05, "loss": 0.8615, "step": 330 }, { "epoch": 0.032111824707215714, "grad_norm": 1.0715807676315308, "learning_rate": 6.121754499999055e-05, "loss": 0.8406, "step": 340 }, { "epoch": 0.03305629013978088, "grad_norm": 2.368269205093384, "learning_rate": 5.450000000000003e-05, "loss": 0.6401, "step": 350 }, { "epoch": 0.03305629013978088, "eval_loss": 0.8866726160049438, "eval_runtime": 257.0405, "eval_samples_per_second": 17.344, "eval_steps_per_second": 4.338, "step": 350 }, { "epoch": 0.03400075557234605, "grad_norm": 0.9228352308273315, "learning_rate": 4.804797352168861e-05, "loss": 0.9052, "step": 360 }, { "epoch": 0.03494522100491122, "grad_norm": 0.7789422273635864, "learning_rate": 4.189289918950325e-05, "loss": 0.9935, "step": 370 }, { "epoch": 0.035889686437476385, "grad_norm": 1.0351366996765137, "learning_rate": 3.606476390688449e-05, "loss": 0.9121, "step": 380 }, { "epoch": 0.036834151870041555, "grad_norm": 0.9583853483200073, "learning_rate": 3.0591961763087043e-05, "loss": 0.8531, "step": 390 }, { "epoch": 0.037778617302606725, "grad_norm": 1.4442740678787231, "learning_rate": 2.550115570003141e-05, "loss": 0.7264, "step": 400 }, { "epoch": 0.037778617302606725, "eval_loss": 0.8729479908943176, "eval_runtime": 256.8229, "eval_samples_per_second": 17.358, "eval_steps_per_second": 4.342, "step": 400 }, { "epoch": 0.038723082735171895, "grad_norm": 0.7448956966400146, "learning_rate": 2.081714761313074e-05, "loss": 0.917, "step": 410 }, { "epoch": 0.039667548167737064, "grad_norm": 0.9566893577575684, "learning_rate": 1.656275751894957e-05, "loss": 1.004, "step": 420 }, { "epoch": 0.04061201360030223, "grad_norm": 0.772722065448761, "learning_rate": 1.275871237837696e-05, "loss": 0.8142, "step": 430 }, { "epoch": 0.0415564790328674, "grad_norm": 1.1294012069702148, "learning_rate": 9.423545116956494e-06, "loss": 0.7214, "step": 440 }, { "epoch": 0.042500944465432566, "grad_norm": 3.888120174407959, "learning_rate": 6.573504334335994e-06, "loss": 0.7121, "step": 450 }, { "epoch": 0.042500944465432566, "eval_loss": 0.864070475101471, "eval_runtime": 257.296, "eval_samples_per_second": 17.326, "eval_steps_per_second": 4.334, "step": 450 }, { "epoch": 0.043445409897997736, "grad_norm": 0.5827313661575317, "learning_rate": 4.22247514272324e-06, "loss": 0.9635, "step": 460 }, { "epoch": 0.0443898753305629, "grad_norm": 0.8031895756721497, "learning_rate": 2.38191152001518e-06, "loss": 0.9467, "step": 470 }, { "epoch": 0.04533434076312807, "grad_norm": 0.6544033288955688, "learning_rate": 1.0607805071688306e-06, "loss": 0.8662, "step": 480 }, { "epoch": 0.04627880619569324, "grad_norm": 1.268267273902893, "learning_rate": 2.655185216791625e-07, "loss": 0.8408, "step": 490 }, { "epoch": 0.04722327162825841, "grad_norm": 2.5217607021331787, "learning_rate": 0.0, "loss": 0.7067, "step": 500 }, { "epoch": 0.04722327162825841, "eval_loss": 0.8645666837692261, "eval_runtime": 256.669, "eval_samples_per_second": 17.369, "eval_steps_per_second": 4.344, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.4194585739264e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }