{ "best_metric": 2.243114471435547, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.07436787307882994, "eval_steps": 50, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00016526194017517766, "eval_loss": 3.1149251461029053, "eval_runtime": 43.6347, "eval_samples_per_second": 58.394, "eval_steps_per_second": 14.598, "step": 1 }, { "epoch": 0.0016526194017517765, "grad_norm": 2.4044153690338135, "learning_rate": 4.36e-05, "loss": 2.4216, "step": 10 }, { "epoch": 0.003305238803503553, "grad_norm": 3.2215139865875244, "learning_rate": 8.72e-05, "loss": 2.2895, "step": 20 }, { "epoch": 0.00495785820525533, "grad_norm": 4.437511444091797, "learning_rate": 0.0001308, "loss": 2.2952, "step": 30 }, { "epoch": 0.006610477607007106, "grad_norm": 6.676159381866455, "learning_rate": 0.0001744, "loss": 2.2616, "step": 40 }, { "epoch": 0.008263097008758883, "grad_norm": 22.480907440185547, "learning_rate": 0.000218, "loss": 2.7042, "step": 50 }, { "epoch": 0.008263097008758883, "eval_loss": 2.7512803077697754, "eval_runtime": 43.5634, "eval_samples_per_second": 58.489, "eval_steps_per_second": 14.622, "step": 50 }, { "epoch": 0.00991571641051066, "grad_norm": 2.332223415374756, "learning_rate": 0.00021773448147832086, "loss": 2.3331, "step": 60 }, { "epoch": 0.011568335812262435, "grad_norm": 2.4621620178222656, "learning_rate": 0.0002169392194928312, "loss": 2.3293, "step": 70 }, { "epoch": 0.013220955214014212, "grad_norm": 3.450859546661377, "learning_rate": 0.00021561808847998484, "loss": 2.1836, "step": 80 }, { "epoch": 0.014873574615765989, "grad_norm": 7.603415489196777, "learning_rate": 0.00021377752485727676, "loss": 2.2165, "step": 90 }, { "epoch": 0.016526194017517766, "grad_norm": 19.09664535522461, "learning_rate": 0.00021142649566566402, "loss": 3.0109, "step": 100 }, { "epoch": 0.016526194017517766, "eval_loss": 2.7991316318511963, "eval_runtime": 43.5731, "eval_samples_per_second": 58.476, "eval_steps_per_second": 14.619, "step": 100 }, { "epoch": 0.01817881341926954, "grad_norm": 2.182637929916382, "learning_rate": 0.0002085764548830435, "loss": 2.4546, "step": 110 }, { "epoch": 0.01983143282102132, "grad_norm": 2.9767863750457764, "learning_rate": 0.00020524128762162305, "loss": 2.252, "step": 120 }, { "epoch": 0.021484052222773095, "grad_norm": 4.451344966888428, "learning_rate": 0.00020143724248105043, "loss": 2.1921, "step": 130 }, { "epoch": 0.02313667162452487, "grad_norm": 5.990928649902344, "learning_rate": 0.0001971828523868693, "loss": 2.0887, "step": 140 }, { "epoch": 0.02478929102627665, "grad_norm": 19.66733741760254, "learning_rate": 0.0001924988442999686, "loss": 2.5636, "step": 150 }, { "epoch": 0.02478929102627665, "eval_loss": 2.7373805046081543, "eval_runtime": 43.5877, "eval_samples_per_second": 58.457, "eval_steps_per_second": 14.614, "step": 150 }, { "epoch": 0.026441910428028424, "grad_norm": 2.0548884868621826, "learning_rate": 0.00018740803823691298, "loss": 2.4033, "step": 160 }, { "epoch": 0.028094529829780203, "grad_norm": 2.9234230518341064, "learning_rate": 0.00018193523609311556, "loss": 2.2948, "step": 170 }, { "epoch": 0.029747149231531978, "grad_norm": 3.7016618251800537, "learning_rate": 0.00017610710081049675, "loss": 2.3234, "step": 180 }, { "epoch": 0.03139976863328375, "grad_norm": 5.965432643890381, "learning_rate": 0.00016995202647831142, "loss": 2.2832, "step": 190 }, { "epoch": 0.03305238803503553, "grad_norm": 17.460437774658203, "learning_rate": 0.00016350000000000002, "loss": 3.0321, "step": 200 }, { "epoch": 0.03305238803503553, "eval_loss": 2.5573222637176514, "eval_runtime": 43.5955, "eval_samples_per_second": 58.446, "eval_steps_per_second": 14.612, "step": 200 }, { "epoch": 0.03470500743678731, "grad_norm": 2.2317683696746826, "learning_rate": 0.00015678245500000943, "loss": 2.3917, "step": 210 }, { "epoch": 0.03635762683853908, "grad_norm": 2.6078310012817383, "learning_rate": 0.00014983211868233444, "loss": 2.3408, "step": 220 }, { "epoch": 0.03801024624029086, "grad_norm": 4.730777263641357, "learning_rate": 0.00014268285238686927, "loss": 2.2385, "step": 230 }, { "epoch": 0.03966286564204264, "grad_norm": 5.267191410064697, "learning_rate": 0.00013536948662036378, "loss": 2.4694, "step": 240 }, { "epoch": 0.04131548504379441, "grad_norm": 14.102747917175293, "learning_rate": 0.00012792765136569544, "loss": 2.8198, "step": 250 }, { "epoch": 0.04131548504379441, "eval_loss": 2.482667922973633, "eval_runtime": 43.6594, "eval_samples_per_second": 58.361, "eval_steps_per_second": 14.59, "step": 250 }, { "epoch": 0.04296810444554619, "grad_norm": 2.223804473876953, "learning_rate": 0.00012039360249617425, "loss": 2.3776, "step": 260 }, { "epoch": 0.04462072384729797, "grad_norm": 2.835205078125, "learning_rate": 0.00011280404514057264, "loss": 2.2912, "step": 270 }, { "epoch": 0.04627334324904974, "grad_norm": 4.878274917602539, "learning_rate": 0.00010519595485942743, "loss": 2.1857, "step": 280 }, { "epoch": 0.04792596265080152, "grad_norm": 5.728613376617432, "learning_rate": 9.76063975038258e-05, "loss": 2.2346, "step": 290 }, { "epoch": 0.0495785820525533, "grad_norm": 16.714305877685547, "learning_rate": 9.00723486343046e-05, "loss": 2.2206, "step": 300 }, { "epoch": 0.0495785820525533, "eval_loss": 2.4008781909942627, "eval_runtime": 43.6638, "eval_samples_per_second": 58.355, "eval_steps_per_second": 14.589, "step": 300 }, { "epoch": 0.05123120145430508, "grad_norm": 2.1384823322296143, "learning_rate": 8.263051337963623e-05, "loss": 2.2682, "step": 310 }, { "epoch": 0.05288382085605685, "grad_norm": 2.6786699295043945, "learning_rate": 7.531714761313074e-05, "loss": 2.1271, "step": 320 }, { "epoch": 0.05453644025780863, "grad_norm": 4.25484037399292, "learning_rate": 6.816788131766559e-05, "loss": 2.3849, "step": 330 }, { "epoch": 0.056189059659560406, "grad_norm": 4.951492786407471, "learning_rate": 6.121754499999055e-05, "loss": 2.1332, "step": 340 }, { "epoch": 0.05784167906131218, "grad_norm": 14.745146751403809, "learning_rate": 5.450000000000003e-05, "loss": 2.9364, "step": 350 }, { "epoch": 0.05784167906131218, "eval_loss": 2.2891488075256348, "eval_runtime": 43.6713, "eval_samples_per_second": 58.345, "eval_steps_per_second": 14.586, "step": 350 }, { "epoch": 0.059494298463063956, "grad_norm": 1.9254217147827148, "learning_rate": 4.804797352168861e-05, "loss": 2.1897, "step": 360 }, { "epoch": 0.061146917864815735, "grad_norm": 2.3489015102386475, "learning_rate": 4.189289918950325e-05, "loss": 2.2346, "step": 370 }, { "epoch": 0.0627995372665675, "grad_norm": 3.7030725479125977, "learning_rate": 3.606476390688449e-05, "loss": 2.2318, "step": 380 }, { "epoch": 0.06445215666831929, "grad_norm": 5.087347507476807, "learning_rate": 3.0591961763087043e-05, "loss": 2.2742, "step": 390 }, { "epoch": 0.06610477607007106, "grad_norm": 16.04279899597168, "learning_rate": 2.550115570003141e-05, "loss": 2.5913, "step": 400 }, { "epoch": 0.06610477607007106, "eval_loss": 2.2623276710510254, "eval_runtime": 43.6913, "eval_samples_per_second": 58.318, "eval_steps_per_second": 14.58, "step": 400 }, { "epoch": 0.06775739547182284, "grad_norm": 2.179595470428467, "learning_rate": 2.081714761313074e-05, "loss": 2.2334, "step": 410 }, { "epoch": 0.06941001487357462, "grad_norm": 2.5870578289031982, "learning_rate": 1.656275751894957e-05, "loss": 2.1487, "step": 420 }, { "epoch": 0.07106263427532639, "grad_norm": 3.1885182857513428, "learning_rate": 1.275871237837696e-05, "loss": 2.0605, "step": 430 }, { "epoch": 0.07271525367707816, "grad_norm": 5.060018539428711, "learning_rate": 9.423545116956494e-06, "loss": 2.3016, "step": 440 }, { "epoch": 0.07436787307882994, "grad_norm": 12.431950569152832, "learning_rate": 6.573504334335994e-06, "loss": 2.5269, "step": 450 }, { "epoch": 0.07436787307882994, "eval_loss": 2.243114471435547, "eval_runtime": 43.9155, "eval_samples_per_second": 58.021, "eval_steps_per_second": 14.505, "step": 450 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.13234235359232e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }