{ "best_metric": 0.4650544822216034, "best_model_checkpoint": "miner_id_24/checkpoint-300", "epoch": 0.06238951856088177, "eval_steps": 50, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002079650618696059, "eval_loss": 0.9533683061599731, "eval_runtime": 156.6179, "eval_samples_per_second": 3.237, "eval_steps_per_second": 0.811, "step": 1 }, { "epoch": 0.002079650618696059, "grad_norm": 3.18047833442688, "learning_rate": 4.36e-05, "loss": 0.8342, "step": 10 }, { "epoch": 0.004159301237392118, "grad_norm": 2.9091711044311523, "learning_rate": 8.72e-05, "loss": 0.5367, "step": 20 }, { "epoch": 0.006238951856088177, "grad_norm": 12.142705917358398, "learning_rate": 0.0001308, "loss": 0.5186, "step": 30 }, { "epoch": 0.008318602474784236, "grad_norm": 1.364218831062317, "learning_rate": 0.0001744, "loss": 0.5148, "step": 40 }, { "epoch": 0.010398253093480296, "grad_norm": 2.0976734161376953, "learning_rate": 0.000218, "loss": 0.5504, "step": 50 }, { "epoch": 0.010398253093480296, "eval_loss": 0.5830835103988647, "eval_runtime": 80.9255, "eval_samples_per_second": 6.265, "eval_steps_per_second": 1.569, "step": 50 }, { "epoch": 0.012477903712176355, "grad_norm": 1.4801353216171265, "learning_rate": 0.00021773448147832086, "loss": 0.5768, "step": 60 }, { "epoch": 0.014557554330872413, "grad_norm": 1.2638306617736816, "learning_rate": 0.0002169392194928312, "loss": 0.585, "step": 70 }, { "epoch": 0.01663720494956847, "grad_norm": 1.3111450672149658, "learning_rate": 0.00021561808847998484, "loss": 0.6144, "step": 80 }, { "epoch": 0.01871685556826453, "grad_norm": 1.4548810720443726, "learning_rate": 0.00021377752485727676, "loss": 0.5459, "step": 90 }, { "epoch": 0.020796506186960592, "grad_norm": 1.1708745956420898, "learning_rate": 0.00021142649566566402, "loss": 0.6452, "step": 100 }, { "epoch": 0.020796506186960592, "eval_loss": 0.553749680519104, "eval_runtime": 80.7517, "eval_samples_per_second": 6.279, "eval_steps_per_second": 1.573, "step": 100 }, { "epoch": 0.02287615680565665, "grad_norm": 1.927892804145813, "learning_rate": 0.0002085764548830435, "loss": 0.5392, "step": 110 }, { "epoch": 0.02495580742435271, "grad_norm": 1.463425636291504, "learning_rate": 0.00020524128762162305, "loss": 0.5582, "step": 120 }, { "epoch": 0.027035458043048768, "grad_norm": 1.3318297863006592, "learning_rate": 0.00020143724248105043, "loss": 0.549, "step": 130 }, { "epoch": 0.029115108661744826, "grad_norm": 2.8513331413269043, "learning_rate": 0.0001971828523868693, "loss": 0.559, "step": 140 }, { "epoch": 0.031194759280440885, "grad_norm": 1.3817558288574219, "learning_rate": 0.0001924988442999686, "loss": 0.5554, "step": 150 }, { "epoch": 0.031194759280440885, "eval_loss": 0.5377894043922424, "eval_runtime": 80.77, "eval_samples_per_second": 6.277, "eval_steps_per_second": 1.572, "step": 150 }, { "epoch": 0.03327440989913694, "grad_norm": 1.0492221117019653, "learning_rate": 0.00018740803823691298, "loss": 0.569, "step": 160 }, { "epoch": 0.035354060517833005, "grad_norm": 1.0622763633728027, "learning_rate": 0.00018193523609311556, "loss": 0.5299, "step": 170 }, { "epoch": 0.03743371113652906, "grad_norm": 1.661616325378418, "learning_rate": 0.00017610710081049675, "loss": 0.5821, "step": 180 }, { "epoch": 0.03951336175522512, "grad_norm": 1.344832420349121, "learning_rate": 0.00016995202647831142, "loss": 0.5302, "step": 190 }, { "epoch": 0.041593012373921184, "grad_norm": 1.5419880151748657, "learning_rate": 0.00016350000000000002, "loss": 0.566, "step": 200 }, { "epoch": 0.041593012373921184, "eval_loss": 0.5337647199630737, "eval_runtime": 80.6952, "eval_samples_per_second": 6.283, "eval_steps_per_second": 1.574, "step": 200 }, { "epoch": 0.04367266299261724, "grad_norm": 1.1745060682296753, "learning_rate": 0.00015678245500000943, "loss": 0.5042, "step": 210 }, { "epoch": 0.0457523136113133, "grad_norm": 1.176100492477417, "learning_rate": 0.00014983211868233444, "loss": 0.5065, "step": 220 }, { "epoch": 0.047831964230009356, "grad_norm": 1.4111114740371704, "learning_rate": 0.00014268285238686927, "loss": 0.4751, "step": 230 }, { "epoch": 0.04991161484870542, "grad_norm": 1.2417322397232056, "learning_rate": 0.00013536948662036378, "loss": 0.5278, "step": 240 }, { "epoch": 0.051991265467401473, "grad_norm": 1.5264148712158203, "learning_rate": 0.00012792765136569544, "loss": 0.4541, "step": 250 }, { "epoch": 0.051991265467401473, "eval_loss": 0.5013260841369629, "eval_runtime": 80.7104, "eval_samples_per_second": 6.282, "eval_steps_per_second": 1.574, "step": 250 }, { "epoch": 0.054070916086097535, "grad_norm": 1.2806155681610107, "learning_rate": 0.00012039360249617425, "loss": 0.4848, "step": 260 }, { "epoch": 0.0561505667047936, "grad_norm": 0.8744197487831116, "learning_rate": 0.00011280404514057264, "loss": 0.4888, "step": 270 }, { "epoch": 0.05823021732348965, "grad_norm": 1.0379194021224976, "learning_rate": 0.00010519595485942743, "loss": 0.4923, "step": 280 }, { "epoch": 0.060309867942185715, "grad_norm": 1.0062249898910522, "learning_rate": 9.76063975038258e-05, "loss": 0.4953, "step": 290 }, { "epoch": 0.06238951856088177, "grad_norm": 1.2005985975265503, "learning_rate": 9.00723486343046e-05, "loss": 0.4437, "step": 300 }, { "epoch": 0.06238951856088177, "eval_loss": 0.4650544822216034, "eval_runtime": 80.7039, "eval_samples_per_second": 6.282, "eval_steps_per_second": 1.574, "step": 300 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.027195140656333e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }