{ "best_metric": 0.3960544764995575, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.10398253093480295, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002079650618696059, "eval_loss": 0.9533683061599731, "eval_runtime": 156.6179, "eval_samples_per_second": 3.237, "eval_steps_per_second": 0.811, "step": 1 }, { "epoch": 0.002079650618696059, "grad_norm": 3.18047833442688, "learning_rate": 4.36e-05, "loss": 0.8342, "step": 10 }, { "epoch": 0.004159301237392118, "grad_norm": 2.9091711044311523, "learning_rate": 8.72e-05, "loss": 0.5367, "step": 20 }, { "epoch": 0.006238951856088177, "grad_norm": 12.142705917358398, "learning_rate": 0.0001308, "loss": 0.5186, "step": 30 }, { "epoch": 0.008318602474784236, "grad_norm": 1.364218831062317, "learning_rate": 0.0001744, "loss": 0.5148, "step": 40 }, { "epoch": 0.010398253093480296, "grad_norm": 2.0976734161376953, "learning_rate": 0.000218, "loss": 0.5504, "step": 50 }, { "epoch": 0.010398253093480296, "eval_loss": 0.5830835103988647, "eval_runtime": 80.9255, "eval_samples_per_second": 6.265, "eval_steps_per_second": 1.569, "step": 50 }, { "epoch": 0.012477903712176355, "grad_norm": 1.4801353216171265, "learning_rate": 0.00021773448147832086, "loss": 0.5768, "step": 60 }, { "epoch": 0.014557554330872413, "grad_norm": 1.2638306617736816, "learning_rate": 0.0002169392194928312, "loss": 0.585, "step": 70 }, { "epoch": 0.01663720494956847, "grad_norm": 1.3111450672149658, "learning_rate": 0.00021561808847998484, "loss": 0.6144, "step": 80 }, { "epoch": 0.01871685556826453, "grad_norm": 1.4548810720443726, "learning_rate": 0.00021377752485727676, "loss": 0.5459, "step": 90 }, { "epoch": 0.020796506186960592, "grad_norm": 1.1708745956420898, "learning_rate": 0.00021142649566566402, "loss": 0.6452, "step": 100 }, { "epoch": 0.020796506186960592, "eval_loss": 0.553749680519104, "eval_runtime": 80.7517, "eval_samples_per_second": 6.279, "eval_steps_per_second": 1.573, "step": 100 }, { "epoch": 0.02287615680565665, "grad_norm": 1.927892804145813, "learning_rate": 0.0002085764548830435, "loss": 0.5392, "step": 110 }, { "epoch": 0.02495580742435271, "grad_norm": 1.463425636291504, "learning_rate": 0.00020524128762162305, "loss": 0.5582, "step": 120 }, { "epoch": 0.027035458043048768, "grad_norm": 1.3318297863006592, "learning_rate": 0.00020143724248105043, "loss": 0.549, "step": 130 }, { "epoch": 0.029115108661744826, "grad_norm": 2.8513331413269043, "learning_rate": 0.0001971828523868693, "loss": 0.559, "step": 140 }, { "epoch": 0.031194759280440885, "grad_norm": 1.3817558288574219, "learning_rate": 0.0001924988442999686, "loss": 0.5554, "step": 150 }, { "epoch": 0.031194759280440885, "eval_loss": 0.5377894043922424, "eval_runtime": 80.77, "eval_samples_per_second": 6.277, "eval_steps_per_second": 1.572, "step": 150 }, { "epoch": 0.03327440989913694, "grad_norm": 1.0492221117019653, "learning_rate": 0.00018740803823691298, "loss": 0.569, "step": 160 }, { "epoch": 0.035354060517833005, "grad_norm": 1.0622763633728027, "learning_rate": 0.00018193523609311556, "loss": 0.5299, "step": 170 }, { "epoch": 0.03743371113652906, "grad_norm": 1.661616325378418, "learning_rate": 0.00017610710081049675, "loss": 0.5821, "step": 180 }, { "epoch": 0.03951336175522512, "grad_norm": 1.344832420349121, "learning_rate": 0.00016995202647831142, "loss": 0.5302, "step": 190 }, { "epoch": 0.041593012373921184, "grad_norm": 1.5419880151748657, "learning_rate": 0.00016350000000000002, "loss": 0.566, "step": 200 }, { "epoch": 0.041593012373921184, "eval_loss": 0.5337647199630737, "eval_runtime": 80.6952, "eval_samples_per_second": 6.283, "eval_steps_per_second": 1.574, "step": 200 }, { "epoch": 0.04367266299261724, "grad_norm": 1.1745060682296753, "learning_rate": 0.00015678245500000943, "loss": 0.5042, "step": 210 }, { "epoch": 0.0457523136113133, "grad_norm": 1.176100492477417, "learning_rate": 0.00014983211868233444, "loss": 0.5065, "step": 220 }, { "epoch": 0.047831964230009356, "grad_norm": 1.4111114740371704, "learning_rate": 0.00014268285238686927, "loss": 0.4751, "step": 230 }, { "epoch": 0.04991161484870542, "grad_norm": 1.2417322397232056, "learning_rate": 0.00013536948662036378, "loss": 0.5278, "step": 240 }, { "epoch": 0.051991265467401473, "grad_norm": 1.5264148712158203, "learning_rate": 0.00012792765136569544, "loss": 0.4541, "step": 250 }, { "epoch": 0.051991265467401473, "eval_loss": 0.5013260841369629, "eval_runtime": 80.7104, "eval_samples_per_second": 6.282, "eval_steps_per_second": 1.574, "step": 250 }, { "epoch": 0.054070916086097535, "grad_norm": 1.2806155681610107, "learning_rate": 0.00012039360249617425, "loss": 0.4848, "step": 260 }, { "epoch": 0.0561505667047936, "grad_norm": 0.8744197487831116, "learning_rate": 0.00011280404514057264, "loss": 0.4888, "step": 270 }, { "epoch": 0.05823021732348965, "grad_norm": 1.0379194021224976, "learning_rate": 0.00010519595485942743, "loss": 0.4923, "step": 280 }, { "epoch": 0.060309867942185715, "grad_norm": 1.0062249898910522, "learning_rate": 9.76063975038258e-05, "loss": 0.4953, "step": 290 }, { "epoch": 0.06238951856088177, "grad_norm": 1.2005985975265503, "learning_rate": 9.00723486343046e-05, "loss": 0.4437, "step": 300 }, { "epoch": 0.06238951856088177, "eval_loss": 0.4650544822216034, "eval_runtime": 80.7039, "eval_samples_per_second": 6.282, "eval_steps_per_second": 1.574, "step": 300 }, { "epoch": 0.06446916917957783, "grad_norm": 1.3032387495040894, "learning_rate": 8.263051337963623e-05, "loss": 0.4818, "step": 310 }, { "epoch": 0.06654881979827389, "grad_norm": 1.2554705142974854, "learning_rate": 7.531714761313074e-05, "loss": 0.469, "step": 320 }, { "epoch": 0.06862847041696996, "grad_norm": 1.2291592359542847, "learning_rate": 6.816788131766559e-05, "loss": 0.4692, "step": 330 }, { "epoch": 0.07070812103566601, "grad_norm": 0.8150742650032043, "learning_rate": 6.121754499999055e-05, "loss": 0.4679, "step": 340 }, { "epoch": 0.07278777165436207, "grad_norm": 0.9758222699165344, "learning_rate": 5.450000000000003e-05, "loss": 0.4195, "step": 350 }, { "epoch": 0.07278777165436207, "eval_loss": 0.4387960731983185, "eval_runtime": 80.8477, "eval_samples_per_second": 6.271, "eval_steps_per_second": 1.571, "step": 350 }, { "epoch": 0.07486742227305812, "grad_norm": 0.8487042188644409, "learning_rate": 4.804797352168861e-05, "loss": 0.4459, "step": 360 }, { "epoch": 0.07694707289175419, "grad_norm": 1.076621413230896, "learning_rate": 4.189289918950325e-05, "loss": 0.4533, "step": 370 }, { "epoch": 0.07902672351045024, "grad_norm": 0.8744729161262512, "learning_rate": 3.606476390688449e-05, "loss": 0.4404, "step": 380 }, { "epoch": 0.0811063741291463, "grad_norm": 0.8504107594490051, "learning_rate": 3.0591961763087043e-05, "loss": 0.4176, "step": 390 }, { "epoch": 0.08318602474784237, "grad_norm": 0.8338978290557861, "learning_rate": 2.550115570003141e-05, "loss": 0.4642, "step": 400 }, { "epoch": 0.08318602474784237, "eval_loss": 0.4155665934085846, "eval_runtime": 80.7198, "eval_samples_per_second": 6.281, "eval_steps_per_second": 1.573, "step": 400 }, { "epoch": 0.08526567536653842, "grad_norm": 0.9245039820671082, "learning_rate": 2.081714761313074e-05, "loss": 0.4084, "step": 410 }, { "epoch": 0.08734532598523448, "grad_norm": 0.9160316586494446, "learning_rate": 1.656275751894957e-05, "loss": 0.4022, "step": 420 }, { "epoch": 0.08942497660393053, "grad_norm": 0.9029657244682312, "learning_rate": 1.275871237837696e-05, "loss": 0.3921, "step": 430 }, { "epoch": 0.0915046272226266, "grad_norm": 0.8298301696777344, "learning_rate": 9.423545116956494e-06, "loss": 0.3997, "step": 440 }, { "epoch": 0.09358427784132266, "grad_norm": 0.8411400318145752, "learning_rate": 6.573504334335994e-06, "loss": 0.3864, "step": 450 }, { "epoch": 0.09358427784132266, "eval_loss": 0.39971646666526794, "eval_runtime": 80.8995, "eval_samples_per_second": 6.267, "eval_steps_per_second": 1.57, "step": 450 }, { "epoch": 0.09566392846001871, "grad_norm": 1.0529413223266602, "learning_rate": 4.22247514272324e-06, "loss": 0.3989, "step": 460 }, { "epoch": 0.09774357907871478, "grad_norm": 1.2673580646514893, "learning_rate": 2.38191152001518e-06, "loss": 0.4017, "step": 470 }, { "epoch": 0.09982322969741084, "grad_norm": 0.7570623755455017, "learning_rate": 1.0607805071688306e-06, "loss": 0.4142, "step": 480 }, { "epoch": 0.10190288031610689, "grad_norm": 0.9010934829711914, "learning_rate": 2.655185216791625e-07, "loss": 0.3869, "step": 490 }, { "epoch": 0.10398253093480295, "grad_norm": 1.2820464372634888, "learning_rate": 0.0, "loss": 0.4153, "step": 500 }, { "epoch": 0.10398253093480295, "eval_loss": 0.3960544764995575, "eval_runtime": 80.8174, "eval_samples_per_second": 6.273, "eval_steps_per_second": 1.571, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.694211790097613e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }