{ "best_metric": 0.5226322412490845, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.09402914903620123, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00018805829807240243, "eval_loss": 1.0244735479354858, "eval_runtime": 110.7584, "eval_samples_per_second": 20.215, "eval_steps_per_second": 5.056, "step": 1 }, { "epoch": 0.0018805829807240243, "grad_norm": 1.312527060508728, "learning_rate": 4.24e-05, "loss": 1.1622, "step": 10 }, { "epoch": 0.0037611659614480487, "grad_norm": 1.2194122076034546, "learning_rate": 8.48e-05, "loss": 0.7124, "step": 20 }, { "epoch": 0.005641748942172073, "grad_norm": 0.9045761823654175, "learning_rate": 0.0001272, "loss": 0.558, "step": 30 }, { "epoch": 0.007522331922896097, "grad_norm": 0.840013325214386, "learning_rate": 0.0001696, "loss": 0.5548, "step": 40 }, { "epoch": 0.009402914903620122, "grad_norm": 1.0572164058685303, "learning_rate": 0.000212, "loss": 0.6082, "step": 50 }, { "epoch": 0.009402914903620122, "eval_loss": 0.6099416613578796, "eval_runtime": 110.8667, "eval_samples_per_second": 20.195, "eval_steps_per_second": 5.051, "step": 50 }, { "epoch": 0.011283497884344146, "grad_norm": 0.8899786472320557, "learning_rate": 0.00021174178932754136, "loss": 0.8371, "step": 60 }, { "epoch": 0.013164080865068171, "grad_norm": 0.8299662470817566, "learning_rate": 0.00021096841528660647, "loss": 0.5882, "step": 70 }, { "epoch": 0.015044663845792195, "grad_norm": 0.7784881591796875, "learning_rate": 0.0002096836456777834, "loss": 0.5139, "step": 80 }, { "epoch": 0.01692524682651622, "grad_norm": 1.0369268655776978, "learning_rate": 0.00020789373976946182, "loss": 0.5615, "step": 90 }, { "epoch": 0.018805829807240243, "grad_norm": 0.8029696941375732, "learning_rate": 0.0002056074178033063, "loss": 0.477, "step": 100 }, { "epoch": 0.018805829807240243, "eval_loss": 0.5923790335655212, "eval_runtime": 110.746, "eval_samples_per_second": 20.217, "eval_steps_per_second": 5.057, "step": 100 }, { "epoch": 0.020686412787964268, "grad_norm": 0.7454051971435547, "learning_rate": 0.00020283581851011567, "loss": 0.7692, "step": 110 }, { "epoch": 0.022566995768688293, "grad_norm": 0.7566020488739014, "learning_rate": 0.00019959244484304625, "loss": 0.6041, "step": 120 }, { "epoch": 0.024447578749412318, "grad_norm": 0.8764553070068359, "learning_rate": 0.00019589309819258114, "loss": 0.4961, "step": 130 }, { "epoch": 0.026328161730136343, "grad_norm": 0.8373245596885681, "learning_rate": 0.00019175580140374444, "loss": 0.547, "step": 140 }, { "epoch": 0.028208744710860368, "grad_norm": 0.7634709477424622, "learning_rate": 0.00018720071097061167, "loss": 0.4938, "step": 150 }, { "epoch": 0.028208744710860368, "eval_loss": 0.5824873447418213, "eval_runtime": 110.6291, "eval_samples_per_second": 20.239, "eval_steps_per_second": 5.062, "step": 150 }, { "epoch": 0.03008932769158439, "grad_norm": 0.7595392465591431, "learning_rate": 0.00018225001883589702, "loss": 0.6898, "step": 160 }, { "epoch": 0.031969910672308414, "grad_norm": 0.759728193283081, "learning_rate": 0.00017692784427403898, "loss": 0.597, "step": 170 }, { "epoch": 0.03385049365303244, "grad_norm": 0.7240564823150635, "learning_rate": 0.00017126011638451976, "loss": 0.5055, "step": 180 }, { "epoch": 0.035731076633756464, "grad_norm": 0.8146303296089172, "learning_rate": 0.00016527444776789915, "loss": 0.5259, "step": 190 }, { "epoch": 0.037611659614480486, "grad_norm": 
0.9344813823699951, "learning_rate": 0.00015900000000000002, "loss": 0.4835, "step": 200 }, { "epoch": 0.037611659614480486, "eval_loss": 0.5645079612731934, "eval_runtime": 110.3239, "eval_samples_per_second": 20.295, "eval_steps_per_second": 5.076, "step": 200 }, { "epoch": 0.039492242595204514, "grad_norm": 0.7228832244873047, "learning_rate": 0.0001524673415596422, "loss": 0.7051, "step": 210 }, { "epoch": 0.041372825575928536, "grad_norm": 0.7438748478889465, "learning_rate": 0.00014570829890208668, "loss": 0.5814, "step": 220 }, { "epoch": 0.043253408556652564, "grad_norm": 0.8777850866317749, "learning_rate": 0.00013875580140374443, "loss": 0.5303, "step": 230 }, { "epoch": 0.045133991537376586, "grad_norm": 0.7555531859397888, "learning_rate": 0.00013164372093356477, "loss": 0.4952, "step": 240 }, { "epoch": 0.047014574518100614, "grad_norm": 0.8109617233276367, "learning_rate": 0.00012440670683269464, "loss": 0.4875, "step": 250 }, { "epoch": 0.047014574518100614, "eval_loss": 0.5527585744857788, "eval_runtime": 110.855, "eval_samples_per_second": 20.198, "eval_steps_per_second": 5.052, "step": 250 }, { "epoch": 0.048895157498824636, "grad_norm": 0.7539621591567993, "learning_rate": 0.00011708001710637128, "loss": 0.6915, "step": 260 }, { "epoch": 0.05077574047954866, "grad_norm": 0.7228267788887024, "learning_rate": 0.00010969934665046512, "loss": 0.5286, "step": 270 }, { "epoch": 0.052656323460272686, "grad_norm": 0.774669885635376, "learning_rate": 0.00010230065334953492, "loss": 0.5363, "step": 280 }, { "epoch": 0.05453690644099671, "grad_norm": 0.7888806462287903, "learning_rate": 9.491998289362875e-05, "loss": 0.5055, "step": 290 }, { "epoch": 0.056417489421720736, "grad_norm": 1.0556362867355347, "learning_rate": 8.759329316730539e-05, "loss": 0.499, "step": 300 }, { "epoch": 0.056417489421720736, "eval_loss": 0.5459651947021484, "eval_runtime": 110.2865, "eval_samples_per_second": 20.302, "eval_steps_per_second": 5.078, "step": 300 }, { "epoch": 0.05829807240244476, "grad_norm": 0.7479456067085266, "learning_rate": 8.035627906643523e-05, "loss": 0.7484, "step": 310 }, { "epoch": 0.06017865538316878, "grad_norm": 0.852773129940033, "learning_rate": 7.324419859625559e-05, "loss": 0.588, "step": 320 }, { "epoch": 0.06205923836389281, "grad_norm": 0.653623640537262, "learning_rate": 6.629170109791332e-05, "loss": 0.5215, "step": 330 }, { "epoch": 0.06393982134461683, "grad_norm": 0.9154057502746582, "learning_rate": 5.9532658440357784e-05, "loss": 0.4859, "step": 340 }, { "epoch": 0.06582040432534085, "grad_norm": 0.7963242530822754, "learning_rate": 5.300000000000002e-05, "loss": 0.4577, "step": 350 }, { "epoch": 0.06582040432534085, "eval_loss": 0.5348652005195618, "eval_runtime": 110.893, "eval_samples_per_second": 20.191, "eval_steps_per_second": 5.05, "step": 350 }, { "epoch": 0.06770098730606489, "grad_norm": 0.721904456615448, "learning_rate": 4.672555223210085e-05, "loss": 0.6492, "step": 360 }, { "epoch": 0.06958157028678891, "grad_norm": 0.5901376605033875, "learning_rate": 4.073988361548022e-05, "loss": 0.5297, "step": 370 }, { "epoch": 0.07146215326751293, "grad_norm": 0.7072864770889282, "learning_rate": 3.507215572596106e-05, "loss": 0.516, "step": 380 }, { "epoch": 0.07334273624823695, "grad_norm": 0.7061730623245239, "learning_rate": 2.9749981164102997e-05, "loss": 0.507, "step": 390 }, { "epoch": 0.07522331922896097, "grad_norm": 0.810987114906311, "learning_rate": 2.479928902938834e-05, "loss": 0.4757, "step": 400 }, { "epoch": 0.07522331922896097, 
"eval_loss": 0.528462827205658, "eval_runtime": 110.7354, "eval_samples_per_second": 20.219, "eval_steps_per_second": 5.057, "step": 400 }, { "epoch": 0.07710390220968501, "grad_norm": 1.0959731340408325, "learning_rate": 2.024419859625558e-05, "loss": 0.7066, "step": 410 }, { "epoch": 0.07898448519040903, "grad_norm": 0.6210688948631287, "learning_rate": 1.610690180741885e-05, "loss": 0.5475, "step": 420 }, { "epoch": 0.08086506817113305, "grad_norm": 0.6310533881187439, "learning_rate": 1.240755515695374e-05, "loss": 0.4826, "step": 430 }, { "epoch": 0.08274565115185707, "grad_norm": 0.6733012199401855, "learning_rate": 9.164181489884296e-06, "loss": 0.4465, "step": 440 }, { "epoch": 0.0846262341325811, "grad_norm": 0.8576263785362244, "learning_rate": 6.392582196693718e-06, "loss": 0.47, "step": 450 }, { "epoch": 0.0846262341325811, "eval_loss": 0.524067223072052, "eval_runtime": 110.922, "eval_samples_per_second": 20.185, "eval_steps_per_second": 5.049, "step": 450 }, { "epoch": 0.08650681711330513, "grad_norm": 0.9640802145004272, "learning_rate": 4.106260230538197e-06, "loss": 0.6474, "step": 460 }, { "epoch": 0.08838740009402915, "grad_norm": 0.6373594403266907, "learning_rate": 2.316354322216597e-06, "loss": 0.5344, "step": 470 }, { "epoch": 0.09026798307475317, "grad_norm": 0.6497265696525574, "learning_rate": 1.0315847133935416e-06, "loss": 0.5107, "step": 480 }, { "epoch": 0.0921485660554772, "grad_norm": 0.7859962582588196, "learning_rate": 2.582106724586351e-07, "loss": 0.4527, "step": 490 }, { "epoch": 0.09402914903620123, "grad_norm": 0.8895713090896606, "learning_rate": 0.0, "loss": 0.4403, "step": 500 }, { "epoch": 0.09402914903620123, "eval_loss": 0.5226322412490845, "eval_runtime": 110.6062, "eval_samples_per_second": 20.243, "eval_steps_per_second": 5.063, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.442856891396915e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }