{ "best_metric": 0.700386643409729, "best_model_checkpoint": "miner_id_24/checkpoint-4788", "epoch": 2.14330478378943, "eval_steps": 114, "global_step": 5130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004177982034677251, "eval_loss": 1.2215561866760254, "eval_runtime": 46.7676, "eval_samples_per_second": 86.192, "eval_steps_per_second": 2.694, "step": 1 }, { "epoch": 0.04762899519532066, "grad_norm": 0.31823158264160156, "learning_rate": 0.00039999999661909113, "loss": 1.0283, "step": 114 }, { "epoch": 0.04762899519532066, "eval_loss": 0.9293507933616638, "eval_runtime": 46.9797, "eval_samples_per_second": 85.803, "eval_steps_per_second": 2.682, "step": 114 }, { "epoch": 0.09525799039064133, "grad_norm": 0.32075753808021545, "learning_rate": 0.00039999971738368344, "loss": 0.8867, "step": 228 }, { "epoch": 0.09525799039064133, "eval_loss": 0.8753141760826111, "eval_runtime": 46.6254, "eval_samples_per_second": 86.455, "eval_steps_per_second": 2.702, "step": 228 }, { "epoch": 0.14288698558596197, "grad_norm": 0.3348535895347595, "learning_rate": 0.0003999989897990767, "loss": 0.8604, "step": 342 }, { "epoch": 0.14288698558596197, "eval_loss": 0.8479313850402832, "eval_runtime": 45.9896, "eval_samples_per_second": 87.65, "eval_steps_per_second": 2.74, "step": 342 }, { "epoch": 0.19051598078128265, "grad_norm": 0.3427460193634033, "learning_rate": 0.00039999781386690196, "loss": 0.8315, "step": 456 }, { "epoch": 0.19051598078128265, "eval_loss": 0.8264108896255493, "eval_runtime": 46.5472, "eval_samples_per_second": 86.6, "eval_steps_per_second": 2.707, "step": 456 }, { "epoch": 0.2381449759766033, "grad_norm": 0.3403862714767456, "learning_rate": 0.00039999618958979537, "loss": 0.8176, "step": 570 }, { "epoch": 0.2381449759766033, "eval_loss": 0.8114441633224487, "eval_runtime": 46.0285, "eval_samples_per_second": 87.576, "eval_steps_per_second": 2.737, "step": 570 }, { "epoch": 0.28577397117192394, "grad_norm": 0.35282206535339355, "learning_rate": 0.00039999411697139815, "loss": 0.7998, "step": 684 }, { "epoch": 0.28577397117192394, "eval_loss": 0.7995946407318115, "eval_runtime": 46.3676, "eval_samples_per_second": 86.936, "eval_steps_per_second": 2.717, "step": 684 }, { "epoch": 0.33340296636724465, "grad_norm": 0.34027719497680664, "learning_rate": 0.00039999159601635653, "loss": 0.7906, "step": 798 }, { "epoch": 0.33340296636724465, "eval_loss": 0.7885904908180237, "eval_runtime": 45.7643, "eval_samples_per_second": 88.082, "eval_steps_per_second": 2.753, "step": 798 }, { "epoch": 0.3810319615625653, "grad_norm": 0.3781284689903259, "learning_rate": 0.000399988626730322, "loss": 0.7806, "step": 912 }, { "epoch": 0.3810319615625653, "eval_loss": 0.7824249863624573, "eval_runtime": 46.3806, "eval_samples_per_second": 86.911, "eval_steps_per_second": 2.717, "step": 912 }, { "epoch": 0.42866095675788596, "grad_norm": 0.3700368106365204, "learning_rate": 0.0003999852091199508, "loss": 0.777, "step": 1026 }, { "epoch": 0.42866095675788596, "eval_loss": 0.7744758725166321, "eval_runtime": 45.8437, "eval_samples_per_second": 87.929, "eval_steps_per_second": 2.748, "step": 1026 }, { "epoch": 0.4762899519532066, "grad_norm": 0.38603395223617554, "learning_rate": 0.0003999813431929044, "loss": 0.7707, "step": 1140 }, { "epoch": 0.4762899519532066, "eval_loss": 0.7685624957084656, "eval_runtime": 46.2481, "eval_samples_per_second": 87.16, "eval_steps_per_second": 2.724, "step": 1140 }, { "epoch": 0.5239189471485273, "grad_norm": 0.3672977387905121, "learning_rate": 0.00039997702895784934, "loss": 0.7659, "step": 1254 }, { "epoch": 0.5239189471485273, "eval_loss": 0.7641800045967102, "eval_runtime": 46.2259, "eval_samples_per_second": 87.202, "eval_steps_per_second": 2.726, "step": 1254 }, { "epoch": 0.5715479423438479, "grad_norm": 0.3618619441986084, "learning_rate": 0.0003999722664244569, "loss": 0.7638, "step": 1368 }, { "epoch": 0.5715479423438479, "eval_loss": 0.7571395635604858, "eval_runtime": 46.0842, "eval_samples_per_second": 87.47, "eval_steps_per_second": 2.734, "step": 1368 }, { "epoch": 0.6191769375391686, "grad_norm": 0.35834816098213196, "learning_rate": 0.00039996705560340365, "loss": 0.7561, "step": 1482 }, { "epoch": 0.6191769375391686, "eval_loss": 0.7542693018913269, "eval_runtime": 46.3568, "eval_samples_per_second": 86.956, "eval_steps_per_second": 2.718, "step": 1482 }, { "epoch": 0.6668059327344893, "grad_norm": 0.37180235981941223, "learning_rate": 0.0003999613965063708, "loss": 0.7521, "step": 1596 }, { "epoch": 0.6668059327344893, "eval_loss": 0.7510305643081665, "eval_runtime": 46.2746, "eval_samples_per_second": 87.11, "eval_steps_per_second": 2.723, "step": 1596 }, { "epoch": 0.7144349279298099, "grad_norm": 0.3470219373703003, "learning_rate": 0.0003999552891460447, "loss": 0.7468, "step": 1710 }, { "epoch": 0.7144349279298099, "eval_loss": 0.7475996017456055, "eval_runtime": 46.1524, "eval_samples_per_second": 87.341, "eval_steps_per_second": 2.73, "step": 1710 }, { "epoch": 0.7620639231251306, "grad_norm": 0.39654770493507385, "learning_rate": 0.0003999487335361165, "loss": 0.7471, "step": 1824 }, { "epoch": 0.7620639231251306, "eval_loss": 0.7432805299758911, "eval_runtime": 46.2621, "eval_samples_per_second": 87.134, "eval_steps_per_second": 2.724, "step": 1824 }, { "epoch": 0.8096929183204512, "grad_norm": 0.3417552709579468, "learning_rate": 0.0003999417296912822, "loss": 0.7413, "step": 1938 }, { "epoch": 0.8096929183204512, "eval_loss": 0.7406983375549316, "eval_runtime": 46.2072, "eval_samples_per_second": 87.237, "eval_steps_per_second": 2.727, "step": 1938 }, { "epoch": 0.8573219135157719, "grad_norm": 0.40963587164878845, "learning_rate": 0.0003999342776272427, "loss": 0.7385, "step": 2052 }, { "epoch": 0.8573219135157719, "eval_loss": 0.7358281016349792, "eval_runtime": 46.2868, "eval_samples_per_second": 87.087, "eval_steps_per_second": 2.722, "step": 2052 }, { "epoch": 0.9049509087110925, "grad_norm": 0.37562674283981323, "learning_rate": 0.0003999263773607037, "loss": 0.7286, "step": 2166 }, { "epoch": 0.9049509087110925, "eval_loss": 0.7351658344268799, "eval_runtime": 46.0804, "eval_samples_per_second": 87.478, "eval_steps_per_second": 2.734, "step": 2166 }, { "epoch": 0.9525799039064132, "grad_norm": 0.366798460483551, "learning_rate": 0.0003999180289093755, "loss": 0.7323, "step": 2280 }, { "epoch": 0.9525799039064132, "eval_loss": 0.7323192954063416, "eval_runtime": 46.0228, "eval_samples_per_second": 87.587, "eval_steps_per_second": 2.738, "step": 2280 }, { "epoch": 1.000208899101734, "grad_norm": 0.6370559334754944, "learning_rate": 0.0003999092322919734, "loss": 0.7328, "step": 2394 }, { "epoch": 1.000208899101734, "eval_loss": 0.7303313612937927, "eval_runtime": 46.5425, "eval_samples_per_second": 86.609, "eval_steps_per_second": 2.707, "step": 2394 }, { "epoch": 1.0478378942970545, "grad_norm": 0.3973049223423004, "learning_rate": 0.0003998999875282171, "loss": 0.6772, "step": 2508 }, { "epoch": 1.0478378942970545, "eval_loss": 0.7296268343925476, "eval_runtime": 46.3229, "eval_samples_per_second": 87.02, "eval_steps_per_second": 2.72, "step": 2508 }, { "epoch": 1.0954668894923751, "grad_norm": 0.3830915689468384, "learning_rate": 0.00039989029463883104, "loss": 0.6772, "step": 2622 }, { "epoch": 1.0954668894923751, "eval_loss": 0.729572594165802, "eval_runtime": 46.4647, "eval_samples_per_second": 86.754, "eval_steps_per_second": 2.712, "step": 2622 }, { "epoch": 1.1430958846876957, "grad_norm": 0.39841774106025696, "learning_rate": 0.00039988015364554423, "loss": 0.6746, "step": 2736 }, { "epoch": 1.1430958846876957, "eval_loss": 0.7292018532752991, "eval_runtime": 46.4327, "eval_samples_per_second": 86.814, "eval_steps_per_second": 2.714, "step": 2736 }, { "epoch": 1.1907248798830166, "grad_norm": 0.381573885679245, "learning_rate": 0.00039986956457109025, "loss": 0.682, "step": 2850 }, { "epoch": 1.1907248798830166, "eval_loss": 0.7255826592445374, "eval_runtime": 46.3679, "eval_samples_per_second": 86.935, "eval_steps_per_second": 2.717, "step": 2850 }, { "epoch": 1.2383538750783372, "grad_norm": 0.43838706612586975, "learning_rate": 0.0003998585274392072, "loss": 0.69, "step": 2964 }, { "epoch": 1.2383538750783372, "eval_loss": 0.7252304553985596, "eval_runtime": 46.3392, "eval_samples_per_second": 86.989, "eval_steps_per_second": 2.719, "step": 2964 }, { "epoch": 1.2859828702736578, "grad_norm": 0.41060566902160645, "learning_rate": 0.0003998470422746375, "loss": 0.6872, "step": 3078 }, { "epoch": 1.2859828702736578, "eval_loss": 0.7243731617927551, "eval_runtime": 46.4575, "eval_samples_per_second": 86.768, "eval_steps_per_second": 2.712, "step": 3078 }, { "epoch": 1.3336118654689786, "grad_norm": 0.39319896697998047, "learning_rate": 0.0003998351091031281, "loss": 0.6921, "step": 3192 }, { "epoch": 1.3336118654689786, "eval_loss": 0.7217971682548523, "eval_runtime": 46.3727, "eval_samples_per_second": 86.926, "eval_steps_per_second": 2.717, "step": 3192 }, { "epoch": 1.3812408606642992, "grad_norm": 0.3909294903278351, "learning_rate": 0.0003998227279514301, "loss": 0.6872, "step": 3306 }, { "epoch": 1.3812408606642992, "eval_loss": 0.7193012833595276, "eval_runtime": 46.437, "eval_samples_per_second": 86.806, "eval_steps_per_second": 2.713, "step": 3306 }, { "epoch": 1.4288698558596198, "grad_norm": 0.39885425567626953, "learning_rate": 0.0003998098988472989, "loss": 0.686, "step": 3420 }, { "epoch": 1.4288698558596198, "eval_loss": 0.7186093926429749, "eval_runtime": 45.9897, "eval_samples_per_second": 87.65, "eval_steps_per_second": 2.74, "step": 3420 }, { "epoch": 1.4764988510549404, "grad_norm": 0.41706719994544983, "learning_rate": 0.00039979662181949423, "loss": 0.6857, "step": 3534 }, { "epoch": 1.4764988510549404, "eval_loss": 0.7160599827766418, "eval_runtime": 46.2419, "eval_samples_per_second": 87.172, "eval_steps_per_second": 2.725, "step": 3534 }, { "epoch": 1.524127846250261, "grad_norm": 0.4174472391605377, "learning_rate": 0.00039978289689777973, "loss": 0.6834, "step": 3648 }, { "epoch": 1.524127846250261, "eval_loss": 0.7159973978996277, "eval_runtime": 46.2251, "eval_samples_per_second": 87.204, "eval_steps_per_second": 2.726, "step": 3648 }, { "epoch": 1.5717568414455818, "grad_norm": 0.4188074767589569, "learning_rate": 0.0003997687241129234, "loss": 0.6831, "step": 3762 }, { "epoch": 1.5717568414455818, "eval_loss": 0.7150419354438782, "eval_runtime": 46.3467, "eval_samples_per_second": 86.975, "eval_steps_per_second": 2.719, "step": 3762 }, { "epoch": 1.6193858366409024, "grad_norm": 0.4279221296310425, "learning_rate": 0.0003997541034966969, "loss": 0.6892, "step": 3876 }, { "epoch": 1.6193858366409024, "eval_loss": 0.710929274559021, "eval_runtime": 46.2076, "eval_samples_per_second": 87.237, "eval_steps_per_second": 2.727, "step": 3876 }, { "epoch": 1.667014831836223, "grad_norm": 0.39455175399780273, "learning_rate": 0.00039973903508187613, "loss": 0.6778, "step": 3990 }, { "epoch": 1.667014831836223, "eval_loss": 0.7102333903312683, "eval_runtime": 46.2868, "eval_samples_per_second": 87.088, "eval_steps_per_second": 2.722, "step": 3990 }, { "epoch": 1.7146438270315438, "grad_norm": 0.38114461302757263, "learning_rate": 0.00039972351890224056, "loss": 0.6772, "step": 4104 }, { "epoch": 1.7146438270315438, "eval_loss": 0.7093734741210938, "eval_runtime": 47.654, "eval_samples_per_second": 84.589, "eval_steps_per_second": 2.644, "step": 4104 }, { "epoch": 1.7622728222268644, "grad_norm": 0.4051247239112854, "learning_rate": 0.00039970755499257367, "loss": 0.682, "step": 4218 }, { "epoch": 1.7622728222268644, "eval_loss": 0.7086232900619507, "eval_runtime": 47.2757, "eval_samples_per_second": 85.266, "eval_steps_per_second": 2.665, "step": 4218 }, { "epoch": 1.809901817422185, "grad_norm": 0.40680763125419617, "learning_rate": 0.0003996911433886625, "loss": 0.681, "step": 4332 }, { "epoch": 1.809901817422185, "eval_loss": 0.7078903317451477, "eval_runtime": 45.9606, "eval_samples_per_second": 87.706, "eval_steps_per_second": 2.741, "step": 4332 }, { "epoch": 1.8575308126175059, "grad_norm": 0.43473345041275024, "learning_rate": 0.0003996742841272978, "loss": 0.6807, "step": 4446 }, { "epoch": 1.8575308126175059, "eval_loss": 0.7049083113670349, "eval_runtime": 46.3325, "eval_samples_per_second": 87.002, "eval_steps_per_second": 2.719, "step": 4446 }, { "epoch": 1.9051598078128265, "grad_norm": 0.41520005464553833, "learning_rate": 0.0003996569772462738, "loss": 0.6839, "step": 4560 }, { "epoch": 1.9051598078128265, "eval_loss": 0.7048254609107971, "eval_runtime": 45.8431, "eval_samples_per_second": 87.93, "eval_steps_per_second": 2.749, "step": 4560 }, { "epoch": 1.952788803008147, "grad_norm": 0.3669950067996979, "learning_rate": 0.0003996392227843881, "loss": 0.6821, "step": 4674 }, { "epoch": 1.952788803008147, "eval_loss": 0.7025982141494751, "eval_runtime": 46.3532, "eval_samples_per_second": 86.963, "eval_steps_per_second": 2.718, "step": 4674 }, { "epoch": 2.000417798203468, "grad_norm": 0.42403531074523926, "learning_rate": 0.0003996210207814418, "loss": 0.6858, "step": 4788 }, { "epoch": 2.000417798203468, "eval_loss": 0.700386643409729, "eval_runtime": 45.9133, "eval_samples_per_second": 87.796, "eval_steps_per_second": 2.744, "step": 4788 }, { "epoch": 2.0480467933987883, "grad_norm": 0.41869914531707764, "learning_rate": 0.00039960237127823915, "loss": 0.6241, "step": 4902 }, { "epoch": 2.0480467933987883, "eval_loss": 0.7085886597633362, "eval_runtime": 46.2485, "eval_samples_per_second": 87.16, "eval_steps_per_second": 2.724, "step": 4902 }, { "epoch": 2.095675788594109, "grad_norm": 0.4239591658115387, "learning_rate": 0.00039958327431658785, "loss": 0.6297, "step": 5016 }, { "epoch": 2.095675788594109, "eval_loss": 0.709389865398407, "eval_runtime": 45.7774, "eval_samples_per_second": 88.057, "eval_steps_per_second": 2.752, "step": 5016 }, { "epoch": 2.14330478378943, "grad_norm": 0.4279311001300812, "learning_rate": 0.00039956372993929837, "loss": 0.6282, "step": 5130 }, { "epoch": 2.14330478378943, "eval_loss": 0.7085138559341431, "eval_runtime": 46.1621, "eval_samples_per_second": 87.323, "eval_steps_per_second": 2.73, "step": 5130 } ], "logging_steps": 114, "max_steps": 239300, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 114, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.78979848758231e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }