{ "best_metric": 1.9448524713516235, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.7256894049346879, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001451378809869376, "eval_loss": 2.528571605682373, "eval_runtime": 24.5264, "eval_samples_per_second": 11.824, "eval_steps_per_second": 2.976, "step": 1 }, { "epoch": 0.01451378809869376, "grad_norm": 1.4658918380737305, "learning_rate": 4.2600000000000005e-05, "loss": 2.3421, "step": 10 }, { "epoch": 0.02902757619738752, "grad_norm": 1.2775057554244995, "learning_rate": 8.520000000000001e-05, "loss": 2.1831, "step": 20 }, { "epoch": 0.04354136429608128, "grad_norm": 1.5997238159179688, "learning_rate": 0.0001278, "loss": 2.2168, "step": 30 }, { "epoch": 0.05805515239477504, "grad_norm": 1.8565945625305176, "learning_rate": 0.00017040000000000002, "loss": 2.2017, "step": 40 }, { "epoch": 0.07256894049346879, "grad_norm": 5.11404275894165, "learning_rate": 0.000213, "loss": 2.2357, "step": 50 }, { "epoch": 0.07256894049346879, "eval_loss": 2.3870110511779785, "eval_runtime": 24.5991, "eval_samples_per_second": 11.789, "eval_steps_per_second": 2.968, "step": 50 }, { "epoch": 0.08708272859216255, "grad_norm": 1.203863263130188, "learning_rate": 0.00021274057135267128, "loss": 2.2284, "step": 60 }, { "epoch": 0.10159651669085631, "grad_norm": 1.2065693140029907, "learning_rate": 0.00021196354932097723, "loss": 2.1552, "step": 70 }, { "epoch": 0.11611030478955008, "grad_norm": 1.6284523010253906, "learning_rate": 0.0002106727194781503, "loss": 2.181, "step": 80 }, { "epoch": 0.13062409288824384, "grad_norm": 1.8947309255599976, "learning_rate": 0.00020887437061743096, "loss": 2.1743, "step": 90 }, { "epoch": 0.14513788098693758, "grad_norm": 3.7555911540985107, "learning_rate": 0.00020657726411369925, "loss": 2.1284, "step": 100 }, { "epoch": 0.14513788098693758, "eval_loss": 2.3372433185577393, "eval_runtime": 24.4036, "eval_samples_per_second": 11.884, "eval_steps_per_second": 2.991, "step": 100 }, { "epoch": 0.15965166908563136, "grad_norm": 1.1661285161972046, "learning_rate": 0.000203792591238937, "loss": 2.1987, "step": 110 }, { "epoch": 0.1741654571843251, "grad_norm": 1.179563045501709, "learning_rate": 0.0002005339186394757, "loss": 2.1252, "step": 120 }, { "epoch": 0.18867924528301888, "grad_norm": 1.3845571279525757, "learning_rate": 0.00019681712224065936, "loss": 2.1114, "step": 130 }, { "epoch": 0.20319303338171263, "grad_norm": 1.61588454246521, "learning_rate": 0.0001926603099009319, "loss": 2.1507, "step": 140 }, { "epoch": 0.21770682148040638, "grad_norm": 3.884338140487671, "learning_rate": 0.00018808373319217114, "loss": 2.0528, "step": 150 }, { "epoch": 0.21770682148040638, "eval_loss": 2.194847345352173, "eval_runtime": 24.5982, "eval_samples_per_second": 11.789, "eval_steps_per_second": 2.968, "step": 150 }, { "epoch": 0.23222060957910015, "grad_norm": 1.1318994760513306, "learning_rate": 0.00018310968873606635, "loss": 2.1511, "step": 160 }, { "epoch": 0.2467343976777939, "grad_norm": 1.1854448318481445, "learning_rate": 0.0001777624095772184, "loss": 2.0839, "step": 170 }, { "epoch": 0.2612481857764877, "grad_norm": 1.4225796461105347, "learning_rate": 0.0001720679471221826, "loss": 2.0992, "step": 180 }, { "epoch": 0.2757619738751814, "grad_norm": 1.514849305152893, "learning_rate": 0.00016605404421963453, "loss": 2.1068, "step": 190 }, { "epoch": 0.29027576197387517, "grad_norm": 4.458597660064697, "learning_rate": 0.00015975, "loss": 2.0656, "step": 200 }, { "epoch": 0.29027576197387517, "eval_loss": 2.1229233741760254, "eval_runtime": 24.8076, "eval_samples_per_second": 11.69, "eval_steps_per_second": 2.943, "step": 200 }, { "epoch": 0.3047895500725689, "grad_norm": 1.1364762783050537, "learning_rate": 0.00015318652713303674, "loss": 2.144, "step": 210 }, { "epoch": 0.3193033381712627, "grad_norm": 1.0806138515472412, "learning_rate": 0.00014639560219879464, "loss": 2.0368, "step": 220 }, { "epoch": 0.33381712626995647, "grad_norm": 1.2863917350769043, "learning_rate": 0.0001394103099009319, "loss": 2.0586, "step": 230 }, { "epoch": 0.3483309143686502, "grad_norm": 1.7007073163986206, "learning_rate": 0.0001322646818813646, "loss": 2.058, "step": 240 }, { "epoch": 0.36284470246734396, "grad_norm": 2.966454029083252, "learning_rate": 0.0001249935309215281, "loss": 2.0311, "step": 250 }, { "epoch": 0.36284470246734396, "eval_loss": 2.0804994106292725, "eval_runtime": 24.4819, "eval_samples_per_second": 11.845, "eval_steps_per_second": 2.982, "step": 250 }, { "epoch": 0.37735849056603776, "grad_norm": 1.0515992641448975, "learning_rate": 0.0001176322813380051, "loss": 2.0973, "step": 260 }, { "epoch": 0.3918722786647315, "grad_norm": 1.0517977476119995, "learning_rate": 0.00011021679639881638, "loss": 1.9887, "step": 270 }, { "epoch": 0.40638606676342526, "grad_norm": 1.2678823471069336, "learning_rate": 0.00010278320360118368, "loss": 2.0417, "step": 280 }, { "epoch": 0.420899854862119, "grad_norm": 1.3994240760803223, "learning_rate": 9.536771866199493e-05, "loss": 1.9758, "step": 290 }, { "epoch": 0.43541364296081275, "grad_norm": 3.492399215698242, "learning_rate": 8.800646907847192e-05, "loss": 1.9881, "step": 300 }, { "epoch": 0.43541364296081275, "eval_loss": 2.0401976108551025, "eval_runtime": 24.604, "eval_samples_per_second": 11.787, "eval_steps_per_second": 2.967, "step": 300 }, { "epoch": 0.44992743105950656, "grad_norm": 0.9537028074264526, "learning_rate": 8.07353181186354e-05, "loss": 2.0359, "step": 310 }, { "epoch": 0.4644412191582003, "grad_norm": 1.1064801216125488, "learning_rate": 7.35896900990681e-05, "loss": 1.9925, "step": 320 }, { "epoch": 0.47895500725689405, "grad_norm": 1.2894712686538696, "learning_rate": 6.660439780120536e-05, "loss": 2.0071, "step": 330 }, { "epoch": 0.4934687953555878, "grad_norm": 1.4024039506912231, "learning_rate": 5.981347286696324e-05, "loss": 2.0153, "step": 340 }, { "epoch": 0.5079825834542816, "grad_norm": 3.1581623554229736, "learning_rate": 5.325000000000002e-05, "loss": 1.9632, "step": 350 }, { "epoch": 0.5079825834542816, "eval_loss": 1.9959750175476074, "eval_runtime": 24.4071, "eval_samples_per_second": 11.882, "eval_steps_per_second": 2.991, "step": 350 }, { "epoch": 0.5224963715529753, "grad_norm": 1.0032169818878174, "learning_rate": 4.6945955780365475e-05, "loss": 2.0021, "step": 360 }, { "epoch": 0.5370101596516691, "grad_norm": 0.9521530866622925, "learning_rate": 4.0932052877817393e-05, "loss": 1.9331, "step": 370 }, { "epoch": 0.5515239477503628, "grad_norm": 1.1082749366760254, "learning_rate": 3.523759042278163e-05, "loss": 1.9542, "step": 380 }, { "epoch": 0.5660377358490566, "grad_norm": 1.272968053817749, "learning_rate": 2.989031126393367e-05, "loss": 1.9261, "step": 390 }, { "epoch": 0.5805515239477503, "grad_norm": 2.808889865875244, "learning_rate": 2.4916266807828855e-05, "loss": 1.8051, "step": 400 }, { "epoch": 0.5805515239477503, "eval_loss": 1.9632947444915771, "eval_runtime": 24.3908, "eval_samples_per_second": 11.89, "eval_steps_per_second": 2.993, "step": 400 }, { "epoch": 0.5950653120464441, "grad_norm": 0.8925508856773376, "learning_rate": 2.033969009906811e-05, "loss": 1.9807, "step": 410 }, { "epoch": 0.6095791001451378, "grad_norm": 0.901614248752594, "learning_rate": 1.6182877759340637e-05, "loss": 1.9157, "step": 420 }, { "epoch": 0.6240928882438317, "grad_norm": 1.059577226638794, "learning_rate": 1.2466081360524275e-05, "loss": 1.9516, "step": 430 }, { "epoch": 0.6386066763425254, "grad_norm": 1.138191819190979, "learning_rate": 9.207408761062996e-06, "loss": 1.9491, "step": 440 }, { "epoch": 0.6531204644412192, "grad_norm": 3.2761425971984863, "learning_rate": 6.422735886300764e-06, "loss": 1.889, "step": 450 }, { "epoch": 0.6531204644412192, "eval_loss": 1.9480090141296387, "eval_runtime": 24.632, "eval_samples_per_second": 11.773, "eval_steps_per_second": 2.964, "step": 450 }, { "epoch": 0.6676342525399129, "grad_norm": 0.9090232253074646, "learning_rate": 4.125629382569038e-06, "loss": 1.9756, "step": 460 }, { "epoch": 0.6821480406386067, "grad_norm": 0.8540071249008179, "learning_rate": 2.327280521849694e-06, "loss": 1.9429, "step": 470 }, { "epoch": 0.6966618287373004, "grad_norm": 1.0961964130401611, "learning_rate": 1.0364506790227565e-06, "loss": 1.9158, "step": 480 }, { "epoch": 0.7111756168359942, "grad_norm": 1.251514196395874, "learning_rate": 2.5942864732872295e-07, "loss": 1.9717, "step": 490 }, { "epoch": 0.7256894049346879, "grad_norm": 2.7084903717041016, "learning_rate": 0.0, "loss": 1.8717, "step": 500 }, { "epoch": 0.7256894049346879, "eval_loss": 1.9448524713516235, "eval_runtime": 24.6136, "eval_samples_per_second": 11.782, "eval_steps_per_second": 2.966, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2944932147612877e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }