{ "best_metric": 2.208037853240967, "best_model_checkpoint": "miner_id_24/checkpoint-600", "epoch": 0.4193604752752053, "eval_steps": 50, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006989341254586755, "eval_loss": 2.663147449493408, "eval_runtime": 174.7211, "eval_samples_per_second": 13.793, "eval_steps_per_second": 3.451, "step": 1 }, { "epoch": 0.006989341254586755, "grad_norm": 13.655975341796875, "learning_rate": 0.0002, "loss": 9.0459, "step": 10 }, { "epoch": 0.01397868250917351, "grad_norm": 9.660070419311523, "learning_rate": 0.0001998582695676762, "loss": 8.499, "step": 20 }, { "epoch": 0.020968023763760266, "grad_norm": 9.97973346710205, "learning_rate": 0.00019943348002101371, "loss": 8.7002, "step": 30 }, { "epoch": 0.02795736501834702, "grad_norm": 17.681541442871094, "learning_rate": 0.00019872683547213446, "loss": 8.5935, "step": 40 }, { "epoch": 0.034946706272933774, "grad_norm": 26.141830444335938, "learning_rate": 0.00019774033898178667, "loss": 9.1154, "step": 50 }, { "epoch": 0.034946706272933774, "eval_loss": 2.532803535461426, "eval_runtime": 176.3642, "eval_samples_per_second": 13.665, "eval_steps_per_second": 3.419, "step": 50 }, { "epoch": 0.04193604752752053, "grad_norm": 6.8255109786987305, "learning_rate": 0.0001964767868814516, "loss": 9.2041, "step": 60 }, { "epoch": 0.04892538878210729, "grad_norm": 9.947444915771484, "learning_rate": 0.00019493976084683813, "loss": 9.0573, "step": 70 }, { "epoch": 0.05591473003669404, "grad_norm": 9.930855751037598, "learning_rate": 0.00019313361774523385, "loss": 8.798, "step": 80 }, { "epoch": 0.0629040712912808, "grad_norm": 11.831235885620117, "learning_rate": 0.00019106347728549135, "loss": 9.0113, "step": 90 }, { "epoch": 0.06989341254586755, "grad_norm": 16.385021209716797, "learning_rate": 0.00018873520750565718, "loss": 9.299, "step": 100 }, { "epoch": 0.06989341254586755, "eval_loss": 2.5892250537872314, "eval_runtime": 176.5774, "eval_samples_per_second": 13.648, "eval_steps_per_second": 3.415, "step": 100 }, { "epoch": 0.07688275380045431, "grad_norm": 10.318001747131348, "learning_rate": 0.0001861554081393806, "loss": 9.2588, "step": 110 }, { "epoch": 0.08387209505504106, "grad_norm": 13.06207275390625, "learning_rate": 0.0001833313919082515, "loss": 8.8356, "step": 120 }, { "epoch": 0.09086143630962781, "grad_norm": 11.55815315246582, "learning_rate": 0.00018027116379309638, "loss": 8.8293, "step": 130 }, { "epoch": 0.09785077756421458, "grad_norm": 12.704540252685547, "learning_rate": 0.00017698339834299061, "loss": 8.8347, "step": 140 }, { "epoch": 0.10484011881880133, "grad_norm": 17.712913513183594, "learning_rate": 0.00017347741508630672, "loss": 9.3842, "step": 150 }, { "epoch": 0.10484011881880133, "eval_loss": 3.1769754886627197, "eval_runtime": 176.6339, "eval_samples_per_second": 13.644, "eval_steps_per_second": 3.414, "step": 150 }, { "epoch": 0.11182946007338808, "grad_norm": 15.227362632751465, "learning_rate": 0.0001697631521134985, "loss": 9.612, "step": 160 }, { "epoch": 0.11881880132797484, "grad_norm": 11.904091835021973, "learning_rate": 0.00016585113790650388, "loss": 8.8541, "step": 170 }, { "epoch": 0.1258081425825616, "grad_norm": 10.797449111938477, "learning_rate": 0.0001617524614946192, "loss": 8.8466, "step": 180 }, { "epoch": 0.13279748383714834, "grad_norm": 9.96870231628418, "learning_rate": 0.0001574787410214407, "loss": 9.1952, "step": 190 }, { "epoch": 0.1397868250917351, "grad_norm": 19.630910873413086, "learning_rate": 0.00015304209081197425, "loss": 9.3036, "step": 200 }, { "epoch": 0.1397868250917351, "eval_loss": 2.615246295928955, "eval_runtime": 176.2603, "eval_samples_per_second": 13.673, "eval_steps_per_second": 3.421, "step": 200 }, { "epoch": 0.14677616634632185, "grad_norm": 9.76240062713623, "learning_rate": 0.00014845508703326504, "loss": 9.3619, "step": 210 }, { "epoch": 0.15376550760090862, "grad_norm": 10.90725326538086, "learning_rate": 0.00014373073204588556, "loss": 8.9003, "step": 220 }, { "epoch": 0.16075484885549537, "grad_norm": 9.980475425720215, "learning_rate": 0.00013888241754733208, "loss": 8.7837, "step": 230 }, { "epoch": 0.16774419011008213, "grad_norm": 15.180036544799805, "learning_rate": 0.00013392388661180303, "loss": 8.8429, "step": 240 }, { "epoch": 0.17473353136466888, "grad_norm": 20.72552490234375, "learning_rate": 0.0001288691947339621, "loss": 9.3729, "step": 250 }, { "epoch": 0.17473353136466888, "eval_loss": 2.52176833152771, "eval_runtime": 176.7071, "eval_samples_per_second": 13.638, "eval_steps_per_second": 3.412, "step": 250 }, { "epoch": 0.18172287261925563, "grad_norm": 9.573226928710938, "learning_rate": 0.0001237326699871115, "loss": 9.2621, "step": 260 }, { "epoch": 0.18871221387384238, "grad_norm": 9.390510559082031, "learning_rate": 0.00011852887240871145, "loss": 8.9882, "step": 270 }, { "epoch": 0.19570155512842916, "grad_norm": 13.314726829528809, "learning_rate": 0.00011327255272837221, "loss": 8.7959, "step": 280 }, { "epoch": 0.2026908963830159, "grad_norm": 14.31113338470459, "learning_rate": 0.00010797861055530831, "loss": 8.9453, "step": 290 }, { "epoch": 0.20968023763760266, "grad_norm": 18.423933029174805, "learning_rate": 0.00010266205214377748, "loss": 9.2963, "step": 300 }, { "epoch": 0.20968023763760266, "eval_loss": 2.367055892944336, "eval_runtime": 176.4348, "eval_samples_per_second": 13.659, "eval_steps_per_second": 3.418, "step": 300 }, { "epoch": 0.2166695788921894, "grad_norm": 8.509469985961914, "learning_rate": 9.733794785622253e-05, "loss": 9.1317, "step": 310 }, { "epoch": 0.22365892014677616, "grad_norm": 8.39034366607666, "learning_rate": 9.202138944469168e-05, "loss": 8.8204, "step": 320 }, { "epoch": 0.2306482614013629, "grad_norm": 10.102583885192871, "learning_rate": 8.672744727162781e-05, "loss": 8.7958, "step": 330 }, { "epoch": 0.2376376026559497, "grad_norm": 12.610382080078125, "learning_rate": 8.147112759128859e-05, "loss": 8.883, "step": 340 }, { "epoch": 0.24462694391053644, "grad_norm": 17.47761344909668, "learning_rate": 7.626733001288851e-05, "loss": 9.2836, "step": 350 }, { "epoch": 0.24462694391053644, "eval_loss": 2.3271288871765137, "eval_runtime": 176.8202, "eval_samples_per_second": 13.63, "eval_steps_per_second": 3.41, "step": 350 }, { "epoch": 0.2516162851651232, "grad_norm": 11.626654624938965, "learning_rate": 7.113080526603792e-05, "loss": 9.1225, "step": 360 }, { "epoch": 0.25860562641970997, "grad_norm": 9.128327369689941, "learning_rate": 6.607611338819697e-05, "loss": 8.8792, "step": 370 }, { "epoch": 0.2655949676742967, "grad_norm": 14.587638854980469, "learning_rate": 6.111758245266794e-05, "loss": 8.6776, "step": 380 }, { "epoch": 0.27258430892888347, "grad_norm": 13.366926193237305, "learning_rate": 5.626926795411447e-05, "loss": 8.9456, "step": 390 }, { "epoch": 0.2795736501834702, "grad_norm": 21.293872833251953, "learning_rate": 5.1544912966734994e-05, "loss": 9.5686, "step": 400 }, { "epoch": 0.2795736501834702, "eval_loss": 2.3335325717926025, "eval_runtime": 176.6154, "eval_samples_per_second": 13.645, "eval_steps_per_second": 3.414, "step": 400 }, { "epoch": 0.28656299143805697, "grad_norm": 11.748668670654297, "learning_rate": 4.695790918802576e-05, "loss": 9.1355, "step": 410 }, { "epoch": 0.2935523326926437, "grad_norm": 12.503898620605469, "learning_rate": 4.252125897855932e-05, "loss": 8.7879, "step": 420 }, { "epoch": 0.30054167394723047, "grad_norm": 13.619047164916992, "learning_rate": 3.824753850538082e-05, "loss": 8.7023, "step": 430 }, { "epoch": 0.30753101520181725, "grad_norm": 11.604446411132812, "learning_rate": 3.414886209349615e-05, "loss": 8.7273, "step": 440 }, { "epoch": 0.31452035645640397, "grad_norm": 20.676069259643555, "learning_rate": 3.0236847886501542e-05, "loss": 9.2037, "step": 450 }, { "epoch": 0.31452035645640397, "eval_loss": 2.2449896335601807, "eval_runtime": 176.7959, "eval_samples_per_second": 13.632, "eval_steps_per_second": 3.411, "step": 450 }, { "epoch": 0.32150969771099075, "grad_norm": 7.940160751342773, "learning_rate": 2.6522584913693294e-05, "loss": 8.9088, "step": 460 }, { "epoch": 0.3284990389655775, "grad_norm": 10.04442024230957, "learning_rate": 2.301660165700936e-05, "loss": 8.8949, "step": 470 }, { "epoch": 0.33548838022016425, "grad_norm": 11.001014709472656, "learning_rate": 1.9728836206903656e-05, "loss": 8.5611, "step": 480 }, { "epoch": 0.34247772147475103, "grad_norm": 16.02702522277832, "learning_rate": 1.6668608091748495e-05, "loss": 8.8279, "step": 490 }, { "epoch": 0.34946706272933775, "grad_norm": 20.989906311035156, "learning_rate": 1.3844591860619383e-05, "loss": 9.2899, "step": 500 }, { "epoch": 0.34946706272933775, "eval_loss": 2.228527069091797, "eval_runtime": 176.4794, "eval_samples_per_second": 13.656, "eval_steps_per_second": 3.417, "step": 500 }, { "epoch": 0.35645640398392453, "grad_norm": 9.240239143371582, "learning_rate": 1.1264792494342857e-05, "loss": 9.0505, "step": 510 }, { "epoch": 0.36344574523851125, "grad_norm": 11.176291465759277, "learning_rate": 8.936522714508678e-06, "loss": 9.0306, "step": 520 }, { "epoch": 0.37043508649309803, "grad_norm": 13.71631908416748, "learning_rate": 6.866382254766157e-06, "loss": 8.7588, "step": 530 }, { "epoch": 0.37742442774768475, "grad_norm": 16.25080680847168, "learning_rate": 5.060239153161872e-06, "loss": 8.8861, "step": 540 }, { "epoch": 0.38441376900227153, "grad_norm": 19.564149856567383, "learning_rate": 3.5232131185484076e-06, "loss": 9.3167, "step": 550 }, { "epoch": 0.38441376900227153, "eval_loss": 2.207789421081543, "eval_runtime": 176.8368, "eval_samples_per_second": 13.628, "eval_steps_per_second": 3.41, "step": 550 }, { "epoch": 0.3914031102568583, "grad_norm": 7.936872482299805, "learning_rate": 2.259661018213333e-06, "loss": 9.0509, "step": 560 }, { "epoch": 0.39839245151144503, "grad_norm": 10.679981231689453, "learning_rate": 1.2731645278655445e-06, "loss": 8.7371, "step": 570 }, { "epoch": 0.4053817927660318, "grad_norm": 10.612337112426758, "learning_rate": 5.665199789862907e-07, "loss": 8.9785, "step": 580 }, { "epoch": 0.41237113402061853, "grad_norm": 17.461828231811523, "learning_rate": 1.4173043232380557e-07, "loss": 8.6828, "step": 590 }, { "epoch": 0.4193604752752053, "grad_norm": 24.876750946044922, "learning_rate": 0.0, "loss": 9.6177, "step": 600 }, { "epoch": 0.4193604752752053, "eval_loss": 2.208037853240967, "eval_runtime": 176.5573, "eval_samples_per_second": 13.65, "eval_steps_per_second": 3.415, "step": 600 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.487032007426048e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }