{ "best_metric": 1.0756638050079346, "best_model_checkpoint": "miner_id_24/checkpoint-50", "epoch": 1.1038251366120218, "eval_steps": 50, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02185792349726776, "grad_norm": 0.054219260811805725, "learning_rate": 1.16e-05, "loss": 1.0221, "step": 1 }, { "epoch": 0.02185792349726776, "eval_loss": 1.3064125776290894, "eval_runtime": 1.5023, "eval_samples_per_second": 409.374, "eval_steps_per_second": 13.313, "step": 1 }, { "epoch": 0.04371584699453552, "grad_norm": 0.07374625653028488, "learning_rate": 2.32e-05, "loss": 1.13, "step": 2 }, { "epoch": 0.06557377049180328, "grad_norm": 0.08744122087955475, "learning_rate": 3.48e-05, "loss": 1.2471, "step": 3 }, { "epoch": 0.08743169398907104, "grad_norm": 0.1099563017487526, "learning_rate": 4.64e-05, "loss": 1.3512, "step": 4 }, { "epoch": 0.1092896174863388, "grad_norm": 0.14091312885284424, "learning_rate": 5.8e-05, "loss": 1.382, "step": 5 }, { "epoch": 0.13114754098360656, "grad_norm": 0.19244275987148285, "learning_rate": 6.96e-05, "loss": 1.524, "step": 6 }, { "epoch": 0.15300546448087432, "grad_norm": 0.052936580032110214, "learning_rate": 8.12e-05, "loss": 1.0329, "step": 7 }, { "epoch": 0.17486338797814208, "grad_norm": 0.06494678556919098, "learning_rate": 9.28e-05, "loss": 1.1503, "step": 8 }, { "epoch": 0.19672131147540983, "grad_norm": 0.07551469653844833, "learning_rate": 0.0001044, "loss": 1.2085, "step": 9 }, { "epoch": 0.2185792349726776, "grad_norm": 0.08664041757583618, "learning_rate": 0.000116, "loss": 1.2444, "step": 10 }, { "epoch": 0.24043715846994534, "grad_norm": 0.10655322670936584, "learning_rate": 0.00011598225532067881, "loss": 1.3136, "step": 11 }, { "epoch": 0.26229508196721313, "grad_norm": 0.14484980702400208, "learning_rate": 0.00011592903214042715, "loss": 1.3774, "step": 12 }, { "epoch": 0.28415300546448086, "grad_norm": 0.049404121935367584, "learning_rate": 0.00011584036302573693, "loss": 0.9998, "step": 13 }, { "epoch": 0.30601092896174864, "grad_norm": 0.05533352494239807, "learning_rate": 0.0001157163022319532, "loss": 1.077, "step": 14 }, { "epoch": 0.32786885245901637, "grad_norm": 0.06618451327085495, "learning_rate": 0.00011555692567007598, "loss": 1.1209, "step": 15 }, { "epoch": 0.34972677595628415, "grad_norm": 0.07199019938707352, "learning_rate": 0.00011536233086031157, "loss": 1.2181, "step": 16 }, { "epoch": 0.37158469945355194, "grad_norm": 0.08229127526283264, "learning_rate": 0.00011513263687240126, "loss": 1.2544, "step": 17 }, { "epoch": 0.39344262295081966, "grad_norm": 0.10118231177330017, "learning_rate": 0.00011486798425276428, "loss": 1.3167, "step": 18 }, { "epoch": 0.41530054644808745, "grad_norm": 0.06382325291633606, "learning_rate": 0.00011456853493849944, "loss": 0.9757, "step": 19 }, { "epoch": 0.4371584699453552, "grad_norm": 0.06287430226802826, "learning_rate": 0.0001142344721582983, "loss": 1.0141, "step": 20 }, { "epoch": 0.45901639344262296, "grad_norm": 0.061046287417411804, "learning_rate": 0.00011386600032033012, "loss": 1.1142, "step": 21 }, { "epoch": 0.4808743169398907, "grad_norm": 0.05975975841283798, "learning_rate": 0.0001134633448871674, "loss": 1.172, "step": 22 }, { "epoch": 0.5027322404371585, "grad_norm": 0.06590148061513901, "learning_rate": 0.00011302675223782873, "loss": 1.1934, "step": 23 }, { "epoch": 0.5245901639344263, "grad_norm": 0.07652608305215836, "learning_rate": 0.00011255648951702296, "loss": 1.2285, 
"step": 24 }, { "epoch": 0.546448087431694, "grad_norm": 0.11880210041999817, "learning_rate": 0.0001120528444716872, "loss": 1.2294, "step": 25 }, { "epoch": 0.5683060109289617, "grad_norm": 0.04327382519841194, "learning_rate": 0.00011151612527491878, "loss": 0.9457, "step": 26 }, { "epoch": 0.5901639344262295, "grad_norm": 0.05113707482814789, "learning_rate": 0.00011094666033740846, "loss": 1.0301, "step": 27 }, { "epoch": 0.6120218579234973, "grad_norm": 0.04633456468582153, "learning_rate": 0.00011034479810649071, "loss": 1.1369, "step": 28 }, { "epoch": 0.6338797814207651, "grad_norm": 0.052176687866449356, "learning_rate": 0.00010971090685293396, "loss": 1.1575, "step": 29 }, { "epoch": 0.6557377049180327, "grad_norm": 0.05911482125520706, "learning_rate": 0.00010904537444560093, "loss": 1.1915, "step": 30 }, { "epoch": 0.6775956284153005, "grad_norm": 0.08560285717248917, "learning_rate": 0.0001083486081141173, "loss": 1.1844, "step": 31 }, { "epoch": 0.6994535519125683, "grad_norm": 0.0443929098546505, "learning_rate": 0.00010762103419969393, "loss": 0.9784, "step": 32 }, { "epoch": 0.7213114754098361, "grad_norm": 0.04982827231287956, "learning_rate": 0.00010686309789425474, "loss": 1.0368, "step": 33 }, { "epoch": 0.7431693989071039, "grad_norm": 0.04613876715302467, "learning_rate": 0.00010607526296803026, "loss": 1.0534, "step": 34 }, { "epoch": 0.7650273224043715, "grad_norm": 0.04624936357140541, "learning_rate": 0.00010525801148578341, "loss": 1.1136, "step": 35 }, { "epoch": 0.7868852459016393, "grad_norm": 0.050727903842926025, "learning_rate": 0.000104411843511841, "loss": 1.1563, "step": 36 }, { "epoch": 0.8087431693989071, "grad_norm": 0.07218360155820847, "learning_rate": 0.00010353727680411158, "loss": 1.148, "step": 37 }, { "epoch": 0.8306010928961749, "grad_norm": 0.04049117863178253, "learning_rate": 0.00010263484649727705, "loss": 0.9096, "step": 38 }, { "epoch": 0.8524590163934426, "grad_norm": 0.0455789640545845, "learning_rate": 0.00010170510477535133, "loss": 1.0006, "step": 39 }, { "epoch": 0.8743169398907104, "grad_norm": 0.039463143795728683, "learning_rate": 0.00010074862053380711, "loss": 1.0411, "step": 40 }, { "epoch": 0.8961748633879781, "grad_norm": 0.042614974081516266, "learning_rate": 9.976597903147682e-05, "loss": 1.1396, "step": 41 }, { "epoch": 0.9180327868852459, "grad_norm": 0.04930881783366203, "learning_rate": 9.875778153244143e-05, "loss": 1.1744, "step": 42 }, { "epoch": 0.9398907103825137, "grad_norm": 0.06974472105503082, "learning_rate": 9.772464493812549e-05, "loss": 1.15, "step": 43 }, { "epoch": 0.9617486338797814, "grad_norm": 0.04092060774564743, "learning_rate": 9.66672014098242e-05, "loss": 0.9676, "step": 44 }, { "epoch": 0.9836065573770492, "grad_norm": 0.0392816998064518, "learning_rate": 9.558609798189311e-05, "loss": 1.0893, "step": 45 }, { "epoch": 1.0163934426229508, "grad_norm": 0.08897832781076431, "learning_rate": 9.448199616583707e-05, "loss": 1.8898, "step": 46 }, { "epoch": 1.0382513661202186, "grad_norm": 0.03982605040073395, "learning_rate": 9.335557154554105e-05, "loss": 0.9943, "step": 47 }, { "epoch": 1.0601092896174864, "grad_norm": 0.03858646750450134, "learning_rate": 9.220751336389013e-05, "loss": 1.0459, "step": 48 }, { "epoch": 1.0819672131147542, "grad_norm": 0.040587618947029114, "learning_rate": 9.10385241010317e-05, "loss": 1.1494, "step": 49 }, { "epoch": 1.1038251366120218, "grad_norm": 0.052482884377241135, "learning_rate": 8.984931904453821e-05, "loss": 1.1475, "step": 50 }, { "epoch": 
1.1038251366120218, "eval_loss": 1.0756638050079346, "eval_runtime": 1.9721, "eval_samples_per_second": 311.855, "eval_steps_per_second": 10.142, "step": 50 } ], "logging_steps": 1, "max_steps": 137, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.8231094746742784e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }