{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.02160818950382195, "eval_steps": 17, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010804094751910974, "eval_loss": 11.93105697631836, "eval_runtime": 164.9846, "eval_samples_per_second": 94.488, "eval_steps_per_second": 11.813, "step": 1 }, { "epoch": 0.00032412284255732923, "grad_norm": 0.010265783406794071, "learning_rate": 3e-05, "loss": 11.9317, "step": 3 }, { "epoch": 0.0006482456851146585, "grad_norm": 0.009611615911126137, "learning_rate": 6e-05, "loss": 11.932, "step": 6 }, { "epoch": 0.0009723685276719876, "grad_norm": 0.010489562526345253, "learning_rate": 9e-05, "loss": 11.9319, "step": 9 }, { "epoch": 0.001296491370229317, "grad_norm": 0.010087359696626663, "learning_rate": 9.997266286704631e-05, "loss": 11.9335, "step": 12 }, { "epoch": 0.0016206142127866462, "grad_norm": 0.012165901251137257, "learning_rate": 9.98292246503335e-05, "loss": 11.9299, "step": 15 }, { "epoch": 0.0018366961078248656, "eval_loss": 11.930828094482422, "eval_runtime": 164.8494, "eval_samples_per_second": 94.565, "eval_steps_per_second": 11.823, "step": 17 }, { "epoch": 0.0019447370553439753, "grad_norm": 0.012323531322181225, "learning_rate": 9.956320346634876e-05, "loss": 11.9309, "step": 18 }, { "epoch": 0.0022688598979013048, "grad_norm": 0.010800227522850037, "learning_rate": 9.917525374361912e-05, "loss": 11.93, "step": 21 }, { "epoch": 0.002592982740458634, "grad_norm": 0.012847968377172947, "learning_rate": 9.86663298624003e-05, "loss": 11.931, "step": 24 }, { "epoch": 0.002917105583015963, "grad_norm": 0.01434872206300497, "learning_rate": 9.803768380684242e-05, "loss": 11.9303, "step": 27 }, { "epoch": 0.0032412284255732924, "grad_norm": 0.014339370653033257, "learning_rate": 9.729086208503174e-05, "loss": 11.9315, "step": 30 }, { "epoch": 0.0035653512681306215, "grad_norm": 0.016050850972533226, "learning_rate": 9.642770192448536e-05, "loss": 11.9305, "step": 33 }, { "epoch": 0.0036733922156497313, "eval_loss": 11.930416107177734, "eval_runtime": 164.4735, "eval_samples_per_second": 94.781, "eval_steps_per_second": 11.85, "step": 34 }, { "epoch": 0.0038894741106879505, "grad_norm": 0.01582852564752102, "learning_rate": 9.545032675245813e-05, "loss": 11.9296, "step": 36 }, { "epoch": 0.00421359695324528, "grad_norm": 0.021600833162665367, "learning_rate": 9.43611409721806e-05, "loss": 11.9302, "step": 39 }, { "epoch": 0.0045377197958026095, "grad_norm": 0.018255416303873062, "learning_rate": 9.316282404787871e-05, "loss": 11.9301, "step": 42 }, { "epoch": 0.004861842638359939, "grad_norm": 0.01591956801712513, "learning_rate": 9.185832391312644e-05, "loss": 11.9292, "step": 45 }, { "epoch": 0.005185965480917268, "grad_norm": 0.024251235648989677, "learning_rate": 9.045084971874738e-05, "loss": 11.9312, "step": 48 }, { "epoch": 0.005510088323474597, "grad_norm": 0.022893432527780533, "learning_rate": 8.894386393810563e-05, "loss": 11.9312, "step": 51 }, { "epoch": 0.005510088323474597, "eval_loss": 11.929776191711426, "eval_runtime": 164.7642, "eval_samples_per_second": 94.614, "eval_steps_per_second": 11.829, "step": 51 }, { "epoch": 0.005834211166031926, "grad_norm": 0.026547182351350784, "learning_rate": 8.73410738492077e-05, "loss": 11.9294, "step": 54 }, { "epoch": 0.006158334008589256, "grad_norm": 0.027833979576826096, "learning_rate": 8.564642241456986e-05, "loss": 11.9275, "step": 57 }, { "epoch": 0.006482456851146585, "grad_norm": 0.031050728633999825, "learning_rate": 8.386407858128706e-05, "loss": 11.9303, "step": 60 }, { "epoch": 0.006806579693703914, "grad_norm": 0.04116954654455185, "learning_rate": 8.199842702516583e-05, "loss": 11.9298, "step": 63 }, { "epoch": 0.007130702536261243, "grad_norm": 0.04182201623916626, "learning_rate": 8.005405736415126e-05, "loss": 11.928, "step": 66 }, { "epoch": 0.007346784431299463, "eval_loss": 11.928645133972168, "eval_runtime": 164.5709, "eval_samples_per_second": 94.725, "eval_steps_per_second": 11.843, "step": 68 }, { "epoch": 0.007454825378818572, "grad_norm": 0.041909895837306976, "learning_rate": 7.803575286758364e-05, "loss": 11.9285, "step": 69 }, { "epoch": 0.007778948221375901, "grad_norm": 0.051412295550107956, "learning_rate": 7.594847868906076e-05, "loss": 11.9299, "step": 72 }, { "epoch": 0.00810307106393323, "grad_norm": 0.042676132172346115, "learning_rate": 7.379736965185368e-05, "loss": 11.9294, "step": 75 }, { "epoch": 0.00842719390649056, "grad_norm": 0.04502801597118378, "learning_rate": 7.158771761692464e-05, "loss": 11.9274, "step": 78 }, { "epoch": 0.00875131674904789, "grad_norm": 0.04950621724128723, "learning_rate": 6.932495846462261e-05, "loss": 11.9283, "step": 81 }, { "epoch": 0.009075439591605219, "grad_norm": 0.05746316909790039, "learning_rate": 6.701465872208216e-05, "loss": 11.9265, "step": 84 }, { "epoch": 0.009183480539124328, "eval_loss": 11.926851272583008, "eval_runtime": 164.5429, "eval_samples_per_second": 94.741, "eval_steps_per_second": 11.845, "step": 85 }, { "epoch": 0.009399562434162548, "grad_norm": 0.05312173441052437, "learning_rate": 6.466250186922325e-05, "loss": 11.9268, "step": 87 }, { "epoch": 0.009723685276719877, "grad_norm": 0.058377258479595184, "learning_rate": 6.227427435703997e-05, "loss": 11.9267, "step": 90 }, { "epoch": 0.010047808119277206, "grad_norm": 0.07541938871145248, "learning_rate": 5.985585137257401e-05, "loss": 11.9251, "step": 93 }, { "epoch": 0.010371930961834535, "grad_norm": 0.07769893109798431, "learning_rate": 5.74131823855921e-05, "loss": 11.926, "step": 96 }, { "epoch": 0.010696053804391864, "grad_norm": 0.06545033305883408, "learning_rate": 5.495227651252315e-05, "loss": 11.9258, "step": 99 }, { "epoch": 0.011020176646949193, "grad_norm": 0.07242189347743988, "learning_rate": 5.247918773366112e-05, "loss": 11.9252, "step": 102 }, { "epoch": 0.011020176646949193, "eval_loss": 11.924546241760254, "eval_runtime": 165.0729, "eval_samples_per_second": 94.437, "eval_steps_per_second": 11.807, "step": 102 }, { "epoch": 0.011344299489506523, "grad_norm": 0.0839807540178299, "learning_rate": 5e-05, "loss": 11.9236, "step": 105 }, { "epoch": 0.011668422332063852, "grad_norm": 0.08577441424131393, "learning_rate": 4.7520812266338885e-05, "loss": 11.9254, "step": 108 }, { "epoch": 0.01199254517462118, "grad_norm": 0.07489103823900223, "learning_rate": 4.504772348747687e-05, "loss": 11.923, "step": 111 }, { "epoch": 0.012316668017178511, "grad_norm": 0.08259328454732895, "learning_rate": 4.2586817614407895e-05, "loss": 11.9243, "step": 114 }, { "epoch": 0.01264079085973584, "grad_norm": 0.09051191061735153, "learning_rate": 4.0144148627425993e-05, "loss": 11.9247, "step": 117 }, { "epoch": 0.01285687275477406, "eval_loss": 11.922492980957031, "eval_runtime": 164.8761, "eval_samples_per_second": 94.55, "eval_steps_per_second": 11.821, "step": 119 }, { "epoch": 0.01296491370229317, "grad_norm": 0.06937046349048615, "learning_rate": 3.772572564296005e-05, "loss": 11.9224, "step": 120 }, { "epoch": 0.013289036544850499, "grad_norm": 0.08952979743480682, "learning_rate": 3.533749813077677e-05, "loss": 11.9221, "step": 123 }, { "epoch": 0.013613159387407828, "grad_norm": 0.07382088154554367, "learning_rate": 3.298534127791785e-05, "loss": 11.9231, "step": 126 }, { "epoch": 0.013937282229965157, "grad_norm": 0.07770540565252304, "learning_rate": 3.0675041535377405e-05, "loss": 11.9223, "step": 129 }, { "epoch": 0.014261405072522486, "grad_norm": 0.09618521481752396, "learning_rate": 2.8412282383075363e-05, "loss": 11.9195, "step": 132 }, { "epoch": 0.014585527915079815, "grad_norm": 0.09997902065515518, "learning_rate": 2.6202630348146324e-05, "loss": 11.9205, "step": 135 }, { "epoch": 0.014693568862598925, "eval_loss": 11.921192169189453, "eval_runtime": 164.5526, "eval_samples_per_second": 94.736, "eval_steps_per_second": 11.844, "step": 136 }, { "epoch": 0.014909650757637144, "grad_norm": 0.08282013982534409, "learning_rate": 2.405152131093926e-05, "loss": 11.9208, "step": 138 }, { "epoch": 0.015233773600194473, "grad_norm": 0.08347820490598679, "learning_rate": 2.196424713241637e-05, "loss": 11.9199, "step": 141 }, { "epoch": 0.015557896442751802, "grad_norm": 0.08832279592752457, "learning_rate": 1.9945942635848748e-05, "loss": 11.922, "step": 144 }, { "epoch": 0.01588201928530913, "grad_norm": 0.07434365898370743, "learning_rate": 1.800157297483417e-05, "loss": 11.9218, "step": 147 }, { "epoch": 0.01620614212786646, "grad_norm": 0.06998869776725769, "learning_rate": 1.6135921418712956e-05, "loss": 11.9218, "step": 150 }, { "epoch": 0.01653026497042379, "grad_norm": 0.05750780925154686, "learning_rate": 1.435357758543015e-05, "loss": 11.9201, "step": 153 }, { "epoch": 0.01653026497042379, "eval_loss": 11.920516014099121, "eval_runtime": 164.838, "eval_samples_per_second": 94.572, "eval_steps_per_second": 11.824, "step": 153 }, { "epoch": 0.01685438781298112, "grad_norm": 0.07401713728904724, "learning_rate": 1.2658926150792322e-05, "loss": 11.9217, "step": 156 }, { "epoch": 0.017178510655538447, "grad_norm": 0.06368040293455124, "learning_rate": 1.1056136061894384e-05, "loss": 11.9197, "step": 159 }, { "epoch": 0.01750263349809578, "grad_norm": 0.06895526498556137, "learning_rate": 9.549150281252633e-06, "loss": 11.9188, "step": 162 }, { "epoch": 0.01782675634065311, "grad_norm": 0.07913003116846085, "learning_rate": 8.141676086873572e-06, "loss": 11.9202, "step": 165 }, { "epoch": 0.018150879183210438, "grad_norm": 0.05910761281847954, "learning_rate": 6.837175952121306e-06, "loss": 11.9205, "step": 168 }, { "epoch": 0.018366961078248655, "eval_loss": 11.92021369934082, "eval_runtime": 165.2155, "eval_samples_per_second": 94.356, "eval_steps_per_second": 11.797, "step": 170 }, { "epoch": 0.018475002025767767, "grad_norm": 0.06519605964422226, "learning_rate": 5.6388590278194096e-06, "loss": 11.9219, "step": 171 }, { "epoch": 0.018799124868325096, "grad_norm": 0.06076359748840332, "learning_rate": 4.549673247541875e-06, "loss": 11.9209, "step": 174 }, { "epoch": 0.019123247710882425, "grad_norm": 0.06132715195417404, "learning_rate": 3.5722980755146517e-06, "loss": 11.9201, "step": 177 }, { "epoch": 0.019447370553439754, "grad_norm": 0.08634161949157715, "learning_rate": 2.7091379149682685e-06, "loss": 11.9207, "step": 180 }, { "epoch": 0.019771493395997083, "grad_norm": 0.0701214000582695, "learning_rate": 1.962316193157593e-06, "loss": 11.9204, "step": 183 }, { "epoch": 0.020095616238554413, "grad_norm": 0.06903617829084396, "learning_rate": 1.333670137599713e-06, "loss": 11.9207, "step": 186 }, { "epoch": 0.02020365718607352, "eval_loss": 11.920133590698242, "eval_runtime": 164.4607, "eval_samples_per_second": 94.789, "eval_steps_per_second": 11.851, "step": 187 }, { "epoch": 0.02041973908111174, "grad_norm": 0.050048764795064926, "learning_rate": 8.247462563808817e-07, "loss": 11.921, "step": 189 }, { "epoch": 0.02074386192366907, "grad_norm": 0.06726805865764618, "learning_rate": 4.367965336512403e-07, "loss": 11.9195, "step": 192 }, { "epoch": 0.0210679847662264, "grad_norm": 0.07607147097587585, "learning_rate": 1.7077534966650766e-07, "loss": 11.9209, "step": 195 }, { "epoch": 0.02139210760878373, "grad_norm": 0.0720086470246315, "learning_rate": 2.7337132953697554e-08, "loss": 11.9206, "step": 198 } ], "logging_steps": 3, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 17, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 589457915904.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }