{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 5, "global_step": 46, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021739130434782608, "grad_norm": 2.5425055027008057, "learning_rate": 1e-05, "loss": 0.706, "step": 1 }, { "epoch": 0.043478260869565216, "grad_norm": 2.3038032054901123, "learning_rate": 9.987820251299121e-06, "loss": 0.643, "step": 2 }, { "epoch": 0.06521739130434782, "grad_norm": 2.222012996673584, "learning_rate": 9.951340343707852e-06, "loss": 0.722, "step": 3 }, { "epoch": 0.08695652173913043, "grad_norm": 1.4986803531646729, "learning_rate": 9.890738003669029e-06, "loss": 0.6543, "step": 4 }, { "epoch": 0.10869565217391304, "grad_norm": 1.5336999893188477, "learning_rate": 9.806308479691595e-06, "loss": 0.766, "step": 5 }, { "epoch": 0.10869565217391304, "eval_loss": 0.5912319421768188, "eval_runtime": 0.8443, "eval_samples_per_second": 4.737, "eval_steps_per_second": 1.184, "step": 5 }, { "epoch": 0.13043478260869565, "grad_norm": 1.4662328958511353, "learning_rate": 9.698463103929542e-06, "loss": 0.5495, "step": 6 }, { "epoch": 0.15217391304347827, "grad_norm": 1.3746005296707153, "learning_rate": 9.567727288213005e-06, "loss": 0.5193, "step": 7 }, { "epoch": 0.17391304347826086, "grad_norm": 1.4526853561401367, "learning_rate": 9.414737964294636e-06, "loss": 0.5578, "step": 8 }, { "epoch": 0.1956521739130435, "grad_norm": 0.8964347243309021, "learning_rate": 9.24024048078213e-06, "loss": 0.3643, "step": 9 }, { "epoch": 0.21739130434782608, "grad_norm": 1.2126158475875854, "learning_rate": 9.045084971874738e-06, "loss": 0.5873, "step": 10 }, { "epoch": 0.21739130434782608, "eval_loss": 0.5282274484634399, "eval_runtime": 0.843, "eval_samples_per_second": 4.745, "eval_steps_per_second": 1.186, "step": 10 }, { "epoch": 0.2391304347826087, "grad_norm": 1.2183283567428589, "learning_rate": 8.83022221559489e-06, "loss": 0.6398, "step": 11 }, { "epoch": 0.2608695652173913, "grad_norm": 0.9250560402870178, "learning_rate": 8.596699001693257e-06, "loss": 0.4296, "step": 12 }, { "epoch": 0.2826086956521739, "grad_norm": 1.0050208568572998, "learning_rate": 8.345653031794292e-06, "loss": 0.5244, "step": 13 }, { "epoch": 0.30434782608695654, "grad_norm": 1.0231624841690063, "learning_rate": 8.078307376628292e-06, "loss": 0.4739, "step": 14 }, { "epoch": 0.32608695652173914, "grad_norm": 0.8328154683113098, "learning_rate": 7.795964517353734e-06, "loss": 0.3868, "step": 15 }, { "epoch": 0.32608695652173914, "eval_loss": 0.49576932191848755, "eval_runtime": 0.8436, "eval_samples_per_second": 4.742, "eval_steps_per_second": 1.185, "step": 15 }, { "epoch": 0.34782608695652173, "grad_norm": 1.0239394903182983, "learning_rate": 7.500000000000001e-06, "loss": 0.5849, "step": 16 }, { "epoch": 0.3695652173913043, "grad_norm": 0.9168555736541748, "learning_rate": 7.191855733945388e-06, "loss": 0.4854, "step": 17 }, { "epoch": 0.391304347826087, "grad_norm": 0.9247157573699951, "learning_rate": 6.873032967079562e-06, "loss": 0.3887, "step": 18 }, { "epoch": 0.41304347826086957, "grad_norm": 1.1805756092071533, "learning_rate": 6.545084971874738e-06, "loss": 0.6612, "step": 19 }, { "epoch": 0.43478260869565216, "grad_norm": 0.9535987377166748, "learning_rate": 6.209609477998339e-06, "loss": 0.5101, "step": 20 }, { "epoch": 0.43478260869565216, "eval_loss": 0.4761270582675934, "eval_runtime": 0.8437, "eval_samples_per_second": 4.741, "eval_steps_per_second": 1.185, "step": 20 }, { "epoch": 0.45652173913043476, "grad_norm": 0.821264922618866, "learning_rate": 5.8682408883346535e-06, "loss": 0.4696, "step": 21 }, { "epoch": 0.4782608695652174, "grad_norm": 0.9166697263717651, "learning_rate": 5.522642316338268e-06, "loss": 0.4555, "step": 22 }, { "epoch": 0.5, "grad_norm": 0.7674450278282166, "learning_rate": 5.174497483512506e-06, "loss": 0.4064, "step": 23 }, { "epoch": 0.5217391304347826, "grad_norm": 0.7370434999465942, "learning_rate": 4.825502516487497e-06, "loss": 0.3378, "step": 24 }, { "epoch": 0.5434782608695652, "grad_norm": 0.8126187324523926, "learning_rate": 4.477357683661734e-06, "loss": 0.4085, "step": 25 }, { "epoch": 0.5434782608695652, "eval_loss": 0.46437764167785645, "eval_runtime": 0.8435, "eval_samples_per_second": 4.742, "eval_steps_per_second": 1.186, "step": 25 }, { "epoch": 0.5652173913043478, "grad_norm": 0.8102895021438599, "learning_rate": 4.131759111665349e-06, "loss": 0.4565, "step": 26 }, { "epoch": 0.5869565217391305, "grad_norm": 1.0004063844680786, "learning_rate": 3.790390522001662e-06, "loss": 0.613, "step": 27 }, { "epoch": 0.6086956521739131, "grad_norm": 0.8794491291046143, "learning_rate": 3.4549150281252635e-06, "loss": 0.4919, "step": 28 }, { "epoch": 0.6304347826086957, "grad_norm": 0.7904605865478516, "learning_rate": 3.12696703292044e-06, "loss": 0.4456, "step": 29 }, { "epoch": 0.6521739130434783, "grad_norm": 0.8982527256011963, "learning_rate": 2.8081442660546126e-06, "loss": 0.5561, "step": 30 }, { "epoch": 0.6521739130434783, "eval_loss": 0.45777273178100586, "eval_runtime": 0.8438, "eval_samples_per_second": 4.741, "eval_steps_per_second": 1.185, "step": 30 }, { "epoch": 0.6739130434782609, "grad_norm": 0.7891985774040222, "learning_rate": 2.5000000000000015e-06, "loss": 0.4395, "step": 31 }, { "epoch": 0.6956521739130435, "grad_norm": 0.7818904519081116, "learning_rate": 2.204035482646267e-06, "loss": 0.4492, "step": 32 }, { "epoch": 0.717391304347826, "grad_norm": 0.8261966705322266, "learning_rate": 1.9216926233717087e-06, "loss": 0.389, "step": 33 }, { "epoch": 0.7391304347826086, "grad_norm": 0.8626196384429932, "learning_rate": 1.6543469682057105e-06, "loss": 0.4336, "step": 34 }, { "epoch": 0.7608695652173914, "grad_norm": 0.8142871856689453, "learning_rate": 1.4033009983067454e-06, "loss": 0.4683, "step": 35 }, { "epoch": 0.7608695652173914, "eval_loss": 0.45417019724845886, "eval_runtime": 0.8436, "eval_samples_per_second": 4.742, "eval_steps_per_second": 1.185, "step": 35 }, { "epoch": 0.782608695652174, "grad_norm": 0.8224750757217407, "learning_rate": 1.1697777844051105e-06, "loss": 0.4277, "step": 36 }, { "epoch": 0.8043478260869565, "grad_norm": 1.101127028465271, "learning_rate": 9.549150281252633e-07, "loss": 0.4057, "step": 37 }, { "epoch": 0.8260869565217391, "grad_norm": 0.9483347535133362, "learning_rate": 7.597595192178702e-07, "loss": 0.5928, "step": 38 }, { "epoch": 0.8478260869565217, "grad_norm": 0.9000388383865356, "learning_rate": 5.852620357053651e-07, "loss": 0.5955, "step": 39 }, { "epoch": 0.8695652173913043, "grad_norm": 0.8246148228645325, "learning_rate": 4.322727117869951e-07, "loss": 0.5055, "step": 40 }, { "epoch": 0.8695652173913043, "eval_loss": 0.452594518661499, "eval_runtime": 0.8425, "eval_samples_per_second": 4.748, "eval_steps_per_second": 1.187, "step": 40 }, { "epoch": 0.8913043478260869, "grad_norm": 0.7705091834068298, "learning_rate": 3.015368960704584e-07, "loss": 0.4106, "step": 41 }, { "epoch": 0.9130434782608695, "grad_norm": 0.7705535888671875, "learning_rate": 1.9369152030840553e-07, "loss": 0.4183, "step": 42 }, { "epoch": 0.9347826086956522, "grad_norm": 0.78244948387146, "learning_rate": 1.0926199633097156e-07, "loss": 0.4416, "step": 43 }, { "epoch": 0.9565217391304348, "grad_norm": 0.8442041277885437, "learning_rate": 4.865965629214819e-08, "loss": 0.4899, "step": 44 }, { "epoch": 0.9782608695652174, "grad_norm": 0.8914652466773987, "learning_rate": 1.2179748700879013e-08, "loss": 0.5359, "step": 45 }, { "epoch": 0.9782608695652174, "eval_loss": 0.45189881324768066, "eval_runtime": 0.843, "eval_samples_per_second": 4.745, "eval_steps_per_second": 1.186, "step": 45 }, { "epoch": 1.0, "grad_norm": 0.797126829624176, "learning_rate": 0.0, "loss": 0.4124, "step": 46 }, { "epoch": 1.0, "step": 46, "total_flos": 7.03161217014825e+16, "train_loss": 0.50457603516786, "train_runtime": 362.0803, "train_samples_per_second": 1.016, "train_steps_per_second": 0.127 } ], "logging_steps": 1, "max_steps": 46, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 46, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.03161217014825e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }