{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.01204202667308908, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012042026673089081, "eval_loss": 2.8115124702453613, "eval_runtime": 1968.0615, "eval_samples_per_second": 7.107, "eval_steps_per_second": 0.889, "step": 1 }, { "epoch": 0.00036126080019267245, "grad_norm": 41.35398864746094, "learning_rate": 3e-05, "loss": 11.0864, "step": 3 }, { "epoch": 0.0007225216003853449, "grad_norm": 32.59439468383789, "learning_rate": 6e-05, "loss": 10.0289, "step": 6 }, { "epoch": 0.0010837824005780173, "grad_norm": 23.11261749267578, "learning_rate": 9e-05, "loss": 9.0931, "step": 9 }, { "epoch": 0.0010837824005780173, "eval_loss": 2.0141799449920654, "eval_runtime": 1984.0515, "eval_samples_per_second": 7.05, "eval_steps_per_second": 0.882, "step": 9 }, { "epoch": 0.0014450432007706898, "grad_norm": 25.98846435546875, "learning_rate": 9.987820251299122e-05, "loss": 7.6576, "step": 12 }, { "epoch": 0.001806304000963362, "grad_norm": 32.36982727050781, "learning_rate": 9.924038765061042e-05, "loss": 7.2228, "step": 15 }, { "epoch": 0.0021675648011560346, "grad_norm": 16.200027465820312, "learning_rate": 9.806308479691595e-05, "loss": 6.5358, "step": 18 }, { "epoch": 0.0021675648011560346, "eval_loss": 1.6316649913787842, "eval_runtime": 1985.2556, "eval_samples_per_second": 7.045, "eval_steps_per_second": 0.881, "step": 18 }, { "epoch": 0.002528825601348707, "grad_norm": 18.970165252685547, "learning_rate": 9.635919272833938e-05, "loss": 6.6555, "step": 21 }, { "epoch": 0.0028900864015413796, "grad_norm": 15.357333183288574, "learning_rate": 9.414737964294636e-05, "loss": 6.3444, "step": 24 }, { "epoch": 0.0032513472017340517, "grad_norm": 15.496698379516602, "learning_rate": 9.145187862775209e-05, "loss": 5.7976, "step": 27 }, { "epoch": 0.0032513472017340517, "eval_loss": 1.5133966207504272, "eval_runtime": 1986.8922, "eval_samples_per_second": 7.04, "eval_steps_per_second": 0.88, "step": 27 }, { "epoch": 0.003612608001926724, "grad_norm": 13.662860870361328, "learning_rate": 8.83022221559489e-05, "loss": 6.0724, "step": 30 }, { "epoch": 0.003973868802119396, "grad_norm": 15.459967613220215, "learning_rate": 8.473291852294987e-05, "loss": 6.2818, "step": 33 }, { "epoch": 0.004335129602312069, "grad_norm": 11.845706939697266, "learning_rate": 8.07830737662829e-05, "loss": 5.7472, "step": 36 }, { "epoch": 0.004335129602312069, "eval_loss": 1.4950615167617798, "eval_runtime": 1983.0961, "eval_samples_per_second": 7.053, "eval_steps_per_second": 0.882, "step": 36 }, { "epoch": 0.004696390402504741, "grad_norm": 16.0750675201416, "learning_rate": 7.649596321166024e-05, "loss": 5.9232, "step": 39 }, { "epoch": 0.005057651202697414, "grad_norm": 15.54041576385498, "learning_rate": 7.191855733945387e-05, "loss": 5.7156, "step": 42 }, { "epoch": 0.005418912002890086, "grad_norm": 13.875934600830078, "learning_rate": 6.710100716628344e-05, "loss": 5.7691, "step": 45 }, { "epoch": 0.005418912002890086, "eval_loss": 1.454808235168457, "eval_runtime": 1987.0553, "eval_samples_per_second": 7.039, "eval_steps_per_second": 0.88, "step": 45 }, { "epoch": 0.005780172803082759, "grad_norm": 13.306793212890625, "learning_rate": 6.209609477998338e-05, "loss": 6.0942, "step": 48 }, { "epoch": 0.006141433603275431, "grad_norm": 13.40745735168457, "learning_rate": 5.695865504800327e-05, "loss": 5.8127, "step": 51 }, { "epoch": 0.006502694403468103, "grad_norm": 13.341890335083008, "learning_rate": 5.174497483512506e-05, "loss": 5.6365, "step": 54 }, { "epoch": 0.006502694403468103, "eval_loss": 1.4377180337905884, "eval_runtime": 1988.1753, "eval_samples_per_second": 7.035, "eval_steps_per_second": 0.88, "step": 54 }, { "epoch": 0.006863955203660776, "grad_norm": 14.141239166259766, "learning_rate": 4.6512176312793736e-05, "loss": 5.6544, "step": 57 }, { "epoch": 0.007225216003853448, "grad_norm": 13.24157428741455, "learning_rate": 4.131759111665349e-05, "loss": 5.7787, "step": 60 }, { "epoch": 0.007586476804046121, "grad_norm": 13.354693412780762, "learning_rate": 3.6218132209150045e-05, "loss": 5.9927, "step": 63 }, { "epoch": 0.007586476804046121, "eval_loss": 1.4332555532455444, "eval_runtime": 1985.8578, "eval_samples_per_second": 7.043, "eval_steps_per_second": 0.881, "step": 63 }, { "epoch": 0.007947737604238793, "grad_norm": 12.176066398620605, "learning_rate": 3.12696703292044e-05, "loss": 5.5682, "step": 66 }, { "epoch": 0.008308998404431465, "grad_norm": 15.984620094299316, "learning_rate": 2.6526421860705473e-05, "loss": 5.8425, "step": 69 }, { "epoch": 0.008670259204624138, "grad_norm": 13.588284492492676, "learning_rate": 2.2040354826462668e-05, "loss": 6.1731, "step": 72 }, { "epoch": 0.008670259204624138, "eval_loss": 1.423056721687317, "eval_runtime": 1986.6579, "eval_samples_per_second": 7.04, "eval_steps_per_second": 0.88, "step": 72 }, { "epoch": 0.009031520004816811, "grad_norm": 12.658801078796387, "learning_rate": 1.7860619515673033e-05, "loss": 5.9402, "step": 75 }, { "epoch": 0.009392780805009483, "grad_norm": 15.684614181518555, "learning_rate": 1.4033009983067452e-05, "loss": 5.6994, "step": 78 }, { "epoch": 0.009754041605202156, "grad_norm": 11.472796440124512, "learning_rate": 1.0599462319663905e-05, "loss": 5.8439, "step": 81 }, { "epoch": 0.009754041605202156, "eval_loss": 1.4167842864990234, "eval_runtime": 1987.061, "eval_samples_per_second": 7.039, "eval_steps_per_second": 0.88, "step": 81 }, { "epoch": 0.010115302405394828, "grad_norm": 11.89443588256836, "learning_rate": 7.597595192178702e-06, "loss": 5.465, "step": 84 }, { "epoch": 0.0104765632055875, "grad_norm": 13.721631050109863, "learning_rate": 5.060297685041659e-06, "loss": 5.8636, "step": 87 }, { "epoch": 0.010837824005780173, "grad_norm": 13.445150375366211, "learning_rate": 3.0153689607045845e-06, "loss": 5.7462, "step": 90 }, { "epoch": 0.010837824005780173, "eval_loss": 1.4135538339614868, "eval_runtime": 1984.5973, "eval_samples_per_second": 7.048, "eval_steps_per_second": 0.881, "step": 90 }, { "epoch": 0.011199084805972846, "grad_norm": 12.587129592895508, "learning_rate": 1.4852136862001764e-06, "loss": 5.5065, "step": 93 }, { "epoch": 0.011560345606165518, "grad_norm": 13.361394882202148, "learning_rate": 4.865965629214819e-07, "loss": 6.0172, "step": 96 }, { "epoch": 0.01192160640635819, "grad_norm": 13.288008689880371, "learning_rate": 3.04586490452119e-08, "loss": 5.6398, "step": 99 }, { "epoch": 0.01192160640635819, "eval_loss": 1.4129655361175537, "eval_runtime": 1979.296, "eval_samples_per_second": 7.067, "eval_steps_per_second": 0.884, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.812521215950848e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }