{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7467248908296944, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017467248908296942, "eval_loss": 2.185687303543091, "eval_runtime": 8.5055, "eval_samples_per_second": 11.404, "eval_steps_per_second": 1.528, "step": 1 }, { "epoch": 0.05240174672489083, "grad_norm": 3.7532355785369873, "learning_rate": 3e-05, "loss": 2.1745, "step": 3 }, { "epoch": 0.10480349344978165, "grad_norm": 3.853607177734375, "learning_rate": 6e-05, "loss": 2.0202, "step": 6 }, { "epoch": 0.1572052401746725, "grad_norm": 2.7308859825134277, "learning_rate": 9e-05, "loss": 1.2437, "step": 9 }, { "epoch": 0.1572052401746725, "eval_loss": 0.7501007318496704, "eval_runtime": 8.6626, "eval_samples_per_second": 11.198, "eval_steps_per_second": 1.501, "step": 9 }, { "epoch": 0.2096069868995633, "grad_norm": 1.2397022247314453, "learning_rate": 9.987820251299122e-05, "loss": 0.6726, "step": 12 }, { "epoch": 0.26200873362445415, "grad_norm": 0.9129244089126587, "learning_rate": 9.924038765061042e-05, "loss": 0.4927, "step": 15 }, { "epoch": 0.314410480349345, "grad_norm": 0.7560355067253113, "learning_rate": 9.806308479691595e-05, "loss": 0.4404, "step": 18 }, { "epoch": 0.314410480349345, "eval_loss": 0.3994991183280945, "eval_runtime": 8.7008, "eval_samples_per_second": 11.148, "eval_steps_per_second": 1.494, "step": 18 }, { "epoch": 0.36681222707423583, "grad_norm": 0.7913711071014404, "learning_rate": 9.635919272833938e-05, "loss": 0.3265, "step": 21 }, { "epoch": 0.4192139737991266, "grad_norm": 0.6146355271339417, "learning_rate": 9.414737964294636e-05, "loss": 0.3328, "step": 24 }, { "epoch": 0.47161572052401746, "grad_norm": 0.762830913066864, "learning_rate": 9.145187862775209e-05, "loss": 0.3289, "step": 27 }, { "epoch": 0.47161572052401746, "eval_loss": 0.3602748215198517, "eval_runtime": 8.7103, "eval_samples_per_second": 11.136, "eval_steps_per_second": 1.492, "step": 27 }, { "epoch": 0.5240174672489083, "grad_norm": 0.7961021661758423, "learning_rate": 8.83022221559489e-05, "loss": 0.3447, "step": 30 }, { "epoch": 0.5764192139737991, "grad_norm": 0.6712778210639954, "learning_rate": 8.473291852294987e-05, "loss": 0.3778, "step": 33 }, { "epoch": 0.62882096069869, "grad_norm": 0.5632585883140564, "learning_rate": 8.07830737662829e-05, "loss": 0.2967, "step": 36 }, { "epoch": 0.62882096069869, "eval_loss": 0.3325504958629608, "eval_runtime": 8.6977, "eval_samples_per_second": 11.152, "eval_steps_per_second": 1.495, "step": 36 }, { "epoch": 0.6812227074235808, "grad_norm": 0.4797973930835724, "learning_rate": 7.649596321166024e-05, "loss": 0.2961, "step": 39 }, { "epoch": 0.7336244541484717, "grad_norm": 0.5742776989936829, "learning_rate": 7.191855733945387e-05, "loss": 0.3068, "step": 42 }, { "epoch": 0.7860262008733624, "grad_norm": 0.6687835454940796, "learning_rate": 6.710100716628344e-05, "loss": 0.2926, "step": 45 }, { "epoch": 0.7860262008733624, "eval_loss": 0.31762516498565674, "eval_runtime": 8.7094, "eval_samples_per_second": 11.137, "eval_steps_per_second": 1.493, "step": 45 }, { "epoch": 0.8384279475982532, "grad_norm": 0.8334570527076721, "learning_rate": 6.209609477998338e-05, "loss": 0.324, "step": 48 }, { "epoch": 0.8908296943231441, "grad_norm": 0.730705976486206, "learning_rate": 5.695865504800327e-05, "loss": 0.2717, "step": 51 }, { "epoch": 0.9432314410480349, "grad_norm": 0.7632585763931274, "learning_rate": 5.174497483512506e-05, "loss": 0.335, "step": 54 }, { "epoch": 0.9432314410480349, "eval_loss": 0.31074467301368713, "eval_runtime": 8.7163, "eval_samples_per_second": 11.129, "eval_steps_per_second": 1.491, "step": 54 }, { "epoch": 0.9956331877729258, "grad_norm": 0.6203595399856567, "learning_rate": 4.6512176312793736e-05, "loss": 0.2952, "step": 57 }, { "epoch": 1.0480349344978166, "grad_norm": 0.6411959528923035, "learning_rate": 4.131759111665349e-05, "loss": 0.3086, "step": 60 }, { "epoch": 1.1004366812227073, "grad_norm": 0.5493583083152771, "learning_rate": 3.6218132209150045e-05, "loss": 0.2665, "step": 63 }, { "epoch": 1.1004366812227073, "eval_loss": 0.30614474415779114, "eval_runtime": 8.7092, "eval_samples_per_second": 11.138, "eval_steps_per_second": 1.493, "step": 63 }, { "epoch": 1.1528384279475983, "grad_norm": 0.8020545840263367, "learning_rate": 3.12696703292044e-05, "loss": 0.1996, "step": 66 }, { "epoch": 1.205240174672489, "grad_norm": 0.5899050235748291, "learning_rate": 2.6526421860705473e-05, "loss": 0.2316, "step": 69 }, { "epoch": 1.25764192139738, "grad_norm": 0.793855607509613, "learning_rate": 2.2040354826462668e-05, "loss": 0.234, "step": 72 }, { "epoch": 1.25764192139738, "eval_loss": 0.3036545515060425, "eval_runtime": 8.7238, "eval_samples_per_second": 11.119, "eval_steps_per_second": 1.49, "step": 72 }, { "epoch": 1.3100436681222707, "grad_norm": 0.7186128497123718, "learning_rate": 1.7860619515673033e-05, "loss": 0.205, "step": 75 }, { "epoch": 1.3624454148471616, "grad_norm": 0.621354877948761, "learning_rate": 1.4033009983067452e-05, "loss": 0.2249, "step": 78 }, { "epoch": 1.4148471615720524, "grad_norm": 0.820740818977356, "learning_rate": 1.0599462319663905e-05, "loss": 0.2493, "step": 81 }, { "epoch": 1.4148471615720524, "eval_loss": 0.29848697781562805, "eval_runtime": 8.7163, "eval_samples_per_second": 11.129, "eval_steps_per_second": 1.491, "step": 81 }, { "epoch": 1.467248908296943, "grad_norm": 0.5625925660133362, "learning_rate": 7.597595192178702e-06, "loss": 0.1867, "step": 84 }, { "epoch": 1.519650655021834, "grad_norm": 0.6035133004188538, "learning_rate": 5.060297685041659e-06, "loss": 0.2114, "step": 87 }, { "epoch": 1.572052401746725, "grad_norm": 0.598051905632019, "learning_rate": 3.0153689607045845e-06, "loss": 0.2172, "step": 90 }, { "epoch": 1.572052401746725, "eval_loss": 0.29645535349845886, "eval_runtime": 8.7188, "eval_samples_per_second": 11.125, "eval_steps_per_second": 1.491, "step": 90 }, { "epoch": 1.6244541484716157, "grad_norm": 0.899366021156311, "learning_rate": 1.4852136862001764e-06, "loss": 0.2307, "step": 93 }, { "epoch": 1.6768558951965065, "grad_norm": 0.6130616068840027, "learning_rate": 4.865965629214819e-07, "loss": 0.2302, "step": 96 }, { "epoch": 1.7292576419213974, "grad_norm": 0.7148019075393677, "learning_rate": 3.04586490452119e-08, "loss": 0.2076, "step": 99 }, { "epoch": 1.7292576419213974, "eval_loss": 0.29628854990005493, "eval_runtime": 8.709, "eval_samples_per_second": 11.138, "eval_steps_per_second": 1.493, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.717877747126108e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }