{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.7467248908296944,
  "eval_steps": 9,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017467248908296942,
      "eval_loss": 2.185687303543091,
      "eval_runtime": 8.5055,
      "eval_samples_per_second": 11.404,
      "eval_steps_per_second": 1.528,
      "step": 1
    },
    {
      "epoch": 0.05240174672489083,
      "grad_norm": 3.7532355785369873,
      "learning_rate": 3e-05,
      "loss": 2.1745,
      "step": 3
    },
    {
      "epoch": 0.10480349344978165,
      "grad_norm": 3.853607177734375,
      "learning_rate": 6e-05,
      "loss": 2.0202,
      "step": 6
    },
    {
      "epoch": 0.1572052401746725,
      "grad_norm": 2.7308859825134277,
      "learning_rate": 9e-05,
      "loss": 1.2437,
      "step": 9
    },
    {
      "epoch": 0.1572052401746725,
      "eval_loss": 0.7501007318496704,
      "eval_runtime": 8.6626,
      "eval_samples_per_second": 11.198,
      "eval_steps_per_second": 1.501,
      "step": 9
    },
    {
      "epoch": 0.2096069868995633,
      "grad_norm": 1.2397022247314453,
      "learning_rate": 9.987820251299122e-05,
      "loss": 0.6726,
      "step": 12
    },
    {
      "epoch": 0.26200873362445415,
      "grad_norm": 0.9129244089126587,
      "learning_rate": 9.924038765061042e-05,
      "loss": 0.4927,
      "step": 15
    },
    {
      "epoch": 0.314410480349345,
      "grad_norm": 0.7560355067253113,
      "learning_rate": 9.806308479691595e-05,
      "loss": 0.4404,
      "step": 18
    },
    {
      "epoch": 0.314410480349345,
      "eval_loss": 0.3994991183280945,
      "eval_runtime": 8.7008,
      "eval_samples_per_second": 11.148,
      "eval_steps_per_second": 1.494,
      "step": 18
    },
    {
      "epoch": 0.36681222707423583,
      "grad_norm": 0.7913711071014404,
      "learning_rate": 9.635919272833938e-05,
      "loss": 0.3265,
      "step": 21
    },
    {
      "epoch": 0.4192139737991266,
      "grad_norm": 0.6146355271339417,
      "learning_rate": 9.414737964294636e-05,
      "loss": 0.3328,
      "step": 24
    },
    {
      "epoch": 0.47161572052401746,
      "grad_norm": 0.762830913066864,
      "learning_rate": 9.145187862775209e-05,
      "loss": 0.3289,
      "step": 27
    },
    {
      "epoch": 0.47161572052401746,
      "eval_loss": 0.3602748215198517,
      "eval_runtime": 8.7103,
      "eval_samples_per_second": 11.136,
      "eval_steps_per_second": 1.492,
      "step": 27
    },
    {
      "epoch": 0.5240174672489083,
      "grad_norm": 0.7961021661758423,
      "learning_rate": 8.83022221559489e-05,
      "loss": 0.3447,
      "step": 30
    },
    {
      "epoch": 0.5764192139737991,
      "grad_norm": 0.6712778210639954,
      "learning_rate": 8.473291852294987e-05,
      "loss": 0.3778,
      "step": 33
    },
    {
      "epoch": 0.62882096069869,
      "grad_norm": 0.5632585883140564,
      "learning_rate": 8.07830737662829e-05,
      "loss": 0.2967,
      "step": 36
    },
    {
      "epoch": 0.62882096069869,
      "eval_loss": 0.3325504958629608,
      "eval_runtime": 8.6977,
      "eval_samples_per_second": 11.152,
      "eval_steps_per_second": 1.495,
      "step": 36
    },
    {
      "epoch": 0.6812227074235808,
      "grad_norm": 0.4797973930835724,
      "learning_rate": 7.649596321166024e-05,
      "loss": 0.2961,
      "step": 39
    },
    {
      "epoch": 0.7336244541484717,
      "grad_norm": 0.5742776989936829,
      "learning_rate": 7.191855733945387e-05,
      "loss": 0.3068,
      "step": 42
    },
    {
      "epoch": 0.7860262008733624,
      "grad_norm": 0.6687835454940796,
      "learning_rate": 6.710100716628344e-05,
      "loss": 0.2926,
      "step": 45
    },
    {
      "epoch": 0.7860262008733624,
      "eval_loss": 0.31762516498565674,
      "eval_runtime": 8.7094,
      "eval_samples_per_second": 11.137,
      "eval_steps_per_second": 1.493,
      "step": 45
    },
    {
      "epoch": 0.8384279475982532,
      "grad_norm": 0.8334570527076721,
      "learning_rate": 6.209609477998338e-05,
      "loss": 0.324,
      "step": 48
    },
    {
      "epoch": 0.8908296943231441,
      "grad_norm": 0.730705976486206,
      "learning_rate": 5.695865504800327e-05,
      "loss": 0.2717,
      "step": 51
    },
    {
      "epoch": 0.9432314410480349,
      "grad_norm": 0.7632585763931274,
      "learning_rate": 5.174497483512506e-05,
      "loss": 0.335,
      "step": 54
    },
    {
      "epoch": 0.9432314410480349,
      "eval_loss": 0.31074467301368713,
      "eval_runtime": 8.7163,
      "eval_samples_per_second": 11.129,
      "eval_steps_per_second": 1.491,
      "step": 54
    },
    {
      "epoch": 0.9956331877729258,
      "grad_norm": 0.6203595399856567,
      "learning_rate": 4.6512176312793736e-05,
      "loss": 0.2952,
      "step": 57
    },
    {
      "epoch": 1.0480349344978166,
      "grad_norm": 0.6411959528923035,
      "learning_rate": 4.131759111665349e-05,
      "loss": 0.3086,
      "step": 60
    },
    {
      "epoch": 1.1004366812227073,
      "grad_norm": 0.5493583083152771,
      "learning_rate": 3.6218132209150045e-05,
      "loss": 0.2665,
      "step": 63
    },
    {
      "epoch": 1.1004366812227073,
      "eval_loss": 0.30614474415779114,
      "eval_runtime": 8.7092,
      "eval_samples_per_second": 11.138,
      "eval_steps_per_second": 1.493,
      "step": 63
    },
    {
      "epoch": 1.1528384279475983,
      "grad_norm": 0.8020545840263367,
      "learning_rate": 3.12696703292044e-05,
      "loss": 0.1996,
      "step": 66
    },
    {
      "epoch": 1.205240174672489,
      "grad_norm": 0.5899050235748291,
      "learning_rate": 2.6526421860705473e-05,
      "loss": 0.2316,
      "step": 69
    },
    {
      "epoch": 1.25764192139738,
      "grad_norm": 0.793855607509613,
      "learning_rate": 2.2040354826462668e-05,
      "loss": 0.234,
      "step": 72
    },
    {
      "epoch": 1.25764192139738,
      "eval_loss": 0.3036545515060425,
      "eval_runtime": 8.7238,
      "eval_samples_per_second": 11.119,
      "eval_steps_per_second": 1.49,
      "step": 72
    },
    {
      "epoch": 1.3100436681222707,
      "grad_norm": 0.7186128497123718,
      "learning_rate": 1.7860619515673033e-05,
      "loss": 0.205,
      "step": 75
    },
    {
      "epoch": 1.3624454148471616,
      "grad_norm": 0.621354877948761,
      "learning_rate": 1.4033009983067452e-05,
      "loss": 0.2249,
      "step": 78
    },
    {
      "epoch": 1.4148471615720524,
      "grad_norm": 0.820740818977356,
      "learning_rate": 1.0599462319663905e-05,
      "loss": 0.2493,
      "step": 81
    },
    {
      "epoch": 1.4148471615720524,
      "eval_loss": 0.29848697781562805,
      "eval_runtime": 8.7163,
      "eval_samples_per_second": 11.129,
      "eval_steps_per_second": 1.491,
      "step": 81
    },
    {
      "epoch": 1.467248908296943,
      "grad_norm": 0.5625925660133362,
      "learning_rate": 7.597595192178702e-06,
      "loss": 0.1867,
      "step": 84
    },
    {
      "epoch": 1.519650655021834,
      "grad_norm": 0.6035133004188538,
      "learning_rate": 5.060297685041659e-06,
      "loss": 0.2114,
      "step": 87
    },
    {
      "epoch": 1.572052401746725,
      "grad_norm": 0.598051905632019,
      "learning_rate": 3.0153689607045845e-06,
      "loss": 0.2172,
      "step": 90
    },
    {
      "epoch": 1.572052401746725,
      "eval_loss": 0.29645535349845886,
      "eval_runtime": 8.7188,
      "eval_samples_per_second": 11.125,
      "eval_steps_per_second": 1.491,
      "step": 90
    },
    {
      "epoch": 1.6244541484716157,
      "grad_norm": 0.899366021156311,
      "learning_rate": 1.4852136862001764e-06,
      "loss": 0.2307,
      "step": 93
    },
    {
      "epoch": 1.6768558951965065,
      "grad_norm": 0.6130616068840027,
      "learning_rate": 4.865965629214819e-07,
      "loss": 0.2302,
      "step": 96
    },
    {
      "epoch": 1.7292576419213974,
      "grad_norm": 0.7148019075393677,
      "learning_rate": 3.04586490452119e-08,
      "loss": 0.2076,
      "step": 99
    },
    {
      "epoch": 1.7292576419213974,
      "eval_loss": 0.29628854990005493,
      "eval_runtime": 8.709,
      "eval_samples_per_second": 11.138,
      "eval_steps_per_second": 1.493,
      "step": 99
    }
  ],
  "logging_steps": 3,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 9,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.717877747126108e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}