|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.13178036605657237, |
|
"eval_steps": 9, |
|
"global_step": 99, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0013311148086522463, |
|
"eval_loss": 5.295779705047607, |
|
"eval_runtime": 93.7316, |
|
"eval_samples_per_second": 13.496, |
|
"eval_steps_per_second": 1.696, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.003993344425956739, |
|
"grad_norm": 6.330746650695801, |
|
"learning_rate": 3e-05, |
|
"loss": 5.2865, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.007986688851913478, |
|
"grad_norm": 5.513250350952148, |
|
"learning_rate": 6e-05, |
|
"loss": 4.7382, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.011980033277870216, |
|
"grad_norm": 4.898468017578125, |
|
"learning_rate": 9e-05, |
|
"loss": 3.1585, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.011980033277870216, |
|
"eval_loss": 2.123068332672119, |
|
"eval_runtime": 94.2958, |
|
"eval_samples_per_second": 13.415, |
|
"eval_steps_per_second": 1.686, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.015973377703826955, |
|
"grad_norm": 5.28930139541626, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 2.0198, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.019966722129783693, |
|
"grad_norm": 5.305483341217041, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 1.5995, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02396006655574043, |
|
"grad_norm": 3.3500654697418213, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 1.2395, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.02396006655574043, |
|
"eval_loss": 0.9006592631340027, |
|
"eval_runtime": 94.3026, |
|
"eval_samples_per_second": 13.414, |
|
"eval_steps_per_second": 1.686, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.027953410981697173, |
|
"grad_norm": 2.782661199569702, |
|
"learning_rate": 9.635919272833938e-05, |
|
"loss": 0.8408, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.03194675540765391, |
|
"grad_norm": 4.695487022399902, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 0.5291, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.03594009983361065, |
|
"grad_norm": 4.321674346923828, |
|
"learning_rate": 9.145187862775209e-05, |
|
"loss": 0.5393, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.03594009983361065, |
|
"eval_loss": 0.36986345052719116, |
|
"eval_runtime": 94.3472, |
|
"eval_samples_per_second": 13.408, |
|
"eval_steps_per_second": 1.685, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.03993344425956739, |
|
"grad_norm": 2.1721081733703613, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 0.3295, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.043926788685524125, |
|
"grad_norm": 1.9686952829360962, |
|
"learning_rate": 8.473291852294987e-05, |
|
"loss": 0.2444, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.04792013311148086, |
|
"grad_norm": 3.1062581539154053, |
|
"learning_rate": 8.07830737662829e-05, |
|
"loss": 0.3405, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.04792013311148086, |
|
"eval_loss": 0.22597768902778625, |
|
"eval_runtime": 94.2858, |
|
"eval_samples_per_second": 13.417, |
|
"eval_steps_per_second": 1.686, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0519134775374376, |
|
"grad_norm": 2.1065447330474854, |
|
"learning_rate": 7.649596321166024e-05, |
|
"loss": 0.2113, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.055906821963394346, |
|
"grad_norm": 2.7513046264648438, |
|
"learning_rate": 7.191855733945387e-05, |
|
"loss": 0.1744, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.059900166389351084, |
|
"grad_norm": 2.247255802154541, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 0.1895, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.059900166389351084, |
|
"eval_loss": 0.15109963715076447, |
|
"eval_runtime": 94.3125, |
|
"eval_samples_per_second": 13.413, |
|
"eval_steps_per_second": 1.686, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.06389351081530782, |
|
"grad_norm": 2.3737082481384277, |
|
"learning_rate": 6.209609477998338e-05, |
|
"loss": 0.2152, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.06788685524126456, |
|
"grad_norm": 2.481839656829834, |
|
"learning_rate": 5.695865504800327e-05, |
|
"loss": 0.1393, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.0718801996672213, |
|
"grad_norm": 4.425018787384033, |
|
"learning_rate": 5.174497483512506e-05, |
|
"loss": 0.159, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0718801996672213, |
|
"eval_loss": 0.12050312012434006, |
|
"eval_runtime": 94.2539, |
|
"eval_samples_per_second": 13.421, |
|
"eval_steps_per_second": 1.687, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.07587354409317804, |
|
"grad_norm": 1.1071043014526367, |
|
"learning_rate": 4.6512176312793736e-05, |
|
"loss": 0.1544, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.07986688851913477, |
|
"grad_norm": 1.475027322769165, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 0.0855, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08386023294509151, |
|
"grad_norm": 1.2657231092453003, |
|
"learning_rate": 3.6218132209150045e-05, |
|
"loss": 0.1186, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.08386023294509151, |
|
"eval_loss": 0.10414409637451172, |
|
"eval_runtime": 94.3116, |
|
"eval_samples_per_second": 13.413, |
|
"eval_steps_per_second": 1.686, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.08785357737104825, |
|
"grad_norm": 2.2030110359191895, |
|
"learning_rate": 3.12696703292044e-05, |
|
"loss": 0.1065, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.09184692179700499, |
|
"grad_norm": 1.347221851348877, |
|
"learning_rate": 2.6526421860705473e-05, |
|
"loss": 0.0879, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.09584026622296173, |
|
"grad_norm": 1.6131019592285156, |
|
"learning_rate": 2.2040354826462668e-05, |
|
"loss": 0.1761, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.09584026622296173, |
|
"eval_loss": 0.08786367624998093, |
|
"eval_runtime": 94.295, |
|
"eval_samples_per_second": 13.415, |
|
"eval_steps_per_second": 1.686, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.09983361064891846, |
|
"grad_norm": 1.8289293050765991, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 0.0966, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1038269550748752, |
|
"grad_norm": 1.5840426683425903, |
|
"learning_rate": 1.4033009983067452e-05, |
|
"loss": 0.0604, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.10782029950083195, |
|
"grad_norm": 1.362238883972168, |
|
"learning_rate": 1.0599462319663905e-05, |
|
"loss": 0.0767, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.10782029950083195, |
|
"eval_loss": 0.08052682131528854, |
|
"eval_runtime": 94.2717, |
|
"eval_samples_per_second": 13.419, |
|
"eval_steps_per_second": 1.687, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.11181364392678869, |
|
"grad_norm": 2.0179669857025146, |
|
"learning_rate": 7.597595192178702e-06, |
|
"loss": 0.0702, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.11580698835274543, |
|
"grad_norm": 1.466431975364685, |
|
"learning_rate": 5.060297685041659e-06, |
|
"loss": 0.0989, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.11980033277870217, |
|
"grad_norm": 1.6172847747802734, |
|
"learning_rate": 3.0153689607045845e-06, |
|
"loss": 0.1468, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11980033277870217, |
|
"eval_loss": 0.07370365411043167, |
|
"eval_runtime": 94.2945, |
|
"eval_samples_per_second": 13.415, |
|
"eval_steps_per_second": 1.686, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1237936772046589, |
|
"grad_norm": 0.7145574688911438, |
|
"learning_rate": 1.4852136862001764e-06, |
|
"loss": 0.0618, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.12778702163061564, |
|
"grad_norm": 0.8748223185539246, |
|
"learning_rate": 4.865965629214819e-07, |
|
"loss": 0.0521, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.13178036605657237, |
|
"grad_norm": 1.464680790901184, |
|
"learning_rate": 3.04586490452119e-08, |
|
"loss": 0.0948, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.13178036605657237, |
|
"eval_loss": 0.07193463295698166, |
|
"eval_runtime": 94.3059, |
|
"eval_samples_per_second": 13.414, |
|
"eval_steps_per_second": 1.686, |
|
"step": 99 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 9, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.33230992818176e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|