{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.13311148086522462, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013311148086522463, "eval_loss": 5.295779705047607, "eval_runtime": 93.7316, "eval_samples_per_second": 13.496, "eval_steps_per_second": 1.696, "step": 1 }, { "epoch": 0.003993344425956739, "grad_norm": 6.330746650695801, "learning_rate": 3e-05, "loss": 5.2865, "step": 3 }, { "epoch": 0.007986688851913478, "grad_norm": 5.513250350952148, "learning_rate": 6e-05, "loss": 4.7382, "step": 6 }, { "epoch": 0.011980033277870216, "grad_norm": 4.898468017578125, "learning_rate": 9e-05, "loss": 3.1585, "step": 9 }, { "epoch": 0.011980033277870216, "eval_loss": 2.123068332672119, "eval_runtime": 94.2958, "eval_samples_per_second": 13.415, "eval_steps_per_second": 1.686, "step": 9 }, { "epoch": 0.015973377703826955, "grad_norm": 5.28930139541626, "learning_rate": 9.987820251299122e-05, "loss": 2.0198, "step": 12 }, { "epoch": 0.019966722129783693, "grad_norm": 5.305483341217041, "learning_rate": 9.924038765061042e-05, "loss": 1.5995, "step": 15 }, { "epoch": 0.02396006655574043, "grad_norm": 3.3500654697418213, "learning_rate": 9.806308479691595e-05, "loss": 1.2395, "step": 18 }, { "epoch": 0.02396006655574043, "eval_loss": 0.9006592631340027, "eval_runtime": 94.3026, "eval_samples_per_second": 13.414, "eval_steps_per_second": 1.686, "step": 18 }, { "epoch": 0.027953410981697173, "grad_norm": 2.782661199569702, "learning_rate": 9.635919272833938e-05, "loss": 0.8408, "step": 21 }, { "epoch": 0.03194675540765391, "grad_norm": 4.695487022399902, "learning_rate": 9.414737964294636e-05, "loss": 0.5291, "step": 24 }, { "epoch": 0.03594009983361065, "grad_norm": 4.321674346923828, "learning_rate": 9.145187862775209e-05, "loss": 0.5393, "step": 27 }, { "epoch": 0.03594009983361065, "eval_loss": 0.36986345052719116, "eval_runtime": 94.3472, "eval_samples_per_second": 13.408, "eval_steps_per_second": 1.685, "step": 27 }, { "epoch": 0.03993344425956739, "grad_norm": 2.1721081733703613, "learning_rate": 8.83022221559489e-05, "loss": 0.3295, "step": 30 }, { "epoch": 0.043926788685524125, "grad_norm": 1.9686952829360962, "learning_rate": 8.473291852294987e-05, "loss": 0.2444, "step": 33 }, { "epoch": 0.04792013311148086, "grad_norm": 3.1062581539154053, "learning_rate": 8.07830737662829e-05, "loss": 0.3405, "step": 36 }, { "epoch": 0.04792013311148086, "eval_loss": 0.22597768902778625, "eval_runtime": 94.2858, "eval_samples_per_second": 13.417, "eval_steps_per_second": 1.686, "step": 36 }, { "epoch": 0.0519134775374376, "grad_norm": 2.1065447330474854, "learning_rate": 7.649596321166024e-05, "loss": 0.2113, "step": 39 }, { "epoch": 0.055906821963394346, "grad_norm": 2.7513046264648438, "learning_rate": 7.191855733945387e-05, "loss": 0.1744, "step": 42 }, { "epoch": 0.059900166389351084, "grad_norm": 2.247255802154541, "learning_rate": 6.710100716628344e-05, "loss": 0.1895, "step": 45 }, { "epoch": 0.059900166389351084, "eval_loss": 0.15109963715076447, "eval_runtime": 94.3125, "eval_samples_per_second": 13.413, "eval_steps_per_second": 1.686, "step": 45 }, { "epoch": 0.06389351081530782, "grad_norm": 2.3737082481384277, "learning_rate": 6.209609477998338e-05, "loss": 0.2152, "step": 48 }, { "epoch": 0.06788685524126456, "grad_norm": 2.481839656829834, "learning_rate": 5.695865504800327e-05, "loss": 0.1393, "step": 51 }, { "epoch": 0.0718801996672213, "grad_norm": 4.425018787384033, "learning_rate": 5.174497483512506e-05, "loss": 0.159, "step": 54 }, { "epoch": 0.0718801996672213, "eval_loss": 0.12050312012434006, "eval_runtime": 94.2539, "eval_samples_per_second": 13.421, "eval_steps_per_second": 1.687, "step": 54 }, { "epoch": 0.07587354409317804, "grad_norm": 1.1071043014526367, "learning_rate": 4.6512176312793736e-05, "loss": 0.1544, "step": 57 }, { "epoch": 0.07986688851913477, "grad_norm": 1.475027322769165, "learning_rate": 4.131759111665349e-05, "loss": 0.0855, "step": 60 }, { "epoch": 0.08386023294509151, "grad_norm": 1.2657231092453003, "learning_rate": 3.6218132209150045e-05, "loss": 0.1186, "step": 63 }, { "epoch": 0.08386023294509151, "eval_loss": 0.10414409637451172, "eval_runtime": 94.3116, "eval_samples_per_second": 13.413, "eval_steps_per_second": 1.686, "step": 63 }, { "epoch": 0.08785357737104825, "grad_norm": 2.2030110359191895, "learning_rate": 3.12696703292044e-05, "loss": 0.1065, "step": 66 }, { "epoch": 0.09184692179700499, "grad_norm": 1.347221851348877, "learning_rate": 2.6526421860705473e-05, "loss": 0.0879, "step": 69 }, { "epoch": 0.09584026622296173, "grad_norm": 1.6131019592285156, "learning_rate": 2.2040354826462668e-05, "loss": 0.1761, "step": 72 }, { "epoch": 0.09584026622296173, "eval_loss": 0.08786367624998093, "eval_runtime": 94.295, "eval_samples_per_second": 13.415, "eval_steps_per_second": 1.686, "step": 72 }, { "epoch": 0.09983361064891846, "grad_norm": 1.8289293050765991, "learning_rate": 1.7860619515673033e-05, "loss": 0.0966, "step": 75 }, { "epoch": 0.1038269550748752, "grad_norm": 1.5840426683425903, "learning_rate": 1.4033009983067452e-05, "loss": 0.0604, "step": 78 }, { "epoch": 0.10782029950083195, "grad_norm": 1.362238883972168, "learning_rate": 1.0599462319663905e-05, "loss": 0.0767, "step": 81 }, { "epoch": 0.10782029950083195, "eval_loss": 0.08052682131528854, "eval_runtime": 94.2717, "eval_samples_per_second": 13.419, "eval_steps_per_second": 1.687, "step": 81 }, { "epoch": 0.11181364392678869, "grad_norm": 2.0179669857025146, "learning_rate": 7.597595192178702e-06, "loss": 0.0702, "step": 84 }, { "epoch": 0.11580698835274543, "grad_norm": 1.466431975364685, "learning_rate": 5.060297685041659e-06, "loss": 0.0989, "step": 87 }, { "epoch": 0.11980033277870217, "grad_norm": 1.6172847747802734, "learning_rate": 3.0153689607045845e-06, "loss": 0.1468, "step": 90 }, { "epoch": 0.11980033277870217, "eval_loss": 0.07370365411043167, "eval_runtime": 94.2945, "eval_samples_per_second": 13.415, "eval_steps_per_second": 1.686, "step": 90 }, { "epoch": 0.1237936772046589, "grad_norm": 0.7145574688911438, "learning_rate": 1.4852136862001764e-06, "loss": 0.0618, "step": 93 }, { "epoch": 0.12778702163061564, "grad_norm": 0.8748223185539246, "learning_rate": 4.865965629214819e-07, "loss": 0.0521, "step": 96 }, { "epoch": 0.13178036605657237, "grad_norm": 1.464680790901184, "learning_rate": 3.04586490452119e-08, "loss": 0.0948, "step": 99 }, { "epoch": 0.13178036605657237, "eval_loss": 0.07193463295698166, "eval_runtime": 94.3059, "eval_samples_per_second": 13.414, "eval_steps_per_second": 1.686, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.345767604224e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }