{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0785391714117416, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007853917141174161, "eval_loss": 8.724822998046875, "eval_runtime": 233.5003, "eval_samples_per_second": 19.392, "eval_steps_per_second": 2.424, "step": 1 }, { "epoch": 0.0039269585705870805, "grad_norm": 6.884418964385986, "learning_rate": 3.75e-05, "loss": 8.547, "step": 5 }, { "epoch": 0.007068525427056745, "eval_loss": 6.619219779968262, "eval_runtime": 233.7681, "eval_samples_per_second": 19.37, "eval_steps_per_second": 2.421, "step": 9 }, { "epoch": 0.007853917141174161, "grad_norm": 9.305004119873047, "learning_rate": 7.5e-05, "loss": 7.0279, "step": 10 }, { "epoch": 0.011780875711761242, "grad_norm": 8.224003791809082, "learning_rate": 0.0001125, "loss": 5.2814, "step": 15 }, { "epoch": 0.01413705085411349, "eval_loss": 4.465071678161621, "eval_runtime": 233.7503, "eval_samples_per_second": 19.371, "eval_steps_per_second": 2.421, "step": 18 }, { "epoch": 0.015707834282348322, "grad_norm": 8.007319450378418, "learning_rate": 0.00015, "loss": 4.5609, "step": 20 }, { "epoch": 0.0196347928529354, "grad_norm": 8.696537017822266, "learning_rate": 0.00014855889603024227, "loss": 4.2927, "step": 25 }, { "epoch": 0.021205576281170233, "eval_loss": 4.00045919418335, "eval_runtime": 233.7374, "eval_samples_per_second": 19.372, "eval_steps_per_second": 2.422, "step": 27 }, { "epoch": 0.023561751423522483, "grad_norm": 6.938048839569092, "learning_rate": 0.0001442909649383465, "loss": 4.0025, "step": 30 }, { "epoch": 0.027488709994109562, "grad_norm": 7.009843826293945, "learning_rate": 0.0001373602209226909, "loss": 3.8531, "step": 35 }, { "epoch": 0.02827410170822698, "eval_loss": 3.82582426071167, "eval_runtime": 233.7172, "eval_samples_per_second": 19.374, "eval_steps_per_second": 2.422, "step": 36 }, { "epoch": 0.031415668564696644, "grad_norm": 6.201370716094971, "learning_rate": 0.00012803300858899104, "loss": 3.719, "step": 40 }, { "epoch": 0.03534262713528372, "grad_norm": 5.8150105476379395, "learning_rate": 0.00011666776747647015, "loss": 3.6351, "step": 45 }, { "epoch": 0.03534262713528372, "eval_loss": 3.6261699199676514, "eval_runtime": 233.6588, "eval_samples_per_second": 19.379, "eval_steps_per_second": 2.422, "step": 45 }, { "epoch": 0.0392695857058708, "grad_norm": 4.917875289916992, "learning_rate": 0.00010370125742738173, "loss": 3.6745, "step": 50 }, { "epoch": 0.042411152562340466, "eval_loss": 3.5405988693237305, "eval_runtime": 233.7238, "eval_samples_per_second": 19.373, "eval_steps_per_second": 2.422, "step": 54 }, { "epoch": 0.04319654427645788, "grad_norm": 4.494648456573486, "learning_rate": 8.963177415120962e-05, "loss": 3.5363, "step": 55 }, { "epoch": 0.04712350284704497, "grad_norm": 6.017573833465576, "learning_rate": 7.5e-05, "loss": 3.2906, "step": 60 }, { "epoch": 0.04947967798939721, "eval_loss": 3.450070381164551, "eval_runtime": 233.7549, "eval_samples_per_second": 19.371, "eval_steps_per_second": 2.421, "step": 63 }, { "epoch": 0.051050461417632045, "grad_norm": 4.84181022644043, "learning_rate": 6.036822584879038e-05, "loss": 3.3757, "step": 65 }, { "epoch": 0.054977419988219124, "grad_norm": 6.739719390869141, "learning_rate": 4.6298742572618266e-05, "loss": 3.4344, "step": 70 }, { "epoch": 0.05654820341645396, "eval_loss": 3.3793962001800537, "eval_runtime": 233.8278, "eval_samples_per_second": 19.365, "eval_steps_per_second": 2.421, "step": 72 }, { "epoch": 0.0589043785588062, "grad_norm": 4.579084873199463, "learning_rate": 3.333223252352985e-05, "loss": 3.3768, "step": 75 }, { "epoch": 0.06283133712939329, "grad_norm": 5.169261932373047, "learning_rate": 2.1966991411008938e-05, "loss": 3.4602, "step": 80 }, { "epoch": 0.0636167288435107, "eval_loss": 3.332181453704834, "eval_runtime": 233.7069, "eval_samples_per_second": 19.375, "eval_steps_per_second": 2.422, "step": 81 }, { "epoch": 0.06675829569998036, "grad_norm": 5.313036918640137, "learning_rate": 1.2639779077309098e-05, "loss": 3.3791, "step": 85 }, { "epoch": 0.07068525427056745, "grad_norm": 4.830722332000732, "learning_rate": 5.709035061653494e-06, "loss": 3.0172, "step": 90 }, { "epoch": 0.07068525427056745, "eval_loss": 3.3156657218933105, "eval_runtime": 233.6444, "eval_samples_per_second": 19.38, "eval_steps_per_second": 2.422, "step": 90 }, { "epoch": 0.07461221284115453, "grad_norm": 5.533350944519043, "learning_rate": 1.4411039697577175e-06, "loss": 3.4561, "step": 95 }, { "epoch": 0.07775377969762419, "eval_loss": 3.3061177730560303, "eval_runtime": 233.6519, "eval_samples_per_second": 19.379, "eval_steps_per_second": 2.422, "step": 99 }, { "epoch": 0.0785391714117416, "grad_norm": 4.896818161010742, "learning_rate": 0.0, "loss": 2.9994, "step": 100 } ], "logging_steps": 5, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.508513578745856e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }