{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2251279135872655, "eval_steps": 9, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022740193291642978, "eval_loss": 2.2858026027679443, "eval_runtime": 35.4041, "eval_samples_per_second": 20.93, "eval_steps_per_second": 2.627, "step": 1 }, { "epoch": 0.006822057987492893, "grad_norm": 0.60309898853302, "learning_rate": 3e-05, "loss": 2.083, "step": 3 }, { "epoch": 0.013644115974985787, "grad_norm": 0.683789074420929, "learning_rate": 6e-05, "loss": 2.3169, "step": 6 }, { "epoch": 0.02046617396247868, "grad_norm": 0.8379518985748291, "learning_rate": 9e-05, "loss": 2.2734, "step": 9 }, { "epoch": 0.02046617396247868, "eval_loss": 2.204383373260498, "eval_runtime": 35.8511, "eval_samples_per_second": 20.669, "eval_steps_per_second": 2.594, "step": 9 }, { "epoch": 0.027288231949971573, "grad_norm": 0.6264774799346924, "learning_rate": 9.987820251299122e-05, "loss": 2.0739, "step": 12 }, { "epoch": 0.03411028993746447, "grad_norm": 0.5958771705627441, "learning_rate": 9.924038765061042e-05, "loss": 2.0297, "step": 15 }, { "epoch": 0.04093234792495736, "grad_norm": 1.1767833232879639, "learning_rate": 9.806308479691595e-05, "loss": 2.04, "step": 18 }, { "epoch": 0.04093234792495736, "eval_loss": 1.970629334449768, "eval_runtime": 35.8984, "eval_samples_per_second": 20.642, "eval_steps_per_second": 2.591, "step": 18 }, { "epoch": 0.047754405912450254, "grad_norm": 0.9643926620483398, "learning_rate": 9.635919272833938e-05, "loss": 1.8702, "step": 21 }, { "epoch": 0.054576463899943146, "grad_norm": 1.1769558191299438, "learning_rate": 9.414737964294636e-05, "loss": 1.8235, "step": 24 }, { "epoch": 0.061398521887436046, "grad_norm": 1.374096393585205, "learning_rate": 9.145187862775209e-05, "loss": 1.647, "step": 27 }, { "epoch": 0.061398521887436046, "eval_loss": 1.761104941368103, "eval_runtime": 35.916, "eval_samples_per_second": 20.631, "eval_steps_per_second": 2.589, "step": 27 }, { "epoch": 0.06822057987492894, "grad_norm": 0.7258775234222412, "learning_rate": 8.83022221559489e-05, "loss": 1.7701, "step": 30 }, { "epoch": 0.07504263786242182, "grad_norm": 0.5846672654151917, "learning_rate": 8.473291852294987e-05, "loss": 1.5202, "step": 33 }, { "epoch": 0.08186469584991472, "grad_norm": 0.7897035479545593, "learning_rate": 8.07830737662829e-05, "loss": 1.6917, "step": 36 }, { "epoch": 0.08186469584991472, "eval_loss": 1.7188748121261597, "eval_runtime": 35.9555, "eval_samples_per_second": 20.609, "eval_steps_per_second": 2.587, "step": 36 }, { "epoch": 0.08868675383740762, "grad_norm": 0.4667467176914215, "learning_rate": 7.649596321166024e-05, "loss": 1.6435, "step": 39 }, { "epoch": 0.09550881182490051, "grad_norm": 0.4430086612701416, "learning_rate": 7.191855733945387e-05, "loss": 1.6451, "step": 42 }, { "epoch": 0.10233086981239341, "grad_norm": 0.5567775368690491, "learning_rate": 6.710100716628344e-05, "loss": 1.5085, "step": 45 }, { "epoch": 0.10233086981239341, "eval_loss": 1.696839451789856, "eval_runtime": 35.9155, "eval_samples_per_second": 20.632, "eval_steps_per_second": 2.589, "step": 45 }, { "epoch": 0.10915292779988629, "grad_norm": 0.4994668662548065, "learning_rate": 6.209609477998338e-05, "loss": 1.5962, "step": 48 }, { "epoch": 0.11597498578737919, "grad_norm": 0.4232550859451294, "learning_rate": 5.695865504800327e-05, "loss": 1.7046, "step": 51 }, { "epoch": 0.12279704377487209, "grad_norm": 0.5421936511993408, "learning_rate": 5.174497483512506e-05, "loss": 1.7675, "step": 54 }, { "epoch": 0.12279704377487209, "eval_loss": 1.6868067979812622, "eval_runtime": 35.9161, "eval_samples_per_second": 20.631, "eval_steps_per_second": 2.589, "step": 54 }, { "epoch": 0.129619101762365, "grad_norm": 0.4595838785171509, "learning_rate": 4.6512176312793736e-05, "loss": 1.7213, "step": 57 }, { "epoch": 0.13644115974985788, "grad_norm": 0.3871839642524719, "learning_rate": 4.131759111665349e-05, "loss": 1.6915, "step": 60 }, { "epoch": 0.14326321773735076, "grad_norm": 0.48633936047554016, "learning_rate": 3.6218132209150045e-05, "loss": 1.5636, "step": 63 }, { "epoch": 0.14326321773735076, "eval_loss": 1.6816630363464355, "eval_runtime": 35.9227, "eval_samples_per_second": 20.628, "eval_steps_per_second": 2.589, "step": 63 }, { "epoch": 0.15008527572484365, "grad_norm": 0.3406151235103607, "learning_rate": 3.12696703292044e-05, "loss": 1.7242, "step": 66 }, { "epoch": 0.15690733371233656, "grad_norm": 0.4090305268764496, "learning_rate": 2.6526421860705473e-05, "loss": 1.6906, "step": 69 }, { "epoch": 0.16372939169982945, "grad_norm": 0.39295023679733276, "learning_rate": 2.2040354826462668e-05, "loss": 1.7433, "step": 72 }, { "epoch": 0.16372939169982945, "eval_loss": 1.678359866142273, "eval_runtime": 35.9533, "eval_samples_per_second": 20.61, "eval_steps_per_second": 2.587, "step": 72 }, { "epoch": 0.17055144968732233, "grad_norm": 0.47114595770835876, "learning_rate": 1.7860619515673033e-05, "loss": 1.7471, "step": 75 }, { "epoch": 0.17737350767481525, "grad_norm": 0.4847165644168854, "learning_rate": 1.4033009983067452e-05, "loss": 1.514, "step": 78 }, { "epoch": 0.18419556566230813, "grad_norm": 0.35954052209854126, "learning_rate": 1.0599462319663905e-05, "loss": 1.9685, "step": 81 }, { "epoch": 0.18419556566230813, "eval_loss": 1.6781070232391357, "eval_runtime": 35.9632, "eval_samples_per_second": 20.604, "eval_steps_per_second": 2.586, "step": 81 }, { "epoch": 0.19101762364980102, "grad_norm": 0.39467528462409973, "learning_rate": 7.597595192178702e-06, "loss": 1.7774, "step": 84 }, { "epoch": 0.19783968163729393, "grad_norm": 0.7397677302360535, "learning_rate": 5.060297685041659e-06, "loss": 1.7756, "step": 87 }, { "epoch": 0.20466173962478681, "grad_norm": 0.33035725355148315, "learning_rate": 3.0153689607045845e-06, "loss": 1.9051, "step": 90 }, { "epoch": 0.20466173962478681, "eval_loss": 1.677160620689392, "eval_runtime": 35.9298, "eval_samples_per_second": 20.624, "eval_steps_per_second": 2.588, "step": 90 }, { "epoch": 0.2114837976122797, "grad_norm": 0.4519857168197632, "learning_rate": 1.4852136862001764e-06, "loss": 1.394, "step": 93 }, { "epoch": 0.21830585559977259, "grad_norm": 0.6549511551856995, "learning_rate": 4.865965629214819e-07, "loss": 1.6634, "step": 96 }, { "epoch": 0.2251279135872655, "grad_norm": 0.3595646619796753, "learning_rate": 3.04586490452119e-08, "loss": 1.9199, "step": 99 }, { "epoch": 0.2251279135872655, "eval_loss": 1.6768484115600586, "eval_runtime": 35.9594, "eval_samples_per_second": 20.607, "eval_steps_per_second": 2.586, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.955421302108979e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }