{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1511144692104269, "eval_steps": 17, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007555723460521345, "eval_loss": 12.450613975524902, "eval_runtime": 61.6454, "eval_samples_per_second": 36.175, "eval_steps_per_second": 4.526, "step": 1 }, { "epoch": 0.0022667170381564035, "grad_norm": 0.009174207225441933, "learning_rate": 3e-05, "loss": 12.453, "step": 3 }, { "epoch": 0.004533434076312807, "grad_norm": 0.009654853492975235, "learning_rate": 6e-05, "loss": 12.4526, "step": 6 }, { "epoch": 0.00680015111446921, "grad_norm": 0.008986242115497589, "learning_rate": 9e-05, "loss": 12.4507, "step": 9 }, { "epoch": 0.009066868152625614, "grad_norm": 0.009755842387676239, "learning_rate": 9.997266286704631e-05, "loss": 12.4496, "step": 12 }, { "epoch": 0.011333585190782017, "grad_norm": 0.011493120342493057, "learning_rate": 9.98292246503335e-05, "loss": 12.4487, "step": 15 }, { "epoch": 0.012844729882886286, "eval_loss": 12.450437545776367, "eval_runtime": 61.7871, "eval_samples_per_second": 36.092, "eval_steps_per_second": 4.516, "step": 17 }, { "epoch": 0.01360030222893842, "grad_norm": 0.009524165652692318, "learning_rate": 9.956320346634876e-05, "loss": 12.454, "step": 18 }, { "epoch": 0.015867019267094825, "grad_norm": 0.012806428596377373, "learning_rate": 9.917525374361912e-05, "loss": 12.4518, "step": 21 }, { "epoch": 0.018133736305251228, "grad_norm": 0.011713715270161629, "learning_rate": 9.86663298624003e-05, "loss": 12.4506, "step": 24 }, { "epoch": 0.02040045334340763, "grad_norm": 0.010967453010380268, "learning_rate": 9.803768380684242e-05, "loss": 12.4491, "step": 27 }, { "epoch": 0.022667170381564034, "grad_norm": 0.0131283700466156, "learning_rate": 9.729086208503174e-05, "loss": 12.4512, "step": 30 }, { "epoch": 0.024933887419720437, "grad_norm": 0.012522528879344463, "learning_rate": 9.642770192448536e-05, "loss": 12.4509, "step": 33 }, { "epoch": 0.02568945976577257, "eval_loss": 12.45008373260498, "eval_runtime": 61.8082, "eval_samples_per_second": 36.079, "eval_steps_per_second": 4.514, "step": 34 }, { "epoch": 0.02720060445787684, "grad_norm": 0.013970930129289627, "learning_rate": 9.545032675245813e-05, "loss": 12.4541, "step": 36 }, { "epoch": 0.029467321496033247, "grad_norm": 0.014797762967646122, "learning_rate": 9.43611409721806e-05, "loss": 12.4476, "step": 39 }, { "epoch": 0.03173403853418965, "grad_norm": 0.01638209819793701, "learning_rate": 9.316282404787871e-05, "loss": 12.4498, "step": 42 }, { "epoch": 0.03400075557234605, "grad_norm": 0.017414981499314308, "learning_rate": 9.185832391312644e-05, "loss": 12.4489, "step": 45 }, { "epoch": 0.036267472610502456, "grad_norm": 0.01976948417723179, "learning_rate": 9.045084971874738e-05, "loss": 12.4498, "step": 48 }, { "epoch": 0.03853418964865886, "grad_norm": 0.017705973237752914, "learning_rate": 8.894386393810563e-05, "loss": 12.4478, "step": 51 }, { "epoch": 0.03853418964865886, "eval_loss": 12.449511528015137, "eval_runtime": 61.8317, "eval_samples_per_second": 36.066, "eval_steps_per_second": 4.512, "step": 51 }, { "epoch": 0.04080090668681526, "grad_norm": 0.018775586038827896, "learning_rate": 8.73410738492077e-05, "loss": 12.4476, "step": 54 }, { "epoch": 0.043067623724971665, "grad_norm": 0.020082606002688408, "learning_rate": 8.564642241456986e-05, "loss": 12.4519, "step": 57 }, { "epoch": 0.04533434076312807, "grad_norm": 0.01945601962506771, "learning_rate": 8.386407858128706e-05, "loss": 12.4513, "step": 60 }, { "epoch": 0.04760105780128447, "grad_norm": 0.02504323236644268, "learning_rate": 8.199842702516583e-05, "loss": 12.451, "step": 63 }, { "epoch": 0.049867774839440875, "grad_norm": 0.02477116324007511, "learning_rate": 8.005405736415126e-05, "loss": 12.4508, "step": 66 }, { "epoch": 0.05137891953154514, "eval_loss": 12.448628425598145, "eval_runtime": 61.7963, "eval_samples_per_second": 36.086, "eval_steps_per_second": 4.515, "step": 68 }, { "epoch": 0.05213449187759728, "grad_norm": 0.02452562004327774, "learning_rate": 7.803575286758364e-05, "loss": 12.4501, "step": 69 }, { "epoch": 0.05440120891575368, "grad_norm": 0.024608276784420013, "learning_rate": 7.594847868906076e-05, "loss": 12.449, "step": 72 }, { "epoch": 0.056667925953910084, "grad_norm": 0.025729818269610405, "learning_rate": 7.379736965185368e-05, "loss": 12.4491, "step": 75 }, { "epoch": 0.058934642992066494, "grad_norm": 0.026810096576809883, "learning_rate": 7.158771761692464e-05, "loss": 12.4492, "step": 78 }, { "epoch": 0.0612013600302229, "grad_norm": 0.030193008482456207, "learning_rate": 6.932495846462261e-05, "loss": 12.4488, "step": 81 }, { "epoch": 0.0634680770683793, "grad_norm": 0.0322081558406353, "learning_rate": 6.701465872208216e-05, "loss": 12.4457, "step": 84 }, { "epoch": 0.06422364941443143, "eval_loss": 12.447479248046875, "eval_runtime": 61.6601, "eval_samples_per_second": 36.166, "eval_steps_per_second": 4.525, "step": 85 }, { "epoch": 0.0657347941065357, "grad_norm": 0.03805829957127571, "learning_rate": 6.466250186922325e-05, "loss": 12.4443, "step": 87 }, { "epoch": 0.0680015111446921, "grad_norm": 0.03595505282282829, "learning_rate": 6.227427435703997e-05, "loss": 12.4494, "step": 90 }, { "epoch": 0.0702682281828485, "grad_norm": 0.03157583624124527, "learning_rate": 5.985585137257401e-05, "loss": 12.446, "step": 93 }, { "epoch": 0.07253494522100491, "grad_norm": 0.03606283292174339, "learning_rate": 5.74131823855921e-05, "loss": 12.4462, "step": 96 }, { "epoch": 0.07480166225916131, "grad_norm": 0.03671179711818695, "learning_rate": 5.495227651252315e-05, "loss": 12.4473, "step": 99 }, { "epoch": 0.07706837929731772, "grad_norm": 0.042302753776311874, "learning_rate": 5.247918773366112e-05, "loss": 12.4479, "step": 102 }, { "epoch": 0.07706837929731772, "eval_loss": 12.446091651916504, "eval_runtime": 61.8106, "eval_samples_per_second": 36.078, "eval_steps_per_second": 4.514, "step": 102 }, { "epoch": 0.07933509633547413, "grad_norm": 0.04813135042786598, "learning_rate": 5e-05, "loss": 12.4455, "step": 105 }, { "epoch": 0.08160181337363052, "grad_norm": 0.046277038753032684, "learning_rate": 4.7520812266338885e-05, "loss": 12.4466, "step": 108 }, { "epoch": 0.08386853041178693, "grad_norm": 0.046285875141620636, "learning_rate": 4.504772348747687e-05, "loss": 12.4448, "step": 111 }, { "epoch": 0.08613524744994333, "grad_norm": 0.051637545228004456, "learning_rate": 4.2586817614407895e-05, "loss": 12.4436, "step": 114 }, { "epoch": 0.08840196448809974, "grad_norm": 0.04436314478516579, "learning_rate": 4.0144148627425993e-05, "loss": 12.4446, "step": 117 }, { "epoch": 0.08991310918020401, "eval_loss": 12.444755554199219, "eval_runtime": 61.8074, "eval_samples_per_second": 36.08, "eval_steps_per_second": 4.514, "step": 119 }, { "epoch": 0.09066868152625614, "grad_norm": 0.05455072224140167, "learning_rate": 3.772572564296005e-05, "loss": 12.4468, "step": 120 }, { "epoch": 0.09293539856441255, "grad_norm": 0.05117850750684738, "learning_rate": 3.533749813077677e-05, "loss": 12.4441, "step": 123 }, { "epoch": 0.09520211560256894, "grad_norm": 0.0508827343583107, "learning_rate": 3.298534127791785e-05, "loss": 12.4432, "step": 126 }, { "epoch": 0.09746883264072535, "grad_norm": 0.05112558230757713, "learning_rate": 3.0675041535377405e-05, "loss": 12.444, "step": 129 }, { "epoch": 0.09973554967888175, "grad_norm": 0.05653185769915581, "learning_rate": 2.8412282383075363e-05, "loss": 12.4435, "step": 132 }, { "epoch": 0.10200226671703816, "grad_norm": 0.050629470497369766, "learning_rate": 2.6202630348146324e-05, "loss": 12.4432, "step": 135 }, { "epoch": 0.10275783906309029, "eval_loss": 12.443622589111328, "eval_runtime": 61.7899, "eval_samples_per_second": 36.09, "eval_steps_per_second": 4.515, "step": 136 }, { "epoch": 0.10426898375519456, "grad_norm": 0.05781077593564987, "learning_rate": 2.405152131093926e-05, "loss": 12.4441, "step": 138 }, { "epoch": 0.10653570079335097, "grad_norm": 0.05344181880354881, "learning_rate": 2.196424713241637e-05, "loss": 12.4449, "step": 141 }, { "epoch": 0.10880241783150736, "grad_norm": 0.05852736532688141, "learning_rate": 1.9945942635848748e-05, "loss": 12.4415, "step": 144 }, { "epoch": 0.11106913486966377, "grad_norm": 0.05481090024113655, "learning_rate": 1.800157297483417e-05, "loss": 12.4416, "step": 147 }, { "epoch": 0.11333585190782017, "grad_norm": 0.05853069946169853, "learning_rate": 1.6135921418712956e-05, "loss": 12.4436, "step": 150 }, { "epoch": 0.11560256894597658, "grad_norm": 0.05612453445792198, "learning_rate": 1.435357758543015e-05, "loss": 12.4425, "step": 153 }, { "epoch": 0.11560256894597658, "eval_loss": 12.44286060333252, "eval_runtime": 61.813, "eval_samples_per_second": 36.077, "eval_steps_per_second": 4.514, "step": 153 }, { "epoch": 0.11786928598413299, "grad_norm": 0.05935904011130333, "learning_rate": 1.2658926150792322e-05, "loss": 12.4428, "step": 156 }, { "epoch": 0.12013600302228938, "grad_norm": 0.06513120234012604, "learning_rate": 1.1056136061894384e-05, "loss": 12.4436, "step": 159 }, { "epoch": 0.1224027200604458, "grad_norm": 0.060658689588308334, "learning_rate": 9.549150281252633e-06, "loss": 12.4426, "step": 162 }, { "epoch": 0.12466943709860219, "grad_norm": 0.06469012796878815, "learning_rate": 8.141676086873572e-06, "loss": 12.4421, "step": 165 }, { "epoch": 0.1269361541367586, "grad_norm": 0.05972781777381897, "learning_rate": 6.837175952121306e-06, "loss": 12.4421, "step": 168 }, { "epoch": 0.12844729882886285, "eval_loss": 12.442495346069336, "eval_runtime": 61.8344, "eval_samples_per_second": 36.064, "eval_steps_per_second": 4.512, "step": 170 }, { "epoch": 0.129202871174915, "grad_norm": 0.06537988781929016, "learning_rate": 5.6388590278194096e-06, "loss": 12.4418, "step": 171 }, { "epoch": 0.1314695882130714, "grad_norm": 0.052994538098573685, "learning_rate": 4.549673247541875e-06, "loss": 12.4399, "step": 174 }, { "epoch": 0.13373630525122782, "grad_norm": 0.061471715569496155, "learning_rate": 3.5722980755146517e-06, "loss": 12.4436, "step": 177 }, { "epoch": 0.1360030222893842, "grad_norm": 0.057937003672122955, "learning_rate": 2.7091379149682685e-06, "loss": 12.4452, "step": 180 }, { "epoch": 0.1382697393275406, "grad_norm": 0.06667575240135193, "learning_rate": 1.962316193157593e-06, "loss": 12.439, "step": 183 }, { "epoch": 0.140536456365697, "grad_norm": 0.05928559973835945, "learning_rate": 1.333670137599713e-06, "loss": 12.4442, "step": 186 }, { "epoch": 0.14129202871174915, "eval_loss": 12.44235610961914, "eval_runtime": 61.7029, "eval_samples_per_second": 36.141, "eval_steps_per_second": 4.522, "step": 187 }, { "epoch": 0.14280317340385343, "grad_norm": 0.0606582872569561, "learning_rate": 8.247462563808817e-07, "loss": 12.4418, "step": 189 }, { "epoch": 0.14506989044200982, "grad_norm": 0.06177810579538345, "learning_rate": 4.367965336512403e-07, "loss": 12.4451, "step": 192 }, { "epoch": 0.14733660748016622, "grad_norm": 0.05963071435689926, "learning_rate": 1.7077534966650766e-07, "loss": 12.4409, "step": 195 }, { "epoch": 0.14960332451832262, "grad_norm": 0.06588052958250046, "learning_rate": 2.7337132953697554e-08, "loss": 12.4448, "step": 198 } ], "logging_steps": 3, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 17, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 442692009984.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }