{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.1511144692104269,
  "eval_steps": 17,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0007555723460521345,
      "eval_loss": 12.450613975524902,
      "eval_runtime": 61.6454,
      "eval_samples_per_second": 36.175,
      "eval_steps_per_second": 4.526,
      "step": 1
    },
    {
      "epoch": 0.0022667170381564035,
      "grad_norm": 0.009174207225441933,
      "learning_rate": 3e-05,
      "loss": 12.453,
      "step": 3
    },
    {
      "epoch": 0.004533434076312807,
      "grad_norm": 0.009654853492975235,
      "learning_rate": 6e-05,
      "loss": 12.4526,
      "step": 6
    },
    {
      "epoch": 0.00680015111446921,
      "grad_norm": 0.008986242115497589,
      "learning_rate": 9e-05,
      "loss": 12.4507,
      "step": 9
    },
    {
      "epoch": 0.009066868152625614,
      "grad_norm": 0.009755842387676239,
      "learning_rate": 9.997266286704631e-05,
      "loss": 12.4496,
      "step": 12
    },
    {
      "epoch": 0.011333585190782017,
      "grad_norm": 0.011493120342493057,
      "learning_rate": 9.98292246503335e-05,
      "loss": 12.4487,
      "step": 15
    },
    {
      "epoch": 0.012844729882886286,
      "eval_loss": 12.450437545776367,
      "eval_runtime": 61.7871,
      "eval_samples_per_second": 36.092,
      "eval_steps_per_second": 4.516,
      "step": 17
    },
    {
      "epoch": 0.01360030222893842,
      "grad_norm": 0.009524165652692318,
      "learning_rate": 9.956320346634876e-05,
      "loss": 12.454,
      "step": 18
    },
    {
      "epoch": 0.015867019267094825,
      "grad_norm": 0.012806428596377373,
      "learning_rate": 9.917525374361912e-05,
      "loss": 12.4518,
      "step": 21
    },
    {
      "epoch": 0.018133736305251228,
      "grad_norm": 0.011713715270161629,
      "learning_rate": 9.86663298624003e-05,
      "loss": 12.4506,
      "step": 24
    },
    {
      "epoch": 0.02040045334340763,
      "grad_norm": 0.010967453010380268,
      "learning_rate": 9.803768380684242e-05,
      "loss": 12.4491,
      "step": 27
    },
    {
      "epoch": 0.022667170381564034,
      "grad_norm": 0.0131283700466156,
      "learning_rate": 9.729086208503174e-05,
      "loss": 12.4512,
      "step": 30
    },
    {
      "epoch": 0.024933887419720437,
      "grad_norm": 0.012522528879344463,
      "learning_rate": 9.642770192448536e-05,
      "loss": 12.4509,
      "step": 33
    },
    {
      "epoch": 0.02568945976577257,
      "eval_loss": 12.45008373260498,
      "eval_runtime": 61.8082,
      "eval_samples_per_second": 36.079,
      "eval_steps_per_second": 4.514,
      "step": 34
    },
    {
      "epoch": 0.02720060445787684,
      "grad_norm": 0.013970930129289627,
      "learning_rate": 9.545032675245813e-05,
      "loss": 12.4541,
      "step": 36
    },
    {
      "epoch": 0.029467321496033247,
      "grad_norm": 0.014797762967646122,
      "learning_rate": 9.43611409721806e-05,
      "loss": 12.4476,
      "step": 39
    },
    {
      "epoch": 0.03173403853418965,
      "grad_norm": 0.01638209819793701,
      "learning_rate": 9.316282404787871e-05,
      "loss": 12.4498,
      "step": 42
    },
    {
      "epoch": 0.03400075557234605,
      "grad_norm": 0.017414981499314308,
      "learning_rate": 9.185832391312644e-05,
      "loss": 12.4489,
      "step": 45
    },
    {
      "epoch": 0.036267472610502456,
      "grad_norm": 0.01976948417723179,
      "learning_rate": 9.045084971874738e-05,
      "loss": 12.4498,
      "step": 48
    },
    {
      "epoch": 0.03853418964865886,
      "grad_norm": 0.017705973237752914,
      "learning_rate": 8.894386393810563e-05,
      "loss": 12.4478,
      "step": 51
    },
    {
      "epoch": 0.03853418964865886,
      "eval_loss": 12.449511528015137,
      "eval_runtime": 61.8317,
      "eval_samples_per_second": 36.066,
      "eval_steps_per_second": 4.512,
      "step": 51
    },
    {
      "epoch": 0.04080090668681526,
      "grad_norm": 0.018775586038827896,
      "learning_rate": 8.73410738492077e-05,
      "loss": 12.4476,
      "step": 54
    },
    {
      "epoch": 0.043067623724971665,
      "grad_norm": 0.020082606002688408,
      "learning_rate": 8.564642241456986e-05,
      "loss": 12.4519,
      "step": 57
    },
    {
      "epoch": 0.04533434076312807,
      "grad_norm": 0.01945601962506771,
      "learning_rate": 8.386407858128706e-05,
      "loss": 12.4513,
      "step": 60
    },
    {
      "epoch": 0.04760105780128447,
      "grad_norm": 0.02504323236644268,
      "learning_rate": 8.199842702516583e-05,
      "loss": 12.451,
      "step": 63
    },
    {
      "epoch": 0.049867774839440875,
      "grad_norm": 0.02477116324007511,
      "learning_rate": 8.005405736415126e-05,
      "loss": 12.4508,
      "step": 66
    },
    {
      "epoch": 0.05137891953154514,
      "eval_loss": 12.448628425598145,
      "eval_runtime": 61.7963,
      "eval_samples_per_second": 36.086,
      "eval_steps_per_second": 4.515,
      "step": 68
    },
    {
      "epoch": 0.05213449187759728,
      "grad_norm": 0.02452562004327774,
      "learning_rate": 7.803575286758364e-05,
      "loss": 12.4501,
      "step": 69
    },
    {
      "epoch": 0.05440120891575368,
      "grad_norm": 0.024608276784420013,
      "learning_rate": 7.594847868906076e-05,
      "loss": 12.449,
      "step": 72
    },
    {
      "epoch": 0.056667925953910084,
      "grad_norm": 0.025729818269610405,
      "learning_rate": 7.379736965185368e-05,
      "loss": 12.4491,
      "step": 75
    },
    {
      "epoch": 0.058934642992066494,
      "grad_norm": 0.026810096576809883,
      "learning_rate": 7.158771761692464e-05,
      "loss": 12.4492,
      "step": 78
    },
    {
      "epoch": 0.0612013600302229,
      "grad_norm": 0.030193008482456207,
      "learning_rate": 6.932495846462261e-05,
      "loss": 12.4488,
      "step": 81
    },
    {
      "epoch": 0.0634680770683793,
      "grad_norm": 0.0322081558406353,
      "learning_rate": 6.701465872208216e-05,
      "loss": 12.4457,
      "step": 84
    },
    {
      "epoch": 0.06422364941443143,
      "eval_loss": 12.447479248046875,
      "eval_runtime": 61.6601,
      "eval_samples_per_second": 36.166,
      "eval_steps_per_second": 4.525,
      "step": 85
    },
    {
      "epoch": 0.0657347941065357,
      "grad_norm": 0.03805829957127571,
      "learning_rate": 6.466250186922325e-05,
      "loss": 12.4443,
      "step": 87
    },
    {
      "epoch": 0.0680015111446921,
      "grad_norm": 0.03595505282282829,
      "learning_rate": 6.227427435703997e-05,
      "loss": 12.4494,
      "step": 90
    },
    {
      "epoch": 0.0702682281828485,
      "grad_norm": 0.03157583624124527,
      "learning_rate": 5.985585137257401e-05,
      "loss": 12.446,
      "step": 93
    },
    {
      "epoch": 0.07253494522100491,
      "grad_norm": 0.03606283292174339,
      "learning_rate": 5.74131823855921e-05,
      "loss": 12.4462,
      "step": 96
    },
    {
      "epoch": 0.07480166225916131,
      "grad_norm": 0.03671179711818695,
      "learning_rate": 5.495227651252315e-05,
      "loss": 12.4473,
      "step": 99
    },
    {
      "epoch": 0.07706837929731772,
      "grad_norm": 0.042302753776311874,
      "learning_rate": 5.247918773366112e-05,
      "loss": 12.4479,
      "step": 102
    },
    {
      "epoch": 0.07706837929731772,
      "eval_loss": 12.446091651916504,
      "eval_runtime": 61.8106,
      "eval_samples_per_second": 36.078,
      "eval_steps_per_second": 4.514,
      "step": 102
    },
    {
      "epoch": 0.07933509633547413,
      "grad_norm": 0.04813135042786598,
      "learning_rate": 5e-05,
      "loss": 12.4455,
      "step": 105
    },
    {
      "epoch": 0.08160181337363052,
      "grad_norm": 0.046277038753032684,
      "learning_rate": 4.7520812266338885e-05,
      "loss": 12.4466,
      "step": 108
    },
    {
      "epoch": 0.08386853041178693,
      "grad_norm": 0.046285875141620636,
      "learning_rate": 4.504772348747687e-05,
      "loss": 12.4448,
      "step": 111
    },
    {
      "epoch": 0.08613524744994333,
      "grad_norm": 0.051637545228004456,
      "learning_rate": 4.2586817614407895e-05,
      "loss": 12.4436,
      "step": 114
    },
    {
      "epoch": 0.08840196448809974,
      "grad_norm": 0.04436314478516579,
      "learning_rate": 4.0144148627425993e-05,
      "loss": 12.4446,
      "step": 117
    },
    {
      "epoch": 0.08991310918020401,
      "eval_loss": 12.444755554199219,
      "eval_runtime": 61.8074,
      "eval_samples_per_second": 36.08,
      "eval_steps_per_second": 4.514,
      "step": 119
    },
    {
      "epoch": 0.09066868152625614,
      "grad_norm": 0.05455072224140167,
      "learning_rate": 3.772572564296005e-05,
      "loss": 12.4468,
      "step": 120
    },
    {
      "epoch": 0.09293539856441255,
      "grad_norm": 0.05117850750684738,
      "learning_rate": 3.533749813077677e-05,
      "loss": 12.4441,
      "step": 123
    },
    {
      "epoch": 0.09520211560256894,
      "grad_norm": 0.0508827343583107,
      "learning_rate": 3.298534127791785e-05,
      "loss": 12.4432,
      "step": 126
    },
    {
      "epoch": 0.09746883264072535,
      "grad_norm": 0.05112558230757713,
      "learning_rate": 3.0675041535377405e-05,
      "loss": 12.444,
      "step": 129
    },
    {
      "epoch": 0.09973554967888175,
      "grad_norm": 0.05653185769915581,
      "learning_rate": 2.8412282383075363e-05,
      "loss": 12.4435,
      "step": 132
    },
    {
      "epoch": 0.10200226671703816,
      "grad_norm": 0.050629470497369766,
      "learning_rate": 2.6202630348146324e-05,
      "loss": 12.4432,
      "step": 135
    },
    {
      "epoch": 0.10275783906309029,
      "eval_loss": 12.443622589111328,
      "eval_runtime": 61.7899,
      "eval_samples_per_second": 36.09,
      "eval_steps_per_second": 4.515,
      "step": 136
    },
    {
      "epoch": 0.10426898375519456,
      "grad_norm": 0.05781077593564987,
      "learning_rate": 2.405152131093926e-05,
      "loss": 12.4441,
      "step": 138
    },
    {
      "epoch": 0.10653570079335097,
      "grad_norm": 0.05344181880354881,
      "learning_rate": 2.196424713241637e-05,
      "loss": 12.4449,
      "step": 141
    },
    {
      "epoch": 0.10880241783150736,
      "grad_norm": 0.05852736532688141,
      "learning_rate": 1.9945942635848748e-05,
      "loss": 12.4415,
      "step": 144
    },
    {
      "epoch": 0.11106913486966377,
      "grad_norm": 0.05481090024113655,
      "learning_rate": 1.800157297483417e-05,
      "loss": 12.4416,
      "step": 147
    },
    {
      "epoch": 0.11333585190782017,
      "grad_norm": 0.05853069946169853,
      "learning_rate": 1.6135921418712956e-05,
      "loss": 12.4436,
      "step": 150
    },
    {
      "epoch": 0.11560256894597658,
      "grad_norm": 0.05612453445792198,
      "learning_rate": 1.435357758543015e-05,
      "loss": 12.4425,
      "step": 153
    },
    {
      "epoch": 0.11560256894597658,
      "eval_loss": 12.44286060333252,
      "eval_runtime": 61.813,
      "eval_samples_per_second": 36.077,
      "eval_steps_per_second": 4.514,
      "step": 153
    },
    {
      "epoch": 0.11786928598413299,
      "grad_norm": 0.05935904011130333,
      "learning_rate": 1.2658926150792322e-05,
      "loss": 12.4428,
      "step": 156
    },
    {
      "epoch": 0.12013600302228938,
      "grad_norm": 0.06513120234012604,
      "learning_rate": 1.1056136061894384e-05,
      "loss": 12.4436,
      "step": 159
    },
    {
      "epoch": 0.1224027200604458,
      "grad_norm": 0.060658689588308334,
      "learning_rate": 9.549150281252633e-06,
      "loss": 12.4426,
      "step": 162
    },
    {
      "epoch": 0.12466943709860219,
      "grad_norm": 0.06469012796878815,
      "learning_rate": 8.141676086873572e-06,
      "loss": 12.4421,
      "step": 165
    },
    {
      "epoch": 0.1269361541367586,
      "grad_norm": 0.05972781777381897,
      "learning_rate": 6.837175952121306e-06,
      "loss": 12.4421,
      "step": 168
    },
    {
      "epoch": 0.12844729882886285,
      "eval_loss": 12.442495346069336,
      "eval_runtime": 61.8344,
      "eval_samples_per_second": 36.064,
      "eval_steps_per_second": 4.512,
      "step": 170
    },
    {
      "epoch": 0.129202871174915,
      "grad_norm": 0.06537988781929016,
      "learning_rate": 5.6388590278194096e-06,
      "loss": 12.4418,
      "step": 171
    },
    {
      "epoch": 0.1314695882130714,
      "grad_norm": 0.052994538098573685,
      "learning_rate": 4.549673247541875e-06,
      "loss": 12.4399,
      "step": 174
    },
    {
      "epoch": 0.13373630525122782,
      "grad_norm": 0.061471715569496155,
      "learning_rate": 3.5722980755146517e-06,
      "loss": 12.4436,
      "step": 177
    },
    {
      "epoch": 0.1360030222893842,
      "grad_norm": 0.057937003672122955,
      "learning_rate": 2.7091379149682685e-06,
      "loss": 12.4452,
      "step": 180
    },
    {
      "epoch": 0.1382697393275406,
      "grad_norm": 0.06667575240135193,
      "learning_rate": 1.962316193157593e-06,
      "loss": 12.439,
      "step": 183
    },
    {
      "epoch": 0.140536456365697,
      "grad_norm": 0.05928559973835945,
      "learning_rate": 1.333670137599713e-06,
      "loss": 12.4442,
      "step": 186
    },
    {
      "epoch": 0.14129202871174915,
      "eval_loss": 12.44235610961914,
      "eval_runtime": 61.7029,
      "eval_samples_per_second": 36.141,
      "eval_steps_per_second": 4.522,
      "step": 187
    },
    {
      "epoch": 0.14280317340385343,
      "grad_norm": 0.0606582872569561,
      "learning_rate": 8.247462563808817e-07,
      "loss": 12.4418,
      "step": 189
    },
    {
      "epoch": 0.14506989044200982,
      "grad_norm": 0.06177810579538345,
      "learning_rate": 4.367965336512403e-07,
      "loss": 12.4451,
      "step": 192
    },
    {
      "epoch": 0.14733660748016622,
      "grad_norm": 0.05963071435689926,
      "learning_rate": 1.7077534966650766e-07,
      "loss": 12.4409,
      "step": 195
    },
    {
      "epoch": 0.14960332451832262,
      "grad_norm": 0.06588052958250046,
      "learning_rate": 2.7337132953697554e-08,
      "loss": 12.4448,
      "step": 198
    }
  ],
  "logging_steps": 3,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 17,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 442692009984.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}