{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.7027027027027026,
  "eval_steps": 17,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.013513513513513514, "eval_loss": 1.4550755023956299, "eval_runtime": 16.9459, "eval_samples_per_second": 7.376, "eval_steps_per_second": 0.944, "step": 1 },
    { "epoch": 0.04054054054054054, "grad_norm": 0.2170189917087555, "learning_rate": 3e-05, "loss": 1.4534, "step": 3 },
    { "epoch": 0.08108108108108109, "grad_norm": 0.25585711002349854, "learning_rate": 6e-05, "loss": 1.463, "step": 6 },
    { "epoch": 0.12162162162162163, "grad_norm": 0.24171331524848938, "learning_rate": 9e-05, "loss": 1.4157, "step": 9 },
    { "epoch": 0.16216216216216217, "grad_norm": 0.2852655351161957, "learning_rate": 9.997266286704631e-05, "loss": 1.3769, "step": 12 },
    { "epoch": 0.20270270270270271, "grad_norm": 0.3118453323841095, "learning_rate": 9.98292246503335e-05, "loss": 1.3167, "step": 15 },
    { "epoch": 0.22972972972972974, "eval_loss": 1.230002760887146, "eval_runtime": 17.2352, "eval_samples_per_second": 7.253, "eval_steps_per_second": 0.928, "step": 17 },
    { "epoch": 0.24324324324324326, "grad_norm": 0.3110101521015167, "learning_rate": 9.956320346634876e-05, "loss": 1.2565, "step": 18 },
    { "epoch": 0.28378378378378377, "grad_norm": 0.2778882086277008, "learning_rate": 9.917525374361912e-05, "loss": 1.216, "step": 21 },
    { "epoch": 0.32432432432432434, "grad_norm": 0.27365806698799133, "learning_rate": 9.86663298624003e-05, "loss": 1.2426, "step": 24 },
    { "epoch": 0.36486486486486486, "grad_norm": 0.27578839659690857, "learning_rate": 9.803768380684242e-05, "loss": 1.1636, "step": 27 },
    { "epoch": 0.40540540540540543, "grad_norm": 0.3146299123764038, "learning_rate": 9.729086208503174e-05, "loss": 1.1645, "step": 30 },
    { "epoch": 0.44594594594594594, "grad_norm": 0.3067275583744049, "learning_rate": 9.642770192448536e-05, "loss": 1.1556, "step": 33 },
    { "epoch": 0.4594594594594595, "eval_loss": 1.0813400745391846, "eval_runtime": 17.2384, "eval_samples_per_second": 7.251, "eval_steps_per_second": 0.928, "step": 34 },
    { "epoch": 0.4864864864864865, "grad_norm": 0.30459922552108765, "learning_rate": 9.545032675245813e-05, "loss": 1.1033, "step": 36 },
    { "epoch": 0.527027027027027, "grad_norm": 0.31739258766174316, "learning_rate": 9.43611409721806e-05, "loss": 1.0876, "step": 39 },
    { "epoch": 0.5675675675675675, "grad_norm": 0.3213581144809723, "learning_rate": 9.316282404787871e-05, "loss": 1.0733, "step": 42 },
    { "epoch": 0.6081081081081081, "grad_norm": 0.3462018370628357, "learning_rate": 9.185832391312644e-05, "loss": 1.1107, "step": 45 },
    { "epoch": 0.6486486486486487, "grad_norm": 0.3432579040527344, "learning_rate": 9.045084971874738e-05, "loss": 1.0265, "step": 48 },
    { "epoch": 0.6891891891891891, "grad_norm": 0.3420480191707611, "learning_rate": 8.894386393810563e-05, "loss": 1.053, "step": 51 },
    { "epoch": 0.6891891891891891, "eval_loss": 1.0123796463012695, "eval_runtime": 17.2325, "eval_samples_per_second": 7.254, "eval_steps_per_second": 0.928, "step": 51 },
    { "epoch": 0.7297297297297297, "grad_norm": 0.35601529479026794, "learning_rate": 8.73410738492077e-05, "loss": 1.0257, "step": 54 },
    { "epoch": 0.7702702702702703, "grad_norm": 0.3662780821323395, "learning_rate": 8.564642241456986e-05, "loss": 1.0528, "step": 57 },
    { "epoch": 0.8108108108108109, "grad_norm": 0.39324405789375305, "learning_rate": 8.386407858128706e-05, "loss": 1.0185, "step": 60 },
    { "epoch": 0.8513513513513513, "grad_norm": 0.39999616146087646, "learning_rate": 8.199842702516583e-05, "loss": 1.0132, "step": 63 },
    { "epoch": 0.8918918918918919, "grad_norm": 0.4472688138484955, "learning_rate": 8.005405736415126e-05, "loss": 1.0243, "step": 66 },
    { "epoch": 0.918918918918919, "eval_loss": 0.9729012250900269, "eval_runtime": 17.2325, "eval_samples_per_second": 7.254, "eval_steps_per_second": 0.928, "step": 68 },
    { "epoch": 0.9324324324324325, "grad_norm": 0.3893688917160034, "learning_rate": 7.803575286758364e-05, "loss": 0.9998, "step": 69 },
    { "epoch": 0.972972972972973, "grad_norm": 0.38515031337738037, "learning_rate": 7.594847868906076e-05, "loss": 0.9943, "step": 72 },
    { "epoch": 1.0135135135135136, "grad_norm": 0.4097621738910675, "learning_rate": 7.379736965185368e-05, "loss": 0.9972, "step": 75 },
    { "epoch": 1.054054054054054, "grad_norm": 0.4480155408382416, "learning_rate": 7.158771761692464e-05, "loss": 0.9534, "step": 78 },
    { "epoch": 1.0945945945945945, "grad_norm": 0.42299342155456543, "learning_rate": 6.932495846462261e-05, "loss": 0.9408, "step": 81 },
    { "epoch": 1.135135135135135, "grad_norm": 0.41073334217071533, "learning_rate": 6.701465872208216e-05, "loss": 0.9503, "step": 84 },
    { "epoch": 1.1486486486486487, "eval_loss": 0.9501034021377563, "eval_runtime": 17.2388, "eval_samples_per_second": 7.251, "eval_steps_per_second": 0.928, "step": 85 },
    { "epoch": 1.1756756756756757, "grad_norm": 0.49050870537757874, "learning_rate": 6.466250186922325e-05, "loss": 0.9146, "step": 87 },
    { "epoch": 1.2162162162162162, "grad_norm": 0.48001334071159363, "learning_rate": 6.227427435703997e-05, "loss": 0.9215, "step": 90 },
    { "epoch": 1.2567567567567568, "grad_norm": 0.47052550315856934, "learning_rate": 5.985585137257401e-05, "loss": 0.8942, "step": 93 },
    { "epoch": 1.2972972972972974, "grad_norm": 0.46515583992004395, "learning_rate": 5.74131823855921e-05, "loss": 0.9304, "step": 96 },
    { "epoch": 1.3378378378378377, "grad_norm": 0.5048130750656128, "learning_rate": 5.495227651252315e-05, "loss": 0.95, "step": 99 },
    { "epoch": 1.3783783783783785, "grad_norm": 0.5172950029373169, "learning_rate": 5.247918773366112e-05, "loss": 0.9288, "step": 102 },
    { "epoch": 1.3783783783783785, "eval_loss": 0.9384378790855408, "eval_runtime": 17.2324, "eval_samples_per_second": 7.254, "eval_steps_per_second": 0.928, "step": 102 },
    { "epoch": 1.4189189189189189, "grad_norm": 0.5409041047096252, "learning_rate": 5e-05, "loss": 0.9294, "step": 105 },
    { "epoch": 1.4594594594594594, "grad_norm": 0.49371689558029175, "learning_rate": 4.7520812266338885e-05, "loss": 0.9109, "step": 108 },
    { "epoch": 1.5, "grad_norm": 0.4969201982021332, "learning_rate": 4.504772348747687e-05, "loss": 0.94, "step": 111 },
    { "epoch": 1.5405405405405406, "grad_norm": 0.5152857899665833, "learning_rate": 4.2586817614407895e-05, "loss": 0.9083, "step": 114 },
    { "epoch": 1.5810810810810811, "grad_norm": 0.5423870086669922, "learning_rate": 4.0144148627425993e-05, "loss": 0.8869, "step": 117 },
    { "epoch": 1.6081081081081081, "eval_loss": 0.924616813659668, "eval_runtime": 17.244, "eval_samples_per_second": 7.249, "eval_steps_per_second": 0.928, "step": 119 },
    { "epoch": 1.6216216216216215, "grad_norm": 0.49147549271583557, "learning_rate": 3.772572564296005e-05, "loss": 0.8855, "step": 120 },
    { "epoch": 1.6621621621621623, "grad_norm": 0.4987981915473938, "learning_rate": 3.533749813077677e-05, "loss": 0.8915, "step": 123 },
    { "epoch": 1.7027027027027026, "grad_norm": 0.4984021484851837, "learning_rate": 3.298534127791785e-05, "loss": 0.9181, "step": 126 },
    { "epoch": 1.7432432432432432, "grad_norm": 0.5324372053146362, "learning_rate": 3.0675041535377405e-05, "loss": 0.8911, "step": 129 },
    { "epoch": 1.7837837837837838, "grad_norm": 0.521675169467926, "learning_rate": 2.8412282383075363e-05, "loss": 0.8905, "step": 132 },
    { "epoch": 1.8243243243243243, "grad_norm": 0.5264241099357605, "learning_rate": 2.6202630348146324e-05, "loss": 0.8928, "step": 135 },
    { "epoch": 1.8378378378378377, "eval_loss": 0.9162411689758301, "eval_runtime": 17.2393, "eval_samples_per_second": 7.251, "eval_steps_per_second": 0.928, "step": 136 },
    { "epoch": 1.864864864864865, "grad_norm": 0.5409694910049438, "learning_rate": 2.405152131093926e-05, "loss": 0.9095, "step": 138 },
    { "epoch": 1.9054054054054053, "grad_norm": 0.5272465348243713, "learning_rate": 2.196424713241637e-05, "loss": 0.8844, "step": 141 },
    { "epoch": 1.945945945945946, "grad_norm": 0.5436295866966248, "learning_rate": 1.9945942635848748e-05, "loss": 0.8699, "step": 144 },
    { "epoch": 1.9864864864864864, "grad_norm": 0.523435115814209, "learning_rate": 1.800157297483417e-05, "loss": 0.9164, "step": 147 },
    { "epoch": 2.027027027027027, "grad_norm": 0.5417100787162781, "learning_rate": 1.6135921418712956e-05, "loss": 0.8899, "step": 150 },
    { "epoch": 2.0675675675675675, "grad_norm": 0.569843053817749, "learning_rate": 1.435357758543015e-05, "loss": 0.8476, "step": 153 },
    { "epoch": 2.0675675675675675, "eval_loss": 0.9106208682060242, "eval_runtime": 17.255, "eval_samples_per_second": 7.244, "eval_steps_per_second": 0.927, "step": 153 },
    { "epoch": 2.108108108108108, "grad_norm": 0.575161337852478, "learning_rate": 1.2658926150792322e-05, "loss": 0.8375, "step": 156 },
    { "epoch": 2.1486486486486487, "grad_norm": 0.5137052536010742, "learning_rate": 1.1056136061894384e-05, "loss": 0.823, "step": 159 },
    { "epoch": 2.189189189189189, "grad_norm": 0.532635509967804, "learning_rate": 9.549150281252633e-06, "loss": 0.8504, "step": 162 },
    { "epoch": 2.22972972972973, "grad_norm": 0.5798172950744629, "learning_rate": 8.141676086873572e-06, "loss": 0.8315, "step": 165 },
    { "epoch": 2.27027027027027, "grad_norm": 0.5635726451873779, "learning_rate": 6.837175952121306e-06, "loss": 0.8626, "step": 168 },
    { "epoch": 2.2972972972972974, "eval_loss": 0.9093061089515686, "eval_runtime": 17.2439, "eval_samples_per_second": 7.249, "eval_steps_per_second": 0.928, "step": 170 },
    { "epoch": 2.310810810810811, "grad_norm": 0.5467652082443237, "learning_rate": 5.6388590278194096e-06, "loss": 0.8334, "step": 171 },
    { "epoch": 2.3513513513513513, "grad_norm": 0.534943699836731, "learning_rate": 4.549673247541875e-06, "loss": 0.8773, "step": 174 },
    { "epoch": 2.391891891891892, "grad_norm": 0.5133833885192871, "learning_rate": 3.5722980755146517e-06, "loss": 0.8081, "step": 177 },
    { "epoch": 2.4324324324324325, "grad_norm": 0.5191574096679688, "learning_rate": 2.7091379149682685e-06, "loss": 0.8595, "step": 180 },
    { "epoch": 2.472972972972973, "grad_norm": 0.5596672892570496, "learning_rate": 1.962316193157593e-06, "loss": 0.8525, "step": 183 },
    { "epoch": 2.5135135135135136, "grad_norm": 0.5567631125450134, "learning_rate": 1.333670137599713e-06, "loss": 0.8269, "step": 186 },
    { "epoch": 2.527027027027027, "eval_loss": 0.9086253643035889, "eval_runtime": 17.2348, "eval_samples_per_second": 7.253, "eval_steps_per_second": 0.928, "step": 187 },
    { "epoch": 2.554054054054054, "grad_norm": 0.5464707016944885, "learning_rate": 8.247462563808817e-07, "loss": 0.8524, "step": 189 },
    { "epoch": 2.5945945945945947, "grad_norm": 0.6008809804916382, "learning_rate": 4.367965336512403e-07, "loss": 0.8633, "step": 192 },
    { "epoch": 2.635135135135135, "grad_norm": 0.5401892066001892, "learning_rate": 1.7077534966650766e-07, "loss": 0.8186, "step": 195 },
    { "epoch": 2.6756756756756754, "grad_norm": 0.5296628475189209, "learning_rate": 2.7337132953697554e-08, "loss": 0.8183, "step": 198 }
  ],
  "logging_steps": 3,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 17,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.262308567427318e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}