|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.022909507445589918, |
|
"eval_steps": 34, |
|
"global_step": 340, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 6.738090425173506e-05, |
|
"eval_loss": 1.0482484102249146, |
|
"eval_runtime": 1771.3022, |
|
"eval_samples_per_second": 14.111, |
|
"eval_steps_per_second": 1.764, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00020214271275520516, |
|
"grad_norm": 2.1362860202789307, |
|
"learning_rate": 1.5e-05, |
|
"loss": 1.0166, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0004042854255104103, |
|
"grad_norm": 1.772956371307373, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0004, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0006064281382656155, |
|
"grad_norm": 1.3759489059448242, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.9564, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0008085708510208206, |
|
"grad_norm": 1.2244105339050293, |
|
"learning_rate": 4.999675562428437e-05, |
|
"loss": 0.891, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.001010713563776026, |
|
"grad_norm": 1.1622825860977173, |
|
"learning_rate": 4.9979724954289244e-05, |
|
"loss": 0.8578, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.001212856276531231, |
|
"grad_norm": 1.089287281036377, |
|
"learning_rate": 4.994810682835951e-05, |
|
"loss": 0.832, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0014149989892864362, |
|
"grad_norm": 1.685937762260437, |
|
"learning_rate": 4.990191971059033e-05, |
|
"loss": 0.8445, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0016171417020416413, |
|
"grad_norm": 0.9353536367416382, |
|
"learning_rate": 4.984119057295783e-05, |
|
"loss": 0.8481, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0018192844147968466, |
|
"grad_norm": 0.9442921876907349, |
|
"learning_rate": 4.976595487956823e-05, |
|
"loss": 0.8389, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.002021427127552052, |
|
"grad_norm": 1.0226161479949951, |
|
"learning_rate": 4.967625656594782e-05, |
|
"loss": 0.8224, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.002223569840307257, |
|
"grad_norm": 0.8120137453079224, |
|
"learning_rate": 4.957214801338581e-05, |
|
"loss": 0.849, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.002290950744558992, |
|
"eval_loss": 0.8384992480278015, |
|
"eval_runtime": 1781.1941, |
|
"eval_samples_per_second": 14.033, |
|
"eval_steps_per_second": 1.754, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.002425712553062462, |
|
"grad_norm": 0.8147669434547424, |
|
"learning_rate": 4.9453690018345144e-05, |
|
"loss": 0.8287, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0026278552658176675, |
|
"grad_norm": 0.9256265163421631, |
|
"learning_rate": 4.932095175695911e-05, |
|
"loss": 0.8112, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0028299979785728724, |
|
"grad_norm": 0.9979642033576965, |
|
"learning_rate": 4.917401074463441e-05, |
|
"loss": 0.8657, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0030321406913280777, |
|
"grad_norm": 0.7800766825675964, |
|
"learning_rate": 4.901295279078431e-05, |
|
"loss": 0.7922, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0032342834040832826, |
|
"grad_norm": 0.7466667294502258, |
|
"learning_rate": 4.883787194871841e-05, |
|
"loss": 0.8455, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.003436426116838488, |
|
"grad_norm": 0.7748194932937622, |
|
"learning_rate": 4.864887046071813e-05, |
|
"loss": 0.8259, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.0036385688295936932, |
|
"grad_norm": 1.103950023651123, |
|
"learning_rate": 4.8446058698330115e-05, |
|
"loss": 0.8387, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.003840711542348898, |
|
"grad_norm": 0.7845460176467896, |
|
"learning_rate": 4.822955509791233e-05, |
|
"loss": 0.8067, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.004042854255104104, |
|
"grad_norm": 0.7884831428527832, |
|
"learning_rate": 4.799948609147061e-05, |
|
"loss": 0.8231, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.004244996967859308, |
|
"grad_norm": 0.9247403144836426, |
|
"learning_rate": 4.7755986032825864e-05, |
|
"loss": 0.8159, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.004447139680614514, |
|
"grad_norm": 0.7747366428375244, |
|
"learning_rate": 4.74991971191553e-05, |
|
"loss": 0.8132, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.004581901489117984, |
|
"eval_loss": 0.8286266922950745, |
|
"eval_runtime": 1781.5654, |
|
"eval_samples_per_second": 14.03, |
|
"eval_steps_per_second": 1.754, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.004649282393369719, |
|
"grad_norm": 0.7278714776039124, |
|
"learning_rate": 4.7229269307953235e-05, |
|
"loss": 0.741, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.004851425106124924, |
|
"grad_norm": 0.9090484380722046, |
|
"learning_rate": 4.694636022946012e-05, |
|
"loss": 0.8075, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.00505356781888013, |
|
"grad_norm": 0.8673194050788879, |
|
"learning_rate": 4.665063509461097e-05, |
|
"loss": 0.8395, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.005255710531635335, |
|
"grad_norm": 0.7307804822921753, |
|
"learning_rate": 4.6342266598556814e-05, |
|
"loss": 0.7995, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.005457853244390539, |
|
"grad_norm": 0.9184255003929138, |
|
"learning_rate": 4.6021434819815555e-05, |
|
"loss": 0.8318, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.005659995957145745, |
|
"grad_norm": 0.7924419045448303, |
|
"learning_rate": 4.568832711511125e-05, |
|
"loss": 0.8095, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.00586213866990095, |
|
"grad_norm": 0.7330142259597778, |
|
"learning_rate": 4.534313800996299e-05, |
|
"loss": 0.7674, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.006064281382656155, |
|
"grad_norm": 0.7591224908828735, |
|
"learning_rate": 4.498606908508754e-05, |
|
"loss": 0.8409, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.006266424095411361, |
|
"grad_norm": 0.7741730213165283, |
|
"learning_rate": 4.46173288586818e-05, |
|
"loss": 0.8541, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.006468566808166565, |
|
"grad_norm": 0.7202086448669434, |
|
"learning_rate": 4.4237132664654154e-05, |
|
"loss": 0.85, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.0066707095209217705, |
|
"grad_norm": 0.7738878726959229, |
|
"learning_rate": 4.384570252687542e-05, |
|
"loss": 0.8571, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.006872852233676976, |
|
"grad_norm": 0.7431773543357849, |
|
"learning_rate": 4.344326702952326e-05, |
|
"loss": 0.8264, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.006872852233676976, |
|
"eval_loss": 0.8239989280700684, |
|
"eval_runtime": 1781.4353, |
|
"eval_samples_per_second": 14.031, |
|
"eval_steps_per_second": 1.754, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.007074994946432181, |
|
"grad_norm": 0.7040189504623413, |
|
"learning_rate": 4.303006118359537e-05, |
|
"loss": 0.8247, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0072771376591873865, |
|
"grad_norm": 0.7915964722633362, |
|
"learning_rate": 4.260632628966974e-05, |
|
"loss": 0.8551, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.007479280371942592, |
|
"grad_norm": 0.7852084040641785, |
|
"learning_rate": 4.217230979699188e-05, |
|
"loss": 0.8425, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.007681423084697796, |
|
"grad_norm": 0.6728894114494324, |
|
"learning_rate": 4.172826515897146e-05, |
|
"loss": 0.8141, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.007883565797453002, |
|
"grad_norm": 0.7391681671142578, |
|
"learning_rate": 4.12744516851726e-05, |
|
"loss": 0.7987, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.008085708510208208, |
|
"grad_norm": 0.7469043135643005, |
|
"learning_rate": 4.0811134389884433e-05, |
|
"loss": 0.7909, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.008287851222963412, |
|
"grad_norm": 0.7632879614830017, |
|
"learning_rate": 4.0338583837360225e-05, |
|
"loss": 0.8031, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.008489993935718617, |
|
"grad_norm": 0.7656901478767395, |
|
"learning_rate": 3.985707598381544e-05, |
|
"loss": 0.843, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.008692136648473823, |
|
"grad_norm": 0.8024786114692688, |
|
"learning_rate": 3.9366892016277096e-05, |
|
"loss": 0.8403, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.008894279361229027, |
|
"grad_norm": 0.6944208145141602, |
|
"learning_rate": 3.886831818837847e-05, |
|
"loss": 0.7908, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.009096422073984234, |
|
"grad_norm": 0.719901442527771, |
|
"learning_rate": 3.8361645653195026e-05, |
|
"loss": 0.8151, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.009163802978235968, |
|
"eval_loss": 0.8194563388824463, |
|
"eval_runtime": 1782.0949, |
|
"eval_samples_per_second": 14.026, |
|
"eval_steps_per_second": 1.754, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.009298564786739438, |
|
"grad_norm": 0.6918753981590271, |
|
"learning_rate": 3.784717029321922e-05, |
|
"loss": 0.8194, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.009500707499494642, |
|
"grad_norm": 0.7483247518539429, |
|
"learning_rate": 3.732519254757344e-05, |
|
"loss": 0.8422, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.009702850212249849, |
|
"grad_norm": 0.7642280459403992, |
|
"learning_rate": 3.679601723656205e-05, |
|
"loss": 0.8222, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.009904992925005053, |
|
"grad_norm": 0.7145370244979858, |
|
"learning_rate": 3.625995338366492e-05, |
|
"loss": 0.8073, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.01010713563776026, |
|
"grad_norm": 0.732183039188385, |
|
"learning_rate": 3.5717314035076355e-05, |
|
"loss": 0.8163, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.010309278350515464, |
|
"grad_norm": 0.6954637765884399, |
|
"learning_rate": 3.516841607689501e-05, |
|
"loss": 0.7573, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.01051142106327067, |
|
"grad_norm": 0.7373840808868408, |
|
"learning_rate": 3.461358005007128e-05, |
|
"loss": 0.7868, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.010713563776025874, |
|
"grad_norm": 0.7047626376152039, |
|
"learning_rate": 3.405312996322042e-05, |
|
"loss": 0.821, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.010915706488781079, |
|
"grad_norm": 0.7702988982200623, |
|
"learning_rate": 3.348739310341068e-05, |
|
"loss": 0.8194, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.011117849201536285, |
|
"grad_norm": 0.7867685556411743, |
|
"learning_rate": 3.2916699845036816e-05, |
|
"loss": 0.7898, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.01131999191429149, |
|
"grad_norm": 0.7021005153656006, |
|
"learning_rate": 3.234138345689077e-05, |
|
"loss": 0.7621, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.011454753722794959, |
|
"eval_loss": 0.8163909316062927, |
|
"eval_runtime": 1780.9274, |
|
"eval_samples_per_second": 14.035, |
|
"eval_steps_per_second": 1.755, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.011522134627046696, |
|
"grad_norm": 0.7096220850944519, |
|
"learning_rate": 3.17617799075421e-05, |
|
"loss": 0.7807, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.0117242773398019, |
|
"grad_norm": 0.7657400369644165, |
|
"learning_rate": 3.1178227669141744e-05, |
|
"loss": 0.7858, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.011926420052557105, |
|
"grad_norm": 0.8024412393569946, |
|
"learning_rate": 3.0591067519763895e-05, |
|
"loss": 0.8122, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.01212856276531231, |
|
"grad_norm": 0.6976025700569153, |
|
"learning_rate": 3.0000642344401113e-05, |
|
"loss": 0.8288, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.012330705478067515, |
|
"grad_norm": 0.6966779828071594, |
|
"learning_rate": 2.9407296934729227e-05, |
|
"loss": 0.793, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.012532848190822721, |
|
"grad_norm": 0.7219818830490112, |
|
"learning_rate": 2.8811377787758636e-05, |
|
"loss": 0.7883, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.012734990903577926, |
|
"grad_norm": 0.8189945816993713, |
|
"learning_rate": 2.8213232903489865e-05, |
|
"loss": 0.885, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.01293713361633313, |
|
"grad_norm": 0.902603805065155, |
|
"learning_rate": 2.761321158169134e-05, |
|
"loss": 0.8383, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.013139276329088337, |
|
"grad_norm": 0.8128630518913269, |
|
"learning_rate": 2.7011664217918154e-05, |
|
"loss": 0.852, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.013341419041843541, |
|
"grad_norm": 0.7031587958335876, |
|
"learning_rate": 2.6408942098890936e-05, |
|
"loss": 0.8622, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.013543561754598747, |
|
"grad_norm": 0.7614731788635254, |
|
"learning_rate": 2.580539719735433e-05, |
|
"loss": 0.8162, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.013745704467353952, |
|
"grad_norm": 0.6810929179191589, |
|
"learning_rate": 2.5201381966534748e-05, |
|
"loss": 0.8271, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.013745704467353952, |
|
"eval_loss": 0.8147265315055847, |
|
"eval_runtime": 1782.1355, |
|
"eval_samples_per_second": 14.025, |
|
"eval_steps_per_second": 1.754, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.013947847180109158, |
|
"grad_norm": 0.7248020768165588, |
|
"learning_rate": 2.459724913431772e-05, |
|
"loss": 0.814, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.014149989892864362, |
|
"grad_norm": 0.7375376224517822, |
|
"learning_rate": 2.399335149726463e-05, |
|
"loss": 0.8381, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.014352132605619567, |
|
"grad_norm": 0.75850510597229, |
|
"learning_rate": 2.3390041714589514e-05, |
|
"loss": 0.7851, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.014554275318374773, |
|
"grad_norm": 0.711068332195282, |
|
"learning_rate": 2.2787672102216042e-05, |
|
"loss": 0.7992, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.014756418031129977, |
|
"grad_norm": 0.7301695346832275, |
|
"learning_rate": 2.2186594427034864e-05, |
|
"loss": 0.8506, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.014958560743885184, |
|
"grad_norm": 0.720683753490448, |
|
"learning_rate": 2.1587159701481716e-05, |
|
"loss": 0.8061, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.015160703456640388, |
|
"grad_norm": 0.665138304233551, |
|
"learning_rate": 2.098971797855599e-05, |
|
"loss": 0.7454, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.015362846169395592, |
|
"grad_norm": 0.6854159235954285, |
|
"learning_rate": 2.0394618147399713e-05, |
|
"loss": 0.828, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.015564988882150799, |
|
"grad_norm": 0.7191194891929626, |
|
"learning_rate": 1.980220772955602e-05, |
|
"loss": 0.7885, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.015767131594906003, |
|
"grad_norm": 0.7301977276802063, |
|
"learning_rate": 1.921283267602643e-05, |
|
"loss": 0.81, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.015969274307661208, |
|
"grad_norm": 0.7239346504211426, |
|
"learning_rate": 1.8626837165245165e-05, |
|
"loss": 0.787, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.016036655211912942, |
|
"eval_loss": 0.8127490878105164, |
|
"eval_runtime": 1781.9808, |
|
"eval_samples_per_second": 14.027, |
|
"eval_steps_per_second": 1.754, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.016171417020416416, |
|
"grad_norm": 0.7089824676513672, |
|
"learning_rate": 1.8044563402088684e-05, |
|
"loss": 0.8143, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.01637355973317162, |
|
"grad_norm": 0.6729727983474731, |
|
"learning_rate": 1.746635141803761e-05, |
|
"loss": 0.7973, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.016575702445926824, |
|
"grad_norm": 0.7322119474411011, |
|
"learning_rate": 1.6892538872607937e-05, |
|
"loss": 0.8065, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.01677784515868203, |
|
"grad_norm": 0.7230767607688904, |
|
"learning_rate": 1.6323460856167426e-05, |
|
"loss": 0.8034, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.016979987871437233, |
|
"grad_norm": 0.6473975777626038, |
|
"learning_rate": 1.5759449694252226e-05, |
|
"loss": 0.7781, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.01718213058419244, |
|
"grad_norm": 0.7108025550842285, |
|
"learning_rate": 1.5200834753498128e-05, |
|
"loss": 0.8175, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.017384273296947646, |
|
"grad_norm": 0.672478199005127, |
|
"learning_rate": 1.4647942249299707e-05, |
|
"loss": 0.8328, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.01758641600970285, |
|
"grad_norm": 0.7066530585289001, |
|
"learning_rate": 1.4101095055309746e-05, |
|
"loss": 0.7698, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.017788558722458055, |
|
"grad_norm": 0.7493249773979187, |
|
"learning_rate": 1.356061251489012e-05, |
|
"loss": 0.8237, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.01799070143521326, |
|
"grad_norm": 0.6934426426887512, |
|
"learning_rate": 1.302681025462424e-05, |
|
"loss": 0.82, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.018192844147968467, |
|
"grad_norm": 0.6936736106872559, |
|
"learning_rate": 1.2500000000000006e-05, |
|
"loss": 0.8079, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.018327605956471937, |
|
"eval_loss": 0.8106825351715088, |
|
"eval_runtime": 1782.0227, |
|
"eval_samples_per_second": 14.026, |
|
"eval_steps_per_second": 1.754, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.01839498686072367, |
|
"grad_norm": 0.6460986733436584, |
|
"learning_rate": 1.1980489393370938e-05, |
|
"loss": 0.8341, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.018597129573478876, |
|
"grad_norm": 0.6542893052101135, |
|
"learning_rate": 1.1468581814301717e-05, |
|
"loss": 0.7814, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.01879927228623408, |
|
"grad_norm": 0.6104385852813721, |
|
"learning_rate": 1.096457620240298e-05, |
|
"loss": 0.8269, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.019001414998989285, |
|
"grad_norm": 0.822834849357605, |
|
"learning_rate": 1.0468766882759094e-05, |
|
"loss": 0.8001, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.019203557711744493, |
|
"grad_norm": 0.6357617378234863, |
|
"learning_rate": 9.981443394050525e-06, |
|
"loss": 0.8261, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.019405700424499697, |
|
"grad_norm": 0.6451523900032043, |
|
"learning_rate": 9.502890319471491e-06, |
|
"loss": 0.827, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.0196078431372549, |
|
"grad_norm": 0.6993770003318787, |
|
"learning_rate": 9.033387120541306e-06, |
|
"loss": 0.7993, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.019809985850010106, |
|
"grad_norm": 0.7399018406867981, |
|
"learning_rate": 8.573207973906735e-06, |
|
"loss": 0.8537, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.02001212856276531, |
|
"grad_norm": 0.6726659536361694, |
|
"learning_rate": 8.1226216112306e-06, |
|
"loss": 0.7875, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.02021427127552052, |
|
"grad_norm": 0.6281954646110535, |
|
"learning_rate": 7.681891162260015e-06, |
|
"loss": 0.7966, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.020416413988275723, |
|
"grad_norm": 0.7878900170326233, |
|
"learning_rate": 7.251274001166044e-06, |
|
"loss": 0.8103, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.020618556701030927, |
|
"grad_norm": 0.6884163022041321, |
|
"learning_rate": 6.831021596244424e-06, |
|
"loss": 0.7842, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.020618556701030927, |
|
"eval_loss": 0.8096863031387329, |
|
"eval_runtime": 1780.6626, |
|
"eval_samples_per_second": 14.037, |
|
"eval_steps_per_second": 1.755, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.020820699413786132, |
|
"grad_norm": 0.8132256269454956, |
|
"learning_rate": 6.421379363065142e-06, |
|
"loss": 0.8069, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.02102284212654134, |
|
"grad_norm": 0.7123071551322937, |
|
"learning_rate": 6.022586521156715e-06, |
|
"loss": 0.7624, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.021224984839296544, |
|
"grad_norm": 0.6497386693954468, |
|
"learning_rate": 5.634875954308638e-06, |
|
"loss": 0.7902, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.02142712755205175, |
|
"grad_norm": 0.6508458256721497, |
|
"learning_rate": 5.258474074573877e-06, |
|
"loss": 0.8201, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.021629270264806953, |
|
"grad_norm": 0.9117996096611023, |
|
"learning_rate": 4.893600690050579e-06, |
|
"loss": 0.8328, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.021831412977562158, |
|
"grad_norm": 0.693020761013031, |
|
"learning_rate": 4.540468876520323e-06, |
|
"loss": 0.7926, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.022033555690317366, |
|
"grad_norm": 0.6869902014732361, |
|
"learning_rate": 4.199284853017896e-06, |
|
"loss": 0.805, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.02223569840307257, |
|
"grad_norm": 0.7282816171646118, |
|
"learning_rate": 3.8702478614051355e-06, |
|
"loss": 0.8067, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.022437841115827774, |
|
"grad_norm": 0.6699129343032837, |
|
"learning_rate": 3.5535500500193357e-06, |
|
"loss": 0.8041, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.02263998382858298, |
|
"grad_norm": 0.6829515695571899, |
|
"learning_rate": 3.249376361464021e-06, |
|
"loss": 0.8149, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.022842126541338183, |
|
"grad_norm": 0.7807720303535461, |
|
"learning_rate": 2.957904424607652e-06, |
|
"loss": 0.825, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.022909507445589918, |
|
"eval_loss": 0.8090208768844604, |
|
"eval_runtime": 1781.7524, |
|
"eval_samples_per_second": 14.028, |
|
"eval_steps_per_second": 1.754, |
|
"step": 340 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 34, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.7812833290747904e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|