|
{ |
|
"best_metric": 0.7205991148948669, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-350", |
|
"epoch": 0.009728910848430519, |
|
"eval_steps": 50, |
|
"global_step": 350, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 2.779688813837291e-05, |
|
"grad_norm": 0.04725373163819313, |
|
"learning_rate": 0.0002, |
|
"loss": 1.23, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 2.779688813837291e-05, |
|
"eval_loss": 1.0243470668792725, |
|
"eval_runtime": 50.0777, |
|
"eval_samples_per_second": 11.522, |
|
"eval_steps_per_second": 5.771, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 5.559377627674582e-05, |
|
"grad_norm": 0.04354099556803703, |
|
"learning_rate": 0.0004, |
|
"loss": 1.0633, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 8.339066441511873e-05, |
|
"grad_norm": 0.04554685205221176, |
|
"learning_rate": 0.0006, |
|
"loss": 1.1887, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00011118755255349164, |
|
"grad_norm": 0.04557386413216591, |
|
"learning_rate": 0.0008, |
|
"loss": 1.121, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.00013898444069186455, |
|
"grad_norm": 0.05429883301258087, |
|
"learning_rate": 0.001, |
|
"loss": 1.2029, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00016678132883023747, |
|
"grad_norm": 0.058254778385162354, |
|
"learning_rate": 0.0012, |
|
"loss": 0.8706, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.00019457821696861036, |
|
"grad_norm": 0.09290671348571777, |
|
"learning_rate": 0.0014, |
|
"loss": 1.0657, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.00022237510510698328, |
|
"grad_norm": 0.05990983918309212, |
|
"learning_rate": 0.0016, |
|
"loss": 0.8619, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0002501719932453562, |
|
"grad_norm": 0.060997381806373596, |
|
"learning_rate": 0.0018000000000000002, |
|
"loss": 1.1914, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0002779688813837291, |
|
"grad_norm": 0.05487683787941933, |
|
"learning_rate": 0.002, |
|
"loss": 0.8724, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.000305765769522102, |
|
"grad_norm": 0.051148172467947006, |
|
"learning_rate": 0.001999979446958366, |
|
"loss": 0.8865, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.00033356265766047494, |
|
"grad_norm": 0.07391902059316635, |
|
"learning_rate": 0.001999917788678319, |
|
"loss": 0.994, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0003613595457988478, |
|
"grad_norm": 0.061822760850191116, |
|
"learning_rate": 0.00199981502769439, |
|
"loss": 0.9617, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0003891564339372207, |
|
"grad_norm": 0.06005921587347984, |
|
"learning_rate": 0.00199967116823068, |
|
"loss": 0.736, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.00041695332207559364, |
|
"grad_norm": 0.06111058592796326, |
|
"learning_rate": 0.001999486216200688, |
|
"loss": 0.8405, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.00044475021021396656, |
|
"grad_norm": 0.06315408647060394, |
|
"learning_rate": 0.0019992601792070677, |
|
"loss": 0.7523, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.00047254709835233943, |
|
"grad_norm": 0.06354085355997086, |
|
"learning_rate": 0.0019989930665413147, |
|
"loss": 0.8865, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0005003439864907124, |
|
"grad_norm": 0.06111137568950653, |
|
"learning_rate": 0.0019986848891833846, |
|
"loss": 0.8308, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0005281408746290853, |
|
"grad_norm": 0.0758710503578186, |
|
"learning_rate": 0.001998335659801241, |
|
"loss": 1.0734, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0005559377627674582, |
|
"grad_norm": 0.056908175349235535, |
|
"learning_rate": 0.0019979453927503363, |
|
"loss": 0.8955, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0005837346509058311, |
|
"grad_norm": 0.06023595482110977, |
|
"learning_rate": 0.0019975141040730207, |
|
"loss": 0.8492, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.000611531539044204, |
|
"grad_norm": 0.05432360619306564, |
|
"learning_rate": 0.001997041811497882, |
|
"loss": 0.8083, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.000639328427182577, |
|
"grad_norm": 0.05626050382852554, |
|
"learning_rate": 0.0019965285344390182, |
|
"loss": 0.9985, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0006671253153209499, |
|
"grad_norm": 0.06779271364212036, |
|
"learning_rate": 0.0019959742939952394, |
|
"loss": 0.9792, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0006949222034593227, |
|
"grad_norm": 0.05818340182304382, |
|
"learning_rate": 0.0019953791129491984, |
|
"loss": 0.8439, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0007227190915976956, |
|
"grad_norm": 0.08297158777713776, |
|
"learning_rate": 0.0019947430157664574, |
|
"loss": 0.6787, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0007505159797360685, |
|
"grad_norm": 0.06646459549665451, |
|
"learning_rate": 0.00199406602859448, |
|
"loss": 0.7956, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0007783128678744414, |
|
"grad_norm": 0.06349553912878036, |
|
"learning_rate": 0.001993348179261558, |
|
"loss": 0.8621, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0008061097560128144, |
|
"grad_norm": 0.06375010311603546, |
|
"learning_rate": 0.001992589497275665, |
|
"loss": 0.7232, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0008339066441511873, |
|
"grad_norm": 0.07310320436954498, |
|
"learning_rate": 0.001991790013823246, |
|
"loss": 0.7295, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0008617035322895602, |
|
"grad_norm": 0.07351308315992355, |
|
"learning_rate": 0.001990949761767935, |
|
"loss": 0.6556, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0008895004204279331, |
|
"grad_norm": 0.0795268714427948, |
|
"learning_rate": 0.0019900687756492018, |
|
"loss": 0.8927, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.000917297308566306, |
|
"grad_norm": 0.06989472359418869, |
|
"learning_rate": 0.0019891470916809364, |
|
"loss": 0.6571, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0009450941967046789, |
|
"grad_norm": 0.07375137507915497, |
|
"learning_rate": 0.0019881847477499557, |
|
"loss": 0.6175, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0009728910848430518, |
|
"grad_norm": 0.07746679335832596, |
|
"learning_rate": 0.0019871817834144503, |
|
"loss": 0.6992, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0010006879729814247, |
|
"grad_norm": 0.07618487626314163, |
|
"learning_rate": 0.001986138239902355, |
|
"loss": 0.625, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0010284848611197977, |
|
"grad_norm": 0.0905221700668335, |
|
"learning_rate": 0.001985054160109657, |
|
"loss": 0.6513, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0010562817492581705, |
|
"grad_norm": 0.09803163260221481, |
|
"learning_rate": 0.0019839295885986296, |
|
"loss": 0.7122, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0010840786373965434, |
|
"grad_norm": 0.08399751782417297, |
|
"learning_rate": 0.0019827645715960037, |
|
"loss": 0.5943, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0011118755255349164, |
|
"grad_norm": 0.09010937064886093, |
|
"learning_rate": 0.0019815591569910655, |
|
"loss": 0.7233, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0011396724136732892, |
|
"grad_norm": 0.09423992037773132, |
|
"learning_rate": 0.0019803133943336873, |
|
"loss": 0.7266, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.0011674693018116622, |
|
"grad_norm": 0.09580416232347488, |
|
"learning_rate": 0.001979027334832293, |
|
"loss": 0.7426, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.001195266189950035, |
|
"grad_norm": 0.08860920369625092, |
|
"learning_rate": 0.0019777010313517516, |
|
"loss": 0.6335, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.001223063078088408, |
|
"grad_norm": 0.0893678069114685, |
|
"learning_rate": 0.0019763345384112042, |
|
"loss": 0.6863, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.0012508599662267809, |
|
"grad_norm": 0.08261829614639282, |
|
"learning_rate": 0.0019749279121818236, |
|
"loss": 0.5307, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.001278656854365154, |
|
"grad_norm": 0.09626446664333344, |
|
"learning_rate": 0.001973481210484505, |
|
"loss": 0.6697, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.0013064537425035267, |
|
"grad_norm": 0.127803772687912, |
|
"learning_rate": 0.001971994492787488, |
|
"loss": 0.6944, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0013342506306418997, |
|
"grad_norm": 0.1326465904712677, |
|
"learning_rate": 0.001970467820203915, |
|
"loss": 0.6792, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.0013620475187802726, |
|
"grad_norm": 0.14524292945861816, |
|
"learning_rate": 0.0019689012554893154, |
|
"loss": 0.6784, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0013898444069186454, |
|
"grad_norm": 0.22971156239509583, |
|
"learning_rate": 0.0019672948630390296, |
|
"loss": 0.8101, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0013898444069186454, |
|
"eval_loss": 0.9230208396911621, |
|
"eval_runtime": 49.9615, |
|
"eval_samples_per_second": 11.549, |
|
"eval_steps_per_second": 5.784, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0014176412950570184, |
|
"grad_norm": 0.1536542773246765, |
|
"learning_rate": 0.001965648708885559, |
|
"loss": 1.0933, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.0014454381831953912, |
|
"grad_norm": 0.1058596670627594, |
|
"learning_rate": 0.001963962860695853, |
|
"loss": 1.0704, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.0014732350713337642, |
|
"grad_norm": 0.08224259316921234, |
|
"learning_rate": 0.001962237387768529, |
|
"loss": 1.1229, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.001501031959472137, |
|
"grad_norm": 0.08697132021188736, |
|
"learning_rate": 0.0019604723610310193, |
|
"loss": 0.9118, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.00152882884761051, |
|
"grad_norm": 0.08103405684232712, |
|
"learning_rate": 0.0019586678530366607, |
|
"loss": 0.946, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0015566257357488829, |
|
"grad_norm": 0.07975370436906815, |
|
"learning_rate": 0.0019568239379617086, |
|
"loss": 0.9038, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.001584422623887256, |
|
"grad_norm": 0.07595320045948029, |
|
"learning_rate": 0.0019549406916022907, |
|
"loss": 0.8122, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.0016122195120256287, |
|
"grad_norm": 0.08768190443515778, |
|
"learning_rate": 0.0019530181913712872, |
|
"loss": 0.8606, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.0016400164001640015, |
|
"grad_norm": 0.09886912256479263, |
|
"learning_rate": 0.0019510565162951536, |
|
"loss": 0.8902, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0016678132883023746, |
|
"grad_norm": 0.08801861107349396, |
|
"learning_rate": 0.0019490557470106687, |
|
"loss": 0.9151, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0016956101764407474, |
|
"grad_norm": 0.08995132893323898, |
|
"learning_rate": 0.0019470159657616214, |
|
"loss": 0.7793, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.0017234070645791204, |
|
"grad_norm": 0.08425740152597427, |
|
"learning_rate": 0.0019449372563954293, |
|
"loss": 0.7646, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.0017512039527174932, |
|
"grad_norm": 0.11669428646564484, |
|
"learning_rate": 0.001942819704359693, |
|
"loss": 0.9056, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.0017790008408558663, |
|
"grad_norm": 0.08303668349981308, |
|
"learning_rate": 0.0019406633966986826, |
|
"loss": 0.7583, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.001806797728994239, |
|
"grad_norm": 0.08263064175844193, |
|
"learning_rate": 0.0019384684220497604, |
|
"loss": 0.9233, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.001834594617132612, |
|
"grad_norm": 0.08262008428573608, |
|
"learning_rate": 0.0019362348706397372, |
|
"loss": 0.8359, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.001862391505270985, |
|
"grad_norm": 0.10420376062393188, |
|
"learning_rate": 0.0019339628342811633, |
|
"loss": 0.9978, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.0018901883934093577, |
|
"grad_norm": 0.0830477699637413, |
|
"learning_rate": 0.001931652406368554, |
|
"loss": 0.8834, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.0019179852815477307, |
|
"grad_norm": 0.08504804968833923, |
|
"learning_rate": 0.0019293036818745519, |
|
"loss": 0.9164, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0019457821696861036, |
|
"grad_norm": 0.08910652250051498, |
|
"learning_rate": 0.0019269167573460217, |
|
"loss": 0.9095, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0019735790578244766, |
|
"grad_norm": 0.09257230162620544, |
|
"learning_rate": 0.0019244917309000815, |
|
"loss": 0.7138, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.0020013759459628494, |
|
"grad_norm": 0.09553885459899902, |
|
"learning_rate": 0.0019220287022200706, |
|
"loss": 0.9544, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.002029172834101222, |
|
"grad_norm": 0.08890817314386368, |
|
"learning_rate": 0.0019195277725514508, |
|
"loss": 0.7013, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.0020569697222395955, |
|
"grad_norm": 0.10616549849510193, |
|
"learning_rate": 0.0019169890446976451, |
|
"loss": 0.7119, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.0020847666103779683, |
|
"grad_norm": 0.09758912026882172, |
|
"learning_rate": 0.0019144126230158124, |
|
"loss": 0.811, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.002112563498516341, |
|
"grad_norm": 0.09248580783605576, |
|
"learning_rate": 0.001911798613412557, |
|
"loss": 0.8025, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.002140360386654714, |
|
"grad_norm": 0.09431200474500656, |
|
"learning_rate": 0.001909147123339575, |
|
"loss": 0.7038, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.0021681572747930867, |
|
"grad_norm": 0.09258091449737549, |
|
"learning_rate": 0.001906458261789238, |
|
"loss": 0.7752, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.00219595416293146, |
|
"grad_norm": 0.08860747516155243, |
|
"learning_rate": 0.0019037321392901135, |
|
"loss": 0.6832, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.0022237510510698328, |
|
"grad_norm": 0.10791260004043579, |
|
"learning_rate": 0.001900968867902419, |
|
"loss": 0.7183, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0022515479392082056, |
|
"grad_norm": 0.0878261998295784, |
|
"learning_rate": 0.001898168561213419, |
|
"loss": 0.6677, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.0022793448273465784, |
|
"grad_norm": 0.10915020108222961, |
|
"learning_rate": 0.0018953313343327532, |
|
"loss": 0.8602, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.0023071417154849516, |
|
"grad_norm": 0.10625939816236496, |
|
"learning_rate": 0.001892457303887706, |
|
"loss": 0.8385, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.0023349386036233244, |
|
"grad_norm": 0.10215223580598831, |
|
"learning_rate": 0.001889546588018412, |
|
"loss": 0.7723, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.0023627354917616973, |
|
"grad_norm": 0.08778225630521774, |
|
"learning_rate": 0.0018865993063730002, |
|
"loss": 0.6503, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.00239053237990007, |
|
"grad_norm": 0.10662350058555603, |
|
"learning_rate": 0.0018836155801026753, |
|
"loss": 0.6592, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.002418329268038443, |
|
"grad_norm": 0.10347293317317963, |
|
"learning_rate": 0.001880595531856738, |
|
"loss": 0.602, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.002446126156176816, |
|
"grad_norm": 0.11098446696996689, |
|
"learning_rate": 0.001877539285777543, |
|
"loss": 0.7291, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.002473923044315189, |
|
"grad_norm": 0.10774262994527817, |
|
"learning_rate": 0.0018744469674953957, |
|
"loss": 0.6501, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.0025017199324535618, |
|
"grad_norm": 0.10596223175525665, |
|
"learning_rate": 0.0018713187041233894, |
|
"loss": 0.7274, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0025295168205919346, |
|
"grad_norm": 0.11689383536577225, |
|
"learning_rate": 0.0018681546242521785, |
|
"loss": 0.6693, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.002557313708730308, |
|
"grad_norm": 0.11212435364723206, |
|
"learning_rate": 0.0018649548579446936, |
|
"loss": 0.6218, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.0025851105968686806, |
|
"grad_norm": 0.13619789481163025, |
|
"learning_rate": 0.0018617195367307952, |
|
"loss": 0.5839, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.0026129074850070534, |
|
"grad_norm": 0.18084552884101868, |
|
"learning_rate": 0.001858448793601866, |
|
"loss": 0.6083, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.0026407043731454262, |
|
"grad_norm": 0.14780890941619873, |
|
"learning_rate": 0.0018551427630053464, |
|
"loss": 0.6095, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.0026685012612837995, |
|
"grad_norm": 0.12189039587974548, |
|
"learning_rate": 0.0018518015808392043, |
|
"loss": 0.6313, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.0026962981494221723, |
|
"grad_norm": 0.2006332129240036, |
|
"learning_rate": 0.0018484253844463525, |
|
"loss": 0.6528, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.002724095037560545, |
|
"grad_norm": 0.19439570605754852, |
|
"learning_rate": 0.0018450143126090013, |
|
"loss": 0.655, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.002751891925698918, |
|
"grad_norm": 0.23627929389476776, |
|
"learning_rate": 0.0018415685055429532, |
|
"loss": 0.6663, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.0027796888138372907, |
|
"grad_norm": 0.22060082852840424, |
|
"learning_rate": 0.0018380881048918405, |
|
"loss": 0.6549, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0027796888138372907, |
|
"eval_loss": 1.0665974617004395, |
|
"eval_runtime": 49.9362, |
|
"eval_samples_per_second": 11.555, |
|
"eval_steps_per_second": 5.787, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.002807485701975664, |
|
"grad_norm": 1.510432481765747, |
|
"learning_rate": 0.0018345732537213026, |
|
"loss": 1.4424, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.002835282590114037, |
|
"grad_norm": 0.1634325534105301, |
|
"learning_rate": 0.001831024096513104, |
|
"loss": 0.9927, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.0028630794782524096, |
|
"grad_norm": 0.11758747696876526, |
|
"learning_rate": 0.0018274407791591964, |
|
"loss": 1.0068, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.0028908763663907824, |
|
"grad_norm": 0.13593055307865143, |
|
"learning_rate": 0.0018238234489557216, |
|
"loss": 1.0023, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.0029186732545291557, |
|
"grad_norm": 0.1236187294125557, |
|
"learning_rate": 0.0018201722545969558, |
|
"loss": 0.8913, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0029464701426675285, |
|
"grad_norm": 0.1045624166727066, |
|
"learning_rate": 0.0018164873461691987, |
|
"loss": 0.952, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.0029742670308059013, |
|
"grad_norm": 0.1336205154657364, |
|
"learning_rate": 0.0018127688751446028, |
|
"loss": 0.9185, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.003002063918944274, |
|
"grad_norm": 0.20375819504261017, |
|
"learning_rate": 0.0018090169943749475, |
|
"loss": 0.7956, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.003029860807082647, |
|
"grad_norm": 0.11180537939071655, |
|
"learning_rate": 0.0018052318580853563, |
|
"loss": 0.6942, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.00305765769522102, |
|
"grad_norm": 0.3445965349674225, |
|
"learning_rate": 0.0018014136218679566, |
|
"loss": 0.8457, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.003085454583359393, |
|
"grad_norm": 0.19809404015541077, |
|
"learning_rate": 0.0017975624426754845, |
|
"loss": 0.689, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.0031132514714977658, |
|
"grad_norm": 0.4088907837867737, |
|
"learning_rate": 0.0017936784788148326, |
|
"loss": 0.887, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.0031410483596361386, |
|
"grad_norm": 0.757510781288147, |
|
"learning_rate": 0.0017897618899405424, |
|
"loss": 0.8129, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.003168845247774512, |
|
"grad_norm": 0.1284535676240921, |
|
"learning_rate": 0.0017858128370482425, |
|
"loss": 0.9147, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.0031966421359128847, |
|
"grad_norm": 1.1778861284255981, |
|
"learning_rate": 0.00178183148246803, |
|
"loss": 0.8639, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0032244390240512575, |
|
"grad_norm": 0.24793654680252075, |
|
"learning_rate": 0.0017778179898577974, |
|
"loss": 0.9027, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.0032522359121896303, |
|
"grad_norm": 0.17602455615997314, |
|
"learning_rate": 0.0017737725241965068, |
|
"loss": 1.0078, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.003280032800328003, |
|
"grad_norm": 0.18490758538246155, |
|
"learning_rate": 0.0017696952517774062, |
|
"loss": 0.7916, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.0033078296884663763, |
|
"grad_norm": 0.17348746955394745, |
|
"learning_rate": 0.0017655863402011947, |
|
"loss": 0.9932, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.003335626576604749, |
|
"grad_norm": 0.22050045430660248, |
|
"learning_rate": 0.0017614459583691344, |
|
"loss": 0.8613, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.003363423464743122, |
|
"grad_norm": 0.12003365904092789, |
|
"learning_rate": 0.0017572742764761053, |
|
"loss": 0.9392, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.0033912203528814948, |
|
"grad_norm": 0.10987789183855057, |
|
"learning_rate": 0.001753071466003611, |
|
"loss": 0.7309, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.003419017241019868, |
|
"grad_norm": 0.11258568614721298, |
|
"learning_rate": 0.0017488376997127281, |
|
"loss": 0.7541, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.003446814129158241, |
|
"grad_norm": 0.14257432520389557, |
|
"learning_rate": 0.0017445731516370071, |
|
"loss": 0.7806, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.0034746110172966136, |
|
"grad_norm": 0.1824330985546112, |
|
"learning_rate": 0.0017402779970753155, |
|
"loss": 0.8882, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0035024079054349865, |
|
"grad_norm": 0.1172509491443634, |
|
"learning_rate": 0.001735952412584635, |
|
"loss": 0.8695, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.0035302047935733593, |
|
"grad_norm": 0.1342688351869583, |
|
"learning_rate": 0.0017315965759728013, |
|
"loss": 0.6943, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.0035580016817117325, |
|
"grad_norm": 0.11804018169641495, |
|
"learning_rate": 0.0017272106662911972, |
|
"loss": 0.6733, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.0035857985698501053, |
|
"grad_norm": 0.1424594670534134, |
|
"learning_rate": 0.0017227948638273915, |
|
"loss": 0.8569, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.003613595457988478, |
|
"grad_norm": 0.10879236459732056, |
|
"learning_rate": 0.0017183493500977276, |
|
"loss": 0.62, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.003641392346126851, |
|
"grad_norm": 0.14222201704978943, |
|
"learning_rate": 0.0017138743078398632, |
|
"loss": 0.7501, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.003669189234265224, |
|
"grad_norm": 0.11895573884248734, |
|
"learning_rate": 0.0017093699210052578, |
|
"loss": 0.7722, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.003696986122403597, |
|
"grad_norm": 0.18408416211605072, |
|
"learning_rate": 0.0017048363747516118, |
|
"loss": 0.8163, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.00372478301054197, |
|
"grad_norm": 0.13170836865901947, |
|
"learning_rate": 0.001700273855435255, |
|
"loss": 0.7529, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.0037525798986803426, |
|
"grad_norm": 0.2225480079650879, |
|
"learning_rate": 0.0016956825506034864, |
|
"loss": 0.7412, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.0037803767868187154, |
|
"grad_norm": 0.14438864588737488, |
|
"learning_rate": 0.0016910626489868648, |
|
"loss": 0.7257, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.0038081736749570887, |
|
"grad_norm": 0.14967116713523865, |
|
"learning_rate": 0.0016864143404914506, |
|
"loss": 0.7663, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.0038359705630954615, |
|
"grad_norm": 0.12598782777786255, |
|
"learning_rate": 0.0016817378161909995, |
|
"loss": 0.7768, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.0038637674512338343, |
|
"grad_norm": 0.1321249157190323, |
|
"learning_rate": 0.0016770332683191096, |
|
"loss": 0.5741, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.003891564339372207, |
|
"grad_norm": 0.1356271356344223, |
|
"learning_rate": 0.0016723008902613168, |
|
"loss": 0.628, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.00391936122751058, |
|
"grad_norm": 0.1559610813856125, |
|
"learning_rate": 0.0016675408765471481, |
|
"loss": 0.5979, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.003947158115648953, |
|
"grad_norm": 0.14019130170345306, |
|
"learning_rate": 0.001662753422842123, |
|
"loss": 0.6001, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.003974955003787326, |
|
"grad_norm": 0.3218002915382385, |
|
"learning_rate": 0.0016579387259397127, |
|
"loss": 0.6655, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.004002751891925699, |
|
"grad_norm": 0.14642581343650818, |
|
"learning_rate": 0.0016530969837532485, |
|
"loss": 0.6712, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.004030548780064072, |
|
"grad_norm": 0.30584490299224854, |
|
"learning_rate": 0.0016482283953077885, |
|
"loss": 0.7566, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.004058345668202444, |
|
"grad_norm": 0.17027173936367035, |
|
"learning_rate": 0.0016433331607319341, |
|
"loss": 0.7187, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.004086142556340817, |
|
"grad_norm": 0.22963419556617737, |
|
"learning_rate": 0.0016384114812496057, |
|
"loss": 0.732, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.004113939444479191, |
|
"grad_norm": 0.6273247599601746, |
|
"learning_rate": 0.0016334635591717702, |
|
"loss": 0.6352, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.004141736332617564, |
|
"grad_norm": 0.14807386696338654, |
|
"learning_rate": 0.0016284895978881237, |
|
"loss": 0.6705, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.0041695332207559365, |
|
"grad_norm": 0.19606240093708038, |
|
"learning_rate": 0.0016234898018587336, |
|
"loss": 0.7171, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0041695332207559365, |
|
"eval_loss": 0.8552482724189758, |
|
"eval_runtime": 50.0085, |
|
"eval_samples_per_second": 11.538, |
|
"eval_steps_per_second": 5.779, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.004197330108894309, |
|
"grad_norm": 0.4086618423461914, |
|
"learning_rate": 0.0016184643766056315, |
|
"loss": 1.0707, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.004225126997032682, |
|
"grad_norm": 0.2321600317955017, |
|
"learning_rate": 0.0016134135287043667, |
|
"loss": 1.0719, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.004252923885171055, |
|
"grad_norm": 0.09448766708374023, |
|
"learning_rate": 0.0016083374657755133, |
|
"loss": 1.035, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.004280720773309428, |
|
"grad_norm": 0.0990653857588768, |
|
"learning_rate": 0.0016032363964761363, |
|
"loss": 0.9306, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.004308517661447801, |
|
"grad_norm": 0.09489451348781586, |
|
"learning_rate": 0.001598110530491216, |
|
"loss": 0.8499, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.004336314549586173, |
|
"grad_norm": 0.09735696017742157, |
|
"learning_rate": 0.0015929600785250257, |
|
"loss": 0.8704, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.004364111437724547, |
|
"grad_norm": 0.1042320653796196, |
|
"learning_rate": 0.0015877852522924731, |
|
"loss": 0.8512, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.00439190832586292, |
|
"grad_norm": 0.12351106107234955, |
|
"learning_rate": 0.0015825862645103962, |
|
"loss": 0.9413, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.004419705214001293, |
|
"grad_norm": 0.09481119364500046, |
|
"learning_rate": 0.0015773633288888196, |
|
"loss": 0.8296, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.0044475021021396655, |
|
"grad_norm": 0.105812206864357, |
|
"learning_rate": 0.0015721166601221697, |
|
"loss": 0.7714, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.004475298990278038, |
|
"grad_norm": 0.11305416375398636, |
|
"learning_rate": 0.00156684647388045, |
|
"loss": 0.9412, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.004503095878416411, |
|
"grad_norm": 0.10926243662834167, |
|
"learning_rate": 0.0015615529868003748, |
|
"loss": 0.8682, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.004530892766554784, |
|
"grad_norm": 0.10087363421916962, |
|
"learning_rate": 0.0015562364164764648, |
|
"loss": 0.8325, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.004558689654693157, |
|
"grad_norm": 0.09206137806177139, |
|
"learning_rate": 0.0015508969814521025, |
|
"loss": 0.7958, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.00458648654283153, |
|
"grad_norm": 0.11067818850278854, |
|
"learning_rate": 0.0015455349012105486, |
|
"loss": 1.0522, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.004614283430969903, |
|
"grad_norm": 0.11335323750972748, |
|
"learning_rate": 0.0015401503961659203, |
|
"loss": 0.9374, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.004642080319108276, |
|
"grad_norm": 0.1060660108923912, |
|
"learning_rate": 0.0015347436876541297, |
|
"loss": 0.9861, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.004669877207246649, |
|
"grad_norm": 0.12663975358009338, |
|
"learning_rate": 0.0015293149979237874, |
|
"loss": 0.7834, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.004697674095385022, |
|
"grad_norm": 0.1315016895532608, |
|
"learning_rate": 0.0015238645501270654, |
|
"loss": 0.9251, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.0047254709835233945, |
|
"grad_norm": 0.16219031810760498, |
|
"learning_rate": 0.0015183925683105253, |
|
"loss": 0.8951, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.004753267871661767, |
|
"grad_norm": 0.1081567108631134, |
|
"learning_rate": 0.0015128992774059062, |
|
"loss": 0.9457, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.00478106475980014, |
|
"grad_norm": 0.1203177273273468, |
|
"learning_rate": 0.0015073849032208823, |
|
"loss": 0.9307, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.004808861647938513, |
|
"grad_norm": 0.11848781257867813, |
|
"learning_rate": 0.0015018496724297776, |
|
"loss": 0.8301, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.004836658536076886, |
|
"grad_norm": 0.13573849201202393, |
|
"learning_rate": 0.0014962938125642501, |
|
"loss": 0.723, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.0048644554242152594, |
|
"grad_norm": 0.10743360966444016, |
|
"learning_rate": 0.001490717552003938, |
|
"loss": 0.7861, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.004892252312353632, |
|
"grad_norm": 0.10989518463611603, |
|
"learning_rate": 0.001485121119967072, |
|
"loss": 0.7905, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.004920049200492005, |
|
"grad_norm": 0.10301119089126587, |
|
"learning_rate": 0.001479504746501054, |
|
"loss": 0.6952, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.004947846088630378, |
|
"grad_norm": 0.12021514773368835, |
|
"learning_rate": 0.0014738686624729987, |
|
"loss": 0.782, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.004975642976768751, |
|
"grad_norm": 0.11153309047222137, |
|
"learning_rate": 0.0014682130995602458, |
|
"loss": 0.8065, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.0050034398649071235, |
|
"grad_norm": 0.11229830235242844, |
|
"learning_rate": 0.0014625382902408355, |
|
"loss": 0.7113, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.005031236753045496, |
|
"grad_norm": 0.11783714592456818, |
|
"learning_rate": 0.0014568444677839517, |
|
"loss": 0.6218, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.005059033641183869, |
|
"grad_norm": 0.12635229527950287, |
|
"learning_rate": 0.0014511318662403345, |
|
"loss": 0.6719, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.005086830529322243, |
|
"grad_norm": 0.12254615128040314, |
|
"learning_rate": 0.0014454007204326592, |
|
"loss": 0.7263, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.005114627417460616, |
|
"grad_norm": 0.11896725744009018, |
|
"learning_rate": 0.0014396512659458822, |
|
"loss": 0.8887, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.005142424305598988, |
|
"grad_norm": 0.10813646763563156, |
|
"learning_rate": 0.0014338837391175581, |
|
"loss": 0.6242, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.005170221193737361, |
|
"grad_norm": 0.10913447290658951, |
|
"learning_rate": 0.0014280983770281258, |
|
"loss": 0.5932, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.005198018081875734, |
|
"grad_norm": 0.11929253488779068, |
|
"learning_rate": 0.00142229541749116, |
|
"loss": 0.7183, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.005225814970014107, |
|
"grad_norm": 0.1152404397726059, |
|
"learning_rate": 0.001416475099043599, |
|
"loss": 0.7041, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.00525361185815248, |
|
"grad_norm": 0.11904522776603699, |
|
"learning_rate": 0.001410637660935938, |
|
"loss": 0.7405, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.0052814087462908525, |
|
"grad_norm": 0.10983660072088242, |
|
"learning_rate": 0.0014047833431223937, |
|
"loss": 0.7169, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.005309205634429225, |
|
"grad_norm": 0.1285189539194107, |
|
"learning_rate": 0.0013989123862510418, |
|
"loss": 0.7584, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.005337002522567599, |
|
"grad_norm": 0.13100877404212952, |
|
"learning_rate": 0.0013930250316539236, |
|
"loss": 0.7606, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.005364799410705972, |
|
"grad_norm": 0.14966635406017303, |
|
"learning_rate": 0.0013871215213371283, |
|
"loss": 0.5972, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.005392596298844345, |
|
"grad_norm": 0.13933199644088745, |
|
"learning_rate": 0.0013812020979708417, |
|
"loss": 0.5656, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.005420393186982717, |
|
"grad_norm": 0.13888134062290192, |
|
"learning_rate": 0.0013752670048793744, |
|
"loss": 0.7759, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.00544819007512109, |
|
"grad_norm": 0.14111551642417908, |
|
"learning_rate": 0.0013693164860311565, |
|
"loss": 0.7513, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.005475986963259463, |
|
"grad_norm": 0.16336259245872498, |
|
"learning_rate": 0.0013633507860287115, |
|
"loss": 0.7926, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.005503783851397836, |
|
"grad_norm": 0.1474616378545761, |
|
"learning_rate": 0.0013573701500986012, |
|
"loss": 0.5559, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.005531580739536209, |
|
"grad_norm": 0.18535536527633667, |
|
"learning_rate": 0.001351374824081343, |
|
"loss": 0.7313, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.0055593776276745815, |
|
"grad_norm": 0.1758536696434021, |
|
"learning_rate": 0.0013453650544213076, |
|
"loss": 0.606, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0055593776276745815, |
|
"eval_loss": 0.7939153909683228, |
|
"eval_runtime": 50.0064, |
|
"eval_samples_per_second": 11.539, |
|
"eval_steps_per_second": 5.779, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.005587174515812955, |
|
"grad_norm": 0.13161815702915192, |
|
"learning_rate": 0.0013393410881565877, |
|
"loss": 0.8241, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.005614971403951328, |
|
"grad_norm": 0.1270628422498703, |
|
"learning_rate": 0.0013333031729088419, |
|
"loss": 0.9452, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.005642768292089701, |
|
"grad_norm": 0.10348006337881088, |
|
"learning_rate": 0.0013272515568731168, |
|
"loss": 0.9022, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.005670565180228074, |
|
"grad_norm": 0.1050761267542839, |
|
"learning_rate": 0.0013211864888076456, |
|
"loss": 0.947, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.005698362068366446, |
|
"grad_norm": 0.09829563647508621, |
|
"learning_rate": 0.0013151082180236208, |
|
"loss": 0.8633, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.005726158956504819, |
|
"grad_norm": 0.10269923508167267, |
|
"learning_rate": 0.0013090169943749475, |
|
"loss": 0.9481, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.005753955844643192, |
|
"grad_norm": 0.09650863707065582, |
|
"learning_rate": 0.001302913068247972, |
|
"loss": 0.8289, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.005781752732781565, |
|
"grad_norm": 0.10350701212882996, |
|
"learning_rate": 0.0012967966905511905, |
|
"loss": 0.8332, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.005809549620919938, |
|
"grad_norm": 0.09841641783714294, |
|
"learning_rate": 0.0012906681127049337, |
|
"loss": 0.783, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.005837346509058311, |
|
"grad_norm": 0.10020092874765396, |
|
"learning_rate": 0.0012845275866310323, |
|
"loss": 0.6906, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.005865143397196684, |
|
"grad_norm": 0.10702440142631531, |
|
"learning_rate": 0.0012783753647424634, |
|
"loss": 0.6887, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.005892940285335057, |
|
"grad_norm": 0.10005506873130798, |
|
"learning_rate": 0.001272211699932971, |
|
"loss": 0.9122, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.00592073717347343, |
|
"grad_norm": 0.09415773302316666, |
|
"learning_rate": 0.0012660368455666752, |
|
"loss": 0.7632, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.005948534061611803, |
|
"grad_norm": 0.08957406878471375, |
|
"learning_rate": 0.001259851055467653, |
|
"loss": 0.6393, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.005976330949750175, |
|
"grad_norm": 0.12815257906913757, |
|
"learning_rate": 0.0012536545839095072, |
|
"loss": 0.9806, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.006004127837888548, |
|
"grad_norm": 0.10439406335353851, |
|
"learning_rate": 0.0012474476856049145, |
|
"loss": 0.8639, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.006031924726026921, |
|
"grad_norm": 0.10832219570875168, |
|
"learning_rate": 0.0012412306156951525, |
|
"loss": 0.9136, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.006059721614165294, |
|
"grad_norm": 0.09555172920227051, |
|
"learning_rate": 0.0012350036297396152, |
|
"loss": 0.6873, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.0060875185023036675, |
|
"grad_norm": 0.09809956699609756, |
|
"learning_rate": 0.0012287669837053054, |
|
"loss": 0.9795, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.00611531539044204, |
|
"grad_norm": 0.10025002062320709, |
|
"learning_rate": 0.0012225209339563144, |
|
"loss": 0.7104, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.006143112278580413, |
|
"grad_norm": 0.10654330253601074, |
|
"learning_rate": 0.0012162657372432836, |
|
"loss": 0.7548, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.006170909166718786, |
|
"grad_norm": 0.11099881678819656, |
|
"learning_rate": 0.0012100016506928493, |
|
"loss": 0.8631, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.006198706054857159, |
|
"grad_norm": 0.10728763788938522, |
|
"learning_rate": 0.0012037289317970757, |
|
"loss": 0.7795, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.0062265029429955316, |
|
"grad_norm": 0.1109003946185112, |
|
"learning_rate": 0.001197447838402867, |
|
"loss": 0.8686, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.006254299831133904, |
|
"grad_norm": 0.11378157138824463, |
|
"learning_rate": 0.0011911586287013725, |
|
"loss": 0.7896, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.006282096719272277, |
|
"grad_norm": 0.1115458682179451, |
|
"learning_rate": 0.0011848615612173687, |
|
"loss": 0.6893, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.00630989360741065, |
|
"grad_norm": 0.10416682809591293, |
|
"learning_rate": 0.0011785568947986368, |
|
"loss": 0.7173, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.006337690495549024, |
|
"grad_norm": 0.10408192873001099, |
|
"learning_rate": 0.001172244888605319, |
|
"loss": 0.6849, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.0063654873836873965, |
|
"grad_norm": 0.11150062829256058, |
|
"learning_rate": 0.001165925802099268, |
|
"loss": 0.6506, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.006393284271825769, |
|
"grad_norm": 0.11302520334720612, |
|
"learning_rate": 0.0011595998950333793, |
|
"loss": 0.7856, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.006421081159964142, |
|
"grad_norm": 0.1063392385840416, |
|
"learning_rate": 0.001153267427440916, |
|
"loss": 0.6815, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.006448878048102515, |
|
"grad_norm": 0.10356339067220688, |
|
"learning_rate": 0.001146928659624818, |
|
"loss": 0.7153, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.006476674936240888, |
|
"grad_norm": 0.11310825496912003, |
|
"learning_rate": 0.0011405838521470028, |
|
"loss": 0.7835, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.0065044718243792606, |
|
"grad_norm": 0.11676845699548721, |
|
"learning_rate": 0.0011342332658176555, |
|
"loss": 0.6789, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.006532268712517633, |
|
"grad_norm": 0.10645750910043716, |
|
"learning_rate": 0.001127877161684506, |
|
"loss": 0.6765, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.006560065600656006, |
|
"grad_norm": 0.10652610659599304, |
|
"learning_rate": 0.0011215158010221004, |
|
"loss": 0.7016, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.00658786248879438, |
|
"grad_norm": 0.12389989197254181, |
|
"learning_rate": 0.0011151494453210595, |
|
"loss": 0.6711, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.006615659376932753, |
|
"grad_norm": 0.12299291789531708, |
|
"learning_rate": 0.0011087783562773311, |
|
"loss": 0.6813, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.0066434562650711255, |
|
"grad_norm": 0.1117280051112175, |
|
"learning_rate": 0.0011024027957814314, |
|
"loss": 0.6857, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.006671253153209498, |
|
"grad_norm": 0.1141078993678093, |
|
"learning_rate": 0.0010960230259076818, |
|
"loss": 0.5767, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.006699050041347871, |
|
"grad_norm": 0.11716222018003464, |
|
"learning_rate": 0.0010896393089034335, |
|
"loss": 0.617, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.006726846929486244, |
|
"grad_norm": 0.11783386021852493, |
|
"learning_rate": 0.0010832519071782894, |
|
"loss": 0.623, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.006754643817624617, |
|
"grad_norm": 0.1618836522102356, |
|
"learning_rate": 0.0010768610832933168, |
|
"loss": 0.8134, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.0067824407057629895, |
|
"grad_norm": 0.12553377449512482, |
|
"learning_rate": 0.0010704670999502539, |
|
"loss": 0.6001, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.006810237593901362, |
|
"grad_norm": 0.11109066009521484, |
|
"learning_rate": 0.001064070219980713, |
|
"loss": 0.5716, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.006838034482039736, |
|
"grad_norm": 0.11495482176542282, |
|
"learning_rate": 0.0010576707063353744, |
|
"loss": 0.5186, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.006865831370178109, |
|
"grad_norm": 0.12931731343269348, |
|
"learning_rate": 0.0010512688220731792, |
|
"loss": 0.6666, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.006893628258316482, |
|
"grad_norm": 0.12755966186523438, |
|
"learning_rate": 0.001044864830350515, |
|
"loss": 0.5354, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.0069214251464548545, |
|
"grad_norm": 0.16778767108917236, |
|
"learning_rate": 0.0010384589944103983, |
|
"loss": 0.7248, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.006949222034593227, |
|
"grad_norm": 0.19699546694755554, |
|
"learning_rate": 0.0010320515775716554, |
|
"loss": 0.5582, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.006949222034593227, |
|
"eval_loss": 0.7649185657501221, |
|
"eval_runtime": 50.0992, |
|
"eval_samples_per_second": 11.517, |
|
"eval_steps_per_second": 5.769, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0069770189227316, |
|
"grad_norm": 0.14372354745864868, |
|
"learning_rate": 0.0010256428432180956, |
|
"loss": 1.0122, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.007004815810869973, |
|
"grad_norm": 0.11382071673870087, |
|
"learning_rate": 0.0010192330547876872, |
|
"loss": 0.8098, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.007032612699008346, |
|
"grad_norm": 0.10247013717889786, |
|
"learning_rate": 0.0010128224757617274, |
|
"loss": 0.9305, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.0070604095871467185, |
|
"grad_norm": 0.09587737172842026, |
|
"learning_rate": 0.0010064113696540112, |
|
"loss": 0.9049, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.007088206475285092, |
|
"grad_norm": 0.0981241911649704, |
|
"learning_rate": 0.001, |
|
"loss": 0.7943, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.007116003363423465, |
|
"grad_norm": 0.10372012108564377, |
|
"learning_rate": 0.0009935886303459888, |
|
"loss": 0.8079, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.007143800251561838, |
|
"grad_norm": 0.0910145714879036, |
|
"learning_rate": 0.0009871775242382727, |
|
"loss": 0.7809, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.007171597139700211, |
|
"grad_norm": 0.09489095211029053, |
|
"learning_rate": 0.0009807669452123128, |
|
"loss": 0.7955, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.0071993940278385835, |
|
"grad_norm": 0.09354311227798462, |
|
"learning_rate": 0.0009743571567819046, |
|
"loss": 0.8035, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.007227190915976956, |
|
"grad_norm": 0.09070082753896713, |
|
"learning_rate": 0.0009679484224283449, |
|
"loss": 0.7908, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.007254987804115329, |
|
"grad_norm": 0.1037682518362999, |
|
"learning_rate": 0.0009615410055896016, |
|
"loss": 0.7739, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.007282784692253702, |
|
"grad_norm": 0.09658119082450867, |
|
"learning_rate": 0.0009551351696494854, |
|
"loss": 0.8614, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.007310581580392075, |
|
"grad_norm": 0.10361933708190918, |
|
"learning_rate": 0.0009487311779268209, |
|
"loss": 0.9021, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.007338378468530448, |
|
"grad_norm": 0.08979956805706024, |
|
"learning_rate": 0.0009423292936646257, |
|
"loss": 0.7559, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.007366175356668821, |
|
"grad_norm": 0.09771803766489029, |
|
"learning_rate": 0.0009359297800192872, |
|
"loss": 0.7717, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.007393972244807194, |
|
"grad_norm": 0.10628974437713623, |
|
"learning_rate": 0.0009295329000497459, |
|
"loss": 0.8083, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.007421769132945567, |
|
"grad_norm": 0.10804276913404465, |
|
"learning_rate": 0.0009231389167066836, |
|
"loss": 0.8077, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.00744956602108394, |
|
"grad_norm": 0.13234536349773407, |
|
"learning_rate": 0.0009167480928217108, |
|
"loss": 0.8623, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.0074773629092223124, |
|
"grad_norm": 0.10681891441345215, |
|
"learning_rate": 0.0009103606910965666, |
|
"loss": 0.7624, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.007505159797360685, |
|
"grad_norm": 0.11654768884181976, |
|
"learning_rate": 0.0009039769740923182, |
|
"loss": 0.8082, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.007532956685499058, |
|
"grad_norm": 0.1005023717880249, |
|
"learning_rate": 0.0008975972042185687, |
|
"loss": 0.7448, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.007560753573637431, |
|
"grad_norm": 0.10492710024118423, |
|
"learning_rate": 0.0008912216437226692, |
|
"loss": 0.8734, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.0075885504617758046, |
|
"grad_norm": 0.10636113584041595, |
|
"learning_rate": 0.0008848505546789408, |
|
"loss": 0.7397, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.007616347349914177, |
|
"grad_norm": 0.0918545350432396, |
|
"learning_rate": 0.0008784841989778997, |
|
"loss": 0.6861, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.00764414423805255, |
|
"grad_norm": 0.10185957700014114, |
|
"learning_rate": 0.0008721228383154939, |
|
"loss": 0.7906, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.007671941126190923, |
|
"grad_norm": 0.09865249693393707, |
|
"learning_rate": 0.0008657667341823448, |
|
"loss": 0.6877, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.007699738014329296, |
|
"grad_norm": 0.10265160351991653, |
|
"learning_rate": 0.0008594161478529974, |
|
"loss": 0.7361, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.007727534902467669, |
|
"grad_norm": 0.10760695487260818, |
|
"learning_rate": 0.0008530713403751821, |
|
"loss": 0.6973, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.007755331790606041, |
|
"grad_norm": 0.1106325089931488, |
|
"learning_rate": 0.000846732572559084, |
|
"loss": 0.7736, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.007783128678744414, |
|
"grad_norm": 0.11260738223791122, |
|
"learning_rate": 0.000840400104966621, |
|
"loss": 0.7567, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.007810925566882787, |
|
"grad_norm": 0.10231205821037292, |
|
"learning_rate": 0.0008340741979007324, |
|
"loss": 0.7018, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.00783872245502116, |
|
"grad_norm": 0.10939161479473114, |
|
"learning_rate": 0.0008277551113946811, |
|
"loss": 0.5913, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.007866519343159533, |
|
"grad_norm": 0.10841213166713715, |
|
"learning_rate": 0.0008214431052013634, |
|
"loss": 0.7421, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.007894316231297906, |
|
"grad_norm": 0.11439742892980576, |
|
"learning_rate": 0.0008151384387826313, |
|
"loss": 0.7312, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.007922113119436278, |
|
"grad_norm": 0.11929941177368164, |
|
"learning_rate": 0.0008088413712986279, |
|
"loss": 0.7529, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.007949910007574652, |
|
"grad_norm": 0.11869197338819504, |
|
"learning_rate": 0.0008025521615971329, |
|
"loss": 0.6848, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.007977706895713026, |
|
"grad_norm": 0.10041101276874542, |
|
"learning_rate": 0.0007962710682029245, |
|
"loss": 0.5243, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.008005503783851398, |
|
"grad_norm": 0.11425133794546127, |
|
"learning_rate": 0.0007899983493071507, |
|
"loss": 0.6441, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.008033300671989771, |
|
"grad_norm": 0.10552135854959488, |
|
"learning_rate": 0.0007837342627567166, |
|
"loss": 0.5675, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.008061097560128143, |
|
"grad_norm": 0.12019749730825424, |
|
"learning_rate": 0.0007774790660436857, |
|
"loss": 0.7536, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.008088894448266517, |
|
"grad_norm": 0.11838424205780029, |
|
"learning_rate": 0.0007712330162946947, |
|
"loss": 0.6558, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.008116691336404889, |
|
"grad_norm": 0.11879291385412216, |
|
"learning_rate": 0.0007649963702603848, |
|
"loss": 0.6071, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.008144488224543263, |
|
"grad_norm": 0.1303434818983078, |
|
"learning_rate": 0.0007587693843048475, |
|
"loss": 0.5558, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.008172285112681634, |
|
"grad_norm": 0.1721266806125641, |
|
"learning_rate": 0.0007525523143950859, |
|
"loss": 0.6311, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.008200082000820008, |
|
"grad_norm": 0.13676951825618744, |
|
"learning_rate": 0.0007463454160904928, |
|
"loss": 0.5886, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.008227878888958382, |
|
"grad_norm": 0.16192977130413055, |
|
"learning_rate": 0.0007401489445323472, |
|
"loss": 0.746, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.008255675777096754, |
|
"grad_norm": 0.12178271263837814, |
|
"learning_rate": 0.000733963154433325, |
|
"loss": 0.544, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.008283472665235127, |
|
"grad_norm": 0.1465931534767151, |
|
"learning_rate": 0.0007277883000670289, |
|
"loss": 0.4949, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.0083112695533735, |
|
"grad_norm": 0.15438374876976013, |
|
"learning_rate": 0.0007216246352575369, |
|
"loss": 0.6408, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.008339066441511873, |
|
"grad_norm": 0.2604023814201355, |
|
"learning_rate": 0.0007154724133689676, |
|
"loss": 0.7431, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.008339066441511873, |
|
"eval_loss": 0.7490311861038208, |
|
"eval_runtime": 50.0501, |
|
"eval_samples_per_second": 11.528, |
|
"eval_steps_per_second": 5.774, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.008366863329650245, |
|
"grad_norm": 0.12751354277133942, |
|
"learning_rate": 0.0007093318872950665, |
|
"loss": 0.9951, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.008394660217788619, |
|
"grad_norm": 0.10959775000810623, |
|
"learning_rate": 0.0007032033094488094, |
|
"loss": 0.8628, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.00842245710592699, |
|
"grad_norm": 0.11009855568408966, |
|
"learning_rate": 0.0006970869317520279, |
|
"loss": 0.9437, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.008450253994065364, |
|
"grad_norm": 0.09348214417695999, |
|
"learning_rate": 0.0006909830056250527, |
|
"loss": 0.7982, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.008478050882203738, |
|
"grad_norm": 0.09079182893037796, |
|
"learning_rate": 0.0006848917819763793, |
|
"loss": 0.759, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.00850584777034211, |
|
"grad_norm": 0.09549093246459961, |
|
"learning_rate": 0.0006788135111923545, |
|
"loss": 0.93, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.008533644658480484, |
|
"grad_norm": 0.09193756431341171, |
|
"learning_rate": 0.0006727484431268831, |
|
"loss": 0.8092, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.008561441546618856, |
|
"grad_norm": 0.0906609520316124, |
|
"learning_rate": 0.0006666968270911584, |
|
"loss": 0.8559, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.00858923843475723, |
|
"grad_norm": 0.09579095989465714, |
|
"learning_rate": 0.0006606589118434126, |
|
"loss": 0.9007, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.008617035322895601, |
|
"grad_norm": 0.09106986969709396, |
|
"learning_rate": 0.0006546349455786926, |
|
"loss": 0.7693, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.008644832211033975, |
|
"grad_norm": 0.08439186215400696, |
|
"learning_rate": 0.0006486251759186573, |
|
"loss": 0.7757, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.008672629099172347, |
|
"grad_norm": 0.09723315387964249, |
|
"learning_rate": 0.0006426298499013994, |
|
"loss": 0.7181, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.00870042598731072, |
|
"grad_norm": 0.09807878732681274, |
|
"learning_rate": 0.0006366492139712886, |
|
"loss": 0.9234, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.008728222875449094, |
|
"grad_norm": 0.0981912836432457, |
|
"learning_rate": 0.0006306835139688439, |
|
"loss": 0.8984, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.008756019763587466, |
|
"grad_norm": 0.0935656800866127, |
|
"learning_rate": 0.000624732995120626, |
|
"loss": 0.7538, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.00878381665172584, |
|
"grad_norm": 0.09538404643535614, |
|
"learning_rate": 0.0006187979020291583, |
|
"loss": 0.86, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.008811613539864212, |
|
"grad_norm": 0.10517210513353348, |
|
"learning_rate": 0.000612878478662872, |
|
"loss": 0.9827, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.008839410428002585, |
|
"grad_norm": 0.11277417838573456, |
|
"learning_rate": 0.0006069749683460764, |
|
"loss": 0.7863, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.008867207316140957, |
|
"grad_norm": 0.10222118347883224, |
|
"learning_rate": 0.0006010876137489584, |
|
"loss": 0.7519, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.008895004204279331, |
|
"grad_norm": 0.08672767877578735, |
|
"learning_rate": 0.0005952166568776062, |
|
"loss": 0.8205, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.008922801092417703, |
|
"grad_norm": 0.10612691193819046, |
|
"learning_rate": 0.0005893623390640622, |
|
"loss": 0.8871, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.008950597980556077, |
|
"grad_norm": 0.09716677665710449, |
|
"learning_rate": 0.0005835249009564013, |
|
"loss": 0.7042, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.00897839486869445, |
|
"grad_norm": 0.0943322479724884, |
|
"learning_rate": 0.0005777045825088404, |
|
"loss": 0.6464, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.009006191756832822, |
|
"grad_norm": 0.10483499616384506, |
|
"learning_rate": 0.0005719016229718748, |
|
"loss": 0.6854, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.009033988644971196, |
|
"grad_norm": 0.10260860621929169, |
|
"learning_rate": 0.0005661162608824419, |
|
"loss": 0.7816, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.009061785533109568, |
|
"grad_norm": 0.1006467193365097, |
|
"learning_rate": 0.0005603487340541181, |
|
"loss": 0.8237, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.009089582421247942, |
|
"grad_norm": 0.09507747739553452, |
|
"learning_rate": 0.0005545992795673408, |
|
"loss": 0.6931, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.009117379309386314, |
|
"grad_norm": 0.08719619363546371, |
|
"learning_rate": 0.0005488681337596652, |
|
"loss": 0.5646, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.009145176197524687, |
|
"grad_norm": 0.0974939689040184, |
|
"learning_rate": 0.0005431555322160483, |
|
"loss": 0.655, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.00917297308566306, |
|
"grad_norm": 0.11043986678123474, |
|
"learning_rate": 0.000537461709759165, |
|
"loss": 0.8183, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.009200769973801433, |
|
"grad_norm": 0.10393897444009781, |
|
"learning_rate": 0.0005317869004397545, |
|
"loss": 0.7377, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.009228566861939807, |
|
"grad_norm": 0.10675422102212906, |
|
"learning_rate": 0.0005261313375270014, |
|
"loss": 0.7157, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.009256363750078178, |
|
"grad_norm": 0.11000238358974457, |
|
"learning_rate": 0.0005204952534989462, |
|
"loss": 0.78, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.009284160638216552, |
|
"grad_norm": 0.11165881901979446, |
|
"learning_rate": 0.0005148788800329278, |
|
"loss": 0.6822, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.009311957526354924, |
|
"grad_norm": 0.10411150753498077, |
|
"learning_rate": 0.0005092824479960625, |
|
"loss": 0.5622, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.009339754414493298, |
|
"grad_norm": 0.11088427156209946, |
|
"learning_rate": 0.0005037061874357502, |
|
"loss": 0.5462, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.00936755130263167, |
|
"grad_norm": 0.11364025622606277, |
|
"learning_rate": 0.0004981503275702227, |
|
"loss": 0.5887, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.009395348190770043, |
|
"grad_norm": 0.1214662715792656, |
|
"learning_rate": 0.000492615096779118, |
|
"loss": 0.7526, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.009423145078908415, |
|
"grad_norm": 0.11156253516674042, |
|
"learning_rate": 0.000487100722594094, |
|
"loss": 0.5498, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.009450941967046789, |
|
"grad_norm": 0.10013176500797272, |
|
"learning_rate": 0.00048160743168947496, |
|
"loss": 0.4897, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.009478738855185163, |
|
"grad_norm": 0.12529128789901733, |
|
"learning_rate": 0.00047613544987293446, |
|
"loss": 0.6721, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.009506535743323535, |
|
"grad_norm": 0.1168283075094223, |
|
"learning_rate": 0.00047068500207621256, |
|
"loss": 0.5659, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.009534332631461908, |
|
"grad_norm": 0.16703759133815765, |
|
"learning_rate": 0.0004652563123458703, |
|
"loss": 0.7979, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.00956212951960028, |
|
"grad_norm": 0.13577383756637573, |
|
"learning_rate": 0.00045984960383408004, |
|
"loss": 0.7327, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.009589926407738654, |
|
"grad_norm": 0.14807015657424927, |
|
"learning_rate": 0.0004544650987894514, |
|
"loss": 0.6478, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.009617723295877026, |
|
"grad_norm": 0.12839584052562714, |
|
"learning_rate": 0.0004491030185478976, |
|
"loss": 0.5733, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.0096455201840154, |
|
"grad_norm": 0.13728651404380798, |
|
"learning_rate": 0.0004437635835235353, |
|
"loss": 0.6203, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.009673317072153772, |
|
"grad_norm": 0.12902460992336273, |
|
"learning_rate": 0.0004384470131996252, |
|
"loss": 0.5744, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.009701113960292145, |
|
"grad_norm": 0.1356661170721054, |
|
"learning_rate": 0.0004331535261195504, |
|
"loss": 0.4755, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.009728910848430519, |
|
"grad_norm": 0.18092875182628632, |
|
"learning_rate": 0.0004278833398778306, |
|
"loss": 0.7017, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.009728910848430519, |
|
"eval_loss": 0.7205991148948669, |
|
"eval_runtime": 50.1118, |
|
"eval_samples_per_second": 11.514, |
|
"eval_steps_per_second": 5.767, |
|
"step": 350 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.248730763853824e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|