{
  "best_metric": 0.9426594972610474,
  "best_model_checkpoint": "miner_id_24/checkpoint-900",
  "epoch": 0.45184503388837755,
  "eval_steps": 150,
  "global_step": 900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0005020500376537528, "grad_norm": 2.236064910888672, "learning_rate": 5e-06, "loss": 2.3327, "step": 1},
    {"epoch": 0.0005020500376537528, "eval_loss": 2.112123489379883, "eval_runtime": 710.0183, "eval_samples_per_second": 21.263, "eval_steps_per_second": 2.659, "step": 1},
    {"epoch": 0.0010041000753075056, "grad_norm": 2.1631500720977783, "learning_rate": 1e-05, "loss": 2.1082, "step": 2},
    {"epoch": 0.0015061501129612586, "grad_norm": 2.1660964488983154, "learning_rate": 1.5e-05, "loss": 2.1297, "step": 3},
    {"epoch": 0.0020082001506150113, "grad_norm": 2.329266309738159, "learning_rate": 2e-05, "loss": 2.1222, "step": 4},
    {"epoch": 0.002510250188268764, "grad_norm": 2.6099958419799805, "learning_rate": 2.5e-05, "loss": 1.9193, "step": 5},
    {"epoch": 0.003012300225922517, "grad_norm": 1.4518094062805176, "learning_rate": 3e-05, "loss": 1.8033, "step": 6},
    {"epoch": 0.0035143502635762696, "grad_norm": 1.176897644996643, "learning_rate": 3.5e-05, "loss": 1.8032, "step": 7},
    {"epoch": 0.0040164003012300225, "grad_norm": 0.9449969530105591, "learning_rate": 4e-05, "loss": 1.7104, "step": 8},
    {"epoch": 0.0045184503388837755, "grad_norm": 1.2403700351715088, "learning_rate": 4.5e-05, "loss": 1.697, "step": 9},
    {"epoch": 0.005020500376537528, "grad_norm": 1.3342478275299072, "learning_rate": 5e-05, "loss": 1.6163, "step": 10},
    {"epoch": 0.005522550414191281, "grad_norm": 1.2211023569107056, "learning_rate": 5.500000000000001e-05, "loss": 1.6009, "step": 11},
    {"epoch": 0.006024600451845034, "grad_norm": 0.969095766544342, "learning_rate": 6e-05, "loss": 1.5262, "step": 12},
    {"epoch": 0.006526650489498786, "grad_norm": 0.9153647422790527, "learning_rate": 6.500000000000001e-05, "loss": 1.5286, "step": 13},
    {"epoch": 0.007028700527152539, "grad_norm": 0.8192627429962158, "learning_rate": 7e-05, "loss": 1.444, "step": 14},
    {"epoch": 0.007530750564806292, "grad_norm": 0.753913938999176, "learning_rate": 7.500000000000001e-05, "loss": 1.4985, "step": 15},
    {"epoch": 0.008032800602460045, "grad_norm": 0.7413599491119385, "learning_rate": 8e-05, "loss": 1.4897, "step": 16},
    {"epoch": 0.008534850640113799, "grad_norm": 0.6402814388275146, "learning_rate": 8.5e-05, "loss": 1.4066, "step": 17},
    {"epoch": 0.009036900677767551, "grad_norm": 0.546452522277832, "learning_rate": 9e-05, "loss": 1.377, "step": 18},
    {"epoch": 0.009538950715421303, "grad_norm": 0.5637431144714355, "learning_rate": 9.5e-05, "loss": 1.3894, "step": 19},
    {"epoch": 0.010041000753075057, "grad_norm": 0.5624980330467224, "learning_rate": 0.0001, "loss": 1.4045, "step": 20},
    {"epoch": 0.010543050790728809, "grad_norm": 0.5827409029006958, "learning_rate": 9.999988735390004e-05, "loss": 1.3753, "step": 21},
    {"epoch": 0.011045100828382563, "grad_norm": 0.5779702067375183, "learning_rate": 9.999954941610768e-05, "loss": 1.3436, "step": 22},
    {"epoch": 0.011547150866036315, "grad_norm": 0.5667853951454163, "learning_rate": 9.999898618814565e-05, "loss": 1.2881, "step": 23},
    {"epoch": 0.012049200903690068, "grad_norm": 0.575544536113739, "learning_rate": 9.999819767255174e-05, "loss": 1.2713, "step": 24},
    {"epoch": 0.01255125094134382, "grad_norm": 0.5908040404319763, "learning_rate": 9.99971838728789e-05, "loss": 1.3644, "step": 25},
    {"epoch": 0.013053300978997573, "grad_norm": 0.6002948880195618, "learning_rate": 9.999594479369514e-05, "loss": 1.2234, "step": 26},
    {"epoch": 0.013555351016651326, "grad_norm": 0.5986592769622803, "learning_rate": 9.999448044058358e-05, "loss": 1.2277, "step": 27},
    {"epoch": 0.014057401054305078, "grad_norm": 0.7111369967460632, "learning_rate": 9.999279082014232e-05, "loss": 1.287, "step": 28},
    {"epoch": 0.014559451091958832, "grad_norm": 0.5424044728279114, "learning_rate": 9.999087593998458e-05, "loss": 1.2089, "step": 29},
    {"epoch": 0.015061501129612584, "grad_norm": 0.5523831248283386, "learning_rate": 9.998873580873848e-05, "loss": 1.2078, "step": 30},
    {"epoch": 0.015563551167266338, "grad_norm": 0.567329466342926, "learning_rate": 9.998637043604711e-05, "loss": 1.2944, "step": 31},
    {"epoch": 0.01606560120492009, "grad_norm": 0.5832935571670532, "learning_rate": 9.99837798325685e-05, "loss": 1.1488, "step": 32},
    {"epoch": 0.016567651242573842, "grad_norm": 0.5010744333267212, "learning_rate": 9.998096400997549e-05, "loss": 1.1848, "step": 33},
    {"epoch": 0.017069701280227598, "grad_norm": 0.5734532475471497, "learning_rate": 9.997792298095572e-05, "loss": 1.2756, "step": 34},
    {"epoch": 0.01757175131788135, "grad_norm": 0.5715062022209167, "learning_rate": 9.997465675921163e-05, "loss": 1.1441, "step": 35},
    {"epoch": 0.018073801355535102, "grad_norm": 0.5624764561653137, "learning_rate": 9.997116535946028e-05, "loss": 1.2006, "step": 36},
    {"epoch": 0.018575851393188854, "grad_norm": 0.5436444282531738, "learning_rate": 9.996744879743337e-05, "loss": 1.1561, "step": 37},
    {"epoch": 0.019077901430842606, "grad_norm": 0.6079046130180359, "learning_rate": 9.996350708987713e-05, "loss": 1.1539, "step": 38},
    {"epoch": 0.01957995146849636, "grad_norm": 0.6179251074790955, "learning_rate": 9.995934025455235e-05, "loss": 1.0866, "step": 39},
    {"epoch": 0.020082001506150114, "grad_norm": 0.5955487489700317, "learning_rate": 9.995494831023409e-05, "loss": 1.1225, "step": 40},
    {"epoch": 0.020584051543803866, "grad_norm": 0.6442997455596924, "learning_rate": 9.995033127671174e-05, "loss": 1.0479, "step": 41},
    {"epoch": 0.021086101581457618, "grad_norm": 0.6124761700630188, "learning_rate": 9.994548917478899e-05, "loss": 1.1356, "step": 42},
    {"epoch": 0.021588151619111373, "grad_norm": 0.5994246006011963, "learning_rate": 9.994042202628357e-05, "loss": 1.0242, "step": 43},
    {"epoch": 0.022090201656765125, "grad_norm": 0.6428829431533813, "learning_rate": 9.993512985402724e-05, "loss": 1.1755, "step": 44},
    {"epoch": 0.022592251694418877, "grad_norm": 0.6374857425689697, "learning_rate": 9.992961268186573e-05, "loss": 1.1132, "step": 45},
    {"epoch": 0.02309430173207263, "grad_norm": 0.6873119473457336, "learning_rate": 9.992387053465857e-05, "loss": 1.1258, "step": 46},
    {"epoch": 0.02359635176972638, "grad_norm": 0.7355693578720093, "learning_rate": 9.991790343827895e-05, "loss": 1.1345, "step": 47},
    {"epoch": 0.024098401807380137, "grad_norm": 0.724719762802124, "learning_rate": 9.991171141961369e-05, "loss": 1.0772, "step": 48},
    {"epoch": 0.02460045184503389, "grad_norm": 0.7716226577758789, "learning_rate": 9.990529450656303e-05, "loss": 1.0207, "step": 49},
    {"epoch": 0.02510250188268764, "grad_norm": 0.8283296227455139, "learning_rate": 9.989865272804063e-05, "loss": 0.9559, "step": 50},
    {"epoch": 0.025604551920341393, "grad_norm": 1.4440653324127197, "learning_rate": 9.989178611397327e-05, "loss": 1.5617, "step": 51},
    {"epoch": 0.026106601957995145, "grad_norm": 1.06277334690094, "learning_rate": 9.988469469530086e-05, "loss": 1.5022, "step": 52},
    {"epoch": 0.0266086519956489, "grad_norm": 0.6431909203529358, "learning_rate": 9.987737850397623e-05, "loss": 1.4657, "step": 53},
    {"epoch": 0.027110702033302653, "grad_norm": 0.6749468445777893, "learning_rate": 9.986983757296498e-05, "loss": 1.3443, "step": 54},
    {"epoch": 0.027612752070956405, "grad_norm": 0.6402671933174133, "learning_rate": 9.986207193624536e-05, "loss": 1.1939, "step": 55},
    {"epoch": 0.028114802108610157, "grad_norm": 0.5656529068946838, "learning_rate": 9.985408162880813e-05, "loss": 1.2935, "step": 56},
    {"epoch": 0.028616852146263912, "grad_norm": 0.5140467286109924, "learning_rate": 9.98458666866564e-05, "loss": 1.256, "step": 57},
    {"epoch": 0.029118902183917664, "grad_norm": 0.4963027238845825, "learning_rate": 9.983742714680538e-05, "loss": 1.2508, "step": 58},
    {"epoch": 0.029620952221571416, "grad_norm": 0.5174992680549622, "learning_rate": 9.982876304728232e-05, "loss": 1.2935, "step": 59},
    {"epoch": 0.03012300225922517, "grad_norm": 0.45506641268730164, "learning_rate": 9.981987442712633e-05, "loss": 1.2477, "step": 60},
    {"epoch": 0.03062505229687892, "grad_norm": 0.5008202195167542, "learning_rate": 9.981076132638812e-05, "loss": 1.253, "step": 61},
    {"epoch": 0.031127102334532676, "grad_norm": 0.4946709871292114, "learning_rate": 9.98014237861299e-05, "loss": 1.1136, "step": 62},
    {"epoch": 0.03162915237218643, "grad_norm": 0.4489033818244934, "learning_rate": 9.979186184842517e-05, "loss": 1.2179, "step": 63},
    {"epoch": 0.03213120240984018, "grad_norm": 0.42558974027633667, "learning_rate": 9.978207555635856e-05, "loss": 1.1858, "step": 64},
    {"epoch": 0.03263325244749393, "grad_norm": 0.4478650987148285, "learning_rate": 9.977206495402554e-05, "loss": 1.2091, "step": 65},
    {"epoch": 0.033135302485147684, "grad_norm": 0.4109612703323364, "learning_rate": 9.976183008653233e-05, "loss": 1.1997, "step": 66},
    {"epoch": 0.033637352522801436, "grad_norm": 0.419210821390152, "learning_rate": 9.975137099999566e-05, "loss": 1.1183, "step": 67},
    {"epoch": 0.034139402560455195, "grad_norm": 0.43940436840057373, "learning_rate": 9.974068774154251e-05, "loss": 1.2011, "step": 68},
    {"epoch": 0.03464145259810895, "grad_norm": 0.45610693097114563, "learning_rate": 9.972978035931001e-05, "loss": 1.2022, "step": 69},
    {"epoch": 0.0351435026357627, "grad_norm": 0.45957621932029724, "learning_rate": 9.971864890244513e-05, "loss": 1.1934, "step": 70},
    {"epoch": 0.03564555267341645, "grad_norm": 0.4197078347206116, "learning_rate": 9.970729342110446e-05, "loss": 1.1708, "step": 71},
    {"epoch": 0.036147602711070204, "grad_norm": 0.4471045136451721, "learning_rate": 9.969571396645399e-05, "loss": 1.1901, "step": 72},
    {"epoch": 0.036649652748723956, "grad_norm": 0.510300874710083, "learning_rate": 9.9683910590669e-05, "loss": 1.1461, "step": 73},
    {"epoch": 0.03715170278637771, "grad_norm": 0.417583703994751, "learning_rate": 9.967188334693363e-05, "loss": 1.1288, "step": 74},
    {"epoch": 0.03765375282403146, "grad_norm": 0.4619269371032715, "learning_rate": 9.965963228944078e-05, "loss": 1.1442, "step": 75},
    {"epoch": 0.03815580286168521, "grad_norm": 0.44993823766708374, "learning_rate": 9.964715747339178e-05, "loss": 1.1821, "step": 76},
    {"epoch": 0.03865785289933897, "grad_norm": 0.4357517659664154, "learning_rate": 9.963445895499622e-05, "loss": 1.0655, "step": 77},
    {"epoch": 0.03915990293699272, "grad_norm": 0.43514949083328247, "learning_rate": 9.962153679147161e-05, "loss": 1.1104, "step": 78},
    {"epoch": 0.039661952974646475, "grad_norm": 0.4674883484840393, "learning_rate": 9.960839104104327e-05, "loss": 1.056, "step": 79},
    {"epoch": 0.04016400301230023, "grad_norm": 0.463422030210495, "learning_rate": 9.959502176294383e-05, "loss": 1.169, "step": 80},
    {"epoch": 0.04066605304995398, "grad_norm": 0.4640124440193176, "learning_rate": 9.958142901741324e-05, "loss": 1.0641, "step": 81},
    {"epoch": 0.04116810308760773, "grad_norm": 0.4530577063560486, "learning_rate": 9.956761286569824e-05, "loss": 1.1478, "step": 82},
    {"epoch": 0.04167015312526148, "grad_norm": 0.4701811671257019, "learning_rate": 9.955357337005227e-05, "loss": 1.0432, "step": 83},
    {"epoch": 0.042172203162915235, "grad_norm": 0.49071431159973145, "learning_rate": 9.953931059373511e-05, "loss": 1.1219, "step": 84},
    {"epoch": 0.04267425320056899, "grad_norm": 0.4607682228088379, "learning_rate": 9.95248246010126e-05, "loss": 1.0986, "step": 85},
    {"epoch": 0.043176303238222746, "grad_norm": 0.4900347888469696, "learning_rate": 9.951011545715636e-05, "loss": 1.1192, "step": 86},
    {"epoch": 0.0436783532758765, "grad_norm": 0.49459338188171387, "learning_rate": 9.94951832284435e-05, "loss": 1.1103, "step": 87},
    {"epoch": 0.04418040331353025, "grad_norm": 0.48701831698417664, "learning_rate": 9.948002798215632e-05, "loss": 1.0517, "step": 88},
    {"epoch": 0.044682453351184, "grad_norm": 0.4620456397533417, "learning_rate": 9.946464978658199e-05, "loss": 1.0084, "step": 89},
    {"epoch": 0.045184503388837755, "grad_norm": 0.5349761247634888, "learning_rate": 9.944904871101228e-05, "loss": 1.1153, "step": 90},
    {"epoch": 0.04568655342649151, "grad_norm": 0.5464606285095215, "learning_rate": 9.943322482574315e-05, "loss": 0.9737, "step": 91},
    {"epoch": 0.04618860346414526, "grad_norm": 0.5389485955238342, "learning_rate": 9.941717820207461e-05, "loss": 0.9921, "step": 92},
    {"epoch": 0.04669065350179901, "grad_norm": 0.5406158566474915, "learning_rate": 9.940090891231025e-05, "loss": 1.0869, "step": 93},
    {"epoch": 0.04719270353945276, "grad_norm": 0.5455155968666077, "learning_rate": 9.938441702975689e-05, "loss": 1.0236, "step": 94},
    {"epoch": 0.04769475357710652, "grad_norm": 0.5470486283302307, "learning_rate": 9.936770262872443e-05, "loss": 1.0166, "step": 95},
    {"epoch": 0.048196803614760274, "grad_norm": 0.5690567493438721, "learning_rate": 9.935076578452534e-05, "loss": 1.0256, "step": 96},
    {"epoch": 0.048698853652414026, "grad_norm": 0.5862897038459778, "learning_rate": 9.933360657347441e-05, "loss": 0.9532, "step": 97},
    {"epoch": 0.04920090369006778, "grad_norm": 0.5633604526519775, "learning_rate": 9.931622507288834e-05, "loss": 0.9018, "step": 98},
    {"epoch": 0.04970295372772153, "grad_norm": 0.6516064405441284, "learning_rate": 9.929862136108549e-05, "loss": 0.9507, "step": 99},
    {"epoch": 0.05020500376537528, "grad_norm": 0.8028525114059448, "learning_rate": 9.928079551738543e-05, "loss": 0.8808, "step": 100},
    {"epoch": 0.050707053803029034, "grad_norm": 1.5746766328811646, "learning_rate": 9.926274762210862e-05, "loss": 1.6471, "step": 101},
    {"epoch": 0.051209103840682786, "grad_norm": 0.919540524482727, "learning_rate": 9.924447775657605e-05, "loss": 1.4097, "step": 102},
    {"epoch": 0.05171115387833654, "grad_norm": 0.5336892008781433, "learning_rate": 9.922598600310893e-05, "loss": 1.2989, "step": 103},
    {"epoch": 0.05221320391599029, "grad_norm": 0.5801246166229248, "learning_rate": 9.920727244502818e-05, "loss": 1.2606, "step": 104},
    {"epoch": 0.05271525395364405, "grad_norm": 0.5943406224250793, "learning_rate": 9.918833716665419e-05, "loss": 1.1681, "step": 105},
    {"epoch": 0.0532173039912978, "grad_norm": 0.49195364117622375, "learning_rate": 9.916918025330635e-05, "loss": 1.2577, "step": 106},
    {"epoch": 0.05371935402895155, "grad_norm": 0.5099748373031616, "learning_rate": 9.914980179130273e-05, "loss": 1.262, "step": 107},
    {"epoch": 0.054221404066605305, "grad_norm": 0.4685007929801941, "learning_rate": 9.913020186795967e-05, "loss": 1.1403, "step": 108},
    {"epoch": 0.05472345410425906, "grad_norm": 0.45162394642829895, "learning_rate": 9.911038057159135e-05, "loss": 1.213, "step": 109},
    {"epoch": 0.05522550414191281, "grad_norm": 0.4480658173561096, "learning_rate": 9.909033799150946e-05, "loss": 1.1956, "step": 110},
    {"epoch": 0.05572755417956656, "grad_norm": 0.460193395614624, "learning_rate": 9.907007421802272e-05, "loss": 1.2344, "step": 111},
    {"epoch": 0.056229604217220314, "grad_norm": 0.46196773648262024, "learning_rate": 9.904958934243654e-05, "loss": 1.0947, "step": 112},
    {"epoch": 0.056731654254874066, "grad_norm": 0.430660218000412, "learning_rate": 9.902888345705258e-05, "loss": 1.0833, "step": 113},
    {"epoch": 0.057233704292527825, "grad_norm": 0.4444407522678375, "learning_rate": 9.900795665516831e-05, "loss": 1.1319, "step": 114},
    {"epoch": 0.05773575433018158, "grad_norm": 0.4496801793575287, "learning_rate": 9.898680903107666e-05, "loss": 1.1493, "step": 115},
    {"epoch": 0.05823780436783533, "grad_norm": 0.40639162063598633, "learning_rate": 9.89654406800655e-05, "loss": 1.1085, "step": 116},
    {"epoch": 0.05873985440548908, "grad_norm": 0.43786799907684326, "learning_rate": 9.894385169841731e-05, "loss": 1.1228, "step": 117},
    {"epoch": 0.05924190444314283, "grad_norm": 0.4193272888660431, "learning_rate": 9.892204218340866e-05, "loss": 1.1277, "step": 118},
    {"epoch": 0.059743954480796585, "grad_norm": 0.4199259281158447, "learning_rate": 9.890001223330983e-05, "loss": 1.1616, "step": 119},
    {"epoch": 0.06024600451845034, "grad_norm": 0.42120906710624695, "learning_rate": 9.887776194738432e-05, "loss": 1.0961, "step": 120},
    {"epoch": 0.06074805455610409, "grad_norm": 0.5002040863037109, "learning_rate": 9.885529142588845e-05, "loss": 1.2211, "step": 121},
    {"epoch": 0.06125010459375784, "grad_norm": 0.4081428050994873, "learning_rate": 9.883260077007092e-05, "loss": 1.1441, "step": 122},
    {"epoch": 0.0617521546314116, "grad_norm": 0.4227527379989624, "learning_rate": 9.880969008217224e-05, "loss": 1.0954, "step": 123},
    {"epoch": 0.06225420466906535, "grad_norm": 0.4485255181789398, "learning_rate": 9.878655946542443e-05, "loss": 1.1613, "step": 124},
    {"epoch": 0.0627562547067191, "grad_norm": 0.4632939100265503, "learning_rate": 9.876320902405042e-05, "loss": 1.1114, "step": 125},
    {"epoch": 0.06325830474437286, "grad_norm": 0.43671849370002747, "learning_rate": 9.873963886326365e-05, "loss": 1.1023, "step": 126},
    {"epoch": 0.06376035478202662, "grad_norm": 0.41638222336769104, "learning_rate": 9.871584908926763e-05, "loss": 1.0879, "step": 127},
    {"epoch": 0.06426240481968036, "grad_norm": 0.46212083101272583, "learning_rate": 9.869183980925532e-05, "loss": 1.0654, "step": 128},
    {"epoch": 0.06476445485733412, "grad_norm": 0.4403015971183777, "learning_rate": 9.86676111314088e-05, "loss": 1.1132, "step": 129},
    {"epoch": 0.06526650489498786, "grad_norm": 0.46132922172546387, "learning_rate": 9.864316316489873e-05, "loss": 1.1063, "step": 130},
    {"epoch": 0.06576855493264162, "grad_norm": 0.4559481739997864, "learning_rate": 9.861849601988383e-05, "loss": 1.0584, "step": 131},
    {"epoch": 0.06627060497029537, "grad_norm": 0.451659619808197, "learning_rate": 9.85936098075104e-05, "loss": 1.0576, "step": 132},
    {"epoch": 0.06677265500794913, "grad_norm": 0.44598105549812317, "learning_rate": 9.856850463991186e-05, "loss": 1.0529, "step": 133},
    {"epoch": 0.06727470504560287, "grad_norm": 0.4388767182826996, "learning_rate": 9.85431806302081e-05, "loss": 1.0369, "step": 134},
    {"epoch": 0.06777675508325663, "grad_norm": 0.4496581554412842, "learning_rate": 9.851763789250525e-05, "loss": 1.0197, "step": 135},
    {"epoch": 0.06827880512091039, "grad_norm": 0.4703635573387146, "learning_rate": 9.849187654189487e-05, "loss": 1.0579, "step": 136},
    {"epoch": 0.06878085515856414, "grad_norm": 0.48079192638397217, "learning_rate": 9.846589669445355e-05, "loss": 1.072, "step": 137},
    {"epoch": 0.0692829051962179, "grad_norm": 0.4830515384674072, "learning_rate": 9.843969846724247e-05, "loss": 1.0606, "step": 138},
    {"epoch": 0.06978495523387164, "grad_norm": 0.48843124508857727, "learning_rate": 9.841328197830675e-05, "loss": 1.0112, "step": 139},
    {"epoch": 0.0702870052715254, "grad_norm": 0.5190821290016174, "learning_rate": 9.838664734667495e-05, "loss": 0.9057, "step": 140},
    {"epoch": 0.07078905530917914, "grad_norm": 0.4610013961791992, "learning_rate": 9.835979469235857e-05, "loss": 0.9143, "step": 141},
    {"epoch": 0.0712911053468329, "grad_norm": 0.5105639696121216, "learning_rate": 9.83327241363515e-05, "loss": 0.9804, "step": 142},
    {"epoch": 0.07179315538448665, "grad_norm": 0.5167362093925476, "learning_rate": 9.830543580062943e-05, "loss": 1.0125, "step": 143},
    {"epoch": 0.07229520542214041, "grad_norm": 0.48223769664764404, "learning_rate": 9.827792980814933e-05, "loss": 0.9303, "step": 144},
    {"epoch": 0.07279725545979417, "grad_norm": 0.5426216125488281, "learning_rate": 9.825020628284896e-05, "loss": 0.9922, "step": 145},
    {"epoch": 0.07329930549744791, "grad_norm": 0.5274797081947327, "learning_rate": 9.822226534964614e-05, "loss": 0.9352, "step": 146},
    {"epoch": 0.07380135553510167, "grad_norm": 0.5349685549736023, "learning_rate": 9.819410713443837e-05, "loss": 0.9655, "step": 147},
    {"epoch": 0.07430340557275542, "grad_norm": 0.5646516680717468, "learning_rate": 9.81657317641022e-05, "loss": 0.8978, "step": 148},
    {"epoch": 0.07480545561040917, "grad_norm": 0.6755395531654358, "learning_rate": 9.81371393664926e-05, "loss": 0.9381, "step": 149},
    {"epoch": 0.07530750564806292, "grad_norm": 0.7590980529785156, "learning_rate": 9.810833007044247e-05, "loss": 0.7978, "step": 150},
    {"epoch": 0.07530750564806292, "eval_loss": 1.1477792263031006, "eval_runtime": 710.8849, "eval_samples_per_second": 21.237, "eval_steps_per_second": 2.656, "step": 150},
    {"epoch": 0.07580955568571668, "grad_norm": 0.9611290097236633, "learning_rate": 9.807930400576199e-05, "loss": 1.5282, "step": 151},
    {"epoch": 0.07631160572337042, "grad_norm": 0.8264356851577759, "learning_rate": 9.805006130323809e-05, "loss": 1.3807, "step": 152},
    {"epoch": 0.07681365576102418, "grad_norm": 0.6040631532669067, "learning_rate": 9.802060209463382e-05, "loss": 1.3432, "step": 153},
    {"epoch": 0.07731570579867794, "grad_norm": 0.5833274722099304, "learning_rate": 9.799092651268778e-05, "loss": 1.2819, "step": 154},
    {"epoch": 0.07781775583633169, "grad_norm": 0.555841863155365, "learning_rate": 9.796103469111351e-05, "loss": 1.1248, "step": 155},
    {"epoch": 0.07831980587398545, "grad_norm": 0.4553599953651428, "learning_rate": 9.79309267645989e-05, "loss": 1.1664, "step": 156},
    {"epoch": 0.07882185591163919, "grad_norm": 0.42019009590148926, "learning_rate": 9.790060286880556e-05, "loss": 1.2007, "step": 157},
    {"epoch": 0.07932390594929295, "grad_norm": 0.48074933886528015, "learning_rate": 9.787006314036824e-05, "loss": 1.1545, "step": 158},
    {"epoch": 0.0798259559869467, "grad_norm": 0.4458232820034027, "learning_rate": 9.783930771689418e-05, "loss": 1.0934, "step": 159},
    {"epoch": 0.08032800602460045, "grad_norm": 0.4490962624549866, "learning_rate": 9.780833673696254e-05, "loss": 1.1753, "step": 160},
    {"epoch": 0.0808300560622542, "grad_norm": 0.4294869601726532, "learning_rate": 9.777715034012374e-05, "loss": 1.1133, "step": 161},
    {"epoch": 0.08133210609990796, "grad_norm": 0.4288542866706848, "learning_rate": 9.774574866689877e-05, "loss": 1.1664, "step": 162},
    {"epoch": 0.08183415613756172, "grad_norm": 0.4405251443386078, "learning_rate": 9.771413185877872e-05, "loss": 1.1115, "step": 163},
    {"epoch": 0.08233620617521546, "grad_norm": 0.41953468322753906, "learning_rate": 9.768230005822395e-05, "loss": 1.1264, "step": 164},
    {"epoch": 0.08283825621286922, "grad_norm": 0.39969509840011597, "learning_rate": 9.76502534086636e-05, "loss": 1.056, "step": 165},
    {"epoch": 0.08334030625052297, "grad_norm": 0.4138109087944031, "learning_rate": 9.76179920544949e-05, "loss": 1.1076, "step": 166},
    {"epoch": 0.08384235628817673, "grad_norm": 0.412165105342865, "learning_rate": 9.758551614108246e-05, "loss": 1.1159, "step": 167},
    {"epoch": 0.08434440632583047, "grad_norm": 0.38842642307281494, "learning_rate": 9.755282581475769e-05, "loss": 1.0444, "step": 168},
    {"epoch": 0.08484645636348423, "grad_norm": 0.3983784019947052, "learning_rate": 9.751992122281808e-05, "loss": 1.1385, "step": 169},
    {"epoch": 0.08534850640113797, "grad_norm": 0.42566153407096863, "learning_rate": 9.74868025135266e-05, "loss": 1.1183, "step": 170},
    {"epoch": 0.08585055643879173, "grad_norm": 0.39850881695747375, "learning_rate": 9.745346983611099e-05, "loss": 1.0954, "step": 171},
    {"epoch": 0.08635260647644549, "grad_norm": 0.39748743176460266, "learning_rate": 9.741992334076308e-05, "loss": 1.0581, "step": 172},
    {"epoch": 0.08685465651409924, "grad_norm": 0.42799192667007446, "learning_rate": 9.738616317863818e-05, "loss": 1.1318, "step": 173},
    {"epoch": 0.087356706551753, "grad_norm": 0.41576746106147766, "learning_rate": 9.735218950185428e-05, "loss": 1.1525, "step": 174},
    {"epoch": 0.08785875658940674, "grad_norm": 0.4112211763858795, "learning_rate": 9.731800246349148e-05, "loss": 1.0731, "step": 175},
    {"epoch": 0.0883608066270605, "grad_norm": 0.43050485849380493, "learning_rate": 9.728360221759123e-05, "loss": 1.0604, "step": 176},
    {"epoch": 0.08886285666471425, "grad_norm": 0.44277775287628174, "learning_rate": 9.72489889191557e-05, "loss": 1.0127, "step": 177},
    {"epoch": 0.089364906702368, "grad_norm": 0.442449152469635, "learning_rate": 9.721416272414699e-05, "loss": 1.039, "step": 178},
    {"epoch": 0.08986695674002175, "grad_norm": 0.4507065415382385, "learning_rate": 9.71791237894865e-05, "loss": 1.0508, "step": 179},
    {"epoch": 0.09036900677767551, "grad_norm": 0.4348186254501343, "learning_rate": 9.714387227305422e-05, "loss": 1.0597, "step": 180},
    {"epoch": 0.09087105681532927, "grad_norm": 0.42365097999572754, "learning_rate": 9.710840833368797e-05, "loss": 1.0212, "step": 181},
    {"epoch": 0.09137310685298301, "grad_norm": 0.4242313504219055, "learning_rate": 9.707273213118271e-05, "loss": 1.019, "step": 182},
    {"epoch": 0.09187515689063677, "grad_norm": 0.4419156014919281, "learning_rate": 9.703684382628989e-05, "loss": 1.0509, "step": 183},
    {"epoch": 0.09237720692829052, "grad_norm": 0.43379202485084534, "learning_rate": 9.700074358071659e-05, "loss": 1.0329, "step": 184},
    {"epoch": 0.09287925696594428, "grad_norm": 0.44969063997268677, "learning_rate": 9.696443155712486e-05, "loss": 0.9929, "step": 185},
    {"epoch": 0.09338130700359802, "grad_norm": 0.4435906410217285, "learning_rate": 9.692790791913106e-05, "loss": 1.0103, "step": 186},
    {"epoch": 0.09388335704125178, "grad_norm": 0.4611569941043854, "learning_rate": 9.689117283130498e-05, "loss": 1.0245, "step": 187},
    {"epoch": 0.09438540707890553, "grad_norm": 0.4579900801181793, "learning_rate": 9.685422645916918e-05, "loss": 1.0386, "step": 188},
    {"epoch": 0.09488745711655928, "grad_norm": 0.4896557033061981, "learning_rate": 9.681706896919829e-05, "loss": 0.991, "step": 189},
    {"epoch": 0.09538950715421304, "grad_norm": 0.4932405948638916, "learning_rate": 9.67797005288181e-05, "loss": 0.9557, "step": 190},
    {"epoch": 0.09589155719186679, "grad_norm": 0.5124619603157043, "learning_rate": 9.674212130640506e-05, "loss": 0.9505, "step": 191},
    {"epoch": 0.09639360722952055, "grad_norm": 0.5189158916473389, "learning_rate": 9.670433147128521e-05, "loss": 0.9757, "step": 192},
    {"epoch": 0.09689565726717429, "grad_norm": 0.4920775890350342, "learning_rate": 9.666633119373368e-05, "loss": 0.925, "step": 193},
    {"epoch": 0.09739770730482805, "grad_norm": 0.5255336761474609, "learning_rate": 9.66281206449738e-05, "loss": 0.9272, "step": 194},
    {"epoch": 0.0978997573424818, "grad_norm": 0.5087072849273682, "learning_rate": 9.65896999971763e-05, "loss": 0.9373, "step": 195},
    {"epoch": 0.09840180738013556, "grad_norm": 0.5356236100196838, "learning_rate": 9.65510694234587e-05, "loss": 0.9119, "step": 196},
    {"epoch": 0.0989038574177893, "grad_norm": 0.5867013335227966, "learning_rate": 9.651222909788427e-05, "loss": 0.8701, "step": 197},
    {"epoch": 0.09940590745544306, "grad_norm": 0.5810437202453613, "learning_rate": 9.64731791954615e-05, "loss": 0.8611, "step": 198},
    {"epoch": 0.0999079574930968, "grad_norm": 0.6373634338378906, "learning_rate": 9.643391989214312e-05, "loss": 0.9195, "step": 199},
    {"epoch": 0.10041000753075056, "grad_norm": 0.7272390723228455, "learning_rate": 9.639445136482548e-05, "loss": 0.8179, "step": 200},
    {"epoch": 0.10091205756840432, "grad_norm": 0.711874783039093, "learning_rate": 9.635477379134756e-05, "loss": 1.3114, "step": 201},
    {"epoch": 0.10141410760605807, "grad_norm": 0.6842883229255676, "learning_rate": 9.631488735049033e-05, "loss": 1.3263, "step": 202},
    {"epoch": 0.10191615764371183, "grad_norm": 0.4919327199459076, "learning_rate": 9.627479222197587e-05, "loss": 1.1895, "step": 203},
    {"epoch": 0.10241820768136557, "grad_norm": 0.4409739673137665, "learning_rate": 9.623448858646657e-05, "loss": 1.1812, "step": 204},
    {"epoch": 0.10292025771901933, "grad_norm": 0.492781400680542, "learning_rate": 9.619397662556435e-05, "loss": 1.1623, "step": 205},
    {"epoch": 0.10342230775667308, "grad_norm": 0.4331713616847992, "learning_rate": 9.615325652180975e-05, "loss": 1.1714, "step": 206},
    {"epoch": 0.10392435779432684, "grad_norm": 0.41304811835289, "learning_rate": 9.611232845868124e-05, "loss": 1.1732, "step": 207},
    {"epoch": 0.10442640783198058, "grad_norm": 0.4479162395000458, "learning_rate": 9.607119262059425e-05, "loss": 1.1447, "step": 208},
    {"epoch": 0.10492845786963434, "grad_norm": 0.4152972102165222, "learning_rate": 9.602984919290047e-05, "loss": 1.1563, "step": 209},
    {"epoch": 0.1054305079072881, "grad_norm": 0.421634703874588, "learning_rate": 9.598829836188694e-05, "loss": 1.1044, "step": 210},
    {"epoch": 0.10593255794494184, "grad_norm": 0.41844844818115234, "learning_rate": 9.594654031477521e-05, "loss": 1.0942, "step": 211},
    {"epoch": 0.1064346079825956, "grad_norm": 0.43519729375839233, "learning_rate": 9.590457523972056e-05, "loss": 1.0787, "step": 212},
    {"epoch": 0.10693665802024935, "grad_norm": 0.39169546961784363, "learning_rate": 9.5862403325811e-05, "loss": 1.0474, "step": 213},
    {"epoch": 0.1074387080579031, "grad_norm": 0.4049958884716034, "learning_rate": 9.582002476306668e-05, "loss": 1.1092, "step": 214},
    {"epoch": 0.10794075809555685, "grad_norm": 0.41217753291130066, "learning_rate": 9.577743974243874e-05, "loss": 1.0595, "step": 215},
    {"epoch": 0.10844280813321061, "grad_norm": 0.37548142671585083, "learning_rate": 9.573464845580864e-05, "loss": 1.0365, "step": 216},
    {"epoch": 0.10894485817086436, "grad_norm": 0.3726944029331207, "learning_rate": 9.569165109598725e-05, "loss": 1.0813, "step": 217},
    {"epoch": 0.10944690820851811, "grad_norm": 0.4017277657985687, "learning_rate": 9.564844785671398e-05, "loss": 1.066, "step": 218},
    {"epoch": 0.10994895824617187, "grad_norm": 0.3842703700065613, "learning_rate": 9.560503893265589e-05, "loss": 1.0937, "step": 219},
    {"epoch": 0.11045100828382562, "grad_norm": 0.37564817070961, "learning_rate": 9.55614245194068e-05, "loss": 1.0732, "step": 220},
    {"epoch": 0.11095305832147938, "grad_norm": 0.3989981412887573, "learning_rate": 9.551760481348644e-05, "loss": 1.0755, "step": 221},
    {"epoch": 0.11145510835913312, "grad_norm": 0.388481467962265, "learning_rate": 9.547358001233959e-05, "loss": 1.1052, "step": 222},
    {"epoch": 0.11195715839678688, "grad_norm": 0.41220539808273315, "learning_rate": 9.542935031433515e-05, "loss": 1.1182, "step": 223},
    {"epoch": 0.11245920843444063, "grad_norm": 0.4094482958316803, "learning_rate": 9.538491591876522e-05, "loss": 0.9925, "step": 224},
    {"epoch": 0.11296125847209439, "grad_norm": 0.4174862802028656, "learning_rate": 9.534027702584425e-05, "loss": 1.0755, "step": 225},
    {"epoch": 0.11346330850974813, "grad_norm": 0.4093203842639923, "learning_rate": 9.529543383670814e-05, "loss": 1.0757, "step": 226},
    {"epoch": 0.11396535854740189, "grad_norm": 0.41040605306625366, "learning_rate": 9.525038655341329e-05, "loss": 1.016, "step": 227},
    {"epoch": 0.11446740858505565, "grad_norm": 0.3920714855194092, "learning_rate": 9.520513537893574e-05, "loss": 0.9406, "step": 228},
    {"epoch": 0.1149694586227094, "grad_norm": 0.4755348265171051, "learning_rate": 9.515968051717022e-05, "loss": 1.076, "step": 229},
    {"epoch": 0.11547150866036315, "grad_norm": 0.43063998222351074, "learning_rate": 9.511402217292926e-05, "loss": 1.0341, "step": 230},
    {"epoch": 0.1159735586980169, "grad_norm": 0.4011836647987366, "learning_rate": 9.506816055194223e-05, "loss": 1.0272, "step": 231},
    {"epoch": 0.11647560873567066, "grad_norm": 0.4279603660106659, "learning_rate": 9.502209586085444e-05, "loss": 1.0628, "step": 232},
    {"epoch": 0.1169776587733244, "grad_norm": 0.4363585412502289, "learning_rate": 9.497582830722617e-05, "loss": 1.0168, "step": 233},
    {"epoch": 0.11747970881097816, "grad_norm": 0.44320476055145264, "learning_rate": 9.492935809953185e-05, "loss": 1.0361, "step": 234},
    {"epoch": 0.1179817588486319, "grad_norm": 0.4238687753677368, "learning_rate": 9.488268544715896e-05, "loss": 0.9586, "step": 235},
    {"epoch": 0.11848380888628567, "grad_norm": 0.41311508417129517, "learning_rate": 9.483581056040719e-05, "loss": 0.9994, "step": 236},
    {"epoch": 0.11898585892393942, "grad_norm": 0.4387330710887909, "learning_rate": 9.478873365048748e-05, "loss": 0.9888, "step": 237},
    {"epoch": 0.11948790896159317, "grad_norm": 0.4367882013320923, "learning_rate": 9.474145492952102e-05, "loss": 0.9316, "step": 238},
    {"epoch": 0.11998995899924693, "grad_norm": 0.47169986367225647, "learning_rate": 9.469397461053837e-05, "loss": 0.9869, "step": 239},
    {"epoch": 0.12049200903690067, "grad_norm": 0.4648449420928955, "learning_rate": 9.464629290747842e-05, "loss": 0.9891, "step": 240},
    {"epoch": 0.12099405907455443, "grad_norm": 0.47016164660453796, "learning_rate": 9.459841003518753e-05, "loss": 0.8839, "step": 241},
    {"epoch": 0.12149610911220818, "grad_norm": 0.46333047747612, "learning_rate": 9.45503262094184e-05, "loss": 0.871, "step": 242},
    {"epoch": 0.12199815914986194, "grad_norm": 0.5084211230278015, "learning_rate": 9.450204164682928e-05, "loss": 0.9316, "step": 243},
    {"epoch": 0.12250020918751568, "grad_norm": 0.48900070786476135, "learning_rate": 9.445355656498285e-05, "loss": 0.9197, "step": 244},
    {"epoch": 0.12300225922516944, "grad_norm": 0.5633825659751892, "learning_rate": 9.440487118234535e-05, "loss": 0.9677, "step": 245},
    {"epoch": 0.1235043092628232, "grad_norm": 0.5049698352813721, "learning_rate": 9.435598571828552e-05, "loss": 0.9555, "step": 246},
    {"epoch": 0.12400635930047695, "grad_norm": 0.558955729007721, "learning_rate": 9.430690039307363e-05, "loss": 0.9873, "step": 247},
    {"epoch": 0.1245084093381307, "grad_norm": 0.6211025714874268, "learning_rate": 9.425761542788048e-05, "loss": 0.9365, "step": 248},
    {"epoch": 0.12501045937578445, "grad_norm": 0.577601969242096, "learning_rate": 9.420813104477646e-05, "loss": 0.8319, "step": 249},
    {"epoch": 0.1255125094134382, "grad_norm": 0.7420045137405396, "learning_rate": 9.415844746673047e-05, "loss": 0.8585, "step": 250},
    {"epoch": 0.12601455945109197, "grad_norm": 0.6604503393173218, "learning_rate": 9.410856491760895e-05, "loss": 1.3716, "step": 251},
    {"epoch": 0.1265166094887457, "grad_norm": 0.5581377148628235, "learning_rate": 9.405848362217491e-05, "loss": 1.2896, "step": 252},
    {"epoch": 0.12701865952639946, "grad_norm": 0.48465076088905334, "learning_rate": 9.400820380608683e-05, "loss": 1.2519, "step": 253},
    {"epoch": 0.12752070956405323, "grad_norm": 0.44643500447273254, "learning_rate": 9.395772569589774e-05, "loss": 1.1591, "step": 254},
    {"epoch": 0.12802275960170698, "grad_norm": 0.46593335270881653, "learning_rate": 9.390704951905411e-05, "loss": 1.1837, "step": 255},
    {"epoch": 0.12852480963936072, "grad_norm": 0.45252177119255066, "learning_rate": 9.38561755038949e-05, "loss": 1.1487, "step": 256},
    {"epoch": 0.12902685967701447, "grad_norm": 0.39205488562583923, "learning_rate": 9.380510387965047e-05, "loss": 1.1948, "step": 257},
    {"epoch": 0.12952890971466824, "grad_norm": 0.3986876308917999, "learning_rate": 9.37538348764416e-05, "loss": 1.1644, "step": 258},
    {"epoch": 0.13003095975232198, "grad_norm": 0.40925174951553345, "learning_rate": 9.370236872527845e-05, "loss": 1.1403, "step": 259},
    {"epoch": 0.13053300978997573, "grad_norm": 0.4211632311344147, "learning_rate": 9.365070565805941e-05, "loss": 1.129, "step": 260},
    {"epoch": 0.13103505982762947, "grad_norm": 0.39426669478416443, "learning_rate": 9.359884590757025e-05, "loss": 1.1036, "step": 261},
    {"epoch": 0.13153710986528325, "grad_norm": 0.3872944116592407, "learning_rate": 9.35467897074829e-05, "loss": 1.0752, "step": 262},
    {"epoch": 0.132039159902937, "grad_norm": 0.39355534315109253, "learning_rate": 9.349453729235447e-05, "loss": 0.9972, "step": 263},
    {"epoch": 0.13254120994059074, "grad_norm": 0.36666789650917053, "learning_rate": 9.34420888976262e-05, "loss": 1.0353, "step": 264},
    {"epoch": 0.1330432599782445, "grad_norm": 0.39839133620262146, "learning_rate": 9.338944475962237e-05, "loss": 1.0541, "step": 265},
    {"epoch": 0.13354531001589826, "grad_norm": 0.3860282599925995, "learning_rate": 9.333660511554925e-05, "loss": 1.0672, "step": 266},
    {"epoch": 0.134047360053552, "grad_norm": 0.38217252492904663, "learning_rate": 9.328357020349405e-05, "loss": 1.0534, "step": 267},
    {"epoch": 0.13454941009120575, "grad_norm": 0.39358577132225037, "learning_rate": 9.323034026242377e-05, "loss": 1.1266, "step": 268},
    {"epoch": 0.13505146012885952, "grad_norm": 0.39111077785491943, "learning_rate": 9.317691553218428e-05, "loss": 1.1044, "step": 269},
    {"epoch": 0.13555351016651326, "grad_norm": 0.3801279067993164, "learning_rate": 9.312329625349902e-05, "loss": 1.0242, "step": 270},
    {"epoch": 0.136055560204167, "grad_norm": 0.42816299200057983, "learning_rate": 9.306948266796816e-05, "loss": 1.0546, "step": 271},
    {"epoch": 0.13655761024182078, "grad_norm": 0.3893824517726898, "learning_rate": 9.301547501806726e-05, "loss": 1.0505, "step": 272},
    {"epoch": 0.13705966027947453, "grad_norm": 0.393541157245636, "learning_rate": 9.29612735471464e-05, "loss": 1.0875, "step": 273},
    {"epoch": 0.13756171031712827, "grad_norm": 0.38969358801841736, "learning_rate": 9.290687849942893e-05, "loss": 1.048, "step": 274},
    {"epoch": 0.13806376035478202, "grad_norm": 0.4060596525669098, "learning_rate": 9.285229012001047e-05, "loss": 1.0514, "step": 275},
    {"epoch": 0.1385658103924358, "grad_norm": 0.4034577012062073, "learning_rate": 9.279750865485772e-05, "loss": 0.9808, "step": 276},
    {"epoch": 0.13906786043008953, "grad_norm": 0.4346413314342499, "learning_rate": 9.274253435080746e-05, "loss": 1.0776, "step": 277},
    {"epoch": 0.13956991046774328, "grad_norm": 0.3979721963405609, "learning_rate": 9.268736745556527e-05, "loss": 0.984, "step": 278},
    {"epoch": 0.14007196050539703, "grad_norm": 0.4149005115032196, "learning_rate": 9.263200821770461e-05, "loss": 1.0041, "step": 279},
    {"epoch": 0.1405740105430508, "grad_norm": 0.4252713620662689, "learning_rate": 9.257645688666556e-05, "loss": 0.9957, "step": 280},
    {"epoch": 0.14107606058070454, "grad_norm": 0.41585591435432434, "learning_rate": 9.252071371275378e-05, "loss": 1.0147, "step": 281},
    {"epoch": 0.1415781106183583, "grad_norm": 0.4276868402957916, "learning_rate": 9.246477894713925e-05, "loss": 1.0093, "step": 282},
    {"epoch": 0.14208016065601206, "grad_norm": 0.4250052571296692, "learning_rate": 9.240865284185536e-05, "loss": 1.0084, "step": 283},
    {"epoch": 0.1425822106936658, "grad_norm": 0.4250898063182831, "learning_rate": 9.235233564979755e-05, "loss": 0.9515, "step": 284},
    {"epoch": 0.14308426073131955, "grad_norm": 0.44376522302627563, "learning_rate": 9.22958276247223e-05, "loss": 0.9607, "step": 285},
    {"epoch": 0.1435863107689733, "grad_norm": 0.42781707644462585, "learning_rate": 9.223912902124601e-05, "loss": 0.9635, "step": 286},
    {"epoch": 0.14408836080662707, "grad_norm": 0.4453868865966797, "learning_rate": 9.218224009484366e-05, "loss": 0.9683, "step": 287},
    {"epoch": 0.14459041084428081, "grad_norm": 0.43436458706855774, "learning_rate": 9.212516110184794e-05, "loss": 0.9129, "step": 288},
    {"epoch": 0.14509246088193456, "grad_norm": 0.47016844153404236, "learning_rate": 9.206789229944786e-05, "loss": 0.9555, "step": 289},
    {"epoch": 0.14559451091958833, "grad_norm": 0.4709494113922119, "learning_rate": 9.201043394568773e-05, "loss": 0.9643, "step": 290},
    {"epoch": 0.14609656095724208, "grad_norm": 0.4938959777355194, "learning_rate": 9.195278629946589e-05, "loss": 0.9555, "step": 291},
    {"epoch": 0.14659861099489582, "grad_norm": 0.49377843737602234, "learning_rate": 9.189494962053368e-05, "loss": 0.9807, "step": 292},
    {"epoch": 0.14710066103254957, "grad_norm": 0.4660574793815613, "learning_rate": 9.183692416949414e-05, "loss": 0.8629, "step": 293},
    {"epoch": 0.14760271107020334, "grad_norm": 0.46316561102867126, "learning_rate": 9.17787102078009e-05, "loss": 0.9329, "step": 294},
    {"epoch": 0.14810476110785709, "grad_norm": 0.48996853828430176, "learning_rate": 9.172030799775699e-05, "loss": 0.9179, "step": 295},
    {"epoch": 0.14860681114551083, "grad_norm": 0.4847288131713867, "learning_rate": 9.166171780251365e-05, "loss": 0.9084, "step": 296},
    {"epoch": 0.14910886118316458, "grad_norm": 0.4630123972892761, "learning_rate": 9.160293988606916e-05, "loss": 0.8722, "step": 297},
    {"epoch": 0.14961091122081835, "grad_norm": 0.5303933620452881, "learning_rate": 9.154397451326766e-05, "loss": 0.8966, "step": 298},
    {"epoch": 0.1501129612584721, "grad_norm": 0.530169665813446, "learning_rate": 9.148482194979789e-05, "loss": 0.8084, "step": 299},
    {"epoch": 0.15061501129612584, "grad_norm": 0.6588028073310852, "learning_rate": 9.142548246219212e-05, "loss": 0.7829, "step": 300},
    {"epoch": 0.15061501129612584, "eval_loss": 1.0374255180358887, "eval_runtime": 709.916, "eval_samples_per_second": 21.266, "eval_steps_per_second": 2.659, "step": 300},
    {"epoch": 0.1511170613337796, "grad_norm": 0.4993877410888672, "learning_rate": 9.136595631782478e-05, "loss": 1.2287, "step": 301},
    {"epoch": 0.15161911137143336, "grad_norm": 0.5264070630073547, "learning_rate": 9.13062437849114e-05, "loss": 1.2563, "step": 302},
    {"epoch": 0.1521211614090871, "grad_norm": 0.4640633761882782, "learning_rate": 9.124634513250736e-05, "loss": 1.2391, "step": 303},
    {"epoch": 0.15262321144674085, "grad_norm": 0.4271823763847351, "learning_rate": 9.118626063050661e-05, "loss": 1.1237, "step": 304},
    {"epoch": 0.15312526148439462, "grad_norm": 0.4822617471218109, "learning_rate": 9.112599054964057e-05, "loss": 1.1054, "step": 305},
    {"epoch": 0.15362731152204837, "grad_norm": 0.4398539066314697, "learning_rate": 9.106553516147682e-05, "loss": 1.1482, "step": 306},
    {"epoch": 0.1541293615597021, "grad_norm": 0.39565637707710266, "learning_rate": 9.100489473841792e-05, "loss": 1.0734, "step": 307},
    {"epoch": 0.15463141159735588, "grad_norm": 0.40112727880477905, "learning_rate": 9.09440695537001e-05, "loss": 1.2046, "step": 308},
    {"epoch": 0.15513346163500963, "grad_norm": 0.40704309940338135, "learning_rate": 9.088305988139221e-05, "loss": 1.0738, "step": 309},
    {"epoch": 0.15563551167266337, "grad_norm": 0.39162713289260864, "learning_rate": 9.082186599639428e-05, "loss": 1.0979, "step": 310},
    {"epoch": 0.15613756171031712, "grad_norm": 0.38577142357826233, "learning_rate": 9.076048817443645e-05, "loss": 1.0685, "step": 311},
    {"epoch": 0.1566396117479709, "grad_norm": 0.3882400691509247, "learning_rate": 9.069892669207758e-05, "loss": 1.0758, "step": 312},
    {"epoch": 0.15714166178562464, "grad_norm": 0.37418147921562195, "learning_rate": 9.06371818267041e-05, "loss": 0.9834, "step": 313},
    {"epoch": 0.15764371182327838, "grad_norm": 0.3902306854724884, "learning_rate": 9.057525385652878e-05, "loss": 1.0335, "step": 314},
    {"epoch": 0.15814576186093213, "grad_norm": 0.3840450048446655, "learning_rate": 9.051314306058933e-05, "loss": 1.068, "step": 315},
    {"epoch": 0.1586478118985859, "grad_norm": 0.3719392716884613, "learning_rate": 9.045084971874738e-05, "loss": 1.031, "step": 316},
    {"epoch": 0.15914986193623964, "grad_norm": 0.3819999694824219, "learning_rate": 9.038837411168696e-05, "loss": 1.052, "step": 317},
    {"epoch": 0.1596519119738934, "grad_norm": 0.37122640013694763, "learning_rate": 9.032571652091342e-05, "loss": 1.0321, "step": 318},
    {"epoch": 0.16015396201154716, "grad_norm": 0.3737955093383789, "learning_rate": 9.026287722875209e-05, "loss": 1.0579, "step": 319},
    {"epoch": 0.1606560120492009, "grad_norm": 0.388288676738739, "learning_rate": 9.019985651834703e-05, "loss": 1.0124, "step": 320},
    {"epoch": 0.16115806208685465, "grad_norm": 0.4136764705181122, "learning_rate": 9.013665467365973e-05, "loss": 1.0084, "step": 321},
    {"epoch": 0.1616601121245084, "grad_norm": 0.39497965574264526, "learning_rate": 9.007327197946781e-05, "loss": 1.0847, "step": 322},
    {"epoch": 0.16216216216216217, "grad_norm": 0.4033185839653015, "learning_rate": 9.000970872136383e-05, "loss": 1.0314, "step": 323},
    {"epoch": 0.16266421219981592, "grad_norm": 0.40545448660850525, "learning_rate": 8.994596518575392e-05, "loss": 1.0589, "step": 324},
    {"epoch": 0.16316626223746966, "grad_norm": 0.3762631118297577, "learning_rate": 8.988204165985649e-05, "loss": 0.9565, "step": 325},
    {"epoch": 0.16366831227512343, "grad_norm": 0.40594545006752014, "learning_rate": 8.981793843170098e-05, "loss": 0.9948, "step": 326},
    {"epoch": 0.16417036231277718, "grad_norm": 0.40294238924980164, "learning_rate": 8.975365579012655e-05, "loss": 1.0012, "step": 327},
    {"epoch": 0.16467241235043092, "grad_norm": 0.4173141121864319, "learning_rate": 8.968919402478075e-05, "loss": 1.0945, "step": 328},
    {"epoch": 0.16517446238808467, "grad_norm": 0.4323413074016571, "learning_rate": 8.962455342611821e-05, "loss": 1.0233, "step": 329},
    {"epoch": 0.16567651242573844, "grad_norm": 0.4198532700538635, "learning_rate": 8.955973428539944e-05, "loss": 0.9737, "step": 330},
    {"epoch": 0.1661785624633922, "grad_norm": 0.420789510011673, "learning_rate": 8.94947368946893e-05, "loss": 0.9872, "step": 331},
    {"epoch": 0.16668061250104593, "grad_norm": 0.408327579498291, "learning_rate": 8.942956154685596e-05, "loss": 1.008, "step": 332},
    {"epoch": 0.16718266253869968, "grad_norm": 0.4309915006160736, "learning_rate": 8.936420853556935e-05, "loss": 1.0114, "step": 333},
    {"epoch": 0.16768471257635345, "grad_norm": 0.4261639416217804, "learning_rate": 8.929867815529993e-05, "loss": 0.9308, "step": 334},
    {"epoch": 0.1681867626140072, "grad_norm": 0.42096462845802307, "learning_rate": 8.923297070131737e-05, "loss": 0.9615, "step": 335},
    {"epoch": 0.16868881265166094, "grad_norm": 0.4670826494693756, "learning_rate": 8.916708646968923e-05, "loss": 0.969, "step": 336},
    {"epoch": 0.1691908626893147, "grad_norm": 0.4317393898963928, "learning_rate": 8.910102575727957e-05, "loss": 1.0044, "step": 337},
    {"epoch": 0.16969291272696846, "grad_norm": 0.4464769959449768, "learning_rate": 8.903478886174763e-05, "loss": 1.0213, "step": 338},
    {"epoch": 0.1701949627646222, "grad_norm": 0.44120046496391296, "learning_rate": 8.896837608154655e-05, "loss": 0.9162, "step": 339},
    {"epoch": 0.17069701280227595, "grad_norm": 0.4458862245082855, "learning_rate": 8.890178771592199e-05, "loss": 0.9079, "step": 340},
    {"epoch": 0.17119906283992972, "grad_norm": 0.435587078332901, "learning_rate": 8.883502406491067e-05, "loss": 0.9403, "step": 341},
    {"epoch": 0.17170111287758347, "grad_norm": 0.47128617763519287, "learning_rate": 8.876808542933924e-05, "loss": 0.9312, "step": 342},
    {"epoch": 0.1722031629152372, "grad_norm": 0.4681444466114044, "learning_rate": 8.870097211082271e-05, "loss": 0.9711, "step": 343},
    {"epoch": 0.17270521295289099, "grad_norm": 0.4990653693675995, "learning_rate": 8.863368441176326e-05, "loss": 0.9206, "step": 344},
    {"epoch": 0.17320726299054473, "grad_norm": 0.6548157930374146, "learning_rate": 8.856622263534875e-05, "loss": 0.9235, "step": 345},
    {"epoch": 0.17370931302819848, "grad_norm": 0.5148348212242126, "learning_rate": 8.849858708555142e-05, "loss": 0.9176, "step": 346},
    {"epoch": 0.17421136306585222, "grad_norm": 0.5113969445228577, "learning_rate": 8.843077806712648e-05, "loss": 0.8961, "step": 347},
    {"epoch": 0.174713413103506, "grad_norm": 0.5211741328239441, "learning_rate": 8.836279588561083e-05, "loss": 0.8647, "step": 348},
    {"epoch": 0.17521546314115974, "grad_norm": 0.5579087138175964, "learning_rate": 8.829464084732156e-05, "loss": 0.8901, "step": 349},
    {"epoch": 0.17571751317881348, "grad_norm": 0.6655847430229187, "learning_rate": 8.822631325935463e-05, "loss": 0.8291, "step": 350},
    {"epoch": 0.17621956321646723, "grad_norm": 1.6874302625656128, "learning_rate": 8.815781342958351e-05, "loss": 1.385, "step": 351},
    {"epoch": 0.176721613254121, "grad_norm": 0.522758960723877, "learning_rate": 8.808914166665772e-05, "loss": 1.2028, "step": 352},
    {"epoch": 0.17722366329177475, "grad_norm": 0.4740375876426697, "learning_rate": 8.802029828000156e-05, "loss": 1.1728, "step": 353},
    {"epoch": 0.1777257133294285, "grad_norm": 0.4499627351760864, "learning_rate": 8.795128357981253e-05, "loss": 1.1861, "step": 354},
    {"epoch": 0.17822776336708226, "grad_norm": 0.4773704707622528, "learning_rate": 8.788209787706015e-05, "loss": 1.1703, "step": 355},
    {"epoch": 0.178729813404736, "grad_norm": 0.4251996576786041, "learning_rate": 8.781274148348437e-05, "loss": 1.1624, "step": 356},
    {"epoch": 0.17923186344238975, "grad_norm": 0.4025990962982178, "learning_rate": 8.77432147115943e-05, "loss": 1.0905, "step": 357},
    {"epoch": 0.1797339134800435, "grad_norm": 0.3834582269191742, "learning_rate": 8.767351787466673e-05, "loss": 1.1365, "step": 358},
    {"epoch": 0.18023596351769727, "grad_norm": 0.4131288230419159, "learning_rate": 8.760365128674473e-05, "loss": 1.1159, "step": 359},
    {"epoch": 0.18073801355535102, "grad_norm": 0.409453421831131, "learning_rate": 8.753361526263621e-05, "loss": 1.1026, "step": 360},
    {"epoch": 0.18124006359300476, "grad_norm": 0.38488978147506714, "learning_rate": 8.746341011791264e-05, "loss": 1.036, "step": 361},
    {"epoch": 0.18174211363065854, "grad_norm": 0.3848975598812103, "learning_rate": 8.73930361689074e-05, "loss": 1.067, "step": 362},
    {"epoch": 0.18224416366831228, "grad_norm": 0.3877599835395813, "learning_rate": 8.732249373271455e-05, "loss": 1.0209, "step": 363},
    {"epoch": 0.18274621370596603, "grad_norm": 0.38656899333000183, "learning_rate": 8.725178312718725e-05, "loss": 1.087, "step": 364},
    {"epoch": 0.18324826374361977, "grad_norm": 0.394137978553772, "learning_rate": 8.718090467093654e-05, "loss": 1.0651, "step": 365},
    {"epoch": 0.18375031378127354, "grad_norm": 0.39841341972351074, "learning_rate": 8.710985868332962e-05,
|
"loss": 1.0186, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.1842523638189273, |
|
"grad_norm": 0.39646363258361816, |
|
"learning_rate": 8.703864548448868e-05, |
|
"loss": 1.029, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.18475441385658103, |
|
"grad_norm": 0.3801933825016022, |
|
"learning_rate": 8.696726539528924e-05, |
|
"loss": 1.054, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.18525646389423478, |
|
"grad_norm": 0.3627118468284607, |
|
"learning_rate": 8.689571873735884e-05, |
|
"loss": 1.1052, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.18575851393188855, |
|
"grad_norm": 0.39008674025535583, |
|
"learning_rate": 8.682400583307562e-05, |
|
"loss": 1.0064, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1862605639695423, |
|
"grad_norm": 0.37145888805389404, |
|
"learning_rate": 8.675212700556668e-05, |
|
"loss": 0.9877, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.18676261400719604, |
|
"grad_norm": 0.3664349913597107, |
|
"learning_rate": 8.668008257870683e-05, |
|
"loss": 1.0103, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.18726466404484982, |
|
"grad_norm": 0.37273725867271423, |
|
"learning_rate": 8.660787287711703e-05, |
|
"loss": 1.0636, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.18776671408250356, |
|
"grad_norm": 0.38857051730155945, |
|
"learning_rate": 8.653549822616289e-05, |
|
"loss": 1.1021, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.1882687641201573, |
|
"grad_norm": 0.3906739056110382, |
|
"learning_rate": 8.646295895195333e-05, |
|
"loss": 1.0698, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.18877081415781105, |
|
"grad_norm": 0.3983420133590698, |
|
"learning_rate": 8.639025538133898e-05, |
|
"loss": 1.0459, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.18927286419546482, |
|
"grad_norm": 0.38264894485473633, |
|
"learning_rate": 8.631738784191083e-05, |
|
"loss": 1.041, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.18977491423311857, |
|
"grad_norm": 0.40160712599754333, |
|
"learning_rate": 8.62443566619986e-05, |
|
"loss": 0.9657, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.19027696427077231, |
|
"grad_norm": 0.4029211103916168, |
|
"learning_rate": 8.617116217066942e-05, |
|
"loss": 1.0126, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.1907790143084261, |
|
"grad_norm": 0.41848793625831604, |
|
"learning_rate": 8.609780469772623e-05, |
|
"loss": 1.0143, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.19128106434607983, |
|
"grad_norm": 0.3983096480369568, |
|
"learning_rate": 8.602428457370637e-05, |
|
"loss": 1.0024, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.19178311438373358, |
|
"grad_norm": 0.3817248046398163, |
|
"learning_rate": 8.595060212988006e-05, |
|
"loss": 0.9107, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.19228516442138732, |
|
"grad_norm": 0.4119492769241333, |
|
"learning_rate": 8.587675769824887e-05, |
|
"loss": 0.9464, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.1927872144590411, |
|
"grad_norm": 0.40975409746170044, |
|
"learning_rate": 8.580275161154431e-05, |
|
"loss": 0.8996, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.19328926449669484, |
|
"grad_norm": 0.4254794418811798, |
|
"learning_rate": 8.572858420322627e-05, |
|
"loss": 0.9331, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.19379131453434859, |
|
"grad_norm": 0.4199373722076416, |
|
"learning_rate": 8.56542558074815e-05, |
|
"loss": 1.0189, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.19429336457200233, |
|
"grad_norm": 0.4211234450340271, |
|
"learning_rate": 8.557976675922217e-05, |
|
"loss": 0.9798, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.1947954146096561, |
|
"grad_norm": 0.4226566553115845, |
|
"learning_rate": 8.550511739408428e-05, |
|
"loss": 0.9475, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.19529746464730985, |
|
"grad_norm": 0.46705394983291626, |
|
"learning_rate": 8.543030804842629e-05, |
|
"loss": 0.9535, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.1957995146849636, |
|
"grad_norm": 0.4537680745124817, |
|
"learning_rate": 8.535533905932738e-05, |
|
"loss": 0.9774, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.19630156472261737, |
|
"grad_norm": 0.43357518315315247, |
|
"learning_rate": 8.528021076458615e-05, |
|
"loss": 0.9001, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.1968036147602711, |
|
"grad_norm": 0.45762643218040466, |
|
"learning_rate": 8.520492350271896e-05, |
|
"loss": 0.9012, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.19730566479792486, |
|
"grad_norm": 0.4584790766239166, |
|
"learning_rate": 8.512947761295846e-05, |
|
"loss": 0.8805, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.1978077148355786, |
|
"grad_norm": 0.484757661819458, |
|
"learning_rate": 8.505387343525209e-05, |
|
"loss": 0.868, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.19830976487323237, |
|
"grad_norm": 0.5136643052101135, |
|
"learning_rate": 8.497811131026046e-05, |
|
"loss": 0.9755, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.19881181491088612, |
|
"grad_norm": 0.5092843770980835, |
|
"learning_rate": 8.490219157935589e-05, |
|
"loss": 0.9072, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.19931386494853987, |
|
"grad_norm": 0.5307949185371399, |
|
"learning_rate": 8.482611458462083e-05, |
|
"loss": 0.9028, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.1998159149861936, |
|
"grad_norm": 0.5171916484832764, |
|
"learning_rate": 8.47498806688464e-05, |
|
"loss": 0.8684, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.20031796502384738, |
|
"grad_norm": 0.5054696202278137, |
|
"learning_rate": 8.467349017553067e-05, |
|
"loss": 0.7905, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.20082001506150113, |
|
"grad_norm": 0.6332175731658936, |
|
"learning_rate": 8.459694344887732e-05, |
|
"loss": 0.8408, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.20132206509915487, |
|
"grad_norm": 0.562515377998352, |
|
"learning_rate": 8.452024083379394e-05, |
|
"loss": 1.3941, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.20182411513680865, |
|
"grad_norm": 0.43945592641830444, |
|
"learning_rate": 8.444338267589057e-05, |
|
"loss": 1.2801, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.2023261651744624, |
|
"grad_norm": 0.42131316661834717, |
|
"learning_rate": 8.436636932147806e-05, |
|
"loss": 1.2589, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.20282821521211614, |
|
"grad_norm": 0.3926401436328888, |
|
"learning_rate": 8.428920111756658e-05, |
|
"loss": 1.125, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.20333026524976988, |
|
"grad_norm": 0.4347395896911621, |
|
"learning_rate": 8.421187841186402e-05, |
|
"loss": 1.1564, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.20383231528742365, |
|
"grad_norm": 0.3934774100780487, |
|
"learning_rate": 8.413440155277443e-05, |
|
"loss": 1.0942, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.2043343653250774, |
|
"grad_norm": 0.40075141191482544, |
|
"learning_rate": 8.405677088939644e-05, |
|
"loss": 1.1296, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.20483641536273114, |
|
"grad_norm": 0.36235958337783813, |
|
"learning_rate": 8.397898677152173e-05, |
|
"loss": 1.1378, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.20533846540038492, |
|
"grad_norm": 0.4117681384086609, |
|
"learning_rate": 8.390104954963338e-05, |
|
"loss": 1.134, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.20584051543803866, |
|
"grad_norm": 0.3808246850967407, |
|
"learning_rate": 8.382295957490436e-05, |
|
"loss": 1.0572, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.2063425654756924, |
|
"grad_norm": 0.39057350158691406, |
|
"learning_rate": 8.37447171991959e-05, |
|
"loss": 1.1136, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.20684461551334615, |
|
"grad_norm": 0.39303159713745117, |
|
"learning_rate": 8.366632277505597e-05, |
|
"loss": 1.0216, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.20734666555099993, |
|
"grad_norm": 0.37181228399276733, |
|
"learning_rate": 8.35877766557176e-05, |
|
"loss": 1.0096, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.20784871558865367, |
|
"grad_norm": 0.378421813249588, |
|
"learning_rate": 8.350907919509734e-05, |
|
"loss": 1.0492, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.20835076562630742, |
|
"grad_norm": 0.38374465703964233, |
|
"learning_rate": 8.343023074779368e-05, |
|
"loss": 1.0271, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.20885281566396116, |
|
"grad_norm": 0.37486276030540466, |
|
"learning_rate": 8.335123166908544e-05, |
|
"loss": 1.027, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.20935486570161493, |
|
"grad_norm": 0.37390416860580444, |
|
"learning_rate": 8.327208231493011e-05, |
|
"loss": 0.9933, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.20985691573926868, |
|
"grad_norm": 0.39402034878730774, |
|
"learning_rate": 8.319278304196237e-05, |
|
"loss": 1.0998, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.21035896577692242, |
|
"grad_norm": 0.3804149925708771, |
|
"learning_rate": 8.311333420749232e-05, |
|
"loss": 1.0575, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.2108610158145762, |
|
"grad_norm": 0.37954866886138916, |
|
"learning_rate": 8.303373616950408e-05, |
|
"loss": 1.0209, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.21136306585222994, |
|
"grad_norm": 0.36630332469940186, |
|
"learning_rate": 8.295398928665394e-05, |
|
"loss": 0.953, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.2118651158898837, |
|
"grad_norm": 0.37623950839042664, |
|
"learning_rate": 8.287409391826895e-05, |
|
"loss": 0.9686, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.21236716592753743, |
|
"grad_norm": 0.384235680103302, |
|
"learning_rate": 8.279405042434515e-05, |
|
"loss": 1.0683, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.2128692159651912, |
|
"grad_norm": 0.3830919563770294, |
|
"learning_rate": 8.271385916554605e-05, |
|
"loss": 0.9916, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.21337126600284495, |
|
"grad_norm": 0.39329853653907776, |
|
"learning_rate": 8.263352050320094e-05, |
|
"loss": 1.0264, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.2138733160404987, |
|
"grad_norm": 0.39238932728767395, |
|
"learning_rate": 8.255303479930333e-05, |
|
"loss": 0.9725, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.21437536607815247, |
|
"grad_norm": 0.41246023774147034, |
|
"learning_rate": 8.247240241650918e-05, |
|
"loss": 0.9592, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.2148774161158062, |
|
"grad_norm": 0.4108837842941284, |
|
"learning_rate": 8.239162371813551e-05, |
|
"loss": 1.0114, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.21537946615345996, |
|
"grad_norm": 0.3942084312438965, |
|
"learning_rate": 8.231069906815847e-05, |
|
"loss": 0.9637, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.2158815161911137, |
|
"grad_norm": 0.4277946949005127, |
|
"learning_rate": 8.222962883121196e-05, |
|
"loss": 1.012, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.21638356622876748, |
|
"grad_norm": 0.43043553829193115, |
|
"learning_rate": 8.214841337258578e-05, |
|
"loss": 0.9617, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.21688561626642122, |
|
"grad_norm": 0.40695276856422424, |
|
"learning_rate": 8.206705305822413e-05, |
|
"loss": 0.9876, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.21738766630407497, |
|
"grad_norm": 0.41154372692108154, |
|
"learning_rate": 8.19855482547239e-05, |
|
"loss": 0.9719, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.2178897163417287, |
|
"grad_norm": 0.4162918031215668, |
|
"learning_rate": 8.190389932933301e-05, |
|
"loss": 0.9352, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.21839176637938248, |
|
"grad_norm": 0.4280974268913269, |
|
"learning_rate": 8.182210664994878e-05, |
|
"loss": 0.9462, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.21889381641703623, |
|
"grad_norm": 0.4325559437274933, |
|
"learning_rate": 8.174017058511629e-05, |
|
"loss": 0.9444, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.21939586645468998, |
|
"grad_norm": 0.43471524119377136, |
|
"learning_rate": 8.165809150402663e-05, |
|
"loss": 0.9441, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.21989791649234375, |
|
"grad_norm": 0.4418407380580902, |
|
"learning_rate": 8.157586977651534e-05, |
|
"loss": 0.9465, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.2203999665299975, |
|
"grad_norm": 0.45979785919189453, |
|
"learning_rate": 8.149350577306074e-05, |
|
"loss": 0.9426, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.22090201656765124, |
|
"grad_norm": 0.45479616522789, |
|
"learning_rate": 8.141099986478212e-05, |
|
"loss": 0.8374, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.22140406660530498, |
|
"grad_norm": 0.437326043844223, |
|
"learning_rate": 8.132835242343827e-05, |
|
"loss": 0.8725, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.22190611664295876, |
|
"grad_norm": 0.4658799469470978, |
|
"learning_rate": 8.124556382142565e-05, |
|
"loss": 0.8982, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.2224081666806125, |
|
"grad_norm": 0.5004392862319946, |
|
"learning_rate": 8.11626344317768e-05, |
|
"loss": 0.9902, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.22291021671826625, |
|
"grad_norm": 0.46578583121299744, |
|
"learning_rate": 8.107956462815861e-05, |
|
"loss": 0.8265, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.22341226675592002, |
|
"grad_norm": 0.48835834860801697, |
|
"learning_rate": 8.099635478487064e-05, |
|
"loss": 0.8986, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.22391431679357376, |
|
"grad_norm": 0.5076184868812561, |
|
"learning_rate": 8.091300527684349e-05, |
|
"loss": 0.8746, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.2244163668312275, |
|
"grad_norm": 0.502265989780426, |
|
"learning_rate": 8.082951647963701e-05, |
|
"loss": 0.9168, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.22491841686888125, |
|
"grad_norm": 0.558822512626648, |
|
"learning_rate": 8.074588876943873e-05, |
|
"loss": 0.8786, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.22542046690653503, |
|
"grad_norm": 0.5506950616836548, |
|
"learning_rate": 8.066212252306203e-05, |
|
"loss": 0.8613, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.22592251694418877, |
|
"grad_norm": 0.7210969924926758, |
|
"learning_rate": 8.057821811794458e-05, |
|
"loss": 0.746, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.22592251694418877, |
|
"eval_loss": 1.012302041053772, |
|
"eval_runtime": 708.8163, |
|
"eval_samples_per_second": 21.299, |
|
"eval_steps_per_second": 2.664, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.22642456698184252, |
|
"grad_norm": 0.49422305822372437, |
|
"learning_rate": 8.049417593214652e-05, |
|
"loss": 1.3625, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.22692661701949626, |
|
"grad_norm": 0.45369595289230347, |
|
"learning_rate": 8.040999634434883e-05, |
|
"loss": 1.2001, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.22742866705715004, |
|
"grad_norm": 0.4486617147922516, |
|
"learning_rate": 8.032567973385162e-05, |
|
"loss": 1.2561, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.22793071709480378, |
|
"grad_norm": 0.422780841588974, |
|
"learning_rate": 8.024122648057234e-05, |
|
"loss": 1.1671, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.22843276713245753, |
|
"grad_norm": 0.4150182008743286, |
|
"learning_rate": 8.015663696504422e-05, |
|
"loss": 1.0727, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.2289348171701113, |
|
"grad_norm": 0.4196764826774597, |
|
"learning_rate": 8.007191156841441e-05, |
|
"loss": 1.1269, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.22943686720776504, |
|
"grad_norm": 0.3779695928096771, |
|
"learning_rate": 7.998705067244232e-05, |
|
"loss": 1.1152, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.2299389172454188, |
|
"grad_norm": 0.3510948419570923, |
|
"learning_rate": 7.990205465949791e-05, |
|
"loss": 1.0677, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.23044096728307253, |
|
"grad_norm": 0.3578283488750458, |
|
"learning_rate": 7.981692391255997e-05, |
|
"loss": 1.115, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.2309430173207263, |
|
"grad_norm": 0.3872191607952118, |
|
"learning_rate": 7.973165881521434e-05, |
|
"loss": 1.0569, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.23144506735838005, |
|
"grad_norm": 0.4070218503475189, |
|
"learning_rate": 7.964625975165225e-05, |
|
"loss": 1.0516, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.2319471173960338, |
|
"grad_norm": 0.35880640149116516, |
|
"learning_rate": 7.956072710666859e-05, |
|
"loss": 1.0315, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.23244916743368757, |
|
"grad_norm": 0.448629230260849, |
|
"learning_rate": 7.947506126566009e-05, |
|
"loss": 1.0253, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.23295121747134132, |
|
"grad_norm": 0.3651820719242096, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 1.0126, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.23345326750899506, |
|
"grad_norm": 0.3588433265686035, |
|
"learning_rate": 7.930333154015466e-05, |
|
"loss": 1.0329, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.2339553175466488, |
|
"grad_norm": 0.3761132061481476, |
|
"learning_rate": 7.921726842944508e-05, |
|
"loss": 1.0054, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.23445736758430258, |
|
"grad_norm": 0.36542749404907227, |
|
"learning_rate": 7.913107367028187e-05, |
|
"loss": 1.0458, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.23495941762195632, |
|
"grad_norm": 0.3760159909725189, |
|
"learning_rate": 7.90447476510452e-05, |
|
"loss": 1.016, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.23546146765961007, |
|
"grad_norm": 0.34772396087646484, |
|
"learning_rate": 7.895829076070663e-05, |
|
"loss": 0.9758, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.2359635176972638, |
|
"grad_norm": 0.3899083137512207, |
|
"learning_rate": 7.88717033888274e-05, |
|
"loss": 1.0391, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.2364655677349176, |
|
"grad_norm": 0.3794157803058624, |
|
"learning_rate": 7.878498592555674e-05, |
|
"loss": 1.0162, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.23696761777257133, |
|
"grad_norm": 0.3927205801010132, |
|
"learning_rate": 7.869813876162998e-05, |
|
"loss": 0.9797, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.23746966781022508, |
|
"grad_norm": 0.3774932324886322, |
|
"learning_rate": 7.86111622883669e-05, |
|
"loss": 0.9606, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.23797171784787885, |
|
"grad_norm": 0.37682032585144043, |
|
"learning_rate": 7.852405689766993e-05, |
|
"loss": 1.0554, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.2384737678855326, |
|
"grad_norm": 0.3759259879589081, |
|
"learning_rate": 7.843682298202235e-05, |
|
"loss": 0.9883, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.23897581792318634, |
|
"grad_norm": 0.38955962657928467, |
|
"learning_rate": 7.834946093448659e-05, |
|
"loss": 1.0126, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.23947786796084009, |
|
"grad_norm": 0.39181217551231384, |
|
"learning_rate": 7.826197114870242e-05, |
|
"loss": 1.0209, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.23997991799849386, |
|
"grad_norm": 0.38797685503959656, |
|
"learning_rate": 7.817435401888513e-05, |
|
"loss": 1.0166, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.2404819680361476, |
|
"grad_norm": 0.3912067413330078, |
|
"learning_rate": 7.808660993982388e-05, |
|
"loss": 0.9866, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.24098401807380135, |
|
"grad_norm": 0.3997304439544678, |
|
"learning_rate": 7.799873930687978e-05, |
|
"loss": 0.9763, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.24148606811145512, |
|
"grad_norm": 0.40459659695625305, |
|
"learning_rate": 7.79107425159842e-05, |
|
"loss": 1.0234, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.24198811814910887, |
|
"grad_norm": 0.4033385217189789, |
|
"learning_rate": 7.782261996363693e-05, |
|
"loss": 0.9801, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.2424901681867626, |
|
"grad_norm": 0.41744333505630493, |
|
"learning_rate": 7.773437204690449e-05, |
|
"loss": 0.9665, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.24299221822441636, |
|
"grad_norm": 0.4200511872768402, |
|
"learning_rate": 7.764599916341817e-05, |
|
"loss": 0.957, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.24349426826207013, |
|
"grad_norm": 0.4265231490135193, |
|
"learning_rate": 7.755750171137246e-05, |
|
"loss": 0.9379, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.24399631829972387, |
|
"grad_norm": 0.4306912124156952, |
|
"learning_rate": 7.746888008952301e-05, |
|
"loss": 0.9734, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.24449836833737762, |
|
"grad_norm": 0.4338829219341278, |
|
"learning_rate": 7.738013469718507e-05, |
|
"loss": 0.9265, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.24500041837503136, |
|
"grad_norm": 0.43540337681770325, |
|
"learning_rate": 7.729126593423151e-05, |
|
"loss": 0.9211, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.24550246841268514, |
|
"grad_norm": 0.46909114718437195, |
|
"learning_rate": 7.720227420109112e-05, |
|
"loss": 0.928, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.24600451845033888, |
|
"grad_norm": 0.4378572404384613, |
|
"learning_rate": 7.711315989874677e-05, |
|
"loss": 0.8604, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.24650656848799263, |
|
"grad_norm": 0.4667833745479584, |
|
"learning_rate": 7.702392342873358e-05, |
|
"loss": 0.8831, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.2470086185256464, |
|
"grad_norm": 0.44659602642059326, |
|
"learning_rate": 7.69345651931372e-05, |
|
"loss": 0.9048, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.24751066856330015, |
|
"grad_norm": 0.4557839334011078, |
|
"learning_rate": 7.684508559459187e-05, |
|
"loss": 0.8803, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.2480127186009539, |
|
"grad_norm": 0.4604610204696655, |
|
"learning_rate": 7.675548503627871e-05, |
|
"loss": 0.8387, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.24851476863860764, |
|
"grad_norm": 0.4708879888057709, |
|
"learning_rate": 7.666576392192389e-05, |
|
"loss": 0.8432, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.2490168186762614, |
|
"grad_norm": 0.5023857951164246, |
|
"learning_rate": 7.65759226557967e-05, |
|
"loss": 0.9374, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.24951886871391515, |
|
"grad_norm": 0.5210058689117432, |
|
"learning_rate": 7.648596164270791e-05, |
|
"loss": 0.9176, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.2500209187515689, |
|
"grad_norm": 0.5268908739089966, |
|
"learning_rate": 7.639588128800778e-05, |
|
"loss": 0.8858, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.25052296878922264, |
|
"grad_norm": 0.5862696170806885, |
|
"learning_rate": 7.630568199758436e-05, |
|
"loss": 0.8763, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.2510250188268764, |
|
"grad_norm": 0.6300661563873291, |
|
"learning_rate": 7.621536417786159e-05, |
|
"loss": 0.7728, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2515270688645302, |
|
"grad_norm": 0.4835459589958191, |
|
"learning_rate": 7.612492823579745e-05, |
|
"loss": 1.2268, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.25202911890218394, |
|
"grad_norm": 0.43844684958457947, |
|
"learning_rate": 7.60343745788822e-05, |
|
"loss": 1.2638, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.2525311689398377, |
|
"grad_norm": 0.44383448362350464, |
|
"learning_rate": 7.594370361513648e-05, |
|
"loss": 1.204, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.2530332189774914, |
|
"grad_norm": 0.40099960565567017, |
|
"learning_rate": 7.585291575310952e-05, |
|
"loss": 1.1228, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.25353526901514517, |
|
"grad_norm": 0.3856929838657379, |
|
"learning_rate": 7.576201140187727e-05, |
|
"loss": 1.127, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.2540373190527989, |
|
"grad_norm": 0.41922733187675476, |
|
"learning_rate": 7.567099097104054e-05, |
|
"loss": 1.1535, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.25453936909045266, |
|
"grad_norm": 0.39519184827804565, |
|
"learning_rate": 7.557985487072318e-05, |
|
"loss": 1.1119, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.25504141912810646, |
|
"grad_norm": 0.3693808317184448, |
|
"learning_rate": 7.548860351157027e-05, |
|
"loss": 1.1379, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.2555434691657602, |
|
"grad_norm": 0.36474886536598206, |
|
"learning_rate": 7.539723730474619e-05, |
|
"loss": 1.1053, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.25604551920341395, |
|
"grad_norm": 0.4072096645832062, |
|
"learning_rate": 7.530575666193283e-05, |
|
"loss": 1.0756, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2565475692410677, |
|
"grad_norm": 0.3847082257270813, |
|
"learning_rate": 7.521416199532765e-05, |
|
"loss": 1.0432, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.25704961927872144, |
|
"grad_norm": 0.3695790469646454, |
|
"learning_rate": 7.512245371764197e-05, |
|
"loss": 0.9927, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.2575516693163752, |
|
"grad_norm": 0.36473801732063293, |
|
"learning_rate": 7.503063224209896e-05, |
|
"loss": 1.0291, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.25805371935402893, |
|
"grad_norm": 0.36407670378685, |
|
"learning_rate": 7.493869798243187e-05, |
|
"loss": 1.014, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.2585557693916827, |
|
"grad_norm": 0.37464427947998047, |
|
"learning_rate": 7.484665135288213e-05, |
|
"loss": 1.0923, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.2590578194293365, |
|
"grad_norm": 0.34929415583610535, |
|
"learning_rate": 7.475449276819753e-05, |
|
"loss": 1.0533, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.2595598694669902, |
|
"grad_norm": 0.36770978569984436, |
|
"learning_rate": 7.466222264363021e-05, |
|
"loss": 0.9745, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.26006191950464397, |
|
"grad_norm": 0.3667100965976715, |
|
"learning_rate": 7.456984139493502e-05, |
|
"loss": 0.9944, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.2605639695422977, |
|
"grad_norm": 0.3640177845954895, |
|
"learning_rate": 7.447734943836741e-05, |
|
"loss": 1.0289, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.26106601957995146, |
|
"grad_norm": 0.35481715202331543, |
|
"learning_rate": 7.438474719068173e-05, |
|
"loss": 1.0214, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.2615680696176052, |
|
"grad_norm": 0.36664754152297974, |
|
"learning_rate": 7.429203506912927e-05, |
|
"loss": 1.0307, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.26207011965525895, |
|
"grad_norm": 0.3693181276321411, |
|
"learning_rate": 7.419921349145634e-05, |
|
"loss": 0.9277, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.26257216969291275, |
|
"grad_norm": 0.38111287355422974, |
|
"learning_rate": 7.410628287590254e-05, |
|
"loss": 0.9725, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.2630742197305665, |
|
"grad_norm": 0.3914952576160431, |
|
"learning_rate": 7.401324364119871e-05, |
|
"loss": 1.0405, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.26357626976822024, |
|
"grad_norm": 0.38030022382736206, |
|
"learning_rate": 7.392009620656513e-05, |
|
"loss": 0.9838, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.264078319805874, |
|
"grad_norm": 0.41087502241134644, |
|
"learning_rate": 7.382684099170959e-05, |
|
"loss": 1.0151, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.26458036984352773, |
|
"grad_norm": 0.40365880727767944, |
|
"learning_rate": 7.373347841682556e-05, |
|
"loss": 0.9753, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.2650824198811815, |
|
"grad_norm": 0.4079309105873108, |
|
"learning_rate": 7.364000890259025e-05, |
|
"loss": 1.0174, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.2655844699188352, |
|
"grad_norm": 0.4056829810142517, |
|
"learning_rate": 7.354643287016268e-05, |
|
"loss": 1.024, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.266086519956489, |
|
"grad_norm": 0.39864933490753174, |
|
"learning_rate": 7.345275074118185e-05, |
|
"loss": 0.9795, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.26658856999414277, |
|
"grad_norm": 0.39665892720222473, |
|
"learning_rate": 7.335896293776486e-05, |
|
"loss": 0.9327, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.2670906200317965, |
|
"grad_norm": 0.38788363337516785, |
|
"learning_rate": 7.326506988250488e-05, |
|
"loss": 0.9648, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.26759267006945026, |
|
"grad_norm": 0.41023018956184387, |
|
"learning_rate": 7.31710719984694e-05, |
|
"loss": 0.9254, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.268094720107104, |
|
"grad_norm": 0.38603848218917847, |
|
"learning_rate": 7.307696970919818e-05, |
|
"loss": 0.958, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.26859677014475775, |
|
"grad_norm": 0.42242223024368286, |
|
"learning_rate": 7.298276343870151e-05, |
|
"loss": 0.9136, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.2690988201824115, |
|
"grad_norm": 0.4157050549983978, |
|
"learning_rate": 7.288845361145811e-05, |
|
"loss": 0.9641, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.2696008702200653, |
|
"grad_norm": 0.4187794625759125, |
|
"learning_rate": 7.279404065241337e-05, |
|
"loss": 0.8804, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.27010292025771904, |
|
"grad_norm": 0.4192327857017517, |
|
"learning_rate": 7.269952498697734e-05, |
|
"loss": 0.9528, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.2706049702953728, |
|
"grad_norm": 0.42294546961784363, |
|
"learning_rate": 7.260490704102287e-05, |
|
"loss": 0.9, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.2711070203330265, |
|
"grad_norm": 0.45047277212142944, |
|
"learning_rate": 7.251018724088367e-05, |
|
"loss": 0.8122, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.2716090703706803, |
|
"grad_norm": 0.45989593863487244, |
|
"learning_rate": 7.241536601335237e-05, |
|
"loss": 0.8988, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.272111120408334, |
|
"grad_norm": 0.5204156041145325, |
|
"learning_rate": 7.232044378567864e-05, |
|
"loss": 0.9557, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.27261317044598776, |
|
"grad_norm": 0.4537619948387146, |
|
"learning_rate": 7.222542098556721e-05, |
|
"loss": 0.8729, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.27311522048364156, |
|
"grad_norm": 0.46789640188217163, |
|
"learning_rate": 7.213029804117604e-05, |
|
"loss": 0.8732, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.2736172705212953, |
|
"grad_norm": 0.4757324159145355, |
|
"learning_rate": 7.203507538111423e-05, |
|
"loss": 0.8749, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.27411932055894905, |
|
"grad_norm": 0.46748244762420654, |
|
"learning_rate": 7.193975343444023e-05, |
|
"loss": 0.7785, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.2746213705966028, |
|
"grad_norm": 0.508681058883667, |
|
"learning_rate": 7.18443326306599e-05, |
|
"loss": 0.8732, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.27512342063425654, |
|
"grad_norm": 0.5589388608932495, |
|
"learning_rate": 7.174881339972448e-05, |
|
"loss": 0.8308, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.2756254706719103, |
|
"grad_norm": 0.5891793966293335, |
|
"learning_rate": 7.165319617202871e-05, |
|
"loss": 0.7965, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.27612752070956403, |
|
"grad_norm": 0.6700708866119385, |
|
"learning_rate": 7.155748137840892e-05, |
|
"loss": 0.7379, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.2766295707472178, |
|
"grad_norm": 0.4654090404510498, |
|
"learning_rate": 7.146166945014102e-05, |
|
"loss": 1.1523, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.2771316207848716, |
|
"grad_norm": 0.4521055221557617, |
|
"learning_rate": 7.136576081893863e-05, |
|
"loss": 1.1763, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.2776336708225253, |
|
"grad_norm": 0.39871326088905334, |
|
"learning_rate": 7.126975591695108e-05, |
|
"loss": 1.1915, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.27813572086017907, |
|
"grad_norm": 0.3950759172439575, |
|
"learning_rate": 7.117365517676145e-05, |
|
"loss": 1.1688, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.2786377708978328, |
|
"grad_norm": 0.4023323357105255, |
|
"learning_rate": 7.107745903138472e-05, |
|
"loss": 1.0745, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.27913982093548656, |
|
"grad_norm": 0.4257362186908722, |
|
"learning_rate": 7.09811679142657e-05, |
|
"loss": 1.1143, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.2796418709731403, |
|
"grad_norm": 0.39084288477897644, |
|
"learning_rate": 7.088478225927715e-05, |
|
"loss": 1.1569, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.28014392101079405, |
|
"grad_norm": 0.3621457815170288, |
|
"learning_rate": 7.078830250071777e-05, |
|
"loss": 1.1078, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.28064597104844785, |
|
"grad_norm": 0.3504067063331604, |
|
"learning_rate": 7.069172907331034e-05, |
|
"loss": 1.0506, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.2811480210861016, |
|
"grad_norm": 0.3640802502632141, |
|
"learning_rate": 7.059506241219965e-05, |
|
"loss": 1.0844, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.28165007112375534, |
|
"grad_norm": 0.47682300209999084, |
|
"learning_rate": 7.049830295295057e-05, |
|
"loss": 0.9911, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.2821521211614091, |
|
"grad_norm": 0.3758324086666107, |
|
"learning_rate": 7.040145113154612e-05, |
|
"loss": 1.0008, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.28265417119906283, |
|
"grad_norm": 0.3609547019004822, |
|
"learning_rate": 7.030450738438553e-05, |
|
"loss": 0.9903, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.2831562212367166, |
|
"grad_norm": 0.38031235337257385, |
|
"learning_rate": 7.020747214828221e-05, |
|
"loss": 1.0049, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.2836582712743703, |
|
"grad_norm": 0.38092276453971863, |
|
"learning_rate": 7.011034586046176e-05, |
|
"loss": 1.0064, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.2841603213120241, |
|
"grad_norm": 0.37201064825057983, |
|
"learning_rate": 7.001312895856011e-05, |
|
"loss": 1.034, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.28466237134967787, |
|
"grad_norm": 0.3920980989933014, |
|
"learning_rate": 6.991582188062143e-05, |
|
"loss": 1.0447, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.2851644213873316, |
|
"grad_norm": 0.3509131669998169, |
|
"learning_rate": 6.981842506509625e-05, |
|
"loss": 0.9887, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.28566647142498536, |
|
"grad_norm": 0.36149969696998596, |
|
"learning_rate": 6.972093895083945e-05, |
|
"loss": 1.0549, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.2861685214626391, |
|
"grad_norm": 0.3768817186355591, |
|
"learning_rate": 6.962336397710819e-05, |
|
"loss": 1.0034, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.28667057150029285, |
|
"grad_norm": 0.37715521454811096, |
|
"learning_rate": 6.952570058356013e-05, |
|
"loss": 1.0081, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.2871726215379466, |
|
"grad_norm": 0.35239478945732117, |
|
"learning_rate": 6.942794921025126e-05, |
|
"loss": 0.9283, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.2876746715756004, |
|
"grad_norm": 0.34368762373924255, |
|
"learning_rate": 6.933011029763405e-05, |
|
"loss": 0.9346, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.28817672161325414, |
|
"grad_norm": 0.3795548677444458, |
|
"learning_rate": 6.923218428655534e-05, |
|
"loss": 0.9778, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.2886787716509079, |
|
"grad_norm": 0.3852332830429077, |
|
"learning_rate": 6.91341716182545e-05, |
|
"loss": 0.9668, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.28918082168856163, |
|
"grad_norm": 0.37631848454475403, |
|
"learning_rate": 6.903607273436128e-05, |
|
"loss": 0.9594, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.2896828717262154, |
|
"grad_norm": 0.3791573941707611, |
|
"learning_rate": 6.893788807689396e-05, |
|
"loss": 0.916, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.2901849217638691, |
|
"grad_norm": 0.3761579096317291, |
|
"learning_rate": 6.883961808825732e-05, |
|
"loss": 0.9475, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.29068697180152286, |
|
"grad_norm": 0.3986110985279083, |
|
"learning_rate": 6.874126321124058e-05, |
|
"loss": 0.9524, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.29118902183917666, |
|
"grad_norm": 0.38280758261680603, |
|
"learning_rate": 6.864282388901544e-05, |
|
"loss": 0.9335, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.2916910718768304, |
|
"grad_norm": 0.41545820236206055, |
|
"learning_rate": 6.854430056513417e-05, |
|
"loss": 0.9306, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.29219312191448416, |
|
"grad_norm": 0.40962398052215576, |
|
"learning_rate": 6.844569368352748e-05, |
|
"loss": 0.9019, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.2926951719521379, |
|
"grad_norm": 0.3950769901275635, |
|
"learning_rate": 6.83470036885026e-05, |
|
"loss": 0.8951, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.29319722198979165, |
|
"grad_norm": 0.4112852215766907, |
|
"learning_rate": 6.824823102474128e-05, |
|
"loss": 0.9652, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.2936992720274454, |
|
"grad_norm": 0.4129278361797333, |
|
"learning_rate": 6.814937613729766e-05, |
|
"loss": 0.9319, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.29420132206509914, |
|
"grad_norm": 0.42486321926116943, |
|
"learning_rate": 6.805043947159651e-05, |
|
"loss": 0.9717, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.2947033721027529, |
|
"grad_norm": 0.42396050691604614, |
|
"learning_rate": 6.795142147343101e-05, |
|
"loss": 0.938, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.2952054221404067, |
|
"grad_norm": 0.4266931116580963, |
|
"learning_rate": 6.785232258896077e-05, |
|
"loss": 0.9092, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.2957074721780604, |
|
"grad_norm": 0.4176103472709656, |
|
"learning_rate": 6.775314326470992e-05, |
|
"loss": 0.8908, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.29620952221571417, |
|
"grad_norm": 0.4264911413192749, |
|
"learning_rate": 6.765388394756504e-05, |
|
"loss": 0.801, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.2967115722533679, |
|
"grad_norm": 0.45049166679382324, |
|
"learning_rate": 6.755454508477312e-05, |
|
"loss": 0.8206, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.29721362229102166, |
|
"grad_norm": 0.4606582820415497, |
|
"learning_rate": 6.745512712393957e-05, |
|
"loss": 0.8618, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.2977156723286754, |
|
"grad_norm": 0.4533351957798004, |
|
"learning_rate": 6.735563051302622e-05, |
|
"loss": 0.8264, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.29821772236632915, |
|
"grad_norm": 0.48195162415504456, |
|
"learning_rate": 6.725605570034929e-05, |
|
"loss": 0.8726, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.29871977240398295, |
|
"grad_norm": 0.5006521344184875, |
|
"learning_rate": 6.715640313457733e-05, |
|
"loss": 0.8731, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.2992218224416367, |
|
"grad_norm": 0.5022867321968079, |
|
"learning_rate": 6.705667326472925e-05, |
|
"loss": 0.8804, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.29972387247929044, |
|
"grad_norm": 0.4990813136100769, |
|
"learning_rate": 6.69568665401723e-05, |
|
"loss": 0.8631, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.3002259225169442, |
|
"grad_norm": 0.5281215906143188, |
|
"learning_rate": 6.685698341062002e-05, |
|
"loss": 0.8227, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.30072797255459793, |
|
"grad_norm": 0.5664414167404175, |
|
"learning_rate": 6.67570243261302e-05, |
|
"loss": 0.8378, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.3012300225922517, |
|
"grad_norm": 0.6653980612754822, |
|
"learning_rate": 6.665698973710288e-05, |
|
"loss": 0.8032, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3012300225922517, |
|
"eval_loss": 0.9845598936080933, |
|
"eval_runtime": 710.1126, |
|
"eval_samples_per_second": 21.26, |
|
"eval_steps_per_second": 2.659, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3017320726299054, |
|
"grad_norm": 0.445065438747406, |
|
"learning_rate": 6.655688009427832e-05, |
|
"loss": 1.2529, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.3022341226675592, |
|
"grad_norm": 0.39952167868614197, |
|
"learning_rate": 6.645669584873494e-05, |
|
"loss": 1.2194, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.30273617270521297, |
|
"grad_norm": 0.403266042470932, |
|
"learning_rate": 6.635643745188734e-05, |
|
"loss": 1.2289, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.3032382227428667, |
|
"grad_norm": 0.38917073607444763, |
|
"learning_rate": 6.625610535548418e-05, |
|
"loss": 1.1336, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.30374027278052046, |
|
"grad_norm": 0.4072120785713196, |
|
"learning_rate": 6.615570001160626e-05, |
|
"loss": 1.0642, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.3042423228181742, |
|
"grad_norm": 0.4204983711242676, |
|
"learning_rate": 6.605522187266441e-05, |
|
"loss": 1.0719, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.30474437285582795, |
|
"grad_norm": 0.39132463932037354, |
|
"learning_rate": 6.595467139139743e-05, |
|
"loss": 1.0398, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.3052464228934817, |
|
"grad_norm": 0.35773175954818726, |
|
"learning_rate": 6.585404902087011e-05, |
|
"loss": 1.0631, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.3057484729311355, |
|
"grad_norm": 0.36051151156425476, |
|
"learning_rate": 6.575335521447114e-05, |
|
"loss": 1.04, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.30625052296878924, |
|
"grad_norm": 0.36739856004714966, |
|
"learning_rate": 6.565259042591113e-05, |
|
"loss": 1.0239, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.306752573006443, |
|
"grad_norm": 0.3616657853126526, |
|
"learning_rate": 6.555175510922047e-05, |
|
"loss": 1.0545, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.30725462304409673, |
|
"grad_norm": 0.3667794167995453, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.9624, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.3077566730817505, |
|
"grad_norm": 0.3631950318813324, |
|
"learning_rate": 6.53498747091558e-05, |
|
"loss": 1.0004, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.3082587231194042, |
|
"grad_norm": 0.35089895129203796, |
|
"learning_rate": 6.524883053542339e-05, |
|
"loss": 1.0094, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.30876077315705797, |
|
"grad_norm": 0.38375306129455566, |
|
"learning_rate": 6.514771765283942e-05, |
|
"loss": 1.018, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.30926282319471177, |
|
"grad_norm": 0.3634318709373474, |
|
"learning_rate": 6.504653651700278e-05, |
|
"loss": 1.0375, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.3097648732323655, |
|
"grad_norm": 0.3617091774940491, |
|
"learning_rate": 6.494528758381984e-05, |
|
"loss": 1.0412, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.31026692327001926, |
|
"grad_norm": 0.3729401230812073, |
|
"learning_rate": 6.484397130950254e-05, |
|
"loss": 1.0327, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.310768973307673, |
|
"grad_norm": 0.3525683581829071, |
|
"learning_rate": 6.474258815056622e-05, |
|
"loss": 1.0164, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.31127102334532675, |
|
"grad_norm": 0.3672581911087036, |
|
"learning_rate": 6.464113856382752e-05, |
|
"loss": 1.0148, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.3117730733829805, |
|
"grad_norm": 0.3790574371814728, |
|
"learning_rate": 6.453962300640249e-05, |
|
"loss": 0.9997, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.31227512342063424, |
|
"grad_norm": 0.36040011048316956, |
|
"learning_rate": 6.44380419357044e-05, |
|
"loss": 0.9505, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.312777173458288, |
|
"grad_norm": 0.3569061756134033, |
|
"learning_rate": 6.43363958094417e-05, |
|
"loss": 0.9429, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.3132792234959418, |
|
"grad_norm": 0.36146458983421326, |
|
"learning_rate": 6.423468508561599e-05, |
|
"loss": 0.9924, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.31378127353359553, |
|
"grad_norm": 0.37957096099853516, |
|
"learning_rate": 6.413291022251989e-05, |
|
"loss": 0.9934, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.3142833235712493, |
|
"grad_norm": 0.37144365906715393, |
|
"learning_rate": 6.403107167873509e-05, |
|
"loss": 0.9251, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.314785373608903, |
|
"grad_norm": 0.3828261196613312, |
|
"learning_rate": 6.392916991313016e-05, |
|
"loss": 0.9649, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.31528742364655676, |
|
"grad_norm": 0.3864898681640625, |
|
"learning_rate": 6.382720538485856e-05, |
|
"loss": 0.9834, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.3157894736842105, |
|
"grad_norm": 0.3928738832473755, |
|
"learning_rate": 6.372517855335655e-05, |
|
"loss": 0.9759, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.31629152372186425, |
|
"grad_norm": 0.42996037006378174, |
|
"learning_rate": 6.362308987834115e-05, |
|
"loss": 0.9628, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.31679357375951805, |
|
"grad_norm": 0.3807196319103241, |
|
"learning_rate": 6.352093981980796e-05, |
|
"loss": 0.9842, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.3172956237971718, |
|
"grad_norm": 0.39248624444007874, |
|
"learning_rate": 6.341872883802923e-05, |
|
"loss": 0.9539, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.31779767383482554, |
|
"grad_norm": 0.4059353470802307, |
|
"learning_rate": 6.331645739355168e-05, |
|
"loss": 0.9635, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.3182997238724793, |
|
"grad_norm": 0.4235178828239441, |
|
"learning_rate": 6.321412594719451e-05, |
|
"loss": 0.9473, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.31880177391013304, |
|
"grad_norm": 0.45633211731910706, |
|
"learning_rate": 6.311173496004723e-05, |
|
"loss": 0.9836, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.3193038239477868, |
|
"grad_norm": 0.4051073491573334, |
|
"learning_rate": 6.300928489346766e-05, |
|
"loss": 0.9482, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.3198058739854405, |
|
"grad_norm": 0.4133238196372986, |
|
"learning_rate": 6.290677620907982e-05, |
|
"loss": 0.9009, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.3203079240230943, |
|
"grad_norm": 0.4294078052043915, |
|
"learning_rate": 6.280420936877188e-05, |
|
"loss": 0.9389, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.32080997406074807, |
|
"grad_norm": 0.4092111885547638, |
|
"learning_rate": 6.270158483469397e-05, |
|
"loss": 0.8397, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.3213120240984018, |
|
"grad_norm": 0.42124441266059875, |
|
"learning_rate": 6.259890306925627e-05, |
|
"loss": 0.8405, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.32181407413605556, |
|
"grad_norm": 0.4422035217285156, |
|
"learning_rate": 6.249616453512677e-05, |
|
"loss": 0.8641, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.3223161241737093, |
|
"grad_norm": 0.4448348879814148, |
|
"learning_rate": 6.239336969522932e-05, |
|
"loss": 0.9077, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.32281817421136305, |
|
"grad_norm": 0.4691510796546936, |
|
"learning_rate": 6.229051901274137e-05, |
|
"loss": 0.8585, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.3233202242490168, |
|
"grad_norm": 0.4641557037830353, |
|
"learning_rate": 6.218761295109208e-05, |
|
"loss": 0.8527, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.3238222742866706, |
|
"grad_norm": 0.5288779735565186, |
|
"learning_rate": 6.208465197396013e-05, |
|
"loss": 0.8489, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.32432432432432434, |
|
"grad_norm": 0.45869073271751404, |
|
"learning_rate": 6.19816365452716e-05, |
|
"loss": 0.8505, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.3248263743619781, |
|
"grad_norm": 0.49422523379325867, |
|
"learning_rate": 6.187856712919795e-05, |
|
"loss": 0.8555, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.32532842439963183, |
|
"grad_norm": 0.5668922066688538, |
|
"learning_rate": 6.177544419015388e-05, |
|
"loss": 0.7629, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.3258304744372856, |
|
"grad_norm": 0.5716300010681152, |
|
"learning_rate": 6.167226819279528e-05, |
|
"loss": 0.8643, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.3263325244749393, |
|
"grad_norm": 0.6652288436889648, |
|
"learning_rate": 6.156903960201709e-05, |
|
"loss": 0.7433, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.32683457451259307, |
|
"grad_norm": 0.6001056432723999, |
|
"learning_rate": 6.146575888295123e-05, |
|
"loss": 1.2497, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.32733662455024687, |
|
"grad_norm": 0.3522529900074005, |
|
"learning_rate": 6.136242650096451e-05, |
|
"loss": 1.177, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.3278386745879006, |
|
"grad_norm": 0.3846982717514038, |
|
"learning_rate": 6.125904292165652e-05, |
|
"loss": 1.1357, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.32834072462555436, |
|
"grad_norm": 0.389482706785202, |
|
"learning_rate": 6.115560861085756e-05, |
|
"loss": 1.0675, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.3288427746632081, |
|
"grad_norm": 0.41399508714675903, |
|
"learning_rate": 6.105212403462651e-05, |
|
"loss": 1.1065, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.32934482470086185, |
|
"grad_norm": 0.5792128443717957, |
|
"learning_rate": 6.0948589659248654e-05, |
|
"loss": 1.1188, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.3298468747385156, |
|
"grad_norm": 0.3753111958503723, |
|
"learning_rate": 6.084500595123383e-05, |
|
"loss": 1.1127, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.33034892477616934, |
|
"grad_norm": 0.3663425147533417, |
|
"learning_rate": 6.0741373377314005e-05, |
|
"loss": 1.019, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.3308509748138231, |
|
"grad_norm": 0.39105096459388733, |
|
"learning_rate": 6.0637692404441416e-05, |
|
"loss": 1.0186, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.3313530248514769, |
|
"grad_norm": 0.38673144578933716, |
|
"learning_rate": 6.0533963499786314e-05, |
|
"loss": 1.0256, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.33185507488913063, |
|
"grad_norm": 0.3633407950401306, |
|
"learning_rate": 6.0430187130735016e-05, |
|
"loss": 1.0332, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.3323571249267844, |
|
"grad_norm": 0.35200172662734985, |
|
"learning_rate": 6.032636376488763e-05, |
|
"loss": 0.9356, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.3328591749644381, |
|
"grad_norm": 0.3665078282356262, |
|
"learning_rate": 6.0222493870056044e-05, |
|
"loss": 1.0154, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.33336122500209187, |
|
"grad_norm": 0.3591248095035553, |
|
"learning_rate": 6.0118577914261784e-05, |
|
"loss": 0.9798, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.3338632750397456, |
|
"grad_norm": 0.361217200756073, |
|
"learning_rate": 6.001461636573397e-05, |
|
"loss": 0.9813, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.33436532507739936, |
|
"grad_norm": 0.37569659948349, |
|
"learning_rate": 5.99106096929071e-05, |
|
"loss": 1.011, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.33486737511505316, |
|
"grad_norm": 0.3692183494567871, |
|
"learning_rate": 5.980655836441902e-05, |
|
"loss": 1.0294, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.3353694251527069, |
|
"grad_norm": 0.374726802110672, |
|
"learning_rate": 5.970246284910876e-05, |
|
"loss": 0.9654, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.33587147519036065, |
|
"grad_norm": 0.3687571585178375, |
|
"learning_rate": 5.959832361601453e-05, |
|
"loss": 1.0423, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.3363735252280144, |
|
"grad_norm": 0.36362433433532715, |
|
"learning_rate": 5.949414113437142e-05, |
|
"loss": 0.8874, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.33687557526566814, |
|
"grad_norm": 0.34844672679901123, |
|
"learning_rate": 5.938991587360946e-05, |
|
"loss": 0.8979, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.3373776253033219, |
|
"grad_norm": 0.3646034598350525, |
|
"learning_rate": 5.9285648303351404e-05, |
|
"loss": 0.9435, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.3378796753409756, |
|
"grad_norm": 0.37094947695732117, |
|
"learning_rate": 5.9181338893410663e-05, |
|
"loss": 0.9679, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.3383817253786294, |
|
"grad_norm": 0.385873943567276, |
|
"learning_rate": 5.907698811378919e-05, |
|
"loss": 0.9898, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.3388837754162832, |
|
"grad_norm": 0.38623571395874023, |
|
"learning_rate": 5.897259643467527e-05, |
|
"loss": 0.987, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.3393858254539369, |
|
"grad_norm": 0.3703857362270355, |
|
"learning_rate": 5.8868164326441546e-05, |
|
"loss": 0.919, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.33988787549159066, |
|
"grad_norm": 0.3874402344226837, |
|
"learning_rate": 5.876369225964283e-05, |
|
"loss": 0.959, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.3403899255292444, |
|
"grad_norm": 0.37169700860977173, |
|
"learning_rate": 5.8659180705013936e-05, |
|
"loss": 0.9883, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.34089197556689815, |
|
"grad_norm": 0.4187929332256317, |
|
"learning_rate": 5.8554630133467624e-05, |
|
"loss": 0.9527, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.3413940256045519, |
|
"grad_norm": 0.39550694823265076, |
|
"learning_rate": 5.8450041016092464e-05, |
|
"loss": 0.9152, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.3418960756422057, |
|
"grad_norm": 0.40294429659843445, |
|
"learning_rate": 5.83454138241507e-05, |
|
"loss": 0.95, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.34239812567985944, |
|
"grad_norm": 0.38999685645103455, |
|
"learning_rate": 5.8240749029076134e-05, |
|
"loss": 0.9475, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.3429001757175132, |
|
"grad_norm": 0.40788596868515015, |
|
"learning_rate": 5.8136047102472e-05, |
|
"loss": 1.01, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.34340222575516693, |
|
"grad_norm": 0.4204280972480774, |
|
"learning_rate": 5.803130851610886e-05, |
|
"loss": 0.934, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.3439042757928207, |
|
"grad_norm": 0.4102809429168701, |
|
"learning_rate": 5.792653374192245e-05, |
|
"loss": 0.9398, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.3444063258304744, |
|
"grad_norm": 0.4025559723377228, |
|
"learning_rate": 5.782172325201155e-05, |
|
"loss": 0.9245, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.34490837586812817, |
|
"grad_norm": 0.4101907014846802, |
|
"learning_rate": 5.771687751863587e-05, |
|
"loss": 0.9279, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.34541042590578197, |
|
"grad_norm": 0.43221110105514526, |
|
"learning_rate": 5.761199701421391e-05, |
|
"loss": 0.8831, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.3459124759434357, |
|
"grad_norm": 0.42259782552719116, |
|
"learning_rate": 5.750708221132092e-05, |
|
"loss": 0.8903, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.34641452598108946, |
|
"grad_norm": 0.4195202887058258, |
|
"learning_rate": 5.7402133582686576e-05, |
|
"loss": 0.8291, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.3469165760187432, |
|
"grad_norm": 0.4531534016132355, |
|
"learning_rate": 5.7297151601193056e-05, |
|
"loss": 0.8893, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.34741862605639695, |
|
"grad_norm": 0.46428826451301575, |
|
"learning_rate": 5.719213673987277e-05, |
|
"loss": 0.9049, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.3479206760940507, |
|
"grad_norm": 0.4338727295398712, |
|
"learning_rate": 5.708708947190634e-05, |
|
"loss": 0.8142, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.34842272613170444, |
|
"grad_norm": 0.44543692469596863, |
|
"learning_rate": 5.698201027062034e-05, |
|
"loss": 0.8463, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.3489247761693582, |
|
"grad_norm": 0.4769425094127655, |
|
"learning_rate": 5.6876899609485256e-05, |
|
"loss": 0.8931, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.349426826207012, |
|
"grad_norm": 0.49232223629951477, |
|
"learning_rate": 5.6771757962113323e-05, |
|
"loss": 0.8189, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.34992887624466573, |
|
"grad_norm": 0.49148690700531006, |
|
"learning_rate": 5.666658580225643e-05, |
|
"loss": 0.8153, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.3504309262823195, |
|
"grad_norm": 0.5055503845214844, |
|
"learning_rate": 5.656138360380391e-05, |
|
"loss": 0.8018, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.3509329763199732, |
|
"grad_norm": 0.5481170415878296, |
|
"learning_rate": 5.645615184078044e-05, |
|
"loss": 0.8587, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.35143502635762697, |
|
"grad_norm": 0.6615381240844727, |
|
"learning_rate": 5.6350890987343944e-05, |
|
"loss": 0.777, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3519370763952807, |
|
"grad_norm": 0.434299111366272, |
|
"learning_rate": 5.6245601517783406e-05, |
|
"loss": 1.2088, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.35243912643293446, |
|
"grad_norm": 0.39533907175064087, |
|
"learning_rate": 5.614028390651675e-05, |
|
"loss": 1.1814, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.35294117647058826, |
|
"grad_norm": 0.3828687369823456, |
|
"learning_rate": 5.6034938628088705e-05, |
|
"loss": 1.1873, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.353443226508242, |
|
"grad_norm": 0.3660382628440857, |
|
"learning_rate": 5.5929566157168665e-05, |
|
"loss": 1.0862, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.35394527654589575, |
|
"grad_norm": 0.39876964688301086, |
|
"learning_rate": 5.582416696854853e-05, |
|
"loss": 1.0083, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.3544473265835495, |
|
"grad_norm": 0.409247487783432, |
|
"learning_rate": 5.571874153714063e-05, |
|
"loss": 1.0714, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.35494937662120324, |
|
"grad_norm": 0.3872778117656708, |
|
"learning_rate": 5.561329033797547e-05, |
|
"loss": 1.085, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.355451426658857, |
|
"grad_norm": 0.38185930252075195, |
|
"learning_rate": 5.550781384619973e-05, |
|
"loss": 1.0762, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.35595347669651073, |
|
"grad_norm": 0.3866881728172302, |
|
"learning_rate": 5.540231253707403e-05, |
|
"loss": 1.0326, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.35645552673416453, |
|
"grad_norm": 0.37910160422325134, |
|
"learning_rate": 5.5296786885970805e-05, |
|
"loss": 1.0769, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.3569575767718183, |
|
"grad_norm": 0.3608991205692291, |
|
"learning_rate": 5.519123736837217e-05, |
|
"loss": 1.0523, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.357459626809472, |
|
"grad_norm": 0.36697694659233093, |
|
"learning_rate": 5.50856644598678e-05, |
|
"loss": 0.9778, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.35796167684712576, |
|
"grad_norm": 0.4545275568962097, |
|
"learning_rate": 5.498006863615275e-05, |
|
"loss": 1.0207, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.3584637268847795, |
|
"grad_norm": 0.3483712375164032, |
|
"learning_rate": 5.487445037302531e-05, |
|
"loss": 1.0002, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.35896577692243326, |
|
"grad_norm": 0.3665158152580261, |
|
"learning_rate": 5.476881014638491e-05, |
|
"loss": 1.0274, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.359467826960087, |
|
"grad_norm": 0.35564157366752625, |
|
"learning_rate": 5.466314843222993e-05, |
|
"loss": 0.9884, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.3599698769977408, |
|
"grad_norm": 0.3559761345386505, |
|
"learning_rate": 5.4557465706655564e-05, |
|
"loss": 1.0143, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.36047192703539455, |
|
"grad_norm": 0.38508090376853943, |
|
"learning_rate": 5.4451762445851705e-05, |
|
"loss": 1.0679, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.3609739770730483, |
|
"grad_norm": 0.3513292670249939, |
|
"learning_rate": 5.4346039126100733e-05, |
|
"loss": 0.948, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.36147602711070204, |
|
"grad_norm": 0.36502474546432495, |
|
"learning_rate": 5.4240296223775465e-05, |
|
"loss": 1.0246, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.3619780771483558, |
|
"grad_norm": 0.3846004605293274, |
|
"learning_rate": 5.41345342153369e-05, |
|
"loss": 1.0332, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.3624801271860095, |
|
"grad_norm": 0.35061997175216675, |
|
"learning_rate": 5.4028753577332146e-05, |
|
"loss": 0.9286, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.36298217722366327, |
|
"grad_norm": 0.37235984206199646, |
|
"learning_rate": 5.392295478639225e-05, |
|
"loss": 1.0385, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.36348422726131707, |
|
"grad_norm": 0.3770149350166321, |
|
"learning_rate": 5.3817138319230076e-05, |
|
"loss": 0.9865, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.3639862772989708, |
|
"grad_norm": 0.3904590606689453, |
|
"learning_rate": 5.3711304652638126e-05, |
|
"loss": 0.934, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.36448832733662456, |
|
"grad_norm": 0.3823120892047882, |
|
"learning_rate": 5.360545426348638e-05, |
|
"loss": 0.9394, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.3649903773742783, |
|
"grad_norm": 0.36231666803359985, |
|
"learning_rate": 5.349958762872016e-05, |
|
"loss": 0.9282, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.36549242741193205, |
|
"grad_norm": 0.3757944405078888, |
|
"learning_rate": 5.3393705225358046e-05, |
|
"loss": 0.8884, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.3659944774495858, |
|
"grad_norm": 0.4007607102394104, |
|
"learning_rate": 5.32878075304896e-05, |
|
"loss": 0.9739, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.36649652748723954, |
|
"grad_norm": 0.40476924180984497, |
|
"learning_rate": 5.318189502127332e-05, |
|
"loss": 0.9458, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.3669985775248933, |
|
"grad_norm": 0.39884302020072937, |
|
"learning_rate": 5.307596817493445e-05, |
|
"loss": 0.8989, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.3675006275625471, |
|
"grad_norm": 0.42604318261146545, |
|
"learning_rate": 5.297002746876284e-05, |
|
"loss": 0.9337, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.36800267760020083, |
|
"grad_norm": 0.41235285997390747, |
|
"learning_rate": 5.286407338011079e-05, |
|
"loss": 0.9191, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.3685047276378546, |
|
"grad_norm": 0.40768033266067505, |
|
"learning_rate": 5.275810638639088e-05, |
|
"loss": 0.957, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.3690067776755083, |
|
"grad_norm": 0.42073965072631836, |
|
"learning_rate": 5.265212696507387e-05, |
|
"loss": 0.9503, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.36950882771316207, |
|
"grad_norm": 0.40175575017929077, |
|
"learning_rate": 5.254613559368649e-05, |
|
"loss": 0.9277, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.3700108777508158, |
|
"grad_norm": 0.39959418773651123, |
|
"learning_rate": 5.2440132749809313e-05, |
|
"loss": 0.9021, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.37051292778846956, |
|
"grad_norm": 0.45893776416778564, |
|
"learning_rate": 5.2334118911074635e-05, |
|
"loss": 0.9413, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.37101497782612336, |
|
"grad_norm": 0.4203508794307709, |
|
"learning_rate": 5.2228094555164265e-05, |
|
"loss": 0.9131, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.3715170278637771, |
|
"grad_norm": 0.4097796082496643, |
|
"learning_rate": 5.212206015980742e-05, |
|
"loss": 0.881, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.37201907790143085, |
|
"grad_norm": 0.44615375995635986, |
|
"learning_rate": 5.201601620277854e-05, |
|
"loss": 0.8147, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.3725211279390846, |
|
"grad_norm": 0.4491327702999115, |
|
"learning_rate": 5.190996316189515e-05, |
|
"loss": 0.8368, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.37302317797673834, |
|
"grad_norm": 0.4489690065383911, |
|
"learning_rate": 5.180390151501569e-05, |
|
"loss": 0.9062, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.3735252280143921, |
|
"grad_norm": 0.4554278552532196, |
|
"learning_rate": 5.1697831740037436e-05, |
|
"loss": 0.841, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.37402727805204583, |
|
"grad_norm": 0.4591432213783264, |
|
"learning_rate": 5.159175431489424e-05, |
|
"loss": 0.8241, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.37452932808969963, |
|
"grad_norm": 0.4552235007286072, |
|
"learning_rate": 5.1485669717554396e-05, |
|
"loss": 0.7784, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.3750313781273534, |
|
"grad_norm": 0.4900113046169281, |
|
"learning_rate": 5.137957842601856e-05, |
|
"loss": 0.7905, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.3755334281650071, |
|
"grad_norm": 0.5452777743339539, |
|
"learning_rate": 5.1273480918317554e-05, |
|
"loss": 0.8248, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.37603547820266087, |
|
"grad_norm": 0.5230666399002075, |
|
"learning_rate": 5.116737767251021e-05, |
|
"loss": 0.781, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.3765375282403146, |
|
"grad_norm": 0.632352352142334, |
|
"learning_rate": 5.1061269166681183e-05, |
|
"loss": 0.7272, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.3765375282403146, |
|
"eval_loss": 0.9616905450820923, |
|
"eval_runtime": 709.9548, |
|
"eval_samples_per_second": 21.265, |
|
"eval_steps_per_second": 2.659, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.37703957827796836, |
|
"grad_norm": 0.4330700933933258, |
|
"learning_rate": 5.095515587893884e-05, |
|
"loss": 1.2318, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.3775416283156221, |
|
"grad_norm": 0.3779419958591461, |
|
"learning_rate": 5.084903828741312e-05, |
|
"loss": 1.2228, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.3780436783532759, |
|
"grad_norm": 0.376594603061676, |
|
"learning_rate": 5.0742916870253334e-05, |
|
"loss": 1.1351, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.37854572839092965, |
|
"grad_norm": 0.3838042914867401, |
|
"learning_rate": 5.063679210562602e-05, |
|
"loss": 1.1161, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.3790477784285834, |
|
"grad_norm": 0.37450775504112244, |
|
"learning_rate": 5.053066447171282e-05, |
|
"loss": 1.0012, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.37954982846623714, |
|
"grad_norm": 0.37315741181373596, |
|
"learning_rate": 5.042453444670828e-05, |
|
"loss": 1.1146, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.3800518785038909, |
|
"grad_norm": 0.3619626760482788, |
|
"learning_rate": 5.031840250881776e-05, |
|
"loss": 1.0954, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.38055392854154463, |
|
"grad_norm": 0.3665991723537445, |
|
"learning_rate": 5.021226913625522e-05, |
|
"loss": 1.0704, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.3810559785791984, |
|
"grad_norm": 0.3833234906196594, |
|
"learning_rate": 5.0106134807241045e-05, |
|
"loss": 1.0973, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.3815580286168522, |
|
"grad_norm": 0.37826788425445557, |
|
"learning_rate": 5e-05, |
|
"loss": 1.016, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.3820600786545059, |
|
"grad_norm": 0.3752281665802002, |
|
"learning_rate": 4.989386519275895e-05, |
|
"loss": 1.0214, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.38256212869215966, |
|
"grad_norm": 0.35231512784957886, |
|
"learning_rate": 4.978773086374479e-05, |
|
"loss": 0.9812, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.3830641787298134, |
|
"grad_norm": 0.34861356019973755, |
|
"learning_rate": 4.968159749118223e-05, |
|
"loss": 0.9588, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.38356622876746715, |
|
"grad_norm": 0.3637848198413849, |
|
"learning_rate": 4.957546555329173e-05, |
|
"loss": 0.9808, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.3840682788051209, |
|
"grad_norm": 0.38542938232421875, |
|
"learning_rate": 4.94693355282872e-05, |
|
"loss": 1.0052, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.38457032884277464, |
|
"grad_norm": 0.3675108850002289, |
|
"learning_rate": 4.9363207894374e-05, |
|
"loss": 0.9797, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.3850723788804284, |
|
"grad_norm": 0.3529476523399353, |
|
"learning_rate": 4.925708312974667e-05, |
|
"loss": 1.0427, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.3855744289180822, |
|
"grad_norm": 0.35466766357421875, |
|
"learning_rate": 4.9150961712586895e-05, |
|
"loss": 1.0076, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.38607647895573594, |
|
"grad_norm": 0.3574579358100891, |
|
"learning_rate": 4.904484412106117e-05, |
|
"loss": 1.0206, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.3865785289933897, |
|
"grad_norm": 0.35434436798095703, |
|
"learning_rate": 4.893873083331882e-05, |
|
"loss": 0.944, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.3870805790310434, |
|
"grad_norm": 0.37650713324546814, |
|
"learning_rate": 4.88326223274898e-05, |
|
"loss": 0.9769, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.38758262906869717, |
|
"grad_norm": 0.3571126461029053, |
|
"learning_rate": 4.8726519081682444e-05, |
|
"loss": 0.996, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.3880846791063509, |
|
"grad_norm": 0.3663455843925476, |
|
"learning_rate": 4.862042157398146e-05, |
|
"loss": 0.908, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.38858672914400466, |
|
"grad_norm": 0.380512535572052, |
|
"learning_rate": 4.851433028244562e-05, |
|
"loss": 1.0196, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.38908877918165846, |
|
"grad_norm": 0.38776859641075134, |
|
"learning_rate": 4.840824568510579e-05, |
|
"loss": 0.9251, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.3895908292193122, |
|
"grad_norm": 0.39721420407295227, |
|
"learning_rate": 4.830216825996257e-05, |
|
"loss": 0.9202, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.39009287925696595, |
|
"grad_norm": 0.3933786153793335, |
|
"learning_rate": 4.8196098484984305e-05, |
|
"loss": 0.944, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.3905949292946197, |
|
"grad_norm": 0.3744068741798401, |
|
"learning_rate": 4.809003683810486e-05, |
|
"loss": 0.9442, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.39109697933227344, |
|
"grad_norm": 0.39798104763031006, |
|
"learning_rate": 4.798398379722147e-05, |
|
"loss": 0.9739, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.3915990293699272, |
|
"grad_norm": 0.3898034691810608, |
|
"learning_rate": 4.78779398401926e-05, |
|
"loss": 0.9401, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.39210107940758093, |
|
"grad_norm": 0.3922993540763855, |
|
"learning_rate": 4.777190544483574e-05, |
|
"loss": 0.9504, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.39260312944523473, |
|
"grad_norm": 0.38821038603782654, |
|
"learning_rate": 4.7665881088925376e-05, |
|
"loss": 0.9617, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.3931051794828885, |
|
"grad_norm": 0.3955070674419403, |
|
"learning_rate": 4.75598672501907e-05, |
|
"loss": 0.9072, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.3936072295205422, |
|
"grad_norm": 0.38435256481170654, |
|
"learning_rate": 4.7453864406313544e-05, |
|
"loss": 0.9285, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.39410927955819597, |
|
"grad_norm": 0.40070778131484985, |
|
"learning_rate": 4.734787303492615e-05, |
|
"loss": 0.9422, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.3946113295958497, |
|
"grad_norm": 0.4178116023540497, |
|
"learning_rate": 4.7241893613609126e-05, |
|
"loss": 0.9361, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.39511337963350346, |
|
"grad_norm": 0.4187740087509155, |
|
"learning_rate": 4.7135926619889226e-05, |
|
"loss": 0.8883, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.3956154296711572, |
|
"grad_norm": 0.42808717489242554, |
|
"learning_rate": 4.702997253123716e-05, |
|
"loss": 0.8763, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.396117479708811, |
|
"grad_norm": 0.4418085813522339, |
|
"learning_rate": 4.6924031825065566e-05, |
|
"loss": 0.9475, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.39661952974646475, |
|
"grad_norm": 0.4347171485424042, |
|
"learning_rate": 4.6818104978726685e-05, |
|
"loss": 0.7853, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.3971215797841185, |
|
"grad_norm": 0.4366185665130615, |
|
"learning_rate": 4.6712192469510425e-05, |
|
"loss": 0.8485, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.39762362982177224, |
|
"grad_norm": 0.4427374601364136, |
|
"learning_rate": 4.6606294774641966e-05, |
|
"loss": 0.8737, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.398125679859426, |
|
"grad_norm": 0.4442150890827179, |
|
"learning_rate": 4.6500412371279836e-05, |
|
"loss": 0.8032, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.39862772989707973, |
|
"grad_norm": 0.4936541020870209, |
|
"learning_rate": 4.6394545736513634e-05, |
|
"loss": 0.8794, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.3991297799347335, |
|
"grad_norm": 0.47061917185783386, |
|
"learning_rate": 4.628869534736187e-05, |
|
"loss": 0.8568, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.3996318299723872, |
|
"grad_norm": 0.525748610496521, |
|
"learning_rate": 4.618286168076993e-05, |
|
"loss": 0.8513, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.400133880010041, |
|
"grad_norm": 0.4828825891017914, |
|
"learning_rate": 4.607704521360776e-05, |
|
"loss": 0.8328, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.40063593004769477, |
|
"grad_norm": 0.4649796187877655, |
|
"learning_rate": 4.597124642266788e-05, |
|
"loss": 0.7556, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.4011379800853485, |
|
"grad_norm": 0.5552456974983215, |
|
"learning_rate": 4.5865465784663114e-05, |
|
"loss": 0.8184, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.40164003012300226, |
|
"grad_norm": 0.706791341304779, |
|
"learning_rate": 4.575970377622456e-05, |
|
"loss": 0.7444, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.402142080160656, |
|
"grad_norm": 0.4323110282421112, |
|
"learning_rate": 4.565396087389927e-05, |
|
"loss": 1.1972, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.40264413019830975, |
|
"grad_norm": 0.354783833026886, |
|
"learning_rate": 4.554823755414829e-05, |
|
"loss": 1.1179, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.4031461802359635, |
|
"grad_norm": 0.3601534068584442, |
|
"learning_rate": 4.544253429334444e-05, |
|
"loss": 1.1264, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.4036482302736173, |
|
"grad_norm": 0.3654196858406067, |
|
"learning_rate": 4.5336851567770076e-05, |
|
"loss": 1.0834, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.40415028031127104, |
|
"grad_norm": 0.3873622715473175, |
|
"learning_rate": 4.52311898536151e-05, |
|
"loss": 1.0247, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.4046523303489248, |
|
"grad_norm": 0.37240368127822876, |
|
"learning_rate": 4.5125549626974696e-05, |
|
"loss": 1.0396, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.4051543803865785, |
|
"grad_norm": 0.36485597491264343, |
|
"learning_rate": 4.5019931363847275e-05, |
|
"loss": 1.0249, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.4056564304242323, |
|
"grad_norm": 0.38187476992607117, |
|
"learning_rate": 4.491433554013221e-05, |
|
"loss": 1.0405, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.406158480461886, |
|
"grad_norm": 0.36962300539016724, |
|
"learning_rate": 4.480876263162783e-05, |
|
"loss": 1.0253, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.40666053049953976, |
|
"grad_norm": 0.34921392798423767, |
|
"learning_rate": 4.47032131140292e-05, |
|
"loss": 1.016, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.40716258053719356, |
|
"grad_norm": 0.3537079691886902, |
|
"learning_rate": 4.459768746292597e-05, |
|
"loss": 1.0478, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.4076646305748473, |
|
"grad_norm": 0.3565637767314911, |
|
"learning_rate": 4.449218615380029e-05, |
|
"loss": 1.0148, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.40816668061250105, |
|
"grad_norm": 0.35647860169410706, |
|
"learning_rate": 4.4386709662024544e-05, |
|
"loss": 0.9924, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.4086687306501548, |
|
"grad_norm": 0.34907302260398865, |
|
"learning_rate": 4.4281258462859396e-05, |
|
"loss": 1.0018, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.40917078068780854, |
|
"grad_norm": 0.3495464026927948, |
|
"learning_rate": 4.4175833031451473e-05, |
|
"loss": 0.9449, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.4096728307254623, |
|
"grad_norm": 0.3409779369831085, |
|
"learning_rate": 4.407043384283136e-05, |
|
"loss": 0.9676, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.41017488076311603, |
|
"grad_norm": 0.3575940430164337, |
|
"learning_rate": 4.396506137191131e-05, |
|
"loss": 0.9863, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.41067693080076983, |
|
"grad_norm": 0.36198464035987854, |
|
"learning_rate": 4.3859716093483245e-05, |
|
"loss": 0.9905, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.4111789808384236, |
|
"grad_norm": 0.34198319911956787, |
|
"learning_rate": 4.3754398482216606e-05, |
|
"loss": 0.9482, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.4116810308760773, |
|
"grad_norm": 0.3572383224964142, |
|
"learning_rate": 4.364910901265606e-05, |
|
"loss": 0.934, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.41218308091373107, |
|
"grad_norm": 0.3588048219680786, |
|
"learning_rate": 4.354384815921958e-05, |
|
"loss": 0.9856, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.4126851309513848, |
|
"grad_norm": 0.3628753125667572, |
|
"learning_rate": 4.343861639619611e-05, |
|
"loss": 0.9762, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.41318718098903856, |
|
"grad_norm": 0.3723025321960449, |
|
"learning_rate": 4.3333414197743595e-05, |
|
"loss": 0.9704, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.4136892310266923, |
|
"grad_norm": 0.3608042597770691, |
|
"learning_rate": 4.322824203788669e-05, |
|
"loss": 0.951, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.4141912810643461, |
|
"grad_norm": 0.3752797245979309, |
|
"learning_rate": 4.3123100390514756e-05, |
|
"loss": 0.9878, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.41469333110199985, |
|
"grad_norm": 0.37421780824661255, |
|
"learning_rate": 4.3017989729379675e-05, |
|
"loss": 0.9776, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.4151953811396536, |
|
"grad_norm": 0.3613242506980896, |
|
"learning_rate": 4.291291052809366e-05, |
|
"loss": 0.9205, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.41569743117730734, |
|
"grad_norm": 0.3855215609073639, |
|
"learning_rate": 4.280786326012723e-05, |
|
"loss": 0.986, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.4161994812149611, |
|
"grad_norm": 0.41651931405067444, |
|
"learning_rate": 4.2702848398806956e-05, |
|
"loss": 0.9639, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.41670153125261483, |
|
"grad_norm": 0.3905417323112488, |
|
"learning_rate": 4.2597866417313436e-05, |
|
"loss": 0.9319, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.4172035812902686, |
|
"grad_norm": 0.4226928651332855, |
|
"learning_rate": 4.249291778867909e-05, |
|
"loss": 0.9213, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.4177056313279223, |
|
"grad_norm": 0.382017582654953, |
|
"learning_rate": 4.23880029857861e-05, |
|
"loss": 0.8846, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.4182076813655761, |
|
"grad_norm": 0.417928546667099, |
|
"learning_rate": 4.2283122481364144e-05, |
|
"loss": 0.9288, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.41870973140322987, |
|
"grad_norm": 0.41737717390060425, |
|
"learning_rate": 4.2178276747988446e-05, |
|
"loss": 0.9423, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.4192117814408836, |
|
"grad_norm": 0.39423155784606934, |
|
"learning_rate": 4.207346625807756e-05, |
|
"loss": 0.8894, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.41971383147853736, |
|
"grad_norm": 0.427852064371109, |
|
"learning_rate": 4.196869148389114e-05, |
|
"loss": 0.9639, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.4202158815161911, |
|
"grad_norm": 0.4028894007205963, |
|
"learning_rate": 4.1863952897528e-05, |
|
"loss": 0.9309, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.42071793155384485, |
|
"grad_norm": 0.42165279388427734, |
|
"learning_rate": 4.175925097092388e-05, |
|
"loss": 0.9514, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.4212199815914986, |
|
"grad_norm": 0.4179115295410156, |
|
"learning_rate": 4.165458617584933e-05, |
|
"loss": 0.8544, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.4217220316291524, |
|
"grad_norm": 0.479951947927475, |
|
"learning_rate": 4.1549958983907555e-05, |
|
"loss": 0.811, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.42222408166680614, |
|
"grad_norm": 0.45290902256965637, |
|
"learning_rate": 4.144536986653239e-05, |
|
"loss": 0.8243, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.4227261317044599, |
|
"grad_norm": 0.4473222494125366, |
|
"learning_rate": 4.1340819294986076e-05, |
|
"loss": 0.8137, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.42322818174211363, |
|
"grad_norm": 0.42771241068840027, |
|
"learning_rate": 4.1236307740357173e-05, |
|
"loss": 0.8189, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.4237302317797674, |
|
"grad_norm": 0.45651838183403015, |
|
"learning_rate": 4.113183567355846e-05, |
|
"loss": 0.8224, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.4242322818174211, |
|
"grad_norm": 0.4706350266933441, |
|
"learning_rate": 4.102740356532473e-05, |
|
"loss": 0.8297, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.42473433185507486, |
|
"grad_norm": 0.4705800712108612, |
|
"learning_rate": 4.092301188621084e-05, |
|
"loss": 0.7732, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.42523638189272867, |
|
"grad_norm": 0.5137692093849182, |
|
"learning_rate": 4.081866110658934e-05, |
|
"loss": 0.8374, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.4257384319303824, |
|
"grad_norm": 0.5054532885551453, |
|
"learning_rate": 4.0714351696648614e-05, |
|
"loss": 0.8556, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.42624048196803616, |
|
"grad_norm": 0.5825408697128296, |
|
"learning_rate": 4.061008412639055e-05, |
|
"loss": 0.8321, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.4267425320056899, |
|
"grad_norm": 0.6395136117935181, |
|
"learning_rate": 4.050585886562858e-05, |
|
"loss": 0.721, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.42724458204334365, |
|
"grad_norm": 0.5878275632858276, |
|
"learning_rate": 4.0401676383985484e-05, |
|
"loss": 1.3045, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.4277466320809974, |
|
"grad_norm": 0.3765466511249542, |
|
"learning_rate": 4.0297537150891235e-05, |
|
"loss": 1.1244, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.42824868211865114, |
|
"grad_norm": 0.38248923420906067, |
|
"learning_rate": 4.0193441635581e-05, |
|
"loss": 1.1962, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.42875073215630494, |
|
"grad_norm": 0.3714083433151245, |
|
"learning_rate": 4.008939030709291e-05, |
|
"loss": 1.026, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.4292527821939587, |
|
"grad_norm": 0.3839676082134247, |
|
"learning_rate": 3.998538363426605e-05, |
|
"loss": 1.101, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.4297548322316124, |
|
"grad_norm": 0.3552037477493286, |
|
"learning_rate": 3.988142208573822e-05, |
|
"loss": 1.0671, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.43025688226926617, |
|
"grad_norm": 0.36277374625205994, |
|
"learning_rate": 3.977750612994396e-05, |
|
"loss": 1.115, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.4307589323069199, |
|
"grad_norm": 0.3462297022342682, |
|
"learning_rate": 3.9673636235112376e-05, |
|
"loss": 1.0309, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.43126098234457366, |
|
"grad_norm": 0.3610150218009949, |
|
"learning_rate": 3.956981286926498e-05, |
|
"loss": 1.0359, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.4317630323822274, |
|
"grad_norm": 0.35921838879585266, |
|
"learning_rate": 3.94660365002137e-05, |
|
"loss": 1.0397, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.4322650824198812, |
|
"grad_norm": 0.3716135621070862, |
|
"learning_rate": 3.93623075955586e-05, |
|
"loss": 1.0673, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.43276713245753495, |
|
"grad_norm": 0.37005794048309326, |
|
"learning_rate": 3.925862662268602e-05, |
|
"loss": 1.0354, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.4332691824951887, |
|
"grad_norm": 0.34723684191703796, |
|
"learning_rate": 3.9154994048766184e-05, |
|
"loss": 1.0334, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.43377123253284244, |
|
"grad_norm": 0.3506997525691986, |
|
"learning_rate": 3.905141034075135e-05, |
|
"loss": 0.9656, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.4342732825704962, |
|
"grad_norm": 0.37688568234443665, |
|
"learning_rate": 3.894787596537352e-05, |
|
"loss": 0.9302, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.43477533260814993, |
|
"grad_norm": 0.3472607433795929, |
|
"learning_rate": 3.884439138914243e-05, |
|
"loss": 0.9686, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.4352773826458037, |
|
"grad_norm": 0.35843560099601746, |
|
"learning_rate": 3.874095707834349e-05, |
|
"loss": 0.9701, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.4357794326834574, |
|
"grad_norm": 0.3564199209213257, |
|
"learning_rate": 3.863757349903551e-05, |
|
"loss": 1.0456, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.4362814827211112, |
|
"grad_norm": 0.38524752855300903, |
|
"learning_rate": 3.853424111704879e-05, |
|
"loss": 0.9603, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.43678353275876497, |
|
"grad_norm": 0.3552170693874359, |
|
"learning_rate": 3.843096039798293e-05, |
|
"loss": 0.9274, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.4372855827964187, |
|
"grad_norm": 0.37275344133377075, |
|
"learning_rate": 3.832773180720475e-05, |
|
"loss": 1.0213, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.43778763283407246, |
|
"grad_norm": 0.3630153238773346, |
|
"learning_rate": 3.822455580984613e-05, |
|
"loss": 0.9482, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.4382896828717262, |
|
"grad_norm": 0.36190661787986755, |
|
"learning_rate": 3.8121432870802045e-05, |
|
"loss": 0.881, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.43879173290937995, |
|
"grad_norm": 0.3701936602592468, |
|
"learning_rate": 3.801836345472841e-05, |
|
"loss": 1.0065, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.4392937829470337, |
|
"grad_norm": 0.4397743344306946, |
|
"learning_rate": 3.791534802603988e-05, |
|
"loss": 0.9938, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.4397958329846875, |
|
"grad_norm": 0.36815145611763, |
|
"learning_rate": 3.781238704890793e-05, |
|
"loss": 0.9628, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.44029788302234124, |
|
"grad_norm": 0.3762166500091553, |
|
"learning_rate": 3.7709480987258636e-05, |
|
"loss": 0.9478, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.440799933059995, |
|
"grad_norm": 0.39231258630752563, |
|
"learning_rate": 3.760663030477072e-05, |
|
"loss": 1.0166, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.44130198309764873, |
|
"grad_norm": 0.38583433628082275, |
|
"learning_rate": 3.750383546487324e-05, |
|
"loss": 0.9232, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.4418040331353025, |
|
"grad_norm": 0.3934246301651001, |
|
"learning_rate": 3.740109693074375e-05, |
|
"loss": 0.9657, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.4423060831729562, |
|
"grad_norm": 0.4055297374725342, |
|
"learning_rate": 3.729841516530604e-05, |
|
"loss": 0.9054, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.44280813321060997, |
|
"grad_norm": 0.4082297682762146, |
|
"learning_rate": 3.7195790631228136e-05, |
|
"loss": 0.9365, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.44331018324826377, |
|
"grad_norm": 0.39798596501350403, |
|
"learning_rate": 3.709322379092019e-05, |
|
"loss": 0.9023, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.4438122332859175, |
|
"grad_norm": 0.418045312166214, |
|
"learning_rate": 3.6990715106532356e-05, |
|
"loss": 0.9233, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.44431428332357126, |
|
"grad_norm": 0.4316072463989258, |
|
"learning_rate": 3.68882650399528e-05, |
|
"loss": 0.8931, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.444816333361225, |
|
"grad_norm": 0.42850467562675476, |
|
"learning_rate": 3.6785874052805516e-05, |
|
"loss": 0.8839, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.44531838339887875, |
|
"grad_norm": 0.4238118529319763, |
|
"learning_rate": 3.6683542606448347e-05, |
|
"loss": 0.9291, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.4458204334365325, |
|
"grad_norm": 0.415999174118042, |
|
"learning_rate": 3.658127116197079e-05, |
|
"loss": 0.9257, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.44632248347418624, |
|
"grad_norm": 0.4444602131843567, |
|
"learning_rate": 3.6479060180192034e-05, |
|
"loss": 0.8785, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.44682453351184004, |
|
"grad_norm": 0.4339217245578766, |
|
"learning_rate": 3.637691012165886e-05, |
|
"loss": 0.7952, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.4473265835494938, |
|
"grad_norm": 0.4458482563495636, |
|
"learning_rate": 3.627482144664344e-05, |
|
"loss": 0.8247, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.44782863358714753, |
|
"grad_norm": 0.4593295454978943, |
|
"learning_rate": 3.6172794615141446e-05, |
|
"loss": 0.8401, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.4483306836248013, |
|
"grad_norm": 0.47604867815971375, |
|
"learning_rate": 3.607083008686985e-05, |
|
"loss": 0.8271, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.448832733662455, |
|
"grad_norm": 0.45923951268196106, |
|
"learning_rate": 3.596892832126494e-05, |
|
"loss": 0.858, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.44933478370010876, |
|
"grad_norm": 0.4550018608570099, |
|
"learning_rate": 3.586708977748012e-05, |
|
"loss": 0.7788, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.4498368337377625, |
|
"grad_norm": 0.4726627469062805, |
|
"learning_rate": 3.5765314914384026e-05, |
|
"loss": 0.8576, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.4503388837754163, |
|
"grad_norm": 0.4911380708217621, |
|
"learning_rate": 3.5663604190558296e-05, |
|
"loss": 0.8507, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.45084093381307005, |
|
"grad_norm": 0.5006689429283142, |
|
"learning_rate": 3.556195806429559e-05, |
|
"loss": 0.7908, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.4513429838507238, |
|
"grad_norm": 0.6167373061180115, |
|
"learning_rate": 3.546037699359751e-05, |
|
"loss": 0.7922, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.45184503388837755, |
|
"grad_norm": 0.6547103524208069, |
|
"learning_rate": 3.5358861436172485e-05, |
|
"loss": 0.6946, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.45184503388837755, |
|
"eval_loss": 0.9426594972610474, |
|
"eval_runtime": 710.4868, |
|
"eval_samples_per_second": 21.249, |
|
"eval_steps_per_second": 2.657, |
|
"step": 900 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.249771756744081e+18, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|