{
"best_metric": 0.9426594972610474,
"best_model_checkpoint": "miner_id_24/checkpoint-900",
"epoch": 0.45184503388837755,
"eval_steps": 150,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005020500376537528,
"grad_norm": 2.236064910888672,
"learning_rate": 5e-06,
"loss": 2.3327,
"step": 1
},
{
"epoch": 0.0005020500376537528,
"eval_loss": 2.112123489379883,
"eval_runtime": 710.0183,
"eval_samples_per_second": 21.263,
"eval_steps_per_second": 2.659,
"step": 1
},
{
"epoch": 0.0010041000753075056,
"grad_norm": 2.1631500720977783,
"learning_rate": 1e-05,
"loss": 2.1082,
"step": 2
},
{
"epoch": 0.0015061501129612586,
"grad_norm": 2.1660964488983154,
"learning_rate": 1.5e-05,
"loss": 2.1297,
"step": 3
},
{
"epoch": 0.0020082001506150113,
"grad_norm": 2.329266309738159,
"learning_rate": 2e-05,
"loss": 2.1222,
"step": 4
},
{
"epoch": 0.002510250188268764,
"grad_norm": 2.6099958419799805,
"learning_rate": 2.5e-05,
"loss": 1.9193,
"step": 5
},
{
"epoch": 0.003012300225922517,
"grad_norm": 1.4518094062805176,
"learning_rate": 3e-05,
"loss": 1.8033,
"step": 6
},
{
"epoch": 0.0035143502635762696,
"grad_norm": 1.176897644996643,
"learning_rate": 3.5e-05,
"loss": 1.8032,
"step": 7
},
{
"epoch": 0.0040164003012300225,
"grad_norm": 0.9449969530105591,
"learning_rate": 4e-05,
"loss": 1.7104,
"step": 8
},
{
"epoch": 0.0045184503388837755,
"grad_norm": 1.2403700351715088,
"learning_rate": 4.5e-05,
"loss": 1.697,
"step": 9
},
{
"epoch": 0.005020500376537528,
"grad_norm": 1.3342478275299072,
"learning_rate": 5e-05,
"loss": 1.6163,
"step": 10
},
{
"epoch": 0.005522550414191281,
"grad_norm": 1.2211023569107056,
"learning_rate": 5.500000000000001e-05,
"loss": 1.6009,
"step": 11
},
{
"epoch": 0.006024600451845034,
"grad_norm": 0.969095766544342,
"learning_rate": 6e-05,
"loss": 1.5262,
"step": 12
},
{
"epoch": 0.006526650489498786,
"grad_norm": 0.9153647422790527,
"learning_rate": 6.500000000000001e-05,
"loss": 1.5286,
"step": 13
},
{
"epoch": 0.007028700527152539,
"grad_norm": 0.8192627429962158,
"learning_rate": 7e-05,
"loss": 1.444,
"step": 14
},
{
"epoch": 0.007530750564806292,
"grad_norm": 0.753913938999176,
"learning_rate": 7.500000000000001e-05,
"loss": 1.4985,
"step": 15
},
{
"epoch": 0.008032800602460045,
"grad_norm": 0.7413599491119385,
"learning_rate": 8e-05,
"loss": 1.4897,
"step": 16
},
{
"epoch": 0.008534850640113799,
"grad_norm": 0.6402814388275146,
"learning_rate": 8.5e-05,
"loss": 1.4066,
"step": 17
},
{
"epoch": 0.009036900677767551,
"grad_norm": 0.546452522277832,
"learning_rate": 9e-05,
"loss": 1.377,
"step": 18
},
{
"epoch": 0.009538950715421303,
"grad_norm": 0.5637431144714355,
"learning_rate": 9.5e-05,
"loss": 1.3894,
"step": 19
},
{
"epoch": 0.010041000753075057,
"grad_norm": 0.5624980330467224,
"learning_rate": 0.0001,
"loss": 1.4045,
"step": 20
},
{
"epoch": 0.010543050790728809,
"grad_norm": 0.5827409029006958,
"learning_rate": 9.999988735390004e-05,
"loss": 1.3753,
"step": 21
},
{
"epoch": 0.011045100828382563,
"grad_norm": 0.5779702067375183,
"learning_rate": 9.999954941610768e-05,
"loss": 1.3436,
"step": 22
},
{
"epoch": 0.011547150866036315,
"grad_norm": 0.5667853951454163,
"learning_rate": 9.999898618814565e-05,
"loss": 1.2881,
"step": 23
},
{
"epoch": 0.012049200903690068,
"grad_norm": 0.575544536113739,
"learning_rate": 9.999819767255174e-05,
"loss": 1.2713,
"step": 24
},
{
"epoch": 0.01255125094134382,
"grad_norm": 0.5908040404319763,
"learning_rate": 9.99971838728789e-05,
"loss": 1.3644,
"step": 25
},
{
"epoch": 0.013053300978997573,
"grad_norm": 0.6002948880195618,
"learning_rate": 9.999594479369514e-05,
"loss": 1.2234,
"step": 26
},
{
"epoch": 0.013555351016651326,
"grad_norm": 0.5986592769622803,
"learning_rate": 9.999448044058358e-05,
"loss": 1.2277,
"step": 27
},
{
"epoch": 0.014057401054305078,
"grad_norm": 0.7111369967460632,
"learning_rate": 9.999279082014232e-05,
"loss": 1.287,
"step": 28
},
{
"epoch": 0.014559451091958832,
"grad_norm": 0.5424044728279114,
"learning_rate": 9.999087593998458e-05,
"loss": 1.2089,
"step": 29
},
{
"epoch": 0.015061501129612584,
"grad_norm": 0.5523831248283386,
"learning_rate": 9.998873580873848e-05,
"loss": 1.2078,
"step": 30
},
{
"epoch": 0.015563551167266338,
"grad_norm": 0.567329466342926,
"learning_rate": 9.998637043604711e-05,
"loss": 1.2944,
"step": 31
},
{
"epoch": 0.01606560120492009,
"grad_norm": 0.5832935571670532,
"learning_rate": 9.99837798325685e-05,
"loss": 1.1488,
"step": 32
},
{
"epoch": 0.016567651242573842,
"grad_norm": 0.5010744333267212,
"learning_rate": 9.998096400997549e-05,
"loss": 1.1848,
"step": 33
},
{
"epoch": 0.017069701280227598,
"grad_norm": 0.5734532475471497,
"learning_rate": 9.997792298095572e-05,
"loss": 1.2756,
"step": 34
},
{
"epoch": 0.01757175131788135,
"grad_norm": 0.5715062022209167,
"learning_rate": 9.997465675921163e-05,
"loss": 1.1441,
"step": 35
},
{
"epoch": 0.018073801355535102,
"grad_norm": 0.5624764561653137,
"learning_rate": 9.997116535946028e-05,
"loss": 1.2006,
"step": 36
},
{
"epoch": 0.018575851393188854,
"grad_norm": 0.5436444282531738,
"learning_rate": 9.996744879743337e-05,
"loss": 1.1561,
"step": 37
},
{
"epoch": 0.019077901430842606,
"grad_norm": 0.6079046130180359,
"learning_rate": 9.996350708987713e-05,
"loss": 1.1539,
"step": 38
},
{
"epoch": 0.01957995146849636,
"grad_norm": 0.6179251074790955,
"learning_rate": 9.995934025455235e-05,
"loss": 1.0866,
"step": 39
},
{
"epoch": 0.020082001506150114,
"grad_norm": 0.5955487489700317,
"learning_rate": 9.995494831023409e-05,
"loss": 1.1225,
"step": 40
},
{
"epoch": 0.020584051543803866,
"grad_norm": 0.6442997455596924,
"learning_rate": 9.995033127671174e-05,
"loss": 1.0479,
"step": 41
},
{
"epoch": 0.021086101581457618,
"grad_norm": 0.6124761700630188,
"learning_rate": 9.994548917478899e-05,
"loss": 1.1356,
"step": 42
},
{
"epoch": 0.021588151619111373,
"grad_norm": 0.5994246006011963,
"learning_rate": 9.994042202628357e-05,
"loss": 1.0242,
"step": 43
},
{
"epoch": 0.022090201656765125,
"grad_norm": 0.6428829431533813,
"learning_rate": 9.993512985402724e-05,
"loss": 1.1755,
"step": 44
},
{
"epoch": 0.022592251694418877,
"grad_norm": 0.6374857425689697,
"learning_rate": 9.992961268186573e-05,
"loss": 1.1132,
"step": 45
},
{
"epoch": 0.02309430173207263,
"grad_norm": 0.6873119473457336,
"learning_rate": 9.992387053465857e-05,
"loss": 1.1258,
"step": 46
},
{
"epoch": 0.02359635176972638,
"grad_norm": 0.7355693578720093,
"learning_rate": 9.991790343827895e-05,
"loss": 1.1345,
"step": 47
},
{
"epoch": 0.024098401807380137,
"grad_norm": 0.724719762802124,
"learning_rate": 9.991171141961369e-05,
"loss": 1.0772,
"step": 48
},
{
"epoch": 0.02460045184503389,
"grad_norm": 0.7716226577758789,
"learning_rate": 9.990529450656303e-05,
"loss": 1.0207,
"step": 49
},
{
"epoch": 0.02510250188268764,
"grad_norm": 0.8283296227455139,
"learning_rate": 9.989865272804063e-05,
"loss": 0.9559,
"step": 50
},
{
"epoch": 0.025604551920341393,
"grad_norm": 1.4440653324127197,
"learning_rate": 9.989178611397327e-05,
"loss": 1.5617,
"step": 51
},
{
"epoch": 0.026106601957995145,
"grad_norm": 1.06277334690094,
"learning_rate": 9.988469469530086e-05,
"loss": 1.5022,
"step": 52
},
{
"epoch": 0.0266086519956489,
"grad_norm": 0.6431909203529358,
"learning_rate": 9.987737850397623e-05,
"loss": 1.4657,
"step": 53
},
{
"epoch": 0.027110702033302653,
"grad_norm": 0.6749468445777893,
"learning_rate": 9.986983757296498e-05,
"loss": 1.3443,
"step": 54
},
{
"epoch": 0.027612752070956405,
"grad_norm": 0.6402671933174133,
"learning_rate": 9.986207193624536e-05,
"loss": 1.1939,
"step": 55
},
{
"epoch": 0.028114802108610157,
"grad_norm": 0.5656529068946838,
"learning_rate": 9.985408162880813e-05,
"loss": 1.2935,
"step": 56
},
{
"epoch": 0.028616852146263912,
"grad_norm": 0.5140467286109924,
"learning_rate": 9.98458666866564e-05,
"loss": 1.256,
"step": 57
},
{
"epoch": 0.029118902183917664,
"grad_norm": 0.4963027238845825,
"learning_rate": 9.983742714680538e-05,
"loss": 1.2508,
"step": 58
},
{
"epoch": 0.029620952221571416,
"grad_norm": 0.5174992680549622,
"learning_rate": 9.982876304728232e-05,
"loss": 1.2935,
"step": 59
},
{
"epoch": 0.03012300225922517,
"grad_norm": 0.45506641268730164,
"learning_rate": 9.981987442712633e-05,
"loss": 1.2477,
"step": 60
},
{
"epoch": 0.03062505229687892,
"grad_norm": 0.5008202195167542,
"learning_rate": 9.981076132638812e-05,
"loss": 1.253,
"step": 61
},
{
"epoch": 0.031127102334532676,
"grad_norm": 0.4946709871292114,
"learning_rate": 9.98014237861299e-05,
"loss": 1.1136,
"step": 62
},
{
"epoch": 0.03162915237218643,
"grad_norm": 0.4489033818244934,
"learning_rate": 9.979186184842517e-05,
"loss": 1.2179,
"step": 63
},
{
"epoch": 0.03213120240984018,
"grad_norm": 0.42558974027633667,
"learning_rate": 9.978207555635856e-05,
"loss": 1.1858,
"step": 64
},
{
"epoch": 0.03263325244749393,
"grad_norm": 0.4478650987148285,
"learning_rate": 9.977206495402554e-05,
"loss": 1.2091,
"step": 65
},
{
"epoch": 0.033135302485147684,
"grad_norm": 0.4109612703323364,
"learning_rate": 9.976183008653233e-05,
"loss": 1.1997,
"step": 66
},
{
"epoch": 0.033637352522801436,
"grad_norm": 0.419210821390152,
"learning_rate": 9.975137099999566e-05,
"loss": 1.1183,
"step": 67
},
{
"epoch": 0.034139402560455195,
"grad_norm": 0.43940436840057373,
"learning_rate": 9.974068774154251e-05,
"loss": 1.2011,
"step": 68
},
{
"epoch": 0.03464145259810895,
"grad_norm": 0.45610693097114563,
"learning_rate": 9.972978035931001e-05,
"loss": 1.2022,
"step": 69
},
{
"epoch": 0.0351435026357627,
"grad_norm": 0.45957621932029724,
"learning_rate": 9.971864890244513e-05,
"loss": 1.1934,
"step": 70
},
{
"epoch": 0.03564555267341645,
"grad_norm": 0.4197078347206116,
"learning_rate": 9.970729342110446e-05,
"loss": 1.1708,
"step": 71
},
{
"epoch": 0.036147602711070204,
"grad_norm": 0.4471045136451721,
"learning_rate": 9.969571396645399e-05,
"loss": 1.1901,
"step": 72
},
{
"epoch": 0.036649652748723956,
"grad_norm": 0.510300874710083,
"learning_rate": 9.9683910590669e-05,
"loss": 1.1461,
"step": 73
},
{
"epoch": 0.03715170278637771,
"grad_norm": 0.417583703994751,
"learning_rate": 9.967188334693363e-05,
"loss": 1.1288,
"step": 74
},
{
"epoch": 0.03765375282403146,
"grad_norm": 0.4619269371032715,
"learning_rate": 9.965963228944078e-05,
"loss": 1.1442,
"step": 75
},
{
"epoch": 0.03815580286168521,
"grad_norm": 0.44993823766708374,
"learning_rate": 9.964715747339178e-05,
"loss": 1.1821,
"step": 76
},
{
"epoch": 0.03865785289933897,
"grad_norm": 0.4357517659664154,
"learning_rate": 9.963445895499622e-05,
"loss": 1.0655,
"step": 77
},
{
"epoch": 0.03915990293699272,
"grad_norm": 0.43514949083328247,
"learning_rate": 9.962153679147161e-05,
"loss": 1.1104,
"step": 78
},
{
"epoch": 0.039661952974646475,
"grad_norm": 0.4674883484840393,
"learning_rate": 9.960839104104327e-05,
"loss": 1.056,
"step": 79
},
{
"epoch": 0.04016400301230023,
"grad_norm": 0.463422030210495,
"learning_rate": 9.959502176294383e-05,
"loss": 1.169,
"step": 80
},
{
"epoch": 0.04066605304995398,
"grad_norm": 0.4640124440193176,
"learning_rate": 9.958142901741324e-05,
"loss": 1.0641,
"step": 81
},
{
"epoch": 0.04116810308760773,
"grad_norm": 0.4530577063560486,
"learning_rate": 9.956761286569824e-05,
"loss": 1.1478,
"step": 82
},
{
"epoch": 0.04167015312526148,
"grad_norm": 0.4701811671257019,
"learning_rate": 9.955357337005227e-05,
"loss": 1.0432,
"step": 83
},
{
"epoch": 0.042172203162915235,
"grad_norm": 0.49071431159973145,
"learning_rate": 9.953931059373511e-05,
"loss": 1.1219,
"step": 84
},
{
"epoch": 0.04267425320056899,
"grad_norm": 0.4607682228088379,
"learning_rate": 9.95248246010126e-05,
"loss": 1.0986,
"step": 85
},
{
"epoch": 0.043176303238222746,
"grad_norm": 0.4900347888469696,
"learning_rate": 9.951011545715636e-05,
"loss": 1.1192,
"step": 86
},
{
"epoch": 0.0436783532758765,
"grad_norm": 0.49459338188171387,
"learning_rate": 9.94951832284435e-05,
"loss": 1.1103,
"step": 87
},
{
"epoch": 0.04418040331353025,
"grad_norm": 0.48701831698417664,
"learning_rate": 9.948002798215632e-05,
"loss": 1.0517,
"step": 88
},
{
"epoch": 0.044682453351184,
"grad_norm": 0.4620456397533417,
"learning_rate": 9.946464978658199e-05,
"loss": 1.0084,
"step": 89
},
{
"epoch": 0.045184503388837755,
"grad_norm": 0.5349761247634888,
"learning_rate": 9.944904871101228e-05,
"loss": 1.1153,
"step": 90
},
{
"epoch": 0.04568655342649151,
"grad_norm": 0.5464606285095215,
"learning_rate": 9.943322482574315e-05,
"loss": 0.9737,
"step": 91
},
{
"epoch": 0.04618860346414526,
"grad_norm": 0.5389485955238342,
"learning_rate": 9.941717820207461e-05,
"loss": 0.9921,
"step": 92
},
{
"epoch": 0.04669065350179901,
"grad_norm": 0.5406158566474915,
"learning_rate": 9.940090891231025e-05,
"loss": 1.0869,
"step": 93
},
{
"epoch": 0.04719270353945276,
"grad_norm": 0.5455155968666077,
"learning_rate": 9.938441702975689e-05,
"loss": 1.0236,
"step": 94
},
{
"epoch": 0.04769475357710652,
"grad_norm": 0.5470486283302307,
"learning_rate": 9.936770262872443e-05,
"loss": 1.0166,
"step": 95
},
{
"epoch": 0.048196803614760274,
"grad_norm": 0.5690567493438721,
"learning_rate": 9.935076578452534e-05,
"loss": 1.0256,
"step": 96
},
{
"epoch": 0.048698853652414026,
"grad_norm": 0.5862897038459778,
"learning_rate": 9.933360657347441e-05,
"loss": 0.9532,
"step": 97
},
{
"epoch": 0.04920090369006778,
"grad_norm": 0.5633604526519775,
"learning_rate": 9.931622507288834e-05,
"loss": 0.9018,
"step": 98
},
{
"epoch": 0.04970295372772153,
"grad_norm": 0.6516064405441284,
"learning_rate": 9.929862136108549e-05,
"loss": 0.9507,
"step": 99
},
{
"epoch": 0.05020500376537528,
"grad_norm": 0.8028525114059448,
"learning_rate": 9.928079551738543e-05,
"loss": 0.8808,
"step": 100
},
{
"epoch": 0.050707053803029034,
"grad_norm": 1.5746766328811646,
"learning_rate": 9.926274762210862e-05,
"loss": 1.6471,
"step": 101
},
{
"epoch": 0.051209103840682786,
"grad_norm": 0.919540524482727,
"learning_rate": 9.924447775657605e-05,
"loss": 1.4097,
"step": 102
},
{
"epoch": 0.05171115387833654,
"grad_norm": 0.5336892008781433,
"learning_rate": 9.922598600310893e-05,
"loss": 1.2989,
"step": 103
},
{
"epoch": 0.05221320391599029,
"grad_norm": 0.5801246166229248,
"learning_rate": 9.920727244502818e-05,
"loss": 1.2606,
"step": 104
},
{
"epoch": 0.05271525395364405,
"grad_norm": 0.5943406224250793,
"learning_rate": 9.918833716665419e-05,
"loss": 1.1681,
"step": 105
},
{
"epoch": 0.0532173039912978,
"grad_norm": 0.49195364117622375,
"learning_rate": 9.916918025330635e-05,
"loss": 1.2577,
"step": 106
},
{
"epoch": 0.05371935402895155,
"grad_norm": 0.5099748373031616,
"learning_rate": 9.914980179130273e-05,
"loss": 1.262,
"step": 107
},
{
"epoch": 0.054221404066605305,
"grad_norm": 0.4685007929801941,
"learning_rate": 9.913020186795967e-05,
"loss": 1.1403,
"step": 108
},
{
"epoch": 0.05472345410425906,
"grad_norm": 0.45162394642829895,
"learning_rate": 9.911038057159135e-05,
"loss": 1.213,
"step": 109
},
{
"epoch": 0.05522550414191281,
"grad_norm": 0.4480658173561096,
"learning_rate": 9.909033799150946e-05,
"loss": 1.1956,
"step": 110
},
{
"epoch": 0.05572755417956656,
"grad_norm": 0.460193395614624,
"learning_rate": 9.907007421802272e-05,
"loss": 1.2344,
"step": 111
},
{
"epoch": 0.056229604217220314,
"grad_norm": 0.46196773648262024,
"learning_rate": 9.904958934243654e-05,
"loss": 1.0947,
"step": 112
},
{
"epoch": 0.056731654254874066,
"grad_norm": 0.430660218000412,
"learning_rate": 9.902888345705258e-05,
"loss": 1.0833,
"step": 113
},
{
"epoch": 0.057233704292527825,
"grad_norm": 0.4444407522678375,
"learning_rate": 9.900795665516831e-05,
"loss": 1.1319,
"step": 114
},
{
"epoch": 0.05773575433018158,
"grad_norm": 0.4496801793575287,
"learning_rate": 9.898680903107666e-05,
"loss": 1.1493,
"step": 115
},
{
"epoch": 0.05823780436783533,
"grad_norm": 0.40639162063598633,
"learning_rate": 9.89654406800655e-05,
"loss": 1.1085,
"step": 116
},
{
"epoch": 0.05873985440548908,
"grad_norm": 0.43786799907684326,
"learning_rate": 9.894385169841731e-05,
"loss": 1.1228,
"step": 117
},
{
"epoch": 0.05924190444314283,
"grad_norm": 0.4193272888660431,
"learning_rate": 9.892204218340866e-05,
"loss": 1.1277,
"step": 118
},
{
"epoch": 0.059743954480796585,
"grad_norm": 0.4199259281158447,
"learning_rate": 9.890001223330983e-05,
"loss": 1.1616,
"step": 119
},
{
"epoch": 0.06024600451845034,
"grad_norm": 0.42120906710624695,
"learning_rate": 9.887776194738432e-05,
"loss": 1.0961,
"step": 120
},
{
"epoch": 0.06074805455610409,
"grad_norm": 0.5002040863037109,
"learning_rate": 9.885529142588845e-05,
"loss": 1.2211,
"step": 121
},
{
"epoch": 0.06125010459375784,
"grad_norm": 0.4081428050994873,
"learning_rate": 9.883260077007092e-05,
"loss": 1.1441,
"step": 122
},
{
"epoch": 0.0617521546314116,
"grad_norm": 0.4227527379989624,
"learning_rate": 9.880969008217224e-05,
"loss": 1.0954,
"step": 123
},
{
"epoch": 0.06225420466906535,
"grad_norm": 0.4485255181789398,
"learning_rate": 9.878655946542443e-05,
"loss": 1.1613,
"step": 124
},
{
"epoch": 0.0627562547067191,
"grad_norm": 0.4632939100265503,
"learning_rate": 9.876320902405042e-05,
"loss": 1.1114,
"step": 125
},
{
"epoch": 0.06325830474437286,
"grad_norm": 0.43671849370002747,
"learning_rate": 9.873963886326365e-05,
"loss": 1.1023,
"step": 126
},
{
"epoch": 0.06376035478202662,
"grad_norm": 0.41638222336769104,
"learning_rate": 9.871584908926763e-05,
"loss": 1.0879,
"step": 127
},
{
"epoch": 0.06426240481968036,
"grad_norm": 0.46212083101272583,
"learning_rate": 9.869183980925532e-05,
"loss": 1.0654,
"step": 128
},
{
"epoch": 0.06476445485733412,
"grad_norm": 0.4403015971183777,
"learning_rate": 9.86676111314088e-05,
"loss": 1.1132,
"step": 129
},
{
"epoch": 0.06526650489498786,
"grad_norm": 0.46132922172546387,
"learning_rate": 9.864316316489873e-05,
"loss": 1.1063,
"step": 130
},
{
"epoch": 0.06576855493264162,
"grad_norm": 0.4559481739997864,
"learning_rate": 9.861849601988383e-05,
"loss": 1.0584,
"step": 131
},
{
"epoch": 0.06627060497029537,
"grad_norm": 0.451659619808197,
"learning_rate": 9.85936098075104e-05,
"loss": 1.0576,
"step": 132
},
{
"epoch": 0.06677265500794913,
"grad_norm": 0.44598105549812317,
"learning_rate": 9.856850463991186e-05,
"loss": 1.0529,
"step": 133
},
{
"epoch": 0.06727470504560287,
"grad_norm": 0.4388767182826996,
"learning_rate": 9.85431806302081e-05,
"loss": 1.0369,
"step": 134
},
{
"epoch": 0.06777675508325663,
"grad_norm": 0.4496581554412842,
"learning_rate": 9.851763789250525e-05,
"loss": 1.0197,
"step": 135
},
{
"epoch": 0.06827880512091039,
"grad_norm": 0.4703635573387146,
"learning_rate": 9.849187654189487e-05,
"loss": 1.0579,
"step": 136
},
{
"epoch": 0.06878085515856414,
"grad_norm": 0.48079192638397217,
"learning_rate": 9.846589669445355e-05,
"loss": 1.072,
"step": 137
},
{
"epoch": 0.0692829051962179,
"grad_norm": 0.4830515384674072,
"learning_rate": 9.843969846724247e-05,
"loss": 1.0606,
"step": 138
},
{
"epoch": 0.06978495523387164,
"grad_norm": 0.48843124508857727,
"learning_rate": 9.841328197830675e-05,
"loss": 1.0112,
"step": 139
},
{
"epoch": 0.0702870052715254,
"grad_norm": 0.5190821290016174,
"learning_rate": 9.838664734667495e-05,
"loss": 0.9057,
"step": 140
},
{
"epoch": 0.07078905530917914,
"grad_norm": 0.4610013961791992,
"learning_rate": 9.835979469235857e-05,
"loss": 0.9143,
"step": 141
},
{
"epoch": 0.0712911053468329,
"grad_norm": 0.5105639696121216,
"learning_rate": 9.83327241363515e-05,
"loss": 0.9804,
"step": 142
},
{
"epoch": 0.07179315538448665,
"grad_norm": 0.5167362093925476,
"learning_rate": 9.830543580062943e-05,
"loss": 1.0125,
"step": 143
},
{
"epoch": 0.07229520542214041,
"grad_norm": 0.48223769664764404,
"learning_rate": 9.827792980814933e-05,
"loss": 0.9303,
"step": 144
},
{
"epoch": 0.07279725545979417,
"grad_norm": 0.5426216125488281,
"learning_rate": 9.825020628284896e-05,
"loss": 0.9922,
"step": 145
},
{
"epoch": 0.07329930549744791,
"grad_norm": 0.5274797081947327,
"learning_rate": 9.822226534964614e-05,
"loss": 0.9352,
"step": 146
},
{
"epoch": 0.07380135553510167,
"grad_norm": 0.5349685549736023,
"learning_rate": 9.819410713443837e-05,
"loss": 0.9655,
"step": 147
},
{
"epoch": 0.07430340557275542,
"grad_norm": 0.5646516680717468,
"learning_rate": 9.81657317641022e-05,
"loss": 0.8978,
"step": 148
},
{
"epoch": 0.07480545561040917,
"grad_norm": 0.6755395531654358,
"learning_rate": 9.81371393664926e-05,
"loss": 0.9381,
"step": 149
},
{
"epoch": 0.07530750564806292,
"grad_norm": 0.7590980529785156,
"learning_rate": 9.810833007044247e-05,
"loss": 0.7978,
"step": 150
},
{
"epoch": 0.07530750564806292,
"eval_loss": 1.1477792263031006,
"eval_runtime": 710.8849,
"eval_samples_per_second": 21.237,
"eval_steps_per_second": 2.656,
"step": 150
},
{
"epoch": 0.07580955568571668,
"grad_norm": 0.9611290097236633,
"learning_rate": 9.807930400576199e-05,
"loss": 1.5282,
"step": 151
},
{
"epoch": 0.07631160572337042,
"grad_norm": 0.8264356851577759,
"learning_rate": 9.805006130323809e-05,
"loss": 1.3807,
"step": 152
},
{
"epoch": 0.07681365576102418,
"grad_norm": 0.6040631532669067,
"learning_rate": 9.802060209463382e-05,
"loss": 1.3432,
"step": 153
},
{
"epoch": 0.07731570579867794,
"grad_norm": 0.5833274722099304,
"learning_rate": 9.799092651268778e-05,
"loss": 1.2819,
"step": 154
},
{
"epoch": 0.07781775583633169,
"grad_norm": 0.555841863155365,
"learning_rate": 9.796103469111351e-05,
"loss": 1.1248,
"step": 155
},
{
"epoch": 0.07831980587398545,
"grad_norm": 0.4553599953651428,
"learning_rate": 9.79309267645989e-05,
"loss": 1.1664,
"step": 156
},
{
"epoch": 0.07882185591163919,
"grad_norm": 0.42019009590148926,
"learning_rate": 9.790060286880556e-05,
"loss": 1.2007,
"step": 157
},
{
"epoch": 0.07932390594929295,
"grad_norm": 0.48074933886528015,
"learning_rate": 9.787006314036824e-05,
"loss": 1.1545,
"step": 158
},
{
"epoch": 0.0798259559869467,
"grad_norm": 0.4458232820034027,
"learning_rate": 9.783930771689418e-05,
"loss": 1.0934,
"step": 159
},
{
"epoch": 0.08032800602460045,
"grad_norm": 0.4490962624549866,
"learning_rate": 9.780833673696254e-05,
"loss": 1.1753,
"step": 160
},
{
"epoch": 0.0808300560622542,
"grad_norm": 0.4294869601726532,
"learning_rate": 9.777715034012374e-05,
"loss": 1.1133,
"step": 161
},
{
"epoch": 0.08133210609990796,
"grad_norm": 0.4288542866706848,
"learning_rate": 9.774574866689877e-05,
"loss": 1.1664,
"step": 162
},
{
"epoch": 0.08183415613756172,
"grad_norm": 0.4405251443386078,
"learning_rate": 9.771413185877872e-05,
"loss": 1.1115,
"step": 163
},
{
"epoch": 0.08233620617521546,
"grad_norm": 0.41953468322753906,
"learning_rate": 9.768230005822395e-05,
"loss": 1.1264,
"step": 164
},
{
"epoch": 0.08283825621286922,
"grad_norm": 0.39969509840011597,
"learning_rate": 9.76502534086636e-05,
"loss": 1.056,
"step": 165
},
{
"epoch": 0.08334030625052297,
"grad_norm": 0.4138109087944031,
"learning_rate": 9.76179920544949e-05,
"loss": 1.1076,
"step": 166
},
{
"epoch": 0.08384235628817673,
"grad_norm": 0.412165105342865,
"learning_rate": 9.758551614108246e-05,
"loss": 1.1159,
"step": 167
},
{
"epoch": 0.08434440632583047,
"grad_norm": 0.38842642307281494,
"learning_rate": 9.755282581475769e-05,
"loss": 1.0444,
"step": 168
},
{
"epoch": 0.08484645636348423,
"grad_norm": 0.3983784019947052,
"learning_rate": 9.751992122281808e-05,
"loss": 1.1385,
"step": 169
},
{
"epoch": 0.08534850640113797,
"grad_norm": 0.42566153407096863,
"learning_rate": 9.74868025135266e-05,
"loss": 1.1183,
"step": 170
},
{
"epoch": 0.08585055643879173,
"grad_norm": 0.39850881695747375,
"learning_rate": 9.745346983611099e-05,
"loss": 1.0954,
"step": 171
},
{
"epoch": 0.08635260647644549,
"grad_norm": 0.39748743176460266,
"learning_rate": 9.741992334076308e-05,
"loss": 1.0581,
"step": 172
},
{
"epoch": 0.08685465651409924,
"grad_norm": 0.42799192667007446,
"learning_rate": 9.738616317863818e-05,
"loss": 1.1318,
"step": 173
},
{
"epoch": 0.087356706551753,
"grad_norm": 0.41576746106147766,
"learning_rate": 9.735218950185428e-05,
"loss": 1.1525,
"step": 174
},
{
"epoch": 0.08785875658940674,
"grad_norm": 0.4112211763858795,
"learning_rate": 9.731800246349148e-05,
"loss": 1.0731,
"step": 175
},
{
"epoch": 0.0883608066270605,
"grad_norm": 0.43050485849380493,
"learning_rate": 9.728360221759123e-05,
"loss": 1.0604,
"step": 176
},
{
"epoch": 0.08886285666471425,
"grad_norm": 0.44277775287628174,
"learning_rate": 9.72489889191557e-05,
"loss": 1.0127,
"step": 177
},
{
"epoch": 0.089364906702368,
"grad_norm": 0.442449152469635,
"learning_rate": 9.721416272414699e-05,
"loss": 1.039,
"step": 178
},
{
"epoch": 0.08986695674002175,
"grad_norm": 0.4507065415382385,
"learning_rate": 9.71791237894865e-05,
"loss": 1.0508,
"step": 179
},
{
"epoch": 0.09036900677767551,
"grad_norm": 0.4348186254501343,
"learning_rate": 9.714387227305422e-05,
"loss": 1.0597,
"step": 180
},
{
"epoch": 0.09087105681532927,
"grad_norm": 0.42365097999572754,
"learning_rate": 9.710840833368797e-05,
"loss": 1.0212,
"step": 181
},
{
"epoch": 0.09137310685298301,
"grad_norm": 0.4242313504219055,
"learning_rate": 9.707273213118271e-05,
"loss": 1.019,
"step": 182
},
{
"epoch": 0.09187515689063677,
"grad_norm": 0.4419156014919281,
"learning_rate": 9.703684382628989e-05,
"loss": 1.0509,
"step": 183
},
{
"epoch": 0.09237720692829052,
"grad_norm": 0.43379202485084534,
"learning_rate": 9.700074358071659e-05,
"loss": 1.0329,
"step": 184
},
{
"epoch": 0.09287925696594428,
"grad_norm": 0.44969063997268677,
"learning_rate": 9.696443155712486e-05,
"loss": 0.9929,
"step": 185
},
{
"epoch": 0.09338130700359802,
"grad_norm": 0.4435906410217285,
"learning_rate": 9.692790791913106e-05,
"loss": 1.0103,
"step": 186
},
{
"epoch": 0.09388335704125178,
"grad_norm": 0.4611569941043854,
"learning_rate": 9.689117283130498e-05,
"loss": 1.0245,
"step": 187
},
{
"epoch": 0.09438540707890553,
"grad_norm": 0.4579900801181793,
"learning_rate": 9.685422645916918e-05,
"loss": 1.0386,
"step": 188
},
{
"epoch": 0.09488745711655928,
"grad_norm": 0.4896557033061981,
"learning_rate": 9.681706896919829e-05,
"loss": 0.991,
"step": 189
},
{
"epoch": 0.09538950715421304,
"grad_norm": 0.4932405948638916,
"learning_rate": 9.67797005288181e-05,
"loss": 0.9557,
"step": 190
},
{
"epoch": 0.09589155719186679,
"grad_norm": 0.5124619603157043,
"learning_rate": 9.674212130640506e-05,
"loss": 0.9505,
"step": 191
},
{
"epoch": 0.09639360722952055,
"grad_norm": 0.5189158916473389,
"learning_rate": 9.670433147128521e-05,
"loss": 0.9757,
"step": 192
},
{
"epoch": 0.09689565726717429,
"grad_norm": 0.4920775890350342,
"learning_rate": 9.666633119373368e-05,
"loss": 0.925,
"step": 193
},
{
"epoch": 0.09739770730482805,
"grad_norm": 0.5255336761474609,
"learning_rate": 9.66281206449738e-05,
"loss": 0.9272,
"step": 194
},
{
"epoch": 0.0978997573424818,
"grad_norm": 0.5087072849273682,
"learning_rate": 9.65896999971763e-05,
"loss": 0.9373,
"step": 195
},
{
"epoch": 0.09840180738013556,
"grad_norm": 0.5356236100196838,
"learning_rate": 9.65510694234587e-05,
"loss": 0.9119,
"step": 196
},
{
"epoch": 0.0989038574177893,
"grad_norm": 0.5867013335227966,
"learning_rate": 9.651222909788427e-05,
"loss": 0.8701,
"step": 197
},
{
"epoch": 0.09940590745544306,
"grad_norm": 0.5810437202453613,
"learning_rate": 9.64731791954615e-05,
"loss": 0.8611,
"step": 198
},
{
"epoch": 0.0999079574930968,
"grad_norm": 0.6373634338378906,
"learning_rate": 9.643391989214312e-05,
"loss": 0.9195,
"step": 199
},
{
"epoch": 0.10041000753075056,
"grad_norm": 0.7272390723228455,
"learning_rate": 9.639445136482548e-05,
"loss": 0.8179,
"step": 200
},
{
"epoch": 0.10091205756840432,
"grad_norm": 0.711874783039093,
"learning_rate": 9.635477379134756e-05,
"loss": 1.3114,
"step": 201
},
{
"epoch": 0.10141410760605807,
"grad_norm": 0.6842883229255676,
"learning_rate": 9.631488735049033e-05,
"loss": 1.3263,
"step": 202
},
{
"epoch": 0.10191615764371183,
"grad_norm": 0.4919327199459076,
"learning_rate": 9.627479222197587e-05,
"loss": 1.1895,
"step": 203
},
{
"epoch": 0.10241820768136557,
"grad_norm": 0.4409739673137665,
"learning_rate": 9.623448858646657e-05,
"loss": 1.1812,
"step": 204
},
{
"epoch": 0.10292025771901933,
"grad_norm": 0.492781400680542,
"learning_rate": 9.619397662556435e-05,
"loss": 1.1623,
"step": 205
},
{
"epoch": 0.10342230775667308,
"grad_norm": 0.4331713616847992,
"learning_rate": 9.615325652180975e-05,
"loss": 1.1714,
"step": 206
},
{
"epoch": 0.10392435779432684,
"grad_norm": 0.41304811835289,
"learning_rate": 9.611232845868124e-05,
"loss": 1.1732,
"step": 207
},
{
"epoch": 0.10442640783198058,
"grad_norm": 0.4479162395000458,
"learning_rate": 9.607119262059425e-05,
"loss": 1.1447,
"step": 208
},
{
"epoch": 0.10492845786963434,
"grad_norm": 0.4152972102165222,
"learning_rate": 9.602984919290047e-05,
"loss": 1.1563,
"step": 209
},
{
"epoch": 0.1054305079072881,
"grad_norm": 0.421634703874588,
"learning_rate": 9.598829836188694e-05,
"loss": 1.1044,
"step": 210
},
{
"epoch": 0.10593255794494184,
"grad_norm": 0.41844844818115234,
"learning_rate": 9.594654031477521e-05,
"loss": 1.0942,
"step": 211
},
{
"epoch": 0.1064346079825956,
"grad_norm": 0.43519729375839233,
"learning_rate": 9.590457523972056e-05,
"loss": 1.0787,
"step": 212
},
{
"epoch": 0.10693665802024935,
"grad_norm": 0.39169546961784363,
"learning_rate": 9.5862403325811e-05,
"loss": 1.0474,
"step": 213
},
{
"epoch": 0.1074387080579031,
"grad_norm": 0.4049958884716034,
"learning_rate": 9.582002476306668e-05,
"loss": 1.1092,
"step": 214
},
{
"epoch": 0.10794075809555685,
"grad_norm": 0.41217753291130066,
"learning_rate": 9.577743974243874e-05,
"loss": 1.0595,
"step": 215
},
{
"epoch": 0.10844280813321061,
"grad_norm": 0.37548142671585083,
"learning_rate": 9.573464845580864e-05,
"loss": 1.0365,
"step": 216
},
{
"epoch": 0.10894485817086436,
"grad_norm": 0.3726944029331207,
"learning_rate": 9.569165109598725e-05,
"loss": 1.0813,
"step": 217
},
{
"epoch": 0.10944690820851811,
"grad_norm": 0.4017277657985687,
"learning_rate": 9.564844785671398e-05,
"loss": 1.066,
"step": 218
},
{
"epoch": 0.10994895824617187,
"grad_norm": 0.3842703700065613,
"learning_rate": 9.560503893265589e-05,
"loss": 1.0937,
"step": 219
},
{
"epoch": 0.11045100828382562,
"grad_norm": 0.37564817070961,
"learning_rate": 9.55614245194068e-05,
"loss": 1.0732,
"step": 220
},
{
"epoch": 0.11095305832147938,
"grad_norm": 0.3989981412887573,
"learning_rate": 9.551760481348644e-05,
"loss": 1.0755,
"step": 221
},
{
"epoch": 0.11145510835913312,
"grad_norm": 0.388481467962265,
"learning_rate": 9.547358001233959e-05,
"loss": 1.1052,
"step": 222
},
{
"epoch": 0.11195715839678688,
"grad_norm": 0.41220539808273315,
"learning_rate": 9.542935031433515e-05,
"loss": 1.1182,
"step": 223
},
{
"epoch": 0.11245920843444063,
"grad_norm": 0.4094482958316803,
"learning_rate": 9.538491591876522e-05,
"loss": 0.9925,
"step": 224
},
{
"epoch": 0.11296125847209439,
"grad_norm": 0.4174862802028656,
"learning_rate": 9.534027702584425e-05,
"loss": 1.0755,
"step": 225
},
{
"epoch": 0.11346330850974813,
"grad_norm": 0.4093203842639923,
"learning_rate": 9.529543383670814e-05,
"loss": 1.0757,
"step": 226
},
{
"epoch": 0.11396535854740189,
"grad_norm": 0.41040605306625366,
"learning_rate": 9.525038655341329e-05,
"loss": 1.016,
"step": 227
},
{
"epoch": 0.11446740858505565,
"grad_norm": 0.3920714855194092,
"learning_rate": 9.520513537893574e-05,
"loss": 0.9406,
"step": 228
},
{
"epoch": 0.1149694586227094,
"grad_norm": 0.4755348265171051,
"learning_rate": 9.515968051717022e-05,
"loss": 1.076,
"step": 229
},
{
"epoch": 0.11547150866036315,
"grad_norm": 0.43063998222351074,
"learning_rate": 9.511402217292926e-05,
"loss": 1.0341,
"step": 230
},
{
"epoch": 0.1159735586980169,
"grad_norm": 0.4011836647987366,
"learning_rate": 9.506816055194223e-05,
"loss": 1.0272,
"step": 231
},
{
"epoch": 0.11647560873567066,
"grad_norm": 0.4279603660106659,
"learning_rate": 9.502209586085444e-05,
"loss": 1.0628,
"step": 232
},
{
"epoch": 0.1169776587733244,
"grad_norm": 0.4363585412502289,
"learning_rate": 9.497582830722617e-05,
"loss": 1.0168,
"step": 233
},
{
"epoch": 0.11747970881097816,
"grad_norm": 0.44320476055145264,
"learning_rate": 9.492935809953185e-05,
"loss": 1.0361,
"step": 234
},
{
"epoch": 0.1179817588486319,
"grad_norm": 0.4238687753677368,
"learning_rate": 9.488268544715896e-05,
"loss": 0.9586,
"step": 235
},
{
"epoch": 0.11848380888628567,
"grad_norm": 0.41311508417129517,
"learning_rate": 9.483581056040719e-05,
"loss": 0.9994,
"step": 236
},
{
"epoch": 0.11898585892393942,
"grad_norm": 0.4387330710887909,
"learning_rate": 9.478873365048748e-05,
"loss": 0.9888,
"step": 237
},
{
"epoch": 0.11948790896159317,
"grad_norm": 0.4367882013320923,
"learning_rate": 9.474145492952102e-05,
"loss": 0.9316,
"step": 238
},
{
"epoch": 0.11998995899924693,
"grad_norm": 0.47169986367225647,
"learning_rate": 9.469397461053837e-05,
"loss": 0.9869,
"step": 239
},
{
"epoch": 0.12049200903690067,
"grad_norm": 0.4648449420928955,
"learning_rate": 9.464629290747842e-05,
"loss": 0.9891,
"step": 240
},
{
"epoch": 0.12099405907455443,
"grad_norm": 0.47016164660453796,
"learning_rate": 9.459841003518753e-05,
"loss": 0.8839,
"step": 241
},
{
"epoch": 0.12149610911220818,
"grad_norm": 0.46333047747612,
"learning_rate": 9.45503262094184e-05,
"loss": 0.871,
"step": 242
},
{
"epoch": 0.12199815914986194,
"grad_norm": 0.5084211230278015,
"learning_rate": 9.450204164682928e-05,
"loss": 0.9316,
"step": 243
},
{
"epoch": 0.12250020918751568,
"grad_norm": 0.48900070786476135,
"learning_rate": 9.445355656498285e-05,
"loss": 0.9197,
"step": 244
},
{
"epoch": 0.12300225922516944,
"grad_norm": 0.5633825659751892,
"learning_rate": 9.440487118234535e-05,
"loss": 0.9677,
"step": 245
},
{
"epoch": 0.1235043092628232,
"grad_norm": 0.5049698352813721,
"learning_rate": 9.435598571828552e-05,
"loss": 0.9555,
"step": 246
},
{
"epoch": 0.12400635930047695,
"grad_norm": 0.558955729007721,
"learning_rate": 9.430690039307363e-05,
"loss": 0.9873,
"step": 247
},
{
"epoch": 0.1245084093381307,
"grad_norm": 0.6211025714874268,
"learning_rate": 9.425761542788048e-05,
"loss": 0.9365,
"step": 248
},
{
"epoch": 0.12501045937578445,
"grad_norm": 0.577601969242096,
"learning_rate": 9.420813104477646e-05,
"loss": 0.8319,
"step": 249
},
{
"epoch": 0.1255125094134382,
"grad_norm": 0.7420045137405396,
"learning_rate": 9.415844746673047e-05,
"loss": 0.8585,
"step": 250
},
{
"epoch": 0.12601455945109197,
"grad_norm": 0.6604503393173218,
"learning_rate": 9.410856491760895e-05,
"loss": 1.3716,
"step": 251
},
{
"epoch": 0.1265166094887457,
"grad_norm": 0.5581377148628235,
"learning_rate": 9.405848362217491e-05,
"loss": 1.2896,
"step": 252
},
{
"epoch": 0.12701865952639946,
"grad_norm": 0.48465076088905334,
"learning_rate": 9.400820380608683e-05,
"loss": 1.2519,
"step": 253
},
{
"epoch": 0.12752070956405323,
"grad_norm": 0.44643500447273254,
"learning_rate": 9.395772569589774e-05,
"loss": 1.1591,
"step": 254
},
{
"epoch": 0.12802275960170698,
"grad_norm": 0.46593335270881653,
"learning_rate": 9.390704951905411e-05,
"loss": 1.1837,
"step": 255
},
{
"epoch": 0.12852480963936072,
"grad_norm": 0.45252177119255066,
"learning_rate": 9.38561755038949e-05,
"loss": 1.1487,
"step": 256
},
{
"epoch": 0.12902685967701447,
"grad_norm": 0.39205488562583923,
"learning_rate": 9.380510387965047e-05,
"loss": 1.1948,
"step": 257
},
{
"epoch": 0.12952890971466824,
"grad_norm": 0.3986876308917999,
"learning_rate": 9.37538348764416e-05,
"loss": 1.1644,
"step": 258
},
{
"epoch": 0.13003095975232198,
"grad_norm": 0.40925174951553345,
"learning_rate": 9.370236872527845e-05,
"loss": 1.1403,
"step": 259
},
{
"epoch": 0.13053300978997573,
"grad_norm": 0.4211632311344147,
"learning_rate": 9.365070565805941e-05,
"loss": 1.129,
"step": 260
},
{
"epoch": 0.13103505982762947,
"grad_norm": 0.39426669478416443,
"learning_rate": 9.359884590757025e-05,
"loss": 1.1036,
"step": 261
},
{
"epoch": 0.13153710986528325,
"grad_norm": 0.3872944116592407,
"learning_rate": 9.35467897074829e-05,
"loss": 1.0752,
"step": 262
},
{
"epoch": 0.132039159902937,
"grad_norm": 0.39355534315109253,
"learning_rate": 9.349453729235447e-05,
"loss": 0.9972,
"step": 263
},
{
"epoch": 0.13254120994059074,
"grad_norm": 0.36666789650917053,
"learning_rate": 9.34420888976262e-05,
"loss": 1.0353,
"step": 264
},
{
"epoch": 0.1330432599782445,
"grad_norm": 0.39839133620262146,
"learning_rate": 9.338944475962237e-05,
"loss": 1.0541,
"step": 265
},
{
"epoch": 0.13354531001589826,
"grad_norm": 0.3860282599925995,
"learning_rate": 9.333660511554925e-05,
"loss": 1.0672,
"step": 266
},
{
"epoch": 0.134047360053552,
"grad_norm": 0.38217252492904663,
"learning_rate": 9.328357020349405e-05,
"loss": 1.0534,
"step": 267
},
{
"epoch": 0.13454941009120575,
"grad_norm": 0.39358577132225037,
"learning_rate": 9.323034026242377e-05,
"loss": 1.1266,
"step": 268
},
{
"epoch": 0.13505146012885952,
"grad_norm": 0.39111077785491943,
"learning_rate": 9.317691553218428e-05,
"loss": 1.1044,
"step": 269
},
{
"epoch": 0.13555351016651326,
"grad_norm": 0.3801279067993164,
"learning_rate": 9.312329625349902e-05,
"loss": 1.0242,
"step": 270
},
{
"epoch": 0.136055560204167,
"grad_norm": 0.42816299200057983,
"learning_rate": 9.306948266796816e-05,
"loss": 1.0546,
"step": 271
},
{
"epoch": 0.13655761024182078,
"grad_norm": 0.3893824517726898,
"learning_rate": 9.301547501806726e-05,
"loss": 1.0505,
"step": 272
},
{
"epoch": 0.13705966027947453,
"grad_norm": 0.393541157245636,
"learning_rate": 9.29612735471464e-05,
"loss": 1.0875,
"step": 273
},
{
"epoch": 0.13756171031712827,
"grad_norm": 0.38969358801841736,
"learning_rate": 9.290687849942893e-05,
"loss": 1.048,
"step": 274
},
{
"epoch": 0.13806376035478202,
"grad_norm": 0.4060596525669098,
"learning_rate": 9.285229012001047e-05,
"loss": 1.0514,
"step": 275
},
{
"epoch": 0.1385658103924358,
"grad_norm": 0.4034577012062073,
"learning_rate": 9.279750865485772e-05,
"loss": 0.9808,
"step": 276
},
{
"epoch": 0.13906786043008953,
"grad_norm": 0.4346413314342499,
"learning_rate": 9.274253435080746e-05,
"loss": 1.0776,
"step": 277
},
{
"epoch": 0.13956991046774328,
"grad_norm": 0.3979721963405609,
"learning_rate": 9.268736745556527e-05,
"loss": 0.984,
"step": 278
},
{
"epoch": 0.14007196050539703,
"grad_norm": 0.4149005115032196,
"learning_rate": 9.263200821770461e-05,
"loss": 1.0041,
"step": 279
},
{
"epoch": 0.1405740105430508,
"grad_norm": 0.4252713620662689,
"learning_rate": 9.257645688666556e-05,
"loss": 0.9957,
"step": 280
},
{
"epoch": 0.14107606058070454,
"grad_norm": 0.41585591435432434,
"learning_rate": 9.252071371275378e-05,
"loss": 1.0147,
"step": 281
},
{
"epoch": 0.1415781106183583,
"grad_norm": 0.4276868402957916,
"learning_rate": 9.246477894713925e-05,
"loss": 1.0093,
"step": 282
},
{
"epoch": 0.14208016065601206,
"grad_norm": 0.4250052571296692,
"learning_rate": 9.240865284185536e-05,
"loss": 1.0084,
"step": 283
},
{
"epoch": 0.1425822106936658,
"grad_norm": 0.4250898063182831,
"learning_rate": 9.235233564979755e-05,
"loss": 0.9515,
"step": 284
},
{
"epoch": 0.14308426073131955,
"grad_norm": 0.44376522302627563,
"learning_rate": 9.22958276247223e-05,
"loss": 0.9607,
"step": 285
},
{
"epoch": 0.1435863107689733,
"grad_norm": 0.42781707644462585,
"learning_rate": 9.223912902124601e-05,
"loss": 0.9635,
"step": 286
},
{
"epoch": 0.14408836080662707,
"grad_norm": 0.4453868865966797,
"learning_rate": 9.218224009484366e-05,
"loss": 0.9683,
"step": 287
},
{
"epoch": 0.14459041084428081,
"grad_norm": 0.43436458706855774,
"learning_rate": 9.212516110184794e-05,
"loss": 0.9129,
"step": 288
},
{
"epoch": 0.14509246088193456,
"grad_norm": 0.47016844153404236,
"learning_rate": 9.206789229944786e-05,
"loss": 0.9555,
"step": 289
},
{
"epoch": 0.14559451091958833,
"grad_norm": 0.4709494113922119,
"learning_rate": 9.201043394568773e-05,
"loss": 0.9643,
"step": 290
},
{
"epoch": 0.14609656095724208,
"grad_norm": 0.4938959777355194,
"learning_rate": 9.195278629946589e-05,
"loss": 0.9555,
"step": 291
},
{
"epoch": 0.14659861099489582,
"grad_norm": 0.49377843737602234,
"learning_rate": 9.189494962053368e-05,
"loss": 0.9807,
"step": 292
},
{
"epoch": 0.14710066103254957,
"grad_norm": 0.4660574793815613,
"learning_rate": 9.183692416949414e-05,
"loss": 0.8629,
"step": 293
},
{
"epoch": 0.14760271107020334,
"grad_norm": 0.46316561102867126,
"learning_rate": 9.17787102078009e-05,
"loss": 0.9329,
"step": 294
},
{
"epoch": 0.14810476110785709,
"grad_norm": 0.48996853828430176,
"learning_rate": 9.172030799775699e-05,
"loss": 0.9179,
"step": 295
},
{
"epoch": 0.14860681114551083,
"grad_norm": 0.4847288131713867,
"learning_rate": 9.166171780251365e-05,
"loss": 0.9084,
"step": 296
},
{
"epoch": 0.14910886118316458,
"grad_norm": 0.4630123972892761,
"learning_rate": 9.160293988606916e-05,
"loss": 0.8722,
"step": 297
},
{
"epoch": 0.14961091122081835,
"grad_norm": 0.5303933620452881,
"learning_rate": 9.154397451326766e-05,
"loss": 0.8966,
"step": 298
},
{
"epoch": 0.1501129612584721,
"grad_norm": 0.530169665813446,
"learning_rate": 9.148482194979789e-05,
"loss": 0.8084,
"step": 299
},
{
"epoch": 0.15061501129612584,
"grad_norm": 0.6588028073310852,
"learning_rate": 9.142548246219212e-05,
"loss": 0.7829,
"step": 300
},
{
"epoch": 0.15061501129612584,
"eval_loss": 1.0374255180358887,
"eval_runtime": 709.916,
"eval_samples_per_second": 21.266,
"eval_steps_per_second": 2.659,
"step": 300
},
{
"epoch": 0.1511170613337796,
"grad_norm": 0.4993877410888672,
"learning_rate": 9.136595631782478e-05,
"loss": 1.2287,
"step": 301
},
{
"epoch": 0.15161911137143336,
"grad_norm": 0.5264070630073547,
"learning_rate": 9.13062437849114e-05,
"loss": 1.2563,
"step": 302
},
{
"epoch": 0.1521211614090871,
"grad_norm": 0.4640633761882782,
"learning_rate": 9.124634513250736e-05,
"loss": 1.2391,
"step": 303
},
{
"epoch": 0.15262321144674085,
"grad_norm": 0.4271823763847351,
"learning_rate": 9.118626063050661e-05,
"loss": 1.1237,
"step": 304
},
{
"epoch": 0.15312526148439462,
"grad_norm": 0.4822617471218109,
"learning_rate": 9.112599054964057e-05,
"loss": 1.1054,
"step": 305
},
{
"epoch": 0.15362731152204837,
"grad_norm": 0.4398539066314697,
"learning_rate": 9.106553516147682e-05,
"loss": 1.1482,
"step": 306
},
{
"epoch": 0.1541293615597021,
"grad_norm": 0.39565637707710266,
"learning_rate": 9.100489473841792e-05,
"loss": 1.0734,
"step": 307
},
{
"epoch": 0.15463141159735588,
"grad_norm": 0.40112727880477905,
"learning_rate": 9.09440695537001e-05,
"loss": 1.2046,
"step": 308
},
{
"epoch": 0.15513346163500963,
"grad_norm": 0.40704309940338135,
"learning_rate": 9.088305988139221e-05,
"loss": 1.0738,
"step": 309
},
{
"epoch": 0.15563551167266337,
"grad_norm": 0.39162713289260864,
"learning_rate": 9.082186599639428e-05,
"loss": 1.0979,
"step": 310
},
{
"epoch": 0.15613756171031712,
"grad_norm": 0.38577142357826233,
"learning_rate": 9.076048817443645e-05,
"loss": 1.0685,
"step": 311
},
{
"epoch": 0.1566396117479709,
"grad_norm": 0.3882400691509247,
"learning_rate": 9.069892669207758e-05,
"loss": 1.0758,
"step": 312
},
{
"epoch": 0.15714166178562464,
"grad_norm": 0.37418147921562195,
"learning_rate": 9.06371818267041e-05,
"loss": 0.9834,
"step": 313
},
{
"epoch": 0.15764371182327838,
"grad_norm": 0.3902306854724884,
"learning_rate": 9.057525385652878e-05,
"loss": 1.0335,
"step": 314
},
{
"epoch": 0.15814576186093213,
"grad_norm": 0.3840450048446655,
"learning_rate": 9.051314306058933e-05,
"loss": 1.068,
"step": 315
},
{
"epoch": 0.1586478118985859,
"grad_norm": 0.3719392716884613,
"learning_rate": 9.045084971874738e-05,
"loss": 1.031,
"step": 316
},
{
"epoch": 0.15914986193623964,
"grad_norm": 0.3819999694824219,
"learning_rate": 9.038837411168696e-05,
"loss": 1.052,
"step": 317
},
{
"epoch": 0.1596519119738934,
"grad_norm": 0.37122640013694763,
"learning_rate": 9.032571652091342e-05,
"loss": 1.0321,
"step": 318
},
{
"epoch": 0.16015396201154716,
"grad_norm": 0.3737955093383789,
"learning_rate": 9.026287722875209e-05,
"loss": 1.0579,
"step": 319
},
{
"epoch": 0.1606560120492009,
"grad_norm": 0.388288676738739,
"learning_rate": 9.019985651834703e-05,
"loss": 1.0124,
"step": 320
},
{
"epoch": 0.16115806208685465,
"grad_norm": 0.4136764705181122,
"learning_rate": 9.013665467365973e-05,
"loss": 1.0084,
"step": 321
},
{
"epoch": 0.1616601121245084,
"grad_norm": 0.39497965574264526,
"learning_rate": 9.007327197946781e-05,
"loss": 1.0847,
"step": 322
},
{
"epoch": 0.16216216216216217,
"grad_norm": 0.4033185839653015,
"learning_rate": 9.000970872136383e-05,
"loss": 1.0314,
"step": 323
},
{
"epoch": 0.16266421219981592,
"grad_norm": 0.40545448660850525,
"learning_rate": 8.994596518575392e-05,
"loss": 1.0589,
"step": 324
},
{
"epoch": 0.16316626223746966,
"grad_norm": 0.3762631118297577,
"learning_rate": 8.988204165985649e-05,
"loss": 0.9565,
"step": 325
},
{
"epoch": 0.16366831227512343,
"grad_norm": 0.40594545006752014,
"learning_rate": 8.981793843170098e-05,
"loss": 0.9948,
"step": 326
},
{
"epoch": 0.16417036231277718,
"grad_norm": 0.40294238924980164,
"learning_rate": 8.975365579012655e-05,
"loss": 1.0012,
"step": 327
},
{
"epoch": 0.16467241235043092,
"grad_norm": 0.4173141121864319,
"learning_rate": 8.968919402478075e-05,
"loss": 1.0945,
"step": 328
},
{
"epoch": 0.16517446238808467,
"grad_norm": 0.4323413074016571,
"learning_rate": 8.962455342611821e-05,
"loss": 1.0233,
"step": 329
},
{
"epoch": 0.16567651242573844,
"grad_norm": 0.4198532700538635,
"learning_rate": 8.955973428539944e-05,
"loss": 0.9737,
"step": 330
},
{
"epoch": 0.1661785624633922,
"grad_norm": 0.420789510011673,
"learning_rate": 8.94947368946893e-05,
"loss": 0.9872,
"step": 331
},
{
"epoch": 0.16668061250104593,
"grad_norm": 0.408327579498291,
"learning_rate": 8.942956154685596e-05,
"loss": 1.008,
"step": 332
},
{
"epoch": 0.16718266253869968,
"grad_norm": 0.4309915006160736,
"learning_rate": 8.936420853556935e-05,
"loss": 1.0114,
"step": 333
},
{
"epoch": 0.16768471257635345,
"grad_norm": 0.4261639416217804,
"learning_rate": 8.929867815529993e-05,
"loss": 0.9308,
"step": 334
},
{
"epoch": 0.1681867626140072,
"grad_norm": 0.42096462845802307,
"learning_rate": 8.923297070131737e-05,
"loss": 0.9615,
"step": 335
},
{
"epoch": 0.16868881265166094,
"grad_norm": 0.4670826494693756,
"learning_rate": 8.916708646968923e-05,
"loss": 0.969,
"step": 336
},
{
"epoch": 0.1691908626893147,
"grad_norm": 0.4317393898963928,
"learning_rate": 8.910102575727957e-05,
"loss": 1.0044,
"step": 337
},
{
"epoch": 0.16969291272696846,
"grad_norm": 0.4464769959449768,
"learning_rate": 8.903478886174763e-05,
"loss": 1.0213,
"step": 338
},
{
"epoch": 0.1701949627646222,
"grad_norm": 0.44120046496391296,
"learning_rate": 8.896837608154655e-05,
"loss": 0.9162,
"step": 339
},
{
"epoch": 0.17069701280227595,
"grad_norm": 0.4458862245082855,
"learning_rate": 8.890178771592199e-05,
"loss": 0.9079,
"step": 340
},
{
"epoch": 0.17119906283992972,
"grad_norm": 0.435587078332901,
"learning_rate": 8.883502406491067e-05,
"loss": 0.9403,
"step": 341
},
{
"epoch": 0.17170111287758347,
"grad_norm": 0.47128617763519287,
"learning_rate": 8.876808542933924e-05,
"loss": 0.9312,
"step": 342
},
{
"epoch": 0.1722031629152372,
"grad_norm": 0.4681444466114044,
"learning_rate": 8.870097211082271e-05,
"loss": 0.9711,
"step": 343
},
{
"epoch": 0.17270521295289099,
"grad_norm": 0.4990653693675995,
"learning_rate": 8.863368441176326e-05,
"loss": 0.9206,
"step": 344
},
{
"epoch": 0.17320726299054473,
"grad_norm": 0.6548157930374146,
"learning_rate": 8.856622263534875e-05,
"loss": 0.9235,
"step": 345
},
{
"epoch": 0.17370931302819848,
"grad_norm": 0.5148348212242126,
"learning_rate": 8.849858708555142e-05,
"loss": 0.9176,
"step": 346
},
{
"epoch": 0.17421136306585222,
"grad_norm": 0.5113969445228577,
"learning_rate": 8.843077806712648e-05,
"loss": 0.8961,
"step": 347
},
{
"epoch": 0.174713413103506,
"grad_norm": 0.5211741328239441,
"learning_rate": 8.836279588561083e-05,
"loss": 0.8647,
"step": 348
},
{
"epoch": 0.17521546314115974,
"grad_norm": 0.5579087138175964,
"learning_rate": 8.829464084732156e-05,
"loss": 0.8901,
"step": 349
},
{
"epoch": 0.17571751317881348,
"grad_norm": 0.6655847430229187,
"learning_rate": 8.822631325935463e-05,
"loss": 0.8291,
"step": 350
},
{
"epoch": 0.17621956321646723,
"grad_norm": 1.6874302625656128,
"learning_rate": 8.815781342958351e-05,
"loss": 1.385,
"step": 351
},
{
"epoch": 0.176721613254121,
"grad_norm": 0.522758960723877,
"learning_rate": 8.808914166665772e-05,
"loss": 1.2028,
"step": 352
},
{
"epoch": 0.17722366329177475,
"grad_norm": 0.4740375876426697,
"learning_rate": 8.802029828000156e-05,
"loss": 1.1728,
"step": 353
},
{
"epoch": 0.1777257133294285,
"grad_norm": 0.4499627351760864,
"learning_rate": 8.795128357981253e-05,
"loss": 1.1861,
"step": 354
},
{
"epoch": 0.17822776336708226,
"grad_norm": 0.4773704707622528,
"learning_rate": 8.788209787706015e-05,
"loss": 1.1703,
"step": 355
},
{
"epoch": 0.178729813404736,
"grad_norm": 0.4251996576786041,
"learning_rate": 8.781274148348437e-05,
"loss": 1.1624,
"step": 356
},
{
"epoch": 0.17923186344238975,
"grad_norm": 0.4025990962982178,
"learning_rate": 8.77432147115943e-05,
"loss": 1.0905,
"step": 357
},
{
"epoch": 0.1797339134800435,
"grad_norm": 0.3834582269191742,
"learning_rate": 8.767351787466673e-05,
"loss": 1.1365,
"step": 358
},
{
"epoch": 0.18023596351769727,
"grad_norm": 0.4131288230419159,
"learning_rate": 8.760365128674473e-05,
"loss": 1.1159,
"step": 359
},
{
"epoch": 0.18073801355535102,
"grad_norm": 0.409453421831131,
"learning_rate": 8.753361526263621e-05,
"loss": 1.1026,
"step": 360
},
{
"epoch": 0.18124006359300476,
"grad_norm": 0.38488978147506714,
"learning_rate": 8.746341011791264e-05,
"loss": 1.036,
"step": 361
},
{
"epoch": 0.18174211363065854,
"grad_norm": 0.3848975598812103,
"learning_rate": 8.73930361689074e-05,
"loss": 1.067,
"step": 362
},
{
"epoch": 0.18224416366831228,
"grad_norm": 0.3877599835395813,
"learning_rate": 8.732249373271455e-05,
"loss": 1.0209,
"step": 363
},
{
"epoch": 0.18274621370596603,
"grad_norm": 0.38656899333000183,
"learning_rate": 8.725178312718725e-05,
"loss": 1.087,
"step": 364
},
{
"epoch": 0.18324826374361977,
"grad_norm": 0.394137978553772,
"learning_rate": 8.718090467093654e-05,
"loss": 1.0651,
"step": 365
},
{
"epoch": 0.18375031378127354,
"grad_norm": 0.39841341972351074,
"learning_rate": 8.710985868332962e-05,
"loss": 1.0186,
"step": 366
},
{
"epoch": 0.1842523638189273,
"grad_norm": 0.39646363258361816,
"learning_rate": 8.703864548448868e-05,
"loss": 1.029,
"step": 367
},
{
"epoch": 0.18475441385658103,
"grad_norm": 0.3801933825016022,
"learning_rate": 8.696726539528924e-05,
"loss": 1.054,
"step": 368
},
{
"epoch": 0.18525646389423478,
"grad_norm": 0.3627118468284607,
"learning_rate": 8.689571873735884e-05,
"loss": 1.1052,
"step": 369
},
{
"epoch": 0.18575851393188855,
"grad_norm": 0.39008674025535583,
"learning_rate": 8.682400583307562e-05,
"loss": 1.0064,
"step": 370
},
{
"epoch": 0.1862605639695423,
"grad_norm": 0.37145888805389404,
"learning_rate": 8.675212700556668e-05,
"loss": 0.9877,
"step": 371
},
{
"epoch": 0.18676261400719604,
"grad_norm": 0.3664349913597107,
"learning_rate": 8.668008257870683e-05,
"loss": 1.0103,
"step": 372
},
{
"epoch": 0.18726466404484982,
"grad_norm": 0.37273725867271423,
"learning_rate": 8.660787287711703e-05,
"loss": 1.0636,
"step": 373
},
{
"epoch": 0.18776671408250356,
"grad_norm": 0.38857051730155945,
"learning_rate": 8.653549822616289e-05,
"loss": 1.1021,
"step": 374
},
{
"epoch": 0.1882687641201573,
"grad_norm": 0.3906739056110382,
"learning_rate": 8.646295895195333e-05,
"loss": 1.0698,
"step": 375
},
{
"epoch": 0.18877081415781105,
"grad_norm": 0.3983420133590698,
"learning_rate": 8.639025538133898e-05,
"loss": 1.0459,
"step": 376
},
{
"epoch": 0.18927286419546482,
"grad_norm": 0.38264894485473633,
"learning_rate": 8.631738784191083e-05,
"loss": 1.041,
"step": 377
},
{
"epoch": 0.18977491423311857,
"grad_norm": 0.40160712599754333,
"learning_rate": 8.62443566619986e-05,
"loss": 0.9657,
"step": 378
},
{
"epoch": 0.19027696427077231,
"grad_norm": 0.4029211103916168,
"learning_rate": 8.617116217066942e-05,
"loss": 1.0126,
"step": 379
},
{
"epoch": 0.1907790143084261,
"grad_norm": 0.41848793625831604,
"learning_rate": 8.609780469772623e-05,
"loss": 1.0143,
"step": 380
},
{
"epoch": 0.19128106434607983,
"grad_norm": 0.3983096480369568,
"learning_rate": 8.602428457370637e-05,
"loss": 1.0024,
"step": 381
},
{
"epoch": 0.19178311438373358,
"grad_norm": 0.3817248046398163,
"learning_rate": 8.595060212988006e-05,
"loss": 0.9107,
"step": 382
},
{
"epoch": 0.19228516442138732,
"grad_norm": 0.4119492769241333,
"learning_rate": 8.587675769824887e-05,
"loss": 0.9464,
"step": 383
},
{
"epoch": 0.1927872144590411,
"grad_norm": 0.40975409746170044,
"learning_rate": 8.580275161154431e-05,
"loss": 0.8996,
"step": 384
},
{
"epoch": 0.19328926449669484,
"grad_norm": 0.4254794418811798,
"learning_rate": 8.572858420322627e-05,
"loss": 0.9331,
"step": 385
},
{
"epoch": 0.19379131453434859,
"grad_norm": 0.4199373722076416,
"learning_rate": 8.56542558074815e-05,
"loss": 1.0189,
"step": 386
},
{
"epoch": 0.19429336457200233,
"grad_norm": 0.4211234450340271,
"learning_rate": 8.557976675922217e-05,
"loss": 0.9798,
"step": 387
},
{
"epoch": 0.1947954146096561,
"grad_norm": 0.4226566553115845,
"learning_rate": 8.550511739408428e-05,
"loss": 0.9475,
"step": 388
},
{
"epoch": 0.19529746464730985,
"grad_norm": 0.46705394983291626,
"learning_rate": 8.543030804842629e-05,
"loss": 0.9535,
"step": 389
},
{
"epoch": 0.1957995146849636,
"grad_norm": 0.4537680745124817,
"learning_rate": 8.535533905932738e-05,
"loss": 0.9774,
"step": 390
},
{
"epoch": 0.19630156472261737,
"grad_norm": 0.43357518315315247,
"learning_rate": 8.528021076458615e-05,
"loss": 0.9001,
"step": 391
},
{
"epoch": 0.1968036147602711,
"grad_norm": 0.45762643218040466,
"learning_rate": 8.520492350271896e-05,
"loss": 0.9012,
"step": 392
},
{
"epoch": 0.19730566479792486,
"grad_norm": 0.4584790766239166,
"learning_rate": 8.512947761295846e-05,
"loss": 0.8805,
"step": 393
},
{
"epoch": 0.1978077148355786,
"grad_norm": 0.484757661819458,
"learning_rate": 8.505387343525209e-05,
"loss": 0.868,
"step": 394
},
{
"epoch": 0.19830976487323237,
"grad_norm": 0.5136643052101135,
"learning_rate": 8.497811131026046e-05,
"loss": 0.9755,
"step": 395
},
{
"epoch": 0.19881181491088612,
"grad_norm": 0.5092843770980835,
"learning_rate": 8.490219157935589e-05,
"loss": 0.9072,
"step": 396
},
{
"epoch": 0.19931386494853987,
"grad_norm": 0.5307949185371399,
"learning_rate": 8.482611458462083e-05,
"loss": 0.9028,
"step": 397
},
{
"epoch": 0.1998159149861936,
"grad_norm": 0.5171916484832764,
"learning_rate": 8.47498806688464e-05,
"loss": 0.8684,
"step": 398
},
{
"epoch": 0.20031796502384738,
"grad_norm": 0.5054696202278137,
"learning_rate": 8.467349017553067e-05,
"loss": 0.7905,
"step": 399
},
{
"epoch": 0.20082001506150113,
"grad_norm": 0.6332175731658936,
"learning_rate": 8.459694344887732e-05,
"loss": 0.8408,
"step": 400
},
{
"epoch": 0.20132206509915487,
"grad_norm": 0.562515377998352,
"learning_rate": 8.452024083379394e-05,
"loss": 1.3941,
"step": 401
},
{
"epoch": 0.20182411513680865,
"grad_norm": 0.43945592641830444,
"learning_rate": 8.444338267589057e-05,
"loss": 1.2801,
"step": 402
},
{
"epoch": 0.2023261651744624,
"grad_norm": 0.42131316661834717,
"learning_rate": 8.436636932147806e-05,
"loss": 1.2589,
"step": 403
},
{
"epoch": 0.20282821521211614,
"grad_norm": 0.3926401436328888,
"learning_rate": 8.428920111756658e-05,
"loss": 1.125,
"step": 404
},
{
"epoch": 0.20333026524976988,
"grad_norm": 0.4347395896911621,
"learning_rate": 8.421187841186402e-05,
"loss": 1.1564,
"step": 405
},
{
"epoch": 0.20383231528742365,
"grad_norm": 0.3934774100780487,
"learning_rate": 8.413440155277443e-05,
"loss": 1.0942,
"step": 406
},
{
"epoch": 0.2043343653250774,
"grad_norm": 0.40075141191482544,
"learning_rate": 8.405677088939644e-05,
"loss": 1.1296,
"step": 407
},
{
"epoch": 0.20483641536273114,
"grad_norm": 0.36235958337783813,
"learning_rate": 8.397898677152173e-05,
"loss": 1.1378,
"step": 408
},
{
"epoch": 0.20533846540038492,
"grad_norm": 0.4117681384086609,
"learning_rate": 8.390104954963338e-05,
"loss": 1.134,
"step": 409
},
{
"epoch": 0.20584051543803866,
"grad_norm": 0.3808246850967407,
"learning_rate": 8.382295957490436e-05,
"loss": 1.0572,
"step": 410
},
{
"epoch": 0.2063425654756924,
"grad_norm": 0.39057350158691406,
"learning_rate": 8.37447171991959e-05,
"loss": 1.1136,
"step": 411
},
{
"epoch": 0.20684461551334615,
"grad_norm": 0.39303159713745117,
"learning_rate": 8.366632277505597e-05,
"loss": 1.0216,
"step": 412
},
{
"epoch": 0.20734666555099993,
"grad_norm": 0.37181228399276733,
"learning_rate": 8.35877766557176e-05,
"loss": 1.0096,
"step": 413
},
{
"epoch": 0.20784871558865367,
"grad_norm": 0.378421813249588,
"learning_rate": 8.350907919509734e-05,
"loss": 1.0492,
"step": 414
},
{
"epoch": 0.20835076562630742,
"grad_norm": 0.38374465703964233,
"learning_rate": 8.343023074779368e-05,
"loss": 1.0271,
"step": 415
},
{
"epoch": 0.20885281566396116,
"grad_norm": 0.37486276030540466,
"learning_rate": 8.335123166908544e-05,
"loss": 1.027,
"step": 416
},
{
"epoch": 0.20935486570161493,
"grad_norm": 0.37390416860580444,
"learning_rate": 8.327208231493011e-05,
"loss": 0.9933,
"step": 417
},
{
"epoch": 0.20985691573926868,
"grad_norm": 0.39402034878730774,
"learning_rate": 8.319278304196237e-05,
"loss": 1.0998,
"step": 418
},
{
"epoch": 0.21035896577692242,
"grad_norm": 0.3804149925708771,
"learning_rate": 8.311333420749232e-05,
"loss": 1.0575,
"step": 419
},
{
"epoch": 0.2108610158145762,
"grad_norm": 0.37954866886138916,
"learning_rate": 8.303373616950408e-05,
"loss": 1.0209,
"step": 420
},
{
"epoch": 0.21136306585222994,
"grad_norm": 0.36630332469940186,
"learning_rate": 8.295398928665394e-05,
"loss": 0.953,
"step": 421
},
{
"epoch": 0.2118651158898837,
"grad_norm": 0.37623950839042664,
"learning_rate": 8.287409391826895e-05,
"loss": 0.9686,
"step": 422
},
{
"epoch": 0.21236716592753743,
"grad_norm": 0.384235680103302,
"learning_rate": 8.279405042434515e-05,
"loss": 1.0683,
"step": 423
},
{
"epoch": 0.2128692159651912,
"grad_norm": 0.3830919563770294,
"learning_rate": 8.271385916554605e-05,
"loss": 0.9916,
"step": 424
},
{
"epoch": 0.21337126600284495,
"grad_norm": 0.39329853653907776,
"learning_rate": 8.263352050320094e-05,
"loss": 1.0264,
"step": 425
},
{
"epoch": 0.2138733160404987,
"grad_norm": 0.39238932728767395,
"learning_rate": 8.255303479930333e-05,
"loss": 0.9725,
"step": 426
},
{
"epoch": 0.21437536607815247,
"grad_norm": 0.41246023774147034,
"learning_rate": 8.247240241650918e-05,
"loss": 0.9592,
"step": 427
},
{
"epoch": 0.2148774161158062,
"grad_norm": 0.4108837842941284,
"learning_rate": 8.239162371813551e-05,
"loss": 1.0114,
"step": 428
},
{
"epoch": 0.21537946615345996,
"grad_norm": 0.3942084312438965,
"learning_rate": 8.231069906815847e-05,
"loss": 0.9637,
"step": 429
},
{
"epoch": 0.2158815161911137,
"grad_norm": 0.4277946949005127,
"learning_rate": 8.222962883121196e-05,
"loss": 1.012,
"step": 430
},
{
"epoch": 0.21638356622876748,
"grad_norm": 0.43043553829193115,
"learning_rate": 8.214841337258578e-05,
"loss": 0.9617,
"step": 431
},
{
"epoch": 0.21688561626642122,
"grad_norm": 0.40695276856422424,
"learning_rate": 8.206705305822413e-05,
"loss": 0.9876,
"step": 432
},
{
"epoch": 0.21738766630407497,
"grad_norm": 0.41154372692108154,
"learning_rate": 8.19855482547239e-05,
"loss": 0.9719,
"step": 433
},
{
"epoch": 0.2178897163417287,
"grad_norm": 0.4162918031215668,
"learning_rate": 8.190389932933301e-05,
"loss": 0.9352,
"step": 434
},
{
"epoch": 0.21839176637938248,
"grad_norm": 0.4280974268913269,
"learning_rate": 8.182210664994878e-05,
"loss": 0.9462,
"step": 435
},
{
"epoch": 0.21889381641703623,
"grad_norm": 0.4325559437274933,
"learning_rate": 8.174017058511629e-05,
"loss": 0.9444,
"step": 436
},
{
"epoch": 0.21939586645468998,
"grad_norm": 0.43471524119377136,
"learning_rate": 8.165809150402663e-05,
"loss": 0.9441,
"step": 437
},
{
"epoch": 0.21989791649234375,
"grad_norm": 0.4418407380580902,
"learning_rate": 8.157586977651534e-05,
"loss": 0.9465,
"step": 438
},
{
"epoch": 0.2203999665299975,
"grad_norm": 0.45979785919189453,
"learning_rate": 8.149350577306074e-05,
"loss": 0.9426,
"step": 439
},
{
"epoch": 0.22090201656765124,
"grad_norm": 0.45479616522789,
"learning_rate": 8.141099986478212e-05,
"loss": 0.8374,
"step": 440
},
{
"epoch": 0.22140406660530498,
"grad_norm": 0.437326043844223,
"learning_rate": 8.132835242343827e-05,
"loss": 0.8725,
"step": 441
},
{
"epoch": 0.22190611664295876,
"grad_norm": 0.4658799469470978,
"learning_rate": 8.124556382142565e-05,
"loss": 0.8982,
"step": 442
},
{
"epoch": 0.2224081666806125,
"grad_norm": 0.5004392862319946,
"learning_rate": 8.11626344317768e-05,
"loss": 0.9902,
"step": 443
},
{
"epoch": 0.22291021671826625,
"grad_norm": 0.46578583121299744,
"learning_rate": 8.107956462815861e-05,
"loss": 0.8265,
"step": 444
},
{
"epoch": 0.22341226675592002,
"grad_norm": 0.48835834860801697,
"learning_rate": 8.099635478487064e-05,
"loss": 0.8986,
"step": 445
},
{
"epoch": 0.22391431679357376,
"grad_norm": 0.5076184868812561,
"learning_rate": 8.091300527684349e-05,
"loss": 0.8746,
"step": 446
},
{
"epoch": 0.2244163668312275,
"grad_norm": 0.502265989780426,
"learning_rate": 8.082951647963701e-05,
"loss": 0.9168,
"step": 447
},
{
"epoch": 0.22491841686888125,
"grad_norm": 0.558822512626648,
"learning_rate": 8.074588876943873e-05,
"loss": 0.8786,
"step": 448
},
{
"epoch": 0.22542046690653503,
"grad_norm": 0.5506950616836548,
"learning_rate": 8.066212252306203e-05,
"loss": 0.8613,
"step": 449
},
{
"epoch": 0.22592251694418877,
"grad_norm": 0.7210969924926758,
"learning_rate": 8.057821811794458e-05,
"loss": 0.746,
"step": 450
},
{
"epoch": 0.22592251694418877,
"eval_loss": 1.012302041053772,
"eval_runtime": 708.8163,
"eval_samples_per_second": 21.299,
"eval_steps_per_second": 2.664,
"step": 450
},
{
"epoch": 0.22642456698184252,
"grad_norm": 0.49422305822372437,
"learning_rate": 8.049417593214652e-05,
"loss": 1.3625,
"step": 451
},
{
"epoch": 0.22692661701949626,
"grad_norm": 0.45369595289230347,
"learning_rate": 8.040999634434883e-05,
"loss": 1.2001,
"step": 452
},
{
"epoch": 0.22742866705715004,
"grad_norm": 0.4486617147922516,
"learning_rate": 8.032567973385162e-05,
"loss": 1.2561,
"step": 453
},
{
"epoch": 0.22793071709480378,
"grad_norm": 0.422780841588974,
"learning_rate": 8.024122648057234e-05,
"loss": 1.1671,
"step": 454
},
{
"epoch": 0.22843276713245753,
"grad_norm": 0.4150182008743286,
"learning_rate": 8.015663696504422e-05,
"loss": 1.0727,
"step": 455
},
{
"epoch": 0.2289348171701113,
"grad_norm": 0.4196764826774597,
"learning_rate": 8.007191156841441e-05,
"loss": 1.1269,
"step": 456
},
{
"epoch": 0.22943686720776504,
"grad_norm": 0.3779695928096771,
"learning_rate": 7.998705067244232e-05,
"loss": 1.1152,
"step": 457
},
{
"epoch": 0.2299389172454188,
"grad_norm": 0.3510948419570923,
"learning_rate": 7.990205465949791e-05,
"loss": 1.0677,
"step": 458
},
{
"epoch": 0.23044096728307253,
"grad_norm": 0.3578283488750458,
"learning_rate": 7.981692391255997e-05,
"loss": 1.115,
"step": 459
},
{
"epoch": 0.2309430173207263,
"grad_norm": 0.3872191607952118,
"learning_rate": 7.973165881521434e-05,
"loss": 1.0569,
"step": 460
},
{
"epoch": 0.23144506735838005,
"grad_norm": 0.4070218503475189,
"learning_rate": 7.964625975165225e-05,
"loss": 1.0516,
"step": 461
},
{
"epoch": 0.2319471173960338,
"grad_norm": 0.35880640149116516,
"learning_rate": 7.956072710666859e-05,
"loss": 1.0315,
"step": 462
},
{
"epoch": 0.23244916743368757,
"grad_norm": 0.448629230260849,
"learning_rate": 7.947506126566009e-05,
"loss": 1.0253,
"step": 463
},
{
"epoch": 0.23295121747134132,
"grad_norm": 0.3651820719242096,
"learning_rate": 7.938926261462366e-05,
"loss": 1.0126,
"step": 464
},
{
"epoch": 0.23345326750899506,
"grad_norm": 0.3588433265686035,
"learning_rate": 7.930333154015466e-05,
"loss": 1.0329,
"step": 465
},
{
"epoch": 0.2339553175466488,
"grad_norm": 0.3761132061481476,
"learning_rate": 7.921726842944508e-05,
"loss": 1.0054,
"step": 466
},
{
"epoch": 0.23445736758430258,
"grad_norm": 0.36542749404907227,
"learning_rate": 7.913107367028187e-05,
"loss": 1.0458,
"step": 467
},
{
"epoch": 0.23495941762195632,
"grad_norm": 0.3760159909725189,
"learning_rate": 7.90447476510452e-05,
"loss": 1.016,
"step": 468
},
{
"epoch": 0.23546146765961007,
"grad_norm": 0.34772396087646484,
"learning_rate": 7.895829076070663e-05,
"loss": 0.9758,
"step": 469
},
{
"epoch": 0.2359635176972638,
"grad_norm": 0.3899083137512207,
"learning_rate": 7.88717033888274e-05,
"loss": 1.0391,
"step": 470
},
{
"epoch": 0.2364655677349176,
"grad_norm": 0.3794157803058624,
"learning_rate": 7.878498592555674e-05,
"loss": 1.0162,
"step": 471
},
{
"epoch": 0.23696761777257133,
"grad_norm": 0.3927205801010132,
"learning_rate": 7.869813876162998e-05,
"loss": 0.9797,
"step": 472
},
{
"epoch": 0.23746966781022508,
"grad_norm": 0.3774932324886322,
"learning_rate": 7.86111622883669e-05,
"loss": 0.9606,
"step": 473
},
{
"epoch": 0.23797171784787885,
"grad_norm": 0.37682032585144043,
"learning_rate": 7.852405689766993e-05,
"loss": 1.0554,
"step": 474
},
{
"epoch": 0.2384737678855326,
"grad_norm": 0.3759259879589081,
"learning_rate": 7.843682298202235e-05,
"loss": 0.9883,
"step": 475
},
{
"epoch": 0.23897581792318634,
"grad_norm": 0.38955962657928467,
"learning_rate": 7.834946093448659e-05,
"loss": 1.0126,
"step": 476
},
{
"epoch": 0.23947786796084009,
"grad_norm": 0.39181217551231384,
"learning_rate": 7.826197114870242e-05,
"loss": 1.0209,
"step": 477
},
{
"epoch": 0.23997991799849386,
"grad_norm": 0.38797685503959656,
"learning_rate": 7.817435401888513e-05,
"loss": 1.0166,
"step": 478
},
{
"epoch": 0.2404819680361476,
"grad_norm": 0.3912067413330078,
"learning_rate": 7.808660993982388e-05,
"loss": 0.9866,
"step": 479
},
{
"epoch": 0.24098401807380135,
"grad_norm": 0.3997304439544678,
"learning_rate": 7.799873930687978e-05,
"loss": 0.9763,
"step": 480
},
{
"epoch": 0.24148606811145512,
"grad_norm": 0.40459659695625305,
"learning_rate": 7.79107425159842e-05,
"loss": 1.0234,
"step": 481
},
{
"epoch": 0.24198811814910887,
"grad_norm": 0.4033385217189789,
"learning_rate": 7.782261996363693e-05,
"loss": 0.9801,
"step": 482
},
{
"epoch": 0.2424901681867626,
"grad_norm": 0.41744333505630493,
"learning_rate": 7.773437204690449e-05,
"loss": 0.9665,
"step": 483
},
{
"epoch": 0.24299221822441636,
"grad_norm": 0.4200511872768402,
"learning_rate": 7.764599916341817e-05,
"loss": 0.957,
"step": 484
},
{
"epoch": 0.24349426826207013,
"grad_norm": 0.4265231490135193,
"learning_rate": 7.755750171137246e-05,
"loss": 0.9379,
"step": 485
},
{
"epoch": 0.24399631829972387,
"grad_norm": 0.4306912124156952,
"learning_rate": 7.746888008952301e-05,
"loss": 0.9734,
"step": 486
},
{
"epoch": 0.24449836833737762,
"grad_norm": 0.4338829219341278,
"learning_rate": 7.738013469718507e-05,
"loss": 0.9265,
"step": 487
},
{
"epoch": 0.24500041837503136,
"grad_norm": 0.43540337681770325,
"learning_rate": 7.729126593423151e-05,
"loss": 0.9211,
"step": 488
},
{
"epoch": 0.24550246841268514,
"grad_norm": 0.46909114718437195,
"learning_rate": 7.720227420109112e-05,
"loss": 0.928,
"step": 489
},
{
"epoch": 0.24600451845033888,
"grad_norm": 0.4378572404384613,
"learning_rate": 7.711315989874677e-05,
"loss": 0.8604,
"step": 490
},
{
"epoch": 0.24650656848799263,
"grad_norm": 0.4667833745479584,
"learning_rate": 7.702392342873358e-05,
"loss": 0.8831,
"step": 491
},
{
"epoch": 0.2470086185256464,
"grad_norm": 0.44659602642059326,
"learning_rate": 7.69345651931372e-05,
"loss": 0.9048,
"step": 492
},
{
"epoch": 0.24751066856330015,
"grad_norm": 0.4557839334011078,
"learning_rate": 7.684508559459187e-05,
"loss": 0.8803,
"step": 493
},
{
"epoch": 0.2480127186009539,
"grad_norm": 0.4604610204696655,
"learning_rate": 7.675548503627871e-05,
"loss": 0.8387,
"step": 494
},
{
"epoch": 0.24851476863860764,
"grad_norm": 0.4708879888057709,
"learning_rate": 7.666576392192389e-05,
"loss": 0.8432,
"step": 495
},
{
"epoch": 0.2490168186762614,
"grad_norm": 0.5023857951164246,
"learning_rate": 7.65759226557967e-05,
"loss": 0.9374,
"step": 496
},
{
"epoch": 0.24951886871391515,
"grad_norm": 0.5210058689117432,
"learning_rate": 7.648596164270791e-05,
"loss": 0.9176,
"step": 497
},
{
"epoch": 0.2500209187515689,
"grad_norm": 0.5268908739089966,
"learning_rate": 7.639588128800778e-05,
"loss": 0.8858,
"step": 498
},
{
"epoch": 0.25052296878922264,
"grad_norm": 0.5862696170806885,
"learning_rate": 7.630568199758436e-05,
"loss": 0.8763,
"step": 499
},
{
"epoch": 0.2510250188268764,
"grad_norm": 0.6300661563873291,
"learning_rate": 7.621536417786159e-05,
"loss": 0.7728,
"step": 500
},
{
"epoch": 0.2515270688645302,
"grad_norm": 0.4835459589958191,
"learning_rate": 7.612492823579745e-05,
"loss": 1.2268,
"step": 501
},
{
"epoch": 0.25202911890218394,
"grad_norm": 0.43844684958457947,
"learning_rate": 7.60343745788822e-05,
"loss": 1.2638,
"step": 502
},
{
"epoch": 0.2525311689398377,
"grad_norm": 0.44383448362350464,
"learning_rate": 7.594370361513648e-05,
"loss": 1.204,
"step": 503
},
{
"epoch": 0.2530332189774914,
"grad_norm": 0.40099960565567017,
"learning_rate": 7.585291575310952e-05,
"loss": 1.1228,
"step": 504
},
{
"epoch": 0.25353526901514517,
"grad_norm": 0.3856929838657379,
"learning_rate": 7.576201140187727e-05,
"loss": 1.127,
"step": 505
},
{
"epoch": 0.2540373190527989,
"grad_norm": 0.41922733187675476,
"learning_rate": 7.567099097104054e-05,
"loss": 1.1535,
"step": 506
},
{
"epoch": 0.25453936909045266,
"grad_norm": 0.39519184827804565,
"learning_rate": 7.557985487072318e-05,
"loss": 1.1119,
"step": 507
},
{
"epoch": 0.25504141912810646,
"grad_norm": 0.3693808317184448,
"learning_rate": 7.548860351157027e-05,
"loss": 1.1379,
"step": 508
},
{
"epoch": 0.2555434691657602,
"grad_norm": 0.36474886536598206,
"learning_rate": 7.539723730474619e-05,
"loss": 1.1053,
"step": 509
},
{
"epoch": 0.25604551920341395,
"grad_norm": 0.4072096645832062,
"learning_rate": 7.530575666193283e-05,
"loss": 1.0756,
"step": 510
},
{
"epoch": 0.2565475692410677,
"grad_norm": 0.3847082257270813,
"learning_rate": 7.521416199532765e-05,
"loss": 1.0432,
"step": 511
},
{
"epoch": 0.25704961927872144,
"grad_norm": 0.3695790469646454,
"learning_rate": 7.512245371764197e-05,
"loss": 0.9927,
"step": 512
},
{
"epoch": 0.2575516693163752,
"grad_norm": 0.36473801732063293,
"learning_rate": 7.503063224209896e-05,
"loss": 1.0291,
"step": 513
},
{
"epoch": 0.25805371935402893,
"grad_norm": 0.36407670378685,
"learning_rate": 7.493869798243187e-05,
"loss": 1.014,
"step": 514
},
{
"epoch": 0.2585557693916827,
"grad_norm": 0.37464427947998047,
"learning_rate": 7.484665135288213e-05,
"loss": 1.0923,
"step": 515
},
{
"epoch": 0.2590578194293365,
"grad_norm": 0.34929415583610535,
"learning_rate": 7.475449276819753e-05,
"loss": 1.0533,
"step": 516
},
{
"epoch": 0.2595598694669902,
"grad_norm": 0.36770978569984436,
"learning_rate": 7.466222264363021e-05,
"loss": 0.9745,
"step": 517
},
{
"epoch": 0.26006191950464397,
"grad_norm": 0.3667100965976715,
"learning_rate": 7.456984139493502e-05,
"loss": 0.9944,
"step": 518
},
{
"epoch": 0.2605639695422977,
"grad_norm": 0.3640177845954895,
"learning_rate": 7.447734943836741e-05,
"loss": 1.0289,
"step": 519
},
{
"epoch": 0.26106601957995146,
"grad_norm": 0.35481715202331543,
"learning_rate": 7.438474719068173e-05,
"loss": 1.0214,
"step": 520
},
{
"epoch": 0.2615680696176052,
"grad_norm": 0.36664754152297974,
"learning_rate": 7.429203506912927e-05,
"loss": 1.0307,
"step": 521
},
{
"epoch": 0.26207011965525895,
"grad_norm": 0.3693181276321411,
"learning_rate": 7.419921349145634e-05,
"loss": 0.9277,
"step": 522
},
{
"epoch": 0.26257216969291275,
"grad_norm": 0.38111287355422974,
"learning_rate": 7.410628287590254e-05,
"loss": 0.9725,
"step": 523
},
{
"epoch": 0.2630742197305665,
"grad_norm": 0.3914952576160431,
"learning_rate": 7.401324364119871e-05,
"loss": 1.0405,
"step": 524
},
{
"epoch": 0.26357626976822024,
"grad_norm": 0.38030022382736206,
"learning_rate": 7.392009620656513e-05,
"loss": 0.9838,
"step": 525
},
{
"epoch": 0.264078319805874,
"grad_norm": 0.41087502241134644,
"learning_rate": 7.382684099170959e-05,
"loss": 1.0151,
"step": 526
},
{
"epoch": 0.26458036984352773,
"grad_norm": 0.40365880727767944,
"learning_rate": 7.373347841682556e-05,
"loss": 0.9753,
"step": 527
},
{
"epoch": 0.2650824198811815,
"grad_norm": 0.4079309105873108,
"learning_rate": 7.364000890259025e-05,
"loss": 1.0174,
"step": 528
},
{
"epoch": 0.2655844699188352,
"grad_norm": 0.4056829810142517,
"learning_rate": 7.354643287016268e-05,
"loss": 1.024,
"step": 529
},
{
"epoch": 0.266086519956489,
"grad_norm": 0.39864933490753174,
"learning_rate": 7.345275074118185e-05,
"loss": 0.9795,
"step": 530
},
{
"epoch": 0.26658856999414277,
"grad_norm": 0.39665892720222473,
"learning_rate": 7.335896293776486e-05,
"loss": 0.9327,
"step": 531
},
{
"epoch": 0.2670906200317965,
"grad_norm": 0.38788363337516785,
"learning_rate": 7.326506988250488e-05,
"loss": 0.9648,
"step": 532
},
{
"epoch": 0.26759267006945026,
"grad_norm": 0.41023018956184387,
"learning_rate": 7.31710719984694e-05,
"loss": 0.9254,
"step": 533
},
{
"epoch": 0.268094720107104,
"grad_norm": 0.38603848218917847,
"learning_rate": 7.307696970919818e-05,
"loss": 0.958,
"step": 534
},
{
"epoch": 0.26859677014475775,
"grad_norm": 0.42242223024368286,
"learning_rate": 7.298276343870151e-05,
"loss": 0.9136,
"step": 535
},
{
"epoch": 0.2690988201824115,
"grad_norm": 0.4157050549983978,
"learning_rate": 7.288845361145811e-05,
"loss": 0.9641,
"step": 536
},
{
"epoch": 0.2696008702200653,
"grad_norm": 0.4187794625759125,
"learning_rate": 7.279404065241337e-05,
"loss": 0.8804,
"step": 537
},
{
"epoch": 0.27010292025771904,
"grad_norm": 0.4192327857017517,
"learning_rate": 7.269952498697734e-05,
"loss": 0.9528,
"step": 538
},
{
"epoch": 0.2706049702953728,
"grad_norm": 0.42294546961784363,
"learning_rate": 7.260490704102287e-05,
"loss": 0.9,
"step": 539
},
{
"epoch": 0.2711070203330265,
"grad_norm": 0.45047277212142944,
"learning_rate": 7.251018724088367e-05,
"loss": 0.8122,
"step": 540
},
{
"epoch": 0.2716090703706803,
"grad_norm": 0.45989593863487244,
"learning_rate": 7.241536601335237e-05,
"loss": 0.8988,
"step": 541
},
{
"epoch": 0.272111120408334,
"grad_norm": 0.5204156041145325,
"learning_rate": 7.232044378567864e-05,
"loss": 0.9557,
"step": 542
},
{
"epoch": 0.27261317044598776,
"grad_norm": 0.4537619948387146,
"learning_rate": 7.222542098556721e-05,
"loss": 0.8729,
"step": 543
},
{
"epoch": 0.27311522048364156,
"grad_norm": 0.46789640188217163,
"learning_rate": 7.213029804117604e-05,
"loss": 0.8732,
"step": 544
},
{
"epoch": 0.2736172705212953,
"grad_norm": 0.4757324159145355,
"learning_rate": 7.203507538111423e-05,
"loss": 0.8749,
"step": 545
},
{
"epoch": 0.27411932055894905,
"grad_norm": 0.46748244762420654,
"learning_rate": 7.193975343444023e-05,
"loss": 0.7785,
"step": 546
},
{
"epoch": 0.2746213705966028,
"grad_norm": 0.508681058883667,
"learning_rate": 7.18443326306599e-05,
"loss": 0.8732,
"step": 547
},
{
"epoch": 0.27512342063425654,
"grad_norm": 0.5589388608932495,
"learning_rate": 7.174881339972448e-05,
"loss": 0.8308,
"step": 548
},
{
"epoch": 0.2756254706719103,
"grad_norm": 0.5891793966293335,
"learning_rate": 7.165319617202871e-05,
"loss": 0.7965,
"step": 549
},
{
"epoch": 0.27612752070956403,
"grad_norm": 0.6700708866119385,
"learning_rate": 7.155748137840892e-05,
"loss": 0.7379,
"step": 550
},
{
"epoch": 0.2766295707472178,
"grad_norm": 0.4654090404510498,
"learning_rate": 7.146166945014102e-05,
"loss": 1.1523,
"step": 551
},
{
"epoch": 0.2771316207848716,
"grad_norm": 0.4521055221557617,
"learning_rate": 7.136576081893863e-05,
"loss": 1.1763,
"step": 552
},
{
"epoch": 0.2776336708225253,
"grad_norm": 0.39871326088905334,
"learning_rate": 7.126975591695108e-05,
"loss": 1.1915,
"step": 553
},
{
"epoch": 0.27813572086017907,
"grad_norm": 0.3950759172439575,
"learning_rate": 7.117365517676145e-05,
"loss": 1.1688,
"step": 554
},
{
"epoch": 0.2786377708978328,
"grad_norm": 0.4023323357105255,
"learning_rate": 7.107745903138472e-05,
"loss": 1.0745,
"step": 555
},
{
"epoch": 0.27913982093548656,
"grad_norm": 0.4257362186908722,
"learning_rate": 7.09811679142657e-05,
"loss": 1.1143,
"step": 556
},
{
"epoch": 0.2796418709731403,
"grad_norm": 0.39084288477897644,
"learning_rate": 7.088478225927715e-05,
"loss": 1.1569,
"step": 557
},
{
"epoch": 0.28014392101079405,
"grad_norm": 0.3621457815170288,
"learning_rate": 7.078830250071777e-05,
"loss": 1.1078,
"step": 558
},
{
"epoch": 0.28064597104844785,
"grad_norm": 0.3504067063331604,
"learning_rate": 7.069172907331034e-05,
"loss": 1.0506,
"step": 559
},
{
"epoch": 0.2811480210861016,
"grad_norm": 0.3640802502632141,
"learning_rate": 7.059506241219965e-05,
"loss": 1.0844,
"step": 560
},
{
"epoch": 0.28165007112375534,
"grad_norm": 0.47682300209999084,
"learning_rate": 7.049830295295057e-05,
"loss": 0.9911,
"step": 561
},
{
"epoch": 0.2821521211614091,
"grad_norm": 0.3758324086666107,
"learning_rate": 7.040145113154612e-05,
"loss": 1.0008,
"step": 562
},
{
"epoch": 0.28265417119906283,
"grad_norm": 0.3609547019004822,
"learning_rate": 7.030450738438553e-05,
"loss": 0.9903,
"step": 563
},
{
"epoch": 0.2831562212367166,
"grad_norm": 0.38031235337257385,
"learning_rate": 7.020747214828221e-05,
"loss": 1.0049,
"step": 564
},
{
"epoch": 0.2836582712743703,
"grad_norm": 0.38092276453971863,
"learning_rate": 7.011034586046176e-05,
"loss": 1.0064,
"step": 565
},
{
"epoch": 0.2841603213120241,
"grad_norm": 0.37201064825057983,
"learning_rate": 7.001312895856011e-05,
"loss": 1.034,
"step": 566
},
{
"epoch": 0.28466237134967787,
"grad_norm": 0.3920980989933014,
"learning_rate": 6.991582188062143e-05,
"loss": 1.0447,
"step": 567
},
{
"epoch": 0.2851644213873316,
"grad_norm": 0.3509131669998169,
"learning_rate": 6.981842506509625e-05,
"loss": 0.9887,
"step": 568
},
{
"epoch": 0.28566647142498536,
"grad_norm": 0.36149969696998596,
"learning_rate": 6.972093895083945e-05,
"loss": 1.0549,
"step": 569
},
{
"epoch": 0.2861685214626391,
"grad_norm": 0.3768817186355591,
"learning_rate": 6.962336397710819e-05,
"loss": 1.0034,
"step": 570
},
{
"epoch": 0.28667057150029285,
"grad_norm": 0.37715521454811096,
"learning_rate": 6.952570058356013e-05,
"loss": 1.0081,
"step": 571
},
{
"epoch": 0.2871726215379466,
"grad_norm": 0.35239478945732117,
"learning_rate": 6.942794921025126e-05,
"loss": 0.9283,
"step": 572
},
{
"epoch": 0.2876746715756004,
"grad_norm": 0.34368762373924255,
"learning_rate": 6.933011029763405e-05,
"loss": 0.9346,
"step": 573
},
{
"epoch": 0.28817672161325414,
"grad_norm": 0.3795548677444458,
"learning_rate": 6.923218428655534e-05,
"loss": 0.9778,
"step": 574
},
{
"epoch": 0.2886787716509079,
"grad_norm": 0.3852332830429077,
"learning_rate": 6.91341716182545e-05,
"loss": 0.9668,
"step": 575
},
{
"epoch": 0.28918082168856163,
"grad_norm": 0.37631848454475403,
"learning_rate": 6.903607273436128e-05,
"loss": 0.9594,
"step": 576
},
{
"epoch": 0.2896828717262154,
"grad_norm": 0.3791573941707611,
"learning_rate": 6.893788807689396e-05,
"loss": 0.916,
"step": 577
},
{
"epoch": 0.2901849217638691,
"grad_norm": 0.3761579096317291,
"learning_rate": 6.883961808825732e-05,
"loss": 0.9475,
"step": 578
},
{
"epoch": 0.29068697180152286,
"grad_norm": 0.3986110985279083,
"learning_rate": 6.874126321124058e-05,
"loss": 0.9524,
"step": 579
},
{
"epoch": 0.29118902183917666,
"grad_norm": 0.38280758261680603,
"learning_rate": 6.864282388901544e-05,
"loss": 0.9335,
"step": 580
},
{
"epoch": 0.2916910718768304,
"grad_norm": 0.41545820236206055,
"learning_rate": 6.854430056513417e-05,
"loss": 0.9306,
"step": 581
},
{
"epoch": 0.29219312191448416,
"grad_norm": 0.40962398052215576,
"learning_rate": 6.844569368352748e-05,
"loss": 0.9019,
"step": 582
},
{
"epoch": 0.2926951719521379,
"grad_norm": 0.3950769901275635,
"learning_rate": 6.83470036885026e-05,
"loss": 0.8951,
"step": 583
},
{
"epoch": 0.29319722198979165,
"grad_norm": 0.4112852215766907,
"learning_rate": 6.824823102474128e-05,
"loss": 0.9652,
"step": 584
},
{
"epoch": 0.2936992720274454,
"grad_norm": 0.4129278361797333,
"learning_rate": 6.814937613729766e-05,
"loss": 0.9319,
"step": 585
},
{
"epoch": 0.29420132206509914,
"grad_norm": 0.42486321926116943,
"learning_rate": 6.805043947159651e-05,
"loss": 0.9717,
"step": 586
},
{
"epoch": 0.2947033721027529,
"grad_norm": 0.42396050691604614,
"learning_rate": 6.795142147343101e-05,
"loss": 0.938,
"step": 587
},
{
"epoch": 0.2952054221404067,
"grad_norm": 0.4266931116580963,
"learning_rate": 6.785232258896077e-05,
"loss": 0.9092,
"step": 588
},
{
"epoch": 0.2957074721780604,
"grad_norm": 0.4176103472709656,
"learning_rate": 6.775314326470992e-05,
"loss": 0.8908,
"step": 589
},
{
"epoch": 0.29620952221571417,
"grad_norm": 0.4264911413192749,
"learning_rate": 6.765388394756504e-05,
"loss": 0.801,
"step": 590
},
{
"epoch": 0.2967115722533679,
"grad_norm": 0.45049166679382324,
"learning_rate": 6.755454508477312e-05,
"loss": 0.8206,
"step": 591
},
{
"epoch": 0.29721362229102166,
"grad_norm": 0.4606582820415497,
"learning_rate": 6.745512712393957e-05,
"loss": 0.8618,
"step": 592
},
{
"epoch": 0.2977156723286754,
"grad_norm": 0.4533351957798004,
"learning_rate": 6.735563051302622e-05,
"loss": 0.8264,
"step": 593
},
{
"epoch": 0.29821772236632915,
"grad_norm": 0.48195162415504456,
"learning_rate": 6.725605570034929e-05,
"loss": 0.8726,
"step": 594
},
{
"epoch": 0.29871977240398295,
"grad_norm": 0.5006521344184875,
"learning_rate": 6.715640313457733e-05,
"loss": 0.8731,
"step": 595
},
{
"epoch": 0.2992218224416367,
"grad_norm": 0.5022867321968079,
"learning_rate": 6.705667326472925e-05,
"loss": 0.8804,
"step": 596
},
{
"epoch": 0.29972387247929044,
"grad_norm": 0.4990813136100769,
"learning_rate": 6.69568665401723e-05,
"loss": 0.8631,
"step": 597
},
{
"epoch": 0.3002259225169442,
"grad_norm": 0.5281215906143188,
"learning_rate": 6.685698341062002e-05,
"loss": 0.8227,
"step": 598
},
{
"epoch": 0.30072797255459793,
"grad_norm": 0.5664414167404175,
"learning_rate": 6.67570243261302e-05,
"loss": 0.8378,
"step": 599
},
{
"epoch": 0.3012300225922517,
"grad_norm": 0.6653980612754822,
"learning_rate": 6.665698973710288e-05,
"loss": 0.8032,
"step": 600
},
{
"epoch": 0.3012300225922517,
"eval_loss": 0.9845598936080933,
"eval_runtime": 710.1126,
"eval_samples_per_second": 21.26,
"eval_steps_per_second": 2.659,
"step": 600
},
{
"epoch": 0.3017320726299054,
"grad_norm": 0.445065438747406,
"learning_rate": 6.655688009427832e-05,
"loss": 1.2529,
"step": 601
},
{
"epoch": 0.3022341226675592,
"grad_norm": 0.39952167868614197,
"learning_rate": 6.645669584873494e-05,
"loss": 1.2194,
"step": 602
},
{
"epoch": 0.30273617270521297,
"grad_norm": 0.403266042470932,
"learning_rate": 6.635643745188734e-05,
"loss": 1.2289,
"step": 603
},
{
"epoch": 0.3032382227428667,
"grad_norm": 0.38917073607444763,
"learning_rate": 6.625610535548418e-05,
"loss": 1.1336,
"step": 604
},
{
"epoch": 0.30374027278052046,
"grad_norm": 0.4072120785713196,
"learning_rate": 6.615570001160626e-05,
"loss": 1.0642,
"step": 605
},
{
"epoch": 0.3042423228181742,
"grad_norm": 0.4204983711242676,
"learning_rate": 6.605522187266441e-05,
"loss": 1.0719,
"step": 606
},
{
"epoch": 0.30474437285582795,
"grad_norm": 0.39132463932037354,
"learning_rate": 6.595467139139743e-05,
"loss": 1.0398,
"step": 607
},
{
"epoch": 0.3052464228934817,
"grad_norm": 0.35773175954818726,
"learning_rate": 6.585404902087011e-05,
"loss": 1.0631,
"step": 608
},
{
"epoch": 0.3057484729311355,
"grad_norm": 0.36051151156425476,
"learning_rate": 6.575335521447114e-05,
"loss": 1.04,
"step": 609
},
{
"epoch": 0.30625052296878924,
"grad_norm": 0.36739856004714966,
"learning_rate": 6.565259042591113e-05,
"loss": 1.0239,
"step": 610
},
{
"epoch": 0.306752573006443,
"grad_norm": 0.3616657853126526,
"learning_rate": 6.555175510922047e-05,
"loss": 1.0545,
"step": 611
},
{
"epoch": 0.30725462304409673,
"grad_norm": 0.3667794167995453,
"learning_rate": 6.545084971874738e-05,
"loss": 0.9624,
"step": 612
},
{
"epoch": 0.3077566730817505,
"grad_norm": 0.3631950318813324,
"learning_rate": 6.53498747091558e-05,
"loss": 1.0004,
"step": 613
},
{
"epoch": 0.3082587231194042,
"grad_norm": 0.35089895129203796,
"learning_rate": 6.524883053542339e-05,
"loss": 1.0094,
"step": 614
},
{
"epoch": 0.30876077315705797,
"grad_norm": 0.38375306129455566,
"learning_rate": 6.514771765283942e-05,
"loss": 1.018,
"step": 615
},
{
"epoch": 0.30926282319471177,
"grad_norm": 0.3634318709373474,
"learning_rate": 6.504653651700278e-05,
"loss": 1.0375,
"step": 616
},
{
"epoch": 0.3097648732323655,
"grad_norm": 0.3617091774940491,
"learning_rate": 6.494528758381984e-05,
"loss": 1.0412,
"step": 617
},
{
"epoch": 0.31026692327001926,
"grad_norm": 0.3729401230812073,
"learning_rate": 6.484397130950254e-05,
"loss": 1.0327,
"step": 618
},
{
"epoch": 0.310768973307673,
"grad_norm": 0.3525683581829071,
"learning_rate": 6.474258815056622e-05,
"loss": 1.0164,
"step": 619
},
{
"epoch": 0.31127102334532675,
"grad_norm": 0.3672581911087036,
"learning_rate": 6.464113856382752e-05,
"loss": 1.0148,
"step": 620
},
{
"epoch": 0.3117730733829805,
"grad_norm": 0.3790574371814728,
"learning_rate": 6.453962300640249e-05,
"loss": 0.9997,
"step": 621
},
{
"epoch": 0.31227512342063424,
"grad_norm": 0.36040011048316956,
"learning_rate": 6.44380419357044e-05,
"loss": 0.9505,
"step": 622
},
{
"epoch": 0.312777173458288,
"grad_norm": 0.3569061756134033,
"learning_rate": 6.43363958094417e-05,
"loss": 0.9429,
"step": 623
},
{
"epoch": 0.3132792234959418,
"grad_norm": 0.36146458983421326,
"learning_rate": 6.423468508561599e-05,
"loss": 0.9924,
"step": 624
},
{
"epoch": 0.31378127353359553,
"grad_norm": 0.37957096099853516,
"learning_rate": 6.413291022251989e-05,
"loss": 0.9934,
"step": 625
},
{
"epoch": 0.3142833235712493,
"grad_norm": 0.37144365906715393,
"learning_rate": 6.403107167873509e-05,
"loss": 0.9251,
"step": 626
},
{
"epoch": 0.314785373608903,
"grad_norm": 0.3828261196613312,
"learning_rate": 6.392916991313016e-05,
"loss": 0.9649,
"step": 627
},
{
"epoch": 0.31528742364655676,
"grad_norm": 0.3864898681640625,
"learning_rate": 6.382720538485856e-05,
"loss": 0.9834,
"step": 628
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.3928738832473755,
"learning_rate": 6.372517855335655e-05,
"loss": 0.9759,
"step": 629
},
{
"epoch": 0.31629152372186425,
"grad_norm": 0.42996037006378174,
"learning_rate": 6.362308987834115e-05,
"loss": 0.9628,
"step": 630
},
{
"epoch": 0.31679357375951805,
"grad_norm": 0.3807196319103241,
"learning_rate": 6.352093981980796e-05,
"loss": 0.9842,
"step": 631
},
{
"epoch": 0.3172956237971718,
"grad_norm": 0.39248624444007874,
"learning_rate": 6.341872883802923e-05,
"loss": 0.9539,
"step": 632
},
{
"epoch": 0.31779767383482554,
"grad_norm": 0.4059353470802307,
"learning_rate": 6.331645739355168e-05,
"loss": 0.9635,
"step": 633
},
{
"epoch": 0.3182997238724793,
"grad_norm": 0.4235178828239441,
"learning_rate": 6.321412594719451e-05,
"loss": 0.9473,
"step": 634
},
{
"epoch": 0.31880177391013304,
"grad_norm": 0.45633211731910706,
"learning_rate": 6.311173496004723e-05,
"loss": 0.9836,
"step": 635
},
{
"epoch": 0.3193038239477868,
"grad_norm": 0.4051073491573334,
"learning_rate": 6.300928489346766e-05,
"loss": 0.9482,
"step": 636
},
{
"epoch": 0.3198058739854405,
"grad_norm": 0.4133238196372986,
"learning_rate": 6.290677620907982e-05,
"loss": 0.9009,
"step": 637
},
{
"epoch": 0.3203079240230943,
"grad_norm": 0.4294078052043915,
"learning_rate": 6.280420936877188e-05,
"loss": 0.9389,
"step": 638
},
{
"epoch": 0.32080997406074807,
"grad_norm": 0.4092111885547638,
"learning_rate": 6.270158483469397e-05,
"loss": 0.8397,
"step": 639
},
{
"epoch": 0.3213120240984018,
"grad_norm": 0.42124441266059875,
"learning_rate": 6.259890306925627e-05,
"loss": 0.8405,
"step": 640
},
{
"epoch": 0.32181407413605556,
"grad_norm": 0.4422035217285156,
"learning_rate": 6.249616453512677e-05,
"loss": 0.8641,
"step": 641
},
{
"epoch": 0.3223161241737093,
"grad_norm": 0.4448348879814148,
"learning_rate": 6.239336969522932e-05,
"loss": 0.9077,
"step": 642
},
{
"epoch": 0.32281817421136305,
"grad_norm": 0.4691510796546936,
"learning_rate": 6.229051901274137e-05,
"loss": 0.8585,
"step": 643
},
{
"epoch": 0.3233202242490168,
"grad_norm": 0.4641557037830353,
"learning_rate": 6.218761295109208e-05,
"loss": 0.8527,
"step": 644
},
{
"epoch": 0.3238222742866706,
"grad_norm": 0.5288779735565186,
"learning_rate": 6.208465197396013e-05,
"loss": 0.8489,
"step": 645
},
{
"epoch": 0.32432432432432434,
"grad_norm": 0.45869073271751404,
"learning_rate": 6.19816365452716e-05,
"loss": 0.8505,
"step": 646
},
{
"epoch": 0.3248263743619781,
"grad_norm": 0.49422523379325867,
"learning_rate": 6.187856712919795e-05,
"loss": 0.8555,
"step": 647
},
{
"epoch": 0.32532842439963183,
"grad_norm": 0.5668922066688538,
"learning_rate": 6.177544419015388e-05,
"loss": 0.7629,
"step": 648
},
{
"epoch": 0.3258304744372856,
"grad_norm": 0.5716300010681152,
"learning_rate": 6.167226819279528e-05,
"loss": 0.8643,
"step": 649
},
{
"epoch": 0.3263325244749393,
"grad_norm": 0.6652288436889648,
"learning_rate": 6.156903960201709e-05,
"loss": 0.7433,
"step": 650
},
{
"epoch": 0.32683457451259307,
"grad_norm": 0.6001056432723999,
"learning_rate": 6.146575888295123e-05,
"loss": 1.2497,
"step": 651
},
{
"epoch": 0.32733662455024687,
"grad_norm": 0.3522529900074005,
"learning_rate": 6.136242650096451e-05,
"loss": 1.177,
"step": 652
},
{
"epoch": 0.3278386745879006,
"grad_norm": 0.3846982717514038,
"learning_rate": 6.125904292165652e-05,
"loss": 1.1357,
"step": 653
},
{
"epoch": 0.32834072462555436,
"grad_norm": 0.389482706785202,
"learning_rate": 6.115560861085756e-05,
"loss": 1.0675,
"step": 654
},
{
"epoch": 0.3288427746632081,
"grad_norm": 0.41399508714675903,
"learning_rate": 6.105212403462651e-05,
"loss": 1.1065,
"step": 655
},
{
"epoch": 0.32934482470086185,
"grad_norm": 0.5792128443717957,
"learning_rate": 6.0948589659248654e-05,
"loss": 1.1188,
"step": 656
},
{
"epoch": 0.3298468747385156,
"grad_norm": 0.3753111958503723,
"learning_rate": 6.084500595123383e-05,
"loss": 1.1127,
"step": 657
},
{
"epoch": 0.33034892477616934,
"grad_norm": 0.3663425147533417,
"learning_rate": 6.0741373377314005e-05,
"loss": 1.019,
"step": 658
},
{
"epoch": 0.3308509748138231,
"grad_norm": 0.39105096459388733,
"learning_rate": 6.0637692404441416e-05,
"loss": 1.0186,
"step": 659
},
{
"epoch": 0.3313530248514769,
"grad_norm": 0.38673144578933716,
"learning_rate": 6.0533963499786314e-05,
"loss": 1.0256,
"step": 660
},
{
"epoch": 0.33185507488913063,
"grad_norm": 0.3633407950401306,
"learning_rate": 6.0430187130735016e-05,
"loss": 1.0332,
"step": 661
},
{
"epoch": 0.3323571249267844,
"grad_norm": 0.35200172662734985,
"learning_rate": 6.032636376488763e-05,
"loss": 0.9356,
"step": 662
},
{
"epoch": 0.3328591749644381,
"grad_norm": 0.3665078282356262,
"learning_rate": 6.0222493870056044e-05,
"loss": 1.0154,
"step": 663
},
{
"epoch": 0.33336122500209187,
"grad_norm": 0.3591248095035553,
"learning_rate": 6.0118577914261784e-05,
"loss": 0.9798,
"step": 664
},
{
"epoch": 0.3338632750397456,
"grad_norm": 0.361217200756073,
"learning_rate": 6.001461636573397e-05,
"loss": 0.9813,
"step": 665
},
{
"epoch": 0.33436532507739936,
"grad_norm": 0.37569659948349,
"learning_rate": 5.99106096929071e-05,
"loss": 1.011,
"step": 666
},
{
"epoch": 0.33486737511505316,
"grad_norm": 0.3692183494567871,
"learning_rate": 5.980655836441902e-05,
"loss": 1.0294,
"step": 667
},
{
"epoch": 0.3353694251527069,
"grad_norm": 0.374726802110672,
"learning_rate": 5.970246284910876e-05,
"loss": 0.9654,
"step": 668
},
{
"epoch": 0.33587147519036065,
"grad_norm": 0.3687571585178375,
"learning_rate": 5.959832361601453e-05,
"loss": 1.0423,
"step": 669
},
{
"epoch": 0.3363735252280144,
"grad_norm": 0.36362433433532715,
"learning_rate": 5.949414113437142e-05,
"loss": 0.8874,
"step": 670
},
{
"epoch": 0.33687557526566814,
"grad_norm": 0.34844672679901123,
"learning_rate": 5.938991587360946e-05,
"loss": 0.8979,
"step": 671
},
{
"epoch": 0.3373776253033219,
"grad_norm": 0.3646034598350525,
"learning_rate": 5.9285648303351404e-05,
"loss": 0.9435,
"step": 672
},
{
"epoch": 0.3378796753409756,
"grad_norm": 0.37094947695732117,
"learning_rate": 5.9181338893410663e-05,
"loss": 0.9679,
"step": 673
},
{
"epoch": 0.3383817253786294,
"grad_norm": 0.385873943567276,
"learning_rate": 5.907698811378919e-05,
"loss": 0.9898,
"step": 674
},
{
"epoch": 0.3388837754162832,
"grad_norm": 0.38623571395874023,
"learning_rate": 5.897259643467527e-05,
"loss": 0.987,
"step": 675
},
{
"epoch": 0.3393858254539369,
"grad_norm": 0.3703857362270355,
"learning_rate": 5.8868164326441546e-05,
"loss": 0.919,
"step": 676
},
{
"epoch": 0.33988787549159066,
"grad_norm": 0.3874402344226837,
"learning_rate": 5.876369225964283e-05,
"loss": 0.959,
"step": 677
},
{
"epoch": 0.3403899255292444,
"grad_norm": 0.37169700860977173,
"learning_rate": 5.8659180705013936e-05,
"loss": 0.9883,
"step": 678
},
{
"epoch": 0.34089197556689815,
"grad_norm": 0.4187929332256317,
"learning_rate": 5.8554630133467624e-05,
"loss": 0.9527,
"step": 679
},
{
"epoch": 0.3413940256045519,
"grad_norm": 0.39550694823265076,
"learning_rate": 5.8450041016092464e-05,
"loss": 0.9152,
"step": 680
},
{
"epoch": 0.3418960756422057,
"grad_norm": 0.40294429659843445,
"learning_rate": 5.83454138241507e-05,
"loss": 0.95,
"step": 681
},
{
"epoch": 0.34239812567985944,
"grad_norm": 0.38999685645103455,
"learning_rate": 5.8240749029076134e-05,
"loss": 0.9475,
"step": 682
},
{
"epoch": 0.3429001757175132,
"grad_norm": 0.40788596868515015,
"learning_rate": 5.8136047102472e-05,
"loss": 1.01,
"step": 683
},
{
"epoch": 0.34340222575516693,
"grad_norm": 0.4204280972480774,
"learning_rate": 5.803130851610886e-05,
"loss": 0.934,
"step": 684
},
{
"epoch": 0.3439042757928207,
"grad_norm": 0.4102809429168701,
"learning_rate": 5.792653374192245e-05,
"loss": 0.9398,
"step": 685
},
{
"epoch": 0.3444063258304744,
"grad_norm": 0.4025559723377228,
"learning_rate": 5.782172325201155e-05,
"loss": 0.9245,
"step": 686
},
{
"epoch": 0.34490837586812817,
"grad_norm": 0.4101907014846802,
"learning_rate": 5.771687751863587e-05,
"loss": 0.9279,
"step": 687
},
{
"epoch": 0.34541042590578197,
"grad_norm": 0.43221110105514526,
"learning_rate": 5.761199701421391e-05,
"loss": 0.8831,
"step": 688
},
{
"epoch": 0.3459124759434357,
"grad_norm": 0.42259782552719116,
"learning_rate": 5.750708221132092e-05,
"loss": 0.8903,
"step": 689
},
{
"epoch": 0.34641452598108946,
"grad_norm": 0.4195202887058258,
"learning_rate": 5.7402133582686576e-05,
"loss": 0.8291,
"step": 690
},
{
"epoch": 0.3469165760187432,
"grad_norm": 0.4531534016132355,
"learning_rate": 5.7297151601193056e-05,
"loss": 0.8893,
"step": 691
},
{
"epoch": 0.34741862605639695,
"grad_norm": 0.46428826451301575,
"learning_rate": 5.719213673987277e-05,
"loss": 0.9049,
"step": 692
},
{
"epoch": 0.3479206760940507,
"grad_norm": 0.4338727295398712,
"learning_rate": 5.708708947190634e-05,
"loss": 0.8142,
"step": 693
},
{
"epoch": 0.34842272613170444,
"grad_norm": 0.44543692469596863,
"learning_rate": 5.698201027062034e-05,
"loss": 0.8463,
"step": 694
},
{
"epoch": 0.3489247761693582,
"grad_norm": 0.4769425094127655,
"learning_rate": 5.6876899609485256e-05,
"loss": 0.8931,
"step": 695
},
{
"epoch": 0.349426826207012,
"grad_norm": 0.49232223629951477,
"learning_rate": 5.6771757962113323e-05,
"loss": 0.8189,
"step": 696
},
{
"epoch": 0.34992887624466573,
"grad_norm": 0.49148690700531006,
"learning_rate": 5.666658580225643e-05,
"loss": 0.8153,
"step": 697
},
{
"epoch": 0.3504309262823195,
"grad_norm": 0.5055503845214844,
"learning_rate": 5.656138360380391e-05,
"loss": 0.8018,
"step": 698
},
{
"epoch": 0.3509329763199732,
"grad_norm": 0.5481170415878296,
"learning_rate": 5.645615184078044e-05,
"loss": 0.8587,
"step": 699
},
{
"epoch": 0.35143502635762697,
"grad_norm": 0.6615381240844727,
"learning_rate": 5.6350890987343944e-05,
"loss": 0.777,
"step": 700
},
{
"epoch": 0.3519370763952807,
"grad_norm": 0.434299111366272,
"learning_rate": 5.6245601517783406e-05,
"loss": 1.2088,
"step": 701
},
{
"epoch": 0.35243912643293446,
"grad_norm": 0.39533907175064087,
"learning_rate": 5.614028390651675e-05,
"loss": 1.1814,
"step": 702
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.3828687369823456,
"learning_rate": 5.6034938628088705e-05,
"loss": 1.1873,
"step": 703
},
{
"epoch": 0.353443226508242,
"grad_norm": 0.3660382628440857,
"learning_rate": 5.5929566157168665e-05,
"loss": 1.0862,
"step": 704
},
{
"epoch": 0.35394527654589575,
"grad_norm": 0.39876964688301086,
"learning_rate": 5.582416696854853e-05,
"loss": 1.0083,
"step": 705
},
{
"epoch": 0.3544473265835495,
"grad_norm": 0.409247487783432,
"learning_rate": 5.571874153714063e-05,
"loss": 1.0714,
"step": 706
},
{
"epoch": 0.35494937662120324,
"grad_norm": 0.3872778117656708,
"learning_rate": 5.561329033797547e-05,
"loss": 1.085,
"step": 707
},
{
"epoch": 0.355451426658857,
"grad_norm": 0.38185930252075195,
"learning_rate": 5.550781384619973e-05,
"loss": 1.0762,
"step": 708
},
{
"epoch": 0.35595347669651073,
"grad_norm": 0.3866881728172302,
"learning_rate": 5.540231253707403e-05,
"loss": 1.0326,
"step": 709
},
{
"epoch": 0.35645552673416453,
"grad_norm": 0.37910160422325134,
"learning_rate": 5.5296786885970805e-05,
"loss": 1.0769,
"step": 710
},
{
"epoch": 0.3569575767718183,
"grad_norm": 0.3608991205692291,
"learning_rate": 5.519123736837217e-05,
"loss": 1.0523,
"step": 711
},
{
"epoch": 0.357459626809472,
"grad_norm": 0.36697694659233093,
"learning_rate": 5.50856644598678e-05,
"loss": 0.9778,
"step": 712
},
{
"epoch": 0.35796167684712576,
"grad_norm": 0.4545275568962097,
"learning_rate": 5.498006863615275e-05,
"loss": 1.0207,
"step": 713
},
{
"epoch": 0.3584637268847795,
"grad_norm": 0.3483712375164032,
"learning_rate": 5.487445037302531e-05,
"loss": 1.0002,
"step": 714
},
{
"epoch": 0.35896577692243326,
"grad_norm": 0.3665158152580261,
"learning_rate": 5.476881014638491e-05,
"loss": 1.0274,
"step": 715
},
{
"epoch": 0.359467826960087,
"grad_norm": 0.35564157366752625,
"learning_rate": 5.466314843222993e-05,
"loss": 0.9884,
"step": 716
},
{
"epoch": 0.3599698769977408,
"grad_norm": 0.3559761345386505,
"learning_rate": 5.4557465706655564e-05,
"loss": 1.0143,
"step": 717
},
{
"epoch": 0.36047192703539455,
"grad_norm": 0.38508090376853943,
"learning_rate": 5.4451762445851705e-05,
"loss": 1.0679,
"step": 718
},
{
"epoch": 0.3609739770730483,
"grad_norm": 0.3513292670249939,
"learning_rate": 5.4346039126100733e-05,
"loss": 0.948,
"step": 719
},
{
"epoch": 0.36147602711070204,
"grad_norm": 0.36502474546432495,
"learning_rate": 5.4240296223775465e-05,
"loss": 1.0246,
"step": 720
},
{
"epoch": 0.3619780771483558,
"grad_norm": 0.3846004605293274,
"learning_rate": 5.41345342153369e-05,
"loss": 1.0332,
"step": 721
},
{
"epoch": 0.3624801271860095,
"grad_norm": 0.35061997175216675,
"learning_rate": 5.4028753577332146e-05,
"loss": 0.9286,
"step": 722
},
{
"epoch": 0.36298217722366327,
"grad_norm": 0.37235984206199646,
"learning_rate": 5.392295478639225e-05,
"loss": 1.0385,
"step": 723
},
{
"epoch": 0.36348422726131707,
"grad_norm": 0.3770149350166321,
"learning_rate": 5.3817138319230076e-05,
"loss": 0.9865,
"step": 724
},
{
"epoch": 0.3639862772989708,
"grad_norm": 0.3904590606689453,
"learning_rate": 5.3711304652638126e-05,
"loss": 0.934,
"step": 725
},
{
"epoch": 0.36448832733662456,
"grad_norm": 0.3823120892047882,
"learning_rate": 5.360545426348638e-05,
"loss": 0.9394,
"step": 726
},
{
"epoch": 0.3649903773742783,
"grad_norm": 0.36231666803359985,
"learning_rate": 5.349958762872016e-05,
"loss": 0.9282,
"step": 727
},
{
"epoch": 0.36549242741193205,
"grad_norm": 0.3757944405078888,
"learning_rate": 5.3393705225358046e-05,
"loss": 0.8884,
"step": 728
},
{
"epoch": 0.3659944774495858,
"grad_norm": 0.4007607102394104,
"learning_rate": 5.32878075304896e-05,
"loss": 0.9739,
"step": 729
},
{
"epoch": 0.36649652748723954,
"grad_norm": 0.40476924180984497,
"learning_rate": 5.318189502127332e-05,
"loss": 0.9458,
"step": 730
},
{
"epoch": 0.3669985775248933,
"grad_norm": 0.39884302020072937,
"learning_rate": 5.307596817493445e-05,
"loss": 0.8989,
"step": 731
},
{
"epoch": 0.3675006275625471,
"grad_norm": 0.42604318261146545,
"learning_rate": 5.297002746876284e-05,
"loss": 0.9337,
"step": 732
},
{
"epoch": 0.36800267760020083,
"grad_norm": 0.41235285997390747,
"learning_rate": 5.286407338011079e-05,
"loss": 0.9191,
"step": 733
},
{
"epoch": 0.3685047276378546,
"grad_norm": 0.40768033266067505,
"learning_rate": 5.275810638639088e-05,
"loss": 0.957,
"step": 734
},
{
"epoch": 0.3690067776755083,
"grad_norm": 0.42073965072631836,
"learning_rate": 5.265212696507387e-05,
"loss": 0.9503,
"step": 735
},
{
"epoch": 0.36950882771316207,
"grad_norm": 0.40175575017929077,
"learning_rate": 5.254613559368649e-05,
"loss": 0.9277,
"step": 736
},
{
"epoch": 0.3700108777508158,
"grad_norm": 0.39959418773651123,
"learning_rate": 5.2440132749809313e-05,
"loss": 0.9021,
"step": 737
},
{
"epoch": 0.37051292778846956,
"grad_norm": 0.45893776416778564,
"learning_rate": 5.2334118911074635e-05,
"loss": 0.9413,
"step": 738
},
{
"epoch": 0.37101497782612336,
"grad_norm": 0.4203508794307709,
"learning_rate": 5.2228094555164265e-05,
"loss": 0.9131,
"step": 739
},
{
"epoch": 0.3715170278637771,
"grad_norm": 0.4097796082496643,
"learning_rate": 5.212206015980742e-05,
"loss": 0.881,
"step": 740
},
{
"epoch": 0.37201907790143085,
"grad_norm": 0.44615375995635986,
"learning_rate": 5.201601620277854e-05,
"loss": 0.8147,
"step": 741
},
{
"epoch": 0.3725211279390846,
"grad_norm": 0.4491327702999115,
"learning_rate": 5.190996316189515e-05,
"loss": 0.8368,
"step": 742
},
{
"epoch": 0.37302317797673834,
"grad_norm": 0.4489690065383911,
"learning_rate": 5.180390151501569e-05,
"loss": 0.9062,
"step": 743
},
{
"epoch": 0.3735252280143921,
"grad_norm": 0.4554278552532196,
"learning_rate": 5.1697831740037436e-05,
"loss": 0.841,
"step": 744
},
{
"epoch": 0.37402727805204583,
"grad_norm": 0.4591432213783264,
"learning_rate": 5.159175431489424e-05,
"loss": 0.8241,
"step": 745
},
{
"epoch": 0.37452932808969963,
"grad_norm": 0.4552235007286072,
"learning_rate": 5.1485669717554396e-05,
"loss": 0.7784,
"step": 746
},
{
"epoch": 0.3750313781273534,
"grad_norm": 0.4900113046169281,
"learning_rate": 5.137957842601856e-05,
"loss": 0.7905,
"step": 747
},
{
"epoch": 0.3755334281650071,
"grad_norm": 0.5452777743339539,
"learning_rate": 5.1273480918317554e-05,
"loss": 0.8248,
"step": 748
},
{
"epoch": 0.37603547820266087,
"grad_norm": 0.5230666399002075,
"learning_rate": 5.116737767251021e-05,
"loss": 0.781,
"step": 749
},
{
"epoch": 0.3765375282403146,
"grad_norm": 0.632352352142334,
"learning_rate": 5.1061269166681183e-05,
"loss": 0.7272,
"step": 750
},
{
"epoch": 0.3765375282403146,
"eval_loss": 0.9616905450820923,
"eval_runtime": 709.9548,
"eval_samples_per_second": 21.265,
"eval_steps_per_second": 2.659,
"step": 750
},
{
"epoch": 0.37703957827796836,
"grad_norm": 0.4330700933933258,
"learning_rate": 5.095515587893884e-05,
"loss": 1.2318,
"step": 751
},
{
"epoch": 0.3775416283156221,
"grad_norm": 0.3779419958591461,
"learning_rate": 5.084903828741312e-05,
"loss": 1.2228,
"step": 752
},
{
"epoch": 0.3780436783532759,
"grad_norm": 0.376594603061676,
"learning_rate": 5.0742916870253334e-05,
"loss": 1.1351,
"step": 753
},
{
"epoch": 0.37854572839092965,
"grad_norm": 0.3838042914867401,
"learning_rate": 5.063679210562602e-05,
"loss": 1.1161,
"step": 754
},
{
"epoch": 0.3790477784285834,
"grad_norm": 0.37450775504112244,
"learning_rate": 5.053066447171282e-05,
"loss": 1.0012,
"step": 755
},
{
"epoch": 0.37954982846623714,
"grad_norm": 0.37315741181373596,
"learning_rate": 5.042453444670828e-05,
"loss": 1.1146,
"step": 756
},
{
"epoch": 0.3800518785038909,
"grad_norm": 0.3619626760482788,
"learning_rate": 5.031840250881776e-05,
"loss": 1.0954,
"step": 757
},
{
"epoch": 0.38055392854154463,
"grad_norm": 0.3665991723537445,
"learning_rate": 5.021226913625522e-05,
"loss": 1.0704,
"step": 758
},
{
"epoch": 0.3810559785791984,
"grad_norm": 0.3833234906196594,
"learning_rate": 5.0106134807241045e-05,
"loss": 1.0973,
"step": 759
},
{
"epoch": 0.3815580286168522,
"grad_norm": 0.37826788425445557,
"learning_rate": 5e-05,
"loss": 1.016,
"step": 760
},
{
"epoch": 0.3820600786545059,
"grad_norm": 0.3752281665802002,
"learning_rate": 4.989386519275895e-05,
"loss": 1.0214,
"step": 761
},
{
"epoch": 0.38256212869215966,
"grad_norm": 0.35231512784957886,
"learning_rate": 4.978773086374479e-05,
"loss": 0.9812,
"step": 762
},
{
"epoch": 0.3830641787298134,
"grad_norm": 0.34861356019973755,
"learning_rate": 4.968159749118223e-05,
"loss": 0.9588,
"step": 763
},
{
"epoch": 0.38356622876746715,
"grad_norm": 0.3637848198413849,
"learning_rate": 4.957546555329173e-05,
"loss": 0.9808,
"step": 764
},
{
"epoch": 0.3840682788051209,
"grad_norm": 0.38542938232421875,
"learning_rate": 4.94693355282872e-05,
"loss": 1.0052,
"step": 765
},
{
"epoch": 0.38457032884277464,
"grad_norm": 0.3675108850002289,
"learning_rate": 4.9363207894374e-05,
"loss": 0.9797,
"step": 766
},
{
"epoch": 0.3850723788804284,
"grad_norm": 0.3529476523399353,
"learning_rate": 4.925708312974667e-05,
"loss": 1.0427,
"step": 767
},
{
"epoch": 0.3855744289180822,
"grad_norm": 0.35466766357421875,
"learning_rate": 4.9150961712586895e-05,
"loss": 1.0076,
"step": 768
},
{
"epoch": 0.38607647895573594,
"grad_norm": 0.3574579358100891,
"learning_rate": 4.904484412106117e-05,
"loss": 1.0206,
"step": 769
},
{
"epoch": 0.3865785289933897,
"grad_norm": 0.35434436798095703,
"learning_rate": 4.893873083331882e-05,
"loss": 0.944,
"step": 770
},
{
"epoch": 0.3870805790310434,
"grad_norm": 0.37650713324546814,
"learning_rate": 4.88326223274898e-05,
"loss": 0.9769,
"step": 771
},
{
"epoch": 0.38758262906869717,
"grad_norm": 0.3571126461029053,
"learning_rate": 4.8726519081682444e-05,
"loss": 0.996,
"step": 772
},
{
"epoch": 0.3880846791063509,
"grad_norm": 0.3663455843925476,
"learning_rate": 4.862042157398146e-05,
"loss": 0.908,
"step": 773
},
{
"epoch": 0.38858672914400466,
"grad_norm": 0.380512535572052,
"learning_rate": 4.851433028244562e-05,
"loss": 1.0196,
"step": 774
},
{
"epoch": 0.38908877918165846,
"grad_norm": 0.38776859641075134,
"learning_rate": 4.840824568510579e-05,
"loss": 0.9251,
"step": 775
},
{
"epoch": 0.3895908292193122,
"grad_norm": 0.39721420407295227,
"learning_rate": 4.830216825996257e-05,
"loss": 0.9202,
"step": 776
},
{
"epoch": 0.39009287925696595,
"grad_norm": 0.3933786153793335,
"learning_rate": 4.8196098484984305e-05,
"loss": 0.944,
"step": 777
},
{
"epoch": 0.3905949292946197,
"grad_norm": 0.3744068741798401,
"learning_rate": 4.809003683810486e-05,
"loss": 0.9442,
"step": 778
},
{
"epoch": 0.39109697933227344,
"grad_norm": 0.39798104763031006,
"learning_rate": 4.798398379722147e-05,
"loss": 0.9739,
"step": 779
},
{
"epoch": 0.3915990293699272,
"grad_norm": 0.3898034691810608,
"learning_rate": 4.78779398401926e-05,
"loss": 0.9401,
"step": 780
},
{
"epoch": 0.39210107940758093,
"grad_norm": 0.3922993540763855,
"learning_rate": 4.777190544483574e-05,
"loss": 0.9504,
"step": 781
},
{
"epoch": 0.39260312944523473,
"grad_norm": 0.38821038603782654,
"learning_rate": 4.7665881088925376e-05,
"loss": 0.9617,
"step": 782
},
{
"epoch": 0.3931051794828885,
"grad_norm": 0.3955070674419403,
"learning_rate": 4.75598672501907e-05,
"loss": 0.9072,
"step": 783
},
{
"epoch": 0.3936072295205422,
"grad_norm": 0.38435256481170654,
"learning_rate": 4.7453864406313544e-05,
"loss": 0.9285,
"step": 784
},
{
"epoch": 0.39410927955819597,
"grad_norm": 0.40070778131484985,
"learning_rate": 4.734787303492615e-05,
"loss": 0.9422,
"step": 785
},
{
"epoch": 0.3946113295958497,
"grad_norm": 0.4178116023540497,
"learning_rate": 4.7241893613609126e-05,
"loss": 0.9361,
"step": 786
},
{
"epoch": 0.39511337963350346,
"grad_norm": 0.4187740087509155,
"learning_rate": 4.7135926619889226e-05,
"loss": 0.8883,
"step": 787
},
{
"epoch": 0.3956154296711572,
"grad_norm": 0.42808717489242554,
"learning_rate": 4.702997253123716e-05,
"loss": 0.8763,
"step": 788
},
{
"epoch": 0.396117479708811,
"grad_norm": 0.4418085813522339,
"learning_rate": 4.6924031825065566e-05,
"loss": 0.9475,
"step": 789
},
{
"epoch": 0.39661952974646475,
"grad_norm": 0.4347171485424042,
"learning_rate": 4.6818104978726685e-05,
"loss": 0.7853,
"step": 790
},
{
"epoch": 0.3971215797841185,
"grad_norm": 0.4366185665130615,
"learning_rate": 4.6712192469510425e-05,
"loss": 0.8485,
"step": 791
},
{
"epoch": 0.39762362982177224,
"grad_norm": 0.4427374601364136,
"learning_rate": 4.6606294774641966e-05,
"loss": 0.8737,
"step": 792
},
{
"epoch": 0.398125679859426,
"grad_norm": 0.4442150890827179,
"learning_rate": 4.6500412371279836e-05,
"loss": 0.8032,
"step": 793
},
{
"epoch": 0.39862772989707973,
"grad_norm": 0.4936541020870209,
"learning_rate": 4.6394545736513634e-05,
"loss": 0.8794,
"step": 794
},
{
"epoch": 0.3991297799347335,
"grad_norm": 0.47061917185783386,
"learning_rate": 4.628869534736187e-05,
"loss": 0.8568,
"step": 795
},
{
"epoch": 0.3996318299723872,
"grad_norm": 0.525748610496521,
"learning_rate": 4.618286168076993e-05,
"loss": 0.8513,
"step": 796
},
{
"epoch": 0.400133880010041,
"grad_norm": 0.4828825891017914,
"learning_rate": 4.607704521360776e-05,
"loss": 0.8328,
"step": 797
},
{
"epoch": 0.40063593004769477,
"grad_norm": 0.4649796187877655,
"learning_rate": 4.597124642266788e-05,
"loss": 0.7556,
"step": 798
},
{
"epoch": 0.4011379800853485,
"grad_norm": 0.5552456974983215,
"learning_rate": 4.5865465784663114e-05,
"loss": 0.8184,
"step": 799
},
{
"epoch": 0.40164003012300226,
"grad_norm": 0.706791341304779,
"learning_rate": 4.575970377622456e-05,
"loss": 0.7444,
"step": 800
},
{
"epoch": 0.402142080160656,
"grad_norm": 0.4323110282421112,
"learning_rate": 4.565396087389927e-05,
"loss": 1.1972,
"step": 801
},
{
"epoch": 0.40264413019830975,
"grad_norm": 0.354783833026886,
"learning_rate": 4.554823755414829e-05,
"loss": 1.1179,
"step": 802
},
{
"epoch": 0.4031461802359635,
"grad_norm": 0.3601534068584442,
"learning_rate": 4.544253429334444e-05,
"loss": 1.1264,
"step": 803
},
{
"epoch": 0.4036482302736173,
"grad_norm": 0.3654196858406067,
"learning_rate": 4.5336851567770076e-05,
"loss": 1.0834,
"step": 804
},
{
"epoch": 0.40415028031127104,
"grad_norm": 0.3873622715473175,
"learning_rate": 4.52311898536151e-05,
"loss": 1.0247,
"step": 805
},
{
"epoch": 0.4046523303489248,
"grad_norm": 0.37240368127822876,
"learning_rate": 4.5125549626974696e-05,
"loss": 1.0396,
"step": 806
},
{
"epoch": 0.4051543803865785,
"grad_norm": 0.36485597491264343,
"learning_rate": 4.5019931363847275e-05,
"loss": 1.0249,
"step": 807
},
{
"epoch": 0.4056564304242323,
"grad_norm": 0.38187476992607117,
"learning_rate": 4.491433554013221e-05,
"loss": 1.0405,
"step": 808
},
{
"epoch": 0.406158480461886,
"grad_norm": 0.36962300539016724,
"learning_rate": 4.480876263162783e-05,
"loss": 1.0253,
"step": 809
},
{
"epoch": 0.40666053049953976,
"grad_norm": 0.34921392798423767,
"learning_rate": 4.47032131140292e-05,
"loss": 1.016,
"step": 810
},
{
"epoch": 0.40716258053719356,
"grad_norm": 0.3537079691886902,
"learning_rate": 4.459768746292597e-05,
"loss": 1.0478,
"step": 811
},
{
"epoch": 0.4076646305748473,
"grad_norm": 0.3565637767314911,
"learning_rate": 4.449218615380029e-05,
"loss": 1.0148,
"step": 812
},
{
"epoch": 0.40816668061250105,
"grad_norm": 0.35647860169410706,
"learning_rate": 4.4386709662024544e-05,
"loss": 0.9924,
"step": 813
},
{
"epoch": 0.4086687306501548,
"grad_norm": 0.34907302260398865,
"learning_rate": 4.4281258462859396e-05,
"loss": 1.0018,
"step": 814
},
{
"epoch": 0.40917078068780854,
"grad_norm": 0.3495464026927948,
"learning_rate": 4.4175833031451473e-05,
"loss": 0.9449,
"step": 815
},
{
"epoch": 0.4096728307254623,
"grad_norm": 0.3409779369831085,
"learning_rate": 4.407043384283136e-05,
"loss": 0.9676,
"step": 816
},
{
"epoch": 0.41017488076311603,
"grad_norm": 0.3575940430164337,
"learning_rate": 4.396506137191131e-05,
"loss": 0.9863,
"step": 817
},
{
"epoch": 0.41067693080076983,
"grad_norm": 0.36198464035987854,
"learning_rate": 4.3859716093483245e-05,
"loss": 0.9905,
"step": 818
},
{
"epoch": 0.4111789808384236,
"grad_norm": 0.34198319911956787,
"learning_rate": 4.3754398482216606e-05,
"loss": 0.9482,
"step": 819
},
{
"epoch": 0.4116810308760773,
"grad_norm": 0.3572383224964142,
"learning_rate": 4.364910901265606e-05,
"loss": 0.934,
"step": 820
},
{
"epoch": 0.41218308091373107,
"grad_norm": 0.3588048219680786,
"learning_rate": 4.354384815921958e-05,
"loss": 0.9856,
"step": 821
},
{
"epoch": 0.4126851309513848,
"grad_norm": 0.3628753125667572,
"learning_rate": 4.343861639619611e-05,
"loss": 0.9762,
"step": 822
},
{
"epoch": 0.41318718098903856,
"grad_norm": 0.3723025321960449,
"learning_rate": 4.3333414197743595e-05,
"loss": 0.9704,
"step": 823
},
{
"epoch": 0.4136892310266923,
"grad_norm": 0.3608042597770691,
"learning_rate": 4.322824203788669e-05,
"loss": 0.951,
"step": 824
},
{
"epoch": 0.4141912810643461,
"grad_norm": 0.3752797245979309,
"learning_rate": 4.3123100390514756e-05,
"loss": 0.9878,
"step": 825
},
{
"epoch": 0.41469333110199985,
"grad_norm": 0.37421780824661255,
"learning_rate": 4.3017989729379675e-05,
"loss": 0.9776,
"step": 826
},
{
"epoch": 0.4151953811396536,
"grad_norm": 0.3613242506980896,
"learning_rate": 4.291291052809366e-05,
"loss": 0.9205,
"step": 827
},
{
"epoch": 0.41569743117730734,
"grad_norm": 0.3855215609073639,
"learning_rate": 4.280786326012723e-05,
"loss": 0.986,
"step": 828
},
{
"epoch": 0.4161994812149611,
"grad_norm": 0.41651931405067444,
"learning_rate": 4.2702848398806956e-05,
"loss": 0.9639,
"step": 829
},
{
"epoch": 0.41670153125261483,
"grad_norm": 0.3905417323112488,
"learning_rate": 4.2597866417313436e-05,
"loss": 0.9319,
"step": 830
},
{
"epoch": 0.4172035812902686,
"grad_norm": 0.4226928651332855,
"learning_rate": 4.249291778867909e-05,
"loss": 0.9213,
"step": 831
},
{
"epoch": 0.4177056313279223,
"grad_norm": 0.382017582654953,
"learning_rate": 4.23880029857861e-05,
"loss": 0.8846,
"step": 832
},
{
"epoch": 0.4182076813655761,
"grad_norm": 0.417928546667099,
"learning_rate": 4.2283122481364144e-05,
"loss": 0.9288,
"step": 833
},
{
"epoch": 0.41870973140322987,
"grad_norm": 0.41737717390060425,
"learning_rate": 4.2178276747988446e-05,
"loss": 0.9423,
"step": 834
},
{
"epoch": 0.4192117814408836,
"grad_norm": 0.39423155784606934,
"learning_rate": 4.207346625807756e-05,
"loss": 0.8894,
"step": 835
},
{
"epoch": 0.41971383147853736,
"grad_norm": 0.427852064371109,
"learning_rate": 4.196869148389114e-05,
"loss": 0.9639,
"step": 836
},
{
"epoch": 0.4202158815161911,
"grad_norm": 0.4028894007205963,
"learning_rate": 4.1863952897528e-05,
"loss": 0.9309,
"step": 837
},
{
"epoch": 0.42071793155384485,
"grad_norm": 0.42165279388427734,
"learning_rate": 4.175925097092388e-05,
"loss": 0.9514,
"step": 838
},
{
"epoch": 0.4212199815914986,
"grad_norm": 0.4179115295410156,
"learning_rate": 4.165458617584933e-05,
"loss": 0.8544,
"step": 839
},
{
"epoch": 0.4217220316291524,
"grad_norm": 0.479951947927475,
"learning_rate": 4.1549958983907555e-05,
"loss": 0.811,
"step": 840
},
{
"epoch": 0.42222408166680614,
"grad_norm": 0.45290902256965637,
"learning_rate": 4.144536986653239e-05,
"loss": 0.8243,
"step": 841
},
{
"epoch": 0.4227261317044599,
"grad_norm": 0.4473222494125366,
"learning_rate": 4.1340819294986076e-05,
"loss": 0.8137,
"step": 842
},
{
"epoch": 0.42322818174211363,
"grad_norm": 0.42771241068840027,
"learning_rate": 4.1236307740357173e-05,
"loss": 0.8189,
"step": 843
},
{
"epoch": 0.4237302317797674,
"grad_norm": 0.45651838183403015,
"learning_rate": 4.113183567355846e-05,
"loss": 0.8224,
"step": 844
},
{
"epoch": 0.4242322818174211,
"grad_norm": 0.4706350266933441,
"learning_rate": 4.102740356532473e-05,
"loss": 0.8297,
"step": 845
},
{
"epoch": 0.42473433185507486,
"grad_norm": 0.4705800712108612,
"learning_rate": 4.092301188621084e-05,
"loss": 0.7732,
"step": 846
},
{
"epoch": 0.42523638189272867,
"grad_norm": 0.5137692093849182,
"learning_rate": 4.081866110658934e-05,
"loss": 0.8374,
"step": 847
},
{
"epoch": 0.4257384319303824,
"grad_norm": 0.5054532885551453,
"learning_rate": 4.0714351696648614e-05,
"loss": 0.8556,
"step": 848
},
{
"epoch": 0.42624048196803616,
"grad_norm": 0.5825408697128296,
"learning_rate": 4.061008412639055e-05,
"loss": 0.8321,
"step": 849
},
{
"epoch": 0.4267425320056899,
"grad_norm": 0.6395136117935181,
"learning_rate": 4.050585886562858e-05,
"loss": 0.721,
"step": 850
},
{
"epoch": 0.42724458204334365,
"grad_norm": 0.5878275632858276,
"learning_rate": 4.0401676383985484e-05,
"loss": 1.3045,
"step": 851
},
{
"epoch": 0.4277466320809974,
"grad_norm": 0.3765466511249542,
"learning_rate": 4.0297537150891235e-05,
"loss": 1.1244,
"step": 852
},
{
"epoch": 0.42824868211865114,
"grad_norm": 0.38248923420906067,
"learning_rate": 4.0193441635581e-05,
"loss": 1.1962,
"step": 853
},
{
"epoch": 0.42875073215630494,
"grad_norm": 0.3714083433151245,
"learning_rate": 4.008939030709291e-05,
"loss": 1.026,
"step": 854
},
{
"epoch": 0.4292527821939587,
"grad_norm": 0.3839676082134247,
"learning_rate": 3.998538363426605e-05,
"loss": 1.101,
"step": 855
},
{
"epoch": 0.4297548322316124,
"grad_norm": 0.3552037477493286,
"learning_rate": 3.988142208573822e-05,
"loss": 1.0671,
"step": 856
},
{
"epoch": 0.43025688226926617,
"grad_norm": 0.36277374625205994,
"learning_rate": 3.977750612994396e-05,
"loss": 1.115,
"step": 857
},
{
"epoch": 0.4307589323069199,
"grad_norm": 0.3462297022342682,
"learning_rate": 3.9673636235112376e-05,
"loss": 1.0309,
"step": 858
},
{
"epoch": 0.43126098234457366,
"grad_norm": 0.3610150218009949,
"learning_rate": 3.956981286926498e-05,
"loss": 1.0359,
"step": 859
},
{
"epoch": 0.4317630323822274,
"grad_norm": 0.35921838879585266,
"learning_rate": 3.94660365002137e-05,
"loss": 1.0397,
"step": 860
},
{
"epoch": 0.4322650824198812,
"grad_norm": 0.3716135621070862,
"learning_rate": 3.93623075955586e-05,
"loss": 1.0673,
"step": 861
},
{
"epoch": 0.43276713245753495,
"grad_norm": 0.37005794048309326,
"learning_rate": 3.925862662268602e-05,
"loss": 1.0354,
"step": 862
},
{
"epoch": 0.4332691824951887,
"grad_norm": 0.34723684191703796,
"learning_rate": 3.9154994048766184e-05,
"loss": 1.0334,
"step": 863
},
{
"epoch": 0.43377123253284244,
"grad_norm": 0.3506997525691986,
"learning_rate": 3.905141034075135e-05,
"loss": 0.9656,
"step": 864
},
{
"epoch": 0.4342732825704962,
"grad_norm": 0.37688568234443665,
"learning_rate": 3.894787596537352e-05,
"loss": 0.9302,
"step": 865
},
{
"epoch": 0.43477533260814993,
"grad_norm": 0.3472607433795929,
"learning_rate": 3.884439138914243e-05,
"loss": 0.9686,
"step": 866
},
{
"epoch": 0.4352773826458037,
"grad_norm": 0.35843560099601746,
"learning_rate": 3.874095707834349e-05,
"loss": 0.9701,
"step": 867
},
{
"epoch": 0.4357794326834574,
"grad_norm": 0.3564199209213257,
"learning_rate": 3.863757349903551e-05,
"loss": 1.0456,
"step": 868
},
{
"epoch": 0.4362814827211112,
"grad_norm": 0.38524752855300903,
"learning_rate": 3.853424111704879e-05,
"loss": 0.9603,
"step": 869
},
{
"epoch": 0.43678353275876497,
"grad_norm": 0.3552170693874359,
"learning_rate": 3.843096039798293e-05,
"loss": 0.9274,
"step": 870
},
{
"epoch": 0.4372855827964187,
"grad_norm": 0.37275344133377075,
"learning_rate": 3.832773180720475e-05,
"loss": 1.0213,
"step": 871
},
{
"epoch": 0.43778763283407246,
"grad_norm": 0.3630153238773346,
"learning_rate": 3.822455580984613e-05,
"loss": 0.9482,
"step": 872
},
{
"epoch": 0.4382896828717262,
"grad_norm": 0.36190661787986755,
"learning_rate": 3.8121432870802045e-05,
"loss": 0.881,
"step": 873
},
{
"epoch": 0.43879173290937995,
"grad_norm": 0.3701936602592468,
"learning_rate": 3.801836345472841e-05,
"loss": 1.0065,
"step": 874
},
{
"epoch": 0.4392937829470337,
"grad_norm": 0.4397743344306946,
"learning_rate": 3.791534802603988e-05,
"loss": 0.9938,
"step": 875
},
{
"epoch": 0.4397958329846875,
"grad_norm": 0.36815145611763,
"learning_rate": 3.781238704890793e-05,
"loss": 0.9628,
"step": 876
},
{
"epoch": 0.44029788302234124,
"grad_norm": 0.3762166500091553,
"learning_rate": 3.7709480987258636e-05,
"loss": 0.9478,
"step": 877
},
{
"epoch": 0.440799933059995,
"grad_norm": 0.39231258630752563,
"learning_rate": 3.760663030477072e-05,
"loss": 1.0166,
"step": 878
},
{
"epoch": 0.44130198309764873,
"grad_norm": 0.38583433628082275,
"learning_rate": 3.750383546487324e-05,
"loss": 0.9232,
"step": 879
},
{
"epoch": 0.4418040331353025,
"grad_norm": 0.3934246301651001,
"learning_rate": 3.740109693074375e-05,
"loss": 0.9657,
"step": 880
},
{
"epoch": 0.4423060831729562,
"grad_norm": 0.4055297374725342,
"learning_rate": 3.729841516530604e-05,
"loss": 0.9054,
"step": 881
},
{
"epoch": 0.44280813321060997,
"grad_norm": 0.4082297682762146,
"learning_rate": 3.7195790631228136e-05,
"loss": 0.9365,
"step": 882
},
{
"epoch": 0.44331018324826377,
"grad_norm": 0.39798596501350403,
"learning_rate": 3.709322379092019e-05,
"loss": 0.9023,
"step": 883
},
{
"epoch": 0.4438122332859175,
"grad_norm": 0.418045312166214,
"learning_rate": 3.6990715106532356e-05,
"loss": 0.9233,
"step": 884
},
{
"epoch": 0.44431428332357126,
"grad_norm": 0.4316072463989258,
"learning_rate": 3.68882650399528e-05,
"loss": 0.8931,
"step": 885
},
{
"epoch": 0.444816333361225,
"grad_norm": 0.42850467562675476,
"learning_rate": 3.6785874052805516e-05,
"loss": 0.8839,
"step": 886
},
{
"epoch": 0.44531838339887875,
"grad_norm": 0.4238118529319763,
"learning_rate": 3.6683542606448347e-05,
"loss": 0.9291,
"step": 887
},
{
"epoch": 0.4458204334365325,
"grad_norm": 0.415999174118042,
"learning_rate": 3.658127116197079e-05,
"loss": 0.9257,
"step": 888
},
{
"epoch": 0.44632248347418624,
"grad_norm": 0.4444602131843567,
"learning_rate": 3.6479060180192034e-05,
"loss": 0.8785,
"step": 889
},
{
"epoch": 0.44682453351184004,
"grad_norm": 0.4339217245578766,
"learning_rate": 3.637691012165886e-05,
"loss": 0.7952,
"step": 890
},
{
"epoch": 0.4473265835494938,
"grad_norm": 0.4458482563495636,
"learning_rate": 3.627482144664344e-05,
"loss": 0.8247,
"step": 891
},
{
"epoch": 0.44782863358714753,
"grad_norm": 0.4593295454978943,
"learning_rate": 3.6172794615141446e-05,
"loss": 0.8401,
"step": 892
},
{
"epoch": 0.4483306836248013,
"grad_norm": 0.47604867815971375,
"learning_rate": 3.607083008686985e-05,
"loss": 0.8271,
"step": 893
},
{
"epoch": 0.448832733662455,
"grad_norm": 0.45923951268196106,
"learning_rate": 3.596892832126494e-05,
"loss": 0.858,
"step": 894
},
{
"epoch": 0.44933478370010876,
"grad_norm": 0.4550018608570099,
"learning_rate": 3.586708977748012e-05,
"loss": 0.7788,
"step": 895
},
{
"epoch": 0.4498368337377625,
"grad_norm": 0.4726627469062805,
"learning_rate": 3.5765314914384026e-05,
"loss": 0.8576,
"step": 896
},
{
"epoch": 0.4503388837754163,
"grad_norm": 0.4911380708217621,
"learning_rate": 3.5663604190558296e-05,
"loss": 0.8507,
"step": 897
},
{
"epoch": 0.45084093381307005,
"grad_norm": 0.5006689429283142,
"learning_rate": 3.556195806429559e-05,
"loss": 0.7908,
"step": 898
},
{
"epoch": 0.4513429838507238,
"grad_norm": 0.6167373061180115,
"learning_rate": 3.546037699359751e-05,
"loss": 0.7922,
"step": 899
},
{
"epoch": 0.45184503388837755,
"grad_norm": 0.6547103524208069,
"learning_rate": 3.5358861436172485e-05,
"loss": 0.6946,
"step": 900
},
{
"epoch": 0.45184503388837755,
"eval_loss": 0.9426594972610474,
"eval_runtime": 710.4868,
"eval_samples_per_second": 21.249,
"eval_steps_per_second": 2.657,
"step": 900
}
],
"logging_steps": 1,
"max_steps": 1500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.249771756744081e+18,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}