{
"best_metric": 0.00024003432190511376,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 1.7278617710583153,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008639308855291577,
"grad_norm": 0.7260720729827881,
"learning_rate": 7e-06,
"loss": 0.0407,
"step": 1
},
{
"epoch": 0.008639308855291577,
"eval_loss": 0.15066038072109222,
"eval_runtime": 20.6837,
"eval_samples_per_second": 9.428,
"eval_steps_per_second": 2.369,
"step": 1
},
{
"epoch": 0.017278617710583154,
"grad_norm": 0.557333767414093,
"learning_rate": 1.4e-05,
"loss": 0.034,
"step": 2
},
{
"epoch": 0.02591792656587473,
"grad_norm": 0.3476514518260956,
"learning_rate": 2.1e-05,
"loss": 0.0328,
"step": 3
},
{
"epoch": 0.03455723542116631,
"grad_norm": 0.18027208745479584,
"learning_rate": 2.8e-05,
"loss": 0.0306,
"step": 4
},
{
"epoch": 0.04319654427645788,
"grad_norm": 0.15658758580684662,
"learning_rate": 3.5e-05,
"loss": 0.0283,
"step": 5
},
{
"epoch": 0.05183585313174946,
"grad_norm": 0.2384180724620819,
"learning_rate": 4.2e-05,
"loss": 0.0351,
"step": 6
},
{
"epoch": 0.06047516198704104,
"grad_norm": 0.31615033745765686,
"learning_rate": 4.899999999999999e-05,
"loss": 0.0305,
"step": 7
},
{
"epoch": 0.06911447084233262,
"grad_norm": 0.1833355873823166,
"learning_rate": 5.6e-05,
"loss": 0.03,
"step": 8
},
{
"epoch": 0.07775377969762419,
"grad_norm": 0.24100624024868011,
"learning_rate": 6.3e-05,
"loss": 0.0226,
"step": 9
},
{
"epoch": 0.08639308855291576,
"grad_norm": 0.23082295060157776,
"learning_rate": 7e-05,
"loss": 0.0112,
"step": 10
},
{
"epoch": 0.09503239740820735,
"grad_norm": 0.09087604284286499,
"learning_rate": 6.999521567473641e-05,
"loss": 0.0045,
"step": 11
},
{
"epoch": 0.10367170626349892,
"grad_norm": 0.5570639371871948,
"learning_rate": 6.998086400693241e-05,
"loss": 0.0162,
"step": 12
},
{
"epoch": 0.11231101511879049,
"grad_norm": 0.08612176030874252,
"learning_rate": 6.995694892019065e-05,
"loss": 0.0043,
"step": 13
},
{
"epoch": 0.12095032397408208,
"grad_norm": 0.062429703772068024,
"learning_rate": 6.99234769526571e-05,
"loss": 0.0019,
"step": 14
},
{
"epoch": 0.12958963282937366,
"grad_norm": 0.3207145035266876,
"learning_rate": 6.988045725523343e-05,
"loss": 0.0064,
"step": 15
},
{
"epoch": 0.13822894168466524,
"grad_norm": 0.11560860276222229,
"learning_rate": 6.982790158907539e-05,
"loss": 0.003,
"step": 16
},
{
"epoch": 0.1468682505399568,
"grad_norm": 0.04708254709839821,
"learning_rate": 6.976582432237733e-05,
"loss": 0.0016,
"step": 17
},
{
"epoch": 0.15550755939524838,
"grad_norm": 0.07981427013874054,
"learning_rate": 6.969424242644413e-05,
"loss": 0.0017,
"step": 18
},
{
"epoch": 0.16414686825053995,
"grad_norm": 0.09298226237297058,
"learning_rate": 6.961317547105138e-05,
"loss": 0.0006,
"step": 19
},
{
"epoch": 0.17278617710583152,
"grad_norm": 0.02390686422586441,
"learning_rate": 6.952264561909527e-05,
"loss": 0.0008,
"step": 20
},
{
"epoch": 0.18142548596112312,
"grad_norm": 0.06466929614543915,
"learning_rate": 6.942267762053337e-05,
"loss": 0.0014,
"step": 21
},
{
"epoch": 0.1900647948164147,
"grad_norm": 0.05321163311600685,
"learning_rate": 6.931329880561832e-05,
"loss": 0.001,
"step": 22
},
{
"epoch": 0.19870410367170627,
"grad_norm": 0.02070157416164875,
"learning_rate": 6.919453907742597e-05,
"loss": 0.0006,
"step": 23
},
{
"epoch": 0.20734341252699784,
"grad_norm": 0.16782431304454803,
"learning_rate": 6.90664309036802e-05,
"loss": 0.0049,
"step": 24
},
{
"epoch": 0.2159827213822894,
"grad_norm": 0.14060857892036438,
"learning_rate": 6.892900930787656e-05,
"loss": 0.0012,
"step": 25
},
{
"epoch": 0.22462203023758098,
"grad_norm": 0.43972817063331604,
"learning_rate": 6.87823118597072e-05,
"loss": 0.0071,
"step": 26
},
{
"epoch": 0.23326133909287258,
"grad_norm": 0.05086011067032814,
"learning_rate": 6.862637866478969e-05,
"loss": 0.0013,
"step": 27
},
{
"epoch": 0.24190064794816415,
"grad_norm": 0.06401721388101578,
"learning_rate": 6.846125235370252e-05,
"loss": 0.0013,
"step": 28
},
{
"epoch": 0.2505399568034557,
"grad_norm": 0.09170061349868774,
"learning_rate": 6.828697807033038e-05,
"loss": 0.0183,
"step": 29
},
{
"epoch": 0.2591792656587473,
"grad_norm": 0.06554131209850311,
"learning_rate": 6.81036034595222e-05,
"loss": 0.018,
"step": 30
},
{
"epoch": 0.2678185745140389,
"grad_norm": 0.08401723951101303,
"learning_rate": 6.791117865406564e-05,
"loss": 0.0175,
"step": 31
},
{
"epoch": 0.27645788336933047,
"grad_norm": 0.06230723112821579,
"learning_rate": 6.770975626098112e-05,
"loss": 0.0122,
"step": 32
},
{
"epoch": 0.28509719222462204,
"grad_norm": 0.066391222178936,
"learning_rate": 6.749939134713974e-05,
"loss": 0.0137,
"step": 33
},
{
"epoch": 0.2937365010799136,
"grad_norm": 0.01942325197160244,
"learning_rate": 6.728014142420846e-05,
"loss": 0.0023,
"step": 34
},
{
"epoch": 0.3023758099352052,
"grad_norm": 0.05515532195568085,
"learning_rate": 6.7052066432927e-05,
"loss": 0.0023,
"step": 35
},
{
"epoch": 0.31101511879049676,
"grad_norm": 0.038139645010232925,
"learning_rate": 6.681522872672069e-05,
"loss": 0.0015,
"step": 36
},
{
"epoch": 0.31965442764578833,
"grad_norm": 0.03340466320514679,
"learning_rate": 6.656969305465356e-05,
"loss": 0.0016,
"step": 37
},
{
"epoch": 0.3282937365010799,
"grad_norm": 0.04159877821803093,
"learning_rate": 6.631552654372672e-05,
"loss": 0.0015,
"step": 38
},
{
"epoch": 0.3369330453563715,
"grad_norm": 0.03693181276321411,
"learning_rate": 6.60527986805264e-05,
"loss": 0.0017,
"step": 39
},
{
"epoch": 0.34557235421166305,
"grad_norm": 0.034342411905527115,
"learning_rate": 6.578158129222711e-05,
"loss": 0.0013,
"step": 40
},
{
"epoch": 0.3542116630669546,
"grad_norm": 0.022351600229740143,
"learning_rate": 6.550194852695469e-05,
"loss": 0.0013,
"step": 41
},
{
"epoch": 0.36285097192224625,
"grad_norm": 0.07802402973175049,
"learning_rate": 6.521397683351509e-05,
"loss": 0.0012,
"step": 42
},
{
"epoch": 0.3714902807775378,
"grad_norm": 0.011767297983169556,
"learning_rate": 6.491774494049386e-05,
"loss": 0.0004,
"step": 43
},
{
"epoch": 0.3801295896328294,
"grad_norm": 0.0234123133122921,
"learning_rate": 6.461333383473272e-05,
"loss": 0.0013,
"step": 44
},
{
"epoch": 0.38876889848812096,
"grad_norm": 0.007028356194496155,
"learning_rate": 6.430082673918849e-05,
"loss": 0.0004,
"step": 45
},
{
"epoch": 0.39740820734341253,
"grad_norm": 0.011285451240837574,
"learning_rate": 6.398030909018069e-05,
"loss": 0.0003,
"step": 46
},
{
"epoch": 0.4060475161987041,
"grad_norm": 0.07014564424753189,
"learning_rate": 6.365186851403423e-05,
"loss": 0.001,
"step": 47
},
{
"epoch": 0.4146868250539957,
"grad_norm": 0.023154348134994507,
"learning_rate": 6.331559480312315e-05,
"loss": 0.0003,
"step": 48
},
{
"epoch": 0.42332613390928725,
"grad_norm": 0.08951613306999207,
"learning_rate": 6.297157989132236e-05,
"loss": 0.0011,
"step": 49
},
{
"epoch": 0.4319654427645788,
"grad_norm": 0.03926246613264084,
"learning_rate": 6.261991782887377e-05,
"loss": 0.0006,
"step": 50
},
{
"epoch": 0.4319654427645788,
"eval_loss": 0.0019684885628521442,
"eval_runtime": 20.5581,
"eval_samples_per_second": 9.485,
"eval_steps_per_second": 2.383,
"step": 50
},
{
"epoch": 0.4406047516198704,
"grad_norm": 0.17613102495670319,
"learning_rate": 6.226070475667393e-05,
"loss": 0.0021,
"step": 51
},
{
"epoch": 0.44924406047516197,
"grad_norm": 0.05827736854553223,
"learning_rate": 6.189403887999006e-05,
"loss": 0.001,
"step": 52
},
{
"epoch": 0.45788336933045354,
"grad_norm": 0.12556667625904083,
"learning_rate": 6.152002044161171e-05,
"loss": 0.0015,
"step": 53
},
{
"epoch": 0.46652267818574517,
"grad_norm": 0.07447590678930283,
"learning_rate": 6.113875169444539e-05,
"loss": 0.0009,
"step": 54
},
{
"epoch": 0.47516198704103674,
"grad_norm": 0.27384987473487854,
"learning_rate": 6.0750336873559605e-05,
"loss": 0.01,
"step": 55
},
{
"epoch": 0.4838012958963283,
"grad_norm": 0.005780680105090141,
"learning_rate": 6.035488216768811e-05,
"loss": 0.0002,
"step": 56
},
{
"epoch": 0.4924406047516199,
"grad_norm": 0.04053672403097153,
"learning_rate": 5.9952495690198894e-05,
"loss": 0.0051,
"step": 57
},
{
"epoch": 0.5010799136069114,
"grad_norm": 0.04079966992139816,
"learning_rate": 5.954328744953709e-05,
"loss": 0.0057,
"step": 58
},
{
"epoch": 0.509719222462203,
"grad_norm": 0.03938170522451401,
"learning_rate": 5.91273693191498e-05,
"loss": 0.0049,
"step": 59
},
{
"epoch": 0.5183585313174947,
"grad_norm": 0.06116793677210808,
"learning_rate": 5.870485500690094e-05,
"loss": 0.0053,
"step": 60
},
{
"epoch": 0.5269978401727862,
"grad_norm": 0.06775252521038055,
"learning_rate": 5.827586002398468e-05,
"loss": 0.0034,
"step": 61
},
{
"epoch": 0.5356371490280778,
"grad_norm": 0.04742324352264404,
"learning_rate": 5.784050165334589e-05,
"loss": 0.0009,
"step": 62
},
{
"epoch": 0.5442764578833693,
"grad_norm": 0.0582570843398571,
"learning_rate": 5.739889891761608e-05,
"loss": 0.0021,
"step": 63
},
{
"epoch": 0.5529157667386609,
"grad_norm": 0.024544579908251762,
"learning_rate": 5.6951172546573794e-05,
"loss": 0.0006,
"step": 64
},
{
"epoch": 0.5615550755939525,
"grad_norm": 0.07139912247657776,
"learning_rate": 5.6497444944138376e-05,
"loss": 0.0017,
"step": 65
},
{
"epoch": 0.5701943844492441,
"grad_norm": 0.02395671233534813,
"learning_rate": 5.603784015490587e-05,
"loss": 0.0005,
"step": 66
},
{
"epoch": 0.5788336933045356,
"grad_norm": 0.004293499980121851,
"learning_rate": 5.557248383023655e-05,
"loss": 0.0003,
"step": 67
},
{
"epoch": 0.5874730021598272,
"grad_norm": 0.029220029711723328,
"learning_rate": 5.510150319390302e-05,
"loss": 0.0005,
"step": 68
},
{
"epoch": 0.5961123110151187,
"grad_norm": 0.037274319678545,
"learning_rate": 5.4625027007308546e-05,
"loss": 0.0015,
"step": 69
},
{
"epoch": 0.6047516198704104,
"grad_norm": 0.00902900006622076,
"learning_rate": 5.414318553428494e-05,
"loss": 0.0003,
"step": 70
},
{
"epoch": 0.6133909287257019,
"grad_norm": 0.01143543142825365,
"learning_rate": 5.3656110505479776e-05,
"loss": 0.0003,
"step": 71
},
{
"epoch": 0.6220302375809935,
"grad_norm": 0.005858670454472303,
"learning_rate": 5.316393508234253e-05,
"loss": 0.0004,
"step": 72
},
{
"epoch": 0.6306695464362851,
"grad_norm": 0.006607948802411556,
"learning_rate": 5.266679382071953e-05,
"loss": 0.0004,
"step": 73
},
{
"epoch": 0.6393088552915767,
"grad_norm": 0.05994042009115219,
"learning_rate": 5.216482263406778e-05,
"loss": 0.0006,
"step": 74
},
{
"epoch": 0.6479481641468683,
"grad_norm": 0.003944529686123133,
"learning_rate": 5.1658158756297576e-05,
"loss": 0.0002,
"step": 75
},
{
"epoch": 0.6565874730021598,
"grad_norm": 0.005714634899049997,
"learning_rate": 5.114694070425407e-05,
"loss": 0.0002,
"step": 76
},
{
"epoch": 0.6652267818574514,
"grad_norm": 0.24551953375339508,
"learning_rate": 5.063130823984823e-05,
"loss": 0.0005,
"step": 77
},
{
"epoch": 0.673866090712743,
"grad_norm": 0.10831040889024734,
"learning_rate": 5.011140233184724e-05,
"loss": 0.0027,
"step": 78
},
{
"epoch": 0.6825053995680346,
"grad_norm": 0.0029632514342665672,
"learning_rate": 4.958736511733516e-05,
"loss": 0.0002,
"step": 79
},
{
"epoch": 0.6911447084233261,
"grad_norm": 0.007232643198221922,
"learning_rate": 4.905933986285393e-05,
"loss": 0.0001,
"step": 80
},
{
"epoch": 0.6997840172786177,
"grad_norm": 0.010217340663075447,
"learning_rate": 4.8527470925235824e-05,
"loss": 0.0002,
"step": 81
},
{
"epoch": 0.7084233261339092,
"grad_norm": 0.2681877613067627,
"learning_rate": 4.799190371213772e-05,
"loss": 0.0037,
"step": 82
},
{
"epoch": 0.7170626349892009,
"grad_norm": 0.006039237137883902,
"learning_rate": 4.745278464228808e-05,
"loss": 0.0002,
"step": 83
},
{
"epoch": 0.7257019438444925,
"grad_norm": 0.005269297398626804,
"learning_rate": 4.69102611054575e-05,
"loss": 0.0003,
"step": 84
},
{
"epoch": 0.734341252699784,
"grad_norm": 0.06765911728143692,
"learning_rate": 4.6364481422163926e-05,
"loss": 0.0019,
"step": 85
},
{
"epoch": 0.7429805615550756,
"grad_norm": 0.05636543780565262,
"learning_rate": 4.581559480312316e-05,
"loss": 0.0023,
"step": 86
},
{
"epoch": 0.7516198704103672,
"grad_norm": 0.026066439226269722,
"learning_rate": 4.526375130845627e-05,
"loss": 0.0015,
"step": 87
},
{
"epoch": 0.7602591792656588,
"grad_norm": 0.018351661041378975,
"learning_rate": 4.4709101806664554e-05,
"loss": 0.0014,
"step": 88
},
{
"epoch": 0.7688984881209503,
"grad_norm": 0.1412251740694046,
"learning_rate": 4.4151797933383685e-05,
"loss": 0.0096,
"step": 89
},
{
"epoch": 0.7775377969762419,
"grad_norm": 0.21160076558589935,
"learning_rate": 4.359199204992797e-05,
"loss": 0.0059,
"step": 90
},
{
"epoch": 0.7861771058315334,
"grad_norm": 0.059807900339365005,
"learning_rate": 4.30298372016363e-05,
"loss": 0.0005,
"step": 91
},
{
"epoch": 0.7948164146868251,
"grad_norm": 0.010206430219113827,
"learning_rate": 4.246548707603114e-05,
"loss": 0.0003,
"step": 92
},
{
"epoch": 0.8034557235421166,
"grad_norm": 0.021596243605017662,
"learning_rate": 4.1899095960801805e-05,
"loss": 0.0004,
"step": 93
},
{
"epoch": 0.8120950323974082,
"grad_norm": 0.0017379262717440724,
"learning_rate": 4.133081870162385e-05,
"loss": 0.0002,
"step": 94
},
{
"epoch": 0.8207343412526998,
"grad_norm": 0.01433930266648531,
"learning_rate": 4.076081065982569e-05,
"loss": 0.0003,
"step": 95
},
{
"epoch": 0.8293736501079914,
"grad_norm": 0.03355858847498894,
"learning_rate": 4.018922766991447e-05,
"loss": 0.0006,
"step": 96
},
{
"epoch": 0.838012958963283,
"grad_norm": 0.1033296138048172,
"learning_rate": 3.961622599697241e-05,
"loss": 0.0013,
"step": 97
},
{
"epoch": 0.8466522678185745,
"grad_norm": 0.10396935045719147,
"learning_rate": 3.9041962293935516e-05,
"loss": 0.0035,
"step": 98
},
{
"epoch": 0.8552915766738661,
"grad_norm": 0.007392291445285082,
"learning_rate": 3.84665935587662e-05,
"loss": 0.0003,
"step": 99
},
{
"epoch": 0.8639308855291576,
"grad_norm": 0.06569644808769226,
"learning_rate": 3.7890277091531636e-05,
"loss": 0.0013,
"step": 100
},
{
"epoch": 0.8639308855291576,
"eval_loss": 0.0009457149426452816,
"eval_runtime": 20.7602,
"eval_samples_per_second": 9.393,
"eval_steps_per_second": 2.36,
"step": 100
},
{
"epoch": 0.8725701943844493,
"grad_norm": 0.06337860226631165,
"learning_rate": 3.7313170451399475e-05,
"loss": 0.0019,
"step": 101
},
{
"epoch": 0.8812095032397408,
"grad_norm": 0.07296153157949448,
"learning_rate": 3.673543141356278e-05,
"loss": 0.0033,
"step": 102
},
{
"epoch": 0.8898488120950324,
"grad_norm": 0.09170746803283691,
"learning_rate": 3.6157217926105783e-05,
"loss": 0.0004,
"step": 103
},
{
"epoch": 0.8984881209503239,
"grad_norm": 0.0043894099071621895,
"learning_rate": 3.557868806682255e-05,
"loss": 0.0002,
"step": 104
},
{
"epoch": 0.9071274298056156,
"grad_norm": 0.004214062821120024,
"learning_rate": 3.5e-05,
"loss": 0.0003,
"step": 105
},
{
"epoch": 0.9157667386609071,
"grad_norm": 0.004896323662251234,
"learning_rate": 3.442131193317745e-05,
"loss": 0.0002,
"step": 106
},
{
"epoch": 0.9244060475161987,
"grad_norm": 0.04607084020972252,
"learning_rate": 3.384278207389421e-05,
"loss": 0.0005,
"step": 107
},
{
"epoch": 0.9330453563714903,
"grad_norm": 0.024103185161948204,
"learning_rate": 3.3264568586437216e-05,
"loss": 0.0004,
"step": 108
},
{
"epoch": 0.9416846652267818,
"grad_norm": 0.19529423117637634,
"learning_rate": 3.268682954860052e-05,
"loss": 0.0016,
"step": 109
},
{
"epoch": 0.9503239740820735,
"grad_norm": 0.013852166011929512,
"learning_rate": 3.210972290846837e-05,
"loss": 0.0003,
"step": 110
},
{
"epoch": 0.958963282937365,
"grad_norm": 0.005281697493046522,
"learning_rate": 3.15334064412338e-05,
"loss": 0.0003,
"step": 111
},
{
"epoch": 0.9676025917926566,
"grad_norm": 0.04835696145892143,
"learning_rate": 3.0958037706064485e-05,
"loss": 0.0007,
"step": 112
},
{
"epoch": 0.9762419006479481,
"grad_norm": 0.007758499588817358,
"learning_rate": 3.038377400302758e-05,
"loss": 0.0003,
"step": 113
},
{
"epoch": 0.9848812095032398,
"grad_norm": 0.006247072480618954,
"learning_rate": 2.9810772330085524e-05,
"loss": 0.0006,
"step": 114
},
{
"epoch": 0.9935205183585313,
"grad_norm": 0.06823667138814926,
"learning_rate": 2.9239189340174306e-05,
"loss": 0.0006,
"step": 115
},
{
"epoch": 1.0021598272138228,
"grad_norm": 0.13855108618736267,
"learning_rate": 2.8669181298376163e-05,
"loss": 0.0025,
"step": 116
},
{
"epoch": 1.0107991360691144,
"grad_norm": 0.013171014375984669,
"learning_rate": 2.8100904039198193e-05,
"loss": 0.0007,
"step": 117
},
{
"epoch": 1.019438444924406,
"grad_norm": 0.012365025468170643,
"learning_rate": 2.7534512923968863e-05,
"loss": 0.0006,
"step": 118
},
{
"epoch": 1.0280777537796977,
"grad_norm": 0.009904728271067142,
"learning_rate": 2.6970162798363695e-05,
"loss": 0.0008,
"step": 119
},
{
"epoch": 1.0367170626349893,
"grad_norm": 0.006425977218896151,
"learning_rate": 2.640800795007203e-05,
"loss": 0.0004,
"step": 120
},
{
"epoch": 1.0453563714902807,
"grad_norm": 0.01372888870537281,
"learning_rate": 2.5848202066616305e-05,
"loss": 0.0002,
"step": 121
},
{
"epoch": 1.0539956803455723,
"grad_norm": 0.0021302136592566967,
"learning_rate": 2.5290898193335446e-05,
"loss": 0.0003,
"step": 122
},
{
"epoch": 1.062634989200864,
"grad_norm": 0.0052225952968001366,
"learning_rate": 2.4736248691543736e-05,
"loss": 0.0002,
"step": 123
},
{
"epoch": 1.0712742980561556,
"grad_norm": 0.0029196590185165405,
"learning_rate": 2.4184405196876842e-05,
"loss": 0.0001,
"step": 124
},
{
"epoch": 1.079913606911447,
"grad_norm": 0.09910155832767487,
"learning_rate": 2.363551857783608e-05,
"loss": 0.0003,
"step": 125
},
{
"epoch": 1.0885529157667386,
"grad_norm": 0.0008615689584985375,
"learning_rate": 2.308973889454249e-05,
"loss": 0.0002,
"step": 126
},
{
"epoch": 1.0971922246220303,
"grad_norm": 0.0013229359174147248,
"learning_rate": 2.2547215357711918e-05,
"loss": 0.0001,
"step": 127
},
{
"epoch": 1.1058315334773219,
"grad_norm": 0.000881396175827831,
"learning_rate": 2.2008096287862266e-05,
"loss": 0.0001,
"step": 128
},
{
"epoch": 1.1144708423326133,
"grad_norm": 0.0023514782078564167,
"learning_rate": 2.1472529074764177e-05,
"loss": 0.0002,
"step": 129
},
{
"epoch": 1.123110151187905,
"grad_norm": 0.013889284804463387,
"learning_rate": 2.0940660137146074e-05,
"loss": 0.0003,
"step": 130
},
{
"epoch": 1.1317494600431965,
"grad_norm": 0.0022602914832532406,
"learning_rate": 2.041263488266484e-05,
"loss": 0.0001,
"step": 131
},
{
"epoch": 1.1403887688984882,
"grad_norm": 0.0012188655091449618,
"learning_rate": 1.988859766815275e-05,
"loss": 0.0001,
"step": 132
},
{
"epoch": 1.1490280777537798,
"grad_norm": 0.0018668539123609662,
"learning_rate": 1.9368691760151773e-05,
"loss": 0.0001,
"step": 133
},
{
"epoch": 1.1576673866090712,
"grad_norm": 0.0017291579861193895,
"learning_rate": 1.885305929574593e-05,
"loss": 0.0001,
"step": 134
},
{
"epoch": 1.1663066954643628,
"grad_norm": 0.0010335007682442665,
"learning_rate": 1.8341841243702424e-05,
"loss": 0.0001,
"step": 135
},
{
"epoch": 1.1749460043196545,
"grad_norm": 0.18275120854377747,
"learning_rate": 1.7835177365932225e-05,
"loss": 0.0022,
"step": 136
},
{
"epoch": 1.183585313174946,
"grad_norm": 0.0107800904661417,
"learning_rate": 1.7333206179280478e-05,
"loss": 0.0002,
"step": 137
},
{
"epoch": 1.1922246220302375,
"grad_norm": 0.0653991624712944,
"learning_rate": 1.6836064917657478e-05,
"loss": 0.0004,
"step": 138
},
{
"epoch": 1.2008639308855291,
"grad_norm": 0.005070838611572981,
"learning_rate": 1.6343889494520224e-05,
"loss": 0.0002,
"step": 139
},
{
"epoch": 1.2095032397408207,
"grad_norm": 0.004730647429823875,
"learning_rate": 1.5856814465715064e-05,
"loss": 0.0001,
"step": 140
},
{
"epoch": 1.2181425485961124,
"grad_norm": 0.0011722528142854571,
"learning_rate": 1.5374972992691458e-05,
"loss": 0.0001,
"step": 141
},
{
"epoch": 1.226781857451404,
"grad_norm": 0.001693835249170661,
"learning_rate": 1.4898496806096974e-05,
"loss": 0.0001,
"step": 142
},
{
"epoch": 1.2354211663066954,
"grad_norm": 0.003972134552896023,
"learning_rate": 1.4427516169763444e-05,
"loss": 0.0001,
"step": 143
},
{
"epoch": 1.244060475161987,
"grad_norm": 0.01975773461163044,
"learning_rate": 1.396215984509412e-05,
"loss": 0.0005,
"step": 144
},
{
"epoch": 1.2526997840172787,
"grad_norm": 0.007931800559163094,
"learning_rate": 1.3502555055861625e-05,
"loss": 0.0004,
"step": 145
},
{
"epoch": 1.26133909287257,
"grad_norm": 0.022132746875286102,
"learning_rate": 1.3048827453426203e-05,
"loss": 0.0005,
"step": 146
},
{
"epoch": 1.2699784017278617,
"grad_norm": 0.010564382188022137,
"learning_rate": 1.2601101082383917e-05,
"loss": 0.0004,
"step": 147
},
{
"epoch": 1.2786177105831533,
"grad_norm": 0.009835362434387207,
"learning_rate": 1.2159498346654094e-05,
"loss": 0.0005,
"step": 148
},
{
"epoch": 1.287257019438445,
"grad_norm": 0.006598853040486574,
"learning_rate": 1.1724139976015306e-05,
"loss": 0.0003,
"step": 149
},
{
"epoch": 1.2958963282937366,
"grad_norm": 0.00920469593256712,
"learning_rate": 1.1295144993099068e-05,
"loss": 0.0005,
"step": 150
},
{
"epoch": 1.2958963282937366,
"eval_loss": 0.00024003432190511376,
"eval_runtime": 20.7717,
"eval_samples_per_second": 9.388,
"eval_steps_per_second": 2.359,
"step": 150
},
{
"epoch": 1.3045356371490282,
"grad_norm": 0.0014002382522448897,
"learning_rate": 1.0872630680850196e-05,
"loss": 0.0001,
"step": 151
},
{
"epoch": 1.3131749460043196,
"grad_norm": 0.0010238329414278269,
"learning_rate": 1.0456712550462898e-05,
"loss": 0.0002,
"step": 152
},
{
"epoch": 1.3218142548596112,
"grad_norm": 0.0012431687209755182,
"learning_rate": 1.0047504309801104e-05,
"loss": 0.0001,
"step": 153
},
{
"epoch": 1.3304535637149029,
"grad_norm": 0.0026770096737891436,
"learning_rate": 9.645117832311886e-06,
"loss": 0.0001,
"step": 154
},
{
"epoch": 1.3390928725701943,
"grad_norm": 0.01820327155292034,
"learning_rate": 9.249663126440394e-06,
"loss": 0.0002,
"step": 155
},
{
"epoch": 1.347732181425486,
"grad_norm": 0.01555480808019638,
"learning_rate": 8.861248305554624e-06,
"loss": 0.0002,
"step": 156
},
{
"epoch": 1.3563714902807775,
"grad_norm": 0.0015801583649590611,
"learning_rate": 8.47997955838829e-06,
"loss": 0.0002,
"step": 157
},
{
"epoch": 1.3650107991360692,
"grad_norm": 0.10153518617153168,
"learning_rate": 8.10596112000994e-06,
"loss": 0.0004,
"step": 158
},
{
"epoch": 1.3736501079913608,
"grad_norm": 0.0009876766707748175,
"learning_rate": 7.739295243326067e-06,
"loss": 0.0001,
"step": 159
},
{
"epoch": 1.3822894168466522,
"grad_norm": 0.0026555354706943035,
"learning_rate": 7.380082171126228e-06,
"loss": 0.0002,
"step": 160
},
{
"epoch": 1.3909287257019438,
"grad_norm": 0.0006382952560670674,
"learning_rate": 7.028420108677635e-06,
"loss": 0.0001,
"step": 161
},
{
"epoch": 1.3995680345572354,
"grad_norm": 0.0014154494274407625,
"learning_rate": 6.684405196876842e-06,
"loss": 0.0001,
"step": 162
},
{
"epoch": 1.408207343412527,
"grad_norm": 0.0011231210082769394,
"learning_rate": 6.3481314859657675e-06,
"loss": 0.0001,
"step": 163
},
{
"epoch": 1.4168466522678185,
"grad_norm": 0.0014514840440824628,
"learning_rate": 6.019690909819298e-06,
"loss": 0.0001,
"step": 164
},
{
"epoch": 1.42548596112311,
"grad_norm": 0.0013826994691044092,
"learning_rate": 5.6991732608115e-06,
"loss": 0.0001,
"step": 165
},
{
"epoch": 1.4341252699784017,
"grad_norm": 0.0013565586414188147,
"learning_rate": 5.386666165267256e-06,
"loss": 0.0001,
"step": 166
},
{
"epoch": 1.4427645788336934,
"grad_norm": 0.007900253869593143,
"learning_rate": 5.08225505950613e-06,
"loss": 0.0001,
"step": 167
},
{
"epoch": 1.451403887688985,
"grad_norm": 0.0012855289969593287,
"learning_rate": 4.786023166484913e-06,
"loss": 0.0001,
"step": 168
},
{
"epoch": 1.4600431965442764,
"grad_norm": 0.01582699827849865,
"learning_rate": 4.498051473045291e-06,
"loss": 0.0002,
"step": 169
},
{
"epoch": 1.468682505399568,
"grad_norm": 0.0007394661079160869,
"learning_rate": 4.218418707772886e-06,
"loss": 0.0001,
"step": 170
},
{
"epoch": 1.4773218142548596,
"grad_norm": 0.0008164668688550591,
"learning_rate": 3.947201319473587e-06,
"loss": 0.0001,
"step": 171
},
{
"epoch": 1.485961123110151,
"grad_norm": 0.002124256454408169,
"learning_rate": 3.684473456273278e-06,
"loss": 0.0002,
"step": 172
},
{
"epoch": 1.4946004319654427,
"grad_norm": 0.00903933122754097,
"learning_rate": 3.4303069453464383e-06,
"loss": 0.0003,
"step": 173
},
{
"epoch": 1.5032397408207343,
"grad_norm": 0.017047259956598282,
"learning_rate": 3.184771273279312e-06,
"loss": 0.0006,
"step": 174
},
{
"epoch": 1.511879049676026,
"grad_norm": 0.01836921088397503,
"learning_rate": 2.947933567072987e-06,
"loss": 0.0006,
"step": 175
},
{
"epoch": 1.5205183585313176,
"grad_norm": 0.0054769194684922695,
"learning_rate": 2.719858575791534e-06,
"loss": 0.0003,
"step": 176
},
{
"epoch": 1.5291576673866092,
"grad_norm": 0.05894150957465172,
"learning_rate": 2.500608652860256e-06,
"loss": 0.0009,
"step": 177
},
{
"epoch": 1.5377969762419006,
"grad_norm": 0.0017646638443693519,
"learning_rate": 2.2902437390188737e-06,
"loss": 0.0002,
"step": 178
},
{
"epoch": 1.5464362850971922,
"grad_norm": 0.0016004899516701698,
"learning_rate": 2.0888213459343587e-06,
"loss": 0.0002,
"step": 179
},
{
"epoch": 1.5550755939524838,
"grad_norm": 0.0009360113763250411,
"learning_rate": 1.8963965404777875e-06,
"loss": 0.0001,
"step": 180
},
{
"epoch": 1.5637149028077753,
"grad_norm": 0.0009685845579952002,
"learning_rate": 1.7130219296696263e-06,
"loss": 0.0001,
"step": 181
},
{
"epoch": 1.5723542116630669,
"grad_norm": 0.0022252460476011038,
"learning_rate": 1.5387476462974824e-06,
"loss": 0.0003,
"step": 182
},
{
"epoch": 1.5809935205183585,
"grad_norm": 0.0034834735561162233,
"learning_rate": 1.3736213352103147e-06,
"loss": 0.0001,
"step": 183
},
{
"epoch": 1.5896328293736501,
"grad_norm": 0.0007790013332851231,
"learning_rate": 1.2176881402928002e-06,
"loss": 0.0001,
"step": 184
},
{
"epoch": 1.5982721382289418,
"grad_norm": 0.0013246826129034162,
"learning_rate": 1.0709906921234367e-06,
"loss": 0.0001,
"step": 185
},
{
"epoch": 1.6069114470842334,
"grad_norm": 0.0007768021896481514,
"learning_rate": 9.33569096319799e-07,
"loss": 0.0001,
"step": 186
},
{
"epoch": 1.6155507559395248,
"grad_norm": 0.0009022291051223874,
"learning_rate": 8.054609225740255e-07,
"loss": 0.0001,
"step": 187
},
{
"epoch": 1.6241900647948164,
"grad_norm": 0.0007956126355566084,
"learning_rate": 6.867011943816724e-07,
"loss": 0.0001,
"step": 188
},
{
"epoch": 1.6328293736501078,
"grad_norm": 0.0012011040234938264,
"learning_rate": 5.77322379466617e-07,
"loss": 0.0001,
"step": 189
},
{
"epoch": 1.6414686825053995,
"grad_norm": 0.0011474161874502897,
"learning_rate": 4.773543809047186e-07,
"loss": 0.0001,
"step": 190
},
{
"epoch": 1.650107991360691,
"grad_norm": 0.0006911220261827111,
"learning_rate": 3.868245289486027e-07,
"loss": 0.0001,
"step": 191
},
{
"epoch": 1.6587473002159827,
"grad_norm": 0.0015587140806019306,
"learning_rate": 3.0575757355586817e-07,
"loss": 0.0001,
"step": 192
},
{
"epoch": 1.6673866090712743,
"grad_norm": 0.0011123515432700515,
"learning_rate": 2.3417567762266497e-07,
"loss": 0.0001,
"step": 193
},
{
"epoch": 1.676025917926566,
"grad_norm": 0.002342136111110449,
"learning_rate": 1.7209841092460043e-07,
"loss": 0.0001,
"step": 194
},
{
"epoch": 1.6846652267818576,
"grad_norm": 0.0016477032331749797,
"learning_rate": 1.1954274476655534e-07,
"loss": 0.0001,
"step": 195
},
{
"epoch": 1.693304535637149,
"grad_norm": 0.0014787918189540505,
"learning_rate": 7.652304734289127e-08,
"loss": 0.0001,
"step": 196
},
{
"epoch": 1.7019438444924406,
"grad_norm": 0.00296723167411983,
"learning_rate": 4.30510798093342e-08,
"loss": 0.0001,
"step": 197
},
{
"epoch": 1.710583153347732,
"grad_norm": 0.0015325212152674794,
"learning_rate": 1.9135993067588284e-08,
"loss": 0.0001,
"step": 198
},
{
"epoch": 1.7192224622030237,
"grad_norm": 0.00978434830904007,
"learning_rate": 4.784325263584854e-09,
"loss": 0.0002,
"step": 199
},
{
"epoch": 1.7278617710583153,
"grad_norm": 0.0022683811839669943,
"learning_rate": 0.0,
"loss": 0.0002,
"step": 200
},
{
"epoch": 1.7278617710583153,
"eval_loss": 0.00024209167168010026,
"eval_runtime": 20.7713,
"eval_samples_per_second": 9.388,
"eval_steps_per_second": 2.359,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 4,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.970690585555108e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}