{
"best_metric": 1.1956850290298462,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.09519276534983341,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00047596382674916705,
"grad_norm": 18.821937561035156,
"learning_rate": 1.0170000000000001e-05,
"loss": 3.7617,
"step": 1
},
{
"epoch": 0.00047596382674916705,
"eval_loss": 2.0851309299468994,
"eval_runtime": 94.1092,
"eval_samples_per_second": 9.404,
"eval_steps_per_second": 2.359,
"step": 1
},
{
"epoch": 0.0009519276534983341,
"grad_norm": 20.83098602294922,
"learning_rate": 2.0340000000000002e-05,
"loss": 3.8855,
"step": 2
},
{
"epoch": 0.0014278914802475012,
"grad_norm": 18.45833969116211,
"learning_rate": 3.051e-05,
"loss": 4.0341,
"step": 3
},
{
"epoch": 0.0019038553069966682,
"grad_norm": 15.618721961975098,
"learning_rate": 4.0680000000000004e-05,
"loss": 3.4497,
"step": 4
},
{
"epoch": 0.002379819133745835,
"grad_norm": 14.598923683166504,
"learning_rate": 5.085e-05,
"loss": 3.2374,
"step": 5
},
{
"epoch": 0.0028557829604950024,
"grad_norm": 16.081514358520508,
"learning_rate": 6.102e-05,
"loss": 3.4533,
"step": 6
},
{
"epoch": 0.0033317467872441696,
"grad_norm": 14.391841888427734,
"learning_rate": 7.119e-05,
"loss": 3.2119,
"step": 7
},
{
"epoch": 0.0038077106139933364,
"grad_norm": 13.710631370544434,
"learning_rate": 8.136000000000001e-05,
"loss": 3.3193,
"step": 8
},
{
"epoch": 0.004283674440742504,
"grad_norm": 12.184770584106445,
"learning_rate": 9.153000000000001e-05,
"loss": 3.0283,
"step": 9
},
{
"epoch": 0.00475963826749167,
"grad_norm": 11.110732078552246,
"learning_rate": 0.0001017,
"loss": 3.0447,
"step": 10
},
{
"epoch": 0.005235602094240838,
"grad_norm": 13.03386402130127,
"learning_rate": 0.00010116473684210527,
"loss": 3.0503,
"step": 11
},
{
"epoch": 0.005711565920990005,
"grad_norm": 11.393197059631348,
"learning_rate": 0.00010062947368421052,
"loss": 3.1731,
"step": 12
},
{
"epoch": 0.006187529747739172,
"grad_norm": 11.392594337463379,
"learning_rate": 0.00010009421052631579,
"loss": 2.7808,
"step": 13
},
{
"epoch": 0.006663493574488339,
"grad_norm": 11.060273170471191,
"learning_rate": 9.955894736842107e-05,
"loss": 2.9735,
"step": 14
},
{
"epoch": 0.007139457401237506,
"grad_norm": 10.616848945617676,
"learning_rate": 9.902368421052632e-05,
"loss": 2.8027,
"step": 15
},
{
"epoch": 0.007615421227986673,
"grad_norm": 11.916224479675293,
"learning_rate": 9.848842105263159e-05,
"loss": 3.3891,
"step": 16
},
{
"epoch": 0.00809138505473584,
"grad_norm": 11.447843551635742,
"learning_rate": 9.795315789473685e-05,
"loss": 3.1294,
"step": 17
},
{
"epoch": 0.008567348881485007,
"grad_norm": 9.17795467376709,
"learning_rate": 9.74178947368421e-05,
"loss": 2.7995,
"step": 18
},
{
"epoch": 0.009043312708234174,
"grad_norm": 10.779789924621582,
"learning_rate": 9.688263157894737e-05,
"loss": 3.2716,
"step": 19
},
{
"epoch": 0.00951927653498334,
"grad_norm": 10.012481689453125,
"learning_rate": 9.634736842105264e-05,
"loss": 2.9491,
"step": 20
},
{
"epoch": 0.009995240361732508,
"grad_norm": 10.166115760803223,
"learning_rate": 9.58121052631579e-05,
"loss": 3.024,
"step": 21
},
{
"epoch": 0.010471204188481676,
"grad_norm": 9.625226020812988,
"learning_rate": 9.527684210526317e-05,
"loss": 2.9964,
"step": 22
},
{
"epoch": 0.010947168015230843,
"grad_norm": 11.936861991882324,
"learning_rate": 9.474157894736843e-05,
"loss": 3.0226,
"step": 23
},
{
"epoch": 0.01142313184198001,
"grad_norm": 9.899079322814941,
"learning_rate": 9.420631578947368e-05,
"loss": 3.0418,
"step": 24
},
{
"epoch": 0.011899095668729176,
"grad_norm": 9.566582679748535,
"learning_rate": 9.367105263157895e-05,
"loss": 2.5971,
"step": 25
},
{
"epoch": 0.012375059495478343,
"grad_norm": 9.53864860534668,
"learning_rate": 9.313578947368422e-05,
"loss": 3.0019,
"step": 26
},
{
"epoch": 0.01285102332222751,
"grad_norm": 13.411975860595703,
"learning_rate": 9.260052631578948e-05,
"loss": 2.7033,
"step": 27
},
{
"epoch": 0.013326987148976678,
"grad_norm": 26.838424682617188,
"learning_rate": 9.206526315789475e-05,
"loss": 2.5882,
"step": 28
},
{
"epoch": 0.013802950975725845,
"grad_norm": 11.076753616333008,
"learning_rate": 9.153000000000001e-05,
"loss": 2.9107,
"step": 29
},
{
"epoch": 0.014278914802475012,
"grad_norm": 10.66457748413086,
"learning_rate": 9.099473684210527e-05,
"loss": 3.1288,
"step": 30
},
{
"epoch": 0.014754878629224179,
"grad_norm": 11.16288948059082,
"learning_rate": 9.045947368421053e-05,
"loss": 2.913,
"step": 31
},
{
"epoch": 0.015230842455973346,
"grad_norm": 9.841339111328125,
"learning_rate": 8.99242105263158e-05,
"loss": 2.8651,
"step": 32
},
{
"epoch": 0.015706806282722512,
"grad_norm": 9.972783088684082,
"learning_rate": 8.938894736842105e-05,
"loss": 2.6382,
"step": 33
},
{
"epoch": 0.01618277010947168,
"grad_norm": 8.628253936767578,
"learning_rate": 8.885368421052633e-05,
"loss": 2.8631,
"step": 34
},
{
"epoch": 0.016658733936220846,
"grad_norm": 9.764812469482422,
"learning_rate": 8.83184210526316e-05,
"loss": 3.1602,
"step": 35
},
{
"epoch": 0.017134697762970014,
"grad_norm": 9.357998847961426,
"learning_rate": 8.778315789473685e-05,
"loss": 3.232,
"step": 36
},
{
"epoch": 0.017610661589719183,
"grad_norm": 8.898050308227539,
"learning_rate": 8.724789473684211e-05,
"loss": 3.064,
"step": 37
},
{
"epoch": 0.018086625416468348,
"grad_norm": 8.951642990112305,
"learning_rate": 8.671263157894738e-05,
"loss": 2.6545,
"step": 38
},
{
"epoch": 0.018562589243217516,
"grad_norm": 8.89450454711914,
"learning_rate": 8.617736842105263e-05,
"loss": 2.8535,
"step": 39
},
{
"epoch": 0.01903855306996668,
"grad_norm": 8.89578628540039,
"learning_rate": 8.56421052631579e-05,
"loss": 2.775,
"step": 40
},
{
"epoch": 0.01951451689671585,
"grad_norm": 11.88039493560791,
"learning_rate": 8.510684210526316e-05,
"loss": 2.9885,
"step": 41
},
{
"epoch": 0.019990480723465015,
"grad_norm": 9.176898956298828,
"learning_rate": 8.457157894736843e-05,
"loss": 2.7476,
"step": 42
},
{
"epoch": 0.020466444550214184,
"grad_norm": 10.42405891418457,
"learning_rate": 8.403631578947369e-05,
"loss": 3.1671,
"step": 43
},
{
"epoch": 0.020942408376963352,
"grad_norm": 9.580604553222656,
"learning_rate": 8.350105263157896e-05,
"loss": 2.7088,
"step": 44
},
{
"epoch": 0.021418372203712517,
"grad_norm": 11.169540405273438,
"learning_rate": 8.296578947368421e-05,
"loss": 3.0075,
"step": 45
},
{
"epoch": 0.021894336030461686,
"grad_norm": 8.268192291259766,
"learning_rate": 8.243052631578948e-05,
"loss": 2.5109,
"step": 46
},
{
"epoch": 0.02237029985721085,
"grad_norm": 9.292203903198242,
"learning_rate": 8.189526315789474e-05,
"loss": 2.9182,
"step": 47
},
{
"epoch": 0.02284626368396002,
"grad_norm": 10.59662914276123,
"learning_rate": 8.136000000000001e-05,
"loss": 3.2872,
"step": 48
},
{
"epoch": 0.023322227510709188,
"grad_norm": 10.883740425109863,
"learning_rate": 8.082473684210527e-05,
"loss": 3.0321,
"step": 49
},
{
"epoch": 0.023798191337458353,
"grad_norm": 8.746244430541992,
"learning_rate": 8.028947368421054e-05,
"loss": 2.9888,
"step": 50
},
{
"epoch": 0.023798191337458353,
"eval_loss": 1.4506478309631348,
"eval_runtime": 94.223,
"eval_samples_per_second": 9.393,
"eval_steps_per_second": 2.356,
"step": 50
},
{
"epoch": 0.02427415516420752,
"grad_norm": 8.132503509521484,
"learning_rate": 7.975421052631579e-05,
"loss": 2.8369,
"step": 51
},
{
"epoch": 0.024750118990956686,
"grad_norm": 7.467820644378662,
"learning_rate": 7.921894736842106e-05,
"loss": 3.1423,
"step": 52
},
{
"epoch": 0.025226082817705855,
"grad_norm": 6.4784369468688965,
"learning_rate": 7.868368421052632e-05,
"loss": 2.7428,
"step": 53
},
{
"epoch": 0.02570204664445502,
"grad_norm": 6.803327560424805,
"learning_rate": 7.814842105263157e-05,
"loss": 2.8238,
"step": 54
},
{
"epoch": 0.02617801047120419,
"grad_norm": 9.669547080993652,
"learning_rate": 7.761315789473685e-05,
"loss": 2.7933,
"step": 55
},
{
"epoch": 0.026653974297953357,
"grad_norm": 9.685455322265625,
"learning_rate": 7.70778947368421e-05,
"loss": 3.0942,
"step": 56
},
{
"epoch": 0.027129938124702522,
"grad_norm": 8.498844146728516,
"learning_rate": 7.654263157894737e-05,
"loss": 3.0961,
"step": 57
},
{
"epoch": 0.02760590195145169,
"grad_norm": 7.0263776779174805,
"learning_rate": 7.600736842105264e-05,
"loss": 2.9287,
"step": 58
},
{
"epoch": 0.028081865778200855,
"grad_norm": 7.7478742599487305,
"learning_rate": 7.54721052631579e-05,
"loss": 2.7414,
"step": 59
},
{
"epoch": 0.028557829604950024,
"grad_norm": 6.691257953643799,
"learning_rate": 7.493684210526315e-05,
"loss": 2.6123,
"step": 60
},
{
"epoch": 0.029033793431699192,
"grad_norm": 7.181465148925781,
"learning_rate": 7.440157894736843e-05,
"loss": 2.6851,
"step": 61
},
{
"epoch": 0.029509757258448358,
"grad_norm": 6.531435012817383,
"learning_rate": 7.386631578947369e-05,
"loss": 2.6416,
"step": 62
},
{
"epoch": 0.029985721085197526,
"grad_norm": 9.599217414855957,
"learning_rate": 7.333105263157895e-05,
"loss": 2.7074,
"step": 63
},
{
"epoch": 0.03046168491194669,
"grad_norm": 7.4359846115112305,
"learning_rate": 7.279578947368422e-05,
"loss": 2.8352,
"step": 64
},
{
"epoch": 0.03093764873869586,
"grad_norm": 6.91318941116333,
"learning_rate": 7.226052631578947e-05,
"loss": 2.5786,
"step": 65
},
{
"epoch": 0.031413612565445025,
"grad_norm": 8.484053611755371,
"learning_rate": 7.172526315789474e-05,
"loss": 3.0093,
"step": 66
},
{
"epoch": 0.0318895763921942,
"grad_norm": 7.760731220245361,
"learning_rate": 7.119e-05,
"loss": 2.8831,
"step": 67
},
{
"epoch": 0.03236554021894336,
"grad_norm": 7.7634100914001465,
"learning_rate": 7.065473684210527e-05,
"loss": 2.8134,
"step": 68
},
{
"epoch": 0.03284150404569253,
"grad_norm": 15.038714408874512,
"learning_rate": 7.011947368421053e-05,
"loss": 2.446,
"step": 69
},
{
"epoch": 0.03331746787244169,
"grad_norm": 6.84593391418457,
"learning_rate": 6.95842105263158e-05,
"loss": 2.8314,
"step": 70
},
{
"epoch": 0.033793431699190864,
"grad_norm": 6.8339667320251465,
"learning_rate": 6.904894736842105e-05,
"loss": 2.507,
"step": 71
},
{
"epoch": 0.03426939552594003,
"grad_norm": 8.750052452087402,
"learning_rate": 6.851368421052632e-05,
"loss": 2.637,
"step": 72
},
{
"epoch": 0.034745359352689194,
"grad_norm": 7.736267566680908,
"learning_rate": 6.797842105263158e-05,
"loss": 2.748,
"step": 73
},
{
"epoch": 0.035221323179438366,
"grad_norm": 7.89774227142334,
"learning_rate": 6.744315789473685e-05,
"loss": 2.7948,
"step": 74
},
{
"epoch": 0.03569728700618753,
"grad_norm": 7.224119663238525,
"learning_rate": 6.690789473684211e-05,
"loss": 2.7275,
"step": 75
},
{
"epoch": 0.036173250832936696,
"grad_norm": 7.180510520935059,
"learning_rate": 6.637263157894738e-05,
"loss": 2.5776,
"step": 76
},
{
"epoch": 0.03664921465968586,
"grad_norm": 6.440933704376221,
"learning_rate": 6.583736842105263e-05,
"loss": 2.241,
"step": 77
},
{
"epoch": 0.03712517848643503,
"grad_norm": 8.913047790527344,
"learning_rate": 6.53021052631579e-05,
"loss": 2.9477,
"step": 78
},
{
"epoch": 0.0376011423131842,
"grad_norm": 7.736593723297119,
"learning_rate": 6.476684210526316e-05,
"loss": 2.8696,
"step": 79
},
{
"epoch": 0.03807710613993336,
"grad_norm": 7.847418308258057,
"learning_rate": 6.423157894736841e-05,
"loss": 2.3813,
"step": 80
},
{
"epoch": 0.038553069966682535,
"grad_norm": 9.366430282592773,
"learning_rate": 6.369631578947368e-05,
"loss": 2.8983,
"step": 81
},
{
"epoch": 0.0390290337934317,
"grad_norm": 6.681727886199951,
"learning_rate": 6.316105263157896e-05,
"loss": 2.4428,
"step": 82
},
{
"epoch": 0.039504997620180865,
"grad_norm": 8.800130844116211,
"learning_rate": 6.262578947368421e-05,
"loss": 2.9551,
"step": 83
},
{
"epoch": 0.03998096144693003,
"grad_norm": 8.04470157623291,
"learning_rate": 6.209052631578948e-05,
"loss": 2.7357,
"step": 84
},
{
"epoch": 0.0404569252736792,
"grad_norm": 6.852024555206299,
"learning_rate": 6.155526315789474e-05,
"loss": 2.6536,
"step": 85
},
{
"epoch": 0.04093288910042837,
"grad_norm": 7.370736598968506,
"learning_rate": 6.102e-05,
"loss": 2.6235,
"step": 86
},
{
"epoch": 0.04140885292717753,
"grad_norm": 7.6142401695251465,
"learning_rate": 6.048473684210526e-05,
"loss": 2.7034,
"step": 87
},
{
"epoch": 0.041884816753926704,
"grad_norm": 7.17495059967041,
"learning_rate": 5.9949473684210527e-05,
"loss": 2.7207,
"step": 88
},
{
"epoch": 0.04236078058067587,
"grad_norm": 7.216758728027344,
"learning_rate": 5.94142105263158e-05,
"loss": 2.4448,
"step": 89
},
{
"epoch": 0.042836744407425034,
"grad_norm": 7.9468817710876465,
"learning_rate": 5.887894736842106e-05,
"loss": 2.9818,
"step": 90
},
{
"epoch": 0.043312708234174206,
"grad_norm": 8.539376258850098,
"learning_rate": 5.834368421052632e-05,
"loss": 2.4128,
"step": 91
},
{
"epoch": 0.04378867206092337,
"grad_norm": 8.345818519592285,
"learning_rate": 5.780842105263158e-05,
"loss": 2.7606,
"step": 92
},
{
"epoch": 0.044264635887672536,
"grad_norm": 8.508038520812988,
"learning_rate": 5.727315789473684e-05,
"loss": 2.6035,
"step": 93
},
{
"epoch": 0.0447405997144217,
"grad_norm": 8.71206283569336,
"learning_rate": 5.673789473684211e-05,
"loss": 2.9024,
"step": 94
},
{
"epoch": 0.04521656354117087,
"grad_norm": 7.256693363189697,
"learning_rate": 5.620263157894738e-05,
"loss": 2.5654,
"step": 95
},
{
"epoch": 0.04569252736792004,
"grad_norm": 6.628811359405518,
"learning_rate": 5.566736842105264e-05,
"loss": 2.0765,
"step": 96
},
{
"epoch": 0.0461684911946692,
"grad_norm": 9.192995071411133,
"learning_rate": 5.51321052631579e-05,
"loss": 3.0441,
"step": 97
},
{
"epoch": 0.046644455021418375,
"grad_norm": 9.181817054748535,
"learning_rate": 5.459684210526316e-05,
"loss": 2.8811,
"step": 98
},
{
"epoch": 0.04712041884816754,
"grad_norm": 9.44265079498291,
"learning_rate": 5.406157894736842e-05,
"loss": 2.6583,
"step": 99
},
{
"epoch": 0.047596382674916705,
"grad_norm": 9.560362815856934,
"learning_rate": 5.352631578947368e-05,
"loss": 2.5805,
"step": 100
},
{
"epoch": 0.047596382674916705,
"eval_loss": 1.34207284450531,
"eval_runtime": 94.197,
"eval_samples_per_second": 9.395,
"eval_steps_per_second": 2.357,
"step": 100
},
{
"epoch": 0.04807234650166587,
"grad_norm": 7.516977787017822,
"learning_rate": 5.299105263157895e-05,
"loss": 2.5181,
"step": 101
},
{
"epoch": 0.04854831032841504,
"grad_norm": 7.009253025054932,
"learning_rate": 5.245578947368422e-05,
"loss": 2.6719,
"step": 102
},
{
"epoch": 0.04902427415516421,
"grad_norm": 6.846374988555908,
"learning_rate": 5.192052631578948e-05,
"loss": 2.5637,
"step": 103
},
{
"epoch": 0.04950023798191337,
"grad_norm": 5.941534519195557,
"learning_rate": 5.1385263157894744e-05,
"loss": 2.2845,
"step": 104
},
{
"epoch": 0.049976201808662545,
"grad_norm": 7.417452812194824,
"learning_rate": 5.085e-05,
"loss": 3.1752,
"step": 105
},
{
"epoch": 0.05045216563541171,
"grad_norm": 6.396870136260986,
"learning_rate": 5.031473684210526e-05,
"loss": 2.5322,
"step": 106
},
{
"epoch": 0.050928129462160875,
"grad_norm": 6.419276237487793,
"learning_rate": 4.9779473684210534e-05,
"loss": 2.5846,
"step": 107
},
{
"epoch": 0.05140409328891004,
"grad_norm": 9.035964012145996,
"learning_rate": 4.924421052631579e-05,
"loss": 2.8623,
"step": 108
},
{
"epoch": 0.05188005711565921,
"grad_norm": 6.224599361419678,
"learning_rate": 4.870894736842105e-05,
"loss": 2.3845,
"step": 109
},
{
"epoch": 0.05235602094240838,
"grad_norm": 7.3065409660339355,
"learning_rate": 4.817368421052632e-05,
"loss": 2.7384,
"step": 110
},
{
"epoch": 0.05283198476915754,
"grad_norm": 6.494760036468506,
"learning_rate": 4.763842105263158e-05,
"loss": 2.3574,
"step": 111
},
{
"epoch": 0.053307948595906714,
"grad_norm": 5.581643104553223,
"learning_rate": 4.710315789473684e-05,
"loss": 2.2699,
"step": 112
},
{
"epoch": 0.05378391242265588,
"grad_norm": 6.040269374847412,
"learning_rate": 4.656789473684211e-05,
"loss": 2.5859,
"step": 113
},
{
"epoch": 0.054259876249405044,
"grad_norm": 6.788252353668213,
"learning_rate": 4.6032631578947374e-05,
"loss": 2.4009,
"step": 114
},
{
"epoch": 0.05473584007615421,
"grad_norm": 7.658879280090332,
"learning_rate": 4.549736842105263e-05,
"loss": 2.0763,
"step": 115
},
{
"epoch": 0.05521180390290338,
"grad_norm": 9.23805046081543,
"learning_rate": 4.49621052631579e-05,
"loss": 2.2703,
"step": 116
},
{
"epoch": 0.055687767729652546,
"grad_norm": 7.436956882476807,
"learning_rate": 4.4426842105263164e-05,
"loss": 2.7847,
"step": 117
},
{
"epoch": 0.05616373155640171,
"grad_norm": 8.176994323730469,
"learning_rate": 4.389157894736842e-05,
"loss": 2.5287,
"step": 118
},
{
"epoch": 0.05663969538315088,
"grad_norm": 7.547792434692383,
"learning_rate": 4.335631578947369e-05,
"loss": 2.81,
"step": 119
},
{
"epoch": 0.05711565920990005,
"grad_norm": 7.150010108947754,
"learning_rate": 4.282105263157895e-05,
"loss": 2.7525,
"step": 120
},
{
"epoch": 0.05759162303664921,
"grad_norm": 7.117188930511475,
"learning_rate": 4.228578947368421e-05,
"loss": 2.5581,
"step": 121
},
{
"epoch": 0.058067586863398385,
"grad_norm": 6.114779472351074,
"learning_rate": 4.175052631578948e-05,
"loss": 2.5203,
"step": 122
},
{
"epoch": 0.05854355069014755,
"grad_norm": 6.867531776428223,
"learning_rate": 4.121526315789474e-05,
"loss": 2.6818,
"step": 123
},
{
"epoch": 0.059019514516896715,
"grad_norm": 7.646029949188232,
"learning_rate": 4.0680000000000004e-05,
"loss": 2.6518,
"step": 124
},
{
"epoch": 0.05949547834364588,
"grad_norm": 8.080942153930664,
"learning_rate": 4.014473684210527e-05,
"loss": 2.8059,
"step": 125
},
{
"epoch": 0.05997144217039505,
"grad_norm": 6.926398277282715,
"learning_rate": 3.960947368421053e-05,
"loss": 2.5314,
"step": 126
},
{
"epoch": 0.06044740599714422,
"grad_norm": 7.169958114624023,
"learning_rate": 3.907421052631579e-05,
"loss": 2.5107,
"step": 127
},
{
"epoch": 0.06092336982389338,
"grad_norm": 7.661369323730469,
"learning_rate": 3.853894736842105e-05,
"loss": 2.3918,
"step": 128
},
{
"epoch": 0.061399333650642554,
"grad_norm": 7.439793586730957,
"learning_rate": 3.800368421052632e-05,
"loss": 2.8541,
"step": 129
},
{
"epoch": 0.06187529747739172,
"grad_norm": 6.659045696258545,
"learning_rate": 3.746842105263158e-05,
"loss": 2.5837,
"step": 130
},
{
"epoch": 0.062351261304140884,
"grad_norm": 8.68233585357666,
"learning_rate": 3.693315789473684e-05,
"loss": 2.5465,
"step": 131
},
{
"epoch": 0.06282722513089005,
"grad_norm": 7.132856845855713,
"learning_rate": 3.639789473684211e-05,
"loss": 2.4912,
"step": 132
},
{
"epoch": 0.06330318895763921,
"grad_norm": 7.528649806976318,
"learning_rate": 3.586263157894737e-05,
"loss": 2.7235,
"step": 133
},
{
"epoch": 0.0637791527843884,
"grad_norm": 6.1327667236328125,
"learning_rate": 3.5327368421052633e-05,
"loss": 2.4542,
"step": 134
},
{
"epoch": 0.06425511661113756,
"grad_norm": 6.09030294418335,
"learning_rate": 3.47921052631579e-05,
"loss": 2.3305,
"step": 135
},
{
"epoch": 0.06473108043788672,
"grad_norm": 6.725249290466309,
"learning_rate": 3.425684210526316e-05,
"loss": 2.4557,
"step": 136
},
{
"epoch": 0.06520704426463589,
"grad_norm": 6.322257041931152,
"learning_rate": 3.3721578947368424e-05,
"loss": 2.2034,
"step": 137
},
{
"epoch": 0.06568300809138505,
"grad_norm": 5.7170305252075195,
"learning_rate": 3.318631578947369e-05,
"loss": 2.1993,
"step": 138
},
{
"epoch": 0.06615897191813422,
"grad_norm": 7.491881370544434,
"learning_rate": 3.265105263157895e-05,
"loss": 2.6061,
"step": 139
},
{
"epoch": 0.06663493574488338,
"grad_norm": 7.218158721923828,
"learning_rate": 3.211578947368421e-05,
"loss": 2.622,
"step": 140
},
{
"epoch": 0.06711089957163256,
"grad_norm": 10.26785659790039,
"learning_rate": 3.158052631578948e-05,
"loss": 2.6993,
"step": 141
},
{
"epoch": 0.06758686339838173,
"grad_norm": 7.6761393547058105,
"learning_rate": 3.104526315789474e-05,
"loss": 2.6623,
"step": 142
},
{
"epoch": 0.06806282722513089,
"grad_norm": 7.0337395668029785,
"learning_rate": 3.051e-05,
"loss": 2.632,
"step": 143
},
{
"epoch": 0.06853879105188006,
"grad_norm": 7.8988189697265625,
"learning_rate": 2.9974736842105263e-05,
"loss": 3.1195,
"step": 144
},
{
"epoch": 0.06901475487862922,
"grad_norm": 8.312713623046875,
"learning_rate": 2.943947368421053e-05,
"loss": 2.5157,
"step": 145
},
{
"epoch": 0.06949071870537839,
"grad_norm": 7.518918514251709,
"learning_rate": 2.890421052631579e-05,
"loss": 2.7354,
"step": 146
},
{
"epoch": 0.06996668253212755,
"grad_norm": 7.5307841300964355,
"learning_rate": 2.8368947368421054e-05,
"loss": 2.5474,
"step": 147
},
{
"epoch": 0.07044264635887673,
"grad_norm": 8.257736206054688,
"learning_rate": 2.783368421052632e-05,
"loss": 2.8553,
"step": 148
},
{
"epoch": 0.0709186101856259,
"grad_norm": 6.292891979217529,
"learning_rate": 2.729842105263158e-05,
"loss": 2.3596,
"step": 149
},
{
"epoch": 0.07139457401237506,
"grad_norm": 9.312799453735352,
"learning_rate": 2.676315789473684e-05,
"loss": 2.5738,
"step": 150
},
{
"epoch": 0.07139457401237506,
"eval_loss": 1.2638894319534302,
"eval_runtime": 94.1331,
"eval_samples_per_second": 9.402,
"eval_steps_per_second": 2.358,
"step": 150
},
{
"epoch": 0.07187053783912423,
"grad_norm": 5.808437824249268,
"learning_rate": 2.622789473684211e-05,
"loss": 2.4866,
"step": 151
},
{
"epoch": 0.07234650166587339,
"grad_norm": 6.367058753967285,
"learning_rate": 2.5692631578947372e-05,
"loss": 2.5468,
"step": 152
},
{
"epoch": 0.07282246549262256,
"grad_norm": 7.049677848815918,
"learning_rate": 2.515736842105263e-05,
"loss": 2.6916,
"step": 153
},
{
"epoch": 0.07329842931937172,
"grad_norm": 6.486673831939697,
"learning_rate": 2.4622105263157897e-05,
"loss": 2.5986,
"step": 154
},
{
"epoch": 0.0737743931461209,
"grad_norm": 6.483848571777344,
"learning_rate": 2.408684210526316e-05,
"loss": 2.6228,
"step": 155
},
{
"epoch": 0.07425035697287007,
"grad_norm": 8.12568473815918,
"learning_rate": 2.355157894736842e-05,
"loss": 3.0707,
"step": 156
},
{
"epoch": 0.07472632079961923,
"grad_norm": 6.405067443847656,
"learning_rate": 2.3016315789473687e-05,
"loss": 2.654,
"step": 157
},
{
"epoch": 0.0752022846263684,
"grad_norm": 5.833227634429932,
"learning_rate": 2.248105263157895e-05,
"loss": 2.4285,
"step": 158
},
{
"epoch": 0.07567824845311756,
"grad_norm": 5.303393840789795,
"learning_rate": 2.194578947368421e-05,
"loss": 2.2755,
"step": 159
},
{
"epoch": 0.07615421227986673,
"grad_norm": 6.173555850982666,
"learning_rate": 2.1410526315789474e-05,
"loss": 2.4719,
"step": 160
},
{
"epoch": 0.07663017610661589,
"grad_norm": 5.758234977722168,
"learning_rate": 2.087526315789474e-05,
"loss": 2.6648,
"step": 161
},
{
"epoch": 0.07710613993336507,
"grad_norm": 5.576732158660889,
"learning_rate": 2.0340000000000002e-05,
"loss": 2.5195,
"step": 162
},
{
"epoch": 0.07758210376011423,
"grad_norm": 6.40316104888916,
"learning_rate": 1.9804736842105264e-05,
"loss": 2.6616,
"step": 163
},
{
"epoch": 0.0780580675868634,
"grad_norm": 5.3443427085876465,
"learning_rate": 1.9269473684210526e-05,
"loss": 2.1508,
"step": 164
},
{
"epoch": 0.07853403141361257,
"grad_norm": 7.0513014793396,
"learning_rate": 1.873421052631579e-05,
"loss": 2.3545,
"step": 165
},
{
"epoch": 0.07900999524036173,
"grad_norm": 6.783944129943848,
"learning_rate": 1.8198947368421054e-05,
"loss": 2.361,
"step": 166
},
{
"epoch": 0.0794859590671109,
"grad_norm": 7.000679016113281,
"learning_rate": 1.7663684210526317e-05,
"loss": 2.6073,
"step": 167
},
{
"epoch": 0.07996192289386006,
"grad_norm": 5.986188888549805,
"learning_rate": 1.712842105263158e-05,
"loss": 2.3954,
"step": 168
},
{
"epoch": 0.08043788672060924,
"grad_norm": 6.529272556304932,
"learning_rate": 1.6593157894736845e-05,
"loss": 2.7285,
"step": 169
},
{
"epoch": 0.0809138505473584,
"grad_norm": 5.4884772300720215,
"learning_rate": 1.6057894736842104e-05,
"loss": 2.1842,
"step": 170
},
{
"epoch": 0.08138981437410757,
"grad_norm": 6.433114528656006,
"learning_rate": 1.552263157894737e-05,
"loss": 2.3042,
"step": 171
},
{
"epoch": 0.08186577820085673,
"grad_norm": 6.97841215133667,
"learning_rate": 1.4987368421052632e-05,
"loss": 2.2691,
"step": 172
},
{
"epoch": 0.0823417420276059,
"grad_norm": 6.401273727416992,
"learning_rate": 1.4452105263157896e-05,
"loss": 2.5178,
"step": 173
},
{
"epoch": 0.08281770585435506,
"grad_norm": 6.540492534637451,
"learning_rate": 1.391684210526316e-05,
"loss": 2.2453,
"step": 174
},
{
"epoch": 0.08329366968110423,
"grad_norm": 6.713190078735352,
"learning_rate": 1.338157894736842e-05,
"loss": 2.2148,
"step": 175
},
{
"epoch": 0.08376963350785341,
"grad_norm": 7.125333309173584,
"learning_rate": 1.2846315789473686e-05,
"loss": 2.635,
"step": 176
},
{
"epoch": 0.08424559733460257,
"grad_norm": 7.104244709014893,
"learning_rate": 1.2311052631578948e-05,
"loss": 2.5499,
"step": 177
},
{
"epoch": 0.08472156116135174,
"grad_norm": 7.630082130432129,
"learning_rate": 1.177578947368421e-05,
"loss": 2.3113,
"step": 178
},
{
"epoch": 0.0851975249881009,
"grad_norm": 7.205958366394043,
"learning_rate": 1.1240526315789475e-05,
"loss": 2.3001,
"step": 179
},
{
"epoch": 0.08567348881485007,
"grad_norm": 6.465893268585205,
"learning_rate": 1.0705263157894737e-05,
"loss": 2.4646,
"step": 180
},
{
"epoch": 0.08614945264159923,
"grad_norm": 6.484694957733154,
"learning_rate": 1.0170000000000001e-05,
"loss": 2.3538,
"step": 181
},
{
"epoch": 0.08662541646834841,
"grad_norm": 6.740143775939941,
"learning_rate": 9.634736842105263e-06,
"loss": 2.1774,
"step": 182
},
{
"epoch": 0.08710138029509758,
"grad_norm": 6.847339153289795,
"learning_rate": 9.099473684210527e-06,
"loss": 2.5101,
"step": 183
},
{
"epoch": 0.08757734412184674,
"grad_norm": 7.767678260803223,
"learning_rate": 8.56421052631579e-06,
"loss": 2.5202,
"step": 184
},
{
"epoch": 0.08805330794859591,
"grad_norm": 8.59636116027832,
"learning_rate": 8.028947368421052e-06,
"loss": 2.7538,
"step": 185
},
{
"epoch": 0.08852927177534507,
"grad_norm": 8.195631980895996,
"learning_rate": 7.493684210526316e-06,
"loss": 2.4046,
"step": 186
},
{
"epoch": 0.08900523560209424,
"grad_norm": 6.393590927124023,
"learning_rate": 6.95842105263158e-06,
"loss": 2.2026,
"step": 187
},
{
"epoch": 0.0894811994288434,
"grad_norm": 6.6723551750183105,
"learning_rate": 6.423157894736843e-06,
"loss": 2.1685,
"step": 188
},
{
"epoch": 0.08995716325559258,
"grad_norm": 7.914825439453125,
"learning_rate": 5.887894736842105e-06,
"loss": 2.5318,
"step": 189
},
{
"epoch": 0.09043312708234175,
"grad_norm": 5.79133415222168,
"learning_rate": 5.3526315789473684e-06,
"loss": 2.1259,
"step": 190
},
{
"epoch": 0.09090909090909091,
"grad_norm": 8.203691482543945,
"learning_rate": 4.817368421052632e-06,
"loss": 2.6837,
"step": 191
},
{
"epoch": 0.09138505473584008,
"grad_norm": 8.923236846923828,
"learning_rate": 4.282105263157895e-06,
"loss": 2.6346,
"step": 192
},
{
"epoch": 0.09186101856258924,
"grad_norm": 7.227407932281494,
"learning_rate": 3.746842105263158e-06,
"loss": 2.5861,
"step": 193
},
{
"epoch": 0.0923369823893384,
"grad_norm": 6.976375102996826,
"learning_rate": 3.2115789473684215e-06,
"loss": 2.2964,
"step": 194
},
{
"epoch": 0.09281294621608757,
"grad_norm": 6.509735584259033,
"learning_rate": 2.6763157894736842e-06,
"loss": 2.341,
"step": 195
},
{
"epoch": 0.09328891004283675,
"grad_norm": 8.12691879272461,
"learning_rate": 2.1410526315789474e-06,
"loss": 2.4133,
"step": 196
},
{
"epoch": 0.09376487386958592,
"grad_norm": 7.400885581970215,
"learning_rate": 1.6057894736842107e-06,
"loss": 2.6121,
"step": 197
},
{
"epoch": 0.09424083769633508,
"grad_norm": 6.809811592102051,
"learning_rate": 1.0705263157894737e-06,
"loss": 2.4394,
"step": 198
},
{
"epoch": 0.09471680152308425,
"grad_norm": 7.949410915374756,
"learning_rate": 5.352631578947368e-07,
"loss": 2.7608,
"step": 199
},
{
"epoch": 0.09519276534983341,
"grad_norm": 8.497697830200195,
"learning_rate": 0.0,
"loss": 2.583,
"step": 200
},
{
"epoch": 0.09519276534983341,
"eval_loss": 1.1956850290298462,
"eval_runtime": 94.1756,
"eval_samples_per_second": 9.397,
"eval_steps_per_second": 2.357,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.15499809800192e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}