{
"best_metric": 0.36728447675704956,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.1568627450980392,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000784313725490196,
"grad_norm": 12.951228141784668,
"learning_rate": 9e-06,
"loss": 4.2676,
"step": 1
},
{
"epoch": 0.000784313725490196,
"eval_loss": 1.2241708040237427,
"eval_runtime": 170.1958,
"eval_samples_per_second": 12.621,
"eval_steps_per_second": 3.155,
"step": 1
},
{
"epoch": 0.001568627450980392,
"grad_norm": 15.065213203430176,
"learning_rate": 1.8e-05,
"loss": 2.7549,
"step": 2
},
{
"epoch": 0.002352941176470588,
"grad_norm": 13.755888938903809,
"learning_rate": 2.7000000000000002e-05,
"loss": 2.7658,
"step": 3
},
{
"epoch": 0.003137254901960784,
"grad_norm": 11.931817054748535,
"learning_rate": 3.6e-05,
"loss": 2.2277,
"step": 4
},
{
"epoch": 0.00392156862745098,
"grad_norm": 8.542208671569824,
"learning_rate": 4.5e-05,
"loss": 1.8974,
"step": 5
},
{
"epoch": 0.004705882352941176,
"grad_norm": 7.989168643951416,
"learning_rate": 5.4000000000000005e-05,
"loss": 2.0913,
"step": 6
},
{
"epoch": 0.005490196078431373,
"grad_norm": 6.630977630615234,
"learning_rate": 6.3e-05,
"loss": 1.6821,
"step": 7
},
{
"epoch": 0.006274509803921568,
"grad_norm": 6.496756553649902,
"learning_rate": 7.2e-05,
"loss": 1.6665,
"step": 8
},
{
"epoch": 0.007058823529411765,
"grad_norm": 7.982497215270996,
"learning_rate": 8.1e-05,
"loss": 2.0694,
"step": 9
},
{
"epoch": 0.00784313725490196,
"grad_norm": 6.226460933685303,
"learning_rate": 9e-05,
"loss": 1.6821,
"step": 10
},
{
"epoch": 0.008627450980392156,
"grad_norm": 6.819662570953369,
"learning_rate": 8.999384872466111e-05,
"loss": 1.73,
"step": 11
},
{
"epoch": 0.009411764705882352,
"grad_norm": 8.317466735839844,
"learning_rate": 8.997539658034168e-05,
"loss": 1.7702,
"step": 12
},
{
"epoch": 0.01019607843137255,
"grad_norm": 7.111891269683838,
"learning_rate": 8.994464861167372e-05,
"loss": 1.6366,
"step": 13
},
{
"epoch": 0.010980392156862745,
"grad_norm": 7.476611614227295,
"learning_rate": 8.990161322484486e-05,
"loss": 1.6355,
"step": 14
},
{
"epoch": 0.011764705882352941,
"grad_norm": 6.031744480133057,
"learning_rate": 8.984630218530015e-05,
"loss": 1.4807,
"step": 15
},
{
"epoch": 0.012549019607843137,
"grad_norm": 6.812473773956299,
"learning_rate": 8.977873061452552e-05,
"loss": 1.6878,
"step": 16
},
{
"epoch": 0.013333333333333334,
"grad_norm": 8.05068588256836,
"learning_rate": 8.969891698591372e-05,
"loss": 1.828,
"step": 17
},
{
"epoch": 0.01411764705882353,
"grad_norm": 12.39442253112793,
"learning_rate": 8.96068831197139e-05,
"loss": 1.5053,
"step": 18
},
{
"epoch": 0.014901960784313726,
"grad_norm": 10.175419807434082,
"learning_rate": 8.950265417706609e-05,
"loss": 1.7489,
"step": 19
},
{
"epoch": 0.01568627450980392,
"grad_norm": 8.073866844177246,
"learning_rate": 8.938625865312251e-05,
"loss": 1.9391,
"step": 20
},
{
"epoch": 0.01647058823529412,
"grad_norm": 7.202723026275635,
"learning_rate": 8.925772836925722e-05,
"loss": 1.3958,
"step": 21
},
{
"epoch": 0.017254901960784313,
"grad_norm": 8.959539413452148,
"learning_rate": 8.911709846436643e-05,
"loss": 1.6471,
"step": 22
},
{
"epoch": 0.01803921568627451,
"grad_norm": 9.780714988708496,
"learning_rate": 8.896440738526198e-05,
"loss": 1.3311,
"step": 23
},
{
"epoch": 0.018823529411764704,
"grad_norm": 9.363273620605469,
"learning_rate": 8.879969687616027e-05,
"loss": 1.5693,
"step": 24
},
{
"epoch": 0.0196078431372549,
"grad_norm": 11.106522560119629,
"learning_rate": 8.862301196726988e-05,
"loss": 1.3139,
"step": 25
},
{
"epoch": 0.0203921568627451,
"grad_norm": 9.613574028015137,
"learning_rate": 8.84344009624807e-05,
"loss": 1.0847,
"step": 26
},
{
"epoch": 0.021176470588235293,
"grad_norm": 9.658738136291504,
"learning_rate": 8.823391542615818e-05,
"loss": 1.432,
"step": 27
},
{
"epoch": 0.02196078431372549,
"grad_norm": 7.206670761108398,
"learning_rate": 8.80216101690461e-05,
"loss": 1.0293,
"step": 28
},
{
"epoch": 0.022745098039215685,
"grad_norm": 11.575990676879883,
"learning_rate": 8.779754323328193e-05,
"loss": 1.3784,
"step": 29
},
{
"epoch": 0.023529411764705882,
"grad_norm": 9.105690956115723,
"learning_rate": 8.756177587652856e-05,
"loss": 0.9299,
"step": 30
},
{
"epoch": 0.02431372549019608,
"grad_norm": 11.146190643310547,
"learning_rate": 8.731437255522727e-05,
"loss": 1.127,
"step": 31
},
{
"epoch": 0.025098039215686273,
"grad_norm": 13.225046157836914,
"learning_rate": 8.705540090697575e-05,
"loss": 1.3228,
"step": 32
},
{
"epoch": 0.02588235294117647,
"grad_norm": 12.464346885681152,
"learning_rate": 8.678493173203682e-05,
"loss": 1.9641,
"step": 33
},
{
"epoch": 0.02666666666666667,
"grad_norm": 10.996861457824707,
"learning_rate": 8.650303897398232e-05,
"loss": 1.6458,
"step": 34
},
{
"epoch": 0.027450980392156862,
"grad_norm": 11.745558738708496,
"learning_rate": 8.620979969947759e-05,
"loss": 1.6805,
"step": 35
},
{
"epoch": 0.02823529411764706,
"grad_norm": 10.623291015625,
"learning_rate": 8.590529407721231e-05,
"loss": 1.594,
"step": 36
},
{
"epoch": 0.029019607843137254,
"grad_norm": 9.937420845031738,
"learning_rate": 8.558960535598317e-05,
"loss": 1.4287,
"step": 37
},
{
"epoch": 0.02980392156862745,
"grad_norm": 12.139042854309082,
"learning_rate": 8.526281984193436e-05,
"loss": 1.7026,
"step": 38
},
{
"epoch": 0.03058823529411765,
"grad_norm": 12.166768074035645,
"learning_rate": 8.492502687496253e-05,
"loss": 2.1068,
"step": 39
},
{
"epoch": 0.03137254901960784,
"grad_norm": 12.726622581481934,
"learning_rate": 8.4576318804292e-05,
"loss": 1.583,
"step": 40
},
{
"epoch": 0.03215686274509804,
"grad_norm": 21.52069854736328,
"learning_rate": 8.421679096322747e-05,
"loss": 1.7145,
"step": 41
},
{
"epoch": 0.03294117647058824,
"grad_norm": 13.835341453552246,
"learning_rate": 8.384654164309084e-05,
"loss": 2.445,
"step": 42
},
{
"epoch": 0.03372549019607843,
"grad_norm": 16.390380859375,
"learning_rate": 8.346567206634926e-05,
"loss": 1.6284,
"step": 43
},
{
"epoch": 0.034509803921568626,
"grad_norm": 19.605005264282227,
"learning_rate": 8.307428635894209e-05,
"loss": 2.1787,
"step": 44
},
{
"epoch": 0.03529411764705882,
"grad_norm": 12.839860916137695,
"learning_rate": 8.26724915218138e-05,
"loss": 1.6432,
"step": 45
},
{
"epoch": 0.03607843137254902,
"grad_norm": 17.19524383544922,
"learning_rate": 8.226039740166091e-05,
"loss": 2.6786,
"step": 46
},
{
"epoch": 0.03686274509803922,
"grad_norm": 22.435400009155273,
"learning_rate": 8.183811666090117e-05,
"loss": 2.043,
"step": 47
},
{
"epoch": 0.03764705882352941,
"grad_norm": 52.507537841796875,
"learning_rate": 8.140576474687264e-05,
"loss": 2.563,
"step": 48
},
{
"epoch": 0.038431372549019606,
"grad_norm": 33.50967025756836,
"learning_rate": 8.096345986027161e-05,
"loss": 2.8917,
"step": 49
},
{
"epoch": 0.0392156862745098,
"grad_norm": 47.0520133972168,
"learning_rate": 8.051132292283772e-05,
"loss": 3.6867,
"step": 50
},
{
"epoch": 0.0392156862745098,
"eval_loss": 0.4549524486064911,
"eval_runtime": 171.8457,
"eval_samples_per_second": 12.5,
"eval_steps_per_second": 3.125,
"step": 50
},
{
"epoch": 0.04,
"grad_norm": 14.582710266113281,
"learning_rate": 8.004947754429507e-05,
"loss": 4.1472,
"step": 51
},
{
"epoch": 0.0407843137254902,
"grad_norm": 6.1638922691345215,
"learning_rate": 7.957804998855866e-05,
"loss": 2.2978,
"step": 52
},
{
"epoch": 0.04156862745098039,
"grad_norm": 3.731342077255249,
"learning_rate": 7.909716913921508e-05,
"loss": 1.9173,
"step": 53
},
{
"epoch": 0.042352941176470586,
"grad_norm": 3.8371353149414062,
"learning_rate": 7.860696646428694e-05,
"loss": 1.4256,
"step": 54
},
{
"epoch": 0.043137254901960784,
"grad_norm": 4.627960205078125,
"learning_rate": 7.810757598029093e-05,
"loss": 1.6277,
"step": 55
},
{
"epoch": 0.04392156862745098,
"grad_norm": 4.3101043701171875,
"learning_rate": 7.759913421559902e-05,
"loss": 1.5975,
"step": 56
},
{
"epoch": 0.04470588235294118,
"grad_norm": 2.9555671215057373,
"learning_rate": 7.708178017311287e-05,
"loss": 1.0289,
"step": 57
},
{
"epoch": 0.04549019607843137,
"grad_norm": 3.413313865661621,
"learning_rate": 7.655565529226198e-05,
"loss": 1.3867,
"step": 58
},
{
"epoch": 0.04627450980392157,
"grad_norm": 3.6849825382232666,
"learning_rate": 7.602090341033547e-05,
"loss": 1.2773,
"step": 59
},
{
"epoch": 0.047058823529411764,
"grad_norm": 3.837523937225342,
"learning_rate": 7.547767072315837e-05,
"loss": 1.4187,
"step": 60
},
{
"epoch": 0.04784313725490196,
"grad_norm": 3.476501226425171,
"learning_rate": 7.492610574512317e-05,
"loss": 1.2678,
"step": 61
},
{
"epoch": 0.04862745098039216,
"grad_norm": 3.9961957931518555,
"learning_rate": 7.436635926858759e-05,
"loss": 1.4783,
"step": 62
},
{
"epoch": 0.04941176470588235,
"grad_norm": 4.245054244995117,
"learning_rate": 7.379858432264925e-05,
"loss": 1.5066,
"step": 63
},
{
"epoch": 0.05019607843137255,
"grad_norm": 5.134169578552246,
"learning_rate": 7.322293613130917e-05,
"loss": 1.4297,
"step": 64
},
{
"epoch": 0.050980392156862744,
"grad_norm": 4.725772380828857,
"learning_rate": 7.263957207103507e-05,
"loss": 1.6951,
"step": 65
},
{
"epoch": 0.05176470588235294,
"grad_norm": 4.962766647338867,
"learning_rate": 7.204865162773613e-05,
"loss": 1.5281,
"step": 66
},
{
"epoch": 0.05254901960784314,
"grad_norm": 4.3428192138671875,
"learning_rate": 7.14503363531613e-05,
"loss": 1.4586,
"step": 67
},
{
"epoch": 0.05333333333333334,
"grad_norm": 5.0155029296875,
"learning_rate": 7.084478982073247e-05,
"loss": 1.5896,
"step": 68
},
{
"epoch": 0.05411764705882353,
"grad_norm": 4.897243976593018,
"learning_rate": 7.023217758082528e-05,
"loss": 1.1711,
"step": 69
},
{
"epoch": 0.054901960784313725,
"grad_norm": 5.0609612464904785,
"learning_rate": 6.961266711550922e-05,
"loss": 1.4897,
"step": 70
},
{
"epoch": 0.05568627450980392,
"grad_norm": 4.689002990722656,
"learning_rate": 6.898642779275972e-05,
"loss": 0.9881,
"step": 71
},
{
"epoch": 0.05647058823529412,
"grad_norm": 6.720233917236328,
"learning_rate": 6.835363082015469e-05,
"loss": 1.5692,
"step": 72
},
{
"epoch": 0.05725490196078432,
"grad_norm": 5.96003532409668,
"learning_rate": 6.771444919806798e-05,
"loss": 1.3238,
"step": 73
},
{
"epoch": 0.05803921568627451,
"grad_norm": 6.871821880340576,
"learning_rate": 6.706905767237288e-05,
"loss": 1.4878,
"step": 74
},
{
"epoch": 0.058823529411764705,
"grad_norm": 6.003084182739258,
"learning_rate": 6.641763268666833e-05,
"loss": 1.5651,
"step": 75
},
{
"epoch": 0.0596078431372549,
"grad_norm": 6.855623245239258,
"learning_rate": 6.576035233404096e-05,
"loss": 1.5011,
"step": 76
},
{
"epoch": 0.0603921568627451,
"grad_norm": 6.763059616088867,
"learning_rate": 6.509739630837631e-05,
"loss": 1.6047,
"step": 77
},
{
"epoch": 0.0611764705882353,
"grad_norm": 6.753271579742432,
"learning_rate": 6.442894585523218e-05,
"loss": 1.5268,
"step": 78
},
{
"epoch": 0.06196078431372549,
"grad_norm": 5.556756973266602,
"learning_rate": 6.375518372228808e-05,
"loss": 1.0404,
"step": 79
},
{
"epoch": 0.06274509803921569,
"grad_norm": 6.503464221954346,
"learning_rate": 6.307629410938363e-05,
"loss": 1.0106,
"step": 80
},
{
"epoch": 0.06352941176470588,
"grad_norm": 4.450430870056152,
"learning_rate": 6.239246261816035e-05,
"loss": 0.6093,
"step": 81
},
{
"epoch": 0.06431372549019608,
"grad_norm": 6.413719177246094,
"learning_rate": 6.170387620131993e-05,
"loss": 1.238,
"step": 82
},
{
"epoch": 0.06509803921568627,
"grad_norm": 5.047544956207275,
"learning_rate": 6.101072311151325e-05,
"loss": 0.8442,
"step": 83
},
{
"epoch": 0.06588235294117648,
"grad_norm": 7.372629165649414,
"learning_rate": 6.0313192849873945e-05,
"loss": 1.4398,
"step": 84
},
{
"epoch": 0.06666666666666667,
"grad_norm": 9.576523780822754,
"learning_rate": 5.961147611421077e-05,
"loss": 1.4834,
"step": 85
},
{
"epoch": 0.06745098039215686,
"grad_norm": 6.8355207443237305,
"learning_rate": 5.890576474687264e-05,
"loss": 1.6157,
"step": 86
},
{
"epoch": 0.06823529411764706,
"grad_norm": 7.017658233642578,
"learning_rate": 5.8196251682300926e-05,
"loss": 1.6794,
"step": 87
},
{
"epoch": 0.06901960784313725,
"grad_norm": 8.371786117553711,
"learning_rate": 5.748313089428301e-05,
"loss": 1.5957,
"step": 88
},
{
"epoch": 0.06980392156862746,
"grad_norm": 10.280915260314941,
"learning_rate": 5.676659734292189e-05,
"loss": 2.2618,
"step": 89
},
{
"epoch": 0.07058823529411765,
"grad_norm": 8.091904640197754,
"learning_rate": 5.604684692133597e-05,
"loss": 1.3325,
"step": 90
},
{
"epoch": 0.07137254901960784,
"grad_norm": 8.49736499786377,
"learning_rate": 5.532407640210383e-05,
"loss": 1.4017,
"step": 91
},
{
"epoch": 0.07215686274509804,
"grad_norm": 7.801628112792969,
"learning_rate": 5.4598483383468616e-05,
"loss": 1.3546,
"step": 92
},
{
"epoch": 0.07294117647058823,
"grad_norm": 9.922377586364746,
"learning_rate": 5.3870266235316614e-05,
"loss": 1.8899,
"step": 93
},
{
"epoch": 0.07372549019607844,
"grad_norm": 12.533098220825195,
"learning_rate": 5.313962404494496e-05,
"loss": 1.8032,
"step": 94
},
{
"epoch": 0.07450980392156863,
"grad_norm": 13.20348072052002,
"learning_rate": 5.240675656263303e-05,
"loss": 1.8548,
"step": 95
},
{
"epoch": 0.07529411764705882,
"grad_norm": 22.03706932067871,
"learning_rate": 5.167186414703289e-05,
"loss": 2.4985,
"step": 96
},
{
"epoch": 0.07607843137254902,
"grad_norm": 18.293331146240234,
"learning_rate": 5.093514771039311e-05,
"loss": 1.9427,
"step": 97
},
{
"epoch": 0.07686274509803921,
"grad_norm": 19.484935760498047,
"learning_rate": 5.019680866363139e-05,
"loss": 2.3599,
"step": 98
},
{
"epoch": 0.07764705882352942,
"grad_norm": 19.723976135253906,
"learning_rate": 4.9457048861270835e-05,
"loss": 2.401,
"step": 99
},
{
"epoch": 0.0784313725490196,
"grad_norm": 19.807950973510742,
"learning_rate": 4.871607054625497e-05,
"loss": 1.9219,
"step": 100
},
{
"epoch": 0.0784313725490196,
"eval_loss": 0.423828125,
"eval_runtime": 172.1475,
"eval_samples_per_second": 12.478,
"eval_steps_per_second": 3.119,
"step": 100
},
{
"epoch": 0.0792156862745098,
"grad_norm": 6.207220077514648,
"learning_rate": 4.797407629465648e-05,
"loss": 3.1864,
"step": 101
},
{
"epoch": 0.08,
"grad_norm": 4.361649990081787,
"learning_rate": 4.7231268960295003e-05,
"loss": 2.0343,
"step": 102
},
{
"epoch": 0.08078431372549019,
"grad_norm": 3.420992851257324,
"learning_rate": 4.648785161927887e-05,
"loss": 1.307,
"step": 103
},
{
"epoch": 0.0815686274509804,
"grad_norm": 2.5062878131866455,
"learning_rate": 4.574402751448614e-05,
"loss": 1.256,
"step": 104
},
{
"epoch": 0.08235294117647059,
"grad_norm": 2.8243720531463623,
"learning_rate": 4.5e-05,
"loss": 1.6743,
"step": 105
},
{
"epoch": 0.08313725490196078,
"grad_norm": 2.8885185718536377,
"learning_rate": 4.425597248551387e-05,
"loss": 1.2425,
"step": 106
},
{
"epoch": 0.08392156862745098,
"grad_norm": 3.3675057888031006,
"learning_rate": 4.3512148380721134e-05,
"loss": 1.2662,
"step": 107
},
{
"epoch": 0.08470588235294117,
"grad_norm": 3.927375316619873,
"learning_rate": 4.2768731039704995e-05,
"loss": 1.5353,
"step": 108
},
{
"epoch": 0.08549019607843138,
"grad_norm": 3.4482688903808594,
"learning_rate": 4.202592370534353e-05,
"loss": 1.5533,
"step": 109
},
{
"epoch": 0.08627450980392157,
"grad_norm": 4.18345832824707,
"learning_rate": 4.128392945374505e-05,
"loss": 1.1801,
"step": 110
},
{
"epoch": 0.08705882352941176,
"grad_norm": 3.1618025302886963,
"learning_rate": 4.0542951138729184e-05,
"loss": 1.1809,
"step": 111
},
{
"epoch": 0.08784313725490196,
"grad_norm": 3.343364953994751,
"learning_rate": 3.980319133636863e-05,
"loss": 1.3565,
"step": 112
},
{
"epoch": 0.08862745098039215,
"grad_norm": 3.7614049911499023,
"learning_rate": 3.9064852289606895e-05,
"loss": 1.2552,
"step": 113
},
{
"epoch": 0.08941176470588236,
"grad_norm": 3.514059066772461,
"learning_rate": 3.832813585296711e-05,
"loss": 1.2708,
"step": 114
},
{
"epoch": 0.09019607843137255,
"grad_norm": 3.6900923252105713,
"learning_rate": 3.759324343736697e-05,
"loss": 1.1696,
"step": 115
},
{
"epoch": 0.09098039215686274,
"grad_norm": 3.7292330265045166,
"learning_rate": 3.686037595505507e-05,
"loss": 1.4199,
"step": 116
},
{
"epoch": 0.09176470588235294,
"grad_norm": 4.216084957122803,
"learning_rate": 3.612973376468339e-05,
"loss": 1.2863,
"step": 117
},
{
"epoch": 0.09254901960784313,
"grad_norm": 4.107268810272217,
"learning_rate": 3.54015166165314e-05,
"loss": 1.1495,
"step": 118
},
{
"epoch": 0.09333333333333334,
"grad_norm": 5.08326530456543,
"learning_rate": 3.4675923597896184e-05,
"loss": 1.4167,
"step": 119
},
{
"epoch": 0.09411764705882353,
"grad_norm": 4.708593368530273,
"learning_rate": 3.395315307866404e-05,
"loss": 1.3636,
"step": 120
},
{
"epoch": 0.09490196078431372,
"grad_norm": 4.576901912689209,
"learning_rate": 3.3233402657078116e-05,
"loss": 1.3424,
"step": 121
},
{
"epoch": 0.09568627450980392,
"grad_norm": 4.741860866546631,
"learning_rate": 3.2516869105717005e-05,
"loss": 1.2734,
"step": 122
},
{
"epoch": 0.09647058823529411,
"grad_norm": 4.217694282531738,
"learning_rate": 3.1803748317699093e-05,
"loss": 0.9932,
"step": 123
},
{
"epoch": 0.09725490196078432,
"grad_norm": 5.4652419090271,
"learning_rate": 3.1094235253127374e-05,
"loss": 1.1499,
"step": 124
},
{
"epoch": 0.09803921568627451,
"grad_norm": 5.599730014801025,
"learning_rate": 3.038852388578925e-05,
"loss": 1.2061,
"step": 125
},
{
"epoch": 0.0988235294117647,
"grad_norm": 5.845987319946289,
"learning_rate": 2.9686807150126064e-05,
"loss": 0.9805,
"step": 126
},
{
"epoch": 0.0996078431372549,
"grad_norm": 5.0457258224487305,
"learning_rate": 2.8989276888486755e-05,
"loss": 0.9718,
"step": 127
},
{
"epoch": 0.1003921568627451,
"grad_norm": 6.9256744384765625,
"learning_rate": 2.829612379868006e-05,
"loss": 1.0395,
"step": 128
},
{
"epoch": 0.1011764705882353,
"grad_norm": 5.739282131195068,
"learning_rate": 2.760753738183966e-05,
"loss": 1.2639,
"step": 129
},
{
"epoch": 0.10196078431372549,
"grad_norm": 5.214005947113037,
"learning_rate": 2.6923705890616385e-05,
"loss": 1.1527,
"step": 130
},
{
"epoch": 0.1027450980392157,
"grad_norm": 6.843442916870117,
"learning_rate": 2.6244816277711943e-05,
"loss": 1.446,
"step": 131
},
{
"epoch": 0.10352941176470588,
"grad_norm": 5.266831398010254,
"learning_rate": 2.5571054144767825e-05,
"loss": 1.0564,
"step": 132
},
{
"epoch": 0.10431372549019607,
"grad_norm": 7.751642227172852,
"learning_rate": 2.4902603691623712e-05,
"loss": 1.1319,
"step": 133
},
{
"epoch": 0.10509803921568628,
"grad_norm": 5.268066883087158,
"learning_rate": 2.4239647665959058e-05,
"loss": 0.9805,
"step": 134
},
{
"epoch": 0.10588235294117647,
"grad_norm": 6.458575248718262,
"learning_rate": 2.358236731333169e-05,
"loss": 1.1729,
"step": 135
},
{
"epoch": 0.10666666666666667,
"grad_norm": 7.804651260375977,
"learning_rate": 2.293094232762715e-05,
"loss": 1.919,
"step": 136
},
{
"epoch": 0.10745098039215686,
"grad_norm": 7.9424052238464355,
"learning_rate": 2.2285550801932047e-05,
"loss": 1.468,
"step": 137
},
{
"epoch": 0.10823529411764705,
"grad_norm": 6.931085109710693,
"learning_rate": 2.164636917984533e-05,
"loss": 1.5243,
"step": 138
},
{
"epoch": 0.10901960784313726,
"grad_norm": 8.593782424926758,
"learning_rate": 2.1013572207240293e-05,
"loss": 1.4203,
"step": 139
},
{
"epoch": 0.10980392156862745,
"grad_norm": 7.546792984008789,
"learning_rate": 2.03873328844908e-05,
"loss": 1.4472,
"step": 140
},
{
"epoch": 0.11058823529411765,
"grad_norm": 9.1332426071167,
"learning_rate": 1.9767822419174733e-05,
"loss": 1.5902,
"step": 141
},
{
"epoch": 0.11137254901960784,
"grad_norm": 10.055811882019043,
"learning_rate": 1.915521017926754e-05,
"loss": 2.183,
"step": 142
},
{
"epoch": 0.11215686274509803,
"grad_norm": 9.171233177185059,
"learning_rate": 1.8549663646838714e-05,
"loss": 1.7493,
"step": 143
},
{
"epoch": 0.11294117647058824,
"grad_norm": 7.260237693786621,
"learning_rate": 1.7951348372263872e-05,
"loss": 1.3346,
"step": 144
},
{
"epoch": 0.11372549019607843,
"grad_norm": 12.887181282043457,
"learning_rate": 1.7360427928964948e-05,
"loss": 2.0163,
"step": 145
},
{
"epoch": 0.11450980392156863,
"grad_norm": 9.76959228515625,
"learning_rate": 1.6777063868690835e-05,
"loss": 1.9468,
"step": 146
},
{
"epoch": 0.11529411764705882,
"grad_norm": 12.850676536560059,
"learning_rate": 1.6201415677350752e-05,
"loss": 2.0807,
"step": 147
},
{
"epoch": 0.11607843137254902,
"grad_norm": 11.191648483276367,
"learning_rate": 1.563364073141241e-05,
"loss": 1.9254,
"step": 148
},
{
"epoch": 0.11686274509803922,
"grad_norm": 16.186717987060547,
"learning_rate": 1.5073894254876825e-05,
"loss": 2.6053,
"step": 149
},
{
"epoch": 0.11764705882352941,
"grad_norm": 24.65969467163086,
"learning_rate": 1.452232927684166e-05,
"loss": 2.2168,
"step": 150
},
{
"epoch": 0.11764705882352941,
"eval_loss": 0.37640249729156494,
"eval_runtime": 171.5386,
"eval_samples_per_second": 12.522,
"eval_steps_per_second": 3.13,
"step": 150
},
{
"epoch": 0.11843137254901961,
"grad_norm": 3.1740787029266357,
"learning_rate": 1.397909658966454e-05,
"loss": 3.2078,
"step": 151
},
{
"epoch": 0.1192156862745098,
"grad_norm": 2.5368189811706543,
"learning_rate": 1.3444344707738015e-05,
"loss": 1.736,
"step": 152
},
{
"epoch": 0.12,
"grad_norm": 2.4591469764709473,
"learning_rate": 1.2918219826887136e-05,
"loss": 1.6981,
"step": 153
},
{
"epoch": 0.1207843137254902,
"grad_norm": 2.356226921081543,
"learning_rate": 1.2400865784400998e-05,
"loss": 1.3207,
"step": 154
},
{
"epoch": 0.12156862745098039,
"grad_norm": 2.378990888595581,
"learning_rate": 1.189242401970908e-05,
"loss": 1.3112,
"step": 155
},
{
"epoch": 0.1223529411764706,
"grad_norm": 2.623610734939575,
"learning_rate": 1.139303353571309e-05,
"loss": 1.2035,
"step": 156
},
{
"epoch": 0.12313725490196079,
"grad_norm": 2.538539409637451,
"learning_rate": 1.0902830860784946e-05,
"loss": 1.1537,
"step": 157
},
{
"epoch": 0.12392156862745098,
"grad_norm": 2.950873613357544,
"learning_rate": 1.0421950011441355e-05,
"loss": 1.091,
"step": 158
},
{
"epoch": 0.12470588235294118,
"grad_norm": 3.1460249423980713,
"learning_rate": 9.950522455704946e-06,
"loss": 1.4024,
"step": 159
},
{
"epoch": 0.12549019607843137,
"grad_norm": 3.339446544647217,
"learning_rate": 9.488677077162294e-06,
"loss": 1.3126,
"step": 160
},
{
"epoch": 0.12627450980392158,
"grad_norm": 3.6245152950286865,
"learning_rate": 9.03654013972839e-06,
"loss": 1.4356,
"step": 161
},
{
"epoch": 0.12705882352941175,
"grad_norm": 3.8416507244110107,
"learning_rate": 8.59423525312737e-06,
"loss": 1.3236,
"step": 162
},
{
"epoch": 0.12784313725490196,
"grad_norm": 4.071051120758057,
"learning_rate": 8.161883339098845e-06,
"loss": 1.4993,
"step": 163
},
{
"epoch": 0.12862745098039216,
"grad_norm": 4.043711185455322,
"learning_rate": 7.739602598339099e-06,
"loss": 1.6008,
"step": 164
},
{
"epoch": 0.12941176470588237,
"grad_norm": 3.690749168395996,
"learning_rate": 7.327508478186216e-06,
"loss": 1.1721,
"step": 165
},
{
"epoch": 0.13019607843137254,
"grad_norm": 3.896296262741089,
"learning_rate": 6.925713641057902e-06,
"loss": 1.2947,
"step": 166
},
{
"epoch": 0.13098039215686275,
"grad_norm": 3.99424409866333,
"learning_rate": 6.53432793365074e-06,
"loss": 1.28,
"step": 167
},
{
"epoch": 0.13176470588235295,
"grad_norm": 4.055082321166992,
"learning_rate": 6.153458356909174e-06,
"loss": 1.0857,
"step": 168
},
{
"epoch": 0.13254901960784313,
"grad_norm": 4.159709930419922,
"learning_rate": 5.783209036772518e-06,
"loss": 1.1397,
"step": 169
},
{
"epoch": 0.13333333333333333,
"grad_norm": 4.578479290008545,
"learning_rate": 5.423681195707997e-06,
"loss": 1.4361,
"step": 170
},
{
"epoch": 0.13411764705882354,
"grad_norm": 5.518746376037598,
"learning_rate": 5.074973125037469e-06,
"loss": 1.8645,
"step": 171
},
{
"epoch": 0.1349019607843137,
"grad_norm": 4.65179967880249,
"learning_rate": 4.737180158065644e-06,
"loss": 1.2877,
"step": 172
},
{
"epoch": 0.13568627450980392,
"grad_norm": 4.779722213745117,
"learning_rate": 4.41039464401685e-06,
"loss": 1.2109,
"step": 173
},
{
"epoch": 0.13647058823529412,
"grad_norm": 4.066786289215088,
"learning_rate": 4.094705922787687e-06,
"loss": 1.1928,
"step": 174
},
{
"epoch": 0.13725490196078433,
"grad_norm": 4.523605823516846,
"learning_rate": 3.7902003005224126e-06,
"loss": 1.1624,
"step": 175
},
{
"epoch": 0.1380392156862745,
"grad_norm": 5.148304462432861,
"learning_rate": 3.4969610260176865e-06,
"loss": 1.0653,
"step": 176
},
{
"epoch": 0.1388235294117647,
"grad_norm": 3.7846312522888184,
"learning_rate": 3.2150682679631867e-06,
"loss": 0.6153,
"step": 177
},
{
"epoch": 0.1396078431372549,
"grad_norm": 4.997846603393555,
"learning_rate": 2.9445990930242668e-06,
"loss": 0.9906,
"step": 178
},
{
"epoch": 0.1403921568627451,
"grad_norm": 4.470591068267822,
"learning_rate": 2.6856274447727475e-06,
"loss": 0.9009,
"step": 179
},
{
"epoch": 0.1411764705882353,
"grad_norm": 4.197781085968018,
"learning_rate": 2.4382241234714413e-06,
"loss": 0.7853,
"step": 180
},
{
"epoch": 0.1419607843137255,
"grad_norm": 5.868466854095459,
"learning_rate": 2.2024567667180914e-06,
"loss": 1.2172,
"step": 181
},
{
"epoch": 0.14274509803921567,
"grad_norm": 7.754011154174805,
"learning_rate": 1.978389830953906e-06,
"loss": 1.363,
"step": 182
},
{
"epoch": 0.14352941176470588,
"grad_norm": 4.4714884757995605,
"learning_rate": 1.7660845738418336e-06,
"loss": 0.8215,
"step": 183
},
{
"epoch": 0.14431372549019608,
"grad_norm": 5.938106536865234,
"learning_rate": 1.5655990375193147e-06,
"loss": 1.4177,
"step": 184
},
{
"epoch": 0.1450980392156863,
"grad_norm": 6.424489498138428,
"learning_rate": 1.3769880327301332e-06,
"loss": 1.4252,
"step": 185
},
{
"epoch": 0.14588235294117646,
"grad_norm": 4.783531188964844,
"learning_rate": 1.2003031238397417e-06,
"loss": 0.8835,
"step": 186
},
{
"epoch": 0.14666666666666667,
"grad_norm": 7.15680456161499,
"learning_rate": 1.035592614738033e-06,
"loss": 1.6154,
"step": 187
},
{
"epoch": 0.14745098039215687,
"grad_norm": 7.671852111816406,
"learning_rate": 8.829015356335791e-07,
"loss": 1.4224,
"step": 188
},
{
"epoch": 0.14823529411764705,
"grad_norm": 8.599688529968262,
"learning_rate": 7.422716307427936e-07,
"loss": 1.8422,
"step": 189
},
{
"epoch": 0.14901960784313725,
"grad_norm": 9.633552551269531,
"learning_rate": 6.137413468774955e-07,
"loss": 1.9472,
"step": 190
},
{
"epoch": 0.14980392156862746,
"grad_norm": 7.004188060760498,
"learning_rate": 4.973458229339179e-07,
"loss": 1.5712,
"step": 191
},
{
"epoch": 0.15058823529411763,
"grad_norm": 8.92959976196289,
"learning_rate": 3.9311688028611627e-07,
"loss": 1.8607,
"step": 192
},
{
"epoch": 0.15137254901960784,
"grad_norm": 11.083267211914062,
"learning_rate": 3.010830140862836e-07,
"loss": 1.9006,
"step": 193
},
{
"epoch": 0.15215686274509804,
"grad_norm": 8.419087409973145,
"learning_rate": 2.2126938547448627e-07,
"loss": 1.9129,
"step": 194
},
{
"epoch": 0.15294117647058825,
"grad_norm": 19.231441497802734,
"learning_rate": 1.536978146998569e-07,
"loss": 2.5608,
"step": 195
},
{
"epoch": 0.15372549019607842,
"grad_norm": 9.949979782104492,
"learning_rate": 9.838677515514594e-08,
"loss": 1.9126,
"step": 196
},
{
"epoch": 0.15450980392156863,
"grad_norm": 13.315712928771973,
"learning_rate": 5.5351388326286834e-08,
"loss": 2.0499,
"step": 197
},
{
"epoch": 0.15529411764705883,
"grad_norm": 14.174652099609375,
"learning_rate": 2.4603419658327797e-08,
"loss": 2.4591,
"step": 198
},
{
"epoch": 0.156078431372549,
"grad_norm": 15.293482780456543,
"learning_rate": 6.151275338894813e-09,
"loss": 2.6328,
"step": 199
},
{
"epoch": 0.1568627450980392,
"grad_norm": 13.656365394592285,
"learning_rate": 0.0,
"loss": 1.9934,
"step": 200
},
{
"epoch": 0.1568627450980392,
"eval_loss": 0.36728447675704956,
"eval_runtime": 172.0542,
"eval_samples_per_second": 12.484,
"eval_steps_per_second": 3.121,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 4,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.030141694503813e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}