{
"best_metric": 0.3262763023376465,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 2.3112391930835736,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011527377521613832,
"grad_norm": 15.795230865478516,
"learning_rate": 1.0100000000000002e-05,
"loss": 4.073,
"step": 1
},
{
"epoch": 0.011527377521613832,
"eval_loss": 1.3264315128326416,
"eval_runtime": 17.2268,
"eval_samples_per_second": 67.685,
"eval_steps_per_second": 2.148,
"step": 1
},
{
"epoch": 0.023054755043227664,
"grad_norm": 16.31910514831543,
"learning_rate": 2.0200000000000003e-05,
"loss": 4.1133,
"step": 2
},
{
"epoch": 0.0345821325648415,
"grad_norm": 9.573389053344727,
"learning_rate": 3.0299999999999998e-05,
"loss": 3.7994,
"step": 3
},
{
"epoch": 0.04610951008645533,
"grad_norm": 6.555534839630127,
"learning_rate": 4.0400000000000006e-05,
"loss": 3.7398,
"step": 4
},
{
"epoch": 0.05763688760806916,
"grad_norm": 10.255160331726074,
"learning_rate": 5.05e-05,
"loss": 4.3368,
"step": 5
},
{
"epoch": 0.069164265129683,
"grad_norm": 15.695327758789062,
"learning_rate": 6.0599999999999996e-05,
"loss": 5.9102,
"step": 6
},
{
"epoch": 0.08069164265129683,
"grad_norm": 10.010425567626953,
"learning_rate": 7.07e-05,
"loss": 4.749,
"step": 7
},
{
"epoch": 0.09221902017291066,
"grad_norm": 5.381749153137207,
"learning_rate": 8.080000000000001e-05,
"loss": 3.3209,
"step": 8
},
{
"epoch": 0.1037463976945245,
"grad_norm": 2.711754083633423,
"learning_rate": 9.09e-05,
"loss": 3.2273,
"step": 9
},
{
"epoch": 0.11527377521613832,
"grad_norm": 3.078639268875122,
"learning_rate": 0.000101,
"loss": 3.1344,
"step": 10
},
{
"epoch": 0.12680115273775217,
"grad_norm": 3.792743682861328,
"learning_rate": 0.00010046842105263158,
"loss": 3.5957,
"step": 11
},
{
"epoch": 0.138328530259366,
"grad_norm": 5.194520473480225,
"learning_rate": 9.993684210526315e-05,
"loss": 4.1843,
"step": 12
},
{
"epoch": 0.14985590778097982,
"grad_norm": 8.25356674194336,
"learning_rate": 9.940526315789473e-05,
"loss": 4.6054,
"step": 13
},
{
"epoch": 0.16138328530259366,
"grad_norm": 3.3276405334472656,
"learning_rate": 9.887368421052632e-05,
"loss": 2.9768,
"step": 14
},
{
"epoch": 0.1729106628242075,
"grad_norm": 2.0909225940704346,
"learning_rate": 9.83421052631579e-05,
"loss": 2.8321,
"step": 15
},
{
"epoch": 0.1844380403458213,
"grad_norm": 1.5769151449203491,
"learning_rate": 9.781052631578948e-05,
"loss": 2.6877,
"step": 16
},
{
"epoch": 0.19596541786743515,
"grad_norm": 3.1385788917541504,
"learning_rate": 9.727894736842106e-05,
"loss": 2.9761,
"step": 17
},
{
"epoch": 0.207492795389049,
"grad_norm": 3.5247676372528076,
"learning_rate": 9.674736842105263e-05,
"loss": 3.5156,
"step": 18
},
{
"epoch": 0.21902017291066284,
"grad_norm": 6.310729503631592,
"learning_rate": 9.621578947368421e-05,
"loss": 4.3218,
"step": 19
},
{
"epoch": 0.23054755043227665,
"grad_norm": 3.2671117782592773,
"learning_rate": 9.568421052631578e-05,
"loss": 2.677,
"step": 20
},
{
"epoch": 0.2420749279538905,
"grad_norm": 2.4811301231384277,
"learning_rate": 9.515263157894737e-05,
"loss": 2.6409,
"step": 21
},
{
"epoch": 0.25360230547550433,
"grad_norm": 1.5807781219482422,
"learning_rate": 9.462105263157895e-05,
"loss": 2.4935,
"step": 22
},
{
"epoch": 0.26512968299711814,
"grad_norm": 2.4525790214538574,
"learning_rate": 9.408947368421054e-05,
"loss": 2.6027,
"step": 23
},
{
"epoch": 0.276657060518732,
"grad_norm": 2.5126707553863525,
"learning_rate": 9.355789473684211e-05,
"loss": 2.9244,
"step": 24
},
{
"epoch": 0.2881844380403458,
"grad_norm": 5.332083702087402,
"learning_rate": 9.302631578947369e-05,
"loss": 3.8721,
"step": 25
},
{
"epoch": 0.29971181556195964,
"grad_norm": 2.8693103790283203,
"learning_rate": 9.249473684210526e-05,
"loss": 2.4373,
"step": 26
},
{
"epoch": 0.3112391930835735,
"grad_norm": 1.3940588235855103,
"learning_rate": 9.196315789473685e-05,
"loss": 2.3714,
"step": 27
},
{
"epoch": 0.3227665706051873,
"grad_norm": 1.8282368183135986,
"learning_rate": 9.143157894736843e-05,
"loss": 2.272,
"step": 28
},
{
"epoch": 0.33429394812680113,
"grad_norm": 1.570576786994934,
"learning_rate": 9.09e-05,
"loss": 2.3689,
"step": 29
},
{
"epoch": 0.345821325648415,
"grad_norm": 1.7571359872817993,
"learning_rate": 9.036842105263158e-05,
"loss": 2.5752,
"step": 30
},
{
"epoch": 0.3573487031700288,
"grad_norm": 3.287325143814087,
"learning_rate": 8.983684210526316e-05,
"loss": 2.9271,
"step": 31
},
{
"epoch": 0.3688760806916426,
"grad_norm": 3.290728807449341,
"learning_rate": 8.930526315789474e-05,
"loss": 2.8079,
"step": 32
},
{
"epoch": 0.3804034582132565,
"grad_norm": 1.4297772645950317,
"learning_rate": 8.877368421052632e-05,
"loss": 2.2268,
"step": 33
},
{
"epoch": 0.3919308357348703,
"grad_norm": 1.2625175714492798,
"learning_rate": 8.82421052631579e-05,
"loss": 2.1646,
"step": 34
},
{
"epoch": 0.4034582132564842,
"grad_norm": 1.2002922296524048,
"learning_rate": 8.771052631578948e-05,
"loss": 2.0982,
"step": 35
},
{
"epoch": 0.414985590778098,
"grad_norm": 1.4599192142486572,
"learning_rate": 8.717894736842105e-05,
"loss": 2.3861,
"step": 36
},
{
"epoch": 0.4265129682997118,
"grad_norm": 2.3603785037994385,
"learning_rate": 8.664736842105263e-05,
"loss": 2.7989,
"step": 37
},
{
"epoch": 0.43804034582132567,
"grad_norm": 3.2342662811279297,
"learning_rate": 8.61157894736842e-05,
"loss": 2.6078,
"step": 38
},
{
"epoch": 0.4495677233429395,
"grad_norm": 1.4671645164489746,
"learning_rate": 8.55842105263158e-05,
"loss": 2.1885,
"step": 39
},
{
"epoch": 0.4610951008645533,
"grad_norm": 1.2487242221832275,
"learning_rate": 8.505263157894737e-05,
"loss": 1.9964,
"step": 40
},
{
"epoch": 0.47262247838616717,
"grad_norm": 1.3007112741470337,
"learning_rate": 8.452105263157896e-05,
"loss": 1.9985,
"step": 41
},
{
"epoch": 0.484149855907781,
"grad_norm": 1.5477087497711182,
"learning_rate": 8.398947368421053e-05,
"loss": 2.1297,
"step": 42
},
{
"epoch": 0.4956772334293948,
"grad_norm": 2.269618272781372,
"learning_rate": 8.345789473684211e-05,
"loss": 2.4867,
"step": 43
},
{
"epoch": 0.5072046109510087,
"grad_norm": 3.860551595687866,
"learning_rate": 8.292631578947368e-05,
"loss": 3.0229,
"step": 44
},
{
"epoch": 0.5187319884726225,
"grad_norm": 1.6341471672058105,
"learning_rate": 8.239473684210526e-05,
"loss": 1.9718,
"step": 45
},
{
"epoch": 0.5302593659942363,
"grad_norm": 1.2065647840499878,
"learning_rate": 8.186315789473683e-05,
"loss": 2.0059,
"step": 46
},
{
"epoch": 0.5417867435158501,
"grad_norm": 1.6778544187545776,
"learning_rate": 8.133157894736842e-05,
"loss": 1.9092,
"step": 47
},
{
"epoch": 0.553314121037464,
"grad_norm": 1.4027705192565918,
"learning_rate": 8.080000000000001e-05,
"loss": 2.0142,
"step": 48
},
{
"epoch": 0.5648414985590778,
"grad_norm": 1.6712775230407715,
"learning_rate": 8.026842105263159e-05,
"loss": 2.289,
"step": 49
},
{
"epoch": 0.5763688760806917,
"grad_norm": 3.5697531700134277,
"learning_rate": 7.973684210526316e-05,
"loss": 2.8535,
"step": 50
},
{
"epoch": 0.5763688760806917,
"eval_loss": 0.522826075553894,
"eval_runtime": 16.9076,
"eval_samples_per_second": 68.963,
"eval_steps_per_second": 2.188,
"step": 50
},
{
"epoch": 0.5878962536023055,
"grad_norm": 3.06416392326355,
"learning_rate": 7.920526315789474e-05,
"loss": 1.8642,
"step": 51
},
{
"epoch": 0.5994236311239193,
"grad_norm": 1.5283445119857788,
"learning_rate": 7.867368421052631e-05,
"loss": 1.8985,
"step": 52
},
{
"epoch": 0.6109510086455331,
"grad_norm": 1.7631272077560425,
"learning_rate": 7.814210526315789e-05,
"loss": 1.7763,
"step": 53
},
{
"epoch": 0.622478386167147,
"grad_norm": 1.3397003412246704,
"learning_rate": 7.761052631578946e-05,
"loss": 1.9477,
"step": 54
},
{
"epoch": 0.6340057636887608,
"grad_norm": 1.6053922176361084,
"learning_rate": 7.707894736842105e-05,
"loss": 2.18,
"step": 55
},
{
"epoch": 0.6455331412103746,
"grad_norm": 2.7249755859375,
"learning_rate": 7.654736842105264e-05,
"loss": 2.3534,
"step": 56
},
{
"epoch": 0.6570605187319885,
"grad_norm": 2.97141432762146,
"learning_rate": 7.601578947368422e-05,
"loss": 2.4034,
"step": 57
},
{
"epoch": 0.6685878962536023,
"grad_norm": 1.9900548458099365,
"learning_rate": 7.548421052631579e-05,
"loss": 1.8997,
"step": 58
},
{
"epoch": 0.6801152737752162,
"grad_norm": 1.2884211540222168,
"learning_rate": 7.495263157894737e-05,
"loss": 1.7887,
"step": 59
},
{
"epoch": 0.69164265129683,
"grad_norm": 1.2060939073562622,
"learning_rate": 7.442105263157894e-05,
"loss": 1.7919,
"step": 60
},
{
"epoch": 0.7031700288184438,
"grad_norm": 1.6086294651031494,
"learning_rate": 7.388947368421053e-05,
"loss": 1.947,
"step": 61
},
{
"epoch": 0.7146974063400576,
"grad_norm": 2.0594823360443115,
"learning_rate": 7.335789473684211e-05,
"loss": 2.2118,
"step": 62
},
{
"epoch": 0.7262247838616714,
"grad_norm": 3.474539041519165,
"learning_rate": 7.282631578947368e-05,
"loss": 2.4448,
"step": 63
},
{
"epoch": 0.7377521613832853,
"grad_norm": 2.4555892944335938,
"learning_rate": 7.229473684210527e-05,
"loss": 1.6301,
"step": 64
},
{
"epoch": 0.7492795389048992,
"grad_norm": 1.3167078495025635,
"learning_rate": 7.176315789473685e-05,
"loss": 1.8001,
"step": 65
},
{
"epoch": 0.760806916426513,
"grad_norm": 1.3120259046554565,
"learning_rate": 7.123157894736842e-05,
"loss": 1.6406,
"step": 66
},
{
"epoch": 0.7723342939481268,
"grad_norm": 1.2128572463989258,
"learning_rate": 7.07e-05,
"loss": 1.9768,
"step": 67
},
{
"epoch": 0.7838616714697406,
"grad_norm": 1.7433110475540161,
"learning_rate": 7.016842105263159e-05,
"loss": 2.0889,
"step": 68
},
{
"epoch": 0.7953890489913544,
"grad_norm": 2.9662296772003174,
"learning_rate": 6.963684210526316e-05,
"loss": 2.4274,
"step": 69
},
{
"epoch": 0.8069164265129684,
"grad_norm": 1.7265340089797974,
"learning_rate": 6.910526315789474e-05,
"loss": 1.6667,
"step": 70
},
{
"epoch": 0.8184438040345822,
"grad_norm": 1.2966630458831787,
"learning_rate": 6.857368421052631e-05,
"loss": 1.5911,
"step": 71
},
{
"epoch": 0.829971181556196,
"grad_norm": 1.1538748741149902,
"learning_rate": 6.80421052631579e-05,
"loss": 1.623,
"step": 72
},
{
"epoch": 0.8414985590778098,
"grad_norm": 1.2530492544174194,
"learning_rate": 6.751052631578948e-05,
"loss": 1.6128,
"step": 73
},
{
"epoch": 0.8530259365994236,
"grad_norm": 1.6641517877578735,
"learning_rate": 6.697894736842105e-05,
"loss": 2.1622,
"step": 74
},
{
"epoch": 0.8645533141210374,
"grad_norm": 2.932410955429077,
"learning_rate": 6.644736842105264e-05,
"loss": 2.34,
"step": 75
},
{
"epoch": 0.8760806916426513,
"grad_norm": 1.564931869506836,
"learning_rate": 6.591578947368422e-05,
"loss": 1.5343,
"step": 76
},
{
"epoch": 0.8876080691642652,
"grad_norm": 1.22451913356781,
"learning_rate": 6.538421052631579e-05,
"loss": 1.5855,
"step": 77
},
{
"epoch": 0.899135446685879,
"grad_norm": 1.0155211687088013,
"learning_rate": 6.485263157894737e-05,
"loss": 1.4917,
"step": 78
},
{
"epoch": 0.9106628242074928,
"grad_norm": 1.1343241930007935,
"learning_rate": 6.432105263157894e-05,
"loss": 1.4143,
"step": 79
},
{
"epoch": 0.9221902017291066,
"grad_norm": 1.3770869970321655,
"learning_rate": 6.378947368421053e-05,
"loss": 1.9376,
"step": 80
},
{
"epoch": 0.9337175792507204,
"grad_norm": 2.184307813644409,
"learning_rate": 6.32578947368421e-05,
"loss": 2.1743,
"step": 81
},
{
"epoch": 0.9452449567723343,
"grad_norm": 2.2855091094970703,
"learning_rate": 6.27263157894737e-05,
"loss": 2.0407,
"step": 82
},
{
"epoch": 0.9567723342939481,
"grad_norm": 1.2995476722717285,
"learning_rate": 6.219473684210527e-05,
"loss": 1.5107,
"step": 83
},
{
"epoch": 0.968299711815562,
"grad_norm": 1.1009643077850342,
"learning_rate": 6.166315789473685e-05,
"loss": 1.4408,
"step": 84
},
{
"epoch": 0.9798270893371758,
"grad_norm": 1.1911717653274536,
"learning_rate": 6.113157894736842e-05,
"loss": 1.7328,
"step": 85
},
{
"epoch": 0.9913544668587896,
"grad_norm": 1.749642014503479,
"learning_rate": 6.0599999999999996e-05,
"loss": 1.8929,
"step": 86
},
{
"epoch": 1.005763688760807,
"grad_norm": 2.45554256439209,
"learning_rate": 6.006842105263158e-05,
"loss": 1.955,
"step": 87
},
{
"epoch": 1.0172910662824208,
"grad_norm": 1.4383282661437988,
"learning_rate": 5.953684210526315e-05,
"loss": 1.2303,
"step": 88
},
{
"epoch": 1.0288184438040346,
"grad_norm": 1.0873348712921143,
"learning_rate": 5.900526315789474e-05,
"loss": 1.2904,
"step": 89
},
{
"epoch": 1.0403458213256485,
"grad_norm": 1.093670129776001,
"learning_rate": 5.847368421052632e-05,
"loss": 1.342,
"step": 90
},
{
"epoch": 1.0518731988472623,
"grad_norm": 1.1975847482681274,
"learning_rate": 5.79421052631579e-05,
"loss": 1.5195,
"step": 91
},
{
"epoch": 1.063400576368876,
"grad_norm": 1.6481062173843384,
"learning_rate": 5.7410526315789475e-05,
"loss": 1.7187,
"step": 92
},
{
"epoch": 1.07492795389049,
"grad_norm": 2.8624935150146484,
"learning_rate": 5.687894736842105e-05,
"loss": 1.8387,
"step": 93
},
{
"epoch": 1.0864553314121037,
"grad_norm": 1.3205828666687012,
"learning_rate": 5.6347368421052625e-05,
"loss": 1.3021,
"step": 94
},
{
"epoch": 1.0979827089337175,
"grad_norm": 1.1411229372024536,
"learning_rate": 5.5815789473684214e-05,
"loss": 1.3437,
"step": 95
},
{
"epoch": 1.1095100864553313,
"grad_norm": 1.0338324308395386,
"learning_rate": 5.5284210526315796e-05,
"loss": 1.3011,
"step": 96
},
{
"epoch": 1.1210374639769451,
"grad_norm": 1.0531305074691772,
"learning_rate": 5.475263157894737e-05,
"loss": 1.3966,
"step": 97
},
{
"epoch": 1.1325648414985592,
"grad_norm": 1.6607333421707153,
"learning_rate": 5.422105263157895e-05,
"loss": 1.7677,
"step": 98
},
{
"epoch": 1.144092219020173,
"grad_norm": 2.978102684020996,
"learning_rate": 5.368947368421053e-05,
"loss": 2.0096,
"step": 99
},
{
"epoch": 1.1556195965417868,
"grad_norm": 1.1680306196212769,
"learning_rate": 5.3157894736842104e-05,
"loss": 1.2335,
"step": 100
},
{
"epoch": 1.1556195965417868,
"eval_loss": 0.3946053087711334,
"eval_runtime": 17.2978,
"eval_samples_per_second": 67.407,
"eval_steps_per_second": 2.139,
"step": 100
},
{
"epoch": 1.1671469740634006,
"grad_norm": 1.1594160795211792,
"learning_rate": 5.262631578947368e-05,
"loss": 1.1509,
"step": 101
},
{
"epoch": 1.1786743515850144,
"grad_norm": 1.061645269393921,
"learning_rate": 5.209473684210527e-05,
"loss": 1.1868,
"step": 102
},
{
"epoch": 1.1902017291066282,
"grad_norm": 1.021350622177124,
"learning_rate": 5.1563157894736844e-05,
"loss": 1.2714,
"step": 103
},
{
"epoch": 1.201729106628242,
"grad_norm": 1.3108984231948853,
"learning_rate": 5.1031578947368426e-05,
"loss": 1.5794,
"step": 104
},
{
"epoch": 1.2132564841498559,
"grad_norm": 2.1817190647125244,
"learning_rate": 5.05e-05,
"loss": 1.6861,
"step": 105
},
{
"epoch": 1.2247838616714697,
"grad_norm": 2.2876627445220947,
"learning_rate": 4.9968421052631576e-05,
"loss": 1.647,
"step": 106
},
{
"epoch": 1.2363112391930835,
"grad_norm": 1.1767092943191528,
"learning_rate": 4.943684210526316e-05,
"loss": 1.1436,
"step": 107
},
{
"epoch": 1.2478386167146973,
"grad_norm": 1.122977614402771,
"learning_rate": 4.890526315789474e-05,
"loss": 1.2584,
"step": 108
},
{
"epoch": 1.2593659942363113,
"grad_norm": 1.0112011432647705,
"learning_rate": 4.8373684210526316e-05,
"loss": 1.2698,
"step": 109
},
{
"epoch": 1.270893371757925,
"grad_norm": 1.2474466562271118,
"learning_rate": 4.784210526315789e-05,
"loss": 1.4298,
"step": 110
},
{
"epoch": 1.282420749279539,
"grad_norm": 2.0961225032806396,
"learning_rate": 4.731052631578947e-05,
"loss": 1.5249,
"step": 111
},
{
"epoch": 1.2939481268011528,
"grad_norm": 2.892947196960449,
"learning_rate": 4.6778947368421055e-05,
"loss": 1.7823,
"step": 112
},
{
"epoch": 1.3054755043227666,
"grad_norm": 1.2904059886932373,
"learning_rate": 4.624736842105263e-05,
"loss": 1.0597,
"step": 113
},
{
"epoch": 1.3170028818443804,
"grad_norm": 1.0860971212387085,
"learning_rate": 4.571578947368421e-05,
"loss": 1.3479,
"step": 114
},
{
"epoch": 1.3285302593659942,
"grad_norm": 1.0123194456100464,
"learning_rate": 4.518421052631579e-05,
"loss": 1.1235,
"step": 115
},
{
"epoch": 1.340057636887608,
"grad_norm": 1.1925913095474243,
"learning_rate": 4.465263157894737e-05,
"loss": 1.3878,
"step": 116
},
{
"epoch": 1.3515850144092219,
"grad_norm": 1.8660753965377808,
"learning_rate": 4.412105263157895e-05,
"loss": 1.6422,
"step": 117
},
{
"epoch": 1.3631123919308357,
"grad_norm": 2.7606379985809326,
"learning_rate": 4.358947368421053e-05,
"loss": 1.6949,
"step": 118
},
{
"epoch": 1.3746397694524495,
"grad_norm": 1.5100210905075073,
"learning_rate": 4.30578947368421e-05,
"loss": 1.0997,
"step": 119
},
{
"epoch": 1.3861671469740635,
"grad_norm": 1.165024995803833,
"learning_rate": 4.2526315789473685e-05,
"loss": 1.1644,
"step": 120
},
{
"epoch": 1.397694524495677,
"grad_norm": 0.9900702834129333,
"learning_rate": 4.199473684210527e-05,
"loss": 1.2457,
"step": 121
},
{
"epoch": 1.4092219020172911,
"grad_norm": 1.1262096166610718,
"learning_rate": 4.146315789473684e-05,
"loss": 1.341,
"step": 122
},
{
"epoch": 1.420749279538905,
"grad_norm": 1.4549776315689087,
"learning_rate": 4.093157894736842e-05,
"loss": 1.4951,
"step": 123
},
{
"epoch": 1.4322766570605188,
"grad_norm": 2.960393190383911,
"learning_rate": 4.0400000000000006e-05,
"loss": 1.9611,
"step": 124
},
{
"epoch": 1.4438040345821326,
"grad_norm": 1.1258149147033691,
"learning_rate": 3.986842105263158e-05,
"loss": 1.2408,
"step": 125
},
{
"epoch": 1.4553314121037464,
"grad_norm": 1.0389220714569092,
"learning_rate": 3.933684210526316e-05,
"loss": 1.1756,
"step": 126
},
{
"epoch": 1.4668587896253602,
"grad_norm": 1.1349718570709229,
"learning_rate": 3.880526315789473e-05,
"loss": 1.2792,
"step": 127
},
{
"epoch": 1.478386167146974,
"grad_norm": 1.1235551834106445,
"learning_rate": 3.827368421052632e-05,
"loss": 1.3884,
"step": 128
},
{
"epoch": 1.4899135446685878,
"grad_norm": 1.4590579271316528,
"learning_rate": 3.7742105263157896e-05,
"loss": 1.547,
"step": 129
},
{
"epoch": 1.5014409221902016,
"grad_norm": 2.1345937252044678,
"learning_rate": 3.721052631578947e-05,
"loss": 1.699,
"step": 130
},
{
"epoch": 1.5129682997118157,
"grad_norm": 2.1057217121124268,
"learning_rate": 3.6678947368421054e-05,
"loss": 1.3336,
"step": 131
},
{
"epoch": 1.5244956772334293,
"grad_norm": 1.0975521802902222,
"learning_rate": 3.6147368421052636e-05,
"loss": 1.1837,
"step": 132
},
{
"epoch": 1.5360230547550433,
"grad_norm": 1.0290075540542603,
"learning_rate": 3.561578947368421e-05,
"loss": 1.0588,
"step": 133
},
{
"epoch": 1.547550432276657,
"grad_norm": 1.109420895576477,
"learning_rate": 3.508421052631579e-05,
"loss": 1.2782,
"step": 134
},
{
"epoch": 1.559077809798271,
"grad_norm": 1.3788710832595825,
"learning_rate": 3.455263157894737e-05,
"loss": 1.4358,
"step": 135
},
{
"epoch": 1.5706051873198847,
"grad_norm": 1.9282630681991577,
"learning_rate": 3.402105263157895e-05,
"loss": 1.5525,
"step": 136
},
{
"epoch": 1.5821325648414986,
"grad_norm": 3.0819172859191895,
"learning_rate": 3.3489473684210526e-05,
"loss": 1.6032,
"step": 137
},
{
"epoch": 1.5936599423631124,
"grad_norm": 1.0416362285614014,
"learning_rate": 3.295789473684211e-05,
"loss": 1.0864,
"step": 138
},
{
"epoch": 1.6051873198847262,
"grad_norm": 1.0696144104003906,
"learning_rate": 3.242631578947368e-05,
"loss": 1.0025,
"step": 139
},
{
"epoch": 1.6167146974063402,
"grad_norm": 0.9461542963981628,
"learning_rate": 3.1894736842105265e-05,
"loss": 0.982,
"step": 140
},
{
"epoch": 1.6282420749279538,
"grad_norm": 1.1103463172912598,
"learning_rate": 3.136315789473685e-05,
"loss": 1.2823,
"step": 141
},
{
"epoch": 1.6397694524495678,
"grad_norm": 1.6639349460601807,
"learning_rate": 3.083157894736842e-05,
"loss": 1.486,
"step": 142
},
{
"epoch": 1.6512968299711814,
"grad_norm": 2.9342904090881348,
"learning_rate": 3.0299999999999998e-05,
"loss": 1.8242,
"step": 143
},
{
"epoch": 1.6628242074927955,
"grad_norm": 1.3234608173370361,
"learning_rate": 2.9768421052631577e-05,
"loss": 1.1624,
"step": 144
},
{
"epoch": 1.674351585014409,
"grad_norm": 1.2971738576889038,
"learning_rate": 2.923684210526316e-05,
"loss": 1.1139,
"step": 145
},
{
"epoch": 1.685878962536023,
"grad_norm": 1.0851243734359741,
"learning_rate": 2.8705263157894737e-05,
"loss": 1.0419,
"step": 146
},
{
"epoch": 1.697406340057637,
"grad_norm": 1.0544915199279785,
"learning_rate": 2.8173684210526313e-05,
"loss": 1.2045,
"step": 147
},
{
"epoch": 1.7089337175792507,
"grad_norm": 1.4829477071762085,
"learning_rate": 2.7642105263157898e-05,
"loss": 1.3821,
"step": 148
},
{
"epoch": 1.7204610951008645,
"grad_norm": 2.9280033111572266,
"learning_rate": 2.7110526315789473e-05,
"loss": 1.8452,
"step": 149
},
{
"epoch": 1.7319884726224783,
"grad_norm": 1.1372859477996826,
"learning_rate": 2.6578947368421052e-05,
"loss": 1.0575,
"step": 150
},
{
"epoch": 1.7319884726224783,
"eval_loss": 0.3432846665382385,
"eval_runtime": 18.7516,
"eval_samples_per_second": 62.181,
"eval_steps_per_second": 1.973,
"step": 150
},
{
"epoch": 1.7435158501440924,
"grad_norm": 1.023056983947754,
"learning_rate": 2.6047368421052634e-05,
"loss": 1.0717,
"step": 151
},
{
"epoch": 1.755043227665706,
"grad_norm": 0.9638779759407043,
"learning_rate": 2.5515789473684213e-05,
"loss": 0.9971,
"step": 152
},
{
"epoch": 1.76657060518732,
"grad_norm": 1.0617165565490723,
"learning_rate": 2.4984210526315788e-05,
"loss": 1.1202,
"step": 153
},
{
"epoch": 1.7780979827089336,
"grad_norm": 1.5653163194656372,
"learning_rate": 2.445263157894737e-05,
"loss": 1.4338,
"step": 154
},
{
"epoch": 1.7896253602305476,
"grad_norm": 2.3075835704803467,
"learning_rate": 2.3921052631578946e-05,
"loss": 1.5135,
"step": 155
},
{
"epoch": 1.8011527377521612,
"grad_norm": 2.41831111907959,
"learning_rate": 2.3389473684210528e-05,
"loss": 1.4573,
"step": 156
},
{
"epoch": 1.8126801152737753,
"grad_norm": 1.1299927234649658,
"learning_rate": 2.2857894736842106e-05,
"loss": 0.9739,
"step": 157
},
{
"epoch": 1.824207492795389,
"grad_norm": 0.9729629755020142,
"learning_rate": 2.2326315789473685e-05,
"loss": 1.1128,
"step": 158
},
{
"epoch": 1.8357348703170029,
"grad_norm": 0.9762557744979858,
"learning_rate": 2.1794736842105264e-05,
"loss": 1.0709,
"step": 159
},
{
"epoch": 1.8472622478386167,
"grad_norm": 1.2971409559249878,
"learning_rate": 2.1263157894736842e-05,
"loss": 1.3502,
"step": 160
},
{
"epoch": 1.8587896253602305,
"grad_norm": 1.9666305780410767,
"learning_rate": 2.073157894736842e-05,
"loss": 1.6911,
"step": 161
},
{
"epoch": 1.8703170028818443,
"grad_norm": 2.376969575881958,
"learning_rate": 2.0200000000000003e-05,
"loss": 1.433,
"step": 162
},
{
"epoch": 1.8818443804034581,
"grad_norm": 0.978244423866272,
"learning_rate": 1.966842105263158e-05,
"loss": 1.0352,
"step": 163
},
{
"epoch": 1.8933717579250722,
"grad_norm": 0.9526923298835754,
"learning_rate": 1.913684210526316e-05,
"loss": 0.9,
"step": 164
},
{
"epoch": 1.9048991354466858,
"grad_norm": 0.9895343780517578,
"learning_rate": 1.8605263157894736e-05,
"loss": 0.981,
"step": 165
},
{
"epoch": 1.9164265129682998,
"grad_norm": 1.156259536743164,
"learning_rate": 1.8073684210526318e-05,
"loss": 1.1636,
"step": 166
},
{
"epoch": 1.9279538904899134,
"grad_norm": 1.878818154335022,
"learning_rate": 1.7542105263157897e-05,
"loss": 1.4938,
"step": 167
},
{
"epoch": 1.9394812680115274,
"grad_norm": 2.605971097946167,
"learning_rate": 1.7010526315789475e-05,
"loss": 1.4686,
"step": 168
},
{
"epoch": 1.9510086455331412,
"grad_norm": 0.9951879978179932,
"learning_rate": 1.6478947368421054e-05,
"loss": 1.0558,
"step": 169
},
{
"epoch": 1.962536023054755,
"grad_norm": 0.976740300655365,
"learning_rate": 1.5947368421052633e-05,
"loss": 1.0558,
"step": 170
},
{
"epoch": 1.9740634005763689,
"grad_norm": 0.9469358325004578,
"learning_rate": 1.541578947368421e-05,
"loss": 1.0353,
"step": 171
},
{
"epoch": 1.9855907780979827,
"grad_norm": 1.4167894124984741,
"learning_rate": 1.4884210526315788e-05,
"loss": 1.2964,
"step": 172
},
{
"epoch": 1.9971181556195965,
"grad_norm": 2.729344129562378,
"learning_rate": 1.4352631578947369e-05,
"loss": 1.7346,
"step": 173
},
{
"epoch": 2.011527377521614,
"grad_norm": 0.9175971746444702,
"learning_rate": 1.3821052631578949e-05,
"loss": 0.8195,
"step": 174
},
{
"epoch": 2.0230547550432276,
"grad_norm": 0.883823037147522,
"learning_rate": 1.3289473684210526e-05,
"loss": 1.0166,
"step": 175
},
{
"epoch": 2.0345821325648417,
"grad_norm": 0.8910732865333557,
"learning_rate": 1.2757894736842106e-05,
"loss": 0.9309,
"step": 176
},
{
"epoch": 2.0461095100864553,
"grad_norm": 0.9672825932502747,
"learning_rate": 1.2226315789473685e-05,
"loss": 0.9933,
"step": 177
},
{
"epoch": 2.0576368876080693,
"grad_norm": 1.295758605003357,
"learning_rate": 1.1694736842105264e-05,
"loss": 1.1036,
"step": 178
},
{
"epoch": 2.069164265129683,
"grad_norm": 2.083310127258301,
"learning_rate": 1.1163157894736842e-05,
"loss": 1.1551,
"step": 179
},
{
"epoch": 2.080691642651297,
"grad_norm": 2.122234344482422,
"learning_rate": 1.0631578947368421e-05,
"loss": 1.114,
"step": 180
},
{
"epoch": 2.0922190201729105,
"grad_norm": 1.4002490043640137,
"learning_rate": 1.0100000000000002e-05,
"loss": 0.8692,
"step": 181
},
{
"epoch": 2.1037463976945245,
"grad_norm": 1.2972763776779175,
"learning_rate": 9.56842105263158e-06,
"loss": 0.8139,
"step": 182
},
{
"epoch": 2.115273775216138,
"grad_norm": 1.2130416631698608,
"learning_rate": 9.036842105263159e-06,
"loss": 0.9519,
"step": 183
},
{
"epoch": 2.126801152737752,
"grad_norm": 1.3538533449172974,
"learning_rate": 8.505263157894738e-06,
"loss": 1.1313,
"step": 184
},
{
"epoch": 2.138328530259366,
"grad_norm": 1.8970357179641724,
"learning_rate": 7.973684210526316e-06,
"loss": 1.1983,
"step": 185
},
{
"epoch": 2.14985590778098,
"grad_norm": 2.499178886413574,
"learning_rate": 7.442105263157894e-06,
"loss": 1.1611,
"step": 186
},
{
"epoch": 2.161383285302594,
"grad_norm": 0.9207624197006226,
"learning_rate": 6.9105263157894745e-06,
"loss": 0.7852,
"step": 187
},
{
"epoch": 2.1729106628242074,
"grad_norm": 0.9183100461959839,
"learning_rate": 6.378947368421053e-06,
"loss": 0.8561,
"step": 188
},
{
"epoch": 2.1844380403458215,
"grad_norm": 0.886722207069397,
"learning_rate": 5.847368421052632e-06,
"loss": 0.8833,
"step": 189
},
{
"epoch": 2.195965417867435,
"grad_norm": 1.110753059387207,
"learning_rate": 5.315789473684211e-06,
"loss": 1.1024,
"step": 190
},
{
"epoch": 2.207492795389049,
"grad_norm": 1.6218575239181519,
"learning_rate": 4.78421052631579e-06,
"loss": 1.0674,
"step": 191
},
{
"epoch": 2.2190201729106627,
"grad_norm": 2.869983673095703,
"learning_rate": 4.252631578947369e-06,
"loss": 1.2698,
"step": 192
},
{
"epoch": 2.2305475504322767,
"grad_norm": 0.937856912612915,
"learning_rate": 3.721052631578947e-06,
"loss": 0.916,
"step": 193
},
{
"epoch": 2.2420749279538903,
"grad_norm": 0.8958096504211426,
"learning_rate": 3.1894736842105266e-06,
"loss": 0.9195,
"step": 194
},
{
"epoch": 2.2536023054755043,
"grad_norm": 0.9468475580215454,
"learning_rate": 2.6578947368421053e-06,
"loss": 0.9558,
"step": 195
},
{
"epoch": 2.2651296829971184,
"grad_norm": 1.0763096809387207,
"learning_rate": 2.1263157894736844e-06,
"loss": 1.0005,
"step": 196
},
{
"epoch": 2.276657060518732,
"grad_norm": 1.6214865446090698,
"learning_rate": 1.5947368421052633e-06,
"loss": 1.2395,
"step": 197
},
{
"epoch": 2.288184438040346,
"grad_norm": 2.93674898147583,
"learning_rate": 1.0631578947368422e-06,
"loss": 1.3516,
"step": 198
},
{
"epoch": 2.2997118155619596,
"grad_norm": 0.9342450499534607,
"learning_rate": 5.315789473684211e-07,
"loss": 0.7604,
"step": 199
},
{
"epoch": 2.3112391930835736,
"grad_norm": 0.9478535652160645,
"learning_rate": 0.0,
"loss": 0.7978,
"step": 200
},
{
"epoch": 2.3112391930835736,
"eval_loss": 0.3262763023376465,
"eval_runtime": 17.7432,
"eval_samples_per_second": 65.715,
"eval_steps_per_second": 2.085,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.5438705567323914e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}